Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git

Stephen Rothwell 2024-12-20 13:32:50 +11:00
commit ea2356f802
155 changed files with 3344 additions and 2442 deletions


@ -194,8 +194,6 @@ is applicable::
WDT Watchdog support is enabled.
X86-32 X86-32, aka i386 architecture is enabled.
X86-64 X86-64 architecture is enabled.
More X86-64 boot options can be found in
Documentation/arch/x86/x86_64/boot-options.rst.
X86 Either 32-bit or 64-bit x86 (same as X86-32+X86-64)
X86_UV SGI UV support is enabled.
XEN Xen support is enabled
@ -213,7 +211,6 @@ Do not modify the syntax of boot loader parameters without extreme
need or coordination with <Documentation/arch/x86/boot.rst>.
There are also arch-specific kernel-parameters not documented here.
See for example <Documentation/arch/x86/x86_64/boot-options.rst>.
Note that ALL kernel parameters listed below are CASE SENSITIVE, and that
a trailing = on the name of any parameter states that that parameter will


@ -21,6 +21,10 @@
strictly ACPI specification compliant.
rsdt -- prefer RSDT over (default) XSDT
copy_dsdt -- copy DSDT to memory
nocmcff -- Disable firmware first mode for corrected
errors. This disables parsing the HEST CMC error
source to check if firmware has set the FF flag. This
may result in duplicate corrected error reports.
nospcr -- disable console in ACPI SPCR table as
default _serial_ console on ARM64
For ARM64, ONLY "acpi=off", "acpi=on", "acpi=force" or
@ -405,6 +409,8 @@
not play well with APC CPU idle - disable it if you have
APC and your system crashes randomly.
apic [APIC,X86-64] Use IO-APIC. Default.
apic= [APIC,X86,EARLY] Advanced Programmable Interrupt Controller
Change the output verbosity while booting
Format: { quiet (default) | verbose | debug }
@ -424,6 +430,10 @@
useful so that a dump capture kernel won't be
shot down by NMI
apicpmtimer Do APIC timer calibration using the pmtimer. Implies
apicmaintimer. Useful when your PIT timer is totally
broken.
autoconf= [IPV6]
See Documentation/networking/ipv6.rst.
@ -1726,6 +1736,8 @@
off: Disable GDS mitigation.
gbpages [X86] Use GB pages for kernel direct mappings.
gcov_persist= [GCOV] When non-zero (default), profiling data for
kernel modules is saved and remains accessible via
debugfs, even when the module is unloaded/reloaded.
@ -2008,12 +2020,21 @@
idle= [X86,EARLY]
Format: idle=poll, idle=halt, idle=nomwait
Poll forces a polling idle loop that can slightly
improve the performance of waking up an idle CPU, but
will use a lot of power and make the system run hot.
Not recommended.
idle=poll: Don't do power saving in the idle loop
using HLT, but poll for rescheduling event. This will
make the CPUs eat a lot more power, but may be useful
to get slightly better performance in multiprocessor
benchmarks. It also makes some profiling using
performance counters more accurate. Please note that
on systems with MONITOR/MWAIT support (like Intel
EM64T CPUs) this option has no performance advantage
over the normal idle loop. It may also interact badly
with hyperthreading.
idle=halt: Halt is forced to be used for CPU idle.
In such case C2/C3 won't be used again.
idle=nomwait: Disable mwait for CPU C-states
idxd.sva= [HW]
@ -2311,20 +2332,73 @@
relaxed
iommu= [X86,EARLY]
off
Don't initialize and use any kind of IOMMU.
force
Force the use of the hardware IOMMU even when
it is not actually needed (e.g. because < 3 GB
memory).
noforce
Don't force hardware IOMMU usage when it is not
needed. (default).
biomerge
panic
nopanic
merge
nomerge
soft
pt [X86]
nopt [X86]
nobypass [PPC/POWERNV]
Use software bounce buffering (SWIOTLB) (default for
Intel machines). This can be used to prevent the usage
of an available hardware IOMMU.
[X86]
pt
[X86]
nopt
[PPC/POWERNV]
nobypass
Disable IOMMU bypass, using IOMMU for PCI devices.
[X86]
AMD Gart HW IOMMU-specific options:
<size>
Set the size of the remapping area in bytes.
allowed
Overwrite iommu off workarounds for specific chipsets
fullflush
Flush IOMMU on each allocation (default).
nofullflush
Don't use IOMMU fullflush.
memaper[=<order>]
Allocate an own aperture over RAM with size
32MB<<order. (default: order=1, i.e. 64MB)
merge
Do scatter-gather (SG) merging. Implies "force"
(experimental).
nomerge
Don't do scatter-gather (SG) merging.
noaperture
Ask the IOMMU not to touch the aperture for AGP.
noagp
Don't initialize the AGP driver and use full aperture.
panic
Always panic when IOMMU overflows.
iommu.forcedac= [ARM64,X86,EARLY] Control IOVA allocation for PCI devices.
Format: { "0" | "1" }
0 - Try to allocate a 32-bit DMA address first, before
@ -2432,7 +2506,9 @@
specified in the flag list (default: domain):
nohz
Disable the tick when a single task runs.
Disable the tick when a single task runs as well as
disabling other kernel noises like having RCU callbacks
offloaded. This is equivalent to the nohz_full parameter.
A residual 1Hz tick is offloaded to workqueues, which you
need to affine to housekeeping through the global
@ -3259,9 +3335,77 @@
devices can be requested on-demand with the
/dev/loop-control interface.
mce [X86-32] Machine Check Exception
mce= [X86-{32,64}]
Please see Documentation/arch/x86/x86_64/machinecheck.rst for sysfs runtime tunables.
off
disable machine check
no_cmci
disable CMCI (Corrected Machine Check Interrupt) that
Intel processors support. Usually this disablement is
not recommended, but it might be handy if your
hardware is misbehaving.
Note that you'll get more problems without CMCI than
with due to the shared banks, i.e. you might get
duplicated error logs.
dont_log_ce
don't make logs for corrected errors. All events
reported as corrected are silently cleared by OS. This
option will be useful if you have no interest in any
of corrected errors.
ignore_ce
disable features for corrected errors, e.g.
polling timer and CMCI. All events reported as
corrected are not cleared by the OS and remain in its
error banks.
Usually this disablement is not recommended, however
if there is an agent checking/clearing corrected
errors (e.g. BIOS or hardware monitoring
applications), conflicting with OS's error handling,
and you cannot deactivate the agent, then this option
will be a help.
no_lmce
do not opt-in to Local MCE delivery. Use legacy method
to broadcast MCEs.
bootlog
enable logging of machine checks left over from
booting. Disabled by default on AMD Fam10h and older
because some BIOS leave bogus ones.
If your BIOS doesn't do that it's a good idea to
enable though to make sure you log even machine check
events that result in a reboot. On Intel systems it is
enabled by default.
nobootlog
disable boot machine check logging.
monarchtimeout (number)
sets the time in us to wait for other CPUs on machine
checks. 0 to disable.
bios_cmci_threshold
don't overwrite the bios-set CMCI threshold. This boot
option prevents Linux from overwriting the CMCI
threshold set by the bios. Without this option, Linux
always sets the CMCI threshold to 1. Enabling this may
make memory predictive failure analysis less effective
if the bios sets thresholds for memory errors since we
will not see details for all errors.
recovery
force-enable recoverable machine check code paths
Everything else is in sysfs now.
mce=option [X86-64] See Documentation/arch/x86/x86_64/boot-options.rst
md= [HW] RAID subsystems devices and level
See Documentation/admin-guide/md.rst.
@ -3887,6 +4031,8 @@
noapic [SMP,APIC,EARLY] Tells the kernel to not make use of any
IOAPICs that may be present in the system.
noapictimer [APIC,X86] Don't set up the APIC timer
noautogroup Disable scheduler automatic task group creation.
nocache [ARM,EARLY]
@ -3934,6 +4080,8 @@
register save and restore. The kernel will only save
legacy floating-point registers on task switch.
nogbpages [X86] Do not use GB pages for kernel direct mappings.
no_hash_pointers
[KNL,EARLY]
Force pointers printed to the console or buffers to be
@ -3960,6 +4108,8 @@
the impact of the sleep instructions. This is also
useful when using JTAG debugger.
nohpet [X86] Don't use the HPET timer.
nohugeiomap [KNL,X86,PPC,ARM64,EARLY] Disable kernel huge I/O mappings.
nohugevmalloc [KNL,X86,PPC,ARM64,EARLY] Disable kernel huge vmalloc mappings.
@ -4111,8 +4261,10 @@
nosync [HW,M68K] Disables sync negotiation for all devices.
no_timer_check [X86,APIC] Disables the code which tests for
broken timer IRQ sources.
no_timer_check [X86,APIC] Disables the code which tests for broken
timer IRQ sources, i.e., the IO-APIC timer. This can
work around problems with incorrect timer
initialization on some boards.
no_uaccess_flush
[PPC,EARLY] Don't flush the L1-D cache after accessing user data.
@ -4192,6 +4344,11 @@
If given as an integer followed by 'U', it will
divide each physical node into N emulated nodes.
numa=noacpi [X86] Don't parse the SRAT table for NUMA setup
numa=nohmat [X86] Don't parse the HMAT table for NUMA setup, or
soft-reserved memory partitioning.
numa_balancing= [KNL,ARM64,PPC,RISCV,S390,X86] Enable or disable automatic
NUMA balancing.
Allowed values are enable and disable
@ -5715,6 +5872,55 @@
reboot_cpu is s[mp]#### with #### being the processor
to be used for rebooting.
acpi
Use the ACPI RESET_REG in the FADT. If ACPI is not
configured or the ACPI reset does not work, the reboot
path attempts the reset using the keyboard controller.
bios
Use the CPU reboot vector for warm reset
cold
Set the cold reboot flag
default
There are some built-in platform specific "quirks"
- you may see: "reboot: <name> series board detected.
Selecting <type> for reboots." In the case where you
think the quirk is in error (e.g. you have newer BIOS,
or newer board) using this option will ignore the
built-in quirk table, and use the generic default
reboot actions.
efi
Use efi reset_system runtime service. If EFI is not
configured or the EFI reset does not work, the reboot
path attempts the reset using the keyboard controller.
force
Don't stop other CPUs on reboot. This can make reboot
more reliable in some cases.
kbd
Use the keyboard controller. cold reset (default)
pci
Use a write to the PCI config space register 0xcf9 to
trigger reboot.
triple
Force a triple fault (init)
warm
Don't set the cold reboot flag
Using warm reset will be much faster especially on big
memory systems because the BIOS will not go through
the memory check. Disadvantage is that not all
hardware will be completely reinitialized on reboot so
there may be boot problems on some systems.
refscale.holdoff= [KNL]
Set test-start holdoff period. The purpose of
this parameter is to delay the start of the
@ -6106,7 +6312,16 @@
serialnumber [BUGS=X86-32]
sev=option[,option...] [X86-64] See Documentation/arch/x86/x86_64/boot-options.rst
sev=option[,option...] [X86-64]
debug
Enable debug messages.
nosnp
Do not enable SEV-SNP (applies to host/hypervisor
only). Setting 'nosnp' avoids the RMP check overhead
in memory accesses when users do not want to run
SEV-SNP guests.
shapers= [NET]
Maximal number of shapers.


@ -130,8 +130,126 @@ SNP feature support.
More details in AMD64 APM[1] Vol 2: 15.34.10 SEV_STATUS MSR
Reverse Map Table (RMP)
=======================
The RMP is a structure in system memory that is used to ensure a one-to-one
mapping between system physical addresses and guest physical addresses. Each
page of memory that is potentially assignable to guests has one entry within
the RMP.
The RMP table can be either contiguous in memory or a collection of segments
in memory.
Contiguous RMP
--------------
Support for this form of the RMP is present when support for SEV-SNP is
present, which can be determined using the CPUID instruction::
0x8000001f[eax]:
Bit[4] indicates support for SEV-SNP
The location of the RMP is identified to the hardware through two MSRs::
0xc0010132 (RMP_BASE):
System physical address of the first byte of the RMP
0xc0010133 (RMP_END):
System physical address of the last byte of the RMP
Hardware requires that RMP_BASE and (RMP_END + 1) be 8KB aligned, but SEV
firmware increases the alignment requirement to require a 1MB alignment.
The RMP consists of a 16KB region used for processor bookkeeping followed
by the RMP entries, which are 16 bytes in size. The size of the RMP
determines the range of physical memory that the hypervisor can assign to
SEV-SNP guests. The RMP covers the system physical address from::
0 to ((RMP_END + 1 - RMP_BASE - 16KB) / 16B) x 4KB.
The current Linux support relies on BIOS to allocate/reserve the memory for
the RMP and to set RMP_BASE and RMP_END appropriately. Linux uses the MSR
values to locate the RMP and determine the size of the RMP. The RMP must
cover all of system memory in order for Linux to enable SEV-SNP.
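
To make the coverage computation above concrete, here is a minimal,
illustrative sketch (not the kernel's actual probing code) that reads the
two MSRs with the standard rdmsrl() helper and derives how much memory a
contiguous RMP can cover; the raw MSR numbers are taken from the list
above::

    #include <asm/msr.h>        /* rdmsrl() */
    #include <linux/sizes.h>    /* SZ_4K, SZ_16K */

    #define RMP_BASE_MSR 0xc0010132
    #define RMP_END_MSR  0xc0010133

    static u64 rmp_covered_bytes(void)
    {
            u64 rmp_base, rmp_end, nr_entries;

            rdmsrl(RMP_BASE_MSR, rmp_base);
            rdmsrl(RMP_END_MSR, rmp_end);

            /* A 16KB bookkeeping area precedes the 16-byte RMP entries. */
            nr_entries = (rmp_end + 1 - rmp_base - SZ_16K) / 16;

            /* Each RMP entry covers one 4KB page of system memory. */
            return nr_entries * SZ_4K;
    }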
Segmented RMP
-------------
Segmented RMP support is a new way of representing the layout of an RMP.
Initial RMP support required the RMP table to be contiguous in memory.
RMP accesses from a NUMA node on which the RMP doesn't reside
can take longer than accesses from a NUMA node on which the RMP resides.
Segmented RMP support allows the RMP entries to be located on the same
node as the memory the RMP is covering, potentially reducing latency
associated with accessing an RMP entry associated with the memory. Each
RMP segment covers a specific range of system physical addresses.
Support for this form of the RMP can be determined using the CPUID
instruction::
0x8000001f[eax]:
Bit[23] indicates support for segmented RMP
If supported, segmented RMP attributes can be found using the CPUID
instruction::
0x80000025[eax]:
Bits[5:0] minimum supported RMP segment size
Bits[11:6] maximum supported RMP segment size
0x80000025[ebx]:
Bits[9:0] number of cacheable RMP segment definitions
Bit[10] indicates if the number of cacheable RMP segments
is a hard limit
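
As an illustration only (this is not code from the kernel tree), the CPUID
fields above can be read with the kernel's cpuid_eax()/cpuid_ebx() helpers;
the bit positions match the list just given::

    #include <asm/processor.h>  /* cpuid_eax(), cpuid_ebx() */
    #include <linux/bits.h>     /* BIT() */
    #include <linux/printk.h>   /* pr_info() */

    static void report_segmented_rmp_caps(void)
    {
            u32 eax = cpuid_eax(0x80000025);
            u32 ebx = cpuid_ebx(0x80000025);
            u32 min_seg = eax & 0x3f;          /* Bits[5:0]  */
            u32 max_seg = (eax >> 6) & 0x3f;   /* Bits[11:6] */
            u32 nr_cacheable = ebx & 0x3ff;    /* Bits[9:0]  */
            bool hard_limit = ebx & BIT(10);   /* Bit[10]    */

            pr_info("RMP segment size %u..%u, %u cacheable segments%s\n",
                    min_seg, max_seg, nr_cacheable,
                    hard_limit ? " (hard limit)" : "");
    }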
To enable a segmented RMP, a new MSR is available::
0xc0010136 (RMP_CFG):
Bit[0] indicates if segmented RMP is enabled
Bits[13:8] contains the size of memory covered by an RMP
segment (expressed as a power of 2)
The RMP segment size defined in the RMP_CFG MSR applies to all segments
of the RMP. Therefore each RMP segment covers a specific range of system
physical addresses. For example, if the RMP_CFG MSR value is 0x2401, then
the RMP segment coverage value is 0x24 => 36, meaning the size of memory
covered by an RMP segment is 64GB (1 << 36). So the first RMP segment
covers physical addresses from 0 to 0xF_FFFF_FFFF, the second RMP segment
covers physical addresses from 0x10_0000_0000 to 0x1F_FFFF_FFFF, etc.
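
The arithmetic in this example translates directly into code. A hedged
sketch, using the raw 0xc0010136 MSR number from the list above rather
than any named kernel constant::

    static u64 rmp_segment_size_bytes(void)
    {
            u64 rmp_cfg, coverage_shift;

            rdmsrl(0xc0010136, rmp_cfg);            /* RMP_CFG */

            if (!(rmp_cfg & BIT_ULL(0)))            /* Bit[0]: segmented RMP enabled */
                    return 0;

            coverage_shift = (rmp_cfg >> 8) & 0x3f; /* Bits[13:8] */

            /* e.g. 0x2401 -> shift 0x24 = 36 -> 64GB per segment */
            return 1ULL << coverage_shift;
    }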
When a segmented RMP is enabled, RMP_BASE points to the RMP bookkeeping
area as it does today (16K in size). However, instead of RMP entries
beginning immediately after the bookkeeping area, there is a 4K RMP
segment table (RST). Each entry in the RST is 8-bytes in size and represents
an RMP segment::
Bits[19:0] mapped size (in GB)
The mapped size can be less than the defined segment size.
A value of zero indicates that no RMP exists for the range
of system physical addresses associated with this segment.
Bits[51:20] segment physical address
This address is left-shifted 20 bits (or simply masked when
read) to form the physical address of the segment (1MB
alignment).
The RST can hold 512 segment entries but can be limited in size to the number
of cacheable RMP segments (CPUID 0x80000025_EBX[9:0]) if the number of cacheable
RMP segments is a hard limit (CPUID 0x80000025_EBX[10]).
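
For illustration, decoding a single 8-byte RST entry according to the
layout above might look like this (hypothetical helper, not the kernel's
implementation)::

    #include <linux/bits.h>     /* GENMASK_ULL() */

    struct rst_info {
            u64 mapped_gb;      /* Bits[19:0]: 0 means no RMP for this segment */
            u64 segment_pa;     /* Bits[51:20]: 1MB-aligned segment physical address */
    };

    static struct rst_info rst_decode(u64 entry)
    {
            struct rst_info info;

            info.mapped_gb  = entry & GENMASK_ULL(19, 0);
            /* Masking bits [51:20] already yields the 1MB-aligned address. */
            info.segment_pa = entry & GENMASK_ULL(51, 20);

            return info;
    }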
The current Linux support relies on BIOS to allocate/reserve the memory for
the segmented RMP (the bookkeeping area, RST, and all segments), build the RST
and to set RMP_BASE, RMP_END, and RMP_CFG appropriately. Linux uses the MSR
values to locate the RMP and determine the size and location of the RMP
segments. The RMP must cover all of system memory in order for Linux to enable
SEV-SNP.
More details in the AMD64 APM Vol 2, section "15.36.3 Reverse Map Table",
docID: 24593.
Secure VM Service Module (SVSM)
===============================
SNP provides a feature called Virtual Machine Privilege Levels (VMPL) which
defines four privilege levels at which guest software can run. The most
privileged level is 0 and numerically higher numbers have lesser privileges.


@ -384,6 +384,16 @@ When monitoring is enabled all MON groups will also contain:
Available only with debug option. The identifier used by hardware
for the monitor group. On x86 this is the RMID.
When the "mba_MBps" mount option is used all CTRL_MON groups will also contain:
"mba_MBps_event":
Reading this file shows which memory bandwidth event is used
as input to the software feedback loop that keeps memory bandwidth
below the value specified in the schemata file. Writing the
name of one of the supported memory bandwidth events found in
/sys/fs/resctrl/info/L3_MON/mon_features changes the input
event.
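
A small userspace sketch of using this interface; the group name "g1" is a
hypothetical CTRL_MON group, and "mbm_local_bytes" is one event name that
may appear in /sys/fs/resctrl/info/L3_MON/mon_features on your system::

    #include <stdio.h>

    #define MBA_EVENT_FILE "/sys/fs/resctrl/g1/mba_MBps_event"

    int main(void)
    {
            char event[64] = "";
            FILE *f = fopen(MBA_EVENT_FILE, "r");

            /* Show which event currently feeds the MBps feedback loop. */
            if (f && fgets(event, sizeof(event), f))
                    printf("current feedback event: %s", event);
            if (f)
                    fclose(f);

            /* Switch the feedback loop to a different supported event. */
            f = fopen(MBA_EVENT_FILE, "w");
            if (f) {
                    fputs("mbm_local_bytes\n", f);
                    fclose(f);
            }
            return 0;
    }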
Resource allocation rules
-------------------------


@ -135,6 +135,10 @@ Thread-related topology information in the kernel:
The ID of the core to which a thread belongs. It is also printed in /proc/cpuinfo
"core_id."
- topology_logical_core_id();
The logical core ID to which a thread belongs.
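
A minimal kernel-side sketch of how this could be used, assuming the new
accessor takes a CPU number like the other topology_*_id() helpers (this
snippet is not taken from the kernel sources)::

    #include <linux/cpumask.h>
    #include <linux/printk.h>
    #include <linux/topology.h>

    static void dump_core_ids(void)
    {
            int cpu;

            /* Print physical and logical core IDs for each online CPU. */
            for_each_online_cpu(cpu)
                    pr_info("CPU%d: core_id=%d logical_core_id=%d\n",
                            cpu, topology_core_id(cpu),
                            topology_logical_core_id(cpu));
    }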
System topology examples


@ -1,312 +0,0 @@
.. SPDX-License-Identifier: GPL-2.0
===========================
AMD64 Specific Boot Options
===========================
There are many others (usually documented in driver documentation), but
only the AMD64 specific ones are listed here.
Machine check
=============
Please see Documentation/arch/x86/x86_64/machinecheck.rst for sysfs runtime tunables.
mce=off
Disable machine check
mce=no_cmci
Disable CMCI(Corrected Machine Check Interrupt) that
Intel processor supports. Usually this disablement is
not recommended, but it might be handy if your hardware
is misbehaving.
Note that you'll get more problems without CMCI than with
due to the shared banks, i.e. you might get duplicated
error logs.
mce=dont_log_ce
Don't make logs for corrected errors. All events reported
as corrected are silently cleared by OS.
This option will be useful if you have no interest in any
of corrected errors.
mce=ignore_ce
Disable features for corrected errors, e.g. polling timer
and CMCI. All events reported as corrected are not cleared
by OS and remained in its error banks.
Usually this disablement is not recommended, however if
there is an agent checking/clearing corrected errors
(e.g. BIOS or hardware monitoring applications), conflicting
with OS's error handling, and you cannot deactivate the agent,
then this option will be a help.
mce=no_lmce
Do not opt-in to Local MCE delivery. Use legacy method
to broadcast MCEs.
mce=bootlog
Enable logging of machine checks left over from booting.
Disabled by default on AMD Fam10h and older because some BIOS
leave bogus ones.
If your BIOS doesn't do that it's a good idea to enable though
to make sure you log even machine check events that result
in a reboot. On Intel systems it is enabled by default.
mce=nobootlog
Disable boot machine check logging.
mce=monarchtimeout (number)
monarchtimeout:
Sets the time in us to wait for other CPUs on machine checks. 0
to disable.
mce=bios_cmci_threshold
Don't overwrite the bios-set CMCI threshold. This boot option
prevents Linux from overwriting the CMCI threshold set by the
bios. Without this option, Linux always sets the CMCI
threshold to 1. Enabling this may make memory predictive failure
analysis less effective if the bios sets thresholds for memory
errors since we will not see details for all errors.
mce=recovery
Force-enable recoverable machine check code paths
nomce (for compatibility with i386)
same as mce=off
Everything else is in sysfs now.
APICs
=====
apic
Use IO-APIC. Default
noapic
Don't use the IO-APIC.
disableapic
Don't use the local APIC
nolapic
Don't use the local APIC (alias for i386 compatibility)
pirq=...
See Documentation/arch/x86/i386/IO-APIC.rst
noapictimer
Don't set up the APIC timer
no_timer_check
Don't check the IO-APIC timer. This can work around
problems with incorrect timer initialization on some boards.
apicpmtimer
Do APIC timer calibration using the pmtimer. Implies
apicmaintimer. Useful when your PIT timer is totally broken.
Timing
======
notsc
Deprecated, use tsc=unstable instead.
nohpet
Don't use the HPET timer.
Idle loop
=========
idle=poll
Don't do power saving in the idle loop using HLT, but poll for rescheduling
event. This will make the CPUs eat a lot more power, but may be useful
to get slightly better performance in multiprocessor benchmarks. It also
makes some profiling using performance counters more accurate.
Please note that on systems with MONITOR/MWAIT support (like Intel EM64T
CPUs) this option has no performance advantage over the normal idle loop.
It may also interact badly with hyperthreading.
Rebooting
=========
reboot=b[ios] | t[riple] | k[bd] | a[cpi] | e[fi] | p[ci] [, [w]arm | [c]old]
bios
Use the CPU reboot vector for warm reset
warm
Don't set the cold reboot flag
cold
Set the cold reboot flag
triple
Force a triple fault (init)
kbd
Use the keyboard controller. cold reset (default)
acpi
Use the ACPI RESET_REG in the FADT. If ACPI is not configured or
the ACPI reset does not work, the reboot path attempts the reset
using the keyboard controller.
efi
Use efi reset_system runtime service. If EFI is not configured or
the EFI reset does not work, the reboot path attempts the reset using
the keyboard controller.
pci
Use a write to the PCI config space register 0xcf9 to trigger reboot.
Using warm reset will be much faster especially on big memory
systems because the BIOS will not go through the memory check.
Disadvantage is that not all hardware will be completely reinitialized
on reboot so there may be boot problems on some systems.
reboot=force
Don't stop other CPUs on reboot. This can make reboot more reliable
in some cases.
reboot=default
There are some built-in platform specific "quirks" - you may see:
"reboot: <name> series board detected. Selecting <type> for reboots."
In the case where you think the quirk is in error (e.g. you have
newer BIOS, or newer board) using this option will ignore the built-in
quirk table, and use the generic default reboot actions.
NUMA
====
numa=off
Only set up a single NUMA node spanning all memory.
numa=noacpi
Don't parse the SRAT table for NUMA setup
numa=nohmat
Don't parse the HMAT table for NUMA setup, or soft-reserved memory
partitioning.
ACPI
====
acpi=off
Don't enable ACPI
acpi=ht
Use ACPI boot table parsing, but don't enable ACPI interpreter
acpi=force
Force ACPI on (currently not needed)
acpi=strict
Disable out of spec ACPI workarounds.
acpi_sci={edge,level,high,low}
Set up ACPI SCI interrupt.
acpi=noirq
Don't route interrupts
acpi=nocmcff
Disable firmware first mode for corrected errors. This
disables parsing the HEST CMC error source to check if
firmware has set the FF flag. This may result in
duplicate corrected error reports.
PCI
===
pci=off
Don't use PCI
pci=conf1
Use conf1 access.
pci=conf2
Use conf2 access.
pci=rom
Assign ROMs.
pci=assign-busses
Assign busses
pci=irqmask=MASK
Set PCI interrupt mask to MASK
pci=lastbus=NUMBER
Scan up to NUMBER busses, no matter what the mptable says.
pci=noacpi
Don't use ACPI to set up PCI interrupt routing.
IOMMU (input/output memory management unit)
===========================================
Multiple x86-64 PCI-DMA mapping implementations exist, for example:
1. <kernel/dma/direct.c>: use no hardware/software IOMMU at all
(e.g. because you have < 3 GB memory).
Kernel boot message: "PCI-DMA: Disabling IOMMU"
2. <arch/x86/kernel/amd_gart_64.c>: AMD GART based hardware IOMMU.
Kernel boot message: "PCI-DMA: using GART IOMMU"
3. <arch/x86_64/kernel/pci-swiotlb.c> : Software IOMMU implementation. Used
e.g. if there is no hardware IOMMU in the system and it is needed because
you have >3GB memory or told the kernel to use it (iommu=soft).
Kernel boot message: "PCI-DMA: Using software bounce buffering
for IO (SWIOTLB)"
::
iommu=[<size>][,noagp][,off][,force][,noforce]
[,memaper[=<order>]][,merge][,fullflush][,nomerge]
[,noaperture]
General iommu options:
off
Don't initialize and use any kind of IOMMU.
noforce
Don't force hardware IOMMU usage when it is not needed. (default).
force
Force the use of the hardware IOMMU even when it is
not actually needed (e.g. because < 3 GB memory).
soft
Use software bounce buffering (SWIOTLB) (default for
Intel machines). This can be used to prevent the usage
of an available hardware IOMMU.
iommu options only relevant to the AMD GART hardware IOMMU:
<size>
Set the size of the remapping area in bytes.
allowed
Overwrite iommu off workarounds for specific chipsets.
fullflush
Flush IOMMU on each allocation (default).
nofullflush
Don't use IOMMU fullflush.
memaper[=<order>]
Allocate an own aperture over RAM with size 32MB<<order.
(default: order=1, i.e. 64MB)
merge
Do scatter-gather (SG) merging. Implies "force" (experimental).
nomerge
Don't do scatter-gather (SG) merging.
noaperture
Ask the IOMMU not to touch the aperture for AGP.
noagp
Don't initialize the AGP driver and use full aperture.
panic
Always panic when IOMMU overflows.
iommu options only relevant to the software bounce buffering (SWIOTLB) IOMMU
implementation:
swiotlb=<slots>[,force,noforce]
<slots>
Prereserve that many 2K slots for the software IO bounce buffering.
force
Force all IO through the software TLB.
noforce
Do not initialize the software TLB.
Miscellaneous
=============
nogbpages
Do not use GB pages for kernel direct mappings.
gbpages
Use GB pages for kernel direct mappings.
AMD SEV (Secure Encrypted Virtualization)
=========================================
Options relating to AMD SEV, specified via the following format:
::
sev=option1[,option2]
The available options are:
debug
Enable debug messages.
nosnp
Do not enable SEV-SNP (applies to host/hypervisor only). Setting
'nosnp' avoids the RMP check overhead in memory accesses when
users do not want to run SEV-SNP guests.


@ -18,7 +18,7 @@ For more information on the features of cpusets, see
Documentation/admin-guide/cgroup-v1/cpusets.rst.
There are a number of different configurations you can use for your needs. For
more information on the numa=fake command line option and its various ways of
configuring fake nodes, see Documentation/arch/x86/x86_64/boot-options.rst.
configuring fake nodes, see Documentation/admin-guide/kernel-parameters.txt
For the purposes of this introduction, we'll assume a very primitive NUMA
emulation setup of "numa=fake=4*512,". This will split our system memory into


@ -7,7 +7,6 @@ x86_64 Support
.. toctree::
:maxdepth: 2
boot-options
uefi
mm
5level-paging


@ -127,29 +127,6 @@ void crash_smp_send_stop(void)
cpus_stopped = 1;
}
static void machine_kexec_mask_interrupts(void)
{
unsigned int i;
struct irq_desc *desc;
for_each_irq_desc(i, desc) {
struct irq_chip *chip;
chip = irq_desc_get_chip(desc);
if (!chip)
continue;
if (chip->irq_eoi && irqd_irq_inprogress(&desc->irq_data))
chip->irq_eoi(&desc->irq_data);
if (chip->irq_mask)
chip->irq_mask(&desc->irq_data);
if (chip->irq_disable && !irqd_irq_disabled(&desc->irq_data))
chip->irq_disable(&desc->irq_data);
}
}
void machine_crash_shutdown(struct pt_regs *regs)
{
local_irq_disable();


@ -149,6 +149,7 @@ config ARM64
select GENERIC_IDLE_POLL_SETUP
select GENERIC_IOREMAP
select GENERIC_IRQ_IPI
select GENERIC_IRQ_KEXEC_CLEAR_VM_FORWARD
select GENERIC_IRQ_PROBE
select GENERIC_IRQ_SHOW
select GENERIC_IRQ_SHOW_LEVEL


@ -207,37 +207,6 @@ void machine_kexec(struct kimage *kimage)
BUG(); /* Should never get here. */
}
static void machine_kexec_mask_interrupts(void)
{
unsigned int i;
struct irq_desc *desc;
for_each_irq_desc(i, desc) {
struct irq_chip *chip;
int ret;
chip = irq_desc_get_chip(desc);
if (!chip)
continue;
/*
* First try to remove the active state. If this
* fails, try to EOI the interrupt.
*/
ret = irq_set_irqchip_state(i, IRQCHIP_STATE_ACTIVE, false);
if (ret && irqd_irq_inprogress(&desc->irq_data) &&
chip->irq_eoi)
chip->irq_eoi(&desc->irq_data);
if (chip->irq_mask)
chip->irq_mask(&desc->irq_data);
if (chip->irq_disable && !irqd_irq_disabled(&desc->irq_data))
chip->irq_disable(&desc->irq_data);
}
}
/**
* machine_crash_shutdown - shutdown non-crashing cpus and save registers
*/


@ -4,6 +4,7 @@
#include <asm/break.h>
#include <linux/stringify.h>
#include <linux/objtool.h>
#ifndef CONFIG_DEBUG_BUGVERBOSE
#define _BUGVERBOSE_LOCATION(file, line)
@ -33,25 +34,25 @@
#define ASM_BUG_FLAGS(flags) \
__BUG_ENTRY(flags) \
break BRK_BUG
break BRK_BUG;
#define ASM_BUG() ASM_BUG_FLAGS(0)
#define __BUG_FLAGS(flags) \
asm_inline volatile (__stringify(ASM_BUG_FLAGS(flags)));
#define __BUG_FLAGS(flags, extra) \
asm_inline volatile (__stringify(ASM_BUG_FLAGS(flags)) \
extra);
#define __WARN_FLAGS(flags) \
do { \
instrumentation_begin(); \
__BUG_FLAGS(BUGFLAG_WARNING|(flags)); \
annotate_reachable(); \
__BUG_FLAGS(BUGFLAG_WARNING|(flags), ANNOTATE_REACHABLE(10001b));\
instrumentation_end(); \
} while (0)
#define BUG() \
do { \
instrumentation_begin(); \
__BUG_FLAGS(0); \
__BUG_FLAGS(0, ""); \
unreachable(); \
} while (0)


@ -61,7 +61,6 @@ struct pt_regs;
extern void kexec_smp_wait(void); /* get and clear naca physid, wait for
master to copy new code to 0 */
extern void default_machine_kexec(struct kimage *image);
extern void machine_kexec_mask_interrupts(void);
void relocate_new_kernel(unsigned long indirection_page, unsigned long reboot_code_buffer,
unsigned long start_address) __noreturn;


@ -22,28 +22,6 @@
#include <asm/setup.h>
#include <asm/firmware.h>
void machine_kexec_mask_interrupts(void) {
unsigned int i;
struct irq_desc *desc;
for_each_irq_desc(i, desc) {
struct irq_chip *chip;
chip = irq_desc_get_chip(desc);
if (!chip)
continue;
if (chip->irq_eoi && irqd_irq_inprogress(&desc->irq_data))
chip->irq_eoi(&desc->irq_data);
if (chip->irq_mask)
chip->irq_mask(&desc->irq_data);
if (chip->irq_disable && !irqd_irq_disabled(&desc->irq_data))
chip->irq_disable(&desc->irq_data);
}
}
#ifdef CONFIG_CRASH_DUMP
void machine_crash_shutdown(struct pt_regs *regs)
{


@ -7,6 +7,7 @@
* Copyright (C) 2005 IBM Corporation.
*/
#include <linux/irq.h>
#include <linux/kexec.h>
#include <linux/mm.h>
#include <linux/string.h>


@ -114,29 +114,6 @@ void machine_shutdown(void)
#endif
}
static void machine_kexec_mask_interrupts(void)
{
unsigned int i;
struct irq_desc *desc;
for_each_irq_desc(i, desc) {
struct irq_chip *chip;
chip = irq_desc_get_chip(desc);
if (!chip)
continue;
if (chip->irq_eoi && irqd_irq_inprogress(&desc->irq_data))
chip->irq_eoi(&desc->irq_data);
if (chip->irq_mask)
chip->irq_mask(&desc->irq_data);
if (chip->irq_disable && !irqd_irq_disabled(&desc->irq_data))
chip->irq_disable(&desc->irq_data);
}
}
/*
* machine_crash_shutdown - Prepare to kexec after a kernel crash
*


@ -981,7 +981,7 @@ static int cfdiag_push_sample(struct perf_event *event,
if (event->attr.sample_type & PERF_SAMPLE_RAW) {
raw.frag.size = cpuhw->usedss;
raw.frag.data = cpuhw->stop;
perf_sample_save_raw_data(&data, &raw);
perf_sample_save_raw_data(&data, event, &raw);
}
overflow = perf_event_overflow(event, &data, &regs);


@ -981,7 +981,7 @@ static void cpumsf_pmu_disable(struct pmu *pmu)
cpuhw->flags &= ~PMU_F_ENABLED;
}
/* perf_exclude_event() - Filter event
/* perf_event_exclude() - Filter event
* @event: The perf event
* @regs: pt_regs structure
* @sde_regs: Sample-data-entry (sde) regs structure
@ -990,7 +990,7 @@ static void cpumsf_pmu_disable(struct pmu *pmu)
*
* Return non-zero if the event shall be excluded.
*/
static int perf_exclude_event(struct perf_event *event, struct pt_regs *regs,
static int perf_event_exclude(struct perf_event *event, struct pt_regs *regs,
struct perf_sf_sde_regs *sde_regs)
{
if (event->attr.exclude_user && user_mode(regs))
@ -1073,7 +1073,7 @@ static int perf_push_sample(struct perf_event *event,
data.tid_entry.pid = basic->hpp & LPP_PID_MASK;
overflow = 0;
if (perf_exclude_event(event, &regs, sde_regs))
if (perf_event_exclude(event, &regs, sde_regs))
goto out;
if (perf_event_overflow(event, &data, &regs)) {
overflow = 1;


@ -478,7 +478,7 @@ static int paicrypt_push_sample(size_t rawsize, struct paicrypt_map *cpump,
if (event->attr.sample_type & PERF_SAMPLE_RAW) {
raw.frag.size = rawsize;
raw.frag.data = cpump->save;
perf_sample_save_raw_data(&data, &raw);
perf_sample_save_raw_data(&data, event, &raw);
}
overflow = perf_event_overflow(event, &data, &regs);


@ -503,7 +503,7 @@ static int paiext_push_sample(size_t rawsize, struct paiext_map *cpump,
if (event->attr.sample_type & PERF_SAMPLE_RAW) {
raw.frag.size = rawsize;
raw.frag.data = cpump->save;
perf_sample_save_raw_data(&data, &raw);
perf_sample_save_raw_data(&data, event, &raw);
}
overflow = perf_event_overflow(event, &data, &regs);


@ -97,7 +97,7 @@ config IOMMU_DEBUG
code. When you use it make sure you have a big enough
IOMMU/AGP aperture. Most of the options enabled by this can
be set more finegrained using the iommu= command line
options. See Documentation/arch/x86/x86_64/boot-options.rst for more
options. See Documentation/admin-guide/kernel-parameters.txt for more
details.
config IOMMU_LEAK


@ -25,10 +25,6 @@
#include "efi.h"
#include <generated/compile.h>
#include <linux/module.h>
#include <linux/uts.h>
#include <linux/utsname.h>
#include <linux/ctype.h>
#include <generated/utsversion.h>
#include <generated/utsrelease.h>


@ -777,15 +777,10 @@ early_set_pages_state(unsigned long vaddr, unsigned long paddr,
val = sev_es_rd_ghcb_msr();
if (WARN(GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP,
"Wrong PSC response code: 0x%x\n",
(unsigned int)GHCB_RESP_CODE(val)))
if (GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP)
goto e_term;
if (WARN(GHCB_MSR_PSC_RESP_VAL(val),
"Failed to change page state to '%s' paddr 0x%lx error 0x%llx\n",
op == SNP_PAGE_STATE_PRIVATE ? "private" : "shared",
paddr, GHCB_MSR_PSC_RESP_VAL(val)))
if (GHCB_MSR_PSC_RESP_VAL(val))
goto e_term;
/* Page validation must be performed after changing to private */
@ -821,7 +816,7 @@ void __head early_snp_set_memory_private(unsigned long vaddr, unsigned long padd
early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_PRIVATE);
}
void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr,
void __head early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr,
unsigned long npages)
{
/*
@ -2361,8 +2356,8 @@ static __head void svsm_setup(struct cc_blob_sev_info *cc_info)
call.rax = SVSM_CORE_CALL(SVSM_CORE_REMAP_CA);
call.rcx = pa;
ret = svsm_perform_call_protocol(&call);
if (ret)
panic("Can't remap the SVSM CA, ret=%d, rax_out=0x%llx\n", ret, call.rax_out);
while (ret)
cpu_relax(); /* too early to panic */
RIP_REL_REF(boot_svsm_caa) = (struct svsm_ca *)pa;
RIP_REL_REF(boot_svsm_caa_pa) = pa;


@ -498,7 +498,7 @@ static const struct snp_cpuid_table *snp_cpuid_get_table(void)
*
* Return: XSAVE area size on success, 0 otherwise.
*/
static u32 snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted)
static u32 __head snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted)
{
const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
u64 xfeatures_found = 0;
@ -576,8 +576,9 @@ static void snp_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpui
sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID_HV);
}
static int snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
struct cpuid_leaf *leaf)
static int __head
snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
struct cpuid_leaf *leaf)
{
struct cpuid_leaf leaf_hv = *leaf;
@ -1243,7 +1244,7 @@ static void svsm_pval_terminate(struct svsm_pvalidate_call *pc, int ret, u64 svs
__pval_terminate(pfn, action, page_size, ret, svsm_ret);
}
static void svsm_pval_4k_page(unsigned long paddr, bool validate)
static void __head svsm_pval_4k_page(unsigned long paddr, bool validate)
{
struct svsm_pvalidate_call *pc;
struct svsm_call call = {};
@ -1275,12 +1276,13 @@ static void svsm_pval_4k_page(unsigned long paddr, bool validate)
ret = svsm_perform_call_protocol(&call);
if (ret)
svsm_pval_terminate(pc, ret, call.rax_out);
sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE);
native_local_irq_restore(flags);
}
static void pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, bool validate)
static void __head pvalidate_4k_page(unsigned long vaddr, unsigned long paddr,
bool validate)
{
int ret;
@ -1293,7 +1295,7 @@ static void pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, bool val
} else {
ret = pvalidate(vaddr, RMP_PG_SIZE_4K, validate);
if (ret)
__pval_terminate(PHYS_PFN(paddr), validate, RMP_PG_SIZE_4K, ret, 0);
sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE);
}
}


@ -1,3 +1,3 @@
# SPDX-License-Identifier: GPL-2.0
obj-y += tdx.o tdx-shared.o tdcall.o
obj-y += debug.o tdcall.o tdx.o tdx-shared.o

arch/x86/coco/tdx/debug.c (new file, 69 lines)

@ -0,0 +1,69 @@
// SPDX-License-Identifier: GPL-2.0
#undef pr_fmt
#define pr_fmt(fmt) "tdx: " fmt
#include <linux/array_size.h>
#include <linux/printk.h>
#include <asm/tdx.h>
#define DEF_TDX_ATTR_NAME(_name) [TDX_ATTR_##_name##_BIT] = __stringify(_name)
static __initdata const char *tdx_attributes[] = {
DEF_TDX_ATTR_NAME(DEBUG),
DEF_TDX_ATTR_NAME(HGS_PLUS_PROF),
DEF_TDX_ATTR_NAME(PERF_PROF),
DEF_TDX_ATTR_NAME(PMT_PROF),
DEF_TDX_ATTR_NAME(ICSSD),
DEF_TDX_ATTR_NAME(LASS),
DEF_TDX_ATTR_NAME(SEPT_VE_DISABLE),
DEF_TDX_ATTR_NAME(MIGRTABLE),
DEF_TDX_ATTR_NAME(PKS),
DEF_TDX_ATTR_NAME(KL),
DEF_TDX_ATTR_NAME(TPA),
DEF_TDX_ATTR_NAME(PERFMON),
};
#define DEF_TD_CTLS_NAME(_name) [TD_CTLS_##_name##_BIT] = __stringify(_name)
static __initdata const char *tdcs_td_ctls[] = {
DEF_TD_CTLS_NAME(PENDING_VE_DISABLE),
DEF_TD_CTLS_NAME(ENUM_TOPOLOGY),
DEF_TD_CTLS_NAME(VIRT_CPUID2),
DEF_TD_CTLS_NAME(REDUCE_VE),
DEF_TD_CTLS_NAME(LOCK),
};
void __init tdx_dump_attributes(u64 td_attr)
{
pr_info("Attributes:");
for (int i = 0; i < ARRAY_SIZE(tdx_attributes); i++) {
if (!tdx_attributes[i])
continue;
if (td_attr & BIT(i))
pr_cont(" %s", tdx_attributes[i]);
td_attr &= ~BIT(i);
}
if (td_attr)
pr_cont(" unknown:%#llx", td_attr);
pr_cont("\n");
}
void __init tdx_dump_td_ctls(u64 td_ctls)
{
pr_info("TD_CTLS:");
for (int i = 0; i < ARRAY_SIZE(tdcs_td_ctls); i++) {
if (!tdcs_td_ctls[i])
continue;
if (td_ctls & BIT(i))
pr_cont(" %s", tdcs_td_ctls[i]);
td_ctls &= ~BIT(i);
}
if (td_ctls)
pr_cont(" unknown:%#llx", td_ctls);
pr_cont("\n");
}


@ -32,9 +32,6 @@
#define VE_GET_PORT_NUM(e) ((e) >> 16)
#define VE_IS_IO_STRING(e) ((e) & BIT(4))
#define ATTR_DEBUG BIT(0)
#define ATTR_SEPT_VE_DISABLE BIT(28)
/* TDX Module call error codes */
#define TDCALL_RETURN_CODE(a) ((a) >> 32)
#define TDCALL_INVALID_OPERAND 0xc0000100
@ -200,14 +197,14 @@ static void __noreturn tdx_panic(const char *msg)
*
* TDX 1.0 does not allow the guest to disable SEPT #VE on its own. The VMM
* controls if the guest will receive such #VE with TD attribute
* ATTR_SEPT_VE_DISABLE.
* TDX_ATTR_SEPT_VE_DISABLE.
*
* Newer TDX modules allow the guest to control if it wants to receive SEPT
* violation #VEs.
*
* Check if the feature is available and disable SEPT #VE if possible.
*
* If the TD is allowed to disable/enable SEPT #VEs, the ATTR_SEPT_VE_DISABLE
* If the TD is allowed to disable/enable SEPT #VEs, the TDX_ATTR_SEPT_VE_DISABLE
* attribute is no longer reliable. It reflects the initial state of the
* control for the TD, but it will not be updated if someone (e.g. bootloader)
* changes it before the kernel starts. Kernel must check TDCS_TD_CTLS bit to
@ -216,14 +213,14 @@ static void __noreturn tdx_panic(const char *msg)
static void disable_sept_ve(u64 td_attr)
{
const char *msg = "TD misconfiguration: SEPT #VE has to be disabled";
bool debug = td_attr & ATTR_DEBUG;
bool debug = td_attr & TDX_ATTR_DEBUG;
u64 config, controls;
/* Is this TD allowed to disable SEPT #VE */
tdg_vm_rd(TDCS_CONFIG_FLAGS, &config);
if (!(config & TDCS_CONFIG_FLEXIBLE_PENDING_VE)) {
/* No SEPT #VE controls for the guest: check the attribute */
if (td_attr & ATTR_SEPT_VE_DISABLE)
if (td_attr & TDX_ATTR_SEPT_VE_DISABLE)
return;
/* Relax SEPT_VE_DISABLE check for debug TD for backtraces */
@ -274,6 +271,20 @@ static void enable_cpu_topology_enumeration(void)
tdg_vm_wr(TDCS_TD_CTLS, TD_CTLS_ENUM_TOPOLOGY, TD_CTLS_ENUM_TOPOLOGY);
}
static void reduce_unnecessary_ve(void)
{
u64 err = tdg_vm_wr(TDCS_TD_CTLS, TD_CTLS_REDUCE_VE, TD_CTLS_REDUCE_VE);
if (err == TDX_SUCCESS)
return;
/*
* Enabling REDUCE_VE includes ENUM_TOPOLOGY. Only try to
* enable ENUM_TOPOLOGY if REDUCE_VE was not successful.
*/
enable_cpu_topology_enumeration();
}
static void tdx_setup(u64 *cc_mask)
{
struct tdx_module_args args = {};
@ -305,7 +316,8 @@ static void tdx_setup(u64 *cc_mask)
tdg_vm_wr(TDCS_NOTIFY_ENABLES, 0, -1ULL);
disable_sept_ve(td_attr);
enable_cpu_topology_enumeration();
reduce_unnecessary_ve();
}
/*
@ -1025,6 +1037,20 @@ static void tdx_kexec_finish(void)
}
}
static __init void tdx_announce(void)
{
struct tdx_module_args args = {};
u64 controls;
pr_info("Guest detected\n");
tdcall(TDG_VP_INFO, &args);
tdx_dump_attributes(args.rdx);
tdg_vm_rd(TDCS_TD_CTLS, &controls);
tdx_dump_td_ctls(controls);
}
void __init tdx_early_init(void)
{
u64 cc_mask;
@ -1094,5 +1120,5 @@ void __init tdx_early_init(void)
*/
x86_cpuinit.parallel_bringup = false;
pr_info("Guest detected\n");
tdx_announce();
}


@ -308,10 +308,9 @@ SYM_CODE_END(xen_error_entry)
movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
.endif
call \cfunc
/* For some configurations \cfunc ends up being a noreturn. */
REACHABLE
ANNOTATE_REACHABLE
call \cfunc
jmp error_return
.endm
@ -529,10 +528,10 @@ SYM_CODE_START(\asmsym)
movq %rsp, %rdi /* pt_regs pointer into first argument */
movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/
movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
call \cfunc
/* For some configurations \cfunc ends up being a noreturn. */
REACHABLE
ANNOTATE_REACHABLE
call \cfunc
jmp paranoid_exit


@ -1001,8 +1001,7 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs)
if (!x86_perf_event_set_period(event))
continue;
if (has_branch_stack(event))
perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL);
perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL);
if (perf_event_overflow(event, &data, regs))
x86_pmu_stop(event, 0);


@ -31,6 +31,8 @@ static u32 ibs_caps;
#define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT)
#define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT
/* attr.config2 */
#define IBS_SW_FILTER_MASK 1
/*
* IBS states:
@ -290,6 +292,16 @@ static int perf_ibs_init(struct perf_event *event)
if (has_branch_stack(event))
return -EOPNOTSUPP;
/* handle exclude_{user,kernel} in the IRQ handler */
if (event->attr.exclude_host || event->attr.exclude_guest ||
event->attr.exclude_idle)
return -EINVAL;
if (!(event->attr.config2 & IBS_SW_FILTER_MASK) &&
(event->attr.exclude_kernel || event->attr.exclude_user ||
event->attr.exclude_hv))
return -EINVAL;
ret = validate_group(event);
if (ret)
return ret;
@ -550,24 +562,14 @@ static struct attribute *attrs_empty[] = {
NULL,
};
static struct attribute_group empty_format_group = {
.name = "format",
.attrs = attrs_empty,
};
static struct attribute_group empty_caps_group = {
.name = "caps",
.attrs = attrs_empty,
};
static const struct attribute_group *empty_attr_groups[] = {
&empty_format_group,
&empty_caps_group,
NULL,
};
PMU_FORMAT_ATTR(rand_en, "config:57");
PMU_FORMAT_ATTR(cnt_ctl, "config:19");
PMU_FORMAT_ATTR(swfilt, "config2:0");
PMU_EVENT_ATTR_STRING(l3missonly, fetch_l3missonly, "config:59");
PMU_EVENT_ATTR_STRING(l3missonly, op_l3missonly, "config:16");
PMU_EVENT_ATTR_STRING(zen4_ibs_extensions, zen4_ibs_extensions, "1");
@ -578,8 +580,9 @@ zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int
return ibs_caps & IBS_CAPS_ZEN4 ? attr->mode : 0;
}
static struct attribute *rand_en_attrs[] = {
static struct attribute *fetch_attrs[] = {
&format_attr_rand_en.attr,
&format_attr_swfilt.attr,
NULL,
};
@ -593,9 +596,9 @@ static struct attribute *zen4_ibs_extensions_attrs[] = {
NULL,
};
static struct attribute_group group_rand_en = {
static struct attribute_group group_fetch_formats = {
.name = "format",
.attrs = rand_en_attrs,
.attrs = fetch_attrs,
};
static struct attribute_group group_fetch_l3missonly = {
@ -611,7 +614,7 @@ static struct attribute_group group_zen4_ibs_extensions = {
};
static const struct attribute_group *fetch_attr_groups[] = {
&group_rand_en,
&group_fetch_formats,
&empty_caps_group,
NULL,
};
@ -628,6 +631,11 @@ cnt_ctl_is_visible(struct kobject *kobj, struct attribute *attr, int i)
return ibs_caps & IBS_CAPS_OPCNT ? attr->mode : 0;
}
static struct attribute *op_attrs[] = {
&format_attr_swfilt.attr,
NULL,
};
static struct attribute *cnt_ctl_attrs[] = {
&format_attr_cnt_ctl.attr,
NULL,
@ -638,6 +646,11 @@ static struct attribute *op_l3missonly_attrs[] = {
NULL,
};
static struct attribute_group group_op_formats = {
.name = "format",
.attrs = op_attrs,
};
static struct attribute_group group_cnt_ctl = {
.name = "format",
.attrs = cnt_ctl_attrs,
@ -650,6 +663,12 @@ static struct attribute_group group_op_l3missonly = {
.is_visible = zen4_ibs_extensions_is_visible,
};
static const struct attribute_group *op_attr_groups[] = {
&group_op_formats,
&empty_caps_group,
NULL,
};
static const struct attribute_group *op_attr_update[] = {
&group_cnt_ctl,
&group_op_l3missonly,
@ -667,7 +686,6 @@ static struct perf_ibs perf_ibs_fetch = {
.start = perf_ibs_start,
.stop = perf_ibs_stop,
.read = perf_ibs_read,
.capabilities = PERF_PMU_CAP_NO_EXCLUDE,
},
.msr = MSR_AMD64_IBSFETCHCTL,
.config_mask = IBS_FETCH_CONFIG_MASK,
@ -691,7 +709,6 @@ static struct perf_ibs perf_ibs_op = {
.start = perf_ibs_start,
.stop = perf_ibs_stop,
.read = perf_ibs_read,
.capabilities = PERF_PMU_CAP_NO_EXCLUDE,
},
.msr = MSR_AMD64_IBSOPCTL,
.config_mask = IBS_OP_CONFIG_MASK,
@ -1111,6 +1128,12 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
regs.flags |= PERF_EFLAGS_EXACT;
}
if ((event->attr.config2 & IBS_SW_FILTER_MASK) &&
perf_exclude_event(event, &regs)) {
throttle = perf_event_account_interrupt(event);
goto out;
}
if (event->attr.sample_type & PERF_SAMPLE_RAW) {
raw = (struct perf_raw_record){
.frag = {
@ -1118,7 +1141,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
.data = ibs_data.data,
},
};
perf_sample_save_raw_data(&data, &raw);
perf_sample_save_raw_data(&data, event, &raw);
}
if (perf_ibs == &perf_ibs_op)
@ -1129,8 +1152,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
* recorded as part of interrupt regs. Thus we need to use rip from
* interrupt regs while unwinding call stack.
*/
if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
perf_sample_save_callchain(&data, event, iregs);
perf_sample_save_callchain(&data, event, iregs);
throttle = perf_event_overflow(event, &data, &regs);
out:
@ -1228,7 +1250,7 @@ static __init int perf_ibs_op_init(void)
if (ibs_caps & IBS_CAPS_ZEN4)
perf_ibs_op.config_mask |= IBS_OP_L3MISSONLY;
perf_ibs_op.pmu.attr_groups = empty_attr_groups;
perf_ibs_op.pmu.attr_groups = op_attr_groups;
perf_ibs_op.pmu.attr_update = op_attr_update;
return perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");


@ -1707,8 +1707,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs)
perf_sample_data_init(&data, 0, event->hw.last_period);
if (has_branch_stack(event))
perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL);
perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL);
if (perf_event_overflow(event, &data, regs))
x86_pmu_stop(event, 0);


@ -5371,42 +5371,32 @@ static __init void intel_clovertown_quirk(void)
x86_pmu.pebs_constraints = NULL;
}
static const struct x86_cpu_desc isolation_ucodes[] = {
INTEL_CPU_DESC(INTEL_HASWELL, 3, 0x0000001f),
INTEL_CPU_DESC(INTEL_HASWELL_L, 1, 0x0000001e),
INTEL_CPU_DESC(INTEL_HASWELL_G, 1, 0x00000015),
INTEL_CPU_DESC(INTEL_HASWELL_X, 2, 0x00000037),
INTEL_CPU_DESC(INTEL_HASWELL_X, 4, 0x0000000a),
INTEL_CPU_DESC(INTEL_BROADWELL, 4, 0x00000023),
INTEL_CPU_DESC(INTEL_BROADWELL_G, 1, 0x00000014),
INTEL_CPU_DESC(INTEL_BROADWELL_D, 2, 0x00000010),
INTEL_CPU_DESC(INTEL_BROADWELL_D, 3, 0x07000009),
INTEL_CPU_DESC(INTEL_BROADWELL_D, 4, 0x0f000009),
INTEL_CPU_DESC(INTEL_BROADWELL_D, 5, 0x0e000002),
INTEL_CPU_DESC(INTEL_BROADWELL_X, 1, 0x0b000014),
INTEL_CPU_DESC(INTEL_SKYLAKE_X, 3, 0x00000021),
INTEL_CPU_DESC(INTEL_SKYLAKE_X, 4, 0x00000000),
INTEL_CPU_DESC(INTEL_SKYLAKE_X, 5, 0x00000000),
INTEL_CPU_DESC(INTEL_SKYLAKE_X, 6, 0x00000000),
INTEL_CPU_DESC(INTEL_SKYLAKE_X, 7, 0x00000000),
INTEL_CPU_DESC(INTEL_SKYLAKE_X, 11, 0x00000000),
INTEL_CPU_DESC(INTEL_SKYLAKE_L, 3, 0x0000007c),
INTEL_CPU_DESC(INTEL_SKYLAKE, 3, 0x0000007c),
INTEL_CPU_DESC(INTEL_KABYLAKE, 9, 0x0000004e),
INTEL_CPU_DESC(INTEL_KABYLAKE_L, 9, 0x0000004e),
INTEL_CPU_DESC(INTEL_KABYLAKE_L, 10, 0x0000004e),
INTEL_CPU_DESC(INTEL_KABYLAKE_L, 11, 0x0000004e),
INTEL_CPU_DESC(INTEL_KABYLAKE_L, 12, 0x0000004e),
INTEL_CPU_DESC(INTEL_KABYLAKE, 10, 0x0000004e),
INTEL_CPU_DESC(INTEL_KABYLAKE, 11, 0x0000004e),
INTEL_CPU_DESC(INTEL_KABYLAKE, 12, 0x0000004e),
INTEL_CPU_DESC(INTEL_KABYLAKE, 13, 0x0000004e),
static const struct x86_cpu_id isolation_ucodes[] = {
X86_MATCH_VFM_STEPS(INTEL_HASWELL, 3, 3, 0x0000001f),
X86_MATCH_VFM_STEPS(INTEL_HASWELL_L, 1, 1, 0x0000001e),
X86_MATCH_VFM_STEPS(INTEL_HASWELL_G, 1, 1, 0x00000015),
X86_MATCH_VFM_STEPS(INTEL_HASWELL_X, 2, 2, 0x00000037),
X86_MATCH_VFM_STEPS(INTEL_HASWELL_X, 4, 4, 0x0000000a),
X86_MATCH_VFM_STEPS(INTEL_BROADWELL, 4, 4, 0x00000023),
X86_MATCH_VFM_STEPS(INTEL_BROADWELL_G, 1, 1, 0x00000014),
X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 2, 2, 0x00000010),
X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 3, 3, 0x07000009),
X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 4, 4, 0x0f000009),
X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 5, 5, 0x0e000002),
X86_MATCH_VFM_STEPS(INTEL_BROADWELL_X, 1, 1, 0x0b000014),
X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 3, 3, 0x00000021),
X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 4, 7, 0x00000000),
X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 11, 11, 0x00000000),
X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_L, 3, 3, 0x0000007c),
X86_MATCH_VFM_STEPS(INTEL_SKYLAKE, 3, 3, 0x0000007c),
X86_MATCH_VFM_STEPS(INTEL_KABYLAKE, 9, 13, 0x0000004e),
X86_MATCH_VFM_STEPS(INTEL_KABYLAKE_L, 9, 12, 0x0000004e),
{}
};
static void intel_check_pebs_isolation(void)
{
x86_pmu.pebs_no_isolation = !x86_cpu_has_min_microcode_rev(isolation_ucodes);
x86_pmu.pebs_no_isolation = !x86_match_min_microcode_rev(isolation_ucodes);
}
static __init void intel_pebs_isolation_quirk(void)
@ -5416,16 +5406,16 @@ static __init void intel_pebs_isolation_quirk(void)
intel_check_pebs_isolation();
}
static const struct x86_cpu_desc pebs_ucodes[] = {
INTEL_CPU_DESC(INTEL_SANDYBRIDGE, 7, 0x00000028),
INTEL_CPU_DESC(INTEL_SANDYBRIDGE_X, 6, 0x00000618),
INTEL_CPU_DESC(INTEL_SANDYBRIDGE_X, 7, 0x0000070c),
static const struct x86_cpu_id pebs_ucodes[] = {
X86_MATCH_VFM_STEPS(INTEL_SANDYBRIDGE, 7, 7, 0x00000028),
X86_MATCH_VFM_STEPS(INTEL_SANDYBRIDGE_X, 6, 6, 0x00000618),
X86_MATCH_VFM_STEPS(INTEL_SANDYBRIDGE_X, 7, 7, 0x0000070c),
{}
};
static bool intel_snb_pebs_broken(void)
{
return !x86_cpu_has_min_microcode_rev(pebs_ucodes);
return !x86_match_min_microcode_rev(pebs_ucodes);
}
static void intel_snb_check_microcode(void)


@ -1789,8 +1789,7 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event,
* previous PMI context or an (I)RET happened between the record and
* PMI.
*/
if (sample_type & PERF_SAMPLE_CALLCHAIN)
perf_sample_save_callchain(data, event, iregs);
perf_sample_save_callchain(data, event, iregs);
/*
* We use the interrupt regs as a base because the PEBS record does not
@ -1889,8 +1888,7 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event,
if (x86_pmu.intel_cap.pebs_format >= 3)
setup_pebs_time(event, data, pebs->tsc);
if (has_branch_stack(event))
perf_sample_save_brstack(data, event, &cpuc->lbr_stack, NULL);
perf_sample_save_brstack(data, event, &cpuc->lbr_stack, NULL);
}
static void adaptive_pebs_save_regs(struct pt_regs *regs,
@ -1917,8 +1915,6 @@ static void adaptive_pebs_save_regs(struct pt_regs *regs,
}
#define PEBS_LATENCY_MASK 0xffff
#define PEBS_CACHE_LATENCY_OFFSET 32
#define PEBS_RETIRE_LATENCY_OFFSET 32
/*
* With adaptive PEBS the layout depends on what fields are configured.
@ -1932,8 +1928,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct pebs_basic *basic = __pebs;
void *next_record = basic + 1;
u64 sample_type;
u64 format_size;
u64 sample_type, format_group;
struct pebs_meminfo *meminfo = NULL;
struct pebs_gprs *gprs = NULL;
struct x86_perf_regs *perf_regs;
@ -1945,7 +1940,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
perf_regs->xmm_regs = NULL;
sample_type = event->attr.sample_type;
format_size = basic->format_size;
format_group = basic->format_group;
perf_sample_data_init(data, 0, event->hw.last_period);
data->period = event->hw.last_period;
@ -1957,8 +1952,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
* previous PMI context or an (I)RET happened between the record and
* PMI.
*/
if (sample_type & PERF_SAMPLE_CALLCHAIN)
perf_sample_save_callchain(data, event, iregs);
perf_sample_save_callchain(data, event, iregs);
*regs = *iregs;
/* The ip in basic is EventingIP */
@ -1967,7 +1961,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) {
if (x86_pmu.flags & PMU_FL_RETIRE_LATENCY)
data->weight.var3_w = format_size >> PEBS_RETIRE_LATENCY_OFFSET & PEBS_LATENCY_MASK;
data->weight.var3_w = basic->retire_latency;
else
data->weight.var3_w = 0;
}
@ -1977,12 +1971,12 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
* But PERF_SAMPLE_TRANSACTION needs gprs->ax.
* Save the pointer here but process later.
*/
if (format_size & PEBS_DATACFG_MEMINFO) {
if (format_group & PEBS_DATACFG_MEMINFO) {
meminfo = next_record;
next_record = meminfo + 1;
}
if (format_size & PEBS_DATACFG_GP) {
if (format_group & PEBS_DATACFG_GP) {
gprs = next_record;
next_record = gprs + 1;
@ -1995,14 +1989,13 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
adaptive_pebs_save_regs(regs, gprs);
}
if (format_size & PEBS_DATACFG_MEMINFO) {
if (format_group & PEBS_DATACFG_MEMINFO) {
if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) {
u64 weight = meminfo->latency;
u64 latency = x86_pmu.flags & PMU_FL_INSTR_LATENCY ?
meminfo->cache_latency : meminfo->mem_latency;
if (x86_pmu.flags & PMU_FL_INSTR_LATENCY) {
data->weight.var2_w = weight & PEBS_LATENCY_MASK;
weight >>= PEBS_CACHE_LATENCY_OFFSET;
}
if (x86_pmu.flags & PMU_FL_INSTR_LATENCY)
data->weight.var2_w = meminfo->instr_latency;
/*
* Although meminfo::latency is defined as a u64,
@ -2010,12 +2003,13 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
* in practice on Ice Lake and earlier platforms.
*/
if (sample_type & PERF_SAMPLE_WEIGHT) {
data->weight.full = weight ?:
data->weight.full = latency ?:
intel_get_tsx_weight(meminfo->tsx_tuning);
} else {
data->weight.var1_dw = (u32)(weight & PEBS_LATENCY_MASK) ?:
data->weight.var1_dw = (u32)latency ?:
intel_get_tsx_weight(meminfo->tsx_tuning);
}
data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE;
}
@ -2036,16 +2030,16 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
}
}
if (format_size & PEBS_DATACFG_XMMS) {
if (format_group & PEBS_DATACFG_XMMS) {
struct pebs_xmm *xmm = next_record;
next_record = xmm + 1;
perf_regs->xmm_regs = xmm->xmm;
}
if (format_size & PEBS_DATACFG_LBRS) {
if (format_group & PEBS_DATACFG_LBRS) {
struct lbr_entry *lbr = next_record;
int num_lbr = ((format_size >> PEBS_DATACFG_LBR_SHIFT)
int num_lbr = ((format_group >> PEBS_DATACFG_LBR_SHIFT)
& 0xff) + 1;
next_record = next_record + num_lbr * sizeof(struct lbr_entry);
@ -2055,11 +2049,11 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
}
}
WARN_ONCE(next_record != __pebs + (format_size >> 48),
"PEBS record size %llu, expected %llu, config %llx\n",
format_size >> 48,
WARN_ONCE(next_record != __pebs + basic->format_size,
"PEBS record size %u, expected %llu, config %llx\n",
basic->format_size,
(u64)(next_record - __pebs),
basic->format_size);
format_group);
}
static inline void *
@ -2170,46 +2164,33 @@ intel_pmu_save_and_restart_reload(struct perf_event *event, int count)
return 0;
}
typedef void (*setup_fn)(struct perf_event *, struct pt_regs *, void *,
struct perf_sample_data *, struct pt_regs *);
static struct pt_regs dummy_iregs;
static __always_inline void
__intel_pmu_pebs_event(struct perf_event *event,
struct pt_regs *iregs,
struct pt_regs *regs,
struct perf_sample_data *data,
void *base, void *top,
int bit, int count,
void (*setup_sample)(struct perf_event *,
struct pt_regs *,
void *,
struct perf_sample_data *,
struct pt_regs *))
void *at,
setup_fn setup_sample)
{
setup_sample(event, iregs, at, data, regs);
perf_event_output(event, data, regs);
}
static __always_inline void
__intel_pmu_pebs_last_event(struct perf_event *event,
struct pt_regs *iregs,
struct pt_regs *regs,
struct perf_sample_data *data,
void *at,
int count,
setup_fn setup_sample)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
struct x86_perf_regs perf_regs;
struct pt_regs *regs = &perf_regs.regs;
void *at = get_next_pebs_record_by_bit(base, top, bit);
static struct pt_regs dummy_iregs;
if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
/*
* Now, auto-reload is only enabled in fixed period mode.
* The reload value is always hwc->sample_period.
* May need to change it, if auto-reload is enabled in
* freq mode later.
*/
intel_pmu_save_and_restart_reload(event, count);
} else if (!intel_pmu_save_and_restart(event))
return;
if (!iregs)
iregs = &dummy_iregs;
while (count > 1) {
setup_sample(event, iregs, at, data, regs);
perf_event_output(event, data, regs);
at += cpuc->pebs_record_size;
at = get_next_pebs_record_by_bit(at, top, bit);
count--;
}
setup_sample(event, iregs, at, data, regs);
if (iregs == &dummy_iregs) {
@ -2228,6 +2209,44 @@ __intel_pmu_pebs_event(struct perf_event *event,
if (perf_event_overflow(event, data, regs))
x86_pmu_stop(event, 0);
}
if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
/*
* Now, auto-reload is only enabled in fixed period mode.
* The reload value is always hwc->sample_period.
* May need to change it, if auto-reload is enabled in
* freq mode later.
*/
intel_pmu_save_and_restart_reload(event, count);
} else
intel_pmu_save_and_restart(event);
}
static __always_inline void
__intel_pmu_pebs_events(struct perf_event *event,
struct pt_regs *iregs,
struct perf_sample_data *data,
void *base, void *top,
int bit, int count,
setup_fn setup_sample)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct x86_perf_regs perf_regs;
struct pt_regs *regs = &perf_regs.regs;
void *at = get_next_pebs_record_by_bit(base, top, bit);
int cnt = count;
if (!iregs)
iregs = &dummy_iregs;
while (cnt > 1) {
__intel_pmu_pebs_event(event, iregs, regs, data, at, setup_sample);
at += cpuc->pebs_record_size;
at = get_next_pebs_record_by_bit(at, top, bit);
cnt--;
}
__intel_pmu_pebs_last_event(event, iregs, regs, data, at, count, setup_sample);
}
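
The event/last-event split above follows a familiar drain pattern: all but the newest buffered record become plain samples, and only the final record drives the counter save/restart and overflow handling. A standalone sketch of that control flow, with purely illustrative names rather than the kernel's:

#include <stdio.h>

static void emit_sample(int rec)      { printf("sample from record %d\n", rec); }
static void emit_last_sample(int rec) { printf("last record %d: sample + counter reload\n", rec); }

/* Process 'count' buffered records: all but the last are ordinary samples,
 * the last one also restarts the counter. */
static void drain(int count)
{
	int rec = 0;

	while (count > 1) {
		emit_sample(rec++);
		count--;
	}
	emit_last_sample(rec);
}

int main(void) { drain(4); return 0; }
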
static void intel_pmu_drain_pebs_core(struct pt_regs *iregs, struct perf_sample_data *data)
@ -2264,8 +2283,8 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs, struct perf_sample_
return;
}
__intel_pmu_pebs_event(event, iregs, data, at, top, 0, n,
setup_pebs_fixed_sample_data);
__intel_pmu_pebs_events(event, iregs, data, at, top, 0, n,
setup_pebs_fixed_sample_data);
}
static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, int size)
@ -2396,9 +2415,9 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d
}
if (counts[bit]) {
__intel_pmu_pebs_event(event, iregs, data, base,
top, bit, counts[bit],
setup_pebs_fixed_sample_data);
__intel_pmu_pebs_events(event, iregs, data, base,
top, bit, counts[bit],
setup_pebs_fixed_sample_data);
}
}
}
@ -2406,8 +2425,12 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d
static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_data *data)
{
short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {};
void *last[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS];
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct debug_store *ds = cpuc->ds;
struct x86_perf_regs perf_regs;
struct pt_regs *regs = &perf_regs.regs;
struct pebs_basic *basic;
struct perf_event *event;
void *base, *at, *top;
int bit;
@ -2429,30 +2452,41 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d
return;
}
for (at = base; at < top; at += cpuc->pebs_record_size) {
if (!iregs)
iregs = &dummy_iregs;
/* Process all but the last event for each counter. */
for (at = base; at < top; at += basic->format_size) {
u64 pebs_status;
pebs_status = get_pebs_status(at) & cpuc->pebs_enabled;
pebs_status &= mask;
basic = at;
if (basic->format_size != cpuc->pebs_record_size)
continue;
for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX)
counts[bit]++;
pebs_status = basic->applicable_counters & cpuc->pebs_enabled & mask;
for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX) {
event = cpuc->events[bit];
if (WARN_ON_ONCE(!event) ||
WARN_ON_ONCE(!event->attr.precise_ip))
continue;
if (counts[bit]++) {
__intel_pmu_pebs_event(event, iregs, regs, data, last[bit],
setup_pebs_adaptive_sample_data);
}
last[bit] = at;
}
}
for_each_set_bit(bit, (unsigned long *)&mask, X86_PMC_IDX_MAX) {
if (counts[bit] == 0)
if (!counts[bit])
continue;
event = cpuc->events[bit];
if (WARN_ON_ONCE(!event))
continue;
if (WARN_ON_ONCE(!event->attr.precise_ip))
continue;
__intel_pmu_pebs_event(event, iregs, data, base,
top, bit, counts[bit],
setup_pebs_adaptive_sample_data);
__intel_pmu_pebs_last_event(event, iregs, regs, data, last[bit],
counts[bit], setup_pebs_adaptive_sample_data);
}
}

View File

@ -18,6 +18,7 @@
#include <linux/slab.h>
#include <linux/device.h>
#include <asm/cpuid.h>
#include <asm/perf_event.h>
#include <asm/insn.h>
#include <asm/io.h>
@ -201,10 +202,10 @@ static int __init pt_pmu_hw_init(void)
* otherwise, zero for numerator stands for "not enumerated"
* as per SDM
*/
if (boot_cpu_data.cpuid_level >= CPUID_TSC_LEAF) {
if (boot_cpu_data.cpuid_level >= CPUID_LEAF_TSC) {
u32 eax, ebx, ecx, edx;
cpuid(CPUID_TSC_LEAF, &eax, &ebx, &ecx, &edx);
cpuid(CPUID_LEAF_TSC, &eax, &ebx, &ecx, &edx);
pt_pmu.tsc_art_num = ebx;
pt_pmu.tsc_art_den = eax;
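
For context, the ratio read here can be turned into a TSC frequency as in the short sketch below. The CPUID.15H values are assumed, illustrative numbers: EBX/EAX is the TSC to crystal-clock ratio, ECX the crystal frequency in Hz, and a zero numerator means the ratio is not enumerated.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Assumed CPUID.15H output, purely for illustration. */
	uint32_t eax = 2, ebx = 168, ecx = 38400000;

	if (!ebx) {
		puts("TSC/crystal ratio not enumerated");
		return 0;
	}

	uint64_t tsc_hz = (uint64_t)ecx * ebx / eax;
	printf("TSC frequency: %llu Hz\n", (unsigned long long)tsc_hz);
	return 0;
}
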

View File

@ -37,9 +37,6 @@ struct topa_entry {
u64 rsvd4 : 12;
};
/* TSC to Core Crystal Clock Ratio */
#define CPUID_TSC_LEAF 0x15
struct pt_pmu {
struct pmu pmu;
u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES];

View File

@ -39,6 +39,10 @@
* event: rapl_energy_psys
* perf code: 0x5
*
* core counter: consumption of a single physical core
* event: rapl_energy_core (power_core PMU)
* perf code: 0x1
*
* We manage those counters as free running (read-only). They may be
* used simultaneously by other tools, such as turbostat.
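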
*
@ -70,18 +74,22 @@ MODULE_LICENSE("GPL");
/*
* RAPL energy status counters
*/
enum perf_rapl_events {
enum perf_rapl_pkg_events {
PERF_RAPL_PP0 = 0, /* all cores */
PERF_RAPL_PKG, /* entire package */
PERF_RAPL_RAM, /* DRAM */
PERF_RAPL_PP1, /* gpu */
PERF_RAPL_PSYS, /* psys */
PERF_RAPL_MAX,
NR_RAPL_DOMAINS = PERF_RAPL_MAX,
PERF_RAPL_PKG_EVENTS_MAX,
NR_RAPL_PKG_DOMAINS = PERF_RAPL_PKG_EVENTS_MAX,
};
static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
#define PERF_RAPL_CORE 0 /* single core */
#define PERF_RAPL_CORE_EVENTS_MAX 1
#define NR_RAPL_CORE_DOMAINS PERF_RAPL_CORE_EVENTS_MAX
static const char *const rapl_pkg_domain_names[NR_RAPL_PKG_DOMAINS] __initconst = {
"pp0-core",
"package",
"dram",
@ -89,6 +97,8 @@ static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
"psys",
};
static const char *const rapl_core_domain_name __initconst = "core";
/*
* event code: LSB 8 bits, passed in attr->config
* any other bit is reserved
@ -112,7 +122,7 @@ static struct perf_pmu_events_attr event_attr_##v = { \
* considered as either pkg-scope or die-scope, and we are considering
* them as die-scope.
*/
#define rapl_pmu_is_pkg_scope() \
#define rapl_pkg_pmu_is_pkg_scope() \
(boot_cpu_data.x86_vendor == X86_VENDOR_AMD || \
boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
@ -129,7 +139,8 @@ struct rapl_pmu {
struct rapl_pmus {
struct pmu pmu;
unsigned int nr_rapl_pmu;
struct rapl_pmu *pmus[] __counted_by(nr_rapl_pmu);
unsigned int cntr_mask;
struct rapl_pmu *rapl_pmu[] __counted_by(nr_rapl_pmu);
};
enum rapl_unit_quirk {
@ -139,44 +150,43 @@ enum rapl_unit_quirk {
};
struct rapl_model {
struct perf_msr *rapl_msrs;
unsigned long events;
struct perf_msr *rapl_pkg_msrs;
struct perf_msr *rapl_core_msrs;
unsigned long pkg_events;
unsigned long core_events;
unsigned int msr_power_unit;
enum rapl_unit_quirk unit_quirk;
};
/* 1/2^hw_unit Joule */
static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly;
static struct rapl_pmus *rapl_pmus;
static unsigned int rapl_cntr_mask;
static int rapl_pkg_hw_unit[NR_RAPL_PKG_DOMAINS] __read_mostly;
static int rapl_core_hw_unit __read_mostly;
static struct rapl_pmus *rapl_pmus_pkg;
static struct rapl_pmus *rapl_pmus_core;
static u64 rapl_timer_ms;
static struct perf_msr *rapl_msrs;
static struct rapl_model *rapl_model;
/*
* Helper functions to get the correct topology macros according to the
* Helper function to get the correct topology id according to the
* RAPL PMU scope.
*/
static inline unsigned int get_rapl_pmu_idx(int cpu)
static inline unsigned int get_rapl_pmu_idx(int cpu, int scope)
{
return rapl_pmu_is_pkg_scope() ? topology_logical_package_id(cpu) :
topology_logical_die_id(cpu);
}
static inline const struct cpumask *get_rapl_pmu_cpumask(int cpu)
{
return rapl_pmu_is_pkg_scope() ? topology_core_cpumask(cpu) :
topology_die_cpumask(cpu);
}
static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu)
{
unsigned int rapl_pmu_idx = get_rapl_pmu_idx(cpu);
/*
* The unsigned check also catches the '-1' return value for non
* existent mappings in the topology map.
* Returns unsigned int, which converts the '-1' return value
* (for non-existent mappings in topology map) to UINT_MAX, so
* the error check in the caller is simplified.
*/
return rapl_pmu_idx < rapl_pmus->nr_rapl_pmu ? rapl_pmus->pmus[rapl_pmu_idx] : NULL;
switch (scope) {
case PERF_PMU_SCOPE_PKG:
return topology_logical_package_id(cpu);
case PERF_PMU_SCOPE_DIE:
return topology_logical_die_id(cpu);
case PERF_PMU_SCOPE_CORE:
return topology_logical_core_id(cpu);
default:
return -EINVAL;
}
}
static inline u64 rapl_read_counter(struct perf_event *event)
@ -186,19 +196,20 @@ static inline u64 rapl_read_counter(struct perf_event *event)
return raw;
}
static inline u64 rapl_scale(u64 v, int cfg)
static inline u64 rapl_scale(u64 v, struct perf_event *event)
{
if (cfg > NR_RAPL_DOMAINS) {
pr_warn("Invalid domain %d, failed to scale data\n", cfg);
return v;
}
int hw_unit = rapl_pkg_hw_unit[event->hw.config - 1];
if (event->pmu->scope == PERF_PMU_SCOPE_CORE)
hw_unit = rapl_core_hw_unit;
/*
* scale delta to smallest unit (1/2^32)
* users must then scale back: count * 1/(1e9*2^32) to get Joules
* or use ldexp(count, -32).
* Watts = Joules/Time delta
*/
return v << (32 - rapl_hw_unit[cfg - 1]);
return v << (32 - hw_unit);
}
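
A small worked example of the scaling above, assuming a hardware energy-status unit of 2^-14 J (an assumed value, not one read from the power-unit MSR): the shift normalizes the delta so one tick is 2^-32 J, and a consumer converts back to Joules with ldexp(count, -32).

#include <math.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int hw_unit = 14;			/* assumed ESU: 1 tick = 2^-14 J */
	uint64_t delta = 1000;			/* raw counter ticks */

	/* Kernel-side normalization: shift so one tick is 2^-32 J. */
	uint64_t scaled = delta << (32 - hw_unit);

	/* User-side conversion back to Joules: count * 2^-32. */
	double joules = ldexp((double)scaled, -32);

	printf("%llu raw ticks -> %.6f J\n", (unsigned long long)delta, joules);
	return 0;
}
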
static u64 rapl_event_update(struct perf_event *event)
@ -225,7 +236,7 @@ static u64 rapl_event_update(struct perf_event *event)
delta = (new_raw_count << shift) - (prev_raw_count << shift);
delta >>= shift;
sdelta = rapl_scale(delta, event->hw.config);
sdelta = rapl_scale(delta, event);
local64_add(sdelta, &event->count);
@ -240,34 +251,34 @@ static void rapl_start_hrtimer(struct rapl_pmu *pmu)
static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
{
struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer);
struct rapl_pmu *rapl_pmu = container_of(hrtimer, struct rapl_pmu, hrtimer);
struct perf_event *event;
unsigned long flags;
if (!pmu->n_active)
if (!rapl_pmu->n_active)
return HRTIMER_NORESTART;
raw_spin_lock_irqsave(&pmu->lock, flags);
raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
list_for_each_entry(event, &pmu->active_list, active_entry)
list_for_each_entry(event, &rapl_pmu->active_list, active_entry)
rapl_event_update(event);
raw_spin_unlock_irqrestore(&pmu->lock, flags);
raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
hrtimer_forward_now(hrtimer, pmu->timer_interval);
hrtimer_forward_now(hrtimer, rapl_pmu->timer_interval);
return HRTIMER_RESTART;
}
static void rapl_hrtimer_init(struct rapl_pmu *pmu)
static void rapl_hrtimer_init(struct rapl_pmu *rapl_pmu)
{
struct hrtimer *hr = &pmu->hrtimer;
struct hrtimer *hr = &rapl_pmu->hrtimer;
hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
hr->function = rapl_hrtimer_handle;
}
static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
static void __rapl_pmu_event_start(struct rapl_pmu *rapl_pmu,
struct perf_event *event)
{
if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
@ -275,39 +286,39 @@ static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
event->hw.state = 0;
list_add_tail(&event->active_entry, &pmu->active_list);
list_add_tail(&event->active_entry, &rapl_pmu->active_list);
local64_set(&event->hw.prev_count, rapl_read_counter(event));
pmu->n_active++;
if (pmu->n_active == 1)
rapl_start_hrtimer(pmu);
rapl_pmu->n_active++;
if (rapl_pmu->n_active == 1)
rapl_start_hrtimer(rapl_pmu);
}
static void rapl_pmu_event_start(struct perf_event *event, int mode)
{
struct rapl_pmu *pmu = event->pmu_private;
struct rapl_pmu *rapl_pmu = event->pmu_private;
unsigned long flags;
raw_spin_lock_irqsave(&pmu->lock, flags);
__rapl_pmu_event_start(pmu, event);
raw_spin_unlock_irqrestore(&pmu->lock, flags);
raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
__rapl_pmu_event_start(rapl_pmu, event);
raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
}
static void rapl_pmu_event_stop(struct perf_event *event, int mode)
{
struct rapl_pmu *pmu = event->pmu_private;
struct rapl_pmu *rapl_pmu = event->pmu_private;
struct hw_perf_event *hwc = &event->hw;
unsigned long flags;
raw_spin_lock_irqsave(&pmu->lock, flags);
raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
/* mark event as deactivated and stopped */
if (!(hwc->state & PERF_HES_STOPPED)) {
WARN_ON_ONCE(pmu->n_active <= 0);
pmu->n_active--;
if (pmu->n_active == 0)
hrtimer_cancel(&pmu->hrtimer);
WARN_ON_ONCE(rapl_pmu->n_active <= 0);
rapl_pmu->n_active--;
if (rapl_pmu->n_active == 0)
hrtimer_cancel(&rapl_pmu->hrtimer);
list_del(&event->active_entry);
@ -325,23 +336,23 @@ static void rapl_pmu_event_stop(struct perf_event *event, int mode)
hwc->state |= PERF_HES_UPTODATE;
}
raw_spin_unlock_irqrestore(&pmu->lock, flags);
raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
}
static int rapl_pmu_event_add(struct perf_event *event, int mode)
{
struct rapl_pmu *pmu = event->pmu_private;
struct rapl_pmu *rapl_pmu = event->pmu_private;
struct hw_perf_event *hwc = &event->hw;
unsigned long flags;
raw_spin_lock_irqsave(&pmu->lock, flags);
raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
if (mode & PERF_EF_START)
__rapl_pmu_event_start(pmu, event);
__rapl_pmu_event_start(rapl_pmu, event);
raw_spin_unlock_irqrestore(&pmu->lock, flags);
raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
return 0;
}
@ -354,12 +365,14 @@ static void rapl_pmu_event_del(struct perf_event *event, int flags)
static int rapl_pmu_event_init(struct perf_event *event)
{
u64 cfg = event->attr.config & RAPL_EVENT_MASK;
int bit, ret = 0;
struct rapl_pmu *pmu;
int bit, rapl_pmus_scope, ret = 0;
struct rapl_pmu *rapl_pmu;
unsigned int rapl_pmu_idx;
struct rapl_pmus *rapl_pmus;
/* only look at RAPL events */
if (event->attr.type != rapl_pmus->pmu.type)
return -ENOENT;
/* unsupported modes and filters */
if (event->attr.sample_period) /* no sampling */
return -EINVAL;
/* check only supported bits are set */
if (event->attr.config & ~RAPL_EVENT_MASK)
@ -368,26 +381,49 @@ static int rapl_pmu_event_init(struct perf_event *event)
if (event->cpu < 0)
return -EINVAL;
if (!cfg || cfg >= NR_RAPL_DOMAINS + 1)
rapl_pmus = container_of(event->pmu, struct rapl_pmus, pmu);
if (!rapl_pmus)
return -EINVAL;
rapl_pmus_scope = rapl_pmus->pmu.scope;
cfg = array_index_nospec((long)cfg, NR_RAPL_DOMAINS + 1);
bit = cfg - 1;
if (rapl_pmus_scope == PERF_PMU_SCOPE_PKG || rapl_pmus_scope == PERF_PMU_SCOPE_DIE) {
/* only look at RAPL package events */
if (event->attr.type != rapl_pmus_pkg->pmu.type)
return -ENOENT;
cfg = array_index_nospec((long)cfg, NR_RAPL_PKG_DOMAINS + 1);
if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1)
return -EINVAL;
bit = cfg - 1;
event->hw.event_base = rapl_model->rapl_pkg_msrs[bit].msr;
} else if (rapl_pmus_scope == PERF_PMU_SCOPE_CORE) {
/* only look at RAPL core events */
if (event->attr.type != rapl_pmus_core->pmu.type)
return -ENOENT;
cfg = array_index_nospec((long)cfg, NR_RAPL_CORE_DOMAINS + 1);
if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1)
return -EINVAL;
bit = cfg - 1;
event->hw.event_base = rapl_model->rapl_core_msrs[bit].msr;
} else
return -EINVAL;
/* check event supported */
if (!(rapl_cntr_mask & (1 << bit)))
if (!(rapl_pmus->cntr_mask & (1 << bit)))
return -EINVAL;
/* unsupported modes and filters */
if (event->attr.sample_period) /* no sampling */
rapl_pmu_idx = get_rapl_pmu_idx(event->cpu, rapl_pmus_scope);
if (rapl_pmu_idx >= rapl_pmus->nr_rapl_pmu)
return -EINVAL;
/* must be done before validate_group */
pmu = cpu_to_rapl_pmu(event->cpu);
if (!pmu)
rapl_pmu = rapl_pmus->rapl_pmu[rapl_pmu_idx];
if (!rapl_pmu)
return -EINVAL;
event->pmu_private = pmu;
event->hw.event_base = rapl_msrs[bit].msr;
event->pmu_private = rapl_pmu;
event->hw.config = cfg;
event->hw.idx = bit;
@ -404,12 +440,14 @@ RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02");
RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03");
RAPL_EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04");
RAPL_EVENT_ATTR_STR(energy-psys, rapl_psys, "event=0x05");
RAPL_EVENT_ATTR_STR(energy-core, rapl_core, "event=0x01");
RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-psys.unit, rapl_psys_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-core.unit, rapl_core_unit, "Joules");
/*
* we compute in 0.23 nJ increments regardless of MSR
@ -419,6 +457,7 @@ RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890
RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-psys.scale, rapl_psys_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-core.scale, rapl_core_scale, "2.3283064365386962890625e-10");
/*
* There are no default events, but we need to create
@ -451,6 +490,12 @@ static const struct attribute_group *rapl_attr_groups[] = {
NULL,
};
static const struct attribute_group *rapl_core_attr_groups[] = {
&rapl_pmu_format_group,
&rapl_pmu_events_group,
NULL,
};
static struct attribute *rapl_events_cores[] = {
EVENT_PTR(rapl_cores),
EVENT_PTR(rapl_cores_unit),
@ -511,6 +556,18 @@ static struct attribute_group rapl_events_psys_group = {
.attrs = rapl_events_psys,
};
static struct attribute *rapl_events_core[] = {
EVENT_PTR(rapl_core),
EVENT_PTR(rapl_core_unit),
EVENT_PTR(rapl_core_scale),
NULL,
};
static struct attribute_group rapl_events_core_group = {
.name = "events",
.attrs = rapl_events_core,
};
static bool test_msr(int idx, void *data)
{
return test_bit(idx, (unsigned long *) data);
@ -536,11 +593,11 @@ static struct perf_msr intel_rapl_spr_msrs[] = {
};
/*
* Force to PERF_RAPL_MAX size due to:
* - perf_msr_probe(PERF_RAPL_MAX)
* Force to PERF_RAPL_PKG_EVENTS_MAX size due to:
* - perf_msr_probe(PERF_RAPL_PKG_EVENTS_MAX)
* - want to use same event codes across both architectures
*/
static struct perf_msr amd_rapl_msrs[] = {
static struct perf_msr amd_rapl_pkg_msrs[] = {
[PERF_RAPL_PP0] = { 0, &rapl_events_cores_group, NULL, false, 0 },
[PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK },
[PERF_RAPL_RAM] = { 0, &rapl_events_ram_group, NULL, false, 0 },
@ -548,18 +605,25 @@ static struct perf_msr amd_rapl_msrs[] = {
[PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, NULL, false, 0 },
};
static int rapl_check_hw_unit(struct rapl_model *rm)
static struct perf_msr amd_rapl_core_msrs[] = {
[PERF_RAPL_CORE] = { MSR_AMD_CORE_ENERGY_STATUS, &rapl_events_core_group,
test_msr, false, RAPL_MSR_MASK },
};
static int rapl_check_hw_unit(void)
{
u64 msr_rapl_power_unit_bits;
int i;
/* protect rdmsrl() to handle virtualization */
if (rdmsrl_safe(rm->msr_power_unit, &msr_rapl_power_unit_bits))
if (rdmsrl_safe(rapl_model->msr_power_unit, &msr_rapl_power_unit_bits))
return -1;
for (i = 0; i < NR_RAPL_DOMAINS; i++)
rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++)
rapl_pkg_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
switch (rm->unit_quirk) {
rapl_core_hw_unit = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
switch (rapl_model->unit_quirk) {
/*
* DRAM domain on HSW server and KNL has fixed energy unit which can be
* different than the unit from power unit MSR. See
@ -567,17 +631,16 @@ static int rapl_check_hw_unit(struct rapl_model *rm)
* of 2. Datasheet, September 2014, Reference Number: 330784-001 "
*/
case RAPL_UNIT_QUIRK_INTEL_HSW:
rapl_hw_unit[PERF_RAPL_RAM] = 16;
rapl_pkg_hw_unit[PERF_RAPL_RAM] = 16;
break;
/* SPR uses a fixed energy unit for Psys domain. */
case RAPL_UNIT_QUIRK_INTEL_SPR:
rapl_hw_unit[PERF_RAPL_PSYS] = 0;
rapl_pkg_hw_unit[PERF_RAPL_PSYS] = 0;
break;
default:
break;
}
/*
* Calculate the timer rate:
* Use reference of 200W for scaling the timeout to avoid counter
@ -586,9 +649,9 @@ static int rapl_check_hw_unit(struct rapl_model *rm)
* if hw unit is 32, then we use 2 ms 1/200/2
*/
rapl_timer_ms = 2;
if (rapl_hw_unit[0] < 32) {
if (rapl_pkg_hw_unit[0] < 32) {
rapl_timer_ms = (1000 / (2 * 100));
rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1));
rapl_timer_ms *= (1ULL << (32 - rapl_pkg_hw_unit[0] - 1));
}
return 0;
}
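
Worked through with the same assumed 2^-14 J unit (illustrative only): a 32-bit counter then wraps after 2^18 J, roughly 262 kJ, which at the 200 W reference takes about 1311 s, so the computed interval lands at about half of that.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int hw_unit = 14;	/* assumed energy-status unit: 1 tick = 2^-14 J */

	/* Same arithmetic as above: ms per joule at the 200 W reference,
	 * times half of the counter range expressed in joules. */
	uint64_t timer_ms = 1000 / (2 * 100);
	timer_ms *= 1ULL << (32 - hw_unit - 1);

	printf("hrtimer interval: %llu ms (~%.1f s)\n",
	       (unsigned long long)timer_ms, timer_ms / 1000.0);
	return 0;
}
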
@ -596,24 +659,32 @@ static int rapl_check_hw_unit(struct rapl_model *rm)
static void __init rapl_advertise(void)
{
int i;
int num_counters = hweight32(rapl_pmus_pkg->cntr_mask);
if (rapl_pmus_core)
num_counters += hweight32(rapl_pmus_core->cntr_mask);
pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n",
hweight32(rapl_cntr_mask), rapl_timer_ms);
num_counters, rapl_timer_ms);
for (i = 0; i < NR_RAPL_DOMAINS; i++) {
if (rapl_cntr_mask & (1 << i)) {
for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) {
if (rapl_pmus_pkg->cntr_mask & (1 << i)) {
pr_info("hw unit of domain %s 2^-%d Joules\n",
rapl_domain_names[i], rapl_hw_unit[i]);
rapl_pkg_domain_names[i], rapl_pkg_hw_unit[i]);
}
}
if (rapl_pmus_core && (rapl_pmus_core->cntr_mask & (1 << PERF_RAPL_CORE)))
pr_info("hw unit of domain %s 2^-%d Joules\n",
rapl_core_domain_name, rapl_core_hw_unit);
}
static void cleanup_rapl_pmus(void)
static void cleanup_rapl_pmus(struct rapl_pmus *rapl_pmus)
{
int i;
for (i = 0; i < rapl_pmus->nr_rapl_pmu; i++)
kfree(rapl_pmus->pmus[i]);
kfree(rapl_pmus->rapl_pmu[i]);
kfree(rapl_pmus);
}
@ -626,46 +697,60 @@ static const struct attribute_group *rapl_attr_update[] = {
NULL,
};
static int __init init_rapl_pmu(void)
static const struct attribute_group *rapl_core_attr_update[] = {
&rapl_events_core_group,
NULL,
};
static int __init init_rapl_pmu(struct rapl_pmus *rapl_pmus)
{
struct rapl_pmu *pmu;
struct rapl_pmu *rapl_pmu;
int idx;
for (idx = 0; idx < rapl_pmus->nr_rapl_pmu; idx++) {
pmu = kzalloc(sizeof(*pmu), GFP_KERNEL);
if (!pmu)
rapl_pmu = kzalloc(sizeof(*rapl_pmu), GFP_KERNEL);
if (!rapl_pmu)
goto free;
raw_spin_lock_init(&pmu->lock);
INIT_LIST_HEAD(&pmu->active_list);
pmu->pmu = &rapl_pmus->pmu;
pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
rapl_hrtimer_init(pmu);
raw_spin_lock_init(&rapl_pmu->lock);
INIT_LIST_HEAD(&rapl_pmu->active_list);
rapl_pmu->pmu = &rapl_pmus->pmu;
rapl_pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
rapl_hrtimer_init(rapl_pmu);
rapl_pmus->pmus[idx] = pmu;
rapl_pmus->rapl_pmu[idx] = rapl_pmu;
}
return 0;
free:
for (; idx > 0; idx--)
kfree(rapl_pmus->pmus[idx - 1]);
kfree(rapl_pmus->rapl_pmu[idx - 1]);
return -ENOMEM;
}
static int __init init_rapl_pmus(void)
static int __init init_rapl_pmus(struct rapl_pmus **rapl_pmus_ptr, int rapl_pmu_scope,
const struct attribute_group **rapl_attr_groups,
const struct attribute_group **rapl_attr_update)
{
int nr_rapl_pmu = topology_max_packages();
int rapl_pmu_scope = PERF_PMU_SCOPE_PKG;
struct rapl_pmus *rapl_pmus;
if (!rapl_pmu_is_pkg_scope()) {
nr_rapl_pmu *= topology_max_dies_per_package();
rapl_pmu_scope = PERF_PMU_SCOPE_DIE;
}
/*
* rapl_pmu_scope must be either PKG, DIE or CORE
*/
if (rapl_pmu_scope == PERF_PMU_SCOPE_DIE)
nr_rapl_pmu *= topology_max_dies_per_package();
else if (rapl_pmu_scope == PERF_PMU_SCOPE_CORE)
nr_rapl_pmu *= topology_num_cores_per_package();
else if (rapl_pmu_scope != PERF_PMU_SCOPE_PKG)
return -EINVAL;
rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus, nr_rapl_pmu), GFP_KERNEL);
rapl_pmus = kzalloc(struct_size(rapl_pmus, rapl_pmu, nr_rapl_pmu), GFP_KERNEL);
if (!rapl_pmus)
return -ENOMEM;
*rapl_pmus_ptr = rapl_pmus;
rapl_pmus->nr_rapl_pmu = nr_rapl_pmu;
rapl_pmus->pmu.attr_groups = rapl_attr_groups;
rapl_pmus->pmu.attr_update = rapl_attr_update;
@ -680,75 +765,77 @@ static int __init init_rapl_pmus(void)
rapl_pmus->pmu.module = THIS_MODULE;
rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE;
return init_rapl_pmu();
return init_rapl_pmu(rapl_pmus);
}
static struct rapl_model model_snb = {
.events = BIT(PERF_RAPL_PP0) |
.pkg_events = BIT(PERF_RAPL_PP0) |
BIT(PERF_RAPL_PKG) |
BIT(PERF_RAPL_PP1),
.msr_power_unit = MSR_RAPL_POWER_UNIT,
.rapl_msrs = intel_rapl_msrs,
.rapl_pkg_msrs = intel_rapl_msrs,
};
static struct rapl_model model_snbep = {
.events = BIT(PERF_RAPL_PP0) |
.pkg_events = BIT(PERF_RAPL_PP0) |
BIT(PERF_RAPL_PKG) |
BIT(PERF_RAPL_RAM),
.msr_power_unit = MSR_RAPL_POWER_UNIT,
.rapl_msrs = intel_rapl_msrs,
.rapl_pkg_msrs = intel_rapl_msrs,
};
static struct rapl_model model_hsw = {
.events = BIT(PERF_RAPL_PP0) |
.pkg_events = BIT(PERF_RAPL_PP0) |
BIT(PERF_RAPL_PKG) |
BIT(PERF_RAPL_RAM) |
BIT(PERF_RAPL_PP1),
.msr_power_unit = MSR_RAPL_POWER_UNIT,
.rapl_msrs = intel_rapl_msrs,
.rapl_pkg_msrs = intel_rapl_msrs,
};
static struct rapl_model model_hsx = {
.events = BIT(PERF_RAPL_PP0) |
.pkg_events = BIT(PERF_RAPL_PP0) |
BIT(PERF_RAPL_PKG) |
BIT(PERF_RAPL_RAM),
.unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW,
.msr_power_unit = MSR_RAPL_POWER_UNIT,
.rapl_msrs = intel_rapl_msrs,
.rapl_pkg_msrs = intel_rapl_msrs,
};
static struct rapl_model model_knl = {
.events = BIT(PERF_RAPL_PKG) |
.pkg_events = BIT(PERF_RAPL_PKG) |
BIT(PERF_RAPL_RAM),
.unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW,
.msr_power_unit = MSR_RAPL_POWER_UNIT,
.rapl_msrs = intel_rapl_msrs,
.rapl_pkg_msrs = intel_rapl_msrs,
};
static struct rapl_model model_skl = {
.events = BIT(PERF_RAPL_PP0) |
.pkg_events = BIT(PERF_RAPL_PP0) |
BIT(PERF_RAPL_PKG) |
BIT(PERF_RAPL_RAM) |
BIT(PERF_RAPL_PP1) |
BIT(PERF_RAPL_PSYS),
.msr_power_unit = MSR_RAPL_POWER_UNIT,
.rapl_msrs = intel_rapl_msrs,
.rapl_pkg_msrs = intel_rapl_msrs,
};
static struct rapl_model model_spr = {
.events = BIT(PERF_RAPL_PP0) |
.pkg_events = BIT(PERF_RAPL_PP0) |
BIT(PERF_RAPL_PKG) |
BIT(PERF_RAPL_RAM) |
BIT(PERF_RAPL_PSYS),
.unit_quirk = RAPL_UNIT_QUIRK_INTEL_SPR,
.msr_power_unit = MSR_RAPL_POWER_UNIT,
.rapl_msrs = intel_rapl_spr_msrs,
.rapl_pkg_msrs = intel_rapl_spr_msrs,
};
static struct rapl_model model_amd_hygon = {
.events = BIT(PERF_RAPL_PKG),
.pkg_events = BIT(PERF_RAPL_PKG),
.core_events = BIT(PERF_RAPL_CORE),
.msr_power_unit = MSR_AMD_RAPL_POWER_UNIT,
.rapl_msrs = amd_rapl_msrs,
.rapl_pkg_msrs = amd_rapl_pkg_msrs,
.rapl_core_msrs = amd_rapl_core_msrs,
};
static const struct x86_cpu_id rapl_model_match[] __initconst = {
@ -804,45 +891,73 @@ MODULE_DEVICE_TABLE(x86cpu, rapl_model_match);
static int __init rapl_pmu_init(void)
{
const struct x86_cpu_id *id;
struct rapl_model *rm;
int rapl_pkg_pmu_scope = PERF_PMU_SCOPE_DIE;
int ret;
if (rapl_pkg_pmu_is_pkg_scope())
rapl_pkg_pmu_scope = PERF_PMU_SCOPE_PKG;
id = x86_match_cpu(rapl_model_match);
if (!id)
return -ENODEV;
rm = (struct rapl_model *) id->driver_data;
rapl_model = (struct rapl_model *) id->driver_data;
rapl_msrs = rm->rapl_msrs;
rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX,
false, (void *) &rm->events);
ret = rapl_check_hw_unit(rm);
ret = rapl_check_hw_unit();
if (ret)
return ret;
ret = init_rapl_pmus();
ret = init_rapl_pmus(&rapl_pmus_pkg, rapl_pkg_pmu_scope, rapl_attr_groups,
rapl_attr_update);
if (ret)
return ret;
ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1);
rapl_pmus_pkg->cntr_mask = perf_msr_probe(rapl_model->rapl_pkg_msrs,
PERF_RAPL_PKG_EVENTS_MAX, false,
(void *) &rapl_model->pkg_events);
ret = perf_pmu_register(&rapl_pmus_pkg->pmu, "power", -1);
if (ret)
goto out;
if (rapl_model->core_events) {
ret = init_rapl_pmus(&rapl_pmus_core, PERF_PMU_SCOPE_CORE,
rapl_core_attr_groups,
rapl_core_attr_update);
if (ret) {
pr_warn("power-core PMU initialization failed (%d)\n", ret);
goto core_init_failed;
}
rapl_pmus_core->cntr_mask = perf_msr_probe(rapl_model->rapl_core_msrs,
PERF_RAPL_CORE_EVENTS_MAX, false,
(void *) &rapl_model->core_events);
ret = perf_pmu_register(&rapl_pmus_core->pmu, "power_core", -1);
if (ret) {
pr_warn("power-core PMU registration failed (%d)\n", ret);
cleanup_rapl_pmus(rapl_pmus_core);
}
}
core_init_failed:
rapl_advertise();
return 0;
out:
pr_warn("Initialization failed (%d), disabled\n", ret);
cleanup_rapl_pmus();
cleanup_rapl_pmus(rapl_pmus_pkg);
return ret;
}
module_init(rapl_pmu_init);
static void __exit intel_rapl_exit(void)
{
perf_pmu_unregister(&rapl_pmus->pmu);
cleanup_rapl_pmus();
if (rapl_pmus_core) {
perf_pmu_unregister(&rapl_pmus_core->pmu);
cleanup_rapl_pmus(rapl_pmus_core);
}
perf_pmu_unregister(&rapl_pmus_pkg->pmu);
cleanup_rapl_pmus(rapl_pmus_pkg);
}
module_exit(intel_rapl_exit);

View File

@ -664,7 +664,7 @@ void __init hv_vtom_init(void)
x86_platform.guest.enc_status_change_finish = hv_vtom_set_host_visibility;
/* Set WB as the default cache mode. */
mtrr_overwrite_state(NULL, 0, MTRR_TYPE_WRBACK);
guest_force_mtrr_state(NULL, 0, MTRR_TYPE_WRBACK);
}
#endif /* defined(CONFIG_AMD_MEM_ENCRYPT) || defined(CONFIG_INTEL_TDX_GUEST) */

View File

@ -4,6 +4,7 @@
#include <linux/types.h>
#include <linux/stringify.h>
#include <linux/objtool.h>
#include <asm/asm.h>
#define ALT_FLAGS_SHIFT 16
@ -54,16 +55,6 @@
#define LOCK_PREFIX ""
#endif
/*
* objtool annotation to ignore the alternatives and only consider the original
* instruction(s).
*/
#define ANNOTATE_IGNORE_ALTERNATIVE \
"999:\n\t" \
".pushsection .discard.ignore_alts\n\t" \
".long 999b\n\t" \
".popsection\n\t"
/*
* The patching flags are part of the upper bits of the @ft_flags parameter when
* specifying them. The split is currently like this:
@ -310,17 +301,6 @@ void nop_func(void);
.endm
#endif
/*
* objtool annotation to ignore the alternatives and only consider the original
* instruction(s).
*/
.macro ANNOTATE_IGNORE_ALTERNATIVE
.Lannotate_\@:
.pushsection .discard.ignore_alts
.long .Lannotate_\@
.popsection
.endm
/*
* Issue one struct alt_instr descriptor entry (need to put it into
* the section .altinstructions, see below). This entry contains

View File

@ -92,7 +92,7 @@ do { \
do { \
__auto_type __flags = BUGFLAG_WARNING|(flags); \
instrumentation_begin(); \
_BUG_FLAGS(ASM_UD2, __flags, ASM_REACHABLE); \
_BUG_FLAGS(ASM_UD2, __flags, ANNOTATE_REACHABLE(1b)); \
instrumentation_end(); \
} while (0)

View File

@ -56,7 +56,6 @@
/* x86_cpu_id::flags */
#define X86_CPU_ID_FLAG_ENTRY_VALID BIT(0)
#define X86_STEPPINGS(mins, maxs) GENMASK(maxs, mins)
/**
* X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE - Base macro for CPU matching
* @_vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY
@ -208,6 +207,7 @@
VFM_MODEL(vfm), \
X86_STEPPING_ANY, X86_FEATURE_ANY, data)
#define __X86_STEPPINGS(mins, maxs) GENMASK(maxs, mins)
/**
* X86_MATCH_VFM_STEPPINGS - Match encoded vendor/family/model/stepping
* @vfm: Encoded 8-bits each for vendor, family, model
@ -218,12 +218,13 @@
*
* feature is set to wildcard
*/
#define X86_MATCH_VFM_STEPPINGS(vfm, steppings, data) \
X86_MATCH_VENDORID_FAM_MODEL_STEPPINGS_FEATURE( \
VFM_VENDOR(vfm), \
VFM_FAMILY(vfm), \
VFM_MODEL(vfm), \
steppings, X86_FEATURE_ANY, data)
#define X86_MATCH_VFM_STEPS(vfm, min_step, max_step, data) \
X86_MATCH_VENDORID_FAM_MODEL_STEPPINGS_FEATURE( \
VFM_VENDOR(vfm), \
VFM_FAMILY(vfm), \
VFM_MODEL(vfm), \
__X86_STEPPINGS(min_step, max_step), \
X86_FEATURE_ANY, data)
/**
* X86_MATCH_VFM_FEATURE - Match encoded vendor/family/model/feature
@ -242,41 +243,7 @@
VFM_MODEL(vfm), \
X86_STEPPING_ANY, feature, data)
/*
* Match specific microcode revisions.
*
* vendor/family/model/stepping must be all set.
*
* Only checks against the boot CPU. When mixed-stepping configs are
* valid for a CPU model, add a quirk for every valid stepping and
* do the fine-tuning in the quirk handler.
*/
struct x86_cpu_desc {
u8 x86_family;
u8 x86_vendor;
u8 x86_model;
u8 x86_stepping;
u32 x86_microcode_rev;
};
#define INTEL_CPU_DESC(vfm, stepping, revision) { \
.x86_family = VFM_FAMILY(vfm), \
.x86_vendor = VFM_VENDOR(vfm), \
.x86_model = VFM_MODEL(vfm), \
.x86_stepping = (stepping), \
.x86_microcode_rev = (revision), \
}
#define AMD_CPU_DESC(fam, model, stepping, revision) { \
.x86_family = (fam), \
.x86_vendor = X86_VENDOR_AMD, \
.x86_model = (model), \
.x86_stepping = (stepping), \
.x86_microcode_rev = (revision), \
}
extern const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match);
extern bool x86_cpu_has_min_microcode_rev(const struct x86_cpu_desc *table);
extern bool x86_match_min_microcode_rev(const struct x86_cpu_id *table);
#endif /* _ASM_X86_CPU_DEVICE_ID */

View File

@ -132,11 +132,12 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
x86_this_cpu_test_bit(bit, cpu_info.x86_capability))
/*
* This macro is for detection of features which need kernel
* infrastructure to be used. It may *not* directly test the CPU
* itself. Use the cpu_has() family if you want true runtime
* testing of CPU features, like in hypervisor code where you are
* supporting a possible guest feature where host support for it
* This is the default CPU features testing macro to use in code.
*
* It is for detection of features which need kernel infrastructure to be
* used. It may *not* directly test the CPU itself. Use the cpu_has() family
* if you want true runtime testing of CPU features, like in hypervisor code
* where you are supporting a possible guest feature where host support for it
* is not relevant.
*/
#define cpu_feature_enabled(bit) \
@ -161,13 +162,6 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
#define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit)
/*
* Static testing of CPU features. Used the same as boot_cpu_has(). It
* statically patches the target code for additional performance. Use
* static_cpu_has() only in fast paths, where every cycle counts. Which
* means that the boot_cpu_has() variant is already fast enough for the
* majority of cases and you should stick to using it as it is generally
* only two instructions: a RIP-relative MOV and a TEST.
*
* Do not use an "m" constraint for [cap_byte] here: gcc doesn't know
* that this is only used on a fallback path and will sometimes cause
* it to manifest the address of boot_cpu_data in a register, fouling

View File

@ -83,8 +83,8 @@
#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* "centaur_mcr" Centaur MCRs (= MTRRs) */
#define X86_FEATURE_K8 ( 3*32+ 4) /* Opteron, Athlon64 */
#define X86_FEATURE_ZEN5 ( 3*32+ 5) /* CPU based on Zen5 microarchitecture */
#define X86_FEATURE_P3 ( 3*32+ 6) /* P3 */
#define X86_FEATURE_P4 ( 3*32+ 7) /* P4 */
/* Free ( 3*32+ 6) */
/* Free ( 3*32+ 7) */
#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* "constant_tsc" TSC ticks at a constant rate */
#define X86_FEATURE_UP ( 3*32+ 9) /* "up" SMP kernel running on UP */
#define X86_FEATURE_ART ( 3*32+10) /* "art" Always running timer (ART) */
@ -451,6 +451,8 @@
#define X86_FEATURE_V_TSC_AUX (19*32+ 9) /* Virtual TSC_AUX */
#define X86_FEATURE_SME_COHERENT (19*32+10) /* AMD hardware-enforced cache coherency */
#define X86_FEATURE_DEBUG_SWAP (19*32+14) /* "debug_swap" AMD SEV-ES full debug state swap support */
#define X86_FEATURE_RMPREAD (19*32+21) /* RMPREAD instruction */
#define X86_FEATURE_SEGMENTED_RMP (19*32+23) /* Segmented RMP support */
#define X86_FEATURE_SVSM (19*32+28) /* "svsm" SVSM present */
/* AMD-defined Extended Feature 2 EAX, CPUID level 0x80000021 (EAX), word 20 */

View File

@ -21,6 +21,13 @@ enum cpuid_regs_idx {
CPUID_EDX,
};
#define CPUID_LEAF_MWAIT 0x5
#define CPUID_LEAF_DCA 0x9
#define CPUID_LEAF_XSTATE 0x0d
#define CPUID_LEAF_TSC 0x15
#define CPUID_LEAF_FREQ 0x16
#define CPUID_LEAF_TILE 0x1d
#ifdef CONFIG_X86_32
bool have_cpuid_p(void);
#else

View File

@ -12,10 +12,6 @@
/* Bit 63 of XCR0 is reserved for future expansion */
#define XFEATURE_MASK_EXTEND (~(XFEATURE_MASK_FPSSE | (1ULL << 63)))
#define XSTATE_CPUID 0x0000000d
#define TILE_CPUID 0x0000001d
#define FXSAVE_SIZE 512
#define XSAVE_HDR_SIZE 64

View File

@ -2,7 +2,7 @@
#ifndef _ASM_X86_INIT_H
#define _ASM_X86_INIT_H
#define __head __section(".head.text")
#define __head __section(".head.text") __no_sanitize_undefined
struct x86_mapping_info {
void *(*alloc_pgt_page)(void *); /* allocate buf for page table */

View File

@ -100,8 +100,8 @@
}
#define ASM_CALL_ARG0 \
"call %c[__func] \n" \
ASM_REACHABLE
"1: call %c[__func] \n" \
ANNOTATE_REACHABLE(1b)
#define ASM_CALL_ARG1 \
"movq %[arg1], %%rdi \n" \

View File

@ -8,14 +8,9 @@
# define PA_PGD 2
# define PA_SWAP_PAGE 3
# define PAGES_NR 4
#else
# define PA_CONTROL_PAGE 0
# define VA_CONTROL_PAGE 1
# define PA_TABLE_PAGE 2
# define PA_SWAP_PAGE 3
# define PAGES_NR 4
#endif
# define KEXEC_CONTROL_PAGE_SIZE 4096
# define KEXEC_CONTROL_CODE_MAX_SIZE 2048
#ifndef __ASSEMBLY__
@ -43,7 +38,6 @@ struct kimage;
/* Maximum address we can use for the control code buffer */
# define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE
# define KEXEC_CONTROL_PAGE_SIZE 4096
/* The native architecture */
# define KEXEC_ARCH KEXEC_ARCH_386
@ -58,11 +52,12 @@ struct kimage;
/* Maximum address we can use for the control pages */
# define KEXEC_CONTROL_MEMORY_LIMIT (MAXMEM-1)
/* Allocate one page for the pdp and the second for the code */
# define KEXEC_CONTROL_PAGE_SIZE (4096UL + 4096UL)
/* The native architecture */
# define KEXEC_ARCH KEXEC_ARCH_X86_64
extern unsigned long kexec_va_control_page;
extern unsigned long kexec_pa_table_page;
extern unsigned long kexec_pa_swap_page;
#endif
/*
@ -125,7 +120,7 @@ relocate_kernel(unsigned long indirection_page,
#else
unsigned long
relocate_kernel(unsigned long indirection_page,
unsigned long page_list,
unsigned long pa_control_page,
unsigned long start_address,
unsigned int preserve_context,
unsigned int host_mem_enc_active);
@ -145,6 +140,19 @@ struct kimage_arch {
};
#else
struct kimage_arch {
/*
* This is a kimage control page, as it must not overlap with either
* source or destination address ranges.
*/
pgd_t *pgd;
/*
* The virtual mapping of the control code page itself is used only
* during the transition, while the current kernel's pages are all
* in place. Thus the intermediate page table pages used to map it
* are not control pages, but instead just normal pages obtained
* with get_zeroed_page(). And have to be tracked (below) so that
* they can be freed.
*/
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;

View File

@ -37,6 +37,8 @@ typedef struct {
*/
atomic64_t tlb_gen;
unsigned long next_trim_cpumask;
#ifdef CONFIG_MODIFY_LDT_SYSCALL
struct rw_semaphore ldt_usr_sem;
struct ldt_struct *ldt;

View File

@ -151,6 +151,7 @@ static inline int init_new_context(struct task_struct *tsk,
mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id);
atomic64_set(&mm->context.tlb_gen, 0);
mm->context.next_trim_cpumask = jiffies + HZ;
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {

View File

@ -644,6 +644,7 @@
#define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */
#define MSR_AMD64_SVM_AVIC_DOORBELL 0xc001011b
#define MSR_AMD64_VM_PAGE_FLUSH 0xc001011e
#define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f
#define MSR_AMD64_SEV_ES_GHCB 0xc0010130
#define MSR_AMD64_SEV 0xc0010131
#define MSR_AMD64_SEV_ENABLED_BIT 0
@ -682,11 +683,12 @@
#define MSR_AMD64_SNP_SMT_PROT BIT_ULL(MSR_AMD64_SNP_SMT_PROT_BIT)
#define MSR_AMD64_SNP_RESV_BIT 18
#define MSR_AMD64_SNP_RESERVED_MASK GENMASK_ULL(63, MSR_AMD64_SNP_RESV_BIT)
#define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f
#define MSR_AMD64_RMP_BASE 0xc0010132
#define MSR_AMD64_RMP_END 0xc0010133
#define MSR_AMD64_RMP_CFG 0xc0010136
#define MSR_AMD64_SEG_RMP_ENABLED_BIT 0
#define MSR_AMD64_SEG_RMP_ENABLED BIT_ULL(MSR_AMD64_SEG_RMP_ENABLED_BIT)
#define MSR_AMD64_RMP_SEGMENT_SHIFT(x) (((x) & GENMASK_ULL(13, 8)) >> 8)
#define MSR_SVSM_CAA 0xc001f000

View File

@ -58,8 +58,8 @@ struct mtrr_state_type {
*/
# ifdef CONFIG_MTRR
void mtrr_bp_init(void);
void mtrr_overwrite_state(struct mtrr_var_range *var, unsigned int num_var,
mtrr_type def_type);
void guest_force_mtrr_state(struct mtrr_var_range *var, unsigned int num_var,
mtrr_type def_type);
extern u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform);
extern void mtrr_save_fixed_ranges(void *);
extern void mtrr_save_state(void);
@ -75,9 +75,9 @@ void mtrr_disable(void);
void mtrr_enable(void);
void mtrr_generic_set_state(void);
# else
static inline void mtrr_overwrite_state(struct mtrr_var_range *var,
unsigned int num_var,
mtrr_type def_type)
static inline void guest_force_mtrr_state(struct mtrr_var_range *var,
unsigned int num_var,
mtrr_type def_type)
{
}

View File

@ -15,7 +15,6 @@
#define MWAIT_HINT2SUBSTATE(hint) ((hint) & MWAIT_CSTATE_MASK)
#define MWAIT_C1_SUBSTATE_MASK 0xf0
#define CPUID_MWAIT_LEAF 5
#define CPUID5_ECX_EXTENSIONS_SUPPORTED 0x1
#define CPUID5_ECX_INTERRUPT_BREAK 0x2

View File

@ -179,18 +179,6 @@
#ifdef __ASSEMBLY__
/*
* This should be used immediately before an indirect jump/call. It tells
* objtool the subsequent indirect jump/call is vouched safe for retpoline
* builds.
*/
.macro ANNOTATE_RETPOLINE_SAFE
.Lhere_\@:
.pushsection .discard.retpoline_safe
.long .Lhere_\@
.popsection
.endm
/*
* (ab)use RETPOLINE_SAFE on RET to annotate away 'bare' RET instructions
* vs RETBleed validation.
@ -350,12 +338,6 @@
#else /* __ASSEMBLY__ */
#define ANNOTATE_RETPOLINE_SAFE \
"999:\n\t" \
".pushsection .discard.retpoline_safe\n\t" \
".long 999b\n\t" \
".popsection\n\t"
typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE];
extern retpoline_thunk_t __x86_indirect_thunk_array[];
extern retpoline_thunk_t __x86_indirect_call_thunk_array[];

View File

@ -180,13 +180,6 @@ static inline void halt(void)
PVOP_VCALL0(irq.halt);
}
extern noinstr void pv_native_wbinvd(void);
static __always_inline void wbinvd(void)
{
PVOP_ALT_VCALL0(cpu.wbinvd, "wbinvd", ALT_NOT_XEN);
}
static inline u64 paravirt_read_msr(unsigned msr)
{
return PVOP_CALL1(u64, cpu.read_msr, msr);

View File

@ -86,8 +86,6 @@ struct pv_cpu_ops {
void (*update_io_bitmap)(void);
#endif
void (*wbinvd)(void);
/* cpuid emulation, mostly so that caps bits can be disabled */
void (*cpuid)(unsigned int *eax, unsigned int *ebx,
unsigned int *ecx, unsigned int *edx);

View File

@ -422,7 +422,9 @@ static inline bool is_topdown_idx(int idx)
*/
struct pebs_basic {
u64 format_size;
u64 format_group:32,
retire_latency:16,
format_size:16;
u64 ip;
u64 applicable_counters;
u64 tsc;
@ -431,7 +433,17 @@ struct pebs_basic {
struct pebs_meminfo {
u64 address;
u64 aux;
u64 latency;
union {
/* pre Alder Lake */
u64 mem_latency;
/* Alder Lake and later */
struct {
u64 instr_latency:16;
u64 pad2:16;
u64 cache_latency:16;
u64 pad3:16;
};
};
u64 tsx_tuning;
};
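
A minimal sketch of how the union above lets the same latency word be read two ways depending on the platform generation; the capability flag and the sample value below are assumptions made for illustration.

#include <stdint.h>
#include <stdio.h>

/* Mirrors the union above, trimmed to the latency word only. */
union demo_latency {
	uint64_t mem_latency;			/* pre Alder Lake */
	struct {
		uint64_t instr_latency:16;	/* Alder Lake and later */
		uint64_t pad2:16;
		uint64_t cache_latency:16;
		uint64_t pad3:16;
	};
};

int main(void)
{
	union demo_latency l = { .mem_latency = 0x0000002a00000123ULL };
	int has_instr_latency = 1;		/* assumed capability flag */

	if (has_instr_latency)
		printf("instr=%llu cache=%llu\n",
		       (unsigned long long)l.instr_latency,
		       (unsigned long long)l.cache_latency);
	else
		printf("mem=%llu\n", (unsigned long long)l.mem_latency);
	return 0;
}
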

View File

@ -98,6 +98,7 @@ struct cpuinfo_topology {
// Logical ID mappings
u32 logical_pkg_id;
u32 logical_die_id;
u32 logical_core_id;
// AMD Node ID and Nodes per Package info
u32 amd_node_id;

View File

@ -5,6 +5,7 @@
#include <asm-generic/sections.h>
#include <asm/extable.h>
extern char __relocate_kernel_start[], __relocate_kernel_end[];
extern char __brk_base[], __brk_limit[];
extern char __end_rodata_aligned[];

View File

@ -49,7 +49,7 @@ extern unsigned long saved_video_mode;
extern void reserve_standard_io_resources(void);
extern void i386_reserve_resources(void);
extern unsigned long __startup_64(unsigned long physaddr, struct boot_params *bp);
extern unsigned long __startup_64(unsigned long p2v_offset, struct boot_params *bp);
extern void startup_64_setup_gdt_idt(void);
extern void early_setup_idt(void);
extern void __init do_early_exception(struct pt_regs *regs, int trapnr);

View File

@ -19,6 +19,32 @@
#define TDG_VM_RD 7
#define TDG_VM_WR 8
/* TDX attributes */
#define TDX_ATTR_DEBUG_BIT 0
#define TDX_ATTR_DEBUG BIT_ULL(TDX_ATTR_DEBUG_BIT)
#define TDX_ATTR_HGS_PLUS_PROF_BIT 4
#define TDX_ATTR_HGS_PLUS_PROF BIT_ULL(TDX_ATTR_HGS_PLUS_PROF_BIT)
#define TDX_ATTR_PERF_PROF_BIT 5
#define TDX_ATTR_PERF_PROF BIT_ULL(TDX_ATTR_PERF_PROF_BIT)
#define TDX_ATTR_PMT_PROF_BIT 6
#define TDX_ATTR_PMT_PROF BIT_ULL(TDX_ATTR_PMT_PROF_BIT)
#define TDX_ATTR_ICSSD_BIT 16
#define TDX_ATTR_ICSSD BIT_ULL(TDX_ATTR_ICSSD_BIT)
#define TDX_ATTR_LASS_BIT 27
#define TDX_ATTR_LASS BIT_ULL(TDX_ATTR_LASS_BIT)
#define TDX_ATTR_SEPT_VE_DISABLE_BIT 28
#define TDX_ATTR_SEPT_VE_DISABLE BIT_ULL(TDX_ATTR_SEPT_VE_DISABLE_BIT)
#define TDX_ATTR_MIGRTABLE_BIT 29
#define TDX_ATTR_MIGRTABLE BIT_ULL(TDX_ATTR_MIGRTABLE_BIT)
#define TDX_ATTR_PKS_BIT 30
#define TDX_ATTR_PKS BIT_ULL(TDX_ATTR_PKS_BIT)
#define TDX_ATTR_KL_BIT 31
#define TDX_ATTR_KL BIT_ULL(TDX_ATTR_KL_BIT)
#define TDX_ATTR_TPA_BIT 62
#define TDX_ATTR_TPA BIT_ULL(TDX_ATTR_TPA_BIT)
#define TDX_ATTR_PERFMON_BIT 63
#define TDX_ATTR_PERFMON BIT_ULL(TDX_ATTR_PERFMON_BIT)
/* TDX TD-Scope Metadata. To be used by TDG.VM.WR and TDG.VM.RD */
#define TDCS_CONFIG_FLAGS 0x1110000300000016
#define TDCS_TD_CTLS 0x1110000300000017
@ -29,8 +55,16 @@
#define TDCS_CONFIG_FLEXIBLE_PENDING_VE BIT_ULL(1)
/* TDCS_TD_CTLS bits */
#define TD_CTLS_PENDING_VE_DISABLE BIT_ULL(0)
#define TD_CTLS_ENUM_TOPOLOGY BIT_ULL(1)
#define TD_CTLS_PENDING_VE_DISABLE_BIT 0
#define TD_CTLS_PENDING_VE_DISABLE BIT_ULL(TD_CTLS_PENDING_VE_DISABLE_BIT)
#define TD_CTLS_ENUM_TOPOLOGY_BIT 1
#define TD_CTLS_ENUM_TOPOLOGY BIT_ULL(TD_CTLS_ENUM_TOPOLOGY_BIT)
#define TD_CTLS_VIRT_CPUID2_BIT 2
#define TD_CTLS_VIRT_CPUID2 BIT_ULL(TD_CTLS_VIRT_CPUID2_BIT)
#define TD_CTLS_REDUCE_VE_BIT 3
#define TD_CTLS_REDUCE_VE BIT_ULL(TD_CTLS_REDUCE_VE_BIT)
#define TD_CTLS_LOCK_BIT 63
#define TD_CTLS_LOCK BIT_ULL(TD_CTLS_LOCK_BIT)
/* TDX hypercall Leaf IDs */
#define TDVMCALL_MAP_GPA 0x10001

View File

@ -115,7 +115,7 @@ static inline void wrpkru(u32 pkru)
}
#endif
static __always_inline void native_wbinvd(void)
static __always_inline void wbinvd(void)
{
asm volatile("wbinvd": : :"memory");
}
@ -167,12 +167,6 @@ static inline void __write_cr4(unsigned long x)
{
native_write_cr4(x);
}
static __always_inline void wbinvd(void)
{
native_wbinvd();
}
#endif /* CONFIG_PARAVIRT_XXL */
static __always_inline void clflush(volatile void *__p)

View File

@ -66,6 +66,9 @@ int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport);
u64 tdx_hcall_get_quote(u8 *buf, size_t size);
void __init tdx_dump_attributes(u64 td_attr);
void __init tdx_dump_td_ctls(u64 td_ctls);
#else
static inline void tdx_early_init(void) { };

View File

@ -222,6 +222,7 @@ struct flush_tlb_info {
unsigned int initiating_cpu;
u8 stride_shift;
u8 freed_tables;
u8 trim_cpumask;
};
void flush_tlb_local(void);

View File

@ -143,6 +143,7 @@ extern const struct cpumask *cpu_clustergroup_mask(int cpu);
#define topology_logical_package_id(cpu) (cpu_data(cpu).topo.logical_pkg_id)
#define topology_physical_package_id(cpu) (cpu_data(cpu).topo.pkg_id)
#define topology_logical_die_id(cpu) (cpu_data(cpu).topo.logical_die_id)
#define topology_logical_core_id(cpu) (cpu_data(cpu).topo.logical_core_id)
#define topology_die_id(cpu) (cpu_data(cpu).topo.die_id)
#define topology_core_id(cpu) (cpu_data(cpu).topo.core_id)
#define topology_ppin(cpu) (cpu_data(cpu).ppin)

View File

@ -13,6 +13,7 @@
#include <linux/sched.h>
#include <acpi/processor.h>
#include <asm/cpuid.h>
#include <asm/mwait.h>
#include <asm/special_insns.h>
@ -128,7 +129,7 @@ static long acpi_processor_ffh_cstate_probe_cpu(void *_cx)
unsigned int cstate_type; /* C-state type and not ACPI C-state type */
unsigned int num_cstate_subtype;
cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
cpuid(CPUID_LEAF_MWAIT, &eax, &ebx, &ecx, &edx);
/* Check whether this particular cx_type (in CST) is supported or not */
cstate_type = (((cx->address >> MWAIT_SUBSTATE_SIZE) &
@ -172,7 +173,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
struct cpuinfo_x86 *c = &cpu_data(cpu);
long retval;
if (!cpu_cstate_entry || c->cpuid_level < CPUID_MWAIT_LEAF)
if (!cpu_cstate_entry || c->cpuid_level < CPUID_LEAF_MWAIT)
return -1;
if (reg->bit_offset != NATIVE_CSTATE_BEYOND_HALT)

View File

@ -1854,11 +1854,18 @@ static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
return temp_state;
}
__ro_after_init struct mm_struct *poking_mm;
__ro_after_init unsigned long poking_addr;
static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
{
lockdep_assert_irqs_disabled();
switch_mm_irqs_off(NULL, prev_state.mm, current);
/* Clear the cpumask, to indicate no TLB flushing is needed anywhere */
cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(poking_mm));
/*
* Restore the breakpoints if they were disabled before the temporary mm
* was loaded.
@ -1867,9 +1874,6 @@ static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
hw_breakpoint_restore();
}
__ro_after_init struct mm_struct *poking_mm;
__ro_after_init unsigned long poking_addr;
static void text_poke_memcpy(void *dst, const void *src, size_t len)
{
memcpy(dst, src, len);

View File

@ -509,19 +509,19 @@ static struct clock_event_device lapic_clockevent = {
static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
static const struct x86_cpu_id deadline_match[] __initconst = {
X86_MATCH_VFM_STEPPINGS(INTEL_HASWELL_X, X86_STEPPINGS(0x2, 0x2), 0x3a), /* EP */
X86_MATCH_VFM_STEPPINGS(INTEL_HASWELL_X, X86_STEPPINGS(0x4, 0x4), 0x0f), /* EX */
X86_MATCH_VFM_STEPS(INTEL_HASWELL_X, 0x2, 0x2, 0x3a), /* EP */
X86_MATCH_VFM_STEPS(INTEL_HASWELL_X, 0x4, 0x4, 0x0f), /* EX */
X86_MATCH_VFM(INTEL_BROADWELL_X, 0x0b000020),
X86_MATCH_VFM_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPINGS(0x2, 0x2), 0x00000011),
X86_MATCH_VFM_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPINGS(0x3, 0x3), 0x0700000e),
X86_MATCH_VFM_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPINGS(0x4, 0x4), 0x0f00000c),
X86_MATCH_VFM_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPINGS(0x5, 0x5), 0x0e000003),
X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 0x2, 0x2, 0x00000011),
X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 0x3, 0x3, 0x0700000e),
X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 0x4, 0x4, 0x0f00000c),
X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 0x5, 0x5, 0x0e000003),
X86_MATCH_VFM_STEPPINGS(INTEL_SKYLAKE_X, X86_STEPPINGS(0x3, 0x3), 0x01000136),
X86_MATCH_VFM_STEPPINGS(INTEL_SKYLAKE_X, X86_STEPPINGS(0x4, 0x4), 0x02000014),
X86_MATCH_VFM_STEPPINGS(INTEL_SKYLAKE_X, X86_STEPPINGS(0x5, 0xf), 0),
X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 0x3, 0x3, 0x01000136),
X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 0x4, 0x4, 0x02000014),
X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 0x5, 0xf, 0),
X86_MATCH_VFM(INTEL_HASWELL, 0x22),
X86_MATCH_VFM(INTEL_HASWELL_L, 0x20),
@ -2582,19 +2582,12 @@ int apic_is_clustered_box(void)
/*
* APIC command line parameters
*/
static int __init setup_disableapic(char *arg)
static int __init setup_nolapic(char *arg)
{
apic_is_disabled = true;
setup_clear_cpu_cap(X86_FEATURE_APIC);
return 0;
}
early_param("disableapic", setup_disableapic);
/* same as disableapic, for compatibility */
static int __init setup_nolapic(char *arg)
{
return setup_disableapic(arg);
}
early_param("nolapic", setup_nolapic);
static int __init parse_lapic_timer_c2_ok(char *arg)

View File

@ -139,9 +139,15 @@ static bool skip_addr(void *dest)
return true;
#endif
#ifdef CONFIG_KEXEC_CORE
# ifdef CONFIG_X86_64
if (dest >= (void *)__relocate_kernel_start &&
dest < (void *)__relocate_kernel_end)
return true;
# else
if (dest >= (void *)relocate_kernel &&
dest < (void*)relocate_kernel + KEXEC_CONTROL_CODE_MAX_SIZE)
return true;
# endif
#endif
return false;
}

View File

@ -355,10 +355,15 @@ static void bsp_determine_snp(struct cpuinfo_x86 *c)
/*
* RMP table entry format is not architectural and is defined by the
* per-processor PPR. Restrict SNP support on the known CPU models
* for which the RMP table entry format is currently defined for.
* for which the RMP table entry format is currently defined or for
* processors which support the architecturally defined RMPREAD
* instruction.
*/
if (!cpu_has(c, X86_FEATURE_HYPERVISOR) &&
c->x86 >= 0x19 && snp_probe_rmptable_info()) {
(cpu_feature_enabled(X86_FEATURE_ZEN3) ||
cpu_feature_enabled(X86_FEATURE_ZEN4) ||
cpu_feature_enabled(X86_FEATURE_RMPREAD)) &&
snp_probe_rmptable_info()) {
cc_platform_set(CC_ATTR_HOST_SEV_SNP);
} else {
setup_clear_cpu_cap(X86_FEATURE_SEV_SNP);
@ -795,10 +800,9 @@ static void init_amd_bd(struct cpuinfo_x86 *c)
clear_rdrand_cpuid_bit(c);
}
static const struct x86_cpu_desc erratum_1386_microcode[] = {
AMD_CPU_DESC(0x17, 0x1, 0x2, 0x0800126e),
AMD_CPU_DESC(0x17, 0x31, 0x0, 0x08301052),
{},
static const struct x86_cpu_id erratum_1386_microcode[] = {
X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, 0x17, 0x01), 0x2, 0x2, 0x0800126e),
X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, 0x17, 0x31), 0x0, 0x0, 0x08301052),
};
static void fix_erratum_1386(struct cpuinfo_x86 *c)
@ -814,7 +818,7 @@ static void fix_erratum_1386(struct cpuinfo_x86 *c)
* Clear the feature flag only on microcode revisions which
* don't have the fix.
*/
if (x86_cpu_has_min_microcode_rev(erratum_1386_microcode))
if (x86_match_min_microcode_rev(erratum_1386_microcode))
return;
clear_cpu_cap(c, X86_FEATURE_XSAVES);

View File

@ -29,6 +29,7 @@
#include <asm/alternative.h>
#include <asm/cmdline.h>
#include <asm/cpuid.h>
#include <asm/perf_event.h>
#include <asm/mmu_context.h>
#include <asm/doublefault.h>
@ -636,9 +637,9 @@ struct cpuid_dependent_feature {
static const struct cpuid_dependent_feature
cpuid_dependent_features[] = {
{ X86_FEATURE_MWAIT, 0x00000005 },
{ X86_FEATURE_DCA, 0x00000009 },
{ X86_FEATURE_XSAVE, 0x0000000d },
{ X86_FEATURE_MWAIT, CPUID_LEAF_MWAIT },
{ X86_FEATURE_DCA, CPUID_LEAF_DCA },
{ X86_FEATURE_XSAVE, CPUID_LEAF_XSTATE },
{ 0, 0 }
};
@ -1201,8 +1202,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
#define VULNBL(vendor, family, model, blacklist) \
X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, blacklist)
#define VULNBL_INTEL_STEPPINGS(vfm, steppings, issues) \
X86_MATCH_VFM_STEPPINGS(vfm, steppings, issues)
#define VULNBL_INTEL_STEPS(vfm, max_stepping, issues) \
X86_MATCH_VFM_STEPS(vfm, X86_STEP_MIN, max_stepping, issues)
#define VULNBL_AMD(family, blacklist) \
VULNBL(AMD, family, X86_MODEL_ANY, blacklist)
@ -1227,43 +1228,43 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
#define RFDS BIT(7)
static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
VULNBL_INTEL_STEPPINGS(INTEL_IVYBRIDGE, X86_STEPPING_ANY, SRBDS),
VULNBL_INTEL_STEPPINGS(INTEL_HASWELL, X86_STEPPING_ANY, SRBDS),
VULNBL_INTEL_STEPPINGS(INTEL_HASWELL_L, X86_STEPPING_ANY, SRBDS),
VULNBL_INTEL_STEPPINGS(INTEL_HASWELL_G, X86_STEPPING_ANY, SRBDS),
VULNBL_INTEL_STEPPINGS(INTEL_HASWELL_X, X86_STEPPING_ANY, MMIO),
VULNBL_INTEL_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPING_ANY, MMIO),
VULNBL_INTEL_STEPPINGS(INTEL_BROADWELL_G, X86_STEPPING_ANY, SRBDS),
VULNBL_INTEL_STEPPINGS(INTEL_BROADWELL_X, X86_STEPPING_ANY, MMIO),
VULNBL_INTEL_STEPPINGS(INTEL_BROADWELL, X86_STEPPING_ANY, SRBDS),
VULNBL_INTEL_STEPPINGS(INTEL_SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED | GDS),
VULNBL_INTEL_STEPPINGS(INTEL_SKYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
VULNBL_INTEL_STEPPINGS(INTEL_SKYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
VULNBL_INTEL_STEPPINGS(INTEL_KABYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
VULNBL_INTEL_STEPPINGS(INTEL_KABYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
VULNBL_INTEL_STEPPINGS(INTEL_CANNONLAKE_L, X86_STEPPING_ANY, RETBLEED),
VULNBL_INTEL_STEPPINGS(INTEL_ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
VULNBL_INTEL_STEPPINGS(INTEL_ICELAKE_D, X86_STEPPING_ANY, MMIO | GDS),
VULNBL_INTEL_STEPPINGS(INTEL_ICELAKE_X, X86_STEPPING_ANY, MMIO | GDS),
VULNBL_INTEL_STEPPINGS(INTEL_COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
VULNBL_INTEL_STEPPINGS(INTEL_COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO | RETBLEED),
VULNBL_INTEL_STEPPINGS(INTEL_COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
VULNBL_INTEL_STEPPINGS(INTEL_TIGERLAKE_L, X86_STEPPING_ANY, GDS),
VULNBL_INTEL_STEPPINGS(INTEL_TIGERLAKE, X86_STEPPING_ANY, GDS),
VULNBL_INTEL_STEPPINGS(INTEL_LAKEFIELD, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED),
VULNBL_INTEL_STEPPINGS(INTEL_ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS),
VULNBL_INTEL_STEPPINGS(INTEL_ALDERLAKE, X86_STEPPING_ANY, RFDS),
VULNBL_INTEL_STEPPINGS(INTEL_ALDERLAKE_L, X86_STEPPING_ANY, RFDS),
VULNBL_INTEL_STEPPINGS(INTEL_RAPTORLAKE, X86_STEPPING_ANY, RFDS),
VULNBL_INTEL_STEPPINGS(INTEL_RAPTORLAKE_P, X86_STEPPING_ANY, RFDS),
VULNBL_INTEL_STEPPINGS(INTEL_RAPTORLAKE_S, X86_STEPPING_ANY, RFDS),
VULNBL_INTEL_STEPPINGS(INTEL_ATOM_GRACEMONT, X86_STEPPING_ANY, RFDS),
VULNBL_INTEL_STEPPINGS(INTEL_ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RFDS),
VULNBL_INTEL_STEPPINGS(INTEL_ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO | RFDS),
VULNBL_INTEL_STEPPINGS(INTEL_ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RFDS),
VULNBL_INTEL_STEPPINGS(INTEL_ATOM_GOLDMONT, X86_STEPPING_ANY, RFDS),
VULNBL_INTEL_STEPPINGS(INTEL_ATOM_GOLDMONT_D, X86_STEPPING_ANY, RFDS),
VULNBL_INTEL_STEPPINGS(INTEL_ATOM_GOLDMONT_PLUS, X86_STEPPING_ANY, RFDS),
VULNBL_INTEL_STEPS(INTEL_IVYBRIDGE, X86_STEP_MAX, SRBDS),
VULNBL_INTEL_STEPS(INTEL_HASWELL, X86_STEP_MAX, SRBDS),
VULNBL_INTEL_STEPS(INTEL_HASWELL_L, X86_STEP_MAX, SRBDS),
VULNBL_INTEL_STEPS(INTEL_HASWELL_G, X86_STEP_MAX, SRBDS),
VULNBL_INTEL_STEPS(INTEL_HASWELL_X, X86_STEP_MAX, MMIO),
VULNBL_INTEL_STEPS(INTEL_BROADWELL_D, X86_STEP_MAX, MMIO),
VULNBL_INTEL_STEPS(INTEL_BROADWELL_G, X86_STEP_MAX, SRBDS),
VULNBL_INTEL_STEPS(INTEL_BROADWELL_X, X86_STEP_MAX, MMIO),
VULNBL_INTEL_STEPS(INTEL_BROADWELL, X86_STEP_MAX, SRBDS),
VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, X86_STEP_MAX, MMIO | RETBLEED | GDS),
VULNBL_INTEL_STEPS(INTEL_SKYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS),
VULNBL_INTEL_STEPS(INTEL_SKYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS),
VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS),
VULNBL_INTEL_STEPS(INTEL_KABYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS),
VULNBL_INTEL_STEPS(INTEL_CANNONLAKE_L, X86_STEP_MAX, RETBLEED),
VULNBL_INTEL_STEPS(INTEL_ICELAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS),
VULNBL_INTEL_STEPS(INTEL_ICELAKE_D, X86_STEP_MAX, MMIO | GDS),
VULNBL_INTEL_STEPS(INTEL_ICELAKE_X, X86_STEP_MAX, MMIO | GDS),
VULNBL_INTEL_STEPS(INTEL_COMETLAKE, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS),
VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, 0x0, MMIO | RETBLEED),
VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS),
VULNBL_INTEL_STEPS(INTEL_TIGERLAKE_L, X86_STEP_MAX, GDS),
VULNBL_INTEL_STEPS(INTEL_TIGERLAKE, X86_STEP_MAX, GDS),
VULNBL_INTEL_STEPS(INTEL_LAKEFIELD, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED),
VULNBL_INTEL_STEPS(INTEL_ROCKETLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS),
VULNBL_INTEL_STEPS(INTEL_ALDERLAKE, X86_STEP_MAX, RFDS),
VULNBL_INTEL_STEPS(INTEL_ALDERLAKE_L, X86_STEP_MAX, RFDS),
VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE, X86_STEP_MAX, RFDS),
VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_P, X86_STEP_MAX, RFDS),
VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_S, X86_STEP_MAX, RFDS),
VULNBL_INTEL_STEPS(INTEL_ATOM_GRACEMONT, X86_STEP_MAX, RFDS),
VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT, X86_STEP_MAX, MMIO | MMIO_SBDS | RFDS),
VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT_D, X86_STEP_MAX, MMIO | RFDS),
VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RFDS),
VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT, X86_STEP_MAX, RFDS),
VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT_D, X86_STEP_MAX, RFDS),
VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT_PLUS, X86_STEP_MAX, RFDS),
VULNBL_AMD(0x15, RETBLEED),
VULNBL_AMD(0x16, RETBLEED),


@ -25,6 +25,7 @@ static int cpu_debug_show(struct seq_file *m, void *p)
seq_printf(m, "cpu_type: %s\n", get_topology_cpu_type_name(c));
seq_printf(m, "logical_pkg_id: %u\n", c->topo.logical_pkg_id);
seq_printf(m, "logical_die_id: %u\n", c->topo.logical_die_id);
seq_printf(m, "logical_core_id: %u\n", c->topo.logical_core_id);
seq_printf(m, "llc_id: %u\n", c->topo.llc_id);
seq_printf(m, "l2c_id: %u\n", c->topo.l2c_id);
seq_printf(m, "amd_node_id: %u\n", c->topo.amd_node_id);


@ -599,11 +599,6 @@ static void init_intel(struct cpuinfo_x86 *c)
if (p)
strcpy(c->x86_model_id, p);
}
if (c->x86 == 15)
set_cpu_cap(c, X86_FEATURE_P4);
if (c->x86 == 6)
set_cpu_cap(c, X86_FEATURE_P3);
#endif
/* Work around errata */


@ -6,7 +6,7 @@
#include <linux/slab.h>
/**
* x86_match_cpu - match current CPU again an array of x86_cpu_ids
* x86_match_cpu - match current CPU against an array of x86_cpu_ids
* @match: Pointer to array of x86_cpu_ids. Last entry terminated with
* {}.
*
@ -56,33 +56,13 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match)
}
EXPORT_SYMBOL(x86_match_cpu);
static const struct x86_cpu_desc *
x86_match_cpu_with_stepping(const struct x86_cpu_desc *match)
bool x86_match_min_microcode_rev(const struct x86_cpu_id *table)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
const struct x86_cpu_desc *m;
const struct x86_cpu_id *res = x86_match_cpu(table);
for (m = match; m->x86_family | m->x86_model; m++) {
if (c->x86_vendor != m->x86_vendor)
continue;
if (c->x86 != m->x86_family)
continue;
if (c->x86_model != m->x86_model)
continue;
if (c->x86_stepping != m->x86_stepping)
continue;
return m;
}
return NULL;
}
bool x86_cpu_has_min_microcode_rev(const struct x86_cpu_desc *table)
{
const struct x86_cpu_desc *res = x86_match_cpu_with_stepping(table);
if (!res || res->x86_microcode_rev > boot_cpu_data.microcode)
if (!res || res->driver_data > boot_cpu_data.microcode)
return false;
return true;
}
EXPORT_SYMBOL_GPL(x86_cpu_has_min_microcode_rev);
EXPORT_SYMBOL_GPL(x86_match_min_microcode_rev);
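The replacement helper reuses x86_match_cpu() and keeps the minimum microcode revision in the x86_cpu_id driver_data field, so a caller only asks one question: does the boot CPU match an entry, and is its microcode at least that entry's revision? A minimal user-space sketch of that pattern (struct layout and numbers are made up for illustration, not the kernel's):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct cpu_entry { uint8_t family, model; uint32_t min_microcode; /* ~ driver_data */ };

static bool match_min_microcode(const struct cpu_entry *table, size_t n,
                                uint8_t family, uint8_t model, uint32_t microcode)
{
        for (size_t i = 0; i < n; i++) {
                if (table[i].family == family && table[i].model == model)
                        return microcode >= table[i].min_microcode;
        }
        return false;   /* no match: the caller treats the CPU as not fixed */
}

int main(void)
{
        const struct cpu_entry table[] = { { 0x17, 0x31, 0x1000 } };  /* made-up values */

        printf("fixed: %d\n", match_min_microcode(table, 1, 0x17, 0x31, 0x1001));
        return 0;
}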


@ -423,7 +423,7 @@ void __init mtrr_copy_map(void)
}
/**
* mtrr_overwrite_state - set static MTRR state
* guest_force_mtrr_state - set static MTRR state for a guest
*
* Used to set MTRR state via different means (e.g. with data obtained from
* a hypervisor).
@ -436,8 +436,8 @@ void __init mtrr_copy_map(void)
* @num_var: length of the @var array
* @def_type: default caching type
*/
void mtrr_overwrite_state(struct mtrr_var_range *var, unsigned int num_var,
mtrr_type def_type)
void guest_force_mtrr_state(struct mtrr_var_range *var, unsigned int num_var,
mtrr_type def_type)
{
unsigned int i;


@ -625,7 +625,7 @@ void mtrr_save_state(void)
static int __init mtrr_init_finalize(void)
{
/*
* Map might exist if mtrr_overwrite_state() has been called or if
* Map might exist if guest_force_mtrr_state() has been called or if
* mtrr_enabled() returns true.
*/
mtrr_copy_map();


@ -234,7 +234,9 @@ static __init bool __get_mem_config_intel(struct rdt_resource *r)
r->membw.throttle_mode = THREAD_THROTTLE_PER_THREAD;
else
r->membw.throttle_mode = THREAD_THROTTLE_MAX;
thread_throttle_mode_init();
resctrl_file_fflags_init("thread_throttle_mode",
RFTYPE_CTRL_INFO | RFTYPE_RES_MB);
r->alloc_capable = true;
@ -961,6 +963,11 @@ static __init bool get_rdt_mon_resources(void)
if (!rdt_mon_features)
return false;
if (is_mbm_local_enabled())
mba_mbps_default_event = QOS_L3_MBM_LOCAL_EVENT_ID;
else if (is_mbm_total_enabled())
mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID;
return !rdt_get_mon_l3_config(r);
}


@ -518,6 +518,76 @@ static int smp_mon_event_count(void *arg)
return 0;
}
ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct rdtgroup *rdtgrp;
int ret = 0;
/* Valid input requires a trailing newline */
if (nbytes == 0 || buf[nbytes - 1] != '\n')
return -EINVAL;
buf[nbytes - 1] = '\0';
rdtgrp = rdtgroup_kn_lock_live(of->kn);
if (!rdtgrp) {
rdtgroup_kn_unlock(of->kn);
return -ENOENT;
}
rdt_last_cmd_clear();
if (!strcmp(buf, "mbm_local_bytes")) {
if (is_mbm_local_enabled())
rdtgrp->mba_mbps_event = QOS_L3_MBM_LOCAL_EVENT_ID;
else
ret = -EINVAL;
} else if (!strcmp(buf, "mbm_total_bytes")) {
if (is_mbm_total_enabled())
rdtgrp->mba_mbps_event = QOS_L3_MBM_TOTAL_EVENT_ID;
else
ret = -EINVAL;
} else {
ret = -EINVAL;
}
if (ret)
rdt_last_cmd_printf("Unsupported event id '%s'\n", buf);
rdtgroup_kn_unlock(of->kn);
return ret ?: nbytes;
}
int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of,
struct seq_file *s, void *v)
{
struct rdtgroup *rdtgrp;
int ret = 0;
rdtgrp = rdtgroup_kn_lock_live(of->kn);
if (rdtgrp) {
switch (rdtgrp->mba_mbps_event) {
case QOS_L3_MBM_LOCAL_EVENT_ID:
seq_puts(s, "mbm_local_bytes\n");
break;
case QOS_L3_MBM_TOTAL_EVENT_ID:
seq_puts(s, "mbm_total_bytes\n");
break;
default:
pr_warn_once("Bad event %d\n", rdtgrp->mba_mbps_event);
ret = -EINVAL;
break;
}
} else {
ret = -ENOENT;
}
rdtgroup_kn_unlock(of->kn);
return ret;
}
void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
struct rdt_mon_domain *d, struct rdtgroup *rdtgrp,
cpumask_t *cpumask, int evtid, int first)
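The new per-group mba_MBps_event file accepts exactly the strings "mbm_local_bytes" or "mbm_total_bytes", newline-terminated, and rejects events that are not enabled. A small user-space sketch of switching it and reading it back, assuming resctrl is mounted with the mba_MBps option at /sys/fs/resctrl and a control group "g1" already exists (paths are illustrative):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        const char *path = "/sys/fs/resctrl/g1/mba_MBps_event";   /* assumed layout */
        const char *event = "mbm_total_bytes\n";                  /* trailing newline required */
        char cur[64] = "";
        int fd;

        fd = open(path, O_WRONLY);
        if (fd < 0 || write(fd, event, strlen(event)) < 0)
                perror("select event");                 /* -EINVAL if the event is not enabled */
        if (fd >= 0)
                close(fd);

        fd = open(path, O_RDONLY);
        if (fd >= 0 && read(fd, cur, sizeof(cur) - 1) > 0)
                printf("current event: %s", cur);
        if (fd >= 0)
                close(fd);
        return 0;
}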


@ -283,6 +283,7 @@ struct pseudo_lock_region {
* monitor only or ctrl_mon group
* @mon: mongroup related data
* @mode: mode of resource group
* @mba_mbps_event: input monitoring event id when mba_sc is enabled
* @plr: pseudo-locked region
*/
struct rdtgroup {
@ -295,6 +296,7 @@ struct rdtgroup {
enum rdt_group_type type;
struct mongroup mon;
enum rdtgrp_mode mode;
enum resctrl_event_id mba_mbps_event;
struct pseudo_lock_region *plr;
};
@ -508,6 +510,7 @@ extern struct mutex rdtgroup_mutex;
extern struct rdt_hw_resource rdt_resources_all[];
extern struct rdtgroup rdtgroup_default;
extern struct dentry *debugfs_resctrl;
extern enum resctrl_event_id mba_mbps_default_event;
enum resctrl_res_level {
RDT_RESOURCE_L3,
@ -607,6 +610,10 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off);
int rdtgroup_schemata_show(struct kernfs_open_file *of,
struct seq_file *s, void *v);
ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off);
int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of,
struct seq_file *s, void *v);
bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d,
unsigned long cbm, int closid, bool exclusive);
unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_ctrl_domain *d,
@ -647,10 +654,8 @@ void cqm_handle_limbo(struct work_struct *work);
bool has_busy_rmid(struct rdt_mon_domain *d);
void __check_limbo(struct rdt_mon_domain *d, bool force_free);
void rdt_domain_reconfigure_cdp(struct rdt_resource *r);
void __init thread_throttle_mode_init(void);
void __init mbm_config_rftype_init(const char *config);
void resctrl_file_fflags_init(const char *config, unsigned long fflags);
void rdt_staged_configs_clear(void);
bool closid_allocated(unsigned int closid);
int resctrl_find_cleanest_closid(void);
#endif /* _ASM_X86_RESCTRL_INTERNAL_H */


@ -663,9 +663,12 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
*/
static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr)
{
u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);
struct mbm_state *m = &rr->d->mbm_local[idx];
u64 cur_bw, bytes, cur_bytes;
struct mbm_state *m;
m = get_mbm_state(rr->d, closid, rmid, rr->evtid);
if (WARN_ON_ONCE(!m))
return;
cur_bytes = rr->val;
bytes = cur_bytes - m->prev_bw_bytes;
@ -752,20 +755,20 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm)
u32 closid, rmid, cur_msr_val, new_msr_val;
struct mbm_state *pmbm_data, *cmbm_data;
struct rdt_ctrl_domain *dom_mba;
enum resctrl_event_id evt_id;
struct rdt_resource *r_mba;
u32 cur_bw, user_bw, idx;
struct list_head *head;
struct rdtgroup *entry;
if (!is_mbm_local_enabled())
return;
u32 cur_bw, user_bw;
r_mba = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
evt_id = rgrp->mba_mbps_event;
closid = rgrp->closid;
rmid = rgrp->mon.rmid;
idx = resctrl_arch_rmid_idx_encode(closid, rmid);
pmbm_data = &dom_mbm->mbm_local[idx];
pmbm_data = get_mbm_state(dom_mbm, closid, rmid, evt_id);
if (WARN_ON_ONCE(!pmbm_data))
return;
dom_mba = get_ctrl_domain_from_cpu(smp_processor_id(), r_mba);
if (!dom_mba) {
@ -784,7 +787,9 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm)
*/
head = &rgrp->mon.crdtgrp_list;
list_for_each_entry(entry, head, mon.crdtgrp_list) {
cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
cmbm_data = get_mbm_state(dom_mbm, entry->closid, entry->mon.rmid, evt_id);
if (WARN_ON_ONCE(!cmbm_data))
return;
cur_bw += cmbm_data->prev_bw;
}
@ -813,54 +818,45 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm)
resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val);
}
static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d,
u32 closid, u32 rmid)
static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain *d,
u32 closid, u32 rmid, enum resctrl_event_id evtid)
{
struct rmid_read rr = {0};
rr.r = r;
rr.d = d;
rr.evtid = evtid;
rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
if (IS_ERR(rr.arch_mon_ctx)) {
pr_warn_ratelimited("Failed to allocate monitor context: %ld",
PTR_ERR(rr.arch_mon_ctx));
return;
}
__mon_event_count(closid, rmid, &rr);
/*
* This is protected from concurrent reads from user
* as both the user and we hold the global mutex.
* If the software controller is enabled, compute the
* bandwidth for this event id.
*/
if (is_mbm_total_enabled()) {
rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID;
rr.val = 0;
rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
if (IS_ERR(rr.arch_mon_ctx)) {
pr_warn_ratelimited("Failed to allocate monitor context: %ld",
PTR_ERR(rr.arch_mon_ctx));
return;
}
if (is_mba_sc(NULL))
mbm_bw_count(closid, rmid, &rr);
__mon_event_count(closid, rmid, &rr);
resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
}
resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
}
if (is_mbm_local_enabled()) {
rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
rr.val = 0;
rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
if (IS_ERR(rr.arch_mon_ctx)) {
pr_warn_ratelimited("Failed to allocate monitor context: %ld",
PTR_ERR(rr.arch_mon_ctx));
return;
}
static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d,
u32 closid, u32 rmid)
{
/*
* This is protected from concurrent reads from user as both
* the user and overflow handler hold the global mutex.
*/
if (is_mbm_total_enabled())
mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID);
__mon_event_count(closid, rmid, &rr);
/*
* Call the MBA software controller only for the
* control groups and when user has enabled
* the software controller explicitly.
*/
if (is_mba_sc(NULL))
mbm_bw_count(closid, rmid, &rr);
resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
}
if (is_mbm_local_enabled())
mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID);
}
/*
@ -1224,11 +1220,13 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r)
if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) {
mbm_total_event.configurable = true;
mbm_config_rftype_init("mbm_total_bytes_config");
resctrl_file_fflags_init("mbm_total_bytes_config",
RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
}
if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) {
mbm_local_event.configurable = true;
mbm_config_rftype_init("mbm_local_bytes_config");
resctrl_file_fflags_init("mbm_local_bytes_config",
RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
}
}
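With mbm_bw_count() and update_mba_bw() above now selecting the mbm_state by the group's chosen event id, the bandwidth estimate itself is unchanged: it is the delta of the (monotonic) MBM byte counter between two reads by the overflow handler, which runs roughly once per second. A toy model of that calculation, with the sampling interval and MB scaling as stated assumptions:

#include <stdint.h>
#include <stdio.h>

/* Toy model: bandwidth in MBps from two byte-counter samples taken
 * delta_ms apart (the overflow handler samples about once a second). */
static uint64_t bw_mbps(uint64_t prev_bytes, uint64_t cur_bytes, uint64_t delta_ms)
{
        uint64_t bytes = cur_bytes - prev_bytes;        /* counter is monotonic */

        return (bytes * 1000) / (delta_ms * 1024 * 1024);
}

int main(void)
{
        /* 512 MiB moved in one second -> 512 MBps */
        printf("%llu MBps\n",
               (unsigned long long)bw_mbps(0, 512ull << 20, 1000));
        return 0;
}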


@ -459,7 +459,7 @@ static int pseudo_lock_fn(void *_rdtgrp)
* increase likelihood that allocated cache portion will be filled
* with associated memory.
*/
native_wbinvd();
wbinvd();
/*
* Always called with interrupts enabled. By disabling interrupts
@ -1205,20 +1205,14 @@ static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel)
plr->cpu = cpu;
if (sel == 1)
thread = kthread_create_on_node(measure_cycles_lat_fn, plr,
cpu_to_node(cpu),
"pseudo_lock_measure/%u",
cpu);
thread = kthread_run_on_cpu(measure_cycles_lat_fn, plr,
cpu, "pseudo_lock_measure/%u");
else if (sel == 2)
thread = kthread_create_on_node(measure_l2_residency, plr,
cpu_to_node(cpu),
"pseudo_lock_measure/%u",
cpu);
thread = kthread_run_on_cpu(measure_l2_residency, plr,
cpu, "pseudo_lock_measure/%u");
else if (sel == 3)
thread = kthread_create_on_node(measure_l3_residency, plr,
cpu_to_node(cpu),
"pseudo_lock_measure/%u",
cpu);
thread = kthread_run_on_cpu(measure_l3_residency, plr,
cpu, "pseudo_lock_measure/%u");
else
goto out;
@ -1226,8 +1220,6 @@ static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel)
ret = PTR_ERR(thread);
goto out;
}
kthread_bind(thread, cpu);
wake_up_process(thread);
ret = wait_event_interruptible(plr->lock_thread_wq,
plr->thread_done == 1);
@ -1315,18 +1307,14 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp)
plr->thread_done = 0;
thread = kthread_create_on_node(pseudo_lock_fn, rdtgrp,
cpu_to_node(plr->cpu),
"pseudo_lock/%u", plr->cpu);
thread = kthread_run_on_cpu(pseudo_lock_fn, rdtgrp,
plr->cpu, "pseudo_lock/%u");
if (IS_ERR(thread)) {
ret = PTR_ERR(thread);
rdt_last_cmd_printf("Locking thread returned error %d\n", ret);
goto out_cstates;
}
kthread_bind(thread, plr->cpu);
wake_up_process(thread);
ret = wait_event_interruptible(plr->lock_thread_wq,
plr->thread_done == 1);
if (ret < 0) {


@ -65,6 +65,15 @@ static void rdtgroup_destroy_root(void);
struct dentry *debugfs_resctrl;
/*
* Memory bandwidth monitoring event to use for the default CTRL_MON group
* and each new CTRL_MON group created by the user. Only relevant when
* the filesystem is mounted with the "mba_MBps" option so it does not
* matter that it remains uninitialized on systems that do not support
* the "mba_MBps" option.
*/
enum resctrl_event_id mba_mbps_default_event;
static bool resctrl_debug;
void rdt_last_cmd_clear(void)
@ -1941,6 +1950,13 @@ static struct rftype res_common_files[] = {
.seq_show = rdtgroup_schemata_show,
.fflags = RFTYPE_CTRL_BASE,
},
{
.name = "mba_MBps_event",
.mode = 0644,
.kf_ops = &rdtgroup_kf_single_ops,
.write = rdtgroup_mba_mbps_event_write,
.seq_show = rdtgroup_mba_mbps_event_show,
},
{
.name = "mode",
.mode = 0644,
@ -2020,24 +2036,13 @@ static struct rftype *rdtgroup_get_rftype_by_name(const char *name)
return NULL;
}
void __init thread_throttle_mode_init(void)
{
struct rftype *rft;
rft = rdtgroup_get_rftype_by_name("thread_throttle_mode");
if (!rft)
return;
rft->fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB;
}
void __init mbm_config_rftype_init(const char *config)
void resctrl_file_fflags_init(const char *config, unsigned long fflags)
{
struct rftype *rft;
rft = rdtgroup_get_rftype_by_name(config);
if (rft)
rft->fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE;
rft->fflags = fflags;
}
/**
@ -2343,7 +2348,7 @@ static bool supports_mba_mbps(void)
struct rdt_resource *rmbm = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
return (is_mbm_local_enabled() &&
return (is_mbm_enabled() &&
r->alloc_capable && is_mba_linear() &&
r->ctrl_scope == rmbm->mon_scope);
}
@ -2357,6 +2362,7 @@ static int set_mba_sc(bool mba_sc)
struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
u32 num_closid = resctrl_arch_get_num_closid(r);
struct rdt_ctrl_domain *d;
unsigned long fflags;
int i;
if (!supports_mba_mbps() || mba_sc == is_mba_sc(r))
@ -2364,11 +2370,16 @@ static int set_mba_sc(bool mba_sc)
r->membw.mba_sc = mba_sc;
rdtgroup_default.mba_mbps_event = mba_mbps_default_event;
list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
for (i = 0; i < num_closid; i++)
d->mbps_val[i] = MBA_MAX_MBPS;
}
fflags = mba_sc ? RFTYPE_CTRL_BASE | RFTYPE_MON_BASE : 0;
resctrl_file_fflags_init("mba_MBps_event", fflags);
return 0;
}
@ -2768,7 +2779,7 @@ static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param)
ctx->enable_cdpl2 = true;
return 0;
case Opt_mba_mbps:
msg = "mba_MBps requires local MBM and linear scale MBA at L3 scope";
msg = "mba_MBps requires MBM and linear scale MBA at L3 scope";
if (!supports_mba_mbps())
return invalfc(fc, msg);
ctx->enable_mba_mbps = true;
@ -3622,6 +3633,8 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
rdt_last_cmd_puts("kernfs subdir error\n");
goto out_del_list;
}
if (is_mba_sc(NULL))
rdtgrp->mba_mbps_event = mba_mbps_default_event;
}
goto out_unlock;


@ -428,7 +428,7 @@ void __init topology_apply_cmdline_limits_early(void)
{
unsigned int possible = nr_cpu_ids;
/* 'maxcpus=0' 'nosmp' 'nolapic' 'disableapic' */
/* 'maxcpus=0' 'nosmp' 'nolapic' */
if (!setup_max_cpus || apic_is_disabled)
possible = 1;


@ -185,6 +185,7 @@ static void topo_set_ids(struct topo_scan *tscan, bool early)
if (!early) {
c->topo.logical_pkg_id = topology_get_logical_id(apicid, TOPO_PKG_DOMAIN);
c->topo.logical_die_id = topology_get_logical_id(apicid, TOPO_DIE_DOMAIN);
c->topo.logical_core_id = topology_get_logical_id(apicid, TOPO_CORE_DOMAIN);
}
/* Package relative core ID */


@ -20,6 +20,7 @@
#include <asm/fpu/signal.h>
#include <asm/fpu/xcr.h>
#include <asm/cpuid.h>
#include <asm/tlbflush.h>
#include <asm/prctl.h>
#include <asm/elf.h>
@ -232,7 +233,7 @@ static void __init setup_xstate_cache(void)
xmm_space);
for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
cpuid_count(CPUID_LEAF_XSTATE, i, &eax, &ebx, &ecx, &edx);
xstate_sizes[i] = eax;
xstate_flags[i] = ecx;
@ -398,7 +399,7 @@ int xfeature_size(int xfeature_nr)
u32 eax, ebx, ecx, edx;
CHECK_XFEATURE(xfeature_nr);
cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
cpuid_count(CPUID_LEAF_XSTATE, xfeature_nr, &eax, &ebx, &ecx, &edx);
return eax;
}
@ -441,9 +442,9 @@ static void __init __xstate_dump_leaves(void)
* just in case there are some goodies up there
*/
for (i = 0; i < XFEATURE_MAX + 10; i++) {
cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
cpuid_count(CPUID_LEAF_XSTATE, i, &eax, &ebx, &ecx, &edx);
pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
XSTATE_CPUID, i, eax, ebx, ecx, edx);
CPUID_LEAF_XSTATE, i, eax, ebx, ecx, edx);
}
}
@ -484,7 +485,7 @@ static int __init check_xtile_data_against_struct(int size)
* Check the maximum palette id:
* eax: the highest numbered palette subleaf.
*/
cpuid_count(TILE_CPUID, 0, &max_palid, &ebx, &ecx, &edx);
cpuid_count(CPUID_LEAF_TILE, 0, &max_palid, &ebx, &ecx, &edx);
/*
* Cross-check each tile size and find the maximum number of
@ -498,7 +499,7 @@ static int __init check_xtile_data_against_struct(int size)
* eax[31:16]: bytes per title
* ebx[31:16]: the max names (or max number of tiles)
*/
cpuid_count(TILE_CPUID, palid, &eax, &ebx, &edx, &edx);
cpuid_count(CPUID_LEAF_TILE, palid, &eax, &ebx, &edx, &edx);
tile_size = eax >> 16;
max = ebx >> 16;
@ -633,7 +634,7 @@ static unsigned int __init get_compacted_size(void)
* are no supervisor states, but XSAVEC still uses compacted
* format.
*/
cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
cpuid_count(CPUID_LEAF_XSTATE, 1, &eax, &ebx, &ecx, &edx);
return ebx;
}
@ -674,7 +675,7 @@ static unsigned int __init get_xsave_size_user(void)
* containing all the *user* state components
* corresponding to bits currently set in XCR0.
*/
cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
cpuid_count(CPUID_LEAF_XSTATE, 0, &eax, &ebx, &ecx, &edx);
return ebx;
}
@ -763,21 +764,16 @@ void __init fpu__init_system_xstate(unsigned int legacy_size)
return;
}
if (boot_cpu_data.cpuid_level < XSTATE_CPUID) {
WARN_ON_FPU(1);
return;
}
/*
* Find user xstates supported by the processor.
*/
cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
cpuid_count(CPUID_LEAF_XSTATE, 0, &eax, &ebx, &ecx, &edx);
fpu_kernel_cfg.max_features = eax + ((u64)edx << 32);
/*
* Find supervisor xstates supported by the processor.
*/
cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
cpuid_count(CPUID_LEAF_XSTATE, 1, &eax, &ebx, &ecx, &edx);
fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32);
if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {


@ -91,9 +91,11 @@ static inline bool check_la57_support(void)
return true;
}
static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdval_t *pmd)
static unsigned long __head sme_postprocess_startup(struct boot_params *bp,
pmdval_t *pmd,
unsigned long p2v_offset)
{
unsigned long vaddr, vaddr_end;
unsigned long paddr, paddr_end;
int i;
/* Encrypt the kernel and related (if SME is active) */
@ -106,10 +108,10 @@ static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdv
* attribute.
*/
if (sme_get_me_mask()) {
vaddr = (unsigned long)__start_bss_decrypted;
vaddr_end = (unsigned long)__end_bss_decrypted;
paddr = (unsigned long)&RIP_REL_REF(__start_bss_decrypted);
paddr_end = (unsigned long)&RIP_REL_REF(__end_bss_decrypted);
for (; vaddr < vaddr_end; vaddr += PMD_SIZE) {
for (; paddr < paddr_end; paddr += PMD_SIZE) {
/*
* On SNP, transition the page to shared in the RMP table so that
* it is consistent with the page table attribute change.
@ -118,11 +120,11 @@ static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdv
* mapping (kernel .text). PVALIDATE, by way of
* early_snp_set_memory_shared(), requires a valid virtual
* address but the kernel is currently running off of the identity
* mapping so use __pa() to get a *currently* valid virtual address.
* mapping so use the PA to get a *currently* valid virtual address.
*/
early_snp_set_memory_shared(__pa(vaddr), __pa(vaddr), PTRS_PER_PMD);
early_snp_set_memory_shared(paddr, paddr, PTRS_PER_PMD);
i = pmd_index(vaddr);
i = pmd_index(paddr - p2v_offset);
pmd[i] -= sme_get_me_mask();
}
}
@ -138,12 +140,15 @@ static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdv
* doesn't have to generate PC-relative relocations when accessing globals from
* that function. Clang actually does not generate them, which leads to
* boot-time crashes. To work around this problem, every global pointer must
* be accessed using RIP_REL_REF().
* be accessed using RIP_REL_REF(). Kernel virtual addresses can be determined
* by subtracting p2v_offset from the RIP-relative address.
*/
unsigned long __head __startup_64(unsigned long physaddr,
unsigned long __head __startup_64(unsigned long p2v_offset,
struct boot_params *bp)
{
pmd_t (*early_pgts)[PTRS_PER_PMD] = RIP_REL_REF(early_dynamic_pgts);
unsigned long physaddr = (unsigned long)&RIP_REL_REF(_text);
unsigned long va_text, va_end;
unsigned long pgtable_flags;
unsigned long load_delta;
pgdval_t *pgd;
@ -163,13 +168,16 @@ unsigned long __head __startup_64(unsigned long physaddr,
* Compute the delta between the address I am compiled to run at
* and the address I am actually running at.
*/
load_delta = physaddr - (unsigned long)(_text - __START_KERNEL_map);
load_delta = __START_KERNEL_map + p2v_offset;
RIP_REL_REF(phys_base) = load_delta;
/* Is the address not 2M aligned? */
if (load_delta & ~PMD_MASK)
for (;;);
va_text = physaddr - p2v_offset;
va_end = (unsigned long)&RIP_REL_REF(_end) - p2v_offset;
/* Include the SME encryption mask in the fixup value */
load_delta += sme_get_me_mask();
@ -178,7 +186,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
pgd = &RIP_REL_REF(early_top_pgt)->pgd;
pgd[pgd_index(__START_KERNEL_map)] += load_delta;
if (la57) {
if (IS_ENABLED(CONFIG_X86_5LEVEL) && la57) {
p4d = (p4dval_t *)&RIP_REL_REF(level4_kernel_pgt);
p4d[MAX_PTRS_PER_P4D - 1] += load_delta;
@ -230,7 +238,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
pmd_entry += sme_get_me_mask();
pmd_entry += physaddr;
for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) {
for (i = 0; i < DIV_ROUND_UP(va_end - va_text, PMD_SIZE); i++) {
int idx = i + (physaddr >> PMD_SHIFT);
pmd[idx % PTRS_PER_PMD] = pmd_entry + i * PMD_SIZE;
@ -255,11 +263,11 @@ unsigned long __head __startup_64(unsigned long physaddr,
pmd = &RIP_REL_REF(level2_kernel_pgt)->pmd;
/* invalidate pages before the kernel image */
for (i = 0; i < pmd_index((unsigned long)_text); i++)
for (i = 0; i < pmd_index(va_text); i++)
pmd[i] &= ~_PAGE_PRESENT;
/* fixup pages that are part of the kernel image */
for (; i <= pmd_index((unsigned long)_end); i++)
for (; i <= pmd_index(va_end); i++)
if (pmd[i] & _PAGE_PRESENT)
pmd[i] += load_delta;
@ -267,7 +275,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
for (; i < PTRS_PER_PMD; i++)
pmd[i] &= ~_PAGE_PRESENT;
return sme_postprocess_startup(bp, pmd);
return sme_postprocess_startup(bp, pmd, p2v_offset);
}
/* Wipe all early page tables except for the kernel symbol map */
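Passing p2v_offset (derived in head_64.S as the difference between the run-time physical and link-time virtual address of common_startup_64()) lets __startup_64() obtain kernel-virtual addresses by plain subtraction, and phys_base as __START_KERNEL_map + p2v_offset. A small self-contained check of that arithmetic with made-up example addresses:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* Made-up example values, only to show the arithmetic. */
        uint64_t start_kernel_map = 0xffffffff80000000ull;  /* __START_KERNEL_map */
        uint64_t phys_text        = 0x0000000001000000ull;  /* where _text actually landed */
        uint64_t virt_text        = 0xffffffff81000000ull;  /* link-time address of _text */

        /* p2v_offset: subtract it from a physical address to get the virtual one. */
        uint64_t p2v_offset = phys_text - virt_text;         /* unsigned wrap is fine */
        uint64_t load_delta = start_kernel_map + p2v_offset;

        printf("va_text    = %#llx\n", (unsigned long long)(phys_text - p2v_offset));
        printf("load_delta = %#llx\n", (unsigned long long)load_delta);
        return 0;
}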


@ -94,13 +94,19 @@ SYM_CODE_START_NOALIGN(startup_64)
/* Sanitize CPU configuration */
call verify_cpu
/*
* Derive the kernel's physical-to-virtual offset from the physical and
* virtual addresses of common_startup_64().
*/
leaq common_startup_64(%rip), %rdi
subq .Lcommon_startup_64(%rip), %rdi
/*
* Perform pagetable fixups. Additionally, if SME is active, encrypt
* the kernel and retrieve the modifier (SME encryption mask if SME
* is active) to be added to the initial pgdir entry that will be
* programmed into CR3.
*/
leaq _text(%rip), %rdi
movq %r15, %rsi
call __startup_64
@ -128,11 +134,11 @@ SYM_CODE_START_NOALIGN(startup_64)
/* Branch to the common startup code at its kernel virtual address */
ANNOTATE_RETPOLINE_SAFE
jmp *0f(%rip)
jmp *.Lcommon_startup_64(%rip)
SYM_CODE_END(startup_64)
__INITRODATA
0: .quad common_startup_64
SYM_DATA_LOCAL(.Lcommon_startup_64, .quad common_startup_64)
.text
SYM_CODE_START(secondary_startup_64)


@ -7,6 +7,7 @@
#include <linux/cpu.h>
#include <linux/irq.h>
#include <asm/cpuid.h>
#include <asm/irq_remapping.h>
#include <asm/hpet.h>
#include <asm/time.h>
@ -927,10 +928,7 @@ static bool __init mwait_pc10_supported(void)
if (!cpu_feature_enabled(X86_FEATURE_MWAIT))
return false;
if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
return false;
cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &mwait_substates);
cpuid(CPUID_LEAF_MWAIT, &eax, &ebx, &ecx, &mwait_substates);
return (ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) &&
(ecx & CPUID5_ECX_INTERRUPT_BREAK) &&


@ -28,19 +28,19 @@ static ssize_t version_show(struct kobject *kobj,
static struct kobj_attribute boot_params_version_attr = __ATTR_RO(version);
static ssize_t boot_params_data_read(struct file *fp, struct kobject *kobj,
struct bin_attribute *bin_attr,
const struct bin_attribute *bin_attr,
char *buf, loff_t off, size_t count)
{
memcpy(buf, (void *)&boot_params + off, count);
return count;
}
static struct bin_attribute boot_params_data_attr = {
static const struct bin_attribute boot_params_data_attr = {
.attr = {
.name = "data",
.mode = S_IRUGO,
},
.read = boot_params_data_read,
.read_new = boot_params_data_read,
.size = sizeof(boot_params),
};
@ -49,14 +49,14 @@ static struct attribute *boot_params_version_attrs[] = {
NULL,
};
static struct bin_attribute *boot_params_data_attrs[] = {
static const struct bin_attribute *const boot_params_data_attrs[] = {
&boot_params_data_attr,
NULL,
};
static const struct attribute_group boot_params_attr_group = {
.attrs = boot_params_version_attrs,
.bin_attrs = boot_params_data_attrs,
.bin_attrs_new = boot_params_data_attrs,
};
static int kobj_to_setup_data_nr(struct kobject *kobj, int *nr)
@ -172,7 +172,7 @@ static ssize_t type_show(struct kobject *kobj,
static ssize_t setup_data_data_read(struct file *fp,
struct kobject *kobj,
struct bin_attribute *bin_attr,
const struct bin_attribute *bin_attr,
char *buf,
loff_t off, size_t count)
{
@ -250,7 +250,7 @@ static struct bin_attribute data_attr __ro_after_init = {
.name = "data",
.mode = S_IRUGO,
},
.read = setup_data_data_read,
.read_new = setup_data_data_read,
};
static struct attribute *setup_data_type_attrs[] = {
@ -258,14 +258,14 @@ static struct attribute *setup_data_type_attrs[] = {
NULL,
};
static struct bin_attribute *setup_data_data_attrs[] = {
static const struct bin_attribute *const setup_data_data_attrs[] = {
&data_attr,
NULL,
};
static const struct attribute_group setup_data_attr_group = {
.attrs = setup_data_type_attrs,
.bin_attrs = setup_data_data_attrs,
.bin_attrs_new = setup_data_data_attrs,
};
static int __init create_setup_data_node(struct kobject *parent,
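The handlers still just memcpy() out of boot_params and setup_data; only the bin_attribute plumbing moves to the const .read_new/.bin_attrs_new interface. The exported blob remains readable from user space, for example (sysfs path as created by this file; adjust if it differs on your system):

#include <stdio.h>

int main(void)
{
        unsigned char buf[16];
        FILE *f = fopen("/sys/kernel/boot_params/data", "rb");
        size_t n;

        if (!f) { perror("fopen"); return 1; }
        n = fread(buf, 1, sizeof(buf), f);      /* dump the first bytes of boot_params */
        for (size_t i = 0; i < n; i++)
                printf("%02x ", buf[i]);
        putchar('\n');
        fclose(f);
        return 0;
}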


@ -983,7 +983,7 @@ static void __init kvm_init_platform(void)
x86_platform.apic_post_init = kvm_apic_init;
/* Set WB as the default cache mode for SEV-SNP and TDX */
mtrr_overwrite_state(NULL, 0, MTRR_TYPE_WRBACK);
guest_force_mtrr_state(NULL, 0, MTRR_TYPE_WRBACK);
}
#if defined(CONFIG_AMD_MEM_ENCRYPT)


@ -146,7 +146,8 @@ static void free_transition_pgtable(struct kimage *image)
image->arch.pte = NULL;
}
static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
static int init_transition_pgtable(struct kimage *image, pgd_t *pgd,
unsigned long control_page)
{
pgprot_t prot = PAGE_KERNEL_EXEC_NOENC;
unsigned long vaddr, paddr;
@ -156,8 +157,13 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
pmd_t *pmd;
pte_t *pte;
vaddr = (unsigned long)relocate_kernel;
paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE);
/*
* For the transition to the identity mapped page tables, the control
* code page also needs to be mapped at the virtual address it starts
* off running from.
*/
vaddr = (unsigned long)__va(control_page);
paddr = control_page;
pgd += pgd_index(vaddr);
if (!pgd_present(*pgd)) {
p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL);
@ -216,7 +222,7 @@ static void *alloc_pgt_page(void *data)
return p;
}
static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
static int init_pgtable(struct kimage *image, unsigned long control_page)
{
struct x86_mapping_info info = {
.alloc_pgt_page = alloc_pgt_page,
@ -225,12 +231,12 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
.kernpg_flag = _KERNPG_TABLE_NOENC,
};
unsigned long mstart, mend;
pgd_t *level4p;
int result;
int i;
level4p = (pgd_t *)__va(start_pgtable);
clear_page(level4p);
image->arch.pgd = alloc_pgt_page(image);
if (!image->arch.pgd)
return -ENOMEM;
if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
info.page_flag |= _PAGE_ENC;
@ -244,8 +250,8 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
mstart = pfn_mapped[i].start << PAGE_SHIFT;
mend = pfn_mapped[i].end << PAGE_SHIFT;
result = kernel_ident_mapping_init(&info,
level4p, mstart, mend);
result = kernel_ident_mapping_init(&info, image->arch.pgd,
mstart, mend);
if (result)
return result;
}
@ -260,8 +266,8 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
mstart = image->segment[i].mem;
mend = mstart + image->segment[i].memsz;
result = kernel_ident_mapping_init(&info,
level4p, mstart, mend);
result = kernel_ident_mapping_init(&info, image->arch.pgd,
mstart, mend);
if (result)
return result;
@ -271,15 +277,19 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
* Prepare EFI systab and ACPI tables for kexec kernel since they are
* not covered by pfn_mapped.
*/
result = map_efi_systab(&info, level4p);
result = map_efi_systab(&info, image->arch.pgd);
if (result)
return result;
result = map_acpi_tables(&info, level4p);
result = map_acpi_tables(&info, image->arch.pgd);
if (result)
return result;
return init_transition_pgtable(image, level4p);
/*
* This must be last because the intermediate page table pages it
* allocates will not be control pages and may overlap the image.
*/
return init_transition_pgtable(image, image->arch.pgd, control_page);
}
static void load_segments(void)
@ -296,22 +306,35 @@ static void load_segments(void)
int machine_kexec_prepare(struct kimage *image)
{
unsigned long start_pgtable;
void *control_page = page_address(image->control_code_page);
unsigned long reloc_start = (unsigned long)__relocate_kernel_start;
unsigned long reloc_end = (unsigned long)__relocate_kernel_end;
int result;
/* Calculate the offsets */
start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
/* Setup the identity mapped 64bit page table */
result = init_pgtable(image, start_pgtable);
result = init_pgtable(image, __pa(control_page));
if (result)
return result;
kexec_va_control_page = (unsigned long)control_page;
kexec_pa_table_page = (unsigned long)__pa(image->arch.pgd);
if (image->type == KEXEC_TYPE_DEFAULT)
kexec_pa_swap_page = page_to_pfn(image->swap_page) << PAGE_SHIFT;
__memcpy(control_page, __relocate_kernel_start, reloc_end - reloc_start);
set_memory_rox((unsigned long)control_page, 1);
return 0;
}
void machine_kexec_cleanup(struct kimage *image)
{
void *control_page = page_address(image->control_code_page);
set_memory_nx((unsigned long)control_page, 1);
set_memory_rw((unsigned long)control_page, 1);
free_transition_pgtable(image);
}
@ -321,7 +344,12 @@ void machine_kexec_cleanup(struct kimage *image)
*/
void machine_kexec(struct kimage *image)
{
unsigned long page_list[PAGES_NR];
unsigned long (*relocate_kernel_ptr)(unsigned long indirection_page,
unsigned long pa_control_page,
unsigned long start_address,
unsigned int preserve_context,
unsigned int host_mem_enc_active);
unsigned long reloc_start = (unsigned long)__relocate_kernel_start;
unsigned int host_mem_enc_active;
int save_ftrace_enabled;
void *control_page;
@ -357,17 +385,14 @@ void machine_kexec(struct kimage *image)
#endif
}
control_page = page_address(image->control_code_page) + PAGE_SIZE;
__memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);
control_page = page_address(image->control_code_page);
page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
page_list[PA_TABLE_PAGE] =
(unsigned long)__pa(page_address(image->control_code_page));
if (image->type == KEXEC_TYPE_DEFAULT)
page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
<< PAGE_SHIFT);
/*
* Allow for the possibility that relocate_kernel might not be at
* the very start of the page.
*/
relocate_kernel_ptr = control_page + (unsigned long)relocate_kernel -
reloc_start;
/*
* The segment registers are funny things, they have both a
@ -388,11 +413,11 @@ void machine_kexec(struct kimage *image)
native_gdt_invalidate();
/* now call it */
image->start = relocate_kernel((unsigned long)image->head,
(unsigned long)page_list,
image->start,
image->preserve_context,
host_mem_enc_active);
image->start = relocate_kernel_ptr((unsigned long)image->head,
virt_to_phys(control_page),
image->start,
image->preserve_context,
host_mem_enc_active);
#ifdef CONFIG_KEXEC_JUMP
if (image->preserve_context)
@ -573,8 +598,7 @@ static void kexec_mark_crashkres(bool protect)
/* Don't touch the control code page used in crash_kexec().*/
control = PFN_PHYS(page_to_pfn(kexec_crash_image->control_code_page));
/* Control code page is located in the 2nd page. */
kexec_mark_range(crashk_res.start, control + PAGE_SIZE - 1, protect);
kexec_mark_range(crashk_res.start, control - 1, protect);
control += KEXEC_CONTROL_PAGE_SIZE;
kexec_mark_range(control, crashk_res.end, protect);
}
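The control page now receives a copy of the whole __relocate_kernel_start..__relocate_kernel_end blob, and machine_kexec() recomputes the entry point from relocate_kernel's offset inside that blob rather than assuming it sits at the start of the page. The pointer arithmetic, as a stand-alone toy (names and sizes are illustrative):

#include <stdio.h>
#include <string.h>

/* Copy a code/data blob into a destination page and translate a pointer that
 * falls inside the blob to its new location, preserving the offset. */
static void *relocated_ptr(void *control_page, const void *blob_start,
                           const void *ptr_in_blob)
{
        return (char *)control_page + ((const char *)ptr_in_blob -
                                       (const char *)blob_start);
}

int main(void)
{
        char blob[64] = "stand-in for .text/.data.relocate_kernel";
        char control_page[64];

        memcpy(control_page, blob, sizeof(blob));
        /* pretend relocate_kernel starts 16 bytes into the blob */
        char *entry = relocated_ptr(control_page, blob, blob + 16);
        printf("offset preserved: %td\n", entry - control_page);
        return 0;
}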


@ -123,11 +123,6 @@ static noinstr void pv_native_set_debugreg(int regno, unsigned long val)
native_set_debugreg(regno, val);
}
noinstr void pv_native_wbinvd(void)
{
native_wbinvd();
}
static noinstr void pv_native_safe_halt(void)
{
native_safe_halt();
@ -155,7 +150,6 @@ struct paravirt_patch_template pv_ops = {
.cpu.read_cr0 = native_read_cr0,
.cpu.write_cr0 = native_write_cr0,
.cpu.write_cr4 = native_write_cr4,
.cpu.wbinvd = pv_native_wbinvd,
.cpu.read_msr = native_read_msr,
.cpu.write_msr = native_write_msr,
.cpu.read_msr_safe = native_read_msr_safe,


@ -108,10 +108,6 @@ void __init pci_iommu_alloc(void)
swiotlb_init(x86_swiotlb_enable, x86_swiotlb_flags);
}
/*
* See <Documentation/arch/x86/x86_64/boot-options.rst> for the iommu kernel
* parameter documentation.
*/
static __init int iommu_setup(char *p)
{
iommu_merge = 1;


@ -30,6 +30,7 @@
#include <linux/hw_breakpoint.h>
#include <linux/entry-common.h>
#include <asm/cpu.h>
#include <asm/cpuid.h>
#include <asm/apic.h>
#include <linux/uaccess.h>
#include <asm/mwait.h>
@ -825,7 +826,7 @@ void __noreturn stop_this_cpu(void *dummy)
* X86_FEATURE_SME due to cmdline options.
*/
if (c->extended_cpuid_level >= 0x8000001f && (cpuid_eax(0x8000001f) & BIT(0)))
native_wbinvd();
wbinvd();
/*
* This brings a cache line back and dirties it, but
@ -838,7 +839,7 @@ void __noreturn stop_this_cpu(void *dummy)
#ifdef CONFIG_SMP
if (smp_ops.stop_this_cpu) {
smp_ops.stop_this_cpu();
unreachable();
BUG();
}
#endif
@ -846,7 +847,7 @@ void __noreturn stop_this_cpu(void *dummy)
/*
* Use native_halt() so that memory contents don't change
* (stack usage and variables) after possibly issuing the
* native_wbinvd() above.
* wbinvd() above.
*/
native_halt();
}
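The wbinvd() here is still gated on the CPU advertising SME: extended leaf 0x8000001F, EAX bit 0. The same probe can be done from user space with the compiler's <cpuid.h> helpers, for illustration:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
        unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;

        /* Leaf 0x8000001f only exists if the maximum extended leaf covers it. */
        if (__get_cpuid_max(0x80000000, 0) < 0x8000001f) {
                puts("no SME/SEV leaf");
                return 0;
        }
        __cpuid(0x8000001f, eax, ebx, ecx, edx);
        printf("SME supported: %s\n", (eax & 1) ? "yes" : "no");
        return 0;
}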
@ -877,7 +878,7 @@ static __init bool prefer_mwait_c1_over_halt(void)
if (boot_cpu_has_bug(X86_BUG_MONITOR) || boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E))
return false;
cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
cpuid(CPUID_LEAF_MWAIT, &eax, &ebx, &ecx, &edx);
/*
* If MWAIT extensions are not available, it is safe to use MWAIT


@ -883,7 +883,7 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
if (smp_ops.stop_this_cpu) {
smp_ops.stop_this_cpu();
unreachable();
BUG();
}
/* Assume hlt works */


@ -24,33 +24,30 @@
#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
/*
* control_page + KEXEC_CONTROL_CODE_MAX_SIZE
* ~ control_page + PAGE_SIZE are used as data storage and stack for
* jumping back
* The .text.relocate_kernel and .data.relocate_kernel sections are copied
* into the control page, and the remainder of the page is used as the stack.
*/
#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset))
.section .data.relocate_kernel,"a";
/* Minimal CPU state */
#define RSP DATA(0x0)
#define CR0 DATA(0x8)
#define CR3 DATA(0x10)
#define CR4 DATA(0x18)
SYM_DATA_LOCAL(saved_rsp, .quad 0)
SYM_DATA_LOCAL(saved_cr0, .quad 0)
SYM_DATA_LOCAL(saved_cr3, .quad 0)
SYM_DATA_LOCAL(saved_cr4, .quad 0)
/* other data */
SYM_DATA(kexec_va_control_page, .quad 0)
SYM_DATA(kexec_pa_table_page, .quad 0)
SYM_DATA(kexec_pa_swap_page, .quad 0)
SYM_DATA_LOCAL(pa_backup_pages_map, .quad 0)
/* other data */
#define CP_PA_TABLE_PAGE DATA(0x20)
#define CP_PA_SWAP_PAGE DATA(0x28)
#define CP_PA_BACKUP_PAGES_MAP DATA(0x30)
.text
.align PAGE_SIZE
.section .text.relocate_kernel,"ax";
.code64
SYM_CODE_START_NOALIGN(relocate_range)
SYM_CODE_START_NOALIGN(relocate_kernel)
UNWIND_HINT_END_OF_STACK
ANNOTATE_NOENDBR
/*
* %rdi indirection_page
* %rsi page_list
* %rsi pa_control_page
* %rdx start address
* %rcx preserve_context
* %r8 host_mem_enc_active
@ -65,51 +62,36 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
pushq %r15
pushf
movq PTR(VA_CONTROL_PAGE)(%rsi), %r11
movq %rsp, RSP(%r11)
movq %cr0, %rax
movq %rax, CR0(%r11)
movq %cr3, %rax
movq %rax, CR3(%r11)
movq %cr4, %rax
movq %rax, CR4(%r11)
/* Save CR4. Required to enable the right paging mode later. */
movq %rax, %r13
/* zero out flags, and disable interrupts */
pushq $0
popfq
/* Save SME active flag */
movq %r8, %r12
/*
* get physical address of control page now
* this is impossible after page table switch
*/
movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
/* get physical address of page table now too */
movq PTR(PA_TABLE_PAGE)(%rsi), %r9
/* get physical address of swap page now */
movq PTR(PA_SWAP_PAGE)(%rsi), %r10
/* save some information for jumping back */
movq %r9, CP_PA_TABLE_PAGE(%r11)
movq %r10, CP_PA_SWAP_PAGE(%r11)
movq %rdi, CP_PA_BACKUP_PAGES_MAP(%r11)
/* Switch to the identity mapped page tables */
movq %cr3, %rax
movq kexec_pa_table_page(%rip), %r9
movq %r9, %cr3
/* Save %rsp and CRs. */
movq %rsp, saved_rsp(%rip)
movq %rax, saved_cr3(%rip)
movq %cr0, %rax
movq %rax, saved_cr0(%rip)
/* Leave CR4 in %r13 to enable the right paging mode later. */
movq %cr4, %r13
movq %r13, saved_cr4(%rip)
/* save indirection list for jumping back */
movq %rdi, pa_backup_pages_map(%rip)
/* Save the preserve_context to %r11 as swap_pages clobbers %rcx. */
movq %rcx, %r11
/* setup a new stack at the end of the physical control page */
lea PAGE_SIZE(%r8), %rsp
lea PAGE_SIZE(%rsi), %rsp
/* jump to identity mapped page */
addq $(identity_mapped - relocate_kernel), %r8
pushq %r8
addq $(identity_mapped - relocate_kernel), %rsi
pushq %rsi
ANNOTATE_UNRET_SAFE
ret
int3
@ -117,6 +99,15 @@ SYM_CODE_END(relocate_kernel)
SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
UNWIND_HINT_END_OF_STACK
/*
* %rdi indirection page
* %rdx start address
* %r8 host_mem_enc_active
* %r9 page table page
* %r11 preserve_context
* %r13 original CR4 when relocate_kernel() was invoked
*/
/* set return address to 0 if not preserving context */
pushq $0
/* store the start address on the stack */
@ -166,13 +157,11 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
* entries that will conflict with the now unencrypted memory
* used by kexec. Flush the caches before copying the kernel.
*/
testq %r12, %r12
testq %r8, %r8
jz .Lsme_off
wbinvd
.Lsme_off:
/* Save the preserve_context to %r11 as swap_pages clobbers %rcx. */
movq %rcx, %r11
call swap_pages
/*
@ -184,13 +173,14 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
movq %cr3, %rax
movq %rax, %cr3
testq %r11, %r11 /* preserve_context */
jnz .Lrelocate
/*
* set all of the registers to known values
* leave %rsp alone
*/
testq %r11, %r11
jnz .Lrelocate
xorl %eax, %eax
xorl %ebx, %ebx
xorl %ecx, %ecx
@ -220,13 +210,14 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
/* get the re-entry point of the peer system */
movq 0(%rsp), %rbp
leaq relocate_kernel(%rip), %r8
movq CP_PA_SWAP_PAGE(%r8), %r10
movq CP_PA_BACKUP_PAGES_MAP(%r8), %rdi
movq CP_PA_TABLE_PAGE(%r8), %rax
movq kexec_pa_swap_page(%rip), %r10
movq pa_backup_pages_map(%rip), %rdi
movq kexec_pa_table_page(%rip), %rax
movq %rax, %cr3
lea PAGE_SIZE(%r8), %rsp
call swap_pages
movq $virtual_mapped, %rax
movq kexec_va_control_page(%rip), %rax
addq $(virtual_mapped - relocate_kernel), %rax
pushq %rax
ANNOTATE_UNRET_SAFE
ret
@ -236,11 +227,11 @@ SYM_CODE_END(identity_mapped)
SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped)
UNWIND_HINT_END_OF_STACK
ANNOTATE_NOENDBR // RET target, above
movq RSP(%r8), %rsp
movq CR4(%r8), %rax
movq saved_rsp(%rip), %rsp
movq saved_cr4(%rip), %rax
movq %rax, %cr4
movq CR3(%r8), %rax
movq CR0(%r8), %r8
movq saved_cr3(%rip), %rax
movq saved_cr0(%rip), %r8
movq %rax, %cr3
movq %r8, %cr0
@ -270,37 +261,40 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
movq %rdi, %rcx /* Put the indirection_page in %rcx */
xorl %edi, %edi
xorl %esi, %esi
jmp 1f
jmp .Lstart /* Should start with an indirection record */
0: /* top, read another word for the indirection page */
.Lloop: /* top, read another word for the indirection page */
movq (%rbx), %rcx
addq $8, %rbx
1:
.Lstart:
testb $0x1, %cl /* is it a destination page? */
jz 2f
jz .Lnotdest
movq %rcx, %rdi
andq $0xfffffffffffff000, %rdi
jmp 0b
2:
jmp .Lloop
.Lnotdest:
testb $0x2, %cl /* is it an indirection page? */
jz 2f
jz .Lnotind
movq %rcx, %rbx
andq $0xfffffffffffff000, %rbx
jmp 0b
2:
jmp .Lloop
.Lnotind:
testb $0x4, %cl /* is it the done indicator? */
jz 2f
jmp 3f
2:
jz .Lnotdone
jmp .Ldone
.Lnotdone:
testb $0x8, %cl /* is it the source indicator? */
jz 0b /* Ignore it otherwise */
jz .Lloop /* Ignore it otherwise */
movq %rcx, %rsi /* For ever source page do a copy */
andq $0xfffffffffffff000, %rsi
movq %rdi, %rdx /* Save destination page to %rdx */
movq %rsi, %rax /* Save source page to %rax */
testq %r11, %r11 /* Only actually swap for ::preserve_context */
jz .Lnoswap
/* copy source page to swap page */
movq %r10, %rdi
movl $512, %ecx
@ -315,16 +309,14 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
/* copy swap page to destination page */
movq %rdx, %rdi
movq %r10, %rsi
.Lnoswap:
movl $512, %ecx
rep ; movsq
lea PAGE_SIZE(%rax), %rsi
jmp 0b
3:
jmp .Lloop
.Ldone:
ANNOTATE_UNRET_SAFE
ret
int3
SYM_CODE_END(swap_pages)
.skip KEXEC_CONTROL_CODE_MAX_SIZE - (. - relocate_kernel), 0xcc
SYM_CODE_END(relocate_range);
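swap_pages still walks the kexec indirection list: each 64-bit entry is a page-aligned address plus one flag bit in the low bits (0x1 destination, 0x2 indirection, 0x4 done, 0x8 source), and the rep movsq copy leaves the destination advanced by a page after every source entry. A user-space model of that walk, under the simplification that no swap page is involved (the non-preserve_context path):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define IND_DESTINATION 0x1
#define IND_INDIRECTION 0x2
#define IND_DONE        0x4
#define IND_SOURCE      0x8
#define PAGE_SZ         4096

static void walk(const uint64_t *entry)
{
        uint8_t *dest = NULL;

        for (;;) {
                uint64_t e = *entry++;
                void *page = (void *)(uintptr_t)(e & ~0xfffull);

                if (e & IND_DESTINATION)
                        dest = page;                    /* following copies land here */
                else if (e & IND_INDIRECTION)
                        entry = page;                   /* continue in another list page */
                else if (e & IND_DONE)
                        return;
                else if (e & IND_SOURCE) {
                        memcpy(dest, page, PAGE_SZ);    /* copy one source page */
                        dest += PAGE_SZ;                /* rep movsq leaves %rdi advanced */
                }
        }
}

int main(void)
{
        uint64_t *list = aligned_alloc(PAGE_SZ, PAGE_SZ);
        uint8_t *src   = aligned_alloc(PAGE_SZ, PAGE_SZ);
        uint8_t *dst   = aligned_alloc(PAGE_SZ, PAGE_SZ);

        memset(src, 0xab, PAGE_SZ);
        list[0] = (uint64_t)(uintptr_t)dst | IND_DESTINATION;
        list[1] = (uint64_t)(uintptr_t)src | IND_SOURCE;
        list[2] = IND_DONE;
        walk(list);
        printf("copied: %s\n", dst[0] == 0xab ? "yes" : "no");
        return 0;
}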


@ -64,6 +64,7 @@
#include <asm/acpi.h>
#include <asm/cacheinfo.h>
#include <asm/cpuid.h>
#include <asm/desc.h>
#include <asm/nmi.h>
#include <asm/irq.h>
@ -1291,10 +1292,8 @@ static inline void mwait_play_dead(void)
return;
if (!this_cpu_has(X86_FEATURE_CLFLUSH))
return;
if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
return;
eax = CPUID_MWAIT_LEAF;
eax = CPUID_LEAF_MWAIT;
ecx = 0;
native_cpuid(&eax, &ebx, &ecx, &edx);


@ -16,6 +16,7 @@
#include <linux/static_key.h>
#include <linux/static_call.h>
#include <asm/cpuid.h>
#include <asm/hpet.h>
#include <asm/timer.h>
#include <asm/vgtod.h>
@ -665,13 +666,13 @@ unsigned long native_calibrate_tsc(void)
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
return 0;
if (boot_cpu_data.cpuid_level < 0x15)
if (boot_cpu_data.cpuid_level < CPUID_LEAF_TSC)
return 0;
eax_denominator = ebx_numerator = ecx_hz = edx = 0;
/* CPUID 15H TSC/Crystal ratio, plus optionally Crystal Hz */
cpuid(0x15, &eax_denominator, &ebx_numerator, &ecx_hz, &edx);
cpuid(CPUID_LEAF_TSC, &eax_denominator, &ebx_numerator, &ecx_hz, &edx);
if (ebx_numerator == 0 || eax_denominator == 0)
return 0;
@ -680,8 +681,8 @@ unsigned long native_calibrate_tsc(void)
/*
* Denverton SoCs don't report crystal clock, and also don't support
* CPUID.0x16 for the calculation below, so hardcode the 25MHz crystal
* clock.
* CPUID_LEAF_FREQ for the calculation below, so hardcode the 25MHz
* crystal clock.
*/
if (crystal_khz == 0 &&
boot_cpu_data.x86_vfm == INTEL_ATOM_GOLDMONT_D)
@ -700,10 +701,10 @@ unsigned long native_calibrate_tsc(void)
* clock, but we can easily calculate it to a high degree of accuracy
* by considering the crystal ratio and the CPU speed.
*/
if (crystal_khz == 0 && boot_cpu_data.cpuid_level >= 0x16) {
if (crystal_khz == 0 && boot_cpu_data.cpuid_level >= CPUID_LEAF_FREQ) {
unsigned int eax_base_mhz, ebx, ecx, edx;
cpuid(0x16, &eax_base_mhz, &ebx, &ecx, &edx);
cpuid(CPUID_LEAF_FREQ, &eax_base_mhz, &ebx, &ecx, &edx);
crystal_khz = eax_base_mhz * 1000 *
eax_denominator / ebx_numerator;
}
@ -738,12 +739,12 @@ static unsigned long cpu_khz_from_cpuid(void)
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
return 0;
if (boot_cpu_data.cpuid_level < 0x16)
if (boot_cpu_data.cpuid_level < CPUID_LEAF_FREQ)
return 0;
eax_base_mhz = ebx_max_mhz = ecx_bus_mhz = edx = 0;
cpuid(0x16, &eax_base_mhz, &ebx_max_mhz, &ecx_bus_mhz, &edx);
cpuid(CPUID_LEAF_FREQ, &eax_base_mhz, &ebx_max_mhz, &ecx_bus_mhz, &edx);
return eax_base_mhz * 1000;
}
@ -1067,10 +1068,8 @@ core_initcall(cpufreq_register_tsc_scaling);
#endif /* CONFIG_CPU_FREQ */
#define ART_CPUID_LEAF (0x15)
#define ART_MIN_DENOMINATOR (1)
/*
* If ART is present detect the numerator:denominator to convert to TSC
*/
@ -1078,7 +1077,7 @@ static void __init detect_art(void)
{
unsigned int unused;
if (boot_cpu_data.cpuid_level < ART_CPUID_LEAF)
if (boot_cpu_data.cpuid_level < CPUID_LEAF_TSC)
return;
/*
@ -1091,7 +1090,7 @@ static void __init detect_art(void)
tsc_async_resets)
return;
cpuid(ART_CPUID_LEAF, &art_base_clk.denominator,
cpuid(CPUID_LEAF_TSC, &art_base_clk.denominator,
&art_base_clk.numerator, &art_base_clk.freq_khz, &unused);
art_base_clk.freq_khz /= KHZ;
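Both native_calibrate_tsc() and detect_art() now go through the named CPUID_LEAF_TSC/CPUID_LEAF_FREQ constants; the math is unchanged: CPUID.15H gives a crystal-to-TSC ratio (denominator in EAX, numerator in EBX, optional crystal Hz in ECX), and when ECX is zero the crystal can be back-computed from the CPUID.16H base frequency. A worked example with illustrative leaf values, not taken from real hardware:

#include <stdio.h>

int main(void)
{
        /* Example CPUID.15H output. */
        unsigned int eax_denominator = 2;
        unsigned int ebx_numerator   = 176;
        unsigned int ecx_hz          = 38400000;   /* 38.4 MHz crystal */
        /* Example CPUID.16H base frequency in MHz. */
        unsigned int eax_base_mhz    = 3400;

        unsigned int crystal_khz = ecx_hz / 1000;

        if (crystal_khz == 0)   /* crystal not reported: derive it from CPUID.16H */
                crystal_khz = eax_base_mhz * 1000 * eax_denominator / ebx_numerator;

        unsigned int tsc_khz = crystal_khz * ebx_numerator / eax_denominator;
        printf("TSC = %u kHz (%u.%03u MHz)\n", tsc_khz, tsc_khz / 1000, tsc_khz % 1000);
        return 0;
}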


@ -28,6 +28,7 @@
#include <asm/orc_lookup.h>
#include <asm/cache.h>
#include <asm/boot.h>
#include <asm/kexec.h>
#undef i386 /* in case the preprocessor is a 32bit one */
@ -95,7 +96,19 @@ const_pcpu_hot = pcpu_hot;
#define BSS_DECRYPTED
#endif
#if defined(CONFIG_X86_64) && defined(CONFIG_KEXEC_CORE)
#define KEXEC_RELOCATE_KERNEL \
. = ALIGN(0x100); \
__relocate_kernel_start = .; \
*(.text.relocate_kernel); \
*(.data.relocate_kernel); \
__relocate_kernel_end = .;
ASSERT(__relocate_kernel_end - __relocate_kernel_start <= KEXEC_CONTROL_CODE_MAX_SIZE,
"relocate_kernel code too large!")
#else
#define KEXEC_RELOCATE_KERNEL
#endif
PHDRS {
text PT_LOAD FLAGS(5); /* R_E */
data PT_LOAD FLAGS(6); /* RW_ */
@ -121,19 +134,6 @@ SECTIONS
.text : AT(ADDR(.text) - LOAD_OFFSET) {
_text = .;
_stext = .;
/* bootstrapping code */
HEAD_TEXT
TEXT_TEXT
SCHED_TEXT
LOCK_TEXT
KPROBES_TEXT
SOFTIRQENTRY_TEXT
#ifdef CONFIG_MITIGATION_RETPOLINE
*(.text..__x86.indirect_thunk)
*(.text..__x86.return_thunk)
#endif
STATIC_CALL_TEXT
ALIGN_ENTRY_TEXT_BEGIN
*(.text..__x86.rethunk_untrain)
ENTRY_TEXT
@ -147,10 +147,26 @@ SECTIONS
*(.text..__x86.rethunk_safe)
#endif
ALIGN_ENTRY_TEXT_END
TEXT_TEXT
SCHED_TEXT
LOCK_TEXT
KPROBES_TEXT
SOFTIRQENTRY_TEXT
#ifdef CONFIG_MITIGATION_RETPOLINE
*(.text..__x86.indirect_thunk)
*(.text..__x86.return_thunk)
#endif
STATIC_CALL_TEXT
*(.gnu.warning)
} :text = 0xcccccccc
/* bootstrapping code */
.head.text : AT(ADDR(.head.text) - LOAD_OFFSET) {
HEAD_TEXT
} :text = 0xcccccccc
/* End of text section, which should occupy whole number of pages */
_etext = .;
. = ALIGN(PAGE_SIZE);
@ -181,6 +197,7 @@ SECTIONS
DATA_DATA
CONSTRUCTORS
KEXEC_RELOCATE_KERNEL
/* rarely changed data like cpu maps */
READ_MOSTLY_DATA(INTERNODE_CACHE_BYTES)


@ -3820,7 +3820,7 @@ static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc)
goto next_range;
}
unreachable();
BUG();
}
static int __sev_snp_update_protected_guest_state(struct kvm_vcpu *vcpu)


@ -678,7 +678,7 @@ page_fault_oops(struct pt_regs *regs, unsigned long error_code,
ASM_CALL_ARG3,
, [arg1] "r" (regs), [arg2] "r" (address), [arg3] "r" (&info));
unreachable();
BUG();
}
#endif

Some files were not shown because too many files have changed in this diff.