Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git

# Conflicts:
#	include/linux/mm.h
#	include/linux/mm_types.h
#	include/linux/mmap_lock.h
#	kernel/fork.c
#	mm/init-mm.c
#	tools/testing/vma/vma_internal.h
Stephen Rothwell 2025-01-13 13:27:49 +11:00
commit bb0a44fa74
218 changed files with 5047 additions and 3789 deletions


@ -194,8 +194,6 @@ is applicable::
WDT Watchdog support is enabled.
X86-32 X86-32, aka i386 architecture is enabled.
X86-64 X86-64 architecture is enabled.
More X86-64 boot options can be found in
Documentation/arch/x86/x86_64/boot-options.rst.
X86 Either 32-bit or 64-bit x86 (same as X86-32+X86-64)
X86_UV SGI UV support is enabled.
XEN Xen support is enabled
@ -213,7 +211,6 @@ Do not modify the syntax of boot loader parameters without extreme
need or coordination with <Documentation/arch/x86/boot.rst>.
There are also arch-specific kernel-parameters not documented here.
See for example <Documentation/arch/x86/x86_64/boot-options.rst>.
Note that ALL kernel parameters listed below are CASE SENSITIVE, and that
a trailing = on the name of any parameter states that that parameter will


@ -21,6 +21,10 @@
strictly ACPI specification compliant.
rsdt -- prefer RSDT over (default) XSDT
copy_dsdt -- copy DSDT to memory
nocmcff -- Disable firmware first mode for corrected
errors. This disables parsing the HEST CMC error
source to check if firmware has set the FF flag. This
may result in duplicate corrected error reports.
nospcr -- disable console in ACPI SPCR table as
default _serial_ console on ARM64
For ARM64, ONLY "acpi=off", "acpi=on", "acpi=force" or
@ -405,6 +409,8 @@
not play well with APC CPU idle - disable it if you have
APC and your system crashes randomly.
apic [APIC,X86-64] Use IO-APIC. Default.
apic= [APIC,X86,EARLY] Advanced Programmable Interrupt Controller
Change the output verbosity while booting
Format: { quiet (default) | verbose | debug }
@ -424,6 +430,10 @@
useful so that a dump capture kernel won't be
shot down by NMI
apicpmtimer Do APIC timer calibration using the pmtimer. Implies
apicmaintimer. Useful when your PIT timer is totally
broken.
autoconf= [IPV6]
See Documentation/networking/ipv6.rst.
@ -1726,6 +1736,8 @@
off: Disable GDS mitigation.
gbpages [X86] Use GB pages for kernel direct mappings.
gcov_persist= [GCOV] When non-zero (default), profiling data for
kernel modules is saved and remains accessible via
debugfs, even when the module is unloaded/reloaded.
@ -2008,12 +2020,21 @@
idle= [X86,EARLY]
Format: idle=poll, idle=halt, idle=nomwait
Poll forces a polling idle loop that can slightly
improve the performance of waking up an idle CPU, but
will use a lot of power and make the system run hot.
Not recommended.
idle=poll: Don't do power saving in the idle loop
using HLT, but poll for rescheduling event. This will
make the CPUs eat a lot more power, but may be useful
to get slightly better performance in multiprocessor
benchmarks. It also makes some profiling using
performance counters more accurate. Please note that
on systems with MONITOR/MWAIT support (like Intel
EM64T CPUs) this option has no performance advantage
over the normal idle loop. It may also interact badly
with hyperthreading.
idle=halt: Halt is forced to be used for CPU idle.
In such case C2/C3 won't be used again.
idle=nomwait: Disable mwait for CPU C-states
idxd.sva= [HW]
@ -2311,20 +2332,73 @@
relaxed
iommu= [X86,EARLY]
off
Don't initialize and use any kind of IOMMU.
force
Force the use of the hardware IOMMU even when
it is not actually needed (e.g. because < 3 GB
memory).
noforce
Don't force hardware IOMMU usage when it is not
needed. (default).
biomerge
panic
nopanic
merge
nomerge
soft
pt [X86]
nopt [X86]
nobypass [PPC/POWERNV]
Use software bounce buffering (SWIOTLB) (default for
Intel machines). This can be used to prevent the usage
of an available hardware IOMMU.
[X86]
pt
[X86]
nopt
[PPC/POWERNV]
nobypass
Disable IOMMU bypass, using IOMMU for PCI devices.
[X86]
AMD Gart HW IOMMU-specific options:
<size>
Set the size of the remapping area in bytes.
allowed
Overwrite iommu off workarounds for specific chipsets
fullflush
Flush IOMMU on each allocation (default).
nofullflush
Don't use IOMMU fullflush.
memaper[=<order>]
Allocate an own aperture over RAM with size
32MB<<order. (default: order=1, i.e. 64MB)
merge
Do scatter-gather (SG) merging. Implies "force"
(experimental).
nomerge
Don't do scatter-gather (SG) merging.
noaperture
Ask the IOMMU not to touch the aperture for AGP.
noagp
Don't initialize the AGP driver and use full aperture.
panic
Always panic when IOMMU overflows.
iommu.forcedac= [ARM64,X86,EARLY] Control IOVA allocation for PCI devices.
Format: { "0" | "1" }
0 - Try to allocate a 32-bit DMA address first, before
@ -2432,7 +2506,9 @@
specified in the flag list (default: domain):
nohz
Disable the tick when a single task runs.
Disable the tick when a single task runs, and also disable
other kernel noise, e.g. by offloading RCU callbacks.
This is equivalent to the nohz_full parameter.
A residual 1Hz tick is offloaded to workqueues, which you
need to affine to housekeeping through the global
@ -2695,7 +2771,7 @@
VMs, i.e. on the 0=>1 and 1=>0 transitions of the
number of VMs.
Enabling virtualization at module lode avoids potential
Enabling virtualization at module load avoids potential
latency for creation of the 0=>1 VM, as KVM serializes
virtualization enabling across all online CPUs. The
"cost" of enabling virtualization when KVM is loaded,
@ -3259,9 +3335,77 @@
devices can be requested on-demand with the
/dev/loop-control interface.
mce [X86-32] Machine Check Exception
mce= [X86-{32,64}]
Please see Documentation/arch/x86/x86_64/machinecheck.rst for sysfs runtime tunables.
off
disable machine check
no_cmci
disable CMCI (Corrected Machine Check Interrupt), which
Intel processors support. Usually this disablement is
not recommended, but it might be handy if your
hardware is misbehaving.
Note that you'll get more problems without CMCI than
with it, due to the shared banks, i.e. you might get
duplicated error logs.
dont_log_ce
don't make logs for corrected errors. All events
reported as corrected are silently cleared by OS. This
option is useful if you have no interest in any
corrected errors.
ignore_ce
disable features for corrected errors, e.g.
polling timer and CMCI. All events reported as
corrected are not cleared by OS and remain in its
error banks.
Usually this disablement is not recommended; however,
if there is an agent checking/clearing corrected
errors (e.g. BIOS or hardware monitoring
applications), conflicting with OS's error handling,
and you cannot deactivate the agent, then this option
can help.
no_lmce
do not opt-in to Local MCE delivery. Use legacy method
to broadcast MCEs.
bootlog
enable logging of machine checks left over from
booting. Disabled by default on AMD Fam10h and older
because some BIOSes leave bogus ones.
If your BIOS doesn't do that, it's a good idea to
enable this anyway to make sure you log even machine
check events that result in a reboot. On Intel systems
it is enabled by default.
nobootlog
disable boot machine check logging.
monarchtimeout (number)
sets the time in us to wait for other CPUs on machine
checks. 0 to disable.
bios_cmci_threshold
don't overwrite the bios-set CMCI threshold. This boot
option prevents Linux from overwriting the CMCI
threshold set by the bios. Without this option, Linux
always sets the CMCI threshold to 1. Enabling this may
make memory predictive failure analysis less effective
if the bios sets thresholds for memory errors since we
will not see details for all errors.
recovery
force-enable recoverable machine check code paths
Everything else is in sysfs now.
mce=option [X86-64] See Documentation/arch/x86/x86_64/boot-options.rst
md= [HW] RAID subsystems devices and level
See Documentation/admin-guide/md.rst.
@ -3887,6 +4031,8 @@
noapic [SMP,APIC,EARLY] Tells the kernel to not make use of any
IOAPICs that may be present in the system.
noapictimer [APIC,X86] Don't set up the APIC timer
noautogroup Disable scheduler automatic task group creation.
nocache [ARM,EARLY]
@ -3934,6 +4080,8 @@
register save and restore. The kernel will only save
legacy floating-point registers on task switch.
nogbpages [X86] Do not use GB pages for kernel direct mappings.
no_hash_pointers
[KNL,EARLY]
Force pointers printed to the console or buffers to be
@ -3960,6 +4108,8 @@
the impact of the sleep instructions. This is also
useful when using JTAG debugger.
nohpet [X86] Don't use the HPET timer.
nohugeiomap [KNL,X86,PPC,ARM64,EARLY] Disable kernel huge I/O mappings.
nohugevmalloc [KNL,X86,PPC,ARM64,EARLY] Disable kernel huge vmalloc mappings.
@ -4111,8 +4261,10 @@
nosync [HW,M68K] Disables sync negotiation for all devices.
no_timer_check [X86,APIC] Disables the code which tests for
broken timer IRQ sources.
no_timer_check [X86,APIC] Disables the code which tests for broken
timer IRQ sources, i.e., the IO-APIC timer. This can
work around problems with incorrect timer
initialization on some boards.
no_uaccess_flush
[PPC,EARLY] Don't flush the L1-D cache after accessing user data.
@ -4192,6 +4344,11 @@
If given as an integer followed by 'U', it will
divide each physical node into N emulated nodes.
numa=noacpi [X86] Don't parse the SRAT table for NUMA setup
numa=nohmat [X86] Don't parse the HMAT table for NUMA setup, or
soft-reserved memory partitioning.
numa_balancing= [KNL,ARM64,PPC,RISCV,S390,X86] Enable or disable automatic
NUMA balancing.
Allowed values are enable and disable
@ -5715,6 +5872,55 @@
reboot_cpu is s[mp]#### with #### being the processor
to be used for rebooting.
acpi
Use the ACPI RESET_REG in the FADT. If ACPI is not
configured or the ACPI reset does not work, the reboot
path attempts the reset using the keyboard controller.
bios
Use the CPU reboot vector for warm reset
cold
Set the cold reboot flag
default
There are some built-in platform specific "quirks"
- you may see: "reboot: <name> series board detected.
Selecting <type> for reboots." In the case where you
think the quirk is in error (e.g. you have newer BIOS,
or newer board) using this option will ignore the
built-in quirk table, and use the generic default
reboot actions.
efi
Use efi reset_system runtime service. If EFI is not
configured or the EFI reset does not work, the reboot
path attempts the reset using the keyboard controller.
force
Don't stop other CPUs on reboot. This can make reboot
more reliable in some cases.
kbd
Use the keyboard controller. cold reset (default)
pci
Use a write to the PCI config space register 0xcf9 to
trigger reboot.
triple
Force a triple fault (init)
warm
Don't set the cold reboot flag
Using warm reset will be much faster, especially on big
memory systems, because the BIOS will not go through
the memory check. The disadvantage is that not all
hardware will be completely reinitialized on reboot, so
there may be boot problems on some systems.
refscale.holdoff= [KNL]
Set test-start holdoff period. The purpose of
this parameter is to delay the start of the
@ -6106,7 +6312,16 @@
serialnumber [BUGS=X86-32]
sev=option[,option...] [X86-64] See Documentation/arch/x86/x86_64/boot-options.rst
sev=option[,option...] [X86-64]
debug
Enable debug messages.
nosnp
Do not enable SEV-SNP (applies to host/hypervisor
only). Setting 'nosnp' avoids the RMP check overhead
in memory accesses when users do not want to run
SEV-SNP guests.
shapers= [NET]
Maximal number of shapers.


@ -130,8 +130,126 @@ SNP feature support.
More details in AMD64 APM[1] Vol 2: 15.34.10 SEV_STATUS MSR
Reverse Map Table (RMP)
=======================
The RMP is a structure in system memory that is used to ensure a one-to-one
mapping between system physical addresses and guest physical addresses. Each
page of memory that is potentially assignable to guests has one entry within
the RMP.
The RMP table can be either contiguous in memory or a collection of segments
in memory.
Contiguous RMP
--------------
Support for this form of the RMP is present when support for SEV-SNP is
present, which can be determined using the CPUID instruction::
0x8000001f[eax]:
Bit[4] indicates support for SEV-SNP
The location of the RMP is identified to the hardware through two MSRs::
0xc0010132 (RMP_BASE):
System physical address of the first byte of the RMP
0xc0010133 (RMP_END):
System physical address of the last byte of the RMP
Hardware requires that RMP_BASE and (RMP_END + 1) be 8KB aligned, but SEV
firmware increases the alignment requirement to 1MB.
The RMP consists of a 16KB region used for processor bookkeeping followed
by the RMP entries, which are 16 bytes in size. The size of the RMP
determines the range of physical memory that the hypervisor can assign to
SEV-SNP guests. The RMP covers the system physical address from::
0 to ((RMP_END + 1 - RMP_BASE - 16KB) / 16B) x 4KB.
The current Linux support relies on BIOS to allocate/reserve the memory for
the RMP and to set RMP_BASE and RMP_END appropriately. Linux uses the MSR
values to locate the RMP and determine the size of the RMP. The RMP must
cover all of system memory in order for Linux to enable SEV-SNP.
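As a minimal sketch of the coverage formula above (the helper name is made
up and this is not kernel code), the covered range can be derived from the
raw MSR values like this::

	/*
	 * Illustrative only: compute the first system physical address NOT
	 * covered by a contiguous RMP, using the 16KB bookkeeping area and
	 * the 16-byte entry size described above.
	 */
	static inline unsigned long long rmp_covered_limit(unsigned long long rmp_base,
							   unsigned long long rmp_end)
	{
		unsigned long long nr_entries;

		nr_entries = (rmp_end + 1 - rmp_base - 16 * 1024) / 16;

		return nr_entries * 4096;	/* RMP covers 0 .. (limit - 1) */
	}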
Segmented RMP
-------------
Segmented RMP support is a new way of representing the layout of an RMP.
Initial RMP support required the RMP table to be contiguous in memory.
RMP accesses from a NUMA node on which the RMP doesn't reside
can take longer than accesses from a NUMA node on which the RMP resides.
Segmented RMP support allows the RMP entries to be located on the same
node as the memory the RMP is covering, potentially reducing latency
associated with accessing an RMP entry associated with the memory. Each
RMP segment covers a specific range of system physical addresses.
Support for this form of the RMP can be determined using the CPUID
instruction::
0x8000001f[eax]:
Bit[23] indicates support for segmented RMP
If supported, segmented RMP attributes can be found using the CPUID
instruction::
0x80000025[eax]:
Bits[5:0] minimum supported RMP segment size
Bits[11:6] maximum supported RMP segment size
0x80000025[ebx]:
Bits[9:0] number of cacheable RMP segment definitions
Bit[10] indicates if the number of cacheable RMP segments
is a hard limit
To enable a segmented RMP, a new MSR is available::
0xc0010136 (RMP_CFG):
Bit[0] indicates if segmented RMP is enabled
Bits[13:8] contains the size of memory covered by an RMP
segment (expressed as a power of 2)
The RMP segment size defined in the RMP_CFG MSR applies to all segments
of the RMP. Therefore each RMP segment covers a specific range of system
physical addresses. For example, if the RMP_CFG MSR value is 0x2401, then
the RMP segment coverage value is 0x24 => 36, meaning the size of memory
covered by an RMP segment is 64GB (1 << 36). So the first RMP segment
covers physical addresses from 0 to 0xF_FFFF_FFFF, the second RMP segment
covers physical addresses from 0x10_0000_0000 to 0x1F_FFFF_FFFF, etc.
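As an illustration of the layout above (hypothetical helpers, not kernel
code), decoding an RMP_CFG value such as 0x2401 could look like::

	/* Bit[0]: segmented RMP enabled */
	static inline int rmp_cfg_enabled(unsigned long long rmp_cfg)
	{
		return rmp_cfg & 1;
	}

	/* Bits[13:8]: segment size as a power of 2, e.g. 0x24 -> 1 << 36 = 64GB */
	static inline unsigned long long rmp_cfg_segment_size(unsigned long long rmp_cfg)
	{
		return 1ULL << ((rmp_cfg >> 8) & 0x3f);
	}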
When a segmented RMP is enabled, RMP_BASE points to the RMP bookkeeping
area as it does today (16K in size). However, instead of RMP entries
beginning immediately after the bookkeeping area, there is a 4K RMP
segment table (RST). Each entry in the RST is 8-bytes in size and represents
an RMP segment::
Bits[19:0] mapped size (in GB)
The mapped size can be less than the defined segment size.
A value of zero indicates that no RMP exists for the range
of system physical addresses associated with this segment.
Bits[51:20] segment physical address
This address is shifted left by 20 bits (or just masked when
read) to form the physical address of the segment (1MB
alignment).
The RST can hold 512 segment entries but can be limited in size to the number
of cacheable RMP segments (CPUID 0x80000025_EBX[9:0]) if the number of cacheable
RMP segments is a hard limit (CPUID 0x80000025_EBX[10]).
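A hedged sketch of decoding a single 8-byte RST entry per the layout above
(the struct and helper names are illustrative, not kernel code)::

	struct rst_entry_fields {
		unsigned long long mapped_size_gb;	/* Bits[19:0]; 0 means no RMP for this segment */
		unsigned long long segment_pa;		/* Bits[51:20]; 1MB-aligned physical address */
	};

	static inline struct rst_entry_fields rst_decode(unsigned long long entry)
	{
		struct rst_entry_fields f;

		f.mapped_size_gb = entry & 0xfffff;
		/* The address bits are already in place, so masking is enough. */
		f.segment_pa = entry & 0x000ffffffff00000ULL;

		return f;
	}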
The current Linux support relies on BIOS to allocate/reserve the memory for
the segmented RMP (the bookkeeping area, RST, and all segments), build the RST
and to set RMP_BASE, RMP_END, and RMP_CFG appropriately. Linux uses the MSR
values to locate the RMP and determine the size and location of the RMP
segments. The RMP must cover all of system memory in order for Linux to enable
SEV-SNP.
More details in the AMD64 APM Vol 2, section "15.36.3 Reverse Map Table",
docID: 24593.
Secure VM Service Module (SVSM)
===============================
SNP provides a feature called Virtual Machine Privilege Levels (VMPL) which
defines four privilege levels at which guest software can run. The most
privileged level is 0 and numerically higher numbers have lesser privileges.


@ -384,6 +384,16 @@ When monitoring is enabled all MON groups will also contain:
Available only with debug option. The identifier used by hardware
for the monitor group. On x86 this is the RMID.
When the "mba_MBps" mount option is used all CTRL_MON groups will also contain:
"mba_MBps_event":
Reading this file shows which memory bandwidth event is used
as input to the software feedback loop that keeps memory bandwidth
below the value specified in the schemata file. Writing the
name of one of the supported memory bandwidth events found in
/sys/fs/resctrl/info/L3_MON/mon_features changes the input
event.
Resource allocation rules
-------------------------


@ -135,6 +135,10 @@ Thread-related topology information in the kernel:
The ID of the core to which a thread belongs. It is also printed in /proc/cpuinfo
"core_id."
- topology_logical_core_id();
The logical core ID to which a thread belongs.
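A minimal usage sketch (not part of this patch; the reporting function is
hypothetical) showing the new helper next to the existing per-CPU topology
accessors::

	#include <linux/topology.h>
	#include <linux/printk.h>

	static void report_cpu_topology(unsigned int cpu)
	{
		pr_info("cpu%u: package %d die %d core %d logical core %d\n",
			cpu,
			topology_physical_package_id(cpu),
			topology_die_id(cpu),
			topology_core_id(cpu),
			topology_logical_core_id(cpu));
	}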
System topology examples


@ -1,312 +0,0 @@
.. SPDX-License-Identifier: GPL-2.0
===========================
AMD64 Specific Boot Options
===========================
There are many others (usually documented in driver documentation), but
only the AMD64 specific ones are listed here.
Machine check
=============
Please see Documentation/arch/x86/x86_64/machinecheck.rst for sysfs runtime tunables.
mce=off
Disable machine check
mce=no_cmci
Disable CMCI(Corrected Machine Check Interrupt) that
Intel processor supports. Usually this disablement is
not recommended, but it might be handy if your hardware
is misbehaving.
Note that you'll get more problems without CMCI than with
due to the shared banks, i.e. you might get duplicated
error logs.
mce=dont_log_ce
Don't make logs for corrected errors. All events reported
as corrected are silently cleared by OS.
This option will be useful if you have no interest in any
of corrected errors.
mce=ignore_ce
Disable features for corrected errors, e.g. polling timer
and CMCI. All events reported as corrected are not cleared
by OS and remained in its error banks.
Usually this disablement is not recommended, however if
there is an agent checking/clearing corrected errors
(e.g. BIOS or hardware monitoring applications), conflicting
with OS's error handling, and you cannot deactivate the agent,
then this option will be a help.
mce=no_lmce
Do not opt-in to Local MCE delivery. Use legacy method
to broadcast MCEs.
mce=bootlog
Enable logging of machine checks left over from booting.
Disabled by default on AMD Fam10h and older because some BIOS
leave bogus ones.
If your BIOS doesn't do that it's a good idea to enable though
to make sure you log even machine check events that result
in a reboot. On Intel systems it is enabled by default.
mce=nobootlog
Disable boot machine check logging.
mce=monarchtimeout (number)
monarchtimeout:
Sets the time in us to wait for other CPUs on machine checks. 0
to disable.
mce=bios_cmci_threshold
Don't overwrite the bios-set CMCI threshold. This boot option
prevents Linux from overwriting the CMCI threshold set by the
bios. Without this option, Linux always sets the CMCI
threshold to 1. Enabling this may make memory predictive failure
analysis less effective if the bios sets thresholds for memory
errors since we will not see details for all errors.
mce=recovery
Force-enable recoverable machine check code paths
nomce (for compatibility with i386)
same as mce=off
Everything else is in sysfs now.
APICs
=====
apic
Use IO-APIC. Default
noapic
Don't use the IO-APIC.
disableapic
Don't use the local APIC
nolapic
Don't use the local APIC (alias for i386 compatibility)
pirq=...
See Documentation/arch/x86/i386/IO-APIC.rst
noapictimer
Don't set up the APIC timer
no_timer_check
Don't check the IO-APIC timer. This can work around
problems with incorrect timer initialization on some boards.
apicpmtimer
Do APIC timer calibration using the pmtimer. Implies
apicmaintimer. Useful when your PIT timer is totally broken.
Timing
======
notsc
Deprecated, use tsc=unstable instead.
nohpet
Don't use the HPET timer.
Idle loop
=========
idle=poll
Don't do power saving in the idle loop using HLT, but poll for rescheduling
event. This will make the CPUs eat a lot more power, but may be useful
to get slightly better performance in multiprocessor benchmarks. It also
makes some profiling using performance counters more accurate.
Please note that on systems with MONITOR/MWAIT support (like Intel EM64T
CPUs) this option has no performance advantage over the normal idle loop.
It may also interact badly with hyperthreading.
Rebooting
=========
reboot=b[ios] | t[riple] | k[bd] | a[cpi] | e[fi] | p[ci] [, [w]arm | [c]old]
bios
Use the CPU reboot vector for warm reset
warm
Don't set the cold reboot flag
cold
Set the cold reboot flag
triple
Force a triple fault (init)
kbd
Use the keyboard controller. cold reset (default)
acpi
Use the ACPI RESET_REG in the FADT. If ACPI is not configured or
the ACPI reset does not work, the reboot path attempts the reset
using the keyboard controller.
efi
Use efi reset_system runtime service. If EFI is not configured or
the EFI reset does not work, the reboot path attempts the reset using
the keyboard controller.
pci
Use a write to the PCI config space register 0xcf9 to trigger reboot.
Using warm reset will be much faster especially on big memory
systems because the BIOS will not go through the memory check.
Disadvantage is that not all hardware will be completely reinitialized
on reboot so there may be boot problems on some systems.
reboot=force
Don't stop other CPUs on reboot. This can make reboot more reliable
in some cases.
reboot=default
There are some built-in platform specific "quirks" - you may see:
"reboot: <name> series board detected. Selecting <type> for reboots."
In the case where you think the quirk is in error (e.g. you have
newer BIOS, or newer board) using this option will ignore the built-in
quirk table, and use the generic default reboot actions.
NUMA
====
numa=off
Only set up a single NUMA node spanning all memory.
numa=noacpi
Don't parse the SRAT table for NUMA setup
numa=nohmat
Don't parse the HMAT table for NUMA setup, or soft-reserved memory
partitioning.
ACPI
====
acpi=off
Don't enable ACPI
acpi=ht
Use ACPI boot table parsing, but don't enable ACPI interpreter
acpi=force
Force ACPI on (currently not needed)
acpi=strict
Disable out of spec ACPI workarounds.
acpi_sci={edge,level,high,low}
Set up ACPI SCI interrupt.
acpi=noirq
Don't route interrupts
acpi=nocmcff
Disable firmware first mode for corrected errors. This
disables parsing the HEST CMC error source to check if
firmware has set the FF flag. This may result in
duplicate corrected error reports.
PCI
===
pci=off
Don't use PCI
pci=conf1
Use conf1 access.
pci=conf2
Use conf2 access.
pci=rom
Assign ROMs.
pci=assign-busses
Assign busses
pci=irqmask=MASK
Set PCI interrupt mask to MASK
pci=lastbus=NUMBER
Scan up to NUMBER busses, no matter what the mptable says.
pci=noacpi
Don't use ACPI to set up PCI interrupt routing.
IOMMU (input/output memory management unit)
===========================================
Multiple x86-64 PCI-DMA mapping implementations exist, for example:
1. <kernel/dma/direct.c>: use no hardware/software IOMMU at all
(e.g. because you have < 3 GB memory).
Kernel boot message: "PCI-DMA: Disabling IOMMU"
2. <arch/x86/kernel/amd_gart_64.c>: AMD GART based hardware IOMMU.
Kernel boot message: "PCI-DMA: using GART IOMMU"
3. <arch/x86_64/kernel/pci-swiotlb.c> : Software IOMMU implementation. Used
e.g. if there is no hardware IOMMU in the system and it is needed because
you have >3GB memory or told the kernel to use it (iommu=soft).
Kernel boot message: "PCI-DMA: Using software bounce buffering
for IO (SWIOTLB)"
::
iommu=[<size>][,noagp][,off][,force][,noforce]
[,memaper[=<order>]][,merge][,fullflush][,nomerge]
[,noaperture]
General iommu options:
off
Don't initialize and use any kind of IOMMU.
noforce
Don't force hardware IOMMU usage when it is not needed. (default).
force
Force the use of the hardware IOMMU even when it is
not actually needed (e.g. because < 3 GB memory).
soft
Use software bounce buffering (SWIOTLB) (default for
Intel machines). This can be used to prevent the usage
of an available hardware IOMMU.
iommu options only relevant to the AMD GART hardware IOMMU:
<size>
Set the size of the remapping area in bytes.
allowed
Overwrite iommu off workarounds for specific chipsets.
fullflush
Flush IOMMU on each allocation (default).
nofullflush
Don't use IOMMU fullflush.
memaper[=<order>]
Allocate an own aperture over RAM with size 32MB<<order.
(default: order=1, i.e. 64MB)
merge
Do scatter-gather (SG) merging. Implies "force" (experimental).
nomerge
Don't do scatter-gather (SG) merging.
noaperture
Ask the IOMMU not to touch the aperture for AGP.
noagp
Don't initialize the AGP driver and use full aperture.
panic
Always panic when IOMMU overflows.
iommu options only relevant to the software bounce buffering (SWIOTLB) IOMMU
implementation:
swiotlb=<slots>[,force,noforce]
<slots>
Prereserve that many 2K slots for the software IO bounce buffering.
force
Force all IO through the software TLB.
noforce
Do not initialize the software TLB.
Miscellaneous
=============
nogbpages
Do not use GB pages for kernel direct mappings.
gbpages
Use GB pages for kernel direct mappings.
AMD SEV (Secure Encrypted Virtualization)
=========================================
Options relating to AMD SEV, specified via the following format:
::
sev=option1[,option2]
The available options are:
debug
Enable debug messages.
nosnp
Do not enable SEV-SNP (applies to host/hypervisor only). Setting
'nosnp' avoids the RMP check overhead in memory accesses when
users do not want to run SEV-SNP guests.


@ -18,7 +18,7 @@ For more information on the features of cpusets, see
Documentation/admin-guide/cgroup-v1/cpusets.rst.
There are a number of different configurations you can use for your needs. For
more information on the numa=fake command line option and its various ways of
configuring fake nodes, see Documentation/arch/x86/x86_64/boot-options.rst.
configuring fake nodes, see Documentation/admin-guide/kernel-parameters.txt
For the purposes of this introduction, we'll assume a very primitive NUMA
emulation setup of "numa=fake=4*512,". This will split our system memory into


@ -7,7 +7,6 @@ x86_64 Support
.. toctree::
:maxdepth: 2
boot-options
uefi
mm
5level-paging


@ -2,6 +2,12 @@
Scheduler Statistics
====================
Version 17 of schedstats removed the 'lb_imbalance' field as it no
longer has any significance, and instead added more relevant fields,
namely 'lb_imbalance_load', 'lb_imbalance_util', 'lb_imbalance_task'
and 'lb_imbalance_misfit'. The domain field prints the name of the
corresponding sched domain from this version onwards.
Version 16 of schedstats changed the order of definitions within
'enum cpu_idle_type', which changed the order of [CPU_MAX_IDLE_TYPES]
columns in show_schedstat(). In particular the position of CPU_IDLE
@ -9,7 +15,9 @@ and __CPU_NOT_IDLE changed places. The size of the array is unchanged.
Version 15 of schedstats dropped counters for some sched_yield:
yld_exp_empty, yld_act_empty and yld_both_empty. Otherwise, it is
identical to version 14.
identical to version 14. Details are available at
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/scheduler/sched-stats.txt?id=1e1dbb259c79b
Version 14 of schedstats includes support for sched_domains, which hit the
mainline kernel in 2.6.20 although it is identical to the stats from version
@ -26,7 +34,14 @@ cpus on the machine, while domain0 is the most tightly focused domain,
sometimes balancing only between pairs of cpus. At this time, there
are no architectures which need more than three domain levels. The first
field in the domain stats is a bit map indicating which cpus are affected
by that domain.
by that domain. Details are available at
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/sched-stats.txt?id=b762f3ffb797c
The schedstat documentation is maintained from version 10 onwards and was
not updated for versions 11 and 12. The details for version 10 are available at
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/sched-stats.txt?id=1da177e4c3f4
These fields are counters, and only increment. Programs which make use
of these will need to start with a baseline observation and then calculate
@ -71,88 +86,97 @@ Domain statistics
-----------------
One of these is produced per domain for each cpu described. (Note that if
CONFIG_SMP is not defined, *no* domains are utilized and these lines
will not appear in the output.)
will not appear in the output. <name> is an extension to the domain field
that prints the name of the corresponding sched domain. It can appear in
schedstat version 17 and above, and requires CONFIG_SCHED_DEBUG.)
domain<N> <cpumask> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
domain<N> <name> <cpumask> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45
The first field is a bit mask indicating what cpus this domain operates over.
The next 24 are a variety of sched_balance_rq() statistics grouped into types
of idleness (idle, busy, and newly idle):
The next 33 are a variety of sched_balance_rq() statistics grouped into types
of idleness (busy, idle and newly idle):
1) # of times in this domain sched_balance_rq() was called when the
cpu was idle
2) # of times in this domain sched_balance_rq() checked but found
the load did not require balancing when the cpu was idle
3) # of times in this domain sched_balance_rq() tried to move one or
more tasks and failed, when the cpu was idle
4) sum of imbalances discovered (if any) with each call to
sched_balance_rq() in this domain when the cpu was idle
5) # of times in this domain pull_task() was called when the cpu
was idle
6) # of times in this domain pull_task() was called even though
the target task was cache-hot when idle
7) # of times in this domain sched_balance_rq() was called but did
not find a busier queue while the cpu was idle
8) # of times in this domain a busier queue was found while the
cpu was idle but no busier group was found
9) # of times in this domain sched_balance_rq() was called when the
cpu was busy
10) # of times in this domain sched_balance_rq() checked but found the
2) # of times in this domain sched_balance_rq() checked but found the
load did not require balancing when busy
11) # of times in this domain sched_balance_rq() tried to move one or
3) # of times in this domain sched_balance_rq() tried to move one or
more tasks and failed, when the cpu was busy
12) sum of imbalances discovered (if any) with each call to
sched_balance_rq() in this domain when the cpu was busy
13) # of times in this domain pull_task() was called when busy
14) # of times in this domain pull_task() was called even though the
4) Total imbalance in load when the cpu was busy
5) Total imbalance in utilization when the cpu was busy
6) Total imbalance in number of tasks when the cpu was busy
7) Total imbalance due to misfit tasks when the cpu was busy
8) # of times in this domain pull_task() was called when busy
9) # of times in this domain pull_task() was called even though the
target task was cache-hot when busy
15) # of times in this domain sched_balance_rq() was called but did not
10) # of times in this domain sched_balance_rq() was called but did not
find a busier queue while the cpu was busy
16) # of times in this domain a busier queue was found while the cpu
11) # of times in this domain a busier queue was found while the cpu
was busy but no busier group was found
17) # of times in this domain sched_balance_rq() was called when the
cpu was just becoming idle
18) # of times in this domain sched_balance_rq() checked but found the
12) # of times in this domain sched_balance_rq() was called when the
cpu was idle
13) # of times in this domain sched_balance_rq() checked but found
the load did not require balancing when the cpu was idle
14) # of times in this domain sched_balance_rq() tried to move one or
more tasks and failed, when the cpu was idle
15) Total imbalance in load when the cpu was idle
16) Total imbalance in utilization when the cpu was idle
17) Total imbalance in number of tasks when the cpu was idle
18) Total imbalance due to misfit tasks when the cpu was idle
19) # of times in this domain pull_task() was called when the cpu
was idle
20) # of times in this domain pull_task() was called even though
the target task was cache-hot when idle
21) # of times in this domain sched_balance_rq() was called but did
not find a busier queue while the cpu was idle
22) # of times in this domain a busier queue was found while the
cpu was idle but no busier group was found
23) # of times in this domain sched_balance_rq() was called when the cpu
was just becoming idle
24) # of times in this domain sched_balance_rq() checked but found the
load did not require balancing when the cpu was just becoming idle
19) # of times in this domain sched_balance_rq() tried to move one or more
25) # of times in this domain sched_balance_rq() tried to move one or more
tasks and failed, when the cpu was just becoming idle
20) sum of imbalances discovered (if any) with each call to
sched_balance_rq() in this domain when the cpu was just becoming idle
21) # of times in this domain pull_task() was called when newly idle
22) # of times in this domain pull_task() was called even though the
26) Total imbalance in load when the cpu was just becoming idle
27) Total imbalance in utilization when the cpu was just becoming idle
28) Total imbalance in number of tasks when the cpu was just becoming idle
29) Total imbalance due to misfit tasks when the cpu was just becoming idle
30) # of times in this domain pull_task() was called when newly idle
31) # of times in this domain pull_task() was called even though the
target task was cache-hot when just becoming idle
23) # of times in this domain sched_balance_rq() was called but did not
32) # of times in this domain sched_balance_rq() was called but did not
find a busier queue while the cpu was just becoming idle
24) # of times in this domain a busier queue was found while the cpu
33) # of times in this domain a busier queue was found while the cpu
was just becoming idle but no busier group was found
Next three are active_load_balance() statistics:
25) # of times active_load_balance() was called
26) # of times active_load_balance() tried to move a task and failed
27) # of times active_load_balance() successfully moved a task
34) # of times active_load_balance() was called
35) # of times active_load_balance() tried to move a task and failed
36) # of times active_load_balance() successfully moved a task
Next three are sched_balance_exec() statistics:
28) sbe_cnt is not used
29) sbe_balanced is not used
30) sbe_pushed is not used
37) sbe_cnt is not used
38) sbe_balanced is not used
39) sbe_pushed is not used
Next three are sched_balance_fork() statistics:
31) sbf_cnt is not used
32) sbf_balanced is not used
33) sbf_pushed is not used
40) sbf_cnt is not used
41) sbf_balanced is not used
42) sbf_pushed is not used
Next three are try_to_wake_up() statistics:
34) # of times in this domain try_to_wake_up() awoke a task that
43) # of times in this domain try_to_wake_up() awoke a task that
last ran on a different cpu in this domain
35) # of times in this domain try_to_wake_up() moved a task to the
44) # of times in this domain try_to_wake_up() moved a task to the
waking cpu because it was cache-cold on its own cpu anyway
36) # of times in this domain try_to_wake_up() started passive balancing
45) # of times in this domain try_to_wake_up() started passive balancing
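As a hedged illustration of the version 17 layout described above (a
stand-alone user-space sketch, not kernel code; it assumes the <name>
field is present, i.e. CONFIG_SCHED_DEBUG is enabled), the 45 per-domain
counters can be pulled apart like this::

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	int main(void)
	{
		char line[4096];
		FILE *f = fopen("/proc/schedstat", "r");

		if (!f)
			return 1;

		while (fgets(line, sizeof(line), f)) {
			char dom[32], name[64], mask[256];
			unsigned long long v[45];
			char *p;
			int i;

			if (strncmp(line, "domain", 6) != 0)
				continue;

			/* domain<N> <name> <cpumask> followed by the 45 counters */
			if (sscanf(line, "%31s %63s %255s", dom, name, mask) != 3)
				continue;

			p = strstr(line, mask) + strlen(mask);
			for (i = 0; i < 45; i++)
				v[i] = strtoull(p, &p, 10);

			/* Fields 34-36 are the active_load_balance() counters */
			printf("%s %s: alb called=%llu failed=%llu moved=%llu\n",
			       dom, name, v[33], v[34], v[35]);
		}

		fclose(f);
		return 0;
	}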
/proc/<pid>/schedstat
---------------------


@ -1120,6 +1120,14 @@ L: linux-i2c@vger.kernel.org
S: Supported
F: drivers/i2c/busses/i2c-amd-asf-plat.c
AMD NODE DRIVER
M: Mario Limonciello <mario.limonciello@amd.com>
M: Yazen Ghannam <yazen.ghannam@amd.com>
L: linux-kernel@vger.kernel.org
S: Supported
F: arch/x86/include/asm/amd_node.h
F: arch/x86/kernel/amd_node.c
AMD PDS CORE DRIVER
M: Shannon Nelson <shannon.nelson@amd.com>
M: Brett Creeley <brett.creeley@amd.com>
@ -13480,8 +13488,8 @@ LOCKING PRIMITIVES
M: Peter Zijlstra <peterz@infradead.org>
M: Ingo Molnar <mingo@redhat.com>
M: Will Deacon <will@kernel.org>
M: Boqun Feng <boqun.feng@gmail.com> (LOCKDEP & RUST)
R: Waiman Long <longman@redhat.com>
R: Boqun Feng <boqun.feng@gmail.com> (LOCKDEP)
L: linux-kernel@vger.kernel.org
S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git locking/core
@ -13495,6 +13503,11 @@ F: include/linux/seqlock.h
F: include/linux/spinlock*.h
F: kernel/locking/
F: lib/locking*.[ch]
F: rust/helpers/mutex.c
F: rust/helpers/spinlock.c
F: rust/kernel/sync/lock.rs
F: rust/kernel/sync/lock/
F: rust/kernel/sync/locked_by.rs
X: kernel/locking/locktorture.c
LOGICAL DISK MANAGER SUPPORT (LDM, Windows 2000/XP/Vista Dynamic Disks)
@ -22546,7 +22559,7 @@ F: arch/*/kernel/static_call.c
F: include/linux/jump_label*.h
F: include/linux/static_call*.h
F: kernel/jump_label.c
F: kernel/static_call.c
F: kernel/static_call*.c
STI AUDIO (ASoC) DRIVERS
M: Arnaud Pouliquen <arnaud.pouliquen@foss.st.com>


@ -127,29 +127,6 @@ void crash_smp_send_stop(void)
cpus_stopped = 1;
}
static void machine_kexec_mask_interrupts(void)
{
unsigned int i;
struct irq_desc *desc;
for_each_irq_desc(i, desc) {
struct irq_chip *chip;
chip = irq_desc_get_chip(desc);
if (!chip)
continue;
if (chip->irq_eoi && irqd_irq_inprogress(&desc->irq_data))
chip->irq_eoi(&desc->irq_data);
if (chip->irq_mask)
chip->irq_mask(&desc->irq_data);
if (chip->irq_disable && !irqd_irq_disabled(&desc->irq_data))
chip->irq_disable(&desc->irq_data);
}
}
void machine_crash_shutdown(struct pt_regs *regs)
{
local_irq_disable();


@ -149,6 +149,7 @@ config ARM64
select GENERIC_IDLE_POLL_SETUP
select GENERIC_IOREMAP
select GENERIC_IRQ_IPI
select GENERIC_IRQ_KEXEC_CLEAR_VM_FORWARD
select GENERIC_IRQ_PROBE
select GENERIC_IRQ_SHOW
select GENERIC_IRQ_SHOW_LEVEL


@ -207,37 +207,6 @@ void machine_kexec(struct kimage *kimage)
BUG(); /* Should never get here. */
}
static void machine_kexec_mask_interrupts(void)
{
unsigned int i;
struct irq_desc *desc;
for_each_irq_desc(i, desc) {
struct irq_chip *chip;
int ret;
chip = irq_desc_get_chip(desc);
if (!chip)
continue;
/*
* First try to remove the active state. If this
* fails, try to EOI the interrupt.
*/
ret = irq_set_irqchip_state(i, IRQCHIP_STATE_ACTIVE, false);
if (ret && irqd_irq_inprogress(&desc->irq_data) &&
chip->irq_eoi)
chip->irq_eoi(&desc->irq_data);
if (chip->irq_mask)
chip->irq_mask(&desc->irq_data);
if (chip->irq_disable && !irqd_irq_disabled(&desc->irq_data))
chip->irq_disable(&desc->irq_data);
}
}
/**
* machine_crash_shutdown - shutdown non-crashing cpus and save registers
*/


@ -4,6 +4,7 @@
#include <asm/break.h>
#include <linux/stringify.h>
#include <linux/objtool.h>
#ifndef CONFIG_DEBUG_BUGVERBOSE
#define _BUGVERBOSE_LOCATION(file, line)
@ -33,25 +34,25 @@
#define ASM_BUG_FLAGS(flags) \
__BUG_ENTRY(flags) \
break BRK_BUG
break BRK_BUG;
#define ASM_BUG() ASM_BUG_FLAGS(0)
#define __BUG_FLAGS(flags) \
asm_inline volatile (__stringify(ASM_BUG_FLAGS(flags)));
#define __BUG_FLAGS(flags, extra) \
asm_inline volatile (__stringify(ASM_BUG_FLAGS(flags)) \
extra);
#define __WARN_FLAGS(flags) \
do { \
instrumentation_begin(); \
__BUG_FLAGS(BUGFLAG_WARNING|(flags)); \
annotate_reachable(); \
__BUG_FLAGS(BUGFLAG_WARNING|(flags), ANNOTATE_REACHABLE(10001b));\
instrumentation_end(); \
} while (0)
#define BUG() \
do { \
instrumentation_begin(); \
__BUG_FLAGS(0); \
__BUG_FLAGS(0, ""); \
unreachable(); \
} while (0)


@ -61,7 +61,6 @@ struct pt_regs;
extern void kexec_smp_wait(void); /* get and clear naca physid, wait for
master to copy new code to 0 */
extern void default_machine_kexec(struct kimage *image);
extern void machine_kexec_mask_interrupts(void);
void relocate_new_kernel(unsigned long indirection_page, unsigned long reboot_code_buffer,
unsigned long start_address) __noreturn;


@ -22,28 +22,6 @@
#include <asm/setup.h>
#include <asm/firmware.h>
void machine_kexec_mask_interrupts(void) {
unsigned int i;
struct irq_desc *desc;
for_each_irq_desc(i, desc) {
struct irq_chip *chip;
chip = irq_desc_get_chip(desc);
if (!chip)
continue;
if (chip->irq_eoi && irqd_irq_inprogress(&desc->irq_data))
chip->irq_eoi(&desc->irq_data);
if (chip->irq_mask)
chip->irq_mask(&desc->irq_data);
if (chip->irq_disable && !irqd_irq_disabled(&desc->irq_data))
chip->irq_disable(&desc->irq_data);
}
}
#ifdef CONFIG_CRASH_DUMP
void machine_crash_shutdown(struct pt_regs *regs)
{


@ -7,6 +7,7 @@
* Copyright (C) 2005 IBM Corporation.
*/
#include <linux/irq.h>
#include <linux/kexec.h>
#include <linux/mm.h>
#include <linux/string.h>


@ -114,29 +114,6 @@ void machine_shutdown(void)
#endif
}
static void machine_kexec_mask_interrupts(void)
{
unsigned int i;
struct irq_desc *desc;
for_each_irq_desc(i, desc) {
struct irq_chip *chip;
chip = irq_desc_get_chip(desc);
if (!chip)
continue;
if (chip->irq_eoi && irqd_irq_inprogress(&desc->irq_data))
chip->irq_eoi(&desc->irq_data);
if (chip->irq_mask)
chip->irq_mask(&desc->irq_data);
if (chip->irq_disable && !irqd_irq_disabled(&desc->irq_data))
chip->irq_disable(&desc->irq_data);
}
}
/*
* machine_crash_shutdown - Prepare to kexec after a kernel crash
*


@ -981,7 +981,7 @@ static int cfdiag_push_sample(struct perf_event *event,
if (event->attr.sample_type & PERF_SAMPLE_RAW) {
raw.frag.size = cpuhw->usedss;
raw.frag.data = cpuhw->stop;
perf_sample_save_raw_data(&data, &raw);
perf_sample_save_raw_data(&data, event, &raw);
}
overflow = perf_event_overflow(event, &data, &regs);


@ -981,7 +981,7 @@ static void cpumsf_pmu_disable(struct pmu *pmu)
cpuhw->flags &= ~PMU_F_ENABLED;
}
/* perf_exclude_event() - Filter event
/* perf_event_exclude() - Filter event
* @event: The perf event
* @regs: pt_regs structure
* @sde_regs: Sample-data-entry (sde) regs structure
@ -990,7 +990,7 @@ static void cpumsf_pmu_disable(struct pmu *pmu)
*
* Return non-zero if the event shall be excluded.
*/
static int perf_exclude_event(struct perf_event *event, struct pt_regs *regs,
static int perf_event_exclude(struct perf_event *event, struct pt_regs *regs,
struct perf_sf_sde_regs *sde_regs)
{
if (event->attr.exclude_user && user_mode(regs))
@ -1073,7 +1073,7 @@ static int perf_push_sample(struct perf_event *event,
data.tid_entry.pid = basic->hpp & LPP_PID_MASK;
overflow = 0;
if (perf_exclude_event(event, &regs, sde_regs))
if (perf_event_exclude(event, &regs, sde_regs))
goto out;
if (perf_event_overflow(event, &data, &regs)) {
overflow = 1;


@ -478,7 +478,7 @@ static int paicrypt_push_sample(size_t rawsize, struct paicrypt_map *cpump,
if (event->attr.sample_type & PERF_SAMPLE_RAW) {
raw.frag.size = rawsize;
raw.frag.data = cpump->save;
perf_sample_save_raw_data(&data, &raw);
perf_sample_save_raw_data(&data, event, &raw);
}
overflow = perf_event_overflow(event, &data, &regs);


@ -503,7 +503,7 @@ static int paiext_push_sample(size_t rawsize, struct paiext_map *cpump,
if (event->attr.sample_type & PERF_SAMPLE_RAW) {
raw.frag.size = rawsize;
raw.frag.data = cpump->save;
perf_sample_save_raw_data(&data, &raw);
perf_sample_save_raw_data(&data, event, &raw);
}
overflow = perf_event_overflow(event, &data, &regs);


@ -1190,7 +1190,7 @@ config X86_MCE_INTEL
config X86_MCE_AMD
def_bool y
prompt "AMD MCE features"
depends on X86_MCE && X86_LOCAL_APIC && AMD_NB
depends on X86_MCE && X86_LOCAL_APIC
help
Additional support for AMD specific MCE features such as
the DRAM Error Threshold.
@ -1560,6 +1560,7 @@ config AMD_MEM_ENCRYPT
select ARCH_HAS_CC_PLATFORM
select X86_MEM_ENCRYPT
select UNACCEPTED_MEMORY
select CRYPTO_LIB_AESGCM
help
Say yes to enable support for the encryption of system memory.
This requires an AMD processor that supports Secure Memory
@ -3129,6 +3130,10 @@ config TS5500
endif # X86_32
config AMD_NB
def_bool y
depends on AMD_NODE
config AMD_NODE
def_bool y
depends on CPU_SUP_AMD && PCI


@ -97,7 +97,7 @@ config IOMMU_DEBUG
code. When you use it make sure you have a big enough
IOMMU/AGP aperture. Most of the options enabled by this can
be set more finegrained using the iommu= command line
options. See Documentation/arch/x86/x86_64/boot-options.rst for more
options. See Documentation/admin-guide/kernel-parameters.txt for more
details.
config IOMMU_LEAK


@ -25,10 +25,6 @@
#include "efi.h"
#include <generated/compile.h>
#include <linux/module.h>
#include <linux/uts.h>
#include <linux/utsname.h>
#include <linux/ctype.h>
#include <generated/utsversion.h>
#include <generated/utsrelease.h>


@ -401,7 +401,8 @@ finish:
* by the guest kernel. As and when a new feature is implemented in the
* guest kernel, a corresponding bit should be added to the mask.
*/
#define SNP_FEATURES_PRESENT MSR_AMD64_SNP_DEBUG_SWAP
#define SNP_FEATURES_PRESENT (MSR_AMD64_SNP_DEBUG_SWAP | \
MSR_AMD64_SNP_SECURE_TSC)
u64 snp_get_unsupported_features(u64 status)
{


@ -65,7 +65,6 @@ static __maybe_unused __always_inline bool amd_cc_platform_vtom(enum cc_attr att
* up under SME the trampoline area cannot be encrypted, whereas under SEV
* the trampoline area must be encrypted.
*/
static bool noinstr amd_cc_platform_has(enum cc_attr attr)
{
#ifdef CONFIG_AMD_MEM_ENCRYPT
@ -97,6 +96,9 @@ static bool noinstr amd_cc_platform_has(enum cc_attr attr)
case CC_ATTR_GUEST_SEV_SNP:
return sev_status & MSR_AMD64_SEV_SNP_ENABLED;
case CC_ATTR_GUEST_SNP_SECURE_TSC:
return sev_status & MSR_AMD64_SNP_SECURE_TSC;
case CC_ATTR_HOST_SEV_SNP:
return cc_flags.host_sev_snp;


@ -13,3 +13,6 @@ KCOV_INSTRUMENT_core.o := n
# With some compiler versions the generated code results in boot hangs, caused
# by several compilation units. To be safe, disable all instrumentation.
KCSAN_SANITIZE := n
# Clang 14 and older may fail to respect __no_sanitize_undefined when inlining
UBSAN_SANITIZE := n


@ -25,6 +25,7 @@
#include <linux/psp-sev.h>
#include <linux/dmi.h>
#include <uapi/linux/sev-guest.h>
#include <crypto/gcm.h>
#include <asm/init.h>
#include <asm/cpu_entry_area.h>
@ -95,6 +96,15 @@ static u64 sev_hv_features __ro_after_init;
/* Secrets page physical address from the CC blob */
static u64 secrets_pa __ro_after_init;
/*
* For Secure TSC guests, the BSP fetches TSC_INFO using SNP guest messaging and
* initializes snp_tsc_scale and snp_tsc_offset. These values are replicated
* across the APs VMSA fields (TSC_SCALE and TSC_OFFSET).
*/
static u64 snp_tsc_scale __ro_after_init;
static u64 snp_tsc_offset __ro_after_init;
static u64 snp_tsc_freq_khz __ro_after_init;
/* #VC handler runtime per-CPU data */
struct sev_es_runtime_data {
struct ghcb ghcb_page;
@ -777,15 +787,10 @@ early_set_pages_state(unsigned long vaddr, unsigned long paddr,
val = sev_es_rd_ghcb_msr();
if (WARN(GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP,
"Wrong PSC response code: 0x%x\n",
(unsigned int)GHCB_RESP_CODE(val)))
if (GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP)
goto e_term;
if (WARN(GHCB_MSR_PSC_RESP_VAL(val),
"Failed to change page state to '%s' paddr 0x%lx error 0x%llx\n",
op == SNP_PAGE_STATE_PRIVATE ? "private" : "shared",
paddr, GHCB_MSR_PSC_RESP_VAL(val)))
if (GHCB_MSR_PSC_RESP_VAL(val))
goto e_term;
/* Page validation must be performed after changing to private */
@ -821,7 +826,7 @@ void __head early_snp_set_memory_private(unsigned long vaddr, unsigned long padd
early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_PRIVATE);
}
void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr,
void __head early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr,
unsigned long npages)
{
/*
@ -1276,6 +1281,12 @@ static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip)
vmsa->vmpl = snp_vmpl;
vmsa->sev_features = sev_status >> 2;
/* Populate AP's TSC scale/offset to get accurate TSC values. */
if (cc_platform_has(CC_ATTR_GUEST_SNP_SECURE_TSC)) {
vmsa->tsc_scale = snp_tsc_scale;
vmsa->tsc_offset = snp_tsc_offset;
}
/* Switch the page over to a VMSA page now that it is initialized */
ret = snp_set_vmsa(vmsa, caa, apic_id, true);
if (ret) {
@ -1418,6 +1429,41 @@ static enum es_result __vc_handle_msr_caa(struct pt_regs *regs, bool write)
return ES_OK;
}
/*
* TSC related accesses should not exit to the hypervisor when a guest is
* executing with Secure TSC enabled, so special handling is required for
* accesses of MSR_IA32_TSC and MSR_AMD64_GUEST_TSC_FREQ.
*/
static enum es_result __vc_handle_secure_tsc_msrs(struct pt_regs *regs, bool write)
{
u64 tsc;
/*
* GUEST_TSC_FREQ should not be intercepted when Secure TSC is enabled.
* Terminate the SNP guest when the interception is enabled.
*/
if (regs->cx == MSR_AMD64_GUEST_TSC_FREQ)
return ES_VMM_ERROR;
/*
* Writes: Writing to MSR_IA32_TSC can cause subsequent reads of the TSC
* to return undefined values, so ignore all writes.
*
* Reads: Reads of MSR_IA32_TSC should return the current TSC value, use
* the value returned by rdtsc_ordered().
*/
if (write) {
WARN_ONCE(1, "TSC MSR writes are verboten!\n");
return ES_OK;
}
tsc = rdtsc_ordered();
regs->ax = lower_32_bits(tsc);
regs->dx = upper_32_bits(tsc);
return ES_OK;
}
static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
{
struct pt_regs *regs = ctxt->regs;
@ -1427,8 +1473,18 @@ static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
/* Is it a WRMSR? */
write = ctxt->insn.opcode.bytes[1] == 0x30;
if (regs->cx == MSR_SVSM_CAA)
switch (regs->cx) {
case MSR_SVSM_CAA:
return __vc_handle_msr_caa(regs, write);
case MSR_IA32_TSC:
case MSR_AMD64_GUEST_TSC_FREQ:
if (sev_status & MSR_AMD64_SNP_SECURE_TSC)
return __vc_handle_secure_tsc_msrs(regs, write);
else
break;
default:
break;
}
ghcb_set_rcx(ghcb, regs->cx);
if (write) {
@ -2360,7 +2416,7 @@ static __head void svsm_setup(struct cc_blob_sev_info *cc_info)
call.rcx = pa;
ret = svsm_perform_call_protocol(&call);
if (ret)
panic("Can't remap the SVSM CA, ret=%d, rax_out=0x%llx\n", ret, call.rax_out);
sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SVSM_CA_REMAP_FAIL);
RIP_REL_REF(boot_svsm_caa) = (struct svsm_ca *)pa;
RIP_REL_REF(boot_svsm_caa_pa) = pa;
@ -2506,8 +2562,8 @@ int snp_issue_svsm_attest_req(u64 call_id, struct svsm_call *call,
}
EXPORT_SYMBOL_GPL(snp_issue_svsm_attest_req);
int snp_issue_guest_request(struct snp_guest_req *req, struct snp_req_data *input,
struct snp_guest_request_ioctl *rio)
static int snp_issue_guest_request(struct snp_guest_req *req, struct snp_req_data *input,
struct snp_guest_request_ioctl *rio)
{
struct ghcb_state state;
struct es_em_ctxt ctxt;
@ -2569,7 +2625,6 @@ e_restore_irq:
return ret;
}
EXPORT_SYMBOL_GPL(snp_issue_guest_request);
static struct platform_device sev_guest_device = {
.name = "sev-guest",
@ -2578,15 +2633,9 @@ static struct platform_device sev_guest_device = {
static int __init snp_init_platform_device(void)
{
struct sev_guest_platform_data data;
if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
return -ENODEV;
data.secrets_gpa = secrets_pa;
if (platform_device_add_data(&sev_guest_device, &data, sizeof(data)))
return -ENODEV;
if (platform_device_register(&sev_guest_device))
return -ENODEV;
@ -2665,3 +2714,581 @@ static int __init sev_sysfs_init(void)
}
arch_initcall(sev_sysfs_init);
#endif // CONFIG_SYSFS
static void free_shared_pages(void *buf, size_t sz)
{
unsigned int npages = PAGE_ALIGN(sz) >> PAGE_SHIFT;
int ret;
if (!buf)
return;
ret = set_memory_encrypted((unsigned long)buf, npages);
if (ret) {
WARN_ONCE(ret, "failed to restore encryption mask (leak it)\n");
return;
}
__free_pages(virt_to_page(buf), get_order(sz));
}
static void *alloc_shared_pages(size_t sz)
{
unsigned int npages = PAGE_ALIGN(sz) >> PAGE_SHIFT;
struct page *page;
int ret;
page = alloc_pages(GFP_KERNEL_ACCOUNT, get_order(sz));
if (!page)
return NULL;
ret = set_memory_decrypted((unsigned long)page_address(page), npages);
if (ret) {
pr_err("failed to mark page shared, ret=%d\n", ret);
__free_pages(page, get_order(sz));
return NULL;
}
return page_address(page);
}
static u8 *get_vmpck(int id, struct snp_secrets_page *secrets, u32 **seqno)
{
u8 *key = NULL;
switch (id) {
case 0:
*seqno = &secrets->os_area.msg_seqno_0;
key = secrets->vmpck0;
break;
case 1:
*seqno = &secrets->os_area.msg_seqno_1;
key = secrets->vmpck1;
break;
case 2:
*seqno = &secrets->os_area.msg_seqno_2;
key = secrets->vmpck2;
break;
case 3:
*seqno = &secrets->os_area.msg_seqno_3;
key = secrets->vmpck3;
break;
default:
break;
}
return key;
}
static struct aesgcm_ctx *snp_init_crypto(u8 *key, size_t keylen)
{
struct aesgcm_ctx *ctx;
ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx)
return NULL;
if (aesgcm_expandkey(ctx, key, keylen, AUTHTAG_LEN)) {
pr_err("Crypto context initialization failed\n");
kfree(ctx);
return NULL;
}
return ctx;
}
int snp_msg_init(struct snp_msg_desc *mdesc, int vmpck_id)
{
/* Adjust the default VMPCK key based on the executing VMPL level */
if (vmpck_id == -1)
vmpck_id = snp_vmpl;
mdesc->vmpck = get_vmpck(vmpck_id, mdesc->secrets, &mdesc->os_area_msg_seqno);
if (!mdesc->vmpck) {
pr_err("Invalid VMPCK%d communication key\n", vmpck_id);
return -EINVAL;
}
/* Verify that VMPCK is not zero. */
if (!memchr_inv(mdesc->vmpck, 0, VMPCK_KEY_LEN)) {
pr_err("Empty VMPCK%d communication key\n", vmpck_id);
return -EINVAL;
}
mdesc->vmpck_id = vmpck_id;
mdesc->ctx = snp_init_crypto(mdesc->vmpck, VMPCK_KEY_LEN);
if (!mdesc->ctx)
return -ENOMEM;
return 0;
}
EXPORT_SYMBOL_GPL(snp_msg_init);
struct snp_msg_desc *snp_msg_alloc(void)
{
struct snp_msg_desc *mdesc;
void __iomem *mem;
BUILD_BUG_ON(sizeof(struct snp_guest_msg) > PAGE_SIZE);
mdesc = kzalloc(sizeof(struct snp_msg_desc), GFP_KERNEL);
if (!mdesc)
return ERR_PTR(-ENOMEM);
mem = ioremap_encrypted(secrets_pa, PAGE_SIZE);
if (!mem)
goto e_free_mdesc;
mdesc->secrets = (__force struct snp_secrets_page *)mem;
/* Allocate the shared page used for the request and response message. */
mdesc->request = alloc_shared_pages(sizeof(struct snp_guest_msg));
if (!mdesc->request)
goto e_unmap;
mdesc->response = alloc_shared_pages(sizeof(struct snp_guest_msg));
if (!mdesc->response)
goto e_free_request;
mdesc->certs_data = alloc_shared_pages(SEV_FW_BLOB_MAX_SIZE);
if (!mdesc->certs_data)
goto e_free_response;
/* Initialize the input addresses for the guest request */
mdesc->input.req_gpa = __pa(mdesc->request);
mdesc->input.resp_gpa = __pa(mdesc->response);
mdesc->input.data_gpa = __pa(mdesc->certs_data);
return mdesc;
e_free_response:
free_shared_pages(mdesc->response, sizeof(struct snp_guest_msg));
e_free_request:
free_shared_pages(mdesc->request, sizeof(struct snp_guest_msg));
e_unmap:
iounmap(mem);
e_free_mdesc:
kfree(mdesc);
return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL_GPL(snp_msg_alloc);
void snp_msg_free(struct snp_msg_desc *mdesc)
{
if (!mdesc)
return;
kfree(mdesc->ctx);
free_shared_pages(mdesc->response, sizeof(struct snp_guest_msg));
free_shared_pages(mdesc->request, sizeof(struct snp_guest_msg));
free_shared_pages(mdesc->certs_data, SEV_FW_BLOB_MAX_SIZE);
iounmap((__force void __iomem *)mdesc->secrets);
memset(mdesc, 0, sizeof(*mdesc));
kfree(mdesc);
}
EXPORT_SYMBOL_GPL(snp_msg_free);
/* Mutex to serialize the shared buffer access and command handling. */
static DEFINE_MUTEX(snp_cmd_mutex);
/*
* If an error is received from the host or AMD Secure Processor (ASP) there
* are two options. Either retry the exact same encrypted request or discontinue
* using the VMPCK.
*
* This is because in the current encryption scheme GHCB v2 uses AES-GCM to
* encrypt the requests. The IV for this scheme is the sequence number. GCM
* cannot tolerate IV reuse.
*
* The ASP FW v1.51 only increments the sequence numbers on a successful
* guest<->ASP back and forth and only accepts messages at its exact sequence
* number.
*
* So if the sequence number were to be reused the encryption scheme is
* vulnerable. If the sequence number were incremented for a fresh IV the ASP
* will reject the request.
*/
static void snp_disable_vmpck(struct snp_msg_desc *mdesc)
{
pr_alert("Disabling VMPCK%d communication key to prevent IV reuse.\n",
mdesc->vmpck_id);
memzero_explicit(mdesc->vmpck, VMPCK_KEY_LEN);
mdesc->vmpck = NULL;
}
static inline u64 __snp_get_msg_seqno(struct snp_msg_desc *mdesc)
{
u64 count;
lockdep_assert_held(&snp_cmd_mutex);
/* Read the current message sequence counter from the secrets page */
count = *mdesc->os_area_msg_seqno;
return count + 1;
}
/* Return a non-zero sequence number on success, zero on counter overflow */
static u64 snp_get_msg_seqno(struct snp_msg_desc *mdesc)
{
u64 count = __snp_get_msg_seqno(mdesc);
/*
* The message sequence counter for the SNP guest request is a 64-bit
* value but the version 2 of GHCB specification defines a 32-bit storage
* for it. If the counter exceeds the 32-bit value then return zero.
* The caller should check the return value, but if the caller happens to
* not check the value and use it, then the firmware treats zero as an
* invalid number and will fail the message request.
*/
if (count >= UINT_MAX) {
pr_err("request message sequence counter overflow\n");
return 0;
}
return count;
}
static void snp_inc_msg_seqno(struct snp_msg_desc *mdesc)
{
/*
* The counter is also incremented by the PSP, so increment it by 2
* and save in secrets page.
*/
*mdesc->os_area_msg_seqno += 2;
}
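/*
 * Worked example of the numbering scheme implemented by the helpers above,
 * assuming the counter in the secrets page starts at zero: the first request
 * is sent with seqno 1, the PSP answers with seqno 2, and snp_inc_msg_seqno()
 * then stores 2 back into the secrets page. The next request therefore uses
 * seqno 3, so guest requests always carry odd sequence numbers and responses
 * the following even ones.
 */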
static int verify_and_dec_payload(struct snp_msg_desc *mdesc, struct snp_guest_req *req)
{
struct snp_guest_msg *resp_msg = &mdesc->secret_response;
struct snp_guest_msg *req_msg = &mdesc->secret_request;
struct snp_guest_msg_hdr *req_msg_hdr = &req_msg->hdr;
struct snp_guest_msg_hdr *resp_msg_hdr = &resp_msg->hdr;
struct aesgcm_ctx *ctx = mdesc->ctx;
u8 iv[GCM_AES_IV_SIZE] = {};
pr_debug("response [seqno %lld type %d version %d sz %d]\n",
resp_msg_hdr->msg_seqno, resp_msg_hdr->msg_type, resp_msg_hdr->msg_version,
resp_msg_hdr->msg_sz);
/* Copy response from shared memory to encrypted memory. */
memcpy(resp_msg, mdesc->response, sizeof(*resp_msg));
/* Verify that the sequence counter is incremented by 1 */
if (unlikely(resp_msg_hdr->msg_seqno != (req_msg_hdr->msg_seqno + 1)))
return -EBADMSG;
/* Verify response message type and version number. */
if (resp_msg_hdr->msg_type != (req_msg_hdr->msg_type + 1) ||
resp_msg_hdr->msg_version != req_msg_hdr->msg_version)
return -EBADMSG;
/*
* If the message size is greater than our buffer length then return
* an error.
*/
if (unlikely((resp_msg_hdr->msg_sz + ctx->authsize) > req->resp_sz))
return -EBADMSG;
/* Decrypt the payload */
memcpy(iv, &resp_msg_hdr->msg_seqno, min(sizeof(iv), sizeof(resp_msg_hdr->msg_seqno)));
if (!aesgcm_decrypt(ctx, req->resp_buf, resp_msg->payload, resp_msg_hdr->msg_sz,
&resp_msg_hdr->algo, AAD_LEN, iv, resp_msg_hdr->authtag))
return -EBADMSG;
return 0;
}
static int enc_payload(struct snp_msg_desc *mdesc, u64 seqno, struct snp_guest_req *req)
{
struct snp_guest_msg *msg = &mdesc->secret_request;
struct snp_guest_msg_hdr *hdr = &msg->hdr;
struct aesgcm_ctx *ctx = mdesc->ctx;
u8 iv[GCM_AES_IV_SIZE] = {};
memset(msg, 0, sizeof(*msg));
hdr->algo = SNP_AEAD_AES_256_GCM;
hdr->hdr_version = MSG_HDR_VER;
hdr->hdr_sz = sizeof(*hdr);
hdr->msg_type = req->msg_type;
hdr->msg_version = req->msg_version;
hdr->msg_seqno = seqno;
hdr->msg_vmpck = req->vmpck_id;
hdr->msg_sz = req->req_sz;
/* Verify the sequence number is non-zero */
if (!hdr->msg_seqno)
return -ENOSR;
pr_debug("request [seqno %lld type %d version %d sz %d]\n",
hdr->msg_seqno, hdr->msg_type, hdr->msg_version, hdr->msg_sz);
if (WARN_ON((req->req_sz + ctx->authsize) > sizeof(msg->payload)))
return -EBADMSG;
memcpy(iv, &hdr->msg_seqno, min(sizeof(iv), sizeof(hdr->msg_seqno)));
aesgcm_encrypt(ctx, msg->payload, req->req_buf, req->req_sz, &hdr->algo,
AAD_LEN, iv, hdr->authtag);
return 0;
}
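/*
 * Note on the IV construction in enc_payload()/verify_and_dec_payload(): the
 * 96-bit GCM IV starts out zeroed and its first eight bytes are filled with
 * the 64-bit message sequence number, so a unique seqno is what guarantees a
 * unique IV (hence the IV-reuse rules described above).
 */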
static int __handle_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req,
struct snp_guest_request_ioctl *rio)
{
unsigned long req_start = jiffies;
unsigned int override_npages = 0;
u64 override_err = 0;
int rc;
retry_request:
/*
* Call firmware to process the request. In this function the encrypted
* message enters shared memory with the host. So after this call the
* sequence number must be incremented or the VMPCK must be deleted to
* prevent reuse of the IV.
*/
rc = snp_issue_guest_request(req, &mdesc->input, rio);
switch (rc) {
case -ENOSPC:
/*
* If the extended guest request fails due to having too
* small of a certificate data buffer, retry the same
* guest request without the extended data request in
* order to increment the sequence number and thus avoid
* IV reuse.
*/
override_npages = mdesc->input.data_npages;
req->exit_code = SVM_VMGEXIT_GUEST_REQUEST;
/*
* Override the error to inform callers the given extended
* request buffer size was too small and give the caller the
* required buffer size.
*/
override_err = SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN);
/*
* If this call to the firmware succeeds, the sequence number can
* be incremented allowing for continued use of the VMPCK. If
* there is an error reflected in the return value, this value
* is checked further down and the result will be the deletion
* of the VMPCK and the error code being propagated back to the
* user as an ioctl() return code.
*/
goto retry_request;
/*
* The host may return SNP_GUEST_VMM_ERR_BUSY if the request has been
* throttled. Retry in the driver to avoid returning and reusing the
* message sequence number on a different message.
*/
case -EAGAIN:
if (jiffies - req_start > SNP_REQ_MAX_RETRY_DURATION) {
rc = -ETIMEDOUT;
break;
}
schedule_timeout_killable(SNP_REQ_RETRY_DELAY);
goto retry_request;
}
/*
* Increment the message sequence number. There is no harm in doing
* this now because decryption uses the value stored in the response
* structure and any failure will wipe the VMPCK, preventing further
* use anyway.
*/
snp_inc_msg_seqno(mdesc);
if (override_err) {
rio->exitinfo2 = override_err;
/*
* If an extended guest request was issued and the supplied certificate
* buffer was not large enough, a standard guest request was issued to
* prevent IV reuse. If the standard request was successful, return -EIO
* back to the caller as would have originally been returned.
*/
if (!rc && override_err == SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN))
rc = -EIO;
}
if (override_npages)
mdesc->input.data_npages = override_npages;
return rc;
}
int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req,
struct snp_guest_request_ioctl *rio)
{
u64 seqno;
int rc;
guard(mutex)(&snp_cmd_mutex);
/* Check if the VMPCK is not empty */
if (!mdesc->vmpck || !memchr_inv(mdesc->vmpck, 0, VMPCK_KEY_LEN)) {
pr_err_ratelimited("VMPCK is disabled\n");
return -ENOTTY;
}
/* Get the message sequence number and verify that it is non-zero */
seqno = snp_get_msg_seqno(mdesc);
if (!seqno)
return -EIO;
/* Clear shared memory's response for the host to populate. */
memset(mdesc->response, 0, sizeof(struct snp_guest_msg));
/* Encrypt the userspace provided payload in mdesc->secret_request. */
rc = enc_payload(mdesc, seqno, req);
if (rc)
return rc;
/*
* Write the fully encrypted request to the shared unencrypted
* request page.
*/
memcpy(mdesc->request, &mdesc->secret_request, sizeof(mdesc->secret_request));
rc = __handle_guest_request(mdesc, req, rio);
if (rc) {
if (rc == -EIO &&
rio->exitinfo2 == SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN))
return rc;
pr_alert("Detected error from ASP request. rc: %d, exitinfo2: 0x%llx\n",
rc, rio->exitinfo2);
snp_disable_vmpck(mdesc);
return rc;
}
rc = verify_and_dec_payload(mdesc, req);
if (rc) {
pr_alert("Detected unexpected decode failure from ASP. rc: %d\n", rc);
snp_disable_vmpck(mdesc);
return rc;
}
return 0;
}
EXPORT_SYMBOL_GPL(snp_send_guest_request);
static int __init snp_get_tsc_info(void)
{
struct snp_guest_request_ioctl *rio;
struct snp_tsc_info_resp *tsc_resp;
struct snp_tsc_info_req *tsc_req;
struct snp_msg_desc *mdesc;
struct snp_guest_req *req;
int rc = -ENOMEM;
tsc_req = kzalloc(sizeof(*tsc_req), GFP_KERNEL);
if (!tsc_req)
return rc;
/*
* The intermediate response buffer is used while decrypting the
* response payload. Make sure that it has enough space to cover
* the authtag.
*/
tsc_resp = kzalloc(sizeof(*tsc_resp) + AUTHTAG_LEN, GFP_KERNEL);
if (!tsc_resp)
goto e_free_tsc_req;
req = kzalloc(sizeof(*req), GFP_KERNEL);
if (!req)
goto e_free_tsc_resp;
rio = kzalloc(sizeof(*rio), GFP_KERNEL);
if (!rio)
goto e_free_req;
mdesc = snp_msg_alloc();
if (IS_ERR_OR_NULL(mdesc))
goto e_free_rio;
rc = snp_msg_init(mdesc, snp_vmpl);
if (rc)
goto e_free_mdesc;
req->msg_version = MSG_HDR_VER;
req->msg_type = SNP_MSG_TSC_INFO_REQ;
req->vmpck_id = snp_vmpl;
req->req_buf = tsc_req;
req->req_sz = sizeof(*tsc_req);
req->resp_buf = (void *)tsc_resp;
req->resp_sz = sizeof(*tsc_resp) + AUTHTAG_LEN;
req->exit_code = SVM_VMGEXIT_GUEST_REQUEST;
rc = snp_send_guest_request(mdesc, req, rio);
if (rc)
goto e_request;
pr_debug("%s: response status 0x%x scale 0x%llx offset 0x%llx factor 0x%x\n",
__func__, tsc_resp->status, tsc_resp->tsc_scale, tsc_resp->tsc_offset,
tsc_resp->tsc_factor);
if (!tsc_resp->status) {
snp_tsc_scale = tsc_resp->tsc_scale;
snp_tsc_offset = tsc_resp->tsc_offset;
} else {
pr_err("Failed to get TSC info, response status 0x%x\n", tsc_resp->status);
rc = -EIO;
}
e_request:
/* The response buffer contains sensitive data, explicitly clear it. */
memzero_explicit(tsc_resp, sizeof(*tsc_resp) + AUTHTAG_LEN);
e_free_mdesc:
snp_msg_free(mdesc);
e_free_rio:
kfree(rio);
e_free_req:
kfree(req);
e_free_tsc_resp:
kfree(tsc_resp);
e_free_tsc_req:
kfree(tsc_req);
return rc;
}
void __init snp_secure_tsc_prepare(void)
{
if (!cc_platform_has(CC_ATTR_GUEST_SNP_SECURE_TSC))
return;
if (snp_get_tsc_info()) {
pr_alert("Unable to retrieve Secure TSC info from ASP\n");
sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SECURE_TSC);
}
pr_debug("SecureTSC enabled");
}
static unsigned long securetsc_get_tsc_khz(void)
{
return snp_tsc_freq_khz;
}
void __init snp_secure_tsc_init(void)
{
unsigned long long tsc_freq_mhz;
if (!cc_platform_has(CC_ATTR_GUEST_SNP_SECURE_TSC))
return;
setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
rdmsrl(MSR_AMD64_GUEST_TSC_FREQ, tsc_freq_mhz);
snp_tsc_freq_khz = (unsigned long)(tsc_freq_mhz * 1000);
x86_platform.calibrate_cpu = securetsc_get_tsc_khz;
x86_platform.calibrate_tsc = securetsc_get_tsc_khz;
}


@ -498,7 +498,7 @@ static const struct snp_cpuid_table *snp_cpuid_get_table(void)
*
* Return: XSAVE area size on success, 0 otherwise.
*/
static u32 snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted)
static u32 __head snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted)
{
const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
u64 xfeatures_found = 0;
@ -576,8 +576,9 @@ static void snp_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpui
sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID_HV);
}
static int snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
struct cpuid_leaf *leaf)
static int __head
snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
struct cpuid_leaf *leaf)
{
struct cpuid_leaf leaf_hv = *leaf;
@ -1140,6 +1141,16 @@ static enum es_result vc_handle_rdtsc(struct ghcb *ghcb,
bool rdtscp = (exit_code == SVM_EXIT_RDTSCP);
enum es_result ret;
/*
* The hypervisor should not be intercepting RDTSC/RDTSCP when Secure
* TSC is enabled. A #VC exception will be generated if the RDTSC/RDTSCP
* instructions are being intercepted. If this should occur and Secure
* TSC is enabled, guest execution should be terminated as the guest
* cannot rely on the TSC value provided by the hypervisor.
*/
if (sev_status & MSR_AMD64_SNP_SECURE_TSC)
return ES_VMM_ERROR;
ret = sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, 0, 0);
if (ret != ES_OK)
return ret;
@ -1243,7 +1254,7 @@ static void svsm_pval_terminate(struct svsm_pvalidate_call *pc, int ret, u64 svs
__pval_terminate(pfn, action, page_size, ret, svsm_ret);
}
static void svsm_pval_4k_page(unsigned long paddr, bool validate)
static void __head svsm_pval_4k_page(unsigned long paddr, bool validate)
{
struct svsm_pvalidate_call *pc;
struct svsm_call call = {};
@ -1275,12 +1286,13 @@ static void svsm_pval_4k_page(unsigned long paddr, bool validate)
ret = svsm_perform_call_protocol(&call);
if (ret)
svsm_pval_terminate(pc, ret, call.rax_out);
sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE);
native_local_irq_restore(flags);
}
static void pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, bool validate)
static void __head pvalidate_4k_page(unsigned long vaddr, unsigned long paddr,
bool validate)
{
int ret;
@ -1293,7 +1305,7 @@ static void pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, bool val
} else {
ret = pvalidate(vaddr, RMP_PG_SIZE_4K, validate);
if (ret)
__pval_terminate(PHYS_PFN(paddr), validate, RMP_PG_SIZE_4K, ret, 0);
sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE);
}
}


@ -1,3 +1,3 @@
# SPDX-License-Identifier: GPL-2.0
obj-y += tdx.o tdx-shared.o tdcall.o
obj-y += debug.o tdcall.o tdx.o tdx-shared.o

arch/x86/coco/tdx/debug.c (new file, 69 lines)

@ -0,0 +1,69 @@
// SPDX-License-Identifier: GPL-2.0
#undef pr_fmt
#define pr_fmt(fmt) "tdx: " fmt
#include <linux/array_size.h>
#include <linux/printk.h>
#include <asm/tdx.h>
#define DEF_TDX_ATTR_NAME(_name) [TDX_ATTR_##_name##_BIT] = __stringify(_name)
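/*
 * For reference, DEF_TDX_ATTR_NAME(DEBUG) expands to
 * [TDX_ATTR_DEBUG_BIT] = "DEBUG", so each table below simply maps a bit
 * position to its printable name.
 */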
static __initdata const char *tdx_attributes[] = {
DEF_TDX_ATTR_NAME(DEBUG),
DEF_TDX_ATTR_NAME(HGS_PLUS_PROF),
DEF_TDX_ATTR_NAME(PERF_PROF),
DEF_TDX_ATTR_NAME(PMT_PROF),
DEF_TDX_ATTR_NAME(ICSSD),
DEF_TDX_ATTR_NAME(LASS),
DEF_TDX_ATTR_NAME(SEPT_VE_DISABLE),
DEF_TDX_ATTR_NAME(MIGRTABLE),
DEF_TDX_ATTR_NAME(PKS),
DEF_TDX_ATTR_NAME(KL),
DEF_TDX_ATTR_NAME(TPA),
DEF_TDX_ATTR_NAME(PERFMON),
};
#define DEF_TD_CTLS_NAME(_name) [TD_CTLS_##_name##_BIT] = __stringify(_name)
static __initdata const char *tdcs_td_ctls[] = {
DEF_TD_CTLS_NAME(PENDING_VE_DISABLE),
DEF_TD_CTLS_NAME(ENUM_TOPOLOGY),
DEF_TD_CTLS_NAME(VIRT_CPUID2),
DEF_TD_CTLS_NAME(REDUCE_VE),
DEF_TD_CTLS_NAME(LOCK),
};
void __init tdx_dump_attributes(u64 td_attr)
{
pr_info("Attributes:");
for (int i = 0; i < ARRAY_SIZE(tdx_attributes); i++) {
if (!tdx_attributes[i])
continue;
if (td_attr & BIT(i))
pr_cont(" %s", tdx_attributes[i]);
td_attr &= ~BIT(i);
}
if (td_attr)
pr_cont(" unknown:%#llx", td_attr);
pr_cont("\n");
}
void __init tdx_dump_td_ctls(u64 td_ctls)
{
pr_info("TD_CTLS:");
for (int i = 0; i < ARRAY_SIZE(tdcs_td_ctls); i++) {
if (!tdcs_td_ctls[i])
continue;
if (td_ctls & BIT(i))
pr_cont(" %s", tdcs_td_ctls[i]);
td_ctls &= ~BIT(i);
}
if (td_ctls)
pr_cont(" unknown:%#llx", td_ctls);
pr_cont("\n");
}
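/*
 * Illustrative boot log output for a hypothetical attribute/control set,
 * given the "tdx: " pr_fmt above:
 *
 *	tdx: Attributes: SEPT_VE_DISABLE
 *	tdx: TD_CTLS: PENDING_VE_DISABLE ENUM_TOPOLOGY
 */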


@ -32,9 +32,6 @@
#define VE_GET_PORT_NUM(e) ((e) >> 16)
#define VE_IS_IO_STRING(e) ((e) & BIT(4))
#define ATTR_DEBUG BIT(0)
#define ATTR_SEPT_VE_DISABLE BIT(28)
/* TDX Module call error codes */
#define TDCALL_RETURN_CODE(a) ((a) >> 32)
#define TDCALL_INVALID_OPERAND 0xc0000100
@ -200,14 +197,14 @@ static void __noreturn tdx_panic(const char *msg)
*
* TDX 1.0 does not allow the guest to disable SEPT #VE on its own. The VMM
* controls if the guest will receive such #VE with TD attribute
* ATTR_SEPT_VE_DISABLE.
* TDX_ATTR_SEPT_VE_DISABLE.
*
* Newer TDX modules allow the guest to control if it wants to receive SEPT
* violation #VEs.
*
* Check if the feature is available and disable SEPT #VE if possible.
*
* If the TD is allowed to disable/enable SEPT #VEs, the ATTR_SEPT_VE_DISABLE
* If the TD is allowed to disable/enable SEPT #VEs, the TDX_ATTR_SEPT_VE_DISABLE
* attribute is no longer reliable. It reflects the initial state of the
* control for the TD, but it will not be updated if someone (e.g. bootloader)
* changes it before the kernel starts. Kernel must check TDCS_TD_CTLS bit to
@ -216,14 +213,14 @@ static void __noreturn tdx_panic(const char *msg)
static void disable_sept_ve(u64 td_attr)
{
const char *msg = "TD misconfiguration: SEPT #VE has to be disabled";
bool debug = td_attr & ATTR_DEBUG;
bool debug = td_attr & TDX_ATTR_DEBUG;
u64 config, controls;
/* Is this TD allowed to disable SEPT #VE */
tdg_vm_rd(TDCS_CONFIG_FLAGS, &config);
if (!(config & TDCS_CONFIG_FLEXIBLE_PENDING_VE)) {
/* No SEPT #VE controls for the guest: check the attribute */
if (td_attr & ATTR_SEPT_VE_DISABLE)
if (td_attr & TDX_ATTR_SEPT_VE_DISABLE)
return;
/* Relax SEPT_VE_DISABLE check for debug TD for backtraces */
@ -274,6 +271,20 @@ static void enable_cpu_topology_enumeration(void)
tdg_vm_wr(TDCS_TD_CTLS, TD_CTLS_ENUM_TOPOLOGY, TD_CTLS_ENUM_TOPOLOGY);
}
static void reduce_unnecessary_ve(void)
{
u64 err = tdg_vm_wr(TDCS_TD_CTLS, TD_CTLS_REDUCE_VE, TD_CTLS_REDUCE_VE);
if (err == TDX_SUCCESS)
return;
/*
* Enabling REDUCE_VE includes ENUM_TOPOLOGY. Only try to
* enable ENUM_TOPOLOGY if REDUCE_VE was not successful.
*/
enable_cpu_topology_enumeration();
}
static void tdx_setup(u64 *cc_mask)
{
struct tdx_module_args args = {};
@ -305,7 +316,8 @@ static void tdx_setup(u64 *cc_mask)
tdg_vm_wr(TDCS_NOTIFY_ENABLES, 0, -1ULL);
disable_sept_ve(td_attr);
enable_cpu_topology_enumeration();
reduce_unnecessary_ve();
}
/*
@ -1025,6 +1037,20 @@ static void tdx_kexec_finish(void)
}
}
static __init void tdx_announce(void)
{
struct tdx_module_args args = {};
u64 controls;
pr_info("Guest detected\n");
tdcall(TDG_VP_INFO, &args);
tdx_dump_attributes(args.rdx);
tdg_vm_rd(TDCS_TD_CTLS, &controls);
tdx_dump_td_ctls(controls);
}
void __init tdx_early_init(void)
{
u64 cc_mask;
@ -1094,5 +1120,5 @@ void __init tdx_early_init(void)
*/
x86_cpuinit.parallel_bringup = false;
pr_info("Guest detected\n");
tdx_announce();
}


@ -308,10 +308,9 @@ SYM_CODE_END(xen_error_entry)
movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
.endif
call \cfunc
/* For some configurations \cfunc ends up being a noreturn. */
REACHABLE
ANNOTATE_REACHABLE
call \cfunc
jmp error_return
.endm
@ -529,10 +528,10 @@ SYM_CODE_START(\asmsym)
movq %rsp, %rdi /* pt_regs pointer into first argument */
movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/
movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
call \cfunc
/* For some configurations \cfunc ends up being a noreturn. */
REACHABLE
ANNOTATE_REACHABLE
call \cfunc
jmp paranoid_exit


@ -1001,8 +1001,7 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs)
if (!x86_perf_event_set_period(event))
continue;
if (has_branch_stack(event))
perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL);
perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL);
if (perf_event_overflow(event, &data, regs))
x86_pmu_stop(event, 0);


@ -31,6 +31,8 @@ static u32 ibs_caps;
#define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT)
#define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT
/* attr.config2 */
#define IBS_SW_FILTER_MASK 1
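/*
 * IBS_SW_FILTER_MASK selects bit 0 of attr.config2, exposed below as the
 * "swfilt" format attribute. A hedged userspace sketch would be something
 * like:
 *
 *	perf record -e ibs_op/swfilt=1/u -- <workload>
 *
 * i.e. request software filtering so that exclude_kernel/exclude_user are
 * honoured in the interrupt handler instead of being rejected at event init.
 */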
/*
* IBS states:
@ -290,6 +292,16 @@ static int perf_ibs_init(struct perf_event *event)
if (has_branch_stack(event))
return -EOPNOTSUPP;
/* handle exclude_{user,kernel} in the IRQ handler */
if (event->attr.exclude_host || event->attr.exclude_guest ||
event->attr.exclude_idle)
return -EINVAL;
if (!(event->attr.config2 & IBS_SW_FILTER_MASK) &&
(event->attr.exclude_kernel || event->attr.exclude_user ||
event->attr.exclude_hv))
return -EINVAL;
ret = validate_group(event);
if (ret)
return ret;
@ -550,24 +562,14 @@ static struct attribute *attrs_empty[] = {
NULL,
};
static struct attribute_group empty_format_group = {
.name = "format",
.attrs = attrs_empty,
};
static struct attribute_group empty_caps_group = {
.name = "caps",
.attrs = attrs_empty,
};
static const struct attribute_group *empty_attr_groups[] = {
&empty_format_group,
&empty_caps_group,
NULL,
};
PMU_FORMAT_ATTR(rand_en, "config:57");
PMU_FORMAT_ATTR(cnt_ctl, "config:19");
PMU_FORMAT_ATTR(swfilt, "config2:0");
PMU_EVENT_ATTR_STRING(l3missonly, fetch_l3missonly, "config:59");
PMU_EVENT_ATTR_STRING(l3missonly, op_l3missonly, "config:16");
PMU_EVENT_ATTR_STRING(zen4_ibs_extensions, zen4_ibs_extensions, "1");
@ -578,8 +580,9 @@ zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int
return ibs_caps & IBS_CAPS_ZEN4 ? attr->mode : 0;
}
static struct attribute *rand_en_attrs[] = {
static struct attribute *fetch_attrs[] = {
&format_attr_rand_en.attr,
&format_attr_swfilt.attr,
NULL,
};
@ -593,9 +596,9 @@ static struct attribute *zen4_ibs_extensions_attrs[] = {
NULL,
};
static struct attribute_group group_rand_en = {
static struct attribute_group group_fetch_formats = {
.name = "format",
.attrs = rand_en_attrs,
.attrs = fetch_attrs,
};
static struct attribute_group group_fetch_l3missonly = {
@ -611,7 +614,7 @@ static struct attribute_group group_zen4_ibs_extensions = {
};
static const struct attribute_group *fetch_attr_groups[] = {
&group_rand_en,
&group_fetch_formats,
&empty_caps_group,
NULL,
};
@ -628,6 +631,11 @@ cnt_ctl_is_visible(struct kobject *kobj, struct attribute *attr, int i)
return ibs_caps & IBS_CAPS_OPCNT ? attr->mode : 0;
}
static struct attribute *op_attrs[] = {
&format_attr_swfilt.attr,
NULL,
};
static struct attribute *cnt_ctl_attrs[] = {
&format_attr_cnt_ctl.attr,
NULL,
@ -638,6 +646,11 @@ static struct attribute *op_l3missonly_attrs[] = {
NULL,
};
static struct attribute_group group_op_formats = {
.name = "format",
.attrs = op_attrs,
};
static struct attribute_group group_cnt_ctl = {
.name = "format",
.attrs = cnt_ctl_attrs,
@ -650,6 +663,12 @@ static struct attribute_group group_op_l3missonly = {
.is_visible = zen4_ibs_extensions_is_visible,
};
static const struct attribute_group *op_attr_groups[] = {
&group_op_formats,
&empty_caps_group,
NULL,
};
static const struct attribute_group *op_attr_update[] = {
&group_cnt_ctl,
&group_op_l3missonly,
@ -667,7 +686,6 @@ static struct perf_ibs perf_ibs_fetch = {
.start = perf_ibs_start,
.stop = perf_ibs_stop,
.read = perf_ibs_read,
.capabilities = PERF_PMU_CAP_NO_EXCLUDE,
},
.msr = MSR_AMD64_IBSFETCHCTL,
.config_mask = IBS_FETCH_CONFIG_MASK,
@ -691,7 +709,6 @@ static struct perf_ibs perf_ibs_op = {
.start = perf_ibs_start,
.stop = perf_ibs_stop,
.read = perf_ibs_read,
.capabilities = PERF_PMU_CAP_NO_EXCLUDE,
},
.msr = MSR_AMD64_IBSOPCTL,
.config_mask = IBS_OP_CONFIG_MASK,
@ -1111,6 +1128,12 @@ fail:
regs.flags |= PERF_EFLAGS_EXACT;
}
if ((event->attr.config2 & IBS_SW_FILTER_MASK) &&
perf_exclude_event(event, &regs)) {
throttle = perf_event_account_interrupt(event);
goto out;
}
if (event->attr.sample_type & PERF_SAMPLE_RAW) {
raw = (struct perf_raw_record){
.frag = {
@ -1118,7 +1141,7 @@ fail:
.data = ibs_data.data,
},
};
perf_sample_save_raw_data(&data, &raw);
perf_sample_save_raw_data(&data, event, &raw);
}
if (perf_ibs == &perf_ibs_op)
@ -1129,8 +1152,7 @@ fail:
* recorded as part of interrupt regs. Thus we need to use rip from
* interrupt regs while unwinding call stack.
*/
if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
perf_sample_save_callchain(&data, event, iregs);
perf_sample_save_callchain(&data, event, iregs);
throttle = perf_event_overflow(event, &data, &regs);
out:
@ -1228,7 +1250,7 @@ static __init int perf_ibs_op_init(void)
if (ibs_caps & IBS_CAPS_ZEN4)
perf_ibs_op.config_mask |= IBS_OP_L3MISSONLY;
perf_ibs_op.pmu.attr_groups = empty_attr_groups;
perf_ibs_op.pmu.attr_groups = op_attr_groups;
perf_ibs_op.pmu.attr_update = op_attr_update;
return perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");


@ -1707,8 +1707,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs)
perf_sample_data_init(&data, 0, event->hw.last_period);
if (has_branch_stack(event))
perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL);
perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL);
if (perf_event_overflow(event, &data, regs))
x86_pmu_stop(event, 0);


@ -2826,6 +2826,9 @@ static void intel_pmu_enable_fixed(struct perf_event *event)
return;
idx = INTEL_PMC_IDX_FIXED_SLOTS;
if (event->attr.config1 & INTEL_TD_CFG_METRIC_CLEAR)
bits |= INTEL_FIXED_3_METRICS_CLEAR;
}
intel_set_masks(event, idx);
@ -4081,7 +4084,12 @@ static int intel_pmu_hw_config(struct perf_event *event)
* is used in a metrics group, it too cannot support sampling.
*/
if (intel_pmu_has_cap(event, PERF_CAP_METRICS_IDX) && is_topdown_event(event)) {
if (event->attr.config1 || event->attr.config2)
/* The metrics_clear can only be set for the slots event */
if (event->attr.config1 &&
(!is_slots_event(event) || (event->attr.config1 & ~INTEL_TD_CFG_METRIC_CLEAR)))
return -EINVAL;
if (event->attr.config2)
return -EINVAL;
/*
@ -4690,6 +4698,8 @@ PMU_FORMAT_ATTR(in_tx, "config:32" );
PMU_FORMAT_ATTR(in_tx_cp, "config:33" );
PMU_FORMAT_ATTR(eq, "config:36" ); /* v6 + */
PMU_FORMAT_ATTR(metrics_clear, "config1:0"); /* PERF_CAPABILITIES.RDPMC_METRICS_CLEAR */
static ssize_t umask2_show(struct device *dev,
struct device_attribute *attr,
char *page)
@ -4709,6 +4719,7 @@ static struct device_attribute format_attr_umask2 =
static struct attribute *format_evtsel_ext_attrs[] = {
&format_attr_umask2.attr,
&format_attr_eq.attr,
&format_attr_metrics_clear.attr,
NULL
};
@ -4733,6 +4744,13 @@ evtsel_ext_is_visible(struct kobject *kobj, struct attribute *attr, int i)
if (i == 1)
return (mask & ARCH_PERFMON_EVENTSEL_EQ) ? attr->mode : 0;
/* PERF_CAPABILITIES.RDPMC_METRICS_CLEAR */
if (i == 2) {
union perf_capabilities intel_cap = hybrid(dev_get_drvdata(dev), intel_cap);
return intel_cap.rdpmc_metrics_clear ? attr->mode : 0;
}
return 0;
}
@ -5381,42 +5399,32 @@ static __init void intel_clovertown_quirk(void)
x86_pmu.pebs_constraints = NULL;
}
static const struct x86_cpu_desc isolation_ucodes[] = {
INTEL_CPU_DESC(INTEL_HASWELL, 3, 0x0000001f),
INTEL_CPU_DESC(INTEL_HASWELL_L, 1, 0x0000001e),
INTEL_CPU_DESC(INTEL_HASWELL_G, 1, 0x00000015),
INTEL_CPU_DESC(INTEL_HASWELL_X, 2, 0x00000037),
INTEL_CPU_DESC(INTEL_HASWELL_X, 4, 0x0000000a),
INTEL_CPU_DESC(INTEL_BROADWELL, 4, 0x00000023),
INTEL_CPU_DESC(INTEL_BROADWELL_G, 1, 0x00000014),
INTEL_CPU_DESC(INTEL_BROADWELL_D, 2, 0x00000010),
INTEL_CPU_DESC(INTEL_BROADWELL_D, 3, 0x07000009),
INTEL_CPU_DESC(INTEL_BROADWELL_D, 4, 0x0f000009),
INTEL_CPU_DESC(INTEL_BROADWELL_D, 5, 0x0e000002),
INTEL_CPU_DESC(INTEL_BROADWELL_X, 1, 0x0b000014),
INTEL_CPU_DESC(INTEL_SKYLAKE_X, 3, 0x00000021),
INTEL_CPU_DESC(INTEL_SKYLAKE_X, 4, 0x00000000),
INTEL_CPU_DESC(INTEL_SKYLAKE_X, 5, 0x00000000),
INTEL_CPU_DESC(INTEL_SKYLAKE_X, 6, 0x00000000),
INTEL_CPU_DESC(INTEL_SKYLAKE_X, 7, 0x00000000),
INTEL_CPU_DESC(INTEL_SKYLAKE_X, 11, 0x00000000),
INTEL_CPU_DESC(INTEL_SKYLAKE_L, 3, 0x0000007c),
INTEL_CPU_DESC(INTEL_SKYLAKE, 3, 0x0000007c),
INTEL_CPU_DESC(INTEL_KABYLAKE, 9, 0x0000004e),
INTEL_CPU_DESC(INTEL_KABYLAKE_L, 9, 0x0000004e),
INTEL_CPU_DESC(INTEL_KABYLAKE_L, 10, 0x0000004e),
INTEL_CPU_DESC(INTEL_KABYLAKE_L, 11, 0x0000004e),
INTEL_CPU_DESC(INTEL_KABYLAKE_L, 12, 0x0000004e),
INTEL_CPU_DESC(INTEL_KABYLAKE, 10, 0x0000004e),
INTEL_CPU_DESC(INTEL_KABYLAKE, 11, 0x0000004e),
INTEL_CPU_DESC(INTEL_KABYLAKE, 12, 0x0000004e),
INTEL_CPU_DESC(INTEL_KABYLAKE, 13, 0x0000004e),
static const struct x86_cpu_id isolation_ucodes[] = {
X86_MATCH_VFM_STEPS(INTEL_HASWELL, 3, 3, 0x0000001f),
X86_MATCH_VFM_STEPS(INTEL_HASWELL_L, 1, 1, 0x0000001e),
X86_MATCH_VFM_STEPS(INTEL_HASWELL_G, 1, 1, 0x00000015),
X86_MATCH_VFM_STEPS(INTEL_HASWELL_X, 2, 2, 0x00000037),
X86_MATCH_VFM_STEPS(INTEL_HASWELL_X, 4, 4, 0x0000000a),
X86_MATCH_VFM_STEPS(INTEL_BROADWELL, 4, 4, 0x00000023),
X86_MATCH_VFM_STEPS(INTEL_BROADWELL_G, 1, 1, 0x00000014),
X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 2, 2, 0x00000010),
X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 3, 3, 0x07000009),
X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 4, 4, 0x0f000009),
X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 5, 5, 0x0e000002),
X86_MATCH_VFM_STEPS(INTEL_BROADWELL_X, 1, 1, 0x0b000014),
X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 3, 3, 0x00000021),
X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 4, 7, 0x00000000),
X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 11, 11, 0x00000000),
X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_L, 3, 3, 0x0000007c),
X86_MATCH_VFM_STEPS(INTEL_SKYLAKE, 3, 3, 0x0000007c),
X86_MATCH_VFM_STEPS(INTEL_KABYLAKE, 9, 13, 0x0000004e),
X86_MATCH_VFM_STEPS(INTEL_KABYLAKE_L, 9, 12, 0x0000004e),
{}
};
static void intel_check_pebs_isolation(void)
{
x86_pmu.pebs_no_isolation = !x86_cpu_has_min_microcode_rev(isolation_ucodes);
x86_pmu.pebs_no_isolation = !x86_match_min_microcode_rev(isolation_ucodes);
}
static __init void intel_pebs_isolation_quirk(void)
@ -5426,16 +5434,16 @@ static __init void intel_pebs_isolation_quirk(void)
intel_check_pebs_isolation();
}
static const struct x86_cpu_desc pebs_ucodes[] = {
INTEL_CPU_DESC(INTEL_SANDYBRIDGE, 7, 0x00000028),
INTEL_CPU_DESC(INTEL_SANDYBRIDGE_X, 6, 0x00000618),
INTEL_CPU_DESC(INTEL_SANDYBRIDGE_X, 7, 0x0000070c),
static const struct x86_cpu_id pebs_ucodes[] = {
X86_MATCH_VFM_STEPS(INTEL_SANDYBRIDGE, 7, 7, 0x00000028),
X86_MATCH_VFM_STEPS(INTEL_SANDYBRIDGE_X, 6, 6, 0x00000618),
X86_MATCH_VFM_STEPS(INTEL_SANDYBRIDGE_X, 7, 7, 0x0000070c),
{}
};
static bool intel_snb_pebs_broken(void)
{
return !x86_cpu_has_min_microcode_rev(pebs_ucodes);
return !x86_match_min_microcode_rev(pebs_ucodes);
}
static void intel_snb_check_microcode(void)


@ -1789,8 +1789,7 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event,
* previous PMI context or an (I)RET happened between the record and
* PMI.
*/
if (sample_type & PERF_SAMPLE_CALLCHAIN)
perf_sample_save_callchain(data, event, iregs);
perf_sample_save_callchain(data, event, iregs);
/*
* We use the interrupt regs as a base because the PEBS record does not
@ -1889,8 +1888,7 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event,
if (x86_pmu.intel_cap.pebs_format >= 3)
setup_pebs_time(event, data, pebs->tsc);
if (has_branch_stack(event))
perf_sample_save_brstack(data, event, &cpuc->lbr_stack, NULL);
perf_sample_save_brstack(data, event, &cpuc->lbr_stack, NULL);
}
static void adaptive_pebs_save_regs(struct pt_regs *regs,
@ -1917,8 +1915,6 @@ static void adaptive_pebs_save_regs(struct pt_regs *regs,
}
#define PEBS_LATENCY_MASK 0xffff
#define PEBS_CACHE_LATENCY_OFFSET 32
#define PEBS_RETIRE_LATENCY_OFFSET 32
/*
* With adaptive PEBS the layout depends on what fields are configured.
@ -1932,8 +1928,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct pebs_basic *basic = __pebs;
void *next_record = basic + 1;
u64 sample_type;
u64 format_size;
u64 sample_type, format_group;
struct pebs_meminfo *meminfo = NULL;
struct pebs_gprs *gprs = NULL;
struct x86_perf_regs *perf_regs;
@ -1945,7 +1940,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
perf_regs->xmm_regs = NULL;
sample_type = event->attr.sample_type;
format_size = basic->format_size;
format_group = basic->format_group;
perf_sample_data_init(data, 0, event->hw.last_period);
data->period = event->hw.last_period;
@ -1957,8 +1952,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
* previous PMI context or an (I)RET happened between the record and
* PMI.
*/
if (sample_type & PERF_SAMPLE_CALLCHAIN)
perf_sample_save_callchain(data, event, iregs);
perf_sample_save_callchain(data, event, iregs);
*regs = *iregs;
/* The ip in basic is EventingIP */
@ -1967,7 +1961,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) {
if (x86_pmu.flags & PMU_FL_RETIRE_LATENCY)
data->weight.var3_w = format_size >> PEBS_RETIRE_LATENCY_OFFSET & PEBS_LATENCY_MASK;
data->weight.var3_w = basic->retire_latency;
else
data->weight.var3_w = 0;
}
@ -1977,12 +1971,12 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
* But PERF_SAMPLE_TRANSACTION needs gprs->ax.
* Save the pointer here but process later.
*/
if (format_size & PEBS_DATACFG_MEMINFO) {
if (format_group & PEBS_DATACFG_MEMINFO) {
meminfo = next_record;
next_record = meminfo + 1;
}
if (format_size & PEBS_DATACFG_GP) {
if (format_group & PEBS_DATACFG_GP) {
gprs = next_record;
next_record = gprs + 1;
@ -1995,14 +1989,13 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
adaptive_pebs_save_regs(regs, gprs);
}
if (format_size & PEBS_DATACFG_MEMINFO) {
if (format_group & PEBS_DATACFG_MEMINFO) {
if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) {
u64 weight = meminfo->latency;
u64 latency = x86_pmu.flags & PMU_FL_INSTR_LATENCY ?
meminfo->cache_latency : meminfo->mem_latency;
if (x86_pmu.flags & PMU_FL_INSTR_LATENCY) {
data->weight.var2_w = weight & PEBS_LATENCY_MASK;
weight >>= PEBS_CACHE_LATENCY_OFFSET;
}
if (x86_pmu.flags & PMU_FL_INSTR_LATENCY)
data->weight.var2_w = meminfo->instr_latency;
/*
* Although meminfo::latency is defined as a u64,
@ -2010,12 +2003,13 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
* in practice on Ice Lake and earlier platforms.
*/
if (sample_type & PERF_SAMPLE_WEIGHT) {
data->weight.full = weight ?:
data->weight.full = latency ?:
intel_get_tsx_weight(meminfo->tsx_tuning);
} else {
data->weight.var1_dw = (u32)(weight & PEBS_LATENCY_MASK) ?:
data->weight.var1_dw = (u32)latency ?:
intel_get_tsx_weight(meminfo->tsx_tuning);
}
data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE;
}
@ -2036,16 +2030,16 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
}
}
if (format_size & PEBS_DATACFG_XMMS) {
if (format_group & PEBS_DATACFG_XMMS) {
struct pebs_xmm *xmm = next_record;
next_record = xmm + 1;
perf_regs->xmm_regs = xmm->xmm;
}
if (format_size & PEBS_DATACFG_LBRS) {
if (format_group & PEBS_DATACFG_LBRS) {
struct lbr_entry *lbr = next_record;
int num_lbr = ((format_size >> PEBS_DATACFG_LBR_SHIFT)
int num_lbr = ((format_group >> PEBS_DATACFG_LBR_SHIFT)
& 0xff) + 1;
next_record = next_record + num_lbr * sizeof(struct lbr_entry);
@ -2055,11 +2049,11 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
}
}
WARN_ONCE(next_record != __pebs + (format_size >> 48),
"PEBS record size %llu, expected %llu, config %llx\n",
format_size >> 48,
WARN_ONCE(next_record != __pebs + basic->format_size,
"PEBS record size %u, expected %llu, config %llx\n",
basic->format_size,
(u64)(next_record - __pebs),
basic->format_size);
format_group);
}
static inline void *
@ -2170,46 +2164,33 @@ intel_pmu_save_and_restart_reload(struct perf_event *event, int count)
return 0;
}
typedef void (*setup_fn)(struct perf_event *, struct pt_regs *, void *,
struct perf_sample_data *, struct pt_regs *);
static struct pt_regs dummy_iregs;
static __always_inline void
__intel_pmu_pebs_event(struct perf_event *event,
struct pt_regs *iregs,
struct pt_regs *regs,
struct perf_sample_data *data,
void *base, void *top,
int bit, int count,
void (*setup_sample)(struct perf_event *,
struct pt_regs *,
void *,
struct perf_sample_data *,
struct pt_regs *))
void *at,
setup_fn setup_sample)
{
setup_sample(event, iregs, at, data, regs);
perf_event_output(event, data, regs);
}
static __always_inline void
__intel_pmu_pebs_last_event(struct perf_event *event,
struct pt_regs *iregs,
struct pt_regs *regs,
struct perf_sample_data *data,
void *at,
int count,
setup_fn setup_sample)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
struct x86_perf_regs perf_regs;
struct pt_regs *regs = &perf_regs.regs;
void *at = get_next_pebs_record_by_bit(base, top, bit);
static struct pt_regs dummy_iregs;
if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
/*
* Now, auto-reload is only enabled in fixed period mode.
* The reload value is always hwc->sample_period.
* May need to change it, if auto-reload is enabled in
* freq mode later.
*/
intel_pmu_save_and_restart_reload(event, count);
} else if (!intel_pmu_save_and_restart(event))
return;
if (!iregs)
iregs = &dummy_iregs;
while (count > 1) {
setup_sample(event, iregs, at, data, regs);
perf_event_output(event, data, regs);
at += cpuc->pebs_record_size;
at = get_next_pebs_record_by_bit(at, top, bit);
count--;
}
setup_sample(event, iregs, at, data, regs);
if (iregs == &dummy_iregs) {
@ -2228,6 +2209,44 @@ __intel_pmu_pebs_event(struct perf_event *event,
if (perf_event_overflow(event, data, regs))
x86_pmu_stop(event, 0);
}
if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
/*
* Now, auto-reload is only enabled in fixed period mode.
* The reload value is always hwc->sample_period.
* May need to change it, if auto-reload is enabled in
* freq mode later.
*/
intel_pmu_save_and_restart_reload(event, count);
} else
intel_pmu_save_and_restart(event);
}
static __always_inline void
__intel_pmu_pebs_events(struct perf_event *event,
struct pt_regs *iregs,
struct perf_sample_data *data,
void *base, void *top,
int bit, int count,
setup_fn setup_sample)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct x86_perf_regs perf_regs;
struct pt_regs *regs = &perf_regs.regs;
void *at = get_next_pebs_record_by_bit(base, top, bit);
int cnt = count;
if (!iregs)
iregs = &dummy_iregs;
while (cnt > 1) {
__intel_pmu_pebs_event(event, iregs, regs, data, at, setup_sample);
at += cpuc->pebs_record_size;
at = get_next_pebs_record_by_bit(at, top, bit);
cnt--;
}
__intel_pmu_pebs_last_event(event, iregs, regs, data, at, count, setup_sample);
}
static void intel_pmu_drain_pebs_core(struct pt_regs *iregs, struct perf_sample_data *data)
@ -2264,8 +2283,8 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs, struct perf_sample_
return;
}
__intel_pmu_pebs_event(event, iregs, data, at, top, 0, n,
setup_pebs_fixed_sample_data);
__intel_pmu_pebs_events(event, iregs, data, at, top, 0, n,
setup_pebs_fixed_sample_data);
}
static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, int size)
@ -2396,9 +2415,9 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d
}
if (counts[bit]) {
__intel_pmu_pebs_event(event, iregs, data, base,
top, bit, counts[bit],
setup_pebs_fixed_sample_data);
__intel_pmu_pebs_events(event, iregs, data, base,
top, bit, counts[bit],
setup_pebs_fixed_sample_data);
}
}
}
@ -2406,8 +2425,12 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d
static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_data *data)
{
short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {};
void *last[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS];
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct debug_store *ds = cpuc->ds;
struct x86_perf_regs perf_regs;
struct pt_regs *regs = &perf_regs.regs;
struct pebs_basic *basic;
struct perf_event *event;
void *base, *at, *top;
int bit;
@ -2429,30 +2452,41 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d
return;
}
for (at = base; at < top; at += cpuc->pebs_record_size) {
if (!iregs)
iregs = &dummy_iregs;
/* Process all but the last event for each counter. */
for (at = base; at < top; at += basic->format_size) {
u64 pebs_status;
pebs_status = get_pebs_status(at) & cpuc->pebs_enabled;
pebs_status &= mask;
basic = at;
if (basic->format_size != cpuc->pebs_record_size)
continue;
for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX)
counts[bit]++;
pebs_status = basic->applicable_counters & cpuc->pebs_enabled & mask;
for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX) {
event = cpuc->events[bit];
if (WARN_ON_ONCE(!event) ||
WARN_ON_ONCE(!event->attr.precise_ip))
continue;
if (counts[bit]++) {
__intel_pmu_pebs_event(event, iregs, regs, data, last[bit],
setup_pebs_adaptive_sample_data);
}
last[bit] = at;
}
}
for_each_set_bit(bit, (unsigned long *)&mask, X86_PMC_IDX_MAX) {
if (counts[bit] == 0)
if (!counts[bit])
continue;
event = cpuc->events[bit];
if (WARN_ON_ONCE(!event))
continue;
if (WARN_ON_ONCE(!event->attr.precise_ip))
continue;
__intel_pmu_pebs_event(event, iregs, data, base,
top, bit, counts[bit],
setup_pebs_adaptive_sample_data);
__intel_pmu_pebs_last_event(event, iregs, regs, data, last[bit],
counts[bit], setup_pebs_adaptive_sample_data);
}
}


@ -18,6 +18,7 @@
#include <linux/slab.h>
#include <linux/device.h>
#include <asm/cpuid.h>
#include <asm/perf_event.h>
#include <asm/insn.h>
#include <asm/io.h>
@ -201,10 +202,10 @@ static int __init pt_pmu_hw_init(void)
* otherwise, zero for numerator stands for "not enumerated"
* as per SDM
*/
if (boot_cpu_data.cpuid_level >= CPUID_TSC_LEAF) {
if (boot_cpu_data.cpuid_level >= CPUID_LEAF_TSC) {
u32 eax, ebx, ecx, edx;
cpuid(CPUID_TSC_LEAF, &eax, &ebx, &ecx, &edx);
cpuid(CPUID_LEAF_TSC, &eax, &ebx, &ecx, &edx);
pt_pmu.tsc_art_num = ebx;
pt_pmu.tsc_art_den = eax;


@ -37,9 +37,6 @@ struct topa_entry {
u64 rsvd4 : 12;
};
/* TSC to Core Crystal Clock Ratio */
#define CPUID_TSC_LEAF 0x15
struct pt_pmu {
struct pmu pmu;
u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES];


@ -745,7 +745,7 @@ static int uncore_pmu_event_init(struct perf_event *event)
pmu = uncore_event_to_pmu(event);
/* no device found for this pmu */
if (pmu->func_id < 0)
if (!pmu->registered)
return -ENOENT;
/* Sampling not supported yet */
@ -992,7 +992,7 @@ static void uncore_types_exit(struct intel_uncore_type **types)
uncore_type_exit(*types);
}
static int __init uncore_type_init(struct intel_uncore_type *type, bool setid)
static int __init uncore_type_init(struct intel_uncore_type *type)
{
struct intel_uncore_pmu *pmus;
size_t size;
@ -1005,7 +1005,6 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid)
size = uncore_max_dies() * sizeof(struct intel_uncore_box *);
for (i = 0; i < type->num_boxes; i++) {
pmus[i].func_id = setid ? i : -1;
pmus[i].pmu_idx = i;
pmus[i].type = type;
pmus[i].boxes = kzalloc(size, GFP_KERNEL);
@ -1055,12 +1054,12 @@ err:
}
static int __init
uncore_types_init(struct intel_uncore_type **types, bool setid)
uncore_types_init(struct intel_uncore_type **types)
{
int ret;
for (; *types; types++) {
ret = uncore_type_init(*types, setid);
ret = uncore_type_init(*types);
if (ret)
return ret;
}
@ -1160,11 +1159,6 @@ static int uncore_pci_pmu_register(struct pci_dev *pdev,
if (!box)
return -ENOMEM;
if (pmu->func_id < 0)
pmu->func_id = pdev->devfn;
else
WARN_ON_ONCE(pmu->func_id != pdev->devfn);
atomic_inc(&box->refcnt);
box->dieid = die;
box->pci_dev = pdev;
@ -1410,7 +1404,7 @@ static int __init uncore_pci_init(void)
goto err;
}
ret = uncore_types_init(uncore_pci_uncores, false);
ret = uncore_types_init(uncore_pci_uncores);
if (ret)
goto errtype;
@ -1678,7 +1672,7 @@ static int __init uncore_cpu_init(void)
{
int ret;
ret = uncore_types_init(uncore_msr_uncores, true);
ret = uncore_types_init(uncore_msr_uncores);
if (ret)
goto err;
@ -1697,7 +1691,7 @@ static int __init uncore_mmio_init(void)
struct intel_uncore_type **types = uncore_mmio_uncores;
int ret;
ret = uncore_types_init(types, true);
ret = uncore_types_init(types);
if (ret)
goto err;


@ -125,7 +125,6 @@ struct intel_uncore_pmu {
struct pmu pmu;
char name[UNCORE_PMU_NAME_LEN];
int pmu_idx;
int func_id;
bool registered;
atomic_t activeboxes;
cpumask_t cpu_mask;


@ -910,7 +910,7 @@ static int snb_uncore_imc_event_init(struct perf_event *event)
pmu = uncore_event_to_pmu(event);
/* no device found for this pmu */
if (pmu->func_id < 0)
if (!pmu->registered)
return -ENOENT;
/* Sampling not supported yet */


@ -6684,17 +6684,8 @@ void spr_uncore_mmio_init(void)
/* GNR uncore support */
#define UNCORE_GNR_NUM_UNCORE_TYPES 23
#define UNCORE_GNR_TYPE_15 15
#define UNCORE_GNR_B2UPI 18
#define UNCORE_GNR_TYPE_21 21
#define UNCORE_GNR_TYPE_22 22
int gnr_uncore_units_ignore[] = {
UNCORE_SPR_UPI,
UNCORE_GNR_TYPE_15,
UNCORE_GNR_B2UPI,
UNCORE_GNR_TYPE_21,
UNCORE_GNR_TYPE_22,
UNCORE_IGNORE_END
};
@ -6703,6 +6694,31 @@ static struct intel_uncore_type gnr_uncore_ubox = {
.attr_update = uncore_alias_groups,
};
static struct intel_uncore_type gnr_uncore_pciex8 = {
SPR_UNCORE_PCI_COMMON_FORMAT(),
.name = "pciex8",
};
static struct intel_uncore_type gnr_uncore_pciex16 = {
SPR_UNCORE_PCI_COMMON_FORMAT(),
.name = "pciex16",
};
static struct intel_uncore_type gnr_uncore_upi = {
SPR_UNCORE_PCI_COMMON_FORMAT(),
.name = "upi",
};
static struct intel_uncore_type gnr_uncore_b2upi = {
SPR_UNCORE_PCI_COMMON_FORMAT(),
.name = "b2upi",
};
static struct intel_uncore_type gnr_uncore_b2hot = {
.name = "b2hot",
.attr_update = uncore_alias_groups,
};
static struct intel_uncore_type gnr_uncore_b2cmi = {
SPR_UNCORE_PCI_COMMON_FORMAT(),
.name = "b2cmi",
@ -6727,21 +6743,21 @@ static struct intel_uncore_type *gnr_uncores[UNCORE_GNR_NUM_UNCORE_TYPES] = {
&gnr_uncore_ubox,
&spr_uncore_imc,
NULL,
&gnr_uncore_upi,
NULL,
NULL,
NULL,
&spr_uncore_cxlcm,
&spr_uncore_cxldp,
NULL,
NULL,
NULL,
NULL,
NULL,
&gnr_uncore_b2hot,
&gnr_uncore_b2cmi,
&gnr_uncore_b2cxl,
NULL,
&gnr_uncore_b2upi,
NULL,
&gnr_uncore_mdf_sbo,
NULL,
NULL,
&gnr_uncore_pciex16,
&gnr_uncore_pciex8,
};
static struct freerunning_counters gnr_iio_freerunning[] = {


@ -624,6 +624,7 @@ union perf_capabilities {
u64 pebs_output_pt_available:1;
u64 pebs_timing_info:1;
u64 anythread_deprecated:1;
u64 rdpmc_metrics_clear:1;
};
u64 capabilities;
};


@ -39,6 +39,10 @@
* event: rapl_energy_psys
* perf code: 0x5
*
* core counter: consumption of a single physical core
* event: rapl_energy_core (power_core PMU)
* perf code: 0x1
*
* We manage those counters as free running (read-only). They may be
* used simultaneously by other tools, such as turbostat.
*
@ -70,18 +74,22 @@ MODULE_LICENSE("GPL");
/*
* RAPL energy status counters
*/
enum perf_rapl_events {
enum perf_rapl_pkg_events {
PERF_RAPL_PP0 = 0, /* all cores */
PERF_RAPL_PKG, /* entire package */
PERF_RAPL_RAM, /* DRAM */
PERF_RAPL_PP1, /* gpu */
PERF_RAPL_PSYS, /* psys */
PERF_RAPL_MAX,
NR_RAPL_DOMAINS = PERF_RAPL_MAX,
PERF_RAPL_PKG_EVENTS_MAX,
NR_RAPL_PKG_DOMAINS = PERF_RAPL_PKG_EVENTS_MAX,
};
static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
#define PERF_RAPL_CORE 0 /* single core */
#define PERF_RAPL_CORE_EVENTS_MAX 1
#define NR_RAPL_CORE_DOMAINS PERF_RAPL_CORE_EVENTS_MAX
static const char *const rapl_pkg_domain_names[NR_RAPL_PKG_DOMAINS] __initconst = {
"pp0-core",
"package",
"dram",
@ -89,6 +97,8 @@ static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
"psys",
};
static const char *const rapl_core_domain_name __initconst = "core";
/*
* event code: LSB 8 bits, passed in attr->config
* any other bit is reserved
@ -112,7 +122,7 @@ static struct perf_pmu_events_attr event_attr_##v = { \
* considered as either pkg-scope or die-scope, and we are considering
* them as die-scope.
*/
#define rapl_pmu_is_pkg_scope() \
#define rapl_pkg_pmu_is_pkg_scope() \
(boot_cpu_data.x86_vendor == X86_VENDOR_AMD || \
boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
@ -129,7 +139,8 @@ struct rapl_pmu {
struct rapl_pmus {
struct pmu pmu;
unsigned int nr_rapl_pmu;
struct rapl_pmu *pmus[] __counted_by(nr_rapl_pmu);
unsigned int cntr_mask;
struct rapl_pmu *rapl_pmu[] __counted_by(nr_rapl_pmu);
};
enum rapl_unit_quirk {
@ -139,44 +150,43 @@ enum rapl_unit_quirk {
};
struct rapl_model {
struct perf_msr *rapl_msrs;
unsigned long events;
struct perf_msr *rapl_pkg_msrs;
struct perf_msr *rapl_core_msrs;
unsigned long pkg_events;
unsigned long core_events;
unsigned int msr_power_unit;
enum rapl_unit_quirk unit_quirk;
};
/* 1/2^hw_unit Joule */
static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly;
static struct rapl_pmus *rapl_pmus;
static unsigned int rapl_cntr_mask;
static int rapl_pkg_hw_unit[NR_RAPL_PKG_DOMAINS] __read_mostly;
static int rapl_core_hw_unit __read_mostly;
static struct rapl_pmus *rapl_pmus_pkg;
static struct rapl_pmus *rapl_pmus_core;
static u64 rapl_timer_ms;
static struct perf_msr *rapl_msrs;
static struct rapl_model *rapl_model;
/*
* Helper functions to get the correct topology macros according to the
* Helper function to get the correct topology id according to the
* RAPL PMU scope.
*/
static inline unsigned int get_rapl_pmu_idx(int cpu)
static inline unsigned int get_rapl_pmu_idx(int cpu, int scope)
{
return rapl_pmu_is_pkg_scope() ? topology_logical_package_id(cpu) :
topology_logical_die_id(cpu);
}
static inline const struct cpumask *get_rapl_pmu_cpumask(int cpu)
{
return rapl_pmu_is_pkg_scope() ? topology_core_cpumask(cpu) :
topology_die_cpumask(cpu);
}
static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu)
{
unsigned int rapl_pmu_idx = get_rapl_pmu_idx(cpu);
/*
* The unsigned check also catches the '-1' return value for non
* existent mappings in the topology map.
* Returns unsigned int, which converts the '-1' return value
* (for non-existent mappings in topology map) to UINT_MAX, so
* the error check in the caller is simplified.
*/
return rapl_pmu_idx < rapl_pmus->nr_rapl_pmu ? rapl_pmus->pmus[rapl_pmu_idx] : NULL;
switch (scope) {
case PERF_PMU_SCOPE_PKG:
return topology_logical_package_id(cpu);
case PERF_PMU_SCOPE_DIE:
return topology_logical_die_id(cpu);
case PERF_PMU_SCOPE_CORE:
return topology_logical_core_id(cpu);
default:
return -EINVAL;
}
}
static inline u64 rapl_read_counter(struct perf_event *event)
@ -186,19 +196,20 @@ static inline u64 rapl_read_counter(struct perf_event *event)
return raw;
}
static inline u64 rapl_scale(u64 v, int cfg)
static inline u64 rapl_scale(u64 v, struct perf_event *event)
{
if (cfg > NR_RAPL_DOMAINS) {
pr_warn("Invalid domain %d, failed to scale data\n", cfg);
return v;
}
int hw_unit = rapl_pkg_hw_unit[event->hw.config - 1];
if (event->pmu->scope == PERF_PMU_SCOPE_CORE)
hw_unit = rapl_core_hw_unit;
/*
* scale delta to smallest unit (1/2^32)
* users must then scale back: count * 1/(1e9*2^32) to get Joules
* or use ldexp(count, -32).
* Watts = Joules/Time delta
*/
return v << (32 - rapl_hw_unit[cfg - 1]);
return v << (32 - hw_unit);
}
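/*
 * Worked example with a hypothetical hardware unit of 1/2^14 Joule
 * (hw_unit == 14): a raw delta of 1 is scaled to 1 << (32 - 14) = 262144.
 * Multiplying by the advertised 2.3283064365386962890625e-10 (2^-32) scale
 * gives back 2^-14 J, i.e. the original unit.
 */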
static u64 rapl_event_update(struct perf_event *event)
@ -225,7 +236,7 @@ static u64 rapl_event_update(struct perf_event *event)
delta = (new_raw_count << shift) - (prev_raw_count << shift);
delta >>= shift;
sdelta = rapl_scale(delta, event->hw.config);
sdelta = rapl_scale(delta, event);
local64_add(sdelta, &event->count);
@ -240,34 +251,34 @@ static void rapl_start_hrtimer(struct rapl_pmu *pmu)
static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
{
struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer);
struct rapl_pmu *rapl_pmu = container_of(hrtimer, struct rapl_pmu, hrtimer);
struct perf_event *event;
unsigned long flags;
if (!pmu->n_active)
if (!rapl_pmu->n_active)
return HRTIMER_NORESTART;
raw_spin_lock_irqsave(&pmu->lock, flags);
raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
list_for_each_entry(event, &pmu->active_list, active_entry)
list_for_each_entry(event, &rapl_pmu->active_list, active_entry)
rapl_event_update(event);
raw_spin_unlock_irqrestore(&pmu->lock, flags);
raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
hrtimer_forward_now(hrtimer, pmu->timer_interval);
hrtimer_forward_now(hrtimer, rapl_pmu->timer_interval);
return HRTIMER_RESTART;
}
static void rapl_hrtimer_init(struct rapl_pmu *pmu)
static void rapl_hrtimer_init(struct rapl_pmu *rapl_pmu)
{
struct hrtimer *hr = &pmu->hrtimer;
struct hrtimer *hr = &rapl_pmu->hrtimer;
hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
hr->function = rapl_hrtimer_handle;
}
static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
static void __rapl_pmu_event_start(struct rapl_pmu *rapl_pmu,
struct perf_event *event)
{
if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
@ -275,39 +286,39 @@ static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
event->hw.state = 0;
list_add_tail(&event->active_entry, &pmu->active_list);
list_add_tail(&event->active_entry, &rapl_pmu->active_list);
local64_set(&event->hw.prev_count, rapl_read_counter(event));
pmu->n_active++;
if (pmu->n_active == 1)
rapl_start_hrtimer(pmu);
rapl_pmu->n_active++;
if (rapl_pmu->n_active == 1)
rapl_start_hrtimer(rapl_pmu);
}
static void rapl_pmu_event_start(struct perf_event *event, int mode)
{
struct rapl_pmu *pmu = event->pmu_private;
struct rapl_pmu *rapl_pmu = event->pmu_private;
unsigned long flags;
raw_spin_lock_irqsave(&pmu->lock, flags);
__rapl_pmu_event_start(pmu, event);
raw_spin_unlock_irqrestore(&pmu->lock, flags);
raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
__rapl_pmu_event_start(rapl_pmu, event);
raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
}
static void rapl_pmu_event_stop(struct perf_event *event, int mode)
{
struct rapl_pmu *pmu = event->pmu_private;
struct rapl_pmu *rapl_pmu = event->pmu_private;
struct hw_perf_event *hwc = &event->hw;
unsigned long flags;
raw_spin_lock_irqsave(&pmu->lock, flags);
raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
/* mark event as deactivated and stopped */
if (!(hwc->state & PERF_HES_STOPPED)) {
WARN_ON_ONCE(pmu->n_active <= 0);
pmu->n_active--;
if (pmu->n_active == 0)
hrtimer_cancel(&pmu->hrtimer);
WARN_ON_ONCE(rapl_pmu->n_active <= 0);
rapl_pmu->n_active--;
if (rapl_pmu->n_active == 0)
hrtimer_cancel(&rapl_pmu->hrtimer);
list_del(&event->active_entry);
@ -325,23 +336,23 @@ static void rapl_pmu_event_stop(struct perf_event *event, int mode)
hwc->state |= PERF_HES_UPTODATE;
}
raw_spin_unlock_irqrestore(&pmu->lock, flags);
raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
}
static int rapl_pmu_event_add(struct perf_event *event, int mode)
{
struct rapl_pmu *pmu = event->pmu_private;
struct rapl_pmu *rapl_pmu = event->pmu_private;
struct hw_perf_event *hwc = &event->hw;
unsigned long flags;
raw_spin_lock_irqsave(&pmu->lock, flags);
raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
if (mode & PERF_EF_START)
__rapl_pmu_event_start(pmu, event);
__rapl_pmu_event_start(rapl_pmu, event);
raw_spin_unlock_irqrestore(&pmu->lock, flags);
raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
return 0;
}
@ -354,12 +365,14 @@ static void rapl_pmu_event_del(struct perf_event *event, int flags)
static int rapl_pmu_event_init(struct perf_event *event)
{
u64 cfg = event->attr.config & RAPL_EVENT_MASK;
int bit, ret = 0;
struct rapl_pmu *pmu;
int bit, rapl_pmus_scope, ret = 0;
struct rapl_pmu *rapl_pmu;
unsigned int rapl_pmu_idx;
struct rapl_pmus *rapl_pmus;
/* only look at RAPL events */
if (event->attr.type != rapl_pmus->pmu.type)
return -ENOENT;
/* unsupported modes and filters */
if (event->attr.sample_period) /* no sampling */
return -EINVAL;
/* check only supported bits are set */
if (event->attr.config & ~RAPL_EVENT_MASK)
@ -368,26 +381,49 @@ static int rapl_pmu_event_init(struct perf_event *event)
if (event->cpu < 0)
return -EINVAL;
if (!cfg || cfg >= NR_RAPL_DOMAINS + 1)
rapl_pmus = container_of(event->pmu, struct rapl_pmus, pmu);
if (!rapl_pmus)
return -EINVAL;
rapl_pmus_scope = rapl_pmus->pmu.scope;
cfg = array_index_nospec((long)cfg, NR_RAPL_DOMAINS + 1);
bit = cfg - 1;
if (rapl_pmus_scope == PERF_PMU_SCOPE_PKG || rapl_pmus_scope == PERF_PMU_SCOPE_DIE) {
/* only look at RAPL package events */
if (event->attr.type != rapl_pmus_pkg->pmu.type)
return -ENOENT;
cfg = array_index_nospec((long)cfg, NR_RAPL_PKG_DOMAINS + 1);
if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1)
return -EINVAL;
bit = cfg - 1;
event->hw.event_base = rapl_model->rapl_pkg_msrs[bit].msr;
} else if (rapl_pmus_scope == PERF_PMU_SCOPE_CORE) {
/* only look at RAPL core events */
if (event->attr.type != rapl_pmus_core->pmu.type)
return -ENOENT;
cfg = array_index_nospec((long)cfg, NR_RAPL_CORE_DOMAINS + 1);
if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1)
return -EINVAL;
bit = cfg - 1;
event->hw.event_base = rapl_model->rapl_core_msrs[bit].msr;
} else
return -EINVAL;
/* check event supported */
if (!(rapl_cntr_mask & (1 << bit)))
if (!(rapl_pmus->cntr_mask & (1 << bit)))
return -EINVAL;
/* unsupported modes and filters */
if (event->attr.sample_period) /* no sampling */
rapl_pmu_idx = get_rapl_pmu_idx(event->cpu, rapl_pmus_scope);
if (rapl_pmu_idx >= rapl_pmus->nr_rapl_pmu)
return -EINVAL;
/* must be done before validate_group */
pmu = cpu_to_rapl_pmu(event->cpu);
if (!pmu)
rapl_pmu = rapl_pmus->rapl_pmu[rapl_pmu_idx];
if (!rapl_pmu)
return -EINVAL;
event->pmu_private = pmu;
event->hw.event_base = rapl_msrs[bit].msr;
event->pmu_private = rapl_pmu;
event->hw.config = cfg;
event->hw.idx = bit;
@ -404,12 +440,14 @@ RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02");
RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03");
RAPL_EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04");
RAPL_EVENT_ATTR_STR(energy-psys, rapl_psys, "event=0x05");
RAPL_EVENT_ATTR_STR(energy-core, rapl_core, "event=0x01");
RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-psys.unit, rapl_psys_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-core.unit, rapl_core_unit, "Joules");
/*
* we compute in 0.23 nJ increments regardless of MSR
@ -419,6 +457,7 @@ RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890
RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-psys.scale, rapl_psys_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-core.scale, rapl_core_scale, "2.3283064365386962890625e-10");
/*
* There are no default events, but we need to create
@ -451,6 +490,12 @@ static const struct attribute_group *rapl_attr_groups[] = {
NULL,
};
static const struct attribute_group *rapl_core_attr_groups[] = {
&rapl_pmu_format_group,
&rapl_pmu_events_group,
NULL,
};
static struct attribute *rapl_events_cores[] = {
EVENT_PTR(rapl_cores),
EVENT_PTR(rapl_cores_unit),
@ -511,6 +556,18 @@ static struct attribute_group rapl_events_psys_group = {
.attrs = rapl_events_psys,
};
static struct attribute *rapl_events_core[] = {
EVENT_PTR(rapl_core),
EVENT_PTR(rapl_core_unit),
EVENT_PTR(rapl_core_scale),
NULL,
};
static struct attribute_group rapl_events_core_group = {
.name = "events",
.attrs = rapl_events_core,
};
static bool test_msr(int idx, void *data)
{
return test_bit(idx, (unsigned long *) data);
@ -536,11 +593,11 @@ static struct perf_msr intel_rapl_spr_msrs[] = {
};
/*
* Force to PERF_RAPL_MAX size due to:
* - perf_msr_probe(PERF_RAPL_MAX)
* Force to PERF_RAPL_PKG_EVENTS_MAX size due to:
* - perf_msr_probe(PERF_RAPL_PKG_EVENTS_MAX)
* - want to use same event codes across both architectures
*/
static struct perf_msr amd_rapl_msrs[] = {
static struct perf_msr amd_rapl_pkg_msrs[] = {
[PERF_RAPL_PP0] = { 0, &rapl_events_cores_group, NULL, false, 0 },
[PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK },
[PERF_RAPL_RAM] = { 0, &rapl_events_ram_group, NULL, false, 0 },
@ -548,18 +605,25 @@ static struct perf_msr amd_rapl_msrs[] = {
[PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, NULL, false, 0 },
};
static int rapl_check_hw_unit(struct rapl_model *rm)
static struct perf_msr amd_rapl_core_msrs[] = {
[PERF_RAPL_CORE] = { MSR_AMD_CORE_ENERGY_STATUS, &rapl_events_core_group,
test_msr, false, RAPL_MSR_MASK },
};
static int rapl_check_hw_unit(void)
{
u64 msr_rapl_power_unit_bits;
int i;
/* protect rdmsrl() to handle virtualization */
if (rdmsrl_safe(rm->msr_power_unit, &msr_rapl_power_unit_bits))
if (rdmsrl_safe(rapl_model->msr_power_unit, &msr_rapl_power_unit_bits))
return -1;
for (i = 0; i < NR_RAPL_DOMAINS; i++)
rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++)
rapl_pkg_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
switch (rm->unit_quirk) {
rapl_core_hw_unit = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
switch (rapl_model->unit_quirk) {
/*
* DRAM domain on HSW server and KNL has fixed energy unit which can be
* different than the unit from power unit MSR. See
@ -567,17 +631,16 @@ static int rapl_check_hw_unit(struct rapl_model *rm)
* of 2. Datasheet, September 2014, Reference Number: 330784-001 "
*/
case RAPL_UNIT_QUIRK_INTEL_HSW:
rapl_hw_unit[PERF_RAPL_RAM] = 16;
rapl_pkg_hw_unit[PERF_RAPL_RAM] = 16;
break;
/* SPR uses a fixed energy unit for Psys domain. */
case RAPL_UNIT_QUIRK_INTEL_SPR:
rapl_hw_unit[PERF_RAPL_PSYS] = 0;
rapl_pkg_hw_unit[PERF_RAPL_PSYS] = 0;
break;
default:
break;
}
/*
* Calculate the timer rate:
* Use reference of 200W for scaling the timeout to avoid counter
@ -586,9 +649,9 @@ static int rapl_check_hw_unit(struct rapl_model *rm)
* if hw unit is 32, then we use 2 ms 1/200/2
*/
rapl_timer_ms = 2;
if (rapl_hw_unit[0] < 32) {
if (rapl_pkg_hw_unit[0] < 32) {
rapl_timer_ms = (1000 / (2 * 100));
rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1));
rapl_timer_ms *= (1ULL << (32 - rapl_pkg_hw_unit[0] - 1));
}
return 0;
}
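As a quick worked sketch (illustrative numbers only, not part of the patch): the counter wraps after 2^(32 - unit) Joules, and the timeout above is set to half the wrap time at the 200 W reference.

static u64 example_rapl_timer_ms(int hw_unit)
{
	u64 ms = 2;	/* unit >= 32: wrap is ~5 ms at 200 W, keep the 2 ms default */

	if (hw_unit < 32) {
		ms = 1000 / (2 * 100);			/* 5 */
		ms *= 1ULL << (32 - hw_unit - 1);	/* half the wrap time, in ms */
	}
	return ms;	/* hw_unit = 16 -> 5 * 2^15 = 163840 ms (~164 s) */
}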
@ -596,24 +659,32 @@ static int rapl_check_hw_unit(struct rapl_model *rm)
static void __init rapl_advertise(void)
{
int i;
int num_counters = hweight32(rapl_pmus_pkg->cntr_mask);
if (rapl_pmus_core)
num_counters += hweight32(rapl_pmus_core->cntr_mask);
pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n",
hweight32(rapl_cntr_mask), rapl_timer_ms);
num_counters, rapl_timer_ms);
for (i = 0; i < NR_RAPL_DOMAINS; i++) {
if (rapl_cntr_mask & (1 << i)) {
for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) {
if (rapl_pmus_pkg->cntr_mask & (1 << i)) {
pr_info("hw unit of domain %s 2^-%d Joules\n",
rapl_domain_names[i], rapl_hw_unit[i]);
rapl_pkg_domain_names[i], rapl_pkg_hw_unit[i]);
}
}
if (rapl_pmus_core && (rapl_pmus_core->cntr_mask & (1 << PERF_RAPL_CORE)))
pr_info("hw unit of domain %s 2^-%d Joules\n",
rapl_core_domain_name, rapl_core_hw_unit);
}
static void cleanup_rapl_pmus(void)
static void cleanup_rapl_pmus(struct rapl_pmus *rapl_pmus)
{
int i;
for (i = 0; i < rapl_pmus->nr_rapl_pmu; i++)
kfree(rapl_pmus->pmus[i]);
kfree(rapl_pmus->rapl_pmu[i]);
kfree(rapl_pmus);
}
@ -626,46 +697,60 @@ static const struct attribute_group *rapl_attr_update[] = {
NULL,
};
static int __init init_rapl_pmu(void)
static const struct attribute_group *rapl_core_attr_update[] = {
&rapl_events_core_group,
NULL,
};
static int __init init_rapl_pmu(struct rapl_pmus *rapl_pmus)
{
struct rapl_pmu *pmu;
struct rapl_pmu *rapl_pmu;
int idx;
for (idx = 0; idx < rapl_pmus->nr_rapl_pmu; idx++) {
pmu = kzalloc(sizeof(*pmu), GFP_KERNEL);
if (!pmu)
rapl_pmu = kzalloc(sizeof(*rapl_pmu), GFP_KERNEL);
if (!rapl_pmu)
goto free;
raw_spin_lock_init(&pmu->lock);
INIT_LIST_HEAD(&pmu->active_list);
pmu->pmu = &rapl_pmus->pmu;
pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
rapl_hrtimer_init(pmu);
raw_spin_lock_init(&rapl_pmu->lock);
INIT_LIST_HEAD(&rapl_pmu->active_list);
rapl_pmu->pmu = &rapl_pmus->pmu;
rapl_pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
rapl_hrtimer_init(rapl_pmu);
rapl_pmus->pmus[idx] = pmu;
rapl_pmus->rapl_pmu[idx] = rapl_pmu;
}
return 0;
free:
for (; idx > 0; idx--)
kfree(rapl_pmus->pmus[idx - 1]);
kfree(rapl_pmus->rapl_pmu[idx - 1]);
return -ENOMEM;
}
static int __init init_rapl_pmus(void)
static int __init init_rapl_pmus(struct rapl_pmus **rapl_pmus_ptr, int rapl_pmu_scope,
const struct attribute_group **rapl_attr_groups,
const struct attribute_group **rapl_attr_update)
{
int nr_rapl_pmu = topology_max_packages();
int rapl_pmu_scope = PERF_PMU_SCOPE_PKG;
struct rapl_pmus *rapl_pmus;
if (!rapl_pmu_is_pkg_scope()) {
nr_rapl_pmu *= topology_max_dies_per_package();
rapl_pmu_scope = PERF_PMU_SCOPE_DIE;
}
/*
* rapl_pmu_scope must be either PKG, DIE or CORE
*/
if (rapl_pmu_scope == PERF_PMU_SCOPE_DIE)
nr_rapl_pmu *= topology_max_dies_per_package();
else if (rapl_pmu_scope == PERF_PMU_SCOPE_CORE)
nr_rapl_pmu *= topology_num_cores_per_package();
else if (rapl_pmu_scope != PERF_PMU_SCOPE_PKG)
return -EINVAL;
rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus, nr_rapl_pmu), GFP_KERNEL);
rapl_pmus = kzalloc(struct_size(rapl_pmus, rapl_pmu, nr_rapl_pmu), GFP_KERNEL);
if (!rapl_pmus)
return -ENOMEM;
*rapl_pmus_ptr = rapl_pmus;
rapl_pmus->nr_rapl_pmu = nr_rapl_pmu;
rapl_pmus->pmu.attr_groups = rapl_attr_groups;
rapl_pmus->pmu.attr_update = rapl_attr_update;
@ -680,75 +765,77 @@ static int __init init_rapl_pmus(void)
rapl_pmus->pmu.module = THIS_MODULE;
rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE;
return init_rapl_pmu();
return init_rapl_pmu(rapl_pmus);
}
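A quick worked example on a hypothetical topology (not taken from the patch): a 2-package system with 4 dies and 32 cores per package yields the following number of rapl_pmu instances per scope.

int pkgs = 2, dies_per_pkg = 4, cores_per_pkg = 32;
int n_pkg  = pkgs;			/* PERF_PMU_SCOPE_PKG:   2 */
int n_die  = pkgs * dies_per_pkg;	/* PERF_PMU_SCOPE_DIE:   8 */
int n_core = pkgs * cores_per_pkg;	/* PERF_PMU_SCOPE_CORE: 64 */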
static struct rapl_model model_snb = {
.events = BIT(PERF_RAPL_PP0) |
.pkg_events = BIT(PERF_RAPL_PP0) |
BIT(PERF_RAPL_PKG) |
BIT(PERF_RAPL_PP1),
.msr_power_unit = MSR_RAPL_POWER_UNIT,
.rapl_msrs = intel_rapl_msrs,
.rapl_pkg_msrs = intel_rapl_msrs,
};
static struct rapl_model model_snbep = {
.events = BIT(PERF_RAPL_PP0) |
.pkg_events = BIT(PERF_RAPL_PP0) |
BIT(PERF_RAPL_PKG) |
BIT(PERF_RAPL_RAM),
.msr_power_unit = MSR_RAPL_POWER_UNIT,
.rapl_msrs = intel_rapl_msrs,
.rapl_pkg_msrs = intel_rapl_msrs,
};
static struct rapl_model model_hsw = {
.events = BIT(PERF_RAPL_PP0) |
.pkg_events = BIT(PERF_RAPL_PP0) |
BIT(PERF_RAPL_PKG) |
BIT(PERF_RAPL_RAM) |
BIT(PERF_RAPL_PP1),
.msr_power_unit = MSR_RAPL_POWER_UNIT,
.rapl_msrs = intel_rapl_msrs,
.rapl_pkg_msrs = intel_rapl_msrs,
};
static struct rapl_model model_hsx = {
.events = BIT(PERF_RAPL_PP0) |
.pkg_events = BIT(PERF_RAPL_PP0) |
BIT(PERF_RAPL_PKG) |
BIT(PERF_RAPL_RAM),
.unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW,
.msr_power_unit = MSR_RAPL_POWER_UNIT,
.rapl_msrs = intel_rapl_msrs,
.rapl_pkg_msrs = intel_rapl_msrs,
};
static struct rapl_model model_knl = {
.events = BIT(PERF_RAPL_PKG) |
.pkg_events = BIT(PERF_RAPL_PKG) |
BIT(PERF_RAPL_RAM),
.unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW,
.msr_power_unit = MSR_RAPL_POWER_UNIT,
.rapl_msrs = intel_rapl_msrs,
.rapl_pkg_msrs = intel_rapl_msrs,
};
static struct rapl_model model_skl = {
.events = BIT(PERF_RAPL_PP0) |
.pkg_events = BIT(PERF_RAPL_PP0) |
BIT(PERF_RAPL_PKG) |
BIT(PERF_RAPL_RAM) |
BIT(PERF_RAPL_PP1) |
BIT(PERF_RAPL_PSYS),
.msr_power_unit = MSR_RAPL_POWER_UNIT,
.rapl_msrs = intel_rapl_msrs,
.rapl_pkg_msrs = intel_rapl_msrs,
};
static struct rapl_model model_spr = {
.events = BIT(PERF_RAPL_PP0) |
.pkg_events = BIT(PERF_RAPL_PP0) |
BIT(PERF_RAPL_PKG) |
BIT(PERF_RAPL_RAM) |
BIT(PERF_RAPL_PSYS),
.unit_quirk = RAPL_UNIT_QUIRK_INTEL_SPR,
.msr_power_unit = MSR_RAPL_POWER_UNIT,
.rapl_msrs = intel_rapl_spr_msrs,
.rapl_pkg_msrs = intel_rapl_spr_msrs,
};
static struct rapl_model model_amd_hygon = {
.events = BIT(PERF_RAPL_PKG),
.pkg_events = BIT(PERF_RAPL_PKG),
.core_events = BIT(PERF_RAPL_CORE),
.msr_power_unit = MSR_AMD_RAPL_POWER_UNIT,
.rapl_msrs = amd_rapl_msrs,
.rapl_pkg_msrs = amd_rapl_pkg_msrs,
.rapl_core_msrs = amd_rapl_core_msrs,
};
static const struct x86_cpu_id rapl_model_match[] __initconst = {
@ -804,45 +891,73 @@ MODULE_DEVICE_TABLE(x86cpu, rapl_model_match);
static int __init rapl_pmu_init(void)
{
const struct x86_cpu_id *id;
struct rapl_model *rm;
int rapl_pkg_pmu_scope = PERF_PMU_SCOPE_DIE;
int ret;
if (rapl_pkg_pmu_is_pkg_scope())
rapl_pkg_pmu_scope = PERF_PMU_SCOPE_PKG;
id = x86_match_cpu(rapl_model_match);
if (!id)
return -ENODEV;
rm = (struct rapl_model *) id->driver_data;
rapl_model = (struct rapl_model *) id->driver_data;
rapl_msrs = rm->rapl_msrs;
rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX,
false, (void *) &rm->events);
ret = rapl_check_hw_unit(rm);
ret = rapl_check_hw_unit();
if (ret)
return ret;
ret = init_rapl_pmus();
ret = init_rapl_pmus(&rapl_pmus_pkg, rapl_pkg_pmu_scope, rapl_attr_groups,
rapl_attr_update);
if (ret)
return ret;
ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1);
rapl_pmus_pkg->cntr_mask = perf_msr_probe(rapl_model->rapl_pkg_msrs,
PERF_RAPL_PKG_EVENTS_MAX, false,
(void *) &rapl_model->pkg_events);
ret = perf_pmu_register(&rapl_pmus_pkg->pmu, "power", -1);
if (ret)
goto out;
if (rapl_model->core_events) {
ret = init_rapl_pmus(&rapl_pmus_core, PERF_PMU_SCOPE_CORE,
rapl_core_attr_groups,
rapl_core_attr_update);
if (ret) {
pr_warn("power-core PMU initialization failed (%d)\n", ret);
goto core_init_failed;
}
rapl_pmus_core->cntr_mask = perf_msr_probe(rapl_model->rapl_core_msrs,
PERF_RAPL_CORE_EVENTS_MAX, false,
(void *) &rapl_model->core_events);
ret = perf_pmu_register(&rapl_pmus_core->pmu, "power_core", -1);
if (ret) {
pr_warn("power-core PMU registration failed (%d)\n", ret);
cleanup_rapl_pmus(rapl_pmus_core);
}
}
core_init_failed:
rapl_advertise();
return 0;
out:
pr_warn("Initialization failed (%d), disabled\n", ret);
cleanup_rapl_pmus();
cleanup_rapl_pmus(rapl_pmus_pkg);
return ret;
}
module_init(rapl_pmu_init);
static void __exit intel_rapl_exit(void)
{
perf_pmu_unregister(&rapl_pmus->pmu);
cleanup_rapl_pmus();
if (rapl_pmus_core) {
perf_pmu_unregister(&rapl_pmus_core->pmu);
cleanup_rapl_pmus(rapl_pmus_core);
}
perf_pmu_unregister(&rapl_pmus_pkg->pmu);
cleanup_rapl_pmus(rapl_pmus_pkg);
}
module_exit(intel_rapl_exit);
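For reference, a minimal user-space sketch of consuming the new PMU, assuming only the standard perf sysfs event_source ABI (nothing below comes from the patch itself): resolve the dynamic type id of the "power_core" event source, then open its energy-core event (event=0x01, scale 2^-32 J) on one CPU and read the raw count. Opening the event typically requires root or CAP_PERFMON.

#include <linux/perf_event.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int read_pmu_type(const char *pmu)
{
	char path[128];
	int type = -1;
	FILE *f;

	/* e.g. /sys/bus/event_source/devices/power_core/type */
	snprintf(path, sizeof(path), "/sys/bus/event_source/devices/%s/type", pmu);
	f = fopen(path, "r");
	if (!f)
		return -1;
	if (fscanf(f, "%d", &type) != 1)
		type = -1;
	fclose(f);
	return type;
}

int main(void)
{
	struct perf_event_attr attr;
	uint64_t count;
	int type, fd;

	type = read_pmu_type("power_core");
	if (type < 0)
		return 1;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = type;
	attr.config = 0x01;	/* energy-core, as advertised by the events group */

	/* RAPL events count per CPU, not per task: pid = -1, cpu = 0. */
	fd = syscall(SYS_perf_event_open, &attr, -1, 0, -1, 0);
	if (fd < 0)
		return 1;

	sleep(1);
	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("energy-core: %llu units of 2^-32 J\n", (unsigned long long)count);
	close(fd);
	return 0;
}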

View File

@ -664,7 +664,7 @@ void __init hv_vtom_init(void)
x86_platform.guest.enc_status_change_finish = hv_vtom_set_host_visibility;
/* Set WB as the default cache mode. */
mtrr_overwrite_state(NULL, 0, MTRR_TYPE_WRBACK);
guest_force_mtrr_state(NULL, 0, MTRR_TYPE_WRBACK);
}
#endif /* defined(CONFIG_AMD_MEM_ENCRYPT) || defined(CONFIG_INTEL_TDX_GUEST) */

View File

@ -4,6 +4,7 @@
#include <linux/types.h>
#include <linux/stringify.h>
#include <linux/objtool.h>
#include <asm/asm.h>
#define ALT_FLAGS_SHIFT 16
@ -54,16 +55,6 @@
#define LOCK_PREFIX ""
#endif
/*
* objtool annotation to ignore the alternatives and only consider the original
* instruction(s).
*/
#define ANNOTATE_IGNORE_ALTERNATIVE \
"999:\n\t" \
".pushsection .discard.ignore_alts\n\t" \
".long 999b\n\t" \
".popsection\n\t"
/*
* The patching flags are part of the upper bits of the @ft_flags parameter when
* specifying them. The split is currently like this:
@ -310,17 +301,6 @@ void nop_func(void);
.endm
#endif
/*
* objtool annotation to ignore the alternatives and only consider the original
* instruction(s).
*/
.macro ANNOTATE_IGNORE_ALTERNATIVE
.Lannotate_\@:
.pushsection .discard.ignore_alts
.long .Lannotate_\@
.popsection
.endm
/*
* Issue one struct alt_instr descriptor entry (need to put it into
* the section .altinstructions, see below). This entry contains

View File

@ -4,7 +4,7 @@
#include <linux/ioport.h>
#include <linux/pci.h>
#include <linux/refcount.h>
#include <asm/amd_node.h>
struct amd_nb_bus_dev_range {
u8 bus;
@ -21,49 +21,16 @@ extern int amd_numa_init(void);
extern int amd_get_subcaches(int);
extern int amd_set_subcaches(int, unsigned long);
int __must_check amd_smn_read(u16 node, u32 address, u32 *value);
int __must_check amd_smn_write(u16 node, u32 address, u32 value);
struct amd_l3_cache {
unsigned indices;
u8 subcaches[4];
};
struct threshold_block {
unsigned int block; /* Number within bank */
unsigned int bank; /* MCA bank the block belongs to */
unsigned int cpu; /* CPU which controls MCA bank */
u32 address; /* MSR address for the block */
u16 interrupt_enable; /* Enable/Disable APIC interrupt */
bool interrupt_capable; /* Bank can generate an interrupt. */
u16 threshold_limit; /*
* Value upon which threshold
* interrupt is generated.
*/
struct kobject kobj; /* sysfs object */
struct list_head miscj; /*
* List of threshold blocks
* within a bank.
*/
};
struct threshold_bank {
struct kobject *kobj;
struct threshold_block *blocks;
/* initialized to the number of CPUs on the node sharing this bank */
refcount_t cpus;
unsigned int shared;
};
struct amd_northbridge {
struct pci_dev *root;
struct pci_dev *misc;
struct pci_dev *link;
struct amd_l3_cache l3_cache;
struct threshold_bank *bank4;
};
struct amd_northbridge_info {
@ -82,23 +49,6 @@ u16 amd_nb_num(void);
bool amd_nb_has_feature(unsigned int feature);
struct amd_northbridge *node_to_amd_nb(int node);
static inline u16 amd_pci_dev_to_node_id(struct pci_dev *pdev)
{
struct pci_dev *misc;
int i;
for (i = 0; i != amd_nb_num(); i++) {
misc = node_to_amd_nb(i)->misc;
if (pci_domain_nr(misc->bus) == pci_domain_nr(pdev->bus) &&
PCI_SLOT(misc->devfn) == PCI_SLOT(pdev->devfn))
return i;
}
WARN(1, "Unable to find AMD Northbridge id for %s\n", pci_name(pdev));
return 0;
}
static inline bool amd_gart_present(void)
{
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)

View File

@ -0,0 +1,36 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* AMD Node helper functions and common defines
*
* Copyright (c) 2024, Advanced Micro Devices, Inc.
* All Rights Reserved.
*
* Author: Yazen Ghannam <Yazen.Ghannam@amd.com>
*
* Note:
* Items in this file may only be used in a single place.
* However, it's prudent to keep all AMD Node functionality
* in a unified place rather than spreading it throughout the

* kernel.
*/
#ifndef _ASM_X86_AMD_NODE_H_
#define _ASM_X86_AMD_NODE_H_
#include <linux/pci.h>
#define MAX_AMD_NUM_NODES 8
#define AMD_NODE0_PCI_SLOT 0x18
struct pci_dev *amd_node_get_func(u16 node, u8 func);
struct pci_dev *amd_node_get_root(u16 node);
static inline u16 amd_num_nodes(void)
{
return topology_amd_nodes_per_pkg() * topology_max_packages();
}
int __must_check amd_smn_read(u16 node, u32 address, u32 *value);
int __must_check amd_smn_write(u16 node, u32 address, u32 value);
#endif /*_ASM_X86_AMD_NODE_H_*/

View File

@ -92,7 +92,7 @@ do { \
do { \
__auto_type __flags = BUGFLAG_WARNING|(flags); \
instrumentation_begin(); \
_BUG_FLAGS(ASM_UD2, __flags, ASM_REACHABLE); \
_BUG_FLAGS(ASM_UD2, __flags, ANNOTATE_REACHABLE(1b)); \
instrumentation_end(); \
} while (0)

View File

@ -56,7 +56,6 @@
/* x86_cpu_id::flags */
#define X86_CPU_ID_FLAG_ENTRY_VALID BIT(0)
#define X86_STEPPINGS(mins, maxs) GENMASK(maxs, mins)
/**
* X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE - Base macro for CPU matching
* @_vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY
@ -208,6 +207,7 @@
VFM_MODEL(vfm), \
X86_STEPPING_ANY, X86_FEATURE_ANY, data)
#define __X86_STEPPINGS(mins, maxs) GENMASK(maxs, mins)
/**
* X86_MATCH_VFM_STEPPINGS - Match encoded vendor/family/model/stepping
* @vfm: Encoded 8-bits each for vendor, family, model
@ -218,12 +218,13 @@
*
* feature is set to wildcard
*/
#define X86_MATCH_VFM_STEPPINGS(vfm, steppings, data) \
X86_MATCH_VENDORID_FAM_MODEL_STEPPINGS_FEATURE( \
VFM_VENDOR(vfm), \
VFM_FAMILY(vfm), \
VFM_MODEL(vfm), \
steppings, X86_FEATURE_ANY, data)
#define X86_MATCH_VFM_STEPS(vfm, min_step, max_step, data) \
X86_MATCH_VENDORID_FAM_MODEL_STEPPINGS_FEATURE( \
VFM_VENDOR(vfm), \
VFM_FAMILY(vfm), \
VFM_MODEL(vfm), \
__X86_STEPPINGS(min_step, max_step), \
X86_FEATURE_ANY, data)
/**
* X86_MATCH_VFM_FEATURE - Match encoded vendor/family/model/feature
@ -242,41 +243,7 @@
VFM_MODEL(vfm), \
X86_STEPPING_ANY, feature, data)
/*
* Match specific microcode revisions.
*
* vendor/family/model/stepping must be all set.
*
* Only checks against the boot CPU. When mixed-stepping configs are
* valid for a CPU model, add a quirk for every valid stepping and
* do the fine-tuning in the quirk handler.
*/
struct x86_cpu_desc {
u8 x86_family;
u8 x86_vendor;
u8 x86_model;
u8 x86_stepping;
u32 x86_microcode_rev;
};
#define INTEL_CPU_DESC(vfm, stepping, revision) { \
.x86_family = VFM_FAMILY(vfm), \
.x86_vendor = VFM_VENDOR(vfm), \
.x86_model = VFM_MODEL(vfm), \
.x86_stepping = (stepping), \
.x86_microcode_rev = (revision), \
}
#define AMD_CPU_DESC(fam, model, stepping, revision) { \
.x86_family = (fam), \
.x86_vendor = X86_VENDOR_AMD, \
.x86_model = (model), \
.x86_stepping = (stepping), \
.x86_microcode_rev = (revision), \
}
extern const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match);
extern bool x86_cpu_has_min_microcode_rev(const struct x86_cpu_desc *table);
extern bool x86_match_min_microcode_rev(const struct x86_cpu_id *table);
#endif /* _ASM_X86_CPU_DEVICE_ID */
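A minimal sketch of a minimum-microcode table with the new interface, assuming x86_match_min_microcode_rev() keeps the semantics of the helper it replaces (true when the boot CPU matches an entry and its microcode revision is at least the value in driver_data). The VFM constant, steppings and revision below are placeholders, not values taken from this patch.

#include <linux/init.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>

static const struct x86_cpu_id good_ucode[] __initconst = {
	/* placeholder: steppings 0x2-0x5 of SKYLAKE_X need microcode >= 0x700 */
	X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 0x2, 0x5, 0x700),
	{}
};

static bool __init boot_cpu_ucode_ok(void)
{
	return x86_match_min_microcode_rev(good_ucode);
}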

View File

@ -132,11 +132,12 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
x86_this_cpu_test_bit(bit, cpu_info.x86_capability))
/*
* This macro is for detection of features which need kernel
* infrastructure to be used. It may *not* directly test the CPU
* itself. Use the cpu_has() family if you want true runtime
* testing of CPU features, like in hypervisor code where you are
* supporting a possible guest feature where host support for it
* This is the default CPU features testing macro to use in code.
*
* It is for detection of features which need kernel infrastructure to be
* used. It may *not* directly test the CPU itself. Use the cpu_has() family
* if you want true runtime testing of CPU features, like in hypervisor code
* where you are supporting a possible guest feature where host support for it
* is not relevant.
*/
#define cpu_feature_enabled(bit) \
@ -161,13 +162,6 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
#define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit)
/*
* Static testing of CPU features. Used the same as boot_cpu_has(). It
* statically patches the target code for additional performance. Use
* static_cpu_has() only in fast paths, where every cycle counts. Which
* means that the boot_cpu_has() variant is already fast enough for the
* majority of cases and you should stick to using it as it is generally
* only two instructions: a RIP-relative MOV and a TEST.
*
* Do not use an "m" constraint for [cap_byte] here: gcc doesn't know
* that this is only used on a fallback path and will sometimes cause
* it to manifest the address of boot_cpu_data in a register, fouling

View File

@ -83,8 +83,8 @@
#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* "centaur_mcr" Centaur MCRs (= MTRRs) */
#define X86_FEATURE_K8 ( 3*32+ 4) /* Opteron, Athlon64 */
#define X86_FEATURE_ZEN5 ( 3*32+ 5) /* CPU based on Zen5 microarchitecture */
#define X86_FEATURE_P3 ( 3*32+ 6) /* P3 */
#define X86_FEATURE_P4 ( 3*32+ 7) /* P4 */
/* Free ( 3*32+ 6) */
/* Free ( 3*32+ 7) */
#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* "constant_tsc" TSC ticks at a constant rate */
#define X86_FEATURE_UP ( 3*32+ 9) /* "up" SMP kernel running on UP */
#define X86_FEATURE_ART ( 3*32+10) /* "art" Always running timer (ART) */
@ -443,14 +443,16 @@
#define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* Speculative Store Bypass Disable */
/* AMD-defined memory encryption features, CPUID level 0x8000001f (EAX), word 19 */
#define X86_FEATURE_SME (19*32+ 0) /* "sme" AMD Secure Memory Encryption */
#define X86_FEATURE_SEV (19*32+ 1) /* "sev" AMD Secure Encrypted Virtualization */
#define X86_FEATURE_SME (19*32+ 0) /* "sme" Secure Memory Encryption */
#define X86_FEATURE_SEV (19*32+ 1) /* "sev" Secure Encrypted Virtualization */
#define X86_FEATURE_VM_PAGE_FLUSH (19*32+ 2) /* VM Page Flush MSR is supported */
#define X86_FEATURE_SEV_ES (19*32+ 3) /* "sev_es" AMD Secure Encrypted Virtualization - Encrypted State */
#define X86_FEATURE_SEV_SNP (19*32+ 4) /* "sev_snp" AMD Secure Encrypted Virtualization - Secure Nested Paging */
#define X86_FEATURE_SEV_ES (19*32+ 3) /* "sev_es" Secure Encrypted Virtualization - Encrypted State */
#define X86_FEATURE_SEV_SNP (19*32+ 4) /* "sev_snp" Secure Encrypted Virtualization - Secure Nested Paging */
#define X86_FEATURE_V_TSC_AUX (19*32+ 9) /* Virtual TSC_AUX */
#define X86_FEATURE_SME_COHERENT (19*32+10) /* AMD hardware-enforced cache coherency */
#define X86_FEATURE_DEBUG_SWAP (19*32+14) /* "debug_swap" AMD SEV-ES full debug state swap support */
#define X86_FEATURE_SME_COHERENT (19*32+10) /* hardware-enforced cache coherency */
#define X86_FEATURE_DEBUG_SWAP (19*32+14) /* "debug_swap" SEV-ES full debug state swap support */
#define X86_FEATURE_RMPREAD (19*32+21) /* RMPREAD instruction */
#define X86_FEATURE_SEGMENTED_RMP (19*32+23) /* Segmented RMP support */
#define X86_FEATURE_SVSM (19*32+28) /* "svsm" SVSM present */
#define X86_FEATURE_HV_INUSE_WR_ALLOWED (19*32+30) /* Allow Write to in-use hypervisor-owned pages */
@ -465,6 +467,7 @@
#define X86_FEATURE_SBPB (20*32+27) /* Selective Branch Prediction Barrier */
#define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* MSR_PRED_CMD[IBPB] flushes all branch type predictions */
#define X86_FEATURE_SRSO_NO (20*32+29) /* CPU is not affected by SRSO */
#define X86_FEATURE_SRSO_USER_KERNEL_NO (20*32+30) /* CPU is not affected by SRSO across user/kernel boundaries */
/*
* Extended auxiliary flags: Linux defined - for features scattered in various

View File

@ -21,6 +21,13 @@ enum cpuid_regs_idx {
CPUID_EDX,
};
#define CPUID_LEAF_MWAIT 0x5
#define CPUID_LEAF_DCA 0x9
#define CPUID_LEAF_XSTATE 0x0d
#define CPUID_LEAF_TSC 0x15
#define CPUID_LEAF_FREQ 0x16
#define CPUID_LEAF_TILE 0x1d
#ifdef CONFIG_X86_32
bool have_cpuid_p(void);
#else

View File

@ -12,10 +12,6 @@
/* Bit 63 of XCR0 is reserved for future expansion */
#define XFEATURE_MASK_EXTEND (~(XFEATURE_MASK_FPSSE | (1ULL << 63)))
#define XSTATE_CPUID 0x0000000d
#define TILE_CPUID 0x0000001d
#define FXSAVE_SIZE 512
#define XSAVE_HDR_SIZE 64

View File

@ -2,7 +2,7 @@
#ifndef _ASM_X86_INIT_H
#define _ASM_X86_INIT_H
#define __head __section(".head.text")
#define __head __section(".head.text") __no_sanitize_undefined
struct x86_mapping_info {
void *(*alloc_pgt_page)(void *); /* allocate buf for page table */

View File

@ -100,8 +100,8 @@
}
#define ASM_CALL_ARG0 \
"call %c[__func] \n" \
ASM_REACHABLE
"1: call %c[__func] \n" \
ANNOTATE_REACHABLE(1b)
#define ASM_CALL_ARG1 \
"movq %[arg1], %%rdi \n" \

View File

@ -8,14 +8,9 @@
# define PA_PGD 2
# define PA_SWAP_PAGE 3
# define PAGES_NR 4
#else
# define PA_CONTROL_PAGE 0
# define VA_CONTROL_PAGE 1
# define PA_TABLE_PAGE 2
# define PA_SWAP_PAGE 3
# define PAGES_NR 4
#endif
# define KEXEC_CONTROL_PAGE_SIZE 4096
# define KEXEC_CONTROL_CODE_MAX_SIZE 2048
#ifndef __ASSEMBLY__
@ -43,7 +38,6 @@ struct kimage;
/* Maximum address we can use for the control code buffer */
# define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE
# define KEXEC_CONTROL_PAGE_SIZE 4096
/* The native architecture */
# define KEXEC_ARCH KEXEC_ARCH_386
@ -58,11 +52,12 @@ struct kimage;
/* Maximum address we can use for the control pages */
# define KEXEC_CONTROL_MEMORY_LIMIT (MAXMEM-1)
/* Allocate one page for the pdp and the second for the code */
# define KEXEC_CONTROL_PAGE_SIZE (4096UL + 4096UL)
/* The native architecture */
# define KEXEC_ARCH KEXEC_ARCH_X86_64
extern unsigned long kexec_va_control_page;
extern unsigned long kexec_pa_table_page;
extern unsigned long kexec_pa_swap_page;
#endif
/*
@ -125,7 +120,7 @@ relocate_kernel(unsigned long indirection_page,
#else
unsigned long
relocate_kernel(unsigned long indirection_page,
unsigned long page_list,
unsigned long pa_control_page,
unsigned long start_address,
unsigned int preserve_context,
unsigned int host_mem_enc_active);
@ -145,6 +140,19 @@ struct kimage_arch {
};
#else
struct kimage_arch {
/*
* This is a kimage control page, as it must not overlap with either
* source or destination address ranges.
*/
pgd_t *pgd;
/*
* The virtual mapping of the control code page itself is used only
* during the transition, while the current kernel's pages are all
* in place. Thus the intermediate page table pages used to map it
* are not control pages, but instead just normal pages obtained
* with get_zeroed_page(). They have to be tracked (below) so that
* they can be freed.
*/
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;

View File

@ -276,7 +276,7 @@ static inline void cmci_rediscover(void) {}
static inline void cmci_recheck(void) {}
#endif
int mce_available(struct cpuinfo_x86 *c);
bool mce_available(struct cpuinfo_x86 *c);
bool mce_is_memory_error(struct mce *m);
bool mce_is_correctable(struct mce *m);
bool mce_usable_address(struct mce *m);
@ -296,7 +296,7 @@ enum mcp_flags {
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
int mce_notify_irq(void);
bool mce_notify_irq(void);
DECLARE_PER_CPU(struct mce, injectm);
@ -386,8 +386,6 @@ static inline bool amd_mce_is_memory_error(struct mce *m) { return false; };
static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
#endif
static inline void mce_hygon_feature_init(struct cpuinfo_x86 *c) { return mce_amd_feature_init(c); }
unsigned long copy_mc_fragile_handle_tail(char *to, char *from, unsigned len);
#endif /* _ASM_X86_MCE_H */

View File

@ -37,6 +37,8 @@ typedef struct {
*/
atomic64_t tlb_gen;
unsigned long next_trim_cpumask;
#ifdef CONFIG_MODIFY_LDT_SYSCALL
struct rw_semaphore ldt_usr_sem;
struct ldt_struct *ldt;

View File

@ -151,6 +151,7 @@ static inline int init_new_context(struct task_struct *tsk,
mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id);
atomic64_set(&mm->context.tlb_gen, 0);
mm->context.next_trim_cpumask = jiffies + HZ;
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {

View File

@ -608,6 +608,7 @@
#define MSR_AMD_PERF_CTL 0xc0010062
#define MSR_AMD_PERF_STATUS 0xc0010063
#define MSR_AMD_PSTATE_DEF_BASE 0xc0010064
#define MSR_AMD64_GUEST_TSC_FREQ 0xc0010134
#define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140
#define MSR_AMD64_OSVW_STATUS 0xc0010141
#define MSR_AMD_PPIN_CTL 0xc00102f0
@ -644,6 +645,7 @@
#define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */
#define MSR_AMD64_SVM_AVIC_DOORBELL 0xc001011b
#define MSR_AMD64_VM_PAGE_FLUSH 0xc001011e
#define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f
#define MSR_AMD64_SEV_ES_GHCB 0xc0010130
#define MSR_AMD64_SEV 0xc0010131
#define MSR_AMD64_SEV_ENABLED_BIT 0
@ -682,11 +684,12 @@
#define MSR_AMD64_SNP_SMT_PROT BIT_ULL(MSR_AMD64_SNP_SMT_PROT_BIT)
#define MSR_AMD64_SNP_RESV_BIT 18
#define MSR_AMD64_SNP_RESERVED_MASK GENMASK_ULL(63, MSR_AMD64_SNP_RESV_BIT)
#define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f
#define MSR_AMD64_RMP_BASE 0xc0010132
#define MSR_AMD64_RMP_END 0xc0010133
#define MSR_AMD64_RMP_CFG 0xc0010136
#define MSR_AMD64_SEG_RMP_ENABLED_BIT 0
#define MSR_AMD64_SEG_RMP_ENABLED BIT_ULL(MSR_AMD64_SEG_RMP_ENABLED_BIT)
#define MSR_AMD64_RMP_SEGMENT_SHIFT(x) (((x) & GENMASK_ULL(13, 8)) >> 8)
#define MSR_SVSM_CAA 0xc001f000

View File

@ -58,8 +58,8 @@ struct mtrr_state_type {
*/
# ifdef CONFIG_MTRR
void mtrr_bp_init(void);
void mtrr_overwrite_state(struct mtrr_var_range *var, unsigned int num_var,
mtrr_type def_type);
void guest_force_mtrr_state(struct mtrr_var_range *var, unsigned int num_var,
mtrr_type def_type);
extern u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform);
extern void mtrr_save_fixed_ranges(void *);
extern void mtrr_save_state(void);
@ -75,9 +75,9 @@ void mtrr_disable(void);
void mtrr_enable(void);
void mtrr_generic_set_state(void);
# else
static inline void mtrr_overwrite_state(struct mtrr_var_range *var,
unsigned int num_var,
mtrr_type def_type)
static inline void guest_force_mtrr_state(struct mtrr_var_range *var,
unsigned int num_var,
mtrr_type def_type)
{
}

View File

@ -15,7 +15,6 @@
#define MWAIT_HINT2SUBSTATE(hint) ((hint) & MWAIT_CSTATE_MASK)
#define MWAIT_C1_SUBSTATE_MASK 0xf0
#define CPUID_MWAIT_LEAF 5
#define CPUID5_ECX_EXTENSIONS_SUPPORTED 0x1
#define CPUID5_ECX_INTERRUPT_BREAK 0x2

View File

@ -179,18 +179,6 @@
#ifdef __ASSEMBLY__
/*
* This should be used immediately before an indirect jump/call. It tells
* objtool the subsequent indirect jump/call is vouched safe for retpoline
* builds.
*/
.macro ANNOTATE_RETPOLINE_SAFE
.Lhere_\@:
.pushsection .discard.retpoline_safe
.long .Lhere_\@
.popsection
.endm
/*
* (ab)use RETPOLINE_SAFE on RET to annotate away 'bare' RET instructions
* vs RETBleed validation.
@ -350,12 +338,6 @@
#else /* __ASSEMBLY__ */
#define ANNOTATE_RETPOLINE_SAFE \
"999:\n\t" \
".pushsection .discard.retpoline_safe\n\t" \
".long 999b\n\t" \
".popsection\n\t"
typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE];
extern retpoline_thunk_t __x86_indirect_thunk_array[];
extern retpoline_thunk_t __x86_indirect_call_thunk_array[];

View File

@ -180,13 +180,6 @@ static inline void halt(void)
PVOP_VCALL0(irq.halt);
}
extern noinstr void pv_native_wbinvd(void);
static __always_inline void wbinvd(void)
{
PVOP_ALT_VCALL0(cpu.wbinvd, "wbinvd", ALT_NOT_XEN);
}
static inline u64 paravirt_read_msr(unsigned msr)
{
return PVOP_CALL1(u64, cpu.read_msr, msr);

View File

@ -86,8 +86,6 @@ struct pv_cpu_ops {
void (*update_io_bitmap)(void);
#endif
void (*wbinvd)(void);
/* cpuid emulation, mostly so that caps bits can be disabled */
void (*cpuid)(unsigned int *eax, unsigned int *ebx,
unsigned int *ecx, unsigned int *edx);

View File

@ -41,6 +41,7 @@
#define INTEL_FIXED_0_USER (1ULL << 1)
#define INTEL_FIXED_0_ANYTHREAD (1ULL << 2)
#define INTEL_FIXED_0_ENABLE_PMI (1ULL << 3)
#define INTEL_FIXED_3_METRICS_CLEAR (1ULL << 2)
#define HSW_IN_TX (1ULL << 32)
#define HSW_IN_TX_CHECKPOINTED (1ULL << 33)
@ -372,6 +373,9 @@ static inline bool use_fixed_pseudo_encoding(u64 code)
#define INTEL_TD_METRIC_MAX INTEL_TD_METRIC_MEM_BOUND
#define INTEL_TD_METRIC_NUM 8
#define INTEL_TD_CFG_METRIC_CLEAR_BIT 0
#define INTEL_TD_CFG_METRIC_CLEAR BIT_ULL(INTEL_TD_CFG_METRIC_CLEAR_BIT)
static inline bool is_metric_idx(int idx)
{
return (unsigned)(idx - INTEL_PMC_IDX_METRIC_BASE) < INTEL_TD_METRIC_NUM;
@ -422,7 +426,9 @@ static inline bool is_topdown_idx(int idx)
*/
struct pebs_basic {
u64 format_size;
u64 format_group:32,
retire_latency:16,
format_size:16;
u64 ip;
u64 applicable_counters;
u64 tsc;
@ -431,7 +437,17 @@ struct pebs_basic {
struct pebs_meminfo {
u64 address;
u64 aux;
u64 latency;
union {
/* pre Alder Lake */
u64 mem_latency;
/* Alder Lake and later */
struct {
u64 instr_latency:16;
u64 pad2:16;
u64 cache_latency:16;
u64 pad3:16;
};
};
u64 tsx_tuning;
};
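A minimal sketch of consuming the reworked latency word, assuming the caller already knows whether the record comes from an Alder Lake or later part (the is_adl flag below is a placeholder for that model check):

/* Decode pebs_meminfo latency: one value pre-ADL, two 16-bit fields after. */
static void decode_meminfo_latency(u64 raw, bool is_adl,
				   u64 *mem_lat, u16 *instr_lat, u16 *cache_lat)
{
	if (!is_adl) {
		*mem_lat = raw;			/* pre Alder Lake */
		return;
	}
	*instr_lat = raw & 0xffff;		/* bits 15:0 */
	*cache_lat = (raw >> 32) & 0xffff;	/* bits 47:32 */
}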

View File

@ -98,6 +98,7 @@ struct cpuinfo_topology {
// Logical ID mappings
u32 logical_pkg_id;
u32 logical_die_id;
u32 logical_core_id;
// AMD Node ID and Nodes per Package info
u32 amd_node_id;

View File

@ -5,6 +5,7 @@
#include <asm-generic/sections.h>
#include <asm/extable.h>
extern char __relocate_kernel_start[], __relocate_kernel_end[];
extern char __brk_base[], __brk_limit[];
extern char __end_rodata_aligned[];

View File

@ -49,7 +49,7 @@ extern unsigned long saved_video_mode;
extern void reserve_standard_io_resources(void);
extern void i386_reserve_resources(void);
extern unsigned long __startup_64(unsigned long physaddr, struct boot_params *bp);
extern unsigned long __startup_64(unsigned long p2v_offset, struct boot_params *bp);
extern void startup_64_setup_gdt_idt(void);
extern void early_setup_idt(void);
extern void __init do_early_exception(struct pt_regs *regs, int trapnr);

View File

@ -206,6 +206,8 @@ struct snp_psc_desc {
#define GHCB_TERM_NO_SVSM 7 /* SVSM is not advertised in the secrets page */
#define GHCB_TERM_SVSM_VMPL0 8 /* SVSM is present but has set VMPL to 0 */
#define GHCB_TERM_SVSM_CAA 9 /* SVSM is present but CAA is not page aligned */
#define GHCB_TERM_SVSM_CA_REMAP_FAIL 10 /* SVSM is present but CA could not be remapped */
#define GHCB_TERM_SECURE_TSC 11 /* Secure TSC initialization failed */
#define GHCB_RESP_CODE(v) ((v) & GHCB_MSR_INFO_MASK)

View File

@ -14,6 +14,7 @@
#include <asm/insn.h>
#include <asm/sev-common.h>
#include <asm/coco.h>
#include <asm/set_memory.h>
#define GHCB_PROTOCOL_MIN 1ULL
#define GHCB_PROTOCOL_MAX 2ULL
@ -124,6 +125,9 @@ struct snp_req_data {
#define AAD_LEN 48
#define MSG_HDR_VER 1
#define SNP_REQ_MAX_RETRY_DURATION (60*HZ)
#define SNP_REQ_RETRY_DELAY (2*HZ)
/* See SNP spec SNP_GUEST_REQUEST section for the structure */
enum msg_type {
SNP_MSG_TYPE_INVALID = 0,
@ -142,6 +146,9 @@ enum msg_type {
SNP_MSG_VMRK_REQ,
SNP_MSG_VMRK_RSP,
SNP_MSG_TSC_INFO_REQ = 17,
SNP_MSG_TSC_INFO_RSP,
SNP_MSG_TYPE_MAX
};
@ -170,9 +177,20 @@ struct snp_guest_msg {
u8 payload[PAGE_SIZE - sizeof(struct snp_guest_msg_hdr)];
} __packed;
struct sev_guest_platform_data {
u64 secrets_gpa;
};
#define SNP_TSC_INFO_REQ_SZ 128
struct snp_tsc_info_req {
u8 rsvd[SNP_TSC_INFO_REQ_SZ];
} __packed;
struct snp_tsc_info_resp {
u32 status;
u32 rsvd1;
u64 tsc_scale;
u64 tsc_offset;
u32 tsc_factor;
u8 rsvd2[100];
} __packed;
struct snp_guest_req {
void *req_buf;
@ -253,6 +271,7 @@ struct snp_msg_desc {
u32 *os_area_msg_seqno;
u8 *vmpck;
int vmpck_id;
};
/*
@ -445,8 +464,6 @@ void snp_set_wakeup_secondary_cpu(void);
bool snp_init(struct boot_params *bp);
void __noreturn snp_abort(void);
void snp_dmi_setup(void);
int snp_issue_guest_request(struct snp_guest_req *req, struct snp_req_data *input,
struct snp_guest_request_ioctl *rio);
int snp_issue_svsm_attest_req(u64 call_id, struct svsm_call *call, struct svsm_attest_call *input);
void snp_accept_memory(phys_addr_t start, phys_addr_t end);
u64 snp_get_unsupported_features(u64 status);
@ -458,6 +475,15 @@ void set_pte_enc_mask(pte_t *kpte, unsigned long pfn, pgprot_t new_prot);
void snp_kexec_finish(void);
void snp_kexec_begin(void);
int snp_msg_init(struct snp_msg_desc *mdesc, int vmpck_id);
struct snp_msg_desc *snp_msg_alloc(void);
void snp_msg_free(struct snp_msg_desc *mdesc);
int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req,
struct snp_guest_request_ioctl *rio);
void __init snp_secure_tsc_prepare(void);
void __init snp_secure_tsc_init(void);
#else /* !CONFIG_AMD_MEM_ENCRYPT */
#define snp_vmpl 0
@ -480,11 +506,6 @@ static inline void snp_set_wakeup_secondary_cpu(void) { }
static inline bool snp_init(struct boot_params *bp) { return false; }
static inline void snp_abort(void) { }
static inline void snp_dmi_setup(void) { }
static inline int snp_issue_guest_request(struct snp_guest_req *req, struct snp_req_data *input,
struct snp_guest_request_ioctl *rio)
{
return -ENOTTY;
}
static inline int snp_issue_svsm_attest_req(u64 call_id, struct svsm_call *call, struct svsm_attest_call *input)
{
return -ENOTTY;
@ -498,6 +519,13 @@ static inline int prepare_pte_enc(struct pte_enc_desc *d) { return 0; }
static inline void set_pte_enc_mask(pte_t *kpte, unsigned long pfn, pgprot_t new_prot) { }
static inline void snp_kexec_finish(void) { }
static inline void snp_kexec_begin(void) { }
static inline int snp_msg_init(struct snp_msg_desc *mdesc, int vmpck_id) { return -1; }
static inline struct snp_msg_desc *snp_msg_alloc(void) { return NULL; }
static inline void snp_msg_free(struct snp_msg_desc *mdesc) { }
static inline int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req,
struct snp_guest_request_ioctl *rio) { return -ENODEV; }
static inline void __init snp_secure_tsc_prepare(void) { }
static inline void __init snp_secure_tsc_init(void) { }
#endif /* CONFIG_AMD_MEM_ENCRYPT */

View File

@ -19,6 +19,32 @@
#define TDG_VM_RD 7
#define TDG_VM_WR 8
/* TDX attributes */
#define TDX_ATTR_DEBUG_BIT 0
#define TDX_ATTR_DEBUG BIT_ULL(TDX_ATTR_DEBUG_BIT)
#define TDX_ATTR_HGS_PLUS_PROF_BIT 4
#define TDX_ATTR_HGS_PLUS_PROF BIT_ULL(TDX_ATTR_HGS_PLUS_PROF_BIT)
#define TDX_ATTR_PERF_PROF_BIT 5
#define TDX_ATTR_PERF_PROF BIT_ULL(TDX_ATTR_PERF_PROF_BIT)
#define TDX_ATTR_PMT_PROF_BIT 6
#define TDX_ATTR_PMT_PROF BIT_ULL(TDX_ATTR_PMT_PROF_BIT)
#define TDX_ATTR_ICSSD_BIT 16
#define TDX_ATTR_ICSSD BIT_ULL(TDX_ATTR_ICSSD_BIT)
#define TDX_ATTR_LASS_BIT 27
#define TDX_ATTR_LASS BIT_ULL(TDX_ATTR_LASS_BIT)
#define TDX_ATTR_SEPT_VE_DISABLE_BIT 28
#define TDX_ATTR_SEPT_VE_DISABLE BIT_ULL(TDX_ATTR_SEPT_VE_DISABLE_BIT)
#define TDX_ATTR_MIGRTABLE_BIT 29
#define TDX_ATTR_MIGRTABLE BIT_ULL(TDX_ATTR_MIGRTABLE_BIT)
#define TDX_ATTR_PKS_BIT 30
#define TDX_ATTR_PKS BIT_ULL(TDX_ATTR_PKS_BIT)
#define TDX_ATTR_KL_BIT 31
#define TDX_ATTR_KL BIT_ULL(TDX_ATTR_KL_BIT)
#define TDX_ATTR_TPA_BIT 62
#define TDX_ATTR_TPA BIT_ULL(TDX_ATTR_TPA_BIT)
#define TDX_ATTR_PERFMON_BIT 63
#define TDX_ATTR_PERFMON BIT_ULL(TDX_ATTR_PERFMON_BIT)
/* TDX TD-Scope Metadata. To be used by TDG.VM.WR and TDG.VM.RD */
#define TDCS_CONFIG_FLAGS 0x1110000300000016
#define TDCS_TD_CTLS 0x1110000300000017
@ -29,8 +55,16 @@
#define TDCS_CONFIG_FLEXIBLE_PENDING_VE BIT_ULL(1)
/* TDCS_TD_CTLS bits */
#define TD_CTLS_PENDING_VE_DISABLE BIT_ULL(0)
#define TD_CTLS_ENUM_TOPOLOGY BIT_ULL(1)
#define TD_CTLS_PENDING_VE_DISABLE_BIT 0
#define TD_CTLS_PENDING_VE_DISABLE BIT_ULL(TD_CTLS_PENDING_VE_DISABLE_BIT)
#define TD_CTLS_ENUM_TOPOLOGY_BIT 1
#define TD_CTLS_ENUM_TOPOLOGY BIT_ULL(TD_CTLS_ENUM_TOPOLOGY_BIT)
#define TD_CTLS_VIRT_CPUID2_BIT 2
#define TD_CTLS_VIRT_CPUID2 BIT_ULL(TD_CTLS_VIRT_CPUID2_BIT)
#define TD_CTLS_REDUCE_VE_BIT 3
#define TD_CTLS_REDUCE_VE BIT_ULL(TD_CTLS_REDUCE_VE_BIT)
#define TD_CTLS_LOCK_BIT 63
#define TD_CTLS_LOCK BIT_ULL(TD_CTLS_LOCK_BIT)
/* TDX hypercall Leaf IDs */
#define TDVMCALL_MAP_GPA 0x10001

View File

@ -115,7 +115,7 @@ static inline void wrpkru(u32 pkru)
}
#endif
static __always_inline void native_wbinvd(void)
static __always_inline void wbinvd(void)
{
asm volatile("wbinvd": : :"memory");
}
@ -167,12 +167,6 @@ static inline void __write_cr4(unsigned long x)
{
native_write_cr4(x);
}
static __always_inline void wbinvd(void)
{
native_wbinvd();
}
#endif /* CONFIG_PARAVIRT_XXL */
static __always_inline void clflush(volatile void *__p)

View File

@ -417,7 +417,9 @@ struct sev_es_save_area {
u8 reserved_0x298[80];
u32 pkru;
u32 tsc_aux;
u8 reserved_0x2f0[24];
u64 tsc_scale;
u64 tsc_offset;
u8 reserved_0x300[8];
u64 rcx;
u64 rdx;
u64 rbx;
@ -564,7 +566,7 @@ static inline void __unused_size_checks(void)
BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x1c0);
BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x248);
BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x298);
BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x2f0);
BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x300);
BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x320);
BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x380);
BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x3f0);

View File

@ -66,6 +66,9 @@ int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport);
u64 tdx_hcall_get_quote(u8 *buf, size_t size);
void __init tdx_dump_attributes(u64 td_attr);
void __init tdx_dump_td_ctls(u64 td_ctls);
#else
static inline void tdx_early_init(void) { };

View File

@ -222,6 +222,7 @@ struct flush_tlb_info {
unsigned int initiating_cpu;
u8 stride_shift;
u8 freed_tables;
u8 trim_cpumask;
};
void flush_tlb_local(void);

View File

@ -143,6 +143,7 @@ extern const struct cpumask *cpu_clustergroup_mask(int cpu);
#define topology_logical_package_id(cpu) (cpu_data(cpu).topo.logical_pkg_id)
#define topology_physical_package_id(cpu) (cpu_data(cpu).topo.pkg_id)
#define topology_logical_die_id(cpu) (cpu_data(cpu).topo.logical_die_id)
#define topology_logical_core_id(cpu) (cpu_data(cpu).topo.logical_core_id)
#define topology_die_id(cpu) (cpu_data(cpu).topo.die_id)
#define topology_core_id(cpu) (cpu_data(cpu).topo.core_id)
#define topology_ppin(cpu) (cpu_data(cpu).ppin)

View File

@ -119,6 +119,7 @@ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
obj-$(CONFIG_HPET_TIMER) += hpet.o
obj-$(CONFIG_AMD_NB) += amd_nb.o
obj-$(CONFIG_AMD_NODE) += amd_node.o
obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o
obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o

View File

@ -13,6 +13,7 @@
#include <linux/sched.h>
#include <acpi/processor.h>
#include <asm/cpuid.h>
#include <asm/mwait.h>
#include <asm/special_insns.h>
@ -128,7 +129,7 @@ static long acpi_processor_ffh_cstate_probe_cpu(void *_cx)
unsigned int cstate_type; /* C-state type and not ACPI C-state type */
unsigned int num_cstate_subtype;
cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
cpuid(CPUID_LEAF_MWAIT, &eax, &ebx, &ecx, &edx);
/* Check whether this particular cx_type (in CST) is supported or not */
cstate_type = (((cx->address >> MWAIT_SUBSTATE_SIZE) &
@ -172,7 +173,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
struct cpuinfo_x86 *c = &cpu_data(cpu);
long retval;
if (!cpu_cstate_entry || c->cpuid_level < CPUID_MWAIT_LEAF)
if (!cpu_cstate_entry || c->cpuid_level < CPUID_LEAF_MWAIT)
return -1;
if (reg->bit_offset != NATIVE_CSTATE_BEYOND_HALT)

View File

@ -1854,11 +1854,18 @@ static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
return temp_state;
}
__ro_after_init struct mm_struct *poking_mm;
__ro_after_init unsigned long poking_addr;
static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
{
lockdep_assert_irqs_disabled();
switch_mm_irqs_off(NULL, prev_state.mm, current);
/* Clear the cpumask, to indicate no TLB flushing is needed anywhere */
cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(poking_mm));
/*
* Restore the breakpoints if they were disabled before the temporary mm
* was loaded.
@ -1867,9 +1874,6 @@ static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
hw_breakpoint_restore();
}
__ro_after_init struct mm_struct *poking_mm;
__ro_after_init unsigned long poking_addr;
static void text_poke_memcpy(void *dst, const void *src, size_t len)
{
memcpy(dst, src, len);

View File

@ -15,66 +15,8 @@
#include <linux/pci_ids.h>
#include <asm/amd_nb.h>
#define PCI_DEVICE_ID_AMD_17H_ROOT 0x1450
#define PCI_DEVICE_ID_AMD_17H_M10H_ROOT 0x15d0
#define PCI_DEVICE_ID_AMD_17H_M30H_ROOT 0x1480
#define PCI_DEVICE_ID_AMD_17H_M60H_ROOT 0x1630
#define PCI_DEVICE_ID_AMD_17H_MA0H_ROOT 0x14b5
#define PCI_DEVICE_ID_AMD_19H_M10H_ROOT 0x14a4
#define PCI_DEVICE_ID_AMD_19H_M40H_ROOT 0x14b5
#define PCI_DEVICE_ID_AMD_19H_M60H_ROOT 0x14d8
#define PCI_DEVICE_ID_AMD_19H_M70H_ROOT 0x14e8
#define PCI_DEVICE_ID_AMD_1AH_M00H_ROOT 0x153a
#define PCI_DEVICE_ID_AMD_1AH_M20H_ROOT 0x1507
#define PCI_DEVICE_ID_AMD_1AH_M60H_ROOT 0x1122
#define PCI_DEVICE_ID_AMD_MI200_ROOT 0x14bb
#define PCI_DEVICE_ID_AMD_MI300_ROOT 0x14f8
#define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464
#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec
#define PCI_DEVICE_ID_AMD_17H_M30H_DF_F4 0x1494
#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F4 0x144c
#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F4 0x1444
#define PCI_DEVICE_ID_AMD_17H_MA0H_DF_F4 0x1728
#define PCI_DEVICE_ID_AMD_19H_DF_F4 0x1654
#define PCI_DEVICE_ID_AMD_19H_M10H_DF_F4 0x14b1
#define PCI_DEVICE_ID_AMD_19H_M40H_DF_F4 0x167d
#define PCI_DEVICE_ID_AMD_19H_M50H_DF_F4 0x166e
#define PCI_DEVICE_ID_AMD_19H_M60H_DF_F4 0x14e4
#define PCI_DEVICE_ID_AMD_19H_M70H_DF_F4 0x14f4
#define PCI_DEVICE_ID_AMD_19H_M78H_DF_F4 0x12fc
#define PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4 0x12c4
#define PCI_DEVICE_ID_AMD_1AH_M20H_DF_F4 0x16fc
#define PCI_DEVICE_ID_AMD_1AH_M60H_DF_F4 0x124c
#define PCI_DEVICE_ID_AMD_1AH_M70H_DF_F4 0x12bc
#define PCI_DEVICE_ID_AMD_MI200_DF_F4 0x14d4
#define PCI_DEVICE_ID_AMD_MI300_DF_F4 0x152c
/* Protect the PCI config register pairs used for SMN. */
static DEFINE_MUTEX(smn_mutex);
static u32 *flush_words;
static const struct pci_device_id amd_root_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_MA0H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M60H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_ROOT) },
{}
};
#define PCI_DEVICE_ID_AMD_CNB17H_F4 0x1704
static const struct pci_device_id amd_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
@ -84,70 +26,6 @@ static const struct pci_device_id amd_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_MA0H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M78H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M60H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M70H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_DF_F3) },
{}
};
static const struct pci_device_id amd_nb_link_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_MA0H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M78H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M60H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M70H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_DF_F4) },
{}
};
static const struct pci_device_id hygon_root_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_HYGON, PCI_DEVICE_ID_AMD_17H_ROOT) },
{}
};
static const struct pci_device_id hygon_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_HYGON, PCI_DEVICE_ID_AMD_17H_DF_F3) },
{}
};
static const struct pci_device_id hygon_nb_link_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_HYGON, PCI_DEVICE_ID_AMD_17H_DF_F4) },
{}
};
@ -178,176 +56,37 @@ struct amd_northbridge *node_to_amd_nb(int node)
}
EXPORT_SYMBOL_GPL(node_to_amd_nb);
static struct pci_dev *next_northbridge(struct pci_dev *dev,
const struct pci_device_id *ids)
{
do {
dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
if (!dev)
break;
} while (!pci_match_id(ids, dev));
return dev;
}
/*
* SMN accesses may fail in ways that are difficult to detect here in the called
* functions amd_smn_read() and amd_smn_write(). Therefore, callers must do
* their own checking based on what behavior they expect.
*
* For SMN reads, the returned value may be zero if the register is Read-as-Zero.
* Or it may be a "PCI Error Response", e.g. all 0xFFs. The "PCI Error Response"
* can be checked here, and a proper error code can be returned.
*
* But the Read-as-Zero response cannot be verified here. A value of 0 may be
* correct in some cases, so callers must check that this is correct for the
* register/fields they need.
*
* For SMN writes, success can be determined through a "write and read back".
* However, this is not robust when done here.
*
* Possible issues:
*
* 1) Bits that are "Write-1-to-Clear". In this case, the read value should
* *not* match the write value.
*
* 2) Bits that are "Read-as-Zero"/"Writes-Ignored". This information cannot be
* known here.
*
* 3) Bits that are "Reserved / Set to 1". Ditto above.
*
* Callers of amd_smn_write() should do the "write and read back" check
* themselves, if needed.
*
* For #1, they can see if their target bits got cleared.
*
* For #2 and #3, they can check if their target bits got set as intended.
*
* This matches what is done for RDMSR/WRMSR. As long as there's no #GP, then
* the operation is considered a success, and the caller does their own
* checking.
*/
static int __amd_smn_rw(u16 node, u32 address, u32 *value, bool write)
{
struct pci_dev *root;
int err = -ENODEV;
if (node >= amd_northbridges.num)
goto out;
root = node_to_amd_nb(node)->root;
if (!root)
goto out;
mutex_lock(&smn_mutex);
err = pci_write_config_dword(root, 0x60, address);
if (err) {
pr_warn("Error programming SMN address 0x%x.\n", address);
goto out_unlock;
}
err = (write ? pci_write_config_dword(root, 0x64, *value)
: pci_read_config_dword(root, 0x64, value));
out_unlock:
mutex_unlock(&smn_mutex);
out:
return err;
}
int __must_check amd_smn_read(u16 node, u32 address, u32 *value)
{
int err = __amd_smn_rw(node, address, value, false);
if (PCI_POSSIBLE_ERROR(*value)) {
err = -ENODEV;
*value = 0;
}
return err;
}
EXPORT_SYMBOL_GPL(amd_smn_read);
int __must_check amd_smn_write(u16 node, u32 address, u32 value)
{
return __amd_smn_rw(node, address, &value, true);
}
EXPORT_SYMBOL_GPL(amd_smn_write);
static int amd_cache_northbridges(void)
{
const struct pci_device_id *misc_ids = amd_nb_misc_ids;
const struct pci_device_id *link_ids = amd_nb_link_ids;
const struct pci_device_id *root_ids = amd_root_ids;
struct pci_dev *root, *misc, *link;
struct amd_northbridge *nb;
u16 roots_per_misc = 0;
u16 misc_count = 0;
u16 root_count = 0;
u16 i, j;
u16 i;
if (amd_northbridges.num)
return 0;
if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) {
root_ids = hygon_root_ids;
misc_ids = hygon_nb_misc_ids;
link_ids = hygon_nb_link_ids;
}
amd_northbridges.num = amd_num_nodes();
misc = NULL;
while ((misc = next_northbridge(misc, misc_ids)))
misc_count++;
if (!misc_count)
return -ENODEV;
root = NULL;
while ((root = next_northbridge(root, root_ids)))
root_count++;
if (root_count) {
roots_per_misc = root_count / misc_count;
/*
* There should be _exactly_ N roots for each DF/SMN
* interface.
*/
if (!roots_per_misc || (root_count % roots_per_misc)) {
pr_info("Unsupported AMD DF/PCI configuration found\n");
return -ENODEV;
}
}
nb = kcalloc(misc_count, sizeof(struct amd_northbridge), GFP_KERNEL);
nb = kcalloc(amd_northbridges.num, sizeof(struct amd_northbridge), GFP_KERNEL);
if (!nb)
return -ENOMEM;
amd_northbridges.nb = nb;
amd_northbridges.num = misc_count;
link = misc = root = NULL;
for (i = 0; i < amd_northbridges.num; i++) {
node_to_amd_nb(i)->root = root =
next_northbridge(root, root_ids);
node_to_amd_nb(i)->misc = misc =
next_northbridge(misc, misc_ids);
node_to_amd_nb(i)->link = link =
next_northbridge(link, link_ids);
node_to_amd_nb(i)->root = amd_node_get_root(i);
node_to_amd_nb(i)->misc = amd_node_get_func(i, 3);
/*
* If there are more PCI root devices than data fabric/
* system management network interfaces, then the (N)
* PCI roots per DF/SMN interface are functionally the
* same (for DF/SMN access) and N-1 are redundant. N-1
* PCI roots should be skipped per DF/SMN interface so
* the following DF/SMN interfaces get mapped to
* correct PCI roots.
* Each Northbridge must have a 'misc' device.
* If not, then uninitialize everything.
*/
for (j = 1; j < roots_per_misc; j++)
root = next_northbridge(root, root_ids);
if (!node_to_amd_nb(i)->misc) {
amd_northbridges.num = 0;
kfree(nb);
return -ENODEV;
}
node_to_amd_nb(i)->link = amd_node_get_func(i, 4);
}
if (amd_gart_present())
@ -385,7 +124,6 @@ static int amd_cache_northbridges(void)
*/
bool __init early_is_amd_nb(u32 device)
{
const struct pci_device_id *misc_ids = amd_nb_misc_ids;
const struct pci_device_id *id;
u32 vendor = device & 0xffff;
@ -393,11 +131,11 @@ bool __init early_is_amd_nb(u32 device)
boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
return false;
if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
misc_ids = hygon_nb_misc_ids;
if (cpu_feature_enabled(X86_FEATURE_ZEN))
return false;
device >>= 16;
for (id = misc_ids; id->vendor; id++)
for (id = amd_nb_misc_ids; id->vendor; id++)
if (vendor == id->vendor && device == id->device)
return true;
return false;
@ -582,6 +320,10 @@ static __init void fix_erratum_688(void)
static __init int init_amd_nbs(void)
{
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
return 0;
amd_cache_northbridges();
amd_cache_gart();

215
arch/x86/kernel/amd_node.c Normal file
View File

@ -0,0 +1,215 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* AMD Node helper functions and common defines
*
* Copyright (c) 2024, Advanced Micro Devices, Inc.
* All Rights Reserved.
*
* Author: Yazen Ghannam <Yazen.Ghannam@amd.com>
*/
#include <asm/amd_node.h>
/*
* AMD Nodes are a physical collection of I/O devices within an SoC. There can be one
* or more nodes per package.
*
* The nodes are software-visible through PCI config space. All nodes are enumerated
* on segment 0 bus 0. The device (slot) numbers range from 0x18 to 0x1F (maximum 8
* nodes) with 0x18 corresponding to node 0, 0x19 to node 1, etc. Each node can be a
* multi-function device.
*
* On legacy systems, these node devices represent integrated Northbridge functionality.
* On Zen-based systems, these node devices represent Data Fabric functionality.
*
* See "Configuration Space Accesses" section in BKDGs or
* "Processor x86 Core" -> "Configuration Space" section in PPRs.
*/
struct pci_dev *amd_node_get_func(u16 node, u8 func)
{
if (node >= MAX_AMD_NUM_NODES)
return NULL;
return pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(AMD_NODE0_PCI_SLOT + node, func));
}
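For instance (illustrative only, following the slot layout described above): node 2, DF function 3 maps to device 0000:00:1a.3, since the node-0 slot is 0x18.

struct pci_dev *df_f3 = amd_node_get_func(2, 3);	/* 0x18 + 2 = slot 0x1a */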
#define DF_BLK_INST_CNT 0x040
#define DF_CFG_ADDR_CNTL_LEGACY 0x084
#define DF_CFG_ADDR_CNTL_DF4 0xC04
#define DF_MAJOR_REVISION GENMASK(27, 24)
static u16 get_cfg_addr_cntl_offset(struct pci_dev *df_f0)
{
u32 reg;
/*
* Revision fields added for DF4 and later.
*
* Major revision of '0' is found pre-DF4. Field is Read-as-Zero.
*/
if (pci_read_config_dword(df_f0, DF_BLK_INST_CNT, &reg))
return 0;
if (reg & DF_MAJOR_REVISION)
return DF_CFG_ADDR_CNTL_DF4;
return DF_CFG_ADDR_CNTL_LEGACY;
}
struct pci_dev *amd_node_get_root(u16 node)
{
struct pci_dev *root;
u16 cntl_off;
u8 bus;
if (!cpu_feature_enabled(X86_FEATURE_ZEN))
return NULL;
/*
* D18F0xXXX [Config Address Control] (DF::CfgAddressCntl)
* Bits [7:0] (SecBusNum) holds the bus number of the root device for
* this Data Fabric instance. The segment, device, and function will be 0.
*/
struct pci_dev *df_f0 __free(pci_dev_put) = amd_node_get_func(node, 0);
if (!df_f0)
return NULL;
cntl_off = get_cfg_addr_cntl_offset(df_f0);
if (!cntl_off)
return NULL;
if (pci_read_config_byte(df_f0, cntl_off, &bus))
return NULL;
/* Grab the pointer for the actual root device instance. */
root = pci_get_domain_bus_and_slot(0, bus, 0);
pci_dbg(root, "is root for AMD node %u\n", node);
return root;
}
static struct pci_dev **amd_roots;
/* Protect the PCI config register pairs used for SMN. */
static DEFINE_MUTEX(smn_mutex);
#define SMN_INDEX_OFFSET 0x60
#define SMN_DATA_OFFSET 0x64
/*
* SMN accesses may fail in ways that are difficult to detect here in the called
* functions amd_smn_read() and amd_smn_write(). Therefore, callers must do
* their own checking based on what behavior they expect.
*
* For SMN reads, the returned value may be zero if the register is Read-as-Zero.
* Or it may be a "PCI Error Response", e.g. all 0xFFs. The "PCI Error Response"
* can be checked here, and a proper error code can be returned.
*
* But the Read-as-Zero response cannot be verified here. A value of 0 may be
* correct in some cases, so callers must check that this is correct for the
* register/fields they need.
*
* For SMN writes, success can be determined through a "write and read back" check.
* However, this is not robust when done here.
*
* Possible issues:
*
* 1) Bits that are "Write-1-to-Clear". In this case, the read value should
* *not* match the write value.
*
* 2) Bits that are "Read-as-Zero"/"Writes-Ignored". This information cannot be
* known here.
*
* 3) Bits that are "Reserved / Set to 1". Ditto above.
*
* Callers of amd_smn_write() should do the "write and read back" check
* themselves, if needed.
*
* For #1, they can see if their target bits got cleared.
*
* For #2 and #3, they can check if their target bits got set as intended.
*
* This matches what is done for RDMSR/WRMSR. As long as there's no #GP, then
* the operation is considered a success, and the caller does their own
* checking.
*/
static int __amd_smn_rw(u8 i_off, u8 d_off, u16 node, u32 address, u32 *value, bool write)
{
struct pci_dev *root;
int err = -ENODEV;
if (node >= amd_num_nodes())
return err;
root = amd_roots[node];
if (!root)
return err;
guard(mutex)(&smn_mutex);
err = pci_write_config_dword(root, i_off, address);
if (err) {
pr_warn("Error programming SMN address 0x%x.\n", address);
return pcibios_err_to_errno(err);
}
err = (write ? pci_write_config_dword(root, d_off, *value)
: pci_read_config_dword(root, d_off, value));
return pcibios_err_to_errno(err);
}
int __must_check amd_smn_read(u16 node, u32 address, u32 *value)
{
int err = __amd_smn_rw(SMN_INDEX_OFFSET, SMN_DATA_OFFSET, node, address, value, false);
if (PCI_POSSIBLE_ERROR(*value)) {
err = -ENODEV;
*value = 0;
}
return err;
}
EXPORT_SYMBOL_GPL(amd_smn_read);
int __must_check amd_smn_write(u16 node, u32 address, u32 value)
{
return __amd_smn_rw(SMN_INDEX_OFFSET, SMN_DATA_OFFSET, node, address, &value, true);
}
EXPORT_SYMBOL_GPL(amd_smn_write);
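Following the guidance in the comment above, a caller that needs to confirm an SMN write can do its own write-and-read-back. This is only a sketch under assumptions: the helper name and the check_mask parameter are illustrative, and the masked-compare is only valid for ordinary read/write bits.

/* Hypothetical helper: write an SMN register and confirm selected bits stuck. */
static int example_smn_write_checked(u16 node, u32 address, u32 value, u32 check_mask)
{
	u32 readback;
	int err;

	err = amd_smn_write(node, address, value);
	if (err)
		return err;

	err = amd_smn_read(node, address, &readback);
	if (err)
		return err;

	/* Not valid for Write-1-to-Clear or Read-as-Zero bits; see the comment above. */
	if ((readback & check_mask) != (value & check_mask))
		return -EIO;

	return 0;
}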
static int amd_cache_roots(void)
{
u16 node, num_nodes = amd_num_nodes();
amd_roots = kcalloc(num_nodes, sizeof(*amd_roots), GFP_KERNEL);
if (!amd_roots)
return -ENOMEM;
for (node = 0; node < num_nodes; node++)
amd_roots[node] = amd_node_get_root(node);
return 0;
}
static int __init amd_smn_init(void)
{
int err;
if (!cpu_feature_enabled(X86_FEATURE_ZEN))
return 0;
guard(mutex)(&smn_mutex);
if (amd_roots)
return 0;
err = amd_cache_roots();
if (err)
return err;
return 0;
}
fs_initcall(amd_smn_init);


@ -509,19 +509,19 @@ static struct clock_event_device lapic_clockevent = {
static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
static const struct x86_cpu_id deadline_match[] __initconst = {
X86_MATCH_VFM_STEPPINGS(INTEL_HASWELL_X, X86_STEPPINGS(0x2, 0x2), 0x3a), /* EP */
X86_MATCH_VFM_STEPPINGS(INTEL_HASWELL_X, X86_STEPPINGS(0x4, 0x4), 0x0f), /* EX */
X86_MATCH_VFM_STEPS(INTEL_HASWELL_X, 0x2, 0x2, 0x3a), /* EP */
X86_MATCH_VFM_STEPS(INTEL_HASWELL_X, 0x4, 0x4, 0x0f), /* EX */
X86_MATCH_VFM(INTEL_BROADWELL_X, 0x0b000020),
X86_MATCH_VFM_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPINGS(0x2, 0x2), 0x00000011),
X86_MATCH_VFM_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPINGS(0x3, 0x3), 0x0700000e),
X86_MATCH_VFM_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPINGS(0x4, 0x4), 0x0f00000c),
X86_MATCH_VFM_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPINGS(0x5, 0x5), 0x0e000003),
X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 0x2, 0x2, 0x00000011),
X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 0x3, 0x3, 0x0700000e),
X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 0x4, 0x4, 0x0f00000c),
X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 0x5, 0x5, 0x0e000003),
X86_MATCH_VFM_STEPPINGS(INTEL_SKYLAKE_X, X86_STEPPINGS(0x3, 0x3), 0x01000136),
X86_MATCH_VFM_STEPPINGS(INTEL_SKYLAKE_X, X86_STEPPINGS(0x4, 0x4), 0x02000014),
X86_MATCH_VFM_STEPPINGS(INTEL_SKYLAKE_X, X86_STEPPINGS(0x5, 0xf), 0),
X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 0x3, 0x3, 0x01000136),
X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 0x4, 0x4, 0x02000014),
X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 0x5, 0xf, 0),
X86_MATCH_VFM(INTEL_HASWELL, 0x22),
X86_MATCH_VFM(INTEL_HASWELL_L, 0x20),
@ -2582,19 +2582,12 @@ int apic_is_clustered_box(void)
/*
* APIC command line parameters
*/
static int __init setup_disableapic(char *arg)
static int __init setup_nolapic(char *arg)
{
apic_is_disabled = true;
setup_clear_cpu_cap(X86_FEATURE_APIC);
return 0;
}
early_param("disableapic", setup_disableapic);
/* same as disableapic, for compatibility */
static int __init setup_nolapic(char *arg)
{
return setup_disableapic(arg);
}
early_param("nolapic", setup_nolapic);
static int __init parse_lapic_timer_c2_ok(char *arg)


@ -1165,7 +1165,7 @@ static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries)
(entry.ir_index_15 << 15) | entry.ir_index_0_14, entry.ir_zero);
} else {
apic_dbg("%s, %s, D(%02X%02X), M(%1d)\n", buf,
entry.dest_mode_logical ? "logical " : "physic al",
entry.dest_mode_logical ? "logical " : "physical",
entry.virt_destid_8_14, entry.destid_0_7, entry.delivery_mode);
}
}


@ -139,9 +139,15 @@ static bool skip_addr(void *dest)
return true;
#endif
#ifdef CONFIG_KEXEC_CORE
# ifdef CONFIG_X86_64
if (dest >= (void *)__relocate_kernel_start &&
dest < (void *)__relocate_kernel_end)
return true;
# else
if (dest >= (void *)relocate_kernel &&
dest < (void *)relocate_kernel + KEXEC_CONTROL_CODE_MAX_SIZE)
return true;
# endif
#endif
return false;
}


@ -355,10 +355,15 @@ static void bsp_determine_snp(struct cpuinfo_x86 *c)
/*
* RMP table entry format is not architectural and is defined by the
* per-processor PPR. Restrict SNP support on the known CPU models
* for which the RMP table entry format is currently defined for.
* for which the RMP table entry format is currently defined or for
* processors which support the architecturally defined RMPREAD
* instruction.
*/
if (!cpu_has(c, X86_FEATURE_HYPERVISOR) &&
c->x86 >= 0x19 && snp_probe_rmptable_info()) {
(cpu_feature_enabled(X86_FEATURE_ZEN3) ||
cpu_feature_enabled(X86_FEATURE_ZEN4) ||
cpu_feature_enabled(X86_FEATURE_RMPREAD)) &&
snp_probe_rmptable_info()) {
cc_platform_set(CC_ATTR_HOST_SEV_SNP);
} else {
setup_clear_cpu_cap(X86_FEATURE_SEV_SNP);
@ -795,10 +800,9 @@ static void init_amd_bd(struct cpuinfo_x86 *c)
clear_rdrand_cpuid_bit(c);
}
static const struct x86_cpu_desc erratum_1386_microcode[] = {
AMD_CPU_DESC(0x17, 0x1, 0x2, 0x0800126e),
AMD_CPU_DESC(0x17, 0x31, 0x0, 0x08301052),
{},
static const struct x86_cpu_id erratum_1386_microcode[] = {
X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, 0x17, 0x01), 0x2, 0x2, 0x0800126e),
X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, 0x17, 0x31), 0x0, 0x0, 0x08301052),
};
static void fix_erratum_1386(struct cpuinfo_x86 *c)
@ -814,7 +818,7 @@ static void fix_erratum_1386(struct cpuinfo_x86 *c)
* Clear the feature flag only on microcode revisions which
* don't have the fix.
*/
if (x86_cpu_has_min_microcode_rev(erratum_1386_microcode))
if (x86_match_min_microcode_rev(erratum_1386_microcode))
return;
clear_cpu_cap(c, X86_FEATURE_XSAVES);


@ -2615,6 +2615,9 @@ static void __init srso_select_mitigation(void)
break;
case SRSO_CMD_SAFE_RET:
if (boot_cpu_has(X86_FEATURE_SRSO_USER_KERNEL_NO))
goto ibpb_on_vmexit;
if (IS_ENABLED(CONFIG_MITIGATION_SRSO)) {
/*
* Enable the return thunk for generated code
@ -2658,6 +2661,7 @@ static void __init srso_select_mitigation(void)
}
break;
ibpb_on_vmexit:
case SRSO_CMD_IBPB_ON_VMEXIT:
if (IS_ENABLED(CONFIG_MITIGATION_SRSO)) {
if (!boot_cpu_has(X86_FEATURE_ENTRY_IBPB) && has_microcode) {


@ -29,6 +29,7 @@
#include <asm/alternative.h>
#include <asm/cmdline.h>
#include <asm/cpuid.h>
#include <asm/perf_event.h>
#include <asm/mmu_context.h>
#include <asm/doublefault.h>
@ -636,9 +637,9 @@ struct cpuid_dependent_feature {
static const struct cpuid_dependent_feature
cpuid_dependent_features[] = {
{ X86_FEATURE_MWAIT, 0x00000005 },
{ X86_FEATURE_DCA, 0x00000009 },
{ X86_FEATURE_XSAVE, 0x0000000d },
{ X86_FEATURE_MWAIT, CPUID_LEAF_MWAIT },
{ X86_FEATURE_DCA, CPUID_LEAF_DCA },
{ X86_FEATURE_XSAVE, CPUID_LEAF_XSTATE },
{ 0, 0 }
};
@ -1201,8 +1202,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
#define VULNBL(vendor, family, model, blacklist) \
X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, blacklist)
#define VULNBL_INTEL_STEPPINGS(vfm, steppings, issues) \
X86_MATCH_VFM_STEPPINGS(vfm, steppings, issues)
#define VULNBL_INTEL_STEPS(vfm, max_stepping, issues) \
X86_MATCH_VFM_STEPS(vfm, X86_STEP_MIN, max_stepping, issues)
#define VULNBL_AMD(family, blacklist) \
VULNBL(AMD, family, X86_MODEL_ANY, blacklist)
@ -1227,49 +1228,50 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
#define RFDS BIT(7)
static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
VULNBL_INTEL_STEPPINGS(INTEL_IVYBRIDGE, X86_STEPPING_ANY, SRBDS),
VULNBL_INTEL_STEPPINGS(INTEL_HASWELL, X86_STEPPING_ANY, SRBDS),
VULNBL_INTEL_STEPPINGS(INTEL_HASWELL_L, X86_STEPPING_ANY, SRBDS),
VULNBL_INTEL_STEPPINGS(INTEL_HASWELL_G, X86_STEPPING_ANY, SRBDS),
VULNBL_INTEL_STEPPINGS(INTEL_HASWELL_X, X86_STEPPING_ANY, MMIO),
VULNBL_INTEL_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPING_ANY, MMIO),
VULNBL_INTEL_STEPPINGS(INTEL_BROADWELL_G, X86_STEPPING_ANY, SRBDS),
VULNBL_INTEL_STEPPINGS(INTEL_BROADWELL_X, X86_STEPPING_ANY, MMIO),
VULNBL_INTEL_STEPPINGS(INTEL_BROADWELL, X86_STEPPING_ANY, SRBDS),
VULNBL_INTEL_STEPPINGS(INTEL_SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED | GDS),
VULNBL_INTEL_STEPPINGS(INTEL_SKYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
VULNBL_INTEL_STEPPINGS(INTEL_SKYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
VULNBL_INTEL_STEPPINGS(INTEL_KABYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
VULNBL_INTEL_STEPPINGS(INTEL_KABYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
VULNBL_INTEL_STEPPINGS(INTEL_CANNONLAKE_L, X86_STEPPING_ANY, RETBLEED),
VULNBL_INTEL_STEPPINGS(INTEL_ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
VULNBL_INTEL_STEPPINGS(INTEL_ICELAKE_D, X86_STEPPING_ANY, MMIO | GDS),
VULNBL_INTEL_STEPPINGS(INTEL_ICELAKE_X, X86_STEPPING_ANY, MMIO | GDS),
VULNBL_INTEL_STEPPINGS(INTEL_COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
VULNBL_INTEL_STEPPINGS(INTEL_COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO | RETBLEED),
VULNBL_INTEL_STEPPINGS(INTEL_COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
VULNBL_INTEL_STEPPINGS(INTEL_TIGERLAKE_L, X86_STEPPING_ANY, GDS),
VULNBL_INTEL_STEPPINGS(INTEL_TIGERLAKE, X86_STEPPING_ANY, GDS),
VULNBL_INTEL_STEPPINGS(INTEL_LAKEFIELD, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED),
VULNBL_INTEL_STEPPINGS(INTEL_ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS),
VULNBL_INTEL_STEPPINGS(INTEL_ALDERLAKE, X86_STEPPING_ANY, RFDS),
VULNBL_INTEL_STEPPINGS(INTEL_ALDERLAKE_L, X86_STEPPING_ANY, RFDS),
VULNBL_INTEL_STEPPINGS(INTEL_RAPTORLAKE, X86_STEPPING_ANY, RFDS),
VULNBL_INTEL_STEPPINGS(INTEL_RAPTORLAKE_P, X86_STEPPING_ANY, RFDS),
VULNBL_INTEL_STEPPINGS(INTEL_RAPTORLAKE_S, X86_STEPPING_ANY, RFDS),
VULNBL_INTEL_STEPPINGS(INTEL_ATOM_GRACEMONT, X86_STEPPING_ANY, RFDS),
VULNBL_INTEL_STEPPINGS(INTEL_ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RFDS),
VULNBL_INTEL_STEPPINGS(INTEL_ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO | RFDS),
VULNBL_INTEL_STEPPINGS(INTEL_ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RFDS),
VULNBL_INTEL_STEPPINGS(INTEL_ATOM_GOLDMONT, X86_STEPPING_ANY, RFDS),
VULNBL_INTEL_STEPPINGS(INTEL_ATOM_GOLDMONT_D, X86_STEPPING_ANY, RFDS),
VULNBL_INTEL_STEPPINGS(INTEL_ATOM_GOLDMONT_PLUS, X86_STEPPING_ANY, RFDS),
VULNBL_INTEL_STEPS(INTEL_IVYBRIDGE, X86_STEP_MAX, SRBDS),
VULNBL_INTEL_STEPS(INTEL_HASWELL, X86_STEP_MAX, SRBDS),
VULNBL_INTEL_STEPS(INTEL_HASWELL_L, X86_STEP_MAX, SRBDS),
VULNBL_INTEL_STEPS(INTEL_HASWELL_G, X86_STEP_MAX, SRBDS),
VULNBL_INTEL_STEPS(INTEL_HASWELL_X, X86_STEP_MAX, MMIO),
VULNBL_INTEL_STEPS(INTEL_BROADWELL_D, X86_STEP_MAX, MMIO),
VULNBL_INTEL_STEPS(INTEL_BROADWELL_G, X86_STEP_MAX, SRBDS),
VULNBL_INTEL_STEPS(INTEL_BROADWELL_X, X86_STEP_MAX, MMIO),
VULNBL_INTEL_STEPS(INTEL_BROADWELL, X86_STEP_MAX, SRBDS),
VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, X86_STEP_MAX, MMIO | RETBLEED | GDS),
VULNBL_INTEL_STEPS(INTEL_SKYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS),
VULNBL_INTEL_STEPS(INTEL_SKYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS),
VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS),
VULNBL_INTEL_STEPS(INTEL_KABYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS),
VULNBL_INTEL_STEPS(INTEL_CANNONLAKE_L, X86_STEP_MAX, RETBLEED),
VULNBL_INTEL_STEPS(INTEL_ICELAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS),
VULNBL_INTEL_STEPS(INTEL_ICELAKE_D, X86_STEP_MAX, MMIO | GDS),
VULNBL_INTEL_STEPS(INTEL_ICELAKE_X, X86_STEP_MAX, MMIO | GDS),
VULNBL_INTEL_STEPS(INTEL_COMETLAKE, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS),
VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, 0x0, MMIO | RETBLEED),
VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS),
VULNBL_INTEL_STEPS(INTEL_TIGERLAKE_L, X86_STEP_MAX, GDS),
VULNBL_INTEL_STEPS(INTEL_TIGERLAKE, X86_STEP_MAX, GDS),
VULNBL_INTEL_STEPS(INTEL_LAKEFIELD, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED),
VULNBL_INTEL_STEPS(INTEL_ROCKETLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS),
VULNBL_INTEL_STEPS(INTEL_ALDERLAKE, X86_STEP_MAX, RFDS),
VULNBL_INTEL_STEPS(INTEL_ALDERLAKE_L, X86_STEP_MAX, RFDS),
VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE, X86_STEP_MAX, RFDS),
VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_P, X86_STEP_MAX, RFDS),
VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_S, X86_STEP_MAX, RFDS),
VULNBL_INTEL_STEPS(INTEL_ATOM_GRACEMONT, X86_STEP_MAX, RFDS),
VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT, X86_STEP_MAX, MMIO | MMIO_SBDS | RFDS),
VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT_D, X86_STEP_MAX, MMIO | RFDS),
VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RFDS),
VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT, X86_STEP_MAX, RFDS),
VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT_D, X86_STEP_MAX, RFDS),
VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT_PLUS, X86_STEP_MAX, RFDS),
VULNBL_AMD(0x15, RETBLEED),
VULNBL_AMD(0x16, RETBLEED),
VULNBL_AMD(0x17, RETBLEED | SMT_RSB | SRSO),
VULNBL_HYGON(0x18, RETBLEED | SMT_RSB | SRSO),
VULNBL_AMD(0x19, SRSO),
VULNBL_AMD(0x1a, SRSO),
{}
};


@ -25,6 +25,7 @@ static int cpu_debug_show(struct seq_file *m, void *p)
seq_printf(m, "cpu_type: %s\n", get_topology_cpu_type_name(c));
seq_printf(m, "logical_pkg_id: %u\n", c->topo.logical_pkg_id);
seq_printf(m, "logical_die_id: %u\n", c->topo.logical_die_id);
seq_printf(m, "logical_core_id: %u\n", c->topo.logical_core_id);
seq_printf(m, "llc_id: %u\n", c->topo.llc_id);
seq_printf(m, "l2c_id: %u\n", c->topo.l2c_id);
seq_printf(m, "amd_node_id: %u\n", c->topo.amd_node_id);


@ -599,11 +599,6 @@ static void init_intel(struct cpuinfo_x86 *c)
if (p)
strcpy(c->x86_model_id, p);
}
if (c->x86 == 15)
set_cpu_cap(c, X86_FEATURE_P4);
if (c->x86 == 6)
set_cpu_cap(c, X86_FEATURE_P3);
#endif
/* Work around errata */


@ -6,7 +6,7 @@
#include <linux/slab.h>
/**
* x86_match_cpu - match current CPU again an array of x86_cpu_ids
* x86_match_cpu - match current CPU against an array of x86_cpu_ids
* @match: Pointer to array of x86_cpu_ids. Last entry terminated with
* {}.
*
@ -56,33 +56,13 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match)
}
EXPORT_SYMBOL(x86_match_cpu);
static const struct x86_cpu_desc *
x86_match_cpu_with_stepping(const struct x86_cpu_desc *match)
bool x86_match_min_microcode_rev(const struct x86_cpu_id *table)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
const struct x86_cpu_desc *m;
const struct x86_cpu_id *res = x86_match_cpu(table);
for (m = match; m->x86_family | m->x86_model; m++) {
if (c->x86_vendor != m->x86_vendor)
continue;
if (c->x86 != m->x86_family)
continue;
if (c->x86_model != m->x86_model)
continue;
if (c->x86_stepping != m->x86_stepping)
continue;
return m;
}
return NULL;
}
bool x86_cpu_has_min_microcode_rev(const struct x86_cpu_desc *table)
{
const struct x86_cpu_desc *res = x86_match_cpu_with_stepping(table);
if (!res || res->x86_microcode_rev > boot_cpu_data.microcode)
if (!res || res->driver_data > boot_cpu_data.microcode)
return false;
return true;
}
EXPORT_SYMBOL_GPL(x86_cpu_has_min_microcode_rev);
EXPORT_SYMBOL_GPL(x86_match_min_microcode_rev);
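A hedged usage sketch of the new interface (the table entries below are placeholders, not a real erratum): a driver lists the minimum microcode revision per VFM and stepping range and calls x86_match_min_microcode_rev() at init, just as the erratum_1386_microcode table earlier in this diff is used by fix_erratum_1386().

/* Placeholder family/model/stepping/revision values; real ones come from an erratum. */
static const struct x86_cpu_id example_min_ucode[] = {
	X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, 0x19, 0x01), 0x0, 0x1, 0x0a001234),
	{}
};

static bool example_cpu_has_fix(void)
{
	/* True when the boot CPU matches an entry and runs at least that microcode revision. */
	return x86_match_min_microcode_rev(example_min_ucode);
}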


@ -4,8 +4,6 @@
*
* Written by Jacob Shin - AMD, Inc.
* Maintained by: Borislav Petkov <bp@alien8.de>
*
* All MC4_MISCi registers are shared between cores on a node.
*/
#include <linux/interrupt.h>
#include <linux/notifier.h>
@ -20,7 +18,6 @@
#include <linux/smp.h>
#include <linux/string.h>
#include <asm/amd_nb.h>
#include <asm/traps.h>
#include <asm/apic.h>
#include <asm/mce.h>
@ -221,6 +218,32 @@ static const struct smca_hwid smca_hwid_mcatypes[] = {
#define MAX_MCATYPE_NAME_LEN 30
static char buf_mcatype[MAX_MCATYPE_NAME_LEN];
struct threshold_block {
/* This block's number within its bank. */
unsigned int block;
/* MCA bank number that contains this block. */
unsigned int bank;
/* CPU which controls this block's MCA bank. */
unsigned int cpu;
/* MCA_MISC MSR address for this block. */
u32 address;
/* Enable/Disable APIC interrupt. */
bool interrupt_enable;
/* Bank can generate an interrupt. */
bool interrupt_capable;
/* Value upon which threshold interrupt is generated. */
u16 threshold_limit;
/* sysfs object */
struct kobject kobj;
/* List of threshold blocks within this block's MCA bank. */
struct list_head miscj;
};
struct threshold_bank {
struct kobject *kobj;
struct threshold_block *blocks;
};
static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
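A brief sketch of how these structures hang together (hypothetical helper, assuming the convention visible in the surrounding code that the additional blocks of a bank are linked onto the first block's miscj list):

/* Hypothetical helper: count the threshold blocks behind one bank. */
static unsigned int example_count_blocks(struct threshold_bank *b)
{
	struct threshold_block *pos;
	unsigned int n = 1;	/* the first block, b->blocks, itself */

	if (!b->blocks)
		return 0;

	list_for_each_entry(pos, &b->blocks->miscj, miscj)
		n++;

	return n;
}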
/*
@ -333,19 +356,6 @@ struct thresh_restart {
u16 old_limit;
};
static inline bool is_shared_bank(int bank)
{
/*
* Scalable MCA provides for only one core to have access to the MSRs of
* a shared bank.
*/
if (mce_flags.smca)
return false;
/* Bank 4 is for northbridge reporting and is thus shared */
return (bank == 4);
}
static const char *bank4_names(const struct threshold_block *b)
{
switch (b->address) {
@ -381,7 +391,7 @@ static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits)
return msr_high_bits & BIT(28);
}
static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
static bool lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
{
int msr = (hi & MASK_LVTOFF_HI) >> 20;
@ -389,7 +399,7 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt "
"for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu,
b->bank, b->block, b->address, hi, lo);
return 0;
return false;
}
if (apic != msr) {
@ -399,15 +409,15 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
* was set is reserved. Return early here:
*/
if (mce_flags.smca)
return 0;
return false;
pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d "
"for bank %d, block %d (MSR%08X=0x%x%08x)\n",
b->cpu, apic, b->bank, b->block, b->address, hi, lo);
return 0;
return false;
}
return 1;
return true;
};
/* Reprogram MCx_MISC MSR behind this threshold bank. */
@ -1198,35 +1208,10 @@ out_free:
return err;
}
static int __threshold_add_blocks(struct threshold_bank *b)
{
struct list_head *head = &b->blocks->miscj;
struct threshold_block *pos = NULL;
struct threshold_block *tmp = NULL;
int err = 0;
err = kobject_add(&b->blocks->kobj, b->kobj, b->blocks->kobj.name);
if (err)
return err;
list_for_each_entry_safe(pos, tmp, head, miscj) {
err = kobject_add(&pos->kobj, b->kobj, pos->kobj.name);
if (err) {
list_for_each_entry_safe_reverse(pos, tmp, head, miscj)
kobject_del(&pos->kobj);
return err;
}
}
return err;
}
static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu,
unsigned int bank)
{
struct device *dev = this_cpu_read(mce_device);
struct amd_northbridge *nb = NULL;
struct threshold_bank *b = NULL;
const char *name = get_name(cpu, bank, NULL);
int err = 0;
@ -1234,26 +1219,6 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu,
if (!dev)
return -ENODEV;
if (is_shared_bank(bank)) {
nb = node_to_amd_nb(topology_amd_node_id(cpu));
/* threshold descriptor already initialized on this node? */
if (nb && nb->bank4) {
/* yes, use it */
b = nb->bank4;
err = kobject_add(b->kobj, &dev->kobj, name);
if (err)
goto out;
bp[bank] = b;
refcount_inc(&b->cpus);
err = __threshold_add_blocks(b);
goto out;
}
}
b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);
if (!b) {
err = -ENOMEM;
@ -1267,17 +1232,6 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu,
goto out_free;
}
if (is_shared_bank(bank)) {
b->shared = 1;
refcount_set(&b->cpus, 1);
/* nb is already initialized, see above */
if (nb) {
WARN_ON(nb->bank4);
nb->bank4 = b;
}
}
err = allocate_threshold_blocks(cpu, b, bank, 0, mca_msr_reg(bank, MCA_MISC));
if (err)
goto out_kobj;
@ -1310,40 +1264,11 @@ static void deallocate_threshold_blocks(struct threshold_bank *bank)
kobject_put(&bank->blocks->kobj);
}
static void __threshold_remove_blocks(struct threshold_bank *b)
{
struct threshold_block *pos = NULL;
struct threshold_block *tmp = NULL;
kobject_put(b->kobj);
list_for_each_entry_safe(pos, tmp, &b->blocks->miscj, miscj)
kobject_put(b->kobj);
}
static void threshold_remove_bank(struct threshold_bank *bank)
{
struct amd_northbridge *nb;
if (!bank->blocks)
goto out_free;
if (!bank->shared)
goto out_dealloc;
if (!refcount_dec_and_test(&bank->cpus)) {
__threshold_remove_blocks(bank);
return;
} else {
/*
* The last CPU on this node using the shared bank is going
* away, remove that bank now.
*/
nb = node_to_amd_nb(topology_amd_node_id(smp_processor_id()));
nb->bank4 = NULL;
}
out_dealloc:
deallocate_threshold_blocks(bank);
out_free:


@ -151,7 +151,7 @@ EXPORT_PER_CPU_SYMBOL_GPL(injectm);
void mce_log(struct mce_hw_err *err)
{
if (!mce_gen_pool_add(err))
if (mce_gen_pool_add(err))
irq_work_queue(&mce_irq_work);
}
EXPORT_SYMBOL_GPL(mce_log);
@ -492,10 +492,10 @@ static noinstr void mce_gather_info(struct mce_hw_err *err, struct pt_regs *regs
}
}
int mce_available(struct cpuinfo_x86 *c)
bool mce_available(struct cpuinfo_x86 *c)
{
if (mca_cfg.disabled)
return 0;
return false;
return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}
@ -1778,7 +1778,7 @@ static void mce_timer_delete_all(void)
* Can be called from interrupt context, but not from machine check/NMI
* context.
*/
int mce_notify_irq(void)
bool mce_notify_irq(void)
{
/* Not more than two messages every minute */
static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
@ -1789,9 +1789,9 @@ int mce_notify_irq(void)
if (__ratelimit(&ratelimit))
pr_info(HW_ERR "Machine check events logged\n");
return 1;
return true;
}
return 0;
return false;
}
EXPORT_SYMBOL_GPL(mce_notify_irq);
@ -1910,101 +1910,120 @@ static void __mcheck_cpu_check_banks(void)
}
}
/* Add per CPU specific workarounds here */
static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
static void apply_quirks_amd(struct cpuinfo_x86 *c)
{
struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
struct mca_config *cfg = &mca_cfg;
if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
pr_info("unknown CPU type - not enabling MCE support\n");
return -EOPNOTSUPP;
}
/* This should be disabled by the BIOS, but isn't always */
if (c->x86_vendor == X86_VENDOR_AMD) {
if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) {
/*
* disable GART TBL walk error reporting, which
* trips off incorrectly with the IOMMU & 3ware
* & Cerberus:
*/
clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
}
if (c->x86 < 0x11 && cfg->bootlog < 0) {
/*
* Lots of broken BIOS around that don't clear them
* by default and leave crap in there. Don't log:
*/
cfg->bootlog = 0;
}
if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) {
/*
* Various K7s with broken bank 0 around. Always disable
* by default.
* disable GART TBL walk error reporting, which
* trips off incorrectly with the IOMMU & 3ware
* & Cerberus:
*/
if (c->x86 == 6 && this_cpu_read(mce_num_banks) > 0)
mce_banks[0].ctl = 0;
/*
* overflow_recov is supported for F15h Models 00h-0fh
* even though we don't have a CPUID bit for it.
*/
if (c->x86 == 0x15 && c->x86_model <= 0xf)
mce_flags.overflow_recov = 1;
if (c->x86 >= 0x17 && c->x86 <= 0x1A)
mce_flags.zen_ifu_quirk = 1;
clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
}
if (c->x86_vendor == X86_VENDOR_INTEL) {
if (c->x86 < 0x11 && mca_cfg.bootlog < 0) {
/*
* SDM documents that on family 6 bank 0 should not be written
* because it aliases to another special BIOS controlled
* register.
* But it's not aliased anymore on model 0x1a+
* Don't ignore bank 0 completely because there could be a
* valid event later, merely don't write CTL0.
* Lots of broken BIOS around that don't clear them
* by default and leave crap in there. Don't log:
*/
if (c->x86 == 6 && c->x86_model < 0x1A && this_cpu_read(mce_num_banks) > 0)
mce_banks[0].init = false;
/*
* All newer Intel systems support MCE broadcasting. Enable
* synchronization with a one second timeout.
*/
if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
cfg->monarch_timeout < 0)
cfg->monarch_timeout = USEC_PER_SEC;
/*
* There are also broken BIOSes on some Pentium M and
* earlier systems:
*/
if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
cfg->bootlog = 0;
if (c->x86_vfm == INTEL_SANDYBRIDGE_X)
mce_flags.snb_ifu_quirk = 1;
/*
* Skylake, Cascade Lake and Cooper Lake require a quirk on
* rep movs.
*/
if (c->x86_vfm == INTEL_SKYLAKE_X)
mce_flags.skx_repmov_quirk = 1;
mca_cfg.bootlog = 0;
}
if (c->x86_vendor == X86_VENDOR_ZHAOXIN) {
/*
* All newer Zhaoxin CPUs support MCE broadcasting. Enable
* synchronization with a one second timeout.
*/
if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
if (cfg->monarch_timeout < 0)
cfg->monarch_timeout = USEC_PER_SEC;
}
/*
* Various K7s with broken bank 0 around. Always disable
* by default.
*/
if (c->x86 == 6 && this_cpu_read(mce_num_banks))
mce_banks[0].ctl = 0;
/*
* overflow_recov is supported for F15h Models 00h-0fh
* even though we don't have a CPUID bit for it.
*/
if (c->x86 == 0x15 && c->x86_model <= 0xf)
mce_flags.overflow_recov = 1;
if (c->x86 >= 0x17 && c->x86 <= 0x1A)
mce_flags.zen_ifu_quirk = 1;
}
static void apply_quirks_intel(struct cpuinfo_x86 *c)
{
struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
/* Older CPUs (prior to family 6) don't need quirks. */
if (c->x86_vfm < INTEL_PENTIUM_PRO)
return;
/*
* SDM documents that on family 6 bank 0 should not be written
* because it aliases to another special BIOS controlled
* register.
* But it's not aliased anymore on model 0x1a+
* Don't ignore bank 0 completely because there could be a
* valid event later, merely don't write CTL0.
*/
if (c->x86_vfm < INTEL_NEHALEM_EP && this_cpu_read(mce_num_banks))
mce_banks[0].init = false;
/*
* All newer Intel systems support MCE broadcasting. Enable
* synchronization with a one second timeout.
*/
if (c->x86_vfm >= INTEL_CORE_YONAH && mca_cfg.monarch_timeout < 0)
mca_cfg.monarch_timeout = USEC_PER_SEC;
/*
* There are also broken BIOSes on some Pentium M and
* earlier systems:
*/
if (c->x86_vfm < INTEL_CORE_YONAH && mca_cfg.bootlog < 0)
mca_cfg.bootlog = 0;
if (c->x86_vfm == INTEL_SANDYBRIDGE_X)
mce_flags.snb_ifu_quirk = 1;
/*
* Skylake, Cascade Lake and Cooper Lake require a quirk on
* rep movs.
*/
if (c->x86_vfm == INTEL_SKYLAKE_X)
mce_flags.skx_repmov_quirk = 1;
}
static void apply_quirks_zhaoxin(struct cpuinfo_x86 *c)
{
/*
* All newer Zhaoxin CPUs support MCE broadcasting. Enable
* synchronization with a one second timeout.
*/
if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
if (mca_cfg.monarch_timeout < 0)
mca_cfg.monarch_timeout = USEC_PER_SEC;
}
}
/* Add per CPU specific workarounds here */
static bool __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
{
struct mca_config *cfg = &mca_cfg;
switch (c->x86_vendor) {
case X86_VENDOR_UNKNOWN:
pr_info("unknown CPU type - not enabling MCE support\n");
return false;
case X86_VENDOR_AMD:
apply_quirks_amd(c);
break;
case X86_VENDOR_INTEL:
apply_quirks_intel(c);
break;
case X86_VENDOR_ZHAOXIN:
apply_quirks_zhaoxin(c);
break;
}
if (cfg->monarch_timeout < 0)
@ -2012,28 +2031,28 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
if (cfg->bootlog != 0)
cfg->panic_timeout = 30;
return 0;
return true;
}
static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
static bool __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
{
if (c->x86 != 5)
return 0;
return false;
switch (c->x86_vendor) {
case X86_VENDOR_INTEL:
intel_p5_mcheck_init(c);
mce_flags.p5 = 1;
return 1;
return true;
case X86_VENDOR_CENTAUR:
winchip_mcheck_init(c);
mce_flags.winchip = 1;
return 1;
return true;
default:
return 0;
return false;
}
return 0;
return false;
}
/*
@ -2099,13 +2118,9 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
mce_intel_feature_init(c);
break;
case X86_VENDOR_AMD: {
mce_amd_feature_init(c);
break;
}
case X86_VENDOR_AMD:
case X86_VENDOR_HYGON:
mce_hygon_feature_init(c);
mce_amd_feature_init(c);
break;
case X86_VENDOR_CENTAUR:
@ -2279,12 +2294,12 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
__mcheck_cpu_cap_init();
if (__mcheck_cpu_apply_quirks(c) < 0) {
if (!__mcheck_cpu_apply_quirks(c)) {
mca_cfg.disabled = 1;
return;
}
if (mce_gen_pool_init()) {
if (!mce_gen_pool_init()) {
mca_cfg.disabled = 1;
pr_emerg("Couldn't allocate MCE records pool!\n");
return;


@ -94,64 +94,63 @@ bool mce_gen_pool_empty(void)
return llist_empty(&mce_event_llist);
}
int mce_gen_pool_add(struct mce_hw_err *err)
bool mce_gen_pool_add(struct mce_hw_err *err)
{
struct mce_evt_llist *node;
if (filter_mce(&err->m))
return -EINVAL;
return false;
if (!mce_evt_pool)
return -EINVAL;
return false;
node = (void *)gen_pool_alloc(mce_evt_pool, sizeof(*node));
if (!node) {
pr_warn_ratelimited("MCE records pool full!\n");
return -ENOMEM;
return false;
}
memcpy(&node->err, err, sizeof(*err));
llist_add(&node->llnode, &mce_event_llist);
return 0;
return true;
}
static int mce_gen_pool_create(void)
static bool mce_gen_pool_create(void)
{
int mce_numrecords, mce_poolsz, order;
struct gen_pool *gpool;
int ret = -ENOMEM;
void *mce_pool;
order = order_base_2(sizeof(struct mce_evt_llist));
gpool = gen_pool_create(order, -1);
if (!gpool)
return ret;
return false;
mce_numrecords = max(MCE_MIN_ENTRIES, num_possible_cpus() * MCE_PER_CPU);
mce_poolsz = mce_numrecords * (1 << order);
mce_pool = kmalloc(mce_poolsz, GFP_KERNEL);
if (!mce_pool) {
gen_pool_destroy(gpool);
return ret;
return false;
}
ret = gen_pool_add(gpool, (unsigned long)mce_pool, mce_poolsz, -1);
if (ret) {
if (gen_pool_add(gpool, (unsigned long)mce_pool, mce_poolsz, -1)) {
gen_pool_destroy(gpool);
kfree(mce_pool);
return ret;
return false;
}
mce_evt_pool = gpool;
return ret;
return true;
}
int mce_gen_pool_init(void)
bool mce_gen_pool_init(void)
{
/* Just init mce_gen_pool once. */
if (mce_evt_pool)
return 0;
return true;
return mce_gen_pool_create();
}


@ -75,12 +75,12 @@ static u16 cmci_threshold[MAX_NR_BANKS];
*/
#define CMCI_STORM_THRESHOLD 32749
static int cmci_supported(int *banks)
static bool cmci_supported(int *banks)
{
u64 cap;
if (mca_cfg.cmci_disabled || mca_cfg.ignore_ce)
return 0;
return false;
/*
* Vendor check is not strictly needed, but the initial
@ -89,10 +89,11 @@ static int cmci_supported(int *banks)
*/
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL &&
boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN)
return 0;
return false;
if (!boot_cpu_has(X86_FEATURE_APIC) || lapic_get_maxlvt() < 6)
return 0;
return false;
rdmsrl(MSR_IA32_MCG_CAP, cap);
*banks = min_t(unsigned, MAX_NR_BANKS, cap & MCG_BANKCNT_MASK);
return !!(cap & MCG_CMCI_P);


@ -31,8 +31,8 @@ struct mce_evt_llist {
void mce_gen_pool_process(struct work_struct *__unused);
bool mce_gen_pool_empty(void);
int mce_gen_pool_add(struct mce_hw_err *err);
int mce_gen_pool_init(void);
bool mce_gen_pool_add(struct mce_hw_err *err);
bool mce_gen_pool_init(void);
struct llist_node *mce_gen_pool_prepare_records(void);
int mce_severity(struct mce *a, struct pt_regs *regs, char **msg, bool is_excp);

Some files were not shown because too many files have changed in this diff.