Mirror of https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
Synced 2025-01-14 17:53:39 +00:00

Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git

# Conflicts:
#	include/linux/mm.h
#	include/linux/mm_types.h
#	include/linux/mmap_lock.h
#	kernel/fork.c
#	mm/init-mm.c
#	tools/testing/vma/vma_internal.h

This commit is contained in: commit bb0a44fa74
@@ -194,8 +194,6 @@ is applicable::

	WDT	Watchdog support is enabled.
	X86-32	X86-32, aka i386 architecture is enabled.
	X86-64	X86-64 architecture is enabled.
		More X86-64 boot options can be found in
		Documentation/arch/x86/x86_64/boot-options.rst.
	X86	Either 32-bit or 64-bit x86 (same as X86-32+X86-64)
	X86_UV	SGI UV support is enabled.
	XEN	Xen support is enabled

@@ -213,7 +211,6 @@ Do not modify the syntax of boot loader parameters without extreme
need or coordination with <Documentation/arch/x86/boot.rst>.

There are also arch-specific kernel-parameters not documented here.
See for example <Documentation/arch/x86/x86_64/boot-options.rst>.

Note that ALL kernel parameters listed below are CASE SENSITIVE, and that
a trailing = on the name of any parameter states that that parameter will
@@ -21,6 +21,10 @@
		strictly ACPI specification compliant.
		rsdt -- prefer RSDT over (default) XSDT
		copy_dsdt -- copy DSDT to memory
		nocmcff -- Disable firmware first mode for corrected
			errors. This disables parsing the HEST CMC error
			source to check if firmware has set the FF flag. This
			may result in duplicate corrected error reports.
		nospcr -- disable console in ACPI SPCR table as
			default _serial_ console on ARM64
		For ARM64, ONLY "acpi=off", "acpi=on", "acpi=force" or
@@ -405,6 +409,8 @@
		not play well with APC CPU idle - disable it if you have
		APC and your system crashes randomly.

	apic		[APIC,X86-64] Use IO-APIC. Default.

	apic=		[APIC,X86,EARLY] Advanced Programmable Interrupt Controller
		Change the output verbosity while booting
		Format: { quiet (default) | verbose | debug }

@@ -424,6 +430,10 @@
		useful so that a dump capture kernel won't be
		shot down by NMI

	apicpmtimer	Do APIC timer calibration using the pmtimer. Implies
		apicmaintimer. Useful when your PIT timer is totally
		broken.

	autoconf=	[IPV6]
		See Documentation/networking/ipv6.rst.
@@ -1726,6 +1736,8 @@

		off: Disable GDS mitigation.

	gbpages		[X86] Use GB pages for kernel direct mappings.

	gcov_persist=	[GCOV] When non-zero (default), profiling data for
		kernel modules is saved and remains accessible via
		debugfs, even when the module is unloaded/reloaded.
@@ -2008,12 +2020,21 @@

	idle=		[X86,EARLY]
		Format: idle=poll, idle=halt, idle=nomwait
		Poll forces a polling idle loop that can slightly
		improve the performance of waking up an idle CPU, but
		will use a lot of power and make the system run hot.
		Not recommended.

		idle=poll: Don't do power saving in the idle loop
		using HLT, but poll for rescheduling events. This will
		make the CPUs eat a lot more power, but may be useful
		to get slightly better performance in multiprocessor
		benchmarks. It also makes some profiling using
		performance counters more accurate. Please note that
		on systems with MONITOR/MWAIT support (like Intel
		EM64T CPUs) this option has no performance advantage
		over the normal idle loop. It may also interact badly
		with hyperthreading.

		idle=halt: Halt is forced to be used for CPU idle.
		In such case C2/C3 won't be used again.

		idle=nomwait: Disable mwait for CPU C-states

	idxd.sva=	[HW]
@@ -2311,20 +2332,73 @@
		relaxed

	iommu=		[X86,EARLY]

		off
			Don't initialize and use any kind of IOMMU.

		force
			Force the use of the hardware IOMMU even when
			it is not actually needed (e.g. because < 3 GB
			memory).

		noforce
			Don't force hardware IOMMU usage when it is not
			needed. (default).

		biomerge
		panic
		nopanic
		merge
		nomerge

		soft
		pt		[X86]
		nopt		[X86]
		nobypass	[PPC/POWERNV]
			Use software bounce buffering (SWIOTLB) (default for
			Intel machines). This can be used to prevent the usage
			of an available hardware IOMMU.

		[X86]
		pt
		[X86]
		nopt
		[PPC/POWERNV]
		nobypass
			Disable IOMMU bypass, using IOMMU for PCI devices.

		[X86]
		AMD Gart HW IOMMU-specific options:

		<size>
			Set the size of the remapping area in bytes.

		allowed
			Overwrite iommu off workarounds for specific chipsets

		fullflush
			Flush IOMMU on each allocation (default).

		nofullflush
			Don't use IOMMU fullflush.

		memaper[=<order>]
			Allocate an own aperture over RAM with size
			32MB<<order. (default: order=1, i.e. 64MB)

		merge
			Do scatter-gather (SG) merging. Implies "force"
			(experimental).

		nomerge
			Don't do scatter-gather (SG) merging.

		noaperture
			Ask the IOMMU not to touch the aperture for AGP.

		noagp
			Don't initialize the AGP driver and use full aperture.

		panic
			Always panic when IOMMU overflows.

	iommu.forcedac=	[ARM64,X86,EARLY] Control IOVA allocation for PCI devices.
		Format: { "0" | "1" }
		0 - Try to allocate a 32-bit DMA address first, before
@@ -2432,7 +2506,9 @@
		specified in the flag list (default: domain):

		nohz
			Disable the tick when a single task runs.
			Disable the tick when a single task runs as well as
			disabling other kernel noises like having RCU callbacks
			offloaded. This is equivalent to the nohz_full parameter.

			A residual 1Hz tick is offloaded to workqueues, which you
			need to affine to housekeeping through the global
@@ -2695,7 +2771,7 @@
		VMs, i.e. on the 0=>1 and 1=>0 transitions of the
		number of VMs.

		Enabling virtualization at module lode avoids potential
		Enabling virtualization at module load avoids potential
		latency for creation of the 0=>1 VM, as KVM serializes
		virtualization enabling across all online CPUs. The
		"cost" of enabling virtualization when KVM is loaded,
@@ -3259,9 +3335,77 @@
		devices can be requested on-demand with the
		/dev/loop-control interface.

	mce		[X86-32] Machine Check Exception
	mce=		[X86-{32,64}]

		Please see Documentation/arch/x86/x86_64/machinecheck.rst for sysfs runtime tunables.

		off
			disable machine check

		no_cmci
			disable CMCI(Corrected Machine Check Interrupt) that
			Intel processor supports. Usually this disablement is
			not recommended, but it might be handy if your
			hardware is misbehaving.

			Note that you'll get more problems without CMCI than
			with due to the shared banks, i.e. you might get
			duplicated error logs.

		dont_log_ce
			don't make logs for corrected errors. All events
			reported as corrected are silently cleared by OS. This
			option will be useful if you have no interest in any
			of corrected errors.

		ignore_ce
			disable features for corrected errors, e.g.
			polling timer and CMCI. All events reported as
			corrected are not cleared by OS and remained in its
			error banks.

			Usually this disablement is not recommended, however
			if there is an agent checking/clearing corrected
			errors (e.g. BIOS or hardware monitoring
			applications), conflicting with OS's error handling,
			and you cannot deactivate the agent, then this option
			will be a help.

		no_lmce
			do not opt-in to Local MCE delivery. Use legacy method
			to broadcast MCEs.

		bootlog
			enable logging of machine checks left over from
			booting. Disabled by default on AMD Fam10h and older
			because some BIOS leave bogus ones.

			If your BIOS doesn't do that it's a good idea to
			enable though to make sure you log even machine check
			events that result in a reboot. On Intel systems it is
			enabled by default.

		nobootlog
			disable boot machine check logging.

		monarchtimeout (number)
			sets the time in us to wait for other CPUs on machine
			checks. 0 to disable.

		bios_cmci_threshold
			don't overwrite the bios-set CMCI threshold. This boot
			option prevents Linux from overwriting the CMCI
			threshold set by the bios. Without this option, Linux
			always sets the CMCI threshold to 1. Enabling this may
			make memory predictive failure analysis less effective
			if the bios sets thresholds for memory errors since we
			will not see details for all errors.

		recovery
			force-enable recoverable machine check code paths

		Everything else is in sysfs now.

	mce=option	[X86-64] See Documentation/arch/x86/x86_64/boot-options.rst

	md=		[HW] RAID subsystems devices and level
		See Documentation/admin-guide/md.rst.
@@ -3887,6 +4031,8 @@
	noapic		[SMP,APIC,EARLY] Tells the kernel to not make use of any
		IOAPICs that may be present in the system.

	noapictimer	[APIC,X86] Don't set up the APIC timer

	noautogroup	Disable scheduler automatic task group creation.

	nocache		[ARM,EARLY]

@@ -3934,6 +4080,8 @@
		register save and restore. The kernel will only save
		legacy floating-point registers on task switch.

	nogbpages	[X86] Do not use GB pages for kernel direct mappings.

	no_hash_pointers
		[KNL,EARLY]
		Force pointers printed to the console or buffers to be
@@ -3960,6 +4108,8 @@
		the impact of the sleep instructions. This is also
		useful when using JTAG debugger.

	nohpet		[X86] Don't use the HPET timer.

	nohugeiomap	[KNL,X86,PPC,ARM64,EARLY] Disable kernel huge I/O mappings.

	nohugevmalloc	[KNL,X86,PPC,ARM64,EARLY] Disable kernel huge vmalloc mappings.

@@ -4111,8 +4261,10 @@

	nosync		[HW,M68K] Disables sync negotiation for all devices.

	no_timer_check	[X86,APIC] Disables the code which tests for
		broken timer IRQ sources.
	no_timer_check	[X86,APIC] Disables the code which tests for broken
		timer IRQ sources, i.e., the IO-APIC timer. This can
		work around problems with incorrect timer
		initialization on some boards.

	no_uaccess_flush
		[PPC,EARLY] Don't flush the L1-D cache after accessing user data.
@@ -4192,6 +4344,11 @@
		If given as an integer followed by 'U', it will
		divide each physical node into N emulated nodes.

	numa=noacpi	[X86] Don't parse the SRAT table for NUMA setup

	numa=nohmat	[X86] Don't parse the HMAT table for NUMA setup, or
		soft-reserved memory partitioning.

	numa_balancing=	[KNL,ARM64,PPC,RISCV,S390,X86] Enable or disable automatic
		NUMA balancing.
		Allowed values are enable and disable
@@ -5715,6 +5872,55 @@
		reboot_cpu is s[mp]#### with #### being the processor
		to be used for rebooting.

		acpi
			Use the ACPI RESET_REG in the FADT. If ACPI is not
			configured or the ACPI reset does not work, the reboot
			path attempts the reset using the keyboard controller.

		bios
			Use the CPU reboot vector for warm reset

		cold
			Set the cold reboot flag

		default
			There are some built-in platform specific "quirks"
			- you may see: "reboot: <name> series board detected.
			Selecting <type> for reboots." In the case where you
			think the quirk is in error (e.g. you have newer BIOS,
			or newer board) using this option will ignore the
			built-in quirk table, and use the generic default
			reboot actions.

		efi
			Use efi reset_system runtime service. If EFI is not
			configured or the EFI reset does not work, the reboot
			path attempts the reset using the keyboard controller.

		force
			Don't stop other CPUs on reboot. This can make reboot
			more reliable in some cases.

		kbd
			Use the keyboard controller. cold reset (default)

		pci
			Use a write to the PCI config space register 0xcf9 to
			trigger reboot.

		triple
			Force a triple fault (init)

		warm
			Don't set the cold reboot flag

		Using warm reset will be much faster especially on big
		memory systems because the BIOS will not go through
		the memory check. Disadvantage is that not all
		hardware will be completely reinitialized on reboot so
		there may be boot problems on some systems.

	refscale.holdoff= [KNL]
		Set test-start holdoff period. The purpose of
		this parameter is to delay the start of the
@@ -6106,7 +6312,16 @@

	serialnumber	[BUGS=X86-32]

	sev=option[,option...] [X86-64] See Documentation/arch/x86/x86_64/boot-options.rst
	sev=option[,option...] [X86-64]

		debug
			Enable debug messages.

		nosnp
			Do not enable SEV-SNP (applies to host/hypervisor
			only). Setting 'nosnp' avoids the RMP check overhead
			in memory accesses when users do not want to run
			SEV-SNP guests.

	shapers=	[NET]
		Maximal number of shapers.
@@ -130,8 +130,126 @@ SNP feature support.

More details in AMD64 APM[1] Vol 2: 15.34.10 SEV_STATUS MSR

Reverse Map Table (RMP)
=======================

The RMP is a structure in system memory that is used to ensure a one-to-one
mapping between system physical addresses and guest physical addresses. Each
page of memory that is potentially assignable to guests has one entry within
the RMP.

The RMP table can be either contiguous in memory or a collection of segments
in memory.

Contiguous RMP
--------------

Support for this form of the RMP is present when support for SEV-SNP is
present, which can be determined using the CPUID instruction::

    0x8000001f[eax]:
        Bit[4] indicates support for SEV-SNP
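
For illustration only, a minimal user-space sketch that probes this CPUID bit
(an assumption of this example is that checking the bit is all you can do from
user space; the RMP MSRs described below are accessible to the kernel only)::

    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned int eax, ebx, ecx, edx;

        if (!__get_cpuid(0x8000001f, &eax, &ebx, &ecx, &edx)) {
            puts("CPUID leaf 0x8000001f not available");
            return 1;
        }

        /* Bit 4 of EAX indicates SEV-SNP support */
        printf("SEV-SNP supported: %s\n", (eax & (1u << 4)) ? "yes" : "no");
        return 0;
    }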

The location of the RMP is identified to the hardware through two MSRs::

    0xc0010132 (RMP_BASE):
        System physical address of the first byte of the RMP

    0xc0010133 (RMP_END):
        System physical address of the last byte of the RMP

Hardware requires that RMP_BASE and (RMP_END + 1) be 8KB aligned, but SEV
firmware increases the alignment requirement to require a 1MB alignment.

The RMP consists of a 16KB region used for processor bookkeeping followed
by the RMP entries, which are 16 bytes in size. The size of the RMP
determines the range of physical memory that the hypervisor can assign to
SEV-SNP guests. The RMP covers the system physical address range from::

    0 to ((RMP_END + 1 - RMP_BASE - 16KB) / 16B) x 4KB.

The current Linux support relies on BIOS to allocate/reserve the memory for
the RMP and to set RMP_BASE and RMP_END appropriately. Linux uses the MSR
values to locate the RMP and determine the size of the RMP. The RMP must
cover all of system memory in order for Linux to enable SEV-SNP.
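
As a worked example of the coverage formula above (the RMP_BASE/RMP_END values
here are invented for illustration; the real ones come from the MSRs)::

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        /* 16KB of bookkeeping plus 16MB of 16-byte entries, 1MB aligned */
        uint64_t rmp_base = 0x80000000ULL;
        uint64_t rmp_end  = rmp_base + (16ULL << 10) + (16ULL << 20) - 1;

        uint64_t entries  = (rmp_end + 1 - rmp_base - (16ULL << 10)) / 16;
        uint64_t covered  = entries * 4096;   /* one entry per 4KB page */

        /* 16MB of entries -> 1M entries -> 4GB of covered physical memory */
        printf("RMP covers physical addresses 0 .. %#llx\n",
               (unsigned long long)(covered - 1));
        return 0;
    }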

Segmented RMP
-------------

Segmented RMP support is a new way of representing the layout of an RMP.
Initial RMP support required the RMP table to be contiguous in memory.
RMP accesses from a NUMA node on which the RMP doesn't reside
can take longer than accesses from a NUMA node on which the RMP resides.
Segmented RMP support allows the RMP entries to be located on the same
node as the memory the RMP is covering, potentially reducing latency
associated with accessing an RMP entry associated with the memory. Each
RMP segment covers a specific range of system physical addresses.

Support for this form of the RMP can be determined using the CPUID
instruction::

    0x8000001f[eax]:
        Bit[23] indicates support for segmented RMP

If supported, segmented RMP attributes can be found using the CPUID
instruction::

    0x80000025[eax]:
        Bits[5:0]  minimum supported RMP segment size
        Bits[11:6] maximum supported RMP segment size

    0x80000025[ebx]:
        Bits[9:0]  number of cacheable RMP segment definitions
        Bit[10]    indicates if the number of cacheable RMP segments
                   is a hard limit

To enable a segmented RMP, a new MSR is available::

    0xc0010136 (RMP_CFG):
        Bit[0]     indicates if segmented RMP is enabled
        Bits[13:8] contains the size of memory covered by an RMP
                   segment (expressed as a power of 2)

The RMP segment size defined in the RMP_CFG MSR applies to all segments
of the RMP. Therefore each RMP segment covers a specific range of system
physical addresses. For example, if the RMP_CFG MSR value is 0x2401, then
the RMP segment coverage value is 0x24 => 36, meaning the size of memory
covered by an RMP segment is 64GB (1 << 36). So the first RMP segment
covers physical addresses from 0 to 0xF_FFFF_FFFF, the second RMP segment
covers physical addresses from 0x10_0000_0000 to 0x1F_FFFF_FFFF, etc.
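
The same arithmetic in C, using the example RMP_CFG value from the text (the
physical address below is an arbitrary one chosen for illustration)::

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t rmp_cfg   = 0x2401;                /* example from the text */
        unsigned int shift = (rmp_cfg >> 8) & 0x3f; /* 0x24 = 36 */
        uint64_t seg_size  = 1ULL << shift;         /* 64GB per segment */
        uint64_t paddr     = 0x1200000000ULL;       /* arbitrary address */

        printf("segmented RMP enabled: %llu\n",
               (unsigned long long)(rmp_cfg & 1));
        printf("segment size: %llu GB\n",
               (unsigned long long)(seg_size >> 30));
        /* 0x12_0000_0000 >> 36 == 1, i.e. the second segment */
        printf("%#llx falls in RMP segment %llu\n",
               (unsigned long long)paddr,
               (unsigned long long)(paddr >> shift));
        return 0;
    }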

When a segmented RMP is enabled, RMP_BASE points to the RMP bookkeeping
area as it does today (16K in size). However, instead of RMP entries
beginning immediately after the bookkeeping area, there is a 4K RMP
segment table (RST). Each entry in the RST is 8 bytes in size and represents
an RMP segment::

    Bits[19:0]  mapped size (in GB)
                The mapped size can be less than the defined segment size.
                A value of zero indicates that no RMP exists for the range
                of system physical addresses associated with this segment.
    Bits[51:20] segment physical address
                This address is left shifted 20 bits (or just masked when
                read) to form the physical address of the segment (1MB
                alignment).
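
A hedged sketch of decoding one RST entry with the layout above (the entry
value is made up; this is not the kernel's internal representation)::

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        /* hypothetical entry: segment RMP at 2GB, mapping 64GB */
        uint64_t rst_entry = 0x80000040ULL;

        uint64_t mapped_gb = rst_entry & 0xfffffULL;            /* bits 19:0 */
        uint64_t seg_paddr = rst_entry & (0xffffffffULL << 20); /* bits 51:20 */

        if (!mapped_gb) {
            puts("no RMP for this segment's address range");
            return 0;
        }

        printf("segment RMP at %#llx, mapping %llu GB\n",
               (unsigned long long)seg_paddr,
               (unsigned long long)mapped_gb);
        return 0;
    }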

The RST can hold 512 segment entries but can be limited in size to the number
of cacheable RMP segments (CPUID 0x80000025_EBX[9:0]) if the number of cacheable
RMP segments is a hard limit (CPUID 0x80000025_EBX[10]).

The current Linux support relies on BIOS to allocate/reserve the memory for
the segmented RMP (the bookkeeping area, RST, and all segments), build the RST
and to set RMP_BASE, RMP_END, and RMP_CFG appropriately. Linux uses the MSR
values to locate the RMP and determine the size and location of the RMP
segments. The RMP must cover all of system memory in order for Linux to enable
SEV-SNP.

More details in the AMD64 APM Vol 2, section "15.36.3 Reverse Map Table",
docID: 24593.

Secure VM Service Module (SVSM)
===============================

SNP provides a feature called Virtual Machine Privilege Levels (VMPL) which
defines four privilege levels at which guest software can run. The most
privileged level is 0 and numerically higher numbers have lesser privileges.
@@ -384,6 +384,16 @@ When monitoring is enabled all MON groups will also contain:
	Available only with debug option. The identifier used by hardware
	for the monitor group. On x86 this is the RMID.

When the "mba_MBps" mount option is used all CTRL_MON groups will also contain:

"mba_MBps_event":
	Reading this file shows which memory bandwidth event is used
	as input to the software feedback loop that keeps memory bandwidth
	below the value specified in the schemata file. Writing the
	name of one of the supported memory bandwidth events found in
	/sys/fs/resctrl/info/L3_MON/mon_features changes the input
	event.
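
A minimal sketch of switching the input event for the default group, assuming
resctrl is mounted with the "mba_MBps" option and that "mbm_local_bytes" is
listed in mon_features (both are assumptions of this example)::

    #include <stdio.h>

    int main(void)
    {
        const char *path = "/sys/fs/resctrl/mba_MBps_event";
        char current[64] = "";
        FILE *f = fopen(path, "r");

        if (f) {
            if (fgets(current, sizeof(current), f))
                printf("current event: %s", current);
            fclose(f);
        }

        f = fopen(path, "w");
        if (!f) {
            perror(path);
            return 1;
        }
        fputs("mbm_local_bytes\n", f);  /* pick a supported event */
        fclose(f);
        return 0;
    }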

Resource allocation rules
-------------------------
@@ -135,6 +135,10 @@ Thread-related topology information in the kernel:

    The ID of the core to which a thread belongs. It is also printed in /proc/cpuinfo
    "core_id."

  - topology_logical_core_id();

    The logical core ID to which a thread belongs.
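
As a hedged kernel-side illustration (assuming the helper above is available
on the architecture in question), the new ID can be dumped next to the
existing one::

    #include <linux/init.h>
    #include <linux/cpumask.h>
    #include <linux/printk.h>
    #include <linux/topology.h>

    static int __init dump_core_ids(void)
    {
        unsigned int cpu;

        /* Print physical and logical core IDs for every online CPU */
        for_each_online_cpu(cpu)
            pr_info("CPU%u: core_id=%d logical_core_id=%d\n",
                    cpu, topology_core_id(cpu),
                    topology_logical_core_id(cpu));
        return 0;
    }
    late_initcall(dump_core_ids);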

System topology examples
@ -1,312 +0,0 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
===========================
|
||||
AMD64 Specific Boot Options
|
||||
===========================
|
||||
|
||||
There are many others (usually documented in driver documentation), but
|
||||
only the AMD64 specific ones are listed here.
|
||||
|
||||
Machine check
|
||||
=============
|
||||
Please see Documentation/arch/x86/x86_64/machinecheck.rst for sysfs runtime tunables.
|
||||
|
||||
mce=off
|
||||
Disable machine check
|
||||
mce=no_cmci
|
||||
Disable CMCI(Corrected Machine Check Interrupt) that
|
||||
Intel processor supports. Usually this disablement is
|
||||
not recommended, but it might be handy if your hardware
|
||||
is misbehaving.
|
||||
Note that you'll get more problems without CMCI than with
|
||||
due to the shared banks, i.e. you might get duplicated
|
||||
error logs.
|
||||
mce=dont_log_ce
|
||||
Don't make logs for corrected errors. All events reported
|
||||
as corrected are silently cleared by OS.
|
||||
This option will be useful if you have no interest in any
|
||||
of corrected errors.
|
||||
mce=ignore_ce
|
||||
Disable features for corrected errors, e.g. polling timer
|
||||
and CMCI. All events reported as corrected are not cleared
|
||||
by OS and remained in its error banks.
|
||||
Usually this disablement is not recommended, however if
|
||||
there is an agent checking/clearing corrected errors
|
||||
(e.g. BIOS or hardware monitoring applications), conflicting
|
||||
with OS's error handling, and you cannot deactivate the agent,
|
||||
then this option will be a help.
|
||||
mce=no_lmce
|
||||
Do not opt-in to Local MCE delivery. Use legacy method
|
||||
to broadcast MCEs.
|
||||
mce=bootlog
|
||||
Enable logging of machine checks left over from booting.
|
||||
Disabled by default on AMD Fam10h and older because some BIOS
|
||||
leave bogus ones.
|
||||
If your BIOS doesn't do that it's a good idea to enable though
|
||||
to make sure you log even machine check events that result
|
||||
in a reboot. On Intel systems it is enabled by default.
|
||||
mce=nobootlog
|
||||
Disable boot machine check logging.
|
||||
mce=monarchtimeout (number)
|
||||
monarchtimeout:
|
||||
Sets the time in us to wait for other CPUs on machine checks. 0
|
||||
to disable.
|
||||
mce=bios_cmci_threshold
|
||||
Don't overwrite the bios-set CMCI threshold. This boot option
|
||||
prevents Linux from overwriting the CMCI threshold set by the
|
||||
bios. Without this option, Linux always sets the CMCI
|
||||
threshold to 1. Enabling this may make memory predictive failure
|
||||
analysis less effective if the bios sets thresholds for memory
|
||||
errors since we will not see details for all errors.
|
||||
mce=recovery
|
||||
Force-enable recoverable machine check code paths
|
||||
|
||||
nomce (for compatibility with i386)
|
||||
same as mce=off
|
||||
|
||||
Everything else is in sysfs now.
|
||||
|
||||
APICs
|
||||
=====
|
||||
|
||||
apic
|
||||
Use IO-APIC. Default
|
||||
|
||||
noapic
|
||||
Don't use the IO-APIC.
|
||||
|
||||
disableapic
|
||||
Don't use the local APIC
|
||||
|
||||
nolapic
|
||||
Don't use the local APIC (alias for i386 compatibility)
|
||||
|
||||
pirq=...
|
||||
See Documentation/arch/x86/i386/IO-APIC.rst
|
||||
|
||||
noapictimer
|
||||
Don't set up the APIC timer
|
||||
|
||||
no_timer_check
|
||||
Don't check the IO-APIC timer. This can work around
|
||||
problems with incorrect timer initialization on some boards.
|
||||
|
||||
apicpmtimer
|
||||
Do APIC timer calibration using the pmtimer. Implies
|
||||
apicmaintimer. Useful when your PIT timer is totally broken.
|
||||
|
||||
Timing
|
||||
======
|
||||
|
||||
notsc
|
||||
Deprecated, use tsc=unstable instead.
|
||||
|
||||
nohpet
|
||||
Don't use the HPET timer.
|
||||
|
||||
Idle loop
|
||||
=========
|
||||
|
||||
idle=poll
|
||||
Don't do power saving in the idle loop using HLT, but poll for rescheduling
|
||||
event. This will make the CPUs eat a lot more power, but may be useful
|
||||
to get slightly better performance in multiprocessor benchmarks. It also
|
||||
makes some profiling using performance counters more accurate.
|
||||
Please note that on systems with MONITOR/MWAIT support (like Intel EM64T
|
||||
CPUs) this option has no performance advantage over the normal idle loop.
|
||||
It may also interact badly with hyperthreading.
|
||||
|
||||
Rebooting
|
||||
=========
|
||||
|
||||
reboot=b[ios] | t[riple] | k[bd] | a[cpi] | e[fi] | p[ci] [, [w]arm | [c]old]
|
||||
bios
|
||||
Use the CPU reboot vector for warm reset
|
||||
warm
|
||||
Don't set the cold reboot flag
|
||||
cold
|
||||
Set the cold reboot flag
|
||||
triple
|
||||
Force a triple fault (init)
|
||||
kbd
|
||||
Use the keyboard controller. cold reset (default)
|
||||
acpi
|
||||
Use the ACPI RESET_REG in the FADT. If ACPI is not configured or
|
||||
the ACPI reset does not work, the reboot path attempts the reset
|
||||
using the keyboard controller.
|
||||
efi
|
||||
Use efi reset_system runtime service. If EFI is not configured or
|
||||
the EFI reset does not work, the reboot path attempts the reset using
|
||||
the keyboard controller.
|
||||
pci
|
||||
Use a write to the PCI config space register 0xcf9 to trigger reboot.
|
||||
|
||||
Using warm reset will be much faster especially on big memory
|
||||
systems because the BIOS will not go through the memory check.
|
||||
Disadvantage is that not all hardware will be completely reinitialized
|
||||
on reboot so there may be boot problems on some systems.
|
||||
|
||||
reboot=force
|
||||
Don't stop other CPUs on reboot. This can make reboot more reliable
|
||||
in some cases.
|
||||
|
||||
reboot=default
|
||||
There are some built-in platform specific "quirks" - you may see:
|
||||
"reboot: <name> series board detected. Selecting <type> for reboots."
|
||||
In the case where you think the quirk is in error (e.g. you have
|
||||
newer BIOS, or newer board) using this option will ignore the built-in
|
||||
quirk table, and use the generic default reboot actions.
|
||||
|
||||
NUMA
|
||||
====
|
||||
|
||||
numa=off
|
||||
Only set up a single NUMA node spanning all memory.
|
||||
|
||||
numa=noacpi
|
||||
Don't parse the SRAT table for NUMA setup
|
||||
|
||||
numa=nohmat
|
||||
Don't parse the HMAT table for NUMA setup, or soft-reserved memory
|
||||
partitioning.
|
||||
|
||||
ACPI
|
||||
====
|
||||
|
||||
acpi=off
|
||||
Don't enable ACPI
|
||||
acpi=ht
|
||||
Use ACPI boot table parsing, but don't enable ACPI interpreter
|
||||
acpi=force
|
||||
Force ACPI on (currently not needed)
|
||||
acpi=strict
|
||||
Disable out of spec ACPI workarounds.
|
||||
acpi_sci={edge,level,high,low}
|
||||
Set up ACPI SCI interrupt.
|
||||
acpi=noirq
|
||||
Don't route interrupts
|
||||
acpi=nocmcff
|
||||
Disable firmware first mode for corrected errors. This
|
||||
disables parsing the HEST CMC error source to check if
|
||||
firmware has set the FF flag. This may result in
|
||||
duplicate corrected error reports.
|
||||
|
||||
PCI
|
||||
===
|
||||
|
||||
pci=off
|
||||
Don't use PCI
|
||||
pci=conf1
|
||||
Use conf1 access.
|
||||
pci=conf2
|
||||
Use conf2 access.
|
||||
pci=rom
|
||||
Assign ROMs.
|
||||
pci=assign-busses
|
||||
Assign busses
|
||||
pci=irqmask=MASK
|
||||
Set PCI interrupt mask to MASK
|
||||
pci=lastbus=NUMBER
|
||||
Scan up to NUMBER busses, no matter what the mptable says.
|
||||
pci=noacpi
|
||||
Don't use ACPI to set up PCI interrupt routing.
|
||||
|
||||
IOMMU (input/output memory management unit)
|
||||
===========================================
|
||||
Multiple x86-64 PCI-DMA mapping implementations exist, for example:
|
||||
|
||||
1. <kernel/dma/direct.c>: use no hardware/software IOMMU at all
|
||||
(e.g. because you have < 3 GB memory).
|
||||
Kernel boot message: "PCI-DMA: Disabling IOMMU"
|
||||
|
||||
2. <arch/x86/kernel/amd_gart_64.c>: AMD GART based hardware IOMMU.
|
||||
Kernel boot message: "PCI-DMA: using GART IOMMU"
|
||||
|
||||
3. <arch/x86_64/kernel/pci-swiotlb.c> : Software IOMMU implementation. Used
|
||||
e.g. if there is no hardware IOMMU in the system and it is need because
|
||||
you have >3GB memory or told the kernel to us it (iommu=soft))
|
||||
Kernel boot message: "PCI-DMA: Using software bounce buffering
|
||||
for IO (SWIOTLB)"
|
||||
|
||||
::
|
||||
|
||||
iommu=[<size>][,noagp][,off][,force][,noforce]
|
||||
[,memaper[=<order>]][,merge][,fullflush][,nomerge]
|
||||
[,noaperture]
|
||||
|
||||
General iommu options:
|
||||
|
||||
off
|
||||
Don't initialize and use any kind of IOMMU.
|
||||
noforce
|
||||
Don't force hardware IOMMU usage when it is not needed. (default).
|
||||
force
|
||||
Force the use of the hardware IOMMU even when it is
|
||||
not actually needed (e.g. because < 3 GB memory).
|
||||
soft
|
||||
Use software bounce buffering (SWIOTLB) (default for
|
||||
Intel machines). This can be used to prevent the usage
|
||||
of an available hardware IOMMU.
|
||||
|
||||
iommu options only relevant to the AMD GART hardware IOMMU:
|
||||
|
||||
<size>
|
||||
Set the size of the remapping area in bytes.
|
||||
allowed
|
||||
Overwrite iommu off workarounds for specific chipsets.
|
||||
fullflush
|
||||
Flush IOMMU on each allocation (default).
|
||||
nofullflush
|
||||
Don't use IOMMU fullflush.
|
||||
memaper[=<order>]
|
||||
Allocate an own aperture over RAM with size 32MB<<order.
|
||||
(default: order=1, i.e. 64MB)
|
||||
merge
|
||||
Do scatter-gather (SG) merging. Implies "force" (experimental).
|
||||
nomerge
|
||||
Don't do scatter-gather (SG) merging.
|
||||
noaperture
|
||||
Ask the IOMMU not to touch the aperture for AGP.
|
||||
noagp
|
||||
Don't initialize the AGP driver and use full aperture.
|
||||
panic
|
||||
Always panic when IOMMU overflows.
|
||||
|
||||
iommu options only relevant to the software bounce buffering (SWIOTLB) IOMMU
|
||||
implementation:
|
||||
|
||||
swiotlb=<slots>[,force,noforce]
|
||||
<slots>
|
||||
Prereserve that many 2K slots for the software IO bounce buffering.
|
||||
force
|
||||
Force all IO through the software TLB.
|
||||
noforce
|
||||
Do not initialize the software TLB.
|
||||
|
||||
|
||||
Miscellaneous
|
||||
=============
|
||||
|
||||
nogbpages
|
||||
Do not use GB pages for kernel direct mappings.
|
||||
gbpages
|
||||
Use GB pages for kernel direct mappings.
|
||||
|
||||
|
||||
AMD SEV (Secure Encrypted Virtualization)
|
||||
=========================================
|
||||
Options relating to AMD SEV, specified via the following format:
|
||||
|
||||
::
|
||||
|
||||
sev=option1[,option2]
|
||||
|
||||
The available options are:
|
||||
|
||||
debug
|
||||
Enable debug messages.
|
||||
|
||||
nosnp
|
||||
Do not enable SEV-SNP (applies to host/hypervisor only). Setting
|
||||
'nosnp' avoids the RMP check overhead in memory accesses when
|
||||
users do not want to run SEV-SNP guests.
|
@@ -18,7 +18,7 @@ For more information on the features of cpusets, see
Documentation/admin-guide/cgroup-v1/cpusets.rst.
There are a number of different configurations you can use for your needs. For
more information on the numa=fake command line option and its various ways of
configuring fake nodes, see Documentation/arch/x86/x86_64/boot-options.rst.
configuring fake nodes, see Documentation/admin-guide/kernel-parameters.txt

For the purposes of this introduction, we'll assume a very primitive NUMA
emulation setup of "numa=fake=4*512,". This will split our system memory into
@@ -7,7 +7,6 @@ x86_64 Support
.. toctree::
   :maxdepth: 2

   boot-options
   uefi
   mm
   5level-paging
@@ -2,6 +2,12 @@
Scheduler Statistics
====================

Version 17 of schedstats removed 'lb_imbalance' field as it has no
significance anymore and instead added more relevant fields namely
'lb_imbalance_load', 'lb_imbalance_util', 'lb_imbalance_task' and
'lb_imbalance_misfit'. The domain field prints the name of the
corresponding sched domain from this version onwards.

Version 16 of schedstats changed the order of definitions within
'enum cpu_idle_type', which changed the order of [CPU_MAX_IDLE_TYPES]
columns in show_schedstat(). In particular the position of CPU_IDLE

@@ -9,7 +15,9 @@ and __CPU_NOT_IDLE changed places. The size of the array is unchanged.

Version 15 of schedstats dropped counters for some sched_yield:
yld_exp_empty, yld_act_empty and yld_both_empty. Otherwise, it is
identical to version 14.
identical to version 14. Details are available at

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/scheduler/sched-stats.txt?id=1e1dbb259c79b

Version 14 of schedstats includes support for sched_domains, which hit the
mainline kernel in 2.6.20 although it is identical to the stats from version

@@ -26,7 +34,14 @@ cpus on the machine, while domain0 is the most tightly focused domain,
sometimes balancing only between pairs of cpus. At this time, there
are no architectures which need more than three domain levels. The first
field in the domain stats is a bit map indicating which cpus are affected
by that domain.
by that domain. Details are available at

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/sched-stats.txt?id=b762f3ffb797c

The schedstat documentation is maintained version 10 onwards and is not
updated for version 11 and 12. The details for version 10 are available at

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/sched-stats.txt?id=1da177e4c3f4

These fields are counters, and only increment. Programs which make use
of these will need to start with a baseline observation and then calculate
@ -71,88 +86,97 @@ Domain statistics
|
||||
-----------------
|
||||
One of these is produced per domain for each cpu described. (Note that if
|
||||
CONFIG_SMP is not defined, *no* domains are utilized and these lines
|
||||
will not appear in the output.)
|
||||
will not appear in the output. <name> is an extension to the domain field
|
||||
that prints the name of the corresponding sched domain. It can appear in
|
||||
schedstat version 17 and above, and requires CONFIG_SCHED_DEBUG.)
|
||||
|
||||
domain<N> <cpumask> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
|
||||
domain<N> <name> <cpumask> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45
|
||||
|
||||
The first field is a bit mask indicating what cpus this domain operates over.
|
||||
|
||||
The next 24 are a variety of sched_balance_rq() statistics in grouped into types
|
||||
of idleness (idle, busy, and newly idle):
|
||||
The next 33 are a variety of sched_balance_rq() statistics in grouped into types
|
||||
of idleness (busy, idle and newly idle):
|
||||
|
||||
1) # of times in this domain sched_balance_rq() was called when the
|
||||
cpu was idle
|
||||
2) # of times in this domain sched_balance_rq() checked but found
|
||||
the load did not require balancing when the cpu was idle
|
||||
3) # of times in this domain sched_balance_rq() tried to move one or
|
||||
more tasks and failed, when the cpu was idle
|
||||
4) sum of imbalances discovered (if any) with each call to
|
||||
sched_balance_rq() in this domain when the cpu was idle
|
||||
5) # of times in this domain pull_task() was called when the cpu
|
||||
was idle
|
||||
6) # of times in this domain pull_task() was called even though
|
||||
the target task was cache-hot when idle
|
||||
7) # of times in this domain sched_balance_rq() was called but did
|
||||
not find a busier queue while the cpu was idle
|
||||
8) # of times in this domain a busier queue was found while the
|
||||
cpu was idle but no busier group was found
|
||||
9) # of times in this domain sched_balance_rq() was called when the
|
||||
cpu was busy
|
||||
10) # of times in this domain sched_balance_rq() checked but found the
|
||||
2) # of times in this domain sched_balance_rq() checked but found the
|
||||
load did not require balancing when busy
|
||||
11) # of times in this domain sched_balance_rq() tried to move one or
|
||||
3) # of times in this domain sched_balance_rq() tried to move one or
|
||||
more tasks and failed, when the cpu was busy
|
||||
12) sum of imbalances discovered (if any) with each call to
|
||||
sched_balance_rq() in this domain when the cpu was busy
|
||||
13) # of times in this domain pull_task() was called when busy
|
||||
14) # of times in this domain pull_task() was called even though the
|
||||
4) Total imbalance in load when the cpu was busy
|
||||
5) Total imbalance in utilization when the cpu was busy
|
||||
6) Total imbalance in number of tasks when the cpu was busy
|
||||
7) Total imbalance due to misfit tasks when the cpu was busy
|
||||
8) # of times in this domain pull_task() was called when busy
|
||||
9) # of times in this domain pull_task() was called even though the
|
||||
target task was cache-hot when busy
|
||||
15) # of times in this domain sched_balance_rq() was called but did not
|
||||
10) # of times in this domain sched_balance_rq() was called but did not
|
||||
find a busier queue while the cpu was busy
|
||||
16) # of times in this domain a busier queue was found while the cpu
|
||||
11) # of times in this domain a busier queue was found while the cpu
|
||||
was busy but no busier group was found
|
||||
|
||||
17) # of times in this domain sched_balance_rq() was called when the
|
||||
cpu was just becoming idle
|
||||
18) # of times in this domain sched_balance_rq() checked but found the
|
||||
12) # of times in this domain sched_balance_rq() was called when the
|
||||
cpu was idle
|
||||
13) # of times in this domain sched_balance_rq() checked but found
|
||||
the load did not require balancing when the cpu was idle
|
||||
14) # of times in this domain sched_balance_rq() tried to move one or
|
||||
more tasks and failed, when the cpu was idle
|
||||
15) Total imbalance in load when the cpu was idle
|
||||
16) Total imbalance in utilization when the cpu was idle
|
||||
17) Total imbalance in number of tasks when the cpu was idle
|
||||
18) Total imbalance due to misfit tasks when the cpu was idle
|
||||
19) # of times in this domain pull_task() was called when the cpu
|
||||
was idle
|
||||
20) # of times in this domain pull_task() was called even though
|
||||
the target task was cache-hot when idle
|
||||
21) # of times in this domain sched_balance_rq() was called but did
|
||||
not find a busier queue while the cpu was idle
|
||||
22) # of times in this domain a busier queue was found while the
|
||||
cpu was idle but no busier group was found
|
||||
|
||||
23) # of times in this domain sched_balance_rq() was called when the
|
||||
was just becoming idle
|
||||
24) # of times in this domain sched_balance_rq() checked but found the
|
||||
load did not require balancing when the cpu was just becoming idle
|
||||
19) # of times in this domain sched_balance_rq() tried to move one or more
|
||||
25) # of times in this domain sched_balance_rq() tried to move one or more
|
||||
tasks and failed, when the cpu was just becoming idle
|
||||
20) sum of imbalances discovered (if any) with each call to
|
||||
sched_balance_rq() in this domain when the cpu was just becoming idle
|
||||
21) # of times in this domain pull_task() was called when newly idle
|
||||
22) # of times in this domain pull_task() was called even though the
|
||||
26) Total imbalance in load when the cpu was just becoming idle
|
||||
27) Total imbalance in utilization when the cpu was just becoming idle
|
||||
28) Total imbalance in number of tasks when the cpu was just becoming idle
|
||||
29) Total imbalance due to misfit tasks when the cpu was just becoming idle
|
||||
30) # of times in this domain pull_task() was called when newly idle
|
||||
31) # of times in this domain pull_task() was called even though the
|
||||
target task was cache-hot when just becoming idle
|
||||
23) # of times in this domain sched_balance_rq() was called but did not
|
||||
32) # of times in this domain sched_balance_rq() was called but did not
|
||||
find a busier queue while the cpu was just becoming idle
|
||||
24) # of times in this domain a busier queue was found while the cpu
|
||||
33) # of times in this domain a busier queue was found while the cpu
|
||||
was just becoming idle but no busier group was found
|
||||
|
||||
Next three are active_load_balance() statistics:
|
||||
|
||||
25) # of times active_load_balance() was called
|
||||
26) # of times active_load_balance() tried to move a task and failed
|
||||
27) # of times active_load_balance() successfully moved a task
|
||||
34) # of times active_load_balance() was called
|
||||
35) # of times active_load_balance() tried to move a task and failed
|
||||
36) # of times active_load_balance() successfully moved a task
|
||||
|
||||
Next three are sched_balance_exec() statistics:
|
||||
|
||||
28) sbe_cnt is not used
|
||||
29) sbe_balanced is not used
|
||||
30) sbe_pushed is not used
|
||||
37) sbe_cnt is not used
|
||||
38) sbe_balanced is not used
|
||||
39) sbe_pushed is not used
|
||||
|
||||
Next three are sched_balance_fork() statistics:
|
||||
|
||||
31) sbf_cnt is not used
|
||||
32) sbf_balanced is not used
|
||||
33) sbf_pushed is not used
|
||||
40) sbf_cnt is not used
|
||||
41) sbf_balanced is not used
|
||||
42) sbf_pushed is not used
|
||||
|
||||
Next three are try_to_wake_up() statistics:
|
||||
|
||||
34) # of times in this domain try_to_wake_up() awoke a task that
|
||||
43) # of times in this domain try_to_wake_up() awoke a task that
|
||||
last ran on a different cpu in this domain
|
||||
35) # of times in this domain try_to_wake_up() moved a task to the
|
||||
44) # of times in this domain try_to_wake_up() moved a task to the
|
||||
waking cpu because it was cache-cold on its own cpu anyway
|
||||
36) # of times in this domain try_to_wake_up() started passive balancing
|
||||
45) # of times in this domain try_to_wake_up() started passive balancing
|
||||
|
||||
/proc/<pid>/schedstat
|
||||
---------------------
|
||||
|
17
MAINTAINERS
17
MAINTAINERS
@ -1120,6 +1120,14 @@ L: linux-i2c@vger.kernel.org
|
||||
S: Supported
|
||||
F: drivers/i2c/busses/i2c-amd-asf-plat.c
|
||||
|
||||
AMD NODE DRIVER
|
||||
M: Mario Limonciello <mario.limonciello@amd.com>
|
||||
M: Yazen Ghannam <yazen.ghannam@amd.com>
|
||||
L: linux-kernel@vger.kernel.org
|
||||
S: Supported
|
||||
F: arch/x86/include/asm/amd_node.h
|
||||
F: arch/x86/kernel/amd_node.c
|
||||
|
||||
AMD PDS CORE DRIVER
|
||||
M: Shannon Nelson <shannon.nelson@amd.com>
|
||||
M: Brett Creeley <brett.creeley@amd.com>
|
||||
@ -13480,8 +13488,8 @@ LOCKING PRIMITIVES
|
||||
M: Peter Zijlstra <peterz@infradead.org>
|
||||
M: Ingo Molnar <mingo@redhat.com>
|
||||
M: Will Deacon <will@kernel.org>
|
||||
M: Boqun Feng <boqun.feng@gmail.com> (LOCKDEP & RUST)
|
||||
R: Waiman Long <longman@redhat.com>
|
||||
R: Boqun Feng <boqun.feng@gmail.com> (LOCKDEP)
|
||||
L: linux-kernel@vger.kernel.org
|
||||
S: Maintained
|
||||
T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git locking/core
|
||||
@ -13495,6 +13503,11 @@ F: include/linux/seqlock.h
|
||||
F: include/linux/spinlock*.h
|
||||
F: kernel/locking/
|
||||
F: lib/locking*.[ch]
|
||||
F: rust/helpers/mutex.c
|
||||
F: rust/helpers/spinlock.c
|
||||
F: rust/kernel/sync/lock.rs
|
||||
F: rust/kernel/sync/lock/
|
||||
F: rust/kernel/sync/locked_by.rs
|
||||
X: kernel/locking/locktorture.c
|
||||
|
||||
LOGICAL DISK MANAGER SUPPORT (LDM, Windows 2000/XP/Vista Dynamic Disks)
|
||||
@ -22546,7 +22559,7 @@ F: arch/*/kernel/static_call.c
|
||||
F: include/linux/jump_label*.h
|
||||
F: include/linux/static_call*.h
|
||||
F: kernel/jump_label.c
|
||||
F: kernel/static_call.c
|
||||
F: kernel/static_call*.c
|
||||
|
||||
STI AUDIO (ASoC) DRIVERS
|
||||
M: Arnaud Pouliquen <arnaud.pouliquen@foss.st.com>
|
||||
|
@ -127,29 +127,6 @@ void crash_smp_send_stop(void)
|
||||
cpus_stopped = 1;
|
||||
}
|
||||
|
||||
static void machine_kexec_mask_interrupts(void)
|
||||
{
|
||||
unsigned int i;
|
||||
struct irq_desc *desc;
|
||||
|
||||
for_each_irq_desc(i, desc) {
|
||||
struct irq_chip *chip;
|
||||
|
||||
chip = irq_desc_get_chip(desc);
|
||||
if (!chip)
|
||||
continue;
|
||||
|
||||
if (chip->irq_eoi && irqd_irq_inprogress(&desc->irq_data))
|
||||
chip->irq_eoi(&desc->irq_data);
|
||||
|
||||
if (chip->irq_mask)
|
||||
chip->irq_mask(&desc->irq_data);
|
||||
|
||||
if (chip->irq_disable && !irqd_irq_disabled(&desc->irq_data))
|
||||
chip->irq_disable(&desc->irq_data);
|
||||
}
|
||||
}
|
||||
|
||||
void machine_crash_shutdown(struct pt_regs *regs)
|
||||
{
|
||||
local_irq_disable();
|
||||
|
@ -149,6 +149,7 @@ config ARM64
|
||||
select GENERIC_IDLE_POLL_SETUP
|
||||
select GENERIC_IOREMAP
|
||||
select GENERIC_IRQ_IPI
|
||||
select GENERIC_IRQ_KEXEC_CLEAR_VM_FORWARD
|
||||
select GENERIC_IRQ_PROBE
|
||||
select GENERIC_IRQ_SHOW
|
||||
select GENERIC_IRQ_SHOW_LEVEL
|
||||
|
@ -207,37 +207,6 @@ void machine_kexec(struct kimage *kimage)
|
||||
BUG(); /* Should never get here. */
|
||||
}
|
||||
|
||||
static void machine_kexec_mask_interrupts(void)
|
||||
{
|
||||
unsigned int i;
|
||||
struct irq_desc *desc;
|
||||
|
||||
for_each_irq_desc(i, desc) {
|
||||
struct irq_chip *chip;
|
||||
int ret;
|
||||
|
||||
chip = irq_desc_get_chip(desc);
|
||||
if (!chip)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* First try to remove the active state. If this
|
||||
* fails, try to EOI the interrupt.
|
||||
*/
|
||||
ret = irq_set_irqchip_state(i, IRQCHIP_STATE_ACTIVE, false);
|
||||
|
||||
if (ret && irqd_irq_inprogress(&desc->irq_data) &&
|
||||
chip->irq_eoi)
|
||||
chip->irq_eoi(&desc->irq_data);
|
||||
|
||||
if (chip->irq_mask)
|
||||
chip->irq_mask(&desc->irq_data);
|
||||
|
||||
if (chip->irq_disable && !irqd_irq_disabled(&desc->irq_data))
|
||||
chip->irq_disable(&desc->irq_data);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* machine_crash_shutdown - shutdown non-crashing cpus and save registers
|
||||
*/
|
||||
|
@ -4,6 +4,7 @@
|
||||
|
||||
#include <asm/break.h>
|
||||
#include <linux/stringify.h>
|
||||
#include <linux/objtool.h>
|
||||
|
||||
#ifndef CONFIG_DEBUG_BUGVERBOSE
|
||||
#define _BUGVERBOSE_LOCATION(file, line)
|
||||
@ -33,25 +34,25 @@
|
||||
|
||||
#define ASM_BUG_FLAGS(flags) \
|
||||
__BUG_ENTRY(flags) \
|
||||
break BRK_BUG
|
||||
break BRK_BUG;
|
||||
|
||||
#define ASM_BUG() ASM_BUG_FLAGS(0)
|
||||
|
||||
#define __BUG_FLAGS(flags) \
|
||||
asm_inline volatile (__stringify(ASM_BUG_FLAGS(flags)));
|
||||
#define __BUG_FLAGS(flags, extra) \
|
||||
asm_inline volatile (__stringify(ASM_BUG_FLAGS(flags)) \
|
||||
extra);
|
||||
|
||||
#define __WARN_FLAGS(flags) \
|
||||
do { \
|
||||
instrumentation_begin(); \
|
||||
__BUG_FLAGS(BUGFLAG_WARNING|(flags)); \
|
||||
annotate_reachable(); \
|
||||
__BUG_FLAGS(BUGFLAG_WARNING|(flags), ANNOTATE_REACHABLE(10001b));\
|
||||
instrumentation_end(); \
|
||||
} while (0)
|
||||
|
||||
#define BUG() \
|
||||
do { \
|
||||
instrumentation_begin(); \
|
||||
__BUG_FLAGS(0); \
|
||||
__BUG_FLAGS(0, ""); \
|
||||
unreachable(); \
|
||||
} while (0)
|
||||
|
||||
|
@ -61,7 +61,6 @@ struct pt_regs;
|
||||
extern void kexec_smp_wait(void); /* get and clear naca physid, wait for
|
||||
master to copy new code to 0 */
|
||||
extern void default_machine_kexec(struct kimage *image);
|
||||
extern void machine_kexec_mask_interrupts(void);
|
||||
|
||||
void relocate_new_kernel(unsigned long indirection_page, unsigned long reboot_code_buffer,
|
||||
unsigned long start_address) __noreturn;
|
||||
|
@ -22,28 +22,6 @@
|
||||
#include <asm/setup.h>
|
||||
#include <asm/firmware.h>
|
||||
|
||||
void machine_kexec_mask_interrupts(void) {
|
||||
unsigned int i;
|
||||
struct irq_desc *desc;
|
||||
|
||||
for_each_irq_desc(i, desc) {
|
||||
struct irq_chip *chip;
|
||||
|
||||
chip = irq_desc_get_chip(desc);
|
||||
if (!chip)
|
||||
continue;
|
||||
|
||||
if (chip->irq_eoi && irqd_irq_inprogress(&desc->irq_data))
|
||||
chip->irq_eoi(&desc->irq_data);
|
||||
|
||||
if (chip->irq_mask)
|
||||
chip->irq_mask(&desc->irq_data);
|
||||
|
||||
if (chip->irq_disable && !irqd_irq_disabled(&desc->irq_data))
|
||||
chip->irq_disable(&desc->irq_data);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef CONFIG_CRASH_DUMP
|
||||
void machine_crash_shutdown(struct pt_regs *regs)
|
||||
{
|
||||
|
@ -7,6 +7,7 @@
|
||||
* Copyright (C) 2005 IBM Corporation.
|
||||
*/
|
||||
|
||||
#include <linux/irq.h>
|
||||
#include <linux/kexec.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/string.h>
|
||||
|
@ -114,29 +114,6 @@ void machine_shutdown(void)
|
||||
#endif
|
||||
}
|
||||
|
||||
static void machine_kexec_mask_interrupts(void)
|
||||
{
|
||||
unsigned int i;
|
||||
struct irq_desc *desc;
|
||||
|
||||
for_each_irq_desc(i, desc) {
|
||||
struct irq_chip *chip;
|
||||
|
||||
chip = irq_desc_get_chip(desc);
|
||||
if (!chip)
|
||||
continue;
|
||||
|
||||
if (chip->irq_eoi && irqd_irq_inprogress(&desc->irq_data))
|
||||
chip->irq_eoi(&desc->irq_data);
|
||||
|
||||
if (chip->irq_mask)
|
||||
chip->irq_mask(&desc->irq_data);
|
||||
|
||||
if (chip->irq_disable && !irqd_irq_disabled(&desc->irq_data))
|
||||
chip->irq_disable(&desc->irq_data);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* machine_crash_shutdown - Prepare to kexec after a kernel crash
|
||||
*
|
||||
|
@ -981,7 +981,7 @@ static int cfdiag_push_sample(struct perf_event *event,
|
||||
if (event->attr.sample_type & PERF_SAMPLE_RAW) {
|
||||
raw.frag.size = cpuhw->usedss;
|
||||
raw.frag.data = cpuhw->stop;
|
||||
perf_sample_save_raw_data(&data, &raw);
|
||||
perf_sample_save_raw_data(&data, event, &raw);
|
||||
}
|
||||
|
||||
overflow = perf_event_overflow(event, &data, ®s);
|
||||
|
@ -981,7 +981,7 @@ static void cpumsf_pmu_disable(struct pmu *pmu)
|
||||
cpuhw->flags &= ~PMU_F_ENABLED;
|
||||
}
|
||||
|
||||
/* perf_exclude_event() - Filter event
|
||||
/* perf_event_exclude() - Filter event
|
||||
* @event: The perf event
|
||||
* @regs: pt_regs structure
|
||||
* @sde_regs: Sample-data-entry (sde) regs structure
|
||||
@ -990,7 +990,7 @@ static void cpumsf_pmu_disable(struct pmu *pmu)
|
||||
*
|
||||
* Return non-zero if the event shall be excluded.
|
||||
*/
|
||||
static int perf_exclude_event(struct perf_event *event, struct pt_regs *regs,
|
||||
static int perf_event_exclude(struct perf_event *event, struct pt_regs *regs,
|
||||
struct perf_sf_sde_regs *sde_regs)
|
||||
{
|
||||
if (event->attr.exclude_user && user_mode(regs))
|
||||
@ -1073,7 +1073,7 @@ static int perf_push_sample(struct perf_event *event,
|
||||
data.tid_entry.pid = basic->hpp & LPP_PID_MASK;
|
||||
|
||||
overflow = 0;
|
||||
if (perf_exclude_event(event, ®s, sde_regs))
|
||||
if (perf_event_exclude(event, ®s, sde_regs))
|
||||
goto out;
|
||||
if (perf_event_overflow(event, &data, ®s)) {
|
||||
overflow = 1;
|
||||
|
@ -478,7 +478,7 @@ static int paicrypt_push_sample(size_t rawsize, struct paicrypt_map *cpump,
|
||||
if (event->attr.sample_type & PERF_SAMPLE_RAW) {
|
||||
raw.frag.size = rawsize;
|
||||
raw.frag.data = cpump->save;
|
||||
perf_sample_save_raw_data(&data, &raw);
|
||||
perf_sample_save_raw_data(&data, event, &raw);
|
||||
}
|
||||
|
||||
overflow = perf_event_overflow(event, &data, ®s);
|
||||
|
@ -503,7 +503,7 @@ static int paiext_push_sample(size_t rawsize, struct paiext_map *cpump,
|
||||
if (event->attr.sample_type & PERF_SAMPLE_RAW) {
|
||||
raw.frag.size = rawsize;
|
||||
raw.frag.data = cpump->save;
|
||||
perf_sample_save_raw_data(&data, &raw);
|
||||
perf_sample_save_raw_data(&data, event, &raw);
|
||||
}
|
||||
|
||||
overflow = perf_event_overflow(event, &data, ®s);
|
||||
|
@ -1190,7 +1190,7 @@ config X86_MCE_INTEL
|
||||
config X86_MCE_AMD
|
||||
def_bool y
|
||||
prompt "AMD MCE features"
|
||||
depends on X86_MCE && X86_LOCAL_APIC && AMD_NB
|
||||
depends on X86_MCE && X86_LOCAL_APIC
|
||||
help
|
||||
Additional support for AMD specific MCE features such as
|
||||
the DRAM Error Threshold.
|
||||
@ -1560,6 +1560,7 @@ config AMD_MEM_ENCRYPT
|
||||
select ARCH_HAS_CC_PLATFORM
|
||||
select X86_MEM_ENCRYPT
|
||||
select UNACCEPTED_MEMORY
|
||||
select CRYPTO_LIB_AESGCM
|
||||
help
|
||||
Say yes to enable support for the encryption of system memory.
|
||||
This requires an AMD processor that supports Secure Memory
|
||||
@ -3129,6 +3130,10 @@ config TS5500
|
||||
endif # X86_32
|
||||
|
||||
config AMD_NB
|
||||
def_bool y
|
||||
depends on AMD_NODE
|
||||
|
||||
config AMD_NODE
|
||||
def_bool y
|
||||
depends on CPU_SUP_AMD && PCI
|
||||
|
||||
|
@ -97,7 +97,7 @@ config IOMMU_DEBUG
|
||||
code. When you use it make sure you have a big enough
|
||||
IOMMU/AGP aperture. Most of the options enabled by this can
|
||||
be set more finegrained using the iommu= command line
|
||||
options. See Documentation/arch/x86/x86_64/boot-options.rst for more
|
||||
options. See Documentation/admin-guide/kernel-parameters.txt for more
|
||||
details.
|
||||
|
||||
config IOMMU_LEAK
|
||||
|
@ -25,10 +25,6 @@
|
||||
#include "efi.h"
|
||||
|
||||
#include <generated/compile.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/uts.h>
|
||||
#include <linux/utsname.h>
|
||||
#include <linux/ctype.h>
|
||||
#include <generated/utsversion.h>
|
||||
#include <generated/utsrelease.h>
|
||||
|
||||
|
@ -401,7 +401,8 @@ finish:
|
||||
* by the guest kernel. As and when a new feature is implemented in the
|
||||
* guest kernel, a corresponding bit should be added to the mask.
|
||||
*/
|
||||
#define SNP_FEATURES_PRESENT MSR_AMD64_SNP_DEBUG_SWAP
|
||||
#define SNP_FEATURES_PRESENT (MSR_AMD64_SNP_DEBUG_SWAP | \
|
||||
MSR_AMD64_SNP_SECURE_TSC)
|
||||
|
||||
u64 snp_get_unsupported_features(u64 status)
|
||||
{
|
||||
|
@ -65,7 +65,6 @@ static __maybe_unused __always_inline bool amd_cc_platform_vtom(enum cc_attr att
|
||||
* up under SME the trampoline area cannot be encrypted, whereas under SEV
|
||||
* the trampoline area must be encrypted.
|
||||
*/
|
||||
|
||||
static bool noinstr amd_cc_platform_has(enum cc_attr attr)
|
||||
{
|
||||
#ifdef CONFIG_AMD_MEM_ENCRYPT
|
||||
@ -97,6 +96,9 @@ static bool noinstr amd_cc_platform_has(enum cc_attr attr)
|
||||
case CC_ATTR_GUEST_SEV_SNP:
|
||||
return sev_status & MSR_AMD64_SEV_SNP_ENABLED;
|
||||
|
||||
case CC_ATTR_GUEST_SNP_SECURE_TSC:
|
||||
return sev_status & MSR_AMD64_SNP_SECURE_TSC;
|
||||
|
||||
case CC_ATTR_HOST_SEV_SNP:
|
||||
return cc_flags.host_sev_snp;
|
||||
|
||||
|
@ -13,3 +13,6 @@ KCOV_INSTRUMENT_core.o := n
|
||||
# With some compiler versions the generated code results in boot hangs, caused
|
||||
# by several compilation units. To be safe, disable all instrumentation.
|
||||
KCSAN_SANITIZE := n
|
||||
|
||||
# Clang 14 and older may fail to respect __no_sanitize_undefined when inlining
|
||||
UBSAN_SANITIZE := n
|
||||
|
@@ -25,6 +25,7 @@
#include <linux/psp-sev.h>
#include <linux/dmi.h>
#include <uapi/linux/sev-guest.h>
#include <crypto/gcm.h>

#include <asm/init.h>
#include <asm/cpu_entry_area.h>
@@ -95,6 +96,15 @@ static u64 sev_hv_features __ro_after_init;
/* Secrets page physical address from the CC blob */
static u64 secrets_pa __ro_after_init;

/*
* For Secure TSC guests, the BSP fetches TSC_INFO using SNP guest messaging and
* initializes snp_tsc_scale and snp_tsc_offset. These values are replicated
* across the APs VMSA fields (TSC_SCALE and TSC_OFFSET).
*/
static u64 snp_tsc_scale __ro_after_init;
static u64 snp_tsc_offset __ro_after_init;
static u64 snp_tsc_freq_khz __ro_after_init;

/* #VC handler runtime per-CPU data */
struct sev_es_runtime_data {
struct ghcb ghcb_page;
@@ -777,15 +787,10 @@ early_set_pages_state(unsigned long vaddr, unsigned long paddr,

val = sev_es_rd_ghcb_msr();

if (WARN(GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP,
"Wrong PSC response code: 0x%x\n",
(unsigned int)GHCB_RESP_CODE(val)))
if (GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP)
goto e_term;

if (WARN(GHCB_MSR_PSC_RESP_VAL(val),
"Failed to change page state to '%s' paddr 0x%lx error 0x%llx\n",
op == SNP_PAGE_STATE_PRIVATE ? "private" : "shared",
paddr, GHCB_MSR_PSC_RESP_VAL(val)))
if (GHCB_MSR_PSC_RESP_VAL(val))
goto e_term;

/* Page validation must be performed after changing to private */
@@ -821,7 +826,7 @@ void __head early_snp_set_memory_private(unsigned long vaddr, unsigned long padd
early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_PRIVATE);
}

void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr,
void __head early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr,
unsigned long npages)
{
/*
@@ -1276,6 +1281,12 @@ static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip)
vmsa->vmpl = snp_vmpl;
vmsa->sev_features = sev_status >> 2;

/* Populate AP's TSC scale/offset to get accurate TSC values. */
if (cc_platform_has(CC_ATTR_GUEST_SNP_SECURE_TSC)) {
vmsa->tsc_scale = snp_tsc_scale;
vmsa->tsc_offset = snp_tsc_offset;
}

/* Switch the page over to a VMSA page now that it is initialized */
ret = snp_set_vmsa(vmsa, caa, apic_id, true);
if (ret) {
@ -1418,6 +1429,41 @@ static enum es_result __vc_handle_msr_caa(struct pt_regs *regs, bool write)
|
||||
return ES_OK;
|
||||
}
|
||||
|
||||
/*
|
||||
* TSC related accesses should not exit to the hypervisor when a guest is
|
||||
* executing with Secure TSC enabled, so special handling is required for
|
||||
* accesses of MSR_IA32_TSC and MSR_AMD64_GUEST_TSC_FREQ.
|
||||
*/
|
||||
static enum es_result __vc_handle_secure_tsc_msrs(struct pt_regs *regs, bool write)
|
||||
{
|
||||
u64 tsc;
|
||||
|
||||
/*
|
||||
* GUEST_TSC_FREQ should not be intercepted when Secure TSC is enabled.
|
||||
* Terminate the SNP guest when the interception is enabled.
|
||||
*/
|
||||
if (regs->cx == MSR_AMD64_GUEST_TSC_FREQ)
|
||||
return ES_VMM_ERROR;
|
||||
|
||||
/*
|
||||
* Writes: Writing to MSR_IA32_TSC can cause subsequent reads of the TSC
|
||||
* to return undefined values, so ignore all writes.
|
||||
*
|
||||
* Reads: Reads of MSR_IA32_TSC should return the current TSC value, use
|
||||
* the value returned by rdtsc_ordered().
|
||||
*/
|
||||
if (write) {
|
||||
WARN_ONCE(1, "TSC MSR writes are verboten!\n");
|
||||
return ES_OK;
|
||||
}
|
||||
|
||||
tsc = rdtsc_ordered();
|
||||
regs->ax = lower_32_bits(tsc);
|
||||
regs->dx = upper_32_bits(tsc);
|
||||
|
||||
return ES_OK;
|
||||
}
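/*
 * Editor's illustrative sketch, not part of the commit above: it only shows
 * how a 64-bit TSC read is split across the 32-bit EAX/EDX halves, mirroring
 * the lower_32_bits()/upper_32_bits() use in __vc_handle_secure_tsc_msrs().
 * Function and variable names here are invented for the example.
 */
#include <stdint.h>

static void split_tsc_for_rdmsr(uint64_t tsc, uint32_t *eax, uint32_t *edx)
{
	*eax = (uint32_t)(tsc & 0xffffffffu);	/* low half goes to RAX */
	*edx = (uint32_t)(tsc >> 32);		/* high half goes to RDX */
}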
|
||||
|
||||
static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
|
||||
{
|
||||
struct pt_regs *regs = ctxt->regs;
|
||||
@ -1427,8 +1473,18 @@ static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
|
||||
/* Is it a WRMSR? */
|
||||
write = ctxt->insn.opcode.bytes[1] == 0x30;
|
||||
|
||||
if (regs->cx == MSR_SVSM_CAA)
|
||||
switch (regs->cx) {
|
||||
case MSR_SVSM_CAA:
|
||||
return __vc_handle_msr_caa(regs, write);
|
||||
case MSR_IA32_TSC:
|
||||
case MSR_AMD64_GUEST_TSC_FREQ:
|
||||
if (sev_status & MSR_AMD64_SNP_SECURE_TSC)
|
||||
return __vc_handle_secure_tsc_msrs(regs, write);
|
||||
else
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
ghcb_set_rcx(ghcb, regs->cx);
|
||||
if (write) {
|
||||
@ -2360,7 +2416,7 @@ static __head void svsm_setup(struct cc_blob_sev_info *cc_info)
|
||||
call.rcx = pa;
|
||||
ret = svsm_perform_call_protocol(&call);
|
||||
if (ret)
|
||||
panic("Can't remap the SVSM CA, ret=%d, rax_out=0x%llx\n", ret, call.rax_out);
|
||||
sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SVSM_CA_REMAP_FAIL);
|
||||
|
||||
RIP_REL_REF(boot_svsm_caa) = (struct svsm_ca *)pa;
|
||||
RIP_REL_REF(boot_svsm_caa_pa) = pa;
|
||||
@ -2506,8 +2562,8 @@ int snp_issue_svsm_attest_req(u64 call_id, struct svsm_call *call,
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(snp_issue_svsm_attest_req);
|
||||
|
||||
int snp_issue_guest_request(struct snp_guest_req *req, struct snp_req_data *input,
|
||||
struct snp_guest_request_ioctl *rio)
|
||||
static int snp_issue_guest_request(struct snp_guest_req *req, struct snp_req_data *input,
|
||||
struct snp_guest_request_ioctl *rio)
|
||||
{
|
||||
struct ghcb_state state;
|
||||
struct es_em_ctxt ctxt;
|
||||
@ -2569,7 +2625,6 @@ e_restore_irq:
|
||||
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(snp_issue_guest_request);
|
||||
|
||||
static struct platform_device sev_guest_device = {
|
||||
.name = "sev-guest",
|
||||
@ -2578,15 +2633,9 @@ static struct platform_device sev_guest_device = {
|
||||
|
||||
static int __init snp_init_platform_device(void)
|
||||
{
|
||||
struct sev_guest_platform_data data;
|
||||
|
||||
if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
|
||||
return -ENODEV;
|
||||
|
||||
data.secrets_gpa = secrets_pa;
|
||||
if (platform_device_add_data(&sev_guest_device, &data, sizeof(data)))
|
||||
return -ENODEV;
|
||||
|
||||
if (platform_device_register(&sev_guest_device))
|
||||
return -ENODEV;
|
||||
|
||||
@ -2665,3 +2714,581 @@ static int __init sev_sysfs_init(void)
|
||||
}
|
||||
arch_initcall(sev_sysfs_init);
|
||||
#endif // CONFIG_SYSFS
|
||||
|
||||
static void free_shared_pages(void *buf, size_t sz)
|
||||
{
|
||||
unsigned int npages = PAGE_ALIGN(sz) >> PAGE_SHIFT;
|
||||
int ret;
|
||||
|
||||
if (!buf)
|
||||
return;
|
||||
|
||||
ret = set_memory_encrypted((unsigned long)buf, npages);
|
||||
if (ret) {
|
||||
WARN_ONCE(ret, "failed to restore encryption mask (leak it)\n");
|
||||
return;
|
||||
}
|
||||
|
||||
__free_pages(virt_to_page(buf), get_order(sz));
|
||||
}
|
||||
|
||||
static void *alloc_shared_pages(size_t sz)
|
||||
{
|
||||
unsigned int npages = PAGE_ALIGN(sz) >> PAGE_SHIFT;
|
||||
struct page *page;
|
||||
int ret;
|
||||
|
||||
page = alloc_pages(GFP_KERNEL_ACCOUNT, get_order(sz));
|
||||
if (!page)
|
||||
return NULL;
|
||||
|
||||
ret = set_memory_decrypted((unsigned long)page_address(page), npages);
|
||||
if (ret) {
|
||||
pr_err("failed to mark page shared, ret=%d\n", ret);
|
||||
__free_pages(page, get_order(sz));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return page_address(page);
|
||||
}
|
||||
|
||||
static u8 *get_vmpck(int id, struct snp_secrets_page *secrets, u32 **seqno)
|
||||
{
|
||||
u8 *key = NULL;
|
||||
|
||||
switch (id) {
|
||||
case 0:
|
||||
*seqno = &secrets->os_area.msg_seqno_0;
|
||||
key = secrets->vmpck0;
|
||||
break;
|
||||
case 1:
|
||||
*seqno = &secrets->os_area.msg_seqno_1;
|
||||
key = secrets->vmpck1;
|
||||
break;
|
||||
case 2:
|
||||
*seqno = &secrets->os_area.msg_seqno_2;
|
||||
key = secrets->vmpck2;
|
||||
break;
|
||||
case 3:
|
||||
*seqno = &secrets->os_area.msg_seqno_3;
|
||||
key = secrets->vmpck3;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
return key;
|
||||
}
|
||||
|
||||
static struct aesgcm_ctx *snp_init_crypto(u8 *key, size_t keylen)
|
||||
{
|
||||
struct aesgcm_ctx *ctx;
|
||||
|
||||
ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
|
||||
if (!ctx)
|
||||
return NULL;
|
||||
|
||||
if (aesgcm_expandkey(ctx, key, keylen, AUTHTAG_LEN)) {
|
||||
pr_err("Crypto context initialization failed\n");
|
||||
kfree(ctx);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return ctx;
|
||||
}
|
||||
|
||||
int snp_msg_init(struct snp_msg_desc *mdesc, int vmpck_id)
|
||||
{
|
||||
/* Adjust the default VMPCK key based on the executing VMPL level */
|
||||
if (vmpck_id == -1)
|
||||
vmpck_id = snp_vmpl;
|
||||
|
||||
mdesc->vmpck = get_vmpck(vmpck_id, mdesc->secrets, &mdesc->os_area_msg_seqno);
|
||||
if (!mdesc->vmpck) {
|
||||
pr_err("Invalid VMPCK%d communication key\n", vmpck_id);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/* Verify that VMPCK is not zero. */
|
||||
if (!memchr_inv(mdesc->vmpck, 0, VMPCK_KEY_LEN)) {
|
||||
pr_err("Empty VMPCK%d communication key\n", vmpck_id);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
mdesc->vmpck_id = vmpck_id;
|
||||
|
||||
mdesc->ctx = snp_init_crypto(mdesc->vmpck, VMPCK_KEY_LEN);
|
||||
if (!mdesc->ctx)
|
||||
return -ENOMEM;
|
||||
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(snp_msg_init);
|
||||
|
||||
struct snp_msg_desc *snp_msg_alloc(void)
|
||||
{
|
||||
struct snp_msg_desc *mdesc;
|
||||
void __iomem *mem;
|
||||
|
||||
BUILD_BUG_ON(sizeof(struct snp_guest_msg) > PAGE_SIZE);
|
||||
|
||||
mdesc = kzalloc(sizeof(struct snp_msg_desc), GFP_KERNEL);
|
||||
if (!mdesc)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
mem = ioremap_encrypted(secrets_pa, PAGE_SIZE);
|
||||
if (!mem)
|
||||
goto e_free_mdesc;
|
||||
|
||||
mdesc->secrets = (__force struct snp_secrets_page *)mem;
|
||||
|
||||
/* Allocate the shared page used for the request and response message. */
|
||||
mdesc->request = alloc_shared_pages(sizeof(struct snp_guest_msg));
|
||||
if (!mdesc->request)
|
||||
goto e_unmap;
|
||||
|
||||
mdesc->response = alloc_shared_pages(sizeof(struct snp_guest_msg));
|
||||
if (!mdesc->response)
|
||||
goto e_free_request;
|
||||
|
||||
mdesc->certs_data = alloc_shared_pages(SEV_FW_BLOB_MAX_SIZE);
|
||||
if (!mdesc->certs_data)
|
||||
goto e_free_response;
|
||||
|
||||
/* Initialize the input addresses for the guest request */
|
||||
mdesc->input.req_gpa = __pa(mdesc->request);
|
||||
mdesc->input.resp_gpa = __pa(mdesc->response);
|
||||
mdesc->input.data_gpa = __pa(mdesc->certs_data);
|
||||
|
||||
return mdesc;
|
||||
|
||||
e_free_response:
|
||||
free_shared_pages(mdesc->response, sizeof(struct snp_guest_msg));
|
||||
e_free_request:
|
||||
free_shared_pages(mdesc->request, sizeof(struct snp_guest_msg));
|
||||
e_unmap:
|
||||
iounmap(mem);
|
||||
e_free_mdesc:
|
||||
kfree(mdesc);
|
||||
|
||||
return ERR_PTR(-ENOMEM);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(snp_msg_alloc);
|
||||
|
||||
void snp_msg_free(struct snp_msg_desc *mdesc)
|
||||
{
|
||||
if (!mdesc)
|
||||
return;
|
||||
|
||||
kfree(mdesc->ctx);
|
||||
free_shared_pages(mdesc->response, sizeof(struct snp_guest_msg));
|
||||
free_shared_pages(mdesc->request, sizeof(struct snp_guest_msg));
|
||||
free_shared_pages(mdesc->certs_data, SEV_FW_BLOB_MAX_SIZE);
|
||||
iounmap((__force void __iomem *)mdesc->secrets);
|
||||
|
||||
memset(mdesc, 0, sizeof(*mdesc));
|
||||
kfree(mdesc);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(snp_msg_free);
|
||||
|
||||
/* Mutex to serialize the shared buffer access and command handling. */
|
||||
static DEFINE_MUTEX(snp_cmd_mutex);
|
||||
|
||||
/*
|
||||
* If an error is received from the host or AMD Secure Processor (ASP) there
|
||||
* are two options. Either retry the exact same encrypted request or discontinue
|
||||
* using the VMPCK.
|
||||
*
|
||||
* This is because in the current encryption scheme GHCB v2 uses AES-GCM to
|
||||
* encrypt the requests. The IV for this scheme is the sequence number. GCM
|
||||
* cannot tolerate IV reuse.
|
||||
*
|
||||
* The ASP FW v1.51 only increments the sequence numbers on a successful
|
||||
* guest<->ASP back and forth and only accepts messages at its exact sequence
|
||||
* number.
|
||||
*
|
||||
* So if the sequence number were to be reused the encryption scheme is
|
||||
* vulnerable. If the sequence number were incremented for a fresh IV the ASP
|
||||
* will reject the request.
|
||||
*/
|
||||
static void snp_disable_vmpck(struct snp_msg_desc *mdesc)
|
||||
{
|
||||
pr_alert("Disabling VMPCK%d communication key to prevent IV reuse.\n",
|
||||
mdesc->vmpck_id);
|
||||
memzero_explicit(mdesc->vmpck, VMPCK_KEY_LEN);
|
||||
mdesc->vmpck = NULL;
|
||||
}
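/*
 * Editor's illustrative sketch, not part of the commit above: the guest
 * message IV is just the 64-bit sequence number copied into the low bytes of
 * the 96-bit AES-GCM nonce (see enc_payload() below), so replaying a sequence
 * number replays the IV - hence the key is wiped rather than reused. Names
 * below are invented for the example.
 */
#include <stdint.h>
#include <string.h>

#define EXAMPLE_GCM_IV_LEN 12	/* 96-bit GCM nonce */

static void example_seqno_to_iv(uint64_t seqno, uint8_t iv[EXAMPLE_GCM_IV_LEN])
{
	memset(iv, 0, EXAMPLE_GCM_IV_LEN);
	memcpy(iv, &seqno, sizeof(seqno));	/* low 8 bytes carry the counter */
}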
|
||||
|
||||
static inline u64 __snp_get_msg_seqno(struct snp_msg_desc *mdesc)
|
||||
{
|
||||
u64 count;
|
||||
|
||||
lockdep_assert_held(&snp_cmd_mutex);
|
||||
|
||||
/* Read the current message sequence counter from secrets pages */
|
||||
count = *mdesc->os_area_msg_seqno;
|
||||
|
||||
return count + 1;
|
||||
}
|
||||
|
||||
/* Return a non-zero on success */
|
||||
static u64 snp_get_msg_seqno(struct snp_msg_desc *mdesc)
|
||||
{
|
||||
u64 count = __snp_get_msg_seqno(mdesc);
|
||||
|
||||
/*
|
||||
* The message sequence counter for the SNP guest request is a 64-bit
|
||||
* value but the version 2 of GHCB specification defines a 32-bit storage
|
||||
* for it. If the counter exceeds the 32-bit value then return zero.
|
||||
* The caller should check the return value, but if the caller happens to
|
||||
* not check the value and use it, then the firmware treats zero as an
|
||||
* invalid number and will fail the message request.
|
||||
*/
|
||||
if (count >= UINT_MAX) {
|
||||
pr_err("request message sequence counter overflow\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
return count;
|
||||
}
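/*
 * Editor's illustrative sketch, not part of the commit above: the GHCB v2
 * message header only has 32 bits of storage for the sequence number, so once
 * the 64-bit counter no longer fits, 0 is returned and the firmware rejects it
 * as invalid. Names below are invented for the example.
 */
#include <stdint.h>

static uint64_t example_next_seqno(uint64_t current)
{
	uint64_t next = current + 1;

	/* 0 means "counter exhausted, stop sending requests" */
	return (next >= UINT32_MAX) ? 0 : next;
}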
|
||||
|
||||
static void snp_inc_msg_seqno(struct snp_msg_desc *mdesc)
|
||||
{
|
||||
/*
|
||||
* The counter is also incremented by the PSP, so increment it by 2
|
||||
* and save in secrets page.
|
||||
*/
|
||||
*mdesc->os_area_msg_seqno += 2;
|
||||
}
|
||||
|
||||
static int verify_and_dec_payload(struct snp_msg_desc *mdesc, struct snp_guest_req *req)
|
||||
{
|
||||
struct snp_guest_msg *resp_msg = &mdesc->secret_response;
|
||||
struct snp_guest_msg *req_msg = &mdesc->secret_request;
|
||||
struct snp_guest_msg_hdr *req_msg_hdr = &req_msg->hdr;
|
||||
struct snp_guest_msg_hdr *resp_msg_hdr = &resp_msg->hdr;
|
||||
struct aesgcm_ctx *ctx = mdesc->ctx;
|
||||
u8 iv[GCM_AES_IV_SIZE] = {};
|
||||
|
||||
pr_debug("response [seqno %lld type %d version %d sz %d]\n",
|
||||
resp_msg_hdr->msg_seqno, resp_msg_hdr->msg_type, resp_msg_hdr->msg_version,
|
||||
resp_msg_hdr->msg_sz);
|
||||
|
||||
/* Copy response from shared memory to encrypted memory. */
|
||||
memcpy(resp_msg, mdesc->response, sizeof(*resp_msg));
|
||||
|
||||
/* Verify that the sequence counter is incremented by 1 */
|
||||
if (unlikely(resp_msg_hdr->msg_seqno != (req_msg_hdr->msg_seqno + 1)))
|
||||
return -EBADMSG;
|
||||
|
||||
/* Verify response message type and version number. */
|
||||
if (resp_msg_hdr->msg_type != (req_msg_hdr->msg_type + 1) ||
|
||||
resp_msg_hdr->msg_version != req_msg_hdr->msg_version)
|
||||
return -EBADMSG;
|
||||
|
||||
/*
|
||||
* If the message size is greater than our buffer length then return
|
||||
* an error.
|
||||
*/
|
||||
if (unlikely((resp_msg_hdr->msg_sz + ctx->authsize) > req->resp_sz))
|
||||
return -EBADMSG;
|
||||
|
||||
/* Decrypt the payload */
|
||||
memcpy(iv, &resp_msg_hdr->msg_seqno, min(sizeof(iv), sizeof(resp_msg_hdr->msg_seqno)));
|
||||
if (!aesgcm_decrypt(ctx, req->resp_buf, resp_msg->payload, resp_msg_hdr->msg_sz,
|
||||
&resp_msg_hdr->algo, AAD_LEN, iv, resp_msg_hdr->authtag))
|
||||
return -EBADMSG;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int enc_payload(struct snp_msg_desc *mdesc, u64 seqno, struct snp_guest_req *req)
|
||||
{
|
||||
struct snp_guest_msg *msg = &mdesc->secret_request;
|
||||
struct snp_guest_msg_hdr *hdr = &msg->hdr;
|
||||
struct aesgcm_ctx *ctx = mdesc->ctx;
|
||||
u8 iv[GCM_AES_IV_SIZE] = {};
|
||||
|
||||
memset(msg, 0, sizeof(*msg));
|
||||
|
||||
hdr->algo = SNP_AEAD_AES_256_GCM;
|
||||
hdr->hdr_version = MSG_HDR_VER;
|
||||
hdr->hdr_sz = sizeof(*hdr);
|
||||
hdr->msg_type = req->msg_type;
|
||||
hdr->msg_version = req->msg_version;
|
||||
hdr->msg_seqno = seqno;
|
||||
hdr->msg_vmpck = req->vmpck_id;
|
||||
hdr->msg_sz = req->req_sz;
|
||||
|
||||
/* Verify the sequence number is non-zero */
|
||||
if (!hdr->msg_seqno)
|
||||
return -ENOSR;
|
||||
|
||||
pr_debug("request [seqno %lld type %d version %d sz %d]\n",
|
||||
hdr->msg_seqno, hdr->msg_type, hdr->msg_version, hdr->msg_sz);
|
||||
|
||||
if (WARN_ON((req->req_sz + ctx->authsize) > sizeof(msg->payload)))
|
||||
return -EBADMSG;
|
||||
|
||||
memcpy(iv, &hdr->msg_seqno, min(sizeof(iv), sizeof(hdr->msg_seqno)));
|
||||
aesgcm_encrypt(ctx, msg->payload, req->req_buf, req->req_sz, &hdr->algo,
|
||||
AAD_LEN, iv, hdr->authtag);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int __handle_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req,
|
||||
struct snp_guest_request_ioctl *rio)
|
||||
{
|
||||
unsigned long req_start = jiffies;
|
||||
unsigned int override_npages = 0;
|
||||
u64 override_err = 0;
|
||||
int rc;
|
||||
|
||||
retry_request:
|
||||
/*
|
||||
* Call firmware to process the request. In this function the encrypted
|
||||
* message enters shared memory with the host. So after this call the
|
||||
* sequence number must be incremented or the VMPCK must be deleted to
|
||||
* prevent reuse of the IV.
|
||||
*/
|
||||
rc = snp_issue_guest_request(req, &mdesc->input, rio);
|
||||
switch (rc) {
|
||||
case -ENOSPC:
|
||||
/*
|
||||
* If the extended guest request fails due to having too
|
||||
* small of a certificate data buffer, retry the same
|
||||
* guest request without the extended data request in
|
||||
* order to increment the sequence number and thus avoid
|
||||
* IV reuse.
|
||||
*/
|
||||
override_npages = mdesc->input.data_npages;
|
||||
req->exit_code = SVM_VMGEXIT_GUEST_REQUEST;
|
||||
|
||||
/*
|
||||
* Override the error to inform callers the given extended
|
||||
* request buffer size was too small and give the caller the
|
||||
* required buffer size.
|
||||
*/
|
||||
override_err = SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN);
|
||||
|
||||
/*
|
||||
* If this call to the firmware succeeds, the sequence number can
|
||||
* be incremented allowing for continued use of the VMPCK. If
|
||||
* there is an error reflected in the return value, this value
|
||||
* is checked further down and the result will be the deletion
|
||||
* of the VMPCK and the error code being propagated back to the
|
||||
* user as an ioctl() return code.
|
||||
*/
|
||||
goto retry_request;
|
||||
|
||||
/*
|
||||
* The host may return SNP_GUEST_VMM_ERR_BUSY if the request has been
|
||||
* throttled. Retry in the driver to avoid returning and reusing the
|
||||
* message sequence number on a different message.
|
||||
*/
|
||||
case -EAGAIN:
|
||||
if (jiffies - req_start > SNP_REQ_MAX_RETRY_DURATION) {
|
||||
rc = -ETIMEDOUT;
|
||||
break;
|
||||
}
|
||||
schedule_timeout_killable(SNP_REQ_RETRY_DELAY);
|
||||
goto retry_request;
|
||||
}
|
||||
|
||||
/*
|
||||
* Increment the message sequence number. There is no harm in doing
|
||||
* this now because decryption uses the value stored in the response
|
||||
* structure and any failure will wipe the VMPCK, preventing further
|
||||
* use anyway.
|
||||
*/
|
||||
snp_inc_msg_seqno(mdesc);
|
||||
|
||||
if (override_err) {
|
||||
rio->exitinfo2 = override_err;
|
||||
|
||||
/*
|
||||
* If an extended guest request was issued and the supplied certificate
|
||||
* buffer was not large enough, a standard guest request was issued to
|
||||
* prevent IV reuse. If the standard request was successful, return -EIO
|
||||
* back to the caller as would have originally been returned.
|
||||
*/
|
||||
if (!rc && override_err == SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN))
|
||||
rc = -EIO;
|
||||
}
|
||||
|
||||
if (override_npages)
|
||||
mdesc->input.data_npages = override_npages;
|
||||
|
||||
return rc;
|
||||
}
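/*
 * Editor's illustrative sketch, not part of the commit above: the -EAGAIN
 * (throttled) path keeps retrying the very same encrypted request until a
 * deadline passes, so the sequence number / IV that was already exposed is
 * never recycled for a different message. Plain POSIX C, invented names.
 */
#include <errno.h>
#include <time.h>
#include <unistd.h>

static int example_send_with_retry(int (*issue)(void), time_t timeout_secs)
{
	struct timespec start, now;
	int rc;

	clock_gettime(CLOCK_MONOTONIC, &start);
	for (;;) {
		rc = issue();
		if (rc != -EAGAIN)
			return rc;	/* success or a hard error */

		clock_gettime(CLOCK_MONOTONIC, &now);
		if (now.tv_sec - start.tv_sec > timeout_secs)
			return -ETIMEDOUT;

		usleep(100 * 1000);	/* brief back-off before retrying */
	}
}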
|
||||
|
||||
int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req,
|
||||
struct snp_guest_request_ioctl *rio)
|
||||
{
|
||||
u64 seqno;
|
||||
int rc;
|
||||
|
||||
guard(mutex)(&snp_cmd_mutex);
|
||||
|
||||
/* Check if the VMPCK is not empty */
|
||||
if (!mdesc->vmpck || !memchr_inv(mdesc->vmpck, 0, VMPCK_KEY_LEN)) {
|
||||
pr_err_ratelimited("VMPCK is disabled\n");
|
||||
return -ENOTTY;
|
||||
}
|
||||
|
||||
/* Get the message sequence number and verify that it's non-zero */
|
||||
seqno = snp_get_msg_seqno(mdesc);
|
||||
if (!seqno)
|
||||
return -EIO;
|
||||
|
||||
/* Clear shared memory's response for the host to populate. */
|
||||
memset(mdesc->response, 0, sizeof(struct snp_guest_msg));
|
||||
|
||||
/* Encrypt the userspace provided payload in mdesc->secret_request. */
|
||||
rc = enc_payload(mdesc, seqno, req);
|
||||
if (rc)
|
||||
return rc;
|
||||
|
||||
/*
|
||||
* Write the fully encrypted request to the shared unencrypted
|
||||
* request page.
|
||||
*/
|
||||
memcpy(mdesc->request, &mdesc->secret_request, sizeof(mdesc->secret_request));
|
||||
|
||||
rc = __handle_guest_request(mdesc, req, rio);
|
||||
if (rc) {
|
||||
if (rc == -EIO &&
|
||||
rio->exitinfo2 == SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN))
|
||||
return rc;
|
||||
|
||||
pr_alert("Detected error from ASP request. rc: %d, exitinfo2: 0x%llx\n",
|
||||
rc, rio->exitinfo2);
|
||||
|
||||
snp_disable_vmpck(mdesc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
rc = verify_and_dec_payload(mdesc, req);
|
||||
if (rc) {
|
||||
pr_alert("Detected unexpected decode failure from ASP. rc: %d\n", rc);
|
||||
snp_disable_vmpck(mdesc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(snp_send_guest_request);
|
||||
|
||||
static int __init snp_get_tsc_info(void)
|
||||
{
|
||||
struct snp_guest_request_ioctl *rio;
|
||||
struct snp_tsc_info_resp *tsc_resp;
|
||||
struct snp_tsc_info_req *tsc_req;
|
||||
struct snp_msg_desc *mdesc;
|
||||
struct snp_guest_req *req;
|
||||
int rc = -ENOMEM;
|
||||
|
||||
tsc_req = kzalloc(sizeof(*tsc_req), GFP_KERNEL);
|
||||
if (!tsc_req)
|
||||
return rc;
|
||||
|
||||
/*
|
||||
* The intermediate response buffer is used while decrypting the
|
||||
* response payload. Make sure that it has enough space to cover
|
||||
* the authtag.
|
||||
*/
|
||||
tsc_resp = kzalloc(sizeof(*tsc_resp) + AUTHTAG_LEN, GFP_KERNEL);
|
||||
if (!tsc_resp)
|
||||
goto e_free_tsc_req;
|
||||
|
||||
req = kzalloc(sizeof(*req), GFP_KERNEL);
|
||||
if (!req)
|
||||
goto e_free_tsc_resp;
|
||||
|
||||
rio = kzalloc(sizeof(*rio), GFP_KERNEL);
|
||||
if (!rio)
|
||||
goto e_free_req;
|
||||
|
||||
mdesc = snp_msg_alloc();
|
||||
if (IS_ERR_OR_NULL(mdesc))
|
||||
goto e_free_rio;
|
||||
|
||||
rc = snp_msg_init(mdesc, snp_vmpl);
|
||||
if (rc)
|
||||
goto e_free_mdesc;
|
||||
|
||||
req->msg_version = MSG_HDR_VER;
|
||||
req->msg_type = SNP_MSG_TSC_INFO_REQ;
|
||||
req->vmpck_id = snp_vmpl;
|
||||
req->req_buf = tsc_req;
|
||||
req->req_sz = sizeof(*tsc_req);
|
||||
req->resp_buf = (void *)tsc_resp;
|
||||
req->resp_sz = sizeof(*tsc_resp) + AUTHTAG_LEN;
|
||||
req->exit_code = SVM_VMGEXIT_GUEST_REQUEST;
|
||||
|
||||
rc = snp_send_guest_request(mdesc, req, rio);
|
||||
if (rc)
|
||||
goto e_request;
|
||||
|
||||
pr_debug("%s: response status 0x%x scale 0x%llx offset 0x%llx factor 0x%x\n",
|
||||
__func__, tsc_resp->status, tsc_resp->tsc_scale, tsc_resp->tsc_offset,
|
||||
tsc_resp->tsc_factor);
|
||||
|
||||
if (!tsc_resp->status) {
|
||||
snp_tsc_scale = tsc_resp->tsc_scale;
|
||||
snp_tsc_offset = tsc_resp->tsc_offset;
|
||||
} else {
|
||||
pr_err("Failed to get TSC info, response status 0x%x\n", tsc_resp->status);
|
||||
rc = -EIO;
|
||||
}
|
||||
|
||||
e_request:
|
||||
/* The response buffer contains sensitive data, explicitly clear it. */
|
||||
memzero_explicit(tsc_resp, sizeof(*tsc_resp) + AUTHTAG_LEN);
|
||||
e_free_mdesc:
|
||||
snp_msg_free(mdesc);
|
||||
e_free_rio:
|
||||
kfree(rio);
|
||||
e_free_req:
|
||||
kfree(req);
|
||||
e_free_tsc_resp:
|
||||
kfree(tsc_resp);
|
||||
e_free_tsc_req:
|
||||
kfree(tsc_req);
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
void __init snp_secure_tsc_prepare(void)
|
||||
{
|
||||
if (!cc_platform_has(CC_ATTR_GUEST_SNP_SECURE_TSC))
|
||||
return;
|
||||
|
||||
if (snp_get_tsc_info()) {
|
||||
pr_alert("Unable to retrieve Secure TSC info from ASP\n");
|
||||
sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SECURE_TSC);
|
||||
}
|
||||
|
||||
pr_debug("SecureTSC enabled");
|
||||
}
|
||||
|
||||
static unsigned long securetsc_get_tsc_khz(void)
|
||||
{
|
||||
return snp_tsc_freq_khz;
|
||||
}
|
||||
|
||||
void __init snp_secure_tsc_init(void)
|
||||
{
|
||||
unsigned long long tsc_freq_mhz;
|
||||
|
||||
if (!cc_platform_has(CC_ATTR_GUEST_SNP_SECURE_TSC))
|
||||
return;
|
||||
|
||||
setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
|
||||
rdmsrl(MSR_AMD64_GUEST_TSC_FREQ, tsc_freq_mhz);
|
||||
snp_tsc_freq_khz = (unsigned long)(tsc_freq_mhz * 1000);
|
||||
|
||||
x86_platform.calibrate_cpu = securetsc_get_tsc_khz;
|
||||
x86_platform.calibrate_tsc = securetsc_get_tsc_khz;
|
||||
}
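/*
 * Editor's illustrative sketch, not part of the commit above: GUEST_TSC_FREQ
 * reports MHz while the calibrate_cpu/calibrate_tsc hooks return kHz, which
 * is why snp_secure_tsc_init() multiplies by 1000. Invented name.
 */
static unsigned long example_tsc_mhz_to_khz(unsigned long long mhz)
{
	return (unsigned long)(mhz * 1000);
}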
|
||||
|
@ -498,7 +498,7 @@ static const struct snp_cpuid_table *snp_cpuid_get_table(void)
|
||||
*
|
||||
* Return: XSAVE area size on success, 0 otherwise.
|
||||
*/
|
||||
static u32 snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted)
|
||||
static u32 __head snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted)
|
||||
{
|
||||
const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
|
||||
u64 xfeatures_found = 0;
|
||||
@ -576,8 +576,9 @@ static void snp_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpui
|
||||
sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID_HV);
|
||||
}
|
||||
|
||||
static int snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
|
||||
struct cpuid_leaf *leaf)
|
||||
static int __head
|
||||
snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
|
||||
struct cpuid_leaf *leaf)
|
||||
{
|
||||
struct cpuid_leaf leaf_hv = *leaf;
|
||||
|
||||
@ -1140,6 +1141,16 @@ static enum es_result vc_handle_rdtsc(struct ghcb *ghcb,
|
||||
bool rdtscp = (exit_code == SVM_EXIT_RDTSCP);
|
||||
enum es_result ret;
|
||||
|
||||
/*
|
||||
* The hypervisor should not be intercepting RDTSC/RDTSCP when Secure
|
||||
* TSC is enabled. A #VC exception will be generated if the RDTSC/RDTSCP
|
||||
* instructions are being intercepted. If this should occur and Secure
|
||||
* TSC is enabled, guest execution should be terminated as the guest
|
||||
* cannot rely on the TSC value provided by the hypervisor.
|
||||
*/
|
||||
if (sev_status & MSR_AMD64_SNP_SECURE_TSC)
|
||||
return ES_VMM_ERROR;
|
||||
|
||||
ret = sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, 0, 0);
|
||||
if (ret != ES_OK)
|
||||
return ret;
|
||||
@ -1243,7 +1254,7 @@ static void svsm_pval_terminate(struct svsm_pvalidate_call *pc, int ret, u64 svs
|
||||
__pval_terminate(pfn, action, page_size, ret, svsm_ret);
|
||||
}
|
||||
|
||||
static void svsm_pval_4k_page(unsigned long paddr, bool validate)
|
||||
static void __head svsm_pval_4k_page(unsigned long paddr, bool validate)
|
||||
{
|
||||
struct svsm_pvalidate_call *pc;
|
||||
struct svsm_call call = {};
|
||||
@ -1275,12 +1286,13 @@ static void svsm_pval_4k_page(unsigned long paddr, bool validate)
|
||||
|
||||
ret = svsm_perform_call_protocol(&call);
|
||||
if (ret)
|
||||
svsm_pval_terminate(pc, ret, call.rax_out);
|
||||
sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE);
|
||||
|
||||
native_local_irq_restore(flags);
|
||||
}
|
||||
|
||||
static void pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, bool validate)
|
||||
static void __head pvalidate_4k_page(unsigned long vaddr, unsigned long paddr,
|
||||
bool validate)
|
||||
{
|
||||
int ret;
|
||||
|
||||
@ -1293,7 +1305,7 @@ static void pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, bool val
|
||||
} else {
|
||||
ret = pvalidate(vaddr, RMP_PG_SIZE_4K, validate);
|
||||
if (ret)
|
||||
__pval_terminate(PHYS_PFN(paddr), validate, RMP_PG_SIZE_4K, ret, 0);
|
||||
sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1,3 +1,3 @@
|
||||
# SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
obj-y += tdx.o tdx-shared.o tdcall.o
|
||||
obj-y += debug.o tdcall.o tdx.o tdx-shared.o
|
||||
|
arch/x86/coco/tdx/debug.c (new file, 69 lines)
@@ -0,0 +1,69 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
#undef pr_fmt
|
||||
#define pr_fmt(fmt) "tdx: " fmt
|
||||
|
||||
#include <linux/array_size.h>
|
||||
#include <linux/printk.h>
|
||||
#include <asm/tdx.h>
|
||||
|
||||
#define DEF_TDX_ATTR_NAME(_name) [TDX_ATTR_##_name##_BIT] = __stringify(_name)
|
||||
|
||||
static __initdata const char *tdx_attributes[] = {
|
||||
DEF_TDX_ATTR_NAME(DEBUG),
|
||||
DEF_TDX_ATTR_NAME(HGS_PLUS_PROF),
|
||||
DEF_TDX_ATTR_NAME(PERF_PROF),
|
||||
DEF_TDX_ATTR_NAME(PMT_PROF),
|
||||
DEF_TDX_ATTR_NAME(ICSSD),
|
||||
DEF_TDX_ATTR_NAME(LASS),
|
||||
DEF_TDX_ATTR_NAME(SEPT_VE_DISABLE),
|
||||
DEF_TDX_ATTR_NAME(MIGRTABLE),
|
||||
DEF_TDX_ATTR_NAME(PKS),
|
||||
DEF_TDX_ATTR_NAME(KL),
|
||||
DEF_TDX_ATTR_NAME(TPA),
|
||||
DEF_TDX_ATTR_NAME(PERFMON),
|
||||
};
|
||||
|
||||
#define DEF_TD_CTLS_NAME(_name) [TD_CTLS_##_name##_BIT] = __stringify(_name)
|
||||
|
||||
static __initdata const char *tdcs_td_ctls[] = {
|
||||
DEF_TD_CTLS_NAME(PENDING_VE_DISABLE),
|
||||
DEF_TD_CTLS_NAME(ENUM_TOPOLOGY),
|
||||
DEF_TD_CTLS_NAME(VIRT_CPUID2),
|
||||
DEF_TD_CTLS_NAME(REDUCE_VE),
|
||||
DEF_TD_CTLS_NAME(LOCK),
|
||||
};
|
||||
|
||||
void __init tdx_dump_attributes(u64 td_attr)
|
||||
{
|
||||
pr_info("Attributes:");
|
||||
|
||||
for (int i = 0; i < ARRAY_SIZE(tdx_attributes); i++) {
|
||||
if (!tdx_attributes[i])
|
||||
continue;
|
||||
if (td_attr & BIT(i))
|
||||
pr_cont(" %s", tdx_attributes[i]);
|
||||
td_attr &= ~BIT(i);
|
||||
}
|
||||
|
||||
if (td_attr)
|
||||
pr_cont(" unknown:%#llx", td_attr);
|
||||
pr_cont("\n");
|
||||
|
||||
}
|
||||
|
||||
void __init tdx_dump_td_ctls(u64 td_ctls)
|
||||
{
|
||||
pr_info("TD_CTLS:");
|
||||
|
||||
for (int i = 0; i < ARRAY_SIZE(tdcs_td_ctls); i++) {
|
||||
if (!tdcs_td_ctls[i])
|
||||
continue;
|
||||
if (td_ctls & BIT(i))
|
||||
pr_cont(" %s", tdcs_td_ctls[i]);
|
||||
td_ctls &= ~BIT(i);
|
||||
}
|
||||
if (td_ctls)
|
||||
pr_cont(" unknown:%#llx", td_ctls);
|
||||
pr_cont("\n");
|
||||
}
|
@ -32,9 +32,6 @@
|
||||
#define VE_GET_PORT_NUM(e) ((e) >> 16)
|
||||
#define VE_IS_IO_STRING(e) ((e) & BIT(4))
|
||||
|
||||
#define ATTR_DEBUG BIT(0)
|
||||
#define ATTR_SEPT_VE_DISABLE BIT(28)
|
||||
|
||||
/* TDX Module call error codes */
|
||||
#define TDCALL_RETURN_CODE(a) ((a) >> 32)
|
||||
#define TDCALL_INVALID_OPERAND 0xc0000100
|
||||
@ -200,14 +197,14 @@ static void __noreturn tdx_panic(const char *msg)
|
||||
*
|
||||
* TDX 1.0 does not allow the guest to disable SEPT #VE on its own. The VMM
|
||||
* controls if the guest will receive such #VE with TD attribute
|
||||
* ATTR_SEPT_VE_DISABLE.
|
||||
* TDX_ATTR_SEPT_VE_DISABLE.
|
||||
*
|
||||
* Newer TDX modules allow the guest to control if it wants to receive SEPT
|
||||
* violation #VEs.
|
||||
*
|
||||
* Check if the feature is available and disable SEPT #VE if possible.
|
||||
*
|
||||
* If the TD is allowed to disable/enable SEPT #VEs, the ATTR_SEPT_VE_DISABLE
|
||||
* If the TD is allowed to disable/enable SEPT #VEs, the TDX_ATTR_SEPT_VE_DISABLE
|
||||
* attribute is no longer reliable. It reflects the initial state of the
|
||||
* control for the TD, but it will not be updated if someone (e.g. bootloader)
|
||||
* changes it before the kernel starts. Kernel must check TDCS_TD_CTLS bit to
|
||||
@ -216,14 +213,14 @@ static void __noreturn tdx_panic(const char *msg)
|
||||
static void disable_sept_ve(u64 td_attr)
|
||||
{
|
||||
const char *msg = "TD misconfiguration: SEPT #VE has to be disabled";
|
||||
bool debug = td_attr & ATTR_DEBUG;
|
||||
bool debug = td_attr & TDX_ATTR_DEBUG;
|
||||
u64 config, controls;
|
||||
|
||||
/* Is this TD allowed to disable SEPT #VE */
|
||||
tdg_vm_rd(TDCS_CONFIG_FLAGS, &config);
|
||||
if (!(config & TDCS_CONFIG_FLEXIBLE_PENDING_VE)) {
|
||||
/* No SEPT #VE controls for the guest: check the attribute */
|
||||
if (td_attr & ATTR_SEPT_VE_DISABLE)
|
||||
if (td_attr & TDX_ATTR_SEPT_VE_DISABLE)
|
||||
return;
|
||||
|
||||
/* Relax SEPT_VE_DISABLE check for debug TD for backtraces */
|
||||
@ -274,6 +271,20 @@ static void enable_cpu_topology_enumeration(void)
|
||||
tdg_vm_wr(TDCS_TD_CTLS, TD_CTLS_ENUM_TOPOLOGY, TD_CTLS_ENUM_TOPOLOGY);
|
||||
}
|
||||
|
||||
static void reduce_unnecessary_ve(void)
|
||||
{
|
||||
u64 err = tdg_vm_wr(TDCS_TD_CTLS, TD_CTLS_REDUCE_VE, TD_CTLS_REDUCE_VE);
|
||||
|
||||
if (err == TDX_SUCCESS)
|
||||
return;
|
||||
|
||||
/*
|
||||
* Enabling REDUCE_VE includes ENUM_TOPOLOGY. Only try to
|
||||
* enable ENUM_TOPOLOGY if REDUCE_VE was not successful.
|
||||
*/
|
||||
enable_cpu_topology_enumeration();
|
||||
}
|
||||
|
||||
static void tdx_setup(u64 *cc_mask)
|
||||
{
|
||||
struct tdx_module_args args = {};
|
||||
@ -305,7 +316,8 @@ static void tdx_setup(u64 *cc_mask)
|
||||
tdg_vm_wr(TDCS_NOTIFY_ENABLES, 0, -1ULL);
|
||||
|
||||
disable_sept_ve(td_attr);
|
||||
enable_cpu_topology_enumeration();
|
||||
|
||||
reduce_unnecessary_ve();
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1025,6 +1037,20 @@ static void tdx_kexec_finish(void)
|
||||
}
|
||||
}
|
||||
|
||||
static __init void tdx_announce(void)
|
||||
{
|
||||
struct tdx_module_args args = {};
|
||||
u64 controls;
|
||||
|
||||
pr_info("Guest detected\n");
|
||||
|
||||
tdcall(TDG_VP_INFO, &args);
|
||||
tdx_dump_attributes(args.rdx);
|
||||
|
||||
tdg_vm_rd(TDCS_TD_CTLS, &controls);
|
||||
tdx_dump_td_ctls(controls);
|
||||
}
|
||||
|
||||
void __init tdx_early_init(void)
|
||||
{
|
||||
u64 cc_mask;
|
||||
@ -1094,5 +1120,5 @@ void __init tdx_early_init(void)
|
||||
*/
|
||||
x86_cpuinit.parallel_bringup = false;
|
||||
|
||||
pr_info("Guest detected\n");
|
||||
tdx_announce();
|
||||
}
|
||||
|
@ -308,10 +308,9 @@ SYM_CODE_END(xen_error_entry)
|
||||
movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
|
||||
.endif
|
||||
|
||||
call \cfunc
|
||||
|
||||
/* For some configurations \cfunc ends up being a noreturn. */
|
||||
REACHABLE
|
||||
ANNOTATE_REACHABLE
|
||||
call \cfunc
|
||||
|
||||
jmp error_return
|
||||
.endm
|
||||
@ -529,10 +528,10 @@ SYM_CODE_START(\asmsym)
|
||||
movq %rsp, %rdi /* pt_regs pointer into first argument */
|
||||
movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/
|
||||
movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
|
||||
call \cfunc
|
||||
|
||||
/* For some configurations \cfunc ends up being a noreturn. */
|
||||
REACHABLE
|
||||
ANNOTATE_REACHABLE
|
||||
call \cfunc
|
||||
|
||||
jmp paranoid_exit
|
||||
|
||||
|
@ -1001,8 +1001,7 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs)
|
||||
if (!x86_perf_event_set_period(event))
|
||||
continue;
|
||||
|
||||
if (has_branch_stack(event))
|
||||
perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL);
|
||||
perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL);
|
||||
|
||||
if (perf_event_overflow(event, &data, regs))
|
||||
x86_pmu_stop(event, 0);
|
||||
|
@ -31,6 +31,8 @@ static u32 ibs_caps;
|
||||
#define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT)
|
||||
#define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT
|
||||
|
||||
/* attr.config2 */
|
||||
#define IBS_SW_FILTER_MASK 1
|
||||
|
||||
/*
|
||||
* IBS states:
|
||||
@ -290,6 +292,16 @@ static int perf_ibs_init(struct perf_event *event)
|
||||
if (has_branch_stack(event))
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
/* handle exclude_{user,kernel} in the IRQ handler */
|
||||
if (event->attr.exclude_host || event->attr.exclude_guest ||
|
||||
event->attr.exclude_idle)
|
||||
return -EINVAL;
|
||||
|
||||
if (!(event->attr.config2 & IBS_SW_FILTER_MASK) &&
|
||||
(event->attr.exclude_kernel || event->attr.exclude_user ||
|
||||
event->attr.exclude_hv))
|
||||
return -EINVAL;
|
||||
|
||||
ret = validate_group(event);
|
||||
if (ret)
|
||||
return ret;
|
||||
@ -550,24 +562,14 @@ static struct attribute *attrs_empty[] = {
|
||||
NULL,
|
||||
};
|
||||
|
||||
static struct attribute_group empty_format_group = {
|
||||
.name = "format",
|
||||
.attrs = attrs_empty,
|
||||
};
|
||||
|
||||
static struct attribute_group empty_caps_group = {
|
||||
.name = "caps",
|
||||
.attrs = attrs_empty,
|
||||
};
|
||||
|
||||
static const struct attribute_group *empty_attr_groups[] = {
|
||||
&empty_format_group,
|
||||
&empty_caps_group,
|
||||
NULL,
|
||||
};
|
||||
|
||||
PMU_FORMAT_ATTR(rand_en, "config:57");
|
||||
PMU_FORMAT_ATTR(cnt_ctl, "config:19");
|
||||
PMU_FORMAT_ATTR(swfilt, "config2:0");
|
||||
PMU_EVENT_ATTR_STRING(l3missonly, fetch_l3missonly, "config:59");
|
||||
PMU_EVENT_ATTR_STRING(l3missonly, op_l3missonly, "config:16");
|
||||
PMU_EVENT_ATTR_STRING(zen4_ibs_extensions, zen4_ibs_extensions, "1");
|
||||
@ -578,8 +580,9 @@ zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int
|
||||
return ibs_caps & IBS_CAPS_ZEN4 ? attr->mode : 0;
|
||||
}
|
||||
|
||||
static struct attribute *rand_en_attrs[] = {
|
||||
static struct attribute *fetch_attrs[] = {
|
||||
&format_attr_rand_en.attr,
|
||||
&format_attr_swfilt.attr,
|
||||
NULL,
|
||||
};
|
||||
|
||||
@ -593,9 +596,9 @@ static struct attribute *zen4_ibs_extensions_attrs[] = {
|
||||
NULL,
|
||||
};
|
||||
|
||||
static struct attribute_group group_rand_en = {
|
||||
static struct attribute_group group_fetch_formats = {
|
||||
.name = "format",
|
||||
.attrs = rand_en_attrs,
|
||||
.attrs = fetch_attrs,
|
||||
};
|
||||
|
||||
static struct attribute_group group_fetch_l3missonly = {
|
||||
@ -611,7 +614,7 @@ static struct attribute_group group_zen4_ibs_extensions = {
|
||||
};
|
||||
|
||||
static const struct attribute_group *fetch_attr_groups[] = {
|
||||
&group_rand_en,
|
||||
&group_fetch_formats,
|
||||
&empty_caps_group,
|
||||
NULL,
|
||||
};
|
||||
@ -628,6 +631,11 @@ cnt_ctl_is_visible(struct kobject *kobj, struct attribute *attr, int i)
|
||||
return ibs_caps & IBS_CAPS_OPCNT ? attr->mode : 0;
|
||||
}
|
||||
|
||||
static struct attribute *op_attrs[] = {
|
||||
&format_attr_swfilt.attr,
|
||||
NULL,
|
||||
};
|
||||
|
||||
static struct attribute *cnt_ctl_attrs[] = {
|
||||
&format_attr_cnt_ctl.attr,
|
||||
NULL,
|
||||
@ -638,6 +646,11 @@ static struct attribute *op_l3missonly_attrs[] = {
|
||||
NULL,
|
||||
};
|
||||
|
||||
static struct attribute_group group_op_formats = {
|
||||
.name = "format",
|
||||
.attrs = op_attrs,
|
||||
};
|
||||
|
||||
static struct attribute_group group_cnt_ctl = {
|
||||
.name = "format",
|
||||
.attrs = cnt_ctl_attrs,
|
||||
@ -650,6 +663,12 @@ static struct attribute_group group_op_l3missonly = {
|
||||
.is_visible = zen4_ibs_extensions_is_visible,
|
||||
};
|
||||
|
||||
static const struct attribute_group *op_attr_groups[] = {
|
||||
&group_op_formats,
|
||||
&empty_caps_group,
|
||||
NULL,
|
||||
};
|
||||
|
||||
static const struct attribute_group *op_attr_update[] = {
|
||||
&group_cnt_ctl,
|
||||
&group_op_l3missonly,
|
||||
@ -667,7 +686,6 @@ static struct perf_ibs perf_ibs_fetch = {
|
||||
.start = perf_ibs_start,
|
||||
.stop = perf_ibs_stop,
|
||||
.read = perf_ibs_read,
|
||||
.capabilities = PERF_PMU_CAP_NO_EXCLUDE,
|
||||
},
|
||||
.msr = MSR_AMD64_IBSFETCHCTL,
|
||||
.config_mask = IBS_FETCH_CONFIG_MASK,
|
||||
@ -691,7 +709,6 @@ static struct perf_ibs perf_ibs_op = {
|
||||
.start = perf_ibs_start,
|
||||
.stop = perf_ibs_stop,
|
||||
.read = perf_ibs_read,
|
||||
.capabilities = PERF_PMU_CAP_NO_EXCLUDE,
|
||||
},
|
||||
.msr = MSR_AMD64_IBSOPCTL,
|
||||
.config_mask = IBS_OP_CONFIG_MASK,
|
||||
@ -1111,6 +1128,12 @@ fail:
|
||||
regs.flags |= PERF_EFLAGS_EXACT;
|
||||
}
|
||||
|
||||
if ((event->attr.config2 & IBS_SW_FILTER_MASK) &&
perf_exclude_event(event, &regs)) {
throttle = perf_event_account_interrupt(event);
goto out;
}
|
||||
|
||||
if (event->attr.sample_type & PERF_SAMPLE_RAW) {
|
||||
raw = (struct perf_raw_record){
|
||||
.frag = {
|
||||
@ -1118,7 +1141,7 @@ fail:
|
||||
.data = ibs_data.data,
|
||||
},
|
||||
};
|
||||
perf_sample_save_raw_data(&data, &raw);
|
||||
perf_sample_save_raw_data(&data, event, &raw);
|
||||
}
|
||||
|
||||
if (perf_ibs == &perf_ibs_op)
|
||||
@ -1129,8 +1152,7 @@ fail:
|
||||
* recorded as part of interrupt regs. Thus we need to use rip from
|
||||
* interrupt regs while unwinding call stack.
|
||||
*/
|
||||
if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
|
||||
perf_sample_save_callchain(&data, event, iregs);
|
||||
perf_sample_save_callchain(&data, event, iregs);
|
||||
|
||||
throttle = perf_event_overflow(event, &data, &regs);
|
||||
out:
|
||||
@ -1228,7 +1250,7 @@ static __init int perf_ibs_op_init(void)
|
||||
if (ibs_caps & IBS_CAPS_ZEN4)
|
||||
perf_ibs_op.config_mask |= IBS_OP_L3MISSONLY;
|
||||
|
||||
perf_ibs_op.pmu.attr_groups = empty_attr_groups;
|
||||
perf_ibs_op.pmu.attr_groups = op_attr_groups;
|
||||
perf_ibs_op.pmu.attr_update = op_attr_update;
|
||||
|
||||
return perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");
|
||||
|
@ -1707,8 +1707,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs)
|
||||
|
||||
perf_sample_data_init(&data, 0, event->hw.last_period);
|
||||
|
||||
if (has_branch_stack(event))
|
||||
perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL);
|
||||
perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL);
|
||||
|
||||
if (perf_event_overflow(event, &data, regs))
|
||||
x86_pmu_stop(event, 0);
|
||||
|
@ -2826,6 +2826,9 @@ static void intel_pmu_enable_fixed(struct perf_event *event)
|
||||
return;
|
||||
|
||||
idx = INTEL_PMC_IDX_FIXED_SLOTS;
|
||||
|
||||
if (event->attr.config1 & INTEL_TD_CFG_METRIC_CLEAR)
|
||||
bits |= INTEL_FIXED_3_METRICS_CLEAR;
|
||||
}
|
||||
|
||||
intel_set_masks(event, idx);
|
||||
@ -4081,7 +4084,12 @@ static int intel_pmu_hw_config(struct perf_event *event)
|
||||
* is used in a metrics group, it too cannot support sampling.
|
||||
*/
|
||||
if (intel_pmu_has_cap(event, PERF_CAP_METRICS_IDX) && is_topdown_event(event)) {
|
||||
if (event->attr.config1 || event->attr.config2)
|
||||
/* The metrics_clear can only be set for the slots event */
|
||||
if (event->attr.config1 &&
|
||||
(!is_slots_event(event) || (event->attr.config1 & ~INTEL_TD_CFG_METRIC_CLEAR)))
|
||||
return -EINVAL;
|
||||
|
||||
if (event->attr.config2)
|
||||
return -EINVAL;
|
||||
|
||||
/*
|
||||
@ -4690,6 +4698,8 @@ PMU_FORMAT_ATTR(in_tx, "config:32" );
|
||||
PMU_FORMAT_ATTR(in_tx_cp, "config:33" );
|
||||
PMU_FORMAT_ATTR(eq, "config:36" ); /* v6 + */
|
||||
|
||||
PMU_FORMAT_ATTR(metrics_clear, "config1:0"); /* PERF_CAPABILITIES.RDPMC_METRICS_CLEAR */
|
||||
|
||||
static ssize_t umask2_show(struct device *dev,
|
||||
struct device_attribute *attr,
|
||||
char *page)
|
||||
@ -4709,6 +4719,7 @@ static struct device_attribute format_attr_umask2 =
|
||||
static struct attribute *format_evtsel_ext_attrs[] = {
|
||||
&format_attr_umask2.attr,
|
||||
&format_attr_eq.attr,
|
||||
&format_attr_metrics_clear.attr,
|
||||
NULL
|
||||
};
|
||||
|
||||
@ -4733,6 +4744,13 @@ evtsel_ext_is_visible(struct kobject *kobj, struct attribute *attr, int i)
|
||||
if (i == 1)
|
||||
return (mask & ARCH_PERFMON_EVENTSEL_EQ) ? attr->mode : 0;
|
||||
|
||||
/* PERF_CAPABILITIES.RDPMC_METRICS_CLEAR */
|
||||
if (i == 2) {
|
||||
union perf_capabilities intel_cap = hybrid(dev_get_drvdata(dev), intel_cap);
|
||||
|
||||
return intel_cap.rdpmc_metrics_clear ? attr->mode : 0;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -5381,42 +5399,32 @@ static __init void intel_clovertown_quirk(void)
|
||||
x86_pmu.pebs_constraints = NULL;
|
||||
}
|
||||
|
||||
static const struct x86_cpu_desc isolation_ucodes[] = {
|
||||
INTEL_CPU_DESC(INTEL_HASWELL, 3, 0x0000001f),
|
||||
INTEL_CPU_DESC(INTEL_HASWELL_L, 1, 0x0000001e),
|
||||
INTEL_CPU_DESC(INTEL_HASWELL_G, 1, 0x00000015),
|
||||
INTEL_CPU_DESC(INTEL_HASWELL_X, 2, 0x00000037),
|
||||
INTEL_CPU_DESC(INTEL_HASWELL_X, 4, 0x0000000a),
|
||||
INTEL_CPU_DESC(INTEL_BROADWELL, 4, 0x00000023),
|
||||
INTEL_CPU_DESC(INTEL_BROADWELL_G, 1, 0x00000014),
|
||||
INTEL_CPU_DESC(INTEL_BROADWELL_D, 2, 0x00000010),
|
||||
INTEL_CPU_DESC(INTEL_BROADWELL_D, 3, 0x07000009),
|
||||
INTEL_CPU_DESC(INTEL_BROADWELL_D, 4, 0x0f000009),
|
||||
INTEL_CPU_DESC(INTEL_BROADWELL_D, 5, 0x0e000002),
|
||||
INTEL_CPU_DESC(INTEL_BROADWELL_X, 1, 0x0b000014),
|
||||
INTEL_CPU_DESC(INTEL_SKYLAKE_X, 3, 0x00000021),
|
||||
INTEL_CPU_DESC(INTEL_SKYLAKE_X, 4, 0x00000000),
|
||||
INTEL_CPU_DESC(INTEL_SKYLAKE_X, 5, 0x00000000),
|
||||
INTEL_CPU_DESC(INTEL_SKYLAKE_X, 6, 0x00000000),
|
||||
INTEL_CPU_DESC(INTEL_SKYLAKE_X, 7, 0x00000000),
|
||||
INTEL_CPU_DESC(INTEL_SKYLAKE_X, 11, 0x00000000),
|
||||
INTEL_CPU_DESC(INTEL_SKYLAKE_L, 3, 0x0000007c),
|
||||
INTEL_CPU_DESC(INTEL_SKYLAKE, 3, 0x0000007c),
|
||||
INTEL_CPU_DESC(INTEL_KABYLAKE, 9, 0x0000004e),
|
||||
INTEL_CPU_DESC(INTEL_KABYLAKE_L, 9, 0x0000004e),
|
||||
INTEL_CPU_DESC(INTEL_KABYLAKE_L, 10, 0x0000004e),
|
||||
INTEL_CPU_DESC(INTEL_KABYLAKE_L, 11, 0x0000004e),
|
||||
INTEL_CPU_DESC(INTEL_KABYLAKE_L, 12, 0x0000004e),
|
||||
INTEL_CPU_DESC(INTEL_KABYLAKE, 10, 0x0000004e),
|
||||
INTEL_CPU_DESC(INTEL_KABYLAKE, 11, 0x0000004e),
|
||||
INTEL_CPU_DESC(INTEL_KABYLAKE, 12, 0x0000004e),
|
||||
INTEL_CPU_DESC(INTEL_KABYLAKE, 13, 0x0000004e),
|
||||
static const struct x86_cpu_id isolation_ucodes[] = {
|
||||
X86_MATCH_VFM_STEPS(INTEL_HASWELL, 3, 3, 0x0000001f),
|
||||
X86_MATCH_VFM_STEPS(INTEL_HASWELL_L, 1, 1, 0x0000001e),
|
||||
X86_MATCH_VFM_STEPS(INTEL_HASWELL_G, 1, 1, 0x00000015),
|
||||
X86_MATCH_VFM_STEPS(INTEL_HASWELL_X, 2, 2, 0x00000037),
|
||||
X86_MATCH_VFM_STEPS(INTEL_HASWELL_X, 4, 4, 0x0000000a),
|
||||
X86_MATCH_VFM_STEPS(INTEL_BROADWELL, 4, 4, 0x00000023),
|
||||
X86_MATCH_VFM_STEPS(INTEL_BROADWELL_G, 1, 1, 0x00000014),
|
||||
X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 2, 2, 0x00000010),
|
||||
X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 3, 3, 0x07000009),
|
||||
X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 4, 4, 0x0f000009),
|
||||
X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 5, 5, 0x0e000002),
|
||||
X86_MATCH_VFM_STEPS(INTEL_BROADWELL_X, 1, 1, 0x0b000014),
|
||||
X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 3, 3, 0x00000021),
|
||||
X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 4, 7, 0x00000000),
|
||||
X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 11, 11, 0x00000000),
|
||||
X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_L, 3, 3, 0x0000007c),
|
||||
X86_MATCH_VFM_STEPS(INTEL_SKYLAKE, 3, 3, 0x0000007c),
|
||||
X86_MATCH_VFM_STEPS(INTEL_KABYLAKE, 9, 13, 0x0000004e),
|
||||
X86_MATCH_VFM_STEPS(INTEL_KABYLAKE_L, 9, 12, 0x0000004e),
|
||||
{}
|
||||
};
|
||||
|
||||
static void intel_check_pebs_isolation(void)
|
||||
{
|
||||
x86_pmu.pebs_no_isolation = !x86_cpu_has_min_microcode_rev(isolation_ucodes);
|
||||
x86_pmu.pebs_no_isolation = !x86_match_min_microcode_rev(isolation_ucodes);
|
||||
}
|
||||
|
||||
static __init void intel_pebs_isolation_quirk(void)
|
||||
@ -5426,16 +5434,16 @@ static __init void intel_pebs_isolation_quirk(void)
|
||||
intel_check_pebs_isolation();
|
||||
}
|
||||
|
||||
static const struct x86_cpu_desc pebs_ucodes[] = {
|
||||
INTEL_CPU_DESC(INTEL_SANDYBRIDGE, 7, 0x00000028),
|
||||
INTEL_CPU_DESC(INTEL_SANDYBRIDGE_X, 6, 0x00000618),
|
||||
INTEL_CPU_DESC(INTEL_SANDYBRIDGE_X, 7, 0x0000070c),
|
||||
static const struct x86_cpu_id pebs_ucodes[] = {
|
||||
X86_MATCH_VFM_STEPS(INTEL_SANDYBRIDGE, 7, 7, 0x00000028),
|
||||
X86_MATCH_VFM_STEPS(INTEL_SANDYBRIDGE_X, 6, 6, 0x00000618),
|
||||
X86_MATCH_VFM_STEPS(INTEL_SANDYBRIDGE_X, 7, 7, 0x0000070c),
|
||||
{}
|
||||
};
|
||||
|
||||
static bool intel_snb_pebs_broken(void)
|
||||
{
|
||||
return !x86_cpu_has_min_microcode_rev(pebs_ucodes);
|
||||
return !x86_match_min_microcode_rev(pebs_ucodes);
|
||||
}
|
||||
|
||||
static void intel_snb_check_microcode(void)
|
||||
|
@ -1789,8 +1789,7 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event,
|
||||
* previous PMI context or an (I)RET happened between the record and
|
||||
* PMI.
|
||||
*/
|
||||
if (sample_type & PERF_SAMPLE_CALLCHAIN)
|
||||
perf_sample_save_callchain(data, event, iregs);
|
||||
perf_sample_save_callchain(data, event, iregs);
|
||||
|
||||
/*
|
||||
* We use the interrupt regs as a base because the PEBS record does not
|
||||
@ -1889,8 +1888,7 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event,
|
||||
if (x86_pmu.intel_cap.pebs_format >= 3)
|
||||
setup_pebs_time(event, data, pebs->tsc);
|
||||
|
||||
if (has_branch_stack(event))
|
||||
perf_sample_save_brstack(data, event, &cpuc->lbr_stack, NULL);
|
||||
perf_sample_save_brstack(data, event, &cpuc->lbr_stack, NULL);
|
||||
}
|
||||
|
||||
static void adaptive_pebs_save_regs(struct pt_regs *regs,
|
||||
@ -1917,8 +1915,6 @@ static void adaptive_pebs_save_regs(struct pt_regs *regs,
|
||||
}
|
||||
|
||||
#define PEBS_LATENCY_MASK 0xffff
|
||||
#define PEBS_CACHE_LATENCY_OFFSET 32
|
||||
#define PEBS_RETIRE_LATENCY_OFFSET 32
|
||||
|
||||
/*
|
||||
* With adaptive PEBS the layout depends on what fields are configured.
|
||||
@ -1932,8 +1928,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
|
||||
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
||||
struct pebs_basic *basic = __pebs;
|
||||
void *next_record = basic + 1;
|
||||
u64 sample_type;
|
||||
u64 format_size;
|
||||
u64 sample_type, format_group;
|
||||
struct pebs_meminfo *meminfo = NULL;
|
||||
struct pebs_gprs *gprs = NULL;
|
||||
struct x86_perf_regs *perf_regs;
|
||||
@ -1945,7 +1940,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
|
||||
perf_regs->xmm_regs = NULL;
|
||||
|
||||
sample_type = event->attr.sample_type;
|
||||
format_size = basic->format_size;
|
||||
format_group = basic->format_group;
|
||||
perf_sample_data_init(data, 0, event->hw.last_period);
|
||||
data->period = event->hw.last_period;
|
||||
|
||||
@ -1957,8 +1952,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
|
||||
* previous PMI context or an (I)RET happened between the record and
|
||||
* PMI.
|
||||
*/
|
||||
if (sample_type & PERF_SAMPLE_CALLCHAIN)
|
||||
perf_sample_save_callchain(data, event, iregs);
|
||||
perf_sample_save_callchain(data, event, iregs);
|
||||
|
||||
*regs = *iregs;
|
||||
/* The ip in basic is EventingIP */
|
||||
@ -1967,7 +1961,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
|
||||
|
||||
if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) {
|
||||
if (x86_pmu.flags & PMU_FL_RETIRE_LATENCY)
|
||||
data->weight.var3_w = format_size >> PEBS_RETIRE_LATENCY_OFFSET & PEBS_LATENCY_MASK;
|
||||
data->weight.var3_w = basic->retire_latency;
|
||||
else
|
||||
data->weight.var3_w = 0;
|
||||
}
|
||||
@ -1977,12 +1971,12 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
|
||||
* But PERF_SAMPLE_TRANSACTION needs gprs->ax.
|
||||
* Save the pointer here but process later.
|
||||
*/
|
||||
if (format_size & PEBS_DATACFG_MEMINFO) {
|
||||
if (format_group & PEBS_DATACFG_MEMINFO) {
|
||||
meminfo = next_record;
|
||||
next_record = meminfo + 1;
|
||||
}
|
||||
|
||||
if (format_size & PEBS_DATACFG_GP) {
|
||||
if (format_group & PEBS_DATACFG_GP) {
|
||||
gprs = next_record;
|
||||
next_record = gprs + 1;
|
||||
|
||||
@ -1995,14 +1989,13 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
|
||||
adaptive_pebs_save_regs(regs, gprs);
|
||||
}
|
||||
|
||||
if (format_size & PEBS_DATACFG_MEMINFO) {
|
||||
if (format_group & PEBS_DATACFG_MEMINFO) {
|
||||
if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) {
|
||||
u64 weight = meminfo->latency;
|
||||
u64 latency = x86_pmu.flags & PMU_FL_INSTR_LATENCY ?
|
||||
meminfo->cache_latency : meminfo->mem_latency;
|
||||
|
||||
if (x86_pmu.flags & PMU_FL_INSTR_LATENCY) {
|
||||
data->weight.var2_w = weight & PEBS_LATENCY_MASK;
|
||||
weight >>= PEBS_CACHE_LATENCY_OFFSET;
|
||||
}
|
||||
if (x86_pmu.flags & PMU_FL_INSTR_LATENCY)
|
||||
data->weight.var2_w = meminfo->instr_latency;
|
||||
|
||||
/*
|
||||
* Although meminfo::latency is defined as a u64,
|
||||
@ -2010,12 +2003,13 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
|
||||
* in practice on Ice Lake and earlier platforms.
|
||||
*/
|
||||
if (sample_type & PERF_SAMPLE_WEIGHT) {
|
||||
data->weight.full = weight ?:
|
||||
data->weight.full = latency ?:
|
||||
intel_get_tsx_weight(meminfo->tsx_tuning);
|
||||
} else {
|
||||
data->weight.var1_dw = (u32)(weight & PEBS_LATENCY_MASK) ?:
|
||||
data->weight.var1_dw = (u32)latency ?:
|
||||
intel_get_tsx_weight(meminfo->tsx_tuning);
|
||||
}
|
||||
|
||||
data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE;
|
||||
}
|
||||
|
||||
@ -2036,16 +2030,16 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
|
||||
}
|
||||
}
|
||||
|
||||
if (format_size & PEBS_DATACFG_XMMS) {
|
||||
if (format_group & PEBS_DATACFG_XMMS) {
|
||||
struct pebs_xmm *xmm = next_record;
|
||||
|
||||
next_record = xmm + 1;
|
||||
perf_regs->xmm_regs = xmm->xmm;
|
||||
}
|
||||
|
||||
if (format_size & PEBS_DATACFG_LBRS) {
|
||||
if (format_group & PEBS_DATACFG_LBRS) {
|
||||
struct lbr_entry *lbr = next_record;
|
||||
int num_lbr = ((format_size >> PEBS_DATACFG_LBR_SHIFT)
|
||||
int num_lbr = ((format_group >> PEBS_DATACFG_LBR_SHIFT)
|
||||
& 0xff) + 1;
|
||||
next_record = next_record + num_lbr * sizeof(struct lbr_entry);
|
||||
|
||||
@ -2055,11 +2049,11 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
|
||||
}
|
||||
}
|
||||
|
||||
WARN_ONCE(next_record != __pebs + (format_size >> 48),
|
||||
"PEBS record size %llu, expected %llu, config %llx\n",
|
||||
format_size >> 48,
|
||||
WARN_ONCE(next_record != __pebs + basic->format_size,
|
||||
"PEBS record size %u, expected %llu, config %llx\n",
|
||||
basic->format_size,
|
||||
(u64)(next_record - __pebs),
|
||||
basic->format_size);
|
||||
format_group);
|
||||
}
|
||||
|
||||
static inline void *
|
||||
@ -2170,46 +2164,33 @@ intel_pmu_save_and_restart_reload(struct perf_event *event, int count)
|
||||
return 0;
|
||||
}
|
||||
|
||||
typedef void (*setup_fn)(struct perf_event *, struct pt_regs *, void *,
|
||||
struct perf_sample_data *, struct pt_regs *);
|
||||
|
||||
static struct pt_regs dummy_iregs;
|
||||
|
||||
static __always_inline void
|
||||
__intel_pmu_pebs_event(struct perf_event *event,
|
||||
struct pt_regs *iregs,
|
||||
struct pt_regs *regs,
|
||||
struct perf_sample_data *data,
|
||||
void *base, void *top,
|
||||
int bit, int count,
|
||||
void (*setup_sample)(struct perf_event *,
|
||||
struct pt_regs *,
|
||||
void *,
|
||||
struct perf_sample_data *,
|
||||
struct pt_regs *))
|
||||
void *at,
|
||||
setup_fn setup_sample)
|
||||
{
|
||||
setup_sample(event, iregs, at, data, regs);
|
||||
perf_event_output(event, data, regs);
|
||||
}
|
||||
|
||||
static __always_inline void
|
||||
__intel_pmu_pebs_last_event(struct perf_event *event,
|
||||
struct pt_regs *iregs,
|
||||
struct pt_regs *regs,
|
||||
struct perf_sample_data *data,
|
||||
void *at,
|
||||
int count,
|
||||
setup_fn setup_sample)
|
||||
{
|
||||
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
||||
struct hw_perf_event *hwc = &event->hw;
|
||||
struct x86_perf_regs perf_regs;
|
||||
struct pt_regs *regs = &perf_regs.regs;
|
||||
void *at = get_next_pebs_record_by_bit(base, top, bit);
|
||||
static struct pt_regs dummy_iregs;
|
||||
|
||||
if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
|
||||
/*
|
||||
* Now, auto-reload is only enabled in fixed period mode.
|
||||
* The reload value is always hwc->sample_period.
|
||||
* May need to change it, if auto-reload is enabled in
|
||||
* freq mode later.
|
||||
*/
|
||||
intel_pmu_save_and_restart_reload(event, count);
|
||||
} else if (!intel_pmu_save_and_restart(event))
|
||||
return;
|
||||
|
||||
if (!iregs)
|
||||
iregs = &dummy_iregs;
|
||||
|
||||
while (count > 1) {
|
||||
setup_sample(event, iregs, at, data, regs);
|
||||
perf_event_output(event, data, regs);
|
||||
at += cpuc->pebs_record_size;
|
||||
at = get_next_pebs_record_by_bit(at, top, bit);
|
||||
count--;
|
||||
}
|
||||
|
||||
setup_sample(event, iregs, at, data, regs);
|
||||
if (iregs == &dummy_iregs) {
|
||||
@ -2228,6 +2209,44 @@ __intel_pmu_pebs_event(struct perf_event *event,
|
||||
if (perf_event_overflow(event, data, regs))
|
||||
x86_pmu_stop(event, 0);
|
||||
}
|
||||
|
||||
if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
|
||||
/*
|
||||
* Now, auto-reload is only enabled in fixed period mode.
|
||||
* The reload value is always hwc->sample_period.
|
||||
* May need to change it, if auto-reload is enabled in
|
||||
* freq mode later.
|
||||
*/
|
||||
intel_pmu_save_and_restart_reload(event, count);
|
||||
} else
|
||||
intel_pmu_save_and_restart(event);
|
||||
}
|
||||
|
||||
static __always_inline void
__intel_pmu_pebs_events(struct perf_event *event,
struct pt_regs *iregs,
struct perf_sample_data *data,
void *base, void *top,
int bit, int count,
setup_fn setup_sample)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct x86_perf_regs perf_regs;
struct pt_regs *regs = &perf_regs.regs;
void *at = get_next_pebs_record_by_bit(base, top, bit);
int cnt = count;

if (!iregs)
iregs = &dummy_iregs;

while (cnt > 1) {
__intel_pmu_pebs_event(event, iregs, regs, data, at, setup_sample);
at += cpuc->pebs_record_size;
at = get_next_pebs_record_by_bit(at, top, bit);
cnt--;
}

__intel_pmu_pebs_last_event(event, iregs, regs, data, at, count, setup_sample);
}
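
The two helpers above split a PEBS drain into phases: every record except the last only emits a sample, while the last record also folds in the counter update and overflow handling. A minimal user-space sketch of the same pattern follows; emit_sample() and emit_last() are illustrative stand-ins, not kernel APIs.

/* Sketch only: emit_sample()/emit_last() stand in for the two helpers above. */
#include <stdio.h>

static void emit_sample(int rec)
{
	printf("output sample for record %d\n", rec);
}

static void emit_last(int rec, int count)
{
	printf("output sample for record %d and retire %d periods\n", rec, count);
}

static void drain(int nr_records)
{
	int cnt = nr_records, rec = 0;

	while (cnt > 1) {		/* all but the last record: output only */
		emit_sample(rec++);
		cnt--;
	}
	emit_last(rec, nr_records);	/* last record also updates the counter */
}

int main(void)
{
	drain(3);
	return 0;
}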
|
||||
|
||||
static void intel_pmu_drain_pebs_core(struct pt_regs *iregs, struct perf_sample_data *data)
|
||||
@ -2264,8 +2283,8 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs, struct perf_sample_
|
||||
return;
|
||||
}
|
||||
|
||||
__intel_pmu_pebs_event(event, iregs, data, at, top, 0, n,
|
||||
setup_pebs_fixed_sample_data);
|
||||
__intel_pmu_pebs_events(event, iregs, data, at, top, 0, n,
|
||||
setup_pebs_fixed_sample_data);
|
||||
}
|
||||
|
||||
static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, int size)
|
||||
@ -2396,9 +2415,9 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d
|
||||
}
|
||||
|
||||
if (counts[bit]) {
|
||||
__intel_pmu_pebs_event(event, iregs, data, base,
|
||||
top, bit, counts[bit],
|
||||
setup_pebs_fixed_sample_data);
|
||||
__intel_pmu_pebs_events(event, iregs, data, base,
|
||||
top, bit, counts[bit],
|
||||
setup_pebs_fixed_sample_data);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -2406,8 +2425,12 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d
|
||||
static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_data *data)
|
||||
{
|
||||
short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {};
|
||||
void *last[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS];
|
||||
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
||||
struct debug_store *ds = cpuc->ds;
|
||||
struct x86_perf_regs perf_regs;
|
||||
struct pt_regs *regs = &perf_regs.regs;
|
||||
struct pebs_basic *basic;
|
||||
struct perf_event *event;
|
||||
void *base, *at, *top;
|
||||
int bit;
|
||||
@ -2429,30 +2452,41 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d
|
||||
return;
|
||||
}
|
||||
|
||||
for (at = base; at < top; at += cpuc->pebs_record_size) {
|
||||
if (!iregs)
|
||||
iregs = &dummy_iregs;
|
||||
|
||||
/* Process all but the last event for each counter. */
|
||||
for (at = base; at < top; at += basic->format_size) {
|
||||
u64 pebs_status;
|
||||
|
||||
pebs_status = get_pebs_status(at) & cpuc->pebs_enabled;
|
||||
pebs_status &= mask;
|
||||
basic = at;
|
||||
if (basic->format_size != cpuc->pebs_record_size)
|
||||
continue;
|
||||
|
||||
for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX)
|
||||
counts[bit]++;
|
||||
pebs_status = basic->applicable_counters & cpuc->pebs_enabled & mask;
|
||||
for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX) {
|
||||
event = cpuc->events[bit];
|
||||
|
||||
if (WARN_ON_ONCE(!event) ||
|
||||
WARN_ON_ONCE(!event->attr.precise_ip))
|
||||
continue;
|
||||
|
||||
if (counts[bit]++) {
|
||||
__intel_pmu_pebs_event(event, iregs, regs, data, last[bit],
|
||||
setup_pebs_adaptive_sample_data);
|
||||
}
|
||||
last[bit] = at;
|
||||
}
|
||||
}
|
||||
|
||||
for_each_set_bit(bit, (unsigned long *)&mask, X86_PMC_IDX_MAX) {
|
||||
if (counts[bit] == 0)
|
||||
if (!counts[bit])
|
||||
continue;
|
||||
|
||||
event = cpuc->events[bit];
|
||||
if (WARN_ON_ONCE(!event))
|
||||
continue;
|
||||
|
||||
if (WARN_ON_ONCE(!event->attr.precise_ip))
|
||||
continue;
|
||||
|
||||
__intel_pmu_pebs_event(event, iregs, data, base,
|
||||
top, bit, counts[bit],
|
||||
setup_pebs_adaptive_sample_data);
|
||||
__intel_pmu_pebs_last_event(event, iregs, regs, data, last[bit],
|
||||
counts[bit], setup_pebs_adaptive_sample_data);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -18,6 +18,7 @@
|
||||
#include <linux/slab.h>
|
||||
#include <linux/device.h>
|
||||
|
||||
#include <asm/cpuid.h>
|
||||
#include <asm/perf_event.h>
|
||||
#include <asm/insn.h>
|
||||
#include <asm/io.h>
|
||||
@ -201,10 +202,10 @@ static int __init pt_pmu_hw_init(void)
|
||||
* otherwise, zero for numerator stands for "not enumerated"
|
||||
* as per SDM
|
||||
*/
|
||||
if (boot_cpu_data.cpuid_level >= CPUID_TSC_LEAF) {
|
||||
if (boot_cpu_data.cpuid_level >= CPUID_LEAF_TSC) {
|
||||
u32 eax, ebx, ecx, edx;
|
||||
|
||||
cpuid(CPUID_TSC_LEAF, &eax, &ebx, &ecx, &edx);
|
||||
cpuid(CPUID_LEAF_TSC, &eax, &ebx, &ecx, &edx);
|
||||
|
||||
pt_pmu.tsc_art_num = ebx;
|
||||
pt_pmu.tsc_art_den = eax;
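
The two values stored above are the TSC/ART ratio enumerated by CPUID leaf 0x15 (EBX is the numerator, EAX the denominator). The sketch below shows how that ratio yields a TSC frequency estimate in user space; the 24 MHz crystal fallback is only an assumption for parts that report 0 in ECX.

/* User-space sketch: TSC frequency from the CPUID 0x15 TSC/ART ratio. */
#include <stdio.h>
#include <stdint.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(0x15, &eax, &ebx, &ecx, &edx) || !eax || !ebx)
		return 1;			/* ratio not enumerated */

	/* ECX is the crystal clock in Hz; 0 means not enumerated (24 MHz assumed). */
	uint64_t crystal_hz = ecx ? ecx : 24000000;
	uint64_t tsc_hz = crystal_hz * ebx / eax;	/* TSC = crystal * num / den */

	printf("TSC/ART ratio %u/%u, TSC ~ %llu Hz\n",
	       ebx, eax, (unsigned long long)tsc_hz);
	return 0;
}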
|
||||
|
@ -37,9 +37,6 @@ struct topa_entry {
|
||||
u64 rsvd4 : 12;
|
||||
};
|
||||
|
||||
/* TSC to Core Crystal Clock Ratio */
|
||||
#define CPUID_TSC_LEAF 0x15
|
||||
|
||||
struct pt_pmu {
|
||||
struct pmu pmu;
|
||||
u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES];
|
||||
|
@ -745,7 +745,7 @@ static int uncore_pmu_event_init(struct perf_event *event)
|
||||
|
||||
pmu = uncore_event_to_pmu(event);
|
||||
/* no device found for this pmu */
|
||||
if (pmu->func_id < 0)
|
||||
if (!pmu->registered)
|
||||
return -ENOENT;
|
||||
|
||||
/* Sampling not supported yet */
|
||||
@ -992,7 +992,7 @@ static void uncore_types_exit(struct intel_uncore_type **types)
|
||||
uncore_type_exit(*types);
|
||||
}
|
||||
|
||||
static int __init uncore_type_init(struct intel_uncore_type *type, bool setid)
|
||||
static int __init uncore_type_init(struct intel_uncore_type *type)
|
||||
{
|
||||
struct intel_uncore_pmu *pmus;
|
||||
size_t size;
|
||||
@ -1005,7 +1005,6 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid)
|
||||
size = uncore_max_dies() * sizeof(struct intel_uncore_box *);
|
||||
|
||||
for (i = 0; i < type->num_boxes; i++) {
|
||||
pmus[i].func_id = setid ? i : -1;
|
||||
pmus[i].pmu_idx = i;
|
||||
pmus[i].type = type;
|
||||
pmus[i].boxes = kzalloc(size, GFP_KERNEL);
|
||||
@ -1055,12 +1054,12 @@ err:
|
||||
}
|
||||
|
||||
static int __init
|
||||
uncore_types_init(struct intel_uncore_type **types, bool setid)
|
||||
uncore_types_init(struct intel_uncore_type **types)
|
||||
{
|
||||
int ret;
|
||||
|
||||
for (; *types; types++) {
|
||||
ret = uncore_type_init(*types, setid);
|
||||
ret = uncore_type_init(*types);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
@ -1160,11 +1159,6 @@ static int uncore_pci_pmu_register(struct pci_dev *pdev,
|
||||
if (!box)
|
||||
return -ENOMEM;
|
||||
|
||||
if (pmu->func_id < 0)
|
||||
pmu->func_id = pdev->devfn;
|
||||
else
|
||||
WARN_ON_ONCE(pmu->func_id != pdev->devfn);
|
||||
|
||||
atomic_inc(&box->refcnt);
|
||||
box->dieid = die;
|
||||
box->pci_dev = pdev;
|
||||
@ -1410,7 +1404,7 @@ static int __init uncore_pci_init(void)
|
||||
goto err;
|
||||
}
|
||||
|
||||
ret = uncore_types_init(uncore_pci_uncores, false);
|
||||
ret = uncore_types_init(uncore_pci_uncores);
|
||||
if (ret)
|
||||
goto errtype;
|
||||
|
||||
@ -1678,7 +1672,7 @@ static int __init uncore_cpu_init(void)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = uncore_types_init(uncore_msr_uncores, true);
|
||||
ret = uncore_types_init(uncore_msr_uncores);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
@ -1697,7 +1691,7 @@ static int __init uncore_mmio_init(void)
|
||||
struct intel_uncore_type **types = uncore_mmio_uncores;
|
||||
int ret;
|
||||
|
||||
ret = uncore_types_init(types, true);
|
||||
ret = uncore_types_init(types);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
|
@ -125,7 +125,6 @@ struct intel_uncore_pmu {
|
||||
struct pmu pmu;
|
||||
char name[UNCORE_PMU_NAME_LEN];
|
||||
int pmu_idx;
|
||||
int func_id;
|
||||
bool registered;
|
||||
atomic_t activeboxes;
|
||||
cpumask_t cpu_mask;
|
||||
|
@ -910,7 +910,7 @@ static int snb_uncore_imc_event_init(struct perf_event *event)
|
||||
|
||||
pmu = uncore_event_to_pmu(event);
|
||||
/* no device found for this pmu */
|
||||
if (pmu->func_id < 0)
|
||||
if (!pmu->registered)
|
||||
return -ENOENT;
|
||||
|
||||
/* Sampling not supported yet */
|
||||
|
@ -6684,17 +6684,8 @@ void spr_uncore_mmio_init(void)
|
||||
/* GNR uncore support */
|
||||
|
||||
#define UNCORE_GNR_NUM_UNCORE_TYPES 23
|
||||
#define UNCORE_GNR_TYPE_15 15
|
||||
#define UNCORE_GNR_B2UPI 18
|
||||
#define UNCORE_GNR_TYPE_21 21
|
||||
#define UNCORE_GNR_TYPE_22 22
|
||||
|
||||
int gnr_uncore_units_ignore[] = {
|
||||
UNCORE_SPR_UPI,
|
||||
UNCORE_GNR_TYPE_15,
|
||||
UNCORE_GNR_B2UPI,
|
||||
UNCORE_GNR_TYPE_21,
|
||||
UNCORE_GNR_TYPE_22,
|
||||
UNCORE_IGNORE_END
|
||||
};
|
||||
|
||||
@ -6703,6 +6694,31 @@ static struct intel_uncore_type gnr_uncore_ubox = {
|
||||
.attr_update = uncore_alias_groups,
|
||||
};
|
||||
|
||||
static struct intel_uncore_type gnr_uncore_pciex8 = {
|
||||
SPR_UNCORE_PCI_COMMON_FORMAT(),
|
||||
.name = "pciex8",
|
||||
};
|
||||
|
||||
static struct intel_uncore_type gnr_uncore_pciex16 = {
|
||||
SPR_UNCORE_PCI_COMMON_FORMAT(),
|
||||
.name = "pciex16",
|
||||
};
|
||||
|
||||
static struct intel_uncore_type gnr_uncore_upi = {
|
||||
SPR_UNCORE_PCI_COMMON_FORMAT(),
|
||||
.name = "upi",
|
||||
};
|
||||
|
||||
static struct intel_uncore_type gnr_uncore_b2upi = {
|
||||
SPR_UNCORE_PCI_COMMON_FORMAT(),
|
||||
.name = "b2upi",
|
||||
};
|
||||
|
||||
static struct intel_uncore_type gnr_uncore_b2hot = {
|
||||
.name = "b2hot",
|
||||
.attr_update = uncore_alias_groups,
|
||||
};
|
||||
|
||||
static struct intel_uncore_type gnr_uncore_b2cmi = {
|
||||
SPR_UNCORE_PCI_COMMON_FORMAT(),
|
||||
.name = "b2cmi",
|
||||
@ -6727,21 +6743,21 @@ static struct intel_uncore_type *gnr_uncores[UNCORE_GNR_NUM_UNCORE_TYPES] = {
|
||||
&gnr_uncore_ubox,
|
||||
&spr_uncore_imc,
|
||||
NULL,
|
||||
&gnr_uncore_upi,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
&spr_uncore_cxlcm,
|
||||
&spr_uncore_cxldp,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
&gnr_uncore_b2hot,
|
||||
&gnr_uncore_b2cmi,
|
||||
&gnr_uncore_b2cxl,
|
||||
NULL,
|
||||
&gnr_uncore_b2upi,
|
||||
NULL,
|
||||
&gnr_uncore_mdf_sbo,
|
||||
NULL,
|
||||
NULL,
|
||||
&gnr_uncore_pciex16,
|
||||
&gnr_uncore_pciex8,
|
||||
};
|
||||
|
||||
static struct freerunning_counters gnr_iio_freerunning[] = {
|
||||
|
@ -624,6 +624,7 @@ union perf_capabilities {
|
||||
u64 pebs_output_pt_available:1;
|
||||
u64 pebs_timing_info:1;
|
||||
u64 anythread_deprecated:1;
|
||||
u64 rdpmc_metrics_clear:1;
|
||||
};
|
||||
u64 capabilities;
|
||||
};
|
||||
|
@ -39,6 +39,10 @@
|
||||
* event: rapl_energy_psys
|
||||
* perf code: 0x5
|
||||
*
|
||||
* core counter: consumption of a single physical core
|
||||
* event: rapl_energy_core (power_core PMU)
|
||||
* perf code: 0x1
|
||||
*
|
||||
* We manage those counters as free running (read-only). They may be
|
||||
* use simultaneously by other tools, such as turbostat.
|
||||
*
|
||||
@ -70,18 +74,22 @@ MODULE_LICENSE("GPL");
|
||||
/*
|
||||
* RAPL energy status counters
|
||||
*/
|
||||
enum perf_rapl_events {
|
||||
enum perf_rapl_pkg_events {
|
||||
PERF_RAPL_PP0 = 0, /* all cores */
|
||||
PERF_RAPL_PKG, /* entire package */
|
||||
PERF_RAPL_RAM, /* DRAM */
|
||||
PERF_RAPL_PP1, /* gpu */
|
||||
PERF_RAPL_PSYS, /* psys */
|
||||
|
||||
PERF_RAPL_MAX,
|
||||
NR_RAPL_DOMAINS = PERF_RAPL_MAX,
|
||||
PERF_RAPL_PKG_EVENTS_MAX,
|
||||
NR_RAPL_PKG_DOMAINS = PERF_RAPL_PKG_EVENTS_MAX,
|
||||
};
|
||||
|
||||
static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
|
||||
#define PERF_RAPL_CORE 0 /* single core */
|
||||
#define PERF_RAPL_CORE_EVENTS_MAX 1
|
||||
#define NR_RAPL_CORE_DOMAINS PERF_RAPL_CORE_EVENTS_MAX
|
||||
|
||||
static const char *const rapl_pkg_domain_names[NR_RAPL_PKG_DOMAINS] __initconst = {
|
||||
"pp0-core",
|
||||
"package",
|
||||
"dram",
|
||||
@ -89,6 +97,8 @@ static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
|
||||
"psys",
|
||||
};
|
||||
|
||||
static const char *const rapl_core_domain_name __initconst = "core";
|
||||
|
||||
/*
|
||||
* event code: LSB 8 bits, passed in attr->config
|
||||
* any other bit is reserved
|
||||
@ -112,7 +122,7 @@ static struct perf_pmu_events_attr event_attr_##v = { \
|
||||
* considered as either pkg-scope or die-scope, and we are considering
|
||||
* them as die-scope.
|
||||
*/
|
||||
#define rapl_pmu_is_pkg_scope() \
|
||||
#define rapl_pkg_pmu_is_pkg_scope() \
|
||||
(boot_cpu_data.x86_vendor == X86_VENDOR_AMD || \
|
||||
boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
|
||||
|
||||
@ -129,7 +139,8 @@ struct rapl_pmu {
|
||||
struct rapl_pmus {
|
||||
struct pmu pmu;
|
||||
unsigned int nr_rapl_pmu;
|
||||
struct rapl_pmu *pmus[] __counted_by(nr_rapl_pmu);
|
||||
unsigned int cntr_mask;
|
||||
struct rapl_pmu *rapl_pmu[] __counted_by(nr_rapl_pmu);
|
||||
};
|
||||
|
||||
enum rapl_unit_quirk {
|
||||
@ -139,44 +150,43 @@ enum rapl_unit_quirk {
|
||||
};
|
||||
|
||||
struct rapl_model {
|
||||
struct perf_msr *rapl_msrs;
|
||||
unsigned long events;
|
||||
struct perf_msr *rapl_pkg_msrs;
|
||||
struct perf_msr *rapl_core_msrs;
|
||||
unsigned long pkg_events;
|
||||
unsigned long core_events;
|
||||
unsigned int msr_power_unit;
|
||||
enum rapl_unit_quirk unit_quirk;
|
||||
};
|
||||
|
||||
/* 1/2^hw_unit Joule */
|
||||
static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly;
|
||||
static struct rapl_pmus *rapl_pmus;
|
||||
static unsigned int rapl_cntr_mask;
|
||||
static int rapl_pkg_hw_unit[NR_RAPL_PKG_DOMAINS] __read_mostly;
|
||||
static int rapl_core_hw_unit __read_mostly;
|
||||
static struct rapl_pmus *rapl_pmus_pkg;
|
||||
static struct rapl_pmus *rapl_pmus_core;
|
||||
static u64 rapl_timer_ms;
|
||||
static struct perf_msr *rapl_msrs;
|
||||
static struct rapl_model *rapl_model;
|
||||
|
||||
/*
|
||||
* Helper functions to get the correct topology macros according to the
|
||||
* Helper function to get the correct topology id according to the
|
||||
* RAPL PMU scope.
|
||||
*/
|
||||
static inline unsigned int get_rapl_pmu_idx(int cpu)
|
||||
static inline unsigned int get_rapl_pmu_idx(int cpu, int scope)
|
||||
{
|
||||
return rapl_pmu_is_pkg_scope() ? topology_logical_package_id(cpu) :
|
||||
topology_logical_die_id(cpu);
|
||||
}
|
||||
|
||||
static inline const struct cpumask *get_rapl_pmu_cpumask(int cpu)
|
||||
{
|
||||
return rapl_pmu_is_pkg_scope() ? topology_core_cpumask(cpu) :
|
||||
topology_die_cpumask(cpu);
|
||||
}
|
||||
|
||||
static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu)
{
unsigned int rapl_pmu_idx = get_rapl_pmu_idx(cpu);

/*
* The unsigned check also catches the '-1' return value for non
* existent mappings in the topology map.
* Returns unsigned int, which converts the '-1' return value
* (for non-existent mappings in topology map) to UINT_MAX, so
* the error check in the caller is simplified.
*/
return rapl_pmu_idx < rapl_pmus->nr_rapl_pmu ? rapl_pmus->pmus[rapl_pmu_idx] : NULL;
switch (scope) {
case PERF_PMU_SCOPE_PKG:
return topology_logical_package_id(cpu);
case PERF_PMU_SCOPE_DIE:
return topology_logical_die_id(cpu);
case PERF_PMU_SCOPE_CORE:
return topology_logical_core_id(cpu);
default:
return -EINVAL;
}
}
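
The comment above depends on the unsigned-return idiom: a -1 from the topology lookup wraps to UINT_MAX, so the caller's single bounds check (rapl_pmu_idx >= rapl_pmus->nr_rapl_pmu, later in this patch) rejects both out-of-range and unmapped CPUs. A small stand-alone sketch of that idiom, with made-up values:

/* Stand-alone sketch of the unsigned bounds-check idiom described above. */
#include <stdio.h>

static unsigned int topo_id(int cpu)
{
	return cpu < 4 ? cpu : -1;	/* -1 models a missing topology mapping */
}

int main(void)
{
	unsigned int nr_pmus = 4;
	unsigned int idx = topo_id(7);	/* unmapped CPU: -1 wraps to UINT_MAX */

	/* One comparison rejects both out-of-range indices and the -1 case. */
	if (idx < nr_pmus)
		printf("using pmu %u\n", idx);
	else
		printf("rejected, idx=%u\n", idx);
	return 0;
}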
|
||||
|
||||
static inline u64 rapl_read_counter(struct perf_event *event)
|
||||
@ -186,19 +196,20 @@ static inline u64 rapl_read_counter(struct perf_event *event)
|
||||
return raw;
|
||||
}
|
||||
|
||||
static inline u64 rapl_scale(u64 v, int cfg)
static inline u64 rapl_scale(u64 v, struct perf_event *event)
{
if (cfg > NR_RAPL_DOMAINS) {
pr_warn("Invalid domain %d, failed to scale data\n", cfg);
return v;
}
int hw_unit = rapl_pkg_hw_unit[event->hw.config - 1];

if (event->pmu->scope == PERF_PMU_SCOPE_CORE)
hw_unit = rapl_core_hw_unit;

/*
* scale delta to smallest unit (1/2^32)
* users must then scale back: count * 1/(1e9*2^32) to get Joules
* or use ldexp(count, -32).
* Watts = Joules/Time delta
*/
return v << (32 - rapl_hw_unit[cfg - 1]);
return v << (32 - hw_unit);
}
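
A worked example of the rescaling above, assuming a typical energy-status unit of 2^-14 J: the shift normalizes every domain to 1/2^32 J per LSB, which is why the exported perf scale is 2.3283064365386962890625e-10 and ldexp(count, -32) recovers Joules.

/* Worked example of the fixed-point scaling above (not kernel code). */
#include <stdio.h>
#include <stdint.h>
#include <math.h>

int main(void)
{
	int hw_unit = 14;		/* assumed ESU: counter LSB = 2^-14 J */
	uint64_t delta = 32768;		/* raw delta = 2 J at 2^-14 J per LSB */

	/* Kernel side: rescale so one LSB is always 2^-32 J. */
	uint64_t scaled = delta << (32 - hw_unit);

	/* User side: multiply by the exported 2^-32 scale to get Joules. */
	double joules = ldexp((double)scaled, -32);

	printf("scaled=%llu joules=%.3f\n", (unsigned long long)scaled, joules);
	return 0;
}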
|
||||
|
||||
static u64 rapl_event_update(struct perf_event *event)
|
||||
@ -225,7 +236,7 @@ static u64 rapl_event_update(struct perf_event *event)
|
||||
delta = (new_raw_count << shift) - (prev_raw_count << shift);
|
||||
delta >>= shift;
|
||||
|
||||
sdelta = rapl_scale(delta, event->hw.config);
|
||||
sdelta = rapl_scale(delta, event);
|
||||
|
||||
local64_add(sdelta, &event->count);
|
||||
|
||||
@ -240,34 +251,34 @@ static void rapl_start_hrtimer(struct rapl_pmu *pmu)
|
||||
|
||||
static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
|
||||
{
|
||||
struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer);
|
||||
struct rapl_pmu *rapl_pmu = container_of(hrtimer, struct rapl_pmu, hrtimer);
|
||||
struct perf_event *event;
|
||||
unsigned long flags;
|
||||
|
||||
if (!pmu->n_active)
|
||||
if (!rapl_pmu->n_active)
|
||||
return HRTIMER_NORESTART;
|
||||
|
||||
raw_spin_lock_irqsave(&pmu->lock, flags);
|
||||
raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
|
||||
|
||||
list_for_each_entry(event, &pmu->active_list, active_entry)
|
||||
list_for_each_entry(event, &rapl_pmu->active_list, active_entry)
|
||||
rapl_event_update(event);
|
||||
|
||||
raw_spin_unlock_irqrestore(&pmu->lock, flags);
|
||||
raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
|
||||
|
||||
hrtimer_forward_now(hrtimer, pmu->timer_interval);
|
||||
hrtimer_forward_now(hrtimer, rapl_pmu->timer_interval);
|
||||
|
||||
return HRTIMER_RESTART;
|
||||
}
|
||||
|
||||
static void rapl_hrtimer_init(struct rapl_pmu *pmu)
|
||||
static void rapl_hrtimer_init(struct rapl_pmu *rapl_pmu)
|
||||
{
|
||||
struct hrtimer *hr = &pmu->hrtimer;
|
||||
struct hrtimer *hr = &rapl_pmu->hrtimer;
|
||||
|
||||
hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
|
||||
hr->function = rapl_hrtimer_handle;
|
||||
}
|
||||
|
||||
static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
|
||||
static void __rapl_pmu_event_start(struct rapl_pmu *rapl_pmu,
|
||||
struct perf_event *event)
|
||||
{
|
||||
if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
|
||||
@ -275,39 +286,39 @@ static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
|
||||
|
||||
event->hw.state = 0;
|
||||
|
||||
list_add_tail(&event->active_entry, &pmu->active_list);
|
||||
list_add_tail(&event->active_entry, &rapl_pmu->active_list);
|
||||
|
||||
local64_set(&event->hw.prev_count, rapl_read_counter(event));
|
||||
|
||||
pmu->n_active++;
|
||||
if (pmu->n_active == 1)
|
||||
rapl_start_hrtimer(pmu);
|
||||
rapl_pmu->n_active++;
|
||||
if (rapl_pmu->n_active == 1)
|
||||
rapl_start_hrtimer(rapl_pmu);
|
||||
}
|
||||
|
||||
static void rapl_pmu_event_start(struct perf_event *event, int mode)
|
||||
{
|
||||
struct rapl_pmu *pmu = event->pmu_private;
|
||||
struct rapl_pmu *rapl_pmu = event->pmu_private;
|
||||
unsigned long flags;
|
||||
|
||||
raw_spin_lock_irqsave(&pmu->lock, flags);
|
||||
__rapl_pmu_event_start(pmu, event);
|
||||
raw_spin_unlock_irqrestore(&pmu->lock, flags);
|
||||
raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
|
||||
__rapl_pmu_event_start(rapl_pmu, event);
|
||||
raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
|
||||
}
|
||||
|
||||
static void rapl_pmu_event_stop(struct perf_event *event, int mode)
|
||||
{
|
||||
struct rapl_pmu *pmu = event->pmu_private;
|
||||
struct rapl_pmu *rapl_pmu = event->pmu_private;
|
||||
struct hw_perf_event *hwc = &event->hw;
|
||||
unsigned long flags;
|
||||
|
||||
raw_spin_lock_irqsave(&pmu->lock, flags);
|
||||
raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
|
||||
|
||||
/* mark event as deactivated and stopped */
|
||||
if (!(hwc->state & PERF_HES_STOPPED)) {
|
||||
WARN_ON_ONCE(pmu->n_active <= 0);
|
||||
pmu->n_active--;
|
||||
if (pmu->n_active == 0)
|
||||
hrtimer_cancel(&pmu->hrtimer);
|
||||
WARN_ON_ONCE(rapl_pmu->n_active <= 0);
|
||||
rapl_pmu->n_active--;
|
||||
if (rapl_pmu->n_active == 0)
|
||||
hrtimer_cancel(&rapl_pmu->hrtimer);
|
||||
|
||||
list_del(&event->active_entry);
|
||||
|
||||
@ -325,23 +336,23 @@ static void rapl_pmu_event_stop(struct perf_event *event, int mode)
|
||||
hwc->state |= PERF_HES_UPTODATE;
|
||||
}
|
||||
|
||||
raw_spin_unlock_irqrestore(&pmu->lock, flags);
|
||||
raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
|
||||
}
|
||||
|
||||
static int rapl_pmu_event_add(struct perf_event *event, int mode)
|
||||
{
|
||||
struct rapl_pmu *pmu = event->pmu_private;
|
||||
struct rapl_pmu *rapl_pmu = event->pmu_private;
|
||||
struct hw_perf_event *hwc = &event->hw;
|
||||
unsigned long flags;
|
||||
|
||||
raw_spin_lock_irqsave(&pmu->lock, flags);
|
||||
raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
|
||||
|
||||
hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
|
||||
|
||||
if (mode & PERF_EF_START)
|
||||
__rapl_pmu_event_start(pmu, event);
|
||||
__rapl_pmu_event_start(rapl_pmu, event);
|
||||
|
||||
raw_spin_unlock_irqrestore(&pmu->lock, flags);
|
||||
raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -354,12 +365,14 @@ static void rapl_pmu_event_del(struct perf_event *event, int flags)
|
||||
static int rapl_pmu_event_init(struct perf_event *event)
|
||||
{
|
||||
u64 cfg = event->attr.config & RAPL_EVENT_MASK;
|
||||
int bit, ret = 0;
|
||||
struct rapl_pmu *pmu;
|
||||
int bit, rapl_pmus_scope, ret = 0;
|
||||
struct rapl_pmu *rapl_pmu;
|
||||
unsigned int rapl_pmu_idx;
|
||||
struct rapl_pmus *rapl_pmus;
|
||||
|
||||
/* only look at RAPL events */
|
||||
if (event->attr.type != rapl_pmus->pmu.type)
|
||||
return -ENOENT;
|
||||
/* unsupported modes and filters */
|
||||
if (event->attr.sample_period) /* no sampling */
|
||||
return -EINVAL;
|
||||
|
||||
/* check only supported bits are set */
|
||||
if (event->attr.config & ~RAPL_EVENT_MASK)
|
||||
@ -368,26 +381,49 @@ static int rapl_pmu_event_init(struct perf_event *event)
|
||||
if (event->cpu < 0)
|
||||
return -EINVAL;
|
||||
|
||||
if (!cfg || cfg >= NR_RAPL_DOMAINS + 1)
|
||||
rapl_pmus = container_of(event->pmu, struct rapl_pmus, pmu);
|
||||
if (!rapl_pmus)
|
||||
return -EINVAL;
|
||||
rapl_pmus_scope = rapl_pmus->pmu.scope;
|
||||
|
||||
cfg = array_index_nospec((long)cfg, NR_RAPL_DOMAINS + 1);
|
||||
bit = cfg - 1;
|
||||
if (rapl_pmus_scope == PERF_PMU_SCOPE_PKG || rapl_pmus_scope == PERF_PMU_SCOPE_DIE) {
|
||||
/* only look at RAPL package events */
|
||||
if (event->attr.type != rapl_pmus_pkg->pmu.type)
|
||||
return -ENOENT;
|
||||
|
||||
cfg = array_index_nospec((long)cfg, NR_RAPL_PKG_DOMAINS + 1);
|
||||
if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1)
|
||||
return -EINVAL;
|
||||
|
||||
bit = cfg - 1;
|
||||
event->hw.event_base = rapl_model->rapl_pkg_msrs[bit].msr;
|
||||
} else if (rapl_pmus_scope == PERF_PMU_SCOPE_CORE) {
|
||||
/* only look at RAPL core events */
|
||||
if (event->attr.type != rapl_pmus_core->pmu.type)
|
||||
return -ENOENT;
|
||||
|
||||
cfg = array_index_nospec((long)cfg, NR_RAPL_CORE_DOMAINS + 1);
|
||||
if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1)
|
||||
return -EINVAL;
|
||||
|
||||
bit = cfg - 1;
|
||||
event->hw.event_base = rapl_model->rapl_core_msrs[bit].msr;
|
||||
} else
|
||||
return -EINVAL;
|
||||
|
||||
/* check event supported */
|
||||
if (!(rapl_cntr_mask & (1 << bit)))
|
||||
if (!(rapl_pmus->cntr_mask & (1 << bit)))
|
||||
return -EINVAL;
|
||||
|
||||
/* unsupported modes and filters */
|
||||
if (event->attr.sample_period) /* no sampling */
|
||||
rapl_pmu_idx = get_rapl_pmu_idx(event->cpu, rapl_pmus_scope);
|
||||
if (rapl_pmu_idx >= rapl_pmus->nr_rapl_pmu)
|
||||
return -EINVAL;
|
||||
|
||||
/* must be done before validate_group */
|
||||
pmu = cpu_to_rapl_pmu(event->cpu);
|
||||
if (!pmu)
|
||||
rapl_pmu = rapl_pmus->rapl_pmu[rapl_pmu_idx];
|
||||
if (!rapl_pmu)
|
||||
return -EINVAL;
|
||||
event->pmu_private = pmu;
|
||||
event->hw.event_base = rapl_msrs[bit].msr;
|
||||
|
||||
event->pmu_private = rapl_pmu;
|
||||
event->hw.config = cfg;
|
||||
event->hw.idx = bit;
|
||||
|
||||
@ -404,12 +440,14 @@ RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02");
|
||||
RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03");
|
||||
RAPL_EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04");
|
||||
RAPL_EVENT_ATTR_STR(energy-psys, rapl_psys, "event=0x05");
|
||||
RAPL_EVENT_ATTR_STR(energy-core, rapl_core, "event=0x01");
|
||||
|
||||
RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
|
||||
RAPL_EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules");
|
||||
RAPL_EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules");
|
||||
RAPL_EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules");
|
||||
RAPL_EVENT_ATTR_STR(energy-psys.unit, rapl_psys_unit, "Joules");
|
||||
RAPL_EVENT_ATTR_STR(energy-core.unit, rapl_core_unit, "Joules");
|
||||
|
||||
/*
|
||||
* we compute in 0.23 nJ increments regardless of MSR
|
||||
@ -419,6 +457,7 @@ RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890
|
||||
RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10");
|
||||
RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10");
|
||||
RAPL_EVENT_ATTR_STR(energy-psys.scale, rapl_psys_scale, "2.3283064365386962890625e-10");
|
||||
RAPL_EVENT_ATTR_STR(energy-core.scale, rapl_core_scale, "2.3283064365386962890625e-10");
|
||||
|
||||
/*
|
||||
* There are no default events, but we need to create
|
||||
@ -451,6 +490,12 @@ static const struct attribute_group *rapl_attr_groups[] = {
|
||||
NULL,
|
||||
};
|
||||
|
||||
static const struct attribute_group *rapl_core_attr_groups[] = {
|
||||
&rapl_pmu_format_group,
|
||||
&rapl_pmu_events_group,
|
||||
NULL,
|
||||
};
|
||||
|
||||
static struct attribute *rapl_events_cores[] = {
|
||||
EVENT_PTR(rapl_cores),
|
||||
EVENT_PTR(rapl_cores_unit),
|
||||
@ -511,6 +556,18 @@ static struct attribute_group rapl_events_psys_group = {
|
||||
.attrs = rapl_events_psys,
|
||||
};
|
||||
|
||||
static struct attribute *rapl_events_core[] = {
|
||||
EVENT_PTR(rapl_core),
|
||||
EVENT_PTR(rapl_core_unit),
|
||||
EVENT_PTR(rapl_core_scale),
|
||||
NULL,
|
||||
};
|
||||
|
||||
static struct attribute_group rapl_events_core_group = {
|
||||
.name = "events",
|
||||
.attrs = rapl_events_core,
|
||||
};
|
||||
|
||||
static bool test_msr(int idx, void *data)
|
||||
{
|
||||
return test_bit(idx, (unsigned long *) data);
|
||||
@ -536,11 +593,11 @@ static struct perf_msr intel_rapl_spr_msrs[] = {
|
||||
};
|
||||
|
||||
/*
|
||||
* Force to PERF_RAPL_MAX size due to:
|
||||
* - perf_msr_probe(PERF_RAPL_MAX)
|
||||
* Force to PERF_RAPL_PKG_EVENTS_MAX size due to:
|
||||
* - perf_msr_probe(PERF_RAPL_PKG_EVENTS_MAX)
|
||||
* - want to use same event codes across both architectures
|
||||
*/
|
||||
static struct perf_msr amd_rapl_msrs[] = {
|
||||
static struct perf_msr amd_rapl_pkg_msrs[] = {
|
||||
[PERF_RAPL_PP0] = { 0, &rapl_events_cores_group, NULL, false, 0 },
|
||||
[PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK },
|
||||
[PERF_RAPL_RAM] = { 0, &rapl_events_ram_group, NULL, false, 0 },
|
||||
@ -548,18 +605,25 @@ static struct perf_msr amd_rapl_msrs[] = {
|
||||
[PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, NULL, false, 0 },
|
||||
};
|
||||
|
||||
static int rapl_check_hw_unit(struct rapl_model *rm)
|
||||
static struct perf_msr amd_rapl_core_msrs[] = {
|
||||
[PERF_RAPL_CORE] = { MSR_AMD_CORE_ENERGY_STATUS, &rapl_events_core_group,
|
||||
test_msr, false, RAPL_MSR_MASK },
|
||||
};
|
||||
|
||||
static int rapl_check_hw_unit(void)
|
||||
{
|
||||
u64 msr_rapl_power_unit_bits;
|
||||
int i;
|
||||
|
||||
/* protect rdmsrl() to handle virtualization */
|
||||
if (rdmsrl_safe(rm->msr_power_unit, &msr_rapl_power_unit_bits))
|
||||
if (rdmsrl_safe(rapl_model->msr_power_unit, &msr_rapl_power_unit_bits))
|
||||
return -1;
|
||||
for (i = 0; i < NR_RAPL_DOMAINS; i++)
|
||||
rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
|
||||
for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++)
|
||||
rapl_pkg_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
|
||||
|
||||
switch (rm->unit_quirk) {
|
||||
rapl_core_hw_unit = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
|
||||
|
||||
switch (rapl_model->unit_quirk) {
|
||||
/*
|
||||
* DRAM domain on HSW server and KNL has fixed energy unit which can be
|
||||
* different than the unit from power unit MSR. See
|
||||
@ -567,17 +631,16 @@ static int rapl_check_hw_unit(struct rapl_model *rm)
|
||||
* of 2. Datasheet, September 2014, Reference Number: 330784-001 "
|
||||
*/
|
||||
case RAPL_UNIT_QUIRK_INTEL_HSW:
|
||||
rapl_hw_unit[PERF_RAPL_RAM] = 16;
|
||||
rapl_pkg_hw_unit[PERF_RAPL_RAM] = 16;
|
||||
break;
|
||||
/* SPR uses a fixed energy unit for Psys domain. */
|
||||
case RAPL_UNIT_QUIRK_INTEL_SPR:
|
||||
rapl_hw_unit[PERF_RAPL_PSYS] = 0;
|
||||
rapl_pkg_hw_unit[PERF_RAPL_PSYS] = 0;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Calculate the timer rate:
|
||||
* Use reference of 200W for scaling the timeout to avoid counter
|
||||
@ -586,9 +649,9 @@ static int rapl_check_hw_unit(struct rapl_model *rm)
|
||||
* if hw unit is 32, then we use 2 ms 1/200/2
|
||||
*/
|
||||
rapl_timer_ms = 2;
|
||||
if (rapl_hw_unit[0] < 32) {
|
||||
if (rapl_pkg_hw_unit[0] < 32) {
|
||||
rapl_timer_ms = (1000 / (2 * 100));
|
||||
rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1));
|
||||
rapl_timer_ms *= (1ULL << (32 - rapl_pkg_hw_unit[0] - 1));
|
||||
}
|
||||
return 0;
|
||||
}
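
Plugging a typical 2^-14 J unit into the formula above shows where the timeout lands: a 32-bit counter wraps after 2^18 J, roughly 1310 s at the 200 W reference, and the hrtimer fires at half that. A small check of the arithmetic (the 2^-14 unit is an assumption):

/* Worked example for the overflow-timer formula above (assumed 2^-14 J unit). */
#include <stdio.h>

int main(void)
{
	unsigned int hw_unit = 14;					/* counter LSB = 2^-14 J */
	unsigned long long full_scale_j = 1ULL << (32 - hw_unit);	/* 262144 J */
	unsigned long long secs_at_200w = full_scale_j / 200;		/* ~1310 s to wrap */

	/* Same value the kernel computes: 5 ms * 2^(32 - hw_unit - 1). */
	unsigned long long timer_ms = (1000 / (2 * 100)) * (1ULL << (32 - hw_unit - 1));

	printf("wrap after ~%llu s at 200 W, poll every %llu ms (~%llu s)\n",
	       secs_at_200w, timer_ms, timer_ms / 1000);
	return 0;
}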
|
||||
@ -596,24 +659,32 @@ static int rapl_check_hw_unit(struct rapl_model *rm)
|
||||
static void __init rapl_advertise(void)
|
||||
{
|
||||
int i;
|
||||
int num_counters = hweight32(rapl_pmus_pkg->cntr_mask);
|
||||
|
||||
if (rapl_pmus_core)
|
||||
num_counters += hweight32(rapl_pmus_core->cntr_mask);
|
||||
|
||||
pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n",
|
||||
hweight32(rapl_cntr_mask), rapl_timer_ms);
|
||||
num_counters, rapl_timer_ms);
|
||||
|
||||
for (i = 0; i < NR_RAPL_DOMAINS; i++) {
|
||||
if (rapl_cntr_mask & (1 << i)) {
|
||||
for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) {
|
||||
if (rapl_pmus_pkg->cntr_mask & (1 << i)) {
|
||||
pr_info("hw unit of domain %s 2^-%d Joules\n",
|
||||
rapl_domain_names[i], rapl_hw_unit[i]);
|
||||
rapl_pkg_domain_names[i], rapl_pkg_hw_unit[i]);
|
||||
}
|
||||
}
|
||||
|
||||
if (rapl_pmus_core && (rapl_pmus_core->cntr_mask & (1 << PERF_RAPL_CORE)))
|
||||
pr_info("hw unit of domain %s 2^-%d Joules\n",
|
||||
rapl_core_domain_name, rapl_core_hw_unit);
|
||||
}
|
||||
|
||||
static void cleanup_rapl_pmus(void)
|
||||
static void cleanup_rapl_pmus(struct rapl_pmus *rapl_pmus)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < rapl_pmus->nr_rapl_pmu; i++)
|
||||
kfree(rapl_pmus->pmus[i]);
|
||||
kfree(rapl_pmus->rapl_pmu[i]);
|
||||
kfree(rapl_pmus);
|
||||
}
|
||||
|
||||
@ -626,46 +697,60 @@ static const struct attribute_group *rapl_attr_update[] = {
|
||||
NULL,
|
||||
};
|
||||
|
||||
static int __init init_rapl_pmu(void)
|
||||
static const struct attribute_group *rapl_core_attr_update[] = {
|
||||
&rapl_events_core_group,
|
||||
NULL,
|
||||
};
|
||||
|
||||
static int __init init_rapl_pmu(struct rapl_pmus *rapl_pmus)
|
||||
{
|
||||
struct rapl_pmu *pmu;
|
||||
struct rapl_pmu *rapl_pmu;
|
||||
int idx;
|
||||
|
||||
for (idx = 0; idx < rapl_pmus->nr_rapl_pmu; idx++) {
|
||||
pmu = kzalloc(sizeof(*pmu), GFP_KERNEL);
|
||||
if (!pmu)
|
||||
rapl_pmu = kzalloc(sizeof(*rapl_pmu), GFP_KERNEL);
|
||||
if (!rapl_pmu)
|
||||
goto free;
|
||||
|
||||
raw_spin_lock_init(&pmu->lock);
|
||||
INIT_LIST_HEAD(&pmu->active_list);
|
||||
pmu->pmu = &rapl_pmus->pmu;
|
||||
pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
|
||||
rapl_hrtimer_init(pmu);
|
||||
raw_spin_lock_init(&rapl_pmu->lock);
|
||||
INIT_LIST_HEAD(&rapl_pmu->active_list);
|
||||
rapl_pmu->pmu = &rapl_pmus->pmu;
|
||||
rapl_pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
|
||||
rapl_hrtimer_init(rapl_pmu);
|
||||
|
||||
rapl_pmus->pmus[idx] = pmu;
|
||||
rapl_pmus->rapl_pmu[idx] = rapl_pmu;
|
||||
}
|
||||
|
||||
return 0;
|
||||
free:
|
||||
for (; idx > 0; idx--)
|
||||
kfree(rapl_pmus->pmus[idx - 1]);
|
||||
kfree(rapl_pmus->rapl_pmu[idx - 1]);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
static int __init init_rapl_pmus(void)
|
||||
static int __init init_rapl_pmus(struct rapl_pmus **rapl_pmus_ptr, int rapl_pmu_scope,
|
||||
const struct attribute_group **rapl_attr_groups,
|
||||
const struct attribute_group **rapl_attr_update)
|
||||
{
|
||||
int nr_rapl_pmu = topology_max_packages();
|
||||
int rapl_pmu_scope = PERF_PMU_SCOPE_PKG;
|
||||
struct rapl_pmus *rapl_pmus;
|
||||
|
||||
if (!rapl_pmu_is_pkg_scope()) {
|
||||
nr_rapl_pmu *= topology_max_dies_per_package();
|
||||
rapl_pmu_scope = PERF_PMU_SCOPE_DIE;
|
||||
}
|
||||
/*
|
||||
* rapl_pmu_scope must be either PKG, DIE or CORE
|
||||
*/
|
||||
if (rapl_pmu_scope == PERF_PMU_SCOPE_DIE)
|
||||
nr_rapl_pmu *= topology_max_dies_per_package();
|
||||
else if (rapl_pmu_scope == PERF_PMU_SCOPE_CORE)
|
||||
nr_rapl_pmu *= topology_num_cores_per_package();
|
||||
else if (rapl_pmu_scope != PERF_PMU_SCOPE_PKG)
|
||||
return -EINVAL;
|
||||
|
||||
rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus, nr_rapl_pmu), GFP_KERNEL);
|
||||
rapl_pmus = kzalloc(struct_size(rapl_pmus, rapl_pmu, nr_rapl_pmu), GFP_KERNEL);
|
||||
if (!rapl_pmus)
|
||||
return -ENOMEM;
|
||||
|
||||
*rapl_pmus_ptr = rapl_pmus;
|
||||
|
||||
rapl_pmus->nr_rapl_pmu = nr_rapl_pmu;
|
||||
rapl_pmus->pmu.attr_groups = rapl_attr_groups;
|
||||
rapl_pmus->pmu.attr_update = rapl_attr_update;
|
||||
@ -680,75 +765,77 @@ static int __init init_rapl_pmus(void)
|
||||
rapl_pmus->pmu.module = THIS_MODULE;
|
||||
rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE;
|
||||
|
||||
return init_rapl_pmu();
|
||||
return init_rapl_pmu(rapl_pmus);
|
||||
}
|
||||
|
||||
static struct rapl_model model_snb = {
|
||||
.events = BIT(PERF_RAPL_PP0) |
|
||||
.pkg_events = BIT(PERF_RAPL_PP0) |
|
||||
BIT(PERF_RAPL_PKG) |
|
||||
BIT(PERF_RAPL_PP1),
|
||||
.msr_power_unit = MSR_RAPL_POWER_UNIT,
|
||||
.rapl_msrs = intel_rapl_msrs,
|
||||
.rapl_pkg_msrs = intel_rapl_msrs,
|
||||
};
|
||||
|
||||
static struct rapl_model model_snbep = {
|
||||
.events = BIT(PERF_RAPL_PP0) |
|
||||
.pkg_events = BIT(PERF_RAPL_PP0) |
|
||||
BIT(PERF_RAPL_PKG) |
|
||||
BIT(PERF_RAPL_RAM),
|
||||
.msr_power_unit = MSR_RAPL_POWER_UNIT,
|
||||
.rapl_msrs = intel_rapl_msrs,
|
||||
.rapl_pkg_msrs = intel_rapl_msrs,
|
||||
};
|
||||
|
||||
static struct rapl_model model_hsw = {
|
||||
.events = BIT(PERF_RAPL_PP0) |
|
||||
.pkg_events = BIT(PERF_RAPL_PP0) |
|
||||
BIT(PERF_RAPL_PKG) |
|
||||
BIT(PERF_RAPL_RAM) |
|
||||
BIT(PERF_RAPL_PP1),
|
||||
.msr_power_unit = MSR_RAPL_POWER_UNIT,
|
||||
.rapl_msrs = intel_rapl_msrs,
|
||||
.rapl_pkg_msrs = intel_rapl_msrs,
|
||||
};
|
||||
|
||||
static struct rapl_model model_hsx = {
|
||||
.events = BIT(PERF_RAPL_PP0) |
|
||||
.pkg_events = BIT(PERF_RAPL_PP0) |
|
||||
BIT(PERF_RAPL_PKG) |
|
||||
BIT(PERF_RAPL_RAM),
|
||||
.unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW,
|
||||
.msr_power_unit = MSR_RAPL_POWER_UNIT,
|
||||
.rapl_msrs = intel_rapl_msrs,
|
||||
.rapl_pkg_msrs = intel_rapl_msrs,
|
||||
};
|
||||
|
||||
static struct rapl_model model_knl = {
|
||||
.events = BIT(PERF_RAPL_PKG) |
|
||||
.pkg_events = BIT(PERF_RAPL_PKG) |
|
||||
BIT(PERF_RAPL_RAM),
|
||||
.unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW,
|
||||
.msr_power_unit = MSR_RAPL_POWER_UNIT,
|
||||
.rapl_msrs = intel_rapl_msrs,
|
||||
.rapl_pkg_msrs = intel_rapl_msrs,
|
||||
};
|
||||
|
||||
static struct rapl_model model_skl = {
|
||||
.events = BIT(PERF_RAPL_PP0) |
|
||||
.pkg_events = BIT(PERF_RAPL_PP0) |
|
||||
BIT(PERF_RAPL_PKG) |
|
||||
BIT(PERF_RAPL_RAM) |
|
||||
BIT(PERF_RAPL_PP1) |
|
||||
BIT(PERF_RAPL_PSYS),
|
||||
.msr_power_unit = MSR_RAPL_POWER_UNIT,
|
||||
.rapl_msrs = intel_rapl_msrs,
|
||||
.rapl_pkg_msrs = intel_rapl_msrs,
|
||||
};
|
||||
|
||||
static struct rapl_model model_spr = {
|
||||
.events = BIT(PERF_RAPL_PP0) |
|
||||
.pkg_events = BIT(PERF_RAPL_PP0) |
|
||||
BIT(PERF_RAPL_PKG) |
|
||||
BIT(PERF_RAPL_RAM) |
|
||||
BIT(PERF_RAPL_PSYS),
|
||||
.unit_quirk = RAPL_UNIT_QUIRK_INTEL_SPR,
|
||||
.msr_power_unit = MSR_RAPL_POWER_UNIT,
|
||||
.rapl_msrs = intel_rapl_spr_msrs,
|
||||
.rapl_pkg_msrs = intel_rapl_spr_msrs,
|
||||
};
|
||||
|
||||
static struct rapl_model model_amd_hygon = {
|
||||
.events = BIT(PERF_RAPL_PKG),
|
||||
.pkg_events = BIT(PERF_RAPL_PKG),
|
||||
.core_events = BIT(PERF_RAPL_CORE),
|
||||
.msr_power_unit = MSR_AMD_RAPL_POWER_UNIT,
|
||||
.rapl_msrs = amd_rapl_msrs,
|
||||
.rapl_pkg_msrs = amd_rapl_pkg_msrs,
|
||||
.rapl_core_msrs = amd_rapl_core_msrs,
|
||||
};
|
||||
|
||||
static const struct x86_cpu_id rapl_model_match[] __initconst = {
|
||||
@ -804,45 +891,73 @@ MODULE_DEVICE_TABLE(x86cpu, rapl_model_match);
|
||||
static int __init rapl_pmu_init(void)
|
||||
{
|
||||
const struct x86_cpu_id *id;
|
||||
struct rapl_model *rm;
|
||||
int rapl_pkg_pmu_scope = PERF_PMU_SCOPE_DIE;
|
||||
int ret;
|
||||
|
||||
if (rapl_pkg_pmu_is_pkg_scope())
|
||||
rapl_pkg_pmu_scope = PERF_PMU_SCOPE_PKG;
|
||||
|
||||
id = x86_match_cpu(rapl_model_match);
|
||||
if (!id)
|
||||
return -ENODEV;
|
||||
|
||||
rm = (struct rapl_model *) id->driver_data;
|
||||
rapl_model = (struct rapl_model *) id->driver_data;
|
||||
|
||||
rapl_msrs = rm->rapl_msrs;
|
||||
|
||||
rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX,
|
||||
false, (void *) &rm->events);
|
||||
|
||||
ret = rapl_check_hw_unit(rm);
|
||||
ret = rapl_check_hw_unit();
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = init_rapl_pmus();
|
||||
ret = init_rapl_pmus(&rapl_pmus_pkg, rapl_pkg_pmu_scope, rapl_attr_groups,
|
||||
rapl_attr_update);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1);
|
||||
rapl_pmus_pkg->cntr_mask = perf_msr_probe(rapl_model->rapl_pkg_msrs,
|
||||
PERF_RAPL_PKG_EVENTS_MAX, false,
|
||||
(void *) &rapl_model->pkg_events);
|
||||
|
||||
ret = perf_pmu_register(&rapl_pmus_pkg->pmu, "power", -1);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
if (rapl_model->core_events) {
|
||||
ret = init_rapl_pmus(&rapl_pmus_core, PERF_PMU_SCOPE_CORE,
|
||||
rapl_core_attr_groups,
|
||||
rapl_core_attr_update);
|
||||
if (ret) {
|
||||
pr_warn("power-core PMU initialization failed (%d)\n", ret);
|
||||
goto core_init_failed;
|
||||
}
|
||||
|
||||
rapl_pmus_core->cntr_mask = perf_msr_probe(rapl_model->rapl_core_msrs,
|
||||
PERF_RAPL_CORE_EVENTS_MAX, false,
|
||||
(void *) &rapl_model->core_events);
|
||||
|
||||
ret = perf_pmu_register(&rapl_pmus_core->pmu, "power_core", -1);
|
||||
if (ret) {
|
||||
pr_warn("power-core PMU registration failed (%d)\n", ret);
|
||||
cleanup_rapl_pmus(rapl_pmus_core);
|
||||
}
|
||||
}
|
||||
|
||||
core_init_failed:
|
||||
rapl_advertise();
|
||||
return 0;
|
||||
|
||||
out:
|
||||
pr_warn("Initialization failed (%d), disabled\n", ret);
|
||||
cleanup_rapl_pmus();
|
||||
cleanup_rapl_pmus(rapl_pmus_pkg);
|
||||
return ret;
|
||||
}
|
||||
module_init(rapl_pmu_init);
|
||||
|
||||
static void __exit intel_rapl_exit(void)
|
||||
{
|
||||
perf_pmu_unregister(&rapl_pmus->pmu);
|
||||
cleanup_rapl_pmus();
|
||||
if (rapl_pmus_core) {
|
||||
perf_pmu_unregister(&rapl_pmus_core->pmu);
|
||||
cleanup_rapl_pmus(rapl_pmus_core);
|
||||
}
|
||||
perf_pmu_unregister(&rapl_pmus_pkg->pmu);
|
||||
cleanup_rapl_pmus(rapl_pmus_pkg);
|
||||
}
|
||||
module_exit(intel_rapl_exit);
|
||||
|
@ -664,7 +664,7 @@ void __init hv_vtom_init(void)
|
||||
x86_platform.guest.enc_status_change_finish = hv_vtom_set_host_visibility;
|
||||
|
||||
/* Set WB as the default cache mode. */
|
||||
mtrr_overwrite_state(NULL, 0, MTRR_TYPE_WRBACK);
|
||||
guest_force_mtrr_state(NULL, 0, MTRR_TYPE_WRBACK);
|
||||
}
|
||||
|
||||
#endif /* defined(CONFIG_AMD_MEM_ENCRYPT) || defined(CONFIG_INTEL_TDX_GUEST) */
|
||||
|
@ -4,6 +4,7 @@
|
||||
|
||||
#include <linux/types.h>
|
||||
#include <linux/stringify.h>
|
||||
#include <linux/objtool.h>
|
||||
#include <asm/asm.h>
|
||||
|
||||
#define ALT_FLAGS_SHIFT 16
|
||||
@ -54,16 +55,6 @@
|
||||
#define LOCK_PREFIX ""
|
||||
#endif
|
||||
|
||||
/*
|
||||
* objtool annotation to ignore the alternatives and only consider the original
|
||||
* instruction(s).
|
||||
*/
|
||||
#define ANNOTATE_IGNORE_ALTERNATIVE \
|
||||
"999:\n\t" \
|
||||
".pushsection .discard.ignore_alts\n\t" \
|
||||
".long 999b\n\t" \
|
||||
".popsection\n\t"
|
||||
|
||||
/*
|
||||
* The patching flags are part of the upper bits of the @ft_flags parameter when
|
||||
* specifying them. The split is currently like this:
|
||||
@ -310,17 +301,6 @@ void nop_func(void);
|
||||
.endm
|
||||
#endif
|
||||
|
||||
/*
|
||||
* objtool annotation to ignore the alternatives and only consider the original
|
||||
* instruction(s).
|
||||
*/
|
||||
.macro ANNOTATE_IGNORE_ALTERNATIVE
|
||||
.Lannotate_\@:
|
||||
.pushsection .discard.ignore_alts
|
||||
.long .Lannotate_\@
|
||||
.popsection
|
||||
.endm
|
||||
|
||||
/*
|
||||
* Issue one struct alt_instr descriptor entry (need to put it into
|
||||
* the section .altinstructions, see below). This entry contains
|
||||
|
@ -4,7 +4,7 @@
|
||||
|
||||
#include <linux/ioport.h>
|
||||
#include <linux/pci.h>
|
||||
#include <linux/refcount.h>
|
||||
#include <asm/amd_node.h>
|
||||
|
||||
struct amd_nb_bus_dev_range {
|
||||
u8 bus;
|
||||
@ -21,49 +21,16 @@ extern int amd_numa_init(void);
|
||||
extern int amd_get_subcaches(int);
|
||||
extern int amd_set_subcaches(int, unsigned long);
|
||||
|
||||
int __must_check amd_smn_read(u16 node, u32 address, u32 *value);
|
||||
int __must_check amd_smn_write(u16 node, u32 address, u32 value);
|
||||
|
||||
struct amd_l3_cache {
|
||||
unsigned indices;
|
||||
u8 subcaches[4];
|
||||
};
|
||||
|
||||
struct threshold_block {
|
||||
unsigned int block; /* Number within bank */
|
||||
unsigned int bank; /* MCA bank the block belongs to */
|
||||
unsigned int cpu; /* CPU which controls MCA bank */
|
||||
u32 address; /* MSR address for the block */
|
||||
u16 interrupt_enable; /* Enable/Disable APIC interrupt */
|
||||
bool interrupt_capable; /* Bank can generate an interrupt. */
|
||||
|
||||
u16 threshold_limit; /*
|
||||
* Value upon which threshold
|
||||
* interrupt is generated.
|
||||
*/
|
||||
|
||||
struct kobject kobj; /* sysfs object */
|
||||
struct list_head miscj; /*
|
||||
* List of threshold blocks
|
||||
* within a bank.
|
||||
*/
|
||||
};
|
||||
|
||||
struct threshold_bank {
|
||||
struct kobject *kobj;
|
||||
struct threshold_block *blocks;
|
||||
|
||||
/* initialized to the number of CPUs on the node sharing this bank */
|
||||
refcount_t cpus;
|
||||
unsigned int shared;
|
||||
};
|
||||
|
||||
struct amd_northbridge {
|
||||
struct pci_dev *root;
|
||||
struct pci_dev *misc;
|
||||
struct pci_dev *link;
|
||||
struct amd_l3_cache l3_cache;
|
||||
struct threshold_bank *bank4;
|
||||
};
|
||||
|
||||
struct amd_northbridge_info {
|
||||
@ -82,23 +49,6 @@ u16 amd_nb_num(void);
|
||||
bool amd_nb_has_feature(unsigned int feature);
|
||||
struct amd_northbridge *node_to_amd_nb(int node);
|
||||
|
||||
static inline u16 amd_pci_dev_to_node_id(struct pci_dev *pdev)
|
||||
{
|
||||
struct pci_dev *misc;
|
||||
int i;
|
||||
|
||||
for (i = 0; i != amd_nb_num(); i++) {
|
||||
misc = node_to_amd_nb(i)->misc;
|
||||
|
||||
if (pci_domain_nr(misc->bus) == pci_domain_nr(pdev->bus) &&
|
||||
PCI_SLOT(misc->devfn) == PCI_SLOT(pdev->devfn))
|
||||
return i;
|
||||
}
|
||||
|
||||
WARN(1, "Unable to find AMD Northbridge id for %s\n", pci_name(pdev));
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline bool amd_gart_present(void)
|
||||
{
|
||||
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
|
||||
|
36
arch/x86/include/asm/amd_node.h
Normal file
@ -0,0 +1,36 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
/*
* AMD Node helper functions and common defines
*
* Copyright (c) 2024, Advanced Micro Devices, Inc.
* All Rights Reserved.
*
* Author: Yazen Ghannam <Yazen.Ghannam@amd.com>
*
* Note:
* Items in this file may only be used in a single place.
* However, it's prudent to keep all AMD Node functionality
* in a unified place rather than spreading throughout the
* kernel.
*/

#ifndef _ASM_X86_AMD_NODE_H_
#define _ASM_X86_AMD_NODE_H_

#include <linux/pci.h>

#define MAX_AMD_NUM_NODES 8
#define AMD_NODE0_PCI_SLOT 0x18

struct pci_dev *amd_node_get_func(u16 node, u8 func);
struct pci_dev *amd_node_get_root(u16 node);

static inline u16 amd_num_nodes(void)
{
return topology_amd_nodes_per_pkg() * topology_max_packages();
}

int __must_check amd_smn_read(u16 node, u32 address, u32 *value);
int __must_check amd_smn_write(u16 node, u32 address, u32 value);

#endif /*_ASM_X86_AMD_NODE_H_*/
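
A hedged usage sketch for the helpers declared in this new header: iterate the nodes reported by amd_num_nodes() and read one SMN register per node. The SMN address 0x0 is a placeholder, not a real register.

/*
 * Usage sketch only (kernel context assumed): walk every node and read one
 * SMN register. The address 0x0 is a placeholder, not a real register.
 */
#include <linux/kernel.h>
#include <asm/amd_node.h>

static void smn_dump_all_nodes(void)
{
	u16 node;
	u32 val;

	for (node = 0; node < amd_num_nodes(); node++) {
		if (amd_smn_read(node, 0x0, &val))
			continue;	/* read failed for this node */
		pr_info("node %u: SMN[0x0] = 0x%08x\n", node, val);
	}
}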
|
@ -92,7 +92,7 @@ do { \
|
||||
do { \
|
||||
__auto_type __flags = BUGFLAG_WARNING|(flags); \
|
||||
instrumentation_begin(); \
|
||||
_BUG_FLAGS(ASM_UD2, __flags, ASM_REACHABLE); \
|
||||
_BUG_FLAGS(ASM_UD2, __flags, ANNOTATE_REACHABLE(1b)); \
|
||||
instrumentation_end(); \
|
||||
} while (0)
|
||||
|
||||
|
@ -56,7 +56,6 @@
|
||||
/* x86_cpu_id::flags */
|
||||
#define X86_CPU_ID_FLAG_ENTRY_VALID BIT(0)
|
||||
|
||||
#define X86_STEPPINGS(mins, maxs) GENMASK(maxs, mins)
|
||||
/**
|
||||
* X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE - Base macro for CPU matching
|
||||
* @_vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY
|
||||
@ -208,6 +207,7 @@
|
||||
VFM_MODEL(vfm), \
|
||||
X86_STEPPING_ANY, X86_FEATURE_ANY, data)
|
||||
|
||||
#define __X86_STEPPINGS(mins, maxs) GENMASK(maxs, mins)
|
||||
/**
|
||||
* X86_MATCH_VFM_STEPPINGS - Match encoded vendor/family/model/stepping
|
||||
* @vfm: Encoded 8-bits each for vendor, family, model
|
||||
@ -218,12 +218,13 @@
|
||||
*
|
||||
* feature is set to wildcard
|
||||
*/
|
||||
#define X86_MATCH_VFM_STEPPINGS(vfm, steppings, data) \
|
||||
X86_MATCH_VENDORID_FAM_MODEL_STEPPINGS_FEATURE( \
|
||||
VFM_VENDOR(vfm), \
|
||||
VFM_FAMILY(vfm), \
|
||||
VFM_MODEL(vfm), \
|
||||
steppings, X86_FEATURE_ANY, data)
|
||||
#define X86_MATCH_VFM_STEPS(vfm, min_step, max_step, data) \
|
||||
X86_MATCH_VENDORID_FAM_MODEL_STEPPINGS_FEATURE( \
|
||||
VFM_VENDOR(vfm), \
|
||||
VFM_FAMILY(vfm), \
|
||||
VFM_MODEL(vfm), \
|
||||
__X86_STEPPINGS(min_step, max_step), \
|
||||
X86_FEATURE_ANY, data)
|
||||
|
||||
/**
|
||||
* X86_MATCH_VFM_FEATURE - Match encoded vendor/family/model/feature
|
||||
@ -242,41 +243,7 @@
|
||||
VFM_MODEL(vfm), \
|
||||
X86_STEPPING_ANY, feature, data)
|
||||
|
||||
/*
|
||||
* Match specific microcode revisions.
|
||||
*
|
||||
* vendor/family/model/stepping must be all set.
|
||||
*
|
||||
* Only checks against the boot CPU. When mixed-stepping configs are
|
||||
* valid for a CPU model, add a quirk for every valid stepping and
|
||||
* do the fine-tuning in the quirk handler.
|
||||
*/
|
||||
|
||||
struct x86_cpu_desc {
|
||||
u8 x86_family;
|
||||
u8 x86_vendor;
|
||||
u8 x86_model;
|
||||
u8 x86_stepping;
|
||||
u32 x86_microcode_rev;
|
||||
};
|
||||
|
||||
#define INTEL_CPU_DESC(vfm, stepping, revision) { \
|
||||
.x86_family = VFM_FAMILY(vfm), \
|
||||
.x86_vendor = VFM_VENDOR(vfm), \
|
||||
.x86_model = VFM_MODEL(vfm), \
|
||||
.x86_stepping = (stepping), \
|
||||
.x86_microcode_rev = (revision), \
|
||||
}
|
||||
|
||||
#define AMD_CPU_DESC(fam, model, stepping, revision) { \
|
||||
.x86_family = (fam), \
|
||||
.x86_vendor = X86_VENDOR_AMD, \
|
||||
.x86_model = (model), \
|
||||
.x86_stepping = (stepping), \
|
||||
.x86_microcode_rev = (revision), \
|
||||
}
|
||||
|
||||
extern const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match);
|
||||
extern bool x86_cpu_has_min_microcode_rev(const struct x86_cpu_desc *table);
|
||||
extern bool x86_match_min_microcode_rev(const struct x86_cpu_id *table);
|
||||
|
||||
#endif /* _ASM_X86_CPU_DEVICE_ID */
|
||||
|
@ -132,11 +132,12 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
|
||||
x86_this_cpu_test_bit(bit, cpu_info.x86_capability))
|
||||
|
||||
/*
|
||||
* This macro is for detection of features which need kernel
|
||||
* infrastructure to be used. It may *not* directly test the CPU
|
||||
* itself. Use the cpu_has() family if you want true runtime
|
||||
* testing of CPU features, like in hypervisor code where you are
|
||||
* supporting a possible guest feature where host support for it
|
||||
* This is the default CPU features testing macro to use in code.
|
||||
*
|
||||
* It is for detection of features which need kernel infrastructure to be
|
||||
* used. It may *not* directly test the CPU itself. Use the cpu_has() family
|
||||
* if you want true runtime testing of CPU features, like in hypervisor code
|
||||
* where you are supporting a possible guest feature where host support for it
|
||||
* is not relevant.
|
||||
*/
|
||||
#define cpu_feature_enabled(bit) \
|
||||
@ -161,13 +162,6 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
|
||||
#define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit)
|
||||
|
||||
/*
|
||||
* Static testing of CPU features. Used the same as boot_cpu_has(). It
|
||||
* statically patches the target code for additional performance. Use
|
||||
* static_cpu_has() only in fast paths, where every cycle counts. Which
|
||||
* means that the boot_cpu_has() variant is already fast enough for the
|
||||
* majority of cases and you should stick to using it as it is generally
|
||||
* only two instructions: a RIP-relative MOV and a TEST.
|
||||
*
|
||||
* Do not use an "m" constraint for [cap_byte] here: gcc doesn't know
|
||||
* that this is only used on a fallback path and will sometimes cause
|
||||
* it to manifest the address of boot_cpu_data in a register, fouling
|
||||
|
@ -83,8 +83,8 @@
|
||||
#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* "centaur_mcr" Centaur MCRs (= MTRRs) */
|
||||
#define X86_FEATURE_K8 ( 3*32+ 4) /* Opteron, Athlon64 */
|
||||
#define X86_FEATURE_ZEN5 ( 3*32+ 5) /* CPU based on Zen5 microarchitecture */
|
||||
#define X86_FEATURE_P3 ( 3*32+ 6) /* P3 */
|
||||
#define X86_FEATURE_P4 ( 3*32+ 7) /* P4 */
|
||||
/* Free ( 3*32+ 6) */
|
||||
/* Free ( 3*32+ 7) */
|
||||
#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* "constant_tsc" TSC ticks at a constant rate */
|
||||
#define X86_FEATURE_UP ( 3*32+ 9) /* "up" SMP kernel running on UP */
|
||||
#define X86_FEATURE_ART ( 3*32+10) /* "art" Always running timer (ART) */
|
||||
@ -443,14 +443,16 @@
|
||||
#define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* Speculative Store Bypass Disable */
|
||||
|
||||
/* AMD-defined memory encryption features, CPUID level 0x8000001f (EAX), word 19 */
|
||||
#define X86_FEATURE_SME (19*32+ 0) /* "sme" AMD Secure Memory Encryption */
|
||||
#define X86_FEATURE_SEV (19*32+ 1) /* "sev" AMD Secure Encrypted Virtualization */
|
||||
#define X86_FEATURE_SME (19*32+ 0) /* "sme" Secure Memory Encryption */
|
||||
#define X86_FEATURE_SEV (19*32+ 1) /* "sev" Secure Encrypted Virtualization */
|
||||
#define X86_FEATURE_VM_PAGE_FLUSH (19*32+ 2) /* VM Page Flush MSR is supported */
|
||||
#define X86_FEATURE_SEV_ES (19*32+ 3) /* "sev_es" AMD Secure Encrypted Virtualization - Encrypted State */
|
||||
#define X86_FEATURE_SEV_SNP (19*32+ 4) /* "sev_snp" AMD Secure Encrypted Virtualization - Secure Nested Paging */
|
||||
#define X86_FEATURE_SEV_ES (19*32+ 3) /* "sev_es" Secure Encrypted Virtualization - Encrypted State */
|
||||
#define X86_FEATURE_SEV_SNP (19*32+ 4) /* "sev_snp" Secure Encrypted Virtualization - Secure Nested Paging */
|
||||
#define X86_FEATURE_V_TSC_AUX (19*32+ 9) /* Virtual TSC_AUX */
|
||||
#define X86_FEATURE_SME_COHERENT (19*32+10) /* AMD hardware-enforced cache coherency */
|
||||
#define X86_FEATURE_DEBUG_SWAP (19*32+14) /* "debug_swap" AMD SEV-ES full debug state swap support */
|
||||
#define X86_FEATURE_SME_COHERENT (19*32+10) /* hardware-enforced cache coherency */
|
||||
#define X86_FEATURE_DEBUG_SWAP (19*32+14) /* "debug_swap" SEV-ES full debug state swap support */
|
||||
#define X86_FEATURE_RMPREAD (19*32+21) /* RMPREAD instruction */
|
||||
#define X86_FEATURE_SEGMENTED_RMP (19*32+23) /* Segmented RMP support */
|
||||
#define X86_FEATURE_SVSM (19*32+28) /* "svsm" SVSM present */
|
||||
#define X86_FEATURE_HV_INUSE_WR_ALLOWED (19*32+30) /* Allow Write to in-use hypervisor-owned pages */
|
||||
|
||||
@ -465,6 +467,7 @@
|
||||
#define X86_FEATURE_SBPB (20*32+27) /* Selective Branch Prediction Barrier */
|
||||
#define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* MSR_PRED_CMD[IBPB] flushes all branch type predictions */
|
||||
#define X86_FEATURE_SRSO_NO (20*32+29) /* CPU is not affected by SRSO */
|
||||
#define X86_FEATURE_SRSO_USER_KERNEL_NO (20*32+30) /* CPU is not affected by SRSO across user/kernel boundaries */
|
||||
|
||||
/*
|
||||
* Extended auxiliary flags: Linux defined - for features scattered in various
|
||||
|
@ -21,6 +21,13 @@ enum cpuid_regs_idx {
|
||||
CPUID_EDX,
|
||||
};
|
||||
|
||||
#define CPUID_LEAF_MWAIT 0x5
|
||||
#define CPUID_LEAF_DCA 0x9
|
||||
#define CPUID_LEAF_XSTATE 0x0d
|
||||
#define CPUID_LEAF_TSC 0x15
|
||||
#define CPUID_LEAF_FREQ 0x16
|
||||
#define CPUID_LEAF_TILE 0x1d
|
||||
|
||||
#ifdef CONFIG_X86_32
|
||||
bool have_cpuid_p(void);
|
||||
#else
|
||||
|
@ -12,10 +12,6 @@
|
||||
/* Bit 63 of XCR0 is reserved for future expansion */
|
||||
#define XFEATURE_MASK_EXTEND (~(XFEATURE_MASK_FPSSE | (1ULL << 63)))
|
||||
|
||||
#define XSTATE_CPUID 0x0000000d
|
||||
|
||||
#define TILE_CPUID 0x0000001d
|
||||
|
||||
#define FXSAVE_SIZE 512
|
||||
|
||||
#define XSAVE_HDR_SIZE 64
|
||||
|
@ -2,7 +2,7 @@
|
||||
#ifndef _ASM_X86_INIT_H
|
||||
#define _ASM_X86_INIT_H
|
||||
|
||||
#define __head __section(".head.text")
|
||||
#define __head __section(".head.text") __no_sanitize_undefined
|
||||
|
||||
struct x86_mapping_info {
|
||||
void *(*alloc_pgt_page)(void *); /* allocate buf for page table */
|
||||
|
@ -100,8 +100,8 @@
|
||||
}
|
||||
|
||||
#define ASM_CALL_ARG0 \
|
||||
"call %c[__func] \n" \
|
||||
ASM_REACHABLE
|
||||
"1: call %c[__func] \n" \
|
||||
ANNOTATE_REACHABLE(1b)
|
||||
|
||||
#define ASM_CALL_ARG1 \
|
||||
"movq %[arg1], %%rdi \n" \
|
||||
|
@ -8,14 +8,9 @@
|
||||
# define PA_PGD 2
|
||||
# define PA_SWAP_PAGE 3
|
||||
# define PAGES_NR 4
|
||||
#else
|
||||
# define PA_CONTROL_PAGE 0
|
||||
# define VA_CONTROL_PAGE 1
|
||||
# define PA_TABLE_PAGE 2
|
||||
# define PA_SWAP_PAGE 3
|
||||
# define PAGES_NR 4
|
||||
#endif
|
||||
|
||||
# define KEXEC_CONTROL_PAGE_SIZE 4096
|
||||
# define KEXEC_CONTROL_CODE_MAX_SIZE 2048
|
||||
|
||||
#ifndef __ASSEMBLY__
|
||||
@ -43,7 +38,6 @@ struct kimage;
|
||||
/* Maximum address we can use for the control code buffer */
|
||||
# define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE
|
||||
|
||||
# define KEXEC_CONTROL_PAGE_SIZE 4096
|
||||
|
||||
/* The native architecture */
|
||||
# define KEXEC_ARCH KEXEC_ARCH_386
|
||||
@ -58,11 +52,12 @@ struct kimage;
|
||||
/* Maximum address we can use for the control pages */
|
||||
# define KEXEC_CONTROL_MEMORY_LIMIT (MAXMEM-1)
|
||||
|
||||
/* Allocate one page for the pdp and the second for the code */
|
||||
# define KEXEC_CONTROL_PAGE_SIZE (4096UL + 4096UL)
|
||||
|
||||
/* The native architecture */
|
||||
# define KEXEC_ARCH KEXEC_ARCH_X86_64
|
||||
|
||||
extern unsigned long kexec_va_control_page;
|
||||
extern unsigned long kexec_pa_table_page;
|
||||
extern unsigned long kexec_pa_swap_page;
|
||||
#endif
|
||||
|
||||
/*
|
||||
@ -125,7 +120,7 @@ relocate_kernel(unsigned long indirection_page,
|
||||
#else
|
||||
unsigned long
|
||||
relocate_kernel(unsigned long indirection_page,
|
||||
unsigned long page_list,
|
||||
unsigned long pa_control_page,
|
||||
unsigned long start_address,
|
||||
unsigned int preserve_context,
|
||||
unsigned int host_mem_enc_active);
|
||||
@ -145,6 +140,19 @@ struct kimage_arch {
};
#else
struct kimage_arch {
	/*
	 * This is a kimage control page, as it must not overlap with either
	 * source or destination address ranges.
	 */
	pgd_t *pgd;
	/*
	 * The virtual mapping of the control code page itself is used only
	 * during the transition, while the current kernel's pages are all
	 * in place. Thus the intermediate page table pages used to map it
	 * are not control pages, but instead just normal pages obtained
	 * with get_zeroed_page(), and they have to be tracked (below) so
	 * that they can be freed.
	 */
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;

@ -276,7 +276,7 @@ static inline void cmci_rediscover(void) {}
|
||||
static inline void cmci_recheck(void) {}
|
||||
#endif
|
||||
|
||||
int mce_available(struct cpuinfo_x86 *c);
|
||||
bool mce_available(struct cpuinfo_x86 *c);
|
||||
bool mce_is_memory_error(struct mce *m);
|
||||
bool mce_is_correctable(struct mce *m);
|
||||
bool mce_usable_address(struct mce *m);
|
||||
@ -296,7 +296,7 @@ enum mcp_flags {
|
||||
|
||||
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
|
||||
|
||||
int mce_notify_irq(void);
|
||||
bool mce_notify_irq(void);
|
||||
|
||||
DECLARE_PER_CPU(struct mce, injectm);
|
||||
|
||||
@ -386,8 +386,6 @@ static inline bool amd_mce_is_memory_error(struct mce *m) { return false; };
|
||||
static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
|
||||
#endif
|
||||
|
||||
static inline void mce_hygon_feature_init(struct cpuinfo_x86 *c) { return mce_amd_feature_init(c); }
|
||||
|
||||
unsigned long copy_mc_fragile_handle_tail(char *to, char *from, unsigned len);
|
||||
|
||||
#endif /* _ASM_X86_MCE_H */
|
||||
|
@ -37,6 +37,8 @@ typedef struct {
|
||||
*/
|
||||
atomic64_t tlb_gen;
|
||||
|
||||
unsigned long next_trim_cpumask;
|
||||
|
||||
#ifdef CONFIG_MODIFY_LDT_SYSCALL
|
||||
struct rw_semaphore ldt_usr_sem;
|
||||
struct ldt_struct *ldt;
|
||||
|
@ -151,6 +151,7 @@ static inline int init_new_context(struct task_struct *tsk,
|
||||
|
||||
mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id);
|
||||
atomic64_set(&mm->context.tlb_gen, 0);
|
||||
mm->context.next_trim_cpumask = jiffies + HZ;
|
||||
|
||||
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
|
||||
if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
|
||||
|
@ -608,6 +608,7 @@
|
||||
#define MSR_AMD_PERF_CTL 0xc0010062
|
||||
#define MSR_AMD_PERF_STATUS 0xc0010063
|
||||
#define MSR_AMD_PSTATE_DEF_BASE 0xc0010064
|
||||
#define MSR_AMD64_GUEST_TSC_FREQ 0xc0010134
|
||||
#define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140
|
||||
#define MSR_AMD64_OSVW_STATUS 0xc0010141
|
||||
#define MSR_AMD_PPIN_CTL 0xc00102f0
|
||||
@ -644,6 +645,7 @@
|
||||
#define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */
|
||||
#define MSR_AMD64_SVM_AVIC_DOORBELL 0xc001011b
|
||||
#define MSR_AMD64_VM_PAGE_FLUSH 0xc001011e
|
||||
#define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f
|
||||
#define MSR_AMD64_SEV_ES_GHCB 0xc0010130
|
||||
#define MSR_AMD64_SEV 0xc0010131
|
||||
#define MSR_AMD64_SEV_ENABLED_BIT 0
|
||||
@ -682,11 +684,12 @@
|
||||
#define MSR_AMD64_SNP_SMT_PROT BIT_ULL(MSR_AMD64_SNP_SMT_PROT_BIT)
|
||||
#define MSR_AMD64_SNP_RESV_BIT 18
|
||||
#define MSR_AMD64_SNP_RESERVED_MASK GENMASK_ULL(63, MSR_AMD64_SNP_RESV_BIT)
|
||||
|
||||
#define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f
|
||||
|
||||
#define MSR_AMD64_RMP_BASE 0xc0010132
|
||||
#define MSR_AMD64_RMP_END 0xc0010133
|
||||
#define MSR_AMD64_RMP_CFG 0xc0010136
|
||||
#define MSR_AMD64_SEG_RMP_ENABLED_BIT 0
|
||||
#define MSR_AMD64_SEG_RMP_ENABLED BIT_ULL(MSR_AMD64_SEG_RMP_ENABLED_BIT)
|
||||
#define MSR_AMD64_RMP_SEGMENT_SHIFT(x) (((x) & GENMASK_ULL(13, 8)) >> 8)
|
||||
|
||||
#define MSR_SVSM_CAA 0xc001f000
|
||||
|
||||
|
@ -58,8 +58,8 @@ struct mtrr_state_type {
|
||||
*/
|
||||
# ifdef CONFIG_MTRR
|
||||
void mtrr_bp_init(void);
|
||||
void mtrr_overwrite_state(struct mtrr_var_range *var, unsigned int num_var,
|
||||
mtrr_type def_type);
|
||||
void guest_force_mtrr_state(struct mtrr_var_range *var, unsigned int num_var,
|
||||
mtrr_type def_type);
|
||||
extern u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform);
|
||||
extern void mtrr_save_fixed_ranges(void *);
|
||||
extern void mtrr_save_state(void);
|
||||
@ -75,9 +75,9 @@ void mtrr_disable(void);
|
||||
void mtrr_enable(void);
|
||||
void mtrr_generic_set_state(void);
|
||||
# else
|
||||
static inline void mtrr_overwrite_state(struct mtrr_var_range *var,
|
||||
unsigned int num_var,
|
||||
mtrr_type def_type)
|
||||
static inline void guest_force_mtrr_state(struct mtrr_var_range *var,
|
||||
unsigned int num_var,
|
||||
mtrr_type def_type)
|
||||
{
|
||||
}
|
||||
|
||||
|
@ -15,7 +15,6 @@
|
||||
#define MWAIT_HINT2SUBSTATE(hint) ((hint) & MWAIT_CSTATE_MASK)
|
||||
#define MWAIT_C1_SUBSTATE_MASK 0xf0
|
||||
|
||||
#define CPUID_MWAIT_LEAF 5
|
||||
#define CPUID5_ECX_EXTENSIONS_SUPPORTED 0x1
|
||||
#define CPUID5_ECX_INTERRUPT_BREAK 0x2
|
||||
|
||||
|
@ -179,18 +179,6 @@
|
||||
|
||||
#ifdef __ASSEMBLY__
|
||||
|
||||
/*
|
||||
* This should be used immediately before an indirect jump/call. It tells
|
||||
* objtool the subsequent indirect jump/call is vouched safe for retpoline
|
||||
* builds.
|
||||
*/
|
||||
.macro ANNOTATE_RETPOLINE_SAFE
|
||||
.Lhere_\@:
|
||||
.pushsection .discard.retpoline_safe
|
||||
.long .Lhere_\@
|
||||
.popsection
|
||||
.endm
|
||||
|
||||
/*
|
||||
* (ab)use RETPOLINE_SAFE on RET to annotate away 'bare' RET instructions
|
||||
* vs RETBleed validation.
|
||||
@ -350,12 +338,6 @@
|
||||
|
||||
#else /* __ASSEMBLY__ */
|
||||
|
||||
#define ANNOTATE_RETPOLINE_SAFE \
|
||||
"999:\n\t" \
|
||||
".pushsection .discard.retpoline_safe\n\t" \
|
||||
".long 999b\n\t" \
|
||||
".popsection\n\t"
|
||||
|
||||
typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE];
|
||||
extern retpoline_thunk_t __x86_indirect_thunk_array[];
|
||||
extern retpoline_thunk_t __x86_indirect_call_thunk_array[];
|
||||
|
@ -180,13 +180,6 @@ static inline void halt(void)
|
||||
PVOP_VCALL0(irq.halt);
|
||||
}
|
||||
|
||||
extern noinstr void pv_native_wbinvd(void);
|
||||
|
||||
static __always_inline void wbinvd(void)
|
||||
{
|
||||
PVOP_ALT_VCALL0(cpu.wbinvd, "wbinvd", ALT_NOT_XEN);
|
||||
}
|
||||
|
||||
static inline u64 paravirt_read_msr(unsigned msr)
|
||||
{
|
||||
return PVOP_CALL1(u64, cpu.read_msr, msr);
|
||||
|
@ -86,8 +86,6 @@ struct pv_cpu_ops {
|
||||
void (*update_io_bitmap)(void);
|
||||
#endif
|
||||
|
||||
void (*wbinvd)(void);
|
||||
|
||||
/* cpuid emulation, mostly so that caps bits can be disabled */
|
||||
void (*cpuid)(unsigned int *eax, unsigned int *ebx,
|
||||
unsigned int *ecx, unsigned int *edx);
|
||||
|
@ -41,6 +41,7 @@
|
||||
#define INTEL_FIXED_0_USER (1ULL << 1)
|
||||
#define INTEL_FIXED_0_ANYTHREAD (1ULL << 2)
|
||||
#define INTEL_FIXED_0_ENABLE_PMI (1ULL << 3)
|
||||
#define INTEL_FIXED_3_METRICS_CLEAR (1ULL << 2)
|
||||
|
||||
#define HSW_IN_TX (1ULL << 32)
|
||||
#define HSW_IN_TX_CHECKPOINTED (1ULL << 33)
|
||||
@ -372,6 +373,9 @@ static inline bool use_fixed_pseudo_encoding(u64 code)
|
||||
#define INTEL_TD_METRIC_MAX INTEL_TD_METRIC_MEM_BOUND
|
||||
#define INTEL_TD_METRIC_NUM 8
|
||||
|
||||
#define INTEL_TD_CFG_METRIC_CLEAR_BIT 0
|
||||
#define INTEL_TD_CFG_METRIC_CLEAR BIT_ULL(INTEL_TD_CFG_METRIC_CLEAR_BIT)
|
||||
|
||||
static inline bool is_metric_idx(int idx)
|
||||
{
|
||||
return (unsigned)(idx - INTEL_PMC_IDX_METRIC_BASE) < INTEL_TD_METRIC_NUM;
|
||||
@ -422,7 +426,9 @@ static inline bool is_topdown_idx(int idx)
|
||||
*/
|
||||
|
||||
struct pebs_basic {
	u64 format_size;
	u64 format_group:32,
	    retire_latency:16,
	    format_size:16;
	u64 ip;
	u64 applicable_counters;
	u64 tsc;
@ -431,7 +437,17 @@ struct pebs_basic {
struct pebs_meminfo {
	u64 address;
	u64 aux;
	u64 latency;
	union {
		/* pre Alder Lake */
		u64 mem_latency;
		/* Alder Lake and later */
		struct {
			u64 instr_latency:16;
			u64 pad2:16;
			u64 cache_latency:16;
			u64 pad3:16;
		};
	};
	u64 tsx_tuning;
};

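The union above keeps the pre Alder Lake and Alder Lake-and-later load-latency layouts in one field. As a minimal, hedged sketch of how a consumer could pick the right member (the split_layout flag is an assumed stand-in for the perf driver's own model/feature check and is not defined by this patch):

/* Hypothetical helper: return the PEBS memory latency for either layout. */
static u64 example_pebs_load_latency(const struct pebs_meminfo *mi, bool split_layout)
{
	/* Alder Lake and later report cache latency in a 16-bit subfield. */
	return split_layout ? mi->cache_latency : mi->mem_latency;
}
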
@ -98,6 +98,7 @@ struct cpuinfo_topology {
|
||||
// Logical ID mappings
|
||||
u32 logical_pkg_id;
|
||||
u32 logical_die_id;
|
||||
u32 logical_core_id;
|
||||
|
||||
// AMD Node ID and Nodes per Package info
|
||||
u32 amd_node_id;
|
||||
|
@ -5,6 +5,7 @@
|
||||
#include <asm-generic/sections.h>
|
||||
#include <asm/extable.h>
|
||||
|
||||
extern char __relocate_kernel_start[], __relocate_kernel_end[];
|
||||
extern char __brk_base[], __brk_limit[];
|
||||
extern char __end_rodata_aligned[];
|
||||
|
||||
|
@ -49,7 +49,7 @@ extern unsigned long saved_video_mode;
|
||||
|
||||
extern void reserve_standard_io_resources(void);
|
||||
extern void i386_reserve_resources(void);
|
||||
extern unsigned long __startup_64(unsigned long physaddr, struct boot_params *bp);
|
||||
extern unsigned long __startup_64(unsigned long p2v_offset, struct boot_params *bp);
|
||||
extern void startup_64_setup_gdt_idt(void);
|
||||
extern void early_setup_idt(void);
|
||||
extern void __init do_early_exception(struct pt_regs *regs, int trapnr);
|
||||
|
@ -206,6 +206,8 @@ struct snp_psc_desc {
|
||||
#define GHCB_TERM_NO_SVSM 7 /* SVSM is not advertised in the secrets page */
|
||||
#define GHCB_TERM_SVSM_VMPL0 8 /* SVSM is present but has set VMPL to 0 */
|
||||
#define GHCB_TERM_SVSM_CAA 9 /* SVSM is present but CAA is not page aligned */
|
||||
#define GHCB_TERM_SVSM_CA_REMAP_FAIL 10 /* SVSM is present but CA could not be remapped */
|
||||
#define GHCB_TERM_SECURE_TSC 11 /* Secure TSC initialization failed */
|
||||
|
||||
#define GHCB_RESP_CODE(v) ((v) & GHCB_MSR_INFO_MASK)
|
||||
|
||||
|
@ -14,6 +14,7 @@
|
||||
#include <asm/insn.h>
|
||||
#include <asm/sev-common.h>
|
||||
#include <asm/coco.h>
|
||||
#include <asm/set_memory.h>
|
||||
|
||||
#define GHCB_PROTOCOL_MIN 1ULL
|
||||
#define GHCB_PROTOCOL_MAX 2ULL
|
||||
@ -124,6 +125,9 @@ struct snp_req_data {
|
||||
#define AAD_LEN 48
|
||||
#define MSG_HDR_VER 1
|
||||
|
||||
#define SNP_REQ_MAX_RETRY_DURATION (60*HZ)
|
||||
#define SNP_REQ_RETRY_DELAY (2*HZ)
|
||||
|
||||
/* See SNP spec SNP_GUEST_REQUEST section for the structure */
|
||||
enum msg_type {
|
||||
SNP_MSG_TYPE_INVALID = 0,
|
||||
@ -142,6 +146,9 @@ enum msg_type {
|
||||
SNP_MSG_VMRK_REQ,
|
||||
SNP_MSG_VMRK_RSP,
|
||||
|
||||
SNP_MSG_TSC_INFO_REQ = 17,
|
||||
SNP_MSG_TSC_INFO_RSP,
|
||||
|
||||
SNP_MSG_TYPE_MAX
|
||||
};
|
||||
|
||||
@ -170,9 +177,20 @@ struct snp_guest_msg {
|
||||
u8 payload[PAGE_SIZE - sizeof(struct snp_guest_msg_hdr)];
|
||||
} __packed;
|
||||
|
||||
struct sev_guest_platform_data {
|
||||
u64 secrets_gpa;
|
||||
};
|
||||
#define SNP_TSC_INFO_REQ_SZ 128
|
||||
|
||||
struct snp_tsc_info_req {
|
||||
u8 rsvd[SNP_TSC_INFO_REQ_SZ];
|
||||
} __packed;
|
||||
|
||||
struct snp_tsc_info_resp {
|
||||
u32 status;
|
||||
u32 rsvd1;
|
||||
u64 tsc_scale;
|
||||
u64 tsc_offset;
|
||||
u32 tsc_factor;
|
||||
u8 rsvd2[100];
|
||||
} __packed;
|
||||
|
||||
struct snp_guest_req {
|
||||
void *req_buf;
|
||||
@ -253,6 +271,7 @@ struct snp_msg_desc {
|
||||
|
||||
u32 *os_area_msg_seqno;
|
||||
u8 *vmpck;
|
||||
int vmpck_id;
|
||||
};
|
||||
|
||||
/*
|
||||
@ -445,8 +464,6 @@ void snp_set_wakeup_secondary_cpu(void);
|
||||
bool snp_init(struct boot_params *bp);
|
||||
void __noreturn snp_abort(void);
|
||||
void snp_dmi_setup(void);
|
||||
int snp_issue_guest_request(struct snp_guest_req *req, struct snp_req_data *input,
|
||||
struct snp_guest_request_ioctl *rio);
|
||||
int snp_issue_svsm_attest_req(u64 call_id, struct svsm_call *call, struct svsm_attest_call *input);
|
||||
void snp_accept_memory(phys_addr_t start, phys_addr_t end);
|
||||
u64 snp_get_unsupported_features(u64 status);
|
||||
@ -458,6 +475,15 @@ void set_pte_enc_mask(pte_t *kpte, unsigned long pfn, pgprot_t new_prot);
|
||||
void snp_kexec_finish(void);
|
||||
void snp_kexec_begin(void);
|
||||
|
||||
int snp_msg_init(struct snp_msg_desc *mdesc, int vmpck_id);
|
||||
struct snp_msg_desc *snp_msg_alloc(void);
|
||||
void snp_msg_free(struct snp_msg_desc *mdesc);
|
||||
int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req,
|
||||
struct snp_guest_request_ioctl *rio);
|
||||
|
||||
void __init snp_secure_tsc_prepare(void);
|
||||
void __init snp_secure_tsc_init(void);
|
||||
|
||||
#else /* !CONFIG_AMD_MEM_ENCRYPT */
|
||||
|
||||
#define snp_vmpl 0
|
||||
@ -480,11 +506,6 @@ static inline void snp_set_wakeup_secondary_cpu(void) { }
|
||||
static inline bool snp_init(struct boot_params *bp) { return false; }
|
||||
static inline void snp_abort(void) { }
|
||||
static inline void snp_dmi_setup(void) { }
|
||||
static inline int snp_issue_guest_request(struct snp_guest_req *req, struct snp_req_data *input,
|
||||
struct snp_guest_request_ioctl *rio)
|
||||
{
|
||||
return -ENOTTY;
|
||||
}
|
||||
static inline int snp_issue_svsm_attest_req(u64 call_id, struct svsm_call *call, struct svsm_attest_call *input)
|
||||
{
|
||||
return -ENOTTY;
|
||||
@ -498,6 +519,13 @@ static inline int prepare_pte_enc(struct pte_enc_desc *d) { return 0; }
|
||||
static inline void set_pte_enc_mask(pte_t *kpte, unsigned long pfn, pgprot_t new_prot) { }
|
||||
static inline void snp_kexec_finish(void) { }
|
||||
static inline void snp_kexec_begin(void) { }
|
||||
static inline int snp_msg_init(struct snp_msg_desc *mdesc, int vmpck_id) { return -1; }
|
||||
static inline struct snp_msg_desc *snp_msg_alloc(void) { return NULL; }
|
||||
static inline void snp_msg_free(struct snp_msg_desc *mdesc) { }
|
||||
static inline int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req,
|
||||
struct snp_guest_request_ioctl *rio) { return -ENODEV; }
|
||||
static inline void __init snp_secure_tsc_prepare(void) { }
|
||||
static inline void __init snp_secure_tsc_init(void) { }
|
||||
|
||||
#endif /* CONFIG_AMD_MEM_ENCRYPT */
|
||||
|
||||
|
@ -19,6 +19,32 @@
|
||||
#define TDG_VM_RD 7
|
||||
#define TDG_VM_WR 8
|
||||
|
||||
/* TDX attributes */
|
||||
#define TDX_ATTR_DEBUG_BIT 0
|
||||
#define TDX_ATTR_DEBUG BIT_ULL(TDX_ATTR_DEBUG_BIT)
|
||||
#define TDX_ATTR_HGS_PLUS_PROF_BIT 4
|
||||
#define TDX_ATTR_HGS_PLUS_PROF BIT_ULL(TDX_ATTR_HGS_PLUS_PROF_BIT)
|
||||
#define TDX_ATTR_PERF_PROF_BIT 5
|
||||
#define TDX_ATTR_PERF_PROF BIT_ULL(TDX_ATTR_PERF_PROF_BIT)
|
||||
#define TDX_ATTR_PMT_PROF_BIT 6
|
||||
#define TDX_ATTR_PMT_PROF BIT_ULL(TDX_ATTR_PMT_PROF_BIT)
|
||||
#define TDX_ATTR_ICSSD_BIT 16
|
||||
#define TDX_ATTR_ICSSD BIT_ULL(TDX_ATTR_ICSSD_BIT)
|
||||
#define TDX_ATTR_LASS_BIT 27
|
||||
#define TDX_ATTR_LASS BIT_ULL(TDX_ATTR_LASS_BIT)
|
||||
#define TDX_ATTR_SEPT_VE_DISABLE_BIT 28
|
||||
#define TDX_ATTR_SEPT_VE_DISABLE BIT_ULL(TDX_ATTR_SEPT_VE_DISABLE_BIT)
|
||||
#define TDX_ATTR_MIGRTABLE_BIT 29
|
||||
#define TDX_ATTR_MIGRTABLE BIT_ULL(TDX_ATTR_MIGRTABLE_BIT)
|
||||
#define TDX_ATTR_PKS_BIT 30
|
||||
#define TDX_ATTR_PKS BIT_ULL(TDX_ATTR_PKS_BIT)
|
||||
#define TDX_ATTR_KL_BIT 31
|
||||
#define TDX_ATTR_KL BIT_ULL(TDX_ATTR_KL_BIT)
|
||||
#define TDX_ATTR_TPA_BIT 62
|
||||
#define TDX_ATTR_TPA BIT_ULL(TDX_ATTR_TPA_BIT)
|
||||
#define TDX_ATTR_PERFMON_BIT 63
|
||||
#define TDX_ATTR_PERFMON BIT_ULL(TDX_ATTR_PERFMON_BIT)
|
||||
|
||||
/* TDX TD-Scope Metadata. To be used by TDG.VM.WR and TDG.VM.RD */
|
||||
#define TDCS_CONFIG_FLAGS 0x1110000300000016
|
||||
#define TDCS_TD_CTLS 0x1110000300000017
|
||||
@ -29,8 +55,16 @@
|
||||
#define TDCS_CONFIG_FLEXIBLE_PENDING_VE BIT_ULL(1)
|
||||
|
||||
/* TDCS_TD_CTLS bits */
#define TD_CTLS_PENDING_VE_DISABLE	BIT_ULL(0)
#define TD_CTLS_ENUM_TOPOLOGY		BIT_ULL(1)
#define TD_CTLS_PENDING_VE_DISABLE_BIT	0
#define TD_CTLS_PENDING_VE_DISABLE	BIT_ULL(TD_CTLS_PENDING_VE_DISABLE_BIT)
#define TD_CTLS_ENUM_TOPOLOGY_BIT	1
#define TD_CTLS_ENUM_TOPOLOGY		BIT_ULL(TD_CTLS_ENUM_TOPOLOGY_BIT)
#define TD_CTLS_VIRT_CPUID2_BIT		2
#define TD_CTLS_VIRT_CPUID2		BIT_ULL(TD_CTLS_VIRT_CPUID2_BIT)
#define TD_CTLS_REDUCE_VE_BIT		3
#define TD_CTLS_REDUCE_VE		BIT_ULL(TD_CTLS_REDUCE_VE_BIT)
#define TD_CTLS_LOCK_BIT		63
#define TD_CTLS_LOCK			BIT_ULL(TD_CTLS_LOCK_BIT)

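The TD_CTLS bits above are toggled through the TDG.VM.WR metadata call (leaf TDG_VM_WR, defined earlier in this hunk) against the TDCS_TD_CTLS field. A minimal, hedged sketch, assuming a tdg_vm_wr(field, value, mask) wrapper around the TDCALL; the wrapper name is an assumption for illustration and is not added by this patch:

/* Hypothetical: ask the TDX module to enumerate topology via CPUID by
 * setting TD_CTLS_ENUM_TOPOLOGY in the TDCS_TD_CTLS field.
 * tdg_vm_wr() is an assumed TDG.VM.WR wrapper taking (field, value, mask).
 */
static void example_enable_enum_topology(void)
{
	tdg_vm_wr(TDCS_TD_CTLS, TD_CTLS_ENUM_TOPOLOGY, TD_CTLS_ENUM_TOPOLOGY);
}
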
/* TDX hypercall Leaf IDs */
|
||||
#define TDVMCALL_MAP_GPA 0x10001
|
||||
|
@ -115,7 +115,7 @@ static inline void wrpkru(u32 pkru)
|
||||
}
|
||||
#endif
|
||||
|
||||
static __always_inline void native_wbinvd(void)
|
||||
static __always_inline void wbinvd(void)
|
||||
{
|
||||
asm volatile("wbinvd": : :"memory");
|
||||
}
|
||||
@ -167,12 +167,6 @@ static inline void __write_cr4(unsigned long x)
|
||||
{
|
||||
native_write_cr4(x);
|
||||
}
|
||||
|
||||
static __always_inline void wbinvd(void)
|
||||
{
|
||||
native_wbinvd();
|
||||
}
|
||||
|
||||
#endif /* CONFIG_PARAVIRT_XXL */
|
||||
|
||||
static __always_inline void clflush(volatile void *__p)
|
||||
|
@ -417,7 +417,9 @@ struct sev_es_save_area {
|
||||
u8 reserved_0x298[80];
|
||||
u32 pkru;
|
||||
u32 tsc_aux;
|
||||
u8 reserved_0x2f0[24];
|
||||
u64 tsc_scale;
|
||||
u64 tsc_offset;
|
||||
u8 reserved_0x300[8];
|
||||
u64 rcx;
|
||||
u64 rdx;
|
||||
u64 rbx;
|
||||
@ -564,7 +566,7 @@ static inline void __unused_size_checks(void)
|
||||
BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x1c0);
|
||||
BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x248);
|
||||
BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x298);
|
||||
BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x2f0);
|
||||
BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x300);
|
||||
BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x320);
|
||||
BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x380);
|
||||
BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x3f0);
|
||||
|
@ -66,6 +66,9 @@ int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport);
|
||||
|
||||
u64 tdx_hcall_get_quote(u8 *buf, size_t size);
|
||||
|
||||
void __init tdx_dump_attributes(u64 td_attr);
|
||||
void __init tdx_dump_td_ctls(u64 td_ctls);
|
||||
|
||||
#else
|
||||
|
||||
static inline void tdx_early_init(void) { };
|
||||
|
@ -222,6 +222,7 @@ struct flush_tlb_info {
|
||||
unsigned int initiating_cpu;
|
||||
u8 stride_shift;
|
||||
u8 freed_tables;
|
||||
u8 trim_cpumask;
|
||||
};
|
||||
|
||||
void flush_tlb_local(void);
|
||||
|
@ -143,6 +143,7 @@ extern const struct cpumask *cpu_clustergroup_mask(int cpu);
|
||||
#define topology_logical_package_id(cpu) (cpu_data(cpu).topo.logical_pkg_id)
|
||||
#define topology_physical_package_id(cpu) (cpu_data(cpu).topo.pkg_id)
|
||||
#define topology_logical_die_id(cpu) (cpu_data(cpu).topo.logical_die_id)
|
||||
#define topology_logical_core_id(cpu) (cpu_data(cpu).topo.logical_core_id)
|
||||
#define topology_die_id(cpu) (cpu_data(cpu).topo.die_id)
|
||||
#define topology_core_id(cpu) (cpu_data(cpu).topo.core_id)
|
||||
#define topology_ppin(cpu) (cpu_data(cpu).ppin)
|
||||
|
@ -119,6 +119,7 @@ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
|
||||
obj-$(CONFIG_HPET_TIMER) += hpet.o
|
||||
|
||||
obj-$(CONFIG_AMD_NB) += amd_nb.o
|
||||
obj-$(CONFIG_AMD_NODE) += amd_node.o
|
||||
obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o
|
||||
|
||||
obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o
|
||||
|
@ -13,6 +13,7 @@
|
||||
#include <linux/sched.h>
|
||||
|
||||
#include <acpi/processor.h>
|
||||
#include <asm/cpuid.h>
|
||||
#include <asm/mwait.h>
|
||||
#include <asm/special_insns.h>
|
||||
|
||||
@ -128,7 +129,7 @@ static long acpi_processor_ffh_cstate_probe_cpu(void *_cx)
|
||||
unsigned int cstate_type; /* C-state type and not ACPI C-state type */
|
||||
unsigned int num_cstate_subtype;
|
||||
|
||||
cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
|
||||
cpuid(CPUID_LEAF_MWAIT, &eax, &ebx, &ecx, &edx);
|
||||
|
||||
/* Check whether this particular cx_type (in CST) is supported or not */
|
||||
cstate_type = (((cx->address >> MWAIT_SUBSTATE_SIZE) &
|
||||
@ -172,7 +173,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
|
||||
struct cpuinfo_x86 *c = &cpu_data(cpu);
|
||||
long retval;
|
||||
|
||||
if (!cpu_cstate_entry || c->cpuid_level < CPUID_MWAIT_LEAF)
|
||||
if (!cpu_cstate_entry || c->cpuid_level < CPUID_LEAF_MWAIT)
|
||||
return -1;
|
||||
|
||||
if (reg->bit_offset != NATIVE_CSTATE_BEYOND_HALT)
|
||||
|
@ -1854,11 +1854,18 @@ static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
|
||||
return temp_state;
|
||||
}
|
||||
|
||||
__ro_after_init struct mm_struct *poking_mm;
|
||||
__ro_after_init unsigned long poking_addr;
|
||||
|
||||
static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
|
||||
{
|
||||
lockdep_assert_irqs_disabled();
|
||||
|
||||
switch_mm_irqs_off(NULL, prev_state.mm, current);
|
||||
|
||||
/* Clear the cpumask, to indicate no TLB flushing is needed anywhere */
|
||||
cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(poking_mm));
|
||||
|
||||
/*
|
||||
* Restore the breakpoints if they were disabled before the temporary mm
|
||||
* was loaded.
|
||||
@ -1867,9 +1874,6 @@ static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
|
||||
hw_breakpoint_restore();
|
||||
}
|
||||
|
||||
__ro_after_init struct mm_struct *poking_mm;
|
||||
__ro_after_init unsigned long poking_addr;
|
||||
|
||||
static void text_poke_memcpy(void *dst, const void *src, size_t len)
|
||||
{
|
||||
memcpy(dst, src, len);
|
||||
|
@ -15,66 +15,8 @@
|
||||
#include <linux/pci_ids.h>
|
||||
#include <asm/amd_nb.h>
|
||||
|
||||
#define PCI_DEVICE_ID_AMD_17H_ROOT 0x1450
|
||||
#define PCI_DEVICE_ID_AMD_17H_M10H_ROOT 0x15d0
|
||||
#define PCI_DEVICE_ID_AMD_17H_M30H_ROOT 0x1480
|
||||
#define PCI_DEVICE_ID_AMD_17H_M60H_ROOT 0x1630
|
||||
#define PCI_DEVICE_ID_AMD_17H_MA0H_ROOT 0x14b5
|
||||
#define PCI_DEVICE_ID_AMD_19H_M10H_ROOT 0x14a4
|
||||
#define PCI_DEVICE_ID_AMD_19H_M40H_ROOT 0x14b5
|
||||
#define PCI_DEVICE_ID_AMD_19H_M60H_ROOT 0x14d8
|
||||
#define PCI_DEVICE_ID_AMD_19H_M70H_ROOT 0x14e8
|
||||
#define PCI_DEVICE_ID_AMD_1AH_M00H_ROOT 0x153a
|
||||
#define PCI_DEVICE_ID_AMD_1AH_M20H_ROOT 0x1507
|
||||
#define PCI_DEVICE_ID_AMD_1AH_M60H_ROOT 0x1122
|
||||
#define PCI_DEVICE_ID_AMD_MI200_ROOT 0x14bb
|
||||
#define PCI_DEVICE_ID_AMD_MI300_ROOT 0x14f8
|
||||
|
||||
#define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464
|
||||
#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec
|
||||
#define PCI_DEVICE_ID_AMD_17H_M30H_DF_F4 0x1494
|
||||
#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F4 0x144c
|
||||
#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F4 0x1444
|
||||
#define PCI_DEVICE_ID_AMD_17H_MA0H_DF_F4 0x1728
|
||||
#define PCI_DEVICE_ID_AMD_19H_DF_F4 0x1654
|
||||
#define PCI_DEVICE_ID_AMD_19H_M10H_DF_F4 0x14b1
|
||||
#define PCI_DEVICE_ID_AMD_19H_M40H_DF_F4 0x167d
|
||||
#define PCI_DEVICE_ID_AMD_19H_M50H_DF_F4 0x166e
|
||||
#define PCI_DEVICE_ID_AMD_19H_M60H_DF_F4 0x14e4
|
||||
#define PCI_DEVICE_ID_AMD_19H_M70H_DF_F4 0x14f4
|
||||
#define PCI_DEVICE_ID_AMD_19H_M78H_DF_F4 0x12fc
|
||||
#define PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4 0x12c4
|
||||
#define PCI_DEVICE_ID_AMD_1AH_M20H_DF_F4 0x16fc
|
||||
#define PCI_DEVICE_ID_AMD_1AH_M60H_DF_F4 0x124c
|
||||
#define PCI_DEVICE_ID_AMD_1AH_M70H_DF_F4 0x12bc
|
||||
#define PCI_DEVICE_ID_AMD_MI200_DF_F4 0x14d4
|
||||
#define PCI_DEVICE_ID_AMD_MI300_DF_F4 0x152c
|
||||
|
||||
/* Protect the PCI config register pairs used for SMN. */
|
||||
static DEFINE_MUTEX(smn_mutex);
|
||||
|
||||
static u32 *flush_words;
|
||||
|
||||
static const struct pci_device_id amd_root_ids[] = {
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_ROOT) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_ROOT) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_ROOT) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_ROOT) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_MA0H_ROOT) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_ROOT) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_ROOT) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_ROOT) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_ROOT) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_ROOT) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_ROOT) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M60H_ROOT) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_ROOT) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_ROOT) },
|
||||
{}
|
||||
};
|
||||
|
||||
#define PCI_DEVICE_ID_AMD_CNB17H_F4 0x1704
|
||||
|
||||
static const struct pci_device_id amd_nb_misc_ids[] = {
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
|
||||
@ -84,70 +26,6 @@ static const struct pci_device_id amd_nb_misc_ids[] = {
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F3) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F3) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F3) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F3) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_MA0H_DF_F3) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F3) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F3) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_DF_F3) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F3) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F3) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_DF_F3) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_DF_F3) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M78H_DF_F3) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F3) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_DF_F3) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M60H_DF_F3) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M70H_DF_F3) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F3) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_DF_F3) },
|
||||
{}
|
||||
};
|
||||
|
||||
static const struct pci_device_id amd_nb_link_ids[] = {
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F4) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F4) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F4) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F4) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F4) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F4) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F4) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_MA0H_DF_F4) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F4) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_DF_F4) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F4) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F4) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_DF_F4) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_DF_F4) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M78H_DF_F4) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_DF_F4) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M60H_DF_F4) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M70H_DF_F4) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F4) },
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_DF_F4) },
|
||||
{}
|
||||
};
|
||||
|
||||
static const struct pci_device_id hygon_root_ids[] = {
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_HYGON, PCI_DEVICE_ID_AMD_17H_ROOT) },
|
||||
{}
|
||||
};
|
||||
|
||||
static const struct pci_device_id hygon_nb_misc_ids[] = {
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_HYGON, PCI_DEVICE_ID_AMD_17H_DF_F3) },
|
||||
{}
|
||||
};
|
||||
|
||||
static const struct pci_device_id hygon_nb_link_ids[] = {
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_HYGON, PCI_DEVICE_ID_AMD_17H_DF_F4) },
|
||||
{}
|
||||
};
|
||||
|
||||
@ -178,176 +56,37 @@ struct amd_northbridge *node_to_amd_nb(int node)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(node_to_amd_nb);
|
||||
|
||||
static struct pci_dev *next_northbridge(struct pci_dev *dev,
|
||||
const struct pci_device_id *ids)
|
||||
{
|
||||
do {
|
||||
dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
|
||||
if (!dev)
|
||||
break;
|
||||
} while (!pci_match_id(ids, dev));
|
||||
return dev;
|
||||
}
|
||||
|
||||
/*
|
||||
* SMN accesses may fail in ways that are difficult to detect here in the called
|
||||
* functions amd_smn_read() and amd_smn_write(). Therefore, callers must do
|
||||
* their own checking based on what behavior they expect.
|
||||
*
|
||||
* For SMN reads, the returned value may be zero if the register is Read-as-Zero.
|
||||
* Or it may be a "PCI Error Response", e.g. all 0xFFs. The "PCI Error Response"
|
||||
* can be checked here, and a proper error code can be returned.
|
||||
*
|
||||
* But the Read-as-Zero response cannot be verified here. A value of 0 may be
|
||||
* correct in some cases, so callers must check that this correct is for the
|
||||
* register/fields they need.
|
||||
*
|
||||
* For SMN writes, success can be determined through a "write and read back"
|
||||
* However, this is not robust when done here.
|
||||
*
|
||||
* Possible issues:
|
||||
*
|
||||
* 1) Bits that are "Write-1-to-Clear". In this case, the read value should
|
||||
* *not* match the write value.
|
||||
*
|
||||
* 2) Bits that are "Read-as-Zero"/"Writes-Ignored". This information cannot be
|
||||
* known here.
|
||||
*
|
||||
* 3) Bits that are "Reserved / Set to 1". Ditto above.
|
||||
*
|
||||
* Callers of amd_smn_write() should do the "write and read back" check
|
||||
* themselves, if needed.
|
||||
*
|
||||
* For #1, they can see if their target bits got cleared.
|
||||
*
|
||||
* For #2 and #3, they can check if their target bits got set as intended.
|
||||
*
|
||||
* This matches what is done for RDMSR/WRMSR. As long as there's no #GP, then
|
||||
* the operation is considered a success, and the caller does their own
|
||||
* checking.
|
||||
*/
|
||||
static int __amd_smn_rw(u16 node, u32 address, u32 *value, bool write)
|
||||
{
|
||||
struct pci_dev *root;
|
||||
int err = -ENODEV;
|
||||
|
||||
if (node >= amd_northbridges.num)
|
||||
goto out;
|
||||
|
||||
root = node_to_amd_nb(node)->root;
|
||||
if (!root)
|
||||
goto out;
|
||||
|
||||
mutex_lock(&smn_mutex);
|
||||
|
||||
err = pci_write_config_dword(root, 0x60, address);
|
||||
if (err) {
|
||||
pr_warn("Error programming SMN address 0x%x.\n", address);
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
err = (write ? pci_write_config_dword(root, 0x64, *value)
|
||||
: pci_read_config_dword(root, 0x64, value));
|
||||
|
||||
out_unlock:
|
||||
mutex_unlock(&smn_mutex);
|
||||
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
int __must_check amd_smn_read(u16 node, u32 address, u32 *value)
|
||||
{
|
||||
int err = __amd_smn_rw(node, address, value, false);
|
||||
|
||||
if (PCI_POSSIBLE_ERROR(*value)) {
|
||||
err = -ENODEV;
|
||||
*value = 0;
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(amd_smn_read);
|
||||
|
||||
int __must_check amd_smn_write(u16 node, u32 address, u32 value)
|
||||
{
|
||||
return __amd_smn_rw(node, address, &value, true);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(amd_smn_write);
|
||||
|
||||
|
||||
static int amd_cache_northbridges(void)
|
||||
{
|
||||
const struct pci_device_id *misc_ids = amd_nb_misc_ids;
|
||||
const struct pci_device_id *link_ids = amd_nb_link_ids;
|
||||
const struct pci_device_id *root_ids = amd_root_ids;
|
||||
struct pci_dev *root, *misc, *link;
|
||||
struct amd_northbridge *nb;
|
||||
u16 roots_per_misc = 0;
|
||||
u16 misc_count = 0;
|
||||
u16 root_count = 0;
|
||||
u16 i, j;
|
||||
u16 i;
|
||||
|
||||
if (amd_northbridges.num)
|
||||
return 0;
|
||||
|
||||
if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) {
|
||||
root_ids = hygon_root_ids;
|
||||
misc_ids = hygon_nb_misc_ids;
|
||||
link_ids = hygon_nb_link_ids;
|
||||
}
|
||||
amd_northbridges.num = amd_num_nodes();
|
||||
|
||||
misc = NULL;
|
||||
while ((misc = next_northbridge(misc, misc_ids)))
|
||||
misc_count++;
|
||||
|
||||
if (!misc_count)
|
||||
return -ENODEV;
|
||||
|
||||
root = NULL;
|
||||
while ((root = next_northbridge(root, root_ids)))
|
||||
root_count++;
|
||||
|
||||
if (root_count) {
|
||||
roots_per_misc = root_count / misc_count;
|
||||
|
||||
/*
|
||||
* There should be _exactly_ N roots for each DF/SMN
|
||||
* interface.
|
||||
*/
|
||||
if (!roots_per_misc || (root_count % roots_per_misc)) {
|
||||
pr_info("Unsupported AMD DF/PCI configuration found\n");
|
||||
return -ENODEV;
|
||||
}
|
||||
}
|
||||
|
||||
nb = kcalloc(misc_count, sizeof(struct amd_northbridge), GFP_KERNEL);
|
||||
nb = kcalloc(amd_northbridges.num, sizeof(struct amd_northbridge), GFP_KERNEL);
|
||||
if (!nb)
|
||||
return -ENOMEM;
|
||||
|
||||
amd_northbridges.nb = nb;
|
||||
amd_northbridges.num = misc_count;
|
||||
|
||||
link = misc = root = NULL;
|
||||
for (i = 0; i < amd_northbridges.num; i++) {
|
||||
node_to_amd_nb(i)->root = root =
|
||||
next_northbridge(root, root_ids);
|
||||
node_to_amd_nb(i)->misc = misc =
|
||||
next_northbridge(misc, misc_ids);
|
||||
node_to_amd_nb(i)->link = link =
|
||||
next_northbridge(link, link_ids);
|
||||
node_to_amd_nb(i)->root = amd_node_get_root(i);
|
||||
node_to_amd_nb(i)->misc = amd_node_get_func(i, 3);
|
||||
|
||||
/*
|
||||
* If there are more PCI root devices than data fabric/
|
||||
* system management network interfaces, then the (N)
|
||||
* PCI roots per DF/SMN interface are functionally the
|
||||
* same (for DF/SMN access) and N-1 are redundant. N-1
|
||||
* PCI roots should be skipped per DF/SMN interface so
|
||||
* the following DF/SMN interfaces get mapped to
|
||||
* correct PCI roots.
|
||||
* Each Northbridge must have a 'misc' device.
|
||||
* If not, then uninitialize everything.
|
||||
*/
|
||||
for (j = 1; j < roots_per_misc; j++)
|
||||
root = next_northbridge(root, root_ids);
|
||||
if (!node_to_amd_nb(i)->misc) {
|
||||
amd_northbridges.num = 0;
|
||||
kfree(nb);
|
||||
return -ENODEV;
|
||||
}
|
||||
|
||||
node_to_amd_nb(i)->link = amd_node_get_func(i, 4);
|
||||
}
|
||||
|
||||
if (amd_gart_present())
|
||||
@ -385,7 +124,6 @@ static int amd_cache_northbridges(void)
|
||||
*/
|
||||
bool __init early_is_amd_nb(u32 device)
|
||||
{
|
||||
const struct pci_device_id *misc_ids = amd_nb_misc_ids;
|
||||
const struct pci_device_id *id;
|
||||
u32 vendor = device & 0xffff;
|
||||
|
||||
@ -393,11 +131,11 @@ bool __init early_is_amd_nb(u32 device)
|
||||
boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
|
||||
return false;
|
||||
|
||||
if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
|
||||
misc_ids = hygon_nb_misc_ids;
|
||||
if (cpu_feature_enabled(X86_FEATURE_ZEN))
|
||||
return false;
|
||||
|
||||
device >>= 16;
|
||||
for (id = misc_ids; id->vendor; id++)
|
||||
for (id = amd_nb_misc_ids; id->vendor; id++)
|
||||
if (vendor == id->vendor && device == id->device)
|
||||
return true;
|
||||
return false;
|
||||
@ -582,6 +320,10 @@ static __init void fix_erratum_688(void)
|
||||
|
||||
static __init int init_amd_nbs(void)
|
||||
{
|
||||
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
|
||||
boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
|
||||
return 0;
|
||||
|
||||
amd_cache_northbridges();
|
||||
amd_cache_gart();
|

arch/x86/kernel/amd_node.c (new file, 215 lines)
@ -0,0 +1,215 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * AMD Node helper functions and common defines
 *
 * Copyright (c) 2024, Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Author: Yazen Ghannam <Yazen.Ghannam@amd.com>
 */

#include <asm/amd_node.h>

/*
 * AMD Nodes are a physical collection of I/O devices within an SoC. There can be one
 * or more nodes per package.
 *
 * The nodes are software-visible through PCI config space. All nodes are enumerated
 * on segment 0 bus 0. The device (slot) numbers range from 0x18 to 0x1F (maximum 8
 * nodes) with 0x18 corresponding to node 0, 0x19 to node 1, etc. Each node can be a
 * multi-function device.
 *
 * On legacy systems, these node devices represent integrated Northbridge functionality.
 * On Zen-based systems, these node devices represent Data Fabric functionality.
 *
 * See "Configuration Space Accesses" section in BKDGs or
 * "Processor x86 Core" -> "Configuration Space" section in PPRs.
 */
struct pci_dev *amd_node_get_func(u16 node, u8 func)
{
	if (node >= MAX_AMD_NUM_NODES)
		return NULL;

	return pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(AMD_NODE0_PCI_SLOT + node, func));
}

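As a quick illustration of the enumeration described above, here is a minimal, hedged sketch of a caller; the function name and register offset are illustrative, not part of this patch. Note that amd_node_get_func() returns a referenced device (via pci_get_domain_bus_and_slot()), so a short-lived caller drops the reference itself:

/* Hypothetical caller: read one config dword from function 3 of an AMD node. */
static int example_read_node_f3(u16 node, int offset, u32 *val)
{
	struct pci_dev *f3 = amd_node_get_func(node, 3);
	int err;

	if (!f3)
		return -ENODEV;

	err = pci_read_config_dword(f3, offset, val);
	pci_dev_put(f3);	/* drop the reference taken by pci_get_domain_bus_and_slot() */

	return pcibios_err_to_errno(err);
}

The amd_nb.c hunk above uses the same helper to populate its 'misc' (function 3) and 'link' (function 4) devices.
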
#define DF_BLK_INST_CNT		0x040
#define DF_CFG_ADDR_CNTL_LEGACY	0x084
#define DF_CFG_ADDR_CNTL_DF4	0xC04

#define DF_MAJOR_REVISION	GENMASK(27, 24)

static u16 get_cfg_addr_cntl_offset(struct pci_dev *df_f0)
{
	u32 reg;

	/*
	 * Revision fields added for DF4 and later.
	 *
	 * Major revision of '0' is found pre-DF4. Field is Read-as-Zero.
	 */
	if (pci_read_config_dword(df_f0, DF_BLK_INST_CNT, &reg))
		return 0;

	if (reg & DF_MAJOR_REVISION)
		return DF_CFG_ADDR_CNTL_DF4;

	return DF_CFG_ADDR_CNTL_LEGACY;
}

struct pci_dev *amd_node_get_root(u16 node)
{
	struct pci_dev *root;
	u16 cntl_off;
	u8 bus;

	if (!cpu_feature_enabled(X86_FEATURE_ZEN))
		return NULL;

	/*
	 * D18F0xXXX [Config Address Control] (DF::CfgAddressCntl)
	 * Bits [7:0] (SecBusNum) holds the bus number of the root device for
	 * this Data Fabric instance. The segment, device, and function will be 0.
	 */
	struct pci_dev *df_f0 __free(pci_dev_put) = amd_node_get_func(node, 0);
	if (!df_f0)
		return NULL;

	cntl_off = get_cfg_addr_cntl_offset(df_f0);
	if (!cntl_off)
		return NULL;

	if (pci_read_config_byte(df_f0, cntl_off, &bus))
		return NULL;

	/* Grab the pointer for the actual root device instance. */
	root = pci_get_domain_bus_and_slot(0, bus, 0);

	pci_dbg(root, "is root for AMD node %u\n", node);
	return root;
}

static struct pci_dev **amd_roots;

/* Protect the PCI config register pairs used for SMN. */
static DEFINE_MUTEX(smn_mutex);

#define SMN_INDEX_OFFSET	0x60
#define SMN_DATA_OFFSET		0x64

/*
 * SMN accesses may fail in ways that are difficult to detect here in the called
 * functions amd_smn_read() and amd_smn_write(). Therefore, callers must do
 * their own checking based on what behavior they expect.
 *
 * For SMN reads, the returned value may be zero if the register is Read-as-Zero.
 * Or it may be a "PCI Error Response", e.g. all 0xFFs. The "PCI Error Response"
 * can be checked here, and a proper error code can be returned.
 *
 * But the Read-as-Zero response cannot be verified here. A value of 0 may be
 * correct in some cases, so callers must check that this value is correct for
 * the register/fields they need.
 *
 * For SMN writes, success can be determined through a "write and read back".
 * However, this is not robust when done here.
 *
 * Possible issues:
 *
 * 1) Bits that are "Write-1-to-Clear". In this case, the read value should
 *    *not* match the write value.
 *
 * 2) Bits that are "Read-as-Zero"/"Writes-Ignored". This information cannot be
 *    known here.
 *
 * 3) Bits that are "Reserved / Set to 1". Ditto above.
 *
 * Callers of amd_smn_write() should do the "write and read back" check
 * themselves, if needed.
 *
 * For #1, they can see if their target bits got cleared.
 *
 * For #2 and #3, they can check if their target bits got set as intended.
 *
 * This matches what is done for RDMSR/WRMSR. As long as there's no #GP, then
 * the operation is considered a success, and the caller does their own
 * checking.
 */
static int __amd_smn_rw(u8 i_off, u8 d_off, u16 node, u32 address, u32 *value, bool write)
{
	struct pci_dev *root;
	int err = -ENODEV;

	if (node >= amd_num_nodes())
		return err;

	root = amd_roots[node];
	if (!root)
		return err;

	guard(mutex)(&smn_mutex);

	err = pci_write_config_dword(root, i_off, address);
	if (err) {
		pr_warn("Error programming SMN address 0x%x.\n", address);
		return pcibios_err_to_errno(err);
	}

	err = (write ? pci_write_config_dword(root, d_off, *value)
		     : pci_read_config_dword(root, d_off, value));

	return pcibios_err_to_errno(err);
}

int __must_check amd_smn_read(u16 node, u32 address, u32 *value)
{
	int err = __amd_smn_rw(SMN_INDEX_OFFSET, SMN_DATA_OFFSET, node, address, value, false);

	if (PCI_POSSIBLE_ERROR(*value)) {
		err = -ENODEV;
		*value = 0;
	}

	return err;
}
EXPORT_SYMBOL_GPL(amd_smn_read);

int __must_check amd_smn_write(u16 node, u32 address, u32 value)
{
	return __amd_smn_rw(SMN_INDEX_OFFSET, SMN_DATA_OFFSET, node, address, &value, true);
}
EXPORT_SYMBOL_GPL(amd_smn_write);

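To make the "write and read back" convention from the comment above concrete, here is a minimal, hedged sketch of a caller-side check; the function name, register offset, and bit are illustrative assumptions and are not part of this patch:

/* Hypothetical caller: set one SMN bit and verify it stuck, per the comment above. */
static int example_smn_set_and_verify(u16 node, u32 reg, u32 bit)
{
	u32 val;
	int err;

	err = amd_smn_read(node, reg, &val);
	if (err)
		return err;

	err = amd_smn_write(node, reg, val | bit);
	if (err)
		return err;

	/* The "write and read back" check is done here, by the caller. */
	err = amd_smn_read(node, reg, &val);
	if (err)
		return err;

	return (val & bit) ? 0 : -EIO;
}
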
static int amd_cache_roots(void)
|
||||
{
|
||||
u16 node, num_nodes = amd_num_nodes();
|
||||
|
||||
amd_roots = kcalloc(num_nodes, sizeof(*amd_roots), GFP_KERNEL);
|
||||
if (!amd_roots)
|
||||
return -ENOMEM;
|
||||
|
||||
for (node = 0; node < num_nodes; node++)
|
||||
amd_roots[node] = amd_node_get_root(node);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int __init amd_smn_init(void)
|
||||
{
|
||||
int err;
|
||||
|
||||
if (!cpu_feature_enabled(X86_FEATURE_ZEN))
|
||||
return 0;
|
||||
|
||||
guard(mutex)(&smn_mutex);
|
||||
|
||||
if (amd_roots)
|
||||
return 0;
|
||||
|
||||
err = amd_cache_roots();
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
fs_initcall(amd_smn_init);
|
@ -509,19 +509,19 @@ static struct clock_event_device lapic_clockevent = {
|
||||
static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
|
||||
|
||||
static const struct x86_cpu_id deadline_match[] __initconst = {
|
||||
X86_MATCH_VFM_STEPPINGS(INTEL_HASWELL_X, X86_STEPPINGS(0x2, 0x2), 0x3a), /* EP */
|
||||
X86_MATCH_VFM_STEPPINGS(INTEL_HASWELL_X, X86_STEPPINGS(0x4, 0x4), 0x0f), /* EX */
|
||||
X86_MATCH_VFM_STEPS(INTEL_HASWELL_X, 0x2, 0x2, 0x3a), /* EP */
|
||||
X86_MATCH_VFM_STEPS(INTEL_HASWELL_X, 0x4, 0x4, 0x0f), /* EX */
|
||||
|
||||
X86_MATCH_VFM(INTEL_BROADWELL_X, 0x0b000020),
|
||||
|
||||
X86_MATCH_VFM_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPINGS(0x2, 0x2), 0x00000011),
|
||||
X86_MATCH_VFM_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPINGS(0x3, 0x3), 0x0700000e),
|
||||
X86_MATCH_VFM_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPINGS(0x4, 0x4), 0x0f00000c),
|
||||
X86_MATCH_VFM_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPINGS(0x5, 0x5), 0x0e000003),
|
||||
X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 0x2, 0x2, 0x00000011),
|
||||
X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 0x3, 0x3, 0x0700000e),
|
||||
X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 0x4, 0x4, 0x0f00000c),
|
||||
X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 0x5, 0x5, 0x0e000003),
|
||||
|
||||
X86_MATCH_VFM_STEPPINGS(INTEL_SKYLAKE_X, X86_STEPPINGS(0x3, 0x3), 0x01000136),
|
||||
X86_MATCH_VFM_STEPPINGS(INTEL_SKYLAKE_X, X86_STEPPINGS(0x4, 0x4), 0x02000014),
|
||||
X86_MATCH_VFM_STEPPINGS(INTEL_SKYLAKE_X, X86_STEPPINGS(0x5, 0xf), 0),
|
||||
X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 0x3, 0x3, 0x01000136),
|
||||
X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 0x4, 0x4, 0x02000014),
|
||||
X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 0x5, 0xf, 0),
|
||||
|
||||
X86_MATCH_VFM(INTEL_HASWELL, 0x22),
|
||||
X86_MATCH_VFM(INTEL_HASWELL_L, 0x20),
|
||||
@ -2582,19 +2582,12 @@ int apic_is_clustered_box(void)
|
||||
/*
|
||||
* APIC command line parameters
|
||||
*/
|
||||
static int __init setup_disableapic(char *arg)
|
||||
static int __init setup_nolapic(char *arg)
|
||||
{
|
||||
apic_is_disabled = true;
|
||||
setup_clear_cpu_cap(X86_FEATURE_APIC);
|
||||
return 0;
|
||||
}
|
||||
early_param("disableapic", setup_disableapic);
|
||||
|
||||
/* same as disableapic, for compatibility */
|
||||
static int __init setup_nolapic(char *arg)
|
||||
{
|
||||
return setup_disableapic(arg);
|
||||
}
|
||||
early_param("nolapic", setup_nolapic);
|
||||
|
||||
static int __init parse_lapic_timer_c2_ok(char *arg)
|
||||
|
@ -1165,7 +1165,7 @@ static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries)
|
||||
(entry.ir_index_15 << 15) | entry.ir_index_0_14, entry.ir_zero);
|
||||
} else {
|
||||
apic_dbg("%s, %s, D(%02X%02X), M(%1d)\n", buf,
|
||||
entry.dest_mode_logical ? "logical " : "physic al",
|
||||
entry.dest_mode_logical ? "logical " : "physical",
|
||||
entry.virt_destid_8_14, entry.destid_0_7, entry.delivery_mode);
|
||||
}
|
||||
}
|
||||
|
@ -139,9 +139,15 @@ static bool skip_addr(void *dest)
|
||||
return true;
|
||||
#endif
|
||||
#ifdef CONFIG_KEXEC_CORE
|
||||
# ifdef CONFIG_X86_64
|
||||
if (dest >= (void *)__relocate_kernel_start &&
|
||||
dest < (void *)__relocate_kernel_end)
|
||||
return true;
|
||||
# else
|
||||
if (dest >= (void *)relocate_kernel &&
|
||||
dest < (void*)relocate_kernel + KEXEC_CONTROL_CODE_MAX_SIZE)
|
||||
return true;
|
||||
# endif
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
|
@ -355,10 +355,15 @@ static void bsp_determine_snp(struct cpuinfo_x86 *c)
|
||||
/*
|
||||
* RMP table entry format is not architectural and is defined by the
|
||||
* per-processor PPR. Restrict SNP support on the known CPU models
|
||||
* for which the RMP table entry format is currently defined for.
|
||||
* for which the RMP table entry format is currently defined or for
|
||||
* processors which support the architecturally defined RMPREAD
|
||||
* instruction.
|
||||
*/
|
||||
if (!cpu_has(c, X86_FEATURE_HYPERVISOR) &&
|
||||
c->x86 >= 0x19 && snp_probe_rmptable_info()) {
|
||||
(cpu_feature_enabled(X86_FEATURE_ZEN3) ||
|
||||
cpu_feature_enabled(X86_FEATURE_ZEN4) ||
|
||||
cpu_feature_enabled(X86_FEATURE_RMPREAD)) &&
|
||||
snp_probe_rmptable_info()) {
|
||||
cc_platform_set(CC_ATTR_HOST_SEV_SNP);
|
||||
} else {
|
||||
setup_clear_cpu_cap(X86_FEATURE_SEV_SNP);
|
||||
@ -795,10 +800,9 @@ static void init_amd_bd(struct cpuinfo_x86 *c)
|
||||
clear_rdrand_cpuid_bit(c);
|
||||
}
|
||||
|
||||
static const struct x86_cpu_desc erratum_1386_microcode[] = {
|
||||
AMD_CPU_DESC(0x17, 0x1, 0x2, 0x0800126e),
|
||||
AMD_CPU_DESC(0x17, 0x31, 0x0, 0x08301052),
|
||||
{},
|
||||
static const struct x86_cpu_id erratum_1386_microcode[] = {
|
||||
X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, 0x17, 0x01), 0x2, 0x2, 0x0800126e),
|
||||
X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, 0x17, 0x31), 0x0, 0x0, 0x08301052),
|
||||
};
|
||||
|
||||
static void fix_erratum_1386(struct cpuinfo_x86 *c)
|
||||
@ -814,7 +818,7 @@ static void fix_erratum_1386(struct cpuinfo_x86 *c)
|
||||
* Clear the feature flag only on microcode revisions which
|
||||
* don't have the fix.
|
||||
*/
|
||||
if (x86_cpu_has_min_microcode_rev(erratum_1386_microcode))
|
||||
if (x86_match_min_microcode_rev(erratum_1386_microcode))
|
||||
return;
|
||||
|
||||
clear_cpu_cap(c, X86_FEATURE_XSAVES);
|
||||
|
@ -2615,6 +2615,9 @@ static void __init srso_select_mitigation(void)
|
||||
break;
|
||||
|
||||
case SRSO_CMD_SAFE_RET:
|
||||
if (boot_cpu_has(X86_FEATURE_SRSO_USER_KERNEL_NO))
|
||||
goto ibpb_on_vmexit;
|
||||
|
||||
if (IS_ENABLED(CONFIG_MITIGATION_SRSO)) {
|
||||
/*
|
||||
* Enable the return thunk for generated code
|
||||
@ -2658,6 +2661,7 @@ static void __init srso_select_mitigation(void)
|
||||
}
|
||||
break;
|
||||
|
||||
ibpb_on_vmexit:
|
||||
case SRSO_CMD_IBPB_ON_VMEXIT:
|
||||
if (IS_ENABLED(CONFIG_MITIGATION_SRSO)) {
|
||||
if (!boot_cpu_has(X86_FEATURE_ENTRY_IBPB) && has_microcode) {
|
||||
|
@ -29,6 +29,7 @@
|
||||
|
||||
#include <asm/alternative.h>
|
||||
#include <asm/cmdline.h>
|
||||
#include <asm/cpuid.h>
|
||||
#include <asm/perf_event.h>
|
||||
#include <asm/mmu_context.h>
|
||||
#include <asm/doublefault.h>
|
||||
@ -636,9 +637,9 @@ struct cpuid_dependent_feature {
|
||||
|
||||
static const struct cpuid_dependent_feature
|
||||
cpuid_dependent_features[] = {
|
||||
{ X86_FEATURE_MWAIT, 0x00000005 },
|
||||
{ X86_FEATURE_DCA, 0x00000009 },
|
||||
{ X86_FEATURE_XSAVE, 0x0000000d },
|
||||
{ X86_FEATURE_MWAIT, CPUID_LEAF_MWAIT },
|
||||
{ X86_FEATURE_DCA, CPUID_LEAF_DCA },
|
||||
{ X86_FEATURE_XSAVE, CPUID_LEAF_XSTATE },
|
||||
{ 0, 0 }
|
||||
};
|
||||
|
||||
@ -1201,8 +1202,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
|
||||
#define VULNBL(vendor, family, model, blacklist) \
|
||||
X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, blacklist)
|
||||
|
||||
#define VULNBL_INTEL_STEPPINGS(vfm, steppings, issues) \
|
||||
X86_MATCH_VFM_STEPPINGS(vfm, steppings, issues)
|
||||
#define VULNBL_INTEL_STEPS(vfm, max_stepping, issues) \
|
||||
X86_MATCH_VFM_STEPS(vfm, X86_STEP_MIN, max_stepping, issues)
|
||||
|
||||
#define VULNBL_AMD(family, blacklist) \
|
||||
VULNBL(AMD, family, X86_MODEL_ANY, blacklist)
|
||||
@ -1227,49 +1228,50 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
|
||||
#define RFDS BIT(7)
|
||||
|
||||
static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
|
||||
VULNBL_INTEL_STEPPINGS(INTEL_IVYBRIDGE, X86_STEPPING_ANY, SRBDS),
|
||||
VULNBL_INTEL_STEPPINGS(INTEL_HASWELL, X86_STEPPING_ANY, SRBDS),
|
||||
VULNBL_INTEL_STEPPINGS(INTEL_HASWELL_L, X86_STEPPING_ANY, SRBDS),
|
||||
VULNBL_INTEL_STEPPINGS(INTEL_HASWELL_G, X86_STEPPING_ANY, SRBDS),
|
||||
VULNBL_INTEL_STEPPINGS(INTEL_HASWELL_X, X86_STEPPING_ANY, MMIO),
|
||||
VULNBL_INTEL_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPING_ANY, MMIO),
|
||||
VULNBL_INTEL_STEPPINGS(INTEL_BROADWELL_G, X86_STEPPING_ANY, SRBDS),
|
||||
VULNBL_INTEL_STEPPINGS(INTEL_BROADWELL_X, X86_STEPPING_ANY, MMIO),
|
||||
VULNBL_INTEL_STEPPINGS(INTEL_BROADWELL, X86_STEPPING_ANY, SRBDS),
|
||||
VULNBL_INTEL_STEPPINGS(INTEL_SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED | GDS),
|
||||
VULNBL_INTEL_STEPPINGS(INTEL_SKYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
|
||||
VULNBL_INTEL_STEPPINGS(INTEL_SKYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
|
||||
VULNBL_INTEL_STEPPINGS(INTEL_KABYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
|
||||
VULNBL_INTEL_STEPPINGS(INTEL_KABYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
|
||||
VULNBL_INTEL_STEPPINGS(INTEL_CANNONLAKE_L, X86_STEPPING_ANY, RETBLEED),
|
||||
VULNBL_INTEL_STEPPINGS(INTEL_ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
|
||||
VULNBL_INTEL_STEPPINGS(INTEL_ICELAKE_D, X86_STEPPING_ANY, MMIO | GDS),
|
||||
VULNBL_INTEL_STEPPINGS(INTEL_ICELAKE_X, X86_STEPPING_ANY, MMIO | GDS),
|
||||
VULNBL_INTEL_STEPPINGS(INTEL_COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
|
||||
VULNBL_INTEL_STEPPINGS(INTEL_COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO | RETBLEED),
|
||||
VULNBL_INTEL_STEPPINGS(INTEL_COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
|
||||
VULNBL_INTEL_STEPPINGS(INTEL_TIGERLAKE_L, X86_STEPPING_ANY, GDS),
|
||||
VULNBL_INTEL_STEPPINGS(INTEL_TIGERLAKE, X86_STEPPING_ANY, GDS),
|
||||
VULNBL_INTEL_STEPPINGS(INTEL_LAKEFIELD, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED),
|
||||
VULNBL_INTEL_STEPPINGS(INTEL_ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS),
|
||||
VULNBL_INTEL_STEPPINGS(INTEL_ALDERLAKE, X86_STEPPING_ANY, RFDS),
|
||||
VULNBL_INTEL_STEPPINGS(INTEL_ALDERLAKE_L, X86_STEPPING_ANY, RFDS),
VULNBL_INTEL_STEPPINGS(INTEL_RAPTORLAKE, X86_STEPPING_ANY, RFDS),
VULNBL_INTEL_STEPPINGS(INTEL_RAPTORLAKE_P, X86_STEPPING_ANY, RFDS),
VULNBL_INTEL_STEPPINGS(INTEL_RAPTORLAKE_S, X86_STEPPING_ANY, RFDS),
VULNBL_INTEL_STEPPINGS(INTEL_ATOM_GRACEMONT, X86_STEPPING_ANY, RFDS),
VULNBL_INTEL_STEPPINGS(INTEL_ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RFDS),
VULNBL_INTEL_STEPPINGS(INTEL_ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO | RFDS),
VULNBL_INTEL_STEPPINGS(INTEL_ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RFDS),
VULNBL_INTEL_STEPPINGS(INTEL_ATOM_GOLDMONT, X86_STEPPING_ANY, RFDS),
VULNBL_INTEL_STEPPINGS(INTEL_ATOM_GOLDMONT_D, X86_STEPPING_ANY, RFDS),
VULNBL_INTEL_STEPPINGS(INTEL_ATOM_GOLDMONT_PLUS, X86_STEPPING_ANY, RFDS),
VULNBL_INTEL_STEPS(INTEL_IVYBRIDGE, X86_STEP_MAX, SRBDS),
VULNBL_INTEL_STEPS(INTEL_HASWELL, X86_STEP_MAX, SRBDS),
VULNBL_INTEL_STEPS(INTEL_HASWELL_L, X86_STEP_MAX, SRBDS),
VULNBL_INTEL_STEPS(INTEL_HASWELL_G, X86_STEP_MAX, SRBDS),
VULNBL_INTEL_STEPS(INTEL_HASWELL_X, X86_STEP_MAX, MMIO),
VULNBL_INTEL_STEPS(INTEL_BROADWELL_D, X86_STEP_MAX, MMIO),
VULNBL_INTEL_STEPS(INTEL_BROADWELL_G, X86_STEP_MAX, SRBDS),
VULNBL_INTEL_STEPS(INTEL_BROADWELL_X, X86_STEP_MAX, MMIO),
VULNBL_INTEL_STEPS(INTEL_BROADWELL, X86_STEP_MAX, SRBDS),
VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, X86_STEP_MAX, MMIO | RETBLEED | GDS),
VULNBL_INTEL_STEPS(INTEL_SKYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS),
VULNBL_INTEL_STEPS(INTEL_SKYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS),
VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS),
VULNBL_INTEL_STEPS(INTEL_KABYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS),
VULNBL_INTEL_STEPS(INTEL_CANNONLAKE_L, X86_STEP_MAX, RETBLEED),
VULNBL_INTEL_STEPS(INTEL_ICELAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS),
VULNBL_INTEL_STEPS(INTEL_ICELAKE_D, X86_STEP_MAX, MMIO | GDS),
VULNBL_INTEL_STEPS(INTEL_ICELAKE_X, X86_STEP_MAX, MMIO | GDS),
VULNBL_INTEL_STEPS(INTEL_COMETLAKE, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS),
VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, 0x0, MMIO | RETBLEED),
VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS),
VULNBL_INTEL_STEPS(INTEL_TIGERLAKE_L, X86_STEP_MAX, GDS),
VULNBL_INTEL_STEPS(INTEL_TIGERLAKE, X86_STEP_MAX, GDS),
VULNBL_INTEL_STEPS(INTEL_LAKEFIELD, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED),
VULNBL_INTEL_STEPS(INTEL_ROCKETLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS),
VULNBL_INTEL_STEPS(INTEL_ALDERLAKE, X86_STEP_MAX, RFDS),
VULNBL_INTEL_STEPS(INTEL_ALDERLAKE_L, X86_STEP_MAX, RFDS),
VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE, X86_STEP_MAX, RFDS),
VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_P, X86_STEP_MAX, RFDS),
VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_S, X86_STEP_MAX, RFDS),
VULNBL_INTEL_STEPS(INTEL_ATOM_GRACEMONT, X86_STEP_MAX, RFDS),
VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT, X86_STEP_MAX, MMIO | MMIO_SBDS | RFDS),
VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT_D, X86_STEP_MAX, MMIO | RFDS),
VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RFDS),
VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT, X86_STEP_MAX, RFDS),
VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT_D, X86_STEP_MAX, RFDS),
VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT_PLUS, X86_STEP_MAX, RFDS),

VULNBL_AMD(0x15, RETBLEED),
VULNBL_AMD(0x16, RETBLEED),
VULNBL_AMD(0x17, RETBLEED | SMT_RSB | SRSO),
VULNBL_HYGON(0x18, RETBLEED | SMT_RSB | SRSO),
VULNBL_AMD(0x19, SRSO),
VULNBL_AMD(0x1a, SRSO),
{}
};
@@ -25,6 +25,7 @@ static int cpu_debug_show(struct seq_file *m, void *p)
seq_printf(m, "cpu_type: %s\n", get_topology_cpu_type_name(c));
seq_printf(m, "logical_pkg_id: %u\n", c->topo.logical_pkg_id);
seq_printf(m, "logical_die_id: %u\n", c->topo.logical_die_id);
seq_printf(m, "logical_core_id: %u\n", c->topo.logical_core_id);
seq_printf(m, "llc_id: %u\n", c->topo.llc_id);
seq_printf(m, "l2c_id: %u\n", c->topo.l2c_id);
seq_printf(m, "amd_node_id: %u\n", c->topo.amd_node_id);
@@ -599,11 +599,6 @@ static void init_intel(struct cpuinfo_x86 *c)
if (p)
strcpy(c->x86_model_id, p);
}

if (c->x86 == 15)
set_cpu_cap(c, X86_FEATURE_P4);
if (c->x86 == 6)
set_cpu_cap(c, X86_FEATURE_P3);
#endif

/* Work around errata */
@@ -6,7 +6,7 @@
#include <linux/slab.h>

/**
* x86_match_cpu - match current CPU again an array of x86_cpu_ids
* x86_match_cpu - match current CPU against an array of x86_cpu_ids
* @match: Pointer to array of x86_cpu_ids. Last entry terminated with
* {}.
*
@@ -56,33 +56,13 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match)
}
EXPORT_SYMBOL(x86_match_cpu);

static const struct x86_cpu_desc *
x86_match_cpu_with_stepping(const struct x86_cpu_desc *match)
bool x86_match_min_microcode_rev(const struct x86_cpu_id *table)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
const struct x86_cpu_desc *m;
const struct x86_cpu_id *res = x86_match_cpu(table);

for (m = match; m->x86_family | m->x86_model; m++) {
if (c->x86_vendor != m->x86_vendor)
continue;
if (c->x86 != m->x86_family)
continue;
if (c->x86_model != m->x86_model)
continue;
if (c->x86_stepping != m->x86_stepping)
continue;
return m;
}
return NULL;
}

bool x86_cpu_has_min_microcode_rev(const struct x86_cpu_desc *table)
{
const struct x86_cpu_desc *res = x86_match_cpu_with_stepping(table);

if (!res || res->x86_microcode_rev > boot_cpu_data.microcode)
if (!res || res->driver_data > boot_cpu_data.microcode)
return false;

return true;
}
EXPORT_SYMBOL_GPL(x86_cpu_has_min_microcode_rev);
EXPORT_SYMBOL_GPL(x86_match_min_microcode_rev);
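The replacement helper folds the old stepping-aware lookup into the regular x86_cpu_id tables, carrying the minimum microcode revision in the table's driver_data field. A minimal caller-side sketch, assuming the X86_MATCH_VFM() initializer from <asm/cpu_device_id.h>; the table name and revision values are purely illustrative:

	/* Hypothetical table: each entry's driver_data is the minimum microcode revision. */
	static const struct x86_cpu_id ucode_deps[] = {
		X86_MATCH_VFM(INTEL_SKYLAKE_X, 0x02006e05),	/* illustrative revision */
		X86_MATCH_VFM(INTEL_ICELAKE_X, 0x0b000280),	/* illustrative revision */
		{}
	};

	static bool boot_cpu_ucode_ok(void)
	{
		/* True only if the boot CPU matches an entry and already runs at least that revision. */
		return x86_match_min_microcode_rev(ucode_deps);
	}

The per-stepping filter that x86_match_cpu_with_stepping() provided is assumed to be expressed through the x86_cpu_id entries themselves, as with the X86_STEP_MAX tables above.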
@@ -4,8 +4,6 @@
*
* Written by Jacob Shin - AMD, Inc.
* Maintained by: Borislav Petkov <bp@alien8.de>
*
* All MC4_MISCi registers are shared between cores on a node.
*/
#include <linux/interrupt.h>
#include <linux/notifier.h>
@@ -20,7 +18,6 @@
#include <linux/smp.h>
#include <linux/string.h>

#include <asm/amd_nb.h>
#include <asm/traps.h>
#include <asm/apic.h>
#include <asm/mce.h>
@@ -221,6 +218,32 @@ static const struct smca_hwid smca_hwid_mcatypes[] = {
#define MAX_MCATYPE_NAME_LEN 30
static char buf_mcatype[MAX_MCATYPE_NAME_LEN];

struct threshold_block {
/* This block's number within its bank. */
unsigned int block;
/* MCA bank number that contains this block. */
unsigned int bank;
/* CPU which controls this block's MCA bank. */
unsigned int cpu;
/* MCA_MISC MSR address for this block. */
u32 address;
/* Enable/Disable APIC interrupt. */
bool interrupt_enable;
/* Bank can generate an interrupt. */
bool interrupt_capable;
/* Value upon which threshold interrupt is generated. */
u16 threshold_limit;
/* sysfs object */
struct kobject kobj;
/* List of threshold blocks within this block's MCA bank. */
struct list_head miscj;
};

struct threshold_bank {
struct kobject *kobj;
struct threshold_block *blocks;
};

static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);

/*
@@ -333,19 +356,6 @@ struct thresh_restart {
u16 old_limit;
};

static inline bool is_shared_bank(int bank)
{
/*
* Scalable MCA provides for only one core to have access to the MSRs of
* a shared bank.
*/
if (mce_flags.smca)
return false;

/* Bank 4 is for northbridge reporting and is thus shared */
return (bank == 4);
}

static const char *bank4_names(const struct threshold_block *b)
{
switch (b->address) {
@@ -381,7 +391,7 @@ static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits)
return msr_high_bits & BIT(28);
}

static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
static bool lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
{
int msr = (hi & MASK_LVTOFF_HI) >> 20;

@@ -389,7 +399,7 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt "
"for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu,
b->bank, b->block, b->address, hi, lo);
return 0;
return false;
}

if (apic != msr) {
@@ -399,15 +409,15 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
* was set is reserved. Return early here:
*/
if (mce_flags.smca)
return 0;
return false;

pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d "
"for bank %d, block %d (MSR%08X=0x%x%08x)\n",
b->cpu, apic, b->bank, b->block, b->address, hi, lo);
return 0;
return false;
}

return 1;
return true;
};

/* Reprogram MCx_MISC MSR behind this threshold bank. */
@@ -1198,35 +1208,10 @@ out_free:
return err;
}

static int __threshold_add_blocks(struct threshold_bank *b)
{
struct list_head *head = &b->blocks->miscj;
struct threshold_block *pos = NULL;
struct threshold_block *tmp = NULL;
int err = 0;

err = kobject_add(&b->blocks->kobj, b->kobj, b->blocks->kobj.name);
if (err)
return err;

list_for_each_entry_safe(pos, tmp, head, miscj) {

err = kobject_add(&pos->kobj, b->kobj, pos->kobj.name);
if (err) {
list_for_each_entry_safe_reverse(pos, tmp, head, miscj)
kobject_del(&pos->kobj);

return err;
}
}
return err;
}

static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu,
unsigned int bank)
{
struct device *dev = this_cpu_read(mce_device);
struct amd_northbridge *nb = NULL;
struct threshold_bank *b = NULL;
const char *name = get_name(cpu, bank, NULL);
int err = 0;
@@ -1234,26 +1219,6 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu,
if (!dev)
return -ENODEV;

if (is_shared_bank(bank)) {
nb = node_to_amd_nb(topology_amd_node_id(cpu));

/* threshold descriptor already initialized on this node? */
if (nb && nb->bank4) {
/* yes, use it */
b = nb->bank4;
err = kobject_add(b->kobj, &dev->kobj, name);
if (err)
goto out;

bp[bank] = b;
refcount_inc(&b->cpus);

err = __threshold_add_blocks(b);

goto out;
}
}

b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);
if (!b) {
err = -ENOMEM;
@@ -1267,17 +1232,6 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu,
goto out_free;
}

if (is_shared_bank(bank)) {
b->shared = 1;
refcount_set(&b->cpus, 1);

/* nb is already initialized, see above */
if (nb) {
WARN_ON(nb->bank4);
nb->bank4 = b;
}
}

err = allocate_threshold_blocks(cpu, b, bank, 0, mca_msr_reg(bank, MCA_MISC));
if (err)
goto out_kobj;
@@ -1310,40 +1264,11 @@ static void deallocate_threshold_blocks(struct threshold_bank *bank)
kobject_put(&bank->blocks->kobj);
}

static void __threshold_remove_blocks(struct threshold_bank *b)
{
struct threshold_block *pos = NULL;
struct threshold_block *tmp = NULL;

kobject_put(b->kobj);

list_for_each_entry_safe(pos, tmp, &b->blocks->miscj, miscj)
kobject_put(b->kobj);
}

static void threshold_remove_bank(struct threshold_bank *bank)
{
struct amd_northbridge *nb;

if (!bank->blocks)
goto out_free;

if (!bank->shared)
goto out_dealloc;

if (!refcount_dec_and_test(&bank->cpus)) {
__threshold_remove_blocks(bank);
return;
} else {
/*
* The last CPU on this node using the shared bank is going
* away, remove that bank now.
*/
nb = node_to_amd_nb(topology_amd_node_id(smp_processor_id()));
nb->bank4 = NULL;
}

out_dealloc:
deallocate_threshold_blocks(bank);

out_free:
@@ -151,7 +151,7 @@ EXPORT_PER_CPU_SYMBOL_GPL(injectm);

void mce_log(struct mce_hw_err *err)
{
if (!mce_gen_pool_add(err))
if (mce_gen_pool_add(err))
irq_work_queue(&mce_irq_work);
}
EXPORT_SYMBOL_GPL(mce_log);
@@ -492,10 +492,10 @@ static noinstr void mce_gather_info(struct mce_hw_err *err, struct pt_regs *regs
}
}

int mce_available(struct cpuinfo_x86 *c)
bool mce_available(struct cpuinfo_x86 *c)
{
if (mca_cfg.disabled)
return 0;
return false;
return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

@@ -1778,7 +1778,7 @@ static void mce_timer_delete_all(void)
* Can be called from interrupt context, but not from machine check/NMI
* context.
*/
int mce_notify_irq(void)
bool mce_notify_irq(void)
{
/* Not more than two messages every minute */
static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
@@ -1789,9 +1789,9 @@ int mce_notify_irq(void)
if (__ratelimit(&ratelimit))
pr_info(HW_ERR "Machine check events logged\n");

return 1;
return true;
}
return 0;
return false;
}
EXPORT_SYMBOL_GPL(mce_notify_irq);
@@ -1910,101 +1910,120 @@ static void __mcheck_cpu_check_banks(void)
}
}

/* Add per CPU specific workarounds here */
static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
static void apply_quirks_amd(struct cpuinfo_x86 *c)
{
struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
struct mca_config *cfg = &mca_cfg;

if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
pr_info("unknown CPU type - not enabling MCE support\n");
return -EOPNOTSUPP;
}

/* This should be disabled by the BIOS, but isn't always */
if (c->x86_vendor == X86_VENDOR_AMD) {
if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) {
/*
* disable GART TBL walk error reporting, which
* trips off incorrectly with the IOMMU & 3ware
* & Cerberus:
*/
clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
}
if (c->x86 < 0x11 && cfg->bootlog < 0) {
/*
* Lots of broken BIOS around that don't clear them
* by default and leave crap in there. Don't log:
*/
cfg->bootlog = 0;
}
if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) {
/*
* Various K7s with broken bank 0 around. Always disable
* by default.
* disable GART TBL walk error reporting, which
* trips off incorrectly with the IOMMU & 3ware
* & Cerberus:
*/
if (c->x86 == 6 && this_cpu_read(mce_num_banks) > 0)
mce_banks[0].ctl = 0;

/*
* overflow_recov is supported for F15h Models 00h-0fh
* even though we don't have a CPUID bit for it.
*/
if (c->x86 == 0x15 && c->x86_model <= 0xf)
mce_flags.overflow_recov = 1;

if (c->x86 >= 0x17 && c->x86 <= 0x1A)
mce_flags.zen_ifu_quirk = 1;

clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
}

if (c->x86_vendor == X86_VENDOR_INTEL) {
if (c->x86 < 0x11 && mca_cfg.bootlog < 0) {
/*
* SDM documents that on family 6 bank 0 should not be written
* because it aliases to another special BIOS controlled
* register.
* But it's not aliased anymore on model 0x1a+
* Don't ignore bank 0 completely because there could be a
* valid event later, merely don't write CTL0.
* Lots of broken BIOS around that don't clear them
* by default and leave crap in there. Don't log:
*/

if (c->x86 == 6 && c->x86_model < 0x1A && this_cpu_read(mce_num_banks) > 0)
mce_banks[0].init = false;

/*
* All newer Intel systems support MCE broadcasting. Enable
* synchronization with a one second timeout.
*/
if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
cfg->monarch_timeout < 0)
cfg->monarch_timeout = USEC_PER_SEC;

/*
* There are also broken BIOSes on some Pentium M and
* earlier systems:
*/
if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
cfg->bootlog = 0;

if (c->x86_vfm == INTEL_SANDYBRIDGE_X)
mce_flags.snb_ifu_quirk = 1;

/*
* Skylake, Cascacde Lake and Cooper Lake require a quirk on
* rep movs.
*/
if (c->x86_vfm == INTEL_SKYLAKE_X)
mce_flags.skx_repmov_quirk = 1;
mca_cfg.bootlog = 0;
}

if (c->x86_vendor == X86_VENDOR_ZHAOXIN) {
/*
* All newer Zhaoxin CPUs support MCE broadcasting. Enable
* synchronization with a one second timeout.
*/
if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
if (cfg->monarch_timeout < 0)
cfg->monarch_timeout = USEC_PER_SEC;
}
/*
* Various K7s with broken bank 0 around. Always disable
* by default.
*/
if (c->x86 == 6 && this_cpu_read(mce_num_banks))
mce_banks[0].ctl = 0;

/*
* overflow_recov is supported for F15h Models 00h-0fh
* even though we don't have a CPUID bit for it.
*/
if (c->x86 == 0x15 && c->x86_model <= 0xf)
mce_flags.overflow_recov = 1;

if (c->x86 >= 0x17 && c->x86 <= 0x1A)
mce_flags.zen_ifu_quirk = 1;
}

static void apply_quirks_intel(struct cpuinfo_x86 *c)
{
struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);

/* Older CPUs (prior to family 6) don't need quirks. */
if (c->x86_vfm < INTEL_PENTIUM_PRO)
return;

/*
* SDM documents that on family 6 bank 0 should not be written
* because it aliases to another special BIOS controlled
* register.
* But it's not aliased anymore on model 0x1a+
* Don't ignore bank 0 completely because there could be a
* valid event later, merely don't write CTL0.
*/
if (c->x86_vfm < INTEL_NEHALEM_EP && this_cpu_read(mce_num_banks))
mce_banks[0].init = false;

/*
* All newer Intel systems support MCE broadcasting. Enable
* synchronization with a one second timeout.
*/
if (c->x86_vfm >= INTEL_CORE_YONAH && mca_cfg.monarch_timeout < 0)
mca_cfg.monarch_timeout = USEC_PER_SEC;

/*
* There are also broken BIOSes on some Pentium M and
* earlier systems:
*/
if (c->x86_vfm < INTEL_CORE_YONAH && mca_cfg.bootlog < 0)
mca_cfg.bootlog = 0;

if (c->x86_vfm == INTEL_SANDYBRIDGE_X)
mce_flags.snb_ifu_quirk = 1;

/*
* Skylake, Cascacde Lake and Cooper Lake require a quirk on
* rep movs.
*/
if (c->x86_vfm == INTEL_SKYLAKE_X)
mce_flags.skx_repmov_quirk = 1;
}

static void apply_quirks_zhaoxin(struct cpuinfo_x86 *c)
{
/*
* All newer Zhaoxin CPUs support MCE broadcasting. Enable
* synchronization with a one second timeout.
*/
if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
if (mca_cfg.monarch_timeout < 0)
mca_cfg.monarch_timeout = USEC_PER_SEC;
}
}

/* Add per CPU specific workarounds here */
static bool __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
{
struct mca_config *cfg = &mca_cfg;

switch (c->x86_vendor) {
case X86_VENDOR_UNKNOWN:
pr_info("unknown CPU type - not enabling MCE support\n");
return false;
case X86_VENDOR_AMD:
apply_quirks_amd(c);
break;
case X86_VENDOR_INTEL:
apply_quirks_intel(c);
break;
case X86_VENDOR_ZHAOXIN:
apply_quirks_zhaoxin(c);
break;
}

if (cfg->monarch_timeout < 0)
@@ -2012,28 +2031,28 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
if (cfg->bootlog != 0)
cfg->panic_timeout = 30;

return 0;
return true;
}

static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
static bool __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
{
if (c->x86 != 5)
return 0;
return false;

switch (c->x86_vendor) {
case X86_VENDOR_INTEL:
intel_p5_mcheck_init(c);
mce_flags.p5 = 1;
return 1;
return true;
case X86_VENDOR_CENTAUR:
winchip_mcheck_init(c);
mce_flags.winchip = 1;
return 1;
return true;
default:
return 0;
return false;
}

return 0;
return false;
}

/*
@@ -2099,13 +2118,9 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
mce_intel_feature_init(c);
break;

case X86_VENDOR_AMD: {
mce_amd_feature_init(c);
break;
}

case X86_VENDOR_AMD:
case X86_VENDOR_HYGON:
mce_hygon_feature_init(c);
mce_amd_feature_init(c);
break;

case X86_VENDOR_CENTAUR:
@@ -2279,12 +2294,12 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)

__mcheck_cpu_cap_init();

if (__mcheck_cpu_apply_quirks(c) < 0) {
if (!__mcheck_cpu_apply_quirks(c)) {
mca_cfg.disabled = 1;
return;
}

if (mce_gen_pool_init()) {
if (!mce_gen_pool_init()) {
mca_cfg.disabled = 1;
pr_emerg("Couldn't allocate MCE records pool!\n");
return;
@@ -94,64 +94,63 @@ bool mce_gen_pool_empty(void)
return llist_empty(&mce_event_llist);
}

int mce_gen_pool_add(struct mce_hw_err *err)
bool mce_gen_pool_add(struct mce_hw_err *err)
{
struct mce_evt_llist *node;

if (filter_mce(&err->m))
return -EINVAL;
return false;

if (!mce_evt_pool)
return -EINVAL;
return false;

node = (void *)gen_pool_alloc(mce_evt_pool, sizeof(*node));
if (!node) {
pr_warn_ratelimited("MCE records pool full!\n");
return -ENOMEM;
return false;
}

memcpy(&node->err, err, sizeof(*err));
llist_add(&node->llnode, &mce_event_llist);

return 0;
return true;
}

static int mce_gen_pool_create(void)
static bool mce_gen_pool_create(void)
{
int mce_numrecords, mce_poolsz, order;
struct gen_pool *gpool;
int ret = -ENOMEM;
void *mce_pool;

order = order_base_2(sizeof(struct mce_evt_llist));
gpool = gen_pool_create(order, -1);
if (!gpool)
return ret;
return false;

mce_numrecords = max(MCE_MIN_ENTRIES, num_possible_cpus() * MCE_PER_CPU);
mce_poolsz = mce_numrecords * (1 << order);
mce_pool = kmalloc(mce_poolsz, GFP_KERNEL);
if (!mce_pool) {
gen_pool_destroy(gpool);
return ret;
return false;
}
ret = gen_pool_add(gpool, (unsigned long)mce_pool, mce_poolsz, -1);
if (ret) {

if (gen_pool_add(gpool, (unsigned long)mce_pool, mce_poolsz, -1)) {
gen_pool_destroy(gpool);
kfree(mce_pool);
return ret;
return false;
}

mce_evt_pool = gpool;

return ret;
return true;
}

int mce_gen_pool_init(void)
bool mce_gen_pool_init(void)
{
/* Just init mce_gen_pool once. */
if (mce_evt_pool)
return 0;
return true;

return mce_gen_pool_create();
}
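Both mce_gen_pool_add() and mce_gen_pool_create() sit on top of the lockless gen_pool allocator feeding a lock-less llist, which is what lets records be queued from machine-check context. A reduced sketch of that pattern, with illustrative names rather than the kernel's own:

	#include <linux/genalloc.h>
	#include <linux/llist.h>
	#include <linux/string.h>

	struct evt_node {
		struct llist_node llnode;
		struct mce_hw_err err;		/* layout assumed from the hunks above */
	};

	static struct gen_pool *evt_pool;	/* pre-sized once at init time */
	static LLIST_HEAD(evt_list);		/* producers may run in MCE/NMI context */

	static bool evt_add(struct mce_hw_err *err)
	{
		struct evt_node *node;

		/* gen_pool_alloc() takes no locks and never sleeps, so it is usable here */
		node = (void *)gen_pool_alloc(evt_pool, sizeof(*node));
		if (!node)
			return false;

		memcpy(&node->err, err, sizeof(*err));
		llist_add(&node->llnode, &evt_list);
		return true;
	}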
@@ -75,12 +75,12 @@ static u16 cmci_threshold[MAX_NR_BANKS];
*/
#define CMCI_STORM_THRESHOLD 32749

static int cmci_supported(int *banks)
static bool cmci_supported(int *banks)
{
u64 cap;

if (mca_cfg.cmci_disabled || mca_cfg.ignore_ce)
return 0;
return false;

/*
* Vendor check is not strictly needed, but the initial
@@ -89,10 +89,11 @@ static int cmci_supported(int *banks)
*/
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL &&
boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN)
return 0;
return false;

if (!boot_cpu_has(X86_FEATURE_APIC) || lapic_get_maxlvt() < 6)
return 0;
return false;

rdmsrl(MSR_IA32_MCG_CAP, cap);
*banks = min_t(unsigned, MAX_NR_BANKS, cap & MCG_BANKCNT_MASK);
return !!(cap & MCG_CMCI_P);
@@ -31,8 +31,8 @@ struct mce_evt_llist {

void mce_gen_pool_process(struct work_struct *__unused);
bool mce_gen_pool_empty(void);
int mce_gen_pool_add(struct mce_hw_err *err);
int mce_gen_pool_init(void);
bool mce_gen_pool_add(struct mce_hw_err *err);
bool mce_gen_pool_init(void);
struct llist_node *mce_gen_pool_prepare_records(void);

int mce_severity(struct mce *a, struct pt_regs *regs, char **msg, bool is_excp);