mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-01-04 04:04:19 +00:00
ARM64:
* Enable the per-vcpu dirty-ring tracking mechanism, together with an option to keep the good old dirty log around for pages that are dirtied by something other than a vcpu. * Switch to the relaxed parallel fault handling, using RCU to delay page table reclaim and giving better performance under load. * Relax the MTE ABI, allowing a VMM to use the MAP_SHARED mapping option, which multi-process VMMs such as crosvm rely on (see merge commit382b5b87a9
: "Fix a number of issues with MTE, such as races on the tags being initialised vs the PG_mte_tagged flag as well as the lack of support for VM_SHARED when KVM is involved. Patches from Catalin Marinas and Peter Collingbourne"). * Merge the pKVM shadow vcpu state tracking that allows the hypervisor to have its own view of a vcpu, keeping that state private. * Add support for the PMUv3p5 architecture revision, bringing support for 64bit counters on systems that support it, and fix the no-quite-compliant CHAIN-ed counter support for the machines that actually exist out there. * Fix a handful of minor issues around 52bit VA/PA support (64kB pages only) as a prefix of the oncoming support for 4kB and 16kB pages. * Pick a small set of documentation and spelling fixes, because no good merge window would be complete without those. s390: * Second batch of the lazy destroy patches * First batch of KVM changes for kernel virtual != physical address support * Removal of a unused function x86: * Allow compiling out SMM support * Cleanup and documentation of SMM state save area format * Preserve interrupt shadow in SMM state save area * Respond to generic signals during slow page faults * Fixes and optimizations for the non-executable huge page errata fix. * Reprogram all performance counters on PMU filter change * Cleanups to Hyper-V emulation and tests * Process Hyper-V TLB flushes from a nested guest (i.e. from a L2 guest running on top of a L1 Hyper-V hypervisor) * Advertise several new Intel features * x86 Xen-for-KVM: ** Allow the Xen runstate information to cross a page boundary ** Allow XEN_RUNSTATE_UPDATE flag behaviour to be configured ** Add support for 32-bit guests in SCHEDOP_poll * Notable x86 fixes and cleanups: ** One-off fixes for various emulation flows (SGX, VMXON, NRIPS=0). ** Reinstate IBPB on emulated VM-Exit that was incorrectly dropped a few years back when eliminating unnecessary barriers when switching between vmcs01 and vmcs02. ** Clean up vmread_error_trampoline() to make it more obvious that params must be passed on the stack, even for x86-64. ** Let userspace set all supported bits in MSR_IA32_FEAT_CTL irrespective of the current guest CPUID. ** Fudge around a race with TSC refinement that results in KVM incorrectly thinking a guest needs TSC scaling when running on a CPU with a constant TSC, but no hardware-enumerated TSC frequency. ** Advertise (on AMD) that the SMM_CTL MSR is not supported ** Remove unnecessary exports Generic: * Support for responding to signals during page faults; introduces new FOLL_INTERRUPTIBLE flag that was reviewed by mm folks Selftests: * Fix an inverted check in the access tracking perf test, and restore support for asserting that there aren't too many idle pages when running on bare metal. * Fix build errors that occur in certain setups (unsure exactly what is unique about the problematic setup) due to glibc overriding static_assert() to a variant that requires a custom message. * Introduce actual atomics for clear/set_bit() in selftests * Add support for pinning vCPUs in dirty_log_perf_test. * Rename the so called "perf_util" framework to "memstress". * Add a lightweight psuedo RNG for guest use, and use it to randomize the access pattern and write vs. read percentage in the memstress tests. * Add a common ucall implementation; code dedup and pre-work for running SEV (and beyond) guests in selftests. * Provide a common constructor and arch hook, which will eventually be used by x86 to automatically select the right hypercall (AMD vs. Intel). * A bunch of added/enabled/fixed selftests for ARM64, covering memslots, breakpoints, stage-2 faults and access tracking. * x86-specific selftest changes: ** Clean up x86's page table management. ** Clean up and enhance the "smaller maxphyaddr" test, and add a related test to cover generic emulation failure. ** Clean up the nEPT support checks. ** Add X86_PROPERTY_* framework to retrieve multi-bit CPUID values. ** Fix an ordering issue in the AMX test introduced by recent conversions to use kvm_cpu_has(), and harden the code to guard against similar bugs in the future. Anything that tiggers caching of KVM's supported CPUID, kvm_cpu_has() in this case, effectively hides opt-in XSAVE features if the caching occurs before the test opts in via prctl(). Documentation: * Remove deleted ioctls from documentation * Clean up the docs for the x86 MSR filter. * Various fixes -----BEGIN PGP SIGNATURE----- iQFIBAABCAAyFiEE8TM4V0tmI4mGbHaCv/vSX3jHroMFAmOaFrcUHHBib256aW5p QHJlZGhhdC5jb20ACgkQv/vSX3jHroPemQgAq49excg2Cc+EsHnZw3vu/QWdA0Rt KhL3OgKxuHNjCbD2O9n2t5di7eJOTQ7F7T0eDm3xPTr4FS8LQ2327/mQePU/H2CF mWOpq9RBWLzFsSTeVA2Mz9TUTkYSnDHYuRsBvHyw/n9cL76BWVzjImldFtjYjjex yAwl8c5itKH6bc7KO+5ydswbvBzODkeYKUSBNdbn6m0JGQST7XppNwIAJvpiHsii Qgpk0e4Xx9q4PXG/r5DedI6BlufBsLhv0aE9SHPzyKH3JbbUFhJYI8ZD5OhBQuYW MwxK2KlM5Jm5ud2NZDDlsMmmvd1lnYCFDyqNozaKEWC1Y5rq1AbMa51fXA== =QAYX -----END PGP SIGNATURE----- Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm Pull kvm updates from Paolo Bonzini: "ARM64: - Enable the per-vcpu dirty-ring tracking mechanism, together with an option to keep the good old dirty log around for pages that are dirtied by something other than a vcpu. - Switch to the relaxed parallel fault handling, using RCU to delay page table reclaim and giving better performance under load. - Relax the MTE ABI, allowing a VMM to use the MAP_SHARED mapping option, which multi-process VMMs such as crosvm rely on (see merge commit382b5b87a9
: "Fix a number of issues with MTE, such as races on the tags being initialised vs the PG_mte_tagged flag as well as the lack of support for VM_SHARED when KVM is involved. Patches from Catalin Marinas and Peter Collingbourne"). - Merge the pKVM shadow vcpu state tracking that allows the hypervisor to have its own view of a vcpu, keeping that state private. - Add support for the PMUv3p5 architecture revision, bringing support for 64bit counters on systems that support it, and fix the no-quite-compliant CHAIN-ed counter support for the machines that actually exist out there. - Fix a handful of minor issues around 52bit VA/PA support (64kB pages only) as a prefix of the oncoming support for 4kB and 16kB pages. - Pick a small set of documentation and spelling fixes, because no good merge window would be complete without those. s390: - Second batch of the lazy destroy patches - First batch of KVM changes for kernel virtual != physical address support - Removal of a unused function x86: - Allow compiling out SMM support - Cleanup and documentation of SMM state save area format - Preserve interrupt shadow in SMM state save area - Respond to generic signals during slow page faults - Fixes and optimizations for the non-executable huge page errata fix. - Reprogram all performance counters on PMU filter change - Cleanups to Hyper-V emulation and tests - Process Hyper-V TLB flushes from a nested guest (i.e. from a L2 guest running on top of a L1 Hyper-V hypervisor) - Advertise several new Intel features - x86 Xen-for-KVM: - Allow the Xen runstate information to cross a page boundary - Allow XEN_RUNSTATE_UPDATE flag behaviour to be configured - Add support for 32-bit guests in SCHEDOP_poll - Notable x86 fixes and cleanups: - One-off fixes for various emulation flows (SGX, VMXON, NRIPS=0). - Reinstate IBPB on emulated VM-Exit that was incorrectly dropped a few years back when eliminating unnecessary barriers when switching between vmcs01 and vmcs02. - Clean up vmread_error_trampoline() to make it more obvious that params must be passed on the stack, even for x86-64. - Let userspace set all supported bits in MSR_IA32_FEAT_CTL irrespective of the current guest CPUID. - Fudge around a race with TSC refinement that results in KVM incorrectly thinking a guest needs TSC scaling when running on a CPU with a constant TSC, but no hardware-enumerated TSC frequency. - Advertise (on AMD) that the SMM_CTL MSR is not supported - Remove unnecessary exports Generic: - Support for responding to signals during page faults; introduces new FOLL_INTERRUPTIBLE flag that was reviewed by mm folks Selftests: - Fix an inverted check in the access tracking perf test, and restore support for asserting that there aren't too many idle pages when running on bare metal. - Fix build errors that occur in certain setups (unsure exactly what is unique about the problematic setup) due to glibc overriding static_assert() to a variant that requires a custom message. - Introduce actual atomics for clear/set_bit() in selftests - Add support for pinning vCPUs in dirty_log_perf_test. - Rename the so called "perf_util" framework to "memstress". - Add a lightweight psuedo RNG for guest use, and use it to randomize the access pattern and write vs. read percentage in the memstress tests. - Add a common ucall implementation; code dedup and pre-work for running SEV (and beyond) guests in selftests. - Provide a common constructor and arch hook, which will eventually be used by x86 to automatically select the right hypercall (AMD vs. Intel). - A bunch of added/enabled/fixed selftests for ARM64, covering memslots, breakpoints, stage-2 faults and access tracking. - x86-specific selftest changes: - Clean up x86's page table management. - Clean up and enhance the "smaller maxphyaddr" test, and add a related test to cover generic emulation failure. - Clean up the nEPT support checks. - Add X86_PROPERTY_* framework to retrieve multi-bit CPUID values. - Fix an ordering issue in the AMX test introduced by recent conversions to use kvm_cpu_has(), and harden the code to guard against similar bugs in the future. Anything that tiggers caching of KVM's supported CPUID, kvm_cpu_has() in this case, effectively hides opt-in XSAVE features if the caching occurs before the test opts in via prctl(). Documentation: - Remove deleted ioctls from documentation - Clean up the docs for the x86 MSR filter. - Various fixes" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (361 commits) KVM: x86: Add proper ReST tables for userspace MSR exits/flags KVM: selftests: Allocate ucall pool from MEM_REGION_DATA KVM: arm64: selftests: Align VA space allocator with TTBR0 KVM: arm64: Fix benign bug with incorrect use of VA_BITS KVM: arm64: PMU: Fix period computation for 64bit counters with 32bit overflow KVM: x86: Advertise that the SMM_CTL MSR is not supported KVM: x86: remove unnecessary exports KVM: selftests: Fix spelling mistake "probabalistic" -> "probabilistic" tools: KVM: selftests: Convert clear/set_bit() to actual atomics tools: Drop "atomic_" prefix from atomic test_and_set_bit() tools: Drop conflicting non-atomic test_and_{clear,set}_bit() helpers KVM: selftests: Use non-atomic clear/set bit helpers in KVM tests perf tools: Use dedicated non-atomic clear/set bit helpers tools: Take @bit as an "unsigned long" in {clear,set}_bit() helpers KVM: arm64: selftests: Enable single-step without a "full" ucall() KVM: x86: fix APICv/x2AVIC disabled when vm reboot by itself KVM: Remove stale comment about KVM_REQ_UNHALT KVM: Add missing arch for KVM_CREATE_DEVICE and KVM_{SET,GET}_DEVICE_ATTR KVM: Reference to kvm_userspace_memory_region in doc and comments KVM: Delete all references to removed KVM_SET_MEMORY_ALIAS ioctl ...
This commit is contained in:
commit
8fa590bf34
@ -272,18 +272,6 @@ the VCPU file descriptor can be mmap-ed, including:
|
||||
KVM_CAP_DIRTY_LOG_RING, see section 8.3.
|
||||
|
||||
|
||||
4.6 KVM_SET_MEMORY_REGION
|
||||
-------------------------
|
||||
|
||||
:Capability: basic
|
||||
:Architectures: all
|
||||
:Type: vm ioctl
|
||||
:Parameters: struct kvm_memory_region (in)
|
||||
:Returns: 0 on success, -1 on error
|
||||
|
||||
This ioctl is obsolete and has been removed.
|
||||
|
||||
|
||||
4.7 KVM_CREATE_VCPU
|
||||
-------------------
|
||||
|
||||
@ -368,17 +356,6 @@ see the description of the capability.
|
||||
Note that the Xen shared info page, if configured, shall always be assumed
|
||||
to be dirty. KVM will not explicitly mark it such.
|
||||
|
||||
4.9 KVM_SET_MEMORY_ALIAS
|
||||
------------------------
|
||||
|
||||
:Capability: basic
|
||||
:Architectures: x86
|
||||
:Type: vm ioctl
|
||||
:Parameters: struct kvm_memory_alias (in)
|
||||
:Returns: 0 (success), -1 (error)
|
||||
|
||||
This ioctl is obsolete and has been removed.
|
||||
|
||||
|
||||
4.10 KVM_RUN
|
||||
------------
|
||||
@ -1332,7 +1309,7 @@ yet and must be cleared on entry.
|
||||
__u64 userspace_addr; /* start of the userspace allocated memory */
|
||||
};
|
||||
|
||||
/* for kvm_memory_region::flags */
|
||||
/* for kvm_userspace_memory_region::flags */
|
||||
#define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0)
|
||||
#define KVM_MEM_READONLY (1UL << 1)
|
||||
|
||||
@ -1377,10 +1354,6 @@ the memory region are automatically reflected into the guest. For example, an
|
||||
mmap() that affects the region will be made visible immediately. Another
|
||||
example is madvise(MADV_DROP).
|
||||
|
||||
It is recommended to use this API instead of the KVM_SET_MEMORY_REGION ioctl.
|
||||
The KVM_SET_MEMORY_REGION does not allow fine grained control over memory
|
||||
allocation and is deprecated.
|
||||
|
||||
|
||||
4.36 KVM_SET_TSS_ADDR
|
||||
---------------------
|
||||
@ -3293,6 +3266,7 @@ valid entries found.
|
||||
----------------------
|
||||
|
||||
:Capability: KVM_CAP_DEVICE_CTRL
|
||||
:Architectures: all
|
||||
:Type: vm ioctl
|
||||
:Parameters: struct kvm_create_device (in/out)
|
||||
:Returns: 0 on success, -1 on error
|
||||
@ -3333,6 +3307,7 @@ number.
|
||||
:Capability: KVM_CAP_DEVICE_CTRL, KVM_CAP_VM_ATTRIBUTES for vm device,
|
||||
KVM_CAP_VCPU_ATTRIBUTES for vcpu device
|
||||
KVM_CAP_SYS_ATTRIBUTES for system (/dev/kvm) device (no set)
|
||||
:Architectures: x86, arm64, s390
|
||||
:Type: device ioctl, vm ioctl, vcpu ioctl
|
||||
:Parameters: struct kvm_device_attr
|
||||
:Returns: 0 on success, -1 on error
|
||||
@ -4104,80 +4079,71 @@ flags values for ``struct kvm_msr_filter_range``:
|
||||
``KVM_MSR_FILTER_READ``
|
||||
|
||||
Filter read accesses to MSRs using the given bitmap. A 0 in the bitmap
|
||||
indicates that a read should immediately fail, while a 1 indicates that
|
||||
a read for a particular MSR should be handled regardless of the default
|
||||
indicates that read accesses should be denied, while a 1 indicates that
|
||||
a read for a particular MSR should be allowed regardless of the default
|
||||
filter action.
|
||||
|
||||
``KVM_MSR_FILTER_WRITE``
|
||||
|
||||
Filter write accesses to MSRs using the given bitmap. A 0 in the bitmap
|
||||
indicates that a write should immediately fail, while a 1 indicates that
|
||||
a write for a particular MSR should be handled regardless of the default
|
||||
indicates that write accesses should be denied, while a 1 indicates that
|
||||
a write for a particular MSR should be allowed regardless of the default
|
||||
filter action.
|
||||
|
||||
``KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE``
|
||||
|
||||
Filter both read and write accesses to MSRs using the given bitmap. A 0
|
||||
in the bitmap indicates that both reads and writes should immediately fail,
|
||||
while a 1 indicates that reads and writes for a particular MSR are not
|
||||
filtered by this range.
|
||||
|
||||
flags values for ``struct kvm_msr_filter``:
|
||||
|
||||
``KVM_MSR_FILTER_DEFAULT_ALLOW``
|
||||
|
||||
If no filter range matches an MSR index that is getting accessed, KVM will
|
||||
fall back to allowing access to the MSR.
|
||||
allow accesses to all MSRs by default.
|
||||
|
||||
``KVM_MSR_FILTER_DEFAULT_DENY``
|
||||
|
||||
If no filter range matches an MSR index that is getting accessed, KVM will
|
||||
fall back to rejecting access to the MSR. In this mode, all MSRs that should
|
||||
be processed by KVM need to explicitly be marked as allowed in the bitmaps.
|
||||
deny accesses to all MSRs by default.
|
||||
|
||||
This ioctl allows user space to define up to 16 bitmaps of MSR ranges to
|
||||
specify whether a certain MSR access should be explicitly filtered for or not.
|
||||
This ioctl allows userspace to define up to 16 bitmaps of MSR ranges to deny
|
||||
guest MSR accesses that would normally be allowed by KVM. If an MSR is not
|
||||
covered by a specific range, the "default" filtering behavior applies. Each
|
||||
bitmap range covers MSRs from [base .. base+nmsrs).
|
||||
|
||||
If this ioctl has never been invoked, MSR accesses are not guarded and the
|
||||
default KVM in-kernel emulation behavior is fully preserved.
|
||||
If an MSR access is denied by userspace, the resulting KVM behavior depends on
|
||||
whether or not KVM_CAP_X86_USER_SPACE_MSR's KVM_MSR_EXIT_REASON_FILTER is
|
||||
enabled. If KVM_MSR_EXIT_REASON_FILTER is enabled, KVM will exit to userspace
|
||||
on denied accesses, i.e. userspace effectively intercepts the MSR access. If
|
||||
KVM_MSR_EXIT_REASON_FILTER is not enabled, KVM will inject a #GP into the guest
|
||||
on denied accesses.
|
||||
|
||||
If an MSR access is allowed by userspace, KVM will emulate and/or virtualize
|
||||
the access in accordance with the vCPU model. Note, KVM may still ultimately
|
||||
inject a #GP if an access is allowed by userspace, e.g. if KVM doesn't support
|
||||
the MSR, or to follow architectural behavior for the MSR.
|
||||
|
||||
By default, KVM operates in KVM_MSR_FILTER_DEFAULT_ALLOW mode with no MSR range
|
||||
filters.
|
||||
|
||||
Calling this ioctl with an empty set of ranges (all nmsrs == 0) disables MSR
|
||||
filtering. In that mode, ``KVM_MSR_FILTER_DEFAULT_DENY`` is invalid and causes
|
||||
an error.
|
||||
|
||||
As soon as the filtering is in place, every MSR access is processed through
|
||||
the filtering except for accesses to the x2APIC MSRs (from 0x800 to 0x8ff);
|
||||
x2APIC MSRs are always allowed, independent of the ``default_allow`` setting,
|
||||
and their behavior depends on the ``X2APIC_ENABLE`` bit of the APIC base
|
||||
register.
|
||||
|
||||
.. warning::
|
||||
MSR accesses coming from nested vmentry/vmexit are not filtered.
|
||||
MSR accesses as part of nested VM-Enter/VM-Exit are not filtered.
|
||||
This includes both writes to individual VMCS fields and reads/writes
|
||||
through the MSR lists pointed to by the VMCS.
|
||||
|
||||
If a bit is within one of the defined ranges, read and write accesses are
|
||||
guarded by the bitmap's value for the MSR index if the kind of access
|
||||
is included in the ``struct kvm_msr_filter_range`` flags. If no range
|
||||
cover this particular access, the behavior is determined by the flags
|
||||
field in the kvm_msr_filter struct: ``KVM_MSR_FILTER_DEFAULT_ALLOW``
|
||||
and ``KVM_MSR_FILTER_DEFAULT_DENY``.
|
||||
|
||||
Each bitmap range specifies a range of MSRs to potentially allow access on.
|
||||
The range goes from MSR index [base .. base+nmsrs]. The flags field
|
||||
indicates whether reads, writes or both reads and writes are filtered
|
||||
by setting a 1 bit in the bitmap for the corresponding MSR index.
|
||||
|
||||
If an MSR access is not permitted through the filtering, it generates a
|
||||
#GP inside the guest. When combined with KVM_CAP_X86_USER_SPACE_MSR, that
|
||||
allows user space to deflect and potentially handle various MSR accesses
|
||||
into user space.
|
||||
x2APIC MSR accesses cannot be filtered (KVM silently ignores filters that
|
||||
cover any x2APIC MSRs).
|
||||
|
||||
Note, invoking this ioctl while a vCPU is running is inherently racy. However,
|
||||
KVM does guarantee that vCPUs will see either the previous filter or the new
|
||||
filter, e.g. MSRs with identical settings in both the old and new filter will
|
||||
have deterministic behavior.
|
||||
|
||||
Similarly, if userspace wishes to intercept on denied accesses,
|
||||
KVM_MSR_EXIT_REASON_FILTER must be enabled before activating any filters, and
|
||||
left enabled until after all filters are deactivated. Failure to do so may
|
||||
result in KVM injecting a #GP instead of exiting to userspace.
|
||||
|
||||
4.98 KVM_CREATE_SPAPR_TCE_64
|
||||
----------------------------
|
||||
|
||||
@ -5163,10 +5129,13 @@ KVM_PV_ENABLE
|
||||
===== =============================
|
||||
|
||||
KVM_PV_DISABLE
|
||||
Deregister the VM from the Ultravisor and reclaim the memory that
|
||||
had been donated to the Ultravisor, making it usable by the kernel
|
||||
again. All registered VCPUs are converted back to non-protected
|
||||
ones.
|
||||
Deregister the VM from the Ultravisor and reclaim the memory that had
|
||||
been donated to the Ultravisor, making it usable by the kernel again.
|
||||
All registered VCPUs are converted back to non-protected ones. If a
|
||||
previous protected VM had been prepared for asynchonous teardown with
|
||||
KVM_PV_ASYNC_CLEANUP_PREPARE and not subsequently torn down with
|
||||
KVM_PV_ASYNC_CLEANUP_PERFORM, it will be torn down in this call
|
||||
together with the current protected VM.
|
||||
|
||||
KVM_PV_VM_SET_SEC_PARMS
|
||||
Pass the image header from VM memory to the Ultravisor in
|
||||
@ -5289,6 +5258,36 @@ KVM_PV_DUMP
|
||||
authentication tag all of which are needed to decrypt the dump at a
|
||||
later time.
|
||||
|
||||
KVM_PV_ASYNC_CLEANUP_PREPARE
|
||||
:Capability: KVM_CAP_S390_PROTECTED_ASYNC_DISABLE
|
||||
|
||||
Prepare the current protected VM for asynchronous teardown. Most
|
||||
resources used by the current protected VM will be set aside for a
|
||||
subsequent asynchronous teardown. The current protected VM will then
|
||||
resume execution immediately as non-protected. There can be at most
|
||||
one protected VM prepared for asynchronous teardown at any time. If
|
||||
a protected VM had already been prepared for teardown without
|
||||
subsequently calling KVM_PV_ASYNC_CLEANUP_PERFORM, this call will
|
||||
fail. In that case, the userspace process should issue a normal
|
||||
KVM_PV_DISABLE. The resources set aside with this call will need to
|
||||
be cleaned up with a subsequent call to KVM_PV_ASYNC_CLEANUP_PERFORM
|
||||
or KVM_PV_DISABLE, otherwise they will be cleaned up when KVM
|
||||
terminates. KVM_PV_ASYNC_CLEANUP_PREPARE can be called again as soon
|
||||
as cleanup starts, i.e. before KVM_PV_ASYNC_CLEANUP_PERFORM finishes.
|
||||
|
||||
KVM_PV_ASYNC_CLEANUP_PERFORM
|
||||
:Capability: KVM_CAP_S390_PROTECTED_ASYNC_DISABLE
|
||||
|
||||
Tear down the protected VM previously prepared for teardown with
|
||||
KVM_PV_ASYNC_CLEANUP_PREPARE. The resources that had been set aside
|
||||
will be freed during the execution of this command. This PV command
|
||||
should ideally be issued by userspace from a separate thread. If a
|
||||
fatal signal is received (or the process terminates naturally), the
|
||||
command will terminate immediately without completing, and the normal
|
||||
KVM shutdown procedure will take care of cleaning up all remaining
|
||||
protected VMs, including the ones whose teardown was interrupted by
|
||||
process termination.
|
||||
|
||||
4.126 KVM_XEN_HVM_SET_ATTR
|
||||
--------------------------
|
||||
|
||||
@ -5306,6 +5305,7 @@ KVM_PV_DUMP
|
||||
union {
|
||||
__u8 long_mode;
|
||||
__u8 vector;
|
||||
__u8 runstate_update_flag;
|
||||
struct {
|
||||
__u64 gfn;
|
||||
} shared_info;
|
||||
@ -5383,6 +5383,14 @@ KVM_XEN_ATTR_TYPE_XEN_VERSION
|
||||
event channel delivery, so responding within the kernel without
|
||||
exiting to userspace is beneficial.
|
||||
|
||||
KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG
|
||||
This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates
|
||||
support for KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG. It enables the
|
||||
XEN_RUNSTATE_UPDATE flag which allows guest vCPUs to safely read
|
||||
other vCPUs' vcpu_runstate_info. Xen guests enable this feature via
|
||||
the VM_ASST_TYPE_runstate_update_flag of the HYPERVISOR_vm_assist
|
||||
hypercall.
|
||||
|
||||
4.127 KVM_XEN_HVM_GET_ATTR
|
||||
--------------------------
|
||||
|
||||
@ -6440,31 +6448,35 @@ if it decides to decode and emulate the instruction.
|
||||
|
||||
Used on x86 systems. When the VM capability KVM_CAP_X86_USER_SPACE_MSR is
|
||||
enabled, MSR accesses to registers that would invoke a #GP by KVM kernel code
|
||||
will instead trigger a KVM_EXIT_X86_RDMSR exit for reads and KVM_EXIT_X86_WRMSR
|
||||
may instead trigger a KVM_EXIT_X86_RDMSR exit for reads and KVM_EXIT_X86_WRMSR
|
||||
exit for writes.
|
||||
|
||||
The "reason" field specifies why the MSR trap occurred. User space will only
|
||||
receive MSR exit traps when a particular reason was requested during through
|
||||
The "reason" field specifies why the MSR interception occurred. Userspace will
|
||||
only receive MSR exits when a particular reason was requested during through
|
||||
ENABLE_CAP. Currently valid exit reasons are:
|
||||
|
||||
KVM_MSR_EXIT_REASON_UNKNOWN - access to MSR that is unknown to KVM
|
||||
KVM_MSR_EXIT_REASON_INVAL - access to invalid MSRs or reserved bits
|
||||
KVM_MSR_EXIT_REASON_FILTER - access blocked by KVM_X86_SET_MSR_FILTER
|
||||
============================ ========================================
|
||||
KVM_MSR_EXIT_REASON_UNKNOWN access to MSR that is unknown to KVM
|
||||
KVM_MSR_EXIT_REASON_INVAL access to invalid MSRs or reserved bits
|
||||
KVM_MSR_EXIT_REASON_FILTER access blocked by KVM_X86_SET_MSR_FILTER
|
||||
============================ ========================================
|
||||
|
||||
For KVM_EXIT_X86_RDMSR, the "index" field tells user space which MSR the guest
|
||||
wants to read. To respond to this request with a successful read, user space
|
||||
For KVM_EXIT_X86_RDMSR, the "index" field tells userspace which MSR the guest
|
||||
wants to read. To respond to this request with a successful read, userspace
|
||||
writes the respective data into the "data" field and must continue guest
|
||||
execution to ensure the read data is transferred into guest register state.
|
||||
|
||||
If the RDMSR request was unsuccessful, user space indicates that with a "1" in
|
||||
If the RDMSR request was unsuccessful, userspace indicates that with a "1" in
|
||||
the "error" field. This will inject a #GP into the guest when the VCPU is
|
||||
executed again.
|
||||
|
||||
For KVM_EXIT_X86_WRMSR, the "index" field tells user space which MSR the guest
|
||||
wants to write. Once finished processing the event, user space must continue
|
||||
vCPU execution. If the MSR write was unsuccessful, user space also sets the
|
||||
For KVM_EXIT_X86_WRMSR, the "index" field tells userspace which MSR the guest
|
||||
wants to write. Once finished processing the event, userspace must continue
|
||||
vCPU execution. If the MSR write was unsuccessful, userspace also sets the
|
||||
"error" field to "1".
|
||||
|
||||
See KVM_X86_SET_MSR_FILTER for details on the interaction with MSR filtering.
|
||||
|
||||
::
|
||||
|
||||
|
||||
@ -7229,19 +7241,29 @@ polling.
|
||||
:Parameters: args[0] contains the mask of KVM_MSR_EXIT_REASON_* events to report
|
||||
:Returns: 0 on success; -1 on error
|
||||
|
||||
This capability enables trapping of #GP invoking RDMSR and WRMSR instructions
|
||||
into user space.
|
||||
This capability allows userspace to intercept RDMSR and WRMSR instructions if
|
||||
access to an MSR is denied. By default, KVM injects #GP on denied accesses.
|
||||
|
||||
When a guest requests to read or write an MSR, KVM may not implement all MSRs
|
||||
that are relevant to a respective system. It also does not differentiate by
|
||||
CPU type.
|
||||
|
||||
To allow more fine grained control over MSR handling, user space may enable
|
||||
To allow more fine grained control over MSR handling, userspace may enable
|
||||
this capability. With it enabled, MSR accesses that match the mask specified in
|
||||
args[0] and trigger a #GP event inside the guest by KVM will instead trigger
|
||||
KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR exit notifications which user space
|
||||
can then handle to implement model specific MSR handling and/or user notifications
|
||||
to inform a user that an MSR was not handled.
|
||||
args[0] and would trigger a #GP inside the guest will instead trigger
|
||||
KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR exit notifications. Userspace
|
||||
can then implement model specific MSR handling and/or user notifications
|
||||
to inform a user that an MSR was not emulated/virtualized by KVM.
|
||||
|
||||
The valid mask flags are:
|
||||
|
||||
============================ ===============================================
|
||||
KVM_MSR_EXIT_REASON_UNKNOWN intercept accesses to unknown (to KVM) MSRs
|
||||
KVM_MSR_EXIT_REASON_INVAL intercept accesses that are architecturally
|
||||
invalid according to the vCPU model and/or mode
|
||||
KVM_MSR_EXIT_REASON_FILTER intercept accesses that are denied by userspace
|
||||
via KVM_X86_SET_MSR_FILTER
|
||||
============================ ===============================================
|
||||
|
||||
7.22 KVM_CAP_X86_BUS_LOCK_EXIT
|
||||
-------------------------------
|
||||
@ -7384,8 +7406,9 @@ hibernation of the host; however the VMM needs to manually save/restore the
|
||||
tags as appropriate if the VM is migrated.
|
||||
|
||||
When this capability is enabled all memory in memslots must be mapped as
|
||||
not-shareable (no MAP_SHARED), attempts to create a memslot with a
|
||||
MAP_SHARED mmap will result in an -EINVAL return.
|
||||
``MAP_ANONYMOUS`` or with a RAM-based file mapping (``tmpfs``, ``memfd``),
|
||||
attempts to create a memslot with an invalid mmap will result in an
|
||||
-EINVAL return.
|
||||
|
||||
When enabled the VMM may make use of the ``KVM_ARM_MTE_COPY_TAGS`` ioctl to
|
||||
perform a bulk copy of tags to/from the guest.
|
||||
@ -7901,7 +7924,7 @@ KVM_EXIT_X86_WRMSR exit notifications.
|
||||
This capability indicates that KVM supports that accesses to user defined MSRs
|
||||
may be rejected. With this capability exposed, KVM exports new VM ioctl
|
||||
KVM_X86_SET_MSR_FILTER which user space can call to specify bitmaps of MSR
|
||||
ranges that KVM should reject access to.
|
||||
ranges that KVM should deny access to.
|
||||
|
||||
In combination with KVM_CAP_X86_USER_SPACE_MSR, this allows user space to
|
||||
trap and emulate MSRs that are outside of the scope of KVM as well as
|
||||
@ -7920,7 +7943,7 @@ regardless of what has actually been exposed through the CPUID leaf.
|
||||
8.29 KVM_CAP_DIRTY_LOG_RING/KVM_CAP_DIRTY_LOG_RING_ACQ_REL
|
||||
----------------------------------------------------------
|
||||
|
||||
:Architectures: x86
|
||||
:Architectures: x86, arm64
|
||||
:Parameters: args[0] - size of the dirty log ring
|
||||
|
||||
KVM is capable of tracking dirty memory using ring buffers that are
|
||||
@ -8002,13 +8025,6 @@ flushing is done by the KVM_GET_DIRTY_LOG ioctl). To achieve that, one
|
||||
needs to kick the vcpu out of KVM_RUN using a signal. The resulting
|
||||
vmexit ensures that all dirty GFNs are flushed to the dirty rings.
|
||||
|
||||
NOTE: the capability KVM_CAP_DIRTY_LOG_RING and the corresponding
|
||||
ioctl KVM_RESET_DIRTY_RINGS are mutual exclusive to the existing ioctls
|
||||
KVM_GET_DIRTY_LOG and KVM_CLEAR_DIRTY_LOG. After enabling
|
||||
KVM_CAP_DIRTY_LOG_RING with an acceptable dirty ring size, the virtual
|
||||
machine will switch to ring-buffer dirty page tracking and further
|
||||
KVM_GET_DIRTY_LOG or KVM_CLEAR_DIRTY_LOG ioctls will fail.
|
||||
|
||||
NOTE: KVM_CAP_DIRTY_LOG_RING_ACQ_REL is the only capability that
|
||||
should be exposed by weakly ordered architecture, in order to indicate
|
||||
the additional memory ordering requirements imposed on userspace when
|
||||
@ -8017,6 +8033,33 @@ Architecture with TSO-like ordering (such as x86) are allowed to
|
||||
expose both KVM_CAP_DIRTY_LOG_RING and KVM_CAP_DIRTY_LOG_RING_ACQ_REL
|
||||
to userspace.
|
||||
|
||||
After enabling the dirty rings, the userspace needs to detect the
|
||||
capability of KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP to see whether the
|
||||
ring structures can be backed by per-slot bitmaps. With this capability
|
||||
advertised, it means the architecture can dirty guest pages without
|
||||
vcpu/ring context, so that some of the dirty information will still be
|
||||
maintained in the bitmap structure. KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP
|
||||
can't be enabled if the capability of KVM_CAP_DIRTY_LOG_RING_ACQ_REL
|
||||
hasn't been enabled, or any memslot has been existing.
|
||||
|
||||
Note that the bitmap here is only a backup of the ring structure. The
|
||||
use of the ring and bitmap combination is only beneficial if there is
|
||||
only a very small amount of memory that is dirtied out of vcpu/ring
|
||||
context. Otherwise, the stand-alone per-slot bitmap mechanism needs to
|
||||
be considered.
|
||||
|
||||
To collect dirty bits in the backup bitmap, userspace can use the same
|
||||
KVM_GET_DIRTY_LOG ioctl. KVM_CLEAR_DIRTY_LOG isn't needed as long as all
|
||||
the generation of the dirty bits is done in a single pass. Collecting
|
||||
the dirty bitmap should be the very last thing that the VMM does before
|
||||
considering the state as complete. VMM needs to ensure that the dirty
|
||||
state is final and avoid missing dirty pages from another ioctl ordered
|
||||
after the bitmap collection.
|
||||
|
||||
NOTE: One example of using the backup bitmap is saving arm64 vgic/its
|
||||
tables through KVM_DEV_ARM_{VGIC_GRP_CTRL, ITS_SAVE_TABLES} command on
|
||||
KVM device "kvm-arm-vgic-its" when dirty ring is enabled.
|
||||
|
||||
8.30 KVM_CAP_XEN_HVM
|
||||
--------------------
|
||||
|
||||
@ -8025,12 +8068,13 @@ to userspace.
|
||||
This capability indicates the features that Xen supports for hosting Xen
|
||||
PVHVM guests. Valid flags are::
|
||||
|
||||
#define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR (1 << 0)
|
||||
#define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1)
|
||||
#define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2)
|
||||
#define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3)
|
||||
#define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4)
|
||||
#define KVM_XEN_HVM_CONFIG_EVTCHN_SEND (1 << 5)
|
||||
#define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR (1 << 0)
|
||||
#define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1)
|
||||
#define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2)
|
||||
#define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3)
|
||||
#define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4)
|
||||
#define KVM_XEN_HVM_CONFIG_EVTCHN_SEND (1 << 5)
|
||||
#define KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG (1 << 6)
|
||||
|
||||
The KVM_XEN_HVM_CONFIG_HYPERCALL_MSR flag indicates that the KVM_XEN_HVM_CONFIG
|
||||
ioctl is available, for the guest to set its hypercall page.
|
||||
@ -8062,6 +8106,18 @@ KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID/TIMER/UPCALL_VECTOR vCPU attributes.
|
||||
related to event channel delivery, timers, and the XENVER_version
|
||||
interception.
|
||||
|
||||
The KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG flag indicates that KVM supports
|
||||
the KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG attribute in the KVM_XEN_SET_ATTR
|
||||
and KVM_XEN_GET_ATTR ioctls. This controls whether KVM will set the
|
||||
XEN_RUNSTATE_UPDATE flag in guest memory mapped vcpu_runstate_info during
|
||||
updates of the runstate information. Note that versions of KVM which support
|
||||
the RUNSTATE feature above, but not thie RUNSTATE_UPDATE_FLAG feature, will
|
||||
always set the XEN_RUNSTATE_UPDATE flag when updating the guest structure,
|
||||
which is perhaps counterintuitive. When this flag is advertised, KVM will
|
||||
behave more correctly, not using the XEN_RUNSTATE_UPDATE flag until/unless
|
||||
specifically enabled (by the guest making the hypercall, causing the VMM
|
||||
to enable the KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG attribute).
|
||||
|
||||
8.31 KVM_CAP_PPC_MULTITCE
|
||||
-------------------------
|
||||
|
||||
|
@ -23,21 +23,23 @@ the PV_TIME_FEATURES hypercall should be probed using the SMCCC 1.1
|
||||
ARCH_FEATURES mechanism before calling it.
|
||||
|
||||
PV_TIME_FEATURES
|
||||
============= ======== ==========
|
||||
|
||||
============= ======== =================================================
|
||||
Function ID: (uint32) 0xC5000020
|
||||
PV_call_id: (uint32) The function to query for support.
|
||||
Currently only PV_TIME_ST is supported.
|
||||
Return value: (int64) NOT_SUPPORTED (-1) or SUCCESS (0) if the relevant
|
||||
PV-time feature is supported by the hypervisor.
|
||||
============= ======== ==========
|
||||
============= ======== =================================================
|
||||
|
||||
PV_TIME_ST
|
||||
============= ======== ==========
|
||||
|
||||
============= ======== ==============================================
|
||||
Function ID: (uint32) 0xC5000021
|
||||
Return value: (int64) IPA of the stolen time data structure for this
|
||||
VCPU. On failure:
|
||||
NOT_SUPPORTED (-1)
|
||||
============= ======== ==========
|
||||
============= ======== ==============================================
|
||||
|
||||
The IPA returned by PV_TIME_ST should be mapped by the guest as normal memory
|
||||
with inner and outer write back caching attributes, in the inner shareable
|
||||
@ -76,5 +78,5 @@ It is advisable that one or more 64k pages are set aside for the purpose of
|
||||
these structures and not used for other purposes, this enables the guest to map
|
||||
the region using 64k pages and avoids conflicting attributes with other memory.
|
||||
|
||||
For the user space interface see Documentation/virt/kvm/devices/vcpu.rst
|
||||
section "3. GROUP: KVM_ARM_VCPU_PVTIME_CTRL".
|
||||
For the user space interface see
|
||||
:ref:`Documentation/virt/kvm/devices/vcpu.rst <kvm_arm_vcpu_pvtime_ctrl>`.
|
@ -52,7 +52,10 @@ KVM_DEV_ARM_VGIC_GRP_CTRL
|
||||
|
||||
KVM_DEV_ARM_ITS_SAVE_TABLES
|
||||
save the ITS table data into guest RAM, at the location provisioned
|
||||
by the guest in corresponding registers/table entries.
|
||||
by the guest in corresponding registers/table entries. Should userspace
|
||||
require a form of dirty tracking to identify which pages are modified
|
||||
by the saving process, it should use a bitmap even if using another
|
||||
mechanism to track the memory dirtied by the vCPUs.
|
||||
|
||||
The layout of the tables in guest memory defines an ABI. The entries
|
||||
are laid out in little endian format as described in the last paragraph.
|
||||
|
@ -171,6 +171,8 @@ configured values on other VCPUs. Userspace should configure the interrupt
|
||||
numbers on at least one VCPU after creating all VCPUs and before running any
|
||||
VCPUs.
|
||||
|
||||
.. _kvm_arm_vcpu_pvtime_ctrl:
|
||||
|
||||
3. GROUP: KVM_ARM_VCPU_PVTIME_CTRL
|
||||
==================================
|
||||
|
||||
|
10
MAINTAINERS
10
MAINTAINERS
@ -11438,6 +11438,16 @@ F: arch/x86/kvm/svm/hyperv.*
|
||||
F: arch/x86/kvm/svm/svm_onhyperv.*
|
||||
F: arch/x86/kvm/vmx/evmcs.*
|
||||
|
||||
KVM X86 Xen (KVM/Xen)
|
||||
M: David Woodhouse <dwmw2@infradead.org>
|
||||
M: Paul Durrant <paul@xen.org>
|
||||
M: Sean Christopherson <seanjc@google.com>
|
||||
M: Paolo Bonzini <pbonzini@redhat.com>
|
||||
L: kvm@vger.kernel.org
|
||||
S: Supported
|
||||
T: git git://git.kernel.org/pub/scm/virt/kvm/kvm.git
|
||||
F: arch/x86/kvm/xen.*
|
||||
|
||||
KERNFS
|
||||
M: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
|
||||
M: Tejun Heo <tj@kernel.org>
|
||||
|
@ -1988,6 +1988,7 @@ config ARM64_MTE
|
||||
depends on ARM64_PAN
|
||||
select ARCH_HAS_SUBPAGE_FAULTS
|
||||
select ARCH_USES_HIGH_VMA_FLAGS
|
||||
select ARCH_USES_PG_ARCH_X
|
||||
help
|
||||
Memory Tagging (part of the ARMv8.5 Extensions) provides
|
||||
architectural support for run-time, always-on detection of
|
||||
|
@ -135,7 +135,7 @@
|
||||
* 40 bits wide (T0SZ = 24). Systems with a PARange smaller than 40 bits are
|
||||
* not known to exist and will break with this configuration.
|
||||
*
|
||||
* The VTCR_EL2 is configured per VM and is initialised in kvm_arm_setup_stage2().
|
||||
* The VTCR_EL2 is configured per VM and is initialised in kvm_init_stage2_mmu.
|
||||
*
|
||||
* Note that when using 4K pages, we concatenate two first level page tables
|
||||
* together. With 16K pages, we concatenate 16 first level page tables.
|
||||
@ -340,9 +340,13 @@
|
||||
* We have
|
||||
* PAR [PA_Shift - 1 : 12] = PA [PA_Shift - 1 : 12]
|
||||
* HPFAR [PA_Shift - 9 : 4] = FIPA [PA_Shift - 1 : 12]
|
||||
*
|
||||
* Always assume 52 bit PA since at this point, we don't know how many PA bits
|
||||
* the page table has been set up for. This should be safe since unused address
|
||||
* bits in PAR are res0.
|
||||
*/
|
||||
#define PAR_TO_HPFAR(par) \
|
||||
(((par) & GENMASK_ULL(PHYS_MASK_SHIFT - 1, 12)) >> 8)
|
||||
(((par) & GENMASK_ULL(52 - 1, 12)) >> 8)
|
||||
|
||||
#define ECN(x) { ESR_ELx_EC_##x, #x }
|
||||
|
||||
|
@ -76,6 +76,9 @@ enum __kvm_host_smccc_func {
|
||||
__KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs,
|
||||
__KVM_HOST_SMCCC_FUNC___vgic_v3_restore_aprs,
|
||||
__KVM_HOST_SMCCC_FUNC___pkvm_vcpu_init_traps,
|
||||
__KVM_HOST_SMCCC_FUNC___pkvm_init_vm,
|
||||
__KVM_HOST_SMCCC_FUNC___pkvm_init_vcpu,
|
||||
__KVM_HOST_SMCCC_FUNC___pkvm_teardown_vm,
|
||||
};
|
||||
|
||||
#define DECLARE_KVM_VHE_SYM(sym) extern char sym[]
|
||||
@ -106,7 +109,7 @@ enum __kvm_host_smccc_func {
|
||||
#define per_cpu_ptr_nvhe_sym(sym, cpu) \
|
||||
({ \
|
||||
unsigned long base, off; \
|
||||
base = kvm_arm_hyp_percpu_base[cpu]; \
|
||||
base = kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu]; \
|
||||
off = (unsigned long)&CHOOSE_NVHE_SYM(sym) - \
|
||||
(unsigned long)&CHOOSE_NVHE_SYM(__per_cpu_start); \
|
||||
base ? (typeof(CHOOSE_NVHE_SYM(sym))*)(base + off) : NULL; \
|
||||
@ -211,7 +214,7 @@ DECLARE_KVM_HYP_SYM(__kvm_hyp_vector);
|
||||
#define __kvm_hyp_init CHOOSE_NVHE_SYM(__kvm_hyp_init)
|
||||
#define __kvm_hyp_vector CHOOSE_HYP_SYM(__kvm_hyp_vector)
|
||||
|
||||
extern unsigned long kvm_arm_hyp_percpu_base[NR_CPUS];
|
||||
extern unsigned long kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[];
|
||||
DECLARE_KVM_NVHE_SYM(__per_cpu_start);
|
||||
DECLARE_KVM_NVHE_SYM(__per_cpu_end);
|
||||
|
||||
|
@ -73,6 +73,63 @@ u32 __attribute_const__ kvm_target_cpu(void);
|
||||
int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
|
||||
void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu);
|
||||
|
||||
struct kvm_hyp_memcache {
|
||||
phys_addr_t head;
|
||||
unsigned long nr_pages;
|
||||
};
|
||||
|
||||
static inline void push_hyp_memcache(struct kvm_hyp_memcache *mc,
|
||||
phys_addr_t *p,
|
||||
phys_addr_t (*to_pa)(void *virt))
|
||||
{
|
||||
*p = mc->head;
|
||||
mc->head = to_pa(p);
|
||||
mc->nr_pages++;
|
||||
}
|
||||
|
||||
static inline void *pop_hyp_memcache(struct kvm_hyp_memcache *mc,
|
||||
void *(*to_va)(phys_addr_t phys))
|
||||
{
|
||||
phys_addr_t *p = to_va(mc->head);
|
||||
|
||||
if (!mc->nr_pages)
|
||||
return NULL;
|
||||
|
||||
mc->head = *p;
|
||||
mc->nr_pages--;
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
static inline int __topup_hyp_memcache(struct kvm_hyp_memcache *mc,
|
||||
unsigned long min_pages,
|
||||
void *(*alloc_fn)(void *arg),
|
||||
phys_addr_t (*to_pa)(void *virt),
|
||||
void *arg)
|
||||
{
|
||||
while (mc->nr_pages < min_pages) {
|
||||
phys_addr_t *p = alloc_fn(arg);
|
||||
|
||||
if (!p)
|
||||
return -ENOMEM;
|
||||
push_hyp_memcache(mc, p, to_pa);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void __free_hyp_memcache(struct kvm_hyp_memcache *mc,
|
||||
void (*free_fn)(void *virt, void *arg),
|
||||
void *(*to_va)(phys_addr_t phys),
|
||||
void *arg)
|
||||
{
|
||||
while (mc->nr_pages)
|
||||
free_fn(pop_hyp_memcache(mc, to_va), arg);
|
||||
}
|
||||
|
||||
void free_hyp_memcache(struct kvm_hyp_memcache *mc);
|
||||
int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages);
|
||||
|
||||
struct kvm_vmid {
|
||||
atomic64_t id;
|
||||
};
|
||||
@ -115,6 +172,13 @@ struct kvm_smccc_features {
|
||||
unsigned long vendor_hyp_bmap;
|
||||
};
|
||||
|
||||
typedef unsigned int pkvm_handle_t;
|
||||
|
||||
struct kvm_protected_vm {
|
||||
pkvm_handle_t handle;
|
||||
struct kvm_hyp_memcache teardown_mc;
|
||||
};
|
||||
|
||||
struct kvm_arch {
|
||||
struct kvm_s2_mmu mmu;
|
||||
|
||||
@ -163,9 +227,19 @@ struct kvm_arch {
|
||||
|
||||
u8 pfr0_csv2;
|
||||
u8 pfr0_csv3;
|
||||
struct {
|
||||
u8 imp:4;
|
||||
u8 unimp:4;
|
||||
} dfr0_pmuver;
|
||||
|
||||
/* Hypercall features firmware registers' descriptor */
|
||||
struct kvm_smccc_features smccc_feat;
|
||||
|
||||
/*
|
||||
* For an untrusted host VM, 'pkvm.handle' is used to lookup
|
||||
* the associated pKVM instance in the hypervisor.
|
||||
*/
|
||||
struct kvm_protected_vm pkvm;
|
||||
};
|
||||
|
||||
struct kvm_vcpu_fault_info {
|
||||
@ -925,8 +999,6 @@ int kvm_set_ipa_limit(void);
|
||||
#define __KVM_HAVE_ARCH_VM_ALLOC
|
||||
struct kvm *kvm_arch_alloc_vm(void);
|
||||
|
||||
int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type);
|
||||
|
||||
static inline bool kvm_vm_is_protected(struct kvm *kvm)
|
||||
{
|
||||
return false;
|
||||
|
@ -123,4 +123,7 @@ extern u64 kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val);
|
||||
extern u64 kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val);
|
||||
extern u64 kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val);
|
||||
|
||||
extern unsigned long kvm_nvhe_sym(__icache_flags);
|
||||
extern unsigned int kvm_nvhe_sym(kvm_arm_vmid_bits);
|
||||
|
||||
#endif /* __ARM64_KVM_HYP_H__ */
|
||||
|
@ -166,7 +166,7 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
|
||||
void free_hyp_pgds(void);
|
||||
|
||||
void stage2_unmap_vm(struct kvm *kvm);
|
||||
int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu);
|
||||
int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type);
|
||||
void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu);
|
||||
int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
|
||||
phys_addr_t pa, unsigned long size, bool writable);
|
||||
|
@ -42,6 +42,8 @@ typedef u64 kvm_pte_t;
|
||||
#define KVM_PTE_ADDR_MASK GENMASK(47, PAGE_SHIFT)
|
||||
#define KVM_PTE_ADDR_51_48 GENMASK(15, 12)
|
||||
|
||||
#define KVM_PHYS_INVALID (-1ULL)
|
||||
|
||||
static inline bool kvm_pte_valid(kvm_pte_t pte)
|
||||
{
|
||||
return pte & KVM_PTE_VALID;
|
||||
@ -57,6 +59,18 @@ static inline u64 kvm_pte_to_phys(kvm_pte_t pte)
|
||||
return pa;
|
||||
}
|
||||
|
||||
static inline kvm_pte_t kvm_phys_to_pte(u64 pa)
|
||||
{
|
||||
kvm_pte_t pte = pa & KVM_PTE_ADDR_MASK;
|
||||
|
||||
if (PAGE_SHIFT == 16) {
|
||||
pa &= GENMASK(51, 48);
|
||||
pte |= FIELD_PREP(KVM_PTE_ADDR_51_48, pa >> 48);
|
||||
}
|
||||
|
||||
return pte;
|
||||
}
|
||||
|
||||
static inline u64 kvm_granule_shift(u32 level)
|
||||
{
|
||||
/* Assumes KVM_PGTABLE_MAX_LEVELS is 4 */
|
||||
@ -85,6 +99,8 @@ static inline bool kvm_level_supports_block_mapping(u32 level)
|
||||
* allocation is physically contiguous.
|
||||
* @free_pages_exact: Free an exact number of memory pages previously
|
||||
* allocated by zalloc_pages_exact.
|
||||
* @free_removed_table: Free a removed paging structure by unlinking and
|
||||
* dropping references.
|
||||
* @get_page: Increment the refcount on a page.
|
||||
* @put_page: Decrement the refcount on a page. When the
|
||||
* refcount reaches 0 the page is automatically
|
||||
@ -103,6 +119,7 @@ struct kvm_pgtable_mm_ops {
|
||||
void* (*zalloc_page)(void *arg);
|
||||
void* (*zalloc_pages_exact)(size_t size);
|
||||
void (*free_pages_exact)(void *addr, size_t size);
|
||||
void (*free_removed_table)(void *addr, u32 level);
|
||||
void (*get_page)(void *addr);
|
||||
void (*put_page)(void *addr);
|
||||
int (*page_count)(void *addr);
|
||||
@ -161,6 +178,121 @@ enum kvm_pgtable_prot {
|
||||
typedef bool (*kvm_pgtable_force_pte_cb_t)(u64 addr, u64 end,
|
||||
enum kvm_pgtable_prot prot);
|
||||
|
||||
/**
|
||||
* enum kvm_pgtable_walk_flags - Flags to control a depth-first page-table walk.
|
||||
* @KVM_PGTABLE_WALK_LEAF: Visit leaf entries, including invalid
|
||||
* entries.
|
||||
* @KVM_PGTABLE_WALK_TABLE_PRE: Visit table entries before their
|
||||
* children.
|
||||
* @KVM_PGTABLE_WALK_TABLE_POST: Visit table entries after their
|
||||
* children.
|
||||
* @KVM_PGTABLE_WALK_SHARED: Indicates the page-tables may be shared
|
||||
* with other software walkers.
|
||||
*/
|
||||
enum kvm_pgtable_walk_flags {
|
||||
KVM_PGTABLE_WALK_LEAF = BIT(0),
|
||||
KVM_PGTABLE_WALK_TABLE_PRE = BIT(1),
|
||||
KVM_PGTABLE_WALK_TABLE_POST = BIT(2),
|
||||
KVM_PGTABLE_WALK_SHARED = BIT(3),
|
||||
};
|
||||
|
||||
struct kvm_pgtable_visit_ctx {
|
||||
kvm_pte_t *ptep;
|
||||
kvm_pte_t old;
|
||||
void *arg;
|
||||
struct kvm_pgtable_mm_ops *mm_ops;
|
||||
u64 addr;
|
||||
u64 end;
|
||||
u32 level;
|
||||
enum kvm_pgtable_walk_flags flags;
|
||||
};
|
||||
|
||||
typedef int (*kvm_pgtable_visitor_fn_t)(const struct kvm_pgtable_visit_ctx *ctx,
|
||||
enum kvm_pgtable_walk_flags visit);
|
||||
|
||||
static inline bool kvm_pgtable_walk_shared(const struct kvm_pgtable_visit_ctx *ctx)
|
||||
{
|
||||
return ctx->flags & KVM_PGTABLE_WALK_SHARED;
|
||||
}
|
||||
|
||||
/**
|
||||
* struct kvm_pgtable_walker - Hook into a page-table walk.
|
||||
* @cb: Callback function to invoke during the walk.
|
||||
* @arg: Argument passed to the callback function.
|
||||
* @flags: Bitwise-OR of flags to identify the entry types on which to
|
||||
* invoke the callback function.
|
||||
*/
|
||||
struct kvm_pgtable_walker {
|
||||
const kvm_pgtable_visitor_fn_t cb;
|
||||
void * const arg;
|
||||
const enum kvm_pgtable_walk_flags flags;
|
||||
};
|
||||
|
||||
/*
|
||||
* RCU cannot be used in a non-kernel context such as the hyp. As such, page
|
||||
* table walkers used in hyp do not call into RCU and instead use other
|
||||
* synchronization mechanisms (such as a spinlock).
|
||||
*/
|
||||
#if defined(__KVM_NVHE_HYPERVISOR__) || defined(__KVM_VHE_HYPERVISOR__)
|
||||
|
||||
typedef kvm_pte_t *kvm_pteref_t;
|
||||
|
||||
static inline kvm_pte_t *kvm_dereference_pteref(struct kvm_pgtable_walker *walker,
|
||||
kvm_pteref_t pteref)
|
||||
{
|
||||
return pteref;
|
||||
}
|
||||
|
||||
static inline int kvm_pgtable_walk_begin(struct kvm_pgtable_walker *walker)
|
||||
{
|
||||
/*
|
||||
* Due to the lack of RCU (or a similar protection scheme), only
|
||||
* non-shared table walkers are allowed in the hypervisor.
|
||||
*/
|
||||
if (walker->flags & KVM_PGTABLE_WALK_SHARED)
|
||||
return -EPERM;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void kvm_pgtable_walk_end(struct kvm_pgtable_walker *walker) {}
|
||||
|
||||
static inline bool kvm_pgtable_walk_lock_held(void)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
typedef kvm_pte_t __rcu *kvm_pteref_t;
|
||||
|
||||
static inline kvm_pte_t *kvm_dereference_pteref(struct kvm_pgtable_walker *walker,
|
||||
kvm_pteref_t pteref)
|
||||
{
|
||||
return rcu_dereference_check(pteref, !(walker->flags & KVM_PGTABLE_WALK_SHARED));
|
||||
}
|
||||
|
||||
static inline int kvm_pgtable_walk_begin(struct kvm_pgtable_walker *walker)
|
||||
{
|
||||
if (walker->flags & KVM_PGTABLE_WALK_SHARED)
|
||||
rcu_read_lock();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void kvm_pgtable_walk_end(struct kvm_pgtable_walker *walker)
|
||||
{
|
||||
if (walker->flags & KVM_PGTABLE_WALK_SHARED)
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
static inline bool kvm_pgtable_walk_lock_held(void)
|
||||
{
|
||||
return rcu_read_lock_held();
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/**
|
||||
* struct kvm_pgtable - KVM page-table.
|
||||
* @ia_bits: Maximum input address size, in bits.
|
||||
@ -175,7 +307,7 @@ typedef bool (*kvm_pgtable_force_pte_cb_t)(u64 addr, u64 end,
|
||||
struct kvm_pgtable {
|
||||
u32 ia_bits;
|
||||
u32 start_level;
|
||||
kvm_pte_t *pgd;
|
||||
kvm_pteref_t pgd;
|
||||
struct kvm_pgtable_mm_ops *mm_ops;
|
||||
|
||||
/* Stage-2 only */
|
||||
@ -184,39 +316,6 @@ struct kvm_pgtable {
|
||||
kvm_pgtable_force_pte_cb_t force_pte_cb;
|
||||
};
|
||||
|
||||
/**
|
||||
* enum kvm_pgtable_walk_flags - Flags to control a depth-first page-table walk.
|
||||
* @KVM_PGTABLE_WALK_LEAF: Visit leaf entries, including invalid
|
||||
* entries.
|
||||
* @KVM_PGTABLE_WALK_TABLE_PRE: Visit table entries before their
|
||||
* children.
|
||||
* @KVM_PGTABLE_WALK_TABLE_POST: Visit table entries after their
|
||||
* children.
|
||||
*/
|
||||
enum kvm_pgtable_walk_flags {
|
||||
KVM_PGTABLE_WALK_LEAF = BIT(0),
|
||||
KVM_PGTABLE_WALK_TABLE_PRE = BIT(1),
|
||||
KVM_PGTABLE_WALK_TABLE_POST = BIT(2),
|
||||
};
|
||||
|
||||
typedef int (*kvm_pgtable_visitor_fn_t)(u64 addr, u64 end, u32 level,
|
||||
kvm_pte_t *ptep,
|
||||
enum kvm_pgtable_walk_flags flag,
|
||||
void * const arg);
|
||||
|
||||
/**
|
||||
* struct kvm_pgtable_walker - Hook into a page-table walk.
|
||||
* @cb: Callback function to invoke during the walk.
|
||||
* @arg: Argument passed to the callback function.
|
||||
* @flags: Bitwise-OR of flags to identify the entry types on which to
|
||||
* invoke the callback function.
|
||||
*/
|
||||
struct kvm_pgtable_walker {
|
||||
const kvm_pgtable_visitor_fn_t cb;
|
||||
void * const arg;
|
||||
const enum kvm_pgtable_walk_flags flags;
|
||||
};
|
||||
|
||||
/**
|
||||
* kvm_pgtable_hyp_init() - Initialise a hypervisor stage-1 page-table.
|
||||
* @pgt: Uninitialised page-table structure to initialise.
|
||||
@ -296,6 +395,14 @@ u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size);
|
||||
*/
|
||||
u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift);
|
||||
|
||||
/**
|
||||
* kvm_pgtable_stage2_pgd_size() - Helper to compute size of a stage-2 PGD
|
||||
* @vtcr: Content of the VTCR register.
|
||||
*
|
||||
* Return: the size (in bytes) of the stage-2 PGD
|
||||
*/
|
||||
size_t kvm_pgtable_stage2_pgd_size(u64 vtcr);
|
||||
|
||||
/**
|
||||
* __kvm_pgtable_stage2_init() - Initialise a guest stage-2 page-table.
|
||||
* @pgt: Uninitialised page-table structure to initialise.
|
||||
@ -324,6 +431,17 @@ int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
|
||||
*/
|
||||
void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
|
||||
|
||||
/**
|
||||
* kvm_pgtable_stage2_free_removed() - Free a removed stage-2 paging structure.
|
||||
* @mm_ops: Memory management callbacks.
|
||||
* @pgtable: Unlinked stage-2 paging structure to be freed.
|
||||
* @level: Level of the stage-2 paging structure to be freed.
|
||||
*
|
||||
* The page-table is assumed to be unreachable by any hardware walkers prior to
|
||||
* freeing and therefore no TLB invalidation is performed.
|
||||
*/
|
||||
void kvm_pgtable_stage2_free_removed(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level);
|
||||
|
||||
/**
|
||||
* kvm_pgtable_stage2_map() - Install a mapping in a guest stage-2 page-table.
|
||||
* @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*().
|
||||
@ -333,6 +451,7 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
|
||||
* @prot: Permissions and attributes for the mapping.
|
||||
* @mc: Cache of pre-allocated and zeroed memory from which to allocate
|
||||
* page-table pages.
|
||||
* @flags: Flags to control the page-table walk (ex. a shared walk)
|
||||
*
|
||||
* The offset of @addr within a page is ignored, @size is rounded-up to
|
||||
* the next page boundary and @phys is rounded-down to the previous page
|
||||
@ -354,7 +473,7 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
|
||||
*/
|
||||
int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
|
||||
u64 phys, enum kvm_pgtable_prot prot,
|
||||
void *mc);
|
||||
void *mc, enum kvm_pgtable_walk_flags flags);
|
||||
|
||||
/**
|
||||
* kvm_pgtable_stage2_set_owner() - Unmap and annotate pages in the IPA space to
|
||||
|
@ -9,11 +9,49 @@
|
||||
#include <linux/memblock.h>
|
||||
#include <asm/kvm_pgtable.h>
|
||||
|
||||
/* Maximum number of VMs that can co-exist under pKVM. */
|
||||
#define KVM_MAX_PVMS 255
|
||||
|
||||
#define HYP_MEMBLOCK_REGIONS 128
|
||||
|
||||
int pkvm_init_host_vm(struct kvm *kvm);
|
||||
int pkvm_create_hyp_vm(struct kvm *kvm);
|
||||
void pkvm_destroy_hyp_vm(struct kvm *kvm);
|
||||
|
||||
extern struct memblock_region kvm_nvhe_sym(hyp_memory)[];
|
||||
extern unsigned int kvm_nvhe_sym(hyp_memblock_nr);
|
||||
|
||||
static inline unsigned long
|
||||
hyp_vmemmap_memblock_size(struct memblock_region *reg, size_t vmemmap_entry_size)
|
||||
{
|
||||
unsigned long nr_pages = reg->size >> PAGE_SHIFT;
|
||||
unsigned long start, end;
|
||||
|
||||
start = (reg->base >> PAGE_SHIFT) * vmemmap_entry_size;
|
||||
end = start + nr_pages * vmemmap_entry_size;
|
||||
start = ALIGN_DOWN(start, PAGE_SIZE);
|
||||
end = ALIGN(end, PAGE_SIZE);
|
||||
|
||||
return end - start;
|
||||
}
|
||||
|
||||
static inline unsigned long hyp_vmemmap_pages(size_t vmemmap_entry_size)
|
||||
{
|
||||
unsigned long res = 0, i;
|
||||
|
||||
for (i = 0; i < kvm_nvhe_sym(hyp_memblock_nr); i++) {
|
||||
res += hyp_vmemmap_memblock_size(&kvm_nvhe_sym(hyp_memory)[i],
|
||||
vmemmap_entry_size);
|
||||
}
|
||||
|
||||
return res >> PAGE_SHIFT;
|
||||
}
|
||||
|
||||
static inline unsigned long hyp_vm_table_pages(void)
|
||||
{
|
||||
return PAGE_ALIGN(KVM_MAX_PVMS * sizeof(void *)) >> PAGE_SHIFT;
|
||||
}
|
||||
|
||||
static inline unsigned long __hyp_pgtable_max_pages(unsigned long nr_pages)
|
||||
{
|
||||
unsigned long total = 0, i;
|
||||
|
@ -25,7 +25,7 @@ unsigned long mte_copy_tags_to_user(void __user *to, void *from,
|
||||
unsigned long n);
|
||||
int mte_save_tags(struct page *page);
|
||||
void mte_save_page_tags(const void *page_addr, void *tag_storage);
|
||||
bool mte_restore_tags(swp_entry_t entry, struct page *page);
|
||||
void mte_restore_tags(swp_entry_t entry, struct page *page);
|
||||
void mte_restore_page_tags(void *page_addr, const void *tag_storage);
|
||||
void mte_invalidate_tags(int type, pgoff_t offset);
|
||||
void mte_invalidate_tags_area(int type);
|
||||
@ -36,6 +36,58 @@ void mte_free_tag_storage(char *storage);
|
||||
|
||||
/* track which pages have valid allocation tags */
|
||||
#define PG_mte_tagged PG_arch_2
|
||||
/* simple lock to avoid multiple threads tagging the same page */
|
||||
#define PG_mte_lock PG_arch_3
|
||||
|
||||
static inline void set_page_mte_tagged(struct page *page)
|
||||
{
|
||||
/*
|
||||
* Ensure that the tags written prior to this function are visible
|
||||
* before the page flags update.
|
||||
*/
|
||||
smp_wmb();
|
||||
set_bit(PG_mte_tagged, &page->flags);
|
||||
}
|
||||
|
||||
static inline bool page_mte_tagged(struct page *page)
|
||||
{
|
||||
bool ret = test_bit(PG_mte_tagged, &page->flags);
|
||||
|
||||
/*
|
||||
* If the page is tagged, ensure ordering with a likely subsequent
|
||||
* read of the tags.
|
||||
*/
|
||||
if (ret)
|
||||
smp_rmb();
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Lock the page for tagging and return 'true' if the page can be tagged,
|
||||
* 'false' if already tagged. PG_mte_tagged is never cleared and therefore the
|
||||
* locking only happens once for page initialisation.
|
||||
*
|
||||
* The page MTE lock state:
|
||||
*
|
||||
* Locked: PG_mte_lock && !PG_mte_tagged
|
||||
* Unlocked: !PG_mte_lock || PG_mte_tagged
|
||||
*
|
||||
* Acquire semantics only if the page is tagged (returning 'false').
|
||||
*/
|
||||
static inline bool try_page_mte_tagging(struct page *page)
|
||||
{
|
||||
if (!test_and_set_bit(PG_mte_lock, &page->flags))
|
||||
return true;
|
||||
|
||||
/*
|
||||
* The tags are either being initialised or may have been initialised
|
||||
* already. Check if the PG_mte_tagged flag has been set or wait
|
||||
* otherwise.
|
||||
*/
|
||||
smp_cond_load_acquire(&page->flags, VAL & (1UL << PG_mte_tagged));
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
void mte_zero_clear_page_tags(void *addr);
|
||||
void mte_sync_tags(pte_t old_pte, pte_t pte);
|
||||
@ -56,6 +108,17 @@ size_t mte_probe_user_range(const char __user *uaddr, size_t size);
|
||||
/* unused if !CONFIG_ARM64_MTE, silence the compiler */
|
||||
#define PG_mte_tagged 0
|
||||
|
||||
static inline void set_page_mte_tagged(struct page *page)
|
||||
{
|
||||
}
|
||||
static inline bool page_mte_tagged(struct page *page)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
static inline bool try_page_mte_tagging(struct page *page)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
static inline void mte_zero_clear_page_tags(void *addr)
|
||||
{
|
||||
}
|
||||
|
@ -1046,8 +1046,8 @@ static inline void arch_swap_invalidate_area(int type)
|
||||
#define __HAVE_ARCH_SWAP_RESTORE
|
||||
static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio)
|
||||
{
|
||||
if (system_supports_mte() && mte_restore_tags(entry, &folio->page))
|
||||
set_bit(PG_mte_tagged, &folio->flags);
|
||||
if (system_supports_mte())
|
||||
mte_restore_tags(entry, &folio->page);
|
||||
}
|
||||
|
||||
#endif /* CONFIG_ARM64_MTE */
|
||||
|
@ -43,6 +43,7 @@
|
||||
#define __KVM_HAVE_VCPU_EVENTS
|
||||
|
||||
#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
|
||||
#define KVM_DIRTY_LOG_PAGE_OFFSET 64
|
||||
|
||||
#define KVM_REG_SIZE(id) \
|
||||
(1U << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT))
|
||||
|
@ -2076,8 +2076,10 @@ static void cpu_enable_mte(struct arm64_cpu_capabilities const *cap)
|
||||
* Clear the tags in the zero page. This needs to be done via the
|
||||
* linear map which has the Tagged attribute.
|
||||
*/
|
||||
if (!test_and_set_bit(PG_mte_tagged, &ZERO_PAGE(0)->flags))
|
||||
if (try_page_mte_tagging(ZERO_PAGE(0))) {
|
||||
mte_clear_page_tags(lm_alias(empty_zero_page));
|
||||
set_page_mte_tagged(ZERO_PAGE(0));
|
||||
}
|
||||
|
||||
kasan_init_hw_tags_cpu();
|
||||
}
|
||||
|
@ -47,7 +47,7 @@ static int mte_dump_tag_range(struct coredump_params *cprm,
|
||||
* Pages mapped in user space as !pte_access_permitted() (e.g.
|
||||
* PROT_EXEC only) may not have the PG_mte_tagged flag set.
|
||||
*/
|
||||
if (!test_bit(PG_mte_tagged, &page->flags)) {
|
||||
if (!page_mte_tagged(page)) {
|
||||
put_page(page);
|
||||
dump_skip(cprm, MTE_PAGE_TAG_STORAGE);
|
||||
continue;
|
||||
|
@ -271,7 +271,7 @@ static int swsusp_mte_save_tags(void)
|
||||
if (!page)
|
||||
continue;
|
||||
|
||||
if (!test_bit(PG_mte_tagged, &page->flags))
|
||||
if (!page_mte_tagged(page))
|
||||
continue;
|
||||
|
||||
ret = save_tags(page, pfn);
|
||||
|
@ -63,12 +63,6 @@ KVM_NVHE_ALIAS(nvhe_hyp_panic_handler);
|
||||
/* Vectors installed by hyp-init on reset HVC. */
|
||||
KVM_NVHE_ALIAS(__hyp_stub_vectors);
|
||||
|
||||
/* Kernel symbol used by icache_is_vpipt(). */
|
||||
KVM_NVHE_ALIAS(__icache_flags);
|
||||
|
||||
/* VMID bits set by the KVM VMID allocator */
|
||||
KVM_NVHE_ALIAS(kvm_arm_vmid_bits);
|
||||
|
||||
/* Static keys which are set if a vGIC trap should be handled in hyp. */
|
||||
KVM_NVHE_ALIAS(vgic_v2_cpuif_trap);
|
||||
KVM_NVHE_ALIAS(vgic_v3_cpuif_trap);
|
||||
@ -84,9 +78,6 @@ KVM_NVHE_ALIAS(gic_nonsecure_priorities);
|
||||
KVM_NVHE_ALIAS(__start___kvm_ex_table);
|
||||
KVM_NVHE_ALIAS(__stop___kvm_ex_table);
|
||||
|
||||
/* Array containing bases of nVHE per-CPU memory regions. */
|
||||
KVM_NVHE_ALIAS(kvm_arm_hyp_percpu_base);
|
||||
|
||||
/* PMU available static key */
|
||||
#ifdef CONFIG_HW_PERF_EVENTS
|
||||
KVM_NVHE_ALIAS(kvm_arm_pmu_available);
|
||||
@ -103,12 +94,6 @@ KVM_NVHE_ALIAS_HYP(__memcpy, __pi_memcpy);
|
||||
KVM_NVHE_ALIAS_HYP(__memset, __pi_memset);
|
||||
#endif
|
||||
|
||||
/* Kernel memory sections */
|
||||
KVM_NVHE_ALIAS(__start_rodata);
|
||||
KVM_NVHE_ALIAS(__end_rodata);
|
||||
KVM_NVHE_ALIAS(__bss_start);
|
||||
KVM_NVHE_ALIAS(__bss_stop);
|
||||
|
||||
/* Hyp memory sections */
|
||||
KVM_NVHE_ALIAS(__hyp_idmap_text_start);
|
||||
KVM_NVHE_ALIAS(__hyp_idmap_text_end);
|
||||
|
@ -41,19 +41,17 @@ static void mte_sync_page_tags(struct page *page, pte_t old_pte,
|
||||
if (check_swap && is_swap_pte(old_pte)) {
|
||||
swp_entry_t entry = pte_to_swp_entry(old_pte);
|
||||
|
||||
if (!non_swap_entry(entry) && mte_restore_tags(entry, page))
|
||||
return;
|
||||
if (!non_swap_entry(entry))
|
||||
mte_restore_tags(entry, page);
|
||||
}
|
||||
|
||||
if (!pte_is_tagged)
|
||||
return;
|
||||
|
||||
/*
|
||||
* Test PG_mte_tagged again in case it was racing with another
|
||||
* set_pte_at().
|
||||
*/
|
||||
if (!test_and_set_bit(PG_mte_tagged, &page->flags))
|
||||
if (try_page_mte_tagging(page)) {
|
||||
mte_clear_page_tags(page_address(page));
|
||||
set_page_mte_tagged(page);
|
||||
}
|
||||
}
|
||||
|
||||
void mte_sync_tags(pte_t old_pte, pte_t pte)
|
||||
@ -69,9 +67,11 @@ void mte_sync_tags(pte_t old_pte, pte_t pte)
|
||||
|
||||
/* if PG_mte_tagged is set, tags have already been initialised */
|
||||
for (i = 0; i < nr_pages; i++, page++) {
|
||||
if (!test_bit(PG_mte_tagged, &page->flags))
|
||||
if (!page_mte_tagged(page)) {
|
||||
mte_sync_page_tags(page, old_pte, check_swap,
|
||||
pte_is_tagged);
|
||||
set_page_mte_tagged(page);
|
||||
}
|
||||
}
|
||||
|
||||
/* ensure the tags are visible before the PTE is set */
|
||||
@ -96,8 +96,7 @@ int memcmp_pages(struct page *page1, struct page *page2)
|
||||
* pages is tagged, set_pte_at() may zero or change the tags of the
|
||||
* other page via mte_sync_tags().
|
||||
*/
|
||||
if (test_bit(PG_mte_tagged, &page1->flags) ||
|
||||
test_bit(PG_mte_tagged, &page2->flags))
|
||||
if (page_mte_tagged(page1) || page_mte_tagged(page2))
|
||||
return addr1 != addr2;
|
||||
|
||||
return ret;
|
||||
@ -454,7 +453,7 @@ static int __access_remote_tags(struct mm_struct *mm, unsigned long addr,
|
||||
put_page(page);
|
||||
break;
|
||||
}
|
||||
WARN_ON_ONCE(!test_bit(PG_mte_tagged, &page->flags));
|
||||
WARN_ON_ONCE(!page_mte_tagged(page));
|
||||
|
||||
/* limit access to the end of the page */
|
||||
offset = offset_in_page(addr);
|
||||
|
@ -32,6 +32,8 @@ menuconfig KVM
|
||||
select KVM_VFIO
|
||||
select HAVE_KVM_EVENTFD
|
||||
select HAVE_KVM_IRQFD
|
||||
select HAVE_KVM_DIRTY_RING_ACQ_REL
|
||||
select NEED_KVM_DIRTY_RING_WITH_BITMAP
|
||||
select HAVE_KVM_MSI
|
||||
select HAVE_KVM_IRQCHIP
|
||||
select HAVE_KVM_IRQ_ROUTING
|
||||
|
@ -37,6 +37,7 @@
|
||||
#include <asm/kvm_arm.h>
|
||||
#include <asm/kvm_asm.h>
|
||||
#include <asm/kvm_mmu.h>
|
||||
#include <asm/kvm_pkvm.h>
|
||||
#include <asm/kvm_emulate.h>
|
||||
#include <asm/sections.h>
|
||||
|
||||
@ -50,7 +51,6 @@ DEFINE_STATIC_KEY_FALSE(kvm_protected_mode_initialized);
|
||||
DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector);
|
||||
|
||||
DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
|
||||
unsigned long kvm_arm_hyp_percpu_base[NR_CPUS];
|
||||
DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
|
||||
|
||||
static bool vgic_present;
|
||||
@ -138,24 +138,24 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = kvm_arm_setup_stage2(kvm, type);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = kvm_init_stage2_mmu(kvm, &kvm->arch.mmu);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = kvm_share_hyp(kvm, kvm + 1);
|
||||
if (ret)
|
||||
goto out_free_stage2_pgd;
|
||||
return ret;
|
||||
|
||||
ret = pkvm_init_host_vm(kvm);
|
||||
if (ret)
|
||||
goto err_unshare_kvm;
|
||||
|
||||
if (!zalloc_cpumask_var(&kvm->arch.supported_cpus, GFP_KERNEL)) {
|
||||
ret = -ENOMEM;
|
||||
goto out_free_stage2_pgd;
|
||||
goto err_unshare_kvm;
|
||||
}
|
||||
cpumask_copy(kvm->arch.supported_cpus, cpu_possible_mask);
|
||||
|
||||
ret = kvm_init_stage2_mmu(kvm, &kvm->arch.mmu, type);
|
||||
if (ret)
|
||||
goto err_free_cpumask;
|
||||
|
||||
kvm_vgic_early_init(kvm);
|
||||
|
||||
/* The maximum number of VCPUs is limited by the host's GIC model */
|
||||
@ -164,9 +164,18 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
|
||||
set_default_spectre(kvm);
|
||||
kvm_arm_init_hypercalls(kvm);
|
||||
|
||||
return ret;
|
||||
out_free_stage2_pgd:
|
||||
kvm_free_stage2_pgd(&kvm->arch.mmu);
|
||||
/*
|
||||
* Initialise the default PMUver before there is a chance to
|
||||
* create an actual PMU.
|
||||
*/
|
||||
kvm->arch.dfr0_pmuver.imp = kvm_arm_pmu_get_pmuver_limit();
|
||||
|
||||
return 0;
|
||||
|
||||
err_free_cpumask:
|
||||
free_cpumask_var(kvm->arch.supported_cpus);
|
||||
err_unshare_kvm:
|
||||
kvm_unshare_hyp(kvm, kvm + 1);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -187,6 +196,9 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
|
||||
|
||||
kvm_vgic_destroy(kvm);
|
||||
|
||||
if (is_protected_kvm_enabled())
|
||||
pkvm_destroy_hyp_vm(kvm);
|
||||
|
||||
kvm_destroy_vcpus(kvm);
|
||||
|
||||
kvm_unshare_hyp(kvm, kvm + 1);
|
||||
@ -569,6 +581,12 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu)
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
if (is_protected_kvm_enabled()) {
|
||||
ret = pkvm_create_hyp_vm(kvm);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (!irqchip_in_kernel(kvm)) {
|
||||
/*
|
||||
* Tell the rest of the code that there are userspace irqchip
|
||||
@ -746,6 +764,9 @@ static int check_vcpu_requests(struct kvm_vcpu *vcpu)
|
||||
|
||||
if (kvm_check_request(KVM_REQ_SUSPEND, vcpu))
|
||||
return kvm_vcpu_suspend(vcpu);
|
||||
|
||||
if (kvm_dirty_ring_check_request(vcpu))
|
||||
return 0;
|
||||
}
|
||||
|
||||
return 1;
|
||||
@ -1518,7 +1539,7 @@ static int kvm_init_vector_slots(void)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void cpu_prepare_hyp_mode(int cpu)
|
||||
static void cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits)
|
||||
{
|
||||
struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu);
|
||||
unsigned long tcr;
|
||||
@ -1534,23 +1555,9 @@ static void cpu_prepare_hyp_mode(int cpu)
|
||||
|
||||
params->mair_el2 = read_sysreg(mair_el1);
|
||||
|
||||
/*
|
||||
* The ID map may be configured to use an extended virtual address
|
||||
* range. This is only the case if system RAM is out of range for the
|
||||
* currently configured page size and VA_BITS, in which case we will
|
||||
* also need the extended virtual range for the HYP ID map, or we won't
|
||||
* be able to enable the EL2 MMU.
|
||||
*
|
||||
* However, at EL2, there is only one TTBR register, and we can't switch
|
||||
* between translation tables *and* update TCR_EL2.T0SZ at the same
|
||||
* time. Bottom line: we need to use the extended range with *both* our
|
||||
* translation tables.
|
||||
*
|
||||
* So use the same T0SZ value we use for the ID map.
|
||||
*/
|
||||
tcr = (read_sysreg(tcr_el1) & TCR_EL2_MASK) | TCR_EL2_RES1;
|
||||
tcr &= ~TCR_T0SZ_MASK;
|
||||
tcr |= (idmap_t0sz & GENMASK(TCR_TxSZ_WIDTH - 1, 0)) << TCR_T0SZ_OFFSET;
|
||||
tcr |= TCR_T0SZ(hyp_va_bits);
|
||||
params->tcr_el2 = tcr;
|
||||
|
||||
params->pgd_pa = kvm_mmu_get_httbr();
|
||||
@ -1844,13 +1851,13 @@ static void teardown_hyp_mode(void)
|
||||
free_hyp_pgds();
|
||||
for_each_possible_cpu(cpu) {
|
||||
free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
|
||||
free_pages(kvm_arm_hyp_percpu_base[cpu], nvhe_percpu_order());
|
||||
free_pages(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu], nvhe_percpu_order());
|
||||
}
|
||||
}
|
||||
|
||||
static int do_pkvm_init(u32 hyp_va_bits)
|
||||
{
|
||||
void *per_cpu_base = kvm_ksym_ref(kvm_arm_hyp_percpu_base);
|
||||
void *per_cpu_base = kvm_ksym_ref(kvm_nvhe_sym(kvm_arm_hyp_percpu_base));
|
||||
int ret;
|
||||
|
||||
preempt_disable();
|
||||
@ -1870,11 +1877,8 @@ static int do_pkvm_init(u32 hyp_va_bits)
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int kvm_hyp_init_protection(u32 hyp_va_bits)
|
||||
static void kvm_hyp_init_symbols(void)
|
||||
{
|
||||
void *addr = phys_to_virt(hyp_mem_base);
|
||||
int ret;
|
||||
|
||||
kvm_nvhe_sym(id_aa64pfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
|
||||
kvm_nvhe_sym(id_aa64pfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1);
|
||||
kvm_nvhe_sym(id_aa64isar0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR0_EL1);
|
||||
@ -1883,6 +1887,14 @@ static int kvm_hyp_init_protection(u32 hyp_va_bits)
|
||||
kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
|
||||
kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
|
||||
kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR2_EL1);
|
||||
kvm_nvhe_sym(__icache_flags) = __icache_flags;
|
||||
kvm_nvhe_sym(kvm_arm_vmid_bits) = kvm_arm_vmid_bits;
|
||||
}
|
||||
|
||||
static int kvm_hyp_init_protection(u32 hyp_va_bits)
|
||||
{
|
||||
void *addr = phys_to_virt(hyp_mem_base);
|
||||
int ret;
|
||||
|
||||
ret = create_hyp_mappings(addr, addr + hyp_mem_size, PAGE_HYP);
|
||||
if (ret)
|
||||
@ -1950,7 +1962,7 @@ static int init_hyp_mode(void)
|
||||
|
||||
page_addr = page_address(page);
|
||||
memcpy(page_addr, CHOOSE_NVHE_SYM(__per_cpu_start), nvhe_percpu_size());
|
||||
kvm_arm_hyp_percpu_base[cpu] = (unsigned long)page_addr;
|
||||
kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu] = (unsigned long)page_addr;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -2043,7 +2055,7 @@ static int init_hyp_mode(void)
|
||||
}
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
char *percpu_begin = (char *)kvm_arm_hyp_percpu_base[cpu];
|
||||
char *percpu_begin = (char *)kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu];
|
||||
char *percpu_end = percpu_begin + nvhe_percpu_size();
|
||||
|
||||
/* Map Hyp percpu pages */
|
||||
@ -2054,9 +2066,11 @@ static int init_hyp_mode(void)
|
||||
}
|
||||
|
||||
/* Prepare the CPU initialization parameters */
|
||||
cpu_prepare_hyp_mode(cpu);
|
||||
cpu_prepare_hyp_mode(cpu, hyp_va_bits);
|
||||
}
|
||||
|
||||
kvm_hyp_init_symbols();
|
||||
|
||||
if (is_protected_kvm_enabled()) {
|
||||
init_cpu_logical_map();
|
||||
|
||||
@ -2064,9 +2078,7 @@ static int init_hyp_mode(void)
|
||||
err = -ENODEV;
|
||||
goto out_err;
|
||||
}
|
||||
}
|
||||
|
||||
if (is_protected_kvm_enabled()) {
|
||||
err = kvm_hyp_init_protection(hyp_va_bits);
|
||||
if (err) {
|
||||
kvm_err("Failed to init hyp memory protection\n");
|
||||
@ -2130,6 +2142,11 @@ struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
bool kvm_arch_irqchip_in_kernel(struct kvm *kvm)
|
||||
{
|
||||
return irqchip_in_kernel(kvm);
|
||||
}
|
||||
|
||||
bool kvm_arch_has_irq_bypass(void)
|
||||
{
|
||||
return true;
|
||||
|
@ -1059,7 +1059,7 @@ long kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
|
||||
maddr = page_address(page);
|
||||
|
||||
if (!write) {
|
||||
if (test_bit(PG_mte_tagged, &page->flags))
|
||||
if (page_mte_tagged(page))
|
||||
num_tags = mte_copy_tags_to_user(tags, maddr,
|
||||
MTE_GRANULES_PER_PAGE);
|
||||
else
|
||||
@ -1068,15 +1068,19 @@ long kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
|
||||
clear_user(tags, MTE_GRANULES_PER_PAGE);
|
||||
kvm_release_pfn_clean(pfn);
|
||||
} else {
|
||||
/*
|
||||
* Only locking to serialise with a concurrent
|
||||
* set_pte_at() in the VMM but still overriding the
|
||||
* tags, hence ignoring the return value.
|
||||
*/
|
||||
try_page_mte_tagging(page);
|
||||
num_tags = mte_copy_tags_from_user(maddr, tags,
|
||||
MTE_GRANULES_PER_PAGE);
|
||||
|
||||
/*
|
||||
* Set the flag after checking the write
|
||||
* completed fully
|
||||
*/
|
||||
if (num_tags == MTE_GRANULES_PER_PAGE)
|
||||
set_bit(PG_mte_tagged, &page->flags);
|
||||
/* uaccess failed, don't leave stale tags */
|
||||
if (num_tags != MTE_GRANULES_PER_PAGE)
|
||||
mte_clear_page_tags(page);
|
||||
set_page_mte_tagged(page);
|
||||
|
||||
kvm_release_pfn_dirty(pfn);
|
||||
}
|
||||
|
@ -2,9 +2,12 @@
|
||||
|
||||
#include <linux/kbuild.h>
|
||||
#include <nvhe/memory.h>
|
||||
#include <nvhe/pkvm.h>
|
||||
|
||||
int main(void)
|
||||
{
|
||||
DEFINE(STRUCT_HYP_PAGE_SIZE, sizeof(struct hyp_page));
|
||||
DEFINE(PKVM_HYP_VM_SIZE, sizeof(struct pkvm_hyp_vm));
|
||||
DEFINE(PKVM_HYP_VCPU_SIZE, sizeof(struct pkvm_hyp_vcpu));
|
||||
return 0;
|
||||
}
|
||||
|
@ -8,8 +8,10 @@
|
||||
#define __KVM_NVHE_MEM_PROTECT__
|
||||
#include <linux/kvm_host.h>
|
||||
#include <asm/kvm_hyp.h>
|
||||
#include <asm/kvm_mmu.h>
|
||||
#include <asm/kvm_pgtable.h>
|
||||
#include <asm/virt.h>
|
||||
#include <nvhe/pkvm.h>
|
||||
#include <nvhe/spinlock.h>
|
||||
|
||||
/*
|
||||
@ -43,30 +45,45 @@ static inline enum pkvm_page_state pkvm_getstate(enum kvm_pgtable_prot prot)
|
||||
return prot & PKVM_PAGE_STATE_PROT_MASK;
|
||||
}
|
||||
|
||||
struct host_kvm {
|
||||
struct host_mmu {
|
||||
struct kvm_arch arch;
|
||||
struct kvm_pgtable pgt;
|
||||
struct kvm_pgtable_mm_ops mm_ops;
|
||||
hyp_spinlock_t lock;
|
||||
};
|
||||
extern struct host_kvm host_kvm;
|
||||
extern struct host_mmu host_mmu;
|
||||
|
||||
extern const u8 pkvm_hyp_id;
|
||||
/* This corresponds to page-table locking order */
|
||||
enum pkvm_component_id {
|
||||
PKVM_ID_HOST,
|
||||
PKVM_ID_HYP,
|
||||
};
|
||||
|
||||
extern unsigned long hyp_nr_cpus;
|
||||
|
||||
int __pkvm_prot_finalize(void);
|
||||
int __pkvm_host_share_hyp(u64 pfn);
|
||||
int __pkvm_host_unshare_hyp(u64 pfn);
|
||||
int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages);
|
||||
int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages);
|
||||
|
||||
bool addr_is_memory(phys_addr_t phys);
|
||||
int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot);
|
||||
int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id);
|
||||
int kvm_host_prepare_stage2(void *pgt_pool_base);
|
||||
int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd);
|
||||
void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt);
|
||||
|
||||
int hyp_pin_shared_mem(void *from, void *to);
|
||||
void hyp_unpin_shared_mem(void *from, void *to);
|
||||
void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc);
|
||||
int refill_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages,
|
||||
struct kvm_hyp_memcache *host_mc);
|
||||
|
||||
static __always_inline void __load_host_stage2(void)
|
||||
{
|
||||
if (static_branch_likely(&kvm_protected_mode_initialized))
|
||||
__load_stage2(&host_kvm.arch.mmu, &host_kvm.arch);
|
||||
__load_stage2(&host_mmu.arch.mmu, &host_mmu.arch);
|
||||
else
|
||||
write_sysreg(0, vttbr_el2);
|
||||
}
|
||||
|
@ -38,6 +38,10 @@ static inline phys_addr_t hyp_virt_to_phys(void *addr)
|
||||
#define hyp_page_to_virt(page) __hyp_va(hyp_page_to_phys(page))
|
||||
#define hyp_page_to_pool(page) (((struct hyp_page *)page)->pool)
|
||||
|
||||
/*
|
||||
* Refcounting for 'struct hyp_page'.
|
||||
* hyp_pool::lock must be held if atomic access to the refcount is required.
|
||||
*/
|
||||
static inline int hyp_page_count(void *addr)
|
||||
{
|
||||
struct hyp_page *p = hyp_virt_to_page(addr);
|
||||
@ -45,4 +49,27 @@ static inline int hyp_page_count(void *addr)
|
||||
return p->refcount;
|
||||
}
|
||||
|
||||
static inline void hyp_page_ref_inc(struct hyp_page *p)
|
||||
{
|
||||
BUG_ON(p->refcount == USHRT_MAX);
|
||||
p->refcount++;
|
||||
}
|
||||
|
||||
static inline void hyp_page_ref_dec(struct hyp_page *p)
|
||||
{
|
||||
BUG_ON(!p->refcount);
|
||||
p->refcount--;
|
||||
}
|
||||
|
||||
static inline int hyp_page_ref_dec_and_test(struct hyp_page *p)
|
||||
{
|
||||
hyp_page_ref_dec(p);
|
||||
return (p->refcount == 0);
|
||||
}
|
||||
|
||||
static inline void hyp_set_page_refcounted(struct hyp_page *p)
|
||||
{
|
||||
BUG_ON(p->refcount);
|
||||
p->refcount = 1;
|
||||
}
|
||||
#endif /* __KVM_HYP_MEMORY_H */
|
||||
|
@ -13,9 +13,13 @@
|
||||
extern struct kvm_pgtable pkvm_pgtable;
|
||||
extern hyp_spinlock_t pkvm_pgd_lock;
|
||||
|
||||
int hyp_create_pcpu_fixmap(void);
|
||||
void *hyp_fixmap_map(phys_addr_t phys);
|
||||
void hyp_fixmap_unmap(void);
|
||||
|
||||
int hyp_create_idmap(u32 hyp_va_bits);
|
||||
int hyp_map_vectors(void);
|
||||
int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, phys_addr_t back);
|
||||
int hyp_back_vmemmap(phys_addr_t back);
|
||||
int pkvm_cpu_set_vector(enum arm64_hyp_spectre_vector slot);
|
||||
int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot);
|
||||
int pkvm_create_mappings_locked(void *from, void *to, enum kvm_pgtable_prot prot);
|
||||
@ -24,16 +28,4 @@ int __pkvm_create_private_mapping(phys_addr_t phys, size_t size,
|
||||
unsigned long *haddr);
|
||||
int pkvm_alloc_private_va_range(size_t size, unsigned long *haddr);
|
||||
|
||||
static inline void hyp_vmemmap_range(phys_addr_t phys, unsigned long size,
|
||||
unsigned long *start, unsigned long *end)
|
||||
{
|
||||
unsigned long nr_pages = size >> PAGE_SHIFT;
|
||||
struct hyp_page *p = hyp_phys_to_page(phys);
|
||||
|
||||
*start = (unsigned long)p;
|
||||
*end = *start + nr_pages * sizeof(struct hyp_page);
|
||||
*start = ALIGN_DOWN(*start, PAGE_SIZE);
|
||||
*end = ALIGN(*end, PAGE_SIZE);
|
||||
}
|
||||
|
||||
#endif /* __KVM_HYP_MM_H */
|
||||
|
68
arch/arm64/kvm/hyp/include/nvhe/pkvm.h
Normal file
68
arch/arm64/kvm/hyp/include/nvhe/pkvm.h
Normal file
@ -0,0 +1,68 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (C) 2021 Google LLC
|
||||
* Author: Fuad Tabba <tabba@google.com>
|
||||
*/
|
||||
|
||||
#ifndef __ARM64_KVM_NVHE_PKVM_H__
|
||||
#define __ARM64_KVM_NVHE_PKVM_H__
|
||||
|
||||
#include <asm/kvm_pkvm.h>
|
||||
|
||||
#include <nvhe/gfp.h>
|
||||
#include <nvhe/spinlock.h>
|
||||
|
||||
/*
|
||||
* Holds the relevant data for maintaining the vcpu state completely at hyp.
|
||||
*/
|
||||
struct pkvm_hyp_vcpu {
|
||||
struct kvm_vcpu vcpu;
|
||||
|
||||
/* Backpointer to the host's (untrusted) vCPU instance. */
|
||||
struct kvm_vcpu *host_vcpu;
|
||||
};
|
||||
|
||||
/*
|
||||
* Holds the relevant data for running a protected vm.
|
||||
*/
|
||||
struct pkvm_hyp_vm {
|
||||
struct kvm kvm;
|
||||
|
||||
/* Backpointer to the host's (untrusted) KVM instance. */
|
||||
struct kvm *host_kvm;
|
||||
|
||||
/* The guest's stage-2 page-table managed by the hypervisor. */
|
||||
struct kvm_pgtable pgt;
|
||||
struct kvm_pgtable_mm_ops mm_ops;
|
||||
struct hyp_pool pool;
|
||||
hyp_spinlock_t lock;
|
||||
|
||||
/*
|
||||
* The number of vcpus initialized and ready to run.
|
||||
* Modifying this is protected by 'vm_table_lock'.
|
||||
*/
|
||||
unsigned int nr_vcpus;
|
||||
|
||||
/* Array of the hyp vCPU structures for this VM. */
|
||||
struct pkvm_hyp_vcpu *vcpus[];
|
||||
};
|
||||
|
||||
static inline struct pkvm_hyp_vm *
|
||||
pkvm_hyp_vcpu_to_hyp_vm(struct pkvm_hyp_vcpu *hyp_vcpu)
|
||||
{
|
||||
return container_of(hyp_vcpu->vcpu.kvm, struct pkvm_hyp_vm, kvm);
|
||||
}
|
||||
|
||||
void pkvm_hyp_vm_table_init(void *tbl);
|
||||
|
||||
int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva,
|
||||
unsigned long pgd_hva);
|
||||
int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu,
|
||||
unsigned long vcpu_hva);
|
||||
int __pkvm_teardown_vm(pkvm_handle_t handle);
|
||||
|
||||
struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle,
|
||||
unsigned int vcpu_idx);
|
||||
void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu);
|
||||
|
||||
#endif /* __ARM64_KVM_NVHE_PKVM_H__ */
|
@ -28,9 +28,17 @@ typedef union hyp_spinlock {
|
||||
};
|
||||
} hyp_spinlock_t;
|
||||
|
||||
#define __HYP_SPIN_LOCK_INITIALIZER \
|
||||
{ .__val = 0 }
|
||||
|
||||
#define __HYP_SPIN_LOCK_UNLOCKED \
|
||||
((hyp_spinlock_t) __HYP_SPIN_LOCK_INITIALIZER)
|
||||
|
||||
#define DEFINE_HYP_SPINLOCK(x) hyp_spinlock_t x = __HYP_SPIN_LOCK_UNLOCKED
|
||||
|
||||
#define hyp_spin_lock_init(l) \
|
||||
do { \
|
||||
*(l) = (hyp_spinlock_t){ .__val = 0 }; \
|
||||
*(l) = __HYP_SPIN_LOCK_UNLOCKED; \
|
||||
} while (0)
|
||||
|
||||
static inline void hyp_spin_lock(hyp_spinlock_t *lock)
|
||||
|
@ -12,3 +12,14 @@ SYM_FUNC_START(__pi_dcache_clean_inval_poc)
|
||||
ret
|
||||
SYM_FUNC_END(__pi_dcache_clean_inval_poc)
|
||||
SYM_FUNC_ALIAS(dcache_clean_inval_poc, __pi_dcache_clean_inval_poc)
|
||||
|
||||
SYM_FUNC_START(__pi_icache_inval_pou)
|
||||
alternative_if ARM64_HAS_CACHE_DIC
|
||||
isb
|
||||
ret
|
||||
alternative_else_nop_endif
|
||||
|
||||
invalidate_icache_by_line x0, x1, x2, x3
|
||||
ret
|
||||
SYM_FUNC_END(__pi_icache_inval_pou)
|
||||
SYM_FUNC_ALIAS(icache_inval_pou, __pi_icache_inval_pou)
|
||||
|
@ -15,17 +15,93 @@
|
||||
|
||||
#include <nvhe/mem_protect.h>
|
||||
#include <nvhe/mm.h>
|
||||
#include <nvhe/pkvm.h>
|
||||
#include <nvhe/trap_handler.h>
|
||||
|
||||
DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
|
||||
|
||||
void __kvm_hyp_host_forward_smc(struct kvm_cpu_context *host_ctxt);
|
||||
|
||||
static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu)
|
||||
{
|
||||
struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu;
|
||||
|
||||
hyp_vcpu->vcpu.arch.ctxt = host_vcpu->arch.ctxt;
|
||||
|
||||
hyp_vcpu->vcpu.arch.sve_state = kern_hyp_va(host_vcpu->arch.sve_state);
|
||||
hyp_vcpu->vcpu.arch.sve_max_vl = host_vcpu->arch.sve_max_vl;
|
||||
|
||||
hyp_vcpu->vcpu.arch.hw_mmu = host_vcpu->arch.hw_mmu;
|
||||
|
||||
hyp_vcpu->vcpu.arch.hcr_el2 = host_vcpu->arch.hcr_el2;
|
||||
hyp_vcpu->vcpu.arch.mdcr_el2 = host_vcpu->arch.mdcr_el2;
|
||||
hyp_vcpu->vcpu.arch.cptr_el2 = host_vcpu->arch.cptr_el2;
|
||||
|
||||
hyp_vcpu->vcpu.arch.iflags = host_vcpu->arch.iflags;
|
||||
hyp_vcpu->vcpu.arch.fp_state = host_vcpu->arch.fp_state;
|
||||
|
||||
hyp_vcpu->vcpu.arch.debug_ptr = kern_hyp_va(host_vcpu->arch.debug_ptr);
|
||||
hyp_vcpu->vcpu.arch.host_fpsimd_state = host_vcpu->arch.host_fpsimd_state;
|
||||
|
||||
hyp_vcpu->vcpu.arch.vsesr_el2 = host_vcpu->arch.vsesr_el2;
|
||||
|
||||
hyp_vcpu->vcpu.arch.vgic_cpu.vgic_v3 = host_vcpu->arch.vgic_cpu.vgic_v3;
|
||||
}
|
||||
|
||||
static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu)
|
||||
{
|
||||
struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu;
|
||||
struct vgic_v3_cpu_if *hyp_cpu_if = &hyp_vcpu->vcpu.arch.vgic_cpu.vgic_v3;
|
||||
struct vgic_v3_cpu_if *host_cpu_if = &host_vcpu->arch.vgic_cpu.vgic_v3;
|
||||
unsigned int i;
|
||||
|
||||
host_vcpu->arch.ctxt = hyp_vcpu->vcpu.arch.ctxt;
|
||||
|
||||
host_vcpu->arch.hcr_el2 = hyp_vcpu->vcpu.arch.hcr_el2;
|
||||
host_vcpu->arch.cptr_el2 = hyp_vcpu->vcpu.arch.cptr_el2;
|
||||
|
||||
host_vcpu->arch.fault = hyp_vcpu->vcpu.arch.fault;
|
||||
|
||||
host_vcpu->arch.iflags = hyp_vcpu->vcpu.arch.iflags;
|
||||
host_vcpu->arch.fp_state = hyp_vcpu->vcpu.arch.fp_state;
|
||||
|
||||
host_cpu_if->vgic_hcr = hyp_cpu_if->vgic_hcr;
|
||||
for (i = 0; i < hyp_cpu_if->used_lrs; ++i)
|
||||
host_cpu_if->vgic_lr[i] = hyp_cpu_if->vgic_lr[i];
|
||||
}
|
||||
|
||||
static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt)
|
||||
{
|
||||
DECLARE_REG(struct kvm_vcpu *, vcpu, host_ctxt, 1);
|
||||
DECLARE_REG(struct kvm_vcpu *, host_vcpu, host_ctxt, 1);
|
||||
int ret;
|
||||
|
||||
cpu_reg(host_ctxt, 1) = __kvm_vcpu_run(kern_hyp_va(vcpu));
|
||||
host_vcpu = kern_hyp_va(host_vcpu);
|
||||
|
||||
if (unlikely(is_protected_kvm_enabled())) {
|
||||
struct pkvm_hyp_vcpu *hyp_vcpu;
|
||||
struct kvm *host_kvm;
|
||||
|
||||
host_kvm = kern_hyp_va(host_vcpu->kvm);
|
||||
hyp_vcpu = pkvm_load_hyp_vcpu(host_kvm->arch.pkvm.handle,
|
||||
host_vcpu->vcpu_idx);
|
||||
if (!hyp_vcpu) {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
flush_hyp_vcpu(hyp_vcpu);
|
||||
|
||||
ret = __kvm_vcpu_run(&hyp_vcpu->vcpu);
|
||||
|
||||
sync_hyp_vcpu(hyp_vcpu);
|
||||
pkvm_put_hyp_vcpu(hyp_vcpu);
|
||||
} else {
|
||||
/* The host is fully trusted, run its vCPU directly. */
|
||||
ret = __kvm_vcpu_run(host_vcpu);
|
||||
}
|
||||
|
||||
out:
|
||||
cpu_reg(host_ctxt, 1) = ret;
|
||||
}
|
||||
|
||||
static void handle___kvm_adjust_pc(struct kvm_cpu_context *host_ctxt)
|
||||
@ -191,6 +267,33 @@ static void handle___pkvm_vcpu_init_traps(struct kvm_cpu_context *host_ctxt)
|
||||
__pkvm_vcpu_init_traps(kern_hyp_va(vcpu));
|
||||
}
|
||||
|
||||
static void handle___pkvm_init_vm(struct kvm_cpu_context *host_ctxt)
|
||||
{
|
||||
DECLARE_REG(struct kvm *, host_kvm, host_ctxt, 1);
|
||||
DECLARE_REG(unsigned long, vm_hva, host_ctxt, 2);
|
||||
DECLARE_REG(unsigned long, pgd_hva, host_ctxt, 3);
|
||||
|
||||
host_kvm = kern_hyp_va(host_kvm);
|
||||
cpu_reg(host_ctxt, 1) = __pkvm_init_vm(host_kvm, vm_hva, pgd_hva);
|
||||
}
|
||||
|
||||
static void handle___pkvm_init_vcpu(struct kvm_cpu_context *host_ctxt)
|
||||
{
|
||||
DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
|
||||
DECLARE_REG(struct kvm_vcpu *, host_vcpu, host_ctxt, 2);
|
||||
DECLARE_REG(unsigned long, vcpu_hva, host_ctxt, 3);
|
||||
|
||||
host_vcpu = kern_hyp_va(host_vcpu);
|
||||
cpu_reg(host_ctxt, 1) = __pkvm_init_vcpu(handle, host_vcpu, vcpu_hva);
|
||||
}
|
||||
|
||||
static void handle___pkvm_teardown_vm(struct kvm_cpu_context *host_ctxt)
|
||||
{
|
||||
DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
|
||||
|
||||
cpu_reg(host_ctxt, 1) = __pkvm_teardown_vm(handle);
|
||||
}
|
||||
|
||||
typedef void (*hcall_t)(struct kvm_cpu_context *);
|
||||
|
||||
#define HANDLE_FUNC(x) [__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x
|
||||
@ -220,6 +323,9 @@ static const hcall_t host_hcall[] = {
|
||||
HANDLE_FUNC(__vgic_v3_save_aprs),
|
||||
HANDLE_FUNC(__vgic_v3_restore_aprs),
|
||||
HANDLE_FUNC(__pkvm_vcpu_init_traps),
|
||||
HANDLE_FUNC(__pkvm_init_vm),
|
||||
HANDLE_FUNC(__pkvm_init_vcpu),
|
||||
HANDLE_FUNC(__pkvm_teardown_vm),
|
||||
};
|
||||
|
||||
static void handle_host_hcall(struct kvm_cpu_context *host_ctxt)
|
||||
|
@ -23,6 +23,8 @@ u64 cpu_logical_map(unsigned int cpu)
|
||||
return hyp_cpu_logical_map[cpu];
|
||||
}
|
||||
|
||||
unsigned long __ro_after_init kvm_arm_hyp_percpu_base[NR_CPUS];
|
||||
|
||||
unsigned long __hyp_per_cpu_offset(unsigned int cpu)
|
||||
{
|
||||
unsigned long *cpu_base_array;
|
||||
|
@ -21,21 +21,33 @@
|
||||
|
||||
#define KVM_HOST_S2_FLAGS (KVM_PGTABLE_S2_NOFWB | KVM_PGTABLE_S2_IDMAP)
|
||||
|
||||
extern unsigned long hyp_nr_cpus;
|
||||
struct host_kvm host_kvm;
|
||||
struct host_mmu host_mmu;
|
||||
|
||||
static struct hyp_pool host_s2_pool;
|
||||
|
||||
const u8 pkvm_hyp_id = 1;
|
||||
static DEFINE_PER_CPU(struct pkvm_hyp_vm *, __current_vm);
|
||||
#define current_vm (*this_cpu_ptr(&__current_vm))
|
||||
|
||||
static void guest_lock_component(struct pkvm_hyp_vm *vm)
|
||||
{
|
||||
hyp_spin_lock(&vm->lock);
|
||||
current_vm = vm;
|
||||
}
|
||||
|
||||
static void guest_unlock_component(struct pkvm_hyp_vm *vm)
|
||||
{
|
||||
current_vm = NULL;
|
||||
hyp_spin_unlock(&vm->lock);
|
||||
}
|
||||
|
||||
static void host_lock_component(void)
|
||||
{
|
||||
hyp_spin_lock(&host_kvm.lock);
|
||||
hyp_spin_lock(&host_mmu.lock);
|
||||
}
|
||||
|
||||
static void host_unlock_component(void)
|
||||
{
|
||||
hyp_spin_unlock(&host_kvm.lock);
|
||||
hyp_spin_unlock(&host_mmu.lock);
|
||||
}
|
||||
|
||||
static void hyp_lock_component(void)
|
||||
@ -79,6 +91,11 @@ static void host_s2_put_page(void *addr)
|
||||
hyp_put_page(&host_s2_pool, addr);
|
||||
}
|
||||
|
||||
static void host_s2_free_removed_table(void *addr, u32 level)
|
||||
{
|
||||
kvm_pgtable_stage2_free_removed(&host_mmu.mm_ops, addr, level);
|
||||
}
|
||||
|
||||
static int prepare_s2_pool(void *pgt_pool_base)
|
||||
{
|
||||
unsigned long nr_pages, pfn;
|
||||
@ -90,9 +107,10 @@ static int prepare_s2_pool(void *pgt_pool_base)
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
host_kvm.mm_ops = (struct kvm_pgtable_mm_ops) {
|
||||
host_mmu.mm_ops = (struct kvm_pgtable_mm_ops) {
|
||||
.zalloc_pages_exact = host_s2_zalloc_pages_exact,
|
||||
.zalloc_page = host_s2_zalloc_page,
|
||||
.free_removed_table = host_s2_free_removed_table,
|
||||
.phys_to_virt = hyp_phys_to_virt,
|
||||
.virt_to_phys = hyp_virt_to_phys,
|
||||
.page_count = hyp_page_count,
|
||||
@ -111,7 +129,7 @@ static void prepare_host_vtcr(void)
|
||||
parange = kvm_get_parange(id_aa64mmfr0_el1_sys_val);
|
||||
phys_shift = id_aa64mmfr0_parange_to_phys_shift(parange);
|
||||
|
||||
host_kvm.arch.vtcr = kvm_get_vtcr(id_aa64mmfr0_el1_sys_val,
|
||||
host_mmu.arch.vtcr = kvm_get_vtcr(id_aa64mmfr0_el1_sys_val,
|
||||
id_aa64mmfr1_el1_sys_val, phys_shift);
|
||||
}
|
||||
|
||||
@ -119,45 +137,170 @@ static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot pr
|
||||
|
||||
int kvm_host_prepare_stage2(void *pgt_pool_base)
|
||||
{
|
||||
struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu;
|
||||
struct kvm_s2_mmu *mmu = &host_mmu.arch.mmu;
|
||||
int ret;
|
||||
|
||||
prepare_host_vtcr();
|
||||
hyp_spin_lock_init(&host_kvm.lock);
|
||||
mmu->arch = &host_kvm.arch;
|
||||
hyp_spin_lock_init(&host_mmu.lock);
|
||||
mmu->arch = &host_mmu.arch;
|
||||
|
||||
ret = prepare_s2_pool(pgt_pool_base);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = __kvm_pgtable_stage2_init(&host_kvm.pgt, mmu,
|
||||
&host_kvm.mm_ops, KVM_HOST_S2_FLAGS,
|
||||
ret = __kvm_pgtable_stage2_init(&host_mmu.pgt, mmu,
|
||||
&host_mmu.mm_ops, KVM_HOST_S2_FLAGS,
|
||||
host_stage2_force_pte_cb);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
mmu->pgd_phys = __hyp_pa(host_kvm.pgt.pgd);
|
||||
mmu->pgt = &host_kvm.pgt;
|
||||
mmu->pgd_phys = __hyp_pa(host_mmu.pgt.pgd);
|
||||
mmu->pgt = &host_mmu.pgt;
|
||||
atomic64_set(&mmu->vmid.id, 0);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static bool guest_stage2_force_pte_cb(u64 addr, u64 end,
|
||||
enum kvm_pgtable_prot prot)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
static void *guest_s2_zalloc_pages_exact(size_t size)
|
||||
{
|
||||
void *addr = hyp_alloc_pages(¤t_vm->pool, get_order(size));
|
||||
|
||||
WARN_ON(size != (PAGE_SIZE << get_order(size)));
|
||||
hyp_split_page(hyp_virt_to_page(addr));
|
||||
|
||||
return addr;
|
||||
}
|
||||
|
||||
static void guest_s2_free_pages_exact(void *addr, unsigned long size)
|
||||
{
|
||||
u8 order = get_order(size);
|
||||
unsigned int i;
|
||||
|
||||
for (i = 0; i < (1 << order); i++)
|
||||
hyp_put_page(¤t_vm->pool, addr + (i * PAGE_SIZE));
|
||||
}
|
||||
|
||||
static void *guest_s2_zalloc_page(void *mc)
|
||||
{
|
||||
struct hyp_page *p;
|
||||
void *addr;
|
||||
|
||||
addr = hyp_alloc_pages(¤t_vm->pool, 0);
|
||||
if (addr)
|
||||
return addr;
|
||||
|
||||
addr = pop_hyp_memcache(mc, hyp_phys_to_virt);
|
||||
if (!addr)
|
||||
return addr;
|
||||
|
||||
memset(addr, 0, PAGE_SIZE);
|
||||
p = hyp_virt_to_page(addr);
|
||||
memset(p, 0, sizeof(*p));
|
||||
p->refcount = 1;
|
||||
|
||||
return addr;
|
||||
}
|
||||
|
||||
static void guest_s2_get_page(void *addr)
|
||||
{
|
||||
hyp_get_page(¤t_vm->pool, addr);
|
||||
}
|
||||
|
||||
static void guest_s2_put_page(void *addr)
|
||||
{
|
||||
hyp_put_page(¤t_vm->pool, addr);
|
||||
}
|
||||
|
||||
static void clean_dcache_guest_page(void *va, size_t size)
|
||||
{
|
||||
__clean_dcache_guest_page(hyp_fixmap_map(__hyp_pa(va)), size);
|
||||
hyp_fixmap_unmap();
|
||||
}
|
||||
|
||||
static void invalidate_icache_guest_page(void *va, size_t size)
|
||||
{
|
||||
__invalidate_icache_guest_page(hyp_fixmap_map(__hyp_pa(va)), size);
|
||||
hyp_fixmap_unmap();
|
||||
}
|
||||
|
||||
int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd)
|
||||
{
|
||||
struct kvm_s2_mmu *mmu = &vm->kvm.arch.mmu;
|
||||
unsigned long nr_pages;
|
||||
int ret;
|
||||
|
||||
nr_pages = kvm_pgtable_stage2_pgd_size(vm->kvm.arch.vtcr) >> PAGE_SHIFT;
|
||||
ret = hyp_pool_init(&vm->pool, hyp_virt_to_pfn(pgd), nr_pages, 0);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
hyp_spin_lock_init(&vm->lock);
|
||||
vm->mm_ops = (struct kvm_pgtable_mm_ops) {
|
||||
.zalloc_pages_exact = guest_s2_zalloc_pages_exact,
|
||||
.free_pages_exact = guest_s2_free_pages_exact,
|
||||
.zalloc_page = guest_s2_zalloc_page,
|
||||
.phys_to_virt = hyp_phys_to_virt,
|
||||
.virt_to_phys = hyp_virt_to_phys,
|
||||
.page_count = hyp_page_count,
|
||||
.get_page = guest_s2_get_page,
|
||||
.put_page = guest_s2_put_page,
|
||||
.dcache_clean_inval_poc = clean_dcache_guest_page,
|
||||
.icache_inval_pou = invalidate_icache_guest_page,
|
||||
};
|
||||
|
||||
guest_lock_component(vm);
|
||||
ret = __kvm_pgtable_stage2_init(mmu->pgt, mmu, &vm->mm_ops, 0,
|
||||
guest_stage2_force_pte_cb);
|
||||
guest_unlock_component(vm);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
vm->kvm.arch.mmu.pgd_phys = __hyp_pa(vm->pgt.pgd);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc)
|
||||
{
|
||||
void *addr;
|
||||
|
||||
/* Dump all pgtable pages in the hyp_pool */
|
||||
guest_lock_component(vm);
|
||||
kvm_pgtable_stage2_destroy(&vm->pgt);
|
||||
vm->kvm.arch.mmu.pgd_phys = 0ULL;
|
||||
guest_unlock_component(vm);
|
||||
|
||||
/* Drain the hyp_pool into the memcache */
|
||||
addr = hyp_alloc_pages(&vm->pool, 0);
|
||||
while (addr) {
|
||||
memset(hyp_virt_to_page(addr), 0, sizeof(struct hyp_page));
|
||||
push_hyp_memcache(mc, addr, hyp_virt_to_phys);
|
||||
WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(addr), 1));
|
||||
addr = hyp_alloc_pages(&vm->pool, 0);
|
||||
}
|
||||
}
|
||||
|
||||
int __pkvm_prot_finalize(void)
|
||||
{
|
||||
struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu;
|
||||
struct kvm_s2_mmu *mmu = &host_mmu.arch.mmu;
|
||||
struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params);
|
||||
|
||||
if (params->hcr_el2 & HCR_VM)
|
||||
return -EPERM;
|
||||
|
||||
params->vttbr = kvm_get_vttbr(mmu);
|
||||
params->vtcr = host_kvm.arch.vtcr;
|
||||
params->vtcr = host_mmu.arch.vtcr;
|
||||
params->hcr_el2 |= HCR_VM;
|
||||
kvm_flush_dcache_to_poc(params, sizeof(*params));
|
||||
|
||||
write_sysreg(params->hcr_el2, hcr_el2);
|
||||
__load_stage2(&host_kvm.arch.mmu, &host_kvm.arch);
|
||||
__load_stage2(&host_mmu.arch.mmu, &host_mmu.arch);
|
||||
|
||||
/*
|
||||
* Make sure to have an ISB before the TLB maintenance below but only
|
||||
@ -175,7 +318,7 @@ int __pkvm_prot_finalize(void)
|
||||
|
||||
static int host_stage2_unmap_dev_all(void)
|
||||
{
|
||||
struct kvm_pgtable *pgt = &host_kvm.pgt;
|
||||
struct kvm_pgtable *pgt = &host_mmu.pgt;
|
||||
struct memblock_region *reg;
|
||||
u64 addr = 0;
|
||||
int i, ret;
|
||||
@ -195,7 +338,7 @@ struct kvm_mem_range {
|
||||
u64 end;
|
||||
};
|
||||
|
||||
static bool find_mem_range(phys_addr_t addr, struct kvm_mem_range *range)
|
||||
static struct memblock_region *find_mem_range(phys_addr_t addr, struct kvm_mem_range *range)
|
||||
{
|
||||
int cur, left = 0, right = hyp_memblock_nr;
|
||||
struct memblock_region *reg;
|
||||
@ -218,18 +361,28 @@ static bool find_mem_range(phys_addr_t addr, struct kvm_mem_range *range)
|
||||
} else {
|
||||
range->start = reg->base;
|
||||
range->end = end;
|
||||
return true;
|
||||
return reg;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
bool addr_is_memory(phys_addr_t phys)
|
||||
{
|
||||
struct kvm_mem_range range;
|
||||
|
||||
return find_mem_range(phys, &range);
|
||||
return !!find_mem_range(phys, &range);
|
||||
}
|
||||
|
||||
static bool addr_is_allowed_memory(phys_addr_t phys)
|
||||
{
|
||||
struct memblock_region *reg;
|
||||
struct kvm_mem_range range;
|
||||
|
||||
reg = find_mem_range(phys, &range);
|
||||
|
||||
return reg && !(reg->flags & MEMBLOCK_NOMAP);
|
||||
}
|
||||
|
||||
static bool is_in_mem_range(u64 addr, struct kvm_mem_range *range)
|
||||
@ -250,8 +403,8 @@ static bool range_is_memory(u64 start, u64 end)
|
||||
static inline int __host_stage2_idmap(u64 start, u64 end,
|
||||
enum kvm_pgtable_prot prot)
|
||||
{
|
||||
return kvm_pgtable_stage2_map(&host_kvm.pgt, start, end - start, start,
|
||||
prot, &host_s2_pool);
|
||||
return kvm_pgtable_stage2_map(&host_mmu.pgt, start, end - start, start,
|
||||
prot, &host_s2_pool, 0);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -263,7 +416,7 @@ static inline int __host_stage2_idmap(u64 start, u64 end,
|
||||
#define host_stage2_try(fn, ...) \
|
||||
({ \
|
||||
int __ret; \
|
||||
hyp_assert_lock_held(&host_kvm.lock); \
|
||||
hyp_assert_lock_held(&host_mmu.lock); \
|
||||
__ret = fn(__VA_ARGS__); \
|
||||
if (__ret == -ENOMEM) { \
|
||||
__ret = host_stage2_unmap_dev_all(); \
|
||||
@ -286,8 +439,8 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range)
|
||||
u32 level;
|
||||
int ret;
|
||||
|
||||
hyp_assert_lock_held(&host_kvm.lock);
|
||||
ret = kvm_pgtable_get_leaf(&host_kvm.pgt, addr, &pte, &level);
|
||||
hyp_assert_lock_held(&host_mmu.lock);
|
||||
ret = kvm_pgtable_get_leaf(&host_mmu.pgt, addr, &pte, &level);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
@ -319,7 +472,7 @@ int host_stage2_idmap_locked(phys_addr_t addr, u64 size,
|
||||
|
||||
int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id)
|
||||
{
|
||||
return host_stage2_try(kvm_pgtable_stage2_set_owner, &host_kvm.pgt,
|
||||
return host_stage2_try(kvm_pgtable_stage2_set_owner, &host_mmu.pgt,
|
||||
addr, size, &host_s2_pool, owner_id);
|
||||
}
|
||||
|
||||
@ -348,7 +501,7 @@ static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot pr
|
||||
static int host_stage2_idmap(u64 addr)
|
||||
{
|
||||
struct kvm_mem_range range;
|
||||
bool is_memory = find_mem_range(addr, &range);
|
||||
bool is_memory = !!find_mem_range(addr, &range);
|
||||
enum kvm_pgtable_prot prot;
|
||||
int ret;
|
||||
|
||||
@ -380,12 +533,6 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt)
|
||||
BUG_ON(ret && ret != -EAGAIN);
|
||||
}
|
||||
|
||||
/* This corresponds to locking order */
|
||||
enum pkvm_component_id {
|
||||
PKVM_ID_HOST,
|
||||
PKVM_ID_HYP,
|
||||
};
|
||||
|
||||
struct pkvm_mem_transition {
|
||||
u64 nr_pages;
|
||||
|
||||
@ -399,6 +546,9 @@ struct pkvm_mem_transition {
|
||||
/* Address in the completer's address space */
|
||||
u64 completer_addr;
|
||||
} host;
|
||||
struct {
|
||||
u64 completer_addr;
|
||||
} hyp;
|
||||
};
|
||||
} initiator;
|
||||
|
||||
@ -412,23 +562,24 @@ struct pkvm_mem_share {
|
||||
const enum kvm_pgtable_prot completer_prot;
|
||||
};
|
||||
|
||||
struct pkvm_mem_donation {
|
||||
const struct pkvm_mem_transition tx;
|
||||
};
|
||||
|
||||
struct check_walk_data {
|
||||
enum pkvm_page_state desired;
|
||||
enum pkvm_page_state (*get_page_state)(kvm_pte_t pte);
|
||||
};
|
||||
|
||||
static int __check_page_state_visitor(u64 addr, u64 end, u32 level,
|
||||
kvm_pte_t *ptep,
|
||||
enum kvm_pgtable_walk_flags flag,
|
||||
void * const arg)
|
||||
static int __check_page_state_visitor(const struct kvm_pgtable_visit_ctx *ctx,
|
||||
enum kvm_pgtable_walk_flags visit)
|
||||
{
|
||||
struct check_walk_data *d = arg;
|
||||
kvm_pte_t pte = *ptep;
|
||||
struct check_walk_data *d = ctx->arg;
|
||||
|
||||
if (kvm_pte_valid(pte) && !addr_is_memory(kvm_pte_to_phys(pte)))
|
||||
if (kvm_pte_valid(ctx->old) && !addr_is_allowed_memory(kvm_pte_to_phys(ctx->old)))
|
||||
return -EINVAL;
|
||||
|
||||
return d->get_page_state(pte) == d->desired ? 0 : -EPERM;
|
||||
return d->get_page_state(ctx->old) == d->desired ? 0 : -EPERM;
|
||||
}
|
||||
|
||||
static int check_page_state_range(struct kvm_pgtable *pgt, u64 addr, u64 size,
|
||||
@ -459,8 +610,8 @@ static int __host_check_page_state_range(u64 addr, u64 size,
|
||||
.get_page_state = host_get_page_state,
|
||||
};
|
||||
|
||||
hyp_assert_lock_held(&host_kvm.lock);
|
||||
return check_page_state_range(&host_kvm.pgt, addr, size, &d);
|
||||
hyp_assert_lock_held(&host_mmu.lock);
|
||||
return check_page_state_range(&host_mmu.pgt, addr, size, &d);
|
||||
}
|
||||
|
||||
static int __host_set_page_state_range(u64 addr, u64 size,
|
||||
@ -511,6 +662,46 @@ static int host_initiate_unshare(u64 *completer_addr,
|
||||
return __host_set_page_state_range(addr, size, PKVM_PAGE_OWNED);
|
||||
}
|
||||
|
||||
static int host_initiate_donation(u64 *completer_addr,
|
||||
const struct pkvm_mem_transition *tx)
|
||||
{
|
||||
u8 owner_id = tx->completer.id;
|
||||
u64 size = tx->nr_pages * PAGE_SIZE;
|
||||
|
||||
*completer_addr = tx->initiator.host.completer_addr;
|
||||
return host_stage2_set_owner_locked(tx->initiator.addr, size, owner_id);
|
||||
}
|
||||
|
||||
static bool __host_ack_skip_pgtable_check(const struct pkvm_mem_transition *tx)
|
||||
{
|
||||
return !(IS_ENABLED(CONFIG_NVHE_EL2_DEBUG) ||
|
||||
tx->initiator.id != PKVM_ID_HYP);
|
||||
}
|
||||
|
||||
static int __host_ack_transition(u64 addr, const struct pkvm_mem_transition *tx,
|
||||
enum pkvm_page_state state)
|
||||
{
|
||||
u64 size = tx->nr_pages * PAGE_SIZE;
|
||||
|
||||
if (__host_ack_skip_pgtable_check(tx))
|
||||
return 0;
|
||||
|
||||
return __host_check_page_state_range(addr, size, state);
|
||||
}
|
||||
|
||||
static int host_ack_donation(u64 addr, const struct pkvm_mem_transition *tx)
|
||||
{
|
||||
return __host_ack_transition(addr, tx, PKVM_NOPAGE);
|
||||
}
|
||||
|
||||
static int host_complete_donation(u64 addr, const struct pkvm_mem_transition *tx)
|
||||
{
|
||||
u64 size = tx->nr_pages * PAGE_SIZE;
|
||||
u8 host_id = tx->completer.id;
|
||||
|
||||
return host_stage2_set_owner_locked(addr, size, host_id);
|
||||
}
|
||||
|
||||
static enum pkvm_page_state hyp_get_page_state(kvm_pte_t pte)
|
||||
{
|
||||
if (!kvm_pte_valid(pte))
|
||||
@ -531,6 +722,27 @@ static int __hyp_check_page_state_range(u64 addr, u64 size,
|
||||
return check_page_state_range(&pkvm_pgtable, addr, size, &d);
|
||||
}
|
||||
|
||||
static int hyp_request_donation(u64 *completer_addr,
|
||||
const struct pkvm_mem_transition *tx)
|
||||
{
|
||||
u64 size = tx->nr_pages * PAGE_SIZE;
|
||||
u64 addr = tx->initiator.addr;
|
||||
|
||||
*completer_addr = tx->initiator.hyp.completer_addr;
|
||||
return __hyp_check_page_state_range(addr, size, PKVM_PAGE_OWNED);
|
||||
}
|
||||
|
||||
static int hyp_initiate_donation(u64 *completer_addr,
|
||||
const struct pkvm_mem_transition *tx)
|
||||
{
|
||||
u64 size = tx->nr_pages * PAGE_SIZE;
|
||||
int ret;
|
||||
|
||||
*completer_addr = tx->initiator.hyp.completer_addr;
|
||||
ret = kvm_pgtable_hyp_unmap(&pkvm_pgtable, tx->initiator.addr, size);
|
||||
return (ret != size) ? -EFAULT : 0;
|
||||
}
|
||||
|
||||
static bool __hyp_ack_skip_pgtable_check(const struct pkvm_mem_transition *tx)
|
||||
{
|
||||
return !(IS_ENABLED(CONFIG_NVHE_EL2_DEBUG) ||
|
||||
@ -555,6 +767,9 @@ static int hyp_ack_unshare(u64 addr, const struct pkvm_mem_transition *tx)
|
||||
{
|
||||
u64 size = tx->nr_pages * PAGE_SIZE;
|
||||
|
||||
if (tx->initiator.id == PKVM_ID_HOST && hyp_page_count((void *)addr))
|
||||
return -EBUSY;
|
||||
|
||||
if (__hyp_ack_skip_pgtable_check(tx))
|
||||
return 0;
|
||||
|
||||
@ -562,6 +777,16 @@ static int hyp_ack_unshare(u64 addr, const struct pkvm_mem_transition *tx)
|
||||
PKVM_PAGE_SHARED_BORROWED);
|
||||
}
|
||||
|
||||
static int hyp_ack_donation(u64 addr, const struct pkvm_mem_transition *tx)
|
||||
{
|
||||
u64 size = tx->nr_pages * PAGE_SIZE;
|
||||
|
||||
if (__hyp_ack_skip_pgtable_check(tx))
|
||||
return 0;
|
||||
|
||||
return __hyp_check_page_state_range(addr, size, PKVM_NOPAGE);
|
||||
}
|
||||
|
||||
static int hyp_complete_share(u64 addr, const struct pkvm_mem_transition *tx,
|
||||
enum kvm_pgtable_prot perms)
|
||||
{
|
||||
@ -580,6 +805,15 @@ static int hyp_complete_unshare(u64 addr, const struct pkvm_mem_transition *tx)
|
||||
return (ret != size) ? -EFAULT : 0;
|
||||
}
|
||||
|
||||
static int hyp_complete_donation(u64 addr,
|
||||
const struct pkvm_mem_transition *tx)
|
||||
{
|
||||
void *start = (void *)addr, *end = start + (tx->nr_pages * PAGE_SIZE);
|
||||
enum kvm_pgtable_prot prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_OWNED);
|
||||
|
||||
return pkvm_create_mappings_locked(start, end, prot);
|
||||
}
|
||||
|
||||
static int check_share(struct pkvm_mem_share *share)
|
||||
{
|
||||
const struct pkvm_mem_transition *tx = &share->tx;
|
||||
@ -732,6 +966,94 @@ static int do_unshare(struct pkvm_mem_share *share)
|
||||
return WARN_ON(__do_unshare(share));
|
||||
}
|
||||
|
||||
static int check_donation(struct pkvm_mem_donation *donation)
|
||||
{
|
||||
const struct pkvm_mem_transition *tx = &donation->tx;
|
||||
u64 completer_addr;
|
||||
int ret;
|
||||
|
||||
switch (tx->initiator.id) {
|
||||
case PKVM_ID_HOST:
|
||||
ret = host_request_owned_transition(&completer_addr, tx);
|
||||
break;
|
||||
case PKVM_ID_HYP:
|
||||
ret = hyp_request_donation(&completer_addr, tx);
|
||||
break;
|
||||
default:
|
||||
ret = -EINVAL;
|
||||
}
|
||||
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
switch (tx->completer.id) {
|
||||
case PKVM_ID_HOST:
|
||||
ret = host_ack_donation(completer_addr, tx);
|
||||
break;
|
||||
case PKVM_ID_HYP:
|
||||
ret = hyp_ack_donation(completer_addr, tx);
|
||||
break;
|
||||
default:
|
||||
ret = -EINVAL;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int __do_donate(struct pkvm_mem_donation *donation)
|
||||
{
|
||||
const struct pkvm_mem_transition *tx = &donation->tx;
|
||||
u64 completer_addr;
|
||||
int ret;
|
||||
|
||||
switch (tx->initiator.id) {
|
||||
case PKVM_ID_HOST:
|
||||
ret = host_initiate_donation(&completer_addr, tx);
|
||||
break;
|
||||
case PKVM_ID_HYP:
|
||||
ret = hyp_initiate_donation(&completer_addr, tx);
|
||||
break;
|
||||
default:
|
||||
ret = -EINVAL;
|
||||
}
|
||||
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
switch (tx->completer.id) {
|
||||
case PKVM_ID_HOST:
|
||||
ret = host_complete_donation(completer_addr, tx);
|
||||
break;
|
||||
case PKVM_ID_HYP:
|
||||
ret = hyp_complete_donation(completer_addr, tx);
|
||||
break;
|
||||
default:
|
||||
ret = -EINVAL;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* do_donate():
|
||||
*
|
||||
* The page owner transfers ownership to another component, losing access
|
||||
* as a consequence.
|
||||
*
|
||||
* Initiator: OWNED => NOPAGE
|
||||
* Completer: NOPAGE => OWNED
|
||||
*/
|
||||
static int do_donate(struct pkvm_mem_donation *donation)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = check_donation(donation);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
return WARN_ON(__do_donate(donation));
|
||||
}
|
||||
|
||||
int __pkvm_host_share_hyp(u64 pfn)
|
||||
{
|
||||
int ret;
|
||||
@ -797,3 +1119,112 @@ int __pkvm_host_unshare_hyp(u64 pfn)
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages)
|
||||
{
|
||||
int ret;
|
||||
u64 host_addr = hyp_pfn_to_phys(pfn);
|
||||
u64 hyp_addr = (u64)__hyp_va(host_addr);
|
||||
struct pkvm_mem_donation donation = {
|
||||
.tx = {
|
||||
.nr_pages = nr_pages,
|
||||
.initiator = {
|
||||
.id = PKVM_ID_HOST,
|
||||
.addr = host_addr,
|
||||
.host = {
|
||||
.completer_addr = hyp_addr,
|
||||
},
|
||||
},
|
||||
.completer = {
|
||||
.id = PKVM_ID_HYP,
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
host_lock_component();
|
||||
hyp_lock_component();
|
||||
|
||||
ret = do_donate(&donation);
|
||||
|
||||
hyp_unlock_component();
|
||||
host_unlock_component();
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages)
|
||||
{
|
||||
int ret;
|
||||
u64 host_addr = hyp_pfn_to_phys(pfn);
|
||||
u64 hyp_addr = (u64)__hyp_va(host_addr);
|
||||
struct pkvm_mem_donation donation = {
|
||||
.tx = {
|
||||
.nr_pages = nr_pages,
|
||||
.initiator = {
|
||||
.id = PKVM_ID_HYP,
|
||||
.addr = hyp_addr,
|
||||
.hyp = {
|
||||
.completer_addr = host_addr,
|
||||
},
|
||||
},
|
||||
.completer = {
|
||||
.id = PKVM_ID_HOST,
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
host_lock_component();
|
||||
hyp_lock_component();
|
||||
|
||||
ret = do_donate(&donation);
|
||||
|
||||
hyp_unlock_component();
|
||||
host_unlock_component();
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int hyp_pin_shared_mem(void *from, void *to)
|
||||
{
|
||||
u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE);
|
||||
u64 end = PAGE_ALIGN((u64)to);
|
||||
u64 size = end - start;
|
||||
int ret;
|
||||
|
||||
host_lock_component();
|
||||
hyp_lock_component();
|
||||
|
||||
ret = __host_check_page_state_range(__hyp_pa(start), size,
|
||||
PKVM_PAGE_SHARED_OWNED);
|
||||
if (ret)
|
||||
goto unlock;
|
||||
|
||||
ret = __hyp_check_page_state_range(start, size,
|
||||
PKVM_PAGE_SHARED_BORROWED);
|
||||
if (ret)
|
||||
goto unlock;
|
||||
|
||||
for (cur = start; cur < end; cur += PAGE_SIZE)
|
||||
hyp_page_ref_inc(hyp_virt_to_page(cur));
|
||||
|
||||
unlock:
|
||||
hyp_unlock_component();
|
||||
host_unlock_component();
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void hyp_unpin_shared_mem(void *from, void *to)
|
||||
{
|
||||
u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE);
|
||||
u64 end = PAGE_ALIGN((u64)to);
|
||||
|
||||
host_lock_component();
|
||||
hyp_lock_component();
|
||||
|
||||
for (cur = start; cur < end; cur += PAGE_SIZE)
|
||||
hyp_page_ref_dec(hyp_virt_to_page(cur));
|
||||
|
||||
hyp_unlock_component();
|
||||
host_unlock_component();
|
||||
}
|
||||
|
@ -14,6 +14,7 @@
|
||||
#include <nvhe/early_alloc.h>
|
||||
#include <nvhe/gfp.h>
|
||||
#include <nvhe/memory.h>
|
||||
#include <nvhe/mem_protect.h>
|
||||
#include <nvhe/mm.h>
|
||||
#include <nvhe/spinlock.h>
|
||||
|
||||
@ -25,6 +26,12 @@ unsigned int hyp_memblock_nr;
|
||||
|
||||
static u64 __io_map_base;
|
||||
|
||||
struct hyp_fixmap_slot {
|
||||
u64 addr;
|
||||
kvm_pte_t *ptep;
|
||||
};
|
||||
static DEFINE_PER_CPU(struct hyp_fixmap_slot, fixmap_slots);
|
||||
|
||||
static int __pkvm_create_mappings(unsigned long start, unsigned long size,
|
||||
unsigned long phys, enum kvm_pgtable_prot prot)
|
||||
{
|
||||
@ -129,13 +136,36 @@ int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
|
||||
return ret;
|
||||
}
|
||||
|
||||
int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, phys_addr_t back)
|
||||
int hyp_back_vmemmap(phys_addr_t back)
|
||||
{
|
||||
unsigned long start, end;
|
||||
unsigned long i, start, size, end = 0;
|
||||
int ret;
|
||||
|
||||
hyp_vmemmap_range(phys, size, &start, &end);
|
||||
for (i = 0; i < hyp_memblock_nr; i++) {
|
||||
start = hyp_memory[i].base;
|
||||
start = ALIGN_DOWN((u64)hyp_phys_to_page(start), PAGE_SIZE);
|
||||
/*
|
||||
* The begining of the hyp_vmemmap region for the current
|
||||
* memblock may already be backed by the page backing the end
|
||||
* the previous region, so avoid mapping it twice.
|
||||
*/
|
||||
start = max(start, end);
|
||||
|
||||
return __pkvm_create_mappings(start, end - start, back, PAGE_HYP);
|
||||
end = hyp_memory[i].base + hyp_memory[i].size;
|
||||
end = PAGE_ALIGN((u64)hyp_phys_to_page(end));
|
||||
if (start >= end)
|
||||
continue;
|
||||
|
||||
size = end - start;
|
||||
ret = __pkvm_create_mappings(start, size, back, PAGE_HYP);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
memset(hyp_phys_to_virt(back), 0, size);
|
||||
back += size;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void *__hyp_bp_vect_base;
|
||||
@ -189,6 +219,102 @@ int hyp_map_vectors(void)
|
||||
return 0;
|
||||
}
|
||||
|
||||
void *hyp_fixmap_map(phys_addr_t phys)
|
||||
{
|
||||
struct hyp_fixmap_slot *slot = this_cpu_ptr(&fixmap_slots);
|
||||
kvm_pte_t pte, *ptep = slot->ptep;
|
||||
|
||||
pte = *ptep;
|
||||
pte &= ~kvm_phys_to_pte(KVM_PHYS_INVALID);
|
||||
pte |= kvm_phys_to_pte(phys) | KVM_PTE_VALID;
|
||||
WRITE_ONCE(*ptep, pte);
|
||||
dsb(ishst);
|
||||
|
||||
return (void *)slot->addr;
|
||||
}
|
||||
|
||||
static void fixmap_clear_slot(struct hyp_fixmap_slot *slot)
|
||||
{
|
||||
kvm_pte_t *ptep = slot->ptep;
|
||||
u64 addr = slot->addr;
|
||||
|
||||
WRITE_ONCE(*ptep, *ptep & ~KVM_PTE_VALID);
|
||||
|
||||
/*
|
||||
* Irritatingly, the architecture requires that we use inner-shareable
|
||||
* broadcast TLB invalidation here in case another CPU speculates
|
||||
* through our fixmap and decides to create an "amalagamation of the
|
||||
* values held in the TLB" due to the apparent lack of a
|
||||
* break-before-make sequence.
|
||||
*
|
||||
* https://lore.kernel.org/kvm/20221017115209.2099-1-will@kernel.org/T/#mf10dfbaf1eaef9274c581b81c53758918c1d0f03
|
||||
*/
|
||||
dsb(ishst);
|
||||
__tlbi_level(vale2is, __TLBI_VADDR(addr, 0), (KVM_PGTABLE_MAX_LEVELS - 1));
|
||||
dsb(ish);
|
||||
isb();
|
||||
}
|
||||
|
||||
void hyp_fixmap_unmap(void)
|
||||
{
|
||||
fixmap_clear_slot(this_cpu_ptr(&fixmap_slots));
|
||||
}
|
||||
|
||||
static int __create_fixmap_slot_cb(const struct kvm_pgtable_visit_ctx *ctx,
|
||||
enum kvm_pgtable_walk_flags visit)
|
||||
{
|
||||
struct hyp_fixmap_slot *slot = per_cpu_ptr(&fixmap_slots, (u64)ctx->arg);
|
||||
|
||||
if (!kvm_pte_valid(ctx->old) || ctx->level != KVM_PGTABLE_MAX_LEVELS - 1)
|
||||
return -EINVAL;
|
||||
|
||||
slot->addr = ctx->addr;
|
||||
slot->ptep = ctx->ptep;
|
||||
|
||||
/*
|
||||
* Clear the PTE, but keep the page-table page refcount elevated to
|
||||
* prevent it from ever being freed. This lets us manipulate the PTEs
|
||||
* by hand safely without ever needing to allocate memory.
|
||||
*/
|
||||
fixmap_clear_slot(slot);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int create_fixmap_slot(u64 addr, u64 cpu)
|
||||
{
|
||||
struct kvm_pgtable_walker walker = {
|
||||
.cb = __create_fixmap_slot_cb,
|
||||
.flags = KVM_PGTABLE_WALK_LEAF,
|
||||
.arg = (void *)cpu,
|
||||
};
|
||||
|
||||
return kvm_pgtable_walk(&pkvm_pgtable, addr, PAGE_SIZE, &walker);
|
||||
}
|
||||
|
||||
int hyp_create_pcpu_fixmap(void)
|
||||
{
|
||||
unsigned long addr, i;
|
||||
int ret;
|
||||
|
||||
for (i = 0; i < hyp_nr_cpus; i++) {
|
||||
ret = pkvm_alloc_private_va_range(PAGE_SIZE, &addr);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = kvm_pgtable_hyp_map(&pkvm_pgtable, addr, PAGE_SIZE,
|
||||
__hyp_pa(__hyp_bss_start), PAGE_HYP);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = create_fixmap_slot(addr, i);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int hyp_create_idmap(u32 hyp_va_bits)
|
||||
{
|
||||
unsigned long start, end;
|
||||
@ -213,3 +339,36 @@ int hyp_create_idmap(u32 hyp_va_bits)
|
||||
|
||||
return __pkvm_create_mappings(start, end - start, start, PAGE_HYP_EXEC);
|
||||
}
|
||||
|
||||
static void *admit_host_page(void *arg)
|
||||
{
|
||||
struct kvm_hyp_memcache *host_mc = arg;
|
||||
|
||||
if (!host_mc->nr_pages)
|
||||
return NULL;
|
||||
|
||||
/*
|
||||
* The host still owns the pages in its memcache, so we need to go
|
||||
* through a full host-to-hyp donation cycle to change it. Fortunately,
|
||||
* __pkvm_host_donate_hyp() takes care of races for us, so if it
|
||||
* succeeds we're good to go.
|
||||
*/
|
||||
if (__pkvm_host_donate_hyp(hyp_phys_to_pfn(host_mc->head), 1))
|
||||
return NULL;
|
||||
|
||||
return pop_hyp_memcache(host_mc, hyp_phys_to_virt);
|
||||
}
|
||||
|
||||
/* Refill our local memcache by poping pages from the one provided by the host. */
|
||||
int refill_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages,
|
||||
struct kvm_hyp_memcache *host_mc)
|
||||
{
|
||||
struct kvm_hyp_memcache tmp = *host_mc;
|
||||
int ret;
|
||||
|
||||
ret = __topup_hyp_memcache(mc, min_pages, admit_host_page,
|
||||
hyp_virt_to_phys, &tmp);
|
||||
*host_mc = tmp;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
@ -93,11 +93,16 @@ static inline struct hyp_page *node_to_page(struct list_head *node)
|
||||
static void __hyp_attach_page(struct hyp_pool *pool,
|
||||
struct hyp_page *p)
|
||||
{
|
||||
phys_addr_t phys = hyp_page_to_phys(p);
|
||||
unsigned short order = p->order;
|
||||
struct hyp_page *buddy;
|
||||
|
||||
memset(hyp_page_to_virt(p), 0, PAGE_SIZE << p->order);
|
||||
|
||||
/* Skip coalescing for 'external' pages being freed into the pool. */
|
||||
if (phys < pool->range_start || phys >= pool->range_end)
|
||||
goto insert;
|
||||
|
||||
/*
|
||||
* Only the first struct hyp_page of a high-order page (otherwise known
|
||||
* as the 'head') should have p->order set. The non-head pages should
|
||||
@ -116,6 +121,7 @@ static void __hyp_attach_page(struct hyp_pool *pool,
|
||||
p = min(p, buddy);
|
||||
}
|
||||
|
||||
insert:
|
||||
/* Mark the new head, and insert it */
|
||||
p->order = order;
|
||||
page_add_to_list(p, &pool->free_area[order]);
|
||||
@ -144,25 +150,6 @@ static struct hyp_page *__hyp_extract_page(struct hyp_pool *pool,
|
||||
return p;
|
||||
}
|
||||
|
||||
static inline void hyp_page_ref_inc(struct hyp_page *p)
|
||||
{
|
||||
BUG_ON(p->refcount == USHRT_MAX);
|
||||
p->refcount++;
|
||||
}
|
||||
|
||||
static inline int hyp_page_ref_dec_and_test(struct hyp_page *p)
|
||||
{
|
||||
BUG_ON(!p->refcount);
|
||||
p->refcount--;
|
||||
return (p->refcount == 0);
|
||||
}
|
||||
|
||||
static inline void hyp_set_page_refcounted(struct hyp_page *p)
|
||||
{
|
||||
BUG_ON(p->refcount);
|
||||
p->refcount = 1;
|
||||
}
|
||||
|
||||
static void __hyp_put_page(struct hyp_pool *pool, struct hyp_page *p)
|
||||
{
|
||||
if (hyp_page_ref_dec_and_test(p))
|
||||
@ -249,10 +236,8 @@ int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages,
|
||||
|
||||
/* Init the vmemmap portion */
|
||||
p = hyp_phys_to_page(phys);
|
||||
for (i = 0; i < nr_pages; i++) {
|
||||
p[i].order = 0;
|
||||
for (i = 0; i < nr_pages; i++)
|
||||
hyp_set_page_refcounted(&p[i]);
|
||||
}
|
||||
|
||||
/* Attach the unused pages to the buddy tree */
|
||||
for (i = reserved_pages; i < nr_pages; i++)
|
||||
|
@ -7,8 +7,17 @@
|
||||
#include <linux/kvm_host.h>
|
||||
#include <linux/mm.h>
|
||||
#include <nvhe/fixed_config.h>
|
||||
#include <nvhe/mem_protect.h>
|
||||
#include <nvhe/memory.h>
|
||||
#include <nvhe/pkvm.h>
|
||||
#include <nvhe/trap_handler.h>
|
||||
|
||||
/* Used by icache_is_vpipt(). */
|
||||
unsigned long __icache_flags;
|
||||
|
||||
/* Used by kvm_get_vttbr(). */
|
||||
unsigned int kvm_arm_vmid_bits;
|
||||
|
||||
/*
|
||||
* Set trap register values based on features in ID_AA64PFR0.
|
||||
*/
|
||||
@ -183,3 +192,430 @@ void __pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu)
|
||||
pvm_init_traps_aa64mmfr0(vcpu);
|
||||
pvm_init_traps_aa64mmfr1(vcpu);
|
||||
}
|
||||
|
||||
/*
|
||||
* Start the VM table handle at the offset defined instead of at 0.
|
||||
* Mainly for sanity checking and debugging.
|
||||
*/
|
||||
#define HANDLE_OFFSET 0x1000
|
||||
|
||||
static unsigned int vm_handle_to_idx(pkvm_handle_t handle)
|
||||
{
|
||||
return handle - HANDLE_OFFSET;
|
||||
}
|
||||
|
||||
static pkvm_handle_t idx_to_vm_handle(unsigned int idx)
|
||||
{
|
||||
return idx + HANDLE_OFFSET;
|
||||
}
|
||||
|
||||
/*
|
||||
* Spinlock for protecting state related to the VM table. Protects writes
|
||||
* to 'vm_table' and 'nr_table_entries' as well as reads and writes to
|
||||
* 'last_hyp_vcpu_lookup'.
|
||||
*/
|
||||
static DEFINE_HYP_SPINLOCK(vm_table_lock);
|
||||
|
||||
/*
|
||||
* The table of VM entries for protected VMs in hyp.
|
||||
* Allocated at hyp initialization and setup.
|
||||
*/
|
||||
static struct pkvm_hyp_vm **vm_table;
|
||||
|
||||
void pkvm_hyp_vm_table_init(void *tbl)
|
||||
{
|
||||
WARN_ON(vm_table);
|
||||
vm_table = tbl;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return the hyp vm structure corresponding to the handle.
|
||||
*/
|
||||
static struct pkvm_hyp_vm *get_vm_by_handle(pkvm_handle_t handle)
|
||||
{
|
||||
unsigned int idx = vm_handle_to_idx(handle);
|
||||
|
||||
if (unlikely(idx >= KVM_MAX_PVMS))
|
||||
return NULL;
|
||||
|
||||
return vm_table[idx];
|
||||
}
|
||||
|
||||
struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle,
|
||||
unsigned int vcpu_idx)
|
||||
{
|
||||
struct pkvm_hyp_vcpu *hyp_vcpu = NULL;
|
||||
struct pkvm_hyp_vm *hyp_vm;
|
||||
|
||||
hyp_spin_lock(&vm_table_lock);
|
||||
hyp_vm = get_vm_by_handle(handle);
|
||||
if (!hyp_vm || hyp_vm->nr_vcpus <= vcpu_idx)
|
||||
goto unlock;
|
||||
|
||||
hyp_vcpu = hyp_vm->vcpus[vcpu_idx];
|
||||
hyp_page_ref_inc(hyp_virt_to_page(hyp_vm));
|
||||
unlock:
|
||||
hyp_spin_unlock(&vm_table_lock);
|
||||
return hyp_vcpu;
|
||||
}
|
||||
|
||||
void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu)
|
||||
{
|
||||
struct pkvm_hyp_vm *hyp_vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu);
|
||||
|
||||
hyp_spin_lock(&vm_table_lock);
|
||||
hyp_page_ref_dec(hyp_virt_to_page(hyp_vm));
|
||||
hyp_spin_unlock(&vm_table_lock);
|
||||
}
|
||||
|
||||
static void unpin_host_vcpu(struct kvm_vcpu *host_vcpu)
|
||||
{
|
||||
if (host_vcpu)
|
||||
hyp_unpin_shared_mem(host_vcpu, host_vcpu + 1);
|
||||
}
|
||||
|
||||
static void unpin_host_vcpus(struct pkvm_hyp_vcpu *hyp_vcpus[],
|
||||
unsigned int nr_vcpus)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < nr_vcpus; i++)
|
||||
unpin_host_vcpu(hyp_vcpus[i]->host_vcpu);
|
||||
}
|
||||
|
||||
static void init_pkvm_hyp_vm(struct kvm *host_kvm, struct pkvm_hyp_vm *hyp_vm,
|
||||
unsigned int nr_vcpus)
|
||||
{
|
||||
hyp_vm->host_kvm = host_kvm;
|
||||
hyp_vm->kvm.created_vcpus = nr_vcpus;
|
||||
hyp_vm->kvm.arch.vtcr = host_mmu.arch.vtcr;
|
||||
}
|
||||
|
||||
static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu,
|
||||
struct pkvm_hyp_vm *hyp_vm,
|
||||
struct kvm_vcpu *host_vcpu,
|
||||
unsigned int vcpu_idx)
|
||||
{
|
||||
int ret = 0;
|
||||
|
||||
if (hyp_pin_shared_mem(host_vcpu, host_vcpu + 1))
|
||||
return -EBUSY;
|
||||
|
||||
if (host_vcpu->vcpu_idx != vcpu_idx) {
|
||||
ret = -EINVAL;
|
||||
goto done;
|
||||
}
|
||||
|
||||
hyp_vcpu->host_vcpu = host_vcpu;
|
||||
|
||||
hyp_vcpu->vcpu.kvm = &hyp_vm->kvm;
|
||||
hyp_vcpu->vcpu.vcpu_id = READ_ONCE(host_vcpu->vcpu_id);
|
||||
hyp_vcpu->vcpu.vcpu_idx = vcpu_idx;
|
||||
|
||||
hyp_vcpu->vcpu.arch.hw_mmu = &hyp_vm->kvm.arch.mmu;
|
||||
hyp_vcpu->vcpu.arch.cflags = READ_ONCE(host_vcpu->arch.cflags);
|
||||
done:
|
||||
if (ret)
|
||||
unpin_host_vcpu(host_vcpu);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int find_free_vm_table_entry(struct kvm *host_kvm)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < KVM_MAX_PVMS; ++i) {
|
||||
if (!vm_table[i])
|
||||
return i;
|
||||
}
|
||||
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
/*
|
||||
* Allocate a VM table entry and insert a pointer to the new vm.
|
||||
*
|
||||
* Return a unique handle to the protected VM on success,
|
||||
* negative error code on failure.
|
||||
*/
|
||||
static pkvm_handle_t insert_vm_table_entry(struct kvm *host_kvm,
|
||||
struct pkvm_hyp_vm *hyp_vm)
|
||||
{
|
||||
struct kvm_s2_mmu *mmu = &hyp_vm->kvm.arch.mmu;
|
||||
int idx;
|
||||
|
||||
hyp_assert_lock_held(&vm_table_lock);
|
||||
|
||||
/*
|
||||
* Initializing protected state might have failed, yet a malicious
|
||||
* host could trigger this function. Thus, ensure that 'vm_table'
|
||||
* exists.
|
||||
*/
|
||||
if (unlikely(!vm_table))
|
||||
return -EINVAL;
|
||||
|
||||
idx = find_free_vm_table_entry(host_kvm);
|
||||
if (idx < 0)
|
||||
return idx;
|
||||
|
||||
hyp_vm->kvm.arch.pkvm.handle = idx_to_vm_handle(idx);
|
||||
|
||||
/* VMID 0 is reserved for the host */
|
||||
atomic64_set(&mmu->vmid.id, idx + 1);
|
||||
|
||||
mmu->arch = &hyp_vm->kvm.arch;
|
||||
mmu->pgt = &hyp_vm->pgt;
|
||||
|
||||
vm_table[idx] = hyp_vm;
|
||||
return hyp_vm->kvm.arch.pkvm.handle;
|
||||
}
|
||||
|
||||
/*
|
||||
* Deallocate and remove the VM table entry corresponding to the handle.
|
||||
*/
|
||||
static void remove_vm_table_entry(pkvm_handle_t handle)
|
||||
{
|
||||
hyp_assert_lock_held(&vm_table_lock);
|
||||
vm_table[vm_handle_to_idx(handle)] = NULL;
|
||||
}
|
||||
|
||||
static size_t pkvm_get_hyp_vm_size(unsigned int nr_vcpus)
|
||||
{
|
||||
return size_add(sizeof(struct pkvm_hyp_vm),
|
||||
size_mul(sizeof(struct pkvm_hyp_vcpu *), nr_vcpus));
|
||||
}
|
||||
|
||||
static void *map_donated_memory_noclear(unsigned long host_va, size_t size)
|
||||
{
|
||||
void *va = (void *)kern_hyp_va(host_va);
|
||||
|
||||
if (!PAGE_ALIGNED(va))
|
||||
return NULL;
|
||||
|
||||
if (__pkvm_host_donate_hyp(hyp_virt_to_pfn(va),
|
||||
PAGE_ALIGN(size) >> PAGE_SHIFT))
|
||||
return NULL;
|
||||
|
||||
return va;
|
||||
}
|
||||
|
||||
static void *map_donated_memory(unsigned long host_va, size_t size)
|
||||
{
|
||||
void *va = map_donated_memory_noclear(host_va, size);
|
||||
|
||||
if (va)
|
||||
memset(va, 0, size);
|
||||
|
||||
return va;
|
||||
}
|
||||
|
||||
static void __unmap_donated_memory(void *va, size_t size)
|
||||
{
|
||||
WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(va),
|
||||
PAGE_ALIGN(size) >> PAGE_SHIFT));
|
||||
}
|
||||
|
||||
static void unmap_donated_memory(void *va, size_t size)
|
||||
{
|
||||
if (!va)
|
||||
return;
|
||||
|
||||
memset(va, 0, size);
|
||||
__unmap_donated_memory(va, size);
|
||||
}
|
||||
|
||||
static void unmap_donated_memory_noclear(void *va, size_t size)
|
||||
{
|
||||
if (!va)
|
||||
return;
|
||||
|
||||
__unmap_donated_memory(va, size);
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize the hypervisor copy of the protected VM state using the
|
||||
* memory donated by the host.
|
||||
*
|
||||
* Unmaps the donated memory from the host at stage 2.
|
||||
*
|
||||
* host_kvm: A pointer to the host's struct kvm.
|
||||
* vm_hva: The host va of the area being donated for the VM state.
|
||||
* Must be page aligned.
|
||||
* pgd_hva: The host va of the area being donated for the stage-2 PGD for
|
||||
* the VM. Must be page aligned. Its size is implied by the VM's
|
||||
* VTCR.
|
||||
*
|
||||
* Return a unique handle to the protected VM on success,
|
||||
* negative error code on failure.
|
||||
*/
|
||||
int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva,
|
||||
unsigned long pgd_hva)
|
||||
{
|
||||
struct pkvm_hyp_vm *hyp_vm = NULL;
|
||||
size_t vm_size, pgd_size;
|
||||
unsigned int nr_vcpus;
|
||||
void *pgd = NULL;
|
||||
int ret;
|
||||
|
||||
ret = hyp_pin_shared_mem(host_kvm, host_kvm + 1);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
nr_vcpus = READ_ONCE(host_kvm->created_vcpus);
|
||||
if (nr_vcpus < 1) {
|
||||
ret = -EINVAL;
|
||||
goto err_unpin_kvm;
|
||||
}
|
||||
|
||||
vm_size = pkvm_get_hyp_vm_size(nr_vcpus);
|
||||
pgd_size = kvm_pgtable_stage2_pgd_size(host_mmu.arch.vtcr);
|
||||
|
||||
ret = -ENOMEM;
|
||||
|
||||
hyp_vm = map_donated_memory(vm_hva, vm_size);
|
||||
if (!hyp_vm)
|
||||
goto err_remove_mappings;
|
||||
|
||||
pgd = map_donated_memory_noclear(pgd_hva, pgd_size);
|
||||
if (!pgd)
|
||||
goto err_remove_mappings;
|
||||
|
||||
init_pkvm_hyp_vm(host_kvm, hyp_vm, nr_vcpus);
|
||||
|
||||
hyp_spin_lock(&vm_table_lock);
|
||||
ret = insert_vm_table_entry(host_kvm, hyp_vm);
|
||||
if (ret < 0)
|
||||
goto err_unlock;
|
||||
|
||||
ret = kvm_guest_prepare_stage2(hyp_vm, pgd);
|
||||
if (ret)
|
||||
goto err_remove_vm_table_entry;
|
||||
hyp_spin_unlock(&vm_table_lock);
|
||||
|
||||
return hyp_vm->kvm.arch.pkvm.handle;
|
||||
|
||||
err_remove_vm_table_entry:
|
||||
remove_vm_table_entry(hyp_vm->kvm.arch.pkvm.handle);
|
||||
err_unlock:
|
||||
hyp_spin_unlock(&vm_table_lock);
|
||||
err_remove_mappings:
|
||||
unmap_donated_memory(hyp_vm, vm_size);
|
||||
unmap_donated_memory(pgd, pgd_size);
|
||||
err_unpin_kvm:
|
||||
hyp_unpin_shared_mem(host_kvm, host_kvm + 1);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize the hypervisor copy of the protected vCPU state using the
|
||||
* memory donated by the host.
|
||||
*
|
||||
* handle: The handle for the protected vm.
|
||||
* host_vcpu: A pointer to the corresponding host vcpu.
|
||||
* vcpu_hva: The host va of the area being donated for the vcpu state.
|
||||
* Must be page aligned. The size of the area must be equal to
|
||||
* the page-aligned size of 'struct pkvm_hyp_vcpu'.
|
||||
* Return 0 on success, negative error code on failure.
|
||||
*/
|
||||
int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu,
|
||||
unsigned long vcpu_hva)
|
||||
{
|
||||
struct pkvm_hyp_vcpu *hyp_vcpu;
|
||||
struct pkvm_hyp_vm *hyp_vm;
|
||||
unsigned int idx;
|
||||
int ret;
|
||||
|
||||
hyp_vcpu = map_donated_memory(vcpu_hva, sizeof(*hyp_vcpu));
|
||||
if (!hyp_vcpu)
|
||||
return -ENOMEM;
|
||||
|
||||
hyp_spin_lock(&vm_table_lock);
|
||||
|
||||
hyp_vm = get_vm_by_handle(handle);
|
||||
if (!hyp_vm) {
|
||||
ret = -ENOENT;
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
idx = hyp_vm->nr_vcpus;
|
||||
if (idx >= hyp_vm->kvm.created_vcpus) {
|
||||
ret = -EINVAL;
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
ret = init_pkvm_hyp_vcpu(hyp_vcpu, hyp_vm, host_vcpu, idx);
|
||||
if (ret)
|
||||
goto unlock;
|
||||
|
||||
hyp_vm->vcpus[idx] = hyp_vcpu;
|
||||
hyp_vm->nr_vcpus++;
|
||||
unlock:
|
||||
hyp_spin_unlock(&vm_table_lock);
|
||||
|
||||
if (ret)
|
||||
unmap_donated_memory(hyp_vcpu, sizeof(*hyp_vcpu));
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void
|
||||
teardown_donated_memory(struct kvm_hyp_memcache *mc, void *addr, size_t size)
|
||||
{
|
||||
size = PAGE_ALIGN(size);
|
||||
memset(addr, 0, size);
|
||||
|
||||
for (void *start = addr; start < addr + size; start += PAGE_SIZE)
|
||||
push_hyp_memcache(mc, start, hyp_virt_to_phys);
|
||||
|
||||
unmap_donated_memory_noclear(addr, size);
|
||||
}
|
||||
|
||||
int __pkvm_teardown_vm(pkvm_handle_t handle)
|
||||
{
|
||||
struct kvm_hyp_memcache *mc;
|
||||
struct pkvm_hyp_vm *hyp_vm;
|
||||
struct kvm *host_kvm;
|
||||
unsigned int idx;
|
||||
size_t vm_size;
|
||||
int err;
|
||||
|
||||
hyp_spin_lock(&vm_table_lock);
|
||||
hyp_vm = get_vm_by_handle(handle);
|
||||
if (!hyp_vm) {
|
||||
err = -ENOENT;
|
||||
goto err_unlock;
|
||||
}
|
||||
|
||||
if (WARN_ON(hyp_page_count(hyp_vm))) {
|
||||
err = -EBUSY;
|
||||
goto err_unlock;
|
||||
}
|
||||
|
||||
host_kvm = hyp_vm->host_kvm;
|
||||
|
||||
/* Ensure the VMID is clean before it can be reallocated */
|
||||
__kvm_tlb_flush_vmid(&hyp_vm->kvm.arch.mmu);
|
||||
remove_vm_table_entry(handle);
|
||||
hyp_spin_unlock(&vm_table_lock);
|
||||
|
||||
/* Reclaim guest pages (including page-table pages) */
|
||||
mc = &host_kvm->arch.pkvm.teardown_mc;
|
||||
reclaim_guest_pages(hyp_vm, mc);
|
||||
unpin_host_vcpus(hyp_vm->vcpus, hyp_vm->nr_vcpus);
|
||||
|
||||
/* Push the metadata pages to the teardown memcache */
|
||||
for (idx = 0; idx < hyp_vm->nr_vcpus; ++idx) {
|
||||
struct pkvm_hyp_vcpu *hyp_vcpu = hyp_vm->vcpus[idx];
|
||||
|
||||
teardown_donated_memory(mc, hyp_vcpu, sizeof(*hyp_vcpu));
|
||||
}
|
||||
|
||||
vm_size = pkvm_get_hyp_vm_size(hyp_vm->kvm.created_vcpus);
|
||||
teardown_donated_memory(mc, hyp_vm, vm_size);
|
||||
hyp_unpin_shared_mem(host_kvm, host_kvm + 1);
|
||||
return 0;
|
||||
|
||||
err_unlock:
|
||||
hyp_spin_unlock(&vm_table_lock);
|
||||
return err;
|
||||
}
|
||||
|
@ -16,6 +16,7 @@
|
||||
#include <nvhe/memory.h>
|
||||
#include <nvhe/mem_protect.h>
|
||||
#include <nvhe/mm.h>
|
||||
#include <nvhe/pkvm.h>
|
||||
#include <nvhe/trap_handler.h>
|
||||
|
||||
unsigned long hyp_nr_cpus;
|
||||
@ -24,6 +25,7 @@ unsigned long hyp_nr_cpus;
|
||||
(unsigned long)__per_cpu_start)
|
||||
|
||||
static void *vmemmap_base;
|
||||
static void *vm_table_base;
|
||||
static void *hyp_pgt_base;
|
||||
static void *host_s2_pgt_base;
|
||||
static struct kvm_pgtable_mm_ops pkvm_pgtable_mm_ops;
|
||||
@ -31,16 +33,20 @@ static struct hyp_pool hpool;
|
||||
|
||||
static int divide_memory_pool(void *virt, unsigned long size)
|
||||
{
|
||||
unsigned long vstart, vend, nr_pages;
|
||||
unsigned long nr_pages;
|
||||
|
||||
hyp_early_alloc_init(virt, size);
|
||||
|
||||
hyp_vmemmap_range(__hyp_pa(virt), size, &vstart, &vend);
|
||||
nr_pages = (vend - vstart) >> PAGE_SHIFT;
|
||||
nr_pages = hyp_vmemmap_pages(sizeof(struct hyp_page));
|
||||
vmemmap_base = hyp_early_alloc_contig(nr_pages);
|
||||
if (!vmemmap_base)
|
||||
return -ENOMEM;
|
||||
|
||||
nr_pages = hyp_vm_table_pages();
|
||||
vm_table_base = hyp_early_alloc_contig(nr_pages);
|
||||
if (!vm_table_base)
|
||||
return -ENOMEM;
|
||||
|
||||
nr_pages = hyp_s1_pgtable_pages();
|
||||
hyp_pgt_base = hyp_early_alloc_contig(nr_pages);
|
||||
if (!hyp_pgt_base)
|
||||
@ -78,7 +84,7 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size,
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = hyp_back_vmemmap(phys, size, hyp_virt_to_phys(vmemmap_base));
|
||||
ret = hyp_back_vmemmap(hyp_virt_to_phys(vmemmap_base));
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
@ -138,20 +144,17 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size,
|
||||
}
|
||||
|
||||
/*
|
||||
* Map the host's .bss and .rodata sections RO in the hypervisor, but
|
||||
* transfer the ownership from the host to the hypervisor itself to
|
||||
* make sure it can't be donated or shared with another entity.
|
||||
* Map the host sections RO in the hypervisor, but transfer the
|
||||
* ownership from the host to the hypervisor itself to make sure they
|
||||
* can't be donated or shared with another entity.
|
||||
*
|
||||
* The ownership transition requires matching changes in the host
|
||||
* stage-2. This will be done later (see finalize_host_mappings()) once
|
||||
* the hyp_vmemmap is addressable.
|
||||
*/
|
||||
prot = pkvm_mkstate(PAGE_HYP_RO, PKVM_PAGE_SHARED_OWNED);
|
||||
ret = pkvm_create_mappings(__start_rodata, __end_rodata, prot);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = pkvm_create_mappings(__hyp_bss_end, __bss_stop, prot);
|
||||
ret = pkvm_create_mappings(&kvm_vgic_global_state,
|
||||
&kvm_vgic_global_state + 1, prot);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
@ -186,33 +189,20 @@ static void hpool_put_page(void *addr)
|
||||
hyp_put_page(&hpool, addr);
|
||||
}
|
||||
|
||||
static int finalize_host_mappings_walker(u64 addr, u64 end, u32 level,
|
||||
kvm_pte_t *ptep,
|
||||
enum kvm_pgtable_walk_flags flag,
|
||||
void * const arg)
|
||||
static int fix_host_ownership_walker(const struct kvm_pgtable_visit_ctx *ctx,
|
||||
enum kvm_pgtable_walk_flags visit)
|
||||
{
|
||||
struct kvm_pgtable_mm_ops *mm_ops = arg;
|
||||
enum kvm_pgtable_prot prot;
|
||||
enum pkvm_page_state state;
|
||||
kvm_pte_t pte = *ptep;
|
||||
phys_addr_t phys;
|
||||
|
||||
if (!kvm_pte_valid(pte))
|
||||
if (!kvm_pte_valid(ctx->old))
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* Fix-up the refcount for the page-table pages as the early allocator
|
||||
* was unable to access the hyp_vmemmap and so the buddy allocator has
|
||||
* initialised the refcount to '1'.
|
||||
*/
|
||||
mm_ops->get_page(ptep);
|
||||
if (flag != KVM_PGTABLE_WALK_LEAF)
|
||||
return 0;
|
||||
|
||||
if (level != (KVM_PGTABLE_MAX_LEVELS - 1))
|
||||
if (ctx->level != (KVM_PGTABLE_MAX_LEVELS - 1))
|
||||
return -EINVAL;
|
||||
|
||||
phys = kvm_pte_to_phys(pte);
|
||||
phys = kvm_pte_to_phys(ctx->old);
|
||||
if (!addr_is_memory(phys))
|
||||
return -EINVAL;
|
||||
|
||||
@ -220,10 +210,10 @@ static int finalize_host_mappings_walker(u64 addr, u64 end, u32 level,
|
||||
* Adjust the host stage-2 mappings to match the ownership attributes
|
||||
* configured in the hypervisor stage-1.
|
||||
*/
|
||||
state = pkvm_getstate(kvm_pgtable_hyp_pte_prot(pte));
|
||||
state = pkvm_getstate(kvm_pgtable_hyp_pte_prot(ctx->old));
|
||||
switch (state) {
|
||||
case PKVM_PAGE_OWNED:
|
||||
return host_stage2_set_owner_locked(phys, PAGE_SIZE, pkvm_hyp_id);
|
||||
return host_stage2_set_owner_locked(phys, PAGE_SIZE, PKVM_ID_HYP);
|
||||
case PKVM_PAGE_SHARED_OWNED:
|
||||
prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_BORROWED);
|
||||
break;
|
||||
@ -237,12 +227,25 @@ static int finalize_host_mappings_walker(u64 addr, u64 end, u32 level,
|
||||
return host_stage2_idmap_locked(phys, PAGE_SIZE, prot);
|
||||
}
|
||||
|
||||
static int finalize_host_mappings(void)
|
||||
static int fix_hyp_pgtable_refcnt_walker(const struct kvm_pgtable_visit_ctx *ctx,
|
||||
enum kvm_pgtable_walk_flags visit)
|
||||
{
|
||||
/*
|
||||
* Fix-up the refcount for the page-table pages as the early allocator
|
||||
* was unable to access the hyp_vmemmap and so the buddy allocator has
|
||||
* initialised the refcount to '1'.
|
||||
*/
|
||||
if (kvm_pte_valid(ctx->old))
|
||||
ctx->mm_ops->get_page(ctx->ptep);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int fix_host_ownership(void)
|
||||
{
|
||||
struct kvm_pgtable_walker walker = {
|
||||
.cb = finalize_host_mappings_walker,
|
||||
.flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
|
||||
.arg = pkvm_pgtable.mm_ops,
|
||||
.cb = fix_host_ownership_walker,
|
||||
.flags = KVM_PGTABLE_WALK_LEAF,
|
||||
};
|
||||
int i, ret;
|
||||
|
||||
@ -258,6 +261,18 @@ static int finalize_host_mappings(void)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int fix_hyp_pgtable_refcnt(void)
|
||||
{
|
||||
struct kvm_pgtable_walker walker = {
|
||||
.cb = fix_hyp_pgtable_refcnt_walker,
|
||||
.flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
|
||||
.arg = pkvm_pgtable.mm_ops,
|
||||
};
|
||||
|
||||
return kvm_pgtable_walk(&pkvm_pgtable, 0, BIT(pkvm_pgtable.ia_bits),
|
||||
&walker);
|
||||
}
|
||||
|
||||
void __noreturn __pkvm_init_finalise(void)
|
||||
{
|
||||
struct kvm_host_data *host_data = this_cpu_ptr(&kvm_host_data);
|
||||
@ -287,10 +302,19 @@ void __noreturn __pkvm_init_finalise(void)
|
||||
};
|
||||
pkvm_pgtable.mm_ops = &pkvm_pgtable_mm_ops;
|
||||
|
||||
ret = finalize_host_mappings();
|
||||
ret = fix_host_ownership();
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
ret = fix_hyp_pgtable_refcnt();
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
ret = hyp_create_pcpu_fixmap();
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
pkvm_hyp_vm_table_init(vm_table_base);
|
||||
out:
|
||||
/*
|
||||
* We tail-called to here from handle___pkvm_init() and will not return,
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,6 +1,6 @@
|
||||
# SPDX-License-Identifier: GPL-2.0
|
||||
#
|
||||
# Makefile for Kernel-based Virtual Machine module, HYP/nVHE part
|
||||
# Makefile for Kernel-based Virtual Machine module, HYP/VHE part
|
||||
#
|
||||
|
||||
asflags-y := -D__KVM_VHE_HYPERVISOR__
|
||||
|
@ -1,16 +0,0 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* irq.h: in kernel interrupt controller related definitions
|
||||
* Copyright (c) 2016 Red Hat, Inc.
|
||||
*
|
||||
* This header is included by irqchip.c. However, on ARM, interrupt
|
||||
* controller declarations are located in include/kvm/arm_vgic.h since
|
||||
* they are mostly shared between arm and arm64.
|
||||
*/
|
||||
|
||||
#ifndef __IRQ_H
|
||||
#define __IRQ_H
|
||||
|
||||
#include <kvm/arm_vgic.h>
|
||||
|
||||
#endif
|
@ -128,6 +128,25 @@ static void kvm_s2_free_pages_exact(void *virt, size_t size)
|
||||
free_pages_exact(virt, size);
|
||||
}
|
||||
|
||||
static struct kvm_pgtable_mm_ops kvm_s2_mm_ops;
|
||||
|
||||
static void stage2_free_removed_table_rcu_cb(struct rcu_head *head)
|
||||
{
|
||||
struct page *page = container_of(head, struct page, rcu_head);
|
||||
void *pgtable = page_to_virt(page);
|
||||
u32 level = page_private(page);
|
||||
|
||||
kvm_pgtable_stage2_free_removed(&kvm_s2_mm_ops, pgtable, level);
|
||||
}
|
||||
|
||||
static void stage2_free_removed_table(void *addr, u32 level)
|
||||
{
|
||||
struct page *page = virt_to_page(addr);
|
||||
|
||||
set_page_private(page, (unsigned long)level);
|
||||
call_rcu(&page->rcu_head, stage2_free_removed_table_rcu_cb);
|
||||
}
|
||||
|
||||
static void kvm_host_get_page(void *addr)
|
||||
{
|
||||
get_page(virt_to_page(addr));
|
||||
@ -640,8 +659,8 @@ static struct kvm_pgtable_mm_ops kvm_user_mm_ops = {
|
||||
static int get_user_mapping_size(struct kvm *kvm, u64 addr)
|
||||
{
|
||||
struct kvm_pgtable pgt = {
|
||||
.pgd = (kvm_pte_t *)kvm->mm->pgd,
|
||||
.ia_bits = VA_BITS,
|
||||
.pgd = (kvm_pteref_t)kvm->mm->pgd,
|
||||
.ia_bits = vabits_actual,
|
||||
.start_level = (KVM_PGTABLE_MAX_LEVELS -
|
||||
CONFIG_PGTABLE_LEVELS),
|
||||
.mm_ops = &kvm_user_mm_ops,
|
||||
@ -662,6 +681,7 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
|
||||
.zalloc_page = stage2_memcache_zalloc_page,
|
||||
.zalloc_pages_exact = kvm_s2_zalloc_pages_exact,
|
||||
.free_pages_exact = kvm_s2_free_pages_exact,
|
||||
.free_removed_table = stage2_free_removed_table,
|
||||
.get_page = kvm_host_get_page,
|
||||
.put_page = kvm_s2_put_page,
|
||||
.page_count = kvm_host_page_count,
|
||||
@ -675,15 +695,42 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
|
||||
* kvm_init_stage2_mmu - Initialise a S2 MMU structure
|
||||
* @kvm: The pointer to the KVM structure
|
||||
* @mmu: The pointer to the s2 MMU structure
|
||||
* @type: The machine type of the virtual machine
|
||||
*
|
||||
* Allocates only the stage-2 HW PGD level table(s).
|
||||
* Note we don't need locking here as this is only called when the VM is
|
||||
* created, which can only be done once.
|
||||
*/
|
||||
int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
|
||||
int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type)
|
||||
{
|
||||
u32 kvm_ipa_limit = get_kvm_ipa_limit();
|
||||
int cpu, err;
|
||||
struct kvm_pgtable *pgt;
|
||||
u64 mmfr0, mmfr1;
|
||||
u32 phys_shift;
|
||||
|
||||
if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK)
|
||||
return -EINVAL;
|
||||
|
||||
phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type);
|
||||
if (is_protected_kvm_enabled()) {
|
||||
phys_shift = kvm_ipa_limit;
|
||||
} else if (phys_shift) {
|
||||
if (phys_shift > kvm_ipa_limit ||
|
||||
phys_shift < ARM64_MIN_PARANGE_BITS)
|
||||
return -EINVAL;
|
||||
} else {
|
||||
phys_shift = KVM_PHYS_SHIFT;
|
||||
if (phys_shift > kvm_ipa_limit) {
|
||||
pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n",
|
||||
current->comm);
|
||||
return -EINVAL;
|
||||
}
|
||||
}
|
||||
|
||||
mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
|
||||
mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
|
||||
kvm->arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);
|
||||
|
||||
if (mmu->pgt != NULL) {
|
||||
kvm_err("kvm_arch already initialized?\n");
|
||||
@ -807,6 +854,32 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
|
||||
}
|
||||
}
|
||||
|
||||
static void hyp_mc_free_fn(void *addr, void *unused)
|
||||
{
|
||||
free_page((unsigned long)addr);
|
||||
}
|
||||
|
||||
static void *hyp_mc_alloc_fn(void *unused)
|
||||
{
|
||||
return (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
|
||||
}
|
||||
|
||||
void free_hyp_memcache(struct kvm_hyp_memcache *mc)
|
||||
{
|
||||
if (is_protected_kvm_enabled())
|
||||
__free_hyp_memcache(mc, hyp_mc_free_fn,
|
||||
kvm_host_va, NULL);
|
||||
}
|
||||
|
||||
int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages)
|
||||
{
|
||||
if (!is_protected_kvm_enabled())
|
||||
return 0;
|
||||
|
||||
return __topup_hyp_memcache(mc, min_pages, hyp_mc_alloc_fn,
|
||||
kvm_host_pa, NULL);
|
||||
}
|
||||
|
||||
/**
|
||||
* kvm_phys_addr_ioremap - map a device range to guest IPA
|
||||
*
|
||||
@ -841,7 +914,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
|
||||
|
||||
write_lock(&kvm->mmu_lock);
|
||||
ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot,
|
||||
&cache);
|
||||
&cache, 0);
|
||||
write_unlock(&kvm->mmu_lock);
|
||||
if (ret)
|
||||
break;
|
||||
@ -1091,32 +1164,26 @@ static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva)
|
||||
* - mmap_lock protects between a VM faulting a page in and the VMM performing
|
||||
* an mprotect() to add VM_MTE
|
||||
*/
|
||||
static int sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
|
||||
unsigned long size)
|
||||
static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
|
||||
unsigned long size)
|
||||
{
|
||||
unsigned long i, nr_pages = size >> PAGE_SHIFT;
|
||||
struct page *page;
|
||||
struct page *page = pfn_to_page(pfn);
|
||||
|
||||
if (!kvm_has_mte(kvm))
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* pfn_to_online_page() is used to reject ZONE_DEVICE pages
|
||||
* that may not support tags.
|
||||
*/
|
||||
page = pfn_to_online_page(pfn);
|
||||
|
||||
if (!page)
|
||||
return -EFAULT;
|
||||
return;
|
||||
|
||||
for (i = 0; i < nr_pages; i++, page++) {
|
||||
if (!test_bit(PG_mte_tagged, &page->flags)) {
|
||||
if (try_page_mte_tagging(page)) {
|
||||
mte_clear_page_tags(page_address(page));
|
||||
set_bit(PG_mte_tagged, &page->flags);
|
||||
set_page_mte_tagged(page);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
|
||||
{
|
||||
return vma->vm_flags & VM_MTE_ALLOWED;
|
||||
}
|
||||
|
||||
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
|
||||
@ -1127,7 +1194,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
|
||||
bool write_fault, writable, force_pte = false;
|
||||
bool exec_fault;
|
||||
bool device = false;
|
||||
bool shared;
|
||||
unsigned long mmu_seq;
|
||||
struct kvm *kvm = vcpu->kvm;
|
||||
struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
|
||||
@ -1136,7 +1202,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
|
||||
gfn_t gfn;
|
||||
kvm_pfn_t pfn;
|
||||
bool logging_active = memslot_is_logging(memslot);
|
||||
bool use_read_lock = false;
|
||||
unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu);
|
||||
unsigned long vma_pagesize, fault_granule;
|
||||
enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
|
||||
@ -1171,14 +1236,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
|
||||
if (logging_active) {
|
||||
force_pte = true;
|
||||
vma_shift = PAGE_SHIFT;
|
||||
use_read_lock = (fault_status == FSC_PERM && write_fault &&
|
||||
fault_granule == PAGE_SIZE);
|
||||
} else {
|
||||
vma_shift = get_vma_page_shift(vma, hva);
|
||||
}
|
||||
|
||||
shared = (vma->vm_flags & VM_SHARED);
|
||||
|
||||
switch (vma_shift) {
|
||||
#ifndef __PAGETABLE_PMD_FOLDED
|
||||
case PUD_SHIFT:
|
||||
@ -1239,7 +1300,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
|
||||
*/
|
||||
smp_rmb();
|
||||
|
||||
pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
|
||||
pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL,
|
||||
write_fault, &writable, NULL);
|
||||
if (pfn == KVM_PFN_ERR_HWPOISON) {
|
||||
kvm_send_hwpoison_signal(hva, vma_shift);
|
||||
@ -1271,15 +1332,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
|
||||
if (exec_fault && device)
|
||||
return -ENOEXEC;
|
||||
|
||||
/*
|
||||
* To reduce MMU contentions and enhance concurrency during dirty
|
||||
* logging dirty logging, only acquire read lock for permission
|
||||
* relaxation.
|
||||
*/
|
||||
if (use_read_lock)
|
||||
read_lock(&kvm->mmu_lock);
|
||||
else
|
||||
write_lock(&kvm->mmu_lock);
|
||||
read_lock(&kvm->mmu_lock);
|
||||
pgt = vcpu->arch.hw_mmu->pgt;
|
||||
if (mmu_invalidate_retry(kvm, mmu_seq))
|
||||
goto out_unlock;
|
||||
@ -1298,13 +1351,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
|
||||
}
|
||||
|
||||
if (fault_status != FSC_PERM && !device && kvm_has_mte(kvm)) {
|
||||
/* Check the VMM hasn't introduced a new VM_SHARED VMA */
|
||||
if (!shared)
|
||||
ret = sanitise_mte_tags(kvm, pfn, vma_pagesize);
|
||||
else
|
||||
/* Check the VMM hasn't introduced a new disallowed VMA */
|
||||
if (kvm_vma_mte_allowed(vma)) {
|
||||
sanitise_mte_tags(kvm, pfn, vma_pagesize);
|
||||
} else {
|
||||
ret = -EFAULT;
|
||||
if (ret)
|
||||
goto out_unlock;
|
||||
}
|
||||
}
|
||||
|
||||
if (writable)
|
||||
@ -1323,15 +1376,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
|
||||
* permissions only if vma_pagesize equals fault_granule. Otherwise,
|
||||
* kvm_pgtable_stage2_map() should be called to change block size.
|
||||
*/
|
||||
if (fault_status == FSC_PERM && vma_pagesize == fault_granule) {
|
||||
if (fault_status == FSC_PERM && vma_pagesize == fault_granule)
|
||||
ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
|
||||
} else {
|
||||
WARN_ONCE(use_read_lock, "Attempted stage-2 map outside of write lock\n");
|
||||
|
||||
else
|
||||
ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
|
||||
__pfn_to_phys(pfn), prot,
|
||||
memcache);
|
||||
}
|
||||
memcache, KVM_PGTABLE_WALK_SHARED);
|
||||
|
||||
/* Mark the page dirty only if the fault is handled successfully */
|
||||
if (writable && !ret) {
|
||||
@ -1340,10 +1390,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
|
||||
}
|
||||
|
||||
out_unlock:
|
||||
if (use_read_lock)
|
||||
read_unlock(&kvm->mmu_lock);
|
||||
else
|
||||
write_unlock(&kvm->mmu_lock);
|
||||
read_unlock(&kvm->mmu_lock);
|
||||
kvm_set_pfn_accessed(pfn);
|
||||
kvm_release_pfn_clean(pfn);
|
||||
return ret != -EAGAIN ? ret : 0;
|
||||
@ -1526,15 +1573,18 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
|
||||
bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
|
||||
{
|
||||
kvm_pfn_t pfn = pte_pfn(range->pte);
|
||||
int ret;
|
||||
|
||||
if (!kvm->arch.mmu.pgt)
|
||||
return false;
|
||||
|
||||
WARN_ON(range->end - range->start != 1);
|
||||
|
||||
ret = sanitise_mte_tags(kvm, pfn, PAGE_SIZE);
|
||||
if (ret)
|
||||
/*
|
||||
* If the page isn't tagged, defer to user_mem_abort() for sanitising
|
||||
* the MTE tags. The S2 pte should have been unmapped by
|
||||
* mmu_notifier_invalidate_range_end().
|
||||
*/
|
||||
if (kvm_has_mte(kvm) && !page_mte_tagged(pfn_to_page(pfn)))
|
||||
return false;
|
||||
|
||||
/*
|
||||
@ -1549,7 +1599,7 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
|
||||
*/
|
||||
kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT,
|
||||
PAGE_SIZE, __pfn_to_phys(pfn),
|
||||
KVM_PGTABLE_PROT_R, NULL);
|
||||
KVM_PGTABLE_PROT_R, NULL, 0);
|
||||
|
||||
return false;
|
||||
}
|
||||
@ -1618,6 +1668,8 @@ static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = {
|
||||
int kvm_mmu_init(u32 *hyp_va_bits)
|
||||
{
|
||||
int err;
|
||||
u32 idmap_bits;
|
||||
u32 kernel_bits;
|
||||
|
||||
hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
|
||||
hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
|
||||
@ -1631,7 +1683,31 @@ int kvm_mmu_init(u32 *hyp_va_bits)
|
||||
*/
|
||||
BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
|
||||
|
||||
*hyp_va_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET);
|
||||
/*
|
||||
* The ID map may be configured to use an extended virtual address
|
||||
* range. This is only the case if system RAM is out of range for the
|
||||
* currently configured page size and VA_BITS_MIN, in which case we will
|
||||
* also need the extended virtual range for the HYP ID map, or we won't
|
||||
* be able to enable the EL2 MMU.
|
||||
*
|
||||
* However, in some cases the ID map may be configured for fewer than
|
||||
* the number of VA bits used by the regular kernel stage 1. This
|
||||
* happens when VA_BITS=52 and the kernel image is placed in PA space
|
||||
* below 48 bits.
|
||||
*
|
||||
* At EL2, there is only one TTBR register, and we can't switch between
|
||||
* translation tables *and* update TCR_EL2.T0SZ at the same time. Bottom
|
||||
* line: we need to use the extended range with *both* our translation
|
||||
* tables.
|
||||
*
|
||||
* So use the maximum of the idmap VA bits and the regular kernel stage
|
||||
* 1 VA bits to assure that the hypervisor can both ID map its code page
|
||||
* and map any kernel memory.
|
||||
*/
|
||||
idmap_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET);
|
||||
kernel_bits = vabits_actual;
|
||||
*hyp_va_bits = max(idmap_bits, kernel_bits);
|
||||
|
||||
kvm_debug("Using %u-bit virtual addresses at EL2\n", *hyp_va_bits);
|
||||
kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
|
||||
kvm_debug("HYP VA range: %lx:%lx\n",
|
||||
@ -1740,12 +1816,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
|
||||
if (!vma)
|
||||
break;
|
||||
|
||||
/*
|
||||
* VM_SHARED mappings are not allowed with MTE to avoid races
|
||||
* when updating the PG_mte_tagged page flag, see
|
||||
* sanitise_mte_tags for more details.
|
||||
*/
|
||||
if (kvm_has_mte(kvm) && vma->vm_flags & VM_SHARED) {
|
||||
if (kvm_has_mte(kvm) && !kvm_vma_mte_allowed(vma)) {
|
||||
ret = -EINVAL;
|
||||
break;
|
||||
}
|
||||
|
@ -6,6 +6,7 @@
|
||||
|
||||
#include <linux/kvm_host.h>
|
||||
#include <linux/memblock.h>
|
||||
#include <linux/mutex.h>
|
||||
#include <linux/sort.h>
|
||||
|
||||
#include <asm/kvm_pkvm.h>
|
||||
@ -53,7 +54,7 @@ static int __init register_memblock_regions(void)
|
||||
|
||||
void __init kvm_hyp_reserve(void)
|
||||
{
|
||||
u64 nr_pages, prev, hyp_mem_pages = 0;
|
||||
u64 hyp_mem_pages = 0;
|
||||
int ret;
|
||||
|
||||
if (!is_hyp_mode_available() || is_kernel_in_hyp_mode())
|
||||
@ -71,21 +72,8 @@ void __init kvm_hyp_reserve(void)
|
||||
|
||||
hyp_mem_pages += hyp_s1_pgtable_pages();
|
||||
hyp_mem_pages += host_s2_pgtable_pages();
|
||||
|
||||
/*
|
||||
* The hyp_vmemmap needs to be backed by pages, but these pages
|
||||
* themselves need to be present in the vmemmap, so compute the number
|
||||
* of pages needed by looking for a fixed point.
|
||||
*/
|
||||
nr_pages = 0;
|
||||
do {
|
||||
prev = nr_pages;
|
||||
nr_pages = hyp_mem_pages + prev;
|
||||
nr_pages = DIV_ROUND_UP(nr_pages * STRUCT_HYP_PAGE_SIZE,
|
||||
PAGE_SIZE);
|
||||
nr_pages += __hyp_pgtable_max_pages(nr_pages);
|
||||
} while (nr_pages != prev);
|
||||
hyp_mem_pages += nr_pages;
|
||||
hyp_mem_pages += hyp_vm_table_pages();
|
||||
hyp_mem_pages += hyp_vmemmap_pages(STRUCT_HYP_PAGE_SIZE);
|
||||
|
||||
/*
|
||||
* Try to allocate a PMD-aligned region to reduce TLB pressure once
|
||||
@ -107,3 +95,121 @@ void __init kvm_hyp_reserve(void)
|
||||
kvm_info("Reserved %lld MiB at 0x%llx\n", hyp_mem_size >> 20,
|
||||
hyp_mem_base);
|
||||
}
|
||||
|
||||
/*
|
||||
* Allocates and donates memory for hypervisor VM structs at EL2.
|
||||
*
|
||||
* Allocates space for the VM state, which includes the hyp vm as well as
|
||||
* the hyp vcpus.
|
||||
*
|
||||
* Stores an opaque handler in the kvm struct for future reference.
|
||||
*
|
||||
* Return 0 on success, negative error code on failure.
|
||||
*/
|
||||
static int __pkvm_create_hyp_vm(struct kvm *host_kvm)
|
||||
{
|
||||
size_t pgd_sz, hyp_vm_sz, hyp_vcpu_sz;
|
||||
struct kvm_vcpu *host_vcpu;
|
||||
pkvm_handle_t handle;
|
||||
void *pgd, *hyp_vm;
|
||||
unsigned long idx;
|
||||
int ret;
|
||||
|
||||
if (host_kvm->created_vcpus < 1)
|
||||
return -EINVAL;
|
||||
|
||||
pgd_sz = kvm_pgtable_stage2_pgd_size(host_kvm->arch.vtcr);
|
||||
|
||||
/*
|
||||
* The PGD pages will be reclaimed using a hyp_memcache which implies
|
||||
* page granularity. So, use alloc_pages_exact() to get individual
|
||||
* refcounts.
|
||||
*/
|
||||
pgd = alloc_pages_exact(pgd_sz, GFP_KERNEL_ACCOUNT);
|
||||
if (!pgd)
|
||||
return -ENOMEM;
|
||||
|
||||
/* Allocate memory to donate to hyp for vm and vcpu pointers. */
|
||||
hyp_vm_sz = PAGE_ALIGN(size_add(PKVM_HYP_VM_SIZE,
|
||||
size_mul(sizeof(void *),
|
||||
host_kvm->created_vcpus)));
|
||||
hyp_vm = alloc_pages_exact(hyp_vm_sz, GFP_KERNEL_ACCOUNT);
|
||||
if (!hyp_vm) {
|
||||
ret = -ENOMEM;
|
||||
goto free_pgd;
|
||||
}
|
||||
|
||||
/* Donate the VM memory to hyp and let hyp initialize it. */
|
||||
ret = kvm_call_hyp_nvhe(__pkvm_init_vm, host_kvm, hyp_vm, pgd);
|
||||
if (ret < 0)
|
||||
goto free_vm;
|
||||
|
||||
handle = ret;
|
||||
|
||||
host_kvm->arch.pkvm.handle = handle;
|
||||
|
||||
/* Donate memory for the vcpus at hyp and initialize it. */
|
||||
hyp_vcpu_sz = PAGE_ALIGN(PKVM_HYP_VCPU_SIZE);
|
||||
kvm_for_each_vcpu(idx, host_vcpu, host_kvm) {
|
||||
void *hyp_vcpu;
|
||||
|
||||
/* Indexing of the vcpus to be sequential starting at 0. */
|
||||
if (WARN_ON(host_vcpu->vcpu_idx != idx)) {
|
||||
ret = -EINVAL;
|
||||
goto destroy_vm;
|
||||
}
|
||||
|
||||
hyp_vcpu = alloc_pages_exact(hyp_vcpu_sz, GFP_KERNEL_ACCOUNT);
|
||||
if (!hyp_vcpu) {
|
||||
ret = -ENOMEM;
|
||||
goto destroy_vm;
|
||||
}
|
||||
|
||||
ret = kvm_call_hyp_nvhe(__pkvm_init_vcpu, handle, host_vcpu,
|
||||
hyp_vcpu);
|
||||
if (ret) {
|
||||
free_pages_exact(hyp_vcpu, hyp_vcpu_sz);
|
||||
goto destroy_vm;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
destroy_vm:
|
||||
pkvm_destroy_hyp_vm(host_kvm);
|
||||
return ret;
|
||||
free_vm:
|
||||
free_pages_exact(hyp_vm, hyp_vm_sz);
|
||||
free_pgd:
|
||||
free_pages_exact(pgd, pgd_sz);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int pkvm_create_hyp_vm(struct kvm *host_kvm)
|
||||
{
|
||||
int ret = 0;
|
||||
|
||||
mutex_lock(&host_kvm->lock);
|
||||
if (!host_kvm->arch.pkvm.handle)
|
||||
ret = __pkvm_create_hyp_vm(host_kvm);
|
||||
mutex_unlock(&host_kvm->lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void pkvm_destroy_hyp_vm(struct kvm *host_kvm)
|
||||
{
|
||||
if (host_kvm->arch.pkvm.handle) {
|
||||
WARN_ON(kvm_call_hyp_nvhe(__pkvm_teardown_vm,
|
||||
host_kvm->arch.pkvm.handle));
|
||||
}
|
||||
|
||||
host_kvm->arch.pkvm.handle = 0;
|
||||
free_hyp_memcache(&host_kvm->arch.pkvm.teardown_mc);
|
||||
}
|
||||
|
||||
int pkvm_init_host_vm(struct kvm *host_kvm)
|
||||
{
|
||||
mutex_init(&host_kvm->lock);
|
||||
return 0;
|
||||
}
|
||||
|
@ -15,16 +15,25 @@
|
||||
#include <kvm/arm_pmu.h>
|
||||
#include <kvm/arm_vgic.h>
|
||||
|
||||
#define PERF_ATTR_CFG1_COUNTER_64BIT BIT(0)
|
||||
|
||||
DEFINE_STATIC_KEY_FALSE(kvm_arm_pmu_available);
|
||||
|
||||
static LIST_HEAD(arm_pmus);
|
||||
static DEFINE_MUTEX(arm_pmus_lock);
|
||||
|
||||
static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx);
|
||||
static void kvm_pmu_update_pmc_chained(struct kvm_vcpu *vcpu, u64 select_idx);
|
||||
static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc);
|
||||
static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc);
|
||||
static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc);
|
||||
|
||||
#define PERF_ATTR_CFG1_KVM_PMU_CHAINED 0x1
|
||||
static struct kvm_vcpu *kvm_pmc_to_vcpu(const struct kvm_pmc *pmc)
|
||||
{
|
||||
return container_of(pmc, struct kvm_vcpu, arch.pmu.pmc[pmc->idx]);
|
||||
}
|
||||
|
||||
static struct kvm_pmc *kvm_vcpu_idx_to_pmc(struct kvm_vcpu *vcpu, int cnt_idx)
|
||||
{
|
||||
return &vcpu->arch.pmu.pmc[cnt_idx];
|
||||
}
|
||||
|
||||
static u32 kvm_pmu_event_mask(struct kvm *kvm)
|
||||
{
|
||||
@ -47,113 +56,46 @@ static u32 kvm_pmu_event_mask(struct kvm *kvm)
|
||||
}
|
||||
|
||||
/**
|
||||
* kvm_pmu_idx_is_64bit - determine if select_idx is a 64bit counter
|
||||
* @vcpu: The vcpu pointer
|
||||
* @select_idx: The counter index
|
||||
* kvm_pmc_is_64bit - determine if counter is 64bit
|
||||
* @pmc: counter context
|
||||
*/
|
||||
static bool kvm_pmu_idx_is_64bit(struct kvm_vcpu *vcpu, u64 select_idx)
|
||||
static bool kvm_pmc_is_64bit(struct kvm_pmc *pmc)
|
||||
{
|
||||
return (select_idx == ARMV8_PMU_CYCLE_IDX &&
|
||||
__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_LC);
|
||||
return (pmc->idx == ARMV8_PMU_CYCLE_IDX ||
|
||||
kvm_pmu_is_3p5(kvm_pmc_to_vcpu(pmc)));
|
||||
}
|
||||
|
||||
static struct kvm_vcpu *kvm_pmc_to_vcpu(struct kvm_pmc *pmc)
|
||||
static bool kvm_pmc_has_64bit_overflow(struct kvm_pmc *pmc)
|
||||
{
|
||||
struct kvm_pmu *pmu;
|
||||
struct kvm_vcpu_arch *vcpu_arch;
|
||||
u64 val = __vcpu_sys_reg(kvm_pmc_to_vcpu(pmc), PMCR_EL0);
|
||||
|
||||
pmc -= pmc->idx;
|
||||
pmu = container_of(pmc, struct kvm_pmu, pmc[0]);
|
||||
vcpu_arch = container_of(pmu, struct kvm_vcpu_arch, pmu);
|
||||
return container_of(vcpu_arch, struct kvm_vcpu, arch);
|
||||
return (pmc->idx < ARMV8_PMU_CYCLE_IDX && (val & ARMV8_PMU_PMCR_LP)) ||
|
||||
(pmc->idx == ARMV8_PMU_CYCLE_IDX && (val & ARMV8_PMU_PMCR_LC));
|
||||
}
|
||||
|
||||
/**
|
||||
* kvm_pmu_pmc_is_chained - determine if the pmc is chained
|
||||
* @pmc: The PMU counter pointer
|
||||
*/
|
||||
static bool kvm_pmu_pmc_is_chained(struct kvm_pmc *pmc)
|
||||
static bool kvm_pmu_counter_can_chain(struct kvm_pmc *pmc)
|
||||
{
|
||||
return (!(pmc->idx & 1) && (pmc->idx + 1) < ARMV8_PMU_CYCLE_IDX &&
|
||||
!kvm_pmc_has_64bit_overflow(pmc));
|
||||
}
|
||||
|
||||
static u32 counter_index_to_reg(u64 idx)
|
||||
{
|
||||
return (idx == ARMV8_PMU_CYCLE_IDX) ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + idx;
|
||||
}
|
||||
|
||||
static u32 counter_index_to_evtreg(u64 idx)
|
||||
{
|
||||
return (idx == ARMV8_PMU_CYCLE_IDX) ? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + idx;
|
||||
}
|
||||
|
||||
static u64 kvm_pmu_get_pmc_value(struct kvm_pmc *pmc)
|
||||
{
|
||||
struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc);
|
||||
u64 counter, reg, enabled, running;
|
||||
|
||||
return test_bit(pmc->idx >> 1, vcpu->arch.pmu.chained);
|
||||
}
|
||||
|
||||
/**
|
||||
* kvm_pmu_idx_is_high_counter - determine if select_idx is a high/low counter
|
||||
* @select_idx: The counter index
|
||||
*/
|
||||
static bool kvm_pmu_idx_is_high_counter(u64 select_idx)
|
||||
{
|
||||
return select_idx & 0x1;
|
||||
}
|
||||
|
||||
/**
|
||||
* kvm_pmu_get_canonical_pmc - obtain the canonical pmc
|
||||
* @pmc: The PMU counter pointer
|
||||
*
|
||||
* When a pair of PMCs are chained together we use the low counter (canonical)
|
||||
* to hold the underlying perf event.
|
||||
*/
|
||||
static struct kvm_pmc *kvm_pmu_get_canonical_pmc(struct kvm_pmc *pmc)
|
||||
{
|
||||
if (kvm_pmu_pmc_is_chained(pmc) &&
|
||||
kvm_pmu_idx_is_high_counter(pmc->idx))
|
||||
return pmc - 1;
|
||||
|
||||
return pmc;
|
||||
}
|
||||
static struct kvm_pmc *kvm_pmu_get_alternate_pmc(struct kvm_pmc *pmc)
|
||||
{
|
||||
if (kvm_pmu_idx_is_high_counter(pmc->idx))
|
||||
return pmc - 1;
|
||||
else
|
||||
return pmc + 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* kvm_pmu_idx_has_chain_evtype - determine if the event type is chain
|
||||
* @vcpu: The vcpu pointer
|
||||
* @select_idx: The counter index
|
||||
*/
|
||||
static bool kvm_pmu_idx_has_chain_evtype(struct kvm_vcpu *vcpu, u64 select_idx)
|
||||
{
|
||||
u64 eventsel, reg;
|
||||
|
||||
select_idx |= 0x1;
|
||||
|
||||
if (select_idx == ARMV8_PMU_CYCLE_IDX)
|
||||
return false;
|
||||
|
||||
reg = PMEVTYPER0_EL0 + select_idx;
|
||||
eventsel = __vcpu_sys_reg(vcpu, reg) & kvm_pmu_event_mask(vcpu->kvm);
|
||||
|
||||
return eventsel == ARMV8_PMUV3_PERFCTR_CHAIN;
|
||||
}
|
||||
|
||||
/**
|
||||
* kvm_pmu_get_pair_counter_value - get PMU counter value
|
||||
* @vcpu: The vcpu pointer
|
||||
* @pmc: The PMU counter pointer
|
||||
*/
|
||||
static u64 kvm_pmu_get_pair_counter_value(struct kvm_vcpu *vcpu,
|
||||
struct kvm_pmc *pmc)
|
||||
{
|
||||
u64 counter, counter_high, reg, enabled, running;
|
||||
|
||||
if (kvm_pmu_pmc_is_chained(pmc)) {
|
||||
pmc = kvm_pmu_get_canonical_pmc(pmc);
|
||||
reg = PMEVCNTR0_EL0 + pmc->idx;
|
||||
|
||||
counter = __vcpu_sys_reg(vcpu, reg);
|
||||
counter_high = __vcpu_sys_reg(vcpu, reg + 1);
|
||||
|
||||
counter = lower_32_bits(counter) | (counter_high << 32);
|
||||
} else {
|
||||
reg = (pmc->idx == ARMV8_PMU_CYCLE_IDX)
|
||||
? PMCCNTR_EL0 : PMEVCNTR0_EL0 + pmc->idx;
|
||||
counter = __vcpu_sys_reg(vcpu, reg);
|
||||
}
|
||||
reg = counter_index_to_reg(pmc->idx);
|
||||
counter = __vcpu_sys_reg(vcpu, reg);
|
||||
|
||||
/*
|
||||
* The real counter value is equal to the value of counter register plus
|
||||
@ -163,6 +105,9 @@ static u64 kvm_pmu_get_pair_counter_value(struct kvm_vcpu *vcpu,
|
||||
counter += perf_event_read_value(pmc->perf_event, &enabled,
|
||||
&running);
|
||||
|
||||
if (!kvm_pmc_is_64bit(pmc))
|
||||
counter = lower_32_bits(counter);
|
||||
|
||||
return counter;
|
||||
}
|
||||
|
||||
@ -173,22 +118,37 @@ static u64 kvm_pmu_get_pair_counter_value(struct kvm_vcpu *vcpu,
|
||||
*/
|
||||
u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx)
|
||||
{
|
||||
u64 counter;
|
||||
struct kvm_pmu *pmu = &vcpu->arch.pmu;
|
||||
struct kvm_pmc *pmc = &pmu->pmc[select_idx];
|
||||
|
||||
if (!kvm_vcpu_has_pmu(vcpu))
|
||||
return 0;
|
||||
|
||||
counter = kvm_pmu_get_pair_counter_value(vcpu, pmc);
|
||||
return kvm_pmu_get_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, select_idx));
|
||||
}
|
||||
|
||||
if (kvm_pmu_pmc_is_chained(pmc) &&
|
||||
kvm_pmu_idx_is_high_counter(select_idx))
|
||||
counter = upper_32_bits(counter);
|
||||
else if (select_idx != ARMV8_PMU_CYCLE_IDX)
|
||||
counter = lower_32_bits(counter);
|
||||
static void kvm_pmu_set_pmc_value(struct kvm_pmc *pmc, u64 val, bool force)
|
||||
{
|
||||
struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc);
|
||||
u64 reg;
|
||||
|
||||
return counter;
|
||||
kvm_pmu_release_perf_event(pmc);
|
||||
|
||||
reg = counter_index_to_reg(pmc->idx);
|
||||
|
||||
if (vcpu_mode_is_32bit(vcpu) && pmc->idx != ARMV8_PMU_CYCLE_IDX &&
|
||||
!force) {
|
||||
/*
|
||||
* Even with PMUv3p5, AArch32 cannot write to the top
|
||||
* 32bit of the counters. The only possible course of
|
||||
* action is to use PMCR.P, which will reset them to
|
||||
* 0 (the only use of the 'force' parameter).
|
||||
*/
|
||||
val = __vcpu_sys_reg(vcpu, reg) & GENMASK(63, 32);
|
||||
val |= lower_32_bits(val);
|
||||
}
|
||||
|
||||
__vcpu_sys_reg(vcpu, reg) = val;
|
||||
|
||||
/* Recreate the perf event to reflect the updated sample_period */
|
||||
kvm_pmu_create_perf_event(pmc);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -199,17 +159,10 @@ u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx)
|
||||
*/
|
||||
void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val)
|
||||
{
|
||||
u64 reg;
|
||||
|
||||
if (!kvm_vcpu_has_pmu(vcpu))
|
||||
return;
|
||||
|
||||
reg = (select_idx == ARMV8_PMU_CYCLE_IDX)
|
||||
? PMCCNTR_EL0 : PMEVCNTR0_EL0 + select_idx;
|
||||
__vcpu_sys_reg(vcpu, reg) += (s64)val - kvm_pmu_get_counter_value(vcpu, select_idx);
|
||||
|
||||
/* Recreate the perf event to reflect the updated sample_period */
|
||||
kvm_pmu_create_perf_event(vcpu, select_idx);
|
||||
kvm_pmu_set_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, select_idx), val, false);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -218,7 +171,6 @@ void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val)
|
||||
*/
|
||||
static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc)
|
||||
{
|
||||
pmc = kvm_pmu_get_canonical_pmc(pmc);
|
||||
if (pmc->perf_event) {
|
||||
perf_event_disable(pmc->perf_event);
|
||||
perf_event_release_kernel(pmc->perf_event);
|
||||
@ -232,29 +184,20 @@ static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc)
|
||||
*
|
||||
* If this counter has been configured to monitor some event, release it here.
|
||||
*/
|
||||
static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc)
|
||||
static void kvm_pmu_stop_counter(struct kvm_pmc *pmc)
|
||||
{
|
||||
u64 counter, reg, val;
|
||||
struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc);
|
||||
u64 reg, val;
|
||||
|
||||
pmc = kvm_pmu_get_canonical_pmc(pmc);
|
||||
if (!pmc->perf_event)
|
||||
return;
|
||||
|
||||
counter = kvm_pmu_get_pair_counter_value(vcpu, pmc);
|
||||
val = kvm_pmu_get_pmc_value(pmc);
|
||||
|
||||
if (pmc->idx == ARMV8_PMU_CYCLE_IDX) {
|
||||
reg = PMCCNTR_EL0;
|
||||
val = counter;
|
||||
} else {
|
||||
reg = PMEVCNTR0_EL0 + pmc->idx;
|
||||
val = lower_32_bits(counter);
|
||||
}
|
||||
reg = counter_index_to_reg(pmc->idx);
|
||||
|
||||
__vcpu_sys_reg(vcpu, reg) = val;
|
||||
|
||||
if (kvm_pmu_pmc_is_chained(pmc))
|
||||
__vcpu_sys_reg(vcpu, reg + 1) = upper_32_bits(counter);
|
||||
|
||||
kvm_pmu_release_perf_event(pmc);
|
||||
}
|
||||
|
||||
@ -280,13 +223,10 @@ void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu)
|
||||
void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
unsigned long mask = kvm_pmu_valid_counter_mask(vcpu);
|
||||
struct kvm_pmu *pmu = &vcpu->arch.pmu;
|
||||
int i;
|
||||
|
||||
for_each_set_bit(i, &mask, 32)
|
||||
kvm_pmu_stop_counter(vcpu, &pmu->pmc[i]);
|
||||
|
||||
bitmap_zero(vcpu->arch.pmu.chained, ARMV8_PMU_MAX_COUNTER_PAIRS);
|
||||
kvm_pmu_stop_counter(kvm_vcpu_idx_to_pmc(vcpu, i));
|
||||
}
|
||||
|
||||
/**
|
||||
@ -297,10 +237,9 @@ void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu)
|
||||
void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
int i;
|
||||
struct kvm_pmu *pmu = &vcpu->arch.pmu;
|
||||
|
||||
for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++)
|
||||
kvm_pmu_release_perf_event(&pmu->pmc[i]);
|
||||
kvm_pmu_release_perf_event(kvm_vcpu_idx_to_pmc(vcpu, i));
|
||||
irq_work_sync(&vcpu->arch.pmu.overflow_work);
|
||||
}
|
||||
|
||||
@ -325,9 +264,6 @@ u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu)
|
||||
void kvm_pmu_enable_counter_mask(struct kvm_vcpu *vcpu, u64 val)
|
||||
{
|
||||
int i;
|
||||
struct kvm_pmu *pmu = &vcpu->arch.pmu;
|
||||
struct kvm_pmc *pmc;
|
||||
|
||||
if (!kvm_vcpu_has_pmu(vcpu))
|
||||
return;
|
||||
|
||||
@ -335,17 +271,16 @@ void kvm_pmu_enable_counter_mask(struct kvm_vcpu *vcpu, u64 val)
|
||||
return;
|
||||
|
||||
for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) {
|
||||
struct kvm_pmc *pmc;
|
||||
|
||||
if (!(val & BIT(i)))
|
||||
continue;
|
||||
|
||||
pmc = &pmu->pmc[i];
|
||||
pmc = kvm_vcpu_idx_to_pmc(vcpu, i);
|
||||
|
||||
/* A change in the enable state may affect the chain state */
|
||||
kvm_pmu_update_pmc_chained(vcpu, i);
|
||||
kvm_pmu_create_perf_event(vcpu, i);
|
||||
|
||||
/* At this point, pmc must be the canonical */
|
||||
if (pmc->perf_event) {
|
||||
if (!pmc->perf_event) {
|
||||
kvm_pmu_create_perf_event(pmc);
|
||||
} else {
|
||||
perf_event_enable(pmc->perf_event);
|
||||
if (pmc->perf_event->state != PERF_EVENT_STATE_ACTIVE)
|
||||
kvm_debug("fail to enable perf event\n");
|
||||
@ -363,23 +298,18 @@ void kvm_pmu_enable_counter_mask(struct kvm_vcpu *vcpu, u64 val)
|
||||
void kvm_pmu_disable_counter_mask(struct kvm_vcpu *vcpu, u64 val)
|
||||
{
|
||||
int i;
|
||||
struct kvm_pmu *pmu = &vcpu->arch.pmu;
|
||||
struct kvm_pmc *pmc;
|
||||
|
||||
if (!kvm_vcpu_has_pmu(vcpu) || !val)
|
||||
return;
|
||||
|
||||
for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) {
|
||||
struct kvm_pmc *pmc;
|
||||
|
||||
if (!(val & BIT(i)))
|
||||
continue;
|
||||
|
||||
pmc = &pmu->pmc[i];
|
||||
pmc = kvm_vcpu_idx_to_pmc(vcpu, i);
|
||||
|
||||
/* A change in the enable state may affect the chain state */
|
||||
kvm_pmu_update_pmc_chained(vcpu, i);
|
||||
kvm_pmu_create_perf_event(vcpu, i);
|
||||
|
||||
/* At this point, pmc must be the canonical */
|
||||
if (pmc->perf_event)
|
||||
perf_event_disable(pmc->perf_event);
|
||||
}
|
||||
@ -476,14 +406,69 @@ void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu)
|
||||
static void kvm_pmu_perf_overflow_notify_vcpu(struct irq_work *work)
|
||||
{
|
||||
struct kvm_vcpu *vcpu;
|
||||
struct kvm_pmu *pmu;
|
||||
|
||||
pmu = container_of(work, struct kvm_pmu, overflow_work);
|
||||
vcpu = kvm_pmc_to_vcpu(pmu->pmc);
|
||||
|
||||
vcpu = container_of(work, struct kvm_vcpu, arch.pmu.overflow_work);
|
||||
kvm_vcpu_kick(vcpu);
|
||||
}
|
||||
|
||||
/*
|
||||
* Perform an increment on any of the counters described in @mask,
|
||||
* generating the overflow if required, and propagate it as a chained
|
||||
* event if possible.
|
||||
*/
|
||||
static void kvm_pmu_counter_increment(struct kvm_vcpu *vcpu,
|
||||
unsigned long mask, u32 event)
|
||||
{
|
||||
int i;
|
||||
|
||||
if (!(__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E))
|
||||
return;
|
||||
|
||||
/* Weed out disabled counters */
|
||||
mask &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0);
|
||||
|
||||
for_each_set_bit(i, &mask, ARMV8_PMU_CYCLE_IDX) {
|
||||
struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, i);
|
||||
u64 type, reg;
|
||||
|
||||
/* Filter on event type */
|
||||
type = __vcpu_sys_reg(vcpu, counter_index_to_evtreg(i));
|
||||
type &= kvm_pmu_event_mask(vcpu->kvm);
|
||||
if (type != event)
|
||||
continue;
|
||||
|
||||
/* Increment this counter */
|
||||
reg = __vcpu_sys_reg(vcpu, counter_index_to_reg(i)) + 1;
|
||||
if (!kvm_pmc_is_64bit(pmc))
|
||||
reg = lower_32_bits(reg);
|
||||
__vcpu_sys_reg(vcpu, counter_index_to_reg(i)) = reg;
|
||||
|
||||
/* No overflow? move on */
|
||||
if (kvm_pmc_has_64bit_overflow(pmc) ? reg : lower_32_bits(reg))
|
||||
continue;
|
||||
|
||||
/* Mark overflow */
|
||||
__vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i);
|
||||
|
||||
if (kvm_pmu_counter_can_chain(pmc))
|
||||
kvm_pmu_counter_increment(vcpu, BIT(i + 1),
|
||||
ARMV8_PMUV3_PERFCTR_CHAIN);
|
||||
}
|
||||
}
|
||||
|
||||
/* Compute the sample period for a given counter value */
|
||||
static u64 compute_period(struct kvm_pmc *pmc, u64 counter)
|
||||
{
|
||||
u64 val;
|
||||
|
||||
if (kvm_pmc_is_64bit(pmc) && kvm_pmc_has_64bit_overflow(pmc))
|
||||
val = (-counter) & GENMASK(63, 0);
|
||||
else
|
||||
val = (-counter) & GENMASK(31, 0);
|
||||
|
||||
return val;
|
||||
}
|
||||
|
||||
/**
|
||||
* When the perf event overflows, set the overflow status and inform the vcpu.
|
||||
*/
|
||||
@ -503,10 +488,7 @@ static void kvm_pmu_perf_overflow(struct perf_event *perf_event,
|
||||
* Reset the sample period to the architectural limit,
|
||||
* i.e. the point where the counter overflows.
|
||||
*/
|
||||
period = -(local64_read(&perf_event->count));
|
||||
|
||||
if (!kvm_pmu_idx_is_64bit(vcpu, pmc->idx))
|
||||
period &= GENMASK(31, 0);
|
||||
period = compute_period(pmc, local64_read(&perf_event->count));
|
||||
|
||||
local64_set(&perf_event->hw.period_left, 0);
|
||||
perf_event->attr.sample_period = period;
|
||||
@ -514,6 +496,10 @@ static void kvm_pmu_perf_overflow(struct perf_event *perf_event,
|
||||
|
||||
__vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(idx);
|
||||
|
||||
if (kvm_pmu_counter_can_chain(pmc))
|
||||
kvm_pmu_counter_increment(vcpu, BIT(idx + 1),
|
||||
ARMV8_PMUV3_PERFCTR_CHAIN);
|
||||
|
||||
if (kvm_pmu_overflow_status(vcpu)) {
|
||||
kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
|
||||
|
||||
@ -533,50 +519,7 @@ static void kvm_pmu_perf_overflow(struct perf_event *perf_event,
|
||||
*/
|
||||
void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val)
|
||||
{
|
||||
struct kvm_pmu *pmu = &vcpu->arch.pmu;
|
||||
int i;
|
||||
|
||||
if (!kvm_vcpu_has_pmu(vcpu))
|
||||
return;
|
||||
|
||||
if (!(__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E))
|
||||
return;
|
||||
|
||||
/* Weed out disabled counters */
|
||||
val &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0);
|
||||
|
||||
for (i = 0; i < ARMV8_PMU_CYCLE_IDX; i++) {
|
||||
u64 type, reg;
|
||||
|
||||
if (!(val & BIT(i)))
|
||||
continue;
|
||||
|
||||
/* PMSWINC only applies to ... SW_INC! */
|
||||
type = __vcpu_sys_reg(vcpu, PMEVTYPER0_EL0 + i);
|
||||
type &= kvm_pmu_event_mask(vcpu->kvm);
|
||||
if (type != ARMV8_PMUV3_PERFCTR_SW_INCR)
|
||||
continue;
|
||||
|
||||
/* increment this even SW_INC counter */
|
||||
reg = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) + 1;
|
||||
reg = lower_32_bits(reg);
|
||||
__vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) = reg;
|
||||
|
||||
if (reg) /* no overflow on the low part */
|
||||
continue;
|
||||
|
||||
if (kvm_pmu_pmc_is_chained(&pmu->pmc[i])) {
|
||||
/* increment the high counter */
|
||||
reg = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i + 1) + 1;
|
||||
reg = lower_32_bits(reg);
|
||||
__vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i + 1) = reg;
|
||||
if (!reg) /* mark overflow on the high counter */
|
||||
__vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i + 1);
|
||||
} else {
|
||||
/* mark overflow on low counter */
|
||||
__vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i);
|
||||
}
|
||||
}
|
||||
kvm_pmu_counter_increment(vcpu, val, ARMV8_PMUV3_PERFCTR_SW_INCR);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -591,6 +534,12 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val)
|
||||
if (!kvm_vcpu_has_pmu(vcpu))
|
||||
return;
|
||||
|
||||
/* Fixup PMCR_EL0 to reconcile the PMU version and the LP bit */
|
||||
if (!kvm_pmu_is_3p5(vcpu))
|
||||
val &= ~ARMV8_PMU_PMCR_LP;
|
||||
|
||||
__vcpu_sys_reg(vcpu, PMCR_EL0) = val;
|
||||
|
||||
if (val & ARMV8_PMU_PMCR_E) {
|
||||
kvm_pmu_enable_counter_mask(vcpu,
|
||||
__vcpu_sys_reg(vcpu, PMCNTENSET_EL0));
|
||||
@ -606,49 +555,44 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val)
|
||||
unsigned long mask = kvm_pmu_valid_counter_mask(vcpu);
|
||||
mask &= ~BIT(ARMV8_PMU_CYCLE_IDX);
|
||||
for_each_set_bit(i, &mask, 32)
|
||||
kvm_pmu_set_counter_value(vcpu, i, 0);
|
||||
kvm_pmu_set_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, i), 0, true);
|
||||
}
|
||||
}
|
||||
|
||||
static bool kvm_pmu_counter_is_enabled(struct kvm_vcpu *vcpu, u64 select_idx)
|
||||
static bool kvm_pmu_counter_is_enabled(struct kvm_pmc *pmc)
|
||||
{
|
||||
struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc);
|
||||
return (__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E) &&
|
||||
(__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & BIT(select_idx));
|
||||
(__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & BIT(pmc->idx));
|
||||
}
|
||||
|
||||
/**
|
||||
* kvm_pmu_create_perf_event - create a perf event for a counter
|
||||
* @vcpu: The vcpu pointer
|
||||
* @select_idx: The number of selected counter
|
||||
* @pmc: Counter context
|
||||
*/
|
||||
static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx)
|
||||
static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc)
|
||||
{
|
||||
struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc);
|
||||
struct arm_pmu *arm_pmu = vcpu->kvm->arch.arm_pmu;
|
||||
struct kvm_pmu *pmu = &vcpu->arch.pmu;
|
||||
struct kvm_pmc *pmc;
|
||||
struct perf_event *event;
|
||||
struct perf_event_attr attr;
|
||||
u64 eventsel, counter, reg, data;
|
||||
u64 eventsel, reg, data;
|
||||
|
||||
/*
|
||||
* For chained counters the event type and filtering attributes are
|
||||
* obtained from the low/even counter. We also use this counter to
|
||||
* determine if the event is enabled/disabled.
|
||||
*/
|
||||
pmc = kvm_pmu_get_canonical_pmc(&pmu->pmc[select_idx]);
|
||||
|
||||
reg = (pmc->idx == ARMV8_PMU_CYCLE_IDX)
|
||||
? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + pmc->idx;
|
||||
reg = counter_index_to_evtreg(pmc->idx);
|
||||
data = __vcpu_sys_reg(vcpu, reg);
|
||||
|
||||
kvm_pmu_stop_counter(vcpu, pmc);
|
||||
kvm_pmu_stop_counter(pmc);
|
||||
if (pmc->idx == ARMV8_PMU_CYCLE_IDX)
|
||||
eventsel = ARMV8_PMUV3_PERFCTR_CPU_CYCLES;
|
||||
else
|
||||
eventsel = data & kvm_pmu_event_mask(vcpu->kvm);
|
||||
|
||||
/* Software increment event doesn't need to be backed by a perf event */
|
||||
if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR)
|
||||
/*
|
||||
* Neither SW increment nor chained events need to be backed
|
||||
* by a perf event.
|
||||
*/
|
||||
if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR ||
|
||||
eventsel == ARMV8_PMUV3_PERFCTR_CHAIN)
|
||||
return;
|
||||
|
||||
/*
|
||||
@ -663,37 +607,25 @@ static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx)
|
||||
attr.type = arm_pmu->pmu.type;
|
||||
attr.size = sizeof(attr);
|
||||
attr.pinned = 1;
|
||||
attr.disabled = !kvm_pmu_counter_is_enabled(vcpu, pmc->idx);
|
||||
attr.disabled = !kvm_pmu_counter_is_enabled(pmc);
|
||||
attr.exclude_user = data & ARMV8_PMU_EXCLUDE_EL0 ? 1 : 0;
|
||||
attr.exclude_kernel = data & ARMV8_PMU_EXCLUDE_EL1 ? 1 : 0;
|
||||
attr.exclude_hv = 1; /* Don't count EL2 events */
|
||||
attr.exclude_host = 1; /* Don't count host events */
|
||||
attr.config = eventsel;
|
||||
|
||||
counter = kvm_pmu_get_pair_counter_value(vcpu, pmc);
|
||||
/*
|
||||
* If counting with a 64bit counter, advertise it to the perf
|
||||
* code, carefully dealing with the initial sample period
|
||||
* which also depends on the overflow.
|
||||
*/
|
||||
if (kvm_pmc_is_64bit(pmc))
|
||||
attr.config1 |= PERF_ATTR_CFG1_COUNTER_64BIT;
|
||||
|
||||
if (kvm_pmu_pmc_is_chained(pmc)) {
|
||||
/**
|
||||
* The initial sample period (overflow count) of an event. For
|
||||
* chained counters we only support overflow interrupts on the
|
||||
* high counter.
|
||||
*/
|
||||
attr.sample_period = (-counter) & GENMASK(63, 0);
|
||||
attr.config1 |= PERF_ATTR_CFG1_KVM_PMU_CHAINED;
|
||||
attr.sample_period = compute_period(pmc, kvm_pmu_get_pmc_value(pmc));
|
||||
|
||||
event = perf_event_create_kernel_counter(&attr, -1, current,
|
||||
kvm_pmu_perf_overflow,
|
||||
pmc + 1);
|
||||
} else {
|
||||
/* The initial sample period (overflow count) of an event. */
|
||||
if (kvm_pmu_idx_is_64bit(vcpu, pmc->idx))
|
||||
attr.sample_period = (-counter) & GENMASK(63, 0);
|
||||
else
|
||||
attr.sample_period = (-counter) & GENMASK(31, 0);
|
||||
|
||||
event = perf_event_create_kernel_counter(&attr, -1, current,
|
||||
event = perf_event_create_kernel_counter(&attr, -1, current,
|
||||
kvm_pmu_perf_overflow, pmc);
|
||||
}
|
||||
|
||||
if (IS_ERR(event)) {
|
||||
pr_err_once("kvm: pmu event creation failed %ld\n",
|
||||
@ -704,41 +636,6 @@ static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx)
|
||||
pmc->perf_event = event;
|
||||
}
|
||||
|
||||
/**
|
||||
* kvm_pmu_update_pmc_chained - update chained bitmap
|
||||
* @vcpu: The vcpu pointer
|
||||
* @select_idx: The number of selected counter
|
||||
*
|
||||
* Update the chained bitmap based on the event type written in the
|
||||
* typer register and the enable state of the odd register.
|
||||
*/
|
||||
static void kvm_pmu_update_pmc_chained(struct kvm_vcpu *vcpu, u64 select_idx)
|
||||
{
|
||||
struct kvm_pmu *pmu = &vcpu->arch.pmu;
|
||||
struct kvm_pmc *pmc = &pmu->pmc[select_idx], *canonical_pmc;
|
||||
bool new_state, old_state;
|
||||
|
||||
old_state = kvm_pmu_pmc_is_chained(pmc);
|
||||
new_state = kvm_pmu_idx_has_chain_evtype(vcpu, pmc->idx) &&
|
||||
kvm_pmu_counter_is_enabled(vcpu, pmc->idx | 0x1);
|
||||
|
||||
if (old_state == new_state)
|
||||
return;
|
||||
|
||||
canonical_pmc = kvm_pmu_get_canonical_pmc(pmc);
|
||||
kvm_pmu_stop_counter(vcpu, canonical_pmc);
|
||||
if (new_state) {
|
||||
/*
|
||||
* During promotion from !chained to chained we must ensure
|
||||
* the adjacent counter is stopped and its event destroyed
|
||||
*/
|
||||
kvm_pmu_stop_counter(vcpu, kvm_pmu_get_alternate_pmc(pmc));
|
||||
set_bit(pmc->idx >> 1, vcpu->arch.pmu.chained);
|
||||
return;
|
||||
}
|
||||
clear_bit(pmc->idx >> 1, vcpu->arch.pmu.chained);
|
||||
}
|
||||
|
||||
/**
|
||||
* kvm_pmu_set_counter_event_type - set selected counter to monitor some event
|
||||
* @vcpu: The vcpu pointer
|
||||
@ -752,6 +649,7 @@ static void kvm_pmu_update_pmc_chained(struct kvm_vcpu *vcpu, u64 select_idx)
|
||||
void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data,
|
||||
u64 select_idx)
|
||||
{
|
||||
struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, select_idx);
|
||||
u64 reg, mask;
|
||||
|
||||
if (!kvm_vcpu_has_pmu(vcpu))
|
||||
@ -761,20 +659,19 @@ void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data,
|
||||
mask &= ~ARMV8_PMU_EVTYPE_EVENT;
|
||||
mask |= kvm_pmu_event_mask(vcpu->kvm);
|
||||
|
||||
reg = (select_idx == ARMV8_PMU_CYCLE_IDX)
|
||||
? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + select_idx;
|
||||
reg = counter_index_to_evtreg(pmc->idx);
|
||||
|
||||
__vcpu_sys_reg(vcpu, reg) = data & mask;
|
||||
|
||||
kvm_pmu_update_pmc_chained(vcpu, select_idx);
|
||||
kvm_pmu_create_perf_event(vcpu, select_idx);
|
||||
kvm_pmu_create_perf_event(pmc);
|
||||
}
|
||||
|
||||
void kvm_host_pmu_init(struct arm_pmu *pmu)
|
||||
{
|
||||
struct arm_pmu_entry *entry;
|
||||
|
||||
if (pmu->pmuver == 0 || pmu->pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF)
|
||||
if (pmu->pmuver == ID_AA64DFR0_EL1_PMUVer_NI ||
|
||||
pmu->pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF)
|
||||
return;
|
||||
|
||||
mutex_lock(&arm_pmus_lock);
|
||||
@ -827,7 +724,7 @@ static struct arm_pmu *kvm_pmu_probe_armpmu(void)
|
||||
|
||||
if (event->pmu) {
|
||||
pmu = to_arm_pmu(event->pmu);
|
||||
if (pmu->pmuver == 0 ||
|
||||
if (pmu->pmuver == ID_AA64DFR0_EL1_PMUVer_NI ||
|
||||
pmu->pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF)
|
||||
pmu = NULL;
|
||||
}
|
||||
@ -849,6 +746,8 @@ u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1)
|
||||
|
||||
if (!pmceid1) {
|
||||
val = read_sysreg(pmceid0_el0);
|
||||
/* always support CHAIN */
|
||||
val |= BIT(ARMV8_PMUV3_PERFCTR_CHAIN);
|
||||
base = 0;
|
||||
} else {
|
||||
val = read_sysreg(pmceid1_el0);
|
||||
@ -1150,3 +1049,14 @@ int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
|
||||
|
||||
return -ENXIO;
|
||||
}
|
||||
|
||||
u8 kvm_arm_pmu_get_pmuver_limit(void)
|
||||
{
|
||||
u64 tmp;
|
||||
|
||||
tmp = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1);
|
||||
tmp = cpuid_feature_cap_perfmon_field(tmp,
|
||||
ID_AA64DFR0_EL1_PMUVer_SHIFT,
|
||||
ID_AA64DFR0_EL1_PMUVer_V3P5);
|
||||
return FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), tmp);
|
||||
}
|
||||
|
@ -395,32 +395,3 @@ int kvm_set_ipa_limit(void)
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type)
|
||||
{
|
||||
u64 mmfr0, mmfr1;
|
||||
u32 phys_shift;
|
||||
|
||||
if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK)
|
||||
return -EINVAL;
|
||||
|
||||
phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type);
|
||||
if (phys_shift) {
|
||||
if (phys_shift > kvm_ipa_limit ||
|
||||
phys_shift < ARM64_MIN_PARANGE_BITS)
|
||||
return -EINVAL;
|
||||
} else {
|
||||
phys_shift = KVM_PHYS_SHIFT;
|
||||
if (phys_shift > kvm_ipa_limit) {
|
||||
pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n",
|
||||
current->comm);
|
||||
return -EINVAL;
|
||||
}
|
||||
}
|
||||
|
||||
mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
|
||||
mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
|
||||
kvm->arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -639,22 +639,18 @@ static void reset_pmselr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
|
||||
|
||||
static void reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
|
||||
{
|
||||
u64 pmcr, val;
|
||||
u64 pmcr;
|
||||
|
||||
/* No PMU available, PMCR_EL0 may UNDEF... */
|
||||
if (!kvm_arm_support_pmu_v3())
|
||||
return;
|
||||
|
||||
pmcr = read_sysreg(pmcr_el0);
|
||||
/*
|
||||
* Writable bits of PMCR_EL0 (ARMV8_PMU_PMCR_MASK) are reset to UNKNOWN
|
||||
* except PMCR.E resetting to zero.
|
||||
*/
|
||||
val = ((pmcr & ~ARMV8_PMU_PMCR_MASK)
|
||||
| (ARMV8_PMU_PMCR_MASK & 0xdecafbad)) & (~ARMV8_PMU_PMCR_E);
|
||||
/* Only preserve PMCR_EL0.N, and reset the rest to 0 */
|
||||
pmcr = read_sysreg(pmcr_el0) & ARMV8_PMU_PMCR_N_MASK;
|
||||
if (!kvm_supports_32bit_el0())
|
||||
val |= ARMV8_PMU_PMCR_LC;
|
||||
__vcpu_sys_reg(vcpu, r->reg) = val;
|
||||
pmcr |= ARMV8_PMU_PMCR_LC;
|
||||
|
||||
__vcpu_sys_reg(vcpu, r->reg) = pmcr;
|
||||
}
|
||||
|
||||
static bool check_pmu_access_disabled(struct kvm_vcpu *vcpu, u64 flags)
|
||||
@ -697,13 +693,15 @@ static bool access_pmcr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
|
||||
return false;
|
||||
|
||||
if (p->is_write) {
|
||||
/* Only update writeable bits of PMCR */
|
||||
/*
|
||||
* Only update writeable bits of PMCR (continuing into
|
||||
* kvm_pmu_handle_pmcr() as well)
|
||||
*/
|
||||
val = __vcpu_sys_reg(vcpu, PMCR_EL0);
|
||||
val &= ~ARMV8_PMU_PMCR_MASK;
|
||||
val |= p->regval & ARMV8_PMU_PMCR_MASK;
|
||||
if (!kvm_supports_32bit_el0())
|
||||
val |= ARMV8_PMU_PMCR_LC;
|
||||
__vcpu_sys_reg(vcpu, PMCR_EL0) = val;
|
||||
kvm_pmu_handle_pmcr(vcpu, val);
|
||||
kvm_vcpu_pmu_restore_guest(vcpu);
|
||||
} else {
|
||||
@ -1062,6 +1060,40 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu,
|
||||
return true;
|
||||
}
|
||||
|
||||
static u8 vcpu_pmuver(const struct kvm_vcpu *vcpu)
|
||||
{
|
||||
if (kvm_vcpu_has_pmu(vcpu))
|
||||
return vcpu->kvm->arch.dfr0_pmuver.imp;
|
||||
|
||||
return vcpu->kvm->arch.dfr0_pmuver.unimp;
|
||||
}
|
||||
|
||||
static u8 perfmon_to_pmuver(u8 perfmon)
|
||||
{
|
||||
switch (perfmon) {
|
||||
case ID_DFR0_EL1_PerfMon_PMUv3:
|
||||
return ID_AA64DFR0_EL1_PMUVer_IMP;
|
||||
case ID_DFR0_EL1_PerfMon_IMPDEF:
|
||||
return ID_AA64DFR0_EL1_PMUVer_IMP_DEF;
|
||||
default:
|
||||
/* Anything ARMv8.1+ and NI have the same value. For now. */
|
||||
return perfmon;
|
||||
}
|
||||
}
|
||||
|
||||
static u8 pmuver_to_perfmon(u8 pmuver)
|
||||
{
|
||||
switch (pmuver) {
|
||||
case ID_AA64DFR0_EL1_PMUVer_IMP:
|
||||
return ID_DFR0_EL1_PerfMon_PMUv3;
|
||||
case ID_AA64DFR0_EL1_PMUVer_IMP_DEF:
|
||||
return ID_DFR0_EL1_PerfMon_IMPDEF;
|
||||
default:
|
||||
/* Anything ARMv8.1+ and NI have the same value. For now. */
|
||||
return pmuver;
|
||||
}
|
||||
}
|
||||
|
||||
/* Read a sanitised cpufeature ID register by sys_reg_desc */
|
||||
static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r)
|
||||
{
|
||||
@ -1111,18 +1143,17 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r
|
||||
/* Limit debug to ARMv8.0 */
|
||||
val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer);
|
||||
val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer), 6);
|
||||
/* Limit guests to PMUv3 for ARMv8.4 */
|
||||
val = cpuid_feature_cap_perfmon_field(val,
|
||||
ID_AA64DFR0_EL1_PMUVer_SHIFT,
|
||||
kvm_vcpu_has_pmu(vcpu) ? ID_AA64DFR0_EL1_PMUVer_V3P4 : 0);
|
||||
/* Set PMUver to the required version */
|
||||
val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer);
|
||||
val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer),
|
||||
vcpu_pmuver(vcpu));
|
||||
/* Hide SPE from guests */
|
||||
val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMSVer);
|
||||
break;
|
||||
case SYS_ID_DFR0_EL1:
|
||||
/* Limit guests to PMUv3 for ARMv8.4 */
|
||||
val = cpuid_feature_cap_perfmon_field(val,
|
||||
ID_DFR0_EL1_PerfMon_SHIFT,
|
||||
kvm_vcpu_has_pmu(vcpu) ? ID_DFR0_EL1_PerfMon_PMUv3p4 : 0);
|
||||
val &= ~ARM64_FEATURE_MASK(ID_DFR0_EL1_PerfMon);
|
||||
val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_DFR0_EL1_PerfMon),
|
||||
pmuver_to_perfmon(vcpu_pmuver(vcpu)));
|
||||
break;
|
||||
}
|
||||
|
||||
@ -1222,6 +1253,85 @@ static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int set_id_aa64dfr0_el1(struct kvm_vcpu *vcpu,
|
||||
const struct sys_reg_desc *rd,
|
||||
u64 val)
|
||||
{
|
||||
u8 pmuver, host_pmuver;
|
||||
bool valid_pmu;
|
||||
|
||||
host_pmuver = kvm_arm_pmu_get_pmuver_limit();
|
||||
|
||||
/*
|
||||
* Allow AA64DFR0_EL1.PMUver to be set from userspace as long
|
||||
* as it doesn't promise more than what the HW gives us. We
|
||||
* allow an IMPDEF PMU though, only if no PMU is supported
|
||||
* (KVM backward compatibility handling).
|
||||
*/
|
||||
pmuver = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), val);
|
||||
if ((pmuver != ID_AA64DFR0_EL1_PMUVer_IMP_DEF && pmuver > host_pmuver))
|
||||
return -EINVAL;
|
||||
|
||||
valid_pmu = (pmuver != 0 && pmuver != ID_AA64DFR0_EL1_PMUVer_IMP_DEF);
|
||||
|
||||
/* Make sure view register and PMU support do match */
|
||||
if (kvm_vcpu_has_pmu(vcpu) != valid_pmu)
|
||||
return -EINVAL;
|
||||
|
||||
/* We can only differ with PMUver, and anything else is an error */
|
||||
val ^= read_id_reg(vcpu, rd);
|
||||
val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer);
|
||||
if (val)
|
||||
return -EINVAL;
|
||||
|
||||
if (valid_pmu)
|
||||
vcpu->kvm->arch.dfr0_pmuver.imp = pmuver;
|
||||
else
|
||||
vcpu->kvm->arch.dfr0_pmuver.unimp = pmuver;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int set_id_dfr0_el1(struct kvm_vcpu *vcpu,
|
||||
const struct sys_reg_desc *rd,
|
||||
u64 val)
|
||||
{
|
||||
u8 perfmon, host_perfmon;
|
||||
bool valid_pmu;
|
||||
|
||||
host_perfmon = pmuver_to_perfmon(kvm_arm_pmu_get_pmuver_limit());
|
||||
|
||||
/*
|
||||
* Allow DFR0_EL1.PerfMon to be set from userspace as long as
|
||||
* it doesn't promise more than what the HW gives us on the
|
||||
* AArch64 side (as everything is emulated with that), and
|
||||
* that this is a PMUv3.
|
||||
*/
|
||||
perfmon = FIELD_GET(ARM64_FEATURE_MASK(ID_DFR0_EL1_PerfMon), val);
|
||||
if ((perfmon != ID_DFR0_EL1_PerfMon_IMPDEF && perfmon > host_perfmon) ||
|
||||
(perfmon != 0 && perfmon < ID_DFR0_EL1_PerfMon_PMUv3))
|
||||
return -EINVAL;
|
||||
|
||||
valid_pmu = (perfmon != 0 && perfmon != ID_DFR0_EL1_PerfMon_IMPDEF);
|
||||
|
||||
/* Make sure view register and PMU support do match */
|
||||
if (kvm_vcpu_has_pmu(vcpu) != valid_pmu)
|
||||
return -EINVAL;
|
||||
|
||||
/* We can only differ with PerfMon, and anything else is an error */
|
||||
val ^= read_id_reg(vcpu, rd);
|
||||
val &= ~ARM64_FEATURE_MASK(ID_DFR0_EL1_PerfMon);
|
||||
if (val)
|
||||
return -EINVAL;
|
||||
|
||||
if (valid_pmu)
|
||||
vcpu->kvm->arch.dfr0_pmuver.imp = perfmon_to_pmuver(perfmon);
|
||||
else
|
||||
vcpu->kvm->arch.dfr0_pmuver.unimp = perfmon_to_pmuver(perfmon);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* cpufeature ID register user accessors
|
||||
*
|
||||
@ -1443,7 +1553,9 @@ static const struct sys_reg_desc sys_reg_descs[] = {
|
||||
/* CRm=1 */
|
||||
AA32_ID_SANITISED(ID_PFR0_EL1),
|
||||
AA32_ID_SANITISED(ID_PFR1_EL1),
|
||||
AA32_ID_SANITISED(ID_DFR0_EL1),
|
||||
{ SYS_DESC(SYS_ID_DFR0_EL1), .access = access_id_reg,
|
||||
.get_user = get_id_reg, .set_user = set_id_dfr0_el1,
|
||||
.visibility = aa32_id_visibility, },
|
||||
ID_HIDDEN(ID_AFR0_EL1),
|
||||
AA32_ID_SANITISED(ID_MMFR0_EL1),
|
||||
AA32_ID_SANITISED(ID_MMFR1_EL1),
|
||||
@ -1483,7 +1595,8 @@ static const struct sys_reg_desc sys_reg_descs[] = {
|
||||
ID_UNALLOCATED(4,7),
|
||||
|
||||
/* CRm=5 */
|
||||
ID_SANITISED(ID_AA64DFR0_EL1),
|
||||
{ SYS_DESC(SYS_ID_AA64DFR0_EL1), .access = access_id_reg,
|
||||
.get_user = get_id_reg, .set_user = set_id_aa64dfr0_el1, },
|
||||
ID_SANITISED(ID_AA64DFR1_EL1),
|
||||
ID_UNALLOCATED(5,2),
|
||||
ID_UNALLOCATED(5,3),
|
||||
|
@ -2743,6 +2743,7 @@ static int vgic_its_has_attr(struct kvm_device *dev,
|
||||
static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr)
|
||||
{
|
||||
const struct vgic_its_abi *abi = vgic_its_get_abi(its);
|
||||
struct vgic_dist *dist = &kvm->arch.vgic;
|
||||
int ret = 0;
|
||||
|
||||
if (attr == KVM_DEV_ARM_VGIC_CTRL_INIT) /* Nothing to do */
|
||||
@ -2762,7 +2763,9 @@ static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr)
|
||||
vgic_its_reset(kvm, its);
|
||||
break;
|
||||
case KVM_DEV_ARM_ITS_SAVE_TABLES:
|
||||
dist->save_its_tables_in_progress = true;
|
||||
ret = abi->save_tables(its);
|
||||
dist->save_its_tables_in_progress = false;
|
||||
break;
|
||||
case KVM_DEV_ARM_ITS_RESTORE_TABLES:
|
||||
ret = abi->restore_tables(its);
|
||||
@ -2775,6 +2778,23 @@ static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr)
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* kvm_arch_allow_write_without_running_vcpu - allow writing guest memory
|
||||
* without the running VCPU when dirty ring is enabled.
|
||||
*
|
||||
* The running VCPU is required to track dirty guest pages when dirty ring
|
||||
* is enabled. Otherwise, the backup bitmap should be used to track the
|
||||
* dirty guest pages. When vgic/its tables are being saved, the backup
|
||||
* bitmap is used to track the dirty guest pages due to the missed running
|
||||
* VCPU in the period.
|
||||
*/
|
||||
bool kvm_arch_allow_write_without_running_vcpu(struct kvm *kvm)
|
||||
{
|
||||
struct vgic_dist *dist = &kvm->arch.vgic;
|
||||
|
||||
return dist->save_its_tables_in_progress;
|
||||
}
|
||||
|
||||
static int vgic_its_set_attr(struct kvm_device *dev,
|
||||
struct kvm_device_attr *attr)
|
||||
{
|
||||
|
@ -21,9 +21,12 @@ void copy_highpage(struct page *to, struct page *from)
|
||||
|
||||
copy_page(kto, kfrom);
|
||||
|
||||
if (system_supports_mte() && test_bit(PG_mte_tagged, &from->flags)) {
|
||||
set_bit(PG_mte_tagged, &to->flags);
|
||||
if (system_supports_mte() && page_mte_tagged(from)) {
|
||||
page_kasan_tag_reset(to);
|
||||
/* It's a new page, shouldn't have been tagged yet */
|
||||
WARN_ON_ONCE(!try_page_mte_tagging(to));
|
||||
mte_copy_page_tags(kto, kfrom);
|
||||
set_page_mte_tagged(to);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(copy_highpage);
|
||||
|
@ -943,6 +943,8 @@ struct page *alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma,
|
||||
|
||||
void tag_clear_highpage(struct page *page)
|
||||
{
|
||||
/* Newly allocated page, shouldn't have been tagged yet */
|
||||
WARN_ON_ONCE(!try_page_mte_tagging(page));
|
||||
mte_zero_clear_page_tags(page_address(page));
|
||||
set_bit(PG_mte_tagged, &page->flags);
|
||||
set_page_mte_tagged(page);
|
||||
}
|
||||
|
@ -24,7 +24,7 @@ int mte_save_tags(struct page *page)
|
||||
{
|
||||
void *tag_storage, *ret;
|
||||
|
||||
if (!test_bit(PG_mte_tagged, &page->flags))
|
||||
if (!page_mte_tagged(page))
|
||||
return 0;
|
||||
|
||||
tag_storage = mte_allocate_tag_storage();
|
||||
@ -46,21 +46,17 @@ int mte_save_tags(struct page *page)
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool mte_restore_tags(swp_entry_t entry, struct page *page)
|
||||
void mte_restore_tags(swp_entry_t entry, struct page *page)
|
||||
{
|
||||
void *tags = xa_load(&mte_pages, entry.val);
|
||||
|
||||
if (!tags)
|
||||
return false;
|
||||
return;
|
||||
|
||||
/*
|
||||
* Test PG_mte_tagged again in case it was racing with another
|
||||
* set_pte_at().
|
||||
*/
|
||||
if (!test_and_set_bit(PG_mte_tagged, &page->flags))
|
||||
if (try_page_mte_tagging(page)) {
|
||||
mte_restore_page_tags(page_address(page), tags);
|
||||
|
||||
return true;
|
||||
set_page_mte_tagged(page);
|
||||
}
|
||||
}
|
||||
|
||||
void mte_invalidate_tags(int type, pgoff_t offset)
|
||||
|
@ -598,7 +598,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_vcpu *vcpu,
|
||||
write_ok = true;
|
||||
} else {
|
||||
/* Call KVM generic code to do the slow-path check */
|
||||
pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
|
||||
pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL,
|
||||
writing, &write_ok, NULL);
|
||||
if (is_error_noslot_pfn(pfn))
|
||||
return -EFAULT;
|
||||
|
@ -846,7 +846,7 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
|
||||
unsigned long pfn;
|
||||
|
||||
/* Call KVM generic code to do the slow-path check */
|
||||
pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
|
||||
pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL,
|
||||
writing, upgrade_p, NULL);
|
||||
if (is_error_noslot_pfn(pfn))
|
||||
return -EFAULT;
|
||||
|
@ -1,22 +0,0 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef __IRQ_H
|
||||
#define __IRQ_H
|
||||
|
||||
#include <linux/kvm_host.h>
|
||||
|
||||
static inline int irqchip_in_kernel(struct kvm *kvm)
|
||||
{
|
||||
int ret = 0;
|
||||
|
||||
#ifdef CONFIG_KVM_MPIC
|
||||
ret = ret || (kvm->arch.mpic != NULL);
|
||||
#endif
|
||||
#ifdef CONFIG_KVM_XICS
|
||||
ret = ret || (kvm->arch.xics != NULL);
|
||||
ret = ret || (kvm->arch.xive != NULL);
|
||||
#endif
|
||||
smp_rmb();
|
||||
return ret;
|
||||
}
|
||||
|
||||
#endif
|
@ -36,7 +36,6 @@
|
||||
#include <asm/setup.h>
|
||||
|
||||
#include "timing.h"
|
||||
#include "irq.h"
|
||||
#include "../mm/mmu_decl.h"
|
||||
|
||||
#define CREATE_TRACE_POINTS
|
||||
@ -2165,10 +2164,25 @@ static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo)
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool kvm_arch_irqchip_in_kernel(struct kvm *kvm)
|
||||
{
|
||||
int ret = 0;
|
||||
|
||||
#ifdef CONFIG_KVM_MPIC
|
||||
ret = ret || (kvm->arch.mpic != NULL);
|
||||
#endif
|
||||
#ifdef CONFIG_KVM_XICS
|
||||
ret = ret || (kvm->arch.xics != NULL);
|
||||
ret = ret || (kvm->arch.xive != NULL);
|
||||
#endif
|
||||
smp_rmb();
|
||||
return ret;
|
||||
}
|
||||
|
||||
int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
|
||||
bool line_status)
|
||||
{
|
||||
if (!irqchip_in_kernel(kvm))
|
||||
if (!kvm_arch_irqchip_in_kernel(kvm))
|
||||
return -ENXIO;
|
||||
|
||||
irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
|
||||
|
@ -142,8 +142,7 @@ struct mcck_volatile_info {
|
||||
CR14_EXTERNAL_DAMAGE_SUBMASK)
|
||||
|
||||
#define SIDAD_SIZE_MASK 0xff
|
||||
#define sida_origin(sie_block) \
|
||||
((sie_block)->sidad & PAGE_MASK)
|
||||
#define sida_addr(sie_block) phys_to_virt((sie_block)->sidad & PAGE_MASK)
|
||||
#define sida_size(sie_block) \
|
||||
((((sie_block)->sidad & SIDAD_SIZE_MASK) + 1) * PAGE_SIZE)
|
||||
|
||||
@ -276,6 +275,7 @@ struct kvm_s390_sie_block {
|
||||
#define ECB3_AES 0x04
|
||||
#define ECB3_RI 0x01
|
||||
__u8 ecb3; /* 0x0063 */
|
||||
#define ESCA_SCAOL_MASK ~0x3fU
|
||||
__u32 scaol; /* 0x0064 */
|
||||
__u8 sdf; /* 0x0068 */
|
||||
__u8 epdx; /* 0x0069 */
|
||||
@ -942,6 +942,8 @@ struct kvm_s390_pv {
|
||||
unsigned long stor_base;
|
||||
void *stor_var;
|
||||
bool dumping;
|
||||
void *set_aside;
|
||||
struct list_head need_cleanup;
|
||||
struct mmu_notifier mmu_notifier;
|
||||
};
|
||||
|
||||
@ -1017,7 +1019,13 @@ void kvm_arch_crypto_clear_masks(struct kvm *kvm);
|
||||
void kvm_arch_crypto_set_masks(struct kvm *kvm, unsigned long *apm,
|
||||
unsigned long *aqm, unsigned long *adm);
|
||||
|
||||
extern int sie64a(struct kvm_s390_sie_block *, u64 *);
|
||||
int __sie64a(phys_addr_t sie_block_phys, struct kvm_s390_sie_block *sie_block, u64 *rsa);
|
||||
|
||||
static inline int sie64a(struct kvm_s390_sie_block *sie_block, u64 *rsa)
|
||||
{
|
||||
return __sie64a(virt_to_phys(sie_block), sie_block, rsa);
|
||||
}
|
||||
|
||||
extern char sie_exit;
|
||||
|
||||
extern int kvm_s390_gisc_register(struct kvm *kvm, u32 gisc);
|
||||
|
@ -4,8 +4,8 @@
|
||||
|
||||
#ifndef __ASSEMBLY__
|
||||
|
||||
int set_memory_encrypted(unsigned long addr, int numpages);
|
||||
int set_memory_decrypted(unsigned long addr, int numpages);
|
||||
int set_memory_encrypted(unsigned long vaddr, int numpages);
|
||||
int set_memory_decrypted(unsigned long vaddr, int numpages);
|
||||
|
||||
#endif /* __ASSEMBLY__ */
|
||||
|
||||
|
@ -46,6 +46,7 @@ struct stack_frame {
|
||||
unsigned long sie_savearea;
|
||||
unsigned long sie_reason;
|
||||
unsigned long sie_flags;
|
||||
unsigned long sie_control_block_phys;
|
||||
};
|
||||
};
|
||||
unsigned long gprs[10];
|
||||
|
@ -34,6 +34,7 @@
|
||||
#define UVC_CMD_INIT_UV 0x000f
|
||||
#define UVC_CMD_CREATE_SEC_CONF 0x0100
|
||||
#define UVC_CMD_DESTROY_SEC_CONF 0x0101
|
||||
#define UVC_CMD_DESTROY_SEC_CONF_FAST 0x0102
|
||||
#define UVC_CMD_CREATE_SEC_CPU 0x0120
|
||||
#define UVC_CMD_DESTROY_SEC_CPU 0x0121
|
||||
#define UVC_CMD_CONV_TO_SEC_STOR 0x0200
|
||||
@ -81,6 +82,7 @@ enum uv_cmds_inst {
|
||||
BIT_UVC_CMD_UNSHARE_ALL = 20,
|
||||
BIT_UVC_CMD_PIN_PAGE_SHARED = 21,
|
||||
BIT_UVC_CMD_UNPIN_PAGE_SHARED = 22,
|
||||
BIT_UVC_CMD_DESTROY_SEC_CONF_FAST = 23,
|
||||
BIT_UVC_CMD_DUMP_INIT = 24,
|
||||
BIT_UVC_CMD_DUMP_CONFIG_STOR_STATE = 25,
|
||||
BIT_UVC_CMD_DUMP_CPU = 26,
|
||||
@ -230,6 +232,14 @@ struct uv_cb_nodata {
|
||||
u64 reserved20[4];
|
||||
} __packed __aligned(8);
|
||||
|
||||
/* Destroy Configuration Fast */
|
||||
struct uv_cb_destroy_fast {
|
||||
struct uv_cb_header header;
|
||||
u64 reserved08[2];
|
||||
u64 handle;
|
||||
u64 reserved20[5];
|
||||
} __packed __aligned(8);
|
||||
|
||||
/* Set Shared Access */
|
||||
struct uv_cb_share {
|
||||
struct uv_cb_header header;
|
||||
|
@ -62,6 +62,7 @@ int main(void)
|
||||
OFFSET(__SF_SIE_SAVEAREA, stack_frame, sie_savearea);
|
||||
OFFSET(__SF_SIE_REASON, stack_frame, sie_reason);
|
||||
OFFSET(__SF_SIE_FLAGS, stack_frame, sie_flags);
|
||||
OFFSET(__SF_SIE_CONTROL_PHYS, stack_frame, sie_control_block_phys);
|
||||
DEFINE(STACK_FRAME_OVERHEAD, sizeof(struct stack_frame));
|
||||
BLANK();
|
||||
/* idle data offsets */
|
||||
|
@ -207,18 +207,20 @@ ENDPROC(__switch_to)
|
||||
|
||||
#if IS_ENABLED(CONFIG_KVM)
|
||||
/*
|
||||
* sie64a calling convention:
|
||||
* %r2 pointer to sie control block
|
||||
* %r3 guest register save area
|
||||
* __sie64a calling convention:
|
||||
* %r2 pointer to sie control block phys
|
||||
* %r3 pointer to sie control block virt
|
||||
* %r4 guest register save area
|
||||
*/
|
||||
ENTRY(sie64a)
|
||||
ENTRY(__sie64a)
|
||||
stmg %r6,%r14,__SF_GPRS(%r15) # save kernel registers
|
||||
lg %r12,__LC_CURRENT
|
||||
stg %r2,__SF_SIE_CONTROL(%r15) # save control block pointer
|
||||
stg %r3,__SF_SIE_SAVEAREA(%r15) # save guest register save area
|
||||
stg %r2,__SF_SIE_CONTROL_PHYS(%r15) # save sie block physical..
|
||||
stg %r3,__SF_SIE_CONTROL(%r15) # ...and virtual addresses
|
||||
stg %r4,__SF_SIE_SAVEAREA(%r15) # save guest register save area
|
||||
xc __SF_SIE_REASON(8,%r15),__SF_SIE_REASON(%r15) # reason code = 0
|
||||
mvc __SF_SIE_FLAGS(8,%r15),__TI_flags(%r12) # copy thread flags
|
||||
lmg %r0,%r13,0(%r3) # load guest gprs 0-13
|
||||
lmg %r0,%r13,0(%r4) # load guest gprs 0-13
|
||||
lg %r14,__LC_GMAP # get gmap pointer
|
||||
ltgr %r14,%r14
|
||||
jz .Lsie_gmap
|
||||
@ -230,6 +232,7 @@ ENTRY(sie64a)
|
||||
jnz .Lsie_skip
|
||||
TSTMSK __LC_CPU_FLAGS,_CIF_FPU
|
||||
jo .Lsie_skip # exit if fp/vx regs changed
|
||||
lg %r14,__SF_SIE_CONTROL_PHYS(%r15) # get sie block phys addr
|
||||
BPEXIT __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST)
|
||||
.Lsie_entry:
|
||||
sie 0(%r14)
|
||||
@ -240,13 +243,14 @@ ENTRY(sie64a)
|
||||
BPOFF
|
||||
BPENTER __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST)
|
||||
.Lsie_skip:
|
||||
lg %r14,__SF_SIE_CONTROL(%r15) # get control block pointer
|
||||
ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE
|
||||
lctlg %c1,%c1,__LC_KERNEL_ASCE # load primary asce
|
||||
.Lsie_done:
|
||||
# some program checks are suppressing. C code (e.g. do_protection_exception)
|
||||
# will rewind the PSW by the ILC, which is often 4 bytes in case of SIE. There
|
||||
# are some corner cases (e.g. runtime instrumentation) where ILC is unpredictable.
|
||||
# Other instructions between sie64a and .Lsie_done should not cause program
|
||||
# Other instructions between __sie64a and .Lsie_done should not cause program
|
||||
# interrupts. So lets use 3 nops as a landing pad for all possible rewinds.
|
||||
.Lrewind_pad6:
|
||||
nopr 7
|
||||
@ -275,8 +279,8 @@ sie_exit:
|
||||
EX_TABLE(.Lrewind_pad4,.Lsie_fault)
|
||||
EX_TABLE(.Lrewind_pad2,.Lsie_fault)
|
||||
EX_TABLE(sie_exit,.Lsie_fault)
|
||||
ENDPROC(sie64a)
|
||||
EXPORT_SYMBOL(sie64a)
|
||||
ENDPROC(__sie64a)
|
||||
EXPORT_SYMBOL(__sie64a)
|
||||
EXPORT_SYMBOL(sie_exit)
|
||||
#endif
|
||||
|
||||
@ -355,7 +359,7 @@ ENTRY(pgm_check_handler)
|
||||
j 3f # -> fault in user space
|
||||
.Lpgm_skip_asce:
|
||||
#if IS_ENABLED(CONFIG_KVM)
|
||||
# cleanup critical section for program checks in sie64a
|
||||
# cleanup critical section for program checks in __sie64a
|
||||
OUTSIDE %r9,.Lsie_gmap,.Lsie_done,1f
|
||||
SIEEXIT
|
||||
lghi %r10,_PIF_GUEST_FAULT
|
||||
|
@ -255,6 +255,13 @@ static int make_secure_pte(pte_t *ptep, unsigned long addr,
|
||||
*/
|
||||
static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_struct *mm)
|
||||
{
|
||||
/*
|
||||
* The misc feature indicates, among other things, that importing a
|
||||
* shared page from a different protected VM will automatically also
|
||||
* transfer its ownership.
|
||||
*/
|
||||
if (test_bit_inv(BIT_UV_FEAT_MISC, &uv_info.uv_feature_indications))
|
||||
return false;
|
||||
if (uvcb->cmd == UVC_CMD_UNPIN_PAGE_SHARED)
|
||||
return false;
|
||||
return atomic_read(&mm->context.protected_count) > 1;
|
||||
|
@ -217,7 +217,7 @@ static int handle_itdb(struct kvm_vcpu *vcpu)
|
||||
return 0;
|
||||
if (current->thread.per_flags & PER_FLAG_NO_TE)
|
||||
return 0;
|
||||
itdb = (struct kvm_s390_itdb *)vcpu->arch.sie_block->itdba;
|
||||
itdb = phys_to_virt(vcpu->arch.sie_block->itdba);
|
||||
rc = write_guest_lc(vcpu, __LC_PGM_TDB, itdb, sizeof(*itdb));
|
||||
if (rc)
|
||||
return rc;
|
||||
@ -409,8 +409,7 @@ int handle_sthyi(struct kvm_vcpu *vcpu)
|
||||
out:
|
||||
if (!cc) {
|
||||
if (kvm_s390_pv_cpu_is_protected(vcpu)) {
|
||||
memcpy((void *)(sida_origin(vcpu->arch.sie_block)),
|
||||
sctns, PAGE_SIZE);
|
||||
memcpy(sida_addr(vcpu->arch.sie_block), sctns, PAGE_SIZE);
|
||||
} else {
|
||||
r = write_guest(vcpu, addr, reg2, sctns, PAGE_SIZE);
|
||||
if (r) {
|
||||
@ -464,7 +463,7 @@ static int handle_operexc(struct kvm_vcpu *vcpu)
|
||||
|
||||
static int handle_pv_spx(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
u32 pref = *(u32 *)vcpu->arch.sie_block->sidad;
|
||||
u32 pref = *(u32 *)sida_addr(vcpu->arch.sie_block);
|
||||
|
||||
kvm_s390_set_prefix(vcpu, pref);
|
||||
trace_kvm_s390_handle_prefix(vcpu, 1, pref);
|
||||
@ -497,7 +496,7 @@ static int handle_pv_sclp(struct kvm_vcpu *vcpu)
|
||||
|
||||
static int handle_pv_uvc(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
struct uv_cb_share *guest_uvcb = (void *)vcpu->arch.sie_block->sidad;
|
||||
struct uv_cb_share *guest_uvcb = sida_addr(vcpu->arch.sie_block);
|
||||
struct uv_cb_cts uvcb = {
|
||||
.header.cmd = UVC_CMD_UNPIN_PAGE_SHARED,
|
||||
.header.len = sizeof(uvcb),
|
||||
|
@ -314,11 +314,6 @@ static inline u8 gisa_get_ipm(struct kvm_s390_gisa *gisa)
|
||||
return READ_ONCE(gisa->ipm);
|
||||
}
|
||||
|
||||
static inline void gisa_clear_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc)
|
||||
{
|
||||
clear_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa);
|
||||
}
|
||||
|
||||
static inline int gisa_tac_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc)
|
||||
{
|
||||
return test_and_clear_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa);
|
||||
|
@ -1,19 +0,0 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/*
|
||||
* s390 irqchip routines
|
||||
*
|
||||
* Copyright IBM Corp. 2014
|
||||
*
|
||||
* Author(s): Cornelia Huck <cornelia.huck@de.ibm.com>
|
||||
*/
|
||||
#ifndef __KVM_IRQ_H
|
||||
#define __KVM_IRQ_H
|
||||
|
||||
#include <linux/kvm_host.h>
|
||||
|
||||
static inline int irqchip_in_kernel(struct kvm *kvm)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
#endif
|
@ -209,6 +209,14 @@ unsigned int diag9c_forwarding_hz;
|
||||
module_param(diag9c_forwarding_hz, uint, 0644);
|
||||
MODULE_PARM_DESC(diag9c_forwarding_hz, "Maximum diag9c forwarding per second, 0 to turn off");
|
||||
|
||||
/*
|
||||
* allow asynchronous deinit for protected guests; enable by default since
|
||||
* the feature is opt-in anyway
|
||||
*/
|
||||
static int async_destroy = 1;
|
||||
module_param(async_destroy, int, 0444);
|
||||
MODULE_PARM_DESC(async_destroy, "Asynchronous destroy for protected guests");
|
||||
|
||||
/*
|
||||
* For now we handle at most 16 double words as this is what the s390 base
|
||||
* kernel handles and stores in the prefix page. If we ever need to go beyond
|
||||
@ -616,6 +624,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
|
||||
case KVM_CAP_S390_BPB:
|
||||
r = test_facility(82);
|
||||
break;
|
||||
case KVM_CAP_S390_PROTECTED_ASYNC_DISABLE:
|
||||
r = async_destroy && is_prot_virt_host();
|
||||
break;
|
||||
case KVM_CAP_S390_PROTECTED:
|
||||
r = is_prot_virt_host();
|
||||
break;
|
||||
@ -2519,9 +2530,13 @@ static int kvm_s390_pv_dmp(struct kvm *kvm, struct kvm_pv_cmd *cmd,
|
||||
|
||||
static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
|
||||
{
|
||||
const bool need_lock = (cmd->cmd != KVM_PV_ASYNC_CLEANUP_PERFORM);
|
||||
void __user *argp = (void __user *)cmd->data;
|
||||
int r = 0;
|
||||
u16 dummy;
|
||||
void __user *argp = (void __user *)cmd->data;
|
||||
|
||||
if (need_lock)
|
||||
mutex_lock(&kvm->lock);
|
||||
|
||||
switch (cmd->cmd) {
|
||||
case KVM_PV_ENABLE: {
|
||||
@ -2555,6 +2570,31 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
|
||||
set_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs);
|
||||
break;
|
||||
}
|
||||
case KVM_PV_ASYNC_CLEANUP_PREPARE:
|
||||
r = -EINVAL;
|
||||
if (!kvm_s390_pv_is_protected(kvm) || !async_destroy)
|
||||
break;
|
||||
|
||||
r = kvm_s390_cpus_from_pv(kvm, &cmd->rc, &cmd->rrc);
|
||||
/*
|
||||
* If a CPU could not be destroyed, destroy VM will also fail.
|
||||
* There is no point in trying to destroy it. Instead return
|
||||
* the rc and rrc from the first CPU that failed destroying.
|
||||
*/
|
||||
if (r)
|
||||
break;
|
||||
r = kvm_s390_pv_set_aside(kvm, &cmd->rc, &cmd->rrc);
|
||||
|
||||
/* no need to block service interrupts any more */
|
||||
clear_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs);
|
||||
break;
|
||||
case KVM_PV_ASYNC_CLEANUP_PERFORM:
|
||||
r = -EINVAL;
|
||||
if (!async_destroy)
|
||||
break;
|
||||
/* kvm->lock must not be held; this is asserted inside the function. */
|
||||
r = kvm_s390_pv_deinit_aside_vm(kvm, &cmd->rc, &cmd->rrc);
|
||||
break;
|
||||
case KVM_PV_DISABLE: {
|
||||
r = -EINVAL;
|
||||
if (!kvm_s390_pv_is_protected(kvm))
|
||||
@ -2568,7 +2608,7 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
|
||||
*/
|
||||
if (r)
|
||||
break;
|
||||
r = kvm_s390_pv_deinit_vm(kvm, &cmd->rc, &cmd->rrc);
|
||||
r = kvm_s390_pv_deinit_cleanup_all(kvm, &cmd->rc, &cmd->rrc);
|
||||
|
||||
/* no need to block service interrupts any more */
|
||||
clear_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs);
|
||||
@ -2718,6 +2758,9 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
|
||||
default:
|
||||
r = -ENOTTY;
|
||||
}
|
||||
if (need_lock)
|
||||
mutex_unlock(&kvm->lock);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
@ -2922,9 +2965,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
|
||||
r = -EINVAL;
|
||||
break;
|
||||
}
|
||||
mutex_lock(&kvm->lock);
|
||||
/* must be called without kvm->lock */
|
||||
r = kvm_s390_handle_pv(kvm, &args);
|
||||
mutex_unlock(&kvm->lock);
|
||||
if (copy_to_user(argp, &args, sizeof(args))) {
|
||||
r = -EFAULT;
|
||||
break;
|
||||
@ -3243,6 +3285,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
|
||||
kvm_s390_vsie_init(kvm);
|
||||
if (use_gisa)
|
||||
kvm_s390_gisa_init(kvm);
|
||||
INIT_LIST_HEAD(&kvm->arch.pv.need_cleanup);
|
||||
kvm->arch.pv.set_aside = NULL;
|
||||
KVM_EVENT(3, "vm 0x%pK created by pid %u", kvm, current->pid);
|
||||
|
||||
return 0;
|
||||
@ -3287,11 +3331,9 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
|
||||
/*
|
||||
* We are already at the end of life and kvm->lock is not taken.
|
||||
* This is ok as the file descriptor is closed by now and nobody
|
||||
* can mess with the pv state. To avoid lockdep_assert_held from
|
||||
* complaining we do not use kvm_s390_pv_is_protected.
|
||||
* can mess with the pv state.
|
||||
*/
|
||||
if (kvm_s390_pv_get_handle(kvm))
|
||||
kvm_s390_pv_deinit_vm(kvm, &rc, &rrc);
|
||||
kvm_s390_pv_deinit_cleanup_all(kvm, &rc, &rrc);
|
||||
/*
|
||||
* Remove the mmu notifier only when the whole KVM VM is torn down,
|
||||
* and only if one was registered to begin with. If the VM is
|
||||
@ -3344,28 +3386,30 @@ static void sca_del_vcpu(struct kvm_vcpu *vcpu)
|
||||
static void sca_add_vcpu(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
if (!kvm_s390_use_sca_entries()) {
|
||||
struct bsca_block *sca = vcpu->kvm->arch.sca;
|
||||
phys_addr_t sca_phys = virt_to_phys(vcpu->kvm->arch.sca);
|
||||
|
||||
/* we still need the basic sca for the ipte control */
|
||||
vcpu->arch.sie_block->scaoh = (__u32)(((__u64)sca) >> 32);
|
||||
vcpu->arch.sie_block->scaol = (__u32)(__u64)sca;
|
||||
vcpu->arch.sie_block->scaoh = sca_phys >> 32;
|
||||
vcpu->arch.sie_block->scaol = sca_phys;
|
||||
return;
|
||||
}
|
||||
read_lock(&vcpu->kvm->arch.sca_lock);
|
||||
if (vcpu->kvm->arch.use_esca) {
|
||||
struct esca_block *sca = vcpu->kvm->arch.sca;
|
||||
phys_addr_t sca_phys = virt_to_phys(sca);
|
||||
|
||||
sca->cpu[vcpu->vcpu_id].sda = (__u64) vcpu->arch.sie_block;
|
||||
vcpu->arch.sie_block->scaoh = (__u32)(((__u64)sca) >> 32);
|
||||
vcpu->arch.sie_block->scaol = (__u32)(__u64)sca & ~0x3fU;
|
||||
sca->cpu[vcpu->vcpu_id].sda = virt_to_phys(vcpu->arch.sie_block);
|
||||
vcpu->arch.sie_block->scaoh = sca_phys >> 32;
|
||||
vcpu->arch.sie_block->scaol = sca_phys & ESCA_SCAOL_MASK;
|
||||
vcpu->arch.sie_block->ecb2 |= ECB2_ESCA;
|
||||
set_bit_inv(vcpu->vcpu_id, (unsigned long *) sca->mcn);
|
||||
} else {
|
||||
struct bsca_block *sca = vcpu->kvm->arch.sca;
|
||||
phys_addr_t sca_phys = virt_to_phys(sca);
|
||||
|
||||
sca->cpu[vcpu->vcpu_id].sda = (__u64) vcpu->arch.sie_block;
|
||||
vcpu->arch.sie_block->scaoh = (__u32)(((__u64)sca) >> 32);
|
||||
vcpu->arch.sie_block->scaol = (__u32)(__u64)sca;
|
||||
sca->cpu[vcpu->vcpu_id].sda = virt_to_phys(vcpu->arch.sie_block);
|
||||
vcpu->arch.sie_block->scaoh = sca_phys >> 32;
|
||||
vcpu->arch.sie_block->scaol = sca_phys;
|
||||
set_bit_inv(vcpu->vcpu_id, (unsigned long *) &sca->mcn);
|
||||
}
|
||||
read_unlock(&vcpu->kvm->arch.sca_lock);
|
||||
@ -3396,6 +3440,7 @@ static int sca_switch_to_extended(struct kvm *kvm)
|
||||
struct kvm_vcpu *vcpu;
|
||||
unsigned long vcpu_idx;
|
||||
u32 scaol, scaoh;
|
||||
phys_addr_t new_sca_phys;
|
||||
|
||||
if (kvm->arch.use_esca)
|
||||
return 0;
|
||||
@ -3404,8 +3449,9 @@ static int sca_switch_to_extended(struct kvm *kvm)
|
||||
if (!new_sca)
|
||||
return -ENOMEM;
|
||||
|
||||
scaoh = (u32)((u64)(new_sca) >> 32);
|
||||
scaol = (u32)(u64)(new_sca) & ~0x3fU;
|
||||
new_sca_phys = virt_to_phys(new_sca);
|
||||
scaoh = new_sca_phys >> 32;
|
||||
scaol = new_sca_phys & ESCA_SCAOL_MASK;
|
||||
|
||||
kvm_s390_vcpu_block_all(kvm);
|
||||
write_lock(&kvm->arch.sca_lock);
|
||||
@ -3625,15 +3671,18 @@ static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu)
|
||||
|
||||
void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
free_page(vcpu->arch.sie_block->cbrlo);
|
||||
free_page((unsigned long)phys_to_virt(vcpu->arch.sie_block->cbrlo));
|
||||
vcpu->arch.sie_block->cbrlo = 0;
|
||||
}
|
||||
|
||||
int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
vcpu->arch.sie_block->cbrlo = get_zeroed_page(GFP_KERNEL_ACCOUNT);
|
||||
if (!vcpu->arch.sie_block->cbrlo)
|
||||
void *cbrlo_page = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
|
||||
|
||||
if (!cbrlo_page)
|
||||
return -ENOMEM;
|
||||
|
||||
vcpu->arch.sie_block->cbrlo = virt_to_phys(cbrlo_page);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -3643,7 +3692,7 @@ static void kvm_s390_vcpu_setup_model(struct kvm_vcpu *vcpu)
|
||||
|
||||
vcpu->arch.sie_block->ibc = model->ibc;
|
||||
if (test_kvm_facility(vcpu->kvm, 7))
|
||||
vcpu->arch.sie_block->fac = (u32)(u64) model->fac_list;
|
||||
vcpu->arch.sie_block->fac = virt_to_phys(model->fac_list);
|
||||
}
|
||||
|
||||
static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu)
|
||||
@ -3700,9 +3749,8 @@ static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu)
|
||||
VCPU_EVENT(vcpu, 3, "AIV gisa format-%u enabled for cpu %03u",
|
||||
vcpu->arch.sie_block->gd & 0x3, vcpu->vcpu_id);
|
||||
}
|
||||
vcpu->arch.sie_block->sdnxo = ((unsigned long) &vcpu->run->s.regs.sdnx)
|
||||
| SDNXC;
|
||||
vcpu->arch.sie_block->riccbd = (unsigned long) &vcpu->run->s.regs.riccb;
|
||||
vcpu->arch.sie_block->sdnxo = virt_to_phys(&vcpu->run->s.regs.sdnx) | SDNXC;
|
||||
vcpu->arch.sie_block->riccbd = virt_to_phys(&vcpu->run->s.regs.riccb);
|
||||
|
||||
if (sclp.has_kss)
|
||||
kvm_s390_set_cpuflags(vcpu, CPUSTAT_KSS);
|
||||
@ -3752,7 +3800,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
|
||||
return -ENOMEM;
|
||||
|
||||
vcpu->arch.sie_block = &sie_page->sie_block;
|
||||
vcpu->arch.sie_block->itdba = (unsigned long) &sie_page->itdb;
|
||||
vcpu->arch.sie_block->itdba = virt_to_phys(&sie_page->itdb);
|
||||
|
||||
/* the real guest size will always be smaller than msl */
|
||||
vcpu->arch.sie_block->mso = 0;
|
||||
@ -5169,6 +5217,7 @@ static long kvm_s390_vcpu_sida_op(struct kvm_vcpu *vcpu,
|
||||
struct kvm_s390_mem_op *mop)
|
||||
{
|
||||
void __user *uaddr = (void __user *)mop->buf;
|
||||
void *sida_addr;
|
||||
int r = 0;
|
||||
|
||||
if (mop->flags || !mop->size)
|
||||
@ -5180,16 +5229,16 @@ static long kvm_s390_vcpu_sida_op(struct kvm_vcpu *vcpu,
|
||||
if (!kvm_s390_pv_cpu_is_protected(vcpu))
|
||||
return -EINVAL;
|
||||
|
||||
sida_addr = (char *)sida_addr(vcpu->arch.sie_block) + mop->sida_offset;
|
||||
|
||||
switch (mop->op) {
|
||||
case KVM_S390_MEMOP_SIDA_READ:
|
||||
if (copy_to_user(uaddr, (void *)(sida_origin(vcpu->arch.sie_block) +
|
||||
mop->sida_offset), mop->size))
|
||||
if (copy_to_user(uaddr, sida_addr, mop->size))
|
||||
r = -EFAULT;
|
||||
|
||||
break;
|
||||
case KVM_S390_MEMOP_SIDA_WRITE:
|
||||
if (copy_from_user((void *)(sida_origin(vcpu->arch.sie_block) +
|
||||
mop->sida_offset), uaddr, mop->size))
|
||||
if (copy_from_user(sida_addr, uaddr, mop->size))
|
||||
r = -EFAULT;
|
||||
break;
|
||||
}
|
||||
@ -5567,6 +5616,11 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
|
||||
return VM_FAULT_SIGBUS;
|
||||
}
|
||||
|
||||
bool kvm_arch_irqchip_in_kernel(struct kvm *kvm)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Section: memory related */
|
||||
int kvm_arch_prepare_memory_region(struct kvm *kvm,
|
||||
const struct kvm_memory_slot *old,
|
||||
|
@ -23,7 +23,8 @@
|
||||
/* Transactional Memory Execution related macros */
|
||||
#define IS_TE_ENABLED(vcpu) ((vcpu->arch.sie_block->ecb & ECB_TE))
|
||||
#define TDB_FORMAT1 1
|
||||
#define IS_ITDB_VALID(vcpu) ((*(char *)vcpu->arch.sie_block->itdba == TDB_FORMAT1))
|
||||
#define IS_ITDB_VALID(vcpu) \
|
||||
((*(char *)phys_to_virt((vcpu)->arch.sie_block->itdba) == TDB_FORMAT1))
|
||||
|
||||
extern debug_info_t *kvm_s390_dbf;
|
||||
extern debug_info_t *kvm_s390_dbf_uv;
|
||||
@ -233,7 +234,7 @@ static inline unsigned long kvm_s390_get_gfn_end(struct kvm_memslots *slots)
|
||||
|
||||
static inline u32 kvm_s390_get_gisa_desc(struct kvm *kvm)
|
||||
{
|
||||
u32 gd = (u32)(u64)kvm->arch.gisa_int.origin;
|
||||
u32 gd = virt_to_phys(kvm->arch.gisa_int.origin);
|
||||
|
||||
if (gd && sclp.has_gisaf)
|
||||
gd |= GISA_FORMAT1;
|
||||
@ -243,6 +244,9 @@ static inline u32 kvm_s390_get_gisa_desc(struct kvm *kvm)
|
||||
/* implemented in pv.c */
|
||||
int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc);
|
||||
int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc);
|
||||
int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc);
|
||||
int kvm_s390_pv_deinit_aside_vm(struct kvm *kvm, u16 *rc, u16 *rrc);
|
||||
int kvm_s390_pv_deinit_cleanup_all(struct kvm *kvm, u16 *rc, u16 *rrc);
|
||||
int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc);
|
||||
int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc);
|
||||
int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc,
|
||||
|
@ -924,8 +924,7 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
|
||||
return -EREMOTE;
|
||||
}
|
||||
if (kvm_s390_pv_cpu_is_protected(vcpu)) {
|
||||
memcpy((void *)sida_origin(vcpu->arch.sie_block), (void *)mem,
|
||||
PAGE_SIZE);
|
||||
memcpy(sida_addr(vcpu->arch.sie_block), (void *)mem, PAGE_SIZE);
|
||||
rc = 0;
|
||||
} else {
|
||||
rc = write_guest(vcpu, operand2, ar, (void *)mem, PAGE_SIZE);
|
||||
|
@ -18,6 +18,29 @@
|
||||
#include <linux/mmu_notifier.h>
|
||||
#include "kvm-s390.h"
|
||||
|
||||
/**
|
||||
* struct pv_vm_to_be_destroyed - Represents a protected VM that needs to
|
||||
* be destroyed
|
||||
*
|
||||
* @list: list head for the list of leftover VMs
|
||||
* @old_gmap_table: the gmap table of the leftover protected VM
|
||||
* @handle: the handle of the leftover protected VM
|
||||
* @stor_var: pointer to the variable storage of the leftover protected VM
|
||||
* @stor_base: address of the base storage of the leftover protected VM
|
||||
*
|
||||
* Represents a protected VM that is still registered with the Ultravisor,
|
||||
* but which does not correspond any longer to an active KVM VM. It should
|
||||
* be destroyed at some point later, either asynchronously or when the
|
||||
* process terminates.
|
||||
*/
|
||||
struct pv_vm_to_be_destroyed {
|
||||
struct list_head list;
|
||||
unsigned long old_gmap_table;
|
||||
u64 handle;
|
||||
void *stor_var;
|
||||
unsigned long stor_base;
|
||||
};
|
||||
|
||||
static void kvm_s390_clear_pv_state(struct kvm *kvm)
|
||||
{
|
||||
kvm->arch.pv.handle = 0;
|
||||
@ -44,7 +67,7 @@ int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc)
|
||||
free_pages(vcpu->arch.pv.stor_base,
|
||||
get_order(uv_info.guest_cpu_stor_len));
|
||||
|
||||
free_page(sida_origin(vcpu->arch.sie_block));
|
||||
free_page((unsigned long)sida_addr(vcpu->arch.sie_block));
|
||||
vcpu->arch.sie_block->pv_handle_cpu = 0;
|
||||
vcpu->arch.sie_block->pv_handle_config = 0;
|
||||
memset(&vcpu->arch.pv, 0, sizeof(vcpu->arch.pv));
|
||||
@ -66,6 +89,7 @@ int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc)
|
||||
.header.cmd = UVC_CMD_CREATE_SEC_CPU,
|
||||
.header.len = sizeof(uvcb),
|
||||
};
|
||||
void *sida_addr;
|
||||
int cc;
|
||||
|
||||
if (kvm_s390_pv_cpu_get_handle(vcpu))
|
||||
@ -79,16 +103,17 @@ int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc)
|
||||
/* Input */
|
||||
uvcb.guest_handle = kvm_s390_pv_get_handle(vcpu->kvm);
|
||||
uvcb.num = vcpu->arch.sie_block->icpua;
|
||||
uvcb.state_origin = (u64)vcpu->arch.sie_block;
|
||||
uvcb.stor_origin = (u64)vcpu->arch.pv.stor_base;
|
||||
uvcb.state_origin = virt_to_phys(vcpu->arch.sie_block);
|
||||
uvcb.stor_origin = virt_to_phys((void *)vcpu->arch.pv.stor_base);
|
||||
|
||||
/* Alloc Secure Instruction Data Area Designation */
|
||||
vcpu->arch.sie_block->sidad = __get_free_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
|
||||
if (!vcpu->arch.sie_block->sidad) {
|
||||
sida_addr = (void *)__get_free_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
|
||||
if (!sida_addr) {
|
||||
free_pages(vcpu->arch.pv.stor_base,
|
||||
get_order(uv_info.guest_cpu_stor_len));
|
||||
return -ENOMEM;
|
||||
}
|
||||
vcpu->arch.sie_block->sidad = virt_to_phys(sida_addr);
|
||||
|
||||
cc = uv_call(0, (u64)&uvcb);
|
||||
*rc = uvcb.header.rc;
|
||||
@ -159,7 +184,185 @@ static int kvm_s390_pv_alloc_vm(struct kvm *kvm)
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
/* this should not fail, but if it does, we must not free the donated memory */
|
||||
/**
|
||||
* kvm_s390_pv_dispose_one_leftover - Clean up one leftover protected VM.
|
||||
* @kvm: the KVM that was associated with this leftover protected VM
|
||||
* @leftover: details about the leftover protected VM that needs a clean up
|
||||
* @rc: the RC code of the Destroy Secure Configuration UVC
|
||||
* @rrc: the RRC code of the Destroy Secure Configuration UVC
|
||||
*
|
||||
* Destroy one leftover protected VM.
|
||||
* On success, kvm->mm->context.protected_count will be decremented atomically
|
||||
* and all other resources used by the VM will be freed.
|
||||
*
|
||||
* Return: 0 in case of success, otherwise 1
|
||||
*/
|
||||
static int kvm_s390_pv_dispose_one_leftover(struct kvm *kvm,
|
||||
struct pv_vm_to_be_destroyed *leftover,
|
||||
u16 *rc, u16 *rrc)
|
||||
{
|
||||
int cc;
|
||||
|
||||
/* It used the destroy-fast UVC, nothing left to do here */
|
||||
if (!leftover->handle)
|
||||
goto done_fast;
|
||||
cc = uv_cmd_nodata(leftover->handle, UVC_CMD_DESTROY_SEC_CONF, rc, rrc);
|
||||
KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY LEFTOVER VM: rc %x rrc %x", *rc, *rrc);
|
||||
WARN_ONCE(cc, "protvirt destroy leftover vm failed rc %x rrc %x", *rc, *rrc);
|
||||
if (cc)
|
||||
return cc;
|
||||
/*
|
||||
* Intentionally leak unusable memory. If the UVC fails, the memory
|
||||
* used for the VM and its metadata is permanently unusable.
|
||||
* This can only happen in case of a serious KVM or hardware bug; it
|
||||
* is not expected to happen in normal operation.
|
||||
*/
|
||||
free_pages(leftover->stor_base, get_order(uv_info.guest_base_stor_len));
|
||||
free_pages(leftover->old_gmap_table, CRST_ALLOC_ORDER);
|
||||
vfree(leftover->stor_var);
|
||||
done_fast:
|
||||
atomic_dec(&kvm->mm->context.protected_count);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* kvm_s390_destroy_lower_2g - Destroy the first 2GB of protected guest memory.
|
||||
* @kvm: the VM whose memory is to be cleared.
|
||||
*
|
||||
* Destroy the first 2GB of guest memory, to avoid prefix issues after reboot.
|
||||
* The CPUs of the protected VM need to be destroyed beforehand.
|
||||
*/
|
||||
static void kvm_s390_destroy_lower_2g(struct kvm *kvm)
|
||||
{
|
||||
const unsigned long pages_2g = SZ_2G / PAGE_SIZE;
|
||||
struct kvm_memory_slot *slot;
|
||||
unsigned long len;
|
||||
int srcu_idx;
|
||||
|
||||
srcu_idx = srcu_read_lock(&kvm->srcu);
|
||||
|
||||
/* Take the memslot containing guest absolute address 0 */
|
||||
slot = gfn_to_memslot(kvm, 0);
|
||||
/* Clear all slots or parts thereof that are below 2GB */
|
||||
while (slot && slot->base_gfn < pages_2g) {
|
||||
len = min_t(u64, slot->npages, pages_2g - slot->base_gfn) * PAGE_SIZE;
|
||||
s390_uv_destroy_range(kvm->mm, slot->userspace_addr, slot->userspace_addr + len);
|
||||
/* Take the next memslot */
|
||||
slot = gfn_to_memslot(kvm, slot->base_gfn + slot->npages);
|
||||
}
|
||||
|
||||
srcu_read_unlock(&kvm->srcu, srcu_idx);
|
||||
}
|
||||
|
||||
static int kvm_s390_pv_deinit_vm_fast(struct kvm *kvm, u16 *rc, u16 *rrc)
|
||||
{
|
||||
struct uv_cb_destroy_fast uvcb = {
|
||||
.header.cmd = UVC_CMD_DESTROY_SEC_CONF_FAST,
|
||||
.header.len = sizeof(uvcb),
|
||||
.handle = kvm_s390_pv_get_handle(kvm),
|
||||
};
|
||||
int cc;
|
||||
|
||||
cc = uv_call_sched(0, (u64)&uvcb);
|
||||
if (rc)
|
||||
*rc = uvcb.header.rc;
|
||||
if (rrc)
|
||||
*rrc = uvcb.header.rrc;
|
||||
WRITE_ONCE(kvm->arch.gmap->guest_handle, 0);
|
||||
KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM FAST: rc %x rrc %x",
|
||||
uvcb.header.rc, uvcb.header.rrc);
|
||||
WARN_ONCE(cc, "protvirt destroy vm fast failed handle %llx rc %x rrc %x",
|
||||
kvm_s390_pv_get_handle(kvm), uvcb.header.rc, uvcb.header.rrc);
|
||||
/* Inteded memory leak on "impossible" error */
|
||||
if (!cc)
|
||||
kvm_s390_pv_dealloc_vm(kvm);
|
||||
return cc ? -EIO : 0;
|
||||
}
|
||||
|
||||
static inline bool is_destroy_fast_available(void)
|
||||
{
|
||||
return test_bit_inv(BIT_UVC_CMD_DESTROY_SEC_CONF_FAST, uv_info.inst_calls_list);
|
||||
}
|
||||
|
||||
/**
|
||||
* kvm_s390_pv_set_aside - Set aside a protected VM for later teardown.
|
||||
* @kvm: the VM
|
||||
* @rc: return value for the RC field of the UVCB
|
||||
* @rrc: return value for the RRC field of the UVCB
|
||||
*
|
||||
* Set aside the protected VM for a subsequent teardown. The VM will be able
|
||||
* to continue immediately as a non-secure VM, and the information needed to
|
||||
* properly tear down the protected VM is set aside. If another protected VM
|
||||
* was already set aside without starting its teardown, this function will
|
||||
* fail.
|
||||
* The CPUs of the protected VM need to be destroyed beforehand.
|
||||
*
|
||||
* Context: kvm->lock needs to be held
|
||||
*
|
||||
* Return: 0 in case of success, -EINVAL if another protected VM was already set
|
||||
* aside, -ENOMEM if the system ran out of memory.
|
||||
*/
|
||||
int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc)
|
||||
{
|
||||
struct pv_vm_to_be_destroyed *priv;
|
||||
int res = 0;
|
||||
|
||||
lockdep_assert_held(&kvm->lock);
|
||||
/*
|
||||
* If another protected VM was already prepared for teardown, refuse.
|
||||
* A normal deinitialization has to be performed instead.
|
||||
*/
|
||||
if (kvm->arch.pv.set_aside)
|
||||
return -EINVAL;
|
||||
priv = kzalloc(sizeof(*priv), GFP_KERNEL);
|
||||
if (!priv)
|
||||
return -ENOMEM;
|
||||
|
||||
if (is_destroy_fast_available()) {
|
||||
res = kvm_s390_pv_deinit_vm_fast(kvm, rc, rrc);
|
||||
} else {
|
||||
priv->stor_var = kvm->arch.pv.stor_var;
|
||||
priv->stor_base = kvm->arch.pv.stor_base;
|
||||
priv->handle = kvm_s390_pv_get_handle(kvm);
|
||||
priv->old_gmap_table = (unsigned long)kvm->arch.gmap->table;
|
||||
WRITE_ONCE(kvm->arch.gmap->guest_handle, 0);
|
||||
if (s390_replace_asce(kvm->arch.gmap))
|
||||
res = -ENOMEM;
|
||||
}
|
||||
|
||||
if (res) {
|
||||
kfree(priv);
|
||||
return res;
|
||||
}
|
||||
|
||||
kvm_s390_destroy_lower_2g(kvm);
|
||||
kvm_s390_clear_pv_state(kvm);
|
||||
kvm->arch.pv.set_aside = priv;
|
||||
|
||||
*rc = UVC_RC_EXECUTED;
|
||||
*rrc = 42;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* kvm_s390_pv_deinit_vm - Deinitialize the current protected VM
|
||||
* @kvm: the KVM whose protected VM needs to be deinitialized
|
||||
* @rc: the RC code of the UVC
|
||||
* @rrc: the RRC code of the UVC
|
||||
*
|
||||
* Deinitialize the current protected VM. This function will destroy and
|
||||
* cleanup the current protected VM, but it will not cleanup the guest
|
||||
* memory. This function should only be called when the protected VM has
|
||||
* just been created and therefore does not have any guest memory, or when
|
||||
* the caller cleans up the guest memory separately.
|
||||
*
|
||||
* This function should not fail, but if it does, the donated memory must
|
||||
* not be freed.
|
||||
*
|
||||
* Context: kvm->lock needs to be held
|
||||
*
|
||||
* Return: 0 in case of success, otherwise -EIO
|
||||
*/
|
||||
int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
|
||||
{
|
||||
int cc;
|
||||
@ -167,15 +370,6 @@ int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
|
||||
cc = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm),
|
||||
UVC_CMD_DESTROY_SEC_CONF, rc, rrc);
|
||||
WRITE_ONCE(kvm->arch.gmap->guest_handle, 0);
|
||||
/*
|
||||
* if the mm still has a mapping, make all its pages accessible
|
||||
* before destroying the guest
|
||||
*/
|
||||
if (mmget_not_zero(kvm->mm)) {
|
||||
s390_uv_destroy_range(kvm->mm, 0, TASK_SIZE);
|
||||
mmput(kvm->mm);
|
||||
}
|
||||
|
||||
if (!cc) {
|
||||
atomic_dec(&kvm->mm->context.protected_count);
|
||||
kvm_s390_pv_dealloc_vm(kvm);
|
||||
@ -189,11 +383,137 @@ int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
|
||||
return cc ? -EIO : 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* kvm_s390_pv_deinit_cleanup_all - Clean up all protected VMs associated
|
||||
* with a specific KVM.
|
||||
* @kvm: the KVM to be cleaned up
|
||||
* @rc: the RC code of the first failing UVC
|
||||
* @rrc: the RRC code of the first failing UVC
|
||||
*
|
||||
* This function will clean up all protected VMs associated with a KVM.
|
||||
* This includes the active one, the one prepared for deinitialization with
|
||||
* kvm_s390_pv_set_aside, and any still pending in the need_cleanup list.
|
||||
*
|
||||
* Context: kvm->lock needs to be held unless being called from
|
||||
* kvm_arch_destroy_vm.
|
||||
*
|
||||
* Return: 0 if all VMs are successfully cleaned up, otherwise -EIO
|
||||
*/
|
||||
int kvm_s390_pv_deinit_cleanup_all(struct kvm *kvm, u16 *rc, u16 *rrc)
|
||||
{
|
||||
struct pv_vm_to_be_destroyed *cur;
|
||||
bool need_zap = false;
|
||||
u16 _rc, _rrc;
|
||||
int cc = 0;
|
||||
|
||||
/* Make sure the counter does not reach 0 before calling s390_uv_destroy_range */
|
||||
atomic_inc(&kvm->mm->context.protected_count);
|
||||
|
||||
*rc = 1;
|
||||
/* If the current VM is protected, destroy it */
|
||||
if (kvm_s390_pv_get_handle(kvm)) {
|
||||
cc = kvm_s390_pv_deinit_vm(kvm, rc, rrc);
|
||||
need_zap = true;
|
||||
}
|
||||
|
||||
/* If a previous protected VM was set aside, put it in the need_cleanup list */
|
||||
if (kvm->arch.pv.set_aside) {
|
||||
list_add(kvm->arch.pv.set_aside, &kvm->arch.pv.need_cleanup);
|
||||
kvm->arch.pv.set_aside = NULL;
|
||||
}
|
||||
|
||||
/* Cleanup all protected VMs in the need_cleanup list */
|
||||
while (!list_empty(&kvm->arch.pv.need_cleanup)) {
|
||||
cur = list_first_entry(&kvm->arch.pv.need_cleanup, typeof(*cur), list);
|
||||
need_zap = true;
|
||||
if (kvm_s390_pv_dispose_one_leftover(kvm, cur, &_rc, &_rrc)) {
|
||||
cc = 1;
|
||||
/*
|
||||
* Only return the first error rc and rrc, so make
|
||||
* sure it is not overwritten. All destroys will
|
||||
* additionally be reported via KVM_UV_EVENT().
|
||||
*/
|
||||
if (*rc == UVC_RC_EXECUTED) {
|
||||
*rc = _rc;
|
||||
*rrc = _rrc;
|
||||
}
|
||||
}
|
||||
list_del(&cur->list);
|
||||
kfree(cur);
|
||||
}
|
||||
|
||||
/*
|
||||
* If the mm still has a mapping, try to mark all its pages as
|
||||
* accessible. The counter should not reach zero before this
|
||||
* cleanup has been performed.
|
||||
*/
|
||||
if (need_zap && mmget_not_zero(kvm->mm)) {
|
||||
s390_uv_destroy_range(kvm->mm, 0, TASK_SIZE);
|
||||
mmput(kvm->mm);
|
||||
}
|
||||
|
||||
/* Now the counter can safely reach 0 */
|
||||
atomic_dec(&kvm->mm->context.protected_count);
|
||||
return cc ? -EIO : 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* kvm_s390_pv_deinit_aside_vm - Teardown a previously set aside protected VM.
|
||||
* @kvm: the VM previously associated with the protected VM
|
||||
* @rc: return value for the RC field of the UVCB
|
||||
* @rrc: return value for the RRC field of the UVCB
|
||||
*
|
||||
* Tear down the protected VM that had been previously prepared for teardown
|
||||
* using kvm_s390_pv_set_aside_vm. Ideally this should be called by
|
||||
* userspace asynchronously from a separate thread.
|
||||
*
|
||||
* Context: kvm->lock must not be held.
|
||||
*
|
||||
* Return: 0 in case of success, -EINVAL if no protected VM had been
|
||||
* prepared for asynchronous teardowm, -EIO in case of other errors.
|
||||
*/
|
||||
int kvm_s390_pv_deinit_aside_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
|
||||
{
|
||||
struct pv_vm_to_be_destroyed *p;
|
||||
int ret = 0;
|
||||
|
||||
lockdep_assert_not_held(&kvm->lock);
|
||||
mutex_lock(&kvm->lock);
|
||||
p = kvm->arch.pv.set_aside;
|
||||
kvm->arch.pv.set_aside = NULL;
|
||||
mutex_unlock(&kvm->lock);
|
||||
if (!p)
|
||||
return -EINVAL;
|
||||
|
||||
/* When a fatal signal is received, stop immediately */
|
||||
if (s390_uv_destroy_range_interruptible(kvm->mm, 0, TASK_SIZE_MAX))
|
||||
goto done;
|
||||
if (kvm_s390_pv_dispose_one_leftover(kvm, p, rc, rrc))
|
||||
ret = -EIO;
|
||||
kfree(p);
|
||||
p = NULL;
|
||||
done:
|
||||
/*
|
||||
* p is not NULL if we aborted because of a fatal signal, in which
|
||||
* case queue the leftover for later cleanup.
|
||||
*/
|
||||
if (p) {
|
||||
mutex_lock(&kvm->lock);
|
||||
list_add(&p->list, &kvm->arch.pv.need_cleanup);
|
||||
mutex_unlock(&kvm->lock);
|
||||
/* Did not finish, but pretend things went well */
|
||||
*rc = UVC_RC_EXECUTED;
|
||||
*rrc = 42;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void kvm_s390_pv_mmu_notifier_release(struct mmu_notifier *subscription,
|
||||
struct mm_struct *mm)
|
||||
{
|
||||
struct kvm *kvm = container_of(subscription, struct kvm, arch.pv.mmu_notifier);
|
||||
u16 dummy;
|
||||
int r;
|
||||
|
||||
/*
|
||||
* No locking is needed since this is the last thread of the last user of this
|
||||
@ -202,7 +522,9 @@ static void kvm_s390_pv_mmu_notifier_release(struct mmu_notifier *subscription,
|
||||
* unregistered. This means that if this notifier runs, then the
|
||||
* struct kvm is still valid.
|
||||
*/
|
||||
kvm_s390_cpus_from_pv(kvm, &dummy, &dummy);
|
||||
r = kvm_s390_cpus_from_pv(kvm, &dummy, &dummy);
|
||||
if (!r && is_destroy_fast_available() && kvm_s390_pv_get_handle(kvm))
|
||||
kvm_s390_pv_deinit_vm_fast(kvm, &dummy, &dummy);
|
||||
}
|
||||
|
||||
static const struct mmu_notifier_ops kvm_s390_pv_mmu_notifier_ops = {
|
||||
@ -226,8 +548,9 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
|
||||
uvcb.guest_stor_origin = 0; /* MSO is 0 for KVM */
|
||||
uvcb.guest_stor_len = kvm->arch.pv.guest_len;
|
||||
uvcb.guest_asce = kvm->arch.gmap->asce;
|
||||
uvcb.guest_sca = (unsigned long)kvm->arch.sca;
|
||||
uvcb.conf_base_stor_origin = (u64)kvm->arch.pv.stor_base;
|
||||
uvcb.guest_sca = virt_to_phys(kvm->arch.sca);
|
||||
uvcb.conf_base_stor_origin =
|
||||
virt_to_phys((void *)kvm->arch.pv.stor_base);
|
||||
uvcb.conf_virt_stor_origin = (u64)kvm->arch.pv.stor_var;
|
||||
|
||||
cc = uv_call_sched(0, (u64)&uvcb);
|
||||
|
@ -656,7 +656,7 @@ static int pin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t *hpa)
|
||||
page = gfn_to_page(kvm, gpa_to_gfn(gpa));
|
||||
if (is_error_page(page))
|
||||
return -EINVAL;
|
||||
*hpa = (hpa_t) page_to_virt(page) + (gpa & ~PAGE_MASK);
|
||||
*hpa = (hpa_t)page_to_phys(page) + (gpa & ~PAGE_MASK);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -871,7 +871,7 @@ static int pin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page,
|
||||
WARN_ON_ONCE(rc);
|
||||
return 1;
|
||||
}
|
||||
vsie_page->scb_o = (struct kvm_s390_sie_block *) hpa;
|
||||
vsie_page->scb_o = phys_to_virt(hpa);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -72,7 +72,7 @@ static struct gmap *gmap_alloc(unsigned long limit)
|
||||
goto out_free;
|
||||
page->index = 0;
|
||||
list_add(&page->lru, &gmap->crst_list);
|
||||
table = (unsigned long *) page_to_phys(page);
|
||||
table = page_to_virt(page);
|
||||
crst_table_init(table, etype);
|
||||
gmap->table = table;
|
||||
gmap->asce = atype | _ASCE_TABLE_LENGTH |
|
||||
@ -311,12 +311,12 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
|
||||
page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
|
||||
if (!page)
|
||||
return -ENOMEM;
|
||||
new = (unsigned long *) page_to_phys(page);
|
||||
new = page_to_virt(page);
|
||||
crst_table_init(new, init);
|
||||
spin_lock(&gmap->guest_table_lock);
|
||||
if (*table & _REGION_ENTRY_INVALID) {
|
||||
list_add(&page->lru, &gmap->crst_list);
|
||||
*table = (unsigned long) new | _REGION_ENTRY_LENGTH |
|
||||
*table = __pa(new) | _REGION_ENTRY_LENGTH |
|
||||
(*table & _REGION_ENTRY_TYPE_MASK);
|
||||
page->index = gaddr;
|
||||
page = NULL;
|
||||
@ -556,7 +556,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
|
||||
gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY,
|
||||
gaddr & _REGION1_MASK))
|
||||
return -ENOMEM;
|
||||
table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
|
||||
table = __va(*table & _REGION_ENTRY_ORIGIN);
|
||||
}
|
||||
if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) {
|
||||
table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT;
|
||||
@ -564,7 +564,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
|
||||
gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY,
|
||||
gaddr & _REGION2_MASK))
|
||||
return -ENOMEM;
|
||||
table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
|
||||
table = __va(*table & _REGION_ENTRY_ORIGIN);
|
||||
}
|
||||
if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) {
|
||||
table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT;
|
||||
@ -572,7 +572,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
|
||||
gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY,
|
||||
gaddr & _REGION3_MASK))
|
||||
return -ENOMEM;
|
||||
table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
|
||||
table = __va(*table & _REGION_ENTRY_ORIGIN);
|
||||
}
|
||||
table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
|
||||
/* Walk the parent mm page table */
|
||||
@ -812,7 +812,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap,
|
||||
break;
|
||||
if (*table & _REGION_ENTRY_INVALID)
|
||||
return NULL;
|
||||
table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
|
||||
table = __va(*table & _REGION_ENTRY_ORIGIN);
|
||||
fallthrough;
|
||||
case _ASCE_TYPE_REGION2:
|
||||
table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT;
|
||||
@ -820,7 +820,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap,
|
||||
break;
|
||||
if (*table & _REGION_ENTRY_INVALID)
|
||||
return NULL;
|
||||
table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
|
||||
table = __va(*table & _REGION_ENTRY_ORIGIN);
|
||||
fallthrough;
|
||||
case _ASCE_TYPE_REGION3:
|
||||
table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT;
|
||||
@ -828,7 +828,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap,
|
||||
break;
|
||||
if (*table & _REGION_ENTRY_INVALID)
|
||||
return NULL;
|
||||
table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
|
||||
table = __va(*table & _REGION_ENTRY_ORIGIN);
|
||||
fallthrough;
|
||||
case _ASCE_TYPE_SEGMENT:
|
||||
table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
|
||||
@ -836,7 +836,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap,
|
||||
break;
|
||||
if (*table & _REGION_ENTRY_INVALID)
|
||||
return NULL;
|
||||
table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN);
|
||||
table = __va(*table & _SEGMENT_ENTRY_ORIGIN);
|
||||
table += (gaddr & _PAGE_INDEX) >> _PAGE_SHIFT;
|
||||
}
|
||||
return table;
|
||||
@ -1149,7 +1149,7 @@ int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val)
|
||||
if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) {
|
||||
address = pte_val(pte) & PAGE_MASK;
|
||||
address += gaddr & ~PAGE_MASK;
|
||||
*val = *(unsigned long *) address;
|
||||
*val = *(unsigned long *)__va(address);
|
||||
set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_YOUNG)));
|
||||
/* Do *NOT* clear the _PAGE_INVALID bit! */
|
||||
rc = 0;
|
||||
@ -1334,7 +1334,8 @@ static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr,
|
||||
*/
|
||||
static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
|
||||
{
|
||||
unsigned long sto, *ste, *pgt;
|
||||
unsigned long *ste;
|
||||
phys_addr_t sto, pgt;
|
||||
struct page *page;
|
||||
|
||||
BUG_ON(!gmap_is_shadow(sg));
|
||||
@ -1342,13 +1343,13 @@ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
|
||||
if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN))
|
||||
return;
|
||||
gmap_call_notifier(sg, raddr, raddr + _SEGMENT_SIZE - 1);
|
||||
sto = (unsigned long) (ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT));
|
||||
sto = __pa(ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT));
|
||||
gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr);
|
||||
pgt = (unsigned long *)(*ste & _SEGMENT_ENTRY_ORIGIN);
|
||||
pgt = *ste & _SEGMENT_ENTRY_ORIGIN;
|
||||
*ste = _SEGMENT_ENTRY_EMPTY;
|
||||
__gmap_unshadow_pgt(sg, raddr, pgt);
|
||||
__gmap_unshadow_pgt(sg, raddr, __va(pgt));
|
||||
/* Free page table */
|
||||
page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT);
|
||||
page = phys_to_page(pgt);
|
||||
list_del(&page->lru);
|
||||
page_table_free_pgste(page);
|
||||
}
|
||||
@ -1364,19 +1365,19 @@ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
|
||||
static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr,
|
||||
unsigned long *sgt)
|
||||
{
|
||||
unsigned long *pgt;
|
||||
struct page *page;
|
||||
phys_addr_t pgt;
|
||||
int i;
|
||||
|
||||
BUG_ON(!gmap_is_shadow(sg));
|
||||
for (i = 0; i < _CRST_ENTRIES; i++, raddr += _SEGMENT_SIZE) {
|
||||
if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN))
|
||||
continue;
|
||||
pgt = (unsigned long *)(sgt[i] & _REGION_ENTRY_ORIGIN);
|
||||
pgt = sgt[i] & _REGION_ENTRY_ORIGIN;
|
||||
sgt[i] = _SEGMENT_ENTRY_EMPTY;
|
||||
__gmap_unshadow_pgt(sg, raddr, pgt);
|
||||
__gmap_unshadow_pgt(sg, raddr, __va(pgt));
|
||||
/* Free page table */
|
||||
page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT);
|
||||
page = phys_to_page(pgt);
|
||||
list_del(&page->lru);
|
||||
page_table_free_pgste(page);
|
||||
}
|
||||
@ -1391,7 +1392,8 @@ static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr,
|
||||
*/
|
||||
static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr)
|
||||
{
|
||||
unsigned long r3o, *r3e, *sgt;
|
||||
unsigned long r3o, *r3e;
|
||||
phys_addr_t sgt;
|
||||
struct page *page;
|
||||
|
||||
BUG_ON(!gmap_is_shadow(sg));
|
||||
@ -1400,12 +1402,12 @@ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr)
|
||||
return;
|
||||
gmap_call_notifier(sg, raddr, raddr + _REGION3_SIZE - 1);
|
||||
r3o = (unsigned long) (r3e - ((raddr & _REGION3_INDEX) >> _REGION3_SHIFT));
|
||||
gmap_idte_one(r3o | _ASCE_TYPE_REGION3, raddr);
|
||||
sgt = (unsigned long *)(*r3e & _REGION_ENTRY_ORIGIN);
|
||||
gmap_idte_one(__pa(r3o) | _ASCE_TYPE_REGION3, raddr);
|
||||
sgt = *r3e & _REGION_ENTRY_ORIGIN;
|
||||
*r3e = _REGION3_ENTRY_EMPTY;
|
||||
__gmap_unshadow_sgt(sg, raddr, sgt);
|
||||
__gmap_unshadow_sgt(sg, raddr, __va(sgt));
|
||||
/* Free segment table */
|
||||
page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT);
|
||||
page = phys_to_page(sgt);
|
||||
list_del(&page->lru);
|
||||
__free_pages(page, CRST_ALLOC_ORDER);
|
||||
}
|
||||
@ -1421,19 +1423,19 @@ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr)
|
||||
static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr,
|
||||
unsigned long *r3t)
|
||||
{
|
||||
unsigned long *sgt;
|
||||
struct page *page;
|
||||
phys_addr_t sgt;
|
||||
int i;
|
||||
|
||||
BUG_ON(!gmap_is_shadow(sg));
|
||||
for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION3_SIZE) {
|
||||
if (!(r3t[i] & _REGION_ENTRY_ORIGIN))
|
||||
continue;
|
||||
sgt = (unsigned long *)(r3t[i] & _REGION_ENTRY_ORIGIN);
|
||||
sgt = r3t[i] & _REGION_ENTRY_ORIGIN;
|
||||
r3t[i] = _REGION3_ENTRY_EMPTY;
|
||||
__gmap_unshadow_sgt(sg, raddr, sgt);
|
||||
__gmap_unshadow_sgt(sg, raddr, __va(sgt));
|
||||
/* Free segment table */
|
||||
page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT);
|
||||
page = phys_to_page(sgt);
|
||||
list_del(&page->lru);
|
||||
__free_pages(page, CRST_ALLOC_ORDER);
|
||||
}
|
||||
@ -1448,7 +1450,8 @@ static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr,
|
||||
*/
|
||||
static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr)
|
||||
{
|
||||
unsigned long r2o, *r2e, *r3t;
|
||||
unsigned long r2o, *r2e;
|
||||
phys_addr_t r3t;
|
||||
struct page *page;
|
||||
|
||||
BUG_ON(!gmap_is_shadow(sg));
|
||||
@ -1457,12 +1460,12 @@ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr)
|
||||
return;
|
||||
gmap_call_notifier(sg, raddr, raddr + _REGION2_SIZE - 1);
|
||||
r2o = (unsigned long) (r2e - ((raddr & _REGION2_INDEX) >> _REGION2_SHIFT));
|
||||
gmap_idte_one(r2o | _ASCE_TYPE_REGION2, raddr);
|
||||
r3t = (unsigned long *)(*r2e & _REGION_ENTRY_ORIGIN);
|
||||
gmap_idte_one(__pa(r2o) | _ASCE_TYPE_REGION2, raddr);
|
||||
r3t = *r2e & _REGION_ENTRY_ORIGIN;
|
||||
*r2e = _REGION2_ENTRY_EMPTY;
|
||||
__gmap_unshadow_r3t(sg, raddr, r3t);
|
||||
__gmap_unshadow_r3t(sg, raddr, __va(r3t));
|
||||
/* Free region 3 table */
|
||||
page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT);
|
||||
page = phys_to_page(r3t);
|
||||
list_del(&page->lru);
|
||||
__free_pages(page, CRST_ALLOC_ORDER);
|
||||
}
|
||||
@ -1478,7 +1481,7 @@ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr)
|
||||
static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr,
|
||||
unsigned long *r2t)
|
||||
{
|
||||
unsigned long *r3t;
|
||||
phys_addr_t r3t;
|
||||
struct page *page;
|
||||
int i;
|
||||
|
||||
@ -1486,11 +1489,11 @@ static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr,
|
||||
for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION2_SIZE) {
|
||||
if (!(r2t[i] & _REGION_ENTRY_ORIGIN))
|
||||
continue;
|
||||
r3t = (unsigned long *)(r2t[i] & _REGION_ENTRY_ORIGIN);
|
||||
r3t = r2t[i] & _REGION_ENTRY_ORIGIN;
|
||||
r2t[i] = _REGION2_ENTRY_EMPTY;
|
||||
__gmap_unshadow_r3t(sg, raddr, r3t);
|
||||
__gmap_unshadow_r3t(sg, raddr, __va(r3t));
|
||||
/* Free region 3 table */
|
||||
page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT);
|
||||
page = phys_to_page(r3t);
|
||||
list_del(&page->lru);
|
||||
__free_pages(page, CRST_ALLOC_ORDER);
|
||||
}
|
||||
@ -1505,8 +1508,9 @@ static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr,
|
||||
*/
|
||||
static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr)
|
||||
{
|
||||
unsigned long r1o, *r1e, *r2t;
|
||||
unsigned long r1o, *r1e;
|
||||
struct page *page;
|
||||
phys_addr_t r2t;
|
||||
|
||||
BUG_ON(!gmap_is_shadow(sg));
|
||||
r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */
|
||||
@ -1514,12 +1518,12 @@ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr)
|
||||
return;
|
||||
gmap_call_notifier(sg, raddr, raddr + _REGION1_SIZE - 1);
|
||||
r1o = (unsigned long) (r1e - ((raddr & _REGION1_INDEX) >> _REGION1_SHIFT));
|
||||
gmap_idte_one(r1o | _ASCE_TYPE_REGION1, raddr);
|
||||
r2t = (unsigned long *)(*r1e & _REGION_ENTRY_ORIGIN);
|
||||
gmap_idte_one(__pa(r1o) | _ASCE_TYPE_REGION1, raddr);
|
||||
r2t = *r1e & _REGION_ENTRY_ORIGIN;
|
||||
*r1e = _REGION1_ENTRY_EMPTY;
|
||||
__gmap_unshadow_r2t(sg, raddr, r2t);
|
||||
__gmap_unshadow_r2t(sg, raddr, __va(r2t));
|
||||
/* Free region 2 table */
|
||||
page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT);
|
||||
page = phys_to_page(r2t);
|
||||
list_del(&page->lru);
|
||||
__free_pages(page, CRST_ALLOC_ORDER);
|
||||
}
|
||||
@ -1535,22 +1539,23 @@ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr)
|
||||
static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr,
|
||||
unsigned long *r1t)
|
||||
{
|
||||
unsigned long asce, *r2t;
|
||||
unsigned long asce;
|
||||
struct page *page;
|
||||
phys_addr_t r2t;
|
||||
int i;
|
||||
|
||||
BUG_ON(!gmap_is_shadow(sg));
|
||||
asce = (unsigned long) r1t | _ASCE_TYPE_REGION1;
|
||||
asce = __pa(r1t) | _ASCE_TYPE_REGION1;
|
||||
for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION1_SIZE) {
|
||||
if (!(r1t[i] & _REGION_ENTRY_ORIGIN))
|
||||
continue;
|
||||
r2t = (unsigned long *)(r1t[i] & _REGION_ENTRY_ORIGIN);
|
||||
__gmap_unshadow_r2t(sg, raddr, r2t);
|
||||
r2t = r1t[i] & _REGION_ENTRY_ORIGIN;
|
||||
__gmap_unshadow_r2t(sg, raddr, __va(r2t));
|
||||
/* Clear entry and flush translation r1t -> r2t */
|
||||
gmap_idte_one(asce, raddr);
|
||||
r1t[i] = _REGION1_ENTRY_EMPTY;
|
||||
/* Free region 2 table */
|
||||
page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT);
|
||||
page = phys_to_page(r2t);
|
||||
list_del(&page->lru);
|
||||
__free_pages(page, CRST_ALLOC_ORDER);
|
||||
}
|
||||
@ -1572,7 +1577,7 @@ static void gmap_unshadow(struct gmap *sg)
|
||||
sg->removed = 1;
|
||||
gmap_call_notifier(sg, 0, -1UL);
|
||||
gmap_flush_tlb(sg);
|
||||
table = (unsigned long *)(sg->asce & _ASCE_ORIGIN);
|
||||
table = __va(sg->asce & _ASCE_ORIGIN);
|
||||
switch (sg->asce & _ASCE_TYPE_MASK) {
|
||||
case _ASCE_TYPE_REGION1:
|
||||
__gmap_unshadow_r1t(sg, 0, table);
|
||||
@ -1747,7 +1752,8 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
|
||||
int fake)
|
||||
{
|
||||
unsigned long raddr, origin, offset, len;
|
||||
unsigned long *s_r2t, *table;
|
||||
unsigned long *table;
|
||||
phys_addr_t s_r2t;
|
||||
struct page *page;
|
||||
int rc;
|
||||
|
||||
@ -1759,7 +1765,7 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
|
||||
page->index = r2t & _REGION_ENTRY_ORIGIN;
|
||||
if (fake)
|
||||
page->index |= GMAP_SHADOW_FAKE_TABLE;
|
||||
s_r2t = (unsigned long *) page_to_phys(page);
|
||||
s_r2t = page_to_phys(page);
|
||||
/* Install shadow region second table */
|
||||
spin_lock(&sg->guest_table_lock);
|
||||
table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */
|
||||
@ -1774,9 +1780,9 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
|
||||
rc = -EAGAIN; /* Race with shadow */
|
||||
goto out_free;
|
||||
}
|
||||
crst_table_init(s_r2t, _REGION2_ENTRY_EMPTY);
|
||||
crst_table_init(__va(s_r2t), _REGION2_ENTRY_EMPTY);
|
||||
/* mark as invalid as long as the parent table is not protected */
|
||||
*table = (unsigned long) s_r2t | _REGION_ENTRY_LENGTH |
|
||||
*table = s_r2t | _REGION_ENTRY_LENGTH |
|
||||
_REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID;
|
||||
if (sg->edat_level >= 1)
|
||||
*table |= (r2t & _REGION_ENTRY_PROTECT);
|
||||
@ -1797,8 +1803,7 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
|
||||
spin_lock(&sg->guest_table_lock);
|
||||
if (!rc) {
|
||||
table = gmap_table_walk(sg, saddr, 4);
|
||||
if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
|
||||
(unsigned long) s_r2t)
|
||||
if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r2t)
|
||||
rc = -EAGAIN; /* Race with unshadow */
|
||||
else
|
||||
*table &= ~_REGION_ENTRY_INVALID;
|
||||
@ -1831,7 +1836,8 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
|
||||
int fake)
|
||||
{
|
||||
unsigned long raddr, origin, offset, len;
|
||||
unsigned long *s_r3t, *table;
|
||||
unsigned long *table;
|
||||
phys_addr_t s_r3t;
|
||||
struct page *page;
|
||||
int rc;
|
||||
|
||||
@ -1843,7 +1849,7 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
|
||||
page->index = r3t & _REGION_ENTRY_ORIGIN;
|
||||
if (fake)
|
||||
page->index |= GMAP_SHADOW_FAKE_TABLE;
|
||||
s_r3t = (unsigned long *) page_to_phys(page);
|
||||
s_r3t = page_to_phys(page);
|
||||
/* Install shadow region second table */
|
||||
spin_lock(&sg->guest_table_lock);
|
||||
table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */
|
||||
@ -1858,9 +1864,9 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
|
||||
rc = -EAGAIN; /* Race with shadow */
|
||||
goto out_free;
|
||||
}
|
||||
crst_table_init(s_r3t, _REGION3_ENTRY_EMPTY);
|
||||
crst_table_init(__va(s_r3t), _REGION3_ENTRY_EMPTY);
|
||||
/* mark as invalid as long as the parent table is not protected */
|
||||
*table = (unsigned long) s_r3t | _REGION_ENTRY_LENGTH |
|
||||
*table = s_r3t | _REGION_ENTRY_LENGTH |
|
||||
_REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID;
|
||||
if (sg->edat_level >= 1)
|
||||
*table |= (r3t & _REGION_ENTRY_PROTECT);
|
||||
@ -1881,8 +1887,7 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
|
||||
spin_lock(&sg->guest_table_lock);
|
||||
if (!rc) {
|
||||
table = gmap_table_walk(sg, saddr, 3);
|
||||
if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
|
||||
(unsigned long) s_r3t)
|
||||
if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r3t)
|
||||
rc = -EAGAIN; /* Race with unshadow */
|
||||
else
|
||||
*table &= ~_REGION_ENTRY_INVALID;
|
||||
@ -1915,7 +1920,8 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
|
||||
int fake)
|
||||
{
|
||||
unsigned long raddr, origin, offset, len;
|
||||
unsigned long *s_sgt, *table;
|
||||
unsigned long *table;
|
||||
phys_addr_t s_sgt;
|
||||
struct page *page;
|
||||
int rc;
|
||||
|
||||
@ -1927,7 +1933,7 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
|
||||
page->index = sgt & _REGION_ENTRY_ORIGIN;
|
||||
if (fake)
|
||||
page->index |= GMAP_SHADOW_FAKE_TABLE;
|
||||
s_sgt = (unsigned long *) page_to_phys(page);
|
||||
s_sgt = page_to_phys(page);
|
||||
/* Install shadow region second table */
|
||||
spin_lock(&sg->guest_table_lock);
|
||||
table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */
|
||||
@ -1942,9 +1948,9 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
|
||||
rc = -EAGAIN; /* Race with shadow */
|
||||
goto out_free;
|
||||
}
|
||||
crst_table_init(s_sgt, _SEGMENT_ENTRY_EMPTY);
|
||||
crst_table_init(__va(s_sgt), _SEGMENT_ENTRY_EMPTY);
|
||||
/* mark as invalid as long as the parent table is not protected */
|
||||
*table = (unsigned long) s_sgt | _REGION_ENTRY_LENGTH |
|
||||
*table = s_sgt | _REGION_ENTRY_LENGTH |
|
||||
_REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID;
|
||||
if (sg->edat_level >= 1)
|
||||
*table |= sgt & _REGION_ENTRY_PROTECT;
|
||||
@ -1965,8 +1971,7 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
|
||||
spin_lock(&sg->guest_table_lock);
|
||||
if (!rc) {
|
||||
table = gmap_table_walk(sg, saddr, 2);
|
||||
if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
|
||||
(unsigned long) s_sgt)
|
||||
if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_sgt)
|
||||
rc = -EAGAIN; /* Race with unshadow */
|
||||
else
|
||||
*table &= ~_REGION_ENTRY_INVALID;
|
||||
@ -2039,8 +2044,9 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
|
||||
int fake)
|
||||
{
|
||||
unsigned long raddr, origin;
|
||||
unsigned long *s_pgt, *table;
|
||||
unsigned long *table;
|
||||
struct page *page;
|
||||
phys_addr_t s_pgt;
|
||||
int rc;
|
||||
|
||||
BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE));
|
||||
@ -2051,7 +2057,7 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
|
||||
page->index = pgt & _SEGMENT_ENTRY_ORIGIN;
|
||||
if (fake)
|
||||
page->index |= GMAP_SHADOW_FAKE_TABLE;
|
||||
s_pgt = (unsigned long *) page_to_phys(page);
|
||||
s_pgt = page_to_phys(page);
|
||||
/* Install shadow page table */
|
||||
spin_lock(&sg->guest_table_lock);
|
||||
table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
|
||||
@ -2084,8 +2090,7 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
|
||||
spin_lock(&sg->guest_table_lock);
|
||||
if (!rc) {
|
||||
table = gmap_table_walk(sg, saddr, 1);
|
||||
if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) !=
|
||||
(unsigned long) s_pgt)
|
||||
if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != s_pgt)
|
||||
rc = -EAGAIN; /* Race with unshadow */
|
||||
else
|
||||
*table &= ~_SEGMENT_ENTRY_INVALID;
|
||||
|
@ -141,25 +141,25 @@ void mark_rodata_ro(void)
|
||||
debug_checkwx();
|
||||
}
|
||||
|
||||
int set_memory_encrypted(unsigned long addr, int numpages)
|
||||
int set_memory_encrypted(unsigned long vaddr, int numpages)
|
||||
{
|
||||
int i;
|
||||
|
||||
/* make specified pages unshared, (swiotlb, dma_free) */
|
||||
for (i = 0; i < numpages; ++i) {
|
||||
uv_remove_shared(addr);
|
||||
addr += PAGE_SIZE;
|
||||
uv_remove_shared(virt_to_phys((void *)vaddr));
|
||||
vaddr += PAGE_SIZE;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int set_memory_decrypted(unsigned long addr, int numpages)
|
||||
int set_memory_decrypted(unsigned long vaddr, int numpages)
|
||||
{
|
||||
int i;
|
||||
/* make specified pages shared (swiotlb, dma_alloca) */
|
||||
for (i = 0; i < numpages; ++i) {
|
||||
uv_set_shared(addr);
|
||||
addr += PAGE_SIZE;
|
||||
uv_set_shared(virt_to_phys((void *)vaddr));
|
||||
vaddr += PAGE_SIZE;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
@ -1603,10 +1603,8 @@ void __init intel_pmu_arch_lbr_init(void)
|
||||
* x86_perf_get_lbr - get the LBR records information
|
||||
*
|
||||
* @lbr: the caller's memory to store the LBR records information
|
||||
*
|
||||
* Returns: 0 indicates the LBR info has been successfully obtained
|
||||
*/
|
||||
int x86_perf_get_lbr(struct x86_pmu_lbr *lbr)
|
||||
void x86_perf_get_lbr(struct x86_pmu_lbr *lbr)
|
||||
{
|
||||
int lbr_fmt = x86_pmu.intel_cap.lbr_format;
|
||||
|
||||
@ -1614,8 +1612,6 @@ int x86_perf_get_lbr(struct x86_pmu_lbr *lbr)
|
||||
lbr->from = x86_pmu.lbr_from;
|
||||
lbr->to = x86_pmu.lbr_to;
|
||||
lbr->info = (lbr_fmt == LBR_FORMAT_INFO) ? x86_pmu.lbr_info : 0;
|
||||
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(x86_perf_get_lbr);
|
||||
|
||||
|
@ -311,6 +311,9 @@
|
||||
/* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
|
||||
#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */
|
||||
#define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */
|
||||
#define X86_FEATURE_CMPCCXADD (12*32+ 7) /* "" CMPccXADD instructions */
|
||||
#define X86_FEATURE_AMX_FP16 (12*32+21) /* "" AMX fp16 Support */
|
||||
#define X86_FEATURE_AVX_IFMA (12*32+23) /* "" Support for VPMADD52[H,L]UQ */
|
||||
|
||||
/* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */
|
||||
#define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */
|
||||
|
@ -61,6 +61,8 @@
|
||||
#define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE BIT(10)
|
||||
/* Support for debug MSRs available */
|
||||
#define HV_FEATURE_DEBUG_MSRS_AVAILABLE BIT(11)
|
||||
/* Support for extended gva ranges for flush hypercalls available */
|
||||
#define HV_FEATURE_EXT_GVA_RANGES_FLUSH BIT(14)
|
||||
/*
|
||||
* Support for returning hypercall output block via XMM
|
||||
* registers is available
|
||||
@ -607,6 +609,41 @@ struct hv_enlightened_vmcs {
|
||||
|
||||
#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL 0xFFFF
|
||||
|
||||
/*
|
||||
* Note, Hyper-V isn't actually stealing bit 28 from Intel, just abusing it by
|
||||
* pairing it with architecturally impossible exit reasons. Bit 28 is set only
|
||||
* on SMI exits to a SMI transfer monitor (STM) and if and only if a MTF VM-Exit
|
||||
* is pending. I.e. it will never be set by hardware for non-SMI exits (there
|
||||
* are only three), nor will it ever be set unless the VMM is an STM.
|
||||
*/
|
||||
#define HV_VMX_SYNTHETIC_EXIT_REASON_TRAP_AFTER_FLUSH 0x10000031
|
||||
|
||||
/*
|
||||
* Hyper-V uses the software reserved 32 bytes in VMCB control area to expose
|
||||
* SVM enlightenments to guests.
|
||||
*/
|
||||
struct hv_vmcb_enlightenments {
|
||||
struct __packed hv_enlightenments_control {
|
||||
u32 nested_flush_hypercall:1;
|
||||
u32 msr_bitmap:1;
|
||||
u32 enlightened_npt_tlb: 1;
|
||||
u32 reserved:29;
|
||||
} __packed hv_enlightenments_control;
|
||||
u32 hv_vp_id;
|
||||
u64 hv_vm_id;
|
||||
u64 partition_assist_page;
|
||||
u64 reserved;
|
||||
} __packed;
|
||||
|
||||
/*
|
||||
* Hyper-V uses the software reserved clean bit in VMCB.
|
||||
*/
|
||||
#define HV_VMCB_NESTED_ENLIGHTENMENTS 31
|
||||
|
||||
/* Synthetic VM-Exit */
|
||||
#define HV_SVM_EXITCODE_ENL 0xf0000000
|
||||
#define HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH (1)
|
||||
|
||||
struct hv_partition_assist_pg {
|
||||
u32 tlb_lock_count;
|
||||
};
|
||||
|
@ -110,10 +110,12 @@ KVM_X86_OP_OPTIONAL_RET0(dy_apicv_has_pending_interrupt)
|
||||
KVM_X86_OP_OPTIONAL(set_hv_timer)
|
||||
KVM_X86_OP_OPTIONAL(cancel_hv_timer)
|
||||
KVM_X86_OP(setup_mce)
|
||||
#ifdef CONFIG_KVM_SMM
|
||||
KVM_X86_OP(smi_allowed)
|
||||
KVM_X86_OP(enter_smm)
|
||||
KVM_X86_OP(leave_smm)
|
||||
KVM_X86_OP(enable_smi_window)
|
||||
#endif
|
||||
KVM_X86_OP_OPTIONAL(mem_enc_ioctl)
|
||||
KVM_X86_OP_OPTIONAL(mem_enc_register_region)
|
||||
KVM_X86_OP_OPTIONAL(mem_enc_unregister_region)
|
||||
@ -123,7 +125,7 @@ KVM_X86_OP_OPTIONAL(guest_memory_reclaimed)
|
||||
KVM_X86_OP(get_msr_feature)
|
||||
KVM_X86_OP(can_emulate_instruction)
|
||||
KVM_X86_OP(apic_init_signal_blocked)
|
||||
KVM_X86_OP_OPTIONAL(enable_direct_tlbflush)
|
||||
KVM_X86_OP_OPTIONAL(enable_l2_tlb_flush)
|
||||
KVM_X86_OP_OPTIONAL(migrate_timers)
|
||||
KVM_X86_OP(msr_filter_changed)
|
||||
KVM_X86_OP(complete_emulated_msr)
|
||||
|
@ -25,6 +25,7 @@
|
||||
#include <linux/clocksource.h>
|
||||
#include <linux/irqbypass.h>
|
||||
#include <linux/hyperv.h>
|
||||
#include <linux/kfifo.h>
|
||||
|
||||
#include <asm/apic.h>
|
||||
#include <asm/pvclock-abi.h>
|
||||
@ -81,7 +82,9 @@
|
||||
#define KVM_REQ_NMI KVM_ARCH_REQ(9)
|
||||
#define KVM_REQ_PMU KVM_ARCH_REQ(10)
|
||||
#define KVM_REQ_PMI KVM_ARCH_REQ(11)
|
||||
#ifdef CONFIG_KVM_SMM
|
||||
#define KVM_REQ_SMI KVM_ARCH_REQ(12)
|
||||
#endif
|
||||
#define KVM_REQ_MASTERCLOCK_UPDATE KVM_ARCH_REQ(13)
|
||||
#define KVM_REQ_MCLOCK_INPROGRESS \
|
||||
KVM_ARCH_REQ_FLAGS(14, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
|
||||
@ -108,6 +111,8 @@
|
||||
KVM_ARCH_REQ_FLAGS(30, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
|
||||
#define KVM_REQ_MMU_FREE_OBSOLETE_ROOTS \
|
||||
KVM_ARCH_REQ_FLAGS(31, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
|
||||
#define KVM_REQ_HV_TLB_FLUSH \
|
||||
KVM_ARCH_REQ_FLAGS(32, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
|
||||
|
||||
#define CR0_RESERVED_BITS \
|
||||
(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
|
||||
@ -204,6 +209,7 @@ typedef enum exit_fastpath_completion fastpath_t;
|
||||
|
||||
struct x86_emulate_ctxt;
|
||||
struct x86_exception;
|
||||
union kvm_smram;
|
||||
enum x86_intercept;
|
||||
enum x86_intercept_stage;
|
||||
|
||||
@ -253,16 +259,16 @@ enum x86_intercept_stage;
|
||||
#define PFERR_GUEST_PAGE_BIT 33
|
||||
#define PFERR_IMPLICIT_ACCESS_BIT 48
|
||||
|
||||
#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT)
|
||||
#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT)
|
||||
#define PFERR_USER_MASK (1U << PFERR_USER_BIT)
|
||||
#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT)
|
||||
#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT)
|
||||
#define PFERR_PK_MASK (1U << PFERR_PK_BIT)
|
||||
#define PFERR_SGX_MASK (1U << PFERR_SGX_BIT)
|
||||
#define PFERR_GUEST_FINAL_MASK (1ULL << PFERR_GUEST_FINAL_BIT)
|
||||
#define PFERR_GUEST_PAGE_MASK (1ULL << PFERR_GUEST_PAGE_BIT)
|
||||
#define PFERR_IMPLICIT_ACCESS (1ULL << PFERR_IMPLICIT_ACCESS_BIT)
|
||||
#define PFERR_PRESENT_MASK BIT(PFERR_PRESENT_BIT)
|
||||
#define PFERR_WRITE_MASK BIT(PFERR_WRITE_BIT)
|
||||
#define PFERR_USER_MASK BIT(PFERR_USER_BIT)
|
||||
#define PFERR_RSVD_MASK BIT(PFERR_RSVD_BIT)
|
||||
#define PFERR_FETCH_MASK BIT(PFERR_FETCH_BIT)
|
||||
#define PFERR_PK_MASK BIT(PFERR_PK_BIT)
|
||||
#define PFERR_SGX_MASK BIT(PFERR_SGX_BIT)
|
||||
#define PFERR_GUEST_FINAL_MASK BIT_ULL(PFERR_GUEST_FINAL_BIT)
|
||||
#define PFERR_GUEST_PAGE_MASK BIT_ULL(PFERR_GUEST_PAGE_BIT)
|
||||
#define PFERR_IMPLICIT_ACCESS BIT_ULL(PFERR_IMPLICIT_ACCESS_BIT)
|
||||
|
||||
#define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \
|
||||
PFERR_WRITE_MASK | \
|
||||
@ -488,17 +494,19 @@ enum pmc_type {
|
||||
struct kvm_pmc {
|
||||
enum pmc_type type;
|
||||
u8 idx;
|
||||
bool is_paused;
|
||||
bool intr;
|
||||
u64 counter;
|
||||
u64 prev_counter;
|
||||
u64 eventsel;
|
||||
struct perf_event *perf_event;
|
||||
struct kvm_vcpu *vcpu;
|
||||
/*
|
||||
* only for creating or reusing perf_event,
|
||||
* eventsel value for general purpose counters,
|
||||
* ctrl value for fixed counters.
|
||||
*/
|
||||
u64 current_config;
|
||||
bool is_paused;
|
||||
bool intr;
|
||||
};
|
||||
|
||||
/* More counters may conflict with other existing Architectural MSRs */
|
||||
@ -524,7 +532,16 @@ struct kvm_pmu {
|
||||
struct kvm_pmc gp_counters[KVM_INTEL_PMC_MAX_GENERIC];
|
||||
struct kvm_pmc fixed_counters[KVM_PMC_MAX_FIXED];
|
||||
struct irq_work irq_work;
|
||||
DECLARE_BITMAP(reprogram_pmi, X86_PMC_IDX_MAX);
|
||||
|
||||
/*
|
||||
* Overlay the bitmap with a 64-bit atomic so that all bits can be
|
||||
* set in a single access, e.g. to reprogram all counters when the PMU
|
||||
* filter changes.
|
||||
*/
|
||||
union {
|
||||
DECLARE_BITMAP(reprogram_pmi, X86_PMC_IDX_MAX);
|
||||
atomic64_t __reprogram_pmi;
|
||||
};
|
||||
DECLARE_BITMAP(all_valid_pmc_idx, X86_PMC_IDX_MAX);
|
||||
DECLARE_BITMAP(pmc_in_use, X86_PMC_IDX_MAX);
|
||||
|
||||
@ -602,6 +619,29 @@ struct kvm_vcpu_hv_synic {
|
||||
bool dont_zero_synic_pages;
|
||||
};
|
||||
|
||||
/* The maximum number of entries on the TLB flush fifo. */
|
||||
#define KVM_HV_TLB_FLUSH_FIFO_SIZE (16)
|
||||
/*
|
||||
* Note: the following 'magic' entry is made up by KVM to avoid putting
|
||||
* anything besides GVA on the TLB flush fifo. It is theoretically possible
|
||||
* to observe a request to flush 4095 PFNs starting from 0xfffffffffffff000
|
||||
* which will look identical. KVM's action to 'flush everything' instead of
|
||||
* flushing these particular addresses is, however, fully legitimate as
|
||||
* flushing more than requested is always OK.
|
||||
*/
|
||||
#define KVM_HV_TLB_FLUSHALL_ENTRY ((u64)-1)
|
||||
|
||||
enum hv_tlb_flush_fifos {
|
||||
HV_L1_TLB_FLUSH_FIFO,
|
||||
HV_L2_TLB_FLUSH_FIFO,
|
||||
HV_NR_TLB_FLUSH_FIFOS,
|
||||
};
|
||||
|
||||
struct kvm_vcpu_hv_tlb_flush_fifo {
|
||||
spinlock_t write_lock;
|
||||
DECLARE_KFIFO(entries, u64, KVM_HV_TLB_FLUSH_FIFO_SIZE);
|
||||
};
|
||||
|
||||
/* Hyper-V per vcpu emulation context */
|
||||
struct kvm_vcpu_hv {
|
||||
struct kvm_vcpu *vcpu;
|
||||
@ -623,6 +663,19 @@ struct kvm_vcpu_hv {
|
||||
u32 nested_eax; /* HYPERV_CPUID_NESTED_FEATURES.EAX */
|
||||
u32 nested_ebx; /* HYPERV_CPUID_NESTED_FEATURES.EBX */
|
||||
} cpuid_cache;
|
||||
|
||||
struct kvm_vcpu_hv_tlb_flush_fifo tlb_flush_fifo[HV_NR_TLB_FLUSH_FIFOS];
|
||||
|
||||
/* Preallocated buffer for handling hypercalls passing sparse vCPU set */
|
||||
u64 sparse_banks[HV_MAX_SPARSE_VCPU_BANKS];
|
||||
|
||||
struct hv_vp_assist_page vp_assist_page;
|
||||
|
||||
struct {
|
||||
u64 pa_page_gpa;
|
||||
u64 vm_id;
|
||||
u32 vp_id;
|
||||
} nested;
|
||||
};
|
||||
|
||||
/* Xen HVM per vcpu emulation context */
|
||||
@ -633,6 +686,7 @@ struct kvm_vcpu_xen {
|
||||
struct gfn_to_pfn_cache vcpu_info_cache;
|
||||
struct gfn_to_pfn_cache vcpu_time_info_cache;
|
||||
struct gfn_to_pfn_cache runstate_cache;
|
||||
struct gfn_to_pfn_cache runstate2_cache;
|
||||
u64 last_steal;
|
||||
u64 runstate_entry_time;
|
||||
u64 runstate_times[4];
|
||||
@ -1059,6 +1113,7 @@ struct msr_bitmap_range {
|
||||
struct kvm_xen {
|
||||
u32 xen_version;
|
||||
bool long_mode;
|
||||
bool runstate_update_flag;
|
||||
u8 upcall_vector;
|
||||
struct gfn_to_pfn_cache shinfo_cache;
|
||||
struct idr evtchn_ports;
|
||||
@ -1156,7 +1211,18 @@ struct kvm_arch {
|
||||
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
|
||||
struct list_head active_mmu_pages;
|
||||
struct list_head zapped_obsolete_pages;
|
||||
struct list_head lpage_disallowed_mmu_pages;
|
||||
/*
|
||||
* A list of kvm_mmu_page structs that, if zapped, could possibly be
|
||||
* replaced by an NX huge page. A shadow page is on this list if its
|
||||
* existence disallows an NX huge page (nx_huge_page_disallowed is set)
|
||||
* and there are no other conditions that prevent a huge page, e.g.
|
||||
* the backing host page is huge, dirtly logging is not enabled for its
|
||||
* memslot, etc... Note, zapping shadow pages on this list doesn't
|
||||
* guarantee an NX huge page will be created in its stead, e.g. if the
|
||||
* guest attempts to execute from the region then KVM obviously can't
|
||||
* create an NX huge page (without hanging the guest).
|
||||
*/
|
||||
struct list_head possible_nx_huge_pages;
|
||||
struct kvm_page_track_notifier_node mmu_sp_tracker;
|
||||
struct kvm_page_track_notifier_head track_notifier_head;
|
||||
/*
|
||||
@ -1272,7 +1338,7 @@ struct kvm_arch {
|
||||
bool sgx_provisioning_allowed;
|
||||
|
||||
struct kvm_pmu_event_filter __rcu *pmu_event_filter;
|
||||
struct task_struct *nx_lpage_recovery_thread;
|
||||
struct task_struct *nx_huge_page_recovery_thread;
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
/*
|
||||
@ -1284,6 +1350,9 @@ struct kvm_arch {
|
||||
*/
|
||||
bool tdp_mmu_enabled;
|
||||
|
||||
/* The number of TDP MMU pages across all roots. */
|
||||
atomic64_t tdp_mmu_pages;
|
||||
|
||||
/*
|
||||
* List of kvm_mmu_page structs being used as roots.
|
||||
* All kvm_mmu_page structs in the list should have
|
||||
@ -1304,21 +1373,13 @@ struct kvm_arch {
|
||||
*/
|
||||
struct list_head tdp_mmu_roots;
|
||||
|
||||
/*
|
||||
* List of kvm_mmu_page structs not being used as roots.
|
||||
* All kvm_mmu_page structs in the list should have
|
||||
* tdp_mmu_page set and a tdp_mmu_root_count of 0.
|
||||
*/
|
||||
struct list_head tdp_mmu_pages;
|
||||
|
||||
/*
|
||||
* Protects accesses to the following fields when the MMU lock
|
||||
* is held in read mode:
|
||||
* - tdp_mmu_roots (above)
|
||||
* - tdp_mmu_pages (above)
|
||||
* - the link field of kvm_mmu_page structs used by the TDP MMU
|
||||
* - lpage_disallowed_mmu_pages
|
||||
* - the lpage_disallowed_link field of kvm_mmu_page structs used
|
||||
* - possible_nx_huge_pages;
|
||||
* - the possible_nx_huge_page_link field of kvm_mmu_page structs used
|
||||
* by the TDP MMU
|
||||
* It is acceptable, but not necessary, to acquire this lock when
|
||||
* the thread holds the MMU lock in write mode.
|
||||
@ -1612,10 +1673,12 @@ struct kvm_x86_ops {
|
||||
|
||||
void (*setup_mce)(struct kvm_vcpu *vcpu);
|
||||
|
||||
#ifdef CONFIG_KVM_SMM
|
||||
int (*smi_allowed)(struct kvm_vcpu *vcpu, bool for_injection);
|
||||
int (*enter_smm)(struct kvm_vcpu *vcpu, char *smstate);
|
||||
int (*leave_smm)(struct kvm_vcpu *vcpu, const char *smstate);
|
||||
int (*enter_smm)(struct kvm_vcpu *vcpu, union kvm_smram *smram);
|
||||
int (*leave_smm)(struct kvm_vcpu *vcpu, const union kvm_smram *smram);
|
||||
void (*enable_smi_window)(struct kvm_vcpu *vcpu);
|
||||
#endif
|
||||
|
||||
int (*mem_enc_ioctl)(struct kvm *kvm, void __user *argp);
|
||||
int (*mem_enc_register_region)(struct kvm *kvm, struct kvm_enc_region *argp);
|
||||
@ -1630,7 +1693,7 @@ struct kvm_x86_ops {
|
||||
void *insn, int insn_len);
|
||||
|
||||
bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu);
|
||||
int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu);
|
||||
int (*enable_l2_tlb_flush)(struct kvm_vcpu *vcpu);
|
||||
|
||||
void (*migrate_timers)(struct kvm_vcpu *vcpu);
|
||||
void (*msr_filter_changed)(struct kvm_vcpu *vcpu);
|
||||
@ -1663,6 +1726,7 @@ struct kvm_x86_nested_ops {
|
||||
int (*enable_evmcs)(struct kvm_vcpu *vcpu,
|
||||
uint16_t *vmcs_version);
|
||||
uint16_t (*get_evmcs_version)(struct kvm_vcpu *vcpu);
|
||||
void (*hv_inject_synthetic_vmexit_post_tlb_flush)(struct kvm_vcpu *vcpu);
|
||||
};
|
||||
|
||||
struct kvm_x86_init_ops {
|
||||
@ -1844,6 +1908,7 @@ int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu);
|
||||
int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
|
||||
|
||||
void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
|
||||
void kvm_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
|
||||
int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
|
||||
void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
|
||||
|
||||
@ -1909,8 +1974,6 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
|
||||
void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu);
|
||||
gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
|
||||
struct x86_exception *exception);
|
||||
gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
|
||||
struct x86_exception *exception);
|
||||
gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
|
||||
struct x86_exception *exception);
|
||||
gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
|
||||
@ -1994,14 +2057,18 @@ enum {
|
||||
#define HF_NMI_MASK (1 << 3)
|
||||
#define HF_IRET_MASK (1 << 4)
|
||||
#define HF_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */
|
||||
|
||||
#ifdef CONFIG_KVM_SMM
|
||||
#define HF_SMM_MASK (1 << 6)
|
||||
#define HF_SMM_INSIDE_NMI_MASK (1 << 7)
|
||||
|
||||
#define __KVM_VCPU_MULTIPLE_ADDRESS_SPACE
|
||||
#define KVM_ADDRESS_SPACE_NUM 2
|
||||
|
||||
#define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0)
|
||||
#define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm)
|
||||
# define __KVM_VCPU_MULTIPLE_ADDRESS_SPACE
|
||||
# define KVM_ADDRESS_SPACE_NUM 2
|
||||
# define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0)
|
||||
# define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm)
|
||||
#else
|
||||
# define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, 0)
|
||||
#endif
|
||||
|
||||
#define KVM_ARCH_WANT_MMU_NOTIFIER
|
||||
|
||||
@ -2089,14 +2156,6 @@ static inline int kvm_cpu_get_apicid(int mps_cpu)
|
||||
#endif
|
||||
}
|
||||
|
||||
#define put_smstate(type, buf, offset, val) \
|
||||
*(type *)((buf) + (offset) - 0x7e00) = val
|
||||
|
||||
#define GET_SMSTATE(type, buf, offset) \
|
||||
(*(type *)((buf) + (offset) - 0x7e00))
|
||||
|
||||
int kvm_cpu_dirty_log_size(void);
|
||||
|
||||
int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
|
||||
|
||||
#define KVM_CLOCK_VALID_FLAGS \
|
||||
|
@ -543,12 +543,12 @@ static inline void perf_check_microcode(void) { }
|
||||
|
||||
#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
|
||||
extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data);
|
||||
extern int x86_perf_get_lbr(struct x86_pmu_lbr *lbr);
|
||||
extern void x86_perf_get_lbr(struct x86_pmu_lbr *lbr);
|
||||
#else
|
||||
struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data);
|
||||
static inline int x86_perf_get_lbr(struct x86_pmu_lbr *lbr)
|
||||
static inline void x86_perf_get_lbr(struct x86_pmu_lbr *lbr)
|
||||
{
|
||||
return -1;
|
||||
memset(lbr, 0, sizeof(*lbr));
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -5,6 +5,8 @@
|
||||
#include <uapi/asm/svm.h>
|
||||
#include <uapi/asm/kvm.h>
|
||||
|
||||
#include <asm/hyperv-tlfs.h>
|
||||
|
||||
/*
|
||||
* 32-bit intercept words in the VMCB Control Area, starting
|
||||
* at Byte offset 000h.
|
||||
@ -161,7 +163,10 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
|
||||
* Offset 0x3e0, 32 bytes reserved
|
||||
* for use by hypervisor/software.
|
||||
*/
|
||||
u8 reserved_sw[32];
|
||||
union {
|
||||
struct hv_vmcb_enlightenments hv_enlightenments;
|
||||
u8 reserved_sw[32];
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
@ -293,12 +298,13 @@ struct vmcb_save_area {
|
||||
struct vmcb_seg ldtr;
|
||||
struct vmcb_seg idtr;
|
||||
struct vmcb_seg tr;
|
||||
u8 reserved_1[42];
|
||||
/* Reserved fields are named following their struct offset */
|
||||
u8 reserved_0xa0[42];
|
||||
u8 vmpl;
|
||||
u8 cpl;
|
||||
u8 reserved_2[4];
|
||||
u8 reserved_0xcc[4];
|
||||
u64 efer;
|
||||
u8 reserved_3[112];
|
||||
u8 reserved_0xd8[112];
|
||||
u64 cr4;
|
||||
u64 cr3;
|
||||
u64 cr0;
|
||||
@ -306,7 +312,7 @@ struct vmcb_save_area {
|
||||
u64 dr6;
|
||||
u64 rflags;
|
||||
u64 rip;
|
||||
u8 reserved_4[88];
|
||||
u8 reserved_0x180[88];
|
||||
u64 rsp;
|
||||
u64 s_cet;
|
||||
u64 ssp;
|
||||
@ -321,14 +327,14 @@ struct vmcb_save_area {
|
||||
u64 sysenter_esp;
|
||||
u64 sysenter_eip;
|
||||
u64 cr2;
|
||||
u8 reserved_5[32];
|
||||
u8 reserved_0x248[32];
|
||||
u64 g_pat;
|
||||
u64 dbgctl;
|
||||
u64 br_from;
|
||||
u64 br_to;
|
||||
u64 last_excp_from;
|
||||
u64 last_excp_to;
|
||||
u8 reserved_6[72];
|
||||
u8 reserved_0x298[72];
|
||||
u32 spec_ctrl; /* Guest version of SPEC_CTRL at 0x2E0 */
|
||||
} __packed;
|
||||
|
||||
@ -349,12 +355,12 @@ struct sev_es_save_area {
|
||||
u64 vmpl2_ssp;
|
||||
u64 vmpl3_ssp;
|
||||
u64 u_cet;
|
||||
u8 reserved_1[2];
|
||||
u8 reserved_0xc8[2];
|
||||
u8 vmpl;
|
||||
u8 cpl;
|
||||
u8 reserved_2[4];
|
||||
u8 reserved_0xcc[4];
|
||||
u64 efer;
|
||||
u8 reserved_3[104];
|
||||
u8 reserved_0xd8[104];
|
||||
u64 xss;
|
||||
u64 cr4;
|
||||
u64 cr3;
|
||||
@ -371,7 +377,7 @@ struct sev_es_save_area {
|
||||
u64 dr1_addr_mask;
|
||||
u64 dr2_addr_mask;
|
||||
u64 dr3_addr_mask;
|
||||
u8 reserved_4[24];
|
||||
u8 reserved_0x1c0[24];
|
||||
u64 rsp;
|
||||
u64 s_cet;
|
||||
u64 ssp;
|
||||
@ -386,21 +392,21 @@ struct sev_es_save_area {
|
||||
u64 sysenter_esp;
|
||||
u64 sysenter_eip;
|
||||
u64 cr2;
|
||||
u8 reserved_5[32];
|
||||
u8 reserved_0x248[32];
|
||||
u64 g_pat;
|
||||
u64 dbgctl;
|
||||
u64 br_from;
|
||||
u64 br_to;
|
||||
u64 last_excp_from;
|
||||
u64 last_excp_to;
|
||||
u8 reserved_7[80];
|
||||
u8 reserved_0x298[80];
|
||||
u32 pkru;
|
||||
u8 reserved_8[20];
|
||||
u64 reserved_9; /* rax already available at 0x01f8 */
|
||||
u32 tsc_aux;
|
||||
u8 reserved_0x2f0[24];
|
||||
u64 rcx;
|
||||
u64 rdx;
|
||||
u64 rbx;
|
||||
u64 reserved_10; /* rsp already available at 0x01d8 */
|
||||
u64 reserved_0x320; /* rsp already available at 0x01d8 */
|
||||
u64 rbp;
|
||||
u64 rsi;
|
||||
u64 rdi;
|
||||
@ -412,7 +418,7 @@ struct sev_es_save_area {
|
||||
u64 r13;
|
||||
u64 r14;
|
||||
u64 r15;
|
||||
u8 reserved_11[16];
|
||||
u8 reserved_0x380[16];
|
||||
u64 guest_exit_info_1;
|
||||
u64 guest_exit_info_2;
|
||||
u64 guest_exit_int_info;
|
||||
@ -425,7 +431,7 @@ struct sev_es_save_area {
|
||||
u64 pcpu_id;
|
||||
u64 event_inj;
|
||||
u64 xcr0;
|
||||
u8 reserved_12[16];
|
||||
u8 reserved_0x3f0[16];
|
||||
|
||||
/* Floating point area */
|
||||
u64 x87_dp;
|
||||
@ -443,23 +449,23 @@ struct sev_es_save_area {
|
||||
} __packed;
|
||||
|
||||
struct ghcb_save_area {
|
||||
u8 reserved_1[203];
|
||||
u8 reserved_0x0[203];
|
||||
u8 cpl;
|
||||
u8 reserved_2[116];
|
||||
u8 reserved_0xcc[116];
|
||||
u64 xss;
|
||||
u8 reserved_3[24];
|
||||
u8 reserved_0x148[24];
|
||||
u64 dr7;
|
||||
u8 reserved_4[16];
|
||||
u8 reserved_0x168[16];
|
||||
u64 rip;
|
||||
u8 reserved_5[88];
|
||||
u8 reserved_0x180[88];
|
||||
u64 rsp;
|
||||
u8 reserved_6[24];
|
||||
u8 reserved_0x1e0[24];
|
||||
u64 rax;
|
||||
u8 reserved_7[264];
|
||||
u8 reserved_0x200[264];
|
||||
u64 rcx;
|
||||
u64 rdx;
|
||||
u64 rbx;
|
||||
u8 reserved_8[8];
|
||||
u8 reserved_0x320[8];
|
||||
u64 rbp;
|
||||
u64 rsi;
|
||||
u64 rdi;
|
||||
@ -471,12 +477,12 @@ struct ghcb_save_area {
|
||||
u64 r13;
|
||||
u64 r14;
|
||||
u64 r15;
|
||||
u8 reserved_9[16];
|
||||
u8 reserved_0x380[16];
|
||||
u64 sw_exit_code;
|
||||
u64 sw_exit_info_1;
|
||||
u64 sw_exit_info_2;
|
||||
u64 sw_scratch;
|
||||
u8 reserved_10[56];
|
||||
u8 reserved_0x3b0[56];
|
||||
u64 xcr0;
|
||||
u8 valid_bitmap[16];
|
||||
u64 x87_state_gpa;
|
||||
@ -490,7 +496,7 @@ struct ghcb {
|
||||
|
||||
u8 shared_buffer[GHCB_SHARED_BUF_SIZE];
|
||||
|
||||
u8 reserved_1[10];
|
||||
u8 reserved_0xff0[10];
|
||||
u16 protocol_version; /* negotiated SEV-ES/GHCB protocol version */
|
||||
u32 ghcb_usage;
|
||||
} __packed;
|
||||
@ -502,6 +508,9 @@ struct ghcb {
|
||||
#define EXPECTED_VMCB_CONTROL_AREA_SIZE 1024
|
||||
#define EXPECTED_GHCB_SIZE PAGE_SIZE
|
||||
|
||||
#define BUILD_BUG_RESERVED_OFFSET(x, y) \
|
||||
ASSERT_STRUCT_OFFSET(struct x, reserved ## _ ## y, y)
|
||||
|
||||
static inline void __unused_size_checks(void)
|
||||
{
|
||||
BUILD_BUG_ON(sizeof(struct vmcb_save_area) != EXPECTED_VMCB_SAVE_AREA_SIZE);
|
||||
@ -509,6 +518,39 @@ static inline void __unused_size_checks(void)
|
||||
BUILD_BUG_ON(sizeof(struct sev_es_save_area) != EXPECTED_SEV_ES_SAVE_AREA_SIZE);
|
||||
BUILD_BUG_ON(sizeof(struct vmcb_control_area) != EXPECTED_VMCB_CONTROL_AREA_SIZE);
|
||||
BUILD_BUG_ON(sizeof(struct ghcb) != EXPECTED_GHCB_SIZE);
|
||||
|
||||
/* Check offsets of reserved fields */
|
||||
|
||||
BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0xa0);
|
||||
BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0xcc);
|
||||
BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0xd8);
|
||||
BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0x180);
|
||||
BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0x248);
|
||||
BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0x298);
|
||||
|
||||
BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0xc8);
|
||||
BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0xcc);
|
||||
BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0xd8);
|
||||
BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x1c0);
|
||||
BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x248);
|
||||
BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x298);
|
||||
BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x2f0);
|
||||
BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x320);
|
||||
BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x380);
|
||||
BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x3f0);
|
||||
|
||||
BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x0);
|
||||
BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0xcc);
|
||||
BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x148);
|
||||
BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x168);
|
||||
BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x180);
|
||||
BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x1e0);
|
||||
BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x200);
|
||||
BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x320);
|
||||
BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x380);
|
||||
BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x3b0);
|
||||
|
||||
BUILD_BUG_RESERVED_OFFSET(ghcb, 0xff0);
|
||||
}
|
||||
|
||||
struct vmcb {
|
||||
|
@ -53,14 +53,6 @@
|
||||
/* Architectural interrupt line count. */
|
||||
#define KVM_NR_INTERRUPTS 256
|
||||
|
||||
struct kvm_memory_alias {
|
||||
__u32 slot; /* this has a different namespace than memory slots */
|
||||
__u32 flags;
|
||||
__u64 guest_phys_addr;
|
||||
__u64 memory_size;
|
||||
__u64 target_phys_addr;
|
||||
};
|
||||
|
||||
/* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */
|
||||
struct kvm_pic_state {
|
||||
__u8 last_irr; /* edge detection */
|
||||
@ -214,6 +206,8 @@ struct kvm_msr_list {
|
||||
struct kvm_msr_filter_range {
|
||||
#define KVM_MSR_FILTER_READ (1 << 0)
|
||||
#define KVM_MSR_FILTER_WRITE (1 << 1)
|
||||
#define KVM_MSR_FILTER_RANGE_VALID_MASK (KVM_MSR_FILTER_READ | \
|
||||
KVM_MSR_FILTER_WRITE)
|
||||
__u32 flags;
|
||||
__u32 nmsrs; /* number of msrs in bitmap */
|
||||
__u32 base; /* MSR index the bitmap starts at */
|
||||
@ -222,8 +216,11 @@ struct kvm_msr_filter_range {
|
||||
|
||||
#define KVM_MSR_FILTER_MAX_RANGES 16
|
||||
struct kvm_msr_filter {
|
||||
#ifndef __KERNEL__
|
||||
#define KVM_MSR_FILTER_DEFAULT_ALLOW (0 << 0)
|
||||
#endif
|
||||
#define KVM_MSR_FILTER_DEFAULT_DENY (1 << 0)
|
||||
#define KVM_MSR_FILTER_VALID_MASK (KVM_MSR_FILTER_DEFAULT_DENY)
|
||||
__u32 flags;
|
||||
struct kvm_msr_filter_range ranges[KVM_MSR_FILTER_MAX_RANGES];
|
||||
};
|
||||
|
@ -349,7 +349,7 @@ static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val)
|
||||
static void kvm_guest_cpu_init(void)
|
||||
{
|
||||
if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) {
|
||||
u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
|
||||
u64 pa;
|
||||
|
||||
WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled));
|
||||
|
||||
|
@ -118,6 +118,17 @@ config KVM_AMD_SEV
|
||||
Provides support for launching Encrypted VMs (SEV) and Encrypted VMs
|
||||
with Encrypted State (SEV-ES) on AMD processors.
|
||||
|
||||
config KVM_SMM
|
||||
bool "System Management Mode emulation"
|
||||
default y
|
||||
depends on KVM
|
||||
help
|
||||
Provides support for KVM to emulate System Management Mode (SMM)
|
||||
in virtual machines. This can be used by the virtual machine
|
||||
firmware to implement UEFI secure boot.
|
||||
|
||||
If unsure, say Y.
|
||||
|
||||
config KVM_XEN
|
||||
bool "Support for Xen hypercall interface"
|
||||
depends on KVM
|
||||
|
@ -20,12 +20,14 @@ endif
|
||||
|
||||
kvm-$(CONFIG_X86_64) += mmu/tdp_iter.o mmu/tdp_mmu.o
|
||||
kvm-$(CONFIG_KVM_XEN) += xen.o
|
||||
kvm-$(CONFIG_KVM_SMM) += smm.o
|
||||
|
||||
kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
|
||||
vmx/evmcs.o vmx/nested.o vmx/posted_intr.o
|
||||
vmx/hyperv.o vmx/nested.o vmx/posted_intr.o
|
||||
kvm-intel-$(CONFIG_X86_SGX_KVM) += vmx/sgx.o
|
||||
|
||||
kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o svm/sev.o
|
||||
kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o \
|
||||
svm/sev.o svm/hyperv.o
|
||||
|
||||
ifdef CONFIG_HYPERV
|
||||
kvm-amd-y += svm/svm_onhyperv.o
|
||||
|
@ -62,10 +62,16 @@ u32 xstate_required_size(u64 xstate_bv, bool compacted)
|
||||
* This one is tied to SSB in the user API, and not
|
||||
* visible in /proc/cpuinfo.
|
||||
*/
|
||||
#define KVM_X86_FEATURE_PSFD (13*32+28) /* Predictive Store Forwarding Disable */
|
||||
#define KVM_X86_FEATURE_AMD_PSFD (13*32+28) /* Predictive Store Forwarding Disable */
|
||||
|
||||
#define F feature_bit
|
||||
#define SF(name) (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0)
|
||||
|
||||
/* Scattered Flag - For features that are scattered by cpufeatures.h. */
|
||||
#define SF(name) \
|
||||
({ \
|
||||
BUILD_BUG_ON(X86_FEATURE_##name >= MAX_CPU_FEATURES); \
|
||||
(boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0); \
|
||||
})
|
||||
|
||||
/*
|
||||
* Magic value used by KVM when querying userspace-provided CPUID entries and
|
||||
@ -543,9 +549,9 @@ static __always_inline void __kvm_cpu_cap_mask(unsigned int leaf)
|
||||
}
|
||||
|
||||
static __always_inline
|
||||
void kvm_cpu_cap_init_scattered(enum kvm_only_cpuid_leafs leaf, u32 mask)
|
||||
void kvm_cpu_cap_init_kvm_defined(enum kvm_only_cpuid_leafs leaf, u32 mask)
|
||||
{
|
||||
/* Use kvm_cpu_cap_mask for non-scattered leafs. */
|
||||
/* Use kvm_cpu_cap_mask for leafs that aren't KVM-only. */
|
||||
BUILD_BUG_ON(leaf < NCAPINTS);
|
||||
|
||||
kvm_cpu_caps[leaf] = mask;
|
||||
@ -555,7 +561,7 @@ void kvm_cpu_cap_init_scattered(enum kvm_only_cpuid_leafs leaf, u32 mask)
|
||||
|
||||
static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask)
|
||||
{
|
||||
/* Use kvm_cpu_cap_init_scattered for scattered leafs. */
|
||||
/* Use kvm_cpu_cap_init_kvm_defined for KVM-only leafs. */
|
||||
BUILD_BUG_ON(leaf >= NCAPINTS);
|
||||
|
||||
kvm_cpu_caps[leaf] &= mask;
|
||||
@ -657,14 +663,19 @@ void kvm_set_cpu_caps(void)
|
||||
kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD);
|
||||
|
||||
kvm_cpu_cap_mask(CPUID_7_1_EAX,
|
||||
F(AVX_VNNI) | F(AVX512_BF16)
|
||||
F(AVX_VNNI) | F(AVX512_BF16) | F(CMPCCXADD) | F(AMX_FP16) |
|
||||
F(AVX_IFMA)
|
||||
);
|
||||
|
||||
kvm_cpu_cap_init_kvm_defined(CPUID_7_1_EDX,
|
||||
F(AVX_VNNI_INT8) | F(AVX_NE_CONVERT) | F(PREFETCHITI)
|
||||
);
|
||||
|
||||
kvm_cpu_cap_mask(CPUID_D_1_EAX,
|
||||
F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES) | f_xfd
|
||||
);
|
||||
|
||||
kvm_cpu_cap_init_scattered(CPUID_12_EAX,
|
||||
kvm_cpu_cap_init_kvm_defined(CPUID_12_EAX,
|
||||
SF(SGX1) | SF(SGX2) | SF(SGX_EDECCSSA)
|
||||
);
|
||||
|
||||
@ -694,7 +705,7 @@ void kvm_set_cpu_caps(void)
|
||||
F(CLZERO) | F(XSAVEERPTR) |
|
||||
F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) |
|
||||
F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON) |
|
||||
__feature_bit(KVM_X86_FEATURE_PSFD)
|
||||
__feature_bit(KVM_X86_FEATURE_AMD_PSFD)
|
||||
);
|
||||
|
||||
/*
|
||||
@ -913,9 +924,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
|
||||
goto out;
|
||||
|
||||
cpuid_entry_override(entry, CPUID_7_1_EAX);
|
||||
cpuid_entry_override(entry, CPUID_7_1_EDX);
|
||||
entry->ebx = 0;
|
||||
entry->ecx = 0;
|
||||
entry->edx = 0;
|
||||
}
|
||||
break;
|
||||
case 0xa: { /* Architectural Performance Monitoring */
|
||||
@ -1220,8 +1231,12 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
|
||||
* Other defined bits are for MSRs that KVM does not expose:
|
||||
* EAX 3 SPCL, SMM page configuration lock
|
||||
* EAX 13 PCMSR, Prefetch control MSR
|
||||
*
|
||||
* KVM doesn't support SMM_CTL.
|
||||
* EAX 9 SMM_CTL MSR is not supported
|
||||
*/
|
||||
entry->eax &= BIT(0) | BIT(2) | BIT(6);
|
||||
entry->eax |= BIT(9);
|
||||
if (static_cpu_has(X86_FEATURE_LFENCE_RDTSC))
|
||||
entry->eax |= BIT(2);
|
||||
if (!static_cpu_has_bug(X86_BUG_NULL_SEG))
|
||||
|
@ -242,37 +242,6 @@ enum x86_transfer_type {
|
||||
X86_TRANSFER_TASK_SWITCH,
|
||||
};
|
||||
|
||||
static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr)
|
||||
{
|
||||
if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt))
|
||||
nr &= NR_EMULATOR_GPRS - 1;
|
||||
|
||||
if (!(ctxt->regs_valid & (1 << nr))) {
|
||||
ctxt->regs_valid |= 1 << nr;
|
||||
ctxt->_regs[nr] = ctxt->ops->read_gpr(ctxt, nr);
|
||||
}
|
||||
return ctxt->_regs[nr];
|
||||
}
|
||||
|
||||
static ulong *reg_write(struct x86_emulate_ctxt *ctxt, unsigned nr)
|
||||
{
|
||||
if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt))
|
||||
nr &= NR_EMULATOR_GPRS - 1;
|
||||
|
||||
BUILD_BUG_ON(sizeof(ctxt->regs_dirty) * BITS_PER_BYTE < NR_EMULATOR_GPRS);
|
||||
BUILD_BUG_ON(sizeof(ctxt->regs_valid) * BITS_PER_BYTE < NR_EMULATOR_GPRS);
|
||||
|
||||
ctxt->regs_valid |= 1 << nr;
|
||||
ctxt->regs_dirty |= 1 << nr;
|
||||
return &ctxt->_regs[nr];
|
||||
}
|
||||
|
||||
static ulong *reg_rmw(struct x86_emulate_ctxt *ctxt, unsigned nr)
|
||||
{
|
||||
reg_read(ctxt, nr);
|
||||
return reg_write(ctxt, nr);
|
||||
}
|
||||
|
||||
static void writeback_registers(struct x86_emulate_ctxt *ctxt)
|
||||
{
|
||||
unsigned long dirty = ctxt->regs_dirty;
|
||||
@ -2338,335 +2307,15 @@ static int em_lseg(struct x86_emulate_ctxt *ctxt)
|
||||
return rc;
|
||||
}
|
||||
|
||||
static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt)
|
||||
{
|
||||
#ifdef CONFIG_X86_64
|
||||
return ctxt->ops->guest_has_long_mode(ctxt);
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
static void rsm_set_desc_flags(struct desc_struct *desc, u32 flags)
|
||||
{
|
||||
desc->g = (flags >> 23) & 1;
|
||||
desc->d = (flags >> 22) & 1;
|
||||
desc->l = (flags >> 21) & 1;
|
||||
desc->avl = (flags >> 20) & 1;
|
||||
desc->p = (flags >> 15) & 1;
|
||||
desc->dpl = (flags >> 13) & 3;
|
||||
desc->s = (flags >> 12) & 1;
|
||||
desc->type = (flags >> 8) & 15;
|
||||
}
|
||||
|
||||
static int rsm_load_seg_32(struct x86_emulate_ctxt *ctxt, const char *smstate,
|
||||
int n)
|
||||
{
|
||||
struct desc_struct desc;
|
||||
int offset;
|
||||
u16 selector;
|
||||
|
||||
selector = GET_SMSTATE(u32, smstate, 0x7fa8 + n * 4);
|
||||
|
||||
if (n < 3)
|
||||
offset = 0x7f84 + n * 12;
|
||||
else
|
||||
offset = 0x7f2c + (n - 3) * 12;
|
||||
|
||||
set_desc_base(&desc, GET_SMSTATE(u32, smstate, offset + 8));
|
||||
set_desc_limit(&desc, GET_SMSTATE(u32, smstate, offset + 4));
|
||||
rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, offset));
|
||||
ctxt->ops->set_segment(ctxt, selector, &desc, 0, n);
|
||||
return X86EMUL_CONTINUE;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, const char *smstate,
|
||||
int n)
|
||||
{
|
||||
struct desc_struct desc;
|
||||
int offset;
|
||||
u16 selector;
|
||||
u32 base3;
|
||||
|
||||
offset = 0x7e00 + n * 16;
|
||||
|
||||
selector = GET_SMSTATE(u16, smstate, offset);
|
||||
rsm_set_desc_flags(&desc, GET_SMSTATE(u16, smstate, offset + 2) << 8);
|
||||
set_desc_limit(&desc, GET_SMSTATE(u32, smstate, offset + 4));
|
||||
set_desc_base(&desc, GET_SMSTATE(u32, smstate, offset + 8));
|
||||
base3 = GET_SMSTATE(u32, smstate, offset + 12);
|
||||
|
||||
ctxt->ops->set_segment(ctxt, selector, &desc, base3, n);
|
||||
return X86EMUL_CONTINUE;
|
||||
}
|
||||
#endif
|
||||
|
||||
static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,
|
||||
u64 cr0, u64 cr3, u64 cr4)
|
||||
{
|
||||
int bad;
|
||||
u64 pcid;
|
||||
|
||||
/* In order to later set CR4.PCIDE, CR3[11:0] must be zero. */
|
||||
pcid = 0;
|
||||
if (cr4 & X86_CR4_PCIDE) {
|
||||
pcid = cr3 & 0xfff;
|
||||
cr3 &= ~0xfff;
|
||||
}
|
||||
|
||||
bad = ctxt->ops->set_cr(ctxt, 3, cr3);
|
||||
if (bad)
|
||||
return X86EMUL_UNHANDLEABLE;
|
||||
|
||||
/*
|
||||
* First enable PAE, long mode needs it before CR0.PG = 1 is set.
|
||||
* Then enable protected mode. However, PCID cannot be enabled
|
||||
* if EFER.LMA=0, so set it separately.
|
||||
*/
|
||||
bad = ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE);
|
||||
if (bad)
|
||||
return X86EMUL_UNHANDLEABLE;
|
||||
|
||||
bad = ctxt->ops->set_cr(ctxt, 0, cr0);
|
||||
if (bad)
|
||||
return X86EMUL_UNHANDLEABLE;
|
||||
|
||||
if (cr4 & X86_CR4_PCIDE) {
|
||||
bad = ctxt->ops->set_cr(ctxt, 4, cr4);
|
||||
if (bad)
|
||||
return X86EMUL_UNHANDLEABLE;
|
||||
if (pcid) {
|
||||
bad = ctxt->ops->set_cr(ctxt, 3, cr3 | pcid);
|
||||
if (bad)
|
||||
return X86EMUL_UNHANDLEABLE;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return X86EMUL_CONTINUE;
|
||||
}
|
||||
|
||||
static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt,
|
||||
const char *smstate)
|
||||
{
|
||||
struct desc_struct desc;
|
||||
struct desc_ptr dt;
|
||||
u16 selector;
|
||||
u32 val, cr0, cr3, cr4;
|
||||
int i;
|
||||
|
||||
cr0 = GET_SMSTATE(u32, smstate, 0x7ffc);
|
||||
cr3 = GET_SMSTATE(u32, smstate, 0x7ff8);
|
||||
ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7ff4) | X86_EFLAGS_FIXED;
|
||||
ctxt->_eip = GET_SMSTATE(u32, smstate, 0x7ff0);
|
||||
|
||||
for (i = 0; i < 8; i++)
|
||||
*reg_write(ctxt, i) = GET_SMSTATE(u32, smstate, 0x7fd0 + i * 4);
|
||||
|
||||
val = GET_SMSTATE(u32, smstate, 0x7fcc);
|
||||
|
||||
if (ctxt->ops->set_dr(ctxt, 6, val))
|
||||
return X86EMUL_UNHANDLEABLE;
|
||||
|
||||
val = GET_SMSTATE(u32, smstate, 0x7fc8);
|
||||
|
||||
if (ctxt->ops->set_dr(ctxt, 7, val))
|
||||
return X86EMUL_UNHANDLEABLE;
|
||||
|
||||
selector = GET_SMSTATE(u32, smstate, 0x7fc4);
|
||||
set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7f64));
|
||||
set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7f60));
|
||||
rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7f5c));
|
||||
ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_TR);
|
||||
|
||||
selector = GET_SMSTATE(u32, smstate, 0x7fc0);
|
||||
set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7f80));
|
||||
set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7f7c));
|
||||
rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7f78));
|
||||
ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_LDTR);
|
||||
|
||||
dt.address = GET_SMSTATE(u32, smstate, 0x7f74);
|
||||
dt.size = GET_SMSTATE(u32, smstate, 0x7f70);
|
||||
ctxt->ops->set_gdt(ctxt, &dt);
|
||||
|
||||
dt.address = GET_SMSTATE(u32, smstate, 0x7f58);
|
||||
dt.size = GET_SMSTATE(u32, smstate, 0x7f54);
|
||||
ctxt->ops->set_idt(ctxt, &dt);
|
||||
|
||||
for (i = 0; i < 6; i++) {
|
||||
int r = rsm_load_seg_32(ctxt, smstate, i);
|
||||
if (r != X86EMUL_CONTINUE)
|
||||
return r;
|
||||
}
|
||||
|
||||
cr4 = GET_SMSTATE(u32, smstate, 0x7f14);
|
||||
|
||||
ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smstate, 0x7ef8));
|
||||
|
||||
return rsm_enter_protected_mode(ctxt, cr0, cr3, cr4);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt,
|
||||
const char *smstate)
|
||||
{
|
||||
struct desc_struct desc;
|
||||
struct desc_ptr dt;
|
||||
u64 val, cr0, cr3, cr4;
|
||||
u32 base3;
|
||||
u16 selector;
|
||||
int i, r;
|
||||
|
||||
for (i = 0; i < 16; i++)
|
||||
*reg_write(ctxt, i) = GET_SMSTATE(u64, smstate, 0x7ff8 - i * 8);
|
||||
|
||||
ctxt->_eip = GET_SMSTATE(u64, smstate, 0x7f78);
|
||||
ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7f70) | X86_EFLAGS_FIXED;
|
||||
|
||||
val = GET_SMSTATE(u64, smstate, 0x7f68);
|
||||
|
||||
if (ctxt->ops->set_dr(ctxt, 6, val))
|
||||
return X86EMUL_UNHANDLEABLE;
|
||||
|
||||
val = GET_SMSTATE(u64, smstate, 0x7f60);
|
||||
|
||||
if (ctxt->ops->set_dr(ctxt, 7, val))
|
||||
return X86EMUL_UNHANDLEABLE;
|
||||
|
||||
cr0 = GET_SMSTATE(u64, smstate, 0x7f58);
|
||||
cr3 = GET_SMSTATE(u64, smstate, 0x7f50);
|
||||
cr4 = GET_SMSTATE(u64, smstate, 0x7f48);
|
||||
ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smstate, 0x7f00));
|
||||
val = GET_SMSTATE(u64, smstate, 0x7ed0);
|
||||
|
||||
if (ctxt->ops->set_msr(ctxt, MSR_EFER, val & ~EFER_LMA))
|
||||
return X86EMUL_UNHANDLEABLE;
|
||||
|
||||
selector = GET_SMSTATE(u32, smstate, 0x7e90);
|
||||
rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7e92) << 8);
|
||||
set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7e94));
|
||||
set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7e98));
|
||||
base3 = GET_SMSTATE(u32, smstate, 0x7e9c);
|
||||
ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_TR);
|
||||
|
||||
dt.size = GET_SMSTATE(u32, smstate, 0x7e84);
|
||||
dt.address = GET_SMSTATE(u64, smstate, 0x7e88);
|
||||
ctxt->ops->set_idt(ctxt, &dt);
|
||||
|
||||
selector = GET_SMSTATE(u32, smstate, 0x7e70);
|
||||
rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7e72) << 8);
|
||||
set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7e74));
|
||||
set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7e78));
|
||||
base3 = GET_SMSTATE(u32, smstate, 0x7e7c);
|
||||
ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_LDTR);
|
||||
|
||||
dt.size = GET_SMSTATE(u32, smstate, 0x7e64);
|
||||
dt.address = GET_SMSTATE(u64, smstate, 0x7e68);
|
||||
ctxt->ops->set_gdt(ctxt, &dt);
|
||||
|
||||
r = rsm_enter_protected_mode(ctxt, cr0, cr3, cr4);
|
||||
if (r != X86EMUL_CONTINUE)
|
||||
return r;
|
||||
|
||||
for (i = 0; i < 6; i++) {
|
||||
r = rsm_load_seg_64(ctxt, smstate, i);
|
||||
if (r != X86EMUL_CONTINUE)
|
||||
return r;
|
||||
}
|
||||
|
||||
return X86EMUL_CONTINUE;
|
||||
}
|
||||
#endif
|
||||
|
||||
static int em_rsm(struct x86_emulate_ctxt *ctxt)
|
||||
{
|
||||
unsigned long cr0, cr4, efer;
|
||||
char buf[512];
|
||||
u64 smbase;
|
||||
int ret;
|
||||
|
||||
if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_MASK) == 0)
|
||||
return emulate_ud(ctxt);
|
||||
|
||||
smbase = ctxt->ops->get_smbase(ctxt);
|
||||
if (ctxt->ops->leave_smm(ctxt))
|
||||
ctxt->ops->triple_fault(ctxt);
|
||||
|
||||
ret = ctxt->ops->read_phys(ctxt, smbase + 0xfe00, buf, sizeof(buf));
|
||||
if (ret != X86EMUL_CONTINUE)
|
||||
return X86EMUL_UNHANDLEABLE;
|
||||
|
||||
if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_INSIDE_NMI_MASK) == 0)
|
||||
ctxt->ops->set_nmi_mask(ctxt, false);
|
||||
|
||||
ctxt->ops->exiting_smm(ctxt);
|
||||
|
||||
/*
|
||||
* Get back to real mode, to prepare a safe state in which to load
|
||||
* CR0/CR3/CR4/EFER. It's all a bit more complicated if the vCPU
|
||||
* supports long mode.
|
||||
*/
|
||||
if (emulator_has_longmode(ctxt)) {
|
||||
struct desc_struct cs_desc;
|
||||
|
||||
/* Zero CR4.PCIDE before CR0.PG. */
|
||||
cr4 = ctxt->ops->get_cr(ctxt, 4);
|
||||
if (cr4 & X86_CR4_PCIDE)
|
||||
ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE);
|
||||
|
||||
/* A 32-bit code segment is required to clear EFER.LMA. */
|
||||
memset(&cs_desc, 0, sizeof(cs_desc));
|
||||
cs_desc.type = 0xb;
|
||||
cs_desc.s = cs_desc.g = cs_desc.p = 1;
|
||||
ctxt->ops->set_segment(ctxt, 0, &cs_desc, 0, VCPU_SREG_CS);
|
||||
}
|
||||
|
||||
/* For the 64-bit case, this will clear EFER.LMA. */
|
||||
cr0 = ctxt->ops->get_cr(ctxt, 0);
|
||||
if (cr0 & X86_CR0_PE)
|
||||
ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE));
|
||||
|
||||
if (emulator_has_longmode(ctxt)) {
|
||||
/* Clear CR4.PAE before clearing EFER.LME. */
|
||||
cr4 = ctxt->ops->get_cr(ctxt, 4);
|
||||
if (cr4 & X86_CR4_PAE)
|
||||
ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE);
|
||||
|
||||
/* And finally go back to 32-bit mode. */
|
||||
efer = 0;
|
||||
ctxt->ops->set_msr(ctxt, MSR_EFER, efer);
|
||||
}
|
||||
|
||||
/*
|
||||
* Give leave_smm() a chance to make ISA-specific changes to the vCPU
|
||||
* state (e.g. enter guest mode) before loading state from the SMM
|
||||
* state-save area.
|
||||
*/
|
||||
if (ctxt->ops->leave_smm(ctxt, buf))
|
||||
goto emulate_shutdown;
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
if (emulator_has_longmode(ctxt))
|
||||
ret = rsm_load_state_64(ctxt, buf);
|
||||
else
|
||||
#endif
|
||||
ret = rsm_load_state_32(ctxt, buf);
|
||||
|
||||
if (ret != X86EMUL_CONTINUE)
|
||||
goto emulate_shutdown;
|
||||
|
||||
/*
|
||||
* Note, the ctxt->ops callbacks are responsible for handling side
|
||||
* effects when writing MSRs and CRs, e.g. MMU context resets, CPUID
|
||||
* runtime updates, etc... If that changes, e.g. this flow is moved
|
||||
* out of the emulator to make it look more like enter_smm(), then
|
||||
* those side effects need to be explicitly handled for both success
|
||||
* and shutdown.
|
||||
*/
|
||||
return emulator_recalc_and_set_mode(ctxt);
|
||||
|
||||
emulate_shutdown:
|
||||
ctxt->ops->triple_fault(ctxt);
|
||||
return X86EMUL_CONTINUE;
|
||||
}
|
||||
|
||||
static void
|
||||
|
@ -23,22 +23,25 @@
|
||||
#include "ioapic.h"
|
||||
#include "cpuid.h"
|
||||
#include "hyperv.h"
|
||||
#include "mmu.h"
|
||||
#include "xen.h"
|
||||
|
||||
#include <linux/cpu.h>
|
||||
#include <linux/kvm_host.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/sched/cputime.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/eventfd.h>
|
||||
|
||||
#include <asm/apicdef.h>
|
||||
#include <asm/mshyperv.h>
|
||||
#include <trace/events/kvm.h>
|
||||
|
||||
#include "trace.h"
|
||||
#include "irq.h"
|
||||
#include "fpu.h"
|
||||
|
||||
#define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, 64)
|
||||
#define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, HV_VCPUS_PER_SPARSE_BANK)
|
||||
|
||||
static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
|
||||
bool vcpu_kick);
|
||||
@ -897,13 +900,15 @@ bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kvm_hv_assist_page_enabled);
|
||||
|
||||
bool kvm_hv_get_assist_page(struct kvm_vcpu *vcpu,
|
||||
struct hv_vp_assist_page *assist_page)
|
||||
int kvm_hv_get_assist_page(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
if (!kvm_hv_assist_page_enabled(vcpu))
|
||||
return false;
|
||||
return !kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data,
|
||||
assist_page, sizeof(*assist_page));
|
||||
struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
|
||||
|
||||
if (!hv_vcpu || !kvm_hv_assist_page_enabled(vcpu))
|
||||
return -EFAULT;
|
||||
|
||||
return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data,
|
||||
&hv_vcpu->vp_assist_page, sizeof(struct hv_vp_assist_page));
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kvm_hv_get_assist_page);
|
||||
|
||||
@ -954,6 +959,11 @@ int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu)
|
||||
|
||||
hv_vcpu->vp_index = vcpu->vcpu_idx;
|
||||
|
||||
for (i = 0; i < HV_NR_TLB_FLUSH_FIFOS; i++) {
|
||||
INIT_KFIFO(hv_vcpu->tlb_flush_fifo[i].entries);
|
||||
spin_lock_init(&hv_vcpu->tlb_flush_fifo[i].write_lock);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -1736,6 +1746,28 @@ static void sparse_set_to_vcpu_mask(struct kvm *kvm, u64 *sparse_banks,
|
||||
}
|
||||
}
|
||||
|
||||
static bool hv_is_vp_in_sparse_set(u32 vp_id, u64 valid_bank_mask, u64 sparse_banks[])
|
||||
{
|
||||
int valid_bit_nr = vp_id / HV_VCPUS_PER_SPARSE_BANK;
|
||||
unsigned long sbank;
|
||||
|
||||
if (!test_bit(valid_bit_nr, (unsigned long *)&valid_bank_mask))
|
||||
return false;
|
||||
|
||||
/*
|
||||
* The index into the sparse bank is the number of preceding bits in
|
||||
* the valid mask. Optimize for VMs with <64 vCPUs by skipping the
|
||||
* fancy math if there can't possibly be preceding bits.
|
||||
*/
|
||||
if (valid_bit_nr)
|
||||
sbank = hweight64(valid_bank_mask & GENMASK_ULL(valid_bit_nr - 1, 0));
|
||||
else
|
||||
sbank = 0;
|
||||
|
||||
return test_bit(vp_id % HV_VCPUS_PER_SPARSE_BANK,
|
||||
(unsigned long *)&sparse_banks[sbank]);
|
||||
}
|
||||
|
||||
struct kvm_hv_hcall {
|
||||
u64 param;
|
||||
u64 ingpa;
|
||||
@ -1749,57 +1781,173 @@ struct kvm_hv_hcall {
|
||||
sse128_t xmm[HV_HYPERCALL_MAX_XMM_REGISTERS];
|
||||
};
|
||||
|
||||
static u64 kvm_get_sparse_vp_set(struct kvm *kvm, struct kvm_hv_hcall *hc,
|
||||
int consumed_xmm_halves,
|
||||
u64 *sparse_banks, gpa_t offset)
|
||||
|
||||
static int kvm_hv_get_hc_data(struct kvm *kvm, struct kvm_hv_hcall *hc,
|
||||
u16 orig_cnt, u16 cnt_cap, u64 *data,
|
||||
int consumed_xmm_halves, gpa_t offset)
|
||||
{
|
||||
u16 var_cnt;
|
||||
int i;
|
||||
|
||||
if (hc->var_cnt > 64)
|
||||
return -EINVAL;
|
||||
|
||||
/* Ignore banks that cannot possibly contain a legal VP index. */
|
||||
var_cnt = min_t(u16, hc->var_cnt, KVM_HV_MAX_SPARSE_VCPU_SET_BITS);
|
||||
/*
|
||||
* Preserve the original count when ignoring entries via a "cap", KVM
|
||||
* still needs to validate the guest input (though the non-XMM path
|
||||
* punts on the checks).
|
||||
*/
|
||||
u16 cnt = min(orig_cnt, cnt_cap);
|
||||
int i, j;
|
||||
|
||||
if (hc->fast) {
|
||||
/*
|
||||
* Each XMM holds two sparse banks, but do not count halves that
|
||||
* have already been consumed for hypercall parameters.
|
||||
*/
|
||||
if (hc->var_cnt > 2 * HV_HYPERCALL_MAX_XMM_REGISTERS - consumed_xmm_halves)
|
||||
if (orig_cnt > 2 * HV_HYPERCALL_MAX_XMM_REGISTERS - consumed_xmm_halves)
|
||||
return HV_STATUS_INVALID_HYPERCALL_INPUT;
|
||||
for (i = 0; i < var_cnt; i++) {
|
||||
int j = i + consumed_xmm_halves;
|
||||
|
||||
for (i = 0; i < cnt; i++) {
|
||||
j = i + consumed_xmm_halves;
|
||||
if (j % 2)
|
||||
sparse_banks[i] = sse128_hi(hc->xmm[j / 2]);
|
||||
data[i] = sse128_hi(hc->xmm[j / 2]);
|
||||
else
|
||||
sparse_banks[i] = sse128_lo(hc->xmm[j / 2]);
|
||||
data[i] = sse128_lo(hc->xmm[j / 2]);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
return kvm_read_guest(kvm, hc->ingpa + offset, sparse_banks,
|
||||
var_cnt * sizeof(*sparse_banks));
|
||||
return kvm_read_guest(kvm, hc->ingpa + offset, data,
|
||||
cnt * sizeof(*data));
|
||||
}
|
||||
|
||||
static u64 kvm_get_sparse_vp_set(struct kvm *kvm, struct kvm_hv_hcall *hc,
|
||||
u64 *sparse_banks, int consumed_xmm_halves,
|
||||
gpa_t offset)
|
||||
{
|
||||
if (hc->var_cnt > HV_MAX_SPARSE_VCPU_BANKS)
|
||||
return -EINVAL;
|
||||
|
||||
/* Cap var_cnt to ignore banks that cannot contain a legal VP index. */
|
||||
return kvm_hv_get_hc_data(kvm, hc, hc->var_cnt, KVM_HV_MAX_SPARSE_VCPU_SET_BITS,
|
||||
sparse_banks, consumed_xmm_halves, offset);
|
||||
}
|
||||
|
||||
static int kvm_hv_get_tlb_flush_entries(struct kvm *kvm, struct kvm_hv_hcall *hc, u64 entries[],
|
||||
int consumed_xmm_halves, gpa_t offset)
|
||||
{
|
||||
return kvm_hv_get_hc_data(kvm, hc, hc->rep_cnt, hc->rep_cnt,
|
||||
entries, consumed_xmm_halves, offset);
|
||||
}
|
||||
|
||||
static void hv_tlb_flush_enqueue(struct kvm_vcpu *vcpu,
|
||||
struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo,
|
||||
u64 *entries, int count)
|
||||
{
|
||||
struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
|
||||
u64 flush_all_entry = KVM_HV_TLB_FLUSHALL_ENTRY;
|
||||
|
||||
if (!hv_vcpu)
|
||||
return;
|
||||
|
||||
spin_lock(&tlb_flush_fifo->write_lock);
|
||||
|
||||
/*
|
||||
* All entries should fit on the fifo leaving one free for 'flush all'
|
||||
* entry in case another request comes in. In case there's not enough
|
||||
* space, just put 'flush all' entry there.
|
||||
*/
|
||||
if (count && entries && count < kfifo_avail(&tlb_flush_fifo->entries)) {
|
||||
WARN_ON(kfifo_in(&tlb_flush_fifo->entries, entries, count) != count);
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
/*
|
||||
* Note: full fifo always contains 'flush all' entry, no need to check the
|
||||
* return value.
|
||||
*/
|
||||
kfifo_in(&tlb_flush_fifo->entries, &flush_all_entry, 1);
|
||||
|
||||
out_unlock:
|
||||
spin_unlock(&tlb_flush_fifo->write_lock);
|
||||
}
|
||||
|
||||
int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo;
|
||||
struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
|
||||
u64 entries[KVM_HV_TLB_FLUSH_FIFO_SIZE];
|
||||
int i, j, count;
|
||||
gva_t gva;
|
||||
|
||||
if (!tdp_enabled || !hv_vcpu)
|
||||
return -EINVAL;
|
||||
|
||||
tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(vcpu, is_guest_mode(vcpu));
|
||||
|
||||
count = kfifo_out(&tlb_flush_fifo->entries, entries, KVM_HV_TLB_FLUSH_FIFO_SIZE);
|
||||
|
||||
for (i = 0; i < count; i++) {
|
||||
if (entries[i] == KVM_HV_TLB_FLUSHALL_ENTRY)
|
||||
goto out_flush_all;
|
||||
|
||||
/*
|
||||
* Lower 12 bits of 'address' encode the number of additional
|
||||
* pages to flush.
|
||||
*/
|
||||
gva = entries[i] & PAGE_MASK;
|
||||
for (j = 0; j < (entries[i] & ~PAGE_MASK) + 1; j++)
|
||||
static_call(kvm_x86_flush_tlb_gva)(vcpu, gva + j * PAGE_SIZE);
|
||||
|
||||
++vcpu->stat.tlb_flush;
|
||||
}
|
||||
return 0;
|
||||
|
||||
out_flush_all:
|
||||
kfifo_reset_out(&tlb_flush_fifo->entries);
|
||||
|
||||
/* Fall back to full flush. */
|
||||
return -ENOSPC;
|
||||
}
|
||||
|
||||
static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
|
||||
{
|
||||
struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
|
||||
u64 *sparse_banks = hv_vcpu->sparse_banks;
|
||||
struct kvm *kvm = vcpu->kvm;
|
||||
struct hv_tlb_flush_ex flush_ex;
|
||||
struct hv_tlb_flush flush;
|
||||
DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS);
|
||||
struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo;
|
||||
/*
|
||||
* Normally, there can be no more than 'KVM_HV_TLB_FLUSH_FIFO_SIZE'
|
||||
* entries on the TLB flush fifo. The last entry, however, needs to be
|
||||
* always left free for 'flush all' entry which gets placed when
|
||||
* there is not enough space to put all the requested entries.
|
||||
*/
|
||||
u64 __tlb_flush_entries[KVM_HV_TLB_FLUSH_FIFO_SIZE - 1];
|
||||
u64 *tlb_flush_entries;
|
||||
u64 valid_bank_mask;
|
||||
u64 sparse_banks[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
|
||||
struct kvm_vcpu *v;
|
||||
unsigned long i;
|
||||
bool all_cpus;
|
||||
int consumed_xmm_halves = 0;
|
||||
gpa_t data_offset;
|
||||
|
||||
/*
|
||||
* The Hyper-V TLFS doesn't allow more than 64 sparse banks, e.g. the
|
||||
* valid mask is a u64. Fail the build if KVM's max allowed number of
|
||||
* vCPUs (>4096) would exceed this limit, KVM will additional changes
|
||||
* for Hyper-V support to avoid setting the guest up to fail.
|
||||
* The Hyper-V TLFS doesn't allow more than HV_MAX_SPARSE_VCPU_BANKS
|
||||
* sparse banks. Fail the build if KVM's max allowed number of
|
||||
* vCPUs (>4096) exceeds this limit.
|
||||
*/
|
||||
BUILD_BUG_ON(KVM_HV_MAX_SPARSE_VCPU_SET_BITS > 64);
|
||||
BUILD_BUG_ON(KVM_HV_MAX_SPARSE_VCPU_SET_BITS > HV_MAX_SPARSE_VCPU_BANKS);
|
||||
|
||||
/*
|
||||
* 'Slow' hypercall's first parameter is the address in guest's memory
|
||||
* where hypercall parameters are placed. This is either a GPA or a
|
||||
* nested GPA when KVM is handling the call from L2 ('direct' TLB
|
||||
* flush). Translate the address here so the memory can be uniformly
|
||||
* read with kvm_read_guest().
|
||||
*/
|
||||
if (!hc->fast && is_guest_mode(vcpu)) {
|
||||
hc->ingpa = translate_nested_gpa(vcpu, hc->ingpa, 0, NULL);
|
||||
if (unlikely(hc->ingpa == INVALID_GPA))
|
||||
return HV_STATUS_INVALID_HYPERCALL_INPUT;
|
||||
}
|
||||
|
||||
if (hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST ||
|
||||
hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE) {
|
||||
@ -1807,14 +1955,17 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
|
||||
flush.address_space = hc->ingpa;
|
||||
flush.flags = hc->outgpa;
|
||||
flush.processor_mask = sse128_lo(hc->xmm[0]);
|
||||
consumed_xmm_halves = 1;
|
||||
} else {
|
||||
if (unlikely(kvm_read_guest(kvm, hc->ingpa,
|
||||
&flush, sizeof(flush))))
|
||||
return HV_STATUS_INVALID_HYPERCALL_INPUT;
|
||||
data_offset = sizeof(flush);
|
||||
}
|
||||
|
||||
trace_kvm_hv_flush_tlb(flush.processor_mask,
|
||||
flush.address_space, flush.flags);
|
||||
flush.address_space, flush.flags,
|
||||
is_guest_mode(vcpu));
|
||||
|
||||
valid_bank_mask = BIT_ULL(0);
|
||||
sparse_banks[0] = flush.processor_mask;
|
||||
@ -1834,16 +1985,18 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
|
||||
flush_ex.flags = hc->outgpa;
|
||||
memcpy(&flush_ex.hv_vp_set,
|
||||
&hc->xmm[0], sizeof(hc->xmm[0]));
|
||||
consumed_xmm_halves = 2;
|
||||
} else {
|
||||
if (unlikely(kvm_read_guest(kvm, hc->ingpa, &flush_ex,
|
||||
sizeof(flush_ex))))
|
||||
return HV_STATUS_INVALID_HYPERCALL_INPUT;
|
||||
data_offset = sizeof(flush_ex);
|
||||
}
|
||||
|
||||
trace_kvm_hv_flush_tlb_ex(flush_ex.hv_vp_set.valid_bank_mask,
|
||||
flush_ex.hv_vp_set.format,
|
||||
flush_ex.address_space,
|
||||
flush_ex.flags);
|
||||
flush_ex.flags, is_guest_mode(vcpu));
|
||||
|
||||
valid_bank_mask = flush_ex.hv_vp_set.valid_bank_mask;
|
||||
all_cpus = flush_ex.hv_vp_set.format !=
|
||||
@ -1852,29 +2005,95 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
|
||||
if (hc->var_cnt != hweight64(valid_bank_mask))
|
||||
return HV_STATUS_INVALID_HYPERCALL_INPUT;
|
||||
|
||||
if (all_cpus)
|
||||
goto do_flush;
|
||||
if (!all_cpus) {
|
||||
if (!hc->var_cnt)
|
||||
goto ret_success;
|
||||
|
||||
if (!hc->var_cnt)
|
||||
goto ret_success;
|
||||
if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks,
|
||||
consumed_xmm_halves, data_offset))
|
||||
return HV_STATUS_INVALID_HYPERCALL_INPUT;
|
||||
}
|
||||
|
||||
if (kvm_get_sparse_vp_set(kvm, hc, 2, sparse_banks,
|
||||
offsetof(struct hv_tlb_flush_ex,
|
||||
hv_vp_set.bank_contents)))
|
||||
return HV_STATUS_INVALID_HYPERCALL_INPUT;
|
||||
/*
|
||||
* Hyper-V TLFS doesn't explicitly forbid non-empty sparse vCPU
|
||||
* banks (and, thus, non-zero 'var_cnt') for the 'all vCPUs'
|
||||
* case (HV_GENERIC_SET_ALL). Always adjust data_offset and
|
||||
* consumed_xmm_halves to make sure TLB flush entries are read
|
||||
* from the correct offset.
|
||||
*/
|
||||
data_offset += hc->var_cnt * sizeof(sparse_banks[0]);
|
||||
consumed_xmm_halves += hc->var_cnt;
|
||||
}
|
||||
|
||||
if (hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE ||
|
||||
hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX ||
|
||||
hc->rep_cnt > ARRAY_SIZE(__tlb_flush_entries)) {
|
||||
tlb_flush_entries = NULL;
|
||||
} else {
|
||||
if (kvm_hv_get_tlb_flush_entries(kvm, hc, __tlb_flush_entries,
|
||||
consumed_xmm_halves, data_offset))
|
||||
return HV_STATUS_INVALID_HYPERCALL_INPUT;
|
||||
tlb_flush_entries = __tlb_flush_entries;
|
||||
}
|
||||
|
||||
do_flush:
|
||||
/*
|
||||
* vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't
|
||||
* analyze it here, flush TLB regardless of the specified address space.
|
||||
*/
|
||||
if (all_cpus) {
|
||||
kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH_GUEST);
|
||||
} else {
|
||||
if (all_cpus && !is_guest_mode(vcpu)) {
|
||||
kvm_for_each_vcpu(i, v, kvm) {
|
||||
tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, false);
|
||||
hv_tlb_flush_enqueue(v, tlb_flush_fifo,
|
||||
tlb_flush_entries, hc->rep_cnt);
|
||||
}
|
||||
|
||||
kvm_make_all_cpus_request(kvm, KVM_REQ_HV_TLB_FLUSH);
|
||||
} else if (!is_guest_mode(vcpu)) {
|
||||
sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, vcpu_mask);
|
||||
|
||||
kvm_make_vcpus_request_mask(kvm, KVM_REQ_TLB_FLUSH_GUEST, vcpu_mask);
|
||||
for_each_set_bit(i, vcpu_mask, KVM_MAX_VCPUS) {
|
||||
v = kvm_get_vcpu(kvm, i);
|
||||
if (!v)
|
||||
continue;
|
||||
tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, false);
|
||||
hv_tlb_flush_enqueue(v, tlb_flush_fifo,
|
||||
tlb_flush_entries, hc->rep_cnt);
|
||||
}
|
||||
|
||||
kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, vcpu_mask);
|
||||
} else {
|
||||
struct kvm_vcpu_hv *hv_v;
|
||||
|
||||
bitmap_zero(vcpu_mask, KVM_MAX_VCPUS);
|
||||
|
||||
kvm_for_each_vcpu(i, v, kvm) {
|
||||
hv_v = to_hv_vcpu(v);
|
||||
|
||||
/*
|
||||
* The following check races with nested vCPUs entering/exiting
|
||||
* and/or migrating between L1's vCPUs, however the only case when
|
||||
* KVM *must* flush the TLB is when the target L2 vCPU keeps
|
||||
* running on the same L1 vCPU from the moment of the request until
|
||||
* kvm_hv_flush_tlb() returns. TLB is fully flushed in all other
|
||||
* cases, e.g. when the target L2 vCPU migrates to a different L1
|
||||
* vCPU or when the corresponding L1 vCPU temporary switches to a
|
||||
* different L2 vCPU while the request is being processed.
|
||||
*/
|
||||
if (!hv_v || hv_v->nested.vm_id != hv_vcpu->nested.vm_id)
|
||||
continue;
|
||||
|
||||
if (!all_cpus &&
|
||||
!hv_is_vp_in_sparse_set(hv_v->nested.vp_id, valid_bank_mask,
|
||||
sparse_banks))
|
||||
continue;
|
||||
|
||||
__set_bit(i, vcpu_mask);
|
||||
tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, true);
|
||||
hv_tlb_flush_enqueue(v, tlb_flush_fifo,
|
||||
tlb_flush_entries, hc->rep_cnt);
|
||||
}
|
||||
|
||||
kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, vcpu_mask);
|
||||
}
|
||||
|
||||
ret_success:
|
||||
@ -1883,8 +2102,8 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
|
||||
((u64)hc->rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET);
|
||||
}
|
||||
|
||||
static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector,
|
||||
unsigned long *vcpu_bitmap)
|
||||
static void kvm_hv_send_ipi_to_many(struct kvm *kvm, u32 vector,
|
||||
u64 *sparse_banks, u64 valid_bank_mask)
|
||||
{
|
||||
struct kvm_lapic_irq irq = {
|
||||
.delivery_mode = APIC_DM_FIXED,
|
||||
@ -1894,7 +2113,9 @@ static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector,
|
||||
unsigned long i;
|
||||
|
||||
kvm_for_each_vcpu(i, vcpu, kvm) {
|
||||
if (vcpu_bitmap && !test_bit(i, vcpu_bitmap))
|
||||
if (sparse_banks &&
|
||||
!hv_is_vp_in_sparse_set(kvm_hv_get_vpindex(vcpu),
|
||||
valid_bank_mask, sparse_banks))
|
||||
continue;
|
||||
|
||||
/* We fail only when APIC is disabled */
|
||||
@ -1904,12 +2125,12 @@ static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector,
|
||||
|
||||
static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
|
||||
{
|
||||
struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
|
||||
u64 *sparse_banks = hv_vcpu->sparse_banks;
|
||||
struct kvm *kvm = vcpu->kvm;
|
||||
struct hv_send_ipi_ex send_ipi_ex;
|
||||
struct hv_send_ipi send_ipi;
|
||||
DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS);
|
||||
u64 valid_bank_mask;
|
||||
u64 sparse_banks[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
|
||||
u32 vector;
|
||||
bool all_cpus;
|
||||
|
||||
@ -1959,7 +2180,7 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
|
||||
if (!hc->var_cnt)
|
||||
goto ret_success;
|
||||
|
||||
if (kvm_get_sparse_vp_set(kvm, hc, 1, sparse_banks,
|
||||
if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks, 1,
|
||||
offsetof(struct hv_send_ipi_ex,
|
||||
vp_set.bank_contents)))
|
||||
return HV_STATUS_INVALID_HYPERCALL_INPUT;
|
||||
@ -1969,13 +2190,10 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
|
||||
if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR))
|
||||
return HV_STATUS_INVALID_HYPERCALL_INPUT;
|
||||
|
||||
if (all_cpus) {
|
||||
kvm_send_ipi_to_many(kvm, vector, NULL);
|
||||
} else {
|
||||
sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, vcpu_mask);
|
||||
|
||||
kvm_send_ipi_to_many(kvm, vector, vcpu_mask);
|
||||
}
|
||||
if (all_cpus)
|
||||
kvm_hv_send_ipi_to_many(kvm, vector, NULL, 0);
|
||||
else
|
||||
kvm_hv_send_ipi_to_many(kvm, vector, sparse_banks, valid_bank_mask);
|
||||
|
||||
ret_success:
|
||||
return HV_STATUS_SUCCESS;
|
||||
@ -2062,10 +2280,25 @@ static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
|
||||
|
||||
static int kvm_hv_hypercall_complete(struct kvm_vcpu *vcpu, u64 result)
|
||||
{
|
||||
u32 tlb_lock_count = 0;
|
||||
int ret;
|
||||
|
||||
if (hv_result_success(result) && is_guest_mode(vcpu) &&
|
||||
kvm_hv_is_tlb_flush_hcall(vcpu) &&
|
||||
kvm_read_guest(vcpu->kvm, to_hv_vcpu(vcpu)->nested.pa_page_gpa,
|
||||
&tlb_lock_count, sizeof(tlb_lock_count)))
|
||||
result = HV_STATUS_INVALID_HYPERCALL_INPUT;
|
||||
|
||||
trace_kvm_hv_hypercall_done(result);
|
||||
kvm_hv_hypercall_set_result(vcpu, result);
|
||||
++vcpu->stat.hypercalls;
|
||||
return kvm_skip_emulated_instruction(vcpu);
|
||||
|
||||
ret = kvm_skip_emulated_instruction(vcpu);
|
||||
|
||||
if (tlb_lock_count)
|
||||
kvm_x86_ops.nested_ops->hv_inject_synthetic_vmexit_post_tlb_flush(vcpu);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
|
||||
@ -2502,6 +2735,7 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
|
||||
ent->ebx |= HV_DEBUGGING;
|
||||
ent->edx |= HV_X64_GUEST_DEBUGGING_AVAILABLE;
|
||||
ent->edx |= HV_FEATURE_DEBUG_MSRS_AVAILABLE;
|
||||
ent->edx |= HV_FEATURE_EXT_GVA_RANGES_FLUSH;
|
||||
|
||||
/*
|
||||
* Direct Synthetic timers only make sense with in-kernel
|
||||
@ -2545,6 +2779,7 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
|
||||
|
||||
case HYPERV_CPUID_NESTED_FEATURES:
|
||||
ent->eax = evmcs_ver;
|
||||
ent->eax |= HV_X64_NESTED_DIRECT_FLUSH;
|
||||
ent->eax |= HV_X64_NESTED_MSR_BITMAP;
|
||||
ent->ebx |= HV_X64_NESTED_EVMCS1_PERF_GLOBAL_CTRL;
|
||||
break;
|
||||
|
@ -22,6 +22,7 @@
|
||||
#define __ARCH_X86_KVM_HYPERV_H__
|
||||
|
||||
#include <linux/kvm_host.h>
|
||||
#include "x86.h"
|
||||
|
||||
/* "Hv#1" signature */
|
||||
#define HYPERV_CPUID_SIGNATURE_EAX 0x31237648
|
||||
@ -107,8 +108,7 @@ int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages);
|
||||
void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu);
|
||||
|
||||
bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu);
|
||||
bool kvm_hv_get_assist_page(struct kvm_vcpu *vcpu,
|
||||
struct hv_vp_assist_page *assist_page);
|
||||
int kvm_hv_get_assist_page(struct kvm_vcpu *vcpu);
|
||||
|
||||
static inline struct kvm_vcpu_hv_stimer *to_hv_stimer(struct kvm_vcpu *vcpu,
|
||||
int timer_index)
|
||||
@ -151,4 +151,64 @@ int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args);
|
||||
int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
|
||||
struct kvm_cpuid_entry2 __user *entries);
|
||||
|
||||
static inline struct kvm_vcpu_hv_tlb_flush_fifo *kvm_hv_get_tlb_flush_fifo(struct kvm_vcpu *vcpu,
|
||||
bool is_guest_mode)
|
||||
{
|
||||
struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
|
||||
int i = is_guest_mode ? HV_L2_TLB_FLUSH_FIFO :
|
||||
HV_L1_TLB_FLUSH_FIFO;
|
||||
|
||||
return &hv_vcpu->tlb_flush_fifo[i];
|
||||
}
|
||||
|
||||
static inline void kvm_hv_vcpu_purge_flush_tlb(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo;
|
||||
|
||||
if (!to_hv_vcpu(vcpu) || !kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu))
|
||||
return;
|
||||
|
||||
tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(vcpu, is_guest_mode(vcpu));
|
||||
|
||||
kfifo_reset_out(&tlb_flush_fifo->entries);
|
||||
}
|
||||
|
||||
static inline bool guest_hv_cpuid_has_l2_tlb_flush(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
|
||||
|
||||
return hv_vcpu &&
|
||||
(hv_vcpu->cpuid_cache.nested_eax & HV_X64_NESTED_DIRECT_FLUSH);
|
||||
}
|
||||
|
||||
static inline bool kvm_hv_is_tlb_flush_hcall(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
|
||||
u16 code;
|
||||
|
||||
if (!hv_vcpu)
|
||||
return false;
|
||||
|
||||
code = is_64_bit_hypercall(vcpu) ? kvm_rcx_read(vcpu) :
|
||||
kvm_rax_read(vcpu);
|
||||
|
||||
return (code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE ||
|
||||
code == HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST ||
|
||||
code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX ||
|
||||
code == HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX);
|
||||
}
|
||||
|
||||
static inline int kvm_hv_verify_vp_assist(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
if (!to_hv_vcpu(vcpu))
|
||||
return 0;
|
||||
|
||||
if (!kvm_hv_assist_page_enabled(vcpu))
|
||||
return 0;
|
||||
|
||||
return kvm_hv_get_assist_page(vcpu);
|
||||
}
|
||||
|
||||
int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu);
|
||||
|
||||
#endif
|
||||
|
@ -31,7 +31,6 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
|
||||
|
||||
return r;
|
||||
}
|
||||
EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
|
||||
|
||||
/*
|
||||
* check if there is a pending userspace external interrupt
|
||||
@ -150,7 +149,6 @@ void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
|
||||
if (kvm_xen_timer_enabled(vcpu))
|
||||
kvm_xen_inject_timer_irqs(vcpu);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
|
||||
|
||||
void __kvm_migrate_timers(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
@ -165,3 +163,8 @@ bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args)
|
||||
|
||||
return resample ? irqchip_kernel(kvm) : irqchip_in_kernel(kvm);
|
||||
}
|
||||
|
||||
bool kvm_arch_irqchip_in_kernel(struct kvm *kvm)
|
||||
{
|
||||
return irqchip_in_kernel(kvm);
|
||||
}
|
||||
|
@ -200,9 +200,4 @@ static inline bool is_guest_mode(struct kvm_vcpu *vcpu)
|
||||
return vcpu->arch.hflags & HF_GUEST_MASK;
|
||||
}
|
||||
|
||||
static inline bool is_smm(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
return vcpu->arch.hflags & HF_SMM_MASK;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -116,16 +116,6 @@ struct x86_emulate_ops {
|
||||
unsigned int bytes,
|
||||
struct x86_exception *fault, bool system);
|
||||
|
||||
/*
|
||||
* read_phys: Read bytes of standard (non-emulated/special) memory.
|
||||
* Used for descriptor reading.
|
||||
* @addr: [IN ] Physical address from which to read.
|
||||
* @val: [OUT] Value read from memory.
|
||||
* @bytes: [IN ] Number of bytes to read from memory.
|
||||
*/
|
||||
int (*read_phys)(struct x86_emulate_ctxt *ctxt, unsigned long addr,
|
||||
void *val, unsigned int bytes);
|
||||
|
||||
/*
|
||||
* write_std: Write bytes of standard (non-emulated/special) memory.
|
||||
* Used for descriptor writing.
|
||||
@ -209,11 +199,8 @@ struct x86_emulate_ops {
|
||||
int (*cpl)(struct x86_emulate_ctxt *ctxt);
|
||||
void (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest);
|
||||
int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
|
||||
u64 (*get_smbase)(struct x86_emulate_ctxt *ctxt);
|
||||
void (*set_smbase)(struct x86_emulate_ctxt *ctxt, u64 smbase);
|
||||
int (*set_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data);
|
||||
int (*get_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
|
||||
int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data);
|
||||
int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
|
||||
int (*check_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc);
|
||||
int (*read_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc, u64 *pdata);
|
||||
@ -234,8 +221,7 @@ struct x86_emulate_ops {
|
||||
void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked);
|
||||
|
||||
unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt);
|
||||
void (*exiting_smm)(struct x86_emulate_ctxt *ctxt);
|
||||
int (*leave_smm)(struct x86_emulate_ctxt *ctxt, const char *smstate);
|
||||
int (*leave_smm)(struct x86_emulate_ctxt *ctxt);
|
||||
void (*triple_fault)(struct x86_emulate_ctxt *ctxt);
|
||||
int (*set_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr);
|
||||
};
|
||||
@ -292,7 +278,6 @@ enum x86emul_mode {
|
||||
/* These match some of the HF_* flags defined in kvm_host.h */
|
||||
#define X86EMUL_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */
|
||||
#define X86EMUL_SMM_MASK (1 << 6)
|
||||
#define X86EMUL_SMM_INSIDE_NMI_MASK (1 << 7)
|
||||
|
||||
/*
|
||||
* fastop functions are declared as taking a never-defined fastop parameter,
|
||||
@ -526,4 +511,35 @@ void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt);
|
||||
void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt);
|
||||
bool emulator_can_use_gpa(struct x86_emulate_ctxt *ctxt);
|
||||
|
||||
static inline ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr)
|
||||
{
|
||||
if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt))
|
||||
nr &= NR_EMULATOR_GPRS - 1;
|
||||
|
||||
if (!(ctxt->regs_valid & (1 << nr))) {
|
||||
ctxt->regs_valid |= 1 << nr;
|
||||
ctxt->_regs[nr] = ctxt->ops->read_gpr(ctxt, nr);
|
||||
}
|
||||
return ctxt->_regs[nr];
|
||||
}
|
||||
|
||||
static inline ulong *reg_write(struct x86_emulate_ctxt *ctxt, unsigned nr)
|
||||
{
|
||||
if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt))
|
||||
nr &= NR_EMULATOR_GPRS - 1;
|
||||
|
||||
BUILD_BUG_ON(sizeof(ctxt->regs_dirty) * BITS_PER_BYTE < NR_EMULATOR_GPRS);
|
||||
BUILD_BUG_ON(sizeof(ctxt->regs_valid) * BITS_PER_BYTE < NR_EMULATOR_GPRS);
|
||||
|
||||
ctxt->regs_valid |= 1 << nr;
|
||||
ctxt->regs_dirty |= 1 << nr;
|
||||
return &ctxt->_regs[nr];
|
||||
}
|
||||
|
||||
static inline ulong *reg_rmw(struct x86_emulate_ctxt *ctxt, unsigned nr)
|
||||
{
|
||||
reg_read(ctxt, nr);
|
||||
return reg_write(ctxt, nr);
|
||||
}
|
||||
|
||||
#endif /* _ASM_X86_KVM_X86_EMULATE_H */
|
||||
|
@ -42,6 +42,7 @@
|
||||
#include "x86.h"
|
||||
#include "cpuid.h"
|
||||
#include "hyperv.h"
|
||||
#include "smm.h"
|
||||
|
||||
#ifndef CONFIG_X86_64
|
||||
#define mod_64(x, y) ((x) - (y) * div64_u64(x, y))
|
||||
@ -159,7 +160,6 @@ bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu)
|
||||
&& !(kvm_mwait_in_guest(vcpu->kvm) ||
|
||||
kvm_can_post_timer_interrupt(vcpu));
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kvm_can_use_hv_timer);
|
||||
|
||||
static bool kvm_use_posted_timer_interrupt(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
@ -1170,9 +1170,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
|
||||
break;
|
||||
|
||||
case APIC_DM_SMI:
|
||||
result = 1;
|
||||
kvm_make_request(KVM_REQ_SMI, vcpu);
|
||||
kvm_vcpu_kick(vcpu);
|
||||
if (!kvm_inject_smi(vcpu)) {
|
||||
kvm_vcpu_kick(vcpu);
|
||||
result = 1;
|
||||
}
|
||||
break;
|
||||
|
||||
case APIC_DM_NMI:
|
||||
@ -1912,7 +1913,6 @@ bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu)
|
||||
|
||||
return vcpu->arch.apic->lapic_timer.hv_timer_in_use;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use);
|
||||
|
||||
static void cancel_hv_timer(struct kvm_lapic *apic)
|
||||
{
|
||||
@ -2430,7 +2430,6 @@ void kvm_apic_update_apicv(struct kvm_vcpu *vcpu)
|
||||
apic->isr_count = count_vectors(apic->regs + APIC_ISR);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kvm_apic_update_apicv);
|
||||
|
||||
void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
|
||||
{
|
||||
@ -2722,8 +2721,6 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
|
||||
icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR);
|
||||
__kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32);
|
||||
}
|
||||
} else {
|
||||
kvm_lapic_xapic_id_updated(vcpu->arch.apic);
|
||||
}
|
||||
|
||||
return 0;
|
||||
@ -2759,6 +2756,9 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
|
||||
}
|
||||
memcpy(vcpu->arch.apic->regs, s->regs, sizeof(*s));
|
||||
|
||||
if (!apic_x2apic_mode(apic))
|
||||
kvm_lapic_xapic_id_updated(apic);
|
||||
|
||||
atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
|
||||
kvm_recalculate_apic_map(vcpu->kvm);
|
||||
kvm_apic_set_version(vcpu);
|
||||
|
@ -7,7 +7,7 @@
|
||||
#include <linux/kvm_host.h>
|
||||
|
||||
#include "hyperv.h"
|
||||
#include "kvm_cache_regs.h"
|
||||
#include "smm.h"
|
||||
|
||||
#define KVM_APIC_INIT 0
|
||||
#define KVM_APIC_SIPI 1
|
||||
|
@ -22,6 +22,7 @@
|
||||
#include "tdp_mmu.h"
|
||||
#include "x86.h"
|
||||
#include "kvm_cache_regs.h"
|
||||
#include "smm.h"
|
||||
#include "kvm_emulate.h"
|
||||
#include "cpuid.h"
|
||||
#include "spte.h"
|
||||
@ -802,15 +803,31 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
|
||||
kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
|
||||
}
|
||||
|
||||
void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
|
||||
void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
|
||||
{
|
||||
if (sp->lpage_disallowed)
|
||||
/*
|
||||
* If it's possible to replace the shadow page with an NX huge page,
|
||||
* i.e. if the shadow page is the only thing currently preventing KVM
|
||||
* from using a huge page, add the shadow page to the list of "to be
|
||||
* zapped for NX recovery" pages. Note, the shadow page can already be
|
||||
* on the list if KVM is reusing an existing shadow page, i.e. if KVM
|
||||
* links a shadow page at multiple points.
|
||||
*/
|
||||
if (!list_empty(&sp->possible_nx_huge_page_link))
|
||||
return;
|
||||
|
||||
++kvm->stat.nx_lpage_splits;
|
||||
list_add_tail(&sp->lpage_disallowed_link,
|
||||
&kvm->arch.lpage_disallowed_mmu_pages);
|
||||
sp->lpage_disallowed = true;
|
||||
list_add_tail(&sp->possible_nx_huge_page_link,
|
||||
&kvm->arch.possible_nx_huge_pages);
|
||||
}
|
||||
|
||||
static void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
|
||||
bool nx_huge_page_possible)
|
||||
{
|
||||
sp->nx_huge_page_disallowed = true;
|
||||
|
||||
if (nx_huge_page_possible)
|
||||
track_possible_nx_huge_page(kvm, sp);
|
||||
}
|
||||
|
||||
static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
|
||||
@ -830,11 +847,20 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
|
||||
kvm_mmu_gfn_allow_lpage(slot, gfn);
|
||||
}
|
||||
|
||||
void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
|
||||
void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
|
||||
{
|
||||
if (list_empty(&sp->possible_nx_huge_page_link))
|
||||
return;
|
||||
|
||||
--kvm->stat.nx_lpage_splits;
|
||||
sp->lpage_disallowed = false;
|
||||
list_del(&sp->lpage_disallowed_link);
|
||||
list_del_init(&sp->possible_nx_huge_page_link);
|
||||
}
|
||||
|
||||
static void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
|
||||
{
|
||||
sp->nx_huge_page_disallowed = false;
|
||||
|
||||
untrack_possible_nx_huge_page(kvm, sp);
|
||||
}
|
||||
|
||||
static struct kvm_memory_slot *
|
||||
@ -1645,7 +1671,7 @@ static int is_empty_shadow_page(u64 *spt)
|
||||
u64 *pos;
|
||||
u64 *end;
|
||||
|
||||
for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
|
||||
for (pos = spt, end = pos + SPTE_ENT_PER_PAGE; pos != end; pos++)
|
||||
if (is_shadow_present_pte(*pos)) {
|
||||
printk(KERN_ERR "%s: %p %llx\n", __func__,
|
||||
pos, *pos);
|
||||
@ -1793,7 +1819,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
|
||||
continue;
|
||||
}
|
||||
|
||||
child = to_shadow_page(ent & SPTE_BASE_ADDR_MASK);
|
||||
child = spte_to_child_sp(ent);
|
||||
|
||||
if (child->unsync_children) {
|
||||
if (mmu_pages_add(pvec, child, i))
|
||||
@ -1894,7 +1920,7 @@ static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
|
||||
if (sp->role.invalid)
|
||||
return true;
|
||||
|
||||
/* TDP MMU pages due not use the MMU generation. */
|
||||
/* TDP MMU pages do not use the MMU generation. */
|
||||
return !sp->tdp_mmu_page &&
|
||||
unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
|
||||
}
|
||||
@ -2129,6 +2155,8 @@ static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
|
||||
|
||||
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
|
||||
|
||||
INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);
|
||||
|
||||
/*
|
||||
* active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
|
||||
* depends on valid pages being added to the head of the list. See
|
||||
@ -2350,7 +2378,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
|
||||
* so we should update the spte at this point to get
|
||||
* a new sp with the correct access.
|
||||
*/
|
||||
child = to_shadow_page(*sptep & SPTE_BASE_ADDR_MASK);
|
||||
child = spte_to_child_sp(*sptep);
|
||||
if (child->role.access == direct_access)
|
||||
return;
|
||||
|
||||
@ -2371,7 +2399,7 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
|
||||
if (is_last_spte(pte, sp->role.level)) {
|
||||
drop_spte(kvm, spte);
|
||||
} else {
|
||||
child = to_shadow_page(pte & SPTE_BASE_ADDR_MASK);
|
||||
child = spte_to_child_sp(pte);
|
||||
drop_parent_pte(child, spte);
|
||||
|
||||
/*
|
||||
@ -2487,8 +2515,8 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
|
||||
zapped_root = !is_obsolete_sp(kvm, sp);
|
||||
}
|
||||
|
||||
if (sp->lpage_disallowed)
|
||||
unaccount_huge_nx_page(kvm, sp);
|
||||
if (sp->nx_huge_page_disallowed)
|
||||
unaccount_nx_huge_page(kvm, sp);
|
||||
|
||||
sp->role.invalid = 1;
|
||||
|
||||
@ -2811,7 +2839,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
|
||||
struct kvm_mmu_page *child;
|
||||
u64 pte = *sptep;
|
||||
|
||||
child = to_shadow_page(pte & SPTE_BASE_ADDR_MASK);
|
||||
child = spte_to_child_sp(pte);
|
||||
drop_parent_pte(child, sptep);
|
||||
flush = true;
|
||||
} else if (pfn != spte_to_pfn(*sptep)) {
|
||||
@ -3085,7 +3113,8 @@ void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_
|
||||
if (cur_level > PG_LEVEL_4K &&
|
||||
cur_level == fault->goal_level &&
|
||||
is_shadow_present_pte(spte) &&
|
||||
!is_large_pte(spte)) {
|
||||
!is_large_pte(spte) &&
|
||||
spte_to_child_sp(spte)->nx_huge_page_disallowed) {
|
||||
/*
|
||||
* A small SPTE exists for this pfn, but FNAME(fetch)
|
||||
* and __direct_map would like to create a large PTE
|
||||
@ -3127,9 +3156,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
|
||||
continue;
|
||||
|
||||
link_shadow_page(vcpu, it.sptep, sp);
|
||||
if (fault->is_tdp && fault->huge_page_disallowed &&
|
||||
fault->req_level >= it.level)
|
||||
account_huge_nx_page(vcpu->kvm, sp);
|
||||
if (fault->huge_page_disallowed)
|
||||
account_nx_huge_page(vcpu->kvm, sp,
|
||||
fault->req_level >= it.level);
|
||||
}
|
||||
|
||||
if (WARN_ON_ONCE(it.level != fault->goal_level))
|
||||
@ -3149,8 +3178,13 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *
|
||||
send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk);
|
||||
}
|
||||
|
||||
static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
|
||||
static int kvm_handle_error_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
|
||||
{
|
||||
if (is_sigpending_pfn(pfn)) {
|
||||
kvm_handle_signal_exit(vcpu);
|
||||
return -EINTR;
|
||||
}
|
||||
|
||||
/*
|
||||
* Do not cache the mmio info caused by writing the readonly gfn
|
||||
* into the spte otherwise read access on readonly gfn also can
|
||||
@ -3172,7 +3206,7 @@ static int handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fau
|
||||
{
|
||||
/* The pfn is invalid, report the error! */
|
||||
if (unlikely(is_error_pfn(fault->pfn)))
|
||||
return kvm_handle_bad_page(vcpu, fault->gfn, fault->pfn);
|
||||
return kvm_handle_error_pfn(vcpu, fault->gfn, fault->pfn);
|
||||
|
||||
if (unlikely(!fault->slot)) {
|
||||
gva_t gva = fault->is_tdp ? 0 : fault->addr;
|
||||
@ -3423,7 +3457,11 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
|
||||
if (!VALID_PAGE(*root_hpa))
|
||||
return;
|
||||
|
||||
sp = to_shadow_page(*root_hpa & SPTE_BASE_ADDR_MASK);
|
||||
/*
|
||||
* The "root" may be a special root, e.g. a PAE entry, treat it as a
|
||||
* SPTE to ensure any non-PA bits are dropped.
|
||||
*/
|
||||
sp = spte_to_child_sp(*root_hpa);
|
||||
if (WARN_ON(!sp))
|
||||
return;
|
||||
|
||||
@ -3908,8 +3946,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
|
||||
hpa_t root = vcpu->arch.mmu->pae_root[i];
|
||||
|
||||
if (IS_VALID_PAE_ROOT(root)) {
|
||||
root &= SPTE_BASE_ADDR_MASK;
|
||||
sp = to_shadow_page(root);
|
||||
sp = spte_to_child_sp(root);
|
||||
mmu_sync_children(vcpu, sp, true);
|
||||
}
|
||||
}
|
||||
@ -4170,7 +4207,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
|
||||
}
|
||||
|
||||
async = false;
|
||||
fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, &async,
|
||||
fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async,
|
||||
fault->write, &fault->map_writable,
|
||||
&fault->hva);
|
||||
if (!async)
|
||||
@ -4187,7 +4224,12 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
|
||||
}
|
||||
}
|
||||
|
||||
fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, NULL,
|
||||
/*
|
||||
* Allow gup to bail on pending non-fatal signals when it's also allowed
|
||||
* to wait for IO. Note, gup always bails if it is unable to quickly
|
||||
* get a page and a fatal signal, i.e. SIGKILL, is pending.
|
||||
*/
|
||||
fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, true, NULL,
|
||||
fault->write, &fault->map_writable,
|
||||
&fault->hva);
|
||||
return RET_PF_CONTINUE;
|
||||
@ -5972,7 +6014,7 @@ int kvm_mmu_init_vm(struct kvm *kvm)
|
||||
|
||||
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
|
||||
INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
|
||||
INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages);
|
||||
INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
|
||||
spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
|
||||
|
||||
r = kvm_mmu_init_tdp_mmu(kvm);
|
||||
@ -6657,7 +6699,7 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
|
||||
kvm_mmu_zap_all_fast(kvm);
|
||||
mutex_unlock(&kvm->slots_lock);
|
||||
|
||||
wake_up_process(kvm->arch.nx_lpage_recovery_thread);
|
||||
wake_up_process(kvm->arch.nx_huge_page_recovery_thread);
|
||||
}
|
||||
mutex_unlock(&kvm_lock);
|
||||
}
|
||||
@ -6789,7 +6831,7 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel
|
||||
mutex_lock(&kvm_lock);
|
||||
|
||||
list_for_each_entry(kvm, &vm_list, vm_list)
|
||||
wake_up_process(kvm->arch.nx_lpage_recovery_thread);
|
||||
wake_up_process(kvm->arch.nx_huge_page_recovery_thread);
|
||||
|
||||
mutex_unlock(&kvm_lock);
|
||||
}
|
||||
@ -6797,9 +6839,10 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel
|
||||
return err;
|
||||
}
|
||||
|
||||
static void kvm_recover_nx_lpages(struct kvm *kvm)
|
||||
static void kvm_recover_nx_huge_pages(struct kvm *kvm)
|
||||
{
|
||||
unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits;
|
||||
struct kvm_memory_slot *slot;
|
||||
int rcu_idx;
|
||||
struct kvm_mmu_page *sp;
|
||||
unsigned int ratio;
|
||||
@ -6820,25 +6863,56 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
|
||||
ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
|
||||
to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0;
|
||||
for ( ; to_zap; --to_zap) {
|
||||
if (list_empty(&kvm->arch.lpage_disallowed_mmu_pages))
|
||||
if (list_empty(&kvm->arch.possible_nx_huge_pages))
|
||||
break;
|
||||
|
||||
/*
|
||||
* We use a separate list instead of just using active_mmu_pages
|
||||
* because the number of lpage_disallowed pages is expected to
|
||||
* be relatively small compared to the total.
|
||||
* because the number of shadow pages that be replaced with an
|
||||
* NX huge page is expected to be relatively small compared to
|
||||
* the total number of shadow pages. And because the TDP MMU
|
||||
* doesn't use active_mmu_pages.
|
||||
*/
|
||||
sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages,
|
||||
sp = list_first_entry(&kvm->arch.possible_nx_huge_pages,
|
||||
struct kvm_mmu_page,
|
||||
lpage_disallowed_link);
|
||||
WARN_ON_ONCE(!sp->lpage_disallowed);
|
||||
if (is_tdp_mmu_page(sp)) {
|
||||
flush |= kvm_tdp_mmu_zap_sp(kvm, sp);
|
||||
} else {
|
||||
kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
|
||||
WARN_ON_ONCE(sp->lpage_disallowed);
|
||||
possible_nx_huge_page_link);
|
||||
WARN_ON_ONCE(!sp->nx_huge_page_disallowed);
|
||||
WARN_ON_ONCE(!sp->role.direct);
|
||||
|
||||
/*
|
||||
* Unaccount and do not attempt to recover any NX Huge Pages
|
||||
* that are being dirty tracked, as they would just be faulted
|
||||
* back in as 4KiB pages. The NX Huge Pages in this slot will be
|
||||
* recovered, along with all the other huge pages in the slot,
|
||||
* when dirty logging is disabled.
|
||||
*
|
||||
* Since gfn_to_memslot() is relatively expensive, it helps to
|
||||
* skip it if it the test cannot possibly return true. On the
|
||||
* other hand, if any memslot has logging enabled, chances are
|
||||
* good that all of them do, in which case unaccount_nx_huge_page()
|
||||
* is much cheaper than zapping the page.
|
||||
*
|
||||
* If a memslot update is in progress, reading an incorrect value
|
||||
* of kvm->nr_memslots_dirty_logging is not a problem: if it is
|
||||
* becoming zero, gfn_to_memslot() will be done unnecessarily; if
|
||||
* it is becoming nonzero, the page will be zapped unnecessarily.
|
||||
* Either way, this only affects efficiency in racy situations,
|
||||
* and not correctness.
|
||||
*/
|
||||
slot = NULL;
|
||||
if (atomic_read(&kvm->nr_memslots_dirty_logging)) {
|
||||
slot = gfn_to_memslot(kvm, sp->gfn);
|
||||
WARN_ON_ONCE(!slot);
|
||||
}
|
||||
|
||||
if (slot && kvm_slot_dirty_track_enabled(slot))
|
||||
unaccount_nx_huge_page(kvm, sp);
|
||||
else if (is_tdp_mmu_page(sp))
|
||||
flush |= kvm_tdp_mmu_zap_sp(kvm, sp);
|
||||
else
|
||||
kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
|
||||
WARN_ON_ONCE(sp->nx_huge_page_disallowed);
|
||||
|
||||
if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
|
||||
kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
|
||||
rcu_read_unlock();
|
||||
@ -6857,7 +6931,7 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
|
||||
srcu_read_unlock(&kvm->srcu, rcu_idx);
|
||||
}
|
||||
|
||||
static long get_nx_lpage_recovery_timeout(u64 start_time)
|
||||
static long get_nx_huge_page_recovery_timeout(u64 start_time)
|
||||
{
|
||||
bool enabled;
|
||||
uint period;
|
||||
@ -6868,19 +6942,19 @@ static long get_nx_lpage_recovery_timeout(u64 start_time)
|
||||
: MAX_SCHEDULE_TIMEOUT;
|
||||
}
|
||||
|
||||
static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
|
||||
static int kvm_nx_huge_page_recovery_worker(struct kvm *kvm, uintptr_t data)
|
||||
{
|
||||
u64 start_time;
|
||||
long remaining_time;
|
||||
|
||||
while (true) {
|
||||
start_time = get_jiffies_64();
|
||||
remaining_time = get_nx_lpage_recovery_timeout(start_time);
|
||||
remaining_time = get_nx_huge_page_recovery_timeout(start_time);
|
||||
|
||||
set_current_state(TASK_INTERRUPTIBLE);
|
||||
while (!kthread_should_stop() && remaining_time > 0) {
|
||||
schedule_timeout(remaining_time);
|
||||
remaining_time = get_nx_lpage_recovery_timeout(start_time);
|
||||
remaining_time = get_nx_huge_page_recovery_timeout(start_time);
|
||||
set_current_state(TASK_INTERRUPTIBLE);
|
||||
}
|
||||
|
||||
@ -6889,7 +6963,7 @@ static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
|
||||
if (kthread_should_stop())
|
||||
return 0;
|
||||
|
||||
kvm_recover_nx_lpages(kvm);
|
||||
kvm_recover_nx_huge_pages(kvm);
|
||||
}
|
||||
}
|
||||
|
||||
@ -6897,17 +6971,17 @@ int kvm_mmu_post_init_vm(struct kvm *kvm)
|
||||
{
|
||||
int err;
|
||||
|
||||
err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0,
|
||||
err = kvm_vm_create_worker_thread(kvm, kvm_nx_huge_page_recovery_worker, 0,
|
||||
"kvm-nx-lpage-recovery",
|
||||
&kvm->arch.nx_lpage_recovery_thread);
|
||||
&kvm->arch.nx_huge_page_recovery_thread);
|
||||
if (!err)
|
||||
kthread_unpark(kvm->arch.nx_lpage_recovery_thread);
|
||||
kthread_unpark(kvm->arch.nx_huge_page_recovery_thread);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
|
||||
{
|
||||
if (kvm->arch.nx_lpage_recovery_thread)
|
||||
kthread_stop(kvm->arch.nx_lpage_recovery_thread);
|
||||
if (kvm->arch.nx_huge_page_recovery_thread)
|
||||
kthread_stop(kvm->arch.nx_huge_page_recovery_thread);
|
||||
}
|
||||
|
@ -57,7 +57,13 @@ struct kvm_mmu_page {
|
||||
bool tdp_mmu_page;
|
||||
bool unsync;
|
||||
u8 mmu_valid_gen;
|
||||
bool lpage_disallowed; /* Can't be replaced by an equiv large page */
|
||||
|
||||
/*
|
||||
* The shadow page can't be replaced by an equivalent huge page
|
||||
* because it is being used to map an executable page in the guest
|
||||
* and the NX huge page mitigation is enabled.
|
||||
*/
|
||||
bool nx_huge_page_disallowed;
|
||||
|
||||
/*
|
||||
* The following two entries are used to key the shadow page in the
|
||||
@ -100,7 +106,14 @@ struct kvm_mmu_page {
|
||||
};
|
||||
};
|
||||
|
||||
struct list_head lpage_disallowed_link;
|
||||
/*
|
||||
* Tracks shadow pages that, if zapped, would allow KVM to create an NX
|
||||
* huge page. A shadow page will have nx_huge_page_disallowed set but
|
||||
* not be on the list if a huge page is disallowed for other reasons,
|
||||
* e.g. because KVM is shadowing a PTE at the same gfn, the memslot
|
||||
* isn't properly aligned, etc...
|
||||
*/
|
||||
struct list_head possible_nx_huge_page_link;
|
||||
#ifdef CONFIG_X86_32
|
||||
/*
|
||||
* Used out of the mmu-lock to avoid reading spte values while an
|
||||
@ -120,18 +133,6 @@ struct kvm_mmu_page {
|
||||
|
||||
extern struct kmem_cache *mmu_page_header_cache;
|
||||
|
||||
static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page)
|
||||
{
|
||||
struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
|
||||
|
||||
return (struct kvm_mmu_page *)page_private(page);
|
||||
}
|
||||
|
||||
static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep)
|
||||
{
|
||||
return to_shadow_page(__pa(sptep));
|
||||
}
|
||||
|
||||
static inline int kvm_mmu_role_as_id(union kvm_mmu_page_role role)
|
||||
{
|
||||
return role.smm ? 1 : 0;
|
||||
@ -315,7 +316,7 @@ void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_
|
||||
|
||||
void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
|
||||
|
||||
void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
|
||||
void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
|
||||
void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
|
||||
void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
|
||||
|
||||
#endif /* __KVM_X86_MMU_INTERNAL_H */
|
||||
|
@ -713,9 +713,9 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
|
||||
continue;
|
||||
|
||||
link_shadow_page(vcpu, it.sptep, sp);
|
||||
if (fault->huge_page_disallowed &&
|
||||
fault->req_level >= it.level)
|
||||
account_huge_nx_page(vcpu->kvm, sp);
|
||||
if (fault->huge_page_disallowed)
|
||||
account_nx_huge_page(vcpu->kvm, sp,
|
||||
fault->req_level >= it.level);
|
||||
}
|
||||
|
||||
if (WARN_ON_ONCE(it.level != fault->goal_level))
|
||||
|
@ -161,6 +161,18 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
|
||||
if (!prefetch)
|
||||
spte |= spte_shadow_accessed_mask(spte);
|
||||
|
||||
/*
|
||||
* For simplicity, enforce the NX huge page mitigation even if not
|
||||
* strictly necessary. KVM could ignore the mitigation if paging is
|
||||
* disabled in the guest, as the guest doesn't have an page tables to
|
||||
* abuse. But to safely ignore the mitigation, KVM would have to
|
||||
* ensure a new MMU is loaded (or all shadow pages zapped) when CR0.PG
|
||||
* is toggled on, and that's a net negative for performance when TDP is
|
||||
* enabled. When TDP is disabled, KVM will always switch to a new MMU
|
||||
* when CR0.PG is toggled, but leveraging that to ignore the mitigation
|
||||
* would tie make_spte() further to vCPU/MMU state, and add complexity
|
||||
* just to optimize a mode that is anything but performance critical.
|
||||
*/
|
||||
if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) &&
|
||||
is_nx_huge_page_enabled(vcpu->kvm)) {
|
||||
pte_access &= ~ACC_EXEC_MASK;
|
||||
|
@ -188,7 +188,7 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
|
||||
* should not modify the SPTE.
|
||||
*
|
||||
* Use a semi-arbitrary value that doesn't set RWX bits, i.e. is not-present on
|
||||
* bot AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create a L1TF
|
||||
* both AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create a L1TF
|
||||
* vulnerability. Use only low bits to avoid 64-bit immediates.
|
||||
*
|
||||
* Only used by the TDP MMU.
|
||||
@ -219,6 +219,23 @@ static inline int spte_index(u64 *sptep)
|
||||
*/
|
||||
extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
|
||||
|
||||
static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page)
|
||||
{
|
||||
struct page *page = pfn_to_page((shadow_page) >> PAGE_SHIFT);
|
||||
|
||||
return (struct kvm_mmu_page *)page_private(page);
|
||||
}
|
||||
|
||||
static inline struct kvm_mmu_page *spte_to_child_sp(u64 spte)
|
||||
{
|
||||
return to_shadow_page(spte & SPTE_BASE_ADDR_MASK);
|
||||
}
|
||||
|
||||
static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep)
|
||||
{
|
||||
return to_shadow_page(__pa(sptep));
|
||||
}
|
||||
|
||||
static inline bool is_mmio_spte(u64 spte)
|
||||
{
|
||||
return (spte & shadow_mmio_mask) == shadow_mmio_value &&
|
||||
|
@ -29,7 +29,6 @@ int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
|
||||
kvm->arch.tdp_mmu_enabled = true;
|
||||
INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
|
||||
spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
|
||||
INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
|
||||
kvm->arch.tdp_mmu_zap_wq = wq;
|
||||
return 1;
|
||||
}
|
||||
@ -54,7 +53,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
|
||||
/* Also waits for any queued work items. */
|
||||
destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
|
||||
|
||||
WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
|
||||
WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
|
||||
WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
|
||||
|
||||
/*
|
||||
@ -284,6 +283,8 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
|
||||
static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
|
||||
gfn_t gfn, union kvm_mmu_page_role role)
|
||||
{
|
||||
INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);
|
||||
|
||||
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
|
||||
|
||||
sp->role = role;
|
||||
@ -375,11 +376,13 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
|
||||
static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
|
||||
{
|
||||
kvm_account_pgtable_pages((void *)sp->spt, +1);
|
||||
atomic64_inc(&kvm->arch.tdp_mmu_pages);
|
||||
}
|
||||
|
||||
static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
|
||||
{
|
||||
kvm_account_pgtable_pages((void *)sp->spt, -1);
|
||||
atomic64_dec(&kvm->arch.tdp_mmu_pages);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -395,14 +398,17 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
|
||||
bool shared)
|
||||
{
|
||||
tdp_unaccount_mmu_page(kvm, sp);
|
||||
|
||||
if (!sp->nx_huge_page_disallowed)
|
||||
return;
|
||||
|
||||
if (shared)
|
||||
spin_lock(&kvm->arch.tdp_mmu_pages_lock);
|
||||
else
|
||||
lockdep_assert_held_write(&kvm->mmu_lock);
|
||||
|
||||
list_del(&sp->link);
|
||||
if (sp->lpage_disallowed)
|
||||
unaccount_huge_nx_page(kvm, sp);
|
||||
sp->nx_huge_page_disallowed = false;
|
||||
untrack_possible_nx_huge_page(kvm, sp);
|
||||
|
||||
if (shared)
|
||||
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
|
||||
@ -1116,16 +1122,13 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
|
||||
* @kvm: kvm instance
|
||||
* @iter: a tdp_iter instance currently on the SPTE that should be set
|
||||
* @sp: The new TDP page table to install.
|
||||
* @account_nx: True if this page table is being installed to split a
|
||||
* non-executable huge page.
|
||||
* @shared: This operation is running under the MMU lock in read mode.
|
||||
*
|
||||
* Returns: 0 if the new page table was installed. Non-0 if the page table
|
||||
* could not be installed (e.g. the atomic compare-exchange failed).
|
||||
*/
|
||||
static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
|
||||
struct kvm_mmu_page *sp, bool account_nx,
|
||||
bool shared)
|
||||
struct kvm_mmu_page *sp, bool shared)
|
||||
{
|
||||
u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled());
|
||||
int ret = 0;
|
||||
@ -1138,16 +1141,14 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
|
||||
tdp_mmu_set_spte(kvm, iter, spte);
|
||||
}
|
||||
|
||||
spin_lock(&kvm->arch.tdp_mmu_pages_lock);
|
||||
list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
|
||||
if (account_nx)
|
||||
account_huge_nx_page(kvm, sp);
|
||||
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
|
||||
tdp_account_mmu_page(kvm, sp);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
|
||||
struct kvm_mmu_page *sp, bool shared);
|
||||
|
||||
/*
|
||||
* Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
|
||||
* page tables and SPTEs to translate the faulting guest physical address.
|
||||
@ -1155,9 +1156,10 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
|
||||
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
|
||||
{
|
||||
struct kvm_mmu *mmu = vcpu->arch.mmu;
|
||||
struct kvm *kvm = vcpu->kvm;
|
||||
struct tdp_iter iter;
|
||||
struct kvm_mmu_page *sp;
|
||||
int ret;
|
||||
int ret = RET_PF_RETRY;
|
||||
|
||||
kvm_mmu_hugepage_adjust(vcpu, fault);
|
||||
|
||||
@ -1166,6 +1168,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
|
||||
rcu_read_lock();
|
||||
|
||||
tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
|
||||
int r;
|
||||
|
||||
if (fault->nx_huge_page_workaround_enabled)
|
||||
disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
|
||||
|
||||
@ -1173,57 +1177,52 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
|
||||
break;
|
||||
|
||||
/*
|
||||
* If there is an SPTE mapping a large page at a higher level
|
||||
* than the target, that SPTE must be cleared and replaced
|
||||
* with a non-leaf SPTE.
|
||||
* If SPTE has been frozen by another thread, just give up and
|
||||
* retry, avoiding unnecessary page table allocation and free.
|
||||
*/
|
||||
if (is_removed_spte(iter.old_spte))
|
||||
goto retry;
|
||||
|
||||
/* Step down into the lower level page table if it exists. */
|
||||
if (is_shadow_present_pte(iter.old_spte) &&
|
||||
is_large_pte(iter.old_spte)) {
|
||||
if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
|
||||
break;
|
||||
!is_large_pte(iter.old_spte))
|
||||
continue;
|
||||
|
||||
/*
|
||||
* The iter must explicitly re-read the spte here
|
||||
* because the new value informs the !present
|
||||
* path below.
|
||||
*/
|
||||
iter.old_spte = kvm_tdp_mmu_read_spte(iter.sptep);
|
||||
/*
|
||||
* The SPTE is either non-present or points to a huge page that
|
||||
* needs to be split.
|
||||
*/
|
||||
sp = tdp_mmu_alloc_sp(vcpu);
|
||||
tdp_mmu_init_child_sp(sp, &iter);
|
||||
|
||||
sp->nx_huge_page_disallowed = fault->huge_page_disallowed;
|
||||
|
||||
if (is_shadow_present_pte(iter.old_spte))
|
||||
r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
|
||||
else
|
||||
r = tdp_mmu_link_sp(kvm, &iter, sp, true);
|
||||
|
||||
/*
|
||||
* Also force the guest to retry the access if the upper level SPTEs
|
||||
* aren't in place.
|
||||
*/
|
||||
if (r) {
|
||||
tdp_mmu_free_sp(sp);
|
||||
goto retry;
|
||||
}
|
||||
|
||||
if (!is_shadow_present_pte(iter.old_spte)) {
|
||||
bool account_nx = fault->huge_page_disallowed &&
|
||||
fault->req_level >= iter.level;
|
||||
|
||||
/*
|
||||
* If SPTE has been frozen by another thread, just
|
||||
* give up and retry, avoiding unnecessary page table
|
||||
* allocation and free.
|
||||
*/
|
||||
if (is_removed_spte(iter.old_spte))
|
||||
break;
|
||||
|
||||
sp = tdp_mmu_alloc_sp(vcpu);
|
||||
tdp_mmu_init_child_sp(sp, &iter);
|
||||
|
||||
if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) {
|
||||
tdp_mmu_free_sp(sp);
|
||||
break;
|
||||
}
|
||||
if (fault->huge_page_disallowed &&
|
||||
fault->req_level >= iter.level) {
|
||||
spin_lock(&kvm->arch.tdp_mmu_pages_lock);
|
||||
track_possible_nx_huge_page(kvm, sp);
|
||||
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Force the guest to retry the access if the upper level SPTEs aren't
|
||||
* in place, or if the target leaf SPTE is frozen by another CPU.
|
||||
*/
|
||||
if (iter.level != fault->goal_level || is_removed_spte(iter.old_spte)) {
|
||||
rcu_read_unlock();
|
||||
return RET_PF_RETRY;
|
||||
}
|
||||
|
||||
ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
|
||||
rcu_read_unlock();
|
||||
|
||||
retry:
|
||||
rcu_read_unlock();
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -1472,6 +1471,7 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
|
||||
return sp;
|
||||
}
|
||||
|
||||
/* Note, the caller is responsible for initializing @sp. */
|
||||
static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
|
||||
struct kvm_mmu_page *sp, bool shared)
|
||||
{
|
||||
@ -1479,8 +1479,6 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
|
||||
const int level = iter->level;
|
||||
int ret, i;
|
||||
|
||||
tdp_mmu_init_child_sp(sp, iter);
|
||||
|
||||
/*
|
||||
* No need for atomics when writing to sp->spt since the page table has
|
||||
* not been linked in yet and thus is not reachable from any other CPU.
|
||||
@ -1496,7 +1494,7 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
|
||||
* correctness standpoint since the translation will be the same either
|
||||
* way.
|
||||
*/
|
||||
ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared);
|
||||
ret = tdp_mmu_link_sp(kvm, iter, sp, shared);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
@ -1556,6 +1554,8 @@ static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
|
||||
continue;
|
||||
}
|
||||
|
||||
tdp_mmu_init_child_sp(sp, &iter);
|
||||
|
||||
if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
|
||||
goto retry;
|
||||
|
||||
|
@ -5,6 +5,8 @@
|
||||
|
||||
#include <linux/kvm_host.h>
|
||||
|
||||
#include "spte.h"
|
||||
|
||||
hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
|
||||
|
||||
__must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root)
|
||||
|
@ -101,10 +101,6 @@ static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
|
||||
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
|
||||
bool skip_pmi = false;
|
||||
|
||||
/* Ignore counters that have been reprogrammed already. */
|
||||
if (test_and_set_bit(pmc->idx, pmu->reprogram_pmi))
|
||||
return;
|
||||
|
||||
if (pmc->perf_event && pmc->perf_event->attr.precise_ip) {
|
||||
if (!in_pmi) {
|
||||
/*
|
||||
@ -122,7 +118,6 @@ static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
|
||||
} else {
|
||||
__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
|
||||
}
|
||||
kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
|
||||
|
||||
if (!pmc->intr || skip_pmi)
|
||||
return;
|
||||
@ -147,12 +142,22 @@ static void kvm_perf_overflow(struct perf_event *perf_event,
|
||||
{
|
||||
struct kvm_pmc *pmc = perf_event->overflow_handler_context;
|
||||
|
||||
/*
|
||||
* Ignore overflow events for counters that are scheduled to be
|
||||
* reprogrammed, e.g. if a PMI for the previous event races with KVM's
|
||||
* handling of a related guest WRMSR.
|
||||
*/
|
||||
if (test_and_set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi))
|
||||
return;
|
||||
|
||||
__kvm_perf_overflow(pmc, true);
|
||||
|
||||
kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
|
||||
}
|
||||
|
||||
static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
|
||||
u64 config, bool exclude_user,
|
||||
bool exclude_kernel, bool intr)
|
||||
static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config,
|
||||
bool exclude_user, bool exclude_kernel,
|
||||
bool intr)
|
||||
{
|
||||
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
|
||||
struct perf_event *event;
|
||||
@ -204,14 +209,14 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
|
||||
if (IS_ERR(event)) {
|
||||
pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n",
|
||||
PTR_ERR(event), pmc->idx);
|
||||
return;
|
||||
return PTR_ERR(event);
|
||||
}
|
||||
|
||||
pmc->perf_event = event;
|
||||
pmc_to_pmu(pmc)->event_count++;
|
||||
clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi);
|
||||
pmc->is_paused = false;
|
||||
pmc->intr = intr || pebs;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void pmc_pause_counter(struct kvm_pmc *pmc)
|
||||
@ -245,7 +250,6 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc)
|
||||
perf_event_enable(pmc->perf_event);
|
||||
pmc->is_paused = false;
|
||||
|
||||
clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -293,7 +297,7 @@ static bool check_pmu_event_filter(struct kvm_pmc *pmc)
|
||||
return allow_event;
|
||||
}
|
||||
|
||||
void reprogram_counter(struct kvm_pmc *pmc)
|
||||
static void reprogram_counter(struct kvm_pmc *pmc)
|
||||
{
|
||||
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
|
||||
u64 eventsel = pmc->eventsel;
|
||||
@ -303,10 +307,13 @@ void reprogram_counter(struct kvm_pmc *pmc)
|
||||
pmc_pause_counter(pmc);
|
||||
|
||||
if (!pmc_speculative_in_use(pmc) || !pmc_is_enabled(pmc))
|
||||
return;
|
||||
goto reprogram_complete;
|
||||
|
||||
if (!check_pmu_event_filter(pmc))
|
||||
return;
|
||||
goto reprogram_complete;
|
||||
|
||||
if (pmc->counter < pmc->prev_counter)
|
||||
__kvm_perf_overflow(pmc, false);
|
||||
|
||||
if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
|
||||
printk_once("kvm pmu: pin control bit is ignored\n");
|
||||
@ -324,18 +331,29 @@ void reprogram_counter(struct kvm_pmc *pmc)
|
||||
}
|
||||
|
||||
if (pmc->current_config == new_config && pmc_resume_counter(pmc))
|
||||
return;
|
||||
goto reprogram_complete;
|
||||
|
||||
pmc_release_perf_event(pmc);
|
||||
|
||||
pmc->current_config = new_config;
|
||||
pmc_reprogram_counter(pmc, PERF_TYPE_RAW,
|
||||
(eventsel & pmu->raw_event_mask),
|
||||
!(eventsel & ARCH_PERFMON_EVENTSEL_USR),
|
||||
!(eventsel & ARCH_PERFMON_EVENTSEL_OS),
|
||||
eventsel & ARCH_PERFMON_EVENTSEL_INT);
|
||||
|
||||
/*
|
||||
* If reprogramming fails, e.g. due to contention, leave the counter's
|
||||
* regprogram bit set, i.e. opportunistically try again on the next PMU
|
||||
* refresh. Don't make a new request as doing so can stall the guest
|
||||
* if reprogramming repeatedly fails.
|
||||
*/
|
||||
if (pmc_reprogram_counter(pmc, PERF_TYPE_RAW,
|
||||
(eventsel & pmu->raw_event_mask),
|
||||
!(eventsel & ARCH_PERFMON_EVENTSEL_USR),
|
||||
!(eventsel & ARCH_PERFMON_EVENTSEL_OS),
|
||||
eventsel & ARCH_PERFMON_EVENTSEL_INT))
|
||||
return;
|
||||
|
||||
reprogram_complete:
|
||||
clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
|
||||
pmc->prev_counter = 0;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(reprogram_counter);
|
||||
|
||||
void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
@ -345,10 +363,11 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
|
||||
for_each_set_bit(bit, pmu->reprogram_pmi, X86_PMC_IDX_MAX) {
|
||||
struct kvm_pmc *pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, bit);
|
||||
|
||||
if (unlikely(!pmc || !pmc->perf_event)) {
|
||||
if (unlikely(!pmc)) {
|
||||
clear_bit(bit, pmu->reprogram_pmi);
|
||||
continue;
|
||||
}
|
||||
|
||||
reprogram_counter(pmc);
|
||||
}
|
||||
|
||||
@ -522,14 +541,9 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
|
||||
|
||||
static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
|
||||
{
|
||||
u64 prev_count;
|
||||
|
||||
prev_count = pmc->counter;
|
||||
pmc->prev_counter = pmc->counter;
|
||||
pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc);
|
||||
|
||||
reprogram_counter(pmc);
|
||||
if (pmc->counter < prev_count)
|
||||
__kvm_perf_overflow(pmc, false);
|
||||
kvm_pmu_request_counter_reprogam(pmc);
|
||||
}
|
||||
|
||||
static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc,
|
||||
@ -542,12 +556,15 @@ static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc,
|
||||
static inline bool cpl_is_matched(struct kvm_pmc *pmc)
|
||||
{
|
||||
bool select_os, select_user;
|
||||
u64 config = pmc->current_config;
|
||||
u64 config;
|
||||
|
||||
if (pmc_is_gp(pmc)) {
|
||||
config = pmc->eventsel;
|
||||
select_os = config & ARCH_PERFMON_EVENTSEL_OS;
|
||||
select_user = config & ARCH_PERFMON_EVENTSEL_USR;
|
||||
} else {
|
||||
config = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl,
|
||||
pmc->idx - INTEL_PMC_IDX_FIXED);
|
||||
select_os = config & 0x1;
|
||||
select_user = config & 0x2;
|
||||
}
|
||||
@ -577,6 +594,8 @@ EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event);
|
||||
int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
|
||||
{
|
||||
struct kvm_pmu_event_filter tmp, *filter;
|
||||
struct kvm_vcpu *vcpu;
|
||||
unsigned long i;
|
||||
size_t size;
|
||||
int r;
|
||||
|
||||
@ -613,9 +632,18 @@ int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
|
||||
mutex_lock(&kvm->lock);
|
||||
filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,
|
||||
mutex_is_locked(&kvm->lock));
|
||||
synchronize_srcu_expedited(&kvm->srcu);
|
||||
|
||||
BUILD_BUG_ON(sizeof(((struct kvm_pmu *)0)->reprogram_pmi) >
|
||||
sizeof(((struct kvm_pmu *)0)->__reprogram_pmi));
|
||||
|
||||
kvm_for_each_vcpu(i, vcpu, kvm)
|
||||
atomic64_set(&vcpu_to_pmu(vcpu)->__reprogram_pmi, -1ull);
|
||||
|
||||
kvm_make_all_cpus_request(kvm, KVM_REQ_PMU);
|
||||
|
||||
mutex_unlock(&kvm->lock);
|
||||
|
||||
synchronize_srcu_expedited(&kvm->srcu);
|
||||
r = 0;
|
||||
cleanup:
|
||||
kfree(filter);
|
||||
|
@ -183,7 +183,11 @@ static inline void kvm_init_pmu_capability(void)
|
||||
KVM_PMC_MAX_FIXED);
|
||||
}
|
||||
|
||||
void reprogram_counter(struct kvm_pmc *pmc);
|
||||
static inline void kvm_pmu_request_counter_reprogam(struct kvm_pmc *pmc)
|
||||
{
|
||||
set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi);
|
||||
kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
|
||||
}
|
||||
|
||||
void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu);
|
||||
void kvm_pmu_handle_event(struct kvm_vcpu *vcpu);
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user