
Merge tag 'x86_urgent_for_v6.8_rc6' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 fixes from Borislav Petkov:

 - Make sure clearing CPU buffers using VERW happens at the latest
   possible point in the return-to-userspace path, otherwise memory
   accesses after the VERW execution could cause data to land in CPU
   buffers again
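
A rough C sketch of the ordering being enforced; the function names are
illustrative stand-ins for the entry code, not kernel symbols:

    /* Hedged sketch: why VERW has to be the last step before the return. */
    static void clear_cpu_buffers(void)      { /* VERW would go here       */ }
    static void restore_user_registers(void) { /* register pops, mem loads */ }
    static void return_to_user(void)         { /* SYSRET / IRET            */ }

    static void exit_to_user_before_fix(void)
    {
            clear_cpu_buffers();       /* VERW issued too early...          */
            restore_user_registers();  /* ...these loads can repopulate the */
            return_to_user();          /* CPU buffers with kernel data      */
    }

    static void exit_to_user_after_fix(void)
    {
            restore_user_registers();
            clear_cpu_buffers();       /* VERW at the latest possible point */
            return_to_user();
    }

    int main(void)
    {
            exit_to_user_before_fix(); /* calls only keep the compiler quiet */
            exit_to_user_after_fix();
            return 0;
    }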

* tag 'x86_urgent_for_v6.8_rc6' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  KVM/VMX: Move VERW closer to VMentry for MDS mitigation
  KVM/VMX: Use BT+JNC, i.e. EFLAGS.CF to select VMRESUME vs. VMLAUNCH
  x86/bugs: Use ALTERNATIVE() instead of mds_user_clear static key
  x86/entry_32: Add VERW just before userspace transition
  x86/entry_64: Add VERW just before userspace transition
  x86/bugs: Add asm helpers for executing VERW
Linus Torvalds 2024-02-25 10:22:21 -08:00
commit 1eee4ef38c
13 changed files with 110 additions and 44 deletions


@@ -95,6 +95,9 @@ The kernel provides a function to invoke the buffer clearing:
mds_clear_cpu_buffers()
Also macro CLEAR_CPU_BUFFERS can be used in ASM late in exit-to-user path.
Other than EFLAGS.ZF, this macro doesn't clobber any registers.
The mitigation is invoked on kernel/userspace, hypervisor/guest and C-state
(idle) transitions.
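
For illustration, a minimal user-space sketch of the memory-operand VERW form
described above (not kernel code; it assumes GCC/Clang extended asm and simply
uses the current %ds selector as the operand):

    #include <stdio.h>

    /* Hedged demo: only the memory-operand form of VERW is overloaded by the
     * microcode to clear CPU buffers; the selector used here is illustrative. */
    static void demo_clear_cpu_buffers(void)
    {
            unsigned short sel;

            __asm__ volatile("mov %%ds, %0" : "=r" (sel));
            __asm__ volatile("verw %0" : : "m" (sel) : "cc");
    }

    int main(void)
    {
            demo_clear_cpu_buffers();
            puts("executed verw with a memory operand");
            return 0;
    }
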
@@ -138,17 +141,30 @@ Mitigation points
When transitioning from kernel to user space the CPU buffers are flushed
on affected CPUs when the mitigation is not disabled on the kernel
command line. The migitation is enabled through the static key
mds_user_clear.
command line. The mitigation is enabled through the feature flag
X86_FEATURE_CLEAR_CPU_BUF.
The mitigation is invoked in prepare_exit_to_usermode() which covers
all but one of the kernel to user space transitions. The exception
is when we return from a Non Maskable Interrupt (NMI), which is
handled directly in do_nmi().
The mitigation is invoked just before transitioning to userspace after
user registers are restored. This is done to minimize the window in
which kernel data could be accessed after VERW e.g. via an NMI after
VERW.
(The reason that NMI is special is that prepare_exit_to_usermode() can
enable IRQs. In NMI context, NMIs are blocked, and we don't want to
enable IRQs with NMIs blocked.)
**Corner case not handled**
Interrupts returning to kernel don't clear CPU buffers since the
exit-to-user path is expected to do that anyway. But there could be
a case when an NMI is generated in kernel after the exit-to-user path
has cleared the buffers. This case is not handled and an NMI returning to
kernel doesn't clear CPU buffers because:
1. It is rare to get an NMI after VERW, but before returning to userspace.
2. For an unprivileged user, there is no known way to make that NMI
less rare or target it.
3. It would take a large number of these precisely-timed NMIs to mount
an actual attack. There's presumably not enough bandwidth.
4. The NMI in question occurs after a VERW, i.e. when user state is
restored and most interesting data is already scrubbed. What's left
is only the data that NMI touches, and that may or may not be of
any interest.
2. C-State transition


@@ -6,6 +6,9 @@
#include <linux/export.h>
#include <linux/linkage.h>
#include <asm/msr-index.h>
#include <asm/unwind_hints.h>
#include <asm/segment.h>
#include <asm/cache.h>
.pushsection .noinstr.text, "ax"
@@ -20,3 +23,23 @@ SYM_FUNC_END(entry_ibpb)
EXPORT_SYMBOL_GPL(entry_ibpb);
.popsection
/*
* Define the VERW operand that is disguised as entry code so that
* it can be referenced with KPTI enabled. This ensures VERW can be
* used late in exit-to-user path after page tables are switched.
*/
.pushsection .entry.text, "ax"
.align L1_CACHE_BYTES, 0xcc
SYM_CODE_START_NOALIGN(mds_verw_sel)
UNWIND_HINT_UNDEFINED
ANNOTATE_NOENDBR
.word __KERNEL_DS
.align L1_CACHE_BYTES, 0xcc
SYM_CODE_END(mds_verw_sel);
/* For KVM */
EXPORT_SYMBOL_GPL(mds_verw_sel);
.popsection


@@ -885,6 +885,7 @@ SYM_FUNC_START(entry_SYSENTER_32)
BUG_IF_WRONG_CR3 no_user_check=1
popfl
popl %eax
CLEAR_CPU_BUFFERS
/*
* Return back to the vDSO, which will pop ecx and edx.
@@ -954,6 +955,7 @@ restore_all_switch_stack:
/* Restore user state */
RESTORE_REGS pop=4 # skip orig_eax/error_code
CLEAR_CPU_BUFFERS
.Lirq_return:
/*
* ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization
@@ -1146,6 +1148,7 @@ SYM_CODE_START(asm_exc_nmi)
/* Not on SYSENTER stack. */
call exc_nmi
CLEAR_CPU_BUFFERS
jmp .Lnmi_return
.Lnmi_from_sysenter_stack:


@@ -161,6 +161,7 @@ syscall_return_via_sysret:
SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
swapgs
CLEAR_CPU_BUFFERS
sysretq
SYM_INNER_LABEL(entry_SYSRETQ_end, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
@@ -573,6 +574,7 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
.Lswapgs_and_iret:
swapgs
CLEAR_CPU_BUFFERS
/* Assert that the IRET frame indicates user mode. */
testb $3, 8(%rsp)
jnz .Lnative_iret
@@ -723,6 +725,8 @@ native_irq_return_ldt:
*/
popq %rax /* Restore user RAX */
CLEAR_CPU_BUFFERS
/*
* RSP now points to an ordinary IRET frame, except that the page
* is read-only and RSP[31:16] are preloaded with the userspace
@@ -1449,6 +1453,12 @@ nmi_restore:
std
movq $0, 5*8(%rsp) /* clear "NMI executing" */
/*
* Skip CLEAR_CPU_BUFFERS here, since it only helps in rare cases like
* NMI in kernel after user state is restored. For an unprivileged user
* these conditions are hard to meet.
*/
/*
* iretq reads the "iret" frame and exits the NMI stack in a
* single instruction. We are returning to kernel mode, so this
@@ -1466,6 +1476,7 @@ SYM_CODE_START(entry_SYSCALL32_ignore)
UNWIND_HINT_END_OF_STACK
ENDBR
mov $-ENOSYS, %eax
CLEAR_CPU_BUFFERS
sysretl
SYM_CODE_END(entry_SYSCALL32_ignore)


@@ -270,6 +270,7 @@ SYM_INNER_LABEL(entry_SYSRETL_compat_unsafe_stack, SYM_L_GLOBAL)
xorl %r9d, %r9d
xorl %r10d, %r10d
swapgs
CLEAR_CPU_BUFFERS
sysretl
SYM_INNER_LABEL(entry_SYSRETL_compat_end, SYM_L_GLOBAL)
ANNOTATE_NOENDBR


@@ -95,7 +95,7 @@
#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */
#define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */
#define X86_FEATURE_AMD_LBR_V2 ( 3*32+17) /* AMD Last Branch Record Extension Version 2 */
/* FREE, was #define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) "" LFENCE synchronizes RDTSC */
#define X86_FEATURE_CLEAR_CPU_BUF ( 3*32+18) /* "" Clear CPU buffers using VERW */
#define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */
#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */
#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */
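
The ( 3*32+18) arithmetic above is just the cpufeature numbering scheme: word 3
of the feature bitmap, bit 18. A tiny sketch of that encoding (the macro below
is illustrative, not the kernel's):

    #include <stdio.h>

    /* Illustrative only: mirrors the (word*32 + bit) numbering used above. */
    #define FEATURE_NR(word, bit)   ((word) * 32 + (bit))

    int main(void)
    {
            /* X86_FEATURE_CLEAR_CPU_BUF reuses the slot freed by the old
             * LFENCE_RDTSC define: word 3, bit 18 -> feature number 114. */
            printf("X86_FEATURE_CLEAR_CPU_BUF = %d\n", FEATURE_NR(3, 18));
            return 0;
    }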


@@ -91,7 +91,6 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
static __always_inline void arch_exit_to_user_mode(void)
{
mds_user_clear_cpu_buffers();
amd_clear_divider();
}
#define arch_exit_to_user_mode arch_exit_to_user_mode


@@ -315,6 +315,17 @@
#endif
.endm
/*
* Macro to execute VERW instruction that mitigates transient data sampling
* attacks such as MDS. On affected systems a microcode update overloaded VERW
* instruction to also clear the CPU buffers. VERW clobbers EFLAGS.ZF.
*
* Note: Only the memory operand variant of VERW clears the CPU buffers.
*/
.macro CLEAR_CPU_BUFFERS
ALTERNATIVE "", __stringify(verw _ASM_RIP(mds_verw_sel)), X86_FEATURE_CLEAR_CPU_BUF
.endm
#else /* __ASSEMBLY__ */
#define ANNOTATE_RETPOLINE_SAFE \
@@ -529,13 +540,14 @@ DECLARE_STATIC_KEY_FALSE(switch_to_cond_stibp);
DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
DECLARE_STATIC_KEY_FALSE(mds_user_clear);
DECLARE_STATIC_KEY_FALSE(mds_idle_clear);
DECLARE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush);
DECLARE_STATIC_KEY_FALSE(mmio_stale_data_clear);
extern u16 mds_verw_sel;
#include <asm/segment.h>
/**
@@ -561,17 +573,6 @@ static __always_inline void mds_clear_cpu_buffers(void)
asm volatile("verw %[ds]" : : [ds] "m" (ds) : "cc");
}
/**
* mds_user_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability
*
* Clear CPU buffers if the corresponding static key is enabled
*/
static __always_inline void mds_user_clear_cpu_buffers(void)
{
if (static_branch_likely(&mds_user_clear))
mds_clear_cpu_buffers();
}
/**
* mds_idle_clear_cpu_buffers - Mitigation for MDS vulnerability
*


@@ -111,9 +111,6 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
/* Control unconditional IBPB in switch_mm() */
DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
/* Control MDS CPU buffer clear before returning to user space */
DEFINE_STATIC_KEY_FALSE(mds_user_clear);
EXPORT_SYMBOL_GPL(mds_user_clear);
/* Control MDS CPU buffer clear before idling (halt, mwait) */
DEFINE_STATIC_KEY_FALSE(mds_idle_clear);
EXPORT_SYMBOL_GPL(mds_idle_clear);
@@ -252,7 +249,7 @@ static void __init mds_select_mitigation(void)
if (!boot_cpu_has(X86_FEATURE_MD_CLEAR))
mds_mitigation = MDS_MITIGATION_VMWERV;
static_branch_enable(&mds_user_clear);
setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) &&
(mds_nosmt || cpu_mitigations_auto_nosmt()))
@@ -356,7 +353,7 @@ static void __init taa_select_mitigation(void)
* For guests that can't determine whether the correct microcode is
* present on host, enable the mitigation for UCODE_NEEDED as well.
*/
static_branch_enable(&mds_user_clear);
setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
if (taa_nosmt || cpu_mitigations_auto_nosmt())
cpu_smt_disable(false);
@@ -424,7 +421,7 @@ static void __init mmio_select_mitigation(void)
*/
if (boot_cpu_has_bug(X86_BUG_MDS) || (boot_cpu_has_bug(X86_BUG_TAA) &&
boot_cpu_has(X86_FEATURE_RTM)))
static_branch_enable(&mds_user_clear);
setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
else
static_branch_enable(&mmio_stale_data_clear);
@@ -484,12 +481,12 @@ static void __init md_clear_update_mitigation(void)
if (cpu_mitigations_off())
return;
if (!static_key_enabled(&mds_user_clear))
if (!boot_cpu_has(X86_FEATURE_CLEAR_CPU_BUF))
goto out;
/*
* mds_user_clear is now enabled. Update MDS, TAA and MMIO Stale Data
* mitigation, if necessary.
* X86_FEATURE_CLEAR_CPU_BUF is now enabled. Update MDS, TAA and MMIO
* Stale Data mitigation, if necessary.
*/
if (mds_mitigation == MDS_MITIGATION_OFF &&
boot_cpu_has_bug(X86_BUG_MDS)) {


@@ -563,9 +563,6 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
}
if (this_cpu_dec_return(nmi_state))
goto nmi_restart;
if (user_mode(regs))
mds_user_clear_cpu_buffers();
}
#if IS_ENABLED(CONFIG_KVM_INTEL)


@@ -2,7 +2,10 @@
#ifndef __KVM_X86_VMX_RUN_FLAGS_H
#define __KVM_X86_VMX_RUN_FLAGS_H
#define VMX_RUN_VMRESUME (1 << 0)
#define VMX_RUN_SAVE_SPEC_CTRL (1 << 1)
#define VMX_RUN_VMRESUME_SHIFT 0
#define VMX_RUN_SAVE_SPEC_CTRL_SHIFT 1
#define VMX_RUN_VMRESUME BIT(VMX_RUN_VMRESUME_SHIFT)
#define VMX_RUN_SAVE_SPEC_CTRL BIT(VMX_RUN_SAVE_SPEC_CTRL_SHIFT)
#endif /* __KVM_X86_VMX_RUN_FLAGS_H */
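
A hedged sketch of the encoding above: each flag now carries a *_SHIFT so the
asm side can hand the bit position straight to BT, while C code keeps using the
mask. BIT() below is a simplified stand-in for the kernel macro:

    #include <stdio.h>

    #define VMX_RUN_VMRESUME_SHIFT        0
    #define VMX_RUN_SAVE_SPEC_CTRL_SHIFT  1
    #define BIT(n)                        (1u << (n))
    #define VMX_RUN_VMRESUME              BIT(VMX_RUN_VMRESUME_SHIFT)
    #define VMX_RUN_SAVE_SPEC_CTRL        BIT(VMX_RUN_SAVE_SPEC_CTRL_SHIFT)

    int main(void)
    {
            unsigned int flags = VMX_RUN_VMRESUME;

            /* "bt $VMX_RUN_VMRESUME_SHIFT, %ebx" asks the same question: */
            printf("vmresume? %u\n", (flags >> VMX_RUN_VMRESUME_SHIFT) & 1u);
            return 0;
    }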


@@ -139,7 +139,7 @@ SYM_FUNC_START(__vmx_vcpu_run)
mov (%_ASM_SP), %_ASM_AX
/* Check if vmlaunch or vmresume is needed */
test $VMX_RUN_VMRESUME, %ebx
bt $VMX_RUN_VMRESUME_SHIFT, %ebx
/* Load guest registers. Don't clobber flags. */
mov VCPU_RCX(%_ASM_AX), %_ASM_CX
@@ -161,8 +161,11 @@ SYM_FUNC_START(__vmx_vcpu_run)
/* Load guest RAX. This kills the @regs pointer! */
mov VCPU_RAX(%_ASM_AX), %_ASM_AX
/* Check EFLAGS.ZF from 'test VMX_RUN_VMRESUME' above */
jz .Lvmlaunch
/* Clobbers EFLAGS.ZF */
CLEAR_CPU_BUFFERS
/* Check EFLAGS.CF from the VMX_RUN_VMRESUME bit test above. */
jnc .Lvmlaunch
/*
* After a successful VMRESUME/VMLAUNCH, control flow "magically"
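
A hedged user-space demo of why the check moved from TEST/JZ to BT/JNC: BT
records its result in EFLAGS.CF, which VERW leaves intact even though VERW
clobbers EFLAGS.ZF. The flag layout and selector below are illustrative:

    #include <stdio.h>

    int main(void)
    {
            unsigned int flags = 1;     /* bit 0 set: "do VMRESUME"         */
            unsigned short sel;
            int vmresume = 0;

            __asm__ volatile("mov %%ds, %0" : "=r" (sel));
            __asm__ volatile("bt   $0, %[f]\n\t"  /* CF = bit 0 of flags    */
                             "verw %[s]\n\t"      /* clobbers ZF, not CF    */
                             "setc %b[r]"         /* CF still holds the bit */
                             : [r] "+r" (vmresume)
                             : [f] "r" (flags), [s] "m" (sel)
                             : "cc");

            printf("vmresume = %d\n", vmresume);
            return 0;
    }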


@@ -388,7 +388,16 @@ static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx)
static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
{
vmx->disable_fb_clear = (host_arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) &&
/*
* Disable VERW's behavior of clearing CPU buffers for the guest if the
* CPU isn't affected by MDS/TAA, and the host hasn't forcefully enabled
* the mitigation. Disabling the clearing behavior provides a
* performance boost for guests that aren't aware that manually clearing
* CPU buffers is unnecessary, at the cost of MSR accesses on VM-Entry
* and VM-Exit.
*/
vmx->disable_fb_clear = !cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF) &&
(host_arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) &&
!boot_cpu_has_bug(X86_BUG_MDS) &&
!boot_cpu_has_bug(X86_BUG_TAA);
@@ -7224,11 +7233,14 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
guest_state_enter_irqoff();
/* L1D Flush includes CPU buffer clear to mitigate MDS */
/*
* L1D Flush includes CPU buffer clear to mitigate MDS, but VERW
* mitigation for MDS is done late in VMentry and is still
* executed in spite of L1D Flush. This is because an extra VERW
* should not matter much after the big hammer L1D Flush.
*/
if (static_branch_unlikely(&vmx_l1d_should_flush))
vmx_l1d_flush(vcpu);
else if (static_branch_unlikely(&mds_user_clear))
mds_clear_cpu_buffers();
else if (static_branch_unlikely(&mmio_stale_data_clear) &&
kvm_arch_has_assigned_device(vcpu->kvm))
mds_clear_cpu_buffers();
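
A hedged sketch of the resulting pre-VMentry decision: the L1D flush or the
MMIO-only clear still happen in C, while the MDS VERW itself now runs late in
the asm VMentry path. Plain ints stand in for the static branches; this is not
KVM code:

    /* Illustrative stand-in for the branch structure above. */
    enum pre_entry_action { FLUSH_L1D, CLEAR_MMIO_ONLY, NOTHING_HERE };

    static enum pre_entry_action pre_entry_mitigation(int l1d_should_flush,
                                                      int mmio_clear,
                                                      int has_assigned_device)
    {
            if (l1d_should_flush)
                    return FLUSH_L1D;        /* includes a CPU buffer clear  */
            if (mmio_clear && has_assigned_device)
                    return CLEAR_MMIO_ONLY;  /* mds_clear_cpu_buffers() here */
            return NOTHING_HERE;             /* MDS VERW runs in asm VMentry */
    }

    int main(void)
    {
            return pre_entry_mitigation(0, 0, 0) == NOTHING_HERE ? 0 : 1;
    }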