linux-stable/arch/x86/hyperv/hv_vtl.c
Linus Torvalds eb55307e67 X86 core code updates:
- Limit the hardcoded topology quirk for Hygon CPUs to those which have a
     model ID less than 4. The newer models have the topology CPUID leaf 0xB
     correctly implemented and are not affected.
 
   - Make SMT control more robust against enumeration failures
 
     SMT control was added to allow controlling SMT at boottime or
     runtime. The primary purpose was to provide a simple mechanism to
     disable SMT in the light of speculation attack vectors.
 
     It turned out that the code is sensible to enumeration failures and
     worked only by chance for XEN/PV. XEN/PV has no real APIC enumeration
     which means the primary thread mask is not set up correctly. By chance
     a XEN/PV boot ends up with smp_num_siblings == 2, which makes the
     hotplug control stay at its default value "enabled". So the mask is
     never evaluated.
 
     The ongoing rework of the topology evaluation caused XEN/PV to end up
     with smp_num_siblings == 1, which sets the SMT control to "not
     supported" and the empty primary thread mask causes the hotplug core to
     deny the bringup of the APS.
 
     Make the decision logic more robust and take 'not supported' and 'not
     implemented' into account for the decision whether a CPU should be
     booted or not.
 
   - Fake primary thread mask for XEN/PV
 
     Pretend that all XEN/PV vCPUs are primary threads, which makes the
     usage of the primary thread mask valid on XEN/PV. That is consistent
     with because all of the topology information on XEN/PV is fake or even
     non-existent.
 
   - Encapsulate topology information in cpuinfo_x86
 
     Move the randomly scattered topology data into a separate data
     structure for readability and as a preparatory step for the topology
     evaluation overhaul.
 
   - Consolidate APIC ID data type to u32
 
     It's fixed width hardware data and not randomly u16, int, unsigned long
     or whatever developers decided to use.
 
   - Cure the abuse of cpuinfo for persisting logical IDs.
 
     Per CPU cpuinfo is used to persist the logical package and die
     IDs. That's really not the right place simply because cpuinfo is
     subject to be reinitialized when a CPU goes through an offline/online
     cycle.
 
     Use separate per CPU data for the persisting to enable the further
     topology management rework. It will be removed once the new topology
     management is in place.
 
   - Provide a debug interface for inspecting topology information
 
     Useful in general and extremly helpful for validating the topology
     management rework in terms of correctness or "bug" compatibility.
 -----BEGIN PGP SIGNATURE-----
 
 iQJHBAABCgAxFiEEQp8+kY+LLUocC4bMphj1TA10mKEFAmU+yX0THHRnbHhAbGlu
 dXRyb25peC5kZQAKCRCmGPVMDXSYoROUD/4vlvKEcpm9rbI5DzLcaq4DFHKbyEZF
 cQtzuOSM/9vTc9DHnuoNNLl9TWSYxiVYnejf3E21evfsqspYlzbTH8bId9XBCUid
 6B68AJW842M2erNuwj0b0HwF1z++zpDmBDyhGOty/KQhoM8pYOHMvntAmbzJbuso
 Dgx6BLVFcboTy6RwlfRa0EE8f9W5V+JbmG/VBDpdyCInal7VrudoVFZmWQnPIft7
 zwOJpAoehkp8OKq7geKDf79yWxu9a1sNPd62HtaVEvfHwehHqE6OaMLss1us+0vT
 SJ/D6gmRQBOwcXaZL0wL1dG7Km9Et4AisOvzhXGvTa5b2D5oljVoqJ7V7FTf5g3u
 y3aqWbeUJzERUbeJt1HoGVAKyA4GtZOvg+TNIysf6F1Z4khl9alfa9jiqjj4g1au
 zgItq/ZMBEBmJ7X4FxQUEUVBG2CDsEidyNBDRcimWQUDfBakV/iCs0suD8uu8ZOD
 K5jMx8Hi2+xFx7r1YqsfsyMBYOf/zUZw65RbNe+kI992JbJ9nhcODbnbo5MlAsyv
 vcqlK5FwXgZ4YAC8dZHU/tyTiqAW7oaOSkqKwTP5gcyNEqsjQHV//q6v+uqtjfYn
 1C4oUsRHT2vJiV9ktNJTA4GQHIYF4geGgpG8Ih2SjXsSzdGtUd3DtX1iq0YiLEOk
 eHhYsnniqsYB5g==
 =xrz8
 -----END PGP SIGNATURE-----

Merge tag 'x86-core-2023-10-29-v2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 core updates from Thomas Gleixner:

 - Limit the hardcoded topology quirk for Hygon CPUs to those which have
   a model ID less than 4.

   The newer models have the topology CPUID leaf 0xB correctly
   implemented and are not affected.

 - Make SMT control more robust against enumeration failures

   SMT control was added to allow controlling SMT at boottime or
   runtime. The primary purpose was to provide a simple mechanism to
   disable SMT in the light of speculation attack vectors.

   It turned out that the code is sensible to enumeration failures and
   worked only by chance for XEN/PV. XEN/PV has no real APIC enumeration
   which means the primary thread mask is not set up correctly. By
   chance a XEN/PV boot ends up with smp_num_siblings == 2, which makes
   the hotplug control stay at its default value "enabled". So the mask
   is never evaluated.

   The ongoing rework of the topology evaluation caused XEN/PV to end up
   with smp_num_siblings == 1, which sets the SMT control to "not
   supported" and the empty primary thread mask causes the hotplug core
   to deny the bringup of the APS.

   Make the decision logic more robust and take 'not supported' and 'not
   implemented' into account for the decision whether a CPU should be
   booted or not.

 - Fake primary thread mask for XEN/PV

   Pretend that all XEN/PV vCPUs are primary threads, which makes the
   usage of the primary thread mask valid on XEN/PV. That is consistent
   with because all of the topology information on XEN/PV is fake or
   even non-existent.

 - Encapsulate topology information in cpuinfo_x86

   Move the randomly scattered topology data into a separate data
   structure for readability and as a preparatory step for the topology
   evaluation overhaul.

 - Consolidate APIC ID data type to u32

   It's fixed width hardware data and not randomly u16, int, unsigned
   long or whatever developers decided to use.

 - Cure the abuse of cpuinfo for persisting logical IDs.

   Per CPU cpuinfo is used to persist the logical package and die IDs.
   That's really not the right place simply because cpuinfo is subject
   to be reinitialized when a CPU goes through an offline/online cycle.

   Use separate per CPU data for the persisting to enable the further
   topology management rework. It will be removed once the new topology
   management is in place.

 - Provide a debug interface for inspecting topology information

   Useful in general and extremly helpful for validating the topology
   management rework in terms of correctness or "bug" compatibility.

* tag 'x86-core-2023-10-29-v2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (23 commits)
  x86/apic, x86/hyperv: Use u32 in hv_snp_boot_ap() too
  x86/cpu: Provide debug interface
  x86/cpu/topology: Cure the abuse of cpuinfo for persisting logical ids
  x86/apic: Use u32 for wakeup_secondary_cpu[_64]()
  x86/apic: Use u32 for [gs]et_apic_id()
  x86/apic: Use u32 for phys_pkg_id()
  x86/apic: Use u32 for cpu_present_to_apicid()
  x86/apic: Use u32 for check_apicid_used()
  x86/apic: Use u32 for APIC IDs in global data
  x86/apic: Use BAD_APICID consistently
  x86/cpu: Move cpu_l[l2]c_id into topology info
  x86/cpu: Move logical package and die IDs into topology info
  x86/cpu: Remove pointless evaluation of x86_coreid_bits
  x86/cpu: Move cu_id into topology info
  x86/cpu: Move cpu_core_id into topology info
  hwmon: (fam15h_power) Use topology_core_id()
  scsi: lpfc: Use topology_core_id()
  x86/cpu: Move cpu_die_id into topology info
  x86/cpu: Move phys_proc_id into topology info
  x86/cpu: Encapsulate topology information in cpuinfo_x86
  ...
2023-10-30 17:37:47 -10:00

233 lines
6.7 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2023, Microsoft Corporation.
*
* Author:
* Saurabh Sengar <ssengar@microsoft.com>
*/
#include <asm/apic.h>
#include <asm/boot.h>
#include <asm/desc.h>
#include <asm/i8259.h>
#include <asm/mshyperv.h>
#include <asm/realmode.h>
extern struct boot_params boot_params;
static struct real_mode_header hv_vtl_real_mode_header;
void __init hv_vtl_init_platform(void)
{
pr_info("Linux runs in Hyper-V Virtual Trust Level\n");
x86_platform.realmode_reserve = x86_init_noop;
x86_platform.realmode_init = x86_init_noop;
x86_init.irqs.pre_vector_init = x86_init_noop;
x86_init.timers.timer_init = x86_init_noop;
/* Avoid searching for BIOS MP tables */
x86_init.mpparse.find_smp_config = x86_init_noop;
x86_init.mpparse.get_smp_config = x86_init_uint_noop;
x86_platform.get_wallclock = get_rtc_noop;
x86_platform.set_wallclock = set_rtc_noop;
x86_platform.get_nmi_reason = hv_get_nmi_reason;
x86_platform.legacy.i8042 = X86_LEGACY_I8042_PLATFORM_ABSENT;
x86_platform.legacy.rtc = 0;
x86_platform.legacy.warm_reset = 0;
x86_platform.legacy.reserve_bios_regions = 0;
x86_platform.legacy.devices.pnpbios = 0;
}
static inline u64 hv_vtl_system_desc_base(struct ldttss_desc *desc)
{
return ((u64)desc->base3 << 32) | ((u64)desc->base2 << 24) |
(desc->base1 << 16) | desc->base0;
}
static inline u32 hv_vtl_system_desc_limit(struct ldttss_desc *desc)
{
return ((u32)desc->limit1 << 16) | (u32)desc->limit0;
}
typedef void (*secondary_startup_64_fn)(void*, void*);
static void hv_vtl_ap_entry(void)
{
((secondary_startup_64_fn)secondary_startup_64)(&boot_params, &boot_params);
}
static int hv_vtl_bringup_vcpu(u32 target_vp_index, u64 eip_ignored)
{
u64 status;
int ret = 0;
struct hv_enable_vp_vtl *input;
unsigned long irq_flags;
struct desc_ptr gdt_ptr;
struct desc_ptr idt_ptr;
struct ldttss_desc *tss;
struct ldttss_desc *ldt;
struct desc_struct *gdt;
u64 rsp = current->thread.sp;
u64 rip = (u64)&hv_vtl_ap_entry;
native_store_gdt(&gdt_ptr);
store_idt(&idt_ptr);
gdt = (struct desc_struct *)((void *)(gdt_ptr.address));
tss = (struct ldttss_desc *)(gdt + GDT_ENTRY_TSS);
ldt = (struct ldttss_desc *)(gdt + GDT_ENTRY_LDT);
local_irq_save(irq_flags);
input = *this_cpu_ptr(hyperv_pcpu_input_arg);
memset(input, 0, sizeof(*input));
input->partition_id = HV_PARTITION_ID_SELF;
input->vp_index = target_vp_index;
input->target_vtl.target_vtl = HV_VTL_MGMT;
/*
* The x86_64 Linux kernel follows the 16-bit -> 32-bit -> 64-bit
* mode transition sequence after waking up an AP with SIPI whose
* vector points to the 16-bit AP startup trampoline code. Here in
* VTL2, we can't perform that sequence as the AP has to start in
* the 64-bit mode.
*
* To make this happen, we tell the hypervisor to load a valid 64-bit
* context (most of which is just magic numbers from the CPU manual)
* so that AP jumps right to the 64-bit entry of the kernel, and the
* control registers are loaded with values that let the AP fetch the
* code and data and carry on with work it gets assigned.
*/
input->vp_context.rip = rip;
input->vp_context.rsp = rsp;
input->vp_context.rflags = 0x0000000000000002;
input->vp_context.efer = __rdmsr(MSR_EFER);
input->vp_context.cr0 = native_read_cr0();
input->vp_context.cr3 = __native_read_cr3();
input->vp_context.cr4 = native_read_cr4();
input->vp_context.msr_cr_pat = __rdmsr(MSR_IA32_CR_PAT);
input->vp_context.idtr.limit = idt_ptr.size;
input->vp_context.idtr.base = idt_ptr.address;
input->vp_context.gdtr.limit = gdt_ptr.size;
input->vp_context.gdtr.base = gdt_ptr.address;
/* Non-system desc (64bit), long, code, present */
input->vp_context.cs.selector = __KERNEL_CS;
input->vp_context.cs.base = 0;
input->vp_context.cs.limit = 0xffffffff;
input->vp_context.cs.attributes = 0xa09b;
/* Non-system desc (64bit), data, present, granularity, default */
input->vp_context.ss.selector = __KERNEL_DS;
input->vp_context.ss.base = 0;
input->vp_context.ss.limit = 0xffffffff;
input->vp_context.ss.attributes = 0xc093;
/* System desc (128bit), present, LDT */
input->vp_context.ldtr.selector = GDT_ENTRY_LDT * 8;
input->vp_context.ldtr.base = hv_vtl_system_desc_base(ldt);
input->vp_context.ldtr.limit = hv_vtl_system_desc_limit(ldt);
input->vp_context.ldtr.attributes = 0x82;
/* System desc (128bit), present, TSS, 0x8b - busy, 0x89 -- default */
input->vp_context.tr.selector = GDT_ENTRY_TSS * 8;
input->vp_context.tr.base = hv_vtl_system_desc_base(tss);
input->vp_context.tr.limit = hv_vtl_system_desc_limit(tss);
input->vp_context.tr.attributes = 0x8b;
status = hv_do_hypercall(HVCALL_ENABLE_VP_VTL, input, NULL);
if (!hv_result_success(status) &&
hv_result(status) != HV_STATUS_VTL_ALREADY_ENABLED) {
pr_err("HVCALL_ENABLE_VP_VTL failed for VP : %d ! [Err: %#llx\n]",
target_vp_index, status);
ret = -EINVAL;
goto free_lock;
}
status = hv_do_hypercall(HVCALL_START_VP, input, NULL);
if (!hv_result_success(status)) {
pr_err("HVCALL_START_VP failed for VP : %d ! [Err: %#llx]\n",
target_vp_index, status);
ret = -EINVAL;
}
free_lock:
local_irq_restore(irq_flags);
return ret;
}
static int hv_vtl_apicid_to_vp_id(u32 apic_id)
{
u64 control;
u64 status;
unsigned long irq_flags;
struct hv_get_vp_from_apic_id_in *input;
u32 *output, ret;
local_irq_save(irq_flags);
input = *this_cpu_ptr(hyperv_pcpu_input_arg);
memset(input, 0, sizeof(*input));
input->partition_id = HV_PARTITION_ID_SELF;
input->apic_ids[0] = apic_id;
output = (u32 *)input;
control = HV_HYPERCALL_REP_COMP_1 | HVCALL_GET_VP_ID_FROM_APIC_ID;
status = hv_do_hypercall(control, input, output);
ret = output[0];
local_irq_restore(irq_flags);
if (!hv_result_success(status)) {
pr_err("failed to get vp id from apic id %d, status %#llx\n",
apic_id, status);
return -EINVAL;
}
return ret;
}
static int hv_vtl_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip)
{
int vp_id;
pr_debug("Bringing up CPU with APIC ID %d in VTL2...\n", apicid);
vp_id = hv_vtl_apicid_to_vp_id(apicid);
if (vp_id < 0) {
pr_err("Couldn't find CPU with APIC ID %d\n", apicid);
return -EINVAL;
}
if (vp_id > ms_hyperv.max_vp_index) {
pr_err("Invalid CPU id %d for APIC ID %d\n", vp_id, apicid);
return -EINVAL;
}
return hv_vtl_bringup_vcpu(vp_id, start_eip);
}
int __init hv_vtl_early_init(void)
{
/*
* `boot_cpu_has` returns the runtime feature support,
* and here is the earliest it can be used.
*/
if (cpu_feature_enabled(X86_FEATURE_XSAVE))
panic("XSAVE has to be disabled as it is not supported by this module.\n"
"Please add 'noxsave' to the kernel command line.\n");
real_mode_header = &hv_vtl_real_mode_header;
apic_update_callback(wakeup_secondary_cpu_64, hv_vtl_wakeup_secondary_cpu);
return 0;
}