From a91bf718dbc993ea582cd53c0cb711a0839b4603 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Fri, 12 Aug 2016 14:57:12 +0800 Subject: [PATCH 01/22] x86/mm/numa: Open code function early_get_boot_cpu_id() Previously early_acpi_boot_init() was called in early_get_boot_cpu_id() to get the value for boot_cpu_physical_apicid. Now early_acpi_boot_init() has been taken out and moved to setup_arch(), the name of early_get_boot_cpu_id() doesn't match its implementation anymore, and only the getting boot-time SMP configuration code was left. So in this patch we open code it. Also move the smp_found_config check into default_get_smp_config to simplify code, because both early_get_smp_config() and get_smp_config() call x86_init.mpparse.get_smp_config(). Also remove the redundent CONFIG_X86_MPPARSE #ifdef check when we call early_get_smp_config(). Signed-off-by: Baoquan He Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-acpi@vger.kernel.org Cc: rjw@rjwysocki.net Link: http://lkml.kernel.org/r/1470985033-22493-1-git-send-email-bhe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/mpparse.c | 3 +++ arch/x86/kernel/setup.c | 3 +-- arch/x86/mm/amdtopology.c | 22 +++++----------------- 3 files changed, 9 insertions(+), 19 deletions(-) diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 068c4a929de6..0f8d20497383 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -499,6 +499,9 @@ void __init default_get_smp_config(unsigned int early) { struct mpf_intel *mpf = mpf_found; + if (!smp_found_config) + return; + if (!mpf) return; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0fa60f5f5a16..cbf56344a0f6 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1221,8 +1221,7 @@ void __init setup_arch(char **cmdline_p) /* * get boot-time SMP configuration: */ - if (smp_found_config) - get_smp_config(); + get_smp_config(); prefill_possible_map(); diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c index ba47524f56e8..d1c7de095808 100644 --- a/arch/x86/mm/amdtopology.c +++ b/arch/x86/mm/amdtopology.c @@ -52,21 +52,6 @@ static __init int find_northbridge(void) return -ENOENT; } -static __init void early_get_boot_cpu_id(void) -{ - /* - * need to get the APIC ID of the BSP so can use that to - * create apicid_to_node in amd_scan_nodes() - */ -#ifdef CONFIG_X86_MPPARSE - /* - * get boot-time SMP configuration: - */ - if (smp_found_config) - early_get_smp_config(); -#endif -} - int __init amd_numa_init(void) { u64 start = PFN_PHYS(0); @@ -180,8 +165,11 @@ int __init amd_numa_init(void) cores = 1 << bits; apicid_base = 0; - /* get the APIC ID of the BSP early for systems with apicid lifting */ - early_get_boot_cpu_id(); + /* + * get boot-time SMP configuration: + */ + early_get_smp_config(); + if (boot_cpu_physical_apicid > 0) { pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid); apicid_base = boot_cpu_physical_apicid; From 6de421198c75d95088331e6a480e952292b0e121 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Fri, 12 Aug 2016 14:57:13 +0800 Subject: [PATCH 02/22] x86/apic, ACPI: Remove the repeated lapic address override entry parsing The ACPI MADT has a 32-bit field providing lapic address at which each processor can access its lapic information. MADT also contains an optional entry to provide a 64-bit address to override the 32-bit one. However the current code does the lapic address override entry parsing twice. One is in early_acpi_boot_init() because AMD NUMA need get boot_cpu_id earlier. The other is in acpi_boot_init() which parses all MADT entries. So in this patch we remove the repeated code in the 2nd part. Meanwhile print lapic override entry information like other MADT entry, this will be added to boot log. This patch is not supposed to change any runtime behavior, other than improving kernel messages. Signed-off-by: Baoquan He Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-acpi@vger.kernel.org Cc: rjw@rjwysocki.net Link: http://lkml.kernel.org/r/1470985033-22493-2-git-send-email-bhe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 17 ++--------------- arch/x86/kernel/apic/apic.c | 2 +- 2 files changed, 3 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 90d84c3eee53..2087bea6b461 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -282,6 +282,8 @@ acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header, if (BAD_MADT_ENTRY(lapic_addr_ovr, end)) return -EINVAL; + acpi_table_print_madt_entry(header); + acpi_lapic_addr = lapic_addr_ovr->address; return 0; @@ -998,21 +1000,6 @@ static int __init acpi_parse_madt_lapic_entries(void) if (!boot_cpu_has(X86_FEATURE_APIC)) return -ENODEV; - /* - * Note that the LAPIC address is obtained from the MADT (32-bit value) - * and (optionally) overridden by a LAPIC_ADDR_OVR entry (64-bit value). - */ - - count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE, - acpi_parse_lapic_addr_ovr, 0); - if (count < 0) { - printk(KERN_ERR PREFIX - "Error parsing LAPIC address override entry\n"); - return count; - } - - register_lapic_address(acpi_lapic_addr); - count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC, acpi_parse_sapic, MAX_LOCAL_APIC); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index cea4fc19e844..63b748444880 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1825,7 +1825,7 @@ void __init register_lapic_address(unsigned long address) if (!x2apic_mode) { set_fixmap_nocache(FIX_APIC_BASE, address); apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", - APIC_BASE, mp_lapic_addr); + APIC_BASE, address); } if (boot_cpu_physical_apicid == -1U) { boot_cpu_physical_apicid = read_apic_id(); From 31b02dd718712f4c45afbeea7fbd187ecb1b202c Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Fri, 12 Aug 2016 15:21:47 +0800 Subject: [PATCH 03/22] x86/apic, ACPI: Fix incorrect assignment when handling apic/x2apic entries By pure accident the bug makes no functional difference, because the only expression where we are using these values is (!count && !x2count), in which the variables are interchangeable, but it makes sense to fix the bug nevertheless. Signed-off-by: Baoquan He Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-acpi@vger.kernel.org Cc: rjw@rjwysocki.net Link: http://lkml.kernel.org/r/1470986507-24191-1-git-send-email-bhe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 2087bea6b461..1ad5fe213043 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1018,8 +1018,8 @@ static int __init acpi_parse_madt_lapic_entries(void) return ret; } - x2count = madt_proc[0].count; - count = madt_proc[1].count; + count = madt_proc[0].count; + x2count = madt_proc[1].count; } if (!count && !x2count) { printk(KERN_ERR PREFIX "No LAPIC entries present\n"); From fe7bd58f5d25d5d655b1da4a084cc4ef6f085fee Mon Sep 17 00:00:00 2001 From: Rui Wang Date: Wed, 17 Aug 2016 16:00:33 +0800 Subject: [PATCH 04/22] x86/ioapic: Change prototype of acpi_ioapic_add() Change the argument of acpi_ioapic_add() to a generic ACPI handle, and move its prototype from drivers/acpi/internal.h to include/linux/acpi.h so that it can be called from outside the pci_root driver. Signed-off-by: Rui Wang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bhelgaas@google.com Cc: helgaas@kernel.org Cc: linux-acpi@vger.kernel.org Cc: linux-pci@vger.kernel.org Cc: rjw@rjwysocki.net Cc: tony.luck@intel.com Link: http://lkml.kernel.org/r/1471420837-31003-2-git-send-email-rui.y.wang@intel.com Signed-off-by: Ingo Molnar --- drivers/acpi/internal.h | 2 -- drivers/acpi/ioapic.c | 6 +++--- drivers/acpi/pci_root.c | 2 +- include/linux/acpi.h | 6 ++++++ 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h index 940218ff0193..f26fc1d7cfea 100644 --- a/drivers/acpi/internal.h +++ b/drivers/acpi/internal.h @@ -40,10 +40,8 @@ int acpi_sysfs_init(void); void acpi_container_init(void); void acpi_memory_hotplug_init(void); #ifdef CONFIG_ACPI_HOTPLUG_IOAPIC -int acpi_ioapic_add(struct acpi_pci_root *root); int acpi_ioapic_remove(struct acpi_pci_root *root); #else -static inline int acpi_ioapic_add(struct acpi_pci_root *root) { return 0; } static inline int acpi_ioapic_remove(struct acpi_pci_root *root) { return 0; } #endif #ifdef CONFIG_ACPI_DOCK diff --git a/drivers/acpi/ioapic.c b/drivers/acpi/ioapic.c index ccdc8db16bb8..2449377a6e7c 100644 --- a/drivers/acpi/ioapic.c +++ b/drivers/acpi/ioapic.c @@ -189,13 +189,13 @@ exit: return AE_OK; } -int acpi_ioapic_add(struct acpi_pci_root *root) +int acpi_ioapic_add(acpi_handle root_handle) { acpi_status status, retval = AE_OK; - status = acpi_walk_namespace(ACPI_TYPE_DEVICE, root->device->handle, + status = acpi_walk_namespace(ACPI_TYPE_DEVICE, root_handle, UINT_MAX, handle_ioapic_add, NULL, - root->device->handle, (void **)&retval); + root_handle, (void **)&retval); return ACPI_SUCCESS(status) && ACPI_SUCCESS(retval) ? 0 : -ENODEV; } diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index d144168d4ef9..b07eda1e7b05 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -614,7 +614,7 @@ static int acpi_pci_root_add(struct acpi_device *device, if (hotadd) { pcibios_resource_survey_bus(root->bus); pci_assign_unassigned_root_bus_resources(root->bus); - acpi_ioapic_add(root); + acpi_ioapic_add(root->device->handle); } pci_lock_rescan_remove(); diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 4d8452c2384b..c9a596b9535c 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -751,6 +751,12 @@ static inline int acpi_reconfig_notifier_unregister(struct notifier_block *nb) #endif /* !CONFIG_ACPI */ +#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC +int acpi_ioapic_add(acpi_handle root); +#else +static inline int acpi_ioapic_add(acpi_handle root) { return 0; } +#endif + #ifdef CONFIG_ACPI void acpi_os_set_prepare_sleep(int (*func)(u8 sleep_state, u32 pm1a_ctrl, u32 pm1b_ctrl)); From 584c5c422f6c749ced1e0bc3c6837f650f64e1e1 Mon Sep 17 00:00:00 2001 From: Rui Wang Date: Wed, 17 Aug 2016 16:00:34 +0800 Subject: [PATCH 05/22] x86/ioapic: Support hot-removal of IOAPICs present during boot IOAPICs present during system boot aren't added to ioapic_list, thus are unable to be hot-removed. Fix it by calling acpi_ioapic_add() during root bus enumeration. Signed-off-by: Rui Wang Acked-by: Bjorn Helgaas Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: helgaas@kernel.org Cc: linux-acpi@vger.kernel.org Cc: linux-pci@vger.kernel.org Cc: rjw@rjwysocki.net Cc: tony.luck@intel.com Link: http://lkml.kernel.org/r/1471420837-31003-3-git-send-email-rui.y.wang@intel.com Signed-off-by: Ingo Molnar --- drivers/acpi/pci_root.c | 10 ++++++++++ drivers/pci/setup-bus.c | 5 ++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index b07eda1e7b05..bf601d4df8cf 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -614,6 +614,16 @@ static int acpi_pci_root_add(struct acpi_device *device, if (hotadd) { pcibios_resource_survey_bus(root->bus); pci_assign_unassigned_root_bus_resources(root->bus); + /* + * This is only called for the hotadd case. For the boot-time + * case, we need to wait until after PCI initialization in + * order to deal with IOAPICs mapped in on a PCI BAR. + * + * This is currently x86-specific, because acpi_ioapic_add() + * is an empty function without CONFIG_ACPI_HOTPLUG_IOAPIC. + * And CONFIG_ACPI_HOTPLUG_IOAPIC depends on CONFIG_X86_IO_APIC + * (see drivers/acpi/Kconfig). + */ acpi_ioapic_add(root->device->handle); } diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index c74059e10a6d..ec538d3d2bd5 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -25,6 +25,7 @@ #include #include #include +#include #include "pci.h" unsigned int pci_flags; @@ -1852,8 +1853,10 @@ void __init pci_assign_unassigned_resources(void) { struct pci_bus *root_bus; - list_for_each_entry(root_bus, &pci_root_buses, node) + list_for_each_entry(root_bus, &pci_root_buses, node) { pci_assign_unassigned_root_bus_resources(root_bus); + acpi_ioapic_add(ACPI_HANDLE(root_bus->bridge)); + } } void pci_assign_unassigned_bridge_resources(struct pci_dev *bridge) From 6ab7eba5db93c11d61f6f7fbe21edbc875b26c1a Mon Sep 17 00:00:00 2001 From: Rui Wang Date: Wed, 17 Aug 2016 16:00:35 +0800 Subject: [PATCH 06/22] x86/ioapic: Fix setup_res() failing to get resource acpi_dev_filter_resource_type() returns 0 on success, and 1 on failure. A return value of zero means there's a matching resource, so we should continue within setup_res() to get the resource. Signed-off-by: Rui Wang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bhelgaas@google.com Cc: helgaas@kernel.org Cc: linux-acpi@vger.kernel.org Cc: linux-pci@vger.kernel.org Cc: rjw@rjwysocki.net Cc: tony.luck@intel.com Link: http://lkml.kernel.org/r/1471420837-31003-4-git-send-email-rui.y.wang@intel.com Signed-off-by: Ingo Molnar --- drivers/acpi/ioapic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/ioapic.c b/drivers/acpi/ioapic.c index 2449377a6e7c..8ab6d426c178 100644 --- a/drivers/acpi/ioapic.c +++ b/drivers/acpi/ioapic.c @@ -46,7 +46,7 @@ static acpi_status setup_res(struct acpi_resource *acpi_res, void *data) struct resource_win win; res->flags = 0; - if (acpi_dev_filter_resource_type(acpi_res, IORESOURCE_MEM) == 0) + if (acpi_dev_filter_resource_type(acpi_res, IORESOURCE_MEM)) return AE_OK; if (!acpi_dev_resource_memory(acpi_res, res)) { From 162b83bd5f1d7124e21da78bcf2685b9824d9ef0 Mon Sep 17 00:00:00 2001 From: Rui Wang Date: Wed, 17 Aug 2016 16:00:36 +0800 Subject: [PATCH 07/22] x86/ioapic: Fix lost IOAPIC resource after hot-removal and hotadd IOAPIC resource at 0xfecxxxxx gets lost from /proc/iomem after hot-removing and then hot-adding the IOAPIC device. After system boot, in /proc/iomem: fec00000-fecfffff : PNP0003:00 fec00000-fec003ff : IOAPIC 0 fec01000-fec013ff : IOAPIC 1 fec40000-fec403ff : IOAPIC 2 fec80000-fec803ff : IOAPIC 3 fecc0000-fecc03ff : IOAPIC 4 Then hot-remove IOAPIC 2 and hot-add it again: fec00000-fecfffff : PNP0003:00 fec00000-fec003ff : IOAPIC 0 fec01000-fec013ff : IOAPIC 1 fec80000-fec803ff : IOAPIC 3 fecc0000-fecc03ff : IOAPIC 4 The range at 0xfec40000 is lost from /proc/iomem - which is a bug. This bug happens because handle_ioapic_add() requests resources from either PCI config BAR or ACPI "_CRS", not both. But Intel platforms map the IOxAPIC registers both at the PCI config BAR (called MBAR, dynamic), and at the ACPI "_CRS" (called ABAR, static). The 0xfecX_YZ00 to 0xfecX_YZFF range appears in "_CRS" of each IOAPIC device. Both ranges should be claimed from /proc/iomem for exclusive use. Signed-off-by: Rui Wang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bhelgaas@google.com Cc: helgaas@kernel.org Cc: linux-acpi@vger.kernel.org Cc: linux-pci@vger.kernel.org Cc: rjw@rjwysocki.net Cc: tony.luck@intel.com Link: http://lkml.kernel.org/r/1471420837-31003-5-git-send-email-rui.y.wang@intel.com Signed-off-by: Ingo Molnar --- drivers/acpi/ioapic.c | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/drivers/acpi/ioapic.c b/drivers/acpi/ioapic.c index 8ab6d426c178..ee201111e063 100644 --- a/drivers/acpi/ioapic.c +++ b/drivers/acpi/ioapic.c @@ -97,7 +97,7 @@ static acpi_status handle_ioapic_add(acpi_handle handle, u32 lvl, unsigned long long gsi_base; struct acpi_pci_ioapic *ioapic; struct pci_dev *dev = NULL; - struct resource *res = NULL; + struct resource *res = NULL, *pci_res = NULL, *crs_res; char *type = NULL; if (!acpi_is_ioapic(handle, &type)) @@ -137,23 +137,28 @@ static acpi_status handle_ioapic_add(acpi_handle handle, u32 lvl, pci_set_master(dev); if (pci_request_region(dev, 0, type)) goto exit_disable; - res = &dev->resource[0]; + pci_res = &dev->resource[0]; ioapic->pdev = dev; } else { pci_dev_put(dev); dev = NULL; - - res = &ioapic->res; - acpi_walk_resources(handle, METHOD_NAME__CRS, setup_res, res); - if (res->flags == 0) { - acpi_handle_warn(handle, "failed to get resource\n"); - goto exit_free; - } else if (request_resource(&iomem_resource, res)) { - acpi_handle_warn(handle, "failed to insert resource\n"); - goto exit_free; - } } + crs_res = &ioapic->res; + acpi_walk_resources(handle, METHOD_NAME__CRS, setup_res, crs_res); + if (crs_res->flags == 0) { + acpi_handle_warn(handle, "failed to get resource\n"); + goto exit_release; + } else if (request_resource(&iomem_resource, crs_res)) { + acpi_handle_warn(handle, "failed to insert resource\n"); + goto exit_release; + } + + /* try pci resource first, then "_CRS" resource */ + res = pci_res; + if (!res || !res->flags) + res = crs_res; + if (acpi_register_ioapic(handle, res->start, (u32)gsi_base)) { acpi_handle_warn(handle, "failed to register IOAPIC\n"); goto exit_release; @@ -174,14 +179,13 @@ done: exit_release: if (dev) pci_release_region(dev, 0); - else - release_resource(res); + if (ioapic->res.flags && ioapic->res.parent) + release_resource(&ioapic->res); exit_disable: if (dev) pci_disable_device(dev); exit_put: pci_dev_put(dev); -exit_free: kfree(ioapic); exit: mutex_unlock(&ioapic_list_lock); @@ -217,9 +221,9 @@ int acpi_ioapic_remove(struct acpi_pci_root *root) pci_release_region(ioapic->pdev, 0); pci_disable_device(ioapic->pdev); pci_dev_put(ioapic->pdev); - } else if (ioapic->res.flags && ioapic->res.parent) { - release_resource(&ioapic->res); } + if (ioapic->res.flags && ioapic->res.parent) + release_resource(&ioapic->res); list_del(&ioapic->list); kfree(ioapic); } From 624cad9d2907a0788b56e3ca664c5d7d02645ed4 Mon Sep 17 00:00:00 2001 From: Rui Wang Date: Wed, 17 Aug 2016 16:00:37 +0800 Subject: [PATCH 08/22] x86/ioapic: Fix IOAPIC failing to request resource handle_ioapic_add() uses request_resource() to request ACPI "_CRS" resources. This can fail with the following error message: [ 247.325693] ACPI: \_SB_.IIO1.AID1: failed to insert resource This happens when there are multiple IOAPICs and DSDT groups their "_CRS" resources as the children of a parent resource, as seen from /proc/iomem: fec00000-fecfffff : PNP0003:00 fec00000-fec003ff : IOAPIC 0 fec01000-fec013ff : IOAPIC 1 fec40000-fec403ff : IOAPIC 2 In this case request_resource() fails because there's a conflicting resource which is the parent (fec0000-fecfffff). Fix it by using insert_resource() which can request resources by taking the conflicting resource as the parent. Signed-off-by: Rui Wang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bhelgaas@google.com Cc: helgaas@kernel.org Cc: linux-acpi@vger.kernel.org Cc: linux-pci@vger.kernel.org Cc: rjw@rjwysocki.net Cc: tony.luck@intel.com Link: http://lkml.kernel.org/r/1471420837-31003-6-git-send-email-rui.y.wang@intel.com Signed-off-by: Ingo Molnar --- drivers/acpi/ioapic.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/ioapic.c b/drivers/acpi/ioapic.c index ee201111e063..6d7ce6e12aaa 100644 --- a/drivers/acpi/ioapic.c +++ b/drivers/acpi/ioapic.c @@ -146,10 +146,12 @@ static acpi_status handle_ioapic_add(acpi_handle handle, u32 lvl, crs_res = &ioapic->res; acpi_walk_resources(handle, METHOD_NAME__CRS, setup_res, crs_res); + crs_res->name = type; + crs_res->flags |= IORESOURCE_BUSY; if (crs_res->flags == 0) { acpi_handle_warn(handle, "failed to get resource\n"); goto exit_release; - } else if (request_resource(&iomem_resource, crs_res)) { + } else if (insert_resource(&iomem_resource, crs_res)) { acpi_handle_warn(handle, "failed to insert resource\n"); goto exit_release; } From 384d9fe3741657c8ed8cd9bf30bc1d4611864d56 Mon Sep 17 00:00:00 2001 From: Wei Jiangang Date: Fri, 19 Aug 2016 11:22:36 +0800 Subject: [PATCH 09/22] x86/smpboot: Check APIC ID before setting up default routing This is not a bugfix, but code optimization. If the BSP's APIC ID in local APIC is unexpected, a kernel panic will occur and the system will halt. That means no need to enable APIC mode, and no reason to set up the default routing for APIC. The combination of default_setup_apic_routing() and apic_bsp_setup() are used to enable APIC mode. They two should be kept together, rather than being separated by the codes of checking APIC ID. Just like their usage in APIC_init_uniprocessor(). Signed-off-by: Wei Jiangang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bp@suse.de Link: http://lkml.kernel.org/r/1471576957-12961-1-git-send-email-weijg.fnst@cn.fujitsu.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 2a6e84a30a54..8216b997c1c9 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1316,14 +1316,13 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) break; } - default_setup_apic_routing(); - if (read_apic_id() != boot_cpu_physical_apicid) { panic("Boot APIC ID in local APIC unexpected (%d vs %d)", read_apic_id(), boot_cpu_physical_apicid); /* Or can we switch back to PIC here? */ } + default_setup_apic_routing(); cpu0_logical_apicid = apic_bsp_setup(false); pr_info("CPU%d: ", 0); From 5035da41996d346c648a65c1d7a9f6469c7d358a Mon Sep 17 00:00:00 2001 From: Wei Jiangang Date: Fri, 19 Aug 2016 11:22:37 +0800 Subject: [PATCH 10/22] x86/apic: Update comment about disabling processor focus Fix references to discarded end_level_ioapic_irq(). Signed-off-by: Wei Jiangang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bp@suse.de Link: http://lkml.kernel.org/r/1471576957-12961-2-git-send-email-weijg.fnst@cn.fujitsu.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 63b748444880..1cbae30af51c 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1374,7 +1374,6 @@ void setup_local_APIC(void) * Actually disabling the focus CPU check just makes the hang less * frequent as it makes the interrupt distributon model be more * like LRU than MRU (the short-term load is more even across CPUs). - * See also the comment in end_level_ioapic_irq(). --macro */ /* From d9c149d6ce1a94de578a4e323f6881fcb6b986ab Mon Sep 17 00:00:00 2001 From: Rui Wang Date: Sat, 10 Sep 2016 23:40:45 +0800 Subject: [PATCH 11/22] x86/ioapic: Ignore root bridges without a companion ACPI device Some PCI root bridges don't have a corresponding ACPI device. This can be the case on some old platforms. Don't call acpi_ioapic_add() on these bridges because they can't support ioapic hotplug. Reported-and-tested-by: Borislav Petkov Signed-off-by: Rui Wang Reviewed-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bhelgaas@google.com Cc: helgaas@kernel.org Cc: linux-acpi@vger.kernel.org Cc: linux-pci@vger.kernel.org Cc: rjw@rjwysocki.net Cc: tony.luck@intel.com Link: http://lkml.kernel.org/r/1473522046-31329-1-git-send-email-rui.y.wang@intel.com Signed-off-by: Ingo Molnar --- drivers/pci/setup-bus.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index ec538d3d2bd5..f30ca75b5b6c 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -1855,7 +1855,10 @@ void __init pci_assign_unassigned_resources(void) list_for_each_entry(root_bus, &pci_root_buses, node) { pci_assign_unassigned_root_bus_resources(root_bus); - acpi_ioapic_add(ACPI_HANDLE(root_bus->bridge)); + + /* Make sure the root bridge has a companion ACPI device: */ + if (ACPI_HANDLE(root_bus->bridge)) + acpi_ioapic_add(ACPI_HANDLE(root_bus->bridge)); } } From b0f48706a176b71a6e54f399d7404bbeeaa7cfab Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Sun, 18 Sep 2016 19:34:51 +0800 Subject: [PATCH 12/22] x86/apic: Order irq_enter/exit() calls correctly vs. ack_APIC_irq() =============================== [ INFO: suspicious RCU usage. ] 4.8.0-rc6+ #5 Not tainted ------------------------------- ./arch/x86/include/asm/msr-trace.h:47 suspicious rcu_dereference_check() usage! other info that might help us debug this: RCU used illegally from idle CPU! rcu_scheduler_active = 1, debug_locks = 0 RCU used illegally from extended quiescent state! no locks held by swapper/2/0. stack backtrace: CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.8.0-rc6+ #5 Hardware name: Dell Inc. OptiPlex 7020/0F5C5X, BIOS A03 01/08/2015 0000000000000000 ffff8d1bd6003f10 ffffffff94446949 ffff8d1bd4a68000 0000000000000001 ffff8d1bd6003f40 ffffffff940e9247 ffff8d1bbdfcf3d0 000000000000080b 0000000000000000 0000000000000000 ffff8d1bd6003f70 Call Trace: [] dump_stack+0x99/0xd0 [] lockdep_rcu_suspicious+0xe7/0x120 [] do_trace_write_msr+0x135/0x140 [] native_write_msr+0x20/0x30 [] native_apic_msr_eoi_write+0x1d/0x30 [] smp_trace_call_function_interrupt+0x1e/0x270 [] trace_call_function_interrupt+0x96/0xa0 [] ? cpuidle_enter_state+0xe4/0x360 [] ? cpuidle_enter_state+0xcf/0x360 [] cpuidle_enter+0x17/0x20 [] cpu_startup_entry+0x338/0x4d0 [] start_secondary+0x154/0x180 This can be reproduced readily by running ftrace test case of kselftest. Move the irq_enter() call before ack_APIC_irq(), because irq_enter() tells the RCU susbstems to end the extended quiescent state, so that the following trace call in ack_APIC_irq() works correctly. The same applies to exiting_ack_irq() which calls ack_APIC_irq() after irq_exit(). [ tglx: Massaged changelog ] Signed-off-by: Wanpeng Li Cc: Peter Zijlstra Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1474198491-3738-1-git-send-email-wanpeng.li@hotmail.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/apic.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 124357773ffa..f5aaf6c83222 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -650,8 +650,8 @@ static inline void entering_ack_irq(void) static inline void ipi_entering_ack_irq(void) { - ack_APIC_irq(); irq_enter(); + ack_APIC_irq(); } static inline void exiting_irq(void) @@ -661,9 +661,8 @@ static inline void exiting_irq(void) static inline void exiting_ack_irq(void) { - irq_exit(); - /* Ack only at the end to avoid potential reentry */ ack_APIC_irq(); + irq_exit(); } extern void ioapic_zap_locks(void); From cff9ab2b291e64259d97add48fe073c081afe4e2 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 13 Sep 2016 20:12:32 +0200 Subject: [PATCH 13/22] x86/apic: Get rid of apic_version[] array The array has a size of MAX_LOCAL_APIC, which can be as large as 32k, so it can consume up to 128k. The array has been there forever and was never used for anything useful other than a version mismatch check which was introduced in 2009. There is no reason to store the version in an array. The kernel is not prepared to handle different APIC versions anyway, so the real important part is to detect a version mismatch and warn about it, which can be done with a single variable as well. [ tglx: Massaged changelog ] Signed-off-by: Denys Vlasenko CC: Andy Lutomirski CC: Borislav Petkov CC: Brian Gerst CC: Mike Travis Link: http://lkml.kernel.org/r/20160913181232.30815-1-dvlasenk@redhat.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/mpspec.h | 2 +- arch/x86/kernel/acpi/boot.c | 2 +- arch/x86/kernel/apic/apic.c | 17 +++++++---------- arch/x86/kernel/apic/io_apic.c | 4 ++-- arch/x86/kernel/apic/probe_32.c | 2 +- arch/x86/kernel/smpboot.c | 10 +++++----- 6 files changed, 17 insertions(+), 20 deletions(-) diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index b07233b64578..c2f94dcc92ce 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -6,7 +6,6 @@ #include #include -extern int apic_version[]; extern int pic_mode; #ifdef CONFIG_X86_32 @@ -40,6 +39,7 @@ extern int mp_bus_id_to_type[MAX_MP_BUSSES]; extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); extern unsigned int boot_cpu_physical_apicid; +extern u8 boot_cpu_apic_version; extern unsigned long mp_lapic_addr; #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 1ad5fe213043..0447e314e7f5 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -182,7 +182,7 @@ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled) } if (boot_cpu_physical_apicid != -1U) - ver = apic_version[boot_cpu_physical_apicid]; + ver = boot_cpu_apic_version; cpu = generic_processor_info(id, ver); if (cpu >= 0) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 1cbae30af51c..779dae5a852f 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -64,6 +64,8 @@ unsigned disabled_cpus; unsigned int boot_cpu_physical_apicid = -1U; EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid); +u8 boot_cpu_apic_version; + /* * The highest APIC ID seen during enumeration. */ @@ -1812,8 +1814,7 @@ void __init init_apic_mappings(void) * since smp_sanity_check is prepared for such a case * and disable smp mode */ - apic_version[new_apicid] = - GET_APIC_VERSION(apic_read(APIC_LVR)); + boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR)); } } @@ -1828,13 +1829,10 @@ void __init register_lapic_address(unsigned long address) } if (boot_cpu_physical_apicid == -1U) { boot_cpu_physical_apicid = read_apic_id(); - apic_version[boot_cpu_physical_apicid] = - GET_APIC_VERSION(apic_read(APIC_LVR)); + boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR)); } } -int apic_version[MAX_LOCAL_APIC]; - /* * Local APIC interrupts */ @@ -2124,11 +2122,10 @@ int generic_processor_info(int apicid, int version) cpu, apicid); version = 0x10; } - apic_version[apicid] = version; - if (version != apic_version[boot_cpu_physical_apicid]) { + if (version != boot_cpu_apic_version) { pr_warning("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n", - apic_version[boot_cpu_physical_apicid], cpu, version); + boot_cpu_apic_version, cpu, version); } physid_set(apicid, phys_cpu_present_map); @@ -2271,7 +2268,7 @@ int __init APIC_init_uniprocessor(void) * Complain if the BIOS pretends there is one. */ if (!boot_cpu_has(X86_FEATURE_APIC) && - APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { + APIC_INTEGRATED(boot_cpu_apic_version)) { pr_err("BIOS bug, local APIC 0x%x not detected!...\n", boot_cpu_physical_apicid); return -1; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 7491f417a8e4..48e6d84f173e 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1593,7 +1593,7 @@ void __init setup_ioapic_ids_from_mpc(void) * no meaning without the serial APIC bus. */ if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + || APIC_XAPIC(boot_cpu_apic_version)) return; setup_ioapic_ids_from_mpc_nocheck(); } @@ -2423,7 +2423,7 @@ static int io_apic_get_unique_id(int ioapic, int apic_id) static u8 io_apic_unique_id(int idx, u8 id) { if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && - !APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + !APIC_XAPIC(boot_cpu_apic_version)) return io_apic_get_unique_id(idx, id); else return id; diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 7c43e716c158..563096267ca2 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -152,7 +152,7 @@ early_param("apic", parse_apic); void __init default_setup_apic_routing(void) { - int version = apic_version[boot_cpu_physical_apicid]; + int version = boot_cpu_apic_version; if (num_possible_cpus() > 8) { switch (boot_cpu_data.x86_vendor) { diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 8216b997c1c9..f2b8e4574d69 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -683,7 +683,7 @@ wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip) * Give the other CPU some time to accept the IPI. */ udelay(200); - if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { + if (APIC_INTEGRATED(boot_cpu_apic_version)) { maxlvt = lapic_get_maxlvt(); if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); @@ -710,7 +710,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) /* * Be paranoid about clearing APIC errors. */ - if (APIC_INTEGRATED(apic_version[phys_apicid])) { + if (APIC_INTEGRATED(boot_cpu_apic_version)) { if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); apic_read(APIC_ESR); @@ -749,7 +749,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) * Determine this based on the APIC version. * If we don't have an integrated APIC, don't send the STARTUP IPIs. */ - if (APIC_INTEGRATED(apic_version[phys_apicid])) + if (APIC_INTEGRATED(boot_cpu_apic_version)) num_starts = 2; else num_starts = 0; @@ -987,7 +987,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) /* * Be paranoid about clearing APIC errors. */ - if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { + if (APIC_INTEGRATED(boot_cpu_apic_version)) { apic_write(APIC_ESR, 0); apic_read(APIC_ESR); } @@ -1242,7 +1242,7 @@ static int __init smp_sanity_check(unsigned max_cpus) /* * If we couldn't find a local APIC, then get out of here now! */ - if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) && + if (APIC_INTEGRATED(boot_cpu_apic_version) && !boot_cpu_has(X86_FEATURE_APIC)) { if (!disable_apic) { pr_err("BIOS bug, local APIC #%d not detected!...\n", From 2532fc318db0e1fe68e01407ee27634c76916e44 Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Thu, 25 Aug 2016 16:35:14 +0800 Subject: [PATCH 14/22] x86/numa: Online memory-less nodes at boot time For now, x86 does not support memory-less node. A node without memory will not be onlined, and the cpus on it will be mapped to the other online nodes with memory in init_cpu_to_node(). The reason of doing this is to ensure each cpu has mapped to a node with memory, so that it will be able to allocate local memory for that cpu. But we don't have to do it in this way. In this series of patches, we are going to construct cpu <-> node mapping for all possible cpus at boot time, which is a persistent mapping. It means that the cpu will be mapped to the node which it belongs to, and will never be changed. If a node has only cpus but no memory, the cpus on it will be mapped to a memory-less node. And the memory-less node should be onlined. Allocate pgdats for all memory-less nodes and online them at boot time. Then build zonelists for these nodes. As a result, when cpus on these memory-less nodes try to allocate memory from local node, it will automatically fall back to the proper zones in the zonelists. Signed-off-by: Zhu Guihua Signed-off-by: Dou Liyang Acked-by: Ingo Molnar Cc: mika.j.penttila@gmail.com Cc: len.brown@intel.com Cc: Tang Chen Cc: rafael@kernel.org Cc: rjw@rjwysocki.net Cc: yasu.isimatu@gmail.com Cc: linux-mm@kvack.org Cc: linux-acpi@vger.kernel.org Cc: isimatu.yasuaki@jp.fujitsu.com Cc: gongzhaogang@inspur.com Cc: tj@kernel.org Cc: izumi.taku@jp.fujitsu.com Cc: cl@linux.com Cc: chen.tang@easystack.cn Cc: akpm@linux-foundation.org Cc: kamezawa.hiroyu@jp.fujitsu.com Cc: lenb@kernel.org Link: http://lkml.kernel.org/r/1472114120-3281-2-git-send-email-douly.fnst@cn.fujitsu.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/numa.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index fb682108f4dc..3f35b48d1d9d 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -722,22 +722,19 @@ void __init x86_numa_init(void) numa_init(dummy_numa_init); } -static __init int find_near_online_node(int node) +static void __init init_memory_less_node(int nid) { - int n, val; - int min_val = INT_MAX; - int best_node = -1; + unsigned long zones_size[MAX_NR_ZONES] = {0}; + unsigned long zholes_size[MAX_NR_ZONES] = {0}; - for_each_online_node(n) { - val = node_distance(node, n); + /* Allocate and initialize node data. Memory-less node is now online.*/ + alloc_node_data(nid); + free_area_init_node(nid, zones_size, 0, zholes_size); - if (val < min_val) { - min_val = val; - best_node = n; - } - } - - return best_node; + /* + * All zonelists will be built later in start_kernel() after per cpu + * areas are initialized. + */ } /* @@ -766,8 +763,10 @@ void __init init_cpu_to_node(void) if (node == NUMA_NO_NODE) continue; + if (!node_online(node)) - node = find_near_online_node(node); + init_memory_less_node(node); + numa_set_node(cpu, node); } } From f7c28833c252031bc68a29e26a18a661797cf3a3 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Thu, 25 Aug 2016 16:35:15 +0800 Subject: [PATCH 15/22] x86/acpi: Enable acpi to register all possible cpus at boot time cpuid <-> nodeid mapping is firstly established at boot time. And workqueue caches the mapping in wq_numa_possible_cpumask in wq_numa_init() at boot time. When doing node online/offline, cpuid <-> nodeid mapping is established/destroyed, which means, cpuid <-> nodeid mapping will change if node hotplug happens. But workqueue does not update wq_numa_possible_cpumask. So here is the problem: Assume we have the following cpuid <-> nodeid in the beginning: Node | CPU ------------------------ node 0 | 0-14, 60-74 node 1 | 15-29, 75-89 node 2 | 30-44, 90-104 node 3 | 45-59, 105-119 and we hot-remove node2 and node3, it becomes: Node | CPU ------------------------ node 0 | 0-14, 60-74 node 1 | 15-29, 75-89 and we hot-add node4 and node5, it becomes: Node | CPU ------------------------ node 0 | 0-14, 60-74 node 1 | 15-29, 75-89 node 4 | 30-59 node 5 | 90-119 But in wq_numa_possible_cpumask, cpu30 is still mapped to node2, and the like. When a pool workqueue is initialized, if its cpumask belongs to a node, its pool->node will be mapped to that node. And memory used by this workqueue will also be allocated on that node. static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs){ ... /* if cpumask is contained inside a NUMA node, we belong to that node */ if (wq_numa_enabled) { for_each_node(node) { if (cpumask_subset(pool->attrs->cpumask, wq_numa_possible_cpumask[node])) { pool->node = node; break; } } } Since wq_numa_possible_cpumask is not updated, it could be mapped to an offline node, which will lead to memory allocation failure: SLUB: Unable to allocate memory on node 2 (gfp=0x80d0) cache: kmalloc-192, object size: 192, buffer size: 192, default order: 1, min order: 0 node 0: slabs: 6172, objs: 259224, free: 245741 node 1: slabs: 3261, objs: 136962, free: 127656 It happens here: create_worker(struct worker_pool *pool) |--> worker = alloc_worker(pool->node); static struct worker *alloc_worker(int node) { struct worker *worker; worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, node); --> Here, useing the wrong node. ...... return worker; } [Solution] There are four mappings in the kernel: 1. nodeid (logical node id) <-> pxm 2. apicid (physical cpu id) <-> nodeid 3. cpuid (logical cpu id) <-> apicid 4. cpuid (logical cpu id) <-> nodeid 1. pxm (proximity domain) is provided by ACPI firmware in SRAT, and nodeid <-> pxm mapping is setup at boot time. This mapping is persistent, won't change. 2. apicid <-> nodeid mapping is setup using info in 1. The mapping is setup at boot time and CPU hotadd time, and cleared at CPU hotremove time. This mapping is also persistent. 3. cpuid <-> apicid mapping is setup at boot time and CPU hotadd time. cpuid is allocated, lower ids first, and released at CPU hotremove time, reused for other hotadded CPUs. So this mapping is not persistent. 4. cpuid <-> nodeid mapping is also setup at boot time and CPU hotadd time, and cleared at CPU hotremove time. As a result of 3, this mapping is not persistent. To fix this problem, we establish cpuid <-> nodeid mapping for all the possible cpus at boot time, and make it persistent. And according to init_cpu_to_node(), cpuid <-> nodeid mapping is based on apicid <-> nodeid mapping and cpuid <-> apicid mapping. So the key point is obtaining all cpus' apicid. apicid can be obtained by _MAT (Multiple APIC Table Entry) method or found in MADT (Multiple APIC Description Table). So we finish the job in the following steps: 1. Enable apic registeration flow to handle both enabled and disabled cpus. This is done by introducing an extra parameter to generic_processor_info to let the caller control if disabled cpus are ignored. 2. Introduce a new array storing all possible cpuid <-> apicid mapping. And also modify the way cpuid is calculated. Establish all possible cpuid <-> apicid mapping when registering local apic. Store the mapping in this array. 3. Enable _MAT and MADT relative apis to return non-present or disabled cpus' apicid. This is also done by introducing an extra parameter to these apis to let the caller control if disabled cpus are ignored. 4. Establish all possible cpuid <-> nodeid mapping. This is done via an additional acpi namespace walk for processors. This patch finished step 1. Signed-off-by: Gu Zheng Signed-off-by: Tang Chen Signed-off-by: Zhu Guihua Signed-off-by: Dou Liyang Acked-by: Ingo Molnar Cc: mika.j.penttila@gmail.com Cc: len.brown@intel.com Cc: rafael@kernel.org Cc: rjw@rjwysocki.net Cc: yasu.isimatu@gmail.com Cc: linux-mm@kvack.org Cc: linux-acpi@vger.kernel.org Cc: isimatu.yasuaki@jp.fujitsu.com Cc: gongzhaogang@inspur.com Cc: tj@kernel.org Cc: izumi.taku@jp.fujitsu.com Cc: cl@linux.com Cc: chen.tang@easystack.cn Cc: akpm@linux-foundation.org Cc: kamezawa.hiroyu@jp.fujitsu.com Cc: lenb@kernel.org Link: http://lkml.kernel.org/r/1472114120-3281-3-git-send-email-douly.fnst@cn.fujitsu.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/apic.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 779dae5a852f..a8c94bb6b528 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2021,7 +2021,7 @@ void disconnect_bsp_APIC(int virt_wire_setup) apic_write(APIC_LVT1, value); } -int generic_processor_info(int apicid, int version) +static int __generic_processor_info(int apicid, int version, bool enabled) { int cpu, max = nr_cpu_ids; bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid, @@ -2087,7 +2087,6 @@ int generic_processor_info(int apicid, int version) return -EINVAL; } - num_processors++; if (apicid == boot_cpu_physical_apicid) { /* * x86_bios_cpu_apicid is required to have processors listed @@ -2110,6 +2109,7 @@ int generic_processor_info(int apicid, int version) pr_warning("APIC: Package limit reached. Processor %d/0x%x ignored.\n", thiscpu, apicid); + disabled_cpus++; return -ENOSPC; } @@ -2128,7 +2128,6 @@ int generic_processor_info(int apicid, int version) boot_cpu_apic_version, cpu, version); } - physid_set(apicid, phys_cpu_present_map); if (apicid > max_physical_apicid) max_physical_apicid = apicid; @@ -2141,11 +2140,23 @@ int generic_processor_info(int apicid, int version) apic->x86_32_early_logical_apicid(cpu); #endif set_cpu_possible(cpu, true); - set_cpu_present(cpu, true); + + if (enabled) { + num_processors++; + physid_set(apicid, phys_cpu_present_map); + set_cpu_present(cpu, true); + } else { + disabled_cpus++; + } return cpu; } +int generic_processor_info(int apicid, int version) +{ + return __generic_processor_info(apicid, version, true); +} + int hard_smp_processor_id(void) { return read_apic_id(); From 8f54969dc8d6704632b42cbb5e47730cd75cc713 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Thu, 25 Aug 2016 16:35:16 +0800 Subject: [PATCH 16/22] x86/acpi: Introduce persistent storage for cpuid <-> apicid mapping The whole patch-set aims at making cpuid <-> nodeid mapping persistent. So that, when node online/offline happens, cache based on cpuid <-> nodeid mapping such as wq_numa_possible_cpumask will not cause any problem. It contains 4 steps: 1. Enable apic registeration flow to handle both enabled and disabled cpus. 2. Introduce a new array storing all possible cpuid <-> apicid mapping. 3. Enable _MAT and MADT relative apis to return non-present or disabled cpus' apicid. 4. Establish all possible cpuid <-> nodeid mapping. This patch finishes step 2. In this patch, we introduce a new static array named cpuid_to_apicid[], which is large enough to store info for all possible cpus. And then, we modify the cpuid calculation. In generic_processor_info(), it simply finds the next unused cpuid. And it is also why the cpuid <-> nodeid mapping changes with node hotplug. After this patch, we find the next unused cpuid, map it to an apicid, and store the mapping in cpuid_to_apicid[], so that cpuid <-> apicid mapping will be persistent. And finally we will use this array to make cpuid <-> nodeid persistent. cpuid <-> apicid mapping is established at local apic registeration time. But non-present or disabled cpus are ignored. In this patch, we establish all possible cpuid <-> apicid mapping when registering local apic. Signed-off-by: Gu Zheng Signed-off-by: Tang Chen Signed-off-by: Zhu Guihua Signed-off-by: Dou Liyang Acked-by: Ingo Molnar Cc: mika.j.penttila@gmail.com Cc: len.brown@intel.com Cc: rafael@kernel.org Cc: rjw@rjwysocki.net Cc: yasu.isimatu@gmail.com Cc: linux-mm@kvack.org Cc: linux-acpi@vger.kernel.org Cc: isimatu.yasuaki@jp.fujitsu.com Cc: gongzhaogang@inspur.com Cc: tj@kernel.org Cc: izumi.taku@jp.fujitsu.com Cc: cl@linux.com Cc: chen.tang@easystack.cn Cc: akpm@linux-foundation.org Cc: kamezawa.hiroyu@jp.fujitsu.com Cc: lenb@kernel.org Link: http://lkml.kernel.org/r/1472114120-3281-4-git-send-email-douly.fnst@cn.fujitsu.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/mpspec.h | 1 + arch/x86/kernel/acpi/boot.c | 7 +--- arch/x86/kernel/apic/apic.c | 60 +++++++++++++++++++++++++++++++++-- 3 files changed, 59 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index c2f94dcc92ce..32007041ef8c 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -86,6 +86,7 @@ static inline void early_reserve_e820_mpc_new(void) { } #endif int generic_processor_info(int apicid, int version); +int __generic_processor_info(int apicid, int version, bool enabled); #define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_LOCAL_APIC) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 0447e314e7f5..7d668d172fce 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -176,15 +176,10 @@ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled) return -EINVAL; } - if (!enabled) { - ++disabled_cpus; - return -EINVAL; - } - if (boot_cpu_physical_apicid != -1U) ver = boot_cpu_apic_version; - cpu = generic_processor_info(id, ver); + cpu = __generic_processor_info(id, ver, enabled); if (cpu >= 0) early_per_cpu(x86_cpu_to_acpiid, cpu) = acpiid; diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index a8c94bb6b528..2dc01c38ad8e 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2021,7 +2021,53 @@ void disconnect_bsp_APIC(int virt_wire_setup) apic_write(APIC_LVT1, value); } -static int __generic_processor_info(int apicid, int version, bool enabled) +/* + * The number of allocated logical CPU IDs. Since logical CPU IDs are allocated + * contiguously, it equals to current allocated max logical CPU ID plus 1. + * All allocated CPU ID should be in [0, nr_logical_cpuidi), so the maximum of + * nr_logical_cpuids is nr_cpu_ids. + * + * NOTE: Reserve 0 for BSP. + */ +static int nr_logical_cpuids = 1; + +/* + * Used to store mapping between logical CPU IDs and APIC IDs. + */ +static int cpuid_to_apicid[] = { + [0 ... NR_CPUS - 1] = -1, +}; + +/* + * Should use this API to allocate logical CPU IDs to keep nr_logical_cpuids + * and cpuid_to_apicid[] synchronized. + */ +static int allocate_logical_cpuid(int apicid) +{ + int i; + + /* + * cpuid <-> apicid mapping is persistent, so when a cpu is up, + * check if the kernel has allocated a cpuid for it. + */ + for (i = 0; i < nr_logical_cpuids; i++) { + if (cpuid_to_apicid[i] == apicid) + return i; + } + + /* Allocate a new cpuid. */ + if (nr_logical_cpuids >= nr_cpu_ids) { + WARN_ONCE(1, "Only %d processors supported." + "Processor %d/0x%x and the rest are ignored.\n", + nr_cpu_ids - 1, nr_logical_cpuids, apicid); + return -1; + } + + cpuid_to_apicid[nr_logical_cpuids] = apicid; + return nr_logical_cpuids++; +} + +int __generic_processor_info(int apicid, int version, bool enabled) { int cpu, max = nr_cpu_ids; bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid, @@ -2096,8 +2142,16 @@ static int __generic_processor_info(int apicid, int version, bool enabled) * for BSP. */ cpu = 0; - } else - cpu = cpumask_next_zero(-1, cpu_present_mask); + + /* Logical cpuid 0 is reserved for BSP. */ + cpuid_to_apicid[0] = apicid; + } else { + cpu = allocate_logical_cpuid(apicid); + if (cpu < 0) { + disabled_cpus++; + return -EINVAL; + } + } /* * This can happen on physical hotplug. The sanity check at boot time From 8ad893faf2eaedb710a3073afbb5d569df2c3e41 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Thu, 25 Aug 2016 16:35:17 +0800 Subject: [PATCH 17/22] x86/acpi: Enable MADT APIs to return disabled apicids The whole patch-set aims at making cpuid <-> nodeid mapping persistent. So that, when node online/offline happens, cache based on cpuid <-> nodeid mapping such as wq_numa_possible_cpumask will not cause any problem. It contains 4 steps: 1. Enable apic registeration flow to handle both enabled and disabled cpus. 2. Introduce a new array storing all possible cpuid <-> apicid mapping. 3. Enable _MAT and MADT relative apis to return non-present or disabled cpus' apicid. 4. Establish all possible cpuid <-> nodeid mapping. This patch finishes step 3. There are four mappings in the kernel: 1. nodeid (logical node id) <-> pxm (persistent) 2. apicid (physical cpu id) <-> nodeid (persistent) 3. cpuid (logical cpu id) <-> apicid (not persistent, now persistent by step 2) 4. cpuid (logical cpu id) <-> nodeid (not persistent) So, in order to setup persistent cpuid <-> nodeid mapping for all possible CPUs, we should: 1. Setup cpuid <-> apicid mapping for all possible CPUs, which has been done in step 1, 2. 2. Setup cpuid <-> nodeid mapping for all possible CPUs. But before that, we should obtain all apicids from MADT. All processors' apicids can be obtained by _MAT method or from MADT in ACPI. The current code ignores disabled processors and returns -ENODEV. After this patch, a new parameter will be added to MADT APIs so that caller is able to control if disabled processors are ignored. Signed-off-by: Gu Zheng Signed-off-by: Tang Chen Signed-off-by: Zhu Guihua Signed-off-by: Dou Liyang Acked-by: Ingo Molnar Cc: mika.j.penttila@gmail.com Cc: len.brown@intel.com Cc: rafael@kernel.org Cc: rjw@rjwysocki.net Cc: yasu.isimatu@gmail.com Cc: linux-mm@kvack.org Cc: linux-acpi@vger.kernel.org Cc: isimatu.yasuaki@jp.fujitsu.com Cc: gongzhaogang@inspur.com Cc: tj@kernel.org Cc: izumi.taku@jp.fujitsu.com Cc: cl@linux.com Cc: chen.tang@easystack.cn Cc: akpm@linux-foundation.org Cc: kamezawa.hiroyu@jp.fujitsu.com Cc: lenb@kernel.org Link: http://lkml.kernel.org/r/1472114120-3281-5-git-send-email-douly.fnst@cn.fujitsu.com Signed-off-by: Thomas Gleixner --- drivers/acpi/acpi_processor.c | 5 ++- drivers/acpi/processor_core.c | 60 ++++++++++++++++++++++------------- 2 files changed, 42 insertions(+), 23 deletions(-) diff --git a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c index c7ba948d253c..02b84aa69fa4 100644 --- a/drivers/acpi/acpi_processor.c +++ b/drivers/acpi/acpi_processor.c @@ -300,8 +300,11 @@ static int acpi_processor_get_info(struct acpi_device *device) * Extra Processor objects may be enumerated on MP systems with * less than the max # of CPUs. They should be ignored _iff * they are physically not present. + * + * NOTE: Even if the processor has a cpuid, it may not be present + * because cpuid <-> apicid mapping is persistent now. */ - if (invalid_logical_cpuid(pr->id)) { + if (invalid_logical_cpuid(pr->id) || !cpu_present(pr->id)) { int ret = acpi_processor_hotadd_init(pr); if (ret) return ret; diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c index 9125d7d96372..fd59ae871db3 100644 --- a/drivers/acpi/processor_core.c +++ b/drivers/acpi/processor_core.c @@ -32,12 +32,12 @@ static struct acpi_table_madt *get_madt_table(void) } static int map_lapic_id(struct acpi_subtable_header *entry, - u32 acpi_id, phys_cpuid_t *apic_id) + u32 acpi_id, phys_cpuid_t *apic_id, bool ignore_disabled) { struct acpi_madt_local_apic *lapic = container_of(entry, struct acpi_madt_local_apic, header); - if (!(lapic->lapic_flags & ACPI_MADT_ENABLED)) + if (ignore_disabled && !(lapic->lapic_flags & ACPI_MADT_ENABLED)) return -ENODEV; if (lapic->processor_id != acpi_id) @@ -48,12 +48,13 @@ static int map_lapic_id(struct acpi_subtable_header *entry, } static int map_x2apic_id(struct acpi_subtable_header *entry, - int device_declaration, u32 acpi_id, phys_cpuid_t *apic_id) + int device_declaration, u32 acpi_id, phys_cpuid_t *apic_id, + bool ignore_disabled) { struct acpi_madt_local_x2apic *apic = container_of(entry, struct acpi_madt_local_x2apic, header); - if (!(apic->lapic_flags & ACPI_MADT_ENABLED)) + if (ignore_disabled && !(apic->lapic_flags & ACPI_MADT_ENABLED)) return -ENODEV; if (device_declaration && (apic->uid == acpi_id)) { @@ -65,12 +66,13 @@ static int map_x2apic_id(struct acpi_subtable_header *entry, } static int map_lsapic_id(struct acpi_subtable_header *entry, - int device_declaration, u32 acpi_id, phys_cpuid_t *apic_id) + int device_declaration, u32 acpi_id, phys_cpuid_t *apic_id, + bool ignore_disabled) { struct acpi_madt_local_sapic *lsapic = container_of(entry, struct acpi_madt_local_sapic, header); - if (!(lsapic->lapic_flags & ACPI_MADT_ENABLED)) + if (ignore_disabled && !(lsapic->lapic_flags & ACPI_MADT_ENABLED)) return -ENODEV; if (device_declaration) { @@ -87,12 +89,13 @@ static int map_lsapic_id(struct acpi_subtable_header *entry, * Retrieve the ARM CPU physical identifier (MPIDR) */ static int map_gicc_mpidr(struct acpi_subtable_header *entry, - int device_declaration, u32 acpi_id, phys_cpuid_t *mpidr) + int device_declaration, u32 acpi_id, phys_cpuid_t *mpidr, + bool ignore_disabled) { struct acpi_madt_generic_interrupt *gicc = container_of(entry, struct acpi_madt_generic_interrupt, header); - if (!(gicc->flags & ACPI_MADT_ENABLED)) + if (ignore_disabled && !(gicc->flags & ACPI_MADT_ENABLED)) return -ENODEV; /* device_declaration means Device object in DSDT, in the @@ -109,7 +112,7 @@ static int map_gicc_mpidr(struct acpi_subtable_header *entry, } static phys_cpuid_t map_madt_entry(struct acpi_table_madt *madt, - int type, u32 acpi_id) + int type, u32 acpi_id, bool ignore_disabled) { unsigned long madt_end, entry; phys_cpuid_t phys_id = PHYS_CPUID_INVALID; /* CPU hardware ID */ @@ -127,16 +130,20 @@ static phys_cpuid_t map_madt_entry(struct acpi_table_madt *madt, struct acpi_subtable_header *header = (struct acpi_subtable_header *)entry; if (header->type == ACPI_MADT_TYPE_LOCAL_APIC) { - if (!map_lapic_id(header, acpi_id, &phys_id)) + if (!map_lapic_id(header, acpi_id, &phys_id, + ignore_disabled)) break; } else if (header->type == ACPI_MADT_TYPE_LOCAL_X2APIC) { - if (!map_x2apic_id(header, type, acpi_id, &phys_id)) + if (!map_x2apic_id(header, type, acpi_id, &phys_id, + ignore_disabled)) break; } else if (header->type == ACPI_MADT_TYPE_LOCAL_SAPIC) { - if (!map_lsapic_id(header, type, acpi_id, &phys_id)) + if (!map_lsapic_id(header, type, acpi_id, &phys_id, + ignore_disabled)) break; } else if (header->type == ACPI_MADT_TYPE_GENERIC_INTERRUPT) { - if (!map_gicc_mpidr(header, type, acpi_id, &phys_id)) + if (!map_gicc_mpidr(header, type, acpi_id, &phys_id, + ignore_disabled)) break; } entry += header->length; @@ -156,14 +163,15 @@ phys_cpuid_t __init acpi_map_madt_entry(u32 acpi_id) if (!madt) return PHYS_CPUID_INVALID; - rv = map_madt_entry(madt, 1, acpi_id); + rv = map_madt_entry(madt, 1, acpi_id, true); early_acpi_os_unmap_memory(madt, tbl_size); return rv; } -static phys_cpuid_t map_mat_entry(acpi_handle handle, int type, u32 acpi_id) +static phys_cpuid_t map_mat_entry(acpi_handle handle, int type, u32 acpi_id, + bool ignore_disabled) { struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; union acpi_object *obj; @@ -184,30 +192,38 @@ static phys_cpuid_t map_mat_entry(acpi_handle handle, int type, u32 acpi_id) header = (struct acpi_subtable_header *)obj->buffer.pointer; if (header->type == ACPI_MADT_TYPE_LOCAL_APIC) - map_lapic_id(header, acpi_id, &phys_id); + map_lapic_id(header, acpi_id, &phys_id, ignore_disabled); else if (header->type == ACPI_MADT_TYPE_LOCAL_SAPIC) - map_lsapic_id(header, type, acpi_id, &phys_id); + map_lsapic_id(header, type, acpi_id, &phys_id, ignore_disabled); else if (header->type == ACPI_MADT_TYPE_LOCAL_X2APIC) - map_x2apic_id(header, type, acpi_id, &phys_id); + map_x2apic_id(header, type, acpi_id, &phys_id, ignore_disabled); else if (header->type == ACPI_MADT_TYPE_GENERIC_INTERRUPT) - map_gicc_mpidr(header, type, acpi_id, &phys_id); + map_gicc_mpidr(header, type, acpi_id, &phys_id, + ignore_disabled); exit: kfree(buffer.pointer); return phys_id; } -phys_cpuid_t acpi_get_phys_id(acpi_handle handle, int type, u32 acpi_id) +static phys_cpuid_t __acpi_get_phys_id(acpi_handle handle, int type, + u32 acpi_id, bool ignore_disabled) { phys_cpuid_t phys_id; - phys_id = map_mat_entry(handle, type, acpi_id); + phys_id = map_mat_entry(handle, type, acpi_id, ignore_disabled); if (invalid_phys_cpuid(phys_id)) - phys_id = map_madt_entry(get_madt_table(), type, acpi_id); + phys_id = map_madt_entry(get_madt_table(), type, acpi_id, + ignore_disabled); return phys_id; } +phys_cpuid_t acpi_get_phys_id(acpi_handle handle, int type, u32 acpi_id) +{ + return __acpi_get_phys_id(handle, type, acpi_id, true); +} + int acpi_map_cpuid(phys_cpuid_t phys_id, u32 acpi_id) { #ifdef CONFIG_SMP From dc6db24d2476cd09c0ecf2b8d80313539f737a89 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Thu, 25 Aug 2016 16:35:18 +0800 Subject: [PATCH 18/22] x86/acpi: Set persistent cpuid <-> nodeid mapping when booting The whole patch-set aims at making cpuid <-> nodeid mapping persistent. So that, when node online/offline happens, cache based on cpuid <-> nodeid mapping such as wq_numa_possible_cpumask will not cause any problem. It contains 4 steps: 1. Enable apic registeration flow to handle both enabled and disabled cpus. 2. Introduce a new array storing all possible cpuid <-> apicid mapping. 3. Enable _MAT and MADT relative apis to return non-present or disabled cpus' apicid. 4. Establish all possible cpuid <-> nodeid mapping. This patch finishes step 4. This patch set the persistent cpuid <-> nodeid mapping for all enabled/disabled processors at boot time via an additional acpi namespace walk for processors. [ tglx: Remove the unneeded exports ] Signed-off-by: Gu Zheng Signed-off-by: Tang Chen Signed-off-by: Zhu Guihua Signed-off-by: Dou Liyang Acked-by: Ingo Molnar Cc: mika.j.penttila@gmail.com Cc: len.brown@intel.com Cc: rafael@kernel.org Cc: rjw@rjwysocki.net Cc: yasu.isimatu@gmail.com Cc: linux-mm@kvack.org Cc: linux-acpi@vger.kernel.org Cc: isimatu.yasuaki@jp.fujitsu.com Cc: gongzhaogang@inspur.com Cc: tj@kernel.org Cc: izumi.taku@jp.fujitsu.com Cc: cl@linux.com Cc: chen.tang@easystack.cn Cc: akpm@linux-foundation.org Cc: kamezawa.hiroyu@jp.fujitsu.com Cc: lenb@kernel.org Link: http://lkml.kernel.org/r/1472114120-3281-6-git-send-email-douly.fnst@cn.fujitsu.com Signed-off-by: Thomas Gleixner --- arch/ia64/kernel/acpi.c | 2 +- arch/x86/kernel/acpi/boot.c | 3 +- drivers/acpi/acpi_processor.c | 5 +++ drivers/acpi/bus.c | 1 + drivers/acpi/processor_core.c | 68 +++++++++++++++++++++++++++++++++++ include/linux/acpi.h | 3 ++ 6 files changed, 80 insertions(+), 2 deletions(-) diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index 92b7bc956795..9273e034b730 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -796,7 +796,7 @@ int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi) * ACPI based hotplug CPU support */ #ifdef CONFIG_ACPI_HOTPLUG_CPU -static int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) +int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) { #ifdef CONFIG_ACPI_NUMA /* diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 7d668d172fce..fc8841016116 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -702,7 +702,7 @@ static void __init acpi_set_irq_model_ioapic(void) #ifdef CONFIG_ACPI_HOTPLUG_CPU #include -static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) +int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) { #ifdef CONFIG_ACPI_NUMA int nid; @@ -713,6 +713,7 @@ static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) numa_set_node(cpu, nid); } #endif + return 0; } int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu) diff --git a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c index 02b84aa69fa4..f9f23fdd96a1 100644 --- a/drivers/acpi/acpi_processor.c +++ b/drivers/acpi/acpi_processor.c @@ -182,6 +182,11 @@ int __weak arch_register_cpu(int cpu) void __weak arch_unregister_cpu(int cpu) {} +int __weak acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) +{ + return -ENODEV; +} + static int acpi_processor_hotadd_init(struct acpi_processor *pr) { unsigned long long sta; diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c index 85b7d07fe5c8..a760dac656ea 100644 --- a/drivers/acpi/bus.c +++ b/drivers/acpi/bus.c @@ -1193,6 +1193,7 @@ static int __init acpi_init(void) acpi_wakeup_device_init(); acpi_debugger_init(); acpi_setup_sb_notify_handler(); + acpi_set_processor_mapping(); return 0; } diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c index fd59ae871db3..88019766a59a 100644 --- a/drivers/acpi/processor_core.c +++ b/drivers/acpi/processor_core.c @@ -280,6 +280,74 @@ int acpi_get_cpuid(acpi_handle handle, int type, u32 acpi_id) } EXPORT_SYMBOL_GPL(acpi_get_cpuid); +#ifdef CONFIG_ACPI_HOTPLUG_CPU +static bool __init +map_processor(acpi_handle handle, phys_cpuid_t *phys_id, int *cpuid) +{ + int type; + u32 acpi_id; + acpi_status status; + acpi_object_type acpi_type; + unsigned long long tmp; + union acpi_object object = { 0 }; + struct acpi_buffer buffer = { sizeof(union acpi_object), &object }; + + status = acpi_get_type(handle, &acpi_type); + if (ACPI_FAILURE(status)) + return false; + + switch (acpi_type) { + case ACPI_TYPE_PROCESSOR: + status = acpi_evaluate_object(handle, NULL, NULL, &buffer); + if (ACPI_FAILURE(status)) + return false; + acpi_id = object.processor.proc_id; + break; + case ACPI_TYPE_DEVICE: + status = acpi_evaluate_integer(handle, "_UID", NULL, &tmp); + if (ACPI_FAILURE(status)) + return false; + acpi_id = tmp; + break; + default: + return false; + } + + type = (acpi_type == ACPI_TYPE_DEVICE) ? 1 : 0; + + *phys_id = __acpi_get_phys_id(handle, type, acpi_id, false); + *cpuid = acpi_map_cpuid(*phys_id, acpi_id); + if (*cpuid == -1) + return false; + + return true; +} + +static acpi_status __init +set_processor_node_mapping(acpi_handle handle, u32 lvl, void *context, + void **rv) +{ + phys_cpuid_t phys_id; + int cpu_id; + + if (!map_processor(handle, &phys_id, &cpu_id)) + return AE_ERROR; + + acpi_map_cpu2node(handle, cpu_id, phys_id); + return AE_OK; +} + +void __init acpi_set_processor_mapping(void) +{ + /* Set persistent cpu <-> node mapping for all processors. */ + acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, + ACPI_UINT32_MAX, set_processor_node_mapping, + NULL, NULL, NULL); +} +#else +void __init acpi_set_processor_mapping(void) {} +#endif /* CONFIG_ACPI_HOTPLUG_CPU */ + #ifdef CONFIG_ACPI_HOTPLUG_IOAPIC static int get_ioapic_id(struct acpi_subtable_header *entry, u32 gsi_base, u64 *phys_addr, int *ioapic_id) diff --git a/include/linux/acpi.h b/include/linux/acpi.h index c9a596b9535c..5b4f9accf96b 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -271,8 +271,11 @@ static inline bool invalid_phys_cpuid(phys_cpuid_t phys_id) /* Arch dependent functions for cpu hotplug support */ int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu); int acpi_unmap_cpu(int cpu); +int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid); #endif /* CONFIG_ACPI_HOTPLUG_CPU */ +void acpi_set_processor_mapping(void); + #ifdef CONFIG_ACPI_HOTPLUG_IOAPIC int acpi_get_ioapic_id(acpi_handle handle, u32 gsi_base, u64 *phys_addr); #endif From 8e089eaa1999def4bb954caa91941f29b0672b6a Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Thu, 25 Aug 2016 16:35:19 +0800 Subject: [PATCH 19/22] acpi: Provide mechanism to validate processors in the ACPI tables [Problem] When we set cpuid <-> nodeid mapping to be persistent, it will use the DSDT As we know, the ACPI tables are just like user's input in that respect, and we don't crash if user's input is unreasonable. Such as, the mapping of the proc_id and pxm in some machine's ACPI table is like this: proc_id | pxm -------------------- 0 <-> 0 1 <-> 0 2 <-> 1 3 <-> 1 89 <-> 0 89 <-> 0 89 <-> 0 89 <-> 1 89 <-> 1 89 <-> 2 89 <-> 3 ..... We can't be sure which one is correct to the proc_id 89. We may map a wrong node to a cpu. When pages are allocated, this may cause a kernal panic. So, we should provide mechanisms to validate the ACPI tables, just like we do validation to check user's input in web project. The mechanism is that the processor objects which have the duplicate IDs are not valid. [Solution] We add a validation function, like this: foreach Processor in DSDT proc_id = get_ACPI_Processor_number(Processor) if (proc_id exists ) mark both of them as being unreasonable; The function will record the unique or duplicate processor IDs. The duplicate processor IDs such as 89 are regarded as the unreasonable IDs which mean that the processor objects in question are not valid. [ tglx: Add __init[data] annotations ] Signed-off-by: Dou Liyang Acked-by: Ingo Molnar Cc: mika.j.penttila@gmail.com Cc: len.brown@intel.com Cc: rafael@kernel.org Cc: rjw@rjwysocki.net Cc: yasu.isimatu@gmail.com Cc: linux-mm@kvack.org Cc: linux-acpi@vger.kernel.org Cc: isimatu.yasuaki@jp.fujitsu.com Cc: gongzhaogang@inspur.com Cc: tj@kernel.org Cc: izumi.taku@jp.fujitsu.com Cc: cl@linux.com Cc: chen.tang@easystack.cn Cc: akpm@linux-foundation.org Cc: kamezawa.hiroyu@jp.fujitsu.com Cc: lenb@kernel.org Link: http://lkml.kernel.org/r/1472114120-3281-7-git-send-email-douly.fnst@cn.fujitsu.com Signed-off-by: Thomas Gleixner --- drivers/acpi/acpi_processor.c | 79 +++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c index f9f23fdd96a1..f27c709186c1 100644 --- a/drivers/acpi/acpi_processor.c +++ b/drivers/acpi/acpi_processor.c @@ -581,8 +581,87 @@ static struct acpi_scan_handler processor_container_handler = { .attach = acpi_processor_container_attach, }; +/* The number of the unique processor IDs */ +static int nr_unique_ids __initdata; + +/* The number of the duplicate processor IDs */ +static int nr_duplicate_ids __initdata; + +/* Used to store the unique processor IDs */ +static int unique_processor_ids[] __initdata = { + [0 ... NR_CPUS - 1] = -1, +}; + +/* Used to store the duplicate processor IDs */ +static int duplicate_processor_ids[] __initdata = { + [0 ... NR_CPUS - 1] = -1, +}; + +static void __init processor_validated_ids_update(int proc_id) +{ + int i; + + if (nr_unique_ids == NR_CPUS||nr_duplicate_ids == NR_CPUS) + return; + + /* + * Firstly, compare the proc_id with duplicate IDs, if the proc_id is + * already in the IDs, do nothing. + */ + for (i = 0; i < nr_duplicate_ids; i++) { + if (duplicate_processor_ids[i] == proc_id) + return; + } + + /* + * Secondly, compare the proc_id with unique IDs, if the proc_id is in + * the IDs, put it in the duplicate IDs. + */ + for (i = 0; i < nr_unique_ids; i++) { + if (unique_processor_ids[i] == proc_id) { + duplicate_processor_ids[nr_duplicate_ids] = proc_id; + nr_duplicate_ids++; + return; + } + } + + /* + * Lastly, the proc_id is a unique ID, put it in the unique IDs. + */ + unique_processor_ids[nr_unique_ids] = proc_id; + nr_unique_ids++; +} + +static acpi_status __init acpi_processor_ids_walk(acpi_handle handle, + u32 lvl, + void *context, + void **rv) +{ + acpi_status status; + union acpi_object object = { 0 }; + struct acpi_buffer buffer = { sizeof(union acpi_object), &object }; + + status = acpi_evaluate_object(handle, NULL, NULL, &buffer); + if (ACPI_FAILURE(status)) + acpi_handle_info(handle, "Not get the processor object\n"); + else + processor_validated_ids_update(object.processor.proc_id); + + return AE_OK; +} + +static void __init acpi_processor_check_duplicates(void) +{ + /* Search all processor nodes in ACPI namespace */ + acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, + ACPI_UINT32_MAX, + acpi_processor_ids_walk, + NULL, NULL, NULL); +} + void __init acpi_processor_init(void) { + acpi_processor_check_duplicates(); acpi_scan_add_handler_with_hotplug(&processor_handler, "processor"); acpi_scan_add_handler(&processor_container_handler); } From fd74da217df7d4bd25e95411da64e0b92762842e Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Thu, 25 Aug 2016 16:35:20 +0800 Subject: [PATCH 20/22] acpi: Validate processor id when mapping the processor When we want to identify whether the proc_id is unreasonable or not, we can call the "acpi_processor_validate_proc_id" function. It will search in the duplicate IDs. If we find the proc_id in the IDs, we return true to the call function. Conversely, the false represents available. When we establish all possible cpuid <-> nodeid mapping to handle the cpu hotplugs, we will use the proc_id from ACPI table. We do validation when we get the proc_id. If the result is true, we will stop the mapping. [ tglx: Mark the new function __init ] Signed-off-by: Dou Liyang Acked-by: Ingo Molnar Cc: mika.j.penttila@gmail.com Cc: len.brown@intel.com Cc: rafael@kernel.org Cc: rjw@rjwysocki.net Cc: yasu.isimatu@gmail.com Cc: linux-mm@kvack.org Cc: linux-acpi@vger.kernel.org Cc: isimatu.yasuaki@jp.fujitsu.com Cc: gongzhaogang@inspur.com Cc: tj@kernel.org Cc: izumi.taku@jp.fujitsu.com Cc: cl@linux.com Cc: chen.tang@easystack.cn Cc: akpm@linux-foundation.org Cc: kamezawa.hiroyu@jp.fujitsu.com Cc: lenb@kernel.org Link: http://lkml.kernel.org/r/1472114120-3281-8-git-send-email-douly.fnst@cn.fujitsu.com Signed-off-by: Thomas Gleixner --- drivers/acpi/acpi_processor.c | 15 +++++++++++++++ drivers/acpi/processor_core.c | 4 ++++ include/linux/acpi.h | 3 +++ 3 files changed, 22 insertions(+) diff --git a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c index f27c709186c1..3de3b6b8f0f1 100644 --- a/drivers/acpi/acpi_processor.c +++ b/drivers/acpi/acpi_processor.c @@ -659,6 +659,21 @@ static void __init acpi_processor_check_duplicates(void) NULL, NULL, NULL); } +bool __init acpi_processor_validate_proc_id(int proc_id) +{ + int i; + + /* + * compare the proc_id with duplicate IDs, if the proc_id is already + * in the duplicate IDs, return true, otherwise, return false. + */ + for (i = 0; i < nr_duplicate_ids; i++) { + if (duplicate_processor_ids[i] == proc_id) + return true; + } + return false; +} + void __init acpi_processor_init(void) { acpi_processor_check_duplicates(); diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c index 88019766a59a..9ac265f235b7 100644 --- a/drivers/acpi/processor_core.c +++ b/drivers/acpi/processor_core.c @@ -302,6 +302,10 @@ map_processor(acpi_handle handle, phys_cpuid_t *phys_id, int *cpuid) if (ACPI_FAILURE(status)) return false; acpi_id = object.processor.proc_id; + + /* validate the acpi_id */ + if(acpi_processor_validate_proc_id(acpi_id)) + return false; break; case ACPI_TYPE_DEVICE: status = acpi_evaluate_integer(handle, "_UID", NULL, &tmp); diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 5b4f9accf96b..7f307f3bd12c 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -267,6 +267,9 @@ static inline bool invalid_phys_cpuid(phys_cpuid_t phys_id) return phys_id == PHYS_CPUID_INVALID; } +/* Validate the processor object's proc_id */ +bool acpi_processor_validate_proc_id(int proc_id); + #ifdef CONFIG_ACPI_HOTPLUG_CPU /* Arch dependent functions for cpu hotplug support */ int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu); From c183a603e8d8a5a189729b77d0c623a3d5950e5f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 23 Sep 2016 17:08:04 +0200 Subject: [PATCH 21/22] acpi: Fix broken error check in map_processor() map_processor() checks the cpuid value returned by acpi_map_cpuid() for -1 but acpi_map_cpuid() returns -EINVAL in case of error. As a consequence the error is ignored and the following access into percpu data with that negative cpuid results in a boot crash. This happens always when NR_CPUS/nr_cpu_ids is smaller than the number of processors listed in the ACPI tables. Use a proper error check for id < 0 so the function returns instead of trying to map CPU#(-EINVAL). Reported-by: Ingo Molnar Signed-off-by: Thomas Gleixner Cc: Dou Liyang Cc: Gu Zheng Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Tang Chen Cc: Zhu Guihua Cc: akpm@linux-foundation.org Cc: chen.tang@easystack.cn Cc: cl@linux.com Cc: gongzhaogang@inspur.com Cc: isimatu.yasuaki@jp.fujitsu.com Cc: izumi.taku@jp.fujitsu.com Cc: kamezawa.hiroyu@jp.fujitsu.com Cc: len.brown@intel.com Cc: lenb@kernel.org Cc: linux-acpi@vger.kernel.org Cc: linux-mm@kvack.org Cc: mika.j.penttila@gmail.com Cc: rafael@kernel.org Cc: rjw@rjwysocki.net Cc: tj@kernel.org Cc: yasu.isimatu@gmail.com Fixes: dc6db24d2476 ("x86/acpi: Set persistent cpuid <-> nodeid mapping when booting") Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1609231705570.5640@nanos Signed-off-by: Ingo Molnar --- drivers/acpi/processor_core.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c index 9ac265f235b7..5c78ee1860b0 100644 --- a/drivers/acpi/processor_core.c +++ b/drivers/acpi/processor_core.c @@ -284,7 +284,7 @@ EXPORT_SYMBOL_GPL(acpi_get_cpuid); static bool __init map_processor(acpi_handle handle, phys_cpuid_t *phys_id, int *cpuid) { - int type; + int type, id; u32 acpi_id; acpi_status status; acpi_object_type acpi_type; @@ -320,10 +320,11 @@ map_processor(acpi_handle handle, phys_cpuid_t *phys_id, int *cpuid) type = (acpi_type == ACPI_TYPE_DEVICE) ? 1 : 0; *phys_id = __acpi_get_phys_id(handle, type, acpi_id, false); - *cpuid = acpi_map_cpuid(*phys_id, acpi_id); - if (*cpuid == -1) - return false; + id = acpi_map_cpuid(*phys_id, acpi_id); + if (id < 0) + return false; + *cpuid = id; return true; } From eb6296dec19f304199ae389e6863ecc1fc74b7c7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 26 Sep 2016 15:48:33 -0400 Subject: [PATCH 22/22] x86/apic: Fix silent & fatal merge conflict in __generic_processor_info() Fix up the silent merge conflict between commit c291b0151585 in x86/urgent and commit f7c28833c2520 in x86/apic which both remove num_processors++ from the original location and then add it at two different locations. As a result num_processors is incremented twice which can cut the number of available cpus in half. Remove the one which is added by commit c291b0151585. In hindsight I should have merged x86/urgent into x86/apic _before_ adding the nodeid bits, but in hindsight we are always smarter. Reported-and-tested-by: Borislav Petkov Reported-by: Mike Galbraith Fixes: 1e1b37273cf7 ("Merge branch 'x86/urgent' into x86/apic") Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1609261350090.5483@nanos Cc: Dou Liyang Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/apic.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 46bb29958509..f266b8a92a9e 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2171,8 +2171,6 @@ int __generic_processor_info(int apicid, int version, bool enabled) return -ENOSPC; } - num_processors++; - /* * Validate version */