linux-stable/drivers/base/arch_numa.c

376 lines
8.2 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0-only
/*
* NUMA support, based on the x86 implementation.
*
* Copyright (C) 2015 Cavium Inc.
* Author: Ganapatrao Kulkarni <gkulkarni@cavium.com>
*/
#define pr_fmt(fmt) "NUMA: " fmt
#include <linux/acpi.h>
#include <linux/memblock.h>
#include <linux/module.h>
#include <linux/of.h>
arch_numa: switch over to numa_memblks Until now arch_numa was directly translating firmware NUMA information to memblock. Using numa_memblks as an intermediate step has a few advantages: * alignment with more battle tested x86 implementation * availability of NUMA emulation * maintaining node information for not yet populated memory Adjust a few places in numa_memblks to compile with 32-bit phys_addr_t and replace current functionality related to numa_add_memblk() and __node_distance() in arch_numa with the implementation based on numa_memblks and add functions required by numa_emulation. [rppt@kernel.org: fix section mismatch] Link: https://lkml.kernel.org/r/ZrO6cExVz1He_yPn@kernel.org [rppt@kernel.org: PFN_PHYS() translation is unnecessary here] Link: https://lkml.kernel.org/r/Zs2T5wkSYO9MGcab@kernel.org Link: https://lkml.kernel.org/r/20240807064110.1003856-25-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org> Tested-by: Zi Yan <ziy@nvidia.com> # for x86_64 and arm64 Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> Tested-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> [arm64 + CXL via QEMU] Acked-by: Dan Williams <dan.j.williams@intel.com> Acked-by: David Hildenbrand <david@redhat.com> Cc: Alexander Gordeev <agordeev@linux.ibm.com> Cc: Andreas Larsson <andreas@gaisler.com> Cc: Arnd Bergmann <arnd@arndb.de> Cc: Borislav Petkov <bp@alien8.de> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Christophe Leroy <christophe.leroy@csgroup.eu> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: David S. Miller <davem@davemloft.net> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Heiko Carstens <hca@linux.ibm.com> Cc: Huacai Chen <chenhuacai@kernel.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jiaxun Yang <jiaxun.yang@flygoat.com> Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Palmer Dabbelt <palmer@dabbelt.com> Cc: Rafael J. Wysocki <rafael@kernel.org> Cc: Rob Herring (Arm) <robh@kernel.org> Cc: Samuel Holland <samuel.holland@sifive.com> Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Vasily Gorbik <gor@linux.ibm.com> Cc: Will Deacon <will@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-07 06:41:08 +00:00
#include <linux/numa_memblks.h>
#include <asm/sections.h>
static int cpu_to_node_map[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE };
bool numa_off;
static __init int numa_parse_early_param(char *opt)
{
if (!opt)
return -EINVAL;
if (str_has_prefix(opt, "off"))
numa_off = true;
arch_numa: switch over to numa_memblks Until now arch_numa was directly translating firmware NUMA information to memblock. Using numa_memblks as an intermediate step has a few advantages: * alignment with more battle tested x86 implementation * availability of NUMA emulation * maintaining node information for not yet populated memory Adjust a few places in numa_memblks to compile with 32-bit phys_addr_t and replace current functionality related to numa_add_memblk() and __node_distance() in arch_numa with the implementation based on numa_memblks and add functions required by numa_emulation. [rppt@kernel.org: fix section mismatch] Link: https://lkml.kernel.org/r/ZrO6cExVz1He_yPn@kernel.org [rppt@kernel.org: PFN_PHYS() translation is unnecessary here] Link: https://lkml.kernel.org/r/Zs2T5wkSYO9MGcab@kernel.org Link: https://lkml.kernel.org/r/20240807064110.1003856-25-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org> Tested-by: Zi Yan <ziy@nvidia.com> # for x86_64 and arm64 Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> Tested-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> [arm64 + CXL via QEMU] Acked-by: Dan Williams <dan.j.williams@intel.com> Acked-by: David Hildenbrand <david@redhat.com> Cc: Alexander Gordeev <agordeev@linux.ibm.com> Cc: Andreas Larsson <andreas@gaisler.com> Cc: Arnd Bergmann <arnd@arndb.de> Cc: Borislav Petkov <bp@alien8.de> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Christophe Leroy <christophe.leroy@csgroup.eu> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: David S. Miller <davem@davemloft.net> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Heiko Carstens <hca@linux.ibm.com> Cc: Huacai Chen <chenhuacai@kernel.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jiaxun Yang <jiaxun.yang@flygoat.com> Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Palmer Dabbelt <palmer@dabbelt.com> Cc: Rafael J. Wysocki <rafael@kernel.org> Cc: Rob Herring (Arm) <robh@kernel.org> Cc: Samuel Holland <samuel.holland@sifive.com> Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Vasily Gorbik <gor@linux.ibm.com> Cc: Will Deacon <will@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-07 06:41:08 +00:00
if (!strncmp(opt, "fake=", 5))
return numa_emu_cmdline(opt + 5);
return 0;
}
early_param("numa", numa_parse_early_param);
cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
EXPORT_SYMBOL(node_to_cpumask_map);
#ifdef CONFIG_DEBUG_PER_CPU_MAPS
/*
* Returns a pointer to the bitmask of CPUs on Node 'node'.
*/
const struct cpumask *cpumask_of_node(int node)
{
if (node == NUMA_NO_NODE)
return cpu_all_mask;
if (WARN_ON(node < 0 || node >= nr_node_ids))
return cpu_none_mask;
if (WARN_ON(node_to_cpumask_map[node] == NULL))
return cpu_online_mask;
return node_to_cpumask_map[node];
}
EXPORT_SYMBOL(cpumask_of_node);
#endif
arch_numa: switch over to numa_memblks Until now arch_numa was directly translating firmware NUMA information to memblock. Using numa_memblks as an intermediate step has a few advantages: * alignment with more battle tested x86 implementation * availability of NUMA emulation * maintaining node information for not yet populated memory Adjust a few places in numa_memblks to compile with 32-bit phys_addr_t and replace current functionality related to numa_add_memblk() and __node_distance() in arch_numa with the implementation based on numa_memblks and add functions required by numa_emulation. [rppt@kernel.org: fix section mismatch] Link: https://lkml.kernel.org/r/ZrO6cExVz1He_yPn@kernel.org [rppt@kernel.org: PFN_PHYS() translation is unnecessary here] Link: https://lkml.kernel.org/r/Zs2T5wkSYO9MGcab@kernel.org Link: https://lkml.kernel.org/r/20240807064110.1003856-25-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org> Tested-by: Zi Yan <ziy@nvidia.com> # for x86_64 and arm64 Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> Tested-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> [arm64 + CXL via QEMU] Acked-by: Dan Williams <dan.j.williams@intel.com> Acked-by: David Hildenbrand <david@redhat.com> Cc: Alexander Gordeev <agordeev@linux.ibm.com> Cc: Andreas Larsson <andreas@gaisler.com> Cc: Arnd Bergmann <arnd@arndb.de> Cc: Borislav Petkov <bp@alien8.de> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Christophe Leroy <christophe.leroy@csgroup.eu> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: David S. Miller <davem@davemloft.net> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Heiko Carstens <hca@linux.ibm.com> Cc: Huacai Chen <chenhuacai@kernel.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jiaxun Yang <jiaxun.yang@flygoat.com> Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Palmer Dabbelt <palmer@dabbelt.com> Cc: Rafael J. Wysocki <rafael@kernel.org> Cc: Rob Herring (Arm) <robh@kernel.org> Cc: Samuel Holland <samuel.holland@sifive.com> Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Vasily Gorbik <gor@linux.ibm.com> Cc: Will Deacon <will@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-07 06:41:08 +00:00
#ifndef CONFIG_NUMA_EMU
static void numa_update_cpu(unsigned int cpu, bool remove)
{
int nid = cpu_to_node(cpu);
if (nid == NUMA_NO_NODE)
return;
if (remove)
cpumask_clear_cpu(cpu, node_to_cpumask_map[nid]);
else
cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
}
void numa_add_cpu(unsigned int cpu)
{
numa_update_cpu(cpu, false);
}
void numa_remove_cpu(unsigned int cpu)
{
numa_update_cpu(cpu, true);
}
arch_numa: switch over to numa_memblks Until now arch_numa was directly translating firmware NUMA information to memblock. Using numa_memblks as an intermediate step has a few advantages: * alignment with more battle tested x86 implementation * availability of NUMA emulation * maintaining node information for not yet populated memory Adjust a few places in numa_memblks to compile with 32-bit phys_addr_t and replace current functionality related to numa_add_memblk() and __node_distance() in arch_numa with the implementation based on numa_memblks and add functions required by numa_emulation. [rppt@kernel.org: fix section mismatch] Link: https://lkml.kernel.org/r/ZrO6cExVz1He_yPn@kernel.org [rppt@kernel.org: PFN_PHYS() translation is unnecessary here] Link: https://lkml.kernel.org/r/Zs2T5wkSYO9MGcab@kernel.org Link: https://lkml.kernel.org/r/20240807064110.1003856-25-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org> Tested-by: Zi Yan <ziy@nvidia.com> # for x86_64 and arm64 Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> Tested-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> [arm64 + CXL via QEMU] Acked-by: Dan Williams <dan.j.williams@intel.com> Acked-by: David Hildenbrand <david@redhat.com> Cc: Alexander Gordeev <agordeev@linux.ibm.com> Cc: Andreas Larsson <andreas@gaisler.com> Cc: Arnd Bergmann <arnd@arndb.de> Cc: Borislav Petkov <bp@alien8.de> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Christophe Leroy <christophe.leroy@csgroup.eu> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: David S. Miller <davem@davemloft.net> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Heiko Carstens <hca@linux.ibm.com> Cc: Huacai Chen <chenhuacai@kernel.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jiaxun Yang <jiaxun.yang@flygoat.com> Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Palmer Dabbelt <palmer@dabbelt.com> Cc: Rafael J. Wysocki <rafael@kernel.org> Cc: Rob Herring (Arm) <robh@kernel.org> Cc: Samuel Holland <samuel.holland@sifive.com> Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Vasily Gorbik <gor@linux.ibm.com> Cc: Will Deacon <will@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-07 06:41:08 +00:00
#endif
void numa_clear_node(unsigned int cpu)
{
numa_remove_cpu(cpu);
set_cpu_numa_node(cpu, NUMA_NO_NODE);
}
/*
* Allocate node_to_cpumask_map based on number of available nodes
* Requires node_possible_map to be valid.
*
* Note: cpumask_of_node() is not valid until after this is done.
* (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
*/
static void __init setup_node_to_cpumask_map(void)
{
int node;
/* setup nr_node_ids if not done yet */
if (nr_node_ids == MAX_NUMNODES)
setup_nr_node_ids();
/* allocate and clear the mapping */
for (node = 0; node < nr_node_ids; node++) {
alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
cpumask_clear(node_to_cpumask_map[node]);
}
/* cpumask_of_node() will now work */
pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
}
/*
* Set the cpu to node and mem mapping
*/
void numa_store_cpu_info(unsigned int cpu)
{
set_cpu_numa_node(cpu, cpu_to_node_map[cpu]);
}
void __init early_map_cpu_to_node(unsigned int cpu, int nid)
{
/* fallback to node 0 */
if (nid < 0 || nid >= MAX_NUMNODES || numa_off)
nid = 0;
cpu_to_node_map[cpu] = nid;
/*
* We should set the numa node of cpu0 as soon as possible, because it
* has already been set up online before. cpu_to_node(0) will soon be
* called.
*/
if (!cpu)
set_cpu_numa_node(cpu, nid);
}
#ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);
arch_numa: switch over to numa_memblks Until now arch_numa was directly translating firmware NUMA information to memblock. Using numa_memblks as an intermediate step has a few advantages: * alignment with more battle tested x86 implementation * availability of NUMA emulation * maintaining node information for not yet populated memory Adjust a few places in numa_memblks to compile with 32-bit phys_addr_t and replace current functionality related to numa_add_memblk() and __node_distance() in arch_numa with the implementation based on numa_memblks and add functions required by numa_emulation. [rppt@kernel.org: fix section mismatch] Link: https://lkml.kernel.org/r/ZrO6cExVz1He_yPn@kernel.org [rppt@kernel.org: PFN_PHYS() translation is unnecessary here] Link: https://lkml.kernel.org/r/Zs2T5wkSYO9MGcab@kernel.org Link: https://lkml.kernel.org/r/20240807064110.1003856-25-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org> Tested-by: Zi Yan <ziy@nvidia.com> # for x86_64 and arm64 Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> Tested-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> [arm64 + CXL via QEMU] Acked-by: Dan Williams <dan.j.williams@intel.com> Acked-by: David Hildenbrand <david@redhat.com> Cc: Alexander Gordeev <agordeev@linux.ibm.com> Cc: Andreas Larsson <andreas@gaisler.com> Cc: Arnd Bergmann <arnd@arndb.de> Cc: Borislav Petkov <bp@alien8.de> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Christophe Leroy <christophe.leroy@csgroup.eu> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: David S. Miller <davem@davemloft.net> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Heiko Carstens <hca@linux.ibm.com> Cc: Huacai Chen <chenhuacai@kernel.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jiaxun Yang <jiaxun.yang@flygoat.com> Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Palmer Dabbelt <palmer@dabbelt.com> Cc: Rafael J. Wysocki <rafael@kernel.org> Cc: Rob Herring (Arm) <robh@kernel.org> Cc: Samuel Holland <samuel.holland@sifive.com> Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Vasily Gorbik <gor@linux.ibm.com> Cc: Will Deacon <will@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-07 06:41:08 +00:00
int early_cpu_to_node(int cpu)
{
return cpu_to_node_map[cpu];
}
static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
{
arm64/numa: fix pcpu_cpu_distance() to get correct CPU proximity The pcpu_build_alloc_info() function group CPUs according to their proximity, by call callback function @cpu_distance_fn from different ARCHs. For arm64 the callback of @cpu_distance_fn is pcpu_cpu_distance(from, to) -> node_distance(from, to) The @from and @to for function node_distance() should be nid. However, pcpu_cpu_distance() in arch/arm64/mm/numa.c just past the cpu id for @from and @to, and didn't convert to numa node id. For this incorrect cpu proximity get from ARCH, it may cause each CPU in one group and make group_cnt out of bound: setup_per_cpu_areas() pcpu_embed_first_chunk() pcpu_build_alloc_info() in pcpu_build_alloc_info, since cpu_distance_fn will return REMOTE_DISTANCE if we pass cpu ids (0,1,2...), so cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE will wrongly be ture. This may results in triggering the BUG_ON(unit != nr_units) later: [ 0.000000] kernel BUG at mm/percpu.c:1916! [ 0.000000] Internal error: Oops - BUG: 0 [#1] PREEMPT SMP [ 0.000000] Modules linked in: [ 0.000000] CPU: 0 PID: 0 Comm: swapper Not tainted 4.9.0-rc1-00003-g14155ca-dirty #26 [ 0.000000] Hardware name: Hisilicon Hi1616 Evaluation Board (DT) [ 0.000000] task: ffff000008d6e900 task.stack: ffff000008d60000 [ 0.000000] PC is at pcpu_embed_first_chunk+0x420/0x704 [ 0.000000] LR is at pcpu_embed_first_chunk+0x3bc/0x704 [ 0.000000] pc : [<ffff000008c754f4>] lr : [<ffff000008c75490>] pstate: 800000c5 [ 0.000000] sp : ffff000008d63eb0 [ 0.000000] x29: ffff000008d63eb0 [ 0.000000] x28: 0000000000000000 [ 0.000000] x27: 0000000000000040 [ 0.000000] x26: ffff8413fbfcef00 [ 0.000000] x25: 0000000000000042 [ 0.000000] x24: 0000000000000042 [ 0.000000] x23: 0000000000001000 [ 0.000000] x22: 0000000000000046 [ 0.000000] x21: 0000000000000001 [ 0.000000] x20: ffff000008cb3bc8 [ 0.000000] x19: ffff8413fbfcf570 [ 0.000000] x18: 0000000000000000 [ 0.000000] x17: ffff000008e49ae0 [ 0.000000] x16: 0000000000000003 [ 0.000000] x15: 000000000000001e [ 0.000000] x14: 0000000000000004 [ 0.000000] x13: 0000000000000000 [ 0.000000] x12: 000000000000006f [ 0.000000] x11: 00000413fbffff00 [ 0.000000] x10: 0000000000000004 [ 0.000000] x9 : 0000000000000000 [ 0.000000] x8 : 0000000000000001 [ 0.000000] x7 : ffff8413fbfcf63c [ 0.000000] x6 : ffff000008d65d28 [ 0.000000] x5 : ffff000008d65e50 [ 0.000000] x4 : 0000000000000000 [ 0.000000] x3 : ffff000008cb3cc8 [ 0.000000] x2 : 0000000000000040 [ 0.000000] x1 : 0000000000000040 [ 0.000000] x0 : 0000000000000000 [...] [ 0.000000] Call trace: [ 0.000000] Exception stack(0xffff000008d63ce0 to 0xffff000008d63e10) [ 0.000000] 3ce0: ffff8413fbfcf570 0001000000000000 ffff000008d63eb0 ffff000008c754f4 [ 0.000000] 3d00: ffff000008d63d50 ffff0000081af210 00000413fbfff010 0000000000001000 [ 0.000000] 3d20: ffff000008d63d50 ffff0000081af220 00000413fbfff010 0000000000001000 [ 0.000000] 3d40: 00000413fbfcef00 0000000000000004 ffff000008d63db0 ffff0000081af390 [ 0.000000] 3d60: 00000413fbfcef00 0000000000001000 0000000000000000 0000000000001000 [ 0.000000] 3d80: 0000000000000000 0000000000000040 0000000000000040 ffff000008cb3cc8 [ 0.000000] 3da0: 0000000000000000 ffff000008d65e50 ffff000008d65d28 ffff8413fbfcf63c [ 0.000000] 3dc0: 0000000000000001 0000000000000000 0000000000000004 00000413fbffff00 [ 0.000000] 3de0: 000000000000006f 0000000000000000 0000000000000004 000000000000001e [ 0.000000] 3e00: 0000000000000003 ffff000008e49ae0 [ 0.000000] [<ffff000008c754f4>] pcpu_embed_first_chunk+0x420/0x704 [ 0.000000] [<ffff000008c6658c>] setup_per_cpu_areas+0x38/0xc8 [ 0.000000] [<ffff000008c608d8>] start_kernel+0x10c/0x390 [ 0.000000] [<ffff000008c601d8>] __primary_switched+0x5c/0x64 [ 0.000000] Code: b8018660 17ffffd7 6b16037f 54000080 (d4210000) [ 0.000000] ---[ end trace 0000000000000000 ]--- [ 0.000000] Kernel panic - not syncing: Attempted to kill the idle task! Fix by getting cpu's node id with early_cpu_to_node() then pass it to node_distance() as the original intention. Fixes: 7af3a0a99252 ("arm64/numa: support HAVE_SETUP_PER_CPU_AREA") Signed-off-by: Yisheng Xie <xieyisheng1@huawei.com> Signed-off-by: Hanjun Guo <hanjun.guo@linaro.org> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com> Cc: Will Deacon <will.deacon@arm.com> Cc: Zhen Lei <thunder.leizhen@huawei.com> Signed-off-by: Will Deacon <will.deacon@arm.com>
2016-10-21 08:13:55 +00:00
return node_distance(early_cpu_to_node(from), early_cpu_to_node(to));
}
void __init setup_per_cpu_areas(void)
{
unsigned long delta;
unsigned int cpu;
int rc = -EINVAL;
if (pcpu_chosen_fc != PCPU_FC_PAGE) {
/*
* Always reserve area for module percpu variables. That's
* what the legacy allocator did.
*/
rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
PERCPU_DYNAMIC_RESERVE, PAGE_SIZE,
pcpu_cpu_distance,
early_cpu_to_node);
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
if (rc < 0)
pr_warn("PERCPU: %s allocator failed (%d), falling back to page size\n",
pcpu_fc_names[pcpu_chosen_fc], rc);
#endif
}
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
if (rc < 0)
rc = pcpu_page_first_chunk(PERCPU_MODULE_RESERVE, early_cpu_to_node);
#endif
if (rc < 0)
panic("Failed to initialize percpu areas (err=%d).", rc);
delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
for_each_possible_cpu(cpu)
__per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
}
#endif
/*
* Initialize NODE_DATA for a node on the local memory
*/
static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
{
arm64/numa: Drop duplicate message When booting linux on a system without CONFIG_NUMA enabled, the following messages are printed during boot - NUMA: Faking a node at [mem 0x0000000000000000-0x00000083ffffffff] NUMA: Adding memblock [0x8000000000 - 0x8000e7ffff] on node 0 NUMA: Adding memblock [0x8000e80000 - 0x83f65cffff] on node 0 NUMA: Adding memblock [0x83f65d0000 - 0x83f665ffff] on node 0 NUMA: Adding memblock [0x83f6660000 - 0x83f676ffff] on node 0 NUMA: Adding memblock [0x83f6770000 - 0x83f678ffff] on node 0 NUMA: Adding memblock [0x83f6790000 - 0x83fb82ffff] on node 0 NUMA: Adding memblock [0x83fb830000 - 0x83fbc0ffff] on node 0 NUMA: Adding memblock [0x83fbc10000 - 0x83fbdfffff] on node 0 NUMA: Adding memblock [0x83fbe00000 - 0x83fbffffff] on node 0 NUMA: Adding memblock [0x83fc000000 - 0x83fffbffff] on node 0 NUMA: Adding memblock [0x83fffc0000 - 0x83fffdffff] on node 0 NUMA: Adding memblock [0x83fffe0000 - 0x83ffffffff] on node 0 NUMA: Initmem setup node 0 [mem 0x8000000000-0x83ffffffff] NUMA: NODE_DATA [mem 0x83fffec500-0x83fffedfff] The information is then duplicated by core kernel messages right after the above output. Early memory node ranges node 0: [mem 0x0000008000000000-0x0000008000e7ffff] node 0: [mem 0x0000008000e80000-0x00000083f65cffff] node 0: [mem 0x00000083f65d0000-0x00000083f665ffff] node 0: [mem 0x00000083f6660000-0x00000083f676ffff] node 0: [mem 0x00000083f6770000-0x00000083f678ffff] node 0: [mem 0x00000083f6790000-0x00000083fb82ffff] node 0: [mem 0x00000083fb830000-0x00000083fbc0ffff] node 0: [mem 0x00000083fbc10000-0x00000083fbdfffff] node 0: [mem 0x00000083fbe00000-0x00000083fbffffff] node 0: [mem 0x00000083fc000000-0x00000083fffbffff] node 0: [mem 0x00000083fffc0000-0x00000083fffdffff] node 0: [mem 0x00000083fffe0000-0x00000083ffffffff] Initmem setup node 0 [mem 0x0000008000000000-0x00000083ffffffff] Remove the duplication of memblock layout information printed during boot by dropping the messages from arm64 numa initialisation. Signed-off-by: Punit Agrawal <punit.agrawal@arm.com> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Will Deacon <will.deacon@arm.com> Signed-off-by: Will Deacon <will.deacon@arm.com>
2017-07-20 11:03:59 +00:00
if (start_pfn >= end_pfn)
pr_info("Initmem setup node %d [<memory-less node>]\n", nid);
arch, mm: pull out allocation of NODE_DATA to generic code Architectures that support NUMA duplicate the code that allocates NODE_DATA on the node-local memory with slight variations in reporting of the addresses where the memory was allocated. Use x86 version as the basis for the generic alloc_node_data() function and call this function in architecture specific numa initialization. Round up node data size to SMP_CACHE_BYTES rather than to PAGE_SIZE like x86 used to do since the bootmem era when allocation granularity was PAGE_SIZE anyway. Link: https://lkml.kernel.org/r/20240807064110.1003856-10-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org> Acked-by: David Hildenbrand <david@redhat.com> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> Tested-by: Zi Yan <ziy@nvidia.com> # for x86_64 and arm64 Tested-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> [arm64 + CXL via QEMU] Acked-by: Dan Williams <dan.j.williams@intel.com> Cc: Alexander Gordeev <agordeev@linux.ibm.com> Cc: Andreas Larsson <andreas@gaisler.com> Cc: Arnd Bergmann <arnd@arndb.de> Cc: Borislav Petkov <bp@alien8.de> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Christophe Leroy <christophe.leroy@csgroup.eu> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: David S. Miller <davem@davemloft.net> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Heiko Carstens <hca@linux.ibm.com> Cc: Huacai Chen <chenhuacai@kernel.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jiaxun Yang <jiaxun.yang@flygoat.com> Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Palmer Dabbelt <palmer@dabbelt.com> Cc: Rafael J. Wysocki <rafael@kernel.org> Cc: Rob Herring (Arm) <robh@kernel.org> Cc: Samuel Holland <samuel.holland@sifive.com> Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Vasily Gorbik <gor@linux.ibm.com> Cc: Will Deacon <will@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-07 06:40:53 +00:00
alloc_node_data(nid);
NODE_DATA(nid)->node_id = nid;
NODE_DATA(nid)->node_start_pfn = start_pfn;
NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;
}
static int __init numa_register_nodes(void)
{
int nid;
/* Finally register nodes. */
for_each_node_mask(nid, numa_nodes_parsed) {
unsigned long start_pfn, end_pfn;
get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
setup_node_data(nid, start_pfn, end_pfn);
node_set_online(nid);
}
/* Setup online nodes to actual nodes*/
node_possible_map = numa_nodes_parsed;
return 0;
}
static int __init numa_init(int (*init_func)(void))
{
int ret;
nodes_clear(numa_nodes_parsed);
nodes_clear(node_possible_map);
nodes_clear(node_online_map);
arch_numa: switch over to numa_memblks Until now arch_numa was directly translating firmware NUMA information to memblock. Using numa_memblks as an intermediate step has a few advantages: * alignment with more battle tested x86 implementation * availability of NUMA emulation * maintaining node information for not yet populated memory Adjust a few places in numa_memblks to compile with 32-bit phys_addr_t and replace current functionality related to numa_add_memblk() and __node_distance() in arch_numa with the implementation based on numa_memblks and add functions required by numa_emulation. [rppt@kernel.org: fix section mismatch] Link: https://lkml.kernel.org/r/ZrO6cExVz1He_yPn@kernel.org [rppt@kernel.org: PFN_PHYS() translation is unnecessary here] Link: https://lkml.kernel.org/r/Zs2T5wkSYO9MGcab@kernel.org Link: https://lkml.kernel.org/r/20240807064110.1003856-25-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org> Tested-by: Zi Yan <ziy@nvidia.com> # for x86_64 and arm64 Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> Tested-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> [arm64 + CXL via QEMU] Acked-by: Dan Williams <dan.j.williams@intel.com> Acked-by: David Hildenbrand <david@redhat.com> Cc: Alexander Gordeev <agordeev@linux.ibm.com> Cc: Andreas Larsson <andreas@gaisler.com> Cc: Arnd Bergmann <arnd@arndb.de> Cc: Borislav Petkov <bp@alien8.de> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Christophe Leroy <christophe.leroy@csgroup.eu> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: David S. Miller <davem@davemloft.net> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Heiko Carstens <hca@linux.ibm.com> Cc: Huacai Chen <chenhuacai@kernel.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jiaxun Yang <jiaxun.yang@flygoat.com> Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Palmer Dabbelt <palmer@dabbelt.com> Cc: Rafael J. Wysocki <rafael@kernel.org> Cc: Rob Herring (Arm) <robh@kernel.org> Cc: Samuel Holland <samuel.holland@sifive.com> Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Vasily Gorbik <gor@linux.ibm.com> Cc: Will Deacon <will@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-07 06:41:08 +00:00
ret = numa_memblks_init(init_func, /* memblock_force_top_down */ false);
if (ret < 0)
goto out_free_distance;
if (nodes_empty(numa_nodes_parsed)) {
pr_info("No NUMA configuration found\n");
ret = -EINVAL;
goto out_free_distance;
}
ret = numa_register_nodes();
if (ret < 0)
goto out_free_distance;
setup_node_to_cpumask_map();
return 0;
out_free_distance:
arch_numa: switch over to numa_memblks Until now arch_numa was directly translating firmware NUMA information to memblock. Using numa_memblks as an intermediate step has a few advantages: * alignment with more battle tested x86 implementation * availability of NUMA emulation * maintaining node information for not yet populated memory Adjust a few places in numa_memblks to compile with 32-bit phys_addr_t and replace current functionality related to numa_add_memblk() and __node_distance() in arch_numa with the implementation based on numa_memblks and add functions required by numa_emulation. [rppt@kernel.org: fix section mismatch] Link: https://lkml.kernel.org/r/ZrO6cExVz1He_yPn@kernel.org [rppt@kernel.org: PFN_PHYS() translation is unnecessary here] Link: https://lkml.kernel.org/r/Zs2T5wkSYO9MGcab@kernel.org Link: https://lkml.kernel.org/r/20240807064110.1003856-25-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org> Tested-by: Zi Yan <ziy@nvidia.com> # for x86_64 and arm64 Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> Tested-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> [arm64 + CXL via QEMU] Acked-by: Dan Williams <dan.j.williams@intel.com> Acked-by: David Hildenbrand <david@redhat.com> Cc: Alexander Gordeev <agordeev@linux.ibm.com> Cc: Andreas Larsson <andreas@gaisler.com> Cc: Arnd Bergmann <arnd@arndb.de> Cc: Borislav Petkov <bp@alien8.de> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Christophe Leroy <christophe.leroy@csgroup.eu> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: David S. Miller <davem@davemloft.net> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Heiko Carstens <hca@linux.ibm.com> Cc: Huacai Chen <chenhuacai@kernel.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jiaxun Yang <jiaxun.yang@flygoat.com> Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Palmer Dabbelt <palmer@dabbelt.com> Cc: Rafael J. Wysocki <rafael@kernel.org> Cc: Rob Herring (Arm) <robh@kernel.org> Cc: Samuel Holland <samuel.holland@sifive.com> Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Vasily Gorbik <gor@linux.ibm.com> Cc: Will Deacon <will@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-07 06:41:08 +00:00
numa_reset_distance();
return ret;
}
/**
* dummy_numa_init() - Fallback dummy NUMA init
*
* Used if there's no underlying NUMA architecture, NUMA initialization
* fails, or NUMA is disabled on the command line.
*
* Must online at least one node (node 0) and add memory blocks that cover all
* allowed memory. It is unlikely that this function fails.
*
* Return: 0 on success, -errno on failure.
*/
static int __init dummy_numa_init(void)
{
arm64: numa: simplify dummy_numa_init() dummy_numa_init() loops over memblock.memory and passes nid=0 to numa_add_memblk() which essentially wraps memblock_set_node(). However, memblock_set_node() can cope with entire memory span itself, so the loop over memblock.memory regions is redundant. Using a single call to memblock_set_node() rather than a loop also fixes an issue with a buggy ACPI firmware in which the SRAT table covers some but not all of the memory in the EFI memory map. Jonathan Cameron says: This issue can be easily triggered by having an SRAT table which fails to cover all elements of the EFI memory map. This firmware error is detected and a warning printed. e.g. "NUMA: Warning: invalid memblk node 64 [mem 0x240000000-0x27fffffff]" At that point we fall back to dummy_numa_init(). However, the failed ACPI init has left us with our memblocks all broken up as we split them when trying to assign them to NUMA nodes. We then iterate over the memblocks and add them to node 0. numa_add_memblk() calls memblock_set_node() which merges regions that were previously split up during the earlier attempt to add them to different nodes during parsing of SRAT. This means elements are moved in the memblock array and we can end up in a different memblock after the call to numa_add_memblk(). Result is: Unable to handle kernel paging request at virtual address 0000000000003a40 Mem abort info: ESR = 0x96000004 EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 Data abort info: ISV = 0, ISS = 0x00000004 CM = 0, WnR = 0 [0000000000003a40] user address but active_mm is swapper Internal error: Oops: 96000004 [#1] PREEMPT SMP ... Call trace: sparse_init_nid+0x5c/0x2b0 sparse_init+0x138/0x170 bootmem_init+0x80/0xe0 setup_arch+0x2a0/0x5fc start_kernel+0x8c/0x648 Replace the loop with a single call to memblock_set_node() to the entire memory. Signed-off-by: Mike Rapoport <rppt@linux.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Acked-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> Acked-by: Catalin Marinas <catalin.marinas@arm.com> Cc: Andy Lutomirski <luto@kernel.org> Cc: Baoquan He <bhe@redhat.com> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Borislav Petkov <bp@alien8.de> Cc: Christoph Hellwig <hch@lst.de> Cc: Daniel Axtens <dja@axtens.net> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Emil Renner Berthing <kernel@esmil.dk> Cc: Hari Bathini <hbathini@linux.ibm.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Marek Szyprowski <m.szyprowski@samsung.com> Cc: Max Filippov <jcmvbkbc@gmail.com> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Michal Simek <monstr@monstr.eu> Cc: Miguel Ojeda <miguel.ojeda.sandonis@gmail.com> Cc: Palmer Dabbelt <palmer@dabbelt.com> Cc: Paul Mackerras <paulus@samba.org> Cc: Paul Walmsley <paul.walmsley@sifive.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Russell King <linux@armlinux.org.uk> Cc: Stafford Horne <shorne@gmail.com> Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Will Deacon <will@kernel.org> Cc: Yoshinori Sato <ysato@users.sourceforge.jp> Link: https://lkml.kernel.org/r/20200818151634.14343-5-rppt@kernel.org Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-10-13 23:57:31 +00:00
phys_addr_t start = memblock_start_of_DRAM();
phys_addr_t end = memblock_end_of_DRAM() - 1;
int ret;
if (numa_off)
pr_info("NUMA disabled\n"); /* Forced off on command line. */
pr_info("Faking a node at [mem %pap-%pap]\n", &start, &end);
ret = numa_add_memblk(0, start, end + 1);
arm64: numa: simplify dummy_numa_init() dummy_numa_init() loops over memblock.memory and passes nid=0 to numa_add_memblk() which essentially wraps memblock_set_node(). However, memblock_set_node() can cope with entire memory span itself, so the loop over memblock.memory regions is redundant. Using a single call to memblock_set_node() rather than a loop also fixes an issue with a buggy ACPI firmware in which the SRAT table covers some but not all of the memory in the EFI memory map. Jonathan Cameron says: This issue can be easily triggered by having an SRAT table which fails to cover all elements of the EFI memory map. This firmware error is detected and a warning printed. e.g. "NUMA: Warning: invalid memblk node 64 [mem 0x240000000-0x27fffffff]" At that point we fall back to dummy_numa_init(). However, the failed ACPI init has left us with our memblocks all broken up as we split them when trying to assign them to NUMA nodes. We then iterate over the memblocks and add them to node 0. numa_add_memblk() calls memblock_set_node() which merges regions that were previously split up during the earlier attempt to add them to different nodes during parsing of SRAT. This means elements are moved in the memblock array and we can end up in a different memblock after the call to numa_add_memblk(). Result is: Unable to handle kernel paging request at virtual address 0000000000003a40 Mem abort info: ESR = 0x96000004 EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 Data abort info: ISV = 0, ISS = 0x00000004 CM = 0, WnR = 0 [0000000000003a40] user address but active_mm is swapper Internal error: Oops: 96000004 [#1] PREEMPT SMP ... Call trace: sparse_init_nid+0x5c/0x2b0 sparse_init+0x138/0x170 bootmem_init+0x80/0xe0 setup_arch+0x2a0/0x5fc start_kernel+0x8c/0x648 Replace the loop with a single call to memblock_set_node() to the entire memory. Signed-off-by: Mike Rapoport <rppt@linux.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Acked-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> Acked-by: Catalin Marinas <catalin.marinas@arm.com> Cc: Andy Lutomirski <luto@kernel.org> Cc: Baoquan He <bhe@redhat.com> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Borislav Petkov <bp@alien8.de> Cc: Christoph Hellwig <hch@lst.de> Cc: Daniel Axtens <dja@axtens.net> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Emil Renner Berthing <kernel@esmil.dk> Cc: Hari Bathini <hbathini@linux.ibm.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Marek Szyprowski <m.szyprowski@samsung.com> Cc: Max Filippov <jcmvbkbc@gmail.com> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Michal Simek <monstr@monstr.eu> Cc: Miguel Ojeda <miguel.ojeda.sandonis@gmail.com> Cc: Palmer Dabbelt <palmer@dabbelt.com> Cc: Paul Mackerras <paulus@samba.org> Cc: Paul Walmsley <paul.walmsley@sifive.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Russell King <linux@armlinux.org.uk> Cc: Stafford Horne <shorne@gmail.com> Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Will Deacon <will@kernel.org> Cc: Yoshinori Sato <ysato@users.sourceforge.jp> Link: https://lkml.kernel.org/r/20200818151634.14343-5-rppt@kernel.org Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-10-13 23:57:31 +00:00
if (ret) {
pr_err("NUMA init failed\n");
return ret;
}
arch_numa: switch over to numa_memblks Until now arch_numa was directly translating firmware NUMA information to memblock. Using numa_memblks as an intermediate step has a few advantages: * alignment with more battle tested x86 implementation * availability of NUMA emulation * maintaining node information for not yet populated memory Adjust a few places in numa_memblks to compile with 32-bit phys_addr_t and replace current functionality related to numa_add_memblk() and __node_distance() in arch_numa with the implementation based on numa_memblks and add functions required by numa_emulation. [rppt@kernel.org: fix section mismatch] Link: https://lkml.kernel.org/r/ZrO6cExVz1He_yPn@kernel.org [rppt@kernel.org: PFN_PHYS() translation is unnecessary here] Link: https://lkml.kernel.org/r/Zs2T5wkSYO9MGcab@kernel.org Link: https://lkml.kernel.org/r/20240807064110.1003856-25-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org> Tested-by: Zi Yan <ziy@nvidia.com> # for x86_64 and arm64 Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> Tested-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> [arm64 + CXL via QEMU] Acked-by: Dan Williams <dan.j.williams@intel.com> Acked-by: David Hildenbrand <david@redhat.com> Cc: Alexander Gordeev <agordeev@linux.ibm.com> Cc: Andreas Larsson <andreas@gaisler.com> Cc: Arnd Bergmann <arnd@arndb.de> Cc: Borislav Petkov <bp@alien8.de> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Christophe Leroy <christophe.leroy@csgroup.eu> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: David S. Miller <davem@davemloft.net> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Heiko Carstens <hca@linux.ibm.com> Cc: Huacai Chen <chenhuacai@kernel.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jiaxun Yang <jiaxun.yang@flygoat.com> Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Palmer Dabbelt <palmer@dabbelt.com> Cc: Rafael J. Wysocki <rafael@kernel.org> Cc: Rob Herring (Arm) <robh@kernel.org> Cc: Samuel Holland <samuel.holland@sifive.com> Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Vasily Gorbik <gor@linux.ibm.com> Cc: Will Deacon <will@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-07 06:41:08 +00:00
node_set(0, numa_nodes_parsed);
numa_off = true;
return 0;
}
#ifdef CONFIG_ACPI_NUMA
static int __init arch_acpi_numa_init(void)
{
int ret;
ret = acpi_numa_init();
if (ret) {
pr_debug("Failed to initialise from firmware\n");
return ret;
}
return srat_disabled() ? -EINVAL : 0;
}
#else
static int __init arch_acpi_numa_init(void)
{
return -EOPNOTSUPP;
}
#endif
/**
* arch_numa_init() - Initialize NUMA
*
* Try each configured NUMA initialization method until one succeeds. The
* last fallback is dummy single node config encompassing whole memory.
*/
void __init arch_numa_init(void)
{
if (!numa_off) {
if (!acpi_disabled && !numa_init(arch_acpi_numa_init))
return;
if (acpi_disabled && !numa_init(of_numa_init))
return;
}
numa_init(dummy_numa_init);
}
arch_numa: switch over to numa_memblks Until now arch_numa was directly translating firmware NUMA information to memblock. Using numa_memblks as an intermediate step has a few advantages: * alignment with more battle tested x86 implementation * availability of NUMA emulation * maintaining node information for not yet populated memory Adjust a few places in numa_memblks to compile with 32-bit phys_addr_t and replace current functionality related to numa_add_memblk() and __node_distance() in arch_numa with the implementation based on numa_memblks and add functions required by numa_emulation. [rppt@kernel.org: fix section mismatch] Link: https://lkml.kernel.org/r/ZrO6cExVz1He_yPn@kernel.org [rppt@kernel.org: PFN_PHYS() translation is unnecessary here] Link: https://lkml.kernel.org/r/Zs2T5wkSYO9MGcab@kernel.org Link: https://lkml.kernel.org/r/20240807064110.1003856-25-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org> Tested-by: Zi Yan <ziy@nvidia.com> # for x86_64 and arm64 Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> Tested-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> [arm64 + CXL via QEMU] Acked-by: Dan Williams <dan.j.williams@intel.com> Acked-by: David Hildenbrand <david@redhat.com> Cc: Alexander Gordeev <agordeev@linux.ibm.com> Cc: Andreas Larsson <andreas@gaisler.com> Cc: Arnd Bergmann <arnd@arndb.de> Cc: Borislav Petkov <bp@alien8.de> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Christophe Leroy <christophe.leroy@csgroup.eu> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: David S. Miller <davem@davemloft.net> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Heiko Carstens <hca@linux.ibm.com> Cc: Huacai Chen <chenhuacai@kernel.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jiaxun Yang <jiaxun.yang@flygoat.com> Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Palmer Dabbelt <palmer@dabbelt.com> Cc: Rafael J. Wysocki <rafael@kernel.org> Cc: Rob Herring (Arm) <robh@kernel.org> Cc: Samuel Holland <samuel.holland@sifive.com> Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Vasily Gorbik <gor@linux.ibm.com> Cc: Will Deacon <will@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-07 06:41:08 +00:00
#ifdef CONFIG_NUMA_EMU
void __init numa_emu_update_cpu_to_node(int *emu_nid_to_phys,
unsigned int nr_emu_nids)
{
int i, j;
/*
* Transform cpu_to_node_map table to use emulated nids by
* reverse-mapping phys_nid. The maps should always exist but fall
* back to zero just in case.
*/
for (i = 0; i < ARRAY_SIZE(cpu_to_node_map); i++) {
if (cpu_to_node_map[i] == NUMA_NO_NODE)
continue;
for (j = 0; j < nr_emu_nids; j++)
if (cpu_to_node_map[i] == emu_nid_to_phys[j])
break;
cpu_to_node_map[i] = j < nr_emu_nids ? j : 0;
}
}
u64 __init numa_emu_dma_end(void)
{
return memblock_start_of_DRAM() + SZ_4G;
}
void debug_cpumask_set_cpu(unsigned int cpu, int node, bool enable)
{
struct cpumask *mask;
if (node == NUMA_NO_NODE)
return;
mask = node_to_cpumask_map[node];
if (!cpumask_available(mask)) {
pr_err("node_to_cpumask_map[%i] NULL\n", node);
dump_stack();
return;
}
if (enable)
cpumask_set_cpu(cpu, mask);
else
cpumask_clear_cpu(cpu, mask);
pr_debug("%s cpu %d node %d: mask now %*pbl\n",
enable ? "numa_add_cpu" : "numa_remove_cpu",
cpu, node, cpumask_pr_args(mask));
}
#endif /* CONFIG_NUMA_EMU */