2019-05-27 08:55:01 +02:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-or-later
|
2010-07-12 14:36:09 +10:00
|
|
|
/*
|
|
|
|
* Procedures for maintaining information about logical memory blocks.
|
|
|
|
*
|
|
|
|
* Peter Bergner, IBM Corp. June 2001.
|
|
|
|
* Copyright (C) 2001 Peter Bergner.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/kernel.h>
|
2010-07-06 15:39:13 -07:00
|
|
|
#include <linux/slab.h>
|
2010-07-12 14:36:09 +10:00
|
|
|
#include <linux/init.h>
|
|
|
|
#include <linux/bitops.h>
|
2010-07-06 15:39:07 -07:00
|
|
|
#include <linux/poison.h>
|
2010-07-06 15:39:16 -07:00
|
|
|
#include <linux/pfn.h>
|
2010-07-06 15:39:19 -07:00
|
|
|
#include <linux/debugfs.h>
|
2018-04-05 16:25:34 -07:00
|
|
|
#include <linux/kmemleak.h>
|
2010-07-06 15:39:19 -07:00
|
|
|
#include <linux/seq_file.h>
|
2010-07-12 14:36:09 +10:00
|
|
|
#include <linux/memblock.h>
|
|
|
|
|
2016-07-28 15:48:06 -07:00
|
|
|
#include <asm/sections.h>
|
2014-01-21 15:50:19 -08:00
|
|
|
#include <linux/io.h>
|
|
|
|
|
|
|
|
#include "internal.h"
|
2013-11-12 15:07:59 -08:00
|
|
|
|
arm64, mm, efi: Account for GICv3 LPI tables in static memblock reserve table
In the irqchip and EFI code, we have what basically amounts to a quirk
to work around a peculiarity in the GICv3 architecture, which permits
the system memory address of LPI tables to be programmable only once
after a CPU reset. This means kexec kernels must use the same memory
as the first kernel, and thus ensure that this memory has not been
given out for other purposes by the time the ITS init code runs, which
is not very early for secondary CPUs.
On systems with many CPUs, these reservations could overflow the
memblock reservation table, and this was addressed in commit:
eff896288872 ("efi/arm: Defer persistent reservations until after paging_init()")
However, this turns out to have made things worse, since the allocation
of page tables and heap space for the resized memblock reservation table
itself may overwrite the regions we are attempting to reserve, which may
cause all kinds of corruption, also considering that the ITS will still
be poking bits into that memory in response to incoming MSIs.
So instead, let's grow the static memblock reservation table on such
systems so it can accommodate these reservations at an earlier time.
This will permit us to revert the above commit in a subsequent patch.
[ mingo: Minor cleanups. ]
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
Acked-by: Will Deacon <will.deacon@arm.com>
Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-efi@vger.kernel.org
Link: http://lkml.kernel.org/r/20190215123333.21209-2-ard.biesheuvel@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2019-02-15 13:33:32 +01:00
|
|
|
/*
 * Default capacities of the statically allocated region arrays.
 *
 * INIT_MEMBLOCK_RESERVED_REGIONS falls back to INIT_MEMBLOCK_REGIONS
 * unless it has already been defined before this point.
 */
#define INIT_MEMBLOCK_REGIONS			128
#define INIT_PHYSMEM_REGIONS			4

#if !defined(INIT_MEMBLOCK_RESERVED_REGIONS)
#define INIT_MEMBLOCK_RESERVED_REGIONS		INIT_MEMBLOCK_REGIONS
#endif
|
|
|
|
|
memblock,arm64: expand the static memblock memory table
In a system(Huawei Ascend ARM64 SoC) using HBM, a multi-bit ECC error
occurs, and the BIOS will mark the corresponding area (for example, 2 MB)
as unusable. When the system restarts next time, these areas are not
reported or reported as EFI_UNUSABLE_MEMORY. Both cases lead to an
increase in the number of memblocks, whereas EFI_UNUSABLE_MEMORY leads to
a larger number of memblocks.
For example, if the EFI_UNUSABLE_MEMORY type is reported:
...
memory[0x92] [0x0000200834a00000-0x0000200835bfffff], 0x0000000001200000 bytes on node 7 flags: 0x0
memory[0x93] [0x0000200835c00000-0x0000200835dfffff], 0x0000000000200000 bytes on node 7 flags: 0x4
memory[0x94] [0x0000200835e00000-0x00002008367fffff], 0x0000000000a00000 bytes on node 7 flags: 0x0
memory[0x95] [0x0000200836800000-0x00002008369fffff], 0x0000000000200000 bytes on node 7 flags: 0x4
memory[0x96] [0x0000200836a00000-0x0000200837bfffff], 0x0000000001200000 bytes on node 7 flags: 0x0
memory[0x97] [0x0000200837c00000-0x0000200837dfffff], 0x0000000000200000 bytes on node 7 flags: 0x4
memory[0x98] [0x0000200837e00000-0x000020087fffffff], 0x0000000048200000 bytes on node 7 flags: 0x0
memory[0x99] [0x0000200880000000-0x0000200bcfffffff], 0x0000000350000000 bytes on node 6 flags: 0x0
memory[0x9a] [0x0000200bd0000000-0x0000200bd01fffff], 0x0000000000200000 bytes on node 6 flags: 0x4
memory[0x9b] [0x0000200bd0200000-0x0000200bd07fffff], 0x0000000000600000 bytes on node 6 flags: 0x0
memory[0x9c] [0x0000200bd0800000-0x0000200bd09fffff], 0x0000000000200000 bytes on node 6 flags: 0x4
memory[0x9d] [0x0000200bd0a00000-0x0000200fcfffffff], 0x00000003ff600000 bytes on node 6 flags: 0x0
memory[0x9e] [0x0000200fd0000000-0x0000200fd01fffff], 0x0000000000200000 bytes on node 6 flags: 0x4
memory[0x9f] [0x0000200fd0200000-0x0000200fffffffff], 0x000000002fe00000 bytes on node 6 flags: 0x0
...
The EFI memory map is parsed to construct the memblock arrays before the
memblock arrays can be resized. As the result, memory regions beyond
INIT_MEMBLOCK_REGIONS are lost.
Add a new macro INIT_MEMBLOCK_MEMORY_REGIONS to replace
INIT_MEMBLOCK_REGIONS to define the size of the static memblock.memory
array.
Allow overriding memblock.memory array size with architecture defined
INIT_MEMBLOCK_MEMORY_REGIONS and make arm64 to set
INIT_MEMBLOCK_MEMORY_REGIONS to 1024 when CONFIG_EFI is enabled.
Link: https://lkml.kernel.org/r/20220615102742.96450-1-zhouguanghui1@huawei.com
Signed-off-by: Zhou Guanghui <zhouguanghui1@huawei.com>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
Tested-by: Darren Hart <darren@os.amperecomputing.com>
Acked-by: Will Deacon <will@kernel.org> [arm64]
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Xu Qiang <xuqiang36@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-06-15 10:27:42 +00:00
|
|
|
/*
 * Capacity of the static "memory" region array. Defaults to
 * INIT_MEMBLOCK_REGIONS unless it has already been defined.
 */
#if !defined(INIT_MEMBLOCK_MEMORY_REGIONS)
# define INIT_MEMBLOCK_MEMORY_REGIONS		INIT_MEMBLOCK_REGIONS
#endif
|
|
|
|
|
2018-06-30 17:55:05 +03:00
|
|
|
/**
|
|
|
|
* DOC: memblock overview
|
|
|
|
*
|
|
|
|
* Memblock is a method of managing memory regions during the early
|
|
|
|
* boot period when the usual kernel memory allocators are not up and
|
|
|
|
* running.
|
|
|
|
*
|
|
|
|
* Memblock views the system memory as collections of contiguous
|
|
|
|
* regions. There are several types of these collections:
|
|
|
|
*
|
|
|
|
* * ``memory`` - describes the physical memory available to the
|
|
|
|
* kernel; this may differ from the actual physical memory installed
|
|
|
|
* in the system, for instance when the memory is restricted with
|
|
|
|
* ``mem=`` command line parameter
|
|
|
|
* * ``reserved`` - describes the regions that were allocated
|
2020-07-01 16:18:29 +02:00
|
|
|
* * ``physmem`` - describes the actual physical memory available during
|
|
|
|
* boot regardless of the possible restrictions and memory hot(un)plug;
|
|
|
|
* the ``physmem`` type is only available on some architectures.
|
2018-06-30 17:55:05 +03:00
|
|
|
*
|
2020-09-25 12:01:25 +02:00
|
|
|
* Each region is represented by struct memblock_region that
|
2018-06-30 17:55:05 +03:00
|
|
|
* defines the region extents, its attributes and NUMA node id on NUMA
|
2020-09-28 15:50:33 +02:00
|
|
|
* systems. Every memory type is described by the struct memblock_type
|
|
|
|
* which contains an array of memory regions along with
|
2020-07-01 16:18:29 +02:00
|
|
|
* the allocator metadata. The "memory" and "reserved" types are nicely
|
2020-09-25 12:01:25 +02:00
|
|
|
* wrapped with struct memblock. This structure is statically
|
2020-07-01 16:18:29 +02:00
|
|
|
* initialized at build time. The region arrays are initially sized to
|
memblock,arm64: expand the static memblock memory table
In a system(Huawei Ascend ARM64 SoC) using HBM, a multi-bit ECC error
occurs, and the BIOS will mark the corresponding area (for example, 2 MB)
as unusable. When the system restarts next time, these areas are not
reported or reported as EFI_UNUSABLE_MEMORY. Both cases lead to an
increase in the number of memblocks, whereas EFI_UNUSABLE_MEMORY leads to
a larger number of memblocks.
For example, if the EFI_UNUSABLE_MEMORY type is reported:
...
memory[0x92] [0x0000200834a00000-0x0000200835bfffff], 0x0000000001200000 bytes on node 7 flags: 0x0
memory[0x93] [0x0000200835c00000-0x0000200835dfffff], 0x0000000000200000 bytes on node 7 flags: 0x4
memory[0x94] [0x0000200835e00000-0x00002008367fffff], 0x0000000000a00000 bytes on node 7 flags: 0x0
memory[0x95] [0x0000200836800000-0x00002008369fffff], 0x0000000000200000 bytes on node 7 flags: 0x4
memory[0x96] [0x0000200836a00000-0x0000200837bfffff], 0x0000000001200000 bytes on node 7 flags: 0x0
memory[0x97] [0x0000200837c00000-0x0000200837dfffff], 0x0000000000200000 bytes on node 7 flags: 0x4
memory[0x98] [0x0000200837e00000-0x000020087fffffff], 0x0000000048200000 bytes on node 7 flags: 0x0
memory[0x99] [0x0000200880000000-0x0000200bcfffffff], 0x0000000350000000 bytes on node 6 flags: 0x0
memory[0x9a] [0x0000200bd0000000-0x0000200bd01fffff], 0x0000000000200000 bytes on node 6 flags: 0x4
memory[0x9b] [0x0000200bd0200000-0x0000200bd07fffff], 0x0000000000600000 bytes on node 6 flags: 0x0
memory[0x9c] [0x0000200bd0800000-0x0000200bd09fffff], 0x0000000000200000 bytes on node 6 flags: 0x4
memory[0x9d] [0x0000200bd0a00000-0x0000200fcfffffff], 0x00000003ff600000 bytes on node 6 flags: 0x0
memory[0x9e] [0x0000200fd0000000-0x0000200fd01fffff], 0x0000000000200000 bytes on node 6 flags: 0x4
memory[0x9f] [0x0000200fd0200000-0x0000200fffffffff], 0x000000002fe00000 bytes on node 6 flags: 0x0
...
The EFI memory map is parsed to construct the memblock arrays before the
memblock arrays can be resized. As the result, memory regions beyond
INIT_MEMBLOCK_REGIONS are lost.
Add a new macro INIT_MEMBLOCK_MEMORY_REGIONS to replace
INIT_MEMBLOCK_REGIONS to define the size of the static memblock.memory
array.
Allow overriding memblock.memory array size with architecture defined
INIT_MEMBLOCK_MEMORY_REGIONS and make arm64 to set
INIT_MEMBLOCK_MEMORY_REGIONS to 1024 when CONFIG_EFI is enabled.
Link: https://lkml.kernel.org/r/20220615102742.96450-1-zhouguanghui1@huawei.com
Signed-off-by: Zhou Guanghui <zhouguanghui1@huawei.com>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
Tested-by: Darren Hart <darren@os.amperecomputing.com>
Acked-by: Will Deacon <will@kernel.org> [arm64]
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Xu Qiang <xuqiang36@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-06-15 10:27:42 +00:00
|
|
|
* %INIT_MEMBLOCK_MEMORY_REGIONS for "memory" and
|
|
|
|
* %INIT_MEMBLOCK_RESERVED_REGIONS for "reserved". The region array
|
|
|
|
* for "physmem" is initially sized to %INIT_PHYSMEM_REGIONS.
|
2019-11-30 17:56:21 -08:00
|
|
|
* The memblock_allow_resize() enables automatic resizing of the region
|
|
|
|
* arrays during addition of new regions. This feature should be used
|
|
|
|
* with care so that memory allocated for the region array will not
|
|
|
|
* overlap with areas that should be reserved, for example initrd.
|
2018-06-30 17:55:05 +03:00
|
|
|
*
|
|
|
|
* The early architecture setup should tell memblock what the physical
|
2019-11-30 17:56:21 -08:00
|
|
|
* memory layout is by using memblock_add() or memblock_add_node()
|
|
|
|
* functions. The first function does not assign the region to a NUMA
|
|
|
|
* node and it is appropriate for UMA systems. Yet, it is possible to
|
|
|
|
* use it on NUMA systems as well and assign the region to a NUMA node
|
|
|
|
* later in the setup process using memblock_set_node(). The
|
|
|
|
* memblock_add_node() performs such an assignment directly.
|
2018-06-30 17:55:05 +03:00
|
|
|
*
|
2019-03-11 23:30:54 -07:00
|
|
|
* Once memblock is setup the memory can be allocated using one of the
|
|
|
|
* API variants:
|
|
|
|
*
|
2019-11-30 17:56:21 -08:00
|
|
|
* * memblock_phys_alloc*() - these functions return the **physical**
|
|
|
|
* address of the allocated memory
|
|
|
|
* * memblock_alloc*() - these functions return the **virtual** address
|
|
|
|
* of the allocated memory.
|
2019-03-11 23:30:54 -07:00
|
|
|
*
|
2020-06-04 16:49:16 -07:00
|
|
|
* Note, that both API variants use implicit assumptions about allowed
|
2019-03-11 23:30:54 -07:00
|
|
|
* memory ranges and the fallback methods. Consult the documentation
|
2019-11-30 17:56:21 -08:00
|
|
|
* of memblock_alloc_internal() and memblock_alloc_range_nid()
|
|
|
|
* functions for more elaborate description.
|
2018-06-30 17:55:05 +03:00
|
|
|
*
|
2019-11-30 17:56:21 -08:00
|
|
|
* As the system boot progresses, the architecture specific mem_init()
|
|
|
|
* function frees all the memory to the buddy page allocator.
|
2018-06-30 17:55:05 +03:00
|
|
|
*
|
2019-11-30 17:56:21 -08:00
|
|
|
* Unless an architecture enables %CONFIG_ARCH_KEEP_MEMBLOCK, the
|
2020-07-01 16:18:29 +02:00
|
|
|
* memblock data structures (except "physmem") will be discarded after the
|
|
|
|
* system initialization completes.
|
2018-06-30 17:55:05 +03:00
|
|
|
*/
|
|
|
|
|
2021-06-28 19:43:01 -07:00
|
|
|
#ifndef CONFIG_NUMA
|
2018-10-30 15:09:40 -07:00
|
|
|
struct pglist_data __refdata contig_page_data;
|
|
|
|
EXPORT_SYMBOL(contig_page_data);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/*
 * Global page frame number limits. They are only defined here; nothing in
 * the code visible in this part of the file assigns them.
 * NOTE(review): presumably filled in by architecture setup code - confirm.
 */
unsigned long max_low_pfn;
unsigned long min_low_pfn;
unsigned long max_pfn;
unsigned long long max_possible_pfn;
|
|
|
|
|
memblock,arm64: expand the static memblock memory table
In a system(Huawei Ascend ARM64 SoC) using HBM, a multi-bit ECC error
occurs, and the BIOS will mark the corresponding area (for example, 2 MB)
as unusable. When the system restarts next time, these areas are not
reported or reported as EFI_UNUSABLE_MEMORY. Both cases lead to an
increase in the number of memblocks, whereas EFI_UNUSABLE_MEMORY leads to
a larger number of memblocks.
For example, if the EFI_UNUSABLE_MEMORY type is reported:
...
memory[0x92] [0x0000200834a00000-0x0000200835bfffff], 0x0000000001200000 bytes on node 7 flags: 0x0
memory[0x93] [0x0000200835c00000-0x0000200835dfffff], 0x0000000000200000 bytes on node 7 flags: 0x4
memory[0x94] [0x0000200835e00000-0x00002008367fffff], 0x0000000000a00000 bytes on node 7 flags: 0x0
memory[0x95] [0x0000200836800000-0x00002008369fffff], 0x0000000000200000 bytes on node 7 flags: 0x4
memory[0x96] [0x0000200836a00000-0x0000200837bfffff], 0x0000000001200000 bytes on node 7 flags: 0x0
memory[0x97] [0x0000200837c00000-0x0000200837dfffff], 0x0000000000200000 bytes on node 7 flags: 0x4
memory[0x98] [0x0000200837e00000-0x000020087fffffff], 0x0000000048200000 bytes on node 7 flags: 0x0
memory[0x99] [0x0000200880000000-0x0000200bcfffffff], 0x0000000350000000 bytes on node 6 flags: 0x0
memory[0x9a] [0x0000200bd0000000-0x0000200bd01fffff], 0x0000000000200000 bytes on node 6 flags: 0x4
memory[0x9b] [0x0000200bd0200000-0x0000200bd07fffff], 0x0000000000600000 bytes on node 6 flags: 0x0
memory[0x9c] [0x0000200bd0800000-0x0000200bd09fffff], 0x0000000000200000 bytes on node 6 flags: 0x4
memory[0x9d] [0x0000200bd0a00000-0x0000200fcfffffff], 0x00000003ff600000 bytes on node 6 flags: 0x0
memory[0x9e] [0x0000200fd0000000-0x0000200fd01fffff], 0x0000000000200000 bytes on node 6 flags: 0x4
memory[0x9f] [0x0000200fd0200000-0x0000200fffffffff], 0x000000002fe00000 bytes on node 6 flags: 0x0
...
The EFI memory map is parsed to construct the memblock arrays before the
memblock arrays can be resized. As the result, memory regions beyond
INIT_MEMBLOCK_REGIONS are lost.
Add a new macro INIT_MEMBLOCK_MEMORY_REGIONS to replace
INIT_MEMBLOCK_REGIONS to define the size of the static memblock.memory
array.
Allow overriding memblock.memory array size with architecture defined
INIT_MEMBLOCK_MEMORY_REGIONS and make arm64 to set
INIT_MEMBLOCK_MEMORY_REGIONS to 1024 when CONFIG_EFI is enabled.
Link: https://lkml.kernel.org/r/20220615102742.96450-1-zhouguanghui1@huawei.com
Signed-off-by: Zhou Guanghui <zhouguanghui1@huawei.com>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
Tested-by: Darren Hart <darren@os.amperecomputing.com>
Acked-by: Will Deacon <will@kernel.org> [arm64]
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Xu Qiang <xuqiang36@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-06-15 10:27:42 +00:00
|
|
|
/*
 * Static backing array for memblock.memory; holds
 * INIT_MEMBLOCK_MEMORY_REGIONS entries.
 */
static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_MEMORY_REGIONS] __initdata_memblock;
|
arm64, mm, efi: Account for GICv3 LPI tables in static memblock reserve table
In the irqchip and EFI code, we have what basically amounts to a quirk
to work around a peculiarity in the GICv3 architecture, which permits
the system memory address of LPI tables to be programmable only once
after a CPU reset. This means kexec kernels must use the same memory
as the first kernel, and thus ensure that this memory has not been
given out for other purposes by the time the ITS init code runs, which
is not very early for secondary CPUs.
On systems with many CPUs, these reservations could overflow the
memblock reservation table, and this was addressed in commit:
eff896288872 ("efi/arm: Defer persistent reservations until after paging_init()")
However, this turns out to have made things worse, since the allocation
of page tables and heap space for the resized memblock reservation table
itself may overwrite the regions we are attempting to reserve, which may
cause all kinds of corruption, also considering that the ITS will still
be poking bits into that memory in response to incoming MSIs.
So instead, let's grow the static memblock reservation table on such
systems so it can accommodate these reservations at an earlier time.
This will permit us to revert the above commit in a subsequent patch.
[ mingo: Minor cleanups. ]
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
Acked-by: Will Deacon <will.deacon@arm.com>
Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-efi@vger.kernel.org
Link: http://lkml.kernel.org/r/20190215123333.21209-2-ard.biesheuvel@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2019-02-15 13:33:32 +01:00
|
|
|
/*
 * Static backing array for memblock.reserved; holds
 * INIT_MEMBLOCK_RESERVED_REGIONS entries.
 */
static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_RESERVED_REGIONS] __initdata_memblock;
|
2014-01-29 18:16:01 +01:00
|
|
|
#if defined(CONFIG_HAVE_MEMBLOCK_PHYS_MAP)
/*
 * Static backing array for the "physmem" type, INIT_PHYSMEM_REGIONS entries.
 * Unlike the memory/reserved arrays it is not marked __initdata_memblock.
 */
static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS];
#endif
|
2011-12-08 10:22:07 -08:00
|
|
|
|
|
|
|
/*
 * Global memblock state, initialized at build time.
 *
 * The "memory" and "reserved" types start out backed by the static init
 * arrays, with capacities INIT_MEMBLOCK_MEMORY_REGIONS and
 * INIT_MEMBLOCK_RESERVED_REGIONS respectively. bottom_up starts false and
 * the allocation limit starts at MEMBLOCK_ALLOC_ANYWHERE.
 */
struct memblock memblock __initdata_memblock = {
	.bottom_up	= false,
	.current_limit	= MEMBLOCK_ALLOC_ANYWHERE,

	.memory = {
		.regions	= memblock_memory_init_regions,
		.max		= INIT_MEMBLOCK_MEMORY_REGIONS,
		.name		= "memory",
	},

	.reserved = {
		.regions	= memblock_reserved_init_regions,
		.max		= INIT_MEMBLOCK_RESERVED_REGIONS,
		.name		= "reserved",
	},
};
|
2010-07-12 14:36:09 +10:00
|
|
|
|
2020-07-01 16:18:29 +02:00
|
|
|
#if defined(CONFIG_HAVE_MEMBLOCK_PHYS_MAP)
/*
 * The "physmem" type. It exists only when CONFIG_HAVE_MEMBLOCK_PHYS_MAP is
 * set and starts out backed by memblock_physmem_init_regions with room for
 * INIT_PHYSMEM_REGIONS entries.
 */
struct memblock_type physmem = {
	.name		= "physmem",
	.max		= INIT_PHYSMEM_REGIONS,
	.regions	= memblock_physmem_init_regions,
};
#endif
|
|
|
|
|
2020-10-13 16:58:25 -07:00
|
|
|
/*
|
|
|
|
* keep a pointer to &memblock.memory in the text section to use it in
|
|
|
|
* __next_mem_range() and its helpers.
|
|
|
|
* For architectures that do not keep memblock data after init, this
|
|
|
|
* pointer will be reset to NULL at memblock_discard()
|
|
|
|
*/
|
|
|
|
static __refdata struct memblock_type *memblock_memory = &memblock.memory;
|
|
|
|
|
2020-10-13 16:57:49 -07:00
|
|
|
/*
 * for_each_memblock_type - walk every region of a memblock type
 * @i: index variable, runs from 0 to cnt - 1
 * @memblock_type: pointer to a structure with ->regions and ->cnt members
 * @rgn: pointer variable set to &regions[@i] for each iteration
 *
 * All parameters are parenthesized so that an expression such as
 * &some_type can be passed as @memblock_type without mis-parsing
 * against the -> operator.
 */
#define for_each_memblock_type(i, memblock_type, rgn) \
	for ((i) = 0, (rgn) = &(memblock_type)->regions[0]; \
	     (i) < (memblock_type)->cnt; \
	     (i)++, (rgn) = &(memblock_type)->regions[(i)])
|
|
|
|
|
2020-10-13 16:57:54 -07:00
|
|
|
/*
 * memblock_dbg - conditional debug print
 *
 * Forwards @fmt and its arguments to pr_info(), but only while
 * memblock_debug is non-zero.
 */
#define memblock_dbg(fmt, ...) \
	do { \
		if (memblock_debug) \
			pr_info(fmt, ##__VA_ARGS__); \
	} while (0)
|
|
|
|
|
|
|
|
static int memblock_debug __initdata_memblock;
|
2023-04-23 15:29:35 +02:00
|
|
|
static bool system_has_some_mirror __initdata_memblock;
|
2011-12-08 10:22:08 -08:00
|
|
|
static int memblock_can_resize __initdata_memblock;
|
2023-04-23 15:29:35 +02:00
|
|
|
static int memblock_memory_in_slab __initdata_memblock;
|
|
|
|
static int memblock_reserved_in_slab __initdata_memblock;
|
2010-07-12 14:36:09 +10:00
|
|
|
|
2023-08-02 15:23:28 +08:00
|
|
|
/**
 * memblock_has_mirror - report the mirrored-memory indicator
 *
 * Return: the current value of system_has_some_mirror.
 */
bool __init_memblock memblock_has_mirror(void)
{
	return system_has_some_mirror;
}
|
|
|
|
|
2019-03-11 23:29:46 -07:00
|
|
|
/*
 * Select the memblock flags based on system_has_some_mirror:
 * MEMBLOCK_MIRROR when it is set, MEMBLOCK_NONE otherwise.
 */
static enum memblock_flags __init_memblock choose_memblock_flags(void)
{
	if (system_has_some_mirror)
		return MEMBLOCK_MIRROR;

	return MEMBLOCK_NONE;
}
|
|
|
|
|
2011-12-08 10:22:07 -08:00
|
|
|
/* adjust *@size so that (@base + *@size) doesn't overflow, return new size */
|
|
|
|
static inline phys_addr_t memblock_cap_size(phys_addr_t base, phys_addr_t *size)
|
|
|
|
{
|
2018-06-07 17:06:15 -07:00
|
|
|
return *size = min(*size, PHYS_ADDR_MAX - base);
|
2011-12-08 10:22:07 -08:00
|
|
|
}
|
|
|
|
|
2010-07-12 14:36:48 +10:00
|
|
|
/*
|
|
|
|
* Address comparison utilities
|
|
|
|
*/
|
2024-01-12 12:09:50 -08:00
|
|
|
unsigned long __init_memblock
|
|
|
|
memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1, phys_addr_t base2,
|
|
|
|
phys_addr_t size2)
|
2010-07-12 14:36:09 +10:00
|
|
|
{
|
|
|
|
return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
|
|
|
|
}
|
|
|
|
|
mem-hotplug: handle node hole when initializing numa_meminfo.
When parsing SRAT, all memory ranges are added into numa_meminfo. In
numa_init(), before entering numa_cleanup_meminfo(), all possible memory
ranges are in numa_meminfo. And numa_cleanup_meminfo() removes all
ranges over max_pfn or empty.
But, this only works if the nodes are continuous. Let's have a look at
the following example:
We have an SRAT like this:
SRAT: Node 0 PXM 0 [mem 0x00000000-0x5fffffff]
SRAT: Node 0 PXM 0 [mem 0x100000000-0x1ffffffffff]
SRAT: Node 1 PXM 1 [mem 0x20000000000-0x3ffffffffff]
SRAT: Node 4 PXM 2 [mem 0x40000000000-0x5ffffffffff] hotplug
SRAT: Node 5 PXM 3 [mem 0x60000000000-0x7ffffffffff] hotplug
SRAT: Node 2 PXM 4 [mem 0x80000000000-0x9ffffffffff] hotplug
SRAT: Node 3 PXM 5 [mem 0xa0000000000-0xbffffffffff] hotplug
SRAT: Node 6 PXM 6 [mem 0xc0000000000-0xdffffffffff] hotplug
SRAT: Node 7 PXM 7 [mem 0xe0000000000-0xfffffffffff] hotplug
On boot, only node 0,1,2,3 exist.
And the numa_meminfo will look like this:
numa_meminfo.nr_blks = 9
1. on node 0: [0, 60000000]
2. on node 0: [100000000, 20000000000]
3. on node 1: [20000000000, 40000000000]
4. on node 4: [40000000000, 60000000000]
5. on node 5: [60000000000, 80000000000]
6. on node 2: [80000000000, a0000000000]
7. on node 3: [a0000000000, a0800000000]
8. on node 6: [c0000000000, a0800000000]
9. on node 7: [e0000000000, a0800000000]
And numa_cleanup_meminfo() will merge 1 and 2, and remove 8,9 because the
end address is over max_pfn, which is a0800000000. But 4 and 5 are not
removed because their end addresses are less than max_pfn. But in fact,
node 4 and 5 don't exist.
In a word, numa_cleanup_meminfo() is not able to handle holes between nodes.
Since memory ranges in node 4 and 5 are in numa_meminfo, in
numa_register_memblks(), node 4 and 5 will be mistakenly set to online.
If you run lscpu, it will show:
NUMA node0 CPU(s): 0-14,128-142
NUMA node1 CPU(s): 15-29,143-157
NUMA node2 CPU(s):
NUMA node3 CPU(s):
NUMA node4 CPU(s): 62-76,190-204
NUMA node5 CPU(s): 78-92,206-220
In this patch, we use memblock_overlaps_region() to check if ranges in
numa_meminfo overlap with ranges in memory_block. Since memory_block
contains all available memory at boot time, if they overlap, it means the
ranges exist. If not, then remove them from numa_meminfo.
After this patch, lscpu will show:
NUMA node0 CPU(s): 0-14,128-142
NUMA node1 CPU(s): 15-29,143-157
NUMA node4 CPU(s): 62-76,190-204
NUMA node5 CPU(s): 78-92,206-220
Signed-off-by: Tang Chen <tangchen@cn.fujitsu.com>
Reviewed-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tejun Heo <tj@kernel.org>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Vladimir Murzin <vladimir.murzin@arm.com>
Cc: Fabian Frederick <fabf@skynet.be>
Cc: Alexander Kuleshov <kuleshovmail@gmail.com>
Cc: Baoquan He <bhe@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-09-08 15:02:03 -07:00
|
|
|
/**
 * memblock_overlaps_region - check whether a range intersects any region
 * @type: memblock type whose regions are scanned
 * @base: start of the range to test
 * @size: size of the range to test
 *
 * @size is first capped with memblock_cap_size() so that @base + @size
 * does not overflow; every region of @type is then compared against the
 * range with memblock_addrs_overlap().
 *
 * Return:
 * true if at least one region overlaps the range, false otherwise.
 */
bool __init_memblock memblock_overlaps_region(struct memblock_type *type,
					phys_addr_t base, phys_addr_t size)
{
	unsigned long idx = 0;

	memblock_cap_size(base, &size);

	while (idx < type->cnt) {
		const struct memblock_region *rgn = &type->regions[idx];

		if (memblock_addrs_overlap(base, size, rgn->base, rgn->size))
			return true;

		idx++;
	}

	return false;
}
|
|
|
|
|
2018-06-30 17:55:02 +03:00
|
|
|
/**
|
2013-11-12 15:07:59 -08:00
|
|
|
* __memblock_find_range_bottom_up - find free area utility in bottom-up
|
|
|
|
* @start: start of candidate range
|
2018-06-30 17:55:02 +03:00
|
|
|
* @end: end of candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or
|
|
|
|
* %MEMBLOCK_ALLOC_ACCESSIBLE
|
2013-11-12 15:07:59 -08:00
|
|
|
* @size: size of free area to find
|
|
|
|
* @align: alignment of free area to find
|
2014-01-21 15:50:16 -08:00
|
|
|
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
|
2015-06-24 16:58:09 -07:00
|
|
|
* @flags: pick from blocks based on memory attributes
|
2013-11-12 15:07:59 -08:00
|
|
|
*
|
|
|
|
* Utility called from memblock_find_in_range_node(), find free area bottom-up.
|
|
|
|
*
|
2018-06-30 17:55:02 +03:00
|
|
|
* Return:
|
2013-11-12 15:07:59 -08:00
|
|
|
* Found address on success, 0 on failure.
|
|
|
|
*/
|
|
|
|
static phys_addr_t __init_memblock
|
|
|
|
__memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
|
2015-06-24 16:58:09 -07:00
|
|
|
phys_addr_t size, phys_addr_t align, int nid,
|
2018-06-30 17:55:01 +03:00
|
|
|
enum memblock_flags flags)
|
2013-11-12 15:07:59 -08:00
|
|
|
{
|
|
|
|
phys_addr_t this_start, this_end, cand;
|
|
|
|
u64 i;
|
|
|
|
|
2015-06-24 16:58:09 -07:00
|
|
|
for_each_free_mem_range(i, nid, flags, &this_start, &this_end, NULL) {
|
2013-11-12 15:07:59 -08:00
|
|
|
this_start = clamp(this_start, start, end);
|
|
|
|
this_end = clamp(this_end, start, end);
|
|
|
|
|
|
|
|
cand = round_up(this_start, align);
|
|
|
|
if (cand < this_end && this_end - cand >= size)
|
|
|
|
return cand;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2011-12-08 10:22:09 -08:00
|
|
|
/**
|
mm/memblock.c: factor out of top-down allocation
[Problem]
The current Linux cannot migrate pages used by the kernel because of the
kernel direct mapping. In Linux kernel space, va = pa + PAGE_OFFSET.
When the pa is changed, we cannot simply update the pagetable and keep the
va unmodified. So the kernel pages are not migratable.
There are also some other issues will cause the kernel pages not
migratable. For example, the physical address may be cached somewhere and
will be used. It is not to update all the caches.
When doing memory hotplug in Linux, we first migrate all the pages in one
memory device somewhere else, and then remove the device. But if pages
are used by the kernel, they are not migratable. As a result, memory used
by the kernel cannot be hot-removed.
Modifying the kernel direct mapping mechanism is too difficult to do. And
it may cause the kernel performance down and unstable. So we use the
following way to do memory hotplug.
[What we are doing]
In Linux, memory in one numa node is divided into several zones. One of
the zones is ZONE_MOVABLE, which the kernel won't use.
In order to implement memory hotplug in Linux, we are going to arrange all
hotpluggable memory in ZONE_MOVABLE so that the kernel won't use these
memory. To do this, we need ACPI's help.
In ACPI, SRAT(System Resource Affinity Table) contains NUMA info. The
memory affinities in SRAT record every memory range in the system, and
also, flags specifying if the memory range is hotpluggable. (Please refer
to ACPI spec 5.0 5.2.16)
With the help of SRAT, we have to do the following two things to achieve our
goal:
1. When doing memory hot-add, allow the users arranging hotpluggable as
ZONE_MOVABLE.
(This has been done by the MOVABLE_NODE functionality in Linux.)
2. when the system is booting, prevent bootmem allocator from allocating
hotpluggable memory for the kernel before the memory initialization
finishes.
The problem 2 is the key problem we are going to solve. But before solving it,
we need some preparation. Please see below.
[Preparation]
Bootloader has to load the kernel image into memory. And this memory must
be unhotpluggable. We cannot prevent this anyway. So in a memory hotplug
system, we can assume any node the kernel resides in is not hotpluggable.
Before SRAT is parsed, we don't know which memory ranges are hotpluggable.
But memblock has already started to work. In the current kernel,
memblock allocates the following memory before SRAT is parsed:
setup_arch()
|->memblock_x86_fill() /* memblock is ready */
|......
|->early_reserve_e820_mpc_new() /* allocate memory under 1MB */
|->reserve_real_mode() /* allocate memory under 1MB */
|->init_mem_mapping() /* allocate page tables, about 2MB to map 1GB memory */
|->dma_contiguous_reserve() /* specified by user, should be low */
|->setup_log_buf() /* specified by user, several mega bytes */
|->relocate_initrd() /* could be large, but will be freed after boot, should reorder */
|->acpi_initrd_override() /* several mega bytes */
|->reserve_crashkernel() /* could be large, should reorder */
|......
|->initmem_init() /* Parse SRAT */
According to Tejun's advice, before SRAT is parsed, we should try our best
to allocate memory near the kernel image. Since the whole node the kernel
resides in won't be hotpluggable, and for a modern server, a node may have
at least 16GB memory, allocating several mega bytes memory around the
kernel image won't cross to hotpluggable memory.
[About this patchset]
So this patchset is the preparation for the problem 2 that we want to
solve. It does the following:
1. Make memblock be able to allocate memory bottom up.
1) Keep all the memblock APIs' prototype unmodified.
2) When the direction is bottom up, keep the start address greater than the
end of kernel image.
2. Improve init_mem_mapping() to support allocate page tables in
bottom up direction.
3. Introduce "movable_node" boot option to enable and disable this
functionality.
This patch (of 6):
Create a new function __memblock_find_range_top_down to factor out of
top-down allocation from memblock_find_in_range_node. This is a
preparation because we will introduce a new bottom-up allocation mode in
the following patch.
Signed-off-by: Tang Chen <tangchen@cn.fujitsu.com>
Signed-off-by: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Toshi Kani <toshi.kani@hp.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Cc: Thomas Renninger <trenn@suse.de>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Jiang Liu <jiang.liu@huawei.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: Taku Izumi <izumi.taku@jp.fujitsu.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-11-12 15:07:57 -08:00
|
|
|
* __memblock_find_range_top_down - find free area utility, in top-down
|
2011-12-08 10:22:09 -08:00
|
|
|
* @start: start of candidate range
|
2018-06-30 17:55:02 +03:00
|
|
|
* @end: end of candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or
|
|
|
|
* %MEMBLOCK_ALLOC_ACCESSIBLE
|
2011-12-08 10:22:09 -08:00
|
|
|
* @size: size of free area to find
|
|
|
|
* @align: alignment of free area to find
|
2014-01-21 15:50:16 -08:00
|
|
|
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
|
2015-06-24 16:58:09 -07:00
|
|
|
* @flags: pick from blocks based on memory attributes
|
2011-12-08 10:22:09 -08:00
|
|
|
*
|
mm/memblock.c: factor out of top-down allocation
[Problem]
The current Linux cannot migrate pages used by the kernel because of the
kernel direct mapping. In Linux kernel space, va = pa + PAGE_OFFSET.
When the pa is changed, we cannot simply update the pagetable and keep the
va unmodified. So the kernel pages are not migratable.
There are also some other issues will cause the kernel pages not
migratable. For example, the physical address may be cached somewhere and
will be used. It is not to update all the caches.
When doing memory hotplug in Linux, we first migrate all the pages in one
memory device somewhere else, and then remove the device. But if pages
are used by the kernel, they are not migratable. As a result, memory used
by the kernel cannot be hot-removed.
Modifying the kernel direct mapping mechanism is too difficult to do. And
it may cause the kernel performance down and unstable. So we use the
following way to do memory hotplug.
[What we are doing]
In Linux, memory in one numa node is divided into several zones. One of
the zones is ZONE_MOVABLE, which the kernel won't use.
In order to implement memory hotplug in Linux, we are going to arrange all
hotpluggable memory in ZONE_MOVABLE so that the kernel won't use these
memory. To do this, we need ACPI's help.
In ACPI, SRAT(System Resource Affinity Table) contains NUMA info. The
memory affinities in SRAT record every memory range in the system, and
also, flags specifying if the memory range is hotpluggable. (Please refer
to ACPI spec 5.0 5.2.16)
With the help of SRAT, we have to do the following two things to achieve our
goal:
1. When doing memory hot-add, allow the users arranging hotpluggable as
ZONE_MOVABLE.
(This has been done by the MOVABLE_NODE functionality in Linux.)
2. when the system is booting, prevent bootmem allocator from allocating
hotpluggable memory for the kernel before the memory initialization
finishes.
The problem 2 is the key problem we are going to solve. But before solving it,
we need some preparation. Please see below.
[Preparation]
Bootloader has to load the kernel image into memory. And this memory must
be unhotpluggable. We cannot prevent this anyway. So in a memory hotplug
system, we can assume any node the kernel resides in is not hotpluggable.
Before SRAT is parsed, we don't know which memory ranges are hotpluggable.
But memblock has already started to work. In the current kernel,
memblock allocates the following memory before SRAT is parsed:
setup_arch()
|->memblock_x86_fill() /* memblock is ready */
|......
|->early_reserve_e820_mpc_new() /* allocate memory under 1MB */
|->reserve_real_mode() /* allocate memory under 1MB */
|->init_mem_mapping() /* allocate page tables, about 2MB to map 1GB memory */
|->dma_contiguous_reserve() /* specified by user, should be low */
|->setup_log_buf() /* specified by user, several mega bytes */
|->relocate_initrd() /* could be large, but will be freed after boot, should reorder */
|->acpi_initrd_override() /* several mega bytes */
|->reserve_crashkernel() /* could be large, should reorder */
|......
|->initmem_init() /* Parse SRAT */
According to Tejun's advice, before SRAT is parsed, we should try our best
to allocate memory near the kernel image. Since the whole node the kernel
resides in won't be hotpluggable, and for a modern server, a node may have
at least 16GB memory, allocating several mega bytes memory around the
kernel image won't cross to hotpluggable memory.
[About this patchset]
So this patchset is the preparation for the problem 2 that we want to
solve. It does the following:
1. Make memblock be able to allocate memory bottom up.
1) Keep all the memblock APIs' prototype unmodified.
2) When the direction is bottom up, keep the start address greater than the
end of kernel image.
2. Improve init_mem_mapping() to support allocate page tables in
bottom up direction.
3. Introduce "movable_node" boot option to enable and disable this
functionality.
This patch (of 6):
Create a new function __memblock_find_range_top_down to factor out of
top-down allocation from memblock_find_in_range_node. This is a
preparation because we will introduce a new bottom-up allocation mode in
the following patch.
Signed-off-by: Tang Chen <tangchen@cn.fujitsu.com>
Signed-off-by: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Toshi Kani <toshi.kani@hp.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Cc: Thomas Renninger <trenn@suse.de>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Jiang Liu <jiang.liu@huawei.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: Taku Izumi <izumi.taku@jp.fujitsu.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-11-12 15:07:57 -08:00
|
|
|
* Utility called from memblock_find_in_range_node(), find free area top-down.
|
2011-12-08 10:22:09 -08:00
|
|
|
*
|
2018-06-30 17:55:02 +03:00
|
|
|
* Return:
|
2013-11-12 15:07:59 -08:00
|
|
|
* Found address on success, 0 on failure.
|
2010-07-12 14:36:48 +10:00
|
|
|
*/
|
mm/memblock.c: factor out of top-down allocation
[Problem]
The current Linux cannot migrate pages used by the kernel because of the
kernel direct mapping. In Linux kernel space, va = pa + PAGE_OFFSET.
When the pa is changed, we cannot simply update the pagetable and keep the
va unmodified. So the kernel pages are not migratable.
There are also some other issues will cause the kernel pages not
migratable. For example, the physical address may be cached somewhere and
will be used. It is not to update all the caches.
When doing memory hotplug in Linux, we first migrate all the pages in one
memory device somewhere else, and then remove the device. But if pages
are used by the kernel, they are not migratable. As a result, memory used
by the kernel cannot be hot-removed.
Modifying the kernel direct mapping mechanism is too difficult to do. And
it may cause the kernel performance down and unstable. So we use the
following way to do memory hotplug.
[What we are doing]
In Linux, memory in one numa node is divided into several zones. One of
the zones is ZONE_MOVABLE, which the kernel won't use.
In order to implement memory hotplug in Linux, we are going to arrange all
hotpluggable memory in ZONE_MOVABLE so that the kernel won't use these
memory. To do this, we need ACPI's help.
In ACPI, SRAT(System Resource Affinity Table) contains NUMA info. The
memory affinities in SRAT record every memory range in the system, and
also, flags specifying if the memory range is hotpluggable. (Please refer
to ACPI spec 5.0 5.2.16)
With the help of SRAT, we have to do the following two things to achieve our
goal:
1. When doing memory hot-add, allow the users arranging hotpluggable as
ZONE_MOVABLE.
(This has been done by the MOVABLE_NODE functionality in Linux.)
2. when the system is booting, prevent bootmem allocator from allocating
hotpluggable memory for the kernel before the memory initialization
finishes.
The problem 2 is the key problem we are going to solve. But before solving it,
we need some preparation. Please see below.
[Preparation]
Bootloader has to load the kernel image into memory. And this memory must
be unhotpluggable. We cannot prevent this anyway. So in a memory hotplug
system, we can assume any node the kernel resides in is not hotpluggable.
Before SRAT is parsed, we don't know which memory ranges are hotpluggable.
But memblock has already started to work. In the current kernel,
memblock allocates the following memory before SRAT is parsed:
setup_arch()
|->memblock_x86_fill() /* memblock is ready */
|......
|->early_reserve_e820_mpc_new() /* allocate memory under 1MB */
|->reserve_real_mode() /* allocate memory under 1MB */
|->init_mem_mapping() /* allocate page tables, about 2MB to map 1GB memory */
|->dma_contiguous_reserve() /* specified by user, should be low */
|->setup_log_buf() /* specified by user, several mega bytes */
|->relocate_initrd() /* could be large, but will be freed after boot, should reorder */
|->acpi_initrd_override() /* several mega bytes */
|->reserve_crashkernel() /* could be large, should reorder */
|......
|->initmem_init() /* Parse SRAT */
According to Tejun's advice, before SRAT is parsed, we should try our best
to allocate memory near the kernel image. Since the whole node the kernel
resides in won't be hotpluggable, and for a modern server, a node may have
at least 16GB memory, allocating several mega bytes memory around the
kernel image won't cross to hotpluggable memory.
[About this patchset]
So this patchset is the preparation for the problem 2 that we want to
solve. It does the following:
1. Make memblock be able to allocate memory bottom up.
1) Keep all the memblock APIs' prototype unmodified.
2) When the direction is bottom up, keep the start address greater than the
end of kernel image.
2. Improve init_mem_mapping() to support allocate page tables in
bottom up direction.
3. Introduce "movable_node" boot option to enable and disable this
functionality.
This patch (of 6):
Create a new function __memblock_find_range_top_down to factor out of
top-down allocation from memblock_find_in_range_node. This is a
preparation because we will introduce a new bottom-up allocation mode in
the following patch.
Signed-off-by: Tang Chen <tangchen@cn.fujitsu.com>
Signed-off-by: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Toshi Kani <toshi.kani@hp.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Cc: Thomas Renninger <trenn@suse.de>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Jiang Liu <jiang.liu@huawei.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: Taku Izumi <izumi.taku@jp.fujitsu.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-11-12 15:07:57 -08:00
|
|
|
static phys_addr_t __init_memblock
|
|
|
|
__memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
|
2015-06-24 16:58:09 -07:00
|
|
|
phys_addr_t size, phys_addr_t align, int nid,
|
2018-06-30 17:55:01 +03:00
|
|
|
enum memblock_flags flags)
|
2013-02-22 16:33:51 -08:00
|
|
|
{
|
|
|
|
phys_addr_t this_start, this_end, cand;
|
|
|
|
u64 i;
|
|
|
|
|
2015-06-24 16:58:09 -07:00
|
|
|
for_each_free_mem_range_reverse(i, nid, flags, &this_start, &this_end,
|
|
|
|
NULL) {
|
2013-02-22 16:33:51 -08:00
|
|
|
this_start = clamp(this_start, start, end);
|
|
|
|
this_end = clamp(this_end, start, end);
|
|
|
|
|
|
|
|
if (this_end < size)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
cand = round_down(this_end - size, align);
|
|
|
|
if (cand >= this_start)
|
|
|
|
return cand;
|
|
|
|
}
|
mm/memblock.c: factor out of top-down allocation
[Problem]
The current Linux cannot migrate pages used by the kernel because of the
kernel direct mapping. In Linux kernel space, va = pa + PAGE_OFFSET.
When the pa is changed, we cannot simply update the pagetable and keep the
va unmodified. So the kernel pages are not migratable.
There are also some other issues will cause the kernel pages not
migratable. For example, the physical address may be cached somewhere and
will be used. It is not to update all the caches.
When doing memory hotplug in Linux, we first migrate all the pages in one
memory device somewhere else, and then remove the device. But if pages
are used by the kernel, they are not migratable. As a result, memory used
by the kernel cannot be hot-removed.
Modifying the kernel direct mapping mechanism is too difficult to do. And
it may cause the kernel performance down and unstable. So we use the
following way to do memory hotplug.
[What we are doing]
In Linux, memory in one numa node is divided into several zones. One of
the zones is ZONE_MOVABLE, which the kernel won't use.
In order to implement memory hotplug in Linux, we are going to arrange all
hotpluggable memory in ZONE_MOVABLE so that the kernel won't use these
memory. To do this, we need ACPI's help.
In ACPI, SRAT(System Resource Affinity Table) contains NUMA info. The
memory affinities in SRAT record every memory range in the system, and
also, flags specifying if the memory range is hotpluggable. (Please refer
to ACPI spec 5.0 5.2.16)
With the help of SRAT, we have to do the following two things to achieve our
goal:
1. When doing memory hot-add, allow the users arranging hotpluggable as
ZONE_MOVABLE.
(This has been done by the MOVABLE_NODE functionality in Linux.)
2. when the system is booting, prevent bootmem allocator from allocating
hotpluggable memory for the kernel before the memory initialization
finishes.
The problem 2 is the key problem we are going to solve. But before solving it,
we need some preparation. Please see below.
[Preparation]
Bootloader has to load the kernel image into memory. And this memory must
be unhotpluggable. We cannot prevent this anyway. So in a memory hotplug
system, we can assume any node the kernel resides in is not hotpluggable.
Before SRAT is parsed, we don't know which memory ranges are hotpluggable.
But memblock has already started to work. In the current kernel,
memblock allocates the following memory before SRAT is parsed:
setup_arch()
|->memblock_x86_fill() /* memblock is ready */
|......
|->early_reserve_e820_mpc_new() /* allocate memory under 1MB */
|->reserve_real_mode() /* allocate memory under 1MB */
|->init_mem_mapping() /* allocate page tables, about 2MB to map 1GB memory */
|->dma_contiguous_reserve() /* specified by user, should be low */
|->setup_log_buf() /* specified by user, several mega bytes */
|->relocate_initrd() /* could be large, but will be freed after boot, should reorder */
|->acpi_initrd_override() /* several mega bytes */
|->reserve_crashkernel() /* could be large, should reorder */
|......
|->initmem_init() /* Parse SRAT */
According to Tejun's advice, before SRAT is parsed, we should try our best
to allocate memory near the kernel image. Since the whole node the kernel
resides in won't be hotpluggable, and for a modern server, a node may have
at least 16GB memory, allocating several mega bytes memory around the
kernel image won't cross to hotpluggable memory.
[About this patchset]
So this patchset is the preparation for the problem 2 that we want to
solve. It does the following:
1. Make memblock be able to allocate memory bottom up.
1) Keep all the memblock APIs' prototype unmodified.
2) When the direction is bottom up, keep the start address greater than the
end of kernel image.
2. Improve init_mem_mapping() to support allocate page tables in
bottom up direction.
3. Introduce "movable_node" boot option to enable and disable this
functionality.
This patch (of 6):
Create a new function __memblock_find_range_top_down to factor out of
top-down allocation from memblock_find_in_range_node. This is a
preparation because we will introduce a new bottom-up allocation mode in
the following patch.
Signed-off-by: Tang Chen <tangchen@cn.fujitsu.com>
Signed-off-by: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Toshi Kani <toshi.kani@hp.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Cc: Thomas Renninger <trenn@suse.de>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Jiang Liu <jiang.liu@huawei.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: Taku Izumi <izumi.taku@jp.fujitsu.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-11-12 15:07:57 -08:00
|
|
|
|
2013-02-22 16:33:51 -08:00
|
|
|
return 0;
|
|
|
|
}
|
2010-07-12 14:36:48 +10:00
|
|
|
|
mm/memblock.c: factor out of top-down allocation
[Problem]
The current Linux cannot migrate pages used by the kernel because of the
kernel direct mapping. In Linux kernel space, va = pa + PAGE_OFFSET.
When the pa is changed, we cannot simply update the pagetable and keep the
va unmodified. So the kernel pages are not migratable.
There are also some other issues that make kernel pages not
migratable. For example, the physical address may be cached somewhere and
used later, and it is not practical to update all of those caches.
When doing memory hotplug in Linux, we first migrate all the pages in one
memory device somewhere else, and then remove the device. But if pages
are used by the kernel, they are not migratable. As a result, memory used
by the kernel cannot be hot-removed.
Modifying the kernel direct mapping mechanism is too difficult to do, and
it may degrade kernel performance and stability. So we use the
following way to do memory hotplug.
[What we are doing]
In Linux, memory in one numa node is divided into several zones. One of
the zones is ZONE_MOVABLE, which the kernel won't use.
In order to implement memory hotplug in Linux, we are going to arrange all
hotpluggable memory in ZONE_MOVABLE so that the kernel won't use these
memory. To do this, we need ACPI's help.
In ACPI, SRAT(System Resource Affinity Table) contains NUMA info. The
memory affinities in SRAT record every memory range in the system, and
also, flags specifying if the memory range is hotpluggable. (Please refer
to ACPI spec 5.0 5.2.16)
With the help of SRAT, we have to do the following two things to achieve our
goal:
1. When doing memory hot-add, allow the users arranging hotpluggable as
ZONE_MOVABLE.
(This has been done by the MOVABLE_NODE functionality in Linux.)
2. when the system is booting, prevent bootmem allocator from allocating
hotpluggable memory for the kernel before the memory initialization
finishes.
The problem 2 is the key problem we are going to solve. But before solving it,
we need some preparation. Please see below.
[Preparation]
Bootloader has to load the kernel image into memory. And this memory must
be unhotpluggable. We cannot prevent this anyway. So in a memory hotplug
system, we can assume any node the kernel resides in is not hotpluggable.
Before SRAT is parsed, we don't know which memory ranges are hotpluggable.
But memblock has already started to work. In the current kernel,
memblock allocates the following memory before SRAT is parsed:
setup_arch()
|->memblock_x86_fill() /* memblock is ready */
|......
|->early_reserve_e820_mpc_new() /* allocate memory under 1MB */
|->reserve_real_mode() /* allocate memory under 1MB */
|->init_mem_mapping() /* allocate page tables, about 2MB to map 1GB memory */
|->dma_contiguous_reserve() /* specified by user, should be low */
|->setup_log_buf() /* specified by user, several mega bytes */
|->relocate_initrd() /* could be large, but will be freed after boot, should reorder */
|->acpi_initrd_override() /* several mega bytes */
|->reserve_crashkernel() /* could be large, should reorder */
|......
|->initmem_init() /* Parse SRAT */
According to Tejun's advice, before SRAT is parsed, we should try our best
to allocate memory near the kernel image. Since the whole node the kernel
resides in won't be hotpluggable, and for a modern server, a node may have
at least 16GB memory, allocating several mega bytes memory around the
kernel image won't cross to hotpluggable memory.
[About this patchset]
So this patchset is the preparation for the problem 2 that we want to
solve. It does the following:
1. Make memblock be able to allocate memory bottom up.
1) Keep all the memblock APIs' prototype unmodified.
2) When the direction is bottom up, keep the start address greater than the
end of kernel image.
2. Improve init_mem_mapping() to support allocate page tables in
bottom up direction.
3. Introduce "movable_node" boot option to enable and disable this
functionality.
This patch (of 6):
Create a new function __memblock_find_range_top_down to factor out of
top-down allocation from memblock_find_in_range_node. This is a
preparation because we will introduce a new bottom-up allocation mode in
the following patch.
Signed-off-by: Tang Chen <tangchen@cn.fujitsu.com>
Signed-off-by: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Toshi Kani <toshi.kani@hp.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Cc: Thomas Renninger <trenn@suse.de>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Jiang Liu <jiang.liu@huawei.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: Taku Izumi <izumi.taku@jp.fujitsu.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-11-12 15:07:57 -08:00
|
|
|
/**
|
|
|
|
* memblock_find_in_range_node - find free area in given range and node
|
|
|
|
* @size: size of free area to find
|
|
|
|
* @align: alignment of free area to find
|
2014-01-21 15:50:14 -08:00
|
|
|
* @start: start of candidate range
|
2018-06-30 17:55:02 +03:00
|
|
|
* @end: end of candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or
|
|
|
|
* %MEMBLOCK_ALLOC_ACCESSIBLE
|
2014-01-21 15:50:16 -08:00
|
|
|
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
|
2015-06-24 16:58:09 -07:00
|
|
|
* @flags: pick from blocks based on memory attributes
|
mm/memblock.c: factor out of top-down allocation
[Problem]
The current Linux cannot migrate pages used by the kernel because of the
kernel direct mapping. In Linux kernel space, va = pa + PAGE_OFFSET.
When the pa is changed, we cannot simply update the pagetable and keep the
va unmodified. So the kernel pages are not migratable.
There are also some other issues will cause the kernel pages not
migratable. For example, the physical address may be cached somewhere and
will be used. It is not to update all the caches.
When doing memory hotplug in Linux, we first migrate all the pages in one
memory device somewhere else, and then remove the device. But if pages
are used by the kernel, they are not migratable. As a result, memory used
by the kernel cannot be hot-removed.
Modifying the kernel direct mapping mechanism is too difficult to do. And
it may cause the kernel performance down and unstable. So we use the
following way to do memory hotplug.
[What we are doing]
In Linux, memory in one numa node is divided into several zones. One of
the zones is ZONE_MOVABLE, which the kernel won't use.
In order to implement memory hotplug in Linux, we are going to arrange all
hotpluggable memory in ZONE_MOVABLE so that the kernel won't use these
memory. To do this, we need ACPI's help.
In ACPI, SRAT(System Resource Affinity Table) contains NUMA info. The
memory affinities in SRAT record every memory range in the system, and
also, flags specifying if the memory range is hotpluggable. (Please refer
to ACPI spec 5.0 5.2.16)
With the help of SRAT, we have to do the following two things to achieve our
goal:
1. When doing memory hot-add, allow the users arranging hotpluggable as
ZONE_MOVABLE.
(This has been done by the MOVABLE_NODE functionality in Linux.)
2. when the system is booting, prevent bootmem allocator from allocating
hotpluggable memory for the kernel before the memory initialization
finishes.
The problem 2 is the key problem we are going to solve. But before solving it,
we need some preparation. Please see below.
[Preparation]
Bootloader has to load the kernel image into memory. And this memory must
be unhotpluggable. We cannot prevent this anyway. So in a memory hotplug
system, we can assume any node the kernel resides in is not hotpluggable.
Before SRAT is parsed, we don't know which memory ranges are hotpluggable.
But memblock has already started to work. In the current kernel,
memblock allocates the following memory before SRAT is parsed:
setup_arch()
|->memblock_x86_fill() /* memblock is ready */
|......
|->early_reserve_e820_mpc_new() /* allocate memory under 1MB */
|->reserve_real_mode() /* allocate memory under 1MB */
|->init_mem_mapping() /* allocate page tables, about 2MB to map 1GB memory */
|->dma_contiguous_reserve() /* specified by user, should be low */
|->setup_log_buf() /* specified by user, several mega bytes */
|->relocate_initrd() /* could be large, but will be freed after boot, should reorder */
|->acpi_initrd_override() /* several mega bytes */
|->reserve_crashkernel() /* could be large, should reorder */
|......
|->initmem_init() /* Parse SRAT */
According to Tejun's advice, before SRAT is parsed, we should try our best
to allocate memory near the kernel image. Since the whole node the kernel
resides in won't be hotpluggable, and for a modern server, a node may have
at least 16GB memory, allocating several mega bytes memory around the
kernel image won't cross to hotpluggable memory.
[About this patchset]
So this patchset is the preparation for the problem 2 that we want to
solve. It does the following:
1. Make memblock be able to allocate memory bottom up.
1) Keep all the memblock APIs' prototype unmodified.
2) When the direction is bottom up, keep the start address greater than the
end of kernel image.
2. Improve init_mem_mapping() to support allocate page tables in
bottom up direction.
3. Introduce "movable_node" boot option to enable and disable this
functionality.
This patch (of 6):
Create a new function __memblock_find_range_top_down to factor the
top-down allocation out of memblock_find_in_range_node. This is
preparation for the new bottom-up allocation mode that will be introduced in
the following patch.
Signed-off-by: Tang Chen <tangchen@cn.fujitsu.com>
Signed-off-by: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Toshi Kani <toshi.kani@hp.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Cc: Thomas Renninger <trenn@suse.de>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Jiang Liu <jiang.liu@huawei.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: Taku Izumi <izumi.taku@jp.fujitsu.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-11-12 15:07:57 -08:00
|
|
|
*
|
|
|
|
* Find @size free area aligned to @align in the specified range and node.
|
|
|
|
*
|
2018-06-30 17:55:02 +03:00
|
|
|
* Return:
|
2013-11-12 15:07:59 -08:00
|
|
|
* Found address on success, 0 on failure.
|
mm/memblock.c: factor out of top-down allocation
[Problem]
The current Linux cannot migrate pages used by the kernel because of the
kernel direct mapping. In Linux kernel space, va = pa + PAGE_OFFSET.
When the pa is changed, we cannot simply update the pagetable and keep the
va unmodified. So the kernel pages are not migratable.
There are also some other issues that make kernel pages
non-migratable. For example, the physical address may be cached somewhere and
used later. It is not practical to update all the caches.
When doing memory hotplug in Linux, we first migrate all the pages in one
memory device somewhere else, and then remove the device. But if pages
are used by the kernel, they are not migratable. As a result, memory used
by the kernel cannot be hot-removed.
Modifying the kernel direct mapping mechanism is too difficult, and
it may degrade kernel performance and stability. So we use the
following approach to do memory hotplug.
[What we are doing]
In Linux, memory in one numa node is divided into several zones. One of
the zones is ZONE_MOVABLE, which the kernel won't use.
In order to implement memory hotplug in Linux, we are going to arrange all
hotpluggable memory in ZONE_MOVABLE so that the kernel won't use these
memory. To do this, we need ACPI's help.
In ACPI, SRAT(System Resource Affinity Table) contains NUMA info. The
memory affinities in SRAT record every memory range in the system, and
also, flags specifying if the memory range is hotpluggable. (Please refer
to ACPI spec 5.0 5.2.16)
With the help of SRAT, we have to do the following two things to achieve our
goal:
1. When doing memory hot-add, allow users to arrange hotpluggable memory as
ZONE_MOVABLE.
(This has been done by the MOVABLE_NODE functionality in Linux.)
2. when the system is booting, prevent bootmem allocator from allocating
hotpluggable memory for the kernel before the memory initialization
finishes.
The problem 2 is the key problem we are going to solve. But before solving it,
we need some preparation. Please see below.
[Preparation]
Bootloader has to load the kernel image into memory. And this memory must
be unhotpluggable. We cannot prevent this anyway. So in a memory hotplug
system, we can assume any node the kernel resides in is not hotpluggable.
Before SRAT is parsed, we don't know which memory ranges are hotpluggable.
But memblock has already started to work. In the current kernel,
memblock allocates the following memory before SRAT is parsed:
setup_arch()
|->memblock_x86_fill() /* memblock is ready */
|......
|->early_reserve_e820_mpc_new() /* allocate memory under 1MB */
|->reserve_real_mode() /* allocate memory under 1MB */
|->init_mem_mapping() /* allocate page tables, about 2MB to map 1GB memory */
|->dma_contiguous_reserve() /* specified by user, should be low */
|->setup_log_buf() /* specified by user, several mega bytes */
|->relocate_initrd() /* could be large, but will be freed after boot, should reorder */
|->acpi_initrd_override() /* several mega bytes */
|->reserve_crashkernel() /* could be large, should reorder */
|......
|->initmem_init() /* Parse SRAT */
According to Tejun's advice, before SRAT is parsed, we should try our best
to allocate memory near the kernel image. Since the whole node the kernel
resides in won't be hotpluggable, and for a modern server, a node may have
at least 16GB memory, allocating several mega bytes memory around the
kernel image won't cross to hotpluggable memory.
[About this patchset]
So this patchset is the preparation for the problem 2 that we want to
solve. It does the following:
1. Make memblock be able to allocate memory bottom up.
1) Keep all the memblock APIs' prototype unmodified.
2) When the direction is bottom up, keep the start address greater than the
end of kernel image.
2. Improve init_mem_mapping() to support allocate page tables in
bottom up direction.
3. Introduce "movable_node" boot option to enable and disable this
functionality.
This patch (of 6):
Create a new function __memblock_find_range_top_down to factor the
top-down allocation out of memblock_find_in_range_node. This is
preparation for the new bottom-up allocation mode that will be introduced in
the following patch.
Signed-off-by: Tang Chen <tangchen@cn.fujitsu.com>
Signed-off-by: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Toshi Kani <toshi.kani@hp.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Cc: Thomas Renninger <trenn@suse.de>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Jiang Liu <jiang.liu@huawei.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: Taku Izumi <izumi.taku@jp.fujitsu.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-11-12 15:07:57 -08:00
|
|
|
*/
|
2019-03-11 23:29:46 -07:00
|
|
|
static phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
					phys_addr_t align, phys_addr_t start,
					phys_addr_t end, int nid,
					enum memblock_flags flags)
{
	/*
	 * @end may carry one of the MEMBLOCK_ALLOC_ACCESSIBLE or
	 * MEMBLOCK_ALLOC_NOLEAKTRACE markers; in that case the search
	 * extends up to memblock.current_limit.
	 */
	if (end == MEMBLOCK_ALLOC_NOLEAKTRACE ||
	    end == MEMBLOCK_ALLOC_ACCESSIBLE)
		end = memblock.current_limit;

	/* the first page is never handed out */
	start = max_t(phys_addr_t, start, PAGE_SIZE);

	/* keep the range well-formed: @end must not lie below @start */
	if (end < start)
		end = start;

	/* dispatch on the configured allocation direction */
	return memblock_bottom_up() ?
		__memblock_find_range_bottom_up(start, end, size, align,
						nid, flags) :
		__memblock_find_range_top_down(start, end, size, align,
					       nid, flags);
}
|
|
|
|
|
2011-12-08 10:22:09 -08:00
|
|
|
/**
|
|
|
|
* memblock_find_in_range - find free area in given range
|
|
|
|
* @start: start of candidate range
|
2018-06-30 17:55:02 +03:00
|
|
|
* @end: end of candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or
|
|
|
|
* %MEMBLOCK_ALLOC_ACCESSIBLE
|
2011-12-08 10:22:09 -08:00
|
|
|
* @size: size of free area to find
|
|
|
|
* @align: alignment of free area to find
|
|
|
|
*
|
|
|
|
* Find @size free area aligned to @align in the specified range.
|
|
|
|
*
|
2018-06-30 17:55:02 +03:00
|
|
|
* Return:
|
2013-11-12 15:07:59 -08:00
|
|
|
* Found address on success, 0 on failure.
|
2011-07-12 09:58:10 +02:00
|
|
|
*/
|
2021-09-02 15:00:26 -07:00
|
|
|
static phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
|
2011-12-08 10:22:09 -08:00
|
|
|
phys_addr_t end, phys_addr_t size,
|
|
|
|
phys_addr_t align)
|
2010-07-12 14:36:48 +10:00
|
|
|
{
|
2015-06-24 16:58:12 -07:00
|
|
|
phys_addr_t ret;
|
2018-06-30 17:55:01 +03:00
|
|
|
enum memblock_flags flags = choose_memblock_flags();
|
2015-06-24 16:58:12 -07:00
|
|
|
|
|
|
|
again:
|
|
|
|
ret = memblock_find_in_range_node(size, align, start, end,
|
|
|
|
NUMA_NO_NODE, flags);
|
|
|
|
|
|
|
|
if (!ret && (flags & MEMBLOCK_MIRROR)) {
|
2022-06-14 17:21:53 +08:00
|
|
|
pr_warn_ratelimited("Could not allocate %pap bytes of mirrored memory\n",
|
2015-06-24 16:58:12 -07:00
|
|
|
&size);
|
|
|
|
flags &= ~MEMBLOCK_MIRROR;
|
|
|
|
goto again;
|
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
2010-07-12 14:36:48 +10:00
|
|
|
}
|
|
|
|
|
2010-07-28 15:43:02 +10:00
|
|
|
/*
 * Drop entry @r from @type's regions array: subtract its size from the
 * running total, shift the following entries down by one and decrement the
 * entry count. When the array becomes empty, slot 0 is reset to a zeroed
 * entry whose node is MAX_NUMNODES.
 */
static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
{
	struct memblock_region *victim = &type->regions[r];

	type->total_size -= victim->size;

	/* close the gap left by the removed entry */
	memmove(victim, victim + 1,
		(type->cnt - (r + 1)) * sizeof(*victim));
	type->cnt--;

	if (type->cnt != 0)
		return;

	/* Special case for empty arrays */
	WARN_ON(type->total_size != 0);
	type->regions[0].base = 0;
	type->regions[0].size = 0;
	type->regions[0].flags = 0;
	memblock_set_region_node(&type->regions[0], MAX_NUMNODES);
}
|
|
|
|
|
2019-05-13 17:22:59 -07:00
|
|
|
#ifndef CONFIG_ARCH_KEEP_MEMBLOCK
|
2017-08-18 15:16:05 -07:00
|
|
|
/**
|
2018-06-30 17:55:02 +03:00
|
|
|
* memblock_discard - discard memory and reserved arrays if they were allocated
|
2017-08-18 15:16:05 -07:00
|
|
|
*/
|
|
|
|
void __init memblock_discard(void)
|
2014-01-23 15:53:11 -08:00
|
|
|
{
|
2017-08-18 15:16:05 -07:00
|
|
|
phys_addr_t addr, size;
|
2014-01-23 15:53:11 -08:00
|
|
|
|
2017-08-18 15:16:05 -07:00
|
|
|
if (memblock.reserved.regions != memblock_reserved_init_regions) {
|
|
|
|
addr = __pa(memblock.reserved.regions);
|
|
|
|
size = PAGE_ALIGN(sizeof(struct memblock_region) *
|
|
|
|
memblock.reserved.max);
|
2022-02-17 22:53:27 +08:00
|
|
|
if (memblock_reserved_in_slab)
|
|
|
|
kfree(memblock.reserved.regions);
|
|
|
|
else
|
|
|
|
memblock_free_late(addr, size);
|
2017-08-18 15:16:05 -07:00
|
|
|
}
|
2014-01-23 15:53:11 -08:00
|
|
|
|
2017-08-25 15:55:46 -07:00
|
|
|
if (memblock.memory.regions != memblock_memory_init_regions) {
|
2017-08-18 15:16:05 -07:00
|
|
|
addr = __pa(memblock.memory.regions);
|
|
|
|
size = PAGE_ALIGN(sizeof(struct memblock_region) *
|
|
|
|
memblock.memory.max);
|
2022-02-17 22:53:27 +08:00
|
|
|
if (memblock_memory_in_slab)
|
|
|
|
kfree(memblock.memory.regions);
|
|
|
|
else
|
|
|
|
memblock_free_late(addr, size);
|
2017-08-18 15:16:05 -07:00
|
|
|
}
|
2020-10-13 16:58:25 -07:00
|
|
|
|
|
|
|
memblock_memory = NULL;
|
2014-01-23 15:53:11 -08:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2012-06-20 12:53:05 -07:00
|
|
|
/**
|
|
|
|
* memblock_double_array - double the size of the memblock regions array
|
|
|
|
* @type: memblock type of the regions array being doubled
|
|
|
|
* @new_area_start: starting address of memory range to avoid overlap with
|
|
|
|
* @new_area_size: size of memory range to avoid overlap with
|
|
|
|
*
|
|
|
|
* Double the size of the @type regions array. If memblock is being used to
|
|
|
|
* allocate memory for a new reserved regions array and there is a previously
|
2018-06-30 17:55:02 +03:00
|
|
|
* allocated memory range [@new_area_start, @new_area_start + @new_area_size]
|
2012-06-20 12:53:05 -07:00
|
|
|
* waiting to be reserved, ensure the memory used by the new array does
|
|
|
|
* not overlap.
|
|
|
|
*
|
2018-06-30 17:55:02 +03:00
|
|
|
* Return:
|
2012-06-20 12:53:05 -07:00
|
|
|
* 0 on success, -1 on failure.
|
|
|
|
*/
|
|
|
|
static int __init_memblock memblock_double_array(struct memblock_type *type,
|
|
|
|
phys_addr_t new_area_start,
|
|
|
|
phys_addr_t new_area_size)
|
2010-07-06 15:39:13 -07:00
|
|
|
{
|
|
|
|
struct memblock_region *new_array, *old_array;
|
2012-07-11 14:02:56 -07:00
|
|
|
phys_addr_t old_alloc_size, new_alloc_size;
|
2018-08-17 15:47:17 -07:00
|
|
|
phys_addr_t old_size, new_size, addr, new_end;
|
2010-07-06 15:39:13 -07:00
|
|
|
int use_slab = slab_is_available();
|
2012-05-29 15:06:50 -07:00
|
|
|
int *in_slab;
|
2010-07-06 15:39:13 -07:00
|
|
|
|
|
|
|
/* We don't allow resizing until we know about the reserved regions
|
|
|
|
* of memory that aren't suitable for allocation
|
|
|
|
*/
|
|
|
|
if (!memblock_can_resize)
|
2023-06-24 11:26:07 +08:00
|
|
|
panic("memblock: cannot resize %s array\n", type->name);
|
2010-07-06 15:39:13 -07:00
|
|
|
|
|
|
|
/* Calculate new doubled size */
|
|
|
|
old_size = type->max * sizeof(struct memblock_region);
|
|
|
|
new_size = old_size << 1;
|
2012-07-11 14:02:56 -07:00
|
|
|
/*
|
|
|
|
* We need to allocated new one align to PAGE_SIZE,
|
|
|
|
* so we can free them completely later.
|
|
|
|
*/
|
|
|
|
old_alloc_size = PAGE_ALIGN(old_size);
|
|
|
|
new_alloc_size = PAGE_ALIGN(new_size);
|
2010-07-06 15:39:13 -07:00
|
|
|
|
2012-05-29 15:06:50 -07:00
|
|
|
/* Retrieve the slab flag */
|
|
|
|
if (type == &memblock.memory)
|
|
|
|
in_slab = &memblock_memory_in_slab;
|
|
|
|
else
|
|
|
|
in_slab = &memblock_reserved_in_slab;
|
|
|
|
|
2019-03-11 23:30:54 -07:00
|
|
|
/* Try to find some space for it */
|
2010-07-06 15:39:13 -07:00
|
|
|
if (use_slab) {
|
|
|
|
new_array = kmalloc(new_size, GFP_KERNEL);
|
2011-07-12 09:58:09 +02:00
|
|
|
addr = new_array ? __pa(new_array) : 0;
|
2012-05-29 15:06:50 -07:00
|
|
|
} else {
|
2012-06-20 12:53:05 -07:00
|
|
|
/* only exclude range when trying to double reserved.regions */
|
|
|
|
if (type != &memblock.reserved)
|
|
|
|
new_area_start = new_area_size = 0;
|
|
|
|
|
|
|
|
addr = memblock_find_in_range(new_area_start + new_area_size,
|
|
|
|
memblock.current_limit,
|
2012-07-11 14:02:56 -07:00
|
|
|
new_alloc_size, PAGE_SIZE);
|
2012-06-20 12:53:05 -07:00
|
|
|
if (!addr && new_area_size)
|
|
|
|
addr = memblock_find_in_range(0,
|
2012-07-31 16:42:40 -07:00
|
|
|
min(new_area_start, memblock.current_limit),
|
|
|
|
new_alloc_size, PAGE_SIZE);
|
2012-06-20 12:53:05 -07:00
|
|
|
|
2012-09-04 13:55:05 +05:30
|
|
|
new_array = addr ? __va(addr) : NULL;
|
2012-05-29 15:06:50 -07:00
|
|
|
}
|
2011-07-12 09:58:09 +02:00
|
|
|
if (!addr) {
|
2010-07-06 15:39:13 -07:00
|
|
|
pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n",
|
2017-02-24 14:55:59 -08:00
|
|
|
type->name, type->max, type->max * 2);
|
2010-07-06 15:39:13 -07:00
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
2018-08-17 15:47:17 -07:00
|
|
|
new_end = addr + new_size - 1;
|
|
|
|
memblock_dbg("memblock: %s is doubled to %ld at [%pa-%pa]",
|
|
|
|
type->name, type->max * 2, &addr, &new_end);
|
2010-07-28 15:13:22 +10:00
|
|
|
|
2012-07-31 16:42:40 -07:00
|
|
|
/*
|
|
|
|
* Found space, we now need to move the array over before we add the
|
|
|
|
* reserved region since it may be our reserved array itself that is
|
|
|
|
* full.
|
2010-07-06 15:39:13 -07:00
|
|
|
*/
|
|
|
|
memcpy(new_array, type->regions, old_size);
|
|
|
|
memset(new_array + type->max, 0, old_size);
|
|
|
|
old_array = type->regions;
|
|
|
|
type->regions = new_array;
|
|
|
|
type->max <<= 1;
|
|
|
|
|
2012-07-31 16:42:40 -07:00
|
|
|
/* Free old array. We needn't free it if the array is the static one */
|
2012-05-29 15:06:50 -07:00
|
|
|
if (*in_slab)
|
|
|
|
kfree(old_array);
|
|
|
|
else if (old_array != memblock_memory_init_regions &&
|
|
|
|
old_array != memblock_reserved_init_regions)
|
2021-11-05 13:43:22 -07:00
|
|
|
memblock_free(old_array, old_alloc_size);
|
2010-07-06 15:39:13 -07:00
|
|
|
|
2012-07-31 16:42:40 -07:00
|
|
|
/*
|
|
|
|
* Reserve the new array if that comes from the memblock. Otherwise, we
|
|
|
|
* needn't do it
|
2012-05-29 15:06:50 -07:00
|
|
|
*/
|
|
|
|
if (!use_slab)
|
2012-07-11 14:02:56 -07:00
|
|
|
BUG_ON(memblock_reserve(addr, new_alloc_size));
|
2012-05-29 15:06:50 -07:00
|
|
|
|
|
|
|
/* Update slab flag */
|
|
|
|
*in_slab = use_slab;
|
|
|
|
|
2010-07-06 15:39:13 -07:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2011-07-12 11:15:55 +02:00
|
|
|
/**
|
|
|
|
* memblock_merge_regions - merge neighboring compatible regions
|
|
|
|
* @type: memblock type to scan
|
2023-01-29 17:00:34 +08:00
|
|
|
* @start_rgn: start scanning from (@start_rgn - 1)
|
|
|
|
* @end_rgn: end scanning at (@end_rgn - 1)
|
|
|
|
* Scan @type and merge neighboring compatible regions in [@start_rgn - 1, @end_rgn)
|
2011-07-12 11:15:55 +02:00
|
|
|
*/
|
2023-01-29 17:00:34 +08:00
|
|
|
static void __init_memblock memblock_merge_regions(struct memblock_type *type,
|
|
|
|
unsigned long start_rgn,
|
|
|
|
unsigned long end_rgn)
|
2010-07-12 14:36:09 +10:00
|
|
|
{
|
2011-07-12 11:15:55 +02:00
|
|
|
int i = 0;
|
2023-01-29 17:00:34 +08:00
|
|
|
if (start_rgn)
|
|
|
|
i = start_rgn - 1;
|
|
|
|
end_rgn = min(end_rgn, type->cnt - 1);
|
|
|
|
while (i < end_rgn) {
|
2011-07-12 11:15:55 +02:00
|
|
|
struct memblock_region *this = &type->regions[i];
|
|
|
|
struct memblock_region *next = &type->regions[i + 1];
|
2010-07-12 14:36:09 +10:00
|
|
|
|
2011-07-14 11:43:42 +02:00
|
|
|
if (this->base + this->size != next->base ||
|
|
|
|
memblock_get_region_node(this) !=
|
2014-01-21 15:49:20 -08:00
|
|
|
memblock_get_region_node(next) ||
|
|
|
|
this->flags != next->flags) {
|
2011-07-12 11:15:55 +02:00
|
|
|
BUG_ON(this->base + this->size > next->base);
|
|
|
|
i++;
|
|
|
|
continue;
|
mm/memblock: properly handle overlaps and fix error path
Currently memblock_reserve() or memblock_free() don't handle overlaps of
any kind. There is some special casing for coalescing exactly adjacent
regions but that's about it.
This is annoying because typically memblock_reserve() is used to mark
regions passed by the firmware as reserved and we all know how much we can
trust our firmwares...
Also, with the current code, if we do something it doesn't handle right
such as trying to memblock_reserve() a large range spanning multiple
existing smaller reserved regions for example, or doing overlapping
reservations, it can silently corrupt the internal region array, causing
odd errors much later on, such as allocations returning reserved regions
etc...
This patch rewrites the underlying functions that add or remove a region
to the arrays. The new code is a lot more robust as it fully handles
overlapping regions. It's also, imho, simpler than the previous
implementation.
In addition, while doing so, I found a bug where if we fail to double the
array while adding a region, we would remove the last region of the array
rather than the region we just allocated. This fixes it too.
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2011-03-22 16:33:43 -07:00
|
|
|
}
|
|
|
|
|
2011-07-12 11:15:55 +02:00
|
|
|
this->size += next->size;
|
2013-01-11 14:31:44 -08:00
|
|
|
/* move forward from next + 1, index of which is i + 2 */
|
|
|
|
memmove(next, next + 1, (type->cnt - (i + 2)) * sizeof(*next));
|
2011-07-12 11:15:55 +02:00
|
|
|
type->cnt--;
|
2023-01-29 17:00:34 +08:00
|
|
|
end_rgn--;
|
2010-07-12 14:36:09 +10:00
|
|
|
}
|
2011-07-12 11:15:55 +02:00
|
|
|
}
|
2010-07-12 14:36:09 +10:00
|
|
|
|
2011-07-12 11:15:55 +02:00
|
|
|
/**
|
|
|
|
* memblock_insert_region - insert new memblock region
|
2013-04-29 15:08:41 -07:00
|
|
|
* @type: memblock type to insert into
|
|
|
|
* @idx: index for the insertion point
|
|
|
|
* @base: base address of the new region
|
|
|
|
* @size: size of the new region
|
|
|
|
* @nid: node id of the new region
|
2014-01-21 15:49:20 -08:00
|
|
|
* @flags: flags of the new region
|
2011-07-12 11:15:55 +02:00
|
|
|
*
|
2018-06-30 17:55:02 +03:00
|
|
|
* Insert new memblock region [@base, @base + @size) into @type at @idx.
|
2016-08-04 15:31:46 -07:00
|
|
|
* @type must already have extra room to accommodate the new region.
|
2011-07-12 11:15:55 +02:00
|
|
|
*/
|
|
|
|
static void __init_memblock memblock_insert_region(struct memblock_type *type,
|
|
|
|
int idx, phys_addr_t base,
|
2014-01-21 15:49:20 -08:00
|
|
|
phys_addr_t size,
|
2018-06-30 17:55:01 +03:00
|
|
|
int nid,
|
|
|
|
enum memblock_flags flags)
|
2011-07-12 11:15:55 +02:00
|
|
|
{
|
|
|
|
struct memblock_region *rgn = &type->regions[idx];
|
|
|
|
|
|
|
|
BUG_ON(type->cnt >= type->max);
|
|
|
|
memmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn));
|
|
|
|
rgn->base = base;
|
|
|
|
rgn->size = size;
|
2014-01-21 15:49:20 -08:00
|
|
|
rgn->flags = flags;
|
2011-07-14 11:43:42 +02:00
|
|
|
memblock_set_region_node(rgn, nid);
|
2011-07-12 11:15:55 +02:00
|
|
|
type->cnt++;
|
2011-12-08 10:22:08 -08:00
|
|
|
type->total_size += size;
|
2011-07-12 11:15:55 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
2014-01-29 18:16:01 +01:00
|
|
|
* memblock_add_range - add new memblock region
|
2011-07-12 11:15:55 +02:00
|
|
|
* @type: memblock type to add new region into
|
|
|
|
* @base: base address of the new region
|
|
|
|
* @size: size of the new region
|
2011-12-08 10:22:08 -08:00
|
|
|
* @nid: nid of the new region
|
2014-01-21 15:49:20 -08:00
|
|
|
* @flags: flags of the new region
|
2011-07-12 11:15:55 +02:00
|
|
|
*
|
2018-06-30 17:55:02 +03:00
|
|
|
* Add new memblock region [@base, @base + @size) into @type. The new region
|
2011-07-12 11:15:55 +02:00
|
|
|
* is allowed to overlap with existing ones - overlaps don't affect already
|
|
|
|
* existing regions. @type is guaranteed to be minimal (all neighbouring
|
|
|
|
* compatible regions are merged) after the addition.
|
|
|
|
*
|
2018-06-30 17:55:02 +03:00
|
|
|
* Return:
|
2011-07-12 11:15:55 +02:00
|
|
|
* 0 on success, -errno on failure.
|
|
|
|
*/
|
2020-01-30 22:14:20 -08:00
|
|
|
static int __init_memblock memblock_add_range(struct memblock_type *type,
|
2014-01-21 15:49:20 -08:00
|
|
|
phys_addr_t base, phys_addr_t size,
|
2018-06-30 17:55:01 +03:00
|
|
|
int nid, enum memblock_flags flags)
|
2011-07-12 11:15:55 +02:00
|
|
|
{
|
|
|
|
bool insert = false;
|
2011-12-08 10:22:07 -08:00
|
|
|
phys_addr_t obase = base;
|
|
|
|
phys_addr_t end = base + memblock_cap_size(base, &size);
|
2023-01-29 17:00:34 +08:00
|
|
|
int idx, nr_new, start_rgn = -1, end_rgn;
|
2016-01-14 15:20:42 -08:00
|
|
|
struct memblock_region *rgn;
|
2011-07-12 11:15:55 +02:00
|
|
|
|
2012-04-20 08:31:34 -07:00
|
|
|
if (!size)
|
|
|
|
return 0;
|
|
|
|
|
2011-07-12 11:15:55 +02:00
|
|
|
/* special case for empty array */
|
|
|
|
if (type->regions[0].size == 0) {
|
2024-04-05 01:58:21 +00:00
|
|
|
WARN_ON(type->cnt != 0 || type->total_size);
|
mm/memblock: properly handle overlaps and fix error path
Currently memblock_reserve() or memblock_free() don't handle overlaps of
any kind. There is some special casing for coalescing exactly adjacent
regions but that's about it.
This is annoying because typically memblock_reserve() is used to mark
regions passed by the firmware as reserved and we all know how much we can
trust our firmwares...
Also, with the current code, if we do something it doesn't handle right
such as trying to memblock_reserve() a large range spanning multiple
existing smaller reserved regions for example, or doing overlapping
reservations, it can silently corrupt the internal region array, causing
odd errors much later on, such as allocations returning reserved regions
etc...
This patch rewrites the underlying functions that add or remove a region
to the arrays. The new code is a lot more robust as it fully handles
overlapping regions. It's also, imho, simpler than the previous
implementation.
In addition, while doing so, I found a bug where if we fail to double the
array while adding a region, we would remove the last region of the array
rather than the region we just allocated. This fixes it too.
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2011-03-22 16:33:43 -07:00
|
|
|
type->regions[0].base = base;
|
|
|
|
type->regions[0].size = size;
|
2014-01-21 15:49:20 -08:00
|
|
|
type->regions[0].flags = flags;
|
2011-12-08 10:22:08 -08:00
|
|
|
memblock_set_region_node(&type->regions[0], nid);
|
2011-12-08 10:22:08 -08:00
|
|
|
type->total_size = size;
|
2024-04-05 01:58:21 +00:00
|
|
|
type->cnt = 1;
|
mm/memblock: properly handle overlaps and fix error path
Currently memblock_reserve() or memblock_free() don't handle overlaps of
any kind. There is some special casing for coalescing exactly adjacent
regions but that's about it.
This is annoying because typically memblock_reserve() is used to mark
regions passed by the firmware as reserved and we all know how much we can
trust our firmwares...
Also, with the current code, if we do something it doesn't handle right
such as trying to memblock_reserve() a large range spanning multiple
existing smaller reserved regions for example, or doing overlapping
reservations, it can silently corrupt the internal region array, causing
odd errors much later on, such as allocations returning reserved regions
etc...
This patch rewrites the underlying functions that add or remove a region
to the arrays. The new code is a lot more robust as it fully handles
overlapping regions. It's also, imho, simpler than the previous
implementation.
In addition, while doing so, I found a bug where if we fail to double the
array while adding a region, we would remove the last region of the array
rather than the region we just allocated. This fixes it too.
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2011-03-22 16:33:43 -07:00
|
|
|
return 0;
|
2010-07-12 14:36:09 +10:00
|
|
|
}
|
2022-06-15 17:40:15 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* The worst case is when new range overlaps all existing regions,
|
|
|
|
* then we'll need type->cnt + 1 empty regions in @type. So if
|
2023-01-29 17:00:33 +08:00
|
|
|
* type->cnt * 2 + 1 is less than or equal to type->max, we know
|
2022-06-15 17:40:15 +08:00
|
|
|
* that there is enough empty regions in @type, and we can insert
|
|
|
|
* regions directly.
|
|
|
|
*/
|
2023-01-29 17:00:33 +08:00
|
|
|
if (type->cnt * 2 + 1 <= type->max)
|
2022-06-15 17:40:15 +08:00
|
|
|
insert = true;
|
|
|
|
|
2011-07-12 11:15:55 +02:00
|
|
|
repeat:
|
|
|
|
/*
|
|
|
|
* The following is executed twice. Once with %false @insert and
|
|
|
|
* then with %true. The first counts the number of regions needed
|
2016-08-04 15:31:46 -07:00
|
|
|
* to accommodate the new area. The second actually inserts them.
|
2010-07-06 15:39:13 -07:00
|
|
|
*/
|
2011-07-12 11:15:55 +02:00
|
|
|
base = obase;
|
|
|
|
nr_new = 0;
|
2010-07-12 14:36:09 +10:00
|
|
|
|
2017-11-15 17:33:42 -08:00
|
|
|
for_each_memblock_type(idx, type, rgn) {
|
2011-07-12 11:15:55 +02:00
|
|
|
phys_addr_t rbase = rgn->base;
|
|
|
|
phys_addr_t rend = rbase + rgn->size;
|
|
|
|
|
|
|
|
if (rbase >= end)
|
2010-07-12 14:36:09 +10:00
|
|
|
break;
|
2011-07-12 11:15:55 +02:00
|
|
|
if (rend <= base)
|
|
|
|
continue;
|
|
|
|
/*
|
|
|
|
* @rgn overlaps. If it separates the lower part of new
|
|
|
|
* area, insert that portion.
|
|
|
|
*/
|
|
|
|
if (rbase > base) {
|
2021-06-28 19:43:01 -07:00
|
|
|
#ifdef CONFIG_NUMA
|
2015-09-04 15:47:38 -07:00
|
|
|
WARN_ON(nid != memblock_get_region_node(rgn));
|
|
|
|
#endif
|
2015-09-08 14:59:53 -07:00
|
|
|
WARN_ON(flags != rgn->flags);
|
2011-07-12 11:15:55 +02:00
|
|
|
nr_new++;
|
2023-01-29 17:00:34 +08:00
|
|
|
if (insert) {
|
|
|
|
if (start_rgn == -1)
|
|
|
|
start_rgn = idx;
|
|
|
|
end_rgn = idx + 1;
|
2016-01-14 15:20:42 -08:00
|
|
|
memblock_insert_region(type, idx++, base,
|
2014-01-21 15:49:20 -08:00
|
|
|
rbase - base, nid,
|
|
|
|
flags);
|
2023-01-29 17:00:34 +08:00
|
|
|
}
|
2010-07-12 14:36:09 +10:00
|
|
|
}
|
2011-07-12 11:15:55 +02:00
|
|
|
/* area below @rend is dealt with, forget about it */
|
|
|
|
base = min(rend, end);
|
2010-07-12 14:36:09 +10:00
|
|
|
}
|
2011-07-12 11:15:55 +02:00
|
|
|
|
|
|
|
/* insert the remaining portion */
|
|
|
|
if (base < end) {
|
|
|
|
nr_new++;
|
2023-01-29 17:00:34 +08:00
|
|
|
if (insert) {
|
|
|
|
if (start_rgn == -1)
|
|
|
|
start_rgn = idx;
|
|
|
|
end_rgn = idx + 1;
|
2016-01-14 15:20:42 -08:00
|
|
|
memblock_insert_region(type, idx, base, end - base,
|
2014-01-21 15:49:20 -08:00
|
|
|
nid, flags);
|
2023-01-29 17:00:34 +08:00
|
|
|
}
|
2010-07-12 14:36:09 +10:00
|
|
|
}
|
|
|
|
|
2016-07-26 15:24:56 -07:00
|
|
|
if (!nr_new)
|
|
|
|
return 0;
|
|
|
|
|
2011-07-12 11:15:55 +02:00
|
|
|
/*
|
|
|
|
* If this was the first round, resize array and repeat for actual
|
|
|
|
* insertions; otherwise, merge and return.
|
2010-07-06 15:39:13 -07:00
|
|
|
*/
|
2011-07-12 11:15:55 +02:00
|
|
|
if (!insert) {
|
|
|
|
while (type->cnt + nr_new > type->max)
|
2012-06-20 12:53:05 -07:00
|
|
|
if (memblock_double_array(type, obase, size) < 0)
|
2011-07-12 11:15:55 +02:00
|
|
|
return -ENOMEM;
|
|
|
|
insert = true;
|
|
|
|
goto repeat;
|
|
|
|
} else {
|
2023-01-29 17:00:34 +08:00
|
|
|
memblock_merge_regions(type, start_rgn, end_rgn);
|
2011-07-12 11:15:55 +02:00
|
|
|
return 0;
|
2010-07-06 15:39:13 -07:00
|
|
|
}
|
2010-07-12 14:36:09 +10:00
|
|
|
}
|
|
|
|
|
2018-06-30 17:55:03 +03:00
|
|
|
/**
|
|
|
|
* memblock_add_node - add new memblock region within a NUMA node
|
|
|
|
* @base: base address of the new region
|
|
|
|
* @size: size of the new region
|
|
|
|
* @nid: nid of the new region
|
2021-11-05 13:44:49 -07:00
|
|
|
* @flags: flags of the new region
|
2018-06-30 17:55:03 +03:00
|
|
|
*
|
|
|
|
* Add new memblock region [@base, @base + @size) to the "memory"
|
|
|
|
* type. See memblock_add_range() description for mode details
|
|
|
|
*
|
|
|
|
* Return:
|
|
|
|
* 0 on success, -errno on failure.
|
|
|
|
*/
|
2011-12-08 10:22:08 -08:00
|
|
|
int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size,
|
2021-11-05 13:44:49 -07:00
|
|
|
int nid, enum memblock_flags flags)
|
2011-12-08 10:22:08 -08:00
|
|
|
{
|
2021-08-11 10:54:36 +02:00
|
|
|
phys_addr_t end = base + size - 1;
|
|
|
|
|
2021-11-05 13:44:49 -07:00
|
|
|
memblock_dbg("%s: [%pa-%pa] nid=%d flags=%x %pS\n", __func__,
|
|
|
|
&base, &end, nid, flags, (void *)_RET_IP_);
|
2021-08-11 10:54:36 +02:00
|
|
|
|
2021-11-05 13:44:49 -07:00
|
|
|
return memblock_add_range(&memblock.memory, base, size, nid, flags);
|
2011-12-08 10:22:08 -08:00
|
|
|
}
|
|
|
|
|
2018-06-30 17:55:03 +03:00
|
|
|
/**
|
|
|
|
* memblock_add - add new memblock region
|
|
|
|
* @base: base address of the new region
|
|
|
|
* @size: size of the new region
|
|
|
|
*
|
|
|
|
* Add new memblock region [@base, @base + @size) to the "memory"
|
|
|
|
* type. See memblock_add_range() description for mode details
|
|
|
|
*
|
|
|
|
* Return:
|
|
|
|
* 0 on success, -errno on failure.
|
|
|
|
*/
|
2016-05-20 16:57:35 -07:00
|
|
|
int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
|
2015-04-15 16:14:44 -07:00
|
|
|
{
|
2017-02-22 15:46:42 -08:00
|
|
|
phys_addr_t end = base + size - 1;
|
|
|
|
|
2020-01-30 22:14:23 -08:00
|
|
|
memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
|
2017-02-22 15:46:42 -08:00
|
|
|
&base, &end, (void *)_RET_IP_);
|
2015-04-15 16:14:44 -07:00
|
|
|
|
2016-05-20 16:57:35 -07:00
|
|
|
return memblock_add_range(&memblock.memory, base, size, MAX_NUMNODES, 0);
|
2010-07-12 14:36:09 +10:00
|
|
|
}
|
|
|
|
|
2023-10-26 10:03:29 +08:00
|
|
|
/**
|
|
|
|
* memblock_validate_numa_coverage - check if amount of memory with
|
|
|
|
* no node ID assigned is less than a threshold
|
|
|
|
* @threshold_bytes: maximal number of pages that can have unassigned node
|
|
|
|
* ID (in bytes).
|
|
|
|
*
|
|
|
|
* A buggy firmware may report memory that does not belong to any node.
|
|
|
|
* Check if amount of such memory is below @threshold_bytes.
|
|
|
|
*
|
|
|
|
* Return: true on success, false on failure.
|
|
|
|
*/
|
|
|
|
bool __init_memblock memblock_validate_numa_coverage(unsigned long threshold_bytes)
|
|
|
|
{
|
|
|
|
unsigned long nr_pages = 0;
|
|
|
|
unsigned long start_pfn, end_pfn, mem_size_mb;
|
|
|
|
int nid, i;
|
|
|
|
|
|
|
|
/* calculate lose page */
|
|
|
|
for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
|
2024-06-14 11:05:43 +03:00
|
|
|
if (!numa_valid_node(nid))
|
2023-10-26 10:03:29 +08:00
|
|
|
nr_pages += end_pfn - start_pfn;
|
|
|
|
}
|
|
|
|
|
|
|
|
if ((nr_pages << PAGE_SHIFT) >= threshold_bytes) {
|
|
|
|
mem_size_mb = memblock_phys_mem_size() >> 20;
|
|
|
|
pr_err("NUMA: no nodes coverage for %luMB of %luMB RAM\n",
|
|
|
|
(nr_pages << PAGE_SHIFT) >> 20, mem_size_mb);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2011-12-08 10:22:07 -08:00
|
|
|
/**
|
|
|
|
* memblock_isolate_range - isolate given range into disjoint memblocks
|
|
|
|
* @type: memblock type to isolate range for
|
|
|
|
* @base: base of range to isolate
|
|
|
|
* @size: size of range to isolate
|
|
|
|
* @start_rgn: out parameter for the start of isolated region
|
|
|
|
* @end_rgn: out parameter for the end of isolated region
|
|
|
|
*
|
|
|
|
* Walk @type and ensure that regions don't cross the boundaries defined by
|
2018-06-30 17:55:02 +03:00
|
|
|
* [@base, @base + @size). Crossing regions are split at the boundaries,
|
2011-12-08 10:22:07 -08:00
|
|
|
* which may create at most two more regions. The index of the first
|
2024-05-07 07:58:29 +00:00
|
|
|
* region inside the range is returned in *@start_rgn and the index of the
|
|
|
|
* first region after the range is returned in *@end_rgn.
|
2011-12-08 10:22:07 -08:00
|
|
|
*
|
2018-06-30 17:55:02 +03:00
|
|
|
* Return:
|
2011-12-08 10:22:07 -08:00
|
|
|
* 0 on success, -errno on failure.
|
|
|
|
*/
|
|
|
|
static int __init_memblock memblock_isolate_range(struct memblock_type *type,
|
|
|
|
phys_addr_t base, phys_addr_t size,
|
|
|
|
int *start_rgn, int *end_rgn)
|
|
|
|
{
|
2011-12-08 10:22:07 -08:00
|
|
|
phys_addr_t end = base + memblock_cap_size(base, &size);
|
2016-01-14 15:20:42 -08:00
|
|
|
int idx;
|
|
|
|
struct memblock_region *rgn;
|
2011-12-08 10:22:07 -08:00
|
|
|
|
|
|
|
*start_rgn = *end_rgn = 0;
|
|
|
|
|
2012-04-20 08:31:34 -07:00
|
|
|
if (!size)
|
|
|
|
return 0;
|
|
|
|
|
2011-12-08 10:22:07 -08:00
|
|
|
/* we'll create at most two more regions */
|
|
|
|
while (type->cnt + 2 > type->max)
|
2012-06-20 12:53:05 -07:00
|
|
|
if (memblock_double_array(type, base, size) < 0)
|
2011-12-08 10:22:07 -08:00
|
|
|
return -ENOMEM;
|
|
|
|
|
2017-11-15 17:33:42 -08:00
|
|
|
for_each_memblock_type(idx, type, rgn) {
|
2011-12-08 10:22:07 -08:00
|
|
|
phys_addr_t rbase = rgn->base;
|
|
|
|
phys_addr_t rend = rbase + rgn->size;
|
|
|
|
|
|
|
|
if (rbase >= end)
|
|
|
|
break;
|
|
|
|
if (rend <= base)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (rbase < base) {
|
|
|
|
/*
|
|
|
|
* @rgn intersects from below. Split and continue
|
|
|
|
* to process the next region - the new top half.
|
|
|
|
*/
|
|
|
|
rgn->base = base;
|
2011-12-08 10:22:08 -08:00
|
|
|
rgn->size -= base - rbase;
|
|
|
|
type->total_size -= base - rbase;
|
2016-01-14 15:20:42 -08:00
|
|
|
memblock_insert_region(type, idx, rbase, base - rbase,
|
2014-01-21 15:49:20 -08:00
|
|
|
memblock_get_region_node(rgn),
|
|
|
|
rgn->flags);
|
2011-12-08 10:22:07 -08:00
|
|
|
} else if (rend > end) {
|
|
|
|
/*
|
|
|
|
* @rgn intersects from above. Split and redo the
|
|
|
|
* current region - the new bottom half.
|
|
|
|
*/
|
|
|
|
rgn->base = end;
|
2011-12-08 10:22:08 -08:00
|
|
|
rgn->size -= end - rbase;
|
|
|
|
type->total_size -= end - rbase;
|
2016-01-14 15:20:42 -08:00
|
|
|
memblock_insert_region(type, idx--, rbase, end - rbase,
|
2014-01-21 15:49:20 -08:00
|
|
|
memblock_get_region_node(rgn),
|
|
|
|
rgn->flags);
|
2011-12-08 10:22:07 -08:00
|
|
|
} else {
|
|
|
|
/* @rgn is fully contained, record it */
|
|
|
|
if (!*end_rgn)
|
2016-01-14 15:20:42 -08:00
|
|
|
*start_rgn = idx;
|
|
|
|
*end_rgn = idx + 1;
|
2011-12-08 10:22:07 -08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2015-11-05 18:47:00 -08:00
|
|
|
/*
 * Remove [@base, @base + @size) from @type: isolate the range first, then
 * remove the isolated regions one by one, from the highest index down to
 * the lowest. Returns 0 on success or the error reported by
 * memblock_isolate_range().
 */
static int __init_memblock memblock_remove_range(struct memblock_type *type,
					  phys_addr_t base, phys_addr_t size)
{
	int start_rgn, end_rgn;
	int i, ret;

	ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn);
	if (ret)
		return ret;

	/* Walk indices end_rgn - 1 down to start_rgn, inclusive. */
	i = end_rgn;
	while (i-- > start_rgn)
		memblock_remove_region(type, i);

	return 0;
}
|
|
|
|
|
2011-12-08 10:22:06 -08:00
|
|
|
/*
 * Remove [@base, @base + @size) from the "memory" type, logging the range
 * and the caller through memblock_dbg() first. Returns the result of
 * memblock_remove_range().
 */
int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
{
	struct memblock_type *type = &memblock.memory;
	/* Inclusive last address; only used for the debug message below. */
	phys_addr_t end = base + size - 1;

	memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
		     &base, &end, (void *)_RET_IP_);

	return memblock_remove_range(type, base, size);
}
|
|
|
|
|
memblock: introduce saner 'memblock_free_ptr()' interface
The boot-time allocation interface for memblock is a mess, with
'memblock_alloc()' returning a virtual pointer, but then you are
supposed to free it with 'memblock_free()' that takes a _physical_
address.
Not only is that all kinds of strange and illogical, but it actually
causes bugs, when people then use it like a normal allocation function,
and it fails spectacularly on a NULL pointer:
https://lore.kernel.org/all/20210912140820.GD25450@xsang-OptiPlex-9020/
or just random memory corruption if the debug checks don't catch it:
https://lore.kernel.org/all/61ab2d0c-3313-aaab-514c-e15b7aa054a0@suse.cz/
I really don't want to apply patches that treat the symptoms, when the
fundamental cause is this horribly confusing interface.
I started out looking at just automating a sane replacement sequence,
but because of this mix of virtual and physical addresses, and because
people have used the "__pa()" macro that can take either a regular
kernel pointer, or just the raw "unsigned long" address, it's all quite
messy.
So this just introduces a new saner interface for freeing a virtual
address that was allocated using 'memblock_alloc()', and that was kept
as a regular kernel pointer. And then it converts a couple of users
that are obvious and easy to test, including the 'xbc_nodes' case in
lib/bootconfig.c that caused problems.
Reported-by: kernel test robot <oliver.sang@intel.com>
Fixes: 40caa127f3c7 ("init: bootconfig: Remove all bootconfig data when the init memory is removed")
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-09-14 13:23:22 -07:00
|
|
|
/**
|
2021-11-05 13:43:22 -07:00
|
|
|
* memblock_free - free boot memory allocation
|
memblock: introduce saner 'memblock_free_ptr()' interface
The boot-time allocation interface for memblock is a mess, with
'memblock_alloc()' returning a virtual pointer, but then you are
supposed to free it with 'memblock_free()' that takes a _physical_
address.
Not only is that all kinds of strange and illogical, but it actually
causes bugs, when people then use it like a normal allocation function,
and it fails spectacularly on a NULL pointer:
https://lore.kernel.org/all/20210912140820.GD25450@xsang-OptiPlex-9020/
or just random memory corruption if the debug checks don't catch it:
https://lore.kernel.org/all/61ab2d0c-3313-aaab-514c-e15b7aa054a0@suse.cz/
I really don't want to apply patches that treat the symptoms, when the
fundamental cause is this horribly confusing interface.
I started out looking at just automating a sane replacement sequence,
but because of this mix of virtual and physical addresses, and because
people have used the "__pa()" macro that can take either a regular
kernel pointer, or just the raw "unsigned long" address, it's all quite
messy.
So this just introduces a new saner interface for freeing a virtual
address that was allocated using 'memblock_alloc()', and that was kept
as a regular kernel pointer. And then it converts a couple of users
that are obvious and easy to test, including the 'xbc_nodes' case in
lib/bootconfig.c that caused problems.
Reported-by: kernel test robot <oliver.sang@intel.com>
Fixes: 40caa127f3c7 ("init: bootconfig: Remove all bootconfig data when the init memory is removed")
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-09-14 13:23:22 -07:00
|
|
|
* @ptr: starting address of the boot memory allocation
|
|
|
|
* @size: size of the boot memory block in bytes
|
|
|
|
*
|
|
|
|
* Free boot memory block previously allocated by memblock_alloc_xx() API.
|
|
|
|
* The freeing memory will not be released to the buddy allocator.
|
|
|
|
*/
|
2021-11-05 13:43:22 -07:00
|
|
|
void __init_memblock memblock_free(void *ptr, size_t size)
|
memblock: introduce saner 'memblock_free_ptr()' interface
The boot-time allocation interface for memblock is a mess, with
'memblock_alloc()' returning a virtual pointer, but then you are
supposed to free it with 'memblock_free()' that takes a _physical_
address.
Not only is that all kinds of strange and illogical, but it actually
causes bugs, when people then use it like a normal allocation function,
and it fails spectacularly on a NULL pointer:
https://lore.kernel.org/all/20210912140820.GD25450@xsang-OptiPlex-9020/
or just random memory corruption if the debug checks don't catch it:
https://lore.kernel.org/all/61ab2d0c-3313-aaab-514c-e15b7aa054a0@suse.cz/
I really don't want to apply patches that treat the symptoms, when the
fundamental cause is this horribly confusing interface.
I started out looking at just automating a sane replacement sequence,
but because of this mix of virtual and physical addresses, and because
people have used the "__pa()" macro that can take either a regular
kernel pointer, or just the raw "unsigned long" address, it's all quite
messy.
So this just introduces a new saner interface for freeing a virtual
address that was allocated using 'memblock_alloc()', and that was kept
as a regular kernel pointer. And then it converts a couple of users
that are obvious and easy to test, including the 'xbc_nodes' case in
lib/bootconfig.c that caused problems.
Reported-by: kernel test robot <oliver.sang@intel.com>
Fixes: 40caa127f3c7 ("init: bootconfig: Remove all bootconfig data when the init memory is removed")
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-09-14 13:23:22 -07:00
|
|
|
{
|
|
|
|
if (ptr)
|
2021-11-05 13:43:19 -07:00
|
|
|
memblock_phys_free(__pa(ptr), size);
|
memblock: introduce saner 'memblock_free_ptr()' interface
The boot-time allocation interface for memblock is a mess, with
'memblock_alloc()' returning a virtual pointer, but then you are
supposed to free it with 'memblock_free()' that takes a _physical_
address.
Not only is that all kinds of strange and illogical, but it actually
causes bugs, when people then use it like a normal allocation function,
and it fails spectacularly on a NULL pointer:
https://lore.kernel.org/all/20210912140820.GD25450@xsang-OptiPlex-9020/
or just random memory corruption if the debug checks don't catch it:
https://lore.kernel.org/all/61ab2d0c-3313-aaab-514c-e15b7aa054a0@suse.cz/
I really don't want to apply patches that treat the symptoms, when the
fundamental cause is this horribly confusing interface.
I started out looking at just automating a sane replacement sequence,
but because of this mix of virtual and physical addresses, and because
people have used the "__pa()" macro that can take either a regular
kernel pointer, or just the raw "unsigned long" address, it's all quite
messy.
So this just introduces a new saner interface for freeing a virtual
address that was allocated using 'memblock_alloc()', and that was kept
as a regular kernel pointer. And then it converts a couple of users
that are obvious and easy to test, including the 'xbc_nodes' case in
lib/bootconfig.c that caused problems.
Reported-by: kernel test robot <oliver.sang@intel.com>
Fixes: 40caa127f3c7 ("init: bootconfig: Remove all bootconfig data when the init memory is removed")
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-09-14 13:23:22 -07:00
|
|
|
}
|
|
|
|
|
2018-12-28 00:35:29 -08:00
|
|
|
/**
|
2021-11-05 13:43:19 -07:00
|
|
|
* memblock_phys_free - free boot memory block
|
2018-12-28 00:35:29 -08:00
|
|
|
* @base: phys starting address of the boot memory block
|
|
|
|
* @size: size of the boot memory block in bytes
|
|
|
|
*
|
2022-12-16 14:03:03 +04:00
|
|
|
* Free boot memory block previously allocated by memblock_phys_alloc_xx() API.
|
2018-12-28 00:35:29 -08:00
|
|
|
* The freed memory will not be released to the buddy allocator.
|
|
|
|
*/
|
2021-11-05 13:43:19 -07:00
|
|
|
int __init_memblock memblock_phys_free(phys_addr_t base, phys_addr_t size)
|
2010-07-12 14:36:09 +10:00
|
|
|
{
|
2017-02-22 15:46:42 -08:00
|
|
|
phys_addr_t end = base + size - 1;
|
|
|
|
|
2020-01-30 22:14:23 -08:00
|
|
|
memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
|
2017-02-22 15:46:42 -08:00
|
|
|
&base, &end, (void *)_RET_IP_);
|
2011-07-12 11:16:06 +02:00
|
|
|
|
2016-10-11 13:55:11 -07:00
|
|
|
kmemleak_free_part_phys(base, size);
|
2014-01-29 18:16:01 +01:00
|
|
|
return memblock_remove_range(&memblock.reserved, base, size);
|
2010-07-12 14:36:09 +10:00
|
|
|
}
|
|
|
|
|
2016-05-20 16:57:35 -07:00
|
|
|
int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
|
2010-07-12 14:36:09 +10:00
|
|
|
{
|
2017-02-22 15:46:42 -08:00
|
|
|
phys_addr_t end = base + size - 1;
|
|
|
|
|
2020-01-30 22:14:23 -08:00
|
|
|
memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
|
2017-02-22 15:46:42 -08:00
|
|
|
&base, &end, (void *)_RET_IP_);
|
2010-07-12 14:36:09 +10:00
|
|
|
|
2016-05-20 16:57:35 -07:00
|
|
|
return memblock_add_range(&memblock.reserved, base, size, MAX_NUMNODES, 0);
|
2010-07-12 14:36:09 +10:00
|
|
|
}
|
|
|
|
|
2020-01-30 22:14:20 -08:00
|
|
|
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
|
|
|
|
int __init_memblock memblock_physmem_add(phys_addr_t base, phys_addr_t size)
|
|
|
|
{
|
|
|
|
phys_addr_t end = base + size - 1;
|
|
|
|
|
|
|
|
memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
|
|
|
|
&base, &end, (void *)_RET_IP_);
|
|
|
|
|
2020-07-01 16:18:29 +02:00
|
|
|
return memblock_add_range(&physmem, base, size, MAX_NUMNODES, 0);
|
2020-01-30 22:14:20 -08:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2014-01-21 15:49:23 -08:00
|
|
|
/**
|
2018-06-30 17:55:02 +03:00
|
|
|
* memblock_setclr_flag - set or clear flag for a memory region
|
2023-09-13 11:53:59 +01:00
|
|
|
* @type: memblock type to set/clear flag for
|
2018-06-30 17:55:02 +03:00
|
|
|
* @base: base address of the region
|
|
|
|
* @size: size of the region
|
|
|
|
* @set: set or clear the flag
|
2020-12-15 20:47:26 -08:00
|
|
|
* @flag: the flag to update
|
2014-01-21 15:49:23 -08:00
|
|
|
*
|
2014-12-12 16:54:59 -08:00
|
|
|
* This function isolates region [@base, @base + @size) and sets/clears @flag.
|
2014-01-21 15:49:23 -08:00
|
|
|
*
|
2018-06-30 17:55:02 +03:00
|
|
|
* Return: 0 on success, -errno on failure.
|
2014-01-21 15:49:23 -08:00
|
|
|
*/
|
2023-09-13 11:53:59 +01:00
|
|
|
static int __init_memblock memblock_setclr_flag(struct memblock_type *type,
|
|
|
|
phys_addr_t base, phys_addr_t size, int set, int flag)
|
2014-01-21 15:49:23 -08:00
|
|
|
{
|
|
|
|
int i, ret, start_rgn, end_rgn;
|
|
|
|
|
|
|
|
ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
2019-03-11 23:30:46 -07:00
|
|
|
for (i = start_rgn; i < end_rgn; i++) {
|
|
|
|
struct memblock_region *r = &type->regions[i];
|
|
|
|
|
2014-12-12 16:54:59 -08:00
|
|
|
if (set)
|
2019-03-11 23:30:46 -07:00
|
|
|
r->flags |= flag;
|
2014-12-12 16:54:59 -08:00
|
|
|
else
|
2019-03-11 23:30:46 -07:00
|
|
|
r->flags &= ~flag;
|
|
|
|
}
|
2014-01-21 15:49:23 -08:00
|
|
|
|
2023-01-29 17:00:34 +08:00
|
|
|
memblock_merge_regions(type, start_rgn, end_rgn);
|
2014-01-21 15:49:23 -08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
2014-12-12 16:54:59 -08:00
|
|
|
* memblock_mark_hotplug - Mark hotpluggable memory with flag MEMBLOCK_HOTPLUG.
|
2014-01-21 15:49:23 -08:00
|
|
|
* @base: the base phys addr of the region
|
|
|
|
* @size: the size of the region
|
|
|
|
*
|
2018-06-30 17:55:02 +03:00
|
|
|
* Return: 0 on success, -errno on failure.
|
2014-12-12 16:54:59 -08:00
|
|
|
*/
|
|
|
|
int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
|
|
|
|
{
|
2023-09-13 11:53:59 +01:00
|
|
|
return memblock_setclr_flag(&memblock.memory, base, size, 1, MEMBLOCK_HOTPLUG);
|
2014-12-12 16:54:59 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* memblock_clear_hotplug - Clear flag MEMBLOCK_HOTPLUG for a specified region.
|
|
|
|
* @base: the base phys addr of the region
|
|
|
|
* @size: the size of the region
|
2014-01-21 15:49:23 -08:00
|
|
|
*
|
2018-06-30 17:55:02 +03:00
|
|
|
* Return: 0 on success, -errno on failure.
|
2014-01-21 15:49:23 -08:00
|
|
|
*/
|
|
|
|
int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
|
|
|
|
{
|
2023-09-13 11:53:59 +01:00
|
|
|
return memblock_setclr_flag(&memblock.memory, base, size, 0, MEMBLOCK_HOTPLUG);
|
2014-01-21 15:49:23 -08:00
|
|
|
}
|
|
|
|
|
2015-06-24 16:58:12 -07:00
|
|
|
/**
|
|
|
|
* memblock_mark_mirror - Mark mirrored memory with flag MEMBLOCK_MIRROR.
|
|
|
|
* @base: the base phys addr of the region
|
|
|
|
* @size: the size of the region
|
|
|
|
*
|
2018-06-30 17:55:02 +03:00
|
|
|
* Return: 0 on success, -errno on failure.
|
2015-06-24 16:58:12 -07:00
|
|
|
*/
|
|
|
|
int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size)
|
|
|
|
{
|
2022-06-14 17:21:56 +08:00
|
|
|
if (!mirrored_kernelcore)
|
|
|
|
return 0;
|
|
|
|
|
2015-06-24 16:58:12 -07:00
|
|
|
system_has_some_mirror = true;
|
|
|
|
|
2023-09-13 11:53:59 +01:00
|
|
|
return memblock_setclr_flag(&memblock.memory, base, size, 1, MEMBLOCK_MIRROR);
|
2015-06-24 16:58:12 -07:00
|
|
|
}
|
|
|
|
|
2015-11-30 13:28:15 +01:00
|
|
|
/**
|
|
|
|
* memblock_mark_nomap - Mark a memory region with flag MEMBLOCK_NOMAP.
|
|
|
|
* @base: the base phys addr of the region
|
|
|
|
* @size: the size of the region
|
|
|
|
*
|
2021-06-30 18:51:16 -07:00
|
|
|
* The memory regions marked with %MEMBLOCK_NOMAP will not be added to the
|
|
|
|
* direct mapping of the physical memory. These regions will still be
|
|
|
|
* covered by the memory map. The struct page representing NOMAP memory
|
|
|
|
* frames in the memory map will be PageReserved().
|
|
|
|
*
|
2021-10-21 10:09:29 +03:00
|
|
|
* Note: if the memory being marked %MEMBLOCK_NOMAP was allocated from
|
|
|
|
* memblock, the caller must inform kmemleak to ignore that memory.
|
|
|
|
*
|
2018-06-30 17:55:02 +03:00
|
|
|
* Return: 0 on success, -errno on failure.
|
2015-11-30 13:28:15 +01:00
|
|
|
*/
|
|
|
|
int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size)
|
|
|
|
{
|
2023-09-13 11:53:59 +01:00
|
|
|
return memblock_setclr_flag(&memblock.memory, base, size, 1, MEMBLOCK_NOMAP);
|
2015-11-30 13:28:15 +01:00
|
|
|
}
|
2015-06-24 16:58:12 -07:00
|
|
|
|
2017-04-03 11:23:54 +09:00
|
|
|
/**
|
|
|
|
* memblock_clear_nomap - Clear flag MEMBLOCK_NOMAP for a specified region.
|
|
|
|
* @base: the base phys addr of the region
|
|
|
|
* @size: the size of the region
|
|
|
|
*
|
2018-06-30 17:55:02 +03:00
|
|
|
* Return: 0 on success, -errno on failure.
|
2017-04-03 11:23:54 +09:00
|
|
|
*/
|
|
|
|
int __init_memblock memblock_clear_nomap(phys_addr_t base, phys_addr_t size)
|
|
|
|
{
|
2023-09-13 11:53:59 +01:00
|
|
|
return memblock_setclr_flag(&memblock.memory, base, size, 0, MEMBLOCK_NOMAP);
|
2017-04-03 11:23:54 +09:00
|
|
|
}
|
|
|
|
|
2023-09-13 11:54:00 +01:00
|
|
|
/**
|
|
|
|
* memblock_reserved_mark_noinit - Mark a reserved memory region with flag
|
|
|
|
* MEMBLOCK_RSRV_NOINIT which results in the struct pages not being initialized
|
|
|
|
* for this region.
|
|
|
|
* @base: the base phys addr of the region
|
|
|
|
* @size: the size of the region
|
|
|
|
*
|
|
|
|
* struct pages will not be initialized for reserved memory regions marked with
|
|
|
|
* %MEMBLOCK_RSRV_NOINIT.
|
|
|
|
*
|
|
|
|
* Return: 0 on success, -errno on failure.
|
|
|
|
*/
|
|
|
|
int __init_memblock memblock_reserved_mark_noinit(phys_addr_t base, phys_addr_t size)
|
|
|
|
{
|
|
|
|
return memblock_setclr_flag(&memblock.reserved, base, size, 1,
|
|
|
|
MEMBLOCK_RSRV_NOINIT);
|
|
|
|
}
|
|
|
|
|
2020-10-13 16:58:25 -07:00
|
|
|
static bool should_skip_region(struct memblock_type *type,
|
|
|
|
struct memblock_region *m,
|
|
|
|
int nid, int flags)
|
2019-03-11 23:30:50 -07:00
|
|
|
{
|
|
|
|
int m_nid = memblock_get_region_node(m);
|
|
|
|
|
2020-10-13 16:58:25 -07:00
|
|
|
/* we never skip regions when iterating memblock.reserved or physmem */
|
|
|
|
if (type != memblock_memory)
|
|
|
|
return false;
|
|
|
|
|
2019-03-11 23:30:50 -07:00
|
|
|
/* only memory regions are associated with nodes, check it */
|
2024-06-14 11:05:43 +03:00
|
|
|
if (numa_valid_node(nid) && nid != m_nid)
|
2019-03-11 23:30:50 -07:00
|
|
|
return true;
|
|
|
|
|
|
|
|
/* skip hotpluggable memory regions if needed */
|
memblock: make for_each_mem_range() traverse MEMBLOCK_HOTPLUG regions
Commit b10d6bca8720 ("arch, drivers: replace for_each_membock() with
for_each_mem_range()") didn't take into account that when there is
movable_node parameter in the kernel command line, for_each_mem_range()
would skip ranges marked with MEMBLOCK_HOTPLUG.
The page table setup code in POWER uses for_each_mem_range() to create
the linear mapping of the physical memory and since the regions marked
as MEMBLOCK_HOTPLUG are skipped, they never make it to the linear map.
A later access to the memory in those ranges will fail:
BUG: Unable to handle kernel data access on write at 0xc000000400000000
Faulting instruction address: 0xc00000000008a3c0
Oops: Kernel access of bad area, sig: 11 [#1]
LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries
Modules linked in:
CPU: 0 PID: 53 Comm: kworker/u2:0 Not tainted 5.13.0 #7
NIP: c00000000008a3c0 LR: c0000000003c1ed8 CTR: 0000000000000040
REGS: c000000008a57770 TRAP: 0300 Not tainted (5.13.0)
MSR: 8000000002009033 <SF,VEC,EE,ME,IR,DR,RI,LE> CR: 84222202 XER: 20040000
CFAR: c0000000003c1ed4 DAR: c000000400000000 DSISR: 42000000 IRQMASK: 0
GPR00: c0000000003c1ed8 c000000008a57a10 c0000000019da700 c000000400000000
GPR04: 0000000000000280 0000000000000180 0000000000000400 0000000000000200
GPR08: 0000000000000100 0000000000000080 0000000000000040 0000000000000300
GPR12: 0000000000000380 c000000001bc0000 c0000000001660c8 c000000006337e00
GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
GPR20: 0000000040000000 0000000020000000 c000000001a81990 c000000008c30000
GPR24: c000000008c20000 c000000001a81998 000fffffffff0000 c000000001a819a0
GPR28: c000000001a81908 c00c000001000000 c000000008c40000 c000000008a64680
NIP clear_user_page+0x50/0x80
LR __handle_mm_fault+0xc88/0x1910
Call Trace:
__handle_mm_fault+0xc44/0x1910 (unreliable)
handle_mm_fault+0x130/0x2a0
__get_user_pages+0x248/0x610
__get_user_pages_remote+0x12c/0x3e0
get_arg_page+0x54/0xf0
copy_string_kernel+0x11c/0x210
kernel_execve+0x16c/0x220
call_usermodehelper_exec_async+0x1b0/0x2f0
ret_from_kernel_thread+0x5c/0x70
Instruction dump:
79280fa4 79271764 79261f24 794ae8e2 7ca94214 7d683a14 7c893a14 7d893050
7d4903a6 60000000 60000000 60000000 <7c001fec> 7c091fec 7c081fec 7c051fec
---[ end trace 490b8c67e6075e09 ]---
Making for_each_mem_range() include MEMBLOCK_HOTPLUG regions in the
traversal fixes this issue.
Link: https://bugzilla.redhat.com/show_bug.cgi?id=1976100
Link: https://lkml.kernel.org/r/20210712071132.20902-1-rppt@kernel.org
Fixes: b10d6bca8720 ("arch, drivers: replace for_each_membock() with for_each_mem_range()")
Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Tested-by: Greg Kurz <groug@kaod.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Cc: <stable@vger.kernel.org> [5.10+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-07-23 15:50:26 -07:00
|
|
|
if (movable_node_is_enabled() && memblock_is_hotpluggable(m) &&
|
|
|
|
!(flags & MEMBLOCK_HOTPLUG))
|
2019-03-11 23:30:50 -07:00
|
|
|
return true;
|
|
|
|
|
|
|
|
/* if we want mirror memory skip non-mirror memory regions */
|
|
|
|
if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m))
|
|
|
|
return true;
|
|
|
|
|
|
|
|
/* skip nomap memory unless we were asked for it explicitly */
|
|
|
|
if (!(flags & MEMBLOCK_NOMAP) && memblock_is_nomap(m))
|
|
|
|
return true;
|
|
|
|
|
memblock: add MEMBLOCK_DRIVER_MANAGED to mimic IORESOURCE_SYSRAM_DRIVER_MANAGED
Let's add a flag that corresponds to IORESOURCE_SYSRAM_DRIVER_MANAGED,
indicating that we're dealing with a memory region that is never
indicated in the firmware-provided memory map, but always detected and
added by a driver.
Similar to MEMBLOCK_HOTPLUG, most infrastructure has to treat such
memory regions like ordinary MEMBLOCK_NONE memory regions -- for
example, when selecting memory regions to add to the vmcore for dumping
in the crashkernel via for_each_mem_range().
However, especially kexec_file is not supposed to select such memblocks
via for_each_free_mem_range() / for_each_free_mem_range_reverse() to
place kexec images, similar to how we handle
IORESOURCE_SYSRAM_DRIVER_MANAGED without CONFIG_ARCH_KEEP_MEMBLOCK.
We'll make sure that memory hotplug code sets the flag where applicable
(IORESOURCE_SYSRAM_DRIVER_MANAGED) next. This prepares architectures
that need CONFIG_ARCH_KEEP_MEMBLOCK, such as arm64, for virtio-mem
support.
Note that kexec *must not* indicate this memory to the second kernel and
*must not* place kexec-images on this memory. Let's add a comment to
kexec_walk_memblock(), documenting how we handle MEMBLOCK_DRIVER_MANAGED
now just like using IORESOURCE_SYSRAM_DRIVER_MANAGED in
locate_mem_hole_callback() for kexec_walk_resources().
Also note that MEMBLOCK_HOTPLUG cannot be reused due to different
semantics:
MEMBLOCK_HOTPLUG: memory is indicated as "System RAM" in the
firmware-provided memory map and added to the system early during
boot; kexec *has to* indicate this memory to the second kernel and
can place kexec-images on this memory. After memory hotunplug,
kexec has to be re-armed. We mostly ignore this flag when
"movable_node" is not set on the kernel command line, because
then we're told to not care about hotunpluggability of such
memory regions.
MEMBLOCK_DRIVER_MANAGED: memory is not indicated as "System RAM" in
the firmware-provided memory map; this memory is always detected
and added to the system by a driver; memory might not actually be
physically hotunpluggable. kexec *must not* indicate this memory to
the second kernel and *must not* place kexec-images on this memory.
Link: https://lkml.kernel.org/r/20211004093605.5830-5-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Mike Rapoport <rppt@linux.ibm.com>
Cc: "Aneesh Kumar K . V" <aneesh.kumar@linux.ibm.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Jianyong Wu <Jianyong.Wu@arm.com>
Cc: Jiaxun Yang <jiaxun.yang@flygoat.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Shahab Vahedi <shahab@synopsys.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vineet Gupta <vgupta@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-11-05 13:44:53 -07:00
|
|
|
/* skip driver-managed memory unless we were asked for it explicitly */
|
|
|
|
if (!(flags & MEMBLOCK_DRIVER_MANAGED) && memblock_is_driver_managed(m))
|
|
|
|
return true;
|
|
|
|
|
2019-03-11 23:30:50 -07:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2011-07-12 11:15:59 +02:00
|
|
|
/**
|
2019-03-11 23:30:54 -07:00
|
|
|
* __next_mem_range - next function for for_each_free_mem_range() etc.
|
2011-07-12 11:15:59 +02:00
|
|
|
* @idx: pointer to u64 loop variable
|
2014-01-21 15:50:16 -08:00
|
|
|
* @nid: node selector, %NUMA_NO_NODE for all nodes
|
2015-06-24 16:58:09 -07:00
|
|
|
* @flags: pick from blocks based on memory attributes
|
2014-01-29 18:16:01 +01:00
|
|
|
* @type_a: pointer to memblock_type from where the range is taken
|
|
|
|
* @type_b: pointer to memblock_type which excludes memory from being taken
|
2012-06-20 12:53:01 -07:00
|
|
|
* @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
|
|
|
|
* @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
|
|
|
|
* @out_nid: ptr to int for nid of the range, can be %NULL
|
2011-07-12 11:15:59 +02:00
|
|
|
*
|
2014-01-29 18:16:01 +01:00
|
|
|
* Find the first area from *@idx which matches @nid, fill the out
|
2011-07-12 11:15:59 +02:00
|
|
|
* parameters, and update *@idx for the next iteration. The lower 32bit of
|
2014-01-29 18:16:01 +01:00
|
|
|
* *@idx contains index into type_a and the upper 32bit indexes the
|
|
|
|
* areas before each region in type_b. For example, if type_b regions
|
2011-07-12 11:15:59 +02:00
|
|
|
* look like the following,
|
|
|
|
*
|
|
|
|
* 0:[0-16), 1:[32-48), 2:[128-130)
|
|
|
|
*
|
|
|
|
* The upper 32bit indexes the following regions.
|
|
|
|
*
|
|
|
|
* 0:[0-0), 1:[16-32), 2:[48-128), 3:[130-MAX)
|
|
|
|
*
|
|
|
|
* As both region arrays are sorted, the function advances the two indices
|
|
|
|
* in lockstep and returns each intersection.
|
|
|
|
*/
|
2020-07-01 16:18:29 +02:00
|
|
|
void __next_mem_range(u64 *idx, int nid, enum memblock_flags flags,
|
|
|
|
struct memblock_type *type_a,
|
|
|
|
struct memblock_type *type_b, phys_addr_t *out_start,
|
|
|
|
phys_addr_t *out_end, int *out_nid)
|
2011-07-12 11:15:59 +02:00
|
|
|
{
|
2014-01-29 18:16:01 +01:00
|
|
|
int idx_a = *idx & 0xffffffff;
|
|
|
|
int idx_b = *idx >> 32;
|
2014-01-21 15:50:16 -08:00
|
|
|
|
2014-01-29 18:16:01 +01:00
|
|
|
for (; idx_a < type_a->cnt; idx_a++) {
|
|
|
|
struct memblock_region *m = &type_a->regions[idx_a];
|
|
|
|
|
2011-07-12 11:15:59 +02:00
|
|
|
phys_addr_t m_start = m->base;
|
|
|
|
phys_addr_t m_end = m->base + m->size;
|
2014-01-29 18:16:01 +01:00
|
|
|
int m_nid = memblock_get_region_node(m);
|
2011-07-12 11:15:59 +02:00
|
|
|
|
2020-10-13 16:58:25 -07:00
|
|
|
if (should_skip_region(type_a, m, nid, flags))
|
2015-11-30 13:28:15 +01:00
|
|
|
continue;
|
|
|
|
|
2014-01-29 18:16:01 +01:00
|
|
|
if (!type_b) {
|
|
|
|
if (out_start)
|
|
|
|
*out_start = m_start;
|
|
|
|
if (out_end)
|
|
|
|
*out_end = m_end;
|
|
|
|
if (out_nid)
|
|
|
|
*out_nid = m_nid;
|
|
|
|
idx_a++;
|
|
|
|
*idx = (u32)idx_a | (u64)idx_b << 32;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* scan areas before each reservation */
|
|
|
|
for (; idx_b < type_b->cnt + 1; idx_b++) {
|
|
|
|
struct memblock_region *r;
|
|
|
|
phys_addr_t r_start;
|
|
|
|
phys_addr_t r_end;
|
|
|
|
|
|
|
|
r = &type_b->regions[idx_b];
|
|
|
|
r_start = idx_b ? r[-1].base + r[-1].size : 0;
|
|
|
|
r_end = idx_b < type_b->cnt ?
|
2018-06-07 17:06:15 -07:00
|
|
|
r->base : PHYS_ADDR_MAX;
|
2011-07-12 11:15:59 +02:00
|
|
|
|
2014-01-29 18:16:01 +01:00
|
|
|
/*
|
|
|
|
* if idx_b advanced past idx_a,
|
|
|
|
* break out to advance idx_a
|
|
|
|
*/
|
2011-07-12 11:15:59 +02:00
|
|
|
if (r_start >= m_end)
|
|
|
|
break;
|
|
|
|
/* if the two regions intersect, we're done */
|
|
|
|
if (m_start < r_end) {
|
|
|
|
if (out_start)
|
2014-01-29 18:16:01 +01:00
|
|
|
*out_start =
|
|
|
|
max(m_start, r_start);
|
2011-07-12 11:15:59 +02:00
|
|
|
if (out_end)
|
|
|
|
*out_end = min(m_end, r_end);
|
|
|
|
if (out_nid)
|
2014-01-29 18:16:01 +01:00
|
|
|
*out_nid = m_nid;
|
2011-07-12 11:15:59 +02:00
|
|
|
/*
|
2014-01-29 18:16:01 +01:00
|
|
|
* The region which ends first is
|
|
|
|
* advanced for the next iteration.
|
2011-07-12 11:15:59 +02:00
|
|
|
*/
|
|
|
|
if (m_end <= r_end)
|
2014-01-29 18:16:01 +01:00
|
|
|
idx_a++;
|
2011-07-12 11:15:59 +02:00
|
|
|
else
|
2014-01-29 18:16:01 +01:00
|
|
|
idx_b++;
|
|
|
|
*idx = (u32)idx_a | (u64)idx_b << 32;
|
2011-07-12 11:15:59 +02:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* signal end of iteration */
|
|
|
|
*idx = ULLONG_MAX;
|
|
|
|
}
|
|
|
|
|
2011-12-08 10:22:09 -08:00
|
|
|
/**
|
2014-01-29 18:16:01 +01:00
|
|
|
* __next_mem_range_rev - generic next function for for_each_*_range_rev()
|
|
|
|
*
|
2011-12-08 10:22:09 -08:00
|
|
|
* @idx: pointer to u64 loop variable
|
2015-09-08 15:04:22 -07:00
|
|
|
* @nid: node selector, %NUMA_NO_NODE for all nodes
|
2015-06-24 16:58:09 -07:00
|
|
|
* @flags: pick from blocks based on memory attributes
|
2014-01-29 18:16:01 +01:00
|
|
|
* @type_a: pointer to memblock_type from where the range is taken
|
|
|
|
* @type_b: pointer to memblock_type which excludes memory from being taken
|
2012-06-20 12:53:01 -07:00
|
|
|
* @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
|
|
|
|
* @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
|
|
|
|
* @out_nid: ptr to int for nid of the range, can be %NULL
|
2011-12-08 10:22:09 -08:00
|
|
|
*
|
2018-06-30 17:55:02 +03:00
|
|
|
* Finds the next range from type_a which is not marked as unsuitable
|
|
|
|
* in type_b.
|
|
|
|
*
|
2014-01-29 18:16:01 +01:00
|
|
|
* Reverse of __next_mem_range().
|
2011-12-08 10:22:09 -08:00
|
|
|
*/
|
2018-06-30 17:55:01 +03:00
|
|
|
void __init_memblock __next_mem_range_rev(u64 *idx, int nid,
|
|
|
|
enum memblock_flags flags,
|
2014-01-29 18:16:01 +01:00
|
|
|
struct memblock_type *type_a,
|
|
|
|
struct memblock_type *type_b,
|
|
|
|
phys_addr_t *out_start,
|
|
|
|
phys_addr_t *out_end, int *out_nid)
|
2011-12-08 10:22:09 -08:00
|
|
|
{
|
2014-01-29 18:16:01 +01:00
|
|
|
int idx_a = *idx & 0xffffffff;
|
|
|
|
int idx_b = *idx >> 32;
|
2014-01-21 15:50:16 -08:00
|
|
|
|
2011-12-08 10:22:09 -08:00
|
|
|
if (*idx == (u64)ULLONG_MAX) {
|
2014-01-29 18:16:01 +01:00
|
|
|
idx_a = type_a->cnt - 1;
|
2016-08-04 15:32:00 -07:00
|
|
|
if (type_b != NULL)
|
|
|
|
idx_b = type_b->cnt;
|
|
|
|
else
|
|
|
|
idx_b = 0;
|
2011-12-08 10:22:09 -08:00
|
|
|
}
|
|
|
|
|
2014-01-29 18:16:01 +01:00
|
|
|
for (; idx_a >= 0; idx_a--) {
|
|
|
|
struct memblock_region *m = &type_a->regions[idx_a];
|
|
|
|
|
2011-12-08 10:22:09 -08:00
|
|
|
phys_addr_t m_start = m->base;
|
|
|
|
phys_addr_t m_end = m->base + m->size;
|
2014-01-29 18:16:01 +01:00
|
|
|
int m_nid = memblock_get_region_node(m);
|
2011-12-08 10:22:09 -08:00
|
|
|
|
2020-10-13 16:58:25 -07:00
|
|
|
if (should_skip_region(type_a, m, nid, flags))
|
2015-11-30 13:28:15 +01:00
|
|
|
continue;
|
|
|
|
|
2014-01-29 18:16:01 +01:00
|
|
|
if (!type_b) {
|
|
|
|
if (out_start)
|
|
|
|
*out_start = m_start;
|
|
|
|
if (out_end)
|
|
|
|
*out_end = m_end;
|
|
|
|
if (out_nid)
|
|
|
|
*out_nid = m_nid;
|
2016-07-28 15:48:56 -07:00
|
|
|
idx_a--;
|
2014-01-29 18:16:01 +01:00
|
|
|
*idx = (u32)idx_a | (u64)idx_b << 32;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* scan areas before each reservation */
|
|
|
|
for (; idx_b >= 0; idx_b--) {
|
|
|
|
struct memblock_region *r;
|
|
|
|
phys_addr_t r_start;
|
|
|
|
phys_addr_t r_end;
|
|
|
|
|
|
|
|
r = &type_b->regions[idx_b];
|
|
|
|
r_start = idx_b ? r[-1].base + r[-1].size : 0;
|
|
|
|
r_end = idx_b < type_b->cnt ?
|
2018-06-07 17:06:15 -07:00
|
|
|
r->base : PHYS_ADDR_MAX;
|
2014-01-29 18:16:01 +01:00
|
|
|
/*
|
|
|
|
* if idx_b advanced past idx_a,
|
|
|
|
* break out to advance idx_a
|
|
|
|
*/
|
2011-12-08 10:22:09 -08:00
|
|
|
|
|
|
|
if (r_end <= m_start)
|
|
|
|
break;
|
|
|
|
/* if the two regions intersect, we're done */
|
|
|
|
if (m_end > r_start) {
|
|
|
|
if (out_start)
|
|
|
|
*out_start = max(m_start, r_start);
|
|
|
|
if (out_end)
|
|
|
|
*out_end = min(m_end, r_end);
|
|
|
|
if (out_nid)
|
2014-01-29 18:16:01 +01:00
|
|
|
*out_nid = m_nid;
|
2011-12-08 10:22:09 -08:00
|
|
|
if (m_start >= r_start)
|
2014-01-29 18:16:01 +01:00
|
|
|
idx_a--;
|
2011-12-08 10:22:09 -08:00
|
|
|
else
|
2014-01-29 18:16:01 +01:00
|
|
|
idx_b--;
|
|
|
|
*idx = (u32)idx_a | (u64)idx_b << 32;
|
2011-12-08 10:22:09 -08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2014-01-29 18:16:01 +01:00
|
|
|
/* signal end of iteration */
|
2011-12-08 10:22:09 -08:00
|
|
|
*idx = ULLONG_MAX;
|
|
|
|
}
|
|
|
|
|
2011-07-14 11:43:42 +02:00
|
|
|
/*
|
2018-11-16 15:08:57 -08:00
|
|
|
* Common iterator interface used to define for_each_mem_pfn_range().
|
2011-07-14 11:43:42 +02:00
|
|
|
*/
|
|
|
|
void __init_memblock __next_mem_pfn_range(int *idx, int nid,
|
|
|
|
unsigned long *out_start_pfn,
|
|
|
|
unsigned long *out_end_pfn, int *out_nid)
|
|
|
|
{
|
|
|
|
struct memblock_type *type = &memblock.memory;
|
|
|
|
struct memblock_region *r;
|
2020-06-03 15:56:53 -07:00
|
|
|
int r_nid;
|
2011-07-14 11:43:42 +02:00
|
|
|
|
|
|
|
while (++*idx < type->cnt) {
|
|
|
|
r = &type->regions[*idx];
|
2020-06-03 15:56:53 -07:00
|
|
|
r_nid = memblock_get_region_node(r);
|
2011-07-14 11:43:42 +02:00
|
|
|
|
|
|
|
if (PFN_UP(r->base) >= PFN_DOWN(r->base + r->size))
|
|
|
|
continue;
|
2024-06-14 11:05:43 +03:00
|
|
|
if (!numa_valid_node(nid) || nid == r_nid)
|
2011-07-14 11:43:42 +02:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (*idx >= type->cnt) {
|
|
|
|
*idx = -1;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (out_start_pfn)
|
|
|
|
*out_start_pfn = PFN_UP(r->base);
|
|
|
|
if (out_end_pfn)
|
|
|
|
*out_end_pfn = PFN_DOWN(r->base + r->size);
|
|
|
|
if (out_nid)
|
2020-06-03 15:56:53 -07:00
|
|
|
*out_nid = r_nid;
|
2011-07-14 11:43:42 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* memblock_set_node - set node ID on memblock regions
|
|
|
|
* @base: base of area to set node ID for
|
|
|
|
* @size: size of area to set node ID for
|
2014-01-21 15:49:26 -08:00
|
|
|
* @type: memblock type to set node ID for
|
2011-07-14 11:43:42 +02:00
|
|
|
* @nid: node ID to set
|
|
|
|
*
|
2018-06-30 17:55:02 +03:00
|
|
|
* Set the nid of memblock @type regions in [@base, @base + @size) to @nid.
|
2011-07-14 11:43:42 +02:00
|
|
|
* Regions which cross the area boundaries are split as necessary.
|
|
|
|
*
|
2018-06-30 17:55:02 +03:00
|
|
|
* Return:
|
2011-07-14 11:43:42 +02:00
|
|
|
* 0 on success, -errno on failure.
|
|
|
|
*/
|
|
|
|
int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
|
2014-01-21 15:49:26 -08:00
|
|
|
struct memblock_type *type, int nid)
|
2011-07-14 11:43:42 +02:00
|
|
|
{
|
2021-06-28 19:43:01 -07:00
|
|
|
#ifdef CONFIG_NUMA
|
2011-12-08 10:22:07 -08:00
|
|
|
int start_rgn, end_rgn;
|
|
|
|
int i, ret;
|
2011-07-14 11:43:42 +02:00
|
|
|
|
2011-12-08 10:22:07 -08:00
|
|
|
ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
2011-07-14 11:43:42 +02:00
|
|
|
|
2011-12-08 10:22:07 -08:00
|
|
|
for (i = start_rgn; i < end_rgn; i++)
|
2012-10-08 16:32:21 -07:00
|
|
|
memblock_set_region_node(&type->regions[i], nid);
|
2011-07-14 11:43:42 +02:00
|
|
|
|
2023-01-29 17:00:34 +08:00
|
|
|
memblock_merge_regions(type, start_rgn, end_rgn);
|
2020-06-03 15:57:02 -07:00
|
|
|
#endif
|
2011-07-14 11:43:42 +02:00
|
|
|
return 0;
|
|
|
|
}
|
2020-06-03 15:57:02 -07:00
|
|
|
|
2019-05-13 17:21:17 -07:00
|
|
|
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
|
|
|
|
/**
|
|
|
|
* __next_mem_pfn_range_in_zone - iterator for for_each_*_range_in_zone()
|
|
|
|
*
|
|
|
|
* @idx: pointer to u64 loop variable
|
|
|
|
* @zone: zone in which all of the memory blocks reside
|
|
|
|
* @out_spfn: ptr to ulong for start pfn of the range, can be %NULL
|
|
|
|
* @out_epfn: ptr to ulong for end pfn of the range, can be %NULL
|
|
|
|
*
|
|
|
|
* This function is meant to be a zone/pfn specific wrapper for the
|
|
|
|
* for_each_mem_range type iterators. Specifically they are used in the
|
|
|
|
* deferred memory init routines and as such we were duplicating much of
|
|
|
|
* this logic throughout the code. So instead of having it in multiple
|
|
|
|
* locations it seemed like it would make more sense to centralize this to
|
|
|
|
* one new iterator that does everything they need.
|
|
|
|
*/
|
|
|
|
void __init_memblock
|
|
|
|
__next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
|
|
|
|
unsigned long *out_spfn, unsigned long *out_epfn)
|
|
|
|
{
|
|
|
|
int zone_nid = zone_to_nid(zone);
|
|
|
|
phys_addr_t spa, epa;
|
|
|
|
|
|
|
|
__next_mem_range(idx, zone_nid, MEMBLOCK_NONE,
|
|
|
|
&memblock.memory, &memblock.reserved,
|
2022-02-17 22:07:54 +08:00
|
|
|
&spa, &epa, NULL);
|
2019-05-13 17:21:17 -07:00
|
|
|
|
|
|
|
while (*idx != U64_MAX) {
|
|
|
|
unsigned long epfn = PFN_DOWN(epa);
|
|
|
|
unsigned long spfn = PFN_UP(spa);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Verify the end is at least past the start of the zone and
|
|
|
|
* that we have at least one PFN to initialize.
|
|
|
|
*/
|
|
|
|
if (zone->zone_start_pfn < epfn && spfn < epfn) {
|
|
|
|
/* if we went too far just stop searching */
|
|
|
|
if (zone_end_pfn(zone) <= spfn) {
|
|
|
|
*idx = U64_MAX;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (out_spfn)
|
|
|
|
*out_spfn = max(zone->zone_start_pfn, spfn);
|
|
|
|
if (out_epfn)
|
|
|
|
*out_epfn = min(zone_end_pfn(zone), epfn);
|
|
|
|
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
__next_mem_range(idx, zone_nid, MEMBLOCK_NONE,
|
|
|
|
&memblock.memory, &memblock.reserved,
|
2022-02-17 22:07:54 +08:00
|
|
|
&spa, &epa, NULL);
|
2019-05-13 17:21:17 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
/* signal end of iteration */
|
|
|
|
if (out_spfn)
|
|
|
|
*out_spfn = ULONG_MAX;
|
|
|
|
if (out_epfn)
|
|
|
|
*out_epfn = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
|
2011-07-14 11:43:42 +02:00
|
|
|
|
2019-03-11 23:29:41 -07:00
|
|
|
/**
|
|
|
|
* memblock_alloc_range_nid - allocate boot memory block
|
|
|
|
* @size: size of memory block to be allocated in bytes
|
|
|
|
* @align: alignment of the region and block's size
|
|
|
|
* @start: the lower bound of the memory region to allocate (phys address)
|
|
|
|
* @end: the upper bound of the memory region to allocate (phys address)
|
|
|
|
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
|
2019-11-30 17:56:27 -08:00
|
|
|
* @exact_nid: control the allocation fall back to other nodes
|
2019-03-11 23:29:41 -07:00
|
|
|
*
|
|
|
|
* The allocation is performed from memory region limited by
|
2019-11-30 17:56:24 -08:00
|
|
|
* memblock.current_limit if @end == %MEMBLOCK_ALLOC_ACCESSIBLE.
|
2019-03-11 23:29:41 -07:00
|
|
|
*
|
2019-11-30 17:56:27 -08:00
|
|
|
* If the specified node can not hold the requested memory and @exact_nid
|
|
|
|
* is false, the allocation falls back to any node in the system.
|
2019-03-11 23:29:41 -07:00
|
|
|
*
|
|
|
|
* For systems with memory mirroring, the allocation is attempted first
|
|
|
|
* from the regions with mirroring enabled and then retried from any
|
|
|
|
* memory region.
|
|
|
|
*
|
2022-06-11 11:55:48 +08:00
|
|
|
* In addition, unless @end is %MEMBLOCK_ALLOC_NOLEAKTRACE, the allocated
|
|
|
|
* block is registered via kmemleak_alloc_phys() so it is never reported as a leak.
|
2019-03-11 23:29:41 -07:00
|
|
|
*
|
|
|
|
* Return:
|
|
|
|
* Physical address of allocated memory block on success, %0 on failure.
|
|
|
|
*/
|
2020-04-10 14:32:42 -07:00
|
|
|
phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
					phys_addr_t align, phys_addr_t start,
					phys_addr_t end, int nid,
					bool exact_nid)
{
	enum memblock_flags flags = choose_memblock_flags();
	phys_addr_t found;

	/*
	 * Detect any accidental use of these APIs after slab is ready, as at
	 * this moment memblock may be deinitialized already and its
	 * internal data may be destroyed (after execution of
	 * memblock_free_all). Warn once and serve the request from slab.
	 */
	if (WARN_ON_ONCE(slab_is_available())) {
		void *vaddr = kzalloc_node(size, GFP_NOWAIT, nid);

		if (!vaddr)
			return 0;
		return virt_to_phys(vaddr);
	}

	if (!align) {
		/* Can't use WARNs this early in boot on powerpc */
		dump_stack();
		align = SMP_CACHE_BYTES;
	}

	for (;;) {
		/* First choice: the node the caller asked for. */
		found = memblock_find_in_range_node(size, align, start, end,
						    nid, flags);
		if (found && !memblock_reserve(found, size))
			break;

		/* Second choice: any node, unless an exact node is required. */
		if (numa_valid_node(nid) && !exact_nid) {
			found = memblock_find_in_range_node(size, align, start,
							    end, NUMA_NO_NODE,
							    flags);
			if (found && !memblock_reserve(found, size))
				break;
		}

		/* No mirror requirement left to relax: the allocation failed. */
		if (!(flags & MEMBLOCK_MIRROR))
			return 0;

		/* Drop the mirrored-memory requirement and start over. */
		flags &= ~MEMBLOCK_MIRROR;
		pr_warn_ratelimited("Could not allocate %pap bytes of mirrored memory\n",
				    &size);
	}

	/*
	 * Skip kmemleak for those places like kasan_init() and
	 * early_pgtable_alloc() due to high volume.
	 *
	 * Otherwise register the block: memblock allocated blocks are never
	 * reported as leaks, because many of them are only referred via the
	 * physical address which is not looked up by kmemleak.
	 */
	if (end != MEMBLOCK_ALLOC_NOLEAKTRACE)
		kmemleak_alloc_phys(found, size, 0);

	/*
	 * Some Virtual Machine platforms, such as Intel TDX or AMD SEV-SNP,
	 * require memory to be accepted before it can be used by the
	 * guest, so accept the memory of the allocated buffer.
	 */
	accept_memory(found, size);

	return found;
}
|
|
|
|
|
2019-03-11 23:30:54 -07:00
|
|
|
/**
|
|
|
|
* memblock_phys_alloc_range - allocate a memory block inside specified range
|
|
|
|
* @size: size of memory block to be allocated in bytes
|
|
|
|
* @align: alignment of the region and block's size
|
|
|
|
* @start: the lower bound of the memory region to allocate (physical address)
|
|
|
|
* @end: the upper bound of the memory region to allocate (physical address)
|
|
|
|
*
|
|
|
|
* Allocate @size bytes between @start and @end.
|
|
|
|
*
|
|
|
|
* Return: physical address of the allocated memory block on success,
|
|
|
|
* %0 on failure.
|
|
|
|
*/
|
2019-03-11 23:29:16 -07:00
|
|
|
phys_addr_t __init memblock_phys_alloc_range(phys_addr_t size,
					     phys_addr_t align,
					     phys_addr_t start,
					     phys_addr_t end)
{
	phys_addr_t pa;

	/* Log the request together with the caller's return address. */
	memblock_dbg("%s: %llu bytes align=0x%llx from=%pa max_addr=%pa %pS\n",
		     __func__, (u64)size, (u64)align, &start, &end,
		     (void *)_RET_IP_);

	/* No node preference, and falling back to other nodes is allowed. */
	pa = memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE,
				      false);
	return pa;
}
|
|
|
|
|
2019-03-11 23:30:54 -07:00
|
|
|
/**
|
2021-01-20 21:28:18 +09:00
|
|
|
* memblock_phys_alloc_try_nid - allocate a memory block from specified NUMA node
|
2019-03-11 23:30:54 -07:00
|
|
|
* @size: size of memory block to be allocated in bytes
|
|
|
|
* @align: alignment of the region and block's size
|
|
|
|
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
|
|
|
|
*
|
|
|
|
* Allocates memory block from the specified NUMA node. If the node
|
|
|
|
* has no available memory, attempts to allocate from any node in the
|
|
|
|
* system.
|
|
|
|
*
|
|
|
|
* Return: physical address of the allocated memory block on success,
|
|
|
|
* %0 on failure.
|
|
|
|
*/
|
memblock: rename memblock_alloc{_nid,_try_nid} to memblock_phys_alloc*
Make it explicit that the caller gets a physical address rather than a
virtual one.
This will also allow using memblock_alloc prefix for memblock allocations
returning virtual address, which is done in the following patches.
The conversion is done using the following semantic patch:
@@
expression e1, e2, e3;
@@
(
- memblock_alloc(e1, e2)
+ memblock_phys_alloc(e1, e2)
|
- memblock_alloc_nid(e1, e2, e3)
+ memblock_phys_alloc_nid(e1, e2, e3)
|
- memblock_alloc_try_nid(e1, e2, e3)
+ memblock_phys_alloc_try_nid(e1, e2, e3)
)
Link: http://lkml.kernel.org/r/1536927045-23536-7-git-send-email-rppt@linux.vnet.ibm.com
Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chris Zankel <chris@zankel.net>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Greentime Hu <green.hu@gmail.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guan Xuetao <gxt@pku.edu.cn>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "James E.J. Bottomley" <jejb@parisc-linux.org>
Cc: Jonas Bonn <jonas@southpole.se>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Ley Foon Tan <lftan@altera.com>
Cc: Mark Salter <msalter@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Palmer Dabbelt <palmer@sifive.com>
Cc: Paul Burton <paul.burton@mips.com>
Cc: Richard Kuo <rkuo@codeaurora.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Rich Felker <dalias@libc.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Serge Semin <fancer.lancer@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-10-30 15:07:59 -07:00
|
|
|
phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid)
{
	/* Search the whole accessible range, starting from address 0. */
	const phys_addr_t lowest = 0;
	const phys_addr_t highest = MEMBLOCK_ALLOC_ACCESSIBLE;

	/* exact_nid is false: other nodes may be used as a fallback. */
	return memblock_alloc_range_nid(size, align, lowest, highest, nid,
					false);
}
|
|
|
|
|
2014-01-21 15:50:19 -08:00
|
|
|
/**
|
2018-10-30 15:08:04 -07:00
|
|
|
* memblock_alloc_internal - allocate boot memory block
|
2014-01-21 15:50:19 -08:00
|
|
|
* @size: size of memory block to be allocated in bytes
|
|
|
|
* @align: alignment of the region and block's size
|
|
|
|
* @min_addr: the lower bound of the memory region to allocate (phys address)
|
|
|
|
* @max_addr: the upper bound of the memory region to allocate (phys address)
|
|
|
|
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
|
2019-11-30 17:56:27 -08:00
|
|
|
* @exact_nid: control the allocation fall back to other nodes
|
2014-01-21 15:50:19 -08:00
|
|
|
*
|
2019-03-11 23:29:41 -07:00
|
|
|
* Allocates memory block using memblock_alloc_range_nid() and
|
|
|
|
* converts the returned physical address to virtual.
|
2014-01-21 15:50:19 -08:00
|
|
|
*
|
2019-03-11 23:29:41 -07:00
|
|
|
* The @min_addr limit is dropped if it can not be satisfied and the allocation
|
|
|
|
* will fall back to memory below @min_addr. Other constraints, such
|
|
|
|
* as node and mirrored memory will be handled again in
|
|
|
|
* memblock_alloc_range_nid().
|
2014-01-21 15:50:19 -08:00
|
|
|
*
|
2018-06-30 17:55:02 +03:00
|
|
|
* Return:
|
2014-01-21 15:50:19 -08:00
|
|
|
* Virtual address of allocated memory block on success, NULL on failure.
|
|
|
|
*/
|
2018-10-30 15:08:04 -07:00
|
|
|
static void * __init memblock_alloc_internal(
				phys_addr_t size, phys_addr_t align,
				phys_addr_t min_addr, phys_addr_t max_addr,
				int nid, bool exact_nid)
{
	phys_addr_t pa;

	/* Never search above the current allocation limit. */
	max_addr = (max_addr > memblock.current_limit) ?
			memblock.current_limit : max_addr;

	pa = memblock_alloc_range_nid(size, align, min_addr, max_addr, nid,
				      exact_nid);

	/* retry allocation without lower limit */
	if (!pa && min_addr)
		pa = memblock_alloc_range_nid(size, align, 0, max_addr, nid,
					      exact_nid);

	/* Hand back a virtual address, or NULL when both attempts failed. */
	return pa ? phys_to_virt(pa) : NULL;
}
|
|
|
|
|
2019-11-30 17:56:27 -08:00
|
|
|
/**
|
|
|
|
* memblock_alloc_exact_nid_raw - allocate boot memory block on the exact node
|
|
|
|
* without zeroing memory
|
|
|
|
* @size: size of memory block to be allocated in bytes
|
|
|
|
* @align: alignment of the region and block's size
|
|
|
|
* @min_addr: the lower bound of the memory region from where the allocation
|
|
|
|
* is preferred (phys address)
|
|
|
|
* @max_addr: the upper bound of the memory region from where the allocation
|
|
|
|
* is preferred (phys address), or %MEMBLOCK_ALLOC_ACCESSIBLE to
|
|
|
|
* allocate only from memory limited by memblock.current_limit value
|
|
|
|
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
|
|
|
|
*
|
|
|
|
* Public function, provides additional debug information (including caller
|
|
|
|
* info), if enabled. Does not zero allocated memory.
|
|
|
|
*
|
|
|
|
* Return:
|
|
|
|
* Virtual address of allocated memory block on success, NULL on failure.
|
|
|
|
*/
|
|
|
|
void * __init memblock_alloc_exact_nid_raw(
			phys_addr_t size, phys_addr_t align,
			phys_addr_t min_addr, phys_addr_t max_addr,
			int nid)
{
	void *vaddr;

	/* Log the request together with the caller's return address. */
	memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n",
		     __func__, (u64)size, (u64)align, nid, &min_addr,
		     &max_addr, (void *)_RET_IP_);

	/* exact_nid is true: do not fall back to other nodes. */
	vaddr = memblock_alloc_internal(size, align, min_addr, max_addr, nid,
					true);
	return vaddr;
}
|
|
|
|
|
2017-11-15 17:36:27 -08:00
|
|
|
/**
|
2018-10-30 15:08:04 -07:00
|
|
|
* memblock_alloc_try_nid_raw - allocate boot memory block without zeroing
|
2017-11-15 17:36:27 -08:00
|
|
|
* memory and without panicking
|
|
|
|
* @size: size of memory block to be allocated in bytes
|
|
|
|
* @align: alignment of the region and block's size
|
|
|
|
* @min_addr: the lower bound of the memory region from where the allocation
|
|
|
|
* is preferred (phys address)
|
|
|
|
* @max_addr: the upper bound of the memory region from where the allocation
|
2018-10-30 15:09:44 -07:00
|
|
|
* is preferred (phys address), or %MEMBLOCK_ALLOC_ACCESSIBLE to
|
2017-11-15 17:36:27 -08:00
|
|
|
* allocate only from memory limited by memblock.current_limit value
|
|
|
|
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
|
|
|
|
*
|
|
|
|
* Public function, provides additional debug information (including caller
|
|
|
|
* info), if enabled. Does not zero allocated memory, does not panic if request
|
|
|
|
* cannot be satisfied.
|
|
|
|
*
|
2018-06-30 17:55:02 +03:00
|
|
|
* Return:
|
2017-11-15 17:36:27 -08:00
|
|
|
* Virtual address of allocated memory block on success, NULL on failure.
|
|
|
|
*/
|
2018-10-30 15:08:04 -07:00
|
|
|
void * __init memblock_alloc_try_nid_raw(
			phys_addr_t size, phys_addr_t align,
			phys_addr_t min_addr, phys_addr_t max_addr,
			int nid)
{
	void *vaddr;

	/* Log the request together with the caller's return address. */
	memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n",
		     __func__, (u64)size, (u64)align, nid, &min_addr,
		     &max_addr, (void *)_RET_IP_);

	/* exact_nid is false: other nodes may be used as a fallback. */
	vaddr = memblock_alloc_internal(size, align, min_addr, max_addr, nid,
					false);
	return vaddr;
}
|
|
|
|
|
2014-01-21 15:50:19 -08:00
|
|
|
/**
|
2019-03-11 23:30:37 -07:00
|
|
|
* memblock_alloc_try_nid - allocate boot memory block
|
2014-01-21 15:50:19 -08:00
|
|
|
* @size: size of memory block to be allocated in bytes
|
|
|
|
* @align: alignment of the region and block's size
|
|
|
|
* @min_addr: the lower bound of the memory region from where the allocation
|
|
|
|
* is preferred (phys address)
|
|
|
|
* @max_addr: the upper bound of the memory region from where the allocation
|
2018-10-30 15:09:44 -07:00
|
|
|
* is preferred (phys address), or %MEMBLOCK_ALLOC_ACCESSIBLE to
|
2014-01-21 15:50:19 -08:00
|
|
|
* allocate only from memory limited by memblock.current_limit value
|
|
|
|
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
|
|
|
|
*
|
2019-03-11 23:30:37 -07:00
|
|
|
* Public function, provides additional debug information (including caller
|
|
|
|
* info), if enabled. This function zeroes the allocated memory.
|
2014-01-21 15:50:19 -08:00
|
|
|
*
|
2018-06-30 17:55:02 +03:00
|
|
|
* Return:
|
2014-01-21 15:50:19 -08:00
|
|
|
* Virtual address of allocated memory block on success, NULL on failure.
|
|
|
|
*/
|
2018-10-30 15:08:04 -07:00
|
|
|
void * __init memblock_alloc_try_nid(
			phys_addr_t size, phys_addr_t align,
			phys_addr_t min_addr, phys_addr_t max_addr,
			int nid)
{
	void *vaddr;

	/* Log the request together with the caller's return address. */
	memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n",
		     __func__, (u64)size, (u64)align, nid, &min_addr,
		     &max_addr, (void *)_RET_IP_);

	vaddr = memblock_alloc_internal(size, align,
					min_addr, max_addr, nid, false);
	if (!vaddr)
		return NULL;

	/* Unlike the _raw variant, hand out zeroed memory. */
	memset(vaddr, 0, size);
	return vaddr;
}
|
|
|
|
|
2018-06-30 17:55:03 +03:00
|
|
|
/**
|
2021-11-05 13:43:16 -07:00
|
|
|
* memblock_free_late - free pages directly to buddy allocator
|
2018-06-30 17:55:03 +03:00
|
|
|
* @base: phys starting address of the boot memory block
|
2014-01-21 15:50:19 -08:00
|
|
|
* @size: size of the boot memory block in bytes
|
|
|
|
*
|
2019-03-11 23:30:54 -07:00
|
|
|
* This is only useful when the memblock allocator has already been torn
|
2014-01-21 15:50:19 -08:00
|
|
|
* down, but we are still initializing the system. Pages are released directly
|
2019-03-11 23:30:54 -07:00
|
|
|
* to the buddy allocator.
|
2014-01-21 15:50:19 -08:00
|
|
|
*/
|
2021-11-05 13:43:16 -07:00
|
|
|
void __init memblock_free_late(phys_addr_t base, phys_addr_t size)
{
	phys_addr_t last_byte = base + size - 1;
	phys_addr_t pfn, end_pfn;

	/* The debug line prints the inclusive byte range being released. */
	memblock_dbg("%s: [%pa-%pa] %pS\n",
		     __func__, &base, &last_byte, (void *)_RET_IP_);
	kmemleak_free_part_phys(base, size);

	/* Only pages lying entirely inside [base, base + size) are freed. */
	pfn = PFN_UP(base);
	end_pfn = PFN_DOWN(base + size);

	while (pfn < end_pfn) {
		memblock_free_pages(pfn_to_page(pfn), pfn, 0);
		totalram_pages_inc();
		pfn++;
	}
}
|
2010-07-06 15:39:17 -07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Remaining API functions
|
|
|
|
*/
|
|
|
|
|
2016-02-05 15:36:19 -08:00
|
|
|
/* Return the total_size recorded for the memblock.memory type. */
phys_addr_t __init_memblock memblock_phys_mem_size(void)
{
	const struct memblock_type *mem = &memblock.memory;

	return mem->total_size;
}
|
|
|
|
|
2016-10-07 16:59:18 -07:00
|
|
|
/* Return the total_size recorded for the memblock.reserved type. */
phys_addr_t __init_memblock memblock_reserved_size(void)
{
	const struct memblock_type *rsv = &memblock.reserved;

	return rsv->total_size;
}
|
|
|
|
|
2024-08-08 00:14:13 +00:00
|
|
|
/**
|
|
|
|
* memblock_estimated_nr_free_pages - return estimated number of free pages
|
|
|
|
* from memblock point of view
|
|
|
|
*
|
|
|
|
* During bootup, subsystems might need a rough estimate of the number of free
|
|
|
|
* pages in the whole system, before precise numbers are available from the
|
|
|
|
* buddy. Especially with CONFIG_DEFERRED_STRUCT_PAGE_INIT, the numbers
|
|
|
|
* obtained from the buddy might be very imprecise during bootup.
|
|
|
|
*
|
|
|
|
* Return:
|
|
|
|
* An estimated number of free pages from memblock point of view.
|
|
|
|
*/
|
|
|
|
unsigned long __init memblock_estimated_nr_free_pages(void)
{
	phys_addr_t total = memblock_phys_mem_size();
	phys_addr_t reserved = memblock_reserved_size();

	/* Bytes registered as memory minus bytes reserved, in pages. */
	return PHYS_PFN(total - reserved);
}
|
|
|
|
|
2011-10-31 17:08:16 -07:00
|
|
|
/* lowest address */
|
|
|
|
phys_addr_t __init_memblock memblock_start_of_DRAM(void)
|
|
|
|
{
|
|
|
|
return memblock.memory.regions[0].base;
|
|
|
|
}
|
|
|
|
|
2010-07-28 15:43:02 +10:00
|
|
|
/*
 * memblock_end_of_DRAM - end address of the last memblock.memory entry
 *
 * Return: base + size of the last region in memblock.memory, or 0 when
 * the table holds no regions.
 */
phys_addr_t __init_memblock memblock_end_of_DRAM(void)
{
	const struct memblock_region *last;

	/*
	 * With an empty table "cnt - 1" would index in front of the
	 * regions array, so report 0 instead of reading out of bounds.
	 */
	if (!memblock.memory.cnt)
		return 0;

	last = &memblock.memory.regions[memblock.memory.cnt - 1];

	return last->base + last->size;
}
|
|
|
|
|
2016-07-28 15:48:26 -07:00
|
|
|
static phys_addr_t __init_memblock __find_max_addr(phys_addr_t limit)
{
	struct memblock_region *rgn;

	/*
	 * translate the memory @limit size into the max address within one of
	 * the memory memblock regions: walk the regions, consuming @limit by
	 * each region's size until the remainder fits into the current one.
	 */
	for_each_mem_region(rgn) {
		if (limit > rgn->size) {
			limit -= rgn->size;
			continue;
		}

		return rgn->base + limit;
	}

	/* @limit exceeds the total size of those regions */
	return PHYS_ADDR_MAX;
}
|
|
|
|
|
|
|
|
/*
 * Remove everything from memblock.memory and memblock.reserved that lies
 * above the address at which @limit bytes of memory have been accumulated.
 * A @limit of 0 is a no-op.
 */
void __init memblock_enforce_memory_limit(phys_addr_t limit)
{
	phys_addr_t cutoff;

	if (!limit)
		return;

	cutoff = __find_max_addr(limit);

	/* @limit exceeds the total size of the memory, do nothing */
	if (cutoff != PHYS_ADDR_MAX) {
		/* truncate both memory and reserved regions */
		memblock_remove_range(&memblock.memory, cutoff,
				      PHYS_ADDR_MAX);
		memblock_remove_range(&memblock.reserved, cutoff,
				      PHYS_ADDR_MAX);
	}
}
|
|
|
|
|
2017-04-03 11:23:55 +09:00
|
|
|
/*
 * Restrict memblock.memory to [@base, @base + @size): regions outside that
 * window are removed unless they are flagged nomap, and memblock.reserved
 * is truncated to the same window. A @size of 0 is a no-op.
 */
void __init memblock_cap_memory_range(phys_addr_t base, phys_addr_t size)
{
	int first, last;
	int idx;

	if (!size)
		return;

	if (!memblock_memory->total_size) {
		pr_warn("%s: No memory registered yet\n", __func__);
		return;
	}

	/* [first, last) are the region indices covering the window. */
	if (memblock_isolate_range(&memblock.memory, base, size,
				   &first, &last))
		return;

	/* remove all the MAP regions above the window, highest index first */
	idx = memblock.memory.cnt;
	while (--idx >= last) {
		if (!memblock_is_nomap(&memblock.memory.regions[idx]))
			memblock_remove_region(&memblock.memory, idx);
	}

	/* ... and the ones below it, again walking downwards */
	idx = first;
	while (--idx >= 0) {
		if (!memblock_is_nomap(&memblock.memory.regions[idx]))
			memblock_remove_region(&memblock.memory, idx);
	}

	/* truncate the reserved regions */
	memblock_remove_range(&memblock.reserved, 0, base);
	memblock_remove_range(&memblock.reserved,
			      base + size, PHYS_ADDR_MAX);
}
|
|
|
|
|
2016-07-28 15:48:26 -07:00
|
|
|
/*
 * Cap memory to the first @limit bytes by translating @limit into an
 * address and calling memblock_cap_memory_range(). A @limit of 0 is a no-op.
 */
void __init memblock_mem_limit_remove_map(phys_addr_t limit)
{
	phys_addr_t cutoff;

	if (!limit)
		return;

	cutoff = __find_max_addr(limit);

	/* @limit exceeds the total size of the memory, do nothing */
	if (cutoff != PHYS_ADDR_MAX)
		memblock_cap_memory_range(0, cutoff);
}
|
|
|
|
|
2010-10-11 12:34:09 -07:00
|
|
|
/*
 * Binary search @type for the region containing @addr.
 *
 * Return: index of the matching region, or -1 if no region contains @addr.
 */
static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr)
{
	unsigned int lo = 0, hi = type->cnt;

	do {
		unsigned int probe = (lo + hi) / 2;
		const struct memblock_region *rgn = &type->regions[probe];

		/* @addr is below this region: continue in the lower half. */
		if (addr < rgn->base) {
			hi = probe;
			continue;
		}

		/* @addr falls inside [base, base + size): found it. */
		if (addr < rgn->base + rgn->size)
			return probe;

		/* @addr is at or past the region's end: go to the upper half. */
		lo = probe + 1;
	} while (lo < hi);

	return -1;
}
|
|
|
|
|
2018-12-14 14:17:06 -08:00
|
|
|
/* Return true if @addr lies inside some region of memblock.reserved. */
bool __init_memblock memblock_is_reserved(phys_addr_t addr)
{
	int slot = memblock_search(&memblock.reserved, addr);

	return slot != -1;
}
|
2010-07-12 14:36:09 +10:00
|
|
|
|
2016-01-14 15:18:54 -08:00
|
|
|
/* Return true if @addr lies inside some region of memblock.memory. */
bool __init_memblock memblock_is_memory(phys_addr_t addr)
{
	int slot = memblock_search(&memblock.memory, addr);

	return slot != -1;
}
|
|
|
|
|
2018-02-06 15:41:18 -08:00
|
|
|
/*
 * Return true if @addr lies inside a memblock.memory region that is not
 * flagged nomap.
 */
bool __init_memblock memblock_is_map_memory(phys_addr_t addr)
{
	int slot = memblock_search(&memblock.memory, addr);

	return slot != -1 &&
	       !memblock_is_nomap(&memblock.memory.regions[slot]);
}
|
|
|
|
|
2013-09-11 14:22:17 -07:00
|
|
|
/*
 * Look up the memblock.memory region containing @pfn.
 *
 * On success *@start_pfn and *@end_pfn receive the PFN bounds of that
 * region and the region's node id is returned. If no region contains
 * @pfn, %NUMA_NO_NODE is returned and the outputs are left untouched.
 */
int __init_memblock memblock_search_pfn_nid(unsigned long pfn,
			 unsigned long *start_pfn, unsigned long *end_pfn)
{
	struct memblock_type *type = &memblock.memory;
	struct memblock_region *rgn;
	int slot;

	slot = memblock_search(type, PFN_PHYS(pfn));
	if (slot == -1)
		return NUMA_NO_NODE;

	rgn = &type->regions[slot];
	*start_pfn = PFN_DOWN(rgn->base);
	*end_pfn = PFN_DOWN(rgn->base + rgn->size);

	return memblock_get_region_node(rgn);
}
|
|
|
|
|
2012-05-24 00:45:21 -07:00
|
|
|
/**
|
|
|
|
* memblock_is_region_memory - check if a region is a subset of memory
|
|
|
|
* @base: base of region to check
|
|
|
|
* @size: size of region to check
|
|
|
|
*
|
2018-06-30 17:55:02 +03:00
|
|
|
* Check if the region [@base, @base + @size) is a subset of a memory block.
|
2012-05-24 00:45:21 -07:00
|
|
|
*
|
2018-06-30 17:55:02 +03:00
|
|
|
* Return:
|
2012-05-24 00:45:21 -07:00
|
|
|
* True if the region is a subset of a memory block, false if not.
|
|
|
|
*/
|
2018-02-06 15:41:18 -08:00
|
|
|
bool __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size)
{
	const struct memblock_region *rgn;
	int slot = memblock_search(&memblock.memory, base);
	phys_addr_t end = base + memblock_cap_size(base, &size);

	/* @base itself is not inside any memory region. */
	if (slot == -1)
		return false;

	/* The region holding @base must also reach the end of the range. */
	rgn = &memblock.memory.regions[slot];

	return (rgn->base + rgn->size) >= end;
}
|
|
|
|
|
2012-05-24 00:45:21 -07:00
|
|
|
/**
|
|
|
|
* memblock_is_region_reserved - check if a region intersects reserved memory
|
|
|
|
* @base: base of region to check
|
|
|
|
* @size: size of region to check
|
|
|
|
*
|
2018-06-30 17:55:02 +03:00
|
|
|
* Check if the region [@base, @base + @size) intersects a reserved
|
|
|
|
* memory block.
|
2012-05-24 00:45:21 -07:00
|
|
|
*
|
2018-06-30 17:55:02 +03:00
|
|
|
* Return:
|
2015-09-08 15:02:00 -07:00
|
|
|
* True if they intersect, false if not.
|
2012-05-24 00:45:21 -07:00
|
|
|
*/
|
2015-09-08 15:02:00 -07:00
|
|
|
bool __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
{
	struct memblock_type *rsv = &memblock.reserved;

	/* The overlap test is delegated to memblock_overlaps_region(). */
	return memblock_overlaps_region(rsv, base, size);
}
|
|
|
|
|
2012-10-22 16:35:18 -07:00
|
|
|
/*
 * Shrink every memblock.memory region so that both its start and its end
 * are multiples of @align; regions that become empty are removed.
 */
void __init_memblock memblock_trim_memory(phys_addr_t align)
{
	struct memblock_region *rgn;

	for_each_mem_region(rgn) {
		phys_addr_t old_start = rgn->base;
		phys_addr_t old_end = rgn->base + rgn->size;
		phys_addr_t new_start = round_up(old_start, align);
		phys_addr_t new_end = round_down(old_end, align);

		/* Already aligned on both sides: leave the region alone. */
		if (new_start == old_start && new_end == old_end)
			continue;

		if (new_start >= new_end) {
			/* Nothing aligned is left of this region: drop it. */
			memblock_remove_region(&memblock.memory,
					       rgn - memblock.memory.regions);
			/*
			 * Step the cursor back so the loop's increment lands
			 * on the same slot again after the removal.
			 */
			rgn--;
			continue;
		}

		rgn->base = new_start;
		rgn->size = new_end - new_start;
	}
}
|
2010-07-06 15:39:01 -07:00
|
|
|
|
2010-09-15 13:05:29 -07:00
|
|
|
/* Store @limit in memblock.current_limit. */
void __init_memblock memblock_set_current_limit(phys_addr_t limit)
{
	phys_addr_t *slot = &memblock.current_limit;

	*slot = limit;
}
|
|
|
|
|
2014-02-27 01:23:43 +01:00
|
|
|
/* Return the value currently held in memblock.current_limit. */
phys_addr_t __init_memblock memblock_get_current_limit(void)
{
	phys_addr_t limit = memblock.current_limit;

	return limit;
}
|
|
|
|
|
2017-02-24 14:55:59 -08:00
|
|
|
/*
 * Print the region count of @type followed by one line per region showing
 * its index, inclusive address range, size, node (CONFIG_NUMA only, when
 * the node id is valid) and flags.
 */
static void __init_memblock memblock_dump(struct memblock_type *type)
{
	struct memblock_region *rgn;
	int idx;

	pr_info(" %s.cnt = 0x%lx\n", type->name, type->cnt);

	for_each_memblock_type(idx, type, rgn) {
		phys_addr_t base = rgn->base;
		phys_addr_t size = rgn->size;
		/* Inclusive last byte, for the "[start-end]" output. */
		phys_addr_t end = base + size - 1;
		enum memblock_flags flags = rgn->flags;
		/* Stays empty unless a valid node id is found below. */
		char nid_buf[32] = "";

#ifdef CONFIG_NUMA
		if (numa_valid_node(memblock_get_region_node(rgn)))
			snprintf(nid_buf, sizeof(nid_buf), " on node %d",
				 memblock_get_region_node(rgn));
#endif
		pr_info(" %s[%#x]\t[%pa-%pa], %pa bytes%s flags: %#x\n",
			type->name, idx, &base, &end, &size, nid_buf, flags);
	}
}
|
|
|
|
|
2020-10-13 16:57:54 -07:00
|
|
|
/*
 * Unconditionally print the memblock state: the total sizes of the memory
 * and reserved types, followed by a per-region dump of each type (and of
 * physmem when CONFIG_HAVE_MEMBLOCK_PHYS_MAP is set).
 */
static void __init_memblock __memblock_dump_all(void)
{
	pr_info("MEMBLOCK configuration:\n");
	pr_info(" memory size = %pa reserved size = %pa\n",
		&memblock.memory.total_size, &memblock.reserved.total_size);

	memblock_dump(&memblock.memory);
	memblock_dump(&memblock.reserved);
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
	memblock_dump(&physmem);
#endif
}
|
|
|
|
|
2020-10-13 16:57:54 -07:00
|
|
|
/*
 * Dump the memblock state, but only when memblock_debug is set.
 */
void __init_memblock memblock_dump_all(void)
{
	if (!memblock_debug)
		return;

	__memblock_dump_all();
}
|
|
|
|
|
2011-12-08 10:22:08 -08:00
|
|
|
/*
 * memblock_allow_resize - set the memblock_can_resize flag
 */
void __init memblock_allow_resize(void)
{
	memblock_can_resize = 1;
}
|
|
|
|
|
|
|
|
static int __init early_memblock(char *p)
|
|
|
|
{
|
|
|
|
if (p && strstr(p, "debug"))
|
|
|
|
memblock_debug = 1;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
early_param("memblock", early_memblock);
|
|
|
|
|
2020-12-14 19:09:59 -08:00
|
|
|
/*
 * Free the part of the memmap array that describes the pfn range
 * [@start_pfn, @end_pfn), as far as it covers whole pages.
 */
static void __init free_memmap(unsigned long start_pfn, unsigned long end_pfn)
{
	/* struct page pointers that delimit the range */
	struct page *first_pg = pfn_to_page(start_pfn - 1) + 1;
	struct page *last_pg = pfn_to_page(end_pfn - 1) + 1;

	/*
	 * Their physical addresses, with the start rounded up and the end
	 * rounded down to a page boundary.
	 */
	phys_addr_t from = PAGE_ALIGN(__pa(first_pg));
	phys_addr_t to = PAGE_ALIGN_DOWN(__pa(last_pg));

	/* Nothing page-sized lies between the two: leave the memmap alone */
	if (from >= to)
		return;

	memblock_phys_free(from, to - from);
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The mem_map array can get very big. Free the unused area of the memory map.
|
|
|
|
*/
|
|
|
|
static void __init free_unused_memmap(void)
|
|
|
|
{
|
|
|
|
unsigned long start, end, prev_end = 0;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
if (!IS_ENABLED(CONFIG_HAVE_ARCH_PFN_VALID) ||
|
|
|
|
IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP))
|
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This relies on each bank being in address order.
|
|
|
|
* The banks are sorted previously in bootmem_init().
|
|
|
|
*/
|
|
|
|
for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, NULL) {
|
|
|
|
#ifdef CONFIG_SPARSEMEM
|
|
|
|
/*
|
|
|
|
* Take care not to free memmap entries that don't exist
|
|
|
|
* due to SPARSEMEM sections which aren't present.
|
|
|
|
*/
|
|
|
|
start = min(start, ALIGN(prev_end, PAGES_PER_SECTION));
|
2021-05-17 21:31:59 +03:00
|
|
|
#endif
|
2020-12-14 19:09:59 -08:00
|
|
|
/*
|
2021-05-17 21:15:15 +03:00
|
|
|
* Align down here since many operations in VM subsystem
|
|
|
|
* presume that there are no holes in the memory map inside
|
|
|
|
* a pageblock
|
2020-12-14 19:09:59 -08:00
|
|
|
*/
|
2022-09-07 14:08:42 +08:00
|
|
|
start = pageblock_start_pfn(start);
|
2020-12-14 19:09:59 -08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If we had a previous bank, and there is a space
|
|
|
|
* between the current bank and the previous, free it.
|
|
|
|
*/
|
|
|
|
if (prev_end && prev_end < start)
|
|
|
|
free_memmap(prev_end, start);
|
|
|
|
|
|
|
|
/*
|
2021-05-17 21:15:15 +03:00
|
|
|
* Align up here since many operations in VM subsystem
|
|
|
|
* presume that there are no holes in the memory map inside
|
|
|
|
* a pageblock
|
2020-12-14 19:09:59 -08:00
|
|
|
*/
|
2022-09-07 14:08:43 +08:00
|
|
|
prev_end = pageblock_align(end);
|
2020-12-14 19:09:59 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef CONFIG_SPARSEMEM
|
2021-05-17 21:31:59 +03:00
|
|
|
if (!IS_ALIGNED(prev_end, PAGES_PER_SECTION)) {
|
2022-09-07 14:08:43 +08:00
|
|
|
prev_end = pageblock_align(end);
|
2020-12-14 19:09:59 -08:00
|
|
|
free_memmap(prev_end, ALIGN(prev_end, PAGES_PER_SECTION));
|
2021-05-17 21:31:59 +03:00
|
|
|
}
|
2020-12-14 19:09:59 -08:00
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2018-10-30 15:09:40 -07:00
|
|
|
/*
 * Hand the pfn range [@start, @end) to memblock_free_pages() in
 * power-of-two sized chunks, each as large as the alignment of its first
 * pfn and the remaining length allow (capped at MAX_PAGE_ORDER).
 */
static void __init __free_pages_memory(unsigned long start, unsigned long end)
{
	while (start < end) {
		/*
		 * __ffs() behaviour is undefined for 0, so pfn 0 keeps the
		 * default: it is MAX_PAGE_ORDER-aligned anyway.
		 */
		int order = MAX_PAGE_ORDER;

		if (start)
			order = min_t(int, MAX_PAGE_ORDER, __ffs(start));

		/* Shrink the chunk until it no longer runs past @end */
		while (start + (1UL << order) > end)
			order--;

		memblock_free_pages(pfn_to_page(start), start, order);
		start += 1UL << order;
	}
}
|
|
|
|
|
|
|
|
/*
 * Free the whole pages contained in the physical range [@start, @end),
 * clamped so that nothing at or above max_low_pfn is touched.
 *
 * Return: the number of pages freed, 0 if the clamped range is empty.
 */
static unsigned long __init __free_memory_core(phys_addr_t start,
					       phys_addr_t end)
{
	/* Round inwards: only pages fully inside the range qualify */
	unsigned long first_pfn = PFN_UP(start);
	unsigned long limit_pfn = min_t(unsigned long, PFN_DOWN(end),
					max_low_pfn);

	if (limit_pfn <= first_pfn)
		return 0;

	__free_pages_memory(first_pfn, limit_pfn);

	return limit_pfn - first_pfn;
}
|
|
|
|
|
2021-06-30 18:51:16 -07:00
|
|
|
/*
 * Prepare the struct pages that back reserved and NOMAP memory.
 *
 * The first pass walks the memory regions: NOMAP regions are passed to
 * reserve_bootmem_region(), and the node id of every memory region is
 * applied to the same address range of memblock.reserved.
 *
 * The second pass walks the reserved regions and passes each one that
 * does not carry the MEMBLOCK_RSRV_NOINIT flag to reserve_bootmem_region().
 */
static void __init memmap_init_reserved_pages(void)
{
	struct memblock_region *region;
	phys_addr_t start, end;
	int nid;

	/*
	 * set nid on all reserved pages and also treat struct
	 * pages for the NOMAP regions as PageReserved
	 */
	for_each_mem_region(region) {
		nid = memblock_get_region_node(region);
		start = region->base;
		end = start + region->size;

		if (memblock_is_nomap(region))
			reserve_bootmem_region(start, end, nid);

		/*
		 * memblock_set_node() takes (base, size), not (base, end):
		 * pass the region size so that only this region's range is
		 * tagged with @nid.
		 */
		memblock_set_node(start, region->size, &memblock.reserved, nid);
	}

	/*
	 * initialize struct pages for reserved regions that don't have
	 * the MEMBLOCK_RSRV_NOINIT flag set
	 */
	for_each_reserved_mem_region(region) {
		if (memblock_is_reserved_noinit(region))
			continue;

		nid = memblock_get_region_node(region);
		start = region->base;
		end = start + region->size;

		/* No usable node recorded: look it up from the first pfn */
		if (!numa_valid_node(nid))
			nid = early_pfn_to_nid(PFN_DOWN(start));

		reserve_bootmem_region(start, end, nid);
	}
}
|
|
|
|
|
2018-10-30 15:09:40 -07:00
|
|
|
/*
 * Clear the hotplug flag over the range (0, -1), initialise the reserved
 * struct pages, then free every free memblock range through
 * __free_memory_core().
 *
 * Return: the total number of pages freed.
 */
static unsigned long __init free_low_memory_core_early(void)
{
	phys_addr_t range_start, range_end;
	unsigned long freed = 0;
	u64 idx;

	memblock_clear_hotplug(0, -1);
	memmap_init_reserved_pages();

	/*
	 * Walk with NUMA_NO_NODE rather than NODE_DATA(0)->node_id: when
	 * Node0 has no RAM installed, low RAM will be on Node1.
	 */
	for_each_free_mem_range(idx, NUMA_NO_NODE, MEMBLOCK_NONE,
				&range_start, &range_end, NULL)
		freed += __free_memory_core(range_start, range_end);

	return freed;
}
|
|
|
|
|
|
|
|
static int reset_managed_pages_done __initdata;
|
|
|
|
|
2023-06-07 02:45:48 +00:00
|
|
|
static void __init reset_node_managed_pages(pg_data_t *pgdat)
|
2018-10-30 15:09:40 -07:00
|
|
|
{
|
|
|
|
struct zone *z;
|
|
|
|
|
|
|
|
for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
|
2018-12-28 00:34:24 -08:00
|
|
|
atomic_long_set(&z->managed_pages, 0);
|
2018-10-30 15:09:40 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Zero the managed_pages counters of all zones of every online node.
 * Only the first call does any work; later calls return immediately.
 */
void __init reset_all_zones_managed_pages(void)
{
	struct pglist_data *node;

	if (!reset_managed_pages_done) {
		for_each_online_pgdat(node)
			reset_node_managed_pages(node);

		reset_managed_pages_done = 1;
	}
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* memblock_free_all - release free pages to the buddy allocator
|
|
|
|
*/
|
2021-01-14 16:08:17 +09:00
|
|
|
void __init memblock_free_all(void)
|
2018-10-30 15:09:40 -07:00
|
|
|
{
|
|
|
|
unsigned long pages;
|
|
|
|
|
2020-12-14 19:09:59 -08:00
|
|
|
free_unused_memmap();
|
2018-10-30 15:09:40 -07:00
|
|
|
reset_all_zones_managed_pages();
|
|
|
|
|
|
|
|
pages = free_low_memory_core_early();
|
2018-12-28 00:34:29 -08:00
|
|
|
totalram_pages_add(pages);
|
2018-10-30 15:09:40 -07:00
|
|
|
}
|
|
|
|
|
2024-06-13 11:55:07 -04:00
|
|
|
/* Keep a table to reserve named memory */
|
|
|
|
#define RESERVE_MEM_MAX_ENTRIES 8
|
|
|
|
#define RESERVE_MEM_NAME_SIZE 16
|
|
|
|
struct reserve_mem_table {
|
|
|
|
char name[RESERVE_MEM_NAME_SIZE];
|
|
|
|
phys_addr_t start;
|
|
|
|
phys_addr_t size;
|
|
|
|
};
|
|
|
|
static struct reserve_mem_table reserved_mem_table[RESERVE_MEM_MAX_ENTRIES];
|
|
|
|
static int reserved_mem_count;
|
|
|
|
|
|
|
|
/* Add wildcard region with a lookup name */
|
|
|
|
static void __init reserved_mem_add(phys_addr_t start, phys_addr_t size,
|
|
|
|
const char *name)
|
|
|
|
{
|
|
|
|
struct reserve_mem_table *map;
|
|
|
|
|
|
|
|
map = &reserved_mem_table[reserved_mem_count++];
|
|
|
|
map->start = start;
|
|
|
|
map->size = size;
|
|
|
|
strscpy(map->name, name);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* reserve_mem_find_by_name - Find reserved memory region with a given name
|
|
|
|
* @name: The name that is attached to a reserved memory region
|
|
|
|
* @start: If found, holds the start address
|
|
|
|
* @size: If found, holds the size of the address.
|
|
|
|
*
|
|
|
|
* @start and @size are only updated if @name is found.
|
|
|
|
*
|
|
|
|
* Returns: 1 if found or 0 if not found.
|
|
|
|
*/
|
|
|
|
int reserve_mem_find_by_name(const char *name, phys_addr_t *start, phys_addr_t *size)
|
|
|
|
{
|
|
|
|
struct reserve_mem_table *map;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < reserved_mem_count; i++) {
|
|
|
|
map = &reserved_mem_table[i];
|
|
|
|
if (!map->size)
|
|
|
|
continue;
|
|
|
|
if (strcmp(name, map->name) == 0) {
|
|
|
|
*start = map->start;
|
|
|
|
*size = map->size;
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(reserve_mem_find_by_name);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Parse reserve_mem=nn:align:name
|
|
|
|
*/
|
|
|
|
static int __init reserve_mem(char *p)
|
|
|
|
{
|
|
|
|
phys_addr_t start, size, align, tmp;
|
|
|
|
char *name;
|
|
|
|
char *oldp;
|
|
|
|
int len;
|
|
|
|
|
|
|
|
if (!p)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
/* Check if there's room for more reserved memory */
|
|
|
|
if (reserved_mem_count >= RESERVE_MEM_MAX_ENTRIES)
|
|
|
|
return -EBUSY;
|
|
|
|
|
|
|
|
oldp = p;
|
|
|
|
size = memparse(p, &p);
|
|
|
|
if (!size || p == oldp)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
if (*p != ':')
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
align = memparse(p+1, &p);
|
|
|
|
if (*p != ':')
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* memblock_phys_alloc() doesn't like a zero size align,
|
|
|
|
* but it is OK for this command to have it.
|
|
|
|
*/
|
|
|
|
if (align < SMP_CACHE_BYTES)
|
|
|
|
align = SMP_CACHE_BYTES;
|
|
|
|
|
|
|
|
name = p + 1;
|
|
|
|
len = strlen(name);
|
|
|
|
|
|
|
|
/* name needs to have length but not too big */
|
|
|
|
if (!len || len >= RESERVE_MEM_NAME_SIZE)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
/* Make sure that name has text */
|
|
|
|
for (p = name; *p; p++) {
|
|
|
|
if (!isspace(*p))
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (!*p)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
/* Make sure the name is not already used */
|
|
|
|
if (reserve_mem_find_by_name(name, &start, &tmp))
|
|
|
|
return -EBUSY;
|
|
|
|
|
|
|
|
start = memblock_phys_alloc(size, align);
|
|
|
|
if (!start)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
reserved_mem_add(start, size, name);
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
__setup("reserve_mem=", reserve_mem);
|
|
|
|
|
2019-05-13 17:22:59 -07:00
|
|
|
#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_ARCH_KEEP_MEMBLOCK)
|
2023-05-19 18:53:21 +08:00
|
|
|
static const char * const flagname[] = {
|
|
|
|
[ilog2(MEMBLOCK_HOTPLUG)] = "HOTPLUG",
|
|
|
|
[ilog2(MEMBLOCK_MIRROR)] = "MIRROR",
|
|
|
|
[ilog2(MEMBLOCK_NOMAP)] = "NOMAP",
|
|
|
|
[ilog2(MEMBLOCK_DRIVER_MANAGED)] = "DRV_MNG",
|
2024-02-09 08:39:12 +05:30
|
|
|
[ilog2(MEMBLOCK_RSRV_NOINIT)] = "RSV_NIT",
|
2023-05-19 18:53:21 +08:00
|
|
|
};
|
2010-07-06 15:39:19 -07:00
|
|
|
|
|
|
|
static int memblock_debug_show(struct seq_file *m, void *private)
|
|
|
|
{
|
|
|
|
struct memblock_type *type = m->private;
|
|
|
|
struct memblock_region *reg;
|
2023-06-01 21:31:49 +08:00
|
|
|
int i, j, nid;
|
2023-05-19 18:53:21 +08:00
|
|
|
unsigned int count = ARRAY_SIZE(flagname);
|
2017-02-22 15:46:42 -08:00
|
|
|
phys_addr_t end;
|
2010-07-06 15:39:19 -07:00
|
|
|
|
|
|
|
for (i = 0; i < type->cnt; i++) {
|
|
|
|
reg = &type->regions[i];
|
2017-02-22 15:46:42 -08:00
|
|
|
end = reg->base + reg->size - 1;
|
2023-06-01 21:31:49 +08:00
|
|
|
nid = memblock_get_region_node(reg);
|
2010-07-06 15:39:19 -07:00
|
|
|
|
2017-02-22 15:46:42 -08:00
|
|
|
seq_printf(m, "%4d: ", i);
|
2023-05-19 18:53:21 +08:00
|
|
|
seq_printf(m, "%pa..%pa ", ®->base, &end);
|
2024-06-14 11:05:43 +03:00
|
|
|
if (numa_valid_node(nid))
|
2023-06-01 21:31:49 +08:00
|
|
|
seq_printf(m, "%4d ", nid);
|
|
|
|
else
|
|
|
|
seq_printf(m, "%4c ", 'x');
|
2023-05-19 18:53:21 +08:00
|
|
|
if (reg->flags) {
|
|
|
|
for (j = 0; j < count; j++) {
|
|
|
|
if (reg->flags & (1U << j)) {
|
|
|
|
seq_printf(m, "%s\n", flagname[j]);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (j == count)
|
|
|
|
seq_printf(m, "%s\n", "UNKNOWN");
|
|
|
|
} else {
|
|
|
|
seq_printf(m, "%s\n", "NONE");
|
|
|
|
}
|
2010-07-06 15:39:19 -07:00
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
2018-04-05 16:23:16 -07:00
|
|
|
DEFINE_SHOW_ATTRIBUTE(memblock_debug);
|
2010-07-06 15:39:19 -07:00
|
|
|
|
|
|
|
/*
 * Create the "memblock" debugfs directory with one read-only (0444) file
 * per memblock type: "memory", "reserved" and, when
 * CONFIG_HAVE_MEMBLOCK_PHYS_MAP is set, "physmem".  Always returns 0.
 */
static int __init memblock_init_debugfs(void)
{
	struct dentry *dir;

	dir = debugfs_create_dir("memblock", NULL);

	debugfs_create_file("memory", 0444, dir, &memblock.memory,
			    &memblock_debug_fops);
	debugfs_create_file("reserved", 0444, dir, &memblock.reserved,
			    &memblock_debug_fops);
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
	debugfs_create_file("physmem", 0444, dir, &physmem,
			    &memblock_debug_fops);
#endif

	return 0;
}
__initcall(memblock_init_debugfs);
|
|
|
|
|
|
|
|
#endif /* CONFIG_DEBUG_FS */
|