2019-06-03 07:44:50 +02:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-only
|
2012-03-05 11:49:27 +00:00
|
|
|
/*
|
|
|
|
* Based on arch/arm/mm/init.c
|
|
|
|
*
|
|
|
|
* Copyright (C) 1995-2005 Russell King
|
|
|
|
* Copyright (C) 2012 ARM Ltd.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/kernel.h>
|
|
|
|
#include <linux/export.h>
|
|
|
|
#include <linux/errno.h>
|
|
|
|
#include <linux/swap.h>
|
|
|
|
#include <linux/init.h>
|
2016-08-15 14:45:46 +08:00
|
|
|
#include <linux/cache.h>
|
2012-03-05 11:49:27 +00:00
|
|
|
#include <linux/mman.h>
|
|
|
|
#include <linux/nodemask.h>
|
|
|
|
#include <linux/initrd.h>
|
|
|
|
#include <linux/gfp.h>
|
2023-10-05 16:40:30 +01:00
|
|
|
#include <linux/math.h>
|
2012-03-05 11:49:27 +00:00
|
|
|
#include <linux/memblock.h>
|
|
|
|
#include <linux/sort.h>
|
2017-04-03 11:24:32 +09:00
|
|
|
#include <linux/of.h>
|
2012-03-05 11:49:27 +00:00
|
|
|
#include <linux/of_fdt.h>
|
2019-10-14 20:31:03 +02:00
|
|
|
#include <linux/dma-direct.h>
|
2020-09-11 10:56:52 +02:00
|
|
|
#include <linux/dma-map-ops.h>
|
2014-07-28 19:03:03 +01:00
|
|
|
#include <linux/efi.h>
|
2015-02-05 18:01:53 +00:00
|
|
|
#include <linux/swiotlb.h>
|
2016-09-05 19:30:22 +08:00
|
|
|
#include <linux/vmalloc.h>
|
2017-01-10 13:35:49 -08:00
|
|
|
#include <linux/mm.h>
|
2017-04-03 11:24:32 +09:00
|
|
|
#include <linux/kexec.h>
|
arm64: kdump: provide /proc/vmcore file
Arch-specific functions are added to allow for implementing a crash dump
file interface, /proc/vmcore, which can be viewed as an ELF file.
A user space tool, like kexec-tools, is responsible for allocating
a separate region for the core's ELF header within crash kdump kernel
memory and filling it in when executing kexec_load().
Then, its location will be advertised to crash dump kernel via a new
device-tree property, "linux,elfcorehdr", and crash dump kernel preserves
the region for later use with reserve_elfcorehdr() at boot time.
On crash dump kernel, /proc/vmcore will access the primary kernel's memory
with copy_oldmem_page(), which feeds the data page-by-page by ioremap'ing
it since it does not reside in linear mapping on crash dump kernel.
Meanwhile, elfcorehdr_read() is simple as the region is always mapped.
Signed-off-by: AKASHI Takahiro <takahiro.akashi@linaro.org>
Reviewed-by: James Morse <james.morse@arm.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-04-03 11:24:38 +09:00
|
|
|
#include <linux/crash_dump.h>
|
mm: hugetlb: optionally allocate gigantic hugepages using cma
Commit 944d9fec8d7a ("hugetlb: add support for gigantic page allocation
at runtime") has added the run-time allocation of gigantic pages.
However it actually works only at early stages of the system loading,
when the majority of memory is free. After some time the memory gets
fragmented by non-movable pages, so the chances to find a contiguous 1GB
block are getting close to zero. Even dropping caches manually doesn't
help a lot.
At large scale rebooting servers in order to allocate gigantic hugepages
is quite expensive and complex. At the same time keeping some constant
percentage of memory in reserved hugepages even if the workload isn't
using it is a big waste: not all workloads can benefit from using 1 GB
pages.
The following solution can solve the problem:
1) On boot time a dedicated cma area* is reserved. The size is passed
as a kernel argument.
2) Run-time allocations of gigantic hugepages are performed using the
cma allocator and the dedicated cma area
In this case gigantic hugepages can be allocated successfully with a
high probability, however the memory isn't completely wasted if nobody
is using 1GB hugepages: it can be used for pagecache, anon memory, THPs,
etc.
* On a multi-node machine a per-node cma area is allocated on each node.
Subsequent gigantic hugetlb allocations use the first available
numa node if the mask isn't specified by a user.
Usage:
1) configure the kernel to allocate a cma area for hugetlb allocations:
pass hugetlb_cma=10G as a kernel argument
2) allocate hugetlb pages as usual, e.g.
echo 10 > /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages
If the option isn't enabled or the allocation of the cma area failed,
the current behavior of the system is preserved.
x86 and arm-64 are covered by this patch, other architectures can be
trivially added later.
The patch contains clean-ups and fixes proposed and implemented by Aslan
Bakirov and Randy Dunlap. It also contains ideas and suggestions
proposed by Rik van Riel, Michal Hocko and Mike Kravetz. Thanks!
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Tested-by: Andreas Schaufler <andreas.schaufler@gmx.de>
Acked-by: Mike Kravetz <mike.kravetz@oracle.com>
Acked-by: Michal Hocko <mhocko@kernel.org>
Cc: Aslan Bakirov <aslan@fb.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: Joonsoo Kim <js1304@gmail.com>
Link: http://lkml.kernel.org/r/20200407163840.92263-3-guro@fb.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-04-10 14:32:45 -07:00
|
|
|
#include <linux/hugetlb.h>
|
arm64: mm: Set ZONE_DMA size based on early IORT scan
We recently introduced a 1 GB sized ZONE_DMA to cater for platforms
incorporating masters that can address less than 32 bits of DMA, in
particular the Raspberry Pi 4, which has 4 or 8 GB of DRAM, but has
peripherals that can only address up to 1 GB (and its PCIe host
bridge can only access the bottom 3 GB)
Instructing the DMA layer about these limitations is straight-forward,
even though we had to fix some issues regarding memory limits set in
the IORT for named components, and regarding the handling of ACPI _DMA
methods. However, the DMA layer also needs to be able to allocate
memory that is guaranteed to meet those DMA constraints, for bounce
buffering as well as allocating the backing for consistent mappings.
This is why the 1 GB ZONE_DMA was introduced recently. Unfortunately,
it turns out that having a 1 GB ZONE_DMA as well as a ZONE_DMA32 causes
problems with kdump, and potentially in other places where allocations
cannot cross zone boundaries. Therefore, we should avoid having two
separate DMA zones when possible.
So let's do an early scan of the IORT, and only create the ZONE_DMA
if we encounter any devices that need it. This puts the burden on
the firmware to describe such limitations in the IORT, which may be
redundant (and less precise) if _DMA methods are also being provided.
However, it should be noted that this situation is highly unusual for
arm64 ACPI machines. Also, the DMA subsystem still gives precedence to
the _DMA method if implemented, and so we will not lose the ability to
perform streaming DMA outside the ZONE_DMA if the _DMA method permits
it.
[nsaenz: unified implementation with DT's counterpart]
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Nicolas Saenz Julienne <nsaenzjulienne@suse.de>
Tested-by: Jeremy Linton <jeremy.linton@arm.com>
Acked-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Acked-by: Hanjun Guo <guohanjun@huawei.com>
Cc: Jeremy Linton <jeremy.linton@arm.com>
Cc: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Cc: Nicolas Saenz Julienne <nsaenzjulienne@suse.de>
Cc: Rob Herring <robh+dt@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Hanjun Guo <guohanjun@huawei.com>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Link: https://lore.kernel.org/r/20201119175400.9995-7-nsaenzjulienne@suse.de
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2020-11-19 18:53:58 +01:00
|
|
|
#include <linux/acpi_iort.h>
|
arm64: kdump: Skip kmemleak scan reserved memory for kdump
Trying to boot with kdump + kmemleak, command will result in a crash:
"echo scan > /sys/kernel/debug/kmemleak"
crashkernel reserved: 0x0000000007c00000 - 0x0000000027c00000 (512 MB)
Kernel command line: BOOT_IMAGE=(hd1,gpt2)/vmlinuz-5.14.0-rc5-next-20210809+ root=/dev/mapper/ao-root ro rd.lvm.lv=ao/root rd.lvm.lv=ao/swap crashkernel=512M
Unable to handle kernel paging request at virtual address ffff000007c00000
Mem abort info:
ESR = 0x96000007
EC = 0x25: DABT (current EL), IL = 32 bits
SET = 0, FnV = 0
EA = 0, S1PTW = 0
FSC = 0x07: level 3 translation fault
Data abort info:
ISV = 0, ISS = 0x00000007
CM = 0, WnR = 0
swapper pgtable: 64k pages, 48-bit VAs, pgdp=00002024f0d80000
[ffff000007c00000] pgd=1800205ffffd0003, p4d=1800205ffffd0003, pud=1800205ffffd0003, pmd=1800205ffffc0003, pte=0068000007c00f06
Internal error: Oops: 96000007 [#1] SMP
pstate: 804000c9 (Nzcv daIF +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
pc : scan_block+0x98/0x230
lr : scan_block+0x94/0x230
sp : ffff80008d6cfb70
x29: ffff80008d6cfb70 x28: 0000000000000000 x27: 0000000000000000
x26: 00000000000000c0 x25: 0000000000000001 x24: 0000000000000000
x23: ffffa88a6b18b398 x22: ffff000007c00ff9 x21: ffffa88a6ac7fc40
x20: ffffa88a6af6a830 x19: ffff000007c00000 x18: 0000000000000000
x17: 0000000000000000 x16: 0000000000000000 x15: ffffffffffffffff
x14: ffffffff00000000 x13: ffffffffffffffff x12: 0000000000000020
x11: 0000000000000000 x10: 0000000001080000 x9 : ffffa88a6951c77c
x8 : ffffa88a6a893988 x7 : ffff203ff6cfb3c0 x6 : ffffa88a6a52b3c0
x5 : ffff203ff6cfb3c0 x4 : 0000000000000000 x3 : 0000000000000000
x2 : 0000000000000001 x1 : ffff20226cb56a40 x0 : 0000000000000000
Call trace:
scan_block+0x98/0x230
scan_gray_list+0x120/0x270
kmemleak_scan+0x3a0/0x648
kmemleak_write+0x3ac/0x4c8
full_proxy_write+0x6c/0xa0
vfs_write+0xc8/0x2b8
ksys_write+0x70/0xf8
__arm64_sys_write+0x24/0x30
invoke_syscall+0x4c/0x110
el0_svc_common+0x9c/0x190
do_el0_svc+0x30/0x98
el0_svc+0x28/0xd8
el0t_64_sync_handler+0x90/0xb8
el0t_64_sync+0x180/0x184
The reserved memory for kdump will be looked up by kmemleak, but this area
is set invalid when the kdump service is brought up. That results in a
crash when kmemleak scans this area.
Fixes: a7259df76702 ("memblock: make memblock_find_in_range method private")
Signed-off-by: Chen Wandun <chenwandun@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Reviewed-by: Mike Rapoport <rppt@linux.ibm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Link: https://lore.kernel.org/r/20210910064844.3827813-1-chenwandun@huawei.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2021-09-10 14:48:44 +08:00
|
|
|
#include <linux/kmemleak.h>
|
2024-05-05 19:06:24 +03:00
|
|
|
#include <linux/execmem.h>
|
2012-03-05 11:49:27 +00:00
|
|
|
|
2016-02-16 13:52:42 +01:00
|
|
|
#include <asm/boot.h>
|
2014-07-16 17:42:43 +01:00
|
|
|
#include <asm/fixmap.h>
|
2016-02-16 13:52:40 +01:00
|
|
|
#include <asm/kasan.h>
|
2016-02-16 13:52:42 +01:00
|
|
|
#include <asm/kernel-pgtable.h>
|
KVM: arm64: Prepare the creation of s1 mappings at EL2
When memory protection is enabled, the EL2 code needs the ability to
create and manage its own page-table. To do so, introduce a new set of
hypercalls to bootstrap a memory management system at EL2.
This leads to the following boot flow in nVHE Protected mode:
1. the host allocates memory for the hypervisor very early on, using
the memblock API;
2. the host creates a set of stage 1 page-table for EL2, installs the
EL2 vectors, and issues the __pkvm_init hypercall;
3. during __pkvm_init, the hypervisor re-creates its stage 1 page-table
and stores it in the memory pool provided by the host;
4. the hypervisor then extends its stage 1 mappings to include a
vmemmap in the EL2 VA space, hence allowing to use the buddy
allocator introduced in a previous patch;
5. the hypervisor jumps back in the idmap page, switches from the
host-provided page-table to the new one, and wraps up its
initialization by enabling the new allocator, before returning to
the host.
6. the host can free the now unused page-table created for EL2, and
will now need to issue hypercalls to make changes to the EL2 stage 1
mappings instead of modifying them directly.
Note that for the sake of simplifying the review, this patch focuses on
the hypervisor side of things. In other words, this only implements the
new hypercalls, but does not make use of them from the host yet. The
host-side changes will follow in a subsequent patch.
Credits to Will for __pkvm_init_switch_pgd.
Acked-by: Will Deacon <will@kernel.org>
Co-authored-by: Will Deacon <will@kernel.org>
Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Quentin Perret <qperret@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20210319100146.1149909-18-qperret@google.com
2021-03-19 10:01:25 +00:00
|
|
|
#include <asm/kvm_host.h>
|
arm64: Fix overlapping VA allocations
PCI IO space was intended to be 16MiB, at 32MiB below MODULES_VADDR, but
commit d1e6dc91b532d3d3 ("arm64: Add architectural support for PCI")
extended this to cover the full 32MiB. The final 8KiB of this 32MiB is
also allocated for the fixmap, allowing for potential clashes between
the two.
This change was masked by assumptions in mem_init and the page table
dumping code, which assumed the I/O space to be 16MiB long through
separate hard-coded definitions.
This patch changes the definition of the PCI I/O space allocation to
live in asm/memory.h, along with the other VA space allocations. As the
fixmap allocation depends on the number of fixmap entries, this is moved
below the PCI I/O space allocation. Both the fixmap and PCI I/O space
are guarded with 2MB of padding. Sites assuming the I/O space was 16MiB
are moved over to use the new PCI_IO_{START,END} definitions, which will keep
in sync with the size of the IO space (now restored to 16MiB).
As a useful side effect, the use of the new PCI_IO_{START,END}
definitions prevents a build issue in the dumping code due to a (now
redundant) missing include of io.h for PCI_IOBASE.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Laura Abbott <lauraa@codeaurora.org>
Cc: Liviu Dudau <liviu.dudau@arm.com>
Cc: Steve Capper <steve.capper@linaro.org>
Cc: Will Deacon <will.deacon@arm.com>
[catalin.marinas@arm.com: reorder FIXADDR and PCI_IO address_markers_idx enum]
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2015-01-22 18:20:35 +00:00
|
|
|
#include <asm/memory.h>
|
2016-04-08 15:50:27 -07:00
|
|
|
#include <asm/numa.h>
|
2024-10-17 14:14:30 +01:00
|
|
|
#include <asm/rsi.h>
|
2012-03-05 11:49:27 +00:00
|
|
|
#include <asm/sections.h>
|
|
|
|
#include <asm/setup.h>
|
2019-05-14 15:46:51 -07:00
|
|
|
#include <linux/sizes.h>
|
2012-03-05 11:49:27 +00:00
|
|
|
#include <asm/tlb.h>
|
2014-11-14 15:54:08 +00:00
|
|
|
#include <asm/alternative.h>
|
2021-05-12 13:18:22 -07:00
|
|
|
#include <asm/xen/swiotlb-xen.h>
|
2012-03-05 11:49:27 +00:00
|
|
|
|
2016-02-16 13:52:42 +01:00
|
|
|
/*
|
|
|
|
* We need to be able to catch inadvertent references to memstart_addr
|
|
|
|
* that occur (potentially in generic code) before arm64_memblock_init()
|
|
|
|
* executes, which assigns it its actual value. So use a default value
|
|
|
|
* that cannot be mistaken for a real physical address.
|
|
|
|
*/
|
2016-08-15 14:45:46 +08:00
|
|
|
s64 memstart_addr __ro_after_init = -1;	/* -1 is a placeholder until arm64_memblock_init() assigns the real base */
|
2018-12-07 18:08:15 +00:00
|
|
|
EXPORT_SYMBOL(memstart_addr);
|
|
|
|
|
2019-09-11 20:25:45 +02:00
|
|
|
/*
|
2021-01-07 14:40:08 +00:00
|
|
|
* If the corresponding config options are enabled, we create both ZONE_DMA
|
|
|
|
* and ZONE_DMA32. By default ZONE_DMA covers the 32-bit addressable memory
|
|
|
|
* unless restricted on specific platforms (e.g. 30-bit on Raspberry Pi 4).
|
|
|
|
* In such case, ZONE_DMA32 covers the rest of the 32-bit addressable memory,
|
|
|
|
* otherwise it is empty.
|
2019-09-11 20:25:45 +02:00
|
|
|
*/
|
2022-03-02 09:38:09 -08:00
|
|
|
phys_addr_t __ro_after_init arm64_dma_phys_limit;	/* assigned in zone_sizes_init() */
|
2012-03-05 11:49:27 +00:00
|
|
|
|
2023-08-04 15:56:15 +08:00
|
|
|
/*
 * The base of physical memory is rounded down so that the linear mapping
 * can be built out of block mappings. The granularity chosen per page size:
 *
 *   4k granule:  PUD_SIZE
 *   16k granule: 32 * PMD_SIZE, mapped using the contiguous bit
 *   otherwise:   PMD_SIZE (64k granule)
 */
#ifdef CONFIG_ARM64_4K_PAGES
#define ARM64_MEMSTART_SHIFT		PUD_SHIFT
#elif defined(CONFIG_ARM64_16K_PAGES)
#define ARM64_MEMSTART_SHIFT		CONT_PMD_SHIFT
#else
#define ARM64_MEMSTART_SHIFT		PMD_SHIFT
#endif
|
|
|
|
|
|
|
|
/*
 * With sparsemem vmemmap, the base of the vmemmap region corresponds
 * directly to memstart_addr and must itself be sufficiently aligned in the
 * virtual address space. The alignment is therefore the larger of the
 * block-mapping size and the sparsemem section size.
 */
#if ARM64_MEMSTART_SHIFT >= SECTION_SIZE_BITS
#define ARM64_MEMSTART_ALIGN	(1UL << ARM64_MEMSTART_SHIFT)
#else
#define ARM64_MEMSTART_ALIGN	(1UL << SECTION_SIZE_BITS)
#endif
|
|
|
|
|
2023-09-14 11:31:40 +08:00
|
|
|
/*
 * Reserve memory for a crash (kdump) kernel as requested on the kernel
 * command line.
 *
 * Does nothing when CONFIG_CRASH_RESERVE is disabled or when
 * parse_crashkernel() returns non-zero. Otherwise the parsed size, base,
 * low-memory size and "high" flag are handed to
 * reserve_crashkernel_generic() together with the command line.
 */
static void __init arch_reserve_crashkernel(void)
{
	unsigned long long base, size;
	unsigned long long low = 0;
	bool want_high = false;

	if (!IS_ENABLED(CONFIG_CRASH_RESERVE))
		return;

	/* The total memblock memory size is passed as the system RAM size. */
	if (parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
			      &size, &base, &low, &want_high))
		return;

	reserve_crashkernel_generic(boot_command_line, size, base,
				    low, want_high);
}
|
|
|
|
|
2024-08-11 10:09:35 +03:00
|
|
|
/*
 * Compute the exclusive upper physical address of a DMA zone.
 *
 * @zone_limit: highest physical address (inclusive) the zone may cover.
 *
 * Returns one past the last address of the zone: @zone_limit clamped as
 * described below and to the last byte of DRAM, plus one.
 */
static phys_addr_t __init max_zone_phys(phys_addr_t zone_limit)
{
	phys_addr_t last_dram_byte;

	/*
	 * Firmware information (e.g. DT dma-ranges) describes DMA bus
	 * constraints, but devices doing DMA may have limitations of their
	 * own, and some of them rely on a DMA zone in low 32-bit memory.
	 * So when DRAM starts below 4GB, never let the zone extend beyond
	 * the 32-bit boundary.
	 */
	if (memblock_start_of_DRAM() < U32_MAX)
		zone_limit = min(zone_limit, U32_MAX);

	last_dram_byte = memblock_end_of_DRAM() - 1;

	return min(zone_limit, last_dram_byte) + 1;
}
|
|
|
|
|
2022-04-11 17:24:55 +08:00
|
|
|
/*
 * Work out the upper PFN of each memory zone and hand the table to
 * free_area_init().
 *
 * Along the way arm64_dma_phys_limit is settled: it becomes the ZONE_DMA
 * limit when CONFIG_ZONE_DMA is set, else the ZONE_DMA32 limit when
 * CONFIG_ZONE_DMA32 is set, and PHYS_MASK + 1 if it is still zero after
 * that.
 */
static void __init zone_sizes_init(void)
{
	unsigned long zone_max_pfn[MAX_NR_ZONES] = { 0 };
	phys_addr_t __maybe_unused dma32_limit = max_zone_phys(DMA_BIT_MASK(32));
	phys_addr_t __maybe_unused acpi_limit;
	phys_addr_t __maybe_unused dt_limit;

#ifdef CONFIG_ZONE_DMA
	/* ZONE_DMA is bounded by the tighter of the ACPI IORT and DT limits. */
	acpi_limit = acpi_iort_dma_get_max_cpu_address();
	dt_limit = of_dma_get_max_cpu_address(NULL);
	zone_dma_limit = min(dt_limit, acpi_limit);

	arm64_dma_phys_limit = max_zone_phys(zone_dma_limit);
	zone_max_pfn[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit);
#endif

#ifdef CONFIG_ZONE_DMA32
	zone_max_pfn[ZONE_DMA32] = PFN_DOWN(dma32_limit);

	/* No ZONE_DMA limit was set: fall back to the 32-bit one. */
	if (arm64_dma_phys_limit == 0)
		arm64_dma_phys_limit = dma32_limit;
#endif

	zone_max_pfn[ZONE_NORMAL] = max_pfn;

	/* Neither DMA zone set a limit. */
	if (arm64_dma_phys_limit == 0)
		arm64_dma_phys_limit = PHYS_MASK + 1;

	free_area_init(zone_max_pfn);
}
|
|
|
|
|
2021-06-30 18:51:19 -07:00
|
|
|
int pfn_is_map_memory(unsigned long pfn)
|
2012-03-05 11:49:27 +00:00
|
|
|
{
|
2021-03-05 10:54:58 +05:30
|
|
|
phys_addr_t addr = PFN_PHYS(pfn);
|
2018-12-11 18:48:48 +00:00
|
|
|
|
2021-06-30 18:51:19 -07:00
|
|
|
/* avoid false positives for bogus PFNs, see comment in pfn_valid() */
|
|
|
|
if (PHYS_PFN(addr) != pfn)
|
2018-12-11 18:48:48 +00:00
|
|
|
return 0;
|
2021-03-05 10:54:57 +05:30
|
|
|
|
arm64: mm: check for upper PAGE_SHIFT bits in pfn_valid()
ARM64's pfn_valid() shifts away the upper PAGE_SHIFT bits of the input
before seeing if the PFN is valid. This leads to false positives when
some of the upper bits are set, but the lower bits match a valid PFN.
For example, the following userspace code looks up a bogus entry in
/proc/kpageflags:
int pagemap = open("/proc/self/pagemap", O_RDONLY);
int pageflags = open("/proc/kpageflags", O_RDONLY);
uint64_t pfn, val;
lseek64(pagemap, [...], SEEK_SET);
read(pagemap, &pfn, sizeof(pfn));
if (pfn & (1UL << 63)) { /* valid PFN */
pfn &= ((1UL << 55) - 1); /* clear flag bits */
pfn |= (1UL << 55);
lseek64(pageflags, pfn * sizeof(uint64_t), SEEK_SET);
read(pageflags, &val, sizeof(val));
}
On ARM64 this causes the userspace process to crash with SIGSEGV rather
than reading (1 << KPF_NOPAGE). kpageflags_read() treats the offset as
valid, and stable_page_flags() will try to access an address between the
user and kernel address ranges.
Fixes: c1cc1552616d ("arm64: MMU initialisation")
Cc: stable@vger.kernel.org
Signed-off-by: Greg Hackmann <ghackmann@google.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
2018-08-15 12:51:21 -07:00
|
|
|
return memblock_is_map_memory(addr);
|
2012-03-05 11:49:27 +00:00
|
|
|
}
|
2021-06-30 18:51:19 -07:00
|
|
|
EXPORT_SYMBOL(pfn_is_map_memory);
|
2012-03-05 11:49:27 +00:00
|
|
|
|
2021-12-15 14:45:58 +08:00
|
|
|
/*
 * Memory cap set by early_mem() from the "mem=" early parameter; it keeps
 * the PHYS_ADDR_MAX default if that parameter is never parsed.
 */
static phys_addr_t memory_limit __ro_after_init = PHYS_ADDR_MAX;
|
2015-01-15 16:42:14 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Limit the memory size that was specified via FDT.
|
|
|
|
*/
|
|
|
|
static int __init early_mem(char *p)
|
|
|
|
{
|
|
|
|
if (!p)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
memory_limit = memparse(p, &p) & PAGE_MASK;
|
|
|
|
pr_notice("Memory limited to %lldMB\n", memory_limit >> 20);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
early_param("mem", early_mem);
|
|
|
|
|
2012-03-05 11:49:27 +00:00
|
|
|
/*
 * Trim memblock down to what the linear map can cover, pick memstart_addr
 * (the physical address that corresponds to PAGE_OFFSET), apply any "mem="
 * limit, and reserve the kernel image and initrd.
 */
void __init arm64_memblock_init(void)
{
	/* amount of VA space available to the linear map */
	s64 linear_region_size = PAGE_END - _PAGE_OFFSET(vabits_actual);

	/*
	 * 52-bit VA capable systems that run KVM in nVHE mode may be unable
	 * to support a linear map exceeding 51 bits of VA space, depending on
	 * where the ID map is placed. As that placement may be randomized,
	 * simply clamp the linear map to 51 bits when this configuration is
	 * detected.
	 */
	if (IS_ENABLED(CONFIG_KVM) && vabits_actual == 52 &&
	    is_hyp_mode_available() && !is_kernel_in_hyp_mode()) {
		pr_info("Capping linear region to 51 bits for KVM in nVHE mode on LVA capable hardware.\n");
		linear_region_size = min_t(u64, linear_region_size, BIT(51));
	}

	/* Drop everything above the supported physical address size */
	memblock_remove(1ULL << PHYS_MASK_SHIFT, ULLONG_MAX);

	/* Choose the base of physical memory, suitably aligned */
	memstart_addr = round_down(memblock_start_of_DRAM(),
				   ARM64_MEMSTART_ALIGN);

	/* Tell the user why memory is about to be truncated */
	if ((memblock_end_of_DRAM() - memstart_addr) > linear_region_size)
		pr_warn("Memory doesn't fit in the linear mapping, VA_BITS too small\n");

	/*
	 * Discard memory beyond the reach of the linear mapping, without
	 * clipping the kernel image, which may sit high in memory.
	 */
	memblock_remove(max_t(u64, memstart_addr + linear_region_size,
			      __pa_symbol(_end)), ULLONG_MAX);
	if (memstart_addr + linear_region_size < memblock_end_of_DRAM()) {
		/* move the base up, keeping memstart_addr sufficiently aligned */
		memstart_addr = round_up(memblock_end_of_DRAM() - linear_region_size,
					 ARM64_MEMSTART_ALIGN);
		memblock_remove(0, memstart_addr);
	}

	/*
	 * A 52-bit kernel VA configuration running on hardware without 52-bit
	 * support must place the available physical memory in the 48-bit
	 * addressable part of the linear region, i.e. move it upward. Since
	 * memstart_addr is the physical address of PAGE_OFFSET, moving the
	 * memory up means *subtracting* from memstart_addr.
	 */
	if (IS_ENABLED(CONFIG_ARM64_VA_BITS_52) && (vabits_actual != 52))
		memstart_addr -= _PAGE_OFFSET(vabits_actual) - _PAGE_OFFSET(52);

	/*
	 * Honour the memory limit, if one was set. The kernel may be loaded
	 * high in memory, so add the kernel region back afterwards: it must
	 * stay accessible through the linear mapping.
	 */
	if (memory_limit != PHYS_ADDR_MAX) {
		memblock_mem_limit_remove_map(memory_limit);
		memblock_add(__pa_symbol(_text), (u64)(_end - _text));
	}

	if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) {
		/*
		 * Page-aligned extent of the initrd. If the trimming above
		 * removed it from memblock it is added back below; otherwise
		 * re-adding it is a no-op.
		 */
		u64 base = phys_initrd_start & PAGE_MASK;
		u64 size = PAGE_ALIGN(phys_initrd_start + phys_initrd_size) - base;

		/*
		 * The initrd can only be added back if that does not leave
		 * more memory than the linear mapping can address. It is the
		 * bootloader's job to place the kernel and the initrd
		 * reasonably close together (i.e., within 32 GB of each
		 * other) so that every granule/#levels combination can reach
		 * both.
		 */
		if (!WARN(base < memblock_start_of_DRAM() ||
			  base + size > memblock_start_of_DRAM() +
					linear_region_size,
			  "initrd not fully accessible via the linear mapping -- please check your bootloader ...\n")) {
			memblock_add(base, size);
			memblock_clear_nomap(base, size);
			memblock_reserve(base, size);
		} else {
			/* unreachable initrd: behave as if none was passed */
			phys_initrd_size = 0;
		}
	}

	if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
		extern u16 memstart_offset_seed;
		u64 id_mmfr0 = read_cpuid(ID_AA64MMFR0_EL1);
		int pa_range = cpuid_feature_extract_unsigned_field(
					id_mmfr0, ID_AA64MMFR0_EL1_PARANGE_SHIFT);
		/* linear region size minus the size implied by PARange */
		s64 spare = linear_region_size -
			    BIT(id_aa64mmfr0_parange_to_phys_shift(pa_range));

		/*
		 * When the linear region is larger, by a sufficient margin,
		 * than the region physical memory can span, randomize the
		 * placement of the linear region too.
		 */
		if (memstart_offset_seed > 0 && spare >= (s64)ARM64_MEMSTART_ALIGN) {
			spare /= ARM64_MEMSTART_ALIGN;
			memstart_addr -= ARM64_MEMSTART_ALIGN *
					 ((spare * memstart_offset_seed) >> 16);
		}
	}

	/*
	 * Register the kernel image with memblock. The reservation starts at
	 * _stext rather than _text.
	 */
	memblock_reserve(__pa_symbol(_stext), _end - _stext);
	if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) {
		/* generic initrd code works on virtual addresses */
		initrd_start = __phys_to_virt(phys_initrd_start);
		initrd_end = initrd_start + phys_initrd_size;
	}

	early_init_fdt_scan_reserved_mem();

	high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
}
|
|
|
|
|
|
|
|
/*
 * Record the PFN bounds of DRAM, then run the NUMA, sparsemem and zone
 * initialisation and the boot-time reservations that must happen in a fixed
 * order relative to them.
 */
void __init bootmem_init(void)
{
	unsigned long first_pfn = PFN_UP(memblock_start_of_DRAM());
	unsigned long end_pfn = PFN_DOWN(memblock_end_of_DRAM());

	early_memtest(first_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);

	max_low_pfn = end_pfn;
	max_pfn = end_pfn;
	min_low_pfn = first_pfn;

	arch_numa_init();

	/*
	 * Has to follow arch_numa_init(): that calls numa_init(), which sets
	 * up the node_online_map that hugetlb_cma_reserve() relies on when it
	 * spreads the required CMA size across the online nodes.
	 */
#if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA)
	arm64_hugetlb_cma_reserve();
#endif

	kvm_hyp_reserve();

	/*
	 * sparse_init() allocates from memblock, so it may only run once the
	 * fixed reservations above are in place.
	 */
	sparse_init();
	zone_sizes_init();

	/* The CMA area can be reserved now that arm64_dma_phys_limit is set */
	dma_contiguous_reserve(arm64_dma_phys_limit);

	/*
	 * Done here because request_standard_resources() needs the
	 * crashkernel memory to be reserved already.
	 */
	arch_reserve_crashkernel();

	memblock_dump_all();
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* mem_init() marks the free areas in the mem_map and tells us how much memory
|
|
|
|
* is free. This is done after various parts of the system have claimed their
|
|
|
|
* memory after the kernel image.
|
|
|
|
*/
|
|
|
|
void __init mem_init(void)
|
|
|
|
{
|
2024-10-17 14:14:30 +01:00
|
|
|
unsigned int flags = SWIOTLB_VERBOSE;
|
2023-06-12 16:32:01 +01:00
|
|
|
bool swiotlb = max_pfn > PFN_DOWN(arm64_dma_phys_limit);
|
|
|
|
|
2024-10-17 14:14:30 +01:00
|
|
|
if (is_realm_world()) {
|
|
|
|
swiotlb = true;
|
|
|
|
flags |= SWIOTLB_FORCE;
|
|
|
|
}
|
|
|
|
|
2023-10-05 16:40:30 +01:00
|
|
|
if (IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC) && !swiotlb) {
|
|
|
|
/*
|
|
|
|
* If no bouncing needed for ZONE_DMA, reduce the swiotlb
|
|
|
|
* buffer for kmalloc() bouncing to 1MB per 1GB of RAM.
|
|
|
|
*/
|
|
|
|
unsigned long size =
|
|
|
|
DIV_ROUND_UP(memblock_phys_mem_size(), 1024);
|
|
|
|
swiotlb_adjust_size(min(swiotlb_size_or_default(), size));
|
2023-06-12 16:32:01 +01:00
|
|
|
swiotlb = true;
|
2023-10-05 16:40:30 +01:00
|
|
|
}
|
2023-06-12 16:32:01 +01:00
|
|
|
|
2024-10-17 14:14:30 +01:00
|
|
|
swiotlb_init(swiotlb, flags);
|
|
|
|
swiotlb_update_mem_attributes();
|
2015-02-05 18:01:53 +00:00
|
|
|
|
2013-07-03 15:03:49 -07:00
|
|
|
/* this will put all unused low memory onto the freelists */
|
2018-10-30 15:09:30 -07:00
|
|
|
memblock_free_all();
|
2012-03-05 11:49:27 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Check boundaries twice: Some fundamental inconsistencies can be
|
|
|
|
* detected at build time already.
|
|
|
|
*/
|
|
|
|
#ifdef CONFIG_COMPAT
|
2018-12-06 22:50:37 +00:00
|
|
|
BUILD_BUG_ON(TASK_SIZE_32 > DEFAULT_MAP_WINDOW_64);
|
2012-03-05 11:49:27 +00:00
|
|
|
#endif
|
|
|
|
|
2021-05-10 17:52:06 +05:30
|
|
|
/*
|
|
|
|
* Selected page table levels should match when derived from
|
|
|
|
* scratch using the virtual address range and page size.
|
|
|
|
*/
|
|
|
|
BUILD_BUG_ON(ARM64_HW_PGTABLE_LEVELS(CONFIG_ARM64_VA_BITS) !=
|
|
|
|
CONFIG_PGTABLE_LEVELS);
|
|
|
|
|
2013-07-03 15:03:49 -07:00
|
|
|
if (PAGE_SIZE >= 16384 && get_num_physpages() <= 128) {
|
2012-03-05 11:49:27 +00:00
|
|
|
extern int sysctl_overcommit_memory;
|
|
|
|
/*
|
|
|
|
* On a machine this small we won't get anywhere without
|
|
|
|
* overcommit, so turn it on by default.
|
|
|
|
*/
|
|
|
|
sysctl_overcommit_memory = OVERCOMMIT_ALWAYS;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void free_initmem(void)
|
|
|
|
{
|
2024-09-05 16:29:35 +01:00
|
|
|
void *lm_init_begin = lm_alias(__init_begin);
|
|
|
|
void *lm_init_end = lm_alias(__init_end);
|
|
|
|
|
|
|
|
WARN_ON(!IS_ALIGNED((unsigned long)lm_init_begin, PAGE_SIZE));
|
|
|
|
WARN_ON(!IS_ALIGNED((unsigned long)lm_init_end, PAGE_SIZE));
|
2024-09-02 10:39:35 +08:00
|
|
|
|
|
|
|
/* Delete __init region from memblock.reserved. */
|
2024-09-05 16:29:35 +01:00
|
|
|
memblock_free(lm_init_begin, lm_init_end - lm_init_begin);
|
2024-09-02 10:39:35 +08:00
|
|
|
|
2024-09-05 16:29:35 +01:00
|
|
|
free_reserved_area(lm_init_begin, lm_init_end,
|
2019-10-04 09:53:58 +05:30
|
|
|
POISON_FREE_INITMEM, "unused kernel");
|
2016-09-05 19:30:22 +08:00
|
|
|
/*
|
|
|
|
* Unmap the __init region but leave the VM area in place. This
|
|
|
|
* prevents the region from being reused for kernel modules, which
|
|
|
|
* is not supported by kallsyms.
|
|
|
|
*/
|
2021-04-29 22:59:01 -07:00
|
|
|
vunmap_range((u64)__init_begin, (u64)__init_end);
|
2012-03-05 11:49:27 +00:00
|
|
|
}
|
|
|
|
|
2020-06-29 10:08:31 +05:30
|
|
|
void dump_mem_limit(void)
|
2016-02-16 13:52:42 +01:00
|
|
|
{
|
2018-06-14 15:28:02 -07:00
|
|
|
if (memory_limit != PHYS_ADDR_MAX) {
|
2016-02-16 13:52:42 +01:00
|
|
|
pr_emerg("Memory Limit: %llu MB\n", memory_limit >> 20);
|
|
|
|
} else {
|
|
|
|
pr_emerg("Memory Limit: none\n");
|
|
|
|
}
|
|
|
|
}
|
2024-05-05 19:06:24 +03:00
|
|
|
|
|
|
|
#ifdef CONFIG_EXECMEM
|
|
|
|
/*
 * Base addresses of the 128M (direct branch) and 2G (PLT) module windows.
 * A value of zero means the corresponding window is not available.
 */
static u64 module_direct_base __ro_after_init;
static u64 module_plt_base __ro_after_init;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Choose a random page-aligned base address for a window of 'size' bytes which
|
|
|
|
* entirely contains the interval [start, end - 1].
|
|
|
|
*/
|
|
|
|
static u64 __init random_bounding_box(u64 size, u64 start, u64 end)
|
|
|
|
{
|
|
|
|
u64 max_pgoff, pgoff;
|
|
|
|
|
|
|
|
if ((end - start) >= size)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
max_pgoff = (size - (end - start)) / PAGE_SIZE;
|
|
|
|
pgoff = get_random_u32_inclusive(0, max_pgoff);
|
|
|
|
|
|
|
|
return start - pgoff * PAGE_SIZE;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Modules may directly reference data and text anywhere within the kernel
|
|
|
|
* image and other modules. References using PREL32 relocations have a +/-2G
|
|
|
|
* range, and so we need to ensure that the entire kernel image and all modules
|
|
|
|
* fall within a 2G window such that these are always within range.
|
|
|
|
*
|
|
|
|
* Modules may directly branch to functions and code within the kernel text,
|
|
|
|
* and to functions and code within other modules. These branches will use
|
|
|
|
* CALL26/JUMP26 relocations with a +/-128M range. Without PLTs, we must ensure
|
|
|
|
* that the entire kernel text and all module text falls within a 128M window
|
|
|
|
* such that these are always within range. With PLTs, we can expand this to a
|
|
|
|
* 2G window.
|
|
|
|
*
|
|
|
|
* We chose the 128M region to surround the entire kernel image (rather than
|
|
|
|
* just the text) as using the same bounds for the 128M and 2G regions ensures
|
|
|
|
* by construction that we never select a 128M region that is not a subset of
|
|
|
|
* the 2G region. For very large and unusual kernel configurations this means
|
|
|
|
* we may fall back to PLTs where they could have been avoided, but this keeps
|
|
|
|
* the logic significantly simpler.
|
|
|
|
*/
|
|
|
|
static int __init module_init_limits(void)
|
|
|
|
{
|
|
|
|
u64 kernel_end = (u64)_end;
|
|
|
|
u64 kernel_start = (u64)_text;
|
|
|
|
u64 kernel_size = kernel_end - kernel_start;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The default modules region is placed immediately below the kernel
|
|
|
|
* image, and is large enough to use the full 2G relocation range.
|
|
|
|
*/
|
|
|
|
BUILD_BUG_ON(KIMAGE_VADDR != MODULES_END);
|
|
|
|
BUILD_BUG_ON(MODULES_VSIZE < SZ_2G);
|
|
|
|
|
|
|
|
if (!kaslr_enabled()) {
|
|
|
|
if (kernel_size < SZ_128M)
|
|
|
|
module_direct_base = kernel_end - SZ_128M;
|
|
|
|
if (kernel_size < SZ_2G)
|
|
|
|
module_plt_base = kernel_end - SZ_2G;
|
|
|
|
} else {
|
|
|
|
u64 min = kernel_start;
|
|
|
|
u64 max = kernel_end;
|
|
|
|
|
|
|
|
if (IS_ENABLED(CONFIG_RANDOMIZE_MODULE_REGION_FULL)) {
|
|
|
|
pr_info("2G module region forced by RANDOMIZE_MODULE_REGION_FULL\n");
|
|
|
|
} else {
|
|
|
|
module_direct_base = random_bounding_box(SZ_128M, min, max);
|
|
|
|
if (module_direct_base) {
|
|
|
|
min = module_direct_base;
|
|
|
|
max = module_direct_base + SZ_128M;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
module_plt_base = random_bounding_box(SZ_2G, min, max);
|
|
|
|
}
|
|
|
|
|
|
|
|
pr_info("%llu pages in range for non-PLT usage",
|
|
|
|
module_direct_base ? (SZ_128M - kernel_size) / PAGE_SIZE : 0);
|
|
|
|
pr_info("%llu pages in range for PLT usage",
|
|
|
|
module_plt_base ? (SZ_2G - kernel_size) / PAGE_SIZE : 0);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct execmem_info execmem_info __ro_after_init;
|
|
|
|
|
|
|
|
struct execmem_info __init *execmem_arch_setup(void)
|
|
|
|
{
|
|
|
|
unsigned long fallback_start = 0, fallback_end = 0;
|
|
|
|
unsigned long start = 0, end = 0;
|
|
|
|
|
|
|
|
module_init_limits();
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Where possible, prefer to allocate within direct branch range of the
|
|
|
|
* kernel such that no PLTs are necessary.
|
|
|
|
*/
|
|
|
|
if (module_direct_base) {
|
|
|
|
start = module_direct_base;
|
|
|
|
end = module_direct_base + SZ_128M;
|
|
|
|
|
|
|
|
if (module_plt_base) {
|
|
|
|
fallback_start = module_plt_base;
|
|
|
|
fallback_end = module_plt_base + SZ_2G;
|
|
|
|
}
|
|
|
|
} else if (module_plt_base) {
|
|
|
|
start = module_plt_base;
|
|
|
|
end = module_plt_base + SZ_2G;
|
|
|
|
}
|
|
|
|
|
|
|
|
execmem_info = (struct execmem_info){
|
|
|
|
.ranges = {
|
|
|
|
[EXECMEM_DEFAULT] = {
|
|
|
|
.start = start,
|
|
|
|
.end = end,
|
|
|
|
.pgprot = PAGE_KERNEL,
|
|
|
|
.alignment = 1,
|
|
|
|
.fallback_start = fallback_start,
|
|
|
|
.fallback_end = fallback_end,
|
|
|
|
},
|
|
|
|
[EXECMEM_KPROBES] = {
|
|
|
|
.start = VMALLOC_START,
|
|
|
|
.end = VMALLOC_END,
|
|
|
|
.pgprot = PAGE_KERNEL_ROX,
|
|
|
|
.alignment = 1,
|
|
|
|
},
|
|
|
|
[EXECMEM_BPF] = {
|
|
|
|
.start = VMALLOC_START,
|
|
|
|
.end = VMALLOC_END,
|
|
|
|
.pgprot = PAGE_KERNEL,
|
|
|
|
.alignment = 1,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
|
|
|
return &execmem_info;
|
|
|
|
}
|
|
|
|
#endif /* CONFIG_EXECMEM */
|