Sourabh Jain 849599b702 powerpc/crash: add crash memory hotplug support
Extend the arch crash hotplug handler, introduced by the patch titled
("powerpc: add crash CPU hotplug support"), to also support memory
add/remove events.

The elfcorehdr describes the memory of the crashed kernel to the capture
kernel; hence, it needs to be updated if memory resources change due to
memory add/remove events. Therefore, arch_crash_handle_hotplug_event()
is updated to recreate the elfcorehdr and replace the previous one with
it on memory add/remove events.

The memblock list is used to prepare the elfcorehdr. In the case of
memory hot remove, the memblock list is updated after the arch crash
hotplug handler is triggered, as depicted in Figure 1. Thus, the
hot-removed memory is explicitly removed from the crash memory ranges
to ensure that the memory ranges added to elfcorehdr do not include the
hot-removed memory.

    Memory remove
          |
          v
    Offline pages
          |
          v
 Initiate memory notify call <----> crash hotplug handler
 chain for MEM_OFFLINE event
          |
          v
 Update memblock list

 	Figure 1

There are two system calls, `kexec_file_load` and `kexec_load`, used to
load the kdump image. A few changes have been made to ensure that the
kernel can safely update the elfcorehdr component of the kdump image for
both system calls.

For the kexec_file_load syscall, kdump image is prepared in the kernel.
To support an increasing number of memory regions, the elfcorehdr is
built with extra buffer space to ensure that it can accommodate
additional memory ranges in future.

For the kexec_load syscall, the elfcorehdr is updated only if the
KEXEC_CRASH_HOTPLUG_SUPPORT kexec flag is passed to the kernel by the
kexec tool. Passing this flag to the kernel indicates that the
elfcorehdr is built to accommodate additional memory ranges and the
elfcorehdr segment is not considered for SHA calculation, making it safe
to update.

The changes related to this feature are kept under the CRASH_HOTPLUG
config, and it is enabled by default.

Signed-off-by: Sourabh Jain <sourabhjain@linux.ibm.com>
Acked-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://msgid.link/20240326055413.186534-7-sourabhjain@linux.ibm.com
2024-04-23 15:00:04 +10:00

709 lines
18 KiB
C

// SPDX-License-Identifier: GPL-2.0-only
/*
* powerpc code to implement the kexec_file_load syscall
*
* Copyright (C) 2004 Adam Litke (agl@us.ibm.com)
* Copyright (C) 2004 IBM Corp.
* Copyright (C) 2004,2005 Milton D Miller II, IBM Corporation
* Copyright (C) 2005 R Sharada (sharada@in.ibm.com)
* Copyright (C) 2006 Mohan Kumar M (mohan@in.ibm.com)
* Copyright (C) 2020 IBM Corporation
*
* Based on kexec-tools' kexec-ppc64.c, fs2dt.c.
* Heavily modified for the kernel by
* Hari Bathini, IBM Corporation.
*/
#define pr_fmt(fmt) "kexec ranges: " fmt
#include <linux/sort.h>
#include <linux/kexec.h>
#include <linux/of.h>
#include <linux/slab.h>
#include <linux/memblock.h>
#include <linux/crash_core.h>
#include <asm/sections.h>
#include <asm/kexec_ranges.h>
#include <asm/crashdump-ppc64.h>
#if defined(CONFIG_KEXEC_FILE) || defined(CONFIG_CRASH_DUMP)
/**
 * get_max_nr_ranges - Work out how many range entries fit in a crash_mem
 *                     allocation of the given size.
 * @size:              Number of bytes allocated for the crash_mem structure.
 *
 * Returns the number of struct range slots that follow the crash_mem header.
 */
static inline unsigned int get_max_nr_ranges(size_t size)
{
	size_t payload = size - sizeof(struct crash_mem);

	return payload / sizeof(struct range);
}
/**
* get_mem_rngs_size - Get the allocated size of mem_rngs based on
* max_nr_ranges and chunk size.
* @mem_rngs: Memory ranges.
*
* Returns the maximum size of @mem_rngs.
*/
static inline size_t get_mem_rngs_size(struct crash_mem *mem_rngs)
{
size_t size;
if (!mem_rngs)
return 0;
size = (sizeof(struct crash_mem) +
(mem_rngs->max_nr_ranges * sizeof(struct range)));
/*
* Memory is allocated in size multiple of MEM_RANGE_CHUNK_SZ.
* So, align to get the actual length.
*/
return ALIGN(size, MEM_RANGE_CHUNK_SZ);
}
/**
 * __add_mem_range - Append a memory range at the tail of a range list.
 * @mem_ranges:      Range list to append to; (re)allocated when missing
 *                   or full.
 * @base:            Base address of the range to add.
 * @size:            Size of the memory range to add.
 *
 * The range is stored with an inclusive end (@base + @size - 1). No overlap
 * check is done here.
 *
 * Returns 0 on success, -ENOMEM if the list could not be grown.
 */
static int __add_mem_range(struct crash_mem **mem_ranges, u64 base, u64 size)
{
	struct crash_mem *list = *mem_ranges;
	struct range *slot;
	u64 last = base + size - 1;

	/* No list yet, or every slot is in use: grow before appending */
	if (!list || list->nr_ranges == list->max_nr_ranges) {
		list = realloc_mem_ranges(mem_ranges);
		if (!list)
			return -ENOMEM;
	}

	slot = &list->ranges[list->nr_ranges];
	slot->start = base;
	slot->end = last;
	pr_debug("Added memory range [%#016llx - %#016llx] at index %d\n",
		 base, last, list->nr_ranges);
	list->nr_ranges++;

	return 0;
}
/**
 * __merge_memory_ranges - Merges the given memory ranges list.
 * @mem_rngs:              Range list to merge.
 *
 * Assumes the list is sorted by start address. Overlapping or directly
 * adjacent entries are collapsed in place and nr_ranges is updated to the
 * number of entries left. An empty list is left untouched.
 *
 * Returns nothing.
 */
static void __merge_memory_ranges(struct crash_mem *mem_rngs)
{
	struct range *ranges;
	int i, idx;

	/* Nothing to merge; also keeps an empty list at nr_ranges == 0 */
	if (!mem_rngs || !mem_rngs->nr_ranges)
		return;

	idx = 0;
	ranges = &(mem_rngs->ranges[0]);
	for (i = 1; i < mem_rngs->nr_ranges; i++) {
		/*
		 * Compare against the entry being built (idx), not the
		 * previous input entry: an earlier merged range may reach
		 * further than ranges[i-1] does. The adjacency test is
		 * written as a subtraction so that an end of all-ones
		 * cannot wrap.
		 */
		if (ranges[i].start <= ranges[idx].end ||
		    ranges[i].start - ranges[idx].end == 1) {
			/* Never shrink: ranges[i] may be fully contained */
			if (ranges[i].end > ranges[idx].end)
				ranges[idx].end = ranges[i].end;
			continue;
		}

		/* Disjoint: start a new output entry, compacting if needed */
		idx++;
		if (i != idx)
			ranges[idx] = ranges[i];
	}
	mem_rngs->nr_ranges = idx + 1;
}
/* cmp_func_t callback for sort(): orders ranges by ascending start address */
static int rngcmp(const void *_x, const void *_y)
{
	const struct range *lhs = _x;
	const struct range *rhs = _y;

	if (lhs->start == rhs->start)
		return 0;

	return (lhs->start < rhs->start) ? -1 : 1;
}
/**
 * sort_memory_ranges - Sort a memory ranges list by start address.
 * @mem_rngs:           Range list to sort; NULL is accepted and ignored.
 * @merge:              When true, merge the list once it is sorted.
 *
 * The resulting list is also dumped through pr_debug().
 *
 * Returns nothing.
 */
void sort_memory_ranges(struct crash_mem *mem_rngs, bool merge)
{
	struct range *r;
	int idx;

	if (mem_rngs == NULL)
		return;

	/* In-place sort keyed on the start address (see rngcmp) */
	r = mem_rngs->ranges;
	sort(r, mem_rngs->nr_ranges, sizeof(*r), rngcmp, NULL);

	if (merge)
		__merge_memory_ranges(mem_rngs);

	/* Debug aid: print the final list */
	pr_debug("Memory ranges:\n");
	for (idx = 0; idx < mem_rngs->nr_ranges; idx++)
		pr_debug("\t[%03d][%#016llx - %#016llx]\n", idx,
			 r[idx].start, r[idx].end);
}
/**
 * realloc_mem_ranges - Grow a range list by MEM_RANGE_CHUNK_SZ bytes.
 * @mem_ranges:         Memory ranges to reallocate; *@mem_ranges may be NULL,
 *                      in which case a first chunk is allocated.
 *
 * On allocation failure the old list is freed and *@mem_ranges is set to
 * NULL. On success *@mem_ranges points at the grown list, with nr_ranges
 * carried over and max_nr_ranges recomputed for the new size.
 *
 * Returns pointer to reallocated memory on success, NULL otherwise.
 */
struct crash_mem *realloc_mem_ranges(struct crash_mem **mem_ranges)
{
	struct crash_mem *old = *mem_ranges;
	struct crash_mem *grown;
	unsigned int used = 0;
	size_t new_size;

	/* Remember the fill level before the buffer is handed to krealloc() */
	if (old)
		used = old->nr_ranges;
	new_size = get_mem_rngs_size(old) + MEM_RANGE_CHUNK_SZ;

	grown = krealloc(old, new_size, GFP_KERNEL);
	if (!grown) {
		kfree(old);
		*mem_ranges = NULL;
		return NULL;
	}

	grown->nr_ranges = used;
	grown->max_nr_ranges = get_max_nr_ranges(new_size);
	*mem_ranges = grown;

	return grown;
}
/**
 * add_mem_range - Updates existing memory range, if there is an overlap.
 *                 Else, adds a new memory range.
 * @mem_ranges:    Range list to add the memory range to.
 * @base:          Base address of the range to add.
 * @size:          Size of the memory range to add.
 *
 * A zero @size is a no-op. When the new range overlaps an existing entry,
 * the first such entry is widened in place to cover it; the list is neither
 * re-sorted nor merged here. Otherwise the range is appended.
 *
 * (Re)allocates memory, if needed.
 *
 * Returns 0 on success, negative errno on error.
 */
int add_mem_range(struct crash_mem **mem_ranges, u64 base, u64 size)
{
	struct crash_mem *mem_rngs = *mem_ranges;
	u64 mstart, mend, end;
	unsigned int i;

	if (!size)
		return 0;

	/* Ranges are stored with an inclusive end address */
	end = base + size - 1;

	if (!mem_rngs || !(mem_rngs->nr_ranges))
		return __add_mem_range(mem_ranges, base, size);

	for (i = 0; i < mem_rngs->nr_ranges; i++) {
		mstart = mem_rngs->ranges[i].start;
		mend = mem_rngs->ranges[i].end;

		/*
		 * Both ends are inclusive, so two ranges that share even a
		 * single byte (base == mend or end == mstart) overlap.
		 */
		if (base <= mend && end >= mstart) {
			if (base < mstart)
				mem_rngs->ranges[i].start = base;
			if (end > mend)
				mem_rngs->ranges[i].end = end;
			return 0;
		}
	}

	return __add_mem_range(mem_ranges, base, size);
}
#endif /* CONFIG_KEXEC_FILE || CONFIG_CRASH_DUMP */
#ifdef CONFIG_KEXEC_FILE
/**
 * add_tce_mem_ranges - Adds tce-table range to the given memory ranges list.
 * @mem_ranges:         Range list to add the memory range(s) to.
 *
 * Walks every device node of type "pci" and adds the region described by
 * its "linux,tce-base" / "linux,tce-size" properties. Nodes for which a
 * property read returns -EINVAL are skipped.
 *
 * Returns 0 on success, negative errno on error.
 */
static int add_tce_mem_ranges(struct crash_mem **mem_ranges)
{
	struct device_node *dn = NULL;
	int ret = 0;

	for_each_node_by_type(dn, "pci") {
		u64 base;
		u32 size;

		/*
		 * Read the two properties one after the other rather than
		 * OR-ing their return values: a bitwise OR of two different
		 * negative errnos is not a valid errno and would defeat the
		 * -EINVAL test below.
		 */
		ret = of_property_read_u64(dn, "linux,tce-base", &base);
		if (!ret)
			ret = of_property_read_u32(dn, "linux,tce-size", &size);
		if (ret) {
			/*
			 * It is ok to have pci nodes without tce. So, ignore
			 * property does not exist error.
			 */
			if (ret == -EINVAL) {
				ret = 0;
				continue;
			}
			break;
		}

		ret = add_mem_range(mem_ranges, base, size);
		if (ret)
			break;
	}

	of_node_put(dn);
	return ret;
}
/**
 * add_initrd_mem_range - Adds initrd range to the given memory ranges list,
 *                        if the initrd was retained.
 * @mem_ranges:           Range list to add the memory range to.
 *
 * Does nothing unless "retain_initrd" appears in the saved command line.
 * The range is taken from the "linux,initrd-start" and "linux,initrd-end"
 * properties of the chosen node.
 *
 * Returns 0 on success, negative errno on error.
 */
static int add_initrd_mem_range(struct crash_mem **mem_ranges)
{
	u64 base, end;
	int ret;

	/* This range means something, only if initrd was retained */
	if (!strstr(saved_command_line, "retain_initrd"))
		return 0;

	/*
	 * Read the properties in sequence so the first failure is returned
	 * as is; OR-ing two negative errnos gives a meaningless error code.
	 */
	ret = of_property_read_u64(of_chosen, "linux,initrd-start", &base);
	if (!ret)
		ret = of_property_read_u64(of_chosen, "linux,initrd-end", &end);

	/*
	 * NOTE(review): the size below treats "linux,initrd-end" as an
	 * inclusive address - confirm against whoever sets the property.
	 */
	if (!ret)
		ret = add_mem_range(mem_ranges, base, end - base + 1);

	return ret;
}
/**
 * add_htab_mem_range - Adds the hash page table range to the given memory
 *                      ranges list, if one exists.
 * @mem_ranges:         Range list to add the memory range to.
 *
 * Only does work when CONFIG_PPC_64S_HASH_MMU is enabled and htab_address
 * is set; otherwise it is a no-op.
 *
 * Returns 0 on success, negative errno on error.
 */
static int add_htab_mem_range(struct crash_mem **mem_ranges)
{
#ifdef CONFIG_PPC_64S_HASH_MMU
	if (htab_address)
		return add_mem_range(mem_ranges, __pa(htab_address),
				     htab_size_bytes);
#endif
	return 0;
}
/**
* add_kernel_mem_range - Adds kernel text region to the given
* memory ranges list.
* @mem_ranges: Range list to add the memory range to.
*
* Returns 0 on success, negative errno on error.
*/
static int add_kernel_mem_range(struct crash_mem **mem_ranges)
{
return add_mem_range(mem_ranges, 0, __pa(_end));
}
#endif /* CONFIG_KEXEC_FILE */
#if defined(CONFIG_KEXEC_FILE) || defined(CONFIG_CRASH_DUMP)
/**
 * add_rtas_mem_range - Adds RTAS region to the given memory ranges list.
 * @mem_ranges:         Range list to add the memory range to.
 *
 * The region is described by the "linux,rtas-base" and "rtas-size"
 * properties of the "/rtas" node. A missing "/rtas" node is not an error.
 *
 * Returns 0 on success, negative errno on error.
 */
static int add_rtas_mem_range(struct crash_mem **mem_ranges)
{
	struct device_node *dn;
	u32 base, size;
	int ret;

	dn = of_find_node_by_path("/rtas");
	if (!dn)
		return 0;

	/*
	 * Read the properties in sequence so the first failure is returned
	 * as is; OR-ing two negative errnos gives a meaningless error code.
	 */
	ret = of_property_read_u32(dn, "linux,rtas-base", &base);
	if (!ret)
		ret = of_property_read_u32(dn, "rtas-size", &size);
	if (!ret)
		ret = add_mem_range(mem_ranges, base, size);

	of_node_put(dn);
	return ret;
}
/**
 * add_opal_mem_range - Adds OPAL region to the given memory ranges list.
 * @mem_ranges:         Range list to add the memory range to.
 *
 * The region is described by the "opal-base-address" and
 * "opal-runtime-size" properties of the "/ibm,opal" node. A missing
 * "/ibm,opal" node is not an error.
 *
 * Returns 0 on success, negative errno on error.
 */
static int add_opal_mem_range(struct crash_mem **mem_ranges)
{
	struct device_node *dn;
	u64 base, size;
	int ret;

	dn = of_find_node_by_path("/ibm,opal");
	if (!dn)
		return 0;

	/*
	 * Read the properties in sequence so the first failure is returned
	 * as is; OR-ing two negative errnos gives a meaningless error code.
	 */
	ret = of_property_read_u64(dn, "opal-base-address", &base);
	if (!ret)
		ret = of_property_read_u64(dn, "opal-runtime-size", &size);
	if (!ret)
		ret = add_mem_range(mem_ranges, base, size);

	of_node_put(dn);
	return ret;
}
#endif /* CONFIG_KEXEC_FILE || CONFIG_CRASH_DUMP */
#ifdef CONFIG_KEXEC_FILE
/**
* add_reserved_mem_ranges - Adds "/reserved-ranges" regions exported by f/w
* to the given memory ranges list.
* @mem_ranges: Range list to add the memory ranges to.
*
* Returns 0 on success, negative errno on error.
*/
static int add_reserved_mem_ranges(struct crash_mem **mem_ranges)
{
int n_mem_addr_cells, n_mem_size_cells, i, len, cells, ret = 0;
struct device_node *root = of_find_node_by_path("/");
const __be32 *prop;
prop = of_get_property(root, "reserved-ranges", &len);
n_mem_addr_cells = of_n_addr_cells(root);
n_mem_size_cells = of_n_size_cells(root);
of_node_put(root);
if (!prop)
return 0;
cells = n_mem_addr_cells + n_mem_size_cells;
/* Each reserved range is an (address,size) pair */
for (i = 0; i < (len / (sizeof(u32) * cells)); i++) {
u64 base, size;
base = of_read_number(prop + (i * cells), n_mem_addr_cells);
size = of_read_number(prop + (i * cells) + n_mem_addr_cells,
n_mem_size_cells);
ret = add_mem_range(mem_ranges, base, size);
if (ret)
break;
}
return ret;
}
/**
 * get_reserved_memory_ranges - Collect the reserve memory ranges: the
 *                              regions that should go into the memory
 *                              reserve map so that they are protected
 *                              from any mischief.
 * @mem_ranges:                 Range list to add the memory ranges to.
 *
 * Adds, in order, the RTAS region, the tce-table regions and the
 * "reserved-ranges" regions, stopping at the first failure.
 *
 * Returns 0 on success, negative errno on error.
 */
int get_reserved_memory_ranges(struct crash_mem **mem_ranges)
{
	int ret = add_rtas_mem_range(mem_ranges);

	if (!ret)
		ret = add_tce_mem_ranges(mem_ranges);
	if (!ret)
		ret = add_reserved_mem_ranges(mem_ranges);

	if (ret)
		pr_err("Failed to setup reserved memory ranges\n");

	return ret;
}
/**
 * get_exclude_memory_ranges - Collect the exclude memory ranges: regions
 *                             like opal/rtas, tce-table, initrd, kernel
 *                             and htab which should be avoided while
 *                             setting up kexec load segments.
 * @mem_ranges:                Range list to add the memory ranges to.
 *
 * On success the resulting list is sorted and merged.
 *
 * Returns 0 on success, negative errno on error.
 */
int get_exclude_memory_ranges(struct crash_mem **mem_ranges)
{
	int ret = add_tce_mem_ranges(mem_ranges);

	if (!ret)
		ret = add_initrd_mem_range(mem_ranges);
	if (!ret)
		ret = add_htab_mem_range(mem_ranges);
	if (!ret)
		ret = add_kernel_mem_range(mem_ranges);
	if (!ret)
		ret = add_rtas_mem_range(mem_ranges);
	if (!ret)
		ret = add_opal_mem_range(mem_ranges);
	if (!ret)
		ret = add_reserved_mem_ranges(mem_ranges);

	if (ret) {
		pr_err("Failed to setup exclude memory ranges\n");
		return ret;
	}

	/* exclude memory ranges should be sorted for easy lookup */
	sort_memory_ranges(*mem_ranges, true);

	return 0;
}
#ifdef CONFIG_CRASH_DUMP
/**
 * get_usable_memory_ranges - Collect the usable memory ranges: regions
 *                            like crashkernel, opal/rtas & tce-table
 *                            that the kdump kernel could use.
 * @mem_ranges:               Range list to add the memory ranges to.
 *
 * Returns 0 on success, negative errno on error.
 */
int get_usable_memory_ranges(struct crash_mem **mem_ranges)
{
	int ret;

	/*
	 * Guests were seen to fail early in boot when low memory (first
	 * memory block?) is missing from usable memory. As a workaround,
	 * [0, crashk_res.end] is added rather than
	 * [crashk_res.start, crashk_res.end]. The crashed kernel's memory
	 * must also be added to the reserve map so that the kdump kernel
	 * does not use it.
	 */
	ret = add_mem_range(mem_ranges, 0, crashk_res.end + 1);
	if (!ret)
		ret = add_rtas_mem_range(mem_ranges);
	if (!ret)
		ret = add_opal_mem_range(mem_ranges);
	if (!ret)
		ret = add_tce_mem_ranges(mem_ranges);

	if (ret)
		pr_err("Failed to setup usable memory ranges\n");

	return ret;
}
#endif /* CONFIG_CRASH_DUMP */
#endif /* CONFIG_KEXEC_FILE */
#ifdef CONFIG_CRASH_DUMP
/**
* get_crash_memory_ranges - Get crash memory ranges. This list includes
* first/crashing kernel's memory regions that
* would be exported via an elfcore.
* @mem_ranges: Range list to add the memory ranges to.
*
* Returns 0 on success, negative errno on error.
*/
int get_crash_memory_ranges(struct crash_mem **mem_ranges)
{
phys_addr_t base, end;
struct crash_mem *tmem;
u64 i;
int ret;
for_each_mem_range(i, &base, &end) {
u64 size = end - base;
/* Skip backup memory region, which needs a separate entry */
if (base == BACKUP_SRC_START) {
if (size > BACKUP_SRC_SIZE) {
base = BACKUP_SRC_END + 1;
size -= BACKUP_SRC_SIZE;
} else
continue;
}
ret = add_mem_range(mem_ranges, base, size);
if (ret)
goto out;
/* Try merging adjacent ranges before reallocation attempt */
if ((*mem_ranges)->nr_ranges == (*mem_ranges)->max_nr_ranges)
sort_memory_ranges(*mem_ranges, true);
}
/* Reallocate memory ranges if there is no space to split ranges */
tmem = *mem_ranges;
if (tmem && (tmem->nr_ranges == tmem->max_nr_ranges)) {
tmem = realloc_mem_ranges(mem_ranges);
if (!tmem)
goto out;
}
/* Exclude crashkernel region */
ret = crash_exclude_mem_range(tmem, crashk_res.start, crashk_res.end);
if (ret)
goto out;
/*
* FIXME: For now, stay in parity with kexec-tools but if RTAS/OPAL
* regions are exported to save their context at the time of
* crash, they should actually be backed up just like the
* first 64K bytes of memory.
*/
ret = add_rtas_mem_range(mem_ranges);
if (ret)
goto out;
ret = add_opal_mem_range(mem_ranges);
if (ret)
goto out;
/* create a separate program header for the backup region */
ret = add_mem_range(mem_ranges, BACKUP_SRC_START, BACKUP_SRC_SIZE);
if (ret)
goto out;
sort_memory_ranges(*mem_ranges, false);
out:
if (ret)
pr_err("Failed to setup crash memory ranges\n");
return ret;
}
/**
 * remove_mem_range - Removes the given memory range from the range list.
 * @mem_ranges:       Range list to remove the memory range from.
 * @base:             Base address of the range to remove.
 * @size:             Size of the memory range to remove.
 *
 * Only a range that lies completely inside a single list entry is removed;
 * anything else leaves the list unchanged. Removing from the middle of an
 * entry splits it in two, which may need a new list slot.
 *
 * (Re)allocates memory, if needed.
 *
 * Returns 0 on success, negative errno on error.
 */
int remove_mem_range(struct crash_mem **mem_ranges, u64 base, u64 size)
{
	u64 end;
	int ret = 0;
	unsigned int i;
	u64 mstart, mend;
	struct crash_mem *mem_rngs = *mem_ranges;

	/* Nothing to remove, or nothing to remove it from */
	if (!size || !mem_rngs)
		return 0;

	/*
	 * Memory range are stored as start and end address, use
	 * the same format to do remove operation.
	 */
	end = base + size - 1;

	for (i = 0; i < mem_rngs->nr_ranges; i++) {
		mstart = mem_rngs->ranges[i].start;
		mend = mem_rngs->ranges[i].end;

		/*
		 * Memory range to remove is not part of this range entry
		 * in the memory range list
		 */
		if (!(base >= mstart && end <= mend))
			continue;

		if (base == mstart && end == mend) {
			/*
			 * Memory range to remove is equivalent to this entry
			 * in the memory range list. Remove the range entry
			 * by shifting the following entries down by one.
			 */
			for (; i < mem_rngs->nr_ranges - 1; i++) {
				mem_rngs->ranges[i].start = mem_rngs->ranges[i+1].start;
				mem_rngs->ranges[i].end = mem_rngs->ranges[i+1].end;
			}
			mem_rngs->nr_ranges--;
		} else if (base == mstart) {
			/*
			 * Removal starts where the entry starts: move the
			 * entry's start address to end + 1.
			 */
			mem_rngs->ranges[i].start = end + 1;
		} else if (end == mend) {
			/*
			 * Removal ends where the entry ends: move the
			 * entry's end address to base - 1.
			 */
			mem_rngs->ranges[i].end = base - 1;
		} else {
			/*
			 * Removal is strictly inside the entry: split it.
			 * The entry keeps [mstart, base - 1] and a new range
			 * [end + 1, mend] is added. Its size is taken from
			 * the saved mend, because ranges[i].end has already
			 * been overwritten.
			 */
			mem_rngs->ranges[i].end = base - 1;
			ret = add_mem_range(mem_ranges, end + 1, mend - end);
		}

		/*
		 * Stop once the matching entry is handled: add_mem_range()
		 * may have reallocated the list, leaving mem_rngs stale.
		 */
		break;
	}

	return ret;
}
#endif /* CONFIG_CRASH_DUMP */