1012 lines
25 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0-only
/*
* efi.c - EFI subsystem
*
* Copyright (C) 2001,2003,2004 Dell <Matt_Domsch@dell.com>
* Copyright (C) 2004 Intel Corporation <matthew.e.tolentino@intel.com>
* Copyright (C) 2013 Tom Gundersen <teg@jklm.no>
*
* This code registers /sys/firmware/efi{,/efivars} when EFI is supported,
* allowing the efivarfs to be mounted or the efivars module to be loaded.
* The existance of /sys/firmware/efi may also be used by userspace to
* determine that the system supports EFI.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kobject.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/debugfs.h>
#include <linux/device.h>
#include <linux/efi.h>
#include <linux/of.h>
#include <linux/io.h>
#include <linux/kexec.h>
#include <linux/platform_device.h>
#include <linux/random.h>
#include <linux/reboot.h>
#include <linux/slab.h>
#include <linux/acpi.h>
#include <linux/ucs2_string.h>
2016-02-29 21:22:52 +00:00
#include <linux/memblock.h>
#include <linux/security.h>
#include <asm/early_ioremap.h>
struct efi __read_mostly efi = {
.runtime_supported_mask = EFI_RT_SUPPORTED_ALL,
.acpi = EFI_INVALID_TABLE_ADDR,
.acpi20 = EFI_INVALID_TABLE_ADDR,
.smbios = EFI_INVALID_TABLE_ADDR,
.smbios3 = EFI_INVALID_TABLE_ADDR,
.esrt = EFI_INVALID_TABLE_ADDR,
.tpm_log = EFI_INVALID_TABLE_ADDR,
.tpm_final_log = EFI_INVALID_TABLE_ADDR,
};
EXPORT_SYMBOL(efi);
unsigned long __ro_after_init efi_rng_seed = EFI_INVALID_TABLE_ADDR;
static unsigned long __initdata mem_reserve = EFI_INVALID_TABLE_ADDR;
static unsigned long __initdata rt_prop = EFI_INVALID_TABLE_ADDR;
struct mm_struct efi_mm = {
.mm_rb = RB_ROOT,
.mm_users = ATOMIC_INIT(2),
.mm_count = ATOMIC_INIT(1),
MMAP_LOCK_INITIALIZER(efi_mm)
.page_table_lock = __SPIN_LOCK_UNLOCKED(efi_mm.page_table_lock),
.mmlist = LIST_HEAD_INIT(efi_mm.mmlist),
.cpu_bitmap = { [BITS_TO_LONGS(NR_CPUS)] = 0},
};
efi: Use a work queue to invoke EFI Runtime Services Presently, when a user process requests the kernel to execute any UEFI runtime service, the kernel temporarily switches to a separate set of page tables that describe the virtual mapping of the UEFI runtime services regions in memory. Since UEFI runtime services are typically invoked with interrupts enabled, any code that may be called during this time, will have an incorrect view of the process's address space. Although it is unusual for code running in interrupt context to make assumptions about the process context it runs in, there are cases (such as the perf subsystem taking samples) where this causes problems. So let's set up a work queue for calling UEFI runtime services, so that the actual calls are made when the work queue items are dispatched by a work queue worker running in a separate kernel thread. Such threads are not expected to have userland mappings in the first place, and so the additional mappings created for the UEFI runtime services can never clash with any. The ResetSystem() runtime service is not covered by the work queue handling, since it is not expected to return, and may be called at a time when the kernel is torn down to the point where we cannot expect work queues to still be operational. The non-blocking variants of SetVariable() and QueryVariableInfo() are also excluded: these are intended to be used from atomic context, which obviously rules out waiting for a completion to be signalled by another thread. Note that these variants are currently only used for UEFI runtime services calls that occur very early in the boot, and for ones that occur in critical conditions, e.g., to flush kernel logs to UEFI variables via efi-pstore. Suggested-by: Andy Lutomirski <luto@kernel.org> Signed-off-by: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com> [ardb: exclude ResetSystem() from the workqueue treatment merge from 2 separate patches and rewrite commit log] Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/20180711094040.12506-4-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar <mingo@kernel.org>
2018-07-11 11:40:35 +02:00
struct workqueue_struct *efi_rts_wq;
static bool disable_runtime;
static int __init setup_noefi(char *arg)
{
disable_runtime = true;
return 0;
}
early_param("noefi", setup_noefi);
bool efi_runtime_disabled(void)
{
return disable_runtime;
}
bool __pure __efi_soft_reserve_enabled(void)
{
return !efi_enabled(EFI_MEM_NO_SOFT_RESERVE);
}
static int __init parse_efi_cmdline(char *str)
{
efi: Check for NULL efi kernel parameters Even though it is documented how to specifiy efi parameters, it is possible to cause a kernel panic due to a dereference of a NULL pointer when parsing such parameters if "efi" alone is given: PANIC: early exception 0e rip 10:ffffffff812fb361 error 0 cr2 0 [ 0.000000] CPU: 0 PID: 0 Comm: swapper Not tainted 4.2.0-rc1+ #450 [ 0.000000] ffffffff81fe20a9 ffffffff81e03d50 ffffffff8184bb0f 00000000000003f8 [ 0.000000] 0000000000000000 ffffffff81e03e08 ffffffff81f371a1 64656c62616e6520 [ 0.000000] 0000000000000069 000000000000005f 0000000000000000 0000000000000000 [ 0.000000] Call Trace: [ 0.000000] [<ffffffff8184bb0f>] dump_stack+0x45/0x57 [ 0.000000] [<ffffffff81f371a1>] early_idt_handler_common+0x81/0xae [ 0.000000] [<ffffffff812fb361>] ? parse_option_str+0x11/0x90 [ 0.000000] [<ffffffff81f4dd69>] arch_parse_efi_cmdline+0x15/0x42 [ 0.000000] [<ffffffff81f376e1>] do_early_param+0x50/0x8a [ 0.000000] [<ffffffff8106b1b3>] parse_args+0x1e3/0x400 [ 0.000000] [<ffffffff81f37a43>] parse_early_options+0x24/0x28 [ 0.000000] [<ffffffff81f37691>] ? loglevel+0x31/0x31 [ 0.000000] [<ffffffff81f37a78>] parse_early_param+0x31/0x3d [ 0.000000] [<ffffffff81f3ae98>] setup_arch+0x2de/0xc08 [ 0.000000] [<ffffffff8109629a>] ? vprintk_default+0x1a/0x20 [ 0.000000] [<ffffffff81f37b20>] start_kernel+0x90/0x423 [ 0.000000] [<ffffffff81f37495>] x86_64_start_reservations+0x2a/0x2c [ 0.000000] [<ffffffff81f37582>] x86_64_start_kernel+0xeb/0xef [ 0.000000] RIP 0xffffffff81ba2efc This panic is not reproducible with "efi=" as this will result in a non-NULL zero-length string. Thus, verify that the pointer to the parameter string is not NULL. This is consistent with other parameter-parsing functions which check for NULL pointers. Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com> Cc: Dave Young <dyoung@redhat.com> Cc: <stable@vger.kernel.org> Signed-off-by: Matt Fleming <matt.fleming@intel.com>
2015-07-15 19:36:03 -07:00
if (!str) {
pr_warn("need at least one option\n");
return -EINVAL;
}
if (parse_option_str(str, "debug"))
set_bit(EFI_DBG, &efi.flags);
if (parse_option_str(str, "noruntime"))
disable_runtime = true;
if (parse_option_str(str, "nosoftreserve"))
set_bit(EFI_MEM_NO_SOFT_RESERVE, &efi.flags);
return 0;
}
early_param("efi", parse_efi_cmdline);
struct kobject *efi_kobj;
/*
* Let's not leave out systab information that snuck into
* the efivars driver
* Note, do not add more fields in systab sysfs file as it breaks sysfs
* one value per file rule!
*/
static ssize_t systab_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
char *str = buf;
if (!kobj || !buf)
return -EINVAL;
if (efi.acpi20 != EFI_INVALID_TABLE_ADDR)
str += sprintf(str, "ACPI20=0x%lx\n", efi.acpi20);
if (efi.acpi != EFI_INVALID_TABLE_ADDR)
str += sprintf(str, "ACPI=0x%lx\n", efi.acpi);
/*
* If both SMBIOS and SMBIOS3 entry points are implemented, the
* SMBIOS3 entry point shall be preferred, so we list it first to
* let applications stop parsing after the first match.
*/
if (efi.smbios3 != EFI_INVALID_TABLE_ADDR)
str += sprintf(str, "SMBIOS3=0x%lx\n", efi.smbios3);
if (efi.smbios != EFI_INVALID_TABLE_ADDR)
str += sprintf(str, "SMBIOS=0x%lx\n", efi.smbios);
if (IS_ENABLED(CONFIG_IA64) || IS_ENABLED(CONFIG_X86))
str = efi_systab_show_arch(str);
return str - buf;
}
static struct kobj_attribute efi_attr_systab = __ATTR_RO_MODE(systab, 0400);
static ssize_t fw_platform_size_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sprintf(buf, "%d\n", efi_enabled(EFI_64BIT) ? 64 : 32);
}
extern __weak struct kobj_attribute efi_attr_fw_vendor;
extern __weak struct kobj_attribute efi_attr_runtime;
extern __weak struct kobj_attribute efi_attr_config_table;
static struct kobj_attribute efi_attr_fw_platform_size =
__ATTR_RO(fw_platform_size);
static struct attribute *efi_subsys_attrs[] = {
&efi_attr_systab.attr,
&efi_attr_fw_platform_size.attr,
&efi_attr_fw_vendor.attr,
&efi_attr_runtime.attr,
&efi_attr_config_table.attr,
NULL,
};
umode_t __weak efi_attr_is_visible(struct kobject *kobj, struct attribute *attr,
int n)
{
return attr->mode;
}
static const struct attribute_group efi_subsys_attr_group = {
.attrs = efi_subsys_attrs,
.is_visible = efi_attr_is_visible,
};
static struct efivars generic_efivars;
static struct efivar_operations generic_ops;
static int generic_ops_register(void)
{
generic_ops.get_variable = efi.get_variable;
generic_ops.set_variable = efi.set_variable;
generic_ops.set_variable_nonblocking = efi.set_variable_nonblocking;
generic_ops.get_next_variable = efi.get_next_variable;
generic_ops.query_variable_store = efi_query_variable_store;
return efivars_register(&generic_efivars, &generic_ops, efi_kobj);
}
static void generic_ops_unregister(void)
{
efivars_unregister(&generic_efivars);
}
#ifdef CONFIG_EFI_CUSTOM_SSDT_OVERLAYS
#define EFIVAR_SSDT_NAME_MAX 16
static char efivar_ssdt[EFIVAR_SSDT_NAME_MAX] __initdata;
static int __init efivar_ssdt_setup(char *str)
{
int ret = security_locked_down(LOCKDOWN_ACPI_TABLES);
if (ret)
return ret;
if (strlen(str) < sizeof(efivar_ssdt))
memcpy(efivar_ssdt, str, strlen(str));
else
pr_warn("efivar_ssdt: name too long: %s\n", str);
return 0;
}
__setup("efivar_ssdt=", efivar_ssdt_setup);
static __init int efivar_ssdt_iter(efi_char16_t *name, efi_guid_t vendor,
unsigned long name_size, void *data)
{
struct efivar_entry *entry;
struct list_head *list = data;
char utf8_name[EFIVAR_SSDT_NAME_MAX];
int limit = min_t(unsigned long, EFIVAR_SSDT_NAME_MAX, name_size);
ucs2_as_utf8(utf8_name, name, limit - 1);
if (strncmp(utf8_name, efivar_ssdt, limit) != 0)
return 0;
entry = kmalloc(sizeof(*entry), GFP_KERNEL);
if (!entry)
return 0;
memcpy(entry->var.VariableName, name, name_size);
memcpy(&entry->var.VendorGuid, &vendor, sizeof(efi_guid_t));
efivar_entry_add(entry, list);
return 0;
}
static __init int efivar_ssdt_load(void)
{
LIST_HEAD(entries);
struct efivar_entry *entry, *aux;
unsigned long size;
void *data;
int ret;
2019-10-02 18:58:59 +02:00
if (!efivar_ssdt[0])
return 0;
ret = efivar_init(efivar_ssdt_iter, &entries, true, &entries);
list_for_each_entry_safe(entry, aux, &entries, list) {
pr_info("loading SSDT from variable %s-%pUl\n", efivar_ssdt,
&entry->var.VendorGuid);
list_del(&entry->list);
ret = efivar_entry_size(entry, &size);
if (ret) {
pr_err("failed to get var size\n");
goto free_entry;
}
data = kmalloc(size, GFP_KERNEL);
if (!data) {
ret = -ENOMEM;
goto free_entry;
}
ret = efivar_entry_get(entry, NULL, &size, data);
if (ret) {
pr_err("failed to get var data\n");
goto free_data;
}
ret = acpi_load_table(data, NULL);
if (ret) {
pr_err("failed to load table: %d\n", ret);
goto free_data;
}
goto free_entry;
free_data:
kfree(data);
free_entry:
kfree(entry);
}
return ret;
}
#else
static inline int efivar_ssdt_load(void) { return 0; }
#endif
#ifdef CONFIG_DEBUG_FS
#define EFI_DEBUGFS_MAX_BLOBS 32
static struct debugfs_blob_wrapper debugfs_blob[EFI_DEBUGFS_MAX_BLOBS];
static void __init efi_debugfs_init(void)
{
struct dentry *efi_debugfs;
efi_memory_desc_t *md;
char name[32];
int type_count[EFI_BOOT_SERVICES_DATA + 1] = {};
int i = 0;
efi_debugfs = debugfs_create_dir("efi", NULL);
if (IS_ERR_OR_NULL(efi_debugfs))
return;
for_each_efi_memory_desc(md) {
switch (md->type) {
case EFI_BOOT_SERVICES_CODE:
snprintf(name, sizeof(name), "boot_services_code%d",
type_count[md->type]++);
break;
case EFI_BOOT_SERVICES_DATA:
snprintf(name, sizeof(name), "boot_services_data%d",
type_count[md->type]++);
break;
default:
continue;
}
if (i >= EFI_DEBUGFS_MAX_BLOBS) {
pr_warn("More then %d EFI boot service segments, only showing first %d in debugfs\n",
EFI_DEBUGFS_MAX_BLOBS, EFI_DEBUGFS_MAX_BLOBS);
break;
}
debugfs_blob[i].size = md->num_pages << EFI_PAGE_SHIFT;
debugfs_blob[i].data = memremap(md->phys_addr,
debugfs_blob[i].size,
MEMREMAP_WB);
if (!debugfs_blob[i].data)
continue;
debugfs_create_blob(name, 0400, efi_debugfs, &debugfs_blob[i]);
i++;
}
}
#else
static inline void efi_debugfs_init(void) {}
#endif
/*
* We register the efi subsystem with the firmware subsystem and the
* efivars subsystem with the efi subsystem, if the system was booted with
* EFI.
*/
static int __init efisubsys_init(void)
{
int error;
if (!efi_enabled(EFI_RUNTIME_SERVICES))
efi.runtime_supported_mask = 0;
if (!efi_enabled(EFI_BOOT))
return 0;
if (efi.runtime_supported_mask) {
/*
* Since we process only one efi_runtime_service() at a time, an
* ordered workqueue (which creates only one execution context)
* should suffice for all our needs.
*/
efi_rts_wq = alloc_ordered_workqueue("efi_rts_wq", 0);
if (!efi_rts_wq) {
pr_err("Creating efi_rts_wq failed, EFI runtime services disabled.\n");
clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
efi.runtime_supported_mask = 0;
return 0;
}
efi: Use a work queue to invoke EFI Runtime Services Presently, when a user process requests the kernel to execute any UEFI runtime service, the kernel temporarily switches to a separate set of page tables that describe the virtual mapping of the UEFI runtime services regions in memory. Since UEFI runtime services are typically invoked with interrupts enabled, any code that may be called during this time, will have an incorrect view of the process's address space. Although it is unusual for code running in interrupt context to make assumptions about the process context it runs in, there are cases (such as the perf subsystem taking samples) where this causes problems. So let's set up a work queue for calling UEFI runtime services, so that the actual calls are made when the work queue items are dispatched by a work queue worker running in a separate kernel thread. Such threads are not expected to have userland mappings in the first place, and so the additional mappings created for the UEFI runtime services can never clash with any. The ResetSystem() runtime service is not covered by the work queue handling, since it is not expected to return, and may be called at a time when the kernel is torn down to the point where we cannot expect work queues to still be operational. The non-blocking variants of SetVariable() and QueryVariableInfo() are also excluded: these are intended to be used from atomic context, which obviously rules out waiting for a completion to be signalled by another thread. Note that these variants are currently only used for UEFI runtime services calls that occur very early in the boot, and for ones that occur in critical conditions, e.g., to flush kernel logs to UEFI variables via efi-pstore. Suggested-by: Andy Lutomirski <luto@kernel.org> Signed-off-by: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com> [ardb: exclude ResetSystem() from the workqueue treatment merge from 2 separate patches and rewrite commit log] Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/20180711094040.12506-4-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar <mingo@kernel.org>
2018-07-11 11:40:35 +02:00
}
if (efi_rt_services_supported(EFI_RT_SUPPORTED_TIME_SERVICES))
platform_device_register_simple("rtc-efi", 0, NULL, 0);
/* We register the efi directory at /sys/firmware/efi */
efi_kobj = kobject_create_and_add("efi", firmware_kobj);
if (!efi_kobj) {
pr_err("efi: Firmware registration failed.\n");
return -ENOMEM;
}
if (efi_rt_services_supported(EFI_RT_SUPPORTED_VARIABLE_SERVICES)) {
efivar_ssdt_load();
error = generic_ops_register();
if (error)
goto err_put;
platform_device_register_simple("efivars", 0, NULL, 0);
}
error = sysfs_create_group(efi_kobj, &efi_subsys_attr_group);
if (error) {
pr_err("efi: Sysfs attribute export failed with error %d.\n",
error);
goto err_unregister;
}
error = efi_runtime_map_init(efi_kobj);
if (error)
goto err_remove_group;
/* and the standard mountpoint for efivarfs */
error = sysfs_create_mount_point(efi_kobj, "efivars");
if (error) {
pr_err("efivars: Subsystem registration failed.\n");
goto err_remove_group;
}
if (efi_enabled(EFI_DBG) && efi_enabled(EFI_PRESERVE_BS_REGIONS))
efi_debugfs_init();
return 0;
err_remove_group:
sysfs_remove_group(efi_kobj, &efi_subsys_attr_group);
err_unregister:
if (efi_rt_services_supported(EFI_RT_SUPPORTED_VARIABLE_SERVICES))
generic_ops_unregister();
err_put:
kobject_put(efi_kobj);
return error;
}
subsys_initcall(efisubsys_init);
/*
* Find the efi memory descriptor for a given physical address. Given a
* physical address, determine if it exists within an EFI Memory Map entry,
* and if so, populate the supplied memory descriptor with the appropriate
* data.
*/
efi: Drop type and attribute checks in efi_mem_desc_lookup() The current implementation of efi_mem_desc_lookup() includes the following check on the memory descriptor it returns: if (!(md->attribute & EFI_MEMORY_RUNTIME) && md->type != EFI_BOOT_SERVICES_DATA && md->type != EFI_RUNTIME_SERVICES_DATA) { continue; } This means that only EfiBootServicesData or EfiRuntimeServicesData regions are considered, or any other region type provided that it has the EFI_MEMORY_RUNTIME attribute set. Given what the name of the function implies, and the fact that any physical address can be described in the UEFI memory map only a single time, it does not make sense to impose this condition in the body of the loop, but instead, should be imposed by the caller depending on the value that is returned to it. Two such callers exist at the moment: - The BGRT code when running on x86, via efi_mem_reserve() and efi_arch_mem_reserve(). In this case, the region is already known to be EfiBootServicesData, and so the check is redundant. - The ESRT handling code which introduced this function, which calls it both directly from efi_esrt_init() and again via efi_mem_reserve() and efi_arch_mem_reserve() [on x86]. So let's move this check into the callers instead. This preserves the current behavior both for BGRT and ESRT handling, and allows the lookup routine to be reused by other [upcoming] users that don't have this limitation. In the ESRT case, keep the entire condition, so that platforms that deviate from the UEFI spec and use something other than EfiBootServicesData for the ESRT table will keep working as before. For x86's efi_arch_mem_reserve() implementation, limit the type to EfiBootServicesData, since it is the only type the reservation code expects to operate on in the first place. While we're at it, drop the __init annotation so that drivers can use it as well. Tested-by: Laszlo Ersek <lersek@redhat.com> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Jones <pjones@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/20180711094040.12506-8-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar <mingo@kernel.org>
2018-07-11 11:40:39 +02:00
int efi_mem_desc_lookup(u64 phys_addr, efi_memory_desc_t *out_md)
{
efi_memory_desc_t *md;
if (!efi_enabled(EFI_MEMMAP)) {
pr_err_once("EFI_MEMMAP is not enabled.\n");
return -EINVAL;
}
if (!out_md) {
pr_err_once("out_md is null.\n");
return -EINVAL;
}
for_each_efi_memory_desc(md) {
u64 size;
u64 end;
size = md->num_pages << EFI_PAGE_SHIFT;
end = md->phys_addr + size;
if (phys_addr >= md->phys_addr && phys_addr < end) {
memcpy(out_md, md, sizeof(*out_md));
return 0;
}
}
return -ENOENT;
}
/*
* Calculate the highest address of an efi memory descriptor.
*/
u64 __init efi_mem_desc_end(efi_memory_desc_t *md)
{
u64 size = md->num_pages << EFI_PAGE_SHIFT;
u64 end = md->phys_addr + size;
return end;
}
2016-02-29 21:22:52 +00:00
void __init __weak efi_arch_mem_reserve(phys_addr_t addr, u64 size) {}
/**
* efi_mem_reserve - Reserve an EFI memory region
* @addr: Physical address to reserve
* @size: Size of reservation
*
* Mark a region as reserved from general kernel allocation and
* prevent it being released by efi_free_boot_services().
*
* This function should be called drivers once they've parsed EFI
* configuration tables to figure out where their data lives, e.g.
* efi_esrt_init().
*/
void __init efi_mem_reserve(phys_addr_t addr, u64 size)
{
if (!memblock_is_region_reserved(addr, size))
memblock_reserve(addr, size);
/*
* Some architectures (x86) reserve all boot services ranges
* until efi_free_boot_services() because of buggy firmware
* implementations. This means the above memblock_reserve() is
* superfluous on x86 and instead what it needs to do is
* ensure the @start, @size is not freed.
*/
efi_arch_mem_reserve(addr, size);
}
static const efi_config_table_type_t common_tables[] __initconst = {
{ACPI_20_TABLE_GUID, &efi.acpi20, "ACPI 2.0" },
{ACPI_TABLE_GUID, &efi.acpi, "ACPI" },
{SMBIOS_TABLE_GUID, &efi.smbios, "SMBIOS" },
{SMBIOS3_TABLE_GUID, &efi.smbios3, "SMBIOS 3.0" },
{EFI_SYSTEM_RESOURCE_TABLE_GUID, &efi.esrt, "ESRT" },
{EFI_MEMORY_ATTRIBUTES_TABLE_GUID, &efi_mem_attr_table, "MEMATTR" },
{LINUX_EFI_RANDOM_SEED_TABLE_GUID, &efi_rng_seed, "RNG" },
{LINUX_EFI_TPM_EVENT_LOG_GUID, &efi.tpm_log, "TPMEventLog" },
{LINUX_EFI_TPM_FINAL_LOG_GUID, &efi.tpm_final_log, "TPMFinalLog" },
{LINUX_EFI_MEMRESERVE_TABLE_GUID, &mem_reserve, "MEMRESERVE" },
{EFI_RT_PROPERTIES_TABLE_GUID, &rt_prop, "RTPROP" },
#ifdef CONFIG_EFI_RCI2_TABLE
{DELLEMC_EFI_RCI2_TABLE_GUID, &rci2_table_phys },
#endif
{},
};
static __init int match_config_table(const efi_guid_t *guid,
unsigned long table,
const efi_config_table_type_t *table_types)
{
int i;
for (i = 0; efi_guidcmp(table_types[i].guid, NULL_GUID); i++) {
if (!efi_guidcmp(*guid, table_types[i].guid)) {
*(table_types[i].ptr) = table;
if (table_types[i].name[0])
pr_cont("%s=0x%lx ",
table_types[i].name, table);
return 1;
}
}
return 0;
}
int __init efi_config_parse_tables(const efi_config_table_t *config_tables,
int count,
const efi_config_table_type_t *arch_tables)
{
const efi_config_table_64_t *tbl64 = (void *)config_tables;
const efi_config_table_32_t *tbl32 = (void *)config_tables;
const efi_guid_t *guid;
unsigned long table;
int i;
pr_info("");
for (i = 0; i < count; i++) {
if (!IS_ENABLED(CONFIG_X86)) {
guid = &config_tables[i].guid;
table = (unsigned long)config_tables[i].table;
} else if (efi_enabled(EFI_64BIT)) {
guid = &tbl64[i].guid;
table = tbl64[i].table;
if (IS_ENABLED(CONFIG_X86_32) &&
tbl64[i].table > U32_MAX) {
pr_cont("\n");
pr_err("Table located above 4GB, disabling EFI.\n");
return -EINVAL;
}
} else {
guid = &tbl32[i].guid;
table = tbl32[i].table;
}
if (!match_config_table(guid, table, common_tables) && arch_tables)
match_config_table(guid, table, arch_tables);
}
pr_cont("\n");
set_bit(EFI_CONFIG_TABLES, &efi.flags);
if (efi_rng_seed != EFI_INVALID_TABLE_ADDR) {
struct linux_efi_random_seed *seed;
u32 size = 0;
seed = early_memremap(efi_rng_seed, sizeof(*seed));
if (seed != NULL) {
size = READ_ONCE(seed->size);
early_memunmap(seed, sizeof(*seed));
} else {
pr_err("Could not map UEFI random seed!\n");
}
if (size > 0) {
seed = early_memremap(efi_rng_seed,
sizeof(*seed) + size);
if (seed != NULL) {
pr_notice("seeding entropy pool\n");
add_bootloader_randomness(seed->bits, size);
early_memunmap(seed, sizeof(*seed) + size);
} else {
pr_err("Could not map UEFI random seed!\n");
}
}
}
efi/x86: Ignore the memory attributes table on i386 Commit: 3a6b6c6fb23667fa ("efi: Make EFI_MEMORY_ATTRIBUTES_TABLE initialization common across all architectures") moved the call to efi_memattr_init() from ARM specific to the generic EFI init code, in order to be able to apply the restricted permissions described in that table on x86 as well. We never enabled this feature fully on i386, and so mapping and reserving this table is pointless. However, due to the early call to memblock_reserve(), the memory bookkeeping gets confused to the point where it produces the splat below when we try to map the memory later on: ------------[ cut here ]------------ ioremap on RAM at 0x3f251000 - 0x3fa1afff WARNING: CPU: 0 PID: 0 at arch/x86/mm/ioremap.c:166 __ioremap_caller ... Modules linked in: CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.20.0 #48 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 0.0.0 02/06/2015 EIP: __ioremap_caller.constprop.0+0x249/0x260 Code: 90 0f b7 05 4e 38 40 de 09 45 e0 e9 09 ff ff ff 90 8d 45 ec c6 05 ... EAX: 00000029 EBX: 00000000 ECX: de59c228 EDX: 00000001 ESI: 3f250fff EDI: 00000000 EBP: de3edf20 ESP: de3edee0 DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 00200296 CR0: 80050033 CR2: ffd17000 CR3: 1e58c000 CR4: 00040690 Call Trace: ioremap_cache+0xd/0x10 ? old_map_region+0x72/0x9d old_map_region+0x72/0x9d efi_map_region+0x8/0xa efi_enter_virtual_mode+0x260/0x43b start_kernel+0x329/0x3aa i386_start_kernel+0xa7/0xab startup_32_smp+0x164/0x168 ---[ end trace e15ccf6b9f356833 ]--- Let's work around this by disregarding the memory attributes table altogether on i386, which does not result in a loss of functionality or protection, given that we never consumed the contents. Fixes: 3a6b6c6fb23667fa ("efi: Make EFI_MEMORY_ATTRIBUTES_TABLE ... ") Tested-by: Arvind Sankar <nivedita@alum.mit.edu> Signed-off-by: Ard Biesheuvel <ardb@kernel.org> Signed-off-by: Ingo Molnar <mingo@kernel.org> Link: https://lore.kernel.org/r/20200304165917.5893-1-ardb@kernel.org Link: https://lore.kernel.org/r/20200308080859.21568-21-ardb@kernel.org
2020-03-08 09:08:51 +01:00
if (!IS_ENABLED(CONFIG_X86_32) && efi_enabled(EFI_MEMMAP))
efi_memattr_init();
efi_tpm_eventlog_init();
if (mem_reserve != EFI_INVALID_TABLE_ADDR) {
unsigned long prsv = mem_reserve;
while (prsv) {
struct linux_efi_memreserve *rsv;
u8 *p;
/*
* Just map a full page: that is what we will get
* anyway, and it permits us to map the entire entry
* before knowing its size.
*/
p = early_memremap(ALIGN_DOWN(prsv, PAGE_SIZE),
PAGE_SIZE);
if (p == NULL) {
pr_err("Could not map UEFI memreserve entry!\n");
return -ENOMEM;
}
rsv = (void *)(p + prsv % PAGE_SIZE);
/* reserve the entry itself */
efi: Replace zero-length array and use struct_size() helper The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. Lastly, make use of the sizeof_field() helper instead of an open-coded version. This issue was found with the help of Coccinelle and audited _manually_. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org> Reviewed-by: Kees Cook <keescook@chromium.org> Link: https://lore.kernel.org/r/20200527171425.GA4053@embeddedor Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
2020-05-27 12:14:25 -05:00
memblock_reserve(prsv,
struct_size(rsv, entry, rsv->size));
for (i = 0; i < atomic_read(&rsv->count); i++) {
memblock_reserve(rsv->entry[i].base,
rsv->entry[i].size);
}
prsv = rsv->next;
early_memunmap(p, PAGE_SIZE);
}
}
if (rt_prop != EFI_INVALID_TABLE_ADDR) {
efi_rt_properties_table_t *tbl;
tbl = early_memremap(rt_prop, sizeof(*tbl));
if (tbl) {
efi.runtime_supported_mask &= tbl->runtime_services_supported;
early_memunmap(tbl, sizeof(*tbl));
}
}
return 0;
}
int __init efi_systab_check_header(const efi_table_hdr_t *systab_hdr,
int min_major_version)
{
if (systab_hdr->signature != EFI_SYSTEM_TABLE_SIGNATURE) {
pr_err("System table signature incorrect!\n");
return -EINVAL;
}
if ((systab_hdr->revision >> 16) < min_major_version)
pr_err("Warning: System table version %d.%02d, expected %d.00 or greater!\n",
systab_hdr->revision >> 16,
systab_hdr->revision & 0xffff,
min_major_version);
return 0;
}
#ifndef CONFIG_IA64
static const efi_char16_t *__init map_fw_vendor(unsigned long fw_vendor,
size_t size)
{
const efi_char16_t *ret;
ret = early_memremap_ro(fw_vendor, size);
if (!ret)
pr_err("Could not map the firmware vendor!\n");
return ret;
}
static void __init unmap_fw_vendor(const void *fw_vendor, size_t size)
{
early_memunmap((void *)fw_vendor, size);
}
#else
#define map_fw_vendor(p, s) __va(p)
#define unmap_fw_vendor(v, s)
#endif
void __init efi_systab_report_header(const efi_table_hdr_t *systab_hdr,
unsigned long fw_vendor)
{
char vendor[100] = "unknown";
const efi_char16_t *c16;
size_t i;
c16 = map_fw_vendor(fw_vendor, sizeof(vendor) * sizeof(efi_char16_t));
if (c16) {
for (i = 0; i < sizeof(vendor) - 1 && c16[i]; ++i)
vendor[i] = c16[i];
vendor[i] = '\0';
unmap_fw_vendor(c16, sizeof(vendor) * sizeof(efi_char16_t));
}
pr_info("EFI v%u.%.02u by %s\n",
systab_hdr->revision >> 16,
systab_hdr->revision & 0xffff,
vendor);
}
static __initdata char memory_type_name[][20] = {
"Reserved",
"Loader Code",
"Loader Data",
"Boot Code",
"Boot Data",
"Runtime Code",
"Runtime Data",
"Conventional Memory",
"Unusable Memory",
"ACPI Reclaim Memory",
"ACPI Memory NVS",
"Memory Mapped I/O",
"MMIO Port Space",
"PAL Code",
"Persistent Memory",
};
char * __init efi_md_typeattr_format(char *buf, size_t size,
const efi_memory_desc_t *md)
{
char *pos;
int type_len;
u64 attr;
pos = buf;
if (md->type >= ARRAY_SIZE(memory_type_name))
type_len = snprintf(pos, size, "[type=%u", md->type);
else
type_len = snprintf(pos, size, "[%-*s",
(int)(sizeof(memory_type_name[0]) - 1),
memory_type_name[md->type]);
if (type_len >= size)
return buf;
pos += type_len;
size -= type_len;
attr = md->attribute;
if (attr & ~(EFI_MEMORY_UC | EFI_MEMORY_WC | EFI_MEMORY_WT |
EFI_MEMORY_WB | EFI_MEMORY_UCE | EFI_MEMORY_RO |
EFI_MEMORY_WP | EFI_MEMORY_RP | EFI_MEMORY_XP |
efi: Enumerate EFI_MEMORY_SP UEFI 2.8 defines an EFI_MEMORY_SP attribute bit to augment the interpretation of the EFI Memory Types as "reserved for a specific purpose". The intent of this bit is to allow the OS to identify precious or scarce memory resources and optionally manage it separately from EfiConventionalMemory. As defined older OSes that do not know about this attribute are permitted to ignore it and the memory will be handled according to the OS default policy for the given memory type. In other words, this "specific purpose" hint is deliberately weaker than EfiReservedMemoryType in that the system continues to operate if the OS takes no action on the attribute. The risk of taking no action is potentially unwanted / unmovable kernel allocations from the designated resource that prevent the full realization of the "specific purpose". For example, consider a system with a high-bandwidth memory pool. Older kernels are permitted to boot and consume that memory as conventional "System-RAM" newer kernels may arrange for that memory to be set aside (soft reserved) by the system administrator for a dedicated high-bandwidth memory aware application to consume. Specifically, this mechanism allows for the elimination of scenarios where platform firmware tries to game OS policy by lying about ACPI SLIT values, i.e. claiming that a precious memory resource has a high distance to trigger the OS to avoid it by default. This reservation hint allows platform-firmware to instead tell the truth about performance characteristics by indicate to OS memory management to put immovable allocations elsewhere. Implement simple detection of the bit for EFI memory table dumps and save the kernel policy for a follow-on change. Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Reviewed-by: Dave Hansen <dave.hansen@linux.intel.com> Signed-off-by: Dan Williams <dan.j.williams@intel.com> Acked-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2019-11-06 17:43:00 -08:00
EFI_MEMORY_NV | EFI_MEMORY_SP |
EFI_MEMORY_RUNTIME | EFI_MEMORY_MORE_RELIABLE))
snprintf(pos, size, "|attr=0x%016llx]",
(unsigned long long)attr);
else
snprintf(pos, size,
efi: Enumerate EFI_MEMORY_SP UEFI 2.8 defines an EFI_MEMORY_SP attribute bit to augment the interpretation of the EFI Memory Types as "reserved for a specific purpose". The intent of this bit is to allow the OS to identify precious or scarce memory resources and optionally manage it separately from EfiConventionalMemory. As defined older OSes that do not know about this attribute are permitted to ignore it and the memory will be handled according to the OS default policy for the given memory type. In other words, this "specific purpose" hint is deliberately weaker than EfiReservedMemoryType in that the system continues to operate if the OS takes no action on the attribute. The risk of taking no action is potentially unwanted / unmovable kernel allocations from the designated resource that prevent the full realization of the "specific purpose". For example, consider a system with a high-bandwidth memory pool. Older kernels are permitted to boot and consume that memory as conventional "System-RAM" newer kernels may arrange for that memory to be set aside (soft reserved) by the system administrator for a dedicated high-bandwidth memory aware application to consume. Specifically, this mechanism allows for the elimination of scenarios where platform firmware tries to game OS policy by lying about ACPI SLIT values, i.e. claiming that a precious memory resource has a high distance to trigger the OS to avoid it by default. This reservation hint allows platform-firmware to instead tell the truth about performance characteristics by indicate to OS memory management to put immovable allocations elsewhere. Implement simple detection of the bit for EFI memory table dumps and save the kernel policy for a follow-on change. Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Reviewed-by: Dave Hansen <dave.hansen@linux.intel.com> Signed-off-by: Dan Williams <dan.j.williams@intel.com> Acked-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2019-11-06 17:43:00 -08:00
"|%3s|%2s|%2s|%2s|%2s|%2s|%2s|%2s|%3s|%2s|%2s|%2s|%2s]",
attr & EFI_MEMORY_RUNTIME ? "RUN" : "",
attr & EFI_MEMORY_MORE_RELIABLE ? "MR" : "",
efi: Enumerate EFI_MEMORY_SP UEFI 2.8 defines an EFI_MEMORY_SP attribute bit to augment the interpretation of the EFI Memory Types as "reserved for a specific purpose". The intent of this bit is to allow the OS to identify precious or scarce memory resources and optionally manage it separately from EfiConventionalMemory. As defined older OSes that do not know about this attribute are permitted to ignore it and the memory will be handled according to the OS default policy for the given memory type. In other words, this "specific purpose" hint is deliberately weaker than EfiReservedMemoryType in that the system continues to operate if the OS takes no action on the attribute. The risk of taking no action is potentially unwanted / unmovable kernel allocations from the designated resource that prevent the full realization of the "specific purpose". For example, consider a system with a high-bandwidth memory pool. Older kernels are permitted to boot and consume that memory as conventional "System-RAM" newer kernels may arrange for that memory to be set aside (soft reserved) by the system administrator for a dedicated high-bandwidth memory aware application to consume. Specifically, this mechanism allows for the elimination of scenarios where platform firmware tries to game OS policy by lying about ACPI SLIT values, i.e. claiming that a precious memory resource has a high distance to trigger the OS to avoid it by default. This reservation hint allows platform-firmware to instead tell the truth about performance characteristics by indicate to OS memory management to put immovable allocations elsewhere. Implement simple detection of the bit for EFI memory table dumps and save the kernel policy for a follow-on change. Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Reviewed-by: Dave Hansen <dave.hansen@linux.intel.com> Signed-off-by: Dan Williams <dan.j.williams@intel.com> Acked-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2019-11-06 17:43:00 -08:00
attr & EFI_MEMORY_SP ? "SP" : "",
attr & EFI_MEMORY_NV ? "NV" : "",
attr & EFI_MEMORY_XP ? "XP" : "",
attr & EFI_MEMORY_RP ? "RP" : "",
attr & EFI_MEMORY_WP ? "WP" : "",
attr & EFI_MEMORY_RO ? "RO" : "",
attr & EFI_MEMORY_UCE ? "UCE" : "",
attr & EFI_MEMORY_WB ? "WB" : "",
attr & EFI_MEMORY_WT ? "WT" : "",
attr & EFI_MEMORY_WC ? "WC" : "",
attr & EFI_MEMORY_UC ? "UC" : "");
return buf;
}
/*
* IA64 has a funky EFI memory map that doesn't work the same way as
* other architectures.
*/
#ifndef CONFIG_IA64
/*
* efi_mem_attributes - lookup memmap attributes for physical address
* @phys_addr: the physical address to lookup
*
* Search in the EFI memory map for the region covering
* @phys_addr. Returns the EFI memory attributes if the region
* was found in the memory map, 0 otherwise.
*/
u64 efi_mem_attributes(unsigned long phys_addr)
{
efi_memory_desc_t *md;
if (!efi_enabled(EFI_MEMMAP))
return 0;
for_each_efi_memory_desc(md) {
if ((md->phys_addr <= phys_addr) &&
(phys_addr < (md->phys_addr +
(md->num_pages << EFI_PAGE_SHIFT))))
return md->attribute;
}
return 0;
}
/*
* efi_mem_type - lookup memmap type for physical address
* @phys_addr: the physical address to lookup
*
* Search in the EFI memory map for the region covering @phys_addr.
* Returns the EFI memory type if the region was found in the memory
* map, -EINVAL otherwise.
*/
int efi_mem_type(unsigned long phys_addr)
{
const efi_memory_desc_t *md;
if (!efi_enabled(EFI_MEMMAP))
return -ENOTSUPP;
for_each_efi_memory_desc(md) {
if ((md->phys_addr <= phys_addr) &&
(phys_addr < (md->phys_addr +
(md->num_pages << EFI_PAGE_SHIFT))))
return md->type;
}
return -EINVAL;
}
#endif
int efi_status_to_err(efi_status_t status)
{
int err;
switch (status) {
case EFI_SUCCESS:
err = 0;
break;
case EFI_INVALID_PARAMETER:
err = -EINVAL;
break;
case EFI_OUT_OF_RESOURCES:
err = -ENOSPC;
break;
case EFI_DEVICE_ERROR:
err = -EIO;
break;
case EFI_WRITE_PROTECTED:
err = -EROFS;
break;
case EFI_SECURITY_VIOLATION:
err = -EACCES;
break;
case EFI_NOT_FOUND:
err = -ENOENT;
break;
case EFI_ABORTED:
err = -EINTR;
break;
default:
err = -EINVAL;
}
return err;
}
static DEFINE_SPINLOCK(efi_mem_reserve_persistent_lock);
static struct linux_efi_memreserve *efi_memreserve_root __ro_after_init;
static int __init efi_memreserve_map_root(void)
{
if (mem_reserve == EFI_INVALID_TABLE_ADDR)
return -ENODEV;
efi_memreserve_root = memremap(mem_reserve,
sizeof(*efi_memreserve_root),
MEMREMAP_WB);
if (WARN_ON_ONCE(!efi_memreserve_root))
return -ENOMEM;
return 0;
}
static int efi_mem_reserve_iomem(phys_addr_t addr, u64 size)
{
struct resource *res, *parent;
res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
if (!res)
return -ENOMEM;
res->name = "reserved";
res->flags = IORESOURCE_MEM;
res->start = addr;
res->end = addr + size - 1;
/* we expect a conflict with a 'System RAM' region */
parent = request_resource_conflict(&iomem_resource, res);
return parent ? request_resource(parent, res) : 0;
}
int __ref efi_mem_reserve_persistent(phys_addr_t addr, u64 size)
{
struct linux_efi_memreserve *rsv;
efi: Reduce the amount of memblock reservations for persistent allocations The current implementation of efi_mem_reserve_persistent() is rather naive, in the sense that for each invocation, it creates a separate linked list entry to describe the reservation. Since the linked list entries themselves need to persist across subsequent kexec reboots, every reservation created this way results in two memblock_reserve() calls at the next boot. On arm64 systems with 100s of CPUs, this may result in a excessive number of memblock reservations, and needless fragmentation. So instead, make use of the newly updated struct linux_efi_memreserve layout to put multiple reservations into a single linked list entry. This should get rid of the numerous tiny memblock reservations, and effectively cut the total number of reservations in half on arm64 systems with many CPUs. [ mingo: build warning fix. ] Tested-by: Marc Zyngier <marc.zyngier@arm.com> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Cc: Andy Lutomirski <luto@kernel.org> Cc: Arend van Spriel <arend.vanspriel@broadcom.com> Cc: Bhupesh Sharma <bhsharma@redhat.com> Cc: Borislav Petkov <bp@alien8.de> Cc: Dave Hansen <dave.hansen@intel.com> Cc: Eric Snowberg <eric.snowberg@oracle.com> Cc: Hans de Goede <hdegoede@redhat.com> Cc: Joe Perches <joe@perches.com> Cc: Jon Hunter <jonathanh@nvidia.com> Cc: Julien Thierry <julien.thierry@arm.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Matt Fleming <matt@codeblueprint.co.uk> Cc: Nathan Chancellor <natechancellor@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com> Cc: Sedat Dilek <sedat.dilek@gmail.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: YiFei Zhu <zhuyifei1999@gmail.com> Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/20181129171230.18699-11-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar <mingo@kernel.org>
2018-11-29 18:12:29 +01:00
unsigned long prsv;
int rc, index;
if (efi_memreserve_root == (void *)ULONG_MAX)
return -ENODEV;
if (!efi_memreserve_root) {
rc = efi_memreserve_map_root();
if (rc)
return rc;
}
efi: Reduce the amount of memblock reservations for persistent allocations The current implementation of efi_mem_reserve_persistent() is rather naive, in the sense that for each invocation, it creates a separate linked list entry to describe the reservation. Since the linked list entries themselves need to persist across subsequent kexec reboots, every reservation created this way results in two memblock_reserve() calls at the next boot. On arm64 systems with 100s of CPUs, this may result in a excessive number of memblock reservations, and needless fragmentation. So instead, make use of the newly updated struct linux_efi_memreserve layout to put multiple reservations into a single linked list entry. This should get rid of the numerous tiny memblock reservations, and effectively cut the total number of reservations in half on arm64 systems with many CPUs. [ mingo: build warning fix. ] Tested-by: Marc Zyngier <marc.zyngier@arm.com> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Cc: Andy Lutomirski <luto@kernel.org> Cc: Arend van Spriel <arend.vanspriel@broadcom.com> Cc: Bhupesh Sharma <bhsharma@redhat.com> Cc: Borislav Petkov <bp@alien8.de> Cc: Dave Hansen <dave.hansen@intel.com> Cc: Eric Snowberg <eric.snowberg@oracle.com> Cc: Hans de Goede <hdegoede@redhat.com> Cc: Joe Perches <joe@perches.com> Cc: Jon Hunter <jonathanh@nvidia.com> Cc: Julien Thierry <julien.thierry@arm.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Matt Fleming <matt@codeblueprint.co.uk> Cc: Nathan Chancellor <natechancellor@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com> Cc: Sedat Dilek <sedat.dilek@gmail.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: YiFei Zhu <zhuyifei1999@gmail.com> Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/20181129171230.18699-11-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar <mingo@kernel.org>
2018-11-29 18:12:29 +01:00
/* first try to find a slot in an existing linked list entry */
for (prsv = efi_memreserve_root->next; prsv; prsv = rsv->next) {
rsv = memremap(prsv, sizeof(*rsv), MEMREMAP_WB);
efi: Reduce the amount of memblock reservations for persistent allocations The current implementation of efi_mem_reserve_persistent() is rather naive, in the sense that for each invocation, it creates a separate linked list entry to describe the reservation. Since the linked list entries themselves need to persist across subsequent kexec reboots, every reservation created this way results in two memblock_reserve() calls at the next boot. On arm64 systems with 100s of CPUs, this may result in a excessive number of memblock reservations, and needless fragmentation. So instead, make use of the newly updated struct linux_efi_memreserve layout to put multiple reservations into a single linked list entry. This should get rid of the numerous tiny memblock reservations, and effectively cut the total number of reservations in half on arm64 systems with many CPUs. [ mingo: build warning fix. ] Tested-by: Marc Zyngier <marc.zyngier@arm.com> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Cc: Andy Lutomirski <luto@kernel.org> Cc: Arend van Spriel <arend.vanspriel@broadcom.com> Cc: Bhupesh Sharma <bhsharma@redhat.com> Cc: Borislav Petkov <bp@alien8.de> Cc: Dave Hansen <dave.hansen@intel.com> Cc: Eric Snowberg <eric.snowberg@oracle.com> Cc: Hans de Goede <hdegoede@redhat.com> Cc: Joe Perches <joe@perches.com> Cc: Jon Hunter <jonathanh@nvidia.com> Cc: Julien Thierry <julien.thierry@arm.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Matt Fleming <matt@codeblueprint.co.uk> Cc: Nathan Chancellor <natechancellor@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com> Cc: Sedat Dilek <sedat.dilek@gmail.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: YiFei Zhu <zhuyifei1999@gmail.com> Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/20181129171230.18699-11-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar <mingo@kernel.org>
2018-11-29 18:12:29 +01:00
index = atomic_fetch_add_unless(&rsv->count, 1, rsv->size);
if (index < rsv->size) {
rsv->entry[index].base = addr;
rsv->entry[index].size = size;
memunmap(rsv);
return efi_mem_reserve_iomem(addr, size);
efi: Reduce the amount of memblock reservations for persistent allocations The current implementation of efi_mem_reserve_persistent() is rather naive, in the sense that for each invocation, it creates a separate linked list entry to describe the reservation. Since the linked list entries themselves need to persist across subsequent kexec reboots, every reservation created this way results in two memblock_reserve() calls at the next boot. On arm64 systems with 100s of CPUs, this may result in a excessive number of memblock reservations, and needless fragmentation. So instead, make use of the newly updated struct linux_efi_memreserve layout to put multiple reservations into a single linked list entry. This should get rid of the numerous tiny memblock reservations, and effectively cut the total number of reservations in half on arm64 systems with many CPUs. [ mingo: build warning fix. ] Tested-by: Marc Zyngier <marc.zyngier@arm.com> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Cc: Andy Lutomirski <luto@kernel.org> Cc: Arend van Spriel <arend.vanspriel@broadcom.com> Cc: Bhupesh Sharma <bhsharma@redhat.com> Cc: Borislav Petkov <bp@alien8.de> Cc: Dave Hansen <dave.hansen@intel.com> Cc: Eric Snowberg <eric.snowberg@oracle.com> Cc: Hans de Goede <hdegoede@redhat.com> Cc: Joe Perches <joe@perches.com> Cc: Jon Hunter <jonathanh@nvidia.com> Cc: Julien Thierry <julien.thierry@arm.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Matt Fleming <matt@codeblueprint.co.uk> Cc: Nathan Chancellor <natechancellor@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com> Cc: Sedat Dilek <sedat.dilek@gmail.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: YiFei Zhu <zhuyifei1999@gmail.com> Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/20181129171230.18699-11-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar <mingo@kernel.org>
2018-11-29 18:12:29 +01:00
}
memunmap(rsv);
efi: Reduce the amount of memblock reservations for persistent allocations The current implementation of efi_mem_reserve_persistent() is rather naive, in the sense that for each invocation, it creates a separate linked list entry to describe the reservation. Since the linked list entries themselves need to persist across subsequent kexec reboots, every reservation created this way results in two memblock_reserve() calls at the next boot. On arm64 systems with 100s of CPUs, this may result in a excessive number of memblock reservations, and needless fragmentation. So instead, make use of the newly updated struct linux_efi_memreserve layout to put multiple reservations into a single linked list entry. This should get rid of the numerous tiny memblock reservations, and effectively cut the total number of reservations in half on arm64 systems with many CPUs. [ mingo: build warning fix. ] Tested-by: Marc Zyngier <marc.zyngier@arm.com> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Cc: Andy Lutomirski <luto@kernel.org> Cc: Arend van Spriel <arend.vanspriel@broadcom.com> Cc: Bhupesh Sharma <bhsharma@redhat.com> Cc: Borislav Petkov <bp@alien8.de> Cc: Dave Hansen <dave.hansen@intel.com> Cc: Eric Snowberg <eric.snowberg@oracle.com> Cc: Hans de Goede <hdegoede@redhat.com> Cc: Joe Perches <joe@perches.com> Cc: Jon Hunter <jonathanh@nvidia.com> Cc: Julien Thierry <julien.thierry@arm.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Matt Fleming <matt@codeblueprint.co.uk> Cc: Nathan Chancellor <natechancellor@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com> Cc: Sedat Dilek <sedat.dilek@gmail.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: YiFei Zhu <zhuyifei1999@gmail.com> Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/20181129171230.18699-11-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar <mingo@kernel.org>
2018-11-29 18:12:29 +01:00
}
/* no slot found - allocate a new linked list entry */
rsv = (struct linux_efi_memreserve *)__get_free_page(GFP_ATOMIC);
if (!rsv)
return -ENOMEM;
rc = efi_mem_reserve_iomem(__pa(rsv), SZ_4K);
if (rc) {
free_page((unsigned long)rsv);
return rc;
}
/*
* The memremap() call above assumes that a linux_efi_memreserve entry
* never crosses a page boundary, so let's ensure that this remains true
* even when kexec'ing a 4k pages kernel from a >4k pages kernel, by
* using SZ_4K explicitly in the size calculation below.
*/
rsv->size = EFI_MEMRESERVE_COUNT(SZ_4K);
atomic_set(&rsv->count, 1);
rsv->entry[0].base = addr;
rsv->entry[0].size = size;
spin_lock(&efi_mem_reserve_persistent_lock);
rsv->next = efi_memreserve_root->next;
efi_memreserve_root->next = __pa(rsv);
spin_unlock(&efi_mem_reserve_persistent_lock);
return efi_mem_reserve_iomem(addr, size);
}
static int __init efi_memreserve_root_init(void)
{
if (efi_memreserve_root)
return 0;
if (efi_memreserve_map_root())
efi_memreserve_root = (void *)ULONG_MAX;
return 0;
}
early_initcall(efi_memreserve_root_init);
#ifdef CONFIG_KEXEC
static int update_efi_random_seed(struct notifier_block *nb,
unsigned long code, void *unused)
{
struct linux_efi_random_seed *seed;
u32 size = 0;
if (!kexec_in_progress)
return NOTIFY_DONE;
seed = memremap(efi_rng_seed, sizeof(*seed), MEMREMAP_WB);
if (seed != NULL) {
size = min(seed->size, EFI_RANDOM_SEED_SIZE);
memunmap(seed);
} else {
pr_err("Could not map UEFI random seed!\n");
}
if (size > 0) {
seed = memremap(efi_rng_seed, sizeof(*seed) + size,
MEMREMAP_WB);
if (seed != NULL) {
seed->size = size;
get_random_bytes(seed->bits, seed->size);
memunmap(seed);
} else {
pr_err("Could not map UEFI random seed!\n");
}
}
return NOTIFY_DONE;
}
static struct notifier_block efi_random_seed_nb = {
.notifier_call = update_efi_random_seed,
};
static int __init register_update_efi_random_seed(void)
{
if (efi_rng_seed == EFI_INVALID_TABLE_ADDR)
return 0;
return register_reboot_notifier(&efi_random_seed_nb);
}
late_initcall(register_update_efi_random_seed);
#endif