2019-05-19 12:08:55 +00:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-only
|
2005-04-16 22:20:36 +00:00
|
|
|
/*
|
|
|
|
* linux/kernel/resource.c
|
|
|
|
*
|
|
|
|
* Copyright (C) 1999 Linus Torvalds
|
|
|
|
* Copyright (C) 1999 Martin Mares <mj@ucw.cz>
|
|
|
|
*
|
|
|
|
* Arbitrary resource management.
|
|
|
|
*/
|
|
|
|
|
2012-07-30 21:42:58 +00:00
|
|
|
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
|
|
|
|
2011-05-23 18:51:41 +00:00
|
|
|
#include <linux/export.h>
|
2005-04-16 22:20:36 +00:00
|
|
|
#include <linux/errno.h>
|
|
|
|
#include <linux/ioport.h>
|
|
|
|
#include <linux/init.h>
|
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/spinlock.h>
|
|
|
|
#include <linux/fs.h>
|
|
|
|
#include <linux/proc_fs.h>
|
2020-11-27 16:41:24 +00:00
|
|
|
#include <linux/pseudo_fs.h>
|
2010-03-29 17:38:00 +00:00
|
|
|
#include <linux/sched.h>
|
2005-04-16 22:20:36 +00:00
|
|
|
#include <linux/seq_file.h>
|
devres: device resource management
Implement device resource management, in short, devres. A device
driver can allocate arbitrary size of devres data which is associated
with a release function. On driver detach, release function is
invoked on the devres data, then, devres data is freed.
devreses are typed by associated release functions. Some devreses are
better represented by single instance of the type while others need
multiple instances sharing the same release function. Both usages are
supported.
devreses can be grouped using devres group such that a device driver
can easily release acquired resources halfway through initialization
or selectively release resources (e.g. resources for port 1 out of 4
ports).
This patch adds devres core including documentation and the following
managed interfaces.
* alloc/free : devm_kzalloc(), devm_kzfree()
* IO region : devm_request_region(), devm_release_region()
* IRQ : devm_request_irq(), devm_free_irq()
* DMA : dmam_alloc_coherent(), dmam_free_coherent(),
dmam_declare_coherent_memory(), dmam_pool_create(),
dmam_pool_destroy()
* PCI : pcim_enable_device(), pcim_pin_device(), pci_is_managed()
* iomap : devm_ioport_map(), devm_ioport_unmap(), devm_ioremap(),
devm_ioremap_nocache(), devm_iounmap(), pcim_iomap_table(),
pcim_iomap(), pcim_iounmap()
Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
2007-01-20 07:00:26 +00:00
|
|
|
#include <linux/device.h>
|
2008-10-28 18:45:42 +00:00
|
|
|
#include <linux/pfn.h>
|
mem hotunplug: fix kfree() of bootmem memory
When hot removing memory presented at boot time, following messages are shown:
kernel BUG at mm/slub.c:3409!
invalid opcode: 0000 [#1] SMP
Modules linked in: ebtable_nat ebtables xt_CHECKSUM iptable_mangle bridge stp llc ipmi_devintf ipmi_msghandler sunrpc ipt_REJECT nf_conntrack_ipv4 nf_defrag_ipv4 iptable_filter ip_tables ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 xt_state nf_conntrack ip6table_filter ip6_tables binfmt_misc vfat fat dm_mirror dm_region_hash dm_log dm_mod vhost_net macvtap macvlan tun uinput iTCO_wdt iTCO_vendor_support coretemp kvm_intel kvm crc32c_intel ghash_clmulni_intel microcode pcspkr sg i2c_i801 lpc_ich mfd_core igb i2c_algo_bit i2c_core e1000e ptp pps_core tpm_infineon ioatdma dca sr_mod cdrom sd_mod crc_t10dif usb_storage megaraid_sas lpfc scsi_transport_fc scsi_tgt scsi_mod
CPU 0
Pid: 5091, comm: kworker/0:2 Tainted: G W 3.9.0-rc6+ #15
RIP: kfree+0x232/0x240
Process kworker/0:2 (pid: 5091, threadinfo ffff88084678c000, task ffff88083928ca80)
Call Trace:
__release_region+0xd4/0xe0
__remove_pages+0x52/0x110
arch_remove_memory+0x89/0xd0
remove_memory+0xc4/0x100
acpi_memory_device_remove+0x6d/0xb1
acpi_device_remove+0x89/0xab
__device_release_driver+0x7c/0xf0
device_release_driver+0x2f/0x50
acpi_bus_device_detach+0x6c/0x70
acpi_ns_walk_namespace+0x11a/0x250
acpi_walk_namespace+0xee/0x137
acpi_bus_trim+0x33/0x7a
acpi_bus_hot_remove_device+0xc4/0x1a1
acpi_os_execute_deferred+0x27/0x34
process_one_work+0x1f7/0x590
worker_thread+0x11a/0x370
kthread+0xee/0x100
ret_from_fork+0x7c/0xb0
RIP [<ffffffff811c41d2>] kfree+0x232/0x240
RSP <ffff88084678d968>
The reason why the messages are shown is to release a resource
structure, allocated by bootmem, by kfree(). So when we release a
resource structure, we should check whether it is allocated by bootmem
or not.
But even if we know a resource structure is allocated by bootmem, we
cannot release it since SLxB cannot treat it. So for reusing a resource
structure, this patch remembers it by using bootmem_resource as follows:
When releasing a resource structure by free_resource(), free_resource()
checks whether the resource structure is allocated by bootmem or not.
If it is allocated by bootmem, free_resource() adds it to
bootmem_resource. If it is not allocated by bootmem, free_resource()
release it by kfree().
And when getting a new resource structure by get_resource(),
get_resource() checks whether bootmem_resource has released resource
structures or not. If there is a released resource structure,
get_resource() returns it. If there is not a released resource
structure, get_resource() returns new resource structure allocated by
kzalloc().
[akpm@linux-foundation.org: s/get_resource/alloc_resource/]
Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Reviewed-by: Toshi Kani <toshi.kani@hp.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Ram Pai <linuxram@us.ibm.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-04-29 22:08:56 +00:00
|
|
|
#include <linux/mm.h>
|
2020-11-27 16:41:24 +00:00
|
|
|
#include <linux/mount.h>
|
2015-02-05 05:44:43 +00:00
|
|
|
#include <linux/resource_ext.h>
|
2020-11-27 16:41:24 +00:00
|
|
|
#include <uapi/linux/magic.h>
|
2023-11-15 13:00:27 +00:00
|
|
|
#include <linux/string.h>
|
|
|
|
#include <linux/vmalloc.h>
|
2005-04-16 22:20:36 +00:00
|
|
|
#include <asm/io.h>
|
|
|
|
|
|
|
|
|
|
|
|
struct resource ioport_resource = {
|
|
|
|
.name = "PCI IO",
|
2006-06-13 00:11:31 +00:00
|
|
|
.start = 0,
|
2005-04-16 22:20:36 +00:00
|
|
|
.end = IO_SPACE_LIMIT,
|
|
|
|
.flags = IORESOURCE_IO,
|
|
|
|
};
|
|
|
|
EXPORT_SYMBOL(ioport_resource);
|
|
|
|
|
|
|
|
struct resource iomem_resource = {
|
|
|
|
.name = "PCI mem",
|
2006-06-13 00:11:31 +00:00
|
|
|
.start = 0,
|
|
|
|
.end = -1,
|
2005-04-16 22:20:36 +00:00
|
|
|
.flags = IORESOURCE_MEM,
|
|
|
|
};
|
|
|
|
EXPORT_SYMBOL(iomem_resource);
|
|
|
|
|
|
|
|
/*
 * Reader/writer lock for the resource tree handled in this file: the
 * /proc iterator holds it for reading between r_start() and r_stop(),
 * and release_child_resources() takes it for writing.
 */
static DEFINE_RWLOCK(resource_lock);
|
|
|
|
|
resource: avoid unnecessary resource tree walking in __region_intersects()
Currently, if __region_intersects() finds any overlapped but unmatched
resource, it walks the descendant resource tree to check for overlapped
and matched descendant resources using for_each_resource(). However, in
current kernel, for_each_resource() iterates not only the descendant tree,
but also subsequent sibling trees in certain scenarios. While this
doesn't introduce bugs, it makes code hard to be understood and
potentially inefficient.
So, the patch revises next_resource() and for_each_resource() and makes
for_each_resource() traverse the subtree under the specified subtree root
only. Test shows that this avoids unnecessary resource tree walking in
__region_intersects().
For the example resource tree as follows,
X
|
A----D----E
|
B--C
if 'A' is the overlapped but unmatched resource, original kernel
iterates 'B', 'C', 'D', 'E' when it walks the descendant tree. While
the patched kernel iterates only 'B', 'C'.
Thanks David Hildenbrand for providing a good resource tree example.
Link: https://lkml.kernel.org/r/20241029122735.79164-1-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Acked-by: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Jonathan Cameron <jonathan.cameron@huawei.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Alison Schofield <alison.schofield@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-29 12:27:35 +00:00
|
|
|
/*
|
|
|
|
* Return the next node of @p in pre-order tree traversal. If
|
|
|
|
* @skip_children is true, skip the descendant nodes of @p in
|
|
|
|
* traversal. If @p is a descendant of @subtree_root, only traverse
|
|
|
|
* the subtree under @subtree_root.
|
|
|
|
*/
|
|
|
|
static struct resource *next_resource(struct resource *p, bool skip_children,
|
|
|
|
struct resource *subtree_root)
|
2005-04-16 22:20:36 +00:00
|
|
|
{
|
2023-09-12 16:53:11 +00:00
|
|
|
if (!skip_children && p->child)
|
2005-04-16 22:20:36 +00:00
|
|
|
return p->child;
|
resource: avoid unnecessary resource tree walking in __region_intersects()
Currently, if __region_intersects() finds any overlapped but unmatched
resource, it walks the descendant resource tree to check for overlapped
and matched descendant resources using for_each_resource(). However, in
current kernel, for_each_resource() iterates not only the descendant tree,
but also subsequent sibling trees in certain scenarios. While this
doesn't introduce bugs, it makes code hard to be understood and
potentially inefficient.
So, the patch revises next_resource() and for_each_resource() and makes
for_each_resource() traverse the subtree under the specified subtree root
only. Test shows that this avoids unnecessary resource tree walking in
__region_intersects().
For the example resource tree as follows,
X
|
A----D----E
|
B--C
if 'A' is the overlapped but unmatched resource, original kernel
iterates 'B', 'C', 'D', 'E' when it walks the descendant tree. While
the patched kernel iterates only 'B', 'C'.
Thanks David Hildenbrand for providing a good resource tree example.
Link: https://lkml.kernel.org/r/20241029122735.79164-1-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Acked-by: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Jonathan Cameron <jonathan.cameron@huawei.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Alison Schofield <alison.schofield@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-29 12:27:35 +00:00
|
|
|
while (!p->sibling && p->parent) {
|
2005-04-16 22:20:36 +00:00
|
|
|
p = p->parent;
|
resource: avoid unnecessary resource tree walking in __region_intersects()
Currently, if __region_intersects() finds any overlapped but unmatched
resource, it walks the descendant resource tree to check for overlapped
and matched descendant resources using for_each_resource(). However, in
current kernel, for_each_resource() iterates not only the descendant tree,
but also subsequent sibling trees in certain scenarios. While this
doesn't introduce bugs, it makes code hard to be understood and
potentially inefficient.
So, the patch revises next_resource() and for_each_resource() and makes
for_each_resource() traverse the subtree under the specified subtree root
only. Test shows that this avoids unnecessary resource tree walking in
__region_intersects().
For the example resource tree as follows,
X
|
A----D----E
|
B--C
if 'A' is the overlapped but unmatched resource, original kernel
iterates 'B', 'C', 'D', 'E' when it walks the descendant tree. While
the patched kernel iterates only 'B', 'C'.
Thanks David Hildenbrand for providing a good resource tree example.
Link: https://lkml.kernel.org/r/20241029122735.79164-1-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Acked-by: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Jonathan Cameron <jonathan.cameron@huawei.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Alison Schofield <alison.schofield@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-29 12:27:35 +00:00
|
|
|
if (p == subtree_root)
|
|
|
|
return NULL;
|
|
|
|
}
|
2005-04-16 22:20:36 +00:00
|
|
|
return p->sibling;
|
|
|
|
}
|
|
|
|
|
resource: avoid unnecessary resource tree walking in __region_intersects()
Currently, if __region_intersects() finds any overlapped but unmatched
resource, it walks the descendant resource tree to check for overlapped
and matched descendant resources using for_each_resource(). However, in
current kernel, for_each_resource() iterates not only the descendant tree,
but also subsequent sibling trees in certain scenarios. While this
doesn't introduce bugs, it makes code hard to be understood and
potentially inefficient.
So, the patch revises next_resource() and for_each_resource() and makes
for_each_resource() traverse the subtree under the specified subtree root
only. Test shows that this avoids unnecessary resource tree walking in
__region_intersects().
For the example resource tree as follows,
X
|
A----D----E
|
B--C
if 'A' is the overlapped but unmatched resource, original kernel
iterates 'B', 'C', 'D', 'E' when it walks the descendant tree. While
the patched kernel iterates only 'B', 'C'.
Thanks David Hildenbrand for providing a good resource tree example.
Link: https://lkml.kernel.org/r/20241029122735.79164-1-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Acked-by: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Jonathan Cameron <jonathan.cameron@huawei.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Alison Schofield <alison.schofield@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-29 12:27:35 +00:00
|
|
|
/*
 * Visit every resource below @_root in pre-order, storing the current
 * node in @_p.  @_root itself is never visited.  @_skip_children is
 * forwarded to next_resource() on every step, so passing true prevents
 * the walk from descending into the node just visited.
 *
 * NOTE: '__first' exists only so that the declaration clause of the
 * loop can assign the caller's '_p' instead of declaring a new variable
 * that would shadow it.  It is tested in the loop condition so that it
 * is not reported as unused.
 */
#define for_each_resource(_root, _p, _skip_children)			\
	for (typeof(_root) __subtree = (_root),				\
			   __first = _p = __subtree->child;		\
	     __first && _p;						\
	     _p = next_resource(_p, _skip_children, __subtree))
|
2021-11-09 02:35:46 +00:00
|
|
|
|
2008-09-26 08:10:12 +00:00
|
|
|
#ifdef CONFIG_PROC_FS
|
|
|
|
|
|
|
|
/* Upper bound on the nesting depth r_show() computes for indentation. */
enum { MAX_IORES_LEVEL = 5 };
|
|
|
|
|
2005-04-16 22:20:36 +00:00
|
|
|
/*
 * seq_file ->start callback: take resource_lock for reading and return
 * the resource at pre-order position *@pos below the root stored in the
 * proc entry, or NULL when fewer than *@pos + 1 resources exist.
 *
 * The lock is still held on return; r_stop() releases it.
 */
static void *r_start(struct seq_file *m, loff_t *pos)
	__acquires(resource_lock)
{
	struct resource *root = pde_data(file_inode(m->file));
	loff_t remaining = *pos;
	struct resource *res;

	read_lock(&resource_lock);

	/* Step through the tree until the requested position is reached. */
	for_each_resource(root, res, false) {
		if (remaining == 0)
			break;
		remaining--;
	}

	/* res is NULL here if the walk ran off the end of the tree. */
	return res;
}
|
|
|
|
|
2023-09-12 16:53:10 +00:00
|
|
|
/*
 * seq_file ->next callback: bump the position and return the pre-order
 * successor of the resource passed in @v, or NULL at the end.
 */
static void *r_next(struct seq_file *m, void *v, loff_t *pos)
{
	struct resource *cur = v;

	++*pos;

	/*
	 * A NULL subtree root means next_resource() never cuts the walk
	 * short at a subtree boundary.
	 */
	return next_resource(cur, false, NULL);
}
|
|
|
|
|
2005-04-16 22:20:36 +00:00
|
|
|
static void r_stop(struct seq_file *m, void *v)
|
|
|
|
__releases(resource_lock)
|
|
|
|
{
|
|
|
|
read_unlock(&resource_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int r_show(struct seq_file *m, void *v)
|
|
|
|
{
|
2022-01-22 06:14:23 +00:00
|
|
|
struct resource *root = pde_data(file_inode(m->file));
|
2005-04-16 22:20:36 +00:00
|
|
|
struct resource *r = v, *p;
|
2016-04-14 19:05:37 +00:00
|
|
|
unsigned long long start, end;
|
2005-04-16 22:20:36 +00:00
|
|
|
int width = root->end < 0x10000 ? 4 : 8;
|
|
|
|
int depth;
|
|
|
|
|
|
|
|
for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent)
|
|
|
|
if (p->parent == root)
|
|
|
|
break;
|
2016-04-14 19:05:37 +00:00
|
|
|
|
|
|
|
if (file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN)) {
|
|
|
|
start = r->start;
|
|
|
|
end = r->end;
|
|
|
|
} else {
|
|
|
|
start = end = 0;
|
|
|
|
}
|
|
|
|
|
2006-06-12 22:18:31 +00:00
|
|
|
seq_printf(m, "%*s%0*llx-%0*llx : %s\n",
|
2005-04-16 22:20:36 +00:00
|
|
|
depth * 2, "",
|
2016-04-14 19:05:37 +00:00
|
|
|
width, start,
|
|
|
|
width, end,
|
2005-04-16 22:20:36 +00:00
|
|
|
r->name ? r->name : "<BAD>");
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2006-12-07 04:40:36 +00:00
|
|
|
static const struct seq_operations resource_op = {
|
2005-04-16 22:20:36 +00:00
|
|
|
.start = r_start,
|
|
|
|
.next = r_next,
|
|
|
|
.stop = r_stop,
|
|
|
|
.show = r_show,
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Create the "ioports" and "iomem" proc entries.  Both use resource_op
 * and differ only in the root resource attached as private data.  The
 * results of the two creation calls are not checked; the function
 * always returns 0.
 */
static int __init ioresources_init(void)
{
	struct resource *io_root = &ioport_resource;
	struct resource *mem_root = &iomem_resource;

	proc_create_seq_data("ioports", 0, NULL, &resource_op, io_root);
	proc_create_seq_data("iomem", 0, NULL, &resource_op, mem_root);

	return 0;
}
__initcall(ioresources_init);
|
|
|
|
|
|
|
|
#endif /* CONFIG_PROC_FS */
|
|
|
|
|
mem hotunplug: fix kfree() of bootmem memory
When hot removing memory presented at boot time, following messages are shown:
kernel BUG at mm/slub.c:3409!
invalid opcode: 0000 [#1] SMP
Modules linked in: ebtable_nat ebtables xt_CHECKSUM iptable_mangle bridge stp llc ipmi_devintf ipmi_msghandler sunrpc ipt_REJECT nf_conntrack_ipv4 nf_defrag_ipv4 iptable_filter ip_tables ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 xt_state nf_conntrack ip6table_filter ip6_tables binfmt_misc vfat fat dm_mirror dm_region_hash dm_log dm_mod vhost_net macvtap macvlan tun uinput iTCO_wdt iTCO_vendor_support coretemp kvm_intel kvm crc32c_intel ghash_clmulni_intel microcode pcspkr sg i2c_i801 lpc_ich mfd_core igb i2c_algo_bit i2c_core e1000e ptp pps_core tpm_infineon ioatdma dca sr_mod cdrom sd_mod crc_t10dif usb_storage megaraid_sas lpfc scsi_transport_fc scsi_tgt scsi_mod
CPU 0
Pid: 5091, comm: kworker/0:2 Tainted: G W 3.9.0-rc6+ #15
RIP: kfree+0x232/0x240
Process kworker/0:2 (pid: 5091, threadinfo ffff88084678c000, task ffff88083928ca80)
Call Trace:
__release_region+0xd4/0xe0
__remove_pages+0x52/0x110
arch_remove_memory+0x89/0xd0
remove_memory+0xc4/0x100
acpi_memory_device_remove+0x6d/0xb1
acpi_device_remove+0x89/0xab
__device_release_driver+0x7c/0xf0
device_release_driver+0x2f/0x50
acpi_bus_device_detach+0x6c/0x70
acpi_ns_walk_namespace+0x11a/0x250
acpi_walk_namespace+0xee/0x137
acpi_bus_trim+0x33/0x7a
acpi_bus_hot_remove_device+0xc4/0x1a1
acpi_os_execute_deferred+0x27/0x34
process_one_work+0x1f7/0x590
worker_thread+0x11a/0x370
kthread+0xee/0x100
ret_from_fork+0x7c/0xb0
RIP [<ffffffff811c41d2>] kfree+0x232/0x240
RSP <ffff88084678d968>
The reason why the messages are shown is to release a resource
structure, allocated by bootmem, by kfree(). So when we release a
resource structure, we should check whether it is allocated by bootmem
or not.
But even if we know a resource structure is allocated by bootmem, we
cannot release it since SLxB cannot treat it. So for reusing a resource
structure, this patch remembers it by using bootmem_resource as follows:
When releasing a resource structure by free_resource(), free_resource()
checks whether the resource structure is allocated by bootmem or not.
If it is allocated by bootmem, free_resource() adds it to
bootmem_resource. If it is not allocated by bootmem, free_resource()
release it by kfree().
And when getting a new resource structure by get_resource(),
get_resource() checks whether bootmem_resource has released resource
structures or not. If there is a released resource structure,
get_resource() returns it. If there is not a released resource
structure, get_resource() returns new resource structure allocated by
kzalloc().
[akpm@linux-foundation.org: s/get_resource/alloc_resource/]
Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Reviewed-by: Toshi Kani <toshi.kani@hp.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Ram Pai <linuxram@us.ibm.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-04-29 22:08:56 +00:00
|
|
|
/*
 * Free a resource descriptor.  NULL is accepted and ignored.
 *
 * Only descriptors that live in a slab page are handed to kfree().  A
 * descriptor allocated from memblock early during boot is deliberately
 * leaked: only whole pages can be returned to the buddy allocator, and
 * recycling such descriptors through alloc_resource() would
 * overcomplicate resource handling.
 */
static void free_resource(struct resource *res)
{
	if (!res)
		return;

	/* Not slab-backed: leave it alone (see above). */
	if (!PageSlab(virt_to_head_page(res)))
		return;

	kfree(res);
}
|
|
|
|
|
|
|
|
/*
 * Allocate a zero-filled resource descriptor with kzalloc() using the
 * allocation flags in @flags.  The kzalloc() result is returned as is.
 */
static struct resource *alloc_resource(gfp_t flags)
{
	struct resource *res = kzalloc(sizeof(*res), flags);

	return res;
}
|
|
|
|
|
2005-04-16 22:20:36 +00:00
|
|
|
/* Return the conflict entry if you can't request it */
|
|
|
|
static struct resource * __request_resource(struct resource *root, struct resource *new)
|
|
|
|
{
|
2006-06-12 23:09:23 +00:00
|
|
|
resource_size_t start = new->start;
|
|
|
|
resource_size_t end = new->end;
|
2005-04-16 22:20:36 +00:00
|
|
|
struct resource *tmp, **p;
|
|
|
|
|
|
|
|
if (end < start)
|
|
|
|
return root;
|
|
|
|
if (start < root->start)
|
|
|
|
return root;
|
|
|
|
if (end > root->end)
|
|
|
|
return root;
|
|
|
|
p = &root->child;
|
|
|
|
for (;;) {
|
|
|
|
tmp = *p;
|
|
|
|
if (!tmp || tmp->start > end) {
|
|
|
|
new->sibling = tmp;
|
|
|
|
*p = new;
|
|
|
|
new->parent = root;
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
p = &tmp->sibling;
|
|
|
|
if (tmp->end < start)
|
|
|
|
continue;
|
|
|
|
return tmp;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-03-09 19:47:04 +00:00
|
|
|
static int __release_resource(struct resource *old, bool release_child)
|
2005-04-16 22:20:36 +00:00
|
|
|
{
|
2016-03-09 19:47:04 +00:00
|
|
|
struct resource *tmp, **p, *chd;
|
2005-04-16 22:20:36 +00:00
|
|
|
|
|
|
|
p = &old->parent->child;
|
|
|
|
for (;;) {
|
|
|
|
tmp = *p;
|
|
|
|
if (!tmp)
|
|
|
|
break;
|
|
|
|
if (tmp == old) {
|
2016-03-09 19:47:04 +00:00
|
|
|
if (release_child || !(tmp->child)) {
|
|
|
|
*p = tmp->sibling;
|
|
|
|
} else {
|
|
|
|
for (chd = tmp->child;; chd = chd->sibling) {
|
|
|
|
chd->parent = tmp->parent;
|
|
|
|
if (!(chd->sibling))
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
*p = tmp->child;
|
|
|
|
chd->sibling = tmp->sibling;
|
|
|
|
}
|
2005-04-16 22:20:36 +00:00
|
|
|
old->parent = NULL;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
p = &tmp->sibling;
|
|
|
|
}
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2009-12-22 23:02:22 +00:00
|
|
|
static void __release_child_resources(struct resource *r)
|
|
|
|
{
|
|
|
|
struct resource *tmp, *p;
|
|
|
|
resource_size_t size;
|
|
|
|
|
|
|
|
p = r->child;
|
|
|
|
r->child = NULL;
|
|
|
|
while (p) {
|
|
|
|
tmp = p;
|
|
|
|
p = p->sibling;
|
|
|
|
|
|
|
|
tmp->parent = NULL;
|
|
|
|
tmp->sibling = NULL;
|
|
|
|
__release_child_resources(tmp);
|
|
|
|
|
|
|
|
printk(KERN_DEBUG "release child resource %pR\n", tmp);
|
|
|
|
/* need to restore size, and keep flags */
|
|
|
|
size = resource_size(tmp);
|
|
|
|
tmp->start = 0;
|
|
|
|
tmp->end = size - 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void release_child_resources(struct resource *r)
|
|
|
|
{
|
|
|
|
write_lock(&resource_lock);
|
|
|
|
__release_child_resources(r);
|
|
|
|
write_unlock(&resource_lock);
|
|
|
|
}
|
|
|
|
|
2006-10-03 08:13:51 +00:00
|
|
|
/**
|
2010-03-12 00:01:09 +00:00
|
|
|
* request_resource_conflict - request and reserve an I/O or memory resource
|
2006-10-03 08:13:51 +00:00
|
|
|
* @root: root resource descriptor
|
|
|
|
* @new: resource descriptor desired by caller
|
|
|
|
*
|
2010-03-12 00:01:09 +00:00
|
|
|
* Returns 0 for success, conflict resource on error.
|
2006-10-03 08:13:51 +00:00
|
|
|
*/
|
2010-03-12 00:01:09 +00:00
|
|
|
struct resource *request_resource_conflict(struct resource *root, struct resource *new)
|
2005-04-16 22:20:36 +00:00
|
|
|
{
|
|
|
|
struct resource *conflict;
|
|
|
|
|
|
|
|
write_lock(&resource_lock);
|
|
|
|
conflict = __request_resource(root, new);
|
|
|
|
write_unlock(&resource_lock);
|
2010-03-12 00:01:09 +00:00
|
|
|
return conflict;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
 * request_resource - request and reserve an I/O or memory resource
 * @root: root resource descriptor
 * @new: resource descriptor desired by caller
 *
 * Return: 0 on success, -EBUSY when request_resource_conflict() reports a
 * conflicting resource.
 */
int request_resource(struct resource *root, struct resource *new)
{
	if (request_resource_conflict(root, new))
		return -EBUSY;

	return 0;
}

EXPORT_SYMBOL(request_resource);
|
|
|
|
|
2006-10-03 08:13:51 +00:00
|
|
|
/**
|
|
|
|
* release_resource - release a previously reserved resource
|
|
|
|
* @old: resource pointer
|
|
|
|
*/
|
2005-04-16 22:20:36 +00:00
|
|
|
int release_resource(struct resource *old)
|
|
|
|
{
|
|
|
|
int retval;
|
|
|
|
|
|
|
|
write_lock(&resource_lock);
|
2016-03-09 19:47:04 +00:00
|
|
|
retval = __release_resource(old, true);
|
2005-04-16 22:20:36 +00:00
|
|
|
write_unlock(&resource_lock);
|
|
|
|
return retval;
|
|
|
|
}
|
|
|
|
|
|
|
|
EXPORT_SYMBOL(release_resource);
|
|
|
|
|
2024-09-25 15:43:35 +00:00
|
|
|
static bool is_type_match(struct resource *p, unsigned long flags, unsigned long desc)
|
|
|
|
{
|
|
|
|
return (p->flags & flags) == flags && (desc == IORES_DESC_NONE || desc == p->desc);
|
|
|
|
}
|
|
|
|
|
2018-11-05 09:33:07 +00:00
|
|
|
/**
|
2020-12-16 04:46:16 +00:00
|
|
|
* find_next_iomem_res - Finds the lowest iomem resource that covers part of
|
|
|
|
* [@start..@end].
|
resource: Fix find_next_iomem_res() iteration issue
Previously find_next_iomem_res() used "*res" as both an input parameter for
the range to search and the type of resource to search for, and an output
parameter for the resource we found, which makes the interface confusing.
The current callers use find_next_iomem_res() incorrectly because they
allocate a single struct resource and use it for repeated calls to
find_next_iomem_res(). When find_next_iomem_res() returns a resource, it
overwrites the start, end, flags, and desc members of the struct. If we
call find_next_iomem_res() again, we must update or restore these fields.
The previous code restored res.start and res.end, but not res.flags or
res.desc.
Since the callers did not restore res.flags, if they searched for flags
IORESOURCE_MEM | IORESOURCE_BUSY and found a resource with flags
IORESOURCE_MEM | IORESOURCE_BUSY | IORESOURCE_SYSRAM, the next search would
incorrectly skip resources unless they were also marked as
IORESOURCE_SYSRAM.
Fix this by restructuring the interface so it takes explicit "start, end,
flags" parameters and uses "*res" only as an output parameter.
Based on a patch by Lianbo Jiang <lijiang@redhat.com>.
[ bp: While at it:
- make comments kernel-doc style.
-
Originally-by: http://lore.kernel.org/lkml/20180921073211.20097-2-lijiang@redhat.com
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: Brijesh Singh <brijesh.singh@amd.com>
CC: Dan Williams <dan.j.williams@intel.com>
CC: H. Peter Anvin <hpa@zytor.com>
CC: Lianbo Jiang <lijiang@redhat.com>
CC: Takashi Iwai <tiwai@suse.de>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Tom Lendacky <thomas.lendacky@amd.com>
CC: Vivek Goyal <vgoyal@redhat.com>
CC: Yaowei Bai <baiyaowei@cmss.chinamobile.com>
CC: bhe@redhat.com
CC: dan.j.williams@intel.com
CC: dyoung@redhat.com
CC: kexec@lists.infradead.org
CC: mingo@redhat.com
CC: x86-ml <x86@kernel.org>
Link: http://lkml.kernel.org/r/153805812916.1157.177580438135143788.stgit@bhelgaas-glaptop.roam.corp.google.com
2018-09-27 14:22:09 +00:00
|
|
|
*
|
2018-11-05 09:33:07 +00:00
|
|
|
* If a resource is found, returns 0 and @*res is overwritten with the part
|
|
|
|
* of the resource that's within [@start..@end]; if none is found, returns
|
2019-07-18 22:57:31 +00:00
|
|
|
* -ENODEV. Returns -EINVAL for invalid parameters.
|
resource: Fix find_next_iomem_res() iteration issue
Previously find_next_iomem_res() used "*res" as both an input parameter for
the range to search and the type of resource to search for, and an output
parameter for the resource we found, which makes the interface confusing.
The current callers use find_next_iomem_res() incorrectly because they
allocate a single struct resource and use it for repeated calls to
find_next_iomem_res(). When find_next_iomem_res() returns a resource, it
overwrites the start, end, flags, and desc members of the struct. If we
call find_next_iomem_res() again, we must update or restore these fields.
The previous code restored res.start and res.end, but not res.flags or
res.desc.
Since the callers did not restore res.flags, if they searched for flags
IORESOURCE_MEM | IORESOURCE_BUSY and found a resource with flags
IORESOURCE_MEM | IORESOURCE_BUSY | IORESOURCE_SYSRAM, the next search would
incorrectly skip resources unless they were also marked as
IORESOURCE_SYSRAM.
Fix this by restructuring the interface so it takes explicit "start, end,
flags" parameters and uses "*res" only as an output parameter.
Based on a patch by Lianbo Jiang <lijiang@redhat.com>.
[ bp: While at it:
- make comments kernel-doc style.
-
Originally-by: http://lore.kernel.org/lkml/20180921073211.20097-2-lijiang@redhat.com
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: Brijesh Singh <brijesh.singh@amd.com>
CC: Dan Williams <dan.j.williams@intel.com>
CC: H. Peter Anvin <hpa@zytor.com>
CC: Lianbo Jiang <lijiang@redhat.com>
CC: Takashi Iwai <tiwai@suse.de>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Tom Lendacky <thomas.lendacky@amd.com>
CC: Vivek Goyal <vgoyal@redhat.com>
CC: Yaowei Bai <baiyaowei@cmss.chinamobile.com>
CC: bhe@redhat.com
CC: dan.j.williams@intel.com
CC: dyoung@redhat.com
CC: kexec@lists.infradead.org
CC: mingo@redhat.com
CC: x86-ml <x86@kernel.org>
Link: http://lkml.kernel.org/r/153805812916.1157.177580438135143788.stgit@bhelgaas-glaptop.roam.corp.google.com
2018-09-27 14:22:09 +00:00
|
|
|
*
|
2018-11-05 09:33:07 +00:00
|
|
|
* @start: start address of the resource searched for
|
|
|
|
* @end: end address of same resource
|
|
|
|
* @flags: flags which the resource must have
|
|
|
|
* @desc: descriptor the resource must have
|
|
|
|
* @res: return ptr, if resource found
|
2020-12-16 04:46:16 +00:00
|
|
|
*
|
|
|
|
* The caller must specify @start, @end, @flags, and @desc
|
|
|
|
* (which may be IORES_DESC_NONE).
|
2006-06-27 09:53:36 +00:00
|
|
|
*/
|
resource: Fix find_next_iomem_res() iteration issue
Previously find_next_iomem_res() used "*res" as both an input parameter for
the range to search and the type of resource to search for, and an output
parameter for the resource we found, which makes the interface confusing.
The current callers use find_next_iomem_res() incorrectly because they
allocate a single struct resource and use it for repeated calls to
find_next_iomem_res(). When find_next_iomem_res() returns a resource, it
overwrites the start, end, flags, and desc members of the struct. If we
call find_next_iomem_res() again, we must update or restore these fields.
The previous code restored res.start and res.end, but not res.flags or
res.desc.
Since the callers did not restore res.flags, if they searched for flags
IORESOURCE_MEM | IORESOURCE_BUSY and found a resource with flags
IORESOURCE_MEM | IORESOURCE_BUSY | IORESOURCE_SYSRAM, the next search would
incorrectly skip resources unless they were also marked as
IORESOURCE_SYSRAM.
Fix this by restructuring the interface so it takes explicit "start, end,
flags" parameters and uses "*res" only as an output parameter.
Based on a patch by Lianbo Jiang <lijiang@redhat.com>.
[ bp: While at it:
- make comments kernel-doc style.
-
Originally-by: http://lore.kernel.org/lkml/20180921073211.20097-2-lijiang@redhat.com
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: Brijesh Singh <brijesh.singh@amd.com>
CC: Dan Williams <dan.j.williams@intel.com>
CC: H. Peter Anvin <hpa@zytor.com>
CC: Lianbo Jiang <lijiang@redhat.com>
CC: Takashi Iwai <tiwai@suse.de>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Tom Lendacky <thomas.lendacky@amd.com>
CC: Vivek Goyal <vgoyal@redhat.com>
CC: Yaowei Bai <baiyaowei@cmss.chinamobile.com>
CC: bhe@redhat.com
CC: dan.j.williams@intel.com
CC: dyoung@redhat.com
CC: kexec@lists.infradead.org
CC: mingo@redhat.com
CC: x86-ml <x86@kernel.org>
Link: http://lkml.kernel.org/r/153805812916.1157.177580438135143788.stgit@bhelgaas-glaptop.roam.corp.google.com
2018-09-27 14:22:09 +00:00
|
|
|
static int find_next_iomem_res(resource_size_t start, resource_size_t end,
			       unsigned long flags, unsigned long desc,
			       struct resource *res)
{
	struct resource *cur;
	int ret = -ENODEV;

	/* No output slot, or an empty/inverted search window. */
	if (!res || start >= end)
		return -EINVAL;

	read_lock(&resource_lock);

	for_each_resource(&iomem_resource, cur, false) {
		/* Already past the window: nothing further is examined. */
		if (cur->start > end)
			break;

		/* Entirely below the window, or of the wrong type: keep looking. */
		if (cur->end < start)
			continue;
		if (!is_type_match(cur, flags, desc))
			continue;

		/*
		 * First hit: hand back a copy clipped to [start..end].
		 * Fields not listed here are zero-initialised.
		 */
		*res = (struct resource) {
			.start = max(start, cur->start),
			.end = min(end, cur->end),
			.flags = cur->flags,
			.desc = cur->desc,
			.parent = cur->parent,
		};
		ret = 0;
		break;
	}

	read_unlock(&resource_lock);

	return ret;
}
|
2009-09-22 23:45:46 +00:00
|
|
|
|
resource: Fix find_next_iomem_res() iteration issue
Previously find_next_iomem_res() used "*res" as both an input parameter for
the range to search and the type of resource to search for, and an output
parameter for the resource we found, which makes the interface confusing.
The current callers use find_next_iomem_res() incorrectly because they
allocate a single struct resource and use it for repeated calls to
find_next_iomem_res(). When find_next_iomem_res() returns a resource, it
overwrites the start, end, flags, and desc members of the struct. If we
call find_next_iomem_res() again, we must update or restore these fields.
The previous code restored res.start and res.end, but not res.flags or
res.desc.
Since the callers did not restore res.flags, if they searched for flags
IORESOURCE_MEM | IORESOURCE_BUSY and found a resource with flags
IORESOURCE_MEM | IORESOURCE_BUSY | IORESOURCE_SYSRAM, the next search would
incorrectly skip resources unless they were also marked as
IORESOURCE_SYSRAM.
Fix this by restructuring the interface so it takes explicit "start, end,
flags" parameters and uses "*res" only as an output parameter.
Based on a patch by Lianbo Jiang <lijiang@redhat.com>.
[ bp: While at it:
- make comments kernel-doc style.
-
Originally-by: http://lore.kernel.org/lkml/20180921073211.20097-2-lijiang@redhat.com
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: Brijesh Singh <brijesh.singh@amd.com>
CC: Dan Williams <dan.j.williams@intel.com>
CC: H. Peter Anvin <hpa@zytor.com>
CC: Lianbo Jiang <lijiang@redhat.com>
CC: Takashi Iwai <tiwai@suse.de>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Tom Lendacky <thomas.lendacky@amd.com>
CC: Vivek Goyal <vgoyal@redhat.com>
CC: Yaowei Bai <baiyaowei@cmss.chinamobile.com>
CC: bhe@redhat.com
CC: dan.j.williams@intel.com
CC: dyoung@redhat.com
CC: kexec@lists.infradead.org
CC: mingo@redhat.com
CC: x86-ml <x86@kernel.org>
Link: http://lkml.kernel.org/r/153805812916.1157.177580438135143788.stgit@bhelgaas-glaptop.roam.corp.google.com
2018-09-27 14:22:09 +00:00
|
|
|
/*
 * Call @func on each iomem range within [@start..@end] that matches @flags
 * and @desc, in the order find_next_iomem_res() reports them.
 *
 * @start: first address of the window to walk
 * @end:   last address of the window to walk
 * @flags: flags a resource must have to be reported
 * @desc:  descriptor a resource must have, or IORES_DESC_NONE
 * @arg:   opaque pointer passed through to @func
 * @func:  callback; a non-zero return value stops the walk
 *
 * Returns -EINVAL if @func was never called, otherwise the value returned
 * by the last call to @func.
 */
static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
				 unsigned long flags, unsigned long desc,
				 void *arg,
				 int (*func)(struct resource *, void *))
{
	struct resource res;
	int ret = -EINVAL;

	while (start < end &&
	       !find_next_iomem_res(start, end, flags, desc, &res)) {
		ret = (*func)(&res, arg);
		if (ret)
			break;

		/*
		 * res.end is clipped to @end by find_next_iomem_res(), so
		 * reaching @end means the window is exhausted.  Stop here
		 * rather than computing res.end + 1, which wraps to 0 when
		 * @end is the largest resource_size_t value and would restart
		 * the walk from the bottom forever.
		 */
		if (res.end >= end)
			break;

		start = res.end + 1;
	}

	return ret;
}
|
|
|
|
|
2018-10-09 14:11:21 +00:00
|
|
|
/**
|
2020-12-16 04:46:16 +00:00
|
|
|
* walk_iomem_res_desc - Walks through iomem resources and calls func()
|
|
|
|
* with matching resource ranges.
|
|
|
|
* *
|
2016-01-26 20:57:29 +00:00
|
|
|
* @desc: I/O resource descriptor. Use IORES_DESC_NONE to skip @desc check.
|
|
|
|
* @flags: I/O resource flags
|
|
|
|
* @start: start addr
|
|
|
|
* @end: end addr
|
2018-11-05 02:40:14 +00:00
|
|
|
* @arg: function argument for the callback @func
|
|
|
|
* @func: callback function that is called for each qualifying resource area
|
2016-01-26 20:57:29 +00:00
|
|
|
*
|
2020-12-16 04:46:16 +00:00
|
|
|
* All the memory ranges which overlap start,end and also match flags and
|
|
|
|
* desc are valid candidates.
|
|
|
|
*
|
2016-01-26 20:57:29 +00:00
|
|
|
* NOTE: For a new descriptor search, define a new IORES_DESC in
|
|
|
|
* <linux/ioport.h> and set it in 'desc' of a target resource entry.
|
|
|
|
*/
|
|
|
|
int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start,
|
2017-10-20 14:30:51 +00:00
|
|
|
u64 end, void *arg, int (*func)(struct resource *, void *))
|
2016-01-26 20:57:29 +00:00
|
|
|
{
|
2021-05-07 01:05:20 +00:00
|
|
|
return __walk_iomem_res_desc(start, end, flags, desc, arg, func);
|
2016-01-26 20:57:29 +00:00
|
|
|
}
|
2018-06-02 18:43:39 +00:00
|
|
|
EXPORT_SYMBOL_GPL(walk_iomem_res_desc);
|
2016-01-26 20:57:29 +00:00
|
|
|
|
2014-08-08 21:25:50 +00:00
|
|
|
/*
|
2016-01-26 20:57:26 +00:00
|
|
|
* This function calls the @func callback against all memory ranges of type
|
|
|
|
* System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY.
|
|
|
|
* Now, this function is only for System RAM, it deals with full ranges and
|
|
|
|
* not PFNs. If resources are not PFN-aligned, dealing with PFNs can truncate
|
|
|
|
* ranges.
|
2014-08-08 21:25:50 +00:00
|
|
|
*/
|
|
|
|
int walk_system_ram_res(u64 start, u64 end, void *arg,
|
2018-10-09 14:11:21 +00:00
|
|
|
int (*func)(struct resource *, void *))
|
2014-08-08 21:25:50 +00:00
|
|
|
{
|
resource: Fix find_next_iomem_res() iteration issue
Previously find_next_iomem_res() used "*res" as both an input parameter for
the range to search and the type of resource to search for, and an output
parameter for the resource we found, which makes the interface confusing.
The current callers use find_next_iomem_res() incorrectly because they
allocate a single struct resource and use it for repeated calls to
find_next_iomem_res(). When find_next_iomem_res() returns a resource, it
overwrites the start, end, flags, and desc members of the struct. If we
call find_next_iomem_res() again, we must update or restore these fields.
The previous code restored res.start and res.end, but not res.flags or
res.desc.
Since the callers did not restore res.flags, if they searched for flags
IORESOURCE_MEM | IORESOURCE_BUSY and found a resource with flags
IORESOURCE_MEM | IORESOURCE_BUSY | IORESOURCE_SYSRAM, the next search would
incorrectly skip resources unless they were also marked as
IORESOURCE_SYSRAM.
Fix this by restructuring the interface so it takes explicit "start, end,
flags" parameters and uses "*res" only as an output parameter.
Based on a patch by Lianbo Jiang <lijiang@redhat.com>.
[ bp: While at it:
- make comments kernel-doc style.
-
Originally-by: http://lore.kernel.org/lkml/20180921073211.20097-2-lijiang@redhat.com
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: Brijesh Singh <brijesh.singh@amd.com>
CC: Dan Williams <dan.j.williams@intel.com>
CC: H. Peter Anvin <hpa@zytor.com>
CC: Lianbo Jiang <lijiang@redhat.com>
CC: Takashi Iwai <tiwai@suse.de>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Tom Lendacky <thomas.lendacky@amd.com>
CC: Vivek Goyal <vgoyal@redhat.com>
CC: Yaowei Bai <baiyaowei@cmss.chinamobile.com>
CC: bhe@redhat.com
CC: dan.j.williams@intel.com
CC: dyoung@redhat.com
CC: kexec@lists.infradead.org
CC: mingo@redhat.com
CC: x86-ml <x86@kernel.org>
Link: http://lkml.kernel.org/r/153805812916.1157.177580438135143788.stgit@bhelgaas-glaptop.roam.corp.google.com
2018-09-27 14:22:09 +00:00
|
|
|
unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
|
2014-08-08 21:25:50 +00:00
|
|
|
|
2021-05-07 01:05:20 +00:00
|
|
|
return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, arg,
|
|
|
|
func);
|
2017-10-20 14:30:52 +00:00
|
|
|
}
|
|
|
|
|
2023-11-15 13:00:27 +00:00
|
|
|
/*
|
|
|
|
* This function, being a variant of walk_system_ram_res(), calls the @func
|
|
|
|
* callback against all memory ranges of type System RAM which are marked as
|
|
|
|
* IORESOURCE_SYSTEM_RAM and IORESOURCE_BUSY in reversed order, i.e., from
|
|
|
|
* higher to lower.
|
|
|
|
*/
|
|
|
|
int walk_system_ram_res_rev(u64 start, u64 end, void *arg,
|
|
|
|
int (*func)(struct resource *, void *))
|
|
|
|
{
|
|
|
|
struct resource res, *rams;
|
|
|
|
int rams_size = 16, i;
|
|
|
|
unsigned long flags;
|
|
|
|
int ret = -1;
|
|
|
|
|
|
|
|
/* create a list */
|
|
|
|
rams = kvcalloc(rams_size, sizeof(struct resource), GFP_KERNEL);
|
|
|
|
if (!rams)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
|
|
|
|
i = 0;
|
|
|
|
while ((start < end) &&
|
|
|
|
(!find_next_iomem_res(start, end, flags, IORES_DESC_NONE, &res))) {
|
|
|
|
if (i >= rams_size) {
|
|
|
|
/* re-alloc */
|
|
|
|
struct resource *rams_new;
|
|
|
|
|
mm: kvmalloc: align kvrealloc() with krealloc()
Besides the obvious (and desired) difference between krealloc() and
kvrealloc(), there is some inconsistency in their function signatures and
behavior:
- krealloc() frees the memory when the requested size is zero, whereas
kvrealloc() simply returns a pointer to the existing allocation.
- krealloc() behaves like kmalloc() if a NULL pointer is passed, whereas
kvrealloc() does not accept a NULL pointer at all and, if passed,
would fault instead.
- krealloc() is self-contained, whereas kvrealloc() relies on the caller
to provide the size of the previous allocation.
Inconsistent behavior throughout allocation APIs is error prone, hence
make kvrealloc() behave like krealloc(), which seems superior in all
mentioned aspects.
Besides that, implementing kvrealloc() by making use of krealloc() and
vrealloc() provides oppertunities to grow (and shrink) allocations more
efficiently. For instance, vrealloc() can be optimized to allocate and
map additional pages to grow the allocation or unmap and free unused pages
to shrink the allocation.
[dakr@kernel.org: document concurrency restrictions]
Link: https://lkml.kernel.org/r/20240725125442.4957-1-dakr@kernel.org
[dakr@kernel.org: disable KASAN when switching to vmalloc]
Link: https://lkml.kernel.org/r/20240730185049.6244-2-dakr@kernel.org
[dakr@kernel.org: properly document __GFP_ZERO behavior]
Link: https://lkml.kernel.org/r/20240730185049.6244-5-dakr@kernel.org
Link: https://lkml.kernel.org/r/20240722163111.4766-3-dakr@kernel.org
Signed-off-by: Danilo Krummrich <dakr@kernel.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Chandan Babu R <chandan.babu@oracle.com>
Cc: Christian König <christian.koenig@amd.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Oliver Upton <oliver.upton@linux.dev>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Uladzislau Rezki <urezki@gmail.com>
Cc: Wedson Almeida Filho <wedsonaf@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-07-22 16:29:24 +00:00
|
|
|
rams_new = kvrealloc(rams, (rams_size + 16) * sizeof(struct resource),
|
2023-11-15 13:00:27 +00:00
|
|
|
GFP_KERNEL);
|
|
|
|
if (!rams_new)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
rams = rams_new;
|
|
|
|
rams_size += 16;
|
|
|
|
}
|
|
|
|
|
2024-10-17 19:03:47 +00:00
|
|
|
rams[i++] = res;
|
2023-11-15 13:00:27 +00:00
|
|
|
start = res.end + 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* go reverse */
|
|
|
|
for (i--; i >= 0; i--) {
|
|
|
|
ret = (*func)(&rams[i], arg);
|
|
|
|
if (ret)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
out:
|
|
|
|
kvfree(rams);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2017-10-20 14:30:52 +00:00
|
|
|
/*
|
|
|
|
* This function calls the @func callback against all memory ranges, which
|
|
|
|
* are ranges marked as IORESOURCE_MEM and IORESOUCE_BUSY.
|
|
|
|
*/
|
|
|
|
int walk_mem_res(u64 start, u64 end, void *arg,
|
|
|
|
int (*func)(struct resource *, void *))
|
|
|
|
{
|
resource: Fix find_next_iomem_res() iteration issue
Previously find_next_iomem_res() used "*res" as both an input parameter for
the range to search and the type of resource to search for, and an output
parameter for the resource we found, which makes the interface confusing.
The current callers use find_next_iomem_res() incorrectly because they
allocate a single struct resource and use it for repeated calls to
find_next_iomem_res(). When find_next_iomem_res() returns a resource, it
overwrites the start, end, flags, and desc members of the struct. If we
call find_next_iomem_res() again, we must update or restore these fields.
The previous code restored res.start and res.end, but not res.flags or
res.desc.
Since the callers did not restore res.flags, if they searched for flags
IORESOURCE_MEM | IORESOURCE_BUSY and found a resource with flags
IORESOURCE_MEM | IORESOURCE_BUSY | IORESOURCE_SYSRAM, the next search would
incorrectly skip resources unless they were also marked as
IORESOURCE_SYSRAM.
Fix this by restructuring the interface so it takes explicit "start, end,
flags" parameters and uses "*res" only as an output parameter.
Based on a patch by Lianbo Jiang <lijiang@redhat.com>.
[ bp: While at it:
- make comments kernel-doc style.
-
Originally-by: http://lore.kernel.org/lkml/20180921073211.20097-2-lijiang@redhat.com
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: Brijesh Singh <brijesh.singh@amd.com>
CC: Dan Williams <dan.j.williams@intel.com>
CC: H. Peter Anvin <hpa@zytor.com>
CC: Lianbo Jiang <lijiang@redhat.com>
CC: Takashi Iwai <tiwai@suse.de>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Tom Lendacky <thomas.lendacky@amd.com>
CC: Vivek Goyal <vgoyal@redhat.com>
CC: Yaowei Bai <baiyaowei@cmss.chinamobile.com>
CC: bhe@redhat.com
CC: dan.j.williams@intel.com
CC: dyoung@redhat.com
CC: kexec@lists.infradead.org
CC: mingo@redhat.com
CC: x86-ml <x86@kernel.org>
Link: http://lkml.kernel.org/r/153805812916.1157.177580438135143788.stgit@bhelgaas-glaptop.roam.corp.google.com
2018-09-27 14:22:09 +00:00
|
|
|
unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY;
|
2017-10-20 14:30:52 +00:00
|
|
|
|
2021-05-07 01:05:20 +00:00
|
|
|
return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, arg,
|
|
|
|
func);
|
2014-08-08 21:25:50 +00:00
|
|
|
}
|
|
|
|
|
2009-09-22 23:45:46 +00:00
|
|
|
/*
|
2016-01-26 20:57:26 +00:00
|
|
|
* This function calls the @func callback against all memory ranges of type
|
|
|
|
* System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY.
|
|
|
|
* It is to be used only for System RAM.
|
2009-09-22 23:45:46 +00:00
|
|
|
*/
|
|
|
|
int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
|
2018-10-09 14:11:21 +00:00
|
|
|
void *arg, int (*func)(unsigned long, unsigned long, void *))
|
2007-10-16 08:26:10 +00:00
|
|
|
{
|
resource: Fix find_next_iomem_res() iteration issue
Previously find_next_iomem_res() used "*res" as both an input parameter for
the range to search and the type of resource to search for, and an output
parameter for the resource we found, which makes the interface confusing.
The current callers use find_next_iomem_res() incorrectly because they
allocate a single struct resource and use it for repeated calls to
find_next_iomem_res(). When find_next_iomem_res() returns a resource, it
overwrites the start, end, flags, and desc members of the struct. If we
call find_next_iomem_res() again, we must update or restore these fields.
The previous code restored res.start and res.end, but not res.flags or
res.desc.
Since the callers did not restore res.flags, if they searched for flags
IORESOURCE_MEM | IORESOURCE_BUSY and found a resource with flags
IORESOURCE_MEM | IORESOURCE_BUSY | IORESOURCE_SYSRAM, the next search would
incorrectly skip resources unless they were also marked as
IORESOURCE_SYSRAM.
Fix this by restructuring the interface so it takes explicit "start, end,
flags" parameters and uses "*res" only as an output parameter.
Based on a patch by Lianbo Jiang <lijiang@redhat.com>.
[ bp: While at it:
- make comments kernel-doc style.
-
Originally-by: http://lore.kernel.org/lkml/20180921073211.20097-2-lijiang@redhat.com
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: Brijesh Singh <brijesh.singh@amd.com>
CC: Dan Williams <dan.j.williams@intel.com>
CC: H. Peter Anvin <hpa@zytor.com>
CC: Lianbo Jiang <lijiang@redhat.com>
CC: Takashi Iwai <tiwai@suse.de>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Tom Lendacky <thomas.lendacky@amd.com>
CC: Vivek Goyal <vgoyal@redhat.com>
CC: Yaowei Bai <baiyaowei@cmss.chinamobile.com>
CC: bhe@redhat.com
CC: dan.j.williams@intel.com
CC: dyoung@redhat.com
CC: kexec@lists.infradead.org
CC: mingo@redhat.com
CC: x86-ml <x86@kernel.org>
Link: http://lkml.kernel.org/r/153805812916.1157.177580438135143788.stgit@bhelgaas-glaptop.roam.corp.google.com
2018-09-27 14:22:09 +00:00
|
|
|
resource_size_t start, end;
|
|
|
|
unsigned long flags;
|
2007-10-16 08:26:10 +00:00
|
|
|
struct resource res;
|
2010-03-01 13:55:51 +00:00
|
|
|
unsigned long pfn, end_pfn;
|
2019-02-25 18:57:30 +00:00
|
|
|
int ret = -EINVAL;
|
2009-09-22 23:45:46 +00:00
|
|
|
|
resource: Fix find_next_iomem_res() iteration issue
Previously find_next_iomem_res() used "*res" as both an input parameter for
the range to search and the type of resource to search for, and an output
parameter for the resource we found, which makes the interface confusing.
The current callers use find_next_iomem_res() incorrectly because they
allocate a single struct resource and use it for repeated calls to
find_next_iomem_res(). When find_next_iomem_res() returns a resource, it
overwrites the start, end, flags, and desc members of the struct. If we
call find_next_iomem_res() again, we must update or restore these fields.
The previous code restored res.start and res.end, but not res.flags or
res.desc.
Since the callers did not restore res.flags, if they searched for flags
IORESOURCE_MEM | IORESOURCE_BUSY and found a resource with flags
IORESOURCE_MEM | IORESOURCE_BUSY | IORESOURCE_SYSRAM, the next search would
incorrectly skip resources unless they were also marked as
IORESOURCE_SYSRAM.
Fix this by restructuring the interface so it takes explicit "start, end,
flags" parameters and uses "*res" only as an output parameter.
Based on a patch by Lianbo Jiang <lijiang@redhat.com>.
[ bp: While at it:
- make comments kernel-doc style.
-
Originally-by: http://lore.kernel.org/lkml/20180921073211.20097-2-lijiang@redhat.com
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: Brijesh Singh <brijesh.singh@amd.com>
CC: Dan Williams <dan.j.williams@intel.com>
CC: H. Peter Anvin <hpa@zytor.com>
CC: Lianbo Jiang <lijiang@redhat.com>
CC: Takashi Iwai <tiwai@suse.de>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Tom Lendacky <thomas.lendacky@amd.com>
CC: Vivek Goyal <vgoyal@redhat.com>
CC: Yaowei Bai <baiyaowei@cmss.chinamobile.com>
CC: bhe@redhat.com
CC: dan.j.williams@intel.com
CC: dyoung@redhat.com
CC: kexec@lists.infradead.org
CC: mingo@redhat.com
CC: x86-ml <x86@kernel.org>
Link: http://lkml.kernel.org/r/153805812916.1157.177580438135143788.stgit@bhelgaas-glaptop.roam.corp.google.com
2018-09-27 14:22:09 +00:00
|
|
|
start = (u64) start_pfn << PAGE_SHIFT;
|
|
|
|
end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
|
|
|
|
flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
|
|
|
|
while (start < end &&
|
2021-05-07 01:05:20 +00:00
|
|
|
!find_next_iomem_res(start, end, flags, IORES_DESC_NONE, &res)) {
|
2019-09-23 22:35:55 +00:00
|
|
|
pfn = PFN_UP(res.start);
|
|
|
|
end_pfn = PFN_DOWN(res.end + 1);
|
2010-03-01 13:55:51 +00:00
|
|
|
if (end_pfn > pfn)
|
2010-03-02 19:21:09 +00:00
|
|
|
ret = (*func)(pfn, end_pfn - pfn, arg);
|
2007-10-16 08:26:10 +00:00
|
|
|
if (ret)
|
|
|
|
break;
|
resource: Fix find_next_iomem_res() iteration issue
Previously find_next_iomem_res() used "*res" as both an input parameter for
the range to search and the type of resource to search for, and an output
parameter for the resource we found, which makes the interface confusing.
The current callers use find_next_iomem_res() incorrectly because they
allocate a single struct resource and use it for repeated calls to
find_next_iomem_res(). When find_next_iomem_res() returns a resource, it
overwrites the start, end, flags, and desc members of the struct. If we
call find_next_iomem_res() again, we must update or restore these fields.
The previous code restored res.start and res.end, but not res.flags or
res.desc.
Since the callers did not restore res.flags, if they searched for flags
IORESOURCE_MEM | IORESOURCE_BUSY and found a resource with flags
IORESOURCE_MEM | IORESOURCE_BUSY | IORESOURCE_SYSRAM, the next search would
incorrectly skip resources unless they were also marked as
IORESOURCE_SYSRAM.
Fix this by restructuring the interface so it takes explicit "start, end,
flags" parameters and uses "*res" only as an output parameter.
Based on a patch by Lianbo Jiang <lijiang@redhat.com>.
[ bp: While at it:
- make comments kernel-doc style.
-
Originally-by: http://lore.kernel.org/lkml/20180921073211.20097-2-lijiang@redhat.com
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: Brijesh Singh <brijesh.singh@amd.com>
CC: Dan Williams <dan.j.williams@intel.com>
CC: H. Peter Anvin <hpa@zytor.com>
CC: Lianbo Jiang <lijiang@redhat.com>
CC: Takashi Iwai <tiwai@suse.de>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Tom Lendacky <thomas.lendacky@amd.com>
CC: Vivek Goyal <vgoyal@redhat.com>
CC: Yaowei Bai <baiyaowei@cmss.chinamobile.com>
CC: bhe@redhat.com
CC: dan.j.williams@intel.com
CC: dyoung@redhat.com
CC: kexec@lists.infradead.org
CC: mingo@redhat.com
CC: x86-ml <x86@kernel.org>
Link: http://lkml.kernel.org/r/153805812916.1157.177580438135143788.stgit@bhelgaas-glaptop.roam.corp.google.com
2018-09-27 14:22:09 +00:00
|
|
|
start = res.end + 1;
|
2007-10-16 08:26:10 +00:00
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2010-01-22 08:16:19 +00:00
|
|
|
/*
 * Callback handed to walk_system_ram_range() by page_is_ram().
 *
 * All three arguments are ignored: the function always returns 1, so
 * merely being invoked is what signals the caller.
 */
static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg)
{
	const int found = 1;

	return found;
}
|
2017-10-20 14:30:50 +00:00
|
|
|
|
2010-01-22 08:16:19 +00:00
|
|
|
/*
|
|
|
|
* This generic page_is_ram() returns true if specified address is
|
2016-01-26 20:57:26 +00:00
|
|
|
* registered as System RAM in iomem_resource list.
|
2010-01-22 08:16:19 +00:00
|
|
|
*/
|
2010-01-27 00:31:19 +00:00
|
|
|
int __weak page_is_ram(unsigned long pfn)
|
2010-01-22 08:16:19 +00:00
|
|
|
{
|
|
|
|
return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;
|
|
|
|
}
|
2013-06-06 22:20:51 +00:00
|
|
|
EXPORT_SYMBOL_GPL(page_is_ram);
|
2010-01-22 08:16:19 +00:00
|
|
|
|
2022-05-20 20:41:24 +00:00
|
|
|
static int __region_intersects(struct resource *parent, resource_size_t start,
|
|
|
|
size_t size, unsigned long flags,
|
|
|
|
unsigned long desc)
|
2021-05-07 01:05:24 +00:00
|
|
|
{
|
|
|
|
int type = 0; int other = 0;
|
resource: fix region_intersects() vs add_memory_driver_managed()
On a system with CXL memory, the resource tree (/proc/iomem) related to
CXL memory may look like something as follows.
490000000-50fffffff : CXL Window 0
490000000-50fffffff : region0
490000000-50fffffff : dax0.0
490000000-50fffffff : System RAM (kmem)
Because drivers/dax/kmem.c calls add_memory_driver_managed() during
onlining CXL memory, which makes "System RAM (kmem)" a descendant of "CXL
Window X". This confuses region_intersects(), which expects all "System
RAM" resources to be at the top level of iomem_resource. This can lead to
bugs.
For example, when the following command line is executed to write some
memory in CXL memory range via /dev/mem,
$ dd if=data of=/dev/mem bs=$((1 << 10)) seek=$((0x490000000 >> 10)) count=1
dd: error writing '/dev/mem': Bad address
1+0 records in
0+0 records out
0 bytes copied, 0.0283507 s, 0.0 kB/s
the command fails as expected. However, the error code is wrong. It
should be "Operation not permitted" instead of "Bad address". More
seriously, the /dev/mem permission checking in devmem_is_allowed() passes
incorrectly. Although the accessing is prevented later because ioremap()
isn't allowed to map system RAM, it is a potential security issue. During
command executing, the following warning is reported in the kernel log for
calling ioremap() on system RAM.
ioremap on RAM at 0x0000000490000000 - 0x0000000490000fff
WARNING: CPU: 2 PID: 416 at arch/x86/mm/ioremap.c:216 __ioremap_caller.constprop.0+0x131/0x35d
Call Trace:
memremap+0xcb/0x184
xlate_dev_mem_ptr+0x25/0x2f
write_mem+0x94/0xfb
vfs_write+0x128/0x26d
ksys_write+0xac/0xfe
do_syscall_64+0x9a/0xfd
entry_SYSCALL_64_after_hwframe+0x4b/0x53
The details of command execution process are as follows. In the above
resource tree, "System RAM" is a descendant of "CXL Window 0" instead of a
top level resource. So, region_intersects() will report no System RAM
resources in the CXL memory region incorrectly, because it only checks the
top level resources. Consequently, devmem_is_allowed() will return 1
(allow access via /dev/mem) for CXL memory region incorrectly.
Fortunately, ioremap() doesn't allow to map System RAM and reject the
access.
So, region_intersects() needs to be fixed to work correctly with the
resource tree with "System RAM" not at top level as above. To fix it, if
we found an unmatched resource in the top level, we will continue to search
matched resources in its descendant resources. So, we will not miss any
matched resources in resource tree anymore.
In the new implementation, an example resource tree
|------------- "CXL Window 0" ------------|
|-- "System RAM" --|
will behave similar as the following fake resource tree for
region_intersects(, IORESOURCE_SYSTEM_RAM, ),
|-- "System RAM" --||-- "CXL Window 0a" --|
Where "CXL Window 0a" is part of the original "CXL Window 0" that
isn't covered by "System RAM".
Link: https://lkml.kernel.org/r/20240906030713.204292-2-ying.huang@intel.com
Fixes: c221c0b0308f ("device-dax: "Hotplug" persistent memory for use like normal RAM")
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Jonathan Cameron <jonathan.cameron@huawei.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Alison Schofield <alison.schofield@intel.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-09-06 03:07:11 +00:00
|
|
|
struct resource *p, *dp;
|
2024-09-25 15:43:34 +00:00
|
|
|
struct resource res, o;
|
2024-09-25 15:43:35 +00:00
|
|
|
bool covered;
|
2021-05-07 01:05:24 +00:00
|
|
|
|
|
|
|
res.start = start;
|
|
|
|
res.end = start + size - 1;
|
|
|
|
|
2022-05-20 20:41:24 +00:00
|
|
|
for (p = parent->child; p ; p = p->sibling) {
|
2024-09-25 15:43:34 +00:00
|
|
|
if (!resource_intersection(p, &res, &o))
|
resource: fix region_intersects() vs add_memory_driver_managed()
On a system with CXL memory, the resource tree (/proc/iomem) related to
CXL memory may look like something as follows.
490000000-50fffffff : CXL Window 0
490000000-50fffffff : region0
490000000-50fffffff : dax0.0
490000000-50fffffff : System RAM (kmem)
Because drivers/dax/kmem.c calls add_memory_driver_managed() during
onlining CXL memory, which makes "System RAM (kmem)" a descendant of "CXL
Window X". This confuses region_intersects(), which expects all "System
RAM" resources to be at the top level of iomem_resource. This can lead to
bugs.
For example, when the following command line is executed to write some
memory in CXL memory range via /dev/mem,
$ dd if=data of=/dev/mem bs=$((1 << 10)) seek=$((0x490000000 >> 10)) count=1
dd: error writing '/dev/mem': Bad address
1+0 records in
0+0 records out
0 bytes copied, 0.0283507 s, 0.0 kB/s
the command fails as expected. However, the error code is wrong. It
should be "Operation not permitted" instead of "Bad address". More
seriously, the /dev/mem permission checking in devmem_is_allowed() passes
incorrectly. Although the accessing is prevented later because ioremap()
isn't allowed to map system RAM, it is a potential security issue. During
command executing, the following warning is reported in the kernel log for
calling ioremap() on system RAM.
ioremap on RAM at 0x0000000490000000 - 0x0000000490000fff
WARNING: CPU: 2 PID: 416 at arch/x86/mm/ioremap.c:216 __ioremap_caller.constprop.0+0x131/0x35d
Call Trace:
memremap+0xcb/0x184
xlate_dev_mem_ptr+0x25/0x2f
write_mem+0x94/0xfb
vfs_write+0x128/0x26d
ksys_write+0xac/0xfe
do_syscall_64+0x9a/0xfd
entry_SYSCALL_64_after_hwframe+0x4b/0x53
The details of command execution process are as follows. In the above
resource tree, "System RAM" is a descendant of "CXL Window 0" instead of a
top level resource. So, region_intersects() will report no System RAM
resources in the CXL memory region incorrectly, because it only checks the
top level resources. Consequently, devmem_is_allowed() will return 1
(allow access via /dev/mem) for CXL memory region incorrectly.
Fortunately, ioremap() doesn't allow to map System RAM and reject the
access.
So, region_intersects() needs to be fixed to work correctly with the
resource tree with "System RAM" not at top level as above. To fix it, if
we found an unmatched resource in the top level, we will continue to search
matched resources in its descendant resources. So, we will not miss any
matched resources in resource tree anymore.
In the new implementation, an example resource tree
|------------- "CXL Window 0" ------------|
|-- "System RAM" --|
will behave similar as the following fake resource tree for
region_intersects(, IORESOURCE_SYSTEM_RAM, ),
|-- "System RAM" --||-- "CXL Window 0a" --|
Where "CXL Window 0a" is part of the original "CXL Window 0" that
isn't covered by "System RAM".
Link: https://lkml.kernel.org/r/20240906030713.204292-2-ying.huang@intel.com
Fixes: c221c0b0308f ("device-dax: "Hotplug" persistent memory for use like normal RAM")
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Jonathan Cameron <jonathan.cameron@huawei.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Alison Schofield <alison.schofield@intel.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-09-06 03:07:11 +00:00
|
|
|
continue;
|
2024-09-25 15:43:35 +00:00
|
|
|
if (is_type_match(p, flags, desc)) {
|
resource: fix region_intersects() vs add_memory_driver_managed()
On a system with CXL memory, the resource tree (/proc/iomem) related to
CXL memory may look like something as follows.
490000000-50fffffff : CXL Window 0
490000000-50fffffff : region0
490000000-50fffffff : dax0.0
490000000-50fffffff : System RAM (kmem)
Because drivers/dax/kmem.c calls add_memory_driver_managed() during
onlining CXL memory, which makes "System RAM (kmem)" a descendant of "CXL
Window X". This confuses region_intersects(), which expects all "System
RAM" resources to be at the top level of iomem_resource. This can lead to
bugs.
For example, when the following command line is executed to write some
memory in CXL memory range via /dev/mem,
$ dd if=data of=/dev/mem bs=$((1 << 10)) seek=$((0x490000000 >> 10)) count=1
dd: error writing '/dev/mem': Bad address
1+0 records in
0+0 records out
0 bytes copied, 0.0283507 s, 0.0 kB/s
the command fails as expected. However, the error code is wrong. It
should be "Operation not permitted" instead of "Bad address". More
seriously, the /dev/mem permission checking in devmem_is_allowed() passes
incorrectly. Although the accessing is prevented later because ioremap()
isn't allowed to map system RAM, it is a potential security issue. During
command executing, the following warning is reported in the kernel log for
calling ioremap() on system RAM.
ioremap on RAM at 0x0000000490000000 - 0x0000000490000fff
WARNING: CPU: 2 PID: 416 at arch/x86/mm/ioremap.c:216 __ioremap_caller.constprop.0+0x131/0x35d
Call Trace:
memremap+0xcb/0x184
xlate_dev_mem_ptr+0x25/0x2f
write_mem+0x94/0xfb
vfs_write+0x128/0x26d
ksys_write+0xac/0xfe
do_syscall_64+0x9a/0xfd
entry_SYSCALL_64_after_hwframe+0x4b/0x53
The details of command execution process are as follows. In the above
resource tree, "System RAM" is a descendant of "CXL Window 0" instead of a
top level resource. So, region_intersects() will report no System RAM
resources in the CXL memory region incorrectly, because it only checks the
top level resources. Consequently, devmem_is_allowed() will return 1
(allow access via /dev/mem) for CXL memory region incorrectly.
Fortunately, ioremap() doesn't allow to map System RAM and reject the
access.
So, region_intersects() needs to be fixed to work correctly with the
resource tree with "System RAM" not at top level as above. To fix it, if
we found an unmatched resource in the top level, we will continue to search
matched resources in its descendant resources. So, we will not miss any
matched resources in resource tree anymore.
In the new implementation, an example resource tree
|------------- "CXL Window 0" ------------|
|-- "System RAM" --|
will behave similar as the following fake resource tree for
region_intersects(, IORESOURCE_SYSTEM_RAM, ),
|-- "System RAM" --||-- "CXL Window 0a" --|
Where "CXL Window 0a" is part of the original "CXL Window 0" that
isn't covered by "System RAM".
Link: https://lkml.kernel.org/r/20240906030713.204292-2-ying.huang@intel.com
Fixes: c221c0b0308f ("device-dax: "Hotplug" persistent memory for use like normal RAM")
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Jonathan Cameron <jonathan.cameron@huawei.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Alison Schofield <alison.schofield@intel.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-09-06 03:07:11 +00:00
|
|
|
type++;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Continue to search in descendant resources as if the
|
|
|
|
* matched descendant resources cover some ranges of 'p'.
|
|
|
|
*
|
|
|
|
* |------------- "CXL Window 0" ------------|
|
|
|
|
* |-- "System RAM" --|
|
|
|
|
*
|
|
|
|
* will behave similarly to the following fake resource
|
|
|
|
* tree when searching "System RAM".
|
|
|
|
*
|
|
|
|
* |-- "System RAM" --||-- "CXL Window 0a" --|
|
|
|
|
*/
|
|
|
|
covered = false;
|
|
|
|
for_each_resource(p, dp, false) {
|
|
|
|
if (!resource_overlaps(dp, &res))
|
|
|
|
continue;
|
2024-09-25 15:43:35 +00:00
|
|
|
if (is_type_match(dp, flags, desc)) {
|
resource: fix region_intersects() vs add_memory_driver_managed()
On a system with CXL memory, the resource tree (/proc/iomem) related to
CXL memory may look like something as follows.
490000000-50fffffff : CXL Window 0
490000000-50fffffff : region0
490000000-50fffffff : dax0.0
490000000-50fffffff : System RAM (kmem)
Because drivers/dax/kmem.c calls add_memory_driver_managed() during
onlining CXL memory, which makes "System RAM (kmem)" a descendant of "CXL
Window X". This confuses region_intersects(), which expects all "System
RAM" resources to be at the top level of iomem_resource. This can lead to
bugs.
For example, when the following command line is executed to write some
memory in CXL memory range via /dev/mem,
$ dd if=data of=/dev/mem bs=$((1 << 10)) seek=$((0x490000000 >> 10)) count=1
dd: error writing '/dev/mem': Bad address
1+0 records in
0+0 records out
0 bytes copied, 0.0283507 s, 0.0 kB/s
the command fails as expected. However, the error code is wrong. It
should be "Operation not permitted" instead of "Bad address". More
seriously, the /dev/mem permission checking in devmem_is_allowed() passes
incorrectly. Although the accessing is prevented later because ioremap()
isn't allowed to map system RAM, it is a potential security issue. During
command executing, the following warning is reported in the kernel log for
calling ioremap() on system RAM.
ioremap on RAM at 0x0000000490000000 - 0x0000000490000fff
WARNING: CPU: 2 PID: 416 at arch/x86/mm/ioremap.c:216 __ioremap_caller.constprop.0+0x131/0x35d
Call Trace:
memremap+0xcb/0x184
xlate_dev_mem_ptr+0x25/0x2f
write_mem+0x94/0xfb
vfs_write+0x128/0x26d
ksys_write+0xac/0xfe
do_syscall_64+0x9a/0xfd
entry_SYSCALL_64_after_hwframe+0x4b/0x53
The details of command execution process are as follows. In the above
resource tree, "System RAM" is a descendant of "CXL Window 0" instead of a
top level resource. So, region_intersects() will report no System RAM
resources in the CXL memory region incorrectly, because it only checks the
top level resources. Consequently, devmem_is_allowed() will return 1
(allow access via /dev/mem) for CXL memory region incorrectly.
Fortunately, ioremap() doesn't allow to map System RAM and reject the
access.
So, region_intersects() needs to be fixed to work correctly with the
resource tree with "System RAM" not at top level as above. To fix it, if
we found an unmatched resource in the top level, we will continue to search
matched resources in its descendant resources. So, we will not miss any
matched resources in resource tree anymore.
In the new implementation, an example resource tree
|------------- "CXL Window 0" ------------|
|-- "System RAM" --|
will behave similar as the following fake resource tree for
region_intersects(, IORESOURCE_SYSTEM_RAM, ),
|-- "System RAM" --||-- "CXL Window 0a" --|
Where "CXL Window 0a" is part of the original "CXL Window 0" that
isn't covered by "System RAM".
Link: https://lkml.kernel.org/r/20240906030713.204292-2-ying.huang@intel.com
Fixes: c221c0b0308f ("device-dax: "Hotplug" persistent memory for use like normal RAM")
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Jonathan Cameron <jonathan.cameron@huawei.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Alison Schofield <alison.schofield@intel.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-09-06 03:07:11 +00:00
|
|
|
type++;
|
|
|
|
/*
|
2024-09-25 15:43:34 +00:00
|
|
|
* Range from 'o.start' to 'dp->start'
|
resource: fix region_intersects() vs add_memory_driver_managed()
On a system with CXL memory, the resource tree (/proc/iomem) related to
CXL memory may look like something as follows.
490000000-50fffffff : CXL Window 0
490000000-50fffffff : region0
490000000-50fffffff : dax0.0
490000000-50fffffff : System RAM (kmem)
Because drivers/dax/kmem.c calls add_memory_driver_managed() during
onlining CXL memory, which makes "System RAM (kmem)" a descendant of "CXL
Window X". This confuses region_intersects(), which expects all "System
RAM" resources to be at the top level of iomem_resource. This can lead to
bugs.
For example, when the following command line is executed to write some
memory in CXL memory range via /dev/mem,
$ dd if=data of=/dev/mem bs=$((1 << 10)) seek=$((0x490000000 >> 10)) count=1
dd: error writing '/dev/mem': Bad address
1+0 records in
0+0 records out
0 bytes copied, 0.0283507 s, 0.0 kB/s
the command fails as expected. However, the error code is wrong. It
should be "Operation not permitted" instead of "Bad address". More
seriously, the /dev/mem permission checking in devmem_is_allowed() passes
incorrectly. Although the accessing is prevented later because ioremap()
isn't allowed to map system RAM, it is a potential security issue. During
command executing, the following warning is reported in the kernel log for
calling ioremap() on system RAM.
ioremap on RAM at 0x0000000490000000 - 0x0000000490000fff
WARNING: CPU: 2 PID: 416 at arch/x86/mm/ioremap.c:216 __ioremap_caller.constprop.0+0x131/0x35d
Call Trace:
memremap+0xcb/0x184
xlate_dev_mem_ptr+0x25/0x2f
write_mem+0x94/0xfb
vfs_write+0x128/0x26d
ksys_write+0xac/0xfe
do_syscall_64+0x9a/0xfd
entry_SYSCALL_64_after_hwframe+0x4b/0x53
The details of command execution process are as follows. In the above
resource tree, "System RAM" is a descendant of "CXL Window 0" instead of a
top level resource. So, region_intersects() will report no System RAM
resources in the CXL memory region incorrectly, because it only checks the
top level resources. Consequently, devmem_is_allowed() will return 1
(allow access via /dev/mem) for CXL memory region incorrectly.
Fortunately, ioremap() doesn't allow to map System RAM and reject the
access.
So, region_intersects() needs to be fixed to work correctly with the
resource tree with "System RAM" not at top level as above. To fix it, if
we find an unmatched resource in the top level, we will continue to search
matched resources in its descendant resources. So, we will not miss any
matched resources in resource tree anymore.
In the new implementation, an example resource tree
|------------- "CXL Window 0" ------------|
|-- "System RAM" --|
will behave similar as the following fake resource tree for
region_intersects(, IORESOURCE_SYSTEM_RAM, ),
|-- "System RAM" --||-- "CXL Window 0a" --|
Where "CXL Window 0a" is part of the original "CXL Window 0" that
isn't covered by "System RAM".
Link: https://lkml.kernel.org/r/20240906030713.204292-2-ying.huang@intel.com
Fixes: c221c0b0308f ("device-dax: "Hotplug" persistent memory for use like normal RAM")
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Jonathan Cameron <jonathan.cameron@huawei.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Alison Schofield <alison.schofield@intel.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-09-06 03:07:11 +00:00
|
|
|
* isn't covered by matched resource.
|
|
|
|
*/
|
2024-09-25 15:43:34 +00:00
|
|
|
if (dp->start > o.start)
|
resource: fix region_intersects() vs add_memory_driver_managed()
On a system with CXL memory, the resource tree (/proc/iomem) related to
CXL memory may look like something as follows.
490000000-50fffffff : CXL Window 0
490000000-50fffffff : region0
490000000-50fffffff : dax0.0
490000000-50fffffff : System RAM (kmem)
Because drivers/dax/kmem.c calls add_memory_driver_managed() during
onlining CXL memory, which makes "System RAM (kmem)" a descendant of "CXL
Window X". This confuses region_intersects(), which expects all "System
RAM" resources to be at the top level of iomem_resource. This can lead to
bugs.
For example, when the following command line is executed to write some
memory in CXL memory range via /dev/mem,
$ dd if=data of=/dev/mem bs=$((1 << 10)) seek=$((0x490000000 >> 10)) count=1
dd: error writing '/dev/mem': Bad address
1+0 records in
0+0 records out
0 bytes copied, 0.0283507 s, 0.0 kB/s
the command fails as expected. However, the error code is wrong. It
should be "Operation not permitted" instead of "Bad address". More
seriously, the /dev/mem permission checking in devmem_is_allowed() passes
incorrectly. Although the accessing is prevented later because ioremap()
isn't allowed to map system RAM, it is a potential security issue. During
command executing, the following warning is reported in the kernel log for
calling ioremap() on system RAM.
ioremap on RAM at 0x0000000490000000 - 0x0000000490000fff
WARNING: CPU: 2 PID: 416 at arch/x86/mm/ioremap.c:216 __ioremap_caller.constprop.0+0x131/0x35d
Call Trace:
memremap+0xcb/0x184
xlate_dev_mem_ptr+0x25/0x2f
write_mem+0x94/0xfb
vfs_write+0x128/0x26d
ksys_write+0xac/0xfe
do_syscall_64+0x9a/0xfd
entry_SYSCALL_64_after_hwframe+0x4b/0x53
The details of command execution process are as follows. In the above
resource tree, "System RAM" is a descendant of "CXL Window 0" instead of a
top level resource. So, region_intersects() will report no System RAM
resources in the CXL memory region incorrectly, because it only checks the
top level resources. Consequently, devmem_is_allowed() will return 1
(allow access via /dev/mem) for CXL memory region incorrectly.
Fortunately, ioremap() doesn't allow to map System RAM and reject the
access.
So, region_intersects() needs to be fixed to work correctly with the
resource tree with "System RAM" not at top level as above. To fix it, if
we find an unmatched resource in the top level, we will continue to search
matched resources in its descendant resources. So, we will not miss any
matched resources in resource tree anymore.
In the new implementation, an example resource tree
|------------- "CXL Window 0" ------------|
|-- "System RAM" --|
will behave similar as the following fake resource tree for
region_intersects(, IORESOURCE_SYSTEM_RAM, ),
|-- "System RAM" --||-- "CXL Window 0a" --|
Where "CXL Window 0a" is part of the original "CXL Window 0" that
isn't covered by "System RAM".
Link: https://lkml.kernel.org/r/20240906030713.204292-2-ying.huang@intel.com
Fixes: c221c0b0308f ("device-dax: "Hotplug" persistent memory for use like normal RAM")
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Jonathan Cameron <jonathan.cameron@huawei.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Alison Schofield <alison.schofield@intel.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-09-06 03:07:11 +00:00
|
|
|
break;
|
2024-09-25 15:43:34 +00:00
|
|
|
if (dp->end >= o.end) {
|
resource: fix region_intersects() vs add_memory_driver_managed()
On a system with CXL memory, the resource tree (/proc/iomem) related to
CXL memory may look like something as follows.
490000000-50fffffff : CXL Window 0
490000000-50fffffff : region0
490000000-50fffffff : dax0.0
490000000-50fffffff : System RAM (kmem)
Because drivers/dax/kmem.c calls add_memory_driver_managed() during
onlining CXL memory, which makes "System RAM (kmem)" a descendant of "CXL
Window X". This confuses region_intersects(), which expects all "System
RAM" resources to be at the top level of iomem_resource. This can lead to
bugs.
For example, when the following command line is executed to write some
memory in CXL memory range via /dev/mem,
$ dd if=data of=/dev/mem bs=$((1 << 10)) seek=$((0x490000000 >> 10)) count=1
dd: error writing '/dev/mem': Bad address
1+0 records in
0+0 records out
0 bytes copied, 0.0283507 s, 0.0 kB/s
the command fails as expected. However, the error code is wrong. It
should be "Operation not permitted" instead of "Bad address". More
seriously, the /dev/mem permission checking in devmem_is_allowed() passes
incorrectly. Although the accessing is prevented later because ioremap()
isn't allowed to map system RAM, it is a potential security issue. During
command executing, the following warning is reported in the kernel log for
calling ioremap() on system RAM.
ioremap on RAM at 0x0000000490000000 - 0x0000000490000fff
WARNING: CPU: 2 PID: 416 at arch/x86/mm/ioremap.c:216 __ioremap_caller.constprop.0+0x131/0x35d
Call Trace:
memremap+0xcb/0x184
xlate_dev_mem_ptr+0x25/0x2f
write_mem+0x94/0xfb
vfs_write+0x128/0x26d
ksys_write+0xac/0xfe
do_syscall_64+0x9a/0xfd
entry_SYSCALL_64_after_hwframe+0x4b/0x53
The details of command execution process are as follows. In the above
resource tree, "System RAM" is a descendant of "CXL Window 0" instead of a
top level resource. So, region_intersects() will report no System RAM
resources in the CXL memory region incorrectly, because it only checks the
top level resources. Consequently, devmem_is_allowed() will return 1
(allow access via /dev/mem) for CXL memory region incorrectly.
Fortunately, ioremap() doesn't allow to map System RAM and reject the
access.
So, region_intersects() needs to be fixed to work correctly with the
resource tree with "System RAM" not at top level as above. To fix it, if
we find an unmatched resource in the top level, we will continue to search
matched resources in its descendant resources. So, we will not miss any
matched resources in resource tree anymore.
In the new implementation, an example resource tree
|------------- "CXL Window 0" ------------|
|-- "System RAM" --|
will behave similar as the following fake resource tree for
region_intersects(, IORESOURCE_SYSTEM_RAM, ),
|-- "System RAM" --||-- "CXL Window 0a" --|
Where "CXL Window 0a" is part of the original "CXL Window 0" that
isn't covered by "System RAM".
Link: https://lkml.kernel.org/r/20240906030713.204292-2-ying.huang@intel.com
Fixes: c221c0b0308f ("device-dax: "Hotplug" persistent memory for use like normal RAM")
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Jonathan Cameron <jonathan.cameron@huawei.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Alison Schofield <alison.schofield@intel.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-09-06 03:07:11 +00:00
|
|
|
covered = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
/* Remove covered range */
|
2024-09-25 15:43:34 +00:00
|
|
|
o.start = max(o.start, dp->end + 1);
|
resource: fix region_intersects() vs add_memory_driver_managed()
On a system with CXL memory, the resource tree (/proc/iomem) related to
CXL memory may look like something as follows.
490000000-50fffffff : CXL Window 0
490000000-50fffffff : region0
490000000-50fffffff : dax0.0
490000000-50fffffff : System RAM (kmem)
Because drivers/dax/kmem.c calls add_memory_driver_managed() during
onlining CXL memory, which makes "System RAM (kmem)" a descendant of "CXL
Window X". This confuses region_intersects(), which expects all "System
RAM" resources to be at the top level of iomem_resource. This can lead to
bugs.
For example, when the following command line is executed to write some
memory in CXL memory range via /dev/mem,
$ dd if=data of=/dev/mem bs=$((1 << 10)) seek=$((0x490000000 >> 10)) count=1
dd: error writing '/dev/mem': Bad address
1+0 records in
0+0 records out
0 bytes copied, 0.0283507 s, 0.0 kB/s
the command fails as expected. However, the error code is wrong. It
should be "Operation not permitted" instead of "Bad address". More
seriously, the /dev/mem permission checking in devmem_is_allowed() passes
incorrectly. Although the accessing is prevented later because ioremap()
isn't allowed to map system RAM, it is a potential security issue. During
command executing, the following warning is reported in the kernel log for
calling ioremap() on system RAM.
ioremap on RAM at 0x0000000490000000 - 0x0000000490000fff
WARNING: CPU: 2 PID: 416 at arch/x86/mm/ioremap.c:216 __ioremap_caller.constprop.0+0x131/0x35d
Call Trace:
memremap+0xcb/0x184
xlate_dev_mem_ptr+0x25/0x2f
write_mem+0x94/0xfb
vfs_write+0x128/0x26d
ksys_write+0xac/0xfe
do_syscall_64+0x9a/0xfd
entry_SYSCALL_64_after_hwframe+0x4b/0x53
The details of command execution process are as follows. In the above
resource tree, "System RAM" is a descendant of "CXL Window 0" instead of a
top level resource. So, region_intersects() will report no System RAM
resources in the CXL memory region incorrectly, because it only checks the
top level resources. Consequently, devmem_is_allowed() will return 1
(allow access via /dev/mem) for CXL memory region incorrectly.
Fortunately, ioremap() doesn't allow to map System RAM and reject the
access.
So, region_intersects() needs to be fixed to work correctly with the
resource tree with "System RAM" not at top level as above. To fix it, if
we find an unmatched resource in the top level, we will continue to search
matched resources in its descendant resources. So, we will not miss any
matched resources in resource tree anymore.
In the new implementation, an example resource tree
|------------- "CXL Window 0" ------------|
|-- "System RAM" --|
will behave similar as the following fake resource tree for
region_intersects(, IORESOURCE_SYSTEM_RAM, ),
|-- "System RAM" --||-- "CXL Window 0a" --|
Where "CXL Window 0a" is part of the original "CXL Window 0" that
isn't covered by "System RAM".
Link: https://lkml.kernel.org/r/20240906030713.204292-2-ying.huang@intel.com
Fixes: c221c0b0308f ("device-dax: "Hotplug" persistent memory for use like normal RAM")
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Jonathan Cameron <jonathan.cameron@huawei.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Alison Schofield <alison.schofield@intel.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-09-06 03:07:11 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
if (!covered)
|
|
|
|
other++;
|
2021-05-07 01:05:24 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if (type == 0)
|
|
|
|
return REGION_DISJOINT;
|
|
|
|
|
|
|
|
if (other == 0)
|
|
|
|
return REGION_INTERSECTS;
|
|
|
|
|
|
|
|
return REGION_MIXED;
|
|
|
|
}
|
|
|
|
|
2015-08-11 03:07:05 +00:00
|
|
|
/**
|
|
|
|
* region_intersects() - determine intersection of region with known resources
|
|
|
|
* @start: region start address
|
|
|
|
* @size: size of region
|
2016-01-26 20:57:28 +00:00
|
|
|
* @flags: flags of resource (in iomem_resource)
|
|
|
|
* @desc: descriptor of resource (in iomem_resource) or IORES_DESC_NONE
|
x86: optimize resource lookups for ioremap
We have a large university system in the UK that is experiencing very long
delays modprobing the driver for a specific I/O device. The delay is from
8-10 minutes per device and there are 31 devices in the system. This 4 to
5 hour delay in starting up those I/O devices is very much a burden on the
customer.
There are two causes for requiring a restart/reload of the drivers. First
is periodic preventive maintenance (PM) and the second is if any of the
devices experience a fatal error. Both of these trigger this excessively
long delay in bringing the system back up to full capability.
The problem was tracked down to a very slow IOREMAP operation and the
excessively long ioresource lookup to ensure that the user is not
attempting to ioremap RAM. These patches provide a speed up to that
function.
The modprobe time appears to be affected quite a bit by previous activity
on the ioresource list, which I suspect is due to cache preloading. While
the overall improvement is impacted by other overhead of starting the
devices, this drastically improves the modprobe time.
Also our system is considerably smaller so the percentages gained will not
be the same. Best case improvement with the modprobe on our 20 device
smallish system was from 'real 5m51.913s' to 'real 0m18.275s'.
This patch (of 2):
Since the ioremap operation is verifying that the specified address range
is NOT RAM, it will search the entire ioresource list if the condition is
true. To make matters worse, it does this one 4k page at a time. For a
128M BAR region this is 32 passes to determine the entire region does not
contain any RAM addresses.
This patch provides another resource lookup function, region_is_ram, that
searches for the entire region specified, verifying that it is completely
contained within the resource region. If it is found, then it is checked
to be RAM or not, within a single pass.
The return result reflects if it was found or not (-1), and whether it is
RAM (1) or not (0). This allows the caller to fallback to the previous
page by page search if it was not found.
[akpm@linux-foundation.org: fix spellos and typos in comment]
Signed-off-by: Mike Travis <travis@sgi.com>
Acked-by: Alex Thorlton <athorlton@sgi.com>
Reviewed-by: Cliff Wickman <cpw@sgi.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Mark Salter <msalter@redhat.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-10-13 22:54:03 +00:00
|
|
|
*
|
2015-08-11 03:07:05 +00:00
|
|
|
* Check if the specified region partially overlaps or fully eclipses a
|
2016-01-26 20:57:28 +00:00
|
|
|
* resource identified by @flags and @desc (optional with IORES_DESC_NONE).
|
|
|
|
* Return REGION_DISJOINT if the region does not overlap @flags/@desc,
|
|
|
|
* return REGION_MIXED if the region overlaps @flags/@desc and another
|
|
|
|
* resource, and return REGION_INTERSECTS if the region overlaps @flags/@desc
|
|
|
|
* and no other defined resource. Note that REGION_INTERSECTS is also
|
|
|
|
* returned in the case when the specified region overlaps RAM and undefined
|
|
|
|
* memory holes.
|
2015-08-11 03:07:05 +00:00
|
|
|
*
|
|
|
|
* region_intersects() is used by memory remapping functions to ensure
|
|
|
|
* the user is not remapping RAM and is a vast speed up over walking
|
|
|
|
* through the resource table page by page.
|
x86: optimize resource lookups for ioremap
We have a large university system in the UK that is experiencing very long
delays modprobing the driver for a specific I/O device. The delay is from
8-10 minutes per device and there are 31 devices in the system. This 4 to
5 hour delay in starting up those I/O devices is very much a burden on the
customer.
There are two causes for requiring a restart/reload of the drivers. First
is periodic preventive maintenance (PM) and the second is if any of the
devices experience a fatal error. Both of these trigger this excessively
long delay in bringing the system back up to full capability.
The problem was tracked down to a very slow IOREMAP operation and the
excessively long ioresource lookup to ensure that the user is not
attempting to ioremap RAM. These patches provide a speed up to that
function.
The modprobe time appears to be affected quite a bit by previous activity
on the ioresource list, which I suspect is due to cache preloading. While
the overall improvement is impacted by other overhead of starting the
devices, this drastically improves the modprobe time.
Also our system is considerably smaller so the percentages gained will not
be the same. Best case improvement with the modprobe on our 20 device
smallish system was from 'real 5m51.913s' to 'real 0m18.275s'.
This patch (of 2):
Since the ioremap operation is verifying that the specified address range
is NOT RAM, it will search the entire ioresource list if the condition is
true. To make matters worse, it does this one 4k page at a time. For a
128M BAR region this is 32 passes to determine the entire region does not
contain any RAM addresses.
This patch provides another resource lookup function, region_is_ram, that
searches for the entire region specified, verifying that it is completely
contained within the resource region. If it is found, then it is checked
to be RAM or not, within a single pass.
The return result reflects if it was found or not (-1), and whether it is
RAM (1) or not (0). This allows the caller to fallback to the previous
page by page search if it was not found.
[akpm@linux-foundation.org: fix spellos and typos in comment]
Signed-off-by: Mike Travis <travis@sgi.com>
Acked-by: Alex Thorlton <athorlton@sgi.com>
Reviewed-by: Cliff Wickman <cpw@sgi.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Mark Salter <msalter@redhat.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-10-13 22:54:03 +00:00
|
|
|
*/
|
2016-01-26 20:57:28 +00:00
|
|
|
int region_intersects(resource_size_t start, size_t size, unsigned long flags,
		      unsigned long desc)
{
	int verdict;

	/*
	 * The lookup below walks the tree rooted at iomem_resource, so take
	 * resource_lock for reading for the duration of the call and hand
	 * back whatever __region_intersects() reported.
	 */
	read_lock(&resource_lock);
	verdict = __region_intersects(&iomem_resource, start, size, flags, desc);
	read_unlock(&resource_lock);

	return verdict;
}
|
2016-01-26 20:57:28 +00:00
|
|
|
EXPORT_SYMBOL_GPL(region_intersects);
|
x86: optimize resource lookups for ioremap
We have a large university system in the UK that is experiencing very long
delays modprobing the driver for a specific I/O device. The delay is from
8-10 minutes per device and there are 31 devices in the system. This 4 to
5 hour delay in starting up those I/O devices is very much a burden on the
customer.
There are two causes for requiring a restart/reload of the drivers. First
is periodic preventive maintenance (PM) and the second is if any of the
devices experience a fatal error. Both of these trigger this excessively
long delay in bringing the system back up to full capability.
The problem was tracked down to a very slow IOREMAP operation and the
excessively long ioresource lookup to ensure that the user is not
attempting to ioremap RAM. These patches provide a speed up to that
function.
The modprobe time appears to be affected quite a bit by previous activity
on the ioresource list, which I suspect is due to cache preloading. While
the overall improvement is impacted by other overhead of starting the
devices, this drastically improves the modprobe time.
Also our system is considerably smaller so the percentages gained will not
be the same. Best case improvement with the modprobe on our 20 device
smallish system was from 'real 5m51.913s' to 'real 0m18.275s'.
This patch (of 2):
Since the ioremap operation is verifying that the specified address range
is NOT RAM, it will search the entire ioresource list if the condition is
true. To make matters worse, it does this one 4k page at a time. For a
128M BAR region this is 32 passes to determine the entire region does not
contain any RAM addresses.
This patch provides another resource lookup function, region_is_ram, that
searches for the entire region specified, verifying that it is completely
contained within the resource region. If it is found, then it is checked
to be RAM or not, within a single pass.
The return result reflects if it was found or not (-1), and whether it is
RAM (1) or not (0). This allows the caller to fallback to the previous
page by page search if it was not found.
[akpm@linux-foundation.org: fix spellos and typos in comment]
Signed-off-by: Mike Travis <travis@sgi.com>
Acked-by: Alex Thorlton <athorlton@sgi.com>
Reviewed-by: Cliff Wickman <cpw@sgi.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Mark Salter <msalter@redhat.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-10-13 22:54:03 +00:00
|
|
|
|
2010-12-16 17:38:46 +00:00
|
|
|
/*
 * Default hook called on each candidate gap while searching for free
 * resource space.  This version leaves @avail untouched; it is declared
 * __weak, so a non-weak definition elsewhere replaces it at link time.
 */
void __weak arch_remove_reservations(struct resource *avail)
{
	/* Nothing to trim by default. */
}
|
|
|
|
|
2010-10-26 21:41:18 +00:00
|
|
|
/*
 * Shrink @res in place so that it does not reach below @min or above @max.
 * A bound that is already satisfied leaves the matching field unwritten.
 */
static void resource_clip(struct resource *res, resource_size_t min,
			  resource_size_t max)
{
	if (min > res->start)
		res->start = min;

	if (max < res->end)
		res->end = max;
}
|
|
|
|
|
2005-04-16 22:20:36 +00:00
|
|
|
/*
|
2024-05-07 10:25:17 +00:00
|
|
|
* Find empty space in the resource tree with the given range and
|
2011-07-06 06:44:30 +00:00
|
|
|
* alignment constraints
|
2005-04-16 22:20:36 +00:00
|
|
|
*/
|
2024-05-07 10:25:17 +00:00
|
|
|
static int __find_resource_space(struct resource *root, struct resource *old,
|
|
|
|
struct resource *new, resource_size_t size,
|
|
|
|
struct resource_constraint *constraint)
|
2005-04-16 22:20:36 +00:00
|
|
|
{
|
|
|
|
struct resource *this = root->child;
|
2010-10-26 21:41:28 +00:00
|
|
|
struct resource tmp = *new, avail, alloc;
|
2024-05-07 10:25:20 +00:00
|
|
|
resource_alignf alignf = constraint->alignf;
|
2005-04-16 22:20:36 +00:00
|
|
|
|
2009-12-20 09:50:02 +00:00
|
|
|
tmp.start = root->start;
|
2005-04-16 22:20:36 +00:00
|
|
|
/*
|
2010-12-16 17:38:41 +00:00
|
|
|
* Skip past an allocated resource that starts at 0, since the assignment
|
|
|
|
* of this->start - 1 to tmp->end below would cause an underflow.
|
2005-04-16 22:20:36 +00:00
|
|
|
*/
|
2011-07-06 06:44:30 +00:00
|
|
|
if (this && this->start == root->start) {
|
|
|
|
tmp.start = (this == old) ? old->start : this->end + 1;
|
2005-04-16 22:20:36 +00:00
|
|
|
this = this->sibling;
|
|
|
|
}
|
2010-12-16 17:38:41 +00:00
|
|
|
for(;;) {
|
2005-04-16 22:20:36 +00:00
|
|
|
if (this)
|
2011-07-06 06:44:30 +00:00
|
|
|
tmp.end = (this == old) ? this->end : this->start - 1;
|
2005-04-16 22:20:36 +00:00
|
|
|
else
|
2009-12-20 09:50:02 +00:00
|
|
|
tmp.end = root->end;
|
2010-10-26 21:41:18 +00:00
|
|
|
|
2011-09-22 07:48:58 +00:00
|
|
|
if (tmp.end < tmp.start)
|
|
|
|
goto next;
|
|
|
|
|
2011-07-06 06:44:30 +00:00
|
|
|
resource_clip(&tmp, constraint->min, constraint->max);
|
2010-12-16 17:38:46 +00:00
|
|
|
arch_remove_reservations(&tmp);
|
2010-10-26 21:41:13 +00:00
|
|
|
|
2010-10-26 21:41:28 +00:00
|
|
|
/* Check for overflow after ALIGN() */
|
2011-07-06 06:44:30 +00:00
|
|
|
avail.start = ALIGN(tmp.start, constraint->align);
|
2010-10-26 21:41:28 +00:00
|
|
|
avail.end = tmp.end;
|
2014-02-05 03:32:28 +00:00
|
|
|
avail.flags = new->flags & ~IORESOURCE_UNSET;
|
2010-10-26 21:41:28 +00:00
|
|
|
if (avail.start >= tmp.start) {
|
2014-02-05 03:32:28 +00:00
|
|
|
alloc.flags = avail.flags;
|
2024-05-07 10:25:20 +00:00
|
|
|
if (alignf) {
|
|
|
|
alloc.start = alignf(constraint->alignf_data,
|
|
|
|
&avail, size, constraint->align);
|
|
|
|
} else {
|
|
|
|
alloc.start = avail.start;
|
|
|
|
}
|
2010-10-26 21:41:28 +00:00
|
|
|
alloc.end = alloc.start + size - 1;
|
2018-04-13 22:35:13 +00:00
|
|
|
if (alloc.start <= alloc.end &&
|
|
|
|
resource_contains(&avail, &alloc)) {
|
2010-10-26 21:41:28 +00:00
|
|
|
new->start = alloc.start;
|
|
|
|
new->end = alloc.end;
|
|
|
|
return 0;
|
|
|
|
}
|
2005-04-16 22:20:36 +00:00
|
|
|
}
|
2011-09-22 07:48:58 +00:00
|
|
|
|
|
|
|
next: if (!this || this->end == root->end)
|
2005-04-16 22:20:36 +00:00
|
|
|
break;
|
2011-09-22 07:48:58 +00:00
|
|
|
|
2011-07-06 06:44:30 +00:00
|
|
|
if (this != old)
|
|
|
|
tmp.start = this->end + 1;
|
2005-04-16 22:20:36 +00:00
|
|
|
this = this->sibling;
|
|
|
|
}
|
|
|
|
return -EBUSY;
|
|
|
|
}
|
|
|
|
|
2024-05-07 10:25:18 +00:00
|
|
|
/**
|
|
|
|
* find_resource_space - Find empty space in the resource tree
|
|
|
|
* @root: Root resource descriptor
|
|
|
|
* @new: Resource descriptor awaiting an empty resource space
|
|
|
|
* @size: The minimum size of the empty space
|
|
|
|
* @constraint: The range and alignment constraints to be met
|
|
|
|
*
|
|
|
|
* Finds an empty space under @root in the resource tree satisfying range and
|
|
|
|
* alignment @constraints.
|
|
|
|
*
|
|
|
|
* Return:
|
|
|
|
* * %0 - if successful, @new members start, end, and flags are altered.
|
|
|
|
* * %-EBUSY - if no empty space was found.
|
2011-07-06 06:44:30 +00:00
|
|
|
*/
|
2024-05-07 10:25:21 +00:00
|
|
|
int find_resource_space(struct resource *root, struct resource *new,
|
|
|
|
resource_size_t size,
|
|
|
|
struct resource_constraint *constraint)
|
2011-07-06 06:44:30 +00:00
|
|
|
{
|
2024-05-07 10:25:17 +00:00
|
|
|
return __find_resource_space(root, NULL, new, size, constraint);
|
2011-07-06 06:44:30 +00:00
|
|
|
}
|
2024-05-07 10:25:21 +00:00
|
|
|
EXPORT_SYMBOL_GPL(find_resource_space);
|
2011-07-06 06:44:30 +00:00
|
|
|
|
2006-10-03 08:13:51 +00:00
|
|
|
/**
|
2011-07-06 06:44:30 +00:00
|
|
|
* reallocate_resource - allocate a slot in the resource tree given range & alignment.
|
|
|
|
* The resource will be relocated if the new size cannot be reallocated in the
|
|
|
|
* current location.
|
|
|
|
*
|
|
|
|
* @root: root resource descriptor
|
|
|
|
* @old: resource descriptor desired by caller
|
|
|
|
* @newsize: new size of the resource descriptor
|
2024-10-09 12:57:51 +00:00
|
|
|
* @constraint: the memory range and alignment constraints to be met.
|
2011-07-06 06:44:30 +00:00
|
|
|
*/
|
2014-04-03 21:48:36 +00:00
|
|
|
static int reallocate_resource(struct resource *root, struct resource *old,
|
2018-10-09 14:11:21 +00:00
|
|
|
resource_size_t newsize,
|
|
|
|
struct resource_constraint *constraint)
|
2011-07-06 06:44:30 +00:00
|
|
|
{
|
|
|
|
int err=0;
|
|
|
|
struct resource new = *old;
|
|
|
|
struct resource *conflict;
|
|
|
|
|
|
|
|
write_lock(&resource_lock);
|
|
|
|
|
2024-05-07 10:25:17 +00:00
|
|
|
if ((err = __find_resource_space(root, old, &new, newsize, constraint)))
|
2011-07-06 06:44:30 +00:00
|
|
|
goto out;
|
|
|
|
|
|
|
|
if (resource_contains(&new, old)) {
|
|
|
|
old->start = new.start;
|
|
|
|
old->end = new.end;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (old->child) {
|
|
|
|
err = -EBUSY;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (resource_contains(old, &new)) {
|
|
|
|
old->start = new.start;
|
|
|
|
old->end = new.end;
|
|
|
|
} else {
|
2016-03-09 19:47:04 +00:00
|
|
|
__release_resource(old, true);
|
2011-07-06 06:44:30 +00:00
|
|
|
*old = new;
|
|
|
|
conflict = __request_resource(root, old);
|
|
|
|
BUG_ON(conflict);
|
|
|
|
}
|
|
|
|
out:
|
|
|
|
write_unlock(&resource_lock);
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* allocate_resource - allocate empty slot in the resource tree given range & alignment.
|
|
|
|
* The resource will be reallocated with a new size if it was already allocated
|
2006-10-03 08:13:51 +00:00
|
|
|
* @root: root resource descriptor
|
|
|
|
* @new: resource descriptor desired by caller
|
|
|
|
* @size: requested resource region size
|
2012-05-31 23:26:05 +00:00
|
|
|
* @min: minimum boundary to allocate
|
|
|
|
* @max: maximum boundary to allocate
|
2006-10-03 08:13:51 +00:00
|
|
|
* @align: alignment requested, in bytes
|
|
|
|
* @alignf: alignment function, optional, called if not NULL
|
|
|
|
* @alignf_data: arbitrary data to pass to the @alignf function
|
2005-04-16 22:20:36 +00:00
|
|
|
*/
|
|
|
|
int allocate_resource(struct resource *root, struct resource *new,
|
2006-06-12 23:09:23 +00:00
|
|
|
resource_size_t size, resource_size_t min,
|
|
|
|
resource_size_t max, resource_size_t align,
|
2024-05-07 10:25:19 +00:00
|
|
|
resource_alignf alignf,
|
2005-04-16 22:20:36 +00:00
|
|
|
void *alignf_data)
|
|
|
|
{
|
|
|
|
int err;
|
2011-07-06 06:44:30 +00:00
|
|
|
struct resource_constraint constraint;
|
2005-04-16 22:20:36 +00:00
|
|
|
|
2011-07-06 06:44:30 +00:00
|
|
|
constraint.min = min;
|
|
|
|
constraint.max = max;
|
|
|
|
constraint.align = align;
|
|
|
|
constraint.alignf = alignf;
|
|
|
|
constraint.alignf_data = alignf_data;
|
|
|
|
|
|
|
|
if ( new->parent ) {
|
|
|
|
/* resource is already allocated, try reallocating with
|
|
|
|
the new constraints */
|
|
|
|
return reallocate_resource(root, new, size, &constraint);
|
|
|
|
}
|
|
|
|
|
2005-04-16 22:20:36 +00:00
|
|
|
write_lock(&resource_lock);
|
2024-05-07 10:25:17 +00:00
|
|
|
err = find_resource_space(root, new, size, &constraint);
|
2005-04-16 22:20:36 +00:00
|
|
|
if (err >= 0 && __request_resource(root, new))
|
|
|
|
err = -EBUSY;
|
|
|
|
write_unlock(&resource_lock);
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
EXPORT_SYMBOL(allocate_resource);
|
|
|
|
|
2011-05-07 18:53:16 +00:00
|
|
|
/**
|
|
|
|
* lookup_resource - find an existing resource by a resource start address
|
|
|
|
* @root: root resource descriptor
|
|
|
|
* @start: resource start address
|
|
|
|
*
|
|
|
|
* Returns a pointer to the resource if found, NULL otherwise
|
|
|
|
*/
|
|
|
|
struct resource *lookup_resource(struct resource *root, resource_size_t start)
|
|
|
|
{
|
|
|
|
struct resource *res;
|
|
|
|
|
|
|
|
read_lock(&resource_lock);
|
|
|
|
for (res = root->child; res; res = res->sibling) {
|
|
|
|
if (res->start == start)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
read_unlock(&resource_lock);
|
|
|
|
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
2008-08-30 03:18:31 +00:00
|
|
|
/*
|
|
|
|
* Insert a resource into the resource tree. If successful, return NULL,
|
|
|
|
* otherwise return the conflicting resource (compare to __request_resource())
|
2005-04-16 22:20:36 +00:00
|
|
|
*/
|
2008-08-30 03:18:31 +00:00
|
|
|
static struct resource * __insert_resource(struct resource *parent, struct resource *new)
|
2005-04-16 22:20:36 +00:00
|
|
|
{
|
|
|
|
struct resource *first, *next;
|
|
|
|
|
2006-06-30 09:31:24 +00:00
|
|
|
for (;; parent = first) {
|
|
|
|
first = __request_resource(parent, new);
|
|
|
|
if (!first)
|
2008-08-30 03:18:31 +00:00
|
|
|
return first;
|
2006-06-30 09:31:24 +00:00
|
|
|
|
|
|
|
if (first == parent)
|
2008-08-30 03:18:31 +00:00
|
|
|
return first;
|
2010-10-27 22:34:52 +00:00
|
|
|
if (WARN_ON(first == new)) /* duplicated insertion */
|
|
|
|
return first;
|
2006-06-30 09:31:24 +00:00
|
|
|
|
|
|
|
if ((first->start > new->start) || (first->end < new->end))
|
|
|
|
break;
|
|
|
|
if ((first->start == new->start) && (first->end == new->end))
|
|
|
|
break;
|
2005-04-16 22:20:36 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
for (next = first; ; next = next->sibling) {
|
|
|
|
/* Partial overlap? Bad, and unfixable */
|
|
|
|
if (next->start < new->start || next->end > new->end)
|
2008-08-30 03:18:31 +00:00
|
|
|
return next;
|
2005-04-16 22:20:36 +00:00
|
|
|
if (!next->sibling)
|
|
|
|
break;
|
|
|
|
if (next->sibling->start > new->end)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
new->parent = parent;
|
|
|
|
new->sibling = next->sibling;
|
|
|
|
new->child = first;
|
|
|
|
|
|
|
|
next->sibling = NULL;
|
|
|
|
for (next = first; next; next = next->sibling)
|
|
|
|
next->parent = new;
|
|
|
|
|
|
|
|
if (parent->child == first) {
|
|
|
|
parent->child = new;
|
|
|
|
} else {
|
|
|
|
next = parent->child;
|
|
|
|
while (next->sibling != first)
|
|
|
|
next = next->sibling;
|
|
|
|
next->sibling = new;
|
|
|
|
}
|
2008-08-30 03:18:31 +00:00
|
|
|
return NULL;
|
|
|
|
}
|
2005-04-16 22:20:36 +00:00
|
|
|
|
2008-08-30 03:18:31 +00:00
|
|
|
/**
|
2010-03-12 00:01:09 +00:00
|
|
|
* insert_resource_conflict - Inserts resource in the resource tree
|
2008-08-30 03:18:31 +00:00
|
|
|
* @parent: parent of the new resource
|
|
|
|
* @new: new resource to insert
|
|
|
|
*
|
2010-03-12 00:01:09 +00:00
|
|
|
* Returns 0 on success, conflict resource if the resource can't be inserted.
|
2008-08-30 03:18:31 +00:00
|
|
|
*
|
2010-03-12 00:01:09 +00:00
|
|
|
* This function is equivalent to request_resource_conflict when no conflict
|
2008-08-30 03:18:31 +00:00
|
|
|
* happens. If a conflict happens, and the conflicting resources
|
|
|
|
* entirely fit within the range of the new resource, then the new
|
|
|
|
* resource is inserted and the conflicting resources become children of
|
|
|
|
* the new resource.
|
2016-03-09 19:47:04 +00:00
|
|
|
*
|
|
|
|
* This function is intended for producers of resources, such as FW modules
|
|
|
|
* and bus drivers.
|
2008-08-30 03:18:31 +00:00
|
|
|
*/
|
2010-03-12 00:01:09 +00:00
|
|
|
struct resource *insert_resource_conflict(struct resource *parent, struct resource *new)
|
2008-08-30 03:18:31 +00:00
|
|
|
{
|
|
|
|
struct resource *conflict;
|
|
|
|
|
|
|
|
write_lock(&resource_lock);
|
|
|
|
conflict = __insert_resource(parent, new);
|
|
|
|
write_unlock(&resource_lock);
|
2010-03-12 00:01:09 +00:00
|
|
|
return conflict;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* insert_resource - Inserts a resource in the resource tree
|
|
|
|
* @parent: parent of the new resource
|
|
|
|
* @new: new resource to insert
|
|
|
|
*
|
|
|
|
* Returns 0 on success, -EBUSY if the resource can't be inserted.
|
2016-03-09 19:47:04 +00:00
|
|
|
*
|
|
|
|
* This function is intended for producers of resources, such as FW modules
|
|
|
|
* and bus drivers.
|
2010-03-12 00:01:09 +00:00
|
|
|
*/
|
|
|
|
int insert_resource(struct resource *parent, struct resource *new)
|
|
|
|
{
|
|
|
|
struct resource *conflict;
|
|
|
|
|
|
|
|
conflict = insert_resource_conflict(parent, new);
|
2008-08-30 03:18:31 +00:00
|
|
|
return conflict ? -EBUSY : 0;
|
|
|
|
}
|
2016-03-09 19:47:05 +00:00
|
|
|
EXPORT_SYMBOL_GPL(insert_resource);
|
2008-08-30 03:18:31 +00:00
|
|
|
|
|
|
|
/**
|
|
|
|
* insert_resource_expand_to_fit - Insert a resource into the resource tree
|
2008-09-01 03:31:55 +00:00
|
|
|
* @root: root resource descriptor
|
2008-08-30 03:18:31 +00:00
|
|
|
* @new: new resource to insert
|
|
|
|
*
|
|
|
|
* Insert a resource into the resource tree, possibly expanding it in order
|
|
|
|
* to make it encompass any conflicting resources.
|
|
|
|
*/
|
|
|
|
void insert_resource_expand_to_fit(struct resource *root, struct resource *new)
|
|
|
|
{
|
|
|
|
if (new->parent)
|
|
|
|
return;
|
|
|
|
|
|
|
|
write_lock(&resource_lock);
|
|
|
|
for (;;) {
|
|
|
|
struct resource *conflict;
|
|
|
|
|
|
|
|
conflict = __insert_resource(root, new);
|
|
|
|
if (!conflict)
|
|
|
|
break;
|
|
|
|
if (conflict == root)
|
|
|
|
break;
|
|
|
|
|
|
|
|
/* Ok, expand resource to cover the conflict, then try again .. */
|
|
|
|
if (conflict->start < new->start)
|
|
|
|
new->start = conflict->start;
|
|
|
|
if (conflict->end > new->end)
|
|
|
|
new->end = conflict->end;
|
|
|
|
|
2022-11-09 15:56:17 +00:00
|
|
|
pr_info("Expanded resource %s due to conflict with %s\n", new->name, conflict->name);
|
2008-08-30 03:18:31 +00:00
|
|
|
}
|
2005-04-16 22:20:36 +00:00
|
|
|
write_unlock(&resource_lock);
|
|
|
|
}
|
2022-07-13 01:37:54 +00:00
|
|
|
/*
|
|
|
|
* Not for general consumption, only early boot memory map parsing, PCI
|
|
|
|
* resource discovery, and late discovery of CXL resources are expected
|
|
|
|
* to use this interface. The former are built-in and only the latter,
|
|
|
|
* CXL, is a module.
|
|
|
|
*/
|
|
|
|
EXPORT_SYMBOL_NS_GPL(insert_resource_expand_to_fit, CXL);
|
2005-04-16 22:20:36 +00:00
|
|
|
|
2016-03-09 19:47:04 +00:00
|
|
|
/**
|
|
|
|
* remove_resource - Remove a resource in the resource tree
|
|
|
|
* @old: resource to remove
|
|
|
|
*
|
|
|
|
* Returns 0 on success, -EINVAL if the resource is not valid.
|
|
|
|
*
|
|
|
|
* This function removes a resource previously inserted by insert_resource()
|
|
|
|
* or insert_resource_conflict(), and moves the children (if any) up to
|
|
|
|
* where they were before. insert_resource() and insert_resource_conflict()
|
|
|
|
* insert a new resource, and move any conflicting resources down to the
|
|
|
|
* children of the new resource.
|
|
|
|
*
|
|
|
|
* insert_resource(), insert_resource_conflict() and remove_resource() are
|
|
|
|
* intended for producers of resources, such as FW modules and bus drivers.
|
|
|
|
*/
|
|
|
|
int remove_resource(struct resource *old)
|
|
|
|
{
|
|
|
|
int retval;
|
|
|
|
|
|
|
|
write_lock(&resource_lock);
|
|
|
|
retval = __release_resource(old, false);
|
|
|
|
write_unlock(&resource_lock);
|
|
|
|
return retval;
|
|
|
|
}
|
2016-03-09 19:47:05 +00:00
|
|
|
EXPORT_SYMBOL_GPL(remove_resource);
|
2016-03-09 19:47:04 +00:00
|
|
|
|
2013-04-29 22:08:17 +00:00
|
|
|
/*
 * Change @res to cover [@start, @start + @size - 1] if the new span stays
 * inside the parent, does not touch either neighbouring sibling and still
 * covers every child of @res.  A resource without a parent is only checked
 * against its children.
 *
 * Returns 0 after updating @res, or -EBUSY (leaving @res untouched) when
 * one of the checks fails.
 */
static int __adjust_resource(struct resource *res, resource_size_t start,
				resource_size_t size)
{
	struct resource *parent = res->parent;
	struct resource *prev, *child;
	resource_size_t end = start + size - 1;

	if (parent) {
		/* Must remain within the parent's span. */
		if (start < parent->start || end > parent->end)
			return -EBUSY;

		/* Must not run into the following sibling. */
		if (res->sibling && res->sibling->start <= end)
			return -EBUSY;

		/* Must not run into the preceding sibling, if there is one. */
		prev = parent->child;
		if (prev != res) {
			while (prev->sibling != res)
				prev = prev->sibling;
			if (start <= prev->end)
				return -EBUSY;
		}
	}

	/* Every existing child has to stay inside the new span. */
	for (child = res->child; child; child = child->sibling) {
		if (child->start < start || child->end > end)
			return -EBUSY;
	}

	res->start = start;
	res->end = end;
	return 0;
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* adjust_resource - modify a resource's start and size
|
|
|
|
* @res: resource to modify
|
|
|
|
* @start: new start value
|
|
|
|
* @size: new size
|
|
|
|
*
|
|
|
|
* Given an existing resource, change its start and size to match the
|
|
|
|
* arguments. Returns 0 on success, -EBUSY if it can't fit.
|
|
|
|
* Existing children of the resource are assumed to be immutable.
|
|
|
|
*/
|
|
|
|
int adjust_resource(struct resource *res, resource_size_t start,
|
2018-10-09 14:11:21 +00:00
|
|
|
resource_size_t size)
|
2013-04-29 22:08:17 +00:00
|
|
|
{
|
|
|
|
int result;
|
|
|
|
|
|
|
|
write_lock(&resource_lock);
|
|
|
|
result = __adjust_resource(res, start, size);
|
2005-04-16 22:20:36 +00:00
|
|
|
write_unlock(&resource_lock);
|
|
|
|
return result;
|
|
|
|
}
|
2012-02-03 13:42:39 +00:00
|
|
|
EXPORT_SYMBOL(adjust_resource);
|
2005-04-16 22:20:36 +00:00
|
|
|
|
2018-10-09 14:11:21 +00:00
|
|
|
static void __init
|
|
|
|
__reserve_region_with_split(struct resource *root, resource_size_t start,
|
|
|
|
resource_size_t end, const char *name)
|
2008-09-04 19:02:44 +00:00
|
|
|
{
|
|
|
|
struct resource *parent = root;
|
|
|
|
struct resource *conflict;
|
mem hotunplug: fix kfree() of bootmem memory
When hot removing memory presented at boot time, following messages are shown:
kernel BUG at mm/slub.c:3409!
invalid opcode: 0000 [#1] SMP
Modules linked in: ebtable_nat ebtables xt_CHECKSUM iptable_mangle bridge stp llc ipmi_devintf ipmi_msghandler sunrpc ipt_REJECT nf_conntrack_ipv4 nf_defrag_ipv4 iptable_filter ip_tables ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 xt_state nf_conntrack ip6table_filter ip6_tables binfmt_misc vfat fat dm_mirror dm_region_hash dm_log dm_mod vhost_net macvtap macvlan tun uinput iTCO_wdt iTCO_vendor_support coretemp kvm_intel kvm crc32c_intel ghash_clmulni_intel microcode pcspkr sg i2c_i801 lpc_ich mfd_core igb i2c_algo_bit i2c_core e1000e ptp pps_core tpm_infineon ioatdma dca sr_mod cdrom sd_mod crc_t10dif usb_storage megaraid_sas lpfc scsi_transport_fc scsi_tgt scsi_mod
CPU 0
Pid: 5091, comm: kworker/0:2 Tainted: G W 3.9.0-rc6+ #15
RIP: kfree+0x232/0x240
Process kworker/0:2 (pid: 5091, threadinfo ffff88084678c000, task ffff88083928ca80)
Call Trace:
__release_region+0xd4/0xe0
__remove_pages+0x52/0x110
arch_remove_memory+0x89/0xd0
remove_memory+0xc4/0x100
acpi_memory_device_remove+0x6d/0xb1
acpi_device_remove+0x89/0xab
__device_release_driver+0x7c/0xf0
device_release_driver+0x2f/0x50
acpi_bus_device_detach+0x6c/0x70
acpi_ns_walk_namespace+0x11a/0x250
acpi_walk_namespace+0xee/0x137
acpi_bus_trim+0x33/0x7a
acpi_bus_hot_remove_device+0xc4/0x1a1
acpi_os_execute_deferred+0x27/0x34
process_one_work+0x1f7/0x590
worker_thread+0x11a/0x370
kthread+0xee/0x100
ret_from_fork+0x7c/0xb0
RIP [<ffffffff811c41d2>] kfree+0x232/0x240
RSP <ffff88084678d968>
The reason why the messages are shown is to release a resource
structure, allocated by bootmem, by kfree(). So when we release a
resource structure, we should check whether it is allocated by bootmem
or not.
But even if we know a resource structure is allocated by bootmem, we
cannot release it since SLxB cannot treat it. So for reusing a resource
structure, this patch remembers it by using bootmem_resource as follows:
When releasing a resource structure by free_resource(), free_resource()
checks whether the resource structure is allocated by bootmem or not.
If it is allocated by bootmem, free_resource() adds it to
bootmem_resource. If it is not allocated by bootmem, free_resource()
release it by kfree().
And when getting a new resource structure by get_resource(),
get_resource() checks whether bootmem_resource has released resource
structures or not. If there is a released resource structure,
get_resource() returns it. If there is not a releaed resource
structure, get_resource() returns new resource structure allocated by
kzalloc().
[akpm@linux-foundation.org: s/get_resource/alloc_resource/]
Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Reviewed-by: Toshi Kani <toshi.kani@hp.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Ram Pai <linuxram@us.ibm.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-04-29 22:08:56 +00:00
|
|
|
struct resource *res = alloc_resource(GFP_ATOMIC);
|
2012-10-05 00:16:55 +00:00
|
|
|
struct resource *next_res = NULL;
|
2017-12-01 20:07:18 +00:00
|
|
|
int type = resource_type(root);
|
2008-09-04 19:02:44 +00:00
|
|
|
|
|
|
|
if (!res)
|
|
|
|
return;
|
|
|
|
|
|
|
|
res->name = name;
|
|
|
|
res->start = start;
|
|
|
|
res->end = end;
|
2017-12-01 20:07:18 +00:00
|
|
|
res->flags = type | IORESOURCE_BUSY;
|
2016-01-26 20:57:19 +00:00
|
|
|
res->desc = IORES_DESC_NONE;
|
2008-09-04 19:02:44 +00:00
|
|
|
|
2012-10-05 00:16:55 +00:00
|
|
|
while (1) {
|
2008-09-04 19:02:44 +00:00
|
|
|
|
2012-10-05 00:16:55 +00:00
|
|
|
conflict = __request_resource(parent, res);
|
|
|
|
if (!conflict) {
|
|
|
|
if (!next_res)
|
|
|
|
break;
|
|
|
|
res = next_res;
|
|
|
|
next_res = NULL;
|
|
|
|
continue;
|
|
|
|
}
|
2008-09-04 19:02:44 +00:00
|
|
|
|
2012-10-05 00:16:55 +00:00
|
|
|
/* conflict covered whole area */
|
|
|
|
if (conflict->start <= res->start &&
|
|
|
|
conflict->end >= res->end) {
|
mem hotunplug: fix kfree() of bootmem memory
When hot removing memory presented at boot time, following messages are shown:
kernel BUG at mm/slub.c:3409!
invalid opcode: 0000 [#1] SMP
Modules linked in: ebtable_nat ebtables xt_CHECKSUM iptable_mangle bridge stp llc ipmi_devintf ipmi_msghandler sunrpc ipt_REJECT nf_conntrack_ipv4 nf_defrag_ipv4 iptable_filter ip_tables ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 xt_state nf_conntrack ip6table_filter ip6_tables binfmt_misc vfat fat dm_mirror dm_region_hash dm_log dm_mod vhost_net macvtap macvlan tun uinput iTCO_wdt iTCO_vendor_support coretemp kvm_intel kvm crc32c_intel ghash_clmulni_intel microcode pcspkr sg i2c_i801 lpc_ich mfd_core igb i2c_algo_bit i2c_core e1000e ptp pps_core tpm_infineon ioatdma dca sr_mod cdrom sd_mod crc_t10dif usb_storage megaraid_sas lpfc scsi_transport_fc scsi_tgt scsi_mod
CPU 0
Pid: 5091, comm: kworker/0:2 Tainted: G W 3.9.0-rc6+ #15
RIP: kfree+0x232/0x240
Process kworker/0:2 (pid: 5091, threadinfo ffff88084678c000, task ffff88083928ca80)
Call Trace:
__release_region+0xd4/0xe0
__remove_pages+0x52/0x110
arch_remove_memory+0x89/0xd0
remove_memory+0xc4/0x100
acpi_memory_device_remove+0x6d/0xb1
acpi_device_remove+0x89/0xab
__device_release_driver+0x7c/0xf0
device_release_driver+0x2f/0x50
acpi_bus_device_detach+0x6c/0x70
acpi_ns_walk_namespace+0x11a/0x250
acpi_walk_namespace+0xee/0x137
acpi_bus_trim+0x33/0x7a
acpi_bus_hot_remove_device+0xc4/0x1a1
acpi_os_execute_deferred+0x27/0x34
process_one_work+0x1f7/0x590
worker_thread+0x11a/0x370
kthread+0xee/0x100
ret_from_fork+0x7c/0xb0
RIP [<ffffffff811c41d2>] kfree+0x232/0x240
RSP <ffff88084678d968>
The reason why the messages are shown is to release a resource
structure, allocated by bootmem, by kfree(). So when we release a
resource structure, we should check whether it is allocated by bootmem
or not.
But even if we know a resource structure is allocated by bootmem, we
cannot release it since SLxB cannot treat it. So for reusing a resource
structure, this patch remembers it by using bootmem_resource as follows:
When releasing a resource structure by free_resource(), free_resource()
checks whether the resource structure is allocated by bootmem or not.
If it is allocated by bootmem, free_resource() adds it to
bootmem_resource. If it is not allocated by bootmem, free_resource()
release it by kfree().
And when getting a new resource structure by get_resource(),
get_resource() checks whether bootmem_resource has released resource
structures or not. If there is a released resource structure,
get_resource() returns it. If there is not a releaed resource
structure, get_resource() returns new resource structure allocated by
kzalloc().
[akpm@linux-foundation.org: s/get_resource/alloc_resource/]
Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Reviewed-by: Toshi Kani <toshi.kani@hp.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Ram Pai <linuxram@us.ibm.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-04-29 22:08:56 +00:00
|
|
|
free_resource(res);
|
2012-10-05 00:16:55 +00:00
|
|
|
WARN_ON(next_res);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* failed, split and try again */
|
|
|
|
if (conflict->start > res->start) {
|
|
|
|
end = res->end;
|
|
|
|
res->end = conflict->start - 1;
|
|
|
|
if (conflict->end < end) {
|
mem hotunplug: fix kfree() of bootmem memory
When hot removing memory presented at boot time, following messages are shown:
kernel BUG at mm/slub.c:3409!
invalid opcode: 0000 [#1] SMP
Modules linked in: ebtable_nat ebtables xt_CHECKSUM iptable_mangle bridge stp llc ipmi_devintf ipmi_msghandler sunrpc ipt_REJECT nf_conntrack_ipv4 nf_defrag_ipv4 iptable_filter ip_tables ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 xt_state nf_conntrack ip6table_filter ip6_tables binfmt_misc vfat fat dm_mirror dm_region_hash dm_log dm_mod vhost_net macvtap macvlan tun uinput iTCO_wdt iTCO_vendor_support coretemp kvm_intel kvm crc32c_intel ghash_clmulni_intel microcode pcspkr sg i2c_i801 lpc_ich mfd_core igb i2c_algo_bit i2c_core e1000e ptp pps_core tpm_infineon ioatdma dca sr_mod cdrom sd_mod crc_t10dif usb_storage megaraid_sas lpfc scsi_transport_fc scsi_tgt scsi_mod
CPU 0
Pid: 5091, comm: kworker/0:2 Tainted: G W 3.9.0-rc6+ #15
RIP: kfree+0x232/0x240
Process kworker/0:2 (pid: 5091, threadinfo ffff88084678c000, task ffff88083928ca80)
Call Trace:
__release_region+0xd4/0xe0
__remove_pages+0x52/0x110
arch_remove_memory+0x89/0xd0
remove_memory+0xc4/0x100
acpi_memory_device_remove+0x6d/0xb1
acpi_device_remove+0x89/0xab
__device_release_driver+0x7c/0xf0
device_release_driver+0x2f/0x50
acpi_bus_device_detach+0x6c/0x70
acpi_ns_walk_namespace+0x11a/0x250
acpi_walk_namespace+0xee/0x137
acpi_bus_trim+0x33/0x7a
acpi_bus_hot_remove_device+0xc4/0x1a1
acpi_os_execute_deferred+0x27/0x34
process_one_work+0x1f7/0x590
worker_thread+0x11a/0x370
kthread+0xee/0x100
ret_from_fork+0x7c/0xb0
RIP [<ffffffff811c41d2>] kfree+0x232/0x240
RSP <ffff88084678d968>
The reason why the messages are shown is to release a resource
structure, allocated by bootmem, by kfree(). So when we release a
resource structure, we should check whether it is allocated by bootmem
or not.
But even if we know a resource structure is allocated by bootmem, we
cannot release it since SLxB cannot treat it. So for reusing a resource
structure, this patch remembers it by using bootmem_resource as follows:
When releasing a resource structure by free_resource(), free_resource()
checks whether the resource structure is allocated by bootmem or not.
If it is allocated by bootmem, free_resource() adds it to
bootmem_resource. If it is not allocated by bootmem, free_resource()
release it by kfree().
And when getting a new resource structure by get_resource(),
get_resource() checks whether bootmem_resource has released resource
structures or not. If there is a released resource structure,
get_resource() returns it. If there is not a releaed resource
structure, get_resource() returns new resource structure allocated by
kzalloc().
[akpm@linux-foundation.org: s/get_resource/alloc_resource/]
Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Reviewed-by: Toshi Kani <toshi.kani@hp.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Ram Pai <linuxram@us.ibm.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-04-29 22:08:56 +00:00
|
|
|
next_res = alloc_resource(GFP_ATOMIC);
|
2012-10-05 00:16:55 +00:00
|
|
|
if (!next_res) {
|
mem hotunplug: fix kfree() of bootmem memory
When hot removing memory presented at boot time, following messages are shown:
kernel BUG at mm/slub.c:3409!
invalid opcode: 0000 [#1] SMP
Modules linked in: ebtable_nat ebtables xt_CHECKSUM iptable_mangle bridge stp llc ipmi_devintf ipmi_msghandler sunrpc ipt_REJECT nf_conntrack_ipv4 nf_defrag_ipv4 iptable_filter ip_tables ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 xt_state nf_conntrack ip6table_filter ip6_tables binfmt_misc vfat fat dm_mirror dm_region_hash dm_log dm_mod vhost_net macvtap macvlan tun uinput iTCO_wdt iTCO_vendor_support coretemp kvm_intel kvm crc32c_intel ghash_clmulni_intel microcode pcspkr sg i2c_i801 lpc_ich mfd_core igb i2c_algo_bit i2c_core e1000e ptp pps_core tpm_infineon ioatdma dca sr_mod cdrom sd_mod crc_t10dif usb_storage megaraid_sas lpfc scsi_transport_fc scsi_tgt scsi_mod
CPU 0
Pid: 5091, comm: kworker/0:2 Tainted: G W 3.9.0-rc6+ #15
RIP: kfree+0x232/0x240
Process kworker/0:2 (pid: 5091, threadinfo ffff88084678c000, task ffff88083928ca80)
Call Trace:
__release_region+0xd4/0xe0
__remove_pages+0x52/0x110
arch_remove_memory+0x89/0xd0
remove_memory+0xc4/0x100
acpi_memory_device_remove+0x6d/0xb1
acpi_device_remove+0x89/0xab
__device_release_driver+0x7c/0xf0
device_release_driver+0x2f/0x50
acpi_bus_device_detach+0x6c/0x70
acpi_ns_walk_namespace+0x11a/0x250
acpi_walk_namespace+0xee/0x137
acpi_bus_trim+0x33/0x7a
acpi_bus_hot_remove_device+0xc4/0x1a1
acpi_os_execute_deferred+0x27/0x34
process_one_work+0x1f7/0x590
worker_thread+0x11a/0x370
kthread+0xee/0x100
ret_from_fork+0x7c/0xb0
RIP [<ffffffff811c41d2>] kfree+0x232/0x240
RSP <ffff88084678d968>
The reason why the messages are shown is to release a resource
structure, allocated by bootmem, by kfree(). So when we release a
resource structure, we should check whether it is allocated by bootmem
or not.
But even if we know a resource structure is allocated by bootmem, we
cannot release it since SLxB cannot treat it. So for reusing a resource
structure, this patch remembers it by using bootmem_resource as follows:
When releasing a resource structure by free_resource(), free_resource()
checks whether the resource structure is allocated by bootmem or not.
If it is allocated by bootmem, free_resource() adds it to
bootmem_resource. If it is not allocated by bootmem, free_resource()
release it by kfree().
And when getting a new resource structure by get_resource(),
get_resource() checks whether bootmem_resource has released resource
structures or not. If there is a released resource structure,
get_resource() returns it. If there is not a releaed resource
structure, get_resource() returns new resource structure allocated by
kzalloc().
[akpm@linux-foundation.org: s/get_resource/alloc_resource/]
Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Reviewed-by: Toshi Kani <toshi.kani@hp.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Ram Pai <linuxram@us.ibm.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-04-29 22:08:56 +00:00
|
|
|
free_resource(res);
|
2012-10-05 00:16:55 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
next_res->name = name;
|
|
|
|
next_res->start = conflict->end + 1;
|
|
|
|
next_res->end = end;
|
2017-12-01 20:07:18 +00:00
|
|
|
next_res->flags = type | IORESOURCE_BUSY;
|
2016-01-26 20:57:19 +00:00
|
|
|
next_res->desc = IORES_DESC_NONE;
|
2012-10-05 00:16:55 +00:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
res->start = conflict->end + 1;
|
|
|
|
}
|
|
|
|
}
|
2008-09-04 19:02:44 +00:00
|
|
|
|
|
|
|
}
|
|
|
|
|
2018-10-09 14:11:21 +00:00
|
|
|
void __init
|
|
|
|
reserve_region_with_split(struct resource *root, resource_size_t start,
|
|
|
|
resource_size_t end, const char *name)
|
2008-09-04 19:02:44 +00:00
|
|
|
{
|
2012-07-30 21:42:58 +00:00
|
|
|
int abort = 0;
|
|
|
|
|
2008-09-04 19:02:44 +00:00
|
|
|
write_lock(&resource_lock);
|
2012-07-30 21:42:58 +00:00
|
|
|
if (root->start > start || root->end < end) {
|
|
|
|
pr_err("requested range [0x%llx-0x%llx] not in root %pr\n",
|
|
|
|
(unsigned long long)start, (unsigned long long)end,
|
|
|
|
root);
|
|
|
|
if (start > root->end || end < root->start)
|
|
|
|
abort = 1;
|
|
|
|
else {
|
|
|
|
if (end > root->end)
|
|
|
|
end = root->end;
|
|
|
|
if (start < root->start)
|
|
|
|
start = root->start;
|
|
|
|
pr_err("fixing request to [0x%llx-0x%llx]\n",
|
|
|
|
(unsigned long long)start,
|
|
|
|
(unsigned long long)end);
|
|
|
|
}
|
|
|
|
dump_stack();
|
|
|
|
}
|
|
|
|
if (!abort)
|
|
|
|
__reserve_region_with_split(root, start, end, name);
|
2008-09-04 19:02:44 +00:00
|
|
|
write_unlock(&resource_lock);
|
|
|
|
}
|
|
|
|
|
PCI: clean up resource alignment management
Done per Linus' request and suggestions. Linus has explained that
better than I'll be able to explain:
On Thu, Mar 27, 2008 at 10:12:10AM -0700, Linus Torvalds wrote:
> Actually, before we go any further, there might be a less intrusive
> alternative: add just a couple of flags to the resource flags field (we
> still have something like 8 unused bits on 32-bit), and use those to
> implement a generic "resource_alignment()" routine.
>
> Two flags would do it:
>
> - IORESOURCE_SIZEALIGN: size indicates alignment (regular PCI device
> resources)
>
> - IORESOURCE_STARTALIGN: start field is alignment (PCI bus resources
> during probing)
>
> and then the case of both flags zero (or both bits set) would actually be
> "invalid", and we would also clear the IORESOURCE_STARTALIGN flag when we
> actually allocate the resource (so that we don't use the "start" field as
> alignment incorrectly when it no longer indicates alignment).
>
> That wouldn't be totally generic, but it would have the nice property of
> automatically at least add sanity checking for that whole "res->start has
> the odd meaning of 'alignment' during probing" and remove the need for a
> new field, and it would allow us to have a generic "resource_alignment()"
> routine that just gets a resource pointer.
Besides, I removed IORESOURCE_BUS_HAS_VGA flag which was unused for ages.
Signed-off-by: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Gary Hade <garyhade@us.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
2008-03-30 15:50:14 +00:00
|
|
|
/**
|
|
|
|
* resource_alignment - calculate resource's alignment
|
|
|
|
* @res: resource pointer
|
|
|
|
*
|
|
|
|
* Returns alignment on success, 0 (invalid alignment) on failure.
|
|
|
|
*/
|
|
|
|
resource_size_t resource_alignment(struct resource *res)
|
|
|
|
{
|
|
|
|
switch (res->flags & (IORESOURCE_SIZEALIGN | IORESOURCE_STARTALIGN)) {
|
|
|
|
case IORESOURCE_SIZEALIGN:
|
2008-07-30 05:32:57 +00:00
|
|
|
return resource_size(res);
|
PCI: clean up resource alignment management
Done per Linus' request and suggestions. Linus has explained that
better than I'll be able to explain:
On Thu, Mar 27, 2008 at 10:12:10AM -0700, Linus Torvalds wrote:
> Actually, before we go any further, there might be a less intrusive
> alternative: add just a couple of flags to the resource flags field (we
> still have something like 8 unused bits on 32-bit), and use those to
> implement a generic "resource_alignment()" routine.
>
> Two flags would do it:
>
> - IORESOURCE_SIZEALIGN: size indicates alignment (regular PCI device
> resources)
>
> - IORESOURCE_STARTALIGN: start field is alignment (PCI bus resources
> during probing)
>
> and then the case of both flags zero (or both bits set) would actually be
> "invalid", and we would also clear the IORESOURCE_STARTALIGN flag when we
> actually allocate the resource (so that we don't use the "start" field as
> alignment incorrectly when it no longer indicates alignment).
>
> That wouldn't be totally generic, but it would have the nice property of
> automatically at least add sanity checking for that whole "res->start has
> the odd meaning of 'alignment' during probing" and remove the need for a
> new field, and it would allow us to have a generic "resource_alignment()"
> routine that just gets a resource pointer.
Besides, I removed IORESOURCE_BUS_HAS_VGA flag which was unused for ages.
Signed-off-by: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Gary Hade <garyhade@us.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
2008-03-30 15:50:14 +00:00
|
|
|
case IORESOURCE_STARTALIGN:
|
|
|
|
return res->start;
|
|
|
|
default:
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-04-16 22:20:36 +00:00
|
|
|
/*
|
|
|
|
* This is compatibility stuff for IO resources.
|
|
|
|
*
|
|
|
|
* Note how this, unlike the above, knows about
|
|
|
|
* the IO flag meanings (busy etc).
|
|
|
|
*
|
2006-10-03 08:13:51 +00:00
|
|
|
* request_region creates a new busy region.
|
2005-04-16 22:20:36 +00:00
|
|
|
*
|
2006-10-03 08:13:51 +00:00
|
|
|
* release_region releases a matching busy region.
|
|
|
|
*/
|
|
|
|
|
2010-03-29 17:38:00 +00:00
|
|
|
/*
 * Requesters that collide with a busy IORESOURCE_MUXED region queue
 * themselves here; __release_region() wakes the queue when it frees a
 * muxed region.
 */
static DECLARE_WAIT_QUEUE_HEAD(muxed_resource_wait);
|
|
|
|
|
2020-11-27 16:41:24 +00:00
|
|
|
/*
 * Inode whose i_mapping is handed out by iomem_get_mapping() and unmapped
 * by revoke_iomem(). Readers load it with smp_load_acquire(); it may still
 * be NULL when revoke_iomem() runs early, which that function checks for.
 */
static struct inode *iomem_inode;
|
|
|
|
|
|
|
|
#ifdef CONFIG_IO_STRICT_DEVMEM
/*
 * Tear down any existing user mappings that cover @res, so that a region
 * which has just been claimed can no longer be reached through mappings
 * established on the iomem inode.
 */
static void revoke_iomem(struct resource *res)
{
	struct inode *inode;

	/* pairs with smp_store_release() in iomem_init_inode() */
	inode = smp_load_acquire(&iomem_inode);

	/*
	 * Initialization may not have completed yet. Losing that race is
	 * fine: it means drivers are claiming resources before the
	 * fs_initcall level of init, at which point iomem_get_mapping users
	 * cannot have established any mappings.
	 */
	if (inode == NULL)
		return;

	/*
	 * By now the driver is expected to have marked the resource busy,
	 * so devmem_is_allowed() should be returning false. For performance
	 * only the first and last page of the range are probed rather than
	 * the whole range.
	 *
	 * If both ends are still allowed, iomem=relaxed is in effect
	 * (*cringe* "go ahead, what's the worst that can happen?") and the
	 * mappings are left alone.
	 */
	if (!devmem_is_allowed(PHYS_PFN(res->start)) ||
	    !devmem_is_allowed(PHYS_PFN(res->end)))
		unmap_mapping_range(inode->i_mapping, res->start,
				    resource_size(res), 1);
}
#else
/* Without CONFIG_IO_STRICT_DEVMEM there is nothing to revoke. */
static void revoke_iomem(struct resource *res) {}
#endif
|
|
|
|
|
|
|
|
struct address_space *iomem_get_mapping(void)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* This function is only called from file open paths, hence guaranteed
|
|
|
|
* that fs_initcalls have completed and no need to check for NULL. But
|
|
|
|
* since revoke_iomem can be called before the initcall we still need
|
|
|
|
* the barrier to appease checkers.
|
|
|
|
*/
|
|
|
|
return smp_load_acquire(&iomem_inode)->i_mapping;
|
|
|
|
}
|
|
|
|
|
2021-05-07 01:05:27 +00:00
|
|
|
/*
 * __request_region_locked - claim [start, start + n - 1] as a busy region
 * @res:    caller-allocated resource to fill in and insert
 * @parent: resource to start the search from
 * @start:  region start address
 * @n:      region size
 * @name:   reserving caller's ID string
 * @flags:  extra IO resource flags ORed into the new resource
 *
 * Must be called with resource_lock held for writing. The lock is dropped
 * and re-taken while waiting for a conflicting IORESOURCE_MUXED region to
 * be released.
 *
 * On a conflict with a non-busy child, the request is retried one level
 * down with that child as the new parent, inheriting its type and desc.
 *
 * Returns 0 on success, -EBUSY if the range conflicts with a busy resource
 * that cannot be waited for.
 */
static int __request_region_locked(struct resource *res, struct resource *parent,
				   resource_size_t start, resource_size_t n,
				   const char *name, int flags)
{
	DECLARE_WAITQUEUE(wait, current);

	res->name = name;
	res->start = start;
	res->end = start + n - 1;

	for (;;) {
		struct resource *conflict;

		/* Re-derived each pass because parent may have changed. */
		res->flags = resource_type(parent) | resource_ext_type(parent);
		res->flags |= IORESOURCE_BUSY | flags;
		res->desc = parent->desc;

		conflict = __request_resource(parent, res);
		if (!conflict)
			break;
		/*
		 * mm/hmm.c reserves physical addresses which then
		 * become unavailable to other users.  Conflicts are
		 * not expected.  Warn to aid debugging if encountered.
		 */
		if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) {
			pr_warn("Unaddressable device %s %pR conflicts with %pR\n",
				conflict->name, conflict, res);
		}
		if (conflict != parent) {
			/* Non-busy child: descend and retry inside it. */
			if (!(conflict->flags & IORESOURCE_BUSY)) {
				parent = conflict;
				continue;
			}
		}
		if (conflict->flags & flags & IORESOURCE_MUXED) {
			add_wait_queue(&muxed_resource_wait, &wait);
			/*
			 * Mark ourselves as sleeping *before* dropping the
			 * lock. The releaser removes the region under
			 * resource_lock and calls wake_up() afterwards; if
			 * the state were set only after unlocking, a wake-up
			 * arriving in between would be lost and schedule()
			 * could sleep indefinitely.
			 */
			set_current_state(TASK_UNINTERRUPTIBLE);
			write_unlock(&resource_lock);
			schedule();
			remove_wait_queue(&muxed_resource_wait, &wait);
			write_lock(&resource_lock);
			continue;
		}
		/* Uhhuh, that didn't work out.. */
		return -EBUSY;
	}

	return 0;
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* __request_region - create a new busy resource region
|
|
|
|
* @parent: parent resource descriptor
|
|
|
|
* @start: resource start address
|
|
|
|
* @n: resource region size
|
|
|
|
* @name: reserving caller's ID string
|
|
|
|
* @flags: IO resource flags
|
|
|
|
*/
|
|
|
|
struct resource *__request_region(struct resource *parent,
|
|
|
|
resource_size_t start, resource_size_t n,
|
|
|
|
const char *name, int flags)
|
|
|
|
{
|
|
|
|
struct resource *res = alloc_resource(GFP_KERNEL);
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
if (!res)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
write_lock(&resource_lock);
|
|
|
|
ret = __request_region_locked(res, parent, start, n, name, flags);
|
2008-10-16 05:05:14 +00:00
|
|
|
write_unlock(&resource_lock);
|
2020-05-21 21:06:17 +00:00
|
|
|
|
2021-05-07 01:05:27 +00:00
|
|
|
if (ret) {
|
|
|
|
free_resource(res);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (parent == &iomem_resource)
|
2020-11-27 16:41:24 +00:00
|
|
|
revoke_iomem(res);
|
2020-05-21 21:06:17 +00:00
|
|
|
|
2005-04-16 22:20:36 +00:00
|
|
|
return res;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(__request_region);
|
|
|
|
|
2006-10-03 08:13:51 +00:00
|
|
|
/**
|
|
|
|
* __release_region - release a previously reserved resource region
|
|
|
|
* @parent: parent resource descriptor
|
|
|
|
* @start: resource start address
|
|
|
|
* @n: resource region size
|
|
|
|
*
|
|
|
|
* The described resource region must match a currently busy region.
|
|
|
|
*/
|
2006-06-12 23:09:23 +00:00
|
|
|
void __release_region(struct resource *parent, resource_size_t start,
|
2018-10-09 14:11:21 +00:00
|
|
|
resource_size_t n)
|
2005-04-16 22:20:36 +00:00
|
|
|
{
|
|
|
|
struct resource **p;
|
2006-06-12 23:09:23 +00:00
|
|
|
resource_size_t end;
|
2005-04-16 22:20:36 +00:00
|
|
|
|
|
|
|
p = &parent->child;
|
|
|
|
end = start + n - 1;
|
|
|
|
|
|
|
|
write_lock(&resource_lock);
|
|
|
|
|
|
|
|
for (;;) {
|
|
|
|
struct resource *res = *p;
|
|
|
|
|
|
|
|
if (!res)
|
|
|
|
break;
|
|
|
|
if (res->start <= start && res->end >= end) {
|
|
|
|
if (!(res->flags & IORESOURCE_BUSY)) {
|
|
|
|
p = &res->child;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (res->start != start || res->end != end)
|
|
|
|
break;
|
|
|
|
*p = res->sibling;
|
|
|
|
write_unlock(&resource_lock);
|
2010-03-29 17:38:00 +00:00
|
|
|
if (res->flags & IORESOURCE_MUXED)
|
|
|
|
wake_up(&muxed_resource_wait);
|
mem hotunplug: fix kfree() of bootmem memory
When hot removing memory presented at boot time, following messages are shown:
kernel BUG at mm/slub.c:3409!
invalid opcode: 0000 [#1] SMP
Modules linked in: ebtable_nat ebtables xt_CHECKSUM iptable_mangle bridge stp llc ipmi_devintf ipmi_msghandler sunrpc ipt_REJECT nf_conntrack_ipv4 nf_defrag_ipv4 iptable_filter ip_tables ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 xt_state nf_conntrack ip6table_filter ip6_tables binfmt_misc vfat fat dm_mirror dm_region_hash dm_log dm_mod vhost_net macvtap macvlan tun uinput iTCO_wdt iTCO_vendor_support coretemp kvm_intel kvm crc32c_intel ghash_clmulni_intel microcode pcspkr sg i2c_i801 lpc_ich mfd_core igb i2c_algo_bit i2c_core e1000e ptp pps_core tpm_infineon ioatdma dca sr_mod cdrom sd_mod crc_t10dif usb_storage megaraid_sas lpfc scsi_transport_fc scsi_tgt scsi_mod
CPU 0
Pid: 5091, comm: kworker/0:2 Tainted: G W 3.9.0-rc6+ #15
RIP: kfree+0x232/0x240
Process kworker/0:2 (pid: 5091, threadinfo ffff88084678c000, task ffff88083928ca80)
Call Trace:
__release_region+0xd4/0xe0
__remove_pages+0x52/0x110
arch_remove_memory+0x89/0xd0
remove_memory+0xc4/0x100
acpi_memory_device_remove+0x6d/0xb1
acpi_device_remove+0x89/0xab
__device_release_driver+0x7c/0xf0
device_release_driver+0x2f/0x50
acpi_bus_device_detach+0x6c/0x70
acpi_ns_walk_namespace+0x11a/0x250
acpi_walk_namespace+0xee/0x137
acpi_bus_trim+0x33/0x7a
acpi_bus_hot_remove_device+0xc4/0x1a1
acpi_os_execute_deferred+0x27/0x34
process_one_work+0x1f7/0x590
worker_thread+0x11a/0x370
kthread+0xee/0x100
ret_from_fork+0x7c/0xb0
RIP [<ffffffff811c41d2>] kfree+0x232/0x240
RSP <ffff88084678d968>
The reason why the messages are shown is to release a resource
structure, allocated by bootmem, by kfree(). So when we release a
resource structure, we should check whether it is allocated by bootmem
or not.
But even if we know a resource structure is allocated by bootmem, we
cannot release it since SLxB cannot treat it. So for reusing a resource
structure, this patch remembers it by using bootmem_resource as follows:
When releasing a resource structure by free_resource(), free_resource()
checks whether the resource structure is allocated by bootmem or not.
If it is allocated by bootmem, free_resource() adds it to
bootmem_resource. If it is not allocated by bootmem, free_resource()
release it by kfree().
And when getting a new resource structure by get_resource(),
get_resource() checks whether bootmem_resource has released resource
structures or not. If there is a released resource structure,
get_resource() returns it. If there is not a releaed resource
structure, get_resource() returns new resource structure allocated by
kzalloc().
[akpm@linux-foundation.org: s/get_resource/alloc_resource/]
Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Reviewed-by: Toshi Kani <toshi.kani@hp.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Ram Pai <linuxram@us.ibm.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-04-29 22:08:56 +00:00
|
|
|
free_resource(res);
|
2005-04-16 22:20:36 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
p = &res->sibling;
|
|
|
|
}
|
|
|
|
|
|
|
|
write_unlock(&resource_lock);
|
|
|
|
|
2022-11-09 15:56:17 +00:00
|
|
|
pr_warn("Trying to free nonexistent resource <%pa-%pa>\n", &start, &end);
|
2005-04-16 22:20:36 +00:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(__release_region);
|
|
|
|
|
resource: add release_mem_region_adjustable()
Add release_mem_region_adjustable(), which releases a requested region
from a currently busy memory resource. This interface adjusts the
matched memory resource accordingly even if the requested region does
not match exactly but still fits into.
This new interface is intended for memory hot-delete. During bootup,
memory resources are inserted from the boot descriptor table, such as
EFI Memory Table and e820. Each memory resource entry usually covers
the whole contiguous memory range.  Memory hot-delete request, on the
other hand, may target to a particular range of memory resource, and its
size can be much smaller than the whole contiguous memory. Since the
existing release interfaces like __release_region() require a requested
region to be exactly matched to a resource entry, they do not allow a
partial resource to be released.
This new interface is restrictive (i.e. release under certain
conditions), which is consistent with other release interfaces,
__release_region() and __release_resource(). Additional release
conditions, such as an overlapping region to a resource entry, can be
supported after they are confirmed as valid cases.
There is no change to the existing interfaces since their restriction is
valid for I/O resources.
[akpm@linux-foundation.org: use GFP_ATOMIC under write_lock()]
[akpm@linux-foundation.org: switch back to GFP_KERNEL, less buggily]
[akpm@linux-foundation.org: remove unneeded and wrong kfree(), per Toshi]
Signed-off-by: Toshi Kani <toshi.kani@hp.com>
Reviewed-by : Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: David Rientjes <rientjes@google.com>
Reviewed-by: Ram Pai <linuxram@us.ibm.com>
Cc: T Makphaibulchoke <tmac@hp.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: Tang Chen <tangchen@cn.fujitsu.com>
Cc: Jiang Liu <jiang.liu@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-04-29 22:08:19 +00:00
|
|
|
#ifdef CONFIG_MEMORY_HOTREMOVE
|
|
|
|
/**
|
|
|
|
* release_mem_region_adjustable - release a previously reserved memory region
|
|
|
|
* @start: resource start address
|
|
|
|
* @size: resource region size
|
|
|
|
*
|
|
|
|
* This interface is intended for memory hot-delete. The requested region
|
|
|
|
* is released from a currently busy memory resource. The requested region
|
|
|
|
* must either match exactly or fit into a single busy resource entry. In
|
|
|
|
* the latter case, the remaining resource is adjusted accordingly.
|
|
|
|
* Existing children of the busy memory resource must be immutable in the
|
|
|
|
* request.
|
|
|
|
*
|
|
|
|
* Note:
|
|
|
|
* - Additional release conditions, such as overlapping region, can be
|
|
|
|
* supported after they are confirmed as valid cases.
|
|
|
|
* - When a busy memory resource gets split into two entries, the code
|
|
|
|
* assumes that all children remain in the lower address entry for
|
|
|
|
* simplicity. Enhance this logic when necessary.
|
|
|
|
*/
|
2020-10-16 03:09:12 +00:00
|
|
|
void release_mem_region_adjustable(resource_size_t start, resource_size_t size)
|
resource: add release_mem_region_adjustable()
Add release_mem_region_adjustable(), which releases a requested region
from a currently busy memory resource. This interface adjusts the
matched memory resource accordingly even if the requested region does
not match exactly but still fits into.
This new interface is intended for memory hot-delete. During bootup,
memory resources are inserted from the boot descriptor table, such as
EFI Memory Table and e820. Each memory resource entry usually covers
the whole contiguous memory range. Memory hot-delete request, on the
other hand, may target to a particular range of memory resource, and its
size can be much smaller than the whole contiguous memory. Since the
existing release interfaces like __release_region() require a requested
region to be exactly matched to a resource entry, they do not allow a
partial resource to be released.
This new interface is restrictive (i.e. release under certain
conditions), which is consistent with other release interfaces,
__release_region() and __release_resource(). Additional release
conditions, such as an overlapping region to a resource entry, can be
supported after they are confirmed as valid cases.
There is no change to the existing interfaces since their restriction is
valid for I/O resources.
[akpm@linux-foundation.org: use GFP_ATOMIC under write_lock()]
[akpm@linux-foundation.org: switch back to GFP_KERNEL, less buggily]
[akpm@linux-foundation.org: remove unneeded and wrong kfree(), per Toshi]
Signed-off-by: Toshi Kani <toshi.kani@hp.com>
Reviewed-by : Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: David Rientjes <rientjes@google.com>
Reviewed-by: Ram Pai <linuxram@us.ibm.com>
Cc: T Makphaibulchoke <tmac@hp.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: Tang Chen <tangchen@cn.fujitsu.com>
Cc: Jiang Liu <jiang.liu@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-04-29 22:08:19 +00:00
|
|
|
{
|
2020-10-16 03:09:12 +00:00
|
|
|
struct resource *parent = &iomem_resource;
|
kernel/resource: make release_mem_region_adjustable() never fail
Patch series "selective merging of system ram resources", v4.
Some add_memory*() users add memory in small, contiguous memory blocks.
Examples include virtio-mem, hyper-v balloon, and the XEN balloon.
This can quickly result in a lot of memory resources, whereby the actual
resource boundaries are not of interest (e.g., it might be relevant for
DIMMs, exposed via /proc/iomem to user space). We really want to merge
added resources in this scenario where possible.
Resources are effectively stored in a list-based tree. Having a lot of
resources not only wastes memory, it also makes traversing that tree more
expensive, and makes /proc/iomem explode in size (e.g., requiring
kexec-tools to manually merge resources when creating a kdump header. The
current kexec-tools resource count limit does not allow for more than
~100GB of memory with a memory block size of 128MB on x86-64).
Let's allow to selectively merge system ram resources by specifying a new
flag for add_memory*(). Patch #5 contains a /proc/iomem example. Only
tested with virtio-mem.
This patch (of 8):
Let's make sure splitting a resource on memory hotunplug will never fail.
This will become more relevant once we merge selected System RAM resources
- then, we'll trigger that case more often on memory hotunplug.
In general, this function is already unlikely to fail. When we remove
memory, we free up quite a lot of metadata (memmap, page tables, memory
block device, etc.). The only reason it could really fail would be when
injecting allocation errors.
All other error cases inside release_mem_region_adjustable() seem to be
sanity checks if the function would be abused in different context - let's
add WARN_ON_ONCE() in these cases so we can catch them.
[natechancellor@gmail.com: fix use of ternary condition in release_mem_region_adjustable]
Link: https://lkml.kernel.org/r/20200922060748.2452056-1-natechancellor@gmail.com
Link: https://github.com/ClangBuiltLinux/linux/issues/1159
Signed-off-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Nathan Chancellor <natechancellor@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Kees Cook <keescook@chromium.org>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Pankaj Gupta <pankaj.gupta.linux@gmail.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Wei Yang <richardw.yang@linux.intel.com>
Cc: Anton Blanchard <anton@ozlabs.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Julien Grall <julien@xen.org>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Len Brown <lenb@kernel.org>
Cc: Leonardo Bras <leobras.c@gmail.com>
Cc: Libor Pechacek <lpechacek@suse.cz>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Nathan Lynch <nathanl@linux.ibm.com>
Cc: "Oliver O'Halloran" <oohall@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Pingfan Liu <kernelfans@gmail.com>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Roger Pau Monné <roger.pau@citrix.com>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Wei Liu <wei.liu@kernel.org>
Link: https://lkml.kernel.org/r/20200911103459.10306-2-david@redhat.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-10-16 03:08:28 +00:00
|
|
|
struct resource *new_res = NULL;
|
|
|
|
bool alloc_nofail = false;
|
resource: add release_mem_region_adjustable()
Add release_mem_region_adjustable(), which releases a requested region
from a currently busy memory resource. This interface adjusts the
matched memory resource accordingly even if the requested region does
not match exactly but still fits into.
This new interface is intended for memory hot-delete. During bootup,
memory resources are inserted from the boot descriptor table, such as
EFI Memory Table and e820. Each memory resource entry usually covers
the whole contigous memory range. Memory hot-delete request, on the
other hand, may target to a particular range of memory resource, and its
size can be much smaller than the whole contiguous memory. Since the
existing release interfaces like __release_region() require a requested
region to be exactly matched to a resource entry, they do not allow a
partial resource to be released.
This new interface is restrictive (i.e. release under certain
conditions), which is consistent with other release interfaces,
__release_region() and __release_resource(). Additional release
conditions, such as an overlapping region to a resource entry, can be
supported after they are confirmed as valid cases.
There is no change to the existing interfaces since their restriction is
valid for I/O resources.
[akpm@linux-foundation.org: use GFP_ATOMIC under write_lock()]
[akpm@linux-foundation.org: switch back to GFP_KERNEL, less buggily]
[akpm@linux-foundation.org: remove unneeded and wrong kfree(), per Toshi]
Signed-off-by: Toshi Kani <toshi.kani@hp.com>
Reviewed-by : Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: David Rientjes <rientjes@google.com>
Reviewed-by: Ram Pai <linuxram@us.ibm.com>
Cc: T Makphaibulchoke <tmac@hp.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: Tang Chen <tangchen@cn.fujitsu.com>
Cc: Jiang Liu <jiang.liu@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-04-29 22:08:19 +00:00
|
|
|
struct resource **p;
|
|
|
|
struct resource *res;
|
|
|
|
resource_size_t end;
|
|
|
|
|
|
|
|
end = start + size - 1;
|
kernel/resource: make release_mem_region_adjustable() never fail
Patch series "selective merging of system ram resources", v4.
Some add_memory*() users add memory in small, contiguous memory blocks.
Examples include virtio-mem, hyper-v balloon, and the XEN balloon.
This can quickly result in a lot of memory resources, whereby the actual
resource boundaries are not of interest (e.g., it might be relevant for
DIMMs, exposed via /proc/iomem to user space). We really want to merge
added resources in this scenario where possible.
Resources are effectively stored in a list-based tree. Having a lot of
resources not only wastes memory, it also makes traversing that tree more
expensive, and makes /proc/iomem explode in size (e.g., requiring
kexec-tools to manually merge resources when creating a kdump header. The
current kexec-tools resource count limit does not allow for more than
~100GB of memory with a memory block size of 128MB on x86-64).
Let's allow to selectively merge system ram resources by specifying a new
flag for add_memory*(). Patch #5 contains a /proc/iomem example. Only
tested with virtio-mem.
This patch (of 8):
Let's make sure splitting a resource on memory hotunplug will never fail.
This will become more relevant once we merge selected System RAM resources
- then, we'll trigger that case more often on memory hotunplug.
In general, this function is already unlikely to fail. When we remove
memory, we free up quite a lot of metadata (memmap, page tables, memory
block device, etc.). The only reason it could really fail would be when
injecting allocation errors.
All other error cases inside release_mem_region_adjustable() seem to be
sanity checks if the function would be abused in different context - let's
add WARN_ON_ONCE() in these cases so we can catch them.
[natechancellor@gmail.com: fix use of ternary condition in release_mem_region_adjustable]
Link: https://lkml.kernel.org/r/20200922060748.2452056-1-natechancellor@gmail.com
Link: https://github.com/ClangBuiltLinux/linux/issues/1159
Signed-off-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Nathan Chancellor <natechancellor@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Kees Cook <keescook@chromium.org>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Pankaj Gupta <pankaj.gupta.linux@gmail.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Wei Yang <richardw.yang@linux.intel.com>
Cc: Anton Blanchard <anton@ozlabs.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Julien Grall <julien@xen.org>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Len Brown <lenb@kernel.org>
Cc: Leonardo Bras <leobras.c@gmail.com>
Cc: Libor Pechacek <lpechacek@suse.cz>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Nathan Lynch <nathanl@linux.ibm.com>
Cc: "Oliver O'Halloran" <oohall@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Pingfan Liu <kernelfans@gmail.com>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Roger Pau Monné <roger.pau@citrix.com>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Wei Liu <wei.liu@kernel.org>
Link: https://lkml.kernel.org/r/20200911103459.10306-2-david@redhat.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-10-16 03:08:28 +00:00
|
|
|
if (WARN_ON_ONCE((start < parent->start) || (end > parent->end)))
|
|
|
|
return;
|
resource: add release_mem_region_adjustable()
Add release_mem_region_adjustable(), which releases a requested region
from a currently busy memory resource. This interface adjusts the
matched memory resource accordingly even if the requested region does
not match exactly but still fits into.
This new interface is intended for memory hot-delete. During bootup,
memory resources are inserted from the boot descriptor table, such as
EFI Memory Table and e820. Each memory resource entry usually covers
the whole contiguous memory range. Memory hot-delete request, on the
other hand, may target to a particular range of memory resource, and its
size can be much smaller than the whole contiguous memory. Since the
existing release interfaces like __release_region() require a requested
region to be exactly matched to a resource entry, they do not allow a
partial resource to be released.
This new interface is restrictive (i.e. release under certain
conditions), which is consistent with other release interfaces,
__release_region() and __release_resource(). Additional release
conditions, such as an overlapping region to a resource entry, can be
supported after they are confirmed as valid cases.
There is no change to the existing interfaces since their restriction is
valid for I/O resources.
[akpm@linux-foundation.org: use GFP_ATOMIC under write_lock()]
[akpm@linux-foundation.org: switch back to GFP_KERNEL, less buggily]
[akpm@linux-foundation.org: remove unneeded and wrong kfree(), per Toshi]
Signed-off-by: Toshi Kani <toshi.kani@hp.com>
Reviewed-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: David Rientjes <rientjes@google.com>
Reviewed-by: Ram Pai <linuxram@us.ibm.com>
Cc: T Makphaibulchoke <tmac@hp.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: Tang Chen <tangchen@cn.fujitsu.com>
Cc: Jiang Liu <jiang.liu@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-04-29 22:08:19 +00:00
|
|
|
|
kernel/resource: make release_mem_region_adjustable() never fail
Patch series "selective merging of system ram resources", v4.
Some add_memory*() users add memory in small, contiguous memory blocks.
Examples include virtio-mem, hyper-v balloon, and the XEN balloon.
This can quickly result in a lot of memory resources, whereby the actual
resource boundaries are not of interest (e.g., it might be relevant for
DIMMs, exposed via /proc/iomem to user space). We really want to merge
added resources in this scenario where possible.
Resources are effectively stored in a list-based tree. Having a lot of
resources not only wastes memory, it also makes traversing that tree more
expensive, and makes /proc/iomem explode in size (e.g., requiring
kexec-tools to manually merge resources when creating a kdump header. The
current kexec-tools resource count limit does not allow for more than
~100GB of memory with a memory block size of 128MB on x86-64).
Let's allow to selectively merge system ram resources by specifying a new
flag for add_memory*(). Patch #5 contains a /proc/iomem example. Only
tested with virtio-mem.
This patch (of 8):
Let's make sure splitting a resource on memory hotunplug will never fail.
This will become more relevant once we merge selected System RAM resources
- then, we'll trigger that case more often on memory hotunplug.
In general, this function is already unlikely to fail. When we remove
memory, we free up quite a lot of metadata (memmap, page tables, memory
block device, etc.). The only reason it could really fail would be when
injecting allocation errors.
All other error cases inside release_mem_region_adjustable() seem to be
sanity checks if the function would be abused in different context - let's
add WARN_ON_ONCE() in these cases so we can catch them.
[natechancellor@gmail.com: fix use of ternary condition in release_mem_region_adjustable]
Link: https://lkml.kernel.org/r/20200922060748.2452056-1-natechancellor@gmail.com
Link: https://github.com/ClangBuiltLinux/linux/issues/1159
Signed-off-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Nathan Chancellor <natechancellor@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Kees Cook <keescook@chromium.org>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Pankaj Gupta <pankaj.gupta.linux@gmail.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Wei Yang <richardw.yang@linux.intel.com>
Cc: Anton Blanchard <anton@ozlabs.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Julien Grall <julien@xen.org>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Len Brown <lenb@kernel.org>
Cc: Leonardo Bras <leobras.c@gmail.com>
Cc: Libor Pechacek <lpechacek@suse.cz>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Nathan Lynch <nathanl@linux.ibm.com>
Cc: "Oliver O'Halloran" <oohall@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Pingfan Liu <kernelfans@gmail.com>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Roger Pau Monné <roger.pau@citrix.com>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Wei Liu <wei.liu@kernel.org>
Link: https://lkml.kernel.org/r/20200911103459.10306-2-david@redhat.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-10-16 03:08:28 +00:00
|
|
|
/*
|
|
|
|
* We free up quite a lot of memory on memory hotunplug (esp., memmap),
|
|
|
|
* just before releasing the region. This is highly unlikely to
|
|
|
|
* fail - let's play safe and make it never fail as the caller cannot
|
|
|
|
* perform any error handling (e.g., trying to re-add memory will fail
|
|
|
|
* similarly).
|
|
|
|
*/
|
|
|
|
retry:
|
|
|
|
new_res = alloc_resource(GFP_KERNEL | (alloc_nofail ? __GFP_NOFAIL : 0));
|
resource: add release_mem_region_adjustable()
Add release_mem_region_adjustable(), which releases a requested region
from a currently busy memory resource. This interface adjusts the
matched memory resource accordingly even if the requested region does
not match exactly but still fits into.
This new interface is intended for memory hot-delete. During bootup,
memory resources are inserted from the boot descriptor table, such as
EFI Memory Table and e820. Each memory resource entry usually covers
the whole contiguous memory range. Memory hot-delete request, on the
other hand, may target to a particular range of memory resource, and its
size can be much smaller than the whole contiguous memory. Since the
existing release interfaces like __release_region() require a requested
region to be exactly matched to a resource entry, they do not allow a
partial resource to be released.
This new interface is restrictive (i.e. release under certain
conditions), which is consistent with other release interfaces,
__release_region() and __release_resource(). Additional release
conditions, such as an overlapping region to a resource entry, can be
supported after they are confirmed as valid cases.
There is no change to the existing interfaces since their restriction is
valid for I/O resources.
[akpm@linux-foundation.org: use GFP_ATOMIC under write_lock()]
[akpm@linux-foundation.org: switch back to GFP_KERNEL, less buggily]
[akpm@linux-foundation.org: remove unneeded and wrong kfree(), per Toshi]
Signed-off-by: Toshi Kani <toshi.kani@hp.com>
Reviewed-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: David Rientjes <rientjes@google.com>
Reviewed-by: Ram Pai <linuxram@us.ibm.com>
Cc: T Makphaibulchoke <tmac@hp.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: Tang Chen <tangchen@cn.fujitsu.com>
Cc: Jiang Liu <jiang.liu@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-04-29 22:08:19 +00:00
|
|
|
|
|
|
|
p = &parent->child;
|
|
|
|
write_lock(&resource_lock);
|
|
|
|
|
|
|
|
while ((res = *p)) {
|
|
|
|
if (res->start >= end)
|
|
|
|
break;
|
|
|
|
|
|
|
|
/* look for the next resource if it does not fit into */
|
|
|
|
if (res->start > start || res->end < end) {
|
|
|
|
p = &res->sibling;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!(res->flags & IORESOURCE_MEM))
|
|
|
|
break;
|
|
|
|
|
|
|
|
if (!(res->flags & IORESOURCE_BUSY)) {
|
|
|
|
p = &res->child;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* found the target resource; let's adjust accordingly */
|
|
|
|
if (res->start == start && res->end == end) {
|
|
|
|
/* free the whole entry */
|
|
|
|
*p = res->sibling;
|
mem hotunplug: fix kfree() of bootmem memory
When hot removing memory presented at boot time, following messages are shown:
kernel BUG at mm/slub.c:3409!
invalid opcode: 0000 [#1] SMP
Modules linked in: ebtable_nat ebtables xt_CHECKSUM iptable_mangle bridge stp llc ipmi_devintf ipmi_msghandler sunrpc ipt_REJECT nf_conntrack_ipv4 nf_defrag_ipv4 iptable_filter ip_tables ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 xt_state nf_conntrack ip6table_filter ip6_tables binfmt_misc vfat fat dm_mirror dm_region_hash dm_log dm_mod vhost_net macvtap macvlan tun uinput iTCO_wdt iTCO_vendor_support coretemp kvm_intel kvm crc32c_intel ghash_clmulni_intel microcode pcspkr sg i2c_i801 lpc_ich mfd_core igb i2c_algo_bit i2c_core e1000e ptp pps_core tpm_infineon ioatdma dca sr_mod cdrom sd_mod crc_t10dif usb_storage megaraid_sas lpfc scsi_transport_fc scsi_tgt scsi_mod
CPU 0
Pid: 5091, comm: kworker/0:2 Tainted: G W 3.9.0-rc6+ #15
RIP: kfree+0x232/0x240
Process kworker/0:2 (pid: 5091, threadinfo ffff88084678c000, task ffff88083928ca80)
Call Trace:
__release_region+0xd4/0xe0
__remove_pages+0x52/0x110
arch_remove_memory+0x89/0xd0
remove_memory+0xc4/0x100
acpi_memory_device_remove+0x6d/0xb1
acpi_device_remove+0x89/0xab
__device_release_driver+0x7c/0xf0
device_release_driver+0x2f/0x50
acpi_bus_device_detach+0x6c/0x70
acpi_ns_walk_namespace+0x11a/0x250
acpi_walk_namespace+0xee/0x137
acpi_bus_trim+0x33/0x7a
acpi_bus_hot_remove_device+0xc4/0x1a1
acpi_os_execute_deferred+0x27/0x34
process_one_work+0x1f7/0x590
worker_thread+0x11a/0x370
kthread+0xee/0x100
ret_from_fork+0x7c/0xb0
RIP [<ffffffff811c41d2>] kfree+0x232/0x240
RSP <ffff88084678d968>
The reason why the messages are shown is to release a resource
structure, allocated by bootmem, by kfree(). So when we release a
resource structure, we should check whether it is allocated by bootmem
or not.
But even if we know a resource structure is allocated by bootmem, we
cannot release it since SLxB cannot treat it. So for reusing a resource
structure, this patch remembers it by using bootmem_resource as follows:
When releasing a resource structure by free_resource(), free_resource()
checks whether the resource structure is allocated by bootmem or not.
If it is allocated by bootmem, free_resource() adds it to
bootmem_resource. If it is not allocated by bootmem, free_resource()
release it by kfree().
And when getting a new resource structure by get_resource(),
get_resource() checks whether bootmem_resource has released resource
structures or not. If there is a released resource structure,
get_resource() returns it. If there is not a released resource
structure, get_resource() returns new resource structure allocated by
kzalloc().
[akpm@linux-foundation.org: s/get_resource/alloc_resource/]
Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Reviewed-by: Toshi Kani <toshi.kani@hp.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Ram Pai <linuxram@us.ibm.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-04-29 22:08:56 +00:00
|
|
|
free_resource(res);
|
resource: add release_mem_region_adjustable()
Add release_mem_region_adjustable(), which releases a requested region
from a currently busy memory resource. This interface adjusts the
matched memory resource accordingly even if the requested region does
not match exactly but still fits into.
This new interface is intended for memory hot-delete. During bootup,
memory resources are inserted from the boot descriptor table, such as
EFI Memory Table and e820. Each memory resource entry usually covers
the whole contiguous memory range. Memory hot-delete request, on the
other hand, may target to a particular range of memory resource, and its
size can be much smaller than the whole contiguous memory. Since the
existing release interfaces like __release_region() require a requested
region to be exactly matched to a resource entry, they do not allow a
partial resource to be released.
This new interface is restrictive (i.e. release under certain
conditions), which is consistent with other release interfaces,
__release_region() and __release_resource(). Additional release
conditions, such as an overlapping region to a resource entry, can be
supported after they are confirmed as valid cases.
There is no change to the existing interfaces since their restriction is
valid for I/O resources.
[akpm@linux-foundation.org: use GFP_ATOMIC under write_lock()]
[akpm@linux-foundation.org: switch back to GFP_KERNEL, less buggily]
[akpm@linux-foundation.org: remove unneeded and wrong kfree(), per Toshi]
Signed-off-by: Toshi Kani <toshi.kani@hp.com>
Reviewed-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: David Rientjes <rientjes@google.com>
Reviewed-by: Ram Pai <linuxram@us.ibm.com>
Cc: T Makphaibulchoke <tmac@hp.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: Tang Chen <tangchen@cn.fujitsu.com>
Cc: Jiang Liu <jiang.liu@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-04-29 22:08:19 +00:00
|
|
|
} else if (res->start == start && res->end != end) {
|
|
|
|
/* adjust the start */
|
kernel/resource: make release_mem_region_adjustable() never fail
Patch series "selective merging of system ram resources", v4.
Some add_memory*() users add memory in small, contiguous memory blocks.
Examples include virtio-mem, hyper-v balloon, and the XEN balloon.
This can quickly result in a lot of memory resources, whereby the actual
resource boundaries are not of interest (e.g., it might be relevant for
DIMMs, exposed via /proc/iomem to user space). We really want to merge
added resources in this scenario where possible.
Resources are effectively stored in a list-based tree. Having a lot of
resources not only wastes memory, it also makes traversing that tree more
expensive, and makes /proc/iomem explode in size (e.g., requiring
kexec-tools to manually merge resources when creating a kdump header. The
current kexec-tools resource count limit does not allow for more than
~100GB of memory with a memory block size of 128MB on x86-64).
Let's allow to selectively merge system ram resources by specifying a new
flag for add_memory*(). Patch #5 contains a /proc/iomem example. Only
tested with virtio-mem.
This patch (of 8):
Let's make sure splitting a resource on memory hotunplug will never fail.
This will become more relevant once we merge selected System RAM resources
- then, we'll trigger that case more often on memory hotunplug.
In general, this function is already unlikely to fail. When we remove
memory, we free up quite a lot of metadata (memmap, page tables, memory
block device, etc.). The only reason it could really fail would be when
injecting allocation errors.
All other error cases inside release_mem_region_adjustable() seem to be
sanity checks if the function would be abused in different context - let's
add WARN_ON_ONCE() in these cases so we can catch them.
[natechancellor@gmail.com: fix use of ternary condition in release_mem_region_adjustable]
Link: https://lkml.kernel.org/r/20200922060748.2452056-1-natechancellor@gmail.com
Link: https://github.com/ClangBuiltLinux/linux/issues/1159
Signed-off-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Nathan Chancellor <natechancellor@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Kees Cook <keescook@chromium.org>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Pankaj Gupta <pankaj.gupta.linux@gmail.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Wei Yang <richardw.yang@linux.intel.com>
Cc: Anton Blanchard <anton@ozlabs.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Julien Grall <julien@xen.org>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Len Brown <lenb@kernel.org>
Cc: Leonardo Bras <leobras.c@gmail.com>
Cc: Libor Pechacek <lpechacek@suse.cz>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Nathan Lynch <nathanl@linux.ibm.com>
Cc: "Oliver O'Halloran" <oohall@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Pingfan Liu <kernelfans@gmail.com>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Roger Pau Monné <roger.pau@citrix.com>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Wei Liu <wei.liu@kernel.org>
Link: https://lkml.kernel.org/r/20200911103459.10306-2-david@redhat.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-10-16 03:08:28 +00:00
|
|
|
WARN_ON_ONCE(__adjust_resource(res, end + 1,
|
|
|
|
res->end - end));
|
resource: add release_mem_region_adjustable()
Add release_mem_region_adjustable(), which releases a requested region
from a currently busy memory resource. This interface adjusts the
matched memory resource accordingly even if the requested region does
not match exactly but still fits into.
This new interface is intended for memory hot-delete. During bootup,
memory resources are inserted from the boot descriptor table, such as
EFI Memory Table and e820. Each memory resource entry usually covers
the whole contiguous memory range. Memory hot-delete request, on the
other hand, may target to a particular range of memory resource, and its
size can be much smaller than the whole contiguous memory. Since the
existing release interfaces like __release_region() require a requested
region to be exactly matched to a resource entry, they do not allow a
partial resource to be released.
This new interface is restrictive (i.e. release under certain
conditions), which is consistent with other release interfaces,
__release_region() and __release_resource(). Additional release
conditions, such as an overlapping region to a resource entry, can be
supported after they are confirmed as valid cases.
There is no change to the existing interfaces since their restriction is
valid for I/O resources.
[akpm@linux-foundation.org: use GFP_ATOMIC under write_lock()]
[akpm@linux-foundation.org: switch back to GFP_KERNEL, less buggily]
[akpm@linux-foundation.org: remove unneeded and wrong kfree(), per Toshi]
Signed-off-by: Toshi Kani <toshi.kani@hp.com>
Reviewed-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: David Rientjes <rientjes@google.com>
Reviewed-by: Ram Pai <linuxram@us.ibm.com>
Cc: T Makphaibulchoke <tmac@hp.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: Tang Chen <tangchen@cn.fujitsu.com>
Cc: Jiang Liu <jiang.liu@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-04-29 22:08:19 +00:00
|
|
|
} else if (res->start != start && res->end == end) {
|
|
|
|
/* adjust the end */
|
kernel/resource: make release_mem_region_adjustable() never fail
Patch series "selective merging of system ram resources", v4.
Some add_memory*() users add memory in small, contiguous memory blocks.
Examples include virtio-mem, hyper-v balloon, and the XEN balloon.
This can quickly result in a lot of memory resources, whereby the actual
resource boundaries are not of interest (e.g., it might be relevant for
DIMMs, exposed via /proc/iomem to user space). We really want to merge
added resources in this scenario where possible.
Resources are effectively stored in a list-based tree. Having a lot of
resources not only wastes memory, it also makes traversing that tree more
expensive, and makes /proc/iomem explode in size (e.g., requiring
kexec-tools to manually merge resources when creating a kdump header. The
current kexec-tools resource count limit does not allow for more than
~100GB of memory with a memory block size of 128MB on x86-64).
Let's allow to selectively merge system ram resources by specifying a new
flag for add_memory*(). Patch #5 contains a /proc/iomem example. Only
tested with virtio-mem.
This patch (of 8):
Let's make sure splitting a resource on memory hotunplug will never fail.
This will become more relevant once we merge selected System RAM resources
- then, we'll trigger that case more often on memory hotunplug.
In general, this function is already unlikely to fail. When we remove
memory, we free up quite a lot of metadata (memmap, page tables, memory
block device, etc.). The only reason it could really fail would be when
injecting allocation errors.
All other error cases inside release_mem_region_adjustable() seem to be
sanity checks if the function would be abused in different context - let's
add WARN_ON_ONCE() in these cases so we can catch them.
[natechancellor@gmail.com: fix use of ternary condition in release_mem_region_adjustable]
Link: https://lkml.kernel.org/r/20200922060748.2452056-1-natechancellor@gmail.com
Link: https://github.com/ClangBuiltLinux/linux/issues/1159
Signed-off-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Nathan Chancellor <natechancellor@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Kees Cook <keescook@chromium.org>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Pankaj Gupta <pankaj.gupta.linux@gmail.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Wei Yang <richardw.yang@linux.intel.com>
Cc: Anton Blanchard <anton@ozlabs.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Julien Grall <julien@xen.org>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Len Brown <lenb@kernel.org>
Cc: Leonardo Bras <leobras.c@gmail.com>
Cc: Libor Pechacek <lpechacek@suse.cz>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Nathan Lynch <nathanl@linux.ibm.com>
Cc: "Oliver O'Halloran" <oohall@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Pingfan Liu <kernelfans@gmail.com>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Roger Pau Monné <roger.pau@citrix.com>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Wei Liu <wei.liu@kernel.org>
Link: https://lkml.kernel.org/r/20200911103459.10306-2-david@redhat.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-10-16 03:08:28 +00:00
|
|
|
WARN_ON_ONCE(__adjust_resource(res, res->start,
|
|
|
|
start - res->start));
|
resource: add release_mem_region_adjustable()
Add release_mem_region_adjustable(), which releases a requested region
from a currently busy memory resource. This interface adjusts the
matched memory resource accordingly even if the requested region does
not match exactly but still fits into.
This new interface is intended for memory hot-delete. During bootup,
memory resources are inserted from the boot descriptor table, such as
EFI Memory Table and e820. Each memory resource entry usually covers
the whole contiguous memory range. Memory hot-delete request, on the
other hand, may target to a particular range of memory resource, and its
size can be much smaller than the whole contiguous memory. Since the
existing release interfaces like __release_region() require a requested
region to be exactly matched to a resource entry, they do not allow a
partial resource to be released.
This new interface is restrictive (i.e. release under certain
conditions), which is consistent with other release interfaces,
__release_region() and __release_resource(). Additional release
conditions, such as an overlapping region to a resource entry, can be
supported after they are confirmed as valid cases.
There is no change to the existing interfaces since their restriction is
valid for I/O resources.
[akpm@linux-foundation.org: use GFP_ATOMIC under write_lock()]
[akpm@linux-foundation.org: switch back to GFP_KERNEL, less buggily]
[akpm@linux-foundation.org: remove unneeded and wrong kfree(), per Toshi]
Signed-off-by: Toshi Kani <toshi.kani@hp.com>
Reviewed-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: David Rientjes <rientjes@google.com>
Reviewed-by: Ram Pai <linuxram@us.ibm.com>
Cc: T Makphaibulchoke <tmac@hp.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: Tang Chen <tangchen@cn.fujitsu.com>
Cc: Jiang Liu <jiang.liu@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-04-29 22:08:19 +00:00
|
|
|
} else {
|
kernel/resource: make release_mem_region_adjustable() never fail
Patch series "selective merging of system ram resources", v4.
Some add_memory*() users add memory in small, contiguous memory blocks.
Examples include virtio-mem, hyper-v balloon, and the XEN balloon.
This can quickly result in a lot of memory resources, whereby the actual
resource boundaries are not of interest (e.g., it might be relevant for
DIMMs, exposed via /proc/iomem to user space). We really want to merge
added resources in this scenario where possible.
Resources are effectively stored in a list-based tree. Having a lot of
resources not only wastes memory, it also makes traversing that tree more
expensive, and makes /proc/iomem explode in size (e.g., requiring
kexec-tools to manually merge resources when creating a kdump header. The
current kexec-tools resource count limit does not allow for more than
~100GB of memory with a memory block size of 128MB on x86-64).
Let's allow to selectively merge system ram resources by specifying a new
flag for add_memory*(). Patch #5 contains a /proc/iomem example. Only
tested with virtio-mem.
This patch (of 8):
Let's make sure splitting a resource on memory hotunplug will never fail.
This will become more relevant once we merge selected System RAM resources
- then, we'll trigger that case more often on memory hotunplug.
In general, this function is already unlikely to fail. When we remove
memory, we free up quite a lot of metadata (memmap, page tables, memory
block device, etc.). The only reason it could really fail would be when
injecting allocation errors.
All other error cases inside release_mem_region_adjustable() seem to be
sanity checks if the function would be abused in different context - let's
add WARN_ON_ONCE() in these cases so we can catch them.
[natechancellor@gmail.com: fix use of ternary condition in release_mem_region_adjustable]
Link: https://lkml.kernel.org/r/20200922060748.2452056-1-natechancellor@gmail.com
Link: https://github.com/ClangBuiltLinux/linux/issues/1159
Signed-off-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Nathan Chancellor <natechancellor@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Kees Cook <keescook@chromium.org>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Pankaj Gupta <pankaj.gupta.linux@gmail.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Wei Yang <richardw.yang@linux.intel.com>
Cc: Anton Blanchard <anton@ozlabs.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Julien Grall <julien@xen.org>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Len Brown <lenb@kernel.org>
Cc: Leonardo Bras <leobras.c@gmail.com>
Cc: Libor Pechacek <lpechacek@suse.cz>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Nathan Lynch <nathanl@linux.ibm.com>
Cc: "Oliver O'Halloran" <oohall@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Pingfan Liu <kernelfans@gmail.com>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Roger Pau Monné <roger.pau@citrix.com>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Wei Liu <wei.liu@kernel.org>
Link: https://lkml.kernel.org/r/20200911103459.10306-2-david@redhat.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-10-16 03:08:28 +00:00
|
|
|
/* split into two entries - we need a new resource */
|
resource: add release_mem_region_adjustable()
Add release_mem_region_adjustable(), which releases a requested region
from a currently busy memory resource. This interface adjusts the
matched memory resource accordingly even if the requested region does
not match exactly but still fits into.
This new interface is intended for memory hot-delete. During bootup,
memory resources are inserted from the boot descriptor table, such as
EFI Memory Table and e820. Each memory resource entry usually covers
the whole contiguous memory range. A memory hot-delete request, on the
other hand, may target a particular range of a memory resource, and its
size can be much smaller than the whole contiguous memory. Since the
existing release interfaces like __release_region() require a requested
region to be exactly matched to a resource entry, they do not allow a
partial resource to be released.
This new interface is restrictive (i.e. release under certain
conditions), which is consistent with other release interfaces,
__release_region() and __release_resource(). Additional release
conditions, such as an overlapping region to a resource entry, can be
supported after they are confirmed as valid cases.
There is no change to the existing interfaces since their restriction is
valid for I/O resources.
[akpm@linux-foundation.org: use GFP_ATOMIC under write_lock()]
[akpm@linux-foundation.org: switch back to GFP_KERNEL, less buggily]
[akpm@linux-foundation.org: remove unneeded and wrong kfree(), per Toshi]
Signed-off-by: Toshi Kani <toshi.kani@hp.com>
Reviewed-by : Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: David Rientjes <rientjes@google.com>
Reviewed-by: Ram Pai <linuxram@us.ibm.com>
Cc: T Makphaibulchoke <tmac@hp.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: Tang Chen <tangchen@cn.fujitsu.com>
Cc: Jiang Liu <jiang.liu@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-04-29 22:08:19 +00:00
|
|
|
if (!new_res) {
|
kernel/resource: make release_mem_region_adjustable() never fail
Patch series "selective merging of system ram resources", v4.
Some add_memory*() users add memory in small, contiguous memory blocks.
Examples include virtio-mem, hyper-v balloon, and the XEN balloon.
This can quickly result in a lot of memory resources, whereby the actual
resource boundaries are not of interest (e.g., it might be relevant for
DIMMs, exposed via /proc/iomem to user space). We really want to merge
added resources in this scenario where possible.
Resources are effectively stored in a list-based tree. Having a lot of
resources not only wastes memory, it also makes traversing that tree more
expensive, and makes /proc/iomem explode in size (e.g., requiring
kexec-tools to manually merge resources when creating a kdump header. The
current kexec-tools resource count limit does not allow for more than
~100GB of memory with a memory block size of 128MB on x86-64).
Let's allow to selectively merge system ram resources by specifying a new
flag for add_memory*(). Patch #5 contains a /proc/iomem example. Only
tested with virtio-mem.
This patch (of 8):
Let's make sure splitting a resource on memory hotunplug will never fail.
This will become more relevant once we merge selected System RAM resources
- then, we'll trigger that case more often on memory hotunplug.
In general, this function is already unlikely to fail. When we remove
memory, we free up quite a lot of metadata (memmap, page tables, memory
block device, etc.). The only reason it could really fail would be when
injecting allocation errors.
All other error cases inside release_mem_region_adjustable() seem to be
sanity checks if the function would be abused in different context - let's
add WARN_ON_ONCE() in these cases so we can catch them.
[natechancellor@gmail.com: fix use of ternary condition in release_mem_region_adjustable]
Link: https://lkml.kernel.org/r/20200922060748.2452056-1-natechancellor@gmail.com
Link: https://github.com/ClangBuiltLinux/linux/issues/1159
Signed-off-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Nathan Chancellor <natechancellor@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Kees Cook <keescook@chromium.org>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Pankaj Gupta <pankaj.gupta.linux@gmail.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Wei Yang <richardw.yang@linux.intel.com>
Cc: Anton Blanchard <anton@ozlabs.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Julien Grall <julien@xen.org>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Len Brown <lenb@kernel.org>
Cc: Leonardo Bras <leobras.c@gmail.com>
Cc: Libor Pechacek <lpechacek@suse.cz>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Nathan Lynch <nathanl@linux.ibm.com>
Cc: "Oliver O'Halloran" <oohall@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Pingfan Liu <kernelfans@gmail.com>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Roger Pau Monné <roger.pau@citrix.com>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Wei Liu <wei.liu@kernel.org>
Link: https://lkml.kernel.org/r/20200911103459.10306-2-david@redhat.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-10-16 03:08:28 +00:00
|
|
|
new_res = alloc_resource(GFP_ATOMIC);
|
|
|
|
if (!new_res) {
|
|
|
|
alloc_nofail = true;
|
|
|
|
write_unlock(&resource_lock);
|
|
|
|
goto retry;
|
|
|
|
}
|
resource: add release_mem_region_adjustable()
Add release_mem_region_adjustable(), which releases a requested region
from a currently busy memory resource. This interface adjusts the
matched memory resource accordingly even if the requested region does
not match exactly but still fits into.
This new interface is intended for memory hot-delete. During bootup,
memory resources are inserted from the boot descriptor table, such as
EFI Memory Table and e820. Each memory resource entry usually covers
the whole contigous memory range. Memory hot-delete request, on the
other hand, may target to a particular range of memory resource, and its
size can be much smaller than the whole contiguous memory. Since the
existing release interfaces like __release_region() require a requested
region to be exactly matched to a resource entry, they do not allow a
partial resource to be released.
This new interface is restrictive (i.e. release under certain
conditions), which is consistent with other release interfaces,
__release_region() and __release_resource(). Additional release
conditions, such as an overlapping region to a resource entry, can be
supported after they are confirmed as valid cases.
There is no change to the existing interfaces since their restriction is
valid for I/O resources.
[akpm@linux-foundation.org: use GFP_ATOMIC under write_lock()]
[akpm@linux-foundation.org: switch back to GFP_KERNEL, less buggily]
[akpm@linux-foundation.org: remove unneeded and wrong kfree(), per Toshi]
Signed-off-by: Toshi Kani <toshi.kani@hp.com>
Reviewed-by : Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: David Rientjes <rientjes@google.com>
Reviewed-by: Ram Pai <linuxram@us.ibm.com>
Cc: T Makphaibulchoke <tmac@hp.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: Tang Chen <tangchen@cn.fujitsu.com>
Cc: Jiang Liu <jiang.liu@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-04-29 22:08:19 +00:00
|
|
|
}
|
|
|
|
new_res->name = res->name;
|
|
|
|
new_res->start = end + 1;
|
|
|
|
new_res->end = res->end;
|
|
|
|
new_res->flags = res->flags;
|
2016-01-26 20:57:19 +00:00
|
|
|
new_res->desc = res->desc;
|
resource: add release_mem_region_adjustable()
Add release_mem_region_adjustable(), which releases a requested region
from a currently busy memory resource. This interface adjusts the
matched memory resource accordingly even if the requested region does
not match exactly but still fits into.
This new interface is intended for memory hot-delete. During bootup,
memory resources are inserted from the boot descriptor table, such as
EFI Memory Table and e820. Each memory resource entry usually covers
the whole contigous memory range. Memory hot-delete request, on the
other hand, may target to a particular range of memory resource, and its
size can be much smaller than the whole contiguous memory. Since the
existing release interfaces like __release_region() require a requested
region to be exactly matched to a resource entry, they do not allow a
partial resource to be released.
This new interface is restrictive (i.e. release under certain
conditions), which is consistent with other release interfaces,
__release_region() and __release_resource(). Additional release
conditions, such as an overlapping region to a resource entry, can be
supported after they are confirmed as valid cases.
There is no change to the existing interfaces since their restriction is
valid for I/O resources.
[akpm@linux-foundation.org: use GFP_ATOMIC under write_lock()]
[akpm@linux-foundation.org: switch back to GFP_KERNEL, less buggily]
[akpm@linux-foundation.org: remove unneeded and wrong kfree(), per Toshi]
Signed-off-by: Toshi Kani <toshi.kani@hp.com>
Reviewed-by : Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: David Rientjes <rientjes@google.com>
Reviewed-by: Ram Pai <linuxram@us.ibm.com>
Cc: T Makphaibulchoke <tmac@hp.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: Tang Chen <tangchen@cn.fujitsu.com>
Cc: Jiang Liu <jiang.liu@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-04-29 22:08:19 +00:00
|
|
|
new_res->parent = res->parent;
|
|
|
|
new_res->sibling = res->sibling;
|
|
|
|
new_res->child = NULL;
|
|
|
|
|
kernel/resource: make release_mem_region_adjustable() never fail
Patch series "selective merging of system ram resources", v4.
Some add_memory*() users add memory in small, contiguous memory blocks.
Examples include virtio-mem, hyper-v balloon, and the XEN balloon.
This can quickly result in a lot of memory resources, whereby the actual
resource boundaries are not of interest (e.g., it might be relevant for
DIMMs, exposed via /proc/iomem to user space). We really want to merge
added resources in this scenario where possible.
Resources are effectively stored in a list-based tree. Having a lot of
resources not only wastes memory, it also makes traversing that tree more
expensive, and makes /proc/iomem explode in size (e.g., requiring
kexec-tools to manually merge resources when creating a kdump header. The
current kexec-tools resource count limit does not allow for more than
~100GB of memory with a memory block size of 128MB on x86-64).
Let's allow to selectively merge system ram resources by specifying a new
flag for add_memory*(). Patch #5 contains a /proc/iomem example. Only
tested with virtio-mem.
This patch (of 8):
Let's make sure splitting a resource on memory hotunplug will never fail.
This will become more relevant once we merge selected System RAM resources
- then, we'll trigger that case more often on memory hotunplug.
In general, this function is already unlikely to fail. When we remove
memory, we free up quite a lot of metadata (memmap, page tables, memory
block device, etc.). The only reason it could really fail would be when
injecting allocation errors.
All other error cases inside release_mem_region_adjustable() seem to be
sanity checks if the function would be abused in different context - let's
add WARN_ON_ONCE() in these cases so we can catch them.
[natechancellor@gmail.com: fix use of ternary condition in release_mem_region_adjustable]
Link: https://lkml.kernel.org/r/20200922060748.2452056-1-natechancellor@gmail.com
Link: https://github.com/ClangBuiltLinux/linux/issues/1159
Signed-off-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Nathan Chancellor <natechancellor@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Kees Cook <keescook@chromium.org>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Pankaj Gupta <pankaj.gupta.linux@gmail.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Wei Yang <richardw.yang@linux.intel.com>
Cc: Anton Blanchard <anton@ozlabs.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Julien Grall <julien@xen.org>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Len Brown <lenb@kernel.org>
Cc: Leonardo Bras <leobras.c@gmail.com>
Cc: Libor Pechacek <lpechacek@suse.cz>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Nathan Lynch <nathanl@linux.ibm.com>
Cc: "Oliver O'Halloran" <oohall@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Pingfan Liu <kernelfans@gmail.com>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Roger Pau Monné <roger.pau@citrix.com>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Wei Liu <wei.liu@kernel.org>
Link: https://lkml.kernel.org/r/20200911103459.10306-2-david@redhat.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-10-16 03:08:28 +00:00
|
|
|
if (WARN_ON_ONCE(__adjust_resource(res, res->start,
|
|
|
|
start - res->start)))
|
resource: add release_mem_region_adjustable()
Add release_mem_region_adjustable(), which releases a requested region
from a currently busy memory resource. This interface adjusts the
matched memory resource accordingly even if the requested region does
not match exactly but still fits into.
This new interface is intended for memory hot-delete. During bootup,
memory resources are inserted from the boot descriptor table, such as
EFI Memory Table and e820. Each memory resource entry usually covers
the whole contigous memory range. Memory hot-delete request, on the
other hand, may target to a particular range of memory resource, and its
size can be much smaller than the whole contiguous memory. Since the
existing release interfaces like __release_region() require a requested
region to be exactly matched to a resource entry, they do not allow a
partial resource to be released.
This new interface is restrictive (i.e. release under certain
conditions), which is consistent with other release interfaces,
__release_region() and __release_resource(). Additional release
conditions, such as an overlapping region to a resource entry, can be
supported after they are confirmed as valid cases.
There is no change to the existing interfaces since their restriction is
valid for I/O resources.
[akpm@linux-foundation.org: use GFP_ATOMIC under write_lock()]
[akpm@linux-foundation.org: switch back to GFP_KERNEL, less buggily]
[akpm@linux-foundation.org: remove unneeded and wrong kfree(), per Toshi]
Signed-off-by: Toshi Kani <toshi.kani@hp.com>
Reviewed-by : Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: David Rientjes <rientjes@google.com>
Reviewed-by: Ram Pai <linuxram@us.ibm.com>
Cc: T Makphaibulchoke <tmac@hp.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: Tang Chen <tangchen@cn.fujitsu.com>
Cc: Jiang Liu <jiang.liu@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-04-29 22:08:19 +00:00
|
|
|
break;
|
|
|
|
res->sibling = new_res;
|
|
|
|
new_res = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
write_unlock(&resource_lock);
|
mem hotunplug: fix kfree() of bootmem memory
When hot removing memory presented at boot time, following messages are shown:
kernel BUG at mm/slub.c:3409!
invalid opcode: 0000 [#1] SMP
Modules linked in: ebtable_nat ebtables xt_CHECKSUM iptable_mangle bridge stp llc ipmi_devintf ipmi_msghandler sunrpc ipt_REJECT nf_conntrack_ipv4 nf_defrag_ipv4 iptable_filter ip_tables ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 xt_state nf_conntrack ip6table_filter ip6_tables binfmt_misc vfat fat dm_mirror dm_region_hash dm_log dm_mod vhost_net macvtap macvlan tun uinput iTCO_wdt iTCO_vendor_support coretemp kvm_intel kvm crc32c_intel ghash_clmulni_intel microcode pcspkr sg i2c_i801 lpc_ich mfd_core igb i2c_algo_bit i2c_core e1000e ptp pps_core tpm_infineon ioatdma dca sr_mod cdrom sd_mod crc_t10dif usb_storage megaraid_sas lpfc scsi_transport_fc scsi_tgt scsi_mod
CPU 0
Pid: 5091, comm: kworker/0:2 Tainted: G W 3.9.0-rc6+ #15
RIP: kfree+0x232/0x240
Process kworker/0:2 (pid: 5091, threadinfo ffff88084678c000, task ffff88083928ca80)
Call Trace:
__release_region+0xd4/0xe0
__remove_pages+0x52/0x110
arch_remove_memory+0x89/0xd0
remove_memory+0xc4/0x100
acpi_memory_device_remove+0x6d/0xb1
acpi_device_remove+0x89/0xab
__device_release_driver+0x7c/0xf0
device_release_driver+0x2f/0x50
acpi_bus_device_detach+0x6c/0x70
acpi_ns_walk_namespace+0x11a/0x250
acpi_walk_namespace+0xee/0x137
acpi_bus_trim+0x33/0x7a
acpi_bus_hot_remove_device+0xc4/0x1a1
acpi_os_execute_deferred+0x27/0x34
process_one_work+0x1f7/0x590
worker_thread+0x11a/0x370
kthread+0xee/0x100
ret_from_fork+0x7c/0xb0
RIP [<ffffffff811c41d2>] kfree+0x232/0x240
RSP <ffff88084678d968>
The reason why the messages are shown is to release a resource
structure, allocated by bootmem, by kfree(). So when we release a
resource structure, we should check whether it is allocated by bootmem
or not.
But even if we know a resource structure is allocated by bootmem, we
cannot release it since SLxB cannot treat it. So for reusing a resource
structure, this patch remembers it by using bootmem_resource as follows:
When releasing a resource structure by free_resource(), free_resource()
checks whether the resource structure is allocated by bootmem or not.
If it is allocated by bootmem, free_resource() adds it to
bootmem_resource. If it is not allocated by bootmem, free_resource()
release it by kfree().
And when getting a new resource structure by get_resource(),
get_resource() checks whether bootmem_resource has released resource
structures or not. If there is a released resource structure,
get_resource() returns it. If there is no released resource
structure, get_resource() returns a new resource structure allocated by
kzalloc().
[akpm@linux-foundation.org: s/get_resource/alloc_resource/]
Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Reviewed-by: Toshi Kani <toshi.kani@hp.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Ram Pai <linuxram@us.ibm.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-04-29 22:08:56 +00:00
|
|
|
free_resource(new_res);
|
resource: add release_mem_region_adjustable()
Add release_mem_region_adjustable(), which releases a requested region
from a currently busy memory resource. This interface adjusts the
matched memory resource accordingly even if the requested region does
not match exactly but still fits into.
This new interface is intended for memory hot-delete. During bootup,
memory resources are inserted from the boot descriptor table, such as
EFI Memory Table and e820. Each memory resource entry usually covers
the whole contigous memory range. Memory hot-delete request, on the
other hand, may target to a particular range of memory resource, and its
size can be much smaller than the whole contiguous memory. Since the
existing release interfaces like __release_region() require a requested
region to be exactly matched to a resource entry, they do not allow a
partial resource to be released.
This new interface is restrictive (i.e. release under certain
conditions), which is consistent with other release interfaces,
__release_region() and __release_resource(). Additional release
conditions, such as an overlapping region to a resource entry, can be
supported after they are confirmed as valid cases.
There is no change to the existing interfaces since their restriction is
valid for I/O resources.
[akpm@linux-foundation.org: use GFP_ATOMIC under write_lock()]
[akpm@linux-foundation.org: switch back to GFP_KERNEL, less buggily]
[akpm@linux-foundation.org: remove unneeded and wrong kfree(), per Toshi]
Signed-off-by: Toshi Kani <toshi.kani@hp.com>
Reviewed-by : Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: David Rientjes <rientjes@google.com>
Reviewed-by: Ram Pai <linuxram@us.ibm.com>
Cc: T Makphaibulchoke <tmac@hp.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: Tang Chen <tangchen@cn.fujitsu.com>
Cc: Jiang Liu <jiang.liu@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-04-29 22:08:19 +00:00
|
|
|
}
|
|
|
|
#endif /* CONFIG_MEMORY_HOTREMOVE */
|
|
|
|
|
2020-10-16 03:08:49 +00:00
|
|
|
#ifdef CONFIG_MEMORY_HOTPLUG
|
|
|
|
static bool system_ram_resources_mergeable(struct resource *r1,
|
|
|
|
struct resource *r2)
|
|
|
|
{
|
|
|
|
/* We assume either r1 or r2 is IORESOURCE_SYSRAM_MERGEABLE. */
|
|
|
|
return r1->flags == r2->flags && r1->end + 1 == r2->start &&
|
|
|
|
r1->name == r2->name && r1->desc == r2->desc &&
|
|
|
|
!r1->child && !r2->child;
|
|
|
|
}
|
|
|
|
|
2020-12-16 04:46:16 +00:00
|
|
|
/**
|
2020-10-16 03:08:49 +00:00
|
|
|
* merge_system_ram_resource - mark the System RAM resource mergeable and try to
|
2020-12-16 04:46:16 +00:00
|
|
|
* merge it with adjacent, mergeable resources
|
2020-10-16 03:08:49 +00:00
|
|
|
* @res: resource descriptor
|
|
|
|
*
|
|
|
|
* This interface is intended for memory hotplug, whereby lots of contiguous
|
|
|
|
* system ram resources are added (e.g., via add_memory*()) by a driver, and
|
|
|
|
* the actual resource boundaries are not of interest (e.g., it might be
|
|
|
|
* relevant for DIMMs). Only resources that are marked mergeable, that have the
|
|
|
|
* same parent, and that don't have any children are considered. All mergeable
|
|
|
|
* resources must be immutable during the request.
|
|
|
|
*
|
|
|
|
* Note:
|
|
|
|
* - The caller has to make sure that no pointers to resources that are
|
|
|
|
* marked mergeable are used anymore after this call - the resource might
|
|
|
|
* be freed and the pointer might be stale!
|
|
|
|
* - release_mem_region_adjustable() will split on demand on memory hotunplug
|
|
|
|
*/
|
|
|
|
void merge_system_ram_resource(struct resource *res)
|
|
|
|
{
|
|
|
|
const unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
|
|
|
|
struct resource *cur;
|
|
|
|
|
|
|
|
if (WARN_ON_ONCE((res->flags & flags) != flags))
|
|
|
|
return;
|
|
|
|
|
|
|
|
write_lock(&resource_lock);
|
|
|
|
res->flags |= IORESOURCE_SYSRAM_MERGEABLE;
|
|
|
|
|
|
|
|
/* Try to merge with next item in the list. */
|
|
|
|
cur = res->sibling;
|
|
|
|
if (cur && system_ram_resources_mergeable(res, cur)) {
|
|
|
|
res->end = cur->end;
|
|
|
|
res->sibling = cur->sibling;
|
|
|
|
free_resource(cur);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Try to merge with previous item in the list. */
|
|
|
|
cur = res->parent->child;
|
|
|
|
while (cur && cur->sibling != res)
|
|
|
|
cur = cur->sibling;
|
|
|
|
if (cur && system_ram_resources_mergeable(cur, res)) {
|
|
|
|
cur->end = res->end;
|
|
|
|
cur->sibling = res->sibling;
|
|
|
|
free_resource(res);
|
|
|
|
}
|
|
|
|
write_unlock(&resource_lock);
|
|
|
|
}
|
|
|
|
#endif /* CONFIG_MEMORY_HOTPLUG */
|
|
|
|
|
devres: device resource management
Implement device resource management, in short, devres. A device
driver can allocate arbitrary size of devres data which is associated
with a release function. On driver detach, release function is
invoked on the devres data, then, devres data is freed.
devreses are typed by associated release functions. Some devreses are
better represented by single instance of the type while others need
multiple instances sharing the same release function. Both usages are
supported.
devreses can be grouped using devres group such that a device driver
can easily release acquired resources halfway through initialization
or selectively release resources (e.g. resources for port 1 out of 4
ports).
This patch adds devres core including documentation and the following
managed interfaces.
* alloc/free : devm_kzalloc(), devm_kzfree()
* IO region : devm_request_region(), devm_release_region()
* IRQ : devm_request_irq(), devm_free_irq()
* DMA : dmam_alloc_coherent(), dmam_free_coherent(),
dmam_declare_coherent_memory(), dmam_pool_create(),
dmam_pool_destroy()
* PCI : pcim_enable_device(), pcim_pin_device(), pci_is_managed()
* iomap : devm_ioport_map(), devm_ioport_unmap(), devm_ioremap(),
devm_ioremap_nocache(), devm_iounmap(), pcim_iomap_table(),
pcim_iomap(), pcim_iounmap()
Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
2007-01-20 07:00:26 +00:00
|
|
|
/*
|
|
|
|
* Managed region resource
|
|
|
|
*/
|
2014-08-01 12:15:10 +00:00
|
|
|
/*
 * devres release callback for devm_request_resource().
 *
 * @ptr is the devres payload; it stores a pointer to the resource that was
 * requested, which is handed to release_resource().
 */
static void devm_resource_release(struct device *dev, void *ptr)
{
	struct resource *res = *(struct resource **)ptr;

	release_resource(res);
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* devm_request_resource() - request and reserve an I/O or memory resource
|
|
|
|
* @dev: device for which to request the resource
|
|
|
|
* @root: root of the resource tree from which to request the resource
|
|
|
|
* @new: descriptor of the resource to request
|
|
|
|
*
|
|
|
|
* This is a device-managed version of request_resource(). There is usually
|
|
|
|
* no need to release resources requested by this function explicitly since
|
|
|
|
* that will be taken care of when the device is unbound from its driver.
|
|
|
|
* If for some reason the resource needs to be released explicitly, because
|
|
|
|
* of ordering issues for example, drivers must call devm_release_resource()
|
|
|
|
* rather than the regular release_resource().
|
|
|
|
*
|
|
|
|
* When a conflict is detected between any existing resources and the newly
|
|
|
|
* requested resource, an error message will be printed.
|
|
|
|
*
|
|
|
|
* Returns 0 on success or a negative error code on failure.
|
|
|
|
*/
|
|
|
|
int devm_request_resource(struct device *dev, struct resource *root,
|
|
|
|
struct resource *new)
|
|
|
|
{
|
|
|
|
struct resource *conflict, **ptr;
|
|
|
|
|
|
|
|
ptr = devres_alloc(devm_resource_release, sizeof(*ptr), GFP_KERNEL);
|
|
|
|
if (!ptr)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
*ptr = new;
|
|
|
|
|
|
|
|
conflict = request_resource_conflict(root, new);
|
|
|
|
if (conflict) {
|
|
|
|
dev_err(dev, "resource collision: %pR conflicts with %s %pR\n",
|
|
|
|
new, conflict->name, conflict);
|
|
|
|
devres_free(ptr);
|
|
|
|
return -EBUSY;
|
|
|
|
}
|
|
|
|
|
|
|
|
devres_add(dev, ptr);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(devm_request_resource);
|
|
|
|
|
|
|
|
/*
 * devres match callback: @res is a devres payload storing a resource pointer;
 * report a match when that stored pointer equals @data.
 */
static int devm_resource_match(struct device *dev, void *res, void *data)
{
	struct resource *tracked = *(struct resource **)res;

	return tracked == data;
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* devm_release_resource() - release a previously requested resource
|
|
|
|
* @dev: device for which to release the resource
|
|
|
|
* @new: descriptor of the resource to release
|
|
|
|
*
|
|
|
|
* Releases a resource previously requested using devm_request_resource().
|
|
|
|
*/
|
|
|
|
void devm_release_resource(struct device *dev, struct resource *new)
|
|
|
|
{
|
|
|
|
WARN_ON(devres_release(dev, devm_resource_release, devm_resource_match,
|
|
|
|
new));
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(devm_release_resource);
|
|
|
|
|
devres: device resource management
Implement device resource management, in short, devres. A device
driver can allocate arbitrary size of devres data which is associated
with a release function. On driver detach, release function is
invoked on the devres data, then, devres data is freed.
devreses are typed by associated release functions. Some devreses are
better represented by single instance of the type while others need
multiple instances sharing the same release function. Both usages are
supported.
devreses can be grouped using devres group such that a device driver
can easily release acquired resources halfway through initialization
or selectively release resources (e.g. resources for port 1 out of 4
ports).
This patch adds devres core including documentation and the following
managed interfaces.
* alloc/free : devm_kzalloc(), devm_kzfree()
* IO region : devm_request_region(), devm_release_region()
* IRQ : devm_request_irq(), devm_free_irq()
* DMA : dmam_alloc_coherent(), dmam_free_coherent(),
dmam_declare_coherent_memory(), dmam_pool_create(),
dmam_pool_destroy()
* PCI : pcim_enable_device(), pcim_pin_device(), pci_is_managed()
* iomap : devm_ioport_map(), devm_ioport_unmap(), devm_ioremap(),
devm_ioremap_nocache(), devm_iounmap(), pcim_iomap_table(),
pcim_iomap(), pcim_iounmap()
Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
2007-01-20 07:00:26 +00:00
|
|
|
struct region_devres {
|
|
|
|
struct resource *parent;
|
|
|
|
resource_size_t start;
|
|
|
|
resource_size_t n;
|
|
|
|
};
|
|
|
|
|
|
|
|
static void devm_region_release(struct device *dev, void *res)
|
|
|
|
{
|
|
|
|
struct region_devres *this = res;
|
|
|
|
|
|
|
|
__release_region(this->parent, this->start, this->n);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int devm_region_match(struct device *dev, void *res, void *match_data)
|
|
|
|
{
|
|
|
|
struct region_devres *this = res, *match = match_data;
|
|
|
|
|
|
|
|
return this->parent == match->parent &&
|
|
|
|
this->start == match->start && this->n == match->n;
|
|
|
|
}
|
|
|
|
|
2018-10-09 14:11:21 +00:00
|
|
|
struct resource *
|
|
|
|
__devm_request_region(struct device *dev, struct resource *parent,
|
|
|
|
resource_size_t start, resource_size_t n, const char *name)
|
devres: device resource management
Implement device resource management, in short, devres. A device
driver can allocate arbirary size of devres data which is associated
with a release function. On driver detach, release function is
invoked on the devres data, then, devres data is freed.
devreses are typed by associated release functions. Some devreses are
better represented by single instance of the type while others need
multiple instances sharing the same release function. Both usages are
supported.
devreses can be grouped using devres group such that a device driver
can easily release acquired resources halfway through initialization
or selectively release resources (e.g. resources for port 1 out of 4
ports).
This patch adds devres core including documentation and the following
managed interfaces.
* alloc/free : devm_kzalloc(), devm_kzfree()
* IO region : devm_request_region(), devm_release_region()
* IRQ : devm_request_irq(), devm_free_irq()
* DMA : dmam_alloc_coherent(), dmam_free_coherent(),
dmam_declare_coherent_memory(), dmam_pool_create(),
dmam_pool_destroy()
* PCI : pcim_enable_device(), pcim_pin_device(), pci_is_managed()
* iomap : devm_ioport_map(), devm_ioport_unmap(), devm_ioremap(),
devm_ioremap_nocache(), devm_iounmap(), pcim_iomap_table(),
pcim_iomap(), pcim_iounmap()
Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
2007-01-20 07:00:26 +00:00
|
|
|
{
|
|
|
|
struct region_devres *dr = NULL;
|
|
|
|
struct resource *res;
|
|
|
|
|
|
|
|
dr = devres_alloc(devm_region_release, sizeof(struct region_devres),
|
|
|
|
GFP_KERNEL);
|
|
|
|
if (!dr)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
dr->parent = parent;
|
|
|
|
dr->start = start;
|
|
|
|
dr->n = n;
|
|
|
|
|
2008-10-23 02:55:31 +00:00
|
|
|
res = __request_region(parent, start, n, name, 0);
|
devres: device resource management
Implement device resource management, in short, devres. A device
driver can allocate arbirary size of devres data which is associated
with a release function. On driver detach, release function is
invoked on the devres data, then, devres data is freed.
devreses are typed by associated release functions. Some devreses are
better represented by single instance of the type while others need
multiple instances sharing the same release function. Both usages are
supported.
devreses can be grouped using devres group such that a device driver
can easily release acquired resources halfway through initialization
or selectively release resources (e.g. resources for port 1 out of 4
ports).
This patch adds devres core including documentation and the following
managed interfaces.
* alloc/free : devm_kzalloc(), devm_kzfree()
* IO region : devm_request_region(), devm_release_region()
* IRQ : devm_request_irq(), devm_free_irq()
* DMA : dmam_alloc_coherent(), dmam_free_coherent(),
dmam_declare_coherent_memory(), dmam_pool_create(),
dmam_pool_destroy()
* PCI : pcim_enable_device(), pcim_pin_device(), pci_is_managed()
* iomap : devm_ioport_map(), devm_ioport_unmap(), devm_ioremap(),
devm_ioremap_nocache(), devm_iounmap(), pcim_iomap_table(),
pcim_iomap(), pcim_iounmap()
Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
2007-01-20 07:00:26 +00:00
|
|
|
if (res)
|
|
|
|
devres_add(dev, dr);
|
|
|
|
else
|
|
|
|
devres_free(dr);
|
|
|
|
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(__devm_request_region);
|
|
|
|
|
|
|
|
/*
 * Release the region (@parent, @start, @n) and drop the devres entry on
 * @dev that matches it.  Warns if devres_destroy() returns non-zero.
 */
void __devm_release_region(struct device *dev, struct resource *parent,
			   resource_size_t start, resource_size_t n)
{
	struct region_devres key = {
		.parent	= parent,
		.start	= start,
		.n	= n,
	};

	__release_region(parent, start, n);
	WARN_ON(devres_destroy(dev, devm_region_release, devm_region_match,
			       &key));
}
EXPORT_SYMBOL(__devm_release_region);
|
|
|
|
|
2005-04-16 22:20:36 +00:00
|
|
|
/*
|
2017-12-01 17:50:33 +00:00
|
|
|
* Reserve I/O ports or memory based on "reserve=" kernel parameter.
|
2005-04-16 22:20:36 +00:00
|
|
|
*/
|
|
|
|
#define MAXRESERVE 4
|
|
|
|
static int __init reserve_setup(char *str)
|
|
|
|
{
|
|
|
|
static int reserved;
|
|
|
|
static struct resource reserve[MAXRESERVE];
|
|
|
|
|
|
|
|
for (;;) {
|
2009-06-30 18:41:31 +00:00
|
|
|
unsigned int io_start, io_num;
|
2005-04-16 22:20:36 +00:00
|
|
|
int x = reserved;
|
2017-12-01 17:50:33 +00:00
|
|
|
struct resource *parent;
|
2005-04-16 22:20:36 +00:00
|
|
|
|
2017-12-01 17:50:33 +00:00
|
|
|
if (get_option(&str, &io_start) != 2)
|
2005-04-16 22:20:36 +00:00
|
|
|
break;
|
2017-12-01 17:50:33 +00:00
|
|
|
if (get_option(&str, &io_num) == 0)
|
2005-04-16 22:20:36 +00:00
|
|
|
break;
|
|
|
|
if (x < MAXRESERVE) {
|
|
|
|
struct resource *res = reserve + x;
|
2017-12-01 17:50:33 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If the region starts below 0x10000, we assume it's
|
|
|
|
* I/O port space; otherwise assume it's memory.
|
|
|
|
*/
|
|
|
|
if (io_start < 0x10000) {
|
|
|
|
res->flags = IORESOURCE_IO;
|
|
|
|
parent = &ioport_resource;
|
|
|
|
} else {
|
|
|
|
res->flags = IORESOURCE_MEM;
|
|
|
|
parent = &iomem_resource;
|
|
|
|
}
|
2005-04-16 22:20:36 +00:00
|
|
|
res->name = "reserved";
|
|
|
|
res->start = io_start;
|
|
|
|
res->end = io_start + io_num - 1;
|
2017-12-01 17:50:33 +00:00
|
|
|
res->flags |= IORESOURCE_BUSY;
|
2016-01-26 20:57:19 +00:00
|
|
|
res->desc = IORES_DESC_NONE;
|
2005-04-16 22:20:36 +00:00
|
|
|
res->child = NULL;
|
2017-12-01 17:50:33 +00:00
|
|
|
if (request_resource(parent, res) == 0)
|
2005-04-16 22:20:36 +00:00
|
|
|
reserved = x+1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
__setup("reserve=", reserve_setup);
|
2008-09-26 01:43:34 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Check if the requested addr and size spans more than any slot in the
|
|
|
|
* iomem resource tree.
|
|
|
|
*/
|
|
|
|
int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
|
|
|
|
{
|
2022-11-09 15:56:17 +00:00
|
|
|
resource_size_t end = addr + size - 1;
|
2023-09-12 16:53:10 +00:00
|
|
|
struct resource *p;
|
2008-09-26 01:43:34 +00:00
|
|
|
int err = 0;
|
|
|
|
|
|
|
|
read_lock(&resource_lock);
|
2023-09-12 16:53:10 +00:00
|
|
|
for_each_resource(&iomem_resource, p, false) {
|
2008-09-26 01:43:34 +00:00
|
|
|
/*
|
|
|
|
* We can probably skip the resources without
|
|
|
|
* IORESOURCE_IO attribute?
|
|
|
|
*/
|
2022-11-09 15:56:17 +00:00
|
|
|
if (p->start > end)
|
2008-09-26 01:43:34 +00:00
|
|
|
continue;
|
|
|
|
if (p->end < addr)
|
|
|
|
continue;
|
2008-10-28 18:45:42 +00:00
|
|
|
if (PFN_DOWN(p->start) <= PFN_DOWN(addr) &&
|
2022-11-09 15:56:17 +00:00
|
|
|
PFN_DOWN(p->end) >= PFN_DOWN(end))
|
2008-09-26 01:43:34 +00:00
|
|
|
continue;
|
2008-12-13 17:15:27 +00:00
|
|
|
/*
|
|
|
|
* if a resource is "BUSY", it's not a hardware resource
|
|
|
|
* but a driver mapping of such a resource; we don't want
|
|
|
|
* to warn for those; some drivers legitimately map only
|
|
|
|
* partial hardware resources. (example: vesafb)
|
|
|
|
*/
|
|
|
|
if (p->flags & IORESOURCE_BUSY)
|
|
|
|
continue;
|
|
|
|
|
2022-11-09 15:56:17 +00:00
|
|
|
pr_warn("resource sanity check: requesting [mem %pa-%pa], which spans more than %s %pR\n",
|
|
|
|
&addr, &end, p->name, p);
|
2008-09-26 01:43:34 +00:00
|
|
|
err = -1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
read_unlock(&resource_lock);
|
|
|
|
|
|
|
|
return err;
|
|
|
|
}
|
2008-10-23 02:55:31 +00:00
|
|
|
|
|
|
|
#ifdef CONFIG_STRICT_DEVMEM
|
|
|
|
static int strict_iomem_checks = 1;
|
|
|
|
#else
|
|
|
|
static int strict_iomem_checks;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/*
|
kernel/resource: disallow access to exclusive system RAM regions
virtio-mem dynamically exposes memory inside a device memory region as
system RAM to Linux, coordinating with the hypervisor which parts are
actually "plugged" and consequently usable/accessible.
On the one hand, the virtio-mem driver adds/removes whole memory blocks,
creating/removing busy IORESOURCE_SYSTEM_RAM resources, on the other
hand, it logically (un)plugs memory inside added memory blocks,
dynamically either exposing them to the buddy or hiding them from the
buddy and marking them PG_offline.
In contrast to physical devices, like a DIMM, the virtio-mem driver is
required to actually make use of any of the device-provided memory,
because it performs the handshake with the hypervisor. virtio-mem
memory cannot simply be accessed via /dev/mem without a driver.
There is no safe way to:
a) Access plugged memory blocks via /dev/mem, as they might contain
unplugged holes or might get silently unplugged by the virtio-mem
driver and consequently turned inaccessible.
b) Access unplugged memory blocks via /dev/mem because the virtio-mem
driver is required to make them actually accessible first.
The virtio-spec states that unplugged memory blocks MUST NOT be written,
and only selected unplugged memory blocks MAY be read. We want to make
sure, this is the case in sane environments -- where the virtio-mem driver
was loaded.
We want to make sure that in a sane environment, nobody "accidentally"
accesses unplugged memory inside the device managed region. For example,
a user might spot a memory region in /proc/iomem and try accessing it via
/dev/mem via gdb or dumping it via something else. By the time the mmap()
happens, the memory might already have been removed by the virtio-mem
driver silently: the mmap() would succeed and user space might
accidentally access unplugged memory.
So once the driver was loaded and detected the device along the
device-managed region, we just want to disallow any access via /dev/mem to
it.
In an ideal world, we would mark the whole region as busy ("owned by a
driver") and exclude it; however, that would be wrong, as we don't really
have actual system RAM at these ranges added to Linux ("busy system RAM").
Instead, we want to mark such ranges as "not actual busy system RAM but
still soft-reserved and prepared by a driver for future use."
Let's teach iomem_is_exclusive() to reject access to any range with
"IORESOURCE_SYSTEM_RAM | IORESOURCE_EXCLUSIVE", even if not busy and even
if "iomem=relaxed" is set. Introduce EXCLUSIVE_SYSTEM_RAM to make it
easier for applicable drivers to depend on this setting in their Kconfig.
For now, there are no applicable ranges and we'll modify virtio-mem next
to properly set IORESOURCE_EXCLUSIVE on the parent resource container it
creates to contain all actual busy system RAM added via
add_memory_driver_managed().
Link: https://lkml.kernel.org/r/20210920142856.17758-3-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Cc: Andy Shevchenko <andy.shevchenko@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Hanjun Guo <guohanjun@huawei.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-11-09 02:35:50 +00:00
|
|
|
* Check if an address is exclusive to the kernel and must not be mapped to
|
|
|
|
* user space, for example, via /dev/mem.
|
|
|
|
*
|
|
|
|
* Returns true if exclusive to the kernel, otherwise returns false.
|
2008-10-23 02:55:31 +00:00
|
|
|
*/
|
2022-09-26 21:57:10 +00:00
|
|
|
bool resource_is_exclusive(struct resource *root, u64 addr, resource_size_t size)
{
	const unsigned int sysram_excl = IORESOURCE_SYSTEM_RAM |
					 IORESOURCE_EXCLUSIVE;
	bool skip_children = false;
	bool exclusive = false;
	struct resource *p;

	read_lock(&resource_lock);
	for_each_resource(root, p, skip_children) {
		/* This resource starts at or beyond the end of the range. */
		if (p->start >= addr + size)
			break;

		/*
		 * This resource ends before the range: move on, asking the
		 * iterator to skip its children as well.
		 */
		if (p->end < addr) {
			skip_children = true;
			continue;
		}
		skip_children = false;

		/*
		 * IORESOURCE_SYSTEM_RAM resources are exclusive if
		 * IORESOURCE_EXCLUSIVE is set, even if they
		 * are not busy and even if "iomem=relaxed" is set. The
		 * responsible driver dynamically adds/removes system RAM within
		 * such an area and uncontrolled access is dangerous.
		 */
		if ((p->flags & sysram_excl) == sysram_excl) {
			exclusive = true;
			break;
		}

		/*
		 * A resource is exclusive if IORESOURCE_EXCLUSIVE is set
		 * or CONFIG_IO_STRICT_DEVMEM is enabled and the
		 * resource is busy.  Neither applies unless strict checks
		 * are on and the resource is busy.
		 */
		if (strict_iomem_checks && (p->flags & IORESOURCE_BUSY) &&
		    (IS_ENABLED(CONFIG_IO_STRICT_DEVMEM) ||
		     (p->flags & IORESOURCE_EXCLUSIVE))) {
			exclusive = true;
			break;
		}
	}
	read_unlock(&resource_lock);

	return exclusive;
}
|
|
|
|
|
2022-09-26 21:57:10 +00:00
|
|
|
/*
 * Run resource_is_exclusive() on iomem_resource for the PAGE_SIZE range
 * that starts at @addr masked with PAGE_MASK.
 */
bool iomem_is_exclusive(u64 addr)
{
	const u64 page_start = addr & PAGE_MASK;

	return resource_is_exclusive(&iomem_resource, page_start, PAGE_SIZE);
}
|
|
|
|
|
2015-02-05 05:44:43 +00:00
|
|
|
struct resource_entry *resource_list_create_entry(struct resource *res,
|
|
|
|
size_t extra_size)
|
|
|
|
{
|
|
|
|
struct resource_entry *entry;
|
|
|
|
|
|
|
|
entry = kzalloc(sizeof(*entry) + extra_size, GFP_KERNEL);
|
|
|
|
if (entry) {
|
|
|
|
INIT_LIST_HEAD(&entry->node);
|
|
|
|
entry->res = res ? res : &entry->__res;
|
|
|
|
}
|
|
|
|
|
|
|
|
return entry;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(resource_list_create_entry);
|
|
|
|
|
|
|
|
void resource_list_free(struct list_head *head)
|
|
|
|
{
|
|
|
|
struct resource_entry *entry, *tmp;
|
|
|
|
|
|
|
|
list_for_each_entry_safe(entry, tmp, head, node)
|
|
|
|
resource_list_destroy_entry(entry);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(resource_list_free);
|
|
|
|
|
2022-05-20 20:41:24 +00:00
|
|
|
#ifdef CONFIG_GET_FREE_REGION
|
|
|
|
/* Search from the top of the range downwards instead of upwards. */
#define GFR_DESCENDING		(1UL << 0)
/* Claim the range with __request_region_locked() instead of inserting it. */
#define GFR_REQUEST_REGION	(1UL << 1)
/*
 * Default alignment: 1 << PA_SECTION_SHIFT where that macro is defined,
 * PAGE_SIZE otherwise.
 */
#if defined(PA_SECTION_SHIFT)
#define GFR_DEFAULT_ALIGN	(1UL << PA_SECTION_SHIFT)
#else
#define GFR_DEFAULT_ALIGN	PAGE_SIZE
#endif
|
2022-05-20 20:41:24 +00:00
|
|
|
|
|
|
|
/*
 * First candidate address for the free-region search.
 *
 * Ascending: the larger of @base->start and @align, rounded up to @align.
 * Descending: the address at which a @size byte range ends exactly at the
 * lower of @base->end and DIRECT_MAP_PHYSMEM_END.
 */
static resource_size_t gfr_start(struct resource *base, resource_size_t size,
				 resource_size_t align, unsigned long flags)
{
	resource_size_t top;

	if (!(flags & GFR_DESCENDING))
		return ALIGN(max(base->start, align), align);

	top = min_t(resource_size_t, base->end, DIRECT_MAP_PHYSMEM_END);
	return top - size + 1;
}
|
|
|
|
|
|
|
|
/*
 * Loop condition for the free-region search: true while @addr is still a
 * candidate worth examining for the given direction.
 */
static bool gfr_continue(struct resource *base, resource_size_t addr,
			 resource_size_t size, unsigned long flags)
{
	resource_size_t limit;

	if (flags & GFR_DESCENDING)
		return addr > size && addr >= base->start;

	/*
	 * In the ascend case be careful that the last increment by
	 * @size did not wrap 0.
	 */
	if (addr <= addr - size)
		return false;

	limit = min_t(resource_size_t, base->end, DIRECT_MAP_PHYSMEM_END);
	return addr <= limit;
}
|
|
|
|
|
|
|
|
/*
 * Step the search address by @size: downwards when GFR_DESCENDING is set,
 * upwards otherwise.
 */
static resource_size_t gfr_next(resource_size_t addr, resource_size_t size,
				unsigned long flags)
{
	return (flags & GFR_DESCENDING) ? addr - size : addr + size;
}
|
|
|
|
|
|
|
|
static void remove_free_mem_region(void *_res)
|
|
|
|
{
|
|
|
|
struct resource *res = _res;
|
|
|
|
|
|
|
|
if (res->parent)
|
|
|
|
remove_resource(res);
|
|
|
|
free_resource(res);
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct resource *
|
|
|
|
get_free_mem_region(struct device *dev, struct resource *base,
|
|
|
|
resource_size_t size, const unsigned long align,
|
|
|
|
const char *name, const unsigned long desc,
|
|
|
|
const unsigned long flags)
|
2019-06-26 12:27:06 +00:00
|
|
|
{
|
2022-05-20 20:41:24 +00:00
|
|
|
resource_size_t addr;
|
2019-06-26 12:27:06 +00:00
|
|
|
struct resource *res;
|
2021-05-07 01:05:30 +00:00
|
|
|
struct region_devres *dr = NULL;
|
2019-06-26 12:27:06 +00:00
|
|
|
|
2022-05-20 20:41:24 +00:00
|
|
|
size = ALIGN(size, align);
|
2019-06-26 12:27:06 +00:00
|
|
|
|
2021-05-07 01:05:30 +00:00
|
|
|
res = alloc_resource(GFP_KERNEL);
|
|
|
|
if (!res)
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
2022-05-20 20:41:24 +00:00
|
|
|
if (dev && (flags & GFR_REQUEST_REGION)) {
|
2021-05-07 01:05:30 +00:00
|
|
|
dr = devres_alloc(devm_region_release,
|
|
|
|
sizeof(struct region_devres), GFP_KERNEL);
|
|
|
|
if (!dr) {
|
|
|
|
free_resource(res);
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
}
|
2022-05-20 20:41:24 +00:00
|
|
|
} else if (dev) {
|
|
|
|
if (devm_add_action_or_reset(dev, remove_free_mem_region, res))
|
|
|
|
return ERR_PTR(-ENOMEM);
|
2021-05-07 01:05:30 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
write_lock(&resource_lock);
|
2022-05-20 20:41:24 +00:00
|
|
|
for (addr = gfr_start(base, size, align, flags);
|
2023-11-13 22:13:24 +00:00
|
|
|
gfr_continue(base, addr, align, flags);
|
|
|
|
addr = gfr_next(addr, align, flags)) {
|
2022-05-20 20:41:24 +00:00
|
|
|
if (__region_intersects(base, addr, size, 0, IORES_DESC_NONE) !=
|
|
|
|
REGION_DISJOINT)
|
2019-06-26 12:27:06 +00:00
|
|
|
continue;
|
|
|
|
|
2022-05-20 20:41:24 +00:00
|
|
|
if (flags & GFR_REQUEST_REGION) {
|
|
|
|
if (__request_region_locked(res, &iomem_resource, addr,
|
|
|
|
size, name, 0))
|
|
|
|
break;
|
2021-05-07 01:05:30 +00:00
|
|
|
|
2022-05-20 20:41:24 +00:00
|
|
|
if (dev) {
|
|
|
|
dr->parent = &iomem_resource;
|
|
|
|
dr->start = addr;
|
|
|
|
dr->n = size;
|
|
|
|
devres_add(dev, dr);
|
|
|
|
}
|
2021-05-07 01:05:30 +00:00
|
|
|
|
2022-05-20 20:41:24 +00:00
|
|
|
res->desc = desc;
|
|
|
|
write_unlock(&resource_lock);
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* A driver is claiming this region so revoke any
|
|
|
|
* mappings.
|
|
|
|
*/
|
|
|
|
revoke_iomem(res);
|
|
|
|
} else {
|
|
|
|
res->start = addr;
|
|
|
|
res->end = addr + size - 1;
|
|
|
|
res->name = name;
|
|
|
|
res->desc = desc;
|
|
|
|
res->flags = IORESOURCE_MEM;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Only succeed if the resource hosts an exclusive
|
|
|
|
* range after the insert
|
|
|
|
*/
|
|
|
|
if (__insert_resource(base, res) || res->child)
|
|
|
|
break;
|
|
|
|
|
|
|
|
write_unlock(&resource_lock);
|
|
|
|
}
|
2021-05-07 01:05:30 +00:00
|
|
|
|
2019-06-26 12:27:06 +00:00
|
|
|
return res;
|
|
|
|
}
|
2021-05-07 01:05:30 +00:00
|
|
|
write_unlock(&resource_lock);
|
|
|
|
|
2022-05-20 20:41:24 +00:00
|
|
|
if (flags & GFR_REQUEST_REGION) {
|
|
|
|
free_resource(res);
|
2021-05-07 01:05:30 +00:00
|
|
|
devres_free(dr);
|
2022-05-20 20:41:24 +00:00
|
|
|
} else if (dev)
|
|
|
|
devm_release_action(dev, remove_free_mem_region, res);
|
2019-06-26 12:27:06 +00:00
|
|
|
|
|
|
|
return ERR_PTR(-ERANGE);
|
|
|
|
}
|
2019-08-18 09:05:54 +00:00
|
|
|
|
|
|
|
/**
|
|
|
|
* devm_request_free_mem_region - find free region for device private memory
|
|
|
|
*
|
|
|
|
* @dev: device struct to bind the resource to
|
|
|
|
* @size: size in bytes of the device memory to add
|
|
|
|
* @base: resource tree to look in
|
|
|
|
*
|
|
|
|
* This function tries to find an empty range of physical address big enough to
|
|
|
|
* contain the new resource, so that it can later be hotplugged as ZONE_DEVICE
|
|
|
|
* memory, which in turn allocates struct pages.
|
|
|
|
*/
|
|
|
|
struct resource *devm_request_free_mem_region(struct device *dev,
|
|
|
|
struct resource *base, unsigned long size)
|
|
|
|
{
|
2022-05-20 20:41:24 +00:00
|
|
|
unsigned long flags = GFR_DESCENDING | GFR_REQUEST_REGION;
|
|
|
|
|
|
|
|
return get_free_mem_region(dev, base, size, GFR_DEFAULT_ALIGN,
|
|
|
|
dev_name(dev),
|
|
|
|
IORES_DESC_DEVICE_PRIVATE_MEMORY, flags);
|
2019-08-18 09:05:54 +00:00
|
|
|
}
|
2019-06-26 12:27:06 +00:00
|
|
|
EXPORT_SYMBOL_GPL(devm_request_free_mem_region);
|
2019-08-18 09:05:54 +00:00
|
|
|
|
|
|
|
struct resource *request_free_mem_region(struct resource *base,
|
|
|
|
unsigned long size, const char *name)
|
|
|
|
{
|
2022-05-20 20:41:24 +00:00
|
|
|
unsigned long flags = GFR_DESCENDING | GFR_REQUEST_REGION;
|
|
|
|
|
|
|
|
return get_free_mem_region(NULL, base, size, GFR_DEFAULT_ALIGN, name,
|
|
|
|
IORES_DESC_DEVICE_PRIVATE_MEMORY, flags);
|
2019-08-18 09:05:54 +00:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(request_free_mem_region);
|
|
|
|
|
2022-05-20 20:41:24 +00:00
|
|
|
/**
|
|
|
|
* alloc_free_mem_region - find a free region relative to @base
|
|
|
|
* @base: resource that will parent the new resource
|
|
|
|
* @size: size in bytes of memory to allocate from @base
|
|
|
|
* @align: alignment requirements for the allocation
|
|
|
|
* @name: resource name
|
|
|
|
*
|
|
|
|
* Buses like CXL, that can dynamically instantiate new memory regions,
|
|
|
|
* need a method to allocate physical address space for those regions.
|
|
|
|
* Allocate and insert a new resource to cover a free, unclaimed by a
|
|
|
|
* descendant of @base, range in the span of @base.
|
|
|
|
*/
|
|
|
|
struct resource *alloc_free_mem_region(struct resource *base,
|
|
|
|
unsigned long size, unsigned long align,
|
|
|
|
const char *name)
|
|
|
|
{
|
|
|
|
/* Default of ascending direction and insert resource */
|
|
|
|
unsigned long flags = 0;
|
|
|
|
|
|
|
|
return get_free_mem_region(NULL, base, size, align, name,
|
|
|
|
IORES_DESC_NONE, flags);
|
|
|
|
}
|
2024-09-06 03:07:13 +00:00
|
|
|
EXPORT_SYMBOL_GPL(alloc_free_mem_region);
|
2022-05-20 20:41:24 +00:00
|
|
|
#endif /* CONFIG_GET_FREE_REGION */
|
2019-06-26 12:27:06 +00:00
|
|
|
|
2008-10-23 02:55:31 +00:00
|
|
|
/*
 * Parse the "iomem=" option value: "strict" anywhere in @str turns the
 * strict checks on, otherwise "relaxed" turns them off.  A string with
 * neither leaves the setting unchanged.  Always returns 1.
 */
static int __init strict_iomem(char *str)
{
	const bool relaxed = strstr(str, "relaxed") != NULL;
	const bool strict = strstr(str, "strict") != NULL;

	/* "strict" takes precedence when both words are present. */
	if (strict)
		strict_iomem_checks = 1;
	else if (relaxed)
		strict_iomem_checks = 0;

	return 1;
}
|
|
|
|
|
2020-11-27 16:41:24 +00:00
|
|
|
static int iomem_fs_init_fs_context(struct fs_context *fc)
|
|
|
|
{
|
|
|
|
return init_pseudo(fc, DEVMEM_MAGIC) ? 0 : -ENOMEM;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct file_system_type iomem_fs_type = {
|
|
|
|
.name = "iomem",
|
|
|
|
.owner = THIS_MODULE,
|
|
|
|
.init_fs_context = iomem_fs_init_fs_context,
|
|
|
|
.kill_sb = kill_anon_super,
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Pin the iomem pseudo filesystem, allocate an anonymous inode on it and
 * publish that inode through iomem_inode.  Returns 0 on success or the
 * negative error from the step that failed; the filesystem pin is dropped
 * again if the inode allocation fails.
 */
static int __init iomem_init_inode(void)
{
	static struct vfsmount *iomem_vfs_mount;
	static int iomem_fs_cnt;
	struct inode *inode;
	int err;

	err = simple_pin_fs(&iomem_fs_type, &iomem_vfs_mount, &iomem_fs_cnt);
	if (err < 0) {
		pr_err("Cannot mount iomem pseudo filesystem: %d\n", err);
		return err;
	}

	inode = alloc_anon_inode(iomem_vfs_mount->mnt_sb);
	if (!IS_ERR(inode)) {
		/*
		 * Publish iomem revocation inode initialized.
		 * Pairs with smp_load_acquire() in revoke_iomem().
		 */
		smp_store_release(&iomem_inode, inode);
		return 0;
	}

	err = PTR_ERR(inode);
	pr_err("Cannot allocate inode for iomem: %d\n", err);
	simple_release_fs(&iomem_vfs_mount, &iomem_fs_cnt);

	return err;
}

fs_initcall(iomem_init_inode);
|
|
|
|
|
2008-10-23 02:55:31 +00:00
|
|
|
__setup("iomem=", strict_iomem);
|