mm/demotion: build demotion targets based on explicit memory tiers

This patch switches the demotion target building logic to use memory tiers
instead of NUMA distance.  All N_MEMORY NUMA nodes will be placed in the
default memory tier and additional memory tiers will be added by drivers
like dax kmem.

This patch builds the demotion target for a NUMA node by looking at all
memory tiers below the tier to which the NUMA node belongs.  The closest
node in the immediately following memory tier is used as a demotion
target.

Since we are now only building demotion targets for N_MEMORY NUMA nodes, the
CPU hotplug calls are removed in this patch.

Link: https://lkml.kernel.org/r/20220818131042.113280-6-aneesh.kumar@linux.ibm.com
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Reviewed-by: "Huang, Ying" <ying.huang@intel.com>
Acked-by: Wei Xu <weixugc@google.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Bharata B Rao <bharata@amd.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Hesham Almatary <hesham.almatary@huawei.com>
Cc: Jagdish Gediya <jvgediya.oss@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tim Chen <tim.c.chen@intel.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
This commit is contained in:
Aneesh Kumar K.V 2022-08-18 18:40:37 +05:30 committed by Andrew Morton
parent 7b88bda376
commit 6c542ab757
5 changed files with 239 additions and 423 deletions

View File

@ -37,6 +37,14 @@ struct memory_dev_type *alloc_memory_type(int adistance);
void destroy_memory_type(struct memory_dev_type *memtype);
void init_node_memory_type(int node, struct memory_dev_type *default_type);
void clear_node_memory_type(int node, struct memory_dev_type *memtype);
#ifdef CONFIG_MIGRATION
int next_demotion_node(int node);
#else
/* Stub for builds without CONFIG_MIGRATION: no node ever has a demotion target. */
static inline int next_demotion_node(int node)
{
return NUMA_NO_NODE;
}
#endif
#else
@ -63,5 +71,10 @@ static inline void clear_node_memory_type(int node, struct memory_dev_type *memt
{
}
/* Stub for builds without CONFIG_NUMA: no node ever has a demotion target. */
static inline int next_demotion_node(int node)
{
return NUMA_NO_NODE;
}
#endif /* CONFIG_NUMA */
#endif /* _LINUX_MEMORY_TIERS_H */

View File

@ -100,19 +100,6 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
#endif /* CONFIG_MIGRATION */
/*
* Demotion-target interface: real implementations are only declared when
* both CONFIG_MIGRATION and CONFIG_NUMA are set; otherwise empty stubs are
* provided and next_demotion_node() always reports NUMA_NO_NODE.
*/
#if defined(CONFIG_MIGRATION) && defined(CONFIG_NUMA)
extern void set_migration_target_nodes(void);
extern void migrate_on_reclaim_init(void);
extern int next_demotion_node(int node);
#else
static inline void set_migration_target_nodes(void) {}
static inline void migrate_on_reclaim_init(void) {}
static inline int next_demotion_node(int node)
{
return NUMA_NO_NODE;
}
#endif
#ifdef CONFIG_COMPACTION
bool PageMovable(struct page *page);
void __SetPageMovable(struct page *page, const struct movable_operations *ops);

View File

@ -6,6 +6,8 @@
#include <linux/memory.h>
#include <linux/memory-tiers.h>
#include "internal.h"
struct memory_tier {
/* hierarchy of memory tiers */
struct list_head list;
@ -19,6 +21,10 @@ struct memory_tier {
int adistance_start;
};
/* Per-node demotion state, indexed by node id in node_demotion[]. */
struct demotion_nodes {
/* candidate demotion targets; next_demotion_node() picks one at random */
nodemask_t preferred;
};
struct node_memory_type_map {
struct memory_dev_type *memtype;
int map_count;
@ -28,6 +34,66 @@ static DEFINE_MUTEX(memory_tier_lock);
static LIST_HEAD(memory_tiers);
static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
static struct memory_dev_type *default_dram_type;
#ifdef CONFIG_MIGRATION
/*
* node_demotion[] examples:
*
* Example 1:
*
* Node 0 & 1 are CPU + DRAM nodes, node 2 & 3 are PMEM nodes.
*
* node distances:
* node 0 1 2 3
* 0 10 20 30 40
* 1 20 10 40 30
* 2 30 40 10 40
* 3 40 30 40 10
*
* memory_tiers0 = 0-1
* memory_tiers1 = 2-3
*
* node_demotion[0].preferred = 2
* node_demotion[1].preferred = 3
* node_demotion[2].preferred = <empty>
* node_demotion[3].preferred = <empty>
*
* Example 2:
*
* Node 0 & 1 are CPU + DRAM nodes, node 2 is memory-only DRAM node.
*
* node distances:
* node 0 1 2
* 0 10 20 30
* 1 20 10 30
* 2 30 30 10
*
* memory_tiers0 = 0-2
*
* node_demotion[0].preferred = <empty>
* node_demotion[1].preferred = <empty>
* node_demotion[2].preferred = <empty>
*
* Example 3:
*
* Node 0 is a CPU + DRAM node, node 1 is an HBM node, node 2 is a PMEM node.
*
* node distances:
* node 0 1 2
* 0 10 20 30
* 1 20 10 40
* 2 30 40 10
*
* memory_tiers0 = 1
* memory_tiers1 = 0
* memory_tiers2 = 2
*
* node_demotion[0].preferred = 2
* node_demotion[1].preferred = 0
* node_demotion[2].preferred = <empty>
*
*/
static struct demotion_nodes *node_demotion __read_mostly;
#endif /* CONFIG_MIGRATION */
static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype)
{
@ -73,6 +139,154 @@ static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memty
return new_memtier;
}
/*
 * Look up the memory tier that @node currently belongs to.
 *
 * node_memory_types[] holds one struct node_memory_type_map per node, so
 * the device type must be read from its ->memtype member rather than from
 * the array element itself.  A tier is reported only while @node is still
 * set in that type's node mask.
 *
 * Return: the memory tier of @node, or NULL if the node has no memory type
 * or is not present in the type's node mask.
 */
static struct memory_tier *__node_get_memory_tier(int node)
{
	struct memory_dev_type *memtype = node_memory_types[node].memtype;

	if (!memtype || !node_isset(node, memtype->nodes))
		return NULL;

	return memtype->memtier;
}
#ifdef CONFIG_MIGRATION
/**
 * next_demotion_node() - Pick the node pages from @node should be demoted to
 * @node: The node whose demotion target is wanted
 *
 * Return: id of one of the preferred demotion targets of @node, or
 * NUMA_NO_NODE when node_demotion[] has not been allocated or @node has no
 * target.  The result is only a snapshot: nothing keeps @node online, and
 * the target may change right after this function returns.
 */
int next_demotion_node(int node)
{
	int picked = NUMA_NO_NODE;

	if (node_demotion) {
		struct demotion_nodes *entry = &node_demotion[node];

		/*
		 * Writers update node_demotion[] without excluding this
		 * reader.  Callers needing a consistent view across several
		 * lookups must hold RCU over the whole sequence themselves.
		 */
		rcu_read_lock();
		/*
		 * With several candidates, choose one at random.
		 * Round-robin would need a "last target" field that bounces
		 * between CPU caches, or per-cpu state, which is more
		 * complex; random selection is good enough for now.
		 */
		picked = node_random(&entry->preferred);
		rcu_read_unlock();
	}

	return picked;
}
static void disable_all_demotion_targets(void)
{
int node;
for_each_node_state(node, N_MEMORY)
node_demotion[node].preferred = NODE_MASK_NONE;
/*
* Ensure that the "disable" is visible across the system.
* Readers will see either a combination of before+disable
* state or disable+after. They will never see before and
* after state together.
*/
synchronize_rcu();
}
/*
 * Collect the nodes of every memory type attached to @memtier.
 *
 * Return: the union of the node masks of all types on memtier->memory_types.
 */
static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
{
	struct memory_dev_type *type;
	nodemask_t all = NODE_MASK_NONE;

	list_for_each_entry(type, &memtier->memory_types, tier_sibiling) {
		nodes_or(all, all, type->nodes);
	}

	return all;
}
/*
 * Rebuild node_demotion[] for every N_MEMORY node.  A node left with an
 * empty preferred mask simply has nowhere to demote to, for instance
 * because it already sits in the last memory tier.
 *
 * The caller must hold memory_tier_lock.
 */
static void establish_demotion_targets(void)
{
	struct memory_tier *tier;
	struct demotion_nodes *entry;
	int candidate = NUMA_NO_NODE, nid;
	int dist, closest;
	nodemask_t skip;

	lockdep_assert_held_once(&memory_tier_lock);

	if (!node_demotion || !IS_ENABLED(CONFIG_MIGRATION))
		return;

	disable_all_demotion_targets();

	for_each_node_state(nid, N_MEMORY) {
		closest = -1;
		entry = &node_demotion[nid];

		tier = __node_get_memory_tier(nid);
		if (!tier || list_is_last(&tier->list, &memory_tiers))
			continue;

		/* Step down to the tier directly below this node's tier. */
		tier = list_next_entry(tier, list);
		skip = get_memtier_nodemask(tier);

		/*
		 * find_next_best_node() takes a mask of nodes to skip.
		 * Invert the tier's node list against N_MEMORY so that only
		 * nodes of the lower tier remain eligible.
		 */
		nodes_andnot(skip, node_states[N_MEMORY], skip);

		/*
		 * Keep adding lower-tier nodes while they share the best
		 * distance; demotion later picks randomly among them.
		 */
		for (;;) {
			candidate = find_next_best_node(nid, &skip);
			if (candidate == NUMA_NO_NODE)
				break;

			dist = node_distance(nid, candidate);
			if (closest != -1 && dist != closest)
				break;

			closest = dist;
			node_set(candidate, entry->preferred);
		}
	}
}
#else
/* No-op stand-ins used when CONFIG_MIGRATION is not set. */
static inline void disable_all_demotion_targets(void) {}
static inline void establish_demotion_targets(void) {}
#endif /* CONFIG_MIGRATION */
static inline void __init_node_memory_type(int node, struct memory_dev_type *memtype)
{
if (!node_memory_types[node].memtype)
@ -109,16 +323,6 @@ static struct memory_tier *set_node_memory_tier(int node)
return memtier;
}
/*
 * Look up the memory tier that @node currently belongs to.
 *
 * node_memory_types[] holds one struct node_memory_type_map per node, so
 * the device type must be read from its ->memtype member rather than from
 * the array element itself.  A tier is reported only while @node is still
 * set in that type's node mask.
 *
 * Return: the memory tier of @node, or NULL if the node has no memory type
 * or is not present in the type's node mask.
 */
static struct memory_tier *__node_get_memory_tier(int node)
{
	struct memory_dev_type *memtype = node_memory_types[node].memtype;

	if (!memtype || !node_isset(node, memtype->nodes))
		return NULL;

	return memtype->memtier;
}
static void destroy_memory_tier(struct memory_tier *memtier)
{
list_del(&memtier->list);
@ -207,6 +411,7 @@ EXPORT_SYMBOL_GPL(clear_node_memory_type);
static int __meminit memtier_hotplug_callback(struct notifier_block *self,
unsigned long action, void *_arg)
{
struct memory_tier *memtier;
struct memory_notify *arg = _arg;
/*
@ -219,12 +424,15 @@ static int __meminit memtier_hotplug_callback(struct notifier_block *self,
switch (action) {
case MEM_OFFLINE:
mutex_lock(&memory_tier_lock);
clear_node_memory_tier(arg->status_change_nid);
if (clear_node_memory_tier(arg->status_change_nid))
establish_demotion_targets();
mutex_unlock(&memory_tier_lock);
break;
case MEM_ONLINE:
mutex_lock(&memory_tier_lock);
set_node_memory_tier(arg->status_change_nid);
memtier = set_node_memory_tier(arg->status_change_nid);
if (!IS_ERR(memtier))
establish_demotion_targets();
mutex_unlock(&memory_tier_lock);
break;
}
@ -237,6 +445,11 @@ static int __init memory_tier_init(void)
int node;
struct memory_tier *memtier;
#ifdef CONFIG_MIGRATION
node_demotion = kcalloc(nr_node_ids, sizeof(struct demotion_nodes),
GFP_KERNEL);
WARN_ON(!node_demotion);
#endif
mutex_lock(&memory_tier_lock);
/*
* For now we can have 4 faster memory tiers with smaller adistance
@ -259,6 +472,7 @@ static int __init memory_tier_init(void)
*/
break;
}
establish_demotion_targets();
mutex_unlock(&memory_tier_lock);
hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRIO);

View File

@ -2198,398 +2198,4 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
return 0;
}
#endif /* CONFIG_NUMA_BALANCING */
/*
* node_demotion[] example:
*
* Consider a system with two sockets. Each socket has
* three classes of memory attached: fast, medium and slow.
* Each memory class is placed in its own NUMA node. The
* CPUs are placed in the node with the "fast" memory. The
* 6 NUMA nodes (0-5) might be split among the sockets like
* this:
*
* Socket A: 0, 1, 2
* Socket B: 3, 4, 5
*
* When Node 0 fills up, its memory should be migrated to
* Node 1. When Node 1 fills up, it should be migrated to
* Node 2. The migration path start on the nodes with the
* processors (since allocations default to this node) and
* fast memory, progress through medium and end with the
* slow memory:
*
* 0 -> 1 -> 2 -> stop
* 3 -> 4 -> 5 -> stop
*
* This is represented in the node_demotion[] like this:
*
* { nr=1, nodes[0]=1 }, // Node 0 migrates to 1
* { nr=1, nodes[0]=2 }, // Node 1 migrates to 2
* { nr=0, nodes[0]=-1 }, // Node 2 does not migrate
* { nr=1, nodes[0]=4 }, // Node 3 migrates to 4
* { nr=1, nodes[0]=5 }, // Node 4 migrates to 5
* { nr=0, nodes[0]=-1 }, // Node 5 does not migrate
*
* Moreover some systems may have multiple slow memory nodes.
* Suppose a system has one socket with 3 memory nodes, node 0
* is fast memory type, and node 1/2 both are slow memory
* type, and the distance between fast memory node and slow
* memory node is same. So the migration path should be:
*
* 0 -> 1/2 -> stop
*
* This is represented in the node_demotion[] like this:
* { nr=2, {nodes[0]=1, nodes[1]=2} }, // Node 0 migrates to node 1 and node 2
* { nr=0, nodes[0]=-1, }, // Node 1 does not migrate
* { nr=0, nodes[0]=-1, }, // Node 2 does not migrate
*/
/*
* Writes to this array occur without locking. Cycles are
* not allowed: Node X demotes to Y which demotes to X...
*
* If multiple reads are performed, a single rcu_read_lock()
* must be held over all reads to ensure that no cycles are
* observed.
*/
/*
* Upper bound on the number of demotion targets recorded per node:
* DEFAULT_DEMOTION_TARGET_NODES, or MAX_NUMNODES - 1 when MAX_NUMNODES is
* smaller than that default.
*/
#define DEFAULT_DEMOTION_TARGET_NODES 15
#if MAX_NUMNODES < DEFAULT_DEMOTION_TARGET_NODES
#define DEMOTION_TARGET_NODES (MAX_NUMNODES - 1)
#else
#define DEMOTION_TARGET_NODES DEFAULT_DEMOTION_TARGET_NODES
#endif
/* Demotion targets of one node: the first @nr entries of @nodes are in use. */
struct demotion_nodes {
unsigned short nr;
short nodes[DEMOTION_TARGET_NODES];
};
static struct demotion_nodes *node_demotion __read_mostly;
/**
 * next_demotion_node() - Pick the node pages from @node should be demoted to
 * @node: The node whose demotion target is wanted
 *
 * Return: id of one of the recorded demotion targets of @node, or
 * NUMA_NO_NODE when node_demotion[] has not been allocated or @node has no
 * target.  The result is only a snapshot: nothing keeps @node online, and
 * the target may change right after this function returns.
 */
int next_demotion_node(int node)
{
	struct demotion_nodes *nd;
	unsigned short count;
	int target = NUMA_NO_NODE;

	if (!node_demotion)
		return NUMA_NO_NODE;

	nd = &node_demotion[node];

	/*
	 * Writers update node_demotion[] without excluding this reader.
	 * RCU gives no compiler barrier, hence the READ_ONCE() accesses to
	 * stop the compiler from reordering or merging the loads.  Callers
	 * needing a consistent view across several lookups must hold RCU
	 * over the whole sequence themselves.
	 */
	rcu_read_lock();
	count = READ_ONCE(nd->nr);
	if (count == 1) {
		target = READ_ONCE(nd->nodes[0]);
	} else if (count > 1) {
		/*
		 * Several candidates: choose one at random.  Round-robin
		 * would need a "last target" field that bounces between CPU
		 * caches, or per-cpu state, which is more complex.
		 */
		unsigned short pick = get_random_int() % count;

		target = READ_ONCE(nd->nodes[pick]);
	}
	rcu_read_unlock();

	return target;
}
/*
 * Disable reclaim-based migration: reset every online node's entry to
 * "no targets".  Does nothing if node_demotion[] was never allocated.
 */
static void __disable_all_migrate_targets(void)
{
	int nid, slot;

	if (!node_demotion)
		return;

	for_each_online_node(nid) {
		struct demotion_nodes *nd = &node_demotion[nid];

		nd->nr = 0;
		for (slot = 0; slot < DEMOTION_TARGET_NODES; slot++)
			nd->nodes[slot] = NUMA_NO_NODE;
	}
}
/*
* Clear all demotion targets, then wait for an RCU grace period so that
* the cleared state is what readers see before any new targets are set.
*/
static void disable_all_migrate_targets(void)
{
__disable_all_migrate_targets();
/*
* Ensure that the "disable" is visible across the system.
* Readers will see either a combination of before+disable
* state or disable+after. They will never see before and
* after state together.
*
* The before+after state together might have cycles and
* could cause readers to do things like loop until this
* function finishes. This ensures they can only see a
* single "bad" read and would, for instance, only loop
* once.
*/
synchronize_rcu();
}
/*
 * Try to record one more automatic demotion target for @node.
 *
 * @used is the skip list handed to find_next_best_node().  @best_distance
 * is -1 while @node has no target yet; otherwise it is the distance of the
 * targets already recorded, and a farther candidate is rejected.
 *
 * Return: the node recorded as a new target, or NUMA_NO_NODE when there is
 * no further target.  That is not an error: it may just mean @node is at
 * the end of a chain.  A rejected candidate is cleared from @used again.
 */
static int establish_migrate_target(int node, nodemask_t *used,
				    int best_distance)
{
	struct demotion_nodes *nd;
	int candidate, index;
	bool reject;

	if (!node_demotion)
		return NUMA_NO_NODE;

	nd = &node_demotion[node];

	candidate = find_next_best_node(node, used);
	if (candidate == NUMA_NO_NODE)
		return NUMA_NO_NODE;

	/*
	 * Once a first target exists its distance is the best one; further
	 * candidates are accepted only if they are not farther away.
	 */
	reject = best_distance != -1 &&
		 node_distance(node, candidate) > best_distance;

	if (!reject) {
		index = nd->nr;
		reject = WARN_ONCE(index >= DEMOTION_TARGET_NODES,
				   "Exceeds maximum demotion target nodes\n");
	}

	if (reject) {
		node_clear(candidate, *used);
		return NUMA_NO_NODE;
	}

	nd->nodes[index] = candidate;
	nd->nr++;

	return candidate;
}
/*
 * Build the demotion ("migration") paths used when a node fills up and
 * pages are migrated elsewhere instead of being discarded at reclaim.
 *
 * Paths start at the nodes with CPUs and follow the same priorities used
 * to build the page allocator zonelists, with one difference: cycles are
 * not allowed.  If node0 demotes to node1, neither node1 nor anything
 * node1 demotes to may demote back to node0.  A node may have several
 * targets when they all share the same best distance from it.
 *
 * Readers of node_demotion[] may run concurrently with this function, but
 * it must not run concurrently with itself; memory hotplug events being
 * single-threaded provides that exclusion.
 */
static void __set_migration_target_nodes(void)
{
	nodemask_t sources;
	nodemask_t targets;
	nodemask_t used_targets = NODE_MASK_NONE;
	int node, best_distance;

	/*
	 * Start from a clean slate so topology changes cannot leave
	 * oddities such as cycles.  Demotion is momentarily disabled.
	 */
	disable_all_migrate_targets();

	/*
	 * Allocations go close to CPUs first, so the first pass uses the
	 * nodes with CPUs as demotion sources.
	 */
	targets = node_states[N_CPU];

	do {
		sources = targets;
		targets = NODE_MASK_NONE;

		/*
		 * Sources of this pass must never become targets later,
		 * otherwise the graph could contain a cycle.  Marking them
		 * once per pass still lets several sources share a target,
		 * but nodes in 'used_targets' are unavailable in later
		 * passes, which limits some sharing opportunities.
		 */
		nodes_or(used_targets, used_targets, sources);

		for_each_node_mask(node, sources) {
			best_distance = -1;

			/* Collect every target at the best distance. */
			for (;;) {
				int target_node =
					establish_migrate_target(node,
								 &used_targets,
								 best_distance);

				if (target_node == NUMA_NO_NODE)
					break;

				if (best_distance == -1)
					best_distance = node_distance(node,
								      target_node);

				/*
				 * Targets found now are the sources of the
				 * next pass; eventually every node has been
				 * part of a pass and is in 'used_targets'.
				 */
				node_set(target_node, targets);
			}
		}

		/* Repeat until a pass produces no new targets. */
	} while (!nodes_empty(targets));
}
/*
* For callers that do not hold get_online_mems() already.
*
* Runs __set_migration_target_nodes() between get_online_mems() and
* put_online_mems().
*/
void set_migration_target_nodes(void)
{
get_online_mems();
__set_migration_target_nodes();
put_online_mems();
}
/*
 * Memory hotplug notifier that keeps the demotion targets current.
 *
 * Demotion stays transiently disabled between the MEM_GOING_OFFLINE and
 * MEM_OFFLINE events.  The callback runs whether or not reclaim-based
 * migration is enabled, so the user can enable it at any time without the
 * targets having to be recalculated.
 *
 * These callbacks already hold get_online_mems(), which is why
 * __set_migration_target_nodes() is called directly rather than
 * set_migration_target_nodes().
 */
#ifdef CONFIG_MEMORY_HOTPLUG
static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
						 unsigned long action, void *_arg)
{
	struct memory_notify *arg = _arg;

	/*
	 * Only react when a node changes status (e.g. online->offline);
	 * this avoids the synchronize_rcu() overhead in most cases.
	 */
	if (arg->status_change_nid >= 0) {
		switch (action) {
		case MEM_GOING_OFFLINE:
			/*
			 * Never let an offline node be a target, even
			 * transiently: keep demotion disabled until the
			 * offline completes or is cancelled.
			 */
			disable_all_migrate_targets();
			break;
		case MEM_OFFLINE:
		case MEM_ONLINE:
		case MEM_CANCEL_OFFLINE:
			/*
			 * The node reached its final state, or a cancelled
			 * offline left the targets disabled: rebuild them.
			 */
			__set_migration_target_nodes();
			break;
		default:
			/* MEM_GOING_ONLINE, MEM_CANCEL_ONLINE: nothing to do. */
			break;
		}
	}

	return notifier_from_errno(0);
}
#endif
/*
* Boot-time setup of reclaim-based demotion: allocate node_demotion[] with
* one entry per possible node id, register the memory hotplug callback when
* CONFIG_MEMORY_HOTPLUG is set, and build the initial demotion order.
*
* An allocation failure only triggers a WARN_ON here; node_demotion then
* stays NULL, which the lookup and update paths check for.
*/
void __init migrate_on_reclaim_init(void)
{
node_demotion = kcalloc(nr_node_ids,
sizeof(struct demotion_nodes),
GFP_KERNEL);
WARN_ON(!node_demotion);
#ifdef CONFIG_MEMORY_HOTPLUG
hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
#endif
/*
* At this point, all NUMA nodes with memory/CPUs have their state
* properly set, so we can build the demotion order now.
* Hold the cpu_hotplug lock as well, since CPU hotplug events
* could possibly happen during boot.
*/
cpus_read_lock();
set_migration_target_nodes();
cpus_read_unlock();
}
#endif /* CONFIG_NUMA */

View File

@ -28,7 +28,6 @@
#include <linux/mm_inline.h>
#include <linux/page_ext.h>
#include <linux/page_owner.h>
#include <linux/migrate.h>
#include "internal.h"
@ -2068,7 +2067,6 @@ static int vmstat_cpu_online(unsigned int cpu)
if (!node_state(cpu_to_node(cpu), N_CPU)) {
node_set_state(cpu_to_node(cpu), N_CPU);
set_migration_target_nodes();
}
return 0;
@ -2093,7 +2091,6 @@ static int vmstat_cpu_dead(unsigned int cpu)
return 0;
node_clear_state(node, N_CPU);
set_migration_target_nodes();
return 0;
}
@ -2126,7 +2123,6 @@ void __init init_mm_internals(void)
start_shepherd_timer();
#endif
migrate_on_reclaim_init();
#ifdef CONFIG_PROC_FS
proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);