mm: memcg: introduce memcontrol-v1.c
Patch series "mm: memcg: separate legacy cgroup v1 code and put under
config option", v2.
Cgroups v2 have been around for a while and many users have fully adopted
them, so they never use cgroups v1 features and functionality. Yet they
have to "pay" for the cgroup v1 support anyway:
1) the kernel binary contains an unused cgroup v1 code,
2) some code paths have additional checks which are not needed,
3) some common structures like task_struct and mem_cgroup contain unused
cgroup v1-specific members.
Cgroup v1's memory controller has a number of features that are not
supported by cgroup v2 and their implementation is pretty much self
contained. Most notably, these features are: soft limit reclaim, oom
handling in userspace, complicated event notification system, charge
migration. Cgroup v1-specific code in memcontrol.c is close to 4k lines
in size and it's intervened with generic and cgroup v2-specific code.
It's a burden on developers and maintainers.
This patchset aims to solve these problems by:
1) moving cgroup v1-specific memcg code to the new mm/memcontrol-v1.c file,
2) putting definitions shared by memcontrol.c and memcontrol-v1.c into the
mm/memcontrol-v1.h header,
3) introducing the CONFIG_MEMCG_V1 config option, turned off by default,
4) making memcontrol-v1.c to compile only if CONFIG_MEMCG_V1 is set.
If CONFIG_MEMCG_V1 is not set, cgroup v1 memory controller is still available
for mounting, however no memory-specific control knobs are present.
This patch (of 14):
This patch introduces the mm/memcontrol-v1.c source file which will be
used for all legacy (cgroup v1) memory cgroup code. It also introduces
mm/memcontrol-v1.h to keep declarations shared between mm/memcontrol.c and
mm/memcontrol-v1.c.
As of now, let's compile it if CONFIG_MEMCG is set, similar to
mm/memcontrol.c. Later on it can be switched to use a separate config
option, so that the legacy code won't be compiled if not required.
Link: https://lkml.kernel.org/r/20240625005906.106920-1-roman.gushchin@linux.dev
Link: https://lkml.kernel.org/r/20240625005906.106920-2-roman.gushchin@linux.dev
Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-06-25 00:58:53 +00:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-or-later
|
|
|
|
|
2024-06-25 00:58:54 +00:00
|
|
|
#include <linux/memcontrol.h>
|
|
|
|
#include <linux/swap.h>
|
|
|
|
#include <linux/mm_inline.h>
|
|
|
|
|
mm: memcg: introduce memcontrol-v1.c
Patch series "mm: memcg: separate legacy cgroup v1 code and put under
config option", v2.
Cgroups v2 have been around for a while and many users have fully adopted
them, so they never use cgroups v1 features and functionality. Yet they
have to "pay" for the cgroup v1 support anyway:
1) the kernel binary contains an unused cgroup v1 code,
2) some code paths have additional checks which are not needed,
3) some common structures like task_struct and mem_cgroup contain unused
cgroup v1-specific members.
Cgroup v1's memory controller has a number of features that are not
supported by cgroup v2 and their implementation is pretty much self
contained. Most notably, these features are: soft limit reclaim, oom
handling in userspace, complicated event notification system, charge
migration. Cgroup v1-specific code in memcontrol.c is close to 4k lines
in size and it's intervened with generic and cgroup v2-specific code.
It's a burden on developers and maintainers.
This patchset aims to solve these problems by:
1) moving cgroup v1-specific memcg code to the new mm/memcontrol-v1.c file,
2) putting definitions shared by memcontrol.c and memcontrol-v1.c into the
mm/memcontrol-v1.h header,
3) introducing the CONFIG_MEMCG_V1 config option, turned off by default,
4) making memcontrol-v1.c to compile only if CONFIG_MEMCG_V1 is set.
If CONFIG_MEMCG_V1 is not set, cgroup v1 memory controller is still available
for mounting, however no memory-specific control knobs are present.
This patch (of 14):
This patch introduces the mm/memcontrol-v1.c source file which will be
used for all legacy (cgroup v1) memory cgroup code. It also introduces
mm/memcontrol-v1.h to keep declarations shared between mm/memcontrol.c and
mm/memcontrol-v1.c.
As of now, let's compile it if CONFIG_MEMCG is set, similar to
mm/memcontrol.c. Later on it can be switched to use a separate config
option, so that the legacy code won't be compiled if not required.
Link: https://lkml.kernel.org/r/20240625005906.106920-1-roman.gushchin@linux.dev
Link: https://lkml.kernel.org/r/20240625005906.106920-2-roman.gushchin@linux.dev
Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-06-25 00:58:53 +00:00
|
|
|
#include "memcontrol-v1.h"
|
2024-06-25 00:58:54 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Cgroups above their limits are maintained in a RB-Tree, independent of
|
|
|
|
* their hierarchy representation
|
|
|
|
*/
|
|
|
|
|
|
|
|
struct mem_cgroup_tree_per_node {
|
|
|
|
struct rb_root rb_root;
|
|
|
|
struct rb_node *rb_rightmost;
|
|
|
|
spinlock_t lock;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct mem_cgroup_tree {
|
|
|
|
struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
|
|
|
|
};
|
|
|
|
|
|
|
|
static struct mem_cgroup_tree soft_limit_tree __read_mostly;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Maximum loops in mem_cgroup_soft_reclaim(), used for soft
|
|
|
|
* limit reclaim to prevent infinite loops, if they ever occur.
|
|
|
|
*/
|
|
|
|
#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
|
|
|
|
#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
|
|
|
|
|
|
|
|
static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
|
|
|
|
struct mem_cgroup_tree_per_node *mctz,
|
|
|
|
unsigned long new_usage_in_excess)
|
|
|
|
{
|
|
|
|
struct rb_node **p = &mctz->rb_root.rb_node;
|
|
|
|
struct rb_node *parent = NULL;
|
|
|
|
struct mem_cgroup_per_node *mz_node;
|
|
|
|
bool rightmost = true;
|
|
|
|
|
|
|
|
if (mz->on_tree)
|
|
|
|
return;
|
|
|
|
|
|
|
|
mz->usage_in_excess = new_usage_in_excess;
|
|
|
|
if (!mz->usage_in_excess)
|
|
|
|
return;
|
|
|
|
while (*p) {
|
|
|
|
parent = *p;
|
|
|
|
mz_node = rb_entry(parent, struct mem_cgroup_per_node,
|
|
|
|
tree_node);
|
|
|
|
if (mz->usage_in_excess < mz_node->usage_in_excess) {
|
|
|
|
p = &(*p)->rb_left;
|
|
|
|
rightmost = false;
|
|
|
|
} else {
|
|
|
|
p = &(*p)->rb_right;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (rightmost)
|
|
|
|
mctz->rb_rightmost = &mz->tree_node;
|
|
|
|
|
|
|
|
rb_link_node(&mz->tree_node, parent, p);
|
|
|
|
rb_insert_color(&mz->tree_node, &mctz->rb_root);
|
|
|
|
mz->on_tree = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
|
|
|
|
struct mem_cgroup_tree_per_node *mctz)
|
|
|
|
{
|
|
|
|
if (!mz->on_tree)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (&mz->tree_node == mctz->rb_rightmost)
|
|
|
|
mctz->rb_rightmost = rb_prev(&mz->tree_node);
|
|
|
|
|
|
|
|
rb_erase(&mz->tree_node, &mctz->rb_root);
|
|
|
|
mz->on_tree = false;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
|
|
|
|
struct mem_cgroup_tree_per_node *mctz)
|
|
|
|
{
|
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
spin_lock_irqsave(&mctz->lock, flags);
|
|
|
|
__mem_cgroup_remove_exceeded(mz, mctz);
|
|
|
|
spin_unlock_irqrestore(&mctz->lock, flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
|
|
|
|
{
|
|
|
|
unsigned long nr_pages = page_counter_read(&memcg->memory);
|
|
|
|
unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
|
|
|
|
unsigned long excess = 0;
|
|
|
|
|
|
|
|
if (nr_pages > soft_limit)
|
|
|
|
excess = nr_pages - soft_limit;
|
|
|
|
|
|
|
|
return excess;
|
|
|
|
}
|
|
|
|
|
|
|
|
void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid)
|
|
|
|
{
|
|
|
|
unsigned long excess;
|
|
|
|
struct mem_cgroup_per_node *mz;
|
|
|
|
struct mem_cgroup_tree_per_node *mctz;
|
|
|
|
|
|
|
|
if (lru_gen_enabled()) {
|
|
|
|
if (soft_limit_excess(memcg))
|
|
|
|
lru_gen_soft_reclaim(memcg, nid);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
mctz = soft_limit_tree.rb_tree_per_node[nid];
|
|
|
|
if (!mctz)
|
|
|
|
return;
|
|
|
|
/*
|
|
|
|
* Necessary to update all ancestors when hierarchy is used.
|
|
|
|
* because their event counter is not touched.
|
|
|
|
*/
|
|
|
|
for (; memcg; memcg = parent_mem_cgroup(memcg)) {
|
|
|
|
mz = memcg->nodeinfo[nid];
|
|
|
|
excess = soft_limit_excess(memcg);
|
|
|
|
/*
|
|
|
|
* We have to update the tree if mz is on RB-tree or
|
|
|
|
* mem is over its softlimit.
|
|
|
|
*/
|
|
|
|
if (excess || mz->on_tree) {
|
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
spin_lock_irqsave(&mctz->lock, flags);
|
|
|
|
/* if on-tree, remove it */
|
|
|
|
if (mz->on_tree)
|
|
|
|
__mem_cgroup_remove_exceeded(mz, mctz);
|
|
|
|
/*
|
|
|
|
* Insert again. mz->usage_in_excess will be updated.
|
|
|
|
* If excess is 0, no tree ops.
|
|
|
|
*/
|
|
|
|
__mem_cgroup_insert_exceeded(mz, mctz, excess);
|
|
|
|
spin_unlock_irqrestore(&mctz->lock, flags);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
|
|
|
|
{
|
|
|
|
struct mem_cgroup_tree_per_node *mctz;
|
|
|
|
struct mem_cgroup_per_node *mz;
|
|
|
|
int nid;
|
|
|
|
|
|
|
|
for_each_node(nid) {
|
|
|
|
mz = memcg->nodeinfo[nid];
|
|
|
|
mctz = soft_limit_tree.rb_tree_per_node[nid];
|
|
|
|
if (mctz)
|
|
|
|
mem_cgroup_remove_exceeded(mz, mctz);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct mem_cgroup_per_node *
|
|
|
|
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
|
|
|
|
{
|
|
|
|
struct mem_cgroup_per_node *mz;
|
|
|
|
|
|
|
|
retry:
|
|
|
|
mz = NULL;
|
|
|
|
if (!mctz->rb_rightmost)
|
|
|
|
goto done; /* Nothing to reclaim from */
|
|
|
|
|
|
|
|
mz = rb_entry(mctz->rb_rightmost,
|
|
|
|
struct mem_cgroup_per_node, tree_node);
|
|
|
|
/*
|
|
|
|
* Remove the node now but someone else can add it back,
|
|
|
|
* we will to add it back at the end of reclaim to its correct
|
|
|
|
* position in the tree.
|
|
|
|
*/
|
|
|
|
__mem_cgroup_remove_exceeded(mz, mctz);
|
|
|
|
if (!soft_limit_excess(mz->memcg) ||
|
|
|
|
!css_tryget(&mz->memcg->css))
|
|
|
|
goto retry;
|
|
|
|
done:
|
|
|
|
return mz;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct mem_cgroup_per_node *
|
|
|
|
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
|
|
|
|
{
|
|
|
|
struct mem_cgroup_per_node *mz;
|
|
|
|
|
|
|
|
spin_lock_irq(&mctz->lock);
|
|
|
|
mz = __mem_cgroup_largest_soft_limit_node(mctz);
|
|
|
|
spin_unlock_irq(&mctz->lock);
|
|
|
|
return mz;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
|
|
|
|
pg_data_t *pgdat,
|
|
|
|
gfp_t gfp_mask,
|
|
|
|
unsigned long *total_scanned)
|
|
|
|
{
|
|
|
|
struct mem_cgroup *victim = NULL;
|
|
|
|
int total = 0;
|
|
|
|
int loop = 0;
|
|
|
|
unsigned long excess;
|
|
|
|
unsigned long nr_scanned;
|
|
|
|
struct mem_cgroup_reclaim_cookie reclaim = {
|
|
|
|
.pgdat = pgdat,
|
|
|
|
};
|
|
|
|
|
|
|
|
excess = soft_limit_excess(root_memcg);
|
|
|
|
|
|
|
|
while (1) {
|
|
|
|
victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
|
|
|
|
if (!victim) {
|
|
|
|
loop++;
|
|
|
|
if (loop >= 2) {
|
|
|
|
/*
|
|
|
|
* If we have not been able to reclaim
|
|
|
|
* anything, it might because there are
|
|
|
|
* no reclaimable pages under this hierarchy
|
|
|
|
*/
|
|
|
|
if (!total)
|
|
|
|
break;
|
|
|
|
/*
|
|
|
|
* We want to do more targeted reclaim.
|
|
|
|
* excess >> 2 is not to excessive so as to
|
|
|
|
* reclaim too much, nor too less that we keep
|
|
|
|
* coming back to reclaim from this cgroup
|
|
|
|
*/
|
|
|
|
if (total >= (excess >> 2) ||
|
|
|
|
(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
total += mem_cgroup_shrink_node(victim, gfp_mask, false,
|
|
|
|
pgdat, &nr_scanned);
|
|
|
|
*total_scanned += nr_scanned;
|
|
|
|
if (!soft_limit_excess(root_memcg))
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
mem_cgroup_iter_break(root_memcg, victim);
|
|
|
|
return total;
|
|
|
|
}
|
|
|
|
|
|
|
|
unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
|
|
|
|
gfp_t gfp_mask,
|
|
|
|
unsigned long *total_scanned)
|
|
|
|
{
|
|
|
|
unsigned long nr_reclaimed = 0;
|
|
|
|
struct mem_cgroup_per_node *mz, *next_mz = NULL;
|
|
|
|
unsigned long reclaimed;
|
|
|
|
int loop = 0;
|
|
|
|
struct mem_cgroup_tree_per_node *mctz;
|
|
|
|
unsigned long excess;
|
|
|
|
|
|
|
|
if (lru_gen_enabled())
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (order > 0)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id];
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Do not even bother to check the largest node if the root
|
|
|
|
* is empty. Do it lockless to prevent lock bouncing. Races
|
|
|
|
* are acceptable as soft limit is best effort anyway.
|
|
|
|
*/
|
|
|
|
if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This loop can run a while, specially if mem_cgroup's continuously
|
|
|
|
* keep exceeding their soft limit and putting the system under
|
|
|
|
* pressure
|
|
|
|
*/
|
|
|
|
do {
|
|
|
|
if (next_mz)
|
|
|
|
mz = next_mz;
|
|
|
|
else
|
|
|
|
mz = mem_cgroup_largest_soft_limit_node(mctz);
|
|
|
|
if (!mz)
|
|
|
|
break;
|
|
|
|
|
|
|
|
reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
|
|
|
|
gfp_mask, total_scanned);
|
|
|
|
nr_reclaimed += reclaimed;
|
|
|
|
spin_lock_irq(&mctz->lock);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we failed to reclaim anything from this memory cgroup
|
|
|
|
* it is time to move on to the next cgroup
|
|
|
|
*/
|
|
|
|
next_mz = NULL;
|
|
|
|
if (!reclaimed)
|
|
|
|
next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
|
|
|
|
|
|
|
|
excess = soft_limit_excess(mz->memcg);
|
|
|
|
/*
|
|
|
|
* One school of thought says that we should not add
|
|
|
|
* back the node to the tree if reclaim returns 0.
|
|
|
|
* But our reclaim could return 0, simply because due
|
|
|
|
* to priority we are exposing a smaller subset of
|
|
|
|
* memory to reclaim from. Consider this as a longer
|
|
|
|
* term TODO.
|
|
|
|
*/
|
|
|
|
/* If excess == 0, no tree ops */
|
|
|
|
__mem_cgroup_insert_exceeded(mz, mctz, excess);
|
|
|
|
spin_unlock_irq(&mctz->lock);
|
|
|
|
css_put(&mz->memcg->css);
|
|
|
|
loop++;
|
|
|
|
/*
|
|
|
|
* Could not reclaim anything and there are no more
|
|
|
|
* mem cgroups to try or we seem to be looping without
|
|
|
|
* reclaiming anything.
|
|
|
|
*/
|
|
|
|
if (!nr_reclaimed &&
|
|
|
|
(next_mz == NULL ||
|
|
|
|
loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
|
|
|
|
break;
|
|
|
|
} while (!nr_reclaimed);
|
|
|
|
if (next_mz)
|
|
|
|
css_put(&next_mz->memcg->css);
|
|
|
|
return nr_reclaimed;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int __init memcg1_init(void)
|
|
|
|
{
|
|
|
|
int node;
|
|
|
|
|
|
|
|
for_each_node(node) {
|
|
|
|
struct mem_cgroup_tree_per_node *rtpn;
|
|
|
|
|
|
|
|
rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, node);
|
|
|
|
|
|
|
|
rtpn->rb_root = RB_ROOT;
|
|
|
|
rtpn->rb_rightmost = NULL;
|
|
|
|
spin_lock_init(&rtpn->lock);
|
|
|
|
soft_limit_tree.rb_tree_per_node[node] = rtpn;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
subsys_initcall(memcg1_init);
|