linux-stable/mm/memcontrol-v1.c

// SPDX-License-Identifier: GPL-2.0-or-later

#include <linux/memcontrol.h>
#include <linux/swap.h>
#include <linux/mm_inline.h>

#include "memcontrol-v1.h"

/*
 * Cgroups above their limits are maintained in a RB-Tree, independent of
 * their hierarchy representation
 */

/*
 * Soft-limit tree for a single node: holds the mem_cgroup_per_node entries
 * that are over their soft limit, ordered by usage_in_excess.
 */
struct mem_cgroup_tree_per_node {
	/* Root of the tree ordered by usage_in_excess. */
	struct rb_root rb_root;
	/* Cached rightmost node (largest excess), NULL if there is none. */
	struct rb_node *rb_rightmost;
	/* Taken around every insertion into and removal from the tree. */
	spinlock_t lock;
};

/* Container for the per-node soft-limit trees, indexed by node id. */
struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

/*
 * The global set of soft-limit trees. Slots are filled in by memcg1_init();
 * the users in this file check a slot for NULL before dereferencing it.
 */
static struct mem_cgroup_tree soft_limit_tree __read_mostly;

/*
 * Loop bounds for soft limit reclaim, to prevent infinite loops if they
 * ever occur:
 *  - MEM_CGROUP_MAX_RECLAIM_LOOPS bounds the number of hierarchy walks in
 *    mem_cgroup_soft_reclaim().
 *  - MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS bounds the number of
 *    iterations mem_cgroup_soft_limit_reclaim() makes without reclaiming
 *    anything.
 */
#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2

/*
 * Link @mz into @mctz at the position given by @new_usage_in_excess.
 *
 * Nothing happens if @mz is already on the tree. Otherwise the new excess
 * is recorded in @mz and, when it is non-zero, @mz is inserted so that the
 * tree stays ordered by usage_in_excess (equal keys go to the right) and the
 * cached rightmost node stays correct. The callers in this file hold
 * mctz->lock.
 */
static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz,
					 unsigned long new_usage_in_excess)
{
	struct rb_node **link = &mctz->rb_root.rb_node;
	struct rb_node *above = NULL;
	bool is_largest = true;

	if (mz->on_tree)
		return;

	mz->usage_in_excess = new_usage_in_excess;
	if (!mz->usage_in_excess)
		return;

	/* Walk down to the empty slot where the new node belongs. */
	while (*link) {
		struct mem_cgroup_per_node *cur;

		above = *link;
		cur = rb_entry(above, struct mem_cgroup_per_node, tree_node);
		if (mz->usage_in_excess >= cur->usage_in_excess) {
			link = &above->rb_right;
			continue;
		}
		/* Going left means an entry with a larger excess exists. */
		link = &above->rb_left;
		is_largest = false;
	}

	if (is_largest)
		mctz->rb_rightmost = &mz->tree_node;

	rb_link_node(&mz->tree_node, above, link);
	rb_insert_color(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = true;
}

/*
 * Unlink @mz from @mctz if it is currently on the tree.
 *
 * When @mz is the cached rightmost node, the cache is moved to its
 * predecessor before the node is erased. The callers in this file hold
 * mctz->lock.
 */
static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz)
{
	struct rb_node *node = &mz->tree_node;

	if (!mz->on_tree)
		return;

	if (mctz->rb_rightmost == node)
		mctz->rb_rightmost = rb_prev(node);

	rb_erase(node, &mctz->rb_root);
	mz->on_tree = false;
}

/*
 * Locked variant of __mem_cgroup_remove_exceeded(): takes mctz->lock with
 * interrupts saved and disabled around the removal of @mz from @mctz.
 */
static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
				       struct mem_cgroup_tree_per_node *mctz)
{
	unsigned long irq_state;

	spin_lock_irqsave(&mctz->lock, irq_state);
	__mem_cgroup_remove_exceeded(mz, mctz);
	spin_unlock_irqrestore(&mctz->lock, irq_state);
}

/*
 * Return by how much the memory counter of @memcg exceeds its soft limit,
 * or 0 if it does not exceed it.
 */
static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
{
	unsigned long usage = page_counter_read(&memcg->memory);
	unsigned long limit = READ_ONCE(memcg->soft_limit);

	return usage > limit ? usage - limit : 0;
}

/*
 * Refresh the soft-limit tree position of @memcg and of each of its
 * ancestors on node @nid.
 *
 * When lru_gen_enabled() the tree is not touched; instead
 * lru_gen_soft_reclaim() is called if @memcg itself exceeds its soft limit.
 */
void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid)
{
	struct mem_cgroup_tree_per_node *tree;
	struct mem_cgroup *iter;

	if (lru_gen_enabled()) {
		if (soft_limit_excess(memcg))
			lru_gen_soft_reclaim(memcg, nid);
		return;
	}

	tree = soft_limit_tree.rb_tree_per_node[nid];
	if (!tree)
		return;

	/*
	 * All ancestors have to be updated as well when hierarchy is used,
	 * because their event counter is not touched.
	 */
	for (iter = memcg; iter; iter = parent_mem_cgroup(iter)) {
		struct mem_cgroup_per_node *pn = iter->nodeinfo[nid];
		unsigned long over = soft_limit_excess(iter);
		unsigned long flags;

		/* Not over its soft limit and not on the tree: skip. */
		if (!over && !pn->on_tree)
			continue;

		spin_lock_irqsave(&tree->lock, flags);
		/* Take it off the tree first if it is there ... */
		if (pn->on_tree)
			__mem_cgroup_remove_exceeded(pn, tree);
		/*
		 * ... then insert it again, which updates usage_in_excess.
		 * A zero excess results in no tree operation.
		 */
		__mem_cgroup_insert_exceeded(pn, tree, over);
		spin_unlock_irqrestore(&tree->lock, flags);
	}
}

/*
 * Take the per-node entries of @memcg off the soft-limit tree of every
 * node that has a tree.
 */
void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
	int nid;

	for_each_node(nid) {
		struct mem_cgroup_tree_per_node *tree;

		tree = soft_limit_tree.rb_tree_per_node[nid];
		if (!tree)
			continue;

		mem_cgroup_remove_exceeded(memcg->nodeinfo[nid], tree);
	}
}

/*
 * Detach and return the entry with the largest excess from @mctz.
 *
 * Entries whose memcg is no longer over its soft limit, or whose css
 * reference cannot be taken, are dropped from the tree and the next
 * rightmost entry is tried. On success the returned entry is off the tree
 * and a css reference on its memcg is held for the caller. Returns NULL
 * when the tree has nothing left to reclaim from. The callers in this file
 * hold mctz->lock.
 */
static struct mem_cgroup_per_node *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	for (;;) {
		struct mem_cgroup_per_node *candidate;

		if (!mctz->rb_rightmost)
			return NULL;	/* Nothing to reclaim from */

		candidate = rb_entry(mctz->rb_rightmost,
				     struct mem_cgroup_per_node, tree_node);
		/*
		 * Remove the node now; someone else may add it back in the
		 * meantime, and we will add it back at the end of reclaim
		 * at its correct position in the tree.
		 */
		__mem_cgroup_remove_exceeded(candidate, mctz);
		if (soft_limit_excess(candidate->memcg) &&
		    css_tryget(&candidate->memcg->css))
			return candidate;
	}
}

/*
 * Locked variant of __mem_cgroup_largest_soft_limit_node(): runs the lookup
 * under mctz->lock with interrupts disabled and returns its result.
 */
static struct mem_cgroup_per_node *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *largest;

	spin_lock_irq(&mctz->lock);
	largest = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock_irq(&mctz->lock);

	return largest;
}

/*
 * Reclaim from the hierarchy below @root_memcg on node @pgdat.
 *
 * Walks the hierarchy with mem_cgroup_iter() and calls
 * mem_cgroup_shrink_node() on every memcg visited, adding the scan counts
 * it reports to *@total_scanned. The walk ends when @root_memcg is no
 * longer over its soft limit, or, once at least two full walks have
 * completed, when nothing was reclaimed, when a quarter of the initial
 * excess was reclaimed, or when MEM_CGROUP_MAX_RECLAIM_LOOPS is exceeded.
 *
 * Returns the accumulated mem_cgroup_shrink_node() results.
 */
static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
				   pg_data_t *pgdat,
				   gfp_t gfp_mask,
				   unsigned long *total_scanned)
{
	struct mem_cgroup_reclaim_cookie reclaim = {
		.pgdat = pgdat,
	};
	struct mem_cgroup *victim = NULL;
	unsigned long nr_scanned;
	unsigned long excess;
	int total = 0;
	int loop = 0;

	excess = soft_limit_excess(root_memcg);

	for (;;) {
		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
		if (victim) {
			total += mem_cgroup_shrink_node(victim, gfp_mask, false,
						pgdat, &nr_scanned);
			*total_scanned += nr_scanned;
			if (!soft_limit_excess(root_memcg))
				break;
			continue;
		}

		/* A NULL victim marks the end of one full hierarchy walk. */
		loop++;
		if (loop < 2)
			continue;

		/*
		 * Stop if nothing could be reclaimed (there may be no
		 * reclaimable pages under this hierarchy), or if enough
		 * targeted reclaim was done: excess >> 2 is neither so much
		 * that we reclaim excessively nor so little that we keep
		 * coming back to reclaim from this cgroup.
		 */
		if (!total || total >= (excess >> 2) ||
		    loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)
			break;
	}
	mem_cgroup_iter_break(root_memcg, victim);
	return total;
}

/*
 * Soft limit reclaim for node @pgdat.
 *
 * Repeatedly takes the entry with the largest soft limit excess off the
 * node's tree, reclaims from it through mem_cgroup_soft_reclaim() and puts
 * it back on the tree according to its new excess. The loop ends as soon as
 * something was reclaimed, when no candidate is left, or after more than
 * MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS iterations that reclaimed nothing.
 *
 * @total_scanned is handed through to mem_cgroup_soft_reclaim(), which adds
 * to it.
 *
 * Returns the sum of the mem_cgroup_soft_reclaim() results. Returns 0
 * without doing anything when lru_gen_enabled(), when @order is greater
 * than zero, or when the node has no tree or an empty one.
 */
unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
					    gfp_t gfp_mask,
					    unsigned long *total_scanned)
{
	unsigned long nr_reclaimed = 0;
	struct mem_cgroup_per_node *mz, *next_mz = NULL;
	unsigned long reclaimed;
	int loop = 0;
	struct mem_cgroup_tree_per_node *mctz;
	unsigned long excess;

	if (lru_gen_enabled())
		return 0;

	if (order > 0)
		return 0;

	mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id];

	/*
	 * Do not even bother to check the largest node if the root
	 * is empty. Do it lockless to prevent lock bouncing. Races
	 * are acceptable as soft limit is best effort anyway.
	 */
	if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
		return 0;

	/*
	 * This loop can run a while, especially if mem_cgroups continuously
	 * keep exceeding their soft limit and putting the system under
	 * pressure.
	 */
	do {
		/*
		 * Use the candidate picked at the end of the previous
		 * iteration if there is one, otherwise look one up. Either
		 * way the entry is off the tree and comes with a css
		 * reference taken by the lookup.
		 */
		if (next_mz)
			mz = next_mz;
		else
			mz = mem_cgroup_largest_soft_limit_node(mctz);
		if (!mz)
			break;

		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
						    gfp_mask, total_scanned);
		nr_reclaimed += reclaimed;
		spin_lock_irq(&mctz->lock);

		/*
		 * If we failed to reclaim anything from this memory cgroup
		 * it is time to move on to the next cgroup. The next
		 * candidate is picked before mz is put back on the tree.
		 */
		next_mz = NULL;
		if (!reclaimed)
			next_mz = __mem_cgroup_largest_soft_limit_node(mctz);

		excess = soft_limit_excess(mz->memcg);
		/*
		 * One school of thought says that we should not add
		 * back the node to the tree if reclaim returns 0.
		 * But our reclaim could return 0, simply because due
		 * to priority we are exposing a smaller subset of
		 * memory to reclaim from. Consider this as a longer
		 * term TODO.
		 */
		/* If excess == 0, no tree ops */
		__mem_cgroup_insert_exceeded(mz, mctz, excess);
		spin_unlock_irq(&mctz->lock);
		/* Drop the css reference that came with mz. */
		css_put(&mz->memcg->css);
		loop++;
		/*
		 * Could not reclaim anything and there are no more
		 * mem cgroups to try or we seem to be looping without
		 * reclaiming anything.
		 */
		if (!nr_reclaimed &&
			(next_mz == NULL ||
			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
			break;
	} while (!nr_reclaimed);
	/* A candidate that was picked but never used still holds a css ref. */
	if (next_mz)
		css_put(&next_mz->memcg->css);
	return nr_reclaimed;
}

/*
 * Allocate and initialize the per-node soft limit trees.
 *
 * If the allocation for a node fails, that node's slot in soft_limit_tree
 * stays NULL. The users of the tree in this file check for a NULL slot and
 * skip it, so soft limit handling is simply unavailable on that node rather
 * than the failed allocation being dereferenced here.
 *
 * Always returns 0.
 */
static int __init memcg1_init(void)
{
	int node;

	for_each_node(node) {
		struct mem_cgroup_tree_per_node *rtpn;

		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, node);
		/* Soft limit is best effort: leave the slot NULL on failure. */
		if (!rtpn)
			continue;

		rtpn->rb_root = RB_ROOT;
		rtpn->rb_rightmost = NULL;
		spin_lock_init(&rtpn->lock);
		soft_limit_tree.rb_tree_per_node[node] = rtpn;
	}

	return 0;
}
subsys_initcall(memcg1_init);
mm: memcg: introduce memcontrol-v1.c Patch series "mm: memcg: separate legacy cgroup v1 code and put under config option", v2. Cgroups v2 have been around for a while and many users have fully adopted them, so they never use cgroups v1 features and functionality. Yet they have to "pay" for the cgroup v1 support anyway: 1) the kernel binary contains unused cgroup v1 code, 2) some code paths have additional checks which are not needed, 3) some common structures like task_struct and mem_cgroup contain unused cgroup v1-specific members. Cgroup v1's memory controller has a number of features that are not supported by cgroup v2 and their implementation is pretty much self contained. Most notably, these features are: soft limit reclaim, oom handling in userspace, complicated event notification system, charge migration. Cgroup v1-specific code in memcontrol.c is close to 4k lines in size and it's intertwined with generic and cgroup v2-specific code. It's a burden on developers and maintainers. This patchset aims to solve these problems by: 1) moving cgroup v1-specific memcg code to the new mm/memcontrol-v1.c file, 2) putting definitions shared by memcontrol.c and memcontrol-v1.c into the mm/memcontrol-v1.h header, 3) introducing the CONFIG_MEMCG_V1 config option, turned off by default, 4) making memcontrol-v1.c compile only if CONFIG_MEMCG_V1 is set. If CONFIG_MEMCG_V1 is not set, cgroup v1 memory controller is still available for mounting, however no memory-specific control knobs are present. This patch (of 14): This patch introduces the mm/memcontrol-v1.c source file which will be used for all legacy (cgroup v1) memory cgroup code. It also introduces mm/memcontrol-v1.h to keep declarations shared between mm/memcontrol.c and mm/memcontrol-v1.c. As of now, let's compile it if CONFIG_MEMCG is set, similar to mm/memcontrol.c. Later on it can be switched to use a separate config option, so that the legacy code won't be compiled if not required. 
Link: https://lkml.kernel.org/r/20240625005906.106920-1-roman.gushchin@linux.dev Link: https://lkml.kernel.org/r/20240625005906.106920-2-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev> Acked-by: Michal Hocko <mhocko@suse.com> Acked-by: Shakeel Butt <shakeel.butt@linux.dev> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Muchun Song <muchun.song@linux.dev> Cc: Roman Gushchin <roman.gushchin@linux.dev> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> 2024-06-25 00:58:53 +00:00			`// SPDX-License-Identifier: GPL-2.0-or-later`

mm: memcg: move soft limit reclaim code to memcontrol-v1.c Soft limits are cgroup v1-specific and are not supported by cgroup v2, so let's move the corresponding code into memcontrol-v1.c. Aside from simply moving the code, this commit introduces a trivial memcg1_soft_limit_reset() function to reset soft limits and also moves the global soft limit tree initialization code into a new memcg1_init() function. It also moves corresponding declarations shared between memcontrol.c and memcontrol-v1.c into mm/memcontrol-v1.h. Link: https://lkml.kernel.org/r/20240625005906.106920-3-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev> Acked-by: Michal Hocko <mhocko@suse.com> Acked-by: Shakeel Butt <shakeel.butt@linux.dev> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Muchun Song <muchun.song@linux.dev> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> 2024-06-25 00:58:54 +00:00			`#include <linux/memcontrol.h>`
			`#include <linux/swap.h>`
			`#include <linux/mm_inline.h>`

mm: memcg: introduce memcontrol-v1.c Patch series "mm: memcg: separate legacy cgroup v1 code and put under config option", v2. Cgroups v2 have been around for a while and many users have fully adopted them, so they never use cgroups v1 features and functionality. Yet they have to "pay" for the cgroup v1 support anyway: 1) the kernel binary contains unused cgroup v1 code, 2) some code paths have additional checks which are not needed, 3) some common structures like task_struct and mem_cgroup contain unused cgroup v1-specific members. Cgroup v1's memory controller has a number of features that are not supported by cgroup v2 and their implementation is pretty much self contained. Most notably, these features are: soft limit reclaim, oom handling in userspace, complicated event notification system, charge migration. Cgroup v1-specific code in memcontrol.c is close to 4k lines in size and it's intertwined with generic and cgroup v2-specific code. It's a burden on developers and maintainers. This patchset aims to solve these problems by: 1) moving cgroup v1-specific memcg code to the new mm/memcontrol-v1.c file, 2) putting definitions shared by memcontrol.c and memcontrol-v1.c into the mm/memcontrol-v1.h header, 3) introducing the CONFIG_MEMCG_V1 config option, turned off by default, 4) making memcontrol-v1.c compile only if CONFIG_MEMCG_V1 is set. If CONFIG_MEMCG_V1 is not set, cgroup v1 memory controller is still available for mounting, however no memory-specific control knobs are present. This patch (of 14): This patch introduces the mm/memcontrol-v1.c source file which will be used for all legacy (cgroup v1) memory cgroup code. It also introduces mm/memcontrol-v1.h to keep declarations shared between mm/memcontrol.c and mm/memcontrol-v1.c. As of now, let's compile it if CONFIG_MEMCG is set, similar to mm/memcontrol.c. Later on it can be switched to use a separate config option, so that the legacy code won't be compiled if not required. 
Link: https://lkml.kernel.org/r/20240625005906.106920-1-roman.gushchin@linux.dev Link: https://lkml.kernel.org/r/20240625005906.106920-2-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev> Acked-by: Michal Hocko <mhocko@suse.com> Acked-by: Shakeel Butt <shakeel.butt@linux.dev> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Muchun Song <muchun.song@linux.dev> Cc: Roman Gushchin <roman.gushchin@linux.dev> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> 2024-06-25 00:58:53 +00:00			`#include "memcontrol-v1.h"`
mm: memcg: move soft limit reclaim code to memcontrol-v1.c Soft limits are cgroup v1-specific and are not supported by cgroup v2, so let's move the corresponding code into memcontrol-v1.c. Aside from simply moving the code, this commit introduces a trivial memcg1_soft_limit_reset() function to reset soft limits and also moves the global soft limit tree initialization code into a new memcg1_init() function. It also moves corresponding declarations shared between memcontrol.c and memcontrol-v1.c into mm/memcontrol-v1.h. Link: https://lkml.kernel.org/r/20240625005906.106920-3-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev> Acked-by: Michal Hocko <mhocko@suse.com> Acked-by: Shakeel Butt <shakeel.butt@linux.dev> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Muchun Song <muchun.song@linux.dev> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> 2024-06-25 00:58:54 +00:00
			`/*`
			`* Cgroups above their limits are maintained in a RB-Tree, independent of`
			`* their hierarchy representation`
			`*/`

			`struct mem_cgroup_tree_per_node {`
			`struct rb_root rb_root;`
			`struct rb_node *rb_rightmost;`
			`spinlock_t lock;`
			`};`

			`struct mem_cgroup_tree {`
			`struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];`
			`};`

			`static struct mem_cgroup_tree soft_limit_tree __read_mostly;`

			`/*`
			`* Maximum loops in mem_cgroup_soft_reclaim(), used for soft`
			`* limit reclaim to prevent infinite loops, if they ever occur.`
			`*/`
			`#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100`
			`#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2`

			`static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,`
			`struct mem_cgroup_tree_per_node *mctz,`
			`unsigned long new_usage_in_excess)`
			`{`
			`struct rb_node **p = &mctz->rb_root.rb_node;`
			`struct rb_node *parent = NULL;`
			`struct mem_cgroup_per_node *mz_node;`
			`bool rightmost = true;`

			`if (mz->on_tree)`
			`return;`

			`mz->usage_in_excess = new_usage_in_excess;`
			`if (!mz->usage_in_excess)`
			`return;`
			`while (*p) {`
			`parent = *p;`
			`mz_node = rb_entry(parent, struct mem_cgroup_per_node,`
			`tree_node);`
			`if (mz->usage_in_excess < mz_node->usage_in_excess) {`
			`p = &(*p)->rb_left;`
			`rightmost = false;`
			`} else {`
			`p = &(*p)->rb_right;`
			`}`
			`}`

			`if (rightmost)`
			`mctz->rb_rightmost = &mz->tree_node;`

			`rb_link_node(&mz->tree_node, parent, p);`
			`rb_insert_color(&mz->tree_node, &mctz->rb_root);`
			`mz->on_tree = true;`
			`}`

			`static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,`
			`struct mem_cgroup_tree_per_node *mctz)`
			`{`
			`if (!mz->on_tree)`
			`return;`

			`if (&mz->tree_node == mctz->rb_rightmost)`
			`mctz->rb_rightmost = rb_prev(&mz->tree_node);`

			`rb_erase(&mz->tree_node, &mctz->rb_root);`
			`mz->on_tree = false;`
			`}`

			`static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,`
			`struct mem_cgroup_tree_per_node *mctz)`
			`{`
			`unsigned long flags;`

			`spin_lock_irqsave(&mctz->lock, flags);`
			`__mem_cgroup_remove_exceeded(mz, mctz);`
			`spin_unlock_irqrestore(&mctz->lock, flags);`
			`}`

			`static unsigned long soft_limit_excess(struct mem_cgroup *memcg)`
			`{`
			`unsigned long nr_pages = page_counter_read(&memcg->memory);`
			`unsigned long soft_limit = READ_ONCE(memcg->soft_limit);`
			`unsigned long excess = 0;`

			`if (nr_pages > soft_limit)`
			`excess = nr_pages - soft_limit;`

			`return excess;`
			`}`

			`void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid)`
			`{`
			`unsigned long excess;`
			`struct mem_cgroup_per_node *mz;`
			`struct mem_cgroup_tree_per_node *mctz;`

			`if (lru_gen_enabled()) {`
			`if (soft_limit_excess(memcg))`
			`lru_gen_soft_reclaim(memcg, nid);`
			`return;`
			`}`

			`mctz = soft_limit_tree.rb_tree_per_node[nid];`
			`if (!mctz)`
			`return;`
			`/*`
			`* Necessary to update all ancestors when hierarchy is used.`
			`* because their event counter is not touched.`
			`*/`
			`for (; memcg; memcg = parent_mem_cgroup(memcg)) {`
			`mz = memcg->nodeinfo[nid];`
			`excess = soft_limit_excess(memcg);`
			`/*`
			`* We have to update the tree if mz is on RB-tree or`
			`* mem is over its softlimit.`
			`*/`
			`if (excess \|\| mz->on_tree) {`
			`unsigned long flags;`

			`spin_lock_irqsave(&mctz->lock, flags);`
			`/* if on-tree, remove it */`
			`if (mz->on_tree)`
			`__mem_cgroup_remove_exceeded(mz, mctz);`
			`/*`
			`* Insert again. mz->usage_in_excess will be updated.`
			`* If excess is 0, no tree ops.`
			`*/`
			`__mem_cgroup_insert_exceeded(mz, mctz, excess);`
			`spin_unlock_irqrestore(&mctz->lock, flags);`
			`}`
			`}`
			`}`

			`void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)`
			`{`
			`struct mem_cgroup_tree_per_node *mctz;`
			`struct mem_cgroup_per_node *mz;`
			`int nid;`

			`for_each_node(nid) {`
			`mz = memcg->nodeinfo[nid];`
			`mctz = soft_limit_tree.rb_tree_per_node[nid];`
			`if (mctz)`
			`mem_cgroup_remove_exceeded(mz, mctz);`
			`}`
			`}`

			`static struct mem_cgroup_per_node *`
			`__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)`
			`{`
			`struct mem_cgroup_per_node *mz;`

			`retry:`
			`mz = NULL;`
			`if (!mctz->rb_rightmost)`
			`goto done; /* Nothing to reclaim from */`

			`mz = rb_entry(mctz->rb_rightmost,`
			`struct mem_cgroup_per_node, tree_node);`
			`/*`
			`* Remove the node now but someone else can add it back,`
			`* we will to add it back at the end of reclaim to its correct`
			`* position in the tree.`
			`*/`
			`__mem_cgroup_remove_exceeded(mz, mctz);`
			`if (!soft_limit_excess(mz->memcg) \|\|`
			`!css_tryget(&mz->memcg->css))`
			`goto retry;`
			`done:`
			`return mz;`
			`}`

			`static struct mem_cgroup_per_node *`
			`mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)`
			`{`
			`struct mem_cgroup_per_node *mz;`

			`spin_lock_irq(&mctz->lock);`
			`mz = __mem_cgroup_largest_soft_limit_node(mctz);`
			`spin_unlock_irq(&mctz->lock);`
			`return mz;`
			`}`

			`static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,`
			`pg_data_t *pgdat,`
			`gfp_t gfp_mask,`
			`unsigned long *total_scanned)`
			`{`
			`struct mem_cgroup *victim = NULL;`
			`int total = 0;`
			`int loop = 0;`
			`unsigned long excess;`
			`unsigned long nr_scanned;`
			`struct mem_cgroup_reclaim_cookie reclaim = {`
			`.pgdat = pgdat,`
			`};`

			`excess = soft_limit_excess(root_memcg);`

			`while (1) {`
			`victim = mem_cgroup_iter(root_memcg, victim, &reclaim);`
			`if (!victim) {`
			`loop++;`
			`if (loop >= 2) {`
			`/*`
			`* If we have not been able to reclaim`
			`* anything, it might because there are`
			`* no reclaimable pages under this hierarchy`
			`*/`
			`if (!total)`
			`break;`
			`/*`
			`* We want to do more targeted reclaim.`
			`* excess >> 2 is not to excessive so as to`
			`* reclaim too much, nor too less that we keep`
			`* coming back to reclaim from this cgroup`
			`*/`
			`if (total >= (excess >> 2) \|\|`
			`(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))`
			`break;`
			`}`
			`continue;`
			`}`
			`total += mem_cgroup_shrink_node(victim, gfp_mask, false,`
			`pgdat, &nr_scanned);`
			`*total_scanned += nr_scanned;`
			`if (!soft_limit_excess(root_memcg))`
			`break;`
			`}`
			`mem_cgroup_iter_break(root_memcg, victim);`
			`return total;`
			`}`

			`unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,`
			`gfp_t gfp_mask,`
			`unsigned long *total_scanned)`
			`{`
			`unsigned long nr_reclaimed = 0;`
			`struct mem_cgroup_per_node *mz, *next_mz = NULL;`
			`unsigned long reclaimed;`
			`int loop = 0;`
			`struct mem_cgroup_tree_per_node *mctz;`
			`unsigned long excess;`

			`if (lru_gen_enabled())`
			`return 0;`

			`if (order > 0)`
			`return 0;`

			`mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id];`

			`/*`
			`* Do not even bother to check the largest node if the root`
			`* is empty. Do it lockless to prevent lock bouncing. Races`
			`* are acceptable as soft limit is best effort anyway.`
			`*/`
			`if (!mctz \|\| RB_EMPTY_ROOT(&mctz->rb_root))`
			`return 0;`

			`/*`
			`* This loop can run a while, specially if mem_cgroup's continuously`
			`* keep exceeding their soft limit and putting the system under`
			`* pressure`
			`*/`
			`do {`
			`if (next_mz)`
			`mz = next_mz;`
			`else`
			`mz = mem_cgroup_largest_soft_limit_node(mctz);`
			`if (!mz)`
			`break;`

			`reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,`
			`gfp_mask, total_scanned);`
			`nr_reclaimed += reclaimed;`
			`spin_lock_irq(&mctz->lock);`

			`/*`
			`* If we failed to reclaim anything from this memory cgroup`
			`* it is time to move on to the next cgroup`
			`*/`
			`next_mz = NULL;`
			`if (!reclaimed)`
			`next_mz = __mem_cgroup_largest_soft_limit_node(mctz);`

			`excess = soft_limit_excess(mz->memcg);`
			`/*`
			`* One school of thought says that we should not add`
			`* back the node to the tree if reclaim returns 0.`
			`* But our reclaim could return 0, simply because due`
			`* to priority we are exposing a smaller subset of`
			`* memory to reclaim from. Consider this as a longer`
			`* term TODO.`
			`*/`
			`/* If excess == 0, no tree ops */`
			`__mem_cgroup_insert_exceeded(mz, mctz, excess);`
			`spin_unlock_irq(&mctz->lock);`
			`css_put(&mz->memcg->css);`
			`loop++;`
			`/*`
			`* Could not reclaim anything and there are no more`
			`* mem cgroups to try or we seem to be looping without`
			`* reclaiming anything.`
			`*/`
			`if (!nr_reclaimed &&`
			`(next_mz == NULL \|\|`
			`loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))`
			`break;`
			`} while (!nr_reclaimed);`
			`if (next_mz)`
			`css_put(&next_mz->memcg->css);`
			`return nr_reclaimed;`
			`}`

			`static int __init memcg1_init(void)`
			`{`
			`int node;`

			`for_each_node(node) {`
			`struct mem_cgroup_tree_per_node *rtpn;`

			`rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, node);`

			`rtpn->rb_root = RB_ROOT;`
			`rtpn->rb_rightmost = NULL;`
			`spin_lock_init(&rtpn->lock);`
			`soft_limit_tree.rb_tree_per_node[node] = rtpn;`
			`}`

			`return 0;`
			`}`
			`subsys_initcall(memcg1_init);`