mm: memcontrol: hook up vmpressure to socket pressure

Let the networking stack know when a memcg is under reclaim pressure so
that it can clamp its transmit windows accordingly.

Whenever the reclaim efficiency of a cgroup's LRU lists drops low enough
for a MEDIUM or HIGH vmpressure event to occur, assert a pressure state
in the socket and tcp memory code that tells it to curb consumption
growth from sockets associated with said control group.

Traditionally, vmpressure reports for the entire subtree of a memcg
under pressure, which drops useful information on the individual groups
reclaimed.  However, it's too late to change the user interface, so add a
second reporting mode that reports on the level of reclaim instead of at
the level of pressure, and use that report for sockets.

vmpressure events are naturally edge triggered, so for hysteresis assert
socket pressure for a second to allow for subsequent vmpressure events
to occur before letting the socket code return to normal.

This will likely need fine-tuning for a wider variety of workloads, but
for now stick to the vmpressure presets and keep hysteresis simple.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: David S. Miller <davem@davemloft.net>
Reviewed-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
Johannes Weiner 2016-01-14 15:21:32 -08:00 committed by Linus Torvalds
parent f7e1cb6ec5
commit 8e8ae64524
5 changed files with 104 additions and 40 deletions

View File

@ -249,6 +249,10 @@ struct mem_cgroup {
struct wb_domain cgwb_domain; struct wb_domain cgwb_domain;
#endif #endif
#ifdef CONFIG_INET
unsigned long socket_pressure;
#endif
/* List of events which userspace want to receive */ /* List of events which userspace want to receive */
struct list_head event_list; struct list_head event_list;
spinlock_t event_list_lock; spinlock_t event_list_lock;
@ -290,18 +294,34 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg); bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
static inline static inline
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
return css ? container_of(css, struct mem_cgroup, css) : NULL; return css ? container_of(css, struct mem_cgroup, css) : NULL;
} }
/*
 * Given a pointer to the @member field embedded in a struct mem_cgroup,
 * return a pointer to the struct mem_cgroup that contains it.
 */
#define mem_cgroup_from_counter(counter, member) \
container_of(counter, struct mem_cgroup, member)
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
struct mem_cgroup *, struct mem_cgroup *,
struct mem_cgroup_reclaim_cookie *); struct mem_cgroup_reclaim_cookie *);
void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
/**
 * parent_mem_cgroup - look up the memcg one level up in the accounting tree
 * @memcg: memcg to start from
 *
 * Follows the parent link of @memcg's memory counter.  Yields NULL when
 * that link is unset (the root group, or the memory controller running in
 * legacy no-hierarchy mode); otherwise yields the memcg that embeds the
 * parent counter.
 */
static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
{
	return memcg->memory.parent ?
		mem_cgroup_from_counter(memcg->memory.parent, memory) : NULL;
}
static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg,
struct mem_cgroup *root) struct mem_cgroup *root)
{ {
@ -689,10 +709,14 @@ extern struct static_key memcg_sockets_enabled_key;
static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
{ {
#ifdef CONFIG_MEMCG_KMEM #ifdef CONFIG_MEMCG_KMEM
return memcg->tcp_mem.memory_pressure; if (memcg->tcp_mem.memory_pressure)
#else return true;
return false;
#endif #endif
do {
if (time_before(jiffies, memcg->socket_pressure))
return true;
} while ((memcg = parent_mem_cgroup(memcg)));
return false;
} }
#else #else
#define mem_cgroup_sockets_enabled 0 #define mem_cgroup_sockets_enabled 0

View File

@ -12,6 +12,9 @@
struct vmpressure { struct vmpressure {
unsigned long scanned; unsigned long scanned;
unsigned long reclaimed; unsigned long reclaimed;
unsigned long tree_scanned;
unsigned long tree_reclaimed;
/* The lock is used to keep the scanned/reclaimed above in sync. */ /* The lock is used to keep the scanned/reclaimed above in sync. */
struct spinlock sr_lock; struct spinlock sr_lock;
@ -26,7 +29,7 @@ struct vmpressure {
struct mem_cgroup; struct mem_cgroup;
#ifdef CONFIG_MEMCG #ifdef CONFIG_MEMCG
extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
unsigned long scanned, unsigned long reclaimed); unsigned long scanned, unsigned long reclaimed);
extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio); extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio);
@ -40,7 +43,7 @@ extern int vmpressure_register_event(struct mem_cgroup *memcg,
extern void vmpressure_unregister_event(struct mem_cgroup *memcg, extern void vmpressure_unregister_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd); struct eventfd_ctx *eventfd);
#else #else
static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
unsigned long scanned, unsigned long reclaimed) {} unsigned long scanned, unsigned long reclaimed) {}
static inline void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, static inline void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg,
int prio) {} int prio) {}

View File

@ -1113,9 +1113,6 @@ bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
return ret; return ret;
} }
#define mem_cgroup_from_counter(counter, member) \
container_of(counter, struct mem_cgroup, member)
/** /**
* mem_cgroup_margin - calculate chargeable space of a memory cgroup * mem_cgroup_margin - calculate chargeable space of a memory cgroup
* @memcg: the memory cgroup * @memcg: the memory cgroup
@ -4183,17 +4180,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
kfree(memcg); kfree(memcg);
} }
/*
* Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
*/
struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
{
if (!memcg->memory.parent)
return NULL;
return mem_cgroup_from_counter(memcg->memory.parent, memory);
}
EXPORT_SYMBOL(parent_mem_cgroup);
static struct cgroup_subsys_state * __ref static struct cgroup_subsys_state * __ref
mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{ {
@ -4233,6 +4219,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
#endif #endif
#ifdef CONFIG_CGROUP_WRITEBACK #ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(&memcg->cgwb_list); INIT_LIST_HEAD(&memcg->cgwb_list);
#endif
#ifdef CONFIG_INET
memcg->socket_pressure = jiffies;
#endif #endif
return &memcg->css; return &memcg->css;

View File

@ -137,14 +137,11 @@ struct vmpressure_event {
}; };
static bool vmpressure_event(struct vmpressure *vmpr, static bool vmpressure_event(struct vmpressure *vmpr,
unsigned long scanned, unsigned long reclaimed) enum vmpressure_levels level)
{ {
struct vmpressure_event *ev; struct vmpressure_event *ev;
enum vmpressure_levels level;
bool signalled = false; bool signalled = false;
level = vmpressure_calc_level(scanned, reclaimed);
mutex_lock(&vmpr->events_lock); mutex_lock(&vmpr->events_lock);
list_for_each_entry(ev, &vmpr->events, node) { list_for_each_entry(ev, &vmpr->events, node) {
@ -164,6 +161,7 @@ static void vmpressure_work_fn(struct work_struct *work)
struct vmpressure *vmpr = work_to_vmpressure(work); struct vmpressure *vmpr = work_to_vmpressure(work);
unsigned long scanned; unsigned long scanned;
unsigned long reclaimed; unsigned long reclaimed;
enum vmpressure_levels level;
spin_lock(&vmpr->sr_lock); spin_lock(&vmpr->sr_lock);
/* /*
@ -174,19 +172,21 @@ static void vmpressure_work_fn(struct work_struct *work)
* here. No need for any locks here since we don't care if * here. No need for any locks here since we don't care if
* vmpr->reclaimed is in sync. * vmpr->reclaimed is in sync.
*/ */
scanned = vmpr->scanned; scanned = vmpr->tree_scanned;
if (!scanned) { if (!scanned) {
spin_unlock(&vmpr->sr_lock); spin_unlock(&vmpr->sr_lock);
return; return;
} }
reclaimed = vmpr->reclaimed; reclaimed = vmpr->tree_reclaimed;
vmpr->scanned = 0; vmpr->tree_scanned = 0;
vmpr->reclaimed = 0; vmpr->tree_reclaimed = 0;
spin_unlock(&vmpr->sr_lock); spin_unlock(&vmpr->sr_lock);
level = vmpressure_calc_level(scanned, reclaimed);
do { do {
if (vmpressure_event(vmpr, scanned, reclaimed)) if (vmpressure_event(vmpr, level))
break; break;
/* /*
* If not handled, propagate the event upward into the * If not handled, propagate the event upward into the
@ -199,6 +199,7 @@ static void vmpressure_work_fn(struct work_struct *work)
* vmpressure() - Account memory pressure through scanned/reclaimed ratio * vmpressure() - Account memory pressure through scanned/reclaimed ratio
* @gfp: reclaimer's gfp mask * @gfp: reclaimer's gfp mask
* @memcg: cgroup memory controller handle * @memcg: cgroup memory controller handle
* @tree: legacy subtree mode
* @scanned: number of pages scanned * @scanned: number of pages scanned
* @reclaimed: number of pages reclaimed * @reclaimed: number of pages reclaimed
* *
@ -206,9 +207,16 @@ static void vmpressure_work_fn(struct work_struct *work)
* "instantaneous" memory pressure (scanned/reclaimed ratio). The raw * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw
* pressure index is then further refined and averaged over time. * pressure index is then further refined and averaged over time.
* *
* If @tree is set, vmpressure is in traditional userspace reporting
* mode: @memcg is considered the pressure root and userspace is
* notified of the entire subtree's reclaim efficiency.
*
* If @tree is not set, reclaim efficiency is recorded for @memcg, and
* only in-kernel users are notified.
*
* This function does not return any value. * This function does not return any value.
*/ */
void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
unsigned long scanned, unsigned long reclaimed) unsigned long scanned, unsigned long reclaimed)
{ {
struct vmpressure *vmpr = memcg_to_vmpressure(memcg); struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
@ -238,15 +246,47 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
if (!scanned) if (!scanned)
return; return;
spin_lock(&vmpr->sr_lock); if (tree) {
vmpr->scanned += scanned; spin_lock(&vmpr->sr_lock);
vmpr->reclaimed += reclaimed; vmpr->tree_scanned += scanned;
scanned = vmpr->scanned; vmpr->tree_reclaimed += reclaimed;
spin_unlock(&vmpr->sr_lock); scanned = vmpr->scanned;
spin_unlock(&vmpr->sr_lock);
if (scanned < vmpressure_win) if (scanned < vmpressure_win)
return; return;
schedule_work(&vmpr->work); schedule_work(&vmpr->work);
} else {
enum vmpressure_levels level;
/* For now, no users for root-level efficiency */
if (memcg == root_mem_cgroup)
return;
spin_lock(&vmpr->sr_lock);
scanned = vmpr->scanned += scanned;
reclaimed = vmpr->reclaimed += reclaimed;
if (scanned < vmpressure_win) {
spin_unlock(&vmpr->sr_lock);
return;
}
vmpr->scanned = vmpr->reclaimed = 0;
spin_unlock(&vmpr->sr_lock);
level = vmpressure_calc_level(scanned, reclaimed);
if (level > VMPRESSURE_LOW) {
/*
* Let the socket buffer allocator know that
* we are having trouble reclaiming LRU pages.
*
* For hysteresis keep the pressure state
* asserted for a second in which subsequent
* pressure events can occur.
*/
memcg->socket_pressure = jiffies + HZ;
}
}
} }
/** /**
@ -276,7 +316,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
* to the vmpressure() basically means that we signal 'critical' * to the vmpressure() basically means that we signal 'critical'
* level. * level.
*/ */
vmpressure(gfp, memcg, vmpressure_win, 0); vmpressure(gfp, memcg, true, vmpressure_win, 0);
} }
/** /**

View File

@ -2396,6 +2396,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
memcg = mem_cgroup_iter(root, NULL, &reclaim); memcg = mem_cgroup_iter(root, NULL, &reclaim);
do { do {
unsigned long lru_pages; unsigned long lru_pages;
unsigned long reclaimed;
unsigned long scanned; unsigned long scanned;
struct lruvec *lruvec; struct lruvec *lruvec;
int swappiness; int swappiness;
@ -2408,6 +2409,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
lruvec = mem_cgroup_zone_lruvec(zone, memcg); lruvec = mem_cgroup_zone_lruvec(zone, memcg);
swappiness = mem_cgroup_swappiness(memcg); swappiness = mem_cgroup_swappiness(memcg);
reclaimed = sc->nr_reclaimed;
scanned = sc->nr_scanned; scanned = sc->nr_scanned;
shrink_lruvec(lruvec, swappiness, sc, &lru_pages); shrink_lruvec(lruvec, swappiness, sc, &lru_pages);
@ -2418,6 +2420,11 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
memcg, sc->nr_scanned - scanned, memcg, sc->nr_scanned - scanned,
lru_pages); lru_pages);
/* Record the group's reclaim efficiency */
vmpressure(sc->gfp_mask, memcg, false,
sc->nr_scanned - scanned,
sc->nr_reclaimed - reclaimed);
/* /*
* Direct reclaim and kswapd have to scan all memory * Direct reclaim and kswapd have to scan all memory
* cgroups to fulfill the overall scan target for the * cgroups to fulfill the overall scan target for the
@ -2449,7 +2456,8 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
reclaim_state->reclaimed_slab = 0; reclaim_state->reclaimed_slab = 0;
} }
vmpressure(sc->gfp_mask, sc->target_mem_cgroup, /* Record the subtree's reclaim efficiency */
vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
sc->nr_scanned - nr_scanned, sc->nr_scanned - nr_scanned,
sc->nr_reclaimed - nr_reclaimed); sc->nr_reclaimed - nr_reclaimed);