mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2025-01-04 04:06:26 +00:00
mm: memcg: move legacy memcg event code into memcontrol-v1.c
Cgroup v1's memory controller contains a fairly complicated event notification mechanism which is not used on cgroup v2. Let's move the corresponding code into memcontrol-v1.c.

Please note that mem_cgroup_event_ratelimit() remains in memcontrol.c; otherwise it would require exporting too many details of memcg stats outside of memcontrol.c.

Link: https://lkml.kernel.org/r/20240625005906.106920-7-roman.gushchin@linux.dev
Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
This commit is contained in:
parent
b9eaacb1db
commit
66d60c428b
@ -69,18 +69,6 @@ struct mem_cgroup_id {
|
|||||||
refcount_t ref;
|
refcount_t ref;
|
||||||
};
|
};
|
||||||
|
|
||||||
/*
|
|
||||||
* Per memcg event counter is incremented at every pagein/pageout. With THP,
|
|
||||||
* it will be incremented by the number of pages. This counter is used
|
|
||||||
* to trigger some periodic events. This is straightforward and better
|
|
||||||
* than using jiffies etc. to handle periodic memcg event.
|
|
||||||
*/
|
|
||||||
enum mem_cgroup_events_target {
|
|
||||||
MEM_CGROUP_TARGET_THRESH,
|
|
||||||
MEM_CGROUP_TARGET_SOFTLIMIT,
|
|
||||||
MEM_CGROUP_NTARGETS,
|
|
||||||
};
|
|
||||||
|
|
||||||
struct memcg_vmstats_percpu;
|
struct memcg_vmstats_percpu;
|
||||||
struct memcg_vmstats;
|
struct memcg_vmstats;
|
||||||
struct lruvec_stats_percpu;
|
struct lruvec_stats_percpu;
|
||||||
|
@ -6,6 +6,10 @@
|
|||||||
#include <linux/pagewalk.h>
|
#include <linux/pagewalk.h>
|
||||||
#include <linux/backing-dev.h>
|
#include <linux/backing-dev.h>
|
||||||
#include <linux/swap_cgroup.h>
|
#include <linux/swap_cgroup.h>
|
||||||
|
#include <linux/eventfd.h>
|
||||||
|
#include <linux/poll.h>
|
||||||
|
#include <linux/sort.h>
|
||||||
|
#include <linux/file.h>
|
||||||
|
|
||||||
#include "internal.h"
|
#include "internal.h"
|
||||||
#include "swap.h"
|
#include "swap.h"
|
||||||
@ -60,6 +64,54 @@ static struct move_charge_struct {
|
|||||||
.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
|
.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/* for OOM */
|
||||||
|
struct mem_cgroup_eventfd_list {
|
||||||
|
struct list_head list;
|
||||||
|
struct eventfd_ctx *eventfd;
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
* cgroup_event represents events which userspace want to receive.
|
||||||
|
*/
|
||||||
|
struct mem_cgroup_event {
|
||||||
|
/*
|
||||||
|
* memcg which the event belongs to.
|
||||||
|
*/
|
||||||
|
struct mem_cgroup *memcg;
|
||||||
|
/*
|
||||||
|
* eventfd to signal userspace about the event.
|
||||||
|
*/
|
||||||
|
struct eventfd_ctx *eventfd;
|
||||||
|
/*
|
||||||
|
* Each of these stored in a list by the cgroup.
|
||||||
|
*/
|
||||||
|
struct list_head list;
|
||||||
|
/*
|
||||||
|
* register_event() callback will be used to add new userspace
|
||||||
|
* waiter for changes related to this event. Use eventfd_signal()
|
||||||
|
* on eventfd to send notification to userspace.
|
||||||
|
*/
|
||||||
|
int (*register_event)(struct mem_cgroup *memcg,
|
||||||
|
struct eventfd_ctx *eventfd, const char *args);
|
||||||
|
/*
|
||||||
|
* unregister_event() callback will be called when userspace closes
|
||||||
|
* the eventfd or on cgroup removing. This callback must be set,
|
||||||
|
* if you want provide notification functionality.
|
||||||
|
*/
|
||||||
|
void (*unregister_event)(struct mem_cgroup *memcg,
|
||||||
|
struct eventfd_ctx *eventfd);
|
||||||
|
/*
|
||||||
|
* All fields below needed to unregister event when
|
||||||
|
* userspace closes eventfd.
|
||||||
|
*/
|
||||||
|
poll_table pt;
|
||||||
|
wait_queue_head_t *wqh;
|
||||||
|
wait_queue_entry_t wait;
|
||||||
|
struct work_struct remove;
|
||||||
|
};
|
||||||
|
|
||||||
|
extern spinlock_t memcg_oom_lock;
|
||||||
|
|
||||||
static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
|
static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
|
||||||
struct mem_cgroup_tree_per_node *mctz,
|
struct mem_cgroup_tree_per_node *mctz,
|
||||||
unsigned long new_usage_in_excess)
|
unsigned long new_usage_in_excess)
|
||||||
@ -1306,6 +1358,607 @@ void memcg1_move_task(void)
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
|
||||||
|
{
|
||||||
|
struct mem_cgroup_threshold_ary *t;
|
||||||
|
unsigned long usage;
|
||||||
|
int i;
|
||||||
|
|
||||||
|
rcu_read_lock();
|
||||||
|
if (!swap)
|
||||||
|
t = rcu_dereference(memcg->thresholds.primary);
|
||||||
|
else
|
||||||
|
t = rcu_dereference(memcg->memsw_thresholds.primary);
|
||||||
|
|
||||||
|
if (!t)
|
||||||
|
goto unlock;
|
||||||
|
|
||||||
|
usage = mem_cgroup_usage(memcg, swap);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* current_threshold points to threshold just below or equal to usage.
|
||||||
|
* If it's not true, a threshold was crossed after last
|
||||||
|
* call of __mem_cgroup_threshold().
|
||||||
|
*/
|
||||||
|
i = t->current_threshold;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Iterate backward over array of thresholds starting from
|
||||||
|
* current_threshold and check if a threshold is crossed.
|
||||||
|
* If none of thresholds below usage is crossed, we read
|
||||||
|
* only one element of the array here.
|
||||||
|
*/
|
||||||
|
for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
|
||||||
|
eventfd_signal(t->entries[i].eventfd);
|
||||||
|
|
||||||
|
/* i = current_threshold + 1 */
|
||||||
|
i++;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Iterate forward over array of thresholds starting from
|
||||||
|
* current_threshold+1 and check if a threshold is crossed.
|
||||||
|
* If none of thresholds above usage is crossed, we read
|
||||||
|
* only one element of the array here.
|
||||||
|
*/
|
||||||
|
for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
|
||||||
|
eventfd_signal(t->entries[i].eventfd);
|
||||||
|
|
||||||
|
/* Update current_threshold */
|
||||||
|
t->current_threshold = i - 1;
|
||||||
|
unlock:
|
||||||
|
rcu_read_unlock();
|
||||||
|
}
|
||||||
|
|
||||||
|
static void mem_cgroup_threshold(struct mem_cgroup *memcg)
|
||||||
|
{
|
||||||
|
while (memcg) {
|
||||||
|
__mem_cgroup_threshold(memcg, false);
|
||||||
|
if (do_memsw_account())
|
||||||
|
__mem_cgroup_threshold(memcg, true);
|
||||||
|
|
||||||
|
memcg = parent_mem_cgroup(memcg);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Check events in order.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
void memcg_check_events(struct mem_cgroup *memcg, int nid)
|
||||||
|
{
|
||||||
|
if (IS_ENABLED(CONFIG_PREEMPT_RT))
|
||||||
|
return;
|
||||||
|
|
||||||
|
/* threshold event is triggered in finer grain than soft limit */
|
||||||
|
if (unlikely(mem_cgroup_event_ratelimit(memcg,
|
||||||
|
MEM_CGROUP_TARGET_THRESH))) {
|
||||||
|
bool do_softlimit;
|
||||||
|
|
||||||
|
do_softlimit = mem_cgroup_event_ratelimit(memcg,
|
||||||
|
MEM_CGROUP_TARGET_SOFTLIMIT);
|
||||||
|
mem_cgroup_threshold(memcg);
|
||||||
|
if (unlikely(do_softlimit))
|
||||||
|
memcg1_update_tree(memcg, nid);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static int compare_thresholds(const void *a, const void *b)
|
||||||
|
{
|
||||||
|
const struct mem_cgroup_threshold *_a = a;
|
||||||
|
const struct mem_cgroup_threshold *_b = b;
|
||||||
|
|
||||||
|
if (_a->threshold > _b->threshold)
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (_a->threshold < _b->threshold)
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
|
||||||
|
{
|
||||||
|
struct mem_cgroup_eventfd_list *ev;
|
||||||
|
|
||||||
|
spin_lock(&memcg_oom_lock);
|
||||||
|
|
||||||
|
list_for_each_entry(ev, &memcg->oom_notify, list)
|
||||||
|
eventfd_signal(ev->eventfd);
|
||||||
|
|
||||||
|
spin_unlock(&memcg_oom_lock);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
|
||||||
|
{
|
||||||
|
struct mem_cgroup *iter;
|
||||||
|
|
||||||
|
for_each_mem_cgroup_tree(iter, memcg)
|
||||||
|
mem_cgroup_oom_notify_cb(iter);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
|
||||||
|
struct eventfd_ctx *eventfd, const char *args, enum res_type type)
|
||||||
|
{
|
||||||
|
struct mem_cgroup_thresholds *thresholds;
|
||||||
|
struct mem_cgroup_threshold_ary *new;
|
||||||
|
unsigned long threshold;
|
||||||
|
unsigned long usage;
|
||||||
|
int i, size, ret;
|
||||||
|
|
||||||
|
ret = page_counter_memparse(args, "-1", &threshold);
|
||||||
|
if (ret)
|
||||||
|
return ret;
|
||||||
|
|
||||||
|
mutex_lock(&memcg->thresholds_lock);
|
||||||
|
|
||||||
|
if (type == _MEM) {
|
||||||
|
thresholds = &memcg->thresholds;
|
||||||
|
usage = mem_cgroup_usage(memcg, false);
|
||||||
|
} else if (type == _MEMSWAP) {
|
||||||
|
thresholds = &memcg->memsw_thresholds;
|
||||||
|
usage = mem_cgroup_usage(memcg, true);
|
||||||
|
} else
|
||||||
|
BUG();
|
||||||
|
|
||||||
|
/* Check if a threshold crossed before adding a new one */
|
||||||
|
if (thresholds->primary)
|
||||||
|
__mem_cgroup_threshold(memcg, type == _MEMSWAP);
|
||||||
|
|
||||||
|
size = thresholds->primary ? thresholds->primary->size + 1 : 1;
|
||||||
|
|
||||||
|
/* Allocate memory for new array of thresholds */
|
||||||
|
new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
|
||||||
|
if (!new) {
|
||||||
|
ret = -ENOMEM;
|
||||||
|
goto unlock;
|
||||||
|
}
|
||||||
|
new->size = size;
|
||||||
|
|
||||||
|
/* Copy thresholds (if any) to new array */
|
||||||
|
if (thresholds->primary)
|
||||||
|
memcpy(new->entries, thresholds->primary->entries,
|
||||||
|
flex_array_size(new, entries, size - 1));
|
||||||
|
|
||||||
|
/* Add new threshold */
|
||||||
|
new->entries[size - 1].eventfd = eventfd;
|
||||||
|
new->entries[size - 1].threshold = threshold;
|
||||||
|
|
||||||
|
/* Sort thresholds. Registering of new threshold isn't time-critical */
|
||||||
|
sort(new->entries, size, sizeof(*new->entries),
|
||||||
|
compare_thresholds, NULL);
|
||||||
|
|
||||||
|
/* Find current threshold */
|
||||||
|
new->current_threshold = -1;
|
||||||
|
for (i = 0; i < size; i++) {
|
||||||
|
if (new->entries[i].threshold <= usage) {
|
||||||
|
/*
|
||||||
|
* new->current_threshold will not be used until
|
||||||
|
* rcu_assign_pointer(), so it's safe to increment
|
||||||
|
* it here.
|
||||||
|
*/
|
||||||
|
++new->current_threshold;
|
||||||
|
} else
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Free old spare buffer and save old primary buffer as spare */
|
||||||
|
kfree(thresholds->spare);
|
||||||
|
thresholds->spare = thresholds->primary;
|
||||||
|
|
||||||
|
rcu_assign_pointer(thresholds->primary, new);
|
||||||
|
|
||||||
|
/* To be sure that nobody uses thresholds */
|
||||||
|
synchronize_rcu();
|
||||||
|
|
||||||
|
unlock:
|
||||||
|
mutex_unlock(&memcg->thresholds_lock);
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
|
||||||
|
struct eventfd_ctx *eventfd, const char *args)
|
||||||
|
{
|
||||||
|
return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
|
||||||
|
struct eventfd_ctx *eventfd, const char *args)
|
||||||
|
{
|
||||||
|
return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
|
||||||
|
struct eventfd_ctx *eventfd, enum res_type type)
|
||||||
|
{
|
||||||
|
struct mem_cgroup_thresholds *thresholds;
|
||||||
|
struct mem_cgroup_threshold_ary *new;
|
||||||
|
unsigned long usage;
|
||||||
|
int i, j, size, entries;
|
||||||
|
|
||||||
|
mutex_lock(&memcg->thresholds_lock);
|
||||||
|
|
||||||
|
if (type == _MEM) {
|
||||||
|
thresholds = &memcg->thresholds;
|
||||||
|
usage = mem_cgroup_usage(memcg, false);
|
||||||
|
} else if (type == _MEMSWAP) {
|
||||||
|
thresholds = &memcg->memsw_thresholds;
|
||||||
|
usage = mem_cgroup_usage(memcg, true);
|
||||||
|
} else
|
||||||
|
BUG();
|
||||||
|
|
||||||
|
if (!thresholds->primary)
|
||||||
|
goto unlock;
|
||||||
|
|
||||||
|
/* Check if a threshold crossed before removing */
|
||||||
|
__mem_cgroup_threshold(memcg, type == _MEMSWAP);
|
||||||
|
|
||||||
|
/* Calculate new number of threshold */
|
||||||
|
size = entries = 0;
|
||||||
|
for (i = 0; i < thresholds->primary->size; i++) {
|
||||||
|
if (thresholds->primary->entries[i].eventfd != eventfd)
|
||||||
|
size++;
|
||||||
|
else
|
||||||
|
entries++;
|
||||||
|
}
|
||||||
|
|
||||||
|
new = thresholds->spare;
|
||||||
|
|
||||||
|
/* If no items related to eventfd have been cleared, nothing to do */
|
||||||
|
if (!entries)
|
||||||
|
goto unlock;
|
||||||
|
|
||||||
|
/* Set thresholds array to NULL if we don't have thresholds */
|
||||||
|
if (!size) {
|
||||||
|
kfree(new);
|
||||||
|
new = NULL;
|
||||||
|
goto swap_buffers;
|
||||||
|
}
|
||||||
|
|
||||||
|
new->size = size;
|
||||||
|
|
||||||
|
/* Copy thresholds and find current threshold */
|
||||||
|
new->current_threshold = -1;
|
||||||
|
for (i = 0, j = 0; i < thresholds->primary->size; i++) {
|
||||||
|
if (thresholds->primary->entries[i].eventfd == eventfd)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
new->entries[j] = thresholds->primary->entries[i];
|
||||||
|
if (new->entries[j].threshold <= usage) {
|
||||||
|
/*
|
||||||
|
* new->current_threshold will not be used
|
||||||
|
* until rcu_assign_pointer(), so it's safe to increment
|
||||||
|
* it here.
|
||||||
|
*/
|
||||||
|
++new->current_threshold;
|
||||||
|
}
|
||||||
|
j++;
|
||||||
|
}
|
||||||
|
|
||||||
|
swap_buffers:
|
||||||
|
/* Swap primary and spare array */
|
||||||
|
thresholds->spare = thresholds->primary;
|
||||||
|
|
||||||
|
rcu_assign_pointer(thresholds->primary, new);
|
||||||
|
|
||||||
|
/* To be sure that nobody uses thresholds */
|
||||||
|
synchronize_rcu();
|
||||||
|
|
||||||
|
/* If all events are unregistered, free the spare array */
|
||||||
|
if (!new) {
|
||||||
|
kfree(thresholds->spare);
|
||||||
|
thresholds->spare = NULL;
|
||||||
|
}
|
||||||
|
unlock:
|
||||||
|
mutex_unlock(&memcg->thresholds_lock);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
|
||||||
|
struct eventfd_ctx *eventfd)
|
||||||
|
{
|
||||||
|
return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
|
||||||
|
struct eventfd_ctx *eventfd)
|
||||||
|
{
|
||||||
|
return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
|
||||||
|
struct eventfd_ctx *eventfd, const char *args)
|
||||||
|
{
|
||||||
|
struct mem_cgroup_eventfd_list *event;
|
||||||
|
|
||||||
|
event = kmalloc(sizeof(*event), GFP_KERNEL);
|
||||||
|
if (!event)
|
||||||
|
return -ENOMEM;
|
||||||
|
|
||||||
|
spin_lock(&memcg_oom_lock);
|
||||||
|
|
||||||
|
event->eventfd = eventfd;
|
||||||
|
list_add(&event->list, &memcg->oom_notify);
|
||||||
|
|
||||||
|
/* already in OOM ? */
|
||||||
|
if (memcg->under_oom)
|
||||||
|
eventfd_signal(eventfd);
|
||||||
|
spin_unlock(&memcg_oom_lock);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
|
||||||
|
struct eventfd_ctx *eventfd)
|
||||||
|
{
|
||||||
|
struct mem_cgroup_eventfd_list *ev, *tmp;
|
||||||
|
|
||||||
|
spin_lock(&memcg_oom_lock);
|
||||||
|
|
||||||
|
list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
|
||||||
|
if (ev->eventfd == eventfd) {
|
||||||
|
list_del(&ev->list);
|
||||||
|
kfree(ev);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
spin_unlock(&memcg_oom_lock);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* DO NOT USE IN NEW FILES.
|
||||||
|
*
|
||||||
|
* "cgroup.event_control" implementation.
|
||||||
|
*
|
||||||
|
* This is way over-engineered. It tries to support fully configurable
|
||||||
|
* events for each user. Such level of flexibility is completely
|
||||||
|
* unnecessary especially in the light of the planned unified hierarchy.
|
||||||
|
*
|
||||||
|
* Please deprecate this and replace with something simpler if at all
|
||||||
|
* possible.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Unregister event and free resources.
|
||||||
|
*
|
||||||
|
* Gets called from workqueue.
|
||||||
|
*/
|
||||||
|
static void memcg_event_remove(struct work_struct *work)
|
||||||
|
{
|
||||||
|
struct mem_cgroup_event *event =
|
||||||
|
container_of(work, struct mem_cgroup_event, remove);
|
||||||
|
struct mem_cgroup *memcg = event->memcg;
|
||||||
|
|
||||||
|
remove_wait_queue(event->wqh, &event->wait);
|
||||||
|
|
||||||
|
event->unregister_event(memcg, event->eventfd);
|
||||||
|
|
||||||
|
/* Notify userspace the event is going away. */
|
||||||
|
eventfd_signal(event->eventfd);
|
||||||
|
|
||||||
|
eventfd_ctx_put(event->eventfd);
|
||||||
|
kfree(event);
|
||||||
|
css_put(&memcg->css);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Gets called on EPOLLHUP on eventfd when user closes it.
|
||||||
|
*
|
||||||
|
* Called with wqh->lock held and interrupts disabled.
|
||||||
|
*/
|
||||||
|
static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
|
||||||
|
int sync, void *key)
|
||||||
|
{
|
||||||
|
struct mem_cgroup_event *event =
|
||||||
|
container_of(wait, struct mem_cgroup_event, wait);
|
||||||
|
struct mem_cgroup *memcg = event->memcg;
|
||||||
|
__poll_t flags = key_to_poll(key);
|
||||||
|
|
||||||
|
if (flags & EPOLLHUP) {
|
||||||
|
/*
|
||||||
|
* If the event has been detached at cgroup removal, we
|
||||||
|
* can simply return knowing the other side will cleanup
|
||||||
|
* for us.
|
||||||
|
*
|
||||||
|
* We can't race against event freeing since the other
|
||||||
|
* side will require wqh->lock via remove_wait_queue(),
|
||||||
|
* which we hold.
|
||||||
|
*/
|
||||||
|
spin_lock(&memcg->event_list_lock);
|
||||||
|
if (!list_empty(&event->list)) {
|
||||||
|
list_del_init(&event->list);
|
||||||
|
/*
|
||||||
|
* We are in atomic context, but cgroup_event_remove()
|
||||||
|
* may sleep, so we have to call it in workqueue.
|
||||||
|
*/
|
||||||
|
schedule_work(&event->remove);
|
||||||
|
}
|
||||||
|
spin_unlock(&memcg->event_list_lock);
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void memcg_event_ptable_queue_proc(struct file *file,
|
||||||
|
wait_queue_head_t *wqh, poll_table *pt)
|
||||||
|
{
|
||||||
|
struct mem_cgroup_event *event =
|
||||||
|
container_of(pt, struct mem_cgroup_event, pt);
|
||||||
|
|
||||||
|
event->wqh = wqh;
|
||||||
|
add_wait_queue(wqh, &event->wait);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* DO NOT USE IN NEW FILES.
|
||||||
|
*
|
||||||
|
* Parse input and register new cgroup event handler.
|
||||||
|
*
|
||||||
|
* Input must be in format '<event_fd> <control_fd> <args>'.
|
||||||
|
* Interpretation of args is defined by control file implementation.
|
||||||
|
*/
|
||||||
|
ssize_t memcg_write_event_control(struct kernfs_open_file *of,
|
||||||
|
char *buf, size_t nbytes, loff_t off)
|
||||||
|
{
|
||||||
|
struct cgroup_subsys_state *css = of_css(of);
|
||||||
|
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
|
||||||
|
struct mem_cgroup_event *event;
|
||||||
|
struct cgroup_subsys_state *cfile_css;
|
||||||
|
unsigned int efd, cfd;
|
||||||
|
struct fd efile;
|
||||||
|
struct fd cfile;
|
||||||
|
struct dentry *cdentry;
|
||||||
|
const char *name;
|
||||||
|
char *endp;
|
||||||
|
int ret;
|
||||||
|
|
||||||
|
if (IS_ENABLED(CONFIG_PREEMPT_RT))
|
||||||
|
return -EOPNOTSUPP;
|
||||||
|
|
||||||
|
buf = strstrip(buf);
|
||||||
|
|
||||||
|
efd = simple_strtoul(buf, &endp, 10);
|
||||||
|
if (*endp != ' ')
|
||||||
|
return -EINVAL;
|
||||||
|
buf = endp + 1;
|
||||||
|
|
||||||
|
cfd = simple_strtoul(buf, &endp, 10);
|
||||||
|
if ((*endp != ' ') && (*endp != '\0'))
|
||||||
|
return -EINVAL;
|
||||||
|
buf = endp + 1;
|
||||||
|
|
||||||
|
event = kzalloc(sizeof(*event), GFP_KERNEL);
|
||||||
|
if (!event)
|
||||||
|
return -ENOMEM;
|
||||||
|
|
||||||
|
event->memcg = memcg;
|
||||||
|
INIT_LIST_HEAD(&event->list);
|
||||||
|
init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
|
||||||
|
init_waitqueue_func_entry(&event->wait, memcg_event_wake);
|
||||||
|
INIT_WORK(&event->remove, memcg_event_remove);
|
||||||
|
|
||||||
|
efile = fdget(efd);
|
||||||
|
if (!efile.file) {
|
||||||
|
ret = -EBADF;
|
||||||
|
goto out_kfree;
|
||||||
|
}
|
||||||
|
|
||||||
|
event->eventfd = eventfd_ctx_fileget(efile.file);
|
||||||
|
if (IS_ERR(event->eventfd)) {
|
||||||
|
ret = PTR_ERR(event->eventfd);
|
||||||
|
goto out_put_efile;
|
||||||
|
}
|
||||||
|
|
||||||
|
cfile = fdget(cfd);
|
||||||
|
if (!cfile.file) {
|
||||||
|
ret = -EBADF;
|
||||||
|
goto out_put_eventfd;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* the process need read permission on control file */
|
||||||
|
/* AV: shouldn't we check that it's been opened for read instead? */
|
||||||
|
ret = file_permission(cfile.file, MAY_READ);
|
||||||
|
if (ret < 0)
|
||||||
|
goto out_put_cfile;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The control file must be a regular cgroup1 file. As a regular cgroup
|
||||||
|
* file can't be renamed, it's safe to access its name afterwards.
|
||||||
|
*/
|
||||||
|
cdentry = cfile.file->f_path.dentry;
|
||||||
|
if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
|
||||||
|
ret = -EINVAL;
|
||||||
|
goto out_put_cfile;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Determine the event callbacks and set them in @event. This used
|
||||||
|
* to be done via struct cftype but cgroup core no longer knows
|
||||||
|
* about these events. The following is crude but the whole thing
|
||||||
|
* is for compatibility anyway.
|
||||||
|
*
|
||||||
|
* DO NOT ADD NEW FILES.
|
||||||
|
*/
|
||||||
|
name = cdentry->d_name.name;
|
||||||
|
|
||||||
|
if (!strcmp(name, "memory.usage_in_bytes")) {
|
||||||
|
event->register_event = mem_cgroup_usage_register_event;
|
||||||
|
event->unregister_event = mem_cgroup_usage_unregister_event;
|
||||||
|
} else if (!strcmp(name, "memory.oom_control")) {
|
||||||
|
event->register_event = mem_cgroup_oom_register_event;
|
||||||
|
event->unregister_event = mem_cgroup_oom_unregister_event;
|
||||||
|
} else if (!strcmp(name, "memory.pressure_level")) {
|
||||||
|
event->register_event = vmpressure_register_event;
|
||||||
|
event->unregister_event = vmpressure_unregister_event;
|
||||||
|
} else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
|
||||||
|
event->register_event = memsw_cgroup_usage_register_event;
|
||||||
|
event->unregister_event = memsw_cgroup_usage_unregister_event;
|
||||||
|
} else {
|
||||||
|
ret = -EINVAL;
|
||||||
|
goto out_put_cfile;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Verify @cfile should belong to @css. Also, remaining events are
|
||||||
|
* automatically removed on cgroup destruction but the removal is
|
||||||
|
* asynchronous, so take an extra ref on @css.
|
||||||
|
*/
|
||||||
|
cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
|
||||||
|
&memory_cgrp_subsys);
|
||||||
|
ret = -EINVAL;
|
||||||
|
if (IS_ERR(cfile_css))
|
||||||
|
goto out_put_cfile;
|
||||||
|
if (cfile_css != css) {
|
||||||
|
css_put(cfile_css);
|
||||||
|
goto out_put_cfile;
|
||||||
|
}
|
||||||
|
|
||||||
|
ret = event->register_event(memcg, event->eventfd, buf);
|
||||||
|
if (ret)
|
||||||
|
goto out_put_css;
|
||||||
|
|
||||||
|
vfs_poll(efile.file, &event->pt);
|
||||||
|
|
||||||
|
spin_lock_irq(&memcg->event_list_lock);
|
||||||
|
list_add(&event->list, &memcg->event_list);
|
||||||
|
spin_unlock_irq(&memcg->event_list_lock);
|
||||||
|
|
||||||
|
fdput(cfile);
|
||||||
|
fdput(efile);
|
||||||
|
|
||||||
|
return nbytes;
|
||||||
|
|
||||||
|
out_put_css:
|
||||||
|
css_put(css);
|
||||||
|
out_put_cfile:
|
||||||
|
fdput(cfile);
|
||||||
|
out_put_eventfd:
|
||||||
|
eventfd_ctx_put(event->eventfd);
|
||||||
|
out_put_efile:
|
||||||
|
fdput(efile);
|
||||||
|
out_kfree:
|
||||||
|
kfree(event);
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
void memcg1_css_offline(struct mem_cgroup *memcg)
|
||||||
|
{
|
||||||
|
struct mem_cgroup_event *event, *tmp;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Unregister events and notify userspace.
|
||||||
|
* Notify userspace about cgroup removing only after rmdir of cgroup
|
||||||
|
* directory to avoid race between userspace and kernelspace.
|
||||||
|
*/
|
||||||
|
spin_lock_irq(&memcg->event_list_lock);
|
||||||
|
list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
|
||||||
|
list_del_init(&event->list);
|
||||||
|
schedule_work(&event->remove);
|
||||||
|
}
|
||||||
|
spin_unlock_irq(&memcg->event_list_lock);
|
||||||
|
}
|
||||||
|
|
||||||
static int __init memcg1_init(void)
|
static int __init memcg1_init(void)
|
||||||
{
|
{
|
||||||
int node;
|
int node;
|
||||||
|
@ -41,4 +41,55 @@ u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
|
|||||||
int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
|
int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
|
||||||
struct cftype *cft, u64 val);
|
struct cftype *cft, u64 val);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Per memcg event counter is incremented at every pagein/pageout. With THP,
|
||||||
|
* it will be incremented by the number of pages. This counter is used
|
||||||
|
* to trigger some periodic events. This is straightforward and better
|
||||||
|
* than using jiffies etc. to handle periodic memcg event.
|
||||||
|
*/
|
||||||
|
enum mem_cgroup_events_target {
|
||||||
|
MEM_CGROUP_TARGET_THRESH,
|
||||||
|
MEM_CGROUP_TARGET_SOFTLIMIT,
|
||||||
|
MEM_CGROUP_NTARGETS,
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Whether legacy memory+swap accounting is active */
|
||||||
|
static bool do_memsw_account(void)
|
||||||
|
{
|
||||||
|
return !cgroup_subsys_on_dfl(memory_cgrp_subsys);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Iteration constructs for visiting all cgroups (under a tree). If
|
||||||
|
* loops are exited prematurely (break), mem_cgroup_iter_break() must
|
||||||
|
* be used for reference counting.
|
||||||
|
*/
|
||||||
|
#define for_each_mem_cgroup_tree(iter, root) \
|
||||||
|
for (iter = mem_cgroup_iter(root, NULL, NULL); \
|
||||||
|
iter != NULL; \
|
||||||
|
iter = mem_cgroup_iter(root, iter, NULL))
|
||||||
|
|
||||||
|
#define for_each_mem_cgroup(iter) \
|
||||||
|
for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
|
||||||
|
iter != NULL; \
|
||||||
|
iter = mem_cgroup_iter(NULL, iter, NULL))
|
||||||
|
|
||||||
|
void memcg1_css_offline(struct mem_cgroup *memcg);
|
||||||
|
|
||||||
|
/* for encoding cft->private value on file */
|
||||||
|
enum res_type {
|
||||||
|
_MEM,
|
||||||
|
_MEMSWAP,
|
||||||
|
_KMEM,
|
||||||
|
_TCP,
|
||||||
|
};
|
||||||
|
|
||||||
|
bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
|
||||||
|
enum mem_cgroup_events_target target);
|
||||||
|
unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap);
|
||||||
|
void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
|
||||||
|
ssize_t memcg_write_event_control(struct kernfs_open_file *of,
|
||||||
|
char *buf, size_t nbytes, loff_t off);
|
||||||
|
|
||||||
|
|
||||||
#endif /* __MM_MEMCONTROL_V1_H */
|
#endif /* __MM_MEMCONTROL_V1_H */
|
||||||
|
687
mm/memcontrol.c
687
mm/memcontrol.c
@ -46,9 +46,6 @@
|
|||||||
#include <linux/slab.h>
|
#include <linux/slab.h>
|
||||||
#include <linux/swapops.h>
|
#include <linux/swapops.h>
|
||||||
#include <linux/spinlock.h>
|
#include <linux/spinlock.h>
|
||||||
#include <linux/eventfd.h>
|
|
||||||
#include <linux/poll.h>
|
|
||||||
#include <linux/sort.h>
|
|
||||||
#include <linux/fs.h>
|
#include <linux/fs.h>
|
||||||
#include <linux/seq_file.h>
|
#include <linux/seq_file.h>
|
||||||
#include <linux/vmpressure.h>
|
#include <linux/vmpressure.h>
|
||||||
@ -58,7 +55,6 @@
|
|||||||
#include <linux/cpu.h>
|
#include <linux/cpu.h>
|
||||||
#include <linux/oom.h>
|
#include <linux/oom.h>
|
||||||
#include <linux/lockdep.h>
|
#include <linux/lockdep.h>
|
||||||
#include <linux/file.h>
|
|
||||||
#include <linux/resume_user_mode.h>
|
#include <linux/resume_user_mode.h>
|
||||||
#include <linux/psi.h>
|
#include <linux/psi.h>
|
||||||
#include <linux/seq_buf.h>
|
#include <linux/seq_buf.h>
|
||||||
@ -96,91 +92,13 @@ static bool cgroup_memory_nobpf __ro_after_init;
|
|||||||
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
|
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* Whether legacy memory+swap accounting is active */
|
|
||||||
static bool do_memsw_account(void)
|
|
||||||
{
|
|
||||||
return !cgroup_subsys_on_dfl(memory_cgrp_subsys);
|
|
||||||
}
|
|
||||||
|
|
||||||
#define THRESHOLDS_EVENTS_TARGET 128
|
#define THRESHOLDS_EVENTS_TARGET 128
|
||||||
#define SOFTLIMIT_EVENTS_TARGET 1024
|
#define SOFTLIMIT_EVENTS_TARGET 1024
|
||||||
|
|
||||||
/* for OOM */
|
|
||||||
struct mem_cgroup_eventfd_list {
|
|
||||||
struct list_head list;
|
|
||||||
struct eventfd_ctx *eventfd;
|
|
||||||
};
|
|
||||||
|
|
||||||
/*
|
|
||||||
* cgroup_event represents events which userspace want to receive.
|
|
||||||
*/
|
|
||||||
struct mem_cgroup_event {
|
|
||||||
/*
|
|
||||||
* memcg which the event belongs to.
|
|
||||||
*/
|
|
||||||
struct mem_cgroup *memcg;
|
|
||||||
/*
|
|
||||||
* eventfd to signal userspace about the event.
|
|
||||||
*/
|
|
||||||
struct eventfd_ctx *eventfd;
|
|
||||||
/*
|
|
||||||
* Each of these stored in a list by the cgroup.
|
|
||||||
*/
|
|
||||||
struct list_head list;
|
|
||||||
/*
|
|
||||||
* register_event() callback will be used to add new userspace
|
|
||||||
* waiter for changes related to this event. Use eventfd_signal()
|
|
||||||
* on eventfd to send notification to userspace.
|
|
||||||
*/
|
|
||||||
int (*register_event)(struct mem_cgroup *memcg,
|
|
||||||
struct eventfd_ctx *eventfd, const char *args);
|
|
||||||
/*
|
|
||||||
* unregister_event() callback will be called when userspace closes
|
|
||||||
* the eventfd or on cgroup removing. This callback must be set,
|
|
||||||
* if you want provide notification functionality.
|
|
||||||
*/
|
|
||||||
void (*unregister_event)(struct mem_cgroup *memcg,
|
|
||||||
struct eventfd_ctx *eventfd);
|
|
||||||
/*
|
|
||||||
* All fields below needed to unregister event when
|
|
||||||
* userspace closes eventfd.
|
|
||||||
*/
|
|
||||||
poll_table pt;
|
|
||||||
wait_queue_head_t *wqh;
|
|
||||||
wait_queue_entry_t wait;
|
|
||||||
struct work_struct remove;
|
|
||||||
};
|
|
||||||
|
|
||||||
static void mem_cgroup_threshold(struct mem_cgroup *memcg);
|
|
||||||
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
|
|
||||||
|
|
||||||
/* for encoding cft->private value on file */
|
|
||||||
enum res_type {
|
|
||||||
_MEM,
|
|
||||||
_MEMSWAP,
|
|
||||||
_KMEM,
|
|
||||||
_TCP,
|
|
||||||
};
|
|
||||||
|
|
||||||
#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
|
#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
|
||||||
#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
|
#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
|
||||||
#define MEMFILE_ATTR(val) ((val) & 0xffff)
|
#define MEMFILE_ATTR(val) ((val) & 0xffff)
|
||||||
|
|
||||||
/*
 * Iteration constructs for visiting all cgroups (under a tree). If
 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 * be used for reference counting.
 *
 * The macro arguments are parenthesized so that callers may pass
 * arbitrary expressions without precedence surprises. Note that @iter
 * and @root are still evaluated more than once.
 */
#define for_each_mem_cgroup_tree(iter, root)			\
	for ((iter) = mem_cgroup_iter((root), NULL, NULL);	\
	     (iter) != NULL;					\
	     (iter) = mem_cgroup_iter((root), (iter), NULL))

/* Same walk with a NULL root passed to mem_cgroup_iter(). */
#define for_each_mem_cgroup(iter)				\
	for_each_mem_cgroup_tree(iter, NULL)
|
|
||||||
|
|
||||||
static inline bool task_is_dying(void)
|
static inline bool task_is_dying(void)
|
||||||
{
|
{
|
||||||
return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
|
return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
|
||||||
@ -939,8 +857,8 @@ void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, int nr_pages)
|
|||||||
__this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
|
__this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
|
bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
|
||||||
enum mem_cgroup_events_target target)
|
enum mem_cgroup_events_target target)
|
||||||
{
|
{
|
||||||
unsigned long val, next;
|
unsigned long val, next;
|
||||||
|
|
||||||
@ -964,28 +882,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* Check events in order.
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
void memcg_check_events(struct mem_cgroup *memcg, int nid)
|
|
||||||
{
|
|
||||||
if (IS_ENABLED(CONFIG_PREEMPT_RT))
|
|
||||||
return;
|
|
||||||
|
|
||||||
/* threshold event is triggered in finer grain than soft limit */
|
|
||||||
if (unlikely(mem_cgroup_event_ratelimit(memcg,
|
|
||||||
MEM_CGROUP_TARGET_THRESH))) {
|
|
||||||
bool do_softlimit;
|
|
||||||
|
|
||||||
do_softlimit = mem_cgroup_event_ratelimit(memcg,
|
|
||||||
MEM_CGROUP_TARGET_SOFTLIMIT);
|
|
||||||
mem_cgroup_threshold(memcg);
|
|
||||||
if (unlikely(do_softlimit))
|
|
||||||
memcg1_update_tree(memcg, nid);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
|
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
|
||||||
{
|
{
|
||||||
/*
|
/*
|
||||||
@ -1725,7 +1621,7 @@ static struct lockdep_map memcg_oom_lock_dep_map = {
|
|||||||
};
|
};
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static DEFINE_SPINLOCK(memcg_oom_lock);
|
DEFINE_SPINLOCK(memcg_oom_lock);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Check OOM-Killer is already running under our hierarchy.
|
* Check OOM-Killer is already running under our hierarchy.
|
||||||
@ -3543,7 +3439,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
|
|||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
}
|
}
|
||||||
|
|
||||||
static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
|
unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
|
||||||
{
|
{
|
||||||
unsigned long val;
|
unsigned long val;
|
||||||
|
|
||||||
@ -4044,331 +3940,6 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
 * Signal the eventfd of every threshold that the current usage has crossed
 * since the previous call, then record the new position in the array.
 *
 * @swap selects the memsw thresholds instead of the plain memory ones.
 * The threshold array is read under rcu_read_lock().
 */
static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
{
	struct mem_cgroup_threshold_ary *ary;
	unsigned long usage;
	int idx;

	rcu_read_lock();
	if (swap)
		ary = rcu_dereference(memcg->memsw_thresholds.primary);
	else
		ary = rcu_dereference(memcg->thresholds.primary);

	if (ary) {
		usage = mem_cgroup_usage(memcg, swap);

		/*
		 * current_threshold points to the threshold just below or
		 * equal to usage. If that no longer holds, a threshold was
		 * crossed after the last call of __mem_cgroup_threshold().
		 */
		idx = ary->current_threshold;

		/*
		 * Walk backward from current_threshold and signal every
		 * threshold that usage dropped below. If nothing was
		 * crossed, only one element is read here.
		 */
		while (idx >= 0 &&
		       unlikely(ary->entries[idx].threshold > usage)) {
			eventfd_signal(ary->entries[idx].eventfd);
			idx--;
		}

		/* idx = current_threshold + 1 */
		idx++;

		/*
		 * Walk forward from current_threshold + 1 and signal every
		 * threshold that usage rose to or above. If nothing was
		 * crossed, only one element is read here.
		 */
		while (idx < ary->size &&
		       unlikely(ary->entries[idx].threshold <= usage)) {
			eventfd_signal(ary->entries[idx].eventfd);
			idx++;
		}

		/* Update current_threshold */
		ary->current_threshold = idx - 1;
	}
	rcu_read_unlock();
}
|
|
||||||
|
|
||||||
static void mem_cgroup_threshold(struct mem_cgroup *memcg)
|
|
||||||
{
|
|
||||||
while (memcg) {
|
|
||||||
__mem_cgroup_threshold(memcg, false);
|
|
||||||
if (do_memsw_account())
|
|
||||||
__mem_cgroup_threshold(memcg, true);
|
|
||||||
|
|
||||||
memcg = parent_mem_cgroup(memcg);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static int compare_thresholds(const void *a, const void *b)
|
|
||||||
{
|
|
||||||
const struct mem_cgroup_threshold *_a = a;
|
|
||||||
const struct mem_cgroup_threshold *_b = b;
|
|
||||||
|
|
||||||
if (_a->threshold > _b->threshold)
|
|
||||||
return 1;
|
|
||||||
|
|
||||||
if (_a->threshold < _b->threshold)
|
|
||||||
return -1;
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
|
|
||||||
{
|
|
||||||
struct mem_cgroup_eventfd_list *ev;
|
|
||||||
|
|
||||||
spin_lock(&memcg_oom_lock);
|
|
||||||
|
|
||||||
list_for_each_entry(ev, &memcg->oom_notify, list)
|
|
||||||
eventfd_signal(ev->eventfd);
|
|
||||||
|
|
||||||
spin_unlock(&memcg_oom_lock);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
|
|
||||||
{
|
|
||||||
struct mem_cgroup *iter;
|
|
||||||
|
|
||||||
for_each_mem_cgroup_tree(iter, memcg)
|
|
||||||
mem_cgroup_oom_notify_cb(iter);
|
|
||||||
}
|
|
||||||
|
|
||||||
static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
|
|
||||||
struct eventfd_ctx *eventfd, const char *args, enum res_type type)
|
|
||||||
{
|
|
||||||
struct mem_cgroup_thresholds *thresholds;
|
|
||||||
struct mem_cgroup_threshold_ary *new;
|
|
||||||
unsigned long threshold;
|
|
||||||
unsigned long usage;
|
|
||||||
int i, size, ret;
|
|
||||||
|
|
||||||
ret = page_counter_memparse(args, "-1", &threshold);
|
|
||||||
if (ret)
|
|
||||||
return ret;
|
|
||||||
|
|
||||||
mutex_lock(&memcg->thresholds_lock);
|
|
||||||
|
|
||||||
if (type == _MEM) {
|
|
||||||
thresholds = &memcg->thresholds;
|
|
||||||
usage = mem_cgroup_usage(memcg, false);
|
|
||||||
} else if (type == _MEMSWAP) {
|
|
||||||
thresholds = &memcg->memsw_thresholds;
|
|
||||||
usage = mem_cgroup_usage(memcg, true);
|
|
||||||
} else
|
|
||||||
BUG();
|
|
||||||
|
|
||||||
/* Check if a threshold crossed before adding a new one */
|
|
||||||
if (thresholds->primary)
|
|
||||||
__mem_cgroup_threshold(memcg, type == _MEMSWAP);
|
|
||||||
|
|
||||||
size = thresholds->primary ? thresholds->primary->size + 1 : 1;
|
|
||||||
|
|
||||||
/* Allocate memory for new array of thresholds */
|
|
||||||
new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
|
|
||||||
if (!new) {
|
|
||||||
ret = -ENOMEM;
|
|
||||||
goto unlock;
|
|
||||||
}
|
|
||||||
new->size = size;
|
|
||||||
|
|
||||||
/* Copy thresholds (if any) to new array */
|
|
||||||
if (thresholds->primary)
|
|
||||||
memcpy(new->entries, thresholds->primary->entries,
|
|
||||||
flex_array_size(new, entries, size - 1));
|
|
||||||
|
|
||||||
/* Add new threshold */
|
|
||||||
new->entries[size - 1].eventfd = eventfd;
|
|
||||||
new->entries[size - 1].threshold = threshold;
|
|
||||||
|
|
||||||
/* Sort thresholds. Registering of new threshold isn't time-critical */
|
|
||||||
sort(new->entries, size, sizeof(*new->entries),
|
|
||||||
compare_thresholds, NULL);
|
|
||||||
|
|
||||||
/* Find current threshold */
|
|
||||||
new->current_threshold = -1;
|
|
||||||
for (i = 0; i < size; i++) {
|
|
||||||
if (new->entries[i].threshold <= usage) {
|
|
||||||
/*
|
|
||||||
* new->current_threshold will not be used until
|
|
||||||
* rcu_assign_pointer(), so it's safe to increment
|
|
||||||
* it here.
|
|
||||||
*/
|
|
||||||
++new->current_threshold;
|
|
||||||
} else
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Free old spare buffer and save old primary buffer as spare */
|
|
||||||
kfree(thresholds->spare);
|
|
||||||
thresholds->spare = thresholds->primary;
|
|
||||||
|
|
||||||
rcu_assign_pointer(thresholds->primary, new);
|
|
||||||
|
|
||||||
/* To be sure that nobody uses thresholds */
|
|
||||||
synchronize_rcu();
|
|
||||||
|
|
||||||
unlock:
|
|
||||||
mutex_unlock(&memcg->thresholds_lock);
|
|
||||||
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
|
|
||||||
struct eventfd_ctx *eventfd, const char *args)
|
|
||||||
{
|
|
||||||
return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
|
|
||||||
}
|
|
||||||
|
|
||||||
static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
|
|
||||||
struct eventfd_ctx *eventfd, const char *args)
|
|
||||||
{
|
|
||||||
return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
|
|
||||||
struct eventfd_ctx *eventfd, enum res_type type)
|
|
||||||
{
|
|
||||||
struct mem_cgroup_thresholds *thresholds;
|
|
||||||
struct mem_cgroup_threshold_ary *new;
|
|
||||||
unsigned long usage;
|
|
||||||
int i, j, size, entries;
|
|
||||||
|
|
||||||
mutex_lock(&memcg->thresholds_lock);
|
|
||||||
|
|
||||||
if (type == _MEM) {
|
|
||||||
thresholds = &memcg->thresholds;
|
|
||||||
usage = mem_cgroup_usage(memcg, false);
|
|
||||||
} else if (type == _MEMSWAP) {
|
|
||||||
thresholds = &memcg->memsw_thresholds;
|
|
||||||
usage = mem_cgroup_usage(memcg, true);
|
|
||||||
} else
|
|
||||||
BUG();
|
|
||||||
|
|
||||||
if (!thresholds->primary)
|
|
||||||
goto unlock;
|
|
||||||
|
|
||||||
/* Check if a threshold crossed before removing */
|
|
||||||
__mem_cgroup_threshold(memcg, type == _MEMSWAP);
|
|
||||||
|
|
||||||
/* Calculate new number of threshold */
|
|
||||||
size = entries = 0;
|
|
||||||
for (i = 0; i < thresholds->primary->size; i++) {
|
|
||||||
if (thresholds->primary->entries[i].eventfd != eventfd)
|
|
||||||
size++;
|
|
||||||
else
|
|
||||||
entries++;
|
|
||||||
}
|
|
||||||
|
|
||||||
new = thresholds->spare;
|
|
||||||
|
|
||||||
/* If no items related to eventfd have been cleared, nothing to do */
|
|
||||||
if (!entries)
|
|
||||||
goto unlock;
|
|
||||||
|
|
||||||
/* Set thresholds array to NULL if we don't have thresholds */
|
|
||||||
if (!size) {
|
|
||||||
kfree(new);
|
|
||||||
new = NULL;
|
|
||||||
goto swap_buffers;
|
|
||||||
}
|
|
||||||
|
|
||||||
new->size = size;
|
|
||||||
|
|
||||||
/* Copy thresholds and find current threshold */
|
|
||||||
new->current_threshold = -1;
|
|
||||||
for (i = 0, j = 0; i < thresholds->primary->size; i++) {
|
|
||||||
if (thresholds->primary->entries[i].eventfd == eventfd)
|
|
||||||
continue;
|
|
||||||
|
|
||||||
new->entries[j] = thresholds->primary->entries[i];
|
|
||||||
if (new->entries[j].threshold <= usage) {
|
|
||||||
/*
|
|
||||||
* new->current_threshold will not be used
|
|
||||||
* until rcu_assign_pointer(), so it's safe to increment
|
|
||||||
* it here.
|
|
||||||
*/
|
|
||||||
++new->current_threshold;
|
|
||||||
}
|
|
||||||
j++;
|
|
||||||
}
|
|
||||||
|
|
||||||
swap_buffers:
|
|
||||||
/* Swap primary and spare array */
|
|
||||||
thresholds->spare = thresholds->primary;
|
|
||||||
|
|
||||||
rcu_assign_pointer(thresholds->primary, new);
|
|
||||||
|
|
||||||
/* To be sure that nobody uses thresholds */
|
|
||||||
synchronize_rcu();
|
|
||||||
|
|
||||||
/* If all events are unregistered, free the spare array */
|
|
||||||
if (!new) {
|
|
||||||
kfree(thresholds->spare);
|
|
||||||
thresholds->spare = NULL;
|
|
||||||
}
|
|
||||||
unlock:
|
|
||||||
mutex_unlock(&memcg->thresholds_lock);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
|
|
||||||
struct eventfd_ctx *eventfd)
|
|
||||||
{
|
|
||||||
return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
|
|
||||||
struct eventfd_ctx *eventfd)
|
|
||||||
{
|
|
||||||
return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
|
|
||||||
}
|
|
||||||
|
|
||||||
static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
|
|
||||||
struct eventfd_ctx *eventfd, const char *args)
|
|
||||||
{
|
|
||||||
struct mem_cgroup_eventfd_list *event;
|
|
||||||
|
|
||||||
event = kmalloc(sizeof(*event), GFP_KERNEL);
|
|
||||||
if (!event)
|
|
||||||
return -ENOMEM;
|
|
||||||
|
|
||||||
spin_lock(&memcg_oom_lock);
|
|
||||||
|
|
||||||
event->eventfd = eventfd;
|
|
||||||
list_add(&event->list, &memcg->oom_notify);
|
|
||||||
|
|
||||||
/* already in OOM ? */
|
|
||||||
if (memcg->under_oom)
|
|
||||||
eventfd_signal(eventfd);
|
|
||||||
spin_unlock(&memcg_oom_lock);
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
|
|
||||||
struct eventfd_ctx *eventfd)
|
|
||||||
{
|
|
||||||
struct mem_cgroup_eventfd_list *ev, *tmp;
|
|
||||||
|
|
||||||
spin_lock(&memcg_oom_lock);
|
|
||||||
|
|
||||||
list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
|
|
||||||
if (ev->eventfd == eventfd) {
|
|
||||||
list_del(&ev->list);
|
|
||||||
kfree(ev);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
spin_unlock(&memcg_oom_lock);
|
|
||||||
}
|
|
||||||
|
|
||||||
static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
|
static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
|
||||||
{
|
{
|
||||||
struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
|
struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
|
||||||
@ -4609,243 +4180,6 @@ static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
|
|||||||
|
|
||||||
#endif /* CONFIG_CGROUP_WRITEBACK */
|
#endif /* CONFIG_CGROUP_WRITEBACK */
|
||||||
|
|
||||||
/*
|
|
||||||
* DO NOT USE IN NEW FILES.
|
|
||||||
*
|
|
||||||
* "cgroup.event_control" implementation.
|
|
||||||
*
|
|
||||||
* This is way over-engineered. It tries to support fully configurable
|
|
||||||
* events for each user. Such level of flexibility is completely
|
|
||||||
* unnecessary especially in the light of the planned unified hierarchy.
|
|
||||||
*
|
|
||||||
* Please deprecate this and replace with something simpler if at all
|
|
||||||
* possible.
|
|
||||||
*/
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Unregister event and free resources.
|
|
||||||
*
|
|
||||||
* Gets called from workqueue.
|
|
||||||
*/
|
|
||||||
static void memcg_event_remove(struct work_struct *work)
|
|
||||||
{
|
|
||||||
struct mem_cgroup_event *event =
|
|
||||||
container_of(work, struct mem_cgroup_event, remove);
|
|
||||||
struct mem_cgroup *memcg = event->memcg;
|
|
||||||
|
|
||||||
remove_wait_queue(event->wqh, &event->wait);
|
|
||||||
|
|
||||||
event->unregister_event(memcg, event->eventfd);
|
|
||||||
|
|
||||||
/* Notify userspace the event is going away. */
|
|
||||||
eventfd_signal(event->eventfd);
|
|
||||||
|
|
||||||
eventfd_ctx_put(event->eventfd);
|
|
||||||
kfree(event);
|
|
||||||
css_put(&memcg->css);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Gets called on EPOLLHUP on eventfd when user closes it.
|
|
||||||
*
|
|
||||||
* Called with wqh->lock held and interrupts disabled.
|
|
||||||
*/
|
|
||||||
static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
|
|
||||||
int sync, void *key)
|
|
||||||
{
|
|
||||||
struct mem_cgroup_event *event =
|
|
||||||
container_of(wait, struct mem_cgroup_event, wait);
|
|
||||||
struct mem_cgroup *memcg = event->memcg;
|
|
||||||
__poll_t flags = key_to_poll(key);
|
|
||||||
|
|
||||||
if (flags & EPOLLHUP) {
|
|
||||||
/*
|
|
||||||
* If the event has been detached at cgroup removal, we
|
|
||||||
* can simply return knowing the other side will cleanup
|
|
||||||
* for us.
|
|
||||||
*
|
|
||||||
* We can't race against event freeing since the other
|
|
||||||
* side will require wqh->lock via remove_wait_queue(),
|
|
||||||
* which we hold.
|
|
||||||
*/
|
|
||||||
spin_lock(&memcg->event_list_lock);
|
|
||||||
if (!list_empty(&event->list)) {
|
|
||||||
list_del_init(&event->list);
|
|
||||||
/*
|
|
||||||
* We are in atomic context, but cgroup_event_remove()
|
|
||||||
* may sleep, so we have to call it in workqueue.
|
|
||||||
*/
|
|
||||||
schedule_work(&event->remove);
|
|
||||||
}
|
|
||||||
spin_unlock(&memcg->event_list_lock);
|
|
||||||
}
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void memcg_event_ptable_queue_proc(struct file *file,
|
|
||||||
wait_queue_head_t *wqh, poll_table *pt)
|
|
||||||
{
|
|
||||||
struct mem_cgroup_event *event =
|
|
||||||
container_of(pt, struct mem_cgroup_event, pt);
|
|
||||||
|
|
||||||
event->wqh = wqh;
|
|
||||||
add_wait_queue(wqh, &event->wait);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* DO NOT USE IN NEW FILES.
|
|
||||||
*
|
|
||||||
* Parse input and register new cgroup event handler.
|
|
||||||
*
|
|
||||||
* Input must be in format '<event_fd> <control_fd> <args>'.
|
|
||||||
* Interpretation of args is defined by control file implementation.
|
|
||||||
*/
|
|
||||||
static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
|
|
||||||
char *buf, size_t nbytes, loff_t off)
|
|
||||||
{
|
|
||||||
struct cgroup_subsys_state *css = of_css(of);
|
|
||||||
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
|
|
||||||
struct mem_cgroup_event *event;
|
|
||||||
struct cgroup_subsys_state *cfile_css;
|
|
||||||
unsigned int efd, cfd;
|
|
||||||
struct fd efile;
|
|
||||||
struct fd cfile;
|
|
||||||
struct dentry *cdentry;
|
|
||||||
const char *name;
|
|
||||||
char *endp;
|
|
||||||
int ret;
|
|
||||||
|
|
||||||
if (IS_ENABLED(CONFIG_PREEMPT_RT))
|
|
||||||
return -EOPNOTSUPP;
|
|
||||||
|
|
||||||
buf = strstrip(buf);
|
|
||||||
|
|
||||||
efd = simple_strtoul(buf, &endp, 10);
|
|
||||||
if (*endp != ' ')
|
|
||||||
return -EINVAL;
|
|
||||||
buf = endp + 1;
|
|
||||||
|
|
||||||
cfd = simple_strtoul(buf, &endp, 10);
|
|
||||||
if ((*endp != ' ') && (*endp != '\0'))
|
|
||||||
return -EINVAL;
|
|
||||||
buf = endp + 1;
|
|
||||||
|
|
||||||
event = kzalloc(sizeof(*event), GFP_KERNEL);
|
|
||||||
if (!event)
|
|
||||||
return -ENOMEM;
|
|
||||||
|
|
||||||
event->memcg = memcg;
|
|
||||||
INIT_LIST_HEAD(&event->list);
|
|
||||||
init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
|
|
||||||
init_waitqueue_func_entry(&event->wait, memcg_event_wake);
|
|
||||||
INIT_WORK(&event->remove, memcg_event_remove);
|
|
||||||
|
|
||||||
efile = fdget(efd);
|
|
||||||
if (!efile.file) {
|
|
||||||
ret = -EBADF;
|
|
||||||
goto out_kfree;
|
|
||||||
}
|
|
||||||
|
|
||||||
event->eventfd = eventfd_ctx_fileget(efile.file);
|
|
||||||
if (IS_ERR(event->eventfd)) {
|
|
||||||
ret = PTR_ERR(event->eventfd);
|
|
||||||
goto out_put_efile;
|
|
||||||
}
|
|
||||||
|
|
||||||
cfile = fdget(cfd);
|
|
||||||
if (!cfile.file) {
|
|
||||||
ret = -EBADF;
|
|
||||||
goto out_put_eventfd;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* the process need read permission on control file */
|
|
||||||
/* AV: shouldn't we check that it's been opened for read instead? */
|
|
||||||
ret = file_permission(cfile.file, MAY_READ);
|
|
||||||
if (ret < 0)
|
|
||||||
goto out_put_cfile;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* The control file must be a regular cgroup1 file. As a regular cgroup
|
|
||||||
* file can't be renamed, it's safe to access its name afterwards.
|
|
||||||
*/
|
|
||||||
cdentry = cfile.file->f_path.dentry;
|
|
||||||
if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
|
|
||||||
ret = -EINVAL;
|
|
||||||
goto out_put_cfile;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Determine the event callbacks and set them in @event. This used
|
|
||||||
* to be done via struct cftype but cgroup core no longer knows
|
|
||||||
* about these events. The following is crude but the whole thing
|
|
||||||
* is for compatibility anyway.
|
|
||||||
*
|
|
||||||
* DO NOT ADD NEW FILES.
|
|
||||||
*/
|
|
||||||
name = cdentry->d_name.name;
|
|
||||||
|
|
||||||
if (!strcmp(name, "memory.usage_in_bytes")) {
|
|
||||||
event->register_event = mem_cgroup_usage_register_event;
|
|
||||||
event->unregister_event = mem_cgroup_usage_unregister_event;
|
|
||||||
} else if (!strcmp(name, "memory.oom_control")) {
|
|
||||||
event->register_event = mem_cgroup_oom_register_event;
|
|
||||||
event->unregister_event = mem_cgroup_oom_unregister_event;
|
|
||||||
} else if (!strcmp(name, "memory.pressure_level")) {
|
|
||||||
event->register_event = vmpressure_register_event;
|
|
||||||
event->unregister_event = vmpressure_unregister_event;
|
|
||||||
} else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
|
|
||||||
event->register_event = memsw_cgroup_usage_register_event;
|
|
||||||
event->unregister_event = memsw_cgroup_usage_unregister_event;
|
|
||||||
} else {
|
|
||||||
ret = -EINVAL;
|
|
||||||
goto out_put_cfile;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Verify @cfile should belong to @css. Also, remaining events are
|
|
||||||
* automatically removed on cgroup destruction but the removal is
|
|
||||||
* asynchronous, so take an extra ref on @css.
|
|
||||||
*/
|
|
||||||
cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
|
|
||||||
&memory_cgrp_subsys);
|
|
||||||
ret = -EINVAL;
|
|
||||||
if (IS_ERR(cfile_css))
|
|
||||||
goto out_put_cfile;
|
|
||||||
if (cfile_css != css) {
|
|
||||||
css_put(cfile_css);
|
|
||||||
goto out_put_cfile;
|
|
||||||
}
|
|
||||||
|
|
||||||
ret = event->register_event(memcg, event->eventfd, buf);
|
|
||||||
if (ret)
|
|
||||||
goto out_put_css;
|
|
||||||
|
|
||||||
vfs_poll(efile.file, &event->pt);
|
|
||||||
|
|
||||||
spin_lock_irq(&memcg->event_list_lock);
|
|
||||||
list_add(&event->list, &memcg->event_list);
|
|
||||||
spin_unlock_irq(&memcg->event_list_lock);
|
|
||||||
|
|
||||||
fdput(cfile);
|
|
||||||
fdput(efile);
|
|
||||||
|
|
||||||
return nbytes;
|
|
||||||
|
|
||||||
out_put_css:
|
|
||||||
css_put(css);
|
|
||||||
out_put_cfile:
|
|
||||||
fdput(cfile);
|
|
||||||
out_put_eventfd:
|
|
||||||
eventfd_ctx_put(event->eventfd);
|
|
||||||
out_put_efile:
|
|
||||||
fdput(efile);
|
|
||||||
out_kfree:
|
|
||||||
kfree(event);
|
|
||||||
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_SLUB_DEBUG)
|
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_SLUB_DEBUG)
|
||||||
static int mem_cgroup_slab_show(struct seq_file *m, void *p)
|
static int mem_cgroup_slab_show(struct seq_file *m, void *p)
|
||||||
{
|
{
|
||||||
@ -5312,19 +4646,8 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
|
|||||||
static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
|
static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
|
||||||
{
|
{
|
||||||
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
|
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
|
||||||
struct mem_cgroup_event *event, *tmp;
|
|
||||||
|
|
||||||
/*
|
memcg1_css_offline(memcg);
|
||||||
* Unregister events and notify userspace.
|
|
||||||
* Notify userspace about cgroup removing only after rmdir of cgroup
|
|
||||||
* directory to avoid race between userspace and kernelspace.
|
|
||||||
*/
|
|
||||||
spin_lock_irq(&memcg->event_list_lock);
|
|
||||||
list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
|
|
||||||
list_del_init(&event->list);
|
|
||||||
schedule_work(&event->remove);
|
|
||||||
}
|
|
||||||
spin_unlock_irq(&memcg->event_list_lock);
|
|
||||||
|
|
||||||
page_counter_set_min(&memcg->memory, 0);
|
page_counter_set_min(&memcg->memory, 0);
|
||||||
page_counter_set_low(&memcg->memory, 0);
|
page_counter_set_low(&memcg->memory, 0);
|
||||||
|
Loading…
Reference in New Issue
Block a user