2012-07-06 20:25:10 +00:00
|
|
|
/*
|
|
|
|
* Slab allocator functions that are independent of the allocator strategy
|
|
|
|
*
|
|
|
|
* (C) 2012 Christoph Lameter <cl@linux.com>
|
|
|
|
*/
|
|
|
|
#include <linux/slab.h>
|
|
|
|
|
|
|
|
#include <linux/mm.h>
|
|
|
|
#include <linux/poison.h>
|
|
|
|
#include <linux/interrupt.h>
|
|
|
|
#include <linux/memory.h>
|
|
|
|
#include <linux/compiler.h>
|
|
|
|
#include <linux/module.h>
|
2012-07-06 20:25:13 +00:00
|
|
|
#include <linux/cpu.h>
|
|
|
|
#include <linux/uaccess.h>
|
2012-10-19 14:20:25 +00:00
|
|
|
#include <linux/seq_file.h>
|
|
|
|
#include <linux/proc_fs.h>
|
2012-07-06 20:25:10 +00:00
|
|
|
#include <asm/cacheflush.h>
|
|
|
|
#include <asm/tlbflush.h>
|
|
|
|
#include <asm/page.h>
|
2012-12-18 22:22:34 +00:00
|
|
|
#include <linux/memcontrol.h>
|
2014-08-06 23:04:44 +00:00
|
|
|
|
|
|
|
#define CREATE_TRACE_POINTS
|
2013-09-04 16:35:34 +00:00
|
|
|
#include <trace/events/kmem.h>
|
2012-07-06 20:25:10 +00:00
|
|
|
|
2012-07-06 20:25:11 +00:00
|
|
|
#include "slab.h"
|
|
|
|
|
|
|
|
enum slab_state slab_state;
|
2012-07-06 20:25:12 +00:00
|
|
|
LIST_HEAD(slab_caches);
|
|
|
|
DEFINE_MUTEX(slab_mutex);
|
2012-09-05 00:20:33 +00:00
|
|
|
struct kmem_cache *kmem_cache;
|
2012-07-06 20:25:11 +00:00
|
|
|
|
2014-10-09 22:26:22 +00:00
|
|
|
/*
|
|
|
|
* Set of flags that will prevent slab merging
|
|
|
|
*/
|
|
|
|
#define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
|
|
|
|
SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \
|
2016-03-25 21:21:59 +00:00
|
|
|
SLAB_FAILSLAB | SLAB_KASAN)
|
2014-10-09 22:26:22 +00:00
|
|
|
|
2016-01-14 23:18:15 +00:00
|
|
|
#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
|
|
|
|
SLAB_NOTRACK | SLAB_ACCOUNT)
|
2014-10-09 22:26:22 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Merge control. If this is set then no merging of slab caches will occur.
|
|
|
|
* (Could be removed. This was introduced to pacify the merge skeptics.)
|
|
|
|
*/
|
|
|
|
static int slab_nomerge;
|
|
|
|
|
|
|
|
static int __init setup_slab_nomerge(char *str)
|
|
|
|
{
|
|
|
|
slab_nomerge = 1;
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef CONFIG_SLUB
|
|
|
|
__setup_param("slub_nomerge", slub_nomerge, setup_slab_nomerge, 0);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
__setup("slab_nomerge", setup_slab_nomerge);
|
|
|
|
|
2014-10-09 22:26:00 +00:00
|
|
|
/*
|
|
|
|
* Determine the size of a slab object
|
|
|
|
*/
|
|
|
|
unsigned int kmem_cache_size(struct kmem_cache *s)
|
|
|
|
{
|
|
|
|
return s->object_size;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(kmem_cache_size);
|
|
|
|
|
2012-08-16 07:09:46 +00:00
|
|
|
#ifdef CONFIG_DEBUG_VM
|
2014-04-07 22:39:26 +00:00
|
|
|
static int kmem_cache_sanity_check(const char *name, size_t size)
|
2012-07-06 20:25:10 +00:00
|
|
|
{
|
|
|
|
struct kmem_cache *s = NULL;
|
|
|
|
|
|
|
|
if (!name || in_interrupt() || size < sizeof(void *) ||
|
|
|
|
size > KMALLOC_MAX_SIZE) {
|
2012-08-16 07:09:46 +00:00
|
|
|
pr_err("kmem_cache_create(%s) integrity check failed\n", name);
|
|
|
|
return -EINVAL;
|
2012-07-06 20:25:10 +00:00
|
|
|
}
|
2012-08-16 07:12:18 +00:00
|
|
|
|
2012-07-06 20:25:13 +00:00
|
|
|
list_for_each_entry(s, &slab_caches, list) {
|
|
|
|
char tmp;
|
|
|
|
int res;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This happens when the module gets unloaded and doesn't
|
|
|
|
* destroy its slab cache and no-one else reuses the vmalloc
|
|
|
|
* area of the module. Print a warning.
|
|
|
|
*/
|
|
|
|
res = probe_kernel_address(s->name, tmp);
|
|
|
|
if (res) {
|
2012-08-16 07:09:46 +00:00
|
|
|
pr_err("Slab cache with size %d has lost its name\n",
|
2012-07-06 20:25:13 +00:00
|
|
|
s->object_size);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
WARN_ON(strchr(name, ' ')); /* It confuses parsers */
|
2012-08-16 07:09:46 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
#else
|
2014-04-07 22:39:26 +00:00
|
|
|
static inline int kmem_cache_sanity_check(const char *name, size_t size)
|
2012-08-16 07:09:46 +00:00
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
2012-07-06 20:25:13 +00:00
|
|
|
#endif
|
|
|
|
|
2015-09-04 22:45:34 +00:00
|
|
|
void __kmem_cache_free_bulk(struct kmem_cache *s, size_t nr, void **p)
|
|
|
|
{
|
|
|
|
size_t i;
|
|
|
|
|
2016-03-15 21:54:00 +00:00
|
|
|
for (i = 0; i < nr; i++) {
|
|
|
|
if (s)
|
|
|
|
kmem_cache_free(s, p[i]);
|
|
|
|
else
|
|
|
|
kfree(p[i]);
|
|
|
|
}
|
2015-09-04 22:45:34 +00:00
|
|
|
}
|
|
|
|
|
2015-11-20 23:57:58 +00:00
|
|
|
int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
|
2015-09-04 22:45:34 +00:00
|
|
|
void **p)
|
|
|
|
{
|
|
|
|
size_t i;
|
|
|
|
|
|
|
|
for (i = 0; i < nr; i++) {
|
|
|
|
void *x = p[i] = kmem_cache_alloc(s, flags);
|
|
|
|
if (!x) {
|
|
|
|
__kmem_cache_free_bulk(s, i, p);
|
2015-11-20 23:57:58 +00:00
|
|
|
return 0;
|
2015-09-04 22:45:34 +00:00
|
|
|
}
|
|
|
|
}
|
2015-11-20 23:57:58 +00:00
|
|
|
return i;
|
2015-09-04 22:45:34 +00:00
|
|
|
}
|
|
|
|
|
2016-01-20 23:02:32 +00:00
|
|
|
#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
|
2015-02-12 22:59:20 +00:00
|
|
|
void slab_init_memcg_params(struct kmem_cache *s)
|
memcg: move memcg_{alloc,free}_cache_params to slab_common.c
The only reason why they live in memcontrol.c is that we get/put css
reference to the owner memory cgroup in them. However, we can do that in
memcg_{un,}register_cache. OTOH, there are several reasons to move them
to slab_common.c.
First, I think that the less public interface functions we have in
memcontrol.h the better. Since the functions I move don't depend on
memcontrol, I think it's worth making them private to slab, especially
taking into account that the arrays are defined on the slab's side too.
Second, the way how per-memcg arrays are updated looks rather awkward: it
proceeds from memcontrol.c (__memcg_activate_kmem) to slab_common.c
(memcg_update_all_caches) and back to memcontrol.c again
(memcg_update_array_size). In the following patches I move the function
relocating the arrays (memcg_update_array_size) to slab_common.c and
therefore get rid this circular call path. I think we should have the
cache allocation stuff in the same place where we have relocation, because
it's easier to follow the code then. So I move arrays alloc/free
functions to slab_common.c too.
The third point isn't obvious. I'm going to make the list_lru structure
per-memcg to allow targeted kmem reclaim. That means we will have
per-memcg arrays in list_lrus too. It turns out that it's much easier to
update these arrays in list_lru.c rather than in memcontrol.c, because all
the stuff we need is defined there. This patch makes memcg caches arrays
allocation path conform that of the upcoming list_lru.
So let's move these functions to slab_common.c and make them static.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Christoph Lameter <cl@linux.com>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Pekka Enberg <penberg@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-10-09 22:28:43 +00:00
|
|
|
{
|
2015-02-12 22:59:20 +00:00
|
|
|
s->memcg_params.is_root_cache = true;
|
2015-02-12 22:59:23 +00:00
|
|
|
INIT_LIST_HEAD(&s->memcg_params.list);
|
2015-02-12 22:59:20 +00:00
|
|
|
RCU_INIT_POINTER(s->memcg_params.memcg_caches, NULL);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int init_memcg_params(struct kmem_cache *s,
|
|
|
|
struct mem_cgroup *memcg, struct kmem_cache *root_cache)
|
|
|
|
{
|
|
|
|
struct memcg_cache_array *arr;
|
memcg: move memcg_{alloc,free}_cache_params to slab_common.c
The only reason why they live in memcontrol.c is that we get/put css
reference to the owner memory cgroup in them. However, we can do that in
memcg_{un,}register_cache. OTOH, there are several reasons to move them
to slab_common.c.
First, I think that the less public interface functions we have in
memcontrol.h the better. Since the functions I move don't depend on
memcontrol, I think it's worth making them private to slab, especially
taking into account that the arrays are defined on the slab's side too.
Second, the way how per-memcg arrays are updated looks rather awkward: it
proceeds from memcontrol.c (__memcg_activate_kmem) to slab_common.c
(memcg_update_all_caches) and back to memcontrol.c again
(memcg_update_array_size). In the following patches I move the function
relocating the arrays (memcg_update_array_size) to slab_common.c and
therefore get rid this circular call path. I think we should have the
cache allocation stuff in the same place where we have relocation, because
it's easier to follow the code then. So I move arrays alloc/free
functions to slab_common.c too.
The third point isn't obvious. I'm going to make the list_lru structure
per-memcg to allow targeted kmem reclaim. That means we will have
per-memcg arrays in list_lrus too. It turns out that it's much easier to
update these arrays in list_lru.c rather than in memcontrol.c, because all
the stuff we need is defined there. This patch makes memcg caches arrays
allocation path conform that of the upcoming list_lru.
So let's move these functions to slab_common.c and make them static.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Christoph Lameter <cl@linux.com>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Pekka Enberg <penberg@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-10-09 22:28:43 +00:00
|
|
|
|
2015-02-12 22:59:20 +00:00
|
|
|
if (memcg) {
|
|
|
|
s->memcg_params.is_root_cache = false;
|
|
|
|
s->memcg_params.memcg = memcg;
|
|
|
|
s->memcg_params.root_cache = root_cache;
|
memcg: move memcg_{alloc,free}_cache_params to slab_common.c
The only reason why they live in memcontrol.c is that we get/put css
reference to the owner memory cgroup in them. However, we can do that in
memcg_{un,}register_cache. OTOH, there are several reasons to move them
to slab_common.c.
First, I think that the less public interface functions we have in
memcontrol.h the better. Since the functions I move don't depend on
memcontrol, I think it's worth making them private to slab, especially
taking into account that the arrays are defined on the slab's side too.
Second, the way how per-memcg arrays are updated looks rather awkward: it
proceeds from memcontrol.c (__memcg_activate_kmem) to slab_common.c
(memcg_update_all_caches) and back to memcontrol.c again
(memcg_update_array_size). In the following patches I move the function
relocating the arrays (memcg_update_array_size) to slab_common.c and
therefore get rid this circular call path. I think we should have the
cache allocation stuff in the same place where we have relocation, because
it's easier to follow the code then. So I move arrays alloc/free
functions to slab_common.c too.
The third point isn't obvious. I'm going to make the list_lru structure
per-memcg to allow targeted kmem reclaim. That means we will have
per-memcg arrays in list_lrus too. It turns out that it's much easier to
update these arrays in list_lru.c rather than in memcontrol.c, because all
the stuff we need is defined there. This patch makes memcg caches arrays
allocation path conform that of the upcoming list_lru.
So let's move these functions to slab_common.c and make them static.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Christoph Lameter <cl@linux.com>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Pekka Enberg <penberg@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-10-09 22:28:43 +00:00
|
|
|
return 0;
|
2015-02-12 22:59:20 +00:00
|
|
|
}
|
memcg: move memcg_{alloc,free}_cache_params to slab_common.c
The only reason why they live in memcontrol.c is that we get/put css
reference to the owner memory cgroup in them. However, we can do that in
memcg_{un,}register_cache. OTOH, there are several reasons to move them
to slab_common.c.
First, I think that the less public interface functions we have in
memcontrol.h the better. Since the functions I move don't depend on
memcontrol, I think it's worth making them private to slab, especially
taking into account that the arrays are defined on the slab's side too.
Second, the way how per-memcg arrays are updated looks rather awkward: it
proceeds from memcontrol.c (__memcg_activate_kmem) to slab_common.c
(memcg_update_all_caches) and back to memcontrol.c again
(memcg_update_array_size). In the following patches I move the function
relocating the arrays (memcg_update_array_size) to slab_common.c and
therefore get rid this circular call path. I think we should have the
cache allocation stuff in the same place where we have relocation, because
it's easier to follow the code then. So I move arrays alloc/free
functions to slab_common.c too.
The third point isn't obvious. I'm going to make the list_lru structure
per-memcg to allow targeted kmem reclaim. That means we will have
per-memcg arrays in list_lrus too. It turns out that it's much easier to
update these arrays in list_lru.c rather than in memcontrol.c, because all
the stuff we need is defined there. This patch makes memcg caches arrays
allocation path conform that of the upcoming list_lru.
So let's move these functions to slab_common.c and make them static.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Christoph Lameter <cl@linux.com>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Pekka Enberg <penberg@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-10-09 22:28:43 +00:00
|
|
|
|
2015-02-12 22:59:20 +00:00
|
|
|
slab_init_memcg_params(s);
|
memcg: move memcg_{alloc,free}_cache_params to slab_common.c
The only reason why they live in memcontrol.c is that we get/put css
reference to the owner memory cgroup in them. However, we can do that in
memcg_{un,}register_cache. OTOH, there are several reasons to move them
to slab_common.c.
First, I think that the less public interface functions we have in
memcontrol.h the better. Since the functions I move don't depend on
memcontrol, I think it's worth making them private to slab, especially
taking into account that the arrays are defined on the slab's side too.
Second, the way how per-memcg arrays are updated looks rather awkward: it
proceeds from memcontrol.c (__memcg_activate_kmem) to slab_common.c
(memcg_update_all_caches) and back to memcontrol.c again
(memcg_update_array_size). In the following patches I move the function
relocating the arrays (memcg_update_array_size) to slab_common.c and
therefore get rid this circular call path. I think we should have the
cache allocation stuff in the same place where we have relocation, because
it's easier to follow the code then. So I move arrays alloc/free
functions to slab_common.c too.
The third point isn't obvious. I'm going to make the list_lru structure
per-memcg to allow targeted kmem reclaim. That means we will have
per-memcg arrays in list_lrus too. It turns out that it's much easier to
update these arrays in list_lru.c rather than in memcontrol.c, because all
the stuff we need is defined there. This patch makes memcg caches arrays
allocation path conform that of the upcoming list_lru.
So let's move these functions to slab_common.c and make them static.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Christoph Lameter <cl@linux.com>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Pekka Enberg <penberg@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-10-09 22:28:43 +00:00
|
|
|
|
2015-02-12 22:59:20 +00:00
|
|
|
if (!memcg_nr_cache_ids)
|
|
|
|
return 0;
|
memcg: move memcg_{alloc,free}_cache_params to slab_common.c
The only reason why they live in memcontrol.c is that we get/put css
reference to the owner memory cgroup in them. However, we can do that in
memcg_{un,}register_cache. OTOH, there are several reasons to move them
to slab_common.c.
First, I think that the less public interface functions we have in
memcontrol.h the better. Since the functions I move don't depend on
memcontrol, I think it's worth making them private to slab, especially
taking into account that the arrays are defined on the slab's side too.
Second, the way how per-memcg arrays are updated looks rather awkward: it
proceeds from memcontrol.c (__memcg_activate_kmem) to slab_common.c
(memcg_update_all_caches) and back to memcontrol.c again
(memcg_update_array_size). In the following patches I move the function
relocating the arrays (memcg_update_array_size) to slab_common.c and
therefore get rid this circular call path. I think we should have the
cache allocation stuff in the same place where we have relocation, because
it's easier to follow the code then. So I move arrays alloc/free
functions to slab_common.c too.
The third point isn't obvious. I'm going to make the list_lru structure
per-memcg to allow targeted kmem reclaim. That means we will have
per-memcg arrays in list_lrus too. It turns out that it's much easier to
update these arrays in list_lru.c rather than in memcontrol.c, because all
the stuff we need is defined there. This patch makes memcg caches arrays
allocation path conform that of the upcoming list_lru.
So let's move these functions to slab_common.c and make them static.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Christoph Lameter <cl@linux.com>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Pekka Enberg <penberg@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-10-09 22:28:43 +00:00
|
|
|
|
2015-02-12 22:59:20 +00:00
|
|
|
arr = kzalloc(sizeof(struct memcg_cache_array) +
|
|
|
|
memcg_nr_cache_ids * sizeof(void *),
|
|
|
|
GFP_KERNEL);
|
|
|
|
if (!arr)
|
|
|
|
return -ENOMEM;
|
memcg: move memcg_{alloc,free}_cache_params to slab_common.c
The only reason why they live in memcontrol.c is that we get/put css
reference to the owner memory cgroup in them. However, we can do that in
memcg_{un,}register_cache. OTOH, there are several reasons to move them
to slab_common.c.
First, I think that the less public interface functions we have in
memcontrol.h the better. Since the functions I move don't depend on
memcontrol, I think it's worth making them private to slab, especially
taking into account that the arrays are defined on the slab's side too.
Second, the way how per-memcg arrays are updated looks rather awkward: it
proceeds from memcontrol.c (__memcg_activate_kmem) to slab_common.c
(memcg_update_all_caches) and back to memcontrol.c again
(memcg_update_array_size). In the following patches I move the function
relocating the arrays (memcg_update_array_size) to slab_common.c and
therefore get rid this circular call path. I think we should have the
cache allocation stuff in the same place where we have relocation, because
it's easier to follow the code then. So I move arrays alloc/free
functions to slab_common.c too.
The third point isn't obvious. I'm going to make the list_lru structure
per-memcg to allow targeted kmem reclaim. That means we will have
per-memcg arrays in list_lrus too. It turns out that it's much easier to
update these arrays in list_lru.c rather than in memcontrol.c, because all
the stuff we need is defined there. This patch makes memcg caches arrays
allocation path conform that of the upcoming list_lru.
So let's move these functions to slab_common.c and make them static.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Christoph Lameter <cl@linux.com>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Pekka Enberg <penberg@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-10-09 22:28:43 +00:00
|
|
|
|
2015-02-12 22:59:20 +00:00
|
|
|
RCU_INIT_POINTER(s->memcg_params.memcg_caches, arr);
|
memcg: move memcg_{alloc,free}_cache_params to slab_common.c
The only reason why they live in memcontrol.c is that we get/put css
reference to the owner memory cgroup in them. However, we can do that in
memcg_{un,}register_cache. OTOH, there are several reasons to move them
to slab_common.c.
First, I think that the less public interface functions we have in
memcontrol.h the better. Since the functions I move don't depend on
memcontrol, I think it's worth making them private to slab, especially
taking into account that the arrays are defined on the slab's side too.
Second, the way how per-memcg arrays are updated looks rather awkward: it
proceeds from memcontrol.c (__memcg_activate_kmem) to slab_common.c
(memcg_update_all_caches) and back to memcontrol.c again
(memcg_update_array_size). In the following patches I move the function
relocating the arrays (memcg_update_array_size) to slab_common.c and
therefore get rid this circular call path. I think we should have the
cache allocation stuff in the same place where we have relocation, because
it's easier to follow the code then. So I move arrays alloc/free
functions to slab_common.c too.
The third point isn't obvious. I'm going to make the list_lru structure
per-memcg to allow targeted kmem reclaim. That means we will have
per-memcg arrays in list_lrus too. It turns out that it's much easier to
update these arrays in list_lru.c rather than in memcontrol.c, because all
the stuff we need is defined there. This patch makes memcg caches arrays
allocation path conform that of the upcoming list_lru.
So let's move these functions to slab_common.c and make them static.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Christoph Lameter <cl@linux.com>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Pekka Enberg <penberg@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-10-09 22:28:43 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2015-02-12 22:59:20 +00:00
|
|
|
static void destroy_memcg_params(struct kmem_cache *s)
|
memcg: move memcg_{alloc,free}_cache_params to slab_common.c
The only reason why they live in memcontrol.c is that we get/put css
reference to the owner memory cgroup in them. However, we can do that in
memcg_{un,}register_cache. OTOH, there are several reasons to move them
to slab_common.c.
First, I think that the less public interface functions we have in
memcontrol.h the better. Since the functions I move don't depend on
memcontrol, I think it's worth making them private to slab, especially
taking into account that the arrays are defined on the slab's side too.
Second, the way how per-memcg arrays are updated looks rather awkward: it
proceeds from memcontrol.c (__memcg_activate_kmem) to slab_common.c
(memcg_update_all_caches) and back to memcontrol.c again
(memcg_update_array_size). In the following patches I move the function
relocating the arrays (memcg_update_array_size) to slab_common.c and
therefore get rid this circular call path. I think we should have the
cache allocation stuff in the same place where we have relocation, because
it's easier to follow the code then. So I move arrays alloc/free
functions to slab_common.c too.
The third point isn't obvious. I'm going to make the list_lru structure
per-memcg to allow targeted kmem reclaim. That means we will have
per-memcg arrays in list_lrus too. It turns out that it's much easier to
update these arrays in list_lru.c rather than in memcontrol.c, because all
the stuff we need is defined there. This patch makes memcg caches arrays
allocation path conform that of the upcoming list_lru.
So let's move these functions to slab_common.c and make them static.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Christoph Lameter <cl@linux.com>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Pekka Enberg <penberg@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-10-09 22:28:43 +00:00
|
|
|
{
|
2015-02-12 22:59:20 +00:00
|
|
|
if (is_root_cache(s))
|
|
|
|
kfree(rcu_access_pointer(s->memcg_params.memcg_caches));
|
memcg: move memcg_{alloc,free}_cache_params to slab_common.c
The only reason why they live in memcontrol.c is that we get/put css
reference to the owner memory cgroup in them. However, we can do that in
memcg_{un,}register_cache. OTOH, there are several reasons to move them
to slab_common.c.
First, I think that the less public interface functions we have in
memcontrol.h the better. Since the functions I move don't depend on
memcontrol, I think it's worth making them private to slab, especially
taking into account that the arrays are defined on the slab's side too.
Second, the way how per-memcg arrays are updated looks rather awkward: it
proceeds from memcontrol.c (__memcg_activate_kmem) to slab_common.c
(memcg_update_all_caches) and back to memcontrol.c again
(memcg_update_array_size). In the following patches I move the function
relocating the arrays (memcg_update_array_size) to slab_common.c and
therefore get rid this circular call path. I think we should have the
cache allocation stuff in the same place where we have relocation, because
it's easier to follow the code then. So I move arrays alloc/free
functions to slab_common.c too.
The third point isn't obvious. I'm going to make the list_lru structure
per-memcg to allow targeted kmem reclaim. That means we will have
per-memcg arrays in list_lrus too. It turns out that it's much easier to
update these arrays in list_lru.c rather than in memcontrol.c, because all
the stuff we need is defined there. This patch makes memcg caches arrays
allocation path conform that of the upcoming list_lru.
So let's move these functions to slab_common.c and make them static.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Christoph Lameter <cl@linux.com>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Pekka Enberg <penberg@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-10-09 22:28:43 +00:00
|
|
|
}
|
|
|
|
|
2015-02-12 22:59:20 +00:00
|
|
|
static int update_memcg_params(struct kmem_cache *s, int new_array_size)
|
2014-10-09 22:28:47 +00:00
|
|
|
{
|
2015-02-12 22:59:20 +00:00
|
|
|
struct memcg_cache_array *old, *new;
|
2014-10-09 22:28:47 +00:00
|
|
|
|
2015-02-12 22:59:20 +00:00
|
|
|
if (!is_root_cache(s))
|
|
|
|
return 0;
|
2014-10-09 22:28:47 +00:00
|
|
|
|
2015-02-12 22:59:20 +00:00
|
|
|
new = kzalloc(sizeof(struct memcg_cache_array) +
|
|
|
|
new_array_size * sizeof(void *), GFP_KERNEL);
|
|
|
|
if (!new)
|
2014-10-09 22:28:47 +00:00
|
|
|
return -ENOMEM;
|
|
|
|
|
2015-02-12 22:59:20 +00:00
|
|
|
old = rcu_dereference_protected(s->memcg_params.memcg_caches,
|
|
|
|
lockdep_is_held(&slab_mutex));
|
|
|
|
if (old)
|
|
|
|
memcpy(new->entries, old->entries,
|
|
|
|
memcg_nr_cache_ids * sizeof(void *));
|
2014-10-09 22:28:47 +00:00
|
|
|
|
2015-02-12 22:59:20 +00:00
|
|
|
rcu_assign_pointer(s->memcg_params.memcg_caches, new);
|
|
|
|
if (old)
|
|
|
|
kfree_rcu(old, rcu);
|
2014-10-09 22:28:47 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
memcg: allocate memory for memcg caches whenever a new memcg appears
Every cache that is considered a root cache (basically the "original"
caches, tied to the root memcg/no-memcg) will have an array that should be
large enough to store a cache pointer per each memcg in the system.
Theoreticaly, this is as high as 1 << sizeof(css_id), which is currently
in the 64k pointers range. Most of the time, we won't be using that much.
What goes in this patch, is a simple scheme to dynamically allocate such
an array, in order to minimize memory usage for memcg caches. Because we
would also like to avoid allocations all the time, at least for now, the
array will only grow. It will tend to be big enough to hold the maximum
number of kmem-limited memcgs ever achieved.
We'll allocate it to be a minimum of 64 kmem-limited memcgs. When we have
more than that, we'll start doubling the size of this array every time the
limit is reached.
Because we are only considering kmem limited memcgs, a natural point for
this to happen is when we write to the limit. At that point, we already
have set_limit_mutex held, so that will become our natural synchronization
mechanism.
Signed-off-by: Glauber Costa <glommer@parallels.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Frederic Weisbecker <fweisbec@redhat.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: JoonSoo Kim <js1304@gmail.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Rik van Riel <riel@redhat.com>
Cc: Suleiman Souhlal <suleiman@google.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-12-18 22:22:38 +00:00
|
|
|
int memcg_update_all_caches(int num_memcgs)
|
|
|
|
{
|
|
|
|
struct kmem_cache *s;
|
|
|
|
int ret = 0;
|
|
|
|
|
2015-02-12 22:59:01 +00:00
|
|
|
mutex_lock(&slab_mutex);
|
memcg: allocate memory for memcg caches whenever a new memcg appears
Every cache that is considered a root cache (basically the "original"
caches, tied to the root memcg/no-memcg) will have an array that should be
large enough to store a cache pointer per each memcg in the system.
Theoreticaly, this is as high as 1 << sizeof(css_id), which is currently
in the 64k pointers range. Most of the time, we won't be using that much.
What goes in this patch, is a simple scheme to dynamically allocate such
an array, in order to minimize memory usage for memcg caches. Because we
would also like to avoid allocations all the time, at least for now, the
array will only grow. It will tend to be big enough to hold the maximum
number of kmem-limited memcgs ever achieved.
We'll allocate it to be a minimum of 64 kmem-limited memcgs. When we have
more than that, we'll start doubling the size of this array every time the
limit is reached.
Because we are only considering kmem limited memcgs, a natural point for
this to happen is when we write to the limit. At that point, we already
have set_limit_mutex held, so that will become our natural synchronization
mechanism.
Signed-off-by: Glauber Costa <glommer@parallels.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Frederic Weisbecker <fweisbec@redhat.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: JoonSoo Kim <js1304@gmail.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Rik van Riel <riel@redhat.com>
Cc: Suleiman Souhlal <suleiman@google.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-12-18 22:22:38 +00:00
|
|
|
list_for_each_entry(s, &slab_caches, list) {
|
2015-02-12 22:59:20 +00:00
|
|
|
ret = update_memcg_params(s, num_memcgs);
|
memcg: allocate memory for memcg caches whenever a new memcg appears
Every cache that is considered a root cache (basically the "original"
caches, tied to the root memcg/no-memcg) will have an array that should be
large enough to store a cache pointer per each memcg in the system.
Theoreticaly, this is as high as 1 << sizeof(css_id), which is currently
in the 64k pointers range. Most of the time, we won't be using that much.
What goes in this patch, is a simple scheme to dynamically allocate such
an array, in order to minimize memory usage for memcg caches. Because we
would also like to avoid allocations all the time, at least for now, the
array will only grow. It will tend to be big enough to hold the maximum
number of kmem-limited memcgs ever achieved.
We'll allocate it to be a minimum of 64 kmem-limited memcgs. When we have
more than that, we'll start doubling the size of this array every time the
limit is reached.
Because we are only considering kmem limited memcgs, a natural point for
this to happen is when we write to the limit. At that point, we already
have set_limit_mutex held, so that will become our natural synchronization
mechanism.
Signed-off-by: Glauber Costa <glommer@parallels.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Frederic Weisbecker <fweisbec@redhat.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: JoonSoo Kim <js1304@gmail.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Rik van Riel <riel@redhat.com>
Cc: Suleiman Souhlal <suleiman@google.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-12-18 22:22:38 +00:00
|
|
|
/*
|
|
|
|
* Instead of freeing the memory, we'll just leave the caches
|
|
|
|
* up to this point in an updated state.
|
|
|
|
*/
|
|
|
|
if (ret)
|
2015-02-12 22:59:01 +00:00
|
|
|
break;
|
memcg: allocate memory for memcg caches whenever a new memcg appears
Every cache that is considered a root cache (basically the "original"
caches, tied to the root memcg/no-memcg) will have an array that should be
large enough to store a cache pointer per each memcg in the system.
Theoreticaly, this is as high as 1 << sizeof(css_id), which is currently
in the 64k pointers range. Most of the time, we won't be using that much.
What goes in this patch, is a simple scheme to dynamically allocate such
an array, in order to minimize memory usage for memcg caches. Because we
would also like to avoid allocations all the time, at least for now, the
array will only grow. It will tend to be big enough to hold the maximum
number of kmem-limited memcgs ever achieved.
We'll allocate it to be a minimum of 64 kmem-limited memcgs. When we have
more than that, we'll start doubling the size of this array every time the
limit is reached.
Because we are only considering kmem limited memcgs, a natural point for
this to happen is when we write to the limit. At that point, we already
have set_limit_mutex held, so that will become our natural synchronization
mechanism.
Signed-off-by: Glauber Costa <glommer@parallels.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Frederic Weisbecker <fweisbec@redhat.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: JoonSoo Kim <js1304@gmail.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Rik van Riel <riel@redhat.com>
Cc: Suleiman Souhlal <suleiman@google.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-12-18 22:22:38 +00:00
|
|
|
}
|
|
|
|
mutex_unlock(&slab_mutex);
|
|
|
|
return ret;
|
|
|
|
}
|
memcg: move memcg_{alloc,free}_cache_params to slab_common.c
The only reason why they live in memcontrol.c is that we get/put css
reference to the owner memory cgroup in them. However, we can do that in
memcg_{un,}register_cache. OTOH, there are several reasons to move them
to slab_common.c.
First, I think that the less public interface functions we have in
memcontrol.h the better. Since the functions I move don't depend on
memcontrol, I think it's worth making them private to slab, especially
taking into account that the arrays are defined on the slab's side too.
Second, the way how per-memcg arrays are updated looks rather awkward: it
proceeds from memcontrol.c (__memcg_activate_kmem) to slab_common.c
(memcg_update_all_caches) and back to memcontrol.c again
(memcg_update_array_size). In the following patches I move the function
relocating the arrays (memcg_update_array_size) to slab_common.c and
therefore get rid this circular call path. I think we should have the
cache allocation stuff in the same place where we have relocation, because
it's easier to follow the code then. So I move arrays alloc/free
functions to slab_common.c too.
The third point isn't obvious. I'm going to make the list_lru structure
per-memcg to allow targeted kmem reclaim. That means we will have
per-memcg arrays in list_lrus too. It turns out that it's much easier to
update these arrays in list_lru.c rather than in memcontrol.c, because all
the stuff we need is defined there. This patch makes memcg caches arrays
allocation path conform that of the upcoming list_lru.
So let's move these functions to slab_common.c and make them static.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Christoph Lameter <cl@linux.com>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Pekka Enberg <penberg@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-10-09 22:28:43 +00:00
|
|
|
#else
|
2015-02-12 22:59:20 +00:00
|
|
|
static inline int init_memcg_params(struct kmem_cache *s,
|
|
|
|
struct mem_cgroup *memcg, struct kmem_cache *root_cache)
|
memcg: move memcg_{alloc,free}_cache_params to slab_common.c
The only reason why they live in memcontrol.c is that we get/put css
reference to the owner memory cgroup in them. However, we can do that in
memcg_{un,}register_cache. OTOH, there are several reasons to move them
to slab_common.c.
First, I think that the less public interface functions we have in
memcontrol.h the better. Since the functions I move don't depend on
memcontrol, I think it's worth making them private to slab, especially
taking into account that the arrays are defined on the slab's side too.
Second, the way how per-memcg arrays are updated looks rather awkward: it
proceeds from memcontrol.c (__memcg_activate_kmem) to slab_common.c
(memcg_update_all_caches) and back to memcontrol.c again
(memcg_update_array_size). In the following patches I move the function
relocating the arrays (memcg_update_array_size) to slab_common.c and
therefore get rid this circular call path. I think we should have the
cache allocation stuff in the same place where we have relocation, because
it's easier to follow the code then. So I move arrays alloc/free
functions to slab_common.c too.
The third point isn't obvious. I'm going to make the list_lru structure
per-memcg to allow targeted kmem reclaim. That means we will have
per-memcg arrays in list_lrus too. It turns out that it's much easier to
update these arrays in list_lru.c rather than in memcontrol.c, because all
the stuff we need is defined there. This patch makes memcg caches arrays
allocation path conform that of the upcoming list_lru.
So let's move these functions to slab_common.c and make them static.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Christoph Lameter <cl@linux.com>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Pekka Enberg <penberg@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-10-09 22:28:43 +00:00
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2015-02-12 22:59:20 +00:00
|
|
|
static inline void destroy_memcg_params(struct kmem_cache *s)
|
memcg: move memcg_{alloc,free}_cache_params to slab_common.c
The only reason why they live in memcontrol.c is that we get/put css
reference to the owner memory cgroup in them. However, we can do that in
memcg_{un,}register_cache. OTOH, there are several reasons to move them
to slab_common.c.
First, I think that the less public interface functions we have in
memcontrol.h the better. Since the functions I move don't depend on
memcontrol, I think it's worth making them private to slab, especially
taking into account that the arrays are defined on the slab's side too.
Second, the way how per-memcg arrays are updated looks rather awkward: it
proceeds from memcontrol.c (__memcg_activate_kmem) to slab_common.c
(memcg_update_all_caches) and back to memcontrol.c again
(memcg_update_array_size). In the following patches I move the function
relocating the arrays (memcg_update_array_size) to slab_common.c and
therefore get rid this circular call path. I think we should have the
cache allocation stuff in the same place where we have relocation, because
it's easier to follow the code then. So I move arrays alloc/free
functions to slab_common.c too.
The third point isn't obvious. I'm going to make the list_lru structure
per-memcg to allow targeted kmem reclaim. That means we will have
per-memcg arrays in list_lrus too. It turns out that it's much easier to
update these arrays in list_lru.c rather than in memcontrol.c, because all
the stuff we need is defined there. This patch makes memcg caches arrays
allocation path conform that of the upcoming list_lru.
So let's move these functions to slab_common.c and make them static.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Christoph Lameter <cl@linux.com>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Pekka Enberg <penberg@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-10-09 22:28:43 +00:00
|
|
|
{
|
|
|
|
}
|
2016-01-20 23:02:32 +00:00
|
|
|
#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
|
memcg: allocate memory for memcg caches whenever a new memcg appears
Every cache that is considered a root cache (basically the "original"
caches, tied to the root memcg/no-memcg) will have an array that should be
large enough to store a cache pointer per each memcg in the system.
Theoreticaly, this is as high as 1 << sizeof(css_id), which is currently
in the 64k pointers range. Most of the time, we won't be using that much.
What goes in this patch, is a simple scheme to dynamically allocate such
an array, in order to minimize memory usage for memcg caches. Because we
would also like to avoid allocations all the time, at least for now, the
array will only grow. It will tend to be big enough to hold the maximum
number of kmem-limited memcgs ever achieved.
We'll allocate it to be a minimum of 64 kmem-limited memcgs. When we have
more than that, we'll start doubling the size of this array every time the
limit is reached.
Because we are only considering kmem limited memcgs, a natural point for
this to happen is when we write to the limit. At that point, we already
have set_limit_mutex held, so that will become our natural synchronization
mechanism.
Signed-off-by: Glauber Costa <glommer@parallels.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Frederic Weisbecker <fweisbec@redhat.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: JoonSoo Kim <js1304@gmail.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Rik van Riel <riel@redhat.com>
Cc: Suleiman Souhlal <suleiman@google.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-12-18 22:22:38 +00:00
|
|
|
|
2014-10-09 22:26:22 +00:00
|
|
|
/*
|
|
|
|
* Find a mergeable slab cache
|
|
|
|
*/
|
|
|
|
int slab_unmergeable(struct kmem_cache *s)
|
|
|
|
{
|
|
|
|
if (slab_nomerge || (s->flags & SLAB_NEVER_MERGE))
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
if (!is_root_cache(s))
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
if (s->ctor)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We may have set a slab to be unmergeable during bootstrap.
|
|
|
|
*/
|
|
|
|
if (s->refcount < 0)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
struct kmem_cache *find_mergeable(size_t size, size_t align,
|
|
|
|
unsigned long flags, const char *name, void (*ctor)(void *))
|
|
|
|
{
|
|
|
|
struct kmem_cache *s;
|
|
|
|
|
2017-02-22 23:40:59 +00:00
|
|
|
if (slab_nomerge)
|
2014-10-09 22:26:22 +00:00
|
|
|
return NULL;
|
|
|
|
|
|
|
|
if (ctor)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
size = ALIGN(size, sizeof(void *));
|
|
|
|
align = calculate_alignment(flags, align, size);
|
|
|
|
size = ALIGN(size, align);
|
|
|
|
flags = kmem_cache_flags(size, flags, name, NULL);
|
|
|
|
|
2017-02-22 23:40:59 +00:00
|
|
|
if (flags & SLAB_NEVER_MERGE)
|
|
|
|
return NULL;
|
|
|
|
|
2014-12-10 23:42:18 +00:00
|
|
|
list_for_each_entry_reverse(s, &slab_caches, list) {
|
2014-10-09 22:26:22 +00:00
|
|
|
if (slab_unmergeable(s))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (size > s->size)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if ((flags & SLAB_MERGE_SAME) != (s->flags & SLAB_MERGE_SAME))
|
|
|
|
continue;
|
|
|
|
/*
|
|
|
|
* Check if alignment is compatible.
|
|
|
|
* Courtesy of Adrian Drzewiecki
|
|
|
|
*/
|
|
|
|
if ((s->size & ~(align - 1)) != s->size)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (s->size - size >= sizeof(void *))
|
|
|
|
continue;
|
|
|
|
|
mm/slab: fix unalignment problem on Malta with EVA due to slab merge
Unlike SLUB, sometimes, object isn't started at the beginning of the
slab in SLAB. This causes the unalignment problem after slab merging is
supported by commit 12220dea07f1 ("mm/slab: support slab merge").
Following is the report from Markos that fail to boot on Malta with EVA.
Calibrating delay loop... 19.86 BogoMIPS (lpj=99328)
pid_max: default: 32768 minimum: 301
Mount-cache hash table entries: 4096 (order: 0, 16384 bytes)
Mountpoint-cache hash table entries: 4096 (order: 0, 16384 bytes)
Kernel bug detected[#1]:
CPU: 0 PID: 1 Comm: swapper/0 Not tainted 3.17.0-05639-g12220dea07f1 #1631
task: 1f04f5d8 ti: 1f050000 task.ti: 1f050000
epc : 80141190 alloc_unbound_pwq+0x234/0x304
Not tainted
ra : 80141184 alloc_unbound_pwq+0x228/0x304
Process swapper/0 (pid: 1, threadinfo=1f050000, task=1f04f5d8, tls=00000000)
Call Trace:
alloc_unbound_pwq+0x234/0x304
apply_workqueue_attrs+0x11c/0x294
__alloc_workqueue_key+0x23c/0x470
init_workqueues+0x320/0x400
do_one_initcall+0xe8/0x23c
kernel_init_freeable+0x9c/0x224
kernel_init+0x10/0x100
ret_from_kernel_thread+0x14/0x1c
[ end trace cb88537fdc8fa200 ]
Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b
alloc_unbound_pwq() allocates slab object from pool_workqueue. This
kmem_cache requires 256 bytes alignment, but, current merging code
doesn't honor that, and merge it with kmalloc-256. kmalloc-256 requires
only cacheline size alignment so that above failure occurs. However, in
x86, kmalloc-256 is luckily aligned in 256 bytes, so the problem didn't
happen on it.
To fix this problem, this patch introduces alignment mismatch check in
find_mergeable(). This will fix the problem.
Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Reported-by: Markos Chandras <Markos.Chandras@imgtec.com>
Tested-by: Markos Chandras <Markos.Chandras@imgtec.com>
Acked-by: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-11-13 23:19:25 +00:00
|
|
|
if (IS_ENABLED(CONFIG_SLAB) && align &&
|
|
|
|
(align > s->align || s->align % align))
|
|
|
|
continue;
|
|
|
|
|
2014-10-09 22:26:22 +00:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2012-11-28 16:23:16 +00:00
|
|
|
/*
|
|
|
|
* Figure out what the alignment of the objects will be given a set of
|
|
|
|
* flags, a user specified alignment and the size of the objects.
|
|
|
|
*/
|
|
|
|
unsigned long calculate_alignment(unsigned long flags,
|
|
|
|
unsigned long align, unsigned long size)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* If the user wants hardware cache aligned objects then follow that
|
|
|
|
* suggestion if the object is sufficiently large.
|
|
|
|
*
|
|
|
|
* The hardware cache alignment cannot override the specified
|
|
|
|
* alignment though. If that is greater then use it.
|
|
|
|
*/
|
|
|
|
if (flags & SLAB_HWCACHE_ALIGN) {
|
|
|
|
unsigned long ralign = cache_line_size();
|
|
|
|
while (size <= ralign / 2)
|
|
|
|
ralign /= 2;
|
|
|
|
align = max(align, ralign);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (align < ARCH_SLAB_MINALIGN)
|
|
|
|
align = ARCH_SLAB_MINALIGN;
|
|
|
|
|
|
|
|
return ALIGN(align, sizeof(void *));
|
|
|
|
}
|
|
|
|
|
2015-11-06 02:45:08 +00:00
|
|
|
static struct kmem_cache *create_cache(const char *name,
|
|
|
|
size_t object_size, size_t size, size_t align,
|
|
|
|
unsigned long flags, void (*ctor)(void *),
|
|
|
|
struct mem_cgroup *memcg, struct kmem_cache *root_cache)
|
2014-04-07 22:39:26 +00:00
|
|
|
{
|
|
|
|
struct kmem_cache *s;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
err = -ENOMEM;
|
|
|
|
s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
|
|
|
|
if (!s)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
s->name = name;
|
|
|
|
s->object_size = object_size;
|
|
|
|
s->size = size;
|
|
|
|
s->align = align;
|
|
|
|
s->ctor = ctor;
|
|
|
|
|
2015-02-12 22:59:20 +00:00
|
|
|
err = init_memcg_params(s, memcg, root_cache);
|
2014-04-07 22:39:26 +00:00
|
|
|
if (err)
|
|
|
|
goto out_free_cache;
|
|
|
|
|
|
|
|
err = __kmem_cache_create(s, flags);
|
|
|
|
if (err)
|
|
|
|
goto out_free_cache;
|
|
|
|
|
|
|
|
s->refcount = 1;
|
|
|
|
list_add(&s->list, &slab_caches);
|
|
|
|
out:
|
|
|
|
if (err)
|
|
|
|
return ERR_PTR(err);
|
|
|
|
return s;
|
|
|
|
|
|
|
|
out_free_cache:
|
2015-02-12 22:59:20 +00:00
|
|
|
destroy_memcg_params(s);
|
2015-02-10 22:09:40 +00:00
|
|
|
kmem_cache_free(kmem_cache, s);
|
2014-04-07 22:39:26 +00:00
|
|
|
goto out;
|
|
|
|
}
|
2012-11-28 16:23:16 +00:00
|
|
|
|
2012-08-16 07:09:46 +00:00
|
|
|
/*
|
|
|
|
* kmem_cache_create - Create a cache.
|
|
|
|
* @name: A string which is used in /proc/slabinfo to identify this cache.
|
|
|
|
* @size: The size of objects to be created in this cache.
|
|
|
|
* @align: The required alignment for the objects.
|
|
|
|
* @flags: SLAB flags
|
|
|
|
* @ctor: A constructor for the objects.
|
|
|
|
*
|
|
|
|
* Returns a ptr to the cache on success, NULL on failure.
|
|
|
|
* Cannot be called within a interrupt, but can be interrupted.
|
|
|
|
* The @ctor is run when new pages are allocated by the cache.
|
|
|
|
*
|
|
|
|
* The flags are
|
|
|
|
*
|
|
|
|
* %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
|
|
|
|
* to catch references to uninitialised memory.
|
|
|
|
*
|
|
|
|
* %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
|
|
|
|
* for buffer overruns.
|
|
|
|
*
|
|
|
|
* %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
|
|
|
|
* cacheline. This can be beneficial if you're counting cycles as closely
|
|
|
|
* as davem.
|
|
|
|
*/
|
2012-12-18 22:22:34 +00:00
|
|
|
struct kmem_cache *
|
2014-04-07 22:39:26 +00:00
|
|
|
kmem_cache_create(const char *name, size_t size, size_t align,
|
|
|
|
unsigned long flags, void (*ctor)(void *))
|
2012-08-16 07:09:46 +00:00
|
|
|
{
|
2015-11-06 02:45:43 +00:00
|
|
|
struct kmem_cache *s = NULL;
|
2015-02-13 22:36:38 +00:00
|
|
|
const char *cache_name;
|
2014-01-23 23:52:55 +00:00
|
|
|
int err;
|
2012-07-06 20:25:10 +00:00
|
|
|
|
2012-08-16 07:09:46 +00:00
|
|
|
get_online_cpus();
|
slab: get_online_mems for kmem_cache_{create,destroy,shrink}
When we create a sl[au]b cache, we allocate kmem_cache_node structures
for each online NUMA node. To handle nodes taken online/offline, we
register memory hotplug notifier and allocate/free kmem_cache_node
corresponding to the node that changes its state for each kmem cache.
To synchronize between the two paths we hold the slab_mutex during both
the cache creationg/destruction path and while tuning per-node parts of
kmem caches in memory hotplug handler, but that's not quite right,
because it does not guarantee that a newly created cache will have all
kmem_cache_nodes initialized in case it races with memory hotplug. For
instance, in case of slub:
CPU0 CPU1
---- ----
kmem_cache_create: online_pages:
__kmem_cache_create: slab_memory_callback:
slab_mem_going_online_callback:
lock slab_mutex
for each slab_caches list entry
allocate kmem_cache node
unlock slab_mutex
lock slab_mutex
init_kmem_cache_nodes:
for_each_node_state(node, N_NORMAL_MEMORY)
allocate kmem_cache node
add kmem_cache to slab_caches list
unlock slab_mutex
online_pages (continued):
node_states_set_node
As a result we'll get a kmem cache with not all kmem_cache_nodes
allocated.
To avoid issues like that we should hold get/put_online_mems() during
the whole kmem cache creation/destruction/shrink paths, just like we
deal with cpu hotplug. This patch does the trick.
Note, that after it's applied, there is no need in taking the slab_mutex
for kmem_cache_shrink any more, so it is removed from there.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Tang Chen <tangchen@cn.fujitsu.com>
Cc: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Cc: Toshi Kani <toshi.kani@hp.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Jiang Liu <liuj97@gmail.com>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-06-04 23:07:20 +00:00
|
|
|
get_online_mems();
|
2015-02-12 22:59:01 +00:00
|
|
|
memcg_get_cache_ids();
|
slab: get_online_mems for kmem_cache_{create,destroy,shrink}
When we create a sl[au]b cache, we allocate kmem_cache_node structures
for each online NUMA node. To handle nodes taken online/offline, we
register memory hotplug notifier and allocate/free kmem_cache_node
corresponding to the node that changes its state for each kmem cache.
To synchronize between the two paths we hold the slab_mutex during both
the cache creationg/destruction path and while tuning per-node parts of
kmem caches in memory hotplug handler, but that's not quite right,
because it does not guarantee that a newly created cache will have all
kmem_cache_nodes initialized in case it races with memory hotplug. For
instance, in case of slub:
CPU0 CPU1
---- ----
kmem_cache_create: online_pages:
__kmem_cache_create: slab_memory_callback:
slab_mem_going_online_callback:
lock slab_mutex
for each slab_caches list entry
allocate kmem_cache node
unlock slab_mutex
lock slab_mutex
init_kmem_cache_nodes:
for_each_node_state(node, N_NORMAL_MEMORY)
allocate kmem_cache node
add kmem_cache to slab_caches list
unlock slab_mutex
online_pages (continued):
node_states_set_node
As a result we'll get a kmem cache with not all kmem_cache_nodes
allocated.
To avoid issues like that we should hold get/put_online_mems() during
the whole kmem cache creation/destruction/shrink paths, just like we
deal with cpu hotplug. This patch does the trick.
Note, that after it's applied, there is no need in taking the slab_mutex
for kmem_cache_shrink any more, so it is removed from there.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Tang Chen <tangchen@cn.fujitsu.com>
Cc: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Cc: Toshi Kani <toshi.kani@hp.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Jiang Liu <liuj97@gmail.com>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-06-04 23:07:20 +00:00
|
|
|
|
2012-08-16 07:09:46 +00:00
|
|
|
mutex_lock(&slab_mutex);
|
2012-09-05 00:20:33 +00:00
|
|
|
|
2014-04-07 22:39:26 +00:00
|
|
|
err = kmem_cache_sanity_check(name, size);
|
2014-10-09 22:25:58 +00:00
|
|
|
if (err) {
|
2014-01-23 23:52:55 +00:00
|
|
|
goto out_unlock;
|
2014-10-09 22:25:58 +00:00
|
|
|
}
|
2012-09-05 00:20:33 +00:00
|
|
|
|
2016-12-13 00:41:38 +00:00
|
|
|
/* Refuse requests with allocator specific flags */
|
|
|
|
if (flags & ~SLAB_FLAGS_PERMITTED) {
|
|
|
|
err = -EINVAL;
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
|
|
|
|
2012-10-17 11:36:51 +00:00
|
|
|
/*
|
|
|
|
* Some allocators will constraint the set of valid flags to a subset
|
|
|
|
* of all flags. We expect them to define CACHE_CREATE_MASK in this
|
|
|
|
* case, and we'll just provide them with a sanitized version of the
|
|
|
|
* passed flags.
|
|
|
|
*/
|
|
|
|
flags &= CACHE_CREATE_MASK;
|
2012-09-05 00:20:33 +00:00
|
|
|
|
2014-04-07 22:39:26 +00:00
|
|
|
s = __kmem_cache_alias(name, size, align, flags, ctor);
|
|
|
|
if (s)
|
2014-01-23 23:52:55 +00:00
|
|
|
goto out_unlock;
|
2012-12-18 22:22:34 +00:00
|
|
|
|
2015-02-13 22:36:38 +00:00
|
|
|
cache_name = kstrdup_const(name, GFP_KERNEL);
|
2014-04-07 22:39:26 +00:00
|
|
|
if (!cache_name) {
|
|
|
|
err = -ENOMEM;
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
2012-09-04 23:38:33 +00:00
|
|
|
|
2015-11-06 02:45:08 +00:00
|
|
|
s = create_cache(cache_name, size, size,
|
|
|
|
calculate_alignment(flags, align, size),
|
|
|
|
flags, ctor, NULL, NULL);
|
2014-04-07 22:39:26 +00:00
|
|
|
if (IS_ERR(s)) {
|
|
|
|
err = PTR_ERR(s);
|
2015-02-13 22:36:38 +00:00
|
|
|
kfree_const(cache_name);
|
2014-04-07 22:39:26 +00:00
|
|
|
}
|
2014-01-23 23:52:55 +00:00
|
|
|
|
|
|
|
out_unlock:
|
2012-07-06 20:25:13 +00:00
|
|
|
mutex_unlock(&slab_mutex);
|
slab: get_online_mems for kmem_cache_{create,destroy,shrink}
When we create a sl[au]b cache, we allocate kmem_cache_node structures
for each online NUMA node. To handle nodes taken online/offline, we
register memory hotplug notifier and allocate/free kmem_cache_node
corresponding to the node that changes its state for each kmem cache.
To synchronize between the two paths we hold the slab_mutex during both
the cache creationg/destruction path and while tuning per-node parts of
kmem caches in memory hotplug handler, but that's not quite right,
because it does not guarantee that a newly created cache will have all
kmem_cache_nodes initialized in case it races with memory hotplug. For
instance, in case of slub:
CPU0 CPU1
---- ----
kmem_cache_create: online_pages:
__kmem_cache_create: slab_memory_callback:
slab_mem_going_online_callback:
lock slab_mutex
for each slab_caches list entry
allocate kmem_cache node
unlock slab_mutex
lock slab_mutex
init_kmem_cache_nodes:
for_each_node_state(node, N_NORMAL_MEMORY)
allocate kmem_cache node
add kmem_cache to slab_caches list
unlock slab_mutex
online_pages (continued):
node_states_set_node
As a result we'll get a kmem cache with not all kmem_cache_nodes
allocated.
To avoid issues like that we should hold get/put_online_mems() during
the whole kmem cache creation/destruction/shrink paths, just like we
deal with cpu hotplug. This patch does the trick.
Note, that after it's applied, there is no need in taking the slab_mutex
for kmem_cache_shrink any more, so it is removed from there.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Tang Chen <tangchen@cn.fujitsu.com>
Cc: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Cc: Toshi Kani <toshi.kani@hp.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Jiang Liu <liuj97@gmail.com>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-06-04 23:07:20 +00:00
|
|
|
|
2015-02-12 22:59:01 +00:00
|
|
|
memcg_put_cache_ids();
|
slab: get_online_mems for kmem_cache_{create,destroy,shrink}
When we create a sl[au]b cache, we allocate kmem_cache_node structures
for each online NUMA node. To handle nodes taken online/offline, we
register memory hotplug notifier and allocate/free kmem_cache_node
corresponding to the node that changes its state for each kmem cache.
To synchronize between the two paths we hold the slab_mutex during both
the cache creationg/destruction path and while tuning per-node parts of
kmem caches in memory hotplug handler, but that's not quite right,
because it does not guarantee that a newly created cache will have all
kmem_cache_nodes initialized in case it races with memory hotplug. For
instance, in case of slub:
CPU0 CPU1
---- ----
kmem_cache_create: online_pages:
__kmem_cache_create: slab_memory_callback:
slab_mem_going_online_callback:
lock slab_mutex
for each slab_caches list entry
allocate kmem_cache node
unlock slab_mutex
lock slab_mutex
init_kmem_cache_nodes:
for_each_node_state(node, N_NORMAL_MEMORY)
allocate kmem_cache node
add kmem_cache to slab_caches list
unlock slab_mutex
online_pages (continued):
node_states_set_node
As a result we'll get a kmem cache with not all kmem_cache_nodes
allocated.
To avoid issues like that we should hold get/put_online_mems() during
the whole kmem cache creation/destruction/shrink paths, just like we
deal with cpu hotplug. This patch does the trick.
Note, that after it's applied, there is no need in taking the slab_mutex
for kmem_cache_shrink any more, so it is removed from there.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Tang Chen <tangchen@cn.fujitsu.com>
Cc: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Cc: Toshi Kani <toshi.kani@hp.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Jiang Liu <liuj97@gmail.com>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-06-04 23:07:20 +00:00
|
|
|
put_online_mems();
|
2012-07-06 20:25:13 +00:00
|
|
|
put_online_cpus();
|
|
|
|
|
slab: fix wrong retval on kmem_cache_create_memcg error path
On kmem_cache_create_memcg() error path we set 'err', but leave 's' (the
new cache ptr) undefined. The latter can be NULL if we could not
allocate the cache, or pointing to a freed area if we failed somewhere
later while trying to initialize it. Initially we checked 'err'
immediately before exiting the function and returned NULL if it was set
ignoring the value of 's':
out_unlock:
...
if (err) {
/* report error */
return NULL;
}
return s;
Recently this check was, in fact, broken by commit f717eb3abb5e ("slab:
do not panic if we fail to create memcg cache"), which turned it to:
out_unlock:
...
if (err && !memcg) {
/* report error */
return NULL;
}
return s;
As a result, if we are failing creating a cache for a memcg, we will
skip the check and return 's' that can contain crap. Obviously, commit
f717eb3abb5e intended not to return crap on error allocating a cache for
a memcg, but only to remove the error reporting in this case, so the
check should look like this:
out_unlock:
...
if (err) {
if (!memcg)
return NULL;
/* report error */
return NULL;
}
return s;
[rientjes@google.com: despaghettification]
[vdavydov@parallels.com: patch monkeying]
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Dave Jones <davej@redhat.com>
Reported-by: Dave Jones <davej@redhat.com>
Acked-by: Pekka Enberg <penberg@kernel.org>
Cc: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-01-29 22:05:48 +00:00
|
|
|
if (err) {
|
2012-09-05 00:20:33 +00:00
|
|
|
if (flags & SLAB_PANIC)
|
|
|
|
panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
|
|
|
|
name, err);
|
|
|
|
else {
|
2016-03-17 21:19:50 +00:00
|
|
|
pr_warn("kmem_cache_create(%s) failed with error %d\n",
|
2012-09-05 00:20:33 +00:00
|
|
|
name, err);
|
|
|
|
dump_stack();
|
|
|
|
}
|
|
|
|
return NULL;
|
|
|
|
}
|
2012-07-06 20:25:10 +00:00
|
|
|
return s;
|
|
|
|
}
|
2014-04-07 22:39:26 +00:00
|
|
|
EXPORT_SYMBOL(kmem_cache_create);
|
2012-12-18 22:22:34 +00:00
|
|
|
|
2015-11-06 02:45:08 +00:00
|
|
|
static int shutdown_cache(struct kmem_cache *s,
|
memcg: zap memcg_slab_caches and memcg_slab_mutex
mem_cgroup->memcg_slab_caches is a list of kmem caches corresponding to
the given cgroup. Currently, it is only used on css free in order to
destroy all caches corresponding to the memory cgroup being freed. The
list is protected by memcg_slab_mutex. The mutex is also used to protect
kmem_cache->memcg_params->memcg_caches arrays and synchronizes
kmem_cache_destroy vs memcg_unregister_all_caches.
However, we can perfectly get on without these two. To destroy all caches
corresponding to a memory cgroup, we can walk over the global list of kmem
caches, slab_caches, and we can do all the synchronization stuff using the
slab_mutex instead of the memcg_slab_mutex. This patch therefore gets rid
of the memcg_slab_caches and memcg_slab_mutex.
Apart from this nice cleanup, it also:
- assures that rcu_barrier() is called once at max when a root cache is
destroyed or a memory cgroup is freed, no matter how many caches have
SLAB_DESTROY_BY_RCU flag set;
- fixes the race between kmem_cache_destroy and kmem_cache_create that
exists, because memcg_cleanup_cache_params, which is called from
kmem_cache_destroy after checking that kmem_cache->refcount=0,
releases the slab_mutex, which gives kmem_cache_create a chance to
make an alias to a cache doomed to be destroyed.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Acked-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-10 22:11:47 +00:00
|
|
|
struct list_head *release, bool *need_rcu_barrier)
|
|
|
|
{
|
2015-11-06 02:45:14 +00:00
|
|
|
if (__kmem_cache_shutdown(s) != 0)
|
memcg: zap memcg_slab_caches and memcg_slab_mutex
mem_cgroup->memcg_slab_caches is a list of kmem caches corresponding to
the given cgroup. Currently, it is only used on css free in order to
destroy all caches corresponding to the memory cgroup being freed. The
list is protected by memcg_slab_mutex. The mutex is also used to protect
kmem_cache->memcg_params->memcg_caches arrays and synchronizes
kmem_cache_destroy vs memcg_unregister_all_caches.
However, we can perfectly get on without these two. To destroy all caches
corresponding to a memory cgroup, we can walk over the global list of kmem
caches, slab_caches, and we can do all the synchronization stuff using the
slab_mutex instead of the memcg_slab_mutex. This patch therefore gets rid
of the memcg_slab_caches and memcg_slab_mutex.
Apart from this nice cleanup, it also:
- assures that rcu_barrier() is called once at max when a root cache is
destroyed or a memory cgroup is freed, no matter how many caches have
SLAB_DESTROY_BY_RCU flag set;
- fixes the race between kmem_cache_destroy and kmem_cache_create that
exists, because memcg_cleanup_cache_params, which is called from
kmem_cache_destroy after checking that kmem_cache->refcount=0,
releases the slab_mutex, which gives kmem_cache_create a chance to
make an alias to a cache doomed to be destroyed.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Acked-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-10 22:11:47 +00:00
|
|
|
return -EBUSY;
|
|
|
|
|
|
|
|
if (s->flags & SLAB_DESTROY_BY_RCU)
|
|
|
|
*need_rcu_barrier = true;
|
|
|
|
|
|
|
|
list_move(&s->list, release);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2015-11-06 02:45:08 +00:00
|
|
|
static void release_caches(struct list_head *release, bool need_rcu_barrier)
|
memcg: zap memcg_slab_caches and memcg_slab_mutex
mem_cgroup->memcg_slab_caches is a list of kmem caches corresponding to
the given cgroup. Currently, it is only used on css free in order to
destroy all caches corresponding to the memory cgroup being freed. The
list is protected by memcg_slab_mutex. The mutex is also used to protect
kmem_cache->memcg_params->memcg_caches arrays and synchronizes
kmem_cache_destroy vs memcg_unregister_all_caches.
However, we can perfectly get on without these two. To destroy all caches
corresponding to a memory cgroup, we can walk over the global list of kmem
caches, slab_caches, and we can do all the synchronization stuff using the
slab_mutex instead of the memcg_slab_mutex. This patch therefore gets rid
of the memcg_slab_caches and memcg_slab_mutex.
Apart from this nice cleanup, it also:
- assures that rcu_barrier() is called once at max when a root cache is
destroyed or a memory cgroup is freed, no matter how many caches have
SLAB_DESTROY_BY_RCU flag set;
- fixes the race between kmem_cache_destroy and kmem_cache_create that
exists, because memcg_cleanup_cache_params, which is called from
kmem_cache_destroy after checking that kmem_cache->refcount=0,
releases the slab_mutex, which gives kmem_cache_create a chance to
make an alias to a cache doomed to be destroyed.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Acked-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-10 22:11:47 +00:00
|
|
|
{
|
|
|
|
struct kmem_cache *s, *s2;
|
|
|
|
|
|
|
|
if (need_rcu_barrier)
|
|
|
|
rcu_barrier();
|
|
|
|
|
|
|
|
list_for_each_entry_safe(s, s2, release, list) {
|
|
|
|
#ifdef SLAB_SUPPORTS_SYSFS
|
2017-02-22 23:41:11 +00:00
|
|
|
sysfs_slab_release(s);
|
memcg: zap memcg_slab_caches and memcg_slab_mutex
mem_cgroup->memcg_slab_caches is a list of kmem caches corresponding to
the given cgroup. Currently, it is only used on css free in order to
destroy all caches corresponding to the memory cgroup being freed. The
list is protected by memcg_slab_mutex. The mutex is also used to protect
kmem_cache->memcg_params->memcg_caches arrays and synchronizes
kmem_cache_destroy vs memcg_unregister_all_caches.
However, we can perfectly get on without these two. To destroy all caches
corresponding to a memory cgroup, we can walk over the global list of kmem
caches, slab_caches, and we can do all the synchronization stuff using the
slab_mutex instead of the memcg_slab_mutex. This patch therefore gets rid
of the memcg_slab_caches and memcg_slab_mutex.
Apart from this nice cleanup, it also:
- assures that rcu_barrier() is called once at max when a root cache is
destroyed or a memory cgroup is freed, no matter how many caches have
SLAB_DESTROY_BY_RCU flag set;
- fixes the race between kmem_cache_destroy and kmem_cache_create that
exists, because memcg_cleanup_cache_params, which is called from
kmem_cache_destroy after checking that kmem_cache->refcount=0,
releases the slab_mutex, which gives kmem_cache_create a chance to
make an alias to a cache doomed to be destroyed.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Acked-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-10 22:11:47 +00:00
|
|
|
#else
|
|
|
|
slab_kmem_cache_release(s);
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-01-20 23:02:32 +00:00
|
|
|
#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
|
2014-04-07 22:39:26 +00:00
|
|
|
/*
|
2014-06-04 23:10:02 +00:00
|
|
|
* memcg_create_kmem_cache - Create a cache for a memory cgroup.
|
2014-04-07 22:39:26 +00:00
|
|
|
* @memcg: The memory cgroup the new cache is for.
|
|
|
|
* @root_cache: The parent of the new cache.
|
|
|
|
*
|
|
|
|
* This function attempts to create a kmem cache that will serve allocation
|
|
|
|
* requests going from @memcg to @root_cache. The new cache inherits properties
|
|
|
|
* from its parent.
|
|
|
|
*/
|
memcg: zap memcg_slab_caches and memcg_slab_mutex
mem_cgroup->memcg_slab_caches is a list of kmem caches corresponding to
the given cgroup. Currently, it is only used on css free in order to
destroy all caches corresponding to the memory cgroup being freed. The
list is protected by memcg_slab_mutex. The mutex is also used to protect
kmem_cache->memcg_params->memcg_caches arrays and synchronizes
kmem_cache_destroy vs memcg_unregister_all_caches.
However, we can perfectly get on without these two. To destroy all caches
corresponding to a memory cgroup, we can walk over the global list of kmem
caches, slab_caches, and we can do all the synchronization stuff using the
slab_mutex instead of the memcg_slab_mutex. This patch therefore gets rid
of the memcg_slab_caches and memcg_slab_mutex.
Apart from this nice cleanup, it also:
- assures that rcu_barrier() is called once at max when a root cache is
destroyed or a memory cgroup is freed, no matter how many caches have
SLAB_DESTROY_BY_RCU flag set;
- fixes the race between kmem_cache_destroy and kmem_cache_create that
exists, because memcg_cleanup_cache_params, which is called from
kmem_cache_destroy after checking that kmem_cache->refcount=0,
releases the slab_mutex, which gives kmem_cache_create a chance to
make an alias to a cache doomed to be destroyed.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Acked-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-10 22:11:47 +00:00
|
|
|
void memcg_create_kmem_cache(struct mem_cgroup *memcg,
|
|
|
|
struct kmem_cache *root_cache)
|
2012-12-18 22:22:34 +00:00
|
|
|
{
|
2015-02-10 22:11:44 +00:00
|
|
|
static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */
|
2015-09-08 22:01:02 +00:00
|
|
|
struct cgroup_subsys_state *css = &memcg->css;
|
2015-02-12 22:59:20 +00:00
|
|
|
struct memcg_cache_array *arr;
|
memcg, slab: simplify synchronization scheme
At present, we have the following mutexes protecting data related to per
memcg kmem caches:
- slab_mutex. This one is held during the whole kmem cache creation
and destruction paths. We also take it when updating per root cache
memcg_caches arrays (see memcg_update_all_caches). As a result, taking
it guarantees there will be no changes to any kmem cache (including per
memcg). Why do we need something else then? The point is it is
private to slab implementation and has some internal dependencies with
other mutexes (get_online_cpus). So we just don't want to rely upon it
and prefer to introduce additional mutexes instead.
- activate_kmem_mutex. Initially it was added to synchronize
initializing kmem limit (memcg_activate_kmem). However, since we can
grow per root cache memcg_caches arrays only on kmem limit
initialization (see memcg_update_all_caches), we also employ it to
protect against memcg_caches arrays relocation (e.g. see
__kmem_cache_destroy_memcg_children).
- We have a convention not to take slab_mutex in memcontrol.c, but we
want to walk over per memcg memcg_slab_caches lists there (e.g. for
destroying all memcg caches on offline). So we have per memcg
slab_caches_mutex's protecting those lists.
The mutexes are taken in the following order:
activate_kmem_mutex -> slab_mutex -> memcg::slab_caches_mutex
Such a syncrhonization scheme has a number of flaws, for instance:
- We can't call kmem_cache_{destroy,shrink} while walking over a
memcg::memcg_slab_caches list due to locking order. As a result, in
mem_cgroup_destroy_all_caches we schedule the
memcg_cache_params::destroy work shrinking and destroying the cache.
- We don't have a mutex to synchronize per memcg caches destruction
between memcg offline (mem_cgroup_destroy_all_caches) and root cache
destruction (__kmem_cache_destroy_memcg_children). Currently we just
don't bother about it.
This patch simplifies it by substituting per memcg slab_caches_mutex's
with the global memcg_slab_mutex. It will be held whenever a new per
memcg cache is created or destroyed, so it protects per root cache
memcg_caches arrays and per memcg memcg_slab_caches lists. The locking
order is following:
activate_kmem_mutex -> memcg_slab_mutex -> slab_mutex
This allows us to call kmem_cache_{create,shrink,destroy} under the
memcg_slab_mutex. As a result, we don't need memcg_cache_params::destroy
work any more - we can simply destroy caches while iterating over a per
memcg slab caches list.
Also using the global mutex simplifies synchronization between concurrent
per memcg caches creation/destruction, e.g. mem_cgroup_destroy_all_caches
vs __kmem_cache_destroy_memcg_children.
The downside of this is that we substitute per-memcg slab_caches_mutex's
with a hummer-like global mutex, but since we already take either the
slab_mutex or the cgroup_mutex along with a memcg::slab_caches_mutex, it
shouldn't hurt concurrency a lot.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Pekka Enberg <penberg@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-06-04 23:07:40 +00:00
|
|
|
struct kmem_cache *s = NULL;
|
2014-04-07 22:39:26 +00:00
|
|
|
char *cache_name;
|
2015-02-12 22:59:20 +00:00
|
|
|
int idx;
|
2014-04-07 22:39:26 +00:00
|
|
|
|
|
|
|
get_online_cpus();
|
slab: get_online_mems for kmem_cache_{create,destroy,shrink}
When we create a sl[au]b cache, we allocate kmem_cache_node structures
for each online NUMA node. To handle nodes taken online/offline, we
register memory hotplug notifier and allocate/free kmem_cache_node
corresponding to the node that changes its state for each kmem cache.
To synchronize between the two paths we hold the slab_mutex during both
the cache creationg/destruction path and while tuning per-node parts of
kmem caches in memory hotplug handler, but that's not quite right,
because it does not guarantee that a newly created cache will have all
kmem_cache_nodes initialized in case it races with memory hotplug. For
instance, in case of slub:
CPU0 CPU1
---- ----
kmem_cache_create: online_pages:
__kmem_cache_create: slab_memory_callback:
slab_mem_going_online_callback:
lock slab_mutex
for each slab_caches list entry
allocate kmem_cache node
unlock slab_mutex
lock slab_mutex
init_kmem_cache_nodes:
for_each_node_state(node, N_NORMAL_MEMORY)
allocate kmem_cache node
add kmem_cache to slab_caches list
unlock slab_mutex
online_pages (continued):
node_states_set_node
As a result we'll get a kmem cache with not all kmem_cache_nodes
allocated.
To avoid issues like that we should hold get/put_online_mems() during
the whole kmem cache creation/destruction/shrink paths, just like we
deal with cpu hotplug. This patch does the trick.
Note, that after it's applied, there is no need in taking the slab_mutex
for kmem_cache_shrink any more, so it is removed from there.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Tang Chen <tangchen@cn.fujitsu.com>
Cc: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Cc: Toshi Kani <toshi.kani@hp.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Jiang Liu <liuj97@gmail.com>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-06-04 23:07:20 +00:00
|
|
|
get_online_mems();
|
|
|
|
|
2014-04-07 22:39:26 +00:00
|
|
|
mutex_lock(&slab_mutex);
|
|
|
|
|
2015-02-12 22:59:32 +00:00
|
|
|
/*
|
2016-01-20 23:02:24 +00:00
|
|
|
* The memory cgroup could have been offlined while the cache
|
2015-02-12 22:59:32 +00:00
|
|
|
* creation work was pending.
|
|
|
|
*/
|
2016-03-17 21:18:33 +00:00
|
|
|
if (memcg->kmem_state != KMEM_ONLINE)
|
2015-02-12 22:59:32 +00:00
|
|
|
goto out_unlock;
|
|
|
|
|
2015-02-12 22:59:20 +00:00
|
|
|
idx = memcg_cache_id(memcg);
|
|
|
|
arr = rcu_dereference_protected(root_cache->memcg_params.memcg_caches,
|
|
|
|
lockdep_is_held(&slab_mutex));
|
|
|
|
|
memcg: zap memcg_slab_caches and memcg_slab_mutex
mem_cgroup->memcg_slab_caches is a list of kmem caches corresponding to
the given cgroup. Currently, it is only used on css free in order to
destroy all caches corresponding to the memory cgroup being freed. The
list is protected by memcg_slab_mutex. The mutex is also used to protect
kmem_cache->memcg_params->memcg_caches arrays and synchronizes
kmem_cache_destroy vs memcg_unregister_all_caches.
However, we can perfectly get on without these two. To destroy all caches
corresponding to a memory cgroup, we can walk over the global list of kmem
caches, slab_caches, and we can do all the synchronization stuff using the
slab_mutex instead of the memcg_slab_mutex. This patch therefore gets rid
of the memcg_slab_caches and memcg_slab_mutex.
Apart from this nice cleanup, it also:
- assures that rcu_barrier() is called once at max when a root cache is
destroyed or a memory cgroup is freed, no matter how many caches have
SLAB_DESTROY_BY_RCU flag set;
- fixes the race between kmem_cache_destroy and kmem_cache_create that
exists, because memcg_cleanup_cache_params, which is called from
kmem_cache_destroy after checking that kmem_cache->refcount=0,
releases the slab_mutex, which gives kmem_cache_create a chance to
make an alias to a cache doomed to be destroyed.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Acked-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-10 22:11:47 +00:00
|
|
|
/*
|
|
|
|
* Since per-memcg caches are created asynchronously on first
|
|
|
|
* allocation (see memcg_kmem_get_cache()), several threads can try to
|
|
|
|
* create the same cache, but only one of them may succeed.
|
|
|
|
*/
|
2015-02-12 22:59:20 +00:00
|
|
|
if (arr->entries[idx])
|
memcg: zap memcg_slab_caches and memcg_slab_mutex
mem_cgroup->memcg_slab_caches is a list of kmem caches corresponding to
the given cgroup. Currently, it is only used on css free in order to
destroy all caches corresponding to the memory cgroup being freed. The
list is protected by memcg_slab_mutex. The mutex is also used to protect
kmem_cache->memcg_params->memcg_caches arrays and synchronizes
kmem_cache_destroy vs memcg_unregister_all_caches.
However, we can perfectly get on without these two. To destroy all caches
corresponding to a memory cgroup, we can walk over the global list of kmem
caches, slab_caches, and we can do all the synchronization stuff using the
slab_mutex instead of the memcg_slab_mutex. This patch therefore gets rid
of the memcg_slab_caches and memcg_slab_mutex.
Apart from this nice cleanup, it also:
- assures that rcu_barrier() is called once at max when a root cache is
destroyed or a memory cgroup is freed, no matter how many caches have
SLAB_DESTROY_BY_RCU flag set;
- fixes the race between kmem_cache_destroy and kmem_cache_create that
exists, because memcg_cleanup_cache_params, which is called from
kmem_cache_destroy after checking that kmem_cache->refcount=0,
releases the slab_mutex, which gives kmem_cache_create a chance to
make an alias to a cache doomed to be destroyed.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Acked-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-10 22:11:47 +00:00
|
|
|
goto out_unlock;
|
|
|
|
|
2015-02-12 22:59:29 +00:00
|
|
|
cgroup_name(css->cgroup, memcg_name_buf, sizeof(memcg_name_buf));
|
2016-07-20 22:44:57 +00:00
|
|
|
cache_name = kasprintf(GFP_KERNEL, "%s(%llu:%s)", root_cache->name,
|
|
|
|
css->serial_nr, memcg_name_buf);
|
2014-04-07 22:39:26 +00:00
|
|
|
if (!cache_name)
|
|
|
|
goto out_unlock;
|
|
|
|
|
2015-11-06 02:45:08 +00:00
|
|
|
s = create_cache(cache_name, root_cache->object_size,
|
|
|
|
root_cache->size, root_cache->align,
|
2016-11-10 18:46:41 +00:00
|
|
|
root_cache->flags & CACHE_CREATE_MASK,
|
|
|
|
root_cache->ctor, memcg, root_cache);
|
memcg: zap memcg_slab_caches and memcg_slab_mutex
mem_cgroup->memcg_slab_caches is a list of kmem caches corresponding to
the given cgroup. Currently, it is only used on css free in order to
destroy all caches corresponding to the memory cgroup being freed. The
list is protected by memcg_slab_mutex. The mutex is also used to protect
kmem_cache->memcg_params->memcg_caches arrays and synchronizes
kmem_cache_destroy vs memcg_unregister_all_caches.
However, we can perfectly get on without these two. To destroy all caches
corresponding to a memory cgroup, we can walk over the global list of kmem
caches, slab_caches, and we can do all the synchronization stuff using the
slab_mutex instead of the memcg_slab_mutex. This patch therefore gets rid
of the memcg_slab_caches and memcg_slab_mutex.
Apart from this nice cleanup, it also:
- assures that rcu_barrier() is called once at max when a root cache is
destroyed or a memory cgroup is freed, no matter how many caches have
SLAB_DESTROY_BY_RCU flag set;
- fixes the race between kmem_cache_destroy and kmem_cache_create that
exists, because memcg_cleanup_cache_params, which is called from
kmem_cache_destroy after checking that kmem_cache->refcount=0,
releases the slab_mutex, which gives kmem_cache_create a chance to
make an alias to a cache doomed to be destroyed.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Acked-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-10 22:11:47 +00:00
|
|
|
/*
|
|
|
|
* If we could not create a memcg cache, do not complain, because
|
|
|
|
* that's not critical at all as we can always proceed with the root
|
|
|
|
* cache.
|
|
|
|
*/
|
memcg, slab: simplify synchronization scheme
At present, we have the following mutexes protecting data related to per
memcg kmem caches:
- slab_mutex. This one is held during the whole kmem cache creation
and destruction paths. We also take it when updating per root cache
memcg_caches arrays (see memcg_update_all_caches). As a result, taking
it guarantees there will be no changes to any kmem cache (including per
memcg). Why do we need something else then? The point is it is
private to slab implementation and has some internal dependencies with
other mutexes (get_online_cpus). So we just don't want to rely upon it
and prefer to introduce additional mutexes instead.
- activate_kmem_mutex. Initially it was added to synchronize
initializing kmem limit (memcg_activate_kmem). However, since we can
grow per root cache memcg_caches arrays only on kmem limit
initialization (see memcg_update_all_caches), we also employ it to
protect against memcg_caches arrays relocation (e.g. see
__kmem_cache_destroy_memcg_children).
- We have a convention not to take slab_mutex in memcontrol.c, but we
want to walk over per memcg memcg_slab_caches lists there (e.g. for
destroying all memcg caches on offline). So we have per memcg
slab_caches_mutex's protecting those lists.
The mutexes are taken in the following order:
activate_kmem_mutex -> slab_mutex -> memcg::slab_caches_mutex
Such a syncrhonization scheme has a number of flaws, for instance:
- We can't call kmem_cache_{destroy,shrink} while walking over a
memcg::memcg_slab_caches list due to locking order. As a result, in
mem_cgroup_destroy_all_caches we schedule the
memcg_cache_params::destroy work shrinking and destroying the cache.
- We don't have a mutex to synchronize per memcg caches destruction
between memcg offline (mem_cgroup_destroy_all_caches) and root cache
destruction (__kmem_cache_destroy_memcg_children). Currently we just
don't bother about it.
This patch simplifies it by substituting per memcg slab_caches_mutex's
with the global memcg_slab_mutex. It will be held whenever a new per
memcg cache is created or destroyed, so it protects per root cache
memcg_caches arrays and per memcg memcg_slab_caches lists. The locking
order is following:
activate_kmem_mutex -> memcg_slab_mutex -> slab_mutex
This allows us to call kmem_cache_{create,shrink,destroy} under the
memcg_slab_mutex. As a result, we don't need memcg_cache_params::destroy
work any more - we can simply destroy caches while iterating over a per
memcg slab caches list.
Also using the global mutex simplifies synchronization between concurrent
per memcg caches creation/destruction, e.g. mem_cgroup_destroy_all_caches
vs __kmem_cache_destroy_memcg_children.
The downside of this is that we substitute per-memcg slab_caches_mutex's
with a hummer-like global mutex, but since we already take either the
slab_mutex or the cgroup_mutex along with a memcg::slab_caches_mutex, it
shouldn't hurt concurrency a lot.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Pekka Enberg <penberg@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-06-04 23:07:40 +00:00
|
|
|
if (IS_ERR(s)) {
|
2014-04-07 22:39:26 +00:00
|
|
|
kfree(cache_name);
|
memcg: zap memcg_slab_caches and memcg_slab_mutex
mem_cgroup->memcg_slab_caches is a list of kmem caches corresponding to
the given cgroup. Currently, it is only used on css free in order to
destroy all caches corresponding to the memory cgroup being freed. The
list is protected by memcg_slab_mutex. The mutex is also used to protect
kmem_cache->memcg_params->memcg_caches arrays and synchronizes
kmem_cache_destroy vs memcg_unregister_all_caches.
However, we can perfectly get on without these two. To destroy all caches
corresponding to a memory cgroup, we can walk over the global list of kmem
caches, slab_caches, and we can do all the synchronization stuff using the
slab_mutex instead of the memcg_slab_mutex. This patch therefore gets rid
of the memcg_slab_caches and memcg_slab_mutex.
Apart from this nice cleanup, it also:
- assures that rcu_barrier() is called once at max when a root cache is
destroyed or a memory cgroup is freed, no matter how many caches have
SLAB_DESTROY_BY_RCU flag set;
- fixes the race between kmem_cache_destroy and kmem_cache_create that
exists, because memcg_cleanup_cache_params, which is called from
kmem_cache_destroy after checking that kmem_cache->refcount=0,
releases the slab_mutex, which gives kmem_cache_create a chance to
make an alias to a cache doomed to be destroyed.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Acked-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-10 22:11:47 +00:00
|
|
|
goto out_unlock;
|
memcg, slab: simplify synchronization scheme
At present, we have the following mutexes protecting data related to per
memcg kmem caches:
- slab_mutex. This one is held during the whole kmem cache creation
and destruction paths. We also take it when updating per root cache
memcg_caches arrays (see memcg_update_all_caches). As a result, taking
it guarantees there will be no changes to any kmem cache (including per
memcg). Why do we need something else then? The point is it is
private to slab implementation and has some internal dependencies with
other mutexes (get_online_cpus). So we just don't want to rely upon it
and prefer to introduce additional mutexes instead.
- activate_kmem_mutex. Initially it was added to synchronize
initializing kmem limit (memcg_activate_kmem). However, since we can
grow per root cache memcg_caches arrays only on kmem limit
initialization (see memcg_update_all_caches), we also employ it to
protect against memcg_caches arrays relocation (e.g. see
__kmem_cache_destroy_memcg_children).
- We have a convention not to take slab_mutex in memcontrol.c, but we
want to walk over per memcg memcg_slab_caches lists there (e.g. for
destroying all memcg caches on offline). So we have per memcg
slab_caches_mutex's protecting those lists.
The mutexes are taken in the following order:
activate_kmem_mutex -> slab_mutex -> memcg::slab_caches_mutex
Such a syncrhonization scheme has a number of flaws, for instance:
- We can't call kmem_cache_{destroy,shrink} while walking over a
memcg::memcg_slab_caches list due to locking order. As a result, in
mem_cgroup_destroy_all_caches we schedule the
memcg_cache_params::destroy work shrinking and destroying the cache.
- We don't have a mutex to synchronize per memcg caches destruction
between memcg offline (mem_cgroup_destroy_all_caches) and root cache
destruction (__kmem_cache_destroy_memcg_children). Currently we just
don't bother about it.
This patch simplifies it by substituting per memcg slab_caches_mutex's
with the global memcg_slab_mutex. It will be held whenever a new per
memcg cache is created or destroyed, so it protects per root cache
memcg_caches arrays and per memcg memcg_slab_caches lists. The locking
order is following:
activate_kmem_mutex -> memcg_slab_mutex -> slab_mutex
This allows us to call kmem_cache_{create,shrink,destroy} under the
memcg_slab_mutex. As a result, we don't need memcg_cache_params::destroy
work any more - we can simply destroy caches while iterating over a per
memcg slab caches list.
Also using the global mutex simplifies synchronization between concurrent
per memcg caches creation/destruction, e.g. mem_cgroup_destroy_all_caches
vs __kmem_cache_destroy_memcg_children.
The downside of this is that we substitute per-memcg slab_caches_mutex's
with a hummer-like global mutex, but since we already take either the
slab_mutex or the cgroup_mutex along with a memcg::slab_caches_mutex, it
shouldn't hurt concurrency a lot.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Pekka Enberg <penberg@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-06-04 23:07:40 +00:00
|
|
|
}
|
2014-04-07 22:39:26 +00:00
|
|
|
|
2015-02-12 22:59:23 +00:00
|
|
|
list_add(&s->memcg_params.list, &root_cache->memcg_params.list);
|
|
|
|
|
memcg: zap memcg_slab_caches and memcg_slab_mutex
mem_cgroup->memcg_slab_caches is a list of kmem caches corresponding to
the given cgroup. Currently, it is only used on css free in order to
destroy all caches corresponding to the memory cgroup being freed. The
list is protected by memcg_slab_mutex. The mutex is also used to protect
kmem_cache->memcg_params->memcg_caches arrays and synchronizes
kmem_cache_destroy vs memcg_unregister_all_caches.
However, we can perfectly get on without these two. To destroy all caches
corresponding to a memory cgroup, we can walk over the global list of kmem
caches, slab_caches, and we can do all the synchronization stuff using the
slab_mutex instead of the memcg_slab_mutex. This patch therefore gets rid
of the memcg_slab_caches and memcg_slab_mutex.
Apart from this nice cleanup, it also:
- assures that rcu_barrier() is called once at max when a root cache is
destroyed or a memory cgroup is freed, no matter how many caches have
SLAB_DESTROY_BY_RCU flag set;
- fixes the race between kmem_cache_destroy and kmem_cache_create that
exists, because memcg_cleanup_cache_params, which is called from
kmem_cache_destroy after checking that kmem_cache->refcount=0,
releases the slab_mutex, which gives kmem_cache_create a chance to
make an alias to a cache doomed to be destroyed.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Acked-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-10 22:11:47 +00:00
|
|
|
/*
|
|
|
|
* Since readers won't lock (see cache_from_memcg_idx()), we need a
|
|
|
|
* barrier here to ensure nobody will see the kmem_cache partially
|
|
|
|
* initialized.
|
|
|
|
*/
|
|
|
|
smp_wmb();
|
2015-02-12 22:59:20 +00:00
|
|
|
arr->entries[idx] = s;
|
memcg: zap memcg_slab_caches and memcg_slab_mutex
mem_cgroup->memcg_slab_caches is a list of kmem caches corresponding to
the given cgroup. Currently, it is only used on css free in order to
destroy all caches corresponding to the memory cgroup being freed. The
list is protected by memcg_slab_mutex. The mutex is also used to protect
kmem_cache->memcg_params->memcg_caches arrays and synchronizes
kmem_cache_destroy vs memcg_unregister_all_caches.
However, we can perfectly get on without these two. To destroy all caches
corresponding to a memory cgroup, we can walk over the global list of kmem
caches, slab_caches, and we can do all the synchronization stuff using the
slab_mutex instead of the memcg_slab_mutex. This patch therefore gets rid
of the memcg_slab_caches and memcg_slab_mutex.
Apart from this nice cleanup, it also:
- assures that rcu_barrier() is called once at max when a root cache is
destroyed or a memory cgroup is freed, no matter how many caches have
SLAB_DESTROY_BY_RCU flag set;
- fixes the race between kmem_cache_destroy and kmem_cache_create that
exists, because memcg_cleanup_cache_params, which is called from
kmem_cache_destroy after checking that kmem_cache->refcount=0,
releases the slab_mutex, which gives kmem_cache_create a chance to
make an alias to a cache doomed to be destroyed.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Acked-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-10 22:11:47 +00:00
|
|
|
|
2014-04-07 22:39:26 +00:00
|
|
|
out_unlock:
|
|
|
|
mutex_unlock(&slab_mutex);
|
slab: get_online_mems for kmem_cache_{create,destroy,shrink}
When we create a sl[au]b cache, we allocate kmem_cache_node structures
for each online NUMA node. To handle nodes taken online/offline, we
register memory hotplug notifier and allocate/free kmem_cache_node
corresponding to the node that changes its state for each kmem cache.
To synchronize between the two paths we hold the slab_mutex during both
the cache creationg/destruction path and while tuning per-node parts of
kmem caches in memory hotplug handler, but that's not quite right,
because it does not guarantee that a newly created cache will have all
kmem_cache_nodes initialized in case it races with memory hotplug. For
instance, in case of slub:
CPU0 CPU1
---- ----
kmem_cache_create: online_pages:
__kmem_cache_create: slab_memory_callback:
slab_mem_going_online_callback:
lock slab_mutex
for each slab_caches list entry
allocate kmem_cache node
unlock slab_mutex
lock slab_mutex
init_kmem_cache_nodes:
for_each_node_state(node, N_NORMAL_MEMORY)
allocate kmem_cache node
add kmem_cache to slab_caches list
unlock slab_mutex
online_pages (continued):
node_states_set_node
As a result we'll get a kmem cache with not all kmem_cache_nodes
allocated.
To avoid issues like that we should hold get/put_online_mems() during
the whole kmem cache creation/destruction/shrink paths, just like we
deal with cpu hotplug. This patch does the trick.
Note, that after it's applied, there is no need in taking the slab_mutex
for kmem_cache_shrink any more, so it is removed from there.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Tang Chen <tangchen@cn.fujitsu.com>
Cc: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Cc: Toshi Kani <toshi.kani@hp.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Jiang Liu <liuj97@gmail.com>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-06-04 23:07:20 +00:00
|
|
|
|
|
|
|
put_online_mems();
|
2014-04-07 22:39:26 +00:00
|
|
|
put_online_cpus();
|
2012-12-18 22:22:34 +00:00
|
|
|
}
|
2014-04-07 22:39:28 +00:00
|
|
|
|
2015-02-12 22:59:32 +00:00
|
|
|
void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
|
|
|
|
{
|
|
|
|
int idx;
|
|
|
|
struct memcg_cache_array *arr;
|
slub: make dead caches discard free slabs immediately
To speed up further allocations SLUB may store empty slabs in per cpu/node
partial lists instead of freeing them immediately. This prevents per
memcg caches destruction, because kmem caches created for a memory cgroup
are only destroyed after the last page charged to the cgroup is freed.
To fix this issue, this patch resurrects approach first proposed in [1].
It forbids SLUB to cache empty slabs after the memory cgroup that the
cache belongs to was destroyed. It is achieved by setting kmem_cache's
cpu_partial and min_partial constants to 0 and tuning put_cpu_partial() so
that it would drop frozen empty slabs immediately if cpu_partial = 0.
The runtime overhead is minimal. From all the hot functions, we only
touch relatively cold put_cpu_partial(): we make it call
unfreeze_partials() after freezing a slab that belongs to an offline
memory cgroup. Since slab freezing exists to avoid moving slabs from/to a
partial list on free/alloc, and there can't be allocations from dead
caches, it shouldn't cause any overhead. We do have to disable preemption
for put_cpu_partial() to achieve that though.
The original patch was accepted well and even merged to the mm tree.
However, I decided to withdraw it due to changes happening to the memcg
core at that time. I had an idea of introducing per-memcg shrinkers for
kmem caches, but now, as memcg has finally settled down, I do not see it
as an option, because SLUB shrinker would be too costly to call since SLUB
does not keep free slabs on a separate list. Besides, we currently do not
even call per-memcg shrinkers for offline memcgs. Overall, it would
introduce much more complexity to both SLUB and memcg than this small
patch.
Regarding to SLAB, there's no problem with it, because it shrinks
per-cpu/node caches periodically. Thanks to list_lru reparenting, we no
longer keep entries for offline cgroups in per-memcg arrays (such as
memcg_cache_params->memcg_caches), so we do not have to bother if a
per-memcg cache will be shrunk a bit later than it could be.
[1] http://thread.gmane.org/gmane.linux.kernel.mm/118649/focus=118650
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 22:59:47 +00:00
|
|
|
struct kmem_cache *s, *c;
|
2015-02-12 22:59:32 +00:00
|
|
|
|
|
|
|
idx = memcg_cache_id(memcg);
|
|
|
|
|
slub: make dead caches discard free slabs immediately
To speed up further allocations SLUB may store empty slabs in per cpu/node
partial lists instead of freeing them immediately. This prevents per
memcg caches destruction, because kmem caches created for a memory cgroup
are only destroyed after the last page charged to the cgroup is freed.
To fix this issue, this patch resurrects approach first proposed in [1].
It forbids SLUB to cache empty slabs after the memory cgroup that the
cache belongs to was destroyed. It is achieved by setting kmem_cache's
cpu_partial and min_partial constants to 0 and tuning put_cpu_partial() so
that it would drop frozen empty slabs immediately if cpu_partial = 0.
The runtime overhead is minimal. From all the hot functions, we only
touch relatively cold put_cpu_partial(): we make it call
unfreeze_partials() after freezing a slab that belongs to an offline
memory cgroup. Since slab freezing exists to avoid moving slabs from/to a
partial list on free/alloc, and there can't be allocations from dead
caches, it shouldn't cause any overhead. We do have to disable preemption
for put_cpu_partial() to achieve that though.
The original patch was accepted well and even merged to the mm tree.
However, I decided to withdraw it due to changes happening to the memcg
core at that time. I had an idea of introducing per-memcg shrinkers for
kmem caches, but now, as memcg has finally settled down, I do not see it
as an option, because SLUB shrinker would be too costly to call since SLUB
does not keep free slabs on a separate list. Besides, we currently do not
even call per-memcg shrinkers for offline memcgs. Overall, it would
introduce much more complexity to both SLUB and memcg than this small
patch.
Regarding to SLAB, there's no problem with it, because it shrinks
per-cpu/node caches periodically. Thanks to list_lru reparenting, we no
longer keep entries for offline cgroups in per-memcg arrays (such as
memcg_cache_params->memcg_caches), so we do not have to bother if a
per-memcg cache will be shrunk a bit later than it could be.
[1] http://thread.gmane.org/gmane.linux.kernel.mm/118649/focus=118650
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 22:59:47 +00:00
|
|
|
get_online_cpus();
|
|
|
|
get_online_mems();
|
|
|
|
|
2015-02-12 22:59:32 +00:00
|
|
|
mutex_lock(&slab_mutex);
|
|
|
|
list_for_each_entry(s, &slab_caches, list) {
|
|
|
|
if (!is_root_cache(s))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
arr = rcu_dereference_protected(s->memcg_params.memcg_caches,
|
|
|
|
lockdep_is_held(&slab_mutex));
|
slub: make dead caches discard free slabs immediately
To speed up further allocations SLUB may store empty slabs in per cpu/node
partial lists instead of freeing them immediately. This prevents per
memcg caches destruction, because kmem caches created for a memory cgroup
are only destroyed after the last page charged to the cgroup is freed.
To fix this issue, this patch resurrects approach first proposed in [1].
It forbids SLUB to cache empty slabs after the memory cgroup that the
cache belongs to was destroyed. It is achieved by setting kmem_cache's
cpu_partial and min_partial constants to 0 and tuning put_cpu_partial() so
that it would drop frozen empty slabs immediately if cpu_partial = 0.
The runtime overhead is minimal. From all the hot functions, we only
touch relatively cold put_cpu_partial(): we make it call
unfreeze_partials() after freezing a slab that belongs to an offline
memory cgroup. Since slab freezing exists to avoid moving slabs from/to a
partial list on free/alloc, and there can't be allocations from dead
caches, it shouldn't cause any overhead. We do have to disable preemption
for put_cpu_partial() to achieve that though.
The original patch was accepted well and even merged to the mm tree.
However, I decided to withdraw it due to changes happening to the memcg
core at that time. I had an idea of introducing per-memcg shrinkers for
kmem caches, but now, as memcg has finally settled down, I do not see it
as an option, because SLUB shrinker would be too costly to call since SLUB
does not keep free slabs on a separate list. Besides, we currently do not
even call per-memcg shrinkers for offline memcgs. Overall, it would
introduce much more complexity to both SLUB and memcg than this small
patch.
Regarding to SLAB, there's no problem with it, because it shrinks
per-cpu/node caches periodically. Thanks to list_lru reparenting, we no
longer keep entries for offline cgroups in per-memcg arrays (such as
memcg_cache_params->memcg_caches), so we do not have to bother if a
per-memcg cache will be shrunk a bit later than it could be.
[1] http://thread.gmane.org/gmane.linux.kernel.mm/118649/focus=118650
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 22:59:47 +00:00
|
|
|
c = arr->entries[idx];
|
|
|
|
if (!c)
|
|
|
|
continue;
|
|
|
|
|
Revert "slub: move synchronize_sched out of slab_mutex on shrink"
Patch series "slab: make memcg slab destruction scalable", v3.
With kmem cgroup support enabled, kmem_caches can be created and
destroyed frequently and a great number of near empty kmem_caches can
accumulate if there are a lot of transient cgroups and the system is not
under memory pressure. When memory reclaim starts under such
conditions, it can lead to consecutive deactivation and destruction of
many kmem_caches, easily hundreds of thousands on moderately large
systems, exposing scalability issues in the current slab management
code.
I've seen machines which end up with hundred thousands of caches and
many millions of kernfs_nodes. The current code is O(N^2) on the total
number of caches and has synchronous rcu_barrier() and
synchronize_sched() in cgroup offline / release path which is executed
while holding cgroup_mutex. Combined, this leads to very expensive and
slow cache destruction operations which can easily keep running for half
a day.
This also messes up /proc/slabinfo along with other cache iterating
operations. seq_file operates on 4k chunks and on each 4k boundary
tries to seek to the last position in the list. With a huge number of
caches on the list, this becomes very slow and very prone to the list
content changing underneath it leading to a lot of missing and/or
duplicate entries.
This patchset addresses the scalability problem.
* Add root and per-memcg lists. Update each user to use the
appropriate list.
* Make rcu_barrier() for SLAB_DESTROY_BY_RCU caches globally batched
and asynchronous.
* For dying empty slub caches, remove the sysfs files after
deactivation so that we don't end up with millions of sysfs files
without any useful information on them.
This patchset contains the following nine patches.
0001-Revert-slub-move-synchronize_sched-out-of-slab_mutex.patch
0002-slub-separate-out-sysfs_slab_release-from-sysfs_slab.patch
0003-slab-remove-synchronous-rcu_barrier-call-in-memcg-ca.patch
0004-slab-reorganize-memcg_cache_params.patch
0005-slab-link-memcg-kmem_caches-on-their-associated-memo.patch
0006-slab-implement-slab_root_caches-list.patch
0007-slab-introduce-__kmemcg_cache_deactivate.patch
0008-slab-remove-synchronous-synchronize_sched-from-memcg.patch
0009-slab-remove-slub-sysfs-interface-files-early-for-emp.patch
0010-slab-use-memcg_kmem_cache_wq-for-slab-destruction-op.patch
0001 reverts an existing optimization to prepare for the following
changes. 0002 is a prep patch. 0003 makes rcu_barrier() in release
path batched and asynchronous. 0004-0006 separate out the lists.
0007-0008 replace synchronize_sched() in slub destruction path with
call_rcu_sched(). 0009 removes sysfs files early for empty dying
caches. 0010 makes destruction work items use a workqueue with limited
concurrency.
This patch (of 10):
Revert 89e364db71fb5e ("slub: move synchronize_sched out of slab_mutex on
shrink").
With kmem cgroup support enabled, kmem_caches can be created and destroyed
frequently and a great number of near empty kmem_caches can accumulate if
there are a lot of transient cgroups and the system is not under memory
pressure. When memory reclaim starts under such conditions, it can lead
to consecutive deactivation and destruction of many kmem_caches, easily
hundreds of thousands on moderately large systems, exposing scalability
issues in the current slab management code. This is one of the patches to
address the issue.
Moving synchronize_sched() out of slab_mutex isn't enough as it's still
inside cgroup_mutex. The whole deactivation / release path will be
updated to avoid all synchronous RCU operations. Revert this insufficient
optimization in preparation to ease future changes.
Link: http://lkml.kernel.org/r/20170117235411.9408-2-tj@kernel.org
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Jay Vana <jsvana@fb.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-02-22 23:41:08 +00:00
|
|
|
__kmem_cache_shrink(c, true);
|
2015-02-12 22:59:32 +00:00
|
|
|
arr->entries[idx] = NULL;
|
|
|
|
}
|
|
|
|
mutex_unlock(&slab_mutex);
|
slub: make dead caches discard free slabs immediately
To speed up further allocations SLUB may store empty slabs in per cpu/node
partial lists instead of freeing them immediately. This prevents per
memcg caches destruction, because kmem caches created for a memory cgroup
are only destroyed after the last page charged to the cgroup is freed.
To fix this issue, this patch resurrects approach first proposed in [1].
It forbids SLUB to cache empty slabs after the memory cgroup that the
cache belongs to was destroyed. It is achieved by setting kmem_cache's
cpu_partial and min_partial constants to 0 and tuning put_cpu_partial() so
that it would drop frozen empty slabs immediately if cpu_partial = 0.
The runtime overhead is minimal. From all the hot functions, we only
touch relatively cold put_cpu_partial(): we make it call
unfreeze_partials() after freezing a slab that belongs to an offline
memory cgroup. Since slab freezing exists to avoid moving slabs from/to a
partial list on free/alloc, and there can't be allocations from dead
caches, it shouldn't cause any overhead. We do have to disable preemption
for put_cpu_partial() to achieve that though.
The original patch was accepted well and even merged to the mm tree.
However, I decided to withdraw it due to changes happening to the memcg
core at that time. I had an idea of introducing per-memcg shrinkers for
kmem caches, but now, as memcg has finally settled down, I do not see it
as an option, because SLUB shrinker would be too costly to call since SLUB
does not keep free slabs on a separate list. Besides, we currently do not
even call per-memcg shrinkers for offline memcgs. Overall, it would
introduce much more complexity to both SLUB and memcg than this small
patch.
Regarding to SLAB, there's no problem with it, because it shrinks
per-cpu/node caches periodically. Thanks to list_lru reparenting, we no
longer keep entries for offline cgroups in per-memcg arrays (such as
memcg_cache_params->memcg_caches), so we do not have to bother if a
per-memcg cache will be shrunk a bit later than it could be.
[1] http://thread.gmane.org/gmane.linux.kernel.mm/118649/focus=118650
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 22:59:47 +00:00
|
|
|
|
|
|
|
put_online_mems();
|
|
|
|
put_online_cpus();
|
2015-02-12 22:59:32 +00:00
|
|
|
}
|
|
|
|
|
2015-11-06 02:45:11 +00:00
|
|
|
static int __shutdown_memcg_cache(struct kmem_cache *s,
|
|
|
|
struct list_head *release, bool *need_rcu_barrier)
|
|
|
|
{
|
|
|
|
BUG_ON(is_root_cache(s));
|
|
|
|
|
|
|
|
if (shutdown_cache(s, release, need_rcu_barrier))
|
|
|
|
return -EBUSY;
|
|
|
|
|
|
|
|
list_del(&s->memcg_params.list);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
memcg: zap memcg_slab_caches and memcg_slab_mutex
mem_cgroup->memcg_slab_caches is a list of kmem caches corresponding to
the given cgroup. Currently, it is only used on css free in order to
destroy all caches corresponding to the memory cgroup being freed. The
list is protected by memcg_slab_mutex. The mutex is also used to protect
kmem_cache->memcg_params->memcg_caches arrays and synchronizes
kmem_cache_destroy vs memcg_unregister_all_caches.
However, we can perfectly get on without these two. To destroy all caches
corresponding to a memory cgroup, we can walk over the global list of kmem
caches, slab_caches, and we can do all the synchronization stuff using the
slab_mutex instead of the memcg_slab_mutex. This patch therefore gets rid
of the memcg_slab_caches and memcg_slab_mutex.
Apart from this nice cleanup, it also:
- assures that rcu_barrier() is called once at max when a root cache is
destroyed or a memory cgroup is freed, no matter how many caches have
SLAB_DESTROY_BY_RCU flag set;
- fixes the race between kmem_cache_destroy and kmem_cache_create that
exists, because memcg_cleanup_cache_params, which is called from
kmem_cache_destroy after checking that kmem_cache->refcount=0,
releases the slab_mutex, which gives kmem_cache_create a chance to
make an alias to a cache doomed to be destroyed.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Acked-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-10 22:11:47 +00:00
|
|
|
void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
|
2014-04-07 22:39:28 +00:00
|
|
|
{
|
memcg: zap memcg_slab_caches and memcg_slab_mutex
mem_cgroup->memcg_slab_caches is a list of kmem caches corresponding to
the given cgroup. Currently, it is only used on css free in order to
destroy all caches corresponding to the memory cgroup being freed. The
list is protected by memcg_slab_mutex. The mutex is also used to protect
kmem_cache->memcg_params->memcg_caches arrays and synchronizes
kmem_cache_destroy vs memcg_unregister_all_caches.
However, we can perfectly get on without these two. To destroy all caches
corresponding to a memory cgroup, we can walk over the global list of kmem
caches, slab_caches, and we can do all the synchronization stuff using the
slab_mutex instead of the memcg_slab_mutex. This patch therefore gets rid
of the memcg_slab_caches and memcg_slab_mutex.
Apart from this nice cleanup, it also:
- assures that rcu_barrier() is called once at max when a root cache is
destroyed or a memory cgroup is freed, no matter how many caches have
SLAB_DESTROY_BY_RCU flag set;
- fixes the race between kmem_cache_destroy and kmem_cache_create that
exists, because memcg_cleanup_cache_params, which is called from
kmem_cache_destroy after checking that kmem_cache->refcount=0,
releases the slab_mutex, which gives kmem_cache_create a chance to
make an alias to a cache doomed to be destroyed.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Acked-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-10 22:11:47 +00:00
|
|
|
LIST_HEAD(release);
|
|
|
|
bool need_rcu_barrier = false;
|
|
|
|
struct kmem_cache *s, *s2;
|
2014-04-07 22:39:28 +00:00
|
|
|
|
memcg: zap memcg_slab_caches and memcg_slab_mutex
mem_cgroup->memcg_slab_caches is a list of kmem caches corresponding to
the given cgroup. Currently, it is only used on css free in order to
destroy all caches corresponding to the memory cgroup being freed. The
list is protected by memcg_slab_mutex. The mutex is also used to protect
kmem_cache->memcg_params->memcg_caches arrays and synchronizes
kmem_cache_destroy vs memcg_unregister_all_caches.
However, we can perfectly get on without these two. To destroy all caches
corresponding to a memory cgroup, we can walk over the global list of kmem
caches, slab_caches, and we can do all the synchronization stuff using the
slab_mutex instead of the memcg_slab_mutex. This patch therefore gets rid
of the memcg_slab_caches and memcg_slab_mutex.
Apart from this nice cleanup, it also:
- assures that rcu_barrier() is called once at max when a root cache is
destroyed or a memory cgroup is freed, no matter how many caches have
SLAB_DESTROY_BY_RCU flag set;
- fixes the race between kmem_cache_destroy and kmem_cache_create that
exists, because memcg_cleanup_cache_params, which is called from
kmem_cache_destroy after checking that kmem_cache->refcount=0,
releases the slab_mutex, which gives kmem_cache_create a chance to
make an alias to a cache doomed to be destroyed.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Acked-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-10 22:11:47 +00:00
|
|
|
get_online_cpus();
|
|
|
|
get_online_mems();
|
2014-04-07 22:39:28 +00:00
|
|
|
|
|
|
|
mutex_lock(&slab_mutex);
|
memcg: zap memcg_slab_caches and memcg_slab_mutex
mem_cgroup->memcg_slab_caches is a list of kmem caches corresponding to
the given cgroup. Currently, it is only used on css free in order to
destroy all caches corresponding to the memory cgroup being freed. The
list is protected by memcg_slab_mutex. The mutex is also used to protect
kmem_cache->memcg_params->memcg_caches arrays and synchronizes
kmem_cache_destroy vs memcg_unregister_all_caches.
However, we can perfectly get on without these two. To destroy all caches
corresponding to a memory cgroup, we can walk over the global list of kmem
caches, slab_caches, and we can do all the synchronization stuff using the
slab_mutex instead of the memcg_slab_mutex. This patch therefore gets rid
of the memcg_slab_caches and memcg_slab_mutex.
Apart from this nice cleanup, it also:
- assures that rcu_barrier() is called once at max when a root cache is
destroyed or a memory cgroup is freed, no matter how many caches have
SLAB_DESTROY_BY_RCU flag set;
- fixes the race between kmem_cache_destroy and kmem_cache_create that
exists, because memcg_cleanup_cache_params, which is called from
kmem_cache_destroy after checking that kmem_cache->refcount=0,
releases the slab_mutex, which gives kmem_cache_create a chance to
make an alias to a cache doomed to be destroyed.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Acked-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-10 22:11:47 +00:00
|
|
|
list_for_each_entry_safe(s, s2, &slab_caches, list) {
|
2015-02-12 22:59:20 +00:00
|
|
|
if (is_root_cache(s) || s->memcg_params.memcg != memcg)
|
memcg: zap memcg_slab_caches and memcg_slab_mutex
mem_cgroup->memcg_slab_caches is a list of kmem caches corresponding to
the given cgroup. Currently, it is only used on css free in order to
destroy all caches corresponding to the memory cgroup being freed. The
list is protected by memcg_slab_mutex. The mutex is also used to protect
kmem_cache->memcg_params->memcg_caches arrays and synchronizes
kmem_cache_destroy vs memcg_unregister_all_caches.
However, we can perfectly get on without these two. To destroy all caches
corresponding to a memory cgroup, we can walk over the global list of kmem
caches, slab_caches, and we can do all the synchronization stuff using the
slab_mutex instead of the memcg_slab_mutex. This patch therefore gets rid
of the memcg_slab_caches and memcg_slab_mutex.
Apart from this nice cleanup, it also:
- assures that rcu_barrier() is called once at max when a root cache is
destroyed or a memory cgroup is freed, no matter how many caches have
SLAB_DESTROY_BY_RCU flag set;
- fixes the race between kmem_cache_destroy and kmem_cache_create that
exists, because memcg_cleanup_cache_params, which is called from
kmem_cache_destroy after checking that kmem_cache->refcount=0,
releases the slab_mutex, which gives kmem_cache_create a chance to
make an alias to a cache doomed to be destroyed.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Acked-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-10 22:11:47 +00:00
|
|
|
continue;
|
|
|
|
/*
|
|
|
|
* The cgroup is about to be freed and therefore has no charges
|
|
|
|
* left. Hence, all its caches must be empty by now.
|
|
|
|
*/
|
2015-11-06 02:45:11 +00:00
|
|
|
BUG_ON(__shutdown_memcg_cache(s, &release, &need_rcu_barrier));
|
memcg: zap memcg_slab_caches and memcg_slab_mutex
mem_cgroup->memcg_slab_caches is a list of kmem caches corresponding to
the given cgroup. Currently, it is only used on css free in order to
destroy all caches corresponding to the memory cgroup being freed. The
list is protected by memcg_slab_mutex. The mutex is also used to protect
kmem_cache->memcg_params->memcg_caches arrays and synchronizes
kmem_cache_destroy vs memcg_unregister_all_caches.
However, we can perfectly get on without these two. To destroy all caches
corresponding to a memory cgroup, we can walk over the global list of kmem
caches, slab_caches, and we can do all the synchronization stuff using the
slab_mutex instead of the memcg_slab_mutex. This patch therefore gets rid
of the memcg_slab_caches and memcg_slab_mutex.
Apart from this nice cleanup, it also:
- assures that rcu_barrier() is called once at max when a root cache is
destroyed or a memory cgroup is freed, no matter how many caches have
SLAB_DESTROY_BY_RCU flag set;
- fixes the race between kmem_cache_destroy and kmem_cache_create that
exists, because memcg_cleanup_cache_params, which is called from
kmem_cache_destroy after checking that kmem_cache->refcount=0,
releases the slab_mutex, which gives kmem_cache_create a chance to
make an alias to a cache doomed to be destroyed.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Acked-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-10 22:11:47 +00:00
|
|
|
}
|
|
|
|
mutex_unlock(&slab_mutex);
|
2014-04-07 22:39:28 +00:00
|
|
|
|
memcg: zap memcg_slab_caches and memcg_slab_mutex
mem_cgroup->memcg_slab_caches is a list of kmem caches corresponding to
the given cgroup. Currently, it is only used on css free in order to
destroy all caches corresponding to the memory cgroup being freed. The
list is protected by memcg_slab_mutex. The mutex is also used to protect
kmem_cache->memcg_params->memcg_caches arrays and synchronizes
kmem_cache_destroy vs memcg_unregister_all_caches.
However, we can perfectly get on without these two. To destroy all caches
corresponding to a memory cgroup, we can walk over the global list of kmem
caches, slab_caches, and we can do all the synchronization stuff using the
slab_mutex instead of the memcg_slab_mutex. This patch therefore gets rid
of the memcg_slab_caches and memcg_slab_mutex.
Apart from this nice cleanup, it also:
- assures that rcu_barrier() is called once at max when a root cache is
destroyed or a memory cgroup is freed, no matter how many caches have
SLAB_DESTROY_BY_RCU flag set;
- fixes the race between kmem_cache_destroy and kmem_cache_create that
exists, because memcg_cleanup_cache_params, which is called from
kmem_cache_destroy after checking that kmem_cache->refcount=0,
releases the slab_mutex, which gives kmem_cache_create a chance to
make an alias to a cache doomed to be destroyed.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Acked-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-10 22:11:47 +00:00
|
|
|
put_online_mems();
|
|
|
|
put_online_cpus();
|
|
|
|
|
2015-11-06 02:45:08 +00:00
|
|
|
release_caches(&release, need_rcu_barrier);
|
2014-04-07 22:39:28 +00:00
|
|
|
}
|
2015-11-06 02:45:11 +00:00
|
|
|
|
|
|
|
static int shutdown_memcg_caches(struct kmem_cache *s,
|
|
|
|
struct list_head *release, bool *need_rcu_barrier)
|
|
|
|
{
|
|
|
|
struct memcg_cache_array *arr;
|
|
|
|
struct kmem_cache *c, *c2;
|
|
|
|
LIST_HEAD(busy);
|
|
|
|
int i;
|
|
|
|
|
|
|
|
BUG_ON(!is_root_cache(s));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* First, shutdown active caches, i.e. caches that belong to online
|
|
|
|
* memory cgroups.
|
|
|
|
*/
|
|
|
|
arr = rcu_dereference_protected(s->memcg_params.memcg_caches,
|
|
|
|
lockdep_is_held(&slab_mutex));
|
|
|
|
for_each_memcg_cache_index(i) {
|
|
|
|
c = arr->entries[i];
|
|
|
|
if (!c)
|
|
|
|
continue;
|
|
|
|
if (__shutdown_memcg_cache(c, release, need_rcu_barrier))
|
|
|
|
/*
|
|
|
|
* The cache still has objects. Move it to a temporary
|
|
|
|
* list so as not to try to destroy it for a second
|
|
|
|
* time while iterating over inactive caches below.
|
|
|
|
*/
|
|
|
|
list_move(&c->memcg_params.list, &busy);
|
|
|
|
else
|
|
|
|
/*
|
|
|
|
* The cache is empty and will be destroyed soon. Clear
|
|
|
|
* the pointer to it in the memcg_caches array so that
|
|
|
|
* it will never be accessed even if the root cache
|
|
|
|
* stays alive.
|
|
|
|
*/
|
|
|
|
arr->entries[i] = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Second, shutdown all caches left from memory cgroups that are now
|
|
|
|
* offline.
|
|
|
|
*/
|
|
|
|
list_for_each_entry_safe(c, c2, &s->memcg_params.list,
|
|
|
|
memcg_params.list)
|
|
|
|
__shutdown_memcg_cache(c, release, need_rcu_barrier);
|
|
|
|
|
|
|
|
list_splice(&busy, &s->memcg_params.list);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* A cache being destroyed must be empty. In particular, this means
|
|
|
|
* that all per memcg caches attached to it must be empty too.
|
|
|
|
*/
|
|
|
|
if (!list_empty(&s->memcg_params.list))
|
|
|
|
return -EBUSY;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
static inline int shutdown_memcg_caches(struct kmem_cache *s,
|
|
|
|
struct list_head *release, bool *need_rcu_barrier)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
2016-01-20 23:02:32 +00:00
|
|
|
#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
|
2012-07-06 20:25:11 +00:00
|
|
|
|
2014-05-06 19:50:08 +00:00
|
|
|
void slab_kmem_cache_release(struct kmem_cache *s)
|
|
|
|
{
|
2016-02-17 21:11:37 +00:00
|
|
|
__kmem_cache_release(s);
|
2015-02-12 22:59:20 +00:00
|
|
|
destroy_memcg_params(s);
|
2015-02-13 22:36:38 +00:00
|
|
|
kfree_const(s->name);
|
2014-05-06 19:50:08 +00:00
|
|
|
kmem_cache_free(kmem_cache, s);
|
|
|
|
}
|
|
|
|
|
2012-09-04 23:18:33 +00:00
|
|
|
void kmem_cache_destroy(struct kmem_cache *s)
|
|
|
|
{
|
memcg: zap memcg_slab_caches and memcg_slab_mutex
mem_cgroup->memcg_slab_caches is a list of kmem caches corresponding to
the given cgroup. Currently, it is only used on css free in order to
destroy all caches corresponding to the memory cgroup being freed. The
list is protected by memcg_slab_mutex. The mutex is also used to protect
kmem_cache->memcg_params->memcg_caches arrays and synchronizes
kmem_cache_destroy vs memcg_unregister_all_caches.
However, we can perfectly get on without these two. To destroy all caches
corresponding to a memory cgroup, we can walk over the global list of kmem
caches, slab_caches, and we can do all the synchronization stuff using the
slab_mutex instead of the memcg_slab_mutex. This patch therefore gets rid
of the memcg_slab_caches and memcg_slab_mutex.
Apart from this nice cleanup, it also:
- assures that rcu_barrier() is called once at max when a root cache is
destroyed or a memory cgroup is freed, no matter how many caches have
SLAB_DESTROY_BY_RCU flag set;
- fixes the race between kmem_cache_destroy and kmem_cache_create that
exists, because memcg_cleanup_cache_params, which is called from
kmem_cache_destroy after checking that kmem_cache->refcount=0,
releases the slab_mutex, which gives kmem_cache_create a chance to
make an alias to a cache doomed to be destroyed.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Acked-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-10 22:11:47 +00:00
|
|
|
LIST_HEAD(release);
|
|
|
|
bool need_rcu_barrier = false;
|
2015-11-06 02:45:11 +00:00
|
|
|
int err;
|
memcg: zap memcg_slab_caches and memcg_slab_mutex
mem_cgroup->memcg_slab_caches is a list of kmem caches corresponding to
the given cgroup. Currently, it is only used on css free in order to
destroy all caches corresponding to the memory cgroup being freed. The
list is protected by memcg_slab_mutex. The mutex is also used to protect
kmem_cache->memcg_params->memcg_caches arrays and synchronizes
kmem_cache_destroy vs memcg_unregister_all_caches.
However, we can perfectly get on without these two. To destroy all caches
corresponding to a memory cgroup, we can walk over the global list of kmem
caches, slab_caches, and we can do all the synchronization stuff using the
slab_mutex instead of the memcg_slab_mutex. This patch therefore gets rid
of the memcg_slab_caches and memcg_slab_mutex.
Apart from this nice cleanup, it also:
- assures that rcu_barrier() is called once at max when a root cache is
destroyed or a memory cgroup is freed, no matter how many caches have
SLAB_DESTROY_BY_RCU flag set;
- fixes the race between kmem_cache_destroy and kmem_cache_create that
exists, because memcg_cleanup_cache_params, which is called from
kmem_cache_destroy after checking that kmem_cache->refcount=0,
releases the slab_mutex, which gives kmem_cache_create a chance to
make an alias to a cache doomed to be destroyed.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Acked-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-10 22:11:47 +00:00
|
|
|
|
2015-09-08 22:00:50 +00:00
|
|
|
if (unlikely(!s))
|
|
|
|
return;
|
|
|
|
|
2012-09-04 23:18:33 +00:00
|
|
|
get_online_cpus();
|
slab: get_online_mems for kmem_cache_{create,destroy,shrink}
When we create a sl[au]b cache, we allocate kmem_cache_node structures
for each online NUMA node. To handle nodes taken online/offline, we
register memory hotplug notifier and allocate/free kmem_cache_node
corresponding to the node that changes its state for each kmem cache.
To synchronize between the two paths we hold the slab_mutex during both
the cache creationg/destruction path and while tuning per-node parts of
kmem caches in memory hotplug handler, but that's not quite right,
because it does not guarantee that a newly created cache will have all
kmem_cache_nodes initialized in case it races with memory hotplug. For
instance, in case of slub:
CPU0 CPU1
---- ----
kmem_cache_create: online_pages:
__kmem_cache_create: slab_memory_callback:
slab_mem_going_online_callback:
lock slab_mutex
for each slab_caches list entry
allocate kmem_cache node
unlock slab_mutex
lock slab_mutex
init_kmem_cache_nodes:
for_each_node_state(node, N_NORMAL_MEMORY)
allocate kmem_cache node
add kmem_cache to slab_caches list
unlock slab_mutex
online_pages (continued):
node_states_set_node
As a result we'll get a kmem cache with not all kmem_cache_nodes
allocated.
To avoid issues like that we should hold get/put_online_mems() during
the whole kmem cache creation/destruction/shrink paths, just like we
deal with cpu hotplug. This patch does the trick.
Note, that after it's applied, there is no need in taking the slab_mutex
for kmem_cache_shrink any more, so it is removed from there.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Tang Chen <tangchen@cn.fujitsu.com>
Cc: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Cc: Toshi Kani <toshi.kani@hp.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Jiang Liu <liuj97@gmail.com>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-06-04 23:07:20 +00:00
|
|
|
get_online_mems();
|
|
|
|
|
mm: kasan: initial memory quarantine implementation
Quarantine isolates freed objects in a separate queue. The objects are
returned to the allocator later, which helps to detect use-after-free
errors.
When the object is freed, its state changes from KASAN_STATE_ALLOC to
KASAN_STATE_QUARANTINE. The object is poisoned and put into quarantine
instead of being returned to the allocator, therefore every subsequent
access to that object triggers a KASAN error, and the error handler is
able to say where the object has been allocated and deallocated.
When it's time for the object to leave quarantine, its state becomes
KASAN_STATE_FREE and it's returned to the allocator. From now on the
allocator may reuse it for another allocation. Before that happens,
it's still possible to detect a use-after free on that object (it
retains the allocation/deallocation stacks).
When the allocator reuses this object, the shadow is unpoisoned and old
allocation/deallocation stacks are wiped. Therefore a use of this
object, even an incorrect one, won't trigger ASan warning.
Without the quarantine, it's not guaranteed that the objects aren't
reused immediately, that's why the probability of catching a
use-after-free is lower than with quarantine in place.
Quarantine isolates freed objects in a separate queue. The objects are
returned to the allocator later, which helps to detect use-after-free
errors.
Freed objects are first added to per-cpu quarantine queues. When a
cache is destroyed or memory shrinking is requested, the objects are
moved into the global quarantine queue. Whenever a kmalloc call allows
memory reclaiming, the oldest objects are popped out of the global queue
until the total size of objects in quarantine is less than 3/4 of the
maximum quarantine size (which is a fraction of installed physical
memory).
As long as an object remains in the quarantine, KASAN is able to report
accesses to it, so the chance of reporting a use-after-free is
increased. Once the object leaves quarantine, the allocator may reuse
it, in which case the object is unpoisoned and KASAN can't detect
incorrect accesses to it.
Right now quarantine support is only enabled in SLAB allocator.
Unification of KASAN features in SLAB and SLUB will be done later.
This patch is based on the "mm: kasan: quarantine" patch originally
prepared by Dmitry Chernenkov. A number of improvements have been
suggested by Andrey Ryabinin.
[glider@google.com: v9]
Link: http://lkml.kernel.org/r/1462987130-144092-1-git-send-email-glider@google.com
Signed-off-by: Alexander Potapenko <glider@google.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Andrey Konovalov <adech.fo@gmail.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Konstantin Serebryany <kcc@google.com>
Cc: Dmitry Chernenkov <dmitryc@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-05-20 23:59:11 +00:00
|
|
|
kasan_cache_destroy(s);
|
2012-09-04 23:18:33 +00:00
|
|
|
mutex_lock(&slab_mutex);
|
2014-04-07 22:39:28 +00:00
|
|
|
|
2012-09-04 23:18:33 +00:00
|
|
|
s->refcount--;
|
2014-04-07 22:39:28 +00:00
|
|
|
if (s->refcount)
|
|
|
|
goto out_unlock;
|
|
|
|
|
2015-11-06 02:45:11 +00:00
|
|
|
err = shutdown_memcg_caches(s, &release, &need_rcu_barrier);
|
|
|
|
if (!err)
|
2015-11-06 02:45:14 +00:00
|
|
|
err = shutdown_cache(s, &release, &need_rcu_barrier);
|
2014-04-07 22:39:28 +00:00
|
|
|
|
2015-11-06 02:45:14 +00:00
|
|
|
if (err) {
|
2016-03-17 21:19:47 +00:00
|
|
|
pr_err("kmem_cache_destroy %s: Slab cache still has objects\n",
|
|
|
|
s->name);
|
2015-11-06 02:45:14 +00:00
|
|
|
dump_stack();
|
|
|
|
}
|
2014-04-07 22:39:28 +00:00
|
|
|
out_unlock:
|
|
|
|
mutex_unlock(&slab_mutex);
|
memcg: zap memcg_slab_caches and memcg_slab_mutex
mem_cgroup->memcg_slab_caches is a list of kmem caches corresponding to
the given cgroup. Currently, it is only used on css free in order to
destroy all caches corresponding to the memory cgroup being freed. The
list is protected by memcg_slab_mutex. The mutex is also used to protect
kmem_cache->memcg_params->memcg_caches arrays and synchronizes
kmem_cache_destroy vs memcg_unregister_all_caches.
However, we can perfectly get on without these two. To destroy all caches
corresponding to a memory cgroup, we can walk over the global list of kmem
caches, slab_caches, and we can do all the synchronization stuff using the
slab_mutex instead of the memcg_slab_mutex. This patch therefore gets rid
of the memcg_slab_caches and memcg_slab_mutex.
Apart from this nice cleanup, it also:
- assures that rcu_barrier() is called once at max when a root cache is
destroyed or a memory cgroup is freed, no matter how many caches have
SLAB_DESTROY_BY_RCU flag set;
- fixes the race between kmem_cache_destroy and kmem_cache_create that
exists, because memcg_cleanup_cache_params, which is called from
kmem_cache_destroy after checking that kmem_cache->refcount=0,
releases the slab_mutex, which gives kmem_cache_create a chance to
make an alias to a cache doomed to be destroyed.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Acked-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-10 22:11:47 +00:00
|
|
|
|
slab: get_online_mems for kmem_cache_{create,destroy,shrink}
When we create a sl[au]b cache, we allocate kmem_cache_node structures
for each online NUMA node. To handle nodes taken online/offline, we
register memory hotplug notifier and allocate/free kmem_cache_node
corresponding to the node that changes its state for each kmem cache.
To synchronize between the two paths we hold the slab_mutex during both
the cache creationg/destruction path and while tuning per-node parts of
kmem caches in memory hotplug handler, but that's not quite right,
because it does not guarantee that a newly created cache will have all
kmem_cache_nodes initialized in case it races with memory hotplug. For
instance, in case of slub:
CPU0 CPU1
---- ----
kmem_cache_create: online_pages:
__kmem_cache_create: slab_memory_callback:
slab_mem_going_online_callback:
lock slab_mutex
for each slab_caches list entry
allocate kmem_cache node
unlock slab_mutex
lock slab_mutex
init_kmem_cache_nodes:
for_each_node_state(node, N_NORMAL_MEMORY)
allocate kmem_cache node
add kmem_cache to slab_caches list
unlock slab_mutex
online_pages (continued):
node_states_set_node
As a result we'll get a kmem cache with not all kmem_cache_nodes
allocated.
To avoid issues like that we should hold get/put_online_mems() during
the whole kmem cache creation/destruction/shrink paths, just like we
deal with cpu hotplug. This patch does the trick.
Note, that after it's applied, there is no need in taking the slab_mutex
for kmem_cache_shrink any more, so it is removed from there.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Tang Chen <tangchen@cn.fujitsu.com>
Cc: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Cc: Toshi Kani <toshi.kani@hp.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Jiang Liu <liuj97@gmail.com>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-06-04 23:07:20 +00:00
|
|
|
put_online_mems();
|
2012-09-04 23:18:33 +00:00
|
|
|
put_online_cpus();
|
memcg: zap memcg_slab_caches and memcg_slab_mutex
mem_cgroup->memcg_slab_caches is a list of kmem caches corresponding to
the given cgroup. Currently, it is only used on css free in order to
destroy all caches corresponding to the memory cgroup being freed. The
list is protected by memcg_slab_mutex. The mutex is also used to protect
kmem_cache->memcg_params->memcg_caches arrays and synchronizes
kmem_cache_destroy vs memcg_unregister_all_caches.
However, we can perfectly get on without these two. To destroy all caches
corresponding to a memory cgroup, we can walk over the global list of kmem
caches, slab_caches, and we can do all the synchronization stuff using the
slab_mutex instead of the memcg_slab_mutex. This patch therefore gets rid
of the memcg_slab_caches and memcg_slab_mutex.
Apart from this nice cleanup, it also:
- assures that rcu_barrier() is called once at max when a root cache is
destroyed or a memory cgroup is freed, no matter how many caches have
SLAB_DESTROY_BY_RCU flag set;
- fixes the race between kmem_cache_destroy and kmem_cache_create that
exists, because memcg_cleanup_cache_params, which is called from
kmem_cache_destroy after checking that kmem_cache->refcount=0,
releases the slab_mutex, which gives kmem_cache_create a chance to
make an alias to a cache doomed to be destroyed.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Acked-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-10 22:11:47 +00:00
|
|
|
|
2015-11-06 02:45:08 +00:00
|
|
|
release_caches(&release, need_rcu_barrier);
|
2012-09-04 23:18:33 +00:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(kmem_cache_destroy);
|
|
|
|
|
slab: get_online_mems for kmem_cache_{create,destroy,shrink}
When we create a sl[au]b cache, we allocate kmem_cache_node structures
for each online NUMA node. To handle nodes taken online/offline, we
register memory hotplug notifier and allocate/free kmem_cache_node
corresponding to the node that changes its state for each kmem cache.
To synchronize between the two paths we hold the slab_mutex during both
the cache creationg/destruction path and while tuning per-node parts of
kmem caches in memory hotplug handler, but that's not quite right,
because it does not guarantee that a newly created cache will have all
kmem_cache_nodes initialized in case it races with memory hotplug. For
instance, in case of slub:
CPU0 CPU1
---- ----
kmem_cache_create: online_pages:
__kmem_cache_create: slab_memory_callback:
slab_mem_going_online_callback:
lock slab_mutex
for each slab_caches list entry
allocate kmem_cache node
unlock slab_mutex
lock slab_mutex
init_kmem_cache_nodes:
for_each_node_state(node, N_NORMAL_MEMORY)
allocate kmem_cache node
add kmem_cache to slab_caches list
unlock slab_mutex
online_pages (continued):
node_states_set_node
As a result we'll get a kmem cache with not all kmem_cache_nodes
allocated.
To avoid issues like that we should hold get/put_online_mems() during
the whole kmem cache creation/destruction/shrink paths, just like we
deal with cpu hotplug. This patch does the trick.
Note, that after it's applied, there is no need in taking the slab_mutex
for kmem_cache_shrink any more, so it is removed from there.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Tang Chen <tangchen@cn.fujitsu.com>
Cc: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Cc: Toshi Kani <toshi.kani@hp.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Jiang Liu <liuj97@gmail.com>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-06-04 23:07:20 +00:00
|
|
|
/**
|
|
|
|
* kmem_cache_shrink - Shrink a cache.
|
|
|
|
* @cachep: The cache to shrink.
|
|
|
|
*
|
|
|
|
* Releases as many slabs as possible for a cache.
|
|
|
|
* To help debugging, a zero exit status indicates all slabs were released.
|
|
|
|
*/
|
|
|
|
int kmem_cache_shrink(struct kmem_cache *cachep)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
get_online_cpus();
|
|
|
|
get_online_mems();
|
mm: kasan: initial memory quarantine implementation
Quarantine isolates freed objects in a separate queue. The objects are
returned to the allocator later, which helps to detect use-after-free
errors.
When the object is freed, its state changes from KASAN_STATE_ALLOC to
KASAN_STATE_QUARANTINE. The object is poisoned and put into quarantine
instead of being returned to the allocator, therefore every subsequent
access to that object triggers a KASAN error, and the error handler is
able to say where the object has been allocated and deallocated.
When it's time for the object to leave quarantine, its state becomes
KASAN_STATE_FREE and it's returned to the allocator. From now on the
allocator may reuse it for another allocation. Before that happens,
it's still possible to detect a use-after free on that object (it
retains the allocation/deallocation stacks).
When the allocator reuses this object, the shadow is unpoisoned and old
allocation/deallocation stacks are wiped. Therefore a use of this
object, even an incorrect one, won't trigger ASan warning.
Without the quarantine, it's not guaranteed that the objects aren't
reused immediately, that's why the probability of catching a
use-after-free is lower than with quarantine in place.
Quarantine isolates freed objects in a separate queue. The objects are
returned to the allocator later, which helps to detect use-after-free
errors.
Freed objects are first added to per-cpu quarantine queues. When a
cache is destroyed or memory shrinking is requested, the objects are
moved into the global quarantine queue. Whenever a kmalloc call allows
memory reclaiming, the oldest objects are popped out of the global queue
until the total size of objects in quarantine is less than 3/4 of the
maximum quarantine size (which is a fraction of installed physical
memory).
As long as an object remains in the quarantine, KASAN is able to report
accesses to it, so the chance of reporting a use-after-free is
increased. Once the object leaves quarantine, the allocator may reuse
it, in which case the object is unpoisoned and KASAN can't detect
incorrect accesses to it.
Right now quarantine support is only enabled in SLAB allocator.
Unification of KASAN features in SLAB and SLUB will be done later.
This patch is based on the "mm: kasan: quarantine" patch originally
prepared by Dmitry Chernenkov. A number of improvements have been
suggested by Andrey Ryabinin.
[glider@google.com: v9]
Link: http://lkml.kernel.org/r/1462987130-144092-1-git-send-email-glider@google.com
Signed-off-by: Alexander Potapenko <glider@google.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Andrey Konovalov <adech.fo@gmail.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Konstantin Serebryany <kcc@google.com>
Cc: Dmitry Chernenkov <dmitryc@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-05-20 23:59:11 +00:00
|
|
|
kasan_cache_shrink(cachep);
|
Revert "slub: move synchronize_sched out of slab_mutex on shrink"
Patch series "slab: make memcg slab destruction scalable", v3.
With kmem cgroup support enabled, kmem_caches can be created and
destroyed frequently and a great number of near empty kmem_caches can
accumulate if there are a lot of transient cgroups and the system is not
under memory pressure. When memory reclaim starts under such
conditions, it can lead to consecutive deactivation and destruction of
many kmem_caches, easily hundreds of thousands on moderately large
systems, exposing scalability issues in the current slab management
code.
I've seen machines which end up with hundred thousands of caches and
many millions of kernfs_nodes. The current code is O(N^2) on the total
number of caches and has synchronous rcu_barrier() and
synchronize_sched() in cgroup offline / release path which is executed
while holding cgroup_mutex. Combined, this leads to very expensive and
slow cache destruction operations which can easily keep running for half
a day.
This also messes up /proc/slabinfo along with other cache iterating
operations. seq_file operates on 4k chunks and on each 4k boundary
tries to seek to the last position in the list. With a huge number of
caches on the list, this becomes very slow and very prone to the list
content changing underneath it leading to a lot of missing and/or
duplicate entries.
This patchset addresses the scalability problem.
* Add root and per-memcg lists. Update each user to use the
appropriate list.
* Make rcu_barrier() for SLAB_DESTROY_BY_RCU caches globally batched
and asynchronous.
* For dying empty slub caches, remove the sysfs files after
deactivation so that we don't end up with millions of sysfs files
without any useful information on them.
This patchset contains the following nine patches.
0001-Revert-slub-move-synchronize_sched-out-of-slab_mutex.patch
0002-slub-separate-out-sysfs_slab_release-from-sysfs_slab.patch
0003-slab-remove-synchronous-rcu_barrier-call-in-memcg-ca.patch
0004-slab-reorganize-memcg_cache_params.patch
0005-slab-link-memcg-kmem_caches-on-their-associated-memo.patch
0006-slab-implement-slab_root_caches-list.patch
0007-slab-introduce-__kmemcg_cache_deactivate.patch
0008-slab-remove-synchronous-synchronize_sched-from-memcg.patch
0009-slab-remove-slub-sysfs-interface-files-early-for-emp.patch
0010-slab-use-memcg_kmem_cache_wq-for-slab-destruction-op.patch
0001 reverts an existing optimization to prepare for the following
changes. 0002 is a prep patch. 0003 makes rcu_barrier() in release
path batched and asynchronous. 0004-0006 separate out the lists.
0007-0008 replace synchronize_sched() in slub destruction path with
call_rcu_sched(). 0009 removes sysfs files early for empty dying
caches. 0010 makes destruction work items use a workqueue with limited
concurrency.
This patch (of 10):
Revert 89e364db71fb5e ("slub: move synchronize_sched out of slab_mutex on
shrink").
With kmem cgroup support enabled, kmem_caches can be created and destroyed
frequently and a great number of near empty kmem_caches can accumulate if
there are a lot of transient cgroups and the system is not under memory
pressure. When memory reclaim starts under such conditions, it can lead
to consecutive deactivation and destruction of many kmem_caches, easily
hundreds of thousands on moderately large systems, exposing scalability
issues in the current slab management code. This is one of the patches to
address the issue.
Moving synchronize_sched() out of slab_mutex isn't enough as it's still
inside cgroup_mutex. The whole deactivation / release path will be
updated to avoid all synchronous RCU operations. Revert this insufficient
optimization in preparation to ease future changes.
Link: http://lkml.kernel.org/r/20170117235411.9408-2-tj@kernel.org
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Jay Vana <jsvana@fb.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-02-22 23:41:08 +00:00
|
|
|
ret = __kmem_cache_shrink(cachep, false);
|
slab: get_online_mems for kmem_cache_{create,destroy,shrink}
When we create a sl[au]b cache, we allocate kmem_cache_node structures
for each online NUMA node. To handle nodes taken online/offline, we
register memory hotplug notifier and allocate/free kmem_cache_node
corresponding to the node that changes its state for each kmem cache.
To synchronize between the two paths we hold the slab_mutex during both
the cache creationg/destruction path and while tuning per-node parts of
kmem caches in memory hotplug handler, but that's not quite right,
because it does not guarantee that a newly created cache will have all
kmem_cache_nodes initialized in case it races with memory hotplug. For
instance, in case of slub:
CPU0 CPU1
---- ----
kmem_cache_create: online_pages:
__kmem_cache_create: slab_memory_callback:
slab_mem_going_online_callback:
lock slab_mutex
for each slab_caches list entry
allocate kmem_cache node
unlock slab_mutex
lock slab_mutex
init_kmem_cache_nodes:
for_each_node_state(node, N_NORMAL_MEMORY)
allocate kmem_cache node
add kmem_cache to slab_caches list
unlock slab_mutex
online_pages (continued):
node_states_set_node
As a result we'll get a kmem cache with not all kmem_cache_nodes
allocated.
To avoid issues like that we should hold get/put_online_mems() during
the whole kmem cache creation/destruction/shrink paths, just like we
deal with cpu hotplug. This patch does the trick.
Note, that after it's applied, there is no need in taking the slab_mutex
for kmem_cache_shrink any more, so it is removed from there.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Tang Chen <tangchen@cn.fujitsu.com>
Cc: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Cc: Toshi Kani <toshi.kani@hp.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Jiang Liu <liuj97@gmail.com>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-06-04 23:07:20 +00:00
|
|
|
put_online_mems();
|
|
|
|
put_online_cpus();
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(kmem_cache_shrink);
|
|
|
|
|
2015-11-06 02:44:59 +00:00
|
|
|
bool slab_is_available(void)
|
2012-07-06 20:25:11 +00:00
|
|
|
{
|
|
|
|
return slab_state >= UP;
|
|
|
|
}
|
2012-10-19 14:20:25 +00:00
|
|
|
|
2012-11-28 16:23:07 +00:00
|
|
|
#ifndef CONFIG_SLOB
|
|
|
|
/* Create a cache during boot when no slab services are available yet */
|
|
|
|
void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t size,
|
|
|
|
unsigned long flags)
|
|
|
|
{
|
|
|
|
int err;
|
|
|
|
|
|
|
|
s->name = name;
|
|
|
|
s->size = s->object_size = size;
|
2012-11-28 16:23:16 +00:00
|
|
|
s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size);
|
2015-02-12 22:59:20 +00:00
|
|
|
|
|
|
|
slab_init_memcg_params(s);
|
|
|
|
|
2012-11-28 16:23:07 +00:00
|
|
|
err = __kmem_cache_create(s, flags);
|
|
|
|
|
|
|
|
if (err)
|
2013-01-10 19:00:53 +00:00
|
|
|
panic("Creation of kmalloc slab %s size=%zu failed. Reason %d\n",
|
2012-11-28 16:23:07 +00:00
|
|
|
name, size, err);
|
|
|
|
|
|
|
|
s->refcount = -1; /* Exempt from merging for now */
|
|
|
|
}
|
|
|
|
|
|
|
|
struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size,
|
|
|
|
unsigned long flags)
|
|
|
|
{
|
|
|
|
struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
|
|
|
|
|
|
|
|
if (!s)
|
|
|
|
panic("Out of memory when creating slab %s\n", name);
|
|
|
|
|
|
|
|
create_boot_cache(s, name, size, flags);
|
|
|
|
list_add(&s->list, &slab_caches);
|
|
|
|
s->refcount = 1;
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2013-01-10 19:12:17 +00:00
|
|
|
struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1];
|
|
|
|
EXPORT_SYMBOL(kmalloc_caches);
|
|
|
|
|
|
|
|
#ifdef CONFIG_ZONE_DMA
|
|
|
|
struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1];
|
|
|
|
EXPORT_SYMBOL(kmalloc_dma_caches);
|
|
|
|
#endif
|
|
|
|
|
2013-01-10 19:14:19 +00:00
|
|
|
/*
|
|
|
|
* Conversion table for small slabs sizes / 8 to the index in the
|
|
|
|
* kmalloc array. This is necessary for slabs < 192 since we have non power
|
|
|
|
* of two cache sizes there. The size of larger slabs can be determined using
|
|
|
|
* fls.
|
|
|
|
*/
|
|
|
|
static s8 size_index[24] = {
|
|
|
|
3, /* 8 */
|
|
|
|
4, /* 16 */
|
|
|
|
5, /* 24 */
|
|
|
|
5, /* 32 */
|
|
|
|
6, /* 40 */
|
|
|
|
6, /* 48 */
|
|
|
|
6, /* 56 */
|
|
|
|
6, /* 64 */
|
|
|
|
1, /* 72 */
|
|
|
|
1, /* 80 */
|
|
|
|
1, /* 88 */
|
|
|
|
1, /* 96 */
|
|
|
|
7, /* 104 */
|
|
|
|
7, /* 112 */
|
|
|
|
7, /* 120 */
|
|
|
|
7, /* 128 */
|
|
|
|
2, /* 136 */
|
|
|
|
2, /* 144 */
|
|
|
|
2, /* 152 */
|
|
|
|
2, /* 160 */
|
|
|
|
2, /* 168 */
|
|
|
|
2, /* 176 */
|
|
|
|
2, /* 184 */
|
|
|
|
2 /* 192 */
|
|
|
|
};
|
|
|
|
|
|
|
|
static inline int size_index_elem(size_t bytes)
|
|
|
|
{
|
|
|
|
return (bytes - 1) / 8;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Find the kmem_cache structure that serves a given size of
|
|
|
|
* allocation
|
|
|
|
*/
|
|
|
|
struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
|
|
|
|
{
|
|
|
|
int index;
|
|
|
|
|
2013-08-02 02:02:42 +00:00
|
|
|
if (unlikely(size > KMALLOC_MAX_SIZE)) {
|
2013-06-10 19:18:00 +00:00
|
|
|
WARN_ON_ONCE(!(flags & __GFP_NOWARN));
|
2013-05-03 15:43:18 +00:00
|
|
|
return NULL;
|
2013-06-10 19:18:00 +00:00
|
|
|
}
|
2013-05-03 15:43:18 +00:00
|
|
|
|
2013-01-10 19:14:19 +00:00
|
|
|
if (size <= 192) {
|
|
|
|
if (!size)
|
|
|
|
return ZERO_SIZE_PTR;
|
|
|
|
|
|
|
|
index = size_index[size_index_elem(size)];
|
|
|
|
} else
|
|
|
|
index = fls(size - 1);
|
|
|
|
|
|
|
|
#ifdef CONFIG_ZONE_DMA
|
2013-02-04 14:46:46 +00:00
|
|
|
if (unlikely((flags & GFP_DMA)))
|
2013-01-10 19:14:19 +00:00
|
|
|
return kmalloc_dma_caches[index];
|
|
|
|
|
|
|
|
#endif
|
|
|
|
return kmalloc_caches[index];
|
|
|
|
}
|
|
|
|
|
2015-06-24 23:55:54 +00:00
|
|
|
/*
|
|
|
|
* kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time.
|
|
|
|
* kmalloc_index() supports up to 2^26=64MB, so the final entry of the table is
|
|
|
|
* kmalloc-67108864.
|
|
|
|
*/
|
2017-02-22 23:41:05 +00:00
|
|
|
const struct kmalloc_info_struct kmalloc_info[] __initconst = {
|
2015-06-24 23:55:54 +00:00
|
|
|
{NULL, 0}, {"kmalloc-96", 96},
|
|
|
|
{"kmalloc-192", 192}, {"kmalloc-8", 8},
|
|
|
|
{"kmalloc-16", 16}, {"kmalloc-32", 32},
|
|
|
|
{"kmalloc-64", 64}, {"kmalloc-128", 128},
|
|
|
|
{"kmalloc-256", 256}, {"kmalloc-512", 512},
|
|
|
|
{"kmalloc-1024", 1024}, {"kmalloc-2048", 2048},
|
|
|
|
{"kmalloc-4096", 4096}, {"kmalloc-8192", 8192},
|
|
|
|
{"kmalloc-16384", 16384}, {"kmalloc-32768", 32768},
|
|
|
|
{"kmalloc-65536", 65536}, {"kmalloc-131072", 131072},
|
|
|
|
{"kmalloc-262144", 262144}, {"kmalloc-524288", 524288},
|
|
|
|
{"kmalloc-1048576", 1048576}, {"kmalloc-2097152", 2097152},
|
|
|
|
{"kmalloc-4194304", 4194304}, {"kmalloc-8388608", 8388608},
|
|
|
|
{"kmalloc-16777216", 16777216}, {"kmalloc-33554432", 33554432},
|
|
|
|
{"kmalloc-67108864", 67108864}
|
|
|
|
};
|
|
|
|
|
2013-01-10 19:12:17 +00:00
|
|
|
/*
|
2015-06-24 23:55:57 +00:00
|
|
|
* Patch up the size_index table if we have strange large alignment
|
|
|
|
* requirements for the kmalloc array. This is only the case for
|
|
|
|
* MIPS it seems. The standard arches will not generate any code here.
|
|
|
|
*
|
|
|
|
* Largest permitted alignment is 256 bytes due to the way we
|
|
|
|
* handle the index determination for the smaller caches.
|
|
|
|
*
|
|
|
|
* Make sure that nothing crazy happens if someone starts tinkering
|
|
|
|
* around with ARCH_KMALLOC_MINALIGN
|
2013-01-10 19:12:17 +00:00
|
|
|
*/
|
2015-06-24 23:55:57 +00:00
|
|
|
void __init setup_kmalloc_cache_index_table(void)
|
2013-01-10 19:12:17 +00:00
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
2013-01-10 19:14:19 +00:00
|
|
|
BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
|
|
|
|
(KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
|
|
|
|
|
|
|
|
for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) {
|
|
|
|
int elem = size_index_elem(i);
|
|
|
|
|
|
|
|
if (elem >= ARRAY_SIZE(size_index))
|
|
|
|
break;
|
|
|
|
size_index[elem] = KMALLOC_SHIFT_LOW;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (KMALLOC_MIN_SIZE >= 64) {
|
|
|
|
/*
|
|
|
|
* The 96 byte size cache is not used if the alignment
|
|
|
|
* is 64 byte.
|
|
|
|
*/
|
|
|
|
for (i = 64 + 8; i <= 96; i += 8)
|
|
|
|
size_index[size_index_elem(i)] = 7;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
if (KMALLOC_MIN_SIZE >= 128) {
|
|
|
|
/*
|
|
|
|
* The 192 byte sized cache is not used if the alignment
|
|
|
|
* is 128 byte. Redirect kmalloc to use the 256 byte cache
|
|
|
|
* instead.
|
|
|
|
*/
|
|
|
|
for (i = 128 + 8; i <= 192; i += 8)
|
|
|
|
size_index[size_index_elem(i)] = 8;
|
|
|
|
}
|
2015-06-24 23:55:57 +00:00
|
|
|
}
|
|
|
|
|
2015-06-30 14:01:11 +00:00
|
|
|
static void __init new_kmalloc_cache(int idx, unsigned long flags)
|
2015-06-29 14:28:08 +00:00
|
|
|
{
|
|
|
|
kmalloc_caches[idx] = create_kmalloc_cache(kmalloc_info[idx].name,
|
|
|
|
kmalloc_info[idx].size, flags);
|
|
|
|
}
|
|
|
|
|
2015-06-24 23:55:57 +00:00
|
|
|
/*
|
|
|
|
* Create the kmalloc array. Some of the regular kmalloc arrays
|
|
|
|
* may already have been created because they were needed to
|
|
|
|
* enable allocations for slab creation.
|
|
|
|
*/
|
|
|
|
void __init create_kmalloc_caches(unsigned long flags)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
2015-06-29 14:28:08 +00:00
|
|
|
for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
|
|
|
|
if (!kmalloc_caches[i])
|
|
|
|
new_kmalloc_cache(i, flags);
|
2013-01-10 19:12:17 +00:00
|
|
|
|
2013-05-08 19:56:28 +00:00
|
|
|
/*
|
2015-06-29 14:28:08 +00:00
|
|
|
* Caches that are not of the two-to-the-power-of size.
|
|
|
|
* These have to be created immediately after the
|
|
|
|
* earlier power of two caches
|
2013-05-08 19:56:28 +00:00
|
|
|
*/
|
2015-06-29 14:28:08 +00:00
|
|
|
if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6)
|
|
|
|
new_kmalloc_cache(1, flags);
|
|
|
|
if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7)
|
|
|
|
new_kmalloc_cache(2, flags);
|
2013-05-03 18:04:18 +00:00
|
|
|
}
|
|
|
|
|
2013-01-10 19:12:17 +00:00
|
|
|
/* Kmalloc array is now usable */
|
|
|
|
slab_state = UP;
|
|
|
|
|
|
|
|
#ifdef CONFIG_ZONE_DMA
|
|
|
|
for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
|
|
|
|
struct kmem_cache *s = kmalloc_caches[i];
|
|
|
|
|
|
|
|
if (s) {
|
|
|
|
int size = kmalloc_size(i);
|
|
|
|
char *n = kasprintf(GFP_NOWAIT,
|
|
|
|
"dma-kmalloc-%d", size);
|
|
|
|
|
|
|
|
BUG_ON(!n);
|
|
|
|
kmalloc_dma_caches[i] = create_kmalloc_cache(n,
|
|
|
|
size, SLAB_CACHE_DMA | flags);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
}
|
2012-11-28 16:23:07 +00:00
|
|
|
#endif /* !CONFIG_SLOB */
|
|
|
|
|
2014-06-04 23:07:04 +00:00
|
|
|
/*
|
|
|
|
* To avoid unnecessary overhead, we pass through large allocation requests
|
|
|
|
* directly to the page allocator. We use __GFP_COMP, because we will need to
|
|
|
|
* know the allocation order to free the pages properly in kfree.
|
|
|
|
*/
|
2014-06-04 23:06:39 +00:00
|
|
|
void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
|
|
|
|
{
|
|
|
|
void *ret;
|
|
|
|
struct page *page;
|
|
|
|
|
|
|
|
flags |= __GFP_COMP;
|
mm: charge/uncharge kmemcg from generic page allocator paths
Currently, to charge a non-slab allocation to kmemcg one has to use
alloc_kmem_pages helper with __GFP_ACCOUNT flag. A page allocated with
this helper should finally be freed using free_kmem_pages, otherwise it
won't be uncharged.
This API suits its current users fine, but it turns out to be impossible
to use along with page reference counting, i.e. when an allocation is
supposed to be freed with put_page, as it is the case with pipe or unix
socket buffers.
To overcome this limitation, this patch moves charging/uncharging to
generic page allocator paths, i.e. to __alloc_pages_nodemask and
free_pages_prepare, and zaps alloc/free_kmem_pages helpers. This way,
one can use any of the available page allocation functions to get the
allocated page charged to kmemcg - it's enough to pass __GFP_ACCOUNT,
just like in case of kmalloc and friends. A charged page will be
automatically uncharged on free.
To make it possible, we need to mark pages charged to kmemcg somehow.
To avoid introducing a new page flag, we make use of page->_mapcount for
marking such pages. Since pages charged to kmemcg are not supposed to
be mapped to userspace, it should work just fine. There are other
(ab)users of page->_mapcount - buddy and balloon pages - but we don't
conflict with them.
In case kmemcg is compiled out or not used at runtime, this patch
introduces no overhead to generic page allocator paths. If kmemcg is
used, it will be plus one gfp flags check on alloc and plus one
page->_mapcount check on free, which shouldn't hurt performance, because
the data accessed are hot.
Link: http://lkml.kernel.org/r/a9736d856f895bcb465d9f257b54efe32eda6f99.1464079538.git.vdavydov@virtuozzo.com
Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-07-26 22:24:24 +00:00
|
|
|
page = alloc_pages(flags, order);
|
2014-06-04 23:06:39 +00:00
|
|
|
ret = page ? page_address(page) : NULL;
|
|
|
|
kmemleak_alloc(ret, size, 1, flags);
|
2016-03-25 21:22:02 +00:00
|
|
|
kasan_kmalloc_large(ret, size, flags);
|
2014-06-04 23:06:39 +00:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(kmalloc_order);
|
|
|
|
|
2013-09-04 16:35:34 +00:00
|
|
|
#ifdef CONFIG_TRACING
|
|
|
|
void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
|
|
|
|
{
|
|
|
|
void *ret = kmalloc_order(size, flags, order);
|
|
|
|
trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(kmalloc_order_trace);
|
|
|
|
#endif
|
2012-11-28 16:23:07 +00:00
|
|
|
|
2016-07-26 22:21:56 +00:00
|
|
|
#ifdef CONFIG_SLAB_FREELIST_RANDOM
|
|
|
|
/* Randomize a generic freelist */
|
|
|
|
static void freelist_randomize(struct rnd_state *state, unsigned int *list,
|
|
|
|
size_t count)
|
|
|
|
{
|
|
|
|
size_t i;
|
|
|
|
unsigned int rand;
|
|
|
|
|
|
|
|
for (i = 0; i < count; i++)
|
|
|
|
list[i] = i;
|
|
|
|
|
|
|
|
/* Fisher-Yates shuffle */
|
|
|
|
for (i = count - 1; i > 0; i--) {
|
|
|
|
rand = prandom_u32_state(state);
|
|
|
|
rand %= (i + 1);
|
|
|
|
swap(list[i], list[rand]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Create a random sequence per cache */
|
|
|
|
int cache_random_seq_create(struct kmem_cache *cachep, unsigned int count,
|
|
|
|
gfp_t gfp)
|
|
|
|
{
|
|
|
|
struct rnd_state state;
|
|
|
|
|
|
|
|
if (count < 2 || cachep->random_seq)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
cachep->random_seq = kcalloc(count, sizeof(unsigned int), gfp);
|
|
|
|
if (!cachep->random_seq)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
/* Get best entropy at this stage of boot */
|
|
|
|
prandom_seed_state(&state, get_random_long());
|
|
|
|
|
|
|
|
freelist_randomize(&state, cachep->random_seq, count);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Destroy the per-cache random freelist sequence */
|
|
|
|
void cache_random_seq_destroy(struct kmem_cache *cachep)
|
|
|
|
{
|
|
|
|
kfree(cachep->random_seq);
|
|
|
|
cachep->random_seq = NULL;
|
|
|
|
}
|
|
|
|
#endif /* CONFIG_SLAB_FREELIST_RANDOM */
|
|
|
|
|
2012-10-19 14:20:25 +00:00
|
|
|
#ifdef CONFIG_SLABINFO
|
2013-07-04 00:33:24 +00:00
|
|
|
|
|
|
|
#ifdef CONFIG_SLAB
|
|
|
|
#define SLABINFO_RIGHTS (S_IWUSR | S_IRUSR)
|
|
|
|
#else
|
|
|
|
#define SLABINFO_RIGHTS S_IRUSR
|
|
|
|
#endif
|
|
|
|
|
2014-12-10 23:44:19 +00:00
|
|
|
static void print_slabinfo_header(struct seq_file *m)
|
2012-10-19 14:20:26 +00:00
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Output format version, so at least we can change it
|
|
|
|
* without _too_ many complaints.
|
|
|
|
*/
|
|
|
|
#ifdef CONFIG_DEBUG_SLAB
|
|
|
|
seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
|
|
|
|
#else
|
|
|
|
seq_puts(m, "slabinfo - version: 2.1\n");
|
|
|
|
#endif
|
2016-03-17 21:19:47 +00:00
|
|
|
seq_puts(m, "# name <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>");
|
2012-10-19 14:20:26 +00:00
|
|
|
seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
|
|
|
|
seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
|
|
|
|
#ifdef CONFIG_DEBUG_SLAB
|
2016-03-17 21:19:47 +00:00
|
|
|
seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> <error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
|
2012-10-19 14:20:26 +00:00
|
|
|
seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
|
|
|
|
#endif
|
|
|
|
seq_putc(m, '\n');
|
|
|
|
}
|
|
|
|
|
2014-12-10 23:42:16 +00:00
|
|
|
void *slab_start(struct seq_file *m, loff_t *pos)
|
2012-10-19 14:20:25 +00:00
|
|
|
{
|
|
|
|
mutex_lock(&slab_mutex);
|
|
|
|
return seq_list_start(&slab_caches, *pos);
|
|
|
|
}
|
|
|
|
|
2013-07-08 00:08:28 +00:00
|
|
|
void *slab_next(struct seq_file *m, void *p, loff_t *pos)
|
2012-10-19 14:20:25 +00:00
|
|
|
{
|
|
|
|
return seq_list_next(p, &slab_caches, pos);
|
|
|
|
}
|
|
|
|
|
2013-07-08 00:08:28 +00:00
|
|
|
void slab_stop(struct seq_file *m, void *p)
|
2012-10-19 14:20:25 +00:00
|
|
|
{
|
|
|
|
mutex_unlock(&slab_mutex);
|
|
|
|
}
|
|
|
|
|
2012-12-18 22:23:01 +00:00
|
|
|
static void
|
|
|
|
memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info)
|
|
|
|
{
|
|
|
|
struct kmem_cache *c;
|
|
|
|
struct slabinfo sinfo;
|
|
|
|
|
|
|
|
if (!is_root_cache(s))
|
|
|
|
return;
|
|
|
|
|
2015-02-12 22:59:23 +00:00
|
|
|
for_each_memcg_cache(c, s) {
|
2012-12-18 22:23:01 +00:00
|
|
|
memset(&sinfo, 0, sizeof(sinfo));
|
|
|
|
get_slabinfo(c, &sinfo);
|
|
|
|
|
|
|
|
info->active_slabs += sinfo.active_slabs;
|
|
|
|
info->num_slabs += sinfo.num_slabs;
|
|
|
|
info->shared_avail += sinfo.shared_avail;
|
|
|
|
info->active_objs += sinfo.active_objs;
|
|
|
|
info->num_objs += sinfo.num_objs;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-12-10 23:44:19 +00:00
|
|
|
static void cache_show(struct kmem_cache *s, struct seq_file *m)
|
2012-10-19 14:20:25 +00:00
|
|
|
{
|
2012-10-19 14:20:27 +00:00
|
|
|
struct slabinfo sinfo;
|
|
|
|
|
|
|
|
memset(&sinfo, 0, sizeof(sinfo));
|
|
|
|
get_slabinfo(s, &sinfo);
|
|
|
|
|
2012-12-18 22:23:01 +00:00
|
|
|
memcg_accumulate_slabinfo(s, &sinfo);
|
|
|
|
|
2012-10-19 14:20:27 +00:00
|
|
|
seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
|
2012-12-18 22:23:01 +00:00
|
|
|
cache_name(s), sinfo.active_objs, sinfo.num_objs, s->size,
|
2012-10-19 14:20:27 +00:00
|
|
|
sinfo.objects_per_slab, (1 << sinfo.cache_order));
|
|
|
|
|
|
|
|
seq_printf(m, " : tunables %4u %4u %4u",
|
|
|
|
sinfo.limit, sinfo.batchcount, sinfo.shared);
|
|
|
|
seq_printf(m, " : slabdata %6lu %6lu %6lu",
|
|
|
|
sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail);
|
|
|
|
slabinfo_show_stats(m, s);
|
|
|
|
seq_putc(m, '\n');
|
2012-10-19 14:20:25 +00:00
|
|
|
}
|
|
|
|
|
2014-12-10 23:42:16 +00:00
|
|
|
static int slab_show(struct seq_file *m, void *p)
|
2012-12-18 22:23:01 +00:00
|
|
|
{
|
|
|
|
struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
|
|
|
|
|
2014-12-10 23:42:16 +00:00
|
|
|
if (p == slab_caches.next)
|
|
|
|
print_slabinfo_header(m);
|
2014-12-10 23:44:19 +00:00
|
|
|
if (is_root_cache(s))
|
|
|
|
cache_show(s, m);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2016-01-20 23:02:32 +00:00
|
|
|
#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
|
2014-12-10 23:44:19 +00:00
|
|
|
int memcg_slab_show(struct seq_file *m, void *p)
|
|
|
|
{
|
|
|
|
struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
|
|
|
|
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
|
|
|
|
|
|
|
|
if (p == slab_caches.next)
|
|
|
|
print_slabinfo_header(m);
|
2015-02-12 22:59:20 +00:00
|
|
|
if (!is_root_cache(s) && s->memcg_params.memcg == memcg)
|
2014-12-10 23:44:19 +00:00
|
|
|
cache_show(s, m);
|
|
|
|
return 0;
|
2012-12-18 22:23:01 +00:00
|
|
|
}
|
2014-12-10 23:44:19 +00:00
|
|
|
#endif
|
2012-12-18 22:23:01 +00:00
|
|
|
|
2012-10-19 14:20:25 +00:00
|
|
|
/*
|
|
|
|
* slabinfo_op - iterator that generates /proc/slabinfo
|
|
|
|
*
|
|
|
|
* Output layout:
|
|
|
|
* cache-name
|
|
|
|
* num-active-objs
|
|
|
|
* total-objs
|
|
|
|
* object size
|
|
|
|
* num-active-slabs
|
|
|
|
* total-slabs
|
|
|
|
* num-pages-per-slab
|
|
|
|
* + further values on SMP and with statistics enabled
|
|
|
|
*/
|
|
|
|
static const struct seq_operations slabinfo_op = {
|
2014-12-10 23:42:16 +00:00
|
|
|
.start = slab_start,
|
2013-07-08 00:08:28 +00:00
|
|
|
.next = slab_next,
|
|
|
|
.stop = slab_stop,
|
2014-12-10 23:42:16 +00:00
|
|
|
.show = slab_show,
|
2012-10-19 14:20:25 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
static int slabinfo_open(struct inode *inode, struct file *file)
|
|
|
|
{
|
|
|
|
return seq_open(file, &slabinfo_op);
|
|
|
|
}
|
|
|
|
|
|
|
|
static const struct file_operations proc_slabinfo_operations = {
|
|
|
|
.open = slabinfo_open,
|
|
|
|
.read = seq_read,
|
|
|
|
.write = slabinfo_write,
|
|
|
|
.llseek = seq_lseek,
|
|
|
|
.release = seq_release,
|
|
|
|
};
|
|
|
|
|
|
|
|
static int __init slab_proc_init(void)
|
|
|
|
{
|
2013-07-04 00:33:24 +00:00
|
|
|
proc_create("slabinfo", SLABINFO_RIGHTS, NULL,
|
|
|
|
&proc_slabinfo_operations);
|
2012-10-19 14:20:25 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
module_init(slab_proc_init);
|
|
|
|
#endif /* CONFIG_SLABINFO */
|
2014-08-06 23:04:44 +00:00
|
|
|
|
|
|
|
static __always_inline void *__do_krealloc(const void *p, size_t new_size,
|
|
|
|
gfp_t flags)
|
|
|
|
{
|
|
|
|
void *ret;
|
|
|
|
size_t ks = 0;
|
|
|
|
|
|
|
|
if (p)
|
|
|
|
ks = ksize(p);
|
|
|
|
|
2015-02-13 22:39:42 +00:00
|
|
|
if (ks >= new_size) {
|
2016-03-25 21:22:02 +00:00
|
|
|
kasan_krealloc((void *)p, new_size, flags);
|
2014-08-06 23:04:44 +00:00
|
|
|
return (void *)p;
|
2015-02-13 22:39:42 +00:00
|
|
|
}
|
2014-08-06 23:04:44 +00:00
|
|
|
|
|
|
|
ret = kmalloc_track_caller(new_size, flags);
|
|
|
|
if (ret && p)
|
|
|
|
memcpy(ret, p, ks);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* __krealloc - like krealloc() but don't free @p.
|
|
|
|
* @p: object to reallocate memory for.
|
|
|
|
* @new_size: how many bytes of memory are required.
|
|
|
|
* @flags: the type of memory to allocate.
|
|
|
|
*
|
|
|
|
* This function is like krealloc() except it never frees the originally
|
|
|
|
* allocated buffer. Use this if you don't want to free the buffer immediately
|
|
|
|
* like, for example, with RCU.
|
|
|
|
*/
|
|
|
|
void *__krealloc(const void *p, size_t new_size, gfp_t flags)
|
|
|
|
{
|
|
|
|
if (unlikely(!new_size))
|
|
|
|
return ZERO_SIZE_PTR;
|
|
|
|
|
|
|
|
return __do_krealloc(p, new_size, flags);
|
|
|
|
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(__krealloc);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* krealloc - reallocate memory. The contents will remain unchanged.
|
|
|
|
* @p: object to reallocate memory for.
|
|
|
|
* @new_size: how many bytes of memory are required.
|
|
|
|
* @flags: the type of memory to allocate.
|
|
|
|
*
|
|
|
|
* The contents of the object pointed to are preserved up to the
|
|
|
|
* lesser of the new and old sizes. If @p is %NULL, krealloc()
|
|
|
|
* behaves exactly like kmalloc(). If @new_size is 0 and @p is not a
|
|
|
|
* %NULL pointer, the object pointed to is freed.
|
|
|
|
*/
|
|
|
|
void *krealloc(const void *p, size_t new_size, gfp_t flags)
|
|
|
|
{
|
|
|
|
void *ret;
|
|
|
|
|
|
|
|
if (unlikely(!new_size)) {
|
|
|
|
kfree(p);
|
|
|
|
return ZERO_SIZE_PTR;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = __do_krealloc(p, new_size, flags);
|
|
|
|
if (ret && p != ret)
|
|
|
|
kfree(p);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(krealloc);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* kzfree - like kfree but zero memory
|
|
|
|
* @p: object to free memory of
|
|
|
|
*
|
|
|
|
* The memory of the object @p points to is zeroed before freed.
|
|
|
|
* If @p is %NULL, kzfree() does nothing.
|
|
|
|
*
|
|
|
|
* Note: this function zeroes the whole allocated buffer which can be a good
|
|
|
|
* deal bigger than the requested buffer size passed to kmalloc(). So be
|
|
|
|
* careful when using this function in performance sensitive code.
|
|
|
|
*/
|
|
|
|
void kzfree(const void *p)
|
|
|
|
{
|
|
|
|
size_t ks;
|
|
|
|
void *mem = (void *)p;
|
|
|
|
|
|
|
|
if (unlikely(ZERO_OR_NULL_PTR(mem)))
|
|
|
|
return;
|
|
|
|
ks = ksize(mem);
|
|
|
|
memset(mem, 0, ks);
|
|
|
|
kfree(mem);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(kzfree);
|
|
|
|
|
|
|
|
/* Tracepoints definitions. */
|
|
|
|
EXPORT_TRACEPOINT_SYMBOL(kmalloc);
|
|
|
|
EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
|
|
|
|
EXPORT_TRACEPOINT_SYMBOL(kmalloc_node);
|
|
|
|
EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node);
|
|
|
|
EXPORT_TRACEPOINT_SYMBOL(kfree);
|
|
|
|
EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
|