mm: introduce slabobj_ext to support slab object extensions

Currently slab pages can store only vectors of obj_cgroup pointers in
page->memcg_data.  Introduce slabobj_ext structure to allow more data to
be stored for each slab object.  Wrap obj_cgroup into slabobj_ext to
support current functionality while allowing to extend slabobj_ext in the
future.

Link: https://lkml.kernel.org/r/20240321163705.3067592-7-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Tested-by: Kees Cook <keescook@chromium.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alex Gaynor <alex.gaynor@gmail.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andreas Hindborg <a.hindborg@samsung.com>
Cc: Benno Lossin <benno.lossin@proton.me>
Cc: "Björn Roy Baron" <bjorn3_gh@protonmail.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Gary Guo <gary@garyguo.net>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Wedson Almeida Filho <wedsonaf@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
This commit is contained in:
Suren Baghdasaryan 2024-03-21 09:36:28 -07:00 committed by Andrew Morton
parent a5674119f0
commit 21c690a349
9 changed files with 145 additions and 112 deletions

View File

@ -349,8 +349,8 @@ struct mem_cgroup {
extern struct mem_cgroup *root_mem_cgroup;
enum page_memcg_data_flags {
/* page->memcg_data is a pointer to an objcgs vector */
MEMCG_DATA_OBJCGS = (1UL << 0),
/* page->memcg_data is a pointer to an slabobj_ext vector */
MEMCG_DATA_OBJEXTS = (1UL << 0),
/* page has been accounted as a non-slab kernel page */
MEMCG_DATA_KMEM = (1UL << 1),
/* the next bit after the last actual flag */
@ -388,7 +388,7 @@ static inline struct mem_cgroup *__folio_memcg(struct folio *folio)
unsigned long memcg_data = folio->memcg_data;
VM_BUG_ON_FOLIO(folio_test_slab(folio), folio);
VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJCGS, folio);
VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio);
VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_KMEM, folio);
return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
@ -409,7 +409,7 @@ static inline struct obj_cgroup *__folio_objcg(struct folio *folio)
unsigned long memcg_data = folio->memcg_data;
VM_BUG_ON_FOLIO(folio_test_slab(folio), folio);
VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJCGS, folio);
VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio);
VM_BUG_ON_FOLIO(!(memcg_data & MEMCG_DATA_KMEM), folio);
return (struct obj_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
@ -506,7 +506,7 @@ static inline struct mem_cgroup *folio_memcg_check(struct folio *folio)
*/
unsigned long memcg_data = READ_ONCE(folio->memcg_data);
if (memcg_data & MEMCG_DATA_OBJCGS)
if (memcg_data & MEMCG_DATA_OBJEXTS)
return NULL;
if (memcg_data & MEMCG_DATA_KMEM) {
@ -552,7 +552,7 @@ static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *ob
static inline bool folio_memcg_kmem(struct folio *folio)
{
VM_BUG_ON_PGFLAGS(PageTail(&folio->page), &folio->page);
VM_BUG_ON_FOLIO(folio->memcg_data & MEMCG_DATA_OBJCGS, folio);
VM_BUG_ON_FOLIO(folio->memcg_data & MEMCG_DATA_OBJEXTS, folio);
return folio->memcg_data & MEMCG_DATA_KMEM;
}
@ -1633,6 +1633,14 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
}
#endif /* CONFIG_MEMCG */
/*
* Extended information for slab objects stored as an array in page->memcg_data
* if MEMCG_DATA_OBJEXTS is set.
*/
struct slabobj_ext {
struct obj_cgroup *objcg;
} __aligned(8);
static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx)
{
__mod_lruvec_kmem_state(p, idx, 1);

View File

@ -169,7 +169,7 @@ struct page {
/* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */
atomic_t _refcount;
#ifdef CONFIG_MEMCG
#ifdef CONFIG_SLAB_OBJ_EXT
unsigned long memcg_data;
#endif
@ -331,7 +331,7 @@ struct folio {
};
atomic_t _mapcount;
atomic_t _refcount;
#ifdef CONFIG_MEMCG
#ifdef CONFIG_SLAB_OBJ_EXT
unsigned long memcg_data;
#endif
#if defined(WANT_PAGE_VIRTUAL)

View File

@ -929,6 +929,9 @@ config NUMA_BALANCING_DEFAULT_ENABLED
If set, automatic NUMA balancing will be enabled if running on a NUMA
machine.
config SLAB_OBJ_EXT
bool
menuconfig CGROUPS
bool "Control Group support"
select KERNFS
@ -962,6 +965,7 @@ config MEMCG
bool "Memory controller"
select PAGE_COUNTER
select EVENTFD
select SLAB_OBJ_EXT
help
Provides control over the memory footprint of tasks in a cgroup.

View File

@ -595,9 +595,9 @@ static unsigned long kfence_init_pool(void)
continue;
__folio_set_slab(slab_folio(slab));
#ifdef CONFIG_MEMCG
slab->memcg_data = (unsigned long)&kfence_metadata_init[i / 2 - 1].objcg |
MEMCG_DATA_OBJCGS;
#ifdef CONFIG_MEMCG_KMEM
slab->obj_exts = (unsigned long)&kfence_metadata_init[i / 2 - 1].obj_exts |
MEMCG_DATA_OBJEXTS;
#endif
}
@ -645,8 +645,8 @@ static unsigned long kfence_init_pool(void)
if (!i || (i % 2))
continue;
#ifdef CONFIG_MEMCG
slab->memcg_data = 0;
#ifdef CONFIG_MEMCG_KMEM
slab->obj_exts = 0;
#endif
__folio_clear_slab(slab_folio(slab));
}
@ -1139,8 +1139,8 @@ void __kfence_free(void *addr)
{
struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr);
#ifdef CONFIG_MEMCG
KFENCE_WARN_ON(meta->objcg);
#ifdef CONFIG_MEMCG_KMEM
KFENCE_WARN_ON(meta->obj_exts.objcg);
#endif
/*
* If the objects of the cache are SLAB_TYPESAFE_BY_RCU, defer freeing

View File

@ -97,8 +97,8 @@ struct kfence_metadata {
struct kfence_track free_track;
/* For updating alloc_covered on frees. */
u32 alloc_stack_hash;
#ifdef CONFIG_MEMCG
struct obj_cgroup *objcg;
#ifdef CONFIG_MEMCG_KMEM
struct slabobj_ext obj_exts;
#endif
};

View File

@ -2977,13 +2977,6 @@ void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
}
#ifdef CONFIG_MEMCG_KMEM
/*
* The allocated objcg pointers array is not accounted directly.
* Moreover, it should not come from DMA buffer and is not readily
* reclaimable. So those GFP bits should be masked off.
*/
#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | \
__GFP_ACCOUNT | __GFP_NOFAIL)
/*
* mod_objcg_mlstate() may be called with irq enabled, so
@ -3003,62 +2996,27 @@ static inline void mod_objcg_mlstate(struct obj_cgroup *objcg,
rcu_read_unlock();
}
int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s,
gfp_t gfp, bool new_slab)
{
unsigned int objects = objs_per_slab(s, slab);
unsigned long memcg_data;
void *vec;
gfp &= ~OBJCGS_CLEAR_MASK;
vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
slab_nid(slab));
if (!vec)
return -ENOMEM;
memcg_data = (unsigned long) vec | MEMCG_DATA_OBJCGS;
if (new_slab) {
/*
* If the slab is brand new and nobody can yet access its
* memcg_data, no synchronization is required and memcg_data can
* be simply assigned.
*/
slab->memcg_data = memcg_data;
} else if (cmpxchg(&slab->memcg_data, 0, memcg_data)) {
/*
* If the slab is already in use, somebody can allocate and
* assign obj_cgroups in parallel. In this case the existing
* objcg vector should be reused.
*/
kfree(vec);
return 0;
}
kmemleak_not_leak(vec);
return 0;
}
static __always_inline
struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p)
{
/*
* Slab objects are accounted individually, not per-page.
* Memcg membership data for each individual object is saved in
* slab->memcg_data.
* slab->obj_exts.
*/
if (folio_test_slab(folio)) {
struct obj_cgroup **objcgs;
struct slabobj_ext *obj_exts;
struct slab *slab;
unsigned int off;
slab = folio_slab(folio);
objcgs = slab_objcgs(slab);
if (!objcgs)
obj_exts = slab_obj_exts(slab);
if (!obj_exts)
return NULL;
off = obj_to_index(slab->slab_cache, slab, p);
if (objcgs[off])
return obj_cgroup_memcg(objcgs[off]);
if (obj_exts[off].objcg)
return obj_cgroup_memcg(obj_exts[off].objcg);
return NULL;
}
@ -3066,7 +3024,7 @@ struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p)
/*
* folio_memcg_check() is used here, because in theory we can encounter
* a folio where the slab flag has been cleared already, but
* slab->memcg_data has not been freed yet
* slab->obj_exts has not been freed yet
* folio_memcg_check() will guarantee that a proper memory
* cgroup pointer or NULL will be returned.
*/

View File

@ -515,7 +515,7 @@ static inline int print_page_owner_memcg(char *kbuf, size_t count, int ret,
if (!memcg_data)
goto out_unlock;
if (memcg_data & MEMCG_DATA_OBJCGS)
if (memcg_data & MEMCG_DATA_OBJEXTS)
ret += scnprintf(kbuf + ret, count - ret,
"Slab cache page\n");

View File

@ -87,8 +87,8 @@ struct slab {
unsigned int __unused;
atomic_t __page_refcount;
#ifdef CONFIG_MEMCG
unsigned long memcg_data;
#ifdef CONFIG_SLAB_OBJ_EXT
unsigned long obj_exts;
#endif
};
@ -97,8 +97,8 @@ struct slab {
SLAB_MATCH(flags, __page_flags);
SLAB_MATCH(compound_head, slab_cache); /* Ensure bit 0 is clear */
SLAB_MATCH(_refcount, __page_refcount);
#ifdef CONFIG_MEMCG
SLAB_MATCH(memcg_data, memcg_data);
#ifdef CONFIG_SLAB_OBJ_EXT
SLAB_MATCH(memcg_data, obj_exts);
#endif
#undef SLAB_MATCH
static_assert(sizeof(struct slab) <= sizeof(struct page));
@ -536,42 +536,44 @@ static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t fla
return false;
}
#ifdef CONFIG_MEMCG_KMEM
#ifdef CONFIG_SLAB_OBJ_EXT
/*
* slab_objcgs - get the object cgroups vector associated with a slab
* slab_obj_exts - get the pointer to the slab object extension vector
* associated with a slab.
* @slab: a pointer to the slab struct
*
* Returns a pointer to the object cgroups vector associated with the slab,
* Returns a pointer to the object extension vector associated with the slab,
* or NULL if no such vector has been associated yet.
*/
static inline struct obj_cgroup **slab_objcgs(struct slab *slab)
static inline struct slabobj_ext *slab_obj_exts(struct slab *slab)
{
unsigned long memcg_data = READ_ONCE(slab->memcg_data);
unsigned long obj_exts = READ_ONCE(slab->obj_exts);
VM_BUG_ON_PAGE(memcg_data && !(memcg_data & MEMCG_DATA_OBJCGS),
#ifdef CONFIG_MEMCG
VM_BUG_ON_PAGE(obj_exts && !(obj_exts & MEMCG_DATA_OBJEXTS),
slab_page(slab));
VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, slab_page(slab));
VM_BUG_ON_PAGE(obj_exts & MEMCG_DATA_KMEM, slab_page(slab));
return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
return (struct slabobj_ext *)(obj_exts & ~MEMCG_DATA_FLAGS_MASK);
#else
return (struct slabobj_ext *)obj_exts;
#endif
}
int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s,
gfp_t gfp, bool new_slab);
void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
enum node_stat_item idx, int nr);
#else /* CONFIG_MEMCG_KMEM */
static inline struct obj_cgroup **slab_objcgs(struct slab *slab)
#else /* CONFIG_SLAB_OBJ_EXT */
static inline struct slabobj_ext *slab_obj_exts(struct slab *slab)
{
return NULL;
}
static inline int memcg_alloc_slab_cgroups(struct slab *slab,
struct kmem_cache *s, gfp_t gfp,
bool new_slab)
{
return 0;
}
#endif /* CONFIG_MEMCG_KMEM */
#endif /* CONFIG_SLAB_OBJ_EXT */
#ifdef CONFIG_MEMCG_KMEM
void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
enum node_stat_item idx, int nr);
#endif
size_t __ksize(const void *objp);

101
mm/slub.c
View File

@ -1871,13 +1871,78 @@ static inline enum node_stat_item cache_vmstat_idx(struct kmem_cache *s)
NR_SLAB_RECLAIMABLE_B : NR_SLAB_UNRECLAIMABLE_B;
}
#ifdef CONFIG_MEMCG_KMEM
static inline void memcg_free_slab_cgroups(struct slab *slab)
#ifdef CONFIG_SLAB_OBJ_EXT
/*
* The allocated objcg pointers array is not accounted directly.
* Moreover, it should not come from DMA buffer and is not readily
* reclaimable. So those GFP bits should be masked off.
*/
#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | \
__GFP_ACCOUNT | __GFP_NOFAIL)
static int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
gfp_t gfp, bool new_slab)
{
kfree(slab_objcgs(slab));
slab->memcg_data = 0;
unsigned int objects = objs_per_slab(s, slab);
unsigned long obj_exts;
void *vec;
gfp &= ~OBJCGS_CLEAR_MASK;
vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp,
slab_nid(slab));
if (!vec)
return -ENOMEM;
obj_exts = (unsigned long)vec;
#ifdef CONFIG_MEMCG
obj_exts |= MEMCG_DATA_OBJEXTS;
#endif
if (new_slab) {
/*
* If the slab is brand new and nobody can yet access its
* obj_exts, no synchronization is required and obj_exts can
* be simply assigned.
*/
slab->obj_exts = obj_exts;
} else if (cmpxchg(&slab->obj_exts, 0, obj_exts)) {
/*
* If the slab is already in use, somebody can allocate and
* assign slabobj_exts in parallel. In this case the existing
* objcg vector should be reused.
*/
kfree(vec);
return 0;
}
kmemleak_not_leak(vec);
return 0;
}
static inline void free_slab_obj_exts(struct slab *slab)
{
struct slabobj_ext *obj_exts;
obj_exts = slab_obj_exts(slab);
if (!obj_exts)
return;
kfree(obj_exts);
slab->obj_exts = 0;
}
#else /* CONFIG_SLAB_OBJ_EXT */
static int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
gfp_t gfp, bool new_slab)
{
return 0;
}
static inline void free_slab_obj_exts(struct slab *slab)
{
}
#endif /* CONFIG_SLAB_OBJ_EXT */
#ifdef CONFIG_MEMCG_KMEM
static inline size_t obj_full_size(struct kmem_cache *s)
{
/*
@ -1956,15 +2021,15 @@ static void __memcg_slab_post_alloc_hook(struct kmem_cache *s,
if (likely(p[i])) {
slab = virt_to_slab(p[i]);
if (!slab_objcgs(slab) &&
memcg_alloc_slab_cgroups(slab, s, flags, false)) {
if (!slab_obj_exts(slab) &&
alloc_slab_obj_exts(slab, s, flags, false)) {
obj_cgroup_uncharge(objcg, obj_full_size(s));
continue;
}
off = obj_to_index(s, slab, p[i]);
obj_cgroup_get(objcg);
slab_objcgs(slab)[off] = objcg;
slab_obj_exts(slab)[off].objcg = objcg;
mod_objcg_state(objcg, slab_pgdat(slab),
cache_vmstat_idx(s), obj_full_size(s));
} else {
@ -1985,18 +2050,18 @@ void memcg_slab_post_alloc_hook(struct kmem_cache *s, struct obj_cgroup *objcg,
static void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
void **p, int objects,
struct obj_cgroup **objcgs)
struct slabobj_ext *obj_exts)
{
for (int i = 0; i < objects; i++) {
struct obj_cgroup *objcg;
unsigned int off;
off = obj_to_index(s, slab, p[i]);
objcg = objcgs[off];
objcg = obj_exts[off].objcg;
if (!objcg)
continue;
objcgs[off] = NULL;
obj_exts[off].objcg = NULL;
obj_cgroup_uncharge(objcg, obj_full_size(s));
mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s),
-obj_full_size(s));
@ -2008,16 +2073,16 @@ static __fastpath_inline
void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
int objects)
{
struct obj_cgroup **objcgs;
struct slabobj_ext *obj_exts;
if (!memcg_kmem_online())
return;
objcgs = slab_objcgs(slab);
if (likely(!objcgs))
obj_exts = slab_obj_exts(slab);
if (likely(!obj_exts))
return;
__memcg_slab_free_hook(s, slab, p, objects, objcgs);
__memcg_slab_free_hook(s, slab, p, objects, obj_exts);
}
static inline
@ -2028,10 +2093,6 @@ void memcg_slab_alloc_error_hook(struct kmem_cache *s, int objects,
obj_cgroup_uncharge(objcg, objects * obj_full_size(s));
}
#else /* CONFIG_MEMCG_KMEM */
static inline void memcg_free_slab_cgroups(struct slab *slab)
{
}
static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s,
struct list_lru *lru,
struct obj_cgroup **objcgp,
@ -2298,7 +2359,7 @@ static __always_inline void account_slab(struct slab *slab, int order,
struct kmem_cache *s, gfp_t gfp)
{
if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT))
memcg_alloc_slab_cgroups(slab, s, gfp, true);
alloc_slab_obj_exts(slab, s, gfp, true);
mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s),
PAGE_SIZE << order);
@ -2308,7 +2369,7 @@ static __always_inline void unaccount_slab(struct slab *slab, int order,
struct kmem_cache *s)
{
if (memcg_kmem_online())
memcg_free_slab_cgroups(slab);
free_slab_obj_exts(slab);
mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s),
-(PAGE_SIZE << order));