linux-stable/mm/shrinker.c
Qi Zheng 50d09da8e1 mm: shrinker: make memcg slab shrink lockless
Like the global slab shrink, this commit also uses the refcount+RCU method
to make the memcg slab shrink lockless.

Use the following script to do slab shrink stress test:

```

DIR="/root/shrinker/memcg/mnt"

do_create()
{
    mkdir -p /sys/fs/cgroup/memory/test
    echo 4G > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
    for i in `seq 0 $1`;
    do
        mkdir -p /sys/fs/cgroup/memory/test/$i;
        echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
        mkdir -p $DIR/$i;
    done
}

do_mount()
{
    for i in `seq $1 $2`;
    do
        mount -t tmpfs $i $DIR/$i;
    done
}

do_touch()
{
    for i in `seq $1 $2`;
    do
        echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
        dd if=/dev/zero of=$DIR/$i/file$i bs=1M count=1 &
    done
}

case "$1" in
  touch)
    do_touch $2 $3
    ;;
  test)
    do_create 4000
    do_mount 0 4000
    do_touch 0 3000
    ;;
  *)
    exit 1
    ;;
esac
```

Save the above script, then run it with the `test` argument followed by the
`touch` argument. Then we can use the following perf command to view hotspots:

perf top -U -F 999

1) Before applying this patchset:

  33.15%  [kernel]          [k] down_read_trylock
  25.38%  [kernel]          [k] shrink_slab
  21.75%  [kernel]          [k] up_read
   4.45%  [kernel]          [k] _find_next_bit
   2.27%  [kernel]          [k] do_shrink_slab
   1.80%  [kernel]          [k] intel_idle_irq
   1.79%  [kernel]          [k] shrink_lruvec
   0.67%  [kernel]          [k] xas_descend
   0.41%  [kernel]          [k] mem_cgroup_iter
   0.40%  [kernel]          [k] shrink_node
   0.38%  [kernel]          [k] list_lru_count_one

2) After applying this patchset:

  64.56%  [kernel]          [k] shrink_slab
  12.18%  [kernel]          [k] do_shrink_slab
   3.30%  [kernel]          [k] __rcu_read_unlock
   2.61%  [kernel]          [k] shrink_lruvec
   2.49%  [kernel]          [k] __rcu_read_lock
   1.93%  [kernel]          [k] intel_idle_irq
   0.89%  [kernel]          [k] shrink_node
   0.81%  [kernel]          [k] mem_cgroup_iter
   0.77%  [kernel]          [k] mem_cgroup_calculate_protection
   0.66%  [kernel]          [k] list_lru_count_one

We can see that the first perf hotspot becomes shrink_slab, which is what
we expect.

Link: https://lkml.kernel.org/r/20230911094444.68966-44-zhengqi.arch@bytedance.com
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Abhinav Kumar <quic_abhinavk@quicinc.com>
Cc: Alasdair Kergon <agk@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Anna Schumaker <anna@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Bob Peterson <rpeterso@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Carlos Llamas <cmllamas@google.com>
Cc: Chandan Babu R <chandan.babu@oracle.com>
Cc: Chao Yu <chao@kernel.org>
Cc: Chris Mason <clm@fb.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christian Koenig <christian.koenig@amd.com>
Cc: Chuck Lever <cel@kernel.org>
Cc: Coly Li <colyli@suse.de>
Cc: Dai Ngo <Dai.Ngo@oracle.com>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Airlie <airlied@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Sterba <dsterba@suse.com>
Cc: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Cc: Gao Xiang <hsiangkao@linux.alibaba.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Huang Rui <ray.huang@amd.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Jeffle Xu <jefflexu@linux.alibaba.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Kirill Tkhai <tkhai@ya.ru>
Cc: Marijn Suijten <marijn.suijten@somainline.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Mike Snitzer <snitzer@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Nadav Amit <namit@vmware.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
Cc: Olga Kornievskaia <kolga@netapp.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Rob Clark <robdclark@gmail.com>
Cc: Rob Herring <robh@kernel.org>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Sean Paul <sean@poorly.run>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Song Liu <song@kernel.org>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Steven Price <steven.price@arm.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Cc: Tom Talpey <tom@talpey.com>
Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: Yue Hu <huyue2@coolpad.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-04 10:32:26 -07:00

810 lines
21 KiB
C

// SPDX-License-Identifier: GPL-2.0
#include <linux/memcontrol.h>
#include <linux/rwsem.h>
#include <linux/shrinker.h>
#include <linux/rculist.h>
#include <trace/events/vmscan.h>
#include "internal.h"
/* Registered shrinkers; shrink_slab() walks this list under rcu_read_lock(). */
LIST_HEAD(shrinker_list);
/*
 * Held for write while a shrinker is registered or freed and while the
 * per-memcg shrinker_info is allocated or expanded; held for read by
 * reparent_shrinker_deferred().
 */
DECLARE_RWSEM(shrinker_rwsem);
#ifdef CONFIG_MEMCG
/*
 * Number of shrinker ids each shrinker_info currently has room for. Written by
 * expand_shrinker_info() and read by alloc_shrinker_info().
 */
static int shrinker_nr_max;
/*
 * Size in bytes of the shrinker_info_unit pointer array needed to cover
 * @nr_items shrinker ids (one pointer per SHRINKER_UNIT_BITS ids).
 */
static inline int shrinker_unit_size(int nr_items)
{
	int nr_units = DIV_ROUND_UP(nr_items, SHRINKER_UNIT_BITS);

	return nr_units * sizeof(struct shrinker_info_unit *);
}
/*
 * Free the units of @info from slot @start onwards.
 *
 * The walk stops at the first empty slot; every slot that is freed is reset
 * to NULL. A NULL @info is silently ignored.
 */
static inline void shrinker_unit_free(struct shrinker_info *info, int start)
{
	int nr_units, idx;

	if (!info)
		return;

	nr_units = DIV_ROUND_UP(info->map_nr_max, SHRINKER_UNIT_BITS);
	for (idx = start; idx < nr_units && info->unit[idx]; idx++) {
		kfree(info->unit[idx]);
		info->unit[idx] = NULL;
	}
}
/*
 * Allocate the units of @new that @old does not already provide.
 *
 * Slots below the unit count of @old (or below 0 when @old is NULL) are left
 * untouched; the remaining slots up to the unit count of @new each get a
 * zeroed unit allocated on node @nid.
 *
 * Returns 0 on success. On allocation failure the units allocated so far are
 * released through shrinker_unit_free() and -ENOMEM is returned.
 */
static inline int shrinker_unit_alloc(struct shrinker_info *new,
				      struct shrinker_info *old, int nid)
{
	int end = DIV_ROUND_UP(new->map_nr_max, SHRINKER_UNIT_BITS);
	int first = 0;
	int idx;

	if (old)
		first = DIV_ROUND_UP(old->map_nr_max, SHRINKER_UNIT_BITS);

	for (idx = first; idx < end; idx++) {
		struct shrinker_info_unit *fresh;

		fresh = kzalloc_node(sizeof(*fresh), GFP_KERNEL, nid);
		if (!fresh) {
			shrinker_unit_free(new, first);
			return -ENOMEM;
		}
		new->unit[idx] = fresh;
	}

	return 0;
}
/*
 * Release the shrinker_info of every node of @memcg: first its units, then the
 * info itself, and finally clear the per-node pointer. The info is freed
 * directly with kvfree(), without waiting for an RCU grace period.
 */
void free_shrinker_info(struct mem_cgroup *memcg)
{
	int nid;

	for_each_node(nid) {
		struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
		struct shrinker_info *info =
			rcu_dereference_protected(pn->shrinker_info, true);

		shrinker_unit_free(info, 0);
		kvfree(info);
		rcu_assign_pointer(pn->shrinker_info, NULL);
	}
}
/*
 * alloc_shrinker_info - allocate the per-node shrinker_info of @memcg
 * @memcg: memory cgroup to set up
 *
 * For every node, allocates a zeroed shrinker_info sized for the current
 * shrinker_nr_max, populates its units and publishes it in
 * memcg->nodeinfo[nid]. The whole operation runs with shrinker_rwsem held for
 * write so that shrinker_nr_max cannot change underneath it.
 *
 * Returns 0 on success or -ENOMEM. On failure everything that was already
 * published for earlier nodes is torn down via free_shrinker_info().
 */
int alloc_shrinker_info(struct mem_cgroup *memcg)
{
	struct shrinker_info *info;
	int array_size;
	int nid;

	down_write(&shrinker_rwsem);
	array_size = shrinker_unit_size(shrinker_nr_max);
	for_each_node(nid) {
		info = kvzalloc_node(sizeof(*info) + array_size, GFP_KERNEL, nid);
		if (!info)
			goto err;
		info->map_nr_max = shrinker_nr_max;
		if (shrinker_unit_alloc(info, NULL, nid)) {
			/*
			 * @info is not yet reachable from memcg->nodeinfo[nid],
			 * so free_shrinker_info() below cannot find it. Free it
			 * here or it leaks; its units were already released by
			 * shrinker_unit_alloc() on its failure path.
			 */
			kvfree(info);
			goto err;
		}
		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
	}
	up_write(&shrinker_rwsem);

	return 0;

err:
	up_write(&shrinker_rwsem);
	free_shrinker_info(memcg);
	return -ENOMEM;
}
static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
int nid)
{
return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
lockdep_is_held(&shrinker_rwsem));
}
/*
 * Grow the shrinker_info of every node of @memcg so that it covers
 * @new_nr_max shrinker ids.
 *
 * @new_size and @old_size are the byte sizes of the new and the current unit
 * pointer arrays. Existing unit pointers are copied into the new info (the
 * units themselves are shared, not duplicated), the additional units are
 * allocated, and the new info is published with rcu_assign_pointer(). The old
 * info is freed after an RCU grace period.
 *
 * Returns 0 on success (including the case where @memcg has no shrinker_info
 * yet) or -ENOMEM.
 */
static int expand_one_shrinker_info(struct mem_cgroup *memcg, int new_size,
				    int old_size, int new_nr_max)
{
	struct shrinker_info *new, *old;
	struct mem_cgroup_per_node *pn;
	int nid;

	for_each_node(nid) {
		pn = memcg->nodeinfo[nid];
		old = shrinker_info_protected(memcg, nid);
		/* Not yet online memcg */
		if (!old)
			return 0;

		/* Already expanded this shrinker_info */
		if (new_nr_max <= old->map_nr_max)
			continue;

		/*
		 * The allocation must be zeroed: the unit slots past @old_size
		 * are not covered by the memcpy() below, and if
		 * shrinker_unit_alloc() fails it calls shrinker_unit_free(),
		 * which treats every non-NULL slot as a unit to kfree().
		 */
		new = kvzalloc_node(sizeof(*new) + new_size, GFP_KERNEL, nid);
		if (!new)
			return -ENOMEM;

		new->map_nr_max = new_nr_max;

		memcpy(new->unit, old->unit, old_size);
		if (shrinker_unit_alloc(new, old, nid)) {
			kvfree(new);
			return -ENOMEM;
		}

		rcu_assign_pointer(pn->shrinker_info, new);
		kvfree_rcu(old, rcu);
	}

	return 0;
}
/*
 * Make room for shrinker id @new_id in the shrinker_info of every memcg.
 *
 * The new capacity is @new_id + 1 rounded up to a multiple of
 * SHRINKER_UNIT_BITS. When root_mem_cgroup is not set there is nothing to
 * expand and only shrinker_nr_max is updated. shrinker_nr_max is left
 * untouched when any expansion fails.
 *
 * Returns 0 on success or the error from expand_one_shrinker_info().
 */
static int expand_shrinker_info(int new_id)
{
	int new_nr_max = round_up(new_id + 1, SHRINKER_UNIT_BITS);
	int new_bytes, old_bytes;
	struct mem_cgroup *cg;
	int err;

	if (root_mem_cgroup) {
		lockdep_assert_held(&shrinker_rwsem);

		new_bytes = shrinker_unit_size(new_nr_max);
		old_bytes = shrinker_unit_size(shrinker_nr_max);

		cg = mem_cgroup_iter(NULL, NULL, NULL);
		do {
			err = expand_one_shrinker_info(cg, new_bytes, old_bytes,
						       new_nr_max);
			if (err) {
				/* Abandon the hierarchy walk started above. */
				mem_cgroup_iter_break(NULL, cg);
				return err;
			}
			cg = mem_cgroup_iter(NULL, cg, NULL);
		} while (cg != NULL);
	}

	shrinker_nr_max = new_nr_max;
	return 0;
}
/* Index of the shrinker_info_unit that holds @shrinker_id. */
static inline int shrinker_id_to_index(int shrinker_id)
{
	int unit_index = shrinker_id / SHRINKER_UNIT_BITS;

	return unit_index;
}
/* Position of @shrinker_id inside its shrinker_info_unit. */
static inline int shrinker_id_to_offset(int shrinker_id)
{
	int unit_offset = shrinker_id % SHRINKER_UNIT_BITS;

	return unit_offset;
}
/* Rebuild a shrinker id from a unit @index and an @offset inside that unit. */
static inline int calc_shrinker_id(int index, int offset)
{
	int first_id_of_unit = index * SHRINKER_UNIT_BITS;

	return first_id_of_unit + offset;
}
/*
 * set_shrinker_bit - mark a shrinker as having work for @memcg on node @nid
 * @memcg: memory cgroup; NULL and the root cgroup are ignored
 * @nid: node whose shrinker_info is updated
 * @shrinker_id: id of the shrinker; negative ids are ignored
 *
 * Sets the bit of @shrinker_id in the memcg's shrinker bitmap under the RCU
 * read lock. An id that is outside the current map triggers a one-time
 * warning and is otherwise ignored.
 */
void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
{
	struct shrinker_info *info;
	struct shrinker_info_unit *unit;

	if (shrinker_id < 0 || !memcg || mem_cgroup_is_root(memcg))
		return;

	rcu_read_lock();
	info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
	if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) {
		/*
		 * Only index info->unit[] once the id is known to be in
		 * range, so an out-of-range id never reads past the array.
		 */
		unit = info->unit[shrinker_id_to_index(shrinker_id)];

		/* Pairs with smp mb in shrink_slab() */
		smp_mb__before_atomic();
		set_bit(shrinker_id_to_offset(shrinker_id), unit->map);
	}
	rcu_read_unlock();
}
/* Maps shrinker ids to their struct shrinker. */
static DEFINE_IDR(shrinker_idr);

/*
 * Assign a shrinker id to @shrinker and make sure every shrinker_info can
 * hold it.
 *
 * Returns 0 on success, -ENOSYS when memcg is disabled, and -ENOMEM when
 * either the id allocation or the shrinker_info expansion fails (the id is
 * released again in the latter case).
 */
static int shrinker_memcg_alloc(struct shrinker *shrinker)
{
	int new_id;
	int rc = -ENOMEM;

	if (mem_cgroup_disabled())
		return -ENOSYS;

	down_write(&shrinker_rwsem);
	new_id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
	if (new_id >= 0) {
		if (new_id >= shrinker_nr_max && expand_shrinker_info(new_id)) {
			idr_remove(&shrinker_idr, new_id);
		} else {
			shrinker->id = new_id;
			rc = 0;
		}
	}
	up_write(&shrinker_rwsem);

	return rc;
}
/*
 * Give the id of @shrinker back to shrinker_idr. The caller must hold
 * shrinker_rwsem; a negative id is treated as a fatal bug.
 */
static void shrinker_memcg_remove(struct shrinker *shrinker)
{
	BUG_ON(shrinker->id < 0);

	lockdep_assert_held(&shrinker_rwsem);

	idr_remove(&shrinker_idr, shrinker->id);
}
/*
 * Atomically read and zero the deferred count that @memcg keeps for
 * @shrinker on node @nid. Returns the value that was stored.
 */
static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
				   struct mem_cgroup *memcg)
{
	int idx = shrinker_id_to_index(shrinker->id);
	int off = shrinker_id_to_offset(shrinker->id);
	struct shrinker_info *info;
	long previous;

	rcu_read_lock();
	info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
	previous = atomic_long_xchg(&info->unit[idx]->nr_deferred[off], 0);
	rcu_read_unlock();

	return previous;
}
/*
 * Atomically add @nr to the deferred count that @memcg keeps for @shrinker
 * on node @nid. Returns the count after the addition.
 */
static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
				  struct mem_cgroup *memcg)
{
	int idx = shrinker_id_to_index(shrinker->id);
	int off = shrinker_id_to_offset(shrinker->id);
	struct shrinker_info *info;
	long updated;

	rcu_read_lock();
	info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
	updated = atomic_long_add_return(nr, &info->unit[idx]->nr_deferred[off]);
	rcu_read_unlock();

	return updated;
}
/*
 * Add every deferred count of @memcg to the matching counter of its parent
 * (or of root_mem_cgroup when there is no parent), for all nodes. The
 * counters of @memcg itself are only read, not reset.
 */
void reparent_shrinker_deferred(struct mem_cgroup *memcg)
{
	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
	int nid;

	if (!parent)
		parent = root_mem_cgroup;

	/* Prevent from concurrent shrinker_info expand */
	down_read(&shrinker_rwsem);
	for_each_node(nid) {
		struct shrinker_info *src = shrinker_info_protected(memcg, nid);
		struct shrinker_info *dst = shrinker_info_protected(parent, nid);
		int unit_idx, slot;

		for (unit_idx = 0;
		     unit_idx < shrinker_id_to_index(src->map_nr_max);
		     unit_idx++) {
			struct shrinker_info_unit *from = src->unit[unit_idx];
			struct shrinker_info_unit *to = dst->unit[unit_idx];

			for (slot = 0; slot < SHRINKER_UNIT_BITS; slot++) {
				long pending = atomic_long_read(&from->nr_deferred[slot]);

				atomic_long_add(pending, &to->nr_deferred[slot]);
			}
		}
	}
	up_read(&shrinker_rwsem);
}
#else
/*
 * !CONFIG_MEMCG stub: always reports -ENOSYS, which shrinker_alloc() treats
 * as "fall back to a non-memcg-aware shrinker".
 */
static int shrinker_memcg_alloc(struct shrinker *shrinker)
{
return -ENOSYS;
}
/* !CONFIG_MEMCG stub: there is no shrinker id to release. */
static void shrinker_memcg_remove(struct shrinker *shrinker)
{
}
/* !CONFIG_MEMCG stub: there is no per-memcg deferred count, report 0. */
static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
struct mem_cgroup *memcg)
{
return 0;
}
/* !CONFIG_MEMCG stub: @nr is dropped and 0 is reported. */
static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
struct mem_cgroup *memcg)
{
return 0;
}
#endif /* CONFIG_MEMCG */
/*
 * Atomically fetch and zero the deferred scan count that applies to @sc.
 *
 * Shrinkers without SHRINKER_NUMA_AWARE always use node 0. The per-memcg
 * counter is used when @sc targets a memcg and the shrinker is memcg aware;
 * otherwise the shrinker's own per-node counter is used.
 */
static long xchg_nr_deferred(struct shrinker *shrinker,
			     struct shrink_control *sc)
{
	int nid = (shrinker->flags & SHRINKER_NUMA_AWARE) ? sc->nid : 0;

	if (sc->memcg && (shrinker->flags & SHRINKER_MEMCG_AWARE))
		return xchg_nr_deferred_memcg(nid, shrinker, sc->memcg);

	return atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
}
/*
 * Atomically add @nr to the deferred scan count that applies to @sc and
 * return the new total.
 *
 * Counter selection mirrors xchg_nr_deferred(): node 0 for shrinkers without
 * SHRINKER_NUMA_AWARE, the per-memcg counter for memcg-aware shrinkers when
 * @sc targets a memcg, the shrinker's own counter otherwise.
 */
static long add_nr_deferred(long nr, struct shrinker *shrinker,
			    struct shrink_control *sc)
{
	int nid = (shrinker->flags & SHRINKER_NUMA_AWARE) ? sc->nid : 0;

	if (sc->memcg && (shrinker->flags & SHRINKER_MEMCG_AWARE))
		return add_nr_deferred_memcg(nr, nid, shrinker, sc->memcg);

	return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
}
/* Batch size used when a shrinker does not provide its own ->batch. */
#define SHRINK_BATCH 128
/*
 * do_shrink_slab - run one shrinker for one shrink_control
 * @shrinkctl: reclaim context handed to the shrinker callbacks
 * @shrinker: shrinker to run
 * @priority: reclaim priority; object counts are shifted right by it
 *
 * Asks the shrinker how many objects are freeable, derives a scan target from
 * that count, the priority and the previously deferred work, then calls
 * ->scan_objects() in batches until the target is used up or the shrinker
 * returns SHRINK_STOP. Work that was not done is stored back as deferred.
 *
 * Returns the number of freed objects, or the ->count_objects() result itself
 * when that is 0 or SHRINK_EMPTY.
 */
static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
struct shrinker *shrinker, int priority)
{
unsigned long freed = 0;
unsigned long long delta;
long total_scan;
long freeable;
long nr;
long new_nr;
long batch_size = shrinker->batch ? shrinker->batch
: SHRINK_BATCH;
long scanned = 0, next_deferred;
freeable = shrinker->count_objects(shrinker, shrinkctl);
if (freeable == 0 || freeable == SHRINK_EMPTY)
return freeable;
/*
 * copy the current shrinker scan count into a local variable
 * and zero it so that other concurrent shrinker invocations
 * don't also do this scanning work.
 */
nr = xchg_nr_deferred(shrinker, shrinkctl);
if (shrinker->seeks) {
/* New work: (freeable >> priority) * 4 / seeks. */
delta = freeable >> priority;
delta *= 4;
do_div(delta, shrinker->seeks);
} else {
/*
 * These objects don't require any IO to create. Trim
 * them aggressively under memory pressure to keep
 * them from causing refetches in the IO caches.
 */
delta = freeable / 2;
}
/* Target = priority-scaled deferred work + new work, capped at 2 * freeable. */
total_scan = nr >> priority;
total_scan += delta;
total_scan = min(total_scan, (2 * freeable));
trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
freeable, delta, total_scan, priority);
/*
 * Normally, we should not scan less than batch_size objects in one
 * pass to avoid too frequent shrinker calls, but if the slab has less
 * than batch_size objects in total and we are really tight on memory,
 * we will try to reclaim all available objects, otherwise we can end
 * up failing allocations although there are plenty of reclaimable
 * objects spread over several slabs with usage less than the
 * batch_size.
 *
 * We detect the "tight on memory" situations by looking at the total
 * number of objects we want to scan (total_scan). If it is greater
 * than the total number of objects on slab (freeable), we must be
 * scanning at high prio and therefore should try to reclaim as much as
 * possible.
 */
while (total_scan >= batch_size ||
total_scan >= freeable) {
unsigned long ret;
unsigned long nr_to_scan = min(batch_size, total_scan);
shrinkctl->nr_to_scan = nr_to_scan;
/*
 * nr_scanned is preset to the request and read back after the
 * callback; presumably ->scan_objects() may overwrite it with the
 * number it really scanned - confirm against the shrinker API.
 */
shrinkctl->nr_scanned = nr_to_scan;
ret = shrinker->scan_objects(shrinker, shrinkctl);
if (ret == SHRINK_STOP)
break;
freed += ret;
count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
total_scan -= shrinkctl->nr_scanned;
scanned += shrinkctl->nr_scanned;
cond_resched();
}
/*
 * The deferred work is increased by any new work (delta) that wasn't
 * done, decreased by old deferred work that was done now.
 *
 * And it is capped to two times of the freeable items.
 */
next_deferred = max_t(long, (nr + delta - scanned), 0);
next_deferred = min(next_deferred, (2 * freeable));
/*
 * move the unused scan count back into the shrinker in a
 * manner that handles concurrent updates.
 */
new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl);
trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan);
return freed;
}
#ifdef CONFIG_MEMCG
/*
 * shrink_slab_memcg - run the shrinkers that have work queued for @memcg
 * @gfp_mask: allocation context passed on to the shrinkers
 * @nid: node to shrink
 * @memcg: memory cgroup to shrink; an offline memcg is skipped
 * @priority: reclaim priority passed on to do_shrink_slab()
 *
 * Walks the shrinker bitmap of @memcg on @nid unit by unit and calls
 * do_shrink_slab() for every shrinker whose bit is set. Bits of shrinkers
 * that no longer exist, or that report SHRINK_EMPTY twice in a row, are
 * cleared.
 *
 * Returns the number of objects freed.
 */
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
				       struct mem_cgroup *memcg, int priority)
{
	struct shrinker_info *info;
	unsigned long ret, freed = 0;
	int offset, index = 0;

	if (!mem_cgroup_online(memcg))
		return 0;

	/*
	 * lockless algorithm of memcg shrink.
	 *
	 * The shrinker_info may be freed asynchronously via RCU in the
	 * expand_one_shrinker_info(), so the rcu_read_lock() needs to be used
	 * to ensure the existence of the shrinker_info.
	 *
	 * The shrinker_info_unit is never freed unless its corresponding memcg
	 * is destroyed. Here we already hold the refcount of memcg, so the
	 * memcg will not be destroyed, and of course shrinker_info_unit will
	 * not be freed.
	 *
	 * So in the memcg shrink:
	 *  step 1: use rcu_read_lock() to guarantee existence of the
	 *          shrinker_info.
	 *  step 2: after getting shrinker_info_unit we can safely release the
	 *          RCU lock.
	 *  step 3: traverse the bitmap and calculate shrinker_id
	 *  step 4: use rcu_read_lock() to guarantee existence of the shrinker.
	 *  step 5: use shrinker_id to find the shrinker, then use
	 *          shrinker_try_get() to guarantee existence of the shrinker,
	 *          then we can release the RCU lock to do do_shrink_slab() that
	 *          may sleep.
	 *  step 6: do shrinker_put() paired with step 5 to put the refcount,
	 *          if the refcount reaches 0, then wake up the waiter in
	 *          shrinker_free() by calling complete().
	 *          Note: here is different from the global shrink, we don't
	 *                need to acquire the RCU lock to guarantee existence of
	 *                the shrinker, because we don't need to use this
	 *                shrinker to traverse the next shrinker in the bitmap.
	 *  step 7: we have already exited the read-side of rcu critical section
	 *          before calling do_shrink_slab(), the shrinker_info may be
	 *          released in expand_one_shrinker_info(), so go back to step 1
	 *          to reacquire the shrinker_info.
	 */
again:
	rcu_read_lock();
	info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
	if (unlikely(!info))
		goto unlock;

	if (index < shrinker_id_to_index(info->map_nr_max)) {
		struct shrinker_info_unit *unit;

		unit = info->unit[index];

		rcu_read_unlock();

		for_each_set_bit(offset, unit->map, SHRINKER_UNIT_BITS) {
			struct shrink_control sc = {
				.gfp_mask = gfp_mask,
				.nid = nid,
				.memcg = memcg,
			};
			struct shrinker *shrinker;
			int shrinker_id = calc_shrinker_id(index, offset);

			rcu_read_lock();
			shrinker = idr_find(&shrinker_idr, shrinker_id);
			if (unlikely(!shrinker || !shrinker_try_get(shrinker))) {
				clear_bit(offset, unit->map);
				rcu_read_unlock();
				continue;
			}
			rcu_read_unlock();

			/* Call non-slab shrinkers even though kmem is disabled */
			if (!memcg_kmem_online() &&
			    !(shrinker->flags & SHRINKER_NONSLAB)) {
				/*
				 * Drop the reference taken by
				 * shrinker_try_get() above before skipping
				 * this shrinker; otherwise its refcount can
				 * never reach zero and shrinker_free() would
				 * wait for the completion forever.
				 */
				shrinker_put(shrinker);
				continue;
			}

			ret = do_shrink_slab(&sc, shrinker, priority);
			if (ret == SHRINK_EMPTY) {
				clear_bit(offset, unit->map);
				/*
				 * After the shrinker reported that it had no objects to
				 * free, but before we cleared the corresponding bit in
				 * the memcg shrinker map, a new object might have been
				 * added. To make sure, we have the bit set in this
				 * case, we invoke the shrinker one more time and reset
				 * the bit if it reports that it is not empty anymore.
				 * The memory barrier here pairs with the barrier in
				 * set_shrinker_bit():
				 *
				 * list_lru_add()     shrink_slab_memcg()
				 *   list_add_tail()    clear_bit()
				 *   <MB>               <MB>
				 *   set_bit()          do_shrink_slab()
				 */
				smp_mb__after_atomic();
				ret = do_shrink_slab(&sc, shrinker, priority);
				if (ret == SHRINK_EMPTY)
					ret = 0;
				else
					set_shrinker_bit(memcg, nid, shrinker_id);
			}
			freed += ret;
			shrinker_put(shrinker);
		}

		/* Step 7: move to the next unit and re-read shrinker_info. */
		index++;
		goto again;
	}
unlock:
	rcu_read_unlock();
	return freed;
}
#else /* !CONFIG_MEMCG */
/* !CONFIG_MEMCG stub: there is no per-memcg shrinking, nothing is freed. */
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
struct mem_cgroup *memcg, int priority)
{
return 0;
}
#endif /* CONFIG_MEMCG */
/**
 * shrink_slab - shrink slab caches
 * @gfp_mask: allocation context
 * @nid: node whose slab caches to target
 * @memcg: memory cgroup whose slab caches to target
 * @priority: the reclaim priority
 *
 * Call the shrink functions to age shrinkable caches.
 *
 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
 * unaware shrinkers will receive a node id of 0 instead.
 *
 * @memcg specifies the memory cgroup to target. Unaware shrinkers
 * are called only if it is the root cgroup.
 *
 * @priority is sc->priority, we take the number of objects and >> by priority
 * in order to get the scan target.
 *
 * Returns the number of reclaimed slab objects.
 */
unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg,
int priority)
{
unsigned long ret, freed = 0;
struct shrinker *shrinker;
/*
 * The root memcg might be allocated even though memcg is disabled
 * via "cgroup_disable=memory" boot parameter. This could make
 * mem_cgroup_is_root() return false, then just run memcg slab
 * shrink, but skip global shrink. This may result in premature
 * oom.
 */
if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
/*
 * lockless algorithm of global shrink.
 *
 * In the unregistration step, the shrinker will be freed asynchronously
 * via RCU after its refcount reaches 0. So both rcu_read_lock() and
 * shrinker_try_get() can be used to ensure the existence of the shrinker.
 *
 * So in the global shrink:
 *  step 1: use rcu_read_lock() to guarantee existence of the shrinker
 *          and the validity of the shrinker_list walk.
 *  step 2: use shrinker_try_get() to try get the refcount, if successful,
 *          then the existence of the shrinker can also be guaranteed,
 *          so we can release the RCU lock to do do_shrink_slab() that
 *          may sleep.
 *  step 3: *MUST* to reacquire the RCU lock before calling shrinker_put(),
 *          which ensures that neither this shrinker nor the next shrinker
 *          will be freed in the next traversal operation.
 *  step 4: do shrinker_put() paired with step 2 to put the refcount,
 *          if the refcount reaches 0, then wake up the waiter in
 *          shrinker_free() by calling complete().
 */
rcu_read_lock();
list_for_each_entry_rcu(shrinker, &shrinker_list, list) {
/* Built afresh for every shrinker, so each one starts from the same context. */
struct shrink_control sc = {
.gfp_mask = gfp_mask,
.nid = nid,
.memcg = memcg,
};
if (!shrinker_try_get(shrinker))
continue;
rcu_read_unlock();
ret = do_shrink_slab(&sc, shrinker, priority);
/* SHRINK_EMPTY is a status value, not a count: contribute nothing. */
if (ret == SHRINK_EMPTY)
ret = 0;
freed += ret;
rcu_read_lock();
shrinker_put(shrinker);
}
rcu_read_unlock();
cond_resched();
return freed;
}
/*
 * shrinker_alloc - allocate and pre-initialise a shrinker
 * @flags: SHRINKER_* flags requested by the caller
 * @fmt: printf-style format of the shrinker's debugfs name
 *
 * The returned shrinker has SHRINKER_ALLOCATED set and ->seeks preset to
 * DEFAULT_SEEKS. A memcg-aware shrinker gets a shrinker id; when memcg
 * support is unavailable (-ENOSYS) it is silently downgraded to a
 * non-memcg-aware shrinker with its own nr_deferred array.
 *
 * Returns the new shrinker, or NULL on failure.
 */
struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...)
{
	struct shrinker *new_shrinker;
	unsigned int deferred_bytes;
	va_list args;
	int rc;

	new_shrinker = kzalloc(sizeof(*new_shrinker), GFP_KERNEL);
	if (!new_shrinker)
		return NULL;

	va_start(args, fmt);
	rc = shrinker_debugfs_name_alloc(new_shrinker, fmt, args);
	va_end(args);
	if (rc)
		goto err_name;

	new_shrinker->flags = flags | SHRINKER_ALLOCATED;
	new_shrinker->seeks = DEFAULT_SEEKS;

	if (flags & SHRINKER_MEMCG_AWARE) {
		rc = shrinker_memcg_alloc(new_shrinker);
		if (!rc)
			return new_shrinker;
		if (rc != -ENOSYS)
			goto err_flags;
		/* Memcg is not supported, fallback to non-memcg-aware shrinker. */
		new_shrinker->flags &= ~SHRINKER_MEMCG_AWARE;
	}

	/*
	 * The nr_deferred is available on per memcg level for memcg aware
	 * shrinkers, so only allocate nr_deferred in the following cases:
	 *  - non-memcg-aware shrinkers
	 *  - !CONFIG_MEMCG
	 *  - memcg is disabled by kernel command line
	 */
	deferred_bytes = sizeof(*new_shrinker->nr_deferred);
	if (flags & SHRINKER_NUMA_AWARE)
		deferred_bytes *= nr_node_ids;

	new_shrinker->nr_deferred = kzalloc(deferred_bytes, GFP_KERNEL);
	if (!new_shrinker->nr_deferred)
		goto err_flags;

	return new_shrinker;

err_flags:
	shrinker_debugfs_name_free(new_shrinker);
err_name:
	kfree(new_shrinker);
	return NULL;
}
EXPORT_SYMBOL_GPL(shrinker_alloc);
/*
 * shrinker_register - make a shrinker visible to reclaim
 * @shrinker: shrinker obtained from shrinker_alloc()
 *
 * Adds the shrinker to shrinker_list and debugfs under shrinker_rwsem, then
 * initialises its completion and sets its refcount to 1. A shrinker that does
 * not carry SHRINKER_ALLOCATED is rejected with a warning.
 */
void shrinker_register(struct shrinker *shrinker)
{
if (unlikely(!(shrinker->flags & SHRINKER_ALLOCATED))) {
pr_warn("Must use shrinker_alloc() to dynamically allocate the shrinker");
return;
}
down_write(&shrinker_rwsem);
list_add_tail_rcu(&shrinker->list, &shrinker_list);
shrinker->flags |= SHRINKER_REGISTERED;
shrinker_debugfs_add(shrinker);
up_write(&shrinker_rwsem);
/* Must be ready before the refcount below lets lookups take references. */
init_completion(&shrinker->done);
/*
 * Now the shrinker is fully set up, take the first reference to it to
 * indicate that lookup operations are now allowed to use it via
 * shrinker_try_get().
 */
refcount_set(&shrinker->refcount, 1);
}
EXPORT_SYMBOL_GPL(shrinker_register);
/*
 * RCU callback queued by shrinker_free(): releases the nr_deferred array and
 * then the shrinker that embeds @head.
 */
static void shrinker_free_rcu_cb(struct rcu_head *head)
{
	struct shrinker *victim;

	victim = container_of(head, struct shrinker, rcu);

	kfree(victim->nr_deferred);
	kfree(victim);
}
/*
 * shrinker_free - unregister (if needed) and free a shrinker
 * @shrinker: shrinker to free; NULL is a no-op
 *
 * For a registered shrinker the initial reference is dropped and the function
 * waits until every outstanding reference is gone. The shrinker is then
 * unlinked from shrinker_list, debugfs and the id allocator under
 * shrinker_rwsem, and its memory is released through an RCU callback.
 */
void shrinker_free(struct shrinker *shrinker)
{
	struct dentry *dentry = NULL;
	int dbg_id;

	if (!shrinker)
		return;

	if (shrinker->flags & SHRINKER_REGISTERED) {
		/* drop the initial refcount */
		shrinker_put(shrinker);
		/*
		 * Block until every lookup of this shrinker has finished.
		 * From then on it is neither running nor able to run again,
		 * so it can be freed asynchronously via RCU, and the
		 * structure that contains it (a super_block, for instance)
		 * can be freed safely as well.
		 */
		wait_for_completion(&shrinker->done);
	}

	down_write(&shrinker_rwsem);
	if (shrinker->flags & SHRINKER_REGISTERED) {
		/*
		 * Nobody uses the shrinker any more: take it off the
		 * shrinker_list before it is freed.
		 */
		list_del_rcu(&shrinker->list);
		dentry = shrinker_debugfs_detach(shrinker, &dbg_id);
		shrinker->flags &= ~SHRINKER_REGISTERED;
	}

	shrinker_debugfs_name_free(shrinker);

	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
		shrinker_memcg_remove(shrinker);
	up_write(&shrinker_rwsem);

	/* The debugfs entry is removed only after shrinker_rwsem is released. */
	if (dentry)
		shrinker_debugfs_remove(dentry, dbg_id);

	call_rcu(&shrinker->rcu, shrinker_free_rcu_cb);
}
EXPORT_SYMBOL_GPL(shrinker_free);