2019-05-27 06:55:06 +00:00
|
|
|
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
2008-02-07 08:13:50 +00:00
|
|
|
/* memcontrol.h - Memory Controller
|
|
|
|
*
|
|
|
|
* Copyright IBM Corporation, 2007
|
|
|
|
* Author Balbir Singh <balbir@linux.vnet.ibm.com>
|
|
|
|
*
|
2008-02-07 08:13:51 +00:00
|
|
|
* Copyright 2007 OpenVZ SWsoft Inc
|
|
|
|
* Author: Pavel Emelianov <xemul@openvz.org>
|
2008-02-07 08:13:50 +00:00
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef _LINUX_MEMCONTROL_H
|
|
|
|
#define _LINUX_MEMCONTROL_H
|
2009-01-08 02:08:02 +00:00
|
|
|
#include <linux/cgroup.h>
|
2011-05-26 23:25:38 +00:00
|
|
|
#include <linux/vm_event_item.h>
|
2012-12-18 22:21:56 +00:00
|
|
|
#include <linux/hardirq.h>
|
2012-12-18 22:22:09 +00:00
|
|
|
#include <linux/jump_label.h>
|
2024-02-02 21:23:18 +00:00
|
|
|
#include <linux/kernel.h>
|
2015-09-08 22:01:02 +00:00
|
|
|
#include <linux/page_counter.h>
|
|
|
|
#include <linux/vmpressure.h>
|
|
|
|
#include <linux/eventfd.h>
|
2017-07-06 22:40:52 +00:00
|
|
|
#include <linux/mm.h>
|
|
|
|
#include <linux/vmstat.h>
|
2015-09-08 22:01:02 +00:00
|
|
|
#include <linux/writeback.h>
|
2016-03-15 21:57:25 +00:00
|
|
|
#include <linux/page-flags.h>
|
mm: shrinker: add a secondary array for shrinker_info::{map, nr_deferred}
Currently, we maintain two linear arrays per node per memcg, which are
shrinker_info::map and shrinker_info::nr_deferred. And we need to resize
them when the shrinker_nr_max is exceeded, that is, allocate a new array,
and then copy the old array to the new array, and finally free the old
array by RCU.
For shrinker_info::map, we do set_bit() under the RCU lock, so we may set
the value into the old map which is about to be freed. This may cause the
value set to be lost. The current solution is not to copy the old map when
resizing, but to set all the corresponding bits in the new map to 1. This
solves the data loss problem, but bring the overhead of more pointless
loops while doing memcg slab shrink.
For shrinker_info::nr_deferred, we will only modify it under the read lock
of shrinker_rwsem, so it will not run concurrently with the resizing. But
after we make memcg slab shrink lockless, there will be the same data loss
problem as shrinker_info::map, and we can't work around it like the map.
For such resizable arrays, the most straightforward idea is to change it
to xarray, like we did for list_lru [1]. We need to do xa_store() in the
list_lru_add()-->set_shrinker_bit(), but this will cause memory
allocation, and the list_lru_add() doesn't accept failure. A possible
solution is to pre-allocate, but the location of pre-allocation is not
well determined (such as deferred_split_shrinker case).
Therefore, this commit chooses to introduce the following secondary array
for shrinker_info::{map, nr_deferred}:
+---------------+--------+--------+-----+
| shrinker_info | unit 0 | unit 1 | ... | (secondary array)
+---------------+--------+--------+-----+
|
v
+---------------+-----+
| nr_deferred[] | map | (leaf array)
+---------------+-----+
(shrinker_info_unit)
The leaf array is never freed unless the memcg is destroyed. The secondary
array will be resized every time the shrinker id exceeds shrinker_nr_max.
So the shrinker_info_unit can be indexed from both the old and the new
shrinker_info->unit[x]. Then even if we get the old secondary array under
the RCU lock, the found map and nr_deferred are also true, so the updated
nr_deferred and map will not be lost.
[1]. https://lore.kernel.org/all/20220228122126.37293-13-songmuchun@bytedance.com/
[zhengqi.arch@bytedance.com: unlock the &shrinker_rwsem before the call to free_shrinker_info()]
Link: https://lkml.kernel.org/r/20230928141517.12164-1-zhengqi.arch@bytedance.com
Link: https://lkml.kernel.org/r/20230911094444.68966-41-zhengqi.arch@bytedance.com
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Cc: Abhinav Kumar <quic_abhinavk@quicinc.com>
Cc: Alasdair Kergon <agk@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Anna Schumaker <anna@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Bob Peterson <rpeterso@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Carlos Llamas <cmllamas@google.com>
Cc: Chandan Babu R <chandan.babu@oracle.com>
Cc: Chao Yu <chao@kernel.org>
Cc: Chris Mason <clm@fb.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christian Koenig <christian.koenig@amd.com>
Cc: Chuck Lever <cel@kernel.org>
Cc: Coly Li <colyli@suse.de>
Cc: Dai Ngo <Dai.Ngo@oracle.com>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Airlie <airlied@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Sterba <dsterba@suse.com>
Cc: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Cc: Gao Xiang <hsiangkao@linux.alibaba.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Huang Rui <ray.huang@amd.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Jeffle Xu <jefflexu@linux.alibaba.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Kirill Tkhai <tkhai@ya.ru>
Cc: Marijn Suijten <marijn.suijten@somainline.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Mike Snitzer <snitzer@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nadav Amit <namit@vmware.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
Cc: Olga Kornievskaia <kolga@netapp.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Rob Clark <robdclark@gmail.com>
Cc: Rob Herring <robh@kernel.org>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Sean Paul <sean@poorly.run>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Song Liu <song@kernel.org>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Steven Price <steven.price@arm.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Cc: Tom Talpey <tom@talpey.com>
Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: Yue Hu <huyue2@coolpad.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-09-11 09:44:39 +00:00
|
|
|
#include <linux/shrinker.h>
|
2011-05-26 23:25:38 +00:00
|
|
|
|
2008-02-07 08:13:51 +00:00
|
|
|
struct mem_cgroup;
|
2020-08-07 06:20:49 +00:00
|
|
|
struct obj_cgroup;
|
2008-02-07 08:13:59 +00:00
|
|
|
struct page;
|
|
|
|
struct mm_struct;
|
2012-12-18 22:22:34 +00:00
|
|
|
struct kmem_cache;
|
2008-02-07 08:13:51 +00:00
|
|
|
|
2017-05-03 21:55:13 +00:00
|
|
|
/* Cgroup-specific page state, on top of universal node page state */
|
|
|
|
enum memcg_stat_item {
|
2020-06-03 23:02:01 +00:00
|
|
|
MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS,
|
2017-05-03 21:55:13 +00:00
|
|
|
MEMCG_SOCK,
|
2020-08-12 01:30:21 +00:00
|
|
|
MEMCG_PERCPU_B,
|
2022-01-14 22:05:45 +00:00
|
|
|
MEMCG_VMALLOC,
|
memcg: add per-memcg total kernel memory stat
Currently memcg stats show several types of kernel memory: kernel stack,
page tables, sock, vmalloc, and slab. However, there are other
allocations with __GFP_ACCOUNT (or supersets such as GFP_KERNEL_ACCOUNT)
that are not accounted in any of those stats, a few examples are:
- various kvm allocations (e.g. allocated pages to create vcpus)
- io_uring
- tmp_page in pipes during pipe_write()
- bpf ringbuffers
- unix sockets
Keeping track of the total kernel memory is essential for the ease of
migration from cgroup v1 to v2 as there are large discrepancies between
v1's kmem.usage_in_bytes and the sum of the available kernel memory
stats in v2. Adding separate memcg stats for all __GFP_ACCOUNT kernel
allocations is an impractical maintenance burden as there a lot of those
all over the kernel code, with more use cases likely to show up in the
future.
Therefore, add a "kernel" memcg stat that is analogous to kmem page
counter, with added benefits such as using rstat infrastructure which
aggregates stats more efficiently. Additionally, this provides a
lighter alternative in case the legacy kmem is deprecated in the future
[yosryahmed@google.com: v2]
Link: https://lkml.kernel.org/r/20220203193856.972500-1-yosryahmed@google.com
Link: https://lkml.kernel.org/r/20220201200823.3283171-1-yosryahmed@google.com
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
Acked-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2022-03-22 21:40:10 +00:00
|
|
|
MEMCG_KMEM,
|
2022-05-19 21:08:53 +00:00
|
|
|
MEMCG_ZSWAP_B,
|
|
|
|
MEMCG_ZSWAPPED,
|
2016-01-20 23:03:22 +00:00
|
|
|
MEMCG_NR_STAT,
|
2011-01-13 23:47:37 +00:00
|
|
|
};
|
|
|
|
|
2018-04-10 23:29:45 +00:00
|
|
|
enum memcg_memory_event {
|
|
|
|
MEMCG_LOW,
|
2017-05-03 21:55:13 +00:00
|
|
|
MEMCG_HIGH,
|
|
|
|
MEMCG_MAX,
|
|
|
|
MEMCG_OOM,
|
2018-06-14 22:28:05 +00:00
|
|
|
MEMCG_OOM_KILL,
|
2022-01-14 22:05:35 +00:00
|
|
|
MEMCG_OOM_GROUP_KILL,
|
2020-06-02 04:49:52 +00:00
|
|
|
MEMCG_SWAP_HIGH,
|
2018-06-08 00:05:35 +00:00
|
|
|
MEMCG_SWAP_MAX,
|
|
|
|
MEMCG_SWAP_FAIL,
|
2018-04-10 23:29:45 +00:00
|
|
|
MEMCG_NR_MEMORY_EVENTS,
|
2017-05-03 21:55:13 +00:00
|
|
|
};
|
|
|
|
|
2012-01-13 01:17:59 +00:00
|
|
|
struct mem_cgroup_reclaim_cookie {
|
2016-07-28 22:46:05 +00:00
|
|
|
pg_data_t *pgdat;
|
2024-09-05 00:30:53 +00:00
|
|
|
int generation;
|
2012-01-13 01:17:59 +00:00
|
|
|
};
|
|
|
|
|
2017-05-03 21:55:13 +00:00
|
|
|
#ifdef CONFIG_MEMCG
|
|
|
|
|
|
|
|
#define MEM_CGROUP_ID_SHIFT 16
|
|
|
|
|
|
|
|
struct mem_cgroup_id {
|
|
|
|
int id;
|
2018-10-26 22:09:28 +00:00
|
|
|
refcount_t ref;
|
2017-05-03 21:55:13 +00:00
|
|
|
};
|
|
|
|
|
2022-09-07 04:35:35 +00:00
|
|
|
struct memcg_vmstats_percpu;
|
2024-08-15 05:04:47 +00:00
|
|
|
struct memcg1_events_percpu;
|
2022-09-07 04:35:35 +00:00
|
|
|
struct memcg_vmstats;
|
2024-05-01 17:26:11 +00:00
|
|
|
struct lruvec_stats_percpu;
|
|
|
|
struct lruvec_stats;
|
2015-09-08 22:01:02 +00:00
|
|
|
|
|
|
|
struct mem_cgroup_reclaim_iter {
|
|
|
|
struct mem_cgroup *position;
|
|
|
|
/* scan generation, increased every round-trip */
|
2024-09-05 00:30:53 +00:00
|
|
|
atomic_t generation;
|
2015-09-08 22:01:02 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
/*
|
2019-12-01 01:50:12 +00:00
|
|
|
* per-node information in memory controller.
|
2015-09-08 22:01:02 +00:00
|
|
|
*/
|
2016-07-28 22:46:05 +00:00
|
|
|
struct mem_cgroup_per_node {
|
2024-05-28 16:40:50 +00:00
|
|
|
/* Keep the read-only fields at the start */
|
|
|
|
struct mem_cgroup *memcg; /* Back pointer, we cannot */
|
|
|
|
/* use container_of */
|
2018-02-01 00:16:45 +00:00
|
|
|
|
2021-09-02 21:55:00 +00:00
|
|
|
struct lruvec_stats_percpu __percpu *lruvec_stats_percpu;
|
2024-05-01 17:26:11 +00:00
|
|
|
struct lruvec_stats *lruvec_stats;
|
2021-05-05 01:36:23 +00:00
|
|
|
struct shrinker_info __rcu *shrinker_info;
|
2019-09-23 22:38:12 +00:00
|
|
|
|
2024-06-28 21:03:15 +00:00
|
|
|
#ifdef CONFIG_MEMCG_V1
|
2024-05-28 16:40:50 +00:00
|
|
|
/*
|
|
|
|
* Memcg-v1 only stuff in middle as buffer between read mostly fields
|
2024-07-01 18:59:32 +00:00
|
|
|
* and update often fields to avoid false sharing. If v1 stuff is
|
|
|
|
* not present, an explicit padding is needed.
|
2024-05-28 16:40:50 +00:00
|
|
|
*/
|
|
|
|
|
2015-09-08 22:01:02 +00:00
|
|
|
struct rb_node tree_node; /* RB tree node */
|
|
|
|
unsigned long usage_in_excess;/* Set to the value by which */
|
|
|
|
/* the soft limit is exceeded*/
|
|
|
|
bool on_tree;
|
2024-07-01 18:59:32 +00:00
|
|
|
#else
|
|
|
|
CACHELINE_PADDING(_pad1_);
|
2024-06-28 21:03:15 +00:00
|
|
|
#endif
|
2024-05-28 16:40:50 +00:00
|
|
|
|
|
|
|
/* Fields which get updated often at the end. */
|
|
|
|
struct lruvec lruvec;
|
2024-07-23 17:12:44 +00:00
|
|
|
CACHELINE_PADDING(_pad2_);
|
2024-05-28 16:40:50 +00:00
|
|
|
unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
|
|
|
|
struct mem_cgroup_reclaim_iter iter;
|
2015-09-08 22:01:02 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
struct mem_cgroup_threshold {
|
|
|
|
struct eventfd_ctx *eventfd;
|
|
|
|
unsigned long threshold;
|
|
|
|
};
|
|
|
|
|
|
|
|
/* For threshold */
|
|
|
|
struct mem_cgroup_threshold_ary {
|
|
|
|
/* An array index points to threshold just below or equal to usage. */
|
|
|
|
int current_threshold;
|
|
|
|
/* Size of entries[] */
|
|
|
|
unsigned int size;
|
|
|
|
/* Array of thresholds */
|
2023-09-22 17:53:28 +00:00
|
|
|
struct mem_cgroup_threshold entries[] __counted_by(size);
|
2015-09-08 22:01:02 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
struct mem_cgroup_thresholds {
|
|
|
|
/* Primary thresholds array */
|
|
|
|
struct mem_cgroup_threshold_ary *primary;
|
|
|
|
/*
|
|
|
|
* Spare threshold array.
|
|
|
|
* This is needed to make mem_cgroup_unregister_event() "never fail".
|
|
|
|
* It must be able to store at least primary->size - 1 entries.
|
|
|
|
*/
|
|
|
|
struct mem_cgroup_threshold_ary *spare;
|
|
|
|
};
|
|
|
|
|
writeback, memcg: Implement foreign dirty flushing
There's an inherent mismatch between memcg and writeback. The former
trackes ownership per-page while the latter per-inode. This was a
deliberate design decision because honoring per-page ownership in the
writeback path is complicated, may lead to higher CPU and IO overheads
and deemed unnecessary given that write-sharing an inode across
different cgroups isn't a common use-case.
Combined with inode majority-writer ownership switching, this works
well enough in most cases but there are some pathological cases. For
example, let's say there are two cgroups A and B which keep writing to
different but confined parts of the same inode. B owns the inode and
A's memory is limited far below B's. A's dirty ratio can rise enough
to trigger balance_dirty_pages() sleeps but B's can be low enough to
avoid triggering background writeback. A will be slowed down without
a way to make writeback of the dirty pages happen.
This patch implements foreign dirty recording and foreign mechanism so
that when a memcg encounters a condition as above it can trigger
flushes on bdi_writebacks which can clean its pages. Please see the
comment on top of mem_cgroup_track_foreign_dirty_slowpath() for
details.
A reproducer follows.
write-range.c::
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
static const char *usage = "write-range FILE START SIZE\n";
int main(int argc, char **argv)
{
int fd;
unsigned long start, size, end, pos;
char *endp;
char buf[4096];
if (argc < 4) {
fprintf(stderr, usage);
return 1;
}
fd = open(argv[1], O_WRONLY);
if (fd < 0) {
perror("open");
return 1;
}
start = strtoul(argv[2], &endp, 0);
if (*endp != '\0') {
fprintf(stderr, usage);
return 1;
}
size = strtoul(argv[3], &endp, 0);
if (*endp != '\0') {
fprintf(stderr, usage);
return 1;
}
end = start + size;
while (1) {
for (pos = start; pos < end; ) {
long bread, bwritten = 0;
if (lseek(fd, pos, SEEK_SET) < 0) {
perror("lseek");
return 1;
}
bread = read(0, buf, sizeof(buf) < end - pos ?
sizeof(buf) : end - pos);
if (bread < 0) {
perror("read");
return 1;
}
if (bread == 0)
return 0;
while (bwritten < bread) {
long this;
this = write(fd, buf + bwritten,
bread - bwritten);
if (this < 0) {
perror("write");
return 1;
}
bwritten += this;
pos += bwritten;
}
}
}
}
repro.sh::
#!/bin/bash
set -e
set -x
sysctl -w vm.dirty_expire_centisecs=300000
sysctl -w vm.dirty_writeback_centisecs=300000
sysctl -w vm.dirtytime_expire_seconds=300000
echo 3 > /proc/sys/vm/drop_caches
TEST=/sys/fs/cgroup/test
A=$TEST/A
B=$TEST/B
mkdir -p $A $B
echo "+memory +io" > $TEST/cgroup.subtree_control
echo $((1<<30)) > $A/memory.high
echo $((32<<30)) > $B/memory.high
rm -f testfile
touch testfile
fallocate -l 4G testfile
echo "Starting B"
(echo $BASHPID > $B/cgroup.procs
pv -q --rate-limit 70M < /dev/urandom | ./write-range testfile $((2<<30)) $((2<<30))) &
echo "Waiting 10s to ensure B claims the testfile inode"
sleep 5
sync
sleep 5
sync
echo "Starting A"
(echo $BASHPID > $A/cgroup.procs
pv < /dev/urandom | ./write-range testfile 0 $((2<<30)))
v2: Added comments explaining why the specific intervals are being used.
v3: Use 0 @nr when calling cgroup_writeback_by_id() to use best-effort
flushing while avoding possible livelocks.
v4: Use get_jiffies_64() and time_before/after64() instead of raw
jiffies_64 and arthimetic comparisons as suggested by Jan.
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2019-08-26 16:06:56 +00:00
|
|
|
/*
|
|
|
|
* Remember four most recent foreign writebacks with dirty pages in this
|
|
|
|
* cgroup. Inode sharing is expected to be uncommon and, even if we miss
|
|
|
|
* one in a given round, we're likely to catch it later if it keeps
|
|
|
|
* foreign-dirtying, so a fairly low count should be enough.
|
|
|
|
*
|
|
|
|
* See mem_cgroup_track_foreign_dirty_slowpath() for details.
|
|
|
|
*/
|
|
|
|
#define MEMCG_CGWB_FRN_CNT 4
|
|
|
|
|
|
|
|
struct memcg_cgwb_frn {
|
|
|
|
u64 bdi_id; /* bdi->id of the foreign inode */
|
|
|
|
int memcg_id; /* memcg->css.id of foreign inode */
|
|
|
|
u64 at; /* jiffies_64 at the time of dirtying */
|
|
|
|
struct wb_completion done; /* tracks in-flight foreign writebacks */
|
|
|
|
};
|
|
|
|
|
2020-08-07 06:20:49 +00:00
|
|
|
/*
|
|
|
|
* Bucket for arbitrarily byte-sized objects charged to a memory
|
|
|
|
* cgroup. The bucket can be reparented in one piece when the cgroup
|
|
|
|
* is destroyed, without having to round up the individual references
|
|
|
|
* of all live memory objects in the wild.
|
|
|
|
*/
|
|
|
|
struct obj_cgroup {
|
|
|
|
struct percpu_ref refcnt;
|
|
|
|
struct mem_cgroup *memcg;
|
|
|
|
atomic_t nr_charged_bytes;
|
|
|
|
union {
|
2022-02-12 00:32:32 +00:00
|
|
|
struct list_head list; /* protected by objcg_lock */
|
2020-08-07 06:20:49 +00:00
|
|
|
struct rcu_head rcu;
|
|
|
|
};
|
|
|
|
};
|
|
|
|
|
2015-09-08 22:01:02 +00:00
|
|
|
/*
|
|
|
|
* The memory controller data structure. The memory controller controls both
|
|
|
|
* page cache and RSS per cgroup. We would eventually like to provide
|
|
|
|
* statistics based on the statistics developed by Rik Van Riel for clock-pro,
|
|
|
|
* to help the administrator determine what knobs to tune.
|
|
|
|
*/
|
|
|
|
struct mem_cgroup {
|
|
|
|
struct cgroup_subsys_state css;
|
|
|
|
|
2016-07-20 22:44:57 +00:00
|
|
|
/* Private memcg ID. Used to ID objects that outlive the cgroup */
|
|
|
|
struct mem_cgroup_id id;
|
|
|
|
|
2015-09-08 22:01:02 +00:00
|
|
|
/* Accounted resources */
|
2020-10-13 23:52:56 +00:00
|
|
|
struct page_counter memory; /* Both v1 & v2 */
|
|
|
|
|
|
|
|
union {
|
|
|
|
struct page_counter swap; /* v2 only */
|
|
|
|
struct page_counter memsw; /* v1 only */
|
|
|
|
};
|
2016-01-20 23:02:50 +00:00
|
|
|
|
mm, memcg: cg2 memory{.swap,}.peak write handlers
Patch series "mm, memcg: cg2 memory{.swap,}.peak write handlers", v7.
This patch (of 2):
Other mechanisms for querying the peak memory usage of either a process or
v1 memory cgroup allow for resetting the high watermark. Restore parity
with those mechanisms, but with a less racy API.
For example:
- Any write to memory.max_usage_in_bytes in a cgroup v1 mount resets
the high watermark.
- writing "5" to the clear_refs pseudo-file in a processes's proc
directory resets the peak RSS.
This change is an evolution of a previous patch, which mostly copied the
cgroup v1 behavior, however, there were concerns about races/ownership
issues with a global reset, so instead this change makes the reset
filedescriptor-local.
Writing any non-empty string to the memory.peak and memory.swap.peak
pseudo-files reset the high watermark to the current usage for subsequent
reads through that same FD.
Notably, following Johannes's suggestion, this implementation moves the
O(FDs that have written) behavior onto the FD write(2) path. Instead, on
the page-allocation path, we simply add one additional watermark to
conditionally bump per-hierarchy level in the page-counter.
Additionally, this takes Longman's suggestion of nesting the
page-charging-path checks for the two watermarks to reduce the number of
common-case comparisons.
This behavior is particularly useful for work scheduling systems that need
to track memory usage of worker processes/cgroups per-work-item. Since
memory can't be squeezed like CPU can (the OOM-killer has opinions), these
systems need to track the peak memory usage to compute system/container
fullness when binpacking workitems.
Most notably, Vimeo's use-case involves a system that's doing global
binpacking across many Kubernetes pods/containers, and while we can use
PSI for some local decisions about overload, we strive to avoid packing
workloads too tightly in the first place. To facilitate this, we track
the peak memory usage. However, since we run with long-lived workers (to
amortize startup costs) we need a way to track the high watermark while a
work-item is executing. Polling runs the risk of missing short spikes
that last for timescales below the polling interval, and peak memory
tracking at the cgroup level is otherwise perfect for this use-case.
As this data is used to ensure that binpacked work ends up with sufficient
headroom, this use-case mostly avoids the inaccuracies surrounding
reclaimable memory.
Link: https://lkml.kernel.org/r/20240730231304.761942-1-davidf@vimeo.com
Link: https://lkml.kernel.org/r/20240729143743.34236-1-davidf@vimeo.com
Link: https://lkml.kernel.org/r/20240729143743.34236-2-davidf@vimeo.com
Signed-off-by: David Finkel <davidf@vimeo.com>
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Suggested-by: Waiman Long <longman@redhat.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Michal Koutný <mkoutny@suse.com>
Acked-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Zefan Li <lizefan.x@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-07-29 14:37:42 +00:00
|
|
|
/* registered local peak watchers */
|
|
|
|
struct list_head memory_peaks;
|
|
|
|
struct list_head swap_peaks;
|
|
|
|
spinlock_t peaks_lock;
|
|
|
|
|
2016-01-14 23:21:29 +00:00
|
|
|
/* Range enforcement for interrupt charges */
|
|
|
|
struct work_struct high_work;
|
|
|
|
|
2024-07-01 15:31:15 +00:00
|
|
|
#ifdef CONFIG_ZSWAP
|
2022-05-19 21:08:53 +00:00
|
|
|
unsigned long zswap_max;
|
zswap: memcontrol: implement zswap writeback disabling
During our experiment with zswap, we sometimes observe swap IOs due to
occasional zswap store failures and writebacks-to-swap. These swapping
IOs prevent many users who cannot tolerate swapping from adopting zswap to
save memory and improve performance where possible.
This patch adds the option to disable this behavior entirely: do not
writeback to backing swapping device when a zswap store attempt fail, and
do not write pages in the zswap pool back to the backing swap device (both
when the pool is full, and when the new zswap shrinker is called).
This new behavior can be opted-in/out on a per-cgroup basis via a new
cgroup file. By default, writebacks to swap device is enabled, which is
the previous behavior. Initially, writeback is enabled for the root
cgroup, and a newly created cgroup will inherit the current setting of its
parent.
Note that this is subtly different from setting memory.swap.max to 0, as
it still allows for pages to be stored in the zswap pool (which itself
consumes swap space in its current form).
This patch should be applied on top of the zswap shrinker series:
https://lore.kernel.org/linux-mm/20231130194023.4102148-1-nphamcs@gmail.com/
as it also disables the zswap shrinker, a major source of zswap
writebacks.
For the most part, this feature is motivated by internal parties who
have already established their opinions regarding swapping - the
workloads that are highly sensitive to IO, and especially those who are
using servers with really slow disk performance (for instance, massive
but slow HDDs). For these folks, it's impossible to convince them to
even entertain zswap if swapping also comes as a packaged deal.
Writeback disabling is quite a useful feature in these situations - on
a mixed workloads deployment, they can disable writeback for the more
IO-sensitive workloads, and enable writeback for other background
workloads.
For instance, on a server with HDD, I allocate memories and populate
them with random values (so that zswap store will always fail), and
specify memory.high low enough to trigger reclaim. The time it takes
to allocate the memories and just read through it a couple of times
(doing silly things like computing the values' average etc.):
zswap.writeback disabled:
real 0m30.537s
user 0m23.687s
sys 0m6.637s
0 pages swapped in
0 pages swapped out
zswap.writeback enabled:
real 0m45.061s
user 0m24.310s
sys 0m8.892s
712686 pages swapped in
461093 pages swapped out
(the last two lines are from vmstat -s).
[nphamcs@gmail.com: add a comment about recurring zswap store failures leading to reclaim inefficiency]
Link: https://lkml.kernel.org/r/20231221005725.3446672-1-nphamcs@gmail.com
Link: https://lkml.kernel.org/r/20231207192406.3809579-1-nphamcs@gmail.com
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Yosry Ahmed <yosryahmed@google.com>
Acked-by: Chris Li <chrisl@kernel.org>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: David Heidelberg <david@ixit.cz>
Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Seth Jennings <sjenning@redhat.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vitaly Wool <vitaly.wool@konsulko.com>
Cc: Zefan Li <lizefan.x@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-12-07 19:24:06 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Prevent pages from this memcg from being written back from zswap to
|
|
|
|
* swap, and from being swapped out on zswap store failures.
|
|
|
|
*/
|
|
|
|
bool zswap_writeback;
|
2022-05-19 21:08:53 +00:00
|
|
|
#endif
|
|
|
|
|
2015-09-08 22:01:02 +00:00
|
|
|
/* vmpressure notifications */
|
|
|
|
struct vmpressure vmpressure;
|
|
|
|
|
2018-08-22 04:53:54 +00:00
|
|
|
/*
|
|
|
|
* Should the OOM killer kill all belonging tasks, had it kill one?
|
|
|
|
*/
|
|
|
|
bool oom_group;
|
|
|
|
|
2024-06-28 21:03:14 +00:00
|
|
|
int swappiness;
|
2015-09-08 22:01:02 +00:00
|
|
|
|
2019-07-12 03:55:55 +00:00
|
|
|
/* memory.events and memory.events.local */
|
2015-09-18 22:01:59 +00:00
|
|
|
struct cgroup_file events_file;
|
2019-07-12 03:55:55 +00:00
|
|
|
struct cgroup_file events_local_file;
|
2015-09-18 22:01:59 +00:00
|
|
|
|
2018-06-08 00:05:35 +00:00
|
|
|
/* handle for "memory.swap.events" */
|
|
|
|
struct cgroup_file swap_events_file;
|
|
|
|
|
mm: memcontrol: switch to rstat
Replace the memory controller's custom hierarchical stats code with the
generic rstat infrastructure provided by the cgroup core.
The current implementation does batched upward propagation from the
write side (i.e. as stats change). The per-cpu batches introduce an
error, which is multiplied by the number of subgroups in a tree. In
systems with many CPUs and sizable cgroup trees, the error can be large
enough to confuse users (e.g. 32 batch pages * 32 CPUs * 32 subgroups
results in an error of up to 128M per stat item). This can entirely
swallow allocation bursts inside a workload that the user is expecting
to see reflected in the statistics.
In the past, we've done read-side aggregation, where a memory.stat read
would have to walk the entire subtree and add up per-cpu counts. This
became problematic with lazily-freed cgroups: we could have large
subtrees where most cgroups were entirely idle. Hence the switch to
change-driven upward propagation. Unfortunately, it needed to trade
accuracy for speed due to the write side being so hot.
Rstat combines the best of both worlds: from the write side, it cheaply
maintains a queue of cgroups that have pending changes, so that the read
side can do selective tree aggregation. This way the reported stats
will always be precise and recent as can be, while the aggregation can
skip over potentially large numbers of idle cgroups.
The way rstat works is that it implements a tree for tracking cgroups
with pending local changes, as well as a flush function that walks the
tree upwards. The controller then drives this by 1) telling rstat when
a local cgroup stat changes (e.g. mod_memcg_state) and 2) when a flush
is required to get uptodate hierarchy stats for a given subtree (e.g.
when memory.stat is read). The controller also provides a flush
callback that is called during the rstat flush walk for each cgroup and
aggregates its local per-cpu counters and propagates them upwards.
This adds a second vmstats to struct mem_cgroup (MEMCG_NR_STAT +
NR_VM_EVENT_ITEMS) to track pending subtree deltas during upward
aggregation. It removes 3 words from the per-cpu data. It eliminates
memcg_exact_page_state(), since memcg_page_state() is now exact.
[akpm@linux-foundation.org: merge fix]
[hannes@cmpxchg.org: fix a sleep in atomic section problem]
Link: https://lkml.kernel.org/r/20210315234100.64307-1-hannes@cmpxchg.org
Link: https://lkml.kernel.org/r/20210209163304.77088-7-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Roman Gushchin <guro@fb.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Michal Koutný <mkoutny@suse.com>
Acked-by: Balbir Singh <bsingharora@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-04-30 05:56:26 +00:00
|
|
|
/* memory.stat */
|
2022-09-07 04:35:35 +00:00
|
|
|
struct memcg_vmstats *vmstats;
|
mm: memcontrol: fix recursive statistics correctness & scalabilty
Right now, when somebody needs to know the recursive memory statistics
and events of a cgroup subtree, they need to walk the entire subtree and
sum up the counters manually.
There are two issues with this:
1. When a cgroup gets deleted, its stats are lost. The state counters
should all be 0 at that point, of course, but the events are not.
When this happens, the event counters, which are supposed to be
monotonic, can go backwards in the parent cgroups.
2. During regular operation, we always have a certain number of lazily
freed cgroups sitting around that have been deleted, have no tasks,
but have a few cache pages remaining. These groups' statistics do not
change until we eventually hit memory pressure, but somebody
watching, say, memory.stat on an ancestor has to iterate those every
time.
This patch addresses both issues by introducing recursive counters at
each level that are propagated from the write side when stats change.
Upward propagation happens when the per-cpu caches spill over into the
local atomic counter. This is the same thing we do during charge and
uncharge, except that the latter uses atomic RMWs, which are more
expensive; stat changes happen at around the same rate. In a sparse
file test (page faults and reclaim at maximum CPU speed) with 5 cgroup
nesting levels, perf shows __mod_memcg_page state at ~1%.
Link: http://lkml.kernel.org/r/20190412151507.2769-4-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-05-14 22:47:12 +00:00
|
|
|
|
mm: memcontrol: don't batch updates of local VM stats and events
The kernel test robot noticed a 26% will-it-scale pagefault regression
from commit 42a300353577 ("mm: memcontrol: fix recursive statistics
correctness & scalabilty"). This appears to be caused by bouncing the
additional cachelines from the new hierarchical statistics counters.
We can fix this by getting rid of the batched local counters instead.
Originally, there were *only* group-local counters, and they were fully
maintained per cpu. A reader of a stats file high up in the cgroup tree
would have to walk the entire subtree and collect each level's per-cpu
counters to get the recursive view. This was prohibitively expensive,
and so we switched to per-cpu batched updates of the local counters
during a983b5ebee57 ("mm: memcontrol: fix excessive complexity in
memory.stat reporting"), reducing the complexity from nr_subgroups *
nr_cpus to nr_subgroups.
With growing machines and cgroup trees, the tree walk itself became too
expensive for monitoring top-level groups, and this is when the culprit
patch added hierarchy counters on each cgroup level. When the per-cpu
batch size would be reached, both the local and the hierarchy counters
would get batch-updated from the per-cpu delta simultaneously.
This makes local and hierarchical counter reads blazingly fast, but it
unfortunately makes the write-side too cache line intense.
Since local counter reads were never a problem - we only centralized
them to accelerate the hierarchy walk - and use of the local counters
are becoming rarer due to replacement with hierarchical views (ongoing
rework in the page reclaim and workingset code), we can make those local
counters unbatched per-cpu counters again.
The scheme will then be as such:
when a memcg statistic changes, the writer will:
- update the local counter (per-cpu)
- update the batch counter (per-cpu). If the batch is full:
- spill the batch into the group's atomic_t
- spill the batch into all ancestors' atomic_ts
- empty out the batch counter (per-cpu)
when a local memcg counter is read, the reader will:
- collect the local counter from all cpus
when a hiearchy memcg counter is read, the reader will:
- read the atomic_t
We might be able to simplify this further and make the recursive
counters unbatched per-cpu counters as well (batch upward propagation,
but leave per-cpu collection to the readers), but that will require a
more in-depth analysis and testing of all the callsites. Deal with the
immediate regression for now.
Link: http://lkml.kernel.org/r/20190521151647.GB2870@cmpxchg.org
Fixes: 42a300353577 ("mm: memcontrol: fix recursive statistics correctness & scalabilty")
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: kernel test robot <rong.a.chen@intel.com>
Tested-by: kernel test robot <rong.a.chen@intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-06-13 22:55:46 +00:00
|
|
|
/* memory.events */
|
mm: memcontrol: fix recursive statistics correctness & scalabilty
Right now, when somebody needs to know the recursive memory statistics
and events of a cgroup subtree, they need to walk the entire subtree and
sum up the counters manually.
There are two issues with this:
1. When a cgroup gets deleted, its stats are lost. The state counters
should all be 0 at that point, of course, but the events are not.
When this happens, the event counters, which are supposed to be
monotonic, can go backwards in the parent cgroups.
2. During regular operation, we always have a certain number of lazily
freed cgroups sitting around that have been deleted, have no tasks,
but have a few cache pages remaining. These groups' statistics do not
change until we eventually hit memory pressure, but somebody
watching, say, memory.stat on an ancestor has to iterate those every
time.
This patch addresses both issues by introducing recursive counters at
each level that are propagated from the write side when stats change.
Upward propagation happens when the per-cpu caches spill over into the
local atomic counter. This is the same thing we do during charge and
uncharge, except that the latter uses atomic RMWs, which are more
expensive; stat changes happen at around the same rate. In a sparse
file test (page faults and reclaim at maximum CPU speed) with 5 cgroup
nesting levels, perf shows __mod_memcg_page state at ~1%.
Link: http://lkml.kernel.org/r/20190412151507.2769-4-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-05-14 22:47:12 +00:00
|
|
|
atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS];
|
2019-07-12 03:55:55 +00:00
|
|
|
atomic_long_t memory_events_local[MEMCG_NR_MEMORY_EVENTS];
|
2015-09-08 22:01:02 +00:00
|
|
|
|
2023-08-14 07:09:11 +00:00
|
|
|
/*
|
|
|
|
* Hint of reclaim pressure for socket memroy management. Note
|
|
|
|
* that this indicator should NOT be used in legacy cgroup mode
|
|
|
|
* where socket memory is accounted/charged separately.
|
|
|
|
*/
|
2016-01-20 23:02:47 +00:00
|
|
|
unsigned long socket_pressure;
|
|
|
|
|
2015-09-08 22:01:02 +00:00
|
|
|
int kmemcg_id;
|
2023-10-19 22:53:43 +00:00
|
|
|
/*
|
|
|
|
* memcg->objcg is wiped out as a part of the objcg repaprenting
|
|
|
|
* process. memcg->orig_objcg preserves a pointer (and a reference)
|
|
|
|
* to the original objcg until the end of live of memcg.
|
|
|
|
*/
|
|
|
|
struct obj_cgroup __rcu *objcg;
|
|
|
|
struct obj_cgroup *orig_objcg;
|
2022-02-12 00:32:32 +00:00
|
|
|
/* list of inherited objcgs, protected by objcg_lock */
|
|
|
|
struct list_head objcg_list;
|
2015-09-08 22:01:02 +00:00
|
|
|
|
mm: memcg: relayout structure mem_cgroup to avoid cache interference
0day reported one -22.7% regression for will-it-scale page_fault2
case [1] on a 4 sockets 144 CPU platform, and bisected to it to be
caused by Waiman's optimization (commit bd0b230fe1) of saving one
'struct page_counter' space for 'struct mem_cgroup'.
Initially we thought it was due to the cache alignment change introduced
by the patch, but further debug shows that it is due to some hot data
members ('vmstats_local', 'vmstats_percpu', 'vmstats') sit in 2 adjacent
cacheline (2N and 2N+1 cacheline), and when adjacent cache line prefetch
is enabled, it triggers an "extended level" of cache false sharing for
2 adjacent cache lines.
So exchange the 2 member blocks, while keeping mostly the original
cache alignment, which can restore and even enhance the performance,
and save 64 bytes of space for 'struct mem_cgroup' (from 2880 to 2816,
with 0day's default RHEL-8.3 kernel config)
[1]. https://lore.kernel.org/lkml/20201102091543.GM31092@shao2-debian/
Fixes: bd0b230fe145 ("mm/memcg: unify swap and memsw page counters")
Reported-by: kernel test robot <rong.a.chen@intel.com>
Signed-off-by: Feng Tang <feng.tang@intel.com>
Acked-by: Waiman Long <longman@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-11-25 05:22:21 +00:00
|
|
|
struct memcg_vmstats_percpu __percpu *vmstats_percpu;
|
|
|
|
|
2015-09-08 22:01:02 +00:00
|
|
|
#ifdef CONFIG_CGROUP_WRITEBACK
|
|
|
|
struct list_head cgwb_list;
|
|
|
|
struct wb_domain cgwb_domain;
|
writeback, memcg: Implement foreign dirty flushing
There's an inherent mismatch between memcg and writeback. The former
trackes ownership per-page while the latter per-inode. This was a
deliberate design decision because honoring per-page ownership in the
writeback path is complicated, may lead to higher CPU and IO overheads
and deemed unnecessary given that write-sharing an inode across
different cgroups isn't a common use-case.
Combined with inode majority-writer ownership switching, this works
well enough in most cases but there are some pathological cases. For
example, let's say there are two cgroups A and B which keep writing to
different but confined parts of the same inode. B owns the inode and
A's memory is limited far below B's. A's dirty ratio can rise enough
to trigger balance_dirty_pages() sleeps but B's can be low enough to
avoid triggering background writeback. A will be slowed down without
a way to make writeback of the dirty pages happen.
This patch implements foreign dirty recording and foreign mechanism so
that when a memcg encounters a condition as above it can trigger
flushes on bdi_writebacks which can clean its pages. Please see the
comment on top of mem_cgroup_track_foreign_dirty_slowpath() for
details.
A reproducer follows.
write-range.c::
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
static const char *usage = "write-range FILE START SIZE\n";
int main(int argc, char **argv)
{
int fd;
unsigned long start, size, end, pos;
char *endp;
char buf[4096];
if (argc < 4) {
fprintf(stderr, usage);
return 1;
}
fd = open(argv[1], O_WRONLY);
if (fd < 0) {
perror("open");
return 1;
}
start = strtoul(argv[2], &endp, 0);
if (*endp != '\0') {
fprintf(stderr, usage);
return 1;
}
size = strtoul(argv[3], &endp, 0);
if (*endp != '\0') {
fprintf(stderr, usage);
return 1;
}
end = start + size;
while (1) {
for (pos = start; pos < end; ) {
long bread, bwritten = 0;
if (lseek(fd, pos, SEEK_SET) < 0) {
perror("lseek");
return 1;
}
bread = read(0, buf, sizeof(buf) < end - pos ?
sizeof(buf) : end - pos);
if (bread < 0) {
perror("read");
return 1;
}
if (bread == 0)
return 0;
while (bwritten < bread) {
long this;
this = write(fd, buf + bwritten,
bread - bwritten);
if (this < 0) {
perror("write");
return 1;
}
bwritten += this;
pos += bwritten;
}
}
}
}
repro.sh::
#!/bin/bash
set -e
set -x
sysctl -w vm.dirty_expire_centisecs=300000
sysctl -w vm.dirty_writeback_centisecs=300000
sysctl -w vm.dirtytime_expire_seconds=300000
echo 3 > /proc/sys/vm/drop_caches
TEST=/sys/fs/cgroup/test
A=$TEST/A
B=$TEST/B
mkdir -p $A $B
echo "+memory +io" > $TEST/cgroup.subtree_control
echo $((1<<30)) > $A/memory.high
echo $((32<<30)) > $B/memory.high
rm -f testfile
touch testfile
fallocate -l 4G testfile
echo "Starting B"
(echo $BASHPID > $B/cgroup.procs
pv -q --rate-limit 70M < /dev/urandom | ./write-range testfile $((2<<30)) $((2<<30))) &
echo "Waiting 10s to ensure B claims the testfile inode"
sleep 5
sync
sleep 5
sync
echo "Starting A"
(echo $BASHPID > $A/cgroup.procs
pv < /dev/urandom | ./write-range testfile 0 $((2<<30)))
v2: Added comments explaining why the specific intervals are being used.
v3: Use 0 @nr when calling cgroup_writeback_by_id() to use best-effort
flushing while avoding possible livelocks.
v4: Use get_jiffies_64() and time_before/after64() instead of raw
jiffies_64 and arthimetic comparisons as suggested by Jan.
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2019-08-26 16:06:56 +00:00
|
|
|
struct memcg_cgwb_frn cgwb_frn[MEMCG_CGWB_FRN_CNT];
|
2015-09-08 22:01:02 +00:00
|
|
|
#endif
|
|
|
|
|
2019-09-23 22:38:15 +00:00
|
|
|
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
|
|
|
struct deferred_split deferred_split_queue;
|
|
|
|
#endif
|
|
|
|
|
2023-12-27 14:12:02 +00:00
|
|
|
#ifdef CONFIG_LRU_GEN_WALKS_MMU
|
mm: multi-gen LRU: support page table walks
To further exploit spatial locality, the aging prefers to walk page tables
to search for young PTEs and promote hot pages. A kill switch will be
added in the next patch to disable this behavior. When disabled, the
aging relies on the rmap only.
NB: this behavior has nothing similar with the page table scanning in the
2.4 kernel [1], which searches page tables for old PTEs, adds cold pages
to swapcache and unmaps them.
To avoid confusion, the term "iteration" specifically means the traversal
of an entire mm_struct list; the term "walk" will be applied to page
tables and the rmap, as usual.
An mm_struct list is maintained for each memcg, and an mm_struct follows
its owner task to the new memcg when this task is migrated. Given an
lruvec, the aging iterates lruvec_memcg()->mm_list and calls
walk_page_range() with each mm_struct on this list to promote hot pages
before it increments max_seq.
When multiple page table walkers iterate the same list, each of them gets
a unique mm_struct; therefore they can run concurrently. Page table
walkers ignore any misplaced pages, e.g., if an mm_struct was migrated,
pages it left in the previous memcg will not be promoted when its current
memcg is under reclaim. Similarly, page table walkers will not promote
pages from nodes other than the one under reclaim.
This patch uses the following optimizations when walking page tables:
1. It tracks the usage of mm_struct's between context switches so that
page table walkers can skip processes that have been sleeping since
the last iteration.
2. It uses generational Bloom filters to record populated branches so
that page table walkers can reduce their search space based on the
query results, e.g., to skip page tables containing mostly holes or
misplaced pages.
3. It takes advantage of the accessed bit in non-leaf PMD entries when
CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y.
4. It does not zigzag between a PGD table and the same PMD table
spanning multiple VMAs. IOW, it finishes all the VMAs within the
range of the same PMD table before it returns to a PGD table. This
improves the cache performance for workloads that have large
numbers of tiny VMAs [2], especially when CONFIG_PGTABLE_LEVELS=5.
Server benchmark results:
Single workload:
fio (buffered I/O): no change
Single workload:
memcached (anon): +[8, 10]%
Ops/sec KB/sec
patch1-7: 1147696.57 44640.29
patch1-8: 1245274.91 48435.66
Configurations:
no change
Client benchmark results:
kswapd profiles:
patch1-7
48.16% lzo1x_1_do_compress (real work)
8.20% page_vma_mapped_walk (overhead)
7.06% _raw_spin_unlock_irq
2.92% ptep_clear_flush
2.53% __zram_bvec_write
2.11% do_raw_spin_lock
2.02% memmove
1.93% lru_gen_look_around
1.56% free_unref_page_list
1.40% memset
patch1-8
49.44% lzo1x_1_do_compress (real work)
6.19% page_vma_mapped_walk (overhead)
5.97% _raw_spin_unlock_irq
3.13% get_pfn_folio
2.85% ptep_clear_flush
2.42% __zram_bvec_write
2.08% do_raw_spin_lock
1.92% memmove
1.44% alloc_zspage
1.36% memset
Configurations:
no change
Thanks to the following developers for their efforts [3].
kernel test robot <lkp@intel.com>
[1] https://lwn.net/Articles/23732/
[2] https://llvm.org/docs/ScudoHardenedAllocator.html
[3] https://lore.kernel.org/r/202204160827.ekEARWQo-lkp@intel.com/
Link: https://lkml.kernel.org/r/20220918080010.2920238-9-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-09-18 08:00:05 +00:00
|
|
|
/* per-memcg mm_struct list */
|
|
|
|
struct lru_gen_mm_list mm_list;
|
|
|
|
#endif
|
|
|
|
|
2024-06-28 21:03:14 +00:00
|
|
|
#ifdef CONFIG_MEMCG_V1
|
|
|
|
/* Legacy consumer-oriented counters */
|
|
|
|
struct page_counter kmem; /* v1 only */
|
|
|
|
struct page_counter tcpmem; /* v1 only */
|
|
|
|
|
2024-08-15 05:04:52 +00:00
|
|
|
struct memcg1_events_percpu __percpu *events_percpu;
|
|
|
|
|
2024-06-28 21:03:14 +00:00
|
|
|
unsigned long soft_limit;
|
|
|
|
|
|
|
|
/* protected by memcg_oom_lock */
|
|
|
|
bool oom_lock;
|
|
|
|
int under_oom;
|
|
|
|
|
|
|
|
/* OOM-Killer disable */
|
|
|
|
int oom_kill_disable;
|
|
|
|
|
|
|
|
/* protect arrays of thresholds */
|
|
|
|
struct mutex thresholds_lock;
|
|
|
|
|
|
|
|
/* thresholds for memory usage. RCU-protected */
|
|
|
|
struct mem_cgroup_thresholds thresholds;
|
|
|
|
|
|
|
|
/* thresholds for mem+swap usage. RCU-protected */
|
|
|
|
struct mem_cgroup_thresholds memsw_thresholds;
|
|
|
|
|
|
|
|
/* For oom notifier event fd */
|
|
|
|
struct list_head oom_notify;
|
|
|
|
|
|
|
|
/* Legacy tcp memory accounting */
|
|
|
|
bool tcpmem_active;
|
|
|
|
int tcpmem_pressure;
|
|
|
|
|
|
|
|
/* List of events which userspace want to receive */
|
|
|
|
struct list_head event_list;
|
|
|
|
spinlock_t event_list_lock;
|
|
|
|
#endif /* CONFIG_MEMCG_V1 */
|
|
|
|
|
2021-06-29 02:38:12 +00:00
|
|
|
struct mem_cgroup_per_node *nodeinfo[];
|
2015-09-08 22:01:02 +00:00
|
|
|
};
|
2016-01-14 23:20:56 +00:00
|
|
|
|
2018-02-01 00:16:45 +00:00
|
|
|
/*
|
2022-08-25 00:05:06 +00:00
|
|
|
* size of first charge trial.
|
|
|
|
* TODO: maybe necessary to use big numbers in big irons or dynamic based of the
|
|
|
|
* workload.
|
2018-02-01 00:16:45 +00:00
|
|
|
*/
|
2022-08-25 00:05:06 +00:00
|
|
|
#define MEMCG_CHARGE_BATCH 64U
|
2018-02-01 00:16:45 +00:00
|
|
|
|
2016-01-14 23:20:56 +00:00
|
|
|
extern struct mem_cgroup *root_mem_cgroup;
|
2015-05-22 21:13:20 +00:00
|
|
|
|
2020-12-01 21:58:29 +00:00
|
|
|
enum page_memcg_data_flags {
|
2024-03-21 16:36:28 +00:00
|
|
|
/* page->memcg_data is a pointer to an slabobj_ext vector */
|
|
|
|
MEMCG_DATA_OBJEXTS = (1UL << 0),
|
2020-12-01 21:58:30 +00:00
|
|
|
/* page has been accounted as a non-slab kernel page */
|
|
|
|
MEMCG_DATA_KMEM = (1UL << 1),
|
2020-12-01 21:58:29 +00:00
|
|
|
/* the next bit after the last actual flag */
|
2020-12-01 21:58:30 +00:00
|
|
|
__NR_MEMCG_DATA_FLAGS = (1UL << 2),
|
2020-12-01 21:58:29 +00:00
|
|
|
};
|
|
|
|
|
2024-03-21 16:36:31 +00:00
|
|
|
#define __FIRST_OBJEXT_FLAG __NR_MEMCG_DATA_FLAGS
|
|
|
|
|
|
|
|
#else /* CONFIG_MEMCG */
|
|
|
|
|
|
|
|
#define __FIRST_OBJEXT_FLAG (1UL << 0)
|
|
|
|
|
|
|
|
#endif /* CONFIG_MEMCG */
|
|
|
|
|
|
|
|
enum objext_flags {
|
2024-03-21 16:36:57 +00:00
|
|
|
/* slabobj_ext vector failed to allocate */
|
|
|
|
OBJEXTS_ALLOC_FAIL = __FIRST_OBJEXT_FLAG,
|
2024-03-21 16:36:31 +00:00
|
|
|
/* the next bit after the last actual flag */
|
2024-03-21 16:36:57 +00:00
|
|
|
__NR_OBJEXTS_FLAGS = (__FIRST_OBJEXT_FLAG << 1),
|
2024-03-21 16:36:31 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
#define OBJEXTS_FLAGS_MASK (__NR_OBJEXTS_FLAGS - 1)
|
|
|
|
|
|
|
|
#ifdef CONFIG_MEMCG
|
2020-12-01 21:58:29 +00:00
|
|
|
|
2021-06-28 18:59:26 +00:00
|
|
|
static inline bool folio_memcg_kmem(struct folio *folio);
|
mm: memcontrol: use obj_cgroup APIs to charge kmem pages
Since Roman's series "The new cgroup slab memory controller" applied.
All slab objects are charged via the new APIs of obj_cgroup. The new
APIs introduce a struct obj_cgroup to charge slab objects. It prevents
long-living objects from pinning the original memory cgroup in the
memory. But there are still some corner objects (e.g. allocations
larger than order-1 page on SLUB) which are not charged via the new
APIs. Those objects (include the pages which are allocated from buddy
allocator directly) are charged as kmem pages which still hold a
reference to the memory cgroup.
We want to reuse the obj_cgroup APIs to charge the kmem pages. If we do
that, we should store an object cgroup pointer to page->memcg_data for
the kmem pages.
Finally, page->memcg_data will have 3 different meanings.
1) For the slab pages, page->memcg_data points to an object cgroups
vector.
2) For the kmem pages (exclude the slab pages), page->memcg_data
points to an object cgroup.
3) For the user pages (e.g. the LRU pages), page->memcg_data points
to a memory cgroup.
We do not change the behavior of page_memcg() and page_memcg_rcu(). They
are also suitable for LRU pages and kmem pages. Why?
Because memory allocations pinning memcgs for a long time - it exists at a
larger scale and is causing recurring problems in the real world: page
cache doesn't get reclaimed for a long time, or is used by the second,
third, fourth, ... instance of the same job that was restarted into a new
cgroup every time. Unreclaimable dying cgroups pile up, waste memory, and
make page reclaim very inefficient.
We can convert LRU pages and most other raw memcg pins to the objcg
direction to fix this problem, and then the page->memcg will always point
to an object cgroup pointer. At that time, LRU pages and kmem pages will
be treated the same. The implementation of page_memcg() will remove the
kmem page check.
This patch aims to charge the kmem pages by using the new APIs of
obj_cgroup. Finally, the page->memcg_data of the kmem page points to an
object cgroup. We can use the __page_objcg() to get the object cgroup
associated with a kmem page. Or we can use page_memcg() to get the memory
cgroup associated with a kmem page, but caller must ensure that the
returned memcg won't be released (e.g. acquire the rcu_read_lock or
css_set_lock).
Link: https://lkml.kernel.org/r/20210401030141.37061-1-songmuchun@bytedance.com
Link: https://lkml.kernel.org/r/20210319163821.20704-6-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
[songmuchun@bytedance.com: fix forget to obtain the ref to objcg in split_page_memcg]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-04-30 05:56:52 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* After the initialization objcg->memcg is always pointing at
|
|
|
|
* a valid memcg, but can be atomically swapped to the parent memcg.
|
|
|
|
*
|
2024-08-14 09:34:15 +00:00
|
|
|
* The caller must ensure that the returned memcg won't be released.
|
mm: memcontrol: use obj_cgroup APIs to charge kmem pages
Since Roman's series "The new cgroup slab memory controller" applied.
All slab objects are charged via the new APIs of obj_cgroup. The new
APIs introduce a struct obj_cgroup to charge slab objects. It prevents
long-living objects from pinning the original memory cgroup in the
memory. But there are still some corner objects (e.g. allocations
larger than order-1 page on SLUB) which are not charged via the new
APIs. Those objects (include the pages which are allocated from buddy
allocator directly) are charged as kmem pages which still hold a
reference to the memory cgroup.
We want to reuse the obj_cgroup APIs to charge the kmem pages. If we do
that, we should store an object cgroup pointer to page->memcg_data for
the kmem pages.
Finally, page->memcg_data will have 3 different meanings.
1) For the slab pages, page->memcg_data points to an object cgroups
vector.
2) For the kmem pages (exclude the slab pages), page->memcg_data
points to an object cgroup.
3) For the user pages (e.g. the LRU pages), page->memcg_data points
to a memory cgroup.
We do not change the behavior of page_memcg() and page_memcg_rcu(). They
are also suitable for LRU pages and kmem pages. Why?
Because memory allocations pinning memcgs for a long time - it exists at a
larger scale and is causing recurring problems in the real world: page
cache doesn't get reclaimed for a long time, or is used by the second,
third, fourth, ... instance of the same job that was restarted into a new
cgroup every time. Unreclaimable dying cgroups pile up, waste memory, and
make page reclaim very inefficient.
We can convert LRU pages and most other raw memcg pins to the objcg
direction to fix this problem, and then the page->memcg will always point
to an object cgroup pointer. At that time, LRU pages and kmem pages will
be treated the same. The implementation of page_memcg() will remove the
kmem page check.
This patch aims to charge the kmem pages by using the new APIs of
obj_cgroup. Finally, the page->memcg_data of the kmem page points to an
object cgroup. We can use the __page_objcg() to get the object cgroup
associated with a kmem page. Or we can use page_memcg() to get the memory
cgroup associated with a kmem page, but caller must ensure that the
returned memcg won't be released (e.g. acquire the rcu_read_lock or
css_set_lock).
Link: https://lkml.kernel.org/r/20210401030141.37061-1-songmuchun@bytedance.com
Link: https://lkml.kernel.org/r/20210319163821.20704-6-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
[songmuchun@bytedance.com: fix forget to obtain the ref to objcg in split_page_memcg]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-04-30 05:56:52 +00:00
|
|
|
*/
|
|
|
|
static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
|
|
|
|
{
|
2024-08-14 09:34:15 +00:00
|
|
|
lockdep_assert_once(rcu_read_lock_held() || lockdep_is_held(&cgroup_mutex));
|
mm: memcontrol: use obj_cgroup APIs to charge kmem pages
Since Roman's series "The new cgroup slab memory controller" applied.
All slab objects are charged via the new APIs of obj_cgroup. The new
APIs introduce a struct obj_cgroup to charge slab objects. It prevents
long-living objects from pinning the original memory cgroup in the
memory. But there are still some corner objects (e.g. allocations
larger than order-1 page on SLUB) which are not charged via the new
APIs. Those objects (include the pages which are allocated from buddy
allocator directly) are charged as kmem pages which still hold a
reference to the memory cgroup.
We want to reuse the obj_cgroup APIs to charge the kmem pages. If we do
that, we should store an object cgroup pointer to page->memcg_data for
the kmem pages.
Finally, page->memcg_data will have 3 different meanings.
1) For the slab pages, page->memcg_data points to an object cgroups
vector.
2) For the kmem pages (exclude the slab pages), page->memcg_data
points to an object cgroup.
3) For the user pages (e.g. the LRU pages), page->memcg_data points
to a memory cgroup.
We do not change the behavior of page_memcg() and page_memcg_rcu(). They
are also suitable for LRU pages and kmem pages. Why?
Because memory allocations pinning memcgs for a long time - it exists at a
larger scale and is causing recurring problems in the real world: page
cache doesn't get reclaimed for a long time, or is used by the second,
third, fourth, ... instance of the same job that was restarted into a new
cgroup every time. Unreclaimable dying cgroups pile up, waste memory, and
make page reclaim very inefficient.
We can convert LRU pages and most other raw memcg pins to the objcg
direction to fix this problem, and then the page->memcg will always point
to an object cgroup pointer. At that time, LRU pages and kmem pages will
be treated the same. The implementation of page_memcg() will remove the
kmem page check.
This patch aims to charge the kmem pages by using the new APIs of
obj_cgroup. Finally, the page->memcg_data of the kmem page points to an
object cgroup. We can use the __page_objcg() to get the object cgroup
associated with a kmem page. Or we can use page_memcg() to get the memory
cgroup associated with a kmem page, but caller must ensure that the
returned memcg won't be released (e.g. acquire the rcu_read_lock or
css_set_lock).
Link: https://lkml.kernel.org/r/20210401030141.37061-1-songmuchun@bytedance.com
Link: https://lkml.kernel.org/r/20210319163821.20704-6-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
[songmuchun@bytedance.com: fix forget to obtain the ref to objcg in split_page_memcg]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-04-30 05:56:52 +00:00
|
|
|
return READ_ONCE(objcg->memcg);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2021-06-28 18:59:26 +00:00
|
|
|
* __folio_memcg - Get the memory cgroup associated with a non-kmem folio
|
|
|
|
* @folio: Pointer to the folio.
|
mm: memcontrol: use obj_cgroup APIs to charge kmem pages
Since Roman's series "The new cgroup slab memory controller" applied.
All slab objects are charged via the new APIs of obj_cgroup. The new
APIs introduce a struct obj_cgroup to charge slab objects. It prevents
long-living objects from pinning the original memory cgroup in the
memory. But there are still some corner objects (e.g. allocations
larger than order-1 page on SLUB) which are not charged via the new
APIs. Those objects (include the pages which are allocated from buddy
allocator directly) are charged as kmem pages which still hold a
reference to the memory cgroup.
We want to reuse the obj_cgroup APIs to charge the kmem pages. If we do
that, we should store an object cgroup pointer to page->memcg_data for
the kmem pages.
Finally, page->memcg_data will have 3 different meanings.
1) For the slab pages, page->memcg_data points to an object cgroups
vector.
2) For the kmem pages (exclude the slab pages), page->memcg_data
points to an object cgroup.
3) For the user pages (e.g. the LRU pages), page->memcg_data points
to a memory cgroup.
We do not change the behavior of page_memcg() and page_memcg_rcu(). They
are also suitable for LRU pages and kmem pages. Why?
Because memory allocations pinning memcgs for a long time - it exists at a
larger scale and is causing recurring problems in the real world: page
cache doesn't get reclaimed for a long time, or is used by the second,
third, fourth, ... instance of the same job that was restarted into a new
cgroup every time. Unreclaimable dying cgroups pile up, waste memory, and
make page reclaim very inefficient.
We can convert LRU pages and most other raw memcg pins to the objcg
direction to fix this problem, and then the page->memcg will always point
to an object cgroup pointer. At that time, LRU pages and kmem pages will
be treated the same. The implementation of page_memcg() will remove the
kmem page check.
This patch aims to charge the kmem pages by using the new APIs of
obj_cgroup. Finally, the page->memcg_data of the kmem page points to an
object cgroup. We can use the __page_objcg() to get the object cgroup
associated with a kmem page. Or we can use page_memcg() to get the memory
cgroup associated with a kmem page, but caller must ensure that the
returned memcg won't be released (e.g. acquire the rcu_read_lock or
css_set_lock).
Link: https://lkml.kernel.org/r/20210401030141.37061-1-songmuchun@bytedance.com
Link: https://lkml.kernel.org/r/20210319163821.20704-6-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
[songmuchun@bytedance.com: fix forget to obtain the ref to objcg in split_page_memcg]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-04-30 05:56:52 +00:00
|
|
|
*
|
2021-06-28 18:59:26 +00:00
|
|
|
* Returns a pointer to the memory cgroup associated with the folio,
|
|
|
|
* or NULL. This function assumes that the folio is known to have a
|
mm: memcontrol: use obj_cgroup APIs to charge kmem pages
Since Roman's series "The new cgroup slab memory controller" applied.
All slab objects are charged via the new APIs of obj_cgroup. The new
APIs introduce a struct obj_cgroup to charge slab objects. It prevents
long-living objects from pinning the original memory cgroup in the
memory. But there are still some corner objects (e.g. allocations
larger than order-1 page on SLUB) which are not charged via the new
APIs. Those objects (include the pages which are allocated from buddy
allocator directly) are charged as kmem pages which still hold a
reference to the memory cgroup.
We want to reuse the obj_cgroup APIs to charge the kmem pages. If we do
that, we should store an object cgroup pointer to page->memcg_data for
the kmem pages.
Finally, page->memcg_data will have 3 different meanings.
1) For the slab pages, page->memcg_data points to an object cgroups
vector.
2) For the kmem pages (exclude the slab pages), page->memcg_data
points to an object cgroup.
3) For the user pages (e.g. the LRU pages), page->memcg_data points
to a memory cgroup.
We do not change the behavior of page_memcg() and page_memcg_rcu(). They
are also suitable for LRU pages and kmem pages. Why?
Because memory allocations pinning memcgs for a long time - it exists at a
larger scale and is causing recurring problems in the real world: page
cache doesn't get reclaimed for a long time, or is used by the second,
third, fourth, ... instance of the same job that was restarted into a new
cgroup every time. Unreclaimable dying cgroups pile up, waste memory, and
make page reclaim very inefficient.
We can convert LRU pages and most other raw memcg pins to the objcg
direction to fix this problem, and then the page->memcg will always point
to an object cgroup pointer. At that time, LRU pages and kmem pages will
be treated the same. The implementation of page_memcg() will remove the
kmem page check.
This patch aims to charge the kmem pages by using the new APIs of
obj_cgroup. Finally, the page->memcg_data of the kmem page points to an
object cgroup. We can use the __page_objcg() to get the object cgroup
associated with a kmem page. Or we can use page_memcg() to get the memory
cgroup associated with a kmem page, but caller must ensure that the
returned memcg won't be released (e.g. acquire the rcu_read_lock or
css_set_lock).
Link: https://lkml.kernel.org/r/20210401030141.37061-1-songmuchun@bytedance.com
Link: https://lkml.kernel.org/r/20210319163821.20704-6-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
[songmuchun@bytedance.com: fix forget to obtain the ref to objcg in split_page_memcg]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-04-30 05:56:52 +00:00
|
|
|
* proper memory cgroup pointer. It's not safe to call this function
|
2021-06-28 18:59:26 +00:00
|
|
|
* against some type of folios, e.g. slab folios or ex-slab folios or
|
|
|
|
* kmem folios.
|
mm: memcontrol: use obj_cgroup APIs to charge kmem pages
Since Roman's series "The new cgroup slab memory controller" applied.
All slab objects are charged via the new APIs of obj_cgroup. The new
APIs introduce a struct obj_cgroup to charge slab objects. It prevents
long-living objects from pinning the original memory cgroup in the
memory. But there are still some corner objects (e.g. allocations
larger than order-1 page on SLUB) which are not charged via the new
APIs. Those objects (include the pages which are allocated from buddy
allocator directly) are charged as kmem pages which still hold a
reference to the memory cgroup.
We want to reuse the obj_cgroup APIs to charge the kmem pages. If we do
that, we should store an object cgroup pointer to page->memcg_data for
the kmem pages.
Finally, page->memcg_data will have 3 different meanings.
1) For the slab pages, page->memcg_data points to an object cgroups
vector.
2) For the kmem pages (exclude the slab pages), page->memcg_data
points to an object cgroup.
3) For the user pages (e.g. the LRU pages), page->memcg_data points
to a memory cgroup.
We do not change the behavior of page_memcg() and page_memcg_rcu(). They
are also suitable for LRU pages and kmem pages. Why?
Because memory allocations pinning memcgs for a long time - it exists at a
larger scale and is causing recurring problems in the real world: page
cache doesn't get reclaimed for a long time, or is used by the second,
third, fourth, ... instance of the same job that was restarted into a new
cgroup every time. Unreclaimable dying cgroups pile up, waste memory, and
make page reclaim very inefficient.
We can convert LRU pages and most other raw memcg pins to the objcg
direction to fix this problem, and then the page->memcg will always point
to an object cgroup pointer. At that time, LRU pages and kmem pages will
be treated the same. The implementation of page_memcg() will remove the
kmem page check.
This patch aims to charge the kmem pages by using the new APIs of
obj_cgroup. Finally, the page->memcg_data of the kmem page points to an
object cgroup. We can use the __page_objcg() to get the object cgroup
associated with a kmem page. Or we can use page_memcg() to get the memory
cgroup associated with a kmem page, but caller must ensure that the
returned memcg won't be released (e.g. acquire the rcu_read_lock or
css_set_lock).
Link: https://lkml.kernel.org/r/20210401030141.37061-1-songmuchun@bytedance.com
Link: https://lkml.kernel.org/r/20210319163821.20704-6-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
[songmuchun@bytedance.com: fix forget to obtain the ref to objcg in split_page_memcg]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-04-30 05:56:52 +00:00
|
|
|
*/
|
2021-06-28 18:59:26 +00:00
|
|
|
static inline struct mem_cgroup *__folio_memcg(struct folio *folio)
|
mm: memcontrol: use obj_cgroup APIs to charge kmem pages
Since Roman's series "The new cgroup slab memory controller" applied.
All slab objects are charged via the new APIs of obj_cgroup. The new
APIs introduce a struct obj_cgroup to charge slab objects. It prevents
long-living objects from pinning the original memory cgroup in the
memory. But there are still some corner objects (e.g. allocations
larger than order-1 page on SLUB) which are not charged via the new
APIs. Those objects (include the pages which are allocated from buddy
allocator directly) are charged as kmem pages which still hold a
reference to the memory cgroup.
We want to reuse the obj_cgroup APIs to charge the kmem pages. If we do
that, we should store an object cgroup pointer to page->memcg_data for
the kmem pages.
Finally, page->memcg_data will have 3 different meanings.
1) For the slab pages, page->memcg_data points to an object cgroups
vector.
2) For the kmem pages (exclude the slab pages), page->memcg_data
points to an object cgroup.
3) For the user pages (e.g. the LRU pages), page->memcg_data points
to a memory cgroup.
We do not change the behavior of page_memcg() and page_memcg_rcu(). They
are also suitable for LRU pages and kmem pages. Why?
Because memory allocations pinning memcgs for a long time - it exists at a
larger scale and is causing recurring problems in the real world: page
cache doesn't get reclaimed for a long time, or is used by the second,
third, fourth, ... instance of the same job that was restarted into a new
cgroup every time. Unreclaimable dying cgroups pile up, waste memory, and
make page reclaim very inefficient.
We can convert LRU pages and most other raw memcg pins to the objcg
direction to fix this problem, and then the page->memcg will always point
to an object cgroup pointer. At that time, LRU pages and kmem pages will
be treated the same. The implementation of page_memcg() will remove the
kmem page check.
This patch aims to charge the kmem pages by using the new APIs of
obj_cgroup. Finally, the page->memcg_data of the kmem page points to an
object cgroup. We can use the __page_objcg() to get the object cgroup
associated with a kmem page. Or we can use page_memcg() to get the memory
cgroup associated with a kmem page, but caller must ensure that the
returned memcg won't be released (e.g. acquire the rcu_read_lock or
css_set_lock).
Link: https://lkml.kernel.org/r/20210401030141.37061-1-songmuchun@bytedance.com
Link: https://lkml.kernel.org/r/20210319163821.20704-6-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
[songmuchun@bytedance.com: fix forget to obtain the ref to objcg in split_page_memcg]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-04-30 05:56:52 +00:00
|
|
|
{
|
2021-06-28 18:59:26 +00:00
|
|
|
unsigned long memcg_data = folio->memcg_data;
|
mm: memcontrol: use obj_cgroup APIs to charge kmem pages
Since Roman's series "The new cgroup slab memory controller" applied.
All slab objects are charged via the new APIs of obj_cgroup. The new
APIs introduce a struct obj_cgroup to charge slab objects. It prevents
long-living objects from pinning the original memory cgroup in the
memory. But there are still some corner objects (e.g. allocations
larger than order-1 page on SLUB) which are not charged via the new
APIs. Those objects (include the pages which are allocated from buddy
allocator directly) are charged as kmem pages which still hold a
reference to the memory cgroup.
We want to reuse the obj_cgroup APIs to charge the kmem pages. If we do
that, we should store an object cgroup pointer to page->memcg_data for
the kmem pages.
Finally, page->memcg_data will have 3 different meanings.
1) For the slab pages, page->memcg_data points to an object cgroups
vector.
2) For the kmem pages (exclude the slab pages), page->memcg_data
points to an object cgroup.
3) For the user pages (e.g. the LRU pages), page->memcg_data points
to a memory cgroup.
We do not change the behavior of page_memcg() and page_memcg_rcu(). They
are also suitable for LRU pages and kmem pages. Why?
Because memory allocations pinning memcgs for a long time - it exists at a
larger scale and is causing recurring problems in the real world: page
cache doesn't get reclaimed for a long time, or is used by the second,
third, fourth, ... instance of the same job that was restarted into a new
cgroup every time. Unreclaimable dying cgroups pile up, waste memory, and
make page reclaim very inefficient.
We can convert LRU pages and most other raw memcg pins to the objcg
direction to fix this problem, and then the page->memcg will always point
to an object cgroup pointer. At that time, LRU pages and kmem pages will
be treated the same. The implementation of page_memcg() will remove the
kmem page check.
This patch aims to charge the kmem pages by using the new APIs of
obj_cgroup. Finally, the page->memcg_data of the kmem page points to an
object cgroup. We can use the __page_objcg() to get the object cgroup
associated with a kmem page. Or we can use page_memcg() to get the memory
cgroup associated with a kmem page, but caller must ensure that the
returned memcg won't be released (e.g. acquire the rcu_read_lock or
css_set_lock).
Link: https://lkml.kernel.org/r/20210401030141.37061-1-songmuchun@bytedance.com
Link: https://lkml.kernel.org/r/20210319163821.20704-6-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
[songmuchun@bytedance.com: fix forget to obtain the ref to objcg in split_page_memcg]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-04-30 05:56:52 +00:00
|
|
|
|
2021-06-28 18:59:26 +00:00
|
|
|
VM_BUG_ON_FOLIO(folio_test_slab(folio), folio);
|
2024-03-21 16:36:28 +00:00
|
|
|
VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio);
|
2021-06-28 18:59:26 +00:00
|
|
|
VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_KMEM, folio);
|
mm: memcontrol: use obj_cgroup APIs to charge kmem pages
Since Roman's series "The new cgroup slab memory controller" applied.
All slab objects are charged via the new APIs of obj_cgroup. The new
APIs introduce a struct obj_cgroup to charge slab objects. It prevents
long-living objects from pinning the original memory cgroup in the
memory. But there are still some corner objects (e.g. allocations
larger than order-1 page on SLUB) which are not charged via the new
APIs. Those objects (include the pages which are allocated from buddy
allocator directly) are charged as kmem pages which still hold a
reference to the memory cgroup.
We want to reuse the obj_cgroup APIs to charge the kmem pages. If we do
that, we should store an object cgroup pointer to page->memcg_data for
the kmem pages.
Finally, page->memcg_data will have 3 different meanings.
1) For the slab pages, page->memcg_data points to an object cgroups
vector.
2) For the kmem pages (exclude the slab pages), page->memcg_data
points to an object cgroup.
3) For the user pages (e.g. the LRU pages), page->memcg_data points
to a memory cgroup.
We do not change the behavior of page_memcg() and page_memcg_rcu(). They
are also suitable for LRU pages and kmem pages. Why?
Because memory allocations pinning memcgs for a long time - it exists at a
larger scale and is causing recurring problems in the real world: page
cache doesn't get reclaimed for a long time, or is used by the second,
third, fourth, ... instance of the same job that was restarted into a new
cgroup every time. Unreclaimable dying cgroups pile up, waste memory, and
make page reclaim very inefficient.
We can convert LRU pages and most other raw memcg pins to the objcg
direction to fix this problem, and then the page->memcg will always point
to an object cgroup pointer. At that time, LRU pages and kmem pages will
be treated the same. The implementation of page_memcg() will remove the
kmem page check.
This patch aims to charge the kmem pages by using the new APIs of
obj_cgroup. Finally, the page->memcg_data of the kmem page points to an
object cgroup. We can use the __page_objcg() to get the object cgroup
associated with a kmem page. Or we can use page_memcg() to get the memory
cgroup associated with a kmem page, but caller must ensure that the
returned memcg won't be released (e.g. acquire the rcu_read_lock or
css_set_lock).
Link: https://lkml.kernel.org/r/20210401030141.37061-1-songmuchun@bytedance.com
Link: https://lkml.kernel.org/r/20210319163821.20704-6-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
[songmuchun@bytedance.com: fix forget to obtain the ref to objcg in split_page_memcg]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-04-30 05:56:52 +00:00
|
|
|
|
2024-03-21 16:36:31 +00:00
|
|
|
return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK);
|
mm: memcontrol: use obj_cgroup APIs to charge kmem pages
Since Roman's series "The new cgroup slab memory controller" applied.
All slab objects are charged via the new APIs of obj_cgroup. The new
APIs introduce a struct obj_cgroup to charge slab objects. It prevents
long-living objects from pinning the original memory cgroup in the
memory. But there are still some corner objects (e.g. allocations
larger than order-1 page on SLUB) which are not charged via the new
APIs. Those objects (include the pages which are allocated from buddy
allocator directly) are charged as kmem pages which still hold a
reference to the memory cgroup.
We want to reuse the obj_cgroup APIs to charge the kmem pages. If we do
that, we should store an object cgroup pointer to page->memcg_data for
the kmem pages.
Finally, page->memcg_data will have 3 different meanings.
1) For the slab pages, page->memcg_data points to an object cgroups
vector.
2) For the kmem pages (exclude the slab pages), page->memcg_data
points to an object cgroup.
3) For the user pages (e.g. the LRU pages), page->memcg_data points
to a memory cgroup.
We do not change the behavior of page_memcg() and page_memcg_rcu(). They
are also suitable for LRU pages and kmem pages. Why?
Because memory allocations pinning memcgs for a long time - it exists at a
larger scale and is causing recurring problems in the real world: page
cache doesn't get reclaimed for a long time, or is used by the second,
third, fourth, ... instance of the same job that was restarted into a new
cgroup every time. Unreclaimable dying cgroups pile up, waste memory, and
make page reclaim very inefficient.
We can convert LRU pages and most other raw memcg pins to the objcg
direction to fix this problem, and then the page->memcg will always point
to an object cgroup pointer. At that time, LRU pages and kmem pages will
be treated the same. The implementation of page_memcg() will remove the
kmem page check.
This patch aims to charge the kmem pages by using the new APIs of
obj_cgroup. Finally, the page->memcg_data of the kmem page points to an
object cgroup. We can use the __page_objcg() to get the object cgroup
associated with a kmem page. Or we can use page_memcg() to get the memory
cgroup associated with a kmem page, but caller must ensure that the
returned memcg won't be released (e.g. acquire the rcu_read_lock or
css_set_lock).
Link: https://lkml.kernel.org/r/20210401030141.37061-1-songmuchun@bytedance.com
Link: https://lkml.kernel.org/r/20210319163821.20704-6-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
[songmuchun@bytedance.com: fix forget to obtain the ref to objcg in split_page_memcg]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-04-30 05:56:52 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2021-06-28 18:59:26 +00:00
|
|
|
* __folio_objcg - get the object cgroup associated with a kmem folio.
|
|
|
|
* @folio: Pointer to the folio.
|
mm: memcontrol: use obj_cgroup APIs to charge kmem pages
Since Roman's series "The new cgroup slab memory controller" applied.
All slab objects are charged via the new APIs of obj_cgroup. The new
APIs introduce a struct obj_cgroup to charge slab objects. It prevents
long-living objects from pinning the original memory cgroup in the
memory. But there are still some corner objects (e.g. allocations
larger than order-1 page on SLUB) which are not charged via the new
APIs. Those objects (include the pages which are allocated from buddy
allocator directly) are charged as kmem pages which still hold a
reference to the memory cgroup.
We want to reuse the obj_cgroup APIs to charge the kmem pages. If we do
that, we should store an object cgroup pointer to page->memcg_data for
the kmem pages.
Finally, page->memcg_data will have 3 different meanings.
1) For the slab pages, page->memcg_data points to an object cgroups
vector.
2) For the kmem pages (exclude the slab pages), page->memcg_data
points to an object cgroup.
3) For the user pages (e.g. the LRU pages), page->memcg_data points
to a memory cgroup.
We do not change the behavior of page_memcg() and page_memcg_rcu(). They
are also suitable for LRU pages and kmem pages. Why?
Because memory allocations pinning memcgs for a long time - it exists at a
larger scale and is causing recurring problems in the real world: page
cache doesn't get reclaimed for a long time, or is used by the second,
third, fourth, ... instance of the same job that was restarted into a new
cgroup every time. Unreclaimable dying cgroups pile up, waste memory, and
make page reclaim very inefficient.
We can convert LRU pages and most other raw memcg pins to the objcg
direction to fix this problem, and then the page->memcg will always point
to an object cgroup pointer. At that time, LRU pages and kmem pages will
be treated the same. The implementation of page_memcg() will remove the
kmem page check.
This patch aims to charge the kmem pages by using the new APIs of
obj_cgroup. Finally, the page->memcg_data of the kmem page points to an
object cgroup. We can use the __page_objcg() to get the object cgroup
associated with a kmem page. Or we can use page_memcg() to get the memory
cgroup associated with a kmem page, but caller must ensure that the
returned memcg won't be released (e.g. acquire the rcu_read_lock or
css_set_lock).
Link: https://lkml.kernel.org/r/20210401030141.37061-1-songmuchun@bytedance.com
Link: https://lkml.kernel.org/r/20210319163821.20704-6-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
[songmuchun@bytedance.com: fix forget to obtain the ref to objcg in split_page_memcg]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-04-30 05:56:52 +00:00
|
|
|
*
|
2021-06-28 18:59:26 +00:00
|
|
|
* Returns a pointer to the object cgroup associated with the folio,
|
|
|
|
* or NULL. This function assumes that the folio is known to have a
|
mm: memcontrol: use obj_cgroup APIs to charge kmem pages
Since Roman's series "The new cgroup slab memory controller" applied.
All slab objects are charged via the new APIs of obj_cgroup. The new
APIs introduce a struct obj_cgroup to charge slab objects. It prevents
long-living objects from pinning the original memory cgroup in the
memory. But there are still some corner objects (e.g. allocations
larger than order-1 page on SLUB) which are not charged via the new
APIs. Those objects (include the pages which are allocated from buddy
allocator directly) are charged as kmem pages which still hold a
reference to the memory cgroup.
We want to reuse the obj_cgroup APIs to charge the kmem pages. If we do
that, we should store an object cgroup pointer to page->memcg_data for
the kmem pages.
Finally, page->memcg_data will have 3 different meanings.
1) For the slab pages, page->memcg_data points to an object cgroups
vector.
2) For the kmem pages (exclude the slab pages), page->memcg_data
points to an object cgroup.
3) For the user pages (e.g. the LRU pages), page->memcg_data points
to a memory cgroup.
We do not change the behavior of page_memcg() and page_memcg_rcu(). They
are also suitable for LRU pages and kmem pages. Why?
Because memory allocations pinning memcgs for a long time - it exists at a
larger scale and is causing recurring problems in the real world: page
cache doesn't get reclaimed for a long time, or is used by the second,
third, fourth, ... instance of the same job that was restarted into a new
cgroup every time. Unreclaimable dying cgroups pile up, waste memory, and
make page reclaim very inefficient.
We can convert LRU pages and most other raw memcg pins to the objcg
direction to fix this problem, and then the page->memcg will always point
to an object cgroup pointer. At that time, LRU pages and kmem pages will
be treated the same. The implementation of page_memcg() will remove the
kmem page check.
This patch aims to charge the kmem pages by using the new APIs of
obj_cgroup. Finally, the page->memcg_data of the kmem page points to an
object cgroup. We can use the __page_objcg() to get the object cgroup
associated with a kmem page. Or we can use page_memcg() to get the memory
cgroup associated with a kmem page, but caller must ensure that the
returned memcg won't be released (e.g. acquire the rcu_read_lock or
css_set_lock).
Link: https://lkml.kernel.org/r/20210401030141.37061-1-songmuchun@bytedance.com
Link: https://lkml.kernel.org/r/20210319163821.20704-6-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
[songmuchun@bytedance.com: fix forget to obtain the ref to objcg in split_page_memcg]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-04-30 05:56:52 +00:00
|
|
|
* proper object cgroup pointer. It's not safe to call this function
|
2021-06-28 18:59:26 +00:00
|
|
|
* against some type of folios, e.g. slab folios or ex-slab folios or
|
|
|
|
* LRU folios.
|
mm: memcontrol: use obj_cgroup APIs to charge kmem pages
Since Roman's series "The new cgroup slab memory controller" applied.
All slab objects are charged via the new APIs of obj_cgroup. The new
APIs introduce a struct obj_cgroup to charge slab objects. It prevents
long-living objects from pinning the original memory cgroup in the
memory. But there are still some corner objects (e.g. allocations
larger than order-1 page on SLUB) which are not charged via the new
APIs. Those objects (include the pages which are allocated from buddy
allocator directly) are charged as kmem pages which still hold a
reference to the memory cgroup.
We want to reuse the obj_cgroup APIs to charge the kmem pages. If we do
that, we should store an object cgroup pointer to page->memcg_data for
the kmem pages.
Finally, page->memcg_data will have 3 different meanings.
1) For the slab pages, page->memcg_data points to an object cgroups
vector.
2) For the kmem pages (exclude the slab pages), page->memcg_data
points to an object cgroup.
3) For the user pages (e.g. the LRU pages), page->memcg_data points
to a memory cgroup.
We do not change the behavior of page_memcg() and page_memcg_rcu(). They
are also suitable for LRU pages and kmem pages. Why?
Because memory allocations pinning memcgs for a long time - it exists at a
larger scale and is causing recurring problems in the real world: page
cache doesn't get reclaimed for a long time, or is used by the second,
third, fourth, ... instance of the same job that was restarted into a new
cgroup every time. Unreclaimable dying cgroups pile up, waste memory, and
make page reclaim very inefficient.
We can convert LRU pages and most other raw memcg pins to the objcg
direction to fix this problem, and then the page->memcg will always point
to an object cgroup pointer. At that time, LRU pages and kmem pages will
be treated the same. The implementation of page_memcg() will remove the
kmem page check.
This patch aims to charge the kmem pages by using the new APIs of
obj_cgroup. Finally, the page->memcg_data of the kmem page points to an
object cgroup. We can use the __page_objcg() to get the object cgroup
associated with a kmem page. Or we can use page_memcg() to get the memory
cgroup associated with a kmem page, but caller must ensure that the
returned memcg won't be released (e.g. acquire the rcu_read_lock or
css_set_lock).
Link: https://lkml.kernel.org/r/20210401030141.37061-1-songmuchun@bytedance.com
Link: https://lkml.kernel.org/r/20210319163821.20704-6-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
[songmuchun@bytedance.com: fix forget to obtain the ref to objcg in split_page_memcg]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-04-30 05:56:52 +00:00
|
|
|
*/
|
2021-06-28 18:59:26 +00:00
|
|
|
static inline struct obj_cgroup *__folio_objcg(struct folio *folio)
|
mm: memcontrol: use obj_cgroup APIs to charge kmem pages
Since Roman's series "The new cgroup slab memory controller" applied.
All slab objects are charged via the new APIs of obj_cgroup. The new
APIs introduce a struct obj_cgroup to charge slab objects. It prevents
long-living objects from pinning the original memory cgroup in the
memory. But there are still some corner objects (e.g. allocations
larger than order-1 page on SLUB) which are not charged via the new
APIs. Those objects (include the pages which are allocated from buddy
allocator directly) are charged as kmem pages which still hold a
reference to the memory cgroup.
We want to reuse the obj_cgroup APIs to charge the kmem pages. If we do
that, we should store an object cgroup pointer to page->memcg_data for
the kmem pages.
Finally, page->memcg_data will have 3 different meanings.
1) For the slab pages, page->memcg_data points to an object cgroups
vector.
2) For the kmem pages (exclude the slab pages), page->memcg_data
points to an object cgroup.
3) For the user pages (e.g. the LRU pages), page->memcg_data points
to a memory cgroup.
We do not change the behavior of page_memcg() and page_memcg_rcu(). They
are also suitable for LRU pages and kmem pages. Why?
Because memory allocations pinning memcgs for a long time - it exists at a
larger scale and is causing recurring problems in the real world: page
cache doesn't get reclaimed for a long time, or is used by the second,
third, fourth, ... instance of the same job that was restarted into a new
cgroup every time. Unreclaimable dying cgroups pile up, waste memory, and
make page reclaim very inefficient.
We can convert LRU pages and most other raw memcg pins to the objcg
direction to fix this problem, and then the page->memcg will always point
to an object cgroup pointer. At that time, LRU pages and kmem pages will
be treated the same. The implementation of page_memcg() will remove the
kmem page check.
This patch aims to charge the kmem pages by using the new APIs of
obj_cgroup. Finally, the page->memcg_data of the kmem page points to an
object cgroup. We can use the __page_objcg() to get the object cgroup
associated with a kmem page. Or we can use page_memcg() to get the memory
cgroup associated with a kmem page, but caller must ensure that the
returned memcg won't be released (e.g. acquire the rcu_read_lock or
css_set_lock).
Link: https://lkml.kernel.org/r/20210401030141.37061-1-songmuchun@bytedance.com
Link: https://lkml.kernel.org/r/20210319163821.20704-6-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
[songmuchun@bytedance.com: fix forget to obtain the ref to objcg in split_page_memcg]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-04-30 05:56:52 +00:00
|
|
|
{
|
2021-06-28 18:59:26 +00:00
|
|
|
unsigned long memcg_data = folio->memcg_data;
|
mm: memcontrol: use obj_cgroup APIs to charge kmem pages
Since Roman's series "The new cgroup slab memory controller" applied.
All slab objects are charged via the new APIs of obj_cgroup. The new
APIs introduce a struct obj_cgroup to charge slab objects. It prevents
long-living objects from pinning the original memory cgroup in the
memory. But there are still some corner objects (e.g. allocations
larger than order-1 page on SLUB) which are not charged via the new
APIs. Those objects (include the pages which are allocated from buddy
allocator directly) are charged as kmem pages which still hold a
reference to the memory cgroup.
We want to reuse the obj_cgroup APIs to charge the kmem pages. If we do
that, we should store an object cgroup pointer to page->memcg_data for
the kmem pages.
Finally, page->memcg_data will have 3 different meanings.
1) For the slab pages, page->memcg_data points to an object cgroups
vector.
2) For the kmem pages (exclude the slab pages), page->memcg_data
points to an object cgroup.
3) For the user pages (e.g. the LRU pages), page->memcg_data points
to a memory cgroup.
We do not change the behavior of page_memcg() and page_memcg_rcu(). They
are also suitable for LRU pages and kmem pages. Why?
Because memory allocations pinning memcgs for a long time - it exists at a
larger scale and is causing recurring problems in the real world: page
cache doesn't get reclaimed for a long time, or is used by the second,
third, fourth, ... instance of the same job that was restarted into a new
cgroup every time. Unreclaimable dying cgroups pile up, waste memory, and
make page reclaim very inefficient.
We can convert LRU pages and most other raw memcg pins to the objcg
direction to fix this problem, and then the page->memcg will always point
to an object cgroup pointer. At that time, LRU pages and kmem pages will
be treated the same. The implementation of page_memcg() will remove the
kmem page check.
This patch aims to charge the kmem pages by using the new APIs of
obj_cgroup. Finally, the page->memcg_data of the kmem page points to an
object cgroup. We can use the __page_objcg() to get the object cgroup
associated with a kmem page. Or we can use page_memcg() to get the memory
cgroup associated with a kmem page, but caller must ensure that the
returned memcg won't be released (e.g. acquire the rcu_read_lock or
css_set_lock).
Link: https://lkml.kernel.org/r/20210401030141.37061-1-songmuchun@bytedance.com
Link: https://lkml.kernel.org/r/20210319163821.20704-6-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
[songmuchun@bytedance.com: fix forget to obtain the ref to objcg in split_page_memcg]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-04-30 05:56:52 +00:00
|
|
|
|
2021-06-28 18:59:26 +00:00
|
|
|
VM_BUG_ON_FOLIO(folio_test_slab(folio), folio);
|
2024-03-21 16:36:28 +00:00
|
|
|
VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio);
|
2021-06-28 18:59:26 +00:00
|
|
|
VM_BUG_ON_FOLIO(!(memcg_data & MEMCG_DATA_KMEM), folio);
|
mm: memcontrol: use obj_cgroup APIs to charge kmem pages
Since Roman's series "The new cgroup slab memory controller" applied.
All slab objects are charged via the new APIs of obj_cgroup. The new
APIs introduce a struct obj_cgroup to charge slab objects. It prevents
long-living objects from pinning the original memory cgroup in the
memory. But there are still some corner objects (e.g. allocations
larger than order-1 page on SLUB) which are not charged via the new
APIs. Those objects (include the pages which are allocated from buddy
allocator directly) are charged as kmem pages which still hold a
reference to the memory cgroup.
We want to reuse the obj_cgroup APIs to charge the kmem pages. If we do
that, we should store an object cgroup pointer to page->memcg_data for
the kmem pages.
Finally, page->memcg_data will have 3 different meanings.
1) For the slab pages, page->memcg_data points to an object cgroups
vector.
2) For the kmem pages (exclude the slab pages), page->memcg_data
points to an object cgroup.
3) For the user pages (e.g. the LRU pages), page->memcg_data points
to a memory cgroup.
We do not change the behavior of page_memcg() and page_memcg_rcu(). They
are also suitable for LRU pages and kmem pages. Why?
Because memory allocations pinning memcgs for a long time - it exists at a
larger scale and is causing recurring problems in the real world: page
cache doesn't get reclaimed for a long time, or is used by the second,
third, fourth, ... instance of the same job that was restarted into a new
cgroup every time. Unreclaimable dying cgroups pile up, waste memory, and
make page reclaim very inefficient.
We can convert LRU pages and most other raw memcg pins to the objcg
direction to fix this problem, and then the page->memcg will always point
to an object cgroup pointer. At that time, LRU pages and kmem pages will
be treated the same. The implementation of page_memcg() will remove the
kmem page check.
This patch aims to charge the kmem pages by using the new APIs of
obj_cgroup. Finally, the page->memcg_data of the kmem page points to an
object cgroup. We can use the __page_objcg() to get the object cgroup
associated with a kmem page. Or we can use page_memcg() to get the memory
cgroup associated with a kmem page, but caller must ensure that the
returned memcg won't be released (e.g. acquire the rcu_read_lock or
css_set_lock).
Link: https://lkml.kernel.org/r/20210401030141.37061-1-songmuchun@bytedance.com
Link: https://lkml.kernel.org/r/20210319163821.20704-6-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
[songmuchun@bytedance.com: fix forget to obtain the ref to objcg in split_page_memcg]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-04-30 05:56:52 +00:00
|
|
|
|
2024-03-21 16:36:31 +00:00
|
|
|
return (struct obj_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK);
|
mm: memcontrol: use obj_cgroup APIs to charge kmem pages
Since Roman's series "The new cgroup slab memory controller" applied.
All slab objects are charged via the new APIs of obj_cgroup. The new
APIs introduce a struct obj_cgroup to charge slab objects. It prevents
long-living objects from pinning the original memory cgroup in the
memory. But there are still some corner objects (e.g. allocations
larger than order-1 page on SLUB) which are not charged via the new
APIs. Those objects (include the pages which are allocated from buddy
allocator directly) are charged as kmem pages which still hold a
reference to the memory cgroup.
We want to reuse the obj_cgroup APIs to charge the kmem pages. If we do
that, we should store an object cgroup pointer to page->memcg_data for
the kmem pages.
Finally, page->memcg_data will have 3 different meanings.
1) For the slab pages, page->memcg_data points to an object cgroups
vector.
2) For the kmem pages (exclude the slab pages), page->memcg_data
points to an object cgroup.
3) For the user pages (e.g. the LRU pages), page->memcg_data points
to a memory cgroup.
We do not change the behavior of page_memcg() and page_memcg_rcu(). They
are also suitable for LRU pages and kmem pages. Why?
Because memory allocations pinning memcgs for a long time - it exists at a
larger scale and is causing recurring problems in the real world: page
cache doesn't get reclaimed for a long time, or is used by the second,
third, fourth, ... instance of the same job that was restarted into a new
cgroup every time. Unreclaimable dying cgroups pile up, waste memory, and
make page reclaim very inefficient.
We can convert LRU pages and most other raw memcg pins to the objcg
direction to fix this problem, and then the page->memcg will always point
to an object cgroup pointer. At that time, LRU pages and kmem pages will
be treated the same. The implementation of page_memcg() will remove the
kmem page check.
This patch aims to charge the kmem pages by using the new APIs of
obj_cgroup. Finally, the page->memcg_data of the kmem page points to an
object cgroup. We can use the __page_objcg() to get the object cgroup
associated with a kmem page. Or we can use page_memcg() to get the memory
cgroup associated with a kmem page, but caller must ensure that the
returned memcg won't be released (e.g. acquire the rcu_read_lock or
css_set_lock).
Link: https://lkml.kernel.org/r/20210401030141.37061-1-songmuchun@bytedance.com
Link: https://lkml.kernel.org/r/20210319163821.20704-6-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
[songmuchun@bytedance.com: fix forget to obtain the ref to objcg in split_page_memcg]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-04-30 05:56:52 +00:00
|
|
|
}
|
|
|
|
|
mm: memcontrol: Use helpers to read page's memcg data
Patch series "mm: allow mapping accounted kernel pages to userspace", v6.
Currently a non-slab kernel page which has been charged to a memory cgroup
can't be mapped to userspace. The underlying reason is simple: PageKmemcg
flag is defined as a page type (like buddy, offline, etc), so it takes a
bit from a page->mapped counter. Pages with a type set can't be mapped to
userspace.
But in general the kmemcg flag has nothing to do with mapping to
userspace. It only means that the page has been accounted by the page
allocator, so it has to be properly uncharged on release.
Some bpf maps are mapping the vmalloc-based memory to userspace, and their
memory can't be accounted because of this implementation detail.
This patchset removes this limitation by moving the PageKmemcg flag into
one of the free bits of the page->mem_cgroup pointer. Also it formalizes
accesses to the page->mem_cgroup and page->obj_cgroups using new helpers,
adds several checks and removes a couple of obsolete functions. As the
result the code became more robust with fewer open-coded bit tricks.
This patch (of 4):
Currently there are many open-coded reads of the page->mem_cgroup pointer,
as well as a couple of read helpers, which are barely used.
It creates an obstacle on a way to reuse some bits of the pointer for
storing additional bits of information. In fact, we already do this for
slab pages, where the last bit indicates that a pointer has an attached
vector of objcg pointers instead of a regular memcg pointer.
This commits uses 2 existing helpers and introduces a new helper to
converts all read sides to calls of these helpers:
struct mem_cgroup *page_memcg(struct page *page);
struct mem_cgroup *page_memcg_rcu(struct page *page);
struct mem_cgroup *page_memcg_check(struct page *page);
page_memcg_check() is intended to be used in cases when the page can be a
slab page and have a memcg pointer pointing at objcg vector. It does
check the lowest bit, and if set, returns NULL. page_memcg() contains a
VM_BUG_ON_PAGE() check for the page not being a slab page.
To make sure nobody uses a direct access, struct page's
mem_cgroup/obj_cgroups is converted to unsigned long memcg_data.
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Link: https://lkml.kernel.org/r/20201027001657.3398190-1-guro@fb.com
Link: https://lkml.kernel.org/r/20201027001657.3398190-2-guro@fb.com
Link: https://lore.kernel.org/bpf/20201201215900.3569844-2-guro@fb.com
2020-12-01 21:58:27 +00:00
|
|
|
/*
|
2021-06-28 18:59:26 +00:00
|
|
|
* folio_memcg - Get the memory cgroup associated with a folio.
|
|
|
|
* @folio: Pointer to the folio.
|
mm: memcontrol: Use helpers to read page's memcg data
Patch series "mm: allow mapping accounted kernel pages to userspace", v6.
Currently a non-slab kernel page which has been charged to a memory cgroup
can't be mapped to userspace. The underlying reason is simple: PageKmemcg
flag is defined as a page type (like buddy, offline, etc), so it takes a
bit from a page->mapped counter. Pages with a type set can't be mapped to
userspace.
But in general the kmemcg flag has nothing to do with mapping to
userspace. It only means that the page has been accounted by the page
allocator, so it has to be properly uncharged on release.
Some bpf maps are mapping the vmalloc-based memory to userspace, and their
memory can't be accounted because of this implementation detail.
This patchset removes this limitation by moving the PageKmemcg flag into
one of the free bits of the page->mem_cgroup pointer. Also it formalizes
accesses to the page->mem_cgroup and page->obj_cgroups using new helpers,
adds several checks and removes a couple of obsolete functions. As the
result the code became more robust with fewer open-coded bit tricks.
This patch (of 4):
Currently there are many open-coded reads of the page->mem_cgroup pointer,
as well as a couple of read helpers, which are barely used.
It creates an obstacle on a way to reuse some bits of the pointer for
storing additional bits of information. In fact, we already do this for
slab pages, where the last bit indicates that a pointer has an attached
vector of objcg pointers instead of a regular memcg pointer.
This commits uses 2 existing helpers and introduces a new helper to
converts all read sides to calls of these helpers:
struct mem_cgroup *page_memcg(struct page *page);
struct mem_cgroup *page_memcg_rcu(struct page *page);
struct mem_cgroup *page_memcg_check(struct page *page);
page_memcg_check() is intended to be used in cases when the page can be a
slab page and have a memcg pointer pointing at objcg vector. It does
check the lowest bit, and if set, returns NULL. page_memcg() contains a
VM_BUG_ON_PAGE() check for the page not being a slab page.
To make sure nobody uses a direct access, struct page's
mem_cgroup/obj_cgroups is converted to unsigned long memcg_data.
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Link: https://lkml.kernel.org/r/20201027001657.3398190-1-guro@fb.com
Link: https://lkml.kernel.org/r/20201027001657.3398190-2-guro@fb.com
Link: https://lore.kernel.org/bpf/20201201215900.3569844-2-guro@fb.com
2020-12-01 21:58:27 +00:00
|
|
|
*
|
2021-06-28 18:59:26 +00:00
|
|
|
* Returns a pointer to the memory cgroup associated with the folio,
|
|
|
|
* or NULL. This function assumes that the folio is known to have a
|
mm: memcontrol: Use helpers to read page's memcg data
Patch series "mm: allow mapping accounted kernel pages to userspace", v6.
Currently a non-slab kernel page which has been charged to a memory cgroup
can't be mapped to userspace. The underlying reason is simple: PageKmemcg
flag is defined as a page type (like buddy, offline, etc), so it takes a
bit from a page->mapped counter. Pages with a type set can't be mapped to
userspace.
But in general the kmemcg flag has nothing to do with mapping to
userspace. It only means that the page has been accounted by the page
allocator, so it has to be properly uncharged on release.
Some bpf maps are mapping the vmalloc-based memory to userspace, and their
memory can't be accounted because of this implementation detail.
This patchset removes this limitation by moving the PageKmemcg flag into
one of the free bits of the page->mem_cgroup pointer. Also it formalizes
accesses to the page->mem_cgroup and page->obj_cgroups using new helpers,
adds several checks and removes a couple of obsolete functions. As the
result the code became more robust with fewer open-coded bit tricks.
This patch (of 4):
Currently there are many open-coded reads of the page->mem_cgroup pointer,
as well as a couple of read helpers, which are barely used.
It creates an obstacle on a way to reuse some bits of the pointer for
storing additional bits of information. In fact, we already do this for
slab pages, where the last bit indicates that a pointer has an attached
vector of objcg pointers instead of a regular memcg pointer.
This commits uses 2 existing helpers and introduces a new helper to
converts all read sides to calls of these helpers:
struct mem_cgroup *page_memcg(struct page *page);
struct mem_cgroup *page_memcg_rcu(struct page *page);
struct mem_cgroup *page_memcg_check(struct page *page);
page_memcg_check() is intended to be used in cases when the page can be a
slab page and have a memcg pointer pointing at objcg vector. It does
check the lowest bit, and if set, returns NULL. page_memcg() contains a
VM_BUG_ON_PAGE() check for the page not being a slab page.
To make sure nobody uses a direct access, struct page's
mem_cgroup/obj_cgroups is converted to unsigned long memcg_data.
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Link: https://lkml.kernel.org/r/20201027001657.3398190-1-guro@fb.com
Link: https://lkml.kernel.org/r/20201027001657.3398190-2-guro@fb.com
Link: https://lore.kernel.org/bpf/20201201215900.3569844-2-guro@fb.com
2020-12-01 21:58:27 +00:00
|
|
|
* proper memory cgroup pointer. It's not safe to call this function
|
2021-06-28 18:59:26 +00:00
|
|
|
* against some type of folios, e.g. slab folios or ex-slab folios.
|
mm: memcontrol: Use helpers to read page's memcg data
Patch series "mm: allow mapping accounted kernel pages to userspace", v6.
Currently a non-slab kernel page which has been charged to a memory cgroup
can't be mapped to userspace. The underlying reason is simple: PageKmemcg
flag is defined as a page type (like buddy, offline, etc), so it takes a
bit from a page->mapped counter. Pages with a type set can't be mapped to
userspace.
But in general the kmemcg flag has nothing to do with mapping to
userspace. It only means that the page has been accounted by the page
allocator, so it has to be properly uncharged on release.
Some bpf maps are mapping the vmalloc-based memory to userspace, and their
memory can't be accounted because of this implementation detail.
This patchset removes this limitation by moving the PageKmemcg flag into
one of the free bits of the page->mem_cgroup pointer. Also it formalizes
accesses to the page->mem_cgroup and page->obj_cgroups using new helpers,
adds several checks and removes a couple of obsolete functions. As the
result the code became more robust with fewer open-coded bit tricks.
This patch (of 4):
Currently there are many open-coded reads of the page->mem_cgroup pointer,
as well as a couple of read helpers, which are barely used.
It creates an obstacle on a way to reuse some bits of the pointer for
storing additional bits of information. In fact, we already do this for
slab pages, where the last bit indicates that a pointer has an attached
vector of objcg pointers instead of a regular memcg pointer.
This commits uses 2 existing helpers and introduces a new helper to
converts all read sides to calls of these helpers:
struct mem_cgroup *page_memcg(struct page *page);
struct mem_cgroup *page_memcg_rcu(struct page *page);
struct mem_cgroup *page_memcg_check(struct page *page);
page_memcg_check() is intended to be used in cases when the page can be a
slab page and have a memcg pointer pointing at objcg vector. It does
check the lowest bit, and if set, returns NULL. page_memcg() contains a
VM_BUG_ON_PAGE() check for the page not being a slab page.
To make sure nobody uses a direct access, struct page's
mem_cgroup/obj_cgroups is converted to unsigned long memcg_data.
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Link: https://lkml.kernel.org/r/20201027001657.3398190-1-guro@fb.com
Link: https://lkml.kernel.org/r/20201027001657.3398190-2-guro@fb.com
Link: https://lore.kernel.org/bpf/20201201215900.3569844-2-guro@fb.com
2020-12-01 21:58:27 +00:00
|
|
|
*
|
2021-06-28 18:59:26 +00:00
|
|
|
* For a non-kmem folio any of the following ensures folio and memcg binding
|
mm: memcontrol: use obj_cgroup APIs to charge kmem pages
Since Roman's series "The new cgroup slab memory controller" applied.
All slab objects are charged via the new APIs of obj_cgroup. The new
APIs introduce a struct obj_cgroup to charge slab objects. It prevents
long-living objects from pinning the original memory cgroup in the
memory. But there are still some corner objects (e.g. allocations
larger than order-1 page on SLUB) which are not charged via the new
APIs. Those objects (include the pages which are allocated from buddy
allocator directly) are charged as kmem pages which still hold a
reference to the memory cgroup.
We want to reuse the obj_cgroup APIs to charge the kmem pages. If we do
that, we should store an object cgroup pointer to page->memcg_data for
the kmem pages.
Finally, page->memcg_data will have 3 different meanings.
1) For the slab pages, page->memcg_data points to an object cgroups
vector.
2) For the kmem pages (exclude the slab pages), page->memcg_data
points to an object cgroup.
3) For the user pages (e.g. the LRU pages), page->memcg_data points
to a memory cgroup.
We do not change the behavior of page_memcg() and page_memcg_rcu(). They
are also suitable for LRU pages and kmem pages. Why?
Because memory allocations pinning memcgs for a long time - it exists at a
larger scale and is causing recurring problems in the real world: page
cache doesn't get reclaimed for a long time, or is used by the second,
third, fourth, ... instance of the same job that was restarted into a new
cgroup every time. Unreclaimable dying cgroups pile up, waste memory, and
make page reclaim very inefficient.
We can convert LRU pages and most other raw memcg pins to the objcg
direction to fix this problem, and then the page->memcg will always point
to an object cgroup pointer. At that time, LRU pages and kmem pages will
be treated the same. The implementation of page_memcg() will remove the
kmem page check.
This patch aims to charge the kmem pages by using the new APIs of
obj_cgroup. Finally, the page->memcg_data of the kmem page points to an
object cgroup. We can use the __page_objcg() to get the object cgroup
associated with a kmem page. Or we can use page_memcg() to get the memory
cgroup associated with a kmem page, but caller must ensure that the
returned memcg won't be released (e.g. acquire the rcu_read_lock or
css_set_lock).
Link: https://lkml.kernel.org/r/20210401030141.37061-1-songmuchun@bytedance.com
Link: https://lkml.kernel.org/r/20210319163821.20704-6-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
[songmuchun@bytedance.com: fix forget to obtain the ref to objcg in split_page_memcg]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-04-30 05:56:52 +00:00
|
|
|
* stability:
|
|
|
|
*
|
2021-06-28 18:59:26 +00:00
|
|
|
* - the folio lock
|
mm: memcontrol: Use helpers to read page's memcg data
Patch series "mm: allow mapping accounted kernel pages to userspace", v6.
Currently a non-slab kernel page which has been charged to a memory cgroup
can't be mapped to userspace. The underlying reason is simple: PageKmemcg
flag is defined as a page type (like buddy, offline, etc), so it takes a
bit from a page->mapped counter. Pages with a type set can't be mapped to
userspace.
But in general the kmemcg flag has nothing to do with mapping to
userspace. It only means that the page has been accounted by the page
allocator, so it has to be properly uncharged on release.
Some bpf maps are mapping the vmalloc-based memory to userspace, and their
memory can't be accounted because of this implementation detail.
This patchset removes this limitation by moving the PageKmemcg flag into
one of the free bits of the page->mem_cgroup pointer. Also it formalizes
accesses to the page->mem_cgroup and page->obj_cgroups using new helpers,
adds several checks and removes a couple of obsolete functions. As the
result the code became more robust with fewer open-coded bit tricks.
This patch (of 4):
Currently there are many open-coded reads of the page->mem_cgroup pointer,
as well as a couple of read helpers, which are barely used.
It creates an obstacle on a way to reuse some bits of the pointer for
storing additional bits of information. In fact, we already do this for
slab pages, where the last bit indicates that a pointer has an attached
vector of objcg pointers instead of a regular memcg pointer.
This commits uses 2 existing helpers and introduces a new helper to
converts all read sides to calls of these helpers:
struct mem_cgroup *page_memcg(struct page *page);
struct mem_cgroup *page_memcg_rcu(struct page *page);
struct mem_cgroup *page_memcg_check(struct page *page);
page_memcg_check() is intended to be used in cases when the page can be a
slab page and have a memcg pointer pointing at objcg vector. It does
check the lowest bit, and if set, returns NULL. page_memcg() contains a
VM_BUG_ON_PAGE() check for the page not being a slab page.
To make sure nobody uses a direct access, struct page's
mem_cgroup/obj_cgroups is converted to unsigned long memcg_data.
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Link: https://lkml.kernel.org/r/20201027001657.3398190-1-guro@fb.com
Link: https://lkml.kernel.org/r/20201027001657.3398190-2-guro@fb.com
Link: https://lore.kernel.org/bpf/20201201215900.3569844-2-guro@fb.com
2020-12-01 21:58:27 +00:00
|
|
|
* - LRU isolation
|
|
|
|
* - exclusive reference
|
mm: memcontrol: use obj_cgroup APIs to charge kmem pages
Since Roman's series "The new cgroup slab memory controller" applied.
All slab objects are charged via the new APIs of obj_cgroup. The new
APIs introduce a struct obj_cgroup to charge slab objects. It prevents
long-living objects from pinning the original memory cgroup in the
memory. But there are still some corner objects (e.g. allocations
larger than order-1 page on SLUB) which are not charged via the new
APIs. Those objects (include the pages which are allocated from buddy
allocator directly) are charged as kmem pages which still hold a
reference to the memory cgroup.
We want to reuse the obj_cgroup APIs to charge the kmem pages. If we do
that, we should store an object cgroup pointer to page->memcg_data for
the kmem pages.
Finally, page->memcg_data will have 3 different meanings.
1) For the slab pages, page->memcg_data points to an object cgroups
vector.
2) For the kmem pages (exclude the slab pages), page->memcg_data
points to an object cgroup.
3) For the user pages (e.g. the LRU pages), page->memcg_data points
to a memory cgroup.
We do not change the behavior of page_memcg() and page_memcg_rcu(). They
are also suitable for LRU pages and kmem pages. Why?
Because memory allocations pinning memcgs for a long time - it exists at a
larger scale and is causing recurring problems in the real world: page
cache doesn't get reclaimed for a long time, or is used by the second,
third, fourth, ... instance of the same job that was restarted into a new
cgroup every time. Unreclaimable dying cgroups pile up, waste memory, and
make page reclaim very inefficient.
We can convert LRU pages and most other raw memcg pins to the objcg
direction to fix this problem, and then the page->memcg will always point
to an object cgroup pointer. At that time, LRU pages and kmem pages will
be treated the same. The implementation of page_memcg() will remove the
kmem page check.
This patch aims to charge the kmem pages by using the new APIs of
obj_cgroup. Finally, the page->memcg_data of the kmem page points to an
object cgroup. We can use the __page_objcg() to get the object cgroup
associated with a kmem page. Or we can use page_memcg() to get the memory
cgroup associated with a kmem page, but caller must ensure that the
returned memcg won't be released (e.g. acquire the rcu_read_lock or
css_set_lock).
Link: https://lkml.kernel.org/r/20210401030141.37061-1-songmuchun@bytedance.com
Link: https://lkml.kernel.org/r/20210319163821.20704-6-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
[songmuchun@bytedance.com: fix forget to obtain the ref to objcg in split_page_memcg]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-04-30 05:56:52 +00:00
|
|
|
*
|
2021-06-28 18:59:26 +00:00
|
|
|
* For a kmem folio a caller should hold an rcu read lock to protect memcg
|
|
|
|
* associated with a kmem folio from being released.
|
mm: memcontrol: Use helpers to read page's memcg data
Patch series "mm: allow mapping accounted kernel pages to userspace", v6.
Currently a non-slab kernel page which has been charged to a memory cgroup
can't be mapped to userspace. The underlying reason is simple: PageKmemcg
flag is defined as a page type (like buddy, offline, etc), so it takes a
bit from a page->mapped counter. Pages with a type set can't be mapped to
userspace.
But in general the kmemcg flag has nothing to do with mapping to
userspace. It only means that the page has been accounted by the page
allocator, so it has to be properly uncharged on release.
Some bpf maps are mapping the vmalloc-based memory to userspace, and their
memory can't be accounted because of this implementation detail.
This patchset removes this limitation by moving the PageKmemcg flag into
one of the free bits of the page->mem_cgroup pointer. Also it formalizes
accesses to the page->mem_cgroup and page->obj_cgroups using new helpers,
adds several checks and removes a couple of obsolete functions. As the
result the code became more robust with fewer open-coded bit tricks.
This patch (of 4):
Currently there are many open-coded reads of the page->mem_cgroup pointer,
as well as a couple of read helpers, which are barely used.
It creates an obstacle on a way to reuse some bits of the pointer for
storing additional bits of information. In fact, we already do this for
slab pages, where the last bit indicates that a pointer has an attached
vector of objcg pointers instead of a regular memcg pointer.
This commits uses 2 existing helpers and introduces a new helper to
converts all read sides to calls of these helpers:
struct mem_cgroup *page_memcg(struct page *page);
struct mem_cgroup *page_memcg_rcu(struct page *page);
struct mem_cgroup *page_memcg_check(struct page *page);
page_memcg_check() is intended to be used in cases when the page can be a
slab page and have a memcg pointer pointing at objcg vector. It does
check the lowest bit, and if set, returns NULL. page_memcg() contains a
VM_BUG_ON_PAGE() check for the page not being a slab page.
To make sure nobody uses a direct access, struct page's
mem_cgroup/obj_cgroups is converted to unsigned long memcg_data.
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Link: https://lkml.kernel.org/r/20201027001657.3398190-1-guro@fb.com
Link: https://lkml.kernel.org/r/20201027001657.3398190-2-guro@fb.com
Link: https://lore.kernel.org/bpf/20201201215900.3569844-2-guro@fb.com
2020-12-01 21:58:27 +00:00
|
|
|
*/
|
2021-06-28 18:59:26 +00:00
|
|
|
static inline struct mem_cgroup *folio_memcg(struct folio *folio)
|
|
|
|
{
|
|
|
|
if (folio_memcg_kmem(folio))
|
|
|
|
return obj_cgroup_memcg(__folio_objcg(folio));
|
|
|
|
return __folio_memcg(folio);
|
|
|
|
}
|
|
|
|
|
2024-08-14 09:34:15 +00:00
|
|
|
/*
|
|
|
|
* folio_memcg_charged - If a folio is charged to a memory cgroup.
|
|
|
|
* @folio: Pointer to the folio.
|
|
|
|
*
|
|
|
|
* Returns true if folio is charged to a memory cgroup, otherwise returns false.
|
|
|
|
*/
|
|
|
|
static inline bool folio_memcg_charged(struct folio *folio)
|
|
|
|
{
|
|
|
|
if (folio_memcg_kmem(folio))
|
|
|
|
return __folio_objcg(folio) != NULL;
|
|
|
|
return __folio_memcg(folio) != NULL;
|
|
|
|
}
|
|
|
|
|
mm: memcontrol: Use helpers to read page's memcg data
Patch series "mm: allow mapping accounted kernel pages to userspace", v6.
Currently a non-slab kernel page which has been charged to a memory cgroup
can't be mapped to userspace. The underlying reason is simple: PageKmemcg
flag is defined as a page type (like buddy, offline, etc), so it takes a
bit from a page->mapped counter. Pages with a type set can't be mapped to
userspace.
But in general the kmemcg flag has nothing to do with mapping to
userspace. It only means that the page has been accounted by the page
allocator, so it has to be properly uncharged on release.
Some bpf maps are mapping the vmalloc-based memory to userspace, and their
memory can't be accounted because of this implementation detail.
This patchset removes this limitation by moving the PageKmemcg flag into
one of the free bits of the page->mem_cgroup pointer. Also it formalizes
accesses to the page->mem_cgroup and page->obj_cgroups using new helpers,
adds several checks and removes a couple of obsolete functions. As the
result the code became more robust with fewer open-coded bit tricks.
This patch (of 4):
Currently there are many open-coded reads of the page->mem_cgroup pointer,
as well as a couple of read helpers, which are barely used.
It creates an obstacle on a way to reuse some bits of the pointer for
storing additional bits of information. In fact, we already do this for
slab pages, where the last bit indicates that a pointer has an attached
vector of objcg pointers instead of a regular memcg pointer.
This commits uses 2 existing helpers and introduces a new helper to
converts all read sides to calls of these helpers:
struct mem_cgroup *page_memcg(struct page *page);
struct mem_cgroup *page_memcg_rcu(struct page *page);
struct mem_cgroup *page_memcg_check(struct page *page);
page_memcg_check() is intended to be used in cases when the page can be a
slab page and have a memcg pointer pointing at objcg vector. It does
check the lowest bit, and if set, returns NULL. page_memcg() contains a
VM_BUG_ON_PAGE() check for the page not being a slab page.
To make sure nobody uses a direct access, struct page's
mem_cgroup/obj_cgroups is converted to unsigned long memcg_data.
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Link: https://lkml.kernel.org/r/20201027001657.3398190-1-guro@fb.com
Link: https://lkml.kernel.org/r/20201027001657.3398190-2-guro@fb.com
Link: https://lore.kernel.org/bpf/20201201215900.3569844-2-guro@fb.com
2020-12-01 21:58:27 +00:00
|
|
|
/*
|
2022-12-30 07:08:42 +00:00
|
|
|
* folio_memcg_check - Get the memory cgroup associated with a folio.
|
|
|
|
* @folio: Pointer to the folio.
|
mm: memcontrol: Use helpers to read page's memcg data
Patch series "mm: allow mapping accounted kernel pages to userspace", v6.
Currently a non-slab kernel page which has been charged to a memory cgroup
can't be mapped to userspace. The underlying reason is simple: PageKmemcg
flag is defined as a page type (like buddy, offline, etc), so it takes a
bit from a page->mapped counter. Pages with a type set can't be mapped to
userspace.
But in general the kmemcg flag has nothing to do with mapping to
userspace. It only means that the page has been accounted by the page
allocator, so it has to be properly uncharged on release.
Some bpf maps are mapping the vmalloc-based memory to userspace, and their
memory can't be accounted because of this implementation detail.
This patchset removes this limitation by moving the PageKmemcg flag into
one of the free bits of the page->mem_cgroup pointer. Also it formalizes
accesses to the page->mem_cgroup and page->obj_cgroups using new helpers,
adds several checks and removes a couple of obsolete functions. As the
result the code became more robust with fewer open-coded bit tricks.
This patch (of 4):
Currently there are many open-coded reads of the page->mem_cgroup pointer,
as well as a couple of read helpers, which are barely used.
It creates an obstacle on a way to reuse some bits of the pointer for
storing additional bits of information. In fact, we already do this for
slab pages, where the last bit indicates that a pointer has an attached
vector of objcg pointers instead of a regular memcg pointer.
This commits uses 2 existing helpers and introduces a new helper to
converts all read sides to calls of these helpers:
struct mem_cgroup *page_memcg(struct page *page);
struct mem_cgroup *page_memcg_rcu(struct page *page);
struct mem_cgroup *page_memcg_check(struct page *page);
page_memcg_check() is intended to be used in cases when the page can be a
slab page and have a memcg pointer pointing at objcg vector. It does
check the lowest bit, and if set, returns NULL. page_memcg() contains a
VM_BUG_ON_PAGE() check for the page not being a slab page.
To make sure nobody uses a direct access, struct page's
mem_cgroup/obj_cgroups is converted to unsigned long memcg_data.
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Link: https://lkml.kernel.org/r/20201027001657.3398190-1-guro@fb.com
Link: https://lkml.kernel.org/r/20201027001657.3398190-2-guro@fb.com
Link: https://lore.kernel.org/bpf/20201201215900.3569844-2-guro@fb.com
2020-12-01 21:58:27 +00:00
|
|
|
*
|
2022-12-30 07:08:42 +00:00
|
|
|
* Returns a pointer to the memory cgroup associated with the folio,
|
|
|
|
* or NULL. This function unlike folio_memcg() can take any folio
|
|
|
|
* as an argument. It has to be used in cases when it's not known if a folio
|
mm: memcontrol: use obj_cgroup APIs to charge kmem pages
Since Roman's series "The new cgroup slab memory controller" applied.
All slab objects are charged via the new APIs of obj_cgroup. The new
APIs introduce a struct obj_cgroup to charge slab objects. It prevents
long-living objects from pinning the original memory cgroup in the
memory. But there are still some corner objects (e.g. allocations
larger than order-1 page on SLUB) which are not charged via the new
APIs. Those objects (include the pages which are allocated from buddy
allocator directly) are charged as kmem pages which still hold a
reference to the memory cgroup.
We want to reuse the obj_cgroup APIs to charge the kmem pages. If we do
that, we should store an object cgroup pointer to page->memcg_data for
the kmem pages.
Finally, page->memcg_data will have 3 different meanings.
1) For the slab pages, page->memcg_data points to an object cgroups
vector.
2) For the kmem pages (exclude the slab pages), page->memcg_data
points to an object cgroup.
3) For the user pages (e.g. the LRU pages), page->memcg_data points
to a memory cgroup.
We do not change the behavior of page_memcg() and page_memcg_rcu(). They
are also suitable for LRU pages and kmem pages. Why?
Because memory allocations pinning memcgs for a long time - it exists at a
larger scale and is causing recurring problems in the real world: page
cache doesn't get reclaimed for a long time, or is used by the second,
third, fourth, ... instance of the same job that was restarted into a new
cgroup every time. Unreclaimable dying cgroups pile up, waste memory, and
make page reclaim very inefficient.
We can convert LRU pages and most other raw memcg pins to the objcg
direction to fix this problem, and then the page->memcg will always point
to an object cgroup pointer. At that time, LRU pages and kmem pages will
be treated the same. The implementation of page_memcg() will remove the
kmem page check.
This patch aims to charge the kmem pages by using the new APIs of
obj_cgroup. Finally, the page->memcg_data of the kmem page points to an
object cgroup. We can use the __page_objcg() to get the object cgroup
associated with a kmem page. Or we can use page_memcg() to get the memory
cgroup associated with a kmem page, but caller must ensure that the
returned memcg won't be released (e.g. acquire the rcu_read_lock or
css_set_lock).
Link: https://lkml.kernel.org/r/20210401030141.37061-1-songmuchun@bytedance.com
Link: https://lkml.kernel.org/r/20210319163821.20704-6-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
[songmuchun@bytedance.com: fix forget to obtain the ref to objcg in split_page_memcg]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-04-30 05:56:52 +00:00
|
|
|
* has an associated memory cgroup pointer or an object cgroups vector or
|
|
|
|
* an object cgroup.
|
|
|
|
*
|
2022-12-30 07:08:42 +00:00
|
|
|
* For a non-kmem folio any of the following ensures folio and memcg binding
|
mm: memcontrol: use obj_cgroup APIs to charge kmem pages
Since Roman's series "The new cgroup slab memory controller" applied.
All slab objects are charged via the new APIs of obj_cgroup. The new
APIs introduce a struct obj_cgroup to charge slab objects. It prevents
long-living objects from pinning the original memory cgroup in the
memory. But there are still some corner objects (e.g. allocations
larger than order-1 page on SLUB) which are not charged via the new
APIs. Those objects (include the pages which are allocated from buddy
allocator directly) are charged as kmem pages which still hold a
reference to the memory cgroup.
We want to reuse the obj_cgroup APIs to charge the kmem pages. If we do
that, we should store an object cgroup pointer to page->memcg_data for
the kmem pages.
Finally, page->memcg_data will have 3 different meanings.
1) For the slab pages, page->memcg_data points to an object cgroups
vector.
2) For the kmem pages (exclude the slab pages), page->memcg_data
points to an object cgroup.
3) For the user pages (e.g. the LRU pages), page->memcg_data points
to a memory cgroup.
We do not change the behavior of page_memcg() and page_memcg_rcu(). They
are also suitable for LRU pages and kmem pages. Why?
Because memory allocations pinning memcgs for a long time - it exists at a
larger scale and is causing recurring problems in the real world: page
cache doesn't get reclaimed for a long time, or is used by the second,
third, fourth, ... instance of the same job that was restarted into a new
cgroup every time. Unreclaimable dying cgroups pile up, waste memory, and
make page reclaim very inefficient.
We can convert LRU pages and most other raw memcg pins to the objcg
direction to fix this problem, and then the page->memcg will always point
to an object cgroup pointer. At that time, LRU pages and kmem pages will
be treated the same. The implementation of page_memcg() will remove the
kmem page check.
This patch aims to charge the kmem pages by using the new APIs of
obj_cgroup. Finally, the page->memcg_data of the kmem page points to an
object cgroup. We can use the __page_objcg() to get the object cgroup
associated with a kmem page. Or we can use page_memcg() to get the memory
cgroup associated with a kmem page, but caller must ensure that the
returned memcg won't be released (e.g. acquire the rcu_read_lock or
css_set_lock).
Link: https://lkml.kernel.org/r/20210401030141.37061-1-songmuchun@bytedance.com
Link: https://lkml.kernel.org/r/20210319163821.20704-6-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
[songmuchun@bytedance.com: fix forget to obtain the ref to objcg in split_page_memcg]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-04-30 05:56:52 +00:00
|
|
|
* stability:
|
mm: memcontrol: Use helpers to read page's memcg data
Patch series "mm: allow mapping accounted kernel pages to userspace", v6.
Currently a non-slab kernel page which has been charged to a memory cgroup
can't be mapped to userspace. The underlying reason is simple: PageKmemcg
flag is defined as a page type (like buddy, offline, etc), so it takes a
bit from a page->mapped counter. Pages with a type set can't be mapped to
userspace.
But in general the kmemcg flag has nothing to do with mapping to
userspace. It only means that the page has been accounted by the page
allocator, so it has to be properly uncharged on release.
Some bpf maps are mapping the vmalloc-based memory to userspace, and their
memory can't be accounted because of this implementation detail.
This patchset removes this limitation by moving the PageKmemcg flag into
one of the free bits of the page->mem_cgroup pointer. Also it formalizes
accesses to the page->mem_cgroup and page->obj_cgroups using new helpers,
adds several checks and removes a couple of obsolete functions. As the
result the code became more robust with fewer open-coded bit tricks.
This patch (of 4):
Currently there are many open-coded reads of the page->mem_cgroup pointer,
as well as a couple of read helpers, which are barely used.
It creates an obstacle on a way to reuse some bits of the pointer for
storing additional bits of information. In fact, we already do this for
slab pages, where the last bit indicates that a pointer has an attached
vector of objcg pointers instead of a regular memcg pointer.
This commits uses 2 existing helpers and introduces a new helper to
converts all read sides to calls of these helpers:
struct mem_cgroup *page_memcg(struct page *page);
struct mem_cgroup *page_memcg_rcu(struct page *page);
struct mem_cgroup *page_memcg_check(struct page *page);
page_memcg_check() is intended to be used in cases when the page can be a
slab page and have a memcg pointer pointing at objcg vector. It does
check the lowest bit, and if set, returns NULL. page_memcg() contains a
VM_BUG_ON_PAGE() check for the page not being a slab page.
To make sure nobody uses a direct access, struct page's
mem_cgroup/obj_cgroups is converted to unsigned long memcg_data.
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Link: https://lkml.kernel.org/r/20201027001657.3398190-1-guro@fb.com
Link: https://lkml.kernel.org/r/20201027001657.3398190-2-guro@fb.com
Link: https://lore.kernel.org/bpf/20201201215900.3569844-2-guro@fb.com
2020-12-01 21:58:27 +00:00
|
|
|
*
|
2022-12-30 07:08:42 +00:00
|
|
|
* - the folio lock
|
mm: memcontrol: Use helpers to read page's memcg data
Patch series "mm: allow mapping accounted kernel pages to userspace", v6.
Currently a non-slab kernel page which has been charged to a memory cgroup
can't be mapped to userspace. The underlying reason is simple: PageKmemcg
flag is defined as a page type (like buddy, offline, etc), so it takes a
bit from a page->mapped counter. Pages with a type set can't be mapped to
userspace.
But in general the kmemcg flag has nothing to do with mapping to
userspace. It only means that the page has been accounted by the page
allocator, so it has to be properly uncharged on release.
Some bpf maps are mapping the vmalloc-based memory to userspace, and their
memory can't be accounted because of this implementation detail.
This patchset removes this limitation by moving the PageKmemcg flag into
one of the free bits of the page->mem_cgroup pointer. Also it formalizes
accesses to the page->mem_cgroup and page->obj_cgroups using new helpers,
adds several checks and removes a couple of obsolete functions. As the
result the code became more robust with fewer open-coded bit tricks.
This patch (of 4):
Currently there are many open-coded reads of the page->mem_cgroup pointer,
as well as a couple of read helpers, which are barely used.
It creates an obstacle on a way to reuse some bits of the pointer for
storing additional bits of information. In fact, we already do this for
slab pages, where the last bit indicates that a pointer has an attached
vector of objcg pointers instead of a regular memcg pointer.
This commits uses 2 existing helpers and introduces a new helper to
converts all read sides to calls of these helpers:
struct mem_cgroup *page_memcg(struct page *page);
struct mem_cgroup *page_memcg_rcu(struct page *page);
struct mem_cgroup *page_memcg_check(struct page *page);
page_memcg_check() is intended to be used in cases when the page can be a
slab page and have a memcg pointer pointing at objcg vector. It does
check the lowest bit, and if set, returns NULL. page_memcg() contains a
VM_BUG_ON_PAGE() check for the page not being a slab page.
To make sure nobody uses a direct access, struct page's
mem_cgroup/obj_cgroups is converted to unsigned long memcg_data.
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Link: https://lkml.kernel.org/r/20201027001657.3398190-1-guro@fb.com
Link: https://lkml.kernel.org/r/20201027001657.3398190-2-guro@fb.com
Link: https://lore.kernel.org/bpf/20201201215900.3569844-2-guro@fb.com
2020-12-01 21:58:27 +00:00
|
|
|
* - LRU isolation
|
|
|
|
* - exclusive reference
|
mm: memcontrol: use obj_cgroup APIs to charge kmem pages
Since Roman's series "The new cgroup slab memory controller" applied.
All slab objects are charged via the new APIs of obj_cgroup. The new
APIs introduce a struct obj_cgroup to charge slab objects. It prevents
long-living objects from pinning the original memory cgroup in the
memory. But there are still some corner objects (e.g. allocations
larger than order-1 page on SLUB) which are not charged via the new
APIs. Those objects (include the pages which are allocated from buddy
allocator directly) are charged as kmem pages which still hold a
reference to the memory cgroup.
We want to reuse the obj_cgroup APIs to charge the kmem pages. If we do
that, we should store an object cgroup pointer to page->memcg_data for
the kmem pages.
Finally, page->memcg_data will have 3 different meanings.
1) For the slab pages, page->memcg_data points to an object cgroups
vector.
2) For the kmem pages (exclude the slab pages), page->memcg_data
points to an object cgroup.
3) For the user pages (e.g. the LRU pages), page->memcg_data points
to a memory cgroup.
We do not change the behavior of page_memcg() and page_memcg_rcu(). They
are also suitable for LRU pages and kmem pages. Why?
Because memory allocations pinning memcgs for a long time - it exists at a
larger scale and is causing recurring problems in the real world: page
cache doesn't get reclaimed for a long time, or is used by the second,
third, fourth, ... instance of the same job that was restarted into a new
cgroup every time. Unreclaimable dying cgroups pile up, waste memory, and
make page reclaim very inefficient.
We can convert LRU pages and most other raw memcg pins to the objcg
direction to fix this problem, and then the page->memcg will always point
to an object cgroup pointer. At that time, LRU pages and kmem pages will
be treated the same. The implementation of page_memcg() will remove the
kmem page check.
This patch aims to charge the kmem pages by using the new APIs of
obj_cgroup. Finally, the page->memcg_data of the kmem page points to an
object cgroup. We can use the __page_objcg() to get the object cgroup
associated with a kmem page. Or we can use page_memcg() to get the memory
cgroup associated with a kmem page, but caller must ensure that the
returned memcg won't be released (e.g. acquire the rcu_read_lock or
css_set_lock).
Link: https://lkml.kernel.org/r/20210401030141.37061-1-songmuchun@bytedance.com
Link: https://lkml.kernel.org/r/20210319163821.20704-6-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
[songmuchun@bytedance.com: fix forget to obtain the ref to objcg in split_page_memcg]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-04-30 05:56:52 +00:00
|
|
|
*
|
2022-12-30 07:08:42 +00:00
|
|
|
* For a kmem folio a caller should hold an rcu read lock to protect memcg
|
|
|
|
* associated with a kmem folio from being released.
|
mm: memcontrol: Use helpers to read page's memcg data
Patch series "mm: allow mapping accounted kernel pages to userspace", v6.
Currently a non-slab kernel page which has been charged to a memory cgroup
can't be mapped to userspace. The underlying reason is simple: PageKmemcg
flag is defined as a page type (like buddy, offline, etc), so it takes a
bit from a page->mapped counter. Pages with a type set can't be mapped to
userspace.
But in general the kmemcg flag has nothing to do with mapping to
userspace. It only means that the page has been accounted by the page
allocator, so it has to be properly uncharged on release.
Some bpf maps are mapping the vmalloc-based memory to userspace, and their
memory can't be accounted because of this implementation detail.
This patchset removes this limitation by moving the PageKmemcg flag into
one of the free bits of the page->mem_cgroup pointer. Also it formalizes
accesses to the page->mem_cgroup and page->obj_cgroups using new helpers,
adds several checks and removes a couple of obsolete functions. As the
result the code became more robust with fewer open-coded bit tricks.
This patch (of 4):
Currently there are many open-coded reads of the page->mem_cgroup pointer,
as well as a couple of read helpers, which are barely used.
It creates an obstacle on a way to reuse some bits of the pointer for
storing additional bits of information. In fact, we already do this for
slab pages, where the last bit indicates that a pointer has an attached
vector of objcg pointers instead of a regular memcg pointer.
This commits uses 2 existing helpers and introduces a new helper to
converts all read sides to calls of these helpers:
struct mem_cgroup *page_memcg(struct page *page);
struct mem_cgroup *page_memcg_rcu(struct page *page);
struct mem_cgroup *page_memcg_check(struct page *page);
page_memcg_check() is intended to be used in cases when the page can be a
slab page and have a memcg pointer pointing at objcg vector. It does
check the lowest bit, and if set, returns NULL. page_memcg() contains a
VM_BUG_ON_PAGE() check for the page not being a slab page.
To make sure nobody uses a direct access, struct page's
mem_cgroup/obj_cgroups is converted to unsigned long memcg_data.
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Link: https://lkml.kernel.org/r/20201027001657.3398190-1-guro@fb.com
Link: https://lkml.kernel.org/r/20201027001657.3398190-2-guro@fb.com
Link: https://lore.kernel.org/bpf/20201201215900.3569844-2-guro@fb.com
2020-12-01 21:58:27 +00:00
|
|
|
*/
|
2022-12-30 07:08:42 +00:00
|
|
|
static inline struct mem_cgroup *folio_memcg_check(struct folio *folio)
|
mm: memcontrol: Use helpers to read page's memcg data
Patch series "mm: allow mapping accounted kernel pages to userspace", v6.
Currently a non-slab kernel page which has been charged to a memory cgroup
can't be mapped to userspace. The underlying reason is simple: PageKmemcg
flag is defined as a page type (like buddy, offline, etc), so it takes a
bit from a page->mapped counter. Pages with a type set can't be mapped to
userspace.
But in general the kmemcg flag has nothing to do with mapping to
userspace. It only means that the page has been accounted by the page
allocator, so it has to be properly uncharged on release.
Some bpf maps are mapping the vmalloc-based memory to userspace, and their
memory can't be accounted because of this implementation detail.
This patchset removes this limitation by moving the PageKmemcg flag into
one of the free bits of the page->mem_cgroup pointer. Also it formalizes
accesses to the page->mem_cgroup and page->obj_cgroups using new helpers,
adds several checks and removes a couple of obsolete functions. As the
result the code became more robust with fewer open-coded bit tricks.
This patch (of 4):
Currently there are many open-coded reads of the page->mem_cgroup pointer,
as well as a couple of read helpers, which are barely used.
It creates an obstacle on a way to reuse some bits of the pointer for
storing additional bits of information. In fact, we already do this for
slab pages, where the last bit indicates that a pointer has an attached
vector of objcg pointers instead of a regular memcg pointer.
This commits uses 2 existing helpers and introduces a new helper to
converts all read sides to calls of these helpers:
struct mem_cgroup *page_memcg(struct page *page);
struct mem_cgroup *page_memcg_rcu(struct page *page);
struct mem_cgroup *page_memcg_check(struct page *page);
page_memcg_check() is intended to be used in cases when the page can be a
slab page and have a memcg pointer pointing at objcg vector. It does
check the lowest bit, and if set, returns NULL. page_memcg() contains a
VM_BUG_ON_PAGE() check for the page not being a slab page.
To make sure nobody uses a direct access, struct page's
mem_cgroup/obj_cgroups is converted to unsigned long memcg_data.
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Link: https://lkml.kernel.org/r/20201027001657.3398190-1-guro@fb.com
Link: https://lkml.kernel.org/r/20201027001657.3398190-2-guro@fb.com
Link: https://lore.kernel.org/bpf/20201201215900.3569844-2-guro@fb.com
2020-12-01 21:58:27 +00:00
|
|
|
{
|
|
|
|
/*
|
2022-12-30 07:08:42 +00:00
|
|
|
* Because folio->memcg_data might be changed asynchronously
|
|
|
|
* for slabs, READ_ONCE() should be used here.
|
mm: memcontrol: Use helpers to read page's memcg data
Patch series "mm: allow mapping accounted kernel pages to userspace", v6.
Currently a non-slab kernel page which has been charged to a memory cgroup
can't be mapped to userspace. The underlying reason is simple: PageKmemcg
flag is defined as a page type (like buddy, offline, etc), so it takes a
bit from a page->mapped counter. Pages with a type set can't be mapped to
userspace.
But in general the kmemcg flag has nothing to do with mapping to
userspace. It only means that the page has been accounted by the page
allocator, so it has to be properly uncharged on release.
Some bpf maps are mapping the vmalloc-based memory to userspace, and their
memory can't be accounted because of this implementation detail.
This patchset removes this limitation by moving the PageKmemcg flag into
one of the free bits of the page->mem_cgroup pointer. Also it formalizes
accesses to the page->mem_cgroup and page->obj_cgroups using new helpers,
adds several checks and removes a couple of obsolete functions. As the
result the code became more robust with fewer open-coded bit tricks.
This patch (of 4):
Currently there are many open-coded reads of the page->mem_cgroup pointer,
as well as a couple of read helpers, which are barely used.
It creates an obstacle on a way to reuse some bits of the pointer for
storing additional bits of information. In fact, we already do this for
slab pages, where the last bit indicates that a pointer has an attached
vector of objcg pointers instead of a regular memcg pointer.
This commits uses 2 existing helpers and introduces a new helper to
converts all read sides to calls of these helpers:
struct mem_cgroup *page_memcg(struct page *page);
struct mem_cgroup *page_memcg_rcu(struct page *page);
struct mem_cgroup *page_memcg_check(struct page *page);
page_memcg_check() is intended to be used in cases when the page can be a
slab page and have a memcg pointer pointing at objcg vector. It does
check the lowest bit, and if set, returns NULL. page_memcg() contains a
VM_BUG_ON_PAGE() check for the page not being a slab page.
To make sure nobody uses a direct access, struct page's
mem_cgroup/obj_cgroups is converted to unsigned long memcg_data.
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Link: https://lkml.kernel.org/r/20201027001657.3398190-1-guro@fb.com
Link: https://lkml.kernel.org/r/20201027001657.3398190-2-guro@fb.com
Link: https://lore.kernel.org/bpf/20201201215900.3569844-2-guro@fb.com
2020-12-01 21:58:27 +00:00
|
|
|
*/
|
2022-12-30 07:08:42 +00:00
|
|
|
unsigned long memcg_data = READ_ONCE(folio->memcg_data);
|
mm: memcontrol: Use helpers to read page's memcg data
Patch series "mm: allow mapping accounted kernel pages to userspace", v6.
Currently a non-slab kernel page which has been charged to a memory cgroup
can't be mapped to userspace. The underlying reason is simple: PageKmemcg
flag is defined as a page type (like buddy, offline, etc), so it takes a
bit from a page->mapped counter. Pages with a type set can't be mapped to
userspace.
But in general the kmemcg flag has nothing to do with mapping to
userspace. It only means that the page has been accounted by the page
allocator, so it has to be properly uncharged on release.
Some bpf maps are mapping the vmalloc-based memory to userspace, and their
memory can't be accounted because of this implementation detail.
This patchset removes this limitation by moving the PageKmemcg flag into
one of the free bits of the page->mem_cgroup pointer. Also it formalizes
accesses to the page->mem_cgroup and page->obj_cgroups using new helpers,
adds several checks and removes a couple of obsolete functions. As the
result the code became more robust with fewer open-coded bit tricks.
This patch (of 4):
Currently there are many open-coded reads of the page->mem_cgroup pointer,
as well as a couple of read helpers, which are barely used.
It creates an obstacle on a way to reuse some bits of the pointer for
storing additional bits of information. In fact, we already do this for
slab pages, where the last bit indicates that a pointer has an attached
vector of objcg pointers instead of a regular memcg pointer.
This commits uses 2 existing helpers and introduces a new helper to
converts all read sides to calls of these helpers:
struct mem_cgroup *page_memcg(struct page *page);
struct mem_cgroup *page_memcg_rcu(struct page *page);
struct mem_cgroup *page_memcg_check(struct page *page);
page_memcg_check() is intended to be used in cases when the page can be a
slab page and have a memcg pointer pointing at objcg vector. It does
check the lowest bit, and if set, returns NULL. page_memcg() contains a
VM_BUG_ON_PAGE() check for the page not being a slab page.
To make sure nobody uses a direct access, struct page's
mem_cgroup/obj_cgroups is converted to unsigned long memcg_data.
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Link: https://lkml.kernel.org/r/20201027001657.3398190-1-guro@fb.com
Link: https://lkml.kernel.org/r/20201027001657.3398190-2-guro@fb.com
Link: https://lore.kernel.org/bpf/20201201215900.3569844-2-guro@fb.com
2020-12-01 21:58:27 +00:00
|
|
|
|
2024-03-21 16:36:28 +00:00
|
|
|
if (memcg_data & MEMCG_DATA_OBJEXTS)
|
mm: memcontrol: Use helpers to read page's memcg data
Patch series "mm: allow mapping accounted kernel pages to userspace", v6.
Currently a non-slab kernel page which has been charged to a memory cgroup
can't be mapped to userspace. The underlying reason is simple: PageKmemcg
flag is defined as a page type (like buddy, offline, etc), so it takes a
bit from a page->mapped counter. Pages with a type set can't be mapped to
userspace.
But in general the kmemcg flag has nothing to do with mapping to
userspace. It only means that the page has been accounted by the page
allocator, so it has to be properly uncharged on release.
Some bpf maps are mapping the vmalloc-based memory to userspace, and their
memory can't be accounted because of this implementation detail.
This patchset removes this limitation by moving the PageKmemcg flag into
one of the free bits of the page->mem_cgroup pointer. Also it formalizes
accesses to the page->mem_cgroup and page->obj_cgroups using new helpers,
adds several checks and removes a couple of obsolete functions. As the
result the code became more robust with fewer open-coded bit tricks.
This patch (of 4):
Currently there are many open-coded reads of the page->mem_cgroup pointer,
as well as a couple of read helpers, which are barely used.
It creates an obstacle on a way to reuse some bits of the pointer for
storing additional bits of information. In fact, we already do this for
slab pages, where the last bit indicates that a pointer has an attached
vector of objcg pointers instead of a regular memcg pointer.
This commits uses 2 existing helpers and introduces a new helper to
converts all read sides to calls of these helpers:
struct mem_cgroup *page_memcg(struct page *page);
struct mem_cgroup *page_memcg_rcu(struct page *page);
struct mem_cgroup *page_memcg_check(struct page *page);
page_memcg_check() is intended to be used in cases when the page can be a
slab page and have a memcg pointer pointing at objcg vector. It does
check the lowest bit, and if set, returns NULL. page_memcg() contains a
VM_BUG_ON_PAGE() check for the page not being a slab page.
To make sure nobody uses a direct access, struct page's
mem_cgroup/obj_cgroups is converted to unsigned long memcg_data.
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Link: https://lkml.kernel.org/r/20201027001657.3398190-1-guro@fb.com
Link: https://lkml.kernel.org/r/20201027001657.3398190-2-guro@fb.com
Link: https://lore.kernel.org/bpf/20201201215900.3569844-2-guro@fb.com
2020-12-01 21:58:27 +00:00
|
|
|
return NULL;
|
|
|
|
|
mm: memcontrol: use obj_cgroup APIs to charge kmem pages
Since Roman's series "The new cgroup slab memory controller" applied.
All slab objects are charged via the new APIs of obj_cgroup. The new
APIs introduce a struct obj_cgroup to charge slab objects. It prevents
long-living objects from pinning the original memory cgroup in the
memory. But there are still some corner objects (e.g. allocations
larger than order-1 page on SLUB) which are not charged via the new
APIs. Those objects (include the pages which are allocated from buddy
allocator directly) are charged as kmem pages which still hold a
reference to the memory cgroup.
We want to reuse the obj_cgroup APIs to charge the kmem pages. If we do
that, we should store an object cgroup pointer to page->memcg_data for
the kmem pages.
Finally, page->memcg_data will have 3 different meanings.
1) For the slab pages, page->memcg_data points to an object cgroups
vector.
2) For the kmem pages (exclude the slab pages), page->memcg_data
points to an object cgroup.
3) For the user pages (e.g. the LRU pages), page->memcg_data points
to a memory cgroup.
We do not change the behavior of page_memcg() and page_memcg_rcu(). They
are also suitable for LRU pages and kmem pages. Why?
Because memory allocations pinning memcgs for a long time - it exists at a
larger scale and is causing recurring problems in the real world: page
cache doesn't get reclaimed for a long time, or is used by the second,
third, fourth, ... instance of the same job that was restarted into a new
cgroup every time. Unreclaimable dying cgroups pile up, waste memory, and
make page reclaim very inefficient.
We can convert LRU pages and most other raw memcg pins to the objcg
direction to fix this problem, and then the page->memcg will always point
to an object cgroup pointer. At that time, LRU pages and kmem pages will
be treated the same. The implementation of page_memcg() will remove the
kmem page check.
This patch aims to charge the kmem pages by using the new APIs of
obj_cgroup. Finally, the page->memcg_data of the kmem page points to an
object cgroup. We can use the __page_objcg() to get the object cgroup
associated with a kmem page. Or we can use page_memcg() to get the memory
cgroup associated with a kmem page, but caller must ensure that the
returned memcg won't be released (e.g. acquire the rcu_read_lock or
css_set_lock).
Link: https://lkml.kernel.org/r/20210401030141.37061-1-songmuchun@bytedance.com
Link: https://lkml.kernel.org/r/20210319163821.20704-6-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
[songmuchun@bytedance.com: fix forget to obtain the ref to objcg in split_page_memcg]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-04-30 05:56:52 +00:00
|
|
|
if (memcg_data & MEMCG_DATA_KMEM) {
|
|
|
|
struct obj_cgroup *objcg;
|
|
|
|
|
2024-03-21 16:36:31 +00:00
|
|
|
objcg = (void *)(memcg_data & ~OBJEXTS_FLAGS_MASK);
|
mm: memcontrol: use obj_cgroup APIs to charge kmem pages
Since Roman's series "The new cgroup slab memory controller" applied.
All slab objects are charged via the new APIs of obj_cgroup. The new
APIs introduce a struct obj_cgroup to charge slab objects. It prevents
long-living objects from pinning the original memory cgroup in the
memory. But there are still some corner objects (e.g. allocations
larger than order-1 page on SLUB) which are not charged via the new
APIs. Those objects (include the pages which are allocated from buddy
allocator directly) are charged as kmem pages which still hold a
reference to the memory cgroup.
We want to reuse the obj_cgroup APIs to charge the kmem pages. If we do
that, we should store an object cgroup pointer to page->memcg_data for
the kmem pages.
Finally, page->memcg_data will have 3 different meanings.
1) For the slab pages, page->memcg_data points to an object cgroups
vector.
2) For the kmem pages (exclude the slab pages), page->memcg_data
points to an object cgroup.
3) For the user pages (e.g. the LRU pages), page->memcg_data points
to a memory cgroup.
We do not change the behavior of page_memcg() and page_memcg_rcu(). They
are also suitable for LRU pages and kmem pages. Why?
Because memory allocations pinning memcgs for a long time - it exists at a
larger scale and is causing recurring problems in the real world: page
cache doesn't get reclaimed for a long time, or is used by the second,
third, fourth, ... instance of the same job that was restarted into a new
cgroup every time. Unreclaimable dying cgroups pile up, waste memory, and
make page reclaim very inefficient.
We can convert LRU pages and most other raw memcg pins to the objcg
direction to fix this problem, and then the page->memcg will always point
to an object cgroup pointer. At that time, LRU pages and kmem pages will
be treated the same. The implementation of page_memcg() will remove the
kmem page check.
This patch aims to charge the kmem pages by using the new APIs of
obj_cgroup. Finally, the page->memcg_data of the kmem page points to an
object cgroup. We can use the __page_objcg() to get the object cgroup
associated with a kmem page. Or we can use page_memcg() to get the memory
cgroup associated with a kmem page, but caller must ensure that the
returned memcg won't be released (e.g. acquire the rcu_read_lock or
css_set_lock).
Link: https://lkml.kernel.org/r/20210401030141.37061-1-songmuchun@bytedance.com
Link: https://lkml.kernel.org/r/20210319163821.20704-6-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
[songmuchun@bytedance.com: fix forget to obtain the ref to objcg in split_page_memcg]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-04-30 05:56:52 +00:00
|
|
|
return obj_cgroup_memcg(objcg);
|
|
|
|
}
|
|
|
|
|
2024-03-21 16:36:31 +00:00
|
|
|
return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK);
|
2020-12-01 21:58:30 +00:00
|
|
|
}
|
|
|
|
|
2022-12-30 07:08:42 +00:00
|
|
|
static inline struct mem_cgroup *page_memcg_check(struct page *page)
|
|
|
|
{
|
|
|
|
if (PageTail(page))
|
|
|
|
return NULL;
|
|
|
|
return folio_memcg_check((struct folio *)page);
|
|
|
|
}
|
|
|
|
|
2022-03-22 21:40:56 +00:00
|
|
|
static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
|
|
|
|
{
|
|
|
|
struct mem_cgroup *memcg;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
retry:
|
|
|
|
memcg = obj_cgroup_memcg(objcg);
|
|
|
|
if (unlikely(!css_tryget(&memcg->css)))
|
|
|
|
goto retry;
|
|
|
|
rcu_read_unlock();
|
|
|
|
|
|
|
|
return memcg;
|
|
|
|
}
|
|
|
|
|
2020-12-01 21:58:30 +00:00
|
|
|
/*
|
2021-06-28 18:59:26 +00:00
|
|
|
* folio_memcg_kmem - Check if the folio has the memcg_kmem flag set.
|
|
|
|
* @folio: Pointer to the folio.
|
2020-12-01 21:58:30 +00:00
|
|
|
*
|
2021-06-28 18:59:26 +00:00
|
|
|
* Checks if the folio has MemcgKmem flag set. The caller must ensure
|
|
|
|
* that the folio has an associated memory cgroup. It's not safe to call
|
|
|
|
* this function against some types of folios, e.g. slab folios.
|
2020-12-01 21:58:30 +00:00
|
|
|
*/
|
2021-06-28 18:59:26 +00:00
|
|
|
static inline bool folio_memcg_kmem(struct folio *folio)
|
2020-12-01 21:58:30 +00:00
|
|
|
{
|
2021-06-28 18:59:26 +00:00
|
|
|
VM_BUG_ON_PGFLAGS(PageTail(&folio->page), &folio->page);
|
2024-03-21 16:36:28 +00:00
|
|
|
VM_BUG_ON_FOLIO(folio->memcg_data & MEMCG_DATA_OBJEXTS, folio);
|
2021-06-28 18:59:26 +00:00
|
|
|
return folio->memcg_data & MEMCG_DATA_KMEM;
|
mm: memcontrol: Use helpers to read page's memcg data
Patch series "mm: allow mapping accounted kernel pages to userspace", v6.
Currently a non-slab kernel page which has been charged to a memory cgroup
can't be mapped to userspace. The underlying reason is simple: PageKmemcg
flag is defined as a page type (like buddy, offline, etc), so it takes a
bit from a page->mapped counter. Pages with a type set can't be mapped to
userspace.
But in general the kmemcg flag has nothing to do with mapping to
userspace. It only means that the page has been accounted by the page
allocator, so it has to be properly uncharged on release.
Some bpf maps are mapping the vmalloc-based memory to userspace, and their
memory can't be accounted because of this implementation detail.
This patchset removes this limitation by moving the PageKmemcg flag into
one of the free bits of the page->mem_cgroup pointer. Also it formalizes
accesses to the page->mem_cgroup and page->obj_cgroups using new helpers,
adds several checks and removes a couple of obsolete functions. As the
result the code became more robust with fewer open-coded bit tricks.
This patch (of 4):
Currently there are many open-coded reads of the page->mem_cgroup pointer,
as well as a couple of read helpers, which are barely used.
It creates an obstacle on a way to reuse some bits of the pointer for
storing additional bits of information. In fact, we already do this for
slab pages, where the last bit indicates that a pointer has an attached
vector of objcg pointers instead of a regular memcg pointer.
This commits uses 2 existing helpers and introduces a new helper to
converts all read sides to calls of these helpers:
struct mem_cgroup *page_memcg(struct page *page);
struct mem_cgroup *page_memcg_rcu(struct page *page);
struct mem_cgroup *page_memcg_check(struct page *page);
page_memcg_check() is intended to be used in cases when the page can be a
slab page and have a memcg pointer pointing at objcg vector. It does
check the lowest bit, and if set, returns NULL. page_memcg() contains a
VM_BUG_ON_PAGE() check for the page not being a slab page.
To make sure nobody uses a direct access, struct page's
mem_cgroup/obj_cgroups is converted to unsigned long memcg_data.
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Link: https://lkml.kernel.org/r/20201027001657.3398190-1-guro@fb.com
Link: https://lkml.kernel.org/r/20201027001657.3398190-2-guro@fb.com
Link: https://lore.kernel.org/bpf/20201201215900.3569844-2-guro@fb.com
2020-12-01 21:58:27 +00:00
|
|
|
}
|
|
|
|
|
2021-06-28 18:59:26 +00:00
|
|
|
static inline bool PageMemcgKmem(struct page *page)
|
|
|
|
{
|
|
|
|
return folio_memcg_kmem(page_folio(page));
|
|
|
|
}
|
|
|
|
|
2018-08-17 22:48:06 +00:00
|
|
|
static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
|
|
|
|
{
|
|
|
|
return (memcg == root_mem_cgroup);
|
|
|
|
}
|
|
|
|
|
2016-03-15 21:57:16 +00:00
|
|
|
static inline bool mem_cgroup_disabled(void)
|
|
|
|
{
|
|
|
|
return !cgroup_subsys_enabled(memory_cgrp_subsys);
|
|
|
|
}
|
|
|
|
|
2021-08-20 02:04:21 +00:00
|
|
|
static inline void mem_cgroup_protection(struct mem_cgroup *root,
|
|
|
|
struct mem_cgroup *memcg,
|
|
|
|
unsigned long *min,
|
|
|
|
unsigned long *low)
|
mm, memcg: proportional memory.{low,min} reclaim
cgroup v2 introduces two memory protection thresholds: memory.low
(best-effort) and memory.min (hard protection). While they generally do
what they say on the tin, there is a limitation in their implementation
that makes them difficult to use effectively: that cliff behaviour often
manifests when they become eligible for reclaim. This patch implements
more intuitive and usable behaviour, where we gradually mount more
reclaim pressure as cgroups further and further exceed their protection
thresholds.
This cliff edge behaviour happens because we only choose whether or not
to reclaim based on whether the memcg is within its protection limits
(see the use of mem_cgroup_protected in shrink_node), but we don't vary
our reclaim behaviour based on this information. Imagine the following
timeline, with the numbers the lruvec size in this zone:
1. memory.low=1000000, memory.current=999999. 0 pages may be scanned.
2. memory.low=1000000, memory.current=1000000. 0 pages may be scanned.
3. memory.low=1000000, memory.current=1000001. 1000001* pages may be
scanned. (?!)
* Of course, we won't usually scan all available pages in the zone even
without this patch because of scan control priority, over-reclaim
protection, etc. However, as shown by the tests at the end, these
techniques don't sufficiently throttle such an extreme change in input,
so cliff-like behaviour isn't really averted by their existence alone.
Here's an example of how this plays out in practice. At Facebook, we are
trying to protect various workloads from "system" software, like
configuration management tools, metric collectors, etc (see this[0] case
study). In order to find a suitable memory.low value, we start by
determining the expected memory range within which the workload will be
comfortable operating. This isn't an exact science -- memory usage deemed
"comfortable" will vary over time due to user behaviour, differences in
composition of work, etc, etc. As such we need to ballpark memory.low,
but doing this is currently problematic:
1. If we end up setting it too low for the workload, it won't have
*any* effect (see discussion above). The group will receive the full
weight of reclaim and won't have any priority while competing with the
less important system software, as if we had no memory.low configured
at all.
2. Because of this behaviour, we end up erring on the side of setting
it too high, such that the comfort range is reliably covered. However,
protected memory is completely unavailable to the rest of the system,
so we might cause undue memory and IO pressure there when we *know* we
have some elasticity in the workload.
3. Even if we get the value totally right, smack in the middle of the
comfort zone, we get extreme jumps between no pressure and full
pressure that cause unpredictable pressure spikes in the workload due
to the current binary reclaim behaviour.
With this patch, we can set it to our ballpark estimation without too much
worry. Any undesirable behaviour, such as too much or too little reclaim
pressure on the workload or system will be proportional to how far our
estimation is off. This means we can set memory.low much more
conservatively and thus waste less resources *without* the risk of the
workload falling off a cliff if we overshoot.
As a more abstract technical description, this unintuitive behaviour
results in having to give high-priority workloads a large protection
buffer on top of their expected usage to function reliably, as otherwise
we have abrupt periods of dramatically increased memory pressure which
hamper performance. Having to set these thresholds so high wastes
resources and generally works against the principle of work conservation.
In addition, having proportional memory reclaim behaviour has other
benefits. Most notably, before this patch it's basically mandatory to set
memory.low to a higher than desirable value because otherwise as soon as
you exceed memory.low, all protection is lost, and all pages are eligible
to scan again. By contrast, having a gradual ramp in reclaim pressure
means that you now still get some protection when thresholds are exceeded,
which means that one can now be more comfortable setting memory.low to
lower values without worrying that all protection will be lost. This is
important because workingset size is really hard to know exactly,
especially with variable workloads, so at least getting *some* protection
if your workingset size grows larger than you expect increases user
confidence in setting memory.low without a huge buffer on top being
needed.
Thanks a lot to Johannes Weiner and Tejun Heo for their advice and
assistance in thinking about how to make this work better.
In testing these changes, I intended to verify that:
1. Changes in page scanning become gradual and proportional instead of
binary.
To test this, I experimented stepping further and further down
memory.low protection on a workload that floats around 19G workingset
when under memory.low protection, watching page scan rates for the
workload cgroup:
+------------+-----------------+--------------------+--------------+
| memory.low | test (pgscan/s) | control (pgscan/s) | % of control |
+------------+-----------------+--------------------+--------------+
| 21G | 0 | 0 | N/A |
| 17G | 867 | 3799 | 23% |
| 12G | 1203 | 3543 | 34% |
| 8G | 2534 | 3979 | 64% |
| 4G | 3980 | 4147 | 96% |
| 0 | 3799 | 3980 | 95% |
+------------+-----------------+--------------------+--------------+
As you can see, the test kernel (with a kernel containing this
patch) ramps up page scanning significantly more gradually than the
control kernel (without this patch).
2. More gradual ramp up in reclaim aggression doesn't result in
premature OOMs.
To test this, I wrote a script that slowly increments the number of
pages held by stress(1)'s --vm-keep mode until a production system
entered severe overall memory contention. This script runs in a highly
protected slice taking up the majority of available system memory.
Watching vmstat revealed that page scanning continued essentially
nominally between test and control, without causing forward reclaim
progress to become arrested.
[0]: https://facebookmicrosites.github.io/cgroup2/docs/overview.html#case-study-the-fbtax2-project
[akpm@linux-foundation.org: reflow block comments to fit in 80 cols]
[chris@chrisdown.name: handle cgroup_disable=memory when getting memcg protection]
Link: http://lkml.kernel.org/r/20190201045711.GA18302@chrisdown.name
Link: http://lkml.kernel.org/r/20190124014455.GA6396@chrisdown.name
Signed-off-by: Chris Down <chris@chrisdown.name>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-10-07 00:58:32 +00:00
|
|
|
{
|
2021-08-20 02:04:21 +00:00
|
|
|
*min = *low = 0;
|
|
|
|
|
mm, memcg: make scan aggression always exclude protection
This patch is an incremental improvement on the existing
memory.{low,min} relative reclaim work to base its scan pressure
calculations on how much protection is available compared to the current
usage, rather than how much the current usage is over some protection
threshold.
This change doesn't change the experience for the user in the normal
case too much. One benefit is that it replaces the (somewhat arbitrary)
100% cutoff with an indefinite slope, which makes it easier to ballpark
a memory.low value.
As well as this, the old methodology doesn't quite apply generically to
machines with varying amounts of physical memory. Let's say we have a
top level cgroup, workload.slice, and another top level cgroup,
system-management.slice. We want to roughly give 12G to
system-management.slice, so on a 32GB machine we set memory.low to 20GB
in workload.slice, and on a 64GB machine we set memory.low to 52GB.
However, because these are relative amounts to the total machine size,
while the amount of memory we want to generally be willing to yield to
system.slice is absolute (12G), we end up putting more pressure on
system.slice just because we have a larger machine and a larger workload
to fill it, which seems fairly unintuitive. With this new behaviour, we
don't end up with this unintended side effect.
Previously the way that memory.low protection works is that if you are
50% over a certain baseline, you get 50% of your normal scan pressure.
This is certainly better than the previous cliff-edge behaviour, but it
can be improved even further by always considering memory under the
currently enforced protection threshold to be out of bounds. This means
that we can set relatively low memory.low thresholds for variable or
bursty workloads while still getting a reasonable level of protection,
whereas with the previous version we may still trivially hit the 100%
clamp. The previous 100% clamp is also somewhat arbitrary, whereas this
one is more concretely based on the currently enforced protection
threshold, which is likely easier to reason about.
There is also a subtle issue with the way that proportional reclaim
worked previously -- it promotes having no memory.low, since it makes
pressure higher during low reclaim. This happens because we base our
scan pressure modulation on how far memory.current is between memory.min
and memory.low, but if memory.low is unset, we only use the overage
method. In most cromulent configurations, this then means that we end
up with *more* pressure than with no memory.low at all when we're in low
reclaim, which is not really very usable or expected.
With this patch, memory.low and memory.min affect reclaim pressure in a
more understandable and composable way. For example, from a user
standpoint, "protected" memory now remains untouchable from a reclaim
aggression standpoint, and users can also have more confidence that
bursty workloads will still receive some amount of guaranteed
protection.
Link: http://lkml.kernel.org/r/20190322160307.GA3316@chrisdown.name
Signed-off-by: Chris Down <chris@chrisdown.name>
Reviewed-by: Roman Gushchin <guro@fb.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-10-07 00:58:38 +00:00
|
|
|
if (mem_cgroup_disabled())
|
2021-08-20 02:04:21 +00:00
|
|
|
return;
|
mm, memcg: make scan aggression always exclude protection
This patch is an incremental improvement on the existing
memory.{low,min} relative reclaim work to base its scan pressure
calculations on how much protection is available compared to the current
usage, rather than how much the current usage is over some protection
threshold.
This change doesn't change the experience for the user in the normal
case too much. One benefit is that it replaces the (somewhat arbitrary)
100% cutoff with an indefinite slope, which makes it easier to ballpark
a memory.low value.
As well as this, the old methodology doesn't quite apply generically to
machines with varying amounts of physical memory. Let's say we have a
top level cgroup, workload.slice, and another top level cgroup,
system-management.slice. We want to roughly give 12G to
system-management.slice, so on a 32GB machine we set memory.low to 20GB
in workload.slice, and on a 64GB machine we set memory.low to 52GB.
However, because these are relative amounts to the total machine size,
while the amount of memory we want to generally be willing to yield to
system.slice is absolute (12G), we end up putting more pressure on
system.slice just because we have a larger machine and a larger workload
to fill it, which seems fairly unintuitive. With this new behaviour, we
don't end up with this unintended side effect.
Previously the way that memory.low protection works is that if you are
50% over a certain baseline, you get 50% of your normal scan pressure.
This is certainly better than the previous cliff-edge behaviour, but it
can be improved even further by always considering memory under the
currently enforced protection threshold to be out of bounds. This means
that we can set relatively low memory.low thresholds for variable or
bursty workloads while still getting a reasonable level of protection,
whereas with the previous version we may still trivially hit the 100%
clamp. The previous 100% clamp is also somewhat arbitrary, whereas this
one is more concretely based on the currently enforced protection
threshold, which is likely easier to reason about.
There is also a subtle issue with the way that proportional reclaim
worked previously -- it promotes having no memory.low, since it makes
pressure higher during low reclaim. This happens because we base our
scan pressure modulation on how far memory.current is between memory.min
and memory.low, but if memory.low is unset, we only use the overage
method. In most cromulent configurations, this then means that we end
up with *more* pressure than with no memory.low at all when we're in low
reclaim, which is not really very usable or expected.
With this patch, memory.low and memory.min affect reclaim pressure in a
more understandable and composable way. For example, from a user
standpoint, "protected" memory now remains untouchable from a reclaim
aggression standpoint, and users can also have more confidence that
bursty workloads will still receive some amount of guaranteed
protection.
Link: http://lkml.kernel.org/r/20190322160307.GA3316@chrisdown.name
Signed-off-by: Chris Down <chris@chrisdown.name>
Reviewed-by: Roman Gushchin <guro@fb.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-10-07 00:58:38 +00:00
|
|
|
|
mm, memcg: avoid stale protection values when cgroup is above protection
Patch series "mm, memcg: memory.{low,min} reclaim fix & cleanup", v4.
This series contains a fix for a edge case in my earlier protection
calculation patches, and a patch to make the area overall a little more
robust to hopefully help avoid this in future.
This patch (of 2):
A cgroup can have both memory protection and a memory limit to isolate it
from its siblings in both directions - for example, to prevent it from
being shrunk below 2G under high pressure from outside, but also from
growing beyond 4G under low pressure.
Commit 9783aa9917f8 ("mm, memcg: proportional memory.{low,min} reclaim")
implemented proportional scan pressure so that multiple siblings in excess
of their protection settings don't get reclaimed equally but instead in
accordance to their unprotected portion.
During limit reclaim, this proportionality shouldn't apply of course:
there is no competition, all pressure is from within the cgroup and should
be applied as such. Reclaim should operate at full efficiency.
However, mem_cgroup_protected() never expected anybody to look at the
effective protection values when it indicated that the cgroup is above its
protection. As a result, a query during limit reclaim may return stale
protection values that were calculated by a previous reclaim cycle in
which the cgroup did have siblings.
When this happens, reclaim is unnecessarily hesitant and potentially slow
to meet the desired limit. In theory this could lead to premature OOM
kills, although it's not obvious this has occurred in practice.
Workaround the problem by special casing reclaim roots in
mem_cgroup_protection. These memcgs are never participating in the
reclaim protection because the reclaim is internal.
We have to ignore effective protection values for reclaim roots because
mem_cgroup_protected might be called from racing reclaim contexts with
different roots. Calculation is relying on root -> leaf tree traversal
therefore top-down reclaim protection invariants should hold. The only
exception is the reclaim root which should have effective protection set
to 0 but that would be problematic for the following setup:
Let's have global and A's reclaim in parallel:
|
A (low=2G, usage = 3G, max = 3G, children_low_usage = 1.5G)
|\
| C (low = 1G, usage = 2.5G)
B (low = 1G, usage = 0.5G)
for A reclaim we have
B.elow = B.low
C.elow = C.low
For the global reclaim
A.elow = A.low
B.elow = min(B.usage, B.low) because children_low_usage <= A.elow
C.elow = min(C.usage, C.low)
With the effective values resetting we have A reclaim
A.elow = 0
B.elow = B.low
C.elow = C.low
and global reclaim could see the above and then
B.elow = C.elow = 0 because children_low_usage > A.elow
Which means that protected memcgs would get reclaimed.
In future we would like to make mem_cgroup_protected more robust against
racing reclaim contexts but that is likely more complex solution than this
simple workaround.
[hannes@cmpxchg.org - large part of the changelog]
[mhocko@suse.com - workaround explanation]
[chris@chrisdown.name - retitle]
Fixes: 9783aa9917f8 ("mm, memcg: proportional memory.{low,min} reclaim")
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: Chris Down <chris@chrisdown.name>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Chris Down <chris@chrisdown.name>
Acked-by: Roman Gushchin <guro@fb.com>
Link: http://lkml.kernel.org/r/cover.1594638158.git.chris@chrisdown.name
Link: http://lkml.kernel.org/r/044fb8ecffd001c7905d27c0c2ad998069fdc396.1594638158.git.chris@chrisdown.name
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-08-07 06:22:01 +00:00
|
|
|
/*
|
|
|
|
* There is no reclaim protection applied to a targeted reclaim.
|
|
|
|
* We are special casing this specific case here because
|
2023-07-27 11:59:34 +00:00
|
|
|
* mem_cgroup_calculate_protection is not robust enough to keep
|
mm, memcg: avoid stale protection values when cgroup is above protection
Patch series "mm, memcg: memory.{low,min} reclaim fix & cleanup", v4.
This series contains a fix for a edge case in my earlier protection
calculation patches, and a patch to make the area overall a little more
robust to hopefully help avoid this in future.
This patch (of 2):
A cgroup can have both memory protection and a memory limit to isolate it
from its siblings in both directions - for example, to prevent it from
being shrunk below 2G under high pressure from outside, but also from
growing beyond 4G under low pressure.
Commit 9783aa9917f8 ("mm, memcg: proportional memory.{low,min} reclaim")
implemented proportional scan pressure so that multiple siblings in excess
of their protection settings don't get reclaimed equally but instead in
accordance to their unprotected portion.
During limit reclaim, this proportionality shouldn't apply of course:
there is no competition, all pressure is from within the cgroup and should
be applied as such. Reclaim should operate at full efficiency.
However, mem_cgroup_protected() never expected anybody to look at the
effective protection values when it indicated that the cgroup is above its
protection. As a result, a query during limit reclaim may return stale
protection values that were calculated by a previous reclaim cycle in
which the cgroup did have siblings.
When this happens, reclaim is unnecessarily hesitant and potentially slow
to meet the desired limit. In theory this could lead to premature OOM
kills, although it's not obvious this has occurred in practice.
Workaround the problem by special casing reclaim roots in
mem_cgroup_protection. These memcgs are never participating in the
reclaim protection because the reclaim is internal.
We have to ignore effective protection values for reclaim roots because
mem_cgroup_protected might be called from racing reclaim contexts with
different roots. Calculation is relying on root -> leaf tree traversal
therefore top-down reclaim protection invariants should hold. The only
exception is the reclaim root which should have effective protection set
to 0 but that would be problematic for the following setup:
Let's have global and A's reclaim in parallel:
|
A (low=2G, usage = 3G, max = 3G, children_low_usage = 1.5G)
|\
| C (low = 1G, usage = 2.5G)
B (low = 1G, usage = 0.5G)
for A reclaim we have
B.elow = B.low
C.elow = C.low
For the global reclaim
A.elow = A.low
B.elow = min(B.usage, B.low) because children_low_usage <= A.elow
C.elow = min(C.usage, C.low)
With the effective values resetting we have A reclaim
A.elow = 0
B.elow = B.low
C.elow = C.low
and global reclaim could see the above and then
B.elow = C.elow = 0 because children_low_usage > A.elow
Which means that protected memcgs would get reclaimed.
In future we would like to make mem_cgroup_protected more robust against
racing reclaim contexts but that is likely more complex solution than this
simple workaround.
[hannes@cmpxchg.org - large part of the changelog]
[mhocko@suse.com - workaround explanation]
[chris@chrisdown.name - retitle]
Fixes: 9783aa9917f8 ("mm, memcg: proportional memory.{low,min} reclaim")
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: Chris Down <chris@chrisdown.name>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Chris Down <chris@chrisdown.name>
Acked-by: Roman Gushchin <guro@fb.com>
Link: http://lkml.kernel.org/r/cover.1594638158.git.chris@chrisdown.name
Link: http://lkml.kernel.org/r/044fb8ecffd001c7905d27c0c2ad998069fdc396.1594638158.git.chris@chrisdown.name
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-08-07 06:22:01 +00:00
|
|
|
* the protection invariant for calculated effective values for
|
|
|
|
* parallel reclaimers with different reclaim target. This is
|
|
|
|
* especially a problem for tail memcgs (as they have pages on LRU)
|
|
|
|
* which would want to have effective values 0 for targeted reclaim
|
|
|
|
* but a different value for external reclaim.
|
|
|
|
*
|
|
|
|
* Example
|
|
|
|
* Let's have global and A's reclaim in parallel:
|
|
|
|
* |
|
|
|
|
* A (low=2G, usage = 3G, max = 3G, children_low_usage = 1.5G)
|
|
|
|
* |\
|
|
|
|
* | C (low = 1G, usage = 2.5G)
|
|
|
|
* B (low = 1G, usage = 0.5G)
|
|
|
|
*
|
|
|
|
* For the global reclaim
|
|
|
|
* A.elow = A.low
|
|
|
|
* B.elow = min(B.usage, B.low) because children_low_usage <= A.elow
|
|
|
|
* C.elow = min(C.usage, C.low)
|
|
|
|
*
|
|
|
|
* With the effective values resetting we have A reclaim
|
|
|
|
* A.elow = 0
|
|
|
|
* B.elow = B.low
|
|
|
|
* C.elow = C.low
|
|
|
|
*
|
|
|
|
* If the global reclaim races with A's reclaim then
|
|
|
|
* B.elow = C.elow = 0 because children_low_usage > A.elow)
|
|
|
|
* is possible and reclaiming B would be violating the protection.
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
if (root == memcg)
|
2021-08-20 02:04:21 +00:00
|
|
|
return;
|
mm, memcg: proportional memory.{low,min} reclaim
cgroup v2 introduces two memory protection thresholds: memory.low
(best-effort) and memory.min (hard protection). While they generally do
what they say on the tin, there is a limitation in their implementation
that makes them difficult to use effectively: that cliff behaviour often
manifests when they become eligible for reclaim. This patch implements
more intuitive and usable behaviour, where we gradually mount more
reclaim pressure as cgroups further and further exceed their protection
thresholds.
This cliff edge behaviour happens because we only choose whether or not
to reclaim based on whether the memcg is within its protection limits
(see the use of mem_cgroup_protected in shrink_node), but we don't vary
our reclaim behaviour based on this information. Imagine the following
timeline, with the numbers the lruvec size in this zone:
1. memory.low=1000000, memory.current=999999. 0 pages may be scanned.
2. memory.low=1000000, memory.current=1000000. 0 pages may be scanned.
3. memory.low=1000000, memory.current=1000001. 1000001* pages may be
scanned. (?!)
* Of course, we won't usually scan all available pages in the zone even
without this patch because of scan control priority, over-reclaim
protection, etc. However, as shown by the tests at the end, these
techniques don't sufficiently throttle such an extreme change in input,
so cliff-like behaviour isn't really averted by their existence alone.
Here's an example of how this plays out in practice. At Facebook, we are
trying to protect various workloads from "system" software, like
configuration management tools, metric collectors, etc (see this[0] case
study). In order to find a suitable memory.low value, we start by
determining the expected memory range within which the workload will be
comfortable operating. This isn't an exact science -- memory usage deemed
"comfortable" will vary over time due to user behaviour, differences in
composition of work, etc, etc. As such we need to ballpark memory.low,
but doing this is currently problematic:
1. If we end up setting it too low for the workload, it won't have
*any* effect (see discussion above). The group will receive the full
weight of reclaim and won't have any priority while competing with the
less important system software, as if we had no memory.low configured
at all.
2. Because of this behaviour, we end up erring on the side of setting
it too high, such that the comfort range is reliably covered. However,
protected memory is completely unavailable to the rest of the system,
so we might cause undue memory and IO pressure there when we *know* we
have some elasticity in the workload.
3. Even if we get the value totally right, smack in the middle of the
comfort zone, we get extreme jumps between no pressure and full
pressure that cause unpredictable pressure spikes in the workload due
to the current binary reclaim behaviour.
With this patch, we can set it to our ballpark estimation without too much
worry. Any undesirable behaviour, such as too much or too little reclaim
pressure on the workload or system will be proportional to how far our
estimation is off. This means we can set memory.low much more
conservatively and thus waste less resources *without* the risk of the
workload falling off a cliff if we overshoot.
As a more abstract technical description, this unintuitive behaviour
results in having to give high-priority workloads a large protection
buffer on top of their expected usage to function reliably, as otherwise
we have abrupt periods of dramatically increased memory pressure which
hamper performance. Having to set these thresholds so high wastes
resources and generally works against the principle of work conservation.
In addition, having proportional memory reclaim behaviour has other
benefits. Most notably, before this patch it's basically mandatory to set
memory.low to a higher than desirable value because otherwise as soon as
you exceed memory.low, all protection is lost, and all pages are eligible
to scan again. By contrast, having a gradual ramp in reclaim pressure
means that you now still get some protection when thresholds are exceeded,
which means that one can now be more comfortable setting memory.low to
lower values without worrying that all protection will be lost. This is
important because workingset size is really hard to know exactly,
especially with variable workloads, so at least getting *some* protection
if your workingset size grows larger than you expect increases user
confidence in setting memory.low without a huge buffer on top being
needed.
Thanks a lot to Johannes Weiner and Tejun Heo for their advice and
assistance in thinking about how to make this work better.
In testing these changes, I intended to verify that:
1. Changes in page scanning become gradual and proportional instead of
binary.
To test this, I experimented stepping further and further down
memory.low protection on a workload that floats around 19G workingset
when under memory.low protection, watching page scan rates for the
workload cgroup:
+------------+-----------------+--------------------+--------------+
| memory.low | test (pgscan/s) | control (pgscan/s) | % of control |
+------------+-----------------+--------------------+--------------+
| 21G | 0 | 0 | N/A |
| 17G | 867 | 3799 | 23% |
| 12G | 1203 | 3543 | 34% |
| 8G | 2534 | 3979 | 64% |
| 4G | 3980 | 4147 | 96% |
| 0 | 3799 | 3980 | 95% |
+------------+-----------------+--------------------+--------------+
As you can see, the test kernel (with a kernel containing this
patch) ramps up page scanning significantly more gradually than the
control kernel (without this patch).
2. More gradual ramp up in reclaim aggression doesn't result in
premature OOMs.
To test this, I wrote a script that slowly increments the number of
pages held by stress(1)'s --vm-keep mode until a production system
entered severe overall memory contention. This script runs in a highly
protected slice taking up the majority of available system memory.
Watching vmstat revealed that page scanning continued essentially
nominally between test and control, without causing forward reclaim
progress to become arrested.
[0]: https://facebookmicrosites.github.io/cgroup2/docs/overview.html#case-study-the-fbtax2-project
[akpm@linux-foundation.org: reflow block comments to fit in 80 cols]
[chris@chrisdown.name: handle cgroup_disable=memory when getting memcg protection]
Link: http://lkml.kernel.org/r/20190201045711.GA18302@chrisdown.name
Link: http://lkml.kernel.org/r/20190124014455.GA6396@chrisdown.name
Signed-off-by: Chris Down <chris@chrisdown.name>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-10-07 00:58:32 +00:00
|
|
|
|
2021-08-20 02:04:21 +00:00
|
|
|
*min = READ_ONCE(memcg->memory.emin);
|
|
|
|
*low = READ_ONCE(memcg->memory.elow);
|
mm, memcg: proportional memory.{low,min} reclaim
cgroup v2 introduces two memory protection thresholds: memory.low
(best-effort) and memory.min (hard protection). While they generally do
what they say on the tin, there is a limitation in their implementation
that makes them difficult to use effectively: that cliff behaviour often
manifests when they become eligible for reclaim. This patch implements
more intuitive and usable behaviour, where we gradually mount more
reclaim pressure as cgroups further and further exceed their protection
thresholds.
This cliff edge behaviour happens because we only choose whether or not
to reclaim based on whether the memcg is within its protection limits
(see the use of mem_cgroup_protected in shrink_node), but we don't vary
our reclaim behaviour based on this information. Imagine the following
timeline, with the numbers the lruvec size in this zone:
1. memory.low=1000000, memory.current=999999. 0 pages may be scanned.
2. memory.low=1000000, memory.current=1000000. 0 pages may be scanned.
3. memory.low=1000000, memory.current=1000001. 1000001* pages may be
scanned. (?!)
* Of course, we won't usually scan all available pages in the zone even
without this patch because of scan control priority, over-reclaim
protection, etc. However, as shown by the tests at the end, these
techniques don't sufficiently throttle such an extreme change in input,
so cliff-like behaviour isn't really averted by their existence alone.
Here's an example of how this plays out in practice. At Facebook, we are
trying to protect various workloads from "system" software, like
configuration management tools, metric collectors, etc (see this[0] case
study). In order to find a suitable memory.low value, we start by
determining the expected memory range within which the workload will be
comfortable operating. This isn't an exact science -- memory usage deemed
"comfortable" will vary over time due to user behaviour, differences in
composition of work, etc, etc. As such we need to ballpark memory.low,
but doing this is currently problematic:
1. If we end up setting it too low for the workload, it won't have
*any* effect (see discussion above). The group will receive the full
weight of reclaim and won't have any priority while competing with the
less important system software, as if we had no memory.low configured
at all.
2. Because of this behaviour, we end up erring on the side of setting
it too high, such that the comfort range is reliably covered. However,
protected memory is completely unavailable to the rest of the system,
so we might cause undue memory and IO pressure there when we *know* we
have some elasticity in the workload.
3. Even if we get the value totally right, smack in the middle of the
comfort zone, we get extreme jumps between no pressure and full
pressure that cause unpredictable pressure spikes in the workload due
to the current binary reclaim behaviour.
With this patch, we can set it to our ballpark estimation without too much
worry. Any undesirable behaviour, such as too much or too little reclaim
pressure on the workload or system will be proportional to how far our
estimation is off. This means we can set memory.low much more
conservatively and thus waste less resources *without* the risk of the
workload falling off a cliff if we overshoot.
As a more abstract technical description, this unintuitive behaviour
results in having to give high-priority workloads a large protection
buffer on top of their expected usage to function reliably, as otherwise
we have abrupt periods of dramatically increased memory pressure which
hamper performance. Having to set these thresholds so high wastes
resources and generally works against the principle of work conservation.
In addition, having proportional memory reclaim behaviour has other
benefits. Most notably, before this patch it's basically mandatory to set
memory.low to a higher than desirable value because otherwise as soon as
you exceed memory.low, all protection is lost, and all pages are eligible
to scan again. By contrast, having a gradual ramp in reclaim pressure
means that you now still get some protection when thresholds are exceeded,
which means that one can now be more comfortable setting memory.low to
lower values without worrying that all protection will be lost. This is
important because workingset size is really hard to know exactly,
especially with variable workloads, so at least getting *some* protection
if your workingset size grows larger than you expect increases user
confidence in setting memory.low without a huge buffer on top being
needed.
Thanks a lot to Johannes Weiner and Tejun Heo for their advice and
assistance in thinking about how to make this work better.
In testing these changes, I intended to verify that:
1. Changes in page scanning become gradual and proportional instead of
binary.
To test this, I experimented stepping further and further down
memory.low protection on a workload that floats around 19G workingset
when under memory.low protection, watching page scan rates for the
workload cgroup:
+------------+-----------------+--------------------+--------------+
| memory.low | test (pgscan/s) | control (pgscan/s) | % of control |
+------------+-----------------+--------------------+--------------+
| 21G | 0 | 0 | N/A |
| 17G | 867 | 3799 | 23% |
| 12G | 1203 | 3543 | 34% |
| 8G | 2534 | 3979 | 64% |
| 4G | 3980 | 4147 | 96% |
| 0 | 3799 | 3980 | 95% |
+------------+-----------------+--------------------+--------------+
As you can see, the test kernel (with a kernel containing this
patch) ramps up page scanning significantly more gradually than the
control kernel (without this patch).
2. More gradual ramp up in reclaim aggression doesn't result in
premature OOMs.
To test this, I wrote a script that slowly increments the number of
pages held by stress(1)'s --vm-keep mode until a production system
entered severe overall memory contention. This script runs in a highly
protected slice taking up the majority of available system memory.
Watching vmstat revealed that page scanning continued essentially
nominally between test and control, without causing forward reclaim
progress to become arrested.
[0]: https://facebookmicrosites.github.io/cgroup2/docs/overview.html#case-study-the-fbtax2-project
[akpm@linux-foundation.org: reflow block comments to fit in 80 cols]
[chris@chrisdown.name: handle cgroup_disable=memory when getting memcg protection]
Link: http://lkml.kernel.org/r/20190201045711.GA18302@chrisdown.name
Link: http://lkml.kernel.org/r/20190124014455.GA6396@chrisdown.name
Signed-off-by: Chris Down <chris@chrisdown.name>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-10-07 00:58:32 +00:00
|
|
|
}
|
|
|
|
|
2020-08-07 06:22:05 +00:00
|
|
|
void mem_cgroup_calculate_protection(struct mem_cgroup *root,
|
|
|
|
struct mem_cgroup *memcg);
|
|
|
|
|
mm: memcg: fix stale protection of reclaim target memcg
Patch series "mm: memcg: fix protection of reclaim target memcg", v3.
This series fixes a bug in calculating the protection of the reclaim
target memcg where we end up using stale effective protection values from
the last reclaim operation, instead of completely ignoring the protection
of the reclaim target as intended. More detailed explanation and examples
in patch 1, which includes the fix. Patches 2 & 3 introduce a selftest
case that catches the bug.
This patch (of 3):
When we are doing memcg reclaim, the intended behavior is that we
ignore any protection (memory.min, memory.low) of the target memcg (but
not its children). Ever since the patch pointed to by the "Fixes" tag,
we actually read a stale value for the target memcg protection when
deciding whether to skip the memcg or not because it is protected. If
the stale value happens to be high enough, we don't reclaim from the
target memcg.
Essentially, in some cases we may falsely skip reclaiming from the
target memcg of reclaim because we read a stale protection value from
last time we reclaimed from it.
During reclaim, mem_cgroup_calculate_protection() is used to determine the
effective protection (emin and elow) values of a memcg. The protection of
the reclaim target is ignored, but we cannot set their effective
protection to 0 due to a limitation of the current implementation (see
comment in mem_cgroup_protection()). Instead, we leave their effective
protection values unchaged, and later ignore it in
mem_cgroup_protection().
However, mem_cgroup_protection() is called later in
shrink_lruvec()->get_scan_count(), which is after the
mem_cgroup_below_{min/low}() checks in shrink_node_memcgs(). As a result,
the stale effective protection values of the target memcg may lead us to
skip reclaiming from the target memcg entirely, before calling
shrink_lruvec(). This can be even worse with recursive protection, where
the stale target memcg protection can be higher than its standalone
protection. See two examples below (a similar version of example (a) is
added to test_memcontrol in a later patch).
(a) A simple example with proactive reclaim is as follows. Consider the
following hierarchy:
ROOT
|
A
|
B (memory.min = 10M)
Consider the following scenario:
- B has memory.current = 10M.
- The system undergoes global reclaim (or memcg reclaim in A).
- In shrink_node_memcgs():
- mem_cgroup_calculate_protection() calculates the effective min (emin)
of B as 10M.
- mem_cgroup_below_min() returns true for B, we do not reclaim from B.
- Now if we want to reclaim 5M from B using proactive reclaim
(memory.reclaim), we should be able to, as the protection of the
target memcg should be ignored.
- In shrink_node_memcgs():
- mem_cgroup_calculate_protection() immediately returns for B without
doing anything, as B is the target memcg, relying on
mem_cgroup_protection() to ignore B's stale effective min (still 10M).
- mem_cgroup_below_min() reads the stale effective min for B and we
skip it instead of ignoring its protection as intended, as we never
reach mem_cgroup_protection().
(b) An more complex example with recursive protection is as follows.
Consider the following hierarchy with memory_recursiveprot:
ROOT
|
A (memory.min = 50M)
|
B (memory.min = 10M, memory.high = 40M)
Consider the following scenario:
- B has memory.current = 35M.
- The system undergoes global reclaim (target memcg is NULL).
- B will have an effective min of 50M (all of A's unclaimed protection).
- B will not be reclaimed from.
- Now allocate 10M more memory in B, pushing it above it's high limit.
- The system undergoes memcg reclaim from B (target memcg is B).
- Like example (a), we do nothing in mem_cgroup_calculate_protection(),
then call mem_cgroup_below_min(), which will read the stale effective
min for B (50M) and skip it. In this case, it's even worse because we
are not just considering B's standalone protection (10M), but we are
reading a much higher stale protection (50M) which will cause us to not
reclaim from B at all.
This is an artifact of commit 45c7f7e1ef17 ("mm, memcg: decouple
e{low,min} state mutations from protection checks") which made
mem_cgroup_calculate_protection() only change the state without returning
any value. Before that commit, we used to return MEMCG_PROT_NONE for the
target memcg, which would cause us to skip the
mem_cgroup_below_{min/low}() checks. After that commit we do not return
anything and we end up checking the min & low effective protections for
the target memcg, which are stale.
Update mem_cgroup_supports_protection() to also check if we are reclaiming
from the target, and rename it to mem_cgroup_unprotected() (now returns
true if we should not protect the memcg, much simpler logic).
Link: https://lkml.kernel.org/r/20221202031512.1365483-1-yosryahmed@google.com
Link: https://lkml.kernel.org/r/20221202031512.1365483-2-yosryahmed@google.com
Fixes: 45c7f7e1ef17 ("mm, memcg: decouple e{low,min} state mutations from protection checks")
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
Reviewed-by: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Chris Down <chris@chrisdown.name>
Cc: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vasily Averin <vasily.averin@linux.dev>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-12-02 03:15:10 +00:00
|
|
|
static inline bool mem_cgroup_unprotected(struct mem_cgroup *target,
|
|
|
|
struct mem_cgroup *memcg)
|
2020-08-07 06:22:05 +00:00
|
|
|
{
|
|
|
|
/*
|
|
|
|
* The root memcg doesn't account charges, and doesn't support
|
mm: memcg: fix stale protection of reclaim target memcg
Patch series "mm: memcg: fix protection of reclaim target memcg", v3.
This series fixes a bug in calculating the protection of the reclaim
target memcg where we end up using stale effective protection values from
the last reclaim operation, instead of completely ignoring the protection
of the reclaim target as intended. More detailed explanation and examples
in patch 1, which includes the fix. Patches 2 & 3 introduce a selftest
case that catches the bug.
This patch (of 3):
When we are doing memcg reclaim, the intended behavior is that we
ignore any protection (memory.min, memory.low) of the target memcg (but
not its children). Ever since the patch pointed to by the "Fixes" tag,
we actually read a stale value for the target memcg protection when
deciding whether to skip the memcg or not because it is protected. If
the stale value happens to be high enough, we don't reclaim from the
target memcg.
Essentially, in some cases we may falsely skip reclaiming from the
target memcg of reclaim because we read a stale protection value from
last time we reclaimed from it.
During reclaim, mem_cgroup_calculate_protection() is used to determine the
effective protection (emin and elow) values of a memcg. The protection of
the reclaim target is ignored, but we cannot set their effective
protection to 0 due to a limitation of the current implementation (see
comment in mem_cgroup_protection()). Instead, we leave their effective
protection values unchaged, and later ignore it in
mem_cgroup_protection().
However, mem_cgroup_protection() is called later in
shrink_lruvec()->get_scan_count(), which is after the
mem_cgroup_below_{min/low}() checks in shrink_node_memcgs(). As a result,
the stale effective protection values of the target memcg may lead us to
skip reclaiming from the target memcg entirely, before calling
shrink_lruvec(). This can be even worse with recursive protection, where
the stale target memcg protection can be higher than its standalone
protection. See two examples below (a similar version of example (a) is
added to test_memcontrol in a later patch).
(a) A simple example with proactive reclaim is as follows. Consider the
following hierarchy:
ROOT
|
A
|
B (memory.min = 10M)
Consider the following scenario:
- B has memory.current = 10M.
- The system undergoes global reclaim (or memcg reclaim in A).
- In shrink_node_memcgs():
- mem_cgroup_calculate_protection() calculates the effective min (emin)
of B as 10M.
- mem_cgroup_below_min() returns true for B, we do not reclaim from B.
- Now if we want to reclaim 5M from B using proactive reclaim
(memory.reclaim), we should be able to, as the protection of the
target memcg should be ignored.
- In shrink_node_memcgs():
- mem_cgroup_calculate_protection() immediately returns for B without
doing anything, as B is the target memcg, relying on
mem_cgroup_protection() to ignore B's stale effective min (still 10M).
- mem_cgroup_below_min() reads the stale effective min for B and we
skip it instead of ignoring its protection as intended, as we never
reach mem_cgroup_protection().
(b) An more complex example with recursive protection is as follows.
Consider the following hierarchy with memory_recursiveprot:
ROOT
|
A (memory.min = 50M)
|
B (memory.min = 10M, memory.high = 40M)
Consider the following scenario:
- B has memory.current = 35M.
- The system undergoes global reclaim (target memcg is NULL).
- B will have an effective min of 50M (all of A's unclaimed protection).
- B will not be reclaimed from.
- Now allocate 10M more memory in B, pushing it above it's high limit.
- The system undergoes memcg reclaim from B (target memcg is B).
- Like example (a), we do nothing in mem_cgroup_calculate_protection(),
then call mem_cgroup_below_min(), which will read the stale effective
min for B (50M) and skip it. In this case, it's even worse because we
are not just considering B's standalone protection (10M), but we are
reading a much higher stale protection (50M) which will cause us to not
reclaim from B at all.
This is an artifact of commit 45c7f7e1ef17 ("mm, memcg: decouple
e{low,min} state mutations from protection checks") which made
mem_cgroup_calculate_protection() only change the state without returning
any value. Before that commit, we used to return MEMCG_PROT_NONE for the
target memcg, which would cause us to skip the
mem_cgroup_below_{min/low}() checks. After that commit we do not return
anything and we end up checking the min & low effective protections for
the target memcg, which are stale.
Update mem_cgroup_supports_protection() to also check if we are reclaiming
from the target, and rename it to mem_cgroup_unprotected() (now returns
true if we should not protect the memcg, much simpler logic).
Link: https://lkml.kernel.org/r/20221202031512.1365483-1-yosryahmed@google.com
Link: https://lkml.kernel.org/r/20221202031512.1365483-2-yosryahmed@google.com
Fixes: 45c7f7e1ef17 ("mm, memcg: decouple e{low,min} state mutations from protection checks")
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
Reviewed-by: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Chris Down <chris@chrisdown.name>
Cc: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vasily Averin <vasily.averin@linux.dev>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-12-02 03:15:10 +00:00
|
|
|
* protection. The target memcg's protection is ignored, see
|
|
|
|
* mem_cgroup_calculate_protection() and mem_cgroup_protection()
|
2020-08-07 06:22:05 +00:00
|
|
|
*/
|
mm: memcg: fix stale protection of reclaim target memcg
Patch series "mm: memcg: fix protection of reclaim target memcg", v3.
This series fixes a bug in calculating the protection of the reclaim
target memcg where we end up using stale effective protection values from
the last reclaim operation, instead of completely ignoring the protection
of the reclaim target as intended. More detailed explanation and examples
in patch 1, which includes the fix. Patches 2 & 3 introduce a selftest
case that catches the bug.
This patch (of 3):
When we are doing memcg reclaim, the intended behavior is that we
ignore any protection (memory.min, memory.low) of the target memcg (but
not its children). Ever since the patch pointed to by the "Fixes" tag,
we actually read a stale value for the target memcg protection when
deciding whether to skip the memcg or not because it is protected. If
the stale value happens to be high enough, we don't reclaim from the
target memcg.
Essentially, in some cases we may falsely skip reclaiming from the
target memcg of reclaim because we read a stale protection value from
last time we reclaimed from it.
During reclaim, mem_cgroup_calculate_protection() is used to determine the
effective protection (emin and elow) values of a memcg. The protection of
the reclaim target is ignored, but we cannot set their effective
protection to 0 due to a limitation of the current implementation (see
comment in mem_cgroup_protection()). Instead, we leave their effective
protection values unchaged, and later ignore it in
mem_cgroup_protection().
However, mem_cgroup_protection() is called later in
shrink_lruvec()->get_scan_count(), which is after the
mem_cgroup_below_{min/low}() checks in shrink_node_memcgs(). As a result,
the stale effective protection values of the target memcg may lead us to
skip reclaiming from the target memcg entirely, before calling
shrink_lruvec(). This can be even worse with recursive protection, where
the stale target memcg protection can be higher than its standalone
protection. See two examples below (a similar version of example (a) is
added to test_memcontrol in a later patch).
(a) A simple example with proactive reclaim is as follows. Consider the
following hierarchy:
ROOT
|
A
|
B (memory.min = 10M)
Consider the following scenario:
- B has memory.current = 10M.
- The system undergoes global reclaim (or memcg reclaim in A).
- In shrink_node_memcgs():
- mem_cgroup_calculate_protection() calculates the effective min (emin)
of B as 10M.
- mem_cgroup_below_min() returns true for B, we do not reclaim from B.
- Now if we want to reclaim 5M from B using proactive reclaim
(memory.reclaim), we should be able to, as the protection of the
target memcg should be ignored.
- In shrink_node_memcgs():
- mem_cgroup_calculate_protection() immediately returns for B without
doing anything, as B is the target memcg, relying on
mem_cgroup_protection() to ignore B's stale effective min (still 10M).
- mem_cgroup_below_min() reads the stale effective min for B and we
skip it instead of ignoring its protection as intended, as we never
reach mem_cgroup_protection().
(b) An more complex example with recursive protection is as follows.
Consider the following hierarchy with memory_recursiveprot:
ROOT
|
A (memory.min = 50M)
|
B (memory.min = 10M, memory.high = 40M)
Consider the following scenario:
- B has memory.current = 35M.
- The system undergoes global reclaim (target memcg is NULL).
- B will have an effective min of 50M (all of A's unclaimed protection).
- B will not be reclaimed from.
- Now allocate 10M more memory in B, pushing it above it's high limit.
- The system undergoes memcg reclaim from B (target memcg is B).
- Like example (a), we do nothing in mem_cgroup_calculate_protection(),
then call mem_cgroup_below_min(), which will read the stale effective
min for B (50M) and skip it. In this case, it's even worse because we
are not just considering B's standalone protection (10M), but we are
reading a much higher stale protection (50M) which will cause us to not
reclaim from B at all.
This is an artifact of commit 45c7f7e1ef17 ("mm, memcg: decouple
e{low,min} state mutations from protection checks") which made
mem_cgroup_calculate_protection() only change the state without returning
any value. Before that commit, we used to return MEMCG_PROT_NONE for the
target memcg, which would cause us to skip the
mem_cgroup_below_{min/low}() checks. After that commit we do not return
anything and we end up checking the min & low effective protections for
the target memcg, which are stale.
Update mem_cgroup_supports_protection() to also check if we are reclaiming
from the target, and rename it to mem_cgroup_unprotected() (now returns
true if we should not protect the memcg, much simpler logic).
Link: https://lkml.kernel.org/r/20221202031512.1365483-1-yosryahmed@google.com
Link: https://lkml.kernel.org/r/20221202031512.1365483-2-yosryahmed@google.com
Fixes: 45c7f7e1ef17 ("mm, memcg: decouple e{low,min} state mutations from protection checks")
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
Reviewed-by: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Chris Down <chris@chrisdown.name>
Cc: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vasily Averin <vasily.averin@linux.dev>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-12-02 03:15:10 +00:00
|
|
|
return mem_cgroup_disabled() || mem_cgroup_is_root(memcg) ||
|
|
|
|
memcg == target;
|
2020-08-07 06:22:05 +00:00
|
|
|
}
|
|
|
|
|
mm: memcg: fix stale protection of reclaim target memcg
Patch series "mm: memcg: fix protection of reclaim target memcg", v3.
This series fixes a bug in calculating the protection of the reclaim
target memcg where we end up using stale effective protection values from
the last reclaim operation, instead of completely ignoring the protection
of the reclaim target as intended. More detailed explanation and examples
in patch 1, which includes the fix. Patches 2 & 3 introduce a selftest
case that catches the bug.
This patch (of 3):
When we are doing memcg reclaim, the intended behavior is that we
ignore any protection (memory.min, memory.low) of the target memcg (but
not its children). Ever since the patch pointed to by the "Fixes" tag,
we actually read a stale value for the target memcg protection when
deciding whether to skip the memcg or not because it is protected. If
the stale value happens to be high enough, we don't reclaim from the
target memcg.
Essentially, in some cases we may falsely skip reclaiming from the
target memcg of reclaim because we read a stale protection value from
last time we reclaimed from it.
During reclaim, mem_cgroup_calculate_protection() is used to determine the
effective protection (emin and elow) values of a memcg. The protection of
the reclaim target is ignored, but we cannot set their effective
protection to 0 due to a limitation of the current implementation (see
comment in mem_cgroup_protection()). Instead, we leave their effective
protection values unchaged, and later ignore it in
mem_cgroup_protection().
However, mem_cgroup_protection() is called later in
shrink_lruvec()->get_scan_count(), which is after the
mem_cgroup_below_{min/low}() checks in shrink_node_memcgs(). As a result,
the stale effective protection values of the target memcg may lead us to
skip reclaiming from the target memcg entirely, before calling
shrink_lruvec(). This can be even worse with recursive protection, where
the stale target memcg protection can be higher than its standalone
protection. See two examples below (a similar version of example (a) is
added to test_memcontrol in a later patch).
(a) A simple example with proactive reclaim is as follows. Consider the
following hierarchy:
ROOT
|
A
|
B (memory.min = 10M)
Consider the following scenario:
- B has memory.current = 10M.
- The system undergoes global reclaim (or memcg reclaim in A).
- In shrink_node_memcgs():
- mem_cgroup_calculate_protection() calculates the effective min (emin)
of B as 10M.
- mem_cgroup_below_min() returns true for B, we do not reclaim from B.
- Now if we want to reclaim 5M from B using proactive reclaim
(memory.reclaim), we should be able to, as the protection of the
target memcg should be ignored.
- In shrink_node_memcgs():
- mem_cgroup_calculate_protection() immediately returns for B without
doing anything, as B is the target memcg, relying on
mem_cgroup_protection() to ignore B's stale effective min (still 10M).
- mem_cgroup_below_min() reads the stale effective min for B and we
skip it instead of ignoring its protection as intended, as we never
reach mem_cgroup_protection().
(b) An more complex example with recursive protection is as follows.
Consider the following hierarchy with memory_recursiveprot:
ROOT
|
A (memory.min = 50M)
|
B (memory.min = 10M, memory.high = 40M)
Consider the following scenario:
- B has memory.current = 35M.
- The system undergoes global reclaim (target memcg is NULL).
- B will have an effective min of 50M (all of A's unclaimed protection).
- B will not be reclaimed from.
- Now allocate 10M more memory in B, pushing it above it's high limit.
- The system undergoes memcg reclaim from B (target memcg is B).
- Like example (a), we do nothing in mem_cgroup_calculate_protection(),
then call mem_cgroup_below_min(), which will read the stale effective
min for B (50M) and skip it. In this case, it's even worse because we
are not just considering B's standalone protection (10M), but we are
reading a much higher stale protection (50M) which will cause us to not
reclaim from B at all.
This is an artifact of commit 45c7f7e1ef17 ("mm, memcg: decouple
e{low,min} state mutations from protection checks") which made
mem_cgroup_calculate_protection() only change the state without returning
any value. Before that commit, we used to return MEMCG_PROT_NONE for the
target memcg, which would cause us to skip the
mem_cgroup_below_{min/low}() checks. After that commit we do not return
anything and we end up checking the min & low effective protections for
the target memcg, which are stale.
Update mem_cgroup_supports_protection() to also check if we are reclaiming
from the target, and rename it to mem_cgroup_unprotected() (now returns
true if we should not protect the memcg, much simpler logic).
Link: https://lkml.kernel.org/r/20221202031512.1365483-1-yosryahmed@google.com
Link: https://lkml.kernel.org/r/20221202031512.1365483-2-yosryahmed@google.com
Fixes: 45c7f7e1ef17 ("mm, memcg: decouple e{low,min} state mutations from protection checks")
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
Reviewed-by: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Chris Down <chris@chrisdown.name>
Cc: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vasily Averin <vasily.averin@linux.dev>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-12-02 03:15:10 +00:00
|
|
|
static inline bool mem_cgroup_below_low(struct mem_cgroup *target,
|
|
|
|
struct mem_cgroup *memcg)
|
2020-08-07 06:22:05 +00:00
|
|
|
{
|
mm: memcg: fix stale protection of reclaim target memcg
Patch series "mm: memcg: fix protection of reclaim target memcg", v3.
This series fixes a bug in calculating the protection of the reclaim
target memcg where we end up using stale effective protection values from
the last reclaim operation, instead of completely ignoring the protection
of the reclaim target as intended. More detailed explanation and examples
in patch 1, which includes the fix. Patches 2 & 3 introduce a selftest
case that catches the bug.
This patch (of 3):
When we are doing memcg reclaim, the intended behavior is that we
ignore any protection (memory.min, memory.low) of the target memcg (but
not its children). Ever since the patch pointed to by the "Fixes" tag,
we actually read a stale value for the target memcg protection when
deciding whether to skip the memcg or not because it is protected. If
the stale value happens to be high enough, we don't reclaim from the
target memcg.
Essentially, in some cases we may falsely skip reclaiming from the
target memcg of reclaim because we read a stale protection value from
last time we reclaimed from it.
During reclaim, mem_cgroup_calculate_protection() is used to determine the
effective protection (emin and elow) values of a memcg. The protection of
the reclaim target is ignored, but we cannot set their effective
protection to 0 due to a limitation of the current implementation (see
comment in mem_cgroup_protection()). Instead, we leave their effective
protection values unchaged, and later ignore it in
mem_cgroup_protection().
However, mem_cgroup_protection() is called later in
shrink_lruvec()->get_scan_count(), which is after the
mem_cgroup_below_{min/low}() checks in shrink_node_memcgs(). As a result,
the stale effective protection values of the target memcg may lead us to
skip reclaiming from the target memcg entirely, before calling
shrink_lruvec(). This can be even worse with recursive protection, where
the stale target memcg protection can be higher than its standalone
protection. See two examples below (a similar version of example (a) is
added to test_memcontrol in a later patch).
(a) A simple example with proactive reclaim is as follows. Consider the
following hierarchy:
ROOT
|
A
|
B (memory.min = 10M)
Consider the following scenario:
- B has memory.current = 10M.
- The system undergoes global reclaim (or memcg reclaim in A).
- In shrink_node_memcgs():
- mem_cgroup_calculate_protection() calculates the effective min (emin)
of B as 10M.
- mem_cgroup_below_min() returns true for B, we do not reclaim from B.
- Now if we want to reclaim 5M from B using proactive reclaim
(memory.reclaim), we should be able to, as the protection of the
target memcg should be ignored.
- In shrink_node_memcgs():
- mem_cgroup_calculate_protection() immediately returns for B without
doing anything, as B is the target memcg, relying on
mem_cgroup_protection() to ignore B's stale effective min (still 10M).
- mem_cgroup_below_min() reads the stale effective min for B and we
skip it instead of ignoring its protection as intended, as we never
reach mem_cgroup_protection().
(b) An more complex example with recursive protection is as follows.
Consider the following hierarchy with memory_recursiveprot:
ROOT
|
A (memory.min = 50M)
|
B (memory.min = 10M, memory.high = 40M)
Consider the following scenario:
- B has memory.current = 35M.
- The system undergoes global reclaim (target memcg is NULL).
- B will have an effective min of 50M (all of A's unclaimed protection).
- B will not be reclaimed from.
- Now allocate 10M more memory in B, pushing it above it's high limit.
- The system undergoes memcg reclaim from B (target memcg is B).
- Like example (a), we do nothing in mem_cgroup_calculate_protection(),
then call mem_cgroup_below_min(), which will read the stale effective
min for B (50M) and skip it. In this case, it's even worse because we
are not just considering B's standalone protection (10M), but we are
reading a much higher stale protection (50M) which will cause us to not
reclaim from B at all.
This is an artifact of commit 45c7f7e1ef17 ("mm, memcg: decouple
e{low,min} state mutations from protection checks") which made
mem_cgroup_calculate_protection() only change the state without returning
any value. Before that commit, we used to return MEMCG_PROT_NONE for the
target memcg, which would cause us to skip the
mem_cgroup_below_{min/low}() checks. After that commit we do not return
anything and we end up checking the min & low effective protections for
the target memcg, which are stale.
Update mem_cgroup_supports_protection() to also check if we are reclaiming
from the target, and rename it to mem_cgroup_unprotected() (now returns
true if we should not protect the memcg, much simpler logic).
Link: https://lkml.kernel.org/r/20221202031512.1365483-1-yosryahmed@google.com
Link: https://lkml.kernel.org/r/20221202031512.1365483-2-yosryahmed@google.com
Fixes: 45c7f7e1ef17 ("mm, memcg: decouple e{low,min} state mutations from protection checks")
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
Reviewed-by: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Chris Down <chris@chrisdown.name>
Cc: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vasily Averin <vasily.averin@linux.dev>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-12-02 03:15:10 +00:00
|
|
|
if (mem_cgroup_unprotected(target, memcg))
|
2020-08-07 06:22:05 +00:00
|
|
|
return false;
|
|
|
|
|
|
|
|
return READ_ONCE(memcg->memory.elow) >=
|
|
|
|
page_counter_read(&memcg->memory);
|
|
|
|
}
|
|
|
|
|
mm: memcg: fix stale protection of reclaim target memcg
Patch series "mm: memcg: fix protection of reclaim target memcg", v3.
This series fixes a bug in calculating the protection of the reclaim
target memcg where we end up using stale effective protection values from
the last reclaim operation, instead of completely ignoring the protection
of the reclaim target as intended. More detailed explanation and examples
in patch 1, which includes the fix. Patches 2 & 3 introduce a selftest
case that catches the bug.
This patch (of 3):
When we are doing memcg reclaim, the intended behavior is that we
ignore any protection (memory.min, memory.low) of the target memcg (but
not its children). Ever since the patch pointed to by the "Fixes" tag,
we actually read a stale value for the target memcg protection when
deciding whether to skip the memcg or not because it is protected. If
the stale value happens to be high enough, we don't reclaim from the
target memcg.
Essentially, in some cases we may falsely skip reclaiming from the
target memcg of reclaim because we read a stale protection value from
last time we reclaimed from it.
During reclaim, mem_cgroup_calculate_protection() is used to determine the
effective protection (emin and elow) values of a memcg. The protection of
the reclaim target is ignored, but we cannot set their effective
protection to 0 due to a limitation of the current implementation (see
comment in mem_cgroup_protection()). Instead, we leave their effective
protection values unchaged, and later ignore it in
mem_cgroup_protection().
However, mem_cgroup_protection() is called later in
shrink_lruvec()->get_scan_count(), which is after the
mem_cgroup_below_{min/low}() checks in shrink_node_memcgs(). As a result,
the stale effective protection values of the target memcg may lead us to
skip reclaiming from the target memcg entirely, before calling
shrink_lruvec(). This can be even worse with recursive protection, where
the stale target memcg protection can be higher than its standalone
protection. See two examples below (a similar version of example (a) is
added to test_memcontrol in a later patch).
(a) A simple example with proactive reclaim is as follows. Consider the
following hierarchy:
ROOT
|
A
|
B (memory.min = 10M)
Consider the following scenario:
- B has memory.current = 10M.
- The system undergoes global reclaim (or memcg reclaim in A).
- In shrink_node_memcgs():
- mem_cgroup_calculate_protection() calculates the effective min (emin)
of B as 10M.
- mem_cgroup_below_min() returns true for B, we do not reclaim from B.
- Now if we want to reclaim 5M from B using proactive reclaim
(memory.reclaim), we should be able to, as the protection of the
target memcg should be ignored.
- In shrink_node_memcgs():
- mem_cgroup_calculate_protection() immediately returns for B without
doing anything, as B is the target memcg, relying on
mem_cgroup_protection() to ignore B's stale effective min (still 10M).
- mem_cgroup_below_min() reads the stale effective min for B and we
skip it instead of ignoring its protection as intended, as we never
reach mem_cgroup_protection().
(b) An more complex example with recursive protection is as follows.
Consider the following hierarchy with memory_recursiveprot:
ROOT
|
A (memory.min = 50M)
|
B (memory.min = 10M, memory.high = 40M)
Consider the following scenario:
- B has memory.current = 35M.
- The system undergoes global reclaim (target memcg is NULL).
- B will have an effective min of 50M (all of A's unclaimed protection).
- B will not be reclaimed from.
- Now allocate 10M more memory in B, pushing it above it's high limit.
- The system undergoes memcg reclaim from B (target memcg is B).
- Like example (a), we do nothing in mem_cgroup_calculate_protection(),
then call mem_cgroup_below_min(), which will read the stale effective
min for B (50M) and skip it. In this case, it's even worse because we
are not just considering B's standalone protection (10M), but we are
reading a much higher stale protection (50M) which will cause us to not
reclaim from B at all.
This is an artifact of commit 45c7f7e1ef17 ("mm, memcg: decouple
e{low,min} state mutations from protection checks") which made
mem_cgroup_calculate_protection() only change the state without returning
any value. Before that commit, we used to return MEMCG_PROT_NONE for the
target memcg, which would cause us to skip the
mem_cgroup_below_{min/low}() checks. After that commit we do not return
anything and we end up checking the min & low effective protections for
the target memcg, which are stale.
Update mem_cgroup_supports_protection() to also check if we are reclaiming
from the target, and rename it to mem_cgroup_unprotected() (now returns
true if we should not protect the memcg, much simpler logic).
Link: https://lkml.kernel.org/r/20221202031512.1365483-1-yosryahmed@google.com
Link: https://lkml.kernel.org/r/20221202031512.1365483-2-yosryahmed@google.com
Fixes: 45c7f7e1ef17 ("mm, memcg: decouple e{low,min} state mutations from protection checks")
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
Reviewed-by: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Chris Down <chris@chrisdown.name>
Cc: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vasily Averin <vasily.averin@linux.dev>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-12-02 03:15:10 +00:00
|
|
|
static inline bool mem_cgroup_below_min(struct mem_cgroup *target,
|
|
|
|
struct mem_cgroup *memcg)
|
2020-08-07 06:22:05 +00:00
|
|
|
{
|
mm: memcg: fix stale protection of reclaim target memcg
Patch series "mm: memcg: fix protection of reclaim target memcg", v3.
This series fixes a bug in calculating the protection of the reclaim
target memcg where we end up using stale effective protection values from
the last reclaim operation, instead of completely ignoring the protection
of the reclaim target as intended. More detailed explanation and examples
in patch 1, which includes the fix. Patches 2 & 3 introduce a selftest
case that catches the bug.
This patch (of 3):
When we are doing memcg reclaim, the intended behavior is that we
ignore any protection (memory.min, memory.low) of the target memcg (but
not its children). Ever since the patch pointed to by the "Fixes" tag,
we actually read a stale value for the target memcg protection when
deciding whether to skip the memcg or not because it is protected. If
the stale value happens to be high enough, we don't reclaim from the
target memcg.
Essentially, in some cases we may falsely skip reclaiming from the
target memcg of reclaim because we read a stale protection value from
last time we reclaimed from it.
During reclaim, mem_cgroup_calculate_protection() is used to determine the
effective protection (emin and elow) values of a memcg. The protection of
the reclaim target is ignored, but we cannot set their effective
protection to 0 due to a limitation of the current implementation (see
comment in mem_cgroup_protection()). Instead, we leave their effective
protection values unchaged, and later ignore it in
mem_cgroup_protection().
However, mem_cgroup_protection() is called later in
shrink_lruvec()->get_scan_count(), which is after the
mem_cgroup_below_{min/low}() checks in shrink_node_memcgs(). As a result,
the stale effective protection values of the target memcg may lead us to
skip reclaiming from the target memcg entirely, before calling
shrink_lruvec(). This can be even worse with recursive protection, where
the stale target memcg protection can be higher than its standalone
protection. See two examples below (a similar version of example (a) is
added to test_memcontrol in a later patch).
(a) A simple example with proactive reclaim is as follows. Consider the
following hierarchy:
ROOT
|
A
|
B (memory.min = 10M)
Consider the following scenario:
- B has memory.current = 10M.
- The system undergoes global reclaim (or memcg reclaim in A).
- In shrink_node_memcgs():
- mem_cgroup_calculate_protection() calculates the effective min (emin)
of B as 10M.
- mem_cgroup_below_min() returns true for B, we do not reclaim from B.
- Now if we want to reclaim 5M from B using proactive reclaim
(memory.reclaim), we should be able to, as the protection of the
target memcg should be ignored.
- In shrink_node_memcgs():
- mem_cgroup_calculate_protection() immediately returns for B without
doing anything, as B is the target memcg, relying on
mem_cgroup_protection() to ignore B's stale effective min (still 10M).
- mem_cgroup_below_min() reads the stale effective min for B and we
skip it instead of ignoring its protection as intended, as we never
reach mem_cgroup_protection().
(b) An more complex example with recursive protection is as follows.
Consider the following hierarchy with memory_recursiveprot:
ROOT
|
A (memory.min = 50M)
|
B (memory.min = 10M, memory.high = 40M)
Consider the following scenario:
- B has memory.current = 35M.
- The system undergoes global reclaim (target memcg is NULL).
- B will have an effective min of 50M (all of A's unclaimed protection).
- B will not be reclaimed from.
- Now allocate 10M more memory in B, pushing it above it's high limit.
- The system undergoes memcg reclaim from B (target memcg is B).
- Like example (a), we do nothing in mem_cgroup_calculate_protection(),
then call mem_cgroup_below_min(), which will read the stale effective
min for B (50M) and skip it. In this case, it's even worse because we
are not just considering B's standalone protection (10M), but we are
reading a much higher stale protection (50M) which will cause us to not
reclaim from B at all.
This is an artifact of commit 45c7f7e1ef17 ("mm, memcg: decouple
e{low,min} state mutations from protection checks") which made
mem_cgroup_calculate_protection() only change the state without returning
any value. Before that commit, we used to return MEMCG_PROT_NONE for the
target memcg, which would cause us to skip the
mem_cgroup_below_{min/low}() checks. After that commit we do not return
anything and we end up checking the min & low effective protections for
the target memcg, which are stale.
Update mem_cgroup_supports_protection() to also check if we are reclaiming
from the target, and rename it to mem_cgroup_unprotected() (now returns
true if we should not protect the memcg, much simpler logic).
Link: https://lkml.kernel.org/r/20221202031512.1365483-1-yosryahmed@google.com
Link: https://lkml.kernel.org/r/20221202031512.1365483-2-yosryahmed@google.com
Fixes: 45c7f7e1ef17 ("mm, memcg: decouple e{low,min} state mutations from protection checks")
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
Reviewed-by: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Chris Down <chris@chrisdown.name>
Cc: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vasily Averin <vasily.averin@linux.dev>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-12-02 03:15:10 +00:00
|
|
|
if (mem_cgroup_unprotected(target, memcg))
|
2020-08-07 06:22:05 +00:00
|
|
|
return false;
|
|
|
|
|
|
|
|
return READ_ONCE(memcg->memory.emin) >=
|
|
|
|
page_counter_read(&memcg->memory);
|
|
|
|
}
|
mm: memcontrol: default hierarchy interface for memory
Introduce the basic control files to account, partition, and limit
memory using cgroups in default hierarchy mode.
This interface versioning allows us to address fundamental design
issues in the existing memory cgroup interface, further explained
below. The old interface will be maintained indefinitely, but a
clearer model and improved workload performance should encourage
existing users to switch over to the new one eventually.
The control files are thus:
- memory.current shows the current consumption of the cgroup and its
descendants, in bytes.
- memory.low configures the lower end of the cgroup's expected
memory consumption range. The kernel considers memory below that
boundary to be a reserve - the minimum that the workload needs in
order to make forward progress - and generally avoids reclaiming
it, unless there is an imminent risk of entering an OOM situation.
- memory.high configures the upper end of the cgroup's expected
memory consumption range. A cgroup whose consumption grows beyond
this threshold is forced into direct reclaim, to work off the
excess and to throttle new allocations heavily, but is generally
allowed to continue and the OOM killer is not invoked.
- memory.max configures the hard maximum amount of memory that the
cgroup is allowed to consume before the OOM killer is invoked.
- memory.events shows event counters that indicate how often the
cgroup was reclaimed while below memory.low, how often it was
forced to reclaim excess beyond memory.high, how often it hit
memory.max, and how often it entered OOM due to memory.max. This
allows users to identify configuration problems when observing a
degradation in workload performance. An overcommitted system will
have an increased rate of low boundary breaches, whereas increased
rates of high limit breaches, maximum hits, or even OOM situations
will indicate internally overcommitted cgroups.
For existing users of memory cgroups, the following deviations from
the current interface are worth pointing out and explaining:
- The original lower boundary, the soft limit, is defined as a limit
that is per default unset. As a result, the set of cgroups that
global reclaim prefers is opt-in, rather than opt-out. The costs
for optimizing these mostly negative lookups are so high that the
implementation, despite its enormous size, does not even provide
the basic desirable behavior. First off, the soft limit has no
hierarchical meaning. All configured groups are organized in a
global rbtree and treated like equal peers, regardless where they
are located in the hierarchy. This makes subtree delegation
impossible. Second, the soft limit reclaim pass is so aggressive
that it not just introduces high allocation latencies into the
system, but also impacts system performance due to overreclaim, to
the point where the feature becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
reserve. A cgroup enjoys reclaim protection when it and all its
ancestors are below their low boundaries, which makes delegation
of subtrees possible. Secondly, new cgroups have no reserve per
default and in the common case most cgroups are eligible for the
preferred reclaim pass. This allows the new low boundary to be
efficiently implemented with just a minor addition to the generic
reclaim code, without the need for out-of-band data structures and
reclaim passes. Because the generic reclaim code considers all
cgroups except for the ones running low in the preferred first
reclaim pass, overreclaim of individual groups is eliminated as
well, resulting in much better overall workload performance.
- The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
But this generally goes against the goal of making the most out of
the available memory. The memory consumption of workloads varies
during runtime, and that requires users to overcommit. But doing
that with a strict upper limit requires either a fairly accurate
prediction of the working set size or adding slack to the limit.
Since working set size estimation is hard and error prone, and
getting it wrong results in OOM kills, most users tend to err on
the side of a looser limit and end up wasting precious resources.
The memory.high boundary on the other hand can be set much more
conservatively. When hit, it throttles allocations by forcing
them into direct reclaim to work off the excess, but it never
invokes the OOM killer. As a result, a high boundary that is
chosen too aggressively will not terminate the processes, but
instead it will lead to gradual performance degradation. The user
can monitor this and make corrections until the minimal memory
footprint that still gives acceptable performance is found.
In extreme cases, with many concurrent allocations and a complete
breakdown of reclaim progress within the group, the high boundary
can be exceeded. But even then it's mostly better to satisfy the
allocation from the slack available in other groups or the rest of
the system than killing the group. Otherwise, memory.max is there
to limit this type of spillover and ultimately contain buggy or
even malicious applications.
- The original control file names are unwieldy and inconsistent in
many different ways. For example, the upper boundary hit count is
exported in the memory.failcnt file, but an OOM event count has to
be manually counted by listening to memory.oom_control events, and
lower boundary / soft limit events have to be counted by first
setting a threshold for that value and then counting those events.
Also, usage and limit files encode their units in the filename.
That makes the filenames very long, even though this is not
information that a user needs to be reminded of every time they
type out those names.
To address these naming issues, as well as to signal clearly that
the new interface carries a new configuration model, the naming
conventions in it necessarily differ from the old interface.
- The original limit files indicate the state of an unset limit with
a very high number, and a configured limit can be unset by echoing
-1 into those files. But that very high number is implementation
and architecture dependent and not very descriptive. And while -1
can be understood as an underflow into the highest possible value,
-2 or -10M etc. do not work, so it's not inconsistent.
memory.low, memory.high, and memory.max will use the string
"infinity" to indicate and set the highest possible value.
[akpm@linux-foundation.org: use seq_puts() for basic strings]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-11 23:26:06 +00:00
|
|
|
|
memcontrol: add helpers for hugetlb memcg accounting
Patch series "hugetlb memcg accounting", v4.
Currently, hugetlb memory usage is not acounted for in the memory
controller, which could lead to memory overprotection for cgroups with
hugetlb-backed memory. This has been observed in our production system.
For instance, here is one of our usecases: suppose there are two 32G
containers. The machine is booted with hugetlb_cma=6G, and each container
may or may not use up to 3 gigantic page, depending on the workload within
it. The rest is anon, cache, slab, etc. We can set the hugetlb cgroup
limit of each cgroup to 3G to enforce hugetlb fairness. But it is very
difficult to configure memory.max to keep overall consumption, including
anon, cache, slab etcetera fair.
What we have had to resort to is to constantly poll hugetlb usage and
readjust memory.max. Similar procedure is done to other memory limits
(memory.low for e.g). However, this is rather cumbersome and buggy.
Furthermore, when there is a delay in memory limits correction, (for e.g
when hugetlb usage changes within consecutive runs of the userspace
agent), the system could be in an over/underprotected state.
This patch series rectifies this issue by charging the memcg when the
hugetlb folio is allocated, and uncharging when the folio is freed. In
addition, a new selftest is added to demonstrate and verify this new
behavior.
This patch (of 4):
This patch exposes charge committing and cancelling as parts of the memory
controller interface. These functionalities are useful when the
try_charge() and commit_charge() stages have to be separated by other
actions in between (which can fail). One such example is the new hugetlb
accounting behavior in the following patch.
The patch also adds a helper function to obtain a reference to the
current task's memcg.
Link: https://lkml.kernel.org/r/20231006184629.155543-1-nphamcs@gmail.com
Link: https://lkml.kernel.org/r/20231006184629.155543-2-nphamcs@gmail.com
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Frank van der Linden <fvdl@google.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Rik van Riel <riel@surriel.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Tejun heo <tj@kernel.org>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Zefan Li <lizefan.x@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-06 18:46:26 +00:00
|
|
|
void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg);
|
|
|
|
|
2021-06-25 13:27:04 +00:00
|
|
|
int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* mem_cgroup_charge - Charge a newly allocated folio to a cgroup.
|
|
|
|
* @folio: Folio to charge.
|
|
|
|
* @mm: mm context of the allocating task.
|
|
|
|
* @gfp: Reclaim mode.
|
|
|
|
*
|
|
|
|
* Try to charge @folio to the memcg that @mm belongs to, reclaiming
|
|
|
|
* pages according to @gfp if necessary. If @mm is NULL, try to
|
|
|
|
* charge to the active memcg.
|
|
|
|
*
|
|
|
|
* Do not use this for folios allocated for swapin.
|
|
|
|
*
|
|
|
|
* Return: 0 on success. Otherwise, an error code is returned.
|
|
|
|
*/
|
|
|
|
static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm,
|
|
|
|
gfp_t gfp)
|
2021-09-02 21:54:50 +00:00
|
|
|
{
|
|
|
|
if (mem_cgroup_disabled())
|
|
|
|
return 0;
|
2021-06-25 13:27:04 +00:00
|
|
|
return __mem_cgroup_charge(folio, mm, gfp);
|
2021-09-02 21:54:50 +00:00
|
|
|
}
|
|
|
|
|
hugetlb: memcg: account hugetlb-backed memory in memory controller
Currently, hugetlb memory usage is not acounted for in the memory
controller, which could lead to memory overprotection for cgroups with
hugetlb-backed memory. This has been observed in our production system.
For instance, here is one of our usecases: suppose there are two 32G
containers. The machine is booted with hugetlb_cma=6G, and each container
may or may not use up to 3 gigantic page, depending on the workload within
it. The rest is anon, cache, slab, etc. We can set the hugetlb cgroup
limit of each cgroup to 3G to enforce hugetlb fairness. But it is very
difficult to configure memory.max to keep overall consumption, including
anon, cache, slab etc. fair.
What we have had to resort to is to constantly poll hugetlb usage and
readjust memory.max. Similar procedure is done to other memory limits
(memory.low for e.g). However, this is rather cumbersome and buggy.
Furthermore, when there is a delay in memory limits correction, (for e.g
when hugetlb usage changes within consecutive runs of the userspace
agent), the system could be in an over/underprotected state.
This patch rectifies this issue by charging the memcg when the hugetlb
folio is utilized, and uncharging when the folio is freed (analogous to
the hugetlb controller). Note that we do not charge when the folio is
allocated to the hugetlb pool, because at this point it is not owned by
any memcg.
Some caveats to consider:
* This feature is only available on cgroup v2.
* There is no hugetlb pool management involved in the memory
controller. As stated above, hugetlb folios are only charged towards
the memory controller when it is used. Host overcommit management
has to consider it when configuring hard limits.
* Failure to charge towards the memcg results in SIGBUS. This could
happen even if the hugetlb pool still has pages (but the cgroup
limit is hit and reclaim attempt fails).
* When this feature is enabled, hugetlb pages contribute to memory
reclaim protection. low, min limits tuning must take into account
hugetlb memory.
* Hugetlb pages utilized while this option is not selected will not
be tracked by the memory controller (even if cgroup v2 is remounted
later on).
Link: https://lkml.kernel.org/r/20231006184629.155543-4-nphamcs@gmail.com
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Frank van der Linden <fvdl@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Rik van Riel <riel@surriel.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Tejun heo <tj@kernel.org>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Zefan Li <lizefan.x@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-06 18:46:28 +00:00
|
|
|
int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp,
|
|
|
|
long nr_pages);
|
|
|
|
|
2022-09-02 19:46:12 +00:00
|
|
|
int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
|
2021-04-30 05:56:36 +00:00
|
|
|
gfp_t gfp, swp_entry_t entry);
|
2024-09-08 23:21:18 +00:00
|
|
|
|
|
|
|
void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr_pages);
|
mm: memcontrol: convert page cache to a new mem_cgroup_charge() API
The try/commit/cancel protocol that memcg uses dates back to when pages
used to be uncharged upon removal from the page cache, and thus couldn't
be committed before the insertion had succeeded. Nowadays, pages are
uncharged when they are physically freed; it doesn't matter whether the
insertion was successful or not. For the page cache, the transaction
dance has become unnecessary.
Introduce a mem_cgroup_charge() function that simply charges a newly
allocated page to a cgroup and sets up page->mem_cgroup in one single
step. If the insertion fails, the caller doesn't have to do anything but
free/put the page.
Then switch the page cache over to this new API.
Subsequent patches will also convert anon pages, but it needs a bit more
prep work. Right now, memcg depends on page->mapping being already set up
at the time of charging, so that it can maintain its own MEMCG_CACHE and
MEMCG_RSS counters. For anon, page->mapping is set under the same pte
lock under which the page is publishd, so a single charge point that can
block doesn't work there just yet.
The following prep patches will replace the private memcg counters with
the generic vmstat counters, thus removing the page->mapping dependency,
then complete the transition to the new single-point charge API and delete
the old transactional scheme.
v2: leave shmem swapcache when charging fails to avoid double IO (Joonsoo)
v3: rebase on preceeding shmem simplification patch
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Alex Shi <alex.shi@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Balbir Singh <bsingharora@gmail.com>
Link: http://lkml.kernel.org/r/20200508183105.225460-6-hannes@cmpxchg.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-06-03 23:01:41 +00:00
|
|
|
|
2021-05-02 00:42:23 +00:00
|
|
|
void __mem_cgroup_uncharge(struct folio *folio);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* mem_cgroup_uncharge - Uncharge a folio.
|
|
|
|
* @folio: Folio to uncharge.
|
|
|
|
*
|
|
|
|
* Uncharge a folio previously charged with mem_cgroup_charge().
|
|
|
|
*/
|
|
|
|
static inline void mem_cgroup_uncharge(struct folio *folio)
|
2021-09-02 21:54:50 +00:00
|
|
|
{
|
|
|
|
if (mem_cgroup_disabled())
|
|
|
|
return;
|
2021-05-02 00:42:23 +00:00
|
|
|
__mem_cgroup_uncharge(folio);
|
2021-09-02 21:54:50 +00:00
|
|
|
}
|
|
|
|
|
2024-02-27 17:42:39 +00:00
|
|
|
void __mem_cgroup_uncharge_folios(struct folio_batch *folios);
|
|
|
|
static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios)
|
|
|
|
{
|
|
|
|
if (mem_cgroup_disabled())
|
|
|
|
return;
|
|
|
|
__mem_cgroup_uncharge_folios(folios);
|
|
|
|
}
|
memcontrol: add helpers for hugetlb memcg accounting
Patch series "hugetlb memcg accounting", v4.
Currently, hugetlb memory usage is not acounted for in the memory
controller, which could lead to memory overprotection for cgroups with
hugetlb-backed memory. This has been observed in our production system.
For instance, here is one of our usecases: suppose there are two 32G
containers. The machine is booted with hugetlb_cma=6G, and each container
may or may not use up to 3 gigantic page, depending on the workload within
it. The rest is anon, cache, slab, etc. We can set the hugetlb cgroup
limit of each cgroup to 3G to enforce hugetlb fairness. But it is very
difficult to configure memory.max to keep overall consumption, including
anon, cache, slab etcetera fair.
What we have had to resort to is to constantly poll hugetlb usage and
readjust memory.max. Similar procedure is done to other memory limits
(memory.low for e.g). However, this is rather cumbersome and buggy.
Furthermore, when there is a delay in memory limits correction, (for e.g
when hugetlb usage changes within consecutive runs of the userspace
agent), the system could be in an over/underprotected state.
This patch series rectifies this issue by charging the memcg when the
hugetlb folio is allocated, and uncharging when the folio is freed. In
addition, a new selftest is added to demonstrate and verify this new
behavior.
This patch (of 4):
This patch exposes charge committing and cancelling as parts of the memory
controller interface. These functionalities are useful when the
try_charge() and commit_charge() stages have to be separated by other
actions in between (which can fail). One such example is the new hugetlb
accounting behavior in the following patch.
The patch also adds a helper function to obtain a reference to the
current task's memcg.
Link: https://lkml.kernel.org/r/20231006184629.155543-1-nphamcs@gmail.com
Link: https://lkml.kernel.org/r/20231006184629.155543-2-nphamcs@gmail.com
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Frank van der Linden <fvdl@google.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Rik van Riel <riel@surriel.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Tejun heo <tj@kernel.org>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Zefan Li <lizefan.x@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-06 18:46:26 +00:00
|
|
|
|
2024-02-27 17:42:39 +00:00
|
|
|
void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages);
|
2023-10-06 18:46:27 +00:00
|
|
|
void mem_cgroup_replace_folio(struct folio *old, struct folio *new);
|
2021-05-06 22:14:59 +00:00
|
|
|
void mem_cgroup_migrate(struct folio *old, struct folio *new);
|
2009-12-16 00:47:03 +00:00
|
|
|
|
2016-07-28 22:45:10 +00:00
|
|
|
/**
|
2019-12-01 01:55:34 +00:00
|
|
|
* mem_cgroup_lruvec - get the lru list vector for a memcg & node
|
2016-07-28 22:45:10 +00:00
|
|
|
* @memcg: memcg of the wanted lruvec
|
2020-12-18 22:01:41 +00:00
|
|
|
* @pgdat: pglist_data
|
2016-07-28 22:45:10 +00:00
|
|
|
*
|
2019-12-01 01:55:34 +00:00
|
|
|
* Returns the lru list vector holding pages for a given @memcg &
|
2020-12-18 22:01:41 +00:00
|
|
|
* @pgdat combination. This can be the node lruvec, if the memory
|
2019-12-01 01:55:34 +00:00
|
|
|
* controller is disabled.
|
2016-07-28 22:45:10 +00:00
|
|
|
*/
|
2019-12-01 01:55:34 +00:00
|
|
|
static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg,
|
|
|
|
struct pglist_data *pgdat)
|
2016-07-28 22:45:10 +00:00
|
|
|
{
|
2016-07-28 22:46:05 +00:00
|
|
|
struct mem_cgroup_per_node *mz;
|
2016-07-28 22:45:10 +00:00
|
|
|
struct lruvec *lruvec;
|
|
|
|
|
|
|
|
if (mem_cgroup_disabled()) {
|
2019-12-01 01:55:34 +00:00
|
|
|
lruvec = &pgdat->__lruvec;
|
2016-07-28 22:45:10 +00:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2019-12-01 01:55:52 +00:00
|
|
|
if (!memcg)
|
|
|
|
memcg = root_mem_cgroup;
|
|
|
|
|
2021-04-30 05:56:14 +00:00
|
|
|
mz = memcg->nodeinfo[pgdat->node_id];
|
2016-07-28 22:45:10 +00:00
|
|
|
lruvec = &mz->lruvec;
|
|
|
|
out:
|
|
|
|
/*
|
|
|
|
* Since a node can be onlined after the mem_cgroup was created,
|
2016-07-28 22:45:31 +00:00
|
|
|
* we have to be prepared to initialize lruvec->pgdat here;
|
2016-07-28 22:45:10 +00:00
|
|
|
* and if offlined then reonlined, we need to reinitialize it.
|
|
|
|
*/
|
2016-07-28 22:46:05 +00:00
|
|
|
if (unlikely(lruvec->pgdat != pgdat))
|
|
|
|
lruvec->pgdat = pgdat;
|
2016-07-28 22:45:10 +00:00
|
|
|
return lruvec;
|
|
|
|
}
|
|
|
|
|
2020-12-18 22:01:41 +00:00
|
|
|
/**
|
2021-06-29 00:00:28 +00:00
|
|
|
* folio_lruvec - return lruvec for isolating/putting an LRU folio
|
|
|
|
* @folio: Pointer to the folio.
|
2020-12-18 22:01:41 +00:00
|
|
|
*
|
2021-06-29 00:00:28 +00:00
|
|
|
* This function relies on folio->mem_cgroup being stable.
|
2020-12-18 22:01:41 +00:00
|
|
|
*/
|
2021-06-29 00:00:28 +00:00
|
|
|
static inline struct lruvec *folio_lruvec(struct folio *folio)
|
2020-12-18 22:01:41 +00:00
|
|
|
{
|
2021-06-29 00:00:28 +00:00
|
|
|
struct mem_cgroup *memcg = folio_memcg(folio);
|
2020-12-18 22:01:41 +00:00
|
|
|
|
2021-06-29 00:00:28 +00:00
|
|
|
VM_WARN_ON_ONCE_FOLIO(!memcg && !mem_cgroup_disabled(), folio);
|
|
|
|
return mem_cgroup_lruvec(memcg, folio_pgdat(folio));
|
2020-12-18 22:01:41 +00:00
|
|
|
}
|
2008-07-25 08:47:15 +00:00
|
|
|
|
2015-09-08 22:01:07 +00:00
|
|
|
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
|
2015-09-09 22:35:35 +00:00
|
|
|
|
fs: fsnotify: account fsnotify metadata to kmemcg
Patch series "Directed kmem charging", v8.
The Linux kernel's memory cgroup allows limiting the memory usage of the
jobs running on the system to provide isolation between the jobs. All
the kernel memory allocated in the context of the job and marked with
__GFP_ACCOUNT will also be included in the memory usage and be limited
by the job's limit.
The kernel memory can only be charged to the memcg of the process in
whose context kernel memory was allocated. However there are cases
where the allocated kernel memory should be charged to the memcg
different from the current processes's memcg. This patch series
contains two such concrete use-cases i.e. fsnotify and buffer_head.
The fsnotify event objects can consume a lot of system memory for large
or unlimited queues if there is either no or slow listener. The events
are allocated in the context of the event producer. However they should
be charged to the event consumer. Similarly the buffer_head objects can
be allocated in a memcg different from the memcg of the page for which
buffer_head objects are being allocated.
To solve this issue, this patch series introduces mechanism to charge
kernel memory to a given memcg. In case of fsnotify events, the memcg
of the consumer can be used for charging and for buffer_head, the memcg
of the page can be charged. For directed charging, the caller can use
the scope API memalloc_[un]use_memcg() to specify the memcg to charge
for all the __GFP_ACCOUNT allocations within the scope.
This patch (of 2):
A lot of memory can be consumed by the events generated for the huge or
unlimited queues if there is either no or slow listener. This can cause
system level memory pressure or OOMs. So, it's better to account the
fsnotify kmem caches to the memcg of the listener.
However the listener can be in a different memcg than the memcg of the
producer and these allocations happen in the context of the event
producer. This patch introduces remote memcg charging API which the
producer can use to charge the allocations to the memcg of the listener.
There are seven fsnotify kmem caches and among them allocations from
dnotify_struct_cache, dnotify_mark_cache, fanotify_mark_cache and
inotify_inode_mark_cachep happens in the context of syscall from the
listener. So, SLAB_ACCOUNT is enough for these caches.
The objects from fsnotify_mark_connector_cachep are not accounted as
they are small compared to the notification mark or events and it is
unclear whom to account connector to since it is shared by all events
attached to the inode.
The allocations from the event caches happen in the context of the event
producer. For such caches we will need to remote charge the allocations
to the listener's memcg. Thus we save the memcg reference in the
fsnotify_group structure of the listener.
This patch has also moved the members of fsnotify_group to keep the size
same, at least for 64 bit build, even with additional member by filling
the holes.
[shakeelb@google.com: use GFP_KERNEL_ACCOUNT rather than open-coding it]
Link: http://lkml.kernel.org/r/20180702215439.211597-1-shakeelb@google.com
Link: http://lkml.kernel.org/r/20180627191250.209150-2-shakeelb@google.com
Signed-off-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Amir Goldstein <amir73il@gmail.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-08-17 22:46:39 +00:00
|
|
|
struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);
|
|
|
|
|
memcontrol: add helpers for hugetlb memcg accounting
Patch series "hugetlb memcg accounting", v4.
Currently, hugetlb memory usage is not acounted for in the memory
controller, which could lead to memory overprotection for cgroups with
hugetlb-backed memory. This has been observed in our production system.
For instance, here is one of our usecases: suppose there are two 32G
containers. The machine is booted with hugetlb_cma=6G, and each container
may or may not use up to 3 gigantic page, depending on the workload within
it. The rest is anon, cache, slab, etc. We can set the hugetlb cgroup
limit of each cgroup to 3G to enforce hugetlb fairness. But it is very
difficult to configure memory.max to keep overall consumption, including
anon, cache, slab etcetera fair.
What we have had to resort to is to constantly poll hugetlb usage and
readjust memory.max. Similar procedure is done to other memory limits
(memory.low for e.g). However, this is rather cumbersome and buggy.
Furthermore, when there is a delay in memory limits correction, (for e.g
when hugetlb usage changes within consecutive runs of the userspace
agent), the system could be in an over/underprotected state.
This patch series rectifies this issue by charging the memcg when the
hugetlb folio is allocated, and uncharging when the folio is freed. In
addition, a new selftest is added to demonstrate and verify this new
behavior.
This patch (of 4):
This patch exposes charge committing and cancelling as parts of the memory
controller interface. These functionalities are useful when the
try_charge() and commit_charge() stages have to be separated by other
actions in between (which can fail). One such example is the new hugetlb
accounting behavior in the following patch.
The patch also adds a helper function to obtain a reference to the
current task's memcg.
Link: https://lkml.kernel.org/r/20231006184629.155543-1-nphamcs@gmail.com
Link: https://lkml.kernel.org/r/20231006184629.155543-2-nphamcs@gmail.com
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Frank van der Linden <fvdl@google.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Rik van Riel <riel@surriel.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Tejun heo <tj@kernel.org>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Zefan Li <lizefan.x@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-06 18:46:26 +00:00
|
|
|
struct mem_cgroup *get_mem_cgroup_from_current(void);
|
|
|
|
|
mm,memcg: provide per-cgroup counters for NUMA balancing operations
The ability to observe the demotion and promotion decisions made by the
kernel on a per-cgroup basis is important for monitoring and tuning
containerized workloads on machines equipped with tiered memory.
Different containers in the system may experience drastically different
memory tiering actions that cannot be distinguished from the global
counters alone.
For example, a container running a workload that has a much hotter memory
accesses will likely see more promotions and fewer demotions, potentially
depriving a colocated container of top tier memory to such an extent that
its performance degrades unacceptably.
For another example, some containers may exhibit longer periods between
data reuse, causing much more numa_hint_faults than numa_pages_migrated.
In this case, tuning hot_threshold_ms may be appropriate, but the signal
can easily be lost if only global counters are available.
In the long term, we hope to introduce per-cgroup control of promotion and
demotion actions to implement memory placement policies in tiering.
This patch set adds seven counters to memory.stat in a cgroup:
numa_pages_migrated, numa_pte_updates, numa_hint_faults, pgdemote_kswapd,
pgdemote_khugepaged, pgdemote_direct and pgpromote_success. pgdemote_*
and pgpromote_success are also available in memory.numa_stat.
count_memcg_events_mm() is added to count multiple event occurrences at
once, and get_mem_cgroup_from_folio() is added because we need to get a
reference to the memcg of a folio before it's migrated to track
numa_pages_migrated. The accounting of PGDEMOTE_* is moved to
shrink_inactive_list() before being changed to per-cgroup.
[kaiyang2@cs.cmu.edu: add documentation of the memcg counters in cgroup-v2.rst]
Link: https://lkml.kernel.org/r/20240814235122.252309-1-kaiyang2@cs.cmu.edu
Link: https://lkml.kernel.org/r/20240814174227.30639-1-kaiyang2@cs.cmu.edu
Signed-off-by: Kaiyang Zhao <kaiyang2@cs.cmu.edu>
Cc: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Wei Xu <weixugc@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-14 17:42:27 +00:00
|
|
|
struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio);
|
|
|
|
|
2021-06-29 01:59:47 +00:00
|
|
|
struct lruvec *folio_lruvec_lock(struct folio *folio);
|
|
|
|
struct lruvec *folio_lruvec_lock_irq(struct folio *folio);
|
|
|
|
struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
|
2020-12-15 20:34:29 +00:00
|
|
|
unsigned long *flags);
|
|
|
|
|
|
|
|
#ifdef CONFIG_DEBUG_VM
|
2021-06-29 01:59:47 +00:00
|
|
|
void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio);
|
2020-12-15 20:34:29 +00:00
|
|
|
#else
|
2021-06-29 01:59:47 +00:00
|
|
|
static inline
|
|
|
|
void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
|
2020-12-15 20:34:29 +00:00
|
|
|
{
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2015-09-08 22:01:02 +00:00
|
|
|
static inline
|
|
|
|
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
|
|
|
|
return css ? container_of(css, struct mem_cgroup, css) : NULL;
|
|
|
|
}
|
|
|
|
|
2020-08-07 06:20:49 +00:00
|
|
|
static inline bool obj_cgroup_tryget(struct obj_cgroup *objcg)
|
|
|
|
{
|
|
|
|
return percpu_ref_tryget(&objcg->refcnt);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void obj_cgroup_get(struct obj_cgroup *objcg)
|
|
|
|
{
|
|
|
|
percpu_ref_get(&objcg->refcnt);
|
|
|
|
}
|
|
|
|
|
mm: memcontrol: use obj_cgroup APIs to charge kmem pages
Since Roman's series "The new cgroup slab memory controller" applied.
All slab objects are charged via the new APIs of obj_cgroup. The new
APIs introduce a struct obj_cgroup to charge slab objects. It prevents
long-living objects from pinning the original memory cgroup in the
memory. But there are still some corner objects (e.g. allocations
larger than order-1 page on SLUB) which are not charged via the new
APIs. Those objects (include the pages which are allocated from buddy
allocator directly) are charged as kmem pages which still hold a
reference to the memory cgroup.
We want to reuse the obj_cgroup APIs to charge the kmem pages. If we do
that, we should store an object cgroup pointer to page->memcg_data for
the kmem pages.
Finally, page->memcg_data will have 3 different meanings.
1) For the slab pages, page->memcg_data points to an object cgroups
vector.
2) For the kmem pages (exclude the slab pages), page->memcg_data
points to an object cgroup.
3) For the user pages (e.g. the LRU pages), page->memcg_data points
to a memory cgroup.
We do not change the behavior of page_memcg() and page_memcg_rcu(). They
are also suitable for LRU pages and kmem pages. Why?
Because memory allocations pinning memcgs for a long time - it exists at a
larger scale and is causing recurring problems in the real world: page
cache doesn't get reclaimed for a long time, or is used by the second,
third, fourth, ... instance of the same job that was restarted into a new
cgroup every time. Unreclaimable dying cgroups pile up, waste memory, and
make page reclaim very inefficient.
We can convert LRU pages and most other raw memcg pins to the objcg
direction to fix this problem, and then the page->memcg will always point
to an object cgroup pointer. At that time, LRU pages and kmem pages will
be treated the same. The implementation of page_memcg() will remove the
kmem page check.
This patch aims to charge the kmem pages by using the new APIs of
obj_cgroup. Finally, the page->memcg_data of the kmem page points to an
object cgroup. We can use the __page_objcg() to get the object cgroup
associated with a kmem page. Or we can use page_memcg() to get the memory
cgroup associated with a kmem page, but caller must ensure that the
returned memcg won't be released (e.g. acquire the rcu_read_lock or
css_set_lock).
Link: https://lkml.kernel.org/r/20210401030141.37061-1-songmuchun@bytedance.com
Link: https://lkml.kernel.org/r/20210319163821.20704-6-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
[songmuchun@bytedance.com: fix forget to obtain the ref to objcg in split_page_memcg]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-04-30 05:56:52 +00:00
|
|
|
static inline void obj_cgroup_get_many(struct obj_cgroup *objcg,
|
|
|
|
unsigned long nr)
|
2020-08-07 06:20:49 +00:00
|
|
|
{
|
mm: memcontrol: use obj_cgroup APIs to charge kmem pages
Since Roman's series "The new cgroup slab memory controller" applied.
All slab objects are charged via the new APIs of obj_cgroup. The new
APIs introduce a struct obj_cgroup to charge slab objects. It prevents
long-living objects from pinning the original memory cgroup in the
memory. But there are still some corner objects (e.g. allocations
larger than order-1 page on SLUB) which are not charged via the new
APIs. Those objects (include the pages which are allocated from buddy
allocator directly) are charged as kmem pages which still hold a
reference to the memory cgroup.
We want to reuse the obj_cgroup APIs to charge the kmem pages. If we do
that, we should store an object cgroup pointer to page->memcg_data for
the kmem pages.
Finally, page->memcg_data will have 3 different meanings.
1) For the slab pages, page->memcg_data points to an object cgroups
vector.
2) For the kmem pages (exclude the slab pages), page->memcg_data
points to an object cgroup.
3) For the user pages (e.g. the LRU pages), page->memcg_data points
to a memory cgroup.
We do not change the behavior of page_memcg() and page_memcg_rcu(). They
are also suitable for LRU pages and kmem pages. Why?
Because memory allocations pinning memcgs for a long time - it exists at a
larger scale and is causing recurring problems in the real world: page
cache doesn't get reclaimed for a long time, or is used by the second,
third, fourth, ... instance of the same job that was restarted into a new
cgroup every time. Unreclaimable dying cgroups pile up, waste memory, and
make page reclaim very inefficient.
We can convert LRU pages and most other raw memcg pins to the objcg
direction to fix this problem, and then the page->memcg will always point
to an object cgroup pointer. At that time, LRU pages and kmem pages will
be treated the same. The implementation of page_memcg() will remove the
kmem page check.
This patch aims to charge the kmem pages by using the new APIs of
obj_cgroup. Finally, the page->memcg_data of the kmem page points to an
object cgroup. We can use the __page_objcg() to get the object cgroup
associated with a kmem page. Or we can use page_memcg() to get the memory
cgroup associated with a kmem page, but caller must ensure that the
returned memcg won't be released (e.g. acquire the rcu_read_lock or
css_set_lock).
Link: https://lkml.kernel.org/r/20210401030141.37061-1-songmuchun@bytedance.com
Link: https://lkml.kernel.org/r/20210319163821.20704-6-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
[songmuchun@bytedance.com: fix forget to obtain the ref to objcg in split_page_memcg]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-04-30 05:56:52 +00:00
|
|
|
percpu_ref_get_many(&objcg->refcnt, nr);
|
2020-08-07 06:20:49 +00:00
|
|
|
}
|
|
|
|
|
mm: memcontrol: use obj_cgroup APIs to charge kmem pages
Since Roman's series "The new cgroup slab memory controller" applied.
All slab objects are charged via the new APIs of obj_cgroup. The new
APIs introduce a struct obj_cgroup to charge slab objects. It prevents
long-living objects from pinning the original memory cgroup in the
memory. But there are still some corner objects (e.g. allocations
larger than order-1 page on SLUB) which are not charged via the new
APIs. Those objects (include the pages which are allocated from buddy
allocator directly) are charged as kmem pages which still hold a
reference to the memory cgroup.
We want to reuse the obj_cgroup APIs to charge the kmem pages. If we do
that, we should store an object cgroup pointer to page->memcg_data for
the kmem pages.
Finally, page->memcg_data will have 3 different meanings.
1) For the slab pages, page->memcg_data points to an object cgroups
vector.
2) For the kmem pages (exclude the slab pages), page->memcg_data
points to an object cgroup.
3) For the user pages (e.g. the LRU pages), page->memcg_data points
to a memory cgroup.
We do not change the behavior of page_memcg() and page_memcg_rcu(). They
are also suitable for LRU pages and kmem pages. Why?
Because memory allocations pinning memcgs for a long time - it exists at a
larger scale and is causing recurring problems in the real world: page
cache doesn't get reclaimed for a long time, or is used by the second,
third, fourth, ... instance of the same job that was restarted into a new
cgroup every time. Unreclaimable dying cgroups pile up, waste memory, and
make page reclaim very inefficient.
We can convert LRU pages and most other raw memcg pins to the objcg
direction to fix this problem, and then the page->memcg will always point
to an object cgroup pointer. At that time, LRU pages and kmem pages will
be treated the same. The implementation of page_memcg() will remove the
kmem page check.
This patch aims to charge the kmem pages by using the new APIs of
obj_cgroup. Finally, the page->memcg_data of the kmem page points to an
object cgroup. We can use the __page_objcg() to get the object cgroup
associated with a kmem page. Or we can use page_memcg() to get the memory
cgroup associated with a kmem page, but caller must ensure that the
returned memcg won't be released (e.g. acquire the rcu_read_lock or
css_set_lock).
Link: https://lkml.kernel.org/r/20210401030141.37061-1-songmuchun@bytedance.com
Link: https://lkml.kernel.org/r/20210319163821.20704-6-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
[songmuchun@bytedance.com: fix forget to obtain the ref to objcg in split_page_memcg]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-04-30 05:56:52 +00:00
|
|
|
static inline void obj_cgroup_put(struct obj_cgroup *objcg)
|
2020-08-07 06:20:49 +00:00
|
|
|
{
|
2024-03-16 01:58:03 +00:00
|
|
|
if (objcg)
|
|
|
|
percpu_ref_put(&objcg->refcnt);
|
2020-08-07 06:20:49 +00:00
|
|
|
}
|
|
|
|
|
mm: multi-gen LRU: per-node lru_gen_folio lists
For each node, memcgs are divided into two generations: the old and
the young. For each generation, memcgs are randomly sharded into
multiple bins to improve scalability. For each bin, an RCU hlist_nulls
is virtually divided into three segments: the head, the tail and the
default.
An onlining memcg is added to the tail of a random bin in the old
generation. The eviction starts at the head of a random bin in the old
generation. The per-node memcg generation counter, whose reminder (mod
2) indexes the old generation, is incremented when all its bins become
empty.
There are four operations:
1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in
its current generation (old or young) and updates its "seg" to
"head";
2. MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in
its current generation (old or young) and updates its "seg" to
"tail";
3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in
the old generation, updates its "gen" to "old" and resets its "seg"
to "default";
4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin
in the young generation, updates its "gen" to "young" and resets
its "seg" to "default".
The events that trigger the above operations are:
1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD;
2. The first attempt to reclaim an memcg below low, which triggers
MEMCG_LRU_TAIL;
3. The first attempt to reclaim an memcg below reclaimable size
threshold, which triggers MEMCG_LRU_TAIL;
4. The second attempt to reclaim an memcg below reclaimable size
threshold, which triggers MEMCG_LRU_YOUNG;
5. Attempting to reclaim an memcg below min, which triggers
MEMCG_LRU_YOUNG;
6. Finishing the aging on the eviction path, which triggers
MEMCG_LRU_YOUNG;
7. Offlining an memcg, which triggers MEMCG_LRU_OLD.
Note that memcg LRU only applies to global reclaim, and the
round-robin incrementing of their max_seq counters ensures the
eventual fairness to all eligible memcgs. For memcg reclaim, it still
relies on mem_cgroup_iter().
Link: https://lkml.kernel.org/r/20221222041905.2431096-7-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-12-22 04:19:04 +00:00
|
|
|
static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
|
|
|
|
{
|
|
|
|
return !memcg || css_tryget(&memcg->css);
|
|
|
|
}
|
|
|
|
|
2023-11-30 19:40:19 +00:00
|
|
|
static inline bool mem_cgroup_tryget_online(struct mem_cgroup *memcg)
|
|
|
|
{
|
|
|
|
return !memcg || css_tryget_online(&memcg->css);
|
|
|
|
}
|
|
|
|
|
2018-08-17 22:46:36 +00:00
|
|
|
static inline void mem_cgroup_put(struct mem_cgroup *memcg)
|
|
|
|
{
|
fs: fsnotify: account fsnotify metadata to kmemcg
Patch series "Directed kmem charging", v8.
The Linux kernel's memory cgroup allows limiting the memory usage of the
jobs running on the system to provide isolation between the jobs. All
the kernel memory allocated in the context of the job and marked with
__GFP_ACCOUNT will also be included in the memory usage and be limited
by the job's limit.
The kernel memory can only be charged to the memcg of the process in
whose context kernel memory was allocated. However there are cases
where the allocated kernel memory should be charged to the memcg
different from the current processes's memcg. This patch series
contains two such concrete use-cases i.e. fsnotify and buffer_head.
The fsnotify event objects can consume a lot of system memory for large
or unlimited queues if there is either no or slow listener. The events
are allocated in the context of the event producer. However they should
be charged to the event consumer. Similarly the buffer_head objects can
be allocated in a memcg different from the memcg of the page for which
buffer_head objects are being allocated.
To solve this issue, this patch series introduces mechanism to charge
kernel memory to a given memcg. In case of fsnotify events, the memcg
of the consumer can be used for charging and for buffer_head, the memcg
of the page can be charged. For directed charging, the caller can use
the scope API memalloc_[un]use_memcg() to specify the memcg to charge
for all the __GFP_ACCOUNT allocations within the scope.
This patch (of 2):
A lot of memory can be consumed by the events generated for the huge or
unlimited queues if there is either no or slow listener. This can cause
system level memory pressure or OOMs. So, it's better to account the
fsnotify kmem caches to the memcg of the listener.
However the listener can be in a different memcg than the memcg of the
producer and these allocations happen in the context of the event
producer. This patch introduces remote memcg charging API which the
producer can use to charge the allocations to the memcg of the listener.
There are seven fsnotify kmem caches and among them allocations from
dnotify_struct_cache, dnotify_mark_cache, fanotify_mark_cache and
inotify_inode_mark_cachep happens in the context of syscall from the
listener. So, SLAB_ACCOUNT is enough for these caches.
The objects from fsnotify_mark_connector_cachep are not accounted as
they are small compared to the notification mark or events and it is
unclear whom to account connector to since it is shared by all events
attached to the inode.
The allocations from the event caches happen in the context of the event
producer. For such caches we will need to remote charge the allocations
to the listener's memcg. Thus we save the memcg reference in the
fsnotify_group structure of the listener.
This patch has also moved the members of fsnotify_group to keep the size
same, at least for 64 bit build, even with additional member by filling
the holes.
[shakeelb@google.com: use GFP_KERNEL_ACCOUNT rather than open-coding it]
Link: http://lkml.kernel.org/r/20180702215439.211597-1-shakeelb@google.com
Link: http://lkml.kernel.org/r/20180627191250.209150-2-shakeelb@google.com
Signed-off-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Amir Goldstein <amir73il@gmail.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-08-17 22:46:39 +00:00
|
|
|
if (memcg)
|
|
|
|
css_put(&memcg->css);
|
2018-08-17 22:46:36 +00:00
|
|
|
}
|
|
|
|
|
2016-01-14 23:21:32 +00:00
|
|
|
#define mem_cgroup_from_counter(counter, member) \
|
|
|
|
container_of(counter, struct mem_cgroup, member)
|
|
|
|
|
2015-09-08 22:01:02 +00:00
|
|
|
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
|
|
|
|
struct mem_cgroup *,
|
|
|
|
struct mem_cgroup_reclaim_cookie *);
|
|
|
|
void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
|
2023-06-16 06:30:30 +00:00
|
|
|
void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
|
|
|
|
int (*)(struct task_struct *, void *), void *arg);
|
2015-09-08 22:01:02 +00:00
|
|
|
|
2016-03-15 21:57:16 +00:00
|
|
|
static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
|
|
|
|
{
|
|
|
|
if (mem_cgroup_disabled())
|
|
|
|
return 0;
|
|
|
|
|
2016-07-20 22:44:57 +00:00
|
|
|
return memcg->id.id;
|
2016-03-15 21:57:16 +00:00
|
|
|
}
|
2016-07-20 22:44:57 +00:00
|
|
|
struct mem_cgroup *mem_cgroup_from_id(unsigned short id);
|
2016-03-15 21:57:16 +00:00
|
|
|
|
mm: memcontrol: introduce mem_cgroup_ino() and mem_cgroup_get_from_ino()
Patch series "mm: introduce shrinker debugfs interface", v5.
The only existing debugging mechanism is a couple of tracepoints in
do_shrink_slab(): mm_shrink_slab_start and mm_shrink_slab_end. They
aren't covering everything though: shrinkers which report 0 objects will
never show up, there is no support for memcg-aware shrinkers. Shrinkers
are identified by their scan function, which is not always enough (e.g.
hard to guess which super block's shrinker it is having only
"super_cache_scan").
To provide a better visibility and debug options for memory shrinkers this
patchset introduces a /sys/kernel/debug/shrinker interface, to some extent
similar to /sys/kernel/slab.
For each shrinker registered in the system a directory is created. As
now, the directory will contain only a "scan" file, which allows to get
the number of managed objects for each memory cgroup (for memcg-aware
shrinkers) and each numa node (for numa-aware shrinkers on a numa
machine). Other interfaces might be added in the future.
To make debugging more pleasant, the patchset also names all shrinkers, so
that debugfs entries can have meaningful names.
This patch (of 5):
Shrinker debugfs requires a way to represent memory cgroups without using
full paths, both for displaying information and getting input from a user.
Cgroup inode number is a perfect way, already used by bpf.
This commit adds a couple of helper functions which will be used to handle
memcg-aware shrinkers.
Link: https://lkml.kernel.org/r/20220601032227.4076670-1-roman.gushchin@linux.dev
Link: https://lkml.kernel.org/r/20220601032227.4076670-2-roman.gushchin@linux.dev
Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
Acked-by: Muchun Song <songmuchun@bytedance.com>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-06-01 03:22:22 +00:00
|
|
|
#ifdef CONFIG_SHRINKER_DEBUG
|
|
|
|
static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg)
|
|
|
|
{
|
|
|
|
return memcg ? cgroup_ino(memcg->css.cgroup) : 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino);
|
|
|
|
#endif
|
|
|
|
|
2019-03-05 23:45:52 +00:00
|
|
|
static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m)
|
|
|
|
{
|
|
|
|
return mem_cgroup_from_css(seq_css(m));
|
|
|
|
}
|
|
|
|
|
2017-07-06 22:40:25 +00:00
|
|
|
static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec)
|
|
|
|
{
|
|
|
|
struct mem_cgroup_per_node *mz;
|
|
|
|
|
|
|
|
if (mem_cgroup_disabled())
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
|
|
|
|
return mz->memcg;
|
|
|
|
}
|
|
|
|
|
2016-01-14 23:21:32 +00:00
|
|
|
/**
|
|
|
|
* parent_mem_cgroup - find the accounting parent of a memcg
|
|
|
|
* @memcg: memcg whose parent to find
|
|
|
|
*
|
2023-08-01 12:43:59 +00:00
|
|
|
* Returns the parent memcg, or NULL if this is the root.
|
2016-01-14 23:21:32 +00:00
|
|
|
*/
|
|
|
|
static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
|
|
|
|
{
|
2022-03-22 21:40:16 +00:00
|
|
|
return mem_cgroup_from_css(memcg->css.parent);
|
2016-01-14 23:21:32 +00:00
|
|
|
}
|
|
|
|
|
2015-09-08 22:01:02 +00:00
|
|
|
static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg,
|
|
|
|
struct mem_cgroup *root)
|
|
|
|
{
|
|
|
|
if (root == memcg)
|
|
|
|
return true;
|
|
|
|
return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup);
|
|
|
|
}
|
2011-12-11 21:47:03 +00:00
|
|
|
|
2014-12-10 23:44:33 +00:00
|
|
|
static inline bool mm_match_cgroup(struct mm_struct *mm,
|
|
|
|
struct mem_cgroup *memcg)
|
2009-01-08 02:08:07 +00:00
|
|
|
{
|
2012-10-08 23:34:12 +00:00
|
|
|
struct mem_cgroup *task_memcg;
|
2014-12-10 23:44:30 +00:00
|
|
|
bool match = false;
|
2012-05-29 22:06:25 +00:00
|
|
|
|
2009-01-08 02:08:07 +00:00
|
|
|
rcu_read_lock();
|
2012-10-08 23:34:12 +00:00
|
|
|
task_memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
|
2014-12-10 23:44:30 +00:00
|
|
|
if (task_memcg)
|
2014-12-10 23:44:33 +00:00
|
|
|
match = mem_cgroup_is_descendant(task_memcg, memcg);
|
2009-01-08 02:08:07 +00:00
|
|
|
rcu_read_unlock();
|
2012-05-29 22:06:25 +00:00
|
|
|
return match;
|
2009-01-08 02:08:07 +00:00
|
|
|
}
|
2008-02-07 08:13:53 +00:00
|
|
|
|
2023-01-16 19:25:07 +00:00
|
|
|
struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio);
|
memcg: add page_cgroup_ino helper
This patchset introduces a new user API for tracking user memory pages
that have not been used for a given period of time. The purpose of this
is to provide the userspace with the means of tracking a workload's
working set, i.e. the set of pages that are actively used by the
workload. Knowing the working set size can be useful for partitioning the
system more efficiently, e.g. by tuning memory cgroup limits
appropriately, or for job placement within a compute cluster.
==== USE CASES ====
The unified cgroup hierarchy has memory.low and memory.high knobs, which
are defined as the low and high boundaries for the workload working set
size. However, the working set size of a workload may be unknown or
change in time. With this patch set, one can periodically estimate the
amount of memory unused by each cgroup and tune their memory.low and
memory.high parameters accordingly, therefore optimizing the overall
memory utilization.
Another use case is balancing workloads within a compute cluster. Knowing
how much memory is not really used by a workload unit may help take a more
optimal decision when considering migrating the unit to another node
within the cluster.
Also, as noted by Minchan, this would be useful for per-process reclaim
(https://lwn.net/Articles/545668/). With idle tracking, we could reclaim idle
pages only by smart user memory manager.
==== USER API ====
The user API consists of two new files:
* /sys/kernel/mm/page_idle/bitmap. This file implements a bitmap where each
bit corresponds to a page, indexed by PFN. When the bit is set, the
corresponding page is idle. A page is considered idle if it has not been
accessed since it was marked idle. To mark a page idle one should set the
bit corresponding to the page by writing to the file. A value written to the
file is OR-ed with the current bitmap value. Only user memory pages can be
marked idle, for other page types input is silently ignored. Writing to this
file beyond max PFN results in the ENXIO error. Only available when
CONFIG_IDLE_PAGE_TRACKING is set.
This file can be used to estimate the amount of pages that are not
used by a particular workload as follows:
1. mark all pages of interest idle by setting corresponding bits in the
/sys/kernel/mm/page_idle/bitmap
2. wait until the workload accesses its working set
3. read /sys/kernel/mm/page_idle/bitmap and count the number of bits set
* /proc/kpagecgroup. This file contains a 64-bit inode number of the
memory cgroup each page is charged to, indexed by PFN. Only available when
CONFIG_MEMCG is set.
This file can be used to find all pages (including unmapped file pages)
accounted to a particular cgroup. Using /sys/kernel/mm/page_idle/bitmap, one
can then estimate the cgroup working set size.
For an example of using these files for estimating the amount of unused
memory pages per each memory cgroup, please see the script attached
below.
==== REASONING ====
The reason to introduce the new user API instead of using
/proc/PID/{clear_refs,smaps} is that the latter has two serious
drawbacks:
- it does not count unmapped file pages
- it affects the reclaimer logic
The new API attempts to overcome them both. For more details on how it
is achieved, please see the comment to patch 6.
==== PATCHSET STRUCTURE ====
The patch set is organized as follows:
- patch 1 adds page_cgroup_ino() helper for the sake of
/proc/kpagecgroup and patches 2-3 do related cleanup
- patch 4 adds /proc/kpagecgroup, which reports cgroup ino each page is
charged to
- patch 5 introduces a new mmu notifier callback, clear_young, which is
a lightweight version of clear_flush_young; it is used in patch 6
- patch 6 implements the idle page tracking feature, including the
userspace API, /sys/kernel/mm/page_idle/bitmap
- patch 7 exports idle flag via /proc/kpageflags
==== SIMILAR WORKS ====
Originally, the patch for tracking idle memory was proposed back in 2011
by Michel Lespinasse (see http://lwn.net/Articles/459269/). The main
difference between Michel's patch and this one is that Michel implemented
a kernel space daemon for estimating idle memory size per cgroup while
this patch only provides the userspace with the minimal API for doing the
job, leaving the rest up to the userspace. However, they both share the
same idea of Idle/Young page flags to avoid affecting the reclaimer logic.
==== PERFORMANCE EVALUATION ====
SPECjvm2008 (https://www.spec.org/jvm2008/) was used to evaluate the
performance impact introduced by this patch set. Three runs were carried
out:
- base: kernel without the patch
- patched: patched kernel, the feature is not used
- patched-active: patched kernel, 1 minute-period daemon is used for
tracking idle memory
For tracking idle memory, idlememstat utility was used:
https://github.com/locker/idlememstat
testcase base patched patched-active
compiler 537.40 ( 0.00)% 532.26 (-0.96)% 538.31 ( 0.17)%
compress 305.47 ( 0.00)% 301.08 (-1.44)% 300.71 (-1.56)%
crypto 284.32 ( 0.00)% 282.21 (-0.74)% 284.87 ( 0.19)%
derby 411.05 ( 0.00)% 413.44 ( 0.58)% 412.07 ( 0.25)%
mpegaudio 189.96 ( 0.00)% 190.87 ( 0.48)% 189.42 (-0.28)%
scimark.large 46.85 ( 0.00)% 46.41 (-0.94)% 47.83 ( 2.09)%
scimark.small 412.91 ( 0.00)% 415.41 ( 0.61)% 421.17 ( 2.00)%
serial 204.23 ( 0.00)% 213.46 ( 4.52)% 203.17 (-0.52)%
startup 36.76 ( 0.00)% 35.49 (-3.45)% 35.64 (-3.05)%
sunflow 115.34 ( 0.00)% 115.08 (-0.23)% 117.37 ( 1.76)%
xml 620.55 ( 0.00)% 619.95 (-0.10)% 620.39 (-0.03)%
composite 211.50 ( 0.00)% 211.15 (-0.17)% 211.67 ( 0.08)%
time idlememstat:
17.20user 65.16system 2:15:23elapsed 1%CPU (0avgtext+0avgdata 8476maxresident)k
448inputs+40outputs (1major+36052minor)pagefaults 0swaps
==== SCRIPT FOR COUNTING IDLE PAGES PER CGROUP ====
#! /usr/bin/python
#
import os
import stat
import errno
import struct
CGROUP_MOUNT = "/sys/fs/cgroup/memory"
BUFSIZE = 8 * 1024 # must be multiple of 8
def get_hugepage_size():
with open("/proc/meminfo", "r") as f:
for s in f:
k, v = s.split(":")
if k == "Hugepagesize":
return int(v.split()[0]) * 1024
PAGE_SIZE = os.sysconf("SC_PAGE_SIZE")
HUGEPAGE_SIZE = get_hugepage_size()
def set_idle():
f = open("/sys/kernel/mm/page_idle/bitmap", "wb", BUFSIZE)
while True:
try:
f.write(struct.pack("Q", pow(2, 64) - 1))
except IOError as err:
if err.errno == errno.ENXIO:
break
raise
f.close()
def count_idle():
f_flags = open("/proc/kpageflags", "rb", BUFSIZE)
f_cgroup = open("/proc/kpagecgroup", "rb", BUFSIZE)
with open("/sys/kernel/mm/page_idle/bitmap", "rb", BUFSIZE) as f:
while f.read(BUFSIZE): pass # update idle flag
idlememsz = {}
while True:
s1, s2 = f_flags.read(8), f_cgroup.read(8)
if not s1 or not s2:
break
flags, = struct.unpack('Q', s1)
cgino, = struct.unpack('Q', s2)
unevictable = (flags >> 18) & 1
huge = (flags >> 22) & 1
idle = (flags >> 25) & 1
if idle and not unevictable:
idlememsz[cgino] = idlememsz.get(cgino, 0) + \
(HUGEPAGE_SIZE if huge else PAGE_SIZE)
f_flags.close()
f_cgroup.close()
return idlememsz
if __name__ == "__main__":
print "Setting the idle flag for each page..."
set_idle()
raw_input("Wait until the workload accesses its working set, "
"then press Enter")
print "Counting idle pages..."
idlememsz = count_idle()
for dir, subdirs, files in os.walk(CGROUP_MOUNT):
ino = os.stat(dir)[stat.ST_INO]
print dir + ": " + str(idlememsz.get(ino, 0) / 1024) + " kB"
==== END SCRIPT ====
This patch (of 8):
Add page_cgroup_ino() helper to memcg.
This function returns the inode number of the closest online ancestor of
the memory cgroup a page is charged to. It is required for exporting
information about which page is charged to which cgroup to userspace,
which will be introduced by a following patch.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Reviewed-by: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Raghavendra K T <raghavendra.kt@linux.vnet.ibm.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Greg Thelen <gthelen@google.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-09-09 22:35:28 +00:00
|
|
|
ino_t page_cgroup_ino(struct page *page);
|
2009-12-16 11:19:59 +00:00
|
|
|
|
2016-01-20 23:03:02 +00:00
|
|
|
static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
|
|
|
|
{
|
|
|
|
if (mem_cgroup_disabled())
|
|
|
|
return true;
|
|
|
|
return !!(memcg->css.flags & CSS_ONLINE);
|
|
|
|
}
|
|
|
|
|
2015-09-08 22:01:02 +00:00
|
|
|
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
|
2017-01-11 00:58:04 +00:00
|
|
|
int zid, int nr_pages);
|
2015-09-08 22:01:02 +00:00
|
|
|
|
2017-01-11 00:58:04 +00:00
|
|
|
static inline
|
|
|
|
unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
|
|
|
|
enum lru_list lru, int zone_idx)
|
|
|
|
{
|
|
|
|
struct mem_cgroup_per_node *mz;
|
|
|
|
|
|
|
|
mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
|
2020-08-15 00:31:37 +00:00
|
|
|
return READ_ONCE(mz->lru_zone_size[zone_idx][lru]);
|
2015-09-08 22:01:02 +00:00
|
|
|
}
|
|
|
|
|
2023-09-14 15:21:39 +00:00
|
|
|
void mem_cgroup_handle_over_high(gfp_t gfp_mask);
|
memcg: punt high overage reclaim to return-to-userland path
Currently, try_charge() tries to reclaim memory synchronously when the
high limit is breached; however, if the allocation doesn't have
__GFP_WAIT, synchronous reclaim is skipped. If a process performs only
speculative allocations, it can blow way past the high limit. This is
actually easily reproducible by simply doing "find /". slab/slub
allocator tries speculative allocations first, so as long as there's
memory which can be consumed without blocking, it can keep allocating
memory regardless of the high limit.
This patch makes try_charge() always punt the over-high reclaim to the
return-to-userland path. If try_charge() detects that high limit is
breached, it adds the overage to current->memcg_nr_pages_over_high and
schedules execution of mem_cgroup_handle_over_high() which performs
synchronous reclaim from the return-to-userland path.
As long as kernel doesn't have a run-away allocation spree, this should
provide enough protection while making kmemcg behave more consistently.
It also has the following benefits.
- All over-high reclaims can use GFP_KERNEL regardless of the specific
gfp mask in use, e.g. GFP_NOFS, when the limit was breached.
- It copes with prio inversion. Previously, a low-prio task with
small memory.high might perform over-high reclaim with a bunch of
locks held. If a higher prio task needed any of these locks, it
would have to wait until the low prio task finished reclaim and
released the locks. By handing over-high reclaim to the task exit
path this issue can be avoided.
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Michal Hocko <mhocko@kernel.org>
Reviewed-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-11-06 02:46:11 +00:00
|
|
|
|
2018-06-08 00:06:18 +00:00
|
|
|
unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg);
|
2016-10-07 23:57:23 +00:00
|
|
|
|
mm, memcg: proportional memory.{low,min} reclaim
cgroup v2 introduces two memory protection thresholds: memory.low
(best-effort) and memory.min (hard protection). While they generally do
what they say on the tin, there is a limitation in their implementation
that makes them difficult to use effectively: that cliff behaviour often
manifests when they become eligible for reclaim. This patch implements
more intuitive and usable behaviour, where we gradually mount more
reclaim pressure as cgroups further and further exceed their protection
thresholds.
This cliff edge behaviour happens because we only choose whether or not
to reclaim based on whether the memcg is within its protection limits
(see the use of mem_cgroup_protected in shrink_node), but we don't vary
our reclaim behaviour based on this information. Imagine the following
timeline, with the numbers the lruvec size in this zone:
1. memory.low=1000000, memory.current=999999. 0 pages may be scanned.
2. memory.low=1000000, memory.current=1000000. 0 pages may be scanned.
3. memory.low=1000000, memory.current=1000001. 1000001* pages may be
scanned. (?!)
* Of course, we won't usually scan all available pages in the zone even
without this patch because of scan control priority, over-reclaim
protection, etc. However, as shown by the tests at the end, these
techniques don't sufficiently throttle such an extreme change in input,
so cliff-like behaviour isn't really averted by their existence alone.
Here's an example of how this plays out in practice. At Facebook, we are
trying to protect various workloads from "system" software, like
configuration management tools, metric collectors, etc (see this[0] case
study). In order to find a suitable memory.low value, we start by
determining the expected memory range within which the workload will be
comfortable operating. This isn't an exact science -- memory usage deemed
"comfortable" will vary over time due to user behaviour, differences in
composition of work, etc, etc. As such we need to ballpark memory.low,
but doing this is currently problematic:
1. If we end up setting it too low for the workload, it won't have
*any* effect (see discussion above). The group will receive the full
weight of reclaim and won't have any priority while competing with the
less important system software, as if we had no memory.low configured
at all.
2. Because of this behaviour, we end up erring on the side of setting
it too high, such that the comfort range is reliably covered. However,
protected memory is completely unavailable to the rest of the system,
so we might cause undue memory and IO pressure there when we *know* we
have some elasticity in the workload.
3. Even if we get the value totally right, smack in the middle of the
comfort zone, we get extreme jumps between no pressure and full
pressure that cause unpredictable pressure spikes in the workload due
to the current binary reclaim behaviour.
With this patch, we can set it to our ballpark estimation without too much
worry. Any undesirable behaviour, such as too much or too little reclaim
pressure on the workload or system will be proportional to how far our
estimation is off. This means we can set memory.low much more
conservatively and thus waste less resources *without* the risk of the
workload falling off a cliff if we overshoot.
As a more abstract technical description, this unintuitive behaviour
results in having to give high-priority workloads a large protection
buffer on top of their expected usage to function reliably, as otherwise
we have abrupt periods of dramatically increased memory pressure which
hamper performance. Having to set these thresholds so high wastes
resources and generally works against the principle of work conservation.
In addition, having proportional memory reclaim behaviour has other
benefits. Most notably, before this patch it's basically mandatory to set
memory.low to a higher than desirable value because otherwise as soon as
you exceed memory.low, all protection is lost, and all pages are eligible
to scan again. By contrast, having a gradual ramp in reclaim pressure
means that you now still get some protection when thresholds are exceeded,
which means that one can now be more comfortable setting memory.low to
lower values without worrying that all protection will be lost. This is
important because workingset size is really hard to know exactly,
especially with variable workloads, so at least getting *some* protection
if your workingset size grows larger than you expect increases user
confidence in setting memory.low without a huge buffer on top being
needed.
Thanks a lot to Johannes Weiner and Tejun Heo for their advice and
assistance in thinking about how to make this work better.
In testing these changes, I intended to verify that:
1. Changes in page scanning become gradual and proportional instead of
binary.
To test this, I experimented stepping further and further down
memory.low protection on a workload that floats around 19G workingset
when under memory.low protection, watching page scan rates for the
workload cgroup:
+------------+-----------------+--------------------+--------------+
| memory.low | test (pgscan/s) | control (pgscan/s) | % of control |
+------------+-----------------+--------------------+--------------+
| 21G | 0 | 0 | N/A |
| 17G | 867 | 3799 | 23% |
| 12G | 1203 | 3543 | 34% |
| 8G | 2534 | 3979 | 64% |
| 4G | 3980 | 4147 | 96% |
| 0 | 3799 | 3980 | 95% |
+------------+-----------------+--------------------+--------------+
As you can see, the test kernel (with a kernel containing this
patch) ramps up page scanning significantly more gradually than the
control kernel (without this patch).
2. More gradual ramp up in reclaim aggression doesn't result in
premature OOMs.
To test this, I wrote a script that slowly increments the number of
pages held by stress(1)'s --vm-keep mode until a production system
entered severe overall memory contention. This script runs in a highly
protected slice taking up the majority of available system memory.
Watching vmstat revealed that page scanning continued essentially
nominally between test and control, without causing forward reclaim
progress to become arrested.
[0]: https://facebookmicrosites.github.io/cgroup2/docs/overview.html#case-study-the-fbtax2-project
[akpm@linux-foundation.org: reflow block comments to fit in 80 cols]
[chris@chrisdown.name: handle cgroup_disable=memory when getting memcg protection]
Link: http://lkml.kernel.org/r/20190201045711.GA18302@chrisdown.name
Link: http://lkml.kernel.org/r/20190124014455.GA6396@chrisdown.name
Signed-off-by: Chris Down <chris@chrisdown.name>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-10-07 00:58:32 +00:00
|
|
|
unsigned long mem_cgroup_size(struct mem_cgroup *memcg);
|
|
|
|
|
mm, oom: add oom victim's memcg to the oom context information
The current oom report doesn't display victim's memcg context during the
global OOM situation. While this information is not strictly needed, it
can be really helpful for containerized environments to locate which
container has lost a process. Now that we have a single line for the oom
context, we can trivially add both the oom memcg (this can be either
global_oom or a specific memcg which hits its hard limits) and task_memcg
which is the victim's memcg.
Below is the single line output in the oom report after this patch.
- global oom context information:
oom-kill:constraint=<constraint>,nodemask=<nodemask>,cpuset=<cpuset>,mems_allowed=<mems_allowed>,global_oom,task_memcg=<memcg>,task=<comm>,pid=<pid>,uid=<uid>
- memcg oom context information:
oom-kill:constraint=<constraint>,nodemask=<nodemask>,cpuset=<cpuset>,mems_allowed=<mems_allowed>,oom_memcg=<memcg>,task_memcg=<memcg>,task=<comm>,pid=<pid>,uid=<uid>
[penguin-kernel@I-love.SAKURA.ne.jp: use pr_cont() in mem_cgroup_print_oom_context()]
Link: http://lkml.kernel.org/r/201812190723.wBJ7NdkN032628@www262.sakura.ne.jp
Link: http://lkml.kernel.org/r/1542799799-36184-2-git-send-email-ufo19890607@gmail.com
Signed-off-by: yuzhoujian <yuzhoujian@didichuxing.com>
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: David Rientjes <rientjes@google.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Cc: Roman Gushchin <guro@fb.com>
Cc: Yang Shi <yang.s@alibaba-inc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-12-28 08:36:10 +00:00
|
|
|
void mem_cgroup_print_oom_context(struct mem_cgroup *memcg,
|
2015-09-08 22:01:07 +00:00
|
|
|
struct task_struct *p);
|
2008-02-07 08:14:32 +00:00
|
|
|
|
mm, oom: add oom victim's memcg to the oom context information
The current oom report doesn't display victim's memcg context during the
global OOM situation. While this information is not strictly needed, it
can be really helpful for containerized environments to locate which
container has lost a process. Now that we have a single line for the oom
context, we can trivially add both the oom memcg (this can be either
global_oom or a specific memcg which hits its hard limits) and task_memcg
which is the victim's memcg.
Below is the single line output in the oom report after this patch.
- global oom context information:
oom-kill:constraint=<constraint>,nodemask=<nodemask>,cpuset=<cpuset>,mems_allowed=<mems_allowed>,global_oom,task_memcg=<memcg>,task=<comm>,pid=<pid>,uid=<uid>
- memcg oom context information:
oom-kill:constraint=<constraint>,nodemask=<nodemask>,cpuset=<cpuset>,mems_allowed=<mems_allowed>,oom_memcg=<memcg>,task_memcg=<memcg>,task=<comm>,pid=<pid>,uid=<uid>
[penguin-kernel@I-love.SAKURA.ne.jp: use pr_cont() in mem_cgroup_print_oom_context()]
Link: http://lkml.kernel.org/r/201812190723.wBJ7NdkN032628@www262.sakura.ne.jp
Link: http://lkml.kernel.org/r/1542799799-36184-2-git-send-email-ufo19890607@gmail.com
Signed-off-by: yuzhoujian <yuzhoujian@didichuxing.com>
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: David Rientjes <rientjes@google.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Cc: Roman Gushchin <guro@fb.com>
Cc: Yang Shi <yang.s@alibaba-inc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-12-28 08:36:10 +00:00
|
|
|
void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg);
|
|
|
|
|
2018-08-22 04:53:54 +00:00
|
|
|
struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
|
|
|
|
struct mem_cgroup *oom_domain);
|
|
|
|
void mem_cgroup_print_oom_group(struct mem_cgroup *memcg);
|
mm: memcg: do not trap chargers with full callstack on OOM
The memcg OOM handling is incredibly fragile and can deadlock. When a
task fails to charge memory, it invokes the OOM killer and loops right
there in the charge code until it succeeds. Comparably, any other task
that enters the charge path at this point will go to a waitqueue right
then and there and sleep until the OOM situation is resolved. The problem
is that these tasks may hold filesystem locks and the mmap_sem; locks that
the selected OOM victim may need to exit.
For example, in one reported case, the task invoking the OOM killer was
about to charge a page cache page during a write(), which holds the
i_mutex. The OOM killer selected a task that was just entering truncate()
and trying to acquire the i_mutex:
OOM invoking task:
mem_cgroup_handle_oom+0x241/0x3b0
mem_cgroup_cache_charge+0xbe/0xe0
add_to_page_cache_locked+0x4c/0x140
add_to_page_cache_lru+0x22/0x50
grab_cache_page_write_begin+0x8b/0xe0
ext3_write_begin+0x88/0x270
generic_file_buffered_write+0x116/0x290
__generic_file_aio_write+0x27c/0x480
generic_file_aio_write+0x76/0xf0 # takes ->i_mutex
do_sync_write+0xea/0x130
vfs_write+0xf3/0x1f0
sys_write+0x51/0x90
system_call_fastpath+0x18/0x1d
OOM kill victim:
do_truncate+0x58/0xa0 # takes i_mutex
do_last+0x250/0xa30
path_openat+0xd7/0x440
do_filp_open+0x49/0xa0
do_sys_open+0x106/0x240
sys_open+0x20/0x30
system_call_fastpath+0x18/0x1d
The OOM handling task will retry the charge indefinitely while the OOM
killed task is not releasing any resources.
A similar scenario can happen when the kernel OOM killer for a memcg is
disabled and a userspace task is in charge of resolving OOM situations.
In this case, ALL tasks that enter the OOM path will be made to sleep on
the OOM waitqueue and wait for userspace to free resources or increase
the group's limit. But a userspace OOM handler is prone to deadlock
itself on the locks held by the waiting tasks. For example one of the
sleeping tasks may be stuck in a brk() call with the mmap_sem held for
writing but the userspace handler, in order to pick an optimal victim,
may need to read files from /proc/<pid>, which tries to acquire the same
mmap_sem for reading and deadlocks.
This patch changes the way tasks behave after detecting a memcg OOM and
makes sure nobody loops or sleeps with locks held:
1. When OOMing in a user fault, invoke the OOM killer and restart the
fault instead of looping on the charge attempt. This way, the OOM
victim can not get stuck on locks the looping task may hold.
2. When OOMing in a user fault but somebody else is handling it
(either the kernel OOM killer or a userspace handler), don't go to
sleep in the charge context. Instead, remember the OOMing memcg in
the task struct and then fully unwind the page fault stack with
-ENOMEM. pagefault_out_of_memory() will then call back into the
memcg code to check if the -ENOMEM came from the memcg, and then
either put the task to sleep on the memcg's OOM waitqueue or just
restart the fault. The OOM victim can no longer get stuck on any
lock a sleeping task may hold.
Debugged by Michal Hocko.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: azurIt <azurit@pobox.sk>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-09-12 22:13:44 +00:00
|
|
|
|
2024-05-01 17:26:17 +00:00
|
|
|
void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
|
|
|
|
int val);
|
2017-05-03 21:55:03 +00:00
|
|
|
|
2017-09-06 23:22:09 +00:00
|
|
|
/* idx can be of type enum memcg_stat_item or node_stat_item */
|
2017-07-06 22:40:52 +00:00
|
|
|
static inline void mod_memcg_state(struct mem_cgroup *memcg,
|
2024-05-01 17:26:17 +00:00
|
|
|
enum memcg_stat_item idx, int val)
|
2017-05-03 21:55:03 +00:00
|
|
|
{
|
2018-02-21 22:45:24 +00:00
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
local_irq_save(flags);
|
2018-02-01 00:16:45 +00:00
|
|
|
__mod_memcg_state(memcg, idx, val);
|
2018-02-21 22:45:24 +00:00
|
|
|
local_irq_restore(flags);
|
2017-05-03 21:55:03 +00:00
|
|
|
}
|
|
|
|
|
2022-01-14 22:05:45 +00:00
|
|
|
static inline void mod_memcg_page_state(struct page *page,
|
2024-05-01 17:26:17 +00:00
|
|
|
enum memcg_stat_item idx, int val)
|
2022-01-14 22:05:45 +00:00
|
|
|
{
|
|
|
|
struct mem_cgroup *memcg;
|
|
|
|
|
|
|
|
if (mem_cgroup_disabled())
|
|
|
|
return;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
2024-05-24 01:49:50 +00:00
|
|
|
memcg = folio_memcg(page_folio(page));
|
2022-01-14 22:05:45 +00:00
|
|
|
if (memcg)
|
|
|
|
mod_memcg_state(memcg, idx, val);
|
|
|
|
rcu_read_unlock();
|
|
|
|
}
|
|
|
|
|
2022-09-07 04:35:35 +00:00
|
|
|
unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx);
|
2024-05-01 17:26:11 +00:00
|
|
|
unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx);
|
|
|
|
unsigned long lruvec_page_state_local(struct lruvec *lruvec,
|
|
|
|
enum node_stat_item idx);
|
2011-01-13 23:47:37 +00:00
|
|
|
|
mm: memcg: restore subtree stats flushing
Stats flushing for memcg currently follows the following rules:
- Always flush the entire memcg hierarchy (i.e. flush the root).
- Only one flusher is allowed at a time. If someone else tries to flush
concurrently, they skip and return immediately.
- A periodic flusher flushes all the stats every 2 seconds.
The reason this approach is followed is because all flushes are serialized
by a global rstat spinlock. On the memcg side, flushing is invoked from
userspace reads as well as in-kernel flushers (e.g. reclaim, refault,
etc). This approach aims to avoid serializing all flushers on the global
lock, which can cause a significant performance hit under high
concurrency.
This approach has the following problems:
- Occasionally a userspace read of the stats of a non-root cgroup will
be too expensive as it has to flush the entire hierarchy [1].
- Sometimes the stats accuracy are compromised if there is an ongoing
flush, and we skip and return before the subtree of interest is
actually flushed, yielding stale stats (by up to 2s due to periodic
flushing). This is more visible when reading stats from userspace,
but can also affect in-kernel flushers.
The latter problem is particulary a concern when userspace reads stats
after an event occurs, but gets stats from before the event. Examples:
- When memory usage / pressure spikes, a userspace OOM handler may look
at the stats of different memcgs to select a victim based on various
heuristics (e.g. how much private memory will be freed by killing
this). Reading stale stats from before the usage spike in this case
may cause a wrongful OOM kill.
- A proactive reclaimer may read the stats after writing to
memory.reclaim to measure the success of the reclaim operation. Stale
stats from before reclaim may give a false negative.
- Reading the stats of a parent and a child memcg may be inconsistent
(child larger than parent), if the flush doesn't happen when the
parent is read, but happens when the child is read.
As for in-kernel flushers, they will occasionally get stale stats. No
regressions are currently known from this, but if there are regressions,
they would be very difficult to debug and link to the source of the
problem.
This patch aims to fix these problems by restoring subtree flushing, and
removing the unified/coalesced flushing logic that skips flushing if there
is an ongoing flush. This change would introduce a significant regression
with global stats flushing thresholds. With per-memcg stats flushing
thresholds, this seems to perform really well. The thresholds protect the
underlying lock from unnecessary contention.
This patch was tested in two ways to ensure the latency of flushing is
up to par, on a machine with 384 cpus:
- A synthetic test with 5000 concurrent workers in 500 cgroups doing
allocations and reclaim, as well as 1000 readers for memory.stat
(variation of [2]). No regressions were noticed in the total runtime.
Note that significant regressions in this test are observed with
global stats thresholds, but not with per-memcg thresholds.
- A synthetic stress test for concurrently reading memcg stats while
memory allocation/freeing workers are running in the background,
provided by Wei Xu [3]. With 250k threads reading the stats every
100ms in 50k cgroups, 99.9% of reads take <= 50us. Less than 0.01%
of reads take more than 1ms, and no reads take more than 100ms.
[1] https://lore.kernel.org/lkml/CABWYdi0c6__rh-K7dcM_pkf9BJdTRtAU08M43KO9ME4-dsgfoQ@mail.gmail.com/
[2] https://lore.kernel.org/lkml/CAJD7tka13M-zVZTyQJYL1iUAYvuQ1fcHbCjcOBZcz6POYTV-4g@mail.gmail.com/
[3] https://lore.kernel.org/lkml/CAAPL-u9D2b=iF5Lf_cRnKxUfkiEe0AMDTu6yhrUAzX0b6a6rDg@mail.gmail.com/
[akpm@linux-foundation.org: fix mm/zswap.c]
[yosryahmed@google.com: remove stats flushing mutex]
Link: https://lkml.kernel.org/r/CAJD7tkZgP3m-VVPn+fF_YuvXeQYK=tZZjJHj=dzD=CcSSpp2qg@mail.gmail.com
Link: https://lkml.kernel.org/r/20231129032154.3710765-6-yosryahmed@google.com
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
Tested-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Acked-by: Shakeel Butt <shakeelb@google.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Greg Thelen <gthelen@google.com>
Cc: Ivan Babrou <ivan@cloudflare.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Michal Koutny <mkoutny@suse.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Tejun Heo <tj@kernel.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Wei Xu <weixugc@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-11-29 03:21:53 +00:00
|
|
|
void mem_cgroup_flush_stats(struct mem_cgroup *memcg);
|
|
|
|
void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg);
|
2021-09-02 21:55:04 +00:00
|
|
|
|
2020-12-15 03:07:04 +00:00
|
|
|
void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val);
|
2020-08-07 06:21:37 +00:00
|
|
|
|
2020-12-15 03:07:04 +00:00
|
|
|
static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx,
|
2020-08-07 06:21:37 +00:00
|
|
|
int val)
|
|
|
|
{
|
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
local_irq_save(flags);
|
2020-12-15 03:07:04 +00:00
|
|
|
__mod_lruvec_kmem_state(p, idx, val);
|
2020-08-07 06:21:37 +00:00
|
|
|
local_irq_restore(flags);
|
|
|
|
}
|
|
|
|
|
2019-05-14 22:47:09 +00:00
|
|
|
void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
|
|
|
|
unsigned long count);
|
2018-02-01 00:16:37 +00:00
|
|
|
|
2017-07-06 22:40:25 +00:00
|
|
|
static inline void count_memcg_events(struct mem_cgroup *memcg,
|
2018-04-10 23:29:45 +00:00
|
|
|
enum vm_event_item idx,
|
|
|
|
unsigned long count)
|
2017-07-06 22:40:25 +00:00
|
|
|
{
|
2018-02-21 22:45:24 +00:00
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
local_irq_save(flags);
|
2018-02-01 00:16:45 +00:00
|
|
|
__count_memcg_events(memcg, idx, count);
|
2018-02-21 22:45:24 +00:00
|
|
|
local_irq_restore(flags);
|
2017-07-06 22:40:25 +00:00
|
|
|
}
|
|
|
|
|
2022-05-13 03:23:03 +00:00
|
|
|
static inline void count_memcg_folio_events(struct folio *folio,
|
|
|
|
enum vm_event_item idx, unsigned long nr)
|
|
|
|
{
|
|
|
|
struct mem_cgroup *memcg = folio_memcg(folio);
|
|
|
|
|
|
|
|
if (memcg)
|
|
|
|
count_memcg_events(memcg, idx, nr);
|
|
|
|
}
|
|
|
|
|
mm,memcg: provide per-cgroup counters for NUMA balancing operations
The ability to observe the demotion and promotion decisions made by the
kernel on a per-cgroup basis is important for monitoring and tuning
containerized workloads on machines equipped with tiered memory.
Different containers in the system may experience drastically different
memory tiering actions that cannot be distinguished from the global
counters alone.
For example, a container running a workload that has a much hotter memory
accesses will likely see more promotions and fewer demotions, potentially
depriving a colocated container of top tier memory to such an extent that
its performance degrades unacceptably.
For another example, some containers may exhibit longer periods between
data reuse, causing much more numa_hint_faults than numa_pages_migrated.
In this case, tuning hot_threshold_ms may be appropriate, but the signal
can easily be lost if only global counters are available.
In the long term, we hope to introduce per-cgroup control of promotion and
demotion actions to implement memory placement policies in tiering.
This patch set adds seven counters to memory.stat in a cgroup:
numa_pages_migrated, numa_pte_updates, numa_hint_faults, pgdemote_kswapd,
pgdemote_khugepaged, pgdemote_direct and pgpromote_success. pgdemote_*
and pgpromote_success are also available in memory.numa_stat.
count_memcg_events_mm() is added to count multiple event occurrences at
once, and get_mem_cgroup_from_folio() is added because we need to get a
reference to the memcg of a folio before it's migrated to track
numa_pages_migrated. The accounting of PGDEMOTE_* is moved to
shrink_inactive_list() before being changed to per-cgroup.
[kaiyang2@cs.cmu.edu: add documentation of the memcg counters in cgroup-v2.rst]
Link: https://lkml.kernel.org/r/20240814235122.252309-1-kaiyang2@cs.cmu.edu
Link: https://lkml.kernel.org/r/20240814174227.30639-1-kaiyang2@cs.cmu.edu
Signed-off-by: Kaiyang Zhao <kaiyang2@cs.cmu.edu>
Cc: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Wei Xu <weixugc@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-14 17:42:27 +00:00
|
|
|
static inline void count_memcg_events_mm(struct mm_struct *mm,
|
|
|
|
enum vm_event_item idx, unsigned long count)
|
2012-12-12 21:51:57 +00:00
|
|
|
{
|
2015-09-08 22:01:02 +00:00
|
|
|
struct mem_cgroup *memcg;
|
|
|
|
|
2012-12-12 21:51:57 +00:00
|
|
|
if (mem_cgroup_disabled())
|
|
|
|
return;
|
2015-09-08 22:01:02 +00:00
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
|
2018-06-14 22:28:05 +00:00
|
|
|
if (likely(memcg))
|
mm,memcg: provide per-cgroup counters for NUMA balancing operations
The ability to observe the demotion and promotion decisions made by the
kernel on a per-cgroup basis is important for monitoring and tuning
containerized workloads on machines equipped with tiered memory.
Different containers in the system may experience drastically different
memory tiering actions that cannot be distinguished from the global
counters alone.
For example, a container running a workload that has a much hotter memory
accesses will likely see more promotions and fewer demotions, potentially
depriving a colocated container of top tier memory to such an extent that
its performance degrades unacceptably.
For another example, some containers may exhibit longer periods between
data reuse, causing much more numa_hint_faults than numa_pages_migrated.
In this case, tuning hot_threshold_ms may be appropriate, but the signal
can easily be lost if only global counters are available.
In the long term, we hope to introduce per-cgroup control of promotion and
demotion actions to implement memory placement policies in tiering.
This patch set adds seven counters to memory.stat in a cgroup:
numa_pages_migrated, numa_pte_updates, numa_hint_faults, pgdemote_kswapd,
pgdemote_khugepaged, pgdemote_direct and pgpromote_success. pgdemote_*
and pgpromote_success are also available in memory.numa_stat.
count_memcg_events_mm() is added to count multiple event occurrences at
once, and get_mem_cgroup_from_folio() is added because we need to get a
reference to the memcg of a folio before it's migrated to track
numa_pages_migrated. The accounting of PGDEMOTE_* is moved to
shrink_inactive_list() before being changed to per-cgroup.
[kaiyang2@cs.cmu.edu: add documentation of the memcg counters in cgroup-v2.rst]
Link: https://lkml.kernel.org/r/20240814235122.252309-1-kaiyang2@cs.cmu.edu
Link: https://lkml.kernel.org/r/20240814174227.30639-1-kaiyang2@cs.cmu.edu
Signed-off-by: Kaiyang Zhao <kaiyang2@cs.cmu.edu>
Cc: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Wei Xu <weixugc@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-14 17:42:27 +00:00
|
|
|
count_memcg_events(memcg, idx, count);
|
2015-09-08 22:01:02 +00:00
|
|
|
rcu_read_unlock();
|
2012-12-12 21:51:57 +00:00
|
|
|
}
|
2018-02-01 00:16:37 +00:00
|
|
|
|
mm,memcg: provide per-cgroup counters for NUMA balancing operations
The ability to observe the demotion and promotion decisions made by the
kernel on a per-cgroup basis is important for monitoring and tuning
containerized workloads on machines equipped with tiered memory.
Different containers in the system may experience drastically different
memory tiering actions that cannot be distinguished from the global
counters alone.
For example, a container running a workload that has a much hotter memory
accesses will likely see more promotions and fewer demotions, potentially
depriving a colocated container of top tier memory to such an extent that
its performance degrades unacceptably.
For another example, some containers may exhibit longer periods between
data reuse, causing much more numa_hint_faults than numa_pages_migrated.
In this case, tuning hot_threshold_ms may be appropriate, but the signal
can easily be lost if only global counters are available.
In the long term, we hope to introduce per-cgroup control of promotion and
demotion actions to implement memory placement policies in tiering.
This patch set adds seven counters to memory.stat in a cgroup:
numa_pages_migrated, numa_pte_updates, numa_hint_faults, pgdemote_kswapd,
pgdemote_khugepaged, pgdemote_direct and pgpromote_success. pgdemote_*
and pgpromote_success are also available in memory.numa_stat.
count_memcg_events_mm() is added to count multiple event occurrences at
once, and get_mem_cgroup_from_folio() is added because we need to get a
reference to the memcg of a folio before it's migrated to track
numa_pages_migrated. The accounting of PGDEMOTE_* is moved to
shrink_inactive_list() before being changed to per-cgroup.
[kaiyang2@cs.cmu.edu: add documentation of the memcg counters in cgroup-v2.rst]
Link: https://lkml.kernel.org/r/20240814235122.252309-1-kaiyang2@cs.cmu.edu
Link: https://lkml.kernel.org/r/20240814174227.30639-1-kaiyang2@cs.cmu.edu
Signed-off-by: Kaiyang Zhao <kaiyang2@cs.cmu.edu>
Cc: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Wei Xu <weixugc@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-14 17:42:27 +00:00
|
|
|
static inline void count_memcg_event_mm(struct mm_struct *mm,
|
|
|
|
enum vm_event_item idx)
|
|
|
|
{
|
|
|
|
count_memcg_events_mm(mm, idx, 1);
|
|
|
|
}
|
|
|
|
|
2018-04-10 23:29:45 +00:00
|
|
|
static inline void memcg_memory_event(struct mem_cgroup *memcg,
|
|
|
|
enum memcg_memory_event event)
|
2018-02-01 00:16:37 +00:00
|
|
|
{
|
2020-11-14 06:52:13 +00:00
|
|
|
bool swap_event = event == MEMCG_SWAP_HIGH || event == MEMCG_SWAP_MAX ||
|
|
|
|
event == MEMCG_SWAP_FAIL;
|
|
|
|
|
2019-07-12 03:55:55 +00:00
|
|
|
atomic_long_inc(&memcg->memory_events_local[event]);
|
2020-11-14 06:52:13 +00:00
|
|
|
if (!swap_event)
|
|
|
|
cgroup_file_notify(&memcg->events_local_file);
|
2019-07-12 03:55:55 +00:00
|
|
|
|
mm, memcg: consider subtrees in memory.events
memory.stat and other files already consider subtrees in their output, and
we should too in order to not present an inconsistent interface.
The current situation is fairly confusing, because people interacting with
cgroups expect hierarchical behaviour in the vein of memory.stat,
cgroup.events, and other files. For example, this causes confusion when
debugging reclaim events under low, as currently these always read "0" at
non-leaf memcg nodes, which frequently causes people to misdiagnose breach
behaviour. The same confusion applies to other counters in this file when
debugging issues.
Aggregation is done at write time instead of at read-time since these
counters aren't hot (unlike memory.stat which is per-page, so it does it
at read time), and it makes sense to bundle this with the file
notifications.
After this patch, events are propagated up the hierarchy:
[root@ktst ~]# cat /sys/fs/cgroup/system.slice/memory.events
low 0
high 0
max 0
oom 0
oom_kill 0
[root@ktst ~]# systemd-run -p MemoryMax=1 true
Running as unit: run-r251162a189fb4562b9dabfdc9b0422f5.service
[root@ktst ~]# cat /sys/fs/cgroup/system.slice/memory.events
low 0
high 0
max 7
oom 1
oom_kill 1
As this is a change in behaviour, this can be reverted to the old
behaviour by mounting with the `memory_localevents' flag set. However, we
use the new behaviour by default as there's a lack of evidence that there
are any current users of memory.events that would find this change
undesirable.
akpm: this is a behaviour change, so Cc:stable. THis is so that
forthcoming distros which use cgroup v2 are more likely to pick up the
revised behaviour.
Link: http://lkml.kernel.org/r/20190208224419.GA24772@chrisdown.name
Signed-off-by: Chris Down <chris@chrisdown.name>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Roman Gushchin <guro@fb.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-06-01 05:30:22 +00:00
|
|
|
do {
|
|
|
|
atomic_long_inc(&memcg->memory_events[event]);
|
2020-11-14 06:52:13 +00:00
|
|
|
if (swap_event)
|
|
|
|
cgroup_file_notify(&memcg->swap_events_file);
|
|
|
|
else
|
|
|
|
cgroup_file_notify(&memcg->events_file);
|
mm, memcg: consider subtrees in memory.events
memory.stat and other files already consider subtrees in their output, and
we should too in order to not present an inconsistent interface.
The current situation is fairly confusing, because people interacting with
cgroups expect hierarchical behaviour in the vein of memory.stat,
cgroup.events, and other files. For example, this causes confusion when
debugging reclaim events under low, as currently these always read "0" at
non-leaf memcg nodes, which frequently causes people to misdiagnose breach
behaviour. The same confusion applies to other counters in this file when
debugging issues.
Aggregation is done at write time instead of at read-time since these
counters aren't hot (unlike memory.stat which is per-page, so it does it
at read time), and it makes sense to bundle this with the file
notifications.
After this patch, events are propagated up the hierarchy:
[root@ktst ~]# cat /sys/fs/cgroup/system.slice/memory.events
low 0
high 0
max 0
oom 0
oom_kill 0
[root@ktst ~]# systemd-run -p MemoryMax=1 true
Running as unit: run-r251162a189fb4562b9dabfdc9b0422f5.service
[root@ktst ~]# cat /sys/fs/cgroup/system.slice/memory.events
low 0
high 0
max 7
oom 1
oom_kill 1
As this is a change in behaviour, this can be reverted to the old
behaviour by mounting with the `memory_localevents' flag set. However, we
use the new behaviour by default as there's a lack of evidence that there
are any current users of memory.events that would find this change
undesirable.
akpm: this is a behaviour change, so Cc:stable. THis is so that
forthcoming distros which use cgroup v2 are more likely to pick up the
revised behaviour.
Link: http://lkml.kernel.org/r/20190208224419.GA24772@chrisdown.name
Signed-off-by: Chris Down <chris@chrisdown.name>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Roman Gushchin <guro@fb.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-06-01 05:30:22 +00:00
|
|
|
|
mm, memcg: fix inconsistent oom event behavior
A recent commit 9852ae3fe529 ("mm, memcg: consider subtrees in
memory.events") changed the behavior of memcg events, which will now
consider subtrees in memory.events.
But oom_kill event is a special one as it is used in both cgroup1 and
cgroup2. In cgroup1, it is displayed in memory.oom_control. The file
memory.oom_control is in both root memcg and non root memcg, that is
different with memory.event as it only in non-root memcg. That commit
is okay for cgroup2, but it is not okay for cgroup1 as it will cause
inconsistent behavior between root memcg and non-root memcg.
Here's an example on why this behavior is inconsistent in cgroup1.
root memcg
/
memcg foo
/
memcg bar
Suppose there's an oom_kill in memcg bar, then the oon_kill will be
root memcg : memory.oom_control(oom_kill) 0
/
memcg foo : memory.oom_control(oom_kill) 1
/
memcg bar : memory.oom_control(oom_kill) 1
For the non-root memcg, its memory.oom_control(oom_kill) includes its
descendants' oom_kill, but for root memcg, it doesn't include its
descendants' oom_kill. That means, memory.oom_control(oom_kill) has
different meanings in different memcgs. That is inconsistent. Then the
user has to know whether the memcg is root or not.
If we can't fully support it in cgroup1, for example by adding
memory.events.local into cgroup1 as well, then let's don't touch its
original behavior.
Fixes: 9852ae3fe529 ("mm, memcg: consider subtrees in memory.events")
Reported-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Chris Down <chris@chrisdown.name>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: <stable@vger.kernel.org>
Link: http://lkml.kernel.org/r/20200502141055.7378-1-laoar.shao@gmail.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-05-14 00:50:34 +00:00
|
|
|
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
|
|
|
|
break;
|
mm, memcg: consider subtrees in memory.events
memory.stat and other files already consider subtrees in their output, and
we should too in order to not present an inconsistent interface.
The current situation is fairly confusing, because people interacting with
cgroups expect hierarchical behaviour in the vein of memory.stat,
cgroup.events, and other files. For example, this causes confusion when
debugging reclaim events under low, as currently these always read "0" at
non-leaf memcg nodes, which frequently causes people to misdiagnose breach
behaviour. The same confusion applies to other counters in this file when
debugging issues.
Aggregation is done at write time instead of at read-time since these
counters aren't hot (unlike memory.stat which is per-page, so it does it
at read time), and it makes sense to bundle this with the file
notifications.
After this patch, events are propagated up the hierarchy:
[root@ktst ~]# cat /sys/fs/cgroup/system.slice/memory.events
low 0
high 0
max 0
oom 0
oom_kill 0
[root@ktst ~]# systemd-run -p MemoryMax=1 true
Running as unit: run-r251162a189fb4562b9dabfdc9b0422f5.service
[root@ktst ~]# cat /sys/fs/cgroup/system.slice/memory.events
low 0
high 0
max 7
oom 1
oom_kill 1
As this is a change in behaviour, this can be reverted to the old
behaviour by mounting with the `memory_localevents' flag set. However, we
use the new behaviour by default as there's a lack of evidence that there
are any current users of memory.events that would find this change
undesirable.
akpm: this is a behaviour change, so Cc:stable. THis is so that
forthcoming distros which use cgroup v2 are more likely to pick up the
revised behaviour.
Link: http://lkml.kernel.org/r/20190208224419.GA24772@chrisdown.name
Signed-off-by: Chris Down <chris@chrisdown.name>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Roman Gushchin <guro@fb.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-06-01 05:30:22 +00:00
|
|
|
if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
|
|
|
|
break;
|
|
|
|
} while ((memcg = parent_mem_cgroup(memcg)) &&
|
|
|
|
!mem_cgroup_is_root(memcg));
|
2018-02-01 00:16:37 +00:00
|
|
|
}
|
|
|
|
|
2018-06-14 22:28:05 +00:00
|
|
|
static inline void memcg_memory_event_mm(struct mm_struct *mm,
|
|
|
|
enum memcg_memory_event event)
|
|
|
|
{
|
|
|
|
struct mem_cgroup *memcg;
|
|
|
|
|
|
|
|
if (mem_cgroup_disabled())
|
|
|
|
return;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
|
|
|
|
if (likely(memcg))
|
|
|
|
memcg_memory_event(memcg, event);
|
|
|
|
rcu_read_unlock();
|
|
|
|
}
|
|
|
|
|
2024-02-26 20:55:31 +00:00
|
|
|
void split_page_memcg(struct page *head, int old_order, int new_order);
|
2011-01-20 22:44:24 +00:00
|
|
|
|
2012-07-31 23:43:02 +00:00
|
|
|
#else /* CONFIG_MEMCG */
|
2016-03-15 21:57:16 +00:00
|
|
|
|
|
|
|
#define MEM_CGROUP_ID_SHIFT 0
|
|
|
|
|
2021-06-28 18:59:26 +00:00
|
|
|
static inline struct mem_cgroup *folio_memcg(struct folio *folio)
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2024-10-26 16:37:07 +00:00
|
|
|
static inline bool folio_memcg_charged(struct folio *folio)
|
mm: memcontrol: Use helpers to read page's memcg data
Patch series "mm: allow mapping accounted kernel pages to userspace", v6.
Currently a non-slab kernel page which has been charged to a memory cgroup
can't be mapped to userspace. The underlying reason is simple: PageKmemcg
flag is defined as a page type (like buddy, offline, etc), so it takes a
bit from a page->mapped counter. Pages with a type set can't be mapped to
userspace.
But in general the kmemcg flag has nothing to do with mapping to
userspace. It only means that the page has been accounted by the page
allocator, so it has to be properly uncharged on release.
Some bpf maps are mapping the vmalloc-based memory to userspace, and their
memory can't be accounted because of this implementation detail.
This patchset removes this limitation by moving the PageKmemcg flag into
one of the free bits of the page->mem_cgroup pointer. Also it formalizes
accesses to the page->mem_cgroup and page->obj_cgroups using new helpers,
adds several checks and removes a couple of obsolete functions. As the
result the code became more robust with fewer open-coded bit tricks.
This patch (of 4):
Currently there are many open-coded reads of the page->mem_cgroup pointer,
as well as a couple of read helpers, which are barely used.
It creates an obstacle on a way to reuse some bits of the pointer for
storing additional bits of information. In fact, we already do this for
slab pages, where the last bit indicates that a pointer has an attached
vector of objcg pointers instead of a regular memcg pointer.
This commits uses 2 existing helpers and introduces a new helper to
converts all read sides to calls of these helpers:
struct mem_cgroup *page_memcg(struct page *page);
struct mem_cgroup *page_memcg_rcu(struct page *page);
struct mem_cgroup *page_memcg_check(struct page *page);
page_memcg_check() is intended to be used in cases when the page can be a
slab page and have a memcg pointer pointing at objcg vector. It does
check the lowest bit, and if set, returns NULL. page_memcg() contains a
VM_BUG_ON_PAGE() check for the page not being a slab page.
To make sure nobody uses a direct access, struct page's
mem_cgroup/obj_cgroups is converted to unsigned long memcg_data.
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Link: https://lkml.kernel.org/r/20201027001657.3398190-1-guro@fb.com
Link: https://lkml.kernel.org/r/20201027001657.3398190-2-guro@fb.com
Link: https://lore.kernel.org/bpf/20201201215900.3569844-2-guro@fb.com
2020-12-01 21:58:27 +00:00
|
|
|
{
|
2024-10-26 16:37:07 +00:00
|
|
|
return false;
|
mm: memcontrol: Use helpers to read page's memcg data
Patch series "mm: allow mapping accounted kernel pages to userspace", v6.
Currently a non-slab kernel page which has been charged to a memory cgroup
can't be mapped to userspace. The underlying reason is simple: PageKmemcg
flag is defined as a page type (like buddy, offline, etc), so it takes a
bit from a page->mapped counter. Pages with a type set can't be mapped to
userspace.
But in general the kmemcg flag has nothing to do with mapping to
userspace. It only means that the page has been accounted by the page
allocator, so it has to be properly uncharged on release.
Some bpf maps are mapping the vmalloc-based memory to userspace, and their
memory can't be accounted because of this implementation detail.
This patchset removes this limitation by moving the PageKmemcg flag into
one of the free bits of the page->mem_cgroup pointer. Also it formalizes
accesses to the page->mem_cgroup and page->obj_cgroups using new helpers,
adds several checks and removes a couple of obsolete functions. As the
result the code became more robust with fewer open-coded bit tricks.
This patch (of 4):
Currently there are many open-coded reads of the page->mem_cgroup pointer,
as well as a couple of read helpers, which are barely used.
It creates an obstacle on a way to reuse some bits of the pointer for
storing additional bits of information. In fact, we already do this for
slab pages, where the last bit indicates that a pointer has an attached
vector of objcg pointers instead of a regular memcg pointer.
This commits uses 2 existing helpers and introduces a new helper to
converts all read sides to calls of these helpers:
struct mem_cgroup *page_memcg(struct page *page);
struct mem_cgroup *page_memcg_rcu(struct page *page);
struct mem_cgroup *page_memcg_check(struct page *page);
page_memcg_check() is intended to be used in cases when the page can be a
slab page and have a memcg pointer pointing at objcg vector. It does
check the lowest bit, and if set, returns NULL. page_memcg() contains a
VM_BUG_ON_PAGE() check for the page not being a slab page.
To make sure nobody uses a direct access, struct page's
mem_cgroup/obj_cgroups is converted to unsigned long memcg_data.
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Link: https://lkml.kernel.org/r/20201027001657.3398190-1-guro@fb.com
Link: https://lkml.kernel.org/r/20201027001657.3398190-2-guro@fb.com
Link: https://lore.kernel.org/bpf/20201201215900.3569844-2-guro@fb.com
2020-12-01 21:58:27 +00:00
|
|
|
}
|
|
|
|
|
2022-12-30 07:08:42 +00:00
|
|
|
static inline struct mem_cgroup *folio_memcg_check(struct folio *folio)
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
mm: memcontrol: Use helpers to read page's memcg data
Patch series "mm: allow mapping accounted kernel pages to userspace", v6.
Currently a non-slab kernel page which has been charged to a memory cgroup
can't be mapped to userspace. The underlying reason is simple: PageKmemcg
flag is defined as a page type (like buddy, offline, etc), so it takes a
bit from a page->mapped counter. Pages with a type set can't be mapped to
userspace.
But in general the kmemcg flag has nothing to do with mapping to
userspace. It only means that the page has been accounted by the page
allocator, so it has to be properly uncharged on release.
Some bpf maps are mapping the vmalloc-based memory to userspace, and their
memory can't be accounted because of this implementation detail.
This patchset removes this limitation by moving the PageKmemcg flag into
one of the free bits of the page->mem_cgroup pointer. Also it formalizes
accesses to the page->mem_cgroup and page->obj_cgroups using new helpers,
adds several checks and removes a couple of obsolete functions. As the
result the code became more robust with fewer open-coded bit tricks.
This patch (of 4):
Currently there are many open-coded reads of the page->mem_cgroup pointer,
as well as a couple of read helpers, which are barely used.
It creates an obstacle on a way to reuse some bits of the pointer for
storing additional bits of information. In fact, we already do this for
slab pages, where the last bit indicates that a pointer has an attached
vector of objcg pointers instead of a regular memcg pointer.
This commits uses 2 existing helpers and introduces a new helper to
converts all read sides to calls of these helpers:
struct mem_cgroup *page_memcg(struct page *page);
struct mem_cgroup *page_memcg_rcu(struct page *page);
struct mem_cgroup *page_memcg_check(struct page *page);
page_memcg_check() is intended to be used in cases when the page can be a
slab page and have a memcg pointer pointing at objcg vector. It does
check the lowest bit, and if set, returns NULL. page_memcg() contains a
VM_BUG_ON_PAGE() check for the page not being a slab page.
To make sure nobody uses a direct access, struct page's
mem_cgroup/obj_cgroups is converted to unsigned long memcg_data.
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Link: https://lkml.kernel.org/r/20201027001657.3398190-1-guro@fb.com
Link: https://lkml.kernel.org/r/20201027001657.3398190-2-guro@fb.com
Link: https://lore.kernel.org/bpf/20201201215900.3569844-2-guro@fb.com
2020-12-01 21:58:27 +00:00
|
|
|
static inline struct mem_cgroup *page_memcg_check(struct page *page)
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2023-11-30 19:40:20 +00:00
|
|
|
static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2021-06-28 18:59:26 +00:00
|
|
|
static inline bool folio_memcg_kmem(struct folio *folio)
|
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2020-12-01 21:58:30 +00:00
|
|
|
static inline bool PageMemcgKmem(struct page *page)
|
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2018-08-17 22:48:06 +00:00
|
|
|
static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
|
|
|
|
{
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2016-03-15 21:57:16 +00:00
|
|
|
static inline bool mem_cgroup_disabled(void)
|
|
|
|
{
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2018-04-10 23:29:45 +00:00
|
|
|
static inline void memcg_memory_event(struct mem_cgroup *memcg,
|
|
|
|
enum memcg_memory_event event)
|
mm: memcontrol: default hierarchy interface for memory
Introduce the basic control files to account, partition, and limit
memory using cgroups in default hierarchy mode.
This interface versioning allows us to address fundamental design
issues in the existing memory cgroup interface, further explained
below. The old interface will be maintained indefinitely, but a
clearer model and improved workload performance should encourage
existing users to switch over to the new one eventually.
The control files are thus:
- memory.current shows the current consumption of the cgroup and its
descendants, in bytes.
- memory.low configures the lower end of the cgroup's expected
memory consumption range. The kernel considers memory below that
boundary to be a reserve - the minimum that the workload needs in
order to make forward progress - and generally avoids reclaiming
it, unless there is an imminent risk of entering an OOM situation.
- memory.high configures the upper end of the cgroup's expected
memory consumption range. A cgroup whose consumption grows beyond
this threshold is forced into direct reclaim, to work off the
excess and to throttle new allocations heavily, but is generally
allowed to continue and the OOM killer is not invoked.
- memory.max configures the hard maximum amount of memory that the
cgroup is allowed to consume before the OOM killer is invoked.
- memory.events shows event counters that indicate how often the
cgroup was reclaimed while below memory.low, how often it was
forced to reclaim excess beyond memory.high, how often it hit
memory.max, and how often it entered OOM due to memory.max. This
allows users to identify configuration problems when observing a
degradation in workload performance. An overcommitted system will
have an increased rate of low boundary breaches, whereas increased
rates of high limit breaches, maximum hits, or even OOM situations
will indicate internally overcommitted cgroups.
For existing users of memory cgroups, the following deviations from
the current interface are worth pointing out and explaining:
- The original lower boundary, the soft limit, is defined as a limit
that is per default unset. As a result, the set of cgroups that
global reclaim prefers is opt-in, rather than opt-out. The costs
for optimizing these mostly negative lookups are so high that the
implementation, despite its enormous size, does not even provide
the basic desirable behavior. First off, the soft limit has no
hierarchical meaning. All configured groups are organized in a
global rbtree and treated like equal peers, regardless where they
are located in the hierarchy. This makes subtree delegation
impossible. Second, the soft limit reclaim pass is so aggressive
that it not just introduces high allocation latencies into the
system, but also impacts system performance due to overreclaim, to
the point where the feature becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
reserve. A cgroup enjoys reclaim protection when it and all its
ancestors are below their low boundaries, which makes delegation
of subtrees possible. Secondly, new cgroups have no reserve per
default and in the common case most cgroups are eligible for the
preferred reclaim pass. This allows the new low boundary to be
efficiently implemented with just a minor addition to the generic
reclaim code, without the need for out-of-band data structures and
reclaim passes. Because the generic reclaim code considers all
cgroups except for the ones running low in the preferred first
reclaim pass, overreclaim of individual groups is eliminated as
well, resulting in much better overall workload performance.
- The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
But this generally goes against the goal of making the most out of
the available memory. The memory consumption of workloads varies
during runtime, and that requires users to overcommit. But doing
that with a strict upper limit requires either a fairly accurate
prediction of the working set size or adding slack to the limit.
Since working set size estimation is hard and error prone, and
getting it wrong results in OOM kills, most users tend to err on
the side of a looser limit and end up wasting precious resources.
The memory.high boundary on the other hand can be set much more
conservatively. When hit, it throttles allocations by forcing
them into direct reclaim to work off the excess, but it never
invokes the OOM killer. As a result, a high boundary that is
chosen too aggressively will not terminate the processes, but
instead it will lead to gradual performance degradation. The user
can monitor this and make corrections until the minimal memory
footprint that still gives acceptable performance is found.
In extreme cases, with many concurrent allocations and a complete
breakdown of reclaim progress within the group, the high boundary
can be exceeded. But even then it's mostly better to satisfy the
allocation from the slack available in other groups or the rest of
the system than killing the group. Otherwise, memory.max is there
to limit this type of spillover and ultimately contain buggy or
even malicious applications.
- The original control file names are unwieldy and inconsistent in
many different ways. For example, the upper boundary hit count is
exported in the memory.failcnt file, but an OOM event count has to
be manually counted by listening to memory.oom_control events, and
lower boundary / soft limit events have to be counted by first
setting a threshold for that value and then counting those events.
Also, usage and limit files encode their units in the filename.
That makes the filenames very long, even though this is not
information that a user needs to be reminded of every time they
type out those names.
To address these naming issues, as well as to signal clearly that
the new interface carries a new configuration model, the naming
conventions in it necessarily differ from the old interface.
- The original limit files indicate the state of an unset limit with
a very high number, and a configured limit can be unset by echoing
-1 into those files. But that very high number is implementation
and architecture dependent and not very descriptive. And while -1
can be understood as an underflow into the highest possible value,
-2 or -10M etc. do not work, so it's not inconsistent.
memory.low, memory.high, and memory.max will use the string
"infinity" to indicate and set the highest possible value.
[akpm@linux-foundation.org: use seq_puts() for basic strings]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-11 23:26:06 +00:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2018-06-14 22:28:05 +00:00
|
|
|
static inline void memcg_memory_event_mm(struct mm_struct *mm,
|
|
|
|
enum memcg_memory_event event)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2021-08-20 02:04:21 +00:00
|
|
|
static inline void mem_cgroup_protection(struct mem_cgroup *root,
|
|
|
|
struct mem_cgroup *memcg,
|
|
|
|
unsigned long *min,
|
|
|
|
unsigned long *low)
|
mm, memcg: proportional memory.{low,min} reclaim
cgroup v2 introduces two memory protection thresholds: memory.low
(best-effort) and memory.min (hard protection). While they generally do
what they say on the tin, there is a limitation in their implementation
that makes them difficult to use effectively: that cliff behaviour often
manifests when they become eligible for reclaim. This patch implements
more intuitive and usable behaviour, where we gradually mount more
reclaim pressure as cgroups further and further exceed their protection
thresholds.
This cliff edge behaviour happens because we only choose whether or not
to reclaim based on whether the memcg is within its protection limits
(see the use of mem_cgroup_protected in shrink_node), but we don't vary
our reclaim behaviour based on this information. Imagine the following
timeline, with the numbers the lruvec size in this zone:
1. memory.low=1000000, memory.current=999999. 0 pages may be scanned.
2. memory.low=1000000, memory.current=1000000. 0 pages may be scanned.
3. memory.low=1000000, memory.current=1000001. 1000001* pages may be
scanned. (?!)
* Of course, we won't usually scan all available pages in the zone even
without this patch because of scan control priority, over-reclaim
protection, etc. However, as shown by the tests at the end, these
techniques don't sufficiently throttle such an extreme change in input,
so cliff-like behaviour isn't really averted by their existence alone.
Here's an example of how this plays out in practice. At Facebook, we are
trying to protect various workloads from "system" software, like
configuration management tools, metric collectors, etc (see this[0] case
study). In order to find a suitable memory.low value, we start by
determining the expected memory range within which the workload will be
comfortable operating. This isn't an exact science -- memory usage deemed
"comfortable" will vary over time due to user behaviour, differences in
composition of work, etc, etc. As such we need to ballpark memory.low,
but doing this is currently problematic:
1. If we end up setting it too low for the workload, it won't have
*any* effect (see discussion above). The group will receive the full
weight of reclaim and won't have any priority while competing with the
less important system software, as if we had no memory.low configured
at all.
2. Because of this behaviour, we end up erring on the side of setting
it too high, such that the comfort range is reliably covered. However,
protected memory is completely unavailable to the rest of the system,
so we might cause undue memory and IO pressure there when we *know* we
have some elasticity in the workload.
3. Even if we get the value totally right, smack in the middle of the
comfort zone, we get extreme jumps between no pressure and full
pressure that cause unpredictable pressure spikes in the workload due
to the current binary reclaim behaviour.
With this patch, we can set it to our ballpark estimation without too much
worry. Any undesirable behaviour, such as too much or too little reclaim
pressure on the workload or system will be proportional to how far our
estimation is off. This means we can set memory.low much more
conservatively and thus waste less resources *without* the risk of the
workload falling off a cliff if we overshoot.
As a more abstract technical description, this unintuitive behaviour
results in having to give high-priority workloads a large protection
buffer on top of their expected usage to function reliably, as otherwise
we have abrupt periods of dramatically increased memory pressure which
hamper performance. Having to set these thresholds so high wastes
resources and generally works against the principle of work conservation.
In addition, having proportional memory reclaim behaviour has other
benefits. Most notably, before this patch it's basically mandatory to set
memory.low to a higher than desirable value because otherwise as soon as
you exceed memory.low, all protection is lost, and all pages are eligible
to scan again. By contrast, having a gradual ramp in reclaim pressure
means that you now still get some protection when thresholds are exceeded,
which means that one can now be more comfortable setting memory.low to
lower values without worrying that all protection will be lost. This is
important because workingset size is really hard to know exactly,
especially with variable workloads, so at least getting *some* protection
if your workingset size grows larger than you expect increases user
confidence in setting memory.low without a huge buffer on top being
needed.
Thanks a lot to Johannes Weiner and Tejun Heo for their advice and
assistance in thinking about how to make this work better.
In testing these changes, I intended to verify that:
1. Changes in page scanning become gradual and proportional instead of
binary.
To test this, I experimented stepping further and further down
memory.low protection on a workload that floats around 19G workingset
when under memory.low protection, watching page scan rates for the
workload cgroup:
+------------+-----------------+--------------------+--------------+
| memory.low | test (pgscan/s) | control (pgscan/s) | % of control |
+------------+-----------------+--------------------+--------------+
| 21G | 0 | 0 | N/A |
| 17G | 867 | 3799 | 23% |
| 12G | 1203 | 3543 | 34% |
| 8G | 2534 | 3979 | 64% |
| 4G | 3980 | 4147 | 96% |
| 0 | 3799 | 3980 | 95% |
+------------+-----------------+--------------------+--------------+
As you can see, the test kernel (with a kernel containing this
patch) ramps up page scanning significantly more gradually than the
control kernel (without this patch).
2. More gradual ramp up in reclaim aggression doesn't result in
premature OOMs.
To test this, I wrote a script that slowly increments the number of
pages held by stress(1)'s --vm-keep mode until a production system
entered severe overall memory contention. This script runs in a highly
protected slice taking up the majority of available system memory.
Watching vmstat revealed that page scanning continued essentially
nominally between test and control, without causing forward reclaim
progress to become arrested.
[0]: https://facebookmicrosites.github.io/cgroup2/docs/overview.html#case-study-the-fbtax2-project
[akpm@linux-foundation.org: reflow block comments to fit in 80 cols]
[chris@chrisdown.name: handle cgroup_disable=memory when getting memcg protection]
Link: http://lkml.kernel.org/r/20190201045711.GA18302@chrisdown.name
Link: http://lkml.kernel.org/r/20190124014455.GA6396@chrisdown.name
Signed-off-by: Chris Down <chris@chrisdown.name>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-10-07 00:58:32 +00:00
|
|
|
{
|
2021-08-20 02:04:21 +00:00
|
|
|
*min = *low = 0;
|
mm, memcg: proportional memory.{low,min} reclaim
cgroup v2 introduces two memory protection thresholds: memory.low
(best-effort) and memory.min (hard protection). While they generally do
what they say on the tin, there is a limitation in their implementation
that makes them difficult to use effectively: that cliff behaviour often
manifests when they become eligible for reclaim. This patch implements
more intuitive and usable behaviour, where we gradually mount more
reclaim pressure as cgroups further and further exceed their protection
thresholds.
This cliff edge behaviour happens because we only choose whether or not
to reclaim based on whether the memcg is within its protection limits
(see the use of mem_cgroup_protected in shrink_node), but we don't vary
our reclaim behaviour based on this information. Imagine the following
timeline, with the numbers the lruvec size in this zone:
1. memory.low=1000000, memory.current=999999. 0 pages may be scanned.
2. memory.low=1000000, memory.current=1000000. 0 pages may be scanned.
3. memory.low=1000000, memory.current=1000001. 1000001* pages may be
scanned. (?!)
* Of course, we won't usually scan all available pages in the zone even
without this patch because of scan control priority, over-reclaim
protection, etc. However, as shown by the tests at the end, these
techniques don't sufficiently throttle such an extreme change in input,
so cliff-like behaviour isn't really averted by their existence alone.
Here's an example of how this plays out in practice. At Facebook, we are
trying to protect various workloads from "system" software, like
configuration management tools, metric collectors, etc (see this[0] case
study). In order to find a suitable memory.low value, we start by
determining the expected memory range within which the workload will be
comfortable operating. This isn't an exact science -- memory usage deemed
"comfortable" will vary over time due to user behaviour, differences in
composition of work, etc, etc. As such we need to ballpark memory.low,
but doing this is currently problematic:
1. If we end up setting it too low for the workload, it won't have
*any* effect (see discussion above). The group will receive the full
weight of reclaim and won't have any priority while competing with the
less important system software, as if we had no memory.low configured
at all.
2. Because of this behaviour, we end up erring on the side of setting
it too high, such that the comfort range is reliably covered. However,
protected memory is completely unavailable to the rest of the system,
so we might cause undue memory and IO pressure there when we *know* we
have some elasticity in the workload.
3. Even if we get the value totally right, smack in the middle of the
comfort zone, we get extreme jumps between no pressure and full
pressure that cause unpredictable pressure spikes in the workload due
to the current binary reclaim behaviour.
With this patch, we can set it to our ballpark estimation without too much
worry. Any undesirable behaviour, such as too much or too little reclaim
pressure on the workload or system will be proportional to how far our
estimation is off. This means we can set memory.low much more
conservatively and thus waste less resources *without* the risk of the
workload falling off a cliff if we overshoot.
As a more abstract technical description, this unintuitive behaviour
results in having to give high-priority workloads a large protection
buffer on top of their expected usage to function reliably, as otherwise
we have abrupt periods of dramatically increased memory pressure which
hamper performance. Having to set these thresholds so high wastes
resources and generally works against the principle of work conservation.
In addition, having proportional memory reclaim behaviour has other
benefits. Most notably, before this patch it's basically mandatory to set
memory.low to a higher than desirable value because otherwise as soon as
you exceed memory.low, all protection is lost, and all pages are eligible
to scan again. By contrast, having a gradual ramp in reclaim pressure
means that you now still get some protection when thresholds are exceeded,
which means that one can now be more comfortable setting memory.low to
lower values without worrying that all protection will be lost. This is
important because workingset size is really hard to know exactly,
especially with variable workloads, so at least getting *some* protection
if your workingset size grows larger than you expect increases user
confidence in setting memory.low without a huge buffer on top being
needed.
Thanks a lot to Johannes Weiner and Tejun Heo for their advice and
assistance in thinking about how to make this work better.
In testing these changes, I intended to verify that:
1. Changes in page scanning become gradual and proportional instead of
binary.
To test this, I experimented stepping further and further down
memory.low protection on a workload that floats around 19G workingset
when under memory.low protection, watching page scan rates for the
workload cgroup:
+------------+-----------------+--------------------+--------------+
| memory.low | test (pgscan/s) | control (pgscan/s) | % of control |
+------------+-----------------+--------------------+--------------+
| 21G | 0 | 0 | N/A |
| 17G | 867 | 3799 | 23% |
| 12G | 1203 | 3543 | 34% |
| 8G | 2534 | 3979 | 64% |
| 4G | 3980 | 4147 | 96% |
| 0 | 3799 | 3980 | 95% |
+------------+-----------------+--------------------+--------------+
As you can see, the test kernel (with a kernel containing this
patch) ramps up page scanning significantly more gradually than the
control kernel (without this patch).
2. More gradual ramp up in reclaim aggression doesn't result in
premature OOMs.
To test this, I wrote a script that slowly increments the number of
pages held by stress(1)'s --vm-keep mode until a production system
entered severe overall memory contention. This script runs in a highly
protected slice taking up the majority of available system memory.
Watching vmstat revealed that page scanning continued essentially
nominally between test and control, without causing forward reclaim
progress to become arrested.
[0]: https://facebookmicrosites.github.io/cgroup2/docs/overview.html#case-study-the-fbtax2-project
[akpm@linux-foundation.org: reflow block comments to fit in 80 cols]
[chris@chrisdown.name: handle cgroup_disable=memory when getting memcg protection]
Link: http://lkml.kernel.org/r/20190201045711.GA18302@chrisdown.name
Link: http://lkml.kernel.org/r/20190124014455.GA6396@chrisdown.name
Signed-off-by: Chris Down <chris@chrisdown.name>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-10-07 00:58:32 +00:00
|
|
|
}
|
|
|
|
|
2020-08-07 06:22:05 +00:00
|
|
|
static inline void mem_cgroup_calculate_protection(struct mem_cgroup *root,
|
|
|
|
struct mem_cgroup *memcg)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
mm: memcg: fix stale protection of reclaim target memcg
Patch series "mm: memcg: fix protection of reclaim target memcg", v3.
This series fixes a bug in calculating the protection of the reclaim
target memcg where we end up using stale effective protection values from
the last reclaim operation, instead of completely ignoring the protection
of the reclaim target as intended. More detailed explanation and examples
in patch 1, which includes the fix. Patches 2 & 3 introduce a selftest
case that catches the bug.
This patch (of 3):
When we are doing memcg reclaim, the intended behavior is that we
ignore any protection (memory.min, memory.low) of the target memcg (but
not its children). Ever since the patch pointed to by the "Fixes" tag,
we actually read a stale value for the target memcg protection when
deciding whether to skip the memcg or not because it is protected. If
the stale value happens to be high enough, we don't reclaim from the
target memcg.
Essentially, in some cases we may falsely skip reclaiming from the
target memcg of reclaim because we read a stale protection value from
last time we reclaimed from it.
During reclaim, mem_cgroup_calculate_protection() is used to determine the
effective protection (emin and elow) values of a memcg. The protection of
the reclaim target is ignored, but we cannot set their effective
protection to 0 due to a limitation of the current implementation (see
comment in mem_cgroup_protection()). Instead, we leave their effective
protection values unchaged, and later ignore it in
mem_cgroup_protection().
However, mem_cgroup_protection() is called later in
shrink_lruvec()->get_scan_count(), which is after the
mem_cgroup_below_{min/low}() checks in shrink_node_memcgs(). As a result,
the stale effective protection values of the target memcg may lead us to
skip reclaiming from the target memcg entirely, before calling
shrink_lruvec(). This can be even worse with recursive protection, where
the stale target memcg protection can be higher than its standalone
protection. See two examples below (a similar version of example (a) is
added to test_memcontrol in a later patch).
(a) A simple example with proactive reclaim is as follows. Consider the
following hierarchy:
ROOT
|
A
|
B (memory.min = 10M)
Consider the following scenario:
- B has memory.current = 10M.
- The system undergoes global reclaim (or memcg reclaim in A).
- In shrink_node_memcgs():
- mem_cgroup_calculate_protection() calculates the effective min (emin)
of B as 10M.
- mem_cgroup_below_min() returns true for B, we do not reclaim from B.
- Now if we want to reclaim 5M from B using proactive reclaim
(memory.reclaim), we should be able to, as the protection of the
target memcg should be ignored.
- In shrink_node_memcgs():
- mem_cgroup_calculate_protection() immediately returns for B without
doing anything, as B is the target memcg, relying on
mem_cgroup_protection() to ignore B's stale effective min (still 10M).
- mem_cgroup_below_min() reads the stale effective min for B and we
skip it instead of ignoring its protection as intended, as we never
reach mem_cgroup_protection().
(b) An more complex example with recursive protection is as follows.
Consider the following hierarchy with memory_recursiveprot:
ROOT
|
A (memory.min = 50M)
|
B (memory.min = 10M, memory.high = 40M)
Consider the following scenario:
- B has memory.current = 35M.
- The system undergoes global reclaim (target memcg is NULL).
- B will have an effective min of 50M (all of A's unclaimed protection).
- B will not be reclaimed from.
- Now allocate 10M more memory in B, pushing it above it's high limit.
- The system undergoes memcg reclaim from B (target memcg is B).
- Like example (a), we do nothing in mem_cgroup_calculate_protection(),
then call mem_cgroup_below_min(), which will read the stale effective
min for B (50M) and skip it. In this case, it's even worse because we
are not just considering B's standalone protection (10M), but we are
reading a much higher stale protection (50M) which will cause us to not
reclaim from B at all.
This is an artifact of commit 45c7f7e1ef17 ("mm, memcg: decouple
e{low,min} state mutations from protection checks") which made
mem_cgroup_calculate_protection() only change the state without returning
any value. Before that commit, we used to return MEMCG_PROT_NONE for the
target memcg, which would cause us to skip the
mem_cgroup_below_{min/low}() checks. After that commit we do not return
anything and we end up checking the min & low effective protections for
the target memcg, which are stale.
Update mem_cgroup_supports_protection() to also check if we are reclaiming
from the target, and rename it to mem_cgroup_unprotected() (now returns
true if we should not protect the memcg, much simpler logic).
Link: https://lkml.kernel.org/r/20221202031512.1365483-1-yosryahmed@google.com
Link: https://lkml.kernel.org/r/20221202031512.1365483-2-yosryahmed@google.com
Fixes: 45c7f7e1ef17 ("mm, memcg: decouple e{low,min} state mutations from protection checks")
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
Reviewed-by: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Chris Down <chris@chrisdown.name>
Cc: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vasily Averin <vasily.averin@linux.dev>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-12-02 03:15:10 +00:00
|
|
|
static inline bool mem_cgroup_unprotected(struct mem_cgroup *target,
|
|
|
|
struct mem_cgroup *memcg)
|
|
|
|
{
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
static inline bool mem_cgroup_below_low(struct mem_cgroup *target,
|
|
|
|
struct mem_cgroup *memcg)
|
2020-08-07 06:22:05 +00:00
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
mm: memcg: fix stale protection of reclaim target memcg
Patch series "mm: memcg: fix protection of reclaim target memcg", v3.
This series fixes a bug in calculating the protection of the reclaim
target memcg where we end up using stale effective protection values from
the last reclaim operation, instead of completely ignoring the protection
of the reclaim target as intended. More detailed explanation and examples
in patch 1, which includes the fix. Patches 2 & 3 introduce a selftest
case that catches the bug.
This patch (of 3):
When we are doing memcg reclaim, the intended behavior is that we
ignore any protection (memory.min, memory.low) of the target memcg (but
not its children). Ever since the patch pointed to by the "Fixes" tag,
we actually read a stale value for the target memcg protection when
deciding whether to skip the memcg or not because it is protected. If
the stale value happens to be high enough, we don't reclaim from the
target memcg.
Essentially, in some cases we may falsely skip reclaiming from the
target memcg of reclaim because we read a stale protection value from
last time we reclaimed from it.
During reclaim, mem_cgroup_calculate_protection() is used to determine the
effective protection (emin and elow) values of a memcg. The protection of
the reclaim target is ignored, but we cannot set their effective
protection to 0 due to a limitation of the current implementation (see
comment in mem_cgroup_protection()). Instead, we leave their effective
protection values unchaged, and later ignore it in
mem_cgroup_protection().
However, mem_cgroup_protection() is called later in
shrink_lruvec()->get_scan_count(), which is after the
mem_cgroup_below_{min/low}() checks in shrink_node_memcgs(). As a result,
the stale effective protection values of the target memcg may lead us to
skip reclaiming from the target memcg entirely, before calling
shrink_lruvec(). This can be even worse with recursive protection, where
the stale target memcg protection can be higher than its standalone
protection. See two examples below (a similar version of example (a) is
added to test_memcontrol in a later patch).
(a) A simple example with proactive reclaim is as follows. Consider the
following hierarchy:
ROOT
|
A
|
B (memory.min = 10M)
Consider the following scenario:
- B has memory.current = 10M.
- The system undergoes global reclaim (or memcg reclaim in A).
- In shrink_node_memcgs():
- mem_cgroup_calculate_protection() calculates the effective min (emin)
of B as 10M.
- mem_cgroup_below_min() returns true for B, we do not reclaim from B.
- Now if we want to reclaim 5M from B using proactive reclaim
(memory.reclaim), we should be able to, as the protection of the
target memcg should be ignored.
- In shrink_node_memcgs():
- mem_cgroup_calculate_protection() immediately returns for B without
doing anything, as B is the target memcg, relying on
mem_cgroup_protection() to ignore B's stale effective min (still 10M).
- mem_cgroup_below_min() reads the stale effective min for B and we
skip it instead of ignoring its protection as intended, as we never
reach mem_cgroup_protection().
(b) An more complex example with recursive protection is as follows.
Consider the following hierarchy with memory_recursiveprot:
ROOT
|
A (memory.min = 50M)
|
B (memory.min = 10M, memory.high = 40M)
Consider the following scenario:
- B has memory.current = 35M.
- The system undergoes global reclaim (target memcg is NULL).
- B will have an effective min of 50M (all of A's unclaimed protection).
- B will not be reclaimed from.
- Now allocate 10M more memory in B, pushing it above it's high limit.
- The system undergoes memcg reclaim from B (target memcg is B).
- Like example (a), we do nothing in mem_cgroup_calculate_protection(),
then call mem_cgroup_below_min(), which will read the stale effective
min for B (50M) and skip it. In this case, it's even worse because we
are not just considering B's standalone protection (10M), but we are
reading a much higher stale protection (50M) which will cause us to not
reclaim from B at all.
This is an artifact of commit 45c7f7e1ef17 ("mm, memcg: decouple
e{low,min} state mutations from protection checks") which made
mem_cgroup_calculate_protection() only change the state without returning
any value. Before that commit, we used to return MEMCG_PROT_NONE for the
target memcg, which would cause us to skip the
mem_cgroup_below_{min/low}() checks. After that commit we do not return
anything and we end up checking the min & low effective protections for
the target memcg, which are stale.
Update mem_cgroup_supports_protection() to also check if we are reclaiming
from the target, and rename it to mem_cgroup_unprotected() (now returns
true if we should not protect the memcg, much simpler logic).
Link: https://lkml.kernel.org/r/20221202031512.1365483-1-yosryahmed@google.com
Link: https://lkml.kernel.org/r/20221202031512.1365483-2-yosryahmed@google.com
Fixes: 45c7f7e1ef17 ("mm, memcg: decouple e{low,min} state mutations from protection checks")
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
Reviewed-by: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Chris Down <chris@chrisdown.name>
Cc: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vasily Averin <vasily.averin@linux.dev>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-12-02 03:15:10 +00:00
|
|
|
static inline bool mem_cgroup_below_min(struct mem_cgroup *target,
|
|
|
|
struct mem_cgroup *memcg)
|
mm: memcontrol: default hierarchy interface for memory
Introduce the basic control files to account, partition, and limit
memory using cgroups in default hierarchy mode.
This interface versioning allows us to address fundamental design
issues in the existing memory cgroup interface, further explained
below. The old interface will be maintained indefinitely, but a
clearer model and improved workload performance should encourage
existing users to switch over to the new one eventually.
The control files are thus:
- memory.current shows the current consumption of the cgroup and its
descendants, in bytes.
- memory.low configures the lower end of the cgroup's expected
memory consumption range. The kernel considers memory below that
boundary to be a reserve - the minimum that the workload needs in
order to make forward progress - and generally avoids reclaiming
it, unless there is an imminent risk of entering an OOM situation.
- memory.high configures the upper end of the cgroup's expected
memory consumption range. A cgroup whose consumption grows beyond
this threshold is forced into direct reclaim, to work off the
excess and to throttle new allocations heavily, but is generally
allowed to continue and the OOM killer is not invoked.
- memory.max configures the hard maximum amount of memory that the
cgroup is allowed to consume before the OOM killer is invoked.
- memory.events shows event counters that indicate how often the
cgroup was reclaimed while below memory.low, how often it was
forced to reclaim excess beyond memory.high, how often it hit
memory.max, and how often it entered OOM due to memory.max. This
allows users to identify configuration problems when observing a
degradation in workload performance. An overcommitted system will
have an increased rate of low boundary breaches, whereas increased
rates of high limit breaches, maximum hits, or even OOM situations
will indicate internally overcommitted cgroups.
For existing users of memory cgroups, the following deviations from
the current interface are worth pointing out and explaining:
- The original lower boundary, the soft limit, is defined as a limit
that is per default unset. As a result, the set of cgroups that
global reclaim prefers is opt-in, rather than opt-out. The costs
for optimizing these mostly negative lookups are so high that the
implementation, despite its enormous size, does not even provide
the basic desirable behavior. First off, the soft limit has no
hierarchical meaning. All configured groups are organized in a
global rbtree and treated like equal peers, regardless where they
are located in the hierarchy. This makes subtree delegation
impossible. Second, the soft limit reclaim pass is so aggressive
that it not just introduces high allocation latencies into the
system, but also impacts system performance due to overreclaim, to
the point where the feature becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
reserve. A cgroup enjoys reclaim protection when it and all its
ancestors are below their low boundaries, which makes delegation
of subtrees possible. Secondly, new cgroups have no reserve per
default and in the common case most cgroups are eligible for the
preferred reclaim pass. This allows the new low boundary to be
efficiently implemented with just a minor addition to the generic
reclaim code, without the need for out-of-band data structures and
reclaim passes. Because the generic reclaim code considers all
cgroups except for the ones running low in the preferred first
reclaim pass, overreclaim of individual groups is eliminated as
well, resulting in much better overall workload performance.
- The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
But this generally goes against the goal of making the most out of
the available memory. The memory consumption of workloads varies
during runtime, and that requires users to overcommit. But doing
that with a strict upper limit requires either a fairly accurate
prediction of the working set size or adding slack to the limit.
Since working set size estimation is hard and error prone, and
getting it wrong results in OOM kills, most users tend to err on
the side of a looser limit and end up wasting precious resources.
The memory.high boundary on the other hand can be set much more
conservatively. When hit, it throttles allocations by forcing
them into direct reclaim to work off the excess, but it never
invokes the OOM killer. As a result, a high boundary that is
chosen too aggressively will not terminate the processes, but
instead it will lead to gradual performance degradation. The user
can monitor this and make corrections until the minimal memory
footprint that still gives acceptable performance is found.
In extreme cases, with many concurrent allocations and a complete
breakdown of reclaim progress within the group, the high boundary
can be exceeded. But even then it's mostly better to satisfy the
allocation from the slack available in other groups or the rest of
the system than killing the group. Otherwise, memory.max is there
to limit this type of spillover and ultimately contain buggy or
even malicious applications.
- The original control file names are unwieldy and inconsistent in
many different ways. For example, the upper boundary hit count is
exported in the memory.failcnt file, but an OOM event count has to
be manually counted by listening to memory.oom_control events, and
lower boundary / soft limit events have to be counted by first
setting a threshold for that value and then counting those events.
Also, usage and limit files encode their units in the filename.
That makes the filenames very long, even though this is not
information that a user needs to be reminded of every time they
type out those names.
To address these naming issues, as well as to signal clearly that
the new interface carries a new configuration model, the naming
conventions in it necessarily differ from the old interface.
- The original limit files indicate the state of an unset limit with
a very high number, and a configured limit can be unset by echoing
-1 into those files. But that very high number is implementation
and architecture dependent and not very descriptive. And while -1
can be understood as an underflow into the highest possible value,
-2 or -10M etc. do not work, so it's not inconsistent.
memory.low, memory.high, and memory.max will use the string
"infinity" to indicate and set the highest possible value.
[akpm@linux-foundation.org: use seq_puts() for basic strings]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-11 23:26:06 +00:00
|
|
|
{
|
2020-08-07 06:22:05 +00:00
|
|
|
return false;
|
mm: memcontrol: default hierarchy interface for memory
Introduce the basic control files to account, partition, and limit
memory using cgroups in default hierarchy mode.
This interface versioning allows us to address fundamental design
issues in the existing memory cgroup interface, further explained
below. The old interface will be maintained indefinitely, but a
clearer model and improved workload performance should encourage
existing users to switch over to the new one eventually.
The control files are thus:
- memory.current shows the current consumption of the cgroup and its
descendants, in bytes.
- memory.low configures the lower end of the cgroup's expected
memory consumption range. The kernel considers memory below that
boundary to be a reserve - the minimum that the workload needs in
order to make forward progress - and generally avoids reclaiming
it, unless there is an imminent risk of entering an OOM situation.
- memory.high configures the upper end of the cgroup's expected
memory consumption range. A cgroup whose consumption grows beyond
this threshold is forced into direct reclaim, to work off the
excess and to throttle new allocations heavily, but is generally
allowed to continue and the OOM killer is not invoked.
- memory.max configures the hard maximum amount of memory that the
cgroup is allowed to consume before the OOM killer is invoked.
- memory.events shows event counters that indicate how often the
cgroup was reclaimed while below memory.low, how often it was
forced to reclaim excess beyond memory.high, how often it hit
memory.max, and how often it entered OOM due to memory.max. This
allows users to identify configuration problems when observing a
degradation in workload performance. An overcommitted system will
have an increased rate of low boundary breaches, whereas increased
rates of high limit breaches, maximum hits, or even OOM situations
will indicate internally overcommitted cgroups.
For existing users of memory cgroups, the following deviations from
the current interface are worth pointing out and explaining:
- The original lower boundary, the soft limit, is defined as a limit
that is per default unset. As a result, the set of cgroups that
global reclaim prefers is opt-in, rather than opt-out. The costs
for optimizing these mostly negative lookups are so high that the
implementation, despite its enormous size, does not even provide
the basic desirable behavior. First off, the soft limit has no
hierarchical meaning. All configured groups are organized in a
global rbtree and treated like equal peers, regardless where they
are located in the hierarchy. This makes subtree delegation
impossible. Second, the soft limit reclaim pass is so aggressive
that it not just introduces high allocation latencies into the
system, but also impacts system performance due to overreclaim, to
the point where the feature becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
reserve. A cgroup enjoys reclaim protection when it and all its
ancestors are below their low boundaries, which makes delegation
of subtrees possible. Secondly, new cgroups have no reserve per
default and in the common case most cgroups are eligible for the
preferred reclaim pass. This allows the new low boundary to be
efficiently implemented with just a minor addition to the generic
reclaim code, without the need for out-of-band data structures and
reclaim passes. Because the generic reclaim code considers all
cgroups except for the ones running low in the preferred first
reclaim pass, overreclaim of individual groups is eliminated as
well, resulting in much better overall workload performance.
- The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
But this generally goes against the goal of making the most out of
the available memory. The memory consumption of workloads varies
during runtime, and that requires users to overcommit. But doing
that with a strict upper limit requires either a fairly accurate
prediction of the working set size or adding slack to the limit.
Since working set size estimation is hard and error prone, and
getting it wrong results in OOM kills, most users tend to err on
the side of a looser limit and end up wasting precious resources.
The memory.high boundary on the other hand can be set much more
conservatively. When hit, it throttles allocations by forcing
them into direct reclaim to work off the excess, but it never
invokes the OOM killer. As a result, a high boundary that is
chosen too aggressively will not terminate the processes, but
instead it will lead to gradual performance degradation. The user
can monitor this and make corrections until the minimal memory
footprint that still gives acceptable performance is found.
In extreme cases, with many concurrent allocations and a complete
breakdown of reclaim progress within the group, the high boundary
can be exceeded. But even then it's mostly better to satisfy the
allocation from the slack available in other groups or the rest of
the system than killing the group. Otherwise, memory.max is there
to limit this type of spillover and ultimately contain buggy or
even malicious applications.
- The original control file names are unwieldy and inconsistent in
many different ways. For example, the upper boundary hit count is
exported in the memory.failcnt file, but an OOM event count has to
be manually counted by listening to memory.oom_control events, and
lower boundary / soft limit events have to be counted by first
setting a threshold for that value and then counting those events.
Also, usage and limit files encode their units in the filename.
That makes the filenames very long, even though this is not
information that a user needs to be reminded of every time they
type out those names.
To address these naming issues, as well as to signal clearly that
the new interface carries a new configuration model, the naming
conventions in it necessarily differ from the old interface.
- The original limit files indicate the state of an unset limit with
a very high number, and a configured limit can be unset by echoing
-1 into those files. But that very high number is implementation
and architecture dependent and not very descriptive. And while -1
can be understood as an underflow into the highest possible value,
-2 or -10M etc. do not work, so it's not inconsistent.
memory.low, memory.high, and memory.max will use the string
"infinity" to indicate and set the highest possible value.
[akpm@linux-foundation.org: use seq_puts() for basic strings]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-11 23:26:06 +00:00
|
|
|
}
|
|
|
|
|
memcontrol: add helpers for hugetlb memcg accounting
Patch series "hugetlb memcg accounting", v4.
Currently, hugetlb memory usage is not acounted for in the memory
controller, which could lead to memory overprotection for cgroups with
hugetlb-backed memory. This has been observed in our production system.
For instance, here is one of our usecases: suppose there are two 32G
containers. The machine is booted with hugetlb_cma=6G, and each container
may or may not use up to 3 gigantic page, depending on the workload within
it. The rest is anon, cache, slab, etc. We can set the hugetlb cgroup
limit of each cgroup to 3G to enforce hugetlb fairness. But it is very
difficult to configure memory.max to keep overall consumption, including
anon, cache, slab etcetera fair.
What we have had to resort to is to constantly poll hugetlb usage and
readjust memory.max. Similar procedure is done to other memory limits
(memory.low for e.g). However, this is rather cumbersome and buggy.
Furthermore, when there is a delay in memory limits correction, (for e.g
when hugetlb usage changes within consecutive runs of the userspace
agent), the system could be in an over/underprotected state.
This patch series rectifies this issue by charging the memcg when the
hugetlb folio is allocated, and uncharging when the folio is freed. In
addition, a new selftest is added to demonstrate and verify this new
behavior.
This patch (of 4):
This patch exposes charge committing and cancelling as parts of the memory
controller interface. These functionalities are useful when the
try_charge() and commit_charge() stages have to be separated by other
actions in between (which can fail). One such example is the new hugetlb
accounting behavior in the following patch.
The patch also adds a helper function to obtain a reference to the
current task's memcg.
Link: https://lkml.kernel.org/r/20231006184629.155543-1-nphamcs@gmail.com
Link: https://lkml.kernel.org/r/20231006184629.155543-2-nphamcs@gmail.com
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Frank van der Linden <fvdl@google.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Rik van Riel <riel@surriel.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Tejun heo <tj@kernel.org>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Zefan Li <lizefan.x@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-06 18:46:26 +00:00
|
|
|
static inline void mem_cgroup_commit_charge(struct folio *folio,
|
|
|
|
struct mem_cgroup *memcg)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2021-06-25 13:27:04 +00:00
|
|
|
static inline int mem_cgroup_charge(struct folio *folio,
|
|
|
|
struct mm_struct *mm, gfp_t gfp)
|
mm: memcontrol: convert page cache to a new mem_cgroup_charge() API
The try/commit/cancel protocol that memcg uses dates back to when pages
used to be uncharged upon removal from the page cache, and thus couldn't
be committed before the insertion had succeeded. Nowadays, pages are
uncharged when they are physically freed; it doesn't matter whether the
insertion was successful or not. For the page cache, the transaction
dance has become unnecessary.
Introduce a mem_cgroup_charge() function that simply charges a newly
allocated page to a cgroup and sets up page->mem_cgroup in one single
step. If the insertion fails, the caller doesn't have to do anything but
free/put the page.
Then switch the page cache over to this new API.
Subsequent patches will also convert anon pages, but it needs a bit more
prep work. Right now, memcg depends on page->mapping being already set up
at the time of charging, so that it can maintain its own MEMCG_CACHE and
MEMCG_RSS counters. For anon, page->mapping is set under the same pte
lock under which the page is publishd, so a single charge point that can
block doesn't work there just yet.
The following prep patches will replace the private memcg counters with
the generic vmstat counters, thus removing the page->mapping dependency,
then complete the transition to the new single-point charge API and delete
the old transactional scheme.
v2: leave shmem swapcache when charging fails to avoid double IO (Joonsoo)
v3: rebase on preceeding shmem simplification patch
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Alex Shi <alex.shi@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Balbir Singh <bsingharora@gmail.com>
Link: http://lkml.kernel.org/r/20200508183105.225460-6-hannes@cmpxchg.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-06-03 23:01:41 +00:00
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
hugetlb: memcg: account hugetlb-backed memory in memory controller
Currently, hugetlb memory usage is not acounted for in the memory
controller, which could lead to memory overprotection for cgroups with
hugetlb-backed memory. This has been observed in our production system.
For instance, here is one of our usecases: suppose there are two 32G
containers. The machine is booted with hugetlb_cma=6G, and each container
may or may not use up to 3 gigantic page, depending on the workload within
it. The rest is anon, cache, slab, etc. We can set the hugetlb cgroup
limit of each cgroup to 3G to enforce hugetlb fairness. But it is very
difficult to configure memory.max to keep overall consumption, including
anon, cache, slab etc. fair.
What we have had to resort to is to constantly poll hugetlb usage and
readjust memory.max. Similar procedure is done to other memory limits
(memory.low for e.g). However, this is rather cumbersome and buggy.
Furthermore, when there is a delay in memory limits correction, (for e.g
when hugetlb usage changes within consecutive runs of the userspace
agent), the system could be in an over/underprotected state.
This patch rectifies this issue by charging the memcg when the hugetlb
folio is utilized, and uncharging when the folio is freed (analogous to
the hugetlb controller). Note that we do not charge when the folio is
allocated to the hugetlb pool, because at this point it is not owned by
any memcg.
Some caveats to consider:
* This feature is only available on cgroup v2.
* There is no hugetlb pool management involved in the memory
controller. As stated above, hugetlb folios are only charged towards
the memory controller when it is used. Host overcommit management
has to consider it when configuring hard limits.
* Failure to charge towards the memcg results in SIGBUS. This could
happen even if the hugetlb pool still has pages (but the cgroup
limit is hit and reclaim attempt fails).
* When this feature is enabled, hugetlb pages contribute to memory
reclaim protection. low, min limits tuning must take into account
hugetlb memory.
* Hugetlb pages utilized while this option is not selected will not
be tracked by the memory controller (even if cgroup v2 is remounted
later on).
Link: https://lkml.kernel.org/r/20231006184629.155543-4-nphamcs@gmail.com
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Frank van der Linden <fvdl@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Rik van Riel <riel@surriel.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Tejun heo <tj@kernel.org>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Zefan Li <lizefan.x@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-06 18:46:28 +00:00
|
|
|
static inline int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg,
|
|
|
|
gfp_t gfp, long nr_pages)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2022-09-02 19:46:12 +00:00
|
|
|
static inline int mem_cgroup_swapin_charge_folio(struct folio *folio,
|
2021-04-30 05:56:36 +00:00
|
|
|
struct mm_struct *mm, gfp_t gfp, swp_entry_t entry)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2024-09-08 23:21:18 +00:00
|
|
|
static inline void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr)
|
2021-04-30 05:56:36 +00:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2021-05-02 00:42:23 +00:00
|
|
|
static inline void mem_cgroup_uncharge(struct folio *folio)
|
2009-12-16 00:47:03 +00:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2024-02-27 17:42:39 +00:00
|
|
|
static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
memcontrol: add helpers for hugetlb memcg accounting
Patch series "hugetlb memcg accounting", v4.
Currently, hugetlb memory usage is not acounted for in the memory
controller, which could lead to memory overprotection for cgroups with
hugetlb-backed memory. This has been observed in our production system.
For instance, here is one of our usecases: suppose there are two 32G
containers. The machine is booted with hugetlb_cma=6G, and each container
may or may not use up to 3 gigantic page, depending on the workload within
it. The rest is anon, cache, slab, etc. We can set the hugetlb cgroup
limit of each cgroup to 3G to enforce hugetlb fairness. But it is very
difficult to configure memory.max to keep overall consumption, including
anon, cache, slab etcetera fair.
What we have had to resort to is to constantly poll hugetlb usage and
readjust memory.max. Similar procedure is done to other memory limits
(memory.low for e.g). However, this is rather cumbersome and buggy.
Furthermore, when there is a delay in memory limits correction, (for e.g
when hugetlb usage changes within consecutive runs of the userspace
agent), the system could be in an over/underprotected state.
This patch series rectifies this issue by charging the memcg when the
hugetlb folio is allocated, and uncharging when the folio is freed. In
addition, a new selftest is added to demonstrate and verify this new
behavior.
This patch (of 4):
This patch exposes charge committing and cancelling as parts of the memory
controller interface. These functionalities are useful when the
try_charge() and commit_charge() stages have to be separated by other
actions in between (which can fail). One such example is the new hugetlb
accounting behavior in the following patch.
The patch also adds a helper function to obtain a reference to the
current task's memcg.
Link: https://lkml.kernel.org/r/20231006184629.155543-1-nphamcs@gmail.com
Link: https://lkml.kernel.org/r/20231006184629.155543-2-nphamcs@gmail.com
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Frank van der Linden <fvdl@google.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Rik van Riel <riel@surriel.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Tejun heo <tj@kernel.org>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Zefan Li <lizefan.x@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-06 18:46:26 +00:00
|
|
|
static inline void mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
|
|
|
|
unsigned int nr_pages)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2023-10-06 18:46:27 +00:00
|
|
|
static inline void mem_cgroup_replace_folio(struct folio *old,
|
|
|
|
struct folio *new)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2021-05-06 22:14:59 +00:00
|
|
|
static inline void mem_cgroup_migrate(struct folio *old, struct folio *new)
|
memcg: remove refcnt from page_cgroup
memcg: performance improvements
Patch Description
1/5 ... remove refcnt fron page_cgroup patch (shmem handling is fixed)
2/5 ... swapcache handling patch
3/5 ... add helper function for shmem's memory reclaim patch
4/5 ... optimize by likely/unlikely ppatch
5/5 ... remove redundunt check patch (shmem handling is fixed.)
Unix bench result.
== 2.6.26-rc2-mm1 + memory resource controller
Execl Throughput 2915.4 lps (29.6 secs, 3 samples)
C Compiler Throughput 1019.3 lpm (60.0 secs, 3 samples)
Shell Scripts (1 concurrent) 5796.0 lpm (60.0 secs, 3 samples)
Shell Scripts (8 concurrent) 1097.7 lpm (60.0 secs, 3 samples)
Shell Scripts (16 concurrent) 565.3 lpm (60.0 secs, 3 samples)
File Read 1024 bufsize 2000 maxblocks 1022128.0 KBps (30.0 secs, 3 samples)
File Write 1024 bufsize 2000 maxblocks 544057.0 KBps (30.0 secs, 3 samples)
File Copy 1024 bufsize 2000 maxblocks 346481.0 KBps (30.0 secs, 3 samples)
File Read 256 bufsize 500 maxblocks 319325.0 KBps (30.0 secs, 3 samples)
File Write 256 bufsize 500 maxblocks 148788.0 KBps (30.0 secs, 3 samples)
File Copy 256 bufsize 500 maxblocks 99051.0 KBps (30.0 secs, 3 samples)
File Read 4096 bufsize 8000 maxblocks 2058917.0 KBps (30.0 secs, 3 samples)
File Write 4096 bufsize 8000 maxblocks 1606109.0 KBps (30.0 secs, 3 samples)
File Copy 4096 bufsize 8000 maxblocks 854789.0 KBps (30.0 secs, 3 samples)
Dc: sqrt(2) to 99 decimal places 126145.2 lpm (30.0 secs, 3 samples)
INDEX VALUES
TEST BASELINE RESULT INDEX
Execl Throughput 43.0 2915.4 678.0
File Copy 1024 bufsize 2000 maxblocks 3960.0 346481.0 875.0
File Copy 256 bufsize 500 maxblocks 1655.0 99051.0 598.5
File Copy 4096 bufsize 8000 maxblocks 5800.0 854789.0 1473.8
Shell Scripts (8 concurrent) 6.0 1097.7 1829.5
=========
FINAL SCORE 991.3
== 2.6.26-rc2-mm1 + this set ==
Execl Throughput 3012.9 lps (29.9 secs, 3 samples)
C Compiler Throughput 981.0 lpm (60.0 secs, 3 samples)
Shell Scripts (1 concurrent) 5872.0 lpm (60.0 secs, 3 samples)
Shell Scripts (8 concurrent) 1120.3 lpm (60.0 secs, 3 samples)
Shell Scripts (16 concurrent) 578.0 lpm (60.0 secs, 3 samples)
File Read 1024 bufsize 2000 maxblocks 1003993.0 KBps (30.0 secs, 3 samples)
File Write 1024 bufsize 2000 maxblocks 550452.0 KBps (30.0 secs, 3 samples)
File Copy 1024 bufsize 2000 maxblocks 347159.0 KBps (30.0 secs, 3 samples)
File Read 256 bufsize 500 maxblocks 314644.0 KBps (30.0 secs, 3 samples)
File Write 256 bufsize 500 maxblocks 151852.0 KBps (30.0 secs, 3 samples)
File Copy 256 bufsize 500 maxblocks 101000.0 KBps (30.0 secs, 3 samples)
File Read 4096 bufsize 8000 maxblocks 2033256.0 KBps (30.0 secs, 3 samples)
File Write 4096 bufsize 8000 maxblocks 1611814.0 KBps (30.0 secs, 3 samples)
File Copy 4096 bufsize 8000 maxblocks 847979.0 KBps (30.0 secs, 3 samples)
Dc: sqrt(2) to 99 decimal places 128148.7 lpm (30.0 secs, 3 samples)
INDEX VALUES
TEST BASELINE RESULT INDEX
Execl Throughput 43.0 3012.9 700.7
File Copy 1024 bufsize 2000 maxblocks 3960.0 347159.0 876.7
File Copy 256 bufsize 500 maxblocks 1655.0 101000.0 610.3
File Copy 4096 bufsize 8000 maxblocks 5800.0 847979.0 1462.0
Shell Scripts (8 concurrent) 6.0 1120.3 1867.2
=========
FINAL SCORE 1004.6
This patch:
Remove refcnt from page_cgroup().
After this,
* A page is charged only when !page_mapped() && no page_cgroup is assigned.
* Anon page is newly mapped.
* File page is added to mapping->tree.
* A page is uncharged only when
* Anon page is fully unmapped.
* File page is removed from LRU.
There is no change in behavior from user's view.
This patch also removes unnecessary calls in rmap.c which was used only for
refcnt mangement.
[akpm@linux-foundation.org: fix warning]
[hugh@veritas.com: fix shmem_unuse_inode charging]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
Cc: Paul Menage <menage@google.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-25 08:47:14 +00:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2019-12-01 01:55:34 +00:00
|
|
|
static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg,
|
|
|
|
struct pglist_data *pgdat)
|
memcg: synchronized LRU
A big patch for changing memcg's LRU semantics.
Now,
- page_cgroup is linked to mem_cgroup's its own LRU (per zone).
- LRU of page_cgroup is not synchronous with global LRU.
- page and page_cgroup is one-to-one and statically allocated.
- To find page_cgroup is on what LRU, you have to check pc->mem_cgroup as
- lru = page_cgroup_zoneinfo(pc, nid_of_pc, zid_of_pc);
- SwapCache is handled.
And, when we handle LRU list of page_cgroup, we do following.
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc); .....................(1)
mz = page_cgroup_zoneinfo(pc);
spin_lock(&mz->lru_lock);
.....add to LRU
spin_unlock(&mz->lru_lock);
unlock_page_cgroup(pc);
But (1) is spin_lock and we have to be afraid of dead-lock with zone->lru_lock.
So, trylock() is used at (1), now. Without (1), we can't trust "mz" is correct.
This is a trial to remove this dirty nesting of locks.
This patch changes mz->lru_lock to be zone->lru_lock.
Then, above sequence will be written as
spin_lock(&zone->lru_lock); # in vmscan.c or swap.c via global LRU
mem_cgroup_add/remove/etc_lru() {
pc = lookup_page_cgroup(page);
mz = page_cgroup_zoneinfo(pc);
if (PageCgroupUsed(pc)) {
....add to LRU
}
spin_lock(&zone->lru_lock); # in vmscan.c or swap.c via global LRU
This is much simpler.
(*) We're safe even if we don't take lock_page_cgroup(pc). Because..
1. When pc->mem_cgroup can be modified.
- at charge.
- at account_move().
2. at charge
the PCG_USED bit is not set before pc->mem_cgroup is fixed.
3. at account_move()
the page is isolated and not on LRU.
Pros.
- easy for maintenance.
- memcg can make use of laziness of pagevec.
- we don't have to duplicated LRU/Active/Unevictable bit in page_cgroup.
- LRU status of memcg will be synchronized with global LRU's one.
- # of locks are reduced.
- account_move() is simplified very much.
Cons.
- may increase cost of LRU rotation.
(no impact if memcg is not configured.)
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-08 02:08:01 +00:00
|
|
|
{
|
2019-12-01 01:55:34 +00:00
|
|
|
return &pgdat->__lruvec;
|
memcg: synchronized LRU
A big patch for changing memcg's LRU semantics.
Now,
- page_cgroup is linked to mem_cgroup's its own LRU (per zone).
- LRU of page_cgroup is not synchronous with global LRU.
- page and page_cgroup is one-to-one and statically allocated.
- To find page_cgroup is on what LRU, you have to check pc->mem_cgroup as
- lru = page_cgroup_zoneinfo(pc, nid_of_pc, zid_of_pc);
- SwapCache is handled.
And, when we handle LRU list of page_cgroup, we do following.
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc); .....................(1)
mz = page_cgroup_zoneinfo(pc);
spin_lock(&mz->lru_lock);
.....add to LRU
spin_unlock(&mz->lru_lock);
unlock_page_cgroup(pc);
But (1) is spin_lock and we have to be afraid of dead-lock with zone->lru_lock.
So, trylock() is used at (1), now. Without (1), we can't trust "mz" is correct.
This is a trial to remove this dirty nesting of locks.
This patch changes mz->lru_lock to be zone->lru_lock.
Then, above sequence will be written as
spin_lock(&zone->lru_lock); # in vmscan.c or swap.c via global LRU
mem_cgroup_add/remove/etc_lru() {
pc = lookup_page_cgroup(page);
mz = page_cgroup_zoneinfo(pc);
if (PageCgroupUsed(pc)) {
....add to LRU
}
spin_lock(&zone->lru_lock); # in vmscan.c or swap.c via global LRU
This is much simpler.
(*) We're safe even if we don't take lock_page_cgroup(pc). Because..
1. When pc->mem_cgroup can be modified.
- at charge.
- at account_move().
2. at charge
the PCG_USED bit is not set before pc->mem_cgroup is fixed.
3. at account_move()
the page is isolated and not on LRU.
Pros.
- easy for maintenance.
- memcg can make use of laziness of pagevec.
- we don't have to duplicated LRU/Active/Unevictable bit in page_cgroup.
- LRU status of memcg will be synchronized with global LRU's one.
- # of locks are reduced.
- account_move() is simplified very much.
Cons.
- may increase cost of LRU rotation.
(no impact if memcg is not configured.)
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-08 02:08:01 +00:00
|
|
|
}
|
|
|
|
|
2021-06-29 00:00:28 +00:00
|
|
|
static inline struct lruvec *folio_lruvec(struct folio *folio)
|
2008-02-07 08:13:56 +00:00
|
|
|
{
|
2021-06-29 00:00:28 +00:00
|
|
|
struct pglist_data *pgdat = folio_pgdat(folio);
|
2019-12-01 01:55:34 +00:00
|
|
|
return &pgdat->__lruvec;
|
2008-02-07 08:13:56 +00:00
|
|
|
}
|
|
|
|
|
2021-06-29 01:59:47 +00:00
|
|
|
static inline
|
|
|
|
void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
|
mm: memcontrol: switch to rstat
Replace the memory controller's custom hierarchical stats code with the
generic rstat infrastructure provided by the cgroup core.
The current implementation does batched upward propagation from the
write side (i.e. as stats change). The per-cpu batches introduce an
error, which is multiplied by the number of subgroups in a tree. In
systems with many CPUs and sizable cgroup trees, the error can be large
enough to confuse users (e.g. 32 batch pages * 32 CPUs * 32 subgroups
results in an error of up to 128M per stat item). This can entirely
swallow allocation bursts inside a workload that the user is expecting
to see reflected in the statistics.
In the past, we've done read-side aggregation, where a memory.stat read
would have to walk the entire subtree and add up per-cpu counts. This
became problematic with lazily-freed cgroups: we could have large
subtrees where most cgroups were entirely idle. Hence the switch to
change-driven upward propagation. Unfortunately, it needed to trade
accuracy for speed due to the write side being so hot.
Rstat combines the best of both worlds: from the write side, it cheaply
maintains a queue of cgroups that have pending changes, so that the read
side can do selective tree aggregation. This way the reported stats
will always be precise and recent as can be, while the aggregation can
skip over potentially large numbers of idle cgroups.
The way rstat works is that it implements a tree for tracking cgroups
with pending local changes, as well as a flush function that walks the
tree upwards. The controller then drives this by 1) telling rstat when
a local cgroup stat changes (e.g. mod_memcg_state) and 2) when a flush
is required to get uptodate hierarchy stats for a given subtree (e.g.
when memory.stat is read). The controller also provides a flush
callback that is called during the rstat flush walk for each cgroup and
aggregates its local per-cpu counters and propagates them upwards.
This adds a second vmstats to struct mem_cgroup (MEMCG_NR_STAT +
NR_VM_EVENT_ITEMS) to track pending subtree deltas during upward
aggregation. It removes 3 words from the per-cpu data. It eliminates
memcg_exact_page_state(), since memcg_page_state() is now exact.
[akpm@linux-foundation.org: merge fix]
[hannes@cmpxchg.org: fix a sleep in atomic section problem]
Link: https://lkml.kernel.org/r/20210315234100.64307-1-hannes@cmpxchg.org
Link: https://lkml.kernel.org/r/20210209163304.77088-7-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Roman Gushchin <guro@fb.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Michal Koutný <mkoutny@suse.com>
Acked-by: Balbir Singh <bsingharora@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-04-30 05:56:26 +00:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
mm: vmscan: detect file thrashing at the reclaim root
We use refault information to determine whether the cache workingset is
stable or transitioning, and dynamically adjust the inactive:active file
LRU ratio so as to maximize protection from one-off cache during stable
periods, and minimize IO during transitions.
With cgroups and their nested LRU lists, we currently don't do this
correctly. While recursive cgroup reclaim establishes a relative LRU
order among the pages of all involved cgroups, refaults only affect the
local LRU order in the cgroup in which they are occuring. As a result,
cache transitions can take longer in a cgrouped system as the active pages
of sibling cgroups aren't challenged when they should be.
[ Right now, this is somewhat theoretical, because the siblings, under
continued regular reclaim pressure, should eventually run out of
inactive pages - and since inactive:active *size* balancing is also
done on a cgroup-local level, we will challenge the active pages
eventually in most cases. But the next patch will move that relative
size enforcement to the reclaim root as well, and then this patch
here will be necessary to propagate refault pressure to siblings. ]
This patch moves refault detection to the root of reclaim. Instead of
remembering the cgroup owner of an evicted page, remember the cgroup that
caused the reclaim to happen. When refaults later occur, they'll
correctly influence the cross-cgroup LRU order that reclaim follows.
I.e. if global reclaim kicked out pages in some subgroup A/B/C, the
refault of those pages will challenge the global LRU order, and not just
the local order down inside C.
[hannes@cmpxchg.org: use page_memcg() instead of another lookup]
Link: http://lkml.kernel.org/r/20191115160722.GA309754@cmpxchg.org
Link: http://lkml.kernel.org/r/20191107205334.158354-3-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Shakeel Butt <shakeelb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-12-01 01:55:59 +00:00
|
|
|
static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2012-10-08 23:34:12 +00:00
|
|
|
static inline bool mm_match_cgroup(struct mm_struct *mm,
|
2011-11-02 20:38:15 +00:00
|
|
|
struct mem_cgroup *memcg)
|
2008-02-07 08:14:01 +00:00
|
|
|
{
|
2012-10-08 23:34:12 +00:00
|
|
|
return true;
|
2008-02-07 08:14:01 +00:00
|
|
|
}
|
|
|
|
|
fs: fsnotify: account fsnotify metadata to kmemcg
Patch series "Directed kmem charging", v8.
The Linux kernel's memory cgroup allows limiting the memory usage of the
jobs running on the system to provide isolation between the jobs. All
the kernel memory allocated in the context of the job and marked with
__GFP_ACCOUNT will also be included in the memory usage and be limited
by the job's limit.
The kernel memory can only be charged to the memcg of the process in
whose context kernel memory was allocated. However there are cases
where the allocated kernel memory should be charged to the memcg
different from the current processes's memcg. This patch series
contains two such concrete use-cases i.e. fsnotify and buffer_head.
The fsnotify event objects can consume a lot of system memory for large
or unlimited queues if there is either no or slow listener. The events
are allocated in the context of the event producer. However they should
be charged to the event consumer. Similarly the buffer_head objects can
be allocated in a memcg different from the memcg of the page for which
buffer_head objects are being allocated.
To solve this issue, this patch series introduces mechanism to charge
kernel memory to a given memcg. In case of fsnotify events, the memcg
of the consumer can be used for charging and for buffer_head, the memcg
of the page can be charged. For directed charging, the caller can use
the scope API memalloc_[un]use_memcg() to specify the memcg to charge
for all the __GFP_ACCOUNT allocations within the scope.
This patch (of 2):
A lot of memory can be consumed by the events generated for the huge or
unlimited queues if there is either no or slow listener. This can cause
system level memory pressure or OOMs. So, it's better to account the
fsnotify kmem caches to the memcg of the listener.
However the listener can be in a different memcg than the memcg of the
producer and these allocations happen in the context of the event
producer. This patch introduces remote memcg charging API which the
producer can use to charge the allocations to the memcg of the listener.
There are seven fsnotify kmem caches and among them allocations from
dnotify_struct_cache, dnotify_mark_cache, fanotify_mark_cache and
inotify_inode_mark_cachep happens in the context of syscall from the
listener. So, SLAB_ACCOUNT is enough for these caches.
The objects from fsnotify_mark_connector_cachep are not accounted as
they are small compared to the notification mark or events and it is
unclear whom to account connector to since it is shared by all events
attached to the inode.
The allocations from the event caches happen in the context of the event
producer. For such caches we will need to remote charge the allocations
to the listener's memcg. Thus we save the memcg reference in the
fsnotify_group structure of the listener.
This patch has also moved the members of fsnotify_group to keep the size
same, at least for 64 bit build, even with additional member by filling
the holes.
[shakeelb@google.com: use GFP_KERNEL_ACCOUNT rather than open-coding it]
Link: http://lkml.kernel.org/r/20180702215439.211597-1-shakeelb@google.com
Link: http://lkml.kernel.org/r/20180627191250.209150-2-shakeelb@google.com
Signed-off-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Amir Goldstein <amir73il@gmail.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-08-17 22:46:39 +00:00
|
|
|
static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
memcontrol: add helpers for hugetlb memcg accounting
Patch series "hugetlb memcg accounting", v4.
Currently, hugetlb memory usage is not acounted for in the memory
controller, which could lead to memory overprotection for cgroups with
hugetlb-backed memory. This has been observed in our production system.
For instance, here is one of our usecases: suppose there are two 32G
containers. The machine is booted with hugetlb_cma=6G, and each container
may or may not use up to 3 gigantic page, depending on the workload within
it. The rest is anon, cache, slab, etc. We can set the hugetlb cgroup
limit of each cgroup to 3G to enforce hugetlb fairness. But it is very
difficult to configure memory.max to keep overall consumption, including
anon, cache, slab etcetera fair.
What we have had to resort to is to constantly poll hugetlb usage and
readjust memory.max. Similar procedure is done to other memory limits
(memory.low for e.g). However, this is rather cumbersome and buggy.
Furthermore, when there is a delay in memory limits correction, (for e.g
when hugetlb usage changes within consecutive runs of the userspace
agent), the system could be in an over/underprotected state.
This patch series rectifies this issue by charging the memcg when the
hugetlb folio is allocated, and uncharging when the folio is freed. In
addition, a new selftest is added to demonstrate and verify this new
behavior.
This patch (of 4):
This patch exposes charge committing and cancelling as parts of the memory
controller interface. These functionalities are useful when the
try_charge() and commit_charge() stages have to be separated by other
actions in between (which can fail). One such example is the new hugetlb
accounting behavior in the following patch.
The patch also adds a helper function to obtain a reference to the
current task's memcg.
Link: https://lkml.kernel.org/r/20231006184629.155543-1-nphamcs@gmail.com
Link: https://lkml.kernel.org/r/20231006184629.155543-2-nphamcs@gmail.com
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Frank van der Linden <fvdl@google.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Rik van Riel <riel@surriel.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Tejun heo <tj@kernel.org>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Zefan Li <lizefan.x@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-06 18:46:26 +00:00
|
|
|
static inline struct mem_cgroup *get_mem_cgroup_from_current(void)
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
mm,memcg: provide per-cgroup counters for NUMA balancing operations
The ability to observe the demotion and promotion decisions made by the
kernel on a per-cgroup basis is important for monitoring and tuning
containerized workloads on machines equipped with tiered memory.
Different containers in the system may experience drastically different
memory tiering actions that cannot be distinguished from the global
counters alone.
For example, a container running a workload that has a much hotter memory
accesses will likely see more promotions and fewer demotions, potentially
depriving a colocated container of top tier memory to such an extent that
its performance degrades unacceptably.
For another example, some containers may exhibit longer periods between
data reuse, causing much more numa_hint_faults than numa_pages_migrated.
In this case, tuning hot_threshold_ms may be appropriate, but the signal
can easily be lost if only global counters are available.
In the long term, we hope to introduce per-cgroup control of promotion and
demotion actions to implement memory placement policies in tiering.
This patch set adds seven counters to memory.stat in a cgroup:
numa_pages_migrated, numa_pte_updates, numa_hint_faults, pgdemote_kswapd,
pgdemote_khugepaged, pgdemote_direct and pgpromote_success. pgdemote_*
and pgpromote_success are also available in memory.numa_stat.
count_memcg_events_mm() is added to count multiple event occurrences at
once, and get_mem_cgroup_from_folio() is added because we need to get a
reference to the memcg of a folio before it's migrated to track
numa_pages_migrated. The accounting of PGDEMOTE_* is moved to
shrink_inactive_list() before being changed to per-cgroup.
[kaiyang2@cs.cmu.edu: add documentation of the memcg counters in cgroup-v2.rst]
Link: https://lkml.kernel.org/r/20240814235122.252309-1-kaiyang2@cs.cmu.edu
Link: https://lkml.kernel.org/r/20240814174227.30639-1-kaiyang2@cs.cmu.edu
Signed-off-by: Kaiyang Zhao <kaiyang2@cs.cmu.edu>
Cc: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Wei Xu <weixugc@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-14 17:42:27 +00:00
|
|
|
static inline struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio)
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2021-06-29 02:38:21 +00:00
|
|
|
static inline
|
|
|
|
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css)
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
mm: define obj_cgroup_get() if CONFIG_MEMCG is not defined
Patch series "mm: zswap swap-out of large folios", v10.
This patch series enables zswap_store() to accept and store large folios.
The most significant contribution in this series is from the earlier RFC
submitted by Ryan Roberts [1]. Ryan's original RFC has been migrated to
mm-unstable as of 9-30-2024 in patch 6 of this series, and adapted based
on code review comments received for the current patch-series.
[1]: [RFC PATCH v1] mm: zswap: Store large folios without splitting
https://lore.kernel.org/linux-mm/20231019110543.3284654-1-ryan.roberts@arm.com/T/#u
The first few patches do the prep work for supporting large folios in
zswap_store. Patch 6 provides the main functionality to swap-out large
folios in zswap. Patch 7 adds sysfs per-order hugepages "zswpout"
counters that get incremented upon successful zswap_store of large folios,
and also updates the documentation for this:
/sys/kernel/mm/transparent_hugepage/hugepages-*kB/stats/zswpout
This patch series is a prerequisite for zswap compress batching of large
folio swap-out and decompress batching of swap-ins based on
swapin_readahead(), using Intel IAA hardware acceleration, which we would
like to submit in subsequent patch-series, with performance improvement
data.
Thanks to Ying Huang for pre-posting review feedback and suggestions!
Thanks also to Nhat, Yosry, Johannes, Barry, Chengming, Usama, Ying and
Matthew for their helpful feedback, code/data reviews and suggestions!
Co-development signoff request:
===============================
I would like to thank Ryan Roberts for his original RFC [1] and request
his co-developer signoff on patch 6 in this series. Thanks Ryan!
System setup for testing:
=========================
Testing of this patch series was done with mm-unstable as of 9-27-2024,
commit de2fbaa6d9c3576ec7133ed02a370ec9376bf000 (without this patch-series)
and mm-unstable 9-30-2024 commit c121617e3606be6575cdacfdb63cc8d67b46a568
(with this patch-series). Data was gathered on an Intel Sapphire Rapids
server, dual-socket 56 cores per socket, 4 IAA devices per socket, 503 GiB
RAM and 525G SSD disk partition swap. Core frequency was fixed at 2500MHz.
The vm-scalability "usemem" test was run in a cgroup whose memory.high
was fixed at 150G. The is no swap limit set for the cgroup. 30 usemem
processes were run, each allocating and writing 10G of memory, and sleeping
for 10 sec before exiting:
usemem --init-time -w -O -s 10 -n 30 10g
Other kernel configuration parameters:
zswap compressors : zstd, deflate-iaa
zswap allocator : zsmalloc
vm.page-cluster : 2
In the experiments where "deflate-iaa" is used as the zswap compressor,
IAA "compression verification" is enabled by default
(cat /sys/bus/dsa/drivers/crypto/verify_compress). Hence each IAA
compression will be decompressed internally by the "iaa_crypto" driver, the
crc-s returned by the hardware will be compared and errors reported in case
of mismatches. Thus "deflate-iaa" helps ensure better data integrity as
compared to the software compressors, and the experimental data listed
below is with verify_compress set to "1".
Metrics reporting methodology:
==============================
Total and average throughput are derived from the individual 30 processes'
throughputs reported by usemem. elapsed/sys times are measured with perf.
All percentage changes are "new" vs. "old"; hence a positive value
denotes an increase in the metric, whether it is throughput or latency,
and a negative value denotes a reduction in the metric. Positive throughput
change percentages and negative latency change percentages denote improvements.
The vm stats and sysfs hugepages stats included with the performance data
provide details on the swapout activity to zswap/swap device.
Testing labels used in data summaries:
======================================
The data refers to these test configurations and the before/after
comparisons that they do:
before-case1:
-------------
mm-unstable 9-27-2024, CONFIG_THP_SWAP=N (compares zswap 4K vs. zswap 64K)
In this scenario, CONFIG_THP_SWAP=N results in 64K/2M folios to be split
into 4K folios that get processed by zswap.
before-case2:
-------------
mm-unstable 9-27-2024, CONFIG_THP_SWAP=Y (compares SSD swap large folios vs. zswap large folios)
In this scenario, CONFIG_THP_SWAP=Y results in zswap rejecting large
folios, which will then be stored by the SSD swap device.
after:
------
v10 of this patch-series, CONFIG_THP_SWAP=Y
The "after" is CONFIG_THP_SWAP=Y and v10 of this patch-series, that results
in 64K/2M folios to not be split, and to be processed by zswap_store.
Regression Testing:
===================
I ran vm-scalability usemem without large folios, i.e., only 4K folios with
mm-unstable and this patch-series. The main goal was to make sure that
there is no functional or performance regression wrt the earlier zswap
behavior for 4K folios, now that 4K folios will be processed by the new
zswap_store() code.
The data indicates there is no significant regression.
-------------------------------------------------------------------------------
4K folios:
==========
zswap compressor zstd zstd zstd zstd v10
before-case1 before-case2 after vs. vs.
case1 case2
-------------------------------------------------------------------------------
Total throughput (KB/s) 4,793,363 4,880,978 4,853,074 1% -1%
Average throughput (KB/s) 159,778 162,699 161,769 1% -1%
elapsed time (sec) 130.14 123.17 126.29 -3% 3%
sys time (sec) 3,135.53 2,985.64 3,083.18 -2% 3%
memcg_high 446,826 444,626 452,930
memcg_swap_fail 0 0 0
zswpout 48,932,107 48,931,971 48,931,820
zswpin 383 386 397
pswpout 0 0 0
pswpin 0 0 0
thp_swpout 0 0 0
thp_swpout_fallback 0 0 0
64kB-mthp_swpout_fallback 0 0 0
pgmajfault 3,063 3,077 3,479
swap_ra 93 94 96
swap_ra_hit 47 47 50
ZSWPOUT-64kB n/a n/a 0
SWPOUT-64kB 0 0 0
-------------------------------------------------------------------------------
Performance Testing:
====================
We list the data for 64K folios with before/after data per-compressor,
followed by the same for 2M pmd-mappable folios.
-------------------------------------------------------------------------------
64K folios: zstd:
=================
zswap compressor zstd zstd zstd zstd v10
before-case1 before-case2 after vs. vs.
case1 case2
-------------------------------------------------------------------------------
Total throughput (KB/s) 5,222,213 1,076,611 6,159,776 18% 472%
Average throughput (KB/s) 174,073 35,887 205,325 18% 472%
elapsed time (sec) 120.50 347.16 108.33 -10% -69%
sys time (sec) 2,930.33 248.16 2,549.65 -13% 927%
memcg_high 416,773 552,200 465,874
memcg_swap_fail 3,192,906 1,293 1,012
zswpout 48,931,583 20,903 48,931,218
zswpin 384 363 410
pswpout 0 40,778,448 0
pswpin 0 16 0
thp_swpout 0 0 0
thp_swpout_fallback 0 0 0
64kB-mthp_swpout_fallback 3,192,906 1,293 1,012
pgmajfault 3,452 3,072 3,061
swap_ra 90 87 107
swap_ra_hit 42 43 57
ZSWPOUT-64kB n/a n/a 3,057,173
SWPOUT-64kB 0 2,548,653 0
-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
64K folios: deflate-iaa:
========================
zswap compressor deflate-iaa deflate-iaa deflate-iaa deflate-iaa v10
before-case1 before-case2 after vs. vs.
case1 case2
-------------------------------------------------------------------------------
Total throughput (KB/s) 5,652,608 1,089,180 7,189,778 27% 560%
Average throughput (KB/s) 188,420 36,306 239,659 27% 560%
elapsed time (sec) 102.90 343.35 87.05 -15% -75%
sys time (sec) 2,246.86 213.53 1,864.16 -17% 773%
memcg_high 576,104 502,907 642,083
memcg_swap_fail 4,016,117 1,407 1,478
zswpout 61,163,423 22,444 57,798,716
zswpin 401 368 454
pswpout 0 40,862,080 0
pswpin 0 20 0
thp_swpout 0 0 0
thp_swpout_fallback 0 0 0
64kB-mthp_swpout_fallback 4,016,117 1,407 1,478
pgmajfault 3,063 3,153 3,122
swap_ra 96 93 156
swap_ra_hit 46 45 83
ZSWPOUT-64kB n/a n/a 3,611,032
SWPOUT-64kB 0 2,553,880 0
-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
2M folios: zstd:
================
zswap compressor zstd zstd zstd zstd v10
before-case1 before-case2 after vs. vs.
case1 case2
-------------------------------------------------------------------------------
Total throughput (KB/s) 5,895,500 1,109,694 6,484,224 10% 484%
Average throughput (KB/s) 196,516 36,989 216,140 10% 484%
elapsed time (sec) 108.77 334.28 106.33 -2% -68%
sys time (sec) 2,657.14 94.88 2,376.13 -11% 2404%
memcg_high 64,200 66,316 56,898
memcg_swap_fail 101,182 70 27
zswpout 48,931,499 36,507 48,890,640
zswpin 380 379 377
pswpout 0 40,166,400 0
pswpin 0 0 0
thp_swpout 0 78,450 0
thp_swpout_fallback 101,182 70 27
2MB-mthp_swpout_fallback 0 0 27
pgmajfault 3,067 3,417 3,311
swap_ra 91 90 854
swap_ra_hit 45 45 810
ZSWPOUT-2MB n/a n/a 95,459
SWPOUT-2MB 0 78,450 0
-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
2M folios: deflate-iaa:
=======================
zswap compressor deflate-iaa deflate-iaa deflate-iaa deflate-iaa v10
before-case1 before-case2 after vs. vs.
case1 case2
-------------------------------------------------------------------------------
Total throughput (KB/s) 6,286,587 1,126,785 7,073,464 13% 528%
Average throughput (KB/s) 209,552 37,559 235,782 13% 528%
elapsed time (sec) 96.19 333.03 85.79 -11% -74%
sys time (sec) 2,141.44 99.96 1,826.67 -15% 1727%
memcg_high 99,253 64,666 79,718
memcg_swap_fail 129,074 53 165
zswpout 61,312,794 28,321 56,045,120
zswpin 383 406 403
pswpout 0 40,048,128 0
pswpin 0 0 0
thp_swpout 0 78,219 0
thp_swpout_fallback 129,074 53 165
2MB-mthp_swpout_fallback 0 0 165
pgmajfault 3,430 3,077 31,468
swap_ra 91 103 84,373
swap_ra_hit 47 46 84,317
ZSWPOUT-2MB n/a n/a 109,229
SWPOUT-2MB 0 78,219 0
-------------------------------------------------------------------------------
And finally, this is a comparison of deflate-iaa vs. zstd with v10 of this
patch-series:
---------------------------------------------
zswap_store large folios v10
Impr w/ deflate-iaa vs. zstd
64K folios 2M folios
---------------------------------------------
Throughput (KB/s) 17% 9%
elapsed time (sec) -20% -19%
sys time (sec) -27% -23%
---------------------------------------------
Conclusions based on the performance results:
=============================================
v10 wrt before-case1:
---------------------
We see significant improvements in throughput, elapsed and sys time for
zstd and deflate-iaa, when comparing before-case1 (THP_SWAP=N) vs. after
(THP_SWAP=Y) with zswap_store large folios.
v10 wrt before-case2:
---------------------
We see even more significant improvements in throughput and elapsed time
for zstd and deflate-iaa, when comparing before-case2 (large-folio-SSD)
vs. after (large-folio-zswap). The sys time increases with
large-folio-zswap as expected, due to the CPU compression time
vs. asynchronous disk write times, as pointed out by Ying and Yosry.
In before-case2, when zswap does not store large folios, only allocations
and cgroup charging due to 4K folio zswap stores count towards the cgroup
memory limit. However, in the after scenario, with the introduction of
zswap_store() of large folios, there is an added component of the zswap
compressed pool usage from large folio stores from potentially all 30
processes, that gets counted towards the memory limit. As a result, we see
higher swapout activity in the "after" data.
Summary:
========
The v10 data presented above shows that zswap_store of large folios
demonstrates good throughput/performance improvements compared to
conventional SSD swap of large folios with a sufficiently large 525G SSD
swap device. Hence, it seems reasonable for zswap_store to support large
folios, so that further performance improvements can be implemented.
In the experimental setup used in this patchset, we have enabled IAA
compress verification to ensure additional hardware data integrity CRC
checks not currently done by the software compressors. We see good
throughput/latency improvements with deflate-iaa vs. zstd with zswap_store
of large folios.
Some of the ideas for further reducing latency that have shown promise in
our experiments, are:
1) IAA compress/decompress batching.
2) Distributing compress jobs across all IAA devices on the socket.
The tests run for this patchset are using only 1 IAA device per core, that
avails of 2 compress engines on the device. In our experiments with IAA
batching, we distribute compress jobs from all cores to the 8 compress
engines available per socket. We further compress the pages in each folio
in parallel in the accelerator. As a result, we improve compress latency
and reclaim throughput.
In decompress batching, we use swapin_readahead to generate a prefetch
batch of 4K folios that we decompress in parallel in IAA.
------------------------------------------------------------------------------
IAA compress/decompress batching
Further improvements wrt v10 zswap_store Sequential
subpage store using "deflate-iaa":
"deflate-iaa" Batching "deflate-iaa-canned" [2] Batching
Additional Impr Additional Impr
64K folios 2M folios 64K folios 2M folios
------------------------------------------------------------------------------
Throughput (KB/s) 19% 43% 26% 55%
elapsed time (sec) -5% -14% -10% -21%
sys time (sec) 4% -7% -4% -18%
------------------------------------------------------------------------------
With zswap IAA compress/decompress batching, we are able to demonstrate
significant performance improvements and memory savings in server
scalability experiments in highly contended system scenarios under
significant memory pressure; as compared to software compressors. We hope
to submit this work in subsequent patch series. The current patch-series is
a prequisite for these future submissions.
[1] https://lore.kernel.org/linux-mm/20231019110543.3284654-1-ryan.roberts@arm.com/T/#u
[2] https://patchwork.kernel.org/project/linux-crypto/cover/cover.1710969449.git.andre.glover@linux.intel.com/
This patch (of 6):
This resolves an issue with obj_cgroup_get() not being defined if
CONFIG_MEMCG is not defined.
Before this patch, we would see build errors if obj_cgroup_get() is called
from code that is agnostic of CONFIG_MEMCG.
The zswap_store() changes for large folios in subsequent commits will
require the use of obj_cgroup_get() in zswap code that falls into this
category.
Link: https://lkml.kernel.org/r/20241001053222.6944-1-kanchana.p.sridhar@intel.com
Link: https://lkml.kernel.org/r/20241001053222.6944-2-kanchana.p.sridhar@intel.com
Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
Reviewed-by: Nhat Pham <nphamcs@gmail.com>
Reviewed-by: Yosry Ahmed <yosryahmed@google.com>
Reviewed-by: Chengming Zhou <chengming.zhou@linux.dev>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Usama Arif <usamaarif642@gmail.com>
Cc: Wajdi Feghali <wajdi.k.feghali@intel.com>
Cc: "Zou, Nanhai" <nanhai.zou@intel.com>
Cc: Barry Song <21cnbao@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-01 05:32:16 +00:00
|
|
|
static inline void obj_cgroup_get(struct obj_cgroup *objcg)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2022-05-19 21:08:53 +00:00
|
|
|
static inline void obj_cgroup_put(struct obj_cgroup *objcg)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
mm: multi-gen LRU: per-node lru_gen_folio lists
For each node, memcgs are divided into two generations: the old and
the young. For each generation, memcgs are randomly sharded into
multiple bins to improve scalability. For each bin, an RCU hlist_nulls
is virtually divided into three segments: the head, the tail and the
default.
An onlining memcg is added to the tail of a random bin in the old
generation. The eviction starts at the head of a random bin in the old
generation. The per-node memcg generation counter, whose reminder (mod
2) indexes the old generation, is incremented when all its bins become
empty.
There are four operations:
1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in
its current generation (old or young) and updates its "seg" to
"head";
2. MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in
its current generation (old or young) and updates its "seg" to
"tail";
3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in
the old generation, updates its "gen" to "old" and resets its "seg"
to "default";
4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin
in the young generation, updates its "gen" to "young" and resets
its "seg" to "default".
The events that trigger the above operations are:
1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD;
2. The first attempt to reclaim an memcg below low, which triggers
MEMCG_LRU_TAIL;
3. The first attempt to reclaim an memcg below reclaimable size
threshold, which triggers MEMCG_LRU_TAIL;
4. The second attempt to reclaim an memcg below reclaimable size
threshold, which triggers MEMCG_LRU_YOUNG;
5. Attempting to reclaim an memcg below min, which triggers
MEMCG_LRU_YOUNG;
6. Finishing the aging on the eviction path, which triggers
MEMCG_LRU_YOUNG;
7. Offlining an memcg, which triggers MEMCG_LRU_OLD.
Note that memcg LRU only applies to global reclaim, and the
round-robin incrementing of their max_seq counters ensures the
eventual fairness to all eligible memcgs. For memcg reclaim, it still
relies on mem_cgroup_iter().
Link: https://lkml.kernel.org/r/20221222041905.2431096-7-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-12-22 04:19:04 +00:00
|
|
|
static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
|
|
|
|
{
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2023-11-30 19:40:19 +00:00
|
|
|
static inline bool mem_cgroup_tryget_online(struct mem_cgroup *memcg)
|
|
|
|
{
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2018-08-17 22:46:36 +00:00
|
|
|
static inline void mem_cgroup_put(struct mem_cgroup *memcg)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2021-06-29 01:59:47 +00:00
|
|
|
static inline struct lruvec *folio_lruvec_lock(struct folio *folio)
|
2020-12-15 20:34:29 +00:00
|
|
|
{
|
2021-06-29 01:59:47 +00:00
|
|
|
struct pglist_data *pgdat = folio_pgdat(folio);
|
2020-12-15 20:34:29 +00:00
|
|
|
|
|
|
|
spin_lock(&pgdat->__lruvec.lru_lock);
|
|
|
|
return &pgdat->__lruvec;
|
|
|
|
}
|
|
|
|
|
2021-06-29 01:59:47 +00:00
|
|
|
static inline struct lruvec *folio_lruvec_lock_irq(struct folio *folio)
|
2020-12-15 20:34:29 +00:00
|
|
|
{
|
2021-06-29 01:59:47 +00:00
|
|
|
struct pglist_data *pgdat = folio_pgdat(folio);
|
2020-12-15 20:34:29 +00:00
|
|
|
|
|
|
|
spin_lock_irq(&pgdat->__lruvec.lru_lock);
|
|
|
|
return &pgdat->__lruvec;
|
|
|
|
}
|
|
|
|
|
2021-06-29 01:59:47 +00:00
|
|
|
static inline struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
|
2020-12-15 20:34:29 +00:00
|
|
|
unsigned long *flagsp)
|
|
|
|
{
|
2021-06-29 01:59:47 +00:00
|
|
|
struct pglist_data *pgdat = folio_pgdat(folio);
|
2020-12-15 20:34:29 +00:00
|
|
|
|
|
|
|
spin_lock_irqsave(&pgdat->__lruvec.lru_lock, *flagsp);
|
|
|
|
return &pgdat->__lruvec;
|
|
|
|
}
|
|
|
|
|
2012-01-13 01:17:59 +00:00
|
|
|
static inline struct mem_cgroup *
|
|
|
|
mem_cgroup_iter(struct mem_cgroup *root,
|
|
|
|
struct mem_cgroup *prev,
|
|
|
|
struct mem_cgroup_reclaim_cookie *reclaim)
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void mem_cgroup_iter_break(struct mem_cgroup *root,
|
|
|
|
struct mem_cgroup *prev)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2023-06-16 06:30:30 +00:00
|
|
|
static inline void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
|
2016-10-07 23:57:23 +00:00
|
|
|
int (*fn)(struct task_struct *, void *), void *arg)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2016-03-15 21:57:16 +00:00
|
|
|
static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
|
2009-01-08 02:08:02 +00:00
|
|
|
{
|
2016-03-15 21:57:16 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
|
|
|
|
{
|
|
|
|
WARN_ON_ONCE(id);
|
|
|
|
/* XXX: This should always return root_mem_cgroup */
|
|
|
|
return NULL;
|
2009-01-08 02:08:02 +00:00
|
|
|
}
|
2009-01-08 02:08:08 +00:00
|
|
|
|
mm: memcontrol: introduce mem_cgroup_ino() and mem_cgroup_get_from_ino()
Patch series "mm: introduce shrinker debugfs interface", v5.
The only existing debugging mechanism is a couple of tracepoints in
do_shrink_slab(): mm_shrink_slab_start and mm_shrink_slab_end. They
aren't covering everything though: shrinkers which report 0 objects will
never show up, there is no support for memcg-aware shrinkers. Shrinkers
are identified by their scan function, which is not always enough (e.g.
hard to guess which super block's shrinker it is having only
"super_cache_scan").
To provide a better visibility and debug options for memory shrinkers this
patchset introduces a /sys/kernel/debug/shrinker interface, to some extent
similar to /sys/kernel/slab.
For each shrinker registered in the system a directory is created. As
now, the directory will contain only a "scan" file, which allows to get
the number of managed objects for each memory cgroup (for memcg-aware
shrinkers) and each numa node (for numa-aware shrinkers on a numa
machine). Other interfaces might be added in the future.
To make debugging more pleasant, the patchset also names all shrinkers, so
that debugfs entries can have meaningful names.
This patch (of 5):
Shrinker debugfs requires a way to represent memory cgroups without using
full paths, both for displaying information and getting input from a user.
Cgroup inode number is a perfect way, already used by bpf.
This commit adds a couple of helper functions which will be used to handle
memcg-aware shrinkers.
Link: https://lkml.kernel.org/r/20220601032227.4076670-1-roman.gushchin@linux.dev
Link: https://lkml.kernel.org/r/20220601032227.4076670-2-roman.gushchin@linux.dev
Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
Acked-by: Muchun Song <songmuchun@bytedance.com>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-06-01 03:22:22 +00:00
|
|
|
#ifdef CONFIG_SHRINKER_DEBUG
|
|
|
|
static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino)
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2019-03-05 23:45:52 +00:00
|
|
|
static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m)
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2017-07-06 22:40:25 +00:00
|
|
|
static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec)
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2016-01-20 23:03:02 +00:00
|
|
|
static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
|
2009-01-08 02:08:18 +00:00
|
|
|
{
|
2015-11-06 02:47:40 +00:00
|
|
|
return true;
|
2009-01-08 02:08:18 +00:00
|
|
|
}
|
|
|
|
|
2017-01-11 00:58:04 +00:00
|
|
|
static inline
|
|
|
|
unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
|
|
|
|
enum lru_list lru, int zone_idx)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
2009-01-08 02:08:19 +00:00
|
|
|
|
2018-06-08 00:06:18 +00:00
|
|
|
static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
|
2016-10-07 23:57:23 +00:00
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
mm, memcg: proportional memory.{low,min} reclaim
cgroup v2 introduces two memory protection thresholds: memory.low
(best-effort) and memory.min (hard protection). While they generally do
what they say on the tin, there is a limitation in their implementation
that makes them difficult to use effectively: that cliff behaviour often
manifests when they become eligible for reclaim. This patch implements
more intuitive and usable behaviour, where we gradually mount more
reclaim pressure as cgroups further and further exceed their protection
thresholds.
This cliff edge behaviour happens because we only choose whether or not
to reclaim based on whether the memcg is within its protection limits
(see the use of mem_cgroup_protected in shrink_node), but we don't vary
our reclaim behaviour based on this information. Imagine the following
timeline, with the numbers the lruvec size in this zone:
1. memory.low=1000000, memory.current=999999. 0 pages may be scanned.
2. memory.low=1000000, memory.current=1000000. 0 pages may be scanned.
3. memory.low=1000000, memory.current=1000001. 1000001* pages may be
scanned. (?!)
* Of course, we won't usually scan all available pages in the zone even
without this patch because of scan control priority, over-reclaim
protection, etc. However, as shown by the tests at the end, these
techniques don't sufficiently throttle such an extreme change in input,
so cliff-like behaviour isn't really averted by their existence alone.
Here's an example of how this plays out in practice. At Facebook, we are
trying to protect various workloads from "system" software, like
configuration management tools, metric collectors, etc (see this[0] case
study). In order to find a suitable memory.low value, we start by
determining the expected memory range within which the workload will be
comfortable operating. This isn't an exact science -- memory usage deemed
"comfortable" will vary over time due to user behaviour, differences in
composition of work, etc, etc. As such we need to ballpark memory.low,
but doing this is currently problematic:
1. If we end up setting it too low for the workload, it won't have
*any* effect (see discussion above). The group will receive the full
weight of reclaim and won't have any priority while competing with the
less important system software, as if we had no memory.low configured
at all.
2. Because of this behaviour, we end up erring on the side of setting
it too high, such that the comfort range is reliably covered. However,
protected memory is completely unavailable to the rest of the system,
so we might cause undue memory and IO pressure there when we *know* we
have some elasticity in the workload.
3. Even if we get the value totally right, smack in the middle of the
comfort zone, we get extreme jumps between no pressure and full
pressure that cause unpredictable pressure spikes in the workload due
to the current binary reclaim behaviour.
With this patch, we can set it to our ballpark estimation without too much
worry. Any undesirable behaviour, such as too much or too little reclaim
pressure on the workload or system will be proportional to how far our
estimation is off. This means we can set memory.low much more
conservatively and thus waste less resources *without* the risk of the
workload falling off a cliff if we overshoot.
As a more abstract technical description, this unintuitive behaviour
results in having to give high-priority workloads a large protection
buffer on top of their expected usage to function reliably, as otherwise
we have abrupt periods of dramatically increased memory pressure which
hamper performance. Having to set these thresholds so high wastes
resources and generally works against the principle of work conservation.
In addition, having proportional memory reclaim behaviour has other
benefits. Most notably, before this patch it's basically mandatory to set
memory.low to a higher than desirable value because otherwise as soon as
you exceed memory.low, all protection is lost, and all pages are eligible
to scan again. By contrast, having a gradual ramp in reclaim pressure
means that you now still get some protection when thresholds are exceeded,
which means that one can now be more comfortable setting memory.low to
lower values without worrying that all protection will be lost. This is
important because workingset size is really hard to know exactly,
especially with variable workloads, so at least getting *some* protection
if your workingset size grows larger than you expect increases user
confidence in setting memory.low without a huge buffer on top being
needed.
Thanks a lot to Johannes Weiner and Tejun Heo for their advice and
assistance in thinking about how to make this work better.
In testing these changes, I intended to verify that:
1. Changes in page scanning become gradual and proportional instead of
binary.
To test this, I experimented stepping further and further down
memory.low protection on a workload that floats around 19G workingset
when under memory.low protection, watching page scan rates for the
workload cgroup:
+------------+-----------------+--------------------+--------------+
| memory.low | test (pgscan/s) | control (pgscan/s) | % of control |
+------------+-----------------+--------------------+--------------+
| 21G | 0 | 0 | N/A |
| 17G | 867 | 3799 | 23% |
| 12G | 1203 | 3543 | 34% |
| 8G | 2534 | 3979 | 64% |
| 4G | 3980 | 4147 | 96% |
| 0 | 3799 | 3980 | 95% |
+------------+-----------------+--------------------+--------------+
As you can see, the test kernel (with a kernel containing this
patch) ramps up page scanning significantly more gradually than the
control kernel (without this patch).
2. More gradual ramp up in reclaim aggression doesn't result in
premature OOMs.
To test this, I wrote a script that slowly increments the number of
pages held by stress(1)'s --vm-keep mode until a production system
entered severe overall memory contention. This script runs in a highly
protected slice taking up the majority of available system memory.
Watching vmstat revealed that page scanning continued essentially
nominally between test and control, without causing forward reclaim
progress to become arrested.
[0]: https://facebookmicrosites.github.io/cgroup2/docs/overview.html#case-study-the-fbtax2-project
[akpm@linux-foundation.org: reflow block comments to fit in 80 cols]
[chris@chrisdown.name: handle cgroup_disable=memory when getting memcg protection]
Link: http://lkml.kernel.org/r/20190201045711.GA18302@chrisdown.name
Link: http://lkml.kernel.org/r/20190124014455.GA6396@chrisdown.name
Signed-off-by: Chris Down <chris@chrisdown.name>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-10-07 00:58:32 +00:00
|
|
|
static inline unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2009-04-02 23:57:39 +00:00
|
|
|
static inline void
|
mm, oom: add oom victim's memcg to the oom context information
The current oom report doesn't display victim's memcg context during the
global OOM situation. While this information is not strictly needed, it
can be really helpful for containerized environments to locate which
container has lost a process. Now that we have a single line for the oom
context, we can trivially add both the oom memcg (this can be either
global_oom or a specific memcg which hits its hard limits) and task_memcg
which is the victim's memcg.
Below is the single line output in the oom report after this patch.
- global oom context information:
oom-kill:constraint=<constraint>,nodemask=<nodemask>,cpuset=<cpuset>,mems_allowed=<mems_allowed>,global_oom,task_memcg=<memcg>,task=<comm>,pid=<pid>,uid=<uid>
- memcg oom context information:
oom-kill:constraint=<constraint>,nodemask=<nodemask>,cpuset=<cpuset>,mems_allowed=<mems_allowed>,oom_memcg=<memcg>,task_memcg=<memcg>,task=<comm>,pid=<pid>,uid=<uid>
[penguin-kernel@I-love.SAKURA.ne.jp: use pr_cont() in mem_cgroup_print_oom_context()]
Link: http://lkml.kernel.org/r/201812190723.wBJ7NdkN032628@www262.sakura.ne.jp
Link: http://lkml.kernel.org/r/1542799799-36184-2-git-send-email-ufo19890607@gmail.com
Signed-off-by: yuzhoujian <yuzhoujian@didichuxing.com>
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: David Rientjes <rientjes@google.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Cc: Roman Gushchin <guro@fb.com>
Cc: Yang Shi <yang.s@alibaba-inc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-12-28 08:36:10 +00:00
|
|
|
mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void
|
|
|
|
mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
|
2009-04-02 23:57:39 +00:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2023-09-14 15:21:39 +00:00
|
|
|
static inline void mem_cgroup_handle_over_high(gfp_t gfp_mask)
|
memcg: punt high overage reclaim to return-to-userland path
Currently, try_charge() tries to reclaim memory synchronously when the
high limit is breached; however, if the allocation doesn't have
__GFP_WAIT, synchronous reclaim is skipped. If a process performs only
speculative allocations, it can blow way past the high limit. This is
actually easily reproducible by simply doing "find /". slab/slub
allocator tries speculative allocations first, so as long as there's
memory which can be consumed without blocking, it can keep allocating
memory regardless of the high limit.
This patch makes try_charge() always punt the over-high reclaim to the
return-to-userland path. If try_charge() detects that high limit is
breached, it adds the overage to current->memcg_nr_pages_over_high and
schedules execution of mem_cgroup_handle_over_high() which performs
synchronous reclaim from the return-to-userland path.
As long as kernel doesn't have a run-away allocation spree, this should
provide enough protection while making kmemcg behave more consistently.
It also has the following benefits.
- All over-high reclaims can use GFP_KERNEL regardless of the specific
gfp mask in use, e.g. GFP_NOFS, when the limit was breached.
- It copes with prio inversion. Previously, a low-prio task with
small memory.high might perform over-high reclaim with a bunch of
locks held. If a higher prio task needed any of these locks, it
would have to wait until the low prio task finished reclaim and
released the locks. By handing over-high reclaim to the task exit
path this issue can be avoided.
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Michal Hocko <mhocko@kernel.org>
Reviewed-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-11-06 02:46:11 +00:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2018-08-22 04:53:54 +00:00
|
|
|
static inline struct mem_cgroup *mem_cgroup_get_oom_group(
|
|
|
|
struct task_struct *victim, struct mem_cgroup *oom_domain)
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2017-07-06 22:40:52 +00:00
|
|
|
static inline void __mod_memcg_state(struct mem_cgroup *memcg,
|
2024-05-01 17:26:17 +00:00
|
|
|
enum memcg_stat_item idx,
|
2017-07-06 22:40:52 +00:00
|
|
|
int nr)
|
2017-05-03 21:55:03 +00:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2017-07-06 22:40:52 +00:00
|
|
|
static inline void mod_memcg_state(struct mem_cgroup *memcg,
|
2024-05-01 17:26:17 +00:00
|
|
|
enum memcg_stat_item idx,
|
2017-07-06 22:40:52 +00:00
|
|
|
int nr)
|
2017-05-03 21:55:03 +00:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2022-01-14 22:05:45 +00:00
|
|
|
static inline void mod_memcg_page_state(struct page *page,
|
2024-05-01 17:26:17 +00:00
|
|
|
enum memcg_stat_item idx, int val)
|
2022-01-14 22:05:45 +00:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2021-09-02 21:53:27 +00:00
|
|
|
static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
mm: memcontrol: fix recursive statistics correctness & scalabilty
Right now, when somebody needs to know the recursive memory statistics
and events of a cgroup subtree, they need to walk the entire subtree and
sum up the counters manually.
There are two issues with this:
1. When a cgroup gets deleted, its stats are lost. The state counters
should all be 0 at that point, of course, but the events are not.
When this happens, the event counters, which are supposed to be
monotonic, can go backwards in the parent cgroups.
2. During regular operation, we always have a certain number of lazily
freed cgroups sitting around that have been deleted, have no tasks,
but have a few cache pages remaining. These groups' statistics do not
change until we eventually hit memory pressure, but somebody
watching, say, memory.stat on an ancestor has to iterate those every
time.
This patch addresses both issues by introducing recursive counters at
each level that are propagated from the write side when stats change.
Upward propagation happens when the per-cpu caches spill over into the
local atomic counter. This is the same thing we do during charge and
uncharge, except that the latter uses atomic RMWs, which are more
expensive; stat changes happen at around the same rate. In a sparse
file test (page faults and reclaim at maximum CPU speed) with 5 cgroup
nesting levels, perf shows __mod_memcg_page state at ~1%.
Link: http://lkml.kernel.org/r/20190412151507.2769-4-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-05-14 22:47:12 +00:00
|
|
|
static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
|
|
|
|
enum node_stat_item idx)
|
|
|
|
{
|
|
|
|
return node_page_state(lruvec_pgdat(lruvec), idx);
|
|
|
|
}
|
|
|
|
|
mm: memcontrol: make cgroup stats and events query API explicitly local
Patch series "mm: memcontrol: memory.stat cost & correctness".
The cgroup memory.stat file holds recursive statistics for the entire
subtree. The current implementation does this tree walk on-demand
whenever the file is read. This is giving us problems in production.
1. The cost of aggregating the statistics on-demand is high. A lot of
system service cgroups are mostly idle and their stats don't change
between reads, yet we always have to check them. There are also always
some lazily-dying cgroups sitting around that are pinned by a handful
of remaining page cache; the same applies to them.
In an application that periodically monitors memory.stat in our
fleet, we have seen the aggregation consume up to 5% CPU time.
2. When cgroups die and disappear from the cgroup tree, so do their
accumulated vm events. The result is that the event counters at
higher-level cgroups can go backwards and confuse some of our
automation, let alone people looking at the graphs over time.
To address both issues, this patch series changes the stat
implementation to spill counts upwards when the counters change.
The upward spilling is batched using the existing per-cpu cache. In a
sparse file stress test with 5 level cgroup nesting, the additional cost
of the flushing was negligible (a little under 1% of CPU at 100% CPU
utilization, compared to the 5% of reading memory.stat during regular
operation).
This patch (of 4):
memcg_page_state(), lruvec_page_state(), memcg_sum_events() are
currently returning the state of the local memcg or lruvec, not the
recursive state.
In practice there is a demand for both versions, although the callers
that want the recursive counts currently sum them up by hand.
Per default, cgroups are considered recursive entities and generally we
expect more users of the recursive counters, with the local counts being
special cases. To reflect that in the name, add a _local suffix to the
current implementations.
The following patch will re-incarnate these functions with recursive
semantics, but with an O(1) implementation.
[hannes@cmpxchg.org: fix bisection hole]
Link: http://lkml.kernel.org/r/20190417160347.GC23013@cmpxchg.org
Link: http://lkml.kernel.org/r/20190412151507.2769-2-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-05-14 22:47:06 +00:00
|
|
|
static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
|
|
|
|
enum node_stat_item idx)
|
2011-01-13 23:47:37 +00:00
|
|
|
{
|
2017-07-06 22:40:52 +00:00
|
|
|
return node_page_state(lruvec_pgdat(lruvec), idx);
|
2011-01-13 23:47:37 +00:00
|
|
|
}
|
|
|
|
|
mm: memcg: restore subtree stats flushing
Stats flushing for memcg currently follows the following rules:
- Always flush the entire memcg hierarchy (i.e. flush the root).
- Only one flusher is allowed at a time. If someone else tries to flush
concurrently, they skip and return immediately.
- A periodic flusher flushes all the stats every 2 seconds.
The reason this approach is followed is because all flushes are serialized
by a global rstat spinlock. On the memcg side, flushing is invoked from
userspace reads as well as in-kernel flushers (e.g. reclaim, refault,
etc). This approach aims to avoid serializing all flushers on the global
lock, which can cause a significant performance hit under high
concurrency.
This approach has the following problems:
- Occasionally a userspace read of the stats of a non-root cgroup will
be too expensive as it has to flush the entire hierarchy [1].
- Sometimes the stats accuracy are compromised if there is an ongoing
flush, and we skip and return before the subtree of interest is
actually flushed, yielding stale stats (by up to 2s due to periodic
flushing). This is more visible when reading stats from userspace,
but can also affect in-kernel flushers.
The latter problem is particulary a concern when userspace reads stats
after an event occurs, but gets stats from before the event. Examples:
- When memory usage / pressure spikes, a userspace OOM handler may look
at the stats of different memcgs to select a victim based on various
heuristics (e.g. how much private memory will be freed by killing
this). Reading stale stats from before the usage spike in this case
may cause a wrongful OOM kill.
- A proactive reclaimer may read the stats after writing to
memory.reclaim to measure the success of the reclaim operation. Stale
stats from before reclaim may give a false negative.
- Reading the stats of a parent and a child memcg may be inconsistent
(child larger than parent), if the flush doesn't happen when the
parent is read, but happens when the child is read.
As for in-kernel flushers, they will occasionally get stale stats. No
regressions are currently known from this, but if there are regressions,
they would be very difficult to debug and link to the source of the
problem.
This patch aims to fix these problems by restoring subtree flushing, and
removing the unified/coalesced flushing logic that skips flushing if there
is an ongoing flush. This change would introduce a significant regression
with global stats flushing thresholds. With per-memcg stats flushing
thresholds, this seems to perform really well. The thresholds protect the
underlying lock from unnecessary contention.
This patch was tested in two ways to ensure the latency of flushing is
up to par, on a machine with 384 cpus:
- A synthetic test with 5000 concurrent workers in 500 cgroups doing
allocations and reclaim, as well as 1000 readers for memory.stat
(variation of [2]). No regressions were noticed in the total runtime.
Note that significant regressions in this test are observed with
global stats thresholds, but not with per-memcg thresholds.
- A synthetic stress test for concurrently reading memcg stats while
memory allocation/freeing workers are running in the background,
provided by Wei Xu [3]. With 250k threads reading the stats every
100ms in 50k cgroups, 99.9% of reads take <= 50us. Less than 0.01%
of reads take more than 1ms, and no reads take more than 100ms.
[1] https://lore.kernel.org/lkml/CABWYdi0c6__rh-K7dcM_pkf9BJdTRtAU08M43KO9ME4-dsgfoQ@mail.gmail.com/
[2] https://lore.kernel.org/lkml/CAJD7tka13M-zVZTyQJYL1iUAYvuQ1fcHbCjcOBZcz6POYTV-4g@mail.gmail.com/
[3] https://lore.kernel.org/lkml/CAAPL-u9D2b=iF5Lf_cRnKxUfkiEe0AMDTu6yhrUAzX0b6a6rDg@mail.gmail.com/
[akpm@linux-foundation.org: fix mm/zswap.c]
[yosryahmed@google.com: remove stats flushing mutex]
Link: https://lkml.kernel.org/r/CAJD7tkZgP3m-VVPn+fF_YuvXeQYK=tZZjJHj=dzD=CcSSpp2qg@mail.gmail.com
Link: https://lkml.kernel.org/r/20231129032154.3710765-6-yosryahmed@google.com
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
Tested-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Acked-by: Shakeel Butt <shakeelb@google.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Greg Thelen <gthelen@google.com>
Cc: Ivan Babrou <ivan@cloudflare.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Michal Koutny <mkoutny@suse.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Tejun Heo <tj@kernel.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Wei Xu <weixugc@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-11-29 03:21:53 +00:00
|
|
|
static inline void mem_cgroup_flush_stats(struct mem_cgroup *memcg)
|
2021-09-02 21:55:04 +00:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
mm: memcg: restore subtree stats flushing
Stats flushing for memcg currently follows the following rules:
- Always flush the entire memcg hierarchy (i.e. flush the root).
- Only one flusher is allowed at a time. If someone else tries to flush
concurrently, they skip and return immediately.
- A periodic flusher flushes all the stats every 2 seconds.
The reason this approach is followed is because all flushes are serialized
by a global rstat spinlock. On the memcg side, flushing is invoked from
userspace reads as well as in-kernel flushers (e.g. reclaim, refault,
etc). This approach aims to avoid serializing all flushers on the global
lock, which can cause a significant performance hit under high
concurrency.
This approach has the following problems:
- Occasionally a userspace read of the stats of a non-root cgroup will
be too expensive as it has to flush the entire hierarchy [1].
- Sometimes the stats accuracy are compromised if there is an ongoing
flush, and we skip and return before the subtree of interest is
actually flushed, yielding stale stats (by up to 2s due to periodic
flushing). This is more visible when reading stats from userspace,
but can also affect in-kernel flushers.
The latter problem is particulary a concern when userspace reads stats
after an event occurs, but gets stats from before the event. Examples:
- When memory usage / pressure spikes, a userspace OOM handler may look
at the stats of different memcgs to select a victim based on various
heuristics (e.g. how much private memory will be freed by killing
this). Reading stale stats from before the usage spike in this case
may cause a wrongful OOM kill.
- A proactive reclaimer may read the stats after writing to
memory.reclaim to measure the success of the reclaim operation. Stale
stats from before reclaim may give a false negative.
- Reading the stats of a parent and a child memcg may be inconsistent
(child larger than parent), if the flush doesn't happen when the
parent is read, but happens when the child is read.
As for in-kernel flushers, they will occasionally get stale stats. No
regressions are currently known from this, but if there are regressions,
they would be very difficult to debug and link to the source of the
problem.
This patch aims to fix these problems by restoring subtree flushing, and
removing the unified/coalesced flushing logic that skips flushing if there
is an ongoing flush. This change would introduce a significant regression
with global stats flushing thresholds. With per-memcg stats flushing
thresholds, this seems to perform really well. The thresholds protect the
underlying lock from unnecessary contention.
This patch was tested in two ways to ensure the latency of flushing is
up to par, on a machine with 384 cpus:
- A synthetic test with 5000 concurrent workers in 500 cgroups doing
allocations and reclaim, as well as 1000 readers for memory.stat
(variation of [2]). No regressions were noticed in the total runtime.
Note that significant regressions in this test are observed with
global stats thresholds, but not with per-memcg thresholds.
- A synthetic stress test for concurrently reading memcg stats while
memory allocation/freeing workers are running in the background,
provided by Wei Xu [3]. With 250k threads reading the stats every
100ms in 50k cgroups, 99.9% of reads take <= 50us. Less than 0.01%
of reads take more than 1ms, and no reads take more than 100ms.
[1] https://lore.kernel.org/lkml/CABWYdi0c6__rh-K7dcM_pkf9BJdTRtAU08M43KO9ME4-dsgfoQ@mail.gmail.com/
[2] https://lore.kernel.org/lkml/CAJD7tka13M-zVZTyQJYL1iUAYvuQ1fcHbCjcOBZcz6POYTV-4g@mail.gmail.com/
[3] https://lore.kernel.org/lkml/CAAPL-u9D2b=iF5Lf_cRnKxUfkiEe0AMDTu6yhrUAzX0b6a6rDg@mail.gmail.com/
[akpm@linux-foundation.org: fix mm/zswap.c]
[yosryahmed@google.com: remove stats flushing mutex]
Link: https://lkml.kernel.org/r/CAJD7tkZgP3m-VVPn+fF_YuvXeQYK=tZZjJHj=dzD=CcSSpp2qg@mail.gmail.com
Link: https://lkml.kernel.org/r/20231129032154.3710765-6-yosryahmed@google.com
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
Tested-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Acked-by: Shakeel Butt <shakeelb@google.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Greg Thelen <gthelen@google.com>
Cc: Ivan Babrou <ivan@cloudflare.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Michal Koutny <mkoutny@suse.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Tejun Heo <tj@kernel.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Wei Xu <weixugc@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-11-29 03:21:53 +00:00
|
|
|
static inline void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg)
|
2022-04-21 23:35:40 +00:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2020-12-15 03:07:04 +00:00
|
|
|
static inline void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx,
|
2019-08-13 22:37:41 +00:00
|
|
|
int val)
|
|
|
|
{
|
|
|
|
struct page *page = virt_to_head_page(p);
|
|
|
|
|
|
|
|
__mod_node_page_state(page_pgdat(page), idx, val);
|
|
|
|
}
|
|
|
|
|
2020-12-15 03:07:04 +00:00
|
|
|
static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx,
|
2020-08-07 06:21:37 +00:00
|
|
|
int val)
|
|
|
|
{
|
|
|
|
struct page *page = virt_to_head_page(p);
|
|
|
|
|
|
|
|
mod_node_page_state(page_pgdat(page), idx, val);
|
|
|
|
}
|
|
|
|
|
2017-07-06 22:40:25 +00:00
|
|
|
static inline void count_memcg_events(struct mem_cgroup *memcg,
|
|
|
|
enum vm_event_item idx,
|
|
|
|
unsigned long count)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2019-05-14 00:16:54 +00:00
|
|
|
static inline void __count_memcg_events(struct mem_cgroup *memcg,
|
|
|
|
enum vm_event_item idx,
|
|
|
|
unsigned long count)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2022-05-13 03:23:03 +00:00
|
|
|
static inline void count_memcg_folio_events(struct folio *folio,
|
|
|
|
enum vm_event_item idx, unsigned long nr)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
mm,memcg: provide per-cgroup counters for NUMA balancing operations
The ability to observe the demotion and promotion decisions made by the
kernel on a per-cgroup basis is important for monitoring and tuning
containerized workloads on machines equipped with tiered memory.
Different containers in the system may experience drastically different
memory tiering actions that cannot be distinguished from the global
counters alone.
For example, a container running a workload that has a much hotter memory
accesses will likely see more promotions and fewer demotions, potentially
depriving a colocated container of top tier memory to such an extent that
its performance degrades unacceptably.
For another example, some containers may exhibit longer periods between
data reuse, causing much more numa_hint_faults than numa_pages_migrated.
In this case, tuning hot_threshold_ms may be appropriate, but the signal
can easily be lost if only global counters are available.
In the long term, we hope to introduce per-cgroup control of promotion and
demotion actions to implement memory placement policies in tiering.
This patch set adds seven counters to memory.stat in a cgroup:
numa_pages_migrated, numa_pte_updates, numa_hint_faults, pgdemote_kswapd,
pgdemote_khugepaged, pgdemote_direct and pgpromote_success. pgdemote_*
and pgpromote_success are also available in memory.numa_stat.
count_memcg_events_mm() is added to count multiple event occurrences at
once, and get_mem_cgroup_from_folio() is added because we need to get a
reference to the memcg of a folio before it's migrated to track
numa_pages_migrated. The accounting of PGDEMOTE_* is moved to
shrink_inactive_list() before being changed to per-cgroup.
[kaiyang2@cs.cmu.edu: add documentation of the memcg counters in cgroup-v2.rst]
Link: https://lkml.kernel.org/r/20240814235122.252309-1-kaiyang2@cs.cmu.edu
Link: https://lkml.kernel.org/r/20240814174227.30639-1-kaiyang2@cs.cmu.edu
Signed-off-by: Kaiyang Zhao <kaiyang2@cs.cmu.edu>
Cc: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Wei Xu <weixugc@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-14 17:42:27 +00:00
|
|
|
static inline void count_memcg_events_mm(struct mm_struct *mm,
|
|
|
|
enum vm_event_item idx, unsigned long count)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2011-05-26 23:25:38 +00:00
|
|
|
static inline
|
2017-07-06 22:40:25 +00:00
|
|
|
void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx)
|
2011-05-26 23:25:38 +00:00
|
|
|
{
|
|
|
|
}
|
2020-12-15 20:34:29 +00:00
|
|
|
|
2024-02-26 20:55:31 +00:00
|
|
|
static inline void split_page_memcg(struct page *head, int old_order, int new_order)
|
mm: memcontrol: switch to rstat
Replace the memory controller's custom hierarchical stats code with the
generic rstat infrastructure provided by the cgroup core.
The current implementation does batched upward propagation from the
write side (i.e. as stats change). The per-cpu batches introduce an
error, which is multiplied by the number of subgroups in a tree. In
systems with many CPUs and sizable cgroup trees, the error can be large
enough to confuse users (e.g. 32 batch pages * 32 CPUs * 32 subgroups
results in an error of up to 128M per stat item). This can entirely
swallow allocation bursts inside a workload that the user is expecting
to see reflected in the statistics.
In the past, we've done read-side aggregation, where a memory.stat read
would have to walk the entire subtree and add up per-cpu counts. This
became problematic with lazily-freed cgroups: we could have large
subtrees where most cgroups were entirely idle. Hence the switch to
change-driven upward propagation. Unfortunately, it needed to trade
accuracy for speed due to the write side being so hot.
Rstat combines the best of both worlds: from the write side, it cheaply
maintains a queue of cgroups that have pending changes, so that the read
side can do selective tree aggregation. This way the reported stats
will always be precise and recent as can be, while the aggregation can
skip over potentially large numbers of idle cgroups.
The way rstat works is that it implements a tree for tracking cgroups
with pending local changes, as well as a flush function that walks the
tree upwards. The controller then drives this by 1) telling rstat when
a local cgroup stat changes (e.g. mod_memcg_state) and 2) when a flush
is required to get uptodate hierarchy stats for a given subtree (e.g.
when memory.stat is read). The controller also provides a flush
callback that is called during the rstat flush walk for each cgroup and
aggregates its local per-cpu counters and propagates them upwards.
This adds a second vmstats to struct mem_cgroup (MEMCG_NR_STAT +
NR_VM_EVENT_ITEMS) to track pending subtree deltas during upward
aggregation. It removes 3 words from the per-cpu data. It eliminates
memcg_exact_page_state(), since memcg_page_state() is now exact.
[akpm@linux-foundation.org: merge fix]
[hannes@cmpxchg.org: fix a sleep in atomic section problem]
Link: https://lkml.kernel.org/r/20210315234100.64307-1-hannes@cmpxchg.org
Link: https://lkml.kernel.org/r/20210209163304.77088-7-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Roman Gushchin <guro@fb.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Michal Koutný <mkoutny@suse.com>
Acked-by: Balbir Singh <bsingharora@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-04-30 05:56:26 +00:00
|
|
|
{
|
|
|
|
}
|
2012-07-31 23:43:02 +00:00
|
|
|
#endif /* CONFIG_MEMCG */
|
2008-02-07 08:13:51 +00:00
|
|
|
|
2024-03-21 16:36:28 +00:00
|
|
|
/*
|
|
|
|
* Extended information for slab objects stored as an array in page->memcg_data
|
|
|
|
* if MEMCG_DATA_OBJEXTS is set.
|
|
|
|
*/
|
|
|
|
struct slabobj_ext {
|
2024-07-01 15:31:15 +00:00
|
|
|
#ifdef CONFIG_MEMCG
|
2024-03-21 16:36:28 +00:00
|
|
|
struct obj_cgroup *objcg;
|
2024-03-21 16:36:44 +00:00
|
|
|
#endif
|
|
|
|
#ifdef CONFIG_MEM_ALLOC_PROFILING
|
|
|
|
union codetag_ref ref;
|
|
|
|
#endif
|
2024-03-21 16:36:28 +00:00
|
|
|
} __aligned(8);
|
|
|
|
|
2020-12-15 03:07:04 +00:00
|
|
|
static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx)
|
2019-08-13 22:37:41 +00:00
|
|
|
{
|
2020-12-15 03:07:04 +00:00
|
|
|
__mod_lruvec_kmem_state(p, idx, 1);
|
2019-08-13 22:37:41 +00:00
|
|
|
}
|
|
|
|
|
2020-12-15 03:07:04 +00:00
|
|
|
static inline void __dec_lruvec_kmem_state(void *p, enum node_stat_item idx)
|
2019-08-13 22:37:41 +00:00
|
|
|
{
|
2020-12-15 03:07:04 +00:00
|
|
|
__mod_lruvec_kmem_state(p, idx, -1);
|
2019-08-13 22:37:41 +00:00
|
|
|
}
|
|
|
|
|
mm: vmscan: determine anon/file pressure balance at the reclaim root
We split the LRU lists into anon and file, and we rebalance the scan
pressure between them when one of them begins thrashing: if the file cache
experiences workingset refaults, we increase the pressure on anonymous
pages; if the workload is stalled on swapins, we increase the pressure on
the file cache instead.
With cgroups and their nested LRU lists, we currently don't do this
correctly. While recursive cgroup reclaim establishes a relative LRU
order among the pages of all involved cgroups, LRU pressure balancing is
done on an individual cgroup LRU level. As a result, when one cgroup is
thrashing on the filesystem cache while a sibling may have cold anonymous
pages, pressure doesn't get equalized between them.
This patch moves LRU balancing decision to the root of reclaim - the same
level where the LRU order is established.
It does this by tracking LRU cost recursively, so that every level of the
cgroup tree knows the aggregate LRU cost of all memory within its domain.
When the page scanner calculates the scan balance for any given individual
cgroup's LRU list, it uses the values from the ancestor cgroup that
initiated the reclaim cycle.
If one sibling is then thrashing on the cache, it will tip the pressure
balance inside its ancestors, and the next hierarchical reclaim iteration
will go more after the anon pages in the tree.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@surriel.com>
Link: http://lkml.kernel.org/r/20200520232525.798933-13-hannes@cmpxchg.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-06-03 23:03:06 +00:00
|
|
|
static inline struct lruvec *parent_lruvec(struct lruvec *lruvec)
|
|
|
|
{
|
|
|
|
struct mem_cgroup *memcg;
|
|
|
|
|
|
|
|
memcg = lruvec_memcg(lruvec);
|
|
|
|
if (!memcg)
|
|
|
|
return NULL;
|
|
|
|
memcg = parent_mem_cgroup(memcg);
|
|
|
|
if (!memcg)
|
|
|
|
return NULL;
|
|
|
|
return mem_cgroup_lruvec(memcg, lruvec_pgdat(lruvec));
|
|
|
|
}
|
|
|
|
|
2020-12-15 20:34:29 +00:00
|
|
|
static inline void unlock_page_lruvec(struct lruvec *lruvec)
|
|
|
|
{
|
|
|
|
spin_unlock(&lruvec->lru_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void unlock_page_lruvec_irq(struct lruvec *lruvec)
|
|
|
|
{
|
|
|
|
spin_unlock_irq(&lruvec->lru_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec,
|
|
|
|
unsigned long flags)
|
|
|
|
{
|
|
|
|
spin_unlock_irqrestore(&lruvec->lru_lock, flags);
|
|
|
|
}
|
|
|
|
|
2024-05-24 01:49:50 +00:00
|
|
|
/* Test requires a stable folio->memcg binding, see folio_memcg() */
|
2021-06-30 02:27:31 +00:00
|
|
|
static inline bool folio_matches_lruvec(struct folio *folio,
|
|
|
|
struct lruvec *lruvec)
|
2021-06-29 02:37:56 +00:00
|
|
|
{
|
2021-06-30 02:27:31 +00:00
|
|
|
return lruvec_pgdat(lruvec) == folio_pgdat(folio) &&
|
|
|
|
lruvec_memcg(lruvec) == folio_memcg(folio);
|
2021-06-29 02:37:56 +00:00
|
|
|
}
|
|
|
|
|
2020-12-15 20:34:33 +00:00
|
|
|
/* Don't lock again iff page's lruvec locked */
|
2021-06-30 02:27:31 +00:00
|
|
|
static inline struct lruvec *folio_lruvec_relock_irq(struct folio *folio,
|
2020-12-15 20:34:33 +00:00
|
|
|
struct lruvec *locked_lruvec)
|
|
|
|
{
|
|
|
|
if (locked_lruvec) {
|
2021-06-30 02:27:31 +00:00
|
|
|
if (folio_matches_lruvec(folio, locked_lruvec))
|
2020-12-15 20:34:33 +00:00
|
|
|
return locked_lruvec;
|
|
|
|
|
|
|
|
unlock_page_lruvec_irq(locked_lruvec);
|
|
|
|
}
|
|
|
|
|
2021-06-29 01:59:47 +00:00
|
|
|
return folio_lruvec_lock_irq(folio);
|
2020-12-15 20:34:33 +00:00
|
|
|
}
|
|
|
|
|
2024-02-27 17:42:42 +00:00
|
|
|
/* Don't lock again iff folio's lruvec locked */
|
|
|
|
static inline void folio_lruvec_relock_irqsave(struct folio *folio,
|
|
|
|
struct lruvec **lruvecp, unsigned long *flags)
|
2020-12-15 20:34:33 +00:00
|
|
|
{
|
2024-02-27 17:42:42 +00:00
|
|
|
if (*lruvecp) {
|
|
|
|
if (folio_matches_lruvec(folio, *lruvecp))
|
|
|
|
return;
|
2020-12-15 20:34:33 +00:00
|
|
|
|
2024-02-27 17:42:42 +00:00
|
|
|
unlock_page_lruvec_irqrestore(*lruvecp, *flags);
|
2020-12-15 20:34:33 +00:00
|
|
|
}
|
|
|
|
|
2024-02-27 17:42:42 +00:00
|
|
|
*lruvecp = folio_lruvec_lock_irqsave(folio, flags);
|
2020-12-15 20:34:33 +00:00
|
|
|
}
|
|
|
|
|
2015-05-22 21:13:37 +00:00
|
|
|
#ifdef CONFIG_CGROUP_WRITEBACK
|
2015-05-22 22:23:33 +00:00
|
|
|
|
|
|
|
struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
|
writeback: fix incorrect calculation of available memory for memcg domains
For memcg domains, the amount of available memory was calculated as
min(the amount currently in use + headroom according to memcg,
total clean memory)
This isn't quite correct as what should be capped by the amount of
clean memory is the headroom, not the sum of memory in use and
headroom. For example, if a memcg domain has a significant amount of
dirty memory, the above can lead to a value which is lower than the
current amount in use which doesn't make much sense. In most
circumstances, the above leads to a number which is somewhat but not
drastically lower.
As the amount of memory which can be readily allocated to the memcg
domain is capped by the amount of system-wide clean memory which is
not already assigned to the memcg itself, the number we want is
the amount currently in use +
min(headroom according to memcg, clean memory elsewhere in the system)
This patch updates mem_cgroup_wb_stats() to return the number of
filepages and headroom instead of the calculated available pages.
mdtc_cap_avail() is renamed to mdtc_calc_avail() and performs the
above calculation from file, headroom, dirty and globally clean pages.
v2: Dummy mem_cgroup_wb_stats() implementation wasn't updated leading
to build failure when !CGROUP_WRITEBACK. Fixed.
Signed-off-by: Tejun Heo <tj@kernel.org>
Fixes: c2aa723a6093 ("writeback: implement memcg writeback domain based throttling")
Signed-off-by: Jens Axboe <axboe@fb.com>
2015-09-29 17:04:26 +00:00
|
|
|
void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
|
|
|
|
unsigned long *pheadroom, unsigned long *pdirty,
|
|
|
|
unsigned long *pwriteback);
|
2015-05-22 22:23:33 +00:00
|
|
|
|
2021-05-04 15:43:01 +00:00
|
|
|
void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
|
writeback, memcg: Implement foreign dirty flushing
There's an inherent mismatch between memcg and writeback. The former
trackes ownership per-page while the latter per-inode. This was a
deliberate design decision because honoring per-page ownership in the
writeback path is complicated, may lead to higher CPU and IO overheads
and deemed unnecessary given that write-sharing an inode across
different cgroups isn't a common use-case.
Combined with inode majority-writer ownership switching, this works
well enough in most cases but there are some pathological cases. For
example, let's say there are two cgroups A and B which keep writing to
different but confined parts of the same inode. B owns the inode and
A's memory is limited far below B's. A's dirty ratio can rise enough
to trigger balance_dirty_pages() sleeps but B's can be low enough to
avoid triggering background writeback. A will be slowed down without
a way to make writeback of the dirty pages happen.
This patch implements foreign dirty recording and foreign mechanism so
that when a memcg encounters a condition as above it can trigger
flushes on bdi_writebacks which can clean its pages. Please see the
comment on top of mem_cgroup_track_foreign_dirty_slowpath() for
details.
A reproducer follows.
write-range.c::
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
static const char *usage = "write-range FILE START SIZE\n";
int main(int argc, char **argv)
{
int fd;
unsigned long start, size, end, pos;
char *endp;
char buf[4096];
if (argc < 4) {
fprintf(stderr, usage);
return 1;
}
fd = open(argv[1], O_WRONLY);
if (fd < 0) {
perror("open");
return 1;
}
start = strtoul(argv[2], &endp, 0);
if (*endp != '\0') {
fprintf(stderr, usage);
return 1;
}
size = strtoul(argv[3], &endp, 0);
if (*endp != '\0') {
fprintf(stderr, usage);
return 1;
}
end = start + size;
while (1) {
for (pos = start; pos < end; ) {
long bread, bwritten = 0;
if (lseek(fd, pos, SEEK_SET) < 0) {
perror("lseek");
return 1;
}
bread = read(0, buf, sizeof(buf) < end - pos ?
sizeof(buf) : end - pos);
if (bread < 0) {
perror("read");
return 1;
}
if (bread == 0)
return 0;
while (bwritten < bread) {
long this;
this = write(fd, buf + bwritten,
bread - bwritten);
if (this < 0) {
perror("write");
return 1;
}
bwritten += this;
pos += bwritten;
}
}
}
}
repro.sh::
#!/bin/bash
set -e
set -x
sysctl -w vm.dirty_expire_centisecs=300000
sysctl -w vm.dirty_writeback_centisecs=300000
sysctl -w vm.dirtytime_expire_seconds=300000
echo 3 > /proc/sys/vm/drop_caches
TEST=/sys/fs/cgroup/test
A=$TEST/A
B=$TEST/B
mkdir -p $A $B
echo "+memory +io" > $TEST/cgroup.subtree_control
echo $((1<<30)) > $A/memory.high
echo $((32<<30)) > $B/memory.high
rm -f testfile
touch testfile
fallocate -l 4G testfile
echo "Starting B"
(echo $BASHPID > $B/cgroup.procs
pv -q --rate-limit 70M < /dev/urandom | ./write-range testfile $((2<<30)) $((2<<30))) &
echo "Waiting 10s to ensure B claims the testfile inode"
sleep 5
sync
sleep 5
sync
echo "Starting A"
(echo $BASHPID > $A/cgroup.procs
pv < /dev/urandom | ./write-range testfile 0 $((2<<30)))
v2: Added comments explaining why the specific intervals are being used.
v3: Use 0 @nr when calling cgroup_writeback_by_id() to use best-effort
flushing while avoding possible livelocks.
v4: Use get_jiffies_64() and time_before/after64() instead of raw
jiffies_64 and arthimetic comparisons as suggested by Jan.
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2019-08-26 16:06:56 +00:00
|
|
|
struct bdi_writeback *wb);
|
|
|
|
|
2021-05-04 15:01:10 +00:00
|
|
|
static inline void mem_cgroup_track_foreign_dirty(struct folio *folio,
|
writeback, memcg: Implement foreign dirty flushing
There's an inherent mismatch between memcg and writeback. The former
trackes ownership per-page while the latter per-inode. This was a
deliberate design decision because honoring per-page ownership in the
writeback path is complicated, may lead to higher CPU and IO overheads
and deemed unnecessary given that write-sharing an inode across
different cgroups isn't a common use-case.
Combined with inode majority-writer ownership switching, this works
well enough in most cases but there are some pathological cases. For
example, let's say there are two cgroups A and B which keep writing to
different but confined parts of the same inode. B owns the inode and
A's memory is limited far below B's. A's dirty ratio can rise enough
to trigger balance_dirty_pages() sleeps but B's can be low enough to
avoid triggering background writeback. A will be slowed down without
a way to make writeback of the dirty pages happen.
This patch implements foreign dirty recording and foreign mechanism so
that when a memcg encounters a condition as above it can trigger
flushes on bdi_writebacks which can clean its pages. Please see the
comment on top of mem_cgroup_track_foreign_dirty_slowpath() for
details.
A reproducer follows.
write-range.c::
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
static const char *usage = "write-range FILE START SIZE\n";
int main(int argc, char **argv)
{
int fd;
unsigned long start, size, end, pos;
char *endp;
char buf[4096];
if (argc < 4) {
fprintf(stderr, usage);
return 1;
}
fd = open(argv[1], O_WRONLY);
if (fd < 0) {
perror("open");
return 1;
}
start = strtoul(argv[2], &endp, 0);
if (*endp != '\0') {
fprintf(stderr, usage);
return 1;
}
size = strtoul(argv[3], &endp, 0);
if (*endp != '\0') {
fprintf(stderr, usage);
return 1;
}
end = start + size;
while (1) {
for (pos = start; pos < end; ) {
long bread, bwritten = 0;
if (lseek(fd, pos, SEEK_SET) < 0) {
perror("lseek");
return 1;
}
bread = read(0, buf, sizeof(buf) < end - pos ?
sizeof(buf) : end - pos);
if (bread < 0) {
perror("read");
return 1;
}
if (bread == 0)
return 0;
while (bwritten < bread) {
long this;
this = write(fd, buf + bwritten,
bread - bwritten);
if (this < 0) {
perror("write");
return 1;
}
bwritten += this;
pos += bwritten;
}
}
}
}
repro.sh::
#!/bin/bash
set -e
set -x
sysctl -w vm.dirty_expire_centisecs=300000
sysctl -w vm.dirty_writeback_centisecs=300000
sysctl -w vm.dirtytime_expire_seconds=300000
echo 3 > /proc/sys/vm/drop_caches
TEST=/sys/fs/cgroup/test
A=$TEST/A
B=$TEST/B
mkdir -p $A $B
echo "+memory +io" > $TEST/cgroup.subtree_control
echo $((1<<30)) > $A/memory.high
echo $((32<<30)) > $B/memory.high
rm -f testfile
touch testfile
fallocate -l 4G testfile
echo "Starting B"
(echo $BASHPID > $B/cgroup.procs
pv -q --rate-limit 70M < /dev/urandom | ./write-range testfile $((2<<30)) $((2<<30))) &
echo "Waiting 10s to ensure B claims the testfile inode"
sleep 5
sync
sleep 5
sync
echo "Starting A"
(echo $BASHPID > $A/cgroup.procs
pv < /dev/urandom | ./write-range testfile 0 $((2<<30)))
v2: Added comments explaining why the specific intervals are being used.
v3: Use 0 @nr when calling cgroup_writeback_by_id() to use best-effort
flushing while avoding possible livelocks.
v4: Use get_jiffies_64() and time_before/after64() instead of raw
jiffies_64 and arthimetic comparisons as suggested by Jan.
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2019-08-26 16:06:56 +00:00
|
|
|
struct bdi_writeback *wb)
|
|
|
|
{
|
2023-01-29 04:09:45 +00:00
|
|
|
struct mem_cgroup *memcg;
|
|
|
|
|
2019-10-07 00:58:15 +00:00
|
|
|
if (mem_cgroup_disabled())
|
|
|
|
return;
|
|
|
|
|
2023-01-29 04:09:45 +00:00
|
|
|
memcg = folio_memcg(folio);
|
|
|
|
if (unlikely(memcg && &memcg->css != wb->memcg_css))
|
2021-05-04 15:43:01 +00:00
|
|
|
mem_cgroup_track_foreign_dirty_slowpath(folio, wb);
|
writeback, memcg: Implement foreign dirty flushing
There's an inherent mismatch between memcg and writeback. The former
trackes ownership per-page while the latter per-inode. This was a
deliberate design decision because honoring per-page ownership in the
writeback path is complicated, may lead to higher CPU and IO overheads
and deemed unnecessary given that write-sharing an inode across
different cgroups isn't a common use-case.
Combined with inode majority-writer ownership switching, this works
well enough in most cases but there are some pathological cases. For
example, let's say there are two cgroups A and B which keep writing to
different but confined parts of the same inode. B owns the inode and
A's memory is limited far below B's. A's dirty ratio can rise enough
to trigger balance_dirty_pages() sleeps but B's can be low enough to
avoid triggering background writeback. A will be slowed down without
a way to make writeback of the dirty pages happen.
This patch implements foreign dirty recording and foreign mechanism so
that when a memcg encounters a condition as above it can trigger
flushes on bdi_writebacks which can clean its pages. Please see the
comment on top of mem_cgroup_track_foreign_dirty_slowpath() for
details.
A reproducer follows.
write-range.c::
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
static const char *usage = "write-range FILE START SIZE\n";
int main(int argc, char **argv)
{
int fd;
unsigned long start, size, end, pos;
char *endp;
char buf[4096];
if (argc < 4) {
fprintf(stderr, usage);
return 1;
}
fd = open(argv[1], O_WRONLY);
if (fd < 0) {
perror("open");
return 1;
}
start = strtoul(argv[2], &endp, 0);
if (*endp != '\0') {
fprintf(stderr, usage);
return 1;
}
size = strtoul(argv[3], &endp, 0);
if (*endp != '\0') {
fprintf(stderr, usage);
return 1;
}
end = start + size;
while (1) {
for (pos = start; pos < end; ) {
long bread, bwritten = 0;
if (lseek(fd, pos, SEEK_SET) < 0) {
perror("lseek");
return 1;
}
bread = read(0, buf, sizeof(buf) < end - pos ?
sizeof(buf) : end - pos);
if (bread < 0) {
perror("read");
return 1;
}
if (bread == 0)
return 0;
while (bwritten < bread) {
long this;
this = write(fd, buf + bwritten,
bread - bwritten);
if (this < 0) {
perror("write");
return 1;
}
bwritten += this;
pos += bwritten;
}
}
}
}
repro.sh::
#!/bin/bash
set -e
set -x
sysctl -w vm.dirty_expire_centisecs=300000
sysctl -w vm.dirty_writeback_centisecs=300000
sysctl -w vm.dirtytime_expire_seconds=300000
echo 3 > /proc/sys/vm/drop_caches
TEST=/sys/fs/cgroup/test
A=$TEST/A
B=$TEST/B
mkdir -p $A $B
echo "+memory +io" > $TEST/cgroup.subtree_control
echo $((1<<30)) > $A/memory.high
echo $((32<<30)) > $B/memory.high
rm -f testfile
touch testfile
fallocate -l 4G testfile
echo "Starting B"
(echo $BASHPID > $B/cgroup.procs
pv -q --rate-limit 70M < /dev/urandom | ./write-range testfile $((2<<30)) $((2<<30))) &
echo "Waiting 10s to ensure B claims the testfile inode"
sleep 5
sync
sleep 5
sync
echo "Starting A"
(echo $BASHPID > $A/cgroup.procs
pv < /dev/urandom | ./write-range testfile 0 $((2<<30)))
v2: Added comments explaining why the specific intervals are being used.
v3: Use 0 @nr when calling cgroup_writeback_by_id() to use best-effort
flushing while avoding possible livelocks.
v4: Use get_jiffies_64() and time_before/after64() instead of raw
jiffies_64 and arthimetic comparisons as suggested by Jan.
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2019-08-26 16:06:56 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void mem_cgroup_flush_foreign(struct bdi_writeback *wb);
|
|
|
|
|
2015-05-22 22:23:33 +00:00
|
|
|
#else /* CONFIG_CGROUP_WRITEBACK */
|
|
|
|
|
|
|
|
static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2015-05-22 22:23:35 +00:00
|
|
|
static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb,
|
writeback: fix incorrect calculation of available memory for memcg domains
For memcg domains, the amount of available memory was calculated as
min(the amount currently in use + headroom according to memcg,
total clean memory)
This isn't quite correct as what should be capped by the amount of
clean memory is the headroom, not the sum of memory in use and
headroom. For example, if a memcg domain has a significant amount of
dirty memory, the above can lead to a value which is lower than the
current amount in use which doesn't make much sense. In most
circumstances, the above leads to a number which is somewhat but not
drastically lower.
As the amount of memory which can be readily allocated to the memcg
domain is capped by the amount of system-wide clean memory which is
not already assigned to the memcg itself, the number we want is
the amount currently in use +
min(headroom according to memcg, clean memory elsewhere in the system)
This patch updates mem_cgroup_wb_stats() to return the number of
filepages and headroom instead of the calculated available pages.
mdtc_cap_avail() is renamed to mdtc_calc_avail() and performs the
above calculation from file, headroom, dirty and globally clean pages.
v2: Dummy mem_cgroup_wb_stats() implementation wasn't updated leading
to build failure when !CGROUP_WRITEBACK. Fixed.
Signed-off-by: Tejun Heo <tj@kernel.org>
Fixes: c2aa723a6093 ("writeback: implement memcg writeback domain based throttling")
Signed-off-by: Jens Axboe <axboe@fb.com>
2015-09-29 17:04:26 +00:00
|
|
|
unsigned long *pfilepages,
|
|
|
|
unsigned long *pheadroom,
|
2015-05-22 22:23:35 +00:00
|
|
|
unsigned long *pdirty,
|
|
|
|
unsigned long *pwriteback)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2021-05-04 15:01:10 +00:00
|
|
|
static inline void mem_cgroup_track_foreign_dirty(struct folio *folio,
|
writeback, memcg: Implement foreign dirty flushing
There's an inherent mismatch between memcg and writeback. The former
trackes ownership per-page while the latter per-inode. This was a
deliberate design decision because honoring per-page ownership in the
writeback path is complicated, may lead to higher CPU and IO overheads
and deemed unnecessary given that write-sharing an inode across
different cgroups isn't a common use-case.
Combined with inode majority-writer ownership switching, this works
well enough in most cases but there are some pathological cases. For
example, let's say there are two cgroups A and B which keep writing to
different but confined parts of the same inode. B owns the inode and
A's memory is limited far below B's. A's dirty ratio can rise enough
to trigger balance_dirty_pages() sleeps but B's can be low enough to
avoid triggering background writeback. A will be slowed down without
a way to make writeback of the dirty pages happen.
This patch implements foreign dirty recording and foreign mechanism so
that when a memcg encounters a condition as above it can trigger
flushes on bdi_writebacks which can clean its pages. Please see the
comment on top of mem_cgroup_track_foreign_dirty_slowpath() for
details.
A reproducer follows.
write-range.c::
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
static const char *usage = "write-range FILE START SIZE\n";
int main(int argc, char **argv)
{
int fd;
unsigned long start, size, end, pos;
char *endp;
char buf[4096];
if (argc < 4) {
fprintf(stderr, usage);
return 1;
}
fd = open(argv[1], O_WRONLY);
if (fd < 0) {
perror("open");
return 1;
}
start = strtoul(argv[2], &endp, 0);
if (*endp != '\0') {
fprintf(stderr, usage);
return 1;
}
size = strtoul(argv[3], &endp, 0);
if (*endp != '\0') {
fprintf(stderr, usage);
return 1;
}
end = start + size;
while (1) {
for (pos = start; pos < end; ) {
long bread, bwritten = 0;
if (lseek(fd, pos, SEEK_SET) < 0) {
perror("lseek");
return 1;
}
bread = read(0, buf, sizeof(buf) < end - pos ?
sizeof(buf) : end - pos);
if (bread < 0) {
perror("read");
return 1;
}
if (bread == 0)
return 0;
while (bwritten < bread) {
long this;
this = write(fd, buf + bwritten,
bread - bwritten);
if (this < 0) {
perror("write");
return 1;
}
bwritten += this;
pos += bwritten;
}
}
}
}
repro.sh::
#!/bin/bash
set -e
set -x
sysctl -w vm.dirty_expire_centisecs=300000
sysctl -w vm.dirty_writeback_centisecs=300000
sysctl -w vm.dirtytime_expire_seconds=300000
echo 3 > /proc/sys/vm/drop_caches
TEST=/sys/fs/cgroup/test
A=$TEST/A
B=$TEST/B
mkdir -p $A $B
echo "+memory +io" > $TEST/cgroup.subtree_control
echo $((1<<30)) > $A/memory.high
echo $((32<<30)) > $B/memory.high
rm -f testfile
touch testfile
fallocate -l 4G testfile
echo "Starting B"
(echo $BASHPID > $B/cgroup.procs
pv -q --rate-limit 70M < /dev/urandom | ./write-range testfile $((2<<30)) $((2<<30))) &
echo "Waiting 10s to ensure B claims the testfile inode"
sleep 5
sync
sleep 5
sync
echo "Starting A"
(echo $BASHPID > $A/cgroup.procs
pv < /dev/urandom | ./write-range testfile 0 $((2<<30)))
v2: Added comments explaining why the specific intervals are being used.
v3: Use 0 @nr when calling cgroup_writeback_by_id() to use best-effort
flushing while avoding possible livelocks.
v4: Use get_jiffies_64() and time_before/after64() instead of raw
jiffies_64 and arthimetic comparisons as suggested by Jan.
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2019-08-26 16:06:56 +00:00
|
|
|
struct bdi_writeback *wb)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2015-05-22 22:23:33 +00:00
|
|
|
#endif /* CONFIG_CGROUP_WRITEBACK */
|
2015-05-22 21:13:37 +00:00
|
|
|
|
2011-12-11 21:47:03 +00:00
|
|
|
struct sock;
|
2021-08-17 19:40:03 +00:00
|
|
|
bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
|
|
|
|
gfp_t gfp_mask);
|
2016-01-14 23:21:17 +00:00
|
|
|
void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages);
|
2016-01-20 23:02:47 +00:00
|
|
|
#ifdef CONFIG_MEMCG
|
2016-01-14 23:21:34 +00:00
|
|
|
extern struct static_key_false memcg_sockets_enabled_key;
|
|
|
|
#define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key)
|
2016-10-08 00:00:58 +00:00
|
|
|
void mem_cgroup_sk_alloc(struct sock *sk);
|
|
|
|
void mem_cgroup_sk_free(struct sock *sk);
|
2016-01-14 23:21:17 +00:00
|
|
|
static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
|
net: tcp_memcontrol: sanitize tcp memory accounting callbacks
There won't be a tcp control soft limit, so integrating the memcg code
into the global skmem limiting scheme complicates things unnecessarily.
Replace this with simple and clear charge and uncharge calls--hidden
behind a jump label--to account skb memory.
Note that this is not purely aesthetic: as a result of shoehorning the
per-memcg code into the same memory accounting functions that handle the
global level, the old code would compare the per-memcg consumption
against the smaller of the per-memcg limit and the global limit. This
allowed the total consumption of multiple sockets to exceed the global
limit, as long as the individual sockets stayed within bounds. After
this change, the code will always compare the per-memcg consumption to
the per-memcg limit, and the global consumption to the global limit, and
thus close this loophole.
Without a soft limit, the per-memcg memory pressure state in sockets is
generally questionable. However, we did it until now, so we continue to
enter it when the hard limit is hit, and packets are dropped, to let
other sockets in the cgroup know that they shouldn't grow their transmit
windows, either. However, keep it simple in the new callback model and
leave memory pressure lazily when the next packet is accepted (as
opposed to doing it synchroneously when packets are processed). When
packets are dropped, network performance will already be in the toilet,
so that should be a reasonable trade-off.
As described above, consumption is now checked on the per-memcg level
and the global level separately. Likewise, memory pressure states are
maintained on both the per-memcg level and the global level, and a
socket is considered under pressure when either level asserts as much.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-14 23:21:14 +00:00
|
|
|
{
|
2024-06-28 21:03:10 +00:00
|
|
|
#ifdef CONFIG_MEMCG_V1
|
2023-08-14 07:09:11 +00:00
|
|
|
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
|
|
|
|
return !!memcg->tcpmem_pressure;
|
2024-06-28 21:03:10 +00:00
|
|
|
#endif /* CONFIG_MEMCG_V1 */
|
2016-01-14 23:21:32 +00:00
|
|
|
do {
|
2021-11-05 20:42:52 +00:00
|
|
|
if (time_before(jiffies, READ_ONCE(memcg->socket_pressure)))
|
2016-01-14 23:21:32 +00:00
|
|
|
return true;
|
|
|
|
} while ((memcg = parent_mem_cgroup(memcg)));
|
|
|
|
return false;
|
net: tcp_memcontrol: sanitize tcp memory accounting callbacks
There won't be a tcp control soft limit, so integrating the memcg code
into the global skmem limiting scheme complicates things unnecessarily.
Replace this with simple and clear charge and uncharge calls--hidden
behind a jump label--to account skb memory.
Note that this is not purely aesthetic: as a result of shoehorning the
per-memcg code into the same memory accounting functions that handle the
global level, the old code would compare the per-memcg consumption
against the smaller of the per-memcg limit and the global limit. This
allowed the total consumption of multiple sockets to exceed the global
limit, as long as the individual sockets stayed within bounds. After
this change, the code will always compare the per-memcg consumption to
the per-memcg limit, and the global consumption to the global limit, and
thus close this loophole.
Without a soft limit, the per-memcg memory pressure state in sockets is
generally questionable. However, we did it until now, so we continue to
enter it when the hard limit is hit, and packets are dropped, to let
other sockets in the cgroup know that they shouldn't grow their transmit
windows, either. However, keep it simple in the new callback model and
leave memory pressure lazily when the next packet is accepted (as
opposed to doing it synchroneously when packets are processed). When
packets are dropped, network performance will already be in the toilet,
so that should be a reasonable trade-off.
As described above, consumption is now checked on the per-memcg level
and the global level separately. Likewise, memory pressure states are
maintained on both the per-memcg level and the global level, and a
socket is considered under pressure when either level asserts as much.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-14 23:21:14 +00:00
|
|
|
}
|
2019-09-23 22:38:12 +00:00
|
|
|
|
2021-05-05 01:36:23 +00:00
|
|
|
int alloc_shrinker_info(struct mem_cgroup *memcg);
|
|
|
|
void free_shrinker_info(struct mem_cgroup *memcg);
|
2021-05-05 01:36:11 +00:00
|
|
|
void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id);
|
2021-05-05 01:36:42 +00:00
|
|
|
void reparent_shrinker_deferred(struct mem_cgroup *memcg);
|
net: tcp_memcontrol: sanitize tcp memory accounting callbacks
There won't be a tcp control soft limit, so integrating the memcg code
into the global skmem limiting scheme complicates things unnecessarily.
Replace this with simple and clear charge and uncharge calls--hidden
behind a jump label--to account skb memory.
Note that this is not purely aesthetic: as a result of shoehorning the
per-memcg code into the same memory accounting functions that handle the
global level, the old code would compare the per-memcg consumption
against the smaller of the per-memcg limit and the global limit. This
allowed the total consumption of multiple sockets to exceed the global
limit, as long as the individual sockets stayed within bounds. After
this change, the code will always compare the per-memcg consumption to
the per-memcg limit, and the global consumption to the global limit, and
thus close this loophole.
Without a soft limit, the per-memcg memory pressure state in sockets is
generally questionable. However, we did it until now, so we continue to
enter it when the hard limit is hit, and packets are dropped, to let
other sockets in the cgroup know that they shouldn't grow their transmit
windows, either. However, keep it simple in the new callback model and
leave memory pressure lazily when the next packet is accepted (as
opposed to doing it synchroneously when packets are processed). When
packets are dropped, network performance will already be in the toilet,
so that should be a reasonable trade-off.
As described above, consumption is now checked on the per-memcg level
and the global level separately. Likewise, memory pressure states are
maintained on both the per-memcg level and the global level, and a
socket is considered under pressure when either level asserts as much.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-14 23:21:14 +00:00
|
|
|
#else
|
2016-01-14 23:21:20 +00:00
|
|
|
#define mem_cgroup_sockets_enabled 0
|
2016-10-08 00:00:58 +00:00
|
|
|
static inline void mem_cgroup_sk_alloc(struct sock *sk) { };
|
|
|
|
static inline void mem_cgroup_sk_free(struct sock *sk) { };
|
2016-01-14 23:21:17 +00:00
|
|
|
static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
|
net: tcp_memcontrol: sanitize tcp memory accounting callbacks
There won't be a tcp control soft limit, so integrating the memcg code
into the global skmem limiting scheme complicates things unnecessarily.
Replace this with simple and clear charge and uncharge calls--hidden
behind a jump label--to account skb memory.
Note that this is not purely aesthetic: as a result of shoehorning the
per-memcg code into the same memory accounting functions that handle the
global level, the old code would compare the per-memcg consumption
against the smaller of the per-memcg limit and the global limit. This
allowed the total consumption of multiple sockets to exceed the global
limit, as long as the individual sockets stayed within bounds. After
this change, the code will always compare the per-memcg consumption to
the per-memcg limit, and the global consumption to the global limit, and
thus close this loophole.
Without a soft limit, the per-memcg memory pressure state in sockets is
generally questionable. However, we did it until now, so we continue to
enter it when the hard limit is hit, and packets are dropped, to let
other sockets in the cgroup know that they shouldn't grow their transmit
windows, either. However, keep it simple in the new callback model and
leave memory pressure lazily when the next packet is accepted (as
opposed to doing it synchroneously when packets are processed). When
packets are dropped, network performance will already be in the toilet,
so that should be a reasonable trade-off.
As described above, consumption is now checked on the per-memcg level
and the global level separately. Likewise, memory pressure states are
maintained on both the per-memcg level and the global level, and a
socket is considered under pressure when either level asserts as much.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-14 23:21:14 +00:00
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
2019-09-23 22:38:12 +00:00
|
|
|
|
2021-05-05 01:36:11 +00:00
|
|
|
static inline void set_shrinker_bit(struct mem_cgroup *memcg,
|
|
|
|
int nid, int shrinker_id)
|
2019-09-23 22:38:12 +00:00
|
|
|
{
|
|
|
|
}
|
net: tcp_memcontrol: sanitize tcp memory accounting callbacks
There won't be a tcp control soft limit, so integrating the memcg code
into the global skmem limiting scheme complicates things unnecessarily.
Replace this with simple and clear charge and uncharge calls--hidden
behind a jump label--to account skb memory.
Note that this is not purely aesthetic: as a result of shoehorning the
per-memcg code into the same memory accounting functions that handle the
global level, the old code would compare the per-memcg consumption
against the smaller of the per-memcg limit and the global limit. This
allowed the total consumption of multiple sockets to exceed the global
limit, as long as the individual sockets stayed within bounds. After
this change, the code will always compare the per-memcg consumption to
the per-memcg limit, and the global consumption to the global limit, and
thus close this loophole.
Without a soft limit, the per-memcg memory pressure state in sockets is
generally questionable. However, we did it until now, so we continue to
enter it when the hard limit is hit, and packets are dropped, to let
other sockets in the cgroup know that they shouldn't grow their transmit
windows, either. However, keep it simple in the new callback model and
leave memory pressure lazily when the next packet is accepted (as
opposed to doing it synchroneously when packets are processed). When
packets are dropped, network performance will already be in the toilet,
so that should be a reasonable trade-off.
As described above, consumption is now checked on the per-memcg level
and the global level separately. Likewise, memory pressure states are
maintained on both the per-memcg level and the global level, and a
socket is considered under pressure when either level asserts as much.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-14 23:21:14 +00:00
|
|
|
#endif
|
2012-12-18 22:21:56 +00:00
|
|
|
|
2024-07-01 15:31:15 +00:00
|
|
|
#ifdef CONFIG_MEMCG
|
2021-06-03 01:09:30 +00:00
|
|
|
bool mem_cgroup_kmem_disabled(void);
|
2020-04-02 04:06:46 +00:00
|
|
|
int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order);
|
|
|
|
void __memcg_kmem_uncharge_page(struct page *page, int order);
|
2016-07-26 22:24:21 +00:00
|
|
|
|
2023-10-19 22:53:44 +00:00
|
|
|
/*
|
|
|
|
* The returned objcg pointer is safe to use without additional
|
|
|
|
* protection within a scope. The scope is defined either by
|
|
|
|
* the current task (similar to the "current" global variable)
|
|
|
|
* or by set_active_memcg() pair.
|
|
|
|
* Please, use obj_cgroup_get() to get a reference if the pointer
|
|
|
|
* needs to be used outside of the local scope.
|
|
|
|
*/
|
|
|
|
struct obj_cgroup *current_obj_cgroup(void);
|
2023-07-15 04:23:41 +00:00
|
|
|
struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio);
|
2020-08-07 06:20:49 +00:00
|
|
|
|
2023-10-19 22:53:46 +00:00
|
|
|
static inline struct obj_cgroup *get_obj_cgroup_from_current(void)
|
|
|
|
{
|
|
|
|
struct obj_cgroup *objcg = current_obj_cgroup();
|
|
|
|
|
|
|
|
if (objcg)
|
|
|
|
obj_cgroup_get(objcg);
|
|
|
|
|
|
|
|
return objcg;
|
|
|
|
}
|
|
|
|
|
2020-08-07 06:20:49 +00:00
|
|
|
int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size);
|
|
|
|
void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size);
|
|
|
|
|
2023-02-10 15:47:31 +00:00
|
|
|
extern struct static_key_false memcg_bpf_enabled_key;
|
|
|
|
static inline bool memcg_bpf_enabled(void)
|
|
|
|
{
|
|
|
|
return static_branch_likely(&memcg_bpf_enabled_key);
|
|
|
|
}
|
|
|
|
|
2023-02-13 19:29:22 +00:00
|
|
|
extern struct static_key_false memcg_kmem_online_key;
|
2012-12-18 22:23:01 +00:00
|
|
|
|
2023-02-13 19:29:22 +00:00
|
|
|
static inline bool memcg_kmem_online(void)
|
2012-12-18 22:21:56 +00:00
|
|
|
{
|
2023-02-13 19:29:22 +00:00
|
|
|
return static_branch_likely(&memcg_kmem_online_key);
|
2012-12-18 22:21:56 +00:00
|
|
|
}
|
|
|
|
|
2020-04-02 04:06:46 +00:00
|
|
|
static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp,
|
|
|
|
int order)
|
2019-03-05 23:43:13 +00:00
|
|
|
{
|
2023-02-13 19:29:22 +00:00
|
|
|
if (memcg_kmem_online())
|
2020-04-02 04:06:46 +00:00
|
|
|
return __memcg_kmem_charge_page(page, gfp, order);
|
2019-03-05 23:43:13 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2020-04-02 04:06:46 +00:00
|
|
|
static inline void memcg_kmem_uncharge_page(struct page *page, int order)
|
2019-03-05 23:43:13 +00:00
|
|
|
{
|
2023-02-13 19:29:22 +00:00
|
|
|
if (memcg_kmem_online())
|
2020-04-02 04:06:46 +00:00
|
|
|
__memcg_kmem_uncharge_page(page, order);
|
2019-03-05 23:43:13 +00:00
|
|
|
}
|
|
|
|
|
2015-09-08 22:01:02 +00:00
|
|
|
/*
|
2020-12-15 03:06:45 +00:00
|
|
|
* A helper for accessing memcg's kmem_id, used for getting
|
|
|
|
* corresponding LRU lists.
|
2015-09-08 22:01:02 +00:00
|
|
|
*/
|
2022-03-22 21:41:38 +00:00
|
|
|
static inline int memcg_kmem_id(struct mem_cgroup *memcg)
|
2015-09-08 22:01:02 +00:00
|
|
|
{
|
|
|
|
return memcg ? memcg->kmemcg_id : -1;
|
|
|
|
}
|
2014-04-07 22:39:24 +00:00
|
|
|
|
2022-06-10 18:03:10 +00:00
|
|
|
struct mem_cgroup *mem_cgroup_from_slab_obj(void *p);
|
2020-03-29 02:17:25 +00:00
|
|
|
|
mm: count zeromap read and set for swapout and swapin
When the proportion of folios from the zeromap is small, missing their
accounting may not significantly impact profiling. However, it's easy to
construct a scenario where this becomes an issue—for example, allocating
1 GB of memory, writing zeros from userspace, followed by MADV_PAGEOUT,
and then swapping it back in. In this case, the swap-out and swap-in
counts seem to vanish into a black hole, potentially causing semantic
ambiguity.
On the other hand, Usama reported that zero-filled pages can exceed 10% in
workloads utilizing zswap, while Hailong noted that some app in Android
have more than 6% zero-filled pages. Before commit 0ca0c24e3211 ("mm:
store zero pages to be swapped out in a bitmap"), both zswap and zRAM
implemented similar optimizations, leading to these optimized-out pages
being counted in either zswap or zRAM counters (with pswpin/pswpout also
increasing for zRAM). With zeromap functioning prior to both zswap and
zRAM, userspace will no longer detect these swap-out and swap-in actions.
We have three ways to address this:
1. Introduce a dedicated counter specifically for the zeromap.
2. Use pswpin/pswpout accounting, treating the zero map as a standard
backend. This approach aligns with zRAM's current handling of
same-page fills at the device level. However, it would mean losing the
optimized-out page counters previously available in zRAM and would not
align with systems using zswap. Additionally, as noted by Nhat Pham,
pswpin/pswpout counters apply only to I/O done directly to the backend
device.
3. Count zeromap pages under zswap, aligning with system behavior when
zswap is enabled. However, this would not be consistent with zRAM, nor
would it align with systems lacking both zswap and zRAM.
Given the complications with options 2 and 3, this patch selects
option 1.
We can find these counters from /proc/vmstat (counters for the whole
system) and memcg's memory.stat (counters for the interested memcg).
For example:
$ grep -E 'swpin_zero|swpout_zero' /proc/vmstat
swpin_zero 1648
swpout_zero 33536
$ grep -E 'swpin_zero|swpout_zero' /sys/fs/cgroup/system.slice/memory.stat
swpin_zero 3905
swpout_zero 3985
This patch does not address any specific zeromap bug, but the missing
swpout and swpin counts for zero-filled pages can be highly confusing and
may mislead user-space agents that rely on changes in these counters as
indicators. Therefore, we add a Fixes tag to encourage the inclusion of
this counter in any kernel versions with zeromap.
Many thanks to Kanchana for the contribution of changing
count_objcg_event() to count_objcg_events() to support large folios[1],
which has now been incorporated into this patch.
[1] https://lkml.kernel.org/r/20241001053222.6944-5-kanchana.p.sridhar@intel.com
Link: https://lkml.kernel.org/r/20241107011246.59137-1-21cnbao@gmail.com
Fixes: 0ca0c24e3211 ("mm: store zero pages to be swapped out in a bitmap")
Co-developed-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
Reviewed-by: Nhat Pham <nphamcs@gmail.com>
Reviewed-by: Chengming Zhou <chengming.zhou@linux.dev>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Usama Arif <usamaarif642@gmail.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Hailong Liu <hailong.liu@oppo.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-11-07 01:12:46 +00:00
|
|
|
static inline void count_objcg_events(struct obj_cgroup *objcg,
|
|
|
|
enum vm_event_item idx,
|
|
|
|
unsigned long count)
|
2022-05-19 21:08:53 +00:00
|
|
|
{
|
|
|
|
struct mem_cgroup *memcg;
|
|
|
|
|
2023-02-13 19:29:22 +00:00
|
|
|
if (!memcg_kmem_online())
|
2022-05-19 21:08:53 +00:00
|
|
|
return;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
memcg = obj_cgroup_memcg(objcg);
|
mm: count zeromap read and set for swapout and swapin
When the proportion of folios from the zeromap is small, missing their
accounting may not significantly impact profiling. However, it's easy to
construct a scenario where this becomes an issue—for example, allocating
1 GB of memory, writing zeros from userspace, followed by MADV_PAGEOUT,
and then swapping it back in. In this case, the swap-out and swap-in
counts seem to vanish into a black hole, potentially causing semantic
ambiguity.
On the other hand, Usama reported that zero-filled pages can exceed 10% in
workloads utilizing zswap, while Hailong noted that some app in Android
have more than 6% zero-filled pages. Before commit 0ca0c24e3211 ("mm:
store zero pages to be swapped out in a bitmap"), both zswap and zRAM
implemented similar optimizations, leading to these optimized-out pages
being counted in either zswap or zRAM counters (with pswpin/pswpout also
increasing for zRAM). With zeromap functioning prior to both zswap and
zRAM, userspace will no longer detect these swap-out and swap-in actions.
We have three ways to address this:
1. Introduce a dedicated counter specifically for the zeromap.
2. Use pswpin/pswpout accounting, treating the zero map as a standard
backend. This approach aligns with zRAM's current handling of
same-page fills at the device level. However, it would mean losing the
optimized-out page counters previously available in zRAM and would not
align with systems using zswap. Additionally, as noted by Nhat Pham,
pswpin/pswpout counters apply only to I/O done directly to the backend
device.
3. Count zeromap pages under zswap, aligning with system behavior when
zswap is enabled. However, this would not be consistent with zRAM, nor
would it align with systems lacking both zswap and zRAM.
Given the complications with options 2 and 3, this patch selects
option 1.
We can find these counters from /proc/vmstat (counters for the whole
system) and memcg's memory.stat (counters for the interested memcg).
For example:
$ grep -E 'swpin_zero|swpout_zero' /proc/vmstat
swpin_zero 1648
swpout_zero 33536
$ grep -E 'swpin_zero|swpout_zero' /sys/fs/cgroup/system.slice/memory.stat
swpin_zero 3905
swpout_zero 3985
This patch does not address any specific zeromap bug, but the missing
swpout and swpin counts for zero-filled pages can be highly confusing and
may mislead user-space agents that rely on changes in these counters as
indicators. Therefore, we add a Fixes tag to encourage the inclusion of
this counter in any kernel versions with zeromap.
Many thanks to Kanchana for the contribution of changing
count_objcg_event() to count_objcg_events() to support large folios[1],
which has now been incorporated into this patch.
[1] https://lkml.kernel.org/r/20241001053222.6944-5-kanchana.p.sridhar@intel.com
Link: https://lkml.kernel.org/r/20241107011246.59137-1-21cnbao@gmail.com
Fixes: 0ca0c24e3211 ("mm: store zero pages to be swapped out in a bitmap")
Co-developed-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
Reviewed-by: Nhat Pham <nphamcs@gmail.com>
Reviewed-by: Chengming Zhou <chengming.zhou@linux.dev>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Usama Arif <usamaarif642@gmail.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Hailong Liu <hailong.liu@oppo.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-11-07 01:12:46 +00:00
|
|
|
count_memcg_events(memcg, idx, count);
|
2022-05-19 21:08:53 +00:00
|
|
|
rcu_read_unlock();
|
|
|
|
}
|
|
|
|
|
2012-12-18 22:21:56 +00:00
|
|
|
#else
|
2021-06-03 01:09:30 +00:00
|
|
|
static inline bool mem_cgroup_kmem_disabled(void)
|
|
|
|
{
|
|
|
|
return true;
|
|
|
|
}
|
2018-10-26 22:03:19 +00:00
|
|
|
|
2020-04-02 04:06:46 +00:00
|
|
|
static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp,
|
|
|
|
int order)
|
2018-10-26 22:03:19 +00:00
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2020-04-02 04:06:46 +00:00
|
|
|
static inline void memcg_kmem_uncharge_page(struct page *page, int order)
|
2018-10-26 22:03:19 +00:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2020-04-02 04:06:46 +00:00
|
|
|
static inline int __memcg_kmem_charge_page(struct page *page, gfp_t gfp,
|
|
|
|
int order)
|
2019-03-05 23:43:13 +00:00
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2020-04-02 04:06:46 +00:00
|
|
|
static inline void __memcg_kmem_uncharge_page(struct page *page, int order)
|
2019-03-05 23:43:13 +00:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2023-07-15 04:23:41 +00:00
|
|
|
static inline struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio)
|
2022-05-19 21:08:53 +00:00
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2023-02-10 15:47:31 +00:00
|
|
|
static inline bool memcg_bpf_enabled(void)
|
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2023-02-13 19:29:22 +00:00
|
|
|
static inline bool memcg_kmem_online(void)
|
2012-12-18 22:22:46 +00:00
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2022-03-22 21:41:38 +00:00
|
|
|
static inline int memcg_kmem_id(struct mem_cgroup *memcg)
|
2012-12-18 22:22:34 +00:00
|
|
|
{
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
2022-06-10 18:03:10 +00:00
|
|
|
static inline struct mem_cgroup *mem_cgroup_from_slab_obj(void *p)
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
mm: count zeromap read and set for swapout and swapin
When the proportion of folios from the zeromap is small, missing their
accounting may not significantly impact profiling. However, it's easy to
construct a scenario where this becomes an issue—for example, allocating
1 GB of memory, writing zeros from userspace, followed by MADV_PAGEOUT,
and then swapping it back in. In this case, the swap-out and swap-in
counts seem to vanish into a black hole, potentially causing semantic
ambiguity.
On the other hand, Usama reported that zero-filled pages can exceed 10% in
workloads utilizing zswap, while Hailong noted that some app in Android
have more than 6% zero-filled pages. Before commit 0ca0c24e3211 ("mm:
store zero pages to be swapped out in a bitmap"), both zswap and zRAM
implemented similar optimizations, leading to these optimized-out pages
being counted in either zswap or zRAM counters (with pswpin/pswpout also
increasing for zRAM). With zeromap functioning prior to both zswap and
zRAM, userspace will no longer detect these swap-out and swap-in actions.
We have three ways to address this:
1. Introduce a dedicated counter specifically for the zeromap.
2. Use pswpin/pswpout accounting, treating the zero map as a standard
backend. This approach aligns with zRAM's current handling of
same-page fills at the device level. However, it would mean losing the
optimized-out page counters previously available in zRAM and would not
align with systems using zswap. Additionally, as noted by Nhat Pham,
pswpin/pswpout counters apply only to I/O done directly to the backend
device.
3. Count zeromap pages under zswap, aligning with system behavior when
zswap is enabled. However, this would not be consistent with zRAM, nor
would it align with systems lacking both zswap and zRAM.
Given the complications with options 2 and 3, this patch selects
option 1.
We can find these counters from /proc/vmstat (counters for the whole
system) and memcg's memory.stat (counters for the interested memcg).
For example:
$ grep -E 'swpin_zero|swpout_zero' /proc/vmstat
swpin_zero 1648
swpout_zero 33536
$ grep -E 'swpin_zero|swpout_zero' /sys/fs/cgroup/system.slice/memory.stat
swpin_zero 3905
swpout_zero 3985
This patch does not address any specific zeromap bug, but the missing
swpout and swpin counts for zero-filled pages can be highly confusing and
may mislead user-space agents that rely on changes in these counters as
indicators. Therefore, we add a Fixes tag to encourage the inclusion of
this counter in any kernel versions with zeromap.
Many thanks to Kanchana for the contribution of changing
count_objcg_event() to count_objcg_events() to support large folios[1],
which has now been incorporated into this patch.
[1] https://lkml.kernel.org/r/20241001053222.6944-5-kanchana.p.sridhar@intel.com
Link: https://lkml.kernel.org/r/20241107011246.59137-1-21cnbao@gmail.com
Fixes: 0ca0c24e3211 ("mm: store zero pages to be swapped out in a bitmap")
Co-developed-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
Reviewed-by: Nhat Pham <nphamcs@gmail.com>
Reviewed-by: Chengming Zhou <chengming.zhou@linux.dev>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Usama Arif <usamaarif642@gmail.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Hailong Liu <hailong.liu@oppo.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-11-07 01:12:46 +00:00
|
|
|
static inline void count_objcg_events(struct obj_cgroup *objcg,
|
|
|
|
enum vm_event_item idx,
|
|
|
|
unsigned long count)
|
2022-05-19 21:08:53 +00:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2024-07-01 15:31:15 +00:00
|
|
|
#endif /* CONFIG_MEMCG */
|
2016-01-20 23:02:32 +00:00
|
|
|
|
2024-07-01 15:31:15 +00:00
|
|
|
#if defined(CONFIG_MEMCG) && defined(CONFIG_ZSWAP)
|
2022-05-19 21:08:53 +00:00
|
|
|
bool obj_cgroup_may_zswap(struct obj_cgroup *objcg);
|
|
|
|
void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size);
|
|
|
|
void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size);
|
zswap: memcontrol: implement zswap writeback disabling
During our experiment with zswap, we sometimes observe swap IOs due to
occasional zswap store failures and writebacks-to-swap. These swapping
IOs prevent many users who cannot tolerate swapping from adopting zswap to
save memory and improve performance where possible.
This patch adds the option to disable this behavior entirely: do not
writeback to backing swapping device when a zswap store attempt fail, and
do not write pages in the zswap pool back to the backing swap device (both
when the pool is full, and when the new zswap shrinker is called).
This new behavior can be opted-in/out on a per-cgroup basis via a new
cgroup file. By default, writebacks to swap device is enabled, which is
the previous behavior. Initially, writeback is enabled for the root
cgroup, and a newly created cgroup will inherit the current setting of its
parent.
Note that this is subtly different from setting memory.swap.max to 0, as
it still allows for pages to be stored in the zswap pool (which itself
consumes swap space in its current form).
This patch should be applied on top of the zswap shrinker series:
https://lore.kernel.org/linux-mm/20231130194023.4102148-1-nphamcs@gmail.com/
as it also disables the zswap shrinker, a major source of zswap
writebacks.
For the most part, this feature is motivated by internal parties who
have already established their opinions regarding swapping - the
workloads that are highly sensitive to IO, and especially those who are
using servers with really slow disk performance (for instance, massive
but slow HDDs). For these folks, it's impossible to convince them to
even entertain zswap if swapping also comes as a packaged deal.
Writeback disabling is quite a useful feature in these situations - on
a mixed workloads deployment, they can disable writeback for the more
IO-sensitive workloads, and enable writeback for other background
workloads.
For instance, on a server with HDD, I allocate memories and populate
them with random values (so that zswap store will always fail), and
specify memory.high low enough to trigger reclaim. The time it takes
to allocate the memories and just read through it a couple of times
(doing silly things like computing the values' average etc.):
zswap.writeback disabled:
real 0m30.537s
user 0m23.687s
sys 0m6.637s
0 pages swapped in
0 pages swapped out
zswap.writeback enabled:
real 0m45.061s
user 0m24.310s
sys 0m8.892s
712686 pages swapped in
461093 pages swapped out
(the last two lines are from vmstat -s).
[nphamcs@gmail.com: add a comment about recurring zswap store failures leading to reclaim inefficiency]
Link: https://lkml.kernel.org/r/20231221005725.3446672-1-nphamcs@gmail.com
Link: https://lkml.kernel.org/r/20231207192406.3809579-1-nphamcs@gmail.com
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Yosry Ahmed <yosryahmed@google.com>
Acked-by: Chris Li <chrisl@kernel.org>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: David Heidelberg <david@ixit.cz>
Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Seth Jennings <sjenning@redhat.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vitaly Wool <vitaly.wool@konsulko.com>
Cc: Zefan Li <lizefan.x@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-12-07 19:24:06 +00:00
|
|
|
bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg);
|
2022-05-19 21:08:53 +00:00
|
|
|
#else
|
|
|
|
static inline bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
|
|
|
|
{
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
static inline void obj_cgroup_charge_zswap(struct obj_cgroup *objcg,
|
|
|
|
size_t size)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
static inline void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg,
|
|
|
|
size_t size)
|
|
|
|
{
|
|
|
|
}
|
zswap: memcontrol: implement zswap writeback disabling
During our experiment with zswap, we sometimes observe swap IOs due to
occasional zswap store failures and writebacks-to-swap. These swapping
IOs prevent many users who cannot tolerate swapping from adopting zswap to
save memory and improve performance where possible.
This patch adds the option to disable this behavior entirely: do not
writeback to backing swapping device when a zswap store attempt fail, and
do not write pages in the zswap pool back to the backing swap device (both
when the pool is full, and when the new zswap shrinker is called).
This new behavior can be opted-in/out on a per-cgroup basis via a new
cgroup file. By default, writebacks to swap device is enabled, which is
the previous behavior. Initially, writeback is enabled for the root
cgroup, and a newly created cgroup will inherit the current setting of its
parent.
Note that this is subtly different from setting memory.swap.max to 0, as
it still allows for pages to be stored in the zswap pool (which itself
consumes swap space in its current form).
This patch should be applied on top of the zswap shrinker series:
https://lore.kernel.org/linux-mm/20231130194023.4102148-1-nphamcs@gmail.com/
as it also disables the zswap shrinker, a major source of zswap
writebacks.
For the most part, this feature is motivated by internal parties who
have already established their opinions regarding swapping - the
workloads that are highly sensitive to IO, and especially those who are
using servers with really slow disk performance (for instance, massive
but slow HDDs). For these folks, it's impossible to convince them to
even entertain zswap if swapping also comes as a packaged deal.
Writeback disabling is quite a useful feature in these situations - on
a mixed workloads deployment, they can disable writeback for the more
IO-sensitive workloads, and enable writeback for other background
workloads.
For instance, on a server with HDD, I allocate memories and populate
them with random values (so that zswap store will always fail), and
specify memory.high low enough to trigger reclaim. The time it takes
to allocate the memories and just read through it a couple of times
(doing silly things like computing the values' average etc.):
zswap.writeback disabled:
real 0m30.537s
user 0m23.687s
sys 0m6.637s
0 pages swapped in
0 pages swapped out
zswap.writeback enabled:
real 0m45.061s
user 0m24.310s
sys 0m8.892s
712686 pages swapped in
461093 pages swapped out
(the last two lines are from vmstat -s).
[nphamcs@gmail.com: add a comment about recurring zswap store failures leading to reclaim inefficiency]
Link: https://lkml.kernel.org/r/20231221005725.3446672-1-nphamcs@gmail.com
Link: https://lkml.kernel.org/r/20231207192406.3809579-1-nphamcs@gmail.com
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Yosry Ahmed <yosryahmed@google.com>
Acked-by: Chris Li <chrisl@kernel.org>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: David Heidelberg <david@ixit.cz>
Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Seth Jennings <sjenning@redhat.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vitaly Wool <vitaly.wool@konsulko.com>
Cc: Zefan Li <lizefan.x@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-12-07 19:24:06 +00:00
|
|
|
static inline bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg)
|
|
|
|
{
|
|
|
|
/* if zswap is disabled, do not block pages going to the swapping device */
|
|
|
|
return true;
|
|
|
|
}
|
2022-05-19 21:08:53 +00:00
|
|
|
#endif
|
|
|
|
|
2024-06-25 00:59:04 +00:00
|
|
|
|
|
|
|
/* Cgroup v1-related declarations */
|
|
|
|
|
2024-06-25 00:59:05 +00:00
|
|
|
#ifdef CONFIG_MEMCG_V1
|
2024-06-25 00:59:04 +00:00
|
|
|
unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order,
|
|
|
|
gfp_t gfp_mask,
|
|
|
|
unsigned long *total_scanned);
|
|
|
|
|
|
|
|
bool mem_cgroup_oom_synchronize(bool wait);
|
|
|
|
|
|
|
|
static inline bool task_in_memcg_oom(struct task_struct *p)
|
|
|
|
{
|
|
|
|
return p->memcg_in_oom;
|
|
|
|
}
|
|
|
|
|
2024-06-28 21:03:17 +00:00
|
|
|
static inline void mem_cgroup_enter_user_fault(void)
|
|
|
|
{
|
|
|
|
WARN_ON(current->in_user_fault);
|
|
|
|
current->in_user_fault = 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void mem_cgroup_exit_user_fault(void)
|
|
|
|
{
|
|
|
|
WARN_ON(!current->in_user_fault);
|
|
|
|
current->in_user_fault = 0;
|
|
|
|
}
|
|
|
|
|
2024-06-25 00:59:05 +00:00
|
|
|
#else /* CONFIG_MEMCG_V1 */
|
2024-06-25 00:59:04 +00:00
|
|
|
static inline
|
|
|
|
unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order,
|
|
|
|
gfp_t gfp_mask,
|
|
|
|
unsigned long *total_scanned)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline bool task_in_memcg_oom(struct task_struct *p)
|
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline bool mem_cgroup_oom_synchronize(bool wait)
|
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2024-06-28 21:03:17 +00:00
|
|
|
static inline void mem_cgroup_enter_user_fault(void)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void mem_cgroup_exit_user_fault(void)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2024-06-25 00:59:05 +00:00
|
|
|
#endif /* CONFIG_MEMCG_V1 */
|
2024-06-25 00:59:04 +00:00
|
|
|
|
2008-02-07 08:13:50 +00:00
|
|
|
#endif /* _LINUX_MEMCONTROL_H */
|