mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
synced 2025-01-15 11:37:47 +00:00
bea67dcc5e
Zhiguo reported that swap release could be a serious bottleneck during process exits[1]. With mTHP, we have the opportunity to batch free swaps. Thanks to the work of Chris and Kairui[2], I was able to achieve this optimization with minimal code changes by building on their efforts. If swap_count is 1, which is likely true as most anon memory is private, we can free all contiguous swap slots together.

I ran the test program below to measure the bandwidth of munmap using zRAM and 64KiB mTHP:

 #include <sys/mman.h>
 #include <sys/time.h>
 #include <stdlib.h>

 unsigned long long tv_to_ms(struct timeval tv)
 {
        return tv.tv_sec * 1000 + tv.tv_usec / 1000;
 }

 main()
 {
        struct timeval tv_b, tv_e;
        int i;
 #define SIZE 1024*1024*1024
        void *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (!p) {
                perror("fail to get memory");
                exit(-1);
        }

        madvise(p, SIZE, MADV_HUGEPAGE);
        memset(p, 0x11, SIZE); /* write to get mem */

        madvise(p, SIZE, MADV_PAGEOUT);

        gettimeofday(&tv_b, NULL);
        munmap(p, SIZE);
        gettimeofday(&tv_e, NULL);

        printf("munmap in bandwidth: %ld bytes/ms\n",
               SIZE/(tv_to_ms(tv_e) - tv_to_ms(tv_b)));
 }

The result is as below (munmap bandwidth):

          mm-unstable  mm-unstable-with-patch
  round1  21053761     63161283
  round2  21053761     63161283
  round3  21053761     63161283
  round4  20648881     67108864
  round5  20648881     67108864

munmap bandwidth becomes 3X faster.
[1] https://lore.kernel.org/linux-mm/20240731133318.527-1-justinjiang@vivo.com/
[2] https://lore.kernel.org/linux-mm/20240730-swap-allocator-v5-0-cb9c148b9297@kernel.org/

[v-songbaohua@oppo.com: check all swaps belong to same swap_cgroup in swap_pte_batch()]
  Link: https://lkml.kernel.org/r/20240815215308.55233-1-21cnbao@gmail.com
[hughd@google.com: add mem_cgroup_disabled() check]
  Link: https://lkml.kernel.org/r/33f34a88-0130-5444-9b84-93198eeb50e7@google.com
[21cnbao@gmail.com: add missing zswap_invalidate()]
  Link: https://lkml.kernel.org/r/20240821054921.43468-1-21cnbao@gmail.com
Link: https://lkml.kernel.org/r/20240807215859.57491-3-21cnbao@gmail.com
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
234 lines
5.2 KiB
C
234 lines
5.2 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
#include <linux/swap_cgroup.h>
|
|
#include <linux/vmalloc.h>
|
|
#include <linux/mm.h>
|
|
|
|
#include <linux/swapops.h> /* depends on mm.h include */
|
|
|
|
/*
 * Held by swap_cgroup_swapon() and swap_cgroup_swapoff() while they install
 * or clear a swap_cgroup_ctrl's map and length.
 */
static DEFINE_MUTEX(swap_cgroup_mutex);
|
|
struct swap_cgroup_ctrl {
|
|
struct page **map;
|
|
unsigned long length;
|
|
spinlock_t lock;
|
|
};
|
|
|
|
/* One control block per swap type; indexed by the swap type of an entry. */
static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
|
|
|
|
/*
 * One record per swap slot: the mem_cgroup id recorded for that slot.
 * An id of 0 means "no mem_cgroup" (no mem_cgroup uses 0 as its id).
 */
struct swap_cgroup {
	unsigned short		id;
};
|
|
/* Number of struct swap_cgroup records that fit in a single page. */
#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup))
|
|
|
|
/*
 * SwapCgroup implements "lookup" and "exchange" operations.
 * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
 * against SwapCache. At swap_free(), it is accessed directly from swap.
 *
 * This means:
 *  - There is no race in "exchange" when we are accessed via SwapCache,
 *    because SwapCache (and its swp_entry) is under lock.
 *  - When called via swap_free(), there is no user of this entry and no race.
 * Therefore, we don't need a lock around "exchange".
 *
 * TODO: we can push these buffers out to HIGHMEM.
 */
|
|
|
|
/*
|
|
* allocate buffer for swap_cgroup.
|
|
*/
|
|
static int swap_cgroup_prepare(int type)
|
|
{
|
|
struct page *page;
|
|
struct swap_cgroup_ctrl *ctrl;
|
|
unsigned long idx, max;
|
|
|
|
ctrl = &swap_cgroup_ctrl[type];
|
|
|
|
for (idx = 0; idx < ctrl->length; idx++) {
|
|
page = alloc_page(GFP_KERNEL | __GFP_ZERO);
|
|
if (!page)
|
|
goto not_enough_page;
|
|
ctrl->map[idx] = page;
|
|
|
|
if (!(idx % SWAP_CLUSTER_MAX))
|
|
cond_resched();
|
|
}
|
|
return 0;
|
|
not_enough_page:
|
|
max = idx;
|
|
for (idx = 0; idx < max; idx++)
|
|
__free_page(ctrl->map[idx]);
|
|
|
|
return -ENOMEM;
|
|
}
|
|
|
|
static struct swap_cgroup *__lookup_swap_cgroup(struct swap_cgroup_ctrl *ctrl,
|
|
pgoff_t offset)
|
|
{
|
|
struct page *mappage;
|
|
struct swap_cgroup *sc;
|
|
|
|
mappage = ctrl->map[offset / SC_PER_PAGE];
|
|
sc = page_address(mappage);
|
|
return sc + offset % SC_PER_PAGE;
|
|
}
|
|
|
|
/*
 * Return the record for swap entry @ent.
 *
 * When @ctrlp is non-NULL, the control block of the entry's swap type is
 * also handed back through it (callers use it to reach ctrl->lock).
 */
static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
					struct swap_cgroup_ctrl **ctrlp)
{
	struct swap_cgroup_ctrl *type_ctrl = &swap_cgroup_ctrl[swp_type(ent)];

	if (ctrlp)
		*ctrlp = type_ctrl;

	return __lookup_swap_cgroup(type_ctrl, swp_offset(ent));
}
|
|
|
|
/**
|
|
* swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
|
|
* @ent: swap entry to be cmpxchged
|
|
* @old: old id
|
|
* @new: new id
|
|
*
|
|
* Returns old id at success, 0 at failure.
|
|
* (There is no mem_cgroup using 0 as its id)
|
|
*/
|
|
unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
|
|
unsigned short old, unsigned short new)
|
|
{
|
|
struct swap_cgroup_ctrl *ctrl;
|
|
struct swap_cgroup *sc;
|
|
unsigned long flags;
|
|
unsigned short retval;
|
|
|
|
sc = lookup_swap_cgroup(ent, &ctrl);
|
|
|
|
spin_lock_irqsave(&ctrl->lock, flags);
|
|
retval = sc->id;
|
|
if (retval == old)
|
|
sc->id = new;
|
|
else
|
|
retval = 0;
|
|
spin_unlock_irqrestore(&ctrl->lock, flags);
|
|
return retval;
|
|
}
|
|
|
|
/**
|
|
* swap_cgroup_record - record mem_cgroup for a set of swap entries
|
|
* @ent: the first swap entry to be recorded into
|
|
* @id: mem_cgroup to be recorded
|
|
* @nr_ents: number of swap entries to be recorded
|
|
*
|
|
* Returns old value at success, 0 at failure.
|
|
* (Of course, old value can be 0.)
|
|
*/
|
|
unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id,
|
|
unsigned int nr_ents)
|
|
{
|
|
struct swap_cgroup_ctrl *ctrl;
|
|
struct swap_cgroup *sc;
|
|
unsigned short old;
|
|
unsigned long flags;
|
|
pgoff_t offset = swp_offset(ent);
|
|
pgoff_t end = offset + nr_ents;
|
|
|
|
sc = lookup_swap_cgroup(ent, &ctrl);
|
|
|
|
spin_lock_irqsave(&ctrl->lock, flags);
|
|
old = sc->id;
|
|
for (;;) {
|
|
VM_BUG_ON(sc->id != old);
|
|
sc->id = id;
|
|
offset++;
|
|
if (offset == end)
|
|
break;
|
|
if (offset % SC_PER_PAGE)
|
|
sc++;
|
|
else
|
|
sc = __lookup_swap_cgroup(ctrl, offset);
|
|
}
|
|
spin_unlock_irqrestore(&ctrl->lock, flags);
|
|
|
|
return old;
|
|
}
|
|
|
|
/**
|
|
* lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry
|
|
* @ent: swap entry to be looked up.
|
|
*
|
|
* Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
|
|
*/
|
|
unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
|
|
{
|
|
if (mem_cgroup_disabled())
|
|
return 0;
|
|
return lookup_swap_cgroup(ent, NULL)->id;
|
|
}
|
|
|
|
int swap_cgroup_swapon(int type, unsigned long max_pages)
|
|
{
|
|
void *array;
|
|
unsigned long length;
|
|
struct swap_cgroup_ctrl *ctrl;
|
|
|
|
if (mem_cgroup_disabled())
|
|
return 0;
|
|
|
|
length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
|
|
|
|
array = vcalloc(length, sizeof(void *));
|
|
if (!array)
|
|
goto nomem;
|
|
|
|
ctrl = &swap_cgroup_ctrl[type];
|
|
mutex_lock(&swap_cgroup_mutex);
|
|
ctrl->length = length;
|
|
ctrl->map = array;
|
|
spin_lock_init(&ctrl->lock);
|
|
if (swap_cgroup_prepare(type)) {
|
|
/* memory shortage */
|
|
ctrl->map = NULL;
|
|
ctrl->length = 0;
|
|
mutex_unlock(&swap_cgroup_mutex);
|
|
vfree(array);
|
|
goto nomem;
|
|
}
|
|
mutex_unlock(&swap_cgroup_mutex);
|
|
|
|
return 0;
|
|
nomem:
|
|
pr_info("couldn't allocate enough memory for swap_cgroup\n");
|
|
pr_info("swap_cgroup can be disabled by swapaccount=0 boot option\n");
|
|
return -ENOMEM;
|
|
}
|
|
|
|
void swap_cgroup_swapoff(int type)
|
|
{
|
|
struct page **map;
|
|
unsigned long i, length;
|
|
struct swap_cgroup_ctrl *ctrl;
|
|
|
|
if (mem_cgroup_disabled())
|
|
return;
|
|
|
|
mutex_lock(&swap_cgroup_mutex);
|
|
ctrl = &swap_cgroup_ctrl[type];
|
|
map = ctrl->map;
|
|
length = ctrl->length;
|
|
ctrl->map = NULL;
|
|
ctrl->length = 0;
|
|
mutex_unlock(&swap_cgroup_mutex);
|
|
|
|
if (map) {
|
|
for (i = 0; i < length; i++) {
|
|
struct page *page = map[i];
|
|
if (page)
|
|
__free_page(page);
|
|
if (!(i % SWAP_CLUSTER_MAX))
|
|
cond_resched();
|
|
}
|
|
vfree(map);
|
|
}
|
|
}
|