mm/mglru: rework workingset protection
With the aging feedback no longer considering the distribution of folios
in each generation, rework workingset protection to better distribute
folios across MAX_NR_GENS. This is achieved by reusing PG_workingset and
PG_referenced/LRU_REFS_FLAGS in a slightly different way.

For folios accessed multiple times through file descriptors, make
lru_gen_inc_refs() set additional bits of LRU_REFS_WIDTH in folio->flags
after PG_referenced, then PG_workingset after LRU_REFS_WIDTH. After all
its bits are set, i.e., LRU_REFS_FLAGS|BIT(PG_workingset), a folio is
lazily promoted into the second oldest generation in the eviction path.
And when folio_inc_gen() does that, it clears LRU_REFS_FLAGS so that
lru_gen_inc_refs() can start over. For this case, LRU_REFS_MASK is only
valid when PG_referenced is set.

For folios accessed multiple times through page tables,
folio_update_gen() from a page table walk or lru_gen_set_refs() from a
rmap walk sets PG_referenced after the accessed bit is cleared for the
first time. Thereafter, those two paths set PG_workingset and promote
folios to the youngest generation. Like folio_inc_gen(), when
folio_update_gen() does that, it also clears PG_referenced. For this
case, LRU_REFS_MASK is not used.

For both of the cases, after PG_workingset is set on a folio, it remains
until this folio is either reclaimed, or "deactivated" by
lru_gen_clear_refs(). It can be set again if lru_gen_test_recent()
returns true upon a refault.

When adding folios to the LRU lists, lru_gen_folio_seq() distributes
them as follows:

  +---------------------------------+---------------------------------+
  |   Accessed thru page tables     | Accessed thru file descriptors  |
  +---------------------------------+---------------------------------+
  | PG_active (set while isolated)  |                                 |
  +----------------+----------------+----------------+----------------+
  | PG_workingset  | PG_referenced  | PG_workingset  | LRU_REFS_FLAGS |
  +---------------------------------+---------------------------------+
  |<--------- MIN_NR_GENS --------->|                                 |
  |<-------------------------- MAX_NR_GENS ----------------------->|

After this patch, some typical client and server workloads showed
improvements under heavy memory pressure. For example, Python TPC-C,
which was used to benchmark a different approach [1] to better detect
refault distances, showed a significant decrease in total refaults:

                             Before       After        Change
  Time (seconds)             10801        10801        0%
  Executed (transactions)    41472        43663        +5%
  workingset_nodes           109070       120244       +10%
  workingset_refault_anon    5019627      7281831      +45%
  workingset_refault_file    1294678786   554855564    -57%
  workingset_refault_total   1299698413   562137395    -57%

[1] https://lore.kernel.org/20230920190244.16839-1-ryncsn@gmail.com/

Link: https://lkml.kernel.org/r/20241231043538.4075764-7-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reported-by: Kairui Song <kasong@tencent.com>
Closes: https://lore.kernel.org/CAOUHufahuWcKf5f1Sg3emnqX+cODuR=2TQo7T4Gr-QYLujn4RA@mail.gmail.com/
Tested-by: Kalesh Singh <kaleshsingh@google.com>
Cc: Barry Song <v-songbaohua@oppo.com>
Cc: Bharata B Rao <bharata@amd.com>
Cc: David Stevens <stevensd@chromium.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
commit 01bedddff3 (parent e7e2a93af1)
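A minimal user-space sketch of the file-descriptor path described above may make the flag progression easier to follow. It is not kernel code: the bit positions, the LRU_REFS_WIDTH value and the helper name are assumptions for illustration only.

#include <stdio.h>

#define LRU_REFS_WIDTH  2                       /* assumed: MAX_NR_TIERS - 2 */
#define PG_referenced   (1UL << 2)              /* illustrative bit positions */
#define PG_workingset   (1UL << 3)
#define LRU_REFS_PGOFF  4
#define LRU_REFS_MASK   (((1UL << LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)

/* one access through a file descriptor, modeled after the description above */
static void inc_refs(unsigned long *flags)
{
        if (!(*flags & PG_referenced)) {
                *flags |= PG_referenced;                /* first access */
                return;
        }
        if ((*flags & LRU_REFS_MASK) != LRU_REFS_MASK) {
                *flags += 1UL << LRU_REFS_PGOFF;        /* count further accesses */
                return;
        }
        *flags |= PG_workingset;                        /* counter saturated */
}

int main(void)
{
        unsigned long flags = 0;

        for (int i = 0; i < 5; i++)
                inc_refs(&flags);
        /* 5 accesses: PG_referenced, a full LRU_REFS counter and PG_workingset */
        printf("referenced=%d refs=%lu workingset=%d\n",
               !!(flags & PG_referenced),
               (flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF,
               !!(flags & PG_workingset));
        return 0;
}

Each call stands for one access through a file descriptor; once PG_workingset is reached, the eviction path below lazily promotes the folio instead of counting it as protected.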
include/linux/mm_inline.h

@@ -133,31 +133,25 @@ static inline int lru_hist_from_seq(unsigned long seq)
         return seq % NR_HIST_GENS;
 }
 
-static inline int lru_tier_from_refs(int refs)
+static inline int lru_tier_from_refs(int refs, bool workingset)
 {
         VM_WARN_ON_ONCE(refs > BIT(LRU_REFS_WIDTH));
 
-        /* see the comment in folio_lru_refs() */
-        return order_base_2(refs + 1);
+        /* see the comment on MAX_NR_TIERS */
+        return workingset ? MAX_NR_TIERS - 1 : order_base_2(refs);
 }
 
 static inline int folio_lru_refs(struct folio *folio)
 {
         unsigned long flags = READ_ONCE(folio->flags);
-        bool workingset = flags & BIT(PG_workingset);
 
+        if (!(flags & BIT(PG_referenced)))
+                return 0;
         /*
-         * Return the number of accesses beyond PG_referenced, i.e., N-1 if the
-         * total number of accesses is N>1, since N=0,1 both map to the first
-         * tier. lru_tier_from_refs() will account for this off-by-one. Also see
-         * the comment on MAX_NR_TIERS.
+         * Return the total number of accesses including PG_referenced. Also see
+         * the comment on LRU_REFS_FLAGS.
          */
-        return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + workingset;
-}
-
-static inline void folio_clear_lru_refs(struct folio *folio)
-{
-        set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0);
+        return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + 1;
 }
 
 static inline int folio_lru_gen(struct folio *folio)
@@ -223,11 +217,43 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *foli
         VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen));
 }
 
+static inline unsigned long lru_gen_folio_seq(struct lruvec *lruvec, struct folio *folio,
+                                              bool reclaiming)
+{
+        int gen;
+        int type = folio_is_file_lru(folio);
+        struct lru_gen_folio *lrugen = &lruvec->lrugen;
+
+        /*
+         * +-----------------------------------+-----------------------------------+
+         * | Accessed through page tables and  | Accessed through file descriptors |
+         * | promoted by folio_update_gen()    | and protected by folio_inc_gen()  |
+         * +-----------------------------------+-----------------------------------+
+         * | PG_active (set while isolated)    |                                   |
+         * +-----------------+-----------------+-----------------+-----------------+
+         * |  PG_workingset  |  PG_referenced  |  PG_workingset  |  LRU_REFS_FLAGS |
+         * +-----------------------------------+-----------------------------------+
+         * |<---------- MIN_NR_GENS ---------->|                                   |
+         * |<---------------------------- MAX_NR_GENS ---------------------------->|
+         */
+        if (folio_test_active(folio))
+                gen = MIN_NR_GENS - folio_test_workingset(folio);
+        else if (reclaiming)
+                gen = MAX_NR_GENS;
+        else if ((!folio_is_file_lru(folio) && !folio_test_swapcache(folio)) ||
+                 (folio_test_reclaim(folio) &&
+                  (folio_test_dirty(folio) || folio_test_writeback(folio))))
+                gen = MIN_NR_GENS;
+        else
+                gen = MAX_NR_GENS - folio_test_workingset(folio);
+
+        return max(READ_ONCE(lrugen->max_seq) - gen + 1, READ_ONCE(lrugen->min_seq[type]));
+}
+
 static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
 {
         unsigned long seq;
         unsigned long flags;
-        unsigned long mask;
         int gen = folio_lru_gen(folio);
         int type = folio_is_file_lru(folio);
         int zone = folio_zonenum(folio);
@@ -237,40 +263,12 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio,
 
         if (folio_test_unevictable(folio) || !lrugen->enabled)
                 return false;
-        /*
-         * There are four common cases for this page:
-         * 1. If it's hot, i.e., freshly faulted in, add it to the youngest
-         *    generation, and it's protected over the rest below.
-         * 2. If it can't be evicted immediately, i.e., a dirty page pending
-         *    writeback, add it to the second youngest generation.
-         * 3. If it should be evicted first, e.g., cold and clean from
-         *    folio_rotate_reclaimable(), add it to the oldest generation.
-         * 4. Everything else falls between 2 & 3 above and is added to the
-         *    second oldest generation if it's considered inactive, or the
-         *    oldest generation otherwise. See lru_gen_is_active().
-         */
-        if (folio_test_active(folio))
-                seq = lrugen->max_seq;
-        else if ((type == LRU_GEN_ANON && !folio_test_swapcache(folio)) ||
-                 (folio_test_reclaim(folio) &&
-                  (folio_test_dirty(folio) || folio_test_writeback(folio))))
-                seq = lrugen->max_seq - 1;
-        else if (reclaiming || lrugen->min_seq[type] + MIN_NR_GENS >= lrugen->max_seq)
-                seq = lrugen->min_seq[type];
-        else
-                seq = lrugen->min_seq[type] + 1;
 
+        seq = lru_gen_folio_seq(lruvec, folio, reclaiming);
         gen = lru_gen_from_seq(seq);
         flags = (gen + 1UL) << LRU_GEN_PGOFF;
         /* see the comment on MIN_NR_GENS about PG_active */
-        mask = LRU_GEN_MASK;
-        /*
-         * Don't clear PG_workingset here because it can affect PSI accounting
-         * if the activation is due to workingset refault.
-         */
-        if (folio_test_active(folio))
-                mask |= LRU_REFS_MASK | BIT(PG_referenced) | BIT(PG_active);
-        set_mask_bits(&folio->flags, mask, flags);
+        set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags);
 
         lru_gen_update_size(lruvec, folio, -1, gen);
         /* for folio_rotate_reclaimable() */
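To make the distribution table in the lru_gen_folio_seq() hunk above concrete, here is a small user-space model of its seq selection; max_seq = 10 and min_seq = 7 are arbitrary example values, and the dirty/writeback special case is omitted.

#include <stdio.h>

#define MIN_NR_GENS     2UL
#define MAX_NR_GENS     4UL

/* mirrors the seq selection above, minus the dirty/writeback case */
static unsigned long folio_seq(unsigned long max_seq, unsigned long min_seq,
                               int active, int workingset, int reclaiming)
{
        unsigned long gen;

        if (active)                     /* accessed through page tables */
                gen = MIN_NR_GENS - workingset;
        else if (reclaiming)
                gen = MAX_NR_GENS;
        else                            /* accessed through file descriptors */
                gen = MAX_NR_GENS - workingset;

        return max_seq - gen + 1 > min_seq ? max_seq - gen + 1 : min_seq;
}

int main(void)
{
        /* prints 10 9 8 7: youngest, second youngest, second oldest, oldest */
        printf("%lu %lu %lu %lu\n",
               folio_seq(10, 7, 1, 1, 0), folio_seq(10, 7, 1, 0, 0),
               folio_seq(10, 7, 0, 1, 0), folio_seq(10, 7, 0, 0, 0));
        return 0;
}

With four live generations 7..10, page-table-accessed (PG_active) folios land in the two youngest generations and file-descriptor-accessed folios in the two oldest, with PG_workingset selecting the younger of each pair.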
include/linux/mmzone.h

@@ -332,66 +332,88 @@ enum lruvec_flags {
 #endif /* !__GENERATING_BOUNDS_H */
 
 /*
- * Evictable pages are divided into multiple generations. The youngest and the
+ * Evictable folios are divided into multiple generations. The youngest and the
  * oldest generation numbers, max_seq and min_seq, are monotonically increasing.
  * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An
  * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the
  * corresponding generation. The gen counter in folio->flags stores gen+1 while
- * a page is on one of lrugen->folios[]. Otherwise it stores 0.
+ * a folio is on one of lrugen->folios[]. Otherwise it stores 0.
  *
- * A page is added to the youngest generation on faulting. The aging needs to
- * check the accessed bit at least twice before handing this page over to the
- * eviction. The first check takes care of the accessed bit set on the initial
- * fault; the second check makes sure this page hasn't been used since then.
- * This process, AKA second chance, requires a minimum of two generations,
- * hence MIN_NR_GENS. And to maintain ABI compatibility with the active/inactive
- * LRU, e.g., /proc/vmstat, these two generations are considered active; the
- * rest of generations, if they exist, are considered inactive. See
- * lru_gen_is_active().
+ * After a folio is faulted in, the aging needs to check the accessed bit at
+ * least twice before handing this folio over to the eviction. The first check
+ * clears the accessed bit from the initial fault; the second check makes sure
+ * this folio hasn't been used since then. This process, AKA second chance,
+ * requires a minimum of two generations, hence MIN_NR_GENS. And to maintain ABI
+ * compatibility with the active/inactive LRU, e.g., /proc/vmstat, these two
+ * generations are considered active; the rest of generations, if they exist,
+ * are considered inactive. See lru_gen_is_active().
  *
- * PG_active is always cleared while a page is on one of lrugen->folios[] so
- * that the aging needs not to worry about it. And it's set again when a page
- * considered active is isolated for non-reclaiming purposes, e.g., migration.
- * See lru_gen_add_folio() and lru_gen_del_folio().
+ * PG_active is always cleared while a folio is on one of lrugen->folios[] so
+ * that the sliding window needs not to worry about it. And it's set again when
+ * a folio considered active is isolated for non-reclaiming purposes, e.g.,
+ * migration. See lru_gen_add_folio() and lru_gen_del_folio().
  *
  * MAX_NR_GENS is set to 4 so that the multi-gen LRU can support twice the
  * number of categories of the active/inactive LRU when keeping track of
  * accesses through page tables. This requires order_base_2(MAX_NR_GENS+1) bits
- * in folio->flags.
+ * in folio->flags, masked by LRU_GEN_MASK.
  */
 #define MIN_NR_GENS             2U
 #define MAX_NR_GENS             4U
 
 /*
- * Each generation is divided into multiple tiers. A page accessed N times
- * through file descriptors is in tier order_base_2(N). A page in the first tier
- * (N=0,1) is marked by PG_referenced unless it was faulted in through page
- * tables or read ahead. A page in any other tier (N>1) is marked by
- * PG_referenced and PG_workingset. This implies a minimum of two tiers is
- * supported without using additional bits in folio->flags.
+ * Each generation is divided into multiple tiers. A folio accessed N times
+ * through file descriptors is in tier order_base_2(N). A folio in the first
+ * tier (N=0,1) is marked by PG_referenced unless it was faulted in through page
+ * tables or read ahead. A folio in the last tier (MAX_NR_TIERS-1) is marked by
+ * PG_workingset. A folio in any other tier (1<N<5) between the first and last
+ * is marked by additional bits of LRU_REFS_WIDTH in folio->flags.
  *
  * In contrast to moving across generations which requires the LRU lock, moving
  * across tiers only involves atomic operations on folio->flags and therefore
  * has a negligible cost in the buffered access path. In the eviction path,
- * comparisons of refaulted/(evicted+protected) from the first tier and the
- * rest infer whether pages accessed multiple times through file descriptors
- * are statistically hot and thus worth protecting.
+ * comparisons of refaulted/(evicted+protected) from the first tier and the rest
+ * infer whether folios accessed multiple times through file descriptors are
+ * statistically hot and thus worth protecting.
  *
  * MAX_NR_TIERS is set to 4 so that the multi-gen LRU can support twice the
  * number of categories of the active/inactive LRU when keeping track of
  * accesses through file descriptors. This uses MAX_NR_TIERS-2 spare bits in
- * folio->flags.
+ * folio->flags, masked by LRU_REFS_MASK.
  */
 #define MAX_NR_TIERS            4U
 
 #ifndef __GENERATING_BOUNDS_H
 
-struct lruvec;
-struct page_vma_mapped_walk;
-
 #define LRU_GEN_MASK            ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
 #define LRU_REFS_MASK           ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
 
+/*
+ * For folios accessed multiple times through file descriptors,
+ * lru_gen_inc_refs() sets additional bits of LRU_REFS_WIDTH in folio->flags
+ * after PG_referenced, then PG_workingset after LRU_REFS_WIDTH. After all its
+ * bits are set, i.e., LRU_REFS_FLAGS|BIT(PG_workingset), a folio is lazily
+ * promoted into the second oldest generation in the eviction path. And when
+ * folio_inc_gen() does that, it clears LRU_REFS_FLAGS so that
+ * lru_gen_inc_refs() can start over. Note that for this case, LRU_REFS_MASK is
+ * only valid when PG_referenced is set.
+ *
+ * For folios accessed multiple times through page tables, folio_update_gen()
+ * from a page table walk or lru_gen_set_refs() from a rmap walk sets
+ * PG_referenced after the accessed bit is cleared for the first time.
+ * Thereafter, those two paths set PG_workingset and promote folios to the
+ * youngest generation. Like folio_inc_gen(), folio_update_gen() also clears
+ * PG_referenced. Note that for this case, LRU_REFS_MASK is not used.
+ *
+ * For both cases above, after PG_workingset is set on a folio, it remains until
+ * this folio is either reclaimed, or "deactivated" by lru_gen_clear_refs(). It
+ * can be set again if lru_gen_test_recent() returns true upon a refault.
+ */
+#define LRU_REFS_FLAGS          (LRU_REFS_MASK | BIT(PG_referenced))
+
+struct lruvec;
+struct page_vma_mapped_walk;
+
 #ifdef CONFIG_LRU_GEN
 
 enum {
@@ -406,8 +428,6 @@ enum {
         NR_LRU_GEN_CAPS
 };
 
-#define LRU_REFS_FLAGS          (BIT(PG_referenced) | BIT(PG_workingset))
-
 #define MIN_LRU_BATCH           BITS_PER_LONG
 #define MAX_LRU_BATCH           (MIN_LRU_BATCH * 64)
 
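A standalone model of the tier mapping documented in the MAX_NR_TIERS comment above; LRU_REFS_WIDTH == 2 (the MAX_NR_TIERS-2 spare bits) is an assumption, and order_base_2() is reimplemented here only for the small values used.

#include <stdio.h>

#define LRU_REFS_WIDTH  2               /* assumed: MAX_NR_TIERS - 2 spare bits */
#define MAX_NR_TIERS    4

/* ceil(log2(n)), adequate for the small values used here */
static int order_base_2(int n)
{
        int order = 0;

        while ((1 << order) < n)
                order++;
        return order;
}

/* mirrors the new lru_tier_from_refs(): PG_workingset pins the last tier */
static int tier(int refs, int workingset)
{
        return workingset ? MAX_NR_TIERS - 1 : order_base_2(refs);
}

int main(void)
{
        /* refs counts accesses through file descriptors, PG_referenced included */
        printf("%d %d %d %d %d\n",
               tier(0, 0), tier(1, 0), tier(2, 0), tier(4, 0), tier(4, 1));
        /* prints 0 0 1 2 3 */
        return 0;
}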
mm/swap.c (20 lines changed)

@@ -387,24 +387,20 @@ static void lru_gen_inc_refs(struct folio *folio)
         if (folio_test_unevictable(folio))
                 return;
 
+        /* see the comment on LRU_REFS_FLAGS */
         if (!folio_test_referenced(folio)) {
-                folio_set_referenced(folio);
+                set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced));
                 return;
         }
 
-        if (!folio_test_workingset(folio)) {
-                folio_set_workingset(folio);
-                return;
-        }
+        do {
+                if ((old_flags & LRU_REFS_MASK) == LRU_REFS_MASK) {
+                        if (!folio_test_workingset(folio))
+                                folio_set_workingset(folio);
+                        return;
+                }
 
-        /* see the comment on MAX_NR_TIERS */
-        do {
-                new_flags = old_flags & LRU_REFS_MASK;
-                if (new_flags == LRU_REFS_MASK)
-                        break;
-
-                new_flags += BIT(LRU_REFS_PGOFF);
-                new_flags |= old_flags & ~LRU_REFS_MASK;
+                new_flags = old_flags + BIT(LRU_REFS_PGOFF);
         } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
 }
 
@@ -417,7 +413,7 @@ static bool lru_gen_clear_refs(struct folio *folio)
         if (gen < 0)
                 return true;
 
-        set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0);
+        set_mask_bits(&folio->flags, LRU_REFS_FLAGS | BIT(PG_workingset), 0);
 
         lrugen = &folio_lruvec(folio)->lrugen;
         /* whether can do without shuffling under the LRU lock */
@@ -499,7 +495,7 @@ void folio_add_lru(struct folio *folio)
                         folio_test_unevictable(folio), folio);
         VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
 
-        /* see the comment in lru_gen_add_folio() */
+        /* see the comment in lru_gen_folio_seq() */
         if (lru_gen_enabled() && !folio_test_unevictable(folio) &&
             lru_gen_in_fault() && !(current->flags & PF_MEMALLOC))
                 folio_set_active(folio);
mm/vmscan.c (135 lines changed)

@@ -862,6 +862,31 @@ enum folio_references {
         FOLIOREF_ACTIVATE,
 };
 
+#ifdef CONFIG_LRU_GEN
+/*
+ * Only used on a mapped folio in the eviction (rmap walk) path, where promotion
+ * needs to be done by taking the folio off the LRU list and then adding it back
+ * with PG_active set. In contrast, the aging (page table walk) path uses
+ * folio_update_gen().
+ */
+static bool lru_gen_set_refs(struct folio *folio)
+{
+        /* see the comment on LRU_REFS_FLAGS */
+        if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) {
+                set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced));
+                return false;
+        }
+
+        set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_workingset));
+        return true;
+}
+#else
+static bool lru_gen_set_refs(struct folio *folio)
+{
+        return false;
+}
+#endif /* CONFIG_LRU_GEN */
+
 static enum folio_references folio_check_references(struct folio *folio,
                                                     struct scan_control *sc)
 {
@@ -870,7 +895,6 @@ static enum folio_references folio_check_references(struct folio *folio,
 
         referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup,
                                            &vm_flags);
-        referenced_folio = folio_test_clear_referenced(folio);
 
         /*
          * The supposedly reclaimable folio was found to be in a VM_LOCKED vma.
@@ -888,6 +912,15 @@ static enum folio_references folio_check_references(struct folio *folio,
         if (referenced_ptes == -1)
                 return FOLIOREF_KEEP;
 
+        if (lru_gen_enabled()) {
+                if (!referenced_ptes)
+                        return FOLIOREF_RECLAIM;
+
+                return lru_gen_set_refs(folio) ? FOLIOREF_ACTIVATE : FOLIOREF_KEEP;
+        }
+
+        referenced_folio = folio_test_clear_referenced(folio);
+
         if (referenced_ptes) {
                 /*
                  * All mapped folios start out with page table
@@ -1092,11 +1125,6 @@ retry:
                 if (!sc->may_unmap && folio_mapped(folio))
                         goto keep_locked;
 
-                /* folio_update_gen() tried to promote this page? */
-                if (lru_gen_enabled() && !ignore_references &&
-                    folio_mapped(folio) && folio_test_referenced(folio))
-                        goto keep_locked;
-
                 /*
                  * The number of dirty pages determines if a node is marked
                  * reclaim_congested. kswapd will stall and start writing
@@ -3164,16 +3192,19 @@ static int folio_update_gen(struct folio *folio, int gen)
 
         VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
 
-        do {
-                /* lru_gen_del_folio() has isolated this page? */
-                if (!(old_flags & LRU_GEN_MASK)) {
-                        /* for shrink_folio_list() */
-                        new_flags = old_flags | BIT(PG_referenced);
-                        continue;
-                }
+        /* see the comment on LRU_REFS_FLAGS */
+        if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) {
+                set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced));
+                return -1;
+        }
 
-                new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
-                new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
+        do {
+                /* lru_gen_del_folio() has isolated this page? */
+                if (!(old_flags & LRU_GEN_MASK))
+                        return -1;
+
+                new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS);
+                new_flags |= ((gen + 1UL) << LRU_GEN_PGOFF) | BIT(PG_workingset);
         } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
 
         return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
@@ -3197,7 +3228,7 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai
 
         new_gen = (old_gen + 1) % MAX_NR_GENS;
 
-        new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
+        new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS);
         new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF;
         /* for folio_end_writeback() */
         if (reclaiming)
@@ -3375,9 +3406,11 @@ static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned
 static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
                                    struct pglist_data *pgdat)
 {
-        struct folio *folio;
+        struct folio *folio = pfn_folio(pfn);
+
+        if (folio_lru_gen(folio) < 0)
+                return NULL;
 
-        folio = pfn_folio(pfn);
         if (folio_nid(folio) != pgdat->node_id)
                 return NULL;
 
@@ -3754,8 +3787,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, int swappiness)
         while (!list_empty(head)) {
                 struct folio *folio = lru_to_folio(head);
                 int refs = folio_lru_refs(folio);
-                int tier = lru_tier_from_refs(refs);
-                int delta = folio_nr_pages(folio);
+                bool workingset = folio_test_workingset(folio);
 
                 VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
                 VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
@@ -3765,8 +3797,14 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, int swappiness)
                 new_gen = folio_inc_gen(lruvec, folio, false);
                 list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]);
 
-                WRITE_ONCE(lrugen->protected[hist][type][tier],
-                           lrugen->protected[hist][type][tier] + delta);
+                /* don't count the workingset being lazily promoted */
+                if (refs + workingset != BIT(LRU_REFS_WIDTH) + 1) {
+                        int tier = lru_tier_from_refs(refs, workingset);
+                        int delta = folio_nr_pages(folio);
+
+                        WRITE_ONCE(lrugen->protected[hist][type][tier],
+                                   lrugen->protected[hist][type][tier] + delta);
+                }
 
                 if (!--remaining)
                         return false;
@@ -4152,15 +4190,9 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
                         old_gen = folio_update_gen(folio, new_gen);
                         if (old_gen >= 0 && old_gen != new_gen)
                                 update_batch_size(walk, folio, old_gen, new_gen);
-
-                        continue;
-                }
-
-                old_gen = folio_lru_gen(folio);
-                if (old_gen < 0)
-                        folio_set_referenced(folio);
-                else if (old_gen != new_gen) {
-                        folio_clear_lru_refs(folio);
+                } else if (lru_gen_set_refs(folio)) {
+                        old_gen = folio_lru_gen(folio);
+                        if (old_gen >= 0 && old_gen != new_gen)
                                 folio_activate(folio);
                 }
         }
@@ -4322,7 +4354,8 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
         int zone = folio_zonenum(folio);
         int delta = folio_nr_pages(folio);
         int refs = folio_lru_refs(folio);
-        int tier = lru_tier_from_refs(refs);
+        bool workingset = folio_test_workingset(folio);
+        int tier = lru_tier_from_refs(refs, workingset);
         struct lru_gen_folio *lrugen = &lruvec->lrugen;
 
         VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio);
@@ -4344,14 +4377,17 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
         }
 
         /* protected */
-        if (tier > tier_idx || refs == BIT(LRU_REFS_WIDTH)) {
-                int hist = lru_hist_from_seq(lrugen->min_seq[type]);
-
+        if (tier > tier_idx || refs + workingset == BIT(LRU_REFS_WIDTH) + 1) {
                 gen = folio_inc_gen(lruvec, folio, false);
-                list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
+                list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
 
-                WRITE_ONCE(lrugen->protected[hist][type][tier],
-                           lrugen->protected[hist][type][tier] + delta);
+                /* don't count the workingset being lazily promoted */
+                if (refs + workingset != BIT(LRU_REFS_WIDTH) + 1) {
+                        int hist = lru_hist_from_seq(lrugen->min_seq[type]);
+
+                        WRITE_ONCE(lrugen->protected[hist][type][tier],
+                                   lrugen->protected[hist][type][tier] + delta);
+                }
                 return true;
         }
 
@@ -4371,8 +4407,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
         }
 
         /* waiting for writeback */
-        if (folio_test_locked(folio) || writeback ||
-            (type == LRU_GEN_FILE && dirty)) {
+        if (writeback || (type == LRU_GEN_FILE && dirty)) {
                 gen = folio_inc_gen(lruvec, folio, true);
                 list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
                 return true;
@@ -4401,13 +4436,12 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca
                 return false;
         }
 
-        /* see the comment on MAX_NR_TIERS */
+        /* see the comment on LRU_REFS_FLAGS */
         if (!folio_test_referenced(folio))
-                folio_clear_lru_refs(folio);
+                set_mask_bits(&folio->flags, LRU_REFS_MASK, 0);
 
         /* for shrink_folio_list() */
         folio_clear_reclaim(folio);
-        folio_clear_referenced(folio);
 
         success = lru_gen_del_folio(lruvec, folio, true);
         VM_WARN_ON_ONCE_FOLIO(!success, folio);
@@ -4597,31 +4631,24 @@ retry:
                                        type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON);
 
         list_for_each_entry_safe_reverse(folio, next, &list, lru) {
+                DEFINE_MIN_SEQ(lruvec);
+
                 if (!folio_evictable(folio)) {
                         list_del(&folio->lru);
                         folio_putback_lru(folio);
                         continue;
                 }
 
-                if (folio_test_reclaim(folio) &&
-                    (folio_test_dirty(folio) || folio_test_writeback(folio))) {
-                        /* restore LRU_REFS_FLAGS cleared by isolate_folio() */
-                        if (folio_test_workingset(folio))
-                                folio_set_referenced(folio);
-                        continue;
-                }
-
-                if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) ||
-                    folio_mapped(folio) || folio_test_locked(folio) ||
-                    folio_test_dirty(folio) || folio_test_writeback(folio)) {
-                        /* don't add rejected folios to the oldest generation */
-                        set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS,
-                                      BIT(PG_active));
-                        continue;
-                }
-
                 /* retry folios that may have missed folio_rotate_reclaimable() */
-                list_move(&folio->lru, &clean);
+                if (!skip_retry && !folio_test_active(folio) && !folio_mapped(folio) &&
+                    !folio_test_dirty(folio) && !folio_test_writeback(folio)) {
+                        list_move(&folio->lru, &clean);
+                        continue;
+                }
+
+                /* don't add rejected folios to the oldest generation */
+                if (lru_gen_folio_seq(lruvec, folio, false) == min_seq[type])
+                        set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_active));
         }
 
         spin_lock_irq(&lruvec->lru_lock);
mm/workingset.c

@@ -239,7 +239,8 @@ static void *lru_gen_eviction(struct folio *folio)
         int type = folio_is_file_lru(folio);
         int delta = folio_nr_pages(folio);
         int refs = folio_lru_refs(folio);
-        int tier = lru_tier_from_refs(refs);
+        bool workingset = folio_test_workingset(folio);
+        int tier = lru_tier_from_refs(refs, workingset);
         struct mem_cgroup *memcg = folio_memcg(folio);
         struct pglist_data *pgdat = folio_pgdat(folio);
 
@@ -253,7 +254,7 @@ static void *lru_gen_eviction(struct folio *folio)
         hist = lru_hist_from_seq(min_seq);
         atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);
 
-        return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs);
+        return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset);
 }
 
 /*
@@ -304,24 +305,20 @@ static void lru_gen_refault(struct folio *folio, void *shadow)
         lrugen = &lruvec->lrugen;
 
         hist = lru_hist_from_seq(READ_ONCE(lrugen->min_seq[type]));
-        /* see the comment in folio_lru_refs() */
-        refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + workingset;
-        tier = lru_tier_from_refs(refs);
+        refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + 1;
+        tier = lru_tier_from_refs(refs, workingset);
 
         atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);
-        mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
 
-        /*
-         * Count the following two cases as stalls:
-         * 1. For pages accessed through page tables, hotter pages pushed out
-         *    hot pages which refaulted immediately.
-         * 2. For pages accessed multiple times through file descriptors,
-         *    they would have been protected by sort_folio().
-         */
-        if (lru_gen_in_fault() || refs >= BIT(LRU_REFS_WIDTH) - 1) {
-                set_mask_bits(&folio->flags, 0, LRU_REFS_MASK | BIT(PG_workingset));
+        /* see folio_add_lru() where folio_set_active() will be called */
+        if (lru_gen_in_fault())
+                mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
+
+        if (workingset) {
+                folio_set_workingset(folio);
                 mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
-        }
+        } else
+                set_mask_bits(&folio->flags, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF);
 unlock:
         rcu_read_unlock();
 }
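A standalone sketch of the flag restoration performed by lru_gen_refault() in the hunk above; the flag bit positions and LRU_REFS_WIDTH value are illustrative assumptions, and only the token's low bits, as used in the code above, are modeled.

#include <stdbool.h>
#include <stdio.h>

#define LRU_REFS_WIDTH  2                       /* assumed */
#define PG_workingset   (1UL << 3)              /* illustrative bit position */
#define LRU_REFS_PGOFF  4
#define LRU_REFS_MASK   (((1UL << LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)

/* mirrors the tail of the refault path above: restore either PG_workingset
 * (counted as WORKINGSET_RESTORE) or the remembered reference count, not both
 */
static unsigned long refault_flags(unsigned long token, bool workingset)
{
        unsigned long flags = 0;
        unsigned long refs = (token & ((1UL << LRU_REFS_WIDTH) - 1)) + 1;

        if (workingset)
                flags |= PG_workingset;
        else
                flags |= (refs - 1UL) << LRU_REFS_PGOFF;

        return flags;
}

int main(void)
{
        /* token low bits 2 -> refs 3: LRU_REFS field holds 2 (0x20), or PG_workingset (0x8) */
        printf("%#lx %#lx\n", refault_flags(2, false), refault_flags(2, true));
        return 0;
}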