mm: multi-gen LRU: section for Bloom filters

Move Bloom filters code into a dedicated section.  Improve the design doc
to explain Bloom filter usage and connection between aging and eviction in
their use.

Link: https://lkml.kernel.org/r/20230118001827.1040870-4-talumbau@google.com
Signed-off-by: T.J. Alumbaugh <talumbau@google.com>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
This commit is contained in:
T.J. Alumbaugh 2023-01-18 00:18:23 +00:00 committed by Andrew Morton
parent db19a43d9b
commit ccbbbb8594
2 changed files with 108 additions and 88 deletions

View File

@ -170,6 +170,22 @@ promotes hot pages. If the scan was done cacheline efficiently, it
adds the PMD entry pointing to the PTE table to the Bloom filter. This
forms a feedback loop between the eviction and the aging.
Bloom Filters
-------------
Bloom filters are a space and memory efficient data structure for set
membership test, i.e., test if an element is not in the set or may be
in the set.
In the eviction path, specifically, in ``lru_gen_look_around()``, if a
PMD has a sufficient number of hot pages, its address is placed in the
filter. In the aging path, set membership means that the PTE range
will be scanned for young pages.
Note that Bloom filters are probabilistic on set membership. If a test
is false positive, the cost is an additional scan of a range of PTEs,
which may yield hot pages anyway. Parameters of the filter itself can
control the false positive rate in the limit.
Summary
-------
The multi-gen LRU can be disassembled into the following parts:

View File

@ -3233,6 +3233,98 @@ static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
}
/******************************************************************************
* Bloom filters
******************************************************************************/
/*
* Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when
* n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of
* bits in a bitmap, k is the number of hash functions and n is the number of
* inserted items.
*
* Page table walkers use one of the two filters to reduce their search space.
* To get rid of non-leaf entries that no longer have enough leaf entries, the
* aging uses the double-buffering technique to flip to the other filter each
* time it produces a new generation. For non-leaf entries that have enough
* leaf entries, the aging carries them over to the next generation in
* walk_pmd_range(); the eviction also report them when walking the rmap
* in lru_gen_look_around().
*
* For future optimizations:
* 1. It's not necessary to keep both filters all the time. The spare one can be
* freed after the RCU grace period and reallocated if needed again.
* 2. And when reallocating, it's worth scaling its size according to the number
* of inserted entries in the other filter, to reduce the memory overhead on
* small systems and false positives on large systems.
* 3. Jenkins' hash function is an alternative to Knuth's.
*/
#define BLOOM_FILTER_SHIFT 15
static inline int filter_gen_from_seq(unsigned long seq)
{
return seq % NR_BLOOM_FILTERS;
}
static void get_item_key(void *item, int *key)
{
u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2);
BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32));
key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1);
key[1] = hash >> BLOOM_FILTER_SHIFT;
}
static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
{
int key[2];
unsigned long *filter;
int gen = filter_gen_from_seq(seq);
filter = READ_ONCE(lruvec->mm_state.filters[gen]);
if (!filter)
return true;
get_item_key(item, key);
return test_bit(key[0], filter) && test_bit(key[1], filter);
}
static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
{
int key[2];
unsigned long *filter;
int gen = filter_gen_from_seq(seq);
filter = READ_ONCE(lruvec->mm_state.filters[gen]);
if (!filter)
return;
get_item_key(item, key);
if (!test_bit(key[0], filter))
set_bit(key[0], filter);
if (!test_bit(key[1], filter))
set_bit(key[1], filter);
}
static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq)
{
unsigned long *filter;
int gen = filter_gen_from_seq(seq);
filter = lruvec->mm_state.filters[gen];
if (filter) {
bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT));
return;
}
filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT),
__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
WRITE_ONCE(lruvec->mm_state.filters[gen], filter);
}
/******************************************************************************
* mm_struct list
******************************************************************************/
@ -3352,94 +3444,6 @@ void lru_gen_migrate_mm(struct mm_struct *mm)
}
#endif
/*
* Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when
* n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of
* bits in a bitmap, k is the number of hash functions and n is the number of
* inserted items.
*
* Page table walkers use one of the two filters to reduce their search space.
* To get rid of non-leaf entries that no longer have enough leaf entries, the
* aging uses the double-buffering technique to flip to the other filter each
* time it produces a new generation. For non-leaf entries that have enough
* leaf entries, the aging carries them over to the next generation in
* walk_pmd_range(); the eviction also report them when walking the rmap
* in lru_gen_look_around().
*
* For future optimizations:
* 1. It's not necessary to keep both filters all the time. The spare one can be
* freed after the RCU grace period and reallocated if needed again.
* 2. And when reallocating, it's worth scaling its size according to the number
* of inserted entries in the other filter, to reduce the memory overhead on
* small systems and false positives on large systems.
* 3. Jenkins' hash function is an alternative to Knuth's.
*/
#define BLOOM_FILTER_SHIFT 15
static inline int filter_gen_from_seq(unsigned long seq)
{
return seq % NR_BLOOM_FILTERS;
}
static void get_item_key(void *item, int *key)
{
u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2);
BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32));
key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1);
key[1] = hash >> BLOOM_FILTER_SHIFT;
}
static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq)
{
unsigned long *filter;
int gen = filter_gen_from_seq(seq);
filter = lruvec->mm_state.filters[gen];
if (filter) {
bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT));
return;
}
filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT),
__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
WRITE_ONCE(lruvec->mm_state.filters[gen], filter);
}
static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
{
int key[2];
unsigned long *filter;
int gen = filter_gen_from_seq(seq);
filter = READ_ONCE(lruvec->mm_state.filters[gen]);
if (!filter)
return;
get_item_key(item, key);
if (!test_bit(key[0], filter))
set_bit(key[0], filter);
if (!test_bit(key[1], filter))
set_bit(key[1], filter);
}
static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
{
int key[2];
unsigned long *filter;
int gen = filter_gen_from_seq(seq);
filter = READ_ONCE(lruvec->mm_state.filters[gen]);
if (!filter)
return true;
get_item_key(item, key);
return test_bit(key[0], filter) && test_bit(key[1], filter);
}
static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last)
{
int i;