Merge branch 'mm-hotfixes-stable' into mm-stable

Pick up e7ac4daeed ("mm: count zeromap read and set for swapout and
swapin") in order to move

mm: define obj_cgroup_get() if CONFIG_MEMCG is not defined
mm: zswap: modify zswap_compress() to accept a page instead of a folio
mm: zswap: rename zswap_pool_get() to zswap_pool_tryget()
mm: zswap: modify zswap_stored_pages to be atomic_long_t
mm: zswap: support large folios in zswap_store()
mm: swap: count successful large folio zswap stores in hugepage zswpout stats
mm: zswap: zswap_store_page() will initialize entry after adding to xarray.
mm: add per-order mTHP swpin counters

from mm-unstable into mm-stable.
Andrew Morton 2024-11-11 00:04:10 -08:00
commit 2ec0859039
21 changed files with 127 additions and 55 deletions


@ -666,6 +666,7 @@ Tomeu Vizoso <tomeu@tomeuvizoso.net> <tomeu.vizoso@collabora.com>
Thomas Graf <tgraf@suug.ch>
Thomas Körper <socketcan@esd.eu> <thomas.koerper@esd.eu>
Thomas Pedersen <twp@codeaurora.org>
Thorsten Blum <thorsten.blum@linux.dev> <thorsten.blum@toblux.com>
Tiezhu Yang <yangtiezhu@loongson.cn> <kernelpatch@126.com>
Tingwei Zhang <quic_tingwei@quicinc.com> <tingwei@codeaurora.org>
Tirupathi Reddy <quic_tirupath@quicinc.com> <tirupath@codeaurora.org>


@ -1599,6 +1599,15 @@ The following nested keys are defined.
pglazyfreed (npn)
Amount of reclaimed lazyfree pages
swpin_zero
Number of pages swapped into memory and filled with zero, where I/O
was optimized out because the page content was detected to be zero
during swapout.
swpout_zero
Number of zero-filled pages swapped out with I/O skipped due to the
content being detected as zero.
zswpin
Number of pages moved in to memory from zswap.
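
For reference, with this series applied (and swap enabled), each cgroup's memory.stat simply gains two more event lines next to the existing swap/zswap counters. A hypothetical excerpt (counter values are illustrative only) would read:

    pglazyfreed 0
    swpin_zero 1024
    swpout_zero 4096
    zswpin 8192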


@ -6688,7 +6688,7 @@
0: no polling (default)
thp_anon= [KNL]
Format: <size>,<size>[KMG]:<state>;<size>-<size>[KMG]:<state>
Format: <size>[KMG],<size>[KMG]:<state>;<size>[KMG]-<size>[KMG]:<state>
state is one of "always", "madvise", "never" or "inherit".
Control the default behavior of the system with respect
to anonymous transparent hugepages.


@ -303,7 +303,7 @@ control by passing the parameter ``transparent_hugepage=always`` or
kernel command line.
Alternatively, each supported anonymous THP size can be controlled by
passing ``thp_anon=<size>,<size>[KMG]:<state>;<size>-<size>[KMG]:<state>``,
passing ``thp_anon=<size>[KMG],<size>[KMG]:<state>;<size>[KMG]-<size>[KMG]:<state>``,
where ``<size>`` is the THP size (must be a power of 2 of PAGE_SIZE and
supported anonymous THP) and ``<state>`` is one of ``always``, ``madvise``,
``never`` or ``inherit``.
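
As an illustration of the corrected format (this example is not part of the patch, and it assumes the listed sizes are all supported anonymous THP sizes on the running system), one could boot with:

    thp_anon=16K-64K:always;128K,512K:inherit;256K:madvise;1M-2M:never

which always enables 16K-64K THPs, lets 128K and 512K follow the global setting, allows 256K only under madvise, and disables 1M-2M entirely.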


@ -2036,8 +2036,7 @@ static int ocfs2_xa_remove(struct ocfs2_xa_loc *loc,
rc = 0;
ocfs2_xa_cleanup_value_truncate(loc, "removing",
orig_clusters);
if (rc)
goto out;
goto out;
}
}


@ -457,10 +457,6 @@ static vm_fault_t mmap_vmcore_fault(struct vm_fault *vmf)
#endif
}
static const struct vm_operations_struct vmcore_mmap_ops = {
.fault = mmap_vmcore_fault,
};
/**
* vmcore_alloc_buf - allocate buffer in vmalloc memory
* @size: size of buffer
@ -488,6 +484,11 @@ static inline char *vmcore_alloc_buf(size_t size)
* virtually contiguous user-space in ELF layout.
*/
#ifdef CONFIG_MMU
static const struct vm_operations_struct vmcore_mmap_ops = {
.fault = mmap_vmcore_fault,
};
/*
* remap_oldmem_pfn_checked - do remap_oldmem_pfn_range replacing all pages
* reported as not being ram with the zero page.


@ -1711,8 +1711,9 @@ static inline int memcg_kmem_id(struct mem_cgroup *memcg)
struct mem_cgroup *mem_cgroup_from_slab_obj(void *p);
static inline void count_objcg_event(struct obj_cgroup *objcg,
enum vm_event_item idx)
static inline void count_objcg_events(struct obj_cgroup *objcg,
enum vm_event_item idx,
unsigned long count)
{
struct mem_cgroup *memcg;
@ -1721,7 +1722,7 @@ static inline void count_objcg_event(struct obj_cgroup *objcg,
rcu_read_lock();
memcg = obj_cgroup_memcg(objcg);
count_memcg_events(memcg, idx, 1);
count_memcg_events(memcg, idx, count);
rcu_read_unlock();
}
@ -1776,8 +1777,9 @@ static inline struct mem_cgroup *mem_cgroup_from_slab_obj(void *p)
return NULL;
}
static inline void count_objcg_event(struct obj_cgroup *objcg,
enum vm_event_item idx)
static inline void count_objcg_events(struct obj_cgroup *objcg,
enum vm_event_item idx,
unsigned long count)
{
}


@ -825,6 +825,7 @@ struct zone {
unsigned long watermark_boost;
unsigned long nr_reserved_highatomic;
unsigned long nr_free_highatomic;
/*
* We don't know if the memory that we're going to allocate will be


@ -141,7 +141,8 @@ static inline long get_rlimit_value(struct ucounts *ucounts, enum rlimit_type ty
long inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v);
bool dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v);
long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type);
long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type,
bool override_rlimit);
void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type);
bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long max);


@ -134,6 +134,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
#ifdef CONFIG_SWAP
SWAP_RA,
SWAP_RA_HIT,
SWPIN_ZERO,
SWPOUT_ZERO,
#ifdef CONFIG_KSM
KSM_SWPIN_COPY,
#endif


@ -419,7 +419,8 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
*/
rcu_read_lock();
ucounts = task_ucounts(t);
sigpending = inc_rlimit_get_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING);
sigpending = inc_rlimit_get_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING,
override_rlimit);
rcu_read_unlock();
if (!sigpending)
return NULL;


@ -307,7 +307,8 @@ void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type)
do_dec_rlimit_put_ucounts(ucounts, NULL, type);
}
long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type)
long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type,
bool override_rlimit)
{
/* Caller must hold a reference to ucounts */
struct ucounts *iter;
@ -317,10 +318,11 @@ long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type)
for (iter = ucounts; iter; iter = iter->ns->ucounts) {
long new = atomic_long_add_return(1, &iter->rlimit[type]);
if (new < 0 || new > max)
goto unwind;
goto dec_unwind;
if (iter == ucounts)
ret = new;
max = get_userns_rlimit_max(iter->ns, type);
if (!override_rlimit)
max = get_userns_rlimit_max(iter->ns, type);
/*
* Grab an extra ucount reference for the caller when
* the rlimit count was previously 0.
@ -334,7 +336,6 @@ long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type)
dec_unwind:
dec = atomic_long_sub_return(1, &iter->rlimit[type]);
WARN_ON_ONCE(dec < 0);
unwind:
do_dec_rlimit_put_ucounts(ucounts, iter, type);
return 0;
}


@ -74,15 +74,21 @@ objpool_init_percpu_slots(struct objpool_head *pool, int nr_objs,
* warm caches and TLB hits. in default vmalloc is used to
* reduce the pressure of kernel slab system. as we know,
* mimimal size of vmalloc is one page since vmalloc would
* always align the requested size to page size
* always align the requested size to page size.
* but if vmalloc fails or it is not available (e.g. GFP_ATOMIC)
* allocate percpu slot with kmalloc.
*/
if ((pool->gfp & GFP_ATOMIC) == GFP_ATOMIC)
slot = kmalloc_node(size, pool->gfp, cpu_to_node(i));
else
slot = NULL;
if ((pool->gfp & (GFP_ATOMIC | GFP_KERNEL)) != GFP_ATOMIC)
slot = __vmalloc_node(size, sizeof(void *), pool->gfp,
cpu_to_node(i), __builtin_return_address(0));
if (!slot)
return -ENOMEM;
if (!slot) {
slot = kmalloc_node(size, pool->gfp, cpu_to_node(i));
if (!slot)
return -ENOMEM;
}
memset(slot, 0, size);
pool->cpu_slots[i] = slot;


@ -1412,7 +1412,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c,
damon_for_each_scheme(s, c) {
struct damos_quota *quota = &s->quota;
if (c->passed_sample_intervals != s->next_apply_sis)
if (c->passed_sample_intervals < s->next_apply_sis)
continue;
if (!s->wmarks.activated)
@ -1456,17 +1456,31 @@ static unsigned long damon_feed_loop_next_input(unsigned long last_input,
unsigned long score)
{
const unsigned long goal = 10000;
unsigned long score_goal_diff = max(goal, score) - min(goal, score);
unsigned long score_goal_diff_bp = score_goal_diff * 10000 / goal;
unsigned long compensation = last_input * score_goal_diff_bp / 10000;
/* Set minimum input as 10000 to avoid compensation be zero */
const unsigned long min_input = 10000;
unsigned long score_goal_diff, compensation;
bool over_achieving = score > goal;
if (goal > score)
if (score == goal)
return last_input;
if (score >= goal * 2)
return min_input;
if (over_achieving)
score_goal_diff = score - goal;
else
score_goal_diff = goal - score;
if (last_input < ULONG_MAX / score_goal_diff)
compensation = last_input * score_goal_diff / goal;
else
compensation = last_input / goal * score_goal_diff;
if (over_achieving)
return max(last_input - compensation, min_input);
if (last_input < ULONG_MAX - compensation)
return last_input + compensation;
if (last_input > compensation + min_input)
return last_input - compensation;
return min_input;
return ULONG_MAX;
}
#ifdef CONFIG_PSI
@ -1622,7 +1636,7 @@ static void kdamond_apply_schemes(struct damon_ctx *c)
bool has_schemes_to_apply = false;
damon_for_each_scheme(s, c) {
if (c->passed_sample_intervals != s->next_apply_sis)
if (c->passed_sample_intervals < s->next_apply_sis)
continue;
if (!s->wmarks.activated)
@ -1642,9 +1656,9 @@ static void kdamond_apply_schemes(struct damon_ctx *c)
}
damon_for_each_scheme(s, c) {
if (c->passed_sample_intervals != s->next_apply_sis)
if (c->passed_sample_intervals < s->next_apply_sis)
continue;
s->next_apply_sis +=
s->next_apply_sis = c->passed_sample_intervals +
(s->apply_interval_us ? s->apply_interval_us :
c->attrs.aggr_interval) / sample_interval;
}
@ -2000,7 +2014,7 @@ static int kdamond_fn(void *data)
if (ctx->ops.check_accesses)
max_nr_accesses = ctx->ops.check_accesses(ctx);
if (ctx->passed_sample_intervals == next_aggregation_sis) {
if (ctx->passed_sample_intervals >= next_aggregation_sis) {
kdamond_merge_regions(ctx,
max_nr_accesses / 10,
sz_limit);
@ -2018,7 +2032,7 @@ static int kdamond_fn(void *data)
sample_interval = ctx->attrs.sample_interval ?
ctx->attrs.sample_interval : 1;
if (ctx->passed_sample_intervals == next_aggregation_sis) {
if (ctx->passed_sample_intervals >= next_aggregation_sis) {
ctx->next_aggregation_sis = next_aggregation_sis +
ctx->attrs.aggr_interval / sample_interval;
@ -2028,7 +2042,7 @@ static int kdamond_fn(void *data)
ctx->ops.reset_aggregated(ctx);
}
if (ctx->passed_sample_intervals == next_ops_update_sis) {
if (ctx->passed_sample_intervals >= next_ops_update_sis) {
ctx->next_ops_update_sis = next_ops_update_sis +
ctx->attrs.ops_update_interval /
sample_interval;
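
To make the reworked proportional feedback in damon_feed_loop_next_input() above easier to follow, here is a minimal user-space sketch of the same arithmetic (an illustration only, not the kernel code; the helper name and the sample numbers are invented for this example):

/*
 * Standalone sketch of the reworked feedback logic. A score equal to the
 * goal (10000) means the target is exactly met; scores below/above the goal
 * grow/shrink the next input proportionally, with guards against unsigned
 * overflow.
 */
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

static unsigned long feed_loop_next_input(unsigned long last_input,
					  unsigned long score)
{
	const unsigned long goal = 10000;
	const unsigned long min_input = 10000;	/* keep compensation non-zero */
	unsigned long score_goal_diff, compensation, next;
	bool over_achieving = score > goal;

	if (score == goal)
		return last_input;
	if (score >= goal * 2)
		return min_input;

	score_goal_diff = over_achieving ? score - goal : goal - score;

	/* compensation = last_input * diff / goal, reordered if it would overflow */
	if (last_input < ULONG_MAX / score_goal_diff)
		compensation = last_input * score_goal_diff / goal;
	else
		compensation = last_input / goal * score_goal_diff;

	if (over_achieving) {
		next = last_input - compensation;
		return next > min_input ? next : min_input;
	}
	if (last_input < ULONG_MAX - compensation)
		return last_input + compensation;
	return ULONG_MAX;
}

int main(void)
{
	/* 20% under the goal -> input grows by ~20% (prints 1200000) */
	printf("%lu\n", feed_loop_next_input(1000000, 8000));
	/* 20% over the goal -> input shrinks by ~20% (prints 800000) */
	printf("%lu\n", feed_loop_next_input(1000000, 12000));
	return 0;
}

The guards mirror the hunk above: an exactly-met goal keeps the input unchanged, a score at or beyond twice the goal collapses to the minimum input, the multiplication is reordered when it would overflow, and an increase that would overflow saturates at ULONG_MAX.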


@ -438,6 +438,10 @@ static const unsigned int memcg_vm_event_stat[] = {
PGDEACTIVATE,
PGLAZYFREE,
PGLAZYFREED,
#ifdef CONFIG_SWAP
SWPIN_ZERO,
SWPOUT_ZERO,
#endif
#ifdef CONFIG_ZSWAP
ZSWPIN,
ZSWPOUT,


@ -725,14 +725,17 @@ static int apply_mlockall_flags(int flags)
}
for_each_vma(vmi, vma) {
int error;
vm_flags_t newflags;
newflags = vma->vm_flags & ~VM_LOCKED_MASK;
newflags |= to_add;
/* Ignore errors */
mlock_fixup(&vmi, vma, &prev, vma->vm_start, vma->vm_end,
newflags);
error = mlock_fixup(&vmi, vma, &prev, vma->vm_start, vma->vm_end,
newflags);
/* Ignore errors, but prev needs fixing up. */
if (error)
prev = vma;
cond_resched();
}
out:


@ -635,6 +635,8 @@ compaction_capture(struct capture_control *capc, struct page *page,
static inline void account_freepages(struct zone *zone, int nr_pages,
int migratetype)
{
lockdep_assert_held(&zone->lock);
if (is_migrate_isolate(migratetype))
return;
@ -642,6 +644,9 @@ static inline void account_freepages(struct zone *zone, int nr_pages,
if (is_migrate_cma(migratetype))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages);
else if (is_migrate_highatomic(migratetype))
WRITE_ONCE(zone->nr_free_highatomic,
zone->nr_free_highatomic + nr_pages);
}
/* Used for pages not on another list */
@ -3079,11 +3084,10 @@ static inline long __zone_watermark_unusable_free(struct zone *z,
/*
* If the caller does not have rights to reserves below the min
* watermark then subtract the high-atomic reserves. This will
* over-estimate the size of the atomic reserve but it avoids a search.
* watermark then subtract the free pages reserved for highatomic.
*/
if (likely(!(alloc_flags & ALLOC_RESERVES)))
unusable_free += z->nr_reserved_highatomic;
unusable_free += READ_ONCE(z->nr_free_highatomic);
#ifdef CONFIG_CMA
/* If allocation can't use CMA areas don't use free CMA pages */


@ -204,7 +204,9 @@ static bool is_folio_zero_filled(struct folio *folio)
static void swap_zeromap_folio_set(struct folio *folio)
{
struct obj_cgroup *objcg = get_obj_cgroup_from_folio(folio);
struct swap_info_struct *sis = swp_swap_info(folio->swap);
int nr_pages = folio_nr_pages(folio);
swp_entry_t entry;
unsigned int i;
@ -212,6 +214,12 @@ static void swap_zeromap_folio_set(struct folio *folio)
entry = page_swap_entry(folio_page(folio, i));
set_bit(swp_offset(entry), sis->zeromap);
}
count_vm_events(SWPOUT_ZERO, nr_pages);
if (objcg) {
count_objcg_events(objcg, SWPOUT_ZERO, nr_pages);
obj_cgroup_put(objcg);
}
}
static void swap_zeromap_folio_clear(struct folio *folio)
@ -505,6 +513,7 @@ static void sio_read_complete(struct kiocb *iocb, long ret)
static bool swap_read_folio_zeromap(struct folio *folio)
{
int nr_pages = folio_nr_pages(folio);
struct obj_cgroup *objcg;
bool is_zeromap;
/*
@ -519,6 +528,13 @@ static bool swap_read_folio_zeromap(struct folio *folio)
if (!is_zeromap)
return false;
objcg = get_obj_cgroup_from_folio(folio);
count_vm_events(SWPIN_ZERO, nr_pages);
if (objcg) {
count_objcg_events(objcg, SWPIN_ZERO, nr_pages);
obj_cgroup_put(objcg);
}
folio_zero_range(folio, 0, folio_size(folio));
folio_mark_uptodate(folio);
return true;


@ -1415,6 +1415,8 @@ const char * const vmstat_text[] = {
#ifdef CONFIG_SWAP
"swap_ra",
"swap_ra_hit",
"swpin_zero",
"swpout_zero",
#ifdef CONFIG_KSM
"ksm_swpin_copy",
#endif


@ -1053,7 +1053,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
count_vm_event(ZSWPWB);
if (entry->objcg)
count_objcg_event(entry->objcg, ZSWPWB);
count_objcg_events(entry->objcg, ZSWPWB, 1);
zswap_entry_free(entry);
@ -1483,7 +1483,7 @@ bool zswap_store(struct folio *folio)
if (objcg) {
obj_cgroup_charge_zswap(objcg, entry->length);
count_objcg_event(objcg, ZSWPOUT);
count_objcg_events(objcg, ZSWPOUT, 1);
}
/*
@ -1577,7 +1577,7 @@ bool zswap_load(struct folio *folio)
count_vm_event(ZSWPIN);
if (entry->objcg)
count_objcg_event(entry->objcg, ZSWPIN);
count_objcg_events(entry->objcg, ZSWPIN, 1);
if (swapcache) {
zswap_entry_free(entry);


@ -44,13 +44,6 @@ void run_dio_using_hugetlb(unsigned int start_off, unsigned int end_off)
if (fd < 0)
ksft_exit_fail_perror("Error opening file\n");
/* Get the free huge pages before allocation */
free_hpage_b = get_free_hugepages();
if (free_hpage_b == 0) {
close(fd);
ksft_exit_skip("No free hugepage, exiting!\n");
}
/* Allocate a hugetlb page */
orig_buffer = mmap(NULL, h_pagesize, mmap_prot, mmap_flags, -1, 0);
if (orig_buffer == MAP_FAILED) {
@ -94,8 +87,20 @@ void run_dio_using_hugetlb(unsigned int start_off, unsigned int end_off)
int main(void)
{
size_t pagesize = 0;
int fd;
ksft_print_header();
/* Open the file to DIO */
fd = open("/tmp", O_TMPFILE | O_RDWR | O_DIRECT, 0664);
if (fd < 0)
ksft_exit_skip("Unable to allocate file: %s\n", strerror(errno));
close(fd);
/* Check if huge pages are free */
if (!get_free_hugepages())
ksft_exit_skip("No free hugepage, exiting\n");
ksft_set_plan(4);
/* Get base page size */