// SPDX-License-Identifier: GPL-2.0 /* * DAMON Primitives for Virtual Address Spaces * * Author: SeongJae Park */ #define pr_fmt(fmt) "damon-va: " fmt #include #include #include #include #include #include #include #include "ops-common.h" #ifdef CONFIG_DAMON_VADDR_KUNIT_TEST #undef DAMON_MIN_REGION #define DAMON_MIN_REGION 1 #endif /* * 't->pid' should be the pointer to the relevant 'struct pid' having reference * count. Caller must put the returned task, unless it is NULL. */ static inline struct task_struct *damon_get_task_struct(struct damon_target *t) { return get_pid_task(t->pid, PIDTYPE_PID); } /* * Get the mm_struct of the given target * * Caller _must_ put the mm_struct after use, unless it is NULL. * * Returns the mm_struct of the target on success, NULL on failure */ static struct mm_struct *damon_get_mm(struct damon_target *t) { struct task_struct *task; struct mm_struct *mm; task = damon_get_task_struct(t); if (!task) return NULL; mm = get_task_mm(task); put_task_struct(task); return mm; } /* * Functions for the initial monitoring target regions construction */ /* * Size-evenly split a region into 'nr_pieces' small regions * * Returns 0 on success, or negative error code otherwise. */ static int damon_va_evenly_split_region(struct damon_target *t, struct damon_region *r, unsigned int nr_pieces) { unsigned long sz_orig, sz_piece, orig_end; struct damon_region *n = NULL, *next; unsigned long start; if (!r || !nr_pieces) return -EINVAL; orig_end = r->ar.end; sz_orig = r->ar.end - r->ar.start; sz_piece = ALIGN_DOWN(sz_orig / nr_pieces, DAMON_MIN_REGION); if (!sz_piece) return -EINVAL; r->ar.end = r->ar.start + sz_piece; next = damon_next_region(r); for (start = r->ar.end; start + sz_piece <= orig_end; start += sz_piece) { n = damon_new_region(start, start + sz_piece); if (!n) return -ENOMEM; damon_insert_region(n, r, next, t); r = n; } /* complement last region for possible rounding error */ if (n) n->ar.end = orig_end; return 0; } static unsigned long sz_range(struct damon_addr_range *r) { return r->end - r->start; } /* * Find three regions separated by two biggest unmapped regions * * vma the head vma of the target address space * regions an array of three address ranges that results will be saved * * This function receives an address space and finds three regions in it which * separated by the two biggest unmapped regions in the space. Please refer to * below comments of '__damon_va_init_regions()' function to know why this is * necessary. * * Returns 0 if success, or negative error code otherwise. */ static int __damon_va_three_regions(struct vm_area_struct *vma, struct damon_addr_range regions[3]) { struct damon_addr_range gap = {0}, first_gap = {0}, second_gap = {0}; struct vm_area_struct *last_vma = NULL; unsigned long start = 0; struct rb_root rbroot; /* Find two biggest gaps so that first_gap > second_gap > others */ for (; vma; vma = vma->vm_next) { if (!last_vma) { start = vma->vm_start; goto next; } if (vma->rb_subtree_gap <= sz_range(&second_gap)) { rbroot.rb_node = &vma->vm_rb; vma = rb_entry(rb_last(&rbroot), struct vm_area_struct, vm_rb); goto next; } gap.start = last_vma->vm_end; gap.end = vma->vm_start; if (sz_range(&gap) > sz_range(&second_gap)) { swap(gap, second_gap); if (sz_range(&second_gap) > sz_range(&first_gap)) swap(second_gap, first_gap); } next: last_vma = vma; } if (!sz_range(&second_gap) || !sz_range(&first_gap)) return -EINVAL; /* Sort the two biggest gaps by address */ if (first_gap.start > second_gap.start) swap(first_gap, second_gap); /* Store the result */ regions[0].start = ALIGN(start, DAMON_MIN_REGION); regions[0].end = ALIGN(first_gap.start, DAMON_MIN_REGION); regions[1].start = ALIGN(first_gap.end, DAMON_MIN_REGION); regions[1].end = ALIGN(second_gap.start, DAMON_MIN_REGION); regions[2].start = ALIGN(second_gap.end, DAMON_MIN_REGION); regions[2].end = ALIGN(last_vma->vm_end, DAMON_MIN_REGION); return 0; } /* * Get the three regions in the given target (task) * * Returns 0 on success, negative error code otherwise. */ static int damon_va_three_regions(struct damon_target *t, struct damon_addr_range regions[3]) { struct mm_struct *mm; int rc; mm = damon_get_mm(t); if (!mm) return -EINVAL; mmap_read_lock(mm); rc = __damon_va_three_regions(mm->mmap, regions); mmap_read_unlock(mm); mmput(mm); return rc; } /* * Initialize the monitoring target regions for the given target (task) * * t the given target * * Because only a number of small portions of the entire address space * is actually mapped to the memory and accessed, monitoring the unmapped * regions is wasteful. That said, because we can deal with small noises, * tracking every mapping is not strictly required but could even incur a high * overhead if the mapping frequently changes or the number of mappings is * high. The adaptive regions adjustment mechanism will further help to deal * with the noise by simply identifying the unmapped areas as a region that * has no access. Moreover, applying the real mappings that would have many * unmapped areas inside will make the adaptive mechanism quite complex. That * said, too huge unmapped areas inside the monitoring target should be removed * to not take the time for the adaptive mechanism. * * For the reason, we convert the complex mappings to three distinct regions * that cover every mapped area of the address space. Also the two gaps * between the three regions are the two biggest unmapped areas in the given * address space. In detail, this function first identifies the start and the * end of the mappings and the two biggest unmapped areas of the address space. * Then, it constructs the three regions as below: * * [mappings[0]->start, big_two_unmapped_areas[0]->start) * [big_two_unmapped_areas[0]->end, big_two_unmapped_areas[1]->start) * [big_two_unmapped_areas[1]->end, mappings[nr_mappings - 1]->end) * * As usual memory map of processes is as below, the gap between the heap and * the uppermost mmap()-ed region, and the gap between the lowermost mmap()-ed * region and the stack will be two biggest unmapped regions. Because these * gaps are exceptionally huge areas in usual address space, excluding these * two biggest unmapped regions will be sufficient to make a trade-off. * * * * * (other mmap()-ed regions and small unmapped regions) * * * */ static void __damon_va_init_regions(struct damon_ctx *ctx, struct damon_target *t) { struct damon_target *ti; struct damon_region *r; struct damon_addr_range regions[3]; unsigned long sz = 0, nr_pieces; int i, tidx = 0; if (damon_va_three_regions(t, regions)) { damon_for_each_target(ti, ctx) { if (ti == t) break; tidx++; } pr_debug("Failed to get three regions of %dth target\n", tidx); return; } for (i = 0; i < 3; i++) sz += regions[i].end - regions[i].start; if (ctx->min_nr_regions) sz /= ctx->min_nr_regions; if (sz < DAMON_MIN_REGION) sz = DAMON_MIN_REGION; /* Set the initial three regions of the target */ for (i = 0; i < 3; i++) { r = damon_new_region(regions[i].start, regions[i].end); if (!r) { pr_err("%d'th init region creation failed\n", i); return; } damon_add_region(r, t); nr_pieces = (regions[i].end - regions[i].start) / sz; damon_va_evenly_split_region(t, r, nr_pieces); } } /* Initialize '->regions_list' of every target (task) */ static void damon_va_init(struct damon_ctx *ctx) { struct damon_target *t; damon_for_each_target(t, ctx) { /* the user may set the target regions as they want */ if (!damon_nr_regions(t)) __damon_va_init_regions(ctx, t); } } /* * Update regions for current memory mappings */ static void damon_va_update(struct damon_ctx *ctx) { struct damon_addr_range three_regions[3]; struct damon_target *t; damon_for_each_target(t, ctx) { if (damon_va_three_regions(t, three_regions)) continue; damon_set_regions(t, three_regions, 3); } } static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { pte_t *pte; spinlock_t *ptl; if (pmd_trans_huge(*pmd)) { ptl = pmd_lock(walk->mm, pmd); if (!pmd_present(*pmd)) { spin_unlock(ptl); return 0; } if (pmd_trans_huge(*pmd)) { damon_pmdp_mkold(pmd, walk->mm, addr); spin_unlock(ptl); return 0; } spin_unlock(ptl); } if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) return 0; pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); if (!pte_present(*pte)) goto out; damon_ptep_mkold(pte, walk->mm, addr); out: pte_unmap_unlock(pte, ptl); return 0; } #ifdef CONFIG_HUGETLB_PAGE static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr) { bool referenced = false; pte_t entry = huge_ptep_get(pte); struct page *page = pte_page(entry); get_page(page); if (pte_young(entry)) { referenced = true; entry = pte_mkold(entry); set_huge_pte_at(mm, addr, pte, entry); } #ifdef CONFIG_MMU_NOTIFIER if (mmu_notifier_clear_young(mm, addr, addr + huge_page_size(hstate_vma(vma)))) referenced = true; #endif /* CONFIG_MMU_NOTIFIER */ if (referenced) set_page_young(page); set_page_idle(page); put_page(page); } static int damon_mkold_hugetlb_entry(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct hstate *h = hstate_vma(walk->vma); spinlock_t *ptl; pte_t entry; ptl = huge_pte_lock(h, walk->mm, pte); entry = huge_ptep_get(pte); if (!pte_present(entry)) goto out; damon_hugetlb_mkold(pte, walk->mm, walk->vma, addr); out: spin_unlock(ptl); return 0; } #else #define damon_mkold_hugetlb_entry NULL #endif /* CONFIG_HUGETLB_PAGE */ static const struct mm_walk_ops damon_mkold_ops = { .pmd_entry = damon_mkold_pmd_entry, .hugetlb_entry = damon_mkold_hugetlb_entry, }; static void damon_va_mkold(struct mm_struct *mm, unsigned long addr) { mmap_read_lock(mm); walk_page_range(mm, addr, addr + 1, &damon_mkold_ops, NULL); mmap_read_unlock(mm); } /* * Functions for the access checking of the regions */ static void __damon_va_prepare_access_check(struct damon_ctx *ctx, struct mm_struct *mm, struct damon_region *r) { r->sampling_addr = damon_rand(r->ar.start, r->ar.end); damon_va_mkold(mm, r->sampling_addr); } static void damon_va_prepare_access_checks(struct damon_ctx *ctx) { struct damon_target *t; struct mm_struct *mm; struct damon_region *r; damon_for_each_target(t, ctx) { mm = damon_get_mm(t); if (!mm) continue; damon_for_each_region(r, t) __damon_va_prepare_access_check(ctx, mm, r); mmput(mm); } } struct damon_young_walk_private { unsigned long *page_sz; bool young; }; static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { pte_t *pte; spinlock_t *ptl; struct page *page; struct damon_young_walk_private *priv = walk->private; #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (pmd_trans_huge(*pmd)) { ptl = pmd_lock(walk->mm, pmd); if (!pmd_present(*pmd)) { spin_unlock(ptl); return 0; } if (!pmd_trans_huge(*pmd)) { spin_unlock(ptl); goto regular_page; } page = damon_get_page(pmd_pfn(*pmd)); if (!page) goto huge_out; if (pmd_young(*pmd) || !page_is_idle(page) || mmu_notifier_test_young(walk->mm, addr)) { *priv->page_sz = HPAGE_PMD_SIZE; priv->young = true; } put_page(page); huge_out: spin_unlock(ptl); return 0; } regular_page: #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) return -EINVAL; pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); if (!pte_present(*pte)) goto out; page = damon_get_page(pte_pfn(*pte)); if (!page) goto out; if (pte_young(*pte) || !page_is_idle(page) || mmu_notifier_test_young(walk->mm, addr)) { *priv->page_sz = PAGE_SIZE; priv->young = true; } put_page(page); out: pte_unmap_unlock(pte, ptl); return 0; } #ifdef CONFIG_HUGETLB_PAGE static int damon_young_hugetlb_entry(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct damon_young_walk_private *priv = walk->private; struct hstate *h = hstate_vma(walk->vma); struct page *page; spinlock_t *ptl; pte_t entry; ptl = huge_pte_lock(h, walk->mm, pte); entry = huge_ptep_get(pte); if (!pte_present(entry)) goto out; page = pte_page(entry); get_page(page); if (pte_young(entry) || !page_is_idle(page) || mmu_notifier_test_young(walk->mm, addr)) { *priv->page_sz = huge_page_size(h); priv->young = true; } put_page(page); out: spin_unlock(ptl); return 0; } #else #define damon_young_hugetlb_entry NULL #endif /* CONFIG_HUGETLB_PAGE */ static const struct mm_walk_ops damon_young_ops = { .pmd_entry = damon_young_pmd_entry, .hugetlb_entry = damon_young_hugetlb_entry, }; static bool damon_va_young(struct mm_struct *mm, unsigned long addr, unsigned long *page_sz) { struct damon_young_walk_private arg = { .page_sz = page_sz, .young = false, }; mmap_read_lock(mm); walk_page_range(mm, addr, addr + 1, &damon_young_ops, &arg); mmap_read_unlock(mm); return arg.young; } /* * Check whether the region was accessed after the last preparation * * mm 'mm_struct' for the given virtual address space * r the region to be checked */ static void __damon_va_check_access(struct mm_struct *mm, struct damon_region *r) { static struct mm_struct *last_mm; static unsigned long last_addr; static unsigned long last_page_sz = PAGE_SIZE; static bool last_accessed; /* If the region is in the last checked page, reuse the result */ if (mm == last_mm && (ALIGN_DOWN(last_addr, last_page_sz) == ALIGN_DOWN(r->sampling_addr, last_page_sz))) { if (last_accessed) r->nr_accesses++; return; } last_accessed = damon_va_young(mm, r->sampling_addr, &last_page_sz); if (last_accessed) r->nr_accesses++; last_mm = mm; last_addr = r->sampling_addr; } static unsigned int damon_va_check_accesses(struct damon_ctx *ctx) { struct damon_target *t; struct mm_struct *mm; struct damon_region *r; unsigned int max_nr_accesses = 0; damon_for_each_target(t, ctx) { mm = damon_get_mm(t); if (!mm) continue; damon_for_each_region(r, t) { __damon_va_check_access(mm, r); max_nr_accesses = max(r->nr_accesses, max_nr_accesses); } mmput(mm); } return max_nr_accesses; } /* * Functions for the target validity check and cleanup */ static bool damon_va_target_valid(void *target) { struct damon_target *t = target; struct task_struct *task; task = damon_get_task_struct(t); if (task) { put_task_struct(task); return true; } return false; } #ifndef CONFIG_ADVISE_SYSCALLS static unsigned long damos_madvise(struct damon_target *target, struct damon_region *r, int behavior) { return 0; } #else static unsigned long damos_madvise(struct damon_target *target, struct damon_region *r, int behavior) { struct mm_struct *mm; unsigned long start = PAGE_ALIGN(r->ar.start); unsigned long len = PAGE_ALIGN(r->ar.end - r->ar.start); unsigned long applied; mm = damon_get_mm(target); if (!mm) return 0; applied = do_madvise(mm, start, len, behavior) ? 0 : len; mmput(mm); return applied; } #endif /* CONFIG_ADVISE_SYSCALLS */ static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, struct damos *scheme) { int madv_action; switch (scheme->action) { case DAMOS_WILLNEED: madv_action = MADV_WILLNEED; break; case DAMOS_COLD: madv_action = MADV_COLD; break; case DAMOS_PAGEOUT: madv_action = MADV_PAGEOUT; break; case DAMOS_HUGEPAGE: madv_action = MADV_HUGEPAGE; break; case DAMOS_NOHUGEPAGE: madv_action = MADV_NOHUGEPAGE; break; case DAMOS_STAT: return 0; default: return 0; } return damos_madvise(t, r, madv_action); } static int damon_va_scheme_score(struct damon_ctx *context, struct damon_target *t, struct damon_region *r, struct damos *scheme) { switch (scheme->action) { case DAMOS_PAGEOUT: return damon_pageout_score(context, r, scheme); default: break; } return DAMOS_MAX_SCORE; } static int __init damon_va_initcall(void) { struct damon_operations ops = { .id = DAMON_OPS_VADDR, .init = damon_va_init, .update = damon_va_update, .prepare_access_checks = damon_va_prepare_access_checks, .check_accesses = damon_va_check_accesses, .reset_aggregated = NULL, .target_valid = damon_va_target_valid, .cleanup = NULL, .apply_scheme = damon_va_apply_scheme, .get_scheme_score = damon_va_scheme_score, }; /* ops for fixed virtual address ranges */ struct damon_operations ops_fvaddr = ops; int err; /* Don't set the monitoring target regions for the entire mapping */ ops_fvaddr.id = DAMON_OPS_FVADDR; ops_fvaddr.init = NULL; ops_fvaddr.update = NULL; err = damon_register_ops(&ops); if (err) return err; return damon_register_ops(&ops_fvaddr); }; subsys_initcall(damon_va_initcall); #include "vaddr-test.h"