mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2025-01-01 10:45:49 +00:00
Folio changes for 5.18
- Rewrite how munlock works to massively reduce the contention on i_mmap_rwsem (Hugh Dickins): https://lore.kernel.org/linux-mm/8e4356d-9622-a7f0-b2c-f116b5f2efea@google.com/ - Sort out the page refcount mess for ZONE_DEVICE pages (Christoph Hellwig): https://lore.kernel.org/linux-mm/20220210072828.2930359-1-hch@lst.de/ - Convert GUP to use folios and make pincount available for order-1 pages. (Matthew Wilcox) - Convert a few more truncation functions to use folios (Matthew Wilcox) - Convert page_vma_mapped_walk to use PFNs instead of pages (Matthew Wilcox) - Convert rmap_walk to use folios (Matthew Wilcox) - Convert most of shrink_page_list() to use a folio (Matthew Wilcox) - Add support for creating large folios in readahead (Matthew Wilcox) -----BEGIN PGP SIGNATURE----- iQEzBAABCgAdFiEEejHryeLBw/spnjHrDpNsjXcpgj4FAmI4ucgACgkQDpNsjXcp gj69Wgf6AwqwmO5Tmy+fLScDPqWxmXJofbocae1kyoGHf7Ui91OK4U2j6IpvAr+g P/vLIK+JAAcTQcrSCjymuEkf4HkGZOR03QQn7maPIEe4eLrZRQDEsmHC1L9gpeJp s/GMvDWiGE0Tnxu0EOzfVi/yT+qjIl/S8VvqtCoJv1HdzxitZ7+1RDuqImaMC5MM Qi3uHag78vLmCltLXpIOdpgZhdZexCdL2Y/1npf+b6FVkAJRRNUnA0gRbS7YpoVp CbxEJcmAl9cpJLuj5i5kIfS9trr+/QcvbUlzRxh4ggC58iqnmF2V09l2MJ7YU3XL v1O/Elq4lRhXninZFQEm9zjrri7LDQ== =n9Ad -----END PGP SIGNATURE----- Merge tag 'folio-5.18c' of git://git.infradead.org/users/willy/pagecache Pull folio updates from Matthew Wilcox: - Rewrite how munlock works to massively reduce the contention on i_mmap_rwsem (Hugh Dickins): https://lore.kernel.org/linux-mm/8e4356d-9622-a7f0-b2c-f116b5f2efea@google.com/ - Sort out the page refcount mess for ZONE_DEVICE pages (Christoph Hellwig): https://lore.kernel.org/linux-mm/20220210072828.2930359-1-hch@lst.de/ - Convert GUP to use folios and make pincount available for order-1 pages. 
(Matthew Wilcox) - Convert a few more truncation functions to use folios (Matthew Wilcox) - Convert page_vma_mapped_walk to use PFNs instead of pages (Matthew Wilcox) - Convert rmap_walk to use folios (Matthew Wilcox) - Convert most of shrink_page_list() to use a folio (Matthew Wilcox) - Add support for creating large folios in readahead (Matthew Wilcox) * tag 'folio-5.18c' of git://git.infradead.org/users/willy/pagecache: (114 commits) mm/damon: minor cleanup for damon_pa_young selftests/vm/transhuge-stress: Support file-backed PMD folios mm/filemap: Support VM_HUGEPAGE for file mappings mm/readahead: Switch to page_cache_ra_order mm/readahead: Align file mappings for non-DAX mm/readahead: Add large folio readahead mm: Support arbitrary THP sizes mm: Make large folios depend on THP mm: Fix READ_ONLY_THP warning mm/filemap: Allow large folios to be added to the page cache mm: Turn can_split_huge_page() into can_split_folio() mm/vmscan: Convert pageout() to take a folio mm/vmscan: Turn page_check_references() into folio_check_references() mm/vmscan: Account large folios correctly mm/vmscan: Optimise shrink_page_list for non-PMD-sized folios mm/vmscan: Free non-shmem folios without splitting them mm/rmap: Constify the rmap_walk_control argument mm/rmap: Convert rmap_walk() to take a folio mm: Turn page_anon_vma() into folio_anon_vma() mm/rmap: Turn page_lock_anon_vma_read() into folio_lock_anon_vma_read() ...
This commit is contained in:
commit
9030fb0bb9
@ -55,18 +55,18 @@ flags the caller provides. The caller is required to pass in a non-null struct
|
||||
pages* array, and the function then pins pages by incrementing each by a special
|
||||
value: GUP_PIN_COUNTING_BIAS.
|
||||
|
||||
For huge pages (and in fact, any compound page of more than 2 pages), the
|
||||
GUP_PIN_COUNTING_BIAS scheme is not used. Instead, an exact form of pin counting
|
||||
is achieved, by using the 3rd struct page in the compound page. A new struct
|
||||
page field, hpage_pinned_refcount, has been added in order to support this.
|
||||
For compound pages, the GUP_PIN_COUNTING_BIAS scheme is not used. Instead,
|
||||
an exact form of pin counting is achieved, by using the 2nd struct page
|
||||
in the compound page. A new struct page field, compound_pincount, has
|
||||
been added in order to support this.
|
||||
|
||||
This approach for compound pages avoids the counting upper limit problems that
|
||||
are discussed below. Those limitations would have been aggravated severely by
|
||||
huge pages, because each tail page adds a refcount to the head page. And in
|
||||
fact, testing revealed that, without a separate hpage_pinned_refcount field,
|
||||
fact, testing revealed that, without a separate compound_pincount field,
|
||||
page overflows were seen in some huge page stress tests.
|
||||
|
||||
This also means that huge pages and compound pages (of order > 1) do not suffer
|
||||
This also means that huge pages and compound pages do not suffer
|
||||
from the false positives problem that is mentioned below.::
|
||||
|
||||
Function
|
||||
@ -264,9 +264,9 @@ place.)
|
||||
Other diagnostics
|
||||
=================
|
||||
|
||||
dump_page() has been enhanced slightly, to handle these new counting fields, and
|
||||
to better report on compound pages in general. Specifically, for compound pages
|
||||
with order > 1, the exact (hpage_pinned_refcount) pincount is reported.
|
||||
dump_page() has been enhanced slightly, to handle these new counting
|
||||
fields, and to better report on compound pages in general. Specifically,
|
||||
for compound pages, the exact (compound_pincount) pincount is reported.
|
||||
|
||||
References
|
||||
==========
|
||||
|
@ -233,6 +233,7 @@ pmd_page_vaddr(pmd_t pmd)
|
||||
return ((pmd_val(pmd) & _PFN_MASK) >> (32-PAGE_SHIFT)) + PAGE_OFFSET;
|
||||
}
|
||||
|
||||
#define pmd_pfn(pmd) (pmd_val(pmd) >> 32)
|
||||
#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> 32))
|
||||
#define pud_page(pud) (pfn_to_page(pud_val(pud) >> 32))
|
||||
|
||||
|
@ -31,7 +31,6 @@ static inline pmd_t pte_pmd(pte_t pte)
|
||||
|
||||
#define pmd_write(pmd) pte_write(pmd_pte(pmd))
|
||||
#define pmd_young(pmd) pte_young(pmd_pte(pmd))
|
||||
#define pmd_pfn(pmd) pte_pfn(pmd_pte(pmd))
|
||||
#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd))
|
||||
|
||||
#define mk_pmd(page, prot) pte_pmd(mk_pte(page, prot))
|
||||
|
@ -161,6 +161,7 @@
|
||||
#define pmd_present(x) (pmd_val(x))
|
||||
#define pmd_clear(xp) do { pmd_val(*(xp)) = 0; } while (0)
|
||||
#define pmd_page_vaddr(pmd) (pmd_val(pmd) & PAGE_MASK)
|
||||
#define pmd_pfn(pmd) ((pmd_val(pmd) & PAGE_MASK) >> PAGE_SHIFT)
|
||||
#define pmd_page(pmd) virt_to_page(pmd_page_vaddr(pmd))
|
||||
#define set_pmd(pmdp, pmd) (*(pmdp) = pmd)
|
||||
#define pmd_pgtable(pmd) ((pgtable_t) pmd_page_vaddr(pmd))
|
||||
|
@ -208,6 +208,8 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr)
|
||||
}
|
||||
#define pmd_offset pmd_offset
|
||||
|
||||
#define pmd_pfn(pmd) (__phys_to_pfn(pmd_val(pmd) & PHYS_MASK))
|
||||
|
||||
#define pmd_large(pmd) (pmd_val(pmd) & 2)
|
||||
#define pmd_leaf(pmd) (pmd_val(pmd) & 2)
|
||||
#define pmd_bad(pmd) (pmd_val(pmd) & 2)
|
||||
|
@ -17,6 +17,7 @@
|
||||
#include <linux/mman.h>
|
||||
#include <linux/nodemask.h>
|
||||
#include <linux/memblock.h>
|
||||
#include <linux/memremap.h>
|
||||
#include <linux/memory.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/io.h>
|
||||
|
@ -30,6 +30,7 @@
|
||||
#define pgd_ERROR(e) \
|
||||
pr_err("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
|
||||
|
||||
#define pmd_pfn(pmd) (pmd_phys(pmd) >> PAGE_SHIFT)
|
||||
#define pmd_page(pmd) (pfn_to_page(pmd_phys(pmd) >> PAGE_SHIFT))
|
||||
#define pte_clear(mm, addr, ptep) set_pte((ptep), \
|
||||
(((unsigned int) addr >= PAGE_OFFSET) ? __pte(_PAGE_GLOBAL) : __pte(0)))
|
||||
|
@ -235,6 +235,11 @@ static inline int pmd_bad(pmd_t pmd)
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* pmd_pfn - converts a PMD entry to a page frame number
|
||||
*/
|
||||
#define pmd_pfn(pmd) (pmd_val(pmd) >> PAGE_SHIFT)
|
||||
|
||||
/*
|
||||
* pmd_page - converts a PMD entry to a page pointer
|
||||
*/
|
||||
|
@ -267,6 +267,7 @@ ia64_phys_addr_valid (unsigned long addr)
|
||||
#define pmd_present(pmd) (pmd_val(pmd) != 0UL)
|
||||
#define pmd_clear(pmdp) (pmd_val(*(pmdp)) = 0UL)
|
||||
#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val(pmd) & _PFN_MASK))
|
||||
#define pmd_pfn(pmd) ((pmd_val(pmd) & _PFN_MASK) >> PAGE_SHIFT)
|
||||
#define pmd_page(pmd) virt_to_page((pmd_val(pmd) + PAGE_OFFSET))
|
||||
|
||||
#define pud_none(pud) (!pud_val(pud))
|
||||
|
@ -322,6 +322,7 @@ extern pgd_t kernel_pg_dir[PTRS_PER_PGD];
|
||||
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) })
|
||||
#define __swp_entry_to_pte(x) (__pte((x).val))
|
||||
|
||||
#define pmd_pfn(pmd) (pmd_val(pmd) >> PAGE_SHIFT)
|
||||
#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
|
||||
|
||||
#define pfn_pte(pfn, prot) __pte(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
|
||||
|
@ -147,6 +147,7 @@ static inline void pud_set(pud_t *pudp, pmd_t *pmdp)
|
||||
#define pmd_present(pmd) (pmd_val(pmd) & _PAGE_TABLE)
|
||||
#define pmd_clear(pmdp) ({ pmd_val(*pmdp) = 0; })
|
||||
|
||||
#define pmd_pfn(pmd) ((pmd_val(pmd) & _TABLE_MASK) >> PAGE_SHIFT)
|
||||
/*
|
||||
* m68k does not have huge pages (020/030 actually could), but generic code
|
||||
* expects pmd_page() to exists, only to then DCE it all. Provide a dummy to
|
||||
|
@ -130,6 +130,7 @@ static inline void pte_clear (struct mm_struct *mm, unsigned long addr, pte_t *p
|
||||
({ pte_t __pte; pte_val(__pte) = pfn | pgprot_val(pgprot); __pte; })
|
||||
|
||||
#define pte_page(pte) virt_to_page(__pte_page(pte))
|
||||
#define pmd_pfn(pmd) (pmd_val(pmd) >> PAGE_SHIFT)
|
||||
#define pmd_page(pmd) virt_to_page(pmd_page_vaddr(pmd))
|
||||
|
||||
|
||||
|
@ -399,6 +399,9 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
|
||||
return ((unsigned long) (pmd_val(pmd) & PAGE_MASK));
|
||||
}
|
||||
|
||||
/* returns pfn of the pmd entry*/
|
||||
#define pmd_pfn(pmd) (__pa(pmd_val(pmd)) >> PAGE_SHIFT)
|
||||
|
||||
/* returns struct *page of the pmd entry*/
|
||||
#define pmd_page(pmd) (pfn_to_page(__pa(pmd_val(pmd)) >> PAGE_SHIFT))
|
||||
|
||||
|
@ -86,6 +86,11 @@ extern void paging_init(void);
|
||||
*/
|
||||
#define pmd_phys(pmd) virt_to_phys((void *)pmd_val(pmd))
|
||||
|
||||
static inline unsigned long pmd_pfn(pmd_t pmd)
|
||||
{
|
||||
return pmd_val(pmd) >> _PFN_SHIFT;
|
||||
}
|
||||
|
||||
#ifndef CONFIG_MIPS_HUGE_TLB_SUPPORT
|
||||
#define pmd_page(pmd) (pfn_to_page(pmd_phys(pmd) >> PAGE_SHIFT))
|
||||
#endif /* CONFIG_MIPS_HUGE_TLB_SUPPORT */
|
||||
@ -422,11 +427,6 @@ static inline int pmd_write(pmd_t pmd)
|
||||
return !!(pmd_val(pmd) & _PAGE_WRITE);
|
||||
}
|
||||
|
||||
static inline unsigned long pmd_pfn(pmd_t pmd)
|
||||
{
|
||||
return pmd_val(pmd) >> _PFN_SHIFT;
|
||||
}
|
||||
|
||||
static inline struct page *pmd_page(pmd_t pmd)
|
||||
{
|
||||
if (pmd_val(pmd) & _PAGE_HUGE)
|
||||
|
@ -308,6 +308,7 @@ static inline pmd_t __mk_pmd(pte_t * ptep, unsigned long prot)
|
||||
return pmd;
|
||||
}
|
||||
|
||||
#define pmd_pfn(pmd) (pmd_val(pmd) >> PAGE_SHIFT)
|
||||
#define pmd_page(pmd) virt_to_page(__va(pmd_val(pmd)))
|
||||
|
||||
/*
|
||||
|
@ -235,6 +235,7 @@ static inline void pte_clear(struct mm_struct *mm,
|
||||
* and a page entry and page directory to the page they refer to.
|
||||
*/
|
||||
#define pmd_phys(pmd) virt_to_phys((void *)pmd_val(pmd))
|
||||
#define pmd_pfn(pmd) (pmd_phys(pmd) >> PAGE_SHIFT)
|
||||
#define pmd_page(pmd) (pfn_to_page(pmd_phys(pmd) >> PAGE_SHIFT))
|
||||
|
||||
static inline unsigned long pmd_page_vaddr(pmd_t pmd)
|
||||
|
@ -361,6 +361,7 @@ static inline void pmd_set(pmd_t *pmdp, pte_t *ptep)
|
||||
pmd_val(*pmdp) = _KERNPG_TABLE | (unsigned long) ptep;
|
||||
}
|
||||
|
||||
#define pmd_pfn(pmd) (pmd_val(pmd) >> PAGE_SHIFT)
|
||||
#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
|
||||
|
||||
static inline unsigned long pmd_page_vaddr(pmd_t pmd)
|
||||
|
@ -408,6 +408,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
|
||||
return ((unsigned long) __va(pmd_address(pmd)));
|
||||
}
|
||||
|
||||
#define pmd_pfn(pmd) (pmd_address(pmd) >> PAGE_SHIFT)
|
||||
#define __pmd_page(pmd) ((unsigned long) __va(pmd_address(pmd)))
|
||||
#define pmd_page(pmd) virt_to_page((void *)__pmd_page(pmd))
|
||||
|
||||
|
@ -372,8 +372,8 @@ static inline void __ptep_set_access_flags(struct vm_area_struct *vma,
|
||||
#define __HAVE_ARCH_PTE_SAME
|
||||
#define pte_same(A,B) (((pte_val(A) ^ pte_val(B)) & ~_PAGE_HASHPTE) == 0)
|
||||
|
||||
#define pmd_page(pmd) \
|
||||
pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)
|
||||
#define pmd_pfn(pmd) (pmd_val(pmd) >> PAGE_SHIFT)
|
||||
#define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd))
|
||||
|
||||
/*
|
||||
* Encode and decode a swap entry.
|
||||
|
@ -21,7 +21,6 @@ extern void destroy_context(struct mm_struct *mm);
|
||||
#ifdef CONFIG_SPAPR_TCE_IOMMU
|
||||
struct mm_iommu_table_group_mem_t;
|
||||
|
||||
extern int isolate_lru_page(struct page *page); /* from internal.h */
|
||||
extern bool mm_iommu_preregistered(struct mm_struct *mm);
|
||||
extern long mm_iommu_new(struct mm_struct *mm,
|
||||
unsigned long ua, unsigned long entries,
|
||||
|
@ -349,15 +349,14 @@ static inline int pte_young(pte_t pte)
|
||||
* of the pte page. -- paulus
|
||||
*/
|
||||
#ifndef CONFIG_BOOKE
|
||||
#define pmd_page(pmd) \
|
||||
pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)
|
||||
#define pmd_pfn(pmd) (pmd_val(pmd) >> PAGE_SHIFT)
|
||||
#else
|
||||
#define pmd_page_vaddr(pmd) \
|
||||
((unsigned long)(pmd_val(pmd) & ~(PTE_TABLE_SIZE - 1)))
|
||||
#define pmd_page(pmd) \
|
||||
pfn_to_page((__pa(pmd_val(pmd)) >> PAGE_SHIFT))
|
||||
#define pmd_pfn(pmd) (__pa(pmd_val(pmd)) >> PAGE_SHIFT)
|
||||
#endif
|
||||
|
||||
#define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd))
|
||||
/*
|
||||
* Encode and decode a swap entry.
|
||||
* Note that the bits we use in a PTE for representing a swap entry
|
||||
|
@ -142,6 +142,7 @@ static inline pte_t pmd_pte(pmd_t pmd)
|
||||
#define pmd_present(pmd) (!pmd_none(pmd))
|
||||
#define pmd_page_vaddr(pmd) (pmd_val(pmd) & ~PMD_MASKED_BITS)
|
||||
extern struct page *pmd_page(pmd_t pmd);
|
||||
#define pmd_pfn(pmd) (page_to_pfn(pmd_page(pmd)))
|
||||
|
||||
static inline void pud_set(pud_t *pudp, unsigned long val)
|
||||
{
|
||||
|
@ -91,6 +91,7 @@
|
||||
#include <linux/kvm_host.h>
|
||||
#include <linux/ksm.h>
|
||||
#include <linux/of.h>
|
||||
#include <linux/memremap.h>
|
||||
#include <asm/ultravisor.h>
|
||||
#include <asm/mman.h>
|
||||
#include <asm/kvm_ppc.h>
|
||||
@ -712,7 +713,6 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm)
|
||||
|
||||
dpage = pfn_to_page(uvmem_pfn);
|
||||
dpage->zone_device_data = pvt;
|
||||
get_page(dpage);
|
||||
lock_page(dpage);
|
||||
return dpage;
|
||||
out_clear:
|
||||
|
@ -6,6 +6,7 @@
|
||||
#include <linux/sched.h>
|
||||
#include <linux/mm_types.h>
|
||||
#include <linux/memblock.h>
|
||||
#include <linux/memremap.h>
|
||||
#include <linux/debugfs.h>
|
||||
#include <misc/cxl-base.h>
|
||||
|
||||
|
@ -406,6 +406,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
|
||||
return (unsigned long)pmd_val(pmd);
|
||||
}
|
||||
|
||||
#define pmd_pfn(pmd) (__pa(pmd_val(pmd)) >> PAGE_SHIFT)
|
||||
#define pmd_page(pmd) (virt_to_page(pmd_val(pmd)))
|
||||
|
||||
#ifdef CONFIG_X2TLB
|
||||
|
@ -127,11 +127,16 @@ static inline int srmmu_device_memory(unsigned long x)
|
||||
return ((x & 0xF0000000) != 0);
|
||||
}
|
||||
|
||||
static inline unsigned long pmd_pfn(pmd_t pmd)
|
||||
{
|
||||
return (pmd_val(pmd) & SRMMU_PTD_PMASK) >> (PAGE_SHIFT-4);
|
||||
}
|
||||
|
||||
static inline struct page *pmd_page(pmd_t pmd)
|
||||
{
|
||||
if (srmmu_device_memory(pmd_val(pmd)))
|
||||
BUG();
|
||||
return pfn_to_page((pmd_val(pmd) & SRMMU_PTD_PMASK) >> (PAGE_SHIFT-4));
|
||||
return pfn_to_page(pmd_pfn(pmd));
|
||||
}
|
||||
|
||||
static inline unsigned long __pmd_page(pmd_t pmd)
|
||||
|
@ -109,6 +109,7 @@ extern unsigned long end_iomem;
|
||||
#define p4d_newpage(x) (p4d_val(x) & _PAGE_NEWPAGE)
|
||||
#define p4d_mkuptodate(x) (p4d_val(x) &= ~_PAGE_NEWPAGE)
|
||||
|
||||
#define pmd_pfn(pmd) (pmd_val(pmd) >> PAGE_SHIFT)
|
||||
#define pmd_page(pmd) phys_to_page(pmd_val(pmd) & PAGE_MASK)
|
||||
|
||||
#define pte_page(x) pfn_to_page(pte_pfn(x))
|
||||
|
@ -241,6 +241,7 @@ static inline void paging_init(void) { }
|
||||
* The pmd contains the kernel virtual address of the pte page.
|
||||
*/
|
||||
#define pmd_page_vaddr(pmd) ((unsigned long)(pmd_val(pmd) & PAGE_MASK))
|
||||
#define pmd_pfn(pmd) (__pa(pmd_val(pmd)) >> PAGE_SHIFT)
|
||||
#define pmd_page(pmd) virt_to_page(pmd_val(pmd))
|
||||
|
||||
/*
|
||||
|
@ -10,6 +10,7 @@
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/kthread.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <xen/events.h>
|
||||
#include <xen/grant_table.h>
|
||||
#include "common.h"
|
||||
|
@ -24,6 +24,7 @@
|
||||
#include <linux/hmm.h>
|
||||
#include <linux/dma-direction.h>
|
||||
#include <linux/dma-mapping.h>
|
||||
#include <linux/migrate.h>
|
||||
#include "amdgpu_sync.h"
|
||||
#include "amdgpu_object.h"
|
||||
#include "amdgpu_vm.h"
|
||||
@ -224,7 +225,6 @@ svm_migrate_get_vram_page(struct svm_range *prange, unsigned long pfn)
|
||||
page = pfn_to_page(pfn);
|
||||
svm_range_bo_ref(prange->svm_bo);
|
||||
page->zone_device_data = prange->svm_bo;
|
||||
get_page(page);
|
||||
lock_page(page);
|
||||
}
|
||||
|
||||
|
@ -25,6 +25,7 @@
|
||||
|
||||
#include <linux/hashtable.h>
|
||||
#include <linux/mmu_notifier.h>
|
||||
#include <linux/memremap.h>
|
||||
#include <linux/mutex.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/atomic.h>
|
||||
|
@ -27,11 +27,11 @@
|
||||
/*
|
||||
* Authors: Thomas Hellström <thomas-at-tungstengraphics-dot-com>
|
||||
*/
|
||||
|
||||
#include <linux/dma-buf-map.h>
|
||||
#include <linux/export.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/cc_platform.h>
|
||||
#include <linux/ioport.h>
|
||||
#include <xen/xen.h>
|
||||
|
||||
#include <drm/drm_cache.h>
|
||||
|
@ -39,6 +39,8 @@
|
||||
|
||||
#include <linux/sched/mm.h>
|
||||
#include <linux/hmm.h>
|
||||
#include <linux/memremap.h>
|
||||
#include <linux/migrate.h>
|
||||
|
||||
/*
|
||||
* FIXME: this is ugly right now we are using TTM to allocate vram and we pin
|
||||
@ -324,7 +326,6 @@ nouveau_dmem_page_alloc_locked(struct nouveau_drm *drm)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
get_page(page);
|
||||
lock_page(page);
|
||||
return page;
|
||||
}
|
||||
|
@ -35,6 +35,7 @@
|
||||
#include <linux/sched/mm.h>
|
||||
#include <linux/sort.h>
|
||||
#include <linux/hmm.h>
|
||||
#include <linux/memremap.h>
|
||||
#include <linux/rmap.h>
|
||||
|
||||
struct nouveau_svm {
|
||||
|
@ -2,6 +2,7 @@
|
||||
/*
|
||||
* Copyright (c) 2016 HGST, a Western Digital Company.
|
||||
*/
|
||||
#include <linux/memremap.h>
|
||||
#include <linux/moduleparam.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/pci-p2pdma.h>
|
||||
|
@ -3,6 +3,7 @@
|
||||
#define __NVDIMM_PMEM_H__
|
||||
#include <linux/page-flags.h>
|
||||
#include <linux/badblocks.h>
|
||||
#include <linux/memremap.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/pfn_t.h>
|
||||
#include <linux/fs.h>
|
||||
|
@ -15,6 +15,7 @@
|
||||
#include <linux/init.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/io.h>
|
||||
#include <linux/memremap.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/mutex.h>
|
||||
|
@ -6,6 +6,7 @@
|
||||
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/blk-integrity.h>
|
||||
#include <linux/memremap.h>
|
||||
#include <linux/module.h>
|
||||
#include "nvmet.h"
|
||||
|
||||
|
@ -179,6 +179,7 @@
|
||||
#include <linux/kthread.h>
|
||||
#include <linux/sched/signal.h>
|
||||
#include <linux/limits.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/rwsem.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/spinlock.h>
|
||||
|
@ -48,7 +48,7 @@ config FS_DAX
|
||||
bool "File system based Direct Access (DAX) support"
|
||||
depends on MMU
|
||||
depends on !(ARM || MIPS || SPARC)
|
||||
select DEV_PAGEMAP_OPS if (ZONE_DEVICE && !FS_DAX_LIMITED)
|
||||
depends on ZONE_DEVICE || FS_DAX_LIMITED
|
||||
select FS_IOMAP
|
||||
select DAX
|
||||
help
|
||||
|
@ -14,6 +14,7 @@
|
||||
#include <linux/time.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/stat.h>
|
||||
#include <linux/cred.h>
|
||||
#include <linux/errno.h>
|
||||
|
@ -8,6 +8,7 @@
|
||||
#include <linux/dax.h>
|
||||
#include <linux/pci.h>
|
||||
#include <linux/pfn_t.h>
|
||||
#include <linux/memremap.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/virtio.h>
|
||||
#include <linux/virtio_fs.h>
|
||||
|
@ -7,6 +7,7 @@
|
||||
#include <linux/fs.h>
|
||||
#include <linux/iomap.h>
|
||||
#include <linux/fiemap.h>
|
||||
#include <linux/pagemap.h>
|
||||
|
||||
static int iomap_to_fiemap(struct fiemap_extent_info *fi,
|
||||
const struct iomap *iomap, u32 flags)
|
||||
|
@ -7,6 +7,7 @@
|
||||
#include <linux/hash.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/list_lru.h>
|
||||
#include <linux/fsnotify_backend.h>
|
||||
|
@ -26,6 +26,7 @@
|
||||
#include <linux/xattr.h>
|
||||
#include <linux/jhash.h>
|
||||
#include <linux/ima.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/uaccess.h>
|
||||
#include <linux/exportfs.h>
|
||||
|
@ -10,6 +10,7 @@
|
||||
#include <linux/proc_fs.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/hugetlb.h>
|
||||
#include <linux/memremap.h>
|
||||
#include <linux/memcontrol.h>
|
||||
#include <linux/mmu_notifier.h>
|
||||
#include <linux/page_idle.h>
|
||||
|
24
fs/splice.c
24
fs/splice.c
@ -46,45 +46,45 @@
|
||||
static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe,
|
||||
struct pipe_buffer *buf)
|
||||
{
|
||||
struct page *page = buf->page;
|
||||
struct folio *folio = page_folio(buf->page);
|
||||
struct address_space *mapping;
|
||||
|
||||
lock_page(page);
|
||||
folio_lock(folio);
|
||||
|
||||
mapping = page_mapping(page);
|
||||
mapping = folio_mapping(folio);
|
||||
if (mapping) {
|
||||
WARN_ON(!PageUptodate(page));
|
||||
WARN_ON(!folio_test_uptodate(folio));
|
||||
|
||||
/*
|
||||
* At least for ext2 with nobh option, we need to wait on
|
||||
* writeback completing on this page, since we'll remove it
|
||||
* writeback completing on this folio, since we'll remove it
|
||||
* from the pagecache. Otherwise truncate wont wait on the
|
||||
* page, allowing the disk blocks to be reused by someone else
|
||||
* folio, allowing the disk blocks to be reused by someone else
|
||||
* before we actually wrote our data to them. fs corruption
|
||||
* ensues.
|
||||
*/
|
||||
wait_on_page_writeback(page);
|
||||
folio_wait_writeback(folio);
|
||||
|
||||
if (page_has_private(page) &&
|
||||
!try_to_release_page(page, GFP_KERNEL))
|
||||
if (folio_has_private(folio) &&
|
||||
!filemap_release_folio(folio, GFP_KERNEL))
|
||||
goto out_unlock;
|
||||
|
||||
/*
|
||||
* If we succeeded in removing the mapping, set LRU flag
|
||||
* and return good.
|
||||
*/
|
||||
if (remove_mapping(mapping, page)) {
|
||||
if (remove_mapping(mapping, folio)) {
|
||||
buf->flags |= PIPE_BUF_FLAG_LRU;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Raced with truncate or failed to remove page from current
|
||||
* Raced with truncate or failed to remove folio from current
|
||||
* address space, unlock and return failure.
|
||||
*/
|
||||
out_unlock:
|
||||
unlock_page(page);
|
||||
folio_unlock(folio);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -9,6 +9,7 @@
|
||||
#include <linux/namei.h>
|
||||
#include <linux/nls.h>
|
||||
#include <linux/sizes.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/vfs.h>
|
||||
#include "vfsmod.h"
|
||||
|
||||
|
@ -2753,54 +2753,6 @@ extern void init_special_inode(struct inode *, umode_t, dev_t);
|
||||
extern void make_bad_inode(struct inode *);
|
||||
extern bool is_bad_inode(struct inode *);
|
||||
|
||||
unsigned long invalidate_mapping_pages(struct address_space *mapping,
|
||||
pgoff_t start, pgoff_t end);
|
||||
|
||||
void invalidate_mapping_pagevec(struct address_space *mapping,
|
||||
pgoff_t start, pgoff_t end,
|
||||
unsigned long *nr_pagevec);
|
||||
|
||||
static inline void invalidate_remote_inode(struct inode *inode)
|
||||
{
|
||||
if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
|
||||
S_ISLNK(inode->i_mode))
|
||||
invalidate_mapping_pages(inode->i_mapping, 0, -1);
|
||||
}
|
||||
extern int invalidate_inode_pages2(struct address_space *mapping);
|
||||
extern int invalidate_inode_pages2_range(struct address_space *mapping,
|
||||
pgoff_t start, pgoff_t end);
|
||||
extern int write_inode_now(struct inode *, int);
|
||||
extern int filemap_fdatawrite(struct address_space *);
|
||||
extern int filemap_flush(struct address_space *);
|
||||
extern int filemap_fdatawait_keep_errors(struct address_space *mapping);
|
||||
extern int filemap_fdatawait_range(struct address_space *, loff_t lstart,
|
||||
loff_t lend);
|
||||
extern int filemap_fdatawait_range_keep_errors(struct address_space *mapping,
|
||||
loff_t start_byte, loff_t end_byte);
|
||||
|
||||
static inline int filemap_fdatawait(struct address_space *mapping)
|
||||
{
|
||||
return filemap_fdatawait_range(mapping, 0, LLONG_MAX);
|
||||
}
|
||||
|
||||
extern bool filemap_range_has_page(struct address_space *, loff_t lstart,
|
||||
loff_t lend);
|
||||
extern int filemap_write_and_wait_range(struct address_space *mapping,
|
||||
loff_t lstart, loff_t lend);
|
||||
extern int __filemap_fdatawrite_range(struct address_space *mapping,
|
||||
loff_t start, loff_t end, int sync_mode);
|
||||
extern int filemap_fdatawrite_range(struct address_space *mapping,
|
||||
loff_t start, loff_t end);
|
||||
extern int filemap_check_errors(struct address_space *mapping);
|
||||
extern void __filemap_set_wb_err(struct address_space *mapping, int err);
|
||||
int filemap_fdatawrite_wbc(struct address_space *mapping,
|
||||
struct writeback_control *wbc);
|
||||
|
||||
static inline int filemap_write_and_wait(struct address_space *mapping)
|
||||
{
|
||||
return filemap_write_and_wait_range(mapping, 0, LLONG_MAX);
|
||||
}
|
||||
|
||||
extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart,
|
||||
loff_t lend);
|
||||
extern int __must_check file_check_and_advance_wb_err(struct file *file);
|
||||
@ -2812,67 +2764,6 @@ static inline int file_write_and_wait(struct file *file)
|
||||
return file_write_and_wait_range(file, 0, LLONG_MAX);
|
||||
}
|
||||
|
||||
/**
|
||||
* filemap_set_wb_err - set a writeback error on an address_space
|
||||
* @mapping: mapping in which to set writeback error
|
||||
* @err: error to be set in mapping
|
||||
*
|
||||
* When writeback fails in some way, we must record that error so that
|
||||
* userspace can be informed when fsync and the like are called. We endeavor
|
||||
* to report errors on any file that was open at the time of the error. Some
|
||||
* internal callers also need to know when writeback errors have occurred.
|
||||
*
|
||||
* When a writeback error occurs, most filesystems will want to call
|
||||
* filemap_set_wb_err to record the error in the mapping so that it will be
|
||||
* automatically reported whenever fsync is called on the file.
|
||||
*/
|
||||
static inline void filemap_set_wb_err(struct address_space *mapping, int err)
|
||||
{
|
||||
/* Fastpath for common case of no error */
|
||||
if (unlikely(err))
|
||||
__filemap_set_wb_err(mapping, err);
|
||||
}
|
||||
|
||||
/**
|
||||
* filemap_check_wb_err - has an error occurred since the mark was sampled?
|
||||
* @mapping: mapping to check for writeback errors
|
||||
* @since: previously-sampled errseq_t
|
||||
*
|
||||
* Grab the errseq_t value from the mapping, and see if it has changed "since"
|
||||
* the given value was sampled.
|
||||
*
|
||||
* If it has then report the latest error set, otherwise return 0.
|
||||
*/
|
||||
static inline int filemap_check_wb_err(struct address_space *mapping,
|
||||
errseq_t since)
|
||||
{
|
||||
return errseq_check(&mapping->wb_err, since);
|
||||
}
|
||||
|
||||
/**
|
||||
* filemap_sample_wb_err - sample the current errseq_t to test for later errors
|
||||
* @mapping: mapping to be sampled
|
||||
*
|
||||
* Writeback errors are always reported relative to a particular sample point
|
||||
* in the past. This function provides those sample points.
|
||||
*/
|
||||
static inline errseq_t filemap_sample_wb_err(struct address_space *mapping)
|
||||
{
|
||||
return errseq_sample(&mapping->wb_err);
|
||||
}
|
||||
|
||||
/**
|
||||
* file_sample_sb_err - sample the current errseq_t to test for later errors
|
||||
* @file: file pointer to be sampled
|
||||
*
|
||||
* Grab the most current superblock-level errseq_t value for the given
|
||||
* struct file.
|
||||
*/
|
||||
static inline errseq_t file_sample_sb_err(struct file *file)
|
||||
{
|
||||
return errseq_sample(&file->f_path.dentry->d_sb->s_wb_err);
|
||||
}
|
||||
|
||||
extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end,
|
||||
int datasync);
|
||||
extern int vfs_fsync(struct file *file, int datasync);
|
||||
@ -3627,15 +3518,4 @@ extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len,
|
||||
extern int generic_fadvise(struct file *file, loff_t offset, loff_t len,
|
||||
int advice);
|
||||
|
||||
/*
|
||||
* Flush file data before changing attributes. Caller must hold any locks
|
||||
* required to prevent further writes to this file until we're done setting
|
||||
* flags.
|
||||
*/
|
||||
static inline int inode_drain_writes(struct inode *inode)
|
||||
{
|
||||
inode_dio_wait(inode);
|
||||
return filemap_write_and_wait(inode->i_mapping);
|
||||
}
|
||||
|
||||
#endif /* _LINUX_FS_H */
|
||||
|
@ -9,14 +9,9 @@
|
||||
#ifndef LINUX_HMM_H
|
||||
#define LINUX_HMM_H
|
||||
|
||||
#include <linux/kconfig.h>
|
||||
#include <linux/pgtable.h>
|
||||
#include <linux/mm.h>
|
||||
|
||||
#include <linux/device.h>
|
||||
#include <linux/migrate.h>
|
||||
#include <linux/memremap.h>
|
||||
#include <linux/completion.h>
|
||||
#include <linux/mmu_notifier.h>
|
||||
struct mmu_interval_notifier;
|
||||
|
||||
/*
|
||||
* On output:
|
||||
|
@ -185,7 +185,7 @@ void prep_transhuge_page(struct page *page);
|
||||
void free_transhuge_page(struct page *page);
|
||||
bool is_transparent_hugepage(struct page *page);
|
||||
|
||||
bool can_split_huge_page(struct page *page, int *pextra_pins);
|
||||
bool can_split_folio(struct folio *folio, int *pextra_pins);
|
||||
int split_huge_page_to_list(struct page *page, struct list_head *list);
|
||||
static inline int split_huge_page(struct page *page)
|
||||
{
|
||||
@ -194,7 +194,7 @@ static inline int split_huge_page(struct page *page)
|
||||
void deferred_split_huge_page(struct page *page);
|
||||
|
||||
void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
|
||||
unsigned long address, bool freeze, struct page *page);
|
||||
unsigned long address, bool freeze, struct folio *folio);
|
||||
|
||||
#define split_huge_pmd(__vma, __pmd, __address) \
|
||||
do { \
|
||||
@ -207,7 +207,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
|
||||
|
||||
|
||||
void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
|
||||
bool freeze, struct page *page);
|
||||
bool freeze, struct folio *folio);
|
||||
|
||||
void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
|
||||
unsigned long address);
|
||||
@ -250,30 +250,6 @@ static inline spinlock_t *pud_trans_huge_lock(pud_t *pud,
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
* thp_order - Order of a transparent huge page.
|
||||
* @page: Head page of a transparent huge page.
|
||||
*/
|
||||
static inline unsigned int thp_order(struct page *page)
|
||||
{
|
||||
VM_BUG_ON_PGFLAGS(PageTail(page), page);
|
||||
if (PageHead(page))
|
||||
return HPAGE_PMD_ORDER;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* thp_nr_pages - The number of regular pages in this huge page.
|
||||
* @page: The head page of a huge page.
|
||||
*/
|
||||
static inline int thp_nr_pages(struct page *page)
|
||||
{
|
||||
VM_BUG_ON_PGFLAGS(PageTail(page), page);
|
||||
if (PageHead(page))
|
||||
return HPAGE_PMD_NR;
|
||||
return 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* folio_test_pmd_mappable - Can we map this folio with a PMD?
|
||||
* @folio: The folio to test
|
||||
@ -336,18 +312,6 @@ static inline struct list_head *page_deferred_list(struct page *page)
|
||||
#define HPAGE_PUD_MASK ({ BUILD_BUG(); 0; })
|
||||
#define HPAGE_PUD_SIZE ({ BUILD_BUG(); 0; })
|
||||
|
||||
static inline unsigned int thp_order(struct page *page)
|
||||
{
|
||||
VM_BUG_ON_PGFLAGS(PageTail(page), page);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int thp_nr_pages(struct page *page)
|
||||
{
|
||||
VM_BUG_ON_PGFLAGS(PageTail(page), page);
|
||||
return 1;
|
||||
}
|
||||
|
||||
static inline bool folio_test_pmd_mappable(struct folio *folio)
|
||||
{
|
||||
return false;
|
||||
@ -387,7 +351,7 @@ static inline bool is_transparent_hugepage(struct page *page)
|
||||
#define thp_get_unmapped_area NULL
|
||||
|
||||
static inline bool
|
||||
can_split_huge_page(struct page *page, int *pextra_pins)
|
||||
can_split_folio(struct folio *folio, int *pextra_pins)
|
||||
{
|
||||
BUILD_BUG();
|
||||
return false;
|
||||
@ -406,9 +370,9 @@ static inline void deferred_split_huge_page(struct page *page) {}
|
||||
do { } while (0)
|
||||
|
||||
static inline void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
|
||||
unsigned long address, bool freeze, struct page *page) {}
|
||||
unsigned long address, bool freeze, struct folio *folio) {}
|
||||
static inline void split_huge_pmd_address(struct vm_area_struct *vma,
|
||||
unsigned long address, bool freeze, struct page *page) {}
|
||||
unsigned long address, bool freeze, struct folio *folio) {}
|
||||
|
||||
#define split_huge_pud(__vma, __pmd, __address) \
|
||||
do { } while (0)
|
||||
@ -483,15 +447,10 @@ static inline bool thp_migration_supported(void)
|
||||
}
|
||||
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
|
||||
|
||||
/**
|
||||
* thp_size - Size of a transparent huge page.
|
||||
* @page: Head page of a transparent huge page.
|
||||
*
|
||||
* Return: Number of bytes in this page.
|
||||
*/
|
||||
static inline unsigned long thp_size(struct page *page)
|
||||
static inline int split_folio_to_list(struct folio *folio,
|
||||
struct list_head *list)
|
||||
{
|
||||
return PAGE_SIZE << thp_order(page);
|
||||
return split_huge_page_to_list(&folio->page, list);
|
||||
}
|
||||
|
||||
#endif /* _LINUX_HUGE_MM_H */
|
||||
|
@ -970,6 +970,11 @@ static inline struct hstate *page_hstate(struct page *page)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
 * This variant ignores @size and always returns NULL.
 * NOTE(review): presumably the stub used when hugetlb support is compiled
 * out; confirm against the enclosing #ifdef, which is not visible here.
 */
static inline struct hstate *size_to_hstate(unsigned long size)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static inline unsigned long huge_page_size(struct hstate *h)
|
||||
{
|
||||
return PAGE_SIZE;
|
||||
|
@ -51,7 +51,7 @@ static inline void ksm_exit(struct mm_struct *mm)
|
||||
struct page *ksm_might_need_to_copy(struct page *page,
|
||||
struct vm_area_struct *vma, unsigned long address);
|
||||
|
||||
void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc);
|
||||
void rmap_walk_ksm(struct folio *folio, const struct rmap_walk_control *rwc);
|
||||
void folio_migrate_ksm(struct folio *newfolio, struct folio *folio);
|
||||
|
||||
#else /* !CONFIG_KSM */
|
||||
@ -78,8 +78,8 @@ static inline struct page *ksm_might_need_to_copy(struct page *page,
|
||||
return page;
|
||||
}
|
||||
|
||||
static inline void rmap_walk_ksm(struct page *page,
|
||||
struct rmap_walk_control *rwc)
|
||||
static inline void rmap_walk_ksm(struct folio *folio,
|
||||
const struct rmap_walk_control *rwc)
|
||||
{
|
||||
}
|
||||
|
||||
|
@ -1,6 +1,8 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _LINUX_MEMREMAP_H_
|
||||
#define _LINUX_MEMREMAP_H_
|
||||
|
||||
#include <linux/mm.h>
|
||||
#include <linux/range.h>
|
||||
#include <linux/ioport.h>
|
||||
#include <linux/percpu-refcount.h>
|
||||
@ -66,9 +68,9 @@ enum memory_type {
|
||||
|
||||
struct dev_pagemap_ops {
|
||||
/*
|
||||
* Called once the page refcount reaches 1. (ZONE_DEVICE pages never
|
||||
* reach 0 refcount unless there is a refcount bug. This allows the
|
||||
* device driver to implement its own memory management.)
|
||||
* Called once the page refcount reaches 0. The reference count will be
|
||||
* reset to one by the core code after the method is called to prepare
|
||||
* for handing out the page again.
|
||||
*/
|
||||
void (*page_free)(struct page *page);
|
||||
|
||||
@ -129,6 +131,25 @@ static inline unsigned long pgmap_vmemmap_nr(struct dev_pagemap *pgmap)
|
||||
return 1 << pgmap->vmemmap_shift;
|
||||
}
|
||||
|
||||
/*
 * Return true only if CONFIG_DEVICE_PRIVATE is enabled, is_zone_device_page()
 * holds for @page, and the page's pgmap has type MEMORY_DEVICE_PRIVATE.
 * The && chain short-circuits, so page->pgmap is dereferenced only after the
 * zone-device check has succeeded.
 */
static inline bool is_device_private_page(const struct page *page)
|
||||
{
|
||||
return IS_ENABLED(CONFIG_DEVICE_PRIVATE) &&
|
||||
is_zone_device_page(page) &&
|
||||
page->pgmap->type == MEMORY_DEVICE_PRIVATE;
|
||||
}
|
||||
|
||||
/* Folio wrapper: reports is_device_private_page() for &folio->page. */
static inline bool folio_is_device_private(const struct folio *folio)
|
||||
{
|
||||
return is_device_private_page(&folio->page);
|
||||
}
|
||||
|
||||
/*
 * Return true only if CONFIG_PCI_P2PDMA is enabled, is_zone_device_page()
 * holds for @page, and the page's pgmap has type MEMORY_DEVICE_PCI_P2PDMA.
 * The && chain short-circuits, so page->pgmap is dereferenced only after the
 * zone-device check has succeeded.
 */
static inline bool is_pci_p2pdma_page(const struct page *page)
|
||||
{
|
||||
return IS_ENABLED(CONFIG_PCI_P2PDMA) &&
|
||||
is_zone_device_page(page) &&
|
||||
page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_ZONE_DEVICE
|
||||
void *memremap_pages(struct dev_pagemap *pgmap, int nid);
|
||||
void memunmap_pages(struct dev_pagemap *pgmap);
|
||||
|
@ -3,9 +3,6 @@
|
||||
#define _LINUX_MM_H
|
||||
|
||||
#include <linux/errno.h>
|
||||
|
||||
#ifdef __KERNEL__
|
||||
|
||||
#include <linux/mmdebug.h>
|
||||
#include <linux/gfp.h>
|
||||
#include <linux/bug.h>
|
||||
@ -26,7 +23,6 @@
|
||||
#include <linux/err.h>
|
||||
#include <linux/page-flags.h>
|
||||
#include <linux/page_ref.h>
|
||||
#include <linux/memremap.h>
|
||||
#include <linux/overflow.h>
|
||||
#include <linux/sizes.h>
|
||||
#include <linux/sched.h>
|
||||
@ -216,8 +212,10 @@ int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *,
|
||||
|
||||
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
|
||||
#define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))
|
||||
#define folio_page_idx(folio, p) (page_to_pfn(p) - folio_pfn(folio))
|
||||
#else
|
||||
#define nth_page(page,n) ((page) + (n))
|
||||
#define folio_page_idx(folio, p) ((p) - &(folio)->page)
|
||||
#endif
|
||||
|
||||
/* to align the pointer to the (next) page boundary */
|
||||
@ -227,6 +225,10 @@ int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *,
|
||||
#define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PAGE_SIZE)
|
||||
|
||||
#define lru_to_page(head) (list_entry((head)->prev, struct page, lru))
|
||||
/*
 * Return the folio whose 'lru' member is the list entry at head->prev.
 * No emptiness check is made here.
 */
static inline struct folio *lru_to_folio(struct list_head *head)
|
||||
{
|
||||
return list_entry((head)->prev, struct folio, lru);
|
||||
}
|
||||
|
||||
void setup_initial_init_mm(void *start_code, void *end_code,
|
||||
void *end_data, void *brk);
|
||||
@ -775,21 +777,26 @@ static inline int is_vmalloc_or_module_addr(const void *x)
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline int head_compound_mapcount(struct page *head)
|
||||
/*
|
||||
* How many times the entire folio is mapped as a single unit (eg by a
|
||||
* PMD or PUD entry). This is probably not what you want, except for
|
||||
* debugging purposes; look at folio_mapcount() or page_mapcount()
|
||||
* instead.
|
||||
*/
|
||||
static inline int folio_entire_mapcount(struct folio *folio)
|
||||
{
|
||||
return atomic_read(compound_mapcount_ptr(head)) + 1;
|
||||
VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
|
||||
return atomic_read(folio_mapcount_ptr(folio)) + 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Mapcount of compound page as a whole, does not include mapped sub-pages.
|
||||
*
|
||||
* Must be called only for compound pages or any their tail sub-pages.
|
||||
* Must be called only for compound pages.
|
||||
*/
|
||||
static inline int compound_mapcount(struct page *page)
|
||||
{
|
||||
VM_BUG_ON_PAGE(!PageCompound(page), page);
|
||||
page = compound_head(page);
|
||||
return head_compound_mapcount(page);
|
||||
return folio_entire_mapcount(page_folio(page));
|
||||
}
|
||||
|
||||
/*
|
||||
@ -819,8 +826,14 @@ static inline int page_mapcount(struct page *page)
|
||||
return atomic_read(&page->_mapcount) + 1;
|
||||
}
|
||||
|
||||
int folio_mapcount(struct folio *folio);
|
||||
|
||||
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
||||
int total_mapcount(struct page *page);
|
||||
/* Page-based wrapper: returns folio_mapcount() of page_folio(@page). */
static inline int total_mapcount(struct page *page)
|
||||
{
|
||||
return folio_mapcount(page_folio(page));
|
||||
}
|
||||
|
||||
int page_trans_huge_mapcount(struct page *page);
|
||||
#else
|
||||
static inline int total_mapcount(struct page *page)
|
||||
@ -890,33 +903,17 @@ static inline void destroy_compound_page(struct page *page)
|
||||
compound_page_dtors[page[1].compound_dtor](page);
|
||||
}
|
||||
|
||||
static inline bool hpage_pincount_available(struct page *page)
|
||||
{
|
||||
/*
|
||||
* Can the page->hpage_pinned_refcount field be used? That field is in
|
||||
* the 3rd page of the compound page, so the smallest (2-page) compound
|
||||
* pages cannot support it.
|
||||
*/
|
||||
page = compound_head(page);
|
||||
return PageCompound(page) && compound_order(page) > 1;
|
||||
}
|
||||
|
||||
static inline int head_compound_pincount(struct page *head)
|
||||
{
|
||||
return atomic_read(compound_pincount_ptr(head));
|
||||
}
|
||||
|
||||
static inline int compound_pincount(struct page *page)
|
||||
{
|
||||
VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);
|
||||
page = compound_head(page);
|
||||
return head_compound_pincount(page);
|
||||
}
|
||||
|
||||
static inline void set_compound_order(struct page *page, unsigned int order)
|
||||
{
|
||||
page[1].compound_order = order;
|
||||
#ifdef CONFIG_64BIT
|
||||
page[1].compound_nr = 1U << order;
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Returns the number of pages in this potentially compound page. */
|
||||
@ -924,7 +921,11 @@ static inline unsigned long compound_nr(struct page *page)
|
||||
{
|
||||
if (!PageHead(page))
|
||||
return 1;
|
||||
#ifdef CONFIG_64BIT
|
||||
return page[1].compound_nr;
|
||||
#else
|
||||
return 1UL << compound_order(page);
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Returns the number of bytes in this potentially compound page. */
|
||||
@ -939,6 +940,37 @@ static inline unsigned int page_shift(struct page *page)
|
||||
return PAGE_SHIFT + compound_order(page);
|
||||
}
|
||||
|
||||
/**
|
||||
* thp_order - Order of a transparent huge page.
|
||||
* @page: Head page of a transparent huge page.
|
||||
*/
|
||||
static inline unsigned int thp_order(struct page *page)
|
||||
{
|
||||
VM_BUG_ON_PGFLAGS(PageTail(page), page);
|
||||
return compound_order(page);
|
||||
}
|
||||
|
||||
/**
|
||||
* thp_nr_pages - The number of regular pages in this huge page.
|
||||
* @page: The head page of a huge page.
|
||||
*/
|
||||
static inline int thp_nr_pages(struct page *page)
|
||||
{
|
||||
VM_BUG_ON_PGFLAGS(PageTail(page), page);
|
||||
return compound_nr(page);
|
||||
}
|
||||
|
||||
/**
|
||||
* thp_size - Size of a transparent huge page.
|
||||
* @page: Head page of a transparent huge page.
|
||||
*
|
||||
* Return: Number of bytes in this page.
|
||||
*/
|
||||
static inline unsigned long thp_size(struct page *page)
|
||||
{
|
||||
return PAGE_SIZE << thp_order(page);
|
||||
}
|
||||
|
||||
void free_compound_page(struct page *page);
|
||||
|
||||
#ifdef CONFIG_MMU
|
||||
@ -1090,59 +1122,35 @@ static inline bool is_zone_device_page(const struct page *page)
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Folio wrapper: reports is_zone_device_page() for &folio->page. */
static inline bool folio_is_zone_device(const struct folio *folio)
|
||||
{
|
||||
return is_zone_device_page(&folio->page);
|
||||
}
|
||||
|
||||
static inline bool is_zone_movable_page(const struct page *page)
|
||||
{
|
||||
return page_zonenum(page) == ZONE_MOVABLE;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_DEV_PAGEMAP_OPS
|
||||
void free_devmap_managed_page(struct page *page);
|
||||
#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX)
|
||||
DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
|
||||
|
||||
static inline bool page_is_devmap_managed(struct page *page)
|
||||
bool __put_devmap_managed_page(struct page *page);
|
||||
static inline bool put_devmap_managed_page(struct page *page)
|
||||
{
|
||||
if (!static_branch_unlikely(&devmap_managed_key))
|
||||
return false;
|
||||
if (!is_zone_device_page(page))
|
||||
return false;
|
||||
switch (page->pgmap->type) {
|
||||
case MEMORY_DEVICE_PRIVATE:
|
||||
case MEMORY_DEVICE_FS_DAX:
|
||||
return true;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
return false;
|
||||
return __put_devmap_managed_page(page);
|
||||
}
|
||||
|
||||
void put_devmap_managed_page(struct page *page);
|
||||
|
||||
#else /* CONFIG_DEV_PAGEMAP_OPS */
|
||||
static inline bool page_is_devmap_managed(struct page *page)
|
||||
#else /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */
|
||||
static inline bool put_devmap_managed_page(struct page *page)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline void put_devmap_managed_page(struct page *page)
|
||||
{
|
||||
}
|
||||
#endif /* CONFIG_DEV_PAGEMAP_OPS */
|
||||
|
||||
static inline bool is_device_private_page(const struct page *page)
|
||||
{
|
||||
return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
|
||||
IS_ENABLED(CONFIG_DEVICE_PRIVATE) &&
|
||||
is_zone_device_page(page) &&
|
||||
page->pgmap->type == MEMORY_DEVICE_PRIVATE;
|
||||
}
|
||||
|
||||
static inline bool is_pci_p2pdma_page(const struct page *page)
|
||||
{
|
||||
return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
|
||||
IS_ENABLED(CONFIG_PCI_P2PDMA) &&
|
||||
is_zone_device_page(page) &&
|
||||
page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA;
|
||||
}
|
||||
#endif /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */
|
||||
|
||||
/* 127: arbitrary random number, small enough to assemble well */
|
||||
#define folio_ref_zero_or_close_to_overflow(folio) \
|
||||
@ -1168,9 +1176,6 @@ static inline void get_page(struct page *page)
|
||||
}
|
||||
|
||||
bool __must_check try_grab_page(struct page *page, unsigned int flags);
|
||||
struct page *try_grab_compound_head(struct page *page, int refs,
|
||||
unsigned int flags);
|
||||
|
||||
|
||||
static inline __must_check bool try_get_page(struct page *page)
|
||||
{
|
||||
@ -1225,16 +1230,11 @@ static inline void put_page(struct page *page)
|
||||
struct folio *folio = page_folio(page);
|
||||
|
||||
/*
|
||||
* For devmap managed pages we need to catch refcount transition from
|
||||
* 2 to 1, when refcount reach one it means the page is free and we
|
||||
* need to inform the device driver through callback. See
|
||||
* include/linux/memremap.h and HMM for details.
|
||||
* For some devmap managed pages we need to catch refcount transition
|
||||
* from 2 to 1:
|
||||
*/
|
||||
if (page_is_devmap_managed(&folio->page)) {
|
||||
put_devmap_managed_page(&folio->page);
|
||||
if (put_devmap_managed_page(&folio->page))
|
||||
return;
|
||||
}
|
||||
|
||||
folio_put(folio);
|
||||
}
|
||||
|
||||
@ -1264,10 +1264,9 @@ static inline void put_page(struct page *page)
|
||||
* applications that don't have huge page reference counts, this won't be an
|
||||
* issue.
|
||||
*
|
||||
* Locking: the lockless algorithm described in page_cache_get_speculative()
|
||||
* and page_cache_gup_pin_speculative() provides safe operation for
|
||||
* get_user_pages and page_mkclean and other calls that race to set up page
|
||||
* table entries.
|
||||
* Locking: the lockless algorithm described in folio_try_get_rcu()
|
||||
* provides safe operation for get_user_pages(), page_mkclean() and
|
||||
* other calls that race to set up page table entries.
|
||||
*/
|
||||
#define GUP_PIN_COUNTING_BIAS (1U << 10)
|
||||
|
||||
@ -1278,70 +1277,11 @@ void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
|
||||
bool make_dirty);
|
||||
void unpin_user_pages(struct page **pages, unsigned long npages);
|
||||
|
||||
/**
|
||||
* page_maybe_dma_pinned - Report if a page is pinned for DMA.
|
||||
* @page: The page.
|
||||
*
|
||||
* This function checks if a page has been pinned via a call to
|
||||
* a function in the pin_user_pages() family.
|
||||
*
|
||||
* For non-huge pages, the return value is partially fuzzy: false is not fuzzy,
|
||||
* because it means "definitely not pinned for DMA", but true means "probably
|
||||
* pinned for DMA, but possibly a false positive due to having at least
|
||||
* GUP_PIN_COUNTING_BIAS worth of normal page references".
|
||||
*
|
||||
* False positives are OK, because: a) it's unlikely for a page to get that many
|
||||
* refcounts, and b) all the callers of this routine are expected to be able to
|
||||
* deal gracefully with a false positive.
|
||||
*
|
||||
* For huge pages, the result will be exactly correct. That's because we have
|
||||
* more tracking data available: the 3rd struct page in the compound page is
|
||||
* used to track the pincount (instead using of the GUP_PIN_COUNTING_BIAS
|
||||
* scheme).
|
||||
*
|
||||
* For more information, please see Documentation/core-api/pin_user_pages.rst.
|
||||
*
|
||||
* Return: True, if it is likely that the page has been "dma-pinned".
|
||||
* False, if the page is definitely not dma-pinned.
|
||||
*/
|
||||
static inline bool page_maybe_dma_pinned(struct page *page)
|
||||
{
|
||||
if (hpage_pincount_available(page))
|
||||
return compound_pincount(page) > 0;
|
||||
|
||||
/*
|
||||
* page_ref_count() is signed. If that refcount overflows, then
|
||||
* page_ref_count() returns a negative value, and callers will avoid
|
||||
* further incrementing the refcount.
|
||||
*
|
||||
* Here, for that overflow case, use the signed bit to count a little
|
||||
* bit higher via unsigned math, and thus still get an accurate result.
|
||||
*/
|
||||
return ((unsigned int)page_ref_count(compound_head(page))) >=
|
||||
GUP_PIN_COUNTING_BIAS;
|
||||
}
|
||||
|
||||
static inline bool is_cow_mapping(vm_flags_t flags)
|
||||
{
|
||||
return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
|
||||
}
|
||||
|
||||
/*
|
||||
* This should most likely only be called during fork() to see whether we
|
||||
* should break the cow immediately for a page on the src mm.
|
||||
*/
|
||||
static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma,
|
||||
struct page *page)
|
||||
{
|
||||
if (!is_cow_mapping(vma->vm_flags))
|
||||
return false;
|
||||
|
||||
if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags))
|
||||
return false;
|
||||
|
||||
return page_maybe_dma_pinned(page);
|
||||
}
|
||||
|
||||
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
|
||||
#define SECTION_IN_PAGE_FLAGS
|
||||
#endif
|
||||
@ -1586,6 +1526,74 @@ static inline unsigned long folio_pfn(struct folio *folio)
|
||||
return page_to_pfn(&folio->page);
|
||||
}
|
||||
|
||||
/*
 * Return a pointer to the compound_pincount field stored in the folio's
 * page at index 1 (folio_page(folio, 1)).
 * NOTE(review): this reads the second page of the folio, so it presumably
 * must only be used on large folios; confirm that callers check
 * folio_test_large() first.
 */
static inline atomic_t *folio_pincount_ptr(struct folio *folio)
|
||||
{
|
||||
return &folio_page(folio, 1)->compound_pincount;
|
||||
}
|
||||
|
||||
/**
|
||||
* folio_maybe_dma_pinned - Report if a folio may be pinned for DMA.
|
||||
* @folio: The folio.
|
||||
*
|
||||
* This function checks if a folio has been pinned via a call to
|
||||
* a function in the pin_user_pages() family.
|
||||
*
|
||||
* For small folios, the return value is partially fuzzy: false is not fuzzy,
|
||||
* because it means "definitely not pinned for DMA", but true means "probably
|
||||
* pinned for DMA, but possibly a false positive due to having at least
|
||||
* GUP_PIN_COUNTING_BIAS worth of normal folio references".
|
||||
*
|
||||
* False positives are OK, because: a) it's unlikely for a folio to
|
||||
* get that many refcounts, and b) all the callers of this routine are
|
||||
* expected to be able to deal gracefully with a false positive.
|
||||
*
|
||||
* For large folios, the result will be exactly correct. That's because
|
||||
* we have more tracking data available: the compound_pincount is used
|
||||
* instead of the GUP_PIN_COUNTING_BIAS scheme.
|
||||
*
|
||||
* For more information, please see Documentation/core-api/pin_user_pages.rst.
|
||||
*
|
||||
* Return: True, if it is likely that the folio has been "dma-pinned".
|
||||
* False, if the folio is definitely not dma-pinned.
|
||||
*/
|
||||
static inline bool folio_maybe_dma_pinned(struct folio *folio)
|
||||
{
|
||||
if (folio_test_large(folio))
|
||||
return atomic_read(folio_pincount_ptr(folio)) > 0;
|
||||
|
||||
/*
|
||||
* folio_ref_count() is signed. If that refcount overflows, then
|
||||
* folio_ref_count() returns a negative value, and callers will avoid
|
||||
* further incrementing the refcount.
|
||||
*
|
||||
* Here, for that overflow case, use the sign bit to count a little
|
||||
* bit higher via unsigned math, and thus still get an accurate result.
|
||||
*/
|
||||
return ((unsigned int)folio_ref_count(folio)) >=
|
||||
GUP_PIN_COUNTING_BIAS;
|
||||
}
|
||||
|
||||
/*
 * Page-based wrapper: converts @page to its folio with page_folio() and
 * returns folio_maybe_dma_pinned() for that folio.
 */
static inline bool page_maybe_dma_pinned(struct page *page)
|
||||
{
|
||||
return folio_maybe_dma_pinned(page_folio(page));
|
||||
}
|
||||
|
||||
/*
|
||||
* This should most likely only be called during fork() to see whether we
|
||||
* should break the cow immediately for a page on the src mm.
|
||||
*/
|
||||
static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma,
|
||||
struct page *page)
|
||||
{
|
||||
if (!is_cow_mapping(vma->vm_flags))
|
||||
return false;
|
||||
|
||||
if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags))
|
||||
return false;
|
||||
|
||||
return page_maybe_dma_pinned(page);
|
||||
}
|
||||
|
||||
/* MIGRATE_CMA and ZONE_MOVABLE do not allow pinning pages */
|
||||
#ifdef CONFIG_MIGRATION
|
||||
static inline bool is_pinnable_page(struct page *page)
|
||||
@ -1600,6 +1608,11 @@ static inline bool is_pinnable_page(struct page *page)
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Folio wrapper: reports is_pinnable_page() for &folio->page. */
static inline bool folio_is_pinnable(struct folio *folio)
|
||||
{
|
||||
return is_pinnable_page(&folio->page);
|
||||
}
|
||||
|
||||
static inline void set_page_zone(struct page *page, enum zone_type zone)
|
||||
{
|
||||
page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
|
||||
@ -1749,7 +1762,6 @@ static inline void *folio_address(const struct folio *folio)
|
||||
}
|
||||
|
||||
extern void *page_rmapping(struct page *page);
|
||||
extern struct anon_vma *page_anon_vma(struct page *page);
|
||||
extern pgoff_t __page_file_index(struct page *page);
|
||||
|
||||
/*
|
||||
@ -1855,7 +1867,6 @@ extern void truncate_setsize(struct inode *inode, loff_t newsize);
|
||||
void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to);
|
||||
void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end);
|
||||
int generic_error_remove_page(struct address_space *mapping, struct page *page);
|
||||
int invalidate_inode_page(struct page *page);
|
||||
|
||||
#ifdef CONFIG_MMU
|
||||
extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
|
||||
@ -2921,13 +2932,11 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
|
||||
#define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */
|
||||
#define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO
|
||||
* and return without waiting upon it */
|
||||
#define FOLL_POPULATE 0x40 /* fault in pages (with FOLL_MLOCK) */
|
||||
#define FOLL_NOFAULT 0x80 /* do not fault in pages */
|
||||
#define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */
|
||||
#define FOLL_NUMA 0x200 /* force NUMA hinting page fault */
|
||||
#define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */
|
||||
#define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */
|
||||
#define FOLL_MLOCK 0x1000 /* lock present pages */
|
||||
#define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */
|
||||
#define FOLL_COW 0x4000 /* internal GUP flag */
|
||||
#define FOLL_ANON 0x8000 /* don't do file mappings */
|
||||
@ -3381,5 +3390,4 @@ madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __KERNEL__ */
|
||||
#endif /* _LINUX_MM_H */
|
||||
|
@ -99,7 +99,8 @@ void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio)
|
||||
|
||||
update_lru_size(lruvec, lru, folio_zonenum(folio),
|
||||
folio_nr_pages(folio));
|
||||
list_add(&folio->lru, &lruvec->lists[lru]);
|
||||
if (lru != LRU_UNEVICTABLE)
|
||||
list_add(&folio->lru, &lruvec->lists[lru]);
|
||||
}
|
||||
|
||||
static __always_inline void add_page_to_lru_list(struct page *page,
|
||||
@ -115,6 +116,7 @@ void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio)
|
||||
|
||||
update_lru_size(lruvec, lru, folio_zonenum(folio),
|
||||
folio_nr_pages(folio));
|
||||
/* This is not expected to be used on LRU_UNEVICTABLE */
|
||||
list_add_tail(&folio->lru, &lruvec->lists[lru]);
|
||||
}
|
||||
|
||||
@ -127,8 +129,11 @@ static __always_inline void add_page_to_lru_list_tail(struct page *page,
|
||||
static __always_inline
|
||||
void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio)
|
||||
{
|
||||
list_del(&folio->lru);
|
||||
update_lru_size(lruvec, folio_lru_list(folio), folio_zonenum(folio),
|
||||
enum lru_list lru = folio_lru_list(folio);
|
||||
|
||||
if (lru != LRU_UNEVICTABLE)
|
||||
list_del(&folio->lru);
|
||||
update_lru_size(lruvec, lru, folio_zonenum(folio),
|
||||
-folio_nr_pages(folio));
|
||||
}
|
||||
|
||||
|
@ -85,7 +85,16 @@ struct page {
|
||||
* lruvec->lru_lock. Sometimes used as a generic list
|
||||
* by the page owner.
|
||||
*/
|
||||
struct list_head lru;
|
||||
union {
|
||||
struct list_head lru;
|
||||
/* Or, for the Unevictable "LRU list" slot */
|
||||
struct {
|
||||
/* Always even, to negate PageTail */
|
||||
void *__filler;
|
||||
/* Count page's or folio's mlocks */
|
||||
unsigned int mlock_count;
|
||||
};
|
||||
};
|
||||
/* See page-flags.h for PAGE_MAPPING_FLAGS */
|
||||
struct address_space *mapping;
|
||||
pgoff_t index; /* Our offset within mapping. */
|
||||
@ -126,11 +135,14 @@ struct page {
|
||||
unsigned char compound_dtor;
|
||||
unsigned char compound_order;
|
||||
atomic_t compound_mapcount;
|
||||
atomic_t compound_pincount;
|
||||
#ifdef CONFIG_64BIT
|
||||
unsigned int compound_nr; /* 1 << compound_order */
|
||||
#endif
|
||||
};
|
||||
struct { /* Second tail page of compound page */
|
||||
unsigned long _compound_pad_1; /* compound_head */
|
||||
atomic_t hpage_pinned_refcount;
|
||||
unsigned long _compound_pad_2;
|
||||
/* For both global and memcg */
|
||||
struct list_head deferred_list;
|
||||
};
|
||||
@ -241,7 +253,13 @@ struct folio {
|
||||
struct {
|
||||
/* public: */
|
||||
unsigned long flags;
|
||||
struct list_head lru;
|
||||
union {
|
||||
struct list_head lru;
|
||||
struct {
|
||||
void *__filler;
|
||||
unsigned int mlock_count;
|
||||
};
|
||||
};
|
||||
struct address_space *mapping;
|
||||
pgoff_t index;
|
||||
void *private;
|
||||
@ -285,7 +303,7 @@ static inline atomic_t *compound_mapcount_ptr(struct page *page)
|
||||
|
||||
static inline atomic_t *compound_pincount_ptr(struct page *page)
|
||||
{
|
||||
return &page[2].hpage_pinned_refcount;
|
||||
return &page[1].compound_pincount;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -18,6 +18,120 @@
|
||||
|
||||
struct folio_batch;
|
||||
|
||||
unsigned long invalidate_mapping_pages(struct address_space *mapping,
|
||||
pgoff_t start, pgoff_t end);
|
||||
|
||||
/*
 * If @inode is a regular file, directory or symlink, call
 * invalidate_mapping_pages() on inode->i_mapping with start 0 and end -1.
 * Inodes of any other type are left untouched.
 * NOTE(review): end == -1 presumably converts to the largest pgoff_t, i.e.
 * "the whole mapping"; confirm against invalidate_mapping_pages().
 */
static inline void invalidate_remote_inode(struct inode *inode)
|
||||
{
|
||||
if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
|
||||
S_ISLNK(inode->i_mode))
|
||||
invalidate_mapping_pages(inode->i_mapping, 0, -1);
|
||||
}
|
||||
int invalidate_inode_pages2(struct address_space *mapping);
|
||||
int invalidate_inode_pages2_range(struct address_space *mapping,
|
||||
pgoff_t start, pgoff_t end);
|
||||
int write_inode_now(struct inode *, int sync);
|
||||
int filemap_fdatawrite(struct address_space *);
|
||||
int filemap_flush(struct address_space *);
|
||||
int filemap_fdatawait_keep_errors(struct address_space *mapping);
|
||||
int filemap_fdatawait_range(struct address_space *, loff_t lstart, loff_t lend);
|
||||
int filemap_fdatawait_range_keep_errors(struct address_space *mapping,
|
||||
loff_t start_byte, loff_t end_byte);
|
||||
|
||||
/*
 * Convenience wrapper: calls filemap_fdatawait_range() on @mapping for the
 * byte range 0..LLONG_MAX and returns its result.
 */
static inline int filemap_fdatawait(struct address_space *mapping)
|
||||
{
|
||||
return filemap_fdatawait_range(mapping, 0, LLONG_MAX);
|
||||
}
|
||||
|
||||
bool filemap_range_has_page(struct address_space *, loff_t lstart, loff_t lend);
|
||||
int filemap_write_and_wait_range(struct address_space *mapping,
|
||||
loff_t lstart, loff_t lend);
|
||||
int __filemap_fdatawrite_range(struct address_space *mapping,
|
||||
loff_t start, loff_t end, int sync_mode);
|
||||
int filemap_fdatawrite_range(struct address_space *mapping,
|
||||
loff_t start, loff_t end);
|
||||
int filemap_check_errors(struct address_space *mapping);
|
||||
void __filemap_set_wb_err(struct address_space *mapping, int err);
|
||||
int filemap_fdatawrite_wbc(struct address_space *mapping,
|
||||
struct writeback_control *wbc);
|
||||
|
||||
/*
 * Convenience wrapper: calls filemap_write_and_wait_range() on @mapping for
 * the byte range 0..LLONG_MAX and returns its result.
 */
static inline int filemap_write_and_wait(struct address_space *mapping)
|
||||
{
|
||||
return filemap_write_and_wait_range(mapping, 0, LLONG_MAX);
|
||||
}
|
||||
|
||||
/**
|
||||
* filemap_set_wb_err - set a writeback error on an address_space
|
||||
* @mapping: mapping in which to set writeback error
|
||||
* @err: error to be set in mapping
|
||||
*
|
||||
* When writeback fails in some way, we must record that error so that
|
||||
* userspace can be informed when fsync and the like are called. We endeavor
|
||||
* to report errors on any file that was open at the time of the error. Some
|
||||
* internal callers also need to know when writeback errors have occurred.
|
||||
*
|
||||
* When a writeback error occurs, most filesystems will want to call
|
||||
* filemap_set_wb_err to record the error in the mapping so that it will be
|
||||
* automatically reported whenever fsync is called on the file.
|
||||
*/
|
||||
static inline void filemap_set_wb_err(struct address_space *mapping, int err)
|
||||
{
|
||||
/* Fastpath for common case of no error */
|
||||
if (unlikely(err))
|
||||
__filemap_set_wb_err(mapping, err);
|
||||
}
|
||||
|
||||
/**
|
||||
* filemap_check_wb_err - has an error occurred since the mark was sampled?
|
||||
* @mapping: mapping to check for writeback errors
|
||||
* @since: previously-sampled errseq_t
|
||||
*
|
||||
* Grab the errseq_t value from the mapping, and see if it has changed "since"
|
||||
* the given value was sampled.
|
||||
*
|
||||
* If it has then report the latest error set, otherwise return 0.
|
||||
*/
|
||||
static inline int filemap_check_wb_err(struct address_space *mapping,
|
||||
errseq_t since)
|
||||
{
|
||||
return errseq_check(&mapping->wb_err, since);
|
||||
}
|
||||
|
||||
/**
|
||||
* filemap_sample_wb_err - sample the current errseq_t to test for later errors
|
||||
* @mapping: mapping to be sampled
|
||||
*
|
||||
* Writeback errors are always reported relative to a particular sample point
|
||||
* in the past. This function provides those sample points.
|
||||
*/
|
||||
static inline errseq_t filemap_sample_wb_err(struct address_space *mapping)
|
||||
{
|
||||
return errseq_sample(&mapping->wb_err);
|
||||
}
|
||||
|
||||
/**
|
||||
* file_sample_sb_err - sample the current errseq_t to test for later errors
|
||||
* @file: file pointer to be sampled
|
||||
*
|
||||
* Grab the most current superblock-level errseq_t value for the given
|
||||
* struct file.
|
||||
*/
|
||||
static inline errseq_t file_sample_sb_err(struct file *file)
|
||||
{
|
||||
return errseq_sample(&file->f_path.dentry->d_sb->s_wb_err);
|
||||
}
|
||||
|
||||
/*
|
||||
* Flush file data before changing attributes. Caller must hold any locks
|
||||
* required to prevent further writes to this file until we're done setting
|
||||
* flags.
|
||||
*/
|
||||
static inline int inode_drain_writes(struct inode *inode)
|
||||
{
|
||||
inode_dio_wait(inode);
|
||||
return filemap_write_and_wait(inode->i_mapping);
|
||||
}
|
||||
|
||||
static inline bool mapping_empty(struct address_space *mapping)
|
||||
{
|
||||
return xa_empty(&mapping->i_pages);
|
||||
@ -192,9 +306,14 @@ static inline void mapping_set_large_folios(struct address_space *mapping)
|
||||
__set_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags);
|
||||
}
|
||||
|
||||
/*
|
||||
* Large folio support currently depends on THP. These dependencies are
|
||||
* being worked on but are not yet fixed.
|
||||
*/
|
||||
static inline bool mapping_large_folio_support(struct address_space *mapping)
|
||||
{
|
||||
return test_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags);
|
||||
return IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
|
||||
test_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags);
|
||||
}
|
||||
|
||||
static inline int filemap_nr_thps(struct address_space *mapping)
|
||||
@ -212,7 +331,7 @@ static inline void filemap_nr_thps_inc(struct address_space *mapping)
|
||||
if (!mapping_large_folio_support(mapping))
|
||||
atomic_inc(&mapping->nr_thps);
|
||||
#else
|
||||
WARN_ON_ONCE(1);
|
||||
WARN_ON_ONCE(mapping_large_folio_support(mapping) == 0);
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -222,7 +341,7 @@ static inline void filemap_nr_thps_dec(struct address_space *mapping)
|
||||
if (!mapping_large_folio_support(mapping))
|
||||
atomic_dec(&mapping->nr_thps);
|
||||
#else
|
||||
WARN_ON_ONCE(1);
|
||||
WARN_ON_ONCE(mapping_large_folio_support(mapping) == 0);
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -283,16 +402,6 @@ static inline struct inode *folio_inode(struct folio *folio)
|
||||
return folio->mapping->host;
|
||||
}
|
||||
|
||||
static inline bool page_cache_add_speculative(struct page *page, int count)
|
||||
{
|
||||
return folio_ref_try_add_rcu((struct folio *)page, count);
|
||||
}
|
||||
|
||||
static inline bool page_cache_get_speculative(struct page *page)
|
||||
{
|
||||
return page_cache_add_speculative(page, 1);
|
||||
}
|
||||
|
||||
/**
|
||||
* folio_attach_private - Attach private data to a folio.
|
||||
* @folio: Folio to attach data to.
|
||||
@ -706,6 +815,17 @@ static inline loff_t folio_file_pos(struct folio *folio)
|
||||
return page_file_offset(&folio->page);
|
||||
}
|
||||
|
||||
/*
|
||||
* Get the offset in PAGE_SIZE (even for hugetlb folios).
|
||||
* (TODO: hugetlb folios should have ->index in PAGE_SIZE)
|
||||
*/
|
||||
static inline pgoff_t folio_pgoff(struct folio *folio)
|
||||
{
|
||||
if (unlikely(folio_test_hugetlb(folio)))
|
||||
return hugetlb_basepage_index(&folio->page);
|
||||
return folio->index;
|
||||
}
|
||||
|
||||
extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
|
||||
unsigned long address);
|
||||
|
||||
|
@ -11,6 +11,7 @@
|
||||
#include <linux/rwsem.h>
|
||||
#include <linux/memcontrol.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/pagemap.h>
|
||||
|
||||
/*
|
||||
* The anon_vma heads a list of private "related" vmas, to scan if
|
||||
@ -167,18 +168,19 @@ struct anon_vma *page_get_anon_vma(struct page *page);
|
||||
*/
|
||||
void page_move_anon_rmap(struct page *, struct vm_area_struct *);
|
||||
void page_add_anon_rmap(struct page *, struct vm_area_struct *,
|
||||
unsigned long, bool);
|
||||
unsigned long address, bool compound);
|
||||
void do_page_add_anon_rmap(struct page *, struct vm_area_struct *,
|
||||
unsigned long, int);
|
||||
unsigned long address, int flags);
|
||||
void page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
|
||||
unsigned long, bool);
|
||||
void page_add_file_rmap(struct page *, bool);
|
||||
void page_remove_rmap(struct page *, bool);
|
||||
|
||||
unsigned long address, bool compound);
|
||||
void page_add_file_rmap(struct page *, struct vm_area_struct *,
|
||||
bool compound);
|
||||
void page_remove_rmap(struct page *, struct vm_area_struct *,
|
||||
bool compound);
|
||||
void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
|
||||
unsigned long);
|
||||
unsigned long address);
|
||||
void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *,
|
||||
unsigned long);
|
||||
unsigned long address);
|
||||
|
||||
static inline void page_dup_rmap(struct page *page, bool compound)
|
||||
{
|
||||
@ -188,11 +190,11 @@ static inline void page_dup_rmap(struct page *page, bool compound)
|
||||
/*
|
||||
* Called from mm/vmscan.c to handle paging out
|
||||
*/
|
||||
int page_referenced(struct page *, int is_locked,
|
||||
int folio_referenced(struct folio *, int is_locked,
|
||||
struct mem_cgroup *memcg, unsigned long *vm_flags);
|
||||
|
||||
void try_to_migrate(struct page *page, enum ttu_flags flags);
|
||||
void try_to_unmap(struct page *, enum ttu_flags flags);
|
||||
void try_to_migrate(struct folio *folio, enum ttu_flags flags);
|
||||
void try_to_unmap(struct folio *, enum ttu_flags flags);
|
||||
|
||||
int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
|
||||
unsigned long end, struct page **pages,
|
||||
@ -200,11 +202,13 @@ int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
|
||||
|
||||
/* Avoid racy checks */
|
||||
#define PVMW_SYNC (1 << 0)
|
||||
/* Look for migarion entries rather than present PTEs */
|
||||
/* Look for migration entries rather than present PTEs */
|
||||
#define PVMW_MIGRATION (1 << 1)
|
||||
|
||||
struct page_vma_mapped_walk {
|
||||
struct page *page;
|
||||
unsigned long pfn;
|
||||
unsigned long nr_pages;
|
||||
pgoff_t pgoff;
|
||||
struct vm_area_struct *vma;
|
||||
unsigned long address;
|
||||
pmd_t *pmd;
|
||||
@ -213,10 +217,30 @@ struct page_vma_mapped_walk {
|
||||
unsigned int flags;
|
||||
};
|
||||
|
||||
/*
 * Declare a struct page_vma_mapped_walk called @name and initialise it
 * from a struct page:
 *   .pfn      <- page_to_pfn(@_page)
 *   .nr_pages <- compound_nr(@_page)
 *   .pgoff    <- page_to_pgoff(@_page)
 *   .vma, .address, .flags <- @_vma, @_address, @_flags
 *
 * Every field derived from the page must go through the @_page macro
 * parameter; referring to a bare "page" only compiles when the caller
 * happens to have a local of that name, and then may describe a different
 * page than the one passed in.  @_page is expanded three times, so it
 * must not have side effects.
 */
#define DEFINE_PAGE_VMA_WALK(name, _page, _vma, _address, _flags)	\
	struct page_vma_mapped_walk name = {				\
		.pfn = page_to_pfn(_page),				\
		.nr_pages = compound_nr(_page),				\
		.pgoff = page_to_pgoff(_page),				\
		.vma = _vma,						\
		.address = _address,					\
		.flags = _flags,					\
	}
|
||||
|
||||
/*
 * Folio flavour of the walk initialiser: declares a
 * struct page_vma_mapped_walk called @name whose pfn, nr_pages and pgoff
 * come from folio_pfn(), folio_nr_pages() and folio_pgoff() of @_folio,
 * and whose vma/address/flags are taken from the remaining arguments.
 * @_folio is expanded three times, so pass a side-effect-free expression.
 */
#define DEFINE_FOLIO_VMA_WALK(name, _folio, _vma, _address, _flags)	\
	struct page_vma_mapped_walk name = {				\
		.pfn		= folio_pfn(_folio),			\
		.nr_pages	= folio_nr_pages(_folio),		\
		.pgoff		= folio_pgoff(_folio),			\
		.vma		= (_vma),				\
		.address	= (_address),				\
		.flags		= (_flags),				\
	}
|
||||
|
||||
static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
|
||||
{
|
||||
/* HugeTLB pte is set to the relevant page table entry without pte_mapped. */
|
||||
if (pvmw->pte && !PageHuge(pvmw->page))
|
||||
if (pvmw->pte && !is_vm_hugetlb_page(pvmw->vma))
|
||||
pte_unmap(pvmw->pte);
|
||||
if (pvmw->ptl)
|
||||
spin_unlock(pvmw->ptl);
|
||||
@ -237,18 +261,12 @@ unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);
|
||||
*/
|
||||
int folio_mkclean(struct folio *);
|
||||
|
||||
/*
|
||||
* called in munlock()/munmap() path to check for other vmas holding
|
||||
* the page mlocked.
|
||||
*/
|
||||
void page_mlock(struct page *page);
|
||||
|
||||
void remove_migration_ptes(struct page *old, struct page *new, bool locked);
|
||||
void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked);
|
||||
|
||||
/*
|
||||
* Called by memory-failure.c to kill processes.
|
||||
*/
|
||||
struct anon_vma *page_lock_anon_vma_read(struct page *page);
|
||||
struct anon_vma *folio_lock_anon_vma_read(struct folio *folio);
|
||||
void page_unlock_anon_vma_read(struct anon_vma *anon_vma);
|
||||
int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
|
||||
|
||||
@ -267,15 +285,15 @@ struct rmap_walk_control {
|
||||
* Return false if page table scanning in rmap_walk should be stopped.
|
||||
* Otherwise, return true.
|
||||
*/
|
||||
bool (*rmap_one)(struct page *page, struct vm_area_struct *vma,
|
||||
bool (*rmap_one)(struct folio *folio, struct vm_area_struct *vma,
|
||||
unsigned long addr, void *arg);
|
||||
int (*done)(struct page *page);
|
||||
struct anon_vma *(*anon_lock)(struct page *page);
|
||||
int (*done)(struct folio *folio);
|
||||
struct anon_vma *(*anon_lock)(struct folio *folio);
|
||||
bool (*invalid_vma)(struct vm_area_struct *vma, void *arg);
|
||||
};
|
||||
|
||||
void rmap_walk(struct page *page, struct rmap_walk_control *rwc);
|
||||
void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc);
|
||||
void rmap_walk(struct folio *folio, const struct rmap_walk_control *rwc);
|
||||
void rmap_walk_locked(struct folio *folio, const struct rmap_walk_control *rwc);
|
||||
|
||||
#else /* !CONFIG_MMU */
|
||||
|
||||
@ -283,7 +301,7 @@ void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc);
|
||||
#define anon_vma_prepare(vma) (0)
|
||||
#define anon_vma_link(vma) do {} while (0)
|
||||
|
||||
static inline int page_referenced(struct page *page, int is_locked,
|
||||
static inline int folio_referenced(struct folio *folio, int is_locked,
|
||||
struct mem_cgroup *memcg,
|
||||
unsigned long *vm_flags)
|
||||
{
|
||||
@ -291,7 +309,7 @@ static inline int page_referenced(struct page *page, int is_locked,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * !CONFIG_MMU stub: intentionally empty.  It takes a folio so that its
 * signature matches the CONFIG_MMU prototype of try_to_unmap().
 */
static inline void try_to_unmap(struct folio *folio, enum ttu_flags flags)
{
}
|
||||
|
||||
|
@ -328,7 +328,7 @@ static inline swp_entry_t folio_swap_entry(struct folio *folio)
|
||||
|
||||
/* linux/mm/workingset.c */
|
||||
void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages);
|
||||
void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg);
|
||||
void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg);
|
||||
void workingset_refault(struct folio *folio, void *shadow);
|
||||
void workingset_activation(struct folio *folio);
|
||||
|
||||
@ -375,7 +375,6 @@ extern void lru_add_drain(void);
|
||||
extern void lru_add_drain_cpu(int cpu);
|
||||
extern void lru_add_drain_cpu_zone(struct zone *zone);
|
||||
extern void lru_add_drain_all(void);
|
||||
extern void deactivate_file_page(struct page *page);
|
||||
extern void deactivate_page(struct page *page);
|
||||
extern void mark_page_lazyfree(struct page *page);
|
||||
extern void swap_setup(void);
|
||||
@ -397,7 +396,7 @@ extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem,
|
||||
unsigned long *nr_scanned);
|
||||
extern unsigned long shrink_all_memory(unsigned long nr_pages);
|
||||
extern int vm_swappiness;
|
||||
extern int remove_mapping(struct address_space *mapping, struct page *page);
|
||||
long remove_mapping(struct address_space *mapping, struct folio *folio);
|
||||
|
||||
extern unsigned long reclaim_pages(struct list_head *page_list);
|
||||
#ifdef CONFIG_NUMA
|
||||
@ -743,7 +742,7 @@ static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_MEMCG_SWAP
|
||||
extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry);
|
||||
void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry);
|
||||
extern int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry);
|
||||
static inline int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
|
||||
{
|
||||
@ -763,7 +762,7 @@ static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_p
|
||||
extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg);
|
||||
extern bool mem_cgroup_swap_full(struct page *page);
|
||||
#else
|
||||
/*
 * !CONFIG_MEMCG_SWAP stub: intentionally empty.  It takes a folio so that
 * its signature matches the CONFIG_MEMCG_SWAP prototype of
 * mem_cgroup_swapout().
 */
static inline void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
{
}
|
||||
|
||||
|
@ -327,11 +327,11 @@ TRACE_EVENT(mm_vmscan_lru_isolate,
|
||||
__print_symbolic(__entry->lru, LRU_NAMES))
|
||||
);
|
||||
|
||||
TRACE_EVENT(mm_vmscan_writepage,
|
||||
TRACE_EVENT(mm_vmscan_write_folio,
|
||||
|
||||
TP_PROTO(struct page *page),
|
||||
TP_PROTO(struct folio *folio),
|
||||
|
||||
TP_ARGS(page),
|
||||
TP_ARGS(folio),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field(unsigned long, pfn)
|
||||
@ -339,9 +339,9 @@ TRACE_EVENT(mm_vmscan_writepage,
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->pfn = page_to_pfn(page);
|
||||
__entry->pfn = folio_pfn(folio);
|
||||
__entry->reclaim_flags = trace_reclaim_flags(
|
||||
page_is_file_lru(page));
|
||||
folio_is_file_lru(folio));
|
||||
),
|
||||
|
||||
TP_printk("page=%p pfn=0x%lx flags=%s",
|
||||
|
@ -155,11 +155,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
|
||||
struct page *old_page, struct page *new_page)
|
||||
{
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
struct page_vma_mapped_walk pvmw = {
|
||||
.page = compound_head(old_page),
|
||||
.vma = vma,
|
||||
.address = addr,
|
||||
};
|
||||
DEFINE_FOLIO_VMA_WALK(pvmw, page_folio(old_page), vma, addr, 0);
|
||||
int err;
|
||||
struct mmu_notifier_range range;
|
||||
|
||||
@ -173,7 +169,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
|
||||
return err;
|
||||
}
|
||||
|
||||
/* For try_to_free_swap() and munlock_vma_page() below */
|
||||
/* For try_to_free_swap() below */
|
||||
lock_page(old_page);
|
||||
|
||||
mmu_notifier_invalidate_range_start(&range);
|
||||
@ -201,13 +197,10 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
|
||||
set_pte_at_notify(mm, addr, pvmw.pte,
|
||||
mk_pte(new_page, vma->vm_page_prot));
|
||||
|
||||
page_remove_rmap(old_page, false);
|
||||
page_remove_rmap(old_page, vma, false);
|
||||
if (!page_mapped(old_page))
|
||||
try_to_free_swap(old_page);
|
||||
page_vma_mapped_walk_done(&pvmw);
|
||||
|
||||
if ((vma->vm_flags & VM_LOCKED) && !PageCompound(old_page))
|
||||
munlock_vma_page(old_page);
|
||||
put_page(old_page);
|
||||
|
||||
err = 0;
|
||||
|
@ -302,7 +302,7 @@ int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
|
||||
* found it, but truncated or holepunched or subjected to
|
||||
* invalidate_complete_page2 before we got the page lock (also
|
||||
* cases which we are happy to fail). And we hold a reference,
|
||||
* so refcount care in invalidate_complete_page's remove_mapping
|
||||
* so refcount care in invalidate_inode_page's remove_mapping
|
||||
* prevents drop_caches from setting mapping to NULL beneath us.
|
||||
*
|
||||
* The case we do have to guard against is when memory pressure made
|
||||
|
@ -12,6 +12,7 @@
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/cdev.h>
|
||||
#include <linux/device.h>
|
||||
#include <linux/memremap.h>
|
||||
#include <linux/mutex.h>
|
||||
#include <linux/rwsem.h>
|
||||
#include <linux/sched.h>
|
||||
@ -26,6 +27,8 @@
|
||||
#include <linux/sched/mm.h>
|
||||
#include <linux/platform_device.h>
|
||||
#include <linux/rmap.h>
|
||||
#include <linux/mmu_notifier.h>
|
||||
#include <linux/migrate.h>
|
||||
|
||||
#include "test_hmm_uapi.h"
|
||||
|
||||
@ -563,7 +566,6 @@ static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
|
||||
}
|
||||
|
||||
dpage->zone_device_data = rpage;
|
||||
get_page(dpage);
|
||||
lock_page(dpage);
|
||||
return dpage;
|
||||
|
||||
|
@ -249,6 +249,9 @@ config MIGRATION
|
||||
pages as migration can relocate pages to satisfy a huge page
|
||||
allocation instead of reclaiming.
|
||||
|
||||
config DEVICE_MIGRATION
|
||||
def_bool MIGRATION && ZONE_DEVICE
|
||||
|
||||
config ARCH_ENABLE_HUGEPAGE_MIGRATION
|
||||
bool
|
||||
|
||||
@ -791,9 +794,6 @@ config ZONE_DEVICE
|
||||
|
||||
If FS_DAX is enabled, then say Y.
|
||||
|
||||
config DEV_PAGEMAP_OPS
|
||||
bool
|
||||
|
||||
#
|
||||
# Helpers to mirror range of the CPU page tables of a process into device page
|
||||
# tables.
|
||||
@ -805,7 +805,6 @@ config HMM_MIRROR
|
||||
config DEVICE_PRIVATE
|
||||
bool "Unaddressable device memory (GPU memory, ...)"
|
||||
depends on ZONE_DEVICE
|
||||
select DEV_PAGEMAP_OPS
|
||||
|
||||
help
|
||||
Allows creation of struct pages to represent unaddressable device
|
||||
|
@ -92,6 +92,7 @@ obj-$(CONFIG_KFENCE) += kfence/
|
||||
obj-$(CONFIG_FAILSLAB) += failslab.o
|
||||
obj-$(CONFIG_MEMTEST) += memtest.o
|
||||
obj-$(CONFIG_MIGRATION) += migrate.o
|
||||
obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o
|
||||
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
|
||||
obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
|
||||
obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
|
||||
|
@ -16,14 +16,10 @@
|
||||
#include "../internal.h"
|
||||
#include "ops-common.h"
|
||||
|
||||
static bool __damon_pa_mkold(struct page *page, struct vm_area_struct *vma,
|
||||
static bool __damon_pa_mkold(struct folio *folio, struct vm_area_struct *vma,
|
||||
unsigned long addr, void *arg)
|
||||
{
|
||||
struct page_vma_mapped_walk pvmw = {
|
||||
.page = page,
|
||||
.vma = vma,
|
||||
.address = addr,
|
||||
};
|
||||
DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0);
|
||||
|
||||
while (page_vma_mapped_walk(&pvmw)) {
|
||||
addr = pvmw.address;
|
||||
@ -37,32 +33,34 @@ static bool __damon_pa_mkold(struct page *page, struct vm_area_struct *vma,
|
||||
|
||||
static void damon_pa_mkold(unsigned long paddr)
|
||||
{
|
||||
struct folio *folio;
|
||||
struct page *page = damon_get_page(PHYS_PFN(paddr));
|
||||
struct rmap_walk_control rwc = {
|
||||
.rmap_one = __damon_pa_mkold,
|
||||
.anon_lock = page_lock_anon_vma_read,
|
||||
.anon_lock = folio_lock_anon_vma_read,
|
||||
};
|
||||
bool need_lock;
|
||||
|
||||
if (!page)
|
||||
return;
|
||||
folio = page_folio(page);
|
||||
|
||||
if (!page_mapped(page) || !page_rmapping(page)) {
|
||||
set_page_idle(page);
|
||||
if (!folio_mapped(folio) || !folio_raw_mapping(folio)) {
|
||||
folio_set_idle(folio);
|
||||
goto out;
|
||||
}
|
||||
|
||||
need_lock = !PageAnon(page) || PageKsm(page);
|
||||
if (need_lock && !trylock_page(page))
|
||||
need_lock = !folio_test_anon(folio) || folio_test_ksm(folio);
|
||||
if (need_lock && !folio_trylock(folio))
|
||||
goto out;
|
||||
|
||||
rmap_walk(page, &rwc);
|
||||
rmap_walk(folio, &rwc);
|
||||
|
||||
if (need_lock)
|
||||
unlock_page(page);
|
||||
folio_unlock(folio);
|
||||
|
||||
out:
|
||||
put_page(page);
|
||||
folio_put(folio);
|
||||
}
|
||||
|
||||
static void __damon_pa_prepare_access_check(struct damon_ctx *ctx,
|
||||
@ -89,15 +87,11 @@ struct damon_pa_access_chk_result {
|
||||
bool accessed;
|
||||
};
|
||||
|
||||
static bool __damon_pa_young(struct page *page, struct vm_area_struct *vma,
|
||||
static bool __damon_pa_young(struct folio *folio, struct vm_area_struct *vma,
|
||||
unsigned long addr, void *arg)
|
||||
{
|
||||
struct damon_pa_access_chk_result *result = arg;
|
||||
struct page_vma_mapped_walk pvmw = {
|
||||
.page = page,
|
||||
.vma = vma,
|
||||
.address = addr,
|
||||
};
|
||||
DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0);
|
||||
|
||||
result->accessed = false;
|
||||
result->page_sz = PAGE_SIZE;
|
||||
@ -105,12 +99,12 @@ static bool __damon_pa_young(struct page *page, struct vm_area_struct *vma,
|
||||
addr = pvmw.address;
|
||||
if (pvmw.pte) {
|
||||
result->accessed = pte_young(*pvmw.pte) ||
|
||||
!page_is_idle(page) ||
|
||||
!folio_test_idle(folio) ||
|
||||
mmu_notifier_test_young(vma->vm_mm, addr);
|
||||
} else {
|
||||
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
||||
result->accessed = pmd_young(*pvmw.pmd) ||
|
||||
!page_is_idle(page) ||
|
||||
!folio_test_idle(folio) ||
|
||||
mmu_notifier_test_young(vma->vm_mm, addr);
|
||||
result->page_sz = ((1UL) << HPAGE_PMD_SHIFT);
|
||||
#else
|
||||
@ -129,6 +123,7 @@ static bool __damon_pa_young(struct page *page, struct vm_area_struct *vma,
|
||||
|
||||
static bool damon_pa_young(unsigned long paddr, unsigned long *page_sz)
|
||||
{
|
||||
struct folio *folio;
|
||||
struct page *page = damon_get_page(PHYS_PFN(paddr));
|
||||
struct damon_pa_access_chk_result result = {
|
||||
.page_sz = PAGE_SIZE,
|
||||
@ -137,33 +132,34 @@ static bool damon_pa_young(unsigned long paddr, unsigned long *page_sz)
|
||||
struct rmap_walk_control rwc = {
|
||||
.arg = &result,
|
||||
.rmap_one = __damon_pa_young,
|
||||
.anon_lock = page_lock_anon_vma_read,
|
||||
.anon_lock = folio_lock_anon_vma_read,
|
||||
};
|
||||
bool need_lock;
|
||||
|
||||
if (!page)
|
||||
return false;
|
||||
folio = page_folio(page);
|
||||
|
||||
if (!page_mapped(page) || !page_rmapping(page)) {
|
||||
if (page_is_idle(page))
|
||||
if (!folio_mapped(folio) || !folio_raw_mapping(folio)) {
|
||||
if (folio_test_idle(folio))
|
||||
result.accessed = false;
|
||||
else
|
||||
result.accessed = true;
|
||||
put_page(page);
|
||||
folio_put(folio);
|
||||
goto out;
|
||||
}
|
||||
|
||||
need_lock = !PageAnon(page) || PageKsm(page);
|
||||
if (need_lock && !trylock_page(page)) {
|
||||
put_page(page);
|
||||
return NULL;
|
||||
need_lock = !folio_test_anon(folio) || folio_test_ksm(folio);
|
||||
if (need_lock && !folio_trylock(folio)) {
|
||||
folio_put(folio);
|
||||
return false;
|
||||
}
|
||||
|
||||
rmap_walk(page, &rwc);
|
||||
rmap_walk(folio, &rwc);
|
||||
|
||||
if (need_lock)
|
||||
unlock_page(page);
|
||||
put_page(page);
|
||||
folio_unlock(folio);
|
||||
folio_put(folio);
|
||||
|
||||
out:
|
||||
*page_sz = result.page_sz;
|
||||
|
18
mm/debug.c
18
mm/debug.c
@ -48,7 +48,8 @@ const struct trace_print_flags vmaflag_names[] = {
|
||||
|
||||
static void __dump_page(struct page *page)
|
||||
{
|
||||
struct page *head = compound_head(page);
|
||||
struct folio *folio = page_folio(page);
|
||||
struct page *head = &folio->page;
|
||||
struct address_space *mapping;
|
||||
bool compound = PageCompound(page);
|
||||
/*
|
||||
@ -76,6 +77,7 @@ static void __dump_page(struct page *page)
|
||||
else
|
||||
mapping = (void *)(tmp & ~PAGE_MAPPING_FLAGS);
|
||||
head = page;
|
||||
folio = (struct folio *)page;
|
||||
compound = false;
|
||||
} else {
|
||||
mapping = page_mapping(page);
|
||||
@ -92,16 +94,10 @@ static void __dump_page(struct page *page)
|
||||
page, page_ref_count(head), mapcount, mapping,
|
||||
page_to_pgoff(page), page_to_pfn(page));
|
||||
if (compound) {
|
||||
if (hpage_pincount_available(page)) {
|
||||
pr_warn("head:%p order:%u compound_mapcount:%d compound_pincount:%d\n",
|
||||
head, compound_order(head),
|
||||
head_compound_mapcount(head),
|
||||
head_compound_pincount(head));
|
||||
} else {
|
||||
pr_warn("head:%p order:%u compound_mapcount:%d\n",
|
||||
head, compound_order(head),
|
||||
head_compound_mapcount(head));
|
||||
}
|
||||
pr_warn("head:%p order:%u compound_mapcount:%d compound_pincount:%d\n",
|
||||
head, compound_order(head),
|
||||
folio_entire_mapcount(folio),
|
||||
head_compound_pincount(head));
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMCG
|
||||
|
59
mm/filemap.c
59
mm/filemap.c
@ -842,26 +842,27 @@ noinline int __filemap_add_folio(struct address_space *mapping,
|
||||
{
|
||||
XA_STATE(xas, &mapping->i_pages, index);
|
||||
int huge = folio_test_hugetlb(folio);
|
||||
int error;
|
||||
bool charged = false;
|
||||
long nr = 1;
|
||||
|
||||
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
|
||||
VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio);
|
||||
mapping_set_update(&xas, mapping);
|
||||
|
||||
folio_get(folio);
|
||||
folio->mapping = mapping;
|
||||
folio->index = index;
|
||||
|
||||
if (!huge) {
|
||||
error = mem_cgroup_charge(folio, NULL, gfp);
|
||||
int error = mem_cgroup_charge(folio, NULL, gfp);
|
||||
VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
|
||||
if (error)
|
||||
goto error;
|
||||
return error;
|
||||
charged = true;
|
||||
xas_set_order(&xas, index, folio_order(folio));
|
||||
nr = folio_nr_pages(folio);
|
||||
}
|
||||
|
||||
gfp &= GFP_RECLAIM_MASK;
|
||||
folio_ref_add(folio, nr);
|
||||
folio->mapping = mapping;
|
||||
folio->index = xas.xa_index;
|
||||
|
||||
do {
|
||||
unsigned int order = xa_get_order(xas.xa, xas.xa_index);
|
||||
@ -885,6 +886,8 @@ noinline int __filemap_add_folio(struct address_space *mapping,
|
||||
/* entry may have been split before we acquired lock */
|
||||
order = xa_get_order(xas.xa, xas.xa_index);
|
||||
if (order > folio_order(folio)) {
|
||||
/* How to handle large swap entries? */
|
||||
BUG_ON(shmem_mapping(mapping));
|
||||
xas_split(&xas, old, order);
|
||||
xas_reset(&xas);
|
||||
}
|
||||
@ -894,29 +897,31 @@ noinline int __filemap_add_folio(struct address_space *mapping,
|
||||
if (xas_error(&xas))
|
||||
goto unlock;
|
||||
|
||||
mapping->nrpages++;
|
||||
mapping->nrpages += nr;
|
||||
|
||||
/* hugetlb pages do not participate in page cache accounting */
|
||||
if (!huge)
|
||||
__lruvec_stat_add_folio(folio, NR_FILE_PAGES);
|
||||
if (!huge) {
|
||||
__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
|
||||
if (folio_test_pmd_mappable(folio))
|
||||
__lruvec_stat_mod_folio(folio,
|
||||
NR_FILE_THPS, nr);
|
||||
}
|
||||
unlock:
|
||||
xas_unlock_irq(&xas);
|
||||
} while (xas_nomem(&xas, gfp));
|
||||
|
||||
if (xas_error(&xas)) {
|
||||
error = xas_error(&xas);
|
||||
if (charged)
|
||||
mem_cgroup_uncharge(folio);
|
||||
if (xas_error(&xas))
|
||||
goto error;
|
||||
}
|
||||
|
||||
trace_mm_filemap_add_to_page_cache(folio);
|
||||
return 0;
|
||||
error:
|
||||
if (charged)
|
||||
mem_cgroup_uncharge(folio);
|
||||
folio->mapping = NULL;
|
||||
/* Leave page->index set: truncation relies upon it */
|
||||
folio_put(folio);
|
||||
return error;
|
||||
folio_put_refs(folio, nr);
|
||||
return xas_error(&xas);
|
||||
}
|
||||
ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO);
|
||||
|
||||
@ -2997,6 +3002,24 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
|
||||
struct file *fpin = NULL;
|
||||
unsigned int mmap_miss;
|
||||
|
||||
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
||||
/* Use the readahead code, even if readahead is disabled */
|
||||
if (vmf->vma->vm_flags & VM_HUGEPAGE) {
|
||||
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
|
||||
ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1);
|
||||
ra->size = HPAGE_PMD_NR;
|
||||
/*
|
||||
* Fetch two PMD folios, so we get the chance to actually
|
||||
* readahead, unless we've been told not to.
|
||||
*/
|
||||
if (!(vmf->vma->vm_flags & VM_RAND_READ))
|
||||
ra->size *= 2;
|
||||
ra->async_size = HPAGE_PMD_NR;
|
||||
page_cache_ra_order(&ractl, ra, HPAGE_PMD_ORDER);
|
||||
return fpin;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* If we don't want any read-ahead, don't bother */
|
||||
if (vmf->vma->vm_flags & VM_RAND_READ)
|
||||
return fpin;
|
||||
@ -3029,7 +3052,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
|
||||
ra->size = ra->ra_pages;
|
||||
ra->async_size = ra->ra_pages / 4;
|
||||
ractl._index = ra->start;
|
||||
do_page_cache_ra(&ractl, ra->size, ra->async_size);
|
||||
page_cache_ra_order(&ractl, ra, 0);
|
||||
return fpin;
|
||||
}
|
||||
|
||||
|
@ -7,6 +7,7 @@
|
||||
#include <linux/migrate.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/swap.h>
|
||||
#include "internal.h"
|
||||
|
||||
struct address_space *page_mapping(struct page *page)
|
||||
{
|
||||
@ -151,3 +152,15 @@ int try_to_release_page(struct page *page, gfp_t gfp)
|
||||
return filemap_release_folio(page_folio(page), gfp);
|
||||
}
|
||||
EXPORT_SYMBOL(try_to_release_page);
|
||||
|
||||
int isolate_lru_page(struct page *page)
|
||||
{
|
||||
if (WARN_RATELIMIT(PageTail(page), "trying to isolate tail page"))
|
||||
return -EBUSY;
|
||||
return folio_isolate_lru((struct folio *)page);
|
||||
}
|
||||
|
||||
/*
 * Page-based compatibility wrapper: look up the folio that contains
 * @page and pass it to folio_putback_lru().
 */
void putback_lru_page(struct page *page)
{
	struct folio *folio = page_folio(page);

	folio_putback_lru(folio);
}
|
||||
|
492
mm/gup.c
492
mm/gup.c
@ -29,107 +29,71 @@ struct follow_page_context {
|
||||
unsigned int page_mask;
|
||||
};
|
||||
|
||||
static void hpage_pincount_add(struct page *page, int refs)
|
||||
{
|
||||
VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);
|
||||
VM_BUG_ON_PAGE(page != compound_head(page), page);
|
||||
|
||||
atomic_add(refs, compound_pincount_ptr(page));
|
||||
}
|
||||
|
||||
static void hpage_pincount_sub(struct page *page, int refs)
|
||||
{
|
||||
VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);
|
||||
VM_BUG_ON_PAGE(page != compound_head(page), page);
|
||||
|
||||
atomic_sub(refs, compound_pincount_ptr(page));
|
||||
}
|
||||
|
||||
/* Equivalent to calling put_page() @refs times. */
|
||||
static void put_page_refs(struct page *page, int refs)
|
||||
{
|
||||
#ifdef CONFIG_DEBUG_VM
|
||||
if (VM_WARN_ON_ONCE_PAGE(page_ref_count(page) < refs, page))
|
||||
return;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Calling put_page() for each ref is unnecessarily slow. Only the last
|
||||
* ref needs a put_page().
|
||||
*/
|
||||
if (refs > 1)
|
||||
page_ref_sub(page, refs - 1);
|
||||
put_page(page);
|
||||
}
|
||||
|
||||
/*
|
||||
* Return the compound head page with ref appropriately incremented,
|
||||
* Return the folio with ref appropriately incremented,
|
||||
* or NULL if that failed.
|
||||
*/
|
||||
static inline struct page *try_get_compound_head(struct page *page, int refs)
|
||||
static inline struct folio *try_get_folio(struct page *page, int refs)
|
||||
{
|
||||
struct page *head = compound_head(page);
|
||||
struct folio *folio;
|
||||
|
||||
if (WARN_ON_ONCE(page_ref_count(head) < 0))
|
||||
retry:
|
||||
folio = page_folio(page);
|
||||
if (WARN_ON_ONCE(folio_ref_count(folio) < 0))
|
||||
return NULL;
|
||||
if (unlikely(!page_cache_add_speculative(head, refs)))
|
||||
if (unlikely(!folio_ref_try_add_rcu(folio, refs)))
|
||||
return NULL;
|
||||
|
||||
/*
|
||||
* At this point we have a stable reference to the head page; but it
|
||||
* could be that between the compound_head() lookup and the refcount
|
||||
* increment, the compound page was split, in which case we'd end up
|
||||
* holding a reference on a page that has nothing to do with the page
|
||||
* At this point we have a stable reference to the folio; but it
|
||||
* could be that between calling page_folio() and the refcount
|
||||
* increment, the folio was split, in which case we'd end up
|
||||
* holding a reference on a folio that has nothing to do with the page
|
||||
* we were given anymore.
|
||||
* So now that the head page is stable, recheck that the pages still
|
||||
* belong together.
|
||||
* So now that the folio is stable, recheck that the page still
|
||||
* belongs to this folio.
|
||||
*/
|
||||
if (unlikely(compound_head(page) != head)) {
|
||||
put_page_refs(head, refs);
|
||||
return NULL;
|
||||
if (unlikely(page_folio(page) != folio)) {
|
||||
folio_put_refs(folio, refs);
|
||||
goto retry;
|
||||
}
|
||||
|
||||
return head;
|
||||
return folio;
|
||||
}
|
||||
|
||||
/**
|
||||
* try_grab_compound_head() - attempt to elevate a page's refcount, by a
|
||||
* flags-dependent amount.
|
||||
*
|
||||
* Even though the name includes "compound_head", this function is still
|
||||
* appropriate for callers that have a non-compound @page to get.
|
||||
*
|
||||
* try_grab_folio() - Attempt to get or pin a folio.
|
||||
* @page: pointer to page to be grabbed
|
||||
* @refs: the value to (effectively) add to the page's refcount
|
||||
* @refs: the value to (effectively) add to the folio's refcount
|
||||
* @flags: gup flags: these are the FOLL_* flag values.
|
||||
*
|
||||
* "grab" names in this file mean, "look at flags to decide whether to use
|
||||
* FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount.
|
||||
* FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount.
|
||||
*
|
||||
* Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the
|
||||
* same time. (That's true throughout the get_user_pages*() and
|
||||
* pin_user_pages*() APIs.) Cases:
|
||||
*
|
||||
* FOLL_GET: page's refcount will be incremented by @refs.
|
||||
* FOLL_GET: folio's refcount will be incremented by @refs.
|
||||
*
|
||||
* FOLL_PIN on compound pages that are > two pages long: page's refcount will
|
||||
* be incremented by @refs, and page[2].hpage_pinned_refcount will be
|
||||
* incremented by @refs * GUP_PIN_COUNTING_BIAS.
|
||||
* FOLL_PIN on large folios: folio's refcount will be incremented by
|
||||
* @refs, and its compound_pincount will be incremented by @refs.
|
||||
*
|
||||
* FOLL_PIN on normal pages, or compound pages that are two pages long:
|
||||
* page's refcount will be incremented by @refs * GUP_PIN_COUNTING_BIAS.
|
||||
* FOLL_PIN on single-page folios: folio's refcount will be incremented by
|
||||
* @refs * GUP_PIN_COUNTING_BIAS.
|
||||
*
|
||||
* Return: head page (with refcount appropriately incremented) for success, or
|
||||
* NULL upon failure. If neither FOLL_GET nor FOLL_PIN was set, that's
|
||||
* considered failure, and furthermore, a likely bug in the caller, so a warning
|
||||
* is also emitted.
|
||||
* Return: The folio containing @page (with refcount appropriately
|
||||
* incremented) for success, or NULL upon failure. If neither FOLL_GET
|
||||
* nor FOLL_PIN was set, that's considered failure, and furthermore,
|
||||
* a likely bug in the caller, so a warning is also emitted.
|
||||
*/
|
||||
__maybe_unused struct page *try_grab_compound_head(struct page *page,
|
||||
int refs, unsigned int flags)
|
||||
struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags)
|
||||
{
|
||||
if (flags & FOLL_GET)
|
||||
return try_get_compound_head(page, refs);
|
||||
return try_get_folio(page, refs);
|
||||
else if (flags & FOLL_PIN) {
|
||||
struct folio *folio;
|
||||
|
||||
/*
|
||||
* Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a
|
||||
* right zone, so fail and let the caller fall back to the slow
|
||||
@ -143,63 +107,57 @@ __maybe_unused struct page *try_grab_compound_head(struct page *page,
|
||||
* CAUTION: Don't use compound_head() on the page before this
|
||||
* point, the result won't be stable.
|
||||
*/
|
||||
page = try_get_compound_head(page, refs);
|
||||
if (!page)
|
||||
folio = try_get_folio(page, refs);
|
||||
if (!folio)
|
||||
return NULL;
|
||||
|
||||
/*
|
||||
* When pinning a compound page of order > 1 (which is what
|
||||
* hpage_pincount_available() checks for), use an exact count to
|
||||
* track it, via hpage_pincount_add/_sub().
|
||||
* When pinning a large folio, use an exact count to track it.
|
||||
*
|
||||
* However, be sure to *also* increment the normal page refcount
|
||||
* field at least once, so that the page really is pinned.
|
||||
* That's why the refcount from the earlier
|
||||
* try_get_compound_head() is left intact.
|
||||
* However, be sure to *also* increment the normal folio
|
||||
* refcount field at least once, so that the folio really
|
||||
* is pinned. That's why the refcount from the earlier
|
||||
* try_get_folio() is left intact.
|
||||
*/
|
||||
if (hpage_pincount_available(page))
|
||||
hpage_pincount_add(page, refs);
|
||||
if (folio_test_large(folio))
|
||||
atomic_add(refs, folio_pincount_ptr(folio));
|
||||
else
|
||||
page_ref_add(page, refs * (GUP_PIN_COUNTING_BIAS - 1));
|
||||
folio_ref_add(folio,
|
||||
refs * (GUP_PIN_COUNTING_BIAS - 1));
|
||||
node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);
|
||||
|
||||
mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED,
|
||||
refs);
|
||||
|
||||
return page;
|
||||
return folio;
|
||||
}
|
||||
|
||||
WARN_ON_ONCE(1);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void put_compound_head(struct page *page, int refs, unsigned int flags)
|
||||
static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
|
||||
{
|
||||
if (flags & FOLL_PIN) {
|
||||
mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_RELEASED,
|
||||
refs);
|
||||
|
||||
if (hpage_pincount_available(page))
|
||||
hpage_pincount_sub(page, refs);
|
||||
node_stat_mod_folio(folio, NR_FOLL_PIN_RELEASED, refs);
|
||||
if (folio_test_large(folio))
|
||||
atomic_sub(refs, folio_pincount_ptr(folio));
|
||||
else
|
||||
refs *= GUP_PIN_COUNTING_BIAS;
|
||||
}
|
||||
|
||||
put_page_refs(page, refs);
|
||||
folio_put_refs(folio, refs);
|
||||
}
|
||||
|
||||
/**
|
||||
* try_grab_page() - elevate a page's refcount by a flag-dependent amount
|
||||
* @page: pointer to page to be grabbed
|
||||
* @flags: gup flags: these are the FOLL_* flag values.
|
||||
*
|
||||
* This might not do anything at all, depending on the flags argument.
|
||||
*
|
||||
* "grab" names in this file mean, "look at flags to decide whether to use
|
||||
* FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount.
|
||||
*
|
||||
* @page: pointer to page to be grabbed
|
||||
* @flags: gup flags: these are the FOLL_* flag values.
|
||||
*
|
||||
* Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same
|
||||
* time. Cases: please see the try_grab_compound_head() documentation, with
|
||||
* time. Cases: please see the try_grab_folio() documentation, with
|
||||
* "refs=1".
|
||||
*
|
||||
* Return: true for success, or if no action was required (if neither FOLL_PIN
|
||||
@ -208,32 +166,28 @@ static void put_compound_head(struct page *page, int refs, unsigned int flags)
|
||||
*/
|
||||
bool __must_check try_grab_page(struct page *page, unsigned int flags)
|
||||
{
|
||||
struct folio *folio = page_folio(page);
|
||||
|
||||
WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN));
|
||||
if (WARN_ON_ONCE(folio_ref_count(folio) <= 0))
|
||||
return false;
|
||||
|
||||
if (flags & FOLL_GET)
|
||||
return try_get_page(page);
|
||||
folio_ref_inc(folio);
|
||||
else if (flags & FOLL_PIN) {
|
||||
int refs = 1;
|
||||
|
||||
page = compound_head(page);
|
||||
|
||||
if (WARN_ON_ONCE(page_ref_count(page) <= 0))
|
||||
return false;
|
||||
|
||||
if (hpage_pincount_available(page))
|
||||
hpage_pincount_add(page, 1);
|
||||
else
|
||||
refs = GUP_PIN_COUNTING_BIAS;
|
||||
|
||||
/*
|
||||
* Similar to try_grab_compound_head(): even if using the
|
||||
* hpage_pincount_add/_sub() routines, be sure to
|
||||
* *also* increment the normal page refcount field at least
|
||||
* once, so that the page really is pinned.
|
||||
* Similar to try_grab_folio(): be sure to *also*
|
||||
* increment the normal page refcount field at least once,
|
||||
* so that the page really is pinned.
|
||||
*/
|
||||
page_ref_add(page, refs);
|
||||
if (folio_test_large(folio)) {
|
||||
folio_ref_add(folio, 1);
|
||||
atomic_add(1, folio_pincount_ptr(folio));
|
||||
} else {
|
||||
folio_ref_add(folio, GUP_PIN_COUNTING_BIAS);
|
||||
}
|
||||
|
||||
mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED, 1);
|
||||
node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, 1);
|
||||
}
|
||||
|
||||
return true;
|
||||
@ -250,62 +204,40 @@ bool __must_check try_grab_page(struct page *page, unsigned int flags)
|
||||
*/
|
||||
void unpin_user_page(struct page *page)
|
||||
{
|
||||
put_compound_head(compound_head(page), 1, FOLL_PIN);
|
||||
gup_put_folio(page_folio(page), 1, FOLL_PIN);
|
||||
}
|
||||
EXPORT_SYMBOL(unpin_user_page);
|
||||
|
||||
static inline void compound_range_next(unsigned long i, unsigned long npages,
|
||||
struct page **list, struct page **head,
|
||||
unsigned int *ntails)
|
||||
static inline struct folio *gup_folio_range_next(struct page *start,
|
||||
unsigned long npages, unsigned long i, unsigned int *ntails)
|
||||
{
|
||||
struct page *next, *page;
|
||||
struct page *next = nth_page(start, i);
|
||||
struct folio *folio = page_folio(next);
|
||||
unsigned int nr = 1;
|
||||
|
||||
if (i >= npages)
|
||||
return;
|
||||
if (folio_test_large(folio))
|
||||
nr = min_t(unsigned int, npages - i,
|
||||
folio_nr_pages(folio) - folio_page_idx(folio, next));
|
||||
|
||||
next = *list + i;
|
||||
page = compound_head(next);
|
||||
if (PageCompound(page) && compound_order(page) >= 1)
|
||||
nr = min_t(unsigned int,
|
||||
page + compound_nr(page) - next, npages - i);
|
||||
|
||||
*head = page;
|
||||
*ntails = nr;
|
||||
return folio;
|
||||
}
|
||||
|
||||
#define for_each_compound_range(__i, __list, __npages, __head, __ntails) \
|
||||
for (__i = 0, \
|
||||
compound_range_next(__i, __npages, __list, &(__head), &(__ntails)); \
|
||||
__i < __npages; __i += __ntails, \
|
||||
compound_range_next(__i, __npages, __list, &(__head), &(__ntails)))
|
||||
|
||||
static inline void compound_next(unsigned long i, unsigned long npages,
|
||||
struct page **list, struct page **head,
|
||||
unsigned int *ntails)
|
||||
static inline struct folio *gup_folio_next(struct page **list,
|
||||
unsigned long npages, unsigned long i, unsigned int *ntails)
|
||||
{
|
||||
struct page *page;
|
||||
struct folio *folio = page_folio(list[i]);
|
||||
unsigned int nr;
|
||||
|
||||
if (i >= npages)
|
||||
return;
|
||||
|
||||
page = compound_head(list[i]);
|
||||
for (nr = i + 1; nr < npages; nr++) {
|
||||
if (compound_head(list[nr]) != page)
|
||||
if (page_folio(list[nr]) != folio)
|
||||
break;
|
||||
}
|
||||
|
||||
*head = page;
|
||||
*ntails = nr - i;
|
||||
return folio;
|
||||
}
|
||||
|
||||
#define for_each_compound_head(__i, __list, __npages, __head, __ntails) \
|
||||
for (__i = 0, \
|
||||
compound_next(__i, __npages, __list, &(__head), &(__ntails)); \
|
||||
__i < __npages; __i += __ntails, \
|
||||
compound_next(__i, __npages, __list, &(__head), &(__ntails)))
|
||||
|
||||
/**
|
||||
* unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
|
||||
* @pages: array of pages to be maybe marked dirty, and definitely released.
|
||||
@ -331,16 +263,17 @@ static inline void compound_next(unsigned long i, unsigned long npages,
|
||||
void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
|
||||
bool make_dirty)
|
||||
{
|
||||
unsigned long index;
|
||||
struct page *head;
|
||||
unsigned int ntails;
|
||||
unsigned long i;
|
||||
struct folio *folio;
|
||||
unsigned int nr;
|
||||
|
||||
if (!make_dirty) {
|
||||
unpin_user_pages(pages, npages);
|
||||
return;
|
||||
}
|
||||
|
||||
for_each_compound_head(index, pages, npages, head, ntails) {
|
||||
for (i = 0; i < npages; i += nr) {
|
||||
folio = gup_folio_next(pages, npages, i, &nr);
|
||||
/*
|
||||
* Checking PageDirty at this point may race with
|
||||
* clear_page_dirty_for_io(), but that's OK. Two key
|
||||
@ -361,9 +294,12 @@ void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
|
||||
* written back, so it gets written back again in the
|
||||
* next writeback cycle. This is harmless.
|
||||
*/
|
||||
if (!PageDirty(head))
|
||||
set_page_dirty_lock(head);
|
||||
put_compound_head(head, ntails, FOLL_PIN);
|
||||
if (!folio_test_dirty(folio)) {
|
||||
folio_lock(folio);
|
||||
folio_mark_dirty(folio);
|
||||
folio_unlock(folio);
|
||||
}
|
||||
gup_put_folio(folio, nr, FOLL_PIN);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(unpin_user_pages_dirty_lock);
|
||||
@ -392,14 +328,18 @@ EXPORT_SYMBOL(unpin_user_pages_dirty_lock);
|
||||
void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
|
||||
bool make_dirty)
|
||||
{
|
||||
unsigned long index;
|
||||
struct page *head;
|
||||
unsigned int ntails;
|
||||
unsigned long i;
|
||||
struct folio *folio;
|
||||
unsigned int nr;
|
||||
|
||||
for_each_compound_range(index, &page, npages, head, ntails) {
|
||||
if (make_dirty && !PageDirty(head))
|
||||
set_page_dirty_lock(head);
|
||||
put_compound_head(head, ntails, FOLL_PIN);
|
||||
for (i = 0; i < npages; i += nr) {
|
||||
folio = gup_folio_range_next(page, npages, i, &nr);
|
||||
if (make_dirty && !folio_test_dirty(folio)) {
|
||||
folio_lock(folio);
|
||||
folio_mark_dirty(folio);
|
||||
folio_unlock(folio);
|
||||
}
|
||||
gup_put_folio(folio, nr, FOLL_PIN);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(unpin_user_page_range_dirty_lock);
|
||||
@ -415,9 +355,9 @@ EXPORT_SYMBOL(unpin_user_page_range_dirty_lock);
|
||||
*/
|
||||
void unpin_user_pages(struct page **pages, unsigned long npages)
|
||||
{
|
||||
unsigned long index;
|
||||
struct page *head;
|
||||
unsigned int ntails;
|
||||
unsigned long i;
|
||||
struct folio *folio;
|
||||
unsigned int nr;
|
||||
|
||||
/*
|
||||
* If this WARN_ON() fires, then the system *might* be leaking pages (by
|
||||
@ -427,8 +367,10 @@ void unpin_user_pages(struct page **pages, unsigned long npages)
|
||||
if (WARN_ON(IS_ERR_VALUE(npages)))
|
||||
return;
|
||||
|
||||
for_each_compound_head(index, pages, npages, head, ntails)
|
||||
put_compound_head(head, ntails, FOLL_PIN);
|
||||
for (i = 0; i < npages; i += nr) {
|
||||
folio = gup_folio_next(pages, npages, i, &nr);
|
||||
gup_put_folio(folio, nr, FOLL_PIN);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(unpin_user_pages);
|
||||
|
||||
@ -593,32 +535,6 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
|
||||
*/
|
||||
mark_page_accessed(page);
|
||||
}
|
||||
if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
|
||||
/* Do not mlock pte-mapped THP */
|
||||
if (PageTransCompound(page))
|
||||
goto out;
|
||||
|
||||
/*
|
||||
* The preliminary mapping check is mainly to avoid the
|
||||
* pointless overhead of lock_page on the ZERO_PAGE
|
||||
* which might bounce very badly if there is contention.
|
||||
*
|
||||
* If the page is already locked, we don't need to
|
||||
* handle it now - vmscan will handle it later if and
|
||||
* when it attempts to reclaim the page.
|
||||
*/
|
||||
if (page->mapping && trylock_page(page)) {
|
||||
lru_add_drain(); /* push cached pages to LRU */
|
||||
/*
|
||||
* Because we lock page here, and migration is
|
||||
* blocked by the pte's page reference, and we
|
||||
* know the page is still mapped, we don't even
|
||||
* need to check for file-cache page truncation.
|
||||
*/
|
||||
mlock_vma_page(page);
|
||||
unlock_page(page);
|
||||
}
|
||||
}
|
||||
out:
|
||||
pte_unmap_unlock(ptep, ptl);
|
||||
return page;
|
||||
@ -941,9 +857,6 @@ static int faultin_page(struct vm_area_struct *vma,
|
||||
unsigned int fault_flags = 0;
|
||||
vm_fault_t ret;
|
||||
|
||||
/* mlock all present pages, but do not fault in new pages */
|
||||
if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
|
||||
return -ENOENT;
|
||||
if (*flags & FOLL_NOFAULT)
|
||||
return -EFAULT;
|
||||
if (*flags & FOLL_WRITE)
|
||||
@ -1194,8 +1107,6 @@ static long __get_user_pages(struct mm_struct *mm,
|
||||
case -ENOMEM:
|
||||
case -EHWPOISON:
|
||||
goto out;
|
||||
case -ENOENT:
|
||||
goto next_page;
|
||||
}
|
||||
BUG();
|
||||
} else if (PTR_ERR(page) == -EEXIST) {
|
||||
@ -1500,9 +1411,14 @@ long populate_vma_page_range(struct vm_area_struct *vma,
|
||||
VM_BUG_ON_VMA(end > vma->vm_end, vma);
|
||||
mmap_assert_locked(mm);
|
||||
|
||||
gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
|
||||
/*
|
||||
* Rightly or wrongly, the VM_LOCKONFAULT case has never used
|
||||
* faultin_page() to break COW, so it has no work to do here.
|
||||
*/
|
||||
if (vma->vm_flags & VM_LOCKONFAULT)
|
||||
gup_flags &= ~FOLL_POPULATE;
|
||||
return nr_pages;
|
||||
|
||||
gup_flags = FOLL_TOUCH;
|
||||
/*
|
||||
* We want to touch writable mappings with a write fault in order
|
||||
* to break COW, except for shared mappings because these don't COW
|
||||
@ -1569,10 +1485,9 @@ long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start,
|
||||
* in the page table.
|
||||
* FOLL_HWPOISON: Return -EHWPOISON instead of -EFAULT when we hit
|
||||
* a poisoned page.
|
||||
* FOLL_POPULATE: Always populate memory with VM_LOCKONFAULT.
|
||||
* !FOLL_FORCE: Require proper access permissions.
|
||||
*/
|
||||
gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK | FOLL_HWPOISON;
|
||||
gup_flags = FOLL_TOUCH | FOLL_HWPOISON;
|
||||
if (write)
|
||||
gup_flags |= FOLL_WRITE;
|
||||
|
||||
@ -1852,72 +1767,80 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages,
|
||||
struct page **pages,
|
||||
unsigned int gup_flags)
|
||||
{
|
||||
unsigned long i;
|
||||
unsigned long isolation_error_count = 0;
|
||||
bool drain_allow = true;
|
||||
unsigned long isolation_error_count = 0, i;
|
||||
struct folio *prev_folio = NULL;
|
||||
LIST_HEAD(movable_page_list);
|
||||
long ret = 0;
|
||||
struct page *prev_head = NULL;
|
||||
struct page *head;
|
||||
struct migration_target_control mtc = {
|
||||
.nid = NUMA_NO_NODE,
|
||||
.gfp_mask = GFP_USER | __GFP_NOWARN,
|
||||
};
|
||||
bool drain_allow = true;
|
||||
int ret = 0;
|
||||
|
||||
for (i = 0; i < nr_pages; i++) {
|
||||
head = compound_head(pages[i]);
|
||||
if (head == prev_head)
|
||||
continue;
|
||||
prev_head = head;
|
||||
/*
|
||||
* If we get a movable page, since we are going to be pinning
|
||||
* these entries, try to move them out if possible.
|
||||
*/
|
||||
if (!is_pinnable_page(head)) {
|
||||
if (PageHuge(head)) {
|
||||
if (!isolate_huge_page(head, &movable_page_list))
|
||||
isolation_error_count++;
|
||||
} else {
|
||||
if (!PageLRU(head) && drain_allow) {
|
||||
lru_add_drain_all();
|
||||
drain_allow = false;
|
||||
}
|
||||
struct folio *folio = page_folio(pages[i]);
|
||||
|
||||
if (isolate_lru_page(head)) {
|
||||
isolation_error_count++;
|
||||
continue;
|
||||
}
|
||||
list_add_tail(&head->lru, &movable_page_list);
|
||||
mod_node_page_state(page_pgdat(head),
|
||||
NR_ISOLATED_ANON +
|
||||
page_is_file_lru(head),
|
||||
thp_nr_pages(head));
|
||||
}
|
||||
if (folio == prev_folio)
|
||||
continue;
|
||||
prev_folio = folio;
|
||||
|
||||
if (folio_is_pinnable(folio))
|
||||
continue;
|
||||
|
||||
/*
|
||||
* Try to move out any movable page before pinning the range.
|
||||
*/
|
||||
if (folio_test_hugetlb(folio)) {
|
||||
if (!isolate_huge_page(&folio->page,
|
||||
&movable_page_list))
|
||||
isolation_error_count++;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!folio_test_lru(folio) && drain_allow) {
|
||||
lru_add_drain_all();
|
||||
drain_allow = false;
|
||||
}
|
||||
|
||||
if (folio_isolate_lru(folio)) {
|
||||
isolation_error_count++;
|
||||
continue;
|
||||
}
|
||||
list_add_tail(&folio->lru, &movable_page_list);
|
||||
node_stat_mod_folio(folio,
|
||||
NR_ISOLATED_ANON + folio_is_file_lru(folio),
|
||||
folio_nr_pages(folio));
|
||||
}
|
||||
|
||||
if (!list_empty(&movable_page_list) || isolation_error_count)
|
||||
goto unpin_pages;
|
||||
|
||||
/*
|
||||
* If list is empty, and no isolation errors, means that all pages are
|
||||
* in the correct zone.
|
||||
*/
|
||||
if (list_empty(&movable_page_list) && !isolation_error_count)
|
||||
return nr_pages;
|
||||
return nr_pages;
|
||||
|
||||
unpin_pages:
|
||||
if (gup_flags & FOLL_PIN) {
|
||||
unpin_user_pages(pages, nr_pages);
|
||||
} else {
|
||||
for (i = 0; i < nr_pages; i++)
|
||||
put_page(pages[i]);
|
||||
}
|
||||
|
||||
if (!list_empty(&movable_page_list)) {
|
||||
struct migration_target_control mtc = {
|
||||
.nid = NUMA_NO_NODE,
|
||||
.gfp_mask = GFP_USER | __GFP_NOWARN,
|
||||
};
|
||||
|
||||
ret = migrate_pages(&movable_page_list, alloc_migration_target,
|
||||
NULL, (unsigned long)&mtc, MIGRATE_SYNC,
|
||||
MR_LONGTERM_PIN, NULL);
|
||||
if (ret && !list_empty(&movable_page_list))
|
||||
putback_movable_pages(&movable_page_list);
|
||||
if (ret > 0) /* number of pages not migrated */
|
||||
ret = -ENOMEM;
|
||||
}
|
||||
|
||||
return ret > 0 ? -ENOMEM : ret;
|
||||
if (ret && !list_empty(&movable_page_list))
|
||||
putback_movable_pages(&movable_page_list);
|
||||
return ret;
|
||||
}
|
||||
#else
|
||||
static long check_and_migrate_movable_pages(unsigned long nr_pages,
|
||||
@ -2227,7 +2150,8 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
|
||||
ptem = ptep = pte_offset_map(&pmd, addr);
|
||||
do {
|
||||
pte_t pte = ptep_get_lockless(ptep);
|
||||
struct page *head, *page;
|
||||
struct page *page;
|
||||
struct folio *folio;
|
||||
|
||||
/*
|
||||
* Similar to the PMD case below, NUMA hinting must take slow
|
||||
@ -2254,22 +2178,20 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
|
||||
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
|
||||
page = pte_page(pte);
|
||||
|
||||
head = try_grab_compound_head(page, 1, flags);
|
||||
if (!head)
|
||||
folio = try_grab_folio(page, 1, flags);
|
||||
if (!folio)
|
||||
goto pte_unmap;
|
||||
|
||||
if (unlikely(page_is_secretmem(page))) {
|
||||
put_compound_head(head, 1, flags);
|
||||
gup_put_folio(folio, 1, flags);
|
||||
goto pte_unmap;
|
||||
}
|
||||
|
||||
if (unlikely(pte_val(pte) != pte_val(*ptep))) {
|
||||
put_compound_head(head, 1, flags);
|
||||
gup_put_folio(folio, 1, flags);
|
||||
goto pte_unmap;
|
||||
}
|
||||
|
||||
VM_BUG_ON_PAGE(compound_head(page) != head, page);
|
||||
|
||||
/*
|
||||
* We need to make the page accessible if and only if we are
|
||||
* going to access its content (the FOLL_PIN case). Please
|
||||
@ -2279,14 +2201,13 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
|
||||
if (flags & FOLL_PIN) {
|
||||
ret = arch_make_page_accessible(page);
|
||||
if (ret) {
|
||||
unpin_user_page(page);
|
||||
gup_put_folio(folio, 1, flags);
|
||||
goto pte_unmap;
|
||||
}
|
||||
}
|
||||
SetPageReferenced(page);
|
||||
folio_set_referenced(folio);
|
||||
pages[*nr] = page;
|
||||
(*nr)++;
|
||||
|
||||
} while (ptep++, addr += PAGE_SIZE, addr != end);
|
||||
|
||||
ret = 1;
|
||||
@ -2403,8 +2324,8 @@ static int record_subpages(struct page *page, unsigned long addr,
|
||||
{
|
||||
int nr;
|
||||
|
||||
for (nr = 0; addr != end; addr += PAGE_SIZE)
|
||||
pages[nr++] = page++;
|
||||
for (nr = 0; addr != end; nr++, addr += PAGE_SIZE)
|
||||
pages[nr] = nth_page(page, nr);
|
||||
|
||||
return nr;
|
||||
}
|
||||
@ -2422,7 +2343,8 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
|
||||
struct page **pages, int *nr)
|
||||
{
|
||||
unsigned long pte_end;
|
||||
struct page *head, *page;
|
||||
struct page *page;
|
||||
struct folio *folio;
|
||||
pte_t pte;
|
||||
int refs;
|
||||
|
||||
@ -2438,21 +2360,20 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
|
||||
/* hugepages are never "special" */
|
||||
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
|
||||
|
||||
head = pte_page(pte);
|
||||
page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
|
||||
page = nth_page(pte_page(pte), (addr & (sz - 1)) >> PAGE_SHIFT);
|
||||
refs = record_subpages(page, addr, end, pages + *nr);
|
||||
|
||||
head = try_grab_compound_head(head, refs, flags);
|
||||
if (!head)
|
||||
folio = try_grab_folio(page, refs, flags);
|
||||
if (!folio)
|
||||
return 0;
|
||||
|
||||
if (unlikely(pte_val(pte) != pte_val(*ptep))) {
|
||||
put_compound_head(head, refs, flags);
|
||||
gup_put_folio(folio, refs, flags);
|
||||
return 0;
|
||||
}
|
||||
|
||||
*nr += refs;
|
||||
SetPageReferenced(head);
|
||||
folio_set_referenced(folio);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -2486,7 +2407,8 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
|
||||
unsigned long end, unsigned int flags,
|
||||
struct page **pages, int *nr)
|
||||
{
|
||||
struct page *head, *page;
|
||||
struct page *page;
|
||||
struct folio *folio;
|
||||
int refs;
|
||||
|
||||
if (!pmd_access_permitted(orig, flags & FOLL_WRITE))
|
||||
@ -2499,20 +2421,20 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
|
||||
pages, nr);
|
||||
}
|
||||
|
||||
page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
|
||||
page = nth_page(pmd_page(orig), (addr & ~PMD_MASK) >> PAGE_SHIFT);
|
||||
refs = record_subpages(page, addr, end, pages + *nr);
|
||||
|
||||
head = try_grab_compound_head(pmd_page(orig), refs, flags);
|
||||
if (!head)
|
||||
folio = try_grab_folio(page, refs, flags);
|
||||
if (!folio)
|
||||
return 0;
|
||||
|
||||
if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
|
||||
put_compound_head(head, refs, flags);
|
||||
gup_put_folio(folio, refs, flags);
|
||||
return 0;
|
||||
}
|
||||
|
||||
*nr += refs;
|
||||
SetPageReferenced(head);
|
||||
folio_set_referenced(folio);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -2520,7 +2442,8 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
|
||||
unsigned long end, unsigned int flags,
|
||||
struct page **pages, int *nr)
|
||||
{
|
||||
struct page *head, *page;
|
||||
struct page *page;
|
||||
struct folio *folio;
|
||||
int refs;
|
||||
|
||||
if (!pud_access_permitted(orig, flags & FOLL_WRITE))
|
||||
@ -2533,20 +2456,20 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
|
||||
pages, nr);
|
||||
}
|
||||
|
||||
page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
|
||||
page = nth_page(pud_page(orig), (addr & ~PUD_MASK) >> PAGE_SHIFT);
|
||||
refs = record_subpages(page, addr, end, pages + *nr);
|
||||
|
||||
head = try_grab_compound_head(pud_page(orig), refs, flags);
|
||||
if (!head)
|
||||
folio = try_grab_folio(page, refs, flags);
|
||||
if (!folio)
|
||||
return 0;
|
||||
|
||||
if (unlikely(pud_val(orig) != pud_val(*pudp))) {
|
||||
put_compound_head(head, refs, flags);
|
||||
gup_put_folio(folio, refs, flags);
|
||||
return 0;
|
||||
}
|
||||
|
||||
*nr += refs;
|
||||
SetPageReferenced(head);
|
||||
folio_set_referenced(folio);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -2555,27 +2478,28 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
|
||||
struct page **pages, int *nr)
|
||||
{
|
||||
int refs;
|
||||
struct page *head, *page;
|
||||
struct page *page;
|
||||
struct folio *folio;
|
||||
|
||||
if (!pgd_access_permitted(orig, flags & FOLL_WRITE))
|
||||
return 0;
|
||||
|
||||
BUILD_BUG_ON(pgd_devmap(orig));
|
||||
|
||||
page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
|
||||
page = nth_page(pgd_page(orig), (addr & ~PGDIR_MASK) >> PAGE_SHIFT);
|
||||
refs = record_subpages(page, addr, end, pages + *nr);
|
||||
|
||||
head = try_grab_compound_head(pgd_page(orig), refs, flags);
|
||||
if (!head)
|
||||
folio = try_grab_folio(page, refs, flags);
|
||||
if (!folio)
|
||||
return 0;
|
||||
|
||||
if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) {
|
||||
put_compound_head(head, refs, flags);
|
||||
gup_put_folio(folio, refs, flags);
|
||||
return 0;
|
||||
}
|
||||
|
||||
*nr += refs;
|
||||
SetPageReferenced(head);
|
||||
folio_set_referenced(folio);
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
178
mm/huge_memory.c
178
mm/huge_memory.c
@ -583,13 +583,10 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
|
||||
unsigned long ret;
|
||||
loff_t off = (loff_t)pgoff << PAGE_SHIFT;
|
||||
|
||||
if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD))
|
||||
goto out;
|
||||
|
||||
ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE);
|
||||
if (ret)
|
||||
return ret;
|
||||
out:
|
||||
|
||||
return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
|
||||
@ -1381,39 +1378,6 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
|
||||
if (flags & FOLL_TOUCH)
|
||||
touch_pmd(vma, addr, pmd, flags);
|
||||
|
||||
if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
|
||||
/*
|
||||
* We don't mlock() pte-mapped THPs. This way we can avoid
|
||||
* leaking mlocked pages into non-VM_LOCKED VMAs.
|
||||
*
|
||||
* For anon THP:
|
||||
*
|
||||
* In most cases the pmd is the only mapping of the page as we
|
||||
* break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
|
||||
* writable private mappings in populate_vma_page_range().
|
||||
*
|
||||
* The only scenario when we have the page shared here is if we
|
||||
* mlocking read-only mapping shared over fork(). We skip
|
||||
* mlocking such pages.
|
||||
*
|
||||
* For file THP:
|
||||
*
|
||||
* We can expect PageDoubleMap() to be stable under page lock:
|
||||
* for file pages we set it in page_add_file_rmap(), which
|
||||
* requires page to be locked.
|
||||
*/
|
||||
|
||||
if (PageAnon(page) && compound_mapcount(page) != 1)
|
||||
goto skip_mlock;
|
||||
if (PageDoubleMap(page) || !page->mapping)
|
||||
goto skip_mlock;
|
||||
if (!trylock_page(page))
|
||||
goto skip_mlock;
|
||||
if (page->mapping && !PageDoubleMap(page))
|
||||
mlock_vma_page(page);
|
||||
unlock_page(page);
|
||||
}
|
||||
skip_mlock:
|
||||
page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
|
||||
VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
|
||||
|
||||
@ -1611,7 +1575,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
|
||||
|
||||
if (pmd_present(orig_pmd)) {
|
||||
page = pmd_page(orig_pmd);
|
||||
page_remove_rmap(page, true);
|
||||
page_remove_rmap(page, vma, true);
|
||||
VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
|
||||
VM_BUG_ON_PAGE(!PageHead(page), page);
|
||||
} else if (thp_migration_supported()) {
|
||||
@ -2007,7 +1971,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
|
||||
set_page_dirty(page);
|
||||
if (!PageReferenced(page) && pmd_young(old_pmd))
|
||||
SetPageReferenced(page);
|
||||
page_remove_rmap(page, true);
|
||||
page_remove_rmap(page, vma, true);
|
||||
put_page(page);
|
||||
}
|
||||
add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
|
||||
@ -2141,6 +2105,9 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
|
||||
}
|
||||
}
|
||||
unlock_page_memcg(page);
|
||||
|
||||
/* Above is effectively page_remove_rmap(page, vma, true) */
|
||||
munlock_vma_page(page, vma, true);
|
||||
}
|
||||
|
||||
smp_wmb(); /* make pte visible before pmd */
|
||||
@ -2148,18 +2115,18 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
|
||||
|
||||
if (freeze) {
|
||||
for (i = 0; i < HPAGE_PMD_NR; i++) {
|
||||
page_remove_rmap(page + i, false);
|
||||
page_remove_rmap(page + i, vma, false);
|
||||
put_page(page + i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
|
||||
unsigned long address, bool freeze, struct page *page)
|
||||
unsigned long address, bool freeze, struct folio *folio)
|
||||
{
|
||||
spinlock_t *ptl;
|
||||
struct mmu_notifier_range range;
|
||||
bool do_unlock_page = false;
|
||||
bool do_unlock_folio = false;
|
||||
pmd_t _pmd;
|
||||
|
||||
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
|
||||
@ -2169,20 +2136,20 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
|
||||
ptl = pmd_lock(vma->vm_mm, pmd);
|
||||
|
||||
/*
|
||||
* If caller asks to setup a migration entries, we need a page to check
|
||||
* pmd against. Otherwise we can end up replacing wrong page.
|
||||
* If caller asks to setup a migration entry, we need a folio to check
|
||||
* pmd against. Otherwise we can end up replacing wrong folio.
|
||||
*/
|
||||
VM_BUG_ON(freeze && !page);
|
||||
if (page) {
|
||||
VM_WARN_ON_ONCE(!PageLocked(page));
|
||||
if (page != pmd_page(*pmd))
|
||||
VM_BUG_ON(freeze && !folio);
|
||||
if (folio) {
|
||||
VM_WARN_ON_ONCE(!folio_test_locked(folio));
|
||||
if (folio != page_folio(pmd_page(*pmd)))
|
||||
goto out;
|
||||
}
|
||||
|
||||
repeat:
|
||||
if (pmd_trans_huge(*pmd)) {
|
||||
if (!page) {
|
||||
page = pmd_page(*pmd);
|
||||
if (!folio) {
|
||||
folio = page_folio(pmd_page(*pmd));
|
||||
/*
|
||||
* An anonymous page must be locked, to ensure that a
|
||||
* concurrent reuse_swap_page() sees stable mapcount;
|
||||
@ -2190,33 +2157,31 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
|
||||
* and page lock must not be taken when zap_pmd_range()
|
||||
* calls __split_huge_pmd() while i_mmap_lock is held.
|
||||
*/
|
||||
if (PageAnon(page)) {
|
||||
if (unlikely(!trylock_page(page))) {
|
||||
get_page(page);
|
||||
if (folio_test_anon(folio)) {
|
||||
if (unlikely(!folio_trylock(folio))) {
|
||||
folio_get(folio);
|
||||
_pmd = *pmd;
|
||||
spin_unlock(ptl);
|
||||
lock_page(page);
|
||||
folio_lock(folio);
|
||||
spin_lock(ptl);
|
||||
if (unlikely(!pmd_same(*pmd, _pmd))) {
|
||||
unlock_page(page);
|
||||
put_page(page);
|
||||
page = NULL;
|
||||
folio_unlock(folio);
|
||||
folio_put(folio);
|
||||
folio = NULL;
|
||||
goto repeat;
|
||||
}
|
||||
put_page(page);
|
||||
folio_put(folio);
|
||||
}
|
||||
do_unlock_page = true;
|
||||
do_unlock_folio = true;
|
||||
}
|
||||
}
|
||||
if (PageMlocked(page))
|
||||
clear_page_mlock(page);
|
||||
} else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd)))
|
||||
goto out;
|
||||
__split_huge_pmd_locked(vma, pmd, range.start, freeze);
|
||||
out:
|
||||
spin_unlock(ptl);
|
||||
if (do_unlock_page)
|
||||
unlock_page(page);
|
||||
if (do_unlock_folio)
|
||||
folio_unlock(folio);
|
||||
/*
|
||||
* No need to double call mmu_notifier->invalidate_range() callback.
|
||||
* They are 3 cases to consider inside __split_huge_pmd_locked():
|
||||
@ -2234,7 +2199,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
|
||||
}
|
||||
|
||||
void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
|
||||
bool freeze, struct page *page)
|
||||
bool freeze, struct folio *folio)
|
||||
{
|
||||
pgd_t *pgd;
|
||||
p4d_t *p4d;
|
||||
@ -2255,7 +2220,7 @@ void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
|
||||
|
||||
pmd = pmd_offset(pud, address);
|
||||
|
||||
__split_huge_pmd(vma, pmd, address, freeze, page);
|
||||
__split_huge_pmd(vma, pmd, address, freeze, folio);
|
||||
}
|
||||
|
||||
static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address)
|
||||
@ -2295,6 +2260,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
|
||||
|
||||
static void unmap_page(struct page *page)
|
||||
{
|
||||
struct folio *folio = page_folio(page);
|
||||
enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD |
|
||||
TTU_SYNC;
|
||||
|
||||
@ -2305,26 +2271,27 @@ static void unmap_page(struct page *page)
|
||||
* pages can simply be left unmapped, then faulted back on demand.
|
||||
* If that is ever changed (perhaps for mlock), update remap_page().
|
||||
*/
|
||||
if (PageAnon(page))
|
||||
try_to_migrate(page, ttu_flags);
|
||||
if (folio_test_anon(folio))
|
||||
try_to_migrate(folio, ttu_flags);
|
||||
else
|
||||
try_to_unmap(page, ttu_flags | TTU_IGNORE_MLOCK);
|
||||
try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK);
|
||||
|
||||
VM_WARN_ON_ONCE_PAGE(page_mapped(page), page);
|
||||
}
|
||||
|
||||
static void remap_page(struct page *page, unsigned int nr)
|
||||
static void remap_page(struct folio *folio, unsigned long nr)
|
||||
{
|
||||
int i;
|
||||
int i = 0;
|
||||
|
||||
/* If unmap_page() uses try_to_migrate() on file, remove this check */
|
||||
if (!PageAnon(page))
|
||||
if (!folio_test_anon(folio))
|
||||
return;
|
||||
if (PageTransHuge(page)) {
|
||||
remove_migration_ptes(page, page, true);
|
||||
} else {
|
||||
for (i = 0; i < nr; i++)
|
||||
remove_migration_ptes(page + i, page + i, true);
|
||||
for (;;) {
|
||||
remove_migration_ptes(folio, folio, true);
|
||||
i += folio_nr_pages(folio);
|
||||
if (i >= nr)
|
||||
break;
|
||||
folio = folio_next(folio);
|
||||
}
|
||||
}
|
||||
|
||||
@ -2344,8 +2311,11 @@ static void lru_add_page_tail(struct page *head, struct page *tail,
|
||||
} else {
|
||||
/* head is still on lru (and we have it frozen) */
|
||||
VM_WARN_ON(!PageLRU(head));
|
||||
if (PageUnevictable(tail))
|
||||
tail->mlock_count = 0;
|
||||
else
|
||||
list_add_tail(&tail->lru, &head->lru);
|
||||
SetPageLRU(tail);
|
||||
list_add_tail(&tail->lru, &head->lru);
|
||||
}
|
||||
}
|
||||
|
||||
@ -2481,7 +2451,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
|
||||
}
|
||||
local_irq_enable();
|
||||
|
||||
remap_page(head, nr);
|
||||
remap_page(folio, nr);
|
||||
|
||||
if (PageSwapCache(head)) {
|
||||
swp_entry_t entry = { .val = page_private(head) };
|
||||
@ -2506,30 +2476,6 @@ static void __split_huge_page(struct page *page, struct list_head *list,
|
||||
}
|
||||
}
|
||||
|
||||
int total_mapcount(struct page *page)
|
||||
{
|
||||
int i, compound, nr, ret;
|
||||
|
||||
VM_BUG_ON_PAGE(PageTail(page), page);
|
||||
|
||||
if (likely(!PageCompound(page)))
|
||||
return atomic_read(&page->_mapcount) + 1;
|
||||
|
||||
compound = compound_mapcount(page);
|
||||
nr = compound_nr(page);
|
||||
if (PageHuge(page))
|
||||
return compound;
|
||||
ret = compound;
|
||||
for (i = 0; i < nr; i++)
|
||||
ret += atomic_read(&page[i]._mapcount) + 1;
|
||||
/* File pages has compound_mapcount included in _mapcount */
|
||||
if (!PageAnon(page))
|
||||
return ret - compound * nr;
|
||||
if (PageDoubleMap(page))
|
||||
ret -= nr;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* This calculates accurately how many mappings a transparent hugepage
|
||||
* has (unlike page_mapcount() which isn't fully accurate). This full
|
||||
@ -2579,18 +2525,19 @@ int page_trans_huge_mapcount(struct page *page)
|
||||
}
|
||||
|
||||
/* Racy check whether the huge page can be split */
|
||||
bool can_split_huge_page(struct page *page, int *pextra_pins)
|
||||
bool can_split_folio(struct folio *folio, int *pextra_pins)
|
||||
{
|
||||
int extra_pins;
|
||||
|
||||
/* Additional pins from page cache */
|
||||
if (PageAnon(page))
|
||||
extra_pins = PageSwapCache(page) ? thp_nr_pages(page) : 0;
|
||||
if (folio_test_anon(folio))
|
||||
extra_pins = folio_test_swapcache(folio) ?
|
||||
folio_nr_pages(folio) : 0;
|
||||
else
|
||||
extra_pins = thp_nr_pages(page);
|
||||
extra_pins = folio_nr_pages(folio);
|
||||
if (pextra_pins)
|
||||
*pextra_pins = extra_pins;
|
||||
return total_mapcount(page) == page_count(page) - extra_pins - 1;
|
||||
return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - 1;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -2614,7 +2561,8 @@ bool can_split_huge_page(struct page *page, int *pextra_pins)
|
||||
*/
|
||||
int split_huge_page_to_list(struct page *page, struct list_head *list)
|
||||
{
|
||||
struct page *head = compound_head(page);
|
||||
struct folio *folio = page_folio(page);
|
||||
struct page *head = &folio->page;
|
||||
struct deferred_split *ds_queue = get_deferred_split_queue(head);
|
||||
XA_STATE(xas, &head->mapping->i_pages, head->index);
|
||||
struct anon_vma *anon_vma = NULL;
|
||||
@ -2634,7 +2582,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
|
||||
* The caller does not necessarily hold an mmap_lock that would
|
||||
* prevent the anon_vma disappearing so we first we take a
|
||||
* reference to it and then lock the anon_vma for write. This
|
||||
* is similar to page_lock_anon_vma_read except the write lock
|
||||
* is similar to folio_lock_anon_vma_read except the write lock
|
||||
* is taken to serialise against parallel split or collapse
|
||||
* operations.
|
||||
*/
|
||||
@ -2681,7 +2629,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
|
||||
* Racy check if we can split the page, before unmap_page() will
|
||||
* split PMDs
|
||||
*/
|
||||
if (!can_split_huge_page(head, &extra_pins)) {
|
||||
if (!can_split_folio(folio, &extra_pins)) {
|
||||
ret = -EBUSY;
|
||||
goto out_unlock;
|
||||
}
|
||||
@ -2731,7 +2679,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
|
||||
if (mapping)
|
||||
xas_unlock(&xas);
|
||||
local_irq_enable();
|
||||
remap_page(head, thp_nr_pages(head));
|
||||
remap_page(folio, folio_nr_pages(folio));
|
||||
ret = -EBUSY;
|
||||
}
|
||||
|
||||
@ -2988,7 +2936,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
|
||||
goto next;
|
||||
|
||||
total++;
|
||||
if (!can_split_huge_page(compound_head(page), NULL))
|
||||
if (!can_split_folio(page_folio(page), NULL))
|
||||
goto next;
|
||||
|
||||
if (!trylock_page(page))
|
||||
@ -3181,7 +3129,7 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
|
||||
if (pmd_soft_dirty(pmdval))
|
||||
pmdswp = pmd_swp_mksoft_dirty(pmdswp);
|
||||
set_pmd_at(mm, address, pvmw->pmd, pmdswp);
|
||||
page_remove_rmap(page, true);
|
||||
page_remove_rmap(page, vma, true);
|
||||
put_page(page);
|
||||
}
|
||||
|
||||
@ -3210,10 +3158,8 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
|
||||
if (PageAnon(new))
|
||||
page_add_anon_rmap(new, vma, mmun_start, true);
|
||||
else
|
||||
page_add_file_rmap(new, true);
|
||||
page_add_file_rmap(new, vma, true);
|
||||
set_pmd_at(mm, mmun_start, pvmw->pmd, pmde);
|
||||
if ((vma->vm_flags & VM_LOCKED) && !PageDoubleMap(new))
|
||||
mlock_vma_page(new);
|
||||
|
||||
/* No need to invalidate - it was non-present before */
|
||||
update_mmu_cache_pmd(vma, address, pvmw->pmd);
|
||||
|
15
mm/hugetlb.c
15
mm/hugetlb.c
@ -1321,7 +1321,9 @@ static void __destroy_compound_gigantic_page(struct page *page,
|
||||
}
|
||||
|
||||
set_compound_order(page, 0);
|
||||
#ifdef CONFIG_64BIT
|
||||
page[1].compound_nr = 0;
|
||||
#endif
|
||||
__ClearPageHead(page);
|
||||
}
|
||||
|
||||
@ -1813,7 +1815,9 @@ static bool __prep_compound_gigantic_page(struct page *page, unsigned int order,
|
||||
for (; j < nr_pages; j++, p = mem_map_next(p, page, j))
|
||||
__ClearPageReserved(p);
|
||||
set_compound_order(page, 0);
|
||||
#ifdef CONFIG_64BIT
|
||||
page[1].compound_nr = 0;
|
||||
#endif
|
||||
__ClearPageHead(page);
|
||||
return false;
|
||||
}
|
||||
@ -5013,7 +5017,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
|
||||
set_page_dirty(page);
|
||||
|
||||
hugetlb_count_sub(pages_per_huge_page(h), mm);
|
||||
page_remove_rmap(page, true);
|
||||
page_remove_rmap(page, vma, true);
|
||||
|
||||
spin_unlock(ptl);
|
||||
tlb_remove_page_size(tlb, page, huge_page_size(h));
|
||||
@ -5258,7 +5262,7 @@ static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
/* Break COW */
|
||||
huge_ptep_clear_flush(vma, haddr, ptep);
|
||||
mmu_notifier_invalidate_range(mm, range.start, range.end);
|
||||
page_remove_rmap(old_page, true);
|
||||
page_remove_rmap(old_page, vma, true);
|
||||
hugepage_add_new_anon_rmap(new_page, vma, haddr);
|
||||
set_huge_pte_at(mm, haddr, ptep,
|
||||
make_huge_pte(vma, new_page, 1));
|
||||
@ -6074,7 +6078,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
|
||||
if (pages) {
|
||||
/*
|
||||
* try_grab_compound_head() should always succeed here,
|
||||
* try_grab_folio() should always succeed here,
|
||||
* because: a) we hold the ptl lock, and b) we've just
|
||||
* checked that the huge page is present in the page
|
||||
* tables. If the huge page is present, then the tail
|
||||
@ -6083,9 +6087,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
* any way. So this page must be available at this
|
||||
* point, unless the page refcount overflowed:
|
||||
*/
|
||||
if (WARN_ON_ONCE(!try_grab_compound_head(pages[i],
|
||||
refs,
|
||||
flags))) {
|
||||
if (WARN_ON_ONCE(!try_grab_folio(pages[i], refs,
|
||||
flags))) {
|
||||
spin_unlock(ptl);
|
||||
remainder = 0;
|
||||
err = -ENOMEM;
|
||||
|
117
mm/internal.h
117
mm/internal.h
@ -10,6 +10,7 @@
|
||||
#include <linux/fs.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/rmap.h>
|
||||
#include <linux/tracepoint-defs.h>
|
||||
|
||||
struct folio_batch;
|
||||
@ -66,24 +67,20 @@ static inline void wake_throttle_isolated(pg_data_t *pgdat)
|
||||
vm_fault_t do_swap_page(struct vm_fault *vmf);
|
||||
void folio_rotate_reclaimable(struct folio *folio);
|
||||
bool __folio_end_writeback(struct folio *folio);
|
||||
void deactivate_file_folio(struct folio *folio);
|
||||
|
||||
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
|
||||
unsigned long floor, unsigned long ceiling);
|
||||
void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
|
||||
|
||||
static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
|
||||
{
|
||||
return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP));
|
||||
}
|
||||
|
||||
struct zap_details;
|
||||
void unmap_page_range(struct mmu_gather *tlb,
|
||||
struct vm_area_struct *vma,
|
||||
unsigned long addr, unsigned long end,
|
||||
struct zap_details *details);
|
||||
|
||||
void do_page_cache_ra(struct readahead_control *, unsigned long nr_to_read,
|
||||
unsigned long lookahead_size);
|
||||
void page_cache_ra_order(struct readahead_control *, struct file_ra_state *,
|
||||
unsigned int order);
|
||||
void force_page_cache_ra(struct readahead_control *, unsigned long nr);
|
||||
static inline void force_page_cache_readahead(struct address_space *mapping,
|
||||
struct file *file, pgoff_t index, unsigned long nr_to_read)
|
||||
@ -100,6 +97,9 @@ void filemap_free_folio(struct address_space *mapping, struct folio *folio);
|
||||
int truncate_inode_folio(struct address_space *mapping, struct folio *folio);
|
||||
bool truncate_inode_partial_folio(struct folio *folio, loff_t start,
|
||||
loff_t end);
|
||||
long invalidate_inode_page(struct page *page);
|
||||
unsigned long invalidate_mapping_pagevec(struct address_space *mapping,
|
||||
pgoff_t start, pgoff_t end, unsigned long *nr_pagevec);
|
||||
|
||||
/**
|
||||
* folio_evictable - Test whether a folio is evictable.
|
||||
@ -163,8 +163,10 @@ pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr,
|
||||
/*
|
||||
* in mm/vmscan.c:
|
||||
*/
|
||||
extern int isolate_lru_page(struct page *page);
|
||||
extern void putback_lru_page(struct page *page);
|
||||
int isolate_lru_page(struct page *page);
|
||||
int folio_isolate_lru(struct folio *folio);
|
||||
void putback_lru_page(struct page *page);
|
||||
void folio_putback_lru(struct folio *folio);
|
||||
extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason);
|
||||
|
||||
/*
|
||||
@ -396,6 +398,7 @@ static inline bool is_data_mapping(vm_flags_t flags)
|
||||
void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
struct vm_area_struct *prev);
|
||||
void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma);
|
||||
struct anon_vma *folio_anon_vma(struct folio *folio);
|
||||
|
||||
#ifdef CONFIG_MMU
|
||||
void unmap_mapping_folio(struct folio *folio);
|
||||
@ -404,32 +407,56 @@ extern long populate_vma_page_range(struct vm_area_struct *vma,
|
||||
extern long faultin_vma_page_range(struct vm_area_struct *vma,
|
||||
unsigned long start, unsigned long end,
|
||||
bool write, int *locked);
|
||||
extern void munlock_vma_pages_range(struct vm_area_struct *vma,
|
||||
unsigned long start, unsigned long end);
|
||||
static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
|
||||
{
|
||||
munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end);
|
||||
}
|
||||
|
||||
/*
|
||||
* must be called with vma's mmap_lock held for read or write, and page locked.
|
||||
*/
|
||||
extern void mlock_vma_page(struct page *page);
|
||||
extern unsigned int munlock_vma_page(struct page *page);
|
||||
|
||||
extern int mlock_future_check(struct mm_struct *mm, unsigned long flags,
|
||||
unsigned long len);
|
||||
|
||||
/*
|
||||
* Clear the page's PageMlocked(). This can be useful in a situation where
|
||||
* we want to unconditionally remove a page from the pagecache -- e.g.,
|
||||
* on truncation or freeing.
|
||||
* mlock_vma_page() and munlock_vma_page():
|
||||
* should be called with vma's mmap_lock held for read or write,
|
||||
* under page table lock for the pte/pmd being added or removed.
|
||||
*
|
||||
* It is legal to call this function for any page, mlocked or not.
|
||||
* If called for a page that is still mapped by mlocked vmas, all we do
|
||||
* is revert to lazy LRU behaviour -- semantics are not broken.
|
||||
* mlock is usually called at the end of page_add_*_rmap(),
|
||||
* munlock at the end of page_remove_rmap(); but new anon
|
||||
* pages are managed by lru_cache_add_inactive_or_unevictable()
|
||||
* calling mlock_new_page().
|
||||
*
|
||||
* @compound is used to include pmd mappings of THPs, but filter out
|
||||
* pte mappings of THPs, which cannot be consistently counted: a pte
|
||||
* mapping of the THP head cannot be distinguished by the page alone.
|
||||
*/
|
||||
extern void clear_page_mlock(struct page *page);
|
||||
void mlock_folio(struct folio *folio);
|
||||
static inline void mlock_vma_folio(struct folio *folio,
|
||||
struct vm_area_struct *vma, bool compound)
|
||||
{
|
||||
/*
|
||||
* The VM_SPECIAL check here serves two purposes.
|
||||
* 1) VM_IO check prevents migration from double-counting during mlock.
|
||||
* 2) Although mmap_region() and mlock_fixup() take care that VM_LOCKED
|
||||
* is never left set on a VM_SPECIAL vma, there is an interval while
|
||||
* file->f_op->mmap() is using vm_insert_page(s), when VM_LOCKED may
|
||||
* still be set while VM_SPECIAL bits are added: so ignore it then.
|
||||
*/
|
||||
if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED) &&
|
||||
(compound || !folio_test_large(folio)))
|
||||
mlock_folio(folio);
|
||||
}
|
||||
|
||||
static inline void mlock_vma_page(struct page *page,
|
||||
struct vm_area_struct *vma, bool compound)
|
||||
{
|
||||
mlock_vma_folio(page_folio(page), vma, compound);
|
||||
}
|
||||
|
||||
void munlock_page(struct page *page);
|
||||
static inline void munlock_vma_page(struct page *page,
|
||||
struct vm_area_struct *vma, bool compound)
|
||||
{
|
||||
if (unlikely(vma->vm_flags & VM_LOCKED) &&
|
||||
(compound || !PageTransCompound(page)))
|
||||
munlock_page(page);
|
||||
}
|
||||
void mlock_new_page(struct page *page);
|
||||
bool need_mlock_page_drain(int cpu);
|
||||
void mlock_page_drain(int cpu);
|
||||
|
||||
extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
|
||||
|
||||
@ -463,18 +490,20 @@ vma_address(struct page *page, struct vm_area_struct *vma)
|
||||
}
|
||||
|
||||
/*
|
||||
* Then at what user virtual address will none of the page be found in vma?
|
||||
* Then at what user virtual address will none of the range be found in vma?
|
||||
* Assumes that vma_address() already returned a good starting address.
|
||||
* If page is a compound head, the entire compound page is considered.
|
||||
*/
|
||||
static inline unsigned long
|
||||
vma_address_end(struct page *page, struct vm_area_struct *vma)
|
||||
static inline unsigned long vma_address_end(struct page_vma_mapped_walk *pvmw)
|
||||
{
|
||||
struct vm_area_struct *vma = pvmw->vma;
|
||||
pgoff_t pgoff;
|
||||
unsigned long address;
|
||||
|
||||
VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */
|
||||
pgoff = page_to_pgoff(page) + compound_nr(page);
|
||||
/* Common case, plus ->pgoff is invalid for KSM */
|
||||
if (pvmw->nr_pages == 1)
|
||||
return pvmw->address + PAGE_SIZE;
|
||||
|
||||
pgoff = pvmw->pgoff + pvmw->nr_pages;
|
||||
address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
|
||||
/* Check for address beyond vma (or wrapped through 0?) */
|
||||
if (address < vma->vm_start || address > vma->vm_end)
|
||||
@ -504,8 +533,13 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
|
||||
}
|
||||
#else /* !CONFIG_MMU */
|
||||
static inline void unmap_mapping_folio(struct folio *folio) { }
|
||||
static inline void clear_page_mlock(struct page *page) { }
|
||||
static inline void mlock_vma_page(struct page *page) { }
|
||||
static inline void mlock_vma_page(struct page *page,
|
||||
struct vm_area_struct *vma, bool compound) { }
|
||||
static inline void munlock_vma_page(struct page *page,
|
||||
struct vm_area_struct *vma, bool compound) { }
|
||||
static inline void mlock_new_page(struct page *page) { }
|
||||
static inline bool need_mlock_page_drain(int cpu) { return false; }
|
||||
static inline void mlock_page_drain(int cpu) { }
|
||||
static inline void vunmap_range_noflush(unsigned long start, unsigned long end)
|
||||
{
|
||||
}
|
||||
@ -713,6 +747,13 @@ void vunmap_range_noflush(unsigned long start, unsigned long end);
|
||||
int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
|
||||
unsigned long addr, int page_nid, int *flags);
|
||||
|
||||
void free_zone_device_page(struct page *page);
|
||||
|
||||
/*
|
||||
* mm/gup.c
|
||||
*/
|
||||
struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags);
|
||||
|
||||
DECLARE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
|
||||
|
||||
#endif /* __MM_INTERNAL_H */
|
||||
|
@ -774,7 +774,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
|
||||
*/
|
||||
spin_lock(ptl);
|
||||
ptep_clear(vma->vm_mm, address, _pte);
|
||||
page_remove_rmap(src_page, false);
|
||||
page_remove_rmap(src_page, vma, false);
|
||||
spin_unlock(ptl);
|
||||
free_page_and_swap_cache(src_page);
|
||||
}
|
||||
@ -1513,7 +1513,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
|
||||
if (pte_none(*pte))
|
||||
continue;
|
||||
page = vm_normal_page(vma, addr, *pte);
|
||||
page_remove_rmap(page, false);
|
||||
page_remove_rmap(page, vma, false);
|
||||
}
|
||||
|
||||
pte_unmap_unlock(start_pte, ptl);
|
||||
@ -1834,13 +1834,13 @@ static void collapse_file(struct mm_struct *mm,
|
||||
}
|
||||
|
||||
if (page_mapped(page))
|
||||
unmap_mapping_pages(mapping, index, 1, false);
|
||||
try_to_unmap(page_folio(page),
|
||||
TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH);
|
||||
|
||||
xas_lock_irq(&xas);
|
||||
xas_set(&xas, index);
|
||||
|
||||
VM_BUG_ON_PAGE(page != xas_load(&xas), page);
|
||||
VM_BUG_ON_PAGE(page_mapped(page), page);
|
||||
|
||||
/*
|
||||
* The page is expected to have page_count() == 3:
|
||||
@ -1904,6 +1904,13 @@ static void collapse_file(struct mm_struct *mm,
|
||||
xas_unlock_irq(&xas);
|
||||
xa_unlocked:
|
||||
|
||||
/*
|
||||
* If collapse is successful, flush must be done now before copying.
|
||||
* If collapse is unsuccessful, does flush actually need to be done?
|
||||
* Do it anyway, to clear the state.
|
||||
*/
|
||||
try_to_unmap_flush();
|
||||
|
||||
if (result == SCAN_SUCCEED) {
|
||||
struct page *page, *tmp;
|
||||
|
||||
|
32
mm/ksm.c
32
mm/ksm.c
@ -1034,10 +1034,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
|
||||
pte_t *orig_pte)
|
||||
{
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
struct page_vma_mapped_walk pvmw = {
|
||||
.page = page,
|
||||
.vma = vma,
|
||||
};
|
||||
DEFINE_PAGE_VMA_WALK(pvmw, page, vma, 0, 0);
|
||||
int swapped;
|
||||
int err = -EFAULT;
|
||||
struct mmu_notifier_range range;
|
||||
@ -1177,7 +1174,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
|
||||
ptep_clear_flush(vma, addr, ptep);
|
||||
set_pte_at_notify(mm, addr, ptep, newpte);
|
||||
|
||||
page_remove_rmap(page, false);
|
||||
page_remove_rmap(page, vma, false);
|
||||
if (!page_mapped(page))
|
||||
try_to_free_swap(page);
|
||||
put_page(page);
|
||||
@ -1252,16 +1249,6 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
|
||||
err = replace_page(vma, page, kpage, orig_pte);
|
||||
}
|
||||
|
||||
if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
|
||||
munlock_vma_page(page);
|
||||
if (!PageMlocked(kpage)) {
|
||||
unlock_page(page);
|
||||
lock_page(kpage);
|
||||
mlock_vma_page(kpage);
|
||||
page = kpage; /* for final unlock */
|
||||
}
|
||||
}
|
||||
|
||||
out_unlock:
|
||||
unlock_page(page);
|
||||
out:
|
||||
@ -2567,7 +2554,8 @@ void __ksm_exit(struct mm_struct *mm)
|
||||
struct page *ksm_might_need_to_copy(struct page *page,
|
||||
struct vm_area_struct *vma, unsigned long address)
|
||||
{
|
||||
struct anon_vma *anon_vma = page_anon_vma(page);
|
||||
struct folio *folio = page_folio(page);
|
||||
struct anon_vma *anon_vma = folio_anon_vma(folio);
|
||||
struct page *new_page;
|
||||
|
||||
if (PageKsm(page)) {
|
||||
@ -2603,21 +2591,21 @@ struct page *ksm_might_need_to_copy(struct page *page,
|
||||
return new_page;
|
||||
}
|
||||
|
||||
void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
|
||||
void rmap_walk_ksm(struct folio *folio, const struct rmap_walk_control *rwc)
|
||||
{
|
||||
struct stable_node *stable_node;
|
||||
struct rmap_item *rmap_item;
|
||||
int search_new_forks = 0;
|
||||
|
||||
VM_BUG_ON_PAGE(!PageKsm(page), page);
|
||||
VM_BUG_ON_FOLIO(!folio_test_ksm(folio), folio);
|
||||
|
||||
/*
|
||||
* Rely on the page lock to protect against concurrent modifications
|
||||
* to that page's node of the stable tree.
|
||||
*/
|
||||
VM_BUG_ON_PAGE(!PageLocked(page), page);
|
||||
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
|
||||
|
||||
stable_node = page_stable_node(page);
|
||||
stable_node = folio_stable_node(folio);
|
||||
if (!stable_node)
|
||||
return;
|
||||
again:
|
||||
@ -2652,11 +2640,11 @@ void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
|
||||
if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
|
||||
continue;
|
||||
|
||||
if (!rwc->rmap_one(page, vma, addr, rwc->arg)) {
|
||||
if (!rwc->rmap_one(folio, vma, addr, rwc->arg)) {
|
||||
anon_vma_unlock_read(anon_vma);
|
||||
return;
|
||||
}
|
||||
if (rwc->done && rwc->done(page)) {
|
||||
if (rwc->done && rwc->done(folio)) {
|
||||
anon_vma_unlock_read(anon_vma);
|
||||
return;
|
||||
}
|
||||
|
@ -502,6 +502,11 @@ static void madvise_cold_page_range(struct mmu_gather *tlb,
|
||||
tlb_end_vma(tlb, vma);
|
||||
}
|
||||
|
||||
static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
|
||||
{
|
||||
return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP));
|
||||
}
|
||||
|
||||
static long madvise_cold(struct vm_area_struct *vma,
|
||||
struct vm_area_struct **prev,
|
||||
unsigned long start_addr, unsigned long end_addr)
|
||||
|
@ -53,6 +53,7 @@
|
||||
#include <linux/fs.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/vmpressure.h>
|
||||
#include <linux/memremap.h>
|
||||
#include <linux/mm_inline.h>
|
||||
#include <linux/swap_cgroup.h>
|
||||
#include <linux/cpu.h>
|
||||
@ -1271,8 +1272,7 @@ struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
|
||||
* @nr_pages: positive when adding or negative when removing
|
||||
*
|
||||
* This function must be called under lru_lock, just before a page is added
|
||||
* to or just after a page is removed from an lru list (that ordering being
|
||||
* so as to allow it to check that lru_size 0 is consistent with list_empty).
|
||||
* to or just after a page is removed from an lru list.
|
||||
*/
|
||||
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
|
||||
int zid, int nr_pages)
|
||||
@ -5436,17 +5436,12 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
|
||||
return NULL;
|
||||
|
||||
/*
|
||||
* Handle MEMORY_DEVICE_PRIVATE which are ZONE_DEVICE page belonging to
|
||||
* a device and because they are not accessible by CPU they are store
|
||||
* as special swap entry in the CPU page table.
|
||||
* Handle device private pages that are not accessible by the CPU, but
|
||||
* stored as special swap entries in the page table.
|
||||
*/
|
||||
if (is_device_private_entry(ent)) {
|
||||
page = pfn_swap_entry_to_page(ent);
|
||||
/*
|
||||
* MEMORY_DEVICE_PRIVATE means ZONE_DEVICE page and which have
|
||||
* a refcount of 1 when free (unlike normal page)
|
||||
*/
|
||||
if (!page_ref_add_unless(page, 1, 1))
|
||||
if (!get_page_unless_zero(page))
|
||||
return NULL;
|
||||
return page;
|
||||
}
|
||||
@ -7053,19 +7048,19 @@ static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
|
||||
|
||||
/**
|
||||
* mem_cgroup_swapout - transfer a memsw charge to swap
|
||||
* @page: page whose memsw charge to transfer
|
||||
* @folio: folio whose memsw charge to transfer
|
||||
* @entry: swap entry to move the charge to
|
||||
*
|
||||
* Transfer the memsw charge of @page to @entry.
|
||||
* Transfer the memsw charge of @folio to @entry.
|
||||
*/
|
||||
void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
|
||||
void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
|
||||
{
|
||||
struct mem_cgroup *memcg, *swap_memcg;
|
||||
unsigned int nr_entries;
|
||||
unsigned short oldid;
|
||||
|
||||
VM_BUG_ON_PAGE(PageLRU(page), page);
|
||||
VM_BUG_ON_PAGE(page_count(page), page);
|
||||
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
|
||||
VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
|
||||
|
||||
if (mem_cgroup_disabled())
|
||||
return;
|
||||
@ -7073,9 +7068,9 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
|
||||
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
|
||||
return;
|
||||
|
||||
memcg = page_memcg(page);
|
||||
memcg = folio_memcg(folio);
|
||||
|
||||
VM_WARN_ON_ONCE_PAGE(!memcg, page);
|
||||
VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
|
||||
if (!memcg)
|
||||
return;
|
||||
|
||||
@ -7085,16 +7080,16 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
|
||||
* ancestor for the swap instead and transfer the memory+swap charge.
|
||||
*/
|
||||
swap_memcg = mem_cgroup_id_get_online(memcg);
|
||||
nr_entries = thp_nr_pages(page);
|
||||
nr_entries = folio_nr_pages(folio);
|
||||
/* Get references for the tail pages, too */
|
||||
if (nr_entries > 1)
|
||||
mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
|
||||
oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
|
||||
nr_entries);
|
||||
VM_BUG_ON_PAGE(oldid, page);
|
||||
VM_BUG_ON_FOLIO(oldid, folio);
|
||||
mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
|
||||
|
||||
page->memcg_data = 0;
|
||||
folio->memcg_data = 0;
|
||||
|
||||
if (!mem_cgroup_is_root(memcg))
|
||||
page_counter_uncharge(&memcg->memory, nr_entries);
|
||||
@ -7114,7 +7109,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
|
||||
memcg_stats_lock();
|
||||
mem_cgroup_charge_statistics(memcg, -nr_entries);
|
||||
memcg_stats_unlock();
|
||||
memcg_check_events(memcg, page_to_nid(page));
|
||||
memcg_check_events(memcg, folio_nid(folio));
|
||||
|
||||
css_put(&memcg->css);
|
||||
}
|
||||
|
@ -478,12 +478,13 @@ static struct task_struct *task_early_kill(struct task_struct *tsk,
|
||||
static void collect_procs_anon(struct page *page, struct list_head *to_kill,
|
||||
int force_early)
|
||||
{
|
||||
struct folio *folio = page_folio(page);
|
||||
struct vm_area_struct *vma;
|
||||
struct task_struct *tsk;
|
||||
struct anon_vma *av;
|
||||
pgoff_t pgoff;
|
||||
|
||||
av = page_lock_anon_vma_read(page);
|
||||
av = folio_lock_anon_vma_read(folio);
|
||||
if (av == NULL) /* Not actually mapped anymore */
|
||||
return;
|
||||
|
||||
@ -1347,6 +1348,7 @@ static int get_hwpoison_page(struct page *p, unsigned long flags)
|
||||
static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
|
||||
int flags, struct page *hpage)
|
||||
{
|
||||
struct folio *folio = page_folio(hpage);
|
||||
enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC;
|
||||
struct address_space *mapping;
|
||||
LIST_HEAD(tokill);
|
||||
@ -1421,12 +1423,12 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
|
||||
*/
|
||||
mapping = hugetlb_page_mapping_lock_write(hpage);
|
||||
if (mapping) {
|
||||
try_to_unmap(hpage, ttu|TTU_RMAP_LOCKED);
|
||||
try_to_unmap(folio, ttu|TTU_RMAP_LOCKED);
|
||||
i_mmap_unlock_write(mapping);
|
||||
} else
|
||||
pr_info("Memory failure: %#lx: could not lock mapping for mapped huge page\n", pfn);
|
||||
} else {
|
||||
try_to_unmap(hpage, ttu);
|
||||
try_to_unmap(folio, ttu);
|
||||
}
|
||||
|
||||
unmap_success = !page_mapped(hpage);
|
||||
@ -2169,7 +2171,7 @@ static bool isolate_page(struct page *page, struct list_head *pagelist)
|
||||
*/
|
||||
static int __soft_offline_page(struct page *page)
|
||||
{
|
||||
int ret = 0;
|
||||
long ret = 0;
|
||||
unsigned long pfn = page_to_pfn(page);
|
||||
struct page *hpage = compound_head(page);
|
||||
char const *msg_page[] = {"page", "hugepage"};
|
||||
@ -2216,7 +2218,7 @@ static int __soft_offline_page(struct page *page)
|
||||
if (!list_empty(&pagelist))
|
||||
putback_movable_pages(&pagelist);
|
||||
|
||||
pr_info("soft offline: %#lx: %s migration failed %d, type %pGp\n",
|
||||
pr_info("soft offline: %#lx: %s migration failed %ld, type %pGp\n",
|
||||
pfn, msg_page[huge], ret, &page->flags);
|
||||
if (ret > 0)
|
||||
ret = -EBUSY;
|
||||
|
43
mm/memory.c
43
mm/memory.c
@ -735,9 +735,6 @@ static void restore_exclusive_pte(struct vm_area_struct *vma,
|
||||
|
||||
set_pte_at(vma->vm_mm, address, ptep, pte);
|
||||
|
||||
if (vma->vm_flags & VM_LOCKED)
|
||||
mlock_vma_page(page);
|
||||
|
||||
/*
|
||||
* No need to invalidate - it was non-present before. However
|
||||
* secondary CPUs may have mappings that need invalidating.
|
||||
@ -1389,7 +1386,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
|
||||
mark_page_accessed(page);
|
||||
}
|
||||
rss[mm_counter(page)]--;
|
||||
page_remove_rmap(page, false);
|
||||
page_remove_rmap(page, vma, false);
|
||||
if (unlikely(page_mapcount(page) < 0))
|
||||
print_bad_pte(vma, addr, ptent, page);
|
||||
if (unlikely(__tlb_remove_page(tlb, page))) {
|
||||
@ -1408,7 +1405,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
|
||||
continue;
|
||||
rss[mm_counter(page)]--;
|
||||
if (is_device_private_entry(entry))
|
||||
page_remove_rmap(page, false);
|
||||
page_remove_rmap(page, vma, false);
|
||||
put_page(page);
|
||||
} else if (!non_swap_entry(entry)) {
|
||||
/* Genuine swap entry, hence a private anon page */
|
||||
@ -1763,16 +1760,16 @@ static int validate_page_before_insert(struct page *page)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte,
|
||||
static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
|
||||
unsigned long addr, struct page *page, pgprot_t prot)
|
||||
{
|
||||
if (!pte_none(*pte))
|
||||
return -EBUSY;
|
||||
/* Ok, finally just insert the thing.. */
|
||||
get_page(page);
|
||||
inc_mm_counter_fast(mm, mm_counter_file(page));
|
||||
page_add_file_rmap(page, false);
|
||||
set_pte_at(mm, addr, pte, mk_pte(page, prot));
|
||||
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
|
||||
page_add_file_rmap(page, vma, false);
|
||||
set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot));
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -1786,7 +1783,6 @@ static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte,
|
||||
static int insert_page(struct vm_area_struct *vma, unsigned long addr,
|
||||
struct page *page, pgprot_t prot)
|
||||
{
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
int retval;
|
||||
pte_t *pte;
|
||||
spinlock_t *ptl;
|
||||
@ -1795,17 +1791,17 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
|
||||
if (retval)
|
||||
goto out;
|
||||
retval = -ENOMEM;
|
||||
pte = get_locked_pte(mm, addr, &ptl);
|
||||
pte = get_locked_pte(vma->vm_mm, addr, &ptl);
|
||||
if (!pte)
|
||||
goto out;
|
||||
retval = insert_page_into_pte_locked(mm, pte, addr, page, prot);
|
||||
retval = insert_page_into_pte_locked(vma, pte, addr, page, prot);
|
||||
pte_unmap_unlock(pte, ptl);
|
||||
out:
|
||||
return retval;
|
||||
}
|
||||
|
||||
#ifdef pte_index
|
||||
static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte,
|
||||
static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte,
|
||||
unsigned long addr, struct page *page, pgprot_t prot)
|
||||
{
|
||||
int err;
|
||||
@ -1815,7 +1811,7 @@ static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte,
|
||||
err = validate_page_before_insert(page);
|
||||
if (err)
|
||||
return err;
|
||||
return insert_page_into_pte_locked(mm, pte, addr, page, prot);
|
||||
return insert_page_into_pte_locked(vma, pte, addr, page, prot);
|
||||
}
|
||||
|
||||
/* insert_pages() amortizes the cost of spinlock operations
|
||||
@ -1852,7 +1848,7 @@ static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
|
||||
|
||||
start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock);
|
||||
for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) {
|
||||
int err = insert_page_in_batch_locked(mm, pte,
|
||||
int err = insert_page_in_batch_locked(vma, pte,
|
||||
addr, pages[curr_page_idx], prot);
|
||||
if (unlikely(err)) {
|
||||
pte_unmap_unlock(start_pte, pte_lock);
|
||||
@ -3108,7 +3104,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
|
||||
* mapcount is visible. So transitively, TLBs to
|
||||
* old page will be flushed before it can be reused.
|
||||
*/
|
||||
page_remove_rmap(old_page, false);
|
||||
page_remove_rmap(old_page, vma, false);
|
||||
}
|
||||
|
||||
/* Free the old page.. */
|
||||
@ -3128,16 +3124,6 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
|
||||
*/
|
||||
mmu_notifier_invalidate_range_only_end(&range);
|
||||
if (old_page) {
|
||||
/*
|
||||
* Don't let another task, with possibly unlocked vma,
|
||||
* keep the mlocked page.
|
||||
*/
|
||||
if (page_copied && (vma->vm_flags & VM_LOCKED)) {
|
||||
lock_page(old_page); /* LRU manipulation */
|
||||
if (PageMlocked(old_page))
|
||||
munlock_vma_page(old_page);
|
||||
unlock_page(old_page);
|
||||
}
|
||||
if (page_copied)
|
||||
free_swap_cache(old_page);
|
||||
put_page(old_page);
|
||||
@ -3958,7 +3944,8 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
|
||||
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
|
||||
|
||||
add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
|
||||
page_add_file_rmap(page, true);
|
||||
page_add_file_rmap(page, vma, true);
|
||||
|
||||
/*
|
||||
* deposit and withdraw with pmd lock held
|
||||
*/
|
||||
@ -4007,7 +3994,7 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
|
||||
lru_cache_add_inactive_or_unevictable(page, vma);
|
||||
} else {
|
||||
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
|
||||
page_add_file_rmap(page, false);
|
||||
page_add_file_rmap(page, vma, false);
|
||||
}
|
||||
set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
|
||||
}
|
||||
|
@ -1617,10 +1617,13 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
|
||||
DEFAULT_RATELIMIT_BURST);
|
||||
|
||||
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
|
||||
struct folio *folio;
|
||||
|
||||
if (!pfn_valid(pfn))
|
||||
continue;
|
||||
page = pfn_to_page(pfn);
|
||||
head = compound_head(page);
|
||||
folio = page_folio(page);
|
||||
head = &folio->page;
|
||||
|
||||
if (PageHuge(page)) {
|
||||
pfn = page_to_pfn(head) + compound_nr(head) - 1;
|
||||
@ -1637,10 +1640,10 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
|
||||
* the unmap as the catch all safety net).
|
||||
*/
|
||||
if (PageHWPoison(page)) {
|
||||
if (WARN_ON(PageLRU(page)))
|
||||
isolate_lru_page(page);
|
||||
if (page_mapped(page))
|
||||
try_to_unmap(page, TTU_IGNORE_MLOCK);
|
||||
if (WARN_ON(folio_test_lru(folio)))
|
||||
folio_isolate_lru(folio);
|
||||
if (folio_mapped(folio))
|
||||
try_to_unmap(folio, TTU_IGNORE_MLOCK);
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -4,7 +4,7 @@
|
||||
#include <linux/io.h>
|
||||
#include <linux/kasan.h>
|
||||
#include <linux/memory_hotplug.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/memremap.h>
|
||||
#include <linux/pfn_t.h>
|
||||
#include <linux/swap.h>
|
||||
#include <linux/mmzone.h>
|
||||
@ -12,6 +12,7 @@
|
||||
#include <linux/types.h>
|
||||
#include <linux/wait_bit.h>
|
||||
#include <linux/xarray.h>
|
||||
#include "internal.h"
|
||||
|
||||
static DEFINE_XARRAY(pgmap_array);
|
||||
|
||||
@ -37,21 +38,19 @@ unsigned long memremap_compat_align(void)
|
||||
EXPORT_SYMBOL_GPL(memremap_compat_align);
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_DEV_PAGEMAP_OPS
|
||||
#ifdef CONFIG_FS_DAX
|
||||
DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
|
||||
EXPORT_SYMBOL(devmap_managed_key);
|
||||
|
||||
static void devmap_managed_enable_put(struct dev_pagemap *pgmap)
|
||||
{
|
||||
if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
|
||||
pgmap->type == MEMORY_DEVICE_FS_DAX)
|
||||
if (pgmap->type == MEMORY_DEVICE_FS_DAX)
|
||||
static_branch_dec(&devmap_managed_key);
|
||||
}
|
||||
|
||||
static void devmap_managed_enable_get(struct dev_pagemap *pgmap)
|
||||
{
|
||||
if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
|
||||
pgmap->type == MEMORY_DEVICE_FS_DAX)
|
||||
if (pgmap->type == MEMORY_DEVICE_FS_DAX)
|
||||
static_branch_inc(&devmap_managed_key);
|
||||
}
|
||||
#else
|
||||
@ -61,7 +60,7 @@ static void devmap_managed_enable_get(struct dev_pagemap *pgmap)
|
||||
static void devmap_managed_enable_put(struct dev_pagemap *pgmap)
|
||||
{
|
||||
}
|
||||
#endif /* CONFIG_DEV_PAGEMAP_OPS */
|
||||
#endif /* CONFIG_FS_DAX */
|
||||
|
||||
static void pgmap_array_delete(struct range *range)
|
||||
{
|
||||
@ -102,23 +101,12 @@ static unsigned long pfn_end(struct dev_pagemap *pgmap, int range_id)
|
||||
return (range->start + range_len(range)) >> PAGE_SHIFT;
|
||||
}
|
||||
|
||||
static unsigned long pfn_next(struct dev_pagemap *pgmap, unsigned long pfn)
|
||||
{
|
||||
if (pfn % (1024 << pgmap->vmemmap_shift))
|
||||
cond_resched();
|
||||
return pfn + pgmap_vmemmap_nr(pgmap);
|
||||
}
|
||||
|
||||
static unsigned long pfn_len(struct dev_pagemap *pgmap, unsigned long range_id)
|
||||
{
|
||||
return (pfn_end(pgmap, range_id) -
|
||||
pfn_first(pgmap, range_id)) >> pgmap->vmemmap_shift;
|
||||
}
|
||||
|
||||
#define for_each_device_pfn(pfn, map, i) \
|
||||
for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); \
|
||||
pfn = pfn_next(map, pfn))
|
||||
|
||||
static void pageunmap_range(struct dev_pagemap *pgmap, int range_id)
|
||||
{
|
||||
struct range *range = &pgmap->ranges[range_id];
|
||||
@ -147,13 +135,11 @@ static void pageunmap_range(struct dev_pagemap *pgmap, int range_id)
|
||||
|
||||
void memunmap_pages(struct dev_pagemap *pgmap)
|
||||
{
|
||||
unsigned long pfn;
|
||||
int i;
|
||||
|
||||
percpu_ref_kill(&pgmap->ref);
|
||||
for (i = 0; i < pgmap->nr_range; i++)
|
||||
for_each_device_pfn(pfn, pgmap, i)
|
||||
put_page(pfn_to_page(pfn));
|
||||
percpu_ref_put_many(&pgmap->ref, pfn_len(pgmap, i));
|
||||
wait_for_completion(&pgmap->done);
|
||||
percpu_ref_exit(&pgmap->ref);
|
||||
|
||||
@ -329,8 +315,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
|
||||
}
|
||||
break;
|
||||
case MEMORY_DEVICE_FS_DAX:
|
||||
if (!IS_ENABLED(CONFIG_ZONE_DEVICE) ||
|
||||
IS_ENABLED(CONFIG_FS_DAX_LIMITED)) {
|
||||
if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) {
|
||||
WARN(1, "File system DAX not supported\n");
|
||||
return ERR_PTR(-EINVAL);
|
||||
}
|
||||
@ -466,21 +451,17 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(get_dev_pagemap);
|
||||
|
||||
#ifdef CONFIG_DEV_PAGEMAP_OPS
|
||||
void free_devmap_managed_page(struct page *page)
|
||||
void free_zone_device_page(struct page *page)
|
||||
{
|
||||
/* notify page idle for dax */
|
||||
if (!is_device_private_page(page)) {
|
||||
wake_up_var(&page->_refcount);
|
||||
if (WARN_ON_ONCE(!page->pgmap->ops || !page->pgmap->ops->page_free))
|
||||
return;
|
||||
}
|
||||
|
||||
__ClearPageWaiters(page);
|
||||
|
||||
mem_cgroup_uncharge(page_folio(page));
|
||||
|
||||
/*
|
||||
* When a device_private page is freed, the page->mapping field
|
||||
* When a device managed page is freed, the page->mapping field
|
||||
* may still contain a (stale) mapping value. For example, the
|
||||
* lower bits of page->mapping may still identify the page as an
|
||||
* anonymous page. Ultimately, this entire field is just stale
|
||||
@ -502,5 +483,27 @@ void free_devmap_managed_page(struct page *page)
|
||||
*/
|
||||
page->mapping = NULL;
|
||||
page->pgmap->ops->page_free(page);
|
||||
|
||||
/*
|
||||
* Reset the page count to 1 to prepare for handing out the page again.
|
||||
*/
|
||||
set_page_count(page, 1);
|
||||
}
|
||||
#endif /* CONFIG_DEV_PAGEMAP_OPS */
|
||||
|
||||
#ifdef CONFIG_FS_DAX
|
||||
bool __put_devmap_managed_page(struct page *page)
|
||||
{
|
||||
if (page->pgmap->type != MEMORY_DEVICE_FS_DAX)
|
||||
return false;
|
||||
|
||||
/*
|
||||
* fsdax page refcounts are 1-based, rather than 0-based: if
|
||||
* refcount is 1, then the page is free and the refcount is
|
||||
* stable because nobody holds a reference on the page.
|
||||
*/
|
||||
if (page_ref_dec_return(page) == 1)
|
||||
wake_up_var(&page->_refcount);
|
||||
return true;
|
||||
}
|
||||
EXPORT_SYMBOL(__put_devmap_managed_page);
|
||||
#endif /* CONFIG_FS_DAX */
|
||||
|
870
mm/migrate.c
870
mm/migrate.c
File diff suppressed because it is too large
Load Diff
773
mm/migrate_device.c
Normal file
773
mm/migrate_device.c
Normal file
@ -0,0 +1,773 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* Device Memory Migration functionality.
|
||||
*
|
||||
* Originally written by Jérôme Glisse.
|
||||
*/
|
||||
#include <linux/export.h>
|
||||
#include <linux/memremap.h>
|
||||
#include <linux/migrate.h>
|
||||
#include <linux/mm_inline.h>
|
||||
#include <linux/mmu_notifier.h>
|
||||
#include <linux/oom.h>
|
||||
#include <linux/pagewalk.h>
|
||||
#include <linux/rmap.h>
|
||||
#include <linux/swapops.h>
|
||||
#include <asm/tlbflush.h>
|
||||
#include "internal.h"
|
||||
|
||||
static int migrate_vma_collect_skip(unsigned long start,
|
||||
unsigned long end,
|
||||
struct mm_walk *walk)
|
||||
{
|
||||
struct migrate_vma *migrate = walk->private;
|
||||
unsigned long addr;
|
||||
|
||||
for (addr = start; addr < end; addr += PAGE_SIZE) {
|
||||
migrate->dst[migrate->npages] = 0;
|
||||
migrate->src[migrate->npages++] = 0;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int migrate_vma_collect_hole(unsigned long start,
|
||||
unsigned long end,
|
||||
__always_unused int depth,
|
||||
struct mm_walk *walk)
|
||||
{
|
||||
struct migrate_vma *migrate = walk->private;
|
||||
unsigned long addr;
|
||||
|
||||
/* Only allow populating anonymous memory. */
|
||||
if (!vma_is_anonymous(walk->vma))
|
||||
return migrate_vma_collect_skip(start, end, walk);
|
||||
|
||||
for (addr = start; addr < end; addr += PAGE_SIZE) {
|
||||
migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
|
||||
migrate->dst[migrate->npages] = 0;
|
||||
migrate->npages++;
|
||||
migrate->cpages++;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Page-walk callback (installed as .pmd_entry) that fills the src/dst arrays
 * for the part of the range covered by one PMD.
 *
 * An empty PMD is treated as a hole. A huge PMD is split first (the huge zero
 * page via split_huge_pmd(), anything else via split_huge_page()); if the
 * split cannot be done the whole range is recorded as skipped. Then, with the
 * PTE lock held, every PTE is examined and, when the backing page can be
 * locked, replaced by a migration entry.
 *
 * Always returns 0; per-page failures are reported by a zero src entry.
 */
static int migrate_vma_collect_pmd(pmd_t *pmdp,
				   unsigned long start,
				   unsigned long end,
				   struct mm_walk *walk)
{
	struct migrate_vma *migrate = walk->private;
	struct vm_area_struct *vma = walk->vma;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long addr = start, unmapped = 0;
	spinlock_t *ptl;
	pte_t *ptep;

again:
	if (pmd_none(*pmdp))
		return migrate_vma_collect_hole(start, end, -1, walk);

	if (pmd_trans_huge(*pmdp)) {
		struct page *page;

		ptl = pmd_lock(mm, pmdp);
		/* The PMD may have changed before the lock was taken: retry. */
		if (unlikely(!pmd_trans_huge(*pmdp))) {
			spin_unlock(ptl);
			goto again;
		}

		page = pmd_page(*pmdp);
		if (is_huge_zero_page(page)) {
			spin_unlock(ptl);
			split_huge_pmd(vma, pmdp, addr);
			if (pmd_trans_unstable(pmdp))
				return migrate_vma_collect_skip(start, end,
								walk);
		} else {
			int ret;

			/* Pin the page so it survives dropping the PMD lock. */
			get_page(page);
			spin_unlock(ptl);
			if (unlikely(!trylock_page(page)))
				return migrate_vma_collect_skip(start, end,
								walk);
			ret = split_huge_page(page);
			unlock_page(page);
			put_page(page);
			if (ret)
				return migrate_vma_collect_skip(start, end,
								walk);
			if (pmd_none(*pmdp))
				return migrate_vma_collect_hole(start, end, -1,
								walk);
		}
	}

	if (unlikely(pmd_bad(*pmdp)))
		return migrate_vma_collect_skip(start, end, walk);

	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
	arch_enter_lazy_mmu_mode();

	for (; addr < end; addr += PAGE_SIZE, ptep++) {
		unsigned long mpfn = 0, pfn;
		struct page *page;
		swp_entry_t entry;
		pte_t pte;

		pte = *ptep;

		if (pte_none(pte)) {
			if (vma_is_anonymous(vma)) {
				mpfn = MIGRATE_PFN_MIGRATE;
				migrate->cpages++;
			}
			goto next;
		}

		if (!pte_present(pte)) {
			/*
			 * Only care about unaddressable device page special
			 * page table entry. Other special swap entries are not
			 * migratable, and we ignore regular swapped page.
			 */
			entry = pte_to_swp_entry(pte);
			if (!is_device_private_entry(entry))
				goto next;

			page = pfn_swap_entry_to_page(entry);
			if (!(migrate->flags &
				MIGRATE_VMA_SELECT_DEVICE_PRIVATE) ||
			    page->pgmap->owner != migrate->pgmap_owner)
				goto next;

			mpfn = migrate_pfn(page_to_pfn(page)) |
					MIGRATE_PFN_MIGRATE;
			if (is_writable_device_private_entry(entry))
				mpfn |= MIGRATE_PFN_WRITE;
		} else {
			if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
				goto next;
			pfn = pte_pfn(pte);
			if (is_zero_pfn(pfn)) {
				mpfn = MIGRATE_PFN_MIGRATE;
				migrate->cpages++;
				goto next;
			}
			page = vm_normal_page(migrate->vma, addr, pte);
			mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
			mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
		}

		/* FIXME support THP */
		if (!page || !page->mapping || PageTransCompound(page)) {
			mpfn = 0;
			goto next;
		}

		/*
		 * By getting a reference on the page we pin it and that blocks
		 * any kind of migration. Side effect is that it "freezes" the
		 * pte.
		 *
		 * We drop this reference after isolating the page from the lru
		 * for non device page (device page are not on the lru and thus
		 * can't be dropped from it).
		 */
		get_page(page);

		/*
		 * Optimize for the common case where page is only mapped once
		 * in one process. If we can lock the page, then we can safely
		 * set up a special migration page table entry now.
		 */
		if (trylock_page(page)) {
			pte_t swp_pte;

			migrate->cpages++;

			/*
			 * Use the value returned by the clear rather than the
			 * snapshot read at the top of the loop: the dirty bit
			 * may have been set in the PTE since that read, and
			 * discarding it would lose the information that the
			 * page was written. Move it to the page now that the
			 * PTE is gone.
			 */
			pte = ptep_get_and_clear(mm, addr, ptep);
			if (pte_present(pte) && pte_dirty(pte))
				folio_mark_dirty(page_folio(page));

			/* Setup special migration page table entry */
			if (mpfn & MIGRATE_PFN_WRITE)
				entry = make_writable_migration_entry(
							page_to_pfn(page));
			else
				entry = make_readable_migration_entry(
							page_to_pfn(page));
			swp_pte = swp_entry_to_pte(entry);
			if (pte_present(pte)) {
				if (pte_soft_dirty(pte))
					swp_pte = pte_swp_mksoft_dirty(swp_pte);
				if (pte_uffd_wp(pte))
					swp_pte = pte_swp_mkuffd_wp(swp_pte);
			} else {
				if (pte_swp_soft_dirty(pte))
					swp_pte = pte_swp_mksoft_dirty(swp_pte);
				if (pte_swp_uffd_wp(pte))
					swp_pte = pte_swp_mkuffd_wp(swp_pte);
			}
			set_pte_at(mm, addr, ptep, swp_pte);

			/*
			 * This is like regular unmap: we remove the rmap and
			 * drop page refcount. Page won't be freed, as we took
			 * a reference just above.
			 */
			page_remove_rmap(page, vma, false);
			put_page(page);

			if (pte_present(pte))
				unmapped++;
		} else {
			put_page(page);
			mpfn = 0;
		}

next:
		migrate->dst[migrate->npages] = 0;
		migrate->src[migrate->npages++] = mpfn;
	}

	/*
	 * Only flush the TLB if we actually modified any entries, and do it
	 * while the PTE lock is still held: once the lock is dropped, other
	 * paths that operate on these PTEs (unmap, madvise, ...) can run
	 * while stale translations for the cleared entries are still cached.
	 */
	if (unmapped)
		flush_tlb_range(walk->vma, start, end);

	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(ptep - 1, ptl);

	return 0;
}
|
||||
|
||||
static const struct mm_walk_ops migrate_vma_walk_ops = {
|
||||
.pmd_entry = migrate_vma_collect_pmd,
|
||||
.pte_hole = migrate_vma_collect_hole,
|
||||
};
|
||||
|
||||
/*
|
||||
* migrate_vma_collect() - collect pages over a range of virtual addresses
|
||||
* @migrate: migrate struct containing all migration information
|
||||
*
|
||||
* This will walk the CPU page table. For each virtual address backed by a
|
||||
* valid page, it updates the src array and takes a reference on the page, in
|
||||
* order to pin the page until we lock it and unmap it.
|
||||
*/
|
||||
static void migrate_vma_collect(struct migrate_vma *migrate)
|
||||
{
|
||||
struct mmu_notifier_range range;
|
||||
|
||||
/*
|
||||
* Note that the pgmap_owner is passed to the mmu notifier callback so
|
||||
* that the registered device driver can skip invalidating device
|
||||
* private page mappings that won't be migrated.
|
||||
*/
|
||||
mmu_notifier_range_init_owner(&range, MMU_NOTIFY_MIGRATE, 0,
|
||||
migrate->vma, migrate->vma->vm_mm, migrate->start, migrate->end,
|
||||
migrate->pgmap_owner);
|
||||
mmu_notifier_invalidate_range_start(&range);
|
||||
|
||||
walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end,
|
||||
&migrate_vma_walk_ops, migrate);
|
||||
|
||||
mmu_notifier_invalidate_range_end(&range);
|
||||
migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
|
||||
}
|
||||
|
||||
/*
|
||||
* migrate_vma_check_page() - check if page is pinned or not
|
||||
* @page: struct page to check
|
||||
*
|
||||
* Pinned pages cannot be migrated. This is the same test as in
|
||||
* folio_migrate_mapping(), except that here we allow migration of a
|
||||
* ZONE_DEVICE page.
|
||||
*/
|
||||
static bool migrate_vma_check_page(struct page *page)
|
||||
{
|
||||
/*
|
||||
* One extra ref because caller holds an extra reference, either from
|
||||
* isolate_lru_page() for a regular page, or migrate_vma_collect() for
|
||||
* a device page.
|
||||
*/
|
||||
int extra = 1;
|
||||
|
||||
/*
|
||||
* FIXME support THP (transparent huge page), it is bit more complex to
|
||||
* check them than regular pages, because they can be mapped with a pmd
|
||||
* or with a pte (split pte mapping).
|
||||
*/
|
||||
if (PageCompound(page))
|
||||
return false;
|
||||
|
||||
/* Page from ZONE_DEVICE have one extra reference */
|
||||
if (is_zone_device_page(page))
|
||||
extra++;
|
||||
|
||||
/* For file back page */
|
||||
if (page_mapping(page))
|
||||
extra += 1 + page_has_private(page);
|
||||
|
||||
if ((page_count(page) - extra) > page_mapcount(page))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* migrate_vma_unmap() - replace page mapping with special migration pte entry
|
||||
* @migrate: migrate struct containing all migration information
|
||||
*
|
||||
* Isolate pages from the LRU and replace mappings (CPU page table pte) with a
|
||||
* special migration pte entry and check if it has been pinned. Pinned pages are
|
||||
* restored because we cannot migrate them.
|
||||
*
|
||||
* This is the last step before we call the device driver callback to allocate
|
||||
* destination memory and copy contents of original page over to new page.
|
||||
*/
|
||||
static void migrate_vma_unmap(struct migrate_vma *migrate)
|
||||
{
|
||||
const unsigned long npages = migrate->npages;
|
||||
unsigned long i, restore = 0;
|
||||
bool allow_drain = true;
|
||||
|
||||
lru_add_drain();
|
||||
|
||||
for (i = 0; i < npages; i++) {
|
||||
struct page *page = migrate_pfn_to_page(migrate->src[i]);
|
||||
struct folio *folio;
|
||||
|
||||
if (!page)
|
||||
continue;
|
||||
|
||||
/* ZONE_DEVICE pages are not on LRU */
|
||||
if (!is_zone_device_page(page)) {
|
||||
if (!PageLRU(page) && allow_drain) {
|
||||
/* Drain CPU's pagevec */
|
||||
lru_add_drain_all();
|
||||
allow_drain = false;
|
||||
}
|
||||
|
||||
if (isolate_lru_page(page)) {
|
||||
migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
|
||||
migrate->cpages--;
|
||||
restore++;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Drop the reference we took in collect */
|
||||
put_page(page);
|
||||
}
|
||||
|
||||
folio = page_folio(page);
|
||||
if (folio_mapped(folio))
|
||||
try_to_migrate(folio, 0);
|
||||
|
||||
if (page_mapped(page) || !migrate_vma_check_page(page)) {
|
||||
if (!is_zone_device_page(page)) {
|
||||
get_page(page);
|
||||
putback_lru_page(page);
|
||||
}
|
||||
|
||||
migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
|
||||
migrate->cpages--;
|
||||
restore++;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; i < npages && restore; i++) {
|
||||
struct page *page = migrate_pfn_to_page(migrate->src[i]);
|
||||
struct folio *folio;
|
||||
|
||||
if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
|
||||
continue;
|
||||
|
||||
folio = page_folio(page);
|
||||
remove_migration_ptes(folio, folio, false);
|
||||
|
||||
migrate->src[i] = 0;
|
||||
folio_unlock(folio);
|
||||
folio_put(folio);
|
||||
restore--;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* migrate_vma_setup() - prepare to migrate a range of memory
|
||||
* @args: contains the vma, start, and pfns arrays for the migration
|
||||
*
|
||||
* Returns: negative errno on failures, 0 when 0 or more pages were migrated
|
||||
* without an error.
|
||||
*
|
||||
* Prepare to migrate a range of memory virtual address range by collecting all
|
||||
* the pages backing each virtual address in the range, saving them inside the
|
||||
* src array. Then lock those pages and unmap them. Once the pages are locked
|
||||
* and unmapped, check whether each page is pinned or not. Pages that aren't
|
||||
* pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the
|
||||
* corresponding src array entry. Then restores any pages that are pinned, by
|
||||
* remapping and unlocking those pages.
|
||||
*
|
||||
* The caller should then allocate destination memory and copy source memory to
|
||||
* it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE
|
||||
* flag set). Once these are allocated and copied, the caller must update each
|
||||
* corresponding entry in the dst array with the pfn value of the destination
|
||||
* page and with MIGRATE_PFN_VALID. Destination pages must be locked via
|
||||
* lock_page().
|
||||
*
|
||||
* Note that the caller does not have to migrate all the pages that are marked
|
||||
* with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from
|
||||
* device memory to system memory. If the caller cannot migrate a device page
|
||||
* back to system memory, then it must return VM_FAULT_SIGBUS, which has severe
|
||||
* consequences for the userspace process, so it must be avoided if at all
|
||||
* possible.
|
||||
*
|
||||
* For empty entries inside CPU page table (pte_none() or pmd_none() is true) we
|
||||
* do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus
|
||||
* allowing the caller to allocate device memory for those unbacked virtual
|
||||
* addresses. For this the caller simply has to allocate device memory and
|
||||
* properly set the destination entry like for regular migration. Note that
|
||||
* this can still fail, and thus inside the device driver you must check if the
|
||||
* migration was successful for those entries after calling migrate_vma_pages(),
|
||||
* just like for regular migration.
|
||||
*
|
||||
* After that, the callers must call migrate_vma_pages() to go over each entry
|
||||
* in the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag
|
||||
* set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set,
|
||||
* then migrate_vma_pages() to migrate struct page information from the source
|
||||
* struct page to the destination struct page. If it fails to migrate the
|
||||
* struct page information, then it clears the MIGRATE_PFN_MIGRATE flag in the
|
||||
* src array.
|
||||
*
|
||||
* At this point all successfully migrated pages have an entry in the src
|
||||
* array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
|
||||
* array entry with MIGRATE_PFN_VALID flag set.
|
||||
*
|
||||
* Once migrate_vma_pages() returns the caller may inspect which pages were
|
||||
* successfully migrated, and which were not. Successfully migrated pages will
|
||||
* have the MIGRATE_PFN_MIGRATE flag set for their src array entry.
|
||||
*
|
||||
* It is safe to update device page table after migrate_vma_pages() because
|
||||
* both destination and source page are still locked, and the mmap_lock is held
|
||||
* in read mode (hence no one can unmap the range being migrated).
|
||||
*
|
||||
* Once the caller is done cleaning up things and updating its page table (if it
|
||||
* chose to do so, this is not an obligation) it finally calls
|
||||
* migrate_vma_finalize() to update the CPU page table to point to new pages
|
||||
* for successfully migrated pages or otherwise restore the CPU page table to
|
||||
* point to the original source pages.
|
||||
*/
|
||||
int migrate_vma_setup(struct migrate_vma *args)
|
||||
{
|
||||
long nr_pages = (args->end - args->start) >> PAGE_SHIFT;
|
||||
|
||||
args->start &= PAGE_MASK;
|
||||
args->end &= PAGE_MASK;
|
||||
if (!args->vma || is_vm_hugetlb_page(args->vma) ||
|
||||
(args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma))
|
||||
return -EINVAL;
|
||||
if (nr_pages <= 0)
|
||||
return -EINVAL;
|
||||
if (args->start < args->vma->vm_start ||
|
||||
args->start >= args->vma->vm_end)
|
||||
return -EINVAL;
|
||||
if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end)
|
||||
return -EINVAL;
|
||||
if (!args->src || !args->dst)
|
||||
return -EINVAL;
|
||||
|
||||
memset(args->src, 0, sizeof(*args->src) * nr_pages);
|
||||
args->cpages = 0;
|
||||
args->npages = 0;
|
||||
|
||||
migrate_vma_collect(args);
|
||||
|
||||
if (args->cpages)
|
||||
migrate_vma_unmap(args);
|
||||
|
||||
/*
|
||||
* At this point pages are locked and unmapped, and thus they have
|
||||
* stable content and can safely be copied to destination memory that
|
||||
* is allocated by the drivers.
|
||||
*/
|
||||
return 0;
|
||||
|
||||
}
|
||||
EXPORT_SYMBOL(migrate_vma_setup);
|
||||
|
||||
/*
|
||||
* This code closely matches the code in:
|
||||
* __handle_mm_fault()
|
||||
* handle_pte_fault()
|
||||
* do_anonymous_page()
|
||||
* to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
|
||||
* private page.
|
||||
*/
|
||||
static void migrate_vma_insert_page(struct migrate_vma *migrate,
|
||||
unsigned long addr,
|
||||
struct page *page,
|
||||
unsigned long *src)
|
||||
{
|
||||
struct vm_area_struct *vma = migrate->vma;
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
bool flush = false;
|
||||
spinlock_t *ptl;
|
||||
pte_t entry;
|
||||
pgd_t *pgdp;
|
||||
p4d_t *p4dp;
|
||||
pud_t *pudp;
|
||||
pmd_t *pmdp;
|
||||
pte_t *ptep;
|
||||
|
||||
/* Only allow populating anonymous memory */
|
||||
if (!vma_is_anonymous(vma))
|
||||
goto abort;
|
||||
|
||||
pgdp = pgd_offset(mm, addr);
|
||||
p4dp = p4d_alloc(mm, pgdp, addr);
|
||||
if (!p4dp)
|
||||
goto abort;
|
||||
pudp = pud_alloc(mm, p4dp, addr);
|
||||
if (!pudp)
|
||||
goto abort;
|
||||
pmdp = pmd_alloc(mm, pudp, addr);
|
||||
if (!pmdp)
|
||||
goto abort;
|
||||
|
||||
if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp))
|
||||
goto abort;
|
||||
|
||||
/*
|
||||
* Use pte_alloc() instead of pte_alloc_map(). We can't run
|
||||
* pte_offset_map() on pmds where a huge pmd might be created
|
||||
* from a different thread.
|
||||
*
|
||||
* pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
|
||||
* parallel threads are excluded by other means.
|
||||
*
|
||||
* Here we only have mmap_read_lock(mm).
|
||||
*/
|
||||
if (pte_alloc(mm, pmdp))
|
||||
goto abort;
|
||||
|
||||
/* See the comment in pte_alloc_one_map() */
|
||||
if (unlikely(pmd_trans_unstable(pmdp)))
|
||||
goto abort;
|
||||
|
||||
if (unlikely(anon_vma_prepare(vma)))
|
||||
goto abort;
|
||||
if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL))
|
||||
goto abort;
|
||||
|
||||
/*
|
||||
* The memory barrier inside __SetPageUptodate makes sure that
|
||||
* preceding stores to the page contents become visible before
|
||||
* the set_pte_at() write.
|
||||
*/
|
||||
__SetPageUptodate(page);
|
||||
|
||||
if (is_device_private_page(page)) {
|
||||
swp_entry_t swp_entry;
|
||||
|
||||
if (vma->vm_flags & VM_WRITE)
|
||||
swp_entry = make_writable_device_private_entry(
|
||||
page_to_pfn(page));
|
||||
else
|
||||
swp_entry = make_readable_device_private_entry(
|
||||
page_to_pfn(page));
|
||||
entry = swp_entry_to_pte(swp_entry);
|
||||
} else {
|
||||
/*
|
||||
* For now we only support migrating to un-addressable device
|
||||
* memory.
|
||||
*/
|
||||
if (is_zone_device_page(page)) {
|
||||
pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
|
||||
goto abort;
|
||||
}
|
||||
entry = mk_pte(page, vma->vm_page_prot);
|
||||
if (vma->vm_flags & VM_WRITE)
|
||||
entry = pte_mkwrite(pte_mkdirty(entry));
|
||||
}
|
||||
|
||||
ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
|
||||
|
||||
if (check_stable_address_space(mm))
|
||||
goto unlock_abort;
|
||||
|
||||
if (pte_present(*ptep)) {
|
||||
unsigned long pfn = pte_pfn(*ptep);
|
||||
|
||||
if (!is_zero_pfn(pfn))
|
||||
goto unlock_abort;
|
||||
flush = true;
|
||||
} else if (!pte_none(*ptep))
|
||||
goto unlock_abort;
|
||||
|
||||
/*
|
||||
* Check for userfaultfd but do not deliver the fault. Instead,
|
||||
* just back off.
|
||||
*/
|
||||
if (userfaultfd_missing(vma))
|
||||
goto unlock_abort;
|
||||
|
||||
inc_mm_counter(mm, MM_ANONPAGES);
|
||||
page_add_new_anon_rmap(page, vma, addr, false);
|
||||
if (!is_zone_device_page(page))
|
||||
lru_cache_add_inactive_or_unevictable(page, vma);
|
||||
get_page(page);
|
||||
|
||||
if (flush) {
|
||||
flush_cache_page(vma, addr, pte_pfn(*ptep));
|
||||
ptep_clear_flush_notify(vma, addr, ptep);
|
||||
set_pte_at_notify(mm, addr, ptep, entry);
|
||||
update_mmu_cache(vma, addr, ptep);
|
||||
} else {
|
||||
/* No need to invalidate - it was non-present before */
|
||||
set_pte_at(mm, addr, ptep, entry);
|
||||
update_mmu_cache(vma, addr, ptep);
|
||||
}
|
||||
|
||||
pte_unmap_unlock(ptep, ptl);
|
||||
*src = MIGRATE_PFN_MIGRATE;
|
||||
return;
|
||||
|
||||
unlock_abort:
|
||||
pte_unmap_unlock(ptep, ptl);
|
||||
abort:
|
||||
*src &= ~MIGRATE_PFN_MIGRATE;
|
||||
}
|
||||
|
||||
/**
|
||||
* migrate_vma_pages() - migrate meta-data from src page to dst page
|
||||
* @migrate: migrate struct containing all migration information
|
||||
*
|
||||
* This migrates struct page meta-data from source struct page to destination
|
||||
* struct page. This effectively finishes the migration from source page to the
|
||||
* destination page.
|
||||
*/
|
||||
void migrate_vma_pages(struct migrate_vma *migrate)
|
||||
{
|
||||
const unsigned long npages = migrate->npages;
|
||||
const unsigned long start = migrate->start;
|
||||
struct mmu_notifier_range range;
|
||||
unsigned long addr, i;
|
||||
bool notified = false;
|
||||
|
||||
for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
|
||||
struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
|
||||
struct page *page = migrate_pfn_to_page(migrate->src[i]);
|
||||
struct address_space *mapping;
|
||||
int r;
|
||||
|
||||
if (!newpage) {
|
||||
migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!page) {
|
||||
if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
|
||||
continue;
|
||||
if (!notified) {
|
||||
notified = true;
|
||||
|
||||
mmu_notifier_range_init_owner(&range,
|
||||
MMU_NOTIFY_MIGRATE, 0, migrate->vma,
|
||||
migrate->vma->vm_mm, addr, migrate->end,
|
||||
migrate->pgmap_owner);
|
||||
mmu_notifier_invalidate_range_start(&range);
|
||||
}
|
||||
migrate_vma_insert_page(migrate, addr, newpage,
|
||||
&migrate->src[i]);
|
||||
continue;
|
||||
}
|
||||
|
||||
mapping = page_mapping(page);
|
||||
|
||||
if (is_device_private_page(newpage)) {
|
||||
/*
|
||||
* For now only support private anonymous when migrating
|
||||
* to un-addressable device memory.
|
||||
*/
|
||||
if (mapping) {
|
||||
migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
|
||||
continue;
|
||||
}
|
||||
} else if (is_zone_device_page(newpage)) {
|
||||
/*
|
||||
* Other types of ZONE_DEVICE page are not supported.
|
||||
*/
|
||||
migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
|
||||
continue;
|
||||
}
|
||||
|
||||
r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY);
|
||||
if (r != MIGRATEPAGE_SUCCESS)
|
||||
migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
|
||||
}
|
||||
|
||||
/*
|
||||
* No need to double call mmu_notifier->invalidate_range() callback as
|
||||
* the above ptep_clear_flush_notify() inside migrate_vma_insert_page()
|
||||
* did already call it.
|
||||
*/
|
||||
if (notified)
|
||||
mmu_notifier_invalidate_range_only_end(&range);
|
||||
}
|
||||
EXPORT_SYMBOL(migrate_vma_pages);
|
||||
|
||||
/**
|
||||
* migrate_vma_finalize() - restore CPU page table entry
|
||||
* @migrate: migrate struct containing all migration information
|
||||
*
|
||||
* This replaces the special migration pte entry with either a mapping to the
|
||||
* new page if migration was successful for that page, or to the original page
|
||||
* otherwise.
|
||||
*
|
||||
* This also unlocks the pages and puts them back on the lru, or drops the extra
|
||||
* refcount, for device pages.
|
||||
*/
|
||||
void migrate_vma_finalize(struct migrate_vma *migrate)
|
||||
{
|
||||
const unsigned long npages = migrate->npages;
|
||||
unsigned long i;
|
||||
|
||||
for (i = 0; i < npages; i++) {
|
||||
struct folio *dst, *src;
|
||||
struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
|
||||
struct page *page = migrate_pfn_to_page(migrate->src[i]);
|
||||
|
||||
if (!page) {
|
||||
if (newpage) {
|
||||
unlock_page(newpage);
|
||||
put_page(newpage);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) {
|
||||
if (newpage) {
|
||||
unlock_page(newpage);
|
||||
put_page(newpage);
|
||||
}
|
||||
newpage = page;
|
||||
}
|
||||
|
||||
src = page_folio(page);
|
||||
dst = page_folio(newpage);
|
||||
remove_migration_ptes(src, dst, false);
|
||||
folio_unlock(src);
|
||||
|
||||
if (is_zone_device_page(page))
|
||||
put_page(page);
|
||||
else
|
||||
putback_lru_page(page);
|
||||
|
||||
if (newpage != page) {
|
||||
unlock_page(newpage);
|
||||
if (is_zone_device_page(newpage))
|
||||
put_page(newpage);
|
||||
else
|
||||
putback_lru_page(newpage);
|
||||
}
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(migrate_vma_finalize);
|
670
mm/mlock.c
670
mm/mlock.c
@ -14,6 +14,7 @@
|
||||
#include <linux/swapops.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/pagevec.h>
|
||||
#include <linux/pagewalk.h>
|
||||
#include <linux/mempolicy.h>
|
||||
#include <linux/syscalls.h>
|
||||
#include <linux/sched.h>
|
||||
@ -27,6 +28,8 @@
|
||||
|
||||
#include "internal.h"
|
||||
|
||||
static DEFINE_PER_CPU(struct pagevec, mlock_pvec);
|
||||
|
||||
bool can_do_mlock(void)
|
||||
{
|
||||
if (rlimit(RLIMIT_MEMLOCK) != 0)
|
||||
@ -46,441 +49,320 @@ EXPORT_SYMBOL(can_do_mlock);
|
||||
* be placed on the LRU "unevictable" list, rather than the [in]active lists.
|
||||
* The unevictable list is an LRU sibling list to the [in]active lists.
|
||||
* PageUnevictable is set to indicate the unevictable state.
|
||||
*
|
||||
* When lazy mlocking via vmscan, it is important to ensure that the
|
||||
* vma's VM_LOCKED status is not concurrently being modified, otherwise we
|
||||
* may have mlocked a page that is being munlocked. So lazy mlock must take
|
||||
* the mmap_lock for read, and verify that the vma really is locked
|
||||
* (see mm/rmap.c).
|
||||
*/
|
||||
|
||||
/*
|
||||
* LRU accounting for clear_page_mlock()
|
||||
*/
|
||||
void clear_page_mlock(struct page *page)
|
||||
static struct lruvec *__mlock_page(struct page *page, struct lruvec *lruvec)
|
||||
{
|
||||
int nr_pages;
|
||||
/* There is nothing more we can do while it's off LRU */
|
||||
if (!TestClearPageLRU(page))
|
||||
return lruvec;
|
||||
|
||||
if (!TestClearPageMlocked(page))
|
||||
return;
|
||||
lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec);
|
||||
|
||||
nr_pages = thp_nr_pages(page);
|
||||
mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
|
||||
count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages);
|
||||
/*
|
||||
* The previous TestClearPageMlocked() corresponds to the smp_mb()
|
||||
* in __pagevec_lru_add_fn().
|
||||
*
|
||||
* See __pagevec_lru_add_fn for more explanation.
|
||||
*/
|
||||
if (!isolate_lru_page(page)) {
|
||||
putback_lru_page(page);
|
||||
} else {
|
||||
if (unlikely(page_evictable(page))) {
|
||||
/*
|
||||
* We lost the race. the page already moved to evictable list.
|
||||
* This is a little surprising, but quite possible:
|
||||
* PageMlocked must have got cleared already by another CPU.
|
||||
* Could this page be on the Unevictable LRU? I'm not sure,
|
||||
* but move it now if so.
|
||||
*/
|
||||
if (PageUnevictable(page))
|
||||
count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
|
||||
if (PageUnevictable(page)) {
|
||||
del_page_from_lru_list(page, lruvec);
|
||||
ClearPageUnevictable(page);
|
||||
add_page_to_lru_list(page, lruvec);
|
||||
__count_vm_events(UNEVICTABLE_PGRESCUED,
|
||||
thp_nr_pages(page));
|
||||
}
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Mark page as mlocked if not already.
|
||||
* If page on LRU, isolate and putback to move to unevictable list.
|
||||
*/
|
||||
void mlock_vma_page(struct page *page)
|
||||
{
|
||||
/* Serialize with page migration */
|
||||
BUG_ON(!PageLocked(page));
|
||||
|
||||
VM_BUG_ON_PAGE(PageTail(page), page);
|
||||
VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);
|
||||
|
||||
if (!TestSetPageMlocked(page)) {
|
||||
int nr_pages = thp_nr_pages(page);
|
||||
|
||||
mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
|
||||
count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
|
||||
if (!isolate_lru_page(page))
|
||||
putback_lru_page(page);
|
||||
if (PageUnevictable(page)) {
|
||||
if (PageMlocked(page))
|
||||
page->mlock_count++;
|
||||
goto out;
|
||||
}
|
||||
|
||||
del_page_from_lru_list(page, lruvec);
|
||||
ClearPageActive(page);
|
||||
SetPageUnevictable(page);
|
||||
page->mlock_count = !!PageMlocked(page);
|
||||
add_page_to_lru_list(page, lruvec);
|
||||
__count_vm_events(UNEVICTABLE_PGCULLED, thp_nr_pages(page));
|
||||
out:
|
||||
SetPageLRU(page);
|
||||
return lruvec;
|
||||
}
|
||||
|
||||
/*
|
||||
* Finish munlock after successful page isolation
|
||||
*
|
||||
* Page must be locked. This is a wrapper for page_mlock()
|
||||
* and putback_lru_page() with munlock accounting.
|
||||
*/
|
||||
static void __munlock_isolated_page(struct page *page)
|
||||
static struct lruvec *__mlock_new_page(struct page *page, struct lruvec *lruvec)
|
||||
{
|
||||
/*
|
||||
* Optimization: if the page was mapped just once, that's our mapping
|
||||
* and we don't need to check all the other vmas.
|
||||
*/
|
||||
if (page_mapcount(page) > 1)
|
||||
page_mlock(page);
|
||||
VM_BUG_ON_PAGE(PageLRU(page), page);
|
||||
|
||||
/* Did try_to_unlock() succeed or punt? */
|
||||
if (!PageMlocked(page))
|
||||
count_vm_events(UNEVICTABLE_PGMUNLOCKED, thp_nr_pages(page));
|
||||
lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec);
|
||||
|
||||
putback_lru_page(page);
|
||||
/* As above, this is a little surprising, but possible */
|
||||
if (unlikely(page_evictable(page)))
|
||||
goto out;
|
||||
|
||||
SetPageUnevictable(page);
|
||||
page->mlock_count = !!PageMlocked(page);
|
||||
__count_vm_events(UNEVICTABLE_PGCULLED, thp_nr_pages(page));
|
||||
out:
|
||||
add_page_to_lru_list(page, lruvec);
|
||||
SetPageLRU(page);
|
||||
return lruvec;
|
||||
}
|
||||
|
||||
/*
|
||||
* Accounting for page isolation fail during munlock
|
||||
*
|
||||
* Performs accounting when page isolation fails in munlock. There is nothing
|
||||
* else to do because it means some other task has already removed the page
|
||||
* from the LRU. putback_lru_page() will take care of removing the page from
|
||||
* the unevictable list, if necessary. vmscan [page_referenced()] will move
|
||||
* the page back to the unevictable list if some other vma has it mlocked.
|
||||
*/
|
||||
static void __munlock_isolation_failed(struct page *page)
|
||||
static struct lruvec *__munlock_page(struct page *page, struct lruvec *lruvec)
|
||||
{
|
||||
int nr_pages = thp_nr_pages(page);
|
||||
bool isolated = false;
|
||||
|
||||
if (PageUnevictable(page))
|
||||
__count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
|
||||
else
|
||||
__count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages);
|
||||
if (!TestClearPageLRU(page))
|
||||
goto munlock;
|
||||
|
||||
isolated = true;
|
||||
lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec);
|
||||
|
||||
if (PageUnevictable(page)) {
|
||||
/* Then mlock_count is maintained, but might undercount */
|
||||
if (page->mlock_count)
|
||||
page->mlock_count--;
|
||||
if (page->mlock_count)
|
||||
goto out;
|
||||
}
|
||||
/* else assume that was the last mlock: reclaim will fix it if not */
|
||||
|
||||
munlock:
|
||||
if (TestClearPageMlocked(page)) {
|
||||
__mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
|
||||
if (isolated || !PageUnevictable(page))
|
||||
__count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages);
|
||||
else
|
||||
__count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
|
||||
}
|
||||
|
||||
/* page_evictable() has to be checked *after* clearing Mlocked */
|
||||
if (isolated && PageUnevictable(page) && page_evictable(page)) {
|
||||
del_page_from_lru_list(page, lruvec);
|
||||
ClearPageUnevictable(page);
|
||||
add_page_to_lru_list(page, lruvec);
|
||||
__count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
|
||||
}
|
||||
out:
|
||||
if (isolated)
|
||||
SetPageLRU(page);
|
||||
return lruvec;
|
||||
}
|
||||
|
||||
/*
 * Flags held in the low bits of a struct page pointer on the mlock_pvec.
 *
 * An entry with neither flag set is handled as a munlock request by
 * mlock_pagevec().
 * NOTE(review): this assumes struct page pointers always have their two low
 * bits clear (alignment) - confirm.
 */
#define LRU_PAGE 0x1
#define NEW_PAGE 0x2

/*
 * Return @page tagged with LRU_PAGE: the flag value is added to the pointer's
 * integer representation. The result must not be dereferenced until the flag
 * has been stripped again.
 */
static inline struct page *mlock_lru(struct page *page)
{
	return (struct page *)((unsigned long)page + LRU_PAGE);
}

/*
 * Return @page tagged with NEW_PAGE: the flag value is added to the pointer's
 * integer representation. The result must not be dereferenced until the flag
 * has been stripped again.
 */
static inline struct page *mlock_new(struct page *page)
{
	return (struct page *)((unsigned long)page + NEW_PAGE);
}
|
||||
|
||||
/*
|
||||
* mlock_pagevec() is derived from pagevec_lru_move_fn():
|
||||
* perhaps that can make use of such page pointer flags in future,
|
||||
* but for now just keep it for mlock. We could use three separate
|
||||
* pagevecs instead, but one feels better (munlocking a full pagevec
|
||||
* does not need to drain mlocking pagevecs first).
|
||||
*/
|
||||
static void mlock_pagevec(struct pagevec *pvec)
|
||||
{
|
||||
struct lruvec *lruvec = NULL;
|
||||
unsigned long mlock;
|
||||
struct page *page;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < pagevec_count(pvec); i++) {
|
||||
page = pvec->pages[i];
|
||||
mlock = (unsigned long)page & (LRU_PAGE | NEW_PAGE);
|
||||
page = (struct page *)((unsigned long)page - mlock);
|
||||
pvec->pages[i] = page;
|
||||
|
||||
if (mlock & LRU_PAGE)
|
||||
lruvec = __mlock_page(page, lruvec);
|
||||
else if (mlock & NEW_PAGE)
|
||||
lruvec = __mlock_new_page(page, lruvec);
|
||||
else
|
||||
lruvec = __munlock_page(page, lruvec);
|
||||
}
|
||||
|
||||
if (lruvec)
|
||||
unlock_page_lruvec_irq(lruvec);
|
||||
release_pages(pvec->pages, pvec->nr);
|
||||
pagevec_reinit(pvec);
|
||||
}
|
||||
|
||||
void mlock_page_drain(int cpu)
|
||||
{
|
||||
struct pagevec *pvec;
|
||||
|
||||
pvec = &per_cpu(mlock_pvec, cpu);
|
||||
if (pagevec_count(pvec))
|
||||
mlock_pagevec(pvec);
|
||||
}
|
||||
|
||||
bool need_mlock_page_drain(int cpu)
|
||||
{
|
||||
return pagevec_count(&per_cpu(mlock_pvec, cpu));
|
||||
}
|
||||
|
||||
/**
|
||||
* munlock_vma_page - munlock a vma page
|
||||
* @page: page to be unlocked, either a normal page or THP page head
|
||||
*
|
||||
* returns the size of the page as a page mask (0 for normal page,
|
||||
* HPAGE_PMD_NR - 1 for THP head page)
|
||||
*
|
||||
* called from munlock()/munmap() path with page supposedly on the LRU.
|
||||
* When we munlock a page, because the vma where we found the page is being
|
||||
* munlock()ed or munmap()ed, we want to check whether other vmas hold the
|
||||
* page locked so that we can leave it on the unevictable lru list and not
|
||||
* bother vmscan with it. However, to walk the page's rmap list in
|
||||
* page_mlock() we must isolate the page from the LRU. If some other
|
||||
* task has removed the page from the LRU, we won't be able to do that.
|
||||
* So we clear the PageMlocked as we might not get another chance. If we
|
||||
* can't isolate the page, we leave it for putback_lru_page() and vmscan
|
||||
* [page_referenced()/try_to_unmap()] to deal with.
|
||||
* mlock_folio - mlock a folio already on (or temporarily off) LRU
|
||||
* @folio: folio to be mlocked.
|
||||
*/
|
||||
unsigned int munlock_vma_page(struct page *page)
|
||||
void mlock_folio(struct folio *folio)
|
||||
{
|
||||
int nr_pages;
|
||||
struct pagevec *pvec = &get_cpu_var(mlock_pvec);
|
||||
|
||||
/* For page_mlock() and to serialize with page migration */
|
||||
BUG_ON(!PageLocked(page));
|
||||
VM_BUG_ON_PAGE(PageTail(page), page);
|
||||
if (!folio_test_set_mlocked(folio)) {
|
||||
int nr_pages = folio_nr_pages(folio);
|
||||
|
||||
if (!TestClearPageMlocked(page)) {
|
||||
/* Potentially, PTE-mapped THP: do not skip the rest PTEs */
|
||||
return 0;
|
||||
zone_stat_mod_folio(folio, NR_MLOCK, nr_pages);
|
||||
__count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
|
||||
}
|
||||
|
||||
nr_pages = thp_nr_pages(page);
|
||||
mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
|
||||
|
||||
if (!isolate_lru_page(page))
|
||||
__munlock_isolated_page(page);
|
||||
else
|
||||
__munlock_isolation_failed(page);
|
||||
|
||||
return nr_pages - 1;
|
||||
folio_get(folio);
|
||||
if (!pagevec_add(pvec, mlock_lru(&folio->page)) ||
|
||||
folio_test_large(folio) || lru_cache_disabled())
|
||||
mlock_pagevec(pvec);
|
||||
put_cpu_var(mlock_pvec);
|
||||
}
|
||||
|
||||
/*
|
||||
* convert get_user_pages() return value to posix mlock() error
|
||||
/**
|
||||
* mlock_new_page - mlock a newly allocated page not yet on LRU
|
||||
* @page: page to be mlocked, either a normal page or a THP head.
|
||||
*/
|
||||
static int __mlock_posix_error_return(long retval)
|
||||
void mlock_new_page(struct page *page)
|
||||
{
|
||||
if (retval == -EFAULT)
|
||||
retval = -ENOMEM;
|
||||
else if (retval == -ENOMEM)
|
||||
retval = -EAGAIN;
|
||||
return retval;
|
||||
struct pagevec *pvec = &get_cpu_var(mlock_pvec);
|
||||
int nr_pages = thp_nr_pages(page);
|
||||
|
||||
SetPageMlocked(page);
|
||||
mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
|
||||
__count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
|
||||
|
||||
get_page(page);
|
||||
if (!pagevec_add(pvec, mlock_new(page)) ||
|
||||
PageHead(page) || lru_cache_disabled())
|
||||
mlock_pagevec(pvec);
|
||||
put_cpu_var(mlock_pvec);
|
||||
}
|
||||
|
||||
/*
|
||||
* Prepare page for fast batched LRU putback via putback_lru_evictable_pagevec()
|
||||
*
|
||||
* The fast path is available only for evictable pages with single mapping.
|
||||
* Then we can bypass the per-cpu pvec and get better performance.
|
||||
* when mapcount > 1 we need page_mlock() which can fail.
|
||||
* when !page_evictable(), we need the full redo logic of putback_lru_page to
|
||||
* avoid leaving evictable page in unevictable list.
|
||||
*
|
||||
* In case of success, @page is added to @pvec and @pgrescued is incremented
|
||||
* in case that the page was previously unevictable. @page is also unlocked.
|
||||
/**
|
||||
* munlock_page - munlock a page
|
||||
* @page: page to be munlocked, either a normal page or a THP head.
|
||||
*/
|
||||
static bool __putback_lru_fast_prepare(struct page *page, struct pagevec *pvec,
|
||||
int *pgrescued)
|
||||
void munlock_page(struct page *page)
|
||||
{
|
||||
VM_BUG_ON_PAGE(PageLRU(page), page);
|
||||
VM_BUG_ON_PAGE(!PageLocked(page), page);
|
||||
|
||||
if (page_mapcount(page) <= 1 && page_evictable(page)) {
|
||||
pagevec_add(pvec, page);
|
||||
if (TestClearPageUnevictable(page))
|
||||
(*pgrescued)++;
|
||||
unlock_page(page);
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* Putback multiple evictable pages to the LRU
|
||||
*
|
||||
* Batched putback of evictable pages that bypasses the per-cpu pvec. Some of
|
||||
* the pages might have meanwhile become unevictable but that is OK.
|
||||
*/
|
||||
static void __putback_lru_fast(struct pagevec *pvec, int pgrescued)
|
||||
{
|
||||
count_vm_events(UNEVICTABLE_PGMUNLOCKED, pagevec_count(pvec));
|
||||
/*
|
||||
*__pagevec_lru_add() calls release_pages() so we don't call
|
||||
* put_page() explicitly
|
||||
*/
|
||||
__pagevec_lru_add(pvec);
|
||||
count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
|
||||
}
|
||||
|
||||
/*
|
||||
* Munlock a batch of pages from the same zone
|
||||
*
|
||||
* The work is split to two main phases. First phase clears the Mlocked flag
|
||||
* and attempts to isolate the pages, all under a single zone lru lock.
|
||||
* The second phase finishes the munlock only for pages where isolation
|
||||
* succeeded.
|
||||
*
|
||||
* Note that the pagevec may be modified during the process.
|
||||
*/
|
||||
static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
|
||||
{
|
||||
int i;
|
||||
int nr = pagevec_count(pvec);
|
||||
int delta_munlocked = -nr;
|
||||
struct pagevec pvec_putback;
|
||||
struct lruvec *lruvec = NULL;
|
||||
int pgrescued = 0;
|
||||
|
||||
pagevec_init(&pvec_putback);
|
||||
|
||||
/* Phase 1: page isolation */
|
||||
for (i = 0; i < nr; i++) {
|
||||
struct page *page = pvec->pages[i];
|
||||
struct folio *folio = page_folio(page);
|
||||
|
||||
if (TestClearPageMlocked(page)) {
|
||||
/*
|
||||
* We already have pin from follow_page_mask()
|
||||
* so we can spare the get_page() here.
|
||||
*/
|
||||
if (TestClearPageLRU(page)) {
|
||||
lruvec = folio_lruvec_relock_irq(folio, lruvec);
|
||||
del_page_from_lru_list(page, lruvec);
|
||||
continue;
|
||||
} else
|
||||
__munlock_isolation_failed(page);
|
||||
} else {
|
||||
delta_munlocked++;
|
||||
}
|
||||
|
||||
/*
|
||||
* We won't be munlocking this page in the next phase
|
||||
* but we still need to release the follow_page_mask()
|
||||
* pin. We cannot do it under lru_lock however. If it's
|
||||
* the last pin, __page_cache_release() would deadlock.
|
||||
*/
|
||||
pagevec_add(&pvec_putback, pvec->pages[i]);
|
||||
pvec->pages[i] = NULL;
|
||||
}
|
||||
if (lruvec) {
|
||||
__mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
|
||||
unlock_page_lruvec_irq(lruvec);
|
||||
} else if (delta_munlocked) {
|
||||
mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
|
||||
}
|
||||
|
||||
/* Now we can release pins of pages that we are not munlocking */
|
||||
pagevec_release(&pvec_putback);
|
||||
|
||||
/* Phase 2: page munlock */
|
||||
for (i = 0; i < nr; i++) {
|
||||
struct page *page = pvec->pages[i];
|
||||
|
||||
if (page) {
|
||||
lock_page(page);
|
||||
if (!__putback_lru_fast_prepare(page, &pvec_putback,
|
||||
&pgrescued)) {
|
||||
/*
|
||||
* Slow path. We don't want to lose the last
|
||||
* pin before unlock_page()
|
||||
*/
|
||||
get_page(page); /* for putback_lru_page() */
|
||||
__munlock_isolated_page(page);
|
||||
unlock_page(page);
|
||||
put_page(page); /* from follow_page_mask() */
|
||||
}
|
||||
}
|
||||
}
|
||||
struct pagevec *pvec = &get_cpu_var(mlock_pvec);
|
||||
|
||||
/*
|
||||
* Phase 3: page putback for pages that qualified for the fast path
|
||||
* This will also call put_page() to return pin from follow_page_mask()
|
||||
* TestClearPageMlocked(page) must be left to __munlock_page(),
|
||||
* which will check whether the page is multiply mlocked.
|
||||
*/
|
||||
if (pagevec_count(&pvec_putback))
|
||||
__putback_lru_fast(&pvec_putback, pgrescued);
|
||||
|
||||
get_page(page);
|
||||
if (!pagevec_add(pvec, page) ||
|
||||
PageHead(page) || lru_cache_disabled())
|
||||
mlock_pagevec(pvec);
|
||||
put_cpu_var(mlock_pvec);
|
||||
}
|
||||
|
||||
/*
|
||||
* Fill up pagevec for __munlock_pagevec using pte walk
|
||||
*
|
||||
* The function expects that the struct page corresponding to @start address is
|
||||
* a non-TPH page already pinned and in the @pvec, and that it belongs to @zone.
|
||||
*
|
||||
* The rest of @pvec is filled by subsequent pages within the same pmd and same
|
||||
* zone, as long as the pte's are present and vm_normal_page() succeeds. These
|
||||
* pages also get pinned.
|
||||
*
|
||||
* Returns the address of the next page that should be scanned. This equals
|
||||
* @start + PAGE_SIZE when no page could be added by the pte walk.
|
||||
*/
|
||||
static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
|
||||
struct vm_area_struct *vma, struct zone *zone,
|
||||
unsigned long start, unsigned long end)
|
||||
static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
|
||||
unsigned long end, struct mm_walk *walk)
|
||||
|
||||
{
|
||||
pte_t *pte;
|
||||
struct vm_area_struct *vma = walk->vma;
|
||||
spinlock_t *ptl;
|
||||
pte_t *start_pte, *pte;
|
||||
struct page *page;
|
||||
|
||||
/*
|
||||
* Initialize pte walk starting at the already pinned page where we
|
||||
* are sure that there is a pte, as it was pinned under the same
|
||||
* mmap_lock write op.
|
||||
*/
|
||||
pte = get_locked_pte(vma->vm_mm, start, &ptl);
|
||||
/* Make sure we do not cross the page table boundary */
|
||||
end = pgd_addr_end(start, end);
|
||||
end = p4d_addr_end(start, end);
|
||||
end = pud_addr_end(start, end);
|
||||
end = pmd_addr_end(start, end);
|
||||
|
||||
/* The page next to the pinned page is the first we will try to get */
|
||||
start += PAGE_SIZE;
|
||||
while (start < end) {
|
||||
struct page *page = NULL;
|
||||
pte++;
|
||||
if (pte_present(*pte))
|
||||
page = vm_normal_page(vma, start, *pte);
|
||||
/*
|
||||
* Break if page could not be obtained or the page's node+zone does not
|
||||
* match
|
||||
*/
|
||||
if (!page || page_zone(page) != zone)
|
||||
break;
|
||||
|
||||
/*
|
||||
* Do not use pagevec for PTE-mapped THP,
|
||||
* munlock_vma_pages_range() will handle them.
|
||||
*/
|
||||
if (PageTransCompound(page))
|
||||
break;
|
||||
|
||||
get_page(page);
|
||||
/*
|
||||
* Increase the address that will be returned *before* the
|
||||
* eventual break due to pvec becoming full by adding the page
|
||||
*/
|
||||
start += PAGE_SIZE;
|
||||
if (pagevec_add(pvec, page) == 0)
|
||||
break;
|
||||
ptl = pmd_trans_huge_lock(pmd, vma);
|
||||
if (ptl) {
|
||||
if (!pmd_present(*pmd))
|
||||
goto out;
|
||||
if (is_huge_zero_pmd(*pmd))
|
||||
goto out;
|
||||
page = pmd_page(*pmd);
|
||||
if (vma->vm_flags & VM_LOCKED)
|
||||
mlock_folio(page_folio(page));
|
||||
else
|
||||
munlock_page(page);
|
||||
goto out;
|
||||
}
|
||||
pte_unmap_unlock(pte, ptl);
|
||||
return start;
|
||||
|
||||
start_pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
|
||||
for (pte = start_pte; addr != end; pte++, addr += PAGE_SIZE) {
|
||||
if (!pte_present(*pte))
|
||||
continue;
|
||||
page = vm_normal_page(vma, addr, *pte);
|
||||
if (!page)
|
||||
continue;
|
||||
if (PageTransCompound(page))
|
||||
continue;
|
||||
if (vma->vm_flags & VM_LOCKED)
|
||||
mlock_folio(page_folio(page));
|
||||
else
|
||||
munlock_page(page);
|
||||
}
|
||||
pte_unmap(start_pte);
|
||||
out:
|
||||
spin_unlock(ptl);
|
||||
cond_resched();
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* munlock_vma_pages_range() - munlock all pages in the vma range.'
|
||||
* @vma - vma containing range to be munlock()ed.
|
||||
* mlock_vma_pages_range() - mlock any pages already in the range,
|
||||
* or munlock all pages in the range.
|
||||
* @vma - vma containing range to be mlock()ed or munlock()ed
|
||||
* @start - start address in @vma of the range
|
||||
* @end - end of range in @vma.
|
||||
* @end - end of range in @vma
|
||||
* @newflags - the new set of flags for @vma.
|
||||
*
|
||||
* For mremap(), munmap() and exit().
|
||||
*
|
||||
* Called with @vma VM_LOCKED.
|
||||
*
|
||||
* Returns with VM_LOCKED cleared. Callers must be prepared to
|
||||
* deal with this.
|
||||
*
|
||||
* We don't save and restore VM_LOCKED here because pages are
|
||||
* still on lru. In unmap path, pages might be scanned by reclaim
|
||||
* and re-mlocked by page_mlock/try_to_unmap before we unmap and
|
||||
* free them. This will result in freeing mlocked pages.
|
||||
* Called for mlock(), mlock2() and mlockall(), to set @vma VM_LOCKED;
|
||||
* called for munlock() and munlockall(), to clear VM_LOCKED from @vma.
|
||||
*/
|
||||
void munlock_vma_pages_range(struct vm_area_struct *vma,
|
||||
unsigned long start, unsigned long end)
|
||||
static void mlock_vma_pages_range(struct vm_area_struct *vma,
|
||||
unsigned long start, unsigned long end, vm_flags_t newflags)
|
||||
{
|
||||
vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
|
||||
static const struct mm_walk_ops mlock_walk_ops = {
|
||||
.pmd_entry = mlock_pte_range,
|
||||
};
|
||||
|
||||
while (start < end) {
|
||||
struct page *page;
|
||||
unsigned int page_mask = 0;
|
||||
unsigned long page_increm;
|
||||
struct pagevec pvec;
|
||||
struct zone *zone;
|
||||
/*
|
||||
* There is a slight chance that concurrent page migration,
|
||||
* or page reclaim finding a page of this now-VM_LOCKED vma,
|
||||
* will call mlock_vma_page() and raise page's mlock_count:
|
||||
* double counting, leaving the page unevictable indefinitely.
|
||||
* Communicate this danger to mlock_vma_page() with VM_IO,
|
||||
* which is a VM_SPECIAL flag not allowed on VM_LOCKED vmas.
|
||||
* mmap_lock is held in write mode here, so this weird
|
||||
* combination should not be visible to other mmap_lock users;
|
||||
* but WRITE_ONCE so rmap walkers must see VM_IO if VM_LOCKED.
|
||||
*/
|
||||
if (newflags & VM_LOCKED)
|
||||
newflags |= VM_IO;
|
||||
WRITE_ONCE(vma->vm_flags, newflags);
|
||||
|
||||
pagevec_init(&pvec);
|
||||
/*
|
||||
* Although FOLL_DUMP is intended for get_dump_page(),
|
||||
* it just so happens that its special treatment of the
|
||||
* ZERO_PAGE (returning an error instead of doing get_page)
|
||||
* suits munlock very well (and if somehow an abnormal page
|
||||
* has sneaked into the range, we won't oops here: great).
|
||||
*/
|
||||
page = follow_page(vma, start, FOLL_GET | FOLL_DUMP);
|
||||
lru_add_drain();
|
||||
walk_page_range(vma->vm_mm, start, end, &mlock_walk_ops, NULL);
|
||||
lru_add_drain();
|
||||
|
||||
if (page && !IS_ERR(page)) {
|
||||
if (PageTransTail(page)) {
|
||||
VM_BUG_ON_PAGE(PageMlocked(page), page);
|
||||
put_page(page); /* follow_page_mask() */
|
||||
} else if (PageTransHuge(page)) {
|
||||
lock_page(page);
|
||||
/*
|
||||
* Any THP page found by follow_page_mask() may
|
||||
* have gotten split before reaching
|
||||
* munlock_vma_page(), so we need to compute
|
||||
* the page_mask here instead.
|
||||
*/
|
||||
page_mask = munlock_vma_page(page);
|
||||
unlock_page(page);
|
||||
put_page(page); /* follow_page_mask() */
|
||||
} else {
|
||||
/*
|
||||
* Non-huge pages are handled in batches via
|
||||
* pagevec. The pin from follow_page_mask()
|
||||
* prevents them from collapsing by THP.
|
||||
*/
|
||||
pagevec_add(&pvec, page);
|
||||
zone = page_zone(page);
|
||||
|
||||
/*
|
||||
* Try to fill the rest of pagevec using fast
|
||||
* pte walk. This will also update start to
|
||||
* the next page to process. Then munlock the
|
||||
* pagevec.
|
||||
*/
|
||||
start = __munlock_pagevec_fill(&pvec, vma,
|
||||
zone, start, end);
|
||||
__munlock_pagevec(&pvec, zone);
|
||||
goto next;
|
||||
}
|
||||
}
|
||||
page_increm = 1 + page_mask;
|
||||
start += page_increm * PAGE_SIZE;
|
||||
next:
|
||||
cond_resched();
|
||||
if (newflags & VM_IO) {
|
||||
newflags &= ~VM_IO;
|
||||
WRITE_ONCE(vma->vm_flags, newflags);
|
||||
}
|
||||
}
|
||||
|
||||
@ -500,10 +382,9 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
|
||||
pgoff_t pgoff;
|
||||
int nr_pages;
|
||||
int ret = 0;
|
||||
int lock = !!(newflags & VM_LOCKED);
|
||||
vm_flags_t old_flags = vma->vm_flags;
|
||||
vm_flags_t oldflags = vma->vm_flags;
|
||||
|
||||
if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
|
||||
if (newflags == oldflags || (oldflags & VM_SPECIAL) ||
|
||||
is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) ||
|
||||
vma_is_dax(vma) || vma_is_secretmem(vma))
|
||||
/* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
|
||||
@ -535,9 +416,9 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
|
||||
* Keep track of amount of locked VM.
|
||||
*/
|
||||
nr_pages = (end - start) >> PAGE_SHIFT;
|
||||
if (!lock)
|
||||
if (!(newflags & VM_LOCKED))
|
||||
nr_pages = -nr_pages;
|
||||
else if (old_flags & VM_LOCKED)
|
||||
else if (oldflags & VM_LOCKED)
|
||||
nr_pages = 0;
|
||||
mm->locked_vm += nr_pages;
|
||||
|
||||
@ -547,11 +428,12 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
|
||||
* set VM_LOCKED, populate_vma_page_range will bring it back.
|
||||
*/
|
||||
|
||||
if (lock)
|
||||
if ((newflags & VM_LOCKED) && (oldflags & VM_LOCKED)) {
|
||||
/* No work to do, and mlocking twice would be wrong */
|
||||
vma->vm_flags = newflags;
|
||||
else
|
||||
munlock_vma_pages_range(vma, start, end);
|
||||
|
||||
} else {
|
||||
mlock_vma_pages_range(vma, start, end, newflags);
|
||||
}
|
||||
out:
|
||||
*prev = vma;
|
||||
return ret;
|
||||
@ -645,6 +527,18 @@ static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm,
|
||||
return count >> PAGE_SHIFT;
|
||||
}
|
||||
|
||||
/*
|
||||
* convert get_user_pages() return value to posix mlock() error
|
||||
*/
|
||||
static int __mlock_posix_error_return(long retval)
|
||||
{
|
||||
if (retval == -EFAULT)
|
||||
retval = -ENOMEM;
|
||||
else if (retval == -ENOMEM)
|
||||
retval = -EAGAIN;
|
||||
return retval;
|
||||
}
|
||||
|
||||
static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags)
|
||||
{
|
||||
unsigned long locked;
|
||||
|
32
mm/mmap.c
32
mm/mmap.c
@ -2672,6 +2672,8 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
vma->vm_prev = NULL;
|
||||
do {
|
||||
vma_rb_erase(vma, &mm->mm_rb);
|
||||
if (vma->vm_flags & VM_LOCKED)
|
||||
mm->locked_vm -= vma_pages(vma);
|
||||
mm->map_count--;
|
||||
tail_vma = vma;
|
||||
vma = vma->vm_next;
|
||||
@ -2776,22 +2778,6 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
return __split_vma(mm, vma, addr, new_below);
|
||||
}
|
||||
|
||||
static inline void
|
||||
unlock_range(struct vm_area_struct *start, unsigned long limit)
|
||||
{
|
||||
struct mm_struct *mm = start->vm_mm;
|
||||
struct vm_area_struct *tmp = start;
|
||||
|
||||
while (tmp && tmp->vm_start < limit) {
|
||||
if (tmp->vm_flags & VM_LOCKED) {
|
||||
mm->locked_vm -= vma_pages(tmp);
|
||||
munlock_vma_pages_all(tmp);
|
||||
}
|
||||
|
||||
tmp = tmp->vm_next;
|
||||
}
|
||||
}
|
||||
|
||||
/* Munmap is split into 2 main parts -- this part which finds
|
||||
* what needs doing, and the areas themselves, which do the
|
||||
* work. This now handles partial unmappings.
|
||||
@ -2872,12 +2858,6 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
|
||||
return error;
|
||||
}
|
||||
|
||||
/*
|
||||
* unlock any mlock()ed ranges before detaching vmas
|
||||
*/
|
||||
if (mm->locked_vm)
|
||||
unlock_range(vma, end);
|
||||
|
||||
/* Detach vmas from rbtree */
|
||||
if (!detach_vmas_to_be_unmapped(mm, vma, prev, end))
|
||||
downgrade = false;
|
||||
@ -3145,20 +3125,12 @@ void exit_mmap(struct mm_struct *mm)
|
||||
* Nothing can be holding mm->mmap_lock here and the above call
|
||||
* to mmu_notifier_release(mm) ensures mmu notifier callbacks in
|
||||
* __oom_reap_task_mm() will not block.
|
||||
*
|
||||
* This needs to be done before calling unlock_range(),
|
||||
* which clears VM_LOCKED, otherwise the oom reaper cannot
|
||||
* reliably test it.
|
||||
*/
|
||||
(void)__oom_reap_task_mm(mm);
|
||||
|
||||
set_bit(MMF_OOM_SKIP, &mm->flags);
|
||||
}
|
||||
|
||||
mmap_write_lock(mm);
|
||||
if (mm->locked_vm)
|
||||
unlock_range(mm->mmap, ULONG_MAX);
|
||||
|
||||
arch_exit_mmap(mm);
|
||||
|
||||
vma = mm->mmap;
|
||||
|
@ -81,6 +81,13 @@ void lruvec_init(struct lruvec *lruvec)
|
||||
|
||||
for_each_lru(lru)
|
||||
INIT_LIST_HEAD(&lruvec->lists[lru]);
|
||||
/*
|
||||
* The "Unevictable LRU" is imaginary: though its size is maintained,
|
||||
* it is never scanned, and unevictable pages are not threaded on it
|
||||
* (so that their lru fields can be reused to hold mlock_count).
|
||||
* Poison its list head, so that any operations on it would crash.
|
||||
*/
|
||||
list_del(&lruvec->lists[LRU_UNEVICTABLE]);
|
||||
}
|
||||
|
||||
#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS)
|
||||
|
@ -523,7 +523,7 @@ bool __oom_reap_task_mm(struct mm_struct *mm)
|
||||
set_bit(MMF_UNSTABLE, &mm->flags);
|
||||
|
||||
for (vma = mm->mmap ; vma; vma = vma->vm_next) {
|
||||
if (!can_madv_lru_vma(vma))
|
||||
if (vma->vm_flags & (VM_HUGETLB|VM_PFNMAP))
|
||||
continue;
|
||||
|
||||
/*
|
||||
|
@ -734,8 +734,7 @@ static void prep_compound_head(struct page *page, unsigned int order)
|
||||
set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
|
||||
set_compound_order(page, order);
|
||||
atomic_set(compound_mapcount_ptr(page), -1);
|
||||
if (hpage_pincount_available(page))
|
||||
atomic_set(compound_pincount_ptr(page), 0);
|
||||
atomic_set(compound_pincount_ptr(page), 0);
|
||||
}
|
||||
|
||||
static void prep_compound_tail(struct page *head, int tail_idx)
|
||||
|
@ -13,6 +13,8 @@
|
||||
#include <linux/page_ext.h>
|
||||
#include <linux/page_idle.h>
|
||||
|
||||
#include "internal.h"
|
||||
|
||||
#define BITMAP_CHUNK_SIZE sizeof(u64)
|
||||
#define BITMAP_CHUNK_BITS (BITMAP_CHUNK_SIZE * BITS_PER_BYTE)
|
||||
|
||||
@ -44,15 +46,11 @@ static struct page *page_idle_get_page(unsigned long pfn)
|
||||
return page;
|
||||
}
|
||||
|
||||
static bool page_idle_clear_pte_refs_one(struct page *page,
|
||||
static bool page_idle_clear_pte_refs_one(struct folio *folio,
|
||||
struct vm_area_struct *vma,
|
||||
unsigned long addr, void *arg)
|
||||
{
|
||||
struct page_vma_mapped_walk pvmw = {
|
||||
.page = page,
|
||||
.vma = vma,
|
||||
.address = addr,
|
||||
};
|
||||
DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0);
|
||||
bool referenced = false;
|
||||
|
||||
while (page_vma_mapped_walk(&pvmw)) {
|
||||
@ -74,41 +72,41 @@ static bool page_idle_clear_pte_refs_one(struct page *page,
|
||||
}
|
||||
|
||||
if (referenced) {
|
||||
clear_page_idle(page);
|
||||
folio_clear_idle(folio);
|
||||
/*
|
||||
* We cleared the referenced bit in a mapping to this page. To
|
||||
* avoid interference with page reclaim, mark it young so that
|
||||
* page_referenced() will return > 0.
|
||||
* folio_referenced() will return > 0.
|
||||
*/
|
||||
set_page_young(page);
|
||||
folio_set_young(folio);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static void page_idle_clear_pte_refs(struct page *page)
|
||||
{
|
||||
struct folio *folio = page_folio(page);
|
||||
/*
|
||||
* Since rwc.arg is unused, rwc is effectively immutable, so we
|
||||
* can make it static const to save some cycles and stack.
|
||||
*/
|
||||
static const struct rmap_walk_control rwc = {
|
||||
.rmap_one = page_idle_clear_pte_refs_one,
|
||||
.anon_lock = page_lock_anon_vma_read,
|
||||
.anon_lock = folio_lock_anon_vma_read,
|
||||
};
|
||||
bool need_lock;
|
||||
|
||||
if (!page_mapped(page) ||
|
||||
!page_rmapping(page))
|
||||
if (!folio_mapped(folio) || !folio_raw_mapping(folio))
|
||||
return;
|
||||
|
||||
need_lock = !PageAnon(page) || PageKsm(page);
|
||||
if (need_lock && !trylock_page(page))
|
||||
need_lock = !folio_test_anon(folio) || folio_test_ksm(folio);
|
||||
if (need_lock && !folio_trylock(folio))
|
||||
return;
|
||||
|
||||
rmap_walk(page, (struct rmap_walk_control *)&rwc);
|
||||
rmap_walk(folio, &rwc);
|
||||
|
||||
if (need_lock)
|
||||
unlock_page(page);
|
||||
folio_unlock(folio);
|
||||
}
|
||||
|
||||
static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
|
||||
|
@ -53,18 +53,6 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw)
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline bool pfn_is_match(struct page *page, unsigned long pfn)
|
||||
{
|
||||
unsigned long page_pfn = page_to_pfn(page);
|
||||
|
||||
/* normal page and hugetlbfs page */
|
||||
if (!PageTransCompound(page) || PageHuge(page))
|
||||
return page_pfn == pfn;
|
||||
|
||||
/* THP can be referenced by any subpage */
|
||||
return pfn >= page_pfn && pfn - page_pfn < thp_nr_pages(page);
|
||||
}
|
||||
|
||||
/**
|
||||
* check_pte - check if @pvmw->page is mapped at the @pvmw->pte
|
||||
* @pvmw: page_vma_mapped_walk struct, includes a pair pte and page for checking
|
||||
@ -116,7 +104,17 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
|
||||
pfn = pte_pfn(*pvmw->pte);
|
||||
}
|
||||
|
||||
return pfn_is_match(pvmw->page, pfn);
|
||||
return (pfn - pvmw->pfn) < pvmw->nr_pages;
|
||||
}
|
||||
|
||||
/* Returns true if the two ranges overlap. Careful to not overflow. */
|
||||
static bool check_pmd(unsigned long pfn, struct page_vma_mapped_walk *pvmw)
|
||||
{
|
||||
if ((pfn + HPAGE_PMD_NR - 1) < pvmw->pfn)
|
||||
return false;
|
||||
if (pfn > pvmw->pfn + pvmw->nr_pages - 1)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
static void step_forward(struct page_vma_mapped_walk *pvmw, unsigned long size)
|
||||
@ -127,7 +125,7 @@ static void step_forward(struct page_vma_mapped_walk *pvmw, unsigned long size)
|
||||
}
|
||||
|
||||
/**
|
||||
* page_vma_mapped_walk - check if @pvmw->page is mapped in @pvmw->vma at
|
||||
* page_vma_mapped_walk - check if @pvmw->pfn is mapped in @pvmw->vma at
|
||||
* @pvmw->address
|
||||
* @pvmw: pointer to struct page_vma_mapped_walk. page, vma, address and flags
|
||||
* must be set. pmd, pte and ptl must be NULL.
|
||||
@ -152,8 +150,8 @@ static void step_forward(struct page_vma_mapped_walk *pvmw, unsigned long size)
|
||||
*/
|
||||
bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
|
||||
{
|
||||
struct mm_struct *mm = pvmw->vma->vm_mm;
|
||||
struct page *page = pvmw->page;
|
||||
struct vm_area_struct *vma = pvmw->vma;
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
unsigned long end;
|
||||
pgd_t *pgd;
|
||||
p4d_t *p4d;
|
||||
@ -164,32 +162,26 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
|
||||
if (pvmw->pmd && !pvmw->pte)
|
||||
return not_found(pvmw);
|
||||
|
||||
if (unlikely(PageHuge(page))) {
|
||||
if (unlikely(is_vm_hugetlb_page(vma))) {
|
||||
unsigned long size = pvmw->nr_pages * PAGE_SIZE;
|
||||
/* The only possible mapping was handled on last iteration */
|
||||
if (pvmw->pte)
|
||||
return not_found(pvmw);
|
||||
|
||||
/* when pud is not present, pte will be NULL */
|
||||
pvmw->pte = huge_pte_offset(mm, pvmw->address, page_size(page));
|
||||
pvmw->pte = huge_pte_offset(mm, pvmw->address, size);
|
||||
if (!pvmw->pte)
|
||||
return false;
|
||||
|
||||
pvmw->ptl = huge_pte_lockptr(page_hstate(page), mm, pvmw->pte);
|
||||
pvmw->ptl = huge_pte_lockptr(size_to_hstate(size), mm,
|
||||
pvmw->pte);
|
||||
spin_lock(pvmw->ptl);
|
||||
if (!check_pte(pvmw))
|
||||
return not_found(pvmw);
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Seek to next pte only makes sense for THP.
|
||||
* But more important than that optimization, is to filter out
|
||||
* any PageKsm page: whose page->index misleads vma_address()
|
||||
* and vma_address_end() to disaster.
|
||||
*/
|
||||
end = PageTransCompound(page) ?
|
||||
vma_address_end(page, pvmw->vma) :
|
||||
pvmw->address + PAGE_SIZE;
|
||||
end = vma_address_end(pvmw);
|
||||
if (pvmw->pte)
|
||||
goto next_pte;
|
||||
restart:
|
||||
@ -224,7 +216,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
|
||||
if (likely(pmd_trans_huge(pmde))) {
|
||||
if (pvmw->flags & PVMW_MIGRATION)
|
||||
return not_found(pvmw);
|
||||
if (pmd_page(pmde) != page)
|
||||
if (!check_pmd(pmd_pfn(pmde), pvmw))
|
||||
return not_found(pvmw);
|
||||
return true;
|
||||
}
|
||||
@ -236,7 +228,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
|
||||
return not_found(pvmw);
|
||||
entry = pmd_to_swp_entry(pmde);
|
||||
if (!is_migration_entry(entry) ||
|
||||
pfn_swap_entry_to_page(entry) != page)
|
||||
!check_pmd(swp_offset(entry), pvmw))
|
||||
return not_found(pvmw);
|
||||
return true;
|
||||
}
|
||||
@ -250,7 +242,8 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
|
||||
* cleared *pmd but not decremented compound_mapcount().
|
||||
*/
|
||||
if ((pvmw->flags & PVMW_SYNC) &&
|
||||
PageTransCompound(page)) {
|
||||
transparent_hugepage_active(vma) &&
|
||||
(pvmw->nr_pages >= HPAGE_PMD_NR)) {
|
||||
spinlock_t *ptl = pmd_lock(mm, pvmw->pmd);
|
||||
|
||||
spin_unlock(ptl);
|
||||
@ -307,7 +300,8 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
|
||||
int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
|
||||
{
|
||||
struct page_vma_mapped_walk pvmw = {
|
||||
.page = page,
|
||||
.pfn = page_to_pfn(page),
|
||||
.nr_pages = 1,
|
||||
.vma = vma,
|
||||
.flags = PVMW_SYNC,
|
||||
};
|
||||
|
108
mm/readahead.c
108
mm/readahead.c
@ -262,7 +262,7 @@ static void read_pages(struct readahead_control *rac, struct list_head *pages,
|
||||
|
||||
blk_finish_plug(&plug);
|
||||
|
||||
BUG_ON(!list_empty(pages));
|
||||
BUG_ON(pages && !list_empty(pages));
|
||||
BUG_ON(readahead_count(rac));
|
||||
|
||||
out:
|
||||
@ -361,7 +361,7 @@ EXPORT_SYMBOL_GPL(page_cache_ra_unbounded);
|
||||
* behaviour which would occur if page allocations are causing VM writeback.
|
||||
* We really don't want to intermingle reads and writes like that.
|
||||
*/
|
||||
void do_page_cache_ra(struct readahead_control *ractl,
|
||||
static void do_page_cache_ra(struct readahead_control *ractl,
|
||||
unsigned long nr_to_read, unsigned long lookahead_size)
|
||||
{
|
||||
struct inode *inode = ractl->mapping->host;
|
||||
@ -545,11 +545,103 @@ static int try_context_readahead(struct address_space *mapping,
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* There are some parts of the kernel which assume that PMD entries
|
||||
* are exactly HPAGE_PMD_ORDER. Those should be fixed, but until then,
|
||||
* limit the maximum allocation order to PMD size. I'm not aware of any
|
||||
* assumptions about maximum order if THP are disabled, but 8 seems like
|
||||
* a good order (that's 1MB if you're using 4kB pages)
|
||||
*/
|
||||
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
||||
#define MAX_PAGECACHE_ORDER HPAGE_PMD_ORDER
|
||||
#else
|
||||
#define MAX_PAGECACHE_ORDER 8
|
||||
#endif
|
||||
|
||||
static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
|
||||
pgoff_t mark, unsigned int order, gfp_t gfp)
|
||||
{
|
||||
int err;
|
||||
struct folio *folio = filemap_alloc_folio(gfp, order);
|
||||
|
||||
if (!folio)
|
||||
return -ENOMEM;
|
||||
if (mark - index < (1UL << order))
|
||||
folio_set_readahead(folio);
|
||||
err = filemap_add_folio(ractl->mapping, folio, index, gfp);
|
||||
if (err)
|
||||
folio_put(folio);
|
||||
else
|
||||
ractl->_nr_pages += 1UL << order;
|
||||
return err;
|
||||
}
|
||||
|
||||
void page_cache_ra_order(struct readahead_control *ractl,
|
||||
struct file_ra_state *ra, unsigned int new_order)
|
||||
{
|
||||
struct address_space *mapping = ractl->mapping;
|
||||
pgoff_t index = readahead_index(ractl);
|
||||
pgoff_t limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT;
|
||||
pgoff_t mark = index + ra->size - ra->async_size;
|
||||
int err = 0;
|
||||
gfp_t gfp = readahead_gfp_mask(mapping);
|
||||
|
||||
if (!mapping_large_folio_support(mapping) || ra->size < 4)
|
||||
goto fallback;
|
||||
|
||||
limit = min(limit, index + ra->size - 1);
|
||||
|
||||
if (new_order < MAX_PAGECACHE_ORDER) {
|
||||
new_order += 2;
|
||||
if (new_order > MAX_PAGECACHE_ORDER)
|
||||
new_order = MAX_PAGECACHE_ORDER;
|
||||
while ((1 << new_order) > ra->size)
|
||||
new_order--;
|
||||
}
|
||||
|
||||
while (index <= limit) {
|
||||
unsigned int order = new_order;
|
||||
|
||||
/* Align with smaller pages if needed */
|
||||
if (index & ((1UL << order) - 1)) {
|
||||
order = __ffs(index);
|
||||
if (order == 1)
|
||||
order = 0;
|
||||
}
|
||||
/* Don't allocate pages past EOF */
|
||||
while (index + (1UL << order) - 1 > limit) {
|
||||
if (--order == 1)
|
||||
order = 0;
|
||||
}
|
||||
err = ra_alloc_folio(ractl, index, mark, order, gfp);
|
||||
if (err)
|
||||
break;
|
||||
index += 1UL << order;
|
||||
}
|
||||
|
||||
if (index > limit) {
|
||||
ra->size += index - limit - 1;
|
||||
ra->async_size += index - limit - 1;
|
||||
}
|
||||
|
||||
read_pages(ractl, NULL, false);
|
||||
|
||||
/*
|
||||
* If there were already pages in the page cache, then we may have
|
||||
* left some gaps. Let the regular readahead code take care of this
|
||||
* situation.
|
||||
*/
|
||||
if (!err)
|
||||
return;
|
||||
fallback:
|
||||
do_page_cache_ra(ractl, ra->size, ra->async_size);
|
||||
}
|
||||
|
||||
/*
|
||||
* A minimal readahead algorithm for trivial sequential/random reads.
|
||||
*/
|
||||
static void ondemand_readahead(struct readahead_control *ractl,
|
||||
bool hit_readahead_marker, unsigned long req_size)
|
||||
struct folio *folio, unsigned long req_size)
|
||||
{
|
||||
struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host);
|
||||
struct file_ra_state *ra = ractl->ra;
|
||||
@ -584,12 +676,12 @@ static void ondemand_readahead(struct readahead_control *ractl,
|
||||
}
|
||||
|
||||
/*
|
||||
* Hit a marked page without valid readahead state.
|
||||
* Hit a marked folio without valid readahead state.
|
||||
* E.g. interleaved reads.
|
||||
* Query the pagecache for async_size, which normally equals to
|
||||
* readahead size. Ramp it up and use it as the new readahead size.
|
||||
*/
|
||||
if (hit_readahead_marker) {
|
||||
if (folio) {
|
||||
pgoff_t start;
|
||||
|
||||
rcu_read_lock();
|
||||
@ -662,7 +754,7 @@ static void ondemand_readahead(struct readahead_control *ractl,
|
||||
}
|
||||
|
||||
ractl->_index = ra->start;
|
||||
do_page_cache_ra(ractl, ra->size, ra->async_size);
|
||||
page_cache_ra_order(ractl, ra, folio ? folio_order(folio) : 0);
|
||||
}
|
||||
|
||||
void page_cache_sync_ra(struct readahead_control *ractl,
|
||||
@ -690,7 +782,7 @@ void page_cache_sync_ra(struct readahead_control *ractl,
|
||||
}
|
||||
|
||||
/* do read-ahead */
|
||||
ondemand_readahead(ractl, false, req_count);
|
||||
ondemand_readahead(ractl, NULL, req_count);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(page_cache_sync_ra);
|
||||
|
||||
@ -713,7 +805,7 @@ void page_cache_async_ra(struct readahead_control *ractl,
|
||||
return;
|
||||
|
||||
/* do read-ahead */
|
||||
ondemand_readahead(ractl, true, req_count);
|
||||
ondemand_readahead(ractl, folio, req_count);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(page_cache_async_ra);
|
||||
|
||||
|
177
mm/swap.c
177
mm/swap.c
@ -74,8 +74,8 @@ static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = {
|
||||
};
|
||||
|
||||
/*
|
||||
* This path almost never happens for VM activity - pages are normally
|
||||
* freed via pagevecs. But it gets used by networking.
|
||||
* This path almost never happens for VM activity - pages are normally freed
|
||||
* via pagevecs. But it gets used by networking - and for compound pages.
|
||||
*/
|
||||
static void __page_cache_release(struct page *page)
|
||||
{
|
||||
@ -89,6 +89,14 @@ static void __page_cache_release(struct page *page)
|
||||
__clear_page_lru_flags(page);
|
||||
unlock_page_lruvec_irqrestore(lruvec, flags);
|
||||
}
|
||||
/* See comment on PageMlocked in release_pages() */
|
||||
if (unlikely(PageMlocked(page))) {
|
||||
int nr_pages = thp_nr_pages(page);
|
||||
|
||||
__ClearPageMlocked(page);
|
||||
mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
|
||||
count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages);
|
||||
}
|
||||
__ClearPageWaiters(page);
|
||||
}
|
||||
|
||||
@ -114,17 +122,9 @@ static void __put_compound_page(struct page *page)
|
||||
|
||||
void __put_page(struct page *page)
|
||||
{
|
||||
if (is_zone_device_page(page)) {
|
||||
put_dev_pagemap(page->pgmap);
|
||||
|
||||
/*
|
||||
* The page belongs to the device that created pgmap. Do
|
||||
* not return it to page allocator.
|
||||
*/
|
||||
return;
|
||||
}
|
||||
|
||||
if (unlikely(PageCompound(page)))
|
||||
if (unlikely(is_zone_device_page(page)))
|
||||
free_zone_device_page(page);
|
||||
else if (unlikely(PageCompound(page)))
|
||||
__put_compound_page(page);
|
||||
else
|
||||
__put_single_page(page);
|
||||
@ -482,22 +482,12 @@ EXPORT_SYMBOL(folio_add_lru);
|
||||
void lru_cache_add_inactive_or_unevictable(struct page *page,
|
||||
struct vm_area_struct *vma)
|
||||
{
|
||||
bool unevictable;
|
||||
|
||||
VM_BUG_ON_PAGE(PageLRU(page), page);
|
||||
|
||||
unevictable = (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED;
|
||||
if (unlikely(unevictable) && !TestSetPageMlocked(page)) {
|
||||
int nr_pages = thp_nr_pages(page);
|
||||
/*
|
||||
* We use the irq-unsafe __mod_zone_page_state because this
|
||||
* counter is not modified from interrupt context, and the pte
|
||||
* lock is held(spinlock), which implies preemption disabled.
|
||||
*/
|
||||
__mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
|
||||
count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
|
||||
}
|
||||
lru_cache_add(page);
|
||||
if (unlikely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED))
|
||||
mlock_new_page(page);
|
||||
else
|
||||
lru_cache_add(page);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -636,35 +626,37 @@ void lru_add_drain_cpu(int cpu)
|
||||
pagevec_lru_move_fn(pvec, lru_lazyfree_fn);
|
||||
|
||||
activate_page_drain(cpu);
|
||||
mlock_page_drain(cpu);
|
||||
}
|
||||
|
||||
/**
|
||||
* deactivate_file_page - forcefully deactivate a file page
|
||||
* @page: page to deactivate
|
||||
* deactivate_file_folio() - Forcefully deactivate a file folio.
|
||||
* @folio: Folio to deactivate.
|
||||
*
|
||||
* This function hints the VM that @page is a good reclaim candidate,
|
||||
* for example if its invalidation fails due to the page being dirty
|
||||
* This function hints to the VM that @folio is a good reclaim candidate,
|
||||
* for example if its invalidation fails due to the folio being dirty
|
||||
* or under writeback.
|
||||
*
|
||||
* Context: Caller holds a reference on the page.
|
||||
*/
|
||||
void deactivate_file_page(struct page *page)
|
||||
void deactivate_file_folio(struct folio *folio)
|
||||
{
|
||||
struct pagevec *pvec;
|
||||
|
||||
/*
|
||||
* In a workload with many unevictable page such as mprotect,
|
||||
* unevictable page deactivation for accelerating reclaim is pointless.
|
||||
* In a workload with many unevictable pages such as mprotect,
|
||||
* unevictable folio deactivation for accelerating reclaim is pointless.
|
||||
*/
|
||||
if (PageUnevictable(page))
|
||||
if (folio_test_unevictable(folio))
|
||||
return;
|
||||
|
||||
if (likely(get_page_unless_zero(page))) {
|
||||
struct pagevec *pvec;
|
||||
folio_get(folio);
|
||||
local_lock(&lru_pvecs.lock);
|
||||
pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file);
|
||||
|
||||
local_lock(&lru_pvecs.lock);
|
||||
pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file);
|
||||
|
||||
if (pagevec_add_and_need_flush(pvec, page))
|
||||
pagevec_lru_move_fn(pvec, lru_deactivate_file_fn);
|
||||
local_unlock(&lru_pvecs.lock);
|
||||
}
|
||||
if (pagevec_add_and_need_flush(pvec, &folio->page))
|
||||
pagevec_lru_move_fn(pvec, lru_deactivate_file_fn);
|
||||
local_unlock(&lru_pvecs.lock);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -837,6 +829,7 @@ inline void __lru_add_drain_all(bool force_all_cpus)
|
||||
pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) ||
|
||||
pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) ||
|
||||
need_activate_page_drain(cpu) ||
|
||||
need_mlock_page_drain(cpu) ||
|
||||
has_bh_in_lru(cpu, NULL)) {
|
||||
INIT_WORK(work, lru_add_drain_per_cpu);
|
||||
queue_work_on(cpu, mm_percpu_wq, work);
|
||||
@ -935,18 +928,10 @@ void release_pages(struct page **pages, int nr)
|
||||
unlock_page_lruvec_irqrestore(lruvec, flags);
|
||||
lruvec = NULL;
|
||||
}
|
||||
/*
|
||||
* ZONE_DEVICE pages that return 'false' from
|
||||
* page_is_devmap_managed() do not require special
|
||||
* processing, and instead, expect a call to
|
||||
* put_page_testzero().
|
||||
*/
|
||||
if (page_is_devmap_managed(page)) {
|
||||
put_devmap_managed_page(page);
|
||||
if (put_devmap_managed_page(page))
|
||||
continue;
|
||||
}
|
||||
if (put_page_testzero(page))
|
||||
put_dev_pagemap(page->pgmap);
|
||||
free_zone_device_page(page);
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -974,6 +959,18 @@ void release_pages(struct page **pages, int nr)
|
||||
__clear_page_lru_flags(page);
|
||||
}
|
||||
|
||||
/*
|
||||
* In rare cases, when truncation or holepunching raced with
|
||||
* munlock after VM_LOCKED was cleared, Mlocked may still be
|
||||
* found set here. This does not indicate a problem, unless
|
||||
* "unevictable_pgs_cleared" appears worryingly large.
|
||||
*/
|
||||
if (unlikely(PageMlocked(page))) {
|
||||
__ClearPageMlocked(page);
|
||||
dec_zone_page_state(page, NR_MLOCK);
|
||||
count_vm_event(UNEVICTABLE_PGCLEARED);
|
||||
}
|
||||
|
||||
__ClearPageWaiters(page);
|
||||
|
||||
list_add(&page->lru, &pages_to_free);
|
||||
@ -1014,43 +1011,32 @@ static void __pagevec_lru_add_fn(struct folio *folio, struct lruvec *lruvec)
|
||||
|
||||
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
|
||||
|
||||
/*
|
||||
* A folio becomes evictable in two ways:
|
||||
* 1) Within LRU lock [munlock_vma_page() and __munlock_pagevec()].
|
||||
* 2) Before acquiring LRU lock to put the folio on the correct LRU
|
||||
* and then
|
||||
* a) do PageLRU check with lock [check_move_unevictable_pages]
|
||||
* b) do PageLRU check before lock [clear_page_mlock]
|
||||
*
|
||||
* (1) & (2a) are ok as LRU lock will serialize them. For (2b), we need
|
||||
* following strict ordering:
|
||||
*
|
||||
* #0: __pagevec_lru_add_fn #1: clear_page_mlock
|
||||
*
|
||||
* folio_set_lru() folio_test_clear_mlocked()
|
||||
* smp_mb() // explicit ordering // above provides strict
|
||||
* // ordering
|
||||
* folio_test_mlocked() folio_test_lru()
|
||||
*
|
||||
*
|
||||
* if '#1' does not observe setting of PG_lru by '#0' and
|
||||
* fails isolation, the explicit barrier will make sure that
|
||||
* folio_evictable check will put the folio on the correct
|
||||
* LRU. Without smp_mb(), folio_set_lru() can be reordered
|
||||
* after folio_test_mlocked() check and can make '#1' fail the
|
||||
* isolation of the folio whose mlocked bit is cleared (#0 is
|
||||
* also looking at the same folio) and the evictable folio will
|
||||
* be stranded on an unevictable LRU.
|
||||
*/
|
||||
folio_set_lru(folio);
|
||||
smp_mb__after_atomic();
|
||||
|
||||
/*
|
||||
* Is an smp_mb__after_atomic() still required here, before
|
||||
* folio_evictable() tests PageMlocked, to rule out the possibility
|
||||
* of stranding an evictable folio on an unevictable LRU? I think
|
||||
* not, because __munlock_page() only clears PageMlocked while the LRU
|
||||
* lock is held.
|
||||
*
|
||||
* (That is not true of __page_cache_release(), and not necessarily
|
||||
* true of release_pages(): but those only clear PageMlocked after
|
||||
* put_page_testzero() has excluded any other users of the page.)
|
||||
*/
|
||||
if (folio_evictable(folio)) {
|
||||
if (was_unevictable)
|
||||
__count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
|
||||
} else {
|
||||
folio_clear_active(folio);
|
||||
folio_set_unevictable(folio);
|
||||
/*
|
||||
* folio->mlock_count = !!folio_test_mlocked(folio)?
|
||||
* But that leaves __mlock_page() in doubt whether another
|
||||
* actor has already counted the mlock or not. Err on the
|
||||
* safe side, underestimate, let page reclaim fix it, rather
|
||||
* than leaving a page on the unevictable LRU indefinitely.
|
||||
*/
|
||||
folio->mlock_count = 0;
|
||||
if (!was_unevictable)
|
||||
__count_vm_events(UNEVICTABLE_PGCULLED, nr_pages);
|
||||
}
|
||||
@ -1158,26 +1144,3 @@ void __init swap_setup(void)
|
||||
* _really_ don't want to cluster much more
|
||||
*/
|
||||
}
|
||||
|
||||
#ifdef CONFIG_DEV_PAGEMAP_OPS
|
||||
void put_devmap_managed_page(struct page *page)
|
||||
{
|
||||
int count;
|
||||
|
||||
if (WARN_ON_ONCE(!page_is_devmap_managed(page)))
|
||||
return;
|
||||
|
||||
count = page_ref_dec_return(page);
|
||||
|
||||
/*
|
||||
* devmap page refcounts are 1-based, rather than 0-based: if
|
||||
* refcount is 1, then the page is free and the refcount is
|
||||
* stable because nobody holds a reference on the page.
|
||||
*/
|
||||
if (count == 1)
|
||||
free_devmap_managed_page(page);
|
||||
else if (!count)
|
||||
__put_page(page);
|
||||
}
|
||||
EXPORT_SYMBOL(put_devmap_managed_page);
|
||||
#endif
|
||||
|
117
mm/truncate.c
117
mm/truncate.c
@ -193,27 +193,6 @@ static void truncate_cleanup_folio(struct folio *folio)
|
||||
folio_clear_mappedtodisk(folio);
|
||||
}
|
||||
|
||||
/*
|
||||
* This is for invalidate_mapping_pages(). That function can be called at
|
||||
* any time, and is not supposed to throw away dirty pages. But pages can
|
||||
* be marked dirty at any time too, so use remove_mapping which safely
|
||||
* discards clean, unused pages.
|
||||
*
|
||||
* Returns non-zero if the page was successfully invalidated.
|
||||
*/
|
||||
static int
|
||||
invalidate_complete_page(struct address_space *mapping, struct page *page)
|
||||
{
|
||||
|
||||
if (page->mapping != mapping)
|
||||
return 0;
|
||||
|
||||
if (page_has_private(page) && !try_to_release_page(page, 0))
|
||||
return 0;
|
||||
|
||||
return remove_mapping(mapping, page);
|
||||
}
|
||||
|
||||
int truncate_inode_folio(struct address_space *mapping, struct folio *folio)
|
||||
{
|
||||
if (folio->mapping != mapping)
|
||||
@ -294,22 +273,40 @@ int generic_error_remove_page(struct address_space *mapping, struct page *page)
|
||||
}
|
||||
EXPORT_SYMBOL(generic_error_remove_page);
|
||||
|
||||
/*
|
||||
* Safely invalidate one page from its pagecache mapping.
|
||||
* It only drops clean, unused pages. The page must be locked.
|
||||
*
|
||||
* Returns 1 if the page is successfully invalidated, otherwise 0.
|
||||
*/
|
||||
int invalidate_inode_page(struct page *page)
|
||||
static long mapping_evict_folio(struct address_space *mapping,
|
||||
struct folio *folio)
|
||||
{
|
||||
struct address_space *mapping = page_mapping(page);
|
||||
if (folio_test_dirty(folio) || folio_test_writeback(folio))
|
||||
return 0;
|
||||
/* The refcount will be elevated if any page in the folio is mapped */
|
||||
if (folio_ref_count(folio) >
|
||||
folio_nr_pages(folio) + folio_has_private(folio) + 1)
|
||||
return 0;
|
||||
if (folio_has_private(folio) && !filemap_release_folio(folio, 0))
|
||||
return 0;
|
||||
|
||||
return remove_mapping(mapping, folio);
|
||||
}
|
||||
|
||||
/**
|
||||
* invalidate_inode_page() - Remove an unused page from the pagecache.
|
||||
* @page: The page to remove.
|
||||
*
|
||||
* Safely invalidate one page from its pagecache mapping.
|
||||
* It only drops clean, unused pages.
|
||||
*
|
||||
* Context: Page must be locked.
|
||||
* Return: The number of pages successfully removed.
|
||||
*/
|
||||
long invalidate_inode_page(struct page *page)
|
||||
{
|
||||
struct folio *folio = page_folio(page);
|
||||
struct address_space *mapping = folio_mapping(folio);
|
||||
|
||||
/* The page may have been truncated before it was locked */
|
||||
if (!mapping)
|
||||
return 0;
|
||||
if (PageDirty(page) || PageWriteback(page))
|
||||
return 0;
|
||||
if (page_mapped(page))
|
||||
return 0;
|
||||
return invalidate_complete_page(mapping, page);
|
||||
return mapping_evict_folio(mapping, folio);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -497,7 +494,18 @@ void truncate_inode_pages_final(struct address_space *mapping)
|
||||
}
|
||||
EXPORT_SYMBOL(truncate_inode_pages_final);
|
||||
|
||||
static unsigned long __invalidate_mapping_pages(struct address_space *mapping,
|
||||
/**
|
||||
* invalidate_mapping_pagevec - Invalidate all the unlocked pages of one inode
|
||||
* @mapping: the address_space which holds the pages to invalidate
|
||||
* @start: the offset 'from' which to invalidate
|
||||
* @end: the offset 'to' which to invalidate (inclusive)
|
||||
* @nr_pagevec: invalidate failed page number for caller
|
||||
*
|
||||
* This helper is similar to invalidate_mapping_pages(), except that it accounts
|
||||
* for pages that are likely on a pagevec and counts them in @nr_pagevec, which
|
||||
* will be used by the caller.
|
||||
*/
|
||||
unsigned long invalidate_mapping_pagevec(struct address_space *mapping,
|
||||
pgoff_t start, pgoff_t end, unsigned long *nr_pagevec)
|
||||
{
|
||||
pgoff_t indices[PAGEVEC_SIZE];
|
||||
@ -510,27 +518,27 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping,
|
||||
folio_batch_init(&fbatch);
|
||||
while (find_lock_entries(mapping, index, end, &fbatch, indices)) {
|
||||
for (i = 0; i < folio_batch_count(&fbatch); i++) {
|
||||
struct page *page = &fbatch.folios[i]->page;
|
||||
struct folio *folio = fbatch.folios[i];
|
||||
|
||||
/* We rely upon deletion not changing page->index */
|
||||
/* We rely upon deletion not changing folio->index */
|
||||
index = indices[i];
|
||||
|
||||
if (xa_is_value(page)) {
|
||||
if (xa_is_value(folio)) {
|
||||
count += invalidate_exceptional_entry(mapping,
|
||||
index,
|
||||
page);
|
||||
folio);
|
||||
continue;
|
||||
}
|
||||
index += thp_nr_pages(page) - 1;
|
||||
index += folio_nr_pages(folio) - 1;
|
||||
|
||||
ret = invalidate_inode_page(page);
|
||||
unlock_page(page);
|
||||
ret = mapping_evict_folio(mapping, folio);
|
||||
folio_unlock(folio);
|
||||
/*
|
||||
* Invalidation is a hint that the page is no longer
|
||||
* Invalidation is a hint that the folio is no longer
|
||||
* of interest and try to speed up its reclaim.
|
||||
*/
|
||||
if (!ret) {
|
||||
deactivate_file_page(page);
|
||||
deactivate_file_folio(folio);
|
||||
/* It is likely on the pagevec of a remote CPU */
|
||||
if (nr_pagevec)
|
||||
(*nr_pagevec)++;
|
||||
@ -562,29 +570,12 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping,
|
||||
unsigned long invalidate_mapping_pages(struct address_space *mapping,
|
||||
pgoff_t start, pgoff_t end)
|
||||
{
|
||||
return __invalidate_mapping_pages(mapping, start, end, NULL);
|
||||
return invalidate_mapping_pagevec(mapping, start, end, NULL);
|
||||
}
|
||||
EXPORT_SYMBOL(invalidate_mapping_pages);
|
||||
|
||||
/**
|
||||
* invalidate_mapping_pagevec - Invalidate all the unlocked pages of one inode
|
||||
* @mapping: the address_space which holds the pages to invalidate
|
||||
* @start: the offset 'from' which to invalidate
|
||||
* @end: the offset 'to' which to invalidate (inclusive)
|
||||
* @nr_pagevec: invalidate failed page number for caller
|
||||
*
|
||||
* This helper is similar to invalidate_mapping_pages(), except that it accounts
|
||||
* for pages that are likely on a pagevec and counts them in @nr_pagevec, which
|
||||
* will be used by the caller.
|
||||
*/
|
||||
void invalidate_mapping_pagevec(struct address_space *mapping,
|
||||
pgoff_t start, pgoff_t end, unsigned long *nr_pagevec)
|
||||
{
|
||||
__invalidate_mapping_pages(mapping, start, end, nr_pagevec);
|
||||
}
|
||||
|
||||
/*
|
||||
* This is like invalidate_complete_page(), except it ignores the page's
|
||||
* This is like invalidate_inode_page(), except it ignores the page's
|
||||
* refcount. We do this because invalidate_inode_pages2() needs stronger
|
||||
* invalidation guarantees, and cannot afford to leave pages behind because
|
||||
* shrink_page_list() has a temp ref on them, or because they're transiently
|
||||
|
@ -95,10 +95,15 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
|
||||
if (!pte_none(*dst_pte))
|
||||
goto out_unlock;
|
||||
|
||||
if (page_in_cache)
|
||||
page_add_file_rmap(page, false);
|
||||
else
|
||||
if (page_in_cache) {
|
||||
/* Usually, cache pages are already added to LRU */
|
||||
if (newly_allocated)
|
||||
lru_cache_add(page);
|
||||
page_add_file_rmap(page, dst_vma, false);
|
||||
} else {
|
||||
page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
|
||||
lru_cache_add_inactive_or_unevictable(page, dst_vma);
|
||||
}
|
||||
|
||||
/*
|
||||
* Must happen after rmap, as mm_counter() checks mapping (via
|
||||
@ -106,9 +111,6 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
|
||||
*/
|
||||
inc_mm_counter(dst_mm, mm_counter(page));
|
||||
|
||||
if (newly_allocated)
|
||||
lru_cache_add_inactive_or_unevictable(page, dst_vma);
|
||||
|
||||
set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
|
||||
|
||||
/* No need to invalidate - it was non-present before */
|
||||
|
36
mm/util.c
36
mm/util.c
@ -681,9 +681,8 @@ bool folio_mapped(struct folio *folio)
|
||||
}
|
||||
EXPORT_SYMBOL(folio_mapped);
|
||||
|
||||
struct anon_vma *page_anon_vma(struct page *page)
|
||||
struct anon_vma *folio_anon_vma(struct folio *folio)
|
||||
{
|
||||
struct folio *folio = page_folio(page);
|
||||
unsigned long mapping = (unsigned long)folio->mapping;
|
||||
|
||||
if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
|
||||
@ -742,6 +741,39 @@ int __page_mapcount(struct page *page)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__page_mapcount);
|
||||
|
||||
/**
|
||||
* folio_mapcount() - Calculate the number of mappings of this folio.
|
||||
* @folio: The folio.
|
||||
*
|
||||
* A large folio tracks both how many times the entire folio is mapped,
|
||||
* and how many times each individual page in the folio is mapped.
|
||||
* This function calculates the total number of times the folio is
|
||||
* mapped.
|
||||
*
|
||||
* Return: The number of times this folio is mapped.
|
||||
*/
|
||||
int folio_mapcount(struct folio *folio)
|
||||
{
|
||||
int i, compound, nr, ret;
|
||||
|
||||
if (likely(!folio_test_large(folio)))
|
||||
return atomic_read(&folio->_mapcount) + 1;
|
||||
|
||||
compound = folio_entire_mapcount(folio);
|
||||
nr = folio_nr_pages(folio);
|
||||
if (folio_test_hugetlb(folio))
|
||||
return compound;
|
||||
ret = compound;
|
||||
for (i = 0; i < nr; i++)
|
||||
ret += atomic_read(&folio_page(folio, i)->_mapcount) + 1;
|
||||
/* File pages has compound_mapcount included in _mapcount */
|
||||
if (!folio_test_anon(folio))
|
||||
return ret - compound * nr;
|
||||
if (folio_test_double_map(folio))
|
||||
ret -= nr;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* folio_copy - Copy the contents of one folio to another.
|
||||
* @dst: Folio to copy to.
|
||||
|
305
mm/vmscan.c
305
mm/vmscan.c
@ -979,36 +979,36 @@ void drop_slab(void)
|
||||
drop_slab_node(nid);
|
||||
}
|
||||
|
||||
static inline int is_page_cache_freeable(struct page *page)
|
||||
static inline int is_page_cache_freeable(struct folio *folio)
|
||||
{
|
||||
/*
|
||||
* A freeable page cache page is referenced only by the caller
|
||||
* that isolated the page, the page cache and optional buffer
|
||||
* heads at page->private.
|
||||
*/
|
||||
int page_cache_pins = thp_nr_pages(page);
|
||||
return page_count(page) - page_has_private(page) == 1 + page_cache_pins;
|
||||
return folio_ref_count(folio) - folio_test_private(folio) ==
|
||||
1 + folio_nr_pages(folio);
|
||||
}
|
||||
|
||||
/*
|
||||
* We detected a synchronous write error writing a page out. Probably
|
||||
* We detected a synchronous write error writing a folio out. Probably
|
||||
* -ENOSPC. We need to propagate that into the address_space for a subsequent
|
||||
* fsync(), msync() or close().
|
||||
*
|
||||
* The tricky part is that after writepage we cannot touch the mapping: nothing
|
||||
* prevents it from being freed up. But we have a ref on the page and once
|
||||
* that page is locked, the mapping is pinned.
|
||||
* prevents it from being freed up. But we have a ref on the folio and once
|
||||
* that folio is locked, the mapping is pinned.
|
||||
*
|
||||
* We're allowed to run sleeping lock_page() here because we know the caller has
|
||||
* We're allowed to run sleeping folio_lock() here because we know the caller has
|
||||
* __GFP_FS.
|
||||
*/
|
||||
static void handle_write_error(struct address_space *mapping,
|
||||
struct page *page, int error)
|
||||
struct folio *folio, int error)
|
||||
{
|
||||
lock_page(page);
|
||||
if (page_mapping(page) == mapping)
|
||||
folio_lock(folio);
|
||||
if (folio_mapping(folio) == mapping)
|
||||
mapping_set_error(mapping, error);
|
||||
unlock_page(page);
|
||||
folio_unlock(folio);
|
||||
}
|
||||
|
||||
static bool skip_throttle_noprogress(pg_data_t *pgdat)
|
||||
@ -1155,35 +1155,35 @@ typedef enum {
|
||||
* pageout is called by shrink_page_list() for each dirty page.
|
||||
* Calls ->writepage().
|
||||
*/
|
||||
static pageout_t pageout(struct page *page, struct address_space *mapping)
|
||||
static pageout_t pageout(struct folio *folio, struct address_space *mapping)
|
||||
{
|
||||
/*
|
||||
* If the page is dirty, only perform writeback if that write
|
||||
* If the folio is dirty, only perform writeback if that write
|
||||
* will be non-blocking. To prevent this allocation from being
|
||||
* stalled by pagecache activity. But note that there may be
|
||||
* stalls if we need to run get_block(). We could test
|
||||
* PagePrivate for that.
|
||||
*
|
||||
* If this process is currently in __generic_file_write_iter() against
|
||||
* this page's queue, we can perform writeback even if that
|
||||
* this folio's queue, we can perform writeback even if that
|
||||
* will block.
|
||||
*
|
||||
* If the page is swapcache, write it back even if that would
|
||||
* If the folio is swapcache, write it back even if that would
|
||||
* block, for some throttling. This happens by accident, because
|
||||
* swap_backing_dev_info is bust: it doesn't reflect the
|
||||
* congestion state of the swapdevs. Easy to fix, if needed.
|
||||
*/
|
||||
if (!is_page_cache_freeable(page))
|
||||
if (!is_page_cache_freeable(folio))
|
||||
return PAGE_KEEP;
|
||||
if (!mapping) {
|
||||
/*
|
||||
* Some data journaling orphaned pages can have
|
||||
* page->mapping == NULL while being dirty with clean buffers.
|
||||
* Some data journaling orphaned folios can have
|
||||
* folio->mapping == NULL while being dirty with clean buffers.
|
||||
*/
|
||||
if (page_has_private(page)) {
|
||||
if (try_to_free_buffers(page)) {
|
||||
ClearPageDirty(page);
|
||||
pr_info("%s: orphaned page\n", __func__);
|
||||
if (folio_test_private(folio)) {
|
||||
if (try_to_free_buffers(&folio->page)) {
|
||||
folio_clear_dirty(folio);
|
||||
pr_info("%s: orphaned folio\n", __func__);
|
||||
return PAGE_CLEAN;
|
||||
}
|
||||
}
|
||||
@ -1192,7 +1192,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
|
||||
if (mapping->a_ops->writepage == NULL)
|
||||
return PAGE_ACTIVATE;
|
||||
|
||||
if (clear_page_dirty_for_io(page)) {
|
||||
if (folio_clear_dirty_for_io(folio)) {
|
||||
int res;
|
||||
struct writeback_control wbc = {
|
||||
.sync_mode = WB_SYNC_NONE,
|
||||
@ -1202,21 +1202,21 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
|
||||
.for_reclaim = 1,
|
||||
};
|
||||
|
||||
SetPageReclaim(page);
|
||||
res = mapping->a_ops->writepage(page, &wbc);
|
||||
folio_set_reclaim(folio);
|
||||
res = mapping->a_ops->writepage(&folio->page, &wbc);
|
||||
if (res < 0)
|
||||
handle_write_error(mapping, page, res);
|
||||
handle_write_error(mapping, folio, res);
|
||||
if (res == AOP_WRITEPAGE_ACTIVATE) {
|
||||
ClearPageReclaim(page);
|
||||
folio_clear_reclaim(folio);
|
||||
return PAGE_ACTIVATE;
|
||||
}
|
||||
|
||||
if (!PageWriteback(page)) {
|
||||
if (!folio_test_writeback(folio)) {
|
||||
/* synchronous write or broken a_ops? */
|
||||
ClearPageReclaim(page);
|
||||
folio_clear_reclaim(folio);
|
||||
}
|
||||
trace_mm_vmscan_writepage(page);
|
||||
inc_node_page_state(page, NR_VMSCAN_WRITE);
|
||||
trace_mm_vmscan_write_folio(folio);
|
||||
node_stat_add_folio(folio, NR_VMSCAN_WRITE);
|
||||
return PAGE_SUCCESS;
|
||||
}
|
||||
|
||||
@ -1227,16 +1227,16 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
|
||||
* Same as remove_mapping, but if the page is removed from the mapping, it
|
||||
* gets returned with a refcount of 0.
|
||||
*/
|
||||
static int __remove_mapping(struct address_space *mapping, struct page *page,
|
||||
static int __remove_mapping(struct address_space *mapping, struct folio *folio,
|
||||
bool reclaimed, struct mem_cgroup *target_memcg)
|
||||
{
|
||||
int refcount;
|
||||
void *shadow = NULL;
|
||||
|
||||
BUG_ON(!PageLocked(page));
|
||||
BUG_ON(mapping != page_mapping(page));
|
||||
BUG_ON(!folio_test_locked(folio));
|
||||
BUG_ON(mapping != folio_mapping(folio));
|
||||
|
||||
if (!PageSwapCache(page))
|
||||
if (!folio_test_swapcache(folio))
|
||||
spin_lock(&mapping->host->i_lock);
|
||||
xa_lock_irq(&mapping->i_pages);
|
||||
/*
|
||||
@ -1264,23 +1264,23 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
|
||||
* Note that if SetPageDirty is always performed via set_page_dirty,
|
||||
* and thus under the i_pages lock, then this ordering is not required.
|
||||
*/
|
||||
refcount = 1 + compound_nr(page);
|
||||
if (!page_ref_freeze(page, refcount))
|
||||
refcount = 1 + folio_nr_pages(folio);
|
||||
if (!folio_ref_freeze(folio, refcount))
|
||||
goto cannot_free;
|
||||
/* note: atomic_cmpxchg in page_ref_freeze provides the smp_rmb */
|
||||
if (unlikely(PageDirty(page))) {
|
||||
page_ref_unfreeze(page, refcount);
|
||||
if (unlikely(folio_test_dirty(folio))) {
|
||||
folio_ref_unfreeze(folio, refcount);
|
||||
goto cannot_free;
|
||||
}
|
||||
|
||||
if (PageSwapCache(page)) {
|
||||
swp_entry_t swap = { .val = page_private(page) };
|
||||
mem_cgroup_swapout(page, swap);
|
||||
if (folio_test_swapcache(folio)) {
|
||||
swp_entry_t swap = folio_swap_entry(folio);
|
||||
mem_cgroup_swapout(folio, swap);
|
||||
if (reclaimed && !mapping_exiting(mapping))
|
||||
shadow = workingset_eviction(page, target_memcg);
|
||||
__delete_from_swap_cache(page, swap, shadow);
|
||||
shadow = workingset_eviction(folio, target_memcg);
|
||||
__delete_from_swap_cache(&folio->page, swap, shadow);
|
||||
xa_unlock_irq(&mapping->i_pages);
|
||||
put_swap_page(page, swap);
|
||||
put_swap_page(&folio->page, swap);
|
||||
} else {
|
||||
void (*freepage)(struct page *);
|
||||
|
||||
@ -1301,61 +1301,67 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
|
||||
* exceptional entries and shadow exceptional entries in the
|
||||
* same address_space.
|
||||
*/
|
||||
if (reclaimed && page_is_file_lru(page) &&
|
||||
if (reclaimed && folio_is_file_lru(folio) &&
|
||||
!mapping_exiting(mapping) && !dax_mapping(mapping))
|
||||
shadow = workingset_eviction(page, target_memcg);
|
||||
__delete_from_page_cache(page, shadow);
|
||||
shadow = workingset_eviction(folio, target_memcg);
|
||||
__filemap_remove_folio(folio, shadow);
|
||||
xa_unlock_irq(&mapping->i_pages);
|
||||
if (mapping_shrinkable(mapping))
|
||||
inode_add_lru(mapping->host);
|
||||
spin_unlock(&mapping->host->i_lock);
|
||||
|
||||
if (freepage != NULL)
|
||||
freepage(page);
|
||||
freepage(&folio->page);
|
||||
}
|
||||
|
||||
return 1;
|
||||
|
||||
cannot_free:
|
||||
xa_unlock_irq(&mapping->i_pages);
|
||||
if (!PageSwapCache(page))
|
||||
if (!folio_test_swapcache(folio))
|
||||
spin_unlock(&mapping->host->i_lock);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Attempt to detach a locked page from its ->mapping. If it is dirty or if
|
||||
* someone else has a ref on the page, abort and return 0. If it was
|
||||
* successfully detached, return 1. Assumes the caller has a single ref on
|
||||
* this page.
|
||||
/**
|
||||
* remove_mapping() - Attempt to remove a folio from its mapping.
|
||||
* @mapping: The address space.
|
||||
* @folio: The folio to remove.
|
||||
*
|
||||
* If the folio is dirty, under writeback or if someone else has a ref
|
||||
* on it, removal will fail.
|
||||
* Return: The number of pages removed from the mapping. 0 if the folio
|
||||
* could not be removed.
|
||||
* Context: The caller should have a single refcount on the folio and
|
||||
* hold its lock.
|
||||
*/
|
||||
int remove_mapping(struct address_space *mapping, struct page *page)
|
||||
long remove_mapping(struct address_space *mapping, struct folio *folio)
|
||||
{
|
||||
if (__remove_mapping(mapping, page, false, NULL)) {
|
||||
if (__remove_mapping(mapping, folio, false, NULL)) {
|
||||
/*
|
||||
* Unfreezing the refcount with 1 rather than 2 effectively
|
||||
* Unfreezing the refcount with 1 effectively
|
||||
* drops the pagecache ref for us without requiring another
|
||||
* atomic operation.
|
||||
*/
|
||||
page_ref_unfreeze(page, 1);
|
||||
return 1;
|
||||
folio_ref_unfreeze(folio, 1);
|
||||
return folio_nr_pages(folio);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* putback_lru_page - put previously isolated page onto appropriate LRU list
|
||||
* @page: page to be put back to appropriate lru list
|
||||
* folio_putback_lru - Put previously isolated folio onto appropriate LRU list.
|
||||
* @folio: Folio to be returned to an LRU list.
|
||||
*
|
||||
* Add previously isolated @page to appropriate LRU list.
|
||||
* Page may still be unevictable for other reasons.
|
||||
* Add previously isolated @folio to appropriate LRU list.
|
||||
* The folio may still be unevictable for other reasons.
|
||||
*
|
||||
* lru_lock must not be held, interrupts must be enabled.
|
||||
* Context: lru_lock must not be held, interrupts must be enabled.
|
||||
*/
|
||||
void putback_lru_page(struct page *page)
|
||||
void folio_putback_lru(struct folio *folio)
|
||||
{
|
||||
lru_cache_add(page);
|
||||
put_page(page); /* drop ref from isolate */
|
||||
folio_add_lru(folio);
|
||||
folio_put(folio); /* drop ref from isolate */
|
||||
}
|
||||
|
||||
enum page_references {
|
||||
@ -1365,61 +1371,61 @@ enum page_references {
|
||||
PAGEREF_ACTIVATE,
|
||||
};
|
||||
|
||||
static enum page_references page_check_references(struct page *page,
|
||||
static enum page_references folio_check_references(struct folio *folio,
|
||||
struct scan_control *sc)
|
||||
{
|
||||
int referenced_ptes, referenced_page;
|
||||
int referenced_ptes, referenced_folio;
|
||||
unsigned long vm_flags;
|
||||
|
||||
referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
|
||||
&vm_flags);
|
||||
referenced_page = TestClearPageReferenced(page);
|
||||
referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup,
|
||||
&vm_flags);
|
||||
referenced_folio = folio_test_clear_referenced(folio);
|
||||
|
||||
/*
|
||||
* Mlock lost the isolation race with us. Let try_to_unmap()
|
||||
* move the page to the unevictable list.
|
||||
* The supposedly reclaimable folio was found to be in a VM_LOCKED vma.
|
||||
* Let the folio, now marked Mlocked, be moved to the unevictable list.
|
||||
*/
|
||||
if (vm_flags & VM_LOCKED)
|
||||
return PAGEREF_RECLAIM;
|
||||
return PAGEREF_ACTIVATE;
|
||||
|
||||
if (referenced_ptes) {
|
||||
/*
|
||||
* All mapped pages start out with page table
|
||||
* All mapped folios start out with page table
|
||||
* references from the instantiating fault, so we need
|
||||
* to look twice if a mapped file/anon page is used more
|
||||
* to look twice if a mapped file/anon folio is used more
|
||||
* than once.
|
||||
*
|
||||
* Mark it and spare it for another trip around the
|
||||
* inactive list. Another page table reference will
|
||||
* lead to its activation.
|
||||
*
|
||||
* Note: the mark is set for activated pages as well
|
||||
* so that recently deactivated but used pages are
|
||||
* Note: the mark is set for activated folios as well
|
||||
* so that recently deactivated but used folios are
|
||||
* quickly recovered.
|
||||
*/
|
||||
SetPageReferenced(page);
|
||||
folio_set_referenced(folio);
|
||||
|
||||
if (referenced_page || referenced_ptes > 1)
|
||||
if (referenced_folio || referenced_ptes > 1)
|
||||
return PAGEREF_ACTIVATE;
|
||||
|
||||
/*
|
||||
* Activate file-backed executable pages after first usage.
|
||||
* Activate file-backed executable folios after first usage.
|
||||
*/
|
||||
if ((vm_flags & VM_EXEC) && !PageSwapBacked(page))
|
||||
if ((vm_flags & VM_EXEC) && !folio_test_swapbacked(folio))
|
||||
return PAGEREF_ACTIVATE;
|
||||
|
||||
return PAGEREF_KEEP;
|
||||
}
|
||||
|
||||
/* Reclaim if clean, defer dirty pages to writeback */
|
||||
if (referenced_page && !PageSwapBacked(page))
|
||||
/* Reclaim if clean, defer dirty folios to writeback */
|
||||
if (referenced_folio && !folio_test_swapbacked(folio))
|
||||
return PAGEREF_RECLAIM_CLEAN;
|
||||
|
||||
return PAGEREF_RECLAIM;
|
||||
}
|
||||
|
||||
/* Check if a page is dirty or under writeback */
|
||||
static void page_check_dirty_writeback(struct page *page,
|
||||
static void folio_check_dirty_writeback(struct folio *folio,
|
||||
bool *dirty, bool *writeback)
|
||||
{
|
||||
struct address_space *mapping;
|
||||
@ -1428,24 +1434,24 @@ static void page_check_dirty_writeback(struct page *page,
|
||||
* Anonymous pages are not handled by flushers and must be written
|
||||
* from reclaim context. Do not stall reclaim based on them
|
||||
*/
|
||||
if (!page_is_file_lru(page) ||
|
||||
(PageAnon(page) && !PageSwapBacked(page))) {
|
||||
if (!folio_is_file_lru(folio) ||
|
||||
(folio_test_anon(folio) && !folio_test_swapbacked(folio))) {
|
||||
*dirty = false;
|
||||
*writeback = false;
|
||||
return;
|
||||
}
|
||||
|
||||
/* By default assume that the page flags are accurate */
|
||||
*dirty = PageDirty(page);
|
||||
*writeback = PageWriteback(page);
|
||||
/* By default assume that the folio flags are accurate */
|
||||
*dirty = folio_test_dirty(folio);
|
||||
*writeback = folio_test_writeback(folio);
|
||||
|
||||
/* Verify dirty/writeback state if the filesystem supports it */
|
||||
if (!page_has_private(page))
|
||||
if (!folio_test_private(folio))
|
||||
return;
|
||||
|
||||
mapping = page_mapping(page);
|
||||
mapping = folio_mapping(folio);
|
||||
if (mapping && mapping->a_ops->is_dirty_writeback)
|
||||
mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
|
||||
mapping->a_ops->is_dirty_writeback(&folio->page, dirty, writeback);
|
||||
}
|
||||
|
||||
static struct page *alloc_demote_page(struct page *page, unsigned long node)
|
||||
@ -1519,14 +1525,16 @@ static unsigned int shrink_page_list(struct list_head *page_list,
|
||||
while (!list_empty(page_list)) {
|
||||
struct address_space *mapping;
|
||||
struct page *page;
|
||||
struct folio *folio;
|
||||
enum page_references references = PAGEREF_RECLAIM;
|
||||
bool dirty, writeback, may_enter_fs;
|
||||
unsigned int nr_pages;
|
||||
|
||||
cond_resched();
|
||||
|
||||
page = lru_to_page(page_list);
|
||||
list_del(&page->lru);
|
||||
folio = lru_to_folio(page_list);
|
||||
list_del(&folio->lru);
|
||||
page = &folio->page;
|
||||
|
||||
if (!trylock_page(page))
|
||||
goto keep;
|
||||
@ -1552,12 +1560,12 @@ static unsigned int shrink_page_list(struct list_head *page_list,
|
||||
* reclaim_congested. kswapd will stall and start writing
|
||||
* pages if the tail of the LRU is all dirty unqueued pages.
|
||||
*/
|
||||
page_check_dirty_writeback(page, &dirty, &writeback);
|
||||
folio_check_dirty_writeback(folio, &dirty, &writeback);
|
||||
if (dirty || writeback)
|
||||
stat->nr_dirty++;
|
||||
stat->nr_dirty += nr_pages;
|
||||
|
||||
if (dirty && !writeback)
|
||||
stat->nr_unqueued_dirty++;
|
||||
stat->nr_unqueued_dirty += nr_pages;
|
||||
|
||||
/*
|
||||
* Treat this page as congested if the underlying BDI is or if
|
||||
@ -1567,7 +1575,7 @@ static unsigned int shrink_page_list(struct list_head *page_list,
|
||||
*/
|
||||
mapping = page_mapping(page);
|
||||
if (writeback && PageReclaim(page))
|
||||
stat->nr_congested++;
|
||||
stat->nr_congested += nr_pages;
|
||||
|
||||
/*
|
||||
* If a page at the tail of the LRU is under writeback, there
|
||||
@ -1616,7 +1624,7 @@ static unsigned int shrink_page_list(struct list_head *page_list,
|
||||
if (current_is_kswapd() &&
|
||||
PageReclaim(page) &&
|
||||
test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
|
||||
stat->nr_immediate++;
|
||||
stat->nr_immediate += nr_pages;
|
||||
goto activate_locked;
|
||||
|
||||
/* Case 2 above */
|
||||
@ -1634,7 +1642,7 @@ static unsigned int shrink_page_list(struct list_head *page_list,
|
||||
* and it's also appropriate in global reclaim.
|
||||
*/
|
||||
SetPageReclaim(page);
|
||||
stat->nr_writeback++;
|
||||
stat->nr_writeback += nr_pages;
|
||||
goto activate_locked;
|
||||
|
||||
/* Case 3 above */
|
||||
@ -1648,7 +1656,7 @@ static unsigned int shrink_page_list(struct list_head *page_list,
|
||||
}
|
||||
|
||||
if (!ignore_references)
|
||||
references = page_check_references(page, sc);
|
||||
references = folio_check_references(folio, sc);
|
||||
|
||||
switch (references) {
|
||||
case PAGEREF_ACTIVATE:
|
||||
@ -1681,28 +1689,28 @@ static unsigned int shrink_page_list(struct list_head *page_list,
|
||||
if (!PageSwapCache(page)) {
|
||||
if (!(sc->gfp_mask & __GFP_IO))
|
||||
goto keep_locked;
|
||||
if (page_maybe_dma_pinned(page))
|
||||
if (folio_maybe_dma_pinned(folio))
|
||||
goto keep_locked;
|
||||
if (PageTransHuge(page)) {
|
||||
/* cannot split THP, skip it */
|
||||
if (!can_split_huge_page(page, NULL))
|
||||
if (!can_split_folio(folio, NULL))
|
||||
goto activate_locked;
|
||||
/*
|
||||
* Split pages without a PMD map right
|
||||
* away. Chances are some or all of the
|
||||
* tail pages can be freed without IO.
|
||||
*/
|
||||
if (!compound_mapcount(page) &&
|
||||
split_huge_page_to_list(page,
|
||||
page_list))
|
||||
if (!folio_entire_mapcount(folio) &&
|
||||
split_folio_to_list(folio,
|
||||
page_list))
|
||||
goto activate_locked;
|
||||
}
|
||||
if (!add_to_swap(page)) {
|
||||
if (!PageTransHuge(page))
|
||||
goto activate_locked_split;
|
||||
/* Fallback to swap normal pages */
|
||||
if (split_huge_page_to_list(page,
|
||||
page_list))
|
||||
if (split_folio_to_list(folio,
|
||||
page_list))
|
||||
goto activate_locked;
|
||||
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
||||
count_vm_event(THP_SWPOUT_FALLBACK);
|
||||
@ -1716,9 +1724,9 @@ static unsigned int shrink_page_list(struct list_head *page_list,
|
||||
/* Adding to swap updated mapping */
|
||||
mapping = page_mapping(page);
|
||||
}
|
||||
} else if (unlikely(PageTransHuge(page))) {
|
||||
/* Split file THP */
|
||||
if (split_huge_page_to_list(page, page_list))
|
||||
} else if (PageSwapBacked(page) && PageTransHuge(page)) {
|
||||
/* Split shmem THP */
|
||||
if (split_folio_to_list(folio, page_list))
|
||||
goto keep_locked;
|
||||
}
|
||||
|
||||
@ -1742,10 +1750,11 @@ static unsigned int shrink_page_list(struct list_head *page_list,
|
||||
enum ttu_flags flags = TTU_BATCH_FLUSH;
|
||||
bool was_swapbacked = PageSwapBacked(page);
|
||||
|
||||
if (unlikely(PageTransHuge(page)))
|
||||
if (PageTransHuge(page) &&
|
||||
thp_order(page) >= HPAGE_PMD_ORDER)
|
||||
flags |= TTU_SPLIT_HUGE_PMD;
|
||||
|
||||
try_to_unmap(page, flags);
|
||||
try_to_unmap(folio, flags);
|
||||
if (page_mapped(page)) {
|
||||
stat->nr_unmap_fail += nr_pages;
|
||||
if (!was_swapbacked && PageSwapBacked(page))
|
||||
@ -1793,13 +1802,13 @@ static unsigned int shrink_page_list(struct list_head *page_list,
|
||||
* starts and then write it out here.
|
||||
*/
|
||||
try_to_unmap_flush_dirty();
|
||||
switch (pageout(page, mapping)) {
|
||||
switch (pageout(folio, mapping)) {
|
||||
case PAGE_KEEP:
|
||||
goto keep_locked;
|
||||
case PAGE_ACTIVATE:
|
||||
goto activate_locked;
|
||||
case PAGE_SUCCESS:
|
||||
stat->nr_pageout += thp_nr_pages(page);
|
||||
stat->nr_pageout += nr_pages;
|
||||
|
||||
if (PageWriteback(page))
|
||||
goto keep;
|
||||
@ -1877,7 +1886,7 @@ static unsigned int shrink_page_list(struct list_head *page_list,
|
||||
*/
|
||||
count_vm_event(PGLAZYFREED);
|
||||
count_memcg_page_event(page, PGLAZYFREED);
|
||||
} else if (!mapping || !__remove_mapping(mapping, page, true,
|
||||
} else if (!mapping || !__remove_mapping(mapping, folio, true,
|
||||
sc->target_mem_cgroup))
|
||||
goto keep_locked;
|
||||
|
||||
@ -2132,45 +2141,40 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
|
||||
}
|
||||
|
||||
/**
|
||||
* isolate_lru_page - tries to isolate a page from its LRU list
|
||||
* @page: page to isolate from its LRU list
|
||||
* folio_isolate_lru() - Try to isolate a folio from its LRU list.
|
||||
* @folio: Folio to isolate from its LRU list.
|
||||
*
|
||||
* Isolates a @page from an LRU list, clears PageLRU and adjusts the
|
||||
* vmstat statistic corresponding to whatever LRU list the page was on.
|
||||
* Isolate a @folio from an LRU list and adjust the vmstat statistic
|
||||
* corresponding to whatever LRU list the folio was on.
|
||||
*
|
||||
* Returns 0 if the page was removed from an LRU list.
|
||||
* Returns -EBUSY if the page was not on an LRU list.
|
||||
*
|
||||
* The returned page will have PageLRU() cleared. If it was found on
|
||||
* the active list, it will have PageActive set. If it was found on
|
||||
* the unevictable list, it will have the PageUnevictable bit set. That flag
|
||||
* The folio will have its LRU flag cleared. If it was found on the
|
||||
* active list, it will have the Active flag set. If it was found on the
|
||||
* unevictable list, it will have the Unevictable flag set. These flags
|
||||
* may need to be cleared by the caller before letting the page go.
|
||||
*
|
||||
* The vmstat statistic corresponding to the list on which the page was
|
||||
* found will be decremented.
|
||||
*
|
||||
* Restrictions:
|
||||
* Context:
|
||||
*
|
||||
* (1) Must be called with an elevated refcount on the page. This is a
|
||||
* fundamental difference from isolate_lru_pages (which is called
|
||||
* fundamental difference from isolate_lru_pages() (which is called
|
||||
* without a stable reference).
|
||||
* (2) the lru_lock must not be held.
|
||||
* (3) interrupts must be enabled.
|
||||
* (2) The lru_lock must not be held.
|
||||
* (3) Interrupts must be enabled.
|
||||
*
|
||||
* Return: 0 if the folio was removed from an LRU list.
|
||||
* -EBUSY if the folio was not on an LRU list.
|
||||
*/
|
||||
int isolate_lru_page(struct page *page)
|
||||
int folio_isolate_lru(struct folio *folio)
|
||||
{
|
||||
struct folio *folio = page_folio(page);
|
||||
int ret = -EBUSY;
|
||||
|
||||
VM_BUG_ON_PAGE(!page_count(page), page);
|
||||
WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");
|
||||
VM_BUG_ON_FOLIO(!folio_ref_count(folio), folio);
|
||||
|
||||
if (TestClearPageLRU(page)) {
|
||||
if (folio_test_clear_lru(folio)) {
|
||||
struct lruvec *lruvec;
|
||||
|
||||
get_page(page);
|
||||
folio_get(folio);
|
||||
lruvec = folio_lruvec_lock_irq(folio);
|
||||
del_page_from_lru_list(page, lruvec);
|
||||
lruvec_del_folio(lruvec, folio);
|
||||
unlock_page_lruvec_irq(lruvec);
|
||||
ret = 0;
|
||||
}
|
||||
@ -2406,7 +2410,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
|
||||
*
|
||||
* If the pages are mostly unmapped, the processing is fast and it is
|
||||
* appropriate to hold lru_lock across the whole operation. But if
|
||||
* the pages are mapped, the processing is slow (page_referenced()), so
|
||||
* the pages are mapped, the processing is slow (folio_referenced()), so
|
||||
* we should drop lru_lock around each page. It's impossible to balance
|
||||
* this, so instead we remove the pages from the LRU while processing them.
|
||||
* It is safe to rely on PG_active against the non-LRU pages in here because
|
||||
@ -2426,7 +2430,6 @@ static void shrink_active_list(unsigned long nr_to_scan,
|
||||
LIST_HEAD(l_hold); /* The pages which were snipped off */
|
||||
LIST_HEAD(l_active);
|
||||
LIST_HEAD(l_inactive);
|
||||
struct page *page;
|
||||
unsigned nr_deactivate, nr_activate;
|
||||
unsigned nr_rotated = 0;
|
||||
int file = is_file_lru(lru);
|
||||
@ -2448,9 +2451,13 @@ static void shrink_active_list(unsigned long nr_to_scan,
|
||||
spin_unlock_irq(&lruvec->lru_lock);
|
||||
|
||||
while (!list_empty(&l_hold)) {
|
||||
struct folio *folio;
|
||||
struct page *page;
|
||||
|
||||
cond_resched();
|
||||
page = lru_to_page(&l_hold);
|
||||
list_del(&page->lru);
|
||||
folio = lru_to_folio(&l_hold);
|
||||
list_del(&folio->lru);
|
||||
page = &folio->page;
|
||||
|
||||
if (unlikely(!page_evictable(page))) {
|
||||
putback_lru_page(page);
|
||||
@ -2465,8 +2472,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
|
||||
}
|
||||
}
|
||||
|
||||
if (page_referenced(page, 0, sc->target_mem_cgroup,
|
||||
&vm_flags)) {
|
||||
if (folio_referenced(folio, 0, sc->target_mem_cgroup,
|
||||
&vm_flags)) {
|
||||
/*
|
||||
* Identify referenced, file-backed active pages and
|
||||
* give them one more trip around the active list. So
|
||||
|
@ -245,31 +245,32 @@ void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages)
|
||||
}
|
||||
|
||||
/**
|
||||
* workingset_eviction - note the eviction of a page from memory
|
||||
* workingset_eviction - note the eviction of a folio from memory
|
||||
* @target_memcg: the cgroup that is causing the reclaim
|
||||
* @page: the page being evicted
|
||||
* @folio: the folio being evicted
|
||||
*
|
||||
* Return: a shadow entry to be stored in @page->mapping->i_pages in place
|
||||
* of the evicted @page so that a later refault can be detected.
|
||||
* Return: a shadow entry to be stored in @folio->mapping->i_pages in place
|
||||
* of the evicted @folio so that a later refault can be detected.
|
||||
*/
|
||||
void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg)
|
||||
void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg)
|
||||
{
|
||||
struct pglist_data *pgdat = page_pgdat(page);
|
||||
struct pglist_data *pgdat = folio_pgdat(folio);
|
||||
unsigned long eviction;
|
||||
struct lruvec *lruvec;
|
||||
int memcgid;
|
||||
|
||||
/* Page is fully exclusive and pins page's memory cgroup pointer */
|
||||
VM_BUG_ON_PAGE(PageLRU(page), page);
|
||||
VM_BUG_ON_PAGE(page_count(page), page);
|
||||
VM_BUG_ON_PAGE(!PageLocked(page), page);
|
||||
/* Folio is fully exclusive and pins folio's memory cgroup pointer */
|
||||
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
|
||||
VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
|
||||
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
|
||||
|
||||
lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
|
||||
/* XXX: target_memcg can be NULL, go through lruvec */
|
||||
memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
|
||||
eviction = atomic_long_read(&lruvec->nonresident_age);
|
||||
workingset_age_nonresident(lruvec, thp_nr_pages(page));
|
||||
return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
|
||||
workingset_age_nonresident(lruvec, folio_nr_pages(folio));
|
||||
return pack_shadow(memcgid, pgdat, eviction,
|
||||
folio_test_workingset(folio));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -26,15 +26,17 @@
|
||||
#define PAGEMAP_PFN(ent) ((ent) & ((1ull << 55) - 1))
|
||||
|
||||
int pagemap_fd;
|
||||
int backing_fd = -1;
|
||||
int mmap_flags = MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE;
|
||||
#define PROT_RW (PROT_READ | PROT_WRITE)
|
||||
|
||||
int64_t allocate_transhuge(void *ptr)
|
||||
{
|
||||
uint64_t ent[2];
|
||||
|
||||
/* drop pmd */
|
||||
if (mmap(ptr, HPAGE_SIZE, PROT_READ | PROT_WRITE,
|
||||
MAP_FIXED | MAP_ANONYMOUS |
|
||||
MAP_NORESERVE | MAP_PRIVATE, -1, 0) != ptr)
|
||||
if (mmap(ptr, HPAGE_SIZE, PROT_RW, MAP_FIXED | mmap_flags,
|
||||
backing_fd, 0) != ptr)
|
||||
errx(2, "mmap transhuge");
|
||||
|
||||
if (madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE))
|
||||
@ -60,6 +62,8 @@ int main(int argc, char **argv)
|
||||
size_t ram, len;
|
||||
void *ptr, *p;
|
||||
struct timespec a, b;
|
||||
int i = 0;
|
||||
char *name = NULL;
|
||||
double s;
|
||||
uint8_t *map;
|
||||
size_t map_len;
|
||||
@ -69,13 +73,23 @@ int main(int argc, char **argv)
|
||||
ram = SIZE_MAX / 4;
|
||||
else
|
||||
ram *= sysconf(_SC_PAGESIZE);
|
||||
len = ram;
|
||||
|
||||
if (argc == 1)
|
||||
len = ram;
|
||||
else if (!strcmp(argv[1], "-h"))
|
||||
errx(1, "usage: %s [size in MiB]", argv[0]);
|
||||
else
|
||||
len = atoll(argv[1]) << 20;
|
||||
while (++i < argc) {
|
||||
if (!strcmp(argv[i], "-h"))
|
||||
errx(1, "usage: %s [size in MiB]", argv[0]);
|
||||
else if (!strcmp(argv[i], "-f"))
|
||||
name = argv[++i];
|
||||
else
|
||||
len = atoll(argv[i]) << 20;
|
||||
}
|
||||
|
||||
if (name) {
|
||||
backing_fd = open(name, O_RDWR);
|
||||
if (backing_fd == -1)
|
||||
errx(2, "open %s", name);
|
||||
mmap_flags = MAP_SHARED;
|
||||
}
|
||||
|
||||
warnx("allocate %zd transhuge pages, using %zd MiB virtual memory"
|
||||
" and %zd MiB of ram", len >> HPAGE_SHIFT, len >> 20,
|
||||
@ -86,8 +100,7 @@ int main(int argc, char **argv)
|
||||
err(2, "open pagemap");
|
||||
|
||||
len -= len % HPAGE_SIZE;
|
||||
ptr = mmap(NULL, len + HPAGE_SIZE, PROT_READ | PROT_WRITE,
|
||||
MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE, -1, 0);
|
||||
ptr = mmap(NULL, len + HPAGE_SIZE, PROT_RW, mmap_flags, backing_fd, 0);
|
||||
if (ptr == MAP_FAILED)
|
||||
err(2, "initial mmap");
|
||||
ptr += HPAGE_SIZE - (uintptr_t)ptr % HPAGE_SIZE;
|
||||
|
Loading…
Reference in New Issue
Block a user