mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
synced 2025-01-17 05:45:20 +00:00
mm/gup: retire follow_hugetlb_page()
Now __get_user_pages() should be well prepared to handle thp completely, as well as hugetlb gup requests, even without the hugetlb's special path. Time to retire follow_hugetlb_page(). Tweak misc comments to reflect reality of follow_hugetlb_page()'s removal. Link: https://lkml.kernel.org/r/20230628215310.73782-7-peterx@redhat.com Signed-off-by: Peter Xu <peterx@redhat.com> Acked-by: David Hildenbrand <david@redhat.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Hugh Dickins <hughd@google.com> Cc: James Houghton <jthoughton@google.com> Cc: Jason Gunthorpe <jgg@nvidia.com> Cc: John Hubbard <jhubbard@nvidia.com> Cc: Kirill A . Shutemov <kirill@shutemov.name> Cc: Lorenzo Stoakes <lstoakes@gmail.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Mike Rapoport (IBM) <rppt@kernel.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Yang Shi <shy828301@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
This commit is contained in:
parent
57edfcfd34
commit
4849807114
@ -427,7 +427,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
|
||||
*
|
||||
* We also don't do userfault handling during
|
||||
* coredumping. hugetlbfs has the special
|
||||
* follow_hugetlb_page() to skip missing pages in the
|
||||
* hugetlb_follow_page_mask() to skip missing pages in the
|
||||
* FOLL_DUMP case, anon memory also checks for FOLL_DUMP with
|
||||
* the no_page_table() helper in follow_page_mask(), but the
|
||||
* shmem_vm_ops->fault method is invoked even during
|
||||
|
@ -133,9 +133,6 @@ int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *,
|
||||
struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
|
||||
unsigned long address, unsigned int flags,
|
||||
unsigned int *page_mask);
|
||||
long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
|
||||
struct page **, unsigned long *, unsigned long *,
|
||||
long, unsigned int, int *);
|
||||
void unmap_hugepage_range(struct vm_area_struct *,
|
||||
unsigned long, unsigned long, struct page *,
|
||||
zap_flags_t);
|
||||
@ -305,15 +302,6 @@ static inline struct page *hugetlb_follow_page_mask(
|
||||
BUILD_BUG(); /* should never be compiled in if !CONFIG_HUGETLB_PAGE*/
|
||||
}
|
||||
|
||||
/*
 * Placeholder follow_hugetlb_page() that must never actually run: it
 * triggers BUG(), and the trailing "return 0" exists only to satisfy the
 * declared return type.  The sibling stub just above is annotated for
 * !CONFIG_HUGETLB_PAGE builds, so this one presumably serves the same
 * configuration.
 */
static inline long
follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
		    struct page **pages, unsigned long *position,
		    unsigned long *nr_pages, long i, unsigned int flags,
		    int *nonblocking)
{
	BUG();
	return 0;
}
|
||||
|
||||
static inline int copy_hugetlb_page_range(struct mm_struct *dst,
|
||||
struct mm_struct *src,
|
||||
struct vm_area_struct *dst_vma,
|
||||
|
19
mm/gup.c
19
mm/gup.c
@ -819,9 +819,6 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
|
||||
* Call hugetlb_follow_page_mask for hugetlb vmas as it will use
|
||||
* special hugetlb page table walking code. This eliminates the
|
||||
* need to check for hugetlb entries in the general walking code.
|
||||
*
|
||||
* hugetlb_follow_page_mask is only for follow_page() handling here.
|
||||
* Ordinary GUP uses follow_hugetlb_page for hugetlb processing.
|
||||
*/
|
||||
if (is_vm_hugetlb_page(vma))
|
||||
return hugetlb_follow_page_mask(vma, address, flags,
|
||||
@ -1221,22 +1218,6 @@ static long __get_user_pages(struct mm_struct *mm,
|
||||
ret = check_vma_flags(vma, gup_flags);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
if (is_vm_hugetlb_page(vma)) {
|
||||
i = follow_hugetlb_page(mm, vma, pages,
|
||||
&start, &nr_pages, i,
|
||||
gup_flags, locked);
|
||||
if (!*locked) {
|
||||
/*
|
||||
* We've got a VM_FAULT_RETRY
|
||||
* and we've lost mmap_lock.
|
||||
* We must stop here.
|
||||
*/
|
||||
BUG_ON(gup_flags & FOLL_NOWAIT);
|
||||
goto out;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
retry:
|
||||
/*
|
||||
|
224
mm/hugetlb.c
224
mm/hugetlb.c
@ -5721,7 +5721,6 @@ out_release_old:
|
||||
|
||||
/*
|
||||
* Return whether there is a pagecache page to back given address within VMA.
|
||||
* Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
|
||||
*/
|
||||
static bool hugetlbfs_pagecache_present(struct hstate *h,
|
||||
struct vm_area_struct *vma, unsigned long address)
|
||||
@ -6422,37 +6421,6 @@ out_release_nounlock:
|
||||
}
|
||||
#endif /* CONFIG_USERFAULTFD */
|
||||
|
||||
/*
 * Store into @pages[0 .. @refs - 1] the @refs consecutive pages that
 * start at @page.  Nothing is stored when @pages is NULL.  @vma is
 * accepted but not used by this helper.
 */
static void record_subpages(struct page *page, struct vm_area_struct *vma,
			    int refs, struct page **pages)
{
	int idx = 0;

	/* The array pointer cannot change while we loop: test it once. */
	if (!likely(pages))
		return;

	while (idx < refs) {
		pages[idx] = nth_page(page, idx);
		idx++;
	}
}
|
||||
|
||||
/*
 * Decide whether the huge PTE at @pte has to go through the fault path
 * before it can be followed with @flags.
 *
 * Returns true for a swap-type entry, for a write request (FOLL_WRITE)
 * on an entry that is not writable, or when gup_must_unshare() asks for
 * it; only in that last case is *@unshare set to true.  In every other
 * case *@unshare is false on return.
 */
static inline bool __follow_hugetlb_must_fault(struct vm_area_struct *vma,
					       unsigned int flags, pte_t *pte,
					       bool *unshare)
{
	pte_t entry = huge_ptep_get(pte);

	*unshare = false;

	/* Swap-type entries cannot be followed directly. */
	if (is_swap_pte(entry))
		return true;

	/* A writable mapping satisfies both read and write requests. */
	if (huge_pte_write(entry))
		return false;

	/* Not writable, but the caller wants to write. */
	if (flags & FOLL_WRITE)
		return true;

	/* Read-only access: fault only if the page must be unshared. */
	*unshare = gup_must_unshare(vma, flags, pte_page(entry));
	return *unshare;
}
|
||||
|
||||
struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
|
||||
unsigned long address, unsigned int flags,
|
||||
unsigned int *page_mask)
|
||||
@ -6524,198 +6492,6 @@ out_unlock:
|
||||
return page;
|
||||
}
|
||||
|
||||
long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
struct page **pages, unsigned long *position,
|
||||
unsigned long *nr_pages, long i, unsigned int flags,
|
||||
int *locked)
|
||||
{
|
||||
unsigned long pfn_offset;
|
||||
unsigned long vaddr = *position;
|
||||
unsigned long remainder = *nr_pages;
|
||||
struct hstate *h = hstate_vma(vma);
|
||||
int err = -EFAULT, refs;
|
||||
|
||||
while (vaddr < vma->vm_end && remainder) {
|
||||
pte_t *pte;
|
||||
spinlock_t *ptl = NULL;
|
||||
bool unshare = false;
|
||||
int absent;
|
||||
struct page *page;
|
||||
|
||||
/*
|
||||
* If we have a pending SIGKILL, don't keep faulting pages and
|
||||
* potentially allocating memory.
|
||||
*/
|
||||
if (fatal_signal_pending(current)) {
|
||||
remainder = 0;
|
||||
break;
|
||||
}
|
||||
|
||||
hugetlb_vma_lock_read(vma);
|
||||
/*
|
||||
* Some archs (sparc64, sh*) have multiple pte_ts to
|
||||
* each hugepage. We have to make sure we get the
|
||||
* first, for the page indexing below to work.
|
||||
*
|
||||
* Note that page table lock is not held when pte is null.
|
||||
*/
|
||||
pte = hugetlb_walk(vma, vaddr & huge_page_mask(h),
|
||||
huge_page_size(h));
|
||||
if (pte)
|
||||
ptl = huge_pte_lock(h, mm, pte);
|
||||
absent = !pte || huge_pte_none(huge_ptep_get(pte));
|
||||
|
||||
/*
|
||||
* When coredumping, it suits get_dump_page if we just return
|
||||
* an error where there's an empty slot with no huge pagecache
|
||||
* to back it. This way, we avoid allocating a hugepage, and
|
||||
* the sparse dumpfile avoids allocating disk blocks, but its
|
||||
* huge holes still show up with zeroes where they need to be.
|
||||
*/
|
||||
if (absent && (flags & FOLL_DUMP) &&
|
||||
!hugetlbfs_pagecache_present(h, vma, vaddr)) {
|
||||
if (pte)
|
||||
spin_unlock(ptl);
|
||||
hugetlb_vma_unlock_read(vma);
|
||||
remainder = 0;
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* We need call hugetlb_fault for both hugepages under migration
|
||||
* (in which case hugetlb_fault waits for the migration,) and
|
||||
* hwpoisoned hugepages (in which case we need to prevent the
|
||||
* caller from accessing to them.) In order to do this, we use
|
||||
* here is_swap_pte instead of is_hugetlb_entry_migration and
|
||||
* is_hugetlb_entry_hwpoisoned. This is because it simply covers
|
||||
* both cases, and because we can't follow correct pages
|
||||
* directly from any kind of swap entries.
|
||||
*/
|
||||
if (absent ||
|
||||
__follow_hugetlb_must_fault(vma, flags, pte, &unshare)) {
|
||||
vm_fault_t ret;
|
||||
unsigned int fault_flags = 0;
|
||||
|
||||
if (pte)
|
||||
spin_unlock(ptl);
|
||||
hugetlb_vma_unlock_read(vma);
|
||||
|
||||
if (flags & FOLL_WRITE)
|
||||
fault_flags |= FAULT_FLAG_WRITE;
|
||||
else if (unshare)
|
||||
fault_flags |= FAULT_FLAG_UNSHARE;
|
||||
if (locked) {
|
||||
fault_flags |= FAULT_FLAG_ALLOW_RETRY |
|
||||
FAULT_FLAG_KILLABLE;
|
||||
if (flags & FOLL_INTERRUPTIBLE)
|
||||
fault_flags |= FAULT_FLAG_INTERRUPTIBLE;
|
||||
}
|
||||
if (flags & FOLL_NOWAIT)
|
||||
fault_flags |= FAULT_FLAG_ALLOW_RETRY |
|
||||
FAULT_FLAG_RETRY_NOWAIT;
|
||||
if (flags & FOLL_TRIED) {
|
||||
/*
|
||||
* Note: FAULT_FLAG_ALLOW_RETRY and
|
||||
* FAULT_FLAG_TRIED can co-exist
|
||||
*/
|
||||
fault_flags |= FAULT_FLAG_TRIED;
|
||||
}
|
||||
ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
|
||||
if (ret & VM_FAULT_ERROR) {
|
||||
err = vm_fault_to_errno(ret, flags);
|
||||
remainder = 0;
|
||||
break;
|
||||
}
|
||||
if (ret & VM_FAULT_RETRY) {
|
||||
if (locked &&
|
||||
!(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
|
||||
*locked = 0;
|
||||
*nr_pages = 0;
|
||||
/*
|
||||
* VM_FAULT_RETRY must not return an
|
||||
* error, it will return zero
|
||||
* instead.
|
||||
*
|
||||
* No need to update "position" as the
|
||||
* caller will not check it after
|
||||
* *nr_pages is set to 0.
|
||||
*/
|
||||
return i;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
|
||||
page = pte_page(huge_ptep_get(pte));
|
||||
|
||||
VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
|
||||
!PageAnonExclusive(page), page);
|
||||
|
||||
/*
|
||||
* If subpage information not requested, update counters
|
||||
* and skip the same_page loop below.
|
||||
*/
|
||||
if (!pages && !pfn_offset &&
|
||||
(vaddr + huge_page_size(h) < vma->vm_end) &&
|
||||
(remainder >= pages_per_huge_page(h))) {
|
||||
vaddr += huge_page_size(h);
|
||||
remainder -= pages_per_huge_page(h);
|
||||
i += pages_per_huge_page(h);
|
||||
spin_unlock(ptl);
|
||||
hugetlb_vma_unlock_read(vma);
|
||||
continue;
|
||||
}
|
||||
|
||||
/* vaddr may not be aligned to PAGE_SIZE */
|
||||
refs = min3(pages_per_huge_page(h) - pfn_offset, remainder,
|
||||
(vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT);
|
||||
|
||||
if (pages)
|
||||
record_subpages(nth_page(page, pfn_offset),
|
||||
vma, refs,
|
||||
likely(pages) ? pages + i : NULL);
|
||||
|
||||
if (pages) {
|
||||
/*
|
||||
* try_grab_folio() should always succeed here,
|
||||
* because: a) we hold the ptl lock, and b) we've just
|
||||
* checked that the huge page is present in the page
|
||||
* tables. If the huge page is present, then the tail
|
||||
* pages must also be present. The ptl prevents the
|
||||
* head page and tail pages from being rearranged in
|
||||
* any way. As this is hugetlb, the pages will never
|
||||
* be p2pdma or not longterm pinable. So this page
|
||||
* must be available at this point, unless the page
|
||||
* refcount overflowed:
|
||||
*/
|
||||
if (WARN_ON_ONCE(!try_grab_folio(pages[i], refs,
|
||||
flags))) {
|
||||
spin_unlock(ptl);
|
||||
hugetlb_vma_unlock_read(vma);
|
||||
remainder = 0;
|
||||
err = -ENOMEM;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
vaddr += (refs << PAGE_SHIFT);
|
||||
remainder -= refs;
|
||||
i += refs;
|
||||
|
||||
spin_unlock(ptl);
|
||||
hugetlb_vma_unlock_read(vma);
|
||||
}
|
||||
*nr_pages = remainder;
|
||||
/*
|
||||
* setting position is actually required only if remainder is
|
||||
* not zero but it's faster not to add a "if (remainder)"
|
||||
* branch.
|
||||
*/
|
||||
*position = vaddr;
|
||||
|
||||
return i ? i : err;
|
||||
}
|
||||
|
||||
long hugetlb_change_protection(struct vm_area_struct *vma,
|
||||
unsigned long address, unsigned long end,
|
||||
pgprot_t newprot, unsigned long cp_flags)
|
||||
|
Loading…
x
Reference in New Issue
Block a user