mm/madvise: clean up pte_offset_map_lock() scans

Came here to make madvise's several pte_offset_map_lock() scans advance to
the next extent on failure, and to remove superfluous pmd_trans_unstable()
and pmd_none_or_trans_huge_or_clear_bad() calls.  But also did some nearby
cleanup.

swapin_walk_pmd_entry(): don't name an address "index"; don't drop the
lock after every pte, only when calling out to read_swap_cache_async().

madvise_cold_or_pageout_pte_range() and madvise_free_pte_range(): prefer
"start_pte" for the pointer, since orig_pte usually denotes a saved pte
value; leave lazy MMU mode before unlocking; merge the success and failure
paths after split_folio().

Link: https://lkml.kernel.org/r/cc4d9a88-9da6-362-50d9-6735c2b125c6@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Song Liu <song@kernel.org>
Cc: Steven Price <steven.price@arm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Cc: Will Deacon <will@kernel.org>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: Zack Rusin <zackr@vmware.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
This commit is contained in:
Hugh Dickins 2023-06-08 18:34:03 -07:00 committed by Andrew Morton
parent a5be621ee2
commit f3cd4ab0aa

View File

@ -188,37 +188,43 @@ static int madvise_update_vma(struct vm_area_struct *vma,
#ifdef CONFIG_SWAP
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
unsigned long end, struct mm_walk *walk)
unsigned long end, struct mm_walk *walk)
{
struct vm_area_struct *vma = walk->private;
unsigned long index;
struct swap_iocb *splug = NULL;
pte_t *ptep = NULL;
spinlock_t *ptl;
unsigned long addr;
if (pmd_none_or_trans_huge_or_clear_bad(pmd))
return 0;
for (index = start; index != end; index += PAGE_SIZE) {
for (addr = start; addr < end; addr += PAGE_SIZE) {
pte_t pte;
swp_entry_t entry;
struct page *page;
spinlock_t *ptl;
pte_t *ptep;
ptep = pte_offset_map_lock(vma->vm_mm, pmd, index, &ptl);
if (!ptep++) {
ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
if (!ptep)
break;
}
pte = *ptep;
pte_unmap_unlock(ptep, ptl);
if (!is_swap_pte(pte))
continue;
entry = pte_to_swp_entry(pte);
if (unlikely(non_swap_entry(entry)))
continue;
pte_unmap_unlock(ptep, ptl);
ptep = NULL;
page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
vma, index, false, &splug);
vma, addr, false, &splug);
if (page)
put_page(page);
}
if (ptep)
pte_unmap_unlock(ptep, ptl);
swap_read_unplug(splug);
cond_resched();
@ -340,7 +346,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
bool pageout = private->pageout;
struct mm_struct *mm = tlb->mm;
struct vm_area_struct *vma = walk->vma;
pte_t *orig_pte, *pte, ptent;
pte_t *start_pte, *pte, ptent;
spinlock_t *ptl;
struct folio *folio = NULL;
LIST_HEAD(folio_list);
@ -422,11 +428,11 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
}
regular_folio:
if (pmd_trans_unstable(pmd))
return 0;
#endif
tlb_change_page_size(tlb, PAGE_SIZE);
orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
if (!start_pte)
return 0;
flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
for (; addr < end; pte++, addr += PAGE_SIZE) {
@ -447,25 +453,28 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
* are sure it's worth. Split it if we are only owner.
*/
if (folio_test_large(folio)) {
int err;
if (folio_mapcount(folio) != 1)
break;
if (pageout_anon_only_filter && !folio_test_anon(folio))
break;
if (!folio_trylock(folio))
break;
folio_get(folio);
if (!folio_trylock(folio)) {
folio_put(folio);
break;
}
pte_unmap_unlock(orig_pte, ptl);
if (split_folio(folio)) {
folio_unlock(folio);
folio_put(folio);
orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
break;
}
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(start_pte, ptl);
start_pte = NULL;
err = split_folio(folio);
folio_unlock(folio);
folio_put(folio);
orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
if (err)
break;
start_pte = pte =
pte_offset_map_lock(mm, pmd, addr, &ptl);
if (!start_pte)
break;
arch_enter_lazy_mmu_mode();
pte--;
addr -= PAGE_SIZE;
continue;
@ -510,8 +519,10 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
folio_deactivate(folio);
}
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(orig_pte, ptl);
if (start_pte) {
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(start_pte, ptl);
}
if (pageout)
reclaim_pages(&folio_list);
cond_resched();
@ -612,7 +623,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
struct mm_struct *mm = tlb->mm;
struct vm_area_struct *vma = walk->vma;
spinlock_t *ptl;
pte_t *orig_pte, *pte, ptent;
pte_t *start_pte, *pte, ptent;
struct folio *folio;
int nr_swap = 0;
unsigned long next;
@ -620,13 +631,12 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
next = pmd_addr_end(addr, end);
if (pmd_trans_huge(*pmd))
if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
goto next;
if (pmd_trans_unstable(pmd))
return 0;
return 0;
tlb_change_page_size(tlb, PAGE_SIZE);
orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
if (!start_pte)
return 0;
flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
for (; addr != end; pte++, addr += PAGE_SIZE) {
@ -664,23 +674,26 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
* deactivate all pages.
*/
if (folio_test_large(folio)) {
int err;
if (folio_mapcount(folio) != 1)
goto out;
break;
if (!folio_trylock(folio))
break;
folio_get(folio);
if (!folio_trylock(folio)) {
folio_put(folio);
goto out;
}
pte_unmap_unlock(orig_pte, ptl);
if (split_folio(folio)) {
folio_unlock(folio);
folio_put(folio);
orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
goto out;
}
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(start_pte, ptl);
start_pte = NULL;
err = split_folio(folio);
folio_unlock(folio);
folio_put(folio);
orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
if (err)
break;
start_pte = pte =
pte_offset_map_lock(mm, pmd, addr, &ptl);
if (!start_pte)
break;
arch_enter_lazy_mmu_mode();
pte--;
addr -= PAGE_SIZE;
continue;
@ -725,17 +738,18 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
}
folio_mark_lazyfree(folio);
}
out:
if (nr_swap) {
if (current->mm == mm)
sync_mm_rss(mm);
add_mm_counter(mm, MM_SWAPENTS, nr_swap);
}
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(orig_pte, ptl);
if (start_pte) {
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(start_pte, ptl);
}
cond_resched();
next:
return 0;
}