mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2025-01-16 10:17:32 +00:00
aa39ca6940
We want to get rid of follow_page(), and have a more reasonable way to just lookup a folio mapped at a certain address, perform some checks while still under PTL, and then only conditionally grab a folio reference if really required. Further, we might want to get rid of some walk_page_range*() users that really only want to temporarily lookup a single folio at a single address. So let's add a new page table walker that does exactly that, similarly to GUP also being able to walk hugetlb VMAs. Add folio_walk_end() as a macro for now: the compiler is not easy to please with the pte_unmap()->kunmap_local(). Note that one difference between follow_page() and get_user_pages(1) is that follow_page() will not trigger faults to get something mapped. So folio_walk is at least currently not a replacement for get_user_pages(1), but could likely be extended/reused to achieve something similar in the future. Link: https://lkml.kernel.org/r/20240802155524.517137-3-david@redhat.com Signed-off-by: David Hildenbrand <david@redhat.com> Cc: Alexander Gordeev <agordeev@linux.ibm.com> Cc: Christian Borntraeger <borntraeger@linux.ibm.com> Cc: Claudio Imbrenda <imbrenda@linux.ibm.com> Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com> Cc: Heiko Carstens <hca@linux.ibm.com> Cc: Janosch Frank <frankja@linux.ibm.com> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Matthew Wilcox <willy@infradead.org> Cc: Sven Schnelle <svens@linux.ibm.com> Cc: Vasily Gorbik <gor@linux.ibm.com> Cc: Ryan Roberts <ryan.roberts@arm.com> Cc: Zi Yan <ziy@nvidia.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
192 lines
6.6 KiB
C
192 lines
6.6 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef _LINUX_PAGEWALK_H
|
|
#define _LINUX_PAGEWALK_H
|
|
|
|
#include <linux/mm.h>
|
|
|
|
/* Forward declaration: the mm_walk_ops callbacks take a struct mm_walk pointer. */
struct mm_walk;
|
|
|
|
/*
 * Locking requirement during a page walk.
 *
 * Selected per callback table through mm_walk_ops::walk_lock.
 */
enum page_walk_lock {
	/* mmap_lock should be locked for read to stabilize the vma tree */
	PGWALK_RDLOCK = 0,
	/* vma will be write-locked during the walk */
	PGWALK_WRLOCK = 1,
	/* vma is expected to be already write-locked during the walk */
	PGWALK_WRLOCK_VERIFY = 2,
};
|
|
|
|
/**
|
|
* struct mm_walk_ops - callbacks for walk_page_range
|
|
* @pgd_entry: if set, called for each non-empty PGD (top-level) entry
|
|
* @p4d_entry: if set, called for each non-empty P4D entry
|
|
* @pud_entry: if set, called for each non-empty PUD entry
|
|
* @pmd_entry: if set, called for each non-empty PMD entry
|
|
* this handler is required to be able to handle
|
|
* pmd_trans_huge() pmds. They may simply choose to
|
|
* split_huge_page() instead of handling it explicitly.
|
|
* @pte_entry: if set, called for each PTE (lowest-level) entry,
|
|
* including empty ones
|
|
* @pte_hole: if set, called for each hole at all levels,
|
|
* depth is -1 if not known, 0:PGD, 1:P4D, 2:PUD, 3:PMD.
|
|
* Any folded depths (where PTRS_PER_P?D is equal to 1)
|
|
* are skipped.
|
|
* @hugetlb_entry: if set, called for each hugetlb entry. This hook
|
|
* function is called with the vma lock held, in order to
|
|
* protect against a concurrent freeing of the pte_t* or
|
|
* the ptl. In some cases, the hook function needs to drop
|
|
* and retake the vma lock in order to avoid deadlocks
|
|
* while calling other functions. In such cases the hook
|
|
* function must either refrain from accessing the pte or
|
|
* ptl after dropping the vma lock, or else revalidate
|
|
* those items after re-acquiring the vma lock and before
|
|
* accessing them.
|
|
* @test_walk: caller specific callback function to determine whether
|
|
* we walk over the current vma or not. Returning 0 means
|
|
* "do page table walk over the current vma", returning
|
|
* a negative value means "abort current page table walk
|
|
* right now" and returning 1 means "skip the current vma"
|
|
* Note that this callback is not called when the caller
|
|
* passes in a single VMA as for walk_page_vma().
|
|
* @pre_vma: if set, called before starting walk on a non-null vma.
|
|
* @post_vma: if set, called after a walk on a non-null vma, provided
|
|
* that @pre_vma and the vma walk succeeded.
|
|
*
|
|
* p?d_entry callbacks are called even if those levels are folded on a
|
|
* particular architecture/configuration.
|
|
*/
|
|
struct mm_walk_ops {
|
|
int (*pgd_entry)(pgd_t *pgd, unsigned long addr,
|
|
unsigned long next, struct mm_walk *walk);
|
|
int (*p4d_entry)(p4d_t *p4d, unsigned long addr,
|
|
unsigned long next, struct mm_walk *walk);
|
|
int (*pud_entry)(pud_t *pud, unsigned long addr,
|
|
unsigned long next, struct mm_walk *walk);
|
|
int (*pmd_entry)(pmd_t *pmd, unsigned long addr,
|
|
unsigned long next, struct mm_walk *walk);
|
|
int (*pte_entry)(pte_t *pte, unsigned long addr,
|
|
unsigned long next, struct mm_walk *walk);
|
|
int (*pte_hole)(unsigned long addr, unsigned long next,
|
|
int depth, struct mm_walk *walk);
|
|
int (*hugetlb_entry)(pte_t *pte, unsigned long hmask,
|
|
unsigned long addr, unsigned long next,
|
|
struct mm_walk *walk);
|
|
int (*test_walk)(unsigned long addr, unsigned long next,
|
|
struct mm_walk *walk);
|
|
int (*pre_vma)(unsigned long start, unsigned long end,
|
|
struct mm_walk *walk);
|
|
void (*post_vma)(struct mm_walk *walk);
|
|
enum page_walk_lock walk_lock;
|
|
};
|
|
|
|
/*
 * Action for pud_entry / pmd_entry callbacks.
 * ACTION_SUBTREE is the default
 *
 * The chosen value is carried in mm_walk::action.
 */
enum page_walk_action {
	/* Descend to next level, splitting huge pages if needed and possible */
	ACTION_SUBTREE = 0,
	/* Continue to next entry at this level (ignoring any subtree) */
	ACTION_CONTINUE = 1,
	/* Call again for this entry */
	ACTION_AGAIN = 2
};
|
|
|
|
/**
|
|
* struct mm_walk - walk_page_range data
|
|
* @ops: operation to call during the walk
|
|
* @mm: mm_struct representing the target process of page table walk
|
|
* @pgd: pointer to PGD; only valid with no_vma (otherwise set to NULL)
|
|
* @vma: vma currently walked (NULL if walking outside vmas)
|
|
* @action: next action to perform (see enum page_walk_action)
|
|
* @no_vma: walk ignoring vmas (vma will always be NULL)
|
|
* @private: private data for callbacks' usage
|
|
*
|
|
* (see the comment on walk_page_range() for more details)
|
|
*/
|
|
struct mm_walk {
|
|
const struct mm_walk_ops *ops;
|
|
struct mm_struct *mm;
|
|
pgd_t *pgd;
|
|
struct vm_area_struct *vma;
|
|
enum page_walk_action action;
|
|
bool no_vma;
|
|
void *private;
|
|
};
|
|
|
|
int walk_page_range(struct mm_struct *mm, unsigned long start,
|
|
unsigned long end, const struct mm_walk_ops *ops,
|
|
void *private);
|
|
int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
|
|
unsigned long end, const struct mm_walk_ops *ops,
|
|
pgd_t *pgd,
|
|
void *private);
|
|
int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
|
|
unsigned long end, const struct mm_walk_ops *ops,
|
|
void *private);
|
|
int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
|
|
void *private);
|
|
int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
|
|
pgoff_t nr, const struct mm_walk_ops *ops,
|
|
void *private);
|
|
|
|
/* Flag type taken by folio_walk_start(); combine the FW_* bits below. */
typedef int __bitwise folio_walk_flags_t;

/*
 * Walk migration entries as well. Careful: a large folio might get split
 * concurrently.
 */
#define FW_MIGRATION			((__force folio_walk_flags_t)BIT(0))

/* Walk shared zeropages (small + huge) as well. */
#define FW_ZEROPAGE			((__force folio_walk_flags_t)BIT(1))
|
|
|
|
/*
 * Page table level identifying the entry type of a folio walk; stored in
 * folio_walk::level and checked by folio_walk_end().
 */
enum folio_walk_level {
	FW_LEVEL_PTE,
	FW_LEVEL_PMD,
	FW_LEVEL_PUD,
};
|
|
|
|
/**
|
|
* struct folio_walk - folio_walk_start() / folio_walk_end() data
|
|
* @page: exact folio page referenced (if applicable)
|
|
* @level: page table level identifying the entry type
|
|
* @pte: pointer to the page table entry (FW_LEVEL_PTE).
|
|
* @pmd: pointer to the page table entry (FW_LEVEL_PMD).
|
|
* @pud: pointer to the page table entry (FW_LEVEL_PUD).
|
|
* @ptl: pointer to the page table lock.
|
|
*
|
|
* (see folio_walk_start() documentation for more details)
|
|
*/
|
|
struct folio_walk {
|
|
/* public */
|
|
struct page *page;
|
|
enum folio_walk_level level;
|
|
union {
|
|
pte_t *ptep;
|
|
pud_t *pudp;
|
|
pmd_t *pmdp;
|
|
};
|
|
union {
|
|
pte_t pte;
|
|
pud_t pud;
|
|
pmd_t pmd;
|
|
};
|
|
/* private */
|
|
struct vm_area_struct *vma;
|
|
spinlock_t *ptl;
|
|
};
|
|
|
|
struct folio *folio_walk_start(struct folio_walk *fw,
|
|
struct vm_area_struct *vma, unsigned long addr,
|
|
folio_walk_flags_t flags);
|
|
|
|
/*
 * folio_walk_end - finish a walk described by a struct folio_walk
 * @__fw:	pointer to the struct folio_walk
 * @__vma:	VMA handed to vma_pgtable_walk_end()
 *
 * Drops the page table lock (fw->ptl), unmaps the PTE when the walk ended at
 * FW_LEVEL_PTE, and finally calls vma_pgtable_walk_end() on @__vma.
 *
 * This stays a macro rather than an inline function because the compiler is
 * not easy to please with the pte_unmap()->kunmap_local() expansion. @__fw is
 * captured in a local first so that an argument with side effects is
 * evaluated exactly once.
 */
#define folio_walk_end(__fw, __vma) do { \
	const struct folio_walk *___fw = (__fw); \
	spin_unlock(___fw->ptl); \
	if (likely(___fw->level == FW_LEVEL_PTE)) \
		pte_unmap(___fw->ptep); \
	vma_pgtable_walk_end(__vma); \
} while (0)
|
|
|
|
#endif /* _LINUX_PAGEWALK_H */
|