diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h index 763929e814e9..1e700468a685 100644 --- a/arch/alpha/include/uapi/asm/mman.h +++ b/arch/alpha/include/uapi/asm/mman.h @@ -78,6 +78,9 @@ #define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */ +#define MADV_GUARD_INSTALL 102 /* fatal signal on access to range */ +#define MADV_GUARD_REMOVE 103 /* unguard range */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h index 9c48d9a21aa0..b700dae28c48 100644 --- a/arch/mips/include/uapi/asm/mman.h +++ b/arch/mips/include/uapi/asm/mman.h @@ -105,6 +105,9 @@ #define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */ +#define MADV_GUARD_INSTALL 102 /* fatal signal on access to range */ +#define MADV_GUARD_REMOVE 103 /* unguard range */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h index 68c44f99bc93..b6a709506987 100644 --- a/arch/parisc/include/uapi/asm/mman.h +++ b/arch/parisc/include/uapi/asm/mman.h @@ -75,6 +75,9 @@ #define MADV_HWPOISON 100 /* poison a page for testing */ #define MADV_SOFT_OFFLINE 101 /* soft offline page for testing */ +#define MADV_GUARD_INSTALL 102 /* fatal signal on access to range */ +#define MADV_GUARD_REMOVE 103 /* unguard range */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h index 1ff0c858544f..99d4ccee7f6e 100644 --- a/arch/xtensa/include/uapi/asm/mman.h +++ b/arch/xtensa/include/uapi/asm/mman.h @@ -113,6 +113,9 @@ #define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */ +#define MADV_GUARD_INSTALL 102 /* fatal signal on access to range */ +#define MADV_GUARD_REMOVE 103 /* unguard range */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 6ce1f1ceb432..1ea2c4c33b86 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -79,6 +79,9 @@ #define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */ +#define MADV_GUARD_INSTALL 102 /* fatal signal on access to range */ +#define MADV_GUARD_REMOVE 103 /* unguard range */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/mm/madvise.c b/mm/madvise.c index e871a72a6c32..0ceae57da7da 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -37,6 +37,12 @@ #include "internal.h" #include "swap.h" +/* + * Maximum number of attempts we make to install guard pages before we give up + * and return -ERESTARTNOINTR to have userspace try again. + */ +#define MAX_MADVISE_GUARD_RETRIES 3 + struct madvise_walk_private { struct mmu_gather *tlb; bool pageout; @@ -60,6 +66,8 @@ static int madvise_need_mmap_write(int behavior) case MADV_POPULATE_READ: case MADV_POPULATE_WRITE: case MADV_COLLAPSE: + case MADV_GUARD_INSTALL: + case MADV_GUARD_REMOVE: return 0; default: /* be safe, default to 1. list exceptions explicitly */ @@ -1017,6 +1025,214 @@ static long madvise_remove(struct vm_area_struct *vma, return error; } +static bool is_valid_guard_vma(struct vm_area_struct *vma, bool allow_locked) +{ + vm_flags_t disallowed = VM_SPECIAL | VM_HUGETLB; + + /* + * A user could lock after setting a guard range but that's fine, as + * they'd not be able to fault in. The issue arises when we try to zap + * existing locked VMAs. We don't want to do that. + */ + if (!allow_locked) + disallowed |= VM_LOCKED; + + if (!vma_is_anonymous(vma)) + return false; + + if ((vma->vm_flags & (VM_MAYWRITE | disallowed)) != VM_MAYWRITE) + return false; + + return true; +} + +static bool is_guard_pte_marker(pte_t ptent) +{ + return is_pte_marker(ptent) && + is_guard_swp_entry(pte_to_swp_entry(ptent)); +} + +static int guard_install_pud_entry(pud_t *pud, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pud_t pudval = pudp_get(pud); + + /* If huge return >0 so we abort the operation + zap. */ + return pud_trans_huge(pudval) || pud_devmap(pudval); +} + +static int guard_install_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pmd_t pmdval = pmdp_get(pmd); + + /* If huge return >0 so we abort the operation + zap. */ + return pmd_trans_huge(pmdval) || pmd_devmap(pmdval); +} + +static int guard_install_pte_entry(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pte_t pteval = ptep_get(pte); + unsigned long *nr_pages = (unsigned long *)walk->private; + + /* If there is already a guard page marker, we have nothing to do. */ + if (is_guard_pte_marker(pteval)) { + (*nr_pages)++; + + return 0; + } + + /* If populated return >0 so we abort the operation + zap. */ + return 1; +} + +static int guard_install_set_pte(unsigned long addr, unsigned long next, + pte_t *ptep, struct mm_walk *walk) +{ + unsigned long *nr_pages = (unsigned long *)walk->private; + + /* Simply install a PTE marker, this causes segfault on access. */ + *ptep = make_pte_marker(PTE_MARKER_GUARD); + (*nr_pages)++; + + return 0; +} + +static const struct mm_walk_ops guard_install_walk_ops = { + .pud_entry = guard_install_pud_entry, + .pmd_entry = guard_install_pmd_entry, + .pte_entry = guard_install_pte_entry, + .install_pte = guard_install_set_pte, + .walk_lock = PGWALK_RDLOCK, +}; + +static long madvise_guard_install(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end) +{ + long err; + int i; + + *prev = vma; + if (!is_valid_guard_vma(vma, /* allow_locked = */false)) + return -EINVAL; + + /* + * If we install guard markers, then the range is no longer + * empty from a page table perspective and therefore it's + * appropriate to have an anon_vma. + * + * This ensures that on fork, we copy page tables correctly. + */ + err = anon_vma_prepare(vma); + if (err) + return err; + + /* + * Optimistically try to install the guard marker pages first. If any + * non-guard pages are encountered, give up and zap the range before + * trying again. + * + * We try a few times before giving up and releasing back to userland to + * loop around, releasing locks in the process to avoid contention. This + * would only happen if there was a great many racing page faults. + * + * In most cases we should simply install the guard markers immediately + * with no zap or looping. + */ + for (i = 0; i < MAX_MADVISE_GUARD_RETRIES; i++) { + unsigned long nr_pages = 0; + + /* Returns < 0 on error, == 0 if success, > 0 if zap needed. */ + err = walk_page_range_mm(vma->vm_mm, start, end, + &guard_install_walk_ops, &nr_pages); + if (err < 0) + return err; + + if (err == 0) { + unsigned long nr_expected_pages = PHYS_PFN(end - start); + + VM_WARN_ON(nr_pages != nr_expected_pages); + return 0; + } + + /* + * OK some of the range have non-guard pages mapped, zap + * them. This leaves existing guard pages in place. + */ + zap_page_range_single(vma, start, end - start, NULL); + } + + /* + * We were unable to install the guard pages due to being raced by page + * faults. This should not happen ordinarily. We return to userspace and + * immediately retry, relieving lock contention. + */ + return restart_syscall(); +} + +static int guard_remove_pud_entry(pud_t *pud, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pud_t pudval = pudp_get(pud); + + /* If huge, cannot have guard pages present, so no-op - skip. */ + if (pud_trans_huge(pudval) || pud_devmap(pudval)) + walk->action = ACTION_CONTINUE; + + return 0; +} + +static int guard_remove_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pmd_t pmdval = pmdp_get(pmd); + + /* If huge, cannot have guard pages present, so no-op - skip. */ + if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval)) + walk->action = ACTION_CONTINUE; + + return 0; +} + +static int guard_remove_pte_entry(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pte_t ptent = ptep_get(pte); + + if (is_guard_pte_marker(ptent)) { + /* Simply clear the PTE marker. */ + pte_clear_not_present_full(walk->mm, addr, pte, false); + update_mmu_cache(walk->vma, addr, pte); + } + + return 0; +} + +static const struct mm_walk_ops guard_remove_walk_ops = { + .pud_entry = guard_remove_pud_entry, + .pmd_entry = guard_remove_pmd_entry, + .pte_entry = guard_remove_pte_entry, + .walk_lock = PGWALK_RDLOCK, +}; + +static long madvise_guard_remove(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end) +{ + *prev = vma; + /* + * We're ok with removing guards in mlock()'d ranges, as this is a + * non-destructive action. + */ + if (!is_valid_guard_vma(vma, /* allow_locked = */true)) + return -EINVAL; + + return walk_page_range(vma->vm_mm, start, end, + &guard_remove_walk_ops, NULL); +} + /* * Apply an madvise behavior to a region of a vma. madvise_update_vma * will handle splitting a vm area into separate areas, each area with its own @@ -1098,6 +1314,10 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, break; case MADV_COLLAPSE: return madvise_collapse(vma, prev, start, end); + case MADV_GUARD_INSTALL: + return madvise_guard_install(vma, prev, start, end); + case MADV_GUARD_REMOVE: + return madvise_guard_remove(vma, prev, start, end); } anon_name = anon_vma_name(vma); @@ -1197,6 +1417,8 @@ madvise_behavior_valid(int behavior) case MADV_DODUMP: case MADV_WIPEONFORK: case MADV_KEEPONFORK: + case MADV_GUARD_INSTALL: + case MADV_GUARD_REMOVE: #ifdef CONFIG_MEMORY_FAILURE case MADV_SOFT_OFFLINE: case MADV_HWPOISON: @@ -1490,6 +1712,23 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter, while (iov_iter_count(iter)) { ret = do_madvise(mm, (unsigned long)iter_iov_addr(iter), iter_iov_len(iter), behavior); + /* + * An madvise operation is attempting to restart the syscall, + * but we cannot proceed as it would not be correct to repeat + * the operation in aggregate, and would be surprising to the + * user. + * + * As we have already dropped locks, it is safe to just loop and + * try again. We check for fatal signals in case we need exit + * early anyway. + */ + if (ret == -ERESTARTNOINTR) { + if (fatal_signal_pending(current)) { + ret = -EINTR; + break; + } + continue; + } if (ret < 0) break; iov_iter_advance(iter, iter_iov_len(iter)); diff --git a/mm/mseal.c b/mm/mseal.c index ece977bd21e1..81d6e980e8a9 100644 --- a/mm/mseal.c +++ b/mm/mseal.c @@ -30,6 +30,7 @@ static bool is_madv_discard(int behavior) case MADV_REMOVE: case MADV_DONTFORK: case MADV_WIPEONFORK: + case MADV_GUARD_INSTALL: return true; }