diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 8977a4e8a7f8..febbd2b165df 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -179,6 +179,67 @@ static inline struct uffd_msg userfault_msg(unsigned long address, return msg; } +/* + * Verify the pagetables are still not ok after having registered into + * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any + * userfault that has already been resolved, if userfaultfd_read and + * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different + * threads. + */ +static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, + unsigned long address, + unsigned long flags, + unsigned long reason) +{ + struct mm_struct *mm = ctx->mm; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd, _pmd; + pte_t *pte; + bool ret = true; + + VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + goto out; + pmd = pmd_offset(pud, address); + /* + * READ_ONCE must function as a barrier with narrower scope + * and it must be equivalent to: + * _pmd = *pmd; barrier(); + * + * This is to deal with the instability (as in + * pmd_trans_unstable) of the pmd. + */ + _pmd = READ_ONCE(*pmd); + if (!pmd_present(_pmd)) + goto out; + + ret = false; + if (pmd_trans_huge(_pmd)) + goto out; + + /* + * the pmd is stable (as in !pmd_trans_unstable) so we can re-read it + * and use the standard pte_offset_map() instead of parsing _pmd. + */ + pte = pte_offset_map(pmd, address); + /* + * Lockless access: we're in a wait_event so it's ok if it + * changes under us. 
+ */ + if (pte_none(*pte)) + ret = true; + pte_unmap(pte); + +out: + return ret; +} + /* * The locking rules involved in returning VM_FAULT_RETRY depending on * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and @@ -201,6 +262,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, struct userfaultfd_ctx *ctx; struct userfaultfd_wait_queue uwq; int ret; + bool must_wait; BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); @@ -260,9 +322,6 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, /* take the reference before dropping the mmap_sem */ userfaultfd_ctx_get(ctx); - /* be gentle and immediately relinquish the mmap_sem */ - up_read(&mm->mmap_sem); - init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); uwq.wq.private = current; uwq.msg = userfault_msg(address, flags, reason); @@ -282,7 +341,10 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, set_current_state(TASK_KILLABLE); spin_unlock(&ctx->fault_pending_wqh.lock); - if (likely(!ACCESS_ONCE(ctx->released) && + must_wait = userfaultfd_must_wait(ctx, address, flags, reason); + up_read(&mm->mmap_sem); + + if (likely(must_wait && !ACCESS_ONCE(ctx->released) && !fatal_signal_pending(current))) { wake_up_poll(&ctx->fd_wqh, POLLIN); schedule(); @@ -886,17 +948,6 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, } /* - * userfaultfd_wake is needed in case an userfault is in flight by the - * time a UFFDIO_COPY (or other ioctl variants) completes. The page - * may be well get mapped and the page fault if repeated wouldn't lead - * to a userfault anymore, but before scheduling in TASK_KILLABLE mode - * handle_userfault() doesn't recheck the pagetables and it doesn't - * serialize against UFFDO_COPY (or other ioctl variants). 
Ultimately - * the knowledge of which pages are mapped is left to userland who is - * responsible for handling the race between read() userfaults and - * background UFFDIO_COPY (or other ioctl variants), if done by - * separate concurrent threads. - * * userfaultfd_wake may be used in combination with the * UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches. */