mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2025-01-01 02:36:02 +00:00
bd225530a4
While investigating HVO for THPs [1], it turns out that speculative PFN walkers like compaction can race with vmemmap modifications, e.g., CPU 1 (vmemmap modifier) CPU 2 (speculative PFN walker) ------------------------------- ------------------------------ Allocates an LRU folio page1 Sees page1 Frees page1 Allocates a hugeTLB folio page2 (page1 being a tail of page2) Updates vmemmap mapping page1 get_page_unless_zero(page1) Even though page1->_refcount is zero after HVO, get_page_unless_zero() can still try to modify this read-only field, resulting in a crash. An independent report [2] confirmed this race. There are two discussed approaches to fix this race: 1. Make RO vmemmap RW so that get_page_unless_zero() can fail without triggering a PF. 2. Use RCU to make sure get_page_unless_zero() either sees zero page->_refcount through the old vmemmap or non-zero page->_refcount through the new one. The second approach is preferred here because: 1. It can prevent illegal modifications to struct page[] that has been HVO'ed; 2. It can be generalized, in a way similar to ZERO_PAGE(), to fix similar races in other places, e.g., arch_remove_memory() on x86 [3], which frees vmemmap mapping offlined struct page[]. While adding synchronize_rcu(), the goal is to be surgical, rather than optimized. Specifically, calls to synchronize_rcu() on the error handling paths can be coalesced, but it is not done for the sake of Simplicity: noticeably, this fix removes ~50% more lines than it adds. According to the hugetlb_optimize_vmemmap section in Documentation/admin-guide/sysctl/vm.rst, enabling HVO makes allocating or freeing hugeTLB pages "~2x slower than before". Having synchronize_rcu() on top makes those operations even worse, and this also affects the user interface /proc/sys/vm/nr_overcommit_hugepages. This is *very* hard to trigger: 1. Most hugeTLB use cases I know of are static, i.e., reserved at boot time, because allocating at runtime is not reliable at all. 2. 
On top of that, someone has to be very unlucky to trip over the race above, because the race window is so small -- I wasn't able to trigger it with a stress test that does nothing but that (with THPs though). [1] https://lore.kernel.org/20240229183436.4110845-4-yuzhao@google.com/ [2] https://lore.kernel.org/917FFC7F-0615-44DD-90EE-9F85F8EA9974@linux.dev/ [3] https://lore.kernel.org/be130a96-a27e-4240-ad78-776802f57cad@redhat.com/ Link: https://lkml.kernel.org/r/20240627222705.2974207-1-yuzhao@google.com Signed-off-by: Yu Zhao <yuzhao@google.com> Acked-by: Muchun Song <muchun.song@linux.dev> Cc: David Hildenbrand <david@redhat.com> Cc: Frank van der Linden <fvdl@google.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Peter Xu <peterx@redhat.com> Cc: Yang Shi <yang@os.amperecomputing.com> Cc: Yu Zhao <yuzhao@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
301 lines
7.6 KiB
C
301 lines
7.6 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef _LINUX_PAGE_REF_H
|
|
#define _LINUX_PAGE_REF_H
|
|
|
|
#include <linux/atomic.h>
|
|
#include <linux/mm_types.h>
|
|
#include <linux/page-flags.h>
|
|
#include <linux/tracepoint-defs.h>
|
|
|
|
/*
 * Tracepoints for page reference count changes.  The helpers in this
 * header test the matching tracepoint with page_ref_tracepoint_active()
 * before calling the corresponding __page_ref_*() hook.
 */
DECLARE_TRACEPOINT(page_ref_set);
|
|
DECLARE_TRACEPOINT(page_ref_mod);
|
|
DECLARE_TRACEPOINT(page_ref_mod_and_test);
|
|
DECLARE_TRACEPOINT(page_ref_mod_and_return);
|
|
DECLARE_TRACEPOINT(page_ref_mod_unless);
|
|
DECLARE_TRACEPOINT(page_ref_freeze);
|
|
DECLARE_TRACEPOINT(page_ref_unfreeze);
|
|
|
|
#ifdef CONFIG_DEBUG_PAGE_REF
|
|
|
|
/*
 * The trace_<tracepoint>_enabled() helper functions would be the natural
 * way to ask whether a tracepoint is active, but header include issues
 * make them unusable here.  The check goes through tracepoint_enabled()
 * instead.
 *
 * See trace_##name##_enabled(void) in include/linux/tracepoint.h
 */
#define page_ref_tracepoint_active(tp) tracepoint_enabled(tp)
|
|
|
|
/*
 * Out-of-line tracing hooks, defined elsewhere.  The inline helpers in
 * this header call them only after page_ref_tracepoint_active() has
 * reported the matching tracepoint as active.
 */
void __page_ref_set(struct page *page, int v);
void __page_ref_mod(struct page *page, int v);
void __page_ref_mod_and_test(struct page *page, int v, int ret);
void __page_ref_mod_and_return(struct page *page, int v, int ret);
void __page_ref_mod_unless(struct page *page, int v, int u);
void __page_ref_freeze(struct page *page, int v, int ret);
void __page_ref_unfreeze(struct page *page, int v);
|
|
|
|
#else
|
|
|
|
/* Without CONFIG_DEBUG_PAGE_REF no page_ref tracepoint is ever active. */
#define page_ref_tracepoint_active(tp) false
|
|
|
|
/* Stub for builds without CONFIG_DEBUG_PAGE_REF: does nothing. */
static inline void __page_ref_set(struct page *page, int v)
{
	/* Intentionally empty. */
}
|
|
/* Stub for builds without CONFIG_DEBUG_PAGE_REF: does nothing. */
static inline void __page_ref_mod(struct page *page, int v)
{
	/* Intentionally empty. */
}
|
|
/* Stub for builds without CONFIG_DEBUG_PAGE_REF: does nothing. */
static inline void __page_ref_mod_and_test(struct page *page, int v, int ret)
{
	/* Intentionally empty. */
}
|
|
/* Stub for builds without CONFIG_DEBUG_PAGE_REF: does nothing. */
static inline void __page_ref_mod_and_return(struct page *page, int v, int ret)
{
	/* Intentionally empty. */
}
|
|
/* Stub for builds without CONFIG_DEBUG_PAGE_REF: does nothing. */
static inline void __page_ref_mod_unless(struct page *page, int v, int u)
{
	/* Intentionally empty. */
}
|
|
/* Stub for builds without CONFIG_DEBUG_PAGE_REF: does nothing. */
static inline void __page_ref_freeze(struct page *page, int v, int ret)
{
	/* Intentionally empty. */
}
|
|
/* Stub for builds without CONFIG_DEBUG_PAGE_REF: does nothing. */
static inline void __page_ref_unfreeze(struct page *page, int v)
{
	/* Intentionally empty. */
}
|
|
|
|
#endif
|
|
|
|
static inline int page_ref_count(const struct page *page)
|
|
{
|
|
return atomic_read(&page->_refcount);
|
|
}
|
|
|
|
/**
|
|
* folio_ref_count - The reference count on this folio.
|
|
* @folio: The folio.
|
|
*
|
|
* The refcount is usually incremented by calls to folio_get() and
|
|
* decremented by calls to folio_put(). Some typical users of the
|
|
* folio refcount:
|
|
*
|
|
* - Each reference from a page table
|
|
* - The page cache
|
|
* - Filesystem private data
|
|
* - The LRU list
|
|
* - Pipes
|
|
* - Direct IO which references this page in the process address space
|
|
*
|
|
* Return: The number of references to this folio.
|
|
*/
|
|
static inline int folio_ref_count(const struct folio *folio)
|
|
{
|
|
return page_ref_count(&folio->page);
|
|
}
|
|
|
|
/* Reference count of the folio that page_folio() resolves @page to. */
static inline int page_count(const struct page *page)
{
	const struct folio *folio = page_folio(page);

	return folio_ref_count(folio);
}
|
|
|
|
static inline void set_page_count(struct page *page, int v)
|
|
{
|
|
atomic_set(&page->_refcount, v);
|
|
if (page_ref_tracepoint_active(page_ref_set))
|
|
__page_ref_set(page, v);
|
|
}
|
|
|
|
static inline void folio_set_count(struct folio *folio, int v)
|
|
{
|
|
set_page_count(&folio->page, v);
|
|
}
|
|
|
|
/*
 * Give a page its initial reference count of one before it is freed into
 * the page allocator for the first time (at boot or on memory hotplug).
 */
static inline void init_page_count(struct page *page)
{
	const int initial_refs = 1;

	set_page_count(page, initial_refs);
}
|
|
|
|
static inline void page_ref_add(struct page *page, int nr)
|
|
{
|
|
atomic_add(nr, &page->_refcount);
|
|
if (page_ref_tracepoint_active(page_ref_mod))
|
|
__page_ref_mod(page, nr);
|
|
}
|
|
|
|
static inline void folio_ref_add(struct folio *folio, int nr)
|
|
{
|
|
page_ref_add(&folio->page, nr);
|
|
}
|
|
|
|
static inline void page_ref_sub(struct page *page, int nr)
|
|
{
|
|
atomic_sub(nr, &page->_refcount);
|
|
if (page_ref_tracepoint_active(page_ref_mod))
|
|
__page_ref_mod(page, -nr);
|
|
}
|
|
|
|
static inline void folio_ref_sub(struct folio *folio, int nr)
|
|
{
|
|
page_ref_sub(&folio->page, nr);
|
|
}
|
|
|
|
static inline int folio_ref_sub_return(struct folio *folio, int nr)
|
|
{
|
|
int ret = atomic_sub_return(nr, &folio->_refcount);
|
|
|
|
if (page_ref_tracepoint_active(page_ref_mod_and_return))
|
|
__page_ref_mod_and_return(&folio->page, -nr, ret);
|
|
return ret;
|
|
}
|
|
|
|
static inline void page_ref_inc(struct page *page)
|
|
{
|
|
atomic_inc(&page->_refcount);
|
|
if (page_ref_tracepoint_active(page_ref_mod))
|
|
__page_ref_mod(page, 1);
|
|
}
|
|
|
|
static inline void folio_ref_inc(struct folio *folio)
|
|
{
|
|
page_ref_inc(&folio->page);
|
|
}
|
|
|
|
static inline void page_ref_dec(struct page *page)
|
|
{
|
|
atomic_dec(&page->_refcount);
|
|
if (page_ref_tracepoint_active(page_ref_mod))
|
|
__page_ref_mod(page, -1);
|
|
}
|
|
|
|
static inline void folio_ref_dec(struct folio *folio)
|
|
{
|
|
page_ref_dec(&folio->page);
|
|
}
|
|
|
|
static inline int page_ref_sub_and_test(struct page *page, int nr)
|
|
{
|
|
int ret = atomic_sub_and_test(nr, &page->_refcount);
|
|
|
|
if (page_ref_tracepoint_active(page_ref_mod_and_test))
|
|
__page_ref_mod_and_test(page, -nr, ret);
|
|
return ret;
|
|
}
|
|
|
|
static inline int folio_ref_sub_and_test(struct folio *folio, int nr)
|
|
{
|
|
return page_ref_sub_and_test(&folio->page, nr);
|
|
}
|
|
|
|
static inline int page_ref_inc_return(struct page *page)
|
|
{
|
|
int ret = atomic_inc_return(&page->_refcount);
|
|
|
|
if (page_ref_tracepoint_active(page_ref_mod_and_return))
|
|
__page_ref_mod_and_return(page, 1, ret);
|
|
return ret;
|
|
}
|
|
|
|
static inline int folio_ref_inc_return(struct folio *folio)
|
|
{
|
|
return page_ref_inc_return(&folio->page);
|
|
}
|
|
|
|
static inline int page_ref_dec_and_test(struct page *page)
|
|
{
|
|
int ret = atomic_dec_and_test(&page->_refcount);
|
|
|
|
if (page_ref_tracepoint_active(page_ref_mod_and_test))
|
|
__page_ref_mod_and_test(page, -1, ret);
|
|
return ret;
|
|
}
|
|
|
|
static inline int folio_ref_dec_and_test(struct folio *folio)
|
|
{
|
|
return page_ref_dec_and_test(&folio->page);
|
|
}
|
|
|
|
static inline int page_ref_dec_return(struct page *page)
|
|
{
|
|
int ret = atomic_dec_return(&page->_refcount);
|
|
|
|
if (page_ref_tracepoint_active(page_ref_mod_and_return))
|
|
__page_ref_mod_and_return(page, -1, ret);
|
|
return ret;
|
|
}
|
|
|
|
static inline int folio_ref_dec_return(struct folio *folio)
|
|
{
|
|
return page_ref_dec_return(&folio->page);
|
|
}
|
|
|
|
static inline bool page_ref_add_unless(struct page *page, int nr, int u)
|
|
{
|
|
bool ret = false;
|
|
|
|
rcu_read_lock();
|
|
/* avoid writing to the vmemmap area being remapped */
|
|
if (!page_is_fake_head(page) && page_ref_count(page) != u)
|
|
ret = atomic_add_unless(&page->_refcount, nr, u);
|
|
rcu_read_unlock();
|
|
|
|
if (page_ref_tracepoint_active(page_ref_mod_unless))
|
|
__page_ref_mod_unless(page, nr, ret);
|
|
return ret;
|
|
}
|
|
|
|
static inline bool folio_ref_add_unless(struct folio *folio, int nr, int u)
|
|
{
|
|
return page_ref_add_unless(&folio->page, nr, u);
|
|
}
|
|
|
|
/**
|
|
* folio_try_get - Attempt to increase the refcount on a folio.
|
|
* @folio: The folio.
|
|
*
|
|
* If you do not already have a reference to a folio, you can attempt to
|
|
* get one using this function. It may fail if, for example, the folio
|
|
* has been freed since you found a pointer to it, or it is frozen for
|
|
* the purposes of splitting or migration.
|
|
*
|
|
* Return: True if the reference count was successfully incremented.
|
|
*/
|
|
static inline bool folio_try_get(struct folio *folio)
|
|
{
|
|
return folio_ref_add_unless(folio, 1, 0);
|
|
}
|
|
|
|
/*
 * Try to add @count references to @folio; the attempt is refused when the
 * refcount is currently zero.  Returns the result of
 * folio_ref_add_unless().
 */
static inline bool folio_ref_try_add(struct folio *folio, int count)
{
	bool added = folio_ref_add_unless(folio, count, 0);

	return added;
}
|
|
|
|
static inline int page_ref_freeze(struct page *page, int count)
|
|
{
|
|
int ret = likely(atomic_cmpxchg(&page->_refcount, count, 0) == count);
|
|
|
|
if (page_ref_tracepoint_active(page_ref_freeze))
|
|
__page_ref_freeze(page, count, ret);
|
|
return ret;
|
|
}
|
|
|
|
static inline int folio_ref_freeze(struct folio *folio, int count)
|
|
{
|
|
return page_ref_freeze(&folio->page, count);
|
|
}
|
|
|
|
static inline void page_ref_unfreeze(struct page *page, int count)
|
|
{
|
|
VM_BUG_ON_PAGE(page_count(page) != 0, page);
|
|
VM_BUG_ON(count == 0);
|
|
|
|
atomic_set_release(&page->_refcount, count);
|
|
if (page_ref_tracepoint_active(page_ref_unfreeze))
|
|
__page_ref_unfreeze(page, count);
|
|
}
|
|
|
|
static inline void folio_ref_unfreeze(struct folio *folio, int count)
|
|
{
|
|
page_ref_unfreeze(&folio->page, count);
|
|
}
|
|
#endif
|