linux-next/kernel/events/ring_buffer.c
Lorenzo Stoakes b709eb872e perf: map pages in advance
We are adjusting struct page to make it smaller, removing unneeded fields
which correctly belong to struct folio.

Two of those fields are page->index and page->mapping. Perf is currently
making use of both of these. This is unnecessary. This patch eliminates
this.

Perf establishes its own internally controlled memory-mapped pages using
vm_ops hooks. The first page in the mapping is the read/write user control
page, and the rest of the mapping consists of read-only pages.

The VMA is backed by kernel memory either from the buddy allocator or
vmalloc depending on configuration. It is intended to be mapped read/write,
but because it has a page_mkwrite() hook, vma_wants_writenotify() indicates
that it should be mapped read-only.

When a write fault occurs, the provided page_mkwrite() hook,
perf_mmap_fault() (doing double duty handing faults as well) uses the
vmf->pgoff field to determine if this is the first page, allowing for the
desired read/write first page, read-only rest mapping.

For this to work the implementation has to carefully work around faulting
logic. When a page is write-faulted, the fault() hook is called first, then
its page_mkwrite() hook is called (to allow for dirty tracking in file
systems).

On fault we set the folio's mapping in perf_mmap_fault(), this is because
when do_page_mkwrite() is subsequently invoked, it treats a missing mapping
as an indicator that the fault should be retried.

We also set the folio's index so, given the folio is being treated as faux
user memory, it correctly references its offset within the VMA.

This explains why the mapping and index fields are used - but it's not
necessary.

We preallocate pages when perf_mmap() is called for the first time via
rb_alloc(), and further allocate auxiliary pages via rb_aux_alloc() as
needed if the mapping requires it.

This allocation is done in the f_ops->mmap() hook provided in perf_mmap(),
and so we can instead simply map all the memory right away here - there's
no point in handling (read) page faults when we don't demand page nor need
to be notified about them (perf does not).

This patch therefore changes this logic to map everything when the mmap()
hook is called, establishing a PFN map. It implements vm_ops->pfn_mkwrite()
to provide the required read/write vs. read-only behaviour, which does not
require the previously implemented workarounds.

While it is not ideal to use a VM_PFNMAP here, doing anything else will
result in the page_mkwrite() hook need to be provided, which requires the
same page->mapping hack this patch seeks to undo.

It will also result in the pages being treated as folios and placed on the
rmap, which really does not make sense for these mappings.

Semantically it makes sense to establish this as some kind of special
mapping, as the pages are managed by perf and are not strictly user pages,
but currently the only means by which we can do so functionally while
maintaining the required R/W and R/O behaviour is a PFN map.

There should be no change to actual functionality as a result of this
change.

Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20250103153151.124163-1-lorenzo.stoakes@oracle.com
2025-01-10 18:16:50 +01:00

964 lines
23 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* Performance events ring-buffer code:
*
* Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
* Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
* Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
* Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
*/
#include <linux/perf_event.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/circ_buf.h>
#include <linux/poll.h>
#include <linux/nospec.h>
#include "internal.h"
static void perf_output_wakeup(struct perf_output_handle *handle)
{
atomic_set(&handle->rb->poll, EPOLLIN);
handle->event->pending_wakeup = 1;
if (*perf_event_fasync(handle->event) && !handle->event->pending_kill)
handle->event->pending_kill = POLL_IN;
irq_work_queue(&handle->event->pending_irq);
}
/*
* We need to ensure a later event_id doesn't publish a head when a former
* event isn't done writing. However since we need to deal with NMIs we
* cannot fully serialize things.
*
* We only publish the head (and generate a wakeup) when the outer-most
* event completes.
*/
static void perf_output_get_handle(struct perf_output_handle *handle)
{
struct perf_buffer *rb = handle->rb;
preempt_disable();
/*
* Avoid an explicit LOAD/STORE such that architectures with memops
* can use them.
*/
(*(volatile unsigned int *)&rb->nest)++;
handle->wakeup = local_read(&rb->wakeup);
}
static void perf_output_put_handle(struct perf_output_handle *handle)
{
struct perf_buffer *rb = handle->rb;
unsigned long head;
unsigned int nest;
/*
* If this isn't the outermost nesting, we don't have to update
* @rb->user_page->data_head.
*/
nest = READ_ONCE(rb->nest);
if (nest > 1) {
WRITE_ONCE(rb->nest, nest - 1);
goto out;
}
again:
/*
* In order to avoid publishing a head value that goes backwards,
* we must ensure the load of @rb->head happens after we've
* incremented @rb->nest.
*
* Otherwise we can observe a @rb->head value before one published
* by an IRQ/NMI happening between the load and the increment.
*/
barrier();
head = local_read(&rb->head);
/*
* IRQ/NMI can happen here and advance @rb->head, causing our
* load above to be stale.
*/
/*
* Since the mmap() consumer (userspace) can run on a different CPU:
*
* kernel user
*
* if (LOAD ->data_tail) { LOAD ->data_head
* (A) smp_rmb() (C)
* STORE $data LOAD $data
* smp_wmb() (B) smp_mb() (D)
* STORE ->data_head STORE ->data_tail
* }
*
* Where A pairs with D, and B pairs with C.
*
* In our case (A) is a control dependency that separates the load of
* the ->data_tail and the stores of $data. In case ->data_tail
* indicates there is no room in the buffer to store $data we do not.
*
* D needs to be a full barrier since it separates the data READ
* from the tail WRITE.
*
* For B a WMB is sufficient since it separates two WRITEs, and for C
* an RMB is sufficient since it separates two READs.
*
* See perf_output_begin().
*/
smp_wmb(); /* B, matches C */
WRITE_ONCE(rb->user_page->data_head, head);
/*
* We must publish the head before decrementing the nest count,
* otherwise an IRQ/NMI can publish a more recent head value and our
* write will (temporarily) publish a stale value.
*/
barrier();
WRITE_ONCE(rb->nest, 0);
/*
* Ensure we decrement @rb->nest before we validate the @rb->head.
* Otherwise we cannot be sure we caught the 'last' nested update.
*/
barrier();
if (unlikely(head != local_read(&rb->head))) {
WRITE_ONCE(rb->nest, 1);
goto again;
}
if (handle->wakeup != local_read(&rb->wakeup))
perf_output_wakeup(handle);
out:
preempt_enable();
}
static __always_inline bool
ring_buffer_has_space(unsigned long head, unsigned long tail,
unsigned long data_size, unsigned int size,
bool backward)
{
if (!backward)
return CIRC_SPACE(head, tail, data_size) >= size;
else
return CIRC_SPACE(tail, head, data_size) >= size;
}
static __always_inline int
__perf_output_begin(struct perf_output_handle *handle,
struct perf_sample_data *data,
struct perf_event *event, unsigned int size,
bool backward)
{
struct perf_buffer *rb;
unsigned long tail, offset, head;
int have_lost, page_shift;
struct {
struct perf_event_header header;
u64 id;
u64 lost;
} lost_event;
rcu_read_lock();
/*
* For inherited events we send all the output towards the parent.
*/
if (event->parent)
event = event->parent;
rb = rcu_dereference(event->rb);
if (unlikely(!rb))
goto out;
if (unlikely(rb->paused)) {
if (rb->nr_pages) {
local_inc(&rb->lost);
atomic64_inc(&event->lost_samples);
}
goto out;
}
handle->rb = rb;
handle->event = event;
have_lost = local_read(&rb->lost);
if (unlikely(have_lost)) {
size += sizeof(lost_event);
if (event->attr.sample_id_all)
size += event->id_header_size;
}
perf_output_get_handle(handle);
offset = local_read(&rb->head);
do {
head = offset;
tail = READ_ONCE(rb->user_page->data_tail);
if (!rb->overwrite) {
if (unlikely(!ring_buffer_has_space(head, tail,
perf_data_size(rb),
size, backward)))
goto fail;
}
/*
* The above forms a control dependency barrier separating the
* @tail load above from the data stores below. Since the @tail
* load is required to compute the branch to fail below.
*
* A, matches D; the full memory barrier userspace SHOULD issue
* after reading the data and before storing the new tail
* position.
*
* See perf_output_put_handle().
*/
if (!backward)
head += size;
else
head -= size;
} while (!local_try_cmpxchg(&rb->head, &offset, head));
if (backward) {
offset = head;
head = (u64)(-head);
}
/*
* We rely on the implied barrier() by local_cmpxchg() to ensure
* none of the data stores below can be lifted up by the compiler.
*/
if (unlikely(head - local_read(&rb->wakeup) > rb->watermark))
local_add(rb->watermark, &rb->wakeup);
page_shift = PAGE_SHIFT + page_order(rb);
handle->page = (offset >> page_shift) & (rb->nr_pages - 1);
offset &= (1UL << page_shift) - 1;
handle->addr = rb->data_pages[handle->page] + offset;
handle->size = (1UL << page_shift) - offset;
if (unlikely(have_lost)) {
lost_event.header.size = sizeof(lost_event);
lost_event.header.type = PERF_RECORD_LOST;
lost_event.header.misc = 0;
lost_event.id = event->id;
lost_event.lost = local_xchg(&rb->lost, 0);
/* XXX mostly redundant; @data is already fully initializes */
perf_event_header__init_id(&lost_event.header, data, event);
perf_output_put(handle, lost_event);
perf_event__output_id_sample(event, handle, data);
}
return 0;
fail:
local_inc(&rb->lost);
atomic64_inc(&event->lost_samples);
perf_output_put_handle(handle);
out:
rcu_read_unlock();
return -ENOSPC;
}
int perf_output_begin_forward(struct perf_output_handle *handle,
struct perf_sample_data *data,
struct perf_event *event, unsigned int size)
{
return __perf_output_begin(handle, data, event, size, false);
}
int perf_output_begin_backward(struct perf_output_handle *handle,
struct perf_sample_data *data,
struct perf_event *event, unsigned int size)
{
return __perf_output_begin(handle, data, event, size, true);
}
int perf_output_begin(struct perf_output_handle *handle,
struct perf_sample_data *data,
struct perf_event *event, unsigned int size)
{
return __perf_output_begin(handle, data, event, size,
unlikely(is_write_backward(event)));
}
unsigned int perf_output_copy(struct perf_output_handle *handle,
const void *buf, unsigned int len)
{
return __output_copy(handle, buf, len);
}
unsigned int perf_output_skip(struct perf_output_handle *handle,
unsigned int len)
{
return __output_skip(handle, NULL, len);
}
void perf_output_end(struct perf_output_handle *handle)
{
perf_output_put_handle(handle);
rcu_read_unlock();
}
static void
ring_buffer_init(struct perf_buffer *rb, long watermark, int flags)
{
long max_size = perf_data_size(rb);
if (watermark)
rb->watermark = min(max_size, watermark);
if (!rb->watermark)
rb->watermark = max_size / 2;
if (flags & RING_BUFFER_WRITABLE)
rb->overwrite = 0;
else
rb->overwrite = 1;
refcount_set(&rb->refcount, 1);
INIT_LIST_HEAD(&rb->event_list);
spin_lock_init(&rb->event_lock);
/*
* perf_output_begin() only checks rb->paused, therefore
* rb->paused must be true if we have no pages for output.
*/
if (!rb->nr_pages)
rb->paused = 1;
mutex_init(&rb->aux_mutex);
}
void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags)
{
/*
* OVERWRITE is determined by perf_aux_output_end() and can't
* be passed in directly.
*/
if (WARN_ON_ONCE(flags & PERF_AUX_FLAG_OVERWRITE))
return;
handle->aux_flags |= flags;
}
EXPORT_SYMBOL_GPL(perf_aux_output_flag);
/*
* This is called before hardware starts writing to the AUX area to
* obtain an output handle and make sure there's room in the buffer.
* When the capture completes, call perf_aux_output_end() to commit
* the recorded data to the buffer.
*
* The ordering is similar to that of perf_output_{begin,end}, with
* the exception of (B), which should be taken care of by the pmu
* driver, since ordering rules will differ depending on hardware.
*
* Call this from pmu::start(); see the comment in perf_aux_output_end()
* about its use in pmu callbacks. Both can also be called from the PMI
* handler if needed.
*/
void *perf_aux_output_begin(struct perf_output_handle *handle,
struct perf_event *event)
{
struct perf_event *output_event = event;
unsigned long aux_head, aux_tail;
struct perf_buffer *rb;
unsigned int nest;
if (output_event->parent)
output_event = output_event->parent;
/*
* Since this will typically be open across pmu::add/pmu::del, we
* grab ring_buffer's refcount instead of holding rcu read lock
* to make sure it doesn't disappear under us.
*/
rb = ring_buffer_get(output_event);
if (!rb)
return NULL;
if (!rb_has_aux(rb))
goto err;
/*
* If aux_mmap_count is zero, the aux buffer is in perf_mmap_close(),
* about to get freed, so we leave immediately.
*
* Checking rb::aux_mmap_count and rb::refcount has to be done in
* the same order, see perf_mmap_close. Otherwise we end up freeing
* aux pages in this path, which is a bug, because in_atomic().
*/
if (!atomic_read(&rb->aux_mmap_count))
goto err;
if (!refcount_inc_not_zero(&rb->aux_refcount))
goto err;
nest = READ_ONCE(rb->aux_nest);
/*
* Nesting is not supported for AUX area, make sure nested
* writers are caught early
*/
if (WARN_ON_ONCE(nest))
goto err_put;
WRITE_ONCE(rb->aux_nest, nest + 1);
aux_head = rb->aux_head;
handle->rb = rb;
handle->event = event;
handle->head = aux_head;
handle->size = 0;
handle->aux_flags = 0;
/*
* In overwrite mode, AUX data stores do not depend on aux_tail,
* therefore (A) control dependency barrier does not exist. The
* (B) <-> (C) ordering is still observed by the pmu driver.
*/
if (!rb->aux_overwrite) {
aux_tail = READ_ONCE(rb->user_page->aux_tail);
handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
if (aux_head - aux_tail < perf_aux_size(rb))
handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb));
/*
* handle->size computation depends on aux_tail load; this forms a
* control dependency barrier separating aux_tail load from aux data
* store that will be enabled on successful return
*/
if (!handle->size) { /* A, matches D */
event->pending_disable = smp_processor_id();
perf_output_wakeup(handle);
WRITE_ONCE(rb->aux_nest, 0);
goto err_put;
}
}
return handle->rb->aux_priv;
err_put:
/* can't be last */
rb_free_aux(rb);
err:
ring_buffer_put(rb);
handle->event = NULL;
return NULL;
}
EXPORT_SYMBOL_GPL(perf_aux_output_begin);
static __always_inline bool rb_need_aux_wakeup(struct perf_buffer *rb)
{
if (rb->aux_overwrite)
return false;
if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) {
rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);
return true;
}
return false;
}
/*
* Commit the data written by hardware into the ring buffer by adjusting
* aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the
* pmu driver's responsibility to observe ordering rules of the hardware,
* so that all the data is externally visible before this is called.
*
* Note: this has to be called from pmu::stop() callback, as the assumption
* of the AUX buffer management code is that after pmu::stop(), the AUX
* transaction must be stopped and therefore drop the AUX reference count.
*/
void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
{
bool wakeup = !!(handle->aux_flags & PERF_AUX_FLAG_TRUNCATED);
struct perf_buffer *rb = handle->rb;
unsigned long aux_head;
/* in overwrite mode, driver provides aux_head via handle */
if (rb->aux_overwrite) {
handle->aux_flags |= PERF_AUX_FLAG_OVERWRITE;
aux_head = handle->head;
rb->aux_head = aux_head;
} else {
handle->aux_flags &= ~PERF_AUX_FLAG_OVERWRITE;
aux_head = rb->aux_head;
rb->aux_head += size;
}
/*
* Only send RECORD_AUX if we have something useful to communicate
*
* Note: the OVERWRITE records by themselves are not considered
* useful, as they don't communicate any *new* information,
* aside from the short-lived offset, that becomes history at
* the next event sched-in and therefore isn't useful.
* The userspace that needs to copy out AUX data in overwrite
* mode should know to use user_page::aux_head for the actual
* offset. So, from now on we don't output AUX records that
* have *only* OVERWRITE flag set.
*/
if (size || (handle->aux_flags & ~(u64)PERF_AUX_FLAG_OVERWRITE))
perf_event_aux_event(handle->event, aux_head, size,
handle->aux_flags);
WRITE_ONCE(rb->user_page->aux_head, rb->aux_head);
if (rb_need_aux_wakeup(rb))
wakeup = true;
if (wakeup) {
if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)
handle->event->pending_disable = smp_processor_id();
perf_output_wakeup(handle);
}
handle->event = NULL;
WRITE_ONCE(rb->aux_nest, 0);
/* can't be last */
rb_free_aux(rb);
ring_buffer_put(rb);
}
EXPORT_SYMBOL_GPL(perf_aux_output_end);
/*
* Skip over a given number of bytes in the AUX buffer, due to, for example,
* hardware's alignment constraints.
*/
int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
{
struct perf_buffer *rb = handle->rb;
if (size > handle->size)
return -ENOSPC;
rb->aux_head += size;
WRITE_ONCE(rb->user_page->aux_head, rb->aux_head);
if (rb_need_aux_wakeup(rb)) {
perf_output_wakeup(handle);
handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
}
handle->head = rb->aux_head;
handle->size -= size;
return 0;
}
EXPORT_SYMBOL_GPL(perf_aux_output_skip);
void *perf_get_aux(struct perf_output_handle *handle)
{
/* this is only valid between perf_aux_output_begin and *_end */
if (!handle->event)
return NULL;
return handle->rb->aux_priv;
}
EXPORT_SYMBOL_GPL(perf_get_aux);
/*
* Copy out AUX data from an AUX handle.
*/
long perf_output_copy_aux(struct perf_output_handle *aux_handle,
struct perf_output_handle *handle,
unsigned long from, unsigned long to)
{
struct perf_buffer *rb = aux_handle->rb;
unsigned long tocopy, remainder, len = 0;
void *addr;
from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
to &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
do {
tocopy = PAGE_SIZE - offset_in_page(from);
if (to > from)
tocopy = min(tocopy, to - from);
if (!tocopy)
break;
addr = rb->aux_pages[from >> PAGE_SHIFT];
addr += offset_in_page(from);
remainder = perf_output_copy(handle, addr, tocopy);
if (remainder)
return -EFAULT;
len += tocopy;
from += tocopy;
from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
} while (to != from);
return len;
}
#define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY)
static struct page *rb_alloc_aux_page(int node, int order)
{
struct page *page;
if (order > MAX_PAGE_ORDER)
order = MAX_PAGE_ORDER;
do {
page = alloc_pages_node(node, PERF_AUX_GFP, order);
} while (!page && order--);
if (page && order) {
/*
* Communicate the allocation size to the driver:
* if we managed to secure a high-order allocation,
* set its first page's private to this order;
* !PagePrivate(page) means it's just a normal page.
*/
split_page(page, order);
SetPagePrivate(page);
set_page_private(page, order);
}
return page;
}
static void rb_free_aux_page(struct perf_buffer *rb, int idx)
{
struct page *page = virt_to_page(rb->aux_pages[idx]);
ClearPagePrivate(page);
__free_page(page);
}
static void __rb_free_aux(struct perf_buffer *rb)
{
int pg;
/*
* Should never happen, the last reference should be dropped from
* perf_mmap_close() path, which first stops aux transactions (which
* in turn are the atomic holders of aux_refcount) and then does the
* last rb_free_aux().
*/
WARN_ON_ONCE(in_atomic());
if (rb->aux_priv) {
rb->free_aux(rb->aux_priv);
rb->free_aux = NULL;
rb->aux_priv = NULL;
}
if (rb->aux_nr_pages) {
for (pg = 0; pg < rb->aux_nr_pages; pg++)
rb_free_aux_page(rb, pg);
kfree(rb->aux_pages);
rb->aux_nr_pages = 0;
}
}
int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
pgoff_t pgoff, int nr_pages, long watermark, int flags)
{
bool overwrite = !(flags & RING_BUFFER_WRITABLE);
int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu);
int ret = -ENOMEM, max_order;
if (!has_aux(event))
return -EOPNOTSUPP;
if (nr_pages <= 0)
return -EINVAL;
if (!overwrite) {
/*
* Watermark defaults to half the buffer, and so does the
* max_order, to aid PMU drivers in double buffering.
*/
if (!watermark)
watermark = min_t(unsigned long,
U32_MAX,
(unsigned long)nr_pages << (PAGE_SHIFT - 1));
/*
* Use aux_watermark as the basis for chunking to
* help PMU drivers honor the watermark.
*/
max_order = get_order(watermark);
} else {
/*
* We need to start with the max_order that fits in nr_pages,
* not the other way around, hence ilog2() and not get_order.
*/
max_order = ilog2(nr_pages);
watermark = 0;
}
/*
* kcalloc_node() is unable to allocate buffer if the size is larger
* than: PAGE_SIZE << MAX_PAGE_ORDER; directly bail out in this case.
*/
if (get_order((unsigned long)nr_pages * sizeof(void *)) > MAX_PAGE_ORDER)
return -ENOMEM;
rb->aux_pages = kcalloc_node(nr_pages, sizeof(void *), GFP_KERNEL,
node);
if (!rb->aux_pages)
return -ENOMEM;
rb->free_aux = event->pmu->free_aux;
for (rb->aux_nr_pages = 0; rb->aux_nr_pages < nr_pages;) {
struct page *page;
int last, order;
order = min(max_order, ilog2(nr_pages - rb->aux_nr_pages));
page = rb_alloc_aux_page(node, order);
if (!page)
goto out;
for (last = rb->aux_nr_pages + (1 << page_private(page));
last > rb->aux_nr_pages; rb->aux_nr_pages++)
rb->aux_pages[rb->aux_nr_pages] = page_address(page++);
}
/*
* In overwrite mode, PMUs that don't support SG may not handle more
* than one contiguous allocation, since they rely on PMI to do double
* buffering. In this case, the entire buffer has to be one contiguous
* chunk.
*/
if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) &&
overwrite) {
struct page *page = virt_to_page(rb->aux_pages[0]);
if (page_private(page) != max_order)
goto out;
}
rb->aux_priv = event->pmu->setup_aux(event, rb->aux_pages, nr_pages,
overwrite);
if (!rb->aux_priv)
goto out;
ret = 0;
/*
* aux_pages (and pmu driver's private data, aux_priv) will be
* referenced in both producer's and consumer's contexts, thus
* we keep a refcount here to make sure either of the two can
* reference them safely.
*/
refcount_set(&rb->aux_refcount, 1);
rb->aux_overwrite = overwrite;
rb->aux_watermark = watermark;
out:
if (!ret)
rb->aux_pgoff = pgoff;
else
__rb_free_aux(rb);
return ret;
}
void rb_free_aux(struct perf_buffer *rb)
{
if (refcount_dec_and_test(&rb->aux_refcount))
__rb_free_aux(rb);
}
#ifndef CONFIG_PERF_USE_VMALLOC
/*
* Back perf_mmap() with regular GFP_KERNEL-0 pages.
*/
static struct page *
__perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
{
if (pgoff > rb->nr_pages)
return NULL;
if (pgoff == 0)
return virt_to_page(rb->user_page);
return virt_to_page(rb->data_pages[pgoff - 1]);
}
static void *perf_mmap_alloc_page(int cpu)
{
struct page *page;
int node;
node = (cpu == -1) ? cpu : cpu_to_node(cpu);
page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
if (!page)
return NULL;
return page_address(page);
}
static void perf_mmap_free_page(void *addr)
{
struct page *page = virt_to_page(addr);
__free_page(page);
}
struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
{
struct perf_buffer *rb;
unsigned long size;
int i, node;
size = sizeof(struct perf_buffer);
size += nr_pages * sizeof(void *);
if (order_base_2(size) > PAGE_SHIFT+MAX_PAGE_ORDER)
goto fail;
node = (cpu == -1) ? cpu : cpu_to_node(cpu);
rb = kzalloc_node(size, GFP_KERNEL, node);
if (!rb)
goto fail;
rb->user_page = perf_mmap_alloc_page(cpu);
if (!rb->user_page)
goto fail_user_page;
for (i = 0; i < nr_pages; i++) {
rb->data_pages[i] = perf_mmap_alloc_page(cpu);
if (!rb->data_pages[i])
goto fail_data_pages;
}
rb->nr_pages = nr_pages;
ring_buffer_init(rb, watermark, flags);
return rb;
fail_data_pages:
for (i--; i >= 0; i--)
perf_mmap_free_page(rb->data_pages[i]);
perf_mmap_free_page(rb->user_page);
fail_user_page:
kfree(rb);
fail:
return NULL;
}
void rb_free(struct perf_buffer *rb)
{
int i;
perf_mmap_free_page(rb->user_page);
for (i = 0; i < rb->nr_pages; i++)
perf_mmap_free_page(rb->data_pages[i]);
kfree(rb);
}
#else
static struct page *
__perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
{
/* The '>' counts in the user page. */
if (pgoff > data_page_nr(rb))
return NULL;
return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE);
}
static void rb_free_work(struct work_struct *work)
{
struct perf_buffer *rb;
rb = container_of(work, struct perf_buffer, work);
vfree(rb->user_page);
kfree(rb);
}
void rb_free(struct perf_buffer *rb)
{
schedule_work(&rb->work);
}
struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
{
struct perf_buffer *rb;
unsigned long size;
void *all_buf;
int node;
size = sizeof(struct perf_buffer);
size += sizeof(void *);
node = (cpu == -1) ? cpu : cpu_to_node(cpu);
rb = kzalloc_node(size, GFP_KERNEL, node);
if (!rb)
goto fail;
INIT_WORK(&rb->work, rb_free_work);
all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
if (!all_buf)
goto fail_all_buf;
rb->user_page = all_buf;
rb->data_pages[0] = all_buf + PAGE_SIZE;
if (nr_pages) {
rb->nr_pages = 1;
rb->page_order = ilog2(nr_pages);
}
ring_buffer_init(rb, watermark, flags);
return rb;
fail_all_buf:
kfree(rb);
fail:
return NULL;
}
#endif
struct page *
perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
{
if (rb->aux_nr_pages) {
/* above AUX space */
if (pgoff > rb->aux_pgoff + rb->aux_nr_pages)
return NULL;
/* AUX space */
if (pgoff >= rb->aux_pgoff) {
int aux_pgoff = array_index_nospec(pgoff - rb->aux_pgoff, rb->aux_nr_pages);
return virt_to_page(rb->aux_pages[aux_pgoff]);
}
}
return __perf_mmap_to_page(rb, pgoff);
}