Merge branch 'bpf-perf'

Alexei Starovoitov says:

====================
bpf_perf_event_output helper

Over the last year there were multiple attempts to let eBPF programs
output data into perf events by He Kuang and Wangnan.
The last one was:
https://lkml.org/lkml/2015/7/20/736
It was almost perfect, with the exception that all bpf programs would send
data into one global perf_event.
This patch set takes a different approach by letting user space
open independent PERF_COUNT_SW_BPF_OUTPUT events, so that program
output won't collide.

Wangnan is working on corresponding perf patches.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
David S. Miller 2015-10-22 06:42:23 -07:00
commit 721daebbdb
10 changed files with 308 additions and 6 deletions

View File

@ -287,6 +287,17 @@ enum bpf_func_id {
* Return: realm if != 0 * Return: realm if != 0
*/ */
BPF_FUNC_get_route_realm, BPF_FUNC_get_route_realm,
/**
* bpf_perf_event_output(ctx, map, index, data, size) - output perf raw sample
* @ctx: struct pt_regs*
* @map: pointer to perf_event_array map
* @index: index of event in the map
* @data: data on stack to be output as raw data
* @size: size of data
* Return: 0 on success
*/
BPF_FUNC_perf_event_output,
__BPF_FUNC_MAX_ID, __BPF_FUNC_MAX_ID,
}; };

View File

@ -110,6 +110,7 @@ enum perf_sw_ids {
PERF_COUNT_SW_ALIGNMENT_FAULTS = 7, PERF_COUNT_SW_ALIGNMENT_FAULTS = 7,
PERF_COUNT_SW_EMULATION_FAULTS = 8, PERF_COUNT_SW_EMULATION_FAULTS = 8,
PERF_COUNT_SW_DUMMY = 9, PERF_COUNT_SW_DUMMY = 9,
PERF_COUNT_SW_BPF_OUTPUT = 10,
PERF_COUNT_SW_MAX, /* non-ABI */ PERF_COUNT_SW_MAX, /* non-ABI */
}; };

View File

@ -295,6 +295,8 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
return (void *)attr; return (void *)attr;
if (attr->type != PERF_TYPE_RAW && if (attr->type != PERF_TYPE_RAW &&
!(attr->type == PERF_TYPE_SOFTWARE &&
attr->config == PERF_COUNT_SW_BPF_OUTPUT) &&
attr->type != PERF_TYPE_HARDWARE) { attr->type != PERF_TYPE_HARDWARE) {
perf_event_release_kernel(event); perf_event_release_kernel(event);
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);

View File

@ -245,6 +245,7 @@ static const struct {
} func_limit[] = { } func_limit[] = {
{BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call}, {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call},
{BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read}, {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read},
{BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_output},
}; };
static void print_verifier_state(struct verifier_env *env) static void print_verifier_state(struct verifier_env *env)
@ -910,7 +911,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
* don't allow any other map type to be passed into * don't allow any other map type to be passed into
* the special func; * the special func;
*/ */
if (bool_map != bool_func) if (bool_func && bool_map != bool_func)
return -EINVAL; return -EINVAL;
} }

View File

@ -5286,9 +5286,15 @@ void perf_output_sample(struct perf_output_handle *handle,
if (sample_type & PERF_SAMPLE_RAW) { if (sample_type & PERF_SAMPLE_RAW) {
if (data->raw) { if (data->raw) {
perf_output_put(handle, data->raw->size); u32 raw_size = data->raw->size;
__output_copy(handle, data->raw->data, u32 real_size = round_up(raw_size + sizeof(u32),
data->raw->size); sizeof(u64)) - sizeof(u32);
u64 zero = 0;
perf_output_put(handle, real_size);
__output_copy(handle, data->raw->data, raw_size);
if (real_size - raw_size)
__output_copy(handle, &zero, real_size - raw_size);
} else { } else {
struct { struct {
u32 size; u32 size;
@ -5420,8 +5426,7 @@ void perf_prepare_sample(struct perf_event_header *header,
else else
size += sizeof(u32); size += sizeof(u32);
WARN_ON_ONCE(size & (sizeof(u64)-1)); header->size += round_up(size, sizeof(u64));
header->size += size;
} }
if (sample_type & PERF_SAMPLE_BRANCH_STACK) { if (sample_type & PERF_SAMPLE_BRANCH_STACK) {

View File

@ -215,6 +215,50 @@ const struct bpf_func_proto bpf_perf_event_read_proto = {
.arg2_type = ARG_ANYTHING, .arg2_type = ARG_ANYTHING,
}; };
/*
 * bpf_perf_event_output() - emit a raw perf sample on an event held in a map.
 * @r1:    program context, used as a struct pt_regs pointer
 * @r2:    pointer to the struct bpf_map embedded in a struct bpf_array
 * @index: slot of the target event inside the array
 * @r4:    pointer to the payload to be output as raw data
 * @size:  payload length in bytes
 *
 * Return: 0 on success; -E2BIG when @index is not below the map's
 * max_entries; -ENOENT when the slot is empty; -EINVAL when the event is not
 * PERF_TYPE_SOFTWARE with config PERF_COUNT_SW_BPF_OUTPUT; -EOPNOTSUPP when
 * the event's oncpu differs from the CPU running the program.
 */
static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size)
{
	struct bpf_map *bmap = (struct bpf_map *) (long) r2;
	struct bpf_array *events = container_of(bmap, struct bpf_array, map);
	struct perf_raw_record raw_rec = {
		.size = size,
		.data = (void *) (long) r4,
	};
	struct perf_sample_data sample;
	struct perf_event *ev;

	if (unlikely(index >= events->map.max_entries))
		return -E2BIG;

	ev = (struct perf_event *)events->ptrs[index];
	if (unlikely(!ev))
		return -ENOENT;

	/* only events opened specifically for bpf output are accepted */
	if (unlikely(ev->attr.type != PERF_TYPE_SOFTWARE ||
		     ev->attr.config != PERF_COUNT_SW_BPF_OUTPUT))
		return -EINVAL;

	/* the event must be the one active on the CPU we are running on */
	if (unlikely(ev->oncpu != smp_processor_id()))
		return -EOPNOTSUPP;

	perf_sample_data_init(&sample, 0, 0);
	sample.raw = &raw_rec;
	perf_event_output(ev, &sample, (struct pt_regs *) (long) r1);

	return 0;
}
/*
 * Prototype record for bpf_perf_event_output(): gpl_only is false, the return
 * type is an integer, and the five arguments are typed as
 * (ctx pointer, constant map pointer, anything, stack pointer, stack size).
 */
static const struct bpf_func_proto bpf_perf_event_output_proto = {
	.func		= bpf_perf_event_output,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_PTR_TO_STACK,
	.arg5_type	= ARG_CONST_STACK_SIZE,
};
static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
{ {
switch (func_id) { switch (func_id) {
@ -242,6 +286,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
return &bpf_get_smp_processor_id_proto; return &bpf_get_smp_processor_id_proto;
case BPF_FUNC_perf_event_read: case BPF_FUNC_perf_event_read:
return &bpf_perf_event_read_proto; return &bpf_perf_event_read_proto;
case BPF_FUNC_perf_event_output:
return &bpf_perf_event_output_proto;
default: default:
return NULL; return NULL;
} }

View File

@ -13,6 +13,7 @@ hostprogs-y += tracex3
hostprogs-y += tracex4 hostprogs-y += tracex4
hostprogs-y += tracex5 hostprogs-y += tracex5
hostprogs-y += tracex6 hostprogs-y += tracex6
hostprogs-y += trace_output
hostprogs-y += lathist hostprogs-y += lathist
test_verifier-objs := test_verifier.o libbpf.o test_verifier-objs := test_verifier.o libbpf.o
@ -27,6 +28,7 @@ tracex3-objs := bpf_load.o libbpf.o tracex3_user.o
tracex4-objs := bpf_load.o libbpf.o tracex4_user.o tracex4-objs := bpf_load.o libbpf.o tracex4_user.o
tracex5-objs := bpf_load.o libbpf.o tracex5_user.o tracex5-objs := bpf_load.o libbpf.o tracex5_user.o
tracex6-objs := bpf_load.o libbpf.o tracex6_user.o tracex6-objs := bpf_load.o libbpf.o tracex6_user.o
trace_output-objs := bpf_load.o libbpf.o trace_output_user.o
lathist-objs := bpf_load.o libbpf.o lathist_user.o lathist-objs := bpf_load.o libbpf.o lathist_user.o
# Tell kbuild to always build the programs # Tell kbuild to always build the programs
@ -40,6 +42,7 @@ always += tracex3_kern.o
always += tracex4_kern.o always += tracex4_kern.o
always += tracex5_kern.o always += tracex5_kern.o
always += tracex6_kern.o always += tracex6_kern.o
always += trace_output_kern.o
always += tcbpf1_kern.o always += tcbpf1_kern.o
always += lathist_kern.o always += lathist_kern.o
@ -55,6 +58,7 @@ HOSTLOADLIBES_tracex3 += -lelf
HOSTLOADLIBES_tracex4 += -lelf -lrt HOSTLOADLIBES_tracex4 += -lelf -lrt
HOSTLOADLIBES_tracex5 += -lelf HOSTLOADLIBES_tracex5 += -lelf
HOSTLOADLIBES_tracex6 += -lelf HOSTLOADLIBES_tracex6 += -lelf
HOSTLOADLIBES_trace_output += -lelf -lrt
HOSTLOADLIBES_lathist += -lelf HOSTLOADLIBES_lathist += -lelf
# point this to your LLVM backend with bpf support # point this to your LLVM backend with bpf support
@ -64,3 +68,6 @@ $(obj)/%.o: $(src)/%.c
clang $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) \ clang $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) \
-D__KERNEL__ -Wno-unused-value -Wno-pointer-sign \ -D__KERNEL__ -Wno-unused-value -Wno-pointer-sign \
-O2 -emit-llvm -c $< -o -| $(LLC) -march=bpf -filetype=obj -o $@ -O2 -emit-llvm -c $< -o -| $(LLC) -march=bpf -filetype=obj -o $@
clang $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) \
-D__KERNEL__ -Wno-unused-value -Wno-pointer-sign \
-O2 -emit-llvm -c $< -o -| $(LLC) -march=bpf -filetype=asm -o $@.s

View File

@ -37,6 +37,8 @@ static int (*bpf_clone_redirect)(void *ctx, int ifindex, int flags) =
(void *) BPF_FUNC_clone_redirect; (void *) BPF_FUNC_clone_redirect;
static int (*bpf_redirect)(int ifindex, int flags) = static int (*bpf_redirect)(int ifindex, int flags) =
(void *) BPF_FUNC_redirect; (void *) BPF_FUNC_redirect;
/* Call stub for the perf_event_output helper: the helper id
 * BPF_FUNC_perf_event_output is stored in a function pointer so that program
 * code can invoke it with (ctx, map, index, data, size).
 */
static int (*bpf_perf_event_output)(void *ctx, void *map, int index,
				    void *data, int size) =
	(void *) BPF_FUNC_perf_event_output;
/* llvm builtin functions that eBPF C program may use to /* llvm builtin functions that eBPF C program may use to
* emit BPF_LD_ABS and BPF_LD_IND instructions * emit BPF_LD_ABS and BPF_LD_IND instructions

View File

@ -0,0 +1,31 @@
#include <linux/ptrace.h>
#include <linux/version.h>
#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"
/* Perf-event array map with two slots; bpf_prog1() outputs through slot 0. */
struct bpf_map_def SEC("maps") my_map = {
	.type		= BPF_MAP_TYPE_PERF_EVENT_ARRAY,
	.key_size	= sizeof(int),
	.value_size	= sizeof(u32),
	.max_entries	= 2,
};
/*
 * Program placed in the "kprobe/sys_write" section. Builds a zeroed
 * {pid, cookie} record on the stack, fills it in and hands it to
 * bpf_perf_event_output() for slot 0 of my_map. The helper's return value is
 * ignored and the program always returns 0.
 */
SEC("kprobe/sys_write")
int bpf_prog1(struct pt_regs *ctx)
{
	struct S {
		u64 pid;
		u64 cookie;
	} record;

	memset(&record, 0, sizeof(record));
	record.pid = bpf_get_current_pid_tgid();
	/* fixed marker that the user-space reader checks on every sample */
	record.cookie = 0x12345678;

	bpf_perf_event_output(ctx, &my_map, 0, &record, sizeof(record));

	return 0;
}
/* License string, placed in the "license" section of the object file. */
char _license[] SEC("license") = "GPL";
/* LINUX_VERSION_CODE of the build tree, placed in the "version" section. */
u32 _version SEC("version") = LINUX_VERSION_CODE;

View File

@ -0,0 +1,196 @@
/* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*/
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <fcntl.h>
#include <poll.h>
#include <sys/ioctl.h>
#include <linux/perf_event.h>
#include <linux/bpf.h>
#include <errno.h>
#include <assert.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <time.h>
#include <signal.h>
#include "libbpf.h"
#include "bpf_load.h"
/* fd of the perf event opened in test_bpf_perf_event() */
static int pmu_fd;
/* system page size, filled in by perf_event_mmap() */
int page_size;
/* number of data pages; perf_event_mmap() maps page_cnt + 1 pages in total */
int page_cnt = 8;
/* start of the mapping; perf_event_read() takes the data area to begin one page after it */
volatile struct perf_event_mmap_page *header;
/* callback that perf_event_read() invokes for each PERF_RECORD_SAMPLE */
typedef void (*print_fn)(void *data, int size);
/*
 * Map (page_cnt + 1) pages of perf event @fd, shared, readable and writable.
 * Records the system page size in page_size and the mapping in header.
 *
 * Returns 0 on success, or -1 after printing "mmap err" when mmap() fails.
 */
static int perf_event_mmap(int fd)
{
	int len;
	void *map;

	page_size = getpagesize();
	len = page_size * (page_cnt + 1);

	map = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (map != MAP_FAILED) {
		header = map;
		return 0;
	}

	printf("mmap err\n");
	return -1;
}
/*
 * Wait up to 1000 ms for @fd to become readable (POLLIN).
 * Returns whatever poll() returns.
 */
static int perf_event_poll(int fd)
{
	struct pollfd waiter = { 0 };

	waiter.fd = fd;
	waiter.events = POLLIN;

	return poll(&waiter, 1, 1000);
}
/*
 * View of a sample record as perf_event_read() consumes it: the generic
 * record header, then a 32-bit size, then the raw payload bytes.
 */
struct perf_event_sample {
	struct perf_event_header header;	/* generic record header */
	__u32 size;				/* passed to the callback as the data size */
	char data[];				/* raw payload passed to the callback */
};
/*
 * Consume every record currently available in the ring buffer mapped at
 * `header`.
 *
 * The data area starts one page after `header` and is page_cnt * page_size
 * bytes long. Records between data_tail and data_head are walked in order:
 *   - PERF_RECORD_SAMPLE: @fn is called with the raw payload and its size;
 *   - PERF_RECORD_LOST:   the lost count is printed;
 *   - anything else:      type and size are printed.
 * A record that wraps around the end of the data area is first reassembled
 * into a local bounce buffer. When done, data_tail is advanced to the
 * data_head value that was read on entry.
 *
 * @fn: callback invoked for each sample record.
 */
void perf_event_read(print_fn fn)
{
	__u64 data_tail = header->data_tail;
	__u64 data_head = header->data_head;
	__u64 buffer_size = page_cnt * page_size;
	void *base, *begin, *end;
	char buf[256];

	asm volatile("" ::: "memory"); /* in real code it should be smp_rmb() */
	if (data_head == data_tail)
		return;

	base = ((char *)header) + page_size;

	begin = base + data_tail % buffer_size;
	end = base + data_head % buffer_size;

	while (begin != end) {
		struct perf_event_sample *e;

		e = begin;
		if (begin + e->header.size > base + buffer_size) {
			/* record wraps: stitch both halves together in buf */
			long len = base + buffer_size - begin;

			assert(len < e->header.size);
			/*
			 * buf is a fixed-size bounce buffer; refuse a record
			 * that does not fit instead of overrunning the stack.
			 */
			assert(e->header.size <= sizeof(buf));
			memcpy(buf, begin, len);
			memcpy(buf + len, base, e->header.size - len);
			e = (void *) buf;
			begin = base + e->header.size - len;
		} else if (begin + e->header.size == base + buffer_size) {
			/* record ends exactly at the end of the data area */
			begin = base;
		} else {
			begin += e->header.size;
		}

		if (e->header.type == PERF_RECORD_SAMPLE) {
			fn(e->data, e->size);
		} else if (e->header.type == PERF_RECORD_LOST) {
			struct {
				struct perf_event_header header;
				__u64 id;
				__u64 lost;
			} *lost = (void *) e;
			printf("lost %lld events\n", lost->lost);
		} else {
			printf("unknown event type=%d size=%d\n",
			       e->header.type, e->header.size);
		}
	}

	__sync_synchronize(); /* smp_mb() */
	header->data_tail = data_head;
}
/* Read CLOCK_MONOTONIC and return it as a single nanosecond count. */
static __u64 time_get_ns(void)
{
	struct timespec now;
	__u64 ns;

	clock_gettime(CLOCK_MONOTONIC, &now);
	ns = now.tv_sec * 1000000000ull + now.tv_nsec;

	return ns;
}
/* time_get_ns() value taken in main() just before the read loop starts */
static __u64 start_time;
/* number of samples print_bpf_output() counts before printing the rate */
#define MAX_CNT 100000ll
/*
 * Sample callback: @data is read as a {pid, cookie} pair of 64-bit values.
 *
 * A cookie other than 0x12345678 is reported as a bug and SIGINT is sent to
 * the process group. Every call bumps a static counter; when it reaches
 * MAX_CNT the receive rate since start_time is printed and SIGINT is sent to
 * the process group.
 */
static void print_bpf_output(void *data, int size)
{
	static __u64 seen;
	struct {
		__u64 pid;
		__u64 cookie;
	} *rec = data;
	__u64 elapsed_ns;

	if (rec->cookie != 0x12345678) {
		printf("BUG pid %llx cookie %llx sized %d\n",
		       rec->pid, rec->cookie, size);
		kill(0, SIGINT);
	}

	if (++seen != MAX_CNT)
		return;

	elapsed_ns = time_get_ns() - start_time;
	printf("recv %lld events per sec\n",
	       MAX_CNT * 1000000000ll / elapsed_ns);
	kill(0, SIGINT);
}
/*
 * Open a PERF_TYPE_SOFTWARE / PERF_COUNT_SW_BPF_OUTPUT event with raw
 * sampling (pid -1, cpu 0, no group), store its fd in pmu_fd, install that fd
 * at key 0 of map_fd[0], and enable the event.
 *
 * Failure to open the event or to update the map trips an assert.
 */
static void test_bpf_perf_event(void)
{
	struct perf_event_attr attr = {
		.sample_type = PERF_SAMPLE_RAW,
		.type = PERF_TYPE_SOFTWARE,
		.config = PERF_COUNT_SW_BPF_OUTPUT,
	};
	int key = 0;
	int err;

	pmu_fd = perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0);
	assert(pmu_fd >= 0);

	/*
	 * Do the map update outside of assert(): an expression inside
	 * assert() is not evaluated when NDEBUG is defined, which would
	 * silently leave the map slot empty.
	 */
	err = bpf_update_elem(map_fd[0], &key, &pmu_fd, BPF_ANY);
	assert(err == 0);
	(void) err;

	ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
}
/*
 * Load "<argv[0]>_kern.o", set up and map the bpf output perf event, start a
 * background `dd` under `taskset 1`, then poll and drain the ring buffer
 * forever, feeding every sample to print_bpf_output().
 *
 * Returns 1 if the object cannot be loaded (after printing bpf_log_buf) or
 * if the mmap fails; the read loop itself never exits.
 */
int main(int argc, char **argv)
{
	char kern_obj[256];
	FILE *load_gen;

	snprintf(kern_obj, sizeof(kern_obj), "%s_kern.o", argv[0]);
	if (load_bpf_file(kern_obj) != 0) {
		printf("%s", bpf_log_buf);
		return 1;
	}

	test_bpf_perf_event();

	if (perf_event_mmap(pmu_fd) < 0)
		return 1;

	/* background workload; the stream handle is intentionally unused */
	load_gen = popen("taskset 1 dd if=/dev/zero of=/dev/null", "r");
	(void) load_gen;

	start_time = time_get_ns();
	while (1) {
		perf_event_poll(pmu_fd);
		perf_event_read(print_bpf_output);
	}

	return 0;
}