linux-stable/tools/perf/util/bpf_off_cpu.c
Namhyung Kim 8b3b1bb3ea perf record offcpu: Constify control data for BPF
The control knobs set before loading BPF programs should be declared as
'const volatile' so that it can be optimized by the BPF core.

Committer testing:

  root@x1:~# perf record --off-cpu
  ^C[ perf record: Woken up 1 times to write data ]
  [ perf record: Captured and wrote 1.807 MB perf.data (5645 samples) ]

  root@x1:~# perf evlist
  cpu_atom/cycles/P
  cpu_core/cycles/P
  offcpu-time
  dummy:u
  root@x1:~# perf evlist -v
  cpu_atom/cycles/P: type: 0 (PERF_TYPE_HARDWARE), size: 136, config: 0xa00000000, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CPU|PERIOD|IDENTIFIER, read_format: ID|LOST, disabled: 1, inherit: 1, freq: 1, precise_ip: 3, sample_id_all: 1
  cpu_core/cycles/P: type: 0 (PERF_TYPE_HARDWARE), size: 136, config: 0x400000000, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CPU|PERIOD|IDENTIFIER, read_format: ID|LOST, disabled: 1, inherit: 1, freq: 1, precise_ip: 3, sample_id_all: 1
  offcpu-time: type: 1 (software), size: 136, config: 0xa (PERF_COUNT_SW_BPF_OUTPUT), { sample_period, sample_freq }: 1, sample_type: IP|TID|TIME|CALLCHAIN|CPU|PERIOD|IDENTIFIER, read_format: ID|LOST, disabled: 1, inherit: 1, freq: 1, sample_id_all: 1
  dummy:u: type: 1 (software), size: 136, config: 0x9 (PERF_COUNT_SW_DUMMY), { sample_period, sample_freq }: 1, sample_type: IP|TID|TIME|CPU|IDENTIFIER, read_format: ID|LOST, inherit: 1, exclude_kernel: 1, exclude_hv: 1, mmap: 1, comm: 1, task: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, ksymbol: 1, bpf_event: 1
  root@x1:~# perf trace -e bpf --max-events 5 perf record --off-cpu
       0.000 ( 0.015 ms): :2949124/2949124 bpf(cmd: 36, uattr: 0x7ffefc6dbe30, size: 8)          = -1 EOPNOTSUPP (Operation not supported)
       0.031 ( 0.115 ms): :2949124/2949124 bpf(cmd: PROG_LOAD, uattr: 0x7ffefc6dbb60, size: 148) = 14
       0.159 ( 0.037 ms): :2949124/2949124 bpf(cmd: PROG_LOAD, uattr: 0x7ffefc6dbc20, size: 148) = 14
      23.868 ( 0.144 ms): perf/2949124 bpf(cmd: PROG_LOAD, uattr: 0x7ffefc6dbad0, size: 148)     = 14
      24.027 ( 0.014 ms): perf/2949124 bpf(uattr: 0x7ffefc6dbc80, size: 80)                      = 14
  root@x1:~#

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20240902200515.2103769-6-namhyung@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-09-03 11:54:47 -03:00

396 lines
9.0 KiB
C

// SPDX-License-Identifier: GPL-2.0
#include "util/bpf_counter.h"
#include "util/debug.h"
#include "util/evsel.h"
#include "util/evlist.h"
#include "util/off_cpu.h"
#include "util/perf-hooks.h"
#include "util/record.h"
#include "util/session.h"
#include "util/target.h"
#include "util/cpumap.h"
#include "util/thread_map.h"
#include "util/cgroup.h"
#include "util/strlist.h"
#include <bpf/bpf.h>
#include "bpf_skel/off_cpu.skel.h"
#define MAX_STACKS 32
#define MAX_PROC 4096
/* we don't need actual timestamp, just want to put the samples at last */
#define OFF_CPU_TIMESTAMP (~0ull << 32)
static struct off_cpu_bpf *skel;
struct off_cpu_key {
u32 pid;
u32 tgid;
u32 stack_id;
u32 state;
u64 cgroup_id;
};
union off_cpu_data {
struct perf_event_header hdr;
u64 array[1024 / sizeof(u64)];
};
static int off_cpu_config(struct evlist *evlist)
{
struct evsel *evsel;
struct perf_event_attr attr = {
.type = PERF_TYPE_SOFTWARE,
.config = PERF_COUNT_SW_BPF_OUTPUT,
.size = sizeof(attr), /* to capture ABI version */
};
char *evname = strdup(OFFCPU_EVENT);
if (evname == NULL)
return -ENOMEM;
evsel = evsel__new(&attr);
if (!evsel) {
free(evname);
return -ENOMEM;
}
evsel->core.attr.freq = 1;
evsel->core.attr.sample_period = 1;
/* off-cpu analysis depends on stack trace */
evsel->core.attr.sample_type = PERF_SAMPLE_CALLCHAIN;
evlist__add(evlist, evsel);
free(evsel->name);
evsel->name = evname;
return 0;
}
static void off_cpu_start(void *arg)
{
struct evlist *evlist = arg;
/* update task filter for the given workload */
if (skel->rodata->has_task && skel->rodata->uses_tgid &&
perf_thread_map__pid(evlist->core.threads, 0) != -1) {
int fd;
u32 pid;
u8 val = 1;
fd = bpf_map__fd(skel->maps.task_filter);
pid = perf_thread_map__pid(evlist->core.threads, 0);
bpf_map_update_elem(fd, &pid, &val, BPF_ANY);
}
skel->bss->enabled = 1;
}
static void off_cpu_finish(void *arg __maybe_unused)
{
skel->bss->enabled = 0;
off_cpu_bpf__destroy(skel);
}
/* v5.18 kernel added prev_state arg, so it needs to check the signature */
static void check_sched_switch_args(void)
{
struct btf *btf = btf__load_vmlinux_btf();
const struct btf_type *t1, *t2, *t3;
u32 type_id;
type_id = btf__find_by_name_kind(btf, "btf_trace_sched_switch",
BTF_KIND_TYPEDEF);
if ((s32)type_id < 0)
goto cleanup;
t1 = btf__type_by_id(btf, type_id);
if (t1 == NULL)
goto cleanup;
t2 = btf__type_by_id(btf, t1->type);
if (t2 == NULL || !btf_is_ptr(t2))
goto cleanup;
t3 = btf__type_by_id(btf, t2->type);
/* btf_trace func proto has one more argument for the context */
if (t3 && btf_is_func_proto(t3) && btf_vlen(t3) == 5) {
/* new format: pass prev_state as 4th arg */
skel->rodata->has_prev_state = true;
}
cleanup:
btf__free(btf);
}
int off_cpu_prepare(struct evlist *evlist, struct target *target,
struct record_opts *opts)
{
int err, fd, i;
int ncpus = 1, ntasks = 1, ncgrps = 1;
struct strlist *pid_slist = NULL;
struct str_node *pos;
if (off_cpu_config(evlist) < 0) {
pr_err("Failed to config off-cpu BPF event\n");
return -1;
}
skel = off_cpu_bpf__open();
if (!skel) {
pr_err("Failed to open off-cpu BPF skeleton\n");
return -1;
}
/* don't need to set cpu filter for system-wide mode */
if (target->cpu_list) {
ncpus = perf_cpu_map__nr(evlist->core.user_requested_cpus);
bpf_map__set_max_entries(skel->maps.cpu_filter, ncpus);
skel->rodata->has_cpu = 1;
}
if (target->pid) {
pid_slist = strlist__new(target->pid, NULL);
if (!pid_slist) {
pr_err("Failed to create a strlist for pid\n");
return -1;
}
ntasks = 0;
strlist__for_each_entry(pos, pid_slist) {
char *end_ptr;
int pid = strtol(pos->s, &end_ptr, 10);
if (pid == INT_MIN || pid == INT_MAX ||
(*end_ptr != '\0' && *end_ptr != ','))
continue;
ntasks++;
}
if (ntasks < MAX_PROC)
ntasks = MAX_PROC;
bpf_map__set_max_entries(skel->maps.task_filter, ntasks);
skel->rodata->has_task = 1;
skel->rodata->uses_tgid = 1;
} else if (target__has_task(target)) {
ntasks = perf_thread_map__nr(evlist->core.threads);
bpf_map__set_max_entries(skel->maps.task_filter, ntasks);
skel->rodata->has_task = 1;
} else if (target__none(target)) {
bpf_map__set_max_entries(skel->maps.task_filter, MAX_PROC);
skel->rodata->has_task = 1;
skel->rodata->uses_tgid = 1;
}
if (evlist__first(evlist)->cgrp) {
ncgrps = evlist->core.nr_entries - 1; /* excluding a dummy */
bpf_map__set_max_entries(skel->maps.cgroup_filter, ncgrps);
if (!cgroup_is_v2("perf_event"))
skel->rodata->uses_cgroup_v1 = true;
skel->rodata->has_cgroup = 1;
}
if (opts->record_cgroup) {
skel->rodata->needs_cgroup = true;
if (!cgroup_is_v2("perf_event"))
skel->rodata->uses_cgroup_v1 = true;
}
set_max_rlimit();
check_sched_switch_args();
err = off_cpu_bpf__load(skel);
if (err) {
pr_err("Failed to load off-cpu skeleton\n");
goto out;
}
if (target->cpu_list) {
u32 cpu;
u8 val = 1;
fd = bpf_map__fd(skel->maps.cpu_filter);
for (i = 0; i < ncpus; i++) {
cpu = perf_cpu_map__cpu(evlist->core.user_requested_cpus, i).cpu;
bpf_map_update_elem(fd, &cpu, &val, BPF_ANY);
}
}
if (target->pid) {
u8 val = 1;
fd = bpf_map__fd(skel->maps.task_filter);
strlist__for_each_entry(pos, pid_slist) {
char *end_ptr;
u32 tgid;
int pid = strtol(pos->s, &end_ptr, 10);
if (pid == INT_MIN || pid == INT_MAX ||
(*end_ptr != '\0' && *end_ptr != ','))
continue;
tgid = pid;
bpf_map_update_elem(fd, &tgid, &val, BPF_ANY);
}
} else if (target__has_task(target)) {
u32 pid;
u8 val = 1;
fd = bpf_map__fd(skel->maps.task_filter);
for (i = 0; i < ntasks; i++) {
pid = perf_thread_map__pid(evlist->core.threads, i);
bpf_map_update_elem(fd, &pid, &val, BPF_ANY);
}
}
if (evlist__first(evlist)->cgrp) {
struct evsel *evsel;
u8 val = 1;
fd = bpf_map__fd(skel->maps.cgroup_filter);
evlist__for_each_entry(evlist, evsel) {
struct cgroup *cgrp = evsel->cgrp;
if (cgrp == NULL)
continue;
if (!cgrp->id && read_cgroup_id(cgrp) < 0) {
pr_err("Failed to read cgroup id of %s\n",
cgrp->name);
goto out;
}
bpf_map_update_elem(fd, &cgrp->id, &val, BPF_ANY);
}
}
err = off_cpu_bpf__attach(skel);
if (err) {
pr_err("Failed to attach off-cpu BPF skeleton\n");
goto out;
}
if (perf_hooks__set_hook("record_start", off_cpu_start, evlist) ||
perf_hooks__set_hook("record_end", off_cpu_finish, evlist)) {
pr_err("Failed to attach off-cpu skeleton\n");
goto out;
}
return 0;
out:
off_cpu_bpf__destroy(skel);
return -1;
}
int off_cpu_write(struct perf_session *session)
{
int bytes = 0, size;
int fd, stack;
u64 sample_type, val, sid = 0;
struct evsel *evsel;
struct perf_data_file *file = &session->data->file;
struct off_cpu_key prev, key;
union off_cpu_data data = {
.hdr = {
.type = PERF_RECORD_SAMPLE,
.misc = PERF_RECORD_MISC_USER,
},
};
u64 tstamp = OFF_CPU_TIMESTAMP;
skel->bss->enabled = 0;
evsel = evlist__find_evsel_by_str(session->evlist, OFFCPU_EVENT);
if (evsel == NULL) {
pr_err("%s evsel not found\n", OFFCPU_EVENT);
return 0;
}
sample_type = evsel->core.attr.sample_type;
if (sample_type & ~OFFCPU_SAMPLE_TYPES) {
pr_err("not supported sample type: %llx\n",
(unsigned long long)sample_type);
return -1;
}
if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) {
if (evsel->core.id)
sid = evsel->core.id[0];
}
fd = bpf_map__fd(skel->maps.off_cpu);
stack = bpf_map__fd(skel->maps.stacks);
memset(&prev, 0, sizeof(prev));
while (!bpf_map_get_next_key(fd, &prev, &key)) {
int n = 1; /* start from perf_event_header */
int ip_pos = -1;
bpf_map_lookup_elem(fd, &key, &val);
if (sample_type & PERF_SAMPLE_IDENTIFIER)
data.array[n++] = sid;
if (sample_type & PERF_SAMPLE_IP) {
ip_pos = n;
data.array[n++] = 0; /* will be updated */
}
if (sample_type & PERF_SAMPLE_TID)
data.array[n++] = (u64)key.pid << 32 | key.tgid;
if (sample_type & PERF_SAMPLE_TIME)
data.array[n++] = tstamp;
if (sample_type & PERF_SAMPLE_ID)
data.array[n++] = sid;
if (sample_type & PERF_SAMPLE_CPU)
data.array[n++] = 0;
if (sample_type & PERF_SAMPLE_PERIOD)
data.array[n++] = val;
if (sample_type & PERF_SAMPLE_CALLCHAIN) {
int len = 0;
/* data.array[n] is callchain->nr (updated later) */
data.array[n + 1] = PERF_CONTEXT_USER;
data.array[n + 2] = 0;
bpf_map_lookup_elem(stack, &key.stack_id, &data.array[n + 2]);
while (data.array[n + 2 + len])
len++;
/* update length of callchain */
data.array[n] = len + 1;
/* update sample ip with the first callchain entry */
if (ip_pos >= 0)
data.array[ip_pos] = data.array[n + 2];
/* calculate sample callchain data array length */
n += len + 2;
}
if (sample_type & PERF_SAMPLE_CGROUP)
data.array[n++] = key.cgroup_id;
size = n * sizeof(u64);
data.hdr.size = size;
bytes += size;
if (perf_data_file__write(file, &data, size) < 0) {
pr_err("failed to write perf data, error: %m\n");
return bytes;
}
prev = key;
/* increase dummy timestamp to sort later samples */
tstamp++;
}
return bytes;
}