perf stat: Support inherit events during fork() for bperf

bperf has a nice ability to share PMUs, but it still does not support
inherit events during fork(), resulting in some deviations in its stat
results compared with perf.

perf stat result:
$ ./perf stat -e cycles,instructions -- ./perf test -w sqrtloop
   Performance counter stats for './perf test -w sqrtloop':

       2,316,038,116      cycles
       2,859,350,725      instructions

         1.009603637 seconds time elapsed

         1.004196000 seconds user
         0.003950000 seconds sys

bperf stat result:
$ ./perf stat --bpf-counters -e cycles,instructions -- \
      ./perf test -w sqrtloop

   Performance counter stats for './perf test -w sqrtloop':

          18,762,093      cycles
          23,487,766      instructions

         1.008913769 seconds time elapsed

         1.003248000 seconds user
         0.004069000 seconds sys

In order to support event inheritance, two new bpf programs are added
to monitor the fork and exit of tasks respectively. When a task is
created, add it to the filter map to enable counting, and reuse the
`accum_key` of its parent task to count together with the parent task.
When a task exits, remove it from the filter map to disable counting.

After support:
$ ./perf stat --bpf-counters -e cycles,instructions -- \
      ./perf test -w sqrtloop

 Performance counter stats for './perf test -w sqrtloop':

     2,316,252,189      cycles
     2,859,946,547      instructions

       1.009422314 seconds time elapsed

       1.003597000 seconds user
       0.004270000 seconds sys

Signed-off-by: Tengda Wu <wutengda@huaweicloud.com>
Cc: song@kernel.org
Cc: bpf@vger.kernel.org
Link: https://lore.kernel.org/r/20241021110201.325617-2-wutengda@huaweicloud.com
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
This commit is contained in:
Tengda Wu 2024-10-21 11:02:00 +00:00 committed by Namhyung Kim
parent ba993e5ada
commit 07dc3a6de3
5 changed files with 126 additions and 14 deletions

View File

@ -2641,6 +2641,7 @@ int cmd_stat(int argc, const char **argv)
} else if (big_num_opt == 0) /* User passed --no-big-num */
stat_config.big_num = false;
target.inherit = !stat_config.no_inherit;
err = target__validate(&target);
if (err) {
target__strerror(&target, err, errbuf, BUFSIZ);

View File

@ -394,6 +394,7 @@ static int bperf_check_target(struct evsel *evsel,
}
static struct perf_cpu_map *all_cpu_map;
static __u32 filter_entry_cnt;
static int bperf_reload_leader_program(struct evsel *evsel, int attr_map_fd,
struct perf_event_attr_map_entry *entry)
@ -444,12 +445,32 @@ static int bperf_reload_leader_program(struct evsel *evsel, int attr_map_fd,
return err;
}
static int bperf_attach_follower_program(struct bperf_follower_bpf *skel,
enum bperf_filter_type filter_type,
bool inherit)
{
struct bpf_link *link;
int err = 0;
if ((filter_type == BPERF_FILTER_PID ||
filter_type == BPERF_FILTER_TGID) && inherit)
/* attach all follower bpf progs to enable event inheritance */
err = bperf_follower_bpf__attach(skel);
else {
link = bpf_program__attach(skel->progs.fexit_XXX);
if (IS_ERR(link))
err = PTR_ERR(link);
}
return err;
}
static int bperf__load(struct evsel *evsel, struct target *target)
{
struct perf_event_attr_map_entry entry = {0xffffffff, 0xffffffff};
int attr_map_fd, diff_map_fd = -1, err;
enum bperf_filter_type filter_type;
__u32 filter_entry_cnt, i;
__u32 i;
if (bperf_check_target(evsel, target, &filter_type, &filter_entry_cnt))
return -1;
@ -529,9 +550,6 @@ static int bperf__load(struct evsel *evsel, struct target *target)
/* set up reading map */
bpf_map__set_max_entries(evsel->follower_skel->maps.accum_readings,
filter_entry_cnt);
/* set up follower filter based on target */
bpf_map__set_max_entries(evsel->follower_skel->maps.filter,
filter_entry_cnt);
err = bperf_follower_bpf__load(evsel->follower_skel);
if (err) {
pr_err("Failed to load follower skeleton\n");
@ -543,6 +561,7 @@ static int bperf__load(struct evsel *evsel, struct target *target)
for (i = 0; i < filter_entry_cnt; i++) {
int filter_map_fd;
__u32 key;
struct bperf_filter_value fval = { i, 0 };
if (filter_type == BPERF_FILTER_PID ||
filter_type == BPERF_FILTER_TGID)
@ -553,12 +572,14 @@ static int bperf__load(struct evsel *evsel, struct target *target)
break;
filter_map_fd = bpf_map__fd(evsel->follower_skel->maps.filter);
bpf_map_update_elem(filter_map_fd, &key, &i, BPF_ANY);
bpf_map_update_elem(filter_map_fd, &key, &fval, BPF_ANY);
}
evsel->follower_skel->bss->type = filter_type;
evsel->follower_skel->bss->inherit = target->inherit;
err = bperf_follower_bpf__attach(evsel->follower_skel);
err = bperf_attach_follower_program(evsel->follower_skel, filter_type,
target->inherit);
out:
if (err && evsel->bperf_leader_link_fd >= 0)
@ -623,7 +644,7 @@ static int bperf__read(struct evsel *evsel)
bperf_sync_counters(evsel);
reading_map_fd = bpf_map__fd(skel->maps.accum_readings);
for (i = 0; i < bpf_map__max_entries(skel->maps.accum_readings); i++) {
for (i = 0; i < filter_entry_cnt; i++) {
struct perf_cpu entry;
__u32 cpu;

View File

@ -5,6 +5,8 @@
#include <bpf/bpf_tracing.h>
#include "bperf_u.h"
#define MAX_ENTRIES 102400
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__uint(key_size, sizeof(__u32));
@ -22,25 +24,29 @@ struct {
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__uint(key_size, sizeof(__u32));
__uint(value_size, sizeof(__u32));
__uint(value_size, sizeof(struct bperf_filter_value));
__uint(max_entries, MAX_ENTRIES);
__uint(map_flags, BPF_F_NO_PREALLOC);
} filter SEC(".maps");
enum bperf_filter_type type = 0;
int enabled = 0;
int inherit;
SEC("fexit/XXX")
int BPF_PROG(fexit_XXX)
{
struct bpf_perf_event_value *diff_val, *accum_val;
__u32 filter_key, zero = 0;
__u32 *accum_key;
__u32 accum_key;
struct bperf_filter_value *fval;
if (!enabled)
return 0;
switch (type) {
case BPERF_FILTER_GLOBAL:
accum_key = &zero;
accum_key = zero;
goto do_add;
case BPERF_FILTER_CPU:
filter_key = bpf_get_smp_processor_id();
@ -49,22 +55,34 @@ int BPF_PROG(fexit_XXX)
filter_key = bpf_get_current_pid_tgid() & 0xffffffff;
break;
case BPERF_FILTER_TGID:
filter_key = bpf_get_current_pid_tgid() >> 32;
/* Use pid as the filter_key to exclude new task counts
* when inherit is disabled. Don't worry about the existing
* children in TGID losing their counts, bpf_counter has
* already added them to the filter map via perf_thread_map
* before this bpf prog runs.
*/
filter_key = inherit ?
bpf_get_current_pid_tgid() >> 32 :
bpf_get_current_pid_tgid() & 0xffffffff;
break;
default:
return 0;
}
accum_key = bpf_map_lookup_elem(&filter, &filter_key);
if (!accum_key)
fval = bpf_map_lookup_elem(&filter, &filter_key);
if (!fval)
return 0;
accum_key = fval->accum_key;
if (fval->exited)
bpf_map_delete_elem(&filter, &filter_key);
do_add:
diff_val = bpf_map_lookup_elem(&diff_readings, &zero);
if (!diff_val)
return 0;
accum_val = bpf_map_lookup_elem(&accum_readings, accum_key);
accum_val = bpf_map_lookup_elem(&accum_readings, &accum_key);
if (!accum_val)
return 0;
@ -75,4 +93,70 @@ int BPF_PROG(fexit_XXX)
return 0;
}
/* The program is only used for PID or TGID filter types. */
SEC("tp_btf/task_newtask")
int BPF_PROG(on_newtask, struct task_struct *task, __u64 clone_flags)
{
__u32 parent_key, child_key;
struct bperf_filter_value *parent_fval;
struct bperf_filter_value child_fval = { 0 };
if (!enabled)
return 0;
switch (type) {
case BPERF_FILTER_PID:
parent_key = bpf_get_current_pid_tgid() & 0xffffffff;
child_key = task->pid;
break;
case BPERF_FILTER_TGID:
parent_key = bpf_get_current_pid_tgid() >> 32;
child_key = task->tgid;
if (child_key == parent_key)
return 0;
break;
default:
return 0;
}
/* Check if the current task is one of the target tasks to be counted */
parent_fval = bpf_map_lookup_elem(&filter, &parent_key);
if (!parent_fval)
return 0;
/* Start counting for the new task by adding it into filter map,
* inherit the accum key of its parent task so that they can be
* counted together.
*/
child_fval.accum_key = parent_fval->accum_key;
child_fval.exited = 0;
bpf_map_update_elem(&filter, &child_key, &child_fval, BPF_NOEXIST);
return 0;
}
/* The program is only used for PID or TGID filter types. */
SEC("tp_btf/sched_process_exit")
int BPF_PROG(on_exittask, struct task_struct *task)
{
__u32 pid;
struct bperf_filter_value *fval;
if (!enabled)
return 0;
/* Stop counting for this task by removing it from filter map.
* For TGID type, if the pid can be found in the map, it means that
* this pid belongs to the leader task. After the task exits, the
* tgid of its child tasks (if any) will be 1, so the pid can be
* safely removed.
*/
pid = task->pid;
fval = bpf_map_lookup_elem(&filter, &pid);
if (fval)
fval->exited = 1;
return 0;
}
char LICENSE[] SEC("license") = "Dual BSD/GPL";

View File

@ -11,4 +11,9 @@ enum bperf_filter_type {
BPERF_FILTER_TGID,
};
struct bperf_filter_value {
__u32 accum_key;
__u8 exited;
};
#endif /* __BPERF_STAT_U_H */

View File

@ -17,6 +17,7 @@ struct target {
bool default_per_cpu;
bool per_thread;
bool use_bpf;
bool inherit;
int initial_delay;
const char *attr_map;
};