perf: Optimize context reschedule for single PMU cases

Currently re-scheduling a context will reschedule all active PMUs for
that context, even if it is known only a single event is added.

Namhyung reported that changing this to only reschedule the affected
PMU when possible provides significant performance gains under certain
conditions.

Therefore, allow partial context reschedules for a specific PMU, that
of the event modified.

While the patch looks somewhat noisy, it mostly just propagates a new
@pmu argument through the callchain and modifies the epc loop to only
pick the 'epc->pmu == @pmu' case.

Reported-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Kan Liang <kan.liang@linux.intel.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/r/20240807115549.920950699@infradead.org
This commit is contained in:
Peter Zijlstra 2024-08-07 13:29:25 +02:00
parent ea1992f36b
commit 2d17cf1abc

View File

@ -685,30 +685,32 @@ do { \
___p; \
})
#define for_each_epc(_epc, _ctx, _pmu, _cgroup) \
list_for_each_entry(_epc, &((_ctx)->pmu_ctx_list), pmu_ctx_entry) \
if (_cgroup && !_epc->nr_cgroups) \
continue; \
else if (_pmu && _epc->pmu != _pmu) \
continue; \
else
static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup)
{
struct perf_event_pmu_context *pmu_ctx;
list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
if (cgroup && !pmu_ctx->nr_cgroups)
continue;
for_each_epc(pmu_ctx, ctx, NULL, cgroup)
perf_pmu_disable(pmu_ctx->pmu);
}
}
static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup)
{
struct perf_event_pmu_context *pmu_ctx;
list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
if (cgroup && !pmu_ctx->nr_cgroups)
continue;
for_each_epc(pmu_ctx, ctx, NULL, cgroup)
perf_pmu_enable(pmu_ctx->pmu);
}
}
static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type);
static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type);
static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type);
#ifdef CONFIG_CGROUP_PERF
@ -865,7 +867,7 @@ static void perf_cgroup_switch(struct task_struct *task)
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
perf_ctx_disable(&cpuctx->ctx, true);
ctx_sched_out(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
/*
* must not be done before ctxswout due
* to update_cgrp_time_from_cpuctx() in
@ -877,7 +879,7 @@ static void perf_cgroup_switch(struct task_struct *task)
* perf_cgroup_set_timestamp() in ctx_sched_in()
* to not have to pass task around
*/
ctx_sched_in(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
perf_ctx_enable(&cpuctx->ctx, true);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@ -2656,7 +2658,8 @@ static void add_event_to_ctx(struct perf_event *event,
}
static void task_ctx_sched_out(struct perf_event_context *ctx,
enum event_type_t event_type)
struct pmu *pmu,
enum event_type_t event_type)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
@ -2666,18 +2669,19 @@ static void task_ctx_sched_out(struct perf_event_context *ctx,
if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
return;
ctx_sched_out(ctx, event_type);
ctx_sched_out(ctx, pmu, event_type);
}
static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
struct perf_event_context *ctx,
struct pmu *pmu)
{
ctx_sched_in(&cpuctx->ctx, EVENT_PINNED);
ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED);
if (ctx)
ctx_sched_in(ctx, EVENT_PINNED);
ctx_sched_in(&cpuctx->ctx, EVENT_FLEXIBLE);
ctx_sched_in(ctx, pmu, EVENT_PINNED);
ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
if (ctx)
ctx_sched_in(ctx, EVENT_FLEXIBLE);
ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE);
}
/*
@ -2695,16 +2699,12 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
* event_type is a bit mask of the types of events involved. For CPU events,
* event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
*/
/*
* XXX: ctx_resched() reschedule entire perf_event_context while adding new
* event to the context or enabling existing event in the context. We can
* probably optimize it by rescheduling only affected pmu_ctx.
*/
static void ctx_resched(struct perf_cpu_context *cpuctx,
struct perf_event_context *task_ctx,
enum event_type_t event_type)
struct pmu *pmu, enum event_type_t event_type)
{
bool cpu_event = !!(event_type & EVENT_CPU);
struct perf_event_pmu_context *epc;
/*
* If pinned groups are involved, flexible groups also need to be
@ -2715,10 +2715,14 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
event_type &= EVENT_ALL;
perf_ctx_disable(&cpuctx->ctx, false);
for_each_epc(epc, &cpuctx->ctx, pmu, false)
perf_pmu_disable(epc->pmu);
if (task_ctx) {
perf_ctx_disable(task_ctx, false);
task_ctx_sched_out(task_ctx, event_type);
for_each_epc(epc, task_ctx, pmu, false)
perf_pmu_disable(epc->pmu);
task_ctx_sched_out(task_ctx, pmu, event_type);
}
/*
@ -2729,15 +2733,19 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
* - otherwise, do nothing more.
*/
if (cpu_event)
ctx_sched_out(&cpuctx->ctx, event_type);
ctx_sched_out(&cpuctx->ctx, pmu, event_type);
else if (event_type & EVENT_PINNED)
ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
perf_event_sched_in(cpuctx, task_ctx);
perf_event_sched_in(cpuctx, task_ctx, pmu);
perf_ctx_enable(&cpuctx->ctx, false);
if (task_ctx)
perf_ctx_enable(task_ctx, false);
for_each_epc(epc, &cpuctx->ctx, pmu, false)
perf_pmu_enable(epc->pmu);
if (task_ctx) {
for_each_epc(epc, task_ctx, pmu, false)
perf_pmu_enable(epc->pmu);
}
}
void perf_pmu_resched(struct pmu *pmu)
@ -2746,7 +2754,7 @@ void perf_pmu_resched(struct pmu *pmu)
struct perf_event_context *task_ctx = cpuctx->task_ctx;
perf_ctx_lock(cpuctx, task_ctx);
ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
ctx_resched(cpuctx, task_ctx, pmu, EVENT_ALL|EVENT_CPU);
perf_ctx_unlock(cpuctx, task_ctx);
}
@ -2802,9 +2810,10 @@ static int __perf_install_in_context(void *info)
#endif
if (reprogram) {
ctx_sched_out(ctx, EVENT_TIME);
ctx_sched_out(ctx, NULL, EVENT_TIME);
add_event_to_ctx(event, ctx);
ctx_resched(cpuctx, task_ctx, get_event_type(event));
ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
get_event_type(event));
} else {
add_event_to_ctx(event, ctx);
}
@ -2948,7 +2957,7 @@ static void __perf_event_enable(struct perf_event *event,
return;
if (ctx->is_active)
ctx_sched_out(ctx, EVENT_TIME);
ctx_sched_out(ctx, NULL, EVENT_TIME);
perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
perf_cgroup_event_enable(event, ctx);
@ -2957,7 +2966,7 @@ static void __perf_event_enable(struct perf_event *event,
return;
if (!event_filter_match(event)) {
ctx_sched_in(ctx, EVENT_TIME);
ctx_sched_in(ctx, NULL, EVENT_TIME);
return;
}
@ -2966,7 +2975,7 @@ static void __perf_event_enable(struct perf_event *event,
* then don't put it on unless the group is on.
*/
if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
ctx_sched_in(ctx, EVENT_TIME);
ctx_sched_in(ctx, NULL, EVENT_TIME);
return;
}
@ -2974,7 +2983,7 @@ static void __perf_event_enable(struct perf_event *event,
if (ctx->task)
WARN_ON_ONCE(task_ctx != ctx);
ctx_resched(cpuctx, task_ctx, get_event_type(event));
ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, get_event_type(event));
}
/*
@ -3276,8 +3285,17 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
perf_pmu_enable(pmu);
}
/*
* Be very careful with the @pmu argument since this will change ctx state.
* The @pmu argument works for ctx_resched(), because that is symmetric in
* ctx_sched_out() / ctx_sched_in() usage and the ctx state ends up invariant.
*
* However, if you were to be asymmetrical, you could end up with messed up
* state, eg. ctx->is_active cleared even though most EPCs would still actually
* be active.
*/
static void
ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_pmu_context *pmu_ctx;
@ -3331,11 +3349,8 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
is_active ^= ctx->is_active; /* changed bits */
list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
if (cgroup && !pmu_ctx->nr_cgroups)
continue;
for_each_epc(pmu_ctx, ctx, pmu, cgroup)
__pmu_ctx_sched_out(pmu_ctx, is_active);
}
}
/*
@ -3579,7 +3594,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
inside_switch:
perf_ctx_sched_task_cb(ctx, false);
task_ctx_sched_out(ctx, EVENT_ALL);
task_ctx_sched_out(ctx, NULL, EVENT_ALL);
perf_ctx_enable(ctx, false);
raw_spin_unlock(&ctx->lock);
@ -3877,29 +3892,22 @@ static void pmu_groups_sched_in(struct perf_event_context *ctx,
merge_sched_in, &can_add_hw);
}
static void ctx_groups_sched_in(struct perf_event_context *ctx,
struct perf_event_groups *groups,
bool cgroup)
static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx,
enum event_type_t event_type)
{
struct perf_event_pmu_context *pmu_ctx;
struct perf_event_context *ctx = pmu_ctx->ctx;
list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
if (cgroup && !pmu_ctx->nr_cgroups)
continue;
pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu);
}
}
static void __pmu_ctx_sched_in(struct perf_event_context *ctx,
struct pmu *pmu)
{
pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
if (event_type & EVENT_PINNED)
pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu);
if (event_type & EVENT_FLEXIBLE)
pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu);
}
static void
ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_pmu_context *pmu_ctx;
int is_active = ctx->is_active;
bool cgroup = event_type & EVENT_CGROUP;
@ -3935,12 +3943,16 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
* First go through the list and put on any pinned groups
* in order to give them the best chance of going on.
*/
if (is_active & EVENT_PINNED)
ctx_groups_sched_in(ctx, &ctx->pinned_groups, cgroup);
if (is_active & EVENT_PINNED) {
for_each_epc(pmu_ctx, ctx, pmu, cgroup)
__pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED);
}
/* Then walk through the lower prio flexible groups */
if (is_active & EVENT_FLEXIBLE)
ctx_groups_sched_in(ctx, &ctx->flexible_groups, cgroup);
if (is_active & EVENT_FLEXIBLE) {
for_each_epc(pmu_ctx, ctx, pmu, cgroup)
__pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE);
}
}
static void perf_event_context_sched_in(struct task_struct *task)
@ -3983,10 +3995,10 @@ static void perf_event_context_sched_in(struct task_struct *task)
*/
if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
perf_ctx_disable(&cpuctx->ctx, false);
ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE);
}
perf_event_sched_in(cpuctx, ctx);
perf_event_sched_in(cpuctx, ctx, NULL);
perf_ctx_sched_task_cb(cpuctx->task_ctx, true);
@ -4327,14 +4339,14 @@ static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc)
update_context_time(&cpuctx->ctx);
__pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE);
rotate_ctx(&cpuctx->ctx, cpu_event);
__pmu_ctx_sched_in(&cpuctx->ctx, pmu);
__pmu_ctx_sched_in(cpu_epc, EVENT_FLEXIBLE);
}
if (task_event)
rotate_ctx(task_epc->ctx, task_event);
if (task_event || (task_epc && cpu_event))
__pmu_ctx_sched_in(task_epc->ctx, pmu);
__pmu_ctx_sched_in(task_epc, EVENT_FLEXIBLE);
perf_pmu_enable(pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@ -4400,7 +4412,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
cpuctx = this_cpu_ptr(&perf_cpu_context);
perf_ctx_lock(cpuctx, ctx);
ctx_sched_out(ctx, EVENT_TIME);
ctx_sched_out(ctx, NULL, EVENT_TIME);
list_for_each_entry(event, &ctx->event_list, event_entry) {
enabled |= event_enable_on_exec(event, ctx);
@ -4412,9 +4424,9 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
*/
if (enabled) {
clone_ctx = unclone_ctx(ctx);
ctx_resched(cpuctx, ctx, event_type);
ctx_resched(cpuctx, ctx, NULL, event_type);
} else {
ctx_sched_in(ctx, EVENT_TIME);
ctx_sched_in(ctx, NULL, EVENT_TIME);
}
perf_ctx_unlock(cpuctx, ctx);
@ -13202,7 +13214,7 @@ static void perf_event_exit_task_context(struct task_struct *child)
* in.
*/
raw_spin_lock_irq(&child_ctx->lock);
task_ctx_sched_out(child_ctx, EVENT_ALL);
task_ctx_sched_out(child_ctx, NULL, EVENT_ALL);
/*
* Now that the context is inactive, destroy the task <-> ctx relation
@ -13751,7 +13763,7 @@ static void __perf_event_exit_context(void *__info)
struct perf_event *event;
raw_spin_lock(&ctx->lock);
ctx_sched_out(ctx, EVENT_TIME);
ctx_sched_out(ctx, NULL, EVENT_TIME);
list_for_each_entry(event, &ctx->event_list, event_entry)
__perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
raw_spin_unlock(&ctx->lock);