mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2025-01-06 05:06:29 +00:00
perf/hw_breakpoint: Optimize list of per-task breakpoints
On a machine with 256 CPUs, running the recently added perf breakpoint benchmark results in:

 | $> perf bench -r 30 breakpoint thread -b 4 -p 64 -t 64
 | # Running 'breakpoint/thread' benchmark:
 | # Created/joined 30 threads with 4 breakpoints and 64 parallelism
 | Total time: 236.418 [sec]
 |
 | 123134.794271 usecs/op
 | 7880626.833333 usecs/op/cpu

The benchmark tests inherited breakpoint perf events across many threads.

Looking at a perf profile, we can see that the majority of the time is spent in various hw_breakpoint.c functions, which execute within the 'nr_bp_mutex' critical sections which then results in contention on that mutex as well:

    37.27% [kernel] [k] osq_lock
    34.92% [kernel] [k] mutex_spin_on_owner
    12.15% [kernel] [k] toggle_bp_slot
    11.90% [kernel] [k] __reserve_bp_slot

The culprit here is task_bp_pinned(), which has a runtime complexity of O(#tasks) due to storing all task breakpoints in the same list and iterating through that list looking for a matching task. Clearly, this does not scale to thousands of tasks.

Instead, make use of the "rhashtable" variant "rhltable" which stores multiple items with the same key in a list. This results in average runtime complexity of O(1) for task_bp_pinned().

With the optimization, the benchmark shows:

 | $> perf bench -r 30 breakpoint thread -b 4 -p 64 -t 64
 | # Running 'breakpoint/thread' benchmark:
 | # Created/joined 30 threads with 4 breakpoints and 64 parallelism
 | Total time: 0.208 [sec]
 |
 | 108.422396 usecs/op
 | 6939.033333 usecs/op/cpu

On this particular setup that's a speedup of ~1135x.

While one option would be to make task_struct a breakpoint list node, this would only further bloat task_struct for infrequently used data. Furthermore, after all optimizations in this series, there's no evidence it would result in better performance: later optimizations make the time spent looking up entries in the hash table negligible (we'll reach the theoretical ideal performance i.e. no constraints).
Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dmitry Vyukov <dvyukov@google.com>
Acked-by: Ian Rogers <irogers@google.com>
Link: https://lore.kernel.org/r/20220829124719.675715-5-elver@google.com
This commit is contained in:
parent
089cdcb0cd
commit
0370dc314d
@ -36,6 +36,7 @@ struct perf_guest_info_callbacks {
|
||||
};
|
||||
|
||||
#ifdef CONFIG_HAVE_HW_BREAKPOINT
|
||||
#include <linux/rhashtable-types.h>
|
||||
#include <asm/hw_breakpoint.h>
|
||||
#endif
|
||||
|
||||
@ -178,7 +179,7 @@ struct hw_perf_event {
|
||||
* creation and event initalization.
|
||||
*/
|
||||
struct arch_hw_breakpoint info;
|
||||
struct list_head bp_list;
|
||||
struct rhlist_head bp_list;
|
||||
};
|
||||
#endif
|
||||
struct { /* amd_iommu */
|
||||
|
@ -26,10 +26,10 @@
|
||||
#include <linux/irqflags.h>
|
||||
#include <linux/kdebug.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/list.h>
|
||||
#include <linux/mutex.h>
|
||||
#include <linux/notifier.h>
|
||||
#include <linux/percpu.h>
|
||||
#include <linux/rhashtable.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/slab.h>
|
||||
|
||||
@ -54,7 +54,13 @@ static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type)
|
||||
}
|
||||
|
||||
/* Keep track of the breakpoints attached to tasks */
|
||||
static LIST_HEAD(bp_task_head);
|
||||
static struct rhltable task_bps_ht;
|
||||
static const struct rhashtable_params task_bps_ht_params = {
|
||||
.head_offset = offsetof(struct hw_perf_event, bp_list),
|
||||
.key_offset = offsetof(struct hw_perf_event, target),
|
||||
.key_len = sizeof_field(struct hw_perf_event, target),
|
||||
.automatic_shrinking = true,
|
||||
};
|
||||
|
||||
static int constraints_initialized;
|
||||
|
||||
@ -103,17 +109,23 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
|
||||
*/
|
||||
static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
|
||||
{
|
||||
struct task_struct *tsk = bp->hw.target;
|
||||
struct rhlist_head *head, *pos;
|
||||
struct perf_event *iter;
|
||||
int count = 0;
|
||||
|
||||
list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
|
||||
if (iter->hw.target == tsk &&
|
||||
find_slot_idx(iter->attr.bp_type) == type &&
|
||||
rcu_read_lock();
|
||||
head = rhltable_lookup(&task_bps_ht, &bp->hw.target, task_bps_ht_params);
|
||||
if (!head)
|
||||
goto out;
|
||||
|
||||
rhl_for_each_entry_rcu(iter, pos, head, hw.bp_list) {
|
||||
if (find_slot_idx(iter->attr.bp_type) == type &&
|
||||
(iter->cpu < 0 || cpu == iter->cpu))
|
||||
count += hw_breakpoint_weight(iter);
|
||||
}
|
||||
|
||||
out:
|
||||
rcu_read_unlock();
|
||||
return count;
|
||||
}
|
||||
|
||||
@ -186,7 +198,7 @@ static void toggle_bp_task_slot(struct perf_event *bp, int cpu,
|
||||
/*
|
||||
* Add/remove the given breakpoint in our constraint table
|
||||
*/
|
||||
static void
|
||||
static int
|
||||
toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
|
||||
int weight)
|
||||
{
|
||||
@ -199,7 +211,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
|
||||
/* Pinned counter cpu profiling */
|
||||
if (!bp->hw.target) {
|
||||
get_bp_info(bp->cpu, type)->cpu_pinned += weight;
|
||||
return;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Pinned counter task profiling */
|
||||
@ -207,9 +219,9 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
|
||||
toggle_bp_task_slot(bp, cpu, type, weight);
|
||||
|
||||
if (enable)
|
||||
list_add_tail(&bp->hw.bp_list, &bp_task_head);
|
||||
return rhltable_insert(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params);
|
||||
else
|
||||
list_del(&bp->hw.bp_list);
|
||||
return rhltable_remove(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params);
|
||||
}
|
||||
|
||||
__weak int arch_reserve_bp_slot(struct perf_event *bp)
|
||||
@ -307,9 +319,7 @@ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type)
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
toggle_bp_slot(bp, true, type, weight);
|
||||
|
||||
return 0;
|
||||
return toggle_bp_slot(bp, true, type, weight);
|
||||
}
|
||||
|
||||
int reserve_bp_slot(struct perf_event *bp)
|
||||
@ -334,7 +344,7 @@ static void __release_bp_slot(struct perf_event *bp, u64 bp_type)
|
||||
|
||||
type = find_slot_idx(bp_type);
|
||||
weight = hw_breakpoint_weight(bp);
|
||||
toggle_bp_slot(bp, false, type, weight);
|
||||
WARN_ON(toggle_bp_slot(bp, false, type, weight));
|
||||
}
|
||||
|
||||
void release_bp_slot(struct perf_event *bp)
|
||||
@ -707,7 +717,7 @@ static struct pmu perf_breakpoint = {
|
||||
int __init init_hw_breakpoint(void)
|
||||
{
|
||||
int cpu, err_cpu;
|
||||
int i;
|
||||
int i, ret;
|
||||
|
||||
for (i = 0; i < TYPE_MAX; i++)
|
||||
nr_slots[i] = hw_breakpoint_slots(i);
|
||||
@ -718,18 +728,24 @@ int __init init_hw_breakpoint(void)
|
||||
|
||||
info->tsk_pinned = kcalloc(nr_slots[i], sizeof(int),
|
||||
GFP_KERNEL);
|
||||
if (!info->tsk_pinned)
|
||||
goto err_alloc;
|
||||
if (!info->tsk_pinned) {
|
||||
ret = -ENOMEM;
|
||||
goto err;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ret = rhltable_init(&task_bps_ht, &task_bps_ht_params);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
constraints_initialized = 1;
|
||||
|
||||
perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT);
|
||||
|
||||
return register_die_notifier(&hw_breakpoint_exceptions_nb);
|
||||
|
||||
err_alloc:
|
||||
err:
|
||||
for_each_possible_cpu(err_cpu) {
|
||||
for (i = 0; i < TYPE_MAX; i++)
|
||||
kfree(get_bp_info(err_cpu, i)->tsk_pinned);
|
||||
@ -737,7 +753,5 @@ int __init init_hw_breakpoint(void)
|
||||
break;
|
||||
}
|
||||
|
||||
return -ENOMEM;
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user