mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
synced 2025-01-15 21:23:23 +00:00
perf/hw_breakpoint: Reduce contention with large number of tasks
While optimizing task_bp_pinned()'s runtime complexity to O(1) on average helps reduce time spent in the critical section, we still suffer due to serializing everything via 'nr_bp_mutex'. Indeed, a profile shows that now contention is the biggest issue: 95.93% [kernel] [k] osq_lock 0.70% [kernel] [k] mutex_spin_on_owner 0.22% [kernel] [k] smp_cfm_core_cond 0.18% [kernel] [k] task_bp_pinned 0.18% [kernel] [k] rhashtable_jhash2 0.15% [kernel] [k] queued_spin_lock_slowpath when running the breakpoint benchmark with (system with 256 CPUs): | $> perf bench -r 30 breakpoint thread -b 4 -p 64 -t 64 | # Running 'breakpoint/thread' benchmark: | # Created/joined 30 threads with 4 breakpoints and 64 parallelism | Total time: 0.207 [sec] | | 108.267188 usecs/op | 6929.100000 usecs/op/cpu The main concern for synchronizing the breakpoint constraints data is that a consistent snapshot of the per-CPU and per-task data is observed. The access pattern is as follows: 1. If the target is a task: the task's pinned breakpoints are counted, checked for space, and then appended to; only bp_cpuinfo::cpu_pinned is used to check for conflicts with CPU-only breakpoints; bp_cpuinfo::tsk_pinned are incremented/decremented, but otherwise unused. 2. If the target is a CPU: bp_cpuinfo::cpu_pinned are counted, along with bp_cpuinfo::tsk_pinned; after a successful check, cpu_pinned is incremented. No per-task breakpoints are checked. Since rhltable safely synchronizes insertions/deletions, we can allow concurrency as follows: 1. If the target is a task: independent tasks may update and check the constraints concurrently, but same-task target calls need to be serialized; since bp_cpuinfo::tsk_pinned is only updated, but not checked, these modifications can happen concurrently by switching tsk_pinned to atomic_t. 2. If the target is a CPU: access to the per-CPU constraints needs to be serialized with other CPU-target and task-target callers (to stabilize the bp_cpuinfo::tsk_pinned snapshot). We can allow the above concurrency by introducing a per-CPU constraints data reader-writer lock (bp_cpuinfo_sem), and per-task mutexes (reuses task_struct::perf_event_mutex): 1. If the target is a task: acquires perf_event_mutex, and acquires bp_cpuinfo_sem as a reader. The choice of percpu-rwsem minimizes contention in the presence of many read-lock but few write-lock acquisitions: we assume many orders of magnitude more task target breakpoints creations/destructions than CPU target breakpoints. 2. If the target is a CPU: acquires bp_cpuinfo_sem as a writer. With these changes, contention with thousands of tasks is reduced to the point where waiting on locking no longer dominates the profile: | $> perf bench -r 30 breakpoint thread -b 4 -p 64 -t 64 | # Running 'breakpoint/thread' benchmark: | # Created/joined 30 threads with 4 breakpoints and 64 parallelism | Total time: 0.077 [sec] | | 40.201563 usecs/op | 2572.900000 usecs/op/cpu 21.54% [kernel] [k] task_bp_pinned 20.18% [kernel] [k] rhashtable_jhash2 6.81% [kernel] [k] toggle_bp_slot 5.47% [kernel] [k] queued_spin_lock_slowpath 3.75% [kernel] [k] smp_cfm_core_cond 3.48% [kernel] [k] bcmp On this particular setup that's a speedup of 2.7x. We're also getting closer to the theoretical ideal performance through optimizations in hw_breakpoint.c -- constraints accounting disabled: | perf bench -r 30 breakpoint thread -b 4 -p 64 -t 64 | # Running 'breakpoint/thread' benchmark: | # Created/joined 30 threads with 4 breakpoints and 64 parallelism | Total time: 0.067 [sec] | | 35.286458 usecs/op | 2258.333333 usecs/op/cpu Which means the current implementation is ~12% slower than the theoretical ideal. For reference, performance without any breakpoints: | $> bench -r 30 breakpoint thread -b 0 -p 64 -t 64 | # Running 'breakpoint/thread' benchmark: | # Created/joined 30 threads with 0 breakpoints and 64 parallelism | Total time: 0.060 [sec] | | 31.365625 usecs/op | 2007.400000 usecs/op/cpu On a system with 256 CPUs, the theoretical ideal is only ~12% slower than no breakpoints at all; the current implementation is ~28% slower. Signed-off-by: Marco Elver <elver@google.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Reviewed-by: Dmitry Vyukov <dvyukov@google.com> Acked-by: Ian Rogers <irogers@google.com> Link: https://lore.kernel.org/r/20220829124719.675715-12-elver@google.com
This commit is contained in:
parent
01fe8a3f81
commit
0912037fec
@ -19,6 +19,7 @@
|
|||||||
|
|
||||||
#include <linux/hw_breakpoint.h>
|
#include <linux/hw_breakpoint.h>
|
||||||
|
|
||||||
|
#include <linux/atomic.h>
|
||||||
#include <linux/bug.h>
|
#include <linux/bug.h>
|
||||||
#include <linux/cpu.h>
|
#include <linux/cpu.h>
|
||||||
#include <linux/export.h>
|
#include <linux/export.h>
|
||||||
@ -28,6 +29,7 @@
|
|||||||
#include <linux/kernel.h>
|
#include <linux/kernel.h>
|
||||||
#include <linux/mutex.h>
|
#include <linux/mutex.h>
|
||||||
#include <linux/notifier.h>
|
#include <linux/notifier.h>
|
||||||
|
#include <linux/percpu-rwsem.h>
|
||||||
#include <linux/percpu.h>
|
#include <linux/percpu.h>
|
||||||
#include <linux/rhashtable.h>
|
#include <linux/rhashtable.h>
|
||||||
#include <linux/sched.h>
|
#include <linux/sched.h>
|
||||||
@ -41,9 +43,9 @@ struct bp_cpuinfo {
|
|||||||
unsigned int cpu_pinned;
|
unsigned int cpu_pinned;
|
||||||
/* tsk_pinned[n] is the number of tasks having n+1 breakpoints */
|
/* tsk_pinned[n] is the number of tasks having n+1 breakpoints */
|
||||||
#ifdef hw_breakpoint_slots
|
#ifdef hw_breakpoint_slots
|
||||||
unsigned int tsk_pinned[hw_breakpoint_slots(0)];
|
atomic_t tsk_pinned[hw_breakpoint_slots(0)];
|
||||||
#else
|
#else
|
||||||
unsigned int *tsk_pinned;
|
atomic_t *tsk_pinned;
|
||||||
#endif
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -65,8 +67,79 @@ static const struct rhashtable_params task_bps_ht_params = {
|
|||||||
|
|
||||||
static bool constraints_initialized __ro_after_init;
|
static bool constraints_initialized __ro_after_init;
|
||||||
|
|
||||||
/* Serialize accesses to the above constraints */
|
/*
|
||||||
static DEFINE_MUTEX(nr_bp_mutex);
|
* Synchronizes accesses to the per-CPU constraints; the locking rules are:
|
||||||
|
*
|
||||||
|
* 1. Atomic updates to bp_cpuinfo::tsk_pinned only require a held read-lock
|
||||||
|
* (due to bp_slots_histogram::count being atomic, no update are lost).
|
||||||
|
*
|
||||||
|
* 2. Holding a write-lock is required for computations that require a
|
||||||
|
* stable snapshot of all bp_cpuinfo::tsk_pinned.
|
||||||
|
*
|
||||||
|
* 3. In all other cases, non-atomic accesses require the appropriately held
|
||||||
|
* lock (read-lock for read-only accesses; write-lock for reads/writes).
|
||||||
|
*/
|
||||||
|
DEFINE_STATIC_PERCPU_RWSEM(bp_cpuinfo_sem);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Return mutex to serialize accesses to per-task lists in task_bps_ht. Since
|
||||||
|
* rhltable synchronizes concurrent insertions/deletions, independent tasks may
|
||||||
|
* insert/delete concurrently; therefore, a mutex per task is sufficient.
|
||||||
|
*
|
||||||
|
* Uses task_struct::perf_event_mutex, to avoid extending task_struct with a
|
||||||
|
* hw_breakpoint-only mutex, which may be infrequently used. The caveat here is
|
||||||
|
* that hw_breakpoint may contend with per-task perf event list management. The
|
||||||
|
* assumption is that perf usecases involving hw_breakpoints are very unlikely
|
||||||
|
* to result in unnecessary contention.
|
||||||
|
*/
|
||||||
|
static inline struct mutex *get_task_bps_mutex(struct perf_event *bp)
|
||||||
|
{
|
||||||
|
struct task_struct *tsk = bp->hw.target;
|
||||||
|
|
||||||
|
return tsk ? &tsk->perf_event_mutex : NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
static struct mutex *bp_constraints_lock(struct perf_event *bp)
|
||||||
|
{
|
||||||
|
struct mutex *tsk_mtx = get_task_bps_mutex(bp);
|
||||||
|
|
||||||
|
if (tsk_mtx) {
|
||||||
|
mutex_lock(tsk_mtx);
|
||||||
|
percpu_down_read(&bp_cpuinfo_sem);
|
||||||
|
} else {
|
||||||
|
percpu_down_write(&bp_cpuinfo_sem);
|
||||||
|
}
|
||||||
|
|
||||||
|
return tsk_mtx;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void bp_constraints_unlock(struct mutex *tsk_mtx)
|
||||||
|
{
|
||||||
|
if (tsk_mtx) {
|
||||||
|
percpu_up_read(&bp_cpuinfo_sem);
|
||||||
|
mutex_unlock(tsk_mtx);
|
||||||
|
} else {
|
||||||
|
percpu_up_write(&bp_cpuinfo_sem);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool bp_constraints_is_locked(struct perf_event *bp)
|
||||||
|
{
|
||||||
|
struct mutex *tsk_mtx = get_task_bps_mutex(bp);
|
||||||
|
|
||||||
|
return percpu_is_write_locked(&bp_cpuinfo_sem) ||
|
||||||
|
(tsk_mtx ? mutex_is_locked(tsk_mtx) :
|
||||||
|
percpu_is_read_locked(&bp_cpuinfo_sem));
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void assert_bp_constraints_lock_held(struct perf_event *bp)
|
||||||
|
{
|
||||||
|
struct mutex *tsk_mtx = get_task_bps_mutex(bp);
|
||||||
|
|
||||||
|
if (tsk_mtx)
|
||||||
|
lockdep_assert_held(tsk_mtx);
|
||||||
|
lockdep_assert_held(&bp_cpuinfo_sem);
|
||||||
|
}
|
||||||
|
|
||||||
#ifdef hw_breakpoint_slots
|
#ifdef hw_breakpoint_slots
|
||||||
/*
|
/*
|
||||||
@ -97,7 +170,7 @@ static __init int init_breakpoint_slots(void)
|
|||||||
for (i = 0; i < TYPE_MAX; i++) {
|
for (i = 0; i < TYPE_MAX; i++) {
|
||||||
struct bp_cpuinfo *info = get_bp_info(cpu, i);
|
struct bp_cpuinfo *info = get_bp_info(cpu, i);
|
||||||
|
|
||||||
info->tsk_pinned = kcalloc(__nr_bp_slots[i], sizeof(int), GFP_KERNEL);
|
info->tsk_pinned = kcalloc(__nr_bp_slots[i], sizeof(atomic_t), GFP_KERNEL);
|
||||||
if (!info->tsk_pinned)
|
if (!info->tsk_pinned)
|
||||||
goto err;
|
goto err;
|
||||||
}
|
}
|
||||||
@ -137,11 +210,19 @@ static inline enum bp_type_idx find_slot_idx(u64 bp_type)
|
|||||||
*/
|
*/
|
||||||
static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
|
static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
|
||||||
{
|
{
|
||||||
unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
|
atomic_t *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* At this point we want to have acquired the bp_cpuinfo_sem as a
|
||||||
|
* writer to ensure that there are no concurrent writers in
|
||||||
|
* toggle_bp_task_slot() to tsk_pinned, and we get a stable snapshot.
|
||||||
|
*/
|
||||||
|
lockdep_assert_held_write(&bp_cpuinfo_sem);
|
||||||
|
|
||||||
for (i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) {
|
for (i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) {
|
||||||
if (tsk_pinned[i] > 0)
|
ASSERT_EXCLUSIVE_WRITER(tsk_pinned[i]); /* Catch unexpected writers. */
|
||||||
|
if (atomic_read(&tsk_pinned[i]) > 0)
|
||||||
return i + 1;
|
return i + 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -158,6 +239,11 @@ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
|
|||||||
struct perf_event *iter;
|
struct perf_event *iter;
|
||||||
int count = 0;
|
int count = 0;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We need a stable snapshot of the per-task breakpoint list.
|
||||||
|
*/
|
||||||
|
assert_bp_constraints_lock_held(bp);
|
||||||
|
|
||||||
rcu_read_lock();
|
rcu_read_lock();
|
||||||
head = rhltable_lookup(&task_bps_ht, &bp->hw.target, task_bps_ht_params);
|
head = rhltable_lookup(&task_bps_ht, &bp->hw.target, task_bps_ht_params);
|
||||||
if (!head)
|
if (!head)
|
||||||
@ -214,16 +300,25 @@ max_bp_pinned_slots(struct perf_event *bp, enum bp_type_idx type)
|
|||||||
static void toggle_bp_task_slot(struct perf_event *bp, int cpu,
|
static void toggle_bp_task_slot(struct perf_event *bp, int cpu,
|
||||||
enum bp_type_idx type, int weight)
|
enum bp_type_idx type, int weight)
|
||||||
{
|
{
|
||||||
unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
|
atomic_t *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
|
||||||
int old_idx, new_idx;
|
int old_idx, new_idx;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If bp->hw.target, tsk_pinned is only modified, but not used
|
||||||
|
* otherwise. We can permit concurrent updates as long as there are no
|
||||||
|
* other uses: having acquired bp_cpuinfo_sem as a reader allows
|
||||||
|
* concurrent updates here. Uses of tsk_pinned will require acquiring
|
||||||
|
* bp_cpuinfo_sem as a writer to stabilize tsk_pinned's value.
|
||||||
|
*/
|
||||||
|
lockdep_assert_held_read(&bp_cpuinfo_sem);
|
||||||
|
|
||||||
old_idx = task_bp_pinned(cpu, bp, type) - 1;
|
old_idx = task_bp_pinned(cpu, bp, type) - 1;
|
||||||
new_idx = old_idx + weight;
|
new_idx = old_idx + weight;
|
||||||
|
|
||||||
if (old_idx >= 0)
|
if (old_idx >= 0)
|
||||||
tsk_pinned[old_idx]--;
|
atomic_dec(&tsk_pinned[old_idx]);
|
||||||
if (new_idx >= 0)
|
if (new_idx >= 0)
|
||||||
tsk_pinned[new_idx]++;
|
atomic_inc(&tsk_pinned[new_idx]);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -241,6 +336,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
|
|||||||
|
|
||||||
/* Pinned counter cpu profiling */
|
/* Pinned counter cpu profiling */
|
||||||
if (!bp->hw.target) {
|
if (!bp->hw.target) {
|
||||||
|
lockdep_assert_held_write(&bp_cpuinfo_sem);
|
||||||
get_bp_info(bp->cpu, type)->cpu_pinned += weight;
|
get_bp_info(bp->cpu, type)->cpu_pinned += weight;
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@ -249,6 +345,11 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
|
|||||||
for_each_cpu(cpu, cpumask)
|
for_each_cpu(cpu, cpumask)
|
||||||
toggle_bp_task_slot(bp, cpu, type, weight);
|
toggle_bp_task_slot(bp, cpu, type, weight);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Readers want a stable snapshot of the per-task breakpoint list.
|
||||||
|
*/
|
||||||
|
assert_bp_constraints_lock_held(bp);
|
||||||
|
|
||||||
if (enable)
|
if (enable)
|
||||||
return rhltable_insert(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params);
|
return rhltable_insert(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params);
|
||||||
else
|
else
|
||||||
@ -354,14 +455,10 @@ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type)
|
|||||||
|
|
||||||
int reserve_bp_slot(struct perf_event *bp)
|
int reserve_bp_slot(struct perf_event *bp)
|
||||||
{
|
{
|
||||||
int ret;
|
struct mutex *mtx = bp_constraints_lock(bp);
|
||||||
|
int ret = __reserve_bp_slot(bp, bp->attr.bp_type);
|
||||||
mutex_lock(&nr_bp_mutex);
|
|
||||||
|
|
||||||
ret = __reserve_bp_slot(bp, bp->attr.bp_type);
|
|
||||||
|
|
||||||
mutex_unlock(&nr_bp_mutex);
|
|
||||||
|
|
||||||
|
bp_constraints_unlock(mtx);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -379,12 +476,11 @@ static void __release_bp_slot(struct perf_event *bp, u64 bp_type)
|
|||||||
|
|
||||||
void release_bp_slot(struct perf_event *bp)
|
void release_bp_slot(struct perf_event *bp)
|
||||||
{
|
{
|
||||||
mutex_lock(&nr_bp_mutex);
|
struct mutex *mtx = bp_constraints_lock(bp);
|
||||||
|
|
||||||
arch_unregister_hw_breakpoint(bp);
|
arch_unregister_hw_breakpoint(bp);
|
||||||
__release_bp_slot(bp, bp->attr.bp_type);
|
__release_bp_slot(bp, bp->attr.bp_type);
|
||||||
|
bp_constraints_unlock(mtx);
|
||||||
mutex_unlock(&nr_bp_mutex);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static int __modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type)
|
static int __modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type)
|
||||||
@ -411,11 +507,10 @@ static int __modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type)
|
|||||||
|
|
||||||
static int modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type)
|
static int modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type)
|
||||||
{
|
{
|
||||||
int ret;
|
struct mutex *mtx = bp_constraints_lock(bp);
|
||||||
|
int ret = __modify_bp_slot(bp, old_type, new_type);
|
||||||
|
|
||||||
mutex_lock(&nr_bp_mutex);
|
bp_constraints_unlock(mtx);
|
||||||
ret = __modify_bp_slot(bp, old_type, new_type);
|
|
||||||
mutex_unlock(&nr_bp_mutex);
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -426,18 +521,28 @@ static int modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type)
|
|||||||
*/
|
*/
|
||||||
int dbg_reserve_bp_slot(struct perf_event *bp)
|
int dbg_reserve_bp_slot(struct perf_event *bp)
|
||||||
{
|
{
|
||||||
if (mutex_is_locked(&nr_bp_mutex))
|
int ret;
|
||||||
|
|
||||||
|
if (bp_constraints_is_locked(bp))
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
return __reserve_bp_slot(bp, bp->attr.bp_type);
|
/* Locks aren't held; disable lockdep assert checking. */
|
||||||
|
lockdep_off();
|
||||||
|
ret = __reserve_bp_slot(bp, bp->attr.bp_type);
|
||||||
|
lockdep_on();
|
||||||
|
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
int dbg_release_bp_slot(struct perf_event *bp)
|
int dbg_release_bp_slot(struct perf_event *bp)
|
||||||
{
|
{
|
||||||
if (mutex_is_locked(&nr_bp_mutex))
|
if (bp_constraints_is_locked(bp))
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
|
/* Locks aren't held; disable lockdep assert checking. */
|
||||||
|
lockdep_off();
|
||||||
__release_bp_slot(bp, bp->attr.bp_type);
|
__release_bp_slot(bp, bp->attr.bp_type);
|
||||||
|
lockdep_on();
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@ -663,7 +768,7 @@ bool hw_breakpoint_is_used(void)
|
|||||||
return true;
|
return true;
|
||||||
|
|
||||||
for (int slot = 0; slot < hw_breakpoint_slots_cached(type); ++slot) {
|
for (int slot = 0; slot < hw_breakpoint_slots_cached(type); ++slot) {
|
||||||
if (info->tsk_pinned[slot])
|
if (atomic_read(&info->tsk_pinned[slot]))
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user