mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2024-12-29 01:03:32 +00:00
e32c260195
On 2 x Intel Sapphire Rapids machines with 224 logical CPUs, a poorly behaving BPF scheduler can live-lock the system by making multiple CPUs bang on the same DSQ to the point where soft-lockup detection triggers before SCX's own watchdog can take action. It also seems possible that the machine can be live-locked enough to prevent scx_ops_helper, which is an RT task, from running in a timely manner. Implement scx_softlockup(), which is called when three quarters of the soft-lockup threshold has passed. The function immediately enables the ops breather and triggers an ops error to initiate ejection of the BPF scheduler. This patch and the previous one combined enable the kernel to reliably recover the system from live-lock conditions that can be triggered by a poorly behaving BPF scheduler on Intel dual socket systems. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Douglas Anderson <dianders@chromium.org> Cc: Andrew Morton <akpm@linux-foundation.org>
43 lines
1.3 KiB
Python
43 lines
1.3 KiB
Python
#!/usr/bin/env drgn
|
|
#
|
|
# Copyright (C) 2024 Tejun Heo <tj@kernel.org>
|
|
# Copyright (C) 2024 Meta Platforms, Inc. and affiliates.
|
|
|
|
desc = """
|
|
This is a drgn script to show the current sched_ext state.
|
|
For more info on drgn, visit https://github.com/osandov/drgn.
|
|
"""
|
|
|
|
import drgn
|
|
import sys
|
|
|
|
def err(s):
    """Report a fatal error and terminate the script.

    Prints ``s`` to stderr, flushing immediately, and then exits with
    status 1. This function never returns to its caller.
    """
    print(s, flush=True, file=sys.stderr)
    # Raising SystemExit directly is what sys.exit() does under the hood.
    raise SystemExit(1)
|
|
|
|
def read_int(name):
    """Return the value of the object ``prog[name]`` converted to ``int``.

    NOTE(review): ``prog`` is not defined in this file; presumably it is
    the global program object that the drgn CLI provides to scripts.
    """
    var = prog[name]
    return int(var.value_())
|
|
|
|
def read_atomic(name):
    """Return the value of the ``counter`` member of ``prog[name]``.

    NOTE(review): ``prog`` is not defined in this file; presumably it is
    the global program object that the drgn CLI provides to scripts.
    """
    counter = prog[name].counter
    return counter.value_()
|
|
|
|
def read_static_key(name):
    """Return the value of ``prog[name].key.enabled.counter``.

    NOTE(review): ``prog`` is not defined in this file; presumably it is
    the global program object that the drgn CLI provides to scripts.
    """
    enabled = prog[name].key.enabled
    return enabled.counter.value_()
|
|
|
|
def ops_state_str(state):
    """Return the decoded string stored at index ``state`` of
    ``scx_ops_enable_state_str``.

    NOTE(review): ``prog`` is not defined in this file; presumably it is
    the global program object that the drgn CLI provides to scripts.
    """
    names = prog['scx_ops_enable_state_str']
    raw = names[state].string_()
    return raw.decode()
|
|
|
|
# Look up the objects needed by the report below.
ops = prog['scx_ops']
enable_state = read_atomic("scx_ops_enable_state_var")

# Each value is read immediately before it is printed, so a failing lookup
# still leaves the earlier lines of the report on stdout.
ops_name = ops.name.string_().decode()
print(f'ops : {ops_name}')

enabled = read_static_key("__scx_ops_enabled")
print(f'enabled : {enabled}')

switching_all = read_int("scx_switching_all")
print(f'switching_all : {switching_all}')

switched_all = read_static_key("__scx_switched_all")
print(f'switched_all : {switched_all}')

# Show both the symbolic state name and its raw numeric value.
enable_state_name = ops_state_str(enable_state)
print(f'enable_state : {enable_state_name} ({enable_state})')

in_softlockup = prog["scx_in_softlockup"].value_()
print(f'in_softlockup : {in_softlockup}')

breather_depth = read_atomic("scx_ops_breather_depth")
print(f'breather_depth: {breather_depth}')

bypass_depth = prog["scx_ops_bypass_depth"].value_()
print(f'bypass_depth : {bypass_depth}')

nr_rejected = read_atomic("scx_nr_rejected")
print(f'nr_rejected : {nr_rejected}')

enable_seq = read_atomic("scx_enable_seq")
print(f'enable_seq : {enable_seq}')
|