diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 1ddbde64a31b..65bc0a489cd2 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -205,11 +205,13 @@ struct sched_ext_entity { void sched_ext_free(struct task_struct *p); void print_scx_info(const char *log_lvl, struct task_struct *p); +void scx_softlockup(u32 dur_s); #else /* !CONFIG_SCHED_CLASS_EXT */ static inline void sched_ext_free(struct task_struct *p) {} static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {} +static inline void scx_softlockup(u32 dur_s) {} #endif /* CONFIG_SCHED_CLASS_EXT */ #endif /* _LINUX_SCHED_EXT_H */ diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 2d41f1917464..02f39314ef8a 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -867,6 +867,7 @@ static DEFINE_MUTEX(scx_ops_enable_mutex); DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled); DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED); +static unsigned long scx_in_softlockup; static atomic_t scx_ops_breather_depth = ATOMIC_INIT(0); static int scx_ops_bypass_depth; static bool scx_ops_init_task_enabled; @@ -4614,6 +4615,49 @@ bool task_should_scx(struct task_struct *p) return p->policy == SCHED_EXT; } +/** + * scx_softlockup - sched_ext softlockup handler + * + * On some multi-socket setups (e.g. 2x Intel 8480c), the BPF scheduler can + * live-lock the system by making many CPUs target the same DSQ to the point + * where soft-lockup detection triggers. This function is called from the + * soft-lockup watchdog when the triggering point is close and tries to unjam + * the system by enabling the breather and aborting the BPF scheduler. 
+ */ +void scx_softlockup(u32 dur_s) +{ + switch (scx_ops_enable_state()) { + case SCX_OPS_ENABLING: + case SCX_OPS_ENABLED: + break; + default: + return; + } + + /* allow only one instance, cleared at the end of scx_ops_bypass() */ + if (test_and_set_bit(0, &scx_in_softlockup)) + return; + + printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU%d stuck for %us, disabling \"%s\"\n", + smp_processor_id(), dur_s, scx_ops.name); + + /* + * Some CPUs may be trapped in the dispatch paths. Enable breather + * immediately; otherwise, we might not even be able to get to + * scx_ops_bypass(). + */ + atomic_inc(&scx_ops_breather_depth); + + scx_ops_error("soft lockup - CPU#%d stuck for %us", + smp_processor_id(), dur_s); +} + +static void scx_clear_softlockup(void) +{ + if (test_and_clear_bit(0, &scx_in_softlockup)) + atomic_dec(&scx_ops_breather_depth); +} + /** * scx_ops_bypass - [Un]bypass scx_ops and guarantee forward progress * @@ -4724,6 +4768,7 @@ static void scx_ops_bypass(bool bypass) atomic_dec(&scx_ops_breather_depth); unlock: raw_spin_unlock_irqrestore(&bypass_lock, flags); + scx_clear_softlockup(); } static void free_exit_info(struct scx_exit_info *ei) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 262691ba62b7..5a93d4c446b8 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -644,6 +644,14 @@ static int is_softlockup(unsigned long touch_ts, need_counting_irqs()) start_counting_irqs(); + /* + * A poorly behaving BPF scheduler can live-lock the system into + * soft lockups. Tell sched_ext to try ejecting the BPF + * scheduler when close to a soft lockup. + */ + if (time_after_eq(now, period_ts + get_softlockup_thresh() * 3 / 4)) + scx_softlockup(now - touch_ts); + /* Warn about unreasonable delays. 
 */ if (time_after(now, period_ts + get_softlockup_thresh())) return now - touch_ts; diff --git a/tools/sched_ext/scx_show_state.py b/tools/sched_ext/scx_show_state.py index c4b3fdda9a0b..b800d4f5f2e9 100644 --- a/tools/sched_ext/scx_show_state.py +++ b/tools/sched_ext/scx_show_state.py @@ -35,6 +35,8 @@ print(f'enabled : {read_static_key("__scx_ops_enabled")}') print(f'switching_all : {read_static_key("__scx_switched_all")}') print(f'enable_state : {ops_state_str(enable_state)} ({enable_state})') +print(f'in_softlockup : {prog["scx_in_softlockup"].value_()}') +print(f'breather_depth: {read_atomic("scx_ops_breather_depth")}') print(f'bypass_depth : {prog["scx_ops_bypass_depth"].value_()}') print(f'nr_rejected : {read_atomic("scx_nr_rejected")}') print(f'enable_seq : {read_atomic("scx_enable_seq")}')