Mirror of https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git (synced 2025-01-15 02:05:33 +00:00)
sched_ext: idle: Refresh idle masks during idle-to-idle transitions
With the consolidation of put_prev_task/set_next_task(), see commit
436f3eed5c69 ("sched: Combine the last put_prev_task() and the first
set_next_task()"), we are now skipping the transition between these two
functions when the previous and the next tasks are the same.

As a result, the scx idle state of a CPU is updated only when
transitioning to or from the idle thread. While this is generally
correct, it can lead to uneven and inefficient core utilization in
certain scenarios [1].

A typical scenario involves proactive wake-ups: scx_bpf_pick_idle_cpu()
selects and marks an idle CPU as busy, followed by a wake-up via
scx_bpf_kick_cpu(), without dispatching any tasks. In this case, the
CPU continues running the idle thread, returns to idle, but remains
marked as busy, preventing it from being selected again as an idle CPU
(until a task eventually runs on it and releases the CPU).

For example, running a workload that uses 20% of each CPU, combined
with an scx scheduler using proactive wake-ups, results in the
following core utilization:

 CPU 0: 25.7%
 CPU 1: 29.3%
 CPU 2: 26.5%
 CPU 3: 25.5%
 CPU 4:  0.0%
 CPU 5: 25.5%
 CPU 6:  0.0%
 CPU 7: 10.5%

To address this, refresh the idle state also in pick_task_idle(),
during idle-to-idle transitions, but only trigger ops.update_idle() on
actual state changes to prevent unnecessary updates to the scx
scheduler and maintain balanced state transitions.

With this change in place, the core utilization in the previous example
becomes the following:

 CPU 0: 18.8%
 CPU 1: 19.4%
 CPU 2: 18.0%
 CPU 3: 18.7%
 CPU 4: 19.3%
 CPU 5: 18.9%
 CPU 6: 18.7%
 CPU 7: 19.3%

[1] https://github.com/sched-ext/scx/pull/1139

Fixes: 7c65ae81ea86 ("sched_ext: Don't call put_prev_task_scx() before picking the next task")
Signed-off-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
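[Editor's note] To make the failure mode concrete, here is a minimal sketch of the proactive wake-up pattern the message describes, written in the style of an scx BPF scheduler built against the scx common headers. It is illustrative only, not part of this commit: the myscx_enqueue callback and the SHARED_DSQ dispatch queue are hypothetical names, while scx_bpf_dsq_insert(), scx_bpf_pick_idle_cpu(), and scx_bpf_kick_cpu() are the actual kfuncs involved.

/*
 * Illustrative sketch, not part of this commit: the proactive wake-up
 * pattern described above. myscx_enqueue and SHARED_DSQ are hypothetical.
 */
void BPF_STRUCT_OPS(myscx_enqueue, struct task_struct *p, u64 enq_flags)
{
	s32 cpu;

	/* Queue the task on a shared dispatch queue. */
	scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);

	/*
	 * "Reserve" an idle CPU: this clears its bit in the idle masks,
	 * marking it busy.
	 */
	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
	if (cpu >= 0)
		/*
		 * Kick the CPU without dispatching anything to it
		 * directly. If it finds no task to run, it returns to
		 * idle while still marked busy -- the stale state this
		 * commit now repairs in pick_task_idle().
		 */
		scx_bpf_kick_cpu(cpu, 0);
}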
This commit is contained in:
parent 68e449d849
commit a2a3374c47
@@ -3590,16 +3590,8 @@ static void reset_idle_masks(void)
 	cpumask_copy(idle_masks.smt, cpu_online_mask);
 }
 
-void __scx_update_idle(struct rq *rq, bool idle)
+static void update_builtin_idle(int cpu, bool idle)
 {
-	int cpu = cpu_of(rq);
-
-	if (SCX_HAS_OP(update_idle) && !scx_rq_bypassing(rq)) {
-		SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
-		if (!static_branch_unlikely(&scx_builtin_idle_enabled))
-			return;
-	}
-
 	if (idle)
 		cpumask_set_cpu(cpu, idle_masks.cpu);
 	else
@@ -3626,6 +3618,57 @@ void __scx_update_idle(struct rq *rq, bool idle)
 #endif
 }
 
+/*
+ * Update the idle state of a CPU to @idle.
+ *
+ * If @do_notify is true, ops.update_idle() is invoked to notify the scx
+ * scheduler of an actual idle state transition (idle to busy or vice
+ * versa). If @do_notify is false, only the idle state in the idle masks is
+ * refreshed without invoking ops.update_idle().
+ *
+ * This distinction is necessary, because an idle CPU can be "reserved" and
+ * awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as
+ * busy even if no tasks are dispatched. In this case, the CPU may return
+ * to idle without a true state transition. Refreshing the idle masks
+ * without invoking ops.update_idle() ensures accurate idle state tracking
+ * while avoiding unnecessary updates and maintaining balanced state
+ * transitions.
+ */
+void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
+{
+	int cpu = cpu_of(rq);
+
+	lockdep_assert_rq_held(rq);
+
+	/*
+	 * Trigger ops.update_idle() only when transitioning from a task to
+	 * the idle thread and vice versa.
+	 *
+	 * Idle transitions are indicated by do_notify being set to true,
+	 * managed by put_prev_task_idle()/set_next_task_idle().
+	 */
+	if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq))
+		SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
+
+	/*
+	 * Update the idle masks:
+	 * - for real idle transitions (do_notify == true)
+	 * - for idle-to-idle transitions (indicated by the previous task
+	 *   being the idle thread, managed by pick_task_idle())
+	 *
+	 * Skip updating idle masks if the previous task is not the idle
+	 * thread, since set_next_task_idle() has already handled it when
+	 * transitioning from a task to the idle thread (calling this
+	 * function with do_notify == true).
+	 *
+	 * In this way we can avoid updating the idle masks twice,
+	 * unnecessarily.
+	 */
+	if (static_branch_likely(&scx_builtin_idle_enabled))
+		if (do_notify || is_idle_task(rq->curr))
+			update_builtin_idle(cpu, idle);
+}
+
 static void handle_hotplug(struct rq *rq, bool online)
 {
 	int cpu = cpu_of(rq);
@@ -57,15 +57,15 @@ static inline void init_sched_ext_class(void) {}
 #endif /* CONFIG_SCHED_CLASS_EXT */
 
 #if defined(CONFIG_SCHED_CLASS_EXT) && defined(CONFIG_SMP)
-void __scx_update_idle(struct rq *rq, bool idle);
+void __scx_update_idle(struct rq *rq, bool idle, bool do_notify);
 
-static inline void scx_update_idle(struct rq *rq, bool idle)
+static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify)
 {
 	if (scx_enabled())
-		__scx_update_idle(rq, idle);
+		__scx_update_idle(rq, idle, do_notify);
 }
 #else
-static inline void scx_update_idle(struct rq *rq, bool idle) {}
+static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) {}
 #endif
 
 #ifdef CONFIG_CGROUP_SCHED
@@ -452,19 +452,20 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags)
 static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct task_struct *next)
 {
 	dl_server_update_idle_time(rq, prev);
-	scx_update_idle(rq, false);
+	scx_update_idle(rq, false, true);
 }
 
 static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first)
 {
 	update_idle_core(rq);
-	scx_update_idle(rq, true);
+	scx_update_idle(rq, true, true);
 	schedstat_inc(rq->sched_goidle);
 	next->se.exec_start = rq_clock_task(rq);
 }
 
 struct task_struct *pick_task_idle(struct rq *rq)
 {
+	scx_update_idle(rq, true, false);
 	return rq->idle;
 }
 
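[Editor's note] For reference, here is a minimal sketch of the consumer side: the ops.update_idle() callback whose invocations the do_notify flag gates. The callback shape (s32 cpu, bool idle) matches the sched_ext struct_ops interface; myscx_update_idle and the nr_idle_cpus global are hypothetical. Because idle-to-idle refreshes no longer invoke the callback, every idle == true notification stays paired with an idle == false one, so a simple counter like this remains balanced.

/* Hypothetical global: how many CPUs the scheduler believes are idle. */
static u64 nr_idle_cpus;

/*
 * Illustrative sketch, not part of this commit: an ops.update_idle()
 * implementation that depends on balanced idle/busy notifications.
 */
void BPF_STRUCT_OPS(myscx_update_idle, s32 cpu, bool idle)
{
	if (idle)
		__sync_fetch_and_add(&nr_idle_cpus, 1);
	else
		__sync_fetch_and_sub(&nr_idle_cpus, 1);
}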