mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-01-18 10:56:14 +00:00
6e2df0581f
Commit 67692435c411 ("sched: Rework pick_next_task() slow-path") inadvertly introduced a race because it changed a previously unexplored dependency between dropping the rq->lock and sched_class::put_prev_task(). The comments about dropping rq->lock, in for example newidle_balance(), only mentions the task being current and ->on_cpu being set. But when we look at the 'change' pattern (in for example sched_setnuma()): queued = task_on_rq_queued(p); /* p->on_rq == TASK_ON_RQ_QUEUED */ running = task_current(rq, p); /* rq->curr == p */ if (queued) dequeue_task(...); if (running) put_prev_task(...); /* change task properties */ if (queued) enqueue_task(...); if (running) set_next_task(...); It becomes obvious that if we do this after put_prev_task() has already been called on @p, things go sideways. This is exactly what the commit in question allows to happen when it does: prev->sched_class->put_prev_task(rq, prev, rf); if (!rq->nr_running) newidle_balance(rq, rf); The newidle_balance() call will drop rq->lock after we've called put_prev_task() and that allows the above 'change' pattern to interleave and mess up the state. Furthermore, it turns out we lost the RT-pull when we put the last DL task. Fix both problems by extracting the balancing from put_prev_task() and doing a multi-class balance() pass before put_prev_task(). Fixes: 67692435c411 ("sched: Rework pick_next_task() slow-path") Reported-by: Quentin Perret <qperret@google.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Tested-by: Quentin Perret <qperret@google.com> Tested-by: Valentin Schneider <valentin.schneider@arm.com>
148 lines
3.3 KiB
C
148 lines
3.3 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
/*
|
|
* stop-task scheduling class.
|
|
*
|
|
* The stop task is the highest priority task in the system, it preempts
|
|
* everything and will be preempted by nothing.
|
|
*
|
|
* See kernel/stop_machine.c
|
|
*/
|
|
#include "sched.h"
|
|
|
|
#ifdef CONFIG_SMP
|
|
static int
|
|
select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags)
|
|
{
|
|
return task_cpu(p); /* stop tasks as never migrate */
|
|
}
|
|
|
|
static int
|
|
balance_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
|
|
{
|
|
return sched_stop_runnable(rq);
|
|
}
|
|
#endif /* CONFIG_SMP */
|
|
|
|
static void
|
|
check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
|
|
{
|
|
/* we're never preempted */
|
|
}
|
|
|
|
static void set_next_task_stop(struct rq *rq, struct task_struct *stop)
|
|
{
|
|
stop->se.exec_start = rq_clock_task(rq);
|
|
}
|
|
|
|
static struct task_struct *
|
|
pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
|
|
{
|
|
WARN_ON_ONCE(prev || rf);
|
|
|
|
if (!sched_stop_runnable(rq))
|
|
return NULL;
|
|
|
|
set_next_task_stop(rq, rq->stop);
|
|
return rq->stop;
|
|
}
|
|
|
|
static void
|
|
enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
|
|
{
|
|
add_nr_running(rq, 1);
|
|
}
|
|
|
|
static void
|
|
dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
|
|
{
|
|
sub_nr_running(rq, 1);
|
|
}
|
|
|
|
static void yield_task_stop(struct rq *rq)
|
|
{
|
|
BUG(); /* the stop task should never yield, its pointless. */
|
|
}
|
|
|
|
static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
|
|
{
|
|
struct task_struct *curr = rq->curr;
|
|
u64 delta_exec;
|
|
|
|
delta_exec = rq_clock_task(rq) - curr->se.exec_start;
|
|
if (unlikely((s64)delta_exec < 0))
|
|
delta_exec = 0;
|
|
|
|
schedstat_set(curr->se.statistics.exec_max,
|
|
max(curr->se.statistics.exec_max, delta_exec));
|
|
|
|
curr->se.sum_exec_runtime += delta_exec;
|
|
account_group_exec_runtime(curr, delta_exec);
|
|
|
|
curr->se.exec_start = rq_clock_task(rq);
|
|
cgroup_account_cputime(curr, delta_exec);
|
|
}
|
|
|
|
/*
|
|
* scheduler tick hitting a task of our scheduling class.
|
|
*
|
|
* NOTE: This function can be called remotely by the tick offload that
|
|
* goes along full dynticks. Therefore no local assumption can be made
|
|
* and everything must be accessed through the @rq and @curr passed in
|
|
* parameters.
|
|
*/
|
|
static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
|
|
{
|
|
}
|
|
|
|
static void switched_to_stop(struct rq *rq, struct task_struct *p)
|
|
{
|
|
BUG(); /* its impossible to change to this class */
|
|
}
|
|
|
|
static void
|
|
prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio)
|
|
{
|
|
BUG(); /* how!?, what priority? */
|
|
}
|
|
|
|
static unsigned int
|
|
get_rr_interval_stop(struct rq *rq, struct task_struct *task)
|
|
{
|
|
return 0;
|
|
}
|
|
|
|
static void update_curr_stop(struct rq *rq)
|
|
{
|
|
}
|
|
|
|
/*
|
|
* Simple, special scheduling class for the per-CPU stop tasks:
|
|
*/
|
|
const struct sched_class stop_sched_class = {
|
|
.next = &dl_sched_class,
|
|
|
|
.enqueue_task = enqueue_task_stop,
|
|
.dequeue_task = dequeue_task_stop,
|
|
.yield_task = yield_task_stop,
|
|
|
|
.check_preempt_curr = check_preempt_curr_stop,
|
|
|
|
.pick_next_task = pick_next_task_stop,
|
|
.put_prev_task = put_prev_task_stop,
|
|
.set_next_task = set_next_task_stop,
|
|
|
|
#ifdef CONFIG_SMP
|
|
.balance = balance_stop,
|
|
.select_task_rq = select_task_rq_stop,
|
|
.set_cpus_allowed = set_cpus_allowed_common,
|
|
#endif
|
|
|
|
.task_tick = task_tick_stop,
|
|
|
|
.get_rr_interval = get_rr_interval_stop,
|
|
|
|
.prio_changed = prio_changed_stop,
|
|
.switched_to = switched_to_stop,
|
|
.update_curr = update_curr_stop,
|
|
};
|