Merge branch 'sched/core' into tip/master

# New commits in sched/core:
    af98d8a36a96 ("sched/fair: Fix CPU bandwidth limit bypass during CPU hotplug")
    7675361ff9a1 ("sched: deadline: Cleanup goto label in pick_earliest_pushable_dl_task")
    7d5265ffcd8b ("rseq: Validate read-only fields under DEBUG_RSEQ config")
    2a77e4be12cb ("sched/fair: Untangle NEXT_BUDDY and pick_next_task()")
    95d9fed3a2ae ("sched/fair: Mark m*_vruntime() with __maybe_unused")
    0429489e0928 ("sched/fair: Fix variable declaration position")
    61b82dfb6b7e ("sched/fair: Do not try to migrate delayed dequeue task")
    736c55a02c47 ("sched/fair: Rename cfs_rq.nr_running into nr_queued")
    43eef7c3a4a6 ("sched/fair: Remove unused cfs_rq.idle_nr_running")
    31898e7b87dd ("sched/fair: Rename cfs_rq.idle_h_nr_running into h_nr_idle")
    9216582b0bfb ("sched/fair: Removed unsued cfs_rq.h_nr_delayed")
    1a49104496d3 ("sched/fair: Use the new cfs_rq.h_nr_runnable")
    c2a295bffeaf ("sched/fair: Add new cfs_rq.h_nr_runnable")
    7b8a702d9438 ("sched/fair: Rename h_nr_running into h_nr_queued")
    c907cd44a108 ("sched: Unify HK_TYPE_{TIMER|TICK|MISC} to HK_TYPE_KERNEL_NOISE")
    6010d245ddc9 ("sched/isolation: Consolidate housekeeping cpumasks that are always identical")
    1174b9344bc7 ("sched/isolation: Make "isolcpus=nohz" equivalent to "nohz_full"")
    ae5c677729e9 ("sched/core: Remove HK_TYPE_SCHED")
    a76328d44c7a ("sched/fair: Remove CONFIG_CFS_BANDWIDTH=n definition of cfs_bandwidth_used()")
    3a181f20fb4e ("sched/deadline: Consolidate Timer Cancellation")
    53916d5fd3c0 ("sched/deadline: Check bandwidth overflow earlier for hotplug")
    d4742f6ed7ea ("sched/deadline: Correctly account for allocated bandwidth during hotplug")
    41d4200b7103 ("sched/deadline: Restore dl_server bandwidth on non-destructive root domain changes")
    59297e2093ce ("sched: add READ_ONCE to task_on_rq_queued")
    108ad0999085 ("sched: Don't try to catch up excess steal time.")

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Ingo Molnar 2024-12-19 20:24:25 +01:00
commit c779bc69c8
13 changed files with 403 additions and 256 deletions


@ -2432,7 +2432,9 @@
specified in the flag list (default: domain):
nohz
Disable the tick when a single task runs.
Disable the tick when a single task runs as well as
disabling other kernel noises like having RCU callbacks
offloaded. This is equivalent to the nohz_full parameter.
A residual 1Hz tick is offloaded to workqueues, which you
need to affine to housekeeping through the global


@ -1374,6 +1374,15 @@ struct task_struct {
* with respect to preemption.
*/
unsigned long rseq_event_mask;
# ifdef CONFIG_DEBUG_RSEQ
/*
* This is a place holder to save a copy of the rseq fields for
* validation of read-only fields. The struct rseq has a
* variable-length array at the end, so it cannot be used
* directly. Reserve a size large enough for the known fields.
*/
char rseq_fields[sizeof(struct rseq)];
# endif
#endif
#ifdef CONFIG_SCHED_MM_CID


@ -7,16 +7,21 @@
#include <linux/tick.h>
enum hk_type {
HK_TYPE_TIMER,
HK_TYPE_RCU,
HK_TYPE_MISC,
HK_TYPE_SCHED,
HK_TYPE_TICK,
HK_TYPE_DOMAIN,
HK_TYPE_WQ,
HK_TYPE_MANAGED_IRQ,
HK_TYPE_KTHREAD,
HK_TYPE_MAX
HK_TYPE_KERNEL_NOISE,
HK_TYPE_MAX,
/*
* The following housekeeping types are only set by the nohz_full
* boot commandline option. So they can share the same value.
*/
HK_TYPE_TICK = HK_TYPE_KERNEL_NOISE,
HK_TYPE_TIMER = HK_TYPE_KERNEL_NOISE,
HK_TYPE_RCU = HK_TYPE_KERNEL_NOISE,
HK_TYPE_MISC = HK_TYPE_KERNEL_NOISE,
HK_TYPE_WQ = HK_TYPE_KERNEL_NOISE,
HK_TYPE_KTHREAD = HK_TYPE_KERNEL_NOISE
};
#ifdef CONFIG_CPU_ISOLATION
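
The comment above notes that these housekeeping types are only ever set together by nohz_full, which is why they can collapse onto one enumerator without growing the per-type cpumask array. A small standalone sketch of the pattern (illustrative names, not kernel code) shows that the aliases compare equal and that HK_TYPE_MAX-style sizing is unaffected:

#include <stdio.h>

enum hk_type_demo {
	DEMO_DOMAIN,
	DEMO_MANAGED_IRQ,
	DEMO_KERNEL_NOISE,
	DEMO_MAX,
	/* Aliases share one slot: they are only ever set together (nohz_full). */
	DEMO_TICK  = DEMO_KERNEL_NOISE,
	DEMO_TIMER = DEMO_KERNEL_NOISE,
	DEMO_RCU   = DEMO_KERNEL_NOISE,
};

int main(void)
{
	printf("DEMO_TICK == DEMO_KERNEL_NOISE: %d\n", DEMO_TICK == DEMO_KERNEL_NOISE);
	printf("DEMO_MAX = %d, so the aliases do not enlarge per-type arrays\n", DEMO_MAX);
	return 0;
}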


@ -13,6 +13,7 @@
#include <linux/syscalls.h>
#include <linux/rseq.h>
#include <linux/types.h>
#include <linux/ratelimit.h>
#include <asm/ptrace.h>
#define CREATE_TRACE_POINTS
@ -25,6 +26,78 @@
RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \
RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE)
#ifdef CONFIG_DEBUG_RSEQ
static struct rseq *rseq_kernel_fields(struct task_struct *t)
{
return (struct rseq *) t->rseq_fields;
}
static int rseq_validate_ro_fields(struct task_struct *t)
{
static DEFINE_RATELIMIT_STATE(_rs,
DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
u32 cpu_id_start, cpu_id, node_id, mm_cid;
struct rseq __user *rseq = t->rseq;
/*
* Validate fields which are required to be read-only by
* user-space.
*/
if (!user_read_access_begin(rseq, t->rseq_len))
goto efault;
unsafe_get_user(cpu_id_start, &rseq->cpu_id_start, efault_end);
unsafe_get_user(cpu_id, &rseq->cpu_id, efault_end);
unsafe_get_user(node_id, &rseq->node_id, efault_end);
unsafe_get_user(mm_cid, &rseq->mm_cid, efault_end);
user_read_access_end();
if ((cpu_id_start != rseq_kernel_fields(t)->cpu_id_start ||
cpu_id != rseq_kernel_fields(t)->cpu_id ||
node_id != rseq_kernel_fields(t)->node_id ||
mm_cid != rseq_kernel_fields(t)->mm_cid) && __ratelimit(&_rs)) {
pr_warn("Detected rseq corruption for pid: %d, name: %s\n"
"\tcpu_id_start: %u ?= %u\n"
"\tcpu_id: %u ?= %u\n"
"\tnode_id: %u ?= %u\n"
"\tmm_cid: %u ?= %u\n",
t->pid, t->comm,
cpu_id_start, rseq_kernel_fields(t)->cpu_id_start,
cpu_id, rseq_kernel_fields(t)->cpu_id,
node_id, rseq_kernel_fields(t)->node_id,
mm_cid, rseq_kernel_fields(t)->mm_cid);
}
/* For now, only print a console warning on mismatch. */
return 0;
efault_end:
user_read_access_end();
efault:
return -EFAULT;
}
static void rseq_set_ro_fields(struct task_struct *t, u32 cpu_id_start, u32 cpu_id,
u32 node_id, u32 mm_cid)
{
rseq_kernel_fields(t)->cpu_id_start = cpu_id;
rseq_kernel_fields(t)->cpu_id = cpu_id;
rseq_kernel_fields(t)->node_id = node_id;
rseq_kernel_fields(t)->mm_cid = mm_cid;
}
#else
static int rseq_validate_ro_fields(struct task_struct *t)
{
return 0;
}
static void rseq_set_ro_fields(struct task_struct *t, u32 cpu_id_start, u32 cpu_id,
u32 node_id, u32 mm_cid)
{
}
#endif
/*
*
* Restartable sequences are a lightweight interface that allows
@ -92,6 +165,11 @@ static int rseq_update_cpu_node_id(struct task_struct *t)
u32 node_id = cpu_to_node(cpu_id);
u32 mm_cid = task_mm_cid(t);
/*
* Validate read-only rseq fields.
*/
if (rseq_validate_ro_fields(t))
goto efault;
WARN_ON_ONCE((int) mm_cid < 0);
if (!user_write_access_begin(rseq, t->rseq_len))
goto efault;
@ -105,6 +183,7 @@ static int rseq_update_cpu_node_id(struct task_struct *t)
* t->rseq_len != ORIG_RSEQ_SIZE.
*/
user_write_access_end();
rseq_set_ro_fields(t, cpu_id, cpu_id, node_id, mm_cid);
trace_rseq_update(t);
return 0;
@ -119,6 +198,11 @@ static int rseq_reset_rseq_cpu_node_id(struct task_struct *t)
u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0,
mm_cid = 0;
/*
* Validate read-only rseq fields.
*/
if (rseq_validate_ro_fields(t))
return -EFAULT;
/*
* Reset cpu_id_start to its initial state (0).
*/
@ -141,6 +225,9 @@ static int rseq_reset_rseq_cpu_node_id(struct task_struct *t)
*/
if (put_user(mm_cid, &t->rseq->mm_cid))
return -EFAULT;
rseq_set_ro_fields(t, cpu_id_start, cpu_id, node_id, mm_cid);
/*
* Additional feature fields added after ORIG_RSEQ_SIZE
* need to be conditionally reset only if
@ -423,6 +510,17 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
current->rseq = rseq;
current->rseq_len = rseq_len;
current->rseq_sig = sig;
#ifdef CONFIG_DEBUG_RSEQ
/*
* Initialize the in-kernel rseq fields copy for validation of
* read-only fields.
*/
if (get_user(rseq_kernel_fields(current)->cpu_id_start, &rseq->cpu_id_start) ||
get_user(rseq_kernel_fields(current)->cpu_id, &rseq->cpu_id) ||
get_user(rseq_kernel_fields(current)->node_id, &rseq->node_id) ||
get_user(rseq_kernel_fields(current)->mm_cid, &rseq->mm_cid))
return -EFAULT;
#endif
/*
* If rseq was previously inactive, and has just been
* registered, ensure the cpu_id_start and cpu_id fields
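
For context on what the validation above protects: cpu_id_start, cpu_id, node_id and mm_cid are fields userspace must treat as read-only, and the kernel now keeps a shadow copy to detect corruption when CONFIG_DEBUG_RSEQ is set. Below is a minimal userspace sketch that reads two of these fields; it assumes glibc 2.35+ (which exports __rseq_offset and __rseq_size in <sys/rseq.h>) and a compiler providing __builtin_thread_pointer().

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/rseq.h>	/* struct rseq, __rseq_offset, __rseq_size (glibc >= 2.35) */

int main(void)
{
	if (!__rseq_size) {
		puts("rseq not registered by libc");
		return 1;
	}

	/* The registered struct rseq lives at a fixed offset from the thread pointer. */
	struct rseq *rs = (struct rseq *)((char *)__builtin_thread_pointer() + __rseq_offset);

	/* These fields are written only by the kernel; with CONFIG_DEBUG_RSEQ the
	 * kernel now warns (rate-limited) if userspace corrupts them. */
	printf("cpu_id_start=%u cpu_id=%u\n", rs->cpu_id_start, rs->cpu_id);
	return 0;
}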


@ -766,13 +766,15 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
if (static_key_false((&paravirt_steal_rq_enabled))) {
steal = paravirt_steal_clock(cpu_of(rq));
u64 prev_steal;
steal = prev_steal = paravirt_steal_clock(cpu_of(rq));
steal -= rq->prev_steal_time_rq;
if (unlikely(steal > delta))
steal = delta;
rq->prev_steal_time_rq += steal;
rq->prev_steal_time_rq = prev_steal;
delta -= steal;
}
#endif
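
To see what the prev_steal_time_rq change buys, here is a small worked example in plain userspace C with illustrative numbers: when the hypervisor reports more new steal than the rq clock advanced, the old += kept the clamped remainder pending so it would be caught up against later deltas, whereas recording the full reported value discards the excess, matching the "Don't try to catch up excess steal time" commit above.

#include <stdio.h>

int main(void)
{
	unsigned long long prev_steal_time_rq = 100;	/* steal accounted so far       */
	unsigned long long reported_steal     = 110;	/* paravirt_steal_clock() total */
	unsigned long long delta              = 3;	/* rq clock advance             */

	unsigned long long steal = reported_steal - prev_steal_time_rq;	/* 10 */
	if (steal > delta)
		steal = delta;						/* clamped to 3 */

	/* Old: prev_steal_time_rq += steal -> 103, leaving 7 units of excess to be
	 * subtracted from future deltas.  New: jump to the reported total instead. */
	prev_steal_time_rq = reported_steal;

	printf("steal charged now: %llu, excess discarded: %llu\n",
	       steal, (reported_steal - 100ULL) - steal);
	return 0;
}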
@ -1168,13 +1170,13 @@ int get_nohz_timer_target(void)
struct sched_domain *sd;
const struct cpumask *hk_mask;
if (housekeeping_cpu(cpu, HK_TYPE_TIMER)) {
if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE)) {
if (!idle_cpu(cpu))
return cpu;
default_cpu = cpu;
}
hk_mask = housekeeping_cpumask(HK_TYPE_TIMER);
hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE);
guard(rcu)();
@ -1189,7 +1191,7 @@ int get_nohz_timer_target(void)
}
if (default_cpu == -1)
default_cpu = housekeeping_any_cpu(HK_TYPE_TIMER);
default_cpu = housekeeping_any_cpu(HK_TYPE_KERNEL_NOISE);
return default_cpu;
}
@ -1341,7 +1343,7 @@ bool sched_can_stop_tick(struct rq *rq)
if (scx_enabled() && !scx_can_stop_tick(rq))
return false;
if (rq->cfs.h_nr_running > 1)
if (rq->cfs.h_nr_queued > 1)
return false;
/*
@ -5632,7 +5634,7 @@ void sched_tick(void)
unsigned long hw_pressure;
u64 resched_latency;
if (housekeeping_cpu(cpu, HK_TYPE_TICK))
if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE))
arch_scale_freq_tick();
sched_clock_tick();
@ -5771,7 +5773,7 @@ static void sched_tick_start(int cpu)
int os;
struct tick_work *twork;
if (housekeeping_cpu(cpu, HK_TYPE_TICK))
if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE))
return;
WARN_ON_ONCE(!tick_work_cpu);
@ -5792,7 +5794,7 @@ static void sched_tick_stop(int cpu)
struct tick_work *twork;
int os;
if (housekeeping_cpu(cpu, HK_TYPE_TICK))
if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE))
return;
WARN_ON_ONCE(!tick_work_cpu);
@ -6018,7 +6020,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* opportunity to pull in more work from other CPUs.
*/
if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) &&
rq->nr_running == rq->cfs.h_nr_running)) {
rq->nr_running == rq->cfs.h_nr_queued)) {
p = pick_next_task_fair(rq, prev, rf);
if (unlikely(p == RETRY_TASK))
@ -8180,19 +8182,14 @@ static void cpuset_cpu_active(void)
cpuset_update_active_cpus();
}
static int cpuset_cpu_inactive(unsigned int cpu)
static void cpuset_cpu_inactive(unsigned int cpu)
{
if (!cpuhp_tasks_frozen) {
int ret = dl_bw_check_overflow(cpu);
if (ret)
return ret;
cpuset_update_active_cpus();
} else {
num_cpus_frozen++;
partition_sched_domains(1, NULL, NULL);
}
return 0;
}
static inline void sched_smt_present_inc(int cpu)
@ -8254,6 +8251,11 @@ int sched_cpu_deactivate(unsigned int cpu)
struct rq *rq = cpu_rq(cpu);
int ret;
ret = dl_bw_deactivate(cpu);
if (ret)
return ret;
/*
* Remove CPU from nohz.idle_cpus_mask to prevent participating in
* load balancing when not active
@ -8299,15 +8301,7 @@ int sched_cpu_deactivate(unsigned int cpu)
return 0;
sched_update_numa(cpu, false);
ret = cpuset_cpu_inactive(cpu);
if (ret) {
sched_smt_present_inc(cpu);
sched_set_rq_online(rq, cpu);
balance_push_set(cpu, false);
set_cpu_active(cpu, true);
sched_update_numa(cpu, true);
return ret;
}
cpuset_cpu_inactive(cpu);
sched_domains_numa_masks_clear(cpu);
return 0;
}


@ -342,6 +342,29 @@ static void dl_rq_change_utilization(struct rq *rq, struct sched_dl_entity *dl_s
__add_rq_bw(new_bw, &rq->dl);
}
static __always_inline
void cancel_dl_timer(struct sched_dl_entity *dl_se, struct hrtimer *timer)
{
/*
* If the timer callback was running (hrtimer_try_to_cancel == -1),
* it will eventually call put_task_struct().
*/
if (hrtimer_try_to_cancel(timer) == 1 && !dl_server(dl_se))
put_task_struct(dl_task_of(dl_se));
}
static __always_inline
void cancel_replenish_timer(struct sched_dl_entity *dl_se)
{
cancel_dl_timer(dl_se, &dl_se->dl_timer);
}
static __always_inline
void cancel_inactive_timer(struct sched_dl_entity *dl_se)
{
cancel_dl_timer(dl_se, &dl_se->inactive_timer);
}
static void dl_change_utilization(struct task_struct *p, u64 new_bw)
{
WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV);
@ -495,10 +518,7 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags)
* will not touch the rq's active utilization,
* so we are still safe.
*/
if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) {
if (!dl_server(dl_se))
put_task_struct(dl_task_of(dl_se));
}
cancel_inactive_timer(dl_se);
} else {
/*
* Since "dl_non_contending" is not set, the
@ -2115,13 +2135,8 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
* The replenish timer needs to be canceled. No
* problem if it fires concurrently: boosted threads
* are ignored in dl_task_timer().
*
* If the timer callback was running (hrtimer_try_to_cancel == -1),
* it will eventually call put_task_struct().
*/
if (hrtimer_try_to_cancel(&p->dl.dl_timer) == 1 &&
!dl_server(&p->dl))
put_task_struct(p);
cancel_replenish_timer(&p->dl);
p->dl.dl_throttled = 0;
}
} else if (!dl_prio(p->normal_prio)) {
@ -2289,8 +2304,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused
* will not touch the rq's active utilization,
* so we are still safe.
*/
if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
put_task_struct(p);
cancel_inactive_timer(&p->dl);
}
sub_rq_bw(&p->dl, &rq->dl);
rq_unlock(rq, &rf);
@ -2506,16 +2520,13 @@ static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu
return NULL;
next_node = rb_first_cached(&rq->dl.pushable_dl_tasks_root);
next_node:
if (next_node) {
while (next_node) {
p = __node_2_pdl(next_node);
if (task_is_pushable(rq, p, cpu))
return p;
next_node = rb_next(next_node);
goto next_node;
}
return NULL;
@ -2964,11 +2975,22 @@ void dl_add_task_root_domain(struct task_struct *p)
void dl_clear_root_domain(struct root_domain *rd)
{
unsigned long flags;
int i;
raw_spin_lock_irqsave(&rd->dl_bw.lock, flags);
guard(raw_spinlock_irqsave)(&rd->dl_bw.lock);
rd->dl_bw.total_bw = 0;
raw_spin_unlock_irqrestore(&rd->dl_bw.lock, flags);
/*
* dl_server bandwidth is only restored when CPUs are attached to root
* domains (after domains are created or CPUs moved back to the
* default root domain).
*/
for_each_cpu(i, rd->span) {
struct sched_dl_entity *dl_se = &cpu_rq(i)->fair_server;
if (dl_server(dl_se) && cpu_active(i))
rd->dl_bw.total_bw += dl_se->dl_bw;
}
}
#endif /* CONFIG_SMP */
@ -3029,8 +3051,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
*/
static void switched_to_dl(struct rq *rq, struct task_struct *p)
{
if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
put_task_struct(p);
cancel_inactive_timer(&p->dl);
/*
* In case a task is setscheduled to SCHED_DEADLINE we need to keep
@ -3453,29 +3474,31 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
}
enum dl_bw_request {
dl_bw_req_check_overflow = 0,
dl_bw_req_deactivate = 0,
dl_bw_req_alloc,
dl_bw_req_free
};
static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw)
{
unsigned long flags;
unsigned long flags, cap;
struct dl_bw *dl_b;
bool overflow = 0;
u64 fair_server_bw = 0;
rcu_read_lock_sched();
dl_b = dl_bw_of(cpu);
raw_spin_lock_irqsave(&dl_b->lock, flags);
if (req == dl_bw_req_free) {
cap = dl_bw_capacity(cpu);
switch (req) {
case dl_bw_req_free:
__dl_sub(dl_b, dl_bw, dl_bw_cpus(cpu));
} else {
unsigned long cap = dl_bw_capacity(cpu);
break;
case dl_bw_req_alloc:
overflow = __dl_overflow(dl_b, cap, 0, dl_bw);
if (req == dl_bw_req_alloc && !overflow) {
if (!overflow) {
/*
* We reserve space in the destination
* root_domain, as we can't fail after this point.
@ -3484,6 +3507,42 @@ static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw)
*/
__dl_add(dl_b, dl_bw, dl_bw_cpus(cpu));
}
break;
case dl_bw_req_deactivate:
/*
* cpu is not off yet, but we need to do the math by
* considering it off already (i.e., what would happen if we
* turn cpu off?).
*/
cap -= arch_scale_cpu_capacity(cpu);
/*
* cpu is going offline and NORMAL tasks will be moved away
* from it. We can thus discount dl_server bandwidth
* contribution as it won't need to be servicing tasks after
* the cpu is off.
*/
if (cpu_rq(cpu)->fair_server.dl_server)
fair_server_bw = cpu_rq(cpu)->fair_server.dl_bw;
/*
* Not much to check if no DEADLINE bandwidth is present.
* dl_servers we can discount, as tasks will be moved out the
* offlined CPUs anyway.
*/
if (dl_b->total_bw - fair_server_bw > 0) {
/*
* Leaving at least one CPU for DEADLINE tasks seems a
* wise thing to do. As said above, cpu is not offline
* yet, so account for that.
*/
if (dl_bw_cpus(cpu) - 1)
overflow = __dl_overflow(dl_b, cap, fair_server_bw, 0);
else
overflow = 1;
}
break;
}
raw_spin_unlock_irqrestore(&dl_b->lock, flags);
@ -3492,9 +3551,9 @@ static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw)
return overflow ? -EBUSY : 0;
}
int dl_bw_check_overflow(int cpu)
int dl_bw_deactivate(int cpu)
{
return dl_bw_manage(dl_bw_req_check_overflow, cpu, 0);
return dl_bw_manage(dl_bw_req_deactivate, cpu, 0);
}
int dl_bw_alloc(int cpu, u64 dl_bw)
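
The dl_bw_req_deactivate case above can be modelled numerically. The sketch below is a standalone approximation, not kernel code, and it assumes (an assumption, not quoted from the patch) that __dl_overflow() rejects a request when cap_scale(dl_b->bw, cap) < dl_b->total_bw - old_bw + new_bw, with cap_scale(bw, cap) = bw * cap >> SCHED_CAPACITY_SHIFT; the outgoing CPU's capacity is dropped and its fair_server bandwidth discounted, as in the case body:

#include <stdio.h>
#include <stdint.h>

#define SCHED_CAPACITY_SHIFT	10
#define BW_UNIT			(1ULL << 20)	/* one CPU's worth of DL bandwidth */

static uint64_t cap_scale(uint64_t bw, uint64_t cap)
{
	return (bw * cap) >> SCHED_CAPACITY_SHIFT;
}

int main(void)
{
	uint64_t bw_limit       = BW_UNIT * 95 / 100;	/* admission limit per unit of capacity */
	uint64_t total_bw       = BW_UNIT * 3;		/* 3.0 CPUs' worth already allocated    */
	uint64_t cap            = 4 * 1024;		/* 4 CPUs at full capacity              */
	uint64_t fair_server_bw = BW_UNIT * 5 / 100;	/* dl_server of the CPU going offline   */

	/* "Do the math by considering it off already": drop its capacity and
	 * discount its fair server from the bandwidth that must still fit. */
	cap -= 1024;
	int overflow = cap_scale(bw_limit, cap) < total_bw - fair_server_bw;

	printf("admissible after offline: %llu, still needed: %llu -> %s\n",
	       (unsigned long long)cap_scale(bw_limit, cap),
	       (unsigned long long)(total_bw - fair_server_bw),
	       overflow ? "overflow, veto hotplug (-EBUSY)" : "ok, allow offline");
	return 0;
}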


@ -379,7 +379,7 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu
return -EINVAL;
}
if (rq->cfs.h_nr_running) {
if (rq->cfs.h_nr_queued) {
update_rq_clock(rq);
dl_server_stop(&rq->fair_server);
}
@ -392,7 +392,7 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu
printk_deferred("Fair server disabled in CPU %d, system may crash due to starvation.\n",
cpu_of(rq));
if (rq->cfs.h_nr_running)
if (rq->cfs.h_nr_queued)
dl_server_start(&rq->fair_server);
}
@ -843,13 +843,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SPLIT_NS(right_vruntime));
spread = right_vruntime - left_vruntime;
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread));
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running);
SEQ_printf(m, " .%-30s: %d\n", "h_nr_delayed", cfs_rq->h_nr_delayed);
SEQ_printf(m, " .%-30s: %d\n", "idle_nr_running",
cfs_rq->idle_nr_running);
SEQ_printf(m, " .%-30s: %d\n", "idle_h_nr_running",
cfs_rq->idle_h_nr_running);
SEQ_printf(m, " .%-30s: %d\n", "nr_queued", cfs_rq->nr_queued);
SEQ_printf(m, " .%-30s: %d\n", "h_nr_runnable", cfs_rq->h_nr_runnable);
SEQ_printf(m, " .%-30s: %d\n", "h_nr_queued", cfs_rq->h_nr_queued);
SEQ_printf(m, " .%-30s: %d\n", "h_nr_idle", cfs_rq->h_nr_idle);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
#ifdef CONFIG_SMP
SEQ_printf(m, " .%-30s: %lu\n", "load_avg",


@ -523,7 +523,7 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
* Scheduling class tree data structure manipulation methods:
*/
static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
static inline __maybe_unused u64 max_vruntime(u64 max_vruntime, u64 vruntime)
{
s64 delta = (s64)(vruntime - max_vruntime);
if (delta > 0)
@ -532,7 +532,7 @@ static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
return max_vruntime;
}
static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
static inline __maybe_unused u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
s64 delta = (s64)(vruntime - min_vruntime);
if (delta < 0)
@ -915,7 +915,7 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
* We can safely skip eligibility check if there is only one entity
* in this cfs_rq, saving some cycles.
*/
if (cfs_rq->nr_running == 1)
if (cfs_rq->nr_queued == 1)
return curr && curr->on_rq ? curr : se;
if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
@ -1250,7 +1250,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
account_cfs_rq_runtime(cfs_rq, delta_exec);
if (cfs_rq->nr_running == 1)
if (cfs_rq->nr_queued == 1)
return;
if (resched || did_preempt_short(cfs_rq, curr)) {
@ -2131,7 +2131,7 @@ static void update_numa_stats(struct task_numa_env *env,
ns->load += cpu_load(rq);
ns->runnable += cpu_runnable(rq);
ns->util += cpu_util_cfs(cpu);
ns->nr_running += rq->cfs.h_nr_running;
ns->nr_running += rq->cfs.h_nr_runnable;
ns->compute_capacity += capacity_of(cpu);
if (find_idle && idle_core < 0 && !rq->nr_running && idle_cpu(cpu)) {
@ -3682,9 +3682,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
list_add(&se->group_node, &rq->cfs_tasks);
}
#endif
cfs_rq->nr_running++;
if (se_is_idle(se))
cfs_rq->idle_nr_running++;
cfs_rq->nr_queued++;
}
static void
@ -3697,9 +3695,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
list_del_init(&se->group_node);
}
#endif
cfs_rq->nr_running--;
if (se_is_idle(se))
cfs_rq->idle_nr_running--;
cfs_rq->nr_queued--;
}
/*
@ -5233,7 +5229,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
{
return !cfs_rq->nr_running;
return !cfs_rq->nr_queued;
}
#define UPDATE_TG 0x0
@ -5289,7 +5285,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
*
* EEVDF: placement strategy #1 / #2
*/
if (sched_feat(PLACE_LAG) && cfs_rq->nr_running && se->vlag) {
if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) {
struct sched_entity *curr = cfs_rq->curr;
unsigned long load;
@ -5382,8 +5378,6 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
static inline bool cfs_bandwidth_used(void);
static void
requeue_delayed_entity(struct sched_entity *se);
@ -5405,7 +5399,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* When enqueuing a sched_entity, we must:
* - Update loads to have both entity and cfs_rq synced with now.
* - For group_entity, update its runnable_weight to reflect the new
* h_nr_running of its group cfs_rq.
* h_nr_runnable of its group cfs_rq.
* - For group_entity, update its weight to reflect the new share of
* its group cfs_rq
* - Add its new weight to cfs_rq->load.weight
@ -5438,7 +5432,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
if (cfs_rq->nr_running == 1) {
if (cfs_rq->nr_queued == 1) {
check_enqueue_throttle(cfs_rq);
if (!throttled_hierarchy(cfs_rq)) {
list_add_leaf_cfs_rq(cfs_rq);
@ -5480,7 +5474,7 @@ static void set_delayed(struct sched_entity *se)
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_delayed++;
cfs_rq->h_nr_runnable--;
if (cfs_rq_throttled(cfs_rq))
break;
}
@ -5492,7 +5486,7 @@ static void clear_delayed(struct sched_entity *se)
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_delayed--;
cfs_rq->h_nr_runnable++;
if (cfs_rq_throttled(cfs_rq))
break;
}
@ -5509,6 +5503,7 @@ static bool
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
bool sleep = flags & DEQUEUE_SLEEP;
int action = UPDATE_TG;
update_curr(cfs_rq);
clear_buddies(cfs_rq, se);
@ -5534,7 +5529,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
}
}
int action = UPDATE_TG;
if (entity_is_task(se) && task_on_rq_migrating(task_of(se)))
action |= DO_DETACH;
@ -5542,7 +5536,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* When dequeuing a sched_entity, we must:
* - Update loads to have both entity and cfs_rq synced with now.
* - For group_entity, update its runnable_weight to reflect the new
* h_nr_running of its group cfs_rq.
* h_nr_runnable of its group cfs_rq.
* - Subtract its previous weight from cfs_rq->load.weight.
* - For group entity, update its weight to reflect the new share
* of its group cfs_rq.
@ -5580,7 +5574,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (flags & DEQUEUE_DELAYED)
finish_delayed_dequeue_entity(se);
if (cfs_rq->nr_running == 0)
if (cfs_rq->nr_queued == 0)
update_idle_cfs_rq_clock_pelt(cfs_rq);
return true;
@ -5642,17 +5636,19 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags);
static struct sched_entity *
pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
{
struct sched_entity *se;
/*
* Enabling NEXT_BUDDY will affect latency but not fairness.
* Picking the ->next buddy will affect latency but not fairness.
*/
if (sched_feat(NEXT_BUDDY) &&
if (sched_feat(PICK_BUDDY) &&
cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) {
/* ->next will never be delayed */
SCHED_WARN_ON(cfs_rq->next->sched_delayed);
return cfs_rq->next;
}
struct sched_entity *se = pick_eevdf(cfs_rq);
se = pick_eevdf(cfs_rq);
if (se->sched_delayed) {
dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
/*
@ -5928,7 +5924,7 @@ static int tg_throttle_down(struct task_group *tg, void *data)
list_del_leaf_cfs_rq(cfs_rq);
SCHED_WARN_ON(cfs_rq->throttled_clock_self);
if (cfs_rq->nr_running)
if (cfs_rq->nr_queued)
cfs_rq->throttled_clock_self = rq_clock(rq);
}
cfs_rq->throttle_count++;
@ -5941,8 +5937,8 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
long task_delta, idle_task_delta, delayed_delta, dequeue = 1;
long rq_h_nr_running = rq->cfs.h_nr_running;
long queued_delta, runnable_delta, idle_delta, dequeue = 1;
long rq_h_nr_queued = rq->cfs.h_nr_queued;
raw_spin_lock(&cfs_b->lock);
/* This will start the period timer if necessary */
@ -5972,9 +5968,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
rcu_read_unlock();
task_delta = cfs_rq->h_nr_running;
idle_task_delta = cfs_rq->idle_h_nr_running;
delayed_delta = cfs_rq->h_nr_delayed;
queued_delta = cfs_rq->h_nr_queued;
runnable_delta = cfs_rq->h_nr_runnable;
idle_delta = cfs_rq->h_nr_idle;
for_each_sched_entity(se) {
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
int flags;
@ -5994,11 +5990,11 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
dequeue_entity(qcfs_rq, se, flags);
if (cfs_rq_is_idle(group_cfs_rq(se)))
idle_task_delta = cfs_rq->h_nr_running;
idle_delta = cfs_rq->h_nr_queued;
qcfs_rq->h_nr_running -= task_delta;
qcfs_rq->idle_h_nr_running -= idle_task_delta;
qcfs_rq->h_nr_delayed -= delayed_delta;
qcfs_rq->h_nr_queued -= queued_delta;
qcfs_rq->h_nr_runnable -= runnable_delta;
qcfs_rq->h_nr_idle -= idle_delta;
if (qcfs_rq->load.weight) {
/* Avoid re-evaluating load for this entity: */
@ -6017,18 +6013,18 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
se_update_runnable(se);
if (cfs_rq_is_idle(group_cfs_rq(se)))
idle_task_delta = cfs_rq->h_nr_running;
idle_delta = cfs_rq->h_nr_queued;
qcfs_rq->h_nr_running -= task_delta;
qcfs_rq->idle_h_nr_running -= idle_task_delta;
qcfs_rq->h_nr_delayed -= delayed_delta;
qcfs_rq->h_nr_queued -= queued_delta;
qcfs_rq->h_nr_runnable -= runnable_delta;
qcfs_rq->h_nr_idle -= idle_delta;
}
/* At this point se is NULL and we are at root level*/
sub_nr_running(rq, task_delta);
sub_nr_running(rq, queued_delta);
/* Stop the fair server if throttling resulted in no runnable tasks */
if (rq_h_nr_running && !rq->cfs.h_nr_running)
if (rq_h_nr_queued && !rq->cfs.h_nr_queued)
dl_server_stop(&rq->fair_server);
done:
/*
@ -6037,7 +6033,7 @@ done:
*/
cfs_rq->throttled = 1;
SCHED_WARN_ON(cfs_rq->throttled_clock);
if (cfs_rq->nr_running)
if (cfs_rq->nr_queued)
cfs_rq->throttled_clock = rq_clock(rq);
return true;
}
@ -6047,8 +6043,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
long task_delta, idle_task_delta, delayed_delta;
long rq_h_nr_running = rq->cfs.h_nr_running;
long queued_delta, runnable_delta, idle_delta;
long rq_h_nr_queued = rq->cfs.h_nr_queued;
se = cfs_rq->tg->se[cpu_of(rq)];
@ -6081,9 +6077,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
goto unthrottle_throttle;
}
task_delta = cfs_rq->h_nr_running;
idle_task_delta = cfs_rq->idle_h_nr_running;
delayed_delta = cfs_rq->h_nr_delayed;
queued_delta = cfs_rq->h_nr_queued;
runnable_delta = cfs_rq->h_nr_runnable;
idle_delta = cfs_rq->h_nr_idle;
for_each_sched_entity(se) {
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
@ -6097,11 +6093,11 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
if (cfs_rq_is_idle(group_cfs_rq(se)))
idle_task_delta = cfs_rq->h_nr_running;
idle_delta = cfs_rq->h_nr_queued;
qcfs_rq->h_nr_running += task_delta;
qcfs_rq->idle_h_nr_running += idle_task_delta;
qcfs_rq->h_nr_delayed += delayed_delta;
qcfs_rq->h_nr_queued += queued_delta;
qcfs_rq->h_nr_runnable += runnable_delta;
qcfs_rq->h_nr_idle += idle_delta;
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(qcfs_rq))
@ -6115,11 +6111,11 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
se_update_runnable(se);
if (cfs_rq_is_idle(group_cfs_rq(se)))
idle_task_delta = cfs_rq->h_nr_running;
idle_delta = cfs_rq->h_nr_queued;
qcfs_rq->h_nr_running += task_delta;
qcfs_rq->idle_h_nr_running += idle_task_delta;
qcfs_rq->h_nr_delayed += delayed_delta;
qcfs_rq->h_nr_queued += queued_delta;
qcfs_rq->h_nr_runnable += runnable_delta;
qcfs_rq->h_nr_idle += idle_delta;
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(qcfs_rq))
@ -6127,17 +6123,17 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
}
/* Start the fair server if un-throttling resulted in new runnable tasks */
if (!rq_h_nr_running && rq->cfs.h_nr_running)
if (!rq_h_nr_queued && rq->cfs.h_nr_queued)
dl_server_start(&rq->fair_server);
/* At this point se is NULL and we are at root level*/
add_nr_running(rq, task_delta);
add_nr_running(rq, queued_delta);
unthrottle_throttle:
assert_list_leaf_cfs_rq(rq);
/* Determine whether we need to wake up potentially idle CPU: */
if (rq->curr == rq->idle && rq->cfs.nr_running)
if (rq->curr == rq->idle && rq->cfs.nr_queued)
resched_curr(rq);
}
@ -6438,7 +6434,7 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
if (!cfs_bandwidth_used())
return;
if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
if (!cfs_rq->runtime_enabled || cfs_rq->nr_queued)
return;
__return_cfs_rq_runtime(cfs_rq);
@ -6709,6 +6705,10 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
lockdep_assert_rq_held(rq);
// Do not unthrottle for an active CPU
if (cpumask_test_cpu(cpu_of(rq), cpu_active_mask))
return;
/*
* The rq clock has already been updated in the
* set_rq_offline(), so we should skip updating
@ -6723,19 +6723,21 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
if (!cfs_rq->runtime_enabled)
continue;
/*
* clock_task is not advancing so we just need to make sure
* there's some valid quota amount
*/
cfs_rq->runtime_remaining = 1;
/*
* Offline rq is schedulable till CPU is completely disabled
* in take_cpu_down(), so we prevent new cfs throttling here.
*/
cfs_rq->runtime_enabled = 0;
if (cfs_rq_throttled(cfs_rq))
unthrottle_cfs_rq(cfs_rq);
if (!cfs_rq_throttled(cfs_rq))
continue;
/*
* clock_task is not advancing so we just need to make sure
* there's some valid quota amount
*/
cfs_rq->runtime_remaining = 1;
unthrottle_cfs_rq(cfs_rq);
}
rcu_read_unlock();
@ -6784,11 +6786,6 @@ static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p)
#else /* CONFIG_CFS_BANDWIDTH */
static inline bool cfs_bandwidth_used(void)
{
return false;
}
static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
@ -6846,7 +6843,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
SCHED_WARN_ON(task_rq(p) != rq);
if (rq->cfs.h_nr_running > 1) {
if (rq->cfs.h_nr_queued > 1) {
u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
u64 slice = se->slice;
s64 delta = slice - ran;
@ -6934,7 +6931,7 @@ static inline void check_update_overutilized_status(struct rq *rq) { }
/* Runqueue only has SCHED_IDLE tasks enqueued */
static int sched_idle_rq(struct rq *rq)
{
return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
return unlikely(rq->nr_running == rq->cfs.h_nr_idle &&
rq->nr_running);
}
@ -6961,14 +6958,14 @@ requeue_delayed_entity(struct sched_entity *se)
if (sched_feat(DELAY_ZERO)) {
update_entity_lag(cfs_rq, se);
if (se->vlag > 0) {
cfs_rq->nr_running--;
cfs_rq->nr_queued--;
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
se->vlag = 0;
place_entity(cfs_rq, se, 0);
if (se != cfs_rq->curr)
__enqueue_entity(cfs_rq, se);
cfs_rq->nr_running++;
cfs_rq->nr_queued++;
}
}
@ -6986,10 +6983,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
int idle_h_nr_running = task_has_idle_policy(p);
int h_nr_delayed = 0;
int h_nr_idle = task_has_idle_policy(p);
int h_nr_runnable = 1;
int task_new = !(flags & ENQUEUE_WAKEUP);
int rq_h_nr_running = rq->cfs.h_nr_running;
int rq_h_nr_queued = rq->cfs.h_nr_queued;
u64 slice = 0;
/*
@ -7014,8 +7011,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (p->in_iowait)
cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
if (task_new)
h_nr_delayed = !!se->sched_delayed;
if (task_new && se->sched_delayed)
h_nr_runnable = 0;
for_each_sched_entity(se) {
if (se->on_rq) {
@ -7037,12 +7034,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
enqueue_entity(cfs_rq, se, flags);
slice = cfs_rq_min_slice(cfs_rq);
cfs_rq->h_nr_running++;
cfs_rq->idle_h_nr_running += idle_h_nr_running;
cfs_rq->h_nr_delayed += h_nr_delayed;
cfs_rq->h_nr_runnable += h_nr_runnable;
cfs_rq->h_nr_queued++;
cfs_rq->h_nr_idle += h_nr_idle;
if (cfs_rq_is_idle(cfs_rq))
idle_h_nr_running = 1;
h_nr_idle = 1;
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
@ -7061,19 +7058,19 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
se->slice = slice;
slice = cfs_rq_min_slice(cfs_rq);
cfs_rq->h_nr_running++;
cfs_rq->idle_h_nr_running += idle_h_nr_running;
cfs_rq->h_nr_delayed += h_nr_delayed;
cfs_rq->h_nr_runnable += h_nr_runnable;
cfs_rq->h_nr_queued++;
cfs_rq->h_nr_idle += h_nr_idle;
if (cfs_rq_is_idle(cfs_rq))
idle_h_nr_running = 1;
h_nr_idle = 1;
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
goto enqueue_throttle;
}
if (!rq_h_nr_running && rq->cfs.h_nr_running) {
if (!rq_h_nr_queued && rq->cfs.h_nr_queued) {
/* Account for idle runtime */
if (!rq->nr_running)
dl_server_update_idle_time(rq, rq->curr);
@ -7120,22 +7117,22 @@ static void set_next_buddy(struct sched_entity *se);
static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
{
bool was_sched_idle = sched_idle_rq(rq);
int rq_h_nr_running = rq->cfs.h_nr_running;
int rq_h_nr_queued = rq->cfs.h_nr_queued;
bool task_sleep = flags & DEQUEUE_SLEEP;
bool task_delayed = flags & DEQUEUE_DELAYED;
struct task_struct *p = NULL;
int idle_h_nr_running = 0;
int h_nr_running = 0;
int h_nr_delayed = 0;
int h_nr_idle = 0;
int h_nr_queued = 0;
int h_nr_runnable = 0;
struct cfs_rq *cfs_rq;
u64 slice = 0;
if (entity_is_task(se)) {
p = task_of(se);
h_nr_running = 1;
idle_h_nr_running = task_has_idle_policy(p);
if (!task_sleep && !task_delayed)
h_nr_delayed = !!se->sched_delayed;
h_nr_queued = 1;
h_nr_idle = task_has_idle_policy(p);
if (task_sleep || task_delayed || !se->sched_delayed)
h_nr_runnable = 1;
} else {
cfs_rq = group_cfs_rq(se);
slice = cfs_rq_min_slice(cfs_rq);
@ -7151,12 +7148,12 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
break;
}
cfs_rq->h_nr_running -= h_nr_running;
cfs_rq->idle_h_nr_running -= idle_h_nr_running;
cfs_rq->h_nr_delayed -= h_nr_delayed;
cfs_rq->h_nr_runnable -= h_nr_runnable;
cfs_rq->h_nr_queued -= h_nr_queued;
cfs_rq->h_nr_idle -= h_nr_idle;
if (cfs_rq_is_idle(cfs_rq))
idle_h_nr_running = h_nr_running;
h_nr_idle = h_nr_queued;
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
@ -7190,21 +7187,21 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
se->slice = slice;
slice = cfs_rq_min_slice(cfs_rq);
cfs_rq->h_nr_running -= h_nr_running;
cfs_rq->idle_h_nr_running -= idle_h_nr_running;
cfs_rq->h_nr_delayed -= h_nr_delayed;
cfs_rq->h_nr_runnable -= h_nr_runnable;
cfs_rq->h_nr_queued -= h_nr_queued;
cfs_rq->h_nr_idle -= h_nr_idle;
if (cfs_rq_is_idle(cfs_rq))
idle_h_nr_running = h_nr_running;
h_nr_idle = h_nr_queued;
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
return 0;
}
sub_nr_running(rq, h_nr_running);
sub_nr_running(rq, h_nr_queued);
if (rq_h_nr_running && !rq->cfs.h_nr_running)
if (rq_h_nr_queued && !rq->cfs.h_nr_queued)
dl_server_stop(&rq->fair_server);
/* balance early to pull high priority tasks */
@ -8893,7 +8890,7 @@ static struct task_struct *pick_task_fair(struct rq *rq)
again:
cfs_rq = &rq->cfs;
if (!cfs_rq->nr_running)
if (!cfs_rq->nr_queued)
return NULL;
do {
@ -9010,7 +9007,7 @@ static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_stru
static bool fair_server_has_tasks(struct sched_dl_entity *dl_se)
{
return !!dl_se->rq->cfs.nr_running;
return !!dl_se->rq->cfs.nr_queued;
}
static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se)
@ -9411,11 +9408,15 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
/*
* We do not migrate tasks that are:
* 1) throttled_lb_pair, or
* 2) cannot be migrated to this CPU due to cpus_ptr, or
* 3) running (obviously), or
* 4) are cache-hot on their current CPU.
* 1) delayed dequeued unless we migrate load, or
* 2) throttled_lb_pair, or
* 3) cannot be migrated to this CPU due to cpus_ptr, or
* 4) running (obviously), or
* 5) are cache-hot on their current CPU.
*/
if ((p->se.sched_delayed) && (env->migration_type != migrate_load))
return 0;
if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
return 0;
@ -9800,7 +9801,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done)
if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
update_tg_load_avg(cfs_rq);
if (cfs_rq->nr_running == 0)
if (cfs_rq->nr_queued == 0)
update_idle_cfs_rq_clock_pelt(cfs_rq);
if (cfs_rq == &rq->cfs)
@ -10332,7 +10333,7 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
* When there is more than 1 task, the group_overloaded case already
* takes care of cpu with reduced capacity
*/
if (rq->cfs.h_nr_running != 1)
if (rq->cfs.h_nr_runnable != 1)
return false;
return check_cpu_capacity(rq, sd);
@ -10367,7 +10368,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->group_load += load;
sgs->group_util += cpu_util_cfs(i);
sgs->group_runnable += cpu_runnable(rq);
sgs->sum_h_nr_running += rq->cfs.h_nr_running;
sgs->sum_h_nr_running += rq->cfs.h_nr_runnable;
nr_running = rq->nr_running;
sgs->sum_nr_running += nr_running;
@ -10682,7 +10683,7 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
sgs->group_util += cpu_util_without(i, p);
sgs->group_runnable += cpu_runnable_without(rq, p);
local = task_running_on_cpu(i, p);
sgs->sum_h_nr_running += rq->cfs.h_nr_running - local;
sgs->sum_h_nr_running += rq->cfs.h_nr_runnable - local;
nr_running = rq->nr_running - local;
sgs->sum_nr_running += nr_running;
@ -11464,7 +11465,7 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env,
if (rt > env->fbq_type)
continue;
nr_running = rq->cfs.h_nr_running;
nr_running = rq->cfs.h_nr_runnable;
if (!nr_running)
continue;
@ -11623,7 +11624,7 @@ static int need_active_balance(struct lb_env *env)
* available on dst_cpu.
*/
if (env->idle &&
(env->src_rq->cfs.h_nr_running == 1)) {
(env->src_rq->cfs.h_nr_runnable == 1)) {
if ((check_cpu_capacity(env->src_rq, sd)) &&
(capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
return 1;
@ -12251,16 +12252,13 @@ static inline int on_null_domain(struct rq *rq)
* - When one of the busy CPUs notices that there may be an idle rebalancing
* needed, they will kick the idle load balancer, which then does idle
* load balancing for all the idle CPUs.
*
* - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED is not set
* anywhere yet.
*/
static inline int find_new_ilb(void)
{
const struct cpumask *hk_mask;
int ilb_cpu;
hk_mask = housekeeping_cpumask(HK_TYPE_MISC);
hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE);
for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) {
@ -12278,7 +12276,8 @@ static inline int find_new_ilb(void)
* Kick a CPU to do the NOHZ balancing, if it is time for it, via a cross-CPU
* SMP function call (IPI).
*
* We pick the first idle CPU in the HK_TYPE_MISC housekeeping set (if there is one).
* We pick the first idle CPU in the HK_TYPE_KERNEL_NOISE housekeeping set
* (if there is one).
*/
static void kick_ilb(unsigned int flags)
{
@ -12366,7 +12365,7 @@ static void nohz_balancer_kick(struct rq *rq)
* If there's a runnable CFS task and the current CPU has reduced
* capacity, kick the ILB to see if there's a better CPU to run on:
*/
if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
if (rq->cfs.h_nr_runnable >= 1 && check_cpu_capacity(rq, sd)) {
flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}
@ -12498,10 +12497,6 @@ void nohz_balance_enter_idle(int cpu)
if (!cpu_active(cpu))
return;
/* Spare idle load balancing on CPUs that don't want to be disturbed: */
if (!housekeeping_cpu(cpu, HK_TYPE_SCHED))
return;
/*
* Can be set safely without rq->lock held
* If a clear happens, it will have evaluated last additions because
@ -12721,13 +12716,6 @@ static void nohz_newidle_balance(struct rq *this_rq)
{
int this_cpu = this_rq->cpu;
/*
* This CPU doesn't want to be disturbed by scheduler
* housekeeping
*/
if (!housekeeping_cpu(this_cpu, HK_TYPE_SCHED))
return;
/* Will wake up very soon. No time for doing anything else*/
if (this_rq->avg_idle < sysctl_sched_migration_cost)
return;
@ -12864,11 +12852,11 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
* have been enqueued in the meantime. Since we're not going idle,
* pretend we pulled a task.
*/
if (this_rq->cfs.h_nr_running && !pulled_task)
if (this_rq->cfs.h_nr_queued && !pulled_task)
pulled_task = 1;
/* Is there a task of a high priority class? */
if (this_rq->nr_running != this_rq->cfs.h_nr_running)
if (this_rq->nr_running != this_rq->cfs.h_nr_queued)
pulled_task = -1;
out:
@ -12982,7 +12970,7 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
* MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check
* if we need to give up the CPU.
*/
if (rq->core->core_forceidle_count && rq->cfs.nr_running == 1 &&
if (rq->core->core_forceidle_count && rq->cfs.nr_queued == 1 &&
__entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
resched_curr(rq);
}
@ -13126,7 +13114,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
if (!task_on_rq_queued(p))
return;
if (rq->cfs.nr_running == 1)
if (rq->cfs.nr_queued == 1)
return;
/*
@ -13536,7 +13524,7 @@ int sched_group_set_idle(struct task_group *tg, long idle)
for_each_possible_cpu(i) {
struct rq *rq = cpu_rq(i);
struct sched_entity *se = tg->se[i];
struct cfs_rq *parent_cfs_rq, *grp_cfs_rq = tg->cfs_rq[i];
struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i];
bool was_idle = cfs_rq_is_idle(grp_cfs_rq);
long idle_task_delta;
struct rq_flags rf;
@ -13547,16 +13535,8 @@ int sched_group_set_idle(struct task_group *tg, long idle)
if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq)))
goto next_cpu;
if (se->on_rq) {
parent_cfs_rq = cfs_rq_of(se);
if (cfs_rq_is_idle(grp_cfs_rq))
parent_cfs_rq->idle_nr_running++;
else
parent_cfs_rq->idle_nr_running--;
}
idle_task_delta = grp_cfs_rq->h_nr_running -
grp_cfs_rq->idle_h_nr_running;
idle_task_delta = grp_cfs_rq->h_nr_queued -
grp_cfs_rq->h_nr_idle;
if (!cfs_rq_is_idle(grp_cfs_rq))
idle_task_delta *= -1;
@ -13566,7 +13546,7 @@ int sched_group_set_idle(struct task_group *tg, long idle)
if (!se->on_rq)
break;
cfs_rq->idle_h_nr_running += idle_task_delta;
cfs_rq->h_nr_idle += idle_task_delta;
/* Already accounted at parent level and above. */
if (cfs_rq_is_idle(cfs_rq))


@ -31,6 +31,15 @@ SCHED_FEAT(PREEMPT_SHORT, true)
*/
SCHED_FEAT(NEXT_BUDDY, false)
/*
* Allow completely ignoring cfs_rq->next; which can be set from various
* places:
* - NEXT_BUDDY (wakeup preemption)
* - yield_to_task()
* - cgroup dequeue / pick
*/
SCHED_FEAT(PICK_BUDDY, true)
/*
* Consider buddies to be cache hot, decreases the likeliness of a
* cache buddy being migrated away, increases cache locality.
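
As a usage note: sched_feat() flags are runtime toggles on kernels built with CONFIG_SCHED_DEBUG, so the buddy pick introduced above can be switched off for experiments by writing NO_PICK_BUDDY to /sys/kernel/debug/sched/features (older kernels expose the same knob as /sys/kernel/debug/sched_features).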


@ -9,15 +9,9 @@
*/
enum hk_flags {
HK_FLAG_TIMER = BIT(HK_TYPE_TIMER),
HK_FLAG_RCU = BIT(HK_TYPE_RCU),
HK_FLAG_MISC = BIT(HK_TYPE_MISC),
HK_FLAG_SCHED = BIT(HK_TYPE_SCHED),
HK_FLAG_TICK = BIT(HK_TYPE_TICK),
HK_FLAG_DOMAIN = BIT(HK_TYPE_DOMAIN),
HK_FLAG_WQ = BIT(HK_TYPE_WQ),
HK_FLAG_MANAGED_IRQ = BIT(HK_TYPE_MANAGED_IRQ),
HK_FLAG_KTHREAD = BIT(HK_TYPE_KTHREAD),
HK_FLAG_KERNEL_NOISE = BIT(HK_TYPE_KERNEL_NOISE),
};
DEFINE_STATIC_KEY_FALSE(housekeeping_overridden);
@ -97,7 +91,7 @@ void __init housekeeping_init(void)
static_branch_enable(&housekeeping_overridden);
if (housekeeping.flags & HK_FLAG_TICK)
if (housekeeping.flags & HK_FLAG_KERNEL_NOISE)
sched_tick_offload_init();
for_each_set_bit(type, &housekeeping.flags, HK_TYPE_MAX) {
@ -121,7 +115,7 @@ static int __init housekeeping_setup(char *str, unsigned long flags)
unsigned int first_cpu;
int err = 0;
if ((flags & HK_FLAG_TICK) && !(housekeeping.flags & HK_FLAG_TICK)) {
if ((flags & HK_FLAG_KERNEL_NOISE) && !(housekeeping.flags & HK_FLAG_KERNEL_NOISE)) {
if (!IS_ENABLED(CONFIG_NO_HZ_FULL)) {
pr_warn("Housekeeping: nohz unsupported."
" Build with CONFIG_NO_HZ_FULL\n");
@ -177,7 +171,7 @@ static int __init housekeeping_setup(char *str, unsigned long flags)
housekeeping_setup_type(type, housekeeping_staging);
}
if ((flags & HK_FLAG_TICK) && !(housekeeping.flags & HK_FLAG_TICK))
if ((flags & HK_FLAG_KERNEL_NOISE) && !(housekeeping.flags & HK_FLAG_KERNEL_NOISE))
tick_nohz_full_setup(non_housekeeping_mask);
housekeeping.flags |= flags;
@ -195,8 +189,7 @@ static int __init housekeeping_nohz_full_setup(char *str)
{
unsigned long flags;
flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU |
HK_FLAG_MISC | HK_FLAG_KTHREAD;
flags = HK_FLAG_KERNEL_NOISE;
return housekeeping_setup(str, flags);
}
@ -210,9 +203,12 @@ static int __init housekeeping_isolcpus_setup(char *str)
int len;
while (isalpha(*str)) {
/*
* isolcpus=nohz is equivalent to nohz_full.
*/
if (!strncmp(str, "nohz,", 5)) {
str += 5;
flags |= HK_FLAG_TICK;
flags |= HK_FLAG_KERNEL_NOISE;
continue;
}


@ -275,7 +275,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load)
*
* group: [ see update_cfs_group() ]
* se_weight() = tg->weight * grq->load_avg / tg->load_avg
* se_runnable() = grq->h_nr_running
* se_runnable() = grq->h_nr_runnable
*
* runnable_sum = se_runnable() * runnable = grq->runnable_sum
* runnable_avg = runnable_sum
@ -321,7 +321,7 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
{
if (___update_load_sum(now, &cfs_rq->avg,
scale_load_down(cfs_rq->load.weight),
cfs_rq->h_nr_running - cfs_rq->h_nr_delayed,
cfs_rq->h_nr_runnable,
cfs_rq->curr != NULL)) {
___update_load_avg(&cfs_rq->avg, 1);


@ -362,7 +362,7 @@ extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
extern bool __checkparam_dl(const struct sched_attr *attr);
extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
extern int dl_bw_check_overflow(int cpu);
extern int dl_bw_deactivate(int cpu);
extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec);
/*
* SCHED_DEADLINE supports servers (nested scheduling) with the following
@ -650,11 +650,10 @@ struct balance_callback {
/* CFS-related fields in a runqueue */
struct cfs_rq {
struct load_weight load;
unsigned int nr_running;
unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */
unsigned int idle_nr_running; /* SCHED_IDLE */
unsigned int idle_h_nr_running; /* SCHED_IDLE */
unsigned int h_nr_delayed;
unsigned int nr_queued;
unsigned int h_nr_queued; /* SCHED_{NORMAL,BATCH,IDLE} */
unsigned int h_nr_runnable; /* SCHED_{NORMAL,BATCH,IDLE} */
unsigned int h_nr_idle; /* SCHED_IDLE */
s64 avg_vruntime;
u64 avg_load;
@ -904,11 +903,8 @@ struct dl_rq {
static inline void se_update_runnable(struct sched_entity *se)
{
if (!entity_is_task(se)) {
struct cfs_rq *cfs_rq = se->my_q;
se->runnable_weight = cfs_rq->h_nr_running - cfs_rq->h_nr_delayed;
}
if (!entity_is_task(se))
se->runnable_weight = se->my_q->h_nr_runnable;
}
static inline long se_runnable(struct sched_entity *se)
@ -2280,7 +2276,7 @@ static inline int task_on_cpu(struct rq *rq, struct task_struct *p)
static inline int task_on_rq_queued(struct task_struct *p)
{
return p->on_rq == TASK_ON_RQ_QUEUED;
return READ_ONCE(p->on_rq) == TASK_ON_RQ_QUEUED;
}
static inline int task_on_rq_migrating(struct task_struct *p)
@ -2574,7 +2570,7 @@ static inline bool sched_rt_runnable(struct rq *rq)
static inline bool sched_fair_runnable(struct rq *rq)
{
return rq->cfs.nr_running > 0;
return rq->cfs.nr_queued > 0;
}
extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
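
Taken together with the fair.c, pelt.c and debug.c hunks above, the cfs_rq accounting renames read as follows: nr_running becomes nr_queued, h_nr_running becomes h_nr_queued, idle_h_nr_running becomes h_nr_idle, idle_nr_running is removed, and the separate h_nr_delayed counter is replaced by h_nr_runnable, i.e. h_nr_queued minus the delayed-dequeue entities, which is the quantity __update_load_avg_cfs_rq() and se_update_runnable() actually consume.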


@ -2721,9 +2721,11 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
/*
* This domain won't be destroyed and as such
* its dl_bw->total_bw needs to be cleared. It
* will be recomputed in function
* update_tasks_root_domain().
* its dl_bw->total_bw needs to be cleared.
* Tasks contribution will be then recomputed
* in function dl_update_tasks_root_domain(),
* dl_servers contribution in function
* dl_restore_server_root_domain().
*/
rd = cpu_rq(cpumask_any(doms_cur[i]))->rd;
dl_clear_root_domain(rd);