mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2025-01-08 14:13:53 +00:00
Merge tag 'sched_core_for_v5.17_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Borislav Petkov:
 "Mostly minor things this time; some highlights:

  - core-sched: Add 'Forced Idle' accounting; this allows tracking how
    much CPU time is 'lost' due to core scheduling constraints.

  - psi: Fix for MEM_FULL; a task running reclaim would be counted as a
    runnable task and prevent MEM_FULL from being reported.

  - cpuacct: Long-standing fixes for some cgroup accounting issues.

  - rt: The bandwidth timer could, under unusual circumstances, fail to
    be armed, leading to indefinite throttling."

[ Description above by Peter Zijlstra ]

* tag 'sched_core_for_v5.17_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/fair: Replace CFS internal cpu_util() with cpu_util_cfs()
  sched/fair: Cleanup task_util and capacity type
  sched/rt: Try to restart rt period timer when rt runtime exceeded
  sched/fair: Document the slow path and fast path in select_task_rq_fair
  sched/fair: Fix per-CPU kthread and wakee stacking for asym CPU capacity
  sched/fair: Fix detection of per-CPU kthreads waking a task
  sched/cpuacct: Make user/system times in cpuacct.stat more precise
  sched/cpuacct: Fix user/system in shown cpuacct.usage*
  cpuacct: Convert BUG_ON() to WARN_ON_ONCE()
  cputime, cpuacct: Include guest time in user time in cpuacct.stat
  psi: Fix PSI_MEM_FULL state when tasks are in memstall and doing reclaim
  sched/core: Forced idle accounting
  psi: Add a missing SPDX license header
  psi: Remove repeated verbose comment
This commit is contained in: commit 6ae71436cd
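The new forced-idle time is exported per task through /proc/<pid>/sched when both CONFIG_SCHED_CORE and CONFIG_SCHEDSTATS are enabled (see the kernel/sched/debug.c hunk below that adds PN_SCHEDSTAT(core_forceidle_sum)). A minimal userspace sketch for inspecting it; the exact field label and its formatting are assumptions based on that hunk, not a documented ABI.

/*
 * Sketch: dump the forced-idle time accounted to a task.
 * Assumes a kernel with CONFIG_SCHED_CORE and CONFIG_SCHEDSTATS, and
 * that proc_sched_show_task() labels the field "core_forceidle_sum"
 * (the value is a schedstat, kept in nanoseconds internally; the
 * printed formatting follows the other schedstat fields and is an
 * assumption here).
 */
#include <stdio.h>
#include <string.h>

int main(int argc, char **argv)
{
	char path[64], line[256];
	FILE *f;

	snprintf(path, sizeof(path), "/proc/%s/sched",
		 argc > 1 ? argv[1] : "self");
	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		if (strstr(line, "core_forceidle_sum"))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}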
@@ -1,3 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PSI_H
#define _LINUX_PSI_H

@@ -1,3 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PSI_TYPES_H
#define _LINUX_PSI_TYPES_H

@@ -21,7 +22,17 @@ enum psi_task_count {
* don't have to special case any state tracking for it.
*/
NR_ONCPU,
NR_PSI_TASK_COUNTS = 4,
/*
* For IO and CPU stalls the presence of running/oncpu tasks
* in the domain means a partial rather than a full stall.
* For memory it's not so simple because of page reclaimers:
* they are running/oncpu while representing a stall. To tell
* whether a domain has productivity left or not, we need to
* distinguish between regular running (i.e. productive)
* threads and memstall ones.
*/
NR_MEMSTALL_RUNNING,
NR_PSI_TASK_COUNTS = 5,
};

/* Task state bitmasks */

@@ -29,6 +40,7 @@ enum psi_task_count {
#define TSK_MEMSTALL (1 << NR_MEMSTALL)
#define TSK_RUNNING (1 << NR_RUNNING)
#define TSK_ONCPU (1 << NR_ONCPU)
#define TSK_MEMSTALL_RUNNING (1 << NR_MEMSTALL_RUNNING)

/* Resources that workloads could be stalled on */
enum psi_res {
@@ -523,7 +523,11 @@ struct sched_statistics {
u64 nr_wakeups_affine_attempts;
u64 nr_wakeups_passive;
u64 nr_wakeups_idle;

#ifdef CONFIG_SCHED_CORE
u64 core_forceidle_sum;
#endif
#endif /* CONFIG_SCHEDSTATS */
} ____cacheline_aligned;

struct sched_entity {
@@ -144,7 +144,7 @@ static inline bool __sched_core_less(struct task_struct *a, struct task_struct *
return false;

/* flip prio, so high prio is leftmost */
if (prio_less(b, a, task_rq(a)->core->core_forceidle))
if (prio_less(b, a, !!task_rq(a)->core->core_forceidle_count))
return true;

return false;

@@ -181,15 +181,23 @@ void sched_core_enqueue(struct rq *rq, struct task_struct *p)
rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less);
}

void sched_core_dequeue(struct rq *rq, struct task_struct *p)
void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
{
rq->core->core_task_seq++;

if (!sched_core_enqueued(p))
return;
if (sched_core_enqueued(p)) {
rb_erase(&p->core_node, &rq->core_tree);
RB_CLEAR_NODE(&p->core_node);
}

rb_erase(&p->core_node, &rq->core_tree);
RB_CLEAR_NODE(&p->core_node);
/*
* Migrating the last task off the cpu, with the cpu in forced idle
* state. Reschedule to create an accounting edge for forced idle,
* and re-examine whether the core is still in forced idle state.
*/
if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 &&
rq->core->core_forceidle_count && rq->curr == rq->idle)
resched_curr(rq);
}

/*

@@ -280,6 +288,8 @@ static void __sched_core_flip(bool enabled)
for_each_cpu(t, smt_mask)
cpu_rq(t)->core_enabled = enabled;

cpu_rq(cpu)->core->core_forceidle_start = 0;

sched_core_unlock(cpu, &flags);

cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);

@@ -364,7 +374,8 @@ void sched_core_put(void)

#else /* !CONFIG_SCHED_CORE */

static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
static inline void
sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }

#endif /* CONFIG_SCHED_CORE */

@@ -2005,7 +2016,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
if (sched_core_enabled(rq))
sched_core_dequeue(rq, p);
sched_core_dequeue(rq, p, flags);

if (!(flags & DEQUEUE_NOCLOCK))
update_rq_clock(rq);

@@ -5244,6 +5255,7 @@ void scheduler_tick(void)
if (sched_feat(LATENCY_WARN))
resched_latency = cpu_resched_latency(rq);
calc_global_load_tick(rq);
sched_core_tick(rq);

rq_unlock(rq, &rf);

@@ -5656,6 +5668,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
struct task_struct *next, *p, *max = NULL;
const struct cpumask *smt_mask;
bool fi_before = false;
bool core_clock_updated = (rq == rq->core);
unsigned long cookie;
int i, cpu, occ = 0;
struct rq *rq_i;

@@ -5708,10 +5721,18 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)

/* reset state */
rq->core->core_cookie = 0UL;
if (rq->core->core_forceidle) {
if (rq->core->core_forceidle_count) {
if (!core_clock_updated) {
update_rq_clock(rq->core);
core_clock_updated = true;
}
sched_core_account_forceidle(rq);
/* reset after accounting force idle */
rq->core->core_forceidle_start = 0;
rq->core->core_forceidle_count = 0;
rq->core->core_forceidle_occupation = 0;
need_sync = true;
fi_before = true;
rq->core->core_forceidle = false;
}

/*

@@ -5753,7 +5774,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
for_each_cpu_wrap(i, smt_mask, cpu) {
rq_i = cpu_rq(i);

if (i != cpu)
/*
* Current cpu always has its clock updated on entrance to
* pick_next_task(). If the current cpu is not the core,
* the core may also have been updated above.
*/
if (i != cpu && (rq_i != rq->core || !core_clock_updated))
update_rq_clock(rq_i);

p = rq_i->core_pick = pick_task(rq_i);

@@ -5783,7 +5809,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)

if (p == rq_i->idle) {
if (rq_i->nr_running) {
rq->core->core_forceidle = true;
rq->core->core_forceidle_count++;
if (!fi_before)
rq->core->core_forceidle_seq++;
}

@@ -5792,6 +5818,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
}
}

if (schedstat_enabled() && rq->core->core_forceidle_count) {
if (cookie)
rq->core->core_forceidle_start = rq_clock(rq->core);
rq->core->core_forceidle_occupation = occ;
}

rq->core->core_pick_seq = rq->core->core_task_seq;
next = rq->core_pick;
rq->core_sched_seq = rq->core->core_pick_seq;

@@ -5828,8 +5860,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* 1 0 1
* 1 1 0
*/
if (!(fi_before && rq->core->core_forceidle))
task_vruntime_update(rq_i, rq_i->core_pick, rq->core->core_forceidle);
if (!(fi_before && rq->core->core_forceidle_count))
task_vruntime_update(rq_i, rq_i->core_pick, !!rq->core->core_forceidle_count);

rq_i->core_pick->core_occupation = occ;

@@ -6033,11 +6065,19 @@ static void sched_core_cpu_deactivate(unsigned int cpu)
goto unlock;

/* copy the shared state to the new leader */
core_rq->core_task_seq = rq->core_task_seq;
core_rq->core_pick_seq = rq->core_pick_seq;
core_rq->core_cookie = rq->core_cookie;
core_rq->core_forceidle = rq->core_forceidle;
core_rq->core_forceidle_seq = rq->core_forceidle_seq;
core_rq->core_task_seq = rq->core_task_seq;
core_rq->core_pick_seq = rq->core_pick_seq;
core_rq->core_cookie = rq->core_cookie;
core_rq->core_forceidle_count = rq->core_forceidle_count;
core_rq->core_forceidle_seq = rq->core_forceidle_seq;
core_rq->core_forceidle_occupation = rq->core_forceidle_occupation;

/*
* Accounting edge for forced idle is handled in pick_next_task().
* Don't need another one here, since the hotplug thread shouldn't
* have a cookie.
*/
core_rq->core_forceidle_start = 0;

/* install new leader */
for_each_cpu(t, smt_mask) {

@@ -7126,7 +7166,7 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,

unsigned long sched_cpu_util(int cpu, unsigned long max)
{
return effective_cpu_util(cpu, cpu_util_cfs(cpu_rq(cpu)), max,
return effective_cpu_util(cpu, cpu_util_cfs(cpu), max,
ENERGY_UTIL, NULL);
}
#endif /* CONFIG_SMP */

@@ -9409,7 +9449,9 @@ void __init sched_init(void)
rq->core_pick = NULL;
rq->core_enabled = 0;
rq->core_tree = RB_ROOT;
rq->core_forceidle = false;
rq->core_forceidle_count = 0;
rq->core_forceidle_occupation = 0;
rq->core_forceidle_start = 0;

rq->core_cookie = 0UL;
#endif
@@ -73,7 +73,7 @@ static unsigned long sched_core_update_cookie(struct task_struct *p,

enqueued = sched_core_enqueued(p);
if (enqueued)
sched_core_dequeue(rq, p);
sched_core_dequeue(rq, p, DEQUEUE_SAVE);

old_cookie = p->core_cookie;
p->core_cookie = cookie;

@@ -85,6 +85,10 @@ static unsigned long sched_core_update_cookie(struct task_struct *p,
* If task is currently running, it may not be compatible anymore after
* the cookie change, so enter the scheduler on its CPU to schedule it
* away.
*
* Note that it is possible that as a result of this cookie change, the
* core has now entered/left forced idle state. Defer accounting to the
* next scheduling edge, rather than always forcing a reschedule here.
*/
if (task_running(rq, p))
resched_curr(rq);

@@ -232,3 +236,63 @@ int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type,
return err;
}

#ifdef CONFIG_SCHEDSTATS

/* REQUIRES: rq->core's clock recently updated. */
void __sched_core_account_forceidle(struct rq *rq)
{
const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq));
u64 delta, now = rq_clock(rq->core);
struct rq *rq_i;
struct task_struct *p;
int i;

lockdep_assert_rq_held(rq);

WARN_ON_ONCE(!rq->core->core_forceidle_count);

if (rq->core->core_forceidle_start == 0)
return;

delta = now - rq->core->core_forceidle_start;
if (unlikely((s64)delta <= 0))
return;

rq->core->core_forceidle_start = now;

if (WARN_ON_ONCE(!rq->core->core_forceidle_occupation)) {
/* can't be forced idle without a running task */
} else if (rq->core->core_forceidle_count > 1 ||
rq->core->core_forceidle_occupation > 1) {
/*
* For larger SMT configurations, we need to scale the charged
* forced idle amount since there can be more than one forced
* idle sibling and more than one running cookied task.
*/
delta *= rq->core->core_forceidle_count;
delta = div_u64(delta, rq->core->core_forceidle_occupation);
}

for_each_cpu(i, smt_mask) {
rq_i = cpu_rq(i);
p = rq_i->core_pick ?: rq_i->curr;

if (!p->core_cookie)
continue;

__schedstat_add(p->stats.core_forceidle_sum, delta);
}
}

void __sched_core_tick(struct rq *rq)
{
if (!rq->core->core_forceidle_count)
return;

if (rq != rq->core)
update_rq_clock(rq->core);

__sched_core_account_forceidle(rq);
}

#endif /* CONFIG_SCHEDSTATS */
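The scaling comment in __sched_core_account_forceidle() above is easiest to see with numbers: the elapsed forced-idle delta is multiplied by the number of forced-idle siblings and divided by the number of running cookie'd tasks, so each such task is charged its share. A standalone sketch of that arithmetic with made-up inputs; the helper below is illustrative, not kernel code.

/*
 * Sketch of the per-task charge computed above: with more than one
 * forced-idle sibling and/or more than one running cookie'd task, the
 * raw delta is scaled by forceidle_count / occupation.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t forceidle_charge(uint64_t delta_ns,
				 unsigned int forceidle_count,
				 unsigned int occupation)
{
	if (!occupation)	/* can't be forced idle without a running task */
		return 0;
	if (forceidle_count > 1 || occupation > 1)
		delta_ns = delta_ns * forceidle_count / occupation;
	return delta_ns;
}

int main(void)
{
	/* SMT4 core: 2 siblings forced idle, 2 cookie'd tasks running, 1ms window */
	printf("%llu ns charged to each running cookie'd task\n",
	       (unsigned long long)forceidle_charge(1000000ULL, 2, 2));
	return 0;
}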
@@ -21,15 +21,11 @@ static const char * const cpuacct_stat_desc[] = {
[CPUACCT_STAT_SYSTEM] = "system",
};

struct cpuacct_usage {
u64 usages[CPUACCT_STAT_NSTATS];
};

/* track CPU usage of a group of tasks and its child groups */
struct cpuacct {
struct cgroup_subsys_state css;
/* cpuusage holds pointer to a u64-type object on every CPU */
struct cpuacct_usage __percpu *cpuusage;
u64 __percpu *cpuusage;
struct kernel_cpustat __percpu *cpustat;
};

@@ -49,7 +45,7 @@ static inline struct cpuacct *parent_ca(struct cpuacct *ca)
return css_ca(ca->css.parent);
}

static DEFINE_PER_CPU(struct cpuacct_usage, root_cpuacct_cpuusage);
static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);
static struct cpuacct root_cpuacct = {
.cpustat = &kernel_cpustat,
.cpuusage = &root_cpuacct_cpuusage,

@@ -68,7 +64,7 @@ cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
if (!ca)
goto out;

ca->cpuusage = alloc_percpu(struct cpuacct_usage);
ca->cpuusage = alloc_percpu(u64);
if (!ca->cpuusage)
goto out_free_ca;

@@ -99,14 +95,16 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css)
static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
enum cpuacct_stat_index index)
{
struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
u64 data;

/*
* We allow index == CPUACCT_STAT_NSTATS here to read
* the sum of usages.
*/
BUG_ON(index > CPUACCT_STAT_NSTATS);
if (WARN_ON_ONCE(index > CPUACCT_STAT_NSTATS))
return 0;

#ifndef CONFIG_64BIT
/*

@@ -115,14 +113,17 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
raw_spin_rq_lock_irq(cpu_rq(cpu));
#endif

if (index == CPUACCT_STAT_NSTATS) {
int i = 0;

data = 0;
for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
data += cpuusage->usages[i];
} else {
data = cpuusage->usages[index];
switch (index) {
case CPUACCT_STAT_USER:
data = cpustat[CPUTIME_USER] + cpustat[CPUTIME_NICE];
break;
case CPUACCT_STAT_SYSTEM:
data = cpustat[CPUTIME_SYSTEM] + cpustat[CPUTIME_IRQ] +
cpustat[CPUTIME_SOFTIRQ];
break;
case CPUACCT_STAT_NSTATS:
data = *cpuusage;
break;
}

#ifndef CONFIG_64BIT

@@ -132,10 +133,14 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
return data;
}

static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu)
{
struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
int i;
u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;

/* Don't allow to reset global kernel_cpustat */
if (ca == &root_cpuacct)
return;

#ifndef CONFIG_64BIT
/*

@@ -143,9 +148,10 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
*/
raw_spin_rq_lock_irq(cpu_rq(cpu));
#endif

for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
cpuusage->usages[i] = val;
*cpuusage = 0;
cpustat[CPUTIME_USER] = cpustat[CPUTIME_NICE] = 0;
cpustat[CPUTIME_SYSTEM] = cpustat[CPUTIME_IRQ] = 0;
cpustat[CPUTIME_SOFTIRQ] = 0;

#ifndef CONFIG_64BIT
raw_spin_rq_unlock_irq(cpu_rq(cpu));

@@ -196,7 +202,7 @@ static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
return -EINVAL;

for_each_possible_cpu(cpu)
cpuacct_cpuusage_write(ca, cpu, 0);
cpuacct_cpuusage_write(ca, cpu);

return 0;
}

@@ -243,25 +249,10 @@ static int cpuacct_all_seq_show(struct seq_file *m, void *V)
seq_puts(m, "\n");

for_each_possible_cpu(cpu) {
struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);

seq_printf(m, "%d", cpu);

for (index = 0; index < CPUACCT_STAT_NSTATS; index++) {
#ifndef CONFIG_64BIT
/*
* Take rq->lock to make 64-bit read safe on 32-bit
* platforms.
*/
raw_spin_rq_lock_irq(cpu_rq(cpu));
#endif

seq_printf(m, " %llu", cpuusage->usages[index]);

#ifndef CONFIG_64BIT
raw_spin_rq_unlock_irq(cpu_rq(cpu));
#endif
}
for (index = 0; index < CPUACCT_STAT_NSTATS; index++)
seq_printf(m, " %llu",
cpuacct_cpuusage_read(ca, cpu, index));
seq_puts(m, "\n");
}
return 0;

@@ -270,25 +261,30 @@ static int cpuacct_all_seq_show(struct seq_file *m, void *V)
static int cpuacct_stats_show(struct seq_file *sf, void *v)
{
struct cpuacct *ca = css_ca(seq_css(sf));
s64 val[CPUACCT_STAT_NSTATS];
struct task_cputime cputime;
u64 val[CPUACCT_STAT_NSTATS];
int cpu;
int stat;

memset(val, 0, sizeof(val));
memset(&cputime, 0, sizeof(cputime));
for_each_possible_cpu(cpu) {
u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;

val[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER];
val[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE];
val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM];
val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ];
val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ];
cputime.utime += cpustat[CPUTIME_USER];
cputime.utime += cpustat[CPUTIME_NICE];
cputime.stime += cpustat[CPUTIME_SYSTEM];
cputime.stime += cpustat[CPUTIME_IRQ];
cputime.stime += cpustat[CPUTIME_SOFTIRQ];

cputime.sum_exec_runtime += *per_cpu_ptr(ca->cpuusage, cpu);
}

cputime_adjust(&cputime, &seq_css(sf)->cgroup->prev_cputime,
&val[CPUACCT_STAT_USER], &val[CPUACCT_STAT_SYSTEM]);

for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) {
seq_printf(sf, "%s %lld\n",
cpuacct_stat_desc[stat],
(long long)nsec_to_clock_t(val[stat]));
seq_printf(sf, "%s %llu\n", cpuacct_stat_desc[stat],
nsec_to_clock_t(val[stat]));
}

return 0;

@@ -339,16 +335,11 @@ static struct cftype files[] = {
void cpuacct_charge(struct task_struct *tsk, u64 cputime)
{
struct cpuacct *ca;
int index = CPUACCT_STAT_SYSTEM;
struct pt_regs *regs = get_irq_regs() ? : task_pt_regs(tsk);

if (regs && user_mode(regs))
index = CPUACCT_STAT_USER;

rcu_read_lock();

for (ca = task_ca(tsk); ca; ca = parent_ca(ca))
__this_cpu_add(ca->cpuusage->usages[index], cputime);
__this_cpu_add(*ca->cpuusage, cputime);

rcu_read_unlock();
}
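After the cpuacct rework above, cpuacct.usage* sums the plain per-CPU u64 counter, while cpuacct.stat derives user/system from kernel_cpustat and runs the totals through cputime_adjust(), so the split lines up with the precise sum_exec_runtime. A small userspace sketch that reads the resulting cpuacct.stat file; the mount point is an assumption about a cgroup v1 cpuacct hierarchy.

/*
 * Sketch: print the cpuacct.stat user/system split for the root group.
 * Values are in USER_HZ ticks (nsec_to_clock_t() in the kernel code above).
 * The path below assumes a v1 cpuacct controller mounted at
 * /sys/fs/cgroup/cpuacct.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/fs/cgroup/cpuacct/cpuacct.stat", "r");
	char name[32];
	unsigned long long ticks;

	if (!f) {
		perror("cpuacct.stat");
		return 1;
	}
	while (fscanf(f, "%31s %llu", name, &ticks) == 2)
		printf("%s: %llu ticks\n", name, ticks);
	fclose(f);
	return 0;
}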
@@ -168,7 +168,7 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu)

sg_cpu->max = max;
sg_cpu->bw_dl = cpu_bw_dl(rq);
sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(rq), max,
sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu), max,
FREQUENCY_UTIL, NULL);
}

@@ -148,10 +148,10 @@ void account_guest_time(struct task_struct *p, u64 cputime)

/* Add guest time to cpustat. */
if (task_nice(p) > 0) {
cpustat[CPUTIME_NICE] += cputime;
task_group_account_field(p, CPUTIME_NICE, cputime);
cpustat[CPUTIME_GUEST_NICE] += cputime;
} else {
cpustat[CPUTIME_USER] += cputime;
task_group_account_field(p, CPUTIME_USER, cputime);
cpustat[CPUTIME_GUEST] += cputime;
}
}
@@ -1023,6 +1023,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,

__PN(avg_atom);
__PN(avg_per_cpu);

#ifdef CONFIG_SCHED_CORE
PN_SCHEDSTAT(core_forceidle_sum);
#endif
}

__P(nr_switches);
@@ -1502,7 +1502,6 @@ struct task_numa_env {

static unsigned long cpu_load(struct rq *rq);
static unsigned long cpu_runnable(struct rq *rq);
static unsigned long cpu_util(int cpu);
static inline long adjust_numa_imbalance(int imbalance,
int dst_running, int dst_weight);

@@ -1569,7 +1568,7 @@ static void update_numa_stats(struct task_numa_env *env,

ns->load += cpu_load(rq);
ns->runnable += cpu_runnable(rq);
ns->util += cpu_util(cpu);
ns->util += cpu_util_cfs(cpu);
ns->nr_running += rq->cfs.h_nr_running;
ns->compute_capacity += capacity_of(cpu);

@@ -3240,7 +3239,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
* As is, the util number is not freq-invariant (we'd have to
* implement arch_scale_freq_capacity() for that).
*
* See cpu_util().
* See cpu_util_cfs().
*/
cpufreq_update_util(rq, flags);
}

@@ -4070,7 +4069,8 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
trace_sched_util_est_se_tp(&p->se);
}

static inline int task_fits_capacity(struct task_struct *p, long capacity)
static inline int task_fits_capacity(struct task_struct *p,
unsigned long capacity)
{
return fits_capacity(uclamp_task_util(p), capacity);
}

@@ -5509,11 +5509,9 @@ static inline void hrtick_update(struct rq *rq)
#endif

#ifdef CONFIG_SMP
static inline unsigned long cpu_util(int cpu);

static inline bool cpu_overutilized(int cpu)
{
return !fits_capacity(cpu_util(cpu), capacity_of(cpu));
return !fits_capacity(cpu_util_cfs(cpu), capacity_of(cpu));
}

static inline void update_overutilized_status(struct rq *rq)

@@ -6345,7 +6343,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
return best_cpu;
}

static inline bool asym_fits_capacity(int task_util, int cpu)
static inline bool asym_fits_capacity(unsigned long task_util, int cpu)
{
if (static_branch_unlikely(&sched_asym_cpucapacity))
return fits_capacity(task_util, capacity_of(cpu));

@@ -6398,8 +6396,10 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
* pattern is IO completions.
*/
if (is_per_cpu_kthread(current) &&
in_task() &&
prev == smp_processor_id() &&
this_rq()->nr_running <= 1) {
this_rq()->nr_running <= 1 &&
asym_fits_capacity(task_util, prev)) {
return prev;
}

@@ -6456,58 +6456,6 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
return target;
}

/**
* cpu_util - Estimates the amount of capacity of a CPU used by CFS tasks.
* @cpu: the CPU to get the utilization of
*
* The unit of the return value must be the one of capacity so we can compare
* the utilization with the capacity of the CPU that is available for CFS task
* (ie cpu_capacity).
*
* cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
* recent utilization of currently non-runnable tasks on a CPU. It represents
* the amount of utilization of a CPU in the range [0..capacity_orig] where
* capacity_orig is the cpu_capacity available at the highest frequency
* (arch_scale_freq_capacity()).
* The utilization of a CPU converges towards a sum equal to or less than the
* current capacity (capacity_curr <= capacity_orig) of the CPU because it is
* the running time on this CPU scaled by capacity_curr.
*
* The estimated utilization of a CPU is defined to be the maximum between its
* cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks
* currently RUNNABLE on that CPU.
* This allows to properly represent the expected utilization of a CPU which
* has just got a big task running since a long sleep period. At the same time
* however it preserves the benefits of the "blocked utilization" in
* describing the potential for other tasks waking up on the same CPU.
*
* Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
* higher than capacity_orig because of unfortunate rounding in
* cfs.avg.util_avg or just after migrating tasks and new task wakeups until
* the average stabilizes with the new running time. We need to check that the
* utilization stays within the range of [0..capacity_orig] and cap it if
* necessary. Without utilization capping, a group could be seen as overloaded
* (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
* available capacity. We allow utilization to overshoot capacity_curr (but not
* capacity_orig) as it useful for predicting the capacity required after task
* migrations (scheduler-driven DVFS).
*
* Return: the (estimated) utilization for the specified CPU
*/
static inline unsigned long cpu_util(int cpu)
{
struct cfs_rq *cfs_rq;
unsigned int util;

cfs_rq = &cpu_rq(cpu)->cfs;
util = READ_ONCE(cfs_rq->avg.util_avg);

if (sched_feat(UTIL_EST))
util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));

return min_t(unsigned long, util, capacity_orig_of(cpu));
}

/*
* cpu_util_without: compute cpu utilization without any contributions from *p
* @cpu: the CPU which utilization is requested

@@ -6528,7 +6476,7 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p)

/* Task has no contribution or is new */
if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
return cpu_util(cpu);
return cpu_util_cfs(cpu);

cfs_rq = &cpu_rq(cpu)->cfs;
util = READ_ONCE(cfs_rq->avg.util_avg);

@@ -6592,7 +6540,7 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p)
/*
* Utilization (estimated) can exceed the CPU capacity, thus let's
* clamp to the maximum CPU capacity to ensure consistency with
* the cpu_util call.
* cpu_util.
*/
return min_t(unsigned long, util, capacity_orig_of(cpu));
}

@@ -6624,7 +6572,7 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
* During wake-up, the task isn't enqueued yet and doesn't
* appear in the cfs_rq->avg.util_est.enqueued of any rq,
* so just add it (if needed) to "simulate" what will be
* cpu_util() after the task has been enqueued.
* cpu_util after the task has been enqueued.
*/
if (dst_cpu == cpu)
util_est += _task_util_est(p);

@@ -6915,6 +6863,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
break;
}

/*
* Usually only true for WF_EXEC and WF_FORK, as sched_domains
* usually do not have SD_BALANCE_WAKE set. That means wakeup
* will usually go to the fast path.
*/
if (tmp->flags & sd_flag)
sd = tmp;
else if (!want_affine)

@@ -8681,7 +8634,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
struct rq *rq = cpu_rq(i);

sgs->group_load += cpu_load(rq);
sgs->group_util += cpu_util(i);
sgs->group_util += cpu_util_cfs(i);
sgs->group_runnable += cpu_runnable(rq);
sgs->sum_h_nr_running += rq->cfs.h_nr_running;

@@ -9699,7 +9652,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
break;

case migrate_util:
util = cpu_util(cpu_of(rq));
util = cpu_util_cfs(i);

/*
* Don't try to pull utilization from a CPU with one

@@ -11068,7 +11021,7 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
* MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check
* if we need to give up the CPU.
*/
if (rq->core->core_forceidle && rq->cfs.nr_running == 1 &&
if (rq->core->core_forceidle_count && rq->cfs.nr_running == 1 &&
__entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
resched_curr(rq);
}
@@ -1,3 +1,4 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Pressure stall information for CPU, memory and IO
*

@@ -34,13 +35,19 @@
* delayed on that resource such that nobody is advancing and the CPU
* goes idle. This leaves both workload and CPU unproductive.
*
* Naturally, the FULL state doesn't exist for the CPU resource at the
* system level, but exist at the cgroup level, means all non-idle tasks
* in a cgroup are delayed on the CPU resource which used by others outside
* of the cgroup or throttled by the cgroup cpu.max configuration.
*
* SOME = nr_delayed_tasks != 0
* FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0
* FULL = nr_delayed_tasks != 0 && nr_productive_tasks == 0
*
* What it means for a task to be productive is defined differently
* for each resource. For IO, productive means a running task. For
* memory, productive means a running task that isn't a reclaimer. For
* CPU, productive means an oncpu task.
*
* Naturally, the FULL state doesn't exist for the CPU resource at the
* system level, but exist at the cgroup level. At the cgroup level,
* FULL means all non-idle tasks in the cgroup are delayed on the CPU
* resource which is being used by others outside of the cgroup or
* throttled by the cgroup cpu.max configuration.
*
* The percentage of wallclock time spent in those compound stall
* states gives pressure numbers between 0 and 100 for each resource,

@@ -81,13 +88,13 @@
*
* threads = min(nr_nonidle_tasks, nr_cpus)
* SOME = min(nr_delayed_tasks / threads, 1)
* FULL = (threads - min(nr_running_tasks, threads)) / threads
* FULL = (threads - min(nr_productive_tasks, threads)) / threads
*
* For the 257 number crunchers on 256 CPUs, this yields:
*
* threads = min(257, 256)
* SOME = min(1 / 256, 1) = 0.4%
* FULL = (256 - min(257, 256)) / 256 = 0%
* FULL = (256 - min(256, 256)) / 256 = 0%
*
* For the 1 out of 4 memory-delayed tasks, this yields:
*

@@ -112,7 +119,7 @@
* For each runqueue, we track:
*
* tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0)
* tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_running_tasks[cpu])
* tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_productive_tasks[cpu])
* tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0)
*
* and then periodically aggregate:
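The SOME/FULL formulas in the comment above can be checked by hand; below is a small sketch that reproduces the two worked examples (257 number crunchers on 256 CPUs, and 1 of 4 tasks delayed on memory with the other 3 still productive), using the updated nr_productive_tasks term.

/*
 * Sketch of the per-resource pressure formulas from the comment above:
 *   threads = min(nr_nonidle, nr_cpus)
 *   SOME    = min(nr_delayed / threads, 1)
 *   FULL    = (threads - min(nr_productive, threads)) / threads
 */
#include <stdio.h>

static void pressure(unsigned int nr_cpus, unsigned int nr_nonidle,
		     unsigned int nr_delayed, unsigned int nr_productive)
{
	double threads = nr_nonidle < nr_cpus ? nr_nonidle : nr_cpus;
	double some = nr_delayed / threads;
	double full;

	if (some > 1.0)
		some = 1.0;
	full = (threads - (nr_productive < threads ? nr_productive : threads))
		/ threads;
	printf("SOME = %.1f%%  FULL = %.1f%%\n", some * 100, full * 100);
}

int main(void)
{
	pressure(256, 257, 1, 256);	/* 257 number crunchers on 256 CPUs */
	pressure(4, 4, 1, 3);		/* 1 of 4 tasks delayed on memory */
	return 0;
}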
@@ -233,7 +240,8 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
case PSI_MEM_SOME:
return unlikely(tasks[NR_MEMSTALL]);
case PSI_MEM_FULL:
return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]);
return unlikely(tasks[NR_MEMSTALL] &&
tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
case PSI_CPU_SOME:
return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
case PSI_CPU_FULL:

@@ -710,10 +718,11 @@ static void psi_group_change(struct psi_group *group, int cpu,
if (groupc->tasks[t]) {
groupc->tasks[t]--;
} else if (!psi_bug) {
printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n",
cpu, t, groupc->tasks[0],
groupc->tasks[1], groupc->tasks[2],
groupc->tasks[3], clear, set);
groupc->tasks[3], groupc->tasks[4],
clear, set);
psi_bug = 1;
}
}

@@ -833,7 +842,6 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
/*
* When switching between tasks that have an identical
* runtime state, the cgroup that contains both tasks
* runtime state, the cgroup that contains both tasks
* we reach the first common ancestor. Iterate @next's
* ancestors only until we encounter @prev's ONCPU.
*/

@@ -854,12 +862,15 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
int clear = TSK_ONCPU, set = 0;

/*
* When we're going to sleep, psi_dequeue() lets us handle
* TSK_RUNNING and TSK_IOWAIT here, where we can combine it
* with TSK_ONCPU and save walking common ancestors twice.
* When we're going to sleep, psi_dequeue() lets us
* handle TSK_RUNNING, TSK_MEMSTALL_RUNNING and
* TSK_IOWAIT here, where we can combine it with
* TSK_ONCPU and save walking common ancestors twice.
*/
if (sleep) {
clear |= TSK_RUNNING;
if (prev->in_memstall)
clear |= TSK_MEMSTALL_RUNNING;
if (prev->in_iowait)
set |= TSK_IOWAIT;
}

@@ -908,7 +919,7 @@ void psi_memstall_enter(unsigned long *flags)
rq = this_rq_lock_irq(&rf);

current->in_memstall = 1;
psi_task_change(current, 0, TSK_MEMSTALL);
psi_task_change(current, 0, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);

rq_unlock_irq(rq, &rf);
}

@@ -937,7 +948,7 @@ void psi_memstall_leave(unsigned long *flags)
rq = this_rq_lock_irq(&rf);

current->in_memstall = 0;
psi_task_change(current, TSK_MEMSTALL, 0);
psi_task_change(current, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING, 0);

rq_unlock_irq(rq, &rf);
}
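To make the new PSI_MEM_FULL condition in test_state() above concrete: a CPU whose only runnable tasks are reclaimers is now still reported as fully stalled on memory. A standalone sketch of the predicate evaluated on a sample task-count vector; the enum mirrors include/linux/psi_types.h from this series.

/*
 * Sketch of the PSI_MEM_FULL test above: stalled on memory, and every
 * running task is itself a memstall reclaimer, i.e. nothing productive
 * is left on the CPU.
 */
#include <stdbool.h>
#include <stdio.h>

enum { NR_IOWAIT, NR_MEMSTALL, NR_RUNNING, NR_ONCPU, NR_MEMSTALL_RUNNING,
       NR_PSI_TASK_COUNTS };

static bool mem_full(const unsigned int *tasks)
{
	return tasks[NR_MEMSTALL] &&
	       tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING];
}

int main(void)
{
	/* one task in memstall, and the single running task is doing reclaim */
	unsigned int tasks[NR_PSI_TASK_COUNTS] = {
		[NR_MEMSTALL] = 1, [NR_RUNNING] = 1, [NR_MEMSTALL_RUNNING] = 1,
	};

	printf("MEM_FULL: %d\n", mem_full(tasks));	/* 1: now reported */
	tasks[NR_MEMSTALL_RUNNING] = 0;			/* productive runner */
	printf("MEM_FULL: %d\n", mem_full(tasks));	/* 0 */
	return 0;
}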
@@ -52,11 +52,8 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
rt_b->rt_period_timer.function = sched_rt_period_timer;
}

static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
return;

raw_spin_lock(&rt_b->rt_runtime_lock);
if (!rt_b->rt_period_active) {
rt_b->rt_period_active = 1;

@@ -75,6 +72,14 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
raw_spin_unlock(&rt_b->rt_runtime_lock);
}

static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
return;

do_start_rt_bandwidth(rt_b);
}

void init_rt_rq(struct rt_rq *rt_rq)
{
struct rt_prio_array *array;

@@ -1031,13 +1036,17 @@ static void update_curr_rt(struct rq *rq)

for_each_sched_rt_entity(rt_se) {
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
int exceeded;

if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
raw_spin_lock(&rt_rq->rt_runtime_lock);
rt_rq->rt_time += delta_exec;
if (sched_rt_runtime_exceeded(rt_rq))
exceeded = sched_rt_runtime_exceeded(rt_rq);
if (exceeded)
resched_curr(rq);
raw_spin_unlock(&rt_rq->rt_runtime_lock);
if (exceeded)
do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq));
}
}
}

@@ -2911,8 +2920,12 @@ static int sched_rt_global_validate(void)

static void sched_rt_do_global(void)
{
unsigned long flags;

raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
def_rt_bandwidth.rt_runtime = global_rt_runtime();
def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
}

int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
@@ -1111,8 +1111,10 @@ struct rq {
unsigned int core_task_seq;
unsigned int core_pick_seq;
unsigned long core_cookie;
unsigned char core_forceidle;
unsigned int core_forceidle_count;
unsigned int core_forceidle_seq;
unsigned int core_forceidle_occupation;
u64 core_forceidle_start;
#endif
};

@@ -1253,7 +1255,7 @@ static inline bool sched_core_enqueued(struct task_struct *p)
}

extern void sched_core_enqueue(struct rq *rq, struct task_struct *p);
extern void sched_core_dequeue(struct rq *rq, struct task_struct *p);
extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags);

extern void sched_core_get(void);
extern void sched_core_put(void);

@@ -1854,6 +1856,32 @@ static inline void flush_smp_call_function_from_idle(void) { }
#include "stats.h"
#include "autogroup.h"

#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS)

extern void __sched_core_account_forceidle(struct rq *rq);

static inline void sched_core_account_forceidle(struct rq *rq)
{
if (schedstat_enabled())
__sched_core_account_forceidle(rq);
}

extern void __sched_core_tick(struct rq *rq);

static inline void sched_core_tick(struct rq *rq)
{
if (sched_core_enabled(rq) && schedstat_enabled())
__sched_core_tick(rq);
}

#else

static inline void sched_core_account_forceidle(struct rq *rq) {}

static inline void sched_core_tick(struct rq *rq) {}

#endif /* CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS */

#ifdef CONFIG_CGROUP_SCHED

/*

@@ -2938,16 +2966,52 @@ static inline unsigned long cpu_util_dl(struct rq *rq)
return READ_ONCE(rq->avg_dl.util_avg);
}

static inline unsigned long cpu_util_cfs(struct rq *rq)
/**
* cpu_util_cfs() - Estimates the amount of CPU capacity used by CFS tasks.
* @cpu: the CPU to get the utilization for.
*
* The unit of the return value must be the same as the one of CPU capacity
* so that CPU utilization can be compared with CPU capacity.
*
* CPU utilization is the sum of running time of runnable tasks plus the
* recent utilization of currently non-runnable tasks on that CPU.
* It represents the amount of CPU capacity currently used by CFS tasks in
* the range [0..max CPU capacity] with max CPU capacity being the CPU
* capacity at f_max.
*
* The estimated CPU utilization is defined as the maximum between CPU
* utilization and sum of the estimated utilization of the currently
* runnable tasks on that CPU. It preserves a utilization "snapshot" of
* previously-executed tasks, which helps better deduce how busy a CPU will
* be when a long-sleeping task wakes up. The contribution to CPU utilization
* of such a task would be significantly decayed at this point of time.
*
* CPU utilization can be higher than the current CPU capacity
* (f_curr/f_max * max CPU capacity) or even the max CPU capacity because
* of rounding errors as well as task migrations or wakeups of new tasks.
* CPU utilization has to be capped to fit into the [0..max CPU capacity]
* range. Otherwise a group of CPUs (CPU0 util = 121% + CPU1 util = 80%)
* could be seen as over-utilized even though CPU1 has 20% of spare CPU
* capacity. CPU utilization is allowed to overshoot current CPU capacity
* though since this is useful for predicting the CPU capacity required
* after task migrations (scheduler-driven DVFS).
*
* Return: (Estimated) utilization for the specified CPU.
*/
static inline unsigned long cpu_util_cfs(int cpu)
{
unsigned long util = READ_ONCE(rq->cfs.avg.util_avg);
struct cfs_rq *cfs_rq;
unsigned long util;

cfs_rq = &cpu_rq(cpu)->cfs;
util = READ_ONCE(cfs_rq->avg.util_avg);

if (sched_feat(UTIL_EST)) {
util = max_t(unsigned long, util,
READ_ONCE(rq->cfs.avg.util_est.enqueued));
READ_ONCE(cfs_rq->avg.util_est.enqueued));
}

return util;
return min(util, capacity_orig_of(cpu));
}

static inline unsigned long cpu_util_rt(struct rq *rq)
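Callers of the relocated cpu_util_cfs() now compare the already-clamped estimate directly against CPU capacity, as cpu_overutilized() in the fair.c hunk above does. Below is a caller-side sketch of that pattern outside the kernel, with plain numbers standing in for cpu_util_cfs(cpu) and capacity_of(cpu) and a roughly 20% headroom margin in the spirit of fits_capacity(); the helper name and exact margin here are illustrative assumptions.

/*
 * Sketch (not kernel code): decide whether a CPU still has headroom by
 * comparing clamped CFS utilization against its capacity with a ~20%
 * margin, the same idea as fits_capacity()/cpu_overutilized() above.
 */
#include <stdbool.h>
#include <stdio.h>

static bool util_fits_capacity(unsigned long util, unsigned long capacity)
{
	return util * 1280 < capacity * 1024;	/* util below ~80% of capacity */
}

int main(void)
{
	printf("%d\n", util_fits_capacity(700, 1024));	/* 1: fits */
	printf("%d\n", util_fits_capacity(900, 1024));	/* 0: overutilized */
	return 0;
}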
@@ -118,6 +118,9 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
if (static_branch_likely(&psi_disabled))
return;

if (p->in_memstall)
set |= TSK_MEMSTALL_RUNNING;

if (!wakeup || p->sched_psi_wake_requeue) {
if (p->in_memstall)
set |= TSK_MEMSTALL;

@@ -148,7 +151,7 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep)
return;

if (p->in_memstall)
clear |= TSK_MEMSTALL;
clear |= (TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);

psi_task_change(p, clear, 0);
}