mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2025-01-09 14:43:16 +00:00
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar: "The main changes in this cycle are: - Various NUMA scheduling updates: harmonize the load-balancer and NUMA placement logic to not work against each other. The intended result is better locality, better utilization and fewer migrations. - Introduce Thermal Pressure tracking and optimizations, to improve task placement on thermally overloaded systems. - Implement frequency invariant scheduler accounting on (some) x86 CPUs. This is done by observing and sampling the 'recent' CPU frequency average at ~tick boundaries. The CPU provides this data via the APERF/MPERF MSRs. This hopefully makes our capacity estimates more precise and keeps tasks on the same CPU better even if it might seem overloaded at a lower momentary frequency. (As usual, turbo mode is a complication that we resolve by observing the maximum frequency and renormalizing to it.) - Add asymmetric CPU capacity wakeup scan to improve capacity utilization on asymmetric topologies. (big.LITTLE systems) - PSI fixes and optimizations. - RT scheduling capacity awareness fixes & improvements. - Optimize the CONFIG_RT_GROUP_SCHED constraints code. 
- Misc fixes, cleanups and optimizations - see the changelog for details" * 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (62 commits) threads: Update PID limit comment according to futex UAPI change sched/fair: Fix condition of avg_load calculation sched/rt: cpupri_find: Trigger a full search as fallback kthread: Do not preempt current task if it is going to call schedule() sched/fair: Improve spreading of utilization sched: Avoid scale real weight down to zero psi: Move PF_MEMSTALL out of task->flags MAINTAINERS: Add maintenance information for psi psi: Optimize switching tasks inside shared cgroups psi: Fix cpu.pressure for cpu.max and competing cgroups sched/core: Distribute tasks within affinity masks sched/fair: Fix enqueue_task_fair warning thermal/cpu-cooling, sched/core: Move the arch_set_thermal_pressure() API to generic scheduler code sched/rt: Remove unnecessary push for unfit tasks sched/rt: Allow pulling unfitting task sched/rt: Optimize cpupri_find() on non-heterogenous systems sched/rt: Re-instate old behavior in select_task_rq_rt() sched/rt: cpupri_find: Implement fallback mechanism for !fit case sched/fair: Fix reordering of enqueue/dequeue_task_fair() sched/fair: Fix runnable_avg for throttled cfs ...
This commit is contained in:
commit
642e53ead6
@ -4428,6 +4428,22 @@
|
||||
incurs a small amount of overhead in the scheduler
|
||||
but is useful for debugging and performance tuning.
|
||||
|
||||
sched_thermal_decay_shift=
|
||||
[KNL, SMP] Set a decay shift for scheduler thermal
|
||||
pressure signal. Thermal pressure signal follows the
|
||||
default decay period of other scheduler pelt
|
||||
signals (usually 32 ms but configurable). Setting
|
||||
sched_thermal_decay_shift will left shift the decay
|
||||
period for the thermal pressure signal by the shift
|
||||
value.
|
||||
i.e. with the default pelt decay period of 32 ms
|
||||
sched_thermal_decay_shift thermal pressure decay period
|
||||
1 64 ms
|
||||
2 128 ms
|
||||
and so on.
|
||||
Format: integer between 0 and 10
|
||||
Default is 0.
|
||||
|
||||
skew_tick= [KNL] Offset the periodic timer tick per cpu to mitigate
|
||||
xtime_lock contention on larger systems, and/or RCU lock
|
||||
contention on all systems with CONFIG_MAXSMP set.
|
||||
|
@ -61,8 +61,8 @@ setup that list.
|
||||
address of the associated 'lock entry', plus or minus, of what will
|
||||
be called the 'lock word', from that 'lock entry'. The 'lock word'
|
||||
is always a 32 bit word, unlike the other words above. The 'lock
|
||||
word' holds 3 flag bits in the upper 3 bits, and the thread id (TID)
|
||||
of the thread holding the lock in the bottom 29 bits. See further
|
||||
word' holds 2 flag bits in the upper 2 bits, and the thread id (TID)
|
||||
of the thread holding the lock in the bottom 30 bits. See further
|
||||
below for a description of the flag bits.
|
||||
|
||||
The third word, called 'list_op_pending', contains a transient copy of
|
||||
@ -128,7 +128,7 @@ that thread's robust_futex linked lock list a given time.
|
||||
A given futex lock structure in a user shared memory region may be held
|
||||
at different times by any of the threads with access to that region. The
|
||||
thread currently holding such a lock, if any, is marked with the thread's
|
||||
TID in the lower 29 bits of the 'lock word'.
|
||||
TID in the lower 30 bits of the 'lock word'.
|
||||
|
||||
When adding or removing a lock from its list of held locks, in order for
|
||||
the kernel to correctly handle lock cleanup regardless of when the task
|
||||
@ -141,7 +141,7 @@ On insertion:
|
||||
1) set the 'list_op_pending' word to the address of the 'lock entry'
|
||||
to be inserted,
|
||||
2) acquire the futex lock,
|
||||
3) add the lock entry, with its thread id (TID) in the bottom 29 bits
|
||||
3) add the lock entry, with its thread id (TID) in the bottom 30 bits
|
||||
of the 'lock word', to the linked list starting at 'head', and
|
||||
4) clear the 'list_op_pending' word.
|
||||
|
||||
@ -155,7 +155,7 @@ On removal:
|
||||
|
||||
On exit, the kernel will consider the address stored in
|
||||
'list_op_pending' and the address of each 'lock word' found by walking
|
||||
the list starting at 'head'. For each such address, if the bottom 29
|
||||
the list starting at 'head'. For each such address, if the bottom 30
|
||||
bits of the 'lock word' at offset 'offset' from that address equals the
|
||||
exiting thread's TID, then the kernel will do two things:
|
||||
|
||||
@ -180,7 +180,5 @@ any point:
|
||||
future kernel configuration changes) elements.
|
||||
|
||||
When the kernel sees a list entry whose 'lock word' doesn't have the
|
||||
current threads TID in the lower 29 bits, it does nothing with that
|
||||
current thread's TID in the lower 30 bits, it does nothing with that
|
||||
entry, and goes on to the next entry.
|
||||
|
||||
Bit 29 (0x20000000) of the 'lock word' is reserved for future use.
|
||||
|
@ -13552,6 +13552,12 @@ F: net/psample
|
||||
F: include/net/psample.h
|
||||
F: include/uapi/linux/psample.h
|
||||
|
||||
PRESSURE STALL INFORMATION (PSI)
|
||||
M: Johannes Weiner <hannes@cmpxchg.org>
|
||||
S: Maintained
|
||||
F: kernel/sched/psi.c
|
||||
F: include/linux/psi*
|
||||
|
||||
PSTORE FILESYSTEM
|
||||
M: Kees Cook <keescook@chromium.org>
|
||||
M: Anton Vorontsov <anton@enomsg.org>
|
||||
|
@ -16,6 +16,9 @@
|
||||
/* Enable topology flag updates */
|
||||
#define arch_update_cpu_topology topology_update_cpu_topology
|
||||
|
||||
/* Replace task scheduler's default thermal pressure retrieve API */
|
||||
#define arch_scale_thermal_pressure topology_get_thermal_pressure
|
||||
|
||||
#else
|
||||
|
||||
static inline void init_cpu_topology(void) { }
|
||||
|
@ -62,6 +62,7 @@ CONFIG_ARCH_ZX=y
|
||||
CONFIG_ARCH_ZYNQMP=y
|
||||
CONFIG_ARM64_VA_BITS_48=y
|
||||
CONFIG_SCHED_MC=y
|
||||
CONFIG_SCHED_SMT=y
|
||||
CONFIG_NUMA=y
|
||||
CONFIG_SECCOMP=y
|
||||
CONFIG_KEXEC=y
|
||||
|
@ -25,6 +25,9 @@ int pcibus_to_node(struct pci_bus *bus);
|
||||
/* Enable topology flag updates */
|
||||
#define arch_update_cpu_topology topology_update_cpu_topology
|
||||
|
||||
/* Replace task scheduler's default thermal pressure retrieve API */
|
||||
#define arch_scale_thermal_pressure topology_get_thermal_pressure
|
||||
|
||||
#include <asm-generic/topology.h>
|
||||
|
||||
#endif /* _ASM_ARM_TOPOLOGY_H */
|
||||
|
@ -193,4 +193,29 @@ static inline void sched_clear_itmt_support(void)
|
||||
}
|
||||
#endif /* CONFIG_SCHED_MC_PRIO */
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
#include <asm/cpufeature.h>
|
||||
|
||||
DECLARE_STATIC_KEY_FALSE(arch_scale_freq_key);
|
||||
|
||||
#define arch_scale_freq_invariant() static_branch_likely(&arch_scale_freq_key)
|
||||
|
||||
DECLARE_PER_CPU(unsigned long, arch_freq_scale);
|
||||
|
||||
static inline long arch_scale_freq_capacity(int cpu)
|
||||
{
|
||||
return per_cpu(arch_freq_scale, cpu);
|
||||
}
|
||||
#define arch_scale_freq_capacity arch_scale_freq_capacity
|
||||
|
||||
extern void arch_scale_freq_tick(void);
|
||||
#define arch_scale_freq_tick arch_scale_freq_tick
|
||||
|
||||
extern void arch_set_max_freq_ratio(bool turbo_disabled);
|
||||
#else
|
||||
static inline void arch_set_max_freq_ratio(bool turbo_disabled)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* _ASM_X86_TOPOLOGY_H */
|
||||
|
@ -147,6 +147,8 @@ static inline void smpboot_restore_warm_reset_vector(void)
|
||||
*((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
|
||||
}
|
||||
|
||||
static void init_freq_invariance(void);
|
||||
|
||||
/*
|
||||
* Report back to the Boot Processor during boot time or to the caller processor
|
||||
* during CPU online.
|
||||
@ -183,6 +185,8 @@ static void smp_callin(void)
|
||||
*/
|
||||
set_cpu_sibling_map(raw_smp_processor_id());
|
||||
|
||||
init_freq_invariance();
|
||||
|
||||
/*
|
||||
* Get our bogomips.
|
||||
* Update loops_per_jiffy in cpu_data. Previous call to
|
||||
@ -1337,7 +1341,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
|
||||
set_sched_topology(x86_topology);
|
||||
|
||||
set_cpu_sibling_map(0);
|
||||
|
||||
init_freq_invariance();
|
||||
smp_sanity_check();
|
||||
|
||||
switch (apic_intr_mode) {
|
||||
@ -1764,3 +1768,287 @@ void native_play_dead(void)
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
* APERF/MPERF frequency ratio computation.
|
||||
*
|
||||
* The scheduler wants to do frequency invariant accounting and needs a <1
|
||||
* ratio to account for the 'current' frequency, corresponding to
|
||||
* freq_curr / freq_max.
|
||||
*
|
||||
* Since the frequency freq_curr on x86 is controlled by a micro-controller and
|
||||
* our P-state setting is little more than a request/hint, we need to observe
|
||||
* the effective frequency 'BusyMHz', i.e. the average frequency over a time
|
||||
* interval after discarding idle time. This is given by:
|
||||
*
|
||||
* BusyMHz = delta_APERF / delta_MPERF * freq_base
|
||||
*
|
||||
* where freq_base is the max non-turbo P-state.
|
||||
*
|
||||
* The freq_max term has to be set to a somewhat arbitrary value, because we
|
||||
* can't know which turbo states will be available at a given point in time:
|
||||
* it all depends on the thermal headroom of the entire package. We set it to
|
||||
* the turbo level with 4 cores active.
|
||||
*
|
||||
* Benchmarks show that's a good compromise between the 1C turbo ratio
|
||||
* (freq_curr/freq_max would rarely reach 1) and something close to freq_base,
|
||||
* which would ignore the entire turbo range (a conspicuous part, making
|
||||
* freq_curr/freq_max always maxed out).
|
||||
*
|
||||
* An exception to the heuristic above is the Atom uarch, where we choose the
|
||||
* highest turbo level for freq_max since Atoms are generally oriented towards
|
||||
* power efficiency.
|
||||
*
|
||||
* Setting freq_max to anything less than the 1C turbo ratio makes the ratio
|
||||
* freq_curr / freq_max to eventually grow >1, in which case we clip it to 1.
|
||||
*/
|
||||
|
||||
DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key);
|
||||
|
||||
static DEFINE_PER_CPU(u64, arch_prev_aperf);
|
||||
static DEFINE_PER_CPU(u64, arch_prev_mperf);
|
||||
static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE;
|
||||
static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE;
|
||||
|
||||
/*
 * Pick the ratio that arch_scale_freq_tick() normalizes against:
 * SCHED_CAPACITY_SCALE when @turbo_disabled is true, the cached
 * arch_turbo_freq_ratio otherwise.
 */
void arch_set_max_freq_ratio(bool turbo_disabled)
{
	if (turbo_disabled)
		arch_max_freq_ratio = SCHED_CAPACITY_SCALE;
	else
		arch_max_freq_ratio = arch_turbo_freq_ratio;
}
|
||||
|
||||
static bool turbo_disabled(void)
|
||||
{
|
||||
u64 misc_en;
|
||||
int err;
|
||||
|
||||
err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en);
|
||||
if (err)
|
||||
return false;
|
||||
|
||||
return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
|
||||
}
|
||||
|
||||
/*
 * Fetch base and turbo ratios from the Atom core-ratio MSRs.
 *
 * @base_freq:  receives bits 21:16 of MSR_ATOM_CORE_RATIOS
 * @turbo_freq: receives bits 5:0 of MSR_ATOM_CORE_TURBO_RATIOS
 *
 * Returns true when both MSRs could be read, false as soon as one read
 * fails (the second MSR is not touched if the first read fails).
 */
static bool slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
	if (rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq) ||
	    rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq))
		return false;

	/* max P state */
	*base_freq = (*base_freq >> 16) & 0x3F;
	/* 1C turbo */
	*turbo_freq &= 0x3F;

	return true;
}
|
||||
|
||||
#include <asm/cpu_device_id.h>
|
||||
#include <asm/intel-family.h>
|
||||
|
||||
#define ICPU(model) \
|
||||
{X86_VENDOR_INTEL, 6, model, X86_FEATURE_APERFMPERF, 0}
|
||||
|
||||
static const struct x86_cpu_id has_knl_turbo_ratio_limits[] = {
|
||||
ICPU(INTEL_FAM6_XEON_PHI_KNL),
|
||||
ICPU(INTEL_FAM6_XEON_PHI_KNM),
|
||||
{}
|
||||
};
|
||||
|
||||
static const struct x86_cpu_id has_skx_turbo_ratio_limits[] = {
|
||||
ICPU(INTEL_FAM6_SKYLAKE_X),
|
||||
{}
|
||||
};
|
||||
|
||||
static const struct x86_cpu_id has_glm_turbo_ratio_limits[] = {
|
||||
ICPU(INTEL_FAM6_ATOM_GOLDMONT),
|
||||
ICPU(INTEL_FAM6_ATOM_GOLDMONT_D),
|
||||
ICPU(INTEL_FAM6_ATOM_GOLDMONT_PLUS),
|
||||
{}
|
||||
};
|
||||
|
||||
/*
 * Derive base and turbo ratios on CPUs listed in has_knl_turbo_ratio_limits.
 *
 * @base_freq:        receives bits 15:8 of MSR_PLATFORM_INFO (max P state)
 * @turbo_freq:       receives the selected turbo ratio
 * @num_delta_fratio: how many non-zero ratio deltas to subtract from the
 *                    starting ratio before stopping
 *
 * MSR_TURBO_RATIO_LIMIT is decoded as: starting ratio in bits 15:8, then
 * one byte per group from bit 16 upwards whose top three bits hold the
 * ratio delta relative to the previous group.
 *
 * Returns false if the CPU does not match or an MSR read fails, true
 * otherwise. On a true return *turbo_freq is always written: if the MSR
 * holds fewer than @num_delta_fratio non-zero deltas (or the last one sits
 * in the final byte), the lowest ratio reached is reported instead of
 * leaving *turbo_freq uninitialized for the caller.
 */
static bool knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
				int num_delta_fratio)
{
	int fratio, delta_fratio, found;
	int err, i;
	u64 msr;

	if (!x86_match_cpu(has_knl_turbo_ratio_limits))
		return false;

	err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
	if (err)
		return false;

	*base_freq = (*base_freq >> 8) & 0xFF;	    /* max P state */

	err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
	if (err)
		return false;

	fratio = (msr >> 8) & 0xFF;
	found = 0;

	/* Walk the per-group bytes until enough deltas have been applied. */
	for (i = 16; i < 64 && found < num_delta_fratio; i += 8) {
		delta_fratio = (msr >> (i + 5)) & 0x7;

		if (delta_fratio) {
			found += 1;
			fratio -= delta_fratio;
		}
	}

	/* Written on every successful exit, including loop exhaustion. */
	*turbo_freq = fratio;

	return true;
}
|
||||
|
||||
/*
 * Derive base and turbo ratios from MSR_TURBO_RATIO_LIMIT{,1}.
 *
 * @base_freq:  receives bits 15:8 of MSR_PLATFORM_INFO (max P state)
 * @turbo_freq: receives the ratio byte of the first group whose count
 *              byte in MSR_TURBO_RATIO_LIMIT1 is at least @size
 * @size:       minimum group size to look for
 *
 * Returns true when such a group is found; false if any MSR read fails
 * or no group is large enough.
 */
static bool skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size)
{
	u64 ratios, counts;
	int shift;

	if (rdmsrl_safe(MSR_PLATFORM_INFO, base_freq))
		return false;

	/* max P state */
	*base_freq = (*base_freq >> 8) & 0xFF;

	if (rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios))
		return false;

	if (rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts))
		return false;

	/* One byte per group: same byte position in both MSRs. */
	for (shift = 0; shift < 64; shift += 8) {
		u32 group_size = (counts >> shift) & 0xFF;

		if (group_size < size)
			continue;

		*turbo_freq = (ratios >> shift) & 0xFF;
		return true;
	}

	return false;
}
|
||||
|
||||
/*
 * Generic fallback: base ratio from MSR_PLATFORM_INFO, turbo ratio from
 * MSR_TURBO_RATIO_LIMIT.
 *
 * @base_freq:  receives bits 15:8 of MSR_PLATFORM_INFO (max P state)
 * @turbo_freq: receives bits 31:24 of MSR_TURBO_RATIO_LIMIT (4C turbo)
 *
 * Returns true when both MSRs could be read, false otherwise.
 */
static bool core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
	if (rdmsrl_safe(MSR_PLATFORM_INFO, base_freq) ||
	    rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, turbo_freq))
		return false;

	/* max P state */
	*base_freq = (*base_freq >> 8) & 0xFF;
	/* 4C turbo */
	*turbo_freq = (*turbo_freq >> 24) & 0xFF;

	return true;
}
|
||||
|
||||
static bool intel_set_max_freq_ratio(void)
|
||||
{
|
||||
u64 base_freq, turbo_freq;
|
||||
|
||||
if (slv_set_max_freq_ratio(&base_freq, &turbo_freq))
|
||||
goto out;
|
||||
|
||||
if (x86_match_cpu(has_glm_turbo_ratio_limits) &&
|
||||
skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
|
||||
goto out;
|
||||
|
||||
if (knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
|
||||
goto out;
|
||||
|
||||
if (x86_match_cpu(has_skx_turbo_ratio_limits) &&
|
||||
skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4))
|
||||
goto out;
|
||||
|
||||
if (core_set_max_freq_ratio(&base_freq, &turbo_freq))
|
||||
goto out;
|
||||
|
||||
return false;
|
||||
|
||||
out:
|
||||
arch_turbo_freq_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE,
|
||||
base_freq);
|
||||
arch_set_max_freq_ratio(turbo_disabled());
|
||||
return true;
|
||||
}
|
||||
|
||||
static void init_counter_refs(void *arg)
|
||||
{
|
||||
u64 aperf, mperf;
|
||||
|
||||
rdmsrl(MSR_IA32_APERF, aperf);
|
||||
rdmsrl(MSR_IA32_MPERF, mperf);
|
||||
|
||||
this_cpu_write(arch_prev_aperf, aperf);
|
||||
this_cpu_write(arch_prev_mperf, mperf);
|
||||
}
|
||||
|
||||
static void init_freq_invariance(void)
|
||||
{
|
||||
bool ret = false;
|
||||
|
||||
if (smp_processor_id() != 0 || !boot_cpu_has(X86_FEATURE_APERFMPERF))
|
||||
return;
|
||||
|
||||
if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
|
||||
ret = intel_set_max_freq_ratio();
|
||||
|
||||
if (ret) {
|
||||
on_each_cpu(init_counter_refs, NULL, 1);
|
||||
static_branch_enable(&arch_scale_freq_key);
|
||||
} else {
|
||||
pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n");
|
||||
}
|
||||
}
|
||||
|
||||
DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
|
||||
|
||||
void arch_scale_freq_tick(void)
|
||||
{
|
||||
u64 freq_scale;
|
||||
u64 aperf, mperf;
|
||||
u64 acnt, mcnt;
|
||||
|
||||
if (!arch_scale_freq_invariant())
|
||||
return;
|
||||
|
||||
rdmsrl(MSR_IA32_APERF, aperf);
|
||||
rdmsrl(MSR_IA32_MPERF, mperf);
|
||||
|
||||
acnt = aperf - this_cpu_read(arch_prev_aperf);
|
||||
mcnt = mperf - this_cpu_read(arch_prev_mperf);
|
||||
if (!mcnt)
|
||||
return;
|
||||
|
||||
this_cpu_write(arch_prev_aperf, aperf);
|
||||
this_cpu_write(arch_prev_mperf, mperf);
|
||||
|
||||
acnt <<= 2*SCHED_CAPACITY_SHIFT;
|
||||
mcnt *= arch_max_freq_ratio;
|
||||
|
||||
freq_scale = div64_u64(acnt, mcnt);
|
||||
|
||||
if (freq_scale > SCHED_CAPACITY_SCALE)
|
||||
freq_scale = SCHED_CAPACITY_SCALE;
|
||||
|
||||
this_cpu_write(arch_freq_scale, freq_scale);
|
||||
}
|
||||
|
@ -922,6 +922,7 @@ static void intel_pstate_update_limits(unsigned int cpu)
|
||||
*/
|
||||
if (global.turbo_disabled_mf != global.turbo_disabled) {
|
||||
global.turbo_disabled_mf = global.turbo_disabled;
|
||||
arch_set_max_freq_ratio(global.turbo_disabled);
|
||||
for_each_possible_cpu(cpu)
|
||||
intel_pstate_update_max_freq(cpu);
|
||||
} else {
|
||||
|
@ -431,6 +431,10 @@ static int cpufreq_set_cur_state(struct thermal_cooling_device *cdev,
|
||||
unsigned long state)
|
||||
{
|
||||
struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata;
|
||||
struct cpumask *cpus;
|
||||
unsigned int frequency;
|
||||
unsigned long max_capacity, capacity;
|
||||
int ret;
|
||||
|
||||
/* Request state should be less than max_level */
|
||||
if (WARN_ON(state > cpufreq_cdev->max_level))
|
||||
@ -442,8 +446,19 @@ static int cpufreq_set_cur_state(struct thermal_cooling_device *cdev,
|
||||
|
||||
cpufreq_cdev->cpufreq_state = state;
|
||||
|
||||
return freq_qos_update_request(&cpufreq_cdev->qos_req,
|
||||
get_state_freq(cpufreq_cdev, state));
|
||||
frequency = get_state_freq(cpufreq_cdev, state);
|
||||
|
||||
ret = freq_qos_update_request(&cpufreq_cdev->qos_req, frequency);
|
||||
|
||||
if (ret > 0) {
|
||||
cpus = cpufreq_cdev->policy->cpus;
|
||||
max_capacity = arch_scale_cpu_capacity(cpumask_first(cpus));
|
||||
capacity = frequency * max_capacity;
|
||||
capacity /= cpufreq_cdev->policy->cpuinfo.max_freq;
|
||||
arch_set_thermal_pressure(cpus, max_capacity - capacity);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Bind cpufreq callbacks to thermal cooling device ops */
|
||||
|
@ -30,6 +30,16 @@ static inline unsigned long topology_get_freq_scale(int cpu)
|
||||
return per_cpu(freq_scale, cpu);
|
||||
}
|
||||
|
||||
DECLARE_PER_CPU(unsigned long, thermal_pressure);
|
||||
|
||||
static inline unsigned long topology_get_thermal_pressure(int cpu)
|
||||
{
|
||||
return per_cpu(thermal_pressure, cpu);
|
||||
}
|
||||
|
||||
void arch_set_thermal_pressure(struct cpumask *cpus,
|
||||
unsigned long th_pressure);
|
||||
|
||||
struct cpu_topology {
|
||||
int thread_id;
|
||||
int core_id;
|
||||
|
@ -194,6 +194,11 @@ static inline unsigned int cpumask_local_spread(unsigned int i, int node)
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Variant used in this configuration branch: there is nothing to
 * distribute over, so simply return the first CPU set in both
 * @src1p and @src2p, as found by cpumask_next_and() starting from -1.
 */
static inline int cpumask_any_and_distribute(const struct cpumask *src1p,
					     const struct cpumask *src2p)
{
	int first = cpumask_next_and(-1, src1p, src2p);

	return first;
}
|
||||
|
||||
#define for_each_cpu(cpu, mask) \
|
||||
for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
|
||||
#define for_each_cpu_not(cpu, mask) \
|
||||
@ -245,6 +250,8 @@ static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
|
||||
int cpumask_next_and(int n, const struct cpumask *, const struct cpumask *);
|
||||
int cpumask_any_but(const struct cpumask *mask, unsigned int cpu);
|
||||
unsigned int cpumask_local_spread(unsigned int i, int node);
|
||||
int cpumask_any_and_distribute(const struct cpumask *src1p,
|
||||
const struct cpumask *src2p);
|
||||
|
||||
/**
|
||||
* for_each_cpu - iterate over every cpu in a mask
|
||||
|
@ -257,6 +257,13 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset);
|
||||
|
||||
#define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0)
|
||||
|
||||
#ifndef CONFIG_PREEMPT_RT
|
||||
# define cant_migrate() cant_sleep()
|
||||
#else
|
||||
/* Placeholder for now */
|
||||
# define cant_migrate() do { } while (0)
|
||||
#endif
|
||||
|
||||
/**
|
||||
* abs - return absolute value of an argument
|
||||
* @x: the value. If it is unsigned type, it is converted to signed type first.
|
||||
|
@ -322,4 +322,34 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier,
|
||||
|
||||
#endif
|
||||
|
||||
/**
|
||||
* migrate_disable - Prevent migration of the current task
|
||||
*
|
||||
* Maps to preempt_disable() which also disables preemption. Use
|
||||
* migrate_disable() to annotate that the intent is to prevent migration,
|
||||
* but not necessarily preemption.
|
||||
*
|
||||
* Can be invoked nested like preempt_disable() and needs the corresponding
|
||||
* number of migrate_enable() invocations.
|
||||
*/
|
||||
static __always_inline void migrate_disable(void)
|
||||
{
|
||||
preempt_disable();
|
||||
}
|
||||
|
||||
/**
|
||||
* migrate_enable - Allow migration of the current task
|
||||
*
|
||||
* Counterpart to migrate_disable().
|
||||
*
|
||||
* As migrate_disable() can be invoked nested, only the outermost invocation
|
||||
* reenables migration.
|
||||
*
|
||||
* Currently mapped to preempt_enable().
|
||||
*/
|
||||
static __always_inline void migrate_enable(void)
|
||||
{
|
||||
preempt_enable();
|
||||
}
|
||||
|
||||
#endif /* __LINUX_PREEMPT_H */
|
||||
|
@ -17,6 +17,8 @@ extern struct psi_group psi_system;
|
||||
void psi_init(void);
|
||||
|
||||
void psi_task_change(struct task_struct *task, int clear, int set);
|
||||
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
|
||||
bool sleep);
|
||||
|
||||
void psi_memstall_tick(struct task_struct *task, int cpu);
|
||||
void psi_memstall_enter(unsigned long *flags);
|
||||
|
@ -14,13 +14,21 @@ enum psi_task_count {
|
||||
NR_IOWAIT,
|
||||
NR_MEMSTALL,
|
||||
NR_RUNNING,
|
||||
NR_PSI_TASK_COUNTS = 3,
|
||||
/*
|
||||
* This can't have values other than 0 or 1 and could be
|
||||
* implemented as a bit flag. But for now we still have room
|
||||
* in the first cacheline of psi_group_cpu, and this way we
|
||||
* don't have to special case any state tracking for it.
|
||||
*/
|
||||
NR_ONCPU,
|
||||
NR_PSI_TASK_COUNTS = 4,
|
||||
};
|
||||
|
||||
/* Task state bitmasks */
|
||||
#define TSK_IOWAIT (1 << NR_IOWAIT)
|
||||
#define TSK_MEMSTALL (1 << NR_MEMSTALL)
|
||||
#define TSK_RUNNING (1 << NR_RUNNING)
|
||||
#define TSK_ONCPU (1 << NR_ONCPU)
|
||||
|
||||
/* Resources that workloads could be stalled on */
|
||||
enum psi_res {
|
||||
|
@ -356,28 +356,30 @@ struct util_est {
|
||||
} __attribute__((__aligned__(sizeof(u64))));
|
||||
|
||||
/*
|
||||
* The load_avg/util_avg accumulates an infinite geometric series
|
||||
* (see __update_load_avg() in kernel/sched/fair.c).
|
||||
* The load/runnable/util_avg accumulates an infinite geometric series
|
||||
* (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c).
|
||||
*
|
||||
* [load_avg definition]
|
||||
*
|
||||
* load_avg = runnable% * scale_load_down(load)
|
||||
*
|
||||
* where runnable% is the time ratio that a sched_entity is runnable.
|
||||
* For cfs_rq, it is the aggregated load_avg of all runnable and
|
||||
* blocked sched_entities.
|
||||
* [runnable_avg definition]
|
||||
*
|
||||
* runnable_avg = runnable% * SCHED_CAPACITY_SCALE
|
||||
*
|
||||
* [util_avg definition]
|
||||
*
|
||||
* util_avg = running% * SCHED_CAPACITY_SCALE
|
||||
*
|
||||
* where running% is the time ratio that a sched_entity is running on
|
||||
* a CPU. For cfs_rq, it is the aggregated util_avg of all runnable
|
||||
* and blocked sched_entities.
|
||||
* where runnable% is the time ratio that a sched_entity is runnable and
|
||||
* running% the time ratio that a sched_entity is running.
|
||||
*
|
||||
* load_avg and util_avg don't direcly factor frequency scaling and CPU
|
||||
* capacity scaling. The scaling is done through the rq_clock_pelt that
|
||||
* is used for computing those signals (see update_rq_clock_pelt())
|
||||
* For cfs_rq, they are the aggregated values of all runnable and blocked
|
||||
* sched_entities.
|
||||
*
|
||||
* The load/runnable/util_avg signals don't directly factor frequency scaling and CPU
|
||||
* capacity scaling. The scaling is done through the rq_clock_pelt that is used
|
||||
* for computing those signals (see update_rq_clock_pelt())
|
||||
*
|
||||
* N.B., the above ratios (runnable% and running%) themselves are in the
|
||||
* range of [0, 1]. To do fixed point arithmetics, we therefore scale them
|
||||
@ -401,11 +403,11 @@ struct util_est {
|
||||
struct sched_avg {
|
||||
u64 last_update_time;
|
||||
u64 load_sum;
|
||||
u64 runnable_load_sum;
|
||||
u64 runnable_sum;
|
||||
u32 util_sum;
|
||||
u32 period_contrib;
|
||||
unsigned long load_avg;
|
||||
unsigned long runnable_load_avg;
|
||||
unsigned long runnable_avg;
|
||||
unsigned long util_avg;
|
||||
struct util_est util_est;
|
||||
} ____cacheline_aligned;
|
||||
@ -449,7 +451,6 @@ struct sched_statistics {
|
||||
struct sched_entity {
|
||||
/* For load-balancing: */
|
||||
struct load_weight load;
|
||||
unsigned long runnable_weight;
|
||||
struct rb_node run_node;
|
||||
struct list_head group_node;
|
||||
unsigned int on_rq;
|
||||
@ -470,6 +471,8 @@ struct sched_entity {
|
||||
struct cfs_rq *cfs_rq;
|
||||
/* rq "owned" by this entity/group: */
|
||||
struct cfs_rq *my_q;
|
||||
/* cached value of my_q->h_nr_running */
|
||||
unsigned long runnable_weight;
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
@ -782,9 +785,12 @@ struct task_struct {
|
||||
unsigned frozen:1;
|
||||
#endif
|
||||
#ifdef CONFIG_BLK_CGROUP
|
||||
/* to be used once the psi infrastructure lands upstream. */
|
||||
unsigned use_memdelay:1;
|
||||
#endif
|
||||
#ifdef CONFIG_PSI
|
||||
/* Stalled due to lack of memory */
|
||||
unsigned in_memstall:1;
|
||||
#endif
|
||||
|
||||
unsigned long atomic_flags; /* Flags requiring atomic access. */
|
||||
|
||||
@ -1479,7 +1485,6 @@ extern struct pid *cad_pid;
|
||||
#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
|
||||
#define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */
|
||||
#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
|
||||
#define PF_MEMSTALL 0x01000000 /* Stalled due to lack of memory */
|
||||
#define PF_UMH 0x02000000 /* I'm an Usermodehelper process */
|
||||
#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */
|
||||
#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
|
||||
|
@ -225,6 +225,14 @@ unsigned long arch_scale_cpu_capacity(int cpu)
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef arch_scale_thermal_pressure
|
||||
static __always_inline
|
||||
unsigned long arch_scale_thermal_pressure(int cpu)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline int task_node(const struct task_struct *p)
|
||||
{
|
||||
return cpu_to_node(task_cpu(p));
|
||||
|
@ -29,7 +29,7 @@
|
||||
|
||||
/*
|
||||
* A maximum of 4 million PIDs should be enough for a while.
|
||||
* [NOTE: PID/TIDs are limited to 2^29 ~= 500+ million, see futex.h.]
|
||||
* [NOTE: PID/TIDs are limited to 2^30 ~= 1 billion, see FUTEX_TID_MASK.]
|
||||
*/
|
||||
#define PID_MAX_LIMIT (CONFIG_BASE_SMALL ? PAGE_SIZE * 8 : \
|
||||
(sizeof(long) > 4 ? 4 * 1024 * 1024 : PID_MAX_DEFAULT))
|
||||
|
@ -487,7 +487,11 @@ TRACE_EVENT(sched_process_hang,
|
||||
);
|
||||
#endif /* CONFIG_DETECT_HUNG_TASK */
|
||||
|
||||
DECLARE_EVENT_CLASS(sched_move_task_template,
|
||||
/*
|
||||
* Tracks migration of tasks from one runqueue to another. Can be used to
|
||||
* detect if automatic NUMA balancing is bouncing between nodes.
|
||||
*/
|
||||
TRACE_EVENT(sched_move_numa,
|
||||
|
||||
TP_PROTO(struct task_struct *tsk, int src_cpu, int dst_cpu),
|
||||
|
||||
@ -519,23 +523,7 @@ DECLARE_EVENT_CLASS(sched_move_task_template,
|
||||
__entry->dst_cpu, __entry->dst_nid)
|
||||
);
|
||||
|
||||
/*
|
||||
* Tracks migration of tasks from one runqueue to another. Can be used to
|
||||
* detect if automatic NUMA balancing is bouncing between nodes
|
||||
*/
|
||||
DEFINE_EVENT(sched_move_task_template, sched_move_numa,
|
||||
TP_PROTO(struct task_struct *tsk, int src_cpu, int dst_cpu),
|
||||
|
||||
TP_ARGS(tsk, src_cpu, dst_cpu)
|
||||
);
|
||||
|
||||
DEFINE_EVENT(sched_move_task_template, sched_stick_numa,
|
||||
TP_PROTO(struct task_struct *tsk, int src_cpu, int dst_cpu),
|
||||
|
||||
TP_ARGS(tsk, src_cpu, dst_cpu)
|
||||
);
|
||||
|
||||
TRACE_EVENT(sched_swap_numa,
|
||||
DECLARE_EVENT_CLASS(sched_numa_pair_template,
|
||||
|
||||
TP_PROTO(struct task_struct *src_tsk, int src_cpu,
|
||||
struct task_struct *dst_tsk, int dst_cpu),
|
||||
@ -561,11 +549,11 @@ TRACE_EVENT(sched_swap_numa,
|
||||
__entry->src_ngid = task_numa_group_id(src_tsk);
|
||||
__entry->src_cpu = src_cpu;
|
||||
__entry->src_nid = cpu_to_node(src_cpu);
|
||||
__entry->dst_pid = task_pid_nr(dst_tsk);
|
||||
__entry->dst_tgid = task_tgid_nr(dst_tsk);
|
||||
__entry->dst_ngid = task_numa_group_id(dst_tsk);
|
||||
__entry->dst_pid = dst_tsk ? task_pid_nr(dst_tsk) : 0;
|
||||
__entry->dst_tgid = dst_tsk ? task_tgid_nr(dst_tsk) : 0;
|
||||
__entry->dst_ngid = dst_tsk ? task_numa_group_id(dst_tsk) : 0;
|
||||
__entry->dst_cpu = dst_cpu;
|
||||
__entry->dst_nid = cpu_to_node(dst_cpu);
|
||||
__entry->dst_nid = dst_cpu >= 0 ? cpu_to_node(dst_cpu) : -1;
|
||||
),
|
||||
|
||||
TP_printk("src_pid=%d src_tgid=%d src_ngid=%d src_cpu=%d src_nid=%d dst_pid=%d dst_tgid=%d dst_ngid=%d dst_cpu=%d dst_nid=%d",
|
||||
@ -575,6 +563,23 @@ TRACE_EVENT(sched_swap_numa,
|
||||
__entry->dst_cpu, __entry->dst_nid)
|
||||
);
|
||||
|
||||
DEFINE_EVENT(sched_numa_pair_template, sched_stick_numa,
|
||||
|
||||
TP_PROTO(struct task_struct *src_tsk, int src_cpu,
|
||||
struct task_struct *dst_tsk, int dst_cpu),
|
||||
|
||||
TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu)
|
||||
);
|
||||
|
||||
DEFINE_EVENT(sched_numa_pair_template, sched_swap_numa,
|
||||
|
||||
TP_PROTO(struct task_struct *src_tsk, int src_cpu,
|
||||
struct task_struct *dst_tsk, int dst_cpu),
|
||||
|
||||
TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu)
|
||||
);
|
||||
|
||||
|
||||
/*
|
||||
* Tracepoint for waking a polling cpu without an IPI.
|
||||
*/
|
||||
@ -613,6 +618,10 @@ DECLARE_TRACE(pelt_dl_tp,
|
||||
TP_PROTO(struct rq *rq),
|
||||
TP_ARGS(rq));
|
||||
|
||||
DECLARE_TRACE(pelt_thermal_tp,
|
||||
TP_PROTO(struct rq *rq),
|
||||
TP_ARGS(rq));
|
||||
|
||||
DECLARE_TRACE(pelt_irq_tp,
|
||||
TP_PROTO(struct rq *rq),
|
||||
TP_ARGS(rq));
|
||||
|
@ -451,6 +451,10 @@ config HAVE_SCHED_AVG_IRQ
|
||||
depends on IRQ_TIME_ACCOUNTING || PARAVIRT_TIME_ACCOUNTING
|
||||
depends on SMP
|
||||
|
||||
config SCHED_THERMAL_PRESSURE
|
||||
bool "Enable periodic averaging of thermal pressure"
|
||||
depends on SMP
|
||||
|
||||
config BSD_PROCESS_ACCT
|
||||
bool "BSD Process Accounting"
|
||||
depends on MULTIUSER
|
||||
|
@ -199,8 +199,15 @@ static void __kthread_parkme(struct kthread *self)
|
||||
if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags))
|
||||
break;
|
||||
|
||||
/*
|
||||
* Thread is going to call schedule(), do not preempt it,
|
||||
* or the caller of kthread_park() may spend more time in
|
||||
* wait_task_inactive().
|
||||
*/
|
||||
preempt_disable();
|
||||
complete(&self->parked);
|
||||
schedule();
|
||||
schedule_preempt_disabled();
|
||||
preempt_enable();
|
||||
}
|
||||
__set_current_state(TASK_RUNNING);
|
||||
}
|
||||
@ -245,8 +252,14 @@ static int kthread(void *_create)
|
||||
/* OK, tell user we're spawned, wait for stop or wakeup */
|
||||
__set_current_state(TASK_UNINTERRUPTIBLE);
|
||||
create->result = current;
|
||||
/*
|
||||
* Thread is going to call schedule(), do not preempt it,
|
||||
* or the creator may spend more time in wait_task_inactive().
|
||||
*/
|
||||
preempt_disable();
|
||||
complete(done);
|
||||
schedule();
|
||||
schedule_preempt_disabled();
|
||||
preempt_enable();
|
||||
|
||||
ret = -EINTR;
|
||||
if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) {
|
||||
|
@ -761,7 +761,6 @@ static void set_load_weight(struct task_struct *p, bool update_load)
|
||||
if (task_has_idle_policy(p)) {
|
||||
load->weight = scale_load(WEIGHT_IDLEPRIO);
|
||||
load->inv_weight = WMULT_IDLEPRIO;
|
||||
p->se.runnable_weight = load->weight;
|
||||
return;
|
||||
}
|
||||
|
||||
@ -774,7 +773,6 @@ static void set_load_weight(struct task_struct *p, bool update_load)
|
||||
} else {
|
||||
load->weight = scale_load(sched_prio_to_weight[prio]);
|
||||
load->inv_weight = sched_prio_to_wmult[prio];
|
||||
p->se.runnable_weight = load->weight;
|
||||
}
|
||||
}
|
||||
|
||||
@ -1652,7 +1650,12 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
|
||||
if (cpumask_equal(p->cpus_ptr, new_mask))
|
||||
goto out;
|
||||
|
||||
dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
|
||||
/*
|
||||
* Picking a ~random cpu helps in cases where we are changing affinity
|
||||
* for groups of tasks (i.e. cpuset), so that load balancing is not
|
||||
* immediately required to distribute the tasks within their new mask.
|
||||
*/
|
||||
dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask);
|
||||
if (dest_cpu >= nr_cpu_ids) {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
@ -3578,6 +3581,17 @@ unsigned long long task_sched_runtime(struct task_struct *p)
|
||||
return ns;
|
||||
}
|
||||
|
||||
DEFINE_PER_CPU(unsigned long, thermal_pressure);
|
||||
|
||||
/*
 * arch_set_thermal_pressure - record a thermal pressure value for a set of CPUs
 * @cpus:        mask of the CPUs whose per-CPU value is updated
 * @th_pressure: value stored into each CPU's thermal_pressure variable
 *
 * Writes @th_pressure into the per-CPU thermal_pressure variable of every
 * CPU set in @cpus. No locking is taken here; the store uses WRITE_ONCE()
 * (presumably so lockless readers see a whole value -- confirm against the
 * reader side).
 */
void arch_set_thermal_pressure(struct cpumask *cpus,
|
||||
unsigned long th_pressure)
|
||||
{
|
||||
int cpu;
|
||||
|
||||
for_each_cpu(cpu, cpus)
|
||||
WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure);
|
||||
}
|
||||
|
||||
/*
|
||||
* This function gets called by the timer code, with HZ frequency.
|
||||
* We call it with interrupts disabled.
|
||||
@ -3588,12 +3602,16 @@ void scheduler_tick(void)
|
||||
struct rq *rq = cpu_rq(cpu);
|
||||
struct task_struct *curr = rq->curr;
|
||||
struct rq_flags rf;
|
||||
unsigned long thermal_pressure;
|
||||
|
||||
arch_scale_freq_tick();
|
||||
sched_clock_tick();
|
||||
|
||||
rq_lock(rq, &rf);
|
||||
|
||||
update_rq_clock(rq);
|
||||
thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
|
||||
update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
|
||||
curr->sched_class->task_tick(rq, curr, 0);
|
||||
calc_global_load_tick(rq);
|
||||
psi_task_tick(rq);
|
||||
@ -3671,7 +3689,6 @@ static void sched_tick_remote(struct work_struct *work)
|
||||
if (cpu_is_offline(cpu))
|
||||
goto out_unlock;
|
||||
|
||||
curr = rq->curr;
|
||||
update_rq_clock(rq);
|
||||
|
||||
if (!is_idle_task(curr)) {
|
||||
@ -4074,6 +4091,8 @@ static void __sched notrace __schedule(bool preempt)
|
||||
*/
|
||||
++*switch_count;
|
||||
|
||||
psi_sched_switch(prev, next, !task_on_rq_queued(prev));
|
||||
|
||||
trace_sched_switch(preempt, prev, next);
|
||||
|
||||
/* Also unlocks the rq: */
|
||||
|
@ -41,8 +41,67 @@ static int convert_prio(int prio)
|
||||
return cpupri;
|
||||
}
|
||||
|
||||
/*
 * __cpupri_find - test a single priority level for CPUs the task may use
 * @cp:          the cpupri context
 * @p:           the task
 * @lowest_mask: mask filled with p->cpus_ptr & vec->mask (or NULL)
 * @idx:         index into cp->pri_to_cpu[] of the level to examine
 *
 * Return: 1 if the vector at @idx has a CPU that is also in p->cpus_ptr
 * (and, when @lowest_mask is given, the computed intersection is still
 * non-empty); 0 if the vector's count read as zero, if there is no
 * overlapping CPU, or if the intersection was emptied concurrently.
 */
static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p,
|
||||
struct cpumask *lowest_mask, int idx)
|
||||
{
|
||||
struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
|
||||
int skip = 0;
|
||||
|
||||
if (!atomic_read(&(vec)->count))
|
||||
skip = 1;
|
||||
/*
|
||||
* When looking at the vector, we need to read the counter,
|
||||
* do a memory barrier, then read the mask.
|
||||
*
|
||||
* Note: This is still all racy, but we can deal with it.
|
||||
* Ideally, we only want to look at masks that are set.
|
||||
*
|
||||
* If a mask is not set, then the only thing wrong is that we
|
||||
* did a little more work than necessary.
|
||||
*
|
||||
* If we read a zero count but the mask is set, because of the
|
||||
* memory barriers, that can only happen when the highest prio
|
||||
* task for a run queue has left the run queue, in which case,
|
||||
* it will be followed by a pull. If the task we are processing
|
||||
* fails to find a proper place to go, that pull request will
|
||||
* pull this task if the run queue is running at a lower
|
||||
* priority.
|
||||
*/
|
||||
smp_rmb();
|
||||
|
||||
/* Need to do the rmb for every iteration */
|
||||
if (skip)
|
||||
return 0;
|
||||
|
||||
if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids)
|
||||
return 0;
|
||||
|
||||
if (lowest_mask) {
|
||||
cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);
|
||||
|
||||
/*
|
||||
* We have to ensure that we have at least one bit
|
||||
* still set in the array, since the map could have
|
||||
* been concurrently emptied between the first and
|
||||
* second reads of vec->mask. If we hit this
|
||||
* condition, simply act as though we never hit this
|
||||
* priority level and continue on.
|
||||
*/
|
||||
if (cpumask_empty(lowest_mask))
|
||||
return 0;
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
 * cpupri_find - cpupri_find_fitness() without a fitness callback
 * @cp:          the cpupri context
 * @p:           the task
 * @lowest_mask: a mask to fill in with selected CPUs (or NULL)
 *
 * Forwards to cpupri_find_fitness() with a NULL fitness_fn and returns
 * its result unchanged.
 */
int cpupri_find(struct cpupri *cp, struct task_struct *p,
|
||||
struct cpumask *lowest_mask)
|
||||
{
|
||||
return cpupri_find_fitness(cp, p, lowest_mask, NULL);
|
||||
}
|
||||
|
||||
/**
|
||||
* cpupri_find - find the best (lowest-pri) CPU in the system
|
||||
* cpupri_find_fitness - find the best (lowest-pri) CPU in the system
|
||||
* @cp: The cpupri context
|
||||
* @p: The task
|
||||
* @lowest_mask: A mask to fill in with selected CPUs (or NULL)
|
||||
@ -58,84 +117,59 @@ static int convert_prio(int prio)
|
||||
*
|
||||
* Return: (int)bool - CPUs were found
|
||||
*/
|
||||
int cpupri_find(struct cpupri *cp, struct task_struct *p,
|
||||
int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p,
|
||||
struct cpumask *lowest_mask,
|
||||
bool (*fitness_fn)(struct task_struct *p, int cpu))
|
||||
{
|
||||
int idx = 0;
|
||||
int task_pri = convert_prio(p->prio);
|
||||
int idx, cpu;
|
||||
|
||||
BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES);
|
||||
|
||||
for (idx = 0; idx < task_pri; idx++) {
|
||||
struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
|
||||
int skip = 0;
|
||||
|
||||
if (!atomic_read(&(vec)->count))
|
||||
skip = 1;
|
||||
/*
|
||||
* When looking at the vector, we need to read the counter,
|
||||
* do a memory barrier, then read the mask.
|
||||
*
|
||||
* Note: This is still all racey, but we can deal with it.
|
||||
* Ideally, we only want to look at masks that are set.
|
||||
*
|
||||
* If a mask is not set, then the only thing wrong is that we
|
||||
* did a little more work than necessary.
|
||||
*
|
||||
* If we read a zero count but the mask is set, because of the
|
||||
* memory barriers, that can only happen when the highest prio
|
||||
* task for a run queue has left the run queue, in which case,
|
||||
* it will be followed by a pull. If the task we are processing
|
||||
* fails to find a proper place to go, that pull request will
|
||||
* pull this task if the run queue is running at a lower
|
||||
* priority.
|
||||
*/
|
||||
smp_rmb();
|
||||
|
||||
/* Need to do the rmb for every iteration */
|
||||
if (skip)
|
||||
if (!__cpupri_find(cp, p, lowest_mask, idx))
|
||||
continue;
|
||||
|
||||
if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids)
|
||||
continue;
|
||||
if (!lowest_mask || !fitness_fn)
|
||||
return 1;
|
||||
|
||||
if (lowest_mask) {
|
||||
int cpu;
|
||||
|
||||
cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);
|
||||
|
||||
/*
|
||||
* We have to ensure that we have at least one bit
|
||||
* still set in the array, since the map could have
|
||||
* been concurrently emptied between the first and
|
||||
* second reads of vec->mask. If we hit this
|
||||
* condition, simply act as though we never hit this
|
||||
* priority level and continue on.
|
||||
*/
|
||||
if (cpumask_empty(lowest_mask))
|
||||
continue;
|
||||
|
||||
if (!fitness_fn)
|
||||
return 1;
|
||||
|
||||
/* Ensure the capacity of the CPUs fit the task */
|
||||
for_each_cpu(cpu, lowest_mask) {
|
||||
if (!fitness_fn(p, cpu))
|
||||
cpumask_clear_cpu(cpu, lowest_mask);
|
||||
}
|
||||
|
||||
/*
|
||||
* If no CPU at the current priority can fit the task
|
||||
* continue looking
|
||||
*/
|
||||
if (cpumask_empty(lowest_mask))
|
||||
continue;
|
||||
/* Ensure the capacity of the CPUs fit the task */
|
||||
for_each_cpu(cpu, lowest_mask) {
|
||||
if (!fitness_fn(p, cpu))
|
||||
cpumask_clear_cpu(cpu, lowest_mask);
|
||||
}
|
||||
|
||||
/*
|
||||
* If no CPU at the current priority can fit the task
|
||||
* continue looking
|
||||
*/
|
||||
if (cpumask_empty(lowest_mask))
|
||||
continue;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* If we failed to find a fitting lowest_mask, kick off a new search
|
||||
* but without taking into account any fitness criteria this time.
|
||||
*
|
||||
* This rule favours honouring priority over fitting the task in the
|
||||
* correct CPU (Capacity Awareness being the only user now).
|
||||
* The idea is that if a higher priority task can run, then it should
|
||||
* run even if this ends up being on an unfitting CPU.
|
||||
*
|
||||
* The cost of this trade-off is not entirely clear and will probably
|
||||
* be good for some workloads and bad for others.
|
||||
*
|
||||
* The main idea here is that if some CPUs were overcommitted, we try
|
||||
* to spread which is what the scheduler traditionally did. Sys admins
|
||||
* must do proper RT planning to avoid overloading the system if they
|
||||
* really care.
|
||||
*/
|
||||
if (fitness_fn)
|
||||
return cpupri_find(cp, p, lowest_mask);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -19,8 +19,10 @@ struct cpupri {
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
int cpupri_find(struct cpupri *cp, struct task_struct *p,
|
||||
struct cpumask *lowest_mask,
|
||||
bool (*fitness_fn)(struct task_struct *p, int cpu));
|
||||
struct cpumask *lowest_mask);
|
||||
int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p,
|
||||
struct cpumask *lowest_mask,
|
||||
bool (*fitness_fn)(struct task_struct *p, int cpu));
|
||||
void cpupri_set(struct cpupri *cp, int cpu, int pri);
|
||||
int cpupri_init(struct cpupri *cp);
|
||||
void cpupri_cleanup(struct cpupri *cp);
|
||||
|
@ -909,8 +909,10 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
|
||||
} while (read_seqcount_retry(&vtime->seqcount, seq));
|
||||
}
|
||||
|
||||
static int vtime_state_check(struct vtime *vtime, int cpu)
|
||||
static int vtime_state_fetch(struct vtime *vtime, int cpu)
|
||||
{
|
||||
int state = READ_ONCE(vtime->state);
|
||||
|
||||
/*
|
||||
* We raced against a context switch, fetch the
|
||||
* kcpustat task again.
|
||||
@ -927,10 +929,10 @@ static int vtime_state_check(struct vtime *vtime, int cpu)
|
||||
*
|
||||
* Case 1) is ok but 2) is not. So wait for a safe VTIME state.
|
||||
*/
|
||||
if (vtime->state == VTIME_INACTIVE)
|
||||
if (state == VTIME_INACTIVE)
|
||||
return -EAGAIN;
|
||||
|
||||
return 0;
|
||||
return state;
|
||||
}
|
||||
|
||||
static u64 kcpustat_user_vtime(struct vtime *vtime)
|
||||
@ -949,14 +951,15 @@ static int kcpustat_field_vtime(u64 *cpustat,
|
||||
{
|
||||
struct vtime *vtime = &tsk->vtime;
|
||||
unsigned int seq;
|
||||
int err;
|
||||
|
||||
do {
|
||||
int state;
|
||||
|
||||
seq = read_seqcount_begin(&vtime->seqcount);
|
||||
|
||||
err = vtime_state_check(vtime, cpu);
|
||||
if (err < 0)
|
||||
return err;
|
||||
state = vtime_state_fetch(vtime, cpu);
|
||||
if (state < 0)
|
||||
return state;
|
||||
|
||||
*val = cpustat[usage];
|
||||
|
||||
@ -969,7 +972,7 @@ static int kcpustat_field_vtime(u64 *cpustat,
|
||||
*/
|
||||
switch (usage) {
|
||||
case CPUTIME_SYSTEM:
|
||||
if (vtime->state == VTIME_SYS)
|
||||
if (state == VTIME_SYS)
|
||||
*val += vtime->stime + vtime_delta(vtime);
|
||||
break;
|
||||
case CPUTIME_USER:
|
||||
@ -981,11 +984,11 @@ static int kcpustat_field_vtime(u64 *cpustat,
|
||||
*val += kcpustat_user_vtime(vtime);
|
||||
break;
|
||||
case CPUTIME_GUEST:
|
||||
if (vtime->state == VTIME_GUEST && task_nice(tsk) <= 0)
|
||||
if (state == VTIME_GUEST && task_nice(tsk) <= 0)
|
||||
*val += vtime->gtime + vtime_delta(vtime);
|
||||
break;
|
||||
case CPUTIME_GUEST_NICE:
|
||||
if (vtime->state == VTIME_GUEST && task_nice(tsk) > 0)
|
||||
if (state == VTIME_GUEST && task_nice(tsk) > 0)
|
||||
*val += vtime->gtime + vtime_delta(vtime);
|
||||
break;
|
||||
default:
|
||||
@ -1036,23 +1039,23 @@ static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst,
|
||||
{
|
||||
struct vtime *vtime = &tsk->vtime;
|
||||
unsigned int seq;
|
||||
int err;
|
||||
|
||||
do {
|
||||
u64 *cpustat;
|
||||
u64 delta;
|
||||
int state;
|
||||
|
||||
seq = read_seqcount_begin(&vtime->seqcount);
|
||||
|
||||
err = vtime_state_check(vtime, cpu);
|
||||
if (err < 0)
|
||||
return err;
|
||||
state = vtime_state_fetch(vtime, cpu);
|
||||
if (state < 0)
|
||||
return state;
|
||||
|
||||
*dst = *src;
|
||||
cpustat = dst->cpustat;
|
||||
|
||||
/* Task is sleeping, dead or idle, nothing to add */
|
||||
if (vtime->state < VTIME_SYS)
|
||||
if (state < VTIME_SYS)
|
||||
continue;
|
||||
|
||||
delta = vtime_delta(vtime);
|
||||
@ -1061,15 +1064,15 @@ static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst,
|
||||
* Task runs either in user (including guest) or kernel space,
|
||||
* add pending nohz time to the right place.
|
||||
*/
|
||||
if (vtime->state == VTIME_SYS) {
|
||||
if (state == VTIME_SYS) {
|
||||
cpustat[CPUTIME_SYSTEM] += vtime->stime + delta;
|
||||
} else if (vtime->state == VTIME_USER) {
|
||||
} else if (state == VTIME_USER) {
|
||||
if (task_nice(tsk) > 0)
|
||||
cpustat[CPUTIME_NICE] += vtime->utime + delta;
|
||||
else
|
||||
cpustat[CPUTIME_USER] += vtime->utime + delta;
|
||||
} else {
|
||||
WARN_ON_ONCE(vtime->state != VTIME_GUEST);
|
||||
WARN_ON_ONCE(state != VTIME_GUEST);
|
||||
if (task_nice(tsk) > 0) {
|
||||
cpustat[CPUTIME_GUEST_NICE] += vtime->gtime + delta;
|
||||
cpustat[CPUTIME_NICE] += vtime->gtime + delta;
|
||||
@ -1080,7 +1083,7 @@ static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst,
|
||||
}
|
||||
} while (read_seqcount_retry(&vtime->seqcount, seq));
|
||||
|
||||
return err;
|
||||
return 0;
|
||||
}
|
||||
|
||||
void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu)
|
||||
|
@ -153,7 +153,7 @@ void sub_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
|
||||
__sub_running_bw(dl_se->dl_bw, dl_rq);
|
||||
}
|
||||
|
||||
void dl_change_utilization(struct task_struct *p, u64 new_bw)
|
||||
static void dl_change_utilization(struct task_struct *p, u64 new_bw)
|
||||
{
|
||||
struct rq *rq;
|
||||
|
||||
@ -334,6 +334,8 @@ static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
|
||||
return dl_rq->root.rb_leftmost == &dl_se->rb_node;
|
||||
}
|
||||
|
||||
static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);
|
||||
|
||||
void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
|
||||
{
|
||||
raw_spin_lock_init(&dl_b->dl_runtime_lock);
|
||||
@ -2496,7 +2498,7 @@ int sched_dl_global_validate(void)
|
||||
return ret;
|
||||
}
|
||||
|
||||
void init_dl_rq_bw_ratio(struct dl_rq *dl_rq)
|
||||
static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq)
|
||||
{
|
||||
if (global_rt_runtime() == RUNTIME_INF) {
|
||||
dl_rq->bw_ratio = 1 << RATIO_SHIFT;
|
||||
|
@ -402,11 +402,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
|
||||
}
|
||||
|
||||
P(se->load.weight);
|
||||
P(se->runnable_weight);
|
||||
#ifdef CONFIG_SMP
|
||||
P(se->avg.load_avg);
|
||||
P(se->avg.util_avg);
|
||||
P(se->avg.runnable_load_avg);
|
||||
P(se->avg.runnable_avg);
|
||||
#endif
|
||||
|
||||
#undef PN_SCHEDSTAT
|
||||
@ -524,11 +523,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
|
||||
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
|
||||
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
|
||||
#ifdef CONFIG_SMP
|
||||
SEQ_printf(m, " .%-30s: %ld\n", "runnable_weight", cfs_rq->runnable_weight);
|
||||
SEQ_printf(m, " .%-30s: %lu\n", "load_avg",
|
||||
cfs_rq->avg.load_avg);
|
||||
SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg",
|
||||
cfs_rq->avg.runnable_load_avg);
|
||||
SEQ_printf(m, " .%-30s: %lu\n", "runnable_avg",
|
||||
cfs_rq->avg.runnable_avg);
|
||||
SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
|
||||
cfs_rq->avg.util_avg);
|
||||
SEQ_printf(m, " .%-30s: %u\n", "util_est_enqueued",
|
||||
@ -537,8 +535,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
|
||||
cfs_rq->removed.load_avg);
|
||||
SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg",
|
||||
cfs_rq->removed.util_avg);
|
||||
SEQ_printf(m, " .%-30s: %ld\n", "removed.runnable_sum",
|
||||
cfs_rq->removed.runnable_sum);
|
||||
SEQ_printf(m, " .%-30s: %ld\n", "removed.runnable_avg",
|
||||
cfs_rq->removed.runnable_avg);
|
||||
#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib",
|
||||
cfs_rq->tg_load_avg_contrib);
|
||||
@ -947,13 +945,12 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
|
||||
"nr_involuntary_switches", (long long)p->nivcsw);
|
||||
|
||||
P(se.load.weight);
|
||||
P(se.runnable_weight);
|
||||
#ifdef CONFIG_SMP
|
||||
P(se.avg.load_sum);
|
||||
P(se.avg.runnable_load_sum);
|
||||
P(se.avg.runnable_sum);
|
||||
P(se.avg.util_sum);
|
||||
P(se.avg.load_avg);
|
||||
P(se.avg.runnable_load_avg);
|
||||
P(se.avg.runnable_avg);
|
||||
P(se.avg.util_avg);
|
||||
P(se.avg.last_update_time);
|
||||
P(se.avg.util_est.ewma);
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -121,8 +121,8 @@ accumulate_sum(u64 delta, struct sched_avg *sa,
|
||||
*/
|
||||
if (periods) {
|
||||
sa->load_sum = decay_load(sa->load_sum, periods);
|
||||
sa->runnable_load_sum =
|
||||
decay_load(sa->runnable_load_sum, periods);
|
||||
sa->runnable_sum =
|
||||
decay_load(sa->runnable_sum, periods);
|
||||
sa->util_sum = decay_load((u64)(sa->util_sum), periods);
|
||||
|
||||
/*
|
||||
@ -149,7 +149,7 @@ accumulate_sum(u64 delta, struct sched_avg *sa,
|
||||
if (load)
|
||||
sa->load_sum += load * contrib;
|
||||
if (runnable)
|
||||
sa->runnable_load_sum += runnable * contrib;
|
||||
sa->runnable_sum += runnable * contrib << SCHED_CAPACITY_SHIFT;
|
||||
if (running)
|
||||
sa->util_sum += contrib << SCHED_CAPACITY_SHIFT;
|
||||
|
||||
@ -238,7 +238,7 @@ ___update_load_sum(u64 now, struct sched_avg *sa,
|
||||
}
|
||||
|
||||
static __always_inline void
|
||||
___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
|
||||
___update_load_avg(struct sched_avg *sa, unsigned long load)
|
||||
{
|
||||
u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
|
||||
|
||||
@ -246,7 +246,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna
|
||||
* Step 2: update *_avg.
|
||||
*/
|
||||
sa->load_avg = div_u64(load * sa->load_sum, divider);
|
||||
sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider);
|
||||
sa->runnable_avg = div_u64(sa->runnable_sum, divider);
|
||||
WRITE_ONCE(sa->util_avg, sa->util_sum / divider);
|
||||
}
|
||||
|
||||
@ -254,33 +254,32 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna
|
||||
* sched_entity:
|
||||
*
|
||||
* task:
|
||||
* se_runnable() == se_weight()
|
||||
* se_weight() = se->load.weight
|
||||
* se_runnable() = !!on_rq
|
||||
*
|
||||
* group: [ see update_cfs_group() ]
|
||||
* se_weight() = tg->weight * grq->load_avg / tg->load_avg
|
||||
* se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg
|
||||
* se_runnable() = grq->h_nr_running
|
||||
*
|
||||
* load_sum := runnable_sum
|
||||
* load_avg = se_weight(se) * runnable_avg
|
||||
* runnable_sum = se_runnable() * runnable = grq->runnable_sum
|
||||
* runnable_avg = runnable_sum
|
||||
*
|
||||
* runnable_load_sum := runnable_sum
|
||||
* runnable_load_avg = se_runnable(se) * runnable_avg
|
||||
*
|
||||
* XXX collapse load_sum and runnable_load_sum
|
||||
* load_sum := runnable
|
||||
* load_avg = se_weight(se) * load_sum
|
||||
*
|
||||
* cfs_rq:
|
||||
*
|
||||
* runnable_sum = \Sum se->avg.runnable_sum
|
||||
* runnable_avg = \Sum se->avg.runnable_avg
|
||||
*
|
||||
* load_sum = \Sum se_weight(se) * se->avg.load_sum
|
||||
* load_avg = \Sum se->avg.load_avg
|
||||
*
|
||||
* runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum
|
||||
* runnable_load_avg = \Sum se->avg.runable_load_avg
|
||||
*/
|
||||
|
||||
int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
|
||||
{
|
||||
if (___update_load_sum(now, &se->avg, 0, 0, 0)) {
|
||||
___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
|
||||
___update_load_avg(&se->avg, se_weight(se));
|
||||
trace_pelt_se_tp(se);
|
||||
return 1;
|
||||
}
|
||||
@ -290,10 +289,10 @@ int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
|
||||
|
||||
int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se)
|
||||
{
|
||||
if (___update_load_sum(now, &se->avg, !!se->on_rq, !!se->on_rq,
|
||||
if (___update_load_sum(now, &se->avg, !!se->on_rq, se_runnable(se),
|
||||
cfs_rq->curr == se)) {
|
||||
|
||||
___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
|
||||
___update_load_avg(&se->avg, se_weight(se));
|
||||
cfs_se_util_change(&se->avg);
|
||||
trace_pelt_se_tp(se);
|
||||
return 1;
|
||||
@ -306,10 +305,10 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
|
||||
{
|
||||
if (___update_load_sum(now, &cfs_rq->avg,
|
||||
scale_load_down(cfs_rq->load.weight),
|
||||
scale_load_down(cfs_rq->runnable_weight),
|
||||
cfs_rq->h_nr_running,
|
||||
cfs_rq->curr != NULL)) {
|
||||
|
||||
___update_load_avg(&cfs_rq->avg, 1, 1);
|
||||
___update_load_avg(&cfs_rq->avg, 1);
|
||||
trace_pelt_cfs_tp(cfs_rq);
|
||||
return 1;
|
||||
}
|
||||
@ -322,9 +321,9 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
|
||||
*
|
||||
* util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
|
||||
* util_sum = cpu_scale * load_sum
|
||||
* runnable_load_sum = load_sum
|
||||
* runnable_sum = util_sum
|
||||
*
|
||||
* load_avg and runnable_load_avg are not supported and meaningless.
|
||||
* load_avg and runnable_avg are not supported and meaningless.
|
||||
*
|
||||
*/
|
||||
|
||||
@ -335,7 +334,7 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
|
||||
running,
|
||||
running)) {
|
||||
|
||||
___update_load_avg(&rq->avg_rt, 1, 1);
|
||||
___update_load_avg(&rq->avg_rt, 1);
|
||||
trace_pelt_rt_tp(rq);
|
||||
return 1;
|
||||
}
|
||||
@ -348,7 +347,9 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
|
||||
*
|
||||
* util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
|
||||
* util_sum = cpu_scale * load_sum
|
||||
* runnable_load_sum = load_sum
|
||||
* runnable_sum = util_sum
|
||||
*
|
||||
* load_avg and runnable_avg are not supported and meaningless.
|
||||
*
|
||||
*/
|
||||
|
||||
@ -359,7 +360,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
|
||||
running,
|
||||
running)) {
|
||||
|
||||
___update_load_avg(&rq->avg_dl, 1, 1);
|
||||
___update_load_avg(&rq->avg_dl, 1);
|
||||
trace_pelt_dl_tp(rq);
|
||||
return 1;
|
||||
}
|
||||
@ -367,13 +368,46 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_SCHED_THERMAL_PRESSURE
|
||||
/*
|
||||
* thermal:
|
||||
*
|
||||
* load_sum = \Sum se->avg.load_sum but se->avg.load_sum is not tracked
|
||||
*
|
||||
* util_avg and runnable_avg are not supported and meaningless.
|
||||
*
|
||||
* Unlike rt/dl utilization tracking, which tracks the time spent by a cpu
|
||||
* running a rt/dl task through util_avg, the average thermal pressure is
|
||||
* tracked through load_avg. This is because the thermal pressure signal is
|
||||
* a time-weighted "delta" capacity, unlike util_avg which is binary.
|
||||
* "delta capacity" = actual capacity -
|
||||
* capped capacity of a cpu due to a thermal event.
|
||||
*/
|
||||
|
||||
/*
 * update_thermal_load_avg - update the rq's thermal pressure average
 * @now:      timestamp handed to ___update_load_sum()
 * @rq:       runqueue whose avg_thermal signal is updated
 * @capacity: thermal pressure sample; passed as each of the three signal
 *            arguments of ___update_load_sum()
 *
 * Return: 1 if ___update_load_sum() reported an update, in which case the
 * averages are recomputed with a load factor of 1 and the pelt_thermal_tp
 * tracepoint is emitted; 0 otherwise.
 */
int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
|
||||
{
|
||||
if (___update_load_sum(now, &rq->avg_thermal,
|
||||
capacity,
|
||||
capacity,
|
||||
capacity)) {
|
||||
___update_load_avg(&rq->avg_thermal, 1);
|
||||
trace_pelt_thermal_tp(rq);
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
|
||||
/*
|
||||
* irq:
|
||||
*
|
||||
* util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
|
||||
* util_sum = cpu_scale * load_sum
|
||||
* runnable_load_sum = load_sum
|
||||
* runnable_sum = util_sum
|
||||
*
|
||||
* load_avg and runnable_avg are not supported and meaningless.
|
||||
*
|
||||
*/
|
||||
|
||||
@ -410,7 +444,7 @@ int update_irq_load_avg(struct rq *rq, u64 running)
|
||||
1);
|
||||
|
||||
if (ret) {
|
||||
___update_load_avg(&rq->avg_irq, 1, 1);
|
||||
___update_load_avg(&rq->avg_irq, 1);
|
||||
trace_pelt_irq_tp(rq);
|
||||
}
|
||||
|
||||
|
@ -7,6 +7,26 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq);
|
||||
int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
|
||||
int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
|
||||
|
||||
#ifdef CONFIG_SCHED_THERMAL_PRESSURE
|
||||
int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity);
|
||||
|
||||
/*
 * thermal_load_avg - read the rq's averaged thermal pressure
 * @rq: runqueue to read from
 *
 * Return: rq->avg_thermal.load_avg, read once with READ_ONCE().
 */
static inline u64 thermal_load_avg(struct rq *rq)
|
||||
{
|
||||
return READ_ONCE(rq->avg_thermal.load_avg);
|
||||
}
|
||||
#else
|
||||
static inline int
|
||||
update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline u64 thermal_load_avg(struct rq *rq)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
|
||||
int update_irq_load_avg(struct rq *rq, u64 running);
|
||||
#else
|
||||
@ -158,6 +178,17 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int
|
||||
update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline u64 thermal_load_avg(struct rq *rq)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int
|
||||
update_irq_load_avg(struct rq *rq, u64 running)
|
||||
{
|
||||
|
@ -225,7 +225,7 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
|
||||
case PSI_MEM_FULL:
|
||||
return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING];
|
||||
case PSI_CPU_SOME:
|
||||
return tasks[NR_RUNNING] > 1;
|
||||
return tasks[NR_RUNNING] > tasks[NR_ONCPU];
|
||||
case PSI_NONIDLE:
|
||||
return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
|
||||
tasks[NR_RUNNING];
|
||||
@ -669,13 +669,14 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,
|
||||
groupc->times[PSI_NONIDLE] += delta;
|
||||
}
|
||||
|
||||
static u32 psi_group_change(struct psi_group *group, int cpu,
|
||||
unsigned int clear, unsigned int set)
|
||||
static void psi_group_change(struct psi_group *group, int cpu,
|
||||
unsigned int clear, unsigned int set,
|
||||
bool wake_clock)
|
||||
{
|
||||
struct psi_group_cpu *groupc;
|
||||
u32 state_mask = 0;
|
||||
unsigned int t, m;
|
||||
enum psi_states s;
|
||||
u32 state_mask = 0;
|
||||
|
||||
groupc = per_cpu_ptr(group->pcpu, cpu);
|
||||
|
||||
@ -695,10 +696,10 @@ static u32 psi_group_change(struct psi_group *group, int cpu,
|
||||
if (!(m & (1 << t)))
|
||||
continue;
|
||||
if (groupc->tasks[t] == 0 && !psi_bug) {
|
||||
printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u] clear=%x set=%x\n",
|
||||
printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
|
||||
cpu, t, groupc->tasks[0],
|
||||
groupc->tasks[1], groupc->tasks[2],
|
||||
clear, set);
|
||||
groupc->tasks[3], clear, set);
|
||||
psi_bug = 1;
|
||||
}
|
||||
groupc->tasks[t]--;
|
||||
@ -717,7 +718,11 @@ static u32 psi_group_change(struct psi_group *group, int cpu,
|
||||
|
||||
write_seqcount_end(&groupc->seq);
|
||||
|
||||
return state_mask;
|
||||
if (state_mask & group->poll_states)
|
||||
psi_schedule_poll_work(group, 1);
|
||||
|
||||
if (wake_clock && !delayed_work_pending(&group->avgs_work))
|
||||
schedule_delayed_work(&group->avgs_work, PSI_FREQ);
|
||||
}
|
||||
|
||||
static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
|
||||
@ -744,6 +749,21 @@ static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
|
||||
return &psi_system;
|
||||
}
|
||||
|
||||
/*
 * psi_flags_change - apply a clear/set transition to a task's psi_flags
 * @task:  task whose psi_flags are updated
 * @clear: flag bits to clear; all of them are expected to be set already
 * @set:   flag bits to set; none of them is expected to be set already
 *
 * If either expectation is violated, the inconsistency is logged once
 * (psi_bug is latched so later violations are not reported again). The
 * flags are updated in every case, whether or not the check fired.
 */
static void psi_flags_change(struct task_struct *task, int clear, int set)
|
||||
{
|
||||
if (((task->psi_flags & set) ||
|
||||
(task->psi_flags & clear) != clear) &&
|
||||
!psi_bug) {
|
||||
printk_deferred(KERN_ERR "psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n",
|
||||
task->pid, task->comm, task_cpu(task),
|
||||
task->psi_flags, clear, set);
|
||||
psi_bug = 1;
|
||||
}
|
||||
|
||||
/* Clear first, then set. */
task->psi_flags &= ~clear;
|
||||
task->psi_flags |= set;
|
||||
}
|
||||
|
||||
void psi_task_change(struct task_struct *task, int clear, int set)
|
||||
{
|
||||
int cpu = task_cpu(task);
|
||||
@ -754,17 +774,7 @@ void psi_task_change(struct task_struct *task, int clear, int set)
|
||||
if (!task->pid)
|
||||
return;
|
||||
|
||||
if (((task->psi_flags & set) ||
|
||||
(task->psi_flags & clear) != clear) &&
|
||||
!psi_bug) {
|
||||
printk_deferred(KERN_ERR "psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n",
|
||||
task->pid, task->comm, cpu,
|
||||
task->psi_flags, clear, set);
|
||||
psi_bug = 1;
|
||||
}
|
||||
|
||||
task->psi_flags &= ~clear;
|
||||
task->psi_flags |= set;
|
||||
psi_flags_change(task, clear, set);
|
||||
|
||||
/*
|
||||
* Periodic aggregation shuts off if there is a period of no
|
||||
@ -777,14 +787,51 @@ void psi_task_change(struct task_struct *task, int clear, int set)
|
||||
wq_worker_last_func(task) == psi_avgs_work))
|
||||
wake_clock = false;
|
||||
|
||||
while ((group = iterate_groups(task, &iter))) {
|
||||
u32 state_mask = psi_group_change(group, cpu, clear, set);
|
||||
while ((group = iterate_groups(task, &iter)))
|
||||
psi_group_change(group, cpu, clear, set, wake_clock);
|
||||
}
|
||||
|
||||
if (state_mask & group->poll_states)
|
||||
psi_schedule_poll_work(group, 1);
|
||||
/*
 * Hand the TSK_ONCPU state over from @prev to @next. All group updates
 * are made for the CPU returned by task_cpu(@prev).
 *
 * @next, if its pid is non-zero, gets TSK_ONCPU set in its psi_flags and
 * in each group returned by iterate_groups(), stopping at the first
 * group whose per-cpu tasks[NR_ONCPU] counter for this CPU is already
 * non-zero. That group is remembered as the common ancestor and is not
 * updated.
 *
 * If @sleep is true nothing is done for @prev. Otherwise @prev, if its
 * pid is non-zero, has TSK_ONCPU cleared in its psi_flags and in each of
 * its groups up to, but not including, the common ancestor.
 */
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
|
||||
bool sleep)
|
||||
{
|
||||
struct psi_group *group, *common = NULL;
|
||||
int cpu = task_cpu(prev);
|
||||
void *iter;
|
||||
|
||||
/*
 * NOTE(review): the next two statements use wake_clock, which is not
 * declared in this function; they look like leftover lines from
 * another function in this diff rendering - confirm before relying
 * on them.
 */
if (wake_clock && !delayed_work_pending(&group->avgs_work))
|
||||
schedule_delayed_work(&group->avgs_work, PSI_FREQ);
|
||||
if (next->pid) {
|
||||
psi_flags_change(next, 0, TSK_ONCPU);
|
||||
/*
|
||||
* When moving state between tasks, the group that
|
||||
* contains them both does not change: we can stop
|
||||
* updating the tree once we reach the first common
|
||||
* ancestor. Iterate @next's ancestors until we
|
||||
* encounter @prev's state.
|
||||
*/
|
||||
iter = NULL;
|
||||
while ((group = iterate_groups(next, &iter))) {
|
||||
if (per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
|
||||
common = group;
|
||||
break;
|
||||
}
|
||||
|
||||
psi_group_change(group, cpu, 0, TSK_ONCPU, true);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* If this is a voluntary sleep, dequeue will have taken care
|
||||
* of the outgoing TSK_ONCPU alongside TSK_RUNNING already. We
|
||||
* only need to deal with it during preemption.
|
||||
*/
|
||||
if (sleep)
|
||||
return;
|
||||
|
||||
if (prev->pid) {
|
||||
psi_flags_change(prev, TSK_ONCPU, 0);
|
||||
|
||||
iter = NULL;
|
||||
/* Stop before the common ancestor: its state did not change. */
while ((group = iterate_groups(prev, &iter)) && group != common)
|
||||
psi_group_change(group, cpu, TSK_ONCPU, 0, true);
|
||||
}
|
||||
}
|
||||
|
||||
@ -818,17 +865,17 @@ void psi_memstall_enter(unsigned long *flags)
|
||||
if (static_branch_likely(&psi_disabled))
|
||||
return;
|
||||
|
||||
*flags = current->flags & PF_MEMSTALL;
|
||||
*flags = current->in_memstall;
|
||||
if (*flags)
|
||||
return;
|
||||
/*
|
||||
* PF_MEMSTALL setting & accounting needs to be atomic wrt
|
||||
* in_memstall setting & accounting needs to be atomic wrt
|
||||
* changes to the task's scheduling state, otherwise we can
|
||||
* race with CPU migration.
|
||||
*/
|
||||
rq = this_rq_lock_irq(&rf);
|
||||
|
||||
current->flags |= PF_MEMSTALL;
|
||||
current->in_memstall = 1;
|
||||
psi_task_change(current, 0, TSK_MEMSTALL);
|
||||
|
||||
rq_unlock_irq(rq, &rf);
|
||||
@ -851,13 +898,13 @@ void psi_memstall_leave(unsigned long *flags)
|
||||
if (*flags)
|
||||
return;
|
||||
/*
|
||||
* PF_MEMSTALL clearing & accounting needs to be atomic wrt
|
||||
* in_memstall clearing & accounting needs to be atomic wrt
|
||||
* changes to the task's scheduling state, otherwise we could
|
||||
* race with CPU migration.
|
||||
*/
|
||||
rq = this_rq_lock_irq(&rf);
|
||||
|
||||
current->flags &= ~PF_MEMSTALL;
|
||||
current->in_memstall = 0;
|
||||
psi_task_change(current, TSK_MEMSTALL, 0);
|
||||
|
||||
rq_unlock_irq(rq, &rf);
|
||||
@ -916,12 +963,14 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
|
||||
|
||||
rq = task_rq_lock(task, &rf);
|
||||
|
||||
if (task_on_rq_queued(task))
|
||||
if (task_on_rq_queued(task)) {
|
||||
task_flags = TSK_RUNNING;
|
||||
else if (task->in_iowait)
|
||||
if (task_current(rq, task))
|
||||
task_flags |= TSK_ONCPU;
|
||||
} else if (task->in_iowait)
|
||||
task_flags = TSK_IOWAIT;
|
||||
|
||||
if (task->flags & PF_MEMSTALL)
|
||||
if (task->in_memstall)
|
||||
task_flags |= TSK_MEMSTALL;
|
||||
|
||||
if (task_flags)
|
||||
|
@ -1474,6 +1474,13 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
|
||||
if (test || !rt_task_fits_capacity(p, cpu)) {
|
||||
int target = find_lowest_rq(p);
|
||||
|
||||
/*
|
||||
* Bail out if we were forcing a migration to find a better
|
||||
* fitting CPU but our search failed.
|
||||
*/
|
||||
if (!test && target != -1 && !rt_task_fits_capacity(p, target))
|
||||
goto out_unlock;
|
||||
|
||||
/*
|
||||
* Don't bother moving it if the destination CPU is
|
||||
* not running a lower priority task.
|
||||
@ -1482,6 +1489,8 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
|
||||
p->prio < cpu_rq(target)->rt.highest_prio.curr)
|
||||
cpu = target;
|
||||
}
|
||||
|
||||
out_unlock:
|
||||
rcu_read_unlock();
|
||||
|
||||
out:
|
||||
@ -1495,7 +1504,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
|
||||
* let's hope p can move out.
|
||||
*/
|
||||
if (rq->curr->nr_cpus_allowed == 1 ||
|
||||
!cpupri_find(&rq->rd->cpupri, rq->curr, NULL, NULL))
|
||||
!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
|
||||
return;
|
||||
|
||||
/*
|
||||
@ -1503,7 +1512,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
|
||||
* see if it is pushed or pulled somewhere else.
|
||||
*/
|
||||
if (p->nr_cpus_allowed != 1 &&
|
||||
cpupri_find(&rq->rd->cpupri, p, NULL, NULL))
|
||||
cpupri_find(&rq->rd->cpupri, p, NULL))
|
||||
return;
|
||||
|
||||
/*
|
||||
@ -1647,8 +1656,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
|
||||
static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
|
||||
{
|
||||
if (!task_running(rq, p) &&
|
||||
cpumask_test_cpu(cpu, p->cpus_ptr) &&
|
||||
rt_task_fits_capacity(p, cpu))
|
||||
cpumask_test_cpu(cpu, p->cpus_ptr))
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
@ -1682,6 +1690,7 @@ static int find_lowest_rq(struct task_struct *task)
|
||||
struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
|
||||
int this_cpu = smp_processor_id();
|
||||
int cpu = task_cpu(task);
|
||||
int ret;
|
||||
|
||||
/* Make sure the mask is initialized first */
|
||||
if (unlikely(!lowest_mask))
|
||||
@ -1690,8 +1699,22 @@ static int find_lowest_rq(struct task_struct *task)
|
||||
if (task->nr_cpus_allowed == 1)
|
||||
return -1; /* No other targets possible */
|
||||
|
||||
if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask,
|
||||
rt_task_fits_capacity))
|
||||
/*
|
||||
* If we're on asym system ensure we consider the different capacities
|
||||
* of the CPUs when searching for the lowest_mask.
|
||||
*/
|
||||
if (static_branch_unlikely(&sched_asym_cpucapacity)) {
|
||||
|
||||
ret = cpupri_find_fitness(&task_rq(task)->rd->cpupri,
|
||||
task, lowest_mask,
|
||||
rt_task_fits_capacity);
|
||||
} else {
|
||||
|
||||
ret = cpupri_find(&task_rq(task)->rd->cpupri,
|
||||
task, lowest_mask);
|
||||
}
|
||||
|
||||
if (!ret)
|
||||
return -1; /* No targets found */
|
||||
|
||||
/*
|
||||
@ -2202,7 +2225,7 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
|
||||
(rq->curr->nr_cpus_allowed < 2 ||
|
||||
rq->curr->prio <= p->prio);
|
||||
|
||||
if (need_to_push || !rt_task_fits_capacity(p, cpu_of(rq)))
|
||||
if (need_to_push)
|
||||
push_rt_tasks(rq);
|
||||
}
|
||||
|
||||
@ -2274,10 +2297,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
|
||||
*/
|
||||
if (task_on_rq_queued(p) && rq->curr != p) {
|
||||
#ifdef CONFIG_SMP
|
||||
bool need_to_push = rq->rt.overloaded ||
|
||||
!rt_task_fits_capacity(p, cpu_of(rq));
|
||||
|
||||
if (p->nr_cpus_allowed > 1 && need_to_push)
|
||||
if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
|
||||
rt_queue_push_tasks(rq);
|
||||
#endif /* CONFIG_SMP */
|
||||
if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
|
||||
@ -2449,10 +2469,11 @@ const struct sched_class rt_sched_class = {
|
||||
*/
|
||||
static DEFINE_MUTEX(rt_constraints_mutex);
|
||||
|
||||
/* Must be called with tasklist_lock held */
|
||||
static inline int tg_has_rt_tasks(struct task_group *tg)
|
||||
{
|
||||
struct task_struct *g, *p;
|
||||
struct task_struct *task;
|
||||
struct css_task_iter it;
|
||||
int ret = 0;
|
||||
|
||||
/*
|
||||
* Autogroups do not have RT tasks; see autogroup_create().
|
||||
@ -2460,12 +2481,12 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
|
||||
if (task_group_is_autogroup(tg))
|
||||
return 0;
|
||||
|
||||
for_each_process_thread(g, p) {
|
||||
if (rt_task(p) && task_group(p) == tg)
|
||||
return 1;
|
||||
}
|
||||
css_task_iter_start(&tg->css, 0, &it);
|
||||
while (!ret && (task = css_task_iter_next(&it)))
|
||||
ret |= rt_task(task);
|
||||
css_task_iter_end(&it);
|
||||
|
||||
return 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
struct rt_schedulable_data {
|
||||
@ -2496,9 +2517,10 @@ static int tg_rt_schedulable(struct task_group *tg, void *data)
|
||||
return -EINVAL;
|
||||
|
||||
/*
|
||||
* Ensure we don't starve existing RT tasks.
|
||||
* Ensure we don't starve existing RT tasks if runtime turns zero.
|
||||
*/
|
||||
if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
|
||||
if (rt_bandwidth_enabled() && !runtime &&
|
||||
tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg))
|
||||
return -EBUSY;
|
||||
|
||||
total = to_ratio(period, runtime);
|
||||
@ -2564,7 +2586,6 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
|
||||
return -EINVAL;
|
||||
|
||||
mutex_lock(&rt_constraints_mutex);
|
||||
read_lock(&tasklist_lock);
|
||||
err = __rt_schedulable(tg, rt_period, rt_runtime);
|
||||
if (err)
|
||||
goto unlock;
|
||||
@ -2582,7 +2603,6 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
|
||||
}
|
||||
raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
|
||||
unlock:
|
||||
read_unlock(&tasklist_lock);
|
||||
mutex_unlock(&rt_constraints_mutex);
|
||||
|
||||
return err;
|
||||
@ -2641,9 +2661,7 @@ static int sched_rt_global_constraints(void)
|
||||
int ret = 0;
|
||||
|
||||
mutex_lock(&rt_constraints_mutex);
|
||||
read_lock(&tasklist_lock);
|
||||
ret = __rt_schedulable(NULL, 0, 0);
|
||||
read_unlock(&tasklist_lock);
|
||||
mutex_unlock(&rt_constraints_mutex);
|
||||
|
||||
return ret;
|
||||
|
@ -118,7 +118,13 @@ extern long calc_load_fold_active(struct rq *this_rq, long adjust);
|
||||
#ifdef CONFIG_64BIT
|
||||
# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)
|
||||
# define scale_load(w) ((w) << SCHED_FIXEDPOINT_SHIFT)
|
||||
# define scale_load_down(w) ((w) >> SCHED_FIXEDPOINT_SHIFT)
|
||||
/*
 * Scale a load weight down by SCHED_FIXEDPOINT_SHIFT. @w is evaluated
 * exactly once. A zero weight stays zero; any non-zero weight yields at
 * least 2, so a small non-zero weight is never scaled down to zero.
 */
# define scale_load_down(w) \
|
||||
({ \
|
||||
unsigned long __w = (w); \
|
||||
if (__w) \
|
||||
__w = max(2UL, __w >> SCHED_FIXEDPOINT_SHIFT); \
|
||||
__w; \
|
||||
})
|
||||
#else
|
||||
# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT)
|
||||
# define scale_load(w) (w)
|
||||
@ -305,7 +311,6 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
|
||||
dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
|
||||
}
|
||||
|
||||
extern void dl_change_utilization(struct task_struct *p, u64 new_bw);
|
||||
extern void init_dl_bw(struct dl_bw *dl_b);
|
||||
extern int sched_dl_global_validate(void);
|
||||
extern void sched_dl_do_global(void);
|
||||
@ -489,7 +494,6 @@ struct cfs_bandwidth { };
|
||||
/* CFS-related fields in a runqueue */
|
||||
struct cfs_rq {
|
||||
struct load_weight load;
|
||||
unsigned long runnable_weight;
|
||||
unsigned int nr_running;
|
||||
unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */
|
||||
unsigned int idle_h_nr_running; /* SCHED_IDLE */
|
||||
@ -528,7 +532,7 @@ struct cfs_rq {
|
||||
int nr;
|
||||
unsigned long load_avg;
|
||||
unsigned long util_avg;
|
||||
unsigned long runnable_sum;
|
||||
unsigned long runnable_avg;
|
||||
} removed;
|
||||
|
||||
#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
@ -688,8 +692,30 @@ struct dl_rq {
|
||||
#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
/* An entity is a task if it doesn't "own" a runqueue */
|
||||
#define entity_is_task(se) (!se->my_q)
|
||||
|
||||
/*
 * Refresh the runnable weight of a group entity. For an entity that owns
 * a runqueue (se->my_q is set), runnable_weight is copied from that
 * runqueue's h_nr_running. Task entities are left untouched.
 */
static inline void se_update_runnable(struct sched_entity *se)
|
||||
{
|
||||
if (!entity_is_task(se))
|
||||
se->runnable_weight = se->my_q->h_nr_running;
|
||||
}
|
||||
|
||||
/*
 * Runnable contribution of @se. A task entity contributes 1 when on_rq
 * is non-zero and 0 otherwise; a group entity contributes its stored
 * runnable_weight.
 */
static inline long se_runnable(struct sched_entity *se)
|
||||
{
|
||||
if (entity_is_task(se))
|
||||
return !!se->on_rq;
|
||||
else
|
||||
return se->runnable_weight;
|
||||
}
|
||||
|
||||
#else
|
||||
#define entity_is_task(se) 1
|
||||
|
||||
/* No-op variant: entity_is_task() is defined as 1 in this branch. */
static inline void se_update_runnable(struct sched_entity *se) {}
|
||||
|
||||
/*
 * Runnable contribution of @se when every entity is a task: 1 when
 * on_rq is non-zero, 0 otherwise.
 */
static inline long se_runnable(struct sched_entity *se)
|
||||
{
|
||||
return !!se->on_rq;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
@ -701,10 +727,6 @@ static inline long se_weight(struct sched_entity *se)
|
||||
return scale_load_down(se->load.weight);
|
||||
}
|
||||
|
||||
static inline long se_runnable(struct sched_entity *se)
|
||||
{
|
||||
return scale_load_down(se->runnable_weight);
|
||||
}
|
||||
|
||||
static inline bool sched_asym_prefer(int a, int b)
|
||||
{
|
||||
@ -943,6 +965,9 @@ struct rq {
|
||||
struct sched_avg avg_dl;
|
||||
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
|
||||
struct sched_avg avg_irq;
|
||||
#endif
|
||||
#ifdef CONFIG_SCHED_THERMAL_PRESSURE
|
||||
struct sched_avg avg_thermal;
|
||||
#endif
|
||||
u64 idle_stamp;
|
||||
u64 avg_idle;
|
||||
@ -1107,6 +1132,24 @@ static inline u64 rq_clock_task(struct rq *rq)
|
||||
return rq->clock_task;
|
||||
}
|
||||
|
||||
/**
|
||||
* By default the decay is the default pelt decay period.
|
||||
* The decay shift can change the decay period in
|
||||
* multiples of 32.
|
||||
* Decay shift Decay period(ms)
|
||||
* 0 32
|
||||
* 1 64
|
||||
* 2 128
|
||||
* 3 256
|
||||
* 4 512
|
||||
*/
|
||||
extern int sched_thermal_decay_shift;
|
||||
|
||||
/*
 * Clock used for thermal signal tracking: the task clock of @rq
 * right-shifted by sched_thermal_decay_shift. With a shift of 0 it is
 * identical to rq_clock_task(); each additional shift step halves the
 * rate at which this clock advances.
 */
static inline u64 rq_clock_thermal(struct rq *rq)
|
||||
{
|
||||
return rq_clock_task(rq) >> sched_thermal_decay_shift;
|
||||
}
|
||||
|
||||
static inline void rq_clock_skip_update(struct rq *rq)
|
||||
{
|
||||
lockdep_assert_held(&rq->lock);
|
||||
@ -1337,8 +1380,6 @@ extern void sched_ttwu_pending(void);
|
||||
for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
|
||||
__sd; __sd = __sd->parent)
|
||||
|
||||
#define for_each_lower_domain(sd) for (; sd; sd = sd->child)
|
||||
|
||||
/**
|
||||
* highest_flag_domain - Return highest sched_domain containing flag.
|
||||
* @cpu: The CPU whose highest level of sched domain is to
|
||||
@ -1869,7 +1910,6 @@ extern struct dl_bandwidth def_dl_bandwidth;
|
||||
extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
|
||||
extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
|
||||
extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
|
||||
extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);
|
||||
|
||||
#define BW_SHIFT 20
|
||||
#define BW_UNIT (1 << BW_SHIFT)
|
||||
@ -1968,6 +2008,13 @@ static inline int hrtick_enabled(struct rq *rq)
|
||||
|
||||
#endif /* CONFIG_SCHED_HRTICK */
|
||||
|
||||
/*
 * Default arch_scale_freq_tick(): an empty function, compiled only when
 * no arch_scale_freq_tick macro has been defined before this point.
 */
#ifndef arch_scale_freq_tick
|
||||
static __always_inline
|
||||
void arch_scale_freq_tick(void)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef arch_scale_freq_capacity
|
||||
static __always_inline
|
||||
unsigned long arch_scale_freq_capacity(int cpu)
|
||||
|
@ -70,7 +70,7 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
|
||||
return;
|
||||
|
||||
if (!wakeup || p->sched_psi_wake_requeue) {
|
||||
if (p->flags & PF_MEMSTALL)
|
||||
if (p->in_memstall)
|
||||
set |= TSK_MEMSTALL;
|
||||
if (p->sched_psi_wake_requeue)
|
||||
p->sched_psi_wake_requeue = 0;
|
||||
@ -90,9 +90,17 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep)
|
||||
return;
|
||||
|
||||
if (!sleep) {
|
||||
if (p->flags & PF_MEMSTALL)
|
||||
if (p->in_memstall)
|
||||
clear |= TSK_MEMSTALL;
|
||||
} else {
|
||||
/*
|
||||
* When a task sleeps, schedule() dequeues it before
|
||||
* switching to the next one. Merge the clearing of
|
||||
* TSK_RUNNING and TSK_ONCPU to save an unnecessary
|
||||
* psi_task_change() call in psi_sched_switch().
|
||||
*/
|
||||
clear |= TSK_ONCPU;
|
||||
|
||||
if (p->in_iowait)
|
||||
set |= TSK_IOWAIT;
|
||||
}
|
||||
@ -109,14 +117,14 @@ static inline void psi_ttwu_dequeue(struct task_struct *p)
|
||||
* deregister its sleep-persistent psi states from the old
|
||||
* queue, and let psi_enqueue() know it has to requeue.
|
||||
*/
|
||||
if (unlikely(p->in_iowait || (p->flags & PF_MEMSTALL))) {
|
||||
if (unlikely(p->in_iowait || p->in_memstall)) {
|
||||
struct rq_flags rf;
|
||||
struct rq *rq;
|
||||
int clear = 0;
|
||||
|
||||
if (p->in_iowait)
|
||||
clear |= TSK_IOWAIT;
|
||||
if (p->flags & PF_MEMSTALL)
|
||||
if (p->in_memstall)
|
||||
clear |= TSK_MEMSTALL;
|
||||
|
||||
rq = __task_rq_lock(p, &rf);
|
||||
@ -126,18 +134,31 @@ static inline void psi_ttwu_dequeue(struct task_struct *p)
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Scheduler-side hook for a switch from @prev to @next. Returns without
 * doing anything when the psi_disabled static branch is taken; otherwise
 * forwards all three arguments unchanged to psi_task_switch().
 */
static inline void psi_sched_switch(struct task_struct *prev,
|
||||
struct task_struct *next,
|
||||
bool sleep)
|
||||
{
|
||||
if (static_branch_likely(&psi_disabled))
|
||||
return;
|
||||
|
||||
psi_task_switch(prev, next, sleep);
|
||||
}
|
||||
|
||||
static inline void psi_task_tick(struct rq *rq)
|
||||
{
|
||||
if (static_branch_likely(&psi_disabled))
|
||||
return;
|
||||
|
||||
if (unlikely(rq->curr->flags & PF_MEMSTALL))
|
||||
if (unlikely(rq->curr->in_memstall))
|
||||
psi_memstall_tick(rq->curr, cpu_of(rq));
|
||||
}
|
||||
#else /* CONFIG_PSI */
|
||||
static inline void psi_enqueue(struct task_struct *p, bool wakeup) {}
|
||||
static inline void psi_dequeue(struct task_struct *p, bool sleep) {}
|
||||
static inline void psi_ttwu_dequeue(struct task_struct *p) {}
|
||||
/* Empty stub used in the #else branch of CONFIG_PSI. */
static inline void psi_sched_switch(struct task_struct *prev,
|
||||
struct task_struct *next,
|
||||
bool sleep) {}
|
||||
static inline void psi_task_tick(struct rq *rq) {}
|
||||
#endif /* CONFIG_PSI */
|
||||
|
||||
|
@ -317,8 +317,9 @@ static void sched_energy_set(bool has_eas)
|
||||
* EAS can be used on a root domain if it meets all the following conditions:
|
||||
* 1. an Energy Model (EM) is available;
|
||||
* 2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy.
|
||||
* 3. the EM complexity is low enough to keep scheduling overheads low;
|
||||
* 4. schedutil is driving the frequency of all CPUs of the rd;
|
||||
* 3. no SMT is detected.
|
||||
* 4. the EM complexity is low enough to keep scheduling overheads low;
|
||||
* 5. schedutil is driving the frequency of all CPUs of the rd;
|
||||
*
|
||||
* The complexity of the Energy Model is defined as:
|
||||
*
|
||||
@ -360,6 +361,13 @@ static bool build_perf_domains(const struct cpumask *cpu_map)
|
||||
goto free;
|
||||
}
|
||||
|
||||
/* EAS definitely does *not* handle SMT */
|
||||
if (sched_smt_active()) {
|
||||
pr_warn("rd %*pbl: Disabling EAS, SMT is not supported\n",
|
||||
cpumask_pr_args(cpu_map));
|
||||
goto free;
|
||||
}
|
||||
|
||||
for_each_cpu(i, cpu_map) {
|
||||
/* Skip already covered CPUs. */
|
||||
if (find_pd(pd, i))
|
||||
@ -1374,18 +1382,9 @@ sd_init(struct sched_domain_topology_level *tl,
|
||||
* Convert topological properties into behaviour.
|
||||
*/
|
||||
|
||||
if (sd->flags & SD_ASYM_CPUCAPACITY) {
|
||||
struct sched_domain *t = sd;
|
||||
|
||||
/*
|
||||
* Don't attempt to spread across CPUs of different capacities.
|
||||
*/
|
||||
if (sd->child)
|
||||
sd->child->flags &= ~SD_PREFER_SIBLING;
|
||||
|
||||
for_each_lower_domain(t)
|
||||
t->flags |= SD_BALANCE_WAKE;
|
||||
}
|
||||
/* Don't attempt to spread across CPUs of different capacities. */
|
||||
if ((sd->flags & SD_ASYM_CPUCAPACITY) && sd->child)
|
||||
sd->child->flags &= ~SD_PREFER_SIBLING;
|
||||
|
||||
if (sd->flags & SD_SHARE_CPUCAPACITY) {
|
||||
sd->imbalance_pct = 110;
|
||||
|
@ -232,3 +232,32 @@ unsigned int cpumask_local_spread(unsigned int i, int node)
|
||||
BUG();
|
||||
}
|
||||
EXPORT_SYMBOL(cpumask_local_spread);
|
||||
|
||||
static DEFINE_PER_CPU(int, distribute_cpu_mask_prev);
|
||||
|
||||
/**
 * cpumask_any_and_distribute - Return an arbitrary cpu within src1p & src2p.
 * @src1p: the first input cpumask
 * @src2p: the second input cpumask
 *
 * Iterated calls using the same src1p and src2p will be distributed within
 * their intersection. The previously returned cpu is remembered in the
 * per-CPU variable distribute_cpu_mask_prev of the calling CPU, and the
 * next search starts from it.
 *
 * Returns >= nr_cpu_ids if the intersection is empty.
 */
|
||||
int cpumask_any_and_distribute(const struct cpumask *src1p,
|
||||
const struct cpumask *src2p)
|
||||
{
|
||||
int next, prev;
|
||||
|
||||
/* NOTE: our first selection will skip 0. */
|
||||
prev = __this_cpu_read(distribute_cpu_mask_prev);
|
||||
|
||||
/* Look for a candidate relative to the previous pick ... */
next = cpumask_next_and(prev, src1p, src2p);
|
||||
/* ... and fall back to the first cpu of the intersection. */
if (next >= nr_cpu_ids)
|
||||
next = cpumask_first_and(src1p, src2p);
|
||||
|
||||
/* Only remember a valid pick; an empty intersection keeps the old one. */
if (next < nr_cpu_ids)
|
||||
__this_cpu_write(distribute_cpu_mask_prev, next);
|
||||
|
||||
return next;
|
||||
}
|
||||
EXPORT_SYMBOL(cpumask_any_and_distribute);
|
||||
|
Loading…
x
Reference in New Issue
Block a user