From e5bc44e47c531860be96ac615314b1ab23d5aa2b Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 25 Apr 2024 09:37:09 +0200 Subject: [PATCH 1/7] arch/topology: Fix variable naming to avoid shadowing Using 'hw_pressure' for local variable name is confusing in regard to the per-CPU 'hw_pressure' variable that uses the same name: include/linux/arch_topology.h:DECLARE_PER_CPU(unsigned long, hw_pressure); ... which puts it into a global scope for all code that includes , shadowing the local variable. Rename it to avoid compiler confusion & Sparse warnings. [ mingo: Expanded the changelog. ] Reported-by: kernel test robot Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Reviewed-by: Lukasz Luba Reviewed-by: Konrad Dybcio Acked-by: Sudeep Holla Cc: Linus Torvalds Link: https://lore.kernel.org/r/20240425073709.379016-1-vincent.guittot@linaro.org Closes: https://lore.kernel.org/oe-kbuild-all/202404250740.VhQQoD7N-lkp@intel.com/ Fixes: d4dbc991714e ("sched/cpufreq: Rename arch_update_thermal_pressure() => arch_update_hw_pressure()") Tested-by: Konrad Dybcio # QC SM8550 QRD --- drivers/base/arch_topology.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 0248912ff687..c66d070207a0 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -179,7 +179,7 @@ DEFINE_PER_CPU(unsigned long, hw_pressure); void topology_update_hw_pressure(const struct cpumask *cpus, unsigned long capped_freq) { - unsigned long max_capacity, capacity, hw_pressure; + unsigned long max_capacity, capacity, pressure; u32 max_freq; int cpu; @@ -196,12 +196,12 @@ void topology_update_hw_pressure(const struct cpumask *cpus, else capacity = mult_frac(max_capacity, capped_freq, max_freq); - hw_pressure = max_capacity - capacity; + pressure = max_capacity - capacity; - trace_hw_pressure_update(cpu, hw_pressure); + trace_hw_pressure_update(cpu, pressure); for_each_cpu(cpu, cpus) - WRITE_ONCE(per_cpu(hw_pressure, cpu), hw_pressure); + WRITE_ONCE(per_cpu(hw_pressure, cpu), pressure); } EXPORT_SYMBOL_GPL(topology_update_hw_pressure); From a1fd0b9d751f840df23ef0e75b691fc00cfd4743 Mon Sep 17 00:00:00 2001 From: Vitalii Bursov Date: Tue, 30 Apr 2024 18:05:23 +0300 Subject: [PATCH 2/7] sched/fair: Allow disabling sched_balance_newidle with sched_relax_domain_level Change relax_domain_level checks so that it would be possible to include or exclude all domains from newidle balancing. This matches the behavior described in the documentation: -1 no request. use system default or follow request of others. 0 no search. 1 search siblings (hyperthreads in a core). "2" enables levels 0 and 1, level_max excludes the last (level_max) level, and level_max+1 includes all levels. Fixes: 1d3504fcf560 ("sched, cpuset: customize sched domains, core") Signed-off-by: Vitalii Bursov Signed-off-by: Ingo Molnar Tested-by: Dietmar Eggemann Reviewed-by: Vincent Guittot Reviewed-by: Valentin Schneider Link: https://lore.kernel.org/r/bd6de28e80073c79466ec6401cdeae78f0d4423d.1714488502.git.vitaly@bursov.com --- kernel/cgroup/cpuset.c | 2 +- kernel/sched/topology.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 4237c8748715..da24187c4e02 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2948,7 +2948,7 @@ bool current_cpuset_is_being_rebound(void) static int update_relax_domain_level(struct cpuset *cs, s64 val) { #ifdef CONFIG_SMP - if (val < -1 || val >= sched_domain_level_max) + if (val < -1 || val > sched_domain_level_max + 1) return -EINVAL; #endif diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 63aecd2a7a9f..67a777b31743 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1475,7 +1475,7 @@ static void set_domain_attribute(struct sched_domain *sd, } else request = attr->relax_domain_level; - if (sd->level > request) { + if (sd->level >= request) { /* Turn off idle balance on this domain: */ sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); } From 287372fa39f579a61e17b000aa74c8418d230528 Mon Sep 17 00:00:00 2001 From: Vitalii Bursov Date: Tue, 30 Apr 2024 18:05:24 +0300 Subject: [PATCH 3/7] sched/debug: Dump domains' level Knowing domain's level exactly can be useful when setting relax_domain_level or cpuset.sched_relax_domain_level Usage: cat /debug/sched/domains/cpu0/domain1/level to dump cpu0 domain1's level. SDM macro is not used because sd->level is 'int' and it would hide the type mismatch between 'int' and 'u32'. Signed-off-by: Vitalii Bursov Signed-off-by: Ingo Molnar Tested-by: Dietmar Eggemann Acked-by: Vincent Guittot Reviewed-by: Valentin Schneider Link: https://lore.kernel.org/r/9489b6475f6dd6fbc67c617752d4216fa094da53.1714488502.git.vitaly@bursov.com --- kernel/sched/debug.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 8d5d98a5834d..c1eb9a1afd13 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -425,6 +425,7 @@ static void register_sd(struct sched_domain *sd, struct dentry *parent) debugfs_create_file("flags", 0444, parent, &sd->flags, &sd_flags_fops); debugfs_create_file("groups_flags", 0444, parent, &sd->groups->flags, &sd_flags_fops); + debugfs_create_u32("level", 0444, parent, (u32 *)&sd->level); } void update_sched_domain_debugfs(void) From 0f1c74befa656305ecc85c954dc31f84c1cc26e1 Mon Sep 17 00:00:00 2001 From: Vitalii Bursov Date: Tue, 30 Apr 2024 18:05:25 +0300 Subject: [PATCH 4/7] docs: cgroup-v1: Clarify that domain levels are system-specific Add a clarification that domain levels are system-specific and where to check for system details. Signed-off-by: Vitalii Bursov Signed-off-by: Ingo Molnar Reviewed-by: Valentin Schneider Acked-by: Vincent Guittot Link: https://lore.kernel.org/r/42b177a2e897cdf880caf9c2025f5b609e820334.1714488502.git.vitaly@bursov.com --- Documentation/admin-guide/cgroup-v1/cpusets.rst | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/cgroup-v1/cpusets.rst b/Documentation/admin-guide/cgroup-v1/cpusets.rst index 7d3415eea05d..f401af5e2f09 100644 --- a/Documentation/admin-guide/cgroup-v1/cpusets.rst +++ b/Documentation/admin-guide/cgroup-v1/cpusets.rst @@ -568,7 +568,7 @@ on the next tick. For some applications in special situation, waiting The 'cpuset.sched_relax_domain_level' file allows you to request changing this searching range as you like. This file takes int value which -indicates size of searching range in levels ideally as follows, +indicates size of searching range in levels approximately as follows, otherwise initial value -1 that indicates the cpuset has no request. ====== =========================================================== @@ -581,6 +581,11 @@ otherwise initial value -1 that indicates the cpuset has no request. 5 search system wide [on NUMA system] ====== =========================================================== +Not all levels can be present and values can change depending on the +system architecture and kernel configuration. Check +/sys/kernel/debug/sched/domains/cpu*/domain*/ for system-specific +details. + The system default is architecture dependent. The system default can be changed using the relax_domain_level= boot parameter. From 72bffbf57c5247ac6146d1103ef42e9f8d094bc8 Mon Sep 17 00:00:00 2001 From: Dawei Li Date: Thu, 14 Mar 2024 18:59:16 -0700 Subject: [PATCH 5/7] sched/fair: Fix initial util_avg calculation Change se->load.weight to se_weight(se) in the calculation for the initial util_avg to avoid unnecessarily inflating the util_avg by 1024 times. The reason is that se->load.weight has the unit/scale as the scaled-up load, while cfs_rg->avg.load_avg has the unit/scale as the true task weight (as mapped directly from the task's nice/priority value). With CONFIG_32BIT, the scaled-up load is equal to the true task weight. With CONFIG_64BIT, the scaled-up load is 1024 times the true task weight. Thus, the current code may inflate the util_avg by 1024 times. The follow-up capping will not allow the util_avg value to go wild. But the calculation should have the correct logic. Signed-off-by: Dawei Li Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Reviewed-by: Vishal Chourasia Link: https://lore.kernel.org/r/20240315015916.21545-1-daweilics@gmail.com --- kernel/sched/fair.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 146ecf9cc3af..900978741a81 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1031,7 +1031,8 @@ void init_entity_runnable_average(struct sched_entity *se) * With new tasks being created, their initial util_avgs are extrapolated * based on the cfs_rq's current util_avg: * - * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight + * util_avg = cfs_rq->avg.util_avg / (cfs_rq->avg.load_avg + 1) + * * se_weight(se) * * However, in many cases, the above util_avg does not give a desired * value. Moreover, the sum of the util_avgs may be divergent, such @@ -1078,7 +1079,7 @@ void post_init_entity_util_avg(struct task_struct *p) if (cap > 0) { if (cfs_rq->avg.util_avg != 0) { - sa->util_avg = cfs_rq->avg.util_avg * se->load.weight; + sa->util_avg = cfs_rq->avg.util_avg * se_weight(se); sa->util_avg /= (cfs_rq->avg.load_avg + 1); if (sa->util_avg > cap) From 7cb7fb5b49399fc59f1c44686d82c0df0776c8c6 Mon Sep 17 00:00:00 2001 From: Christian Loehle Date: Tue, 5 Mar 2024 15:18:20 +0000 Subject: [PATCH 6/7] sched/fair: Remove stale FREQUENCY_UTIL comment On 05/03/2024 15:05, Vincent Guittot wrote: I'm fine with either and that was my first thought here, too, but it did seem like the comment was mostly placed there to justify the 'unexpected' high utilization when explicitly passing FREQUENCY_UTIL and the need to clamp it then. So removing did feel slightly more natural to me anyway. So alternatively: From: Christian Loehle Date: Tue, 5 Mar 2024 09:34:41 +0000 Subject: [PATCH] sched/fair: Remove stale FREQUENCY_UTIL mention effective_cpu_util() flags were removed, so remove mentioning of the flag. commit 9c0b4bb7f6303 ("sched/cpufreq: Rework schedutil governor performance estimation") reworked effective_cpu_util() removing enum cpu_util_type. Modify the comment accordingly. Signed-off-by: Christian Loehle Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/0e2833ee-0939-44e0-82a2-520a585a0153@arm.com --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 900978741a81..9744b5036dd8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7900,8 +7900,8 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus, * Performance domain frequency: utilization clamping * must be considered since it affects the selection * of the performance domain frequency. - * NOTE: in case RT tasks are running, by default the - * FREQUENCY_UTIL's utilization can be max OPP. + * NOTE: in case RT tasks are running, by default the min + * utilization can be max OPP. */ eff_util = effective_cpu_util(cpu, util, &min, &max); From 49217ea147df7647cb89161b805c797487783fc0 Mon Sep 17 00:00:00 2001 From: Cheng Yu Date: Wed, 24 Apr 2024 21:24:38 +0800 Subject: [PATCH 7/7] sched/core: Fix incorrect initialization of the 'burst' parameter in cpu_max_write() In the cgroup v2 CPU subsystem, assuming we have a cgroup named 'test', and we set cpu.max and cpu.max.burst: # echo 1000000 > /sys/fs/cgroup/test/cpu.max # echo 1000000 > /sys/fs/cgroup/test/cpu.max.burst then we check cpu.max and cpu.max.burst: # cat /sys/fs/cgroup/test/cpu.max 1000000 100000 # cat /sys/fs/cgroup/test/cpu.max.burst 1000000 Next we set cpu.max again and check cpu.max and cpu.max.burst: # echo 2000000 > /sys/fs/cgroup/test/cpu.max # cat /sys/fs/cgroup/test/cpu.max 2000000 100000 # cat /sys/fs/cgroup/test/cpu.max.burst 1000 ... we find that the cpu.max.burst value changed unexpectedly. In cpu_max_write(), the unit of the burst value returned by tg_get_cfs_burst() is microseconds, while in cpu_max_write(), the burst unit used for calculation should be nanoseconds, which leads to the bug. To fix it, get the burst value directly from tg->cfs_bandwidth.burst. Fixes: f4183717b370 ("sched/fair: Introduce the burstable CFS controller") Reported-by: Qixin Liao Signed-off-by: Cheng Yu Signed-off-by: Zhang Qiao Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Tested-by: Vincent Guittot Link: https://lore.kernel.org/r/20240424132438.514720-1-serein.chengyu@huawei.com --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1a914388144a..f88f50552acb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -11402,7 +11402,7 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of, { struct task_group *tg = css_tg(of_css(of)); u64 period = tg_get_cfs_period(tg); - u64 burst = tg_get_cfs_burst(tg); + u64 burst = tg->cfs_bandwidth.burst; u64 quota; int ret;