ea31f1c6b4

Add three counters to follow and understand hiperdispatch behavior:

* adjustment_count (number of capacity adjustments triggered)
* greedy_time_ms (time spent while all CPUs are on high capacity)
* conservative_time_ms (time spent while only entitled CPUs are on high capacity)

These counters can be found under /sys/kernel/debug/s390/hiperdispatch/.
Time counters are in milliseconds and only cover the time spent while
hiperdispatch is active.

Acked-by: Vasily Gorbik <gor@linux.ibm.com>
Signed-off-by: Mete Durlu <meted@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
431 lines
12 KiB
C
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright IBM Corp. 2024
 */

#define KMSG_COMPONENT "hd"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

/*
 * Hiperdispatch:
 * Dynamically calculates the optimum number of high capacity COREs
 * by considering the state the system is in. When hiperdispatch decides
 * that a capacity update is necessary, it schedules a topology update.
 * During topology updates the CPU capacities are always re-adjusted.
 *
 * There are two places where CPU capacities are accessed within
 * hiperdispatch:
 * -> hiperdispatch's recurring work function reads CPU capacities to
 *    determine the high capacity CPU count.
 * -> during a topology update hiperdispatch's adjustment function
 *    updates CPU capacities.
 * These two can run on different CPUs in parallel, which can cause
 * hiperdispatch to make wrong decisions. This can potentially cause
 * some overhead by leading to extra rebuild_sched_domains() calls
 * for correction. Access to capacities within hiperdispatch has to be
 * serialized to prevent this overhead.
 *
 * Hiperdispatch decision making revolves around steal time.
 * The HD_STEAL_THRESHOLD value is taken as reference. Whenever steal time
 * crosses the threshold value, hiperdispatch falls back to giving high
 * capacities to entitled CPUs only. When steal time drops below the
 * threshold boundary, hiperdispatch utilizes all CPUs by giving all
 * of them high capacity.
 *
 * The theory behind HD_STEAL_THRESHOLD is related to SMP thread
 * performance. Comparing the throughput of:
 * - a single CORE, with N threads, running N tasks
 * - N separate COREs running N tasks,
 * using individual COREs for individual tasks yields better
 * performance. This performance difference is roughly ~30% (it can change
 * between machine generations).
 *
 * Hiperdispatch tries to hint the scheduler to use individual COREs for
 * each task, as long as steal time on those COREs is less than 30%,
 * therefore delaying the throughput loss caused by using SMP threads.
 */

#include <linux/cpumask.h>
#include <linux/debugfs.h>
#include <linux/device.h>
#include <linux/kernel_stat.h>
#include <linux/kstrtox.h>
#include <linux/ktime.h>
#include <linux/sysctl.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <asm/hiperdispatch.h>
#include <asm/setup.h>
#include <asm/smp.h>
#include <asm/topology.h>

#define CREATE_TRACE_POINTS
#include <asm/trace/hiperdispatch.h>

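/*
 * With the default values below the capacity adjustment work runs
 * roughly every quarter second (HD_DELAY_INTERVAL). Whenever
 * hd_enable_hiperdispatch() (re)arms the work, its first run is
 * delayed by hd_delay_factor times that interval, i.e. about one
 * second with the defaults.
 */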
#define HD_DELAY_FACTOR (4)
#define HD_DELAY_INTERVAL (HZ / 4)
#define HD_STEAL_THRESHOLD 30
#define HD_STEAL_AVG_WEIGHT 16

static cpumask_t hd_vl_coremask; /* Mask containing all vertical low COREs */
static cpumask_t hd_vmvl_cpumask; /* Mask containing vertical medium and low CPUs */
static int hd_high_capacity_cores; /* Current CORE count with high capacity */
static int hd_entitled_cores; /* Total vertical high and medium CORE count */
static int hd_online_cores; /* Current online CORE count */

static unsigned long hd_previous_steal; /* Previous iteration's CPU steal timer total */
static unsigned long hd_high_time; /* Total time spent while all cpus have high capacity */
static unsigned long hd_low_time; /* Total time spent while vl cpus have low capacity */
static atomic64_t hd_adjustments; /* Total occurrence count of hiperdispatch adjustments */

static unsigned int hd_steal_threshold = HD_STEAL_THRESHOLD;
static unsigned int hd_delay_factor = HD_DELAY_FACTOR;
static int hd_enabled;

static void hd_capacity_work_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(hd_capacity_work, hd_capacity_work_fn);

static int hd_set_hiperdispatch_mode(int enable)
{
	if (!MACHINE_HAS_TOPOLOGY)
		enable = 0;
	if (hd_enabled == enable)
		return 0;
	hd_enabled = enable;
	return 1;
}

void hd_reset_state(void)
{
	cpumask_clear(&hd_vl_coremask);
	cpumask_clear(&hd_vmvl_cpumask);
	hd_entitled_cores = 0;
	hd_online_cores = 0;
}

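/*
 * Account one online CORE: vertical high and medium COREs count towards
 * the entitled total, vertical medium and low sibling CPUs are collected
 * in hd_vmvl_cpumask for steal time sampling, and vertical low COREs are
 * tracked in hd_vl_coremask for capacity adjustments.
 */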
void hd_add_core(int cpu)
{
	const struct cpumask *siblings;
	int polarization;

	hd_online_cores++;
	polarization = smp_cpu_get_polarization(cpu);
	siblings = topology_sibling_cpumask(cpu);
	switch (polarization) {
	case POLARIZATION_VH:
		hd_entitled_cores++;
		break;
	case POLARIZATION_VM:
		hd_entitled_cores++;
		cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings);
		break;
	case POLARIZATION_VL:
		cpumask_set_cpu(cpu, &hd_vl_coremask);
		cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings);
		break;
	}
}

/* Serialize update and read operations of debug counters. */
static DEFINE_MUTEX(hd_counter_mutex);

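/*
 * Attribute the time elapsed since the previous call either to
 * hd_high_time (all online COREs had high capacity) or to hd_low_time
 * (only the entitled COREs had high capacity). These totals back the
 * greedy_time_ms and conservative_time_ms debugfs counters.
 */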
static void hd_update_times(void)
{
	static ktime_t prev;
	ktime_t now;

	/*
	 * If hiperdispatch is not active, reset prev to 0. This makes it
	 * possible to tell apart the first update iteration after
	 * hiperdispatch has been enabled.
	 */
	if (hd_entitled_cores == 0 || hd_enabled == 0) {
		prev = ktime_set(0, 0);
		return;
	}
	now = ktime_get();
	if (ktime_after(prev, 0)) {
		if (hd_high_capacity_cores == hd_online_cores)
			hd_high_time += ktime_ms_delta(now, prev);
		else
			hd_low_time += ktime_ms_delta(now, prev);
	}
	prev = now;
}

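/*
 * Apply the current decision to the vertical low COREs: when more COREs
 * than the entitled count should run with high capacity, the surplus is
 * handed to that many vertical low COREs; all remaining vertical low
 * COREs are set back to low capacity.
 */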
static void hd_update_capacities(void)
{
	int cpu, upscaling_cores;
	unsigned long capacity;

	upscaling_cores = hd_high_capacity_cores - hd_entitled_cores;
	capacity = upscaling_cores > 0 ? CPU_CAPACITY_HIGH : CPU_CAPACITY_LOW;
	hd_high_capacity_cores = hd_entitled_cores;
	for_each_cpu(cpu, &hd_vl_coremask) {
		smp_set_core_capacity(cpu, capacity);
		if (capacity != CPU_CAPACITY_HIGH)
			continue;
		hd_high_capacity_cores++;
		upscaling_cores--;
		if (upscaling_cores == 0)
			capacity = CPU_CAPACITY_LOW;
	}
}

void hd_disable_hiperdispatch(void)
{
	cancel_delayed_work_sync(&hd_capacity_work);
	hd_high_capacity_cores = hd_online_cores;
	hd_previous_steal = 0;
}

int hd_enable_hiperdispatch(void)
{
	mutex_lock(&hd_counter_mutex);
	hd_update_times();
	mutex_unlock(&hd_counter_mutex);
	if (hd_enabled == 0)
		return 0;
	if (hd_entitled_cores == 0)
		return 0;
	if (hd_online_cores <= hd_entitled_cores)
		return 0;
	mod_delayed_work(system_wq, &hd_capacity_work, HD_DELAY_INTERVAL * hd_delay_factor);
	hd_update_capacities();
	return 1;
}

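/*
 * Exponentially weighted moving average of the steal percentage: each new
 * sample contributes 1 / HD_STEAL_AVG_WEIGHT of its value, so short steal
 * time spikes are smoothed out instead of toggling capacities immediately.
 */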
static unsigned long hd_steal_avg(unsigned long new)
{
	static unsigned long steal;

	steal = (steal * (HD_STEAL_AVG_WEIGHT - 1) + new) / HD_STEAL_AVG_WEIGHT;
	return steal;
}

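/*
 * Both the accumulated CPUTIME_STEAL values and the ktime delta are in
 * nanoseconds, so steal_delta below is the combined steal percentage of
 * all vertical medium and low CPUs over the sampling interval; dividing
 * by the CPU count yields the average per-CPU steal percentage.
 */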
static unsigned long hd_calculate_steal_percentage(void)
{
	unsigned long time_delta, steal_delta, steal, percentage;
	static ktime_t prev;
	int cpus, cpu;
	ktime_t now;

	cpus = 0;
	steal = 0;
	percentage = 0;
	for_each_cpu(cpu, &hd_vmvl_cpumask) {
		steal += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL];
		cpus++;
	}
	/*
	 * If there are no vertical medium and low CPUs, steal time
	 * is 0, as vertical high CPUs shouldn't experience steal time.
	 */
	if (cpus == 0)
		return percentage;
	now = ktime_get();
	time_delta = ktime_to_ns(ktime_sub(now, prev));
	if (steal > hd_previous_steal && hd_previous_steal != 0) {
		steal_delta = (steal - hd_previous_steal) * 100 / time_delta;
		percentage = steal_delta / cpus;
	}
	hd_previous_steal = steal;
	prev = now;
	return percentage;
}

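/*
 * Periodic decision point: while the smoothed steal percentage stays below
 * hd_steal_threshold, all online COREs are kept on high capacity (greedy);
 * once it crosses the threshold, only the entitled COREs keep high capacity
 * (conservative). Whenever the target count changes, a topology update is
 * scheduled and the adjustment_count debugfs counter is incremented.
 */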
static void hd_capacity_work_fn(struct work_struct *work)
{
	unsigned long steal_percentage, new_cores;

	mutex_lock(&smp_cpu_state_mutex);
	/*
	 * If the online cores are less than or equal to the entitled cores,
	 * hiperdispatch does not need to make any adjustments; call a
	 * topology update to disable hiperdispatch.
	 * Normally this check is handled on topology update, but during cpu
	 * hot-unplug, topology and cpu mask updates are done in reverse
	 * order, causing hd_enable_hiperdispatch() to get stale data.
	 */
	if (hd_online_cores <= hd_entitled_cores) {
		topology_schedule_update();
		mutex_unlock(&smp_cpu_state_mutex);
		return;
	}
	steal_percentage = hd_steal_avg(hd_calculate_steal_percentage());
	if (steal_percentage < hd_steal_threshold)
		new_cores = hd_online_cores;
	else
		new_cores = hd_entitled_cores;
	if (hd_high_capacity_cores != new_cores) {
		trace_s390_hd_rebuild_domains(hd_high_capacity_cores, new_cores);
		hd_high_capacity_cores = new_cores;
		atomic64_inc(&hd_adjustments);
		topology_schedule_update();
	}
	trace_s390_hd_work_fn(steal_percentage, hd_entitled_cores, hd_high_capacity_cores);
	mutex_unlock(&smp_cpu_state_mutex);
	schedule_delayed_work(&hd_capacity_work, HD_DELAY_INTERVAL);
}

static int hiperdispatch_ctl_handler(const struct ctl_table *ctl, int write,
				     void *buffer, size_t *lenp, loff_t *ppos)
{
	int hiperdispatch;
	int rc;
	struct ctl_table ctl_entry = {
		.procname = ctl->procname,
		.data = &hiperdispatch,
		.maxlen = sizeof(int),
		.extra1 = SYSCTL_ZERO,
		.extra2 = SYSCTL_ONE,
	};

	hiperdispatch = hd_enabled;
	rc = proc_douintvec_minmax(&ctl_entry, write, buffer, lenp, ppos);
	if (rc < 0 || !write)
		return rc;
	mutex_lock(&smp_cpu_state_mutex);
	if (hd_set_hiperdispatch_mode(hiperdispatch))
		topology_schedule_update();
	mutex_unlock(&smp_cpu_state_mutex);
	return 0;
}

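/* Toggles hiperdispatch via the s390.hiperdispatch sysctl (0 = off, 1 = on). */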
static struct ctl_table hiperdispatch_ctl_table[] = {
	{
		.procname = "hiperdispatch",
		.mode = 0644,
		.proc_handler = hiperdispatch_ctl_handler,
	},
};

static ssize_t hd_steal_threshold_show(struct device *dev,
				       struct device_attribute *attr,
				       char *buf)
{
	return sysfs_emit(buf, "%u\n", hd_steal_threshold);
}

static ssize_t hd_steal_threshold_store(struct device *dev,
					struct device_attribute *attr,
					const char *buf,
					size_t count)
{
	unsigned int val;
	int rc;

	rc = kstrtouint(buf, 0, &val);
	if (rc)
		return rc;
	if (val > 100)
		return -ERANGE;
	hd_steal_threshold = val;
	return count;
}

static DEVICE_ATTR_RW(hd_steal_threshold);

static ssize_t hd_delay_factor_show(struct device *dev,
				    struct device_attribute *attr,
				    char *buf)
{
	return sysfs_emit(buf, "%u\n", hd_delay_factor);
}

static ssize_t hd_delay_factor_store(struct device *dev,
				     struct device_attribute *attr,
				     const char *buf,
				     size_t count)
{
	unsigned int val;
	int rc;

	rc = kstrtouint(buf, 0, &val);
	if (rc)
		return rc;
	if (!val)
		return -ERANGE;
	hd_delay_factor = val;
	return count;
}

static DEVICE_ATTR_RW(hd_delay_factor);

static struct attribute *hd_attrs[] = {
	&dev_attr_hd_steal_threshold.attr,
	&dev_attr_hd_delay_factor.attr,
	NULL,
};

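/*
 * hd_steal_threshold and hd_delay_factor are exposed as a "hiperdispatch"
 * attribute group on the cpu subsystem root device (typically visible as
 * /sys/devices/system/cpu/hiperdispatch/).
 */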
static const struct attribute_group hd_attr_group = {
	.name = "hiperdispatch",
	.attrs = hd_attrs,
};

static int hd_greedy_time_get(void *unused, u64 *val)
{
	mutex_lock(&hd_counter_mutex);
	hd_update_times();
	*val = hd_high_time;
	mutex_unlock(&hd_counter_mutex);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(hd_greedy_time_fops, hd_greedy_time_get, NULL, "%llu\n");

static int hd_conservative_time_get(void *unused, u64 *val)
{
	mutex_lock(&hd_counter_mutex);
	hd_update_times();
	*val = hd_low_time;
	mutex_unlock(&hd_counter_mutex);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(hd_conservative_time_fops, hd_conservative_time_get, NULL, "%llu\n");

static int hd_adjustment_count_get(void *unused, u64 *val)
{
	*val = atomic64_read(&hd_adjustments);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(hd_adjustments_fops, hd_adjustment_count_get, NULL, "%llu\n");

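/*
 * The debug counters live under the s390 debugfs directory, i.e.
 * /sys/kernel/debug/s390/hiperdispatch/ with the usual debugfs mount.
 */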
static void __init hd_create_debugfs_counters(void)
{
	struct dentry *dir;

	dir = debugfs_create_dir("hiperdispatch", arch_debugfs_dir);
	debugfs_create_file("conservative_time_ms", 0400, dir, NULL, &hd_conservative_time_fops);
	debugfs_create_file("greedy_time_ms", 0400, dir, NULL, &hd_greedy_time_fops);
	debugfs_create_file("adjustment_count", 0400, dir, NULL, &hd_adjustments_fops);
}

static void __init hd_create_attributes(void)
{
	struct device *dev;

	dev = bus_get_dev_root(&cpu_subsys);
	if (!dev)
		return;
	if (sysfs_create_group(&dev->kobj, &hd_attr_group))
		pr_warn("Unable to create hiperdispatch attribute group\n");
	put_device(dev);
}

static int __init hd_init(void)
{
	if (IS_ENABLED(CONFIG_HIPERDISPATCH_ON)) {
		hd_set_hiperdispatch_mode(1);
		topology_schedule_update();
	}
	if (!register_sysctl("s390", hiperdispatch_ctl_table))
		pr_warn("Failed to register s390.hiperdispatch sysctl attribute\n");
	hd_create_debugfs_counters();
	hd_create_attributes();
	return 0;
}

late_initcall(hd_init);