2019-06-04 10:11:33 +02:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-only
|
2007-07-09 18:52:00 +02:00
|
|
|
/*
|
2011-11-15 17:14:39 +01:00
|
|
|
* kernel/sched/debug.c
|
2007-07-09 18:52:00 +02:00
|
|
|
*
|
2018-03-03 12:20:47 +01:00
|
|
|
* Print the CFS rbtree and other debugging details
|
2007-07-09 18:52:00 +02:00
|
|
|
*
|
|
|
|
* Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
|
|
|
|
*/
|
2011-10-25 10:00:11 +02:00
|
|
|
|
2007-07-09 18:52:00 +02:00
|
|
|
/*
|
2023-09-20 15:00:25 +02:00
|
|
|
* This allows printing both to /sys/kernel/debug/sched/debug and
|
2007-07-09 18:52:00 +02:00
|
|
|
* to the console
|
|
|
|
*/
|
|
|
|
/*
 * Print to the seq_file @m when one is supplied, otherwise continue the
 * current console line via pr_cont().
 */
#define SEQ_printf(m, x...)			\
 do {						\
	if (!(m))				\
		pr_cont(x);			\
	else					\
		seq_printf(m, x);		\
 } while (0)
|
|
|
|
|
2007-10-15 17:00:08 +02:00
|
|
|
/*
|
|
|
|
* Ease the printing of nsec fields:
|
|
|
|
*/
|
2007-12-30 17:24:35 +01:00
|
|
|
static long long nsec_high(unsigned long long nsec)
|
2007-10-15 17:00:08 +02:00
|
|
|
{
|
2007-12-30 17:24:35 +01:00
|
|
|
if ((long long)nsec < 0) {
|
2007-10-15 17:00:08 +02:00
|
|
|
nsec = -nsec;
|
|
|
|
do_div(nsec, 1000000);
|
|
|
|
return -nsec;
|
|
|
|
}
|
|
|
|
do_div(nsec, 1000000);
|
|
|
|
|
|
|
|
return nsec;
|
|
|
|
}
|
|
|
|
|
2007-12-30 17:24:35 +01:00
|
|
|
/*
 * Return the remainder of |@nsec| / 1000000, interpreting @nsec as a
 * signed value.
 */
static unsigned long nsec_low(unsigned long long nsec)
{
	unsigned long long magnitude = nsec;

	if ((long long)magnitude < 0)
		magnitude = -magnitude;

	/* do_div() leaves the quotient in @magnitude and returns the remainder. */
	return do_div(magnitude, 1000000);
}
|
|
|
|
|
|
|
|
/*
 * Expand @x into two comma-separated arguments: nsec_high(x) followed by
 * nsec_low(x). Note that @x is evaluated twice.
 */
#define SPLIT_NS(x) nsec_high(x), nsec_low(x)
|
|
|
|
|
2016-02-22 16:26:50 -05:00
|
|
|
/*
 * Table of scheduler feature names: every SCHED_FEAT() entry pulled in from
 * features.h is stringified, so sched_feat_names[i] names feature bit i.
 */
#define SCHED_FEAT(name, enabled) #name ,

static const char * const sched_feat_names[] = {
#include "features.h"
};

#undef SCHED_FEAT
|
|
|
|
|
|
|
|
static int sched_feat_show(struct seq_file *m, void *v)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < __SCHED_FEAT_NR; i++) {
|
|
|
|
if (!(sysctl_sched_features & (1UL << i)))
|
|
|
|
seq_puts(m, "NO_");
|
|
|
|
seq_printf(m, "%s ", sched_feat_names[i]);
|
|
|
|
}
|
|
|
|
seq_puts(m, "\n");
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-12-31 00:14:15 +09:00
|
|
|
#ifdef CONFIG_JUMP_LABEL

/* Translate the "enabled" column of features.h into a static_key initializer. */
#define jump_label_key__true  STATIC_KEY_INIT_TRUE
#define jump_label_key__false STATIC_KEY_INIT_FALSE

#define SCHED_FEAT(name, enabled) jump_label_key__##enabled ,

/* One static key per scheduler feature, initialized from features.h. */
struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
#include "features.h"
};

#undef SCHED_FEAT

/* Turn off the static key of feature @i (the _cpuslocked variant is used). */
static void sched_feat_disable(int i)
{
	struct static_key *key = &sched_feat_keys[i];

	static_key_disable_cpuslocked(key);
}

/* Turn on the static key of feature @i (the _cpuslocked variant is used). */
static void sched_feat_enable(int i)
{
	struct static_key *key = &sched_feat_keys[i];

	static_key_enable_cpuslocked(key);
}
#else /* !CONFIG_JUMP_LABEL */
/* Without jump labels there are no static keys to flip. */
static void sched_feat_disable(int i) { }
static void sched_feat_enable(int i) { }
#endif /* CONFIG_JUMP_LABEL */
|
2016-02-22 16:26:50 -05:00
|
|
|
|
|
|
|
static int sched_feat_set(char *cmp)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
int neg = 0;
|
|
|
|
|
|
|
|
if (strncmp(cmp, "NO_", 3) == 0) {
|
|
|
|
neg = 1;
|
|
|
|
cmp += 3;
|
|
|
|
}
|
|
|
|
|
2018-05-31 19:11:19 +08:00
|
|
|
i = match_string(sched_feat_names, __SCHED_FEAT_NR, cmp);
|
|
|
|
if (i < 0)
|
|
|
|
return i;
|
|
|
|
|
|
|
|
if (neg) {
|
|
|
|
sysctl_sched_features &= ~(1UL << i);
|
|
|
|
sched_feat_disable(i);
|
|
|
|
} else {
|
|
|
|
sysctl_sched_features |= (1UL << i);
|
|
|
|
sched_feat_enable(i);
|
2016-02-22 16:26:50 -05:00
|
|
|
}
|
|
|
|
|
2018-05-31 19:11:19 +08:00
|
|
|
return 0;
|
2016-02-22 16:26:50 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
static ssize_t
|
|
|
|
sched_feat_write(struct file *filp, const char __user *ubuf,
|
|
|
|
size_t cnt, loff_t *ppos)
|
|
|
|
{
|
|
|
|
char buf[64];
|
|
|
|
char *cmp;
|
2018-05-31 19:11:19 +08:00
|
|
|
int ret;
|
2016-02-22 16:26:50 -05:00
|
|
|
struct inode *inode;
|
|
|
|
|
|
|
|
if (cnt > 63)
|
|
|
|
cnt = 63;
|
|
|
|
|
|
|
|
if (copy_from_user(&buf, ubuf, cnt))
|
|
|
|
return -EFAULT;
|
|
|
|
|
|
|
|
buf[cnt] = 0;
|
|
|
|
cmp = strstrip(buf);
|
|
|
|
|
|
|
|
/* Ensure the static_key remains in a consistent state */
|
|
|
|
inode = file_inode(filp);
|
2018-07-31 21:12:22 +09:00
|
|
|
cpus_read_lock();
|
2016-02-22 16:26:50 -05:00
|
|
|
inode_lock(inode);
|
2018-05-31 19:11:19 +08:00
|
|
|
ret = sched_feat_set(cmp);
|
2016-02-22 16:26:50 -05:00
|
|
|
inode_unlock(inode);
|
2018-07-31 21:12:22 +09:00
|
|
|
cpus_read_unlock();
|
2018-05-31 19:11:19 +08:00
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
2016-02-22 16:26:50 -05:00
|
|
|
|
|
|
|
*ppos += cnt;
|
|
|
|
|
|
|
|
return cnt;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int sched_feat_open(struct inode *inode, struct file *filp)
|
|
|
|
{
|
|
|
|
return single_open(filp, sched_feat_show, NULL);
|
|
|
|
}
|
|
|
|
|
|
|
|
static const struct file_operations sched_feat_fops = {
|
|
|
|
.open = sched_feat_open,
|
|
|
|
.write = sched_feat_write,
|
|
|
|
.read = seq_read,
|
|
|
|
.llseek = seq_lseek,
|
|
|
|
.release = single_release,
|
|
|
|
};
|
|
|
|
|
2021-03-24 11:43:21 +01:00
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
|
|
|
|
/*
 * Write handler for the "tunable_scaling" file.
 *
 * Parses a decimal unsigned int from user space, rejects values at or above
 * SCHED_TUNABLESCALING_END, stores the value in sysctl_sched_tunable_scaling
 * and calls sched_update_scaling(). Returns the (possibly truncated) byte
 * count, -EFAULT if the copy fails, or -EINVAL on bad input or when
 * sched_update_scaling() reports failure.
 */
static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf,
				   size_t cnt, loff_t *ppos)
{
	unsigned int scaling;
	char buf[16];

	if (cnt > sizeof(buf) - 1)
		cnt = sizeof(buf) - 1;

	if (copy_from_user(buf, ubuf, cnt))
		return -EFAULT;
	buf[cnt] = '\0';

	if (kstrtouint(buf, 10, &scaling))
		return -EINVAL;

	if (scaling >= SCHED_TUNABLESCALING_END)
		return -EINVAL;

	sysctl_sched_tunable_scaling = scaling;
	if (sched_update_scaling())
		return -EINVAL;

	*ppos += cnt;

	return cnt;
}
|
|
|
|
|
|
|
|
static int sched_scaling_show(struct seq_file *m, void *v)
|
|
|
|
{
|
|
|
|
seq_printf(m, "%d\n", sysctl_sched_tunable_scaling);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int sched_scaling_open(struct inode *inode, struct file *filp)
|
|
|
|
{
|
|
|
|
return single_open(filp, sched_scaling_show, NULL);
|
|
|
|
}
|
|
|
|
|
|
|
|
static const struct file_operations sched_scaling_fops = {
|
|
|
|
.open = sched_scaling_open,
|
|
|
|
.write = sched_scaling_write,
|
|
|
|
.read = seq_read,
|
|
|
|
.llseek = seq_lseek,
|
|
|
|
.release = single_release,
|
|
|
|
};
|
|
|
|
|
|
|
|
#endif /* SMP */
|
|
|
|
|
2021-03-25 12:21:38 +01:00
|
|
|
#ifdef CONFIG_PREEMPT_DYNAMIC
|
|
|
|
|
|
|
|
/*
 * Write handler for the "preempt" file.
 *
 * Copies at most sizeof(buf) - 1 bytes from user space, strips surrounding
 * whitespace, maps the string to a mode with sched_dynamic_mode() and
 * applies it with sched_dynamic_update(). Returns the (possibly truncated)
 * byte count, -EFAULT if the copy fails, or the negative value returned by
 * sched_dynamic_mode().
 */
static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf,
				   size_t cnt, loff_t *ppos)
{
	char buf[16];
	int mode;

	if (cnt > sizeof(buf) - 1)
		cnt = sizeof(buf) - 1;

	if (copy_from_user(buf, ubuf, cnt))
		return -EFAULT;
	buf[cnt] = '\0';

	mode = sched_dynamic_mode(strstrip(buf));
	if (mode < 0)
		return mode;

	sched_dynamic_update(mode);

	*ppos += cnt;

	return cnt;
}
|
|
|
|
|
|
|
|
static int sched_dynamic_show(struct seq_file *m, void *v)
|
|
|
|
{
|
|
|
|
static const char * preempt_modes[] = {
|
2024-10-04 14:46:58 +02:00
|
|
|
"none", "voluntary", "full", "lazy",
|
2021-03-25 12:21:38 +01:00
|
|
|
};
|
2024-10-04 14:46:58 +02:00
|
|
|
int j = ARRAY_SIZE(preempt_modes) - !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY);
|
2024-10-04 14:46:56 +02:00
|
|
|
int i = IS_ENABLED(CONFIG_PREEMPT_RT) * 2;
|
2021-03-25 12:21:38 +01:00
|
|
|
|
2024-10-04 14:46:56 +02:00
|
|
|
for (; i < j; i++) {
|
2021-03-25 12:21:38 +01:00
|
|
|
if (preempt_dynamic_mode == i)
|
|
|
|
seq_puts(m, "(");
|
|
|
|
seq_puts(m, preempt_modes[i]);
|
|
|
|
if (preempt_dynamic_mode == i)
|
|
|
|
seq_puts(m, ")");
|
|
|
|
|
|
|
|
seq_puts(m, " ");
|
|
|
|
}
|
|
|
|
|
|
|
|
seq_puts(m, "\n");
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int sched_dynamic_open(struct inode *inode, struct file *filp)
|
|
|
|
{
|
|
|
|
return single_open(filp, sched_dynamic_show, NULL);
|
|
|
|
}
|
|
|
|
|
|
|
|
static const struct file_operations sched_dynamic_fops = {
|
|
|
|
.open = sched_dynamic_open,
|
|
|
|
.write = sched_dynamic_write,
|
|
|
|
.read = seq_read,
|
|
|
|
.llseek = seq_lseek,
|
|
|
|
.release = single_release,
|
|
|
|
};
|
|
|
|
|
|
|
|
#endif /* CONFIG_PREEMPT_DYNAMIC */
|
|
|
|
|
2021-04-15 18:23:17 +02:00
|
|
|
/*
 * Backing variable of the "verbose" debugfs file; while it is false,
 * update_sched_domain_debugfs() returns without creating anything.
 */
__read_mostly bool sched_debug_verbose;
|
2017-09-07 17:03:53 +02:00
|
|
|
|
2023-03-03 13:37:54 -05:00
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
/*
 * Dentry of the "domains" debugfs directory; NULL until it is created by
 * update_sched_domain_debugfs() and reset to NULL when it is removed.
 */
static struct dentry *sd_dentry;
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * Write handler for the "verbose" file.
 *
 * Lets debugfs_write_file_bool() update sched_debug_verbose and reacts to a
 * change of its value: on a false -> true transition the sched domain
 * debugfs entries are (re)built, on a true -> false transition the
 * "domains" directory is removed. Both happen under cpus_read_lock() and
 * sched_domains_mutex. Returns whatever debugfs_write_file_bool() returned.
 */
static ssize_t sched_verbose_write(struct file *filp, const char __user *ubuf,
				   size_t cnt, loff_t *ppos)
{
	bool was_verbose;
	ssize_t result;

	cpus_read_lock();
	mutex_lock(&sched_domains_mutex);

	was_verbose = sched_debug_verbose;
	result = debugfs_write_file_bool(filp, ubuf, cnt, ppos);

	if (was_verbose != sched_debug_verbose) {
		if (sched_debug_verbose) {
			update_sched_domain_debugfs();
		} else {
			debugfs_remove(sd_dentry);
			sd_dentry = NULL;
		}
	}

	mutex_unlock(&sched_domains_mutex);
	cpus_read_unlock();

	return result;
}
|
|
|
|
#else
|
|
|
|
/* !CONFIG_SMP: no sched domain entries to maintain, use the plain bool writer. */
#define sched_verbose_write debugfs_write_file_bool
|
|
|
|
#endif
|
|
|
|
|
|
|
|
static const struct file_operations sched_verbose_fops = {
|
|
|
|
.read = debugfs_read_file_bool,
|
|
|
|
.write = sched_verbose_write,
|
|
|
|
.open = simple_open,
|
|
|
|
.llseek = default_llseek,
|
|
|
|
};
|
|
|
|
|
2021-03-25 15:18:19 +01:00
|
|
|
/* Forward declaration: sched_debug_open() below needs the seq_operations. */
static const struct seq_operations sched_debug_sops;
|
|
|
|
|
|
|
|
static int sched_debug_open(struct inode *inode, struct file *filp)
|
|
|
|
{
|
|
|
|
return seq_open(filp, &sched_debug_sops);
|
|
|
|
}
|
|
|
|
|
|
|
|
static const struct file_operations sched_debug_fops = {
|
|
|
|
.open = sched_debug_open,
|
|
|
|
.read = seq_read,
|
|
|
|
.llseek = seq_lseek,
|
|
|
|
.release = seq_release,
|
|
|
|
};
|
|
|
|
|
2024-05-27 14:06:52 +02:00
|
|
|
/* Selects which fair server parameter a debugfs show/write helper operates on. */
enum dl_param {
	DL_RUNTIME = 0,		/* rq->fair_server.dl_runtime */
	DL_PERIOD,		/* rq->fair_server.dl_period */
};
|
|
|
|
|
2024-08-01 10:44:03 -05:00
|
|
|
/* Largest fair server period accepted by sched_fair_server_write(). */
static unsigned long fair_server_period_max = (1UL << 22) * NSEC_PER_USEC; /* ~4 seconds */
|
2024-05-27 14:06:52 +02:00
|
|
|
/* Smallest fair server period accepted by sched_fair_server_write(). */
static unsigned long fair_server_period_min = (100) * NSEC_PER_USEC; /* 100 us */
|
|
|
|
|
|
|
|
/*
 * Common write handler for the per-CPU fair server "runtime" and "period"
 * debugfs files.
 *
 * Parses a decimal u64 from @ubuf and uses it as the new runtime or period
 * (selected by @param) of the fair server of the CPU stored in the
 * seq_file's private pointer; the other parameter keeps its current value.
 *
 * Returns @cnt on success, the error from kstrtoull_from_user() on a parse
 * failure, -EINVAL when the resulting runtime/period pair is out of bounds,
 * or the non-zero value returned by dl_server_apply_params(). On any error
 * *@ppos is left untouched.
 */
static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubuf,
				       size_t cnt, loff_t *ppos, enum dl_param param)
{
	long cpu = (long) ((struct seq_file *) filp->private_data)->private;
	struct rq *rq = cpu_rq(cpu);
	u64 runtime, period;
	int retval = 0;
	u64 value;
	int err;

	/* kstrtoull_from_user() returns a signed int; keep it signed. */
	err = kstrtoull_from_user(ubuf, cnt, 10, &value);
	if (err)
		return err;

	scoped_guard (rq_lock_irqsave, rq) {
		runtime = rq->fair_server.dl_runtime;
		period = rq->fair_server.dl_period;

		switch (param) {
		case DL_RUNTIME:
			runtime = value;
			break;
		case DL_PERIOD:
			period = value;
			break;
		}

		if (runtime > period ||
		    period > fair_server_period_max ||
		    period < fair_server_period_min) {
			return -EINVAL;
		}

		/* Stop the server around the update while fair tasks are queued. */
		if (rq->cfs.h_nr_queued) {
			update_rq_clock(rq);
			dl_server_stop(&rq->fair_server);
		}

		retval = dl_server_apply_params(&rq->fair_server, runtime, period, 0);

		if (!runtime)
			printk_deferred("Fair server disabled in CPU %d, system may crash due to starvation.\n",
					cpu_of(rq));

		/* Restart the server even if applying the parameters failed. */
		if (rq->cfs.h_nr_queued)
			dl_server_start(&rq->fair_server);
	}

	/*
	 * Report the failure directly. Folding a negative error into the
	 * unsigned byte count would also add it to *ppos and corrupt the
	 * file position.
	 */
	if (retval)
		return retval;

	*ppos += cnt;
	return cnt;
}
|
|
|
|
|
|
|
|
static size_t sched_fair_server_show(struct seq_file *m, void *v, enum dl_param param)
|
|
|
|
{
|
|
|
|
unsigned long cpu = (unsigned long) m->private;
|
|
|
|
struct rq *rq = cpu_rq(cpu);
|
|
|
|
u64 value;
|
|
|
|
|
|
|
|
switch (param) {
|
|
|
|
case DL_RUNTIME:
|
|
|
|
value = rq->fair_server.dl_runtime;
|
|
|
|
break;
|
|
|
|
case DL_PERIOD:
|
|
|
|
value = rq->fair_server.dl_period;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
seq_printf(m, "%llu\n", value);
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
static ssize_t
|
|
|
|
sched_fair_server_runtime_write(struct file *filp, const char __user *ubuf,
|
|
|
|
size_t cnt, loff_t *ppos)
|
|
|
|
{
|
|
|
|
return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_RUNTIME);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int sched_fair_server_runtime_show(struct seq_file *m, void *v)
|
|
|
|
{
|
|
|
|
return sched_fair_server_show(m, v, DL_RUNTIME);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int sched_fair_server_runtime_open(struct inode *inode, struct file *filp)
|
|
|
|
{
|
|
|
|
return single_open(filp, sched_fair_server_runtime_show, inode->i_private);
|
|
|
|
}
|
|
|
|
|
|
|
|
static const struct file_operations fair_server_runtime_fops = {
|
|
|
|
.open = sched_fair_server_runtime_open,
|
|
|
|
.write = sched_fair_server_runtime_write,
|
|
|
|
.read = seq_read,
|
|
|
|
.llseek = seq_lseek,
|
|
|
|
.release = single_release,
|
|
|
|
};
|
|
|
|
|
|
|
|
static ssize_t
|
|
|
|
sched_fair_server_period_write(struct file *filp, const char __user *ubuf,
|
|
|
|
size_t cnt, loff_t *ppos)
|
|
|
|
{
|
|
|
|
return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_PERIOD);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int sched_fair_server_period_show(struct seq_file *m, void *v)
|
|
|
|
{
|
|
|
|
return sched_fair_server_show(m, v, DL_PERIOD);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int sched_fair_server_period_open(struct inode *inode, struct file *filp)
|
|
|
|
{
|
|
|
|
return single_open(filp, sched_fair_server_period_show, inode->i_private);
|
|
|
|
}
|
|
|
|
|
|
|
|
static const struct file_operations fair_server_period_fops = {
|
|
|
|
.open = sched_fair_server_period_open,
|
|
|
|
.write = sched_fair_server_period_write,
|
|
|
|
.read = seq_read,
|
|
|
|
.llseek = seq_lseek,
|
|
|
|
.release = single_release,
|
|
|
|
};
|
|
|
|
|
2021-03-25 12:21:38 +01:00
|
|
|
/*
 * Dentry of the top-level "sched" debugfs directory, set by
 * sched_init_debug(); NULL before that initcall has run.
 */
static struct dentry *debugfs_sched;
|
2021-03-24 11:43:21 +01:00
|
|
|
|
2024-05-27 14:06:52 +02:00
|
|
|
static void debugfs_fair_server_init(void)
|
|
|
|
{
|
|
|
|
struct dentry *d_fair;
|
|
|
|
unsigned long cpu;
|
|
|
|
|
|
|
|
d_fair = debugfs_create_dir("fair_server", debugfs_sched);
|
|
|
|
if (!d_fair)
|
|
|
|
return;
|
|
|
|
|
|
|
|
for_each_possible_cpu(cpu) {
|
|
|
|
struct dentry *d_cpu;
|
|
|
|
char buf[32];
|
|
|
|
|
|
|
|
snprintf(buf, sizeof(buf), "cpu%lu", cpu);
|
|
|
|
d_cpu = debugfs_create_dir(buf, d_fair);
|
|
|
|
|
|
|
|
debugfs_create_file("runtime", 0644, d_cpu, (void *) cpu, &fair_server_runtime_fops);
|
|
|
|
debugfs_create_file("period", 0644, d_cpu, (void *) cpu, &fair_server_period_fops);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-02-22 16:26:50 -05:00
|
|
|
static __init int sched_init_debug(void)
|
|
|
|
{
|
2021-03-24 11:43:21 +01:00
|
|
|
struct dentry __maybe_unused *numa;
|
2016-02-22 16:26:50 -05:00
|
|
|
|
2021-03-24 11:43:21 +01:00
|
|
|
debugfs_sched = debugfs_create_dir("sched", NULL);
|
|
|
|
|
|
|
|
debugfs_create_file("features", 0644, debugfs_sched, NULL, &sched_feat_fops);
|
2023-03-03 13:37:54 -05:00
|
|
|
debugfs_create_file_unsafe("verbose", 0644, debugfs_sched, &sched_debug_verbose, &sched_verbose_fops);
|
2021-03-25 12:21:38 +01:00
|
|
|
#ifdef CONFIG_PREEMPT_DYNAMIC
|
|
|
|
debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops);
|
|
|
|
#endif
|
2021-03-24 11:43:21 +01:00
|
|
|
|
2023-05-31 13:58:48 +02:00
|
|
|
debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice);
|
2021-03-24 11:43:21 +01:00
|
|
|
|
2021-04-16 14:29:36 -07:00
|
|
|
debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms);
|
|
|
|
debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once);
|
|
|
|
|
2021-03-24 11:43:21 +01:00
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops);
|
|
|
|
debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost);
|
|
|
|
debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate);
|
2021-03-25 11:31:20 +01:00
|
|
|
|
|
|
|
mutex_lock(&sched_domains_mutex);
|
|
|
|
update_sched_domain_debugfs();
|
|
|
|
mutex_unlock(&sched_domains_mutex);
|
2021-03-24 11:43:21 +01:00
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifdef CONFIG_NUMA_BALANCING
|
|
|
|
numa = debugfs_create_dir("numa_balancing", debugfs_sched);
|
|
|
|
|
|
|
|
debugfs_create_u32("scan_delay_ms", 0644, numa, &sysctl_numa_balancing_scan_delay);
|
|
|
|
debugfs_create_u32("scan_period_min_ms", 0644, numa, &sysctl_numa_balancing_scan_period_min);
|
|
|
|
debugfs_create_u32("scan_period_max_ms", 0644, numa, &sysctl_numa_balancing_scan_period_max);
|
|
|
|
debugfs_create_u32("scan_size_mb", 0644, numa, &sysctl_numa_balancing_scan_size);
|
memory tiering: hot page selection with hint page fault latency
Patch series "memory tiering: hot page selection", v4.
To optimize page placement in a memory tiering system with NUMA balancing,
the hot pages in the slow memory nodes need to be identified.
Essentially, the original NUMA balancing implementation selects the mostly
recently accessed (MRU) pages to promote. But this isn't a perfect
algorithm to identify the hot pages. Because the pages with quite low
access frequency may be accessed eventually given the NUMA balancing page
table scanning period could be quite long (e.g. 60 seconds). So in this
patchset, we implement a new hot page identification algorithm based on
the latency between NUMA balancing page table scanning and hint page
fault. Which is a kind of mostly frequently accessed (MFU) algorithm.
In NUMA balancing memory tiering mode, if there are hot pages in slow
memory node and cold pages in fast memory node, we need to promote/demote
hot/cold pages between the fast and cold memory nodes.
A choice is to promote/demote as fast as possible. But the CPU cycles and
memory bandwidth consumed by the high promoting/demoting throughput will
hurt the latency of some workload because of accessing inflating and slow
memory bandwidth contention.
A way to resolve this issue is to restrict the max promoting/demoting
throughput. It will take longer to finish the promoting/demoting. But
the workload latency will be better. This is implemented in this patchset
as the page promotion rate limit mechanism.
The promotion hot threshold is workload and system configuration
dependent. So in this patchset, a method to adjust the hot threshold
automatically is implemented. The basic idea is to control the number of
the candidate promotion pages to match the promotion rate limit.
We used the pmbench memory accessing benchmark tested the patchset on a
2-socket server system with DRAM and PMEM installed. The test results are
as follows,
pmbench score promote rate
(accesses/s) MB/s
------------- ------------
base 146887704.1 725.6
hot selection 165695601.2 544.0
rate limit 162814569.8 165.2
auto adjustment 170495294.0 136.9
From the results above,
With hot page selection patch [1/3], the pmbench score increases about
12.8%, and promote rate (overhead) decreases about 25.0%, compared with
base kernel.
With rate limit patch [2/3], pmbench score decreases about 1.7%, and
promote rate decreases about 69.6%, compared with hot page selection
patch.
With threshold auto adjustment patch [3/3], pmbench score increases about
4.7%, and promote rate decrease about 17.1%, compared with rate limit
patch.
Baolin helped to test the patchset with MySQL on a machine which contains
1 DRAM node (30G) and 1 PMEM node (126G).
sysbench /usr/share/sysbench/oltp_read_write.lua \
......
--tables=200 \
--table-size=1000000 \
--report-interval=10 \
--threads=16 \
--time=120
The tps can be improved about 5%.
This patch (of 3):
To optimize page placement in a memory tiering system with NUMA balancing,
the hot pages in the slow memory node need to be identified. Essentially,
the original NUMA balancing implementation selects the mostly recently
accessed (MRU) pages to promote. But this isn't a perfect algorithm to
identify the hot pages. Because the pages with quite low access frequency
may be accessed eventually given the NUMA balancing page table scanning
period could be quite long (e.g. 60 seconds). The most frequently
accessed (MFU) algorithm is better.
So, in this patch we implemented a better hot page selection algorithm.
Which is based on NUMA balancing page table scanning and hint page fault
as follows,
- When the page tables of the processes are scanned to change PTE/PMD
to be PROT_NONE, the current time is recorded in struct page as scan
time.
- When the page is accessed, hint page fault will occur. The scan
time is gotten from the struct page. And The hint page fault
latency is defined as
hint page fault time - scan time
The shorter the hint page fault latency of a page is, the higher the
probability of their access frequency to be higher. So the hint page
fault latency is a better estimation of the page hot/cold.
It's hard to find some extra space in struct page to hold the scan time.
Fortunately, we can reuse some bits used by the original NUMA balancing.
NUMA balancing uses some bits in struct page to store the page accessing
CPU and PID (referring to page_cpupid_xchg_last()). Which is used by the
multi-stage node selection algorithm to avoid to migrate pages shared
accessed by the NUMA nodes back and forth. But for pages in the slow
memory node, even if they are shared accessed by multiple NUMA nodes, as
long as the pages are hot, they need to be promoted to the fast memory
node. So the accessing CPU and PID information are unnecessary for the
slow memory pages. We can reuse these bits in struct page to record the
scan time. For the fast memory pages, these bits are used as before.
For the hot threshold, the default value is 1 second, which works well in
our performance test. All pages with hint page fault latency < hot
threshold will be considered hot.
It's hard for users to determine the hot threshold. So we don't provide a
kernel ABI to set it, just provide a debugfs interface for advanced users
to experiment. We will continue to work on a hot threshold automatic
adjustment mechanism.
The downside of the above method is that the response time to the workload
hot spot changing may be much longer. For example,
- A previous cold memory area becomes hot
- The hint page fault will be triggered. But the hint page fault
latency isn't shorter than the hot threshold. So the pages will
not be promoted.
- When the memory area is scanned again, maybe after a scan period,
the hint page fault latency measured will be shorter than the hot
threshold and the pages will be promoted.
To mitigate this, if there are enough free space in the fast memory node,
the hot threshold will not be used, all pages will be promoted upon the
hint page fault for fast response.
Thanks Zhong Jiang reported and tested the fix for a bug when disabling
memory tiering mode dynamically.
Link: https://lkml.kernel.org/r/20220713083954.34196-1-ying.huang@intel.com
Link: https://lkml.kernel.org/r/20220713083954.34196-2-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Tested-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Wei Xu <weixugc@google.com>
Cc: osalvador <osalvador@suse.de>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Zhong Jiang <zhongjiang-ali@linux.alibaba.com>
Cc: Oscar Salvador <osalvador@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-07-13 16:39:51 +08:00
|
|
|
debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold);
|
2021-03-24 11:43:21 +01:00
|
|
|
#endif
|
2017-09-07 17:03:53 +02:00
|
|
|
|
2021-03-25 15:18:19 +01:00
|
|
|
debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
|
|
|
|
|
2024-05-27 14:06:52 +02:00
|
|
|
debugfs_fair_server_init();
|
|
|
|
|
2016-02-22 16:26:50 -05:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
/* Register sched_init_debug() as a late initcall. */
late_initcall(sched_init_debug);
|
|
|
|
|
2016-02-22 16:26:51 -05:00
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
|
2021-03-25 11:31:20 +01:00
|
|
|
/*
 * CPUs whose sched domain debugfs entries need to be (re)built: bits are
 * set by dirty_sched_domain_sysctl() and cleared again by
 * update_sched_domain_debugfs() once a CPU has been processed.
 */
static cpumask_var_t sd_sysctl_cpus;
|
2016-02-22 16:26:51 -05:00
|
|
|
|
2021-03-25 11:31:20 +01:00
|
|
|
static int sd_flags_show(struct seq_file *m, void *v)
|
2020-08-17 12:29:52 +01:00
|
|
|
{
|
2021-03-25 11:31:20 +01:00
|
|
|
unsigned long flags = *(unsigned int *)m->private;
|
2020-08-17 12:29:52 +01:00
|
|
|
int idx;
|
|
|
|
|
|
|
|
for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
|
2021-03-25 11:31:20 +01:00
|
|
|
seq_puts(m, sd_flag_debug[idx].name);
|
|
|
|
seq_puts(m, " ");
|
2020-08-17 12:29:52 +01:00
|
|
|
}
|
2021-03-25 11:31:20 +01:00
|
|
|
seq_puts(m, "\n");
|
2020-08-17 12:29:52 +01:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2021-03-25 11:31:20 +01:00
|
|
|
static int sd_flags_open(struct inode *inode, struct file *file)
|
2016-02-22 16:26:51 -05:00
|
|
|
{
|
2021-03-25 11:31:20 +01:00
|
|
|
return single_open(file, sd_flags_show, inode->i_private);
|
2016-02-22 16:26:51 -05:00
|
|
|
}
|
|
|
|
|
2021-03-25 11:31:20 +01:00
|
|
|
static const struct file_operations sd_flags_fops = {
|
|
|
|
.open = sd_flags_open,
|
|
|
|
.read = seq_read,
|
|
|
|
.llseek = seq_lseek,
|
|
|
|
.release = single_release,
|
|
|
|
};
|
2017-08-10 17:10:26 +02:00
|
|
|
|
2021-03-25 11:31:20 +01:00
|
|
|
static void register_sd(struct sched_domain *sd, struct dentry *parent)
|
2016-02-22 16:26:51 -05:00
|
|
|
{
|
2021-03-25 11:31:20 +01:00
|
|
|
#define SDM(type, mode, member) \
|
|
|
|
debugfs_create_##type(#member, mode, parent, &sd->member)
|
2016-02-22 16:26:51 -05:00
|
|
|
|
2021-03-25 11:31:20 +01:00
|
|
|
SDM(ulong, 0644, min_interval);
|
|
|
|
SDM(ulong, 0644, max_interval);
|
|
|
|
SDM(u64, 0644, max_newidle_lb_cost);
|
|
|
|
SDM(u32, 0644, busy_factor);
|
|
|
|
SDM(u32, 0644, imbalance_pct);
|
|
|
|
SDM(u32, 0644, cache_nice_tries);
|
|
|
|
SDM(str, 0444, name);
|
2016-02-22 16:26:51 -05:00
|
|
|
|
2021-03-25 11:31:20 +01:00
|
|
|
#undef SDM
|
2017-08-10 17:10:26 +02:00
|
|
|
|
2021-03-25 11:31:20 +01:00
|
|
|
debugfs_create_file("flags", 0444, parent, &sd->flags, &sd_flags_fops);
|
2023-07-07 15:57:05 -07:00
|
|
|
debugfs_create_file("groups_flags", 0444, parent, &sd->groups->flags, &sd_flags_fops);
|
2024-04-30 18:05:24 +03:00
|
|
|
debugfs_create_u32("level", 0444, parent, (u32 *)&sd->level);
|
2021-03-25 11:31:20 +01:00
|
|
|
}
|
2017-08-10 17:10:26 +02:00
|
|
|
|
2021-03-25 11:31:20 +01:00
|
|
|
void update_sched_domain_debugfs(void)
|
|
|
|
{
|
|
|
|
int cpu, i;
|
2017-08-10 17:10:26 +02:00
|
|
|
|
sched/debug: Don't update sched_domain debug directories before sched_debug_init()
Since CPU capacity asymmetry can stem purely from maximum frequency
differences (e.g. Pixel 1), a rebuild of the scheduler topology can be
issued upon loading cpufreq, see:
arch_topology.c::init_cpu_capacity_callback()
Turns out that if this rebuild happens *before* sched_debug_init() is
run (which is a late initcall), we end up messing up the sched_domain debug
directory: passing a NULL parent to debugfs_create_dir() ends up creating
the directory at the debugfs root, which in this case creates
/sys/kernel/debug/domains (instead of /sys/kernel/debug/sched/domains).
This currently doesn't happen on asymmetric systems which use cpufreq-scpi
or cpufreq-dt drivers, as those are loaded via
deferred_probe_initcall() (it is also a late initcall, but appears to be
ordered *after* sched_debug_init()).
Ionela has been working on detecting maximum frequency asymmetry via ACPI,
and that actually happens via a *device* initcall, thus before
sched_debug_init(), and causes the aforementionned debugfs mayhem.
One option would be to punt sched_debug_init() down to
fs_initcall_sync(). Preventing update_sched_domain_debugfs() from running
before sched_debug_init() appears to be the safer option.
Fixes: 3b87f136f8fc ("sched,debug: Convert sysctl sched_domains to debugfs")
Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: http://lore.kernel.org/r/20210514095339.12979-1-ionela.voinescu@arm.com
2021-05-18 14:07:25 +01:00
|
|
|
/*
|
|
|
|
* This can unfortunately be invoked before sched_debug_init() creates
|
|
|
|
* the debug directory. Don't touch sd_sysctl_cpus until then.
|
|
|
|
*/
|
|
|
|
if (!debugfs_sched)
|
|
|
|
return;
|
|
|
|
|
2023-03-03 13:37:54 -05:00
|
|
|
if (!sched_debug_verbose)
|
|
|
|
return;
|
|
|
|
|
2017-08-10 17:10:26 +02:00
|
|
|
if (!cpumask_available(sd_sysctl_cpus)) {
|
|
|
|
if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL))
|
|
|
|
return;
|
|
|
|
cpumask_copy(sd_sysctl_cpus, cpu_possible_mask);
|
|
|
|
}
|
|
|
|
|
2023-03-03 13:37:54 -05:00
|
|
|
if (!sd_dentry) {
|
2021-03-25 11:31:20 +01:00
|
|
|
sd_dentry = debugfs_create_dir("domains", debugfs_sched);
|
|
|
|
|
2023-03-03 13:37:54 -05:00
|
|
|
/* rebuild sd_sysctl_cpus if empty since it gets cleared below */
|
|
|
|
if (cpumask_empty(sd_sysctl_cpus))
|
|
|
|
cpumask_copy(sd_sysctl_cpus, cpu_online_mask);
|
|
|
|
}
|
|
|
|
|
2021-03-25 11:31:20 +01:00
|
|
|
for_each_cpu(cpu, sd_sysctl_cpus) {
|
|
|
|
struct sched_domain *sd;
|
|
|
|
struct dentry *d_cpu;
|
|
|
|
char buf[32];
|
|
|
|
|
|
|
|
snprintf(buf, sizeof(buf), "cpu%d", cpu);
|
2022-09-02 14:31:07 +02:00
|
|
|
debugfs_lookup_and_remove(buf, sd_dentry);
|
2021-03-25 11:31:20 +01:00
|
|
|
d_cpu = debugfs_create_dir(buf, sd_dentry);
|
|
|
|
|
|
|
|
i = 0;
|
|
|
|
for_each_domain(cpu, sd) {
|
|
|
|
struct dentry *d_sd;
|
2017-08-10 17:10:26 +02:00
|
|
|
|
2021-03-25 11:31:20 +01:00
|
|
|
snprintf(buf, sizeof(buf), "domain%d", i);
|
|
|
|
d_sd = debugfs_create_dir(buf, d_cpu);
|
2017-08-10 17:10:26 +02:00
|
|
|
|
2021-03-25 11:31:20 +01:00
|
|
|
register_sd(sd, d_sd);
|
|
|
|
i++;
|
2017-08-10 17:10:26 +02:00
|
|
|
}
|
|
|
|
|
2021-03-25 11:31:20 +01:00
|
|
|
__cpumask_clear_cpu(cpu, sd_sysctl_cpus);
|
2016-02-22 16:26:51 -05:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-08-10 17:10:26 +02:00
|
|
|
void dirty_sched_domain_sysctl(int cpu)
|
|
|
|
{
|
|
|
|
if (cpumask_available(sd_sysctl_cpus))
|
|
|
|
__cpumask_set_cpu(cpu, sd_sysctl_cpus);
|
|
|
|
}
|
|
|
|
|
2016-02-22 16:26:51 -05:00
|
|
|
#endif /* CONFIG_SMP */
|
|
|
|
|
2008-11-10 21:34:09 +05:30
|
|
|
#ifdef CONFIG_FAIR_GROUP_SCHED
|
sched: Add 'autogroup' scheduling feature: automated per session task groups
A recurring complaint from CFS users is that parallel kbuild has
a negative impact on desktop interactivity. This patch
implements an idea from Linus, to automatically create task
groups. Currently, only per session autogroups are implemented,
but the patch leaves the way open for enhancement.
Implementation: each task's signal struct contains an inherited
pointer to a refcounted autogroup struct containing a task group
pointer, the default for all tasks pointing to the
init_task_group. When a task calls setsid(), a new task group
is created, the process is moved into the new task group, and a
reference to the previous task group is dropped. Child
processes inherit this task group thereafter, and increase its
refcount. When the last thread of a process exits, the
process's reference is dropped, such that when the last process
referencing an autogroup exits, the autogroup is destroyed.
At runqueue selection time, IFF a task has no cgroup assignment,
its current autogroup is used.
Autogroup bandwidth is controllable via setting its nice level
through the proc filesystem:
cat /proc/<pid>/autogroup
Displays the task's group and the group's nice level.
echo <nice level> > /proc/<pid>/autogroup
Sets the task group's shares to the weight of nice <level> task.
Setting nice level is rate limited for !admin users due to the
abuse risk of task group locking.
The feature is enabled from boot by default if
CONFIG_SCHED_AUTOGROUP=y is selected, but can be disabled via
the boot option noautogroup, and can also be turned on/off on
the fly via:
echo [01] > /proc/sys/kernel/sched_autogroup_enabled
... which will automatically move tasks to/from the root task group.
Signed-off-by: Mike Galbraith <efault@gmx.de>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Markus Trippelsdorf <markus@trippelsdorf.de>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Paul Turner <pjt@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
[ Removed the task_group_path() debug code, and fixed !EVENTFD build failure. ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <1290281700.28711.9.camel@maggy.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-11-30 14:18:03 +01:00
|
|
|
static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
|
2008-11-10 21:34:09 +05:30
|
|
|
{
|
|
|
|
struct sched_entity *se = tg->se[cpu];
|
|
|
|
|
sched: Clean up and harmonize the coding style of the scheduler code base
A good number of small style inconsistencies have accumulated
in the scheduler core, so do a pass over them to harmonize
all these details:
- fix speling in comments,
- use curly braces for multi-line statements,
- remove unnecessary parentheses from integer literals,
- capitalize consistently,
- remove stray newlines,
- add comments where necessary,
- remove invalid/unnecessary comments,
- align structure definitions and other data types vertically,
- add missing newlines for increased readability,
- fix vertical tabulation where it's misaligned,
- harmonize preprocessor conditional block labeling
and vertical alignment,
- remove line-breaks where they uglify the code,
- add newline after local variable definitions,
No change in functionality:
md5:
1191fa0a890cfa8132156d2959d7e9e2 built-in.o.before.asm
1191fa0a890cfa8132156d2959d7e9e2 built-in.o.after.asm
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2018-03-03 14:01:12 +01:00
|
|
|
#define P(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
|
sched: Make struct sched_statistics independent of fair sched class
If we want to use the schedstats facility to trace other sched classes, we
should make it independent of fair sched class. The struct sched_statistics
is the schedular statistics of a task_struct or a task_group. So we can
move it into struct task_struct and struct task_group to achieve the goal.
After the patch, schestats are orgnized as follows,
struct task_struct {
...
struct sched_entity se;
struct sched_rt_entity rt;
struct sched_dl_entity dl;
...
struct sched_statistics stats;
...
};
Regarding the task group, schedstats is only supported for fair group
sched, and a new struct sched_entity_stats is introduced, suggested by
Peter -
struct sched_entity_stats {
struct sched_entity se;
struct sched_statistics stats;
} __no_randomize_layout;
Then with the se in a task_group, we can easily get the stats.
The sched_statistics members may be frequently modified when schedstats is
enabled, in order to avoid impacting on random data which may in the same
cacheline with them, the struct sched_statistics is defined as cacheline
aligned.
As this patch changes the core struct of scheduler, so I verified the
performance it may impact on the scheduler with 'perf bench sched
pipe', suggested by Mel. Below is the result, in which all the values
are in usecs/op.
Before After
kernel.sched_schedstats=0 5.2~5.4 5.2~5.4
kernel.sched_schedstats=1 5.3~5.5 5.3~5.5
[These data is a little difference with the earlier version, that is
because my old test machine is destroyed so I have to use a new
different test machine.]
Almost no impact on the sched performance.
No functional change.
[lkp@intel.com: reported build failure in earlier version]
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mel Gorman <mgorman@suse.de>
Link: https://lore.kernel.org/r/20210905143547.4668-3-laoar.shao@gmail.com
2021-09-05 14:35:41 +00:00
|
|
|
#define P_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld\n", \
|
|
|
|
#F, (long long)schedstat_val(stats->F))
|
sched: Clean up and harmonize the coding style of the scheduler code base
A good number of small style inconsistencies have accumulated
in the scheduler core, so do a pass over them to harmonize
all these details:
- fix speling in comments,
- use curly braces for multi-line statements,
- remove unnecessary parentheses from integer literals,
- capitalize consistently,
- remove stray newlines,
- add comments where necessary,
- remove invalid/unnecessary comments,
- align structure definitions and other data types vertically,
- add missing newlines for increased readability,
- fix vertical tabulation where it's misaligned,
- harmonize preprocessor conditional block labeling
and vertical alignment,
- remove line-breaks where they uglify the code,
- add newline after local variable definitions,
No change in functionality:
md5:
1191fa0a890cfa8132156d2959d7e9e2 built-in.o.before.asm
1191fa0a890cfa8132156d2959d7e9e2 built-in.o.after.asm
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2018-03-03 14:01:12 +01:00
|
|
|
#define PN(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
|
sched: Make struct sched_statistics independent of fair sched class
If we want to use the schedstats facility to trace other sched classes, we
should make it independent of fair sched class. The struct sched_statistics
is the schedular statistics of a task_struct or a task_group. So we can
move it into struct task_struct and struct task_group to achieve the goal.
After the patch, schestats are orgnized as follows,
struct task_struct {
...
struct sched_entity se;
struct sched_rt_entity rt;
struct sched_dl_entity dl;
...
struct sched_statistics stats;
...
};
Regarding the task group, schedstats is only supported for fair group
sched, and a new struct sched_entity_stats is introduced, suggested by
Peter -
struct sched_entity_stats {
struct sched_entity se;
struct sched_statistics stats;
} __no_randomize_layout;
Then with the se in a task_group, we can easily get the stats.
The sched_statistics members may be frequently modified when schedstats is
enabled, in order to avoid impacting on random data which may in the same
cacheline with them, the struct sched_statistics is defined as cacheline
aligned.
As this patch changes the core struct of scheduler, so I verified the
performance it may impact on the scheduler with 'perf bench sched
pipe', suggested by Mel. Below is the result, in which all the values
are in usecs/op.
Before After
kernel.sched_schedstats=0 5.2~5.4 5.2~5.4
kernel.sched_schedstats=1 5.3~5.5 5.3~5.5
[These data is a little difference with the earlier version, that is
because my old test machine is destroyed so I have to use a new
different test machine.]
Almost no impact on the sched performance.
No functional change.
[lkp@intel.com: reported build failure in earlier version]
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mel Gorman <mgorman@suse.de>
Link: https://lore.kernel.org/r/20210905143547.4668-3-laoar.shao@gmail.com
2021-09-05 14:35:41 +00:00
|
|
|
#define PN_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", \
|
|
|
|
#F, SPLIT_NS((long long)schedstat_val(stats->F)))
|
2008-11-10 21:34:09 +05:30
|
|
|
|
2015-07-15 08:04:36 +08:00
|
|
|
if (!se)
|
2012-10-04 12:51:20 +02:00
|
|
|
return;
|
|
|
|
|
2008-11-10 21:34:09 +05:30
|
|
|
PN(se->exec_start);
|
|
|
|
PN(se->vruntime);
|
|
|
|
PN(se->sum_exec_runtime);
|
sched: Clean up and harmonize the coding style of the scheduler code base
A good number of small style inconsistencies have accumulated
in the scheduler core, so do a pass over them to harmonize
all these details:
- fix speling in comments,
- use curly braces for multi-line statements,
- remove unnecessary parentheses from integer literals,
- capitalize consistently,
- remove stray newlines,
- add comments where necessary,
- remove invalid/unnecessary comments,
- align structure definitions and other data types vertically,
- add missing newlines for increased readability,
- fix vertical tabulation where it's misaligned,
- harmonize preprocessor conditional block labeling
and vertical alignment,
- remove line-breaks where they uglify the code,
- add newline after local variable definitions,
No change in functionality:
md5:
1191fa0a890cfa8132156d2959d7e9e2 built-in.o.before.asm
1191fa0a890cfa8132156d2959d7e9e2 built-in.o.after.asm
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2018-03-03 14:01:12 +01:00
|
|
|
|
2016-02-05 09:08:36 +00:00
|
|
|
if (schedstat_enabled()) {
|
2021-10-06 10:12:05 +02:00
|
|
|
struct sched_statistics *stats;
|
|
|
|
stats = __schedstats_from_se(se);
|
sched: Make struct sched_statistics independent of fair sched class
If we want to use the schedstats facility to trace other sched classes, we
should make it independent of fair sched class. The struct sched_statistics
is the schedular statistics of a task_struct or a task_group. So we can
move it into struct task_struct and struct task_group to achieve the goal.
After the patch, schestats are orgnized as follows,
struct task_struct {
...
struct sched_entity se;
struct sched_rt_entity rt;
struct sched_dl_entity dl;
...
struct sched_statistics stats;
...
};
Regarding the task group, schedstats is only supported for fair group
sched, and a new struct sched_entity_stats is introduced, suggested by
Peter -
struct sched_entity_stats {
struct sched_entity se;
struct sched_statistics stats;
} __no_randomize_layout;
Then with the se in a task_group, we can easily get the stats.
The sched_statistics members may be frequently modified when schedstats is
enabled, in order to avoid impacting on random data which may in the same
cacheline with them, the struct sched_statistics is defined as cacheline
aligned.
As this patch changes the core struct of scheduler, so I verified the
performance it may impact on the scheduler with 'perf bench sched
pipe', suggested by Mel. Below is the result, in which all the values
are in usecs/op.
Before After
kernel.sched_schedstats=0 5.2~5.4 5.2~5.4
kernel.sched_schedstats=1 5.3~5.5 5.3~5.5
[These data is a little difference with the earlier version, that is
because my old test machine is destroyed so I have to use a new
different test machine.]
Almost no impact on the sched performance.
No functional change.
[lkp@intel.com: reported build failure in earlier version]
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mel Gorman <mgorman@suse.de>
Link: https://lore.kernel.org/r/20210905143547.4668-3-laoar.shao@gmail.com
2021-09-05 14:35:41 +00:00
|
|
|
|
|
|
|
PN_SCHEDSTAT(wait_start);
|
|
|
|
PN_SCHEDSTAT(sleep_start);
|
|
|
|
PN_SCHEDSTAT(block_start);
|
|
|
|
PN_SCHEDSTAT(sleep_max);
|
|
|
|
PN_SCHEDSTAT(block_max);
|
|
|
|
PN_SCHEDSTAT(exec_max);
|
|
|
|
PN_SCHEDSTAT(slice_max);
|
|
|
|
PN_SCHEDSTAT(wait_max);
|
|
|
|
PN_SCHEDSTAT(wait_sum);
|
|
|
|
P_SCHEDSTAT(wait_count);
|
2016-02-05 09:08:36 +00:00
|
|
|
}
|
sched: Clean up and harmonize the coding style of the scheduler code base
A good number of small style inconsistencies have accumulated
in the scheduler core, so do a pass over them to harmonize
all these details:
- fix speling in comments,
- use curly braces for multi-line statements,
- remove unnecessary parentheses from integer literals,
- capitalize consistently,
- remove stray newlines,
- add comments where necessary,
- remove invalid/unnecessary comments,
- align structure definitions and other data types vertically,
- add missing newlines for increased readability,
- fix vertical tabulation where it's misaligned,
- harmonize preprocessor conditional block labeling
and vertical alignment,
- remove line-breaks where they uglify the code,
- add newline after local variable definitions,
No change in functionality:
md5:
1191fa0a890cfa8132156d2959d7e9e2 built-in.o.before.asm
1191fa0a890cfa8132156d2959d7e9e2 built-in.o.after.asm
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2018-03-03 14:01:12 +01:00
|
|
|
|
2008-11-10 21:34:09 +05:30
|
|
|
P(se->load.weight);
|
2012-10-04 13:18:29 +02:00
|
|
|
#ifdef CONFIG_SMP
|
sched/fair: Rewrite runnable load and utilization average tracking
The idea of runnable load average (let runnable time contribute to weight)
was proposed by Paul Turner and Ben Segall, and it is still followed by
this rewrite. This rewrite aims to solve the following issues:
1. cfs_rq's load average (namely runnable_load_avg and blocked_load_avg) is
updated at the granularity of an entity at a time, which results in the
cfs_rq's load average is stale or partially updated: at any time, only
one entity is up to date, all other entities are effectively lagging
behind. This is undesirable.
To illustrate, if we have n runnable entities in the cfs_rq, as time
elapses, they certainly become outdated:
t0: cfs_rq { e1_old, e2_old, ..., en_old }
and when we update:
t1: update e1, then we have cfs_rq { e1_new, e2_old, ..., en_old }
t2: update e2, then we have cfs_rq { e1_old, e2_new, ..., en_old }
...
We solve this by combining all runnable entities' load averages together
in cfs_rq's avg, and update the cfs_rq's avg as a whole. This is based
on the fact that if we regard the update as a function, then:
w * update(e) = update(w * e) and
update(e1) + update(e2) = update(e1 + e2), then
w1 * update(e1) + w2 * update(e2) = update(w1 * e1 + w2 * e2)
therefore, by this rewrite, we have an entirely updated cfs_rq at the
time we update it:
t1: update cfs_rq { e1_new, e2_new, ..., en_new }
t2: update cfs_rq { e1_new, e2_new, ..., en_new }
...
2. cfs_rq's load average is different between top rq->cfs_rq and other
task_group's per CPU cfs_rqs in whether or not blocked_load_average
contributes to the load.
The basic idea behind runnable load average (the same for utilization)
is that the blocked state is taken into account as opposed to only
accounting for the currently runnable state. Therefore, the average
should include both the runnable/running and blocked load averages.
This rewrite does that.
In addition, we also combine runnable/running and blocked averages
of all entities into the cfs_rq's average, and update it together at
once. This is based on the fact that:
update(runnable) + update(blocked) = update(runnable + blocked)
This significantly reduces the code as we don't need to separately
maintain/update runnable/running load and blocked load.
3. How task_group entities' share is calculated is complex and imprecise.
We reduce the complexity in this rewrite to allow a very simple rule:
the task_group's load_avg is aggregated from its per CPU cfs_rqs's
load_avgs. Then group entity's weight is simply proportional to its
own cfs_rq's load_avg / task_group's load_avg. To illustrate,
if a task_group has { cfs_rq1, cfs_rq2, ..., cfs_rqn }, then,
task_group_avg = cfs_rq1_avg + cfs_rq2_avg + ... + cfs_rqn_avg, then
cfs_rqx's entity's share = cfs_rqx_avg / task_group_avg * task_group's share
To sum up, this rewrite in principle is equivalent to the current one, but
fixes the issues described above. Turns out, it significantly reduces the
code complexity and hence increases clarity and efficiency. In addition,
the new averages are more smooth/continuous (no spurious spikes and valleys)
and updated more consistently and quickly to reflect the load dynamics.
As a result, we have less load tracking overhead, better performance,
and especially better power efficiency due to more balanced load.
Signed-off-by: Yuyang Du <yuyang.du@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: arjan@linux.intel.com
Cc: bsegall@google.com
Cc: dietmar.eggemann@arm.com
Cc: fengguang.wu@intel.com
Cc: len.brown@intel.com
Cc: morten.rasmussen@arm.com
Cc: pjt@google.com
Cc: rafael.j.wysocki@intel.com
Cc: umgwanakikbuti@gmail.com
Cc: vincent.guittot@linaro.org
Link: http://lkml.kernel.org/r/1436918682-4971-3-git-send-email-yuyang.du@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-07-15 08:04:37 +08:00
|
|
|
P(se->avg.load_avg);
|
|
|
|
P(se->avg.util_avg);
|
2020-02-24 09:52:18 +00:00
|
|
|
P(se->avg.runnable_avg);
|
2012-10-04 13:18:29 +02:00
|
|
|
#endif
|
2016-06-17 12:43:26 -05:00
|
|
|
|
|
|
|
#undef PN_SCHEDSTAT
|
2008-11-10 21:34:09 +05:30
|
|
|
#undef PN
|
2016-06-17 12:43:26 -05:00
|
|
|
#undef P_SCHEDSTAT
|
2008-11-10 21:34:09 +05:30
|
|
|
#undef P
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2011-01-11 15:41:54 +05:30
|
|
|
#ifdef CONFIG_CGROUP_SCHED
|
2021-04-15 15:54:26 -04:00
|
|
|
static DEFINE_SPINLOCK(sched_debug_lock);
|
2011-01-11 15:41:54 +05:30
|
|
|
static char group_path[PATH_MAX];
|
|
|
|
|
2021-04-15 15:54:26 -04:00
|
|
|
static void task_group_path(struct task_group *tg, char *path, int plen)
|
2011-01-11 15:41:54 +05:30
|
|
|
{
|
2021-04-15 15:54:26 -04:00
|
|
|
if (autogroup_path(tg, path, plen))
|
|
|
|
return;
|
2011-01-11 15:42:57 +05:30
|
|
|
|
2021-04-15 15:54:26 -04:00
|
|
|
cgroup_path(tg->css.cgroup, path, plen);
|
|
|
|
}
|
sched: Clean up and harmonize the coding style of the scheduler code base
A good number of small style inconsistencies have accumulated
in the scheduler core, so do a pass over them to harmonize
all these details:
- fix speling in comments,
- use curly braces for multi-line statements,
- remove unnecessary parentheses from integer literals,
- capitalize consistently,
- remove stray newlines,
- add comments where necessary,
- remove invalid/unnecessary comments,
- align structure definitions and other data types vertically,
- add missing newlines for increased readability,
- fix vertical tabulation where it's misaligned,
- harmonize preprocessor conditional block labeling
and vertical alignment,
- remove line-breaks where they uglify the code,
- add newline after local variable definitions,
No change in functionality:
md5:
1191fa0a890cfa8132156d2959d7e9e2 built-in.o.before.asm
1191fa0a890cfa8132156d2959d7e9e2 built-in.o.after.asm
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2018-03-03 14:01:12 +01:00
|
|
|
|
2021-04-15 15:54:26 -04:00
|
|
|
/*
 * Print the path of task group @tg via SEQ_printf(@m, @fmt, <path>).
 *
 * The full-length group_path[] buffer is shared, so only the caller that
 * wins spin_trylock(&sched_debug_lock) may use it. Any other simultaneous
 * caller falls back to a 128-byte stack buffer: the path is limited to its
 * first 125 bytes and "..." is then stored starting at the last of those
 * bytes. The suffix therefore only becomes visible when the path filled the
 * space it was given, which signals that the name may have been truncated
 * (this relies on the path helpers NUL-terminating within the given length).
 *
 * Note: this expands to a bare { } block, not do { } while (0).
 */
#define SEQ_printf_task_group_path(m, tg, fmt...)			\
{									\
	if (!spin_trylock(&sched_debug_lock)) {				\
		char buf[128];						\
		char *bufend = buf + sizeof(buf) - 3;			\
									\
		task_group_path(tg, buf, bufend - buf);			\
		strcpy(bufend - 1, "...");				\
		SEQ_printf(m, fmt, buf);				\
	} else {							\
		task_group_path(tg, group_path, sizeof(group_path));	\
		SEQ_printf(m, fmt, group_path);				\
		spin_unlock(&sched_debug_lock);				\
	}								\
}
|
|
|
|
#endif
|
|
|
|
|
2007-07-09 18:52:00 +02:00
|
|
|
static void
|
2007-08-09 11:16:51 +02:00
|
|
|
print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
|
2007-07-09 18:52:00 +02:00
|
|
|
{
|
2020-10-31 01:32:23 +08:00
|
|
|
if (task_current(rq, p))
|
2017-08-07 16:44:22 +08:00
|
|
|
SEQ_printf(m, ">R");
|
2017-08-07 16:44:23 +08:00
|
|
|
else
|
|
|
|
SEQ_printf(m, " %c", task_state_to_char(p));
|
2007-07-09 18:52:00 +02:00
|
|
|
|
2024-09-06 13:30:19 +08:00
|
|
|
SEQ_printf(m, " %15s %5d %9Ld.%06ld %c %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld %5d ",
|
2013-09-09 13:01:41 +02:00
|
|
|
p->comm, task_pid_nr(p),
|
2007-10-15 17:00:08 +02:00
|
|
|
SPLIT_NS(p->se.vruntime),
|
sched/fair: Implement an EEVDF-like scheduling policy
Where CFS is currently a WFQ based scheduler with only a single knob,
the weight. The addition of a second, latency oriented parameter,
makes something like WF2Q or EEVDF based a much better fit.
Specifically, EEVDF does EDF like scheduling in the left half of the
tree -- those entities that are owed service. Except because this is a
virtual time scheduler, the deadlines are in virtual time as well,
which is what allows over-subscription.
EEVDF has two parameters:
- weight, or time-slope: which is mapped to nice just as before
- request size, or slice length: which is used to compute
the virtual deadline as: vd_i = ve_i + r_i/w_i
Basically, by setting a smaller slice, the deadline will be earlier
and the task will be more eligible and ran earlier.
Tick driven preemption is driven by request/slice completion; while
wakeup preemption is driven by the deadline.
Because the tree is now effectively an interval tree, and the
selection is no longer 'leftmost', over-scheduling is less of a
problem.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20230531124603.931005524@infradead.org
2023-05-31 13:58:44 +02:00
|
|
|
entity_eligible(cfs_rq_of(&p->se), &p->se) ? 'E' : 'N',
|
|
|
|
SPLIT_NS(p->se.deadline),
|
sched/eevdf: Use sched_attr::sched_runtime to set request/slice suggestion
Allow applications to directly set a suggested request/slice length using
sched_attr::sched_runtime.
The implementation clamps the value to: 0.1[ms] <= slice <= 100[ms]
which is 1/10 the size of HZ=1000 and 10 times the size of HZ=100.
Applications should strive to use their periodic runtime at a high
confidence interval (95%+) as the target slice. Using a smaller slice
will introduce undue preemptions, while using a larger value will
increase latency.
For all the following examples assume a scheduling quantum of 8, and for
consistency all examples have W=4:
{A,B,C,D}(w=1,r=8):
ABCD...
+---+---+---+---
t=0, V=1.5 t=1, V=3.5
A |------< A |------<
B |------< B |------<
C |------< C |------<
D |------< D |------<
---+*------+-------+--- ---+--*----+-------+---
t=2, V=5.5 t=3, V=7.5
A |------< A |------<
B |------< B |------<
C |------< C |------<
D |------< D |------<
---+----*--+-------+--- ---+------*+-------+---
Note: 4 identical tasks in FIFO order
~~~
{A,B}(w=1,r=16) C(w=2,r=16)
AACCBBCC...
+---+---+---+---
t=0, V=1.25 t=2, V=5.25
A |--------------< A |--------------<
B |--------------< B |--------------<
C |------< C |------<
---+*------+-------+--- ---+----*--+-------+---
t=4, V=8.25 t=6, V=12.25
A |--------------< A |--------------<
B |--------------< B |--------------<
C |------< C |------<
---+-------*-------+--- ---+-------+---*---+---
Note: 1 heavy task -- because q=8, double r such that the deadline of the w=2
task doesn't go below q.
Note: observe the full schedule becomes: W*max(r_i/w_i) = 4*2q = 8q in length.
Note: the period of the heavy task is half the full period at:
W*(r_i/w_i) = 4*(2q/2) = 4q
~~~
{A,C,D}(w=1,r=16) B(w=1,r=8):
BAACCBDD...
+---+---+---+---
t=0, V=1.5 t=1, V=3.5
A |--------------< A |---------------<
B |------< B |------<
C |--------------< C |--------------<
D |--------------< D |--------------<
---+*------+-------+--- ---+--*----+-------+---
t=3, V=7.5 t=5, V=11.5
A |---------------< A |---------------<
B |------< B |------<
C |--------------< C |--------------<
D |--------------< D |--------------<
---+------*+-------+--- ---+-------+--*----+---
t=6, V=13.5
A |---------------<
B |------<
C |--------------<
D |--------------<
---+-------+----*--+---
Note: 1 short task -- again double r so that the deadline of the short task
won't be below q. Made B short because its not the leftmost task, but is
eligible with the 0,1,2,3 spread.
Note: like with the heavy task, the period of the short task observes:
W*(r_i/w_i) = 4*(1q/1) = 4q
~~~
A(w=1,r=16) B(w=1,r=8) C(w=2,r=16)
BCCAABCC...
+---+---+---+---
t=0, V=1.25 t=1, V=3.25
A |--------------< A |--------------<
B |------< B |------<
C |------< C |------<
---+*------+-------+--- ---+--*----+-------+---
t=3, V=7.25 t=5, V=11.25
A |--------------< A |--------------<
B |------< B |------<
C |------< C |------<
---+------*+-------+--- ---+-------+--*----+---
t=6, V=13.25
A |--------------<
B |------<
C |------<
---+-------+----*--+---
Note: 1 heavy and 1 short task -- combine them all.
Note: both the short and heavy task end up with a period of 4q
~~~
A(w=1,r=16) B(w=2,r=16) C(w=1,r=8)
BBCAABBC...
+---+---+---+---
t=0, V=1 t=2, V=5
A |--------------< A |--------------<
B |------< B |------<
C |------< C |------<
---+*------+-------+--- ---+----*--+-------+---
t=3, V=7 t=5, V=11
A |--------------< A |--------------<
B |------< B |------<
C |------< C |------<
---+------*+-------+--- ---+-------+--*----+---
t=7, V=15
A |--------------<
B |------<
C |------<
---+-------+------*+---
Note: as before but permuted
~~~
From all this it can be deduced that, for the steady state:
- the total period (P) of a schedule is: W*max(r_i/w_i)
- the average period of a task is: W*(r_i/w_i)
- each task obtains the fair share: w_i/W of each full period P
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Valentin Schneider <vschneid@redhat.com>
Link: https://lkml.kernel.org/r/20240727105030.842834421@infradead.org
2023-05-22 13:46:30 +02:00
|
|
|
p->se.custom_slice ? 'S' : ' ',
|
sched/fair: Implement an EEVDF-like scheduling policy
Where CFS is currently a WFQ based scheduler with only a single knob,
the weight. The addition of a second, latency oriented parameter,
makes something like WF2Q or EEVDF based a much better fit.
Specifically, EEVDF does EDF like scheduling in the left half of the
tree -- those entities that are owed service. Except because this is a
virtual time scheduler, the deadlines are in virtual time as well,
which is what allows over-subscription.
EEVDF has two parameters:
- weight, or time-slope: which is mapped to nice just as before
- request size, or slice length: which is used to compute
the virtual deadline as: vd_i = ve_i + r_i/w_i
Basically, by setting a smaller slice, the deadline will be earlier
and the task will be more eligible and ran earlier.
Tick driven preemption is driven by request/slice completion; while
wakeup preemption is driven by the deadline.
Because the tree is now effectively an interval tree, and the
selection is no longer 'leftmost', over-scheduling is less of a
problem.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20230531124603.931005524@infradead.org
2023-05-31 13:58:44 +02:00
|
|
|
SPLIT_NS(p->se.slice),
|
|
|
|
SPLIT_NS(p->se.sum_exec_runtime),
|
2007-07-09 18:52:00 +02:00
|
|
|
(long long)(p->nvcsw + p->nivcsw),
|
2007-08-06 04:26:59 +01:00
|
|
|
p->prio);
|
2016-06-03 17:58:40 -05:00
|
|
|
|
2024-09-06 13:30:19 +08:00
|
|
|
SEQ_printf(m, "%9lld.%06ld %9lld.%06ld %9lld.%06ld",
|
sched: Make struct sched_statistics independent of fair sched class
If we want to use the schedstats facility to trace other sched classes, we
should make it independent of fair sched class. The struct sched_statistics
is the schedular statistics of a task_struct or a task_group. So we can
move it into struct task_struct and struct task_group to achieve the goal.
After the patch, schestats are orgnized as follows,
struct task_struct {
...
struct sched_entity se;
struct sched_rt_entity rt;
struct sched_dl_entity dl;
...
struct sched_statistics stats;
...
};
Regarding the task group, schedstats is only supported for fair group
sched, and a new struct sched_entity_stats is introduced, suggested by
Peter -
struct sched_entity_stats {
struct sched_entity se;
struct sched_statistics stats;
} __no_randomize_layout;
Then with the se in a task_group, we can easily get the stats.
The sched_statistics members may be frequently modified when schedstats is
enabled, in order to avoid impacting on random data which may in the same
cacheline with them, the struct sched_statistics is defined as cacheline
aligned.
As this patch changes the core struct of scheduler, so I verified the
performance it may impact on the scheduler with 'perf bench sched
pipe', suggested by Mel. Below is the result, in which all the values
are in usecs/op.
Before After
kernel.sched_schedstats=0 5.2~5.4 5.2~5.4
kernel.sched_schedstats=1 5.3~5.5 5.3~5.5
[These data is a little difference with the earlier version, that is
because my old test machine is destroyed so I have to use a new
different test machine.]
Almost no impact on the sched performance.
No functional change.
[lkp@intel.com: reported build failure in earlier version]
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mel Gorman <mgorman@suse.de>
Link: https://lore.kernel.org/r/20210905143547.4668-3-laoar.shao@gmail.com
2021-09-05 14:35:41 +00:00
|
|
|
SPLIT_NS(schedstat_val_or_zero(p->stats.wait_sum)),
|
2021-09-05 14:35:43 +00:00
|
|
|
SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)),
|
|
|
|
SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime)));
|
2016-06-03 17:58:40 -05:00
|
|
|
|
2013-10-07 11:29:30 +01:00
|
|
|
#ifdef CONFIG_NUMA_BALANCING
|
2024-09-06 13:30:19 +08:00
|
|
|
SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
|
2013-10-07 11:29:30 +01:00
|
|
|
#endif
|
2011-01-11 15:41:54 +05:30
|
|
|
#ifdef CONFIG_CGROUP_SCHED
|
2024-09-06 13:30:19 +08:00
|
|
|
SEQ_printf_task_group_path(m, task_group(p), " %s")
|
2011-01-11 15:41:54 +05:30
|
|
|
#endif
|
2008-04-19 19:45:00 +02:00
|
|
|
|
|
|
|
SEQ_printf(m, "\n");
|
2007-07-09 18:52:00 +02:00
|
|
|
}
|
|
|
|
|
2007-08-09 11:16:51 +02:00
|
|
|
static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
|
2007-07-09 18:52:00 +02:00
|
|
|
{
|
|
|
|
struct task_struct *g, *p;
|
|
|
|
|
2018-03-19 14:35:55 -04:00
|
|
|
SEQ_printf(m, "\n");
|
|
|
|
SEQ_printf(m, "runnable tasks:\n");
|
2024-09-06 13:30:19 +08:00
|
|
|
SEQ_printf(m, " S task PID vruntime eligible "
|
|
|
|
"deadline slice sum-exec switches "
|
|
|
|
"prio wait-time sum-sleep sum-block"
|
|
|
|
#ifdef CONFIG_NUMA_BALANCING
|
|
|
|
" node group-id"
|
|
|
|
#endif
|
|
|
|
#ifdef CONFIG_CGROUP_SCHED
|
|
|
|
" group-path"
|
|
|
|
#endif
|
|
|
|
"\n");
|
2018-03-19 14:35:55 -04:00
|
|
|
SEQ_printf(m, "-------------------------------------------------------"
|
2024-09-06 13:30:19 +08:00
|
|
|
"------------------------------------------------------"
|
|
|
|
"------------------------------------------------------"
|
|
|
|
#ifdef CONFIG_NUMA_BALANCING
|
|
|
|
"--------------"
|
|
|
|
#endif
|
|
|
|
#ifdef CONFIG_CGROUP_SCHED
|
|
|
|
"--------------"
|
|
|
|
#endif
|
|
|
|
"\n");
|
2007-07-09 18:52:00 +02:00
|
|
|
|
2014-09-21 21:33:41 +02:00
|
|
|
rcu_read_lock();
|
2014-08-13 21:19:56 +02:00
|
|
|
for_each_process_thread(g, p) {
|
2013-10-07 11:29:30 +01:00
|
|
|
if (task_cpu(p) != rq_cpu)
|
2007-07-09 18:52:00 +02:00
|
|
|
continue;
|
|
|
|
|
2007-08-09 11:16:51 +02:00
|
|
|
print_task(m, rq, p);
|
2014-08-13 21:19:56 +02:00
|
|
|
}
|
2014-09-21 21:33:41 +02:00
|
|
|
rcu_read_unlock();
|
2007-07-09 18:52:00 +02:00
|
|
|
}
|
|
|
|
|
2007-08-09 11:16:47 +02:00
|
|
|
void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
|
2007-07-09 18:52:00 +02:00
|
|
|
{
|
2023-11-15 11:36:45 +08:00
|
|
|
s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, left_deadline = -1, spread;
|
|
|
|
struct sched_entity *last, *first, *root;
|
2009-06-17 22:20:55 +09:00
|
|
|
struct rq *rq = cpu_rq(cpu);
|
2007-10-15 17:00:05 +02:00
|
|
|
unsigned long flags;
|
|
|
|
|
2011-01-11 15:41:54 +05:30
|
|
|
#ifdef CONFIG_FAIR_GROUP_SCHED
|
2018-03-19 14:35:55 -04:00
|
|
|
SEQ_printf(m, "\n");
|
2021-04-15 15:54:26 -04:00
|
|
|
SEQ_printf_task_group_path(m, cfs_rq->tg, "cfs_rq[%d]:%s\n", cpu);
|
2011-01-11 15:41:54 +05:30
|
|
|
#else
|
2018-03-19 14:35:55 -04:00
|
|
|
SEQ_printf(m, "\n");
|
|
|
|
SEQ_printf(m, "cfs_rq[%d]:\n", cpu);
|
2011-01-11 15:41:54 +05:30
|
|
|
#endif
|
2007-10-15 17:00:05 +02:00
|
|
|
|
2020-11-17 18:19:31 -05:00
|
|
|
raw_spin_rq_lock_irqsave(rq, flags);
|
2023-11-15 11:36:45 +08:00
|
|
|
root = __pick_root_entity(cfs_rq);
|
|
|
|
if (root)
|
|
|
|
left_vruntime = root->min_vruntime;
|
2023-05-31 13:58:40 +02:00
|
|
|
first = __pick_first_entity(cfs_rq);
|
|
|
|
if (first)
|
2023-11-15 11:36:45 +08:00
|
|
|
left_deadline = first->deadline;
|
2007-10-15 17:00:05 +02:00
|
|
|
last = __pick_last_entity(cfs_rq);
|
|
|
|
if (last)
|
2023-05-31 13:58:40 +02:00
|
|
|
right_vruntime = last->vruntime;
|
2008-11-10 10:46:32 +01:00
|
|
|
min_vruntime = cfs_rq->min_vruntime;
|
2020-11-17 18:19:31 -05:00
|
|
|
raw_spin_rq_unlock_irqrestore(rq, flags);
|
2023-05-31 13:58:40 +02:00
|
|
|
|
2023-11-15 11:36:45 +08:00
|
|
|
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_deadline",
|
|
|
|
SPLIT_NS(left_deadline));
|
2023-05-31 13:58:40 +02:00
|
|
|
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_vruntime",
|
|
|
|
SPLIT_NS(left_vruntime));
|
2007-10-15 17:00:08 +02:00
|
|
|
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
|
|
|
|
SPLIT_NS(min_vruntime));
|
2023-05-31 13:58:40 +02:00
|
|
|
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime",
|
|
|
|
SPLIT_NS(avg_vruntime(cfs_rq)));
|
|
|
|
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime",
|
|
|
|
SPLIT_NS(right_vruntime));
|
|
|
|
spread = right_vruntime - left_vruntime;
|
|
|
|
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread));
|
2024-12-02 18:46:04 +01:00
|
|
|
SEQ_printf(m, " .%-30s: %d\n", "nr_queued", cfs_rq->nr_queued);
|
2024-12-02 18:45:59 +01:00
|
|
|
SEQ_printf(m, " .%-30s: %d\n", "h_nr_runnable", cfs_rq->h_nr_runnable);
|
2024-12-02 18:45:58 +01:00
|
|
|
SEQ_printf(m, " .%-30s: %d\n", "h_nr_queued", cfs_rq->h_nr_queued);
|
2024-12-02 18:46:02 +01:00
|
|
|
SEQ_printf(m, " .%-30s: %d\n", "h_nr_idle", cfs_rq->h_nr_idle);
|
2010-11-15 15:47:00 -08:00
|
|
|
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
|
2008-06-27 13:41:14 +02:00
|
|
|
#ifdef CONFIG_SMP
|
sched/fair: Rewrite runnable load and utilization average tracking
The idea of runnable load average (let runnable time contribute to weight)
was proposed by Paul Turner and Ben Segall, and it is still followed by
this rewrite. This rewrite aims to solve the following issues:
1. cfs_rq's load average (namely runnable_load_avg and blocked_load_avg) is
updated at the granularity of an entity at a time, which results in the
cfs_rq's load average is stale or partially updated: at any time, only
one entity is up to date, all other entities are effectively lagging
behind. This is undesirable.
To illustrate, if we have n runnable entities in the cfs_rq, as time
elapses, they certainly become outdated:
t0: cfs_rq { e1_old, e2_old, ..., en_old }
and when we update:
t1: update e1, then we have cfs_rq { e1_new, e2_old, ..., en_old }
t2: update e2, then we have cfs_rq { e1_old, e2_new, ..., en_old }
...
We solve this by combining all runnable entities' load averages together
in cfs_rq's avg, and update the cfs_rq's avg as a whole. This is based
on the fact that if we regard the update as a function, then:
w * update(e) = update(w * e) and
update(e1) + update(e2) = update(e1 + e2), then
w1 * update(e1) + w2 * update(e2) = update(w1 * e1 + w2 * e2)
therefore, by this rewrite, we have an entirely updated cfs_rq at the
time we update it:
t1: update cfs_rq { e1_new, e2_new, ..., en_new }
t2: update cfs_rq { e1_new, e2_new, ..., en_new }
...
2. cfs_rq's load average is different between top rq->cfs_rq and other
task_group's per CPU cfs_rqs in whether or not blocked_load_average
contributes to the load.
The basic idea behind runnable load average (the same for utilization)
is that the blocked state is taken into account as opposed to only
accounting for the currently runnable state. Therefore, the average
should include both the runnable/running and blocked load averages.
This rewrite does that.
In addition, we also combine runnable/running and blocked averages
of all entities into the cfs_rq's average, and update it together at
once. This is based on the fact that:
update(runnable) + update(blocked) = update(runnable + blocked)
This significantly reduces the code as we don't need to separately
maintain/update runnable/running load and blocked load.
3. How task_group entities' share is calculated is complex and imprecise.
We reduce the complexity in this rewrite to allow a very simple rule:
the task_group's load_avg is aggregated from its per CPU cfs_rqs's
load_avgs. Then group entity's weight is simply proportional to its
own cfs_rq's load_avg / task_group's load_avg. To illustrate,
if a task_group has { cfs_rq1, cfs_rq2, ..., cfs_rqn }, then,
task_group_avg = cfs_rq1_avg + cfs_rq2_avg + ... + cfs_rqn_avg, then
cfs_rqx's entity's share = cfs_rqx_avg / task_group_avg * task_group's share
To sum up, this rewrite in principle is equivalent to the current one, but
fixes the issues described above. Turns out, it significantly reduces the
code complexity and hence increases clarity and efficiency. In addition,
the new averages are more smooth/continuous (no spurious spikes and valleys)
and updated more consistently and quickly to reflect the load dynamics.
As a result, we have less load tracking overhead, better performance,
and especially better power efficiency due to more balanced load.
Signed-off-by: Yuyang Du <yuyang.du@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: arjan@linux.intel.com
Cc: bsegall@google.com
Cc: dietmar.eggemann@arm.com
Cc: fengguang.wu@intel.com
Cc: len.brown@intel.com
Cc: morten.rasmussen@arm.com
Cc: pjt@google.com
Cc: rafael.j.wysocki@intel.com
Cc: umgwanakikbuti@gmail.com
Cc: vincent.guittot@linaro.org
Link: http://lkml.kernel.org/r/1436918682-4971-3-git-send-email-yuyang.du@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-07-15 08:04:37 +08:00
|
|
|
SEQ_printf(m, " .%-30s: %lu\n", "load_avg",
|
|
|
|
cfs_rq->avg.load_avg);
|
2020-02-24 09:52:18 +00:00
|
|
|
SEQ_printf(m, " .%-30s: %lu\n", "runnable_avg",
|
|
|
|
cfs_rq->avg.runnable_avg);
|
sched/fair: Rewrite runnable load and utilization average tracking
The idea of runnable load average (let runnable time contribute to weight)
was proposed by Paul Turner and Ben Segall, and it is still followed by
this rewrite. This rewrite aims to solve the following issues:
1. cfs_rq's load average (namely runnable_load_avg and blocked_load_avg) is
updated at the granularity of an entity at a time, which results in the
cfs_rq's load average is stale or partially updated: at any time, only
one entity is up to date, all other entities are effectively lagging
behind. This is undesirable.
To illustrate, if we have n runnable entities in the cfs_rq, as time
elapses, they certainly become outdated:
t0: cfs_rq { e1_old, e2_old, ..., en_old }
and when we update:
t1: update e1, then we have cfs_rq { e1_new, e2_old, ..., en_old }
t2: update e2, then we have cfs_rq { e1_old, e2_new, ..., en_old }
...
We solve this by combining all runnable entities' load averages together
in cfs_rq's avg, and update the cfs_rq's avg as a whole. This is based
on the fact that if we regard the update as a function, then:
w * update(e) = update(w * e) and
update(e1) + update(e2) = update(e1 + e2), then
w1 * update(e1) + w2 * update(e2) = update(w1 * e1 + w2 * e2)
therefore, by this rewrite, we have an entirely updated cfs_rq at the
time we update it:
t1: update cfs_rq { e1_new, e2_new, ..., en_new }
t2: update cfs_rq { e1_new, e2_new, ..., en_new }
...
2. cfs_rq's load average is different between top rq->cfs_rq and other
task_group's per CPU cfs_rqs in whether or not blocked_load_average
contributes to the load.
The basic idea behind runnable load average (the same for utilization)
is that the blocked state is taken into account as opposed to only
accounting for the currently runnable state. Therefore, the average
should include both the runnable/running and blocked load averages.
This rewrite does that.
In addition, we also combine runnable/running and blocked averages
of all entities into the cfs_rq's average, and update it together at
once. This is based on the fact that:
update(runnable) + update(blocked) = update(runnable + blocked)
This significantly reduces the code as we don't need to separately
maintain/update runnable/running load and blocked load.
3. How task_group entities' share is calculated is complex and imprecise.
We reduce the complexity in this rewrite to allow a very simple rule:
the task_group's load_avg is aggregated from its per CPU cfs_rqs's
load_avgs. Then group entity's weight is simply proportional to its
own cfs_rq's load_avg / task_group's load_avg. To illustrate,
if a task_group has { cfs_rq1, cfs_rq2, ..., cfs_rqn }, then,
task_group_avg = cfs_rq1_avg + cfs_rq2_avg + ... + cfs_rqn_avg, then
cfs_rqx's entity's share = cfs_rqx_avg / task_group_avg * task_group's share
To sum up, this rewrite in principle is equivalent to the current one, but
fixes the issues described above. Turns out, it significantly reduces the
code complexity and hence increases clarity and efficiency. In addition,
the new averages are more smooth/continuous (no spurious spikes and valleys)
and updated more consistently and quickly to reflect the load dynamics.
As a result, we have less load tracking overhead, better performance,
and especially better power efficiency due to more balanced load.
Signed-off-by: Yuyang Du <yuyang.du@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: arjan@linux.intel.com
Cc: bsegall@google.com
Cc: dietmar.eggemann@arm.com
Cc: fengguang.wu@intel.com
Cc: len.brown@intel.com
Cc: morten.rasmussen@arm.com
Cc: pjt@google.com
Cc: rafael.j.wysocki@intel.com
Cc: umgwanakikbuti@gmail.com
Cc: vincent.guittot@linaro.org
Link: http://lkml.kernel.org/r/1436918682-4971-3-git-send-email-yuyang.du@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-07-15 08:04:37 +08:00
|
|
|
SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
|
|
|
|
cfs_rq->avg.util_avg);
|
2023-12-01 17:16:52 +01:00
|
|
|
SEQ_printf(m, " .%-30s: %u\n", "util_est",
|
|
|
|
cfs_rq->avg.util_est);
|
2017-05-08 16:51:41 +02:00
|
|
|
SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg",
|
|
|
|
cfs_rq->removed.load_avg);
|
|
|
|
SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg",
|
|
|
|
cfs_rq->removed.util_avg);
|
2020-02-24 09:52:18 +00:00
|
|
|
SEQ_printf(m, " .%-30s: %ld\n", "removed.runnable_avg",
|
|
|
|
cfs_rq->removed.runnable_avg);
|
2013-06-28 19:10:35 +08:00
|
|
|
#ifdef CONFIG_FAIR_GROUP_SCHED
|
sched/fair: Rewrite runnable load and utilization average tracking
The idea of runnable load average (let runnable time contribute to weight)
was proposed by Paul Turner and Ben Segall, and it is still followed by
this rewrite. This rewrite aims to solve the following issues:
1. cfs_rq's load average (namely runnable_load_avg and blocked_load_avg) is
updated at the granularity of an entity at a time, which results in the
cfs_rq's load average is stale or partially updated: at any time, only
one entity is up to date, all other entities are effectively lagging
behind. This is undesirable.
To illustrate, if we have n runnable entities in the cfs_rq, as time
elapses, they certainly become outdated:
t0: cfs_rq { e1_old, e2_old, ..., en_old }
and when we update:
t1: update e1, then we have cfs_rq { e1_new, e2_old, ..., en_old }
t2: update e2, then we have cfs_rq { e1_old, e2_new, ..., en_old }
...
We solve this by combining all runnable entities' load averages together
in cfs_rq's avg, and update the cfs_rq's avg as a whole. This is based
on the fact that if we regard the update as a function, then:
w * update(e) = update(w * e) and
update(e1) + update(e2) = update(e1 + e2), then
w1 * update(e1) + w2 * update(e2) = update(w1 * e1 + w2 * e2)
therefore, by this rewrite, we have an entirely updated cfs_rq at the
time we update it:
t1: update cfs_rq { e1_new, e2_new, ..., en_new }
t2: update cfs_rq { e1_new, e2_new, ..., en_new }
...
2. cfs_rq's load average is different between top rq->cfs_rq and other
task_group's per CPU cfs_rqs in whether or not blocked_load_average
contributes to the load.
The basic idea behind runnable load average (the same for utilization)
is that the blocked state is taken into account as opposed to only
accounting for the currently runnable state. Therefore, the average
should include both the runnable/running and blocked load averages.
This rewrite does that.
In addition, we also combine runnable/running and blocked averages
of all entities into the cfs_rq's average, and update it together at
once. This is based on the fact that:
update(runnable) + update(blocked) = update(runnable + blocked)
This significantly reduces the code as we don't need to separately
maintain/update runnable/running load and blocked load.
3. How task_group entities' share is calculated is complex and imprecise.
We reduce the complexity in this rewrite to allow a very simple rule:
the task_group's load_avg is aggregated from its per CPU cfs_rqs's
load_avgs. Then group entity's weight is simply proportional to its
own cfs_rq's load_avg / task_group's load_avg. To illustrate,
if a task_group has { cfs_rq1, cfs_rq2, ..., cfs_rqn }, then,
task_group_avg = cfs_rq1_avg + cfs_rq2_avg + ... + cfs_rqn_avg, then
cfs_rqx's entity's share = cfs_rqx_avg / task_group_avg * task_group's share
To sum up, this rewrite in principle is equivalent to the current one, but
fixes the issues described above. Turns out, it significantly reduces the
code complexity and hence increases clarity and efficiency. In addition,
the new averages are more smooth/continuous (no spurious spikes and valleys)
and updated more consistently and quickly to reflect the load dynamics.
As a result, we have less load tracking overhead, better performance,
and especially better power efficiency due to more balanced load.
Signed-off-by: Yuyang Du <yuyang.du@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: arjan@linux.intel.com
Cc: bsegall@google.com
Cc: dietmar.eggemann@arm.com
Cc: fengguang.wu@intel.com
Cc: len.brown@intel.com
Cc: morten.rasmussen@arm.com
Cc: pjt@google.com
Cc: rafael.j.wysocki@intel.com
Cc: umgwanakikbuti@gmail.com
Cc: vincent.guittot@linaro.org
Link: http://lkml.kernel.org/r/1436918682-4971-3-git-send-email-yuyang.du@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-07-15 08:04:37 +08:00
|
|
|
SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib",
|
|
|
|
cfs_rq->tg_load_avg_contrib);
|
2013-06-28 19:10:35 +08:00
|
|
|
SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg",
|
|
|
|
atomic_long_read(&cfs_rq->tg->load_avg));
|
2008-06-27 13:41:14 +02:00
|
|
|
#endif
|
2013-06-28 19:10:35 +08:00
|
|
|
#endif
|
2013-10-16 11:16:32 -07:00
|
|
|
#ifdef CONFIG_CFS_BANDWIDTH
|
|
|
|
SEQ_printf(m, " .%-30s: %d\n", "throttled",
|
|
|
|
cfs_rq->throttled);
|
|
|
|
SEQ_printf(m, " .%-30s: %d\n", "throttle_count",
|
|
|
|
cfs_rq->throttle_count);
|
|
|
|
#endif
|
2010-11-15 15:47:00 -08:00
|
|
|
|
2013-06-28 19:10:35 +08:00
|
|
|
#ifdef CONFIG_FAIR_GROUP_SCHED
|
2008-11-10 21:34:09 +05:30
|
|
|
print_cfs_group_stats(m, cpu, cfs_rq->tg);
|
2008-06-27 13:41:14 +02:00
|
|
|
#endif
|
2007-07-09 18:52:00 +02:00
|
|
|
}
|
|
|
|
|
2008-06-19 14:22:24 +02:00
|
|
|
void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
|
|
|
|
{
|
2011-01-11 15:41:54 +05:30
|
|
|
#ifdef CONFIG_RT_GROUP_SCHED
|
2018-03-19 14:35:55 -04:00
|
|
|
SEQ_printf(m, "\n");
|
2021-04-15 15:54:26 -04:00
|
|
|
SEQ_printf_task_group_path(m, rt_rq->tg, "rt_rq[%d]:%s\n", cpu);
|
2011-01-11 15:41:54 +05:30
|
|
|
#else
|
2018-03-19 14:35:55 -04:00
|
|
|
SEQ_printf(m, "\n");
|
|
|
|
SEQ_printf(m, "rt_rq[%d]:\n", cpu);
|
2011-01-11 15:41:54 +05:30
|
|
|
#endif
|
2008-06-19 14:22:24 +02:00
|
|
|
|
|
|
|
#define P(x) \
|
|
|
|
SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
|
2017-06-26 17:07:14 +02:00
|
|
|
#define PU(x) \
|
|
|
|
SEQ_printf(m, " .%-30s: %lu\n", #x, (unsigned long)(rt_rq->x))
|
2008-06-19 14:22:24 +02:00
|
|
|
#define PN(x) \
|
|
|
|
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x))
|
|
|
|
|
2017-06-26 17:07:14 +02:00
|
|
|
PU(rt_nr_running);
|
2024-05-27 14:06:55 +02:00
|
|
|
|
|
|
|
#ifdef CONFIG_RT_GROUP_SCHED
|
2008-06-19 14:22:24 +02:00
|
|
|
P(rt_throttled);
|
|
|
|
PN(rt_time);
|
|
|
|
PN(rt_runtime);
|
2024-05-27 14:06:55 +02:00
|
|
|
#endif
|
2008-06-19 14:22:24 +02:00
|
|
|
|
|
|
|
#undef PN
|
2017-06-26 17:07:14 +02:00
|
|
|
#undef PU
|
2008-06-19 14:22:24 +02:00
|
|
|
#undef P
|
|
|
|
}
|
|
|
|
|
2014-10-31 06:39:33 +08:00
|
|
|
void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
|
|
|
|
{
|
2016-02-22 16:26:52 -05:00
|
|
|
struct dl_bw *dl_bw;
|
|
|
|
|
2018-03-19 14:35:55 -04:00
|
|
|
SEQ_printf(m, "\n");
|
|
|
|
SEQ_printf(m, "dl_rq[%d]:\n", cpu);
|
2017-06-26 17:07:14 +02:00
|
|
|
|
|
|
|
#define PU(x) \
|
|
|
|
SEQ_printf(m, " .%-30s: %lu\n", #x, (unsigned long)(dl_rq->x))
|
|
|
|
|
|
|
|
PU(dl_nr_running);
|
2016-02-22 16:26:52 -05:00
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
dl_bw = &cpu_rq(cpu)->rd->dl_bw;
|
|
|
|
#else
|
|
|
|
dl_bw = &dl_rq->dl_bw;
|
|
|
|
#endif
|
|
|
|
SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->bw", dl_bw->bw);
|
|
|
|
SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->total_bw", dl_bw->total_bw);
|
2017-06-26 17:07:14 +02:00
|
|
|
|
|
|
|
#undef PU
|
2014-10-31 06:39:33 +08:00
|
|
|
}
|
|
|
|
|
2007-08-09 11:16:51 +02:00
|
|
|
static void print_cpu(struct seq_file *m, int cpu)
|
2007-07-09 18:52:00 +02:00
|
|
|
{
|
2009-06-17 22:20:55 +09:00
|
|
|
struct rq *rq = cpu_rq(cpu);
|
2007-07-09 18:52:00 +02:00
|
|
|
|
|
|
|
#ifdef CONFIG_X86
|
|
|
|
{
|
|
|
|
unsigned int freq = cpu_khz ? : 1;
|
|
|
|
|
2013-02-21 15:15:09 -08:00
|
|
|
SEQ_printf(m, "cpu#%d, %u.%03u MHz\n",
|
2007-07-09 18:52:00 +02:00
|
|
|
cpu, freq / 1000, (freq % 1000));
|
|
|
|
}
|
|
|
|
#else
|
2013-02-21 15:15:09 -08:00
|
|
|
SEQ_printf(m, "cpu#%d\n", cpu);
|
2007-07-09 18:52:00 +02:00
|
|
|
#endif
|
|
|
|
|
2012-05-14 14:34:00 +02:00
|
|
|
#define P(x) \
|
|
|
|
do { \
|
|
|
|
if (sizeof(rq->x) == 4) \
|
2023-05-06 15:42:53 +08:00
|
|
|
SEQ_printf(m, " .%-30s: %d\n", #x, (int)(rq->x)); \
|
2012-05-14 14:34:00 +02:00
|
|
|
else \
|
|
|
|
SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x));\
|
|
|
|
} while (0)
|
|
|
|
|
2007-10-15 17:00:08 +02:00
|
|
|
#define PN(x) \
|
|
|
|
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x))
|
2007-07-09 18:52:00 +02:00
|
|
|
|
|
|
|
P(nr_running);
|
|
|
|
P(nr_switches);
|
|
|
|
P(nr_uninterruptible);
|
2007-10-15 17:00:08 +02:00
|
|
|
PN(next_balance);
|
2013-09-09 13:01:41 +02:00
|
|
|
SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
|
2007-10-15 17:00:08 +02:00
|
|
|
PN(clock);
|
2015-01-05 11:18:12 +01:00
|
|
|
PN(clock_task);
|
2007-07-09 18:52:00 +02:00
|
|
|
#undef P
|
2007-10-15 17:00:08 +02:00
|
|
|
#undef PN
|
2007-07-09 18:52:00 +02:00
|
|
|
|
2009-11-04 17:53:50 +01:00
|
|
|
#ifdef CONFIG_SMP
|
2016-05-03 12:38:25 +08:00
|
|
|
#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n);
|
2009-11-04 17:53:50 +01:00
|
|
|
P64(avg_idle);
|
2014-01-23 18:39:54 +08:00
|
|
|
P64(max_idle_balance_cost);
|
2016-05-03 12:38:25 +08:00
|
|
|
#undef P64
|
2009-11-04 17:53:50 +01:00
|
|
|
#endif
|
2008-11-10 10:46:32 +01:00
|
|
|
|
2016-06-17 12:43:26 -05:00
|
|
|
#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, schedstat_val(rq->n));
|
2016-02-05 09:08:36 +00:00
|
|
|
if (schedstat_enabled()) {
|
|
|
|
P(yld_count);
|
|
|
|
P(sched_count);
|
|
|
|
P(sched_goidle);
|
|
|
|
P(ttwu_count);
|
|
|
|
P(ttwu_local);
|
|
|
|
}
|
2008-11-10 10:46:32 +01:00
|
|
|
#undef P
|
2016-06-17 12:43:26 -05:00
|
|
|
|
2007-08-09 11:16:47 +02:00
|
|
|
print_cfs_stats(m, cpu);
|
2008-06-19 14:22:24 +02:00
|
|
|
print_rt_stats(m, cpu);
|
2014-10-31 06:39:33 +08:00
|
|
|
print_dl_stats(m, cpu);
|
2007-07-09 18:52:00 +02:00
|
|
|
|
2007-08-09 11:16:51 +02:00
|
|
|
print_rq(m, rq, cpu);
|
2013-02-21 15:15:09 -08:00
|
|
|
SEQ_printf(m, "\n");
|
2007-07-09 18:52:00 +02:00
|
|
|
}
|
|
|
|
|
2009-11-30 12:16:47 +01:00
|
|
|
/*
 * Human-readable names for the tunable-scaling modes.  The table is indexed
 * with sysctl_sched_tunable_scaling when the debug header is printed, so
 * the order of the entries must match the numeric values of that setting.
 *
 * Both the strings and the pointer array itself are read-only.
 */
static const char * const sched_tunable_scaling_names[] = {
	"none",
	"logarithmic",
	"linear"
};
|
|
|
|
|
2013-02-21 15:15:09 -08:00
|
|
|
static void sched_debug_header(struct seq_file *m)
|
2007-07-09 18:52:00 +02:00
|
|
|
{
|
2010-11-19 21:11:09 +01:00
|
|
|
u64 ktime, sched_clk, cpu_clk;
|
|
|
|
unsigned long flags;
|
2007-07-09 18:52:00 +02:00
|
|
|
|
2010-11-19 21:11:09 +01:00
|
|
|
local_irq_save(flags);
|
|
|
|
ktime = ktime_to_ns(ktime_get());
|
|
|
|
sched_clk = sched_clock();
|
|
|
|
cpu_clk = local_clock();
|
|
|
|
local_irq_restore(flags);
|
|
|
|
|
2013-10-07 11:29:30 +01:00
|
|
|
SEQ_printf(m, "Sched Debug Version: v0.11, %s %.*s\n",
|
2007-07-09 18:52:00 +02:00
|
|
|
init_utsname()->release,
|
|
|
|
(int)strcspn(init_utsname()->version, " "),
|
|
|
|
init_utsname()->version);
|
|
|
|
|
2010-11-19 21:11:09 +01:00
|
|
|
#define P(x) \
|
|
|
|
SEQ_printf(m, "%-40s: %Ld\n", #x, (long long)(x))
|
|
|
|
#define PN(x) \
|
|
|
|
SEQ_printf(m, "%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
|
|
|
|
PN(ktime);
|
|
|
|
PN(sched_clk);
|
|
|
|
PN(cpu_clk);
|
|
|
|
P(jiffies);
|
|
|
|
#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
|
2013-11-28 19:38:42 +01:00
|
|
|
P(sched_clock_stable());
|
2010-11-19 21:11:09 +01:00
|
|
|
#endif
|
|
|
|
#undef PN
|
|
|
|
#undef P
|
|
|
|
|
|
|
|
SEQ_printf(m, "\n");
|
|
|
|
SEQ_printf(m, "sysctl_sched\n");
|
2007-07-09 18:52:00 +02:00
|
|
|
|
2007-10-15 17:00:10 +02:00
|
|
|
#define P(x) \
|
2007-10-15 17:00:10 +02:00
|
|
|
SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
|
2007-10-15 17:00:10 +02:00
|
|
|
#define PN(x) \
|
2007-10-15 17:00:10 +02:00
|
|
|
SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
|
2023-05-31 13:58:48 +02:00
|
|
|
PN(sysctl_sched_base_slice);
|
2007-10-15 17:00:10 +02:00
|
|
|
P(sysctl_sched_features);
|
|
|
|
#undef PN
|
|
|
|
#undef P
|
|
|
|
|
2013-02-21 15:15:09 -08:00
|
|
|
SEQ_printf(m, " .%-40s: %d (%s)\n",
|
|
|
|
"sysctl_sched_tunable_scaling",
|
2009-11-30 12:16:47 +01:00
|
|
|
sysctl_sched_tunable_scaling,
|
|
|
|
sched_tunable_scaling_names[sysctl_sched_tunable_scaling]);
|
2013-02-21 15:15:09 -08:00
|
|
|
SEQ_printf(m, "\n");
|
|
|
|
}
|
2009-11-30 12:16:47 +01:00
|
|
|
|
2013-02-21 15:15:09 -08:00
|
|
|
/*
 * seq_file ->show() callback.
 *
 * @v is the token produced by sched_debug_start(): 1 selects the header and
 * N + 2 selects CPU N, so subtracting 2 yields -1 for the header and the
 * CPU number otherwise.  Always returns 0.
 */
static int sched_debug_show(struct seq_file *m, void *v)
{
	int cpu = (int)((unsigned long)v - 2);

	if (cpu == -1)
		sched_debug_header(m);
	else
		print_cpu(m, cpu);

	return 0;
}
|
|
|
|
|
2011-10-25 10:00:11 +02:00
|
|
|
void sysrq_sched_debug_show(void)
|
2007-07-09 18:52:00 +02:00
|
|
|
{
|
2013-02-21 15:15:09 -08:00
|
|
|
int cpu;
|
|
|
|
|
|
|
|
sched_debug_header(NULL);
|
2019-12-26 16:52:24 +08:00
|
|
|
for_each_online_cpu(cpu) {
|
|
|
|
/*
|
|
|
|
* Need to reset softlockup watchdogs on all CPUs, because
|
|
|
|
* another CPU might be blocked waiting for us to process
|
|
|
|
* an IPI or stop_machine.
|
|
|
|
*/
|
|
|
|
touch_nmi_watchdog();
|
|
|
|
touch_all_softlockup_watchdogs();
|
2013-02-21 15:15:09 -08:00
|
|
|
print_cpu(NULL, cpu);
|
2019-12-26 16:52:24 +08:00
|
|
|
}
|
2013-02-21 15:15:09 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2021-03-18 13:38:50 +01:00
|
|
|
* This iterator needs some explanation.
|
2013-02-21 15:15:09 -08:00
|
|
|
* It returns 1 for the header position.
|
sched: Clean up and harmonize the coding style of the scheduler code base
A good number of small style inconsistencies have accumulated
in the scheduler core, so do a pass over them to harmonize
all these details:
- fix speling in comments,
- use curly braces for multi-line statements,
- remove unnecessary parentheses from integer literals,
- capitalize consistently,
- remove stray newlines,
- add comments where necessary,
- remove invalid/unnecessary comments,
- align structure definitions and other data types vertically,
- add missing newlines for increased readability,
- fix vertical tabulation where it's misaligned,
- harmonize preprocessor conditional block labeling
and vertical alignment,
- remove line-breaks where they uglify the code,
- add newline after local variable definitions,
No change in functionality:
md5:
1191fa0a890cfa8132156d2959d7e9e2 built-in.o.before.asm
1191fa0a890cfa8132156d2959d7e9e2 built-in.o.after.asm
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2018-03-03 14:01:12 +01:00
|
|
|
* This means 2 is CPU 0.
|
|
|
|
* In a hotplugged system some CPUs, including CPU 0, may be missing so we have
|
|
|
|
* to use cpumask_* to iterate over the CPUs.
|
2013-02-21 15:15:09 -08:00
|
|
|
*/
|
|
|
|
static void *sched_debug_start(struct seq_file *file, loff_t *offset)
|
|
|
|
{
|
|
|
|
unsigned long n = *offset;
|
|
|
|
|
|
|
|
if (n == 0)
|
|
|
|
return (void *) 1;
|
|
|
|
|
|
|
|
n--;
|
|
|
|
|
|
|
|
if (n > 0)
|
|
|
|
n = cpumask_next(n - 1, cpu_online_mask);
|
|
|
|
else
|
|
|
|
n = cpumask_first(cpu_online_mask);
|
|
|
|
|
|
|
|
*offset = n + 1;
|
|
|
|
|
|
|
|
if (n < nr_cpu_ids)
|
|
|
|
return (void *)(unsigned long)(n + 2);
|
sched: Clean up and harmonize the coding style of the scheduler code base
A good number of small style inconsistencies have accumulated
in the scheduler core, so do a pass over them to harmonize
all these details:
- fix speling in comments,
- use curly braces for multi-line statements,
- remove unnecessary parentheses from integer literals,
- capitalize consistently,
- remove stray newlines,
- add comments where necessary,
- remove invalid/unnecessary comments,
- align structure definitions and other data types vertically,
- add missing newlines for increased readability,
- fix vertical tabulation where it's misaligned,
- harmonize preprocessor conditional block labeling
and vertical alignment,
- remove line-breaks where they uglify the code,
- add newline after local variable definitions,
No change in functionality:
md5:
1191fa0a890cfa8132156d2959d7e9e2 built-in.o.before.asm
1191fa0a890cfa8132156d2959d7e9e2 built-in.o.after.asm
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2018-03-03 14:01:12 +01:00
|
|
|
|
2013-02-21 15:15:09 -08:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * seq_file ->next() callback: advance *offset by one and let
 * sched_debug_start() translate the new position into a token.
 * @data (the previous token) is not used.
 */
static void *sched_debug_next(struct seq_file *file, void *data, loff_t *offset)
{
	*offset += 1;

	return sched_debug_start(file, offset);
}
|
|
|
|
|
|
|
|
/* seq_file ->stop() callback: intentionally does nothing. */
static void sched_debug_stop(struct seq_file *file, void *data)
{
}
|
|
|
|
|
|
|
|
static const struct seq_operations sched_debug_sops = {
|
sched: Clean up and harmonize the coding style of the scheduler code base
A good number of small style inconsistencies have accumulated
in the scheduler core, so do a pass over them to harmonize
all these details:
- fix speling in comments,
- use curly braces for multi-line statements,
- remove unnecessary parentheses from integer literals,
- capitalize consistently,
- remove stray newlines,
- add comments where necessary,
- remove invalid/unnecessary comments,
- align structure definitions and other data types vertically,
- add missing newlines for increased readability,
- fix vertical tabulation where it's misaligned,
- harmonize preprocessor conditional block labeling
and vertical alignment,
- remove line-breaks where they uglify the code,
- add newline after local variable definitions,
No change in functionality:
md5:
1191fa0a890cfa8132156d2959d7e9e2 built-in.o.before.asm
1191fa0a890cfa8132156d2959d7e9e2 built-in.o.after.asm
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2018-03-03 14:01:12 +01:00
|
|
|
.start = sched_debug_start,
|
|
|
|
.next = sched_debug_next,
|
|
|
|
.stop = sched_debug_stop,
|
|
|
|
.show = sched_debug_show,
|
2013-02-21 15:15:09 -08:00
|
|
|
};
|
|
|
|
|
2020-02-26 12:45:42 +00:00
|
|
|
/*
 * Printing helpers for the per-task output below.  All of them expect a
 * seq_file pointer named 'm' in scope; P(), PM() and PN() additionally
 * dereference a pointer named 'p'.
 *
 *   __PS(S, F)  - label S, value F printed as long long
 *   __P(F)      - like __PS(), labelled with the expression text
 *   P(F)        - like __P() for the member p->F
 *   PM(F, M)    - like P(), with the value masked by M
 *   __PSN(S, F) - label S, value F shown via SPLIT_NS()
 *   __PN(F)     - like __PSN(), labelled with the expression text
 *   PN(F)       - like __PN() for the member p->F
 */
#define __PS(S, F) \
	SEQ_printf(m, "%-45s:%21Ld\n", S, (long long)(F))
#define __P(F)		__PS(#F, F)
#define P(F)		__PS(#F, p->F)
#define PM(F, M)	__PS(#F, p->F & (M))
#define __PSN(S, F) \
	SEQ_printf(m, "%-45s:%14Ld.%06ld\n", S, SPLIT_NS((long long)(F)))
#define __PN(F)		__PSN(#F, F)
#define PN(F)		__PSN(#F, p->F)
|
2013-10-07 11:29:30 +01:00
|
|
|
|
|
|
|
|
2015-06-25 22:51:43 +05:30
|
|
|
#ifdef CONFIG_NUMA_BALANCING
|
|
|
|
/*
 * Emit one line of NUMA fault counters for @node through SEQ_printf()
 * (which writes to @m, or falls back to pr_cont() when @m is NULL).
 *
 * @tsf and @tpf are printed as task_shared / task_private, @gsf and @gpf
 * as group_shared / group_private. Note that the private counter is
 * printed before the shared one in each pair, although the parameters
 * are declared in the opposite order.
 *
 * Only built when CONFIG_NUMA_BALANCING is set.
 */
void print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
|
|
|
|
unsigned long tpf, unsigned long gsf, unsigned long gpf)
|
|
|
|
{
|
|
|
|
SEQ_printf(m, "numa_faults node=%d ", node);
|
2018-06-20 22:32:47 +05:30
|
|
|
SEQ_printf(m, "task_private=%lu task_shared=%lu ", tpf, tsf);
|
|
|
|
SEQ_printf(m, "group_private=%lu group_shared=%lu\n", gpf, gsf);
|
2015-06-25 22:51:43 +05:30
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
2013-10-07 11:29:30 +01:00
|
|
|
/*
 * Print the NUMA-balancing fields of task @p to @m.
 *
 * The whole body is guarded by CONFIG_NUMA_BALANCING, so this is an empty
 * function when that option is not set. mm->numa_scan_seq is printed only
 * when p->mm is non-NULL; the remaining fields are printed
 * unconditionally, followed by the output of show_numa_stats().
 */
static void sched_show_numa(struct task_struct *p, struct seq_file *m)
|
|
|
|
{
|
|
|
|
#ifdef CONFIG_NUMA_BALANCING
|
|
|
|
if (p->mm)
|
|
|
|
P(mm->numa_scan_seq);
|
|
|
|
|
2015-06-25 22:51:43 +05:30
|
|
|
P(numa_pages_migrated);
|
|
|
|
P(numa_preferred_nid);
|
|
|
|
P(total_numa_faults);
|
|
|
|
SEQ_printf(m, "current_node=%d, numa_group_id=%d\n",
|
|
|
|
task_node(p), task_numa_group_id(p));
|
|
|
|
show_numa_stats(p, m);
|
2013-10-07 11:29:30 +01:00
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2017-08-06 14:41:41 +10:00
|
|
|
void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
|
|
|
|
struct seq_file *m)
|
2007-07-09 18:52:00 +02:00
|
|
|
{
|
2007-10-15 17:00:18 +02:00
|
|
|
unsigned long nr_switches;
|
2007-07-09 18:52:00 +02:00
|
|
|
|
2017-08-06 14:41:41 +10:00
|
|
|
SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns),
|
2010-05-26 14:43:22 -07:00
|
|
|
get_nr_threads(p));
|
2007-10-15 17:00:18 +02:00
|
|
|
SEQ_printf(m,
|
2013-06-27 22:20:05 +05:30
|
|
|
"---------------------------------------------------------"
|
|
|
|
"----------\n");
|
2020-02-26 12:45:42 +00:00
|
|
|
|
sched: Make struct sched_statistics independent of fair sched class
If we want to use the schedstats facility to trace other sched classes, we
should make it independent of fair sched class. The struct sched_statistics
is the scheduler statistics of a task_struct or a task_group. So we can
move it into struct task_struct and struct task_group to achieve the goal.
After the patch, schedstats are organized as follows,
struct task_struct {
...
struct sched_entity se;
struct sched_rt_entity rt;
struct sched_dl_entity dl;
...
struct sched_statistics stats;
...
};
Regarding the task group, schedstats is only supported for fair group
sched, and a new struct sched_entity_stats is introduced, suggested by
Peter -
struct sched_entity_stats {
struct sched_entity se;
struct sched_statistics stats;
} __no_randomize_layout;
Then with the se in a task_group, we can easily get the stats.
The sched_statistics members may be frequently modified when schedstats is
enabled, in order to avoid impacting on random data which may in the same
cacheline with them, the struct sched_statistics is defined as cacheline
aligned.
As this patch changes the core struct of scheduler, so I verified the
performance it may impact on the scheduler with 'perf bench sched
pipe', suggested by Mel. Below is the result, in which all the values
are in usecs/op.
Before After
kernel.sched_schedstats=0 5.2~5.4 5.2~5.4
kernel.sched_schedstats=1 5.3~5.5 5.3~5.5
[These data are a little different from the earlier version; that is
because my old test machine was destroyed, so I had to use a new,
different test machine.]
Almost no impact on the sched performance.
No functional change.
[lkp@intel.com: reported build failure in earlier version]
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mel Gorman <mgorman@suse.de>
Link: https://lore.kernel.org/r/20210905143547.4668-3-laoar.shao@gmail.com
2021-09-05 14:35:41 +00:00
|
|
|
#define P_SCHEDSTAT(F) __PS(#F, schedstat_val(p->stats.F))
|
|
|
|
#define PN_SCHEDSTAT(F) __PSN(#F, schedstat_val(p->stats.F))
|
2007-07-09 18:52:00 +02:00
|
|
|
|
2007-10-15 17:00:08 +02:00
|
|
|
PN(se.exec_start);
|
|
|
|
PN(se.vruntime);
|
|
|
|
PN(se.sum_exec_runtime);
|
2007-08-02 17:41:40 +02:00
|
|
|
|
2007-10-15 17:00:18 +02:00
|
|
|
nr_switches = p->nvcsw + p->nivcsw;
|
|
|
|
|
|
|
|
P(se.nr_migrations);
|
|
|
|
|
2016-02-05 09:08:36 +00:00
|
|
|
if (schedstat_enabled()) {
|
2007-10-15 17:00:18 +02:00
|
|
|
u64 avg_atom, avg_per_cpu;
|
|
|
|
|
sched: Make struct sched_statistics independent of fair sched class
If we want to use the schedstats facility to trace other sched classes, we
should make it independent of fair sched class. The struct sched_statistics
is the scheduler statistics of a task_struct or a task_group. So we can
move it into struct task_struct and struct task_group to achieve the goal.
After the patch, schedstats are organized as follows,
struct task_struct {
...
struct sched_entity se;
struct sched_rt_entity rt;
struct sched_dl_entity dl;
...
struct sched_statistics stats;
...
};
Regarding the task group, schedstats is only supported for fair group
sched, and a new struct sched_entity_stats is introduced, suggested by
Peter -
struct sched_entity_stats {
struct sched_entity se;
struct sched_statistics stats;
} __no_randomize_layout;
Then with the se in a task_group, we can easily get the stats.
The sched_statistics members may be frequently modified when schedstats is
enabled, in order to avoid impacting on random data which may in the same
cacheline with them, the struct sched_statistics is defined as cacheline
aligned.
As this patch changes the core struct of scheduler, so I verified the
performance it may impact on the scheduler with 'perf bench sched
pipe', suggested by Mel. Below is the result, in which all the values
are in usecs/op.
Before After
kernel.sched_schedstats=0 5.2~5.4 5.2~5.4
kernel.sched_schedstats=1 5.3~5.5 5.3~5.5
[These data are a little different from the earlier version; that is
because my old test machine was destroyed, so I had to use a new,
different test machine.]
Almost no impact on the sched performance.
No functional change.
[lkp@intel.com: reported build failure in earlier version]
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mel Gorman <mgorman@suse.de>
Link: https://lore.kernel.org/r/20210905143547.4668-3-laoar.shao@gmail.com
2021-09-05 14:35:41 +00:00
|
|
|
PN_SCHEDSTAT(sum_sleep_runtime);
|
2021-09-05 14:35:43 +00:00
|
|
|
PN_SCHEDSTAT(sum_block_runtime);
|
sched: Make struct sched_statistics independent of fair sched class
If we want to use the schedstats facility to trace other sched classes, we
should make it independent of fair sched class. The struct sched_statistics
is the scheduler statistics of a task_struct or a task_group. So we can
move it into struct task_struct and struct task_group to achieve the goal.
After the patch, schedstats are organized as follows,
struct task_struct {
...
struct sched_entity se;
struct sched_rt_entity rt;
struct sched_dl_entity dl;
...
struct sched_statistics stats;
...
};
Regarding the task group, schedstats is only supported for fair group
sched, and a new struct sched_entity_stats is introduced, suggested by
Peter -
struct sched_entity_stats {
struct sched_entity se;
struct sched_statistics stats;
} __no_randomize_layout;
Then with the se in a task_group, we can easily get the stats.
The sched_statistics members may be frequently modified when schedstats is
enabled, in order to avoid impacting on random data which may in the same
cacheline with them, the struct sched_statistics is defined as cacheline
aligned.
As this patch changes the core struct of scheduler, so I verified the
performance it may impact on the scheduler with 'perf bench sched
pipe', suggested by Mel. Below is the result, in which all the values
are in usecs/op.
Before After
kernel.sched_schedstats=0 5.2~5.4 5.2~5.4
kernel.sched_schedstats=1 5.3~5.5 5.3~5.5
[These data are a little different from the earlier version; that is
because my old test machine was destroyed, so I had to use a new,
different test machine.]
Almost no impact on the sched performance.
No functional change.
[lkp@intel.com: reported build failure in earlier version]
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mel Gorman <mgorman@suse.de>
Link: https://lore.kernel.org/r/20210905143547.4668-3-laoar.shao@gmail.com
2021-09-05 14:35:41 +00:00
|
|
|
PN_SCHEDSTAT(wait_start);
|
|
|
|
PN_SCHEDSTAT(sleep_start);
|
|
|
|
PN_SCHEDSTAT(block_start);
|
|
|
|
PN_SCHEDSTAT(sleep_max);
|
|
|
|
PN_SCHEDSTAT(block_max);
|
|
|
|
PN_SCHEDSTAT(exec_max);
|
|
|
|
PN_SCHEDSTAT(slice_max);
|
|
|
|
PN_SCHEDSTAT(wait_max);
|
|
|
|
PN_SCHEDSTAT(wait_sum);
|
|
|
|
P_SCHEDSTAT(wait_count);
|
|
|
|
PN_SCHEDSTAT(iowait_sum);
|
|
|
|
P_SCHEDSTAT(iowait_count);
|
|
|
|
P_SCHEDSTAT(nr_migrations_cold);
|
|
|
|
P_SCHEDSTAT(nr_failed_migrations_affine);
|
|
|
|
P_SCHEDSTAT(nr_failed_migrations_running);
|
|
|
|
P_SCHEDSTAT(nr_failed_migrations_hot);
|
|
|
|
P_SCHEDSTAT(nr_forced_migrations);
|
|
|
|
P_SCHEDSTAT(nr_wakeups);
|
|
|
|
P_SCHEDSTAT(nr_wakeups_sync);
|
|
|
|
P_SCHEDSTAT(nr_wakeups_migrate);
|
|
|
|
P_SCHEDSTAT(nr_wakeups_local);
|
|
|
|
P_SCHEDSTAT(nr_wakeups_remote);
|
|
|
|
P_SCHEDSTAT(nr_wakeups_affine);
|
|
|
|
P_SCHEDSTAT(nr_wakeups_affine_attempts);
|
|
|
|
P_SCHEDSTAT(nr_wakeups_passive);
|
|
|
|
P_SCHEDSTAT(nr_wakeups_idle);
|
2016-02-05 09:08:36 +00:00
|
|
|
|
2007-10-15 17:00:18 +02:00
|
|
|
avg_atom = p->se.sum_exec_runtime;
|
|
|
|
if (nr_switches)
|
2014-06-14 15:00:09 +02:00
|
|
|
avg_atom = div64_ul(avg_atom, nr_switches);
|
2007-10-15 17:00:18 +02:00
|
|
|
else
|
|
|
|
avg_atom = -1LL;
|
|
|
|
|
|
|
|
avg_per_cpu = p->se.sum_exec_runtime;
|
2007-11-28 15:52:56 +01:00
|
|
|
if (p->se.nr_migrations) {
|
2008-05-01 04:34:28 -07:00
|
|
|
avg_per_cpu = div64_u64(avg_per_cpu,
|
|
|
|
p->se.nr_migrations);
|
2007-11-28 15:52:56 +01:00
|
|
|
} else {
|
2007-10-15 17:00:18 +02:00
|
|
|
avg_per_cpu = -1LL;
|
2007-11-28 15:52:56 +01:00
|
|
|
}
|
2007-10-15 17:00:18 +02:00
|
|
|
|
|
|
|
__PN(avg_atom);
|
|
|
|
__PN(avg_per_cpu);
|
2021-10-18 13:34:28 -07:00
|
|
|
|
|
|
|
#ifdef CONFIG_SCHED_CORE
|
|
|
|
PN_SCHEDSTAT(core_forceidle_sum);
|
|
|
|
#endif
|
2007-10-15 17:00:18 +02:00
|
|
|
}
|
2016-06-17 12:43:26 -05:00
|
|
|
|
2007-10-15 17:00:18 +02:00
|
|
|
__P(nr_switches);
|
2020-02-26 12:45:42 +00:00
|
|
|
__PS("nr_voluntary_switches", p->nvcsw);
|
|
|
|
__PS("nr_involuntary_switches", p->nivcsw);
|
2007-10-15 17:00:18 +02:00
|
|
|
|
2007-07-09 18:52:00 +02:00
|
|
|
P(se.load.weight);
|
2013-06-28 19:10:35 +08:00
|
|
|
#ifdef CONFIG_SMP
|
sched/fair: Rewrite runnable load and utilization average tracking
The idea of runnable load average (let runnable time contribute to weight)
was proposed by Paul Turner and Ben Segall, and it is still followed by
this rewrite. This rewrite aims to solve the following issues:
1. cfs_rq's load average (namely runnable_load_avg and blocked_load_avg) is
updated at the granularity of an entity at a time, which results in the
cfs_rq's load average is stale or partially updated: at any time, only
one entity is up to date, all other entities are effectively lagging
behind. This is undesirable.
To illustrate, if we have n runnable entities in the cfs_rq, as time
elapses, they certainly become outdated:
t0: cfs_rq { e1_old, e2_old, ..., en_old }
and when we update:
t1: update e1, then we have cfs_rq { e1_new, e2_old, ..., en_old }
t2: update e2, then we have cfs_rq { e1_old, e2_new, ..., en_old }
...
We solve this by combining all runnable entities' load averages together
in cfs_rq's avg, and update the cfs_rq's avg as a whole. This is based
on the fact that if we regard the update as a function, then:
w * update(e) = update(w * e) and
update(e1) + update(e2) = update(e1 + e2), then
w1 * update(e1) + w2 * update(e2) = update(w1 * e1 + w2 * e2)
therefore, by this rewrite, we have an entirely updated cfs_rq at the
time we update it:
t1: update cfs_rq { e1_new, e2_new, ..., en_new }
t2: update cfs_rq { e1_new, e2_new, ..., en_new }
...
2. cfs_rq's load average is different between top rq->cfs_rq and other
task_group's per CPU cfs_rqs in whether or not blocked_load_average
contributes to the load.
The basic idea behind runnable load average (the same for utilization)
is that the blocked state is taken into account as opposed to only
accounting for the currently runnable state. Therefore, the average
should include both the runnable/running and blocked load averages.
This rewrite does that.
In addition, we also combine runnable/running and blocked averages
of all entities into the cfs_rq's average, and update it together at
once. This is based on the fact that:
update(runnable) + update(blocked) = update(runnable + blocked)
This significantly reduces the code as we don't need to separately
maintain/update runnable/running load and blocked load.
3. How task_group entities' share is calculated is complex and imprecise.
We reduce the complexity in this rewrite to allow a very simple rule:
the task_group's load_avg is aggregated from its per CPU cfs_rqs's
load_avgs. Then group entity's weight is simply proportional to its
own cfs_rq's load_avg / task_group's load_avg. To illustrate,
if a task_group has { cfs_rq1, cfs_rq2, ..., cfs_rqn }, then,
task_group_avg = cfs_rq1_avg + cfs_rq2_avg + ... + cfs_rqn_avg, then
cfs_rqx's entity's share = cfs_rqx_avg / task_group_avg * task_group's share
To sum up, this rewrite in principle is equivalent to the current one, but
fixes the issues described above. Turns out, it significantly reduces the
code complexity and hence increases clarity and efficiency. In addition,
the new averages are more smooth/continuous (no spurious spikes and valleys)
and updated more consistently and quickly to reflect the load dynamics.
As a result, we have less load tracking overhead, better performance,
and especially better power efficiency due to more balanced load.
Signed-off-by: Yuyang Du <yuyang.du@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: arjan@linux.intel.com
Cc: bsegall@google.com
Cc: dietmar.eggemann@arm.com
Cc: fengguang.wu@intel.com
Cc: len.brown@intel.com
Cc: morten.rasmussen@arm.com
Cc: pjt@google.com
Cc: rafael.j.wysocki@intel.com
Cc: umgwanakikbuti@gmail.com
Cc: vincent.guittot@linaro.org
Link: http://lkml.kernel.org/r/1436918682-4971-3-git-send-email-yuyang.du@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-07-15 08:04:37 +08:00
|
|
|
P(se.avg.load_sum);
|
2020-02-24 09:52:18 +00:00
|
|
|
P(se.avg.runnable_sum);
|
sched/fair: Rewrite runnable load and utilization average tracking
The idea of runnable load average (let runnable time contribute to weight)
was proposed by Paul Turner and Ben Segall, and it is still followed by
this rewrite. This rewrite aims to solve the following issues:
1. cfs_rq's load average (namely runnable_load_avg and blocked_load_avg) is
updated at the granularity of an entity at a time, which results in the
cfs_rq's load average is stale or partially updated: at any time, only
one entity is up to date, all other entities are effectively lagging
behind. This is undesirable.
To illustrate, if we have n runnable entities in the cfs_rq, as time
elapses, they certainly become outdated:
t0: cfs_rq { e1_old, e2_old, ..., en_old }
and when we update:
t1: update e1, then we have cfs_rq { e1_new, e2_old, ..., en_old }
t2: update e2, then we have cfs_rq { e1_old, e2_new, ..., en_old }
...
We solve this by combining all runnable entities' load averages together
in cfs_rq's avg, and update the cfs_rq's avg as a whole. This is based
on the fact that if we regard the update as a function, then:
w * update(e) = update(w * e) and
update(e1) + update(e2) = update(e1 + e2), then
w1 * update(e1) + w2 * update(e2) = update(w1 * e1 + w2 * e2)
therefore, by this rewrite, we have an entirely updated cfs_rq at the
time we update it:
t1: update cfs_rq { e1_new, e2_new, ..., en_new }
t2: update cfs_rq { e1_new, e2_new, ..., en_new }
...
2. cfs_rq's load average is different between top rq->cfs_rq and other
task_group's per CPU cfs_rqs in whether or not blocked_load_average
contributes to the load.
The basic idea behind runnable load average (the same for utilization)
is that the blocked state is taken into account as opposed to only
accounting for the currently runnable state. Therefore, the average
should include both the runnable/running and blocked load averages.
This rewrite does that.
In addition, we also combine runnable/running and blocked averages
of all entities into the cfs_rq's average, and update it together at
once. This is based on the fact that:
update(runnable) + update(blocked) = update(runnable + blocked)
This significantly reduces the code as we don't need to separately
maintain/update runnable/running load and blocked load.
3. How task_group entities' share is calculated is complex and imprecise.
We reduce the complexity in this rewrite to allow a very simple rule:
the task_group's load_avg is aggregated from its per CPU cfs_rqs's
load_avgs. Then group entity's weight is simply proportional to its
own cfs_rq's load_avg / task_group's load_avg. To illustrate,
if a task_group has { cfs_rq1, cfs_rq2, ..., cfs_rqn }, then,
task_group_avg = cfs_rq1_avg + cfs_rq2_avg + ... + cfs_rqn_avg, then
cfs_rqx's entity's share = cfs_rqx_avg / task_group_avg * task_group's share
To sum up, this rewrite in principle is equivalent to the current one, but
fixes the issues described above. Turns out, it significantly reduces the
code complexity and hence increases clarity and efficiency. In addition,
the new averages are more smooth/continuous (no spurious spikes and valleys)
and updated more consistently and quickly to reflect the load dynamics.
As a result, we have less load tracking overhead, better performance,
and especially better power efficiency due to more balanced load.
Signed-off-by: Yuyang Du <yuyang.du@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: arjan@linux.intel.com
Cc: bsegall@google.com
Cc: dietmar.eggemann@arm.com
Cc: fengguang.wu@intel.com
Cc: len.brown@intel.com
Cc: morten.rasmussen@arm.com
Cc: pjt@google.com
Cc: rafael.j.wysocki@intel.com
Cc: umgwanakikbuti@gmail.com
Cc: vincent.guittot@linaro.org
Link: http://lkml.kernel.org/r/1436918682-4971-3-git-send-email-yuyang.du@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-07-15 08:04:37 +08:00
|
|
|
P(se.avg.util_sum);
|
|
|
|
P(se.avg.load_avg);
|
2020-02-24 09:52:18 +00:00
|
|
|
P(se.avg.runnable_avg);
|
sched/fair: Rewrite runnable load and utilization average tracking
The idea of runnable load average (let runnable time contribute to weight)
was proposed by Paul Turner and Ben Segall, and it is still followed by
this rewrite. This rewrite aims to solve the following issues:
1. cfs_rq's load average (namely runnable_load_avg and blocked_load_avg) is
updated at the granularity of an entity at a time, which results in the
cfs_rq's load average is stale or partially updated: at any time, only
one entity is up to date, all other entities are effectively lagging
behind. This is undesirable.
To illustrate, if we have n runnable entities in the cfs_rq, as time
elapses, they certainly become outdated:
t0: cfs_rq { e1_old, e2_old, ..., en_old }
and when we update:
t1: update e1, then we have cfs_rq { e1_new, e2_old, ..., en_old }
t2: update e2, then we have cfs_rq { e1_old, e2_new, ..., en_old }
...
We solve this by combining all runnable entities' load averages together
in cfs_rq's avg, and update the cfs_rq's avg as a whole. This is based
on the fact that if we regard the update as a function, then:
w * update(e) = update(w * e) and
update(e1) + update(e2) = update(e1 + e2), then
w1 * update(e1) + w2 * update(e2) = update(w1 * e1 + w2 * e2)
therefore, by this rewrite, we have an entirely updated cfs_rq at the
time we update it:
t1: update cfs_rq { e1_new, e2_new, ..., en_new }
t2: update cfs_rq { e1_new, e2_new, ..., en_new }
...
2. cfs_rq's load average is different between top rq->cfs_rq and other
task_group's per CPU cfs_rqs in whether or not blocked_load_average
contributes to the load.
The basic idea behind runnable load average (the same for utilization)
is that the blocked state is taken into account as opposed to only
accounting for the currently runnable state. Therefore, the average
should include both the runnable/running and blocked load averages.
This rewrite does that.
In addition, we also combine runnable/running and blocked averages
of all entities into the cfs_rq's average, and update it together at
once. This is based on the fact that:
update(runnable) + update(blocked) = update(runnable + blocked)
This significantly reduces the code as we don't need to separately
maintain/update runnable/running load and blocked load.
3. How task_group entities' share is calculated is complex and imprecise.
We reduce the complexity in this rewrite to allow a very simple rule:
the task_group's load_avg is aggregated from its per CPU cfs_rqs's
load_avgs. Then group entity's weight is simply proportional to its
own cfs_rq's load_avg / task_group's load_avg. To illustrate,
if a task_group has { cfs_rq1, cfs_rq2, ..., cfs_rqn }, then,
task_group_avg = cfs_rq1_avg + cfs_rq2_avg + ... + cfs_rqn_avg, then
cfs_rqx's entity's share = cfs_rqx_avg / task_group_avg * task_group's share
To sum up, this rewrite in principle is equivalent to the current one, but
fixes the issues described above. Turns out, it significantly reduces the
code complexity and hence increases clarity and efficiency. In addition,
the new averages are more smooth/continuous (no spurious spikes and valleys)
and updated more consistently and quickly to reflect the load dynamics.
As a result, we have less load tracking overhead, better performance,
and especially better power efficiency due to more balanced load.
Signed-off-by: Yuyang Du <yuyang.du@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: arjan@linux.intel.com
Cc: bsegall@google.com
Cc: dietmar.eggemann@arm.com
Cc: fengguang.wu@intel.com
Cc: len.brown@intel.com
Cc: morten.rasmussen@arm.com
Cc: pjt@google.com
Cc: rafael.j.wysocki@intel.com
Cc: umgwanakikbuti@gmail.com
Cc: vincent.guittot@linaro.org
Link: http://lkml.kernel.org/r/1436918682-4971-3-git-send-email-yuyang.du@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-07-15 08:04:37 +08:00
|
|
|
P(se.avg.util_avg);
|
|
|
|
P(se.avg.last_update_time);
|
2023-12-01 17:16:52 +01:00
|
|
|
PM(se.avg.util_est, ~UTIL_AVG_UNCHANGED);
|
2020-02-26 12:45:43 +00:00
|
|
|
#endif
|
|
|
|
#ifdef CONFIG_UCLAMP_TASK
|
2020-05-10 18:26:41 +05:30
|
|
|
__PS("uclamp.min", p->uclamp_req[UCLAMP_MIN].value);
|
|
|
|
__PS("uclamp.max", p->uclamp_req[UCLAMP_MAX].value);
|
2020-02-26 12:45:43 +00:00
|
|
|
__PS("effective uclamp.min", uclamp_eff_value(p, UCLAMP_MIN));
|
|
|
|
__PS("effective uclamp.max", uclamp_eff_value(p, UCLAMP_MAX));
|
2013-06-25 13:33:36 +05:30
|
|
|
#endif
|
2007-07-09 18:52:00 +02:00
|
|
|
P(policy);
|
|
|
|
P(prio);
|
2018-11-05 16:51:55 +05:30
|
|
|
if (task_has_dl_policy(p)) {
|
2016-10-26 11:17:17 +02:00
|
|
|
P(dl.runtime);
|
|
|
|
P(dl.deadline);
|
|
|
|
}
|
sched_ext: Implement BPF extensible scheduler class
Implement a new scheduler class sched_ext (SCX), which allows scheduling
policies to be implemented as BPF programs to achieve the following:
1. Ease of experimentation and exploration: Enabling rapid iteration of new
scheduling policies.
2. Customization: Building application-specific schedulers which implement
policies that are not applicable to general-purpose schedulers.
3. Rapid scheduler deployments: Non-disruptive swap outs of scheduling
policies in production environments.
sched_ext leverages BPF’s struct_ops feature to define a structure which
exports function callbacks and flags to BPF programs that wish to implement
scheduling policies. The struct_ops structure exported by sched_ext is
struct sched_ext_ops, and is conceptually similar to struct sched_class. The
role of sched_ext is to map the complex sched_class callbacks to the more
simple and ergonomic struct sched_ext_ops callbacks.
For more detailed discussion on the motivations and overview, please refer
to the cover letter.
Later patches will also add several example schedulers and documentation.
This patch implements the minimum core framework to enable implementation of
BPF schedulers. Subsequent patches will gradually add functionalities
including safety guarantee mechanisms, nohz and cgroup support.
include/linux/sched/ext.h defines struct sched_ext_ops. With the comment on
top, each operation should be self-explanatory. The followings are worth
noting:
- Both "sched_ext" and its shorthand "scx" are used. If the identifier
already has "sched" in it, "ext" is used; otherwise, "scx".
- In sched_ext_ops, only .name is mandatory. Every operation is optional and
if omitted a simple but functional default behavior is provided.
- A new policy constant SCHED_EXT is added and a task can select sched_ext
by invoking sched_setscheduler(2) with the new policy constant. However,
if the BPF scheduler is not loaded, SCHED_EXT is the same as SCHED_NORMAL
and the task is scheduled by CFS. When the BPF scheduler is loaded, all
tasks which have the SCHED_EXT policy are switched to sched_ext.
- To bridge the workflow imbalance between the scheduler core and
sched_ext_ops callbacks, sched_ext uses simple FIFOs called dispatch
queues (dsq's). By default, there is one global dsq (SCX_DSQ_GLOBAL), and
one local per-CPU dsq (SCX_DSQ_LOCAL). SCX_DSQ_GLOBAL is provided for
convenience and need not be used by a scheduler that doesn't require it.
SCX_DSQ_LOCAL is the per-CPU FIFO that sched_ext pulls from when putting
the next task on the CPU. The BPF scheduler can manage an arbitrary number
of dsq's using scx_bpf_create_dsq() and scx_bpf_destroy_dsq().
- sched_ext guarantees system integrity no matter what the BPF scheduler
does. To enable this, each task's ownership is tracked through
p->scx.ops_state and all tasks are put on scx_tasks list. The disable path
can always recover and revert all tasks back to CFS. See p->scx.ops_state
and scx_tasks.
- A task is not tied to its rq while enqueued. This decouples CPU selection
from queueing and allows sharing a scheduling queue across an arbitrary
subset of CPUs. This adds some complexities as a task may need to be
bounced between rq's right before it starts executing. See
dispatch_to_local_dsq() and move_task_to_local_dsq().
- One complication that arises from the above weak association between task
and rq is that synchronizing with dequeue() gets complicated as dequeue()
may happen anytime while the task is enqueued and the dispatch path might
need to release the rq lock to transfer the task. Solving this requires a
bit of complexity. See the logic around p->scx.sticky_cpu and
p->scx.ops_qseq.
- Both enable and disable paths are a bit complicated. The enable path
switches all tasks without blocking to avoid issues which can arise from
partially switched states (e.g. the switching task itself being starved).
The disable path can't trust the BPF scheduler at all, so it also has to
guarantee forward progress without blocking. See scx_ops_enable() and
scx_ops_disable_workfn().
- When sched_ext is disabled, static_branches are used to shut down the
entry points from hot paths.
v7: - scx_ops_bypass() was incorrectly and unnecessarily trying to grab
scx_ops_enable_mutex which can lead to deadlocks in the disable path.
Fixed.
- Fixed TASK_DEAD handling bug in scx_ops_enable() path which could lead
to use-after-free.
- Consolidated per-cpu variable usages and other cleanups.
v6: - SCX_NR_ONLINE_OPS replaced with SCX_OPI_*_BEGIN/END so that multiple
groups can be expressed. Later CPU hotplug operations are put into
their own group.
- SCX_OPS_DISABLING state is replaced with the new bypass mechanism
which allows temporarily putting the system into simple FIFO
scheduling mode bypassing the BPF scheduler. In addition to the shut
down path, this will also be used to isolate the BPF scheduler across
PM events. Enabling and disabling the bypass mode requires iterating
all runnable tasks. rq->scx.runnable_list addition is moved from the
later watchdog patch.
- ops.prep_enable() is replaced with ops.init_task() and
ops.enable/disable() are now called whenever the task enters and
leaves sched_ext instead of when the task becomes schedulable on
sched_ext and stops being so. A new operation - ops.exit_task() - is
called when the task stops being schedulable on sched_ext.
- scx_bpf_dispatch() can now be called from ops.select_cpu() too. This
removes the need for communicating local dispatch decision made by
ops.select_cpu() to ops.enqueue() via per-task storage.
SCX_KF_SELECT_CPU is added to support the change.
- SCX_TASK_ENQ_LOCAL which told the BPF scheduler that
scx_select_cpu_dfl() wants the task to be dispatched to the local DSQ
was removed. Instead, scx_bpf_select_cpu_dfl() now dispatches directly
if it finds a suitable idle CPU. If such behavior is not desired,
users can use scx_bpf_select_cpu_dfl() which returns the verdict in a
bool out param.
- scx_select_cpu_dfl() was mishandling WAKE_SYNC and could end up
queueing many tasks on a local DSQ which makes tasks to execute in
order while other CPUs stay idle which made some hackbench numbers
really bad. Fixed.
- The current state of sched_ext can now be monitored through files
under /sys/sched_ext instead of /sys/kernel/debug/sched/ext. This is
to enable monitoring on kernels which don't enable debugfs.
- sched_ext wasn't telling BPF that ops.dispatch()'s @prev argument may
be NULL and a BPF scheduler which derefs the pointer without checking
could crash the kernel. Tell BPF. This is currently a bit ugly. A
better way to annotate this is expected in the future.
- scx_exit_info updated to carry pointers to message buffers instead of
embedding them directly. This decouples buffer sizes from API so that
they can be changed without breaking compatibility.
- exit_code added to scx_exit_info. This is used to indicate different
exit conditions on non-error exits and will be used to handle e.g. CPU
hotplugs.
- The patch "sched_ext: Allow BPF schedulers to switch all eligible
tasks into sched_ext" is folded in and the interface is changed so
that partial switching is indicated with a new ops flag
%SCX_OPS_SWITCH_PARTIAL. This makes scx_bpf_switch_all() unnecessary
and in turn SCX_KF_INIT. ops.init() is now called with
SCX_KF_SLEEPABLE.
- Code reorganized so that only the parts necessary to integrate with
the rest of the kernel are in the header files.
- Changes to reflect the BPF and other kernel changes including the
addition of bpf_sched_ext_ops.cfi_stubs.
v5: - To accommodate 32bit configs, p->scx.ops_state is now atomic_long_t
instead of atomic64_t and scx_dsp_buf_ent.qseq which uses
load_acquire/store_release is now unsigned long instead of u64.
- Fix the bug where bpf_scx_btf_struct_access() was allowing write
access to arbitrary fields.
- Distinguish kfuncs which can be called from any sched_ext ops and from
anywhere. e.g. scx_bpf_pick_idle_cpu() can now be called only from
sched_ext ops.
- Rename "type" to "kind" in scx_exit_info to make it easier to use on
languages in which "type" is a reserved keyword.
- Since cff9b2332ab7 ("kernel/sched: Modify initial boot task idle
setup"), PF_IDLE is not set on idle tasks which haven't been online
yet which made scx_task_iter_next_filtered() include those idle tasks
in iterations leading to oopses. Update scx_task_iter_next_filtered()
to directly test p->sched_class against idle_sched_class instead of
using is_idle_task() which tests PF_IDLE.
- Other updates to match upstream changes such as adding const to
set_cpumask() param and renaming check_preempt_curr() to
wakeup_preempt().
v4: - SCHED_CHANGE_BLOCK replaced with the previous
sched_deq_and_put_task()/sched_enq_and_set_task() pair. This is
because upstream is adopting a different generic cleanup mechanism.
Once that lands, the code will be adapted accordingly.
- task_on_scx() used to test whether a task should be switched into SCX,
which is confusing. Renamed to task_should_scx(). task_on_scx() now
tests whether a task is currently on SCX.
- scx_has_idle_cpus is barely used anymore and replaced with direct
check on the idle cpumask.
- SCX_PICK_IDLE_CORE added and scx_pick_idle_cpu() improved to prefer
fully idle cores.
- ops.enable() now sees up-to-date p->scx.weight value.
- ttwu_queue path is disabled for tasks on SCX to avoid confusing BPF
schedulers expecting ->select_cpu() call.
- Use cpu_smt_mask() instead of topology_sibling_cpumask() like the rest
of the scheduler.
v3: - ops.set_weight() added to allow BPF schedulers to track weight changes
without polling p->scx.weight.
- move_task_to_local_dsq() was losing SCX-specific enq_flags when
enqueueing the task on the target dsq because it goes through
activate_task() which loses the upper 32bit of the flags. Carry the
flags through rq->scx.extra_enq_flags.
- scx_bpf_dispatch(), scx_bpf_pick_idle_cpu(), scx_bpf_task_running()
and scx_bpf_task_cpu() now use the new KF_RCU instead of
KF_TRUSTED_ARGS to make it easier for BPF schedulers to call them.
- The kfunc helper access control mechanism implemented through
sched_ext_entity.kf_mask is improved. Now SCX_CALL_OP*() is always
used when invoking scx_ops operations.
v2: - balance_scx_on_up() is dropped. Instead, on UP, balance_scx() is
called from put_prev_task_scx() and pick_next_task_scx() as necessary.
To determine whether balance_scx() should be called from
put_prev_task_scx(), SCX_TASK_DEQD_FOR_SLEEP flag is added. See the
comment in put_prev_task_scx() for details.
- sched_deq_and_put_task() / sched_enq_and_set_task() sequences replaced
with SCHED_CHANGE_BLOCK().
- Unused all_dsqs list removed. This was a left-over from previous
iterations.
- p->scx.kf_mask is added to track and enforce which kfunc helpers are
allowed. Also, init/exit sequences are updated to make some kfuncs
always safe to call regardless of the current BPF scheduler state.
Combined, this should make all the kfuncs safe.
- BPF now supports sleepable struct_ops operations. Hacky workaround
removed and operations and kfunc helpers are tagged appropriately.
- BPF now supports bitmask / cpumask helpers. scx_bpf_get_idle_cpumask()
and friends are added so that BPF schedulers can use the idle masks
with the generic helpers. This replaces the hacky kfunc helpers added
by a separate patch in V1.
- CONFIG_SCHED_CLASS_EXT can no longer be enabled if SCHED_CORE is
enabled. This restriction will be removed by a later patch which adds
core-sched support.
- Add MAINTAINERS entries and other misc changes.
Signed-off-by: Tejun Heo <tj@kernel.org>
Co-authored-by: David Vernet <dvernet@meta.com>
Acked-by: Josh Don <joshdon@google.com>
Acked-by: Hao Luo <haoluo@google.com>
Acked-by: Barret Rhoden <brho@google.com>
Cc: Andrea Righi <andrea.righi@canonical.com>
2024-06-18 10:09:17 -10:00
|
|
|
#ifdef CONFIG_SCHED_CLASS_EXT
|
|
|
|
__PS("ext.enabled", task_on_scx(p));
|
|
|
|
#endif
|
2016-06-17 12:43:26 -05:00
|
|
|
#undef PN_SCHEDSTAT
|
|
|
|
#undef P_SCHEDSTAT
|
2007-07-09 18:52:00 +02:00
|
|
|
|
|
|
|
{
|
2008-11-16 08:07:15 +01:00
|
|
|
unsigned int this_cpu = raw_smp_processor_id();
|
2007-07-09 18:52:00 +02:00
|
|
|
u64 t0, t1;
|
|
|
|
|
2008-11-16 08:07:15 +01:00
|
|
|
t0 = cpu_clock(this_cpu);
|
|
|
|
t1 = cpu_clock(this_cpu);
|
2020-02-26 12:45:42 +00:00
|
|
|
__PS("clock-delta", t1-t0);
|
2007-07-09 18:52:00 +02:00
|
|
|
}
|
2013-10-07 11:29:30 +01:00
|
|
|
|
|
|
|
sched_show_numa(p, m);
|
2007-07-09 18:52:00 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * proc_sched_set_task - reset the scheduler statistics of a task.
 * @p: task whose statistics are cleared.
 *
 * When CONFIG_SCHEDSTATS is enabled, the whole p->stats structure is
 * zeroed with memset(). Without CONFIG_SCHEDSTATS the function body is
 * empty and the call does nothing.
 */
void proc_sched_set_task(struct task_struct *p)
|
|
|
|
{
|
2007-08-02 17:41:40 +02:00
|
|
|
#ifdef CONFIG_SCHEDSTATS
|
sched: Make struct sched_statistics independent of fair sched class
If we want to use the schedstats facility to trace other sched classes, we
should make it independent of fair sched class. The struct sched_statistics
is the schedular statistics of a task_struct or a task_group. So we can
move it into struct task_struct and struct task_group to achieve the goal.
After the patch, schestats are orgnized as follows,
struct task_struct {
...
struct sched_entity se;
struct sched_rt_entity rt;
struct sched_dl_entity dl;
...
struct sched_statistics stats;
...
};
Regarding the task group, schedstats is only supported for fair group
sched, and a new struct sched_entity_stats is introduced, suggested by
Peter -
struct sched_entity_stats {
struct sched_entity se;
struct sched_statistics stats;
} __no_randomize_layout;
Then with the se in a task_group, we can easily get the stats.
The sched_statistics members may be frequently modified when schedstats is
enabled, in order to avoid impacting on random data which may in the same
cacheline with them, the struct sched_statistics is defined as cacheline
aligned.
As this patch changes the core struct of scheduler, so I verified the
performance it may impact on the scheduler with 'perf bench sched
pipe', suggested by Mel. Below is the result, in which all the values
are in usecs/op.
Before After
kernel.sched_schedstats=0 5.2~5.4 5.2~5.4
kernel.sched_schedstats=1 5.3~5.5 5.3~5.5
[These data is a little difference with the earlier version, that is
because my old test machine is destroyed so I have to use a new
different test machine.]
Almost no impact on the sched performance.
No functional change.
[lkp@intel.com: reported build failure in earlier version]
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mel Gorman <mgorman@suse.de>
Link: https://lore.kernel.org/r/20210905143547.4668-3-laoar.shao@gmail.com
2021-09-05 14:35:41 +00:00
|
|
|
memset(&p->stats, 0, sizeof(p->stats));
|
2007-08-02 17:41:40 +02:00
|
|
|
#endif
|
2007-07-09 18:52:00 +02:00
|
|
|
}
|
2021-04-16 14:29:36 -07:00
|
|
|
|
|
|
|
/*
 * resched_latency_warn - warn that a CPU has had need_resched set for too long.
 * @cpu: CPU number reported in the warning; also used to look up its runqueue.
 * @latency: time reported in the message, printed as nanoseconds.
 *
 * Emits a WARN() whose message contains @cpu, @latency and the
 * ticks_without_resched counter read from cpu_rq(@cpu). The WARN()
 * condition is __ratelimit() on a function-static ratelimit state
 * declared with an interval of 60 * 60 * HZ and a burst of 1, so the
 * warning only fires when the rate limiter allows it (presumably at most
 * once per hour - confirm against the ratelimit API).
 */
void resched_latency_warn(int cpu, u64 latency)
|
|
|
|
{
|
|
|
|
static DEFINE_RATELIMIT_STATE(latency_check_ratelimit, 60 * 60 * HZ, 1);
|
|
|
|
|
|
|
|
WARN(__ratelimit(&latency_check_ratelimit),
|
|
|
|
"sched: CPU %d need_resched set for > %llu ns (%d ticks) "
|
|
|
|
"without schedule\n",
|
|
|
|
cpu, latency, cpu_rq(cpu)->ticks_without_resched);
|
|
|
|
}
|