Mirror of https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git (synced 2025-01-15 02:05:33 +00:00)
Merge branch into tip/master: 'sched/core'
# New commits in sched/core:

7c8cd569ff66 ("docs: Update Schedstat version to 17")
011b3a14dc66 ("sched/stats: Print domain name in /proc/schedstat")
1c055a0f5d3b ("sched: Move sched domain name out of CONFIG_SCHED_DEBUG")
3b2a793ea70f ("sched: Report the different kinds of imbalances in /proc/schedstat")
c3856c9ce6b8 ("sched/fair: Cleanup in migrate_degrades_locality() to improve readability")
a430d99e3490 ("sched/fair: Fix value reported by hot tasks pulled in /proc/schedstat")
ee8118c1f186 ("sched/fair: Update comments after sched_tick() rename.")
af98d8a36a96 ("sched/fair: Fix CPU bandwidth limit bypass during CPU hotplug")
7675361ff9a1 ("sched: deadline: Cleanup goto label in pick_earliest_pushable_dl_task")
7d5265ffcd8b ("rseq: Validate read-only fields under DEBUG_RSEQ config")
2a77e4be12cb ("sched/fair: Untangle NEXT_BUDDY and pick_next_task()")
95d9fed3a2ae ("sched/fair: Mark m*_vruntime() with __maybe_unused")
0429489e0928 ("sched/fair: Fix variable declaration position")
61b82dfb6b7e ("sched/fair: Do not try to migrate delayed dequeue task")
736c55a02c47 ("sched/fair: Rename cfs_rq.nr_running into nr_queued")
43eef7c3a4a6 ("sched/fair: Remove unused cfs_rq.idle_nr_running")
31898e7b87dd ("sched/fair: Rename cfs_rq.idle_h_nr_running into h_nr_idle")
9216582b0bfb ("sched/fair: Removed unsued cfs_rq.h_nr_delayed")
1a49104496d3 ("sched/fair: Use the new cfs_rq.h_nr_runnable")
c2a295bffeaf ("sched/fair: Add new cfs_rq.h_nr_runnable")
7b8a702d9438 ("sched/fair: Rename h_nr_running into h_nr_queued")
c907cd44a108 ("sched: Unify HK_TYPE_{TIMER|TICK|MISC} to HK_TYPE_KERNEL_NOISE")
6010d245ddc9 ("sched/isolation: Consolidate housekeeping cpumasks that are always identical")
1174b9344bc7 ("sched/isolation: Make "isolcpus=nohz" equivalent to "nohz_full"")
ae5c677729e9 ("sched/core: Remove HK_TYPE_SCHED")
a76328d44c7a ("sched/fair: Remove CONFIG_CFS_BANDWIDTH=n definition of cfs_bandwidth_used()")
3a181f20fb4e ("sched/deadline: Consolidate Timer Cancellation")
53916d5fd3c0 ("sched/deadline: Check bandwidth overflow earlier for hotplug")
d4742f6ed7ea ("sched/deadline: Correctly account for allocated bandwidth during hotplug")
41d4200b7103 ("sched/deadline: Restore dl_server bandwidth on non-destructive root domain changes")
59297e2093ce ("sched: add READ_ONCE to task_on_rq_queued")
108ad0999085 ("sched: Don't try to catch up excess steal time.")

Signed-off-by: Ingo Molnar <mingo@kernel.org>
commit 8e94367b8d
@ -2506,7 +2506,9 @@
specified in the flag list (default: domain):

nohz
Disable the tick when a single task runs.
Disable the tick when a single task runs as well as
disabling other kernel noises like having RCU callbacks
offloaded. This is equivalent to the nohz_full parameter.

A residual 1Hz tick is offloaded to workqueues, which you
need to affine to housekeeping through the global
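
As a concrete illustration of the behaviour documented above (the CPU list is an invented example, not taken from the patch), a boot command line might look like:

    isolcpus=nohz,domain,2-7

With this series, the nohz flag behaves like also passing nohz_full=2-7: the tick is stopped on CPUs 2-7 when a single task runs, the other kernel-noise housekeeping work (RCU callback offloading, unbound workqueues, housekeeping kthreads) is kept on the remaining CPUs, and the residual 1Hz tick is offloaded to workqueues that should be affined to those housekeeping CPUs.
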
@ -2,6 +2,12 @@
Scheduler Statistics
====================

Version 17 of schedstats removed 'lb_imbalance' field as it has no
significance anymore and instead added more relevant fields namely
'lb_imbalance_load', 'lb_imbalance_util', 'lb_imbalance_task' and
'lb_imbalance_misfit'. The domain field prints the name of the
corresponding sched domain from this version onwards.

Version 16 of schedstats changed the order of definitions within
'enum cpu_idle_type', which changed the order of [CPU_MAX_IDLE_TYPES]
columns in show_schedstat(). In particular the position of CPU_IDLE
@ -9,7 +15,9 @@ and __CPU_NOT_IDLE changed places. The size of the array is unchanged.

Version 15 of schedstats dropped counters for some sched_yield:
yld_exp_empty, yld_act_empty and yld_both_empty. Otherwise, it is
identical to version 14.
identical to version 14. Details are available at

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/scheduler/sched-stats.txt?id=1e1dbb259c79b

Version 14 of schedstats includes support for sched_domains, which hit the
mainline kernel in 2.6.20 although it is identical to the stats from version
@ -26,7 +34,14 @@ cpus on the machine, while domain0 is the most tightly focused domain,
sometimes balancing only between pairs of cpus. At this time, there
are no architectures which need more than three domain levels. The first
field in the domain stats is a bit map indicating which cpus are affected
by that domain.
by that domain. Details are available at

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/sched-stats.txt?id=b762f3ffb797c

The schedstat documentation is maintained version 10 onwards and is not
updated for version 11 and 12. The details for version 10 are available at

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/sched-stats.txt?id=1da177e4c3f4

These fields are counters, and only increment. Programs which make use
of these will need to start with a baseline observation and then calculate
@ -71,88 +86,97 @@ Domain statistics
-----------------
One of these is produced per domain for each cpu described. (Note that if
CONFIG_SMP is not defined, *no* domains are utilized and these lines
will not appear in the output.)
will not appear in the output. <name> is an extension to the domain field
that prints the name of the corresponding sched domain. It can appear in
schedstat version 17 and above, and requires CONFIG_SCHED_DEBUG.)

domain<N> <cpumask> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
domain<N> <name> <cpumask> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45

The first field is a bit mask indicating what cpus this domain operates over.

The next 24 are a variety of sched_balance_rq() statistics in grouped into types
of idleness (idle, busy, and newly idle):
The next 33 are a variety of sched_balance_rq() statistics in grouped into types
of idleness (busy, idle and newly idle):

1) # of times in this domain sched_balance_rq() was called when the
cpu was idle
2) # of times in this domain sched_balance_rq() checked but found
the load did not require balancing when the cpu was idle
3) # of times in this domain sched_balance_rq() tried to move one or
more tasks and failed, when the cpu was idle
4) sum of imbalances discovered (if any) with each call to
sched_balance_rq() in this domain when the cpu was idle
5) # of times in this domain pull_task() was called when the cpu
was idle
6) # of times in this domain pull_task() was called even though
the target task was cache-hot when idle
7) # of times in this domain sched_balance_rq() was called but did
not find a busier queue while the cpu was idle
8) # of times in this domain a busier queue was found while the
cpu was idle but no busier group was found
9) # of times in this domain sched_balance_rq() was called when the
cpu was busy
10) # of times in this domain sched_balance_rq() checked but found the
2) # of times in this domain sched_balance_rq() checked but found the
load did not require balancing when busy
11) # of times in this domain sched_balance_rq() tried to move one or
3) # of times in this domain sched_balance_rq() tried to move one or
more tasks and failed, when the cpu was busy
12) sum of imbalances discovered (if any) with each call to
sched_balance_rq() in this domain when the cpu was busy
13) # of times in this domain pull_task() was called when busy
14) # of times in this domain pull_task() was called even though the
4) Total imbalance in load when the cpu was busy
5) Total imbalance in utilization when the cpu was busy
6) Total imbalance in number of tasks when the cpu was busy
7) Total imbalance due to misfit tasks when the cpu was busy
8) # of times in this domain pull_task() was called when busy
9) # of times in this domain pull_task() was called even though the
target task was cache-hot when busy
15) # of times in this domain sched_balance_rq() was called but did not
10) # of times in this domain sched_balance_rq() was called but did not
find a busier queue while the cpu was busy
16) # of times in this domain a busier queue was found while the cpu
11) # of times in this domain a busier queue was found while the cpu
was busy but no busier group was found

17) # of times in this domain sched_balance_rq() was called when the
cpu was just becoming idle
18) # of times in this domain sched_balance_rq() checked but found the
12) # of times in this domain sched_balance_rq() was called when the
cpu was idle
13) # of times in this domain sched_balance_rq() checked but found
the load did not require balancing when the cpu was idle
14) # of times in this domain sched_balance_rq() tried to move one or
more tasks and failed, when the cpu was idle
15) Total imbalance in load when the cpu was idle
16) Total imbalance in utilization when the cpu was idle
17) Total imbalance in number of tasks when the cpu was idle
18) Total imbalance due to misfit tasks when the cpu was idle
19) # of times in this domain pull_task() was called when the cpu
was idle
20) # of times in this domain pull_task() was called even though
the target task was cache-hot when idle
21) # of times in this domain sched_balance_rq() was called but did
not find a busier queue while the cpu was idle
22) # of times in this domain a busier queue was found while the
cpu was idle but no busier group was found

23) # of times in this domain sched_balance_rq() was called when the
was just becoming idle
24) # of times in this domain sched_balance_rq() checked but found the
load did not require balancing when the cpu was just becoming idle
19) # of times in this domain sched_balance_rq() tried to move one or more
25) # of times in this domain sched_balance_rq() tried to move one or more
tasks and failed, when the cpu was just becoming idle
20) sum of imbalances discovered (if any) with each call to
sched_balance_rq() in this domain when the cpu was just becoming idle
21) # of times in this domain pull_task() was called when newly idle
22) # of times in this domain pull_task() was called even though the
26) Total imbalance in load when the cpu was just becoming idle
27) Total imbalance in utilization when the cpu was just becoming idle
28) Total imbalance in number of tasks when the cpu was just becoming idle
29) Total imbalance due to misfit tasks when the cpu was just becoming idle
30) # of times in this domain pull_task() was called when newly idle
31) # of times in this domain pull_task() was called even though the
target task was cache-hot when just becoming idle
23) # of times in this domain sched_balance_rq() was called but did not
32) # of times in this domain sched_balance_rq() was called but did not
find a busier queue while the cpu was just becoming idle
24) # of times in this domain a busier queue was found while the cpu
33) # of times in this domain a busier queue was found while the cpu
was just becoming idle but no busier group was found

Next three are active_load_balance() statistics:

25) # of times active_load_balance() was called
26) # of times active_load_balance() tried to move a task and failed
27) # of times active_load_balance() successfully moved a task
34) # of times active_load_balance() was called
35) # of times active_load_balance() tried to move a task and failed
36) # of times active_load_balance() successfully moved a task

Next three are sched_balance_exec() statistics:

28) sbe_cnt is not used
29) sbe_balanced is not used
30) sbe_pushed is not used
37) sbe_cnt is not used
38) sbe_balanced is not used
39) sbe_pushed is not used

Next three are sched_balance_fork() statistics:

31) sbf_cnt is not used
32) sbf_balanced is not used
33) sbf_pushed is not used
40) sbf_cnt is not used
41) sbf_balanced is not used
42) sbf_pushed is not used

Next three are try_to_wake_up() statistics:

34) # of times in this domain try_to_wake_up() awoke a task that
43) # of times in this domain try_to_wake_up() awoke a task that
last ran on a different cpu in this domain
35) # of times in this domain try_to_wake_up() moved a task to the
44) # of times in this domain try_to_wake_up() moved a task to the
waking cpu because it was cache-cold on its own cpu anyway
36) # of times in this domain try_to_wake_up() started passive balancing
45) # of times in this domain try_to_wake_up() started passive balancing

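As a rough illustration of how a monitoring tool might consume the version-17 domain lines documented above, here is a minimal userspace sketch. It is my own example, not part of the patch set; the file name, buffer sizes and field widths are assumptions, and it only relies on the documented layout "domain<N> <name> <cpumask> <45 counters>" on kernels that print the domain name.

    /* schedstat-domains.c -- illustrative sketch, not from this series. */
    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/proc/schedstat", "r");
            char line[4096];

            if (!f) {
                    perror("fopen /proc/schedstat");
                    return 1;
            }
            while (fgets(line, sizeof(line), f)) {
                    int level;
                    char name[64], mask[256];

                    /* Version 17: "domain<N> <name> <cpumask> counters..." */
                    if (sscanf(line, "domain%d %63s %255s", &level, name, mask) == 3)
                            printf("domain%d name=%s cpumask=%s\n", level, name, mask);
            }
            fclose(f);
            return 0;
    }
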
/proc/<pid>/schedstat
---------------------
@ -944,6 +944,7 @@ struct task_struct {
unsigned sched_reset_on_fork:1;
unsigned sched_contributes_to_load:1;
unsigned sched_migrated:1;
unsigned sched_task_hot:1;

/* Force alignment to the next boundary: */
unsigned :0;
@ -1374,6 +1375,15 @@ struct task_struct {
* with respect to preemption.
*/
unsigned long rseq_event_mask;
# ifdef CONFIG_DEBUG_RSEQ
/*
* This is a place holder to save a copy of the rseq fields for
* validation of read-only fields. The struct rseq has a
* variable-length array at the end, so it cannot be used
* directly. Reserve a size large enough for the known fields.
*/
char rseq_fields[sizeof(struct rseq)];
# endif
#endif

#ifdef CONFIG_SCHED_MM_CID
@ -7,16 +7,21 @@
#include <linux/tick.h>

enum hk_type {
HK_TYPE_TIMER,
HK_TYPE_RCU,
HK_TYPE_MISC,
HK_TYPE_SCHED,
HK_TYPE_TICK,
HK_TYPE_DOMAIN,
HK_TYPE_WQ,
HK_TYPE_MANAGED_IRQ,
HK_TYPE_KTHREAD,
HK_TYPE_MAX
HK_TYPE_KERNEL_NOISE,
HK_TYPE_MAX,

/*
* The following housekeeping types are only set by the nohz_full
* boot commandline option. So they can share the same value.
*/
HK_TYPE_TICK = HK_TYPE_KERNEL_NOISE,
HK_TYPE_TIMER = HK_TYPE_KERNEL_NOISE,
HK_TYPE_RCU = HK_TYPE_KERNEL_NOISE,
HK_TYPE_MISC = HK_TYPE_KERNEL_NOISE,
HK_TYPE_WQ = HK_TYPE_KERNEL_NOISE,
HK_TYPE_KTHREAD = HK_TYPE_KERNEL_NOISE
};

#ifdef CONFIG_CPU_ISOLATION
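
To illustrate what the consolidation above means for callers (a hypothetical kernel-side sketch of my own, not code from this series): the legacy symbols are now aliases of HK_TYPE_KERNEL_NOISE, so checks that used to name HK_TYPE_TICK or HK_TYPE_TIMER consult the very same cpumask as the new type. housekeeping_cpu() is the existing helper declared in this header; the wrapper below is invented for illustration.

    /* Hypothetical helper, for illustration only. */
    #include <linux/sched/isolation.h>

    static inline bool cpu_runs_kernel_noise_housekeeping(int cpu)
    {
            /*
             * HK_TYPE_TICK and HK_TYPE_TIMER now evaluate to
             * HK_TYPE_KERNEL_NOISE, so this single check covers what
             * previously required several per-type lookups.
             */
            return housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE);
    }
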
@ -114,7 +114,10 @@ struct sched_domain {
unsigned int lb_count[CPU_MAX_IDLE_TYPES];
unsigned int lb_failed[CPU_MAX_IDLE_TYPES];
unsigned int lb_balanced[CPU_MAX_IDLE_TYPES];
unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES];
unsigned int lb_imbalance_load[CPU_MAX_IDLE_TYPES];
unsigned int lb_imbalance_util[CPU_MAX_IDLE_TYPES];
unsigned int lb_imbalance_task[CPU_MAX_IDLE_TYPES];
unsigned int lb_imbalance_misfit[CPU_MAX_IDLE_TYPES];
unsigned int lb_gained[CPU_MAX_IDLE_TYPES];
unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES];
unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES];
@ -140,9 +143,7 @@ struct sched_domain {
unsigned int ttwu_move_affine;
unsigned int ttwu_move_balance;
#endif
#ifdef CONFIG_SCHED_DEBUG
char *name;
#endif
union {
void *private; /* used during construction */
struct rcu_head rcu; /* used during destruction */
@ -198,18 +199,12 @@ struct sched_domain_topology_level {
int flags;
int numa_level;
struct sd_data data;
#ifdef CONFIG_SCHED_DEBUG
char *name;
#endif
};

extern void __init set_sched_topology(struct sched_domain_topology_level *tl);

#ifdef CONFIG_SCHED_DEBUG
# define SD_INIT_NAME(type) .name = #type
#else
# define SD_INIT_NAME(type)
#endif

#else /* CONFIG_SMP */

@ -13,6 +13,7 @@
|
||||
#include <linux/syscalls.h>
|
||||
#include <linux/rseq.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/ratelimit.h>
|
||||
#include <asm/ptrace.h>
|
||||
|
||||
#define CREATE_TRACE_POINTS
|
||||
@ -25,6 +26,78 @@
|
||||
RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \
|
||||
RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE)
|
||||
|
||||
#ifdef CONFIG_DEBUG_RSEQ
|
||||
static struct rseq *rseq_kernel_fields(struct task_struct *t)
|
||||
{
|
||||
return (struct rseq *) t->rseq_fields;
|
||||
}
|
||||
|
||||
static int rseq_validate_ro_fields(struct task_struct *t)
|
||||
{
|
||||
static DEFINE_RATELIMIT_STATE(_rs,
|
||||
DEFAULT_RATELIMIT_INTERVAL,
|
||||
DEFAULT_RATELIMIT_BURST);
|
||||
u32 cpu_id_start, cpu_id, node_id, mm_cid;
|
||||
struct rseq __user *rseq = t->rseq;
|
||||
|
||||
/*
|
||||
* Validate fields which are required to be read-only by
|
||||
* user-space.
|
||||
*/
|
||||
if (!user_read_access_begin(rseq, t->rseq_len))
|
||||
goto efault;
|
||||
unsafe_get_user(cpu_id_start, &rseq->cpu_id_start, efault_end);
|
||||
unsafe_get_user(cpu_id, &rseq->cpu_id, efault_end);
|
||||
unsafe_get_user(node_id, &rseq->node_id, efault_end);
|
||||
unsafe_get_user(mm_cid, &rseq->mm_cid, efault_end);
|
||||
user_read_access_end();
|
||||
|
||||
if ((cpu_id_start != rseq_kernel_fields(t)->cpu_id_start ||
|
||||
cpu_id != rseq_kernel_fields(t)->cpu_id ||
|
||||
node_id != rseq_kernel_fields(t)->node_id ||
|
||||
mm_cid != rseq_kernel_fields(t)->mm_cid) && __ratelimit(&_rs)) {
|
||||
|
||||
pr_warn("Detected rseq corruption for pid: %d, name: %s\n"
|
||||
"\tcpu_id_start: %u ?= %u\n"
|
||||
"\tcpu_id: %u ?= %u\n"
|
||||
"\tnode_id: %u ?= %u\n"
|
||||
"\tmm_cid: %u ?= %u\n",
|
||||
t->pid, t->comm,
|
||||
cpu_id_start, rseq_kernel_fields(t)->cpu_id_start,
|
||||
cpu_id, rseq_kernel_fields(t)->cpu_id,
|
||||
node_id, rseq_kernel_fields(t)->node_id,
|
||||
mm_cid, rseq_kernel_fields(t)->mm_cid);
|
||||
}
|
||||
|
||||
/* For now, only print a console warning on mismatch. */
|
||||
return 0;
|
||||
|
||||
efault_end:
|
||||
user_read_access_end();
|
||||
efault:
|
||||
return -EFAULT;
|
||||
}
|
||||
|
||||
static void rseq_set_ro_fields(struct task_struct *t, u32 cpu_id_start, u32 cpu_id,
|
||||
u32 node_id, u32 mm_cid)
|
||||
{
|
||||
rseq_kernel_fields(t)->cpu_id_start = cpu_id;
|
||||
rseq_kernel_fields(t)->cpu_id = cpu_id;
|
||||
rseq_kernel_fields(t)->node_id = node_id;
|
||||
rseq_kernel_fields(t)->mm_cid = mm_cid;
|
||||
}
|
||||
#else
|
||||
static int rseq_validate_ro_fields(struct task_struct *t)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void rseq_set_ro_fields(struct task_struct *t, u32 cpu_id_start, u32 cpu_id,
|
||||
u32 node_id, u32 mm_cid)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
*
|
||||
* Restartable sequences are a lightweight interface that allows
|
||||
@ -92,6 +165,11 @@ static int rseq_update_cpu_node_id(struct task_struct *t)
|
||||
u32 node_id = cpu_to_node(cpu_id);
|
||||
u32 mm_cid = task_mm_cid(t);
|
||||
|
||||
/*
|
||||
* Validate read-only rseq fields.
|
||||
*/
|
||||
if (rseq_validate_ro_fields(t))
|
||||
goto efault;
|
||||
WARN_ON_ONCE((int) mm_cid < 0);
|
||||
if (!user_write_access_begin(rseq, t->rseq_len))
|
||||
goto efault;
|
||||
@ -105,6 +183,7 @@ static int rseq_update_cpu_node_id(struct task_struct *t)
|
||||
* t->rseq_len != ORIG_RSEQ_SIZE.
|
||||
*/
|
||||
user_write_access_end();
|
||||
rseq_set_ro_fields(t, cpu_id, cpu_id, node_id, mm_cid);
|
||||
trace_rseq_update(t);
|
||||
return 0;
|
||||
|
||||
@ -119,6 +198,11 @@ static int rseq_reset_rseq_cpu_node_id(struct task_struct *t)
|
||||
u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0,
|
||||
mm_cid = 0;
|
||||
|
||||
/*
|
||||
* Validate read-only rseq fields.
|
||||
*/
|
||||
if (!rseq_validate_ro_fields(t))
|
||||
return -EFAULT;
|
||||
/*
|
||||
* Reset cpu_id_start to its initial state (0).
|
||||
*/
|
||||
@ -141,6 +225,9 @@ static int rseq_reset_rseq_cpu_node_id(struct task_struct *t)
|
||||
*/
|
||||
if (put_user(mm_cid, &t->rseq->mm_cid))
|
||||
return -EFAULT;
|
||||
|
||||
rseq_set_ro_fields(t, cpu_id_start, cpu_id, node_id, mm_cid);
|
||||
|
||||
/*
|
||||
* Additional feature fields added after ORIG_RSEQ_SIZE
|
||||
* need to be conditionally reset only if
|
||||
@ -423,6 +510,17 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
|
||||
current->rseq = rseq;
|
||||
current->rseq_len = rseq_len;
|
||||
current->rseq_sig = sig;
|
||||
#ifdef CONFIG_DEBUG_RSEQ
|
||||
/*
|
||||
* Initialize the in-kernel rseq fields copy for validation of
|
||||
* read-only fields.
|
||||
*/
|
||||
if (get_user(rseq_kernel_fields(current)->cpu_id_start, &rseq->cpu_id_start) ||
|
||||
get_user(rseq_kernel_fields(current)->cpu_id, &rseq->cpu_id) ||
|
||||
get_user(rseq_kernel_fields(current)->node_id, &rseq->node_id) ||
|
||||
get_user(rseq_kernel_fields(current)->mm_cid, &rseq->mm_cid))
|
||||
return -EFAULT;
|
||||
#endif
|
||||
/*
|
||||
* If rseq was previously inactive, and has just been
|
||||
* registered, ensure the cpu_id_start and cpu_id fields
|
||||
|
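
To make concrete which fields the new DEBUG_RSEQ checks treat as read-only, here is a rough userspace-side sketch. It is my own example and not part of the series; it assumes glibc 2.35 or later (which registers rseq and exports __rseq_offset in <sys/rseq.h>) and a compiler that provides __builtin_thread_pointer().

    /* Illustrative only: dump the kernel-owned rseq fields of this thread. */
    #define _GNU_SOURCE
    #include <stdio.h>
    #include <sched.h>
    #include <sys/rseq.h>   /* struct rseq, __rseq_offset (glibc >= 2.35) */

    int main(void)
    {
            struct rseq *rs = (struct rseq *)
                    ((char *)__builtin_thread_pointer() + __rseq_offset);

            /*
             * cpu_id_start, cpu_id, node_id and mm_cid are written only by
             * the kernel; a program that stores to them is exactly what the
             * new rseq_validate_ro_fields() warning is meant to catch.
             */
            printf("cpu_id_start=%u cpu_id=%u node_id=%u (sched_getcpu()=%d)\n",
                   rs->cpu_id_start, rs->cpu_id, rs->node_id, sched_getcpu());
            return 0;
    }
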
@ -766,13 +766,15 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
|
||||
#endif
|
||||
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
|
||||
if (static_key_false((¶virt_steal_rq_enabled))) {
|
||||
steal = paravirt_steal_clock(cpu_of(rq));
|
||||
u64 prev_steal;
|
||||
|
||||
steal = prev_steal = paravirt_steal_clock(cpu_of(rq));
|
||||
steal -= rq->prev_steal_time_rq;
|
||||
|
||||
if (unlikely(steal > delta))
|
||||
steal = delta;
|
||||
|
||||
rq->prev_steal_time_rq += steal;
|
||||
rq->prev_steal_time_rq = prev_steal;
|
||||
delta -= steal;
|
||||
}
|
||||
#endif
|
||||
@ -1168,13 +1170,13 @@ int get_nohz_timer_target(void)
|
||||
struct sched_domain *sd;
|
||||
const struct cpumask *hk_mask;
|
||||
|
||||
if (housekeeping_cpu(cpu, HK_TYPE_TIMER)) {
|
||||
if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE)) {
|
||||
if (!idle_cpu(cpu))
|
||||
return cpu;
|
||||
default_cpu = cpu;
|
||||
}
|
||||
|
||||
hk_mask = housekeeping_cpumask(HK_TYPE_TIMER);
|
||||
hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE);
|
||||
|
||||
guard(rcu)();
|
||||
|
||||
@ -1189,7 +1191,7 @@ int get_nohz_timer_target(void)
|
||||
}
|
||||
|
||||
if (default_cpu == -1)
|
||||
default_cpu = housekeeping_any_cpu(HK_TYPE_TIMER);
|
||||
default_cpu = housekeeping_any_cpu(HK_TYPE_KERNEL_NOISE);
|
||||
|
||||
return default_cpu;
|
||||
}
|
||||
@ -1341,7 +1343,7 @@ bool sched_can_stop_tick(struct rq *rq)
|
||||
if (scx_enabled() && !scx_can_stop_tick(rq))
|
||||
return false;
|
||||
|
||||
if (rq->cfs.h_nr_running > 1)
|
||||
if (rq->cfs.h_nr_queued > 1)
|
||||
return false;
|
||||
|
||||
/*
|
||||
@ -5632,7 +5634,7 @@ void sched_tick(void)
|
||||
unsigned long hw_pressure;
|
||||
u64 resched_latency;
|
||||
|
||||
if (housekeeping_cpu(cpu, HK_TYPE_TICK))
|
||||
if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE))
|
||||
arch_scale_freq_tick();
|
||||
|
||||
sched_clock_tick();
|
||||
@ -5771,7 +5773,7 @@ static void sched_tick_start(int cpu)
|
||||
int os;
|
||||
struct tick_work *twork;
|
||||
|
||||
if (housekeeping_cpu(cpu, HK_TYPE_TICK))
|
||||
if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE))
|
||||
return;
|
||||
|
||||
WARN_ON_ONCE(!tick_work_cpu);
|
||||
@ -5792,7 +5794,7 @@ static void sched_tick_stop(int cpu)
|
||||
struct tick_work *twork;
|
||||
int os;
|
||||
|
||||
if (housekeeping_cpu(cpu, HK_TYPE_TICK))
|
||||
if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE))
|
||||
return;
|
||||
|
||||
WARN_ON_ONCE(!tick_work_cpu);
|
||||
@ -6018,7 +6020,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
|
||||
* opportunity to pull in more work from other CPUs.
|
||||
*/
|
||||
if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) &&
|
||||
rq->nr_running == rq->cfs.h_nr_running)) {
|
||||
rq->nr_running == rq->cfs.h_nr_queued)) {
|
||||
|
||||
p = pick_next_task_fair(rq, prev, rf);
|
||||
if (unlikely(p == RETRY_TASK))
|
||||
@ -8180,19 +8182,14 @@ static void cpuset_cpu_active(void)
|
||||
cpuset_update_active_cpus();
|
||||
}
|
||||
|
||||
static int cpuset_cpu_inactive(unsigned int cpu)
|
||||
static void cpuset_cpu_inactive(unsigned int cpu)
|
||||
{
|
||||
if (!cpuhp_tasks_frozen) {
|
||||
int ret = dl_bw_check_overflow(cpu);
|
||||
|
||||
if (ret)
|
||||
return ret;
|
||||
cpuset_update_active_cpus();
|
||||
} else {
|
||||
num_cpus_frozen++;
|
||||
partition_sched_domains(1, NULL, NULL);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void sched_smt_present_inc(int cpu)
|
||||
@ -8254,6 +8251,11 @@ int sched_cpu_deactivate(unsigned int cpu)
|
||||
struct rq *rq = cpu_rq(cpu);
|
||||
int ret;
|
||||
|
||||
ret = dl_bw_deactivate(cpu);
|
||||
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
/*
|
||||
* Remove CPU from nohz.idle_cpus_mask to prevent participating in
|
||||
* load balancing when not active
|
||||
@ -8299,15 +8301,7 @@ int sched_cpu_deactivate(unsigned int cpu)
|
||||
return 0;
|
||||
|
||||
sched_update_numa(cpu, false);
|
||||
ret = cpuset_cpu_inactive(cpu);
|
||||
if (ret) {
|
||||
sched_smt_present_inc(cpu);
|
||||
sched_set_rq_online(rq, cpu);
|
||||
balance_push_set(cpu, false);
|
||||
set_cpu_active(cpu, true);
|
||||
sched_update_numa(cpu, true);
|
||||
return ret;
|
||||
}
|
||||
cpuset_cpu_inactive(cpu);
|
||||
sched_domains_numa_masks_clear(cpu);
|
||||
return 0;
|
||||
}
|
||||
|
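
In short (my reading of the two hunks above, not wording from the patches): the DEADLINE bandwidth overflow check now runs as dl_bw_deactivate() at the very top of sched_cpu_deactivate(), before SMT accounting and runqueue state are touched, which is why cpuset_cpu_inactive() can become void and the error-unwind path that used to re-activate the CPU is removed.
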
@ -342,6 +342,29 @@ static void dl_rq_change_utilization(struct rq *rq, struct sched_dl_entity *dl_s
|
||||
__add_rq_bw(new_bw, &rq->dl);
|
||||
}
|
||||
|
||||
static __always_inline
|
||||
void cancel_dl_timer(struct sched_dl_entity *dl_se, struct hrtimer *timer)
|
||||
{
|
||||
/*
|
||||
* If the timer callback was running (hrtimer_try_to_cancel == -1),
|
||||
* it will eventually call put_task_struct().
|
||||
*/
|
||||
if (hrtimer_try_to_cancel(timer) == 1 && !dl_server(dl_se))
|
||||
put_task_struct(dl_task_of(dl_se));
|
||||
}
|
||||
|
||||
static __always_inline
|
||||
void cancel_replenish_timer(struct sched_dl_entity *dl_se)
|
||||
{
|
||||
cancel_dl_timer(dl_se, &dl_se->dl_timer);
|
||||
}
|
||||
|
||||
static __always_inline
|
||||
void cancel_inactive_timer(struct sched_dl_entity *dl_se)
|
||||
{
|
||||
cancel_dl_timer(dl_se, &dl_se->inactive_timer);
|
||||
}
|
||||
|
||||
static void dl_change_utilization(struct task_struct *p, u64 new_bw)
|
||||
{
|
||||
WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV);
|
||||
@ -495,10 +518,7 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags)
|
||||
* will not touch the rq's active utilization,
|
||||
* so we are still safe.
|
||||
*/
|
||||
if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) {
|
||||
if (!dl_server(dl_se))
|
||||
put_task_struct(dl_task_of(dl_se));
|
||||
}
|
||||
cancel_inactive_timer(dl_se);
|
||||
} else {
|
||||
/*
|
||||
* Since "dl_non_contending" is not set, the
|
||||
@ -2115,13 +2135,8 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
|
||||
* The replenish timer needs to be canceled. No
|
||||
* problem if it fires concurrently: boosted threads
|
||||
* are ignored in dl_task_timer().
|
||||
*
|
||||
* If the timer callback was running (hrtimer_try_to_cancel == -1),
|
||||
* it will eventually call put_task_struct().
|
||||
*/
|
||||
if (hrtimer_try_to_cancel(&p->dl.dl_timer) == 1 &&
|
||||
!dl_server(&p->dl))
|
||||
put_task_struct(p);
|
||||
cancel_replenish_timer(&p->dl);
|
||||
p->dl.dl_throttled = 0;
|
||||
}
|
||||
} else if (!dl_prio(p->normal_prio)) {
|
||||
@ -2289,8 +2304,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused
|
||||
* will not touch the rq's active utilization,
|
||||
* so we are still safe.
|
||||
*/
|
||||
if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
|
||||
put_task_struct(p);
|
||||
cancel_inactive_timer(&p->dl);
|
||||
}
|
||||
sub_rq_bw(&p->dl, &rq->dl);
|
||||
rq_unlock(rq, &rf);
|
||||
@ -2506,16 +2520,13 @@ static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu
|
||||
return NULL;
|
||||
|
||||
next_node = rb_first_cached(&rq->dl.pushable_dl_tasks_root);
|
||||
|
||||
next_node:
|
||||
if (next_node) {
|
||||
while (next_node) {
|
||||
p = __node_2_pdl(next_node);
|
||||
|
||||
if (task_is_pushable(rq, p, cpu))
|
||||
return p;
|
||||
|
||||
next_node = rb_next(next_node);
|
||||
goto next_node;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
@ -2964,11 +2975,22 @@ void dl_add_task_root_domain(struct task_struct *p)
|
||||
|
||||
void dl_clear_root_domain(struct root_domain *rd)
|
||||
{
|
||||
unsigned long flags;
|
||||
int i;
|
||||
|
||||
raw_spin_lock_irqsave(&rd->dl_bw.lock, flags);
|
||||
guard(raw_spinlock_irqsave)(&rd->dl_bw.lock);
|
||||
rd->dl_bw.total_bw = 0;
|
||||
raw_spin_unlock_irqrestore(&rd->dl_bw.lock, flags);
|
||||
|
||||
/*
|
||||
* dl_server bandwidth is only restored when CPUs are attached to root
|
||||
* domains (after domains are created or CPUs moved back to the
|
||||
* default root doamin).
|
||||
*/
|
||||
for_each_cpu(i, rd->span) {
|
||||
struct sched_dl_entity *dl_se = &cpu_rq(i)->fair_server;
|
||||
|
||||
if (dl_server(dl_se) && cpu_active(i))
|
||||
rd->dl_bw.total_bw += dl_se->dl_bw;
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* CONFIG_SMP */
|
||||
@ -3029,8 +3051,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
|
||||
*/
|
||||
static void switched_to_dl(struct rq *rq, struct task_struct *p)
|
||||
{
|
||||
if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
|
||||
put_task_struct(p);
|
||||
cancel_inactive_timer(&p->dl);
|
||||
|
||||
/*
|
||||
* In case a task is setscheduled to SCHED_DEADLINE we need to keep
|
||||
@ -3453,29 +3474,31 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
|
||||
}
|
||||
|
||||
enum dl_bw_request {
|
||||
dl_bw_req_check_overflow = 0,
|
||||
dl_bw_req_deactivate = 0,
|
||||
dl_bw_req_alloc,
|
||||
dl_bw_req_free
|
||||
};
|
||||
|
||||
static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw)
|
||||
{
|
||||
unsigned long flags;
|
||||
unsigned long flags, cap;
|
||||
struct dl_bw *dl_b;
|
||||
bool overflow = 0;
|
||||
u64 fair_server_bw = 0;
|
||||
|
||||
rcu_read_lock_sched();
|
||||
dl_b = dl_bw_of(cpu);
|
||||
raw_spin_lock_irqsave(&dl_b->lock, flags);
|
||||
|
||||
if (req == dl_bw_req_free) {
|
||||
cap = dl_bw_capacity(cpu);
|
||||
switch (req) {
|
||||
case dl_bw_req_free:
|
||||
__dl_sub(dl_b, dl_bw, dl_bw_cpus(cpu));
|
||||
} else {
|
||||
unsigned long cap = dl_bw_capacity(cpu);
|
||||
|
||||
break;
|
||||
case dl_bw_req_alloc:
|
||||
overflow = __dl_overflow(dl_b, cap, 0, dl_bw);
|
||||
|
||||
if (req == dl_bw_req_alloc && !overflow) {
|
||||
if (!overflow) {
|
||||
/*
|
||||
* We reserve space in the destination
|
||||
* root_domain, as we can't fail after this point.
|
||||
@ -3484,6 +3507,42 @@ static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw)
|
||||
*/
|
||||
__dl_add(dl_b, dl_bw, dl_bw_cpus(cpu));
|
||||
}
|
||||
break;
|
||||
case dl_bw_req_deactivate:
|
||||
/*
|
||||
* cpu is not off yet, but we need to do the math by
|
||||
* considering it off already (i.e., what would happen if we
|
||||
* turn cpu off?).
|
||||
*/
|
||||
cap -= arch_scale_cpu_capacity(cpu);
|
||||
|
||||
/*
|
||||
* cpu is going offline and NORMAL tasks will be moved away
|
||||
* from it. We can thus discount dl_server bandwidth
|
||||
* contribution as it won't need to be servicing tasks after
|
||||
* the cpu is off.
|
||||
*/
|
||||
if (cpu_rq(cpu)->fair_server.dl_server)
|
||||
fair_server_bw = cpu_rq(cpu)->fair_server.dl_bw;
|
||||
|
||||
/*
|
||||
* Not much to check if no DEADLINE bandwidth is present.
|
||||
* dl_servers we can discount, as tasks will be moved out the
|
||||
* offlined CPUs anyway.
|
||||
*/
|
||||
if (dl_b->total_bw - fair_server_bw > 0) {
|
||||
/*
|
||||
* Leaving at least one CPU for DEADLINE tasks seems a
|
||||
* wise thing to do. As said above, cpu is not offline
|
||||
* yet, so account for that.
|
||||
*/
|
||||
if (dl_bw_cpus(cpu) - 1)
|
||||
overflow = __dl_overflow(dl_b, cap, fair_server_bw, 0);
|
||||
else
|
||||
overflow = 1;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
raw_spin_unlock_irqrestore(&dl_b->lock, flags);
|
||||
@ -3492,9 +3551,9 @@ static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw)
|
||||
return overflow ? -EBUSY : 0;
|
||||
}
|
||||
|
||||
int dl_bw_check_overflow(int cpu)
|
||||
int dl_bw_deactivate(int cpu)
|
||||
{
|
||||
return dl_bw_manage(dl_bw_req_check_overflow, cpu, 0);
|
||||
return dl_bw_manage(dl_bw_req_deactivate, cpu, 0);
|
||||
}
|
||||
|
||||
int dl_bw_alloc(int cpu, u64 dl_bw)
|
||||
|
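
To make the dl_bw_req_deactivate branch above concrete (all numbers invented for illustration): consider a root domain of four CPUs, each with capacity 1024, where each CPU's fair dl_server reserves a small share of bandwidth. When one CPU is unplugged, cap is first reduced by that CPU's 1024, the departing CPU's fair-server share (fair_server_bw) is discounted from dl_b->total_bw, and __dl_overflow() then asks whether the remaining DEADLINE reservations still fit on the three CPUs that are left. If they do not, dl_bw_manage() returns -EBUSY and, through dl_bw_deactivate(), sched_cpu_deactivate() aborts the hotplug operation before any scheduler state has been torn down.
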
@ -379,7 +379,7 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (rq->cfs.h_nr_running) {
|
||||
if (rq->cfs.h_nr_queued) {
|
||||
update_rq_clock(rq);
|
||||
dl_server_stop(&rq->fair_server);
|
||||
}
|
||||
@ -392,7 +392,7 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu
|
||||
printk_deferred("Fair server disabled in CPU %d, system may crash due to starvation.\n",
|
||||
cpu_of(rq));
|
||||
|
||||
if (rq->cfs.h_nr_running)
|
||||
if (rq->cfs.h_nr_queued)
|
||||
dl_server_start(&rq->fair_server);
|
||||
}
|
||||
|
||||
@ -843,13 +843,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
|
||||
SPLIT_NS(right_vruntime));
|
||||
spread = right_vruntime - left_vruntime;
|
||||
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread));
|
||||
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
|
||||
SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running);
|
||||
SEQ_printf(m, " .%-30s: %d\n", "h_nr_delayed", cfs_rq->h_nr_delayed);
|
||||
SEQ_printf(m, " .%-30s: %d\n", "idle_nr_running",
|
||||
cfs_rq->idle_nr_running);
|
||||
SEQ_printf(m, " .%-30s: %d\n", "idle_h_nr_running",
|
||||
cfs_rq->idle_h_nr_running);
|
||||
SEQ_printf(m, " .%-30s: %d\n", "nr_queued", cfs_rq->nr_queued);
|
||||
SEQ_printf(m, " .%-30s: %d\n", "h_nr_runnable", cfs_rq->h_nr_runnable);
|
||||
SEQ_printf(m, " .%-30s: %d\n", "h_nr_queued", cfs_rq->h_nr_queued);
|
||||
SEQ_printf(m, " .%-30s: %d\n", "h_nr_idle", cfs_rq->h_nr_idle);
|
||||
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
|
||||
#ifdef CONFIG_SMP
|
||||
SEQ_printf(m, " .%-30s: %lu\n", "load_avg",
|
||||
|
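
As a reading aid for the renamed counters printed above (my summary of the series, not text from the patches): h_nr_queued counts every hierarchically enqueued task, h_nr_runnable excludes entities that remain on the runqueue only because of delayed dequeue, and h_nr_idle counts queued tasks that are treated as idle (SCHED_IDLE tasks and everything below an idle cgroup). At any time h_nr_queued >= h_nr_runnable, and the difference is the number of delayed-dequeue entities under that cfs_rq, which is why the separate h_nr_delayed counter could be removed.
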
@ -523,7 +523,7 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
|
||||
* Scheduling class tree data structure manipulation methods:
|
||||
*/
|
||||
|
||||
static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
|
||||
static inline __maybe_unused u64 max_vruntime(u64 max_vruntime, u64 vruntime)
|
||||
{
|
||||
s64 delta = (s64)(vruntime - max_vruntime);
|
||||
if (delta > 0)
|
||||
@ -532,7 +532,7 @@ static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
|
||||
return max_vruntime;
|
||||
}
|
||||
|
||||
static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
|
||||
static inline __maybe_unused u64 min_vruntime(u64 min_vruntime, u64 vruntime)
|
||||
{
|
||||
s64 delta = (s64)(vruntime - min_vruntime);
|
||||
if (delta < 0)
|
||||
@ -910,7 +910,7 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
|
||||
* We can safely skip eligibility check if there is only one entity
|
||||
* in this cfs_rq, saving some cycles.
|
||||
*/
|
||||
if (cfs_rq->nr_running == 1)
|
||||
if (cfs_rq->nr_queued == 1)
|
||||
return curr && curr->on_rq ? curr : se;
|
||||
|
||||
if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
|
||||
@ -1245,7 +1245,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
|
||||
|
||||
account_cfs_rq_runtime(cfs_rq, delta_exec);
|
||||
|
||||
if (cfs_rq->nr_running == 1)
|
||||
if (cfs_rq->nr_queued == 1)
|
||||
return;
|
||||
|
||||
if (resched || did_preempt_short(cfs_rq, curr)) {
|
||||
@ -2126,7 +2126,7 @@ static void update_numa_stats(struct task_numa_env *env,
|
||||
ns->load += cpu_load(rq);
|
||||
ns->runnable += cpu_runnable(rq);
|
||||
ns->util += cpu_util_cfs(cpu);
|
||||
ns->nr_running += rq->cfs.h_nr_running;
|
||||
ns->nr_running += rq->cfs.h_nr_runnable;
|
||||
ns->compute_capacity += capacity_of(cpu);
|
||||
|
||||
if (find_idle && idle_core < 0 && !rq->nr_running && idle_cpu(cpu)) {
|
||||
@ -3677,9 +3677,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
||||
list_add(&se->group_node, &rq->cfs_tasks);
|
||||
}
|
||||
#endif
|
||||
cfs_rq->nr_running++;
|
||||
if (se_is_idle(se))
|
||||
cfs_rq->idle_nr_running++;
|
||||
cfs_rq->nr_queued++;
|
||||
}
|
||||
|
||||
static void
|
||||
@ -3692,9 +3690,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
||||
list_del_init(&se->group_node);
|
||||
}
|
||||
#endif
|
||||
cfs_rq->nr_running--;
|
||||
if (se_is_idle(se))
|
||||
cfs_rq->idle_nr_running--;
|
||||
cfs_rq->nr_queued--;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -5128,7 +5124,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
|
||||
|
||||
static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
|
||||
{
|
||||
return !cfs_rq->nr_running;
|
||||
return !cfs_rq->nr_queued;
|
||||
}
|
||||
|
||||
#define UPDATE_TG 0x0
|
||||
@ -5184,7 +5180,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
||||
*
|
||||
* EEVDF: placement strategy #1 / #2
|
||||
*/
|
||||
if (sched_feat(PLACE_LAG) && cfs_rq->nr_running && se->vlag) {
|
||||
if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) {
|
||||
struct sched_entity *curr = cfs_rq->curr;
|
||||
unsigned long load;
|
||||
|
||||
@ -5277,8 +5273,6 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
||||
static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
|
||||
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
|
||||
|
||||
static inline bool cfs_bandwidth_used(void);
|
||||
|
||||
static void
|
||||
requeue_delayed_entity(struct sched_entity *se);
|
||||
|
||||
@ -5300,7 +5294,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
||||
* When enqueuing a sched_entity, we must:
|
||||
* - Update loads to have both entity and cfs_rq synced with now.
|
||||
* - For group_entity, update its runnable_weight to reflect the new
|
||||
* h_nr_running of its group cfs_rq.
|
||||
* h_nr_runnable of its group cfs_rq.
|
||||
* - For group_entity, update its weight to reflect the new share of
|
||||
* its group cfs_rq
|
||||
* - Add its new weight to cfs_rq->load.weight
|
||||
@ -5333,7 +5327,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
||||
__enqueue_entity(cfs_rq, se);
|
||||
se->on_rq = 1;
|
||||
|
||||
if (cfs_rq->nr_running == 1) {
|
||||
if (cfs_rq->nr_queued == 1) {
|
||||
check_enqueue_throttle(cfs_rq);
|
||||
if (!throttled_hierarchy(cfs_rq)) {
|
||||
list_add_leaf_cfs_rq(cfs_rq);
|
||||
@ -5375,7 +5369,7 @@ static void set_delayed(struct sched_entity *se)
|
||||
for_each_sched_entity(se) {
|
||||
struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
||||
|
||||
cfs_rq->h_nr_delayed++;
|
||||
cfs_rq->h_nr_runnable--;
|
||||
if (cfs_rq_throttled(cfs_rq))
|
||||
break;
|
||||
}
|
||||
@ -5387,7 +5381,7 @@ static void clear_delayed(struct sched_entity *se)
|
||||
for_each_sched_entity(se) {
|
||||
struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
||||
|
||||
cfs_rq->h_nr_delayed--;
|
||||
cfs_rq->h_nr_runnable++;
|
||||
if (cfs_rq_throttled(cfs_rq))
|
||||
break;
|
||||
}
|
||||
@ -5404,6 +5398,7 @@ static bool
|
||||
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
||||
{
|
||||
bool sleep = flags & DEQUEUE_SLEEP;
|
||||
int action = UPDATE_TG;
|
||||
|
||||
update_curr(cfs_rq);
|
||||
clear_buddies(cfs_rq, se);
|
||||
@ -5429,7 +5424,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
||||
}
|
||||
}
|
||||
|
||||
int action = UPDATE_TG;
|
||||
if (entity_is_task(se) && task_on_rq_migrating(task_of(se)))
|
||||
action |= DO_DETACH;
|
||||
|
||||
@ -5437,7 +5431,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
||||
* When dequeuing a sched_entity, we must:
|
||||
* - Update loads to have both entity and cfs_rq synced with now.
|
||||
* - For group_entity, update its runnable_weight to reflect the new
|
||||
* h_nr_running of its group cfs_rq.
|
||||
* h_nr_runnable of its group cfs_rq.
|
||||
* - Subtract its previous weight from cfs_rq->load.weight.
|
||||
* - For group entity, update its weight to reflect the new share
|
||||
* of its group cfs_rq.
|
||||
@ -5475,7 +5469,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
||||
if (flags & DEQUEUE_DELAYED)
|
||||
finish_delayed_dequeue_entity(se);
|
||||
|
||||
if (cfs_rq->nr_running == 0)
|
||||
if (cfs_rq->nr_queued == 0)
|
||||
update_idle_cfs_rq_clock_pelt(cfs_rq);
|
||||
|
||||
return true;
|
||||
@ -5537,17 +5531,19 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags);
|
||||
static struct sched_entity *
|
||||
pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
|
||||
{
|
||||
struct sched_entity *se;
|
||||
|
||||
/*
|
||||
* Enabling NEXT_BUDDY will affect latency but not fairness.
|
||||
* Picking the ->next buddy will affect latency but not fairness.
|
||||
*/
|
||||
if (sched_feat(NEXT_BUDDY) &&
|
||||
if (sched_feat(PICK_BUDDY) &&
|
||||
cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) {
|
||||
/* ->next will never be delayed */
|
||||
SCHED_WARN_ON(cfs_rq->next->sched_delayed);
|
||||
return cfs_rq->next;
|
||||
}
|
||||
|
||||
struct sched_entity *se = pick_eevdf(cfs_rq);
|
||||
se = pick_eevdf(cfs_rq);
|
||||
if (se->sched_delayed) {
|
||||
dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
|
||||
/*
|
||||
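
A brief note on the pick_next_entity() hunk above (my paraphrase of commit 2a77e4be12cb): the wakeup path still sets cfs_rq->next under the NEXT_BUDDY feature, while honouring that buddy at pick time is now gated by the separate PICK_BUDDY feature shown here, so the two behaviours can be toggled independently.
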
@ -5823,7 +5819,7 @@ static int tg_throttle_down(struct task_group *tg, void *data)
|
||||
list_del_leaf_cfs_rq(cfs_rq);
|
||||
|
||||
SCHED_WARN_ON(cfs_rq->throttled_clock_self);
|
||||
if (cfs_rq->nr_running)
|
||||
if (cfs_rq->nr_queued)
|
||||
cfs_rq->throttled_clock_self = rq_clock(rq);
|
||||
}
|
||||
cfs_rq->throttle_count++;
|
||||
@ -5836,8 +5832,8 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
|
||||
struct rq *rq = rq_of(cfs_rq);
|
||||
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
|
||||
struct sched_entity *se;
|
||||
long task_delta, idle_task_delta, delayed_delta, dequeue = 1;
|
||||
long rq_h_nr_running = rq->cfs.h_nr_running;
|
||||
long queued_delta, runnable_delta, idle_delta, dequeue = 1;
|
||||
long rq_h_nr_queued = rq->cfs.h_nr_queued;
|
||||
|
||||
raw_spin_lock(&cfs_b->lock);
|
||||
/* This will start the period timer if necessary */
|
||||
@ -5867,9 +5863,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
|
||||
walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
|
||||
rcu_read_unlock();
|
||||
|
||||
task_delta = cfs_rq->h_nr_running;
|
||||
idle_task_delta = cfs_rq->idle_h_nr_running;
|
||||
delayed_delta = cfs_rq->h_nr_delayed;
|
||||
queued_delta = cfs_rq->h_nr_queued;
|
||||
runnable_delta = cfs_rq->h_nr_runnable;
|
||||
idle_delta = cfs_rq->h_nr_idle;
|
||||
for_each_sched_entity(se) {
|
||||
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
|
||||
int flags;
|
||||
@ -5889,11 +5885,11 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
|
||||
dequeue_entity(qcfs_rq, se, flags);
|
||||
|
||||
if (cfs_rq_is_idle(group_cfs_rq(se)))
|
||||
idle_task_delta = cfs_rq->h_nr_running;
|
||||
idle_delta = cfs_rq->h_nr_queued;
|
||||
|
||||
qcfs_rq->h_nr_running -= task_delta;
|
||||
qcfs_rq->idle_h_nr_running -= idle_task_delta;
|
||||
qcfs_rq->h_nr_delayed -= delayed_delta;
|
||||
qcfs_rq->h_nr_queued -= queued_delta;
|
||||
qcfs_rq->h_nr_runnable -= runnable_delta;
|
||||
qcfs_rq->h_nr_idle -= idle_delta;
|
||||
|
||||
if (qcfs_rq->load.weight) {
|
||||
/* Avoid re-evaluating load for this entity: */
|
||||
@ -5912,18 +5908,18 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
|
||||
se_update_runnable(se);
|
||||
|
||||
if (cfs_rq_is_idle(group_cfs_rq(se)))
|
||||
idle_task_delta = cfs_rq->h_nr_running;
|
||||
idle_delta = cfs_rq->h_nr_queued;
|
||||
|
||||
qcfs_rq->h_nr_running -= task_delta;
|
||||
qcfs_rq->idle_h_nr_running -= idle_task_delta;
|
||||
qcfs_rq->h_nr_delayed -= delayed_delta;
|
||||
qcfs_rq->h_nr_queued -= queued_delta;
|
||||
qcfs_rq->h_nr_runnable -= runnable_delta;
|
||||
qcfs_rq->h_nr_idle -= idle_delta;
|
||||
}
|
||||
|
||||
/* At this point se is NULL and we are at root level*/
|
||||
sub_nr_running(rq, task_delta);
|
||||
sub_nr_running(rq, queued_delta);
|
||||
|
||||
/* Stop the fair server if throttling resulted in no runnable tasks */
|
||||
if (rq_h_nr_running && !rq->cfs.h_nr_running)
|
||||
if (rq_h_nr_queued && !rq->cfs.h_nr_queued)
|
||||
dl_server_stop(&rq->fair_server);
|
||||
done:
|
||||
/*
|
||||
@ -5932,7 +5928,7 @@ done:
|
||||
*/
|
||||
cfs_rq->throttled = 1;
|
||||
SCHED_WARN_ON(cfs_rq->throttled_clock);
|
||||
if (cfs_rq->nr_running)
|
||||
if (cfs_rq->nr_queued)
|
||||
cfs_rq->throttled_clock = rq_clock(rq);
|
||||
return true;
|
||||
}
|
||||
@ -5942,8 +5938,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
|
||||
struct rq *rq = rq_of(cfs_rq);
|
||||
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
|
||||
struct sched_entity *se;
|
||||
long task_delta, idle_task_delta, delayed_delta;
|
||||
long rq_h_nr_running = rq->cfs.h_nr_running;
|
||||
long queued_delta, runnable_delta, idle_delta;
|
||||
long rq_h_nr_queued = rq->cfs.h_nr_queued;
|
||||
|
||||
se = cfs_rq->tg->se[cpu_of(rq)];
|
||||
|
||||
@ -5976,9 +5972,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
|
||||
goto unthrottle_throttle;
|
||||
}
|
||||
|
||||
task_delta = cfs_rq->h_nr_running;
|
||||
idle_task_delta = cfs_rq->idle_h_nr_running;
|
||||
delayed_delta = cfs_rq->h_nr_delayed;
|
||||
queued_delta = cfs_rq->h_nr_queued;
|
||||
runnable_delta = cfs_rq->h_nr_runnable;
|
||||
idle_delta = cfs_rq->h_nr_idle;
|
||||
for_each_sched_entity(se) {
|
||||
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
|
||||
|
||||
@ -5992,11 +5988,11 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
|
||||
enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
|
||||
|
||||
if (cfs_rq_is_idle(group_cfs_rq(se)))
|
||||
idle_task_delta = cfs_rq->h_nr_running;
|
||||
idle_delta = cfs_rq->h_nr_queued;
|
||||
|
||||
qcfs_rq->h_nr_running += task_delta;
|
||||
qcfs_rq->idle_h_nr_running += idle_task_delta;
|
||||
qcfs_rq->h_nr_delayed += delayed_delta;
|
||||
qcfs_rq->h_nr_queued += queued_delta;
|
||||
qcfs_rq->h_nr_runnable += runnable_delta;
|
||||
qcfs_rq->h_nr_idle += idle_delta;
|
||||
|
||||
/* end evaluation on encountering a throttled cfs_rq */
|
||||
if (cfs_rq_throttled(qcfs_rq))
|
||||
@ -6010,11 +6006,11 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
|
||||
se_update_runnable(se);
|
||||
|
||||
if (cfs_rq_is_idle(group_cfs_rq(se)))
|
||||
idle_task_delta = cfs_rq->h_nr_running;
|
||||
idle_delta = cfs_rq->h_nr_queued;
|
||||
|
||||
qcfs_rq->h_nr_running += task_delta;
|
||||
qcfs_rq->idle_h_nr_running += idle_task_delta;
|
||||
qcfs_rq->h_nr_delayed += delayed_delta;
|
||||
qcfs_rq->h_nr_queued += queued_delta;
|
||||
qcfs_rq->h_nr_runnable += runnable_delta;
|
||||
qcfs_rq->h_nr_idle += idle_delta;
|
||||
|
||||
/* end evaluation on encountering a throttled cfs_rq */
|
||||
if (cfs_rq_throttled(qcfs_rq))
|
||||
@ -6022,17 +6018,17 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
|
||||
}
|
||||
|
||||
/* Start the fair server if un-throttling resulted in new runnable tasks */
|
||||
if (!rq_h_nr_running && rq->cfs.h_nr_running)
|
||||
if (!rq_h_nr_queued && rq->cfs.h_nr_queued)
|
||||
dl_server_start(&rq->fair_server);
|
||||
|
||||
/* At this point se is NULL and we are at root level*/
|
||||
add_nr_running(rq, task_delta);
|
||||
add_nr_running(rq, queued_delta);
|
||||
|
||||
unthrottle_throttle:
|
||||
assert_list_leaf_cfs_rq(rq);
|
||||
|
||||
/* Determine whether we need to wake up potentially idle CPU: */
|
||||
if (rq->curr == rq->idle && rq->cfs.nr_running)
|
||||
if (rq->curr == rq->idle && rq->cfs.nr_queued)
|
||||
resched_curr(rq);
|
||||
}
|
||||
|
||||
@ -6333,7 +6329,7 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
|
||||
if (!cfs_bandwidth_used())
|
||||
return;
|
||||
|
||||
if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
|
||||
if (!cfs_rq->runtime_enabled || cfs_rq->nr_queued)
|
||||
return;
|
||||
|
||||
__return_cfs_rq_runtime(cfs_rq);
|
||||
@ -6604,6 +6600,10 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
|
||||
|
||||
lockdep_assert_rq_held(rq);
|
||||
|
||||
// Do not unthrottle for an active CPU
|
||||
if (cpumask_test_cpu(cpu_of(rq), cpu_active_mask))
|
||||
return;
|
||||
|
||||
/*
|
||||
* The rq clock has already been updated in the
|
||||
* set_rq_offline(), so we should skip updating
|
||||
@ -6618,18 +6618,20 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
|
||||
if (!cfs_rq->runtime_enabled)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* clock_task is not advancing so we just need to make sure
|
||||
* there's some valid quota amount
|
||||
*/
|
||||
cfs_rq->runtime_remaining = 1;
|
||||
/*
|
||||
* Offline rq is schedulable till CPU is completely disabled
|
||||
* in take_cpu_down(), so we prevent new cfs throttling here.
|
||||
*/
|
||||
cfs_rq->runtime_enabled = 0;
|
||||
|
||||
if (cfs_rq_throttled(cfs_rq))
|
||||
if (!cfs_rq_throttled(cfs_rq))
|
||||
continue;
|
||||
|
||||
/*
|
||||
* clock_task is not advancing so we just need to make sure
|
||||
* there's some valid quota amount
|
||||
*/
|
||||
cfs_rq->runtime_remaining = 1;
|
||||
unthrottle_cfs_rq(cfs_rq);
|
||||
}
|
||||
rcu_read_unlock();
|
||||
@ -6679,11 +6681,6 @@ static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p)
|
||||
|
||||
#else /* CONFIG_CFS_BANDWIDTH */
|
||||
|
||||
static inline bool cfs_bandwidth_used(void)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
|
||||
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
|
||||
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
|
||||
@ -6741,7 +6738,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
|
||||
|
||||
SCHED_WARN_ON(task_rq(p) != rq);
|
||||
|
||||
if (rq->cfs.h_nr_running > 1) {
|
||||
if (rq->cfs.h_nr_queued > 1) {
|
||||
u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
|
||||
u64 slice = se->slice;
|
||||
s64 delta = slice - ran;
|
||||
@ -6829,7 +6826,7 @@ static inline void check_update_overutilized_status(struct rq *rq) { }
|
||||
/* Runqueue only has SCHED_IDLE tasks enqueued */
|
||||
static int sched_idle_rq(struct rq *rq)
|
||||
{
|
||||
return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
|
||||
return unlikely(rq->nr_running == rq->cfs.h_nr_idle &&
|
||||
rq->nr_running);
|
||||
}
|
||||
|
||||
@ -6856,14 +6853,14 @@ requeue_delayed_entity(struct sched_entity *se)
|
||||
if (sched_feat(DELAY_ZERO)) {
|
||||
update_entity_lag(cfs_rq, se);
|
||||
if (se->vlag > 0) {
|
||||
cfs_rq->nr_running--;
|
||||
cfs_rq->nr_queued--;
|
||||
if (se != cfs_rq->curr)
|
||||
__dequeue_entity(cfs_rq, se);
|
||||
se->vlag = 0;
|
||||
place_entity(cfs_rq, se, 0);
|
||||
if (se != cfs_rq->curr)
|
||||
__enqueue_entity(cfs_rq, se);
|
||||
cfs_rq->nr_running++;
|
||||
cfs_rq->nr_queued++;
|
||||
}
|
||||
}
|
||||
|
||||
@ -6881,10 +6878,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
|
||||
{
|
||||
struct cfs_rq *cfs_rq;
|
||||
struct sched_entity *se = &p->se;
|
||||
int idle_h_nr_running = task_has_idle_policy(p);
|
||||
int h_nr_delayed = 0;
|
||||
int h_nr_idle = task_has_idle_policy(p);
|
||||
int h_nr_runnable = 1;
|
||||
int task_new = !(flags & ENQUEUE_WAKEUP);
|
||||
int rq_h_nr_running = rq->cfs.h_nr_running;
|
||||
int rq_h_nr_queued = rq->cfs.h_nr_queued;
|
||||
u64 slice = 0;
|
||||
|
||||
/*
|
||||
@ -6909,8 +6906,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
|
||||
if (p->in_iowait)
|
||||
cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
|
||||
|
||||
if (task_new)
|
||||
h_nr_delayed = !!se->sched_delayed;
|
||||
if (task_new && se->sched_delayed)
+ h_nr_runnable = 0;

for_each_sched_entity(se) {
if (se->on_rq) {
@@ -6932,12 +6929,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
enqueue_entity(cfs_rq, se, flags);
slice = cfs_rq_min_slice(cfs_rq);

- cfs_rq->h_nr_running++;
- cfs_rq->idle_h_nr_running += idle_h_nr_running;
- cfs_rq->h_nr_delayed += h_nr_delayed;
+ cfs_rq->h_nr_runnable += h_nr_runnable;
+ cfs_rq->h_nr_queued++;
+ cfs_rq->h_nr_idle += h_nr_idle;

if (cfs_rq_is_idle(cfs_rq))
- idle_h_nr_running = 1;
+ h_nr_idle = 1;

/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
@@ -6956,19 +6953,19 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
se->slice = slice;
slice = cfs_rq_min_slice(cfs_rq);

- cfs_rq->h_nr_running++;
- cfs_rq->idle_h_nr_running += idle_h_nr_running;
- cfs_rq->h_nr_delayed += h_nr_delayed;
+ cfs_rq->h_nr_runnable += h_nr_runnable;
+ cfs_rq->h_nr_queued++;
+ cfs_rq->h_nr_idle += h_nr_idle;

if (cfs_rq_is_idle(cfs_rq))
- idle_h_nr_running = 1;
+ h_nr_idle = 1;

/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
goto enqueue_throttle;
}

- if (!rq_h_nr_running && rq->cfs.h_nr_running) {
+ if (!rq_h_nr_queued && rq->cfs.h_nr_queued) {
/* Account for idle runtime */
if (!rq->nr_running)
dl_server_update_idle_time(rq, rq->curr);
@@ -7015,22 +7012,22 @@ static void set_next_buddy(struct sched_entity *se);
static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
{
bool was_sched_idle = sched_idle_rq(rq);
- int rq_h_nr_running = rq->cfs.h_nr_running;
+ int rq_h_nr_queued = rq->cfs.h_nr_queued;
bool task_sleep = flags & DEQUEUE_SLEEP;
bool task_delayed = flags & DEQUEUE_DELAYED;
struct task_struct *p = NULL;
- int idle_h_nr_running = 0;
- int h_nr_running = 0;
- int h_nr_delayed = 0;
+ int h_nr_idle = 0;
+ int h_nr_queued = 0;
+ int h_nr_runnable = 0;
struct cfs_rq *cfs_rq;
u64 slice = 0;

if (entity_is_task(se)) {
p = task_of(se);
- h_nr_running = 1;
- idle_h_nr_running = task_has_idle_policy(p);
- if (!task_sleep && !task_delayed)
- h_nr_delayed = !!se->sched_delayed;
+ h_nr_queued = 1;
+ h_nr_idle = task_has_idle_policy(p);
+ if (task_sleep || task_delayed || !se->sched_delayed)
+ h_nr_runnable = 1;
} else {
cfs_rq = group_cfs_rq(se);
slice = cfs_rq_min_slice(cfs_rq);
@@ -7046,12 +7043,12 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
break;
}

- cfs_rq->h_nr_running -= h_nr_running;
- cfs_rq->idle_h_nr_running -= idle_h_nr_running;
- cfs_rq->h_nr_delayed -= h_nr_delayed;
+ cfs_rq->h_nr_runnable -= h_nr_runnable;
+ cfs_rq->h_nr_queued -= h_nr_queued;
+ cfs_rq->h_nr_idle -= h_nr_idle;

if (cfs_rq_is_idle(cfs_rq))
- idle_h_nr_running = h_nr_running;
+ h_nr_idle = h_nr_queued;

/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
@@ -7085,21 +7082,21 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
se->slice = slice;
slice = cfs_rq_min_slice(cfs_rq);

- cfs_rq->h_nr_running -= h_nr_running;
- cfs_rq->idle_h_nr_running -= idle_h_nr_running;
- cfs_rq->h_nr_delayed -= h_nr_delayed;
+ cfs_rq->h_nr_runnable -= h_nr_runnable;
+ cfs_rq->h_nr_queued -= h_nr_queued;
+ cfs_rq->h_nr_idle -= h_nr_idle;

if (cfs_rq_is_idle(cfs_rq))
- idle_h_nr_running = h_nr_running;
+ h_nr_idle = h_nr_queued;

/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
return 0;
}

- sub_nr_running(rq, h_nr_running);
+ sub_nr_running(rq, h_nr_queued);

- if (rq_h_nr_running && !rq->cfs.h_nr_running)
+ if (rq_h_nr_queued && !rq->cfs.h_nr_queued)
dl_server_stop(&rq->fair_server);

/* balance early to pull high priority tasks */
@@ -8788,7 +8785,7 @@ static struct task_struct *pick_task_fair(struct rq *rq)

again:
cfs_rq = &rq->cfs;
- if (!cfs_rq->nr_running)
+ if (!cfs_rq->nr_queued)
return NULL;

do {
@@ -8905,7 +8902,7 @@ static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_stru

static bool fair_server_has_tasks(struct sched_dl_entity *dl_se)
{
- return !!dl_se->rq->cfs.nr_running;
+ return !!dl_se->rq->cfs.nr_queued;
}

static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se)
@@ -9236,43 +9233,43 @@ static int task_hot(struct task_struct *p, struct lb_env *env)

#ifdef CONFIG_NUMA_BALANCING
/*
- * Returns 1, if task migration degrades locality
- * Returns 0, if task migration improves locality i.e migration preferred.
- * Returns -1, if task migration is not affected by locality.
+ * Returns a positive value, if task migration degrades locality.
+ * Returns 0, if task migration is not affected by locality.
+ * Returns a negative value, if task migration improves locality i.e migration preferred.
 */
- static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
+ static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
{
struct numa_group *numa_group = rcu_dereference(p->numa_group);
unsigned long src_weight, dst_weight;
int src_nid, dst_nid, dist;

if (!static_branch_likely(&sched_numa_balancing))
- return -1;
+ return 0;

if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
- return -1;
+ return 0;

src_nid = cpu_to_node(env->src_cpu);
dst_nid = cpu_to_node(env->dst_cpu);

if (src_nid == dst_nid)
- return -1;
+ return 0;

/* Migrating away from the preferred node is always bad. */
if (src_nid == p->numa_preferred_nid) {
if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
return 1;
else
- return -1;
+ return 0;
}

/* Encourage migration to the preferred node. */
if (dst_nid == p->numa_preferred_nid)
- return 0;
+ return -1;

/* Leaving a core idle is often worse than degrading locality. */
if (env->idle == CPU_IDLE)
- return -1;
+ return 0;

dist = node_distance(src_nid, dst_nid);
if (numa_group) {
@@ -9283,14 +9280,14 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
dst_weight = task_weight(p, dst_nid, dist);
}

- return dst_weight < src_weight;
+ return src_weight - dst_weight;
}

#else
- static inline int migrate_degrades_locality(struct task_struct *p,
+ static inline long migrate_degrades_locality(struct task_struct *p,
struct lb_env *env)
{
- return -1;
+ return 0;
}
#endif

@@ -9300,17 +9297,23 @@ static inline int migrate_degrades_locality(struct task_struct *p,
static
int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
- int tsk_cache_hot;
+ long degrades, hot;

lockdep_assert_rq_held(env->src_rq);
+ if (p->sched_task_hot)
+ p->sched_task_hot = 0;

/*
* We do not migrate tasks that are:
- * 1) throttled_lb_pair, or
- * 2) cannot be migrated to this CPU due to cpus_ptr, or
- * 3) running (obviously), or
- * 4) are cache-hot on their current CPU.
+ * 1) delayed dequeued unless we migrate load, or
+ * 2) throttled_lb_pair, or
+ * 3) cannot be migrated to this CPU due to cpus_ptr, or
+ * 4) running (obviously), or
+ * 5) are cache-hot on their current CPU.
 */
+ if ((p->se.sched_delayed) && (env->migration_type != migrate_load))
+ return 0;

if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
return 0;

@@ -9369,16 +9372,15 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
if (env->flags & LBF_ACTIVE_LB)
return 1;

- tsk_cache_hot = migrate_degrades_locality(p, env);
- if (tsk_cache_hot == -1)
- tsk_cache_hot = task_hot(p, env);
+ degrades = migrate_degrades_locality(p, env);
+ if (!degrades)
+ hot = task_hot(p, env);
+ else
+ hot = degrades > 0;

- if (tsk_cache_hot <= 0 ||
- env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
- if (tsk_cache_hot == 1) {
- schedstat_inc(env->sd->lb_hot_gained[env->idle]);
- schedstat_inc(p->stats.nr_forced_migrations);
- }
+ if (!hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
+ if (hot)
+ p->sched_task_hot = 1;
return 1;
}

@@ -9393,6 +9395,12 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
{
lockdep_assert_rq_held(env->src_rq);

+ if (p->sched_task_hot) {
+ p->sched_task_hot = 0;
+ schedstat_inc(env->sd->lb_hot_gained[env->idle]);
+ schedstat_inc(p->stats.nr_forced_migrations);
+ }

deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
set_task_cpu(p, env->dst_cpu);
}
@@ -9553,6 +9561,9 @@ static int detach_tasks(struct lb_env *env)

continue;
next:
+ if (p->sched_task_hot)
+ schedstat_inc(p->stats.nr_failed_migrations_hot);

list_move(&p->se.group_node, tasks);
}

@@ -9695,7 +9706,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done)
if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
update_tg_load_avg(cfs_rq);

- if (cfs_rq->nr_running == 0)
+ if (cfs_rq->nr_queued == 0)
update_idle_cfs_rq_clock_pelt(cfs_rq);

if (cfs_rq == &rq->cfs)
@@ -10227,7 +10238,7 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
* When there is more than 1 task, the group_overloaded case already
* takes care of cpu with reduced capacity
*/
- if (rq->cfs.h_nr_running != 1)
+ if (rq->cfs.h_nr_runnable != 1)
return false;

return check_cpu_capacity(rq, sd);
@@ -10262,7 +10273,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->group_load += load;
sgs->group_util += cpu_util_cfs(i);
sgs->group_runnable += cpu_runnable(rq);
- sgs->sum_h_nr_running += rq->cfs.h_nr_running;
+ sgs->sum_h_nr_running += rq->cfs.h_nr_runnable;

nr_running = rq->nr_running;
sgs->sum_nr_running += nr_running;
@@ -10577,7 +10588,7 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
sgs->group_util += cpu_util_without(i, p);
sgs->group_runnable += cpu_runnable_without(rq, p);
local = task_running_on_cpu(i, p);
- sgs->sum_h_nr_running += rq->cfs.h_nr_running - local;
+ sgs->sum_h_nr_running += rq->cfs.h_nr_runnable - local;

nr_running = rq->nr_running - local;
sgs->sum_nr_running += nr_running;
@@ -11359,7 +11370,7 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env,
if (rt > env->fbq_type)
continue;

- nr_running = rq->cfs.h_nr_running;
+ nr_running = rq->cfs.h_nr_runnable;
if (!nr_running)
continue;

@@ -11518,7 +11529,7 @@ static int need_active_balance(struct lb_env *env)
* available on dst_cpu.
*/
if (env->idle &&
- (env->src_rq->cfs.h_nr_running == 1)) {
+ (env->src_rq->cfs.h_nr_runnable == 1)) {
if ((check_cpu_capacity(env->src_rq, sd)) &&
(capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
return 1;
@@ -11598,6 +11609,28 @@ static int should_we_balance(struct lb_env *env)
return group_balance_cpu(sg) == env->dst_cpu;
}

+ static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd,
+ enum cpu_idle_type idle)
+ {
+ if (!schedstat_enabled())
+ return;
+
+ switch (env->migration_type) {
+ case migrate_load:
+ __schedstat_add(sd->lb_imbalance_load[idle], env->imbalance);
+ break;
+ case migrate_util:
+ __schedstat_add(sd->lb_imbalance_util[idle], env->imbalance);
+ break;
+ case migrate_task:
+ __schedstat_add(sd->lb_imbalance_task[idle], env->imbalance);
+ break;
+ case migrate_misfit:
+ __schedstat_add(sd->lb_imbalance_misfit[idle], env->imbalance);
+ break;
+ }
+ }

/*
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
@@ -11648,7 +11681,7 @@ redo:

WARN_ON_ONCE(busiest == env.dst_rq);

- schedstat_add(sd->lb_imbalance[idle], env.imbalance);
+ update_lb_imbalance_stat(&env, sd, idle);

env.src_cpu = busiest->cpu;
env.src_rq = busiest;
@@ -12146,16 +12179,13 @@ static inline int on_null_domain(struct rq *rq)
* - When one of the busy CPUs notices that there may be an idle rebalancing
*   needed, they will kick the idle load balancer, which then does idle
*   load balancing for all the idle CPUs.
- *
- * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED is not set
- *   anywhere yet.
*/
static inline int find_new_ilb(void)
{
const struct cpumask *hk_mask;
int ilb_cpu;

- hk_mask = housekeeping_cpumask(HK_TYPE_MISC);
+ hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE);

for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) {

@@ -12173,7 +12203,8 @@ static inline int find_new_ilb(void)
* Kick a CPU to do the NOHZ balancing, if it is time for it, via a cross-CPU
* SMP function call (IPI).
*
- * We pick the first idle CPU in the HK_TYPE_MISC housekeeping set (if there is one).
+ * We pick the first idle CPU in the HK_TYPE_KERNEL_NOISE housekeeping set
+ * (if there is one).
*/
static void kick_ilb(unsigned int flags)
{
@@ -12261,7 +12292,7 @@ static void nohz_balancer_kick(struct rq *rq)
* If there's a runnable CFS task and the current CPU has reduced
* capacity, kick the ILB to see if there's a better CPU to run on:
*/
- if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
+ if (rq->cfs.h_nr_runnable >= 1 && check_cpu_capacity(rq, sd)) {
flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}
@@ -12393,10 +12424,6 @@ void nohz_balance_enter_idle(int cpu)
if (!cpu_active(cpu))
return;

- /* Spare idle load balancing on CPUs that don't want to be disturbed: */
- if (!housekeeping_cpu(cpu, HK_TYPE_SCHED))
- return;
-
/*
* Can be set safely without rq->lock held
* If a clear happens, it will have evaluated last additions because
@@ -12616,13 +12643,6 @@ static void nohz_newidle_balance(struct rq *this_rq)
{
int this_cpu = this_rq->cpu;

- /*
- * This CPU doesn't want to be disturbed by scheduler
- * housekeeping
- */
- if (!housekeeping_cpu(this_cpu, HK_TYPE_SCHED))
- return;
-
/* Will wake up very soon. No time for doing anything else*/
if (this_rq->avg_idle < sysctl_sched_migration_cost)
return;
@@ -12759,11 +12779,11 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
* have been enqueued in the meantime. Since we're not going idle,
* pretend we pulled a task.
*/
- if (this_rq->cfs.h_nr_running && !pulled_task)
+ if (this_rq->cfs.h_nr_queued && !pulled_task)
pulled_task = 1;

/* Is there a task of a high priority class? */
- if (this_rq->nr_running != this_rq->cfs.h_nr_running)
+ if (this_rq->nr_running != this_rq->cfs.h_nr_queued)
pulled_task = -1;

out:
@@ -12784,9 +12804,9 @@ out:
/*
* This softirq handler is triggered via SCHED_SOFTIRQ from two places:
*
- * - directly from the local scheduler_tick() for periodic load balancing
+ * - directly from the local sched_tick() for periodic load balancing
*
- * - indirectly from a remote scheduler_tick() for NOHZ idle balancing
+ * - indirectly from a remote sched_tick() for NOHZ idle balancing
*   through the SMP cross-call nohz_csd_func()
*/
static __latent_entropy void sched_balance_softirq(void)
@@ -12877,7 +12897,7 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
* MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check
* if we need to give up the CPU.
*/
- if (rq->core->core_forceidle_count && rq->cfs.nr_running == 1 &&
+ if (rq->core->core_forceidle_count && rq->cfs.nr_queued == 1 &&
__entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
resched_curr(rq);
}
@@ -13021,7 +13041,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
if (!task_on_rq_queued(p))
return;

- if (rq->cfs.nr_running == 1)
+ if (rq->cfs.nr_queued == 1)
return;

/*
@@ -13431,7 +13451,7 @@ int sched_group_set_idle(struct task_group *tg, long idle)
for_each_possible_cpu(i) {
struct rq *rq = cpu_rq(i);
struct sched_entity *se = tg->se[i];
- struct cfs_rq *parent_cfs_rq, *grp_cfs_rq = tg->cfs_rq[i];
+ struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i];
bool was_idle = cfs_rq_is_idle(grp_cfs_rq);
long idle_task_delta;
struct rq_flags rf;
@@ -13442,16 +13462,8 @@ int sched_group_set_idle(struct task_group *tg, long idle)
if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq)))
goto next_cpu;

- if (se->on_rq) {
- parent_cfs_rq = cfs_rq_of(se);
- if (cfs_rq_is_idle(grp_cfs_rq))
- parent_cfs_rq->idle_nr_running++;
- else
- parent_cfs_rq->idle_nr_running--;
- }
-
- idle_task_delta = grp_cfs_rq->h_nr_running -
- grp_cfs_rq->idle_h_nr_running;
+ idle_task_delta = grp_cfs_rq->h_nr_queued -
+ grp_cfs_rq->h_nr_idle;
if (!cfs_rq_is_idle(grp_cfs_rq))
idle_task_delta *= -1;

@@ -13461,7 +13473,7 @@ int sched_group_set_idle(struct task_group *tg, long idle)
if (!se->on_rq)
break;

- cfs_rq->idle_h_nr_running += idle_task_delta;
+ cfs_rq->h_nr_idle += idle_task_delta;

/* Already accounted at parent level and above. */
if (cfs_rq_is_idle(cfs_rq))
@@ -31,6 +31,15 @@ SCHED_FEAT(PREEMPT_SHORT, true)
*/
SCHED_FEAT(NEXT_BUDDY, false)

+ /*
+ * Allow completely ignoring cfs_rq->next; which can be set from various
+ * places:
+ *   - NEXT_BUDDY (wakeup preemption)
+ *   - yield_to_task()
+ *   - cgroup dequeue / pick
+ */
+ SCHED_FEAT(PICK_BUDDY, true)
+
/*
* Consider buddies to be cache hot, decreases the likeliness of a
* cache buddy being migrated away, increases cache locality.
@@ -9,15 +9,9 @@
*/

enum hk_flags {
- HK_FLAG_TIMER = BIT(HK_TYPE_TIMER),
- HK_FLAG_RCU = BIT(HK_TYPE_RCU),
- HK_FLAG_MISC = BIT(HK_TYPE_MISC),
- HK_FLAG_SCHED = BIT(HK_TYPE_SCHED),
- HK_FLAG_TICK = BIT(HK_TYPE_TICK),
HK_FLAG_DOMAIN = BIT(HK_TYPE_DOMAIN),
- HK_FLAG_WQ = BIT(HK_TYPE_WQ),
HK_FLAG_MANAGED_IRQ = BIT(HK_TYPE_MANAGED_IRQ),
- HK_FLAG_KTHREAD = BIT(HK_TYPE_KTHREAD),
+ HK_FLAG_KERNEL_NOISE = BIT(HK_TYPE_KERNEL_NOISE),
};

DEFINE_STATIC_KEY_FALSE(housekeeping_overridden);
@@ -97,7 +91,7 @@ void __init housekeeping_init(void)

static_branch_enable(&housekeeping_overridden);

- if (housekeeping.flags & HK_FLAG_TICK)
+ if (housekeeping.flags & HK_FLAG_KERNEL_NOISE)
sched_tick_offload_init();

for_each_set_bit(type, &housekeeping.flags, HK_TYPE_MAX) {
@@ -121,7 +115,7 @@ static int __init housekeeping_setup(char *str, unsigned long flags)
unsigned int first_cpu;
int err = 0;

- if ((flags & HK_FLAG_TICK) && !(housekeeping.flags & HK_FLAG_TICK)) {
+ if ((flags & HK_FLAG_KERNEL_NOISE) && !(housekeeping.flags & HK_FLAG_KERNEL_NOISE)) {
if (!IS_ENABLED(CONFIG_NO_HZ_FULL)) {
pr_warn("Housekeeping: nohz unsupported."
" Build with CONFIG_NO_HZ_FULL\n");
@@ -177,7 +171,7 @@ static int __init housekeeping_setup(char *str, unsigned long flags)
housekeeping_setup_type(type, housekeeping_staging);
}

- if ((flags & HK_FLAG_TICK) && !(housekeeping.flags & HK_FLAG_TICK))
+ if ((flags & HK_FLAG_KERNEL_NOISE) && !(housekeeping.flags & HK_FLAG_KERNEL_NOISE))
tick_nohz_full_setup(non_housekeeping_mask);

housekeeping.flags |= flags;
@@ -195,8 +189,7 @@ static int __init housekeeping_nohz_full_setup(char *str)
{
unsigned long flags;

- flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU |
- HK_FLAG_MISC | HK_FLAG_KTHREAD;
+ flags = HK_FLAG_KERNEL_NOISE;

return housekeeping_setup(str, flags);
}
@@ -210,9 +203,12 @@ static int __init housekeeping_isolcpus_setup(char *str)
int len;

while (isalpha(*str)) {
+ /*
+ * isolcpus=nohz is equivalent to nohz_full.
+ */
if (!strncmp(str, "nohz,", 5)) {
str += 5;
- flags |= HK_FLAG_TICK;
+ flags |= HK_FLAG_KERNEL_NOISE;
continue;
}

@@ -275,7 +275,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load)
*
* group: [ see update_cfs_group() ]
* se_weight() = tg->weight * grq->load_avg / tg->load_avg
- * se_runnable() = grq->h_nr_running
+ * se_runnable() = grq->h_nr_runnable
*
* runnable_sum = se_runnable() * runnable = grq->runnable_sum
* runnable_avg = runnable_sum
@@ -321,7 +321,7 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
{
if (___update_load_sum(now, &cfs_rq->avg,
scale_load_down(cfs_rq->load.weight),
- cfs_rq->h_nr_running - cfs_rq->h_nr_delayed,
+ cfs_rq->h_nr_runnable,
cfs_rq->curr != NULL)) {

___update_load_avg(&cfs_rq->avg, 1);
@@ -362,7 +362,7 @@ extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
extern bool __checkparam_dl(const struct sched_attr *attr);
extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
- extern int dl_bw_check_overflow(int cpu);
+ extern int dl_bw_deactivate(int cpu);
extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec);
/*
* SCHED_DEADLINE supports servers (nested scheduling) with the following
@@ -650,11 +650,10 @@ struct balance_callback {
/* CFS-related fields in a runqueue */
struct cfs_rq {
struct load_weight load;
- unsigned int nr_running;
- unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */
- unsigned int idle_nr_running; /* SCHED_IDLE */
- unsigned int idle_h_nr_running; /* SCHED_IDLE */
- unsigned int h_nr_delayed;
+ unsigned int nr_queued;
+ unsigned int h_nr_queued; /* SCHED_{NORMAL,BATCH,IDLE} */
+ unsigned int h_nr_runnable; /* SCHED_{NORMAL,BATCH,IDLE} */
+ unsigned int h_nr_idle; /* SCHED_IDLE */

s64 avg_vruntime;
u64 avg_load;
@@ -904,11 +903,8 @@ struct dl_rq {

static inline void se_update_runnable(struct sched_entity *se)
{
- if (!entity_is_task(se)) {
- struct cfs_rq *cfs_rq = se->my_q;
-
- se->runnable_weight = cfs_rq->h_nr_running - cfs_rq->h_nr_delayed;
- }
+ if (!entity_is_task(se))
+ se->runnable_weight = se->my_q->h_nr_runnable;
}

static inline long se_runnable(struct sched_entity *se)
@@ -2280,7 +2276,7 @@ static inline int task_on_cpu(struct rq *rq, struct task_struct *p)

static inline int task_on_rq_queued(struct task_struct *p)
{
- return p->on_rq == TASK_ON_RQ_QUEUED;
+ return READ_ONCE(p->on_rq) == TASK_ON_RQ_QUEUED;
}

static inline int task_on_rq_migrating(struct task_struct *p)
@@ -2574,7 +2570,7 @@ static inline bool sched_rt_runnable(struct rq *rq)

static inline bool sched_fair_runnable(struct rq *rq)
{
- return rq->cfs.nr_running > 0;
+ return rq->cfs.nr_queued > 0;
}

extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
@@ -103,7 +103,7 @@ void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p,
* Bump this up when changing the output format or the meaning of an existing
* format, so that tools can adapt (or abort)
*/
- #define SCHEDSTAT_VERSION 16
+ #define SCHEDSTAT_VERSION 17

static int show_schedstat(struct seq_file *seq, void *v)
{
@@ -138,14 +138,17 @@ static int show_schedstat(struct seq_file *seq, void *v)
for_each_domain(cpu, sd) {
enum cpu_idle_type itype;

- seq_printf(seq, "domain%d %*pb", dcount++,
+ seq_printf(seq, "domain%d %s %*pb", dcount++, sd->name,
cpumask_pr_args(sched_domain_span(sd)));
for (itype = 0; itype < CPU_MAX_IDLE_TYPES; itype++) {
- seq_printf(seq, " %u %u %u %u %u %u %u %u",
+ seq_printf(seq, " %u %u %u %u %u %u %u %u %u %u %u",
sd->lb_count[itype],
sd->lb_balanced[itype],
sd->lb_failed[itype],
- sd->lb_imbalance[itype],
+ sd->lb_imbalance_load[itype],
+ sd->lb_imbalance_util[itype],
+ sd->lb_imbalance_task[itype],
+ sd->lb_imbalance_misfit[itype],
sd->lb_gained[itype],
sd->lb_hot_gained[itype],
sd->lb_nobusyq[itype],
@@ -1635,9 +1635,7 @@ sd_init(struct sched_domain_topology_level *tl,
.max_newidle_lb_cost = 0,
.last_decay_max_lb_cost = jiffies,
.child = child,
- #ifdef CONFIG_SCHED_DEBUG
.name = tl->name,
- #endif
};

sd_span = sched_domain_span(sd);
@@ -2338,10 +2336,8 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
if (!cpumask_subset(sched_domain_span(child),
sched_domain_span(sd))) {
pr_err("BUG: arch topology borken\n");
- #ifdef CONFIG_SCHED_DEBUG
pr_err(" the %s domain not a subset of the %s domain\n",
child->name, sd->name);
- #endif
/* Fixup, ensure @sd has at least @child CPUs. */
cpumask_or(sched_domain_span(sd),
sched_domain_span(sd),
@@ -2721,9 +2717,11 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],

/*
* This domain won't be destroyed and as such
- * its dl_bw->total_bw needs to be cleared. It
- * will be recomputed in function
- * update_tasks_root_domain().
+ * its dl_bw->total_bw needs to be cleared.
+ * Tasks contribution will be then recomputed
+ * in function dl_update_tasks_root_domain(),
+ * dl_servers contribution in function
+ * dl_restore_server_root_domain().
*/
rd = cpu_rq(cpumask_any(doms_cur[i]))->rd;
dl_clear_root_domain(rd);