mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2025-01-09 14:43:16 +00:00
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip into next
Pull scheduler updates from Ingo Molnar: "The main scheduling related changes in this cycle were: - various sched/numa updates, for better performance - tree wide cleanup of open coded nice levels - nohz fix related to rq->nr_running use - cpuidle changes and continued consolidation to improve the kernel/sched/idle.c high level idle scheduling logic. As part of this effort I pulled cpuidle driver changes from Rafael as well. - standardized idle polling amongst architectures - continued work on preparing better power/energy aware scheduling - sched/rt updates - misc fixlets and cleanups" * 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (49 commits) sched/numa: Decay ->wakee_flips instead of zeroing sched/numa: Update migrate_improves/degrades_locality() sched/numa: Allow task switch if load imbalance improves sched/rt: Fix 'struct sched_dl_entity' and dl_task_time() comments, to match the current upstream code sched: Consolidate open coded implementations of nice level frobbing into nice_to_rlimit() and rlimit_to_nice() sched: Initialize rq->age_stamp on processor start sched, nohz: Change rq->nr_running to always use wrappers sched: Fix the rq->next_balance logic in rebalance_domains() and idle_balance() sched: Use clamp() and clamp_val() to make sys_nice() more readable sched: Do not zero sg->cpumask and sg->sgp->power in build_sched_groups() sched/numa: Fix initialization of sched_domain_topology for NUMA sched: Call select_idle_sibling() when not affine_sd sched: Simplify return logic in sched_read_attr() sched: Simplify return logic in sched_copy_attr() sched: Fix exec_start/task_hot on migrated tasks arm64: Remove TIF_POLLING_NRFLAG metag: Remove TIF_POLLING_NRFLAG sched/idle: Make cpuidle_idle_call() void sched/idle: Reflow cpuidle_idle_call() sched/idle: Delay clearing the polling bit ...
This commit is contained in:
commit
c84a1e32ee
@ -73,12 +73,14 @@ register struct thread_info *__current_thread_info __asm__("$8");
|
||||
#define TIF_SYSCALL_AUDIT 4 /* syscall audit active */
|
||||
#define TIF_DIE_IF_KERNEL 9 /* dik recursion lock */
|
||||
#define TIF_MEMDIE 13 /* is terminating due to OOM killer */
|
||||
#define TIF_POLLING_NRFLAG 14 /* idle is polling for TIF_NEED_RESCHED */
|
||||
|
||||
#define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE)
|
||||
#define _TIF_SIGPENDING (1<<TIF_SIGPENDING)
|
||||
#define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED)
|
||||
#define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME)
|
||||
#define _TIF_SYSCALL_AUDIT (1<<TIF_SYSCALL_AUDIT)
|
||||
#define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG)
|
||||
|
||||
/* Work to do on interrupt/exception return. */
|
||||
#define _TIF_WORK_MASK (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
|
||||
@ -92,8 +94,6 @@ register struct thread_info *__current_thread_info __asm__("$8");
|
||||
#define TS_UAC_NOFIX 0x0002 /* ! flags as they match */
|
||||
#define TS_UAC_SIGBUS 0x0004 /* ! userspace part of 'osf_sysinfo' */
|
||||
#define TS_RESTORE_SIGMASK 0x0008 /* restore signal mask in do_signal() */
|
||||
#define TS_POLLING 0x0010 /* idle task polling need_resched,
|
||||
skip sending interrupt */
|
||||
|
||||
#ifndef __ASSEMBLY__
|
||||
#define HAVE_SET_RESTORE_SIGMASK 1
|
||||
|
@ -185,6 +185,15 @@ const struct cpumask *cpu_coregroup_mask(int cpu)
|
||||
return &cpu_topology[cpu].core_sibling;
|
||||
}
|
||||
|
||||
/*
|
||||
* cpu_corepower_mask - return the thread_sibling mask stored in
|
||||
* cpu_topology[] for @cpu.
|
||||
*
|
||||
* The current assumption is that we can power gate each core independently.
|
||||
* This will be superseded by DT binding once available.
|
||||
*/
|
||||
const struct cpumask *cpu_corepower_mask(int cpu)
|
||||
{
|
||||
return &cpu_topology[cpu].thread_sibling;
|
||||
}
|
||||
|
||||
static void update_siblings_masks(unsigned int cpuid)
|
||||
{
|
||||
struct cputopo_arm *cpu_topo, *cpuid_topo = &cpu_topology[cpuid];
|
||||
@ -266,6 +275,20 @@ void store_cpu_topology(unsigned int cpuid)
|
||||
cpu_topology[cpuid].socket_id, mpidr);
|
||||
}
|
||||
|
||||
/* Sched-domain flags used by the GMC entry of arm_topology[]: SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN. */
static inline const int cpu_corepower_flags(void)
|
||||
{
|
||||
return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN;
|
||||
}
|
||||
|
||||
/* ARM scheduler topology table; handed to set_sched_topology() from init_cpu_topology(). */
static struct sched_domain_topology_level arm_topology[] = {
|
||||
#ifdef CONFIG_SCHED_MC
|
||||
{ cpu_corepower_mask, cpu_corepower_flags, SD_INIT_NAME(GMC) }, /* thread_sibling mask + power-domain flags */
|
||||
{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, /* core_sibling mask */
|
||||
#endif
|
||||
{ cpu_cpu_mask, SD_INIT_NAME(DIE) }, /* no sd_flags callback given for this level */
|
||||
{ NULL, }, /* NULL mask: presumably the end-of-table marker - confirm against the table walker */
|
||||
};
|
||||
|
||||
/*
|
||||
* init_cpu_topology is called at boot when only one cpu is running
|
||||
* which prevents simultaneous write access to cpu_topology array
|
||||
@ -289,4 +312,7 @@ void __init init_cpu_topology(void)
|
||||
smp_wmb();
|
||||
|
||||
parse_dt_topology();
|
||||
|
||||
/* Set scheduler topology descriptor */
|
||||
set_sched_topology(arm_topology);
|
||||
}
|
||||
|
@ -95,13 +95,11 @@ static inline struct thread_info *current_thread_info(void)
|
||||
* TIF_NEED_RESCHED - rescheduling necessary
|
||||
* TIF_NOTIFY_RESUME - callback before returning to user
|
||||
* TIF_USEDFPU - FPU was used by this task this quantum (SMP)
|
||||
* TIF_POLLING_NRFLAG - true if poll_idle() is polling TIF_NEED_RESCHED
|
||||
*/
|
||||
#define TIF_SIGPENDING 0
|
||||
#define TIF_NEED_RESCHED 1
|
||||
#define TIF_NOTIFY_RESUME 2 /* callback before returning to user */
|
||||
#define TIF_SYSCALL_TRACE 8
|
||||
#define TIF_POLLING_NRFLAG 16
|
||||
#define TIF_MEMDIE 18 /* is terminating due to OOM killer */
|
||||
#define TIF_FREEZE 19
|
||||
#define TIF_RESTORE_SIGMASK 20
|
||||
|
@ -107,6 +107,7 @@ struct thread_info {
|
||||
#define TIF_MCA_INIT 18 /* this task is processing MCA or INIT */
|
||||
#define TIF_DB_DISABLED 19 /* debug trap disabled for fsyscall */
|
||||
#define TIF_RESTORE_RSE 21 /* user RBS is newer than kernel RBS */
|
||||
#define TIF_POLLING_NRFLAG 22 /* idle is polling for TIF_NEED_RESCHED */
|
||||
|
||||
#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
|
||||
#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
|
||||
@ -118,6 +119,7 @@ struct thread_info {
|
||||
#define _TIF_MCA_INIT (1 << TIF_MCA_INIT)
|
||||
#define _TIF_DB_DISABLED (1 << TIF_DB_DISABLED)
|
||||
#define _TIF_RESTORE_RSE (1 << TIF_RESTORE_RSE)
|
||||
#define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG)
|
||||
|
||||
/* "work to do on user-return" bits */
|
||||
#define TIF_ALLWORK_MASK (_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SYSCALL_AUDIT|\
|
||||
@ -125,7 +127,6 @@ struct thread_info {
|
||||
/* like TIF_ALLWORK_BITS but sans TIF_SYSCALL_TRACE or TIF_SYSCALL_AUDIT */
|
||||
#define TIF_WORK_MASK (TIF_ALLWORK_MASK&~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT))
|
||||
|
||||
#define TS_POLLING 1 /* true if in idle loop and not sleeping */
|
||||
#define TS_RESTORE_SIGMASK 2 /* restore signal mask in do_signal() */
|
||||
|
||||
#ifndef __ASSEMBLY__
|
||||
|
@ -46,30 +46,6 @@
|
||||
|
||||
void build_cpu_to_node_map(void);
|
||||
|
||||
#define SD_CPU_INIT (struct sched_domain) { \
|
||||
.parent = NULL, \
|
||||
.child = NULL, \
|
||||
.groups = NULL, \
|
||||
.min_interval = 1, \
|
||||
.max_interval = 4, \
|
||||
.busy_factor = 64, \
|
||||
.imbalance_pct = 125, \
|
||||
.cache_nice_tries = 2, \
|
||||
.busy_idx = 2, \
|
||||
.idle_idx = 1, \
|
||||
.newidle_idx = 0, \
|
||||
.wake_idx = 0, \
|
||||
.forkexec_idx = 0, \
|
||||
.flags = SD_LOAD_BALANCE \
|
||||
| SD_BALANCE_NEWIDLE \
|
||||
| SD_BALANCE_EXEC \
|
||||
| SD_BALANCE_FORK \
|
||||
| SD_WAKE_AFFINE, \
|
||||
.last_balance = jiffies, \
|
||||
.balance_interval = 1, \
|
||||
.nr_balance_failed = 0, \
|
||||
}
|
||||
|
||||
#endif /* CONFIG_NUMA */
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
|
@ -117,10 +117,8 @@ static inline int kstack_end(void *addr)
|
||||
#define TIF_SECCOMP 5 /* secure computing */
|
||||
#define TIF_RESTORE_SIGMASK 6 /* restore signal mask in do_signal() */
|
||||
#define TIF_NOTIFY_RESUME 7 /* callback before returning to user */
|
||||
#define TIF_POLLING_NRFLAG 8 /* true if poll_idle() is polling
|
||||
TIF_NEED_RESCHED */
|
||||
#define TIF_MEMDIE 9 /* is terminating due to OOM killer */
|
||||
#define TIF_SYSCALL_TRACEPOINT 10 /* syscall tracepoint instrumentation */
|
||||
#define TIF_MEMDIE 8 /* is terminating due to OOM killer */
|
||||
#define TIF_SYSCALL_TRACEPOINT 9 /* syscall tracepoint instrumentation */
|
||||
|
||||
|
||||
#define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE)
|
||||
|
@ -766,6 +766,28 @@ int setup_profiling_timer(unsigned int multiplier)
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_SCHED_SMT
|
||||
/* Sched-domain flags for the SMT level: SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES, plus SD_ASYM_PACKING when the CPU has CPU_FTR_ASYM_SMT. */
|
||||
static const int powerpc_smt_flags(void)
|
||||
{
|
||||
int flags = SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES;
|
||||
|
||||
if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
|
||||
printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n"); /* printk_once: message is logged a single time */
|
||||
flags |= SD_ASYM_PACKING;
|
||||
}
|
||||
return flags;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* powerpc scheduler topology table; handed to set_sched_topology() from smp_cpus_done(). */
static struct sched_domain_topology_level powerpc_topology[] = {
|
||||
#ifdef CONFIG_SCHED_SMT
|
||||
{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) }, /* flags computed at run time by powerpc_smt_flags() */
|
||||
#endif
|
||||
{ cpu_cpu_mask, SD_INIT_NAME(DIE) }, /* no sd_flags callback given for this level */
|
||||
{ NULL, }, /* NULL mask: presumably the end-of-table marker - confirm against the table walker */
|
||||
};
|
||||
|
||||
void __init smp_cpus_done(unsigned int max_cpus)
|
||||
{
|
||||
cpumask_var_t old_mask;
|
||||
@ -790,15 +812,8 @@ void __init smp_cpus_done(unsigned int max_cpus)
|
||||
|
||||
dump_numa_cpu_topology();
|
||||
|
||||
}
|
||||
set_sched_topology(powerpc_topology);
|
||||
|
||||
int arch_sd_sibling_asym_packing(void)
|
||||
{
|
||||
if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
|
||||
printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");
|
||||
return SD_ASYM_PACKING;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_HOTPLUG_CPU
|
||||
|
@ -26,21 +26,12 @@ extern struct cpu_topology_s390 cpu_topology[NR_CPUS];
|
||||
|
||||
#define mc_capable() 1
|
||||
|
||||
static inline const struct cpumask *cpu_coregroup_mask(int cpu)
|
||||
{
|
||||
return &cpu_topology[cpu].core_mask;
|
||||
}
|
||||
|
||||
static inline const struct cpumask *cpu_book_mask(int cpu)
|
||||
{
|
||||
return &cpu_topology[cpu].book_mask;
|
||||
}
|
||||
|
||||
int topology_cpu_init(struct cpu *);
|
||||
int topology_set_cpu_management(int fc);
|
||||
void topology_schedule_update(void);
|
||||
void store_topology(struct sysinfo_15_1_x *info);
|
||||
void topology_expect_change(void);
|
||||
const struct cpumask *cpu_coregroup_mask(int cpu);
|
||||
|
||||
#else /* CONFIG_SCHED_BOOK */
|
||||
|
||||
@ -64,8 +55,6 @@ static inline void s390_init_cpu_topology(void)
|
||||
};
|
||||
#endif
|
||||
|
||||
#define SD_BOOK_INIT SD_CPU_INIT
|
||||
|
||||
#include <asm-generic/topology.h>
|
||||
|
||||
#endif /* _ASM_S390_TOPOLOGY_H */
|
||||
|
@ -445,6 +445,23 @@ int topology_cpu_init(struct cpu *cpu)
|
||||
return sysfs_create_group(&cpu->dev.kobj, &topology_cpu_attr_group);
|
||||
}
|
||||
|
||||
/* Return the core_mask stored in cpu_topology[] for @cpu; used as the MC level mask in s390_topology[]. */
const struct cpumask *cpu_coregroup_mask(int cpu)
|
||||
{
|
||||
return &cpu_topology[cpu].core_mask;
|
||||
}
|
||||
|
||||
/* Return the book_mask stored in cpu_topology[] for @cpu; used as the BOOK level mask in s390_topology[]. */
static const struct cpumask *cpu_book_mask(int cpu)
|
||||
{
|
||||
return &cpu_topology[cpu].book_mask;
|
||||
}
|
||||
|
||||
/* s390 scheduler topology table; handed to set_sched_topology() from topology_init(). */
static struct sched_domain_topology_level s390_topology[] = {
|
||||
{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, /* core_mask level */
|
||||
{ cpu_book_mask, SD_INIT_NAME(BOOK) }, /* book_mask level, no sd_flags callback */
|
||||
{ cpu_cpu_mask, SD_INIT_NAME(DIE) }, /* no sd_flags callback given for this level */
|
||||
{ NULL, }, /* NULL mask: presumably the end-of-table marker - confirm against the table walker */
|
||||
};
|
||||
|
||||
static int __init topology_init(void)
|
||||
{
|
||||
if (!MACHINE_HAS_TOPOLOGY) {
|
||||
@ -453,6 +470,9 @@ static int __init topology_init(void)
|
||||
}
|
||||
set_topology_timer();
|
||||
out:
|
||||
|
||||
set_sched_topology(s390_topology);
|
||||
|
||||
return device_create_file(cpu_subsys.dev_root, &dev_attr_dispatching);
|
||||
}
|
||||
device_initcall(topology_init);
|
||||
|
@ -129,6 +129,7 @@ extern void _cpu_idle(void);
|
||||
#define TIF_MEMDIE 7 /* OOM killer at work */
|
||||
#define TIF_NOTIFY_RESUME 8 /* callback before returning to user */
|
||||
#define TIF_SYSCALL_TRACEPOINT 9 /* syscall tracepoint instrumentation */
|
||||
#define TIF_POLLING_NRFLAG 10 /* idle is polling for TIF_NEED_RESCHED */
|
||||
|
||||
#define _TIF_SIGPENDING (1<<TIF_SIGPENDING)
|
||||
#define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED)
|
||||
@ -140,6 +141,7 @@ extern void _cpu_idle(void);
|
||||
#define _TIF_MEMDIE (1<<TIF_MEMDIE)
|
||||
#define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME)
|
||||
#define _TIF_SYSCALL_TRACEPOINT (1<<TIF_SYSCALL_TRACEPOINT)
|
||||
#define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG)
|
||||
|
||||
/* Work to do on any return to user space. */
|
||||
#define _TIF_ALLWORK_MASK \
|
||||
@ -162,7 +164,6 @@ extern void _cpu_idle(void);
|
||||
#ifdef __tilegx__
|
||||
#define TS_COMPAT 0x0001 /* 32-bit compatibility mode */
|
||||
#endif
|
||||
#define TS_POLLING 0x0004 /* in idle loop but not sleeping */
|
||||
#define TS_RESTORE_SIGMASK 0x0008 /* restore signal mask in do_signal */
|
||||
|
||||
#ifndef __ASSEMBLY__
|
||||
|
@ -44,39 +44,6 @@ static inline const struct cpumask *cpumask_of_node(int node)
|
||||
/* For now, use numa node -1 for global allocation. */
|
||||
#define pcibus_to_node(bus) ((void)(bus), -1)
|
||||
|
||||
/*
|
||||
* TILE architecture has many cores integrated in one processor, so we need
|
||||
* setup bigger balance_interval for both CPU/NODE scheduling domains to
|
||||
* reduce process scheduling costs.
|
||||
*/
|
||||
|
||||
/* sched_domains SD_CPU_INIT for TILE architecture */
|
||||
#define SD_CPU_INIT (struct sched_domain) { \
|
||||
.min_interval = 4, \
|
||||
.max_interval = 128, \
|
||||
.busy_factor = 64, \
|
||||
.imbalance_pct = 125, \
|
||||
.cache_nice_tries = 1, \
|
||||
.busy_idx = 2, \
|
||||
.idle_idx = 1, \
|
||||
.newidle_idx = 0, \
|
||||
.wake_idx = 0, \
|
||||
.forkexec_idx = 0, \
|
||||
\
|
||||
.flags = 1*SD_LOAD_BALANCE \
|
||||
| 1*SD_BALANCE_NEWIDLE \
|
||||
| 1*SD_BALANCE_EXEC \
|
||||
| 1*SD_BALANCE_FORK \
|
||||
| 0*SD_BALANCE_WAKE \
|
||||
| 0*SD_WAKE_AFFINE \
|
||||
| 0*SD_SHARE_CPUPOWER \
|
||||
| 0*SD_SHARE_PKG_RESOURCES \
|
||||
| 0*SD_SERIALIZE \
|
||||
, \
|
||||
.last_balance = jiffies, \
|
||||
.balance_interval = 32, \
|
||||
}
|
||||
|
||||
/* By definition, we create nodes based on online memory. */
|
||||
#define node_has_online_mem(nid) 1
|
||||
|
||||
|
@ -83,6 +83,7 @@ struct thread_info {
|
||||
#define TIF_FORK 18 /* ret_from_fork */
|
||||
#define TIF_NOHZ 19 /* in adaptive nohz mode */
|
||||
#define TIF_MEMDIE 20 /* is terminating due to OOM killer */
|
||||
#define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */
|
||||
#define TIF_IO_BITMAP 22 /* uses I/O bitmap */
|
||||
#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */
|
||||
#define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */
|
||||
@ -106,6 +107,7 @@ struct thread_info {
|
||||
#define _TIF_IA32 (1 << TIF_IA32)
|
||||
#define _TIF_FORK (1 << TIF_FORK)
|
||||
#define _TIF_NOHZ (1 << TIF_NOHZ)
|
||||
#define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG)
|
||||
#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP)
|
||||
#define _TIF_FORCED_TF (1 << TIF_FORCED_TF)
|
||||
#define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP)
|
||||
@ -191,8 +193,6 @@ static inline struct thread_info *current_thread_info(void)
|
||||
* have to worry about atomic accesses.
|
||||
*/
|
||||
#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/
|
||||
#define TS_POLLING 0x0004 /* idle task polling need_resched,
|
||||
skip sending interrupt */
|
||||
#define TS_RESTORE_SIGMASK 0x0008 /* restore signal mask in do_signal() */
|
||||
|
||||
#ifndef __ASSEMBLY__
|
||||
|
@ -844,21 +844,10 @@ static int apm_do_idle(void)
|
||||
int polling;
|
||||
int err = 0;
|
||||
|
||||
polling = !!(current_thread_info()->status & TS_POLLING);
|
||||
if (polling) {
|
||||
current_thread_info()->status &= ~TS_POLLING;
|
||||
/*
|
||||
* TS_POLLING-cleared state must be visible before we
|
||||
* test NEED_RESCHED:
|
||||
*/
|
||||
smp_mb();
|
||||
}
|
||||
if (!need_resched()) {
|
||||
idled = 1;
|
||||
ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax, &err);
|
||||
}
|
||||
if (polling)
|
||||
current_thread_info()->status |= TS_POLLING;
|
||||
|
||||
if (!idled)
|
||||
return 0;
|
||||
|
@ -548,7 +548,7 @@ static int loop_thread(void *data)
|
||||
struct loop_device *lo = data;
|
||||
struct bio *bio;
|
||||
|
||||
set_user_nice(current, -20);
|
||||
set_user_nice(current, MIN_NICE);
|
||||
|
||||
while (!kthread_should_stop() || !bio_list_empty(&lo->lo_bio_list)) {
|
||||
|
||||
|
@ -533,7 +533,7 @@ static int nbd_thread(void *data)
|
||||
struct nbd_device *nbd = data;
|
||||
struct request *req;
|
||||
|
||||
set_user_nice(current, -20);
|
||||
set_user_nice(current, MIN_NICE);
|
||||
while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) {
|
||||
/* wait for something to do */
|
||||
wait_event_interruptible(nbd->waiting_wq,
|
||||
|
@ -1463,7 +1463,7 @@ static int kcdrwd(void *foobar)
|
||||
struct packet_data *pkt;
|
||||
long min_sleep_time, residue;
|
||||
|
||||
set_user_nice(current, -20);
|
||||
set_user_nice(current, MIN_NICE);
|
||||
set_freezable();
|
||||
|
||||
for (;;) {
|
||||
|
@ -1007,7 +1007,7 @@ static int ipmi_thread(void *data)
|
||||
struct timespec busy_until;
|
||||
|
||||
ipmi_si_set_not_busy(&busy_until);
|
||||
set_user_nice(current, 19);
|
||||
set_user_nice(current, MAX_NICE);
|
||||
while (!kthread_should_stop()) {
|
||||
int busy_wait;
|
||||
|
||||
|
@ -32,6 +32,7 @@ LIST_HEAD(cpuidle_detected_devices);
|
||||
static int enabled_devices;
|
||||
static int off __read_mostly;
|
||||
static int initialized __read_mostly;
|
||||
static bool use_deepest_state __read_mostly;
|
||||
|
||||
int cpuidle_disabled(void)
|
||||
{
|
||||
@ -65,23 +66,42 @@ int cpuidle_play_dead(void)
|
||||
}
|
||||
|
||||
/**
|
||||
* cpuidle_enabled - check if the cpuidle framework is ready
|
||||
* @dev: cpuidle device for this cpu
|
||||
* @drv: cpuidle driver for this cpu
|
||||
* cpuidle_use_deepest_state - Enable/disable the "deepest idle" mode.
|
||||
* @enable: Whether to enable or disable the feature.
|
||||
*
|
||||
* Return 0 on success, otherwise:
|
||||
* -ENODEV : the cpuidle framework is not available
|
||||
* -EBUSY : the cpuidle framework is not initialized
|
||||
* If the "deepest idle" mode is enabled, cpuidle will ignore the governor and
|
||||
* always use the state with the greatest exit latency (out of the states that
|
||||
* are not disabled).
|
||||
*
|
||||
* This function can only be called after cpuidle_pause() to avoid races.
|
||||
*/
|
||||
int cpuidle_enabled(struct cpuidle_driver *drv, struct cpuidle_device *dev)
|
||||
void cpuidle_use_deepest_state(bool enable)
|
||||
{
|
||||
if (off || !initialized)
|
||||
return -ENODEV;
|
||||
use_deepest_state = enable;
|
||||
}
|
||||
|
||||
if (!drv || !dev || !dev->enabled)
|
||||
return -EBUSY;
|
||||
/**
|
||||
* cpuidle_find_deepest_state - Find the state with the greatest exit latency.
|
||||
* @drv: cpuidle driver for a given CPU.
|
||||
|
||||
* @dev: cpuidle device for a given CPU.
|
||||
*
|
||||
* Scans drv->states[] from CPUIDLE_DRIVER_STATE_START up to drv->state_count,
|
||||
* skipping any state disabled in the driver (s->disabled) or in the per-device
|
||||
* usage data (su->disable).
|
||||
*
|
||||
* Return: index of the non-disabled state with the largest exit_latency, or
|
||||
* CPUIDLE_DRIVER_STATE_START - 1 if no state qualifies.
|
||||
*/
|
||||
static int cpuidle_find_deepest_state(struct cpuidle_driver *drv,
|
||||
struct cpuidle_device *dev)
|
||||
{
|
||||
unsigned int latency_req = 0; /* largest exit latency seen so far */
|
||||
int i, ret = CPUIDLE_DRIVER_STATE_START - 1; /* value returned when nothing qualifies */
|
||||
|
||||
return 0; /* NOTE(review): looks like a removed line of the old cpuidle_enabled() left in by the diff rendering, not part of this function - confirm */
|
||||
for (i = CPUIDLE_DRIVER_STATE_START; i < drv->state_count; i++) {
|
||||
struct cpuidle_state *s = &drv->states[i];
|
||||
struct cpuidle_state_usage *su = &dev->states_usage[i];
|
||||
|
||||
/* skip disabled states and states that are not deeper than the current best */
if (s->disabled || su->disable || s->exit_latency <= latency_req)
|
||||
continue;
|
||||
|
||||
latency_req = s->exit_latency;
|
||||
ret = i;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -138,6 +158,15 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
|
||||
*/
|
||||
int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
|
||||
{
|
||||
if (off || !initialized) /* file-scope 'off' set or 'initialized' still clear */
|
||||
return -ENODEV;
|
||||
|
||||
if (!drv || !dev || !dev->enabled) /* no driver, no device, or device not enabled */
|
||||
return -EBUSY;
|
||||
|
||||
if (unlikely(use_deepest_state)) /* "deepest idle" mode: bypass the governor entirely */
|
||||
return cpuidle_find_deepest_state(drv, dev);
|
||||
|
||||
return cpuidle_curr_governor->select(drv, dev); /* normal path: let the current governor choose */
|
||||
}
|
||||
|
||||
@ -169,7 +198,7 @@ int cpuidle_enter(struct cpuidle_driver *drv, struct cpuidle_device *dev,
|
||||
*/
|
||||
void cpuidle_reflect(struct cpuidle_device *dev, int index)
|
||||
{
|
||||
if (cpuidle_curr_governor->reflect)
|
||||
if (cpuidle_curr_governor->reflect && !unlikely(use_deepest_state))
|
||||
cpuidle_curr_governor->reflect(dev, index);
|
||||
}
|
||||
|
||||
|
@ -296,7 +296,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
|
||||
data->needs_update = 0;
|
||||
}
|
||||
|
||||
data->last_state_idx = 0;
|
||||
data->last_state_idx = CPUIDLE_DRIVER_STATE_START - 1;
|
||||
|
||||
/* Special case when user has set very strict latency requirement */
|
||||
if (unlikely(latency_req == 0))
|
||||
@ -310,13 +310,6 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
|
||||
|
||||
data->bucket = which_bucket(data->next_timer_us);
|
||||
|
||||
/*
|
||||
* if the correction factor is 0 (eg first time init or cpu hotplug
|
||||
* etc), we actually want to start out with a unity factor.
|
||||
*/
|
||||
if (data->correction_factor[data->bucket] == 0)
|
||||
data->correction_factor[data->bucket] = RESOLUTION * DECAY;
|
||||
|
||||
/*
|
||||
* Force the result of multiplication to be 64 bits even if both
|
||||
* operands are 32 bits.
|
||||
@ -466,9 +459,17 @@ static int menu_enable_device(struct cpuidle_driver *drv,
|
||||
struct cpuidle_device *dev)
|
||||
{
|
||||
struct menu_device *data = &per_cpu(menu_devices, dev->cpu);
|
||||
int i;
|
||||
|
||||
memset(data, 0, sizeof(struct menu_device));
|
||||
|
||||
/*
|
||||
* if the correction factor is 0 (eg first time init or cpu hotplug
|
||||
* etc), we actually want to start out with a unity factor.
|
||||
*/
|
||||
for(i = 0; i < BUCKETS; i++)
|
||||
data->correction_factor[i] = RESOLUTION * DECAY;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -1803,7 +1803,7 @@ static int ap_poll_thread(void *data)
|
||||
int requests;
|
||||
struct ap_device *ap_dev;
|
||||
|
||||
set_user_nice(current, 19);
|
||||
set_user_nice(current, MAX_NICE);
|
||||
while (1) {
|
||||
if (ap_suspend_flag)
|
||||
return 0;
|
||||
|
@ -464,7 +464,7 @@ static int bnx2fc_l2_rcv_thread(void *arg)
|
||||
struct fcoe_percpu_s *bg = arg;
|
||||
struct sk_buff *skb;
|
||||
|
||||
set_user_nice(current, -20);
|
||||
set_user_nice(current, MIN_NICE);
|
||||
set_current_state(TASK_INTERRUPTIBLE);
|
||||
while (!kthread_should_stop()) {
|
||||
schedule();
|
||||
@ -602,7 +602,7 @@ int bnx2fc_percpu_io_thread(void *arg)
|
||||
struct bnx2fc_work *work, *tmp;
|
||||
LIST_HEAD(work_list);
|
||||
|
||||
set_user_nice(current, -20);
|
||||
set_user_nice(current, MIN_NICE);
|
||||
set_current_state(TASK_INTERRUPTIBLE);
|
||||
while (!kthread_should_stop()) {
|
||||
schedule();
|
||||
|
@ -1870,7 +1870,7 @@ int bnx2i_percpu_io_thread(void *arg)
|
||||
struct bnx2i_work *work, *tmp;
|
||||
LIST_HEAD(work_list);
|
||||
|
||||
set_user_nice(current, -20);
|
||||
set_user_nice(current, MIN_NICE);
|
||||
|
||||
while (!kthread_should_stop()) {
|
||||
spin_lock_bh(&p->p_work_lock);
|
||||
|
@ -1872,7 +1872,7 @@ static int fcoe_percpu_receive_thread(void *arg)
|
||||
|
||||
skb_queue_head_init(&tmp);
|
||||
|
||||
set_user_nice(current, -20);
|
||||
set_user_nice(current, MIN_NICE);
|
||||
|
||||
retry:
|
||||
while (!kthread_should_stop()) {
|
||||
|
@ -4515,7 +4515,7 @@ static int ibmvfc_work(void *data)
|
||||
struct ibmvfc_host *vhost = data;
|
||||
int rc;
|
||||
|
||||
set_user_nice(current, -20);
|
||||
set_user_nice(current, MIN_NICE);
|
||||
|
||||
while (1) {
|
||||
rc = wait_event_interruptible(vhost->work_wait_q,
|
||||
|
@ -2213,7 +2213,7 @@ static int ibmvscsi_work(void *data)
|
||||
struct ibmvscsi_host_data *hostdata = data;
|
||||
int rc;
|
||||
|
||||
set_user_nice(current, -20);
|
||||
set_user_nice(current, MIN_NICE);
|
||||
|
||||
while (1) {
|
||||
rc = wait_event_interruptible(hostdata->work_wait_q,
|
||||
|
@ -731,7 +731,7 @@ lpfc_do_work(void *p)
|
||||
struct lpfc_hba *phba = p;
|
||||
int rc;
|
||||
|
||||
set_user_nice(current, -20);
|
||||
set_user_nice(current, MIN_NICE);
|
||||
current->flags |= PF_NOFREEZE;
|
||||
phba->data_flags = 0;
|
||||
|
||||
|
@ -4828,7 +4828,7 @@ qla2x00_do_dpc(void *data)
|
||||
ha = (struct qla_hw_data *)data;
|
||||
base_vha = pci_get_drvdata(ha->pdev);
|
||||
|
||||
set_user_nice(current, -20);
|
||||
set_user_nice(current, MIN_NICE);
|
||||
|
||||
set_current_state(TASK_INTERRUPTIBLE);
|
||||
while (!kthread_should_stop()) {
|
||||
|
@ -439,12 +439,12 @@ static void binder_set_nice(long nice)
|
||||
set_user_nice(current, nice);
|
||||
return;
|
||||
}
|
||||
min_nice = 20 - current->signal->rlim[RLIMIT_NICE].rlim_cur;
|
||||
min_nice = rlimit_to_nice(current->signal->rlim[RLIMIT_NICE].rlim_cur);
|
||||
binder_debug(BINDER_DEBUG_PRIORITY_CAP,
|
||||
"%d: nice value %ld not allowed use %ld instead\n",
|
||||
current->pid, nice, min_nice);
|
||||
set_user_nice(current, min_nice);
|
||||
if (min_nice < 20)
|
||||
if (min_nice <= MAX_NICE)
|
||||
return;
|
||||
binder_user_error("%d RLIMIT_NICE not set\n", current->pid);
|
||||
}
|
||||
|
@ -404,7 +404,7 @@ static int loop_thread(void *data)
|
||||
int refcheck;
|
||||
int ret = 0;
|
||||
|
||||
set_user_nice(current, -20);
|
||||
set_user_nice(current, MIN_NICE);
|
||||
|
||||
lo->lo_state = LLOOP_BOUND;
|
||||
|
||||
|
@ -1107,7 +1107,7 @@ static int o2hb_thread(void *data)
|
||||
|
||||
mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n");
|
||||
|
||||
set_user_nice(current, -20);
|
||||
set_user_nice(current, MIN_NICE);
|
||||
|
||||
/* Pin node */
|
||||
o2nm_depend_this_node();
|
||||
|
@ -120,8 +120,6 @@ struct cpuidle_driver {
|
||||
#ifdef CONFIG_CPU_IDLE
|
||||
extern void disable_cpuidle(void);
|
||||
|
||||
extern int cpuidle_enabled(struct cpuidle_driver *drv,
|
||||
struct cpuidle_device *dev);
|
||||
extern int cpuidle_select(struct cpuidle_driver *drv,
|
||||
struct cpuidle_device *dev);
|
||||
extern int cpuidle_enter(struct cpuidle_driver *drv,
|
||||
@ -145,13 +143,11 @@ extern void cpuidle_resume(void);
|
||||
extern int cpuidle_enable_device(struct cpuidle_device *dev);
|
||||
extern void cpuidle_disable_device(struct cpuidle_device *dev);
|
||||
extern int cpuidle_play_dead(void);
|
||||
extern void cpuidle_use_deepest_state(bool enable);
|
||||
|
||||
extern struct cpuidle_driver *cpuidle_get_cpu_driver(struct cpuidle_device *dev);
|
||||
#else
|
||||
static inline void disable_cpuidle(void) { }
|
||||
static inline int cpuidle_enabled(struct cpuidle_driver *drv,
|
||||
struct cpuidle_device *dev)
|
||||
{return -ENODEV; }
|
||||
static inline int cpuidle_select(struct cpuidle_driver *drv,
|
||||
struct cpuidle_device *dev)
|
||||
{return -ENODEV; }
|
||||
@ -180,6 +176,7 @@ static inline int cpuidle_enable_device(struct cpuidle_device *dev)
|
||||
{return -ENODEV; }
|
||||
static inline void cpuidle_disable_device(struct cpuidle_device *dev) { }
|
||||
static inline int cpuidle_play_dead(void) {return -ENODEV; }
|
||||
static inline void cpuidle_use_deepest_state(bool enable) {}
|
||||
static inline struct cpuidle_driver *cpuidle_get_cpu_driver(
|
||||
struct cpuidle_device *dev) {return NULL; }
|
||||
#endif
|
||||
|
@ -870,6 +870,7 @@ enum cpu_idle_type {
|
||||
#define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */
|
||||
#define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */
|
||||
#define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */
|
||||
#define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */
|
||||
#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */
|
||||
#define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */
|
||||
#define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */
|
||||
@ -877,7 +878,26 @@ enum cpu_idle_type {
|
||||
#define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */
|
||||
#define SD_NUMA 0x4000 /* cross-node balancing */
|
||||
|
||||
extern int __weak arch_sd_sibiling_asym_packing(void);
|
||||
#ifdef CONFIG_SCHED_SMT
|
||||
/* Default sched-domain flags for an SMT level: SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES. */
static inline const int cpu_smt_flags(void)
|
||||
{
|
||||
return SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_SCHED_MC
|
||||
/* Default sched-domain flags for an MC level: SD_SHARE_PKG_RESOURCES. */
static inline const int cpu_core_flags(void)
|
||||
{
|
||||
return SD_SHARE_PKG_RESOURCES;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_NUMA
|
||||
/* Default sched-domain flags for a NUMA level: SD_NUMA. */
static inline const int cpu_numa_flags(void)
|
||||
{
|
||||
return SD_NUMA;
|
||||
}
|
||||
#endif
|
||||
|
||||
struct sched_domain_attr {
|
||||
int relax_domain_level;
|
||||
@ -985,6 +1005,38 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms);
|
||||
|
||||
bool cpus_share_cache(int this_cpu, int that_cpu);
|
||||
|
||||
typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
|
||||
typedef const int (*sched_domain_flags_f)(void);
|
||||
|
||||
#define SDTL_OVERLAP 0x01
|
||||
|
||||
/* Per-topology-level storage embedded in struct sched_domain_topology_level; each field carries the __percpu annotation (allocation is not visible here). */
struct sd_data {
|
||||
struct sched_domain **__percpu sd; /* sched_domain pointers */
|
||||
struct sched_group **__percpu sg; /* sched_group pointers */
|
||||
struct sched_group_power **__percpu sgp; /* sched_group_power pointers */
|
||||
};
|
||||
|
||||
/* One level of a scheduler topology table; architectures build arrays of these and pass them to set_sched_topology(). */
struct sched_domain_topology_level {
|
||||
sched_domain_mask_f mask; /* callback: cpumask covering this level for a given cpu */
|
||||
sched_domain_flags_f sd_flags; /* callback returning SD_* flags; left unset by entries that give only a mask */
|
||||
int flags; /* presumably SDTL_* bits such as SDTL_OVERLAP - confirm */
|
||||
int numa_level; /* NOTE(review): meaning not visible in this hunk - confirm */
|
||||
struct sd_data data; /* per-level storage, see struct sd_data */
|
||||
#ifdef CONFIG_SCHED_DEBUG
|
||||
char *name; /* level name, filled in by SD_INIT_NAME() */
|
||||
#endif
|
||||
};
|
||||
|
||||
extern struct sched_domain_topology_level *sched_domain_topology;
|
||||
|
||||
extern void set_sched_topology(struct sched_domain_topology_level *tl);
|
||||
|
||||
#ifdef CONFIG_SCHED_DEBUG
|
||||
# define SD_INIT_NAME(type) .name = #type
|
||||
#else
|
||||
# define SD_INIT_NAME(type)
|
||||
#endif
|
||||
|
||||
#else /* CONFIG_SMP */
|
||||
|
||||
struct sched_domain_attr;
|
||||
@ -1123,8 +1175,8 @@ struct sched_dl_entity {
|
||||
|
||||
/*
|
||||
* Original scheduling parameters. Copied here from sched_attr
|
||||
* during sched_setscheduler2(), they will remain the same until
|
||||
* the next sched_setscheduler2().
|
||||
* during sched_setattr(), they will remain the same until
|
||||
* the next sched_setattr().
|
||||
*/
|
||||
u64 dl_runtime; /* maximum runtime for each instance */
|
||||
u64 dl_deadline; /* relative deadline of each instance */
|
||||
@ -2723,51 +2775,9 @@ static inline int spin_needbreak(spinlock_t *lock)
|
||||
|
||||
/*
|
||||
* Idle thread specific functions to determine the need_resched
|
||||
* polling state. We have two versions, one based on TS_POLLING in
|
||||
* thread_info.status and one based on TIF_POLLING_NRFLAG in
|
||||
* thread_info.flags
|
||||
* polling state.
|
||||
*/
|
||||
#ifdef TS_POLLING
|
||||
static inline int tsk_is_polling(struct task_struct *p)
|
||||
{
|
||||
return task_thread_info(p)->status & TS_POLLING;
|
||||
}
|
||||
static inline void __current_set_polling(void)
|
||||
{
|
||||
current_thread_info()->status |= TS_POLLING;
|
||||
}
|
||||
|
||||
static inline bool __must_check current_set_polling_and_test(void)
|
||||
{
|
||||
__current_set_polling();
|
||||
|
||||
/*
|
||||
* Polling state must be visible before we test NEED_RESCHED,
|
||||
* paired by resched_task()
|
||||
*/
|
||||
smp_mb();
|
||||
|
||||
return unlikely(tif_need_resched());
|
||||
}
|
||||
|
||||
static inline void __current_clr_polling(void)
|
||||
{
|
||||
current_thread_info()->status &= ~TS_POLLING;
|
||||
}
|
||||
|
||||
static inline bool __must_check current_clr_polling_and_test(void)
|
||||
{
|
||||
__current_clr_polling();
|
||||
|
||||
/*
|
||||
* Polling state must be visible before we test NEED_RESCHED,
|
||||
* paired by resched_task()
|
||||
*/
|
||||
smp_mb();
|
||||
|
||||
return unlikely(tif_need_resched());
|
||||
}
|
||||
#elif defined(TIF_POLLING_NRFLAG)
|
||||
#ifdef TIF_POLLING_NRFLAG
|
||||
static inline int tsk_is_polling(struct task_struct *p)
|
||||
{
|
||||
return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
|
||||
|
@ -41,4 +41,20 @@
|
||||
#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
|
||||
#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
|
||||
|
||||
/*
|
||||
* Convert nice value [19,-20] to rlimit style value [1,40].
|
||||
*/
|
||||
static inline long nice_to_rlimit(long nice)
|
||||
{
|
||||
return (MAX_NICE - nice + 1);
|
||||
}
|
||||
|
||||
/*
|
||||
* Convert rlimit style value [1,40] to nice value [-20, 19].
|
||||
*/
|
||||
static inline long rlimit_to_nice(long prio)
|
||||
{
|
||||
return (MAX_NICE - prio + 1);
|
||||
}
|
||||
|
||||
#endif /* _SCHED_PRIO_H */
|
||||
|
@ -104,20 +104,6 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag)
|
||||
#define test_thread_flag(flag) \
|
||||
test_ti_thread_flag(current_thread_info(), flag)
|
||||
|
||||
static inline __deprecated void set_need_resched(void)
|
||||
{
|
||||
/*
|
||||
* Use of this function is deprecated.
|
||||
*
|
||||
* As of this writing there are only a few users in the DRM tree left
|
||||
* all of which are wrong and can be removed without causing too much
|
||||
* grief.
|
||||
*
|
||||
* The DRM people are aware and are working on removing the last few
|
||||
* instances.
|
||||
*/
|
||||
}
|
||||
|
||||
#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
|
||||
|
||||
#if defined TIF_RESTORE_SIGMASK && !defined HAVE_SET_RESTORE_SIGMASK
|
||||
|
@ -66,121 +66,6 @@ int arch_update_cpu_topology(void);
|
||||
#define PENALTY_FOR_NODE_WITH_CPUS (1)
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Below are the 3 major initializers used in building sched_domains:
|
||||
* SD_SIBLING_INIT, for SMT domains
|
||||
* SD_CPU_INIT, for SMP domains
|
||||
*
|
||||
* Any architecture that cares to do any tuning to these values should do so
|
||||
* by defining their own arch-specific initializer in include/asm/topology.h.
|
||||
* A definition there will automagically override these default initializers
|
||||
* and allow arch-specific performance tuning of sched_domains.
|
||||
* (Only non-zero and non-null fields need be specified.)
|
||||
*/
|
||||
|
||||
#ifdef CONFIG_SCHED_SMT
|
||||
/* MCD - Do we really need this? It is always on if CONFIG_SCHED_SMT is,
|
||||
* so can't we drop this in favor of CONFIG_SCHED_SMT?
|
||||
*/
|
||||
#define ARCH_HAS_SCHED_WAKE_IDLE
|
||||
/* Common values for SMT siblings */
|
||||
#ifndef SD_SIBLING_INIT
|
||||
#define SD_SIBLING_INIT (struct sched_domain) { \
|
||||
.min_interval = 1, \
|
||||
.max_interval = 2, \
|
||||
.busy_factor = 64, \
|
||||
.imbalance_pct = 110, \
|
||||
\
|
||||
.flags = 1*SD_LOAD_BALANCE \
|
||||
| 1*SD_BALANCE_NEWIDLE \
|
||||
| 1*SD_BALANCE_EXEC \
|
||||
| 1*SD_BALANCE_FORK \
|
||||
| 0*SD_BALANCE_WAKE \
|
||||
| 1*SD_WAKE_AFFINE \
|
||||
| 1*SD_SHARE_CPUPOWER \
|
||||
| 1*SD_SHARE_PKG_RESOURCES \
|
||||
| 0*SD_SERIALIZE \
|
||||
| 0*SD_PREFER_SIBLING \
|
||||
| arch_sd_sibling_asym_packing() \
|
||||
, \
|
||||
.last_balance = jiffies, \
|
||||
.balance_interval = 1, \
|
||||
.smt_gain = 1178, /* 15% */ \
|
||||
.max_newidle_lb_cost = 0, \
|
||||
.next_decay_max_lb_cost = jiffies, \
|
||||
}
|
||||
#endif
|
||||
#endif /* CONFIG_SCHED_SMT */
|
||||
|
||||
#ifdef CONFIG_SCHED_MC
|
||||
/* Common values for MC siblings. for now mostly derived from SD_CPU_INIT */
|
||||
#ifndef SD_MC_INIT
|
||||
#define SD_MC_INIT (struct sched_domain) { \
|
||||
.min_interval = 1, \
|
||||
.max_interval = 4, \
|
||||
.busy_factor = 64, \
|
||||
.imbalance_pct = 125, \
|
||||
.cache_nice_tries = 1, \
|
||||
.busy_idx = 2, \
|
||||
.wake_idx = 0, \
|
||||
.forkexec_idx = 0, \
|
||||
\
|
||||
.flags = 1*SD_LOAD_BALANCE \
|
||||
| 1*SD_BALANCE_NEWIDLE \
|
||||
| 1*SD_BALANCE_EXEC \
|
||||
| 1*SD_BALANCE_FORK \
|
||||
| 0*SD_BALANCE_WAKE \
|
||||
| 1*SD_WAKE_AFFINE \
|
||||
| 0*SD_SHARE_CPUPOWER \
|
||||
| 1*SD_SHARE_PKG_RESOURCES \
|
||||
| 0*SD_SERIALIZE \
|
||||
, \
|
||||
.last_balance = jiffies, \
|
||||
.balance_interval = 1, \
|
||||
.max_newidle_lb_cost = 0, \
|
||||
.next_decay_max_lb_cost = jiffies, \
|
||||
}
|
||||
#endif
|
||||
#endif /* CONFIG_SCHED_MC */
|
||||
|
||||
/* Common values for CPUs */
|
||||
#ifndef SD_CPU_INIT
|
||||
#define SD_CPU_INIT (struct sched_domain) { \
|
||||
.min_interval = 1, \
|
||||
.max_interval = 4, \
|
||||
.busy_factor = 64, \
|
||||
.imbalance_pct = 125, \
|
||||
.cache_nice_tries = 1, \
|
||||
.busy_idx = 2, \
|
||||
.idle_idx = 1, \
|
||||
.newidle_idx = 0, \
|
||||
.wake_idx = 0, \
|
||||
.forkexec_idx = 0, \
|
||||
\
|
||||
.flags = 1*SD_LOAD_BALANCE \
|
||||
| 1*SD_BALANCE_NEWIDLE \
|
||||
| 1*SD_BALANCE_EXEC \
|
||||
| 1*SD_BALANCE_FORK \
|
||||
| 0*SD_BALANCE_WAKE \
|
||||
| 1*SD_WAKE_AFFINE \
|
||||
| 0*SD_SHARE_CPUPOWER \
|
||||
| 0*SD_SHARE_PKG_RESOURCES \
|
||||
| 0*SD_SERIALIZE \
|
||||
| 1*SD_PREFER_SIBLING \
|
||||
, \
|
||||
.last_balance = jiffies, \
|
||||
.balance_interval = 1, \
|
||||
.max_newidle_lb_cost = 0, \
|
||||
.next_decay_max_lb_cost = jiffies, \
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_SCHED_BOOK
|
||||
#ifndef SD_BOOK_INIT
|
||||
#error Please define an appropriate SD_BOOK_INIT in include/asm/topology.h!!!
|
||||
#endif
|
||||
#endif /* CONFIG_SCHED_BOOK */
|
||||
|
||||
#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
|
||||
DECLARE_PER_CPU(int, numa_node);
|
||||
|
||||
@ -295,4 +180,17 @@ static inline int cpu_to_mem(int cpu)
|
||||
#define topology_core_cpumask(cpu) cpumask_of(cpu)
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_SCHED_SMT
/* Return topology_thread_cpumask() for @cpu. */
static inline const struct cpumask *cpu_smt_mask(int cpu)
{
	const struct cpumask *siblings = topology_thread_cpumask(cpu);

	return siblings;
}
#endif
|
||||
|
||||
/* Return cpumask_of_node() for the node that cpu_to_node() reports for @cpu. */
static inline const struct cpumask *cpu_cpu_mask(int cpu)
{
	const int nid = cpu_to_node(cpu);

	return cpumask_of_node(nid);
}
|
||||
|
||||
|
||||
#endif /* _LINUX_TOPOLOGY_H */
|
||||
|
@ -216,7 +216,7 @@ static int lock_torture_writer(void *arg)
|
||||
static DEFINE_TORTURE_RANDOM(rand);
|
||||
|
||||
VERBOSE_TOROUT_STRING("lock_torture_writer task started");
|
||||
set_user_nice(current, 19);
|
||||
set_user_nice(current, MAX_NICE);
|
||||
|
||||
do {
|
||||
if ((torture_random(&rand) & 0xfffff) == 0)
|
||||
|
@ -54,9 +54,11 @@ static void freeze_begin(void)
|
||||
|
||||
static void freeze_enter(void)
|
||||
{
|
||||
cpuidle_use_deepest_state(true);
|
||||
cpuidle_resume();
|
||||
wait_event(suspend_freeze_wait_head, suspend_freeze_wake);
|
||||
cpuidle_pause();
|
||||
cpuidle_use_deepest_state(false);
|
||||
}
|
||||
|
||||
void freeze_wake(void)
|
||||
|
@ -521,6 +521,39 @@ static inline void init_hrtick(void)
|
||||
}
|
||||
#endif /* CONFIG_SCHED_HRTICK */
|
||||
|
||||
/*
 * cmpxchg based fetch_or, macro so it works for different integer types.
 *
 * ORs @val into *@ptr using a cmpxchg() retry loop and evaluates to the
 * value *@ptr held immediately before the successful update.
 *
 * @ptr and @val are each evaluated exactly once, so arguments with side
 * effects (e.g. fetch_or(p++, mask)) behave as expected.
 */
#define fetch_or(ptr, val)						\
({									\
	typeof(ptr) _ptr = (ptr);					\
	typeof(val) _mask = (val);					\
	typeof(*_ptr) _old, _cur = *_ptr;				\
									\
	for (;;) {							\
		/* Try to install _cur | _mask; get the actual old value. */ \
		_old = cmpxchg(_ptr, _cur, _cur | _mask);		\
		if (_old == _cur)					\
			break;						\
		/* Someone else changed *_ptr: retry against what we saw. */ \
		_cur = _old;						\
	}								\
	_old;								\
})
|
||||
|
||||
#ifdef TIF_POLLING_NRFLAG
|
||||
/*
|
||||
* Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
|
||||
* this avoids any races wrt polling state changes and thereby avoids
|
||||
* spurious IPIs.
|
||||
*/
|
||||
static bool set_nr_and_not_polling(struct task_struct *p)
|
||||
{
|
||||
struct thread_info *ti = task_thread_info(p);
|
||||
return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
|
||||
}
|
||||
#else
|
||||
static bool set_nr_and_not_polling(struct task_struct *p)
|
||||
{
|
||||
set_tsk_need_resched(p);
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* resched_task - mark a task 'to be rescheduled now'.
|
||||
*
|
||||
@ -537,17 +570,15 @@ void resched_task(struct task_struct *p)
|
||||
if (test_tsk_need_resched(p))
|
||||
return;
|
||||
|
||||
set_tsk_need_resched(p);
|
||||
|
||||
cpu = task_cpu(p);
|
||||
|
||||
if (cpu == smp_processor_id()) {
|
||||
set_tsk_need_resched(p);
|
||||
set_preempt_need_resched();
|
||||
return;
|
||||
}
|
||||
|
||||
/* NEED_RESCHED must be visible before we test polling */
|
||||
smp_mb();
|
||||
if (!tsk_is_polling(p))
|
||||
if (set_nr_and_not_polling(p))
|
||||
smp_send_reschedule(cpu);
|
||||
}
|
||||
|
||||
@ -3018,7 +3049,7 @@ EXPORT_SYMBOL(set_user_nice);
|
||||
int can_nice(const struct task_struct *p, const int nice)
|
||||
{
|
||||
/* convert nice value [19,-20] to rlimit style value [1,40] */
|
||||
int nice_rlim = 20 - nice;
|
||||
int nice_rlim = nice_to_rlimit(nice);
|
||||
|
||||
return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
|
||||
capable(CAP_SYS_NICE));
|
||||
@ -3042,17 +3073,10 @@ SYSCALL_DEFINE1(nice, int, increment)
|
||||
* We don't have to worry. Conceptually one call occurs first
|
||||
* and we have a single winner.
|
||||
*/
|
||||
if (increment < -40)
|
||||
increment = -40;
|
||||
if (increment > 40)
|
||||
increment = 40;
|
||||
|
||||
increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
|
||||
nice = task_nice(current) + increment;
|
||||
if (nice < MIN_NICE)
|
||||
nice = MIN_NICE;
|
||||
if (nice > MAX_NICE)
|
||||
nice = MAX_NICE;
|
||||
|
||||
nice = clamp_val(nice, MIN_NICE, MAX_NICE);
|
||||
if (increment < 0 && !can_nice(current, nice))
|
||||
return -EPERM;
|
||||
|
||||
@ -3642,13 +3666,11 @@ static int sched_copy_attr(struct sched_attr __user *uattr,
|
||||
*/
|
||||
attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
|
||||
|
||||
out:
|
||||
return ret;
|
||||
return 0;
|
||||
|
||||
err_size:
|
||||
put_user(sizeof(*attr), &uattr->size);
|
||||
ret = -E2BIG;
|
||||
goto out;
|
||||
return -E2BIG;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -3808,7 +3830,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
|
||||
|
||||
for (; addr < end; addr++) {
|
||||
if (*addr)
|
||||
goto err_size;
|
||||
return -EFBIG;
|
||||
}
|
||||
|
||||
attr->size = usize;
|
||||
@ -3818,12 +3840,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
|
||||
if (ret)
|
||||
return -EFAULT;
|
||||
|
||||
out:
|
||||
return ret;
|
||||
|
||||
err_size:
|
||||
ret = -E2BIG;
|
||||
goto out;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -5093,10 +5110,20 @@ static struct notifier_block migration_notifier = {
|
||||
.priority = CPU_PRI_MIGRATION,
|
||||
};
|
||||
|
||||
/*
 * Record the current sched_clock_cpu() reading of the executing CPU in the
 * age_stamp field of that CPU's runqueue.
 */
static void __cpuinit set_cpu_rq_start_time(void)
{
	const int this_cpu = smp_processor_id();

	cpu_rq(this_cpu)->age_stamp = sched_clock_cpu(this_cpu);
}
|
||||
|
||||
static int sched_cpu_active(struct notifier_block *nfb,
|
||||
unsigned long action, void *hcpu)
|
||||
{
|
||||
switch (action & ~CPU_TASKS_FROZEN) {
|
||||
case CPU_STARTING:
|
||||
set_cpu_rq_start_time();
|
||||
return NOTIFY_OK;
|
||||
case CPU_DOWN_FAILED:
|
||||
set_cpu_active((long)hcpu, true);
|
||||
return NOTIFY_OK;
|
||||
@ -5305,7 +5332,8 @@ static int sd_degenerate(struct sched_domain *sd)
|
||||
SD_BALANCE_FORK |
|
||||
SD_BALANCE_EXEC |
|
||||
SD_SHARE_CPUPOWER |
|
||||
SD_SHARE_PKG_RESOURCES)) {
|
||||
SD_SHARE_PKG_RESOURCES |
|
||||
SD_SHARE_POWERDOMAIN)) {
|
||||
if (sd->groups != sd->groups->next)
|
||||
return 0;
|
||||
}
|
||||
@ -5336,7 +5364,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
|
||||
SD_BALANCE_EXEC |
|
||||
SD_SHARE_CPUPOWER |
|
||||
SD_SHARE_PKG_RESOURCES |
|
||||
SD_PREFER_SIBLING);
|
||||
SD_PREFER_SIBLING |
|
||||
SD_SHARE_POWERDOMAIN);
|
||||
if (nr_node_ids == 1)
|
||||
pflags &= ~SD_SERIALIZE;
|
||||
}
|
||||
@ -5610,17 +5639,6 @@ static int __init isolated_cpu_setup(char *str)
|
||||
|
||||
__setup("isolcpus=", isolated_cpu_setup);
|
||||
|
||||
static const struct cpumask *cpu_cpu_mask(int cpu)
|
||||
{
|
||||
return cpumask_of_node(cpu_to_node(cpu));
|
||||
}
|
||||
|
||||
struct sd_data {
|
||||
struct sched_domain **__percpu sd;
|
||||
struct sched_group **__percpu sg;
|
||||
struct sched_group_power **__percpu sgp;
|
||||
};
|
||||
|
||||
struct s_data {
|
||||
struct sched_domain ** __percpu sd;
|
||||
struct root_domain *rd;
|
||||
@ -5633,21 +5651,6 @@ enum s_alloc {
|
||||
sa_none,
|
||||
};
|
||||
|
||||
struct sched_domain_topology_level;
|
||||
|
||||
typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
|
||||
typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
|
||||
|
||||
#define SDTL_OVERLAP 0x01
|
||||
|
||||
struct sched_domain_topology_level {
|
||||
sched_domain_init_f init;
|
||||
sched_domain_mask_f mask;
|
||||
int flags;
|
||||
int numa_level;
|
||||
struct sd_data data;
|
||||
};
|
||||
|
||||
/*
|
||||
* Build an iteration mask that can exclude certain CPUs from the upwards
|
||||
* domain traversal.
|
||||
@ -5815,8 +5818,6 @@ build_sched_groups(struct sched_domain *sd, int cpu)
|
||||
continue;
|
||||
|
||||
group = get_group(i, sdd, &sg);
|
||||
cpumask_clear(sched_group_cpus(sg));
|
||||
sg->sgp->power = 0;
|
||||
cpumask_setall(sched_group_mask(sg));
|
||||
|
||||
for_each_cpu(j, span) {
|
||||
@ -5866,44 +5867,11 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
|
||||
atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
|
||||
}
|
||||
|
||||
int __weak arch_sd_sibling_asym_packing(void)
|
||||
{
|
||||
return 0*SD_ASYM_PACKING;
|
||||
}
|
||||
|
||||
/*
|
||||
* Initializers for schedule domains
|
||||
* Non-inlined to reduce accumulated stack pressure in build_sched_domains()
|
||||
*/
|
||||
|
||||
#ifdef CONFIG_SCHED_DEBUG
|
||||
# define SD_INIT_NAME(sd, type) sd->name = #type
|
||||
#else
|
||||
# define SD_INIT_NAME(sd, type) do { } while (0)
|
||||
#endif
|
||||
|
||||
#define SD_INIT_FUNC(type) \
|
||||
static noinline struct sched_domain * \
|
||||
sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
|
||||
{ \
|
||||
struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
|
||||
*sd = SD_##type##_INIT; \
|
||||
SD_INIT_NAME(sd, type); \
|
||||
sd->private = &tl->data; \
|
||||
return sd; \
|
||||
}
|
||||
|
||||
SD_INIT_FUNC(CPU)
|
||||
#ifdef CONFIG_SCHED_SMT
|
||||
SD_INIT_FUNC(SIBLING)
|
||||
#endif
|
||||
#ifdef CONFIG_SCHED_MC
|
||||
SD_INIT_FUNC(MC)
|
||||
#endif
|
||||
#ifdef CONFIG_SCHED_BOOK
|
||||
SD_INIT_FUNC(BOOK)
|
||||
#endif
|
||||
|
||||
static int default_relax_domain_level = -1;
|
||||
int sched_domain_level_max;
|
||||
|
||||
@ -5991,99 +5959,154 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
|
||||
*per_cpu_ptr(sdd->sgp, cpu) = NULL;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_SCHED_SMT
|
||||
static const struct cpumask *cpu_smt_mask(int cpu)
|
||||
{
|
||||
return topology_thread_cpumask(cpu);
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Topology list, bottom-up.
|
||||
*/
|
||||
static struct sched_domain_topology_level default_topology[] = {
|
||||
#ifdef CONFIG_SCHED_SMT
|
||||
{ sd_init_SIBLING, cpu_smt_mask, },
|
||||
#endif
|
||||
#ifdef CONFIG_SCHED_MC
|
||||
{ sd_init_MC, cpu_coregroup_mask, },
|
||||
#endif
|
||||
#ifdef CONFIG_SCHED_BOOK
|
||||
{ sd_init_BOOK, cpu_book_mask, },
|
||||
#endif
|
||||
{ sd_init_CPU, cpu_cpu_mask, },
|
||||
{ NULL, },
|
||||
};
|
||||
|
||||
static struct sched_domain_topology_level *sched_domain_topology = default_topology;
|
||||
|
||||
#define for_each_sd_topology(tl) \
|
||||
for (tl = sched_domain_topology; tl->init; tl++)
|
||||
|
||||
#ifdef CONFIG_NUMA
|
||||
|
||||
static int sched_domains_numa_levels;
|
||||
static int *sched_domains_numa_distance;
|
||||
static struct cpumask ***sched_domains_numa_masks;
|
||||
static int sched_domains_curr_level;
|
||||
#endif
|
||||
|
||||
static inline int sd_local_flags(int level)
|
||||
{
|
||||
if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
|
||||
return 0;
|
||||
|
||||
return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
|
||||
}
|
||||
/*
|
||||
* SD_flags allowed in topology descriptions.
|
||||
*
|
||||
* SD_SHARE_CPUPOWER - describes SMT topologies
|
||||
* SD_SHARE_PKG_RESOURCES - describes shared caches
|
||||
* SD_NUMA - describes NUMA topologies
|
||||
* SD_SHARE_POWERDOMAIN - describes shared power domain
|
||||
*
|
||||
* Odd one out:
|
||||
* SD_ASYM_PACKING - describes SMT quirks
|
||||
*/
|
||||
#define TOPOLOGY_SD_FLAGS \
|
||||
(SD_SHARE_CPUPOWER | \
|
||||
SD_SHARE_PKG_RESOURCES | \
|
||||
SD_NUMA | \
|
||||
SD_ASYM_PACKING | \
|
||||
SD_SHARE_POWERDOMAIN)
|
||||
|
||||
static struct sched_domain *
|
||||
sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
|
||||
sd_init(struct sched_domain_topology_level *tl, int cpu)
|
||||
{
|
||||
struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
|
||||
int level = tl->numa_level;
|
||||
int sd_weight = cpumask_weight(
|
||||
sched_domains_numa_masks[level][cpu_to_node(cpu)]);
|
||||
int sd_weight, sd_flags = 0;
|
||||
|
||||
#ifdef CONFIG_NUMA
|
||||
/*
|
||||
* Ugly hack to pass state to sd_numa_mask()...
|
||||
*/
|
||||
sched_domains_curr_level = tl->numa_level;
|
||||
#endif
|
||||
|
||||
sd_weight = cpumask_weight(tl->mask(cpu));
|
||||
|
||||
if (tl->sd_flags)
|
||||
sd_flags = (*tl->sd_flags)();
|
||||
/*
 * A topology description may only supply flags from TOPOLOGY_SD_FLAGS.
 * Warn once about anything else and strip the stray bits, keeping only
 * the allowed ones.
 */
if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
		"wrong sd_flags in topology description\n"))
	sd_flags &= TOPOLOGY_SD_FLAGS;
|
||||
|
||||
*sd = (struct sched_domain){
|
||||
.min_interval = sd_weight,
|
||||
.max_interval = 2*sd_weight,
|
||||
.busy_factor = 32,
|
||||
.imbalance_pct = 125,
|
||||
.cache_nice_tries = 2,
|
||||
.busy_idx = 3,
|
||||
.idle_idx = 2,
|
||||
|
||||
.cache_nice_tries = 0,
|
||||
.busy_idx = 0,
|
||||
.idle_idx = 0,
|
||||
.newidle_idx = 0,
|
||||
.wake_idx = 0,
|
||||
.forkexec_idx = 0,
|
||||
|
||||
.flags = 1*SD_LOAD_BALANCE
|
||||
| 1*SD_BALANCE_NEWIDLE
|
||||
| 0*SD_BALANCE_EXEC
|
||||
| 0*SD_BALANCE_FORK
|
||||
| 1*SD_BALANCE_EXEC
|
||||
| 1*SD_BALANCE_FORK
|
||||
| 0*SD_BALANCE_WAKE
|
||||
| 0*SD_WAKE_AFFINE
|
||||
| 1*SD_WAKE_AFFINE
|
||||
| 0*SD_SHARE_CPUPOWER
|
||||
| 0*SD_SHARE_PKG_RESOURCES
|
||||
| 1*SD_SERIALIZE
|
||||
| 0*SD_SERIALIZE
|
||||
| 0*SD_PREFER_SIBLING
|
||||
| 1*SD_NUMA
|
||||
| sd_local_flags(level)
|
||||
| 0*SD_NUMA
|
||||
| sd_flags
|
||||
,
|
||||
|
||||
.last_balance = jiffies,
|
||||
.balance_interval = sd_weight,
|
||||
.smt_gain = 0,
|
||||
.max_newidle_lb_cost = 0,
|
||||
.next_decay_max_lb_cost = jiffies,
|
||||
#ifdef CONFIG_SCHED_DEBUG
|
||||
.name = tl->name,
|
||||
#endif
|
||||
};
|
||||
SD_INIT_NAME(sd, NUMA);
|
||||
sd->private = &tl->data;
|
||||
|
||||
/*
|
||||
* Ugly hack to pass state to sd_numa_mask()...
|
||||
* Convert topological properties into behaviour.
|
||||
*/
|
||||
sched_domains_curr_level = tl->numa_level;
|
||||
|
||||
if (sd->flags & SD_SHARE_CPUPOWER) {
|
||||
sd->imbalance_pct = 110;
|
||||
sd->smt_gain = 1178; /* ~15% */
|
||||
|
||||
} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
|
||||
sd->imbalance_pct = 117;
|
||||
sd->cache_nice_tries = 1;
|
||||
sd->busy_idx = 2;
|
||||
|
||||
#ifdef CONFIG_NUMA
|
||||
} else if (sd->flags & SD_NUMA) {
|
||||
sd->cache_nice_tries = 2;
|
||||
sd->busy_idx = 3;
|
||||
sd->idle_idx = 2;
|
||||
|
||||
sd->flags |= SD_SERIALIZE;
|
||||
if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
|
||||
sd->flags &= ~(SD_BALANCE_EXEC |
|
||||
SD_BALANCE_FORK |
|
||||
SD_WAKE_AFFINE);
|
||||
}
|
||||
|
||||
#endif
|
||||
} else {
|
||||
sd->flags |= SD_PREFER_SIBLING;
|
||||
sd->cache_nice_tries = 1;
|
||||
sd->busy_idx = 2;
|
||||
sd->idle_idx = 1;
|
||||
}
|
||||
|
||||
sd->private = &tl->data;
|
||||
|
||||
return sd;
|
||||
}
|
||||
|
||||
/*
|
||||
* Topology list, bottom-up.
|
||||
*/
|
||||
static struct sched_domain_topology_level default_topology[] = {
|
||||
#ifdef CONFIG_SCHED_SMT
|
||||
{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
|
||||
#endif
|
||||
#ifdef CONFIG_SCHED_MC
|
||||
{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
|
||||
#endif
|
||||
{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
|
||||
{ NULL, },
|
||||
};
|
||||
|
||||
struct sched_domain_topology_level *sched_domain_topology = default_topology;
|
||||
|
||||
#define for_each_sd_topology(tl) \
|
||||
for (tl = sched_domain_topology; tl->mask; tl++)
|
||||
|
||||
void set_sched_topology(struct sched_domain_topology_level *tl)
|
||||
{
|
||||
sched_domain_topology = tl;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_NUMA
|
||||
|
||||
static const struct cpumask *sd_numa_mask(int cpu)
|
||||
{
|
||||
return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
|
||||
@ -6227,7 +6250,10 @@ static void sched_init_numa(void)
|
||||
}
|
||||
}
|
||||
|
||||
tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
|
||||
/* Compute default topology size */
|
||||
for (i = 0; sched_domain_topology[i].mask; i++);
|
||||
|
||||
tl = kzalloc((i + level + 1) *
|
||||
sizeof(struct sched_domain_topology_level), GFP_KERNEL);
|
||||
if (!tl)
|
||||
return;
|
||||
@ -6235,18 +6261,19 @@ static void sched_init_numa(void)
|
||||
/*
|
||||
* Copy the default topology bits..
|
||||
*/
|
||||
for (i = 0; default_topology[i].init; i++)
|
||||
tl[i] = default_topology[i];
|
||||
for (i = 0; sched_domain_topology[i].mask; i++)
|
||||
tl[i] = sched_domain_topology[i];
|
||||
|
||||
/*
|
||||
* .. and append 'j' levels of NUMA goodness.
|
||||
*/
|
||||
for (j = 0; j < level; i++, j++) {
|
||||
tl[i] = (struct sched_domain_topology_level){
|
||||
.init = sd_numa_init,
|
||||
.mask = sd_numa_mask,
|
||||
.sd_flags = cpu_numa_flags,
|
||||
.flags = SDTL_OVERLAP,
|
||||
.numa_level = j,
|
||||
SD_INIT_NAME(NUMA)
|
||||
};
|
||||
}
|
||||
|
||||
@ -6404,7 +6431,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
|
||||
const struct cpumask *cpu_map, struct sched_domain_attr *attr,
|
||||
struct sched_domain *child, int cpu)
|
||||
{
|
||||
struct sched_domain *sd = tl->init(tl, cpu);
|
||||
struct sched_domain *sd = sd_init(tl, cpu);
|
||||
if (!sd)
|
||||
return child;
|
||||
|
||||
@ -6974,6 +7001,7 @@ void __init sched_init(void)
|
||||
if (cpu_isolated_map == NULL)
|
||||
zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
|
||||
idle_thread_set_boot_cpu();
|
||||
set_cpu_rq_start_time();
|
||||
#endif
|
||||
init_sched_fair_class();
|
||||
|
||||
|
@ -520,7 +520,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
|
||||
* We need to take care of possible races here. In fact, the
|
||||
* task might have changed its scheduling policy to something
|
||||
* different from SCHED_DEADLINE or changed its reservation
|
||||
* parameters (through sched_setscheduler()).
|
||||
* parameters (through sched_setattr()).
|
||||
*/
|
||||
if (!dl_task(p) || dl_se->dl_new)
|
||||
goto unlock;
|
||||
@ -741,7 +741,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
|
||||
|
||||
WARN_ON(!dl_prio(prio));
|
||||
dl_rq->dl_nr_running++;
|
||||
inc_nr_running(rq_of_dl_rq(dl_rq));
|
||||
add_nr_running(rq_of_dl_rq(dl_rq), 1);
|
||||
|
||||
inc_dl_deadline(dl_rq, deadline);
|
||||
inc_dl_migration(dl_se, dl_rq);
|
||||
@ -755,7 +755,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
|
||||
WARN_ON(!dl_prio(prio));
|
||||
WARN_ON(!dl_rq->dl_nr_running);
|
||||
dl_rq->dl_nr_running--;
|
||||
dec_nr_running(rq_of_dl_rq(dl_rq));
|
||||
sub_nr_running(rq_of_dl_rq(dl_rq), 1);
|
||||
|
||||
dec_dl_deadline(dl_rq, dl_se->deadline);
|
||||
dec_dl_migration(dl_se, dl_rq);
|
||||
|
@ -1095,6 +1095,34 @@ static void task_numa_assign(struct task_numa_env *env,
|
||||
env->best_cpu = env->dst_cpu;
|
||||
}
|
||||
|
||||
static bool load_too_imbalanced(long orig_src_load, long orig_dst_load,
|
||||
long src_load, long dst_load,
|
||||
struct task_numa_env *env)
|
||||
{
|
||||
long imb, old_imb;
|
||||
|
||||
/* We care about the slope of the imbalance, not the direction. */
|
||||
if (dst_load < src_load)
|
||||
swap(dst_load, src_load);
|
||||
|
||||
/* Is the difference below the threshold? */
|
||||
imb = dst_load * 100 - src_load * env->imbalance_pct;
|
||||
if (imb <= 0)
|
||||
return false;
|
||||
|
||||
/*
|
||||
* The imbalance is above the allowed threshold.
|
||||
* Compare it with the old imbalance.
|
||||
*/
|
||||
if (orig_dst_load < orig_src_load)
|
||||
swap(orig_dst_load, orig_src_load);
|
||||
|
||||
old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct;
|
||||
|
||||
/* Would this change make things worse? */
|
||||
return (old_imb > imb);
|
||||
}
|
||||
|
||||
/*
|
||||
* This checks if the overall compute and NUMA accesses of the system would
|
||||
* be improved if the source task was migrated to the target dst_cpu taking
|
||||
@ -1107,7 +1135,8 @@ static void task_numa_compare(struct task_numa_env *env,
|
||||
struct rq *src_rq = cpu_rq(env->src_cpu);
|
||||
struct rq *dst_rq = cpu_rq(env->dst_cpu);
|
||||
struct task_struct *cur;
|
||||
long dst_load, src_load;
|
||||
long orig_src_load, src_load;
|
||||
long orig_dst_load, dst_load;
|
||||
long load;
|
||||
long imp = (groupimp > 0) ? groupimp : taskimp;
|
||||
|
||||
@ -1181,13 +1210,13 @@ static void task_numa_compare(struct task_numa_env *env,
|
||||
* In the overloaded case, try and keep the load balanced.
|
||||
*/
|
||||
balance:
|
||||
dst_load = env->dst_stats.load;
|
||||
src_load = env->src_stats.load;
|
||||
orig_dst_load = env->dst_stats.load;
|
||||
orig_src_load = env->src_stats.load;
|
||||
|
||||
/* XXX missing power terms */
|
||||
load = task_h_load(env->p);
|
||||
dst_load += load;
|
||||
src_load -= load;
|
||||
dst_load = orig_dst_load + load;
|
||||
src_load = orig_src_load - load;
|
||||
|
||||
if (cur) {
|
||||
load = task_h_load(cur);
|
||||
@ -1195,11 +1224,8 @@ balance:
|
||||
src_load += load;
|
||||
}
|
||||
|
||||
/* make src_load the smaller */
|
||||
if (dst_load < src_load)
|
||||
swap(dst_load, src_load);
|
||||
|
||||
if (src_load * env->imbalance_pct < dst_load * 100)
|
||||
if (load_too_imbalanced(orig_src_load, orig_dst_load,
|
||||
src_load, dst_load, env))
|
||||
goto unlock;
|
||||
|
||||
assign:
|
||||
@ -1301,6 +1327,15 @@ static int task_numa_migrate(struct task_struct *p)
|
||||
if (env.best_cpu == -1)
|
||||
return -EAGAIN;
|
||||
|
||||
/*
|
||||
* If the task is part of a workload that spans multiple NUMA nodes,
|
||||
* and is migrating into one of the workload's active nodes, remember
|
||||
* this node as the task's preferred numa node, so the workload can
|
||||
* settle down.
|
||||
* A task that migrated to a second choice node will be better off
|
||||
* trying for a better one later. Do not set the preferred node here.
|
||||
*/
|
||||
if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes))
|
||||
sched_setnuma(p, env.dst_nid);
|
||||
|
||||
/*
|
||||
@ -1326,12 +1361,15 @@ static int task_numa_migrate(struct task_struct *p)
|
||||
/* Attempt to migrate a task to a CPU on the preferred node. */
|
||||
static void numa_migrate_preferred(struct task_struct *p)
|
||||
{
|
||||
unsigned long interval = HZ;
|
||||
|
||||
/* This task has no NUMA fault statistics yet */
|
||||
if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
|
||||
return;
|
||||
|
||||
/* Periodically retry migrating the task to the preferred node */
|
||||
p->numa_migrate_retry = jiffies + HZ;
|
||||
interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
|
||||
p->numa_migrate_retry = jiffies + interval;
|
||||
|
||||
/* Success if task is already running on preferred CPU */
|
||||
if (task_node(p) == p->numa_preferred_nid)
|
||||
@ -1738,6 +1776,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
|
||||
struct task_struct *p = current;
|
||||
bool migrated = flags & TNF_MIGRATED;
|
||||
int cpu_node = task_node(current);
|
||||
int local = !!(flags & TNF_FAULT_LOCAL);
|
||||
int priv;
|
||||
|
||||
if (!numabalancing_enabled)
|
||||
@ -1786,6 +1825,17 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
|
||||
task_numa_group(p, last_cpupid, flags, &priv);
|
||||
}
|
||||
|
||||
/*
|
||||
* If a workload spans multiple NUMA nodes, a shared fault that
|
||||
* occurs wholly within the set of nodes that the workload is
|
||||
* actively using should be counted as local. This allows the
|
||||
* scan rate to slow down when a workload has settled down.
|
||||
*/
|
||||
if (!priv && !local && p->numa_group &&
|
||||
node_isset(cpu_node, p->numa_group->active_nodes) &&
|
||||
node_isset(mem_node, p->numa_group->active_nodes))
|
||||
local = 1;
|
||||
|
||||
task_numa_placement(p);
|
||||
|
||||
/*
|
||||
@ -1800,7 +1850,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
|
||||
|
||||
p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
|
||||
p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
|
||||
p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
|
||||
p->numa_faults_locality[local] += pages;
|
||||
}
|
||||
|
||||
static void reset_ptenuma_scan(struct task_struct *p)
|
||||
@ -3301,7 +3351,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
|
||||
}
|
||||
|
||||
if (!se)
|
||||
rq->nr_running -= task_delta;
|
||||
sub_nr_running(rq, task_delta);
|
||||
|
||||
cfs_rq->throttled = 1;
|
||||
cfs_rq->throttled_clock = rq_clock(rq);
|
||||
@ -3352,7 +3402,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
|
||||
}
|
||||
|
||||
if (!se)
|
||||
rq->nr_running += task_delta;
|
||||
add_nr_running(rq, task_delta);
|
||||
|
||||
/* determine whether we need to wake up potentially idle cpu */
|
||||
if (rq->curr == rq->idle && rq->cfs.nr_running)
|
||||
@ -3884,7 +3934,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
|
||||
|
||||
if (!se) {
|
||||
update_rq_runnable_avg(rq, rq->nr_running);
|
||||
inc_nr_running(rq);
|
||||
add_nr_running(rq, 1);
|
||||
}
|
||||
hrtick_update(rq);
|
||||
}
|
||||
@ -3944,7 +3994,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
|
||||
}
|
||||
|
||||
if (!se) {
|
||||
dec_nr_running(rq);
|
||||
sub_nr_running(rq, 1);
|
||||
update_rq_runnable_avg(rq, 1);
|
||||
}
|
||||
hrtick_update(rq);
|
||||
@ -4015,7 +4065,7 @@ static void record_wakee(struct task_struct *p)
|
||||
* about the loss.
|
||||
*/
|
||||
if (jiffies > current->wakee_flip_decay_ts + HZ) {
|
||||
current->wakee_flips = 0;
|
||||
current->wakee_flips >>= 1;
|
||||
current->wakee_flip_decay_ts = jiffies;
|
||||
}
|
||||
|
||||
@ -4449,10 +4499,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
|
||||
sd = tmp;
|
||||
}
|
||||
|
||||
if (affine_sd) {
|
||||
if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
|
||||
if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync))
|
||||
prev_cpu = cpu;
|
||||
|
||||
if (sd_flag & SD_BALANCE_WAKE) {
|
||||
new_cpu = select_idle_sibling(p, prev_cpu);
|
||||
goto unlock;
|
||||
}
|
||||
@ -4520,6 +4570,9 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu)
|
||||
atomic_long_add(se->avg.load_avg_contrib,
|
||||
&cfs_rq->removed_load);
|
||||
}
|
||||
|
||||
/* We have migrated, no longer consider this task hot */
|
||||
se->exec_start = 0;
|
||||
}
|
||||
#endif /* CONFIG_SMP */
|
||||
|
||||
@ -5070,6 +5123,7 @@ task_hot(struct task_struct *p, u64 now)
|
||||
/* Returns true if the destination node has incurred more faults */
|
||||
static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
|
||||
{
|
||||
struct numa_group *numa_group = rcu_dereference(p->numa_group);
|
||||
int src_nid, dst_nid;
|
||||
|
||||
if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||
|
||||
@ -5083,21 +5137,29 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
|
||||
if (src_nid == dst_nid)
|
||||
return false;
|
||||
|
||||
/* Always encourage migration to the preferred node. */
|
||||
if (numa_group) {
|
||||
/* Task is already in the group's interleave set. */
|
||||
if (node_isset(src_nid, numa_group->active_nodes))
|
||||
return false;
|
||||
|
||||
/* Task is moving into the group's interleave set. */
|
||||
if (node_isset(dst_nid, numa_group->active_nodes))
|
||||
return true;
|
||||
|
||||
return group_faults(p, dst_nid) > group_faults(p, src_nid);
|
||||
}
|
||||
|
||||
/* Encourage migration to the preferred node. */
|
||||
if (dst_nid == p->numa_preferred_nid)
|
||||
return true;
|
||||
|
||||
/* If both task and group weight improve, this move is a winner. */
|
||||
if (task_weight(p, dst_nid) > task_weight(p, src_nid) &&
|
||||
group_weight(p, dst_nid) > group_weight(p, src_nid))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
return task_faults(p, dst_nid) > task_faults(p, src_nid);
|
||||
}
|
||||
|
||||
|
||||
static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
|
||||
{
|
||||
struct numa_group *numa_group = rcu_dereference(p->numa_group);
|
||||
int src_nid, dst_nid;
|
||||
|
||||
if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
|
||||
@ -5112,16 +5174,23 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
|
||||
if (src_nid == dst_nid)
|
||||
return false;
|
||||
|
||||
if (numa_group) {
|
||||
/* Task is moving within/into the group's interleave set. */
|
||||
if (node_isset(dst_nid, numa_group->active_nodes))
|
||||
return false;
|
||||
|
||||
/* Task is moving out of the group's interleave set. */
|
||||
if (node_isset(src_nid, numa_group->active_nodes))
|
||||
return true;
|
||||
|
||||
return group_faults(p, dst_nid) < group_faults(p, src_nid);
|
||||
}
|
||||
|
||||
/* Migrating away from the preferred node is always bad. */
|
||||
if (src_nid == p->numa_preferred_nid)
|
||||
return true;
|
||||
|
||||
/* If either task or group weight get worse, don't do it. */
|
||||
if (task_weight(p, dst_nid) < task_weight(p, src_nid) ||
|
||||
group_weight(p, dst_nid) < group_weight(p, src_nid))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
return task_faults(p, dst_nid) < task_faults(p, src_nid);
|
||||
}
|
||||
|
||||
#else
|
||||
@ -5564,6 +5633,7 @@ static unsigned long scale_rt_power(int cpu)
|
||||
{
|
||||
struct rq *rq = cpu_rq(cpu);
|
||||
u64 total, available, age_stamp, avg;
|
||||
s64 delta;
|
||||
|
||||
/*
|
||||
* Since we're reading these variables without serialization make sure
|
||||
@ -5572,7 +5642,11 @@ static unsigned long scale_rt_power(int cpu)
|
||||
age_stamp = ACCESS_ONCE(rq->age_stamp);
|
||||
avg = ACCESS_ONCE(rq->rt_avg);
|
||||
|
||||
total = sched_avg_period() + (rq_clock(rq) - age_stamp);
|
||||
delta = rq_clock(rq) - age_stamp;
|
||||
if (unlikely(delta < 0))
|
||||
delta = 0;
|
||||
|
||||
total = sched_avg_period() + delta;
|
||||
|
||||
if (unlikely(total < avg)) {
|
||||
/* Ensures that power won't end up being negative */
|
||||
@ -6640,17 +6714,44 @@ out:
|
||||
return ld_moved;
|
||||
}
|
||||
|
||||
static inline unsigned long
|
||||
get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
|
||||
{
|
||||
unsigned long interval = sd->balance_interval;
|
||||
|
||||
if (cpu_busy)
|
||||
interval *= sd->busy_factor;
|
||||
|
||||
/* scale ms to jiffies */
|
||||
interval = msecs_to_jiffies(interval);
|
||||
interval = clamp(interval, 1UL, max_load_balance_interval);
|
||||
|
||||
return interval;
|
||||
}
|
||||
|
||||
static inline void
|
||||
update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
|
||||
{
|
||||
unsigned long interval, next;
|
||||
|
||||
interval = get_sd_balance_interval(sd, cpu_busy);
|
||||
next = sd->last_balance + interval;
|
||||
|
||||
if (time_after(*next_balance, next))
|
||||
*next_balance = next;
|
||||
}
|
||||
|
||||
/*
|
||||
* idle_balance is called by schedule() if this_cpu is about to become
|
||||
* idle. Attempts to pull tasks from other CPUs.
|
||||
*/
|
||||
static int idle_balance(struct rq *this_rq)
|
||||
{
|
||||
unsigned long next_balance = jiffies + HZ;
|
||||
int this_cpu = this_rq->cpu;
|
||||
struct sched_domain *sd;
|
||||
int pulled_task = 0;
|
||||
unsigned long next_balance = jiffies + HZ;
|
||||
u64 curr_cost = 0;
|
||||
int this_cpu = this_rq->cpu;
|
||||
|
||||
idle_enter_fair(this_rq);
|
||||
|
||||
@ -6660,8 +6761,15 @@ static int idle_balance(struct rq *this_rq)
|
||||
*/
|
||||
this_rq->idle_stamp = rq_clock(this_rq);
|
||||
|
||||
if (this_rq->avg_idle < sysctl_sched_migration_cost)
|
||||
if (this_rq->avg_idle < sysctl_sched_migration_cost) {
|
||||
rcu_read_lock();
|
||||
sd = rcu_dereference_check_sched_domain(this_rq->sd);
|
||||
if (sd)
|
||||
update_next_balance(sd, 0, &next_balance);
|
||||
rcu_read_unlock();
|
||||
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* Drop the rq->lock, but keep IRQ/preempt disabled.
|
||||
@ -6671,20 +6779,20 @@ static int idle_balance(struct rq *this_rq)
|
||||
update_blocked_averages(this_cpu);
|
||||
rcu_read_lock();
|
||||
for_each_domain(this_cpu, sd) {
|
||||
unsigned long interval;
|
||||
int continue_balancing = 1;
|
||||
u64 t0, domain_cost;
|
||||
|
||||
if (!(sd->flags & SD_LOAD_BALANCE))
|
||||
continue;
|
||||
|
||||
if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
|
||||
if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
|
||||
update_next_balance(sd, 0, &next_balance);
|
||||
break;
|
||||
}
|
||||
|
||||
if (sd->flags & SD_BALANCE_NEWIDLE) {
|
||||
t0 = sched_clock_cpu(this_cpu);
|
||||
|
||||
/* If we've pulled tasks over stop searching: */
|
||||
pulled_task = load_balance(this_cpu, this_rq,
|
||||
sd, CPU_NEWLY_IDLE,
|
||||
&continue_balancing);
|
||||
@ -6696,10 +6804,13 @@ static int idle_balance(struct rq *this_rq)
|
||||
curr_cost += domain_cost;
|
||||
}
|
||||
|
||||
interval = msecs_to_jiffies(sd->balance_interval);
|
||||
if (time_after(next_balance, sd->last_balance + interval))
|
||||
next_balance = sd->last_balance + interval;
|
||||
if (pulled_task)
|
||||
update_next_balance(sd, 0, &next_balance);
|
||||
|
||||
/*
|
||||
* Stop searching for tasks to pull if there are
|
||||
* now runnable tasks on this rq.
|
||||
*/
|
||||
if (pulled_task || this_rq->nr_running > 0)
|
||||
break;
|
||||
}
|
||||
rcu_read_unlock();
|
||||
@ -6717,20 +6828,13 @@ static int idle_balance(struct rq *this_rq)
|
||||
if (this_rq->cfs.h_nr_running && !pulled_task)
|
||||
pulled_task = 1;
|
||||
|
||||
if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
|
||||
/*
|
||||
* We are going idle. next_balance may be set based on
|
||||
* a busy processor. So reset next_balance.
|
||||
*/
|
||||
this_rq->next_balance = next_balance;
|
||||
}
|
||||
|
||||
out:
|
||||
/* Move the next balance forward */
|
||||
if (time_after(this_rq->next_balance, next_balance))
|
||||
this_rq->next_balance = next_balance;
|
||||
|
||||
/* Is there a task of a high priority class? */
|
||||
if (this_rq->nr_running != this_rq->cfs.h_nr_running &&
|
||||
((this_rq->stop && this_rq->stop->on_rq) ||
|
||||
this_rq->dl.dl_nr_running ||
|
||||
(this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt))))
|
||||
if (this_rq->nr_running != this_rq->cfs.h_nr_running)
|
||||
pulled_task = -1;
|
||||
|
||||
if (pulled_task) {
|
||||
@ -7011,16 +7115,9 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
|
||||
break;
|
||||
}
|
||||
|
||||
interval = sd->balance_interval;
|
||||
if (idle != CPU_IDLE)
|
||||
interval *= sd->busy_factor;
|
||||
|
||||
/* scale ms to jiffies */
|
||||
interval = msecs_to_jiffies(interval);
|
||||
interval = clamp(interval, 1UL, max_load_balance_interval);
|
||||
interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
|
||||
|
||||
need_serialize = sd->flags & SD_SERIALIZE;
|
||||
|
||||
if (need_serialize) {
|
||||
if (!spin_trylock(&balancing))
|
||||
goto out;
|
||||
@ -7036,6 +7133,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
|
||||
idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
|
||||
}
|
||||
sd->last_balance = jiffies;
|
||||
interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
|
||||
}
|
||||
if (need_serialize)
|
||||
spin_unlock(&balancing);
|
||||
|
@ -67,24 +67,21 @@ void __weak arch_cpu_idle(void)
|
||||
* cpuidle_idle_call - the main idle function
|
||||
*
|
||||
* NOTE: no locks or semaphores should be used here
|
||||
* return non-zero on failure
|
||||
*/
|
||||
static int cpuidle_idle_call(void)
|
||||
static void cpuidle_idle_call(void)
|
||||
{
|
||||
struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
|
||||
struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
|
||||
int next_state, entered_state, ret;
|
||||
int next_state, entered_state;
|
||||
bool broadcast;
|
||||
|
||||
/*
|
||||
* Check if the idle task must be rescheduled. If it is the
|
||||
* case, exit the function after re-enabling the local irq and
|
||||
* set again the polling flag
|
||||
* case, exit the function after re-enabling the local irq.
|
||||
*/
|
||||
if (current_clr_polling_and_test()) {
|
||||
if (need_resched()) {
|
||||
local_irq_enable();
|
||||
__current_set_polling();
|
||||
return 0;
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -101,18 +98,24 @@ static int cpuidle_idle_call(void)
|
||||
rcu_idle_enter();
|
||||
|
||||
/*
|
||||
* Check if the cpuidle framework is ready, otherwise fallback
|
||||
* to the default arch specific idle method
|
||||
*/
|
||||
ret = cpuidle_enabled(drv, dev);
|
||||
|
||||
if (!ret) {
|
||||
/*
|
||||
* Ask the governor to choose an idle state it thinks
|
||||
* it is convenient to go to. There is *always* a
|
||||
* convenient idle state
|
||||
* Ask the cpuidle framework to choose a convenient idle state.
|
||||
* Fall back to the default arch idle method on errors.
|
||||
*/
|
||||
next_state = cpuidle_select(drv, dev);
|
||||
if (next_state < 0) {
|
||||
use_default:
|
||||
/*
|
||||
* We can't use the cpuidle framework, let's use the default
|
||||
* idle routine.
|
||||
*/
|
||||
if (current_clr_polling_and_test())
|
||||
local_irq_enable();
|
||||
else
|
||||
arch_cpu_idle();
|
||||
|
||||
goto exit_idle;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* The idle task must be scheduled, it is pointless to
|
||||
@ -123,74 +126,51 @@ static int cpuidle_idle_call(void)
|
||||
dev->last_residency = 0;
|
||||
entered_state = next_state;
|
||||
local_irq_enable();
|
||||
} else {
|
||||
broadcast = !!(drv->states[next_state].flags &
|
||||
CPUIDLE_FLAG_TIMER_STOP);
|
||||
goto exit_idle;
|
||||
}
|
||||
|
||||
broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP);
|
||||
|
||||
if (broadcast)
|
||||
/*
|
||||
* Tell the time framework to switch
|
||||
* to a broadcast timer because our
|
||||
* local timer will be shutdown. If a
|
||||
* local timer is used from another
|
||||
* cpu as a broadcast timer, this call
|
||||
* may fail if it is not available
|
||||
* Tell the time framework to switch to a broadcast timer
|
||||
* because our local timer will be shutdown. If a local timer
|
||||
* is used from another cpu as a broadcast timer, this call may
|
||||
* fail if it is not available
|
||||
*/
|
||||
ret = clockevents_notify(
|
||||
CLOCK_EVT_NOTIFY_BROADCAST_ENTER,
|
||||
&dev->cpu);
|
||||
if (broadcast &&
|
||||
clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu))
|
||||
goto use_default;
|
||||
|
||||
if (!ret) {
|
||||
trace_cpu_idle_rcuidle(next_state, dev->cpu);
|
||||
|
||||
/*
|
||||
* Enter the idle state previously
|
||||
* returned by the governor
|
||||
* decision. This function will block
|
||||
* until an interrupt occurs and will
|
||||
* take care of re-enabling the local
|
||||
* interrupts
|
||||
* Enter the idle state previously returned by the governor decision.
|
||||
* This function will block until an interrupt occurs and will take
|
||||
* care of re-enabling the local interrupts
|
||||
*/
|
||||
entered_state = cpuidle_enter(drv, dev,
|
||||
next_state);
|
||||
entered_state = cpuidle_enter(drv, dev, next_state);
|
||||
|
||||
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT,
|
||||
dev->cpu);
|
||||
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
|
||||
|
||||
if (broadcast)
|
||||
clockevents_notify(
|
||||
CLOCK_EVT_NOTIFY_BROADCAST_EXIT,
|
||||
&dev->cpu);
|
||||
clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
|
||||
|
||||
/*
|
||||
* Give the governor an opportunity to reflect on the
|
||||
* outcome
|
||||
* Give the governor an opportunity to reflect on the outcome
|
||||
*/
|
||||
cpuidle_reflect(dev, entered_state);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* We can't use the cpuidle framework, let's use the default
|
||||
* idle routine
|
||||
*/
|
||||
if (ret)
|
||||
arch_cpu_idle();
|
||||
|
||||
exit_idle:
|
||||
__current_set_polling();
|
||||
|
||||
/*
|
||||
* It is up to the idle functions to enable back the local
|
||||
* interrupt
|
||||
* It is up to the idle functions to reenable local interrupts
|
||||
*/
|
||||
if (WARN_ON_ONCE(irqs_disabled()))
|
||||
local_irq_enable();
|
||||
|
||||
rcu_idle_exit();
|
||||
start_critical_timings();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -79,6 +79,8 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
|
||||
rt_rq->overloaded = 0;
|
||||
plist_head_init(&rt_rq->pushable_tasks);
|
||||
#endif
|
||||
/* We start is dequeued state, because no RT tasks are queued */
|
||||
rt_rq->rt_queued = 0;
|
||||
|
||||
rt_rq->rt_time = 0;
|
||||
rt_rq->rt_throttled = 0;
|
||||
@ -112,6 +114,13 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
|
||||
return rt_se->rt_rq;
|
||||
}
|
||||
|
||||
static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
|
||||
{
|
||||
struct rt_rq *rt_rq = rt_se->rt_rq;
|
||||
|
||||
return rt_rq->rq;
|
||||
}
|
||||
|
||||
void free_rt_sched_group(struct task_group *tg)
|
||||
{
|
||||
int i;
|
||||
@ -211,10 +220,16 @@ static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
|
||||
return container_of(rt_rq, struct rq, rt);
|
||||
}
|
||||
|
||||
static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
|
||||
static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
|
||||
{
|
||||
struct task_struct *p = rt_task_of(rt_se);
|
||||
struct rq *rq = task_rq(p);
|
||||
|
||||
return task_rq(p);
|
||||
}
|
||||
|
||||
static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
|
||||
{
|
||||
struct rq *rq = rq_of_rt_se(rt_se);
|
||||
|
||||
return &rq->rt;
|
||||
}
|
||||
@ -391,6 +406,9 @@ static inline void set_post_schedule(struct rq *rq)
|
||||
}
|
||||
#endif /* CONFIG_SMP */
|
||||
|
||||
static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
|
||||
static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
|
||||
|
||||
static inline int on_rt_rq(struct sched_rt_entity *rt_se)
|
||||
{
|
||||
return !list_empty(&rt_se->run_list);
|
||||
@ -452,8 +470,11 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
|
||||
rt_se = rt_rq->tg->rt_se[cpu];
|
||||
|
||||
if (rt_rq->rt_nr_running) {
|
||||
if (rt_se && !on_rt_rq(rt_se))
|
||||
if (!rt_se)
|
||||
enqueue_top_rt_rq(rt_rq);
|
||||
else if (!on_rt_rq(rt_se))
|
||||
enqueue_rt_entity(rt_se, false);
|
||||
|
||||
if (rt_rq->highest_prio.curr < curr->prio)
|
||||
resched_task(curr);
|
||||
}
|
||||
@ -466,10 +487,17 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
|
||||
|
||||
rt_se = rt_rq->tg->rt_se[cpu];
|
||||
|
||||
if (rt_se && on_rt_rq(rt_se))
|
||||
if (!rt_se)
|
||||
dequeue_top_rt_rq(rt_rq);
|
||||
else if (on_rt_rq(rt_se))
|
||||
dequeue_rt_entity(rt_se);
|
||||
}
|
||||
|
||||
static inline int rt_rq_throttled(struct rt_rq *rt_rq)
|
||||
{
|
||||
return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
|
||||
}
|
||||
|
||||
static int rt_se_boosted(struct sched_rt_entity *rt_se)
|
||||
{
|
||||
struct rt_rq *rt_rq = group_rt_rq(rt_se);
|
||||
@ -532,12 +560,23 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
|
||||
|
||||
static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
|
||||
{
|
||||
if (rt_rq->rt_nr_running)
|
||||
resched_task(rq_of_rt_rq(rt_rq)->curr);
|
||||
struct rq *rq = rq_of_rt_rq(rt_rq);
|
||||
|
||||
if (!rt_rq->rt_nr_running)
|
||||
return;
|
||||
|
||||
enqueue_top_rt_rq(rt_rq);
|
||||
resched_task(rq->curr);
|
||||
}
|
||||
|
||||
static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
|
||||
{
|
||||
dequeue_top_rt_rq(rt_rq);
|
||||
}
|
||||
|
||||
static inline int rt_rq_throttled(struct rt_rq *rt_rq)
|
||||
{
|
||||
return rt_rq->rt_throttled;
|
||||
}
|
||||
|
||||
static inline const struct cpumask *sched_rt_period_mask(void)
|
||||
@ -922,6 +961,38 @@ static void update_curr_rt(struct rq *rq)
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
dequeue_top_rt_rq(struct rt_rq *rt_rq)
|
||||
{
|
||||
struct rq *rq = rq_of_rt_rq(rt_rq);
|
||||
|
||||
BUG_ON(&rq->rt != rt_rq);
|
||||
|
||||
if (!rt_rq->rt_queued)
|
||||
return;
|
||||
|
||||
BUG_ON(!rq->nr_running);
|
||||
|
||||
sub_nr_running(rq, rt_rq->rt_nr_running);
|
||||
rt_rq->rt_queued = 0;
|
||||
}
|
||||
|
||||
static void
|
||||
enqueue_top_rt_rq(struct rt_rq *rt_rq)
|
||||
{
|
||||
struct rq *rq = rq_of_rt_rq(rt_rq);
|
||||
|
||||
BUG_ON(&rq->rt != rt_rq);
|
||||
|
||||
if (rt_rq->rt_queued)
|
||||
return;
|
||||
if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running)
|
||||
return;
|
||||
|
||||
add_nr_running(rq, rt_rq->rt_nr_running);
|
||||
rt_rq->rt_queued = 1;
|
||||
}
|
||||
|
||||
#if defined CONFIG_SMP
|
||||
|
||||
static void
|
||||
@ -1044,13 +1115,24 @@ void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
|
||||
|
||||
#endif /* CONFIG_RT_GROUP_SCHED */
|
||||
|
||||
static inline
|
||||
unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
|
||||
{
|
||||
struct rt_rq *group_rq = group_rt_rq(rt_se);
|
||||
|
||||
if (group_rq)
|
||||
return group_rq->rt_nr_running;
|
||||
else
|
||||
return 1;
|
||||
}
|
||||
|
||||
static inline
|
||||
void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
|
||||
{
|
||||
int prio = rt_se_prio(rt_se);
|
||||
|
||||
WARN_ON(!rt_prio(prio));
|
||||
rt_rq->rt_nr_running++;
|
||||
rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
|
||||
|
||||
inc_rt_prio(rt_rq, prio);
|
||||
inc_rt_migration(rt_se, rt_rq);
|
||||
@ -1062,7 +1144,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
|
||||
{
|
||||
WARN_ON(!rt_prio(rt_se_prio(rt_se)));
|
||||
WARN_ON(!rt_rq->rt_nr_running);
|
||||
rt_rq->rt_nr_running--;
|
||||
rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
|
||||
|
||||
dec_rt_prio(rt_rq, rt_se_prio(rt_se));
|
||||
dec_rt_migration(rt_se, rt_rq);
|
||||
@ -1119,6 +1201,8 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
|
||||
back = rt_se;
|
||||
}
|
||||
|
||||
dequeue_top_rt_rq(rt_rq_of_se(back));
|
||||
|
||||
for (rt_se = back; rt_se; rt_se = rt_se->back) {
|
||||
if (on_rt_rq(rt_se))
|
||||
__dequeue_rt_entity(rt_se);
|
||||
@ -1127,13 +1211,18 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
|
||||
|
||||
static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
|
||||
{
|
||||
struct rq *rq = rq_of_rt_se(rt_se);
|
||||
|
||||
dequeue_rt_stack(rt_se);
|
||||
for_each_sched_rt_entity(rt_se)
|
||||
__enqueue_rt_entity(rt_se, head);
|
||||
enqueue_top_rt_rq(&rq->rt);
|
||||
}
|
||||
|
||||
static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
|
||||
{
|
||||
struct rq *rq = rq_of_rt_se(rt_se);
|
||||
|
||||
dequeue_rt_stack(rt_se);
|
||||
|
||||
for_each_sched_rt_entity(rt_se) {
|
||||
@ -1142,6 +1231,7 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
|
||||
if (rt_rq && rt_rq->rt_nr_running)
|
||||
__enqueue_rt_entity(rt_se, false);
|
||||
}
|
||||
enqueue_top_rt_rq(&rq->rt);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1159,8 +1249,6 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
|
||||
|
||||
if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
|
||||
enqueue_pushable_task(rq, p);
|
||||
|
||||
inc_nr_running(rq);
|
||||
}
|
||||
|
||||
static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
|
||||
@ -1171,8 +1259,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
|
||||
dequeue_rt_entity(rt_se);
|
||||
|
||||
dequeue_pushable_task(rq, p);
|
||||
|
||||
dec_nr_running(rq);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1377,10 +1463,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
|
||||
if (prev->sched_class == &rt_sched_class)
|
||||
update_curr_rt(rq);
|
||||
|
||||
if (!rt_rq->rt_nr_running)
|
||||
return NULL;
|
||||
|
||||
if (rt_rq_throttled(rt_rq))
|
||||
if (!rt_rq->rt_queued)
|
||||
return NULL;
|
||||
|
||||
put_prev_task(rq, prev);
|
||||
@ -1892,9 +1975,9 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
|
||||
*/
|
||||
if (p->on_rq && rq->curr != p) {
|
||||
#ifdef CONFIG_SMP
|
||||
if (rq->rt.overloaded && push_rt_task(rq) &&
|
||||
if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
|
||||
/* Don't resched if we changed runqueues */
|
||||
rq != task_rq(p))
|
||||
push_rt_task(rq) && rq != task_rq(p))
|
||||
check_resched = 0;
|
||||
#endif /* CONFIG_SMP */
|
||||
if (check_resched && p->prio < rq->curr->prio)
|
||||
|
@ -409,6 +409,8 @@ struct rt_rq {
|
||||
int overloaded;
|
||||
struct plist_head pushable_tasks;
|
||||
#endif
|
||||
int rt_queued;
|
||||
|
||||
int rt_throttled;
|
||||
u64 rt_time;
|
||||
u64 rt_runtime;
|
||||
@ -423,18 +425,6 @@ struct rt_rq {
|
||||
#endif
|
||||
};
|
||||
|
||||
#ifdef CONFIG_RT_GROUP_SCHED
|
||||
static inline int rt_rq_throttled(struct rt_rq *rt_rq)
|
||||
{
|
||||
return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
|
||||
}
|
||||
#else
|
||||
static inline int rt_rq_throttled(struct rt_rq *rt_rq)
|
||||
{
|
||||
return rt_rq->rt_throttled;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Deadline class' related fields in a runqueue */
|
||||
struct dl_rq {
|
||||
/* runqueue is an rbtree, ordered by deadline */
|
||||
@ -1216,12 +1206,14 @@ extern void update_idle_cpu_load(struct rq *this_rq);
|
||||
|
||||
extern void init_task_runnable_average(struct task_struct *p);
|
||||
|
||||
static inline void inc_nr_running(struct rq *rq)
|
||||
static inline void add_nr_running(struct rq *rq, unsigned count)
|
||||
{
|
||||
rq->nr_running++;
|
||||
unsigned prev_nr = rq->nr_running;
|
||||
|
||||
rq->nr_running = prev_nr + count;
|
||||
|
||||
#ifdef CONFIG_NO_HZ_FULL
|
||||
if (rq->nr_running == 2) {
|
||||
if (prev_nr < 2 && rq->nr_running >= 2) {
|
||||
if (tick_nohz_full_cpu(rq->cpu)) {
|
||||
/* Order rq->nr_running write against the IPI */
|
||||
smp_wmb();
|
||||
@ -1231,9 +1223,9 @@ static inline void inc_nr_running(struct rq *rq)
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void dec_nr_running(struct rq *rq)
|
||||
static inline void sub_nr_running(struct rq *rq, unsigned count)
|
||||
{
|
||||
rq->nr_running--;
|
||||
rq->nr_running -= count;
|
||||
}
|
||||
|
||||
static inline void rq_last_tick_reset(struct rq *rq)
|
||||
|
@ -41,13 +41,13 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev)
|
||||
static void
|
||||
enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
|
||||
{
|
||||
inc_nr_running(rq);
|
||||
add_nr_running(rq, 1);
|
||||
}
|
||||
|
||||
static void
|
||||
dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
|
||||
{
|
||||
dec_nr_running(rq);
|
||||
sub_nr_running(rq, 1);
|
||||
}
|
||||
|
||||
static void yield_task_stop(struct rq *rq)
|
||||
|
@ -250,7 +250,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
|
||||
else
|
||||
p = current;
|
||||
if (p) {
|
||||
niceval = 20 - task_nice(p);
|
||||
niceval = nice_to_rlimit(task_nice(p));
|
||||
if (niceval > retval)
|
||||
retval = niceval;
|
||||
}
|
||||
@ -261,7 +261,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
|
||||
else
|
||||
pgrp = task_pgrp(current);
|
||||
do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
|
||||
niceval = 20 - task_nice(p);
|
||||
niceval = nice_to_rlimit(task_nice(p));
|
||||
if (niceval > retval)
|
||||
retval = niceval;
|
||||
} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
|
||||
@ -277,7 +277,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
|
||||
|
||||
do_each_thread(g, p) {
|
||||
if (uid_eq(task_uid(p), uid)) {
|
||||
niceval = 20 - task_nice(p);
|
||||
niceval = nice_to_rlimit(task_nice(p));
|
||||
if (niceval > retval)
|
||||
retval = niceval;
|
||||
}
|
||||
|
@ -100,10 +100,10 @@ enum {
|
||||
|
||||
/*
|
||||
* Rescue workers are used only on emergencies and shared by
|
||||
* all cpus. Give -20.
|
||||
* all cpus. Give MIN_NICE.
|
||||
*/
|
||||
RESCUER_NICE_LEVEL = -20,
|
||||
HIGHPRI_NICE_LEVEL = -20,
|
||||
RESCUER_NICE_LEVEL = MIN_NICE,
|
||||
HIGHPRI_NICE_LEVEL = MIN_NICE,
|
||||
|
||||
WQ_NAME_LEN = 24,
|
||||
};
|
||||
|
@ -2740,7 +2740,7 @@ static int khugepaged(void *none)
|
||||
struct mm_slot *mm_slot;
|
||||
|
||||
set_freezable();
|
||||
set_user_nice(current, 19);
|
||||
set_user_nice(current, MAX_NICE);
|
||||
|
||||
while (!kthread_should_stop()) {
|
||||
khugepaged_do_scan();
|
||||
|
@ -3920,9 +3920,6 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
}
|
||||
}
|
||||
|
||||
/* THP should already have been handled */
|
||||
BUG_ON(pmd_numa(*pmd));
|
||||
|
||||
/*
|
||||
* Use __pte_alloc instead of pte_alloc_map, because we can't
|
||||
* run pte_offset_map on the pmd, if an huge pmd could
|
||||
|
Loading…
x
Reference in New Issue
Block a user