2005-04-16 15:20:36 -07:00
|
|
|
/*
|
2011-11-15 17:14:39 +01:00
|
|
|
* kernel/sched/core.c
|
2005-04-16 15:20:36 -07:00
|
|
|
*
|
|
|
|
* Kernel scheduler and related syscalls
|
|
|
|
*
|
|
|
|
* Copyright (C) 1991-2002 Linus Torvalds
|
|
|
|
*
|
|
|
|
* 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
|
|
|
|
* make semaphores SMP safe
|
|
|
|
* 1998-11-19 Implemented schedule_timeout() and related stuff
|
|
|
|
* by Andrea Arcangeli
|
|
|
|
* 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
|
|
|
|
* hybrid priority-list and round-robin design with
|
|
|
|
* an array-switch method of distributing timeslices
|
|
|
|
* and per-CPU runqueues. Cleanups and useful suggestions
|
|
|
|
* by Davide Libenzi, preemptible kernel bits by Robert Love.
|
|
|
|
* 2003-09-03 Interactivity tuning by Con Kolivas.
|
|
|
|
* 2004-04-02 Scheduler domains code by Nick Piggin
|
2007-07-09 18:52:01 +02:00
|
|
|
* 2007-04-15 Work begun on replacing all interactivity tuning with a
|
|
|
|
* fair scheduling design by Con Kolivas.
|
|
|
|
* 2007-05-05 Load balancing (smp-nice) and other improvements
|
|
|
|
* by Peter Williams
|
|
|
|
* 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
|
|
|
|
* 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
|
2008-01-25 21:08:19 +01:00
|
|
|
* 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
|
|
|
|
* Thomas Gleixner, Mike Kravetz
|
2005-04-16 15:20:36 -07:00
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/mm.h>
|
|
|
|
#include <linux/module.h>
|
|
|
|
#include <linux/nmi.h>
|
|
|
|
#include <linux/init.h>
|
2007-07-09 18:52:00 +02:00
|
|
|
#include <linux/uaccess.h>
|
2005-04-16 15:20:36 -07:00
|
|
|
#include <linux/highmem.h>
|
|
|
|
#include <asm/mmu_context.h>
|
|
|
|
#include <linux/interrupt.h>
|
2006-01-11 12:17:46 -08:00
|
|
|
#include <linux/capability.h>
|
2005-04-16 15:20:36 -07:00
|
|
|
#include <linux/completion.h>
|
|
|
|
#include <linux/kernel_stat.h>
|
2006-07-03 00:24:33 -07:00
|
|
|
#include <linux/debug_locks.h>
|
perf: Do the big rename: Performance Counters -> Performance Events
Bye-bye Performance Counters, welcome Performance Events!
In the past few months the perfcounters subsystem has grown out its
initial role of counting hardware events, and has become (and is
becoming) a much broader generic event enumeration, reporting, logging,
monitoring, analysis facility.
Naming its core object 'perf_counter' and naming the subsystem
'perfcounters' has become more and more of a misnomer. With pending
code like hw-breakpoints support the 'counter' name is less and
less appropriate.
All in one, we've decided to rename the subsystem to 'performance
events' and to propagate this rename through all fields, variables
and API names. (in an ABI compatible fashion)
The word 'event' is also a bit shorter than 'counter' - which makes
it slightly more convenient to write/handle as well.
Thanks goes to Stephane Eranian who first observed this misnomer and
suggested a rename.
User-space tooling and ABI compatibility is not affected - this patch
should be function-invariant. (Also, defconfigs were not touched to
keep the size down.)
This patch has been generated via the following script:
FILES=$(find * -type f | grep -vE 'oprofile|[^K]config')
sed -i \
-e 's/PERF_EVENT_/PERF_RECORD_/g' \
-e 's/PERF_COUNTER/PERF_EVENT/g' \
-e 's/perf_counter/perf_event/g' \
-e 's/nb_counters/nb_events/g' \
-e 's/swcounter/swevent/g' \
-e 's/tpcounter_event/tp_event/g' \
$FILES
for N in $(find . -name perf_counter.[ch]); do
M=$(echo $N | sed 's/perf_counter/perf_event/g')
mv $N $M
done
FILES=$(find . -name perf_event.*)
sed -i \
-e 's/COUNTER_MASK/REG_MASK/g' \
-e 's/COUNTER/EVENT/g' \
-e 's/\<event\>/event_id/g' \
-e 's/counter/event/g' \
-e 's/Counter/Event/g' \
$FILES
... to keep it as correct as possible. This script can also be
used by anyone who has pending perfcounters patches - it converts
a Linux kernel tree over to the new naming. We tried to time this
change to the point in time where the amount of pending patches
is the smallest: the end of the merge window.
Namespace clashes were fixed up in a preparatory patch - and some
stylistic fallout will be fixed up in a subsequent patch.
( NOTE: 'counters' are still the proper terminology when we deal
with hardware registers - and these sed scripts are a bit
over-eager in renaming them. I've undone some of that, but
in case there's something left where 'counter' would be
better than 'event' we can undo that on an individual basis
instead of touching an otherwise nicely automated patch. )
Suggested-by: Stephane Eranian <eranian@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Reviewed-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: <linux-arch@vger.kernel.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-21 12:02:48 +02:00
|
|
|
#include <linux/perf_event.h>
|
2005-04-16 15:20:36 -07:00
|
|
|
#include <linux/security.h>
|
|
|
|
#include <linux/notifier.h>
|
|
|
|
#include <linux/profile.h>
|
2006-12-06 20:34:23 -08:00
|
|
|
#include <linux/freezer.h>
|
[PATCH] scheduler cache-hot-autodetect
)
From: Ingo Molnar <mingo@elte.hu>
This is the latest version of the scheduler cache-hot-auto-tune patch.
The first problem was that detection time scaled with O(N^2), which is
unacceptable on larger SMP and NUMA systems. To solve this:
- I've added a 'domain distance' function, which is used to cache
measurement results. Each distance is only measured once. This means
that e.g. on NUMA distances of 0, 1 and 2 might be measured, on HT
distances 0 and 1, and on SMP distance 0 is measured. The code walks
the domain tree to determine the distance, so it automatically follows
whatever hierarchy an architecture sets up. This cuts down on the boot
time significantly and removes the O(N^2) limit. The only assumption
is that migration costs can be expressed as a function of domain
distance - this covers the overwhelming majority of existing systems,
and is a good guess even for more asymmetric systems.
[ People hacking systems that have asymmetries that break this
assumption (e.g. different CPU speeds) should experiment a bit with
the cpu_distance() function. Adding a ->migration_distance factor to
the domain structure would be one possible solution - but lets first
see the problem systems, if they exist at all. Lets not overdesign. ]
Another problem was that only a single cache-size was used for measuring
the cost of migration, and most architectures didnt set that variable
up. Furthermore, a single cache-size does not fit NUMA hierarchies with
L3 caches and does not fit HT setups, where different CPUs will often
have different 'effective cache sizes'. To solve this problem:
- Instead of relying on a single cache-size provided by the platform and
sticking to it, the code now auto-detects the 'effective migration
cost' between two measured CPUs, via iterating through a wide range of
cachesizes. The code searches for the maximum migration cost, which
occurs when the working set of the test-workload falls just below the
'effective cache size'. I.e. real-life optimized search is done for
the maximum migration cost, between two real CPUs.
This, amongst other things, has the positive effect that if e.g. two
CPUs share a L2/L3 cache, a different (and accurate) migration cost
will be found than between two CPUs on the same system that dont share
any caches.
(The reliable measurement of migration costs is tricky - see the source
for details.)
Furthermore i've added various boot-time options to override/tune
migration behavior.
Firstly, there's a blanket override for autodetection:
migration_cost=1000,2000,3000
will override the depth 0/1/2 values with 1msec/2msec/3msec values.
Secondly, there's a global factor that can be used to increase (or
decrease) the autodetected values:
migration_factor=120
will increase the autodetected values by 20%. This option is useful to
tune things in a workload-dependent way - e.g. if a workload is
cache-insensitive then CPU utilization can be maximized by specifying
migration_factor=0.
I've tested the autodetection code quite extensively on x86, on 3
P3/Xeon/2MB, and the autodetected values look pretty good:
Dual Celeron (128K L2 cache):
---------------------
migration cost matrix (max_cache_size: 131072, cpu: 467 MHz):
---------------------
[00] [01]
[00]: - 1.7(1)
[01]: 1.7(1) -
---------------------
cacheflush times [2]: 0.0 (0) 1.7 (1784008)
---------------------
Here the slow memory subsystem dominates system performance, and even
though caches are small, the migration cost is 1.7 msecs.
Dual HT P4 (512K L2 cache):
---------------------
migration cost matrix (max_cache_size: 524288, cpu: 2379 MHz):
---------------------
[00] [01] [02] [03]
[00]: - 0.4(1) 0.0(0) 0.4(1)
[01]: 0.4(1) - 0.4(1) 0.0(0)
[02]: 0.0(0) 0.4(1) - 0.4(1)
[03]: 0.4(1) 0.0(0) 0.4(1) -
---------------------
cacheflush times [2]: 0.0 (33900) 0.4 (448514)
---------------------
Here it can be seen that there is no migration cost between two HT
siblings (CPU#0/2 and CPU#1/3 are separate physical CPUs). A fast memory
system makes inter-physical-CPU migration pretty cheap: 0.4 msecs.
8-way P3/Xeon [2MB L2 cache]:
---------------------
migration cost matrix (max_cache_size: 2097152, cpu: 700 MHz):
---------------------
[00] [01] [02] [03] [04] [05] [06] [07]
[00]: - 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1)
[01]: 19.2(1) - 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1)
[02]: 19.2(1) 19.2(1) - 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1)
[03]: 19.2(1) 19.2(1) 19.2(1) - 19.2(1) 19.2(1) 19.2(1) 19.2(1)
[04]: 19.2(1) 19.2(1) 19.2(1) 19.2(1) - 19.2(1) 19.2(1) 19.2(1)
[05]: 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) - 19.2(1) 19.2(1)
[06]: 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) - 19.2(1)
[07]: 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) -
---------------------
cacheflush times [2]: 0.0 (0) 19.2 (19281756)
---------------------
This one has huge caches and a relatively slow memory subsystem - so the
migration cost is 19 msecs.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Ashok Raj <ashok.raj@intel.com>
Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Cc: <wilder@us.ibm.com>
Signed-off-by: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-12 01:05:30 -08:00
|
|
|
#include <linux/vmalloc.h>
|
2005-04-16 15:20:36 -07:00
|
|
|
#include <linux/blkdev.h>
|
|
|
|
#include <linux/delay.h>
|
2007-10-18 23:40:14 -07:00
|
|
|
#include <linux/pid_namespace.h>
|
2005-04-16 15:20:36 -07:00
|
|
|
#include <linux/smp.h>
|
|
|
|
#include <linux/threads.h>
|
|
|
|
#include <linux/timer.h>
|
|
|
|
#include <linux/rcupdate.h>
|
|
|
|
#include <linux/cpu.h>
|
|
|
|
#include <linux/cpuset.h>
|
|
|
|
#include <linux/percpu.h>
|
2008-10-06 13:23:43 +04:00
|
|
|
#include <linux/proc_fs.h>
|
2005-04-16 15:20:36 -07:00
|
|
|
#include <linux/seq_file.h>
|
2007-07-26 13:40:43 +02:00
|
|
|
#include <linux/sysctl.h>
|
2005-04-16 15:20:36 -07:00
|
|
|
#include <linux/syscalls.h>
|
|
|
|
#include <linux/times.h>
|
2006-09-30 23:28:59 -07:00
|
|
|
#include <linux/tsacct_kern.h>
|
2006-03-26 01:38:20 -08:00
|
|
|
#include <linux/kprobes.h>
|
2006-07-14 00:24:37 -07:00
|
|
|
#include <linux/delayacct.h>
|
2007-07-09 18:52:00 +02:00
|
|
|
#include <linux/unistd.h>
|
2007-09-21 09:19:54 +02:00
|
|
|
#include <linux/pagemap.h>
|
2008-01-25 21:08:29 +01:00
|
|
|
#include <linux/hrtimer.h>
|
2008-03-17 16:19:05 -07:00
|
|
|
#include <linux/tick.h>
|
2008-04-19 19:45:00 +02:00
|
|
|
#include <linux/debugfs.h>
|
|
|
|
#include <linux/ctype.h>
|
2008-05-12 21:20:42 +02:00
|
|
|
#include <linux/ftrace.h>
|
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h
percpu.h is included by sched.h and module.h and thus ends up being
included when building most .c files. percpu.h includes slab.h which
in turn includes gfp.h making everything defined by the two files
universally available and complicating inclusion dependencies.
percpu.h -> slab.h dependency is about to be removed. Prepare for
this change by updating users of gfp and slab facilities include those
headers directly instead of assuming availability. As this conversion
needs to touch large number of source files, the following script is
used as the basis of conversion.
http://userweb.kernel.org/~tj/misc/slabh-sweep.py
The script does the followings.
* Scan files for gfp and slab usages and update includes such that
only the necessary includes are there. ie. if only gfp is used,
gfp.h, if slab is used, slab.h.
* When the script inserts a new include, it looks at the include
blocks and try to put the new include such that its order conforms
to its surrounding. It's put in the include block which contains
core kernel includes, in the same order that the rest are ordered -
alphabetical, Christmas tree, rev-Xmas-tree or at the end if there
doesn't seem to be any matching order.
* If the script can't find a place to put a new include (mostly
because the file doesn't have fitting include block), it prints out
an error message indicating which .h file needs to be added to the
file.
The conversion was done in the following steps.
1. The initial automatic conversion of all .c files updated slightly
over 4000 files, deleting around 700 includes and adding ~480 gfp.h
and ~3000 slab.h inclusions. The script emitted errors for ~400
files.
2. Each error was manually checked. Some didn't need the inclusion,
some needed manual addition while adding it to implementation .h or
embedding .c file was more appropriate for others. This step added
inclusions to around 150 files.
3. The script was run again and the output was compared to the edits
from #2 to make sure no file was left behind.
4. Several build tests were done and a couple of problems were fixed.
e.g. lib/decompress_*.c used malloc/free() wrappers around slab
APIs requiring slab.h to be added manually.
5. The script was run on all .h files but without automatically
editing them as sprinkling gfp.h and slab.h inclusions around .h
files could easily lead to inclusion dependency hell. Most gfp.h
inclusion directives were ignored as stuff from gfp.h was usually
wildly available and often used in preprocessor macros. Each
slab.h inclusion directive was examined and added manually as
necessary.
6. percpu.h was updated not to include slab.h.
7. Build test were done on the following configurations and failures
were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my
distributed build env didn't work with gcov compiles) and a few
more options had to be turned off depending on archs to make things
build (like ipr on powerpc/64 which failed due to missing writeq).
* x86 and x86_64 UP and SMP allmodconfig and a custom test config.
* powerpc and powerpc64 SMP allmodconfig
* sparc and sparc64 SMP allmodconfig
* ia64 SMP allmodconfig
* s390 SMP allmodconfig
* alpha SMP allmodconfig
* um on x86_64 SMP allmodconfig
8. percpu.h modifications were reverted so that it could be applied as
a separate patch and serve as bisection point.
Given the fact that I had only a couple of failures from tests on step
6, I'm fairly confident about the coverage of this conversion patch.
If there is a breakage, it's likely to be something in one of the arch
headers which should be easily discoverable on most builds of
the specific arch.
Signed-off-by: Tejun Heo <tj@kernel.org>
Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 17:04:11 +09:00
|
|
|
#include <linux/slab.h>
|
2011-10-26 23:14:16 +02:00
|
|
|
#include <linux/init_task.h>
|
2012-02-13 03:58:52 +00:00
|
|
|
#include <linux/binfmts.h>
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2012-03-28 18:30:03 +01:00
|
|
|
#include <asm/switch_to.h>
|
2007-05-08 00:32:57 -07:00
|
|
|
#include <asm/tlb.h>
|
2007-10-24 18:23:50 +02:00
|
|
|
#include <asm/irq_regs.h>
|
2012-01-11 08:58:16 +01:00
|
|
|
#include <asm/mutex.h>
|
2011-07-11 15:28:17 -04:00
|
|
|
#ifdef CONFIG_PARAVIRT
|
|
|
|
#include <asm/paravirt.h>
|
|
|
|
#endif
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2011-10-25 10:00:11 +02:00
|
|
|
#include "sched.h"
|
2011-11-15 17:14:39 +01:00
|
|
|
#include "../workqueue_sched.h"
|
2012-04-20 13:05:45 +00:00
|
|
|
#include "../smpboot.h"
|
2008-05-12 21:21:01 +02:00
|
|
|
|
2009-04-10 09:36:00 -04:00
|
|
|
#define CREATE_TRACE_POINTS
|
2009-04-14 19:39:12 -04:00
|
|
|
#include <trace/events/sched.h>
|
2009-04-10 09:36:00 -04:00
|
|
|
|
2011-10-25 10:00:11 +02:00
|
|
|
/*
 * start_bandwidth_timer - arm @period_timer unless it is already active.
 * @period_timer: the hrtimer to (re)start
 * @period: interval handed to hrtimer_forward()
 *
 * While the timer is not active, its expiry is forwarded relative to the
 * timer's own notion of "now" and the timer is started in absolute, pinned
 * mode, using the distance between its soft and hard expiry as the range.
 * hrtimer_active() is re-checked after every start attempt.
 */
void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
{
	while (!hrtimer_active(period_timer)) {
		ktime_t now, soft_exp, hard_exp;
		unsigned long range_ns;

		now = hrtimer_cb_get_time(period_timer);
		hrtimer_forward(period_timer, now, period);

		soft_exp = hrtimer_get_softexpires(period_timer);
		hard_exp = hrtimer_get_expires(period_timer);
		range_ns = ktime_to_ns(ktime_sub(hard_exp, soft_exp));

		__hrtimer_start_range_ns(period_timer, soft_exp, range_ns,
					 HRTIMER_MODE_ABS_PINNED, 0);
	}
}
|
|
|
|
|
2011-10-25 10:00:11 +02:00
|
|
|
/* Mutex object named sched_domains_mutex; its users are not in this part of the file. */
DEFINE_MUTEX(sched_domains_mutex);
|
|
|
|
/* Per-CPU 'struct rq' instances, named runqueues. */
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
|
2010-06-08 11:40:42 +02:00
|
|
|
|
2010-12-09 14:15:34 +01:00
|
|
|
static void update_rq_clock_task(struct rq *rq, s64 delta);
|
2010-10-04 17:03:21 -07:00
|
|
|
|
2011-10-25 10:00:11 +02:00
|
|
|
void update_rq_clock(struct rq *rq)
|
2008-05-03 18:29:28 +02:00
|
|
|
{
|
2010-12-09 14:15:34 +01:00
|
|
|
s64 delta;
|
2010-10-04 17:03:21 -07:00
|
|
|
|
2011-04-29 08:36:50 +02:00
|
|
|
if (rq->skip_clock_update > 0)
|
2010-12-08 11:05:42 +01:00
|
|
|
return;
|
2010-10-04 17:03:22 -07:00
|
|
|
|
2010-12-09 14:15:34 +01:00
|
|
|
delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
|
|
|
|
rq->clock += delta;
|
|
|
|
update_rq_clock_task(rq, delta);
|
2008-05-03 18:29:28 +02:00
|
|
|
}
|
|
|
|
|
2007-10-15 17:00:04 +02:00
|
|
|
/*
|
|
|
|
* Debugging: various feature bits
|
|
|
|
*/
|
2008-04-19 19:45:00 +02:00
|
|
|
|
|
|
|
/*
 * Build the initial feature bitmask.
 *
 * SCHED_FEAT(name, enabled) is temporarily defined so that each invocation
 * expands to "(1UL << __SCHED_FEAT_<name>) * enabled |". Including
 * features.h between the "=" and the closing "0;" therefore ORs together
 * one term per SCHED_FEAT() entry (features.h is not visible here; it is
 * assumed to consist of SCHED_FEAT() invocations). The macro is undefined
 * again afterwards so it can be redefined for other expansions.
 */
#define SCHED_FEAT(name, enabled) \
|
|
|
|
(1UL << __SCHED_FEAT_##name) * enabled |
|
|
|
|
|
2007-10-15 17:00:04 +02:00
|
|
|
const_debug unsigned int sysctl_sched_features =
|
2011-11-15 17:14:39 +01:00
|
|
|
#include "features.h"
|
2008-04-19 19:45:00 +02:00
|
|
|
0;
|
|
|
|
|
|
|
|
#undef SCHED_FEAT
|
|
|
|
|
|
|
|
#ifdef CONFIG_SCHED_DEBUG
|
|
|
|
/*
 * Table of feature names as strings.
 *
 * Here SCHED_FEAT(name, enabled) expands to the stringified name followed
 * by a comma, so including features.h inside the initializer yields one
 * string per SCHED_FEAT() entry. sched_feat_show() and sched_feat_write()
 * index this table with the same index they use as the bit number in
 * sysctl_sched_features.
 */
#define SCHED_FEAT(name, enabled) \
|
|
|
|
#name ,
|
|
|
|
|
2012-05-25 15:41:54 +09:00
|
|
|
static const char * const sched_feat_names[] = {
|
2011-11-15 17:14:39 +01:00
|
|
|
#include "features.h"
|
2008-04-19 19:45:00 +02:00
|
|
|
};
|
|
|
|
|
|
|
|
#undef SCHED_FEAT
|
|
|
|
|
2008-10-30 15:23:32 +08:00
|
|
|
static int sched_feat_show(struct seq_file *m, void *v)
|
2008-04-19 19:45:00 +02:00
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
2011-07-06 14:20:14 +02:00
|
|
|
for (i = 0; i < __SCHED_FEAT_NR; i++) {
|
2008-10-30 15:23:32 +08:00
|
|
|
if (!(sysctl_sched_features & (1UL << i)))
|
|
|
|
seq_puts(m, "NO_");
|
|
|
|
seq_printf(m, "%s ", sched_feat_names[i]);
|
2008-04-19 19:45:00 +02:00
|
|
|
}
|
2008-10-30 15:23:32 +08:00
|
|
|
seq_puts(m, "\n");
|
2008-04-19 19:45:00 +02:00
|
|
|
|
2008-10-30 15:23:32 +08:00
|
|
|
return 0;
|
2008-04-19 19:45:00 +02:00
|
|
|
}
|
|
|
|
|
2011-07-06 14:20:14 +02:00
|
|
|
#ifdef HAVE_JUMP_LABEL
|
|
|
|
|
2012-02-24 08:31:31 +01:00
|
|
|
/*
 * Per-feature static keys (only built when HAVE_JUMP_LABEL is defined).
 *
 * SCHED_FEAT(name, enabled) expands to jump_label_key__<enabled>, which the
 * two helper macros map to STATIC_KEY_INIT_TRUE / STATIC_KEY_INIT_FALSE.
 * This relies on the "enabled" argument in features.h being spelled
 * literally as true or false. Including features.h inside the initializer
 * gives one initial key state per SCHED_FEAT() entry.
 */
#define jump_label_key__true STATIC_KEY_INIT_TRUE
|
|
|
|
#define jump_label_key__false STATIC_KEY_INIT_FALSE
|
2011-07-06 14:20:14 +02:00
|
|
|
|
|
|
|
#define SCHED_FEAT(name, enabled) \
|
|
|
|
jump_label_key__##enabled ,
|
|
|
|
|
2012-02-24 08:31:31 +01:00
|
|
|
struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
|
2011-07-06 14:20:14 +02:00
|
|
|
#include "features.h"
|
|
|
|
};
|
|
|
|
|
|
|
|
#undef SCHED_FEAT
|
|
|
|
|
|
|
|
static void sched_feat_disable(int i)
|
|
|
|
{
|
2012-02-24 08:31:31 +01:00
|
|
|
if (static_key_enabled(&sched_feat_keys[i]))
|
|
|
|
static_key_slow_dec(&sched_feat_keys[i]);
|
2011-07-06 14:20:14 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
static void sched_feat_enable(int i)
|
|
|
|
{
|
2012-02-24 08:31:31 +01:00
|
|
|
if (!static_key_enabled(&sched_feat_keys[i]))
|
|
|
|
static_key_slow_inc(&sched_feat_keys[i]);
|
2011-07-06 14:20:14 +02:00
|
|
|
}
|
|
|
|
#else
|
|
|
|
/*
 * No-op variants used when HAVE_JUMP_LABEL is not defined: there is no
 * sched_feat_keys[] array in that configuration, so there is nothing to
 * flip besides the sysctl_sched_features bit the caller already updated.
 *
 * Note: no ';' after the function bodies - a stray semicolon at file scope
 * is an empty declaration, which ISO C does not permit.
 */
static void sched_feat_disable(int i) { }
static void sched_feat_enable(int i) { }
|
|
|
|
#endif /* HAVE_JUMP_LABEL */
|
|
|
|
|
2008-04-19 19:45:00 +02:00
|
|
|
static ssize_t
|
|
|
|
sched_feat_write(struct file *filp, const char __user *ubuf,
|
|
|
|
size_t cnt, loff_t *ppos)
|
|
|
|
{
|
|
|
|
char buf[64];
|
2010-09-13 17:47:00 -04:00
|
|
|
char *cmp;
|
2008-04-19 19:45:00 +02:00
|
|
|
int neg = 0;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
if (cnt > 63)
|
|
|
|
cnt = 63;
|
|
|
|
|
|
|
|
if (copy_from_user(&buf, ubuf, cnt))
|
|
|
|
return -EFAULT;
|
|
|
|
|
|
|
|
buf[cnt] = 0;
|
2010-09-13 17:47:00 -04:00
|
|
|
cmp = strstrip(buf);
|
2008-04-19 19:45:00 +02:00
|
|
|
|
2011-01-06 20:58:12 +08:00
|
|
|
if (strncmp(cmp, "NO_", 3) == 0) {
|
2008-04-19 19:45:00 +02:00
|
|
|
neg = 1;
|
|
|
|
cmp += 3;
|
|
|
|
}
|
|
|
|
|
2011-07-06 14:20:14 +02:00
|
|
|
for (i = 0; i < __SCHED_FEAT_NR; i++) {
|
2010-09-13 17:47:00 -04:00
|
|
|
if (strcmp(cmp, sched_feat_names[i]) == 0) {
|
2011-07-06 14:20:14 +02:00
|
|
|
if (neg) {
|
2008-04-19 19:45:00 +02:00
|
|
|
sysctl_sched_features &= ~(1UL << i);
|
2011-07-06 14:20:14 +02:00
|
|
|
sched_feat_disable(i);
|
|
|
|
} else {
|
2008-04-19 19:45:00 +02:00
|
|
|
sysctl_sched_features |= (1UL << i);
|
2011-07-06 14:20:14 +02:00
|
|
|
sched_feat_enable(i);
|
|
|
|
}
|
2008-04-19 19:45:00 +02:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-07-06 14:20:14 +02:00
|
|
|
if (i == __SCHED_FEAT_NR)
|
2008-04-19 19:45:00 +02:00
|
|
|
return -EINVAL;
|
|
|
|
|
2009-11-20 17:40:37 +01:00
|
|
|
*ppos += cnt;
|
2008-04-19 19:45:00 +02:00
|
|
|
|
|
|
|
return cnt;
|
|
|
|
}
|
|
|
|
|
2008-10-30 15:23:32 +08:00
|
|
|
static int sched_feat_open(struct inode *inode, struct file *filp)
|
|
|
|
{
|
|
|
|
return single_open(filp, sched_feat_show, NULL);
|
|
|
|
}
|
|
|
|
|
2009-10-01 15:43:56 -07:00
|
|
|
static const struct file_operations sched_feat_fops = {
|
2008-10-30 15:23:32 +08:00
|
|
|
.open = sched_feat_open,
|
|
|
|
.write = sched_feat_write,
|
|
|
|
.read = seq_read,
|
|
|
|
.llseek = seq_lseek,
|
|
|
|
.release = single_release,
|
2008-04-19 19:45:00 +02:00
|
|
|
};
|
|
|
|
|
|
|
|
static __init int sched_init_debug(void)
|
|
|
|
{
|
|
|
|
debugfs_create_file("sched_features", 0644, NULL, NULL,
|
|
|
|
&sched_feat_fops);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
late_initcall(sched_init_debug);
|
2011-07-06 14:20:14 +02:00
|
|
|
#endif /* CONFIG_SCHED_DEBUG */
|
2007-10-15 17:00:04 +02:00
|
|
|
|
2007-11-09 22:39:39 +01:00
|
|
|
/*
|
|
|
|
* Number of tasks to iterate in a single balance run.
|
|
|
|
* Limited because this is done with IRQs disabled.
|
|
|
|
*/
|
|
|
|
const_debug unsigned int sysctl_sched_nr_migrate = 32;
|
|
|
|
|
2009-09-01 10:34:37 +02:00
|
|
|
/*
|
|
|
|
* period over which we average the RT time consumption, measured
|
|
|
|
* in ms.
|
|
|
|
*
|
|
|
|
* default: 1s
|
|
|
|
*/
|
|
|
|
const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
|
|
|
|
|
2008-01-25 21:08:29 +01:00
|
|
|
/*
|
2008-02-13 15:45:39 +01:00
|
|
|
* period over which we measure -rt task cpu usage in us.
|
2008-01-25 21:08:29 +01:00
|
|
|
* default: 1s
|
|
|
|
*/
|
2008-02-13 15:45:39 +01:00
|
|
|
unsigned int sysctl_sched_rt_period = 1000000;
|
2008-01-25 21:08:29 +01:00
|
|
|
|
2011-10-25 10:00:11 +02:00
|
|
|
__read_mostly int scheduler_running;
|
2008-02-13 14:02:36 +01:00
|
|
|
|
2008-02-13 15:45:39 +01:00
|
|
|
/*
|
|
|
|
* part of the period that we allow rt tasks to run in us.
|
|
|
|
* default: 0.95s
|
|
|
|
*/
|
|
|
|
int sysctl_sched_rt_runtime = 950000;
|
2008-01-25 21:08:29 +01:00
|
|
|
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2010-02-15 14:45:54 +01:00
|
|
|
/*
|
2011-04-05 17:23:51 +02:00
|
|
|
* __task_rq_lock - lock the rq @p resides on.
|
2006-06-27 02:54:51 -07:00
|
|
|
*/
|
2006-07-03 00:25:42 -07:00
|
|
|
static inline struct rq *__task_rq_lock(struct task_struct *p)
|
2006-06-27 02:54:51 -07:00
|
|
|
__acquires(rq->lock)
|
|
|
|
{
|
2010-02-15 14:45:54 +01:00
|
|
|
struct rq *rq;
|
|
|
|
|
2011-04-05 17:23:51 +02:00
|
|
|
lockdep_assert_held(&p->pi_lock);
|
|
|
|
|
2007-10-15 17:00:14 +02:00
|
|
|
for (;;) {
|
2010-02-15 14:45:54 +01:00
|
|
|
rq = task_rq(p);
|
2009-11-17 14:28:38 +01:00
|
|
|
raw_spin_lock(&rq->lock);
|
2010-03-25 21:05:16 +01:00
|
|
|
if (likely(rq == task_rq(p)))
|
2007-10-15 17:00:14 +02:00
|
|
|
return rq;
|
2009-11-17 14:28:38 +01:00
|
|
|
raw_spin_unlock(&rq->lock);
|
2006-06-27 02:54:51 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
2011-04-05 17:23:51 +02:00
|
|
|
* task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
|
2005-04-16 15:20:36 -07:00
|
|
|
*/
|
2006-07-03 00:25:42 -07:00
|
|
|
static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
|
2011-04-05 17:23:51 +02:00
|
|
|
__acquires(p->pi_lock)
|
2005-04-16 15:20:36 -07:00
|
|
|
__acquires(rq->lock)
|
|
|
|
{
|
2006-07-03 00:25:42 -07:00
|
|
|
struct rq *rq;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2007-10-15 17:00:14 +02:00
|
|
|
for (;;) {
|
2011-04-05 17:23:51 +02:00
|
|
|
raw_spin_lock_irqsave(&p->pi_lock, *flags);
|
2007-10-15 17:00:14 +02:00
|
|
|
rq = task_rq(p);
|
2009-11-17 14:28:38 +01:00
|
|
|
raw_spin_lock(&rq->lock);
|
2010-03-25 21:05:16 +01:00
|
|
|
if (likely(rq == task_rq(p)))
|
2007-10-15 17:00:14 +02:00
|
|
|
return rq;
|
2011-04-05 17:23:51 +02:00
|
|
|
raw_spin_unlock(&rq->lock);
|
|
|
|
raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2007-10-15 17:00:13 +02:00
|
|
|
static void __task_rq_unlock(struct rq *rq)
|
2006-06-27 02:54:51 -07:00
|
|
|
__releases(rq->lock)
|
|
|
|
{
|
2009-11-17 14:28:38 +01:00
|
|
|
raw_spin_unlock(&rq->lock);
|
2006-06-27 02:54:51 -07:00
|
|
|
}
|
|
|
|
|
2011-04-05 17:23:51 +02:00
|
|
|
static inline void
|
|
|
|
task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
|
2005-04-16 15:20:36 -07:00
|
|
|
__releases(rq->lock)
|
2011-04-05 17:23:51 +02:00
|
|
|
__releases(p->pi_lock)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2011-04-05 17:23:51 +02:00
|
|
|
raw_spin_unlock(&rq->lock);
|
|
|
|
raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2006-12-10 02:20:00 -08:00
|
|
|
* this_rq_lock - lock this runqueue and disable interrupts.
|
2005-04-16 15:20:36 -07:00
|
|
|
*/
|
2007-10-15 17:00:13 +02:00
|
|
|
static struct rq *this_rq_lock(void)
|
2005-04-16 15:20:36 -07:00
|
|
|
__acquires(rq->lock)
|
|
|
|
{
|
2006-07-03 00:25:42 -07:00
|
|
|
struct rq *rq;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
local_irq_disable();
|
|
|
|
rq = this_rq();
|
2009-11-17 14:28:38 +01:00
|
|
|
raw_spin_lock(&rq->lock);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
return rq;
|
|
|
|
}
|
|
|
|
|
2008-01-25 21:08:29 +01:00
|
|
|
#ifdef CONFIG_SCHED_HRTICK
|
|
|
|
/*
|
|
|
|
* Use HR-timers to deliver accurate preemption points.
|
|
|
|
*
|
|
|
|
* It's all a bit involved since we cannot program an hrt while holding the
|
|
|
|
* rq->lock. So what we do is store a state in rq->hrtick_* and ask for a
|
|
|
|
* reschedule event.
|
|
|
|
*
|
|
|
|
* When we get rescheduled we reprogram the hrtick_timer outside of the
|
|
|
|
* rq->lock.
|
|
|
|
*/
|
|
|
|
|
|
|
|
static void hrtick_clear(struct rq *rq)
|
|
|
|
{
|
|
|
|
if (hrtimer_active(&rq->hrtick_timer))
|
|
|
|
hrtimer_cancel(&rq->hrtick_timer);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* High-resolution timer tick.
|
|
|
|
* Runs from hardirq context with interrupts disabled.
|
|
|
|
*/
|
|
|
|
static enum hrtimer_restart hrtick(struct hrtimer *timer)
|
|
|
|
{
|
|
|
|
struct rq *rq = container_of(timer, struct rq, hrtick_timer);
|
|
|
|
|
|
|
|
WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
|
|
|
|
|
2009-11-17 14:28:38 +01:00
|
|
|
raw_spin_lock(&rq->lock);
|
2008-05-03 18:29:28 +02:00
|
|
|
update_rq_clock(rq);
|
2008-01-25 21:08:29 +01:00
|
|
|
rq->curr->sched_class->task_tick(rq, rq->curr, 1);
|
2009-11-17 14:28:38 +01:00
|
|
|
raw_spin_unlock(&rq->lock);
|
2008-01-25 21:08:29 +01:00
|
|
|
|
|
|
|
return HRTIMER_NORESTART;
|
|
|
|
}
|
|
|
|
|
2008-05-11 05:55:33 +05:30
|
|
|
#ifdef CONFIG_SMP
|
2008-07-18 18:01:23 +02:00
|
|
|
/*
|
|
|
|
* called from hardirq (IPI) context
|
|
|
|
*/
|
|
|
|
static void __hrtick_start(void *arg)
|
2008-04-29 10:02:46 +02:00
|
|
|
{
|
2008-07-18 18:01:23 +02:00
|
|
|
struct rq *rq = arg;
|
2008-04-29 10:02:46 +02:00
|
|
|
|
2009-11-17 14:28:38 +01:00
|
|
|
raw_spin_lock(&rq->lock);
|
2008-07-18 18:01:23 +02:00
|
|
|
hrtimer_restart(&rq->hrtick_timer);
|
|
|
|
rq->hrtick_csd_pending = 0;
|
2009-11-17 14:28:38 +01:00
|
|
|
raw_spin_unlock(&rq->lock);
|
2008-04-29 10:02:46 +02:00
|
|
|
}
|
|
|
|
|
2008-07-18 18:01:23 +02:00
|
|
|
/*
|
|
|
|
* Called to set the hrtick timer state.
|
|
|
|
*
|
|
|
|
* called with rq->lock held and irqs disabled
|
|
|
|
*/
|
2011-10-25 10:00:11 +02:00
|
|
|
void hrtick_start(struct rq *rq, u64 delay)
|
2008-04-29 10:02:46 +02:00
|
|
|
{
|
2008-07-18 18:01:23 +02:00
|
|
|
struct hrtimer *timer = &rq->hrtick_timer;
|
|
|
|
ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
|
2008-04-29 10:02:46 +02:00
|
|
|
|
2008-09-01 15:02:30 -07:00
|
|
|
hrtimer_set_expires(timer, time);
|
2008-07-18 18:01:23 +02:00
|
|
|
|
|
|
|
if (rq == this_rq()) {
|
|
|
|
hrtimer_restart(timer);
|
|
|
|
} else if (!rq->hrtick_csd_pending) {
|
2009-02-25 13:59:48 +01:00
|
|
|
__smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
|
2008-07-18 18:01:23 +02:00
|
|
|
rq->hrtick_csd_pending = 1;
|
|
|
|
}
|
2008-04-29 10:02:46 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
|
|
|
|
{
|
|
|
|
int cpu = (int)(long)hcpu;
|
|
|
|
|
|
|
|
switch (action) {
|
|
|
|
case CPU_UP_CANCELED:
|
|
|
|
case CPU_UP_CANCELED_FROZEN:
|
|
|
|
case CPU_DOWN_PREPARE:
|
|
|
|
case CPU_DOWN_PREPARE_FROZEN:
|
|
|
|
case CPU_DEAD:
|
|
|
|
case CPU_DEAD_FROZEN:
|
2008-07-18 18:01:23 +02:00
|
|
|
hrtick_clear(cpu_rq(cpu));
|
2008-04-29 10:02:46 +02:00
|
|
|
return NOTIFY_OK;
|
|
|
|
}
|
|
|
|
|
|
|
|
return NOTIFY_DONE;
|
|
|
|
}
|
|
|
|
|
2008-09-22 14:55:45 -07:00
|
|
|
static __init void init_hrtick(void)
|
2008-04-29 10:02:46 +02:00
|
|
|
{
|
|
|
|
hotcpu_notifier(hotplug_hrtick, 0);
|
|
|
|
}
|
2008-07-18 18:01:23 +02:00
|
|
|
#else
|
|
|
|
/*
|
|
|
|
* Called to set the hrtick timer state.
|
|
|
|
*
|
|
|
|
* called with rq->lock held and irqs disabled
|
|
|
|
*/
|
2011-10-25 10:00:11 +02:00
|
|
|
void hrtick_start(struct rq *rq, u64 delay)
|
2008-07-18 18:01:23 +02:00
|
|
|
{
|
2009-03-13 12:21:27 +01:00
|
|
|
__hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
|
2009-04-16 12:14:37 +05:30
|
|
|
HRTIMER_MODE_REL_PINNED, 0);
|
2008-07-18 18:01:23 +02:00
|
|
|
}
|
2008-04-29 10:02:46 +02:00
|
|
|
|
2008-09-22 14:55:46 -07:00
|
|
|
/* !CONFIG_SMP: hrtick needs no global initialisation. */
static inline void init_hrtick(void) { }
|
2008-07-18 18:01:23 +02:00
|
|
|
#endif /* CONFIG_SMP */
|
2008-01-25 21:08:29 +01:00
|
|
|
|
2008-07-18 18:01:23 +02:00
|
|
|
static void init_rq_hrtick(struct rq *rq)
|
2008-01-25 21:08:29 +01:00
|
|
|
{
|
2008-07-18 18:01:23 +02:00
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
rq->hrtick_csd_pending = 0;
|
2008-01-25 21:08:29 +01:00
|
|
|
|
2008-07-18 18:01:23 +02:00
|
|
|
rq->hrtick_csd.flags = 0;
|
|
|
|
rq->hrtick_csd.func = __hrtick_start;
|
|
|
|
rq->hrtick_csd.info = rq;
|
|
|
|
#endif
|
2008-01-25 21:08:29 +01:00
|
|
|
|
2008-07-18 18:01:23 +02:00
|
|
|
hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
|
|
|
|
rq->hrtick_timer.function = hrtick;
|
2008-01-25 21:08:29 +01:00
|
|
|
}
|
2008-09-22 14:55:46 -07:00
|
|
|
#else /* CONFIG_SCHED_HRTICK */
|
2008-01-25 21:08:29 +01:00
|
|
|
/* !CONFIG_SCHED_HRTICK: there is no hrtick timer to clear. */
static inline void hrtick_clear(struct rq *rq) { }
|
|
|
|
|
|
|
|
/* !CONFIG_SCHED_HRTICK: no per-runqueue hrtick state to initialise. */
static inline void init_rq_hrtick(struct rq *rq) { }
|
|
|
|
|
2008-04-29 10:02:46 +02:00
|
|
|
/* !CONFIG_SCHED_HRTICK: no global hrtick initialisation required. */
static inline void init_hrtick(void) { }
|
2008-09-22 14:55:46 -07:00
|
|
|
#endif /* CONFIG_SCHED_HRTICK */
|
2008-01-25 21:08:29 +01:00
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
/*
|
|
|
|
* resched_task - mark a task 'to be rescheduled now'.
|
|
|
|
*
|
|
|
|
* On UP this means the setting of the need_resched flag, on SMP it
|
|
|
|
* might also involve a cross-CPU call to trigger the scheduler on
|
|
|
|
* the target CPU.
|
|
|
|
*/
|
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
|
|
|
|
/*
 * Fallback definition, used only when tsk_is_polling() has not already been
 * defined: test the task's TIF_POLLING_NRFLAG thread flag.
 */
#if !defined(tsk_is_polling)
#define tsk_is_polling(t)	test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
#endif
|
|
|
|
|
2011-10-25 10:00:11 +02:00
|
|
|
void resched_task(struct task_struct *p)
|
2007-07-09 18:51:59 +02:00
|
|
|
{
|
|
|
|
int cpu;
|
|
|
|
|
2009-11-17 14:28:38 +01:00
|
|
|
assert_raw_spin_locked(&task_rq(p)->lock);
|
2007-07-09 18:51:59 +02:00
|
|
|
|
2009-03-06 19:40:20 +08:00
|
|
|
if (test_tsk_need_resched(p))
|
2007-07-09 18:51:59 +02:00
|
|
|
return;
|
|
|
|
|
2009-03-06 19:40:20 +08:00
|
|
|
set_tsk_need_resched(p);
|
2007-07-09 18:51:59 +02:00
|
|
|
|
|
|
|
cpu = task_cpu(p);
|
|
|
|
if (cpu == smp_processor_id())
|
|
|
|
return;
|
|
|
|
|
|
|
|
/* NEED_RESCHED must be visible before we test polling */
|
|
|
|
smp_mb();
|
|
|
|
if (!tsk_is_polling(p))
|
|
|
|
smp_send_reschedule(cpu);
|
|
|
|
}
|
|
|
|
|
2011-10-25 10:00:11 +02:00
|
|
|
void resched_cpu(int cpu)
|
2007-07-09 18:51:59 +02:00
|
|
|
{
|
|
|
|
struct rq *rq = cpu_rq(cpu);
|
|
|
|
unsigned long flags;
|
|
|
|
|
2009-11-17 14:28:38 +01:00
|
|
|
if (!raw_spin_trylock_irqsave(&rq->lock, flags))
|
2007-07-09 18:51:59 +02:00
|
|
|
return;
|
|
|
|
resched_task(cpu_curr(cpu));
|
2009-11-17 14:28:38 +01:00
|
|
|
raw_spin_unlock_irqrestore(&rq->lock, flags);
|
2007-07-09 18:51:59 +02:00
|
|
|
}
|
2008-03-22 09:20:24 +01:00
|
|
|
|
|
|
|
#ifdef CONFIG_NO_HZ
|
2010-05-21 17:09:41 -07:00
|
|
|
/*
|
|
|
|
* In the semi idle case, use the nearest busy cpu for migrating timers
|
|
|
|
* from an idle cpu. This is good for power-savings.
|
|
|
|
*
|
|
|
|
* We don't do similar optimization for completely idle system, as
|
|
|
|
* selecting an idle cpu will add more delays to the timers than intended
|
|
|
|
* (as that cpu's timer base may not be uptodate wrt jiffies etc).
|
|
|
|
*/
|
|
|
|
int get_nohz_timer_target(void)
|
|
|
|
{
|
|
|
|
int cpu = smp_processor_id();
|
|
|
|
int i;
|
|
|
|
struct sched_domain *sd;
|
|
|
|
|
2011-04-18 11:24:34 +02:00
|
|
|
rcu_read_lock();
|
2010-05-21 17:09:41 -07:00
|
|
|
for_each_domain(cpu, sd) {
|
2011-04-18 11:24:34 +02:00
|
|
|
for_each_cpu(i, sched_domain_span(sd)) {
|
|
|
|
if (!idle_cpu(i)) {
|
|
|
|
cpu = i;
|
|
|
|
goto unlock;
|
|
|
|
}
|
|
|
|
}
|
2010-05-21 17:09:41 -07:00
|
|
|
}
|
2011-04-18 11:24:34 +02:00
|
|
|
unlock:
|
|
|
|
rcu_read_unlock();
|
2010-05-21 17:09:41 -07:00
|
|
|
return cpu;
|
|
|
|
}
|
2008-03-22 09:20:24 +01:00
|
|
|
/*
|
|
|
|
* When add_timer_on() enqueues a timer into the timer wheel of an
|
|
|
|
* idle CPU then this timer might expire before the next timer event
|
|
|
|
* which is scheduled to wake up that CPU. In case of a completely
|
|
|
|
* idle system the next event might even be infinite time into the
|
|
|
|
* future. wake_up_idle_cpu() ensures that the CPU is woken up and
|
|
|
|
* leaves the inner idle loop so the newly added timer is taken into
|
|
|
|
* account when the CPU goes back to idle and evaluates the timer
|
|
|
|
* wheel for the next timer event.
|
|
|
|
*/
|
|
|
|
void wake_up_idle_cpu(int cpu)
|
|
|
|
{
|
|
|
|
struct rq *rq = cpu_rq(cpu);
|
|
|
|
|
|
|
|
if (cpu == smp_processor_id())
|
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This is safe, as this function is called with the timer
|
|
|
|
* wheel base lock of (cpu) held. When the CPU is on the way
|
|
|
|
* to idle and has not yet set rq->curr to idle then it will
|
|
|
|
* be serialized on the timer wheel base lock and take the new
|
|
|
|
* timer into account automatically.
|
|
|
|
*/
|
|
|
|
if (rq->curr != rq->idle)
|
|
|
|
return;
|
2007-07-09 18:51:59 +02:00
|
|
|
|
|
|
|
/*
|
2008-03-22 09:20:24 +01:00
|
|
|
* We can set TIF_RESCHED on the idle task of the other CPU
|
|
|
|
* lockless. The worst case is that the other CPU runs the
|
|
|
|
* idle task through an additional NOOP schedule()
|
2007-07-09 18:51:59 +02:00
|
|
|
*/
|
2009-03-06 19:40:20 +08:00
|
|
|
set_tsk_need_resched(rq->idle);
|
2007-07-09 18:51:59 +02:00
|
|
|
|
2008-03-22 09:20:24 +01:00
|
|
|
/* NEED_RESCHED must be visible before we test polling */
|
|
|
|
smp_mb();
|
|
|
|
if (!tsk_is_polling(rq->idle))
|
|
|
|
smp_send_reschedule(cpu);
|
2007-07-09 18:51:59 +02:00
|
|
|
}
|
|
|
|
|
2011-10-03 15:09:00 -07:00
|
|
|
static inline bool got_nohz_idle_kick(void)
|
2007-07-09 18:51:59 +02:00
|
|
|
{
|
2011-12-01 17:07:32 -08:00
|
|
|
int cpu = smp_processor_id();
|
|
|
|
return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
|
2007-07-09 18:51:59 +02:00
|
|
|
}
|
|
|
|
|
2011-10-03 15:09:00 -07:00
|
|
|
#else /* CONFIG_NO_HZ */
|
2007-07-09 18:51:59 +02:00
|
|
|
|
2011-10-03 15:09:00 -07:00
|
|
|
/* !CONFIG_NO_HZ: there are no nohz idle-balance kicks, so never true. */
static inline bool got_nohz_idle_kick(void)
{
	return false;
}
|
|
|
|
|
2008-05-30 14:23:45 +02:00
|
|
|
#endif /* CONFIG_NO_HZ */
|
2007-12-02 20:04:49 +01:00
|
|
|
|
2011-10-25 10:00:11 +02:00
|
|
|
void sched_avg_update(struct rq *rq)
|
2008-04-19 19:45:00 +02:00
|
|
|
{
|
2009-09-01 10:34:37 +02:00
|
|
|
s64 period = sched_avg_period();
|
|
|
|
|
|
|
|
while ((s64)(rq->clock - rq->age_stamp) > period) {
|
2010-05-24 12:11:43 -07:00
|
|
|
/*
|
|
|
|
* Inline assembly required to prevent the compiler
|
|
|
|
* optimising this loop into a divmod call.
|
|
|
|
* See __iter_div_u64_rem() for another example of this.
|
|
|
|
*/
|
|
|
|
asm("" : "+rm" (rq->age_stamp));
|
2009-09-01 10:34:37 +02:00
|
|
|
rq->age_stamp += period;
|
|
|
|
rq->rt_avg /= 2;
|
|
|
|
}
|
2008-04-19 19:45:00 +02:00
|
|
|
}
|
|
|
|
|
2008-05-30 14:23:45 +02:00
|
|
|
#else /* !CONFIG_SMP */
|
2011-10-25 10:00:11 +02:00
|
|
|
void resched_task(struct task_struct *p)
|
2008-04-19 19:45:00 +02:00
|
|
|
{
|
2009-11-17 14:28:38 +01:00
|
|
|
assert_raw_spin_locked(&task_rq(p)->lock);
|
2008-07-18 18:01:23 +02:00
|
|
|
set_tsk_need_resched(p);
|
2008-04-19 19:45:00 +02:00
|
|
|
}
|
2008-05-30 14:23:45 +02:00
|
|
|
#endif /* CONFIG_SMP */
|
2008-04-19 19:45:00 +02:00
|
|
|
|
2011-07-21 09:43:29 -07:00
|
|
|
#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
|
|
|
|
(defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
|
2008-06-27 13:41:14 +02:00
|
|
|
/*
|
2011-07-21 09:43:35 -07:00
|
|
|
* Iterate task_group tree rooted at *from, calling @down when first entering a
|
|
|
|
* node and @up when leaving it for the final time.
|
|
|
|
*
|
|
|
|
* Caller must hold rcu_lock or sufficient equivalent.
|
2008-06-27 13:41:14 +02:00
|
|
|
*/
|
2011-10-25 10:00:11 +02:00
|
|
|
int walk_tg_tree_from(struct task_group *from,
|
2011-07-21 09:43:35 -07:00
|
|
|
tg_visitor down, tg_visitor up, void *data)
|
2008-06-27 13:41:14 +02:00
|
|
|
{
|
|
|
|
struct task_group *parent, *child;
|
2008-08-19 12:33:05 +02:00
|
|
|
int ret;
|
2008-06-27 13:41:14 +02:00
|
|
|
|
2011-07-21 09:43:35 -07:00
|
|
|
parent = from;
|
|
|
|
|
2008-06-27 13:41:14 +02:00
|
|
|
down:
|
2008-08-19 12:33:05 +02:00
|
|
|
ret = (*down)(parent, data);
|
|
|
|
if (ret)
|
2011-07-21 09:43:35 -07:00
|
|
|
goto out;
|
2008-06-27 13:41:14 +02:00
|
|
|
list_for_each_entry_rcu(child, &parent->children, siblings) {
|
|
|
|
parent = child;
|
|
|
|
goto down;
|
|
|
|
|
|
|
|
up:
|
|
|
|
continue;
|
|
|
|
}
|
2008-08-19 12:33:05 +02:00
|
|
|
ret = (*up)(parent, data);
|
2011-07-21 09:43:35 -07:00
|
|
|
if (ret || parent == from)
|
|
|
|
goto out;
|
2008-06-27 13:41:14 +02:00
|
|
|
|
|
|
|
child = parent;
|
|
|
|
parent = parent->parent;
|
|
|
|
if (parent)
|
|
|
|
goto up;
|
2011-07-21 09:43:35 -07:00
|
|
|
out:
|
2008-08-19 12:33:05 +02:00
|
|
|
return ret;
|
2008-06-27 13:41:14 +02:00
|
|
|
}
|
|
|
|
|
2011-10-25 10:00:11 +02:00
|
|
|
/*
 * Do-nothing task_group visitor: ignores both arguments and always
 * returns 0.
 */
int tg_nop(struct task_group *tg, void *data)
{
	(void)tg;
	(void)data;

	return 0;
}
|
2008-04-19 19:45:00 +02:00
|
|
|
#endif
|
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
static void set_load_weight(struct task_struct *p)
|
|
|
|
{
|
2011-05-18 10:09:38 -07:00
|
|
|
int prio = p->static_prio - MAX_RT_PRIO;
|
|
|
|
struct load_weight *load = &p->se.load;
|
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
/*
|
|
|
|
* SCHED_IDLE tasks get minimal weight:
|
|
|
|
*/
|
|
|
|
if (p->policy == SCHED_IDLE) {
|
sched: Increase SCHED_LOAD_SCALE resolution
Introduce SCHED_LOAD_RESOLUTION, which scales is added to
SCHED_LOAD_SHIFT and increases the resolution of
SCHED_LOAD_SCALE. This patch sets the value of
SCHED_LOAD_RESOLUTION to 10, scaling up the weights for all
sched entities by a factor of 1024. With this extra resolution,
we can handle deeper cgroup hiearchies and the scheduler can do
better shares distribution and load load balancing on larger
systems (especially for low weight task groups).
This does not change the existing user interface, the scaled
weights are only used internally. We do not modify
prio_to_weight values or inverses, but use the original weights
when calculating the inverse which is used to scale execution
time delta in calc_delta_mine(). This ensures we do not lose
accuracy when accounting time to the sched entities. Thanks to
Nikunj Dadhania for fixing an bug in c_d_m() that broken fairness.
Below is some analysis of the performance costs/improvements of
this patch.
1. Micro-arch performance costs:
Experiment was to run Ingo's pipe_test_100k 200 times with the
task pinned to one cpu. I measured instruction, cycles and
stalled-cycles for the runs. See:
http://thread.gmane.org/gmane.linux.kernel/1129232/focus=1129389
for more info.
-tip (baseline):
Performance counter stats for '/root/load-scale/pipe-test-100k' (200 runs):
964,991,769 instructions # 0.82 insns per cycle
# 0.33 stalled cycles per insn
# ( +- 0.05% )
1,171,186,635 cycles # 0.000 GHz ( +- 0.08% )
306,373,664 stalled-cycles-backend # 26.16% backend cycles idle ( +- 0.28% )
314,933,621 stalled-cycles-frontend # 26.89% frontend cycles idle ( +- 0.34% )
1.122405684 seconds time elapsed ( +- 0.05% )
-tip+patches:
Performance counter stats for './load-scale/pipe-test-100k' (200 runs):
963,624,821 instructions # 0.82 insns per cycle
# 0.33 stalled cycles per insn
# ( +- 0.04% )
1,175,215,649 cycles # 0.000 GHz ( +- 0.08% )
315,321,126 stalled-cycles-backend # 26.83% backend cycles idle ( +- 0.28% )
316,835,873 stalled-cycles-frontend # 26.96% frontend cycles idle ( +- 0.29% )
1.122238659 seconds time elapsed ( +- 0.06% )
With this patch, instructions decrease by ~0.10% and cycles
increase by 0.27%. This doesn't look statistically significant.
The number of stalled cycles in the backend increased from
26.16% to 26.83%. This can be attributed to the shifts we do in
c_d_m() and other places. The fraction of stalled cycles in the
frontend remains about the same, at 26.96% compared to 26.89% in -tip.
2. Balancing low-weight task groups
Test setup: run 50 tasks with random sleep/busy times (biased
around 100ms) in a low weight container (with cpu.shares = 2).
Measure %idle as reported by mpstat over a 10s window.
-tip (baseline):
06:47:48 PM CPU %usr %nice %sys %iowait %irq %soft %steal %guest %idle intr/s
06:47:49 PM all 94.32 0.00 0.06 0.00 0.00 0.00 0.00 0.00 5.62 15888.00
06:47:50 PM all 94.57 0.00 0.62 0.00 0.00 0.00 0.00 0.00 4.81 16180.00
06:47:51 PM all 94.69 0.00 0.06 0.00 0.00 0.00 0.00 0.00 5.25 15966.00
06:47:52 PM all 95.81 0.00 0.00 0.00 0.00 0.00 0.00 0.00 4.19 16053.00
06:47:53 PM all 94.88 0.06 0.00 0.00 0.00 0.00 0.00 0.00 5.06 15984.00
06:47:54 PM all 93.31 0.00 0.00 0.00 0.00 0.00 0.00 0.00 6.69 15806.00
06:47:55 PM all 94.19 0.00 0.06 0.00 0.00 0.00 0.00 0.00 5.75 15896.00
06:47:56 PM all 92.87 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.13 15716.00
06:47:57 PM all 94.88 0.00 0.00 0.00 0.00 0.00 0.00 0.00 5.12 15982.00
06:47:58 PM all 95.44 0.00 0.00 0.00 0.00 0.00 0.00 0.00 4.56 16075.00
Average: all 94.49 0.01 0.08 0.00 0.00 0.00 0.00 0.00 5.42 15954.60
-tip+patches:
06:47:03 PM CPU %usr %nice %sys %iowait %irq %soft %steal %guest %idle intr/s
06:47:04 PM all 100.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 16630.00
06:47:05 PM all 99.69 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.31 16580.20
06:47:06 PM all 99.69 0.00 0.06 0.00 0.00 0.00 0.00 0.00 0.25 16596.00
06:47:07 PM all 99.20 0.00 0.74 0.00 0.00 0.06 0.00 0.00 0.00 17838.61
06:47:08 PM all 100.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 16540.00
06:47:09 PM all 100.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 16575.00
06:47:10 PM all 100.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 16614.00
06:47:11 PM all 99.94 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.06 16588.00
06:47:12 PM all 99.94 0.00 0.06 0.00 0.00 0.00 0.00 0.00 0.00 16593.00
06:47:13 PM all 99.94 0.00 0.06 0.00 0.00 0.00 0.00 0.00 0.00 16551.00
Average: all 99.84 0.00 0.09 0.00 0.00 0.01 0.00 0.00 0.06 16711.58
We see an improvement in idle% on the system (drops from 5.42% on -tip to 0.06%
with the patches).
We see an improvement in idle% on the system (drops from 5.42%
on -tip to 0.06% with the patches).
Signed-off-by: Nikhil Rao <ncrao@google.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Nikunj A. Dadhania <nikunj@linux.vnet.ibm.com>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Stephan Barwolf <stephan.baerwolf@tu-ilmenau.de>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/1305754668-18792-1-git-send-email-ncrao@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2011-05-18 14:37:48 -07:00
|
|
|
load->weight = scale_load(WEIGHT_IDLEPRIO);
|
2011-05-18 10:09:38 -07:00
|
|
|
load->inv_weight = WMULT_IDLEPRIO;
|
2007-07-09 18:51:59 +02:00
|
|
|
return;
|
|
|
|
}
|
2007-07-09 18:51:59 +02:00
|
|
|
|
sched: Increase SCHED_LOAD_SCALE resolution
Introduce SCHED_LOAD_RESOLUTION, which scales is added to
SCHED_LOAD_SHIFT and increases the resolution of
SCHED_LOAD_SCALE. This patch sets the value of
SCHED_LOAD_RESOLUTION to 10, scaling up the weights for all
sched entities by a factor of 1024. With this extra resolution,
we can handle deeper cgroup hiearchies and the scheduler can do
better shares distribution and load load balancing on larger
systems (especially for low weight task groups).
This does not change the existing user interface, the scaled
weights are only used internally. We do not modify
prio_to_weight values or inverses, but use the original weights
when calculating the inverse which is used to scale execution
time delta in calc_delta_mine(). This ensures we do not lose
accuracy when accounting time to the sched entities. Thanks to
Nikunj Dadhania for fixing an bug in c_d_m() that broken fairness.
Below is some analysis of the performance costs/improvements of
this patch.
1. Micro-arch performance costs:
Experiment was to run Ingo's pipe_test_100k 200 times with the
task pinned to one cpu. I measured instruction, cycles and
stalled-cycles for the runs. See:
http://thread.gmane.org/gmane.linux.kernel/1129232/focus=1129389
for more info.
-tip (baseline):
Performance counter stats for '/root/load-scale/pipe-test-100k' (200 runs):
964,991,769 instructions # 0.82 insns per cycle
# 0.33 stalled cycles per insn
# ( +- 0.05% )
1,171,186,635 cycles # 0.000 GHz ( +- 0.08% )
306,373,664 stalled-cycles-backend # 26.16% backend cycles idle ( +- 0.28% )
314,933,621 stalled-cycles-frontend # 26.89% frontend cycles idle ( +- 0.34% )
1.122405684 seconds time elapsed ( +- 0.05% )
-tip+patches:
Performance counter stats for './load-scale/pipe-test-100k' (200 runs):
963,624,821 instructions # 0.82 insns per cycle
# 0.33 stalled cycles per insn
# ( +- 0.04% )
1,175,215,649 cycles # 0.000 GHz ( +- 0.08% )
315,321,126 stalled-cycles-backend # 26.83% backend cycles idle ( +- 0.28% )
316,835,873 stalled-cycles-frontend # 26.96% frontend cycles idle ( +- 0.29% )
1.122238659 seconds time elapsed ( +- 0.06% )
With this patch, instructions decrease by ~0.10% and cycles
increase by 0.27%. This doesn't look statistically significant.
The number of stalled cycles in the backend increased from
26.16% to 26.83%. This can be attributed to the shifts we do in
c_d_m() and other places. The fraction of stalled cycles in the
frontend remains about the same, at 26.96% compared to 26.89% in -tip.
2. Balancing low-weight task groups
Test setup: run 50 tasks with random sleep/busy times (biased
around 100ms) in a low weight container (with cpu.shares = 2).
Measure %idle as reported by mpstat over a 10s window.
-tip (baseline):
06:47:48 PM CPU %usr %nice %sys %iowait %irq %soft %steal %guest %idle intr/s
06:47:49 PM all 94.32 0.00 0.06 0.00 0.00 0.00 0.00 0.00 5.62 15888.00
06:47:50 PM all 94.57 0.00 0.62 0.00 0.00 0.00 0.00 0.00 4.81 16180.00
06:47:51 PM all 94.69 0.00 0.06 0.00 0.00 0.00 0.00 0.00 5.25 15966.00
06:47:52 PM all 95.81 0.00 0.00 0.00 0.00 0.00 0.00 0.00 4.19 16053.00
06:47:53 PM all 94.88 0.06 0.00 0.00 0.00 0.00 0.00 0.00 5.06 15984.00
06:47:54 PM all 93.31 0.00 0.00 0.00 0.00 0.00 0.00 0.00 6.69 15806.00
06:47:55 PM all 94.19 0.00 0.06 0.00 0.00 0.00 0.00 0.00 5.75 15896.00
06:47:56 PM all 92.87 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.13 15716.00
06:47:57 PM all 94.88 0.00 0.00 0.00 0.00 0.00 0.00 0.00 5.12 15982.00
06:47:58 PM all 95.44 0.00 0.00 0.00 0.00 0.00 0.00 0.00 4.56 16075.00
Average: all 94.49 0.01 0.08 0.00 0.00 0.00 0.00 0.00 5.42 15954.60
-tip+patches:
06:47:03 PM CPU %usr %nice %sys %iowait %irq %soft %steal %guest %idle intr/s
06:47:04 PM all 100.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 16630.00
06:47:05 PM all 99.69 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.31 16580.20
06:47:06 PM all 99.69 0.00 0.06 0.00 0.00 0.00 0.00 0.00 0.25 16596.00
06:47:07 PM all 99.20 0.00 0.74 0.00 0.00 0.06 0.00 0.00 0.00 17838.61
06:47:08 PM all 100.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 16540.00
06:47:09 PM all 100.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 16575.00
06:47:10 PM all 100.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 16614.00
06:47:11 PM all 99.94 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.06 16588.00
06:47:12 PM all 99.94 0.00 0.06 0.00 0.00 0.00 0.00 0.00 0.00 16593.00
06:47:13 PM all 99.94 0.00 0.06 0.00 0.00 0.00 0.00 0.00 0.00 16551.00
Average: all 99.84 0.00 0.09 0.00 0.00 0.01 0.00 0.00 0.06 16711.58
We see an improvement in idle% on the system (drops from 5.42% on -tip to 0.06%
with the patches).
We see an improvement in idle% on the system (drops from 5.42%
on -tip to 0.06% with the patches).
Signed-off-by: Nikhil Rao <ncrao@google.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Nikunj A. Dadhania <nikunj@linux.vnet.ibm.com>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Stephan Barwolf <stephan.baerwolf@tu-ilmenau.de>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/1305754668-18792-1-git-send-email-ncrao@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2011-05-18 14:37:48 -07:00
|
|
|
load->weight = scale_load(prio_to_weight[prio]);
|
2011-05-18 10:09:38 -07:00
|
|
|
load->inv_weight = prio_to_wmult[prio];
|
2007-07-09 18:51:59 +02:00
|
|
|
}
|
|
|
|
|
2010-03-24 16:38:48 +01:00
|
|
|
static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
|
2008-06-27 14:30:00 -06:00
|
|
|
{
|
2010-03-11 17:16:20 +01:00
|
|
|
update_rq_clock(rq);
|
2007-07-09 18:51:59 +02:00
|
|
|
sched_info_queued(p);
|
2010-03-24 16:38:48 +01:00
|
|
|
p->sched_class->enqueue_task(rq, p, flags);
|
2007-07-09 18:51:59 +02:00
|
|
|
}
|
|
|
|
|
2010-03-24 16:38:48 +01:00
|
|
|
static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
|
2007-07-09 18:51:59 +02:00
|
|
|
{
|
2010-03-11 17:16:20 +01:00
|
|
|
update_rq_clock(rq);
|
sched: fix accounting in task delay accounting & migration
On Thu, Jun 19, 2008 at 12:27:14PM +0200, Peter Zijlstra wrote:
> On Thu, 2008-06-05 at 10:50 +0530, Ankita Garg wrote:
>
> > Thanks Peter for the explanation...
> >
> > I agree with the above and that is the reason why I did not see weird
> > values with cpu_time. But, run_delay still would suffer skews as the end
> > points for delta could be taken on different cpus due to migration (more
> > so on RT kernel due to the push-pull operations). With the below patch,
> > I could not reproduce the issue I had seen earlier. After every dequeue,
> > we take the delta and start wait measurements from zero when moved to a
> > different rq.
>
> OK, so task delay delay accounting is broken because it doesn't take
> migration into account.
>
> What you've done is make it symmetric wrt enqueue, and account it like
>
> cpu0 cpu1
>
> enqueue
> <wait-d1>
> dequeue
> enqueue
> <wait-d2>
> run
>
> Where you add both d1 and d2 to the run_delay,.. right?
>
Thanks for reviewing the patch. The above is exactly what I have done.
> This seems like a good fix, however it looks like the patch will break
> compilation in !CONFIG_SCHEDSTATS && !CONFIG_TASK_DELAY_ACCT, of it
> failing to provide a stub for sched_info_dequeue() in that case.
Fixed. Pl. find the new patch below.
Signed-off-by: Ankita Garg <ankita@in.ibm.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Gregory Haskins <ghaskins@novell.com>
Cc: rostedt@goodmis.org
Cc: suresh.b.siddha@intel.com
Cc: aneesh.kumar@linux.vnet.ibm.com
Cc: dhaval@linux.vnet.ibm.com
Cc: vatsa@linux.vnet.ibm.com
Cc: David Bahi <DBahi@novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-07-01 14:30:06 +05:30
|
|
|
sched_info_dequeued(p);
|
2010-03-24 16:38:48 +01:00
|
|
|
p->sched_class->dequeue_task(rq, p, flags);
|
2007-07-09 18:51:59 +02:00
|
|
|
}
|
|
|
|
|
2011-10-25 10:00:11 +02:00
|
|
|
void activate_task(struct rq *rq, struct task_struct *p, int flags)
|
2009-12-17 17:00:43 +01:00
|
|
|
{
|
|
|
|
if (task_contributes_to_load(p))
|
|
|
|
rq->nr_uninterruptible--;
|
|
|
|
|
2010-03-24 16:38:48 +01:00
|
|
|
enqueue_task(rq, p, flags);
|
2009-12-17 17:00:43 +01:00
|
|
|
}
|
|
|
|
|
2011-10-25 10:00:11 +02:00
|
|
|
void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
|
2009-12-17 17:00:43 +01:00
|
|
|
{
|
|
|
|
if (task_contributes_to_load(p))
|
|
|
|
rq->nr_uninterruptible++;
|
|
|
|
|
2010-03-24 16:38:48 +01:00
|
|
|
dequeue_task(rq, p, flags);
|
2009-12-17 17:00:43 +01:00
|
|
|
}
|
|
|
|
|
2010-10-04 17:03:19 -07:00
|
|
|
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
|
|
|
|
|
2010-10-04 17:03:21 -07:00
|
|
|
/*
 * Per-cpu hardirq/softirq time accounting state.
 *
 * No lock covers the per-cpu hardirq/softirq times. They are only modified
 * in account_system_vtime(), on the owning CPU and with interrupts
 * disabled, so the writes themselves are safe.
 *
 * They are read and saved onto struct rq in update_rq_clock(). That read
 * may come from another CPU and can therefore race with
 * account_system_vtime() on the owning CPU. The reader gets either the old
 * or the new value, with the side effect that a slice of irq time may be
 * accounted to the wrong task when an irq is in progress while rq->clock is
 * read. That is a worthy compromise compared with taking a lock on every
 * irq in account_system_vtime().
 */
static DEFINE_PER_CPU(u64, cpu_hardirq_time);
static DEFINE_PER_CPU(u64, cpu_softirq_time);

/* Timestamp up to which irq time has been accounted on this CPU. */
static DEFINE_PER_CPU(u64, irq_start_time);

/* Non-zero when irq time accounting is switched on. */
static int sched_clock_irqtime;
|
|
|
|
|
|
|
|
void enable_sched_clock_irqtime(void)
|
|
|
|
{
|
|
|
|
sched_clock_irqtime = 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
void disable_sched_clock_irqtime(void)
|
|
|
|
{
|
|
|
|
sched_clock_irqtime = 0;
|
|
|
|
}
|
|
|
|
|
2010-12-09 14:15:34 +01:00
|
|
|
#ifndef CONFIG_64BIT
|
|
|
|
/*
 * !CONFIG_64BIT only: per-cpu sequence counter bracketing updates of the
 * u64 irq time counters, so that irq_time_read() can detect and retry a
 * read that raced with an update.
 */
static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
|
|
|
|
|
|
|
|
static inline void irq_time_write_begin(void)
|
|
|
|
{
|
|
|
|
__this_cpu_inc(irq_time_seq.sequence);
|
|
|
|
smp_wmb();
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void irq_time_write_end(void)
|
|
|
|
{
|
|
|
|
smp_wmb();
|
|
|
|
__this_cpu_inc(irq_time_seq.sequence);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline u64 irq_time_read(int cpu)
|
|
|
|
{
|
|
|
|
u64 irq_time;
|
|
|
|
unsigned seq;
|
|
|
|
|
|
|
|
do {
|
|
|
|
seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
|
|
|
|
irq_time = per_cpu(cpu_softirq_time, cpu) +
|
|
|
|
per_cpu(cpu_hardirq_time, cpu);
|
|
|
|
} while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
|
|
|
|
|
|
|
|
return irq_time;
|
|
|
|
}
|
|
|
|
#else /* CONFIG_64BIT */
|
|
|
|
/* CONFIG_64BIT: no sequence counter is used, so this is a no-op. */
static inline void irq_time_write_begin(void) { }
|
|
|
|
|
|
|
|
/* CONFIG_64BIT: no sequence counter is used, so this is a no-op. */
static inline void irq_time_write_end(void) { }
|
|
|
|
|
|
|
|
static inline u64 irq_time_read(int cpu)
|
2010-10-04 17:03:21 -07:00
|
|
|
{
|
|
|
|
return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
|
|
|
|
}
|
2010-12-09 14:15:34 +01:00
|
|
|
#endif /* CONFIG_64BIT */
|
2010-10-04 17:03:21 -07:00
|
|
|
|
2010-12-09 14:15:34 +01:00
|
|
|
/*
|
|
|
|
* Called before incrementing preempt_count on {soft,}irq_enter
|
|
|
|
* and before decrementing preempt_count on {soft,}irq_exit.
|
|
|
|
*/
|
2010-10-04 17:03:19 -07:00
|
|
|
void account_system_vtime(struct task_struct *curr)
|
|
|
|
{
|
|
|
|
unsigned long flags;
|
2010-12-09 14:15:34 +01:00
|
|
|
s64 delta;
|
2010-10-04 17:03:19 -07:00
|
|
|
int cpu;
|
|
|
|
|
|
|
|
if (!sched_clock_irqtime)
|
|
|
|
return;
|
|
|
|
|
|
|
|
local_irq_save(flags);
|
|
|
|
|
|
|
|
cpu = smp_processor_id();
|
2010-12-09 14:15:34 +01:00
|
|
|
delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
|
|
|
|
__this_cpu_add(irq_start_time, delta);
|
|
|
|
|
2010-12-09 14:15:34 +01:00
|
|
|
irq_time_write_begin();
|
2010-10-04 17:03:19 -07:00
|
|
|
/*
|
|
|
|
* We do not account for softirq time from ksoftirqd here.
|
|
|
|
* We want to continue accounting softirq time to ksoftirqd thread
|
|
|
|
* in that case, so as not to confuse scheduler with a special task
|
|
|
|
* that do not consume any time, but still wants to run.
|
|
|
|
*/
|
|
|
|
if (hardirq_count())
|
2010-12-09 14:15:34 +01:00
|
|
|
__this_cpu_add(cpu_hardirq_time, delta);
|
2010-12-21 17:09:00 -08:00
|
|
|
else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
|
2010-12-09 14:15:34 +01:00
|
|
|
__this_cpu_add(cpu_softirq_time, delta);
|
2010-10-04 17:03:19 -07:00
|
|
|
|
2010-12-09 14:15:34 +01:00
|
|
|
irq_time_write_end();
|
2010-10-04 17:03:19 -07:00
|
|
|
local_irq_restore(flags);
|
|
|
|
}
|
2010-10-18 20:00:37 +02:00
|
|
|
EXPORT_SYMBOL_GPL(account_system_vtime);
|
2010-10-04 17:03:19 -07:00
|
|
|
|
2011-07-11 15:28:17 -04:00
|
|
|
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
|
|
|
|
|
|
|
|
#ifdef CONFIG_PARAVIRT
|
|
|
|
/*
 * Convert @steal nanoseconds into a whole number of ticks (TICK_NSEC each).
 *
 * Values above one second take the real division; smaller values use the
 * iterative divide helper.
 */
static inline u64 steal_ticks(u64 steal)
{
	u64 ticks;

	if (unlikely(steal > NSEC_PER_SEC))
		ticks = div_u64(steal, TICK_NSEC);
	else
		ticks = __iter_div_u64_rem(steal, TICK_NSEC, &steal);

	return ticks;
}
|
|
|
|
#endif
|
|
|
|
|
2010-12-09 14:15:34 +01:00
|
|
|
/*
 * Advance rq->clock_task by @delta nanoseconds, minus whatever part of
 * @delta was spent in irq context (CONFIG_IRQ_TIME_ACCOUNTING) or was stolen
 * from this CPU (CONFIG_PARAVIRT_TIME_ACCOUNTING).
 *
 * The deducted time is reported to sched_rt_avg_update() when the
 * NONTASK_POWER scheduler feature is enabled.
 */
static void update_rq_clock_task(struct rq *rq, s64 delta)
{
/*
 * In theory, the compiler should just see 0 here, and optimize out the call
 * to sched_rt_avg_update. But I don't trust it...
 */
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
	s64 steal = 0, irq_delta = 0;
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;

	/*
	 * Since irq_time is only updated on {soft,}irq_exit, we might run into
	 * this case when a previous update_rq_clock() happened inside a
	 * {soft,}irq region.
	 *
	 * When this happens, we stop ->clock_task and only update the
	 * prev_irq_time stamp to account for the part that fit, so that a next
	 * update will consume the rest. This ensures ->clock_task is
	 * monotonic.
	 *
	 * It does however cause some slight miss-attribution of {soft,}irq
	 * time, a more accurate solution would be to update the irq_time using
	 * the current rq->clock timestamp, except that would require using
	 * atomic ops.
	 */
	if (irq_delta > delta)
		irq_delta = delta;

	rq->prev_irq_time += irq_delta;
	delta -= irq_delta;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	if (static_key_false((&paravirt_steal_rq_enabled))) {
		u64 st;

		steal = paravirt_steal_clock(cpu_of(rq));
		steal -= rq->prev_steal_time_rq;

		/* Never deduct more than the time that actually passed. */
		if (unlikely(steal > delta))
			steal = delta;

		/* Only whole ticks are consumed; the rest is left for later. */
		st = steal_ticks(steal);
		steal = st * TICK_NSEC;

		rq->prev_steal_time_rq += steal;

		delta -= steal;
	}
#endif

	rq->clock_task += delta;

#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
	if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
		sched_rt_avg_update(rq, irq_delta + steal);
#endif
}
|
|
|
|
|
2011-07-11 15:28:18 -04:00
|
|
|
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
|
2010-12-21 17:09:03 -08:00
|
|
|
static int irqtime_account_hi_update(void)
|
|
|
|
{
|
2011-11-28 14:45:17 -02:00
|
|
|
u64 *cpustat = kcpustat_this_cpu->cpustat;
|
2010-12-21 17:09:03 -08:00
|
|
|
unsigned long flags;
|
|
|
|
u64 latest_ns;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
local_irq_save(flags);
|
|
|
|
latest_ns = this_cpu_read(cpu_hardirq_time);
|
2011-12-19 19:23:15 +01:00
|
|
|
if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
|
2010-12-21 17:09:03 -08:00
|
|
|
ret = 1;
|
|
|
|
local_irq_restore(flags);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int irqtime_account_si_update(void)
|
|
|
|
{
|
2011-11-28 14:45:17 -02:00
|
|
|
u64 *cpustat = kcpustat_this_cpu->cpustat;
|
2010-12-21 17:09:03 -08:00
|
|
|
unsigned long flags;
|
|
|
|
u64 latest_ns;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
local_irq_save(flags);
|
|
|
|
latest_ns = this_cpu_read(cpu_softirq_time);
|
2011-12-19 19:23:15 +01:00
|
|
|
if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
|
2010-12-21 17:09:03 -08:00
|
|
|
ret = 1;
|
|
|
|
local_irq_restore(flags);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2010-12-09 14:15:34 +01:00
|
|
|
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
|
2010-10-04 17:03:21 -07:00
|
|
|
|
2010-12-21 17:09:03 -08:00
|
|
|
/* !CONFIG_IRQ_TIME_ACCOUNTING: sched_clock_irqtime is a compile-time 0. */
#define sched_clock_irqtime (0)
|
|
|
|
|
2011-07-11 15:28:18 -04:00
|
|
|
#endif
|
2010-10-04 17:03:19 -07:00
|
|
|
|
2010-09-22 13:53:15 +02:00
|
|
|
void sched_set_stop_task(int cpu, struct task_struct *stop)
|
|
|
|
{
|
|
|
|
struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
|
|
|
|
struct task_struct *old_stop = cpu_rq(cpu)->stop;
|
|
|
|
|
|
|
|
if (stop) {
|
|
|
|
/*
|
|
|
|
* Make it appear like a SCHED_FIFO task, its something
|
|
|
|
* userspace knows about and won't get confused about.
|
|
|
|
*
|
|
|
|
* Also, it will make PI more or less work without too
|
|
|
|
* much confusion -- but then, stop work should not
|
|
|
|
* rely on PI working anyway.
|
|
|
|
*/
|
|
|
|
sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m);
|
|
|
|
|
|
|
|
stop->sched_class = &stop_sched_class;
|
|
|
|
}
|
|
|
|
|
|
|
|
cpu_rq(cpu)->stop = stop;
|
|
|
|
|
|
|
|
if (old_stop) {
|
|
|
|
/*
|
|
|
|
* Reset it back to a normal scheduling class so that
|
|
|
|
* it can die in pieces.
|
|
|
|
*/
|
|
|
|
old_stop->sched_class = &rt_sched_class;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
/*
|
2007-07-09 18:51:59 +02:00
|
|
|
* __normal_prio - return the priority that is based on the static prio
|
2007-07-09 18:51:59 +02:00
|
|
|
*/
|
|
|
|
static inline int __normal_prio(struct task_struct *p)
|
|
|
|
{
|
2007-07-09 18:51:59 +02:00
|
|
|
return p->static_prio;
|
2007-07-09 18:51:59 +02:00
|
|
|
}
|
|
|
|
|
2006-06-27 02:54:51 -07:00
|
|
|
/*
|
|
|
|
* Calculate the expected normal priority: i.e. priority
|
|
|
|
* without taking RT-inheritance into account. Might be
|
|
|
|
* boosted by interactivity modifiers. Changes upon fork,
|
|
|
|
* setprio syscalls, and whenever the interactivity
|
|
|
|
* estimator recalculates.
|
|
|
|
*/
|
2006-07-03 00:25:41 -07:00
|
|
|
static inline int normal_prio(struct task_struct *p)
|
2006-06-27 02:54:51 -07:00
|
|
|
{
|
|
|
|
int prio;
|
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
if (task_has_rt_policy(p))
|
2006-06-27 02:54:51 -07:00
|
|
|
prio = MAX_RT_PRIO-1 - p->rt_priority;
|
|
|
|
else
|
|
|
|
prio = __normal_prio(p);
|
|
|
|
return prio;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Calculate the current priority, i.e. the priority
|
|
|
|
* taken into account by the scheduler. This value might
|
|
|
|
* be boosted by RT tasks, or might be boosted by
|
|
|
|
* interactivity modifiers. Will be RT if the task got
|
|
|
|
* RT-boosted. If not then it returns p->normal_prio.
|
|
|
|
*/
|
2006-07-03 00:25:41 -07:00
|
|
|
static int effective_prio(struct task_struct *p)
|
2006-06-27 02:54:51 -07:00
|
|
|
{
|
|
|
|
p->normal_prio = normal_prio(p);
|
|
|
|
/*
|
|
|
|
* If we are RT tasks or we were boosted to RT priority,
|
|
|
|
* keep the priority unchanged. Otherwise, update priority
|
|
|
|
* to the normal priority:
|
|
|
|
*/
|
|
|
|
if (!rt_prio(p->prio))
|
|
|
|
return p->normal_prio;
|
|
|
|
return p->prio;
|
|
|
|
}
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/**
 * task_curr - is this task currently executing on a CPU?
 * @p: the task in question.
 *
 * Return: nonzero if @p is the current task of the CPU that task_cpu(@p)
 * reports, 0 otherwise.
 */
inline int task_curr(const struct task_struct *p)
{
	return cpu_curr(task_cpu(p)) == p;
}
|
|
|
|
|
2008-01-25 21:08:22 +01:00
|
|
|
static inline void check_class_changed(struct rq *rq, struct task_struct *p,
|
|
|
|
const struct sched_class *prev_class,
|
2011-01-17 17:03:27 +01:00
|
|
|
int oldprio)
|
2008-01-25 21:08:22 +01:00
|
|
|
{
|
|
|
|
if (prev_class != p->sched_class) {
|
|
|
|
if (prev_class->switched_from)
|
2011-01-17 17:03:27 +01:00
|
|
|
prev_class->switched_from(rq, p);
|
|
|
|
p->sched_class->switched_to(rq, p);
|
|
|
|
} else if (oldprio != p->prio)
|
|
|
|
p->sched_class->prio_changed(rq, p, oldprio);
|
2008-01-25 21:08:22 +01:00
|
|
|
}
|
|
|
|
|
2011-10-25 10:00:11 +02:00
|
|
|
void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
|
2010-10-31 12:37:04 +01:00
|
|
|
{
|
|
|
|
const struct sched_class *class;
|
|
|
|
|
|
|
|
if (p->sched_class == rq->curr->sched_class) {
|
|
|
|
rq->curr->sched_class->check_preempt_curr(rq, p, flags);
|
|
|
|
} else {
|
|
|
|
for_each_class(class) {
|
|
|
|
if (class == rq->curr->sched_class)
|
|
|
|
break;
|
|
|
|
if (class == p->sched_class) {
|
|
|
|
resched_task(rq->curr);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* A queue event has occurred, and we're going to schedule. In
|
|
|
|
* this case, we can save a useless back to back clock update.
|
|
|
|
*/
|
2011-04-05 17:23:44 +02:00
|
|
|
if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
|
2010-10-31 12:37:04 +01:00
|
|
|
rq->skip_clock_update = 1;
|
|
|
|
}
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
#ifdef CONFIG_SMP
|
2007-07-09 18:51:59 +02:00
|
|
|
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
|
2007-07-09 18:51:58 +02:00
|
|
|
{
|
2009-12-16 18:04:36 +01:00
|
|
|
#ifdef CONFIG_SCHED_DEBUG
|
|
|
|
/*
|
|
|
|
* We should never call set_task_cpu() on a blocked task,
|
|
|
|
* ttwu() will sort out the placement.
|
|
|
|
*/
|
2009-12-17 13:16:31 +01:00
|
|
|
WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
|
|
|
|
!(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
|
2011-04-05 17:23:51 +02:00
|
|
|
|
|
|
|
#ifdef CONFIG_LOCKDEP
|
2011-06-03 17:37:07 +02:00
|
|
|
/*
|
|
|
|
* The caller should hold either p->pi_lock or rq->lock, when changing
|
|
|
|
* a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
|
|
|
|
*
|
|
|
|
* sched_move_task() holds both and thus holding either pins the cgroup,
|
2012-06-22 13:36:05 +02:00
|
|
|
* see task_group().
|
2011-06-03 17:37:07 +02:00
|
|
|
*
|
|
|
|
* Furthermore, all task_rq users should acquire both locks, see
|
|
|
|
* task_rq_lock().
|
|
|
|
*/
|
2011-04-05 17:23:51 +02:00
|
|
|
WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
|
|
|
|
lockdep_is_held(&task_rq(p)->lock)));
|
|
|
|
#endif
|
2009-12-16 18:04:36 +01:00
|
|
|
#endif
|
|
|
|
|
2009-05-05 16:49:59 +08:00
|
|
|
trace_sched_migrate_task(p, new_cpu);
|
2008-12-10 08:08:22 +01:00
|
|
|
|
2009-12-22 15:43:19 +01:00
|
|
|
if (task_cpu(p) != new_cpu) {
|
|
|
|
p->se.nr_migrations++;
|
2011-06-27 14:41:57 +02:00
|
|
|
perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
|
2009-12-22 15:43:19 +01:00
|
|
|
}
|
2007-07-09 18:51:59 +02:00
|
|
|
|
|
|
|
__set_task_cpu(p, new_cpu);
|
2007-07-09 18:51:58 +02:00
|
|
|
}
|
|
|
|
|
2010-05-06 18:49:21 +02:00
|
|
|
/*
 * A task together with a destination CPU.
 * NOTE(review): presumably the argument handed to migration_cpu_stop()
 * through its void *data parameter -- confirm against the callers.
 */
struct migration_arg {
	struct task_struct *task;	/* task to act on */
	int dest_cpu;			/* destination CPU for @task */
};
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2010-05-06 18:49:21 +02:00
|
|
|
static int migration_cpu_stop(void *data);
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
|
|
|
* wait_task_inactive - wait for a thread to unschedule.
|
|
|
|
*
|
2008-07-25 19:45:58 -07:00
|
|
|
* If @match_state is nonzero, it's the @p->state value just checked and
|
|
|
|
* not expected to change. If it changes, i.e. @p might have woken up,
|
|
|
|
* then return zero. When we succeed in waiting for @p to be off its CPU,
|
|
|
|
* we return a positive number (its total switch count). If a second call
|
|
|
|
* a short while later returns the same number, the caller can be sure that
|
|
|
|
* @p has remained unscheduled the whole time.
|
|
|
|
*
|
2005-04-16 15:20:36 -07:00
|
|
|
* The caller must ensure that the task *will* unschedule sometime soon,
|
|
|
|
* else this function might spin for a *long* time. This function can't
|
|
|
|
* be called with interrupts off, or it may introduce deadlock with
|
|
|
|
* smp_call_function() if an IPI is sent by the same process we are
|
|
|
|
* waiting to become inactive.
|
|
|
|
*/
|
2008-07-25 19:45:58 -07:00
|
|
|
unsigned long wait_task_inactive(struct task_struct *p, long match_state)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
|
|
|
unsigned long flags;
|
2007-07-09 18:51:59 +02:00
|
|
|
int running, on_rq;
|
2008-07-25 19:45:58 -07:00
|
|
|
unsigned long ncsw;
|
2006-07-03 00:25:42 -07:00
|
|
|
struct rq *rq;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2007-10-15 17:00:14 +02:00
|
|
|
for (;;) {
|
|
|
|
/*
|
|
|
|
* We do the initial early heuristics without holding
|
|
|
|
* any task-queue locks at all. We'll only try to get
|
|
|
|
* the runqueue lock when things look like they will
|
|
|
|
* work out!
|
|
|
|
*/
|
|
|
|
rq = task_rq(p);
|
Fix possible runqueue lock starvation in wait_task_inactive()
Miklos Szeredi reported very long pauses (several seconds, sometimes
more) on his T60 (with a Core2Duo) which he managed to track down to
wait_task_inactive()'s open-coded busy-loop.
He observed that an interrupt on one core tries to acquire the
runqueue-lock but does not succeed in doing so for a very long time -
while wait_task_inactive() on the other core loops waiting for the first
core to deschedule a task (which it wont do while spinning in an
interrupt handler).
This rewrites wait_task_inactive() to do all its waiting optimistically
without any locks taken at all, and then just double-check the end
result with the proper runqueue lock held over just a very short
section. If there were races in the optimistic wait, of a preemption
event scheduled the process away, we simply re-synchronize, and start
over.
So the code now looks like this:
repeat:
/* Unlocked, optimistic looping! */
rq = task_rq(p);
while (task_running(rq, p))
cpu_relax();
/* Get the *real* values */
rq = task_rq_lock(p, &flags);
running = task_running(rq, p);
array = p->array;
task_rq_unlock(rq, &flags);
/* Check them.. */
if (unlikely(running)) {
cpu_relax();
goto repeat;
}
/* Preempted away? Yield if so.. */
if (unlikely(array)) {
yield();
goto repeat;
}
Basically, that first "while()" loop is done entirely without any
locking at all (and doesn't check for the case where the target process
might have been preempted away), and so it's possibly "incorrect", but
we don't really care. Both the runqueue used, and the "task_running()"
check might be the wrong tests, but they won't oops - they just mean
that we could possibly get the wrong results due to lack of locking and
exit the loop early in the case of a race condition.
So once we've exited the loop, we then get the proper (and careful) rq
lock, and check the running/runnable state _safely_. And if it turns
out that our quick-and-dirty and unsafe loop was wrong after all, we
just go back and try it all again.
(The patch also adds a lot of comments, which is the actual bulk of it
all, to make it more obvious why we can do these things without holding
the locks).
Thanks to Miklos for all the testing and tracking it down.
Tested-by: Miklos Szeredi <miklos@szeredi.hu>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-06-18 09:34:40 -07:00
|
|
|
|
2007-10-15 17:00:14 +02:00
|
|
|
/*
|
|
|
|
* If the task is actively running on another CPU
|
|
|
|
* still, just relax and busy-wait without holding
|
|
|
|
* any locks.
|
|
|
|
*
|
|
|
|
* NOTE! Since we don't hold any locks, it's not
|
|
|
|
* even sure that "rq" stays as the right runqueue!
|
|
|
|
* But we don't care, since "task_running()" will
|
|
|
|
* return false if the runqueue has changed and p
|
|
|
|
* is actually now running somewhere else!
|
|
|
|
*/
|
2008-07-25 19:45:58 -07:00
|
|
|
while (task_running(rq, p)) {
|
|
|
|
if (match_state && unlikely(p->state != match_state))
|
|
|
|
return 0;
|
2007-10-15 17:00:14 +02:00
|
|
|
cpu_relax();
|
2008-07-25 19:45:58 -07:00
|
|
|
}
|
Fix possible runqueue lock starvation in wait_task_inactive()
Miklos Szeredi reported very long pauses (several seconds, sometimes
more) on his T60 (with a Core2Duo) which he managed to track down to
wait_task_inactive()'s open-coded busy-loop.
He observed that an interrupt on one core tries to acquire the
runqueue-lock but does not succeed in doing so for a very long time -
while wait_task_inactive() on the other core loops waiting for the first
core to deschedule a task (which it wont do while spinning in an
interrupt handler).
This rewrites wait_task_inactive() to do all its waiting optimistically
without any locks taken at all, and then just double-check the end
result with the proper runqueue lock held over just a very short
section. If there were races in the optimistic wait, of a preemption
event scheduled the process away, we simply re-synchronize, and start
over.
So the code now looks like this:
repeat:
/* Unlocked, optimistic looping! */
rq = task_rq(p);
while (task_running(rq, p))
cpu_relax();
/* Get the *real* values */
rq = task_rq_lock(p, &flags);
running = task_running(rq, p);
array = p->array;
task_rq_unlock(rq, &flags);
/* Check them.. */
if (unlikely(running)) {
cpu_relax();
goto repeat;
}
/* Preempted away? Yield if so.. */
if (unlikely(array)) {
yield();
goto repeat;
}
Basically, that first "while()" loop is done entirely without any
locking at all (and doesn't check for the case where the target process
might have been preempted away), and so it's possibly "incorrect", but
we don't really care. Both the runqueue used, and the "task_running()"
check might be the wrong tests, but they won't oops - they just mean
that we could possibly get the wrong results due to lack of locking and
exit the loop early in the case of a race condition.
So once we've exited the loop, we then get the proper (and careful) rq
lock, and check the running/runnable state _safely_. And if it turns
out that our quick-and-dirty and unsafe loop was wrong after all, we
just go back and try it all again.
(The patch also adds a lot of comments, which is the actual bulk of it
all, to make it more obvious why we can do these things without holding
the locks).
Thanks to Miklos for all the testing and tracking it down.
Tested-by: Miklos Szeredi <miklos@szeredi.hu>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-06-18 09:34:40 -07:00
|
|
|
|
2007-10-15 17:00:14 +02:00
|
|
|
/*
|
|
|
|
* Ok, time to look more closely! We need the rq
|
|
|
|
* lock now, to be *sure*. If we're wrong, we'll
|
|
|
|
* just go back and repeat.
|
|
|
|
*/
|
|
|
|
rq = task_rq_lock(p, &flags);
|
2010-05-04 20:36:56 +02:00
|
|
|
trace_sched_wait_task(p);
|
2007-10-15 17:00:14 +02:00
|
|
|
running = task_running(rq, p);
|
2011-04-05 17:23:44 +02:00
|
|
|
on_rq = p->on_rq;
|
2008-07-25 19:45:58 -07:00
|
|
|
ncsw = 0;
|
2008-08-20 16:54:44 -07:00
|
|
|
if (!match_state || p->state == match_state)
|
2008-08-20 16:54:44 -07:00
|
|
|
ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
|
2011-04-05 17:23:51 +02:00
|
|
|
task_rq_unlock(rq, p, &flags);
|
Fix possible runqueue lock starvation in wait_task_inactive()
Miklos Szeredi reported very long pauses (several seconds, sometimes
more) on his T60 (with a Core2Duo) which he managed to track down to
wait_task_inactive()'s open-coded busy-loop.
He observed that an interrupt on one core tries to acquire the
runqueue-lock but does not succeed in doing so for a very long time -
while wait_task_inactive() on the other core loops waiting for the first
core to deschedule a task (which it wont do while spinning in an
interrupt handler).
This rewrites wait_task_inactive() to do all its waiting optimistically
without any locks taken at all, and then just double-check the end
result with the proper runqueue lock held over just a very short
section. If there were races in the optimistic wait, of a preemption
event scheduled the process away, we simply re-synchronize, and start
over.
So the code now looks like this:
repeat:
/* Unlocked, optimistic looping! */
rq = task_rq(p);
while (task_running(rq, p))
cpu_relax();
/* Get the *real* values */
rq = task_rq_lock(p, &flags);
running = task_running(rq, p);
array = p->array;
task_rq_unlock(rq, &flags);
/* Check them.. */
if (unlikely(running)) {
cpu_relax();
goto repeat;
}
/* Preempted away? Yield if so.. */
if (unlikely(array)) {
yield();
goto repeat;
}
Basically, that first "while()" loop is done entirely without any
locking at all (and doesn't check for the case where the target process
might have been preempted away), and so it's possibly "incorrect", but
we don't really care. Both the runqueue used, and the "task_running()"
check might be the wrong tests, but they won't oops - they just mean
that we could possibly get the wrong results due to lack of locking and
exit the loop early in the case of a race condition.
So once we've exited the loop, we then get the proper (and careful) rq
lock, and check the running/runnable state _safely_. And if it turns
out that our quick-and-dirty and unsafe loop was wrong after all, we
just go back and try it all again.
(The patch also adds a lot of comments, which is the actual bulk of it
all, to make it more obvious why we can do these things without holding
the locks).
Thanks to Miklos for all the testing and tracking it down.
Tested-by: Miklos Szeredi <miklos@szeredi.hu>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-06-18 09:34:40 -07:00
|
|
|
|
2008-07-25 19:45:58 -07:00
|
|
|
/*
|
|
|
|
* If it changed from the expected state, bail out now.
|
|
|
|
*/
|
|
|
|
if (unlikely(!ncsw))
|
|
|
|
break;
|
|
|
|
|
2007-10-15 17:00:14 +02:00
|
|
|
/*
|
|
|
|
* Was it really running after all now that we
|
|
|
|
* checked with the proper locks actually held?
|
|
|
|
*
|
|
|
|
* Oops. Go back and try again..
|
|
|
|
*/
|
|
|
|
if (unlikely(running)) {
|
|
|
|
cpu_relax();
|
|
|
|
continue;
|
|
|
|
}
|
Fix possible runqueue lock starvation in wait_task_inactive()
Miklos Szeredi reported very long pauses (several seconds, sometimes
more) on his T60 (with a Core2Duo) which he managed to track down to
wait_task_inactive()'s open-coded busy-loop.
He observed that an interrupt on one core tries to acquire the
runqueue-lock but does not succeed in doing so for a very long time -
while wait_task_inactive() on the other core loops waiting for the first
core to deschedule a task (which it wont do while spinning in an
interrupt handler).
This rewrites wait_task_inactive() to do all its waiting optimistically
without any locks taken at all, and then just double-check the end
result with the proper runqueue lock held over just a very short
section. If there were races in the optimistic wait, of a preemption
event scheduled the process away, we simply re-synchronize, and start
over.
So the code now looks like this:
repeat:
/* Unlocked, optimistic looping! */
rq = task_rq(p);
while (task_running(rq, p))
cpu_relax();
/* Get the *real* values */
rq = task_rq_lock(p, &flags);
running = task_running(rq, p);
array = p->array;
task_rq_unlock(rq, &flags);
/* Check them.. */
if (unlikely(running)) {
cpu_relax();
goto repeat;
}
/* Preempted away? Yield if so.. */
if (unlikely(array)) {
yield();
goto repeat;
}
Basically, that first "while()" loop is done entirely without any
locking at all (and doesn't check for the case where the target process
might have been preempted away), and so it's possibly "incorrect", but
we don't really care. Both the runqueue used, and the "task_running()"
check might be the wrong tests, but they won't oops - they just mean
that we could possibly get the wrong results due to lack of locking and
exit the loop early in the case of a race condition.
So once we've exited the loop, we then get the proper (and careful) rq
lock, and check the running/runnable state _safely_. And if it turns
out that our quick-and-dirty and unsafe loop was wrong after all, we
just go back and try it all again.
(The patch also adds a lot of comments, which is the actual bulk of it
all, to make it more obvious why we can do these things without holding
the locks).
Thanks to Miklos for all the testing and tracking it down.
Tested-by: Miklos Szeredi <miklos@szeredi.hu>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-06-18 09:34:40 -07:00
|
|
|
|
2007-10-15 17:00:14 +02:00
|
|
|
/*
|
|
|
|
* It's not enough that it's not actively running,
|
|
|
|
* it must be off the runqueue _entirely_, and not
|
|
|
|
* preempted!
|
|
|
|
*
|
2009-03-16 19:58:09 +00:00
|
|
|
* So if it was still runnable (but just not actively
|
2007-10-15 17:00:14 +02:00
|
|
|
* running right now), it's preempted, and we should
|
|
|
|
* yield - it could be a while.
|
|
|
|
*/
|
|
|
|
if (unlikely(on_rq)) {
|
2011-02-23 23:52:21 +00:00
|
|
|
ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
|
|
|
|
|
|
|
|
set_current_state(TASK_UNINTERRUPTIBLE);
|
|
|
|
schedule_hrtimeout(&to, HRTIMER_MODE_REL);
|
2007-10-15 17:00:14 +02:00
|
|
|
continue;
|
|
|
|
}
|
Fix possible runqueue lock starvation in wait_task_inactive()
Miklos Szeredi reported very long pauses (several seconds, sometimes
more) on his T60 (with a Core2Duo) which he managed to track down to
wait_task_inactive()'s open-coded busy-loop.
He observed that an interrupt on one core tries to acquire the
runqueue-lock but does not succeed in doing so for a very long time -
while wait_task_inactive() on the other core loops waiting for the first
core to deschedule a task (which it wont do while spinning in an
interrupt handler).
This rewrites wait_task_inactive() to do all its waiting optimistically
without any locks taken at all, and then just double-check the end
result with the proper runqueue lock held over just a very short
section. If there were races in the optimistic wait, of a preemption
event scheduled the process away, we simply re-synchronize, and start
over.
So the code now looks like this:
repeat:
/* Unlocked, optimistic looping! */
rq = task_rq(p);
while (task_running(rq, p))
cpu_relax();
/* Get the *real* values */
rq = task_rq_lock(p, &flags);
running = task_running(rq, p);
array = p->array;
task_rq_unlock(rq, &flags);
/* Check them.. */
if (unlikely(running)) {
cpu_relax();
goto repeat;
}
/* Preempted away? Yield if so.. */
if (unlikely(array)) {
yield();
goto repeat;
}
Basically, that first "while()" loop is done entirely without any
locking at all (and doesn't check for the case where the target process
might have been preempted away), and so it's possibly "incorrect", but
we don't really care. Both the runqueue used, and the "task_running()"
check might be the wrong tests, but they won't oops - they just mean
that we could possibly get the wrong results due to lack of locking and
exit the loop early in the case of a race condition.
So once we've exited the loop, we then get the proper (and careful) rq
lock, and check the running/runnable state _safely_. And if it turns
out that our quick-and-dirty and unsafe loop was wrong after all, we
just go back and try it all again.
(The patch also adds a lot of comments, which is the actual bulk of it
all, to make it more obvious why we can do these things without holding
the locks).
Thanks to Miklos for all the testing and tracking it down.
Tested-by: Miklos Szeredi <miklos@szeredi.hu>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-06-18 09:34:40 -07:00
|
|
|
|
2007-10-15 17:00:14 +02:00
|
|
|
/*
|
|
|
|
* Ahh, all good. It wasn't running, and it wasn't
|
|
|
|
* runnable, which means that it will never become
|
|
|
|
* running in the future either. We're all done!
|
|
|
|
*/
|
|
|
|
break;
|
|
|
|
}
|
2008-07-25 19:45:58 -07:00
|
|
|
|
|
|
|
return ncsw;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
 * kick_process - kick a running thread to enter/exit the kernel
 * @p: the to-be-kicked thread
 *
 * Cause a process which is running on another CPU to enter
 * kernel-mode, without any delay. (to get signals handled.)
 *
 * Nothing is sent when @p's CPU is the local CPU or when @p is not
 * the task currently running on its CPU.
 *
 * NOTE: this function doesn't have to take the runqueue lock,
 * because all it wants to ensure is that the remote task enters
 * the kernel. If the IPI races and the task has been migrated
 * to another CPU then no harm is done and the purpose has been
 * achieved as well.
 */
void kick_process(struct task_struct *p)
{
	int cpu;

	/* Keep smp_processor_id() stable across the check and the IPI. */
	preempt_disable();
	cpu = task_cpu(p);
	if ((cpu != smp_processor_id()) && task_curr(p))
		smp_send_reschedule(cpu);
	preempt_enable();
}
EXPORT_SYMBOL_GPL(kick_process);
|
2005-06-25 14:57:29 -07:00
|
|
|
#endif /* CONFIG_SMP */
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2009-11-25 13:31:39 +01:00
|
|
|
#ifdef CONFIG_SMP
|
2010-03-15 10:10:19 +01:00
|
|
|
/*
|
2011-04-05 17:23:45 +02:00
|
|
|
* ->cpus_allowed is protected by both rq->lock and p->pi_lock
|
2010-03-15 10:10:19 +01:00
|
|
|
*/
|
2009-12-16 18:04:38 +01:00
|
|
|
static int select_fallback_rq(int cpu, struct task_struct *p)
|
|
|
|
{
|
|
|
|
const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
|
sched: Fix select_fallback_rq() vs cpu_active/cpu_online
Commit 5fbd036b55 ("sched: Cleanup cpu_active madness"), which was
supposed to finally sort the cpu_active mess, instead uncovered more.
Since CPU_STARTING is ran before setting the cpu online, there's a
(small) window where the cpu has active,!online.
If during this time there's a wakeup of a task that used to reside on
that cpu select_task_rq() will use select_fallback_rq() to compute an
alternative cpu to run on since we find !online.
select_fallback_rq() however will compute the new cpu against
cpu_active, this means that it can return the same cpu it started out
with, the !online one, since that cpu is in fact marked active.
This results in us trying to scheduling a task on an offline cpu and
triggering a WARN in the IPI code.
The solution proposed by Chuansheng Liu of setting cpu_active in
set_cpu_online() is buggy, firstly not all archs actually use
set_cpu_online(), secondly, not all archs call set_cpu_online() with
IRQs disabled, this means we would introduce either the same race or
the race from fd8a7de17 ("x86: cpu-hotplug: Prevent softirq wakeup on
wrong CPU") -- albeit much narrower.
[ By setting online first and active later we have a window of
online,!active, fresh and bound kthreads have task_cpu() of 0 and
since cpu0 isn't in tsk_cpus_allowed() we end up in
select_fallback_rq() which excludes !active, resulting in a reset
of ->cpus_allowed and the thread running all over the place. ]
The solution is to re-work select_fallback_rq() to require active
_and_ online. This makes the active,!online case work as expected,
OTOH archs running CPU_STARTING after setting online are now
vulnerable to the issue from fd8a7de17 -- these are alpha and
blackfin.
Reported-by: Chuansheng Liu <chuansheng.liu@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Frysinger <vapier@gentoo.org>
Cc: linux-alpha@vger.kernel.org
Link: http://lkml.kernel.org/n/tip-hubqk1i10o4dpvlm06gq7v6j@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2012-03-20 15:57:01 +01:00
|
|
|
enum { cpuset, possible, fail } state = cpuset;
|
|
|
|
int dest_cpu;
|
2009-12-16 18:04:38 +01:00
|
|
|
|
|
|
|
/* Look for allowed, online CPU in same node. */
|
2012-03-30 19:40:28 +05:30
|
|
|
for_each_cpu(dest_cpu, nodemask) {
|
sched: Fix select_fallback_rq() vs cpu_active/cpu_online
Commit 5fbd036b55 ("sched: Cleanup cpu_active madness"), which was
supposed to finally sort the cpu_active mess, instead uncovered more.
Since CPU_STARTING is run before setting the cpu online, there's a
(small) window where the cpu has active,!online.
If during this time there's a wakeup of a task that used to reside on
that cpu select_task_rq() will use select_fallback_rq() to compute an
alternative cpu to run on since we find !online.
select_fallback_rq() however will compute the new cpu against
cpu_active, this means that it can return the same cpu it started out
with, the !online one, since that cpu is in fact marked active.
This results in us trying to schedule a task on an offline cpu and
triggering a WARN in the IPI code.
The solution proposed by Chuansheng Liu of setting cpu_active in
set_cpu_online() is buggy, firstly not all archs actually use
set_cpu_online(), secondly, not all archs call set_cpu_online() with
IRQs disabled, this means we would introduce either the same race or
the race from fd8a7de17 ("x86: cpu-hotplug: Prevent softirq wakeup on
wrong CPU") -- albeit much narrower.
[ By setting online first and active later we have a window of
online,!active, fresh and bound kthreads have task_cpu() of 0 and
since cpu0 isn't in tsk_cpus_allowed() we end up in
select_fallback_rq() which excludes !active, resulting in a reset
of ->cpus_allowed and the thread running all over the place. ]
The solution is to re-work select_fallback_rq() to require active
_and_ online. This makes the active,!online case work as expected,
OTOH archs running CPU_STARTING after setting online are now
vulnerable to the issue from fd8a7de17 -- these are alpha and
blackfin.
Reported-by: Chuansheng Liu <chuansheng.liu@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Frysinger <vapier@gentoo.org>
Cc: linux-alpha@vger.kernel.org
Link: http://lkml.kernel.org/n/tip-hubqk1i10o4dpvlm06gq7v6j@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2012-03-20 15:57:01 +01:00
|
|
|
if (!cpu_online(dest_cpu))
|
|
|
|
continue;
|
|
|
|
if (!cpu_active(dest_cpu))
|
|
|
|
continue;
|
2011-06-16 12:23:22 +02:00
|
|
|
if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
|
2009-12-16 18:04:38 +01:00
|
|
|
return dest_cpu;
|
sched: Fix select_fallback_rq() vs cpu_active/cpu_online
Commit 5fbd036b55 ("sched: Cleanup cpu_active madness"), which was
supposed to finally sort the cpu_active mess, instead uncovered more.
Since CPU_STARTING is run before setting the cpu online, there's a
(small) window where the cpu has active,!online.
If during this time there's a wakeup of a task that used to reside on
that cpu select_task_rq() will use select_fallback_rq() to compute an
alternative cpu to run on since we find !online.
select_fallback_rq() however will compute the new cpu against
cpu_active, this means that it can return the same cpu it started out
with, the !online one, since that cpu is in fact marked active.
This results in us trying to schedule a task on an offline cpu and
triggering a WARN in the IPI code.
The solution proposed by Chuansheng Liu of setting cpu_active in
set_cpu_online() is buggy, firstly not all archs actually use
set_cpu_online(), secondly, not all archs call set_cpu_online() with
IRQs disabled, this means we would introduce either the same race or
the race from fd8a7de17 ("x86: cpu-hotplug: Prevent softirq wakeup on
wrong CPU") -- albeit much narrower.
[ By setting online first and active later we have a window of
online,!active, fresh and bound kthreads have task_cpu() of 0 and
since cpu0 isn't in tsk_cpus_allowed() we end up in
select_fallback_rq() which excludes !active, resulting in a reset
of ->cpus_allowed and the thread running all over the place. ]
The solution is to re-work select_fallback_rq() to require active
_and_ online. This makes the active,!online case work as expected,
OTOH archs running CPU_STARTING after setting online are now
vulnerable to the issue from fd8a7de17 -- these are alpha and
blackfin.
Reported-by: Chuansheng Liu <chuansheng.liu@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Frysinger <vapier@gentoo.org>
Cc: linux-alpha@vger.kernel.org
Link: http://lkml.kernel.org/n/tip-hubqk1i10o4dpvlm06gq7v6j@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2012-03-20 15:57:01 +01:00
|
|
|
}
|
2009-12-16 18:04:38 +01:00
|
|
|
|
sched: Fix select_fallback_rq() vs cpu_active/cpu_online
Commit 5fbd036b55 ("sched: Cleanup cpu_active madness"), which was
supposed to finally sort the cpu_active mess, instead uncovered more.
Since CPU_STARTING is run before setting the cpu online, there's a
(small) window where the cpu has active,!online.
If during this time there's a wakeup of a task that used to reside on
that cpu select_task_rq() will use select_fallback_rq() to compute an
alternative cpu to run on since we find !online.
select_fallback_rq() however will compute the new cpu against
cpu_active, this means that it can return the same cpu it started out
with, the !online one, since that cpu is in fact marked active.
This results in us trying to schedule a task on an offline cpu and
triggering a WARN in the IPI code.
The solution proposed by Chuansheng Liu of setting cpu_active in
set_cpu_online() is buggy, firstly not all archs actually use
set_cpu_online(), secondly, not all archs call set_cpu_online() with
IRQs disabled, this means we would introduce either the same race or
the race from fd8a7de17 ("x86: cpu-hotplug: Prevent softirq wakeup on
wrong CPU") -- albeit much narrower.
[ By setting online first and active later we have a window of
online,!active, fresh and bound kthreads have task_cpu() of 0 and
since cpu0 isn't in tsk_cpus_allowed() we end up in
select_fallback_rq() which excludes !active, resulting in a reset
of ->cpus_allowed and the thread running all over the place. ]
The solution is to re-work select_fallback_rq() to require active
_and_ online. This makes the active,!online case work as expected,
OTOH archs running CPU_STARTING after setting online are now
vulnerable to the issue from fd8a7de17 -- these are alpha and
blackfin.
Reported-by: Chuansheng Liu <chuansheng.liu@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Frysinger <vapier@gentoo.org>
Cc: linux-alpha@vger.kernel.org
Link: http://lkml.kernel.org/n/tip-hubqk1i10o4dpvlm06gq7v6j@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2012-03-20 15:57:01 +01:00
|
|
|
for (;;) {
|
|
|
|
/* Any allowed, online CPU? */
|
2012-03-30 19:40:28 +05:30
|
|
|
for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
|
sched: Fix select_fallback_rq() vs cpu_active/cpu_online
Commit 5fbd036b55 ("sched: Cleanup cpu_active madness"), which was
supposed to finally sort the cpu_active mess, instead uncovered more.
Since CPU_STARTING is run before setting the cpu online, there's a
(small) window where the cpu has active,!online.
If during this time there's a wakeup of a task that used to reside on
that cpu select_task_rq() will use select_fallback_rq() to compute an
alternative cpu to run on since we find !online.
select_fallback_rq() however will compute the new cpu against
cpu_active, this means that it can return the same cpu it started out
with, the !online one, since that cpu is in fact marked active.
This results in us trying to schedule a task on an offline cpu and
triggering a WARN in the IPI code.
The solution proposed by Chuansheng Liu of setting cpu_active in
set_cpu_online() is buggy, firstly not all archs actually use
set_cpu_online(), secondly, not all archs call set_cpu_online() with
IRQs disabled, this means we would introduce either the same race or
the race from fd8a7de17 ("x86: cpu-hotplug: Prevent softirq wakeup on
wrong CPU") -- albeit much narrower.
[ By setting online first and active later we have a window of
online,!active, fresh and bound kthreads have task_cpu() of 0 and
since cpu0 isn't in tsk_cpus_allowed() we end up in
select_fallback_rq() which excludes !active, resulting in a reset
of ->cpus_allowed and the thread running all over the place. ]
The solution is to re-work select_fallback_rq() to require active
_and_ online. This makes the active,!online case work as expected,
OTOH archs running CPU_STARTING after setting online are now
vulnerable to the issue from fd8a7de17 -- these are alpha and
blackfin.
Reported-by: Chuansheng Liu <chuansheng.liu@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Frysinger <vapier@gentoo.org>
Cc: linux-alpha@vger.kernel.org
Link: http://lkml.kernel.org/n/tip-hubqk1i10o4dpvlm06gq7v6j@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2012-03-20 15:57:01 +01:00
|
|
|
if (!cpu_online(dest_cpu))
|
|
|
|
continue;
|
|
|
|
if (!cpu_active(dest_cpu))
|
|
|
|
continue;
|
|
|
|
goto out;
|
|
|
|
}
|
2009-12-16 18:04:38 +01:00
|
|
|
|
sched: Fix select_fallback_rq() vs cpu_active/cpu_online
Commit 5fbd036b55 ("sched: Cleanup cpu_active madness"), which was
supposed to finally sort the cpu_active mess, instead uncovered more.
Since CPU_STARTING is run before setting the cpu online, there's a
(small) window where the cpu has active,!online.
If during this time there's a wakeup of a task that used to reside on
that cpu select_task_rq() will use select_fallback_rq() to compute an
alternative cpu to run on since we find !online.
select_fallback_rq() however will compute the new cpu against
cpu_active, this means that it can return the same cpu it started out
with, the !online one, since that cpu is in fact marked active.
This results in us trying to schedule a task on an offline cpu and
triggering a WARN in the IPI code.
The solution proposed by Chuansheng Liu of setting cpu_active in
set_cpu_online() is buggy, firstly not all archs actually use
set_cpu_online(), secondly, not all archs call set_cpu_online() with
IRQs disabled, this means we would introduce either the same race or
the race from fd8a7de17 ("x86: cpu-hotplug: Prevent softirq wakeup on
wrong CPU") -- albeit much narrower.
[ By setting online first and active later we have a window of
online,!active, fresh and bound kthreads have task_cpu() of 0 and
since cpu0 isn't in tsk_cpus_allowed() we end up in
select_fallback_rq() which excludes !active, resulting in a reset
of ->cpus_allowed and the thread running all over the place. ]
The solution is to re-work select_fallback_rq() to require active
_and_ online. This makes the active,!online case work as expected,
OTOH archs running CPU_STARTING after setting online are now
vulnerable to the issue from fd8a7de17 -- these are alpha and
blackfin.
Reported-by: Chuansheng Liu <chuansheng.liu@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Frysinger <vapier@gentoo.org>
Cc: linux-alpha@vger.kernel.org
Link: http://lkml.kernel.org/n/tip-hubqk1i10o4dpvlm06gq7v6j@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2012-03-20 15:57:01 +01:00
|
|
|
switch (state) {
|
|
|
|
case cpuset:
|
|
|
|
/* No more Mr. Nice Guy. */
|
|
|
|
cpuset_cpus_allowed_fallback(p);
|
|
|
|
state = possible;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case possible:
|
|
|
|
do_set_cpus_allowed(p, cpu_possible_mask);
|
|
|
|
state = fail;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case fail:
|
|
|
|
BUG();
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
out:
|
|
|
|
if (state != cpuset) {
|
|
|
|
/*
|
|
|
|
* Don't tell them about moving exiting tasks or
|
|
|
|
* kernel threads (both mm NULL), since they never
|
|
|
|
* leave kernel.
|
|
|
|
*/
|
|
|
|
if (p->mm && printk_ratelimit()) {
|
|
|
|
printk_sched("process %d (%s) no longer affine to cpu%d\n",
|
|
|
|
task_pid_nr(p), p->comm, cpu);
|
|
|
|
}
|
2009-12-16 18:04:38 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
return dest_cpu;
|
|
|
|
}
|
|
|
|
|
2009-12-16 18:04:36 +01:00
|
|
|
/*
|
2011-04-05 17:23:45 +02:00
|
|
|
* The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
|
2009-12-16 18:04:36 +01:00
|
|
|
*/
|
2009-11-25 13:31:39 +01:00
|
|
|
static inline
|
2011-04-05 17:23:46 +02:00
|
|
|
int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
|
2009-11-25 13:31:39 +01:00
|
|
|
{
|
2011-04-05 17:23:46 +02:00
|
|
|
int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
|
2009-12-16 18:04:36 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* In order not to call set_task_cpu() on a blocking task we need
|
|
|
|
* to rely on ttwu() to place the task on a valid ->cpus_allowed
|
|
|
|
* cpu.
|
|
|
|
*
|
|
|
|
* Since this is common to all placement strategies, this lives here.
|
|
|
|
*
|
|
|
|
* [ this allows ->select_task() to simply return task_cpu(p) and
|
|
|
|
* not worry about this generic constraint ]
|
|
|
|
*/
|
2011-06-16 12:23:22 +02:00
|
|
|
if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
|
2009-12-20 17:36:27 +01:00
|
|
|
!cpu_online(cpu)))
|
2009-12-16 18:04:38 +01:00
|
|
|
cpu = select_fallback_rq(task_cpu(p), p);
|
2009-12-16 18:04:36 +01:00
|
|
|
|
|
|
|
return cpu;
|
2009-11-25 13:31:39 +01:00
|
|
|
}
|
2010-04-15 07:29:59 +02:00
|
|
|
|
|
|
|
/*
 * Move the running average stored in *@avg towards @sample by one eighth
 * of their signed difference.
 */
static void update_avg(u64 *avg, u64 sample)
{
	s64 delta = sample - *avg;

	*avg = *avg + (delta >> 3);
}
|
2009-11-25 13:31:39 +01:00
|
|
|
#endif
|
|
|
|
|
2011-04-05 17:23:43 +02:00
|
|
|
/*
 * Account a wakeup of @p towards @cpu in the schedstat counters.
 * Without CONFIG_SCHEDSTATS the body is empty.
 */
static void
ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
{
#ifdef CONFIG_SCHEDSTATS
	struct rq *local_rq = this_rq();

#ifdef CONFIG_SMP
	int waker_cpu = smp_processor_id();

	if (cpu != waker_cpu) {
		struct sched_domain *dom;

		schedstat_inc(p, se.statistics.nr_wakeups_remote);

		/*
		 * Charge the remote wakeup to the first domain visited from
		 * the waking cpu whose span contains @cpu.
		 */
		rcu_read_lock();
		for_each_domain(waker_cpu, dom) {
			if (!cpumask_test_cpu(cpu, sched_domain_span(dom)))
				continue;

			schedstat_inc(dom, ttwu_wake_remote);
			break;
		}
		rcu_read_unlock();
	} else {
		schedstat_inc(local_rq, ttwu_local);
		schedstat_inc(p, se.statistics.nr_wakeups_local);
	}

	if (wake_flags & WF_MIGRATED)
		schedstat_inc(p, se.statistics.nr_wakeups_migrate);

#endif /* CONFIG_SMP */

	/* Counters that are kept regardless of SMP. */
	schedstat_inc(local_rq, ttwu_count);
	schedstat_inc(p, se.statistics.nr_wakeups);

	if (wake_flags & WF_SYNC)
		schedstat_inc(p, se.statistics.nr_wakeups_sync);

#endif /* CONFIG_SCHEDSTATS */
}
|
|
|
|
|
|
|
|
static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
|
|
|
|
{
|
2009-12-03 15:08:03 +09:00
|
|
|
activate_task(rq, p, en_flags);
|
2011-04-05 17:23:44 +02:00
|
|
|
p->on_rq = 1;
|
2011-04-13 13:28:56 +02:00
|
|
|
|
|
|
|
/* if a worker is waking up, notify workqueue */
|
|
|
|
if (p->flags & PF_WQ_WORKER)
|
|
|
|
wq_worker_waking_up(p, cpu_of(rq));
|
2009-12-03 15:08:03 +09:00
|
|
|
}
|
|
|
|
|
2011-04-05 17:23:56 +02:00
|
|
|
/*
|
|
|
|
* Mark the task runnable and perform wakeup-preemption.
|
|
|
|
*/
|
2011-04-05 17:23:42 +02:00
|
|
|
static void
|
2011-04-05 17:23:56 +02:00
|
|
|
ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
|
2009-12-03 15:08:03 +09:00
|
|
|
{
|
2011-04-05 17:23:42 +02:00
|
|
|
trace_sched_wakeup(p, true);
|
2009-12-03 15:08:03 +09:00
|
|
|
check_preempt_curr(rq, p, wake_flags);
|
|
|
|
|
|
|
|
p->state = TASK_RUNNING;
|
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
if (p->sched_class->task_woken)
|
|
|
|
p->sched_class->task_woken(rq, p);
|
|
|
|
|
2010-12-06 17:10:31 -05:00
|
|
|
if (rq->idle_stamp) {
|
2009-12-03 15:08:03 +09:00
|
|
|
u64 delta = rq->clock - rq->idle_stamp;
|
|
|
|
u64 max = 2*sysctl_sched_migration_cost;
|
|
|
|
|
|
|
|
if (delta > max)
|
|
|
|
rq->avg_idle = max;
|
|
|
|
else
|
|
|
|
update_avg(&rq->avg_idle, delta);
|
|
|
|
rq->idle_stamp = 0;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2011-04-05 17:23:57 +02:00
|
|
|
static void
|
|
|
|
ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
|
|
|
|
{
|
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
if (p->sched_contributes_to_load)
|
|
|
|
rq->nr_uninterruptible--;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
|
|
|
|
ttwu_do_wakeup(rq, p, wake_flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Called in case the task @p isn't fully descheduled from its runqueue,
|
|
|
|
* in this case we must do a remote wakeup. Its a 'light' wakeup though,
|
|
|
|
* since all we need to do is flip p->state to TASK_RUNNING, since
|
|
|
|
* the task is still ->on_rq.
|
|
|
|
*/
|
|
|
|
static int ttwu_remote(struct task_struct *p, int wake_flags)
|
|
|
|
{
|
|
|
|
struct rq *rq;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
rq = __task_rq_lock(p);
|
|
|
|
if (p->on_rq) {
|
|
|
|
ttwu_do_wakeup(rq, p, wake_flags);
|
|
|
|
ret = 1;
|
|
|
|
}
|
|
|
|
__task_rq_unlock(rq);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2011-04-05 17:23:58 +02:00
|
|
|
#ifdef CONFIG_SMP
|
2011-09-12 13:06:17 +02:00
|
|
|
static void sched_ttwu_pending(void)
|
2011-04-05 17:23:58 +02:00
|
|
|
{
|
|
|
|
struct rq *rq = this_rq();
|
2011-09-12 13:06:17 +02:00
|
|
|
struct llist_node *llist = llist_del_all(&rq->wake_list);
|
|
|
|
struct task_struct *p;
|
2011-04-05 17:23:58 +02:00
|
|
|
|
|
|
|
raw_spin_lock(&rq->lock);
|
|
|
|
|
2011-09-12 13:06:17 +02:00
|
|
|
while (llist) {
|
|
|
|
p = llist_entry(llist, struct task_struct, wake_entry);
|
|
|
|
llist = llist_next(llist);
|
2011-04-05 17:23:58 +02:00
|
|
|
ttwu_do_activate(rq, p, 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
raw_spin_unlock(&rq->lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
void scheduler_ipi(void)
|
|
|
|
{
|
2011-10-03 15:09:00 -07:00
|
|
|
if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
|
2011-07-19 15:07:25 -07:00
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Not all reschedule IPI handlers call irq_enter/irq_exit, since
|
|
|
|
* traditionally all their work was done from the interrupt return
|
|
|
|
* path. Now that we actually do some work, we need to make sure
|
|
|
|
* we do call them.
|
|
|
|
*
|
|
|
|
* Some archs already do call them, luckily irq_enter/exit nest
|
|
|
|
* properly.
|
|
|
|
*
|
|
|
|
* Arguably we should visit all archs and update all handlers,
|
|
|
|
* however a fair share of IPIs are still resched only so this would
|
|
|
|
* somewhat pessimize the simple resched case.
|
|
|
|
*/
|
|
|
|
irq_enter();
|
2011-09-12 13:06:17 +02:00
|
|
|
sched_ttwu_pending();
|
2011-10-03 15:09:00 -07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Check if someone kicked us for doing the nohz idle load balance.
|
|
|
|
*/
|
2011-10-03 15:09:01 -07:00
|
|
|
if (unlikely(got_nohz_idle_kick() && !need_resched())) {
|
|
|
|
this_rq()->idle_balance = 1;
|
2011-10-03 15:09:00 -07:00
|
|
|
raise_softirq_irqoff(SCHED_SOFTIRQ);
|
2011-10-03 15:09:01 -07:00
|
|
|
}
|
2011-07-19 15:07:25 -07:00
|
|
|
irq_exit();
|
2011-04-05 17:23:58 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
static void ttwu_queue_remote(struct task_struct *p, int cpu)
|
|
|
|
{
|
2011-09-12 13:06:17 +02:00
|
|
|
if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
|
2011-04-05 17:23:58 +02:00
|
|
|
smp_send_reschedule(cpu);
|
|
|
|
}
|
2011-05-26 14:21:33 +02:00
|
|
|
|
|
|
|
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
|
|
|
|
static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
|
|
|
|
{
|
|
|
|
struct rq *rq;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
rq = __task_rq_lock(p);
|
|
|
|
if (p->on_cpu) {
|
|
|
|
ttwu_activate(rq, p, ENQUEUE_WAKEUP);
|
|
|
|
ttwu_do_wakeup(rq, p, wake_flags);
|
|
|
|
ret = 1;
|
|
|
|
}
|
|
|
|
__task_rq_unlock(rq);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
}
|
|
|
|
#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
|
2011-12-07 15:07:31 +01:00
|
|
|
|
2012-01-26 12:44:34 +01:00
|
|
|
bool cpus_share_cache(int this_cpu, int that_cpu)
|
2011-12-07 15:07:31 +01:00
|
|
|
{
|
|
|
|
return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
|
|
|
|
}
|
2011-05-26 14:21:33 +02:00
|
|
|
#endif /* CONFIG_SMP */
|
2011-04-05 17:23:58 +02:00
|
|
|
|
2011-04-05 17:23:57 +02:00
|
|
|
static void ttwu_queue(struct task_struct *p, int cpu)
|
|
|
|
{
|
|
|
|
struct rq *rq = cpu_rq(cpu);
|
|
|
|
|
2011-05-20 04:01:10 +00:00
|
|
|
#if defined(CONFIG_SMP)
|
2012-01-26 12:44:34 +01:00
|
|
|
if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
|
2011-05-31 12:26:55 +02:00
|
|
|
sched_clock_cpu(cpu); /* sync clocks x-cpu */
|
2011-04-05 17:23:58 +02:00
|
|
|
ttwu_queue_remote(p, cpu);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2011-04-05 17:23:57 +02:00
|
|
|
raw_spin_lock(&rq->lock);
|
|
|
|
ttwu_do_activate(rq, p, 0);
|
|
|
|
raw_spin_unlock(&rq->lock);
|
2009-12-03 15:08:03 +09:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
2005-04-16 15:20:36 -07:00
|
|
|
* try_to_wake_up - wake up a thread
|
2009-12-03 15:08:03 +09:00
|
|
|
* @p: the thread to be awakened
|
2005-04-16 15:20:36 -07:00
|
|
|
* @state: the mask of task states that can be woken
|
2009-12-03 15:08:03 +09:00
|
|
|
* @wake_flags: wake modifier flags (WF_*)
|
2005-04-16 15:20:36 -07:00
|
|
|
*
|
|
|
|
* Put it on the run-queue if it's not already there. The "current"
|
|
|
|
* thread is always on the run-queue (except when the actual
|
|
|
|
* re-schedule is in progress), and as such you're allowed to do
|
|
|
|
* the simpler "current->state = TASK_RUNNING" to mark yourself
|
|
|
|
* runnable without the overhead of this.
|
|
|
|
*
|
2009-12-03 15:08:03 +09:00
|
|
|
* Returns %true if @p was woken up, %false if it was already running
|
|
|
|
* or @state didn't match @p's state.
|
2005-04-16 15:20:36 -07:00
|
|
|
*/
|
2011-04-05 17:23:54 +02:00
|
|
|
static int
|
|
|
|
try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
|
|
|
unsigned long flags;
|
2011-04-05 17:23:57 +02:00
|
|
|
int cpu, success = 0;
|
2008-06-27 13:41:35 +02:00
|
|
|
|
2008-02-23 18:05:03 -08:00
|
|
|
smp_wmb();
|
2011-04-05 17:23:45 +02:00
|
|
|
raw_spin_lock_irqsave(&p->pi_lock, flags);
|
2009-09-15 14:43:03 +02:00
|
|
|
if (!(p->state & state))
|
2005-04-16 15:20:36 -07:00
|
|
|
goto out;
|
|
|
|
|
2011-04-05 17:23:57 +02:00
|
|
|
success = 1; /* we're going to change ->state */
|
2005-04-16 15:20:36 -07:00
|
|
|
cpu = task_cpu(p);
|
|
|
|
|
2011-04-05 17:23:57 +02:00
|
|
|
if (p->on_rq && ttwu_remote(p, wake_flags))
|
|
|
|
goto stat;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
#ifdef CONFIG_SMP
|
2009-09-15 14:43:03 +02:00
|
|
|
/*
|
2011-04-05 17:23:57 +02:00
|
|
|
* If the owning (remote) cpu is still in the middle of schedule() with
|
|
|
|
* this task as prev, wait until its done referencing the task.
|
2009-09-15 14:43:03 +02:00
|
|
|
*/
|
2011-04-05 17:23:54 +02:00
|
|
|
while (p->on_cpu) {
|
|
|
|
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
|
|
|
|
/*
|
2011-05-26 14:21:33 +02:00
|
|
|
* In case the architecture enables interrupts in
|
|
|
|
* context_switch(), we cannot busy wait, since that
|
|
|
|
* would lead to deadlocks when an interrupt hits and
|
|
|
|
* tries to wake up @prev. So bail and do a complete
|
|
|
|
* remote wakeup.
|
2011-04-05 17:23:54 +02:00
|
|
|
*/
|
2011-05-26 14:21:33 +02:00
|
|
|
if (ttwu_activate_remote(p, wake_flags))
|
2011-04-05 17:23:57 +02:00
|
|
|
goto stat;
|
2011-05-26 14:21:33 +02:00
|
|
|
#else
|
2011-04-05 17:23:54 +02:00
|
|
|
cpu_relax();
|
2011-05-26 14:21:33 +02:00
|
|
|
#endif
|
2010-03-24 16:38:48 +01:00
|
|
|
}
|
2010-02-15 14:45:54 +01:00
|
|
|
/*
|
2011-04-05 17:23:54 +02:00
|
|
|
* Pairs with the smp_wmb() in finish_lock_switch().
|
2010-02-15 14:45:54 +01:00
|
|
|
*/
|
2011-04-05 17:23:54 +02:00
|
|
|
smp_rmb();
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2011-04-05 17:23:49 +02:00
|
|
|
p->sched_contributes_to_load = !!task_contributes_to_load(p);
|
2009-09-15 14:43:03 +02:00
|
|
|
p->state = TASK_WAKING;
|
2008-01-25 21:08:09 +01:00
|
|
|
|
2011-04-05 17:23:54 +02:00
|
|
|
if (p->sched_class->task_waking)
|
2011-04-05 17:23:47 +02:00
|
|
|
p->sched_class->task_waking(p);
|
2009-12-16 18:04:40 +01:00
|
|
|
|
2011-04-05 17:23:46 +02:00
|
|
|
cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
|
2011-05-31 10:49:20 +02:00
|
|
|
if (task_cpu(p) != cpu) {
|
|
|
|
wake_flags |= WF_MIGRATED;
|
2011-04-05 17:23:54 +02:00
|
|
|
set_task_cpu(p, cpu);
|
2011-05-31 10:49:20 +02:00
|
|
|
}
|
2005-04-16 15:20:36 -07:00
|
|
|
#endif /* CONFIG_SMP */
|
|
|
|
|
2011-04-05 17:23:57 +02:00
|
|
|
ttwu_queue(p, cpu);
|
|
|
|
stat:
|
2011-04-05 17:23:55 +02:00
|
|
|
ttwu_stat(p, cpu, wake_flags);
|
2005-04-16 15:20:36 -07:00
|
|
|
out:
|
2011-04-05 17:23:45 +02:00
|
|
|
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
return success;
|
|
|
|
}
|
|
|
|
|
2010-06-08 21:40:37 +02:00
|
|
|
/**
|
|
|
|
* try_to_wake_up_local - try to wake up a local task with rq lock held
|
|
|
|
* @p: the thread to be awakened
|
|
|
|
*
|
2011-04-05 17:23:50 +02:00
|
|
|
* Put @p on the run-queue if it's not already there. The caller must
|
2010-06-08 21:40:37 +02:00
|
|
|
* ensure that this_rq() is locked, @p is bound to this_rq() and not
|
2011-04-05 17:23:50 +02:00
|
|
|
* the current task.
|
2010-06-08 21:40:37 +02:00
|
|
|
*/
|
|
|
|
static void try_to_wake_up_local(struct task_struct *p)
|
|
|
|
{
|
|
|
|
struct rq *rq = task_rq(p);
|
|
|
|
|
|
|
|
BUG_ON(rq != this_rq());
|
|
|
|
BUG_ON(p == current);
|
|
|
|
lockdep_assert_held(&rq->lock);
|
|
|
|
|
2011-04-05 17:23:50 +02:00
|
|
|
if (!raw_spin_trylock(&p->pi_lock)) {
|
|
|
|
raw_spin_unlock(&rq->lock);
|
|
|
|
raw_spin_lock(&p->pi_lock);
|
|
|
|
raw_spin_lock(&rq->lock);
|
|
|
|
}
|
|
|
|
|
2010-06-08 21:40:37 +02:00
|
|
|
if (!(p->state & TASK_NORMAL))
|
2011-04-05 17:23:50 +02:00
|
|
|
goto out;
|
2010-06-08 21:40:37 +02:00
|
|
|
|
2011-04-05 17:23:44 +02:00
|
|
|
if (!p->on_rq)
|
2011-04-05 17:23:43 +02:00
|
|
|
ttwu_activate(rq, p, ENQUEUE_WAKEUP);
|
|
|
|
|
2011-04-05 17:23:56 +02:00
|
|
|
ttwu_do_wakeup(rq, p, 0);
|
2011-04-05 17:23:55 +02:00
|
|
|
ttwu_stat(p, smp_processor_id(), 0);
|
2011-04-05 17:23:50 +02:00
|
|
|
out:
|
|
|
|
raw_spin_unlock(&p->pi_lock);
|
2010-06-08 21:40:37 +02:00
|
|
|
}
|
|
|
|
|
2009-04-28 15:01:38 +01:00
|
|
|
/**
|
|
|
|
* wake_up_process - Wake up a specific process
|
|
|
|
* @p: The process to be woken up.
|
|
|
|
*
|
|
|
|
* Attempt to wake up the nominated process and move it to the set of runnable
|
|
|
|
* processes. Returns 1 if the process was woken up, 0 if it was already
|
|
|
|
* running.
|
|
|
|
*
|
|
|
|
* It may be assumed that this function implies a write memory barrier before
|
|
|
|
* changing the task state if and only if any tasks are woken up.
|
|
|
|
*/
|
2008-02-08 04:19:53 -08:00
|
|
|
int wake_up_process(struct task_struct *p)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2007-12-06 11:07:07 -05:00
|
|
|
return try_to_wake_up(p, TASK_ALL, 0);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(wake_up_process);
|
|
|
|
|
2008-02-08 04:19:53 -08:00
|
|
|
/*
 * Wake up @p if its current state matches one of the bits in @state.
 * Passes the result of try_to_wake_up() straight through.
 */
int wake_up_state(struct task_struct *p, unsigned int state)
{
	int woken = try_to_wake_up(p, state, 0);

	return woken;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Perform scheduler related setup for a newly forked process p.
|
|
|
|
* p is forked by current.
|
2007-07-09 18:51:59 +02:00
|
|
|
*
|
|
|
|
* __sched_fork() is basic setup used by init_idle() too:
|
|
|
|
*/
|
|
|
|
static void __sched_fork(struct task_struct *p)
|
|
|
|
{
|
2011-04-05 17:23:44 +02:00
|
|
|
p->on_rq = 0;
|
|
|
|
|
|
|
|
p->se.on_rq = 0;
|
2007-07-09 18:51:59 +02:00
|
|
|
p->se.exec_start = 0;
|
|
|
|
p->se.sum_exec_runtime = 0;
|
2007-08-28 12:53:24 +02:00
|
|
|
p->se.prev_sum_exec_runtime = 0;
|
2008-12-14 12:34:15 +01:00
|
|
|
p->se.nr_migrations = 0;
|
2011-01-17 17:03:27 +01:00
|
|
|
p->se.vruntime = 0;
|
2011-04-05 17:23:44 +02:00
|
|
|
INIT_LIST_HEAD(&p->se.group_node);
|
2007-08-02 17:41:40 +02:00
|
|
|
|
|
|
|
#ifdef CONFIG_SCHEDSTATS
|
2010-03-10 23:37:45 -03:00
|
|
|
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
|
2007-08-02 17:41:40 +02:00
|
|
|
#endif
|
2005-06-25 14:57:29 -07:00
|
|
|
|
2008-01-25 21:08:27 +01:00
|
|
|
INIT_LIST_HEAD(&p->rt.run_list);
|
2005-06-25 14:57:29 -07:00
|
|
|
|
2007-07-26 13:40:43 +02:00
|
|
|
#ifdef CONFIG_PREEMPT_NOTIFIERS
|
|
|
|
INIT_HLIST_HEAD(&p->preempt_notifiers);
|
|
|
|
#endif
|
2007-07-09 18:51:59 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* fork()/clone()-time setup:
|
|
|
|
*/
|
2011-05-11 18:18:05 +02:00
|
|
|
void sched_fork(struct task_struct *p)
|
2007-07-09 18:51:59 +02:00
|
|
|
{
|
2011-04-05 17:23:51 +02:00
|
|
|
unsigned long flags;
|
2007-07-09 18:51:59 +02:00
|
|
|
int cpu = get_cpu();
|
|
|
|
|
|
|
|
__sched_fork(p);
|
2009-12-16 18:04:35 +01:00
|
|
|
/*
|
2010-03-24 18:34:10 +01:00
|
|
|
* We mark the process as running here. This guarantees that
|
2009-12-16 18:04:35 +01:00
|
|
|
* nobody will actually run it, and a signal or other external
|
|
|
|
* event cannot wake it up and insert it on the runqueue either.
|
|
|
|
*/
|
2010-03-24 18:34:10 +01:00
|
|
|
p->state = TASK_RUNNING;
|
2007-07-09 18:51:59 +02:00
|
|
|
|
2011-07-27 17:14:55 +02:00
|
|
|
/*
|
|
|
|
* Make sure we do not leak PI boosting priority to the child.
|
|
|
|
*/
|
|
|
|
p->prio = current->normal_prio;
|
|
|
|
|
2009-06-17 10:46:01 +02:00
|
|
|
/*
|
|
|
|
* Revert to default priority/policy on fork if requested.
|
|
|
|
*/
|
|
|
|
if (unlikely(p->sched_reset_on_fork)) {
|
2011-07-27 17:14:55 +02:00
|
|
|
if (task_has_rt_policy(p)) {
|
2009-06-17 10:46:01 +02:00
|
|
|
p->policy = SCHED_NORMAL;
|
2009-06-17 10:48:02 +02:00
|
|
|
p->static_prio = NICE_TO_PRIO(0);
|
2011-07-27 17:14:55 +02:00
|
|
|
p->rt_priority = 0;
|
|
|
|
} else if (PRIO_TO_NICE(p->static_prio) < 0)
|
|
|
|
p->static_prio = NICE_TO_PRIO(0);
|
|
|
|
|
|
|
|
p->prio = p->normal_prio = __normal_prio(p);
|
|
|
|
set_load_weight(p);
|
2009-06-17 10:48:02 +02:00
|
|
|
|
2009-06-17 10:46:01 +02:00
|
|
|
/*
|
|
|
|
* We don't need the reset flag anymore after the fork. It has
|
|
|
|
* fulfilled its duty:
|
|
|
|
*/
|
|
|
|
p->sched_reset_on_fork = 0;
|
|
|
|
}
|
2009-06-15 17:17:47 +02:00
|
|
|
|
2007-10-15 17:00:11 +02:00
|
|
|
if (!rt_prio(p->prio))
|
|
|
|
p->sched_class = &fair_sched_class;
|
2006-06-27 02:54:51 -07:00
|
|
|
|
2009-11-27 17:32:46 +01:00
|
|
|
if (p->sched_class->task_fork)
|
|
|
|
p->sched_class->task_fork(p);
|
|
|
|
|
2010-06-22 11:44:53 +02:00
|
|
|
/*
|
|
|
|
* The child is not yet in the pid-hash so no cgroup attach races,
|
|
|
|
* and the cgroup is pinned to this child due to cgroup_fork()
|
|
|
|
* is ran before sched_fork().
|
|
|
|
*
|
|
|
|
* Silence PROVE_RCU.
|
|
|
|
*/
|
2011-04-05 17:23:51 +02:00
|
|
|
raw_spin_lock_irqsave(&p->pi_lock, flags);
|
2009-09-10 13:42:00 +02:00
|
|
|
set_task_cpu(p, cpu);
|
2011-04-05 17:23:51 +02:00
|
|
|
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
|
2009-09-10 13:42:00 +02:00
|
|
|
|
2006-07-14 00:24:38 -07:00
|
|
|
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
|
2007-07-09 18:51:59 +02:00
|
|
|
if (likely(sched_info_on()))
|
2006-07-14 00:24:38 -07:00
|
|
|
memset(&p->sched_info, 0, sizeof(p->sched_info));
|
2005-04-16 15:20:36 -07:00
|
|
|
#endif
|
2011-04-05 17:23:40 +02:00
|
|
|
#if defined(CONFIG_SMP)
|
|
|
|
p->on_cpu = 0;
|
2005-06-25 14:57:23 -07:00
|
|
|
#endif
|
2011-06-08 01:13:27 +02:00
|
|
|
#ifdef CONFIG_PREEMPT_COUNT
|
2005-06-25 14:57:23 -07:00
|
|
|
/* Want to start with kernel preemption disabled. */
|
2005-11-13 16:06:55 -08:00
|
|
|
task_thread_info(p)->preempt_count = 1;
|
2005-04-16 15:20:36 -07:00
|
|
|
#endif
|
2010-11-30 19:51:33 +01:00
|
|
|
#ifdef CONFIG_SMP
|
sched: create "pushable_tasks" list to limit pushing to one attempt
The RT scheduler employs a "push/pull" design to actively balance tasks
within the system (on a per disjoint cpuset basis). When a task is
awoken, it is immediately determined if there are any lower priority
cpus which should be preempted. This is opposed to the way normal
SCHED_OTHER tasks behave, which will wait for a periodic rebalancing
operation to occur before spreading out load.
When a particular RQ has more than 1 active RT task, it is said to
be in an "overloaded" state. Once this occurs, the system enters
the active balancing mode, where it will try to push the task away,
or persuade a different cpu to pull it over. The system will stay
in this state until the system falls back below the <= 1 queued RT
task per RQ.
However, the current implementation suffers from a limitation in the
push logic. Once overloaded, all tasks (other than current) on the
RQ are analyzed on every push operation, even if it was previously
unpushable (due to affinity, etc). Whats more, the operation stops
at the first task that is unpushable and will not look at items
lower in the queue. This causes two problems:
1) We can have the same tasks analyzed over and over again during each
push, which extends out the fast path in the scheduler for no
gain. Consider a RQ that has dozens of tasks that are bound to a
core. Each one of those tasks will be encountered and skipped
for each push operation while they are queued.
2) There may be lower-priority tasks under the unpushable task that
could have been successfully pushed, but will never be considered
until either the unpushable task is cleared, or a pull operation
succeeds. The net result is a potential latency source for mid
priority tasks.
This patch aims to rectify these two conditions by introducing a new
priority sorted list: "pushable_tasks". A task is added to the list
each time a task is activated or preempted. It is removed from the
list any time it is deactivated, made current, or fails to push.
This works because a task only needs to be attempted to push once.
After an initial failure to push, the other cpus will eventually try to
pull the task when the conditions are proper. This also solves the
problem that we don't completely analyze all tasks due to encountering
an unpushable tasks. Now every task will have a push attempted (when
appropriate).
This reduces latency both by shorting the critical section of the
rq->lock for certain workloads, and by making sure the algorithm
considers all eligible tasks in the system.
[ rostedt: added a couple more BUG_ONs ]
Signed-off-by: Gregory Haskins <ghaskins@novell.com>
Acked-by: Steven Rostedt <srostedt@redhat.com>
2008-12-29 09:39:53 -05:00
|
|
|
plist_node_init(&p->pushable_tasks, MAX_PRIO);
|
2010-11-30 19:51:33 +01:00
|
|
|
#endif
|
sched: create "pushable_tasks" list to limit pushing to one attempt
The RT scheduler employs a "push/pull" design to actively balance tasks
within the system (on a per disjoint cpuset basis). When a task is
awoken, it is immediately determined if there are any lower priority
cpus which should be preempted. This is opposed to the way normal
SCHED_OTHER tasks behave, which will wait for a periodic rebalancing
operation to occur before spreading out load.
When a particular RQ has more than 1 active RT task, it is said to
be in an "overloaded" state. Once this occurs, the system enters
the active balancing mode, where it will try to push the task away,
or persuade a different cpu to pull it over. The system will stay
in this state until the system falls back below the <= 1 queued RT
task per RQ.
However, the current implementation suffers from a limitation in the
push logic. Once overloaded, all tasks (other than current) on the
RQ are analyzed on every push operation, even if it was previously
unpushable (due to affinity, etc). Whats more, the operation stops
at the first task that is unpushable and will not look at items
lower in the queue. This causes two problems:
1) We can have the same tasks analyzed over and over again during each
push, which extends out the fast path in the scheduler for no
gain. Consider a RQ that has dozens of tasks that are bound to a
core. Each one of those tasks will be encountered and skipped
for each push operation while they are queued.
2) There may be lower-priority tasks under the unpushable task that
could have been successfully pushed, but will never be considered
until either the unpushable task is cleared, or a pull operation
succeeds. The net result is a potential latency source for mid
priority tasks.
This patch aims to rectify these two conditions by introducing a new
priority sorted list: "pushable_tasks". A task is added to the list
each time a task is activated or preempted. It is removed from the
list any time it is deactivated, made current, or fails to push.
This works because a task only needs to be attempted to push once.
After an initial failure to push, the other cpus will eventually try to
pull the task when the conditions are proper. This also solves the
problem that we don't completely analyze all tasks due to encountering
an unpushable tasks. Now every task will have a push attempted (when
appropriate).
This reduces latency both by shorting the critical section of the
rq->lock for certain workloads, and by making sure the algorithm
considers all eligible tasks in the system.
[ rostedt: added a couple more BUG_ONs ]
Signed-off-by: Gregory Haskins <ghaskins@novell.com>
Acked-by: Steven Rostedt <srostedt@redhat.com>
2008-12-29 09:39:53 -05:00
|
|
|
|
2005-06-25 14:57:29 -07:00
|
|
|
put_cpu();
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* wake_up_new_task - wake up a newly created task for the first time.
|
|
|
|
*
|
|
|
|
* This function will do some initial scheduler statistics housekeeping
|
|
|
|
* that must be done for every newly created context, then puts the task
|
|
|
|
* on the runqueue and wakes it.
|
|
|
|
*/
|
2011-05-11 18:18:05 +02:00
|
|
|
void wake_up_new_task(struct task_struct *p)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
|
|
|
unsigned long flags;
|
2007-07-09 18:51:59 +02:00
|
|
|
struct rq *rq;
|
2010-01-21 21:04:57 +01:00
|
|
|
|
2011-04-05 17:23:52 +02:00
|
|
|
raw_spin_lock_irqsave(&p->pi_lock, flags);
|
2010-01-21 21:04:57 +01:00
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
/*
|
|
|
|
* Fork balancing, do it here and not earlier because:
|
|
|
|
* - cpus_allowed can change in the fork path
|
|
|
|
* - any previously selected cpu might disappear through hotplug
|
|
|
|
*/
|
2011-04-05 17:23:52 +02:00
|
|
|
set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
|
2010-03-24 18:34:10 +01:00
|
|
|
#endif
|
|
|
|
|
2011-04-05 17:23:52 +02:00
|
|
|
rq = __task_rq_lock(p);
|
2009-11-27 17:32:46 +01:00
|
|
|
activate_task(rq, p, 0);
|
2011-04-05 17:23:44 +02:00
|
|
|
p->on_rq = 1;
|
2011-04-05 17:23:42 +02:00
|
|
|
trace_sched_wakeup_new(p, true);
|
2009-09-14 20:02:34 +02:00
|
|
|
check_preempt_curr(rq, p, WF_FORK);
|
2008-01-25 21:08:22 +01:00
|
|
|
#ifdef CONFIG_SMP
|
2009-12-16 18:04:40 +01:00
|
|
|
if (p->sched_class->task_woken)
|
|
|
|
p->sched_class->task_woken(rq, p);
|
2008-01-25 21:08:22 +01:00
|
|
|
#endif
|
2011-04-05 17:23:51 +02:00
|
|
|
task_rq_unlock(rq, p, &flags);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2007-07-26 13:40:43 +02:00
|
|
|
#ifdef CONFIG_PREEMPT_NOTIFIERS
|
|
|
|
|
|
|
|
/**
|
2009-03-16 19:58:09 +00:00
|
|
|
* preempt_notifier_register - tell me when current is being preempted & rescheduled
|
2007-07-31 00:37:50 -07:00
|
|
|
* @notifier: notifier struct to register
|
2007-07-26 13:40:43 +02:00
|
|
|
*/
|
|
|
|
void preempt_notifier_register(struct preempt_notifier *notifier)
|
|
|
|
{
|
|
|
|
hlist_add_head(¬ifier->link, ¤t->preempt_notifiers);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(preempt_notifier_register);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* preempt_notifier_unregister - no longer interested in preemption notifications
|
2007-07-31 00:37:50 -07:00
|
|
|
* @notifier: notifier struct to unregister
|
2007-07-26 13:40:43 +02:00
|
|
|
*
|
|
|
|
* This is safe to call from within a preemption notifier.
|
|
|
|
*/
|
|
|
|
void preempt_notifier_unregister(struct preempt_notifier *notifier)
|
|
|
|
{
|
|
|
|
hlist_del(¬ifier->link);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
|
|
|
|
|
|
|
|
static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
|
|
|
|
{
|
|
|
|
struct preempt_notifier *notifier;
|
|
|
|
struct hlist_node *node;
|
|
|
|
|
|
|
|
hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
|
|
|
|
notifier->ops->sched_in(notifier, raw_smp_processor_id());
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
fire_sched_out_preempt_notifiers(struct task_struct *curr,
|
|
|
|
struct task_struct *next)
|
|
|
|
{
|
|
|
|
struct preempt_notifier *notifier;
|
|
|
|
struct hlist_node *node;
|
|
|
|
|
|
|
|
hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
|
|
|
|
notifier->ops->sched_out(notifier, next);
|
|
|
|
}
|
|
|
|
|
2008-05-30 14:23:45 +02:00
|
|
|
#else /* !CONFIG_PREEMPT_NOTIFIERS */
|
2007-07-26 13:40:43 +02:00
|
|
|
|
|
|
|
/* Preempt notifiers are compiled out (!CONFIG_PREEMPT_NOTIFIERS): nothing to do. */
static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
}
|
|
|
|
|
|
|
|
/* Preempt notifiers are compiled out (!CONFIG_PREEMPT_NOTIFIERS): nothing to do. */
static void fire_sched_out_preempt_notifiers(struct task_struct *curr,
					     struct task_struct *next)
{
}
|
|
|
|
|
2008-05-30 14:23:45 +02:00
|
|
|
#endif /* CONFIG_PREEMPT_NOTIFIERS */
|
2007-07-26 13:40:43 +02:00
|
|
|
|
2005-06-25 14:57:23 -07:00
|
|
|
/**
 * prepare_task_switch - prepare to switch tasks
 * @rq: the runqueue preparing to switch
 * @prev: the current task that is being switched out
 * @next: the task we are going to switch to.
 *
 * Called with the rq lock held and interrupts off. Every call must be
 * matched by a finish_task_switch() after the context switch.
 *
 * Runs the tracing, accounting, perf and preempt-notifier "switch out"
 * hooks, then sets up locking and calls the architecture specific hook.
 */
static inline void prepare_task_switch(struct rq *rq, struct task_struct *prev,
				       struct task_struct *next)
{
	/* Instrumentation and notification for the outgoing task. */
	trace_sched_switch(prev, next);
	sched_info_switch(prev, next);
	perf_event_task_sched_out(prev, next);
	fire_sched_out_preempt_notifiers(prev, next);

	/* Locking and architecture preparation for the incoming task. */
	prepare_lock_switch(rq, next);
	prepare_arch_switch(next);
}
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/**
|
|
|
|
* finish_task_switch - clean up after a task-switch
|
2005-09-07 01:15:17 -04:00
|
|
|
* @rq: runqueue associated with task-switch
|
2005-04-16 15:20:36 -07:00
|
|
|
* @prev: the thread we just switched away from.
|
|
|
|
*
|
2005-06-25 14:57:23 -07:00
|
|
|
* finish_task_switch must be called after the context switch, paired
|
|
|
|
* with a prepare_task_switch call before the context switch.
|
|
|
|
* finish_task_switch will reconcile locking set up by prepare_task_switch,
|
|
|
|
* and do any other architecture-specific cleanup actions.
|
2005-04-16 15:20:36 -07:00
|
|
|
*
|
|
|
|
* Note that we may have delayed dropping an mm in context_switch(). If
|
2007-12-05 15:46:09 +01:00
|
|
|
* so, we finish that here outside of the runqueue lock. (Doing it
|
2005-04-16 15:20:36 -07:00
|
|
|
* with the lock held can cause deadlocks; see schedule() for
|
|
|
|
* details.)
|
|
|
|
*/
|
2007-10-15 17:00:13 +02:00
|
|
|
static void finish_task_switch(struct rq *rq, struct task_struct *prev)
|
2005-04-16 15:20:36 -07:00
|
|
|
__releases(rq->lock)
|
|
|
|
{
|
|
|
|
struct mm_struct *mm = rq->prev_mm;
|
2006-09-29 02:01:10 -07:00
|
|
|
long prev_state;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
rq->prev_mm = NULL;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* A task struct has one reference for the use as "current".
|
2006-09-29 02:01:11 -07:00
|
|
|
* If a task dies, then it sets TASK_DEAD in tsk->state and calls
|
2006-09-29 02:01:10 -07:00
|
|
|
* schedule one last time. The schedule call will never return, and
|
|
|
|
* the scheduled task must drop that reference.
|
2006-09-29 02:01:11 -07:00
|
|
|
* The test for TASK_DEAD must occur while the runqueue locks are
|
2005-04-16 15:20:36 -07:00
|
|
|
* still held, otherwise prev could be scheduled on another cpu, die
|
|
|
|
* there before we look at prev->state, and then the reference would
|
|
|
|
* be dropped twice.
|
|
|
|
* Manfred Spraul <manfred@colorfullife.com>
|
|
|
|
*/
|
2006-09-29 02:01:10 -07:00
|
|
|
prev_state = prev->state;
|
2005-06-25 14:57:23 -07:00
|
|
|
finish_arch_switch(prev);
|
2010-01-08 15:27:33 +00:00
|
|
|
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
|
|
|
|
local_irq_disable();
|
|
|
|
#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
|
perf events: Fix slow and broken cgroup context switch code
The current cgroup context switch code was incorrect leading
to bogus counts. Furthermore, as soon as there was an active
cgroup event on a CPU, the context switch cost on that CPU
would increase by a significant amount as demonstrated by a
simple ping/pong example:
$ ./pong
Both processes pinned to CPU1, running for 10s
10684.51 ctxsw/s
Now start a cgroup perf stat:
$ perf stat -e cycles,cycles -A -a -G test -C 1 -- sleep 100
$ ./pong
Both processes pinned to CPU1, running for 10s
6674.61 ctxsw/s
That's a 37% penalty.
Note that pong is not even in the monitored cgroup.
The results shown by perf stat are bogus:
$ perf stat -e cycles,cycles -A -a -G test -C 1 -- sleep 100
Performance counter stats for 'sleep 100':
CPU1 <not counted> cycles test
CPU1 16,984,189,138 cycles # 0.000 GHz
The second 'cycles' event should report a count @ CPU clock
(here 2.4GHz) as it is counting across all cgroups.
The patch below fixes the bogus accounting and bypasses any
cgroup switches in case the outgoing and incoming tasks are
in the same cgroup.
With this patch the same test now yields:
$ ./pong
Both processes pinned to CPU1, running for 10s
10775.30 ctxsw/s
Start perf stat with cgroup:
$ perf stat -e cycles,cycles -A -a -G test -C 1 -- sleep 10
Run pong outside the cgroup:
$ /pong
Both processes pinned to CPU1, running for 10s
10687.80 ctxsw/s
The penalty is now less than 2%.
And the results for perf stat are correct:
$ perf stat -e cycles,cycles -A -a -G test -C 1 -- sleep 10
Performance counter stats for 'sleep 10':
CPU1 <not counted> cycles test # 0.000 GHz
CPU1 23,933,981,448 cycles # 0.000 GHz
Now perf stat reports the correct counts for
for the non cgroup event.
If we run pong inside the cgroup, then we also get the
correct counts:
$ perf stat -e cycles,cycles -A -a -G test -C 1 -- sleep 10
Performance counter stats for 'sleep 10':
CPU1 22,297,726,205 cycles test # 0.000 GHz
CPU1 23,933,981,448 cycles # 0.000 GHz
10.001457237 seconds time elapsed
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110825135803.GA4697@quad
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2011-08-25 15:58:03 +02:00
|
|
|
perf_event_task_sched_in(prev, current);
|
2010-01-08 15:27:33 +00:00
|
|
|
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
|
|
|
|
local_irq_enable();
|
|
|
|
#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
|
2005-06-25 14:57:23 -07:00
|
|
|
finish_lock_switch(rq, prev);
|
2011-11-27 21:43:10 +00:00
|
|
|
finish_arch_post_lock_switch();
|
2008-01-25 21:08:05 +01:00
|
|
|
|
2007-07-26 13:40:43 +02:00
|
|
|
fire_sched_in_preempt_notifiers(current);
|
2005-04-16 15:20:36 -07:00
|
|
|
if (mm)
|
|
|
|
mmdrop(mm);
|
2006-09-29 02:01:11 -07:00
|
|
|
if (unlikely(prev_state == TASK_DEAD)) {
|
2006-03-26 01:38:20 -08:00
|
|
|
/*
|
|
|
|
* Remove function-return probe instances associated with this
|
|
|
|
* task and put them back on the free list.
|
2007-07-09 18:52:00 +02:00
|
|
|
*/
|
2006-03-26 01:38:20 -08:00
|
|
|
kprobe_flush_task(prev);
|
2005-04-16 15:20:36 -07:00
|
|
|
put_task_struct(prev);
|
2006-03-26 01:38:20 -08:00
|
|
|
}
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2009-07-29 11:08:47 -04:00
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
|
|
|
|
/* assumes rq->lock is held */
|
|
|
|
static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
|
|
|
|
{
|
|
|
|
if (prev->sched_class->pre_schedule)
|
|
|
|
prev->sched_class->pre_schedule(rq, prev);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* rq->lock is NOT held, but preemption is disabled */
|
|
|
|
static inline void post_schedule(struct rq *rq)
|
|
|
|
{
|
|
|
|
if (rq->post_schedule) {
|
|
|
|
unsigned long flags;
|
|
|
|
|
2009-11-17 14:28:38 +01:00
|
|
|
raw_spin_lock_irqsave(&rq->lock, flags);
|
2009-07-29 11:08:47 -04:00
|
|
|
if (rq->curr->sched_class->post_schedule)
|
|
|
|
rq->curr->sched_class->post_schedule(rq);
|
2009-11-17 14:28:38 +01:00
|
|
|
raw_spin_unlock_irqrestore(&rq->lock, flags);
|
2009-07-29 11:08:47 -04:00
|
|
|
|
|
|
|
rq->post_schedule = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#else
|
2009-07-29 00:21:22 -04:00
|
|
|
|
2009-07-29 11:08:47 -04:00
|
|
|
/* !CONFIG_SMP variant: no pre-schedule work to do. */
static inline void pre_schedule(struct rq *rq, struct task_struct *p)
{
}
|
|
|
|
|
|
|
|
/* !CONFIG_SMP variant: no post-schedule work to do. */
static inline void post_schedule(struct rq *rq)
{
}
|
|
|
|
|
2009-07-29 11:08:47 -04:00
|
|
|
#endif
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/**
|
|
|
|
* schedule_tail - first thing a freshly forked thread must call.
|
|
|
|
* @prev: the thread we just switched away from.
|
|
|
|
*/
|
2006-07-03 00:25:41 -07:00
|
|
|
asmlinkage void schedule_tail(struct task_struct *prev)
|
2005-04-16 15:20:36 -07:00
|
|
|
__releases(rq->lock)
|
|
|
|
{
|
2006-07-03 00:25:42 -07:00
|
|
|
struct rq *rq = this_rq();
|
|
|
|
|
2005-06-25 14:57:23 -07:00
|
|
|
finish_task_switch(rq, prev);
|
2009-07-29 00:21:22 -04:00
|
|
|
|
2009-07-29 11:08:47 -04:00
|
|
|
/*
|
|
|
|
* FIXME: do we need to worry about rq being invalidated by the
|
|
|
|
* task_switch?
|
|
|
|
*/
|
|
|
|
post_schedule(rq);
|
2006-07-03 00:25:42 -07:00
|
|
|
|
2005-06-25 14:57:23 -07:00
|
|
|
#ifdef __ARCH_WANT_UNLOCKED_CTXSW
|
|
|
|
/* In this case, finish_task_switch does not reenable preemption */
|
|
|
|
preempt_enable();
|
|
|
|
#endif
|
2005-04-16 15:20:36 -07:00
|
|
|
if (current->set_child_tid)
|
2007-10-18 23:40:14 -07:00
|
|
|
put_user(task_pid_vnr(current), current->set_child_tid);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* context_switch - switch to the new MM and the new
|
|
|
|
* thread's register state.
|
|
|
|
*/
|
2007-07-09 18:51:59 +02:00
|
|
|
static inline void
|
2006-07-03 00:25:42 -07:00
|
|
|
context_switch(struct rq *rq, struct task_struct *prev,
|
2006-07-03 00:25:41 -07:00
|
|
|
struct task_struct *next)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2007-07-09 18:51:59 +02:00
|
|
|
struct mm_struct *mm, *oldmm;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2007-07-26 13:40:43 +02:00
|
|
|
prepare_task_switch(rq, prev, next);
|
2011-02-02 13:19:09 +01:00
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
mm = next->mm;
|
|
|
|
oldmm = prev->active_mm;
|
2007-02-13 13:26:21 +01:00
|
|
|
/*
|
|
|
|
* For paravirt, this is coupled with an exit in switch_to to
|
|
|
|
* combine the page table reload and the switch backend into
|
|
|
|
* one hypercall.
|
|
|
|
*/
|
2009-02-18 11:18:57 -08:00
|
|
|
arch_start_context_switch(prev);
|
2007-02-13 13:26:21 +01:00
|
|
|
|
2010-09-16 14:42:25 +02:00
|
|
|
if (!mm) {
|
2005-04-16 15:20:36 -07:00
|
|
|
next->active_mm = oldmm;
|
|
|
|
atomic_inc(&oldmm->mm_count);
|
|
|
|
enter_lazy_tlb(oldmm, next);
|
|
|
|
} else
|
|
|
|
switch_mm(oldmm, mm, next);
|
|
|
|
|
2010-09-16 14:42:25 +02:00
|
|
|
if (!prev->mm) {
|
2005-04-16 15:20:36 -07:00
|
|
|
prev->active_mm = NULL;
|
|
|
|
rq->prev_mm = oldmm;
|
|
|
|
}
|
2006-07-14 00:24:27 -07:00
|
|
|
/*
|
|
|
|
* Since the runqueue lock will be released by the next
|
|
|
|
* task (which is an invalid locking op but in the case
|
|
|
|
* of the scheduler it's an obvious special-case), so we
|
|
|
|
* do an early lockdep release here:
|
|
|
|
*/
|
|
|
|
#ifndef __ARCH_WANT_UNLOCKED_CTXSW
|
2006-07-03 00:24:54 -07:00
|
|
|
spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
|
2006-07-14 00:24:27 -07:00
|
|
|
#endif
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
/* Here we just switch the register state and the stack. */
|
2012-07-16 15:06:40 -07:00
|
|
|
rcu_switch(prev, next);
|
2005-04-16 15:20:36 -07:00
|
|
|
switch_to(prev, next, prev);
|
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
barrier();
|
|
|
|
/*
|
|
|
|
* this_rq must be evaluated again because prev may have moved
|
|
|
|
* CPUs since it called schedule(), thus the 'rq' on its stack
|
|
|
|
* frame will be invalid.
|
|
|
|
*/
|
|
|
|
finish_task_switch(this_rq(), prev);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* nr_running, nr_uninterruptible and nr_context_switches:
|
|
|
|
*
|
|
|
|
* externally visible scheduler statistics: current number of runnable
|
|
|
|
* threads, current number of uninterruptible-sleeping threads, total
|
|
|
|
* number of context switches performed since bootup.
|
|
|
|
*/
|
|
|
|
unsigned long nr_running(void)
|
|
|
|
{
|
|
|
|
unsigned long i, sum = 0;
|
|
|
|
|
|
|
|
for_each_online_cpu(i)
|
|
|
|
sum += cpu_rq(i)->nr_running;
|
|
|
|
|
|
|
|
return sum;
|
2009-04-14 10:25:30 +05:30
|
|
|
}
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
unsigned long nr_uninterruptible(void)
|
2009-04-14 10:25:30 +05:30
|
|
|
{
|
2005-04-16 15:20:36 -07:00
|
|
|
unsigned long i, sum = 0;
|
2009-04-14 10:25:30 +05:30
|
|
|
|
2006-03-28 01:56:37 -08:00
|
|
|
for_each_possible_cpu(i)
|
2005-04-16 15:20:36 -07:00
|
|
|
sum += cpu_rq(i)->nr_uninterruptible;
|
2009-04-14 10:25:30 +05:30
|
|
|
|
|
|
|
/*
|
2005-04-16 15:20:36 -07:00
|
|
|
* Since we read the counters lockless, it might be slightly
|
|
|
|
* inaccurate. Do not allow it to go below zero though:
|
2009-04-14 10:25:30 +05:30
|
|
|
*/
|
2005-04-16 15:20:36 -07:00
|
|
|
if (unlikely((long)sum < 0))
|
|
|
|
sum = 0;
|
2009-04-14 10:25:30 +05:30
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
return sum;
|
2009-04-14 10:25:30 +05:30
|
|
|
}
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
unsigned long long nr_context_switches(void)
|
2007-05-08 00:32:51 -07:00
|
|
|
{
|
2006-06-27 02:54:31 -07:00
|
|
|
int i;
|
|
|
|
unsigned long long sum = 0;
|
2007-05-08 00:32:51 -07:00
|
|
|
|
2006-03-28 01:56:37 -08:00
|
|
|
for_each_possible_cpu(i)
|
2005-04-16 15:20:36 -07:00
|
|
|
sum += cpu_rq(i)->nr_switches;
|
2007-05-08 00:32:51 -07:00
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
return sum;
|
|
|
|
}
|
2009-02-04 11:59:44 -08:00
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
unsigned long nr_iowait(void)
|
|
|
|
{
|
|
|
|
unsigned long i, sum = 0;
|
2009-02-04 11:59:44 -08:00
|
|
|
|
2006-03-28 01:56:37 -08:00
|
|
|
for_each_possible_cpu(i)
|
2005-04-16 15:20:36 -07:00
|
|
|
sum += atomic_read(&cpu_rq(i)->nr_iowait);
|
2007-05-08 00:32:51 -07:00
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
return sum;
|
|
|
|
}
|
2009-02-04 11:59:44 -08:00
|
|
|
|
2010-07-01 09:07:17 +02:00
|
|
|
unsigned long nr_iowait_cpu(int cpu)
|
2009-09-21 17:04:08 -07:00
|
|
|
{
|
2010-07-01 09:07:17 +02:00
|
|
|
struct rq *this = cpu_rq(cpu);
|
2009-09-21 17:04:08 -07:00
|
|
|
return atomic_read(&this->nr_iowait);
|
|
|
|
}
|
2007-05-08 00:32:51 -07:00
|
|
|
|
2009-09-21 17:04:08 -07:00
|
|
|
unsigned long this_cpu_load(void)
|
|
|
|
{
|
|
|
|
struct rq *this = this_rq();
|
|
|
|
return this->cpu_load[0];
|
|
|
|
}
|
2009-04-14 10:25:35 +05:30
|
|
|
|
2007-05-08 00:32:51 -07:00
|
|
|
|
2012-06-22 15:52:09 +02:00
|
|
|
/*
|
|
|
|
* Global load-average calculations
|
|
|
|
*
|
|
|
|
* We take a distributed and async approach to calculating the global load-avg
|
|
|
|
* in order to minimize overhead.
|
|
|
|
*
|
|
|
|
* The global load average is an exponentially decaying average of nr_running +
|
|
|
|
* nr_uninterruptible.
|
|
|
|
*
|
|
|
|
* Once every LOAD_FREQ:
|
|
|
|
*
|
|
|
|
* nr_active = 0;
|
|
|
|
* for_each_possible_cpu(cpu)
|
|
|
|
* nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
|
|
|
|
*
|
|
|
|
* avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
|
|
|
|
*
|
|
|
|
* Due to a number of reasons the above turns in the mess below:
|
|
|
|
*
|
|
|
|
* - for_each_possible_cpu() is prohibitively expensive on machines with
|
|
|
|
* serious number of cpus, therefore we need to take a distributed approach
|
|
|
|
* to calculating nr_active.
|
|
|
|
*
|
|
|
|
* \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
|
|
|
|
* = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
|
|
|
|
*
|
|
|
|
* So assuming nr_active := 0 when we start out -- true per definition, we
|
|
|
|
* can simply take per-cpu deltas and fold those into a global accumulate
|
|
|
|
* to obtain the same result. See calc_load_fold_active().
|
|
|
|
*
|
|
|
|
* Furthermore, in order to avoid synchronizing all per-cpu delta folding
|
|
|
|
* across the machine, we assume 10 ticks is sufficient time for every
|
|
|
|
* cpu to have completed this task.
|
|
|
|
*
|
|
|
|
* This places an upper-bound on the IRQ-off latency of the machine. Then
|
|
|
|
* again, being late doesn't lose the delta, just wrecks the sample.
|
|
|
|
*
|
|
|
|
* - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
|
|
|
|
* this would add another cross-cpu cacheline miss and atomic operation
|
|
|
|
* to the wakeup path. Instead we increment on whatever cpu the task ran
|
|
|
|
* when it went into uninterruptible state and decrement on whatever cpu
|
|
|
|
* did the wakeup. This means that only the sum of nr_uninterruptible over
|
|
|
|
* all cpus yields the correct result.
|
|
|
|
*
|
|
|
|
* This covers the NO_HZ=n code, for extra head-aches, see the comment below.
|
|
|
|
*/
|
|
|
|
|
2009-04-11 10:43:41 +02:00
|
|
|
/* Variables and functions for calc_load */
|
|
|
|
static atomic_long_t calc_load_tasks;
|
|
|
|
static unsigned long calc_load_update;
|
|
|
|
unsigned long avenrun[3];
|
2012-06-22 15:52:09 +02:00
|
|
|
EXPORT_SYMBOL(avenrun); /* should be removed */
|
|
|
|
|
|
|
|
/**
|
|
|
|
* get_avenrun - get the load average array
|
|
|
|
* @loads: pointer to dest load array
|
|
|
|
* @offset: offset to add
|
|
|
|
* @shift: shift count to shift the result left
|
|
|
|
*
|
|
|
|
* These values are estimates at best, so no need for locking.
|
|
|
|
*/
|
|
|
|
void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
|
|
|
|
{
|
|
|
|
loads[0] = (avenrun[0] + offset) << shift;
|
|
|
|
loads[1] = (avenrun[1] + offset) << shift;
|
|
|
|
loads[2] = (avenrun[2] + offset) << shift;
|
|
|
|
}
|
2007-05-08 00:32:51 -07:00
|
|
|
|
2010-04-22 21:50:19 +02:00
|
|
|
static long calc_load_fold_active(struct rq *this_rq)
|
|
|
|
{
|
|
|
|
long nr_active, delta = 0;
|
|
|
|
|
|
|
|
nr_active = this_rq->nr_running;
|
|
|
|
nr_active += (long) this_rq->nr_uninterruptible;
|
|
|
|
|
|
|
|
if (nr_active != this_rq->calc_load_active) {
|
|
|
|
delta = nr_active - this_rq->calc_load_active;
|
|
|
|
this_rq->calc_load_active = nr_active;
|
|
|
|
}
|
|
|
|
|
|
|
|
return delta;
|
|
|
|
}
|
|
|
|
|
2012-06-22 15:52:09 +02:00
|
|
|
/*
|
|
|
|
* a1 = a0 * e + a * (1 - e)
|
|
|
|
*/
|
2010-11-30 19:48:45 +01:00
|
|
|
static unsigned long
|
|
|
|
calc_load(unsigned long load, unsigned long exp, unsigned long active)
|
|
|
|
{
|
|
|
|
load *= exp;
|
|
|
|
load += active * (FIXED_1 - exp);
|
|
|
|
load += 1UL << (FSHIFT - 1);
|
|
|
|
return load >> FSHIFT;
|
|
|
|
}
|
|
|
|
|
2010-04-22 21:50:19 +02:00
|
|
|
#ifdef CONFIG_NO_HZ
|
|
|
|
/*
|
2012-06-22 15:52:09 +02:00
|
|
|
* Handle NO_HZ for the global load-average.
|
|
|
|
*
|
|
|
|
* Since the above described distributed algorithm to compute the global
|
|
|
|
* load-average relies on per-cpu sampling from the tick, it is affected by
|
|
|
|
* NO_HZ.
|
|
|
|
*
|
|
|
|
* The basic idea is to fold the nr_active delta into a global idle-delta upon
|
|
|
|
* entering NO_HZ state such that we can include this as an 'extra' cpu delta
|
|
|
|
* when we read the global state.
|
|
|
|
*
|
|
|
|
* Obviously reality has to ruin such a delightfully simple scheme:
|
|
|
|
*
|
|
|
|
* - When we go NO_HZ idle during the window, we can negate our sample
|
|
|
|
* contribution, causing under-accounting.
|
|
|
|
*
|
|
|
|
* We avoid this by keeping two idle-delta counters and flipping them
|
|
|
|
* when the window starts, thus separating old and new NO_HZ load.
|
|
|
|
*
|
|
|
|
* The only trick is the slight shift in index flip for read vs write.
|
|
|
|
*
|
|
|
|
* 0s 5s 10s 15s
|
|
|
|
* +10 +10 +10 +10
|
|
|
|
* |-|-----------|-|-----------|-|-----------|-|
|
|
|
|
* r:0 0 1 1 0 0 1 1 0
|
|
|
|
* w:0 1 1 0 0 1 1 0 0
|
|
|
|
*
|
|
|
|
* This ensures we'll fold the old idle contribution in this window while
|
|
|
|
* accumulating the new one.
|
|
|
|
*
|
|
|
|
* - When we wake up from NO_HZ idle during the window, we push up our
|
|
|
|
* contribution, since we effectively move our sample point to a known
|
|
|
|
* busy state.
|
|
|
|
*
|
|
|
|
* This is solved by pushing the window forward, and thus skipping the
|
|
|
|
* sample, for this cpu (effectively using the idle-delta for this cpu which
|
|
|
|
* was in effect at the time the window opened). This also solves the issue
|
|
|
|
* of having to deal with a cpu having been in NOHZ idle for multiple
|
|
|
|
* LOAD_FREQ intervals.
|
2010-04-22 21:50:19 +02:00
|
|
|
*
|
|
|
|
* When making the ILB scale, we should try to pull this in as well.
|
|
|
|
*/
|
2012-06-22 15:52:09 +02:00
|
|
|
/* The two NO_HZ idle-delta accumulators. */
static atomic_long_t calc_load_idle[2];
/* Low bit selects the accumulator; see calc_load_read_idx()/calc_load_write_idx(). */
static int calc_load_idx;
|
2010-04-22 21:50:19 +02:00
|
|
|
|
2012-06-22 15:52:09 +02:00
|
|
|
static inline int calc_load_write_idx(void)
|
2010-04-22 21:50:19 +02:00
|
|
|
{
|
2012-06-22 15:52:09 +02:00
|
|
|
int idx = calc_load_idx;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* See calc_global_nohz(), if we observe the new index, we also
|
|
|
|
* need to observe the new update time.
|
|
|
|
*/
|
|
|
|
smp_rmb();
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the folding window started, make sure we start writing in the
|
|
|
|
* next idle-delta.
|
|
|
|
*/
|
|
|
|
if (!time_before(jiffies, calc_load_update))
|
|
|
|
idx++;
|
|
|
|
|
|
|
|
return idx & 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int calc_load_read_idx(void)
|
|
|
|
{
|
|
|
|
return calc_load_idx & 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
void calc_load_enter_idle(void)
|
|
|
|
{
|
|
|
|
struct rq *this_rq = this_rq();
|
2010-04-22 21:50:19 +02:00
|
|
|
long delta;
|
|
|
|
|
2012-06-22 15:52:09 +02:00
|
|
|
/*
|
|
|
|
* We're going into NOHZ mode, if there's any pending delta, fold it
|
|
|
|
* into the pending idle delta.
|
|
|
|
*/
|
2010-04-22 21:50:19 +02:00
|
|
|
delta = calc_load_fold_active(this_rq);
|
2012-06-22 15:52:09 +02:00
|
|
|
if (delta) {
|
|
|
|
int idx = calc_load_write_idx();
|
|
|
|
atomic_long_add(delta, &calc_load_idle[idx]);
|
|
|
|
}
|
2010-04-22 21:50:19 +02:00
|
|
|
}
|
|
|
|
|
2012-06-22 15:52:09 +02:00
|
|
|
void calc_load_exit_idle(void)
|
2010-04-22 21:50:19 +02:00
|
|
|
{
|
2012-06-22 15:52:09 +02:00
|
|
|
struct rq *this_rq = this_rq();
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we're still before the sample window, we're done.
|
|
|
|
*/
|
|
|
|
if (time_before(jiffies, this_rq->calc_load_update))
|
|
|
|
return;
|
2010-04-22 21:50:19 +02:00
|
|
|
|
|
|
|
/*
|
2012-06-22 15:52:09 +02:00
|
|
|
* We woke inside or after the sample window, this means we're already
|
|
|
|
* accounted through the nohz accounting, so skip the entire deal and
|
|
|
|
* sync up for the next window.
|
2010-04-22 21:50:19 +02:00
|
|
|
*/
|
2012-06-22 15:52:09 +02:00
|
|
|
this_rq->calc_load_update = calc_load_update;
|
|
|
|
if (time_before(jiffies, this_rq->calc_load_update + 10))
|
|
|
|
this_rq->calc_load_update += LOAD_FREQ;
|
|
|
|
}
|
|
|
|
|
|
|
|
static long calc_load_fold_idle(void)
|
|
|
|
{
|
|
|
|
int idx = calc_load_read_idx();
|
|
|
|
long delta = 0;
|
|
|
|
|
|
|
|
if (atomic_long_read(&calc_load_idle[idx]))
|
|
|
|
delta = atomic_long_xchg(&calc_load_idle[idx], 0);
|
2010-04-22 21:50:19 +02:00
|
|
|
|
|
|
|
return delta;
|
|
|
|
}
|
2010-11-30 19:48:45 +01:00
|
|
|
|
|
|
|
/**
 * fixed_power_int - compute: x^n, in O(log n) time
 *
 * @x:         base of the power
 * @frac_bits: fractional bits of @x
 * @n:         power to raise @x to.
 *
 * By exploiting the relation between the definition of the natural power
 * function: x^n := x*x*...*x (x multiplied by itself for n times), and
 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
 * (where: n_i \elem {0, 1}, the binary vector representing n),
 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
 * of course trivially computable in O(log_2 n), the length of our binary
 * vector.
 *
 * Every multiplication is rounded to nearest by adding half a unit in the
 * last place before shifting the extra fractional bits back out.
 *
 * Returns x^n in the same fixed-point format as @x; for @n == 0 that is
 * 1.0, i.e. 1UL << @frac_bits.
 */
static unsigned long
fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
{
	unsigned long result = 1UL << frac_bits;
	/*
	 * Rounding term. With no fractional bits there is nothing to round,
	 * and "frac_bits - 1" would wrap to an out-of-range shift count.
	 */
	const unsigned long half = frac_bits ? 1UL << (frac_bits - 1) : 0;

	while (n) {
		/* This bit of n is set: multiply the current x^(2^i) in. */
		if (n & 1) {
			result *= x;
			result += half;
			result >>= frac_bits;
		}
		n >>= 1;
		/* Stop before squaring once more than needed. */
		if (!n)
			break;
		x *= x;
		x += half;
		x >>= frac_bits;
	}

	return result;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* a1 = a0 * e + a * (1 - e)
|
|
|
|
*
|
|
|
|
* a2 = a1 * e + a * (1 - e)
|
|
|
|
* = (a0 * e + a * (1 - e)) * e + a * (1 - e)
|
|
|
|
* = a0 * e^2 + a * (1 - e) * (1 + e)
|
|
|
|
*
|
|
|
|
* a3 = a2 * e + a * (1 - e)
|
|
|
|
* = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
|
|
|
|
* = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
|
|
|
|
*
|
|
|
|
* ...
|
|
|
|
*
|
|
|
|
* an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^(n-1)) [1]
|
|
|
|
* = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
|
|
|
|
* = a0 * e^n + a * (1 - e^n)
|
|
|
|
*
|
|
|
|
* [1] application of the geometric series:
|
|
|
|
*
|
|
|
|
* n 1 - x^(n+1)
|
|
|
|
* S_n := \Sum x^i = -------------
|
|
|
|
* i=0 1 - x
|
|
|
|
*/
|
|
|
|
static unsigned long
|
|
|
|
calc_load_n(unsigned long load, unsigned long exp,
|
|
|
|
unsigned long active, unsigned int n)
|
|
|
|
{
|
|
|
|
|
|
|
|
return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* NO_HZ can leave us missing all per-cpu ticks calling
|
|
|
|
* calc_load_account_active(), but since an idle CPU folds its delta into
|
|
|
|
* calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
|
|
|
|
* in the pending idle delta if our idle period crossed a load cycle boundary.
|
|
|
|
*
|
|
|
|
* Once we've updated the global active value, we need to apply the exponential
|
|
|
|
* weights adjusted to the number of cycles missed.
|
|
|
|
*/
|
2012-03-01 15:04:46 +01:00
|
|
|
static void calc_global_nohz(void)
|
2010-11-30 19:48:45 +01:00
|
|
|
{
|
|
|
|
long delta, active, n;
|
|
|
|
|
2012-06-22 15:52:09 +02:00
|
|
|
if (!time_before(jiffies, calc_load_update + 10)) {
|
|
|
|
/*
|
|
|
|
* Catch-up, fold however many we are behind still
|
|
|
|
*/
|
|
|
|
delta = jiffies - calc_load_update - 10;
|
|
|
|
n = 1 + (delta / LOAD_FREQ);
|
2010-11-30 19:48:45 +01:00
|
|
|
|
2012-06-22 15:52:09 +02:00
|
|
|
active = atomic_long_read(&calc_load_tasks);
|
|
|
|
active = active > 0 ? active * FIXED_1 : 0;
|
2010-11-30 19:48:45 +01:00
|
|
|
|
2012-06-22 15:52:09 +02:00
|
|
|
avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
|
|
|
|
avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
|
|
|
|
avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
|
2010-11-30 19:48:45 +01:00
|
|
|
|
2012-06-22 15:52:09 +02:00
|
|
|
calc_load_update += n * LOAD_FREQ;
|
|
|
|
}
|
2010-04-22 21:50:19 +02:00
|
|
|
|
2012-06-22 15:52:09 +02:00
|
|
|
/*
|
|
|
|
* Flip the idle index...
|
|
|
|
*
|
|
|
|
* Make sure we first write the new time then flip the index, so that
|
|
|
|
* calc_load_write_idx() will see the new time when it reads the new
|
|
|
|
* index, this avoids a double flip messing things up.
|
|
|
|
*/
|
|
|
|
smp_wmb();
|
|
|
|
calc_load_idx++;
|
2010-04-22 21:50:19 +02:00
|
|
|
}
|
2012-06-22 15:52:09 +02:00
|
|
|
#else /* !CONFIG_NO_HZ */
|
2010-11-30 19:48:45 +01:00
|
|
|
|
2012-06-22 15:52:09 +02:00
|
|
|
/* !CONFIG_NO_HZ: there is never an idle delta to fold in. */
static inline long calc_load_fold_idle(void)
{
	return 0;
}

/* !CONFIG_NO_HZ: no catch-up work to do. */
static inline void calc_global_nohz(void)
{
}
|
2010-04-22 21:50:19 +02:00
|
|
|
|
2012-06-22 15:52:09 +02:00
|
|
|
#endif /* CONFIG_NO_HZ */
|
2007-05-08 00:32:51 -07:00
|
|
|
|
|
|
|
/*
|
2009-04-11 10:43:41 +02:00
|
|
|
* calc_load - update the avenrun load estimates 10 ticks after the
|
|
|
|
* CPUs have updated calc_load_tasks.
|
2006-12-10 02:20:22 -08:00
|
|
|
*/
|
2010-11-30 19:48:45 +01:00
|
|
|
void calc_global_load(unsigned long ticks)
|
2006-12-10 02:20:22 -08:00
|
|
|
{
|
2012-06-22 15:52:09 +02:00
|
|
|
long active, delta;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2010-11-30 19:48:45 +01:00
|
|
|
if (time_before(jiffies, calc_load_update + 10))
|
2009-04-11 10:43:41 +02:00
|
|
|
return;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2012-06-22 15:52:09 +02:00
|
|
|
/*
|
|
|
|
* Fold the 'old' idle-delta to include all NO_HZ cpus.
|
|
|
|
*/
|
|
|
|
delta = calc_load_fold_idle();
|
|
|
|
if (delta)
|
|
|
|
atomic_long_add(delta, &calc_load_tasks);
|
|
|
|
|
2009-04-11 10:43:41 +02:00
|
|
|
active = atomic_long_read(&calc_load_tasks);
|
|
|
|
active = active > 0 ? active * FIXED_1 : 0;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2009-04-11 10:43:41 +02:00
|
|
|
avenrun[0] = calc_load(avenrun[0], EXP_1, active);
|
|
|
|
avenrun[1] = calc_load(avenrun[1], EXP_5, active);
|
|
|
|
avenrun[2] = calc_load(avenrun[2], EXP_15, active);
|
2007-07-09 18:51:59 +02:00
|
|
|
|
2009-04-11 10:43:41 +02:00
|
|
|
calc_load_update += LOAD_FREQ;
|
2012-03-01 15:04:46 +01:00
|
|
|
|
|
|
|
/*
|
2012-06-22 15:52:09 +02:00
|
|
|
* In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
|
2012-03-01 15:04:46 +01:00
|
|
|
*/
|
|
|
|
calc_global_nohz();
|
2009-04-11 10:43:41 +02:00
|
|
|
}
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2009-04-11 10:43:41 +02:00
|
|
|
/*
|
2010-04-22 21:50:19 +02:00
|
|
|
* Called from update_cpu_load() to periodically update this CPU's
|
|
|
|
* active count.
|
2009-04-11 10:43:41 +02:00
|
|
|
*/
|
|
|
|
static void calc_load_account_active(struct rq *this_rq)
|
|
|
|
{
|
2010-04-22 21:50:19 +02:00
|
|
|
long delta;
|
2006-12-10 02:20:29 -08:00
|
|
|
|
2010-04-22 21:50:19 +02:00
|
|
|
if (time_before(jiffies, this_rq->calc_load_update))
|
|
|
|
return;
|
2006-12-10 02:20:33 -08:00
|
|
|
|
2010-04-22 21:50:19 +02:00
|
|
|
delta = calc_load_fold_active(this_rq);
|
|
|
|
if (delta)
|
2009-04-11 10:43:41 +02:00
|
|
|
atomic_long_add(delta, &calc_load_tasks);
|
2010-04-22 21:50:19 +02:00
|
|
|
|
|
|
|
this_rq->calc_load_update += LOAD_FREQ;
|
2007-05-08 00:32:51 -07:00
|
|
|
}
|
|
|
|
|
2012-06-22 15:52:09 +02:00
|
|
|
/*
|
|
|
|
* End of global load-average stuff
|
|
|
|
*/
|
|
|
|
|
2010-05-17 18:14:43 -07:00
|
|
|
/*
|
|
|
|
* The exact cpuload at various idx values, calculated at every tick would be
|
|
|
|
* load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
|
|
|
|
*
|
|
|
|
* If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
|
|
|
|
* on nth tick when cpu may be busy, then we have:
|
|
|
|
* load = ((2^idx - 1) / 2^idx)^(n-1) * load
|
|
|
|
* load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
|
|
|
|
*
|
|
|
|
* decay_load_missed() below does efficient calculation of
|
|
|
|
* load = ((2^idx - 1) / 2^idx)^(n-1) * load
|
|
|
|
* avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
|
|
|
|
*
|
|
|
|
* The calculation is approximated on a 128 point scale.
|
|
|
|
* degrade_zero_ticks is the number of ticks after which load at any
|
|
|
|
* particular idx is approximated to be zero.
|
|
|
|
* degrade_factor is a precomputed table, a row for each load idx.
|
|
|
|
* Each column corresponds to degradation factor for a power of two ticks,
|
|
|
|
* based on 128 point scale.
|
|
|
|
* Example:
|
|
|
|
* row 2, col 3 (=12) says that the degradation at load idx 2 after
|
|
|
|
* 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
|
|
|
|
*
|
|
|
|
* With this power of 2 load factors, we can degrade the load n times
|
|
|
|
* by looking at 1 bits in n and doing as many mult/shift instead of
|
|
|
|
* n mult/shifts needed by the exact degradation.
|
|
|
|
*/
|
|
|
|
#define DEGRADE_SHIFT 7
|
|
|
|
static const unsigned char
|
|
|
|
degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
|
|
|
|
static const unsigned char
|
|
|
|
degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
|
|
|
|
{0, 0, 0, 0, 0, 0, 0, 0},
|
|
|
|
{64, 32, 8, 0, 0, 0, 0, 0},
|
|
|
|
{96, 72, 40, 12, 1, 0, 0},
|
|
|
|
{112, 98, 75, 43, 15, 1, 0},
|
|
|
|
{120, 112, 98, 76, 45, 16, 2} };
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Update cpu_load for any missed ticks, due to tickless idle. The backlog
|
|
|
|
* would be when CPU is idle and so we just decay the old load without
|
|
|
|
* adding any new load.
|
|
|
|
*/
|
|
|
|
static unsigned long
|
|
|
|
decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
|
|
|
|
{
|
|
|
|
int j = 0;
|
|
|
|
|
|
|
|
if (!missed_updates)
|
|
|
|
return load;
|
|
|
|
|
|
|
|
if (missed_updates >= degrade_zero_ticks[idx])
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (idx == 1)
|
|
|
|
return load >> missed_updates;
|
|
|
|
|
|
|
|
while (missed_updates) {
|
|
|
|
if (missed_updates % 2)
|
|
|
|
load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
|
|
|
|
|
|
|
|
missed_updates >>= 1;
|
|
|
|
j++;
|
|
|
|
}
|
|
|
|
return load;
|
|
|
|
}
|
|
|
|
|
2007-05-08 00:32:51 -07:00
|
|
|
/*
|
2007-07-09 18:51:59 +02:00
|
|
|
* Update rq->cpu_load[] statistics. This function is usually called every
|
2010-05-17 18:14:43 -07:00
|
|
|
* scheduler tick (TICK_NSEC). With tickless idle this will not be called
|
|
|
|
* every tick. We fix it up based on jiffies.
|
2007-05-08 00:32:51 -07:00
|
|
|
*/
|
2012-05-11 17:31:26 +02:00
|
|
|
static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
|
|
|
|
unsigned long pending_updates)
|
2007-05-08 00:32:51 -07:00
|
|
|
{
|
2007-07-09 18:51:59 +02:00
|
|
|
int i, scale;
|
2007-05-08 00:32:51 -07:00
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
this_rq->nr_load_updates++;
|
2007-05-08 00:32:51 -07:00
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
/* Update our load: */
|
2010-05-17 18:14:43 -07:00
|
|
|
this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
|
|
|
|
for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
|
2007-07-09 18:51:59 +02:00
|
|
|
unsigned long old_load, new_load;
|
2008-11-25 02:35:09 +10:30
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
/* scale is effectively 1 << i now, and >> i divides by scale */
|
2007-05-08 00:32:51 -07:00
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
old_load = this_rq->cpu_load[i];
|
2010-05-17 18:14:43 -07:00
|
|
|
old_load = decay_load_missed(old_load, pending_updates - 1, i);
|
2007-07-09 18:51:59 +02:00
|
|
|
new_load = this_load;
|
2007-10-15 17:00:03 +02:00
|
|
|
/*
|
|
|
|
* Round up the averaging division if load is increasing. This
|
|
|
|
* prevents us from getting stuck on 9 if the load is 10, for
|
|
|
|
* example.
|
|
|
|
*/
|
|
|
|
if (new_load > old_load)
|
2010-05-17 18:14:43 -07:00
|
|
|
new_load += scale - 1;
|
|
|
|
|
|
|
|
this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
|
2007-07-09 18:51:59 +02:00
|
|
|
}
|
2010-08-23 13:42:51 -07:00
|
|
|
|
|
|
|
sched_avg_update(this_rq);
|
2010-05-17 18:14:43 -07:00
|
|
|
}
|
|
|
|
|
2012-05-17 17:15:29 +02:00
|
|
|
#ifdef CONFIG_NO_HZ
|
|
|
|
/*
|
|
|
|
* There is no sane way to deal with nohz on smp when using jiffies because the
|
|
|
|
* cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
|
|
|
|
* causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
|
|
|
|
*
|
|
|
|
* Therefore we cannot use the delta approach from the regular tick since that
|
|
|
|
* would seriously skew the load calculation. However we'll make do for those
|
|
|
|
* updates happening while idle (nohz_idle_balance) or coming out of idle
|
|
|
|
* (tick_nohz_idle_exit).
|
|
|
|
*
|
|
|
|
* This means we might still be one tick off for nohz periods.
|
|
|
|
*/
|
|
|
|
|
2012-05-11 17:31:26 +02:00
|
|
|
/*
|
|
|
|
* Called from nohz_idle_balance() to update the load ratings before doing the
|
|
|
|
* idle balance.
|
|
|
|
*/
|
|
|
|
void update_idle_cpu_load(struct rq *this_rq)
|
|
|
|
{
|
2012-05-17 17:15:29 +02:00
|
|
|
unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
|
2012-05-11 17:31:26 +02:00
|
|
|
unsigned long load = this_rq->load.weight;
|
|
|
|
unsigned long pending_updates;
|
|
|
|
|
|
|
|
/*
|
2012-05-17 17:15:29 +02:00
|
|
|
* bail if there's load or we're actually up-to-date.
|
2012-05-11 17:31:26 +02:00
|
|
|
*/
|
|
|
|
if (load || curr_jiffies == this_rq->last_load_update_tick)
|
|
|
|
return;
|
|
|
|
|
|
|
|
pending_updates = curr_jiffies - this_rq->last_load_update_tick;
|
|
|
|
this_rq->last_load_update_tick = curr_jiffies;
|
|
|
|
|
|
|
|
__update_cpu_load(this_rq, load, pending_updates);
|
|
|
|
}
|
|
|
|
|
2012-05-17 17:15:29 +02:00
|
|
|
/*
|
|
|
|
* Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
|
|
|
|
*/
|
|
|
|
void update_cpu_load_nohz(void)
|
|
|
|
{
|
|
|
|
struct rq *this_rq = this_rq();
|
|
|
|
unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
|
|
|
|
unsigned long pending_updates;
|
|
|
|
|
|
|
|
if (curr_jiffies == this_rq->last_load_update_tick)
|
|
|
|
return;
|
|
|
|
|
|
|
|
raw_spin_lock(&this_rq->lock);
|
|
|
|
pending_updates = curr_jiffies - this_rq->last_load_update_tick;
|
|
|
|
if (pending_updates) {
|
|
|
|
this_rq->last_load_update_tick = curr_jiffies;
|
|
|
|
/*
|
|
|
|
* We were idle, this means load 0, the current load might be
|
|
|
|
* !0 due to remote wakeups and the sort.
|
|
|
|
*/
|
|
|
|
__update_cpu_load(this_rq, 0, pending_updates);
|
|
|
|
}
|
|
|
|
raw_spin_unlock(&this_rq->lock);
|
|
|
|
}
|
|
|
|
#endif /* CONFIG_NO_HZ */
|
|
|
|
|
2012-05-11 17:31:26 +02:00
|
|
|
/*
|
|
|
|
* Called from scheduler_tick()
|
|
|
|
*/
|
2010-05-17 18:14:43 -07:00
|
|
|
static void update_cpu_load_active(struct rq *this_rq)
|
|
|
|
{
|
2012-05-11 17:31:26 +02:00
|
|
|
/*
|
2012-05-17 17:15:29 +02:00
|
|
|
* See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
|
2012-05-11 17:31:26 +02:00
|
|
|
*/
|
|
|
|
this_rq->last_load_update_tick = jiffies;
|
|
|
|
__update_cpu_load(this_rq, this_rq->load.weight, 1);
|
2007-05-08 00:32:51 -07:00
|
|
|
|
2010-04-22 21:50:19 +02:00
|
|
|
calc_load_account_active(this_rq);
|
2007-05-08 00:32:51 -07:00
|
|
|
}
|
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
#ifdef CONFIG_SMP
|
sched: don't rebalance if attached on NULL domain
Impact: fix function graph trace hang / drop pointless softirq on UP
While debugging a function graph trace hang on an old PII, I saw
that it consumed most of its time on the timer interrupt. And
the domain rebalancing softirq was the most concerned.
The timer interrupt calls trigger_load_balance() which will
decide if it is worth to schedule a rebalancing softirq.
In case of builtin UP kernel, no problem arises because there is
no domain question.
In case of builtin SMP kernel running on an SMP box, still no
problem, the softirq will be raised each time we reach the
next_balance time.
In case of builtin SMP kernel running on a UP box (most distros
provide default SMP kernels, whatever the box you have), then
the CPU is attached to the NULL sched domain. So a kind of
unexpected behaviour happen:
trigger_load_balance() -> raises the rebalancing softirq later
on softirq: run_rebalance_domains() -> rebalance_domains() where
the for_each_domain(cpu, sd) is not taken because of the NULL
domain we are attached at. Which means rq->next_balance is never
updated. So on the next timer tick, we will enter
trigger_load_balance() which will always reschedule() the
rebalancing softirq:
if (time_after_eq(jiffies, rq->next_balance))
raise_softirq(SCHED_SOFTIRQ);
So for each tick, we process this pointless softirq.
This patch fixes it by checking if we are attached to the null
domain before raising the softirq, another possible fix would be
to set the maximal possible JIFFIES value to rq->next_balance if
we are attached to the NULL domain.
v2: build fix on UP
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <49af242d.1c07d00a.32d5.ffffc019@mx.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-03-05 01:27:02 +01:00
|
|
|
|
2007-05-08 00:32:51 -07:00
|
|
|
/*
|
2009-12-16 18:04:37 +01:00
|
|
|
* sched_exec - execve() is a valuable balancing opportunity, because at
|
|
|
|
* this point the task has the smallest effective memory and cache footprint.
|
2007-05-08 00:32:51 -07:00
|
|
|
*/
|
2009-12-16 18:04:37 +01:00
|
|
|
void sched_exec(void)
|
2007-05-08 00:32:51 -07:00
|
|
|
{
|
2009-12-16 18:04:37 +01:00
|
|
|
struct task_struct *p = current;
|
2005-04-16 15:20:36 -07:00
|
|
|
unsigned long flags;
|
2010-03-24 18:34:10 +01:00
|
|
|
int dest_cpu;
|
2007-05-08 00:32:51 -07:00
|
|
|
|
2011-04-05 17:23:53 +02:00
|
|
|
raw_spin_lock_irqsave(&p->pi_lock, flags);
|
2011-04-05 17:23:46 +02:00
|
|
|
dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
|
2010-03-24 18:34:10 +01:00
|
|
|
if (dest_cpu == smp_processor_id())
|
|
|
|
goto unlock;
|
2009-12-16 18:04:37 +01:00
|
|
|
|
2011-04-05 17:23:53 +02:00
|
|
|
if (likely(cpu_active(dest_cpu))) {
|
2010-05-06 18:49:21 +02:00
|
|
|
struct migration_arg arg = { p, dest_cpu };
|
2007-05-08 00:32:51 -07:00
|
|
|
|
2011-04-05 17:23:53 +02:00
|
|
|
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
|
|
|
|
stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
|
2005-04-16 15:20:36 -07:00
|
|
|
return;
|
|
|
|
}
|
2010-03-24 18:34:10 +01:00
|
|
|
unlock:
|
2011-04-05 17:23:53 +02:00
|
|
|
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
2007-07-09 18:51:59 +02:00
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
#endif
|
|
|
|
|
|
|
|
DEFINE_PER_CPU(struct kernel_stat, kstat);
|
2011-11-28 14:45:17 -02:00
|
|
|
DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
EXPORT_PER_CPU_SYMBOL(kstat);
|
2011-11-28 14:45:17 -02:00
|
|
|
EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
/*
|
2009-03-31 16:56:03 +09:00
|
|
|
* Return any ns on the sched_clock that have not yet been accounted in
|
timers: fix itimer/many thread hang
Overview
This patch reworks the handling of POSIX CPU timers, including the
ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together
with the help of Roland McGrath, the owner and original writer of this code.
The problem we ran into, and the reason for this rework, has to do with using
a profiling timer in a process with a large number of threads. It appears
that the performance of the old implementation of run_posix_cpu_timers() was
at least O(n*3) (where "n" is the number of threads in a process) or worse.
Everything is fine with an increasing number of threads until the time taken
for that routine to run becomes the same as or greater than the tick time, at
which point things degrade rather quickly.
This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF."
Code Changes
This rework corrects the implementation of run_posix_cpu_timers() to make it
run in constant time for a particular machine. (Performance may vary between
one machine and another depending upon whether the kernel is built as single-
or multiprocessor and, in the latter case, depending upon the number of
running processors.) To do this, at each tick we now update fields in
signal_struct as well as task_struct. The run_posix_cpu_timers() function
uses those fields to make its decisions.
We define a new structure, "task_cputime," to contain user, system and
scheduler times and use these in appropriate places:
struct task_cputime {
cputime_t utime;
cputime_t stime;
unsigned long long sum_exec_runtime;
};
This is included in the structure "thread_group_cputime," which is a new
substructure of signal_struct and which varies for uniprocessor versus
multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as
a simple substructure, while for multiprocessor kernels it is a pointer:
struct thread_group_cputime {
struct task_cputime totals;
};
struct thread_group_cputime {
struct task_cputime *totals;
};
We also add a new task_cputime substructure directly to signal_struct, to
cache the earliest expiration of process-wide timers, and task_cputime also
replaces the it_*_expires fields of task_struct (used for earliest expiration
of thread timers). The "thread_group_cputime" structure contains process-wide
timers that are updated via account_user_time() and friends. In the non-SMP
case the structure is a simple aggregator; unfortunately in the SMP case that
simplicity was not achievable due to cache-line contention between CPUs (in
one measured case performance was actually _worse_ on a 16-cpu system than
the same test on a 4-cpu system, due to this contention). For SMP, the
thread_group_cputime counters are maintained as a per-cpu structure allocated
using alloc_percpu(). The timer functions update only the timer field in
the structure corresponding to the running CPU, obtained using per_cpu_ptr().
We define a set of inline functions in sched.h that we use to maintain the
thread_group_cputime structure and hide the differences between UP and SMP
implementations from the rest of the kernel. The thread_group_cputime_init()
function initializes the thread_group_cputime structure for the given task.
The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the
out-of-line function thread_group_cputime_alloc_smp() to allocate and fill
in the per-cpu structures and fields. The thread_group_cputime_free()
function, also a no-op for UP, in SMP frees the per-cpu structures. The
thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls
thread_group_cputime_alloc() if the per-cpu structures haven't yet been
allocated. The thread_group_cputime() function fills the task_cputime
structure it is passed with the contents of the thread_group_cputime fields;
in UP it's that simple but in SMP it must also safely check that tsk->signal
is non-NULL (if it is it just uses the appropriate fields of task_struct) and,
if so, sums the per-cpu values for each online CPU. Finally, the three
functions account_group_user_time(), account_group_system_time() and
account_group_exec_runtime() are used by timer functions to update the
respective fields of the thread_group_cputime structure.
Non-SMP operation is trivial and will not be mentioned further.
The per-cpu structure is always allocated when a task creates its first new
thread, via a call to thread_group_cputime_clone_thread() from copy_signal().
It is freed at process exit via a call to thread_group_cputime_free() from
cleanup_signal().
All functions that formerly summed utime/stime/sum_sched_runtime values from
all threads in the thread group now use thread_group_cputime() to
snapshot the values in the thread_group_cputime structure or the values in
the task structure itself if the per-cpu structure hasn't been allocated.
Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit.
The run_posix_cpu_timers() function has been split into a fast path and a
slow path; the former safely checks whether there are any expired thread
timers and, if not, just returns, while the slow path does the heavy lifting.
With the dedicated thread group fields, timers are no longer "rebalanced" and
the process_timer_rebalance() function and related code has gone away. All
summing loops are gone and all code that used them now uses the
thread_group_cputime() inline. When process-wide timers are set, the new
task_cputime structure in signal_struct is used to cache the earliest
expiration; this is checked in the fast path.
Performance
The fix appears not to add significant overhead to existing operations. It
generally performs the same as the current code except in two cases, one in
which it performs slightly worse (Case 5 below) and one in which it performs
very significantly better (Case 2 below). Overall it's a wash except in those
two cases.
I've since done somewhat more involved testing on a dual-core Opteron system.
Case 1: With no itimer running, for a test with 100,000 threads, the fixed
kernel took 1428.5 seconds, 513 seconds more than the unfixed system,
all of which was spent in the system. There were twice as many
voluntary context switches with the fix as without it.
Case 2: With an itimer running at .01 second ticks and 4000 threads (the most
an unmodified kernel can handle), the fixed kernel ran the test in
eight percent of the time (5.8 seconds as opposed to 70 seconds) and
had better tick accuracy (.012 seconds per tick as opposed to .023
seconds per tick).
Case 3: A 4000-thread test with an initial timer tick of .01 second and an
interval of 10,000 seconds (i.e. a timer that ticks only once) had
very nearly the same performance in both cases: 6.3 seconds elapsed
for the fixed kernel versus 5.5 seconds for the unfixed kernel.
With fewer threads (eight in these tests), the Case 1 test ran in essentially
the same time on both the modified and unmodified kernels (5.2 seconds versus
5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds
versus 5.4 seconds but again with much better tick accuracy, .013 seconds per
tick versus .025 seconds per tick for the unmodified kernel.
Since the fix affected the rlimit code, I also tested soft and hard CPU limits.
Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer
running), the modified kernel was very slightly favored in that while
it killed the process in 19.997 seconds of CPU time (5.002 seconds of
wall time), only .003 seconds of that was system time, the rest was
user time. The unmodified kernel killed the process in 20.001 seconds
of CPU (5.014 seconds of wall time) of which .016 seconds was system
time. Really, though, the results were too close to call. The results
were essentially the same with no itimer running.
Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds
(where the hard limit would never be reached) and an itimer running,
the modified kernel exhibited worse tick accuracy than the unmodified
kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise,
performance was almost indistinguishable. With no itimer running this
test exhibited virtually identical behavior and times in both cases.
In times past I did some limited performance testing. those results are below.
On a four-cpu Opteron system without this fix, a sixteen-thread test executed
in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On
the same system with the fix, user and elapsed time were about the same, but
system time dropped to 0.007 seconds. Performance with eight, four and one
thread were comparable. Interestingly, the timer ticks with the fix seemed
more accurate: The sixteen-thread test with the fix received 149543 ticks
for 0.024 seconds per tick, while the same test without the fix received 58720
for 0.061 seconds per tick. Both cases were configured for an interval of
0.01 seconds. Again, the other tests were comparable. Each thread in this
test computed the primes up to 25,000,000.
I also did a test with a large number of threads, 100,000 threads, which is
impossible without the fix. In this case each thread computed the primes only
up to 10,000 (to make the runtime manageable). System time dominated, at
1546.968 seconds out of a total 2176.906 seconds (giving a user time of
629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite
accurate. There is obviously no comparable test without the fix.
Signed-off-by: Frank Mayhar <fmayhar@google.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-12 09:54:39 -07:00
|
|
|
* @p in case that task is currently running.
|
2009-03-31 16:56:03 +09:00
|
|
|
*
|
|
|
|
* Called with task_rq_lock() held on @rq.
|
2005-04-16 15:20:36 -07:00
|
|
|
*/
|
2009-03-31 16:56:03 +09:00
|
|
|
static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
|
|
|
|
{
|
|
|
|
u64 ns = 0;
|
|
|
|
|
|
|
|
if (task_current(rq, p)) {
|
|
|
|
update_rq_clock(rq);
|
2010-10-04 17:03:21 -07:00
|
|
|
ns = rq->clock_task - p->se.exec_start;
|
2009-03-31 16:56:03 +09:00
|
|
|
if ((s64)ns < 0)
|
|
|
|
ns = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
return ns;
|
|
|
|
}
|
|
|
|
|
2008-09-12 09:54:39 -07:00
|
|
|
unsigned long long task_delta_exec(struct task_struct *p)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
|
|
|
unsigned long flags;
|
2007-07-09 18:51:58 +02:00
|
|
|
struct rq *rq;
|
2008-09-12 09:54:39 -07:00
|
|
|
u64 ns = 0;
|
2006-07-03 00:25:40 -07:00
|
|
|
|
2007-07-09 18:51:58 +02:00
|
|
|
rq = task_rq_lock(p, &flags);
|
2009-03-31 16:56:03 +09:00
|
|
|
ns = do_task_delta_exec(p, rq);
|
2011-04-05 17:23:51 +02:00
|
|
|
task_rq_unlock(rq, p, &flags);
|
2008-09-30 08:28:17 +02:00
|
|
|
|
2009-03-31 16:56:03 +09:00
|
|
|
return ns;
|
|
|
|
}
|
timers: fix itimer/many thread hang
Overview
This patch reworks the handling of POSIX CPU timers, including the
ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together
with the help of Roland McGrath, the owner and original writer of this code.
The problem we ran into, and the reason for this rework, has to do with using
a profiling timer in a process with a large number of threads. It appears
that the performance of the old implementation of run_posix_cpu_timers() was
at least O(n*3) (where "n" is the number of threads in a process) or worse.
Everything is fine with an increasing number of threads until the time taken
for that routine to run becomes the same as or greater than the tick time, at
which point things degrade rather quickly.
This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF."
Code Changes
This rework corrects the implementation of run_posix_cpu_timers() to make it
run in constant time for a particular machine. (Performance may vary between
one machine and another depending upon whether the kernel is built as single-
or multiprocessor and, in the latter case, depending upon the number of
running processors.) To do this, at each tick we now update fields in
signal_struct as well as task_struct. The run_posix_cpu_timers() function
uses those fields to make its decisions.
We define a new structure, "task_cputime," to contain user, system and
scheduler times and use these in appropriate places:
struct task_cputime {
cputime_t utime;
cputime_t stime;
unsigned long long sum_exec_runtime;
};
This is included in the structure "thread_group_cputime," which is a new
substructure of signal_struct and which varies for uniprocessor versus
multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as
a simple substructure, while for multiprocessor kernels it is a pointer:
struct thread_group_cputime {
struct task_cputime totals;
};
struct thread_group_cputime {
struct task_cputime *totals;
};
We also add a new task_cputime substructure directly to signal_struct, to
cache the earliest expiration of process-wide timers, and task_cputime also
replaces the it_*_expires fields of task_struct (used for earliest expiration
of thread timers). The "thread_group_cputime" structure contains process-wide
timers that are updated via account_user_time() and friends. In the non-SMP
case the structure is a simple aggregator; unfortunately in the SMP case that
simplicity was not achievable due to cache-line contention between CPUs (in
one measured case performance was actually _worse_ on a 16-cpu system than
the same test on a 4-cpu system, due to this contention). For SMP, the
thread_group_cputime counters are maintained as a per-cpu structure allocated
using alloc_percpu(). The timer functions update only the timer field in
the structure corresponding to the running CPU, obtained using per_cpu_ptr().
We define a set of inline functions in sched.h that we use to maintain the
thread_group_cputime structure and hide the differences between UP and SMP
implementations from the rest of the kernel. The thread_group_cputime_init()
function initializes the thread_group_cputime structure for the given task.
The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the
out-of-line function thread_group_cputime_alloc_smp() to allocate and fill
in the per-cpu structures and fields. The thread_group_cputime_free()
function, also a no-op for UP, in SMP frees the per-cpu structures. The
thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls
thread_group_cputime_alloc() if the per-cpu structures haven't yet been
allocated. The thread_group_cputime() function fills the task_cputime
structure it is passed with the contents of the thread_group_cputime fields;
in UP it's that simple but in SMP it must also safely check that tsk->signal
is non-NULL (if it is it just uses the appropriate fields of task_struct) and,
if so, sums the per-cpu values for each online CPU. Finally, the three
functions account_group_user_time(), account_group_system_time() and
account_group_exec_runtime() are used by timer functions to update the
respective fields of the thread_group_cputime structure.
Non-SMP operation is trivial and will not be mentioned further.
The per-cpu structure is always allocated when a task creates its first new
thread, via a call to thread_group_cputime_clone_thread() from copy_signal().
It is freed at process exit via a call to thread_group_cputime_free() from
cleanup_signal().
All functions that formerly summed utime/stime/sum_sched_runtime values
from all threads in the thread group now use thread_group_cputime() to
snapshot the values in the thread_group_cputime structure or the values in
the task structure itself if the per-cpu structure hasn't been allocated.
Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit.
The run_posix_cpu_timers() function has been split into a fast path and a
slow path; the former safely checks whether there are any expired thread
timers and, if not, just returns, while the slow path does the heavy lifting.
With the dedicated thread group fields, timers are no longer "rebalanced" and
the process_timer_rebalance() function and related code has gone away. All
summing loops are gone and all code that used them now uses the
thread_group_cputime() inline. When process-wide timers are set, the new
task_cputime structure in signal_struct is used to cache the earliest
expiration; this is checked in the fast path.
Performance
The fix appears not to add significant overhead to existing operations. It
generally performs the same as the current code except in two cases, one in
which it performs slightly worse (Case 5 below) and one in which it performs
very significantly better (Case 2 below). Overall it's a wash except in those
two cases.
I've since done somewhat more involved testing on a dual-core Opteron system.
Case 1: With no itimer running, for a test with 100,000 threads, the fixed
kernel took 1428.5 seconds, 513 seconds more than the unfixed system,
all of which was spent in the system. There were twice as many
voluntary context switches with the fix as without it.
Case 2: With an itimer running at .01 second ticks and 4000 threads (the most
an unmodified kernel can handle), the fixed kernel ran the test in
eight percent of the time (5.8 seconds as opposed to 70 seconds) and
had better tick accuracy (.012 seconds per tick as opposed to .023
seconds per tick).
Case 3: A 4000-thread test with an initial timer tick of .01 second and an
interval of 10,000 seconds (i.e. a timer that ticks only once) had
very nearly the same performance in both cases: 6.3 seconds elapsed
for the fixed kernel versus 5.5 seconds for the unfixed kernel.
With fewer threads (eight in these tests), the Case 1 test ran in essentially
the same time on both the modified and unmodified kernels (5.2 seconds versus
5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds
versus 5.4 seconds but again with much better tick accuracy, .013 seconds per
tick versus .025 seconds per tick for the unmodified kernel.
Since the fix affected the rlimit code, I also tested soft and hard CPU limits.
Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer
running), the modified kernel was very slightly favored in that while
it killed the process in 19.997 seconds of CPU time (5.002 seconds of
wall time), only .003 seconds of that was system time, the rest was
user time. The unmodified kernel killed the process in 20.001 seconds
of CPU (5.014 seconds of wall time) of which .016 seconds was system
time. Really, though, the results were too close to call. The results
were essentially the same with no itimer running.
Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds
(where the hard limit would never be reached) and an itimer running,
the modified kernel exhibited worse tick accuracy than the unmodified
kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise,
performance was almost indistinguishable. With no itimer running this
test exhibited virtually identical behavior and times in both cases.
In times past I did some limited performance testing. Those results are below.
On a four-cpu Opteron system without this fix, a sixteen-thread test executed
in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On
the same system with the fix, user and elapsed time were about the same, but
system time dropped to 0.007 seconds. Performance with eight, four and one
thread were comparable. Interestingly, the timer ticks with the fix seemed
more accurate: The sixteen-thread test with the fix received 149543 ticks
for 0.024 seconds per tick, while the same test without the fix received 58720
for 0.061 seconds per tick. Both cases were configured for an interval of
0.01 seconds. Again, the other tests were comparable. Each thread in this
test computed the primes up to 25,000,000.
I also did a test with a large number of threads, 100,000 threads, which is
impossible without the fix. In this case each thread computed the primes only
up to 10,000 (to make the runtime manageable). System time dominated, at
1546.968 seconds out of a total 2176.906 seconds (giving a user time of
629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite
accurate. There is obviously no comparable test without the fix.
Signed-off-by: Frank Mayhar <fmayhar@google.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-12 09:54:39 -07:00
|
|
|
|
2009-03-31 16:56:03 +09:00
|
|
|
/*
|
|
|
|
* Return accounted runtime for the task.
|
|
|
|
* In case the task is currently running, return the runtime plus current's
|
|
|
|
* pending runtime that have not been accounted yet.
|
|
|
|
*/
|
|
|
|
unsigned long long task_sched_runtime(struct task_struct *p)
|
|
|
|
{
|
|
|
|
unsigned long flags;
|
|
|
|
struct rq *rq;
|
|
|
|
u64 ns = 0;
|
|
|
|
|
|
|
|
rq = task_rq_lock(p, &flags);
|
|
|
|
ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
|
2011-04-05 17:23:51 +02:00
|
|
|
task_rq_unlock(rq, p, &flags);
|
2009-03-31 16:56:03 +09:00
|
|
|
|
|
|
|
return ns;
|
|
|
|
}
|
2006-07-03 00:25:40 -07:00
|
|
|
|
2011-11-28 14:45:19 -02:00
|
|
|
/*
 * cpuacct cgroup objects, built only when CONFIG_CGROUP_CPUACCT is set.
 * task_group_account_field() tests cpuacct_subsys.active and ends its
 * hierarchy walk when it reaches &root_cpuacct.
 */
#ifdef CONFIG_CGROUP_CPUACCT
|
|
|
|
struct cgroup_subsys cpuacct_subsys;
|
|
|
|
struct cpuacct root_cpuacct;
|
|
|
|
#endif
|
|
|
|
|
2011-12-02 19:58:39 -02:00
|
|
|
static inline void task_group_account_field(struct task_struct *p, int index,
|
|
|
|
u64 tmp)
|
2011-11-28 14:45:19 -02:00
|
|
|
{
|
|
|
|
#ifdef CONFIG_CGROUP_CPUACCT
|
|
|
|
struct kernel_cpustat *kcpustat;
|
|
|
|
struct cpuacct *ca;
|
|
|
|
#endif
|
|
|
|
/*
|
|
|
|
* Since all updates are sure to touch the root cgroup, we
|
|
|
|
* get ourselves ahead and touch it first. If the root cgroup
|
|
|
|
* is the only cgroup, then nothing else should be necessary.
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
__get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
|
|
|
|
|
|
|
|
#ifdef CONFIG_CGROUP_CPUACCT
|
|
|
|
if (unlikely(!cpuacct_subsys.active))
|
|
|
|
return;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
ca = task_ca(p);
|
|
|
|
while (ca && (ca != &root_cpuacct)) {
|
|
|
|
kcpustat = this_cpu_ptr(ca->cpustat);
|
|
|
|
kcpustat->cpustat[index] += tmp;
|
|
|
|
ca = parent_ca(ca);
|
|
|
|
}
|
|
|
|
rcu_read_unlock();
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
|
|
|
* Account user cpu time to a process.
|
|
|
|
* @p: the process that the cpu time gets accounted to
|
|
|
|
* @cputime: the cpu time spent in user space since the last update
|
2008-12-31 15:11:37 +01:00
|
|
|
* @cputime_scaled: cputime scaled by cpu frequency
|
2005-04-16 15:20:36 -07:00
|
|
|
*/
|
2008-12-31 15:11:37 +01:00
|
|
|
void account_user_time(struct task_struct *p, cputime_t cputime,
|
|
|
|
cputime_t cputime_scaled)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2011-11-28 14:45:17 -02:00
|
|
|
int index;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2008-12-31 15:11:37 +01:00
|
|
|
/* Add user time to process. */
|
2011-12-15 14:56:09 +01:00
|
|
|
p->utime += cputime;
|
|
|
|
p->utimescaled += cputime_scaled;
|
timers: fix itimer/many thread hang
Overview
This patch reworks the handling of POSIX CPU timers, including the
ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together
with the help of Roland McGrath, the owner and original writer of this code.
The problem we ran into, and the reason for this rework, has to do with using
a profiling timer in a process with a large number of threads. It appears
that the performance of the old implementation of run_posix_cpu_timers() was
at least O(n*3) (where "n" is the number of threads in a process) or worse.
Everything is fine with an increasing number of threads until the time taken
for that routine to run becomes the same as or greater than the tick time, at
which point things degrade rather quickly.
This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF."
Code Changes
This rework corrects the implementation of run_posix_cpu_timers() to make it
run in constant time for a particular machine. (Performance may vary between
one machine and another depending upon whether the kernel is built as single-
or multiprocessor and, in the latter case, depending upon the number of
running processors.) To do this, at each tick we now update fields in
signal_struct as well as task_struct. The run_posix_cpu_timers() function
uses those fields to make its decisions.
We define a new structure, "task_cputime," to contain user, system and
scheduler times and use these in appropriate places:
struct task_cputime {
cputime_t utime;
cputime_t stime;
unsigned long long sum_exec_runtime;
};
This is included in the structure "thread_group_cputime," which is a new
substructure of signal_struct and which varies for uniprocessor versus
multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as
a simple substructure, while for multiprocessor kernels it is a pointer:
struct thread_group_cputime {
struct task_cputime totals;
};
struct thread_group_cputime {
struct task_cputime *totals;
};
We also add a new task_cputime substructure directly to signal_struct, to
cache the earliest expiration of process-wide timers, and task_cputime also
replaces the it_*_expires fields of task_struct (used for earliest expiration
of thread timers). The "thread_group_cputime" structure contains process-wide
timers that are updated via account_user_time() and friends. In the non-SMP
case the structure is a simple aggregator; unfortunately in the SMP case that
simplicity was not achievable due to cache-line contention between CPUs (in
one measured case performance was actually _worse_ on a 16-cpu system than
the same test on a 4-cpu system, due to this contention). For SMP, the
thread_group_cputime counters are maintained as a per-cpu structure allocated
using alloc_percpu(). The timer functions update only the timer field in
the structure corresponding to the running CPU, obtained using per_cpu_ptr().
We define a set of inline functions in sched.h that we use to maintain the
thread_group_cputime structure and hide the differences between UP and SMP
implementations from the rest of the kernel. The thread_group_cputime_init()
function initializes the thread_group_cputime structure for the given task.
The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the
out-of-line function thread_group_cputime_alloc_smp() to allocate and fill
in the per-cpu structures and fields. The thread_group_cputime_free()
function, also a no-op for UP, in SMP frees the per-cpu structures. The
thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls
thread_group_cputime_alloc() if the per-cpu structures haven't yet been
allocated. The thread_group_cputime() function fills the task_cputime
structure it is passed with the contents of the thread_group_cputime fields;
in UP it's that simple but in SMP it must also safely check that tsk->signal
is non-NULL (if it is it just uses the appropriate fields of task_struct) and,
if so, sums the per-cpu values for each online CPU. Finally, the three
functions account_group_user_time(), account_group_system_time() and
account_group_exec_runtime() are used by timer functions to update the
respective fields of the thread_group_cputime structure.
Non-SMP operation is trivial and will not be mentioned further.
The per-cpu structure is always allocated when a task creates its first new
thread, via a call to thread_group_cputime_clone_thread() from copy_signal().
It is freed at process exit via a call to thread_group_cputime_free() from
cleanup_signal().
All functions that formerly summed utime/stime/sum_sched_runtime values
from all threads in the thread group now use thread_group_cputime() to
snapshot the values in the thread_group_cputime structure or the values in
the task structure itself if the per-cpu structure hasn't been allocated.
Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit.
The run_posix_cpu_timers() function has been split into a fast path and a
slow path; the former safely checks whether there are any expired thread
timers and, if not, just returns, while the slow path does the heavy lifting.
With the dedicated thread group fields, timers are no longer "rebalanced" and
the process_timer_rebalance() function and related code has gone away. All
summing loops are gone and all code that used them now uses the
thread_group_cputime() inline. When process-wide timers are set, the new
task_cputime structure in signal_struct is used to cache the earliest
expiration; this is checked in the fast path.
Performance
The fix appears not to add significant overhead to existing operations. It
generally performs the same as the current code except in two cases, one in
which it performs slightly worse (Case 5 below) and one in which it performs
very significantly better (Case 2 below). Overall it's a wash except in those
two cases.
I've since done somewhat more involved testing on a dual-core Opteron system.
Case 1: With no itimer running, for a test with 100,000 threads, the fixed
kernel took 1428.5 seconds, 513 seconds more than the unfixed system,
all of which was spent in the system. There were twice as many
voluntary context switches with the fix as without it.
Case 2: With an itimer running at .01 second ticks and 4000 threads (the most
an unmodified kernel can handle), the fixed kernel ran the test in
eight percent of the time (5.8 seconds as opposed to 70 seconds) and
had better tick accuracy (.012 seconds per tick as opposed to .023
seconds per tick).
Case 3: A 4000-thread test with an initial timer tick of .01 second and an
interval of 10,000 seconds (i.e. a timer that ticks only once) had
very nearly the same performance in both cases: 6.3 seconds elapsed
for the fixed kernel versus 5.5 seconds for the unfixed kernel.
With fewer threads (eight in these tests), the Case 1 test ran in essentially
the same time on both the modified and unmodified kernels (5.2 seconds versus
5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds
versus 5.4 seconds but again with much better tick accuracy, .013 seconds per
tick versus .025 seconds per tick for the unmodified kernel.
Since the fix affected the rlimit code, I also tested soft and hard CPU limits.
Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer
running), the modified kernel was very slightly favored in that while
it killed the process in 19.997 seconds of CPU time (5.002 seconds of
wall time), only .003 seconds of that was system time, the rest was
user time. The unmodified kernel killed the process in 20.001 seconds
of CPU (5.014 seconds of wall time) of which .016 seconds was system
time. Really, though, the results were too close to call. The results
were essentially the same with no itimer running.
Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds
(where the hard limit would never be reached) and an itimer running,
the modified kernel exhibited worse tick accuracy than the unmodified
kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise,
performance was almost indistinguishable. With no itimer running this
test exhibited virtually identical behavior and times in both cases.
In times past I did some limited performance testing. Those results are below.
On a four-cpu Opteron system without this fix, a sixteen-thread test executed
in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On
the same system with the fix, user and elapsed time were about the same, but
system time dropped to 0.007 seconds. Performance with eight, four and one
thread were comparable. Interestingly, the timer ticks with the fix seemed
more accurate: The sixteen-thread test with the fix received 149543 ticks
for 0.024 seconds per tick, while the same test without the fix received 58720
for 0.061 seconds per tick. Both cases were configured for an interval of
0.01 seconds. Again, the other tests were comparable. Each thread in this
test computed the primes up to 25,000,000.
I also did a test with a large number of threads, 100,000 threads, which is
impossible without the fix. In this case each thread computed the primes only
up to 10,000 (to make the runtime manageable). System time dominated, at
1546.968 seconds out of a total 2176.906 seconds (giving a user time of
629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite
accurate. There is obviously no comparable test without the fix.
Signed-off-by: Frank Mayhar <fmayhar@google.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-12 09:54:39 -07:00
|
|
|
account_group_user_time(p, cputime);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2011-11-28 14:45:17 -02:00
|
|
|
index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
|
2009-03-31 10:02:22 +05:30
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/* Add user time to cpustat. */
|
2011-12-19 19:23:15 +01:00
|
|
|
task_group_account_field(p, index, (__force u64) cputime);
|
2009-03-31 10:02:22 +05:30
|
|
|
|
2008-07-25 01:48:40 -07:00
|
|
|
/* Account for user time used */
|
|
|
|
acct_update_integrals(p);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2007-10-15 17:00:19 +02:00
|
|
|
/*
|
|
|
|
* Account guest cpu time to a process.
|
|
|
|
* @p: the process that the cpu time gets accounted to
|
|
|
|
* @cputime: the cpu time spent in virtual machine since the last update
|
2008-12-31 15:11:37 +01:00
|
|
|
* @cputime_scaled: cputime scaled by cpu frequency
|
2007-10-15 17:00:19 +02:00
|
|
|
*/
|
2008-12-31 15:11:37 +01:00
|
|
|
static void account_guest_time(struct task_struct *p, cputime_t cputime,
|
|
|
|
cputime_t cputime_scaled)
|
2007-10-15 17:00:19 +02:00
|
|
|
{
|
2011-11-28 14:45:17 -02:00
|
|
|
u64 *cpustat = kcpustat_this_cpu->cpustat;
|
2007-10-15 17:00:19 +02:00
|
|
|
|
2008-12-31 15:11:37 +01:00
|
|
|
/* Add guest time to process. */
|
2011-12-15 14:56:09 +01:00
|
|
|
p->utime += cputime;
|
|
|
|
p->utimescaled += cputime_scaled;
|
timers: fix itimer/many thread hang
Overview
This patch reworks the handling of POSIX CPU timers, including the
ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together
with the help of Roland McGrath, the owner and original writer of this code.
The problem we ran into, and the reason for this rework, has to do with using
a profiling timer in a process with a large number of threads. It appears
that the performance of the old implementation of run_posix_cpu_timers() was
at least O(n*3) (where "n" is the number of threads in a process) or worse.
Everything is fine with an increasing number of threads until the time taken
for that routine to run becomes the same as or greater than the tick time, at
which point things degrade rather quickly.
This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF."
Code Changes
This rework corrects the implementation of run_posix_cpu_timers() to make it
run in constant time for a particular machine. (Performance may vary between
one machine and another depending upon whether the kernel is built as single-
or multiprocessor and, in the latter case, depending upon the number of
running processors.) To do this, at each tick we now update fields in
signal_struct as well as task_struct. The run_posix_cpu_timers() function
uses those fields to make its decisions.
We define a new structure, "task_cputime," to contain user, system and
scheduler times and use these in appropriate places:
struct task_cputime {
cputime_t utime;
cputime_t stime;
unsigned long long sum_exec_runtime;
};
This is included in the structure "thread_group_cputime," which is a new
substructure of signal_struct and which varies for uniprocessor versus
multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as
a simple substructure, while for multiprocessor kernels it is a pointer:
struct thread_group_cputime {
struct task_cputime totals;
};
struct thread_group_cputime {
struct task_cputime *totals;
};
We also add a new task_cputime substructure directly to signal_struct, to
cache the earliest expiration of process-wide timers, and task_cputime also
replaces the it_*_expires fields of task_struct (used for earliest expiration
of thread timers). The "thread_group_cputime" structure contains process-wide
timers that are updated via account_user_time() and friends. In the non-SMP
case the structure is a simple aggregator; unfortunately in the SMP case that
simplicity was not achievable due to cache-line contention between CPUs (in
one measured case performance was actually _worse_ on a 16-cpu system than
the same test on a 4-cpu system, due to this contention). For SMP, the
thread_group_cputime counters are maintained as a per-cpu structure allocated
using alloc_percpu(). The timer functions update only the timer field in
the structure corresponding to the running CPU, obtained using per_cpu_ptr().
We define a set of inline functions in sched.h that we use to maintain the
thread_group_cputime structure and hide the differences between UP and SMP
implementations from the rest of the kernel. The thread_group_cputime_init()
function initializes the thread_group_cputime structure for the given task.
The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the
out-of-line function thread_group_cputime_alloc_smp() to allocate and fill
in the per-cpu structures and fields. The thread_group_cputime_free()
function, also a no-op for UP, in SMP frees the per-cpu structures. The
thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls
thread_group_cputime_alloc() if the per-cpu structures haven't yet been
allocated. The thread_group_cputime() function fills the task_cputime
structure it is passed with the contents of the thread_group_cputime fields;
in UP it's that simple but in SMP it must also safely check that tsk->signal
is non-NULL (if it is it just uses the appropriate fields of task_struct) and,
if so, sums the per-cpu values for each online CPU. Finally, the three
functions account_group_user_time(), account_group_system_time() and
account_group_exec_runtime() are used by timer functions to update the
respective fields of the thread_group_cputime structure.
Non-SMP operation is trivial and will not be mentioned further.
The per-cpu structure is always allocated when a task creates its first new
thread, via a call to thread_group_cputime_clone_thread() from copy_signal().
It is freed at process exit via a call to thread_group_cputime_free() from
cleanup_signal().
All functions that formerly summed utime/stime/sum_sched_runtime values
from all threads in the thread group now use thread_group_cputime() to
snapshot the values in the thread_group_cputime structure or the values in
the task structure itself if the per-cpu structure hasn't been allocated.
Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit.
The run_posix_cpu_timers() function has been split into a fast path and a
slow path; the former safely checks whether there are any expired thread
timers and, if not, just returns, while the slow path does the heavy lifting.
With the dedicated thread group fields, timers are no longer "rebalanced" and
the process_timer_rebalance() function and related code has gone away. All
summing loops are gone and all code that used them now uses the
thread_group_cputime() inline. When process-wide timers are set, the new
task_cputime structure in signal_struct is used to cache the earliest
expiration; this is checked in the fast path.
Performance
The fix appears not to add significant overhead to existing operations. It
generally performs the same as the current code except in two cases, one in
which it performs slightly worse (Case 5 below) and one in which it performs
very significantly better (Case 2 below). Overall it's a wash except in those
two cases.
I've since done somewhat more involved testing on a dual-core Opteron system.
Case 1: With no itimer running, for a test with 100,000 threads, the fixed
kernel took 1428.5 seconds, 513 seconds more than the unfixed system,
all of which was spent in the system. There were twice as many
voluntary context switches with the fix as without it.
Case 2: With an itimer running at .01 second ticks and 4000 threads (the most
an unmodified kernel can handle), the fixed kernel ran the test in
eight percent of the time (5.8 seconds as opposed to 70 seconds) and
had better tick accuracy (.012 seconds per tick as opposed to .023
seconds per tick).
Case 3: A 4000-thread test with an initial timer tick of .01 second and an
interval of 10,000 seconds (i.e. a timer that ticks only once) had
very nearly the same performance in both cases: 6.3 seconds elapsed
for the fixed kernel versus 5.5 seconds for the unfixed kernel.
With fewer threads (eight in these tests), the Case 1 test ran in essentially
the same time on both the modified and unmodified kernels (5.2 seconds versus
5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds
versus 5.4 seconds but again with much better tick accuracy, .013 seconds per
tick versus .025 seconds per tick for the unmodified kernel.
Since the fix affected the rlimit code, I also tested soft and hard CPU limits.
Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer
running), the modified kernel was very slightly favored in that while
it killed the process in 19.997 seconds of CPU time (5.002 seconds of
wall time), only .003 seconds of that was system time, the rest was
user time. The unmodified kernel killed the process in 20.001 seconds
of CPU (5.014 seconds of wall time) of which .016 seconds was system
time. Really, though, the results were too close to call. The results
were essentially the same with no itimer running.
Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds
(where the hard limit would never be reached) and an itimer running,
the modified kernel exhibited worse tick accuracy than the unmodified
kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise,
performance was almost indistinguishable. With no itimer running this
test exhibited virtually identical behavior and times in both cases.
In times past I did some limited performance testing. Those results are below.
On a four-cpu Opteron system without this fix, a sixteen-thread test executed
in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On
the same system with the fix, user and elapsed time were about the same, but
system time dropped to 0.007 seconds. Performance with eight, four and one
thread were comparable. Interestingly, the timer ticks with the fix seemed
more accurate: The sixteen-thread test with the fix received 149543 ticks
for 0.024 seconds per tick, while the same test without the fix received 58720
for 0.061 seconds per tick. Both cases were configured for an interval of
0.01 seconds. Again, the other tests were comparable. Each thread in this
test computed the primes up to 25,000,000.
I also did a test with a large number of threads, 100,000 threads, which is
impossible without the fix. In this case each thread computed the primes only
up to 10,000 (to make the runtime manageable). System time dominated, at
1546.968 seconds out of a total 2176.906 seconds (giving a user time of
629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite
accurate. There is obviously no comparable test without the fix.
Signed-off-by: Frank Mayhar <fmayhar@google.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-12 09:54:39 -07:00
|
|
|
account_group_user_time(p, cputime);
|
2011-12-15 14:56:09 +01:00
|
|
|
p->gtime += cputime;
|
2007-10-15 17:00:19 +02:00
|
|
|
|
2008-12-31 15:11:37 +01:00
|
|
|
/* Add guest time to cpustat. */
|
2009-10-24 01:20:10 +09:00
|
|
|
if (TASK_NICE(p) > 0) {
|
2011-12-19 19:23:15 +01:00
|
|
|
cpustat[CPUTIME_NICE] += (__force u64) cputime;
|
|
|
|
cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
|
2009-10-24 01:20:10 +09:00
|
|
|
} else {
|
2011-12-19 19:23:15 +01:00
|
|
|
cpustat[CPUTIME_USER] += (__force u64) cputime;
|
|
|
|
cpustat[CPUTIME_GUEST] += (__force u64) cputime;
|
2009-10-24 01:20:10 +09:00
|
|
|
}
|
2007-10-15 17:00:19 +02:00
|
|
|
}
|
|
|
|
|
2010-12-21 17:09:02 -08:00
|
|
|
/*
|
|
|
|
* Account system cpu time to a process and desired cpustat field
|
|
|
|
* @p: the process that the cpu time gets accounted to
|
|
|
|
* @cputime: the cpu time spent in kernel space since the last update
|
|
|
|
* @cputime_scaled: cputime scaled by cpu frequency
|
|
|
|
* @target_cputime64: pointer to cpustat field that has to be updated
|
|
|
|
*/
|
|
|
|
static inline
|
|
|
|
void __account_system_time(struct task_struct *p, cputime_t cputime,
|
2011-11-28 14:45:17 -02:00
|
|
|
cputime_t cputime_scaled, int index)
|
2010-12-21 17:09:02 -08:00
|
|
|
{
|
|
|
|
/* Add system time to process. */
|
2011-12-15 14:56:09 +01:00
|
|
|
p->stime += cputime;
|
|
|
|
p->stimescaled += cputime_scaled;
|
2010-12-21 17:09:02 -08:00
|
|
|
account_group_system_time(p, cputime);
|
|
|
|
|
|
|
|
/* Add system time to cpustat. */
|
2011-12-19 19:23:15 +01:00
|
|
|
task_group_account_field(p, index, (__force u64) cputime);
|
2010-12-21 17:09:02 -08:00
|
|
|
|
|
|
|
/* Account for system time used */
|
|
|
|
acct_update_integrals(p);
|
|
|
|
}
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
|
|
|
* Account system cpu time to a process.
|
|
|
|
* @p: the process that the cpu time gets accounted to
|
|
|
|
* @hardirq_offset: the offset to subtract from hardirq_count()
|
|
|
|
* @cputime: the cpu time spent in kernel space since the last update
|
2008-12-31 15:11:37 +01:00
|
|
|
* @cputime_scaled: cputime scaled by cpu frequency
|
2005-04-16 15:20:36 -07:00
|
|
|
*/
|
|
|
|
void account_system_time(struct task_struct *p, int hardirq_offset,
|
2008-12-31 15:11:37 +01:00
|
|
|
cputime_t cputime, cputime_t cputime_scaled)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2011-11-28 14:45:17 -02:00
|
|
|
int index;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2008-04-24 18:17:55 -07:00
|
|
|
if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
|
2008-12-31 15:11:37 +01:00
|
|
|
account_guest_time(p, cputime, cputime_scaled);
|
2008-04-24 18:17:55 -07:00
|
|
|
return;
|
|
|
|
}
|
2007-10-15 17:00:19 +02:00
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
if (hardirq_count() - hardirq_offset)
|
2011-11-28 14:45:17 -02:00
|
|
|
index = CPUTIME_IRQ;
|
2010-10-04 17:03:16 -07:00
|
|
|
else if (in_serving_softirq())
|
2011-11-28 14:45:17 -02:00
|
|
|
index = CPUTIME_SOFTIRQ;
|
2005-04-16 15:20:36 -07:00
|
|
|
else
|
2011-11-28 14:45:17 -02:00
|
|
|
index = CPUTIME_SYSTEM;
|
2009-03-31 10:02:22 +05:30
|
|
|
|
2011-11-28 14:45:17 -02:00
|
|
|
__account_system_time(p, cputime, cputime_scaled, index);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2007-10-18 03:06:34 -07:00
|
|
|
/*
|
2005-04-16 15:20:36 -07:00
|
|
|
* Account for involuntary wait time.
|
2011-02-25 15:13:16 -08:00
|
|
|
* @cputime: the cpu time spent in involuntary wait
|
2007-10-18 03:06:34 -07:00
|
|
|
*/
|
2008-12-31 15:11:38 +01:00
|
|
|
void account_steal_time(cputime_t cputime)
|
2007-10-18 03:06:34 -07:00
|
|
|
{
|
2011-11-28 14:45:17 -02:00
|
|
|
u64 *cpustat = kcpustat_this_cpu->cpustat;
|
2008-12-31 15:11:38 +01:00
|
|
|
|
2011-12-19 19:23:15 +01:00
|
|
|
cpustat[CPUTIME_STEAL] += (__force u64) cputime;
|
2007-10-18 03:06:34 -07:00
|
|
|
}
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
2008-12-31 15:11:38 +01:00
|
|
|
* Account for idle time.
|
|
|
|
* @cputime: the cpu time spent in idle wait
|
2005-04-16 15:20:36 -07:00
|
|
|
*/
|
2008-12-31 15:11:38 +01:00
|
|
|
void account_idle_time(cputime_t cputime)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2011-11-28 14:45:17 -02:00
|
|
|
u64 *cpustat = kcpustat_this_cpu->cpustat;
|
2006-07-03 00:25:42 -07:00
|
|
|
struct rq *rq = this_rq();
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2008-12-31 15:11:38 +01:00
|
|
|
if (atomic_read(&rq->nr_iowait) > 0)
|
2011-12-19 19:23:15 +01:00
|
|
|
cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
|
2008-12-31 15:11:38 +01:00
|
|
|
else
|
2011-12-19 19:23:15 +01:00
|
|
|
cpustat[CPUTIME_IDLE] += (__force u64) cputime;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2011-07-11 15:28:17 -04:00
|
|
|
/*
 * Account any whole ticks of steal time that have accumulated on this cpu.
 *
 * With CONFIG_PARAVIRT and the paravirt_steal_enabled key switched on, the
 * steal clock of the current cpu is compared against rq->prev_steal_time,
 * the difference is converted to ticks with steal_ticks(), and those ticks
 * are both added to prev_steal_time (as st * TICK_NSEC) and handed to
 * account_steal_time().
 *
 * Returns true when at least one tick was accounted as steal time, false
 * otherwise (always false without CONFIG_PARAVIRT).
 */
static __always_inline bool steal_account_process_tick(void)
{
#ifdef CONFIG_PARAVIRT
	if (static_key_false(&paravirt_steal_enabled)) {
		u64 steal, st;

		/* Steal time not yet turned into accounted ticks. */
		steal = paravirt_steal_clock(smp_processor_id());
		steal -= this_rq()->prev_steal_time;

		/* Only whole ticks are consumed; the remainder stays pending. */
		st = steal_ticks(steal);
		this_rq()->prev_steal_time += st * TICK_NSEC;

		/*
		 * NOTE(review): st is a tick count passed where
		 * account_steal_time() takes a cputime_t - presumably fine
		 * where cputime_t is jiffies based; confirm.
		 */
		account_steal_time(st);
		return st != 0;
	}
#endif
	return false;
}
|
|
|
|
|
2008-12-31 15:11:38 +01:00
|
|
|
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
|
|
|
|
|
2010-12-21 17:09:03 -08:00
|
|
|
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
|
|
|
|
/*
|
|
|
|
* Account a tick to a process and cpustat
|
|
|
|
* @p: the process that the cpu time gets accounted to
|
|
|
|
* @user_tick: is the tick from userspace
|
|
|
|
* @rq: the pointer to rq
|
|
|
|
*
|
|
|
|
* Tick demultiplexing follows the order
|
|
|
|
* - pending hardirq update
|
|
|
|
* - pending softirq update
|
|
|
|
* - user_time
|
|
|
|
* - idle_time
|
|
|
|
* - system time
|
|
|
|
* - check for guest_time
|
|
|
|
* - else account as system_time
|
|
|
|
*
|
|
|
|
* Check for hardirq is done both for system and user time as there is
|
|
|
|
* no timer going off while we are on hardirq and hence we may never get an
|
|
|
|
* opportunity to update it solely in system time.
|
|
|
|
* p->stime and friends are only updated on system time and not on irq
|
|
|
|
* softirq as those do not count in task exec_runtime any more.
|
|
|
|
*/
|
|
|
|
static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
|
|
|
|
struct rq *rq)
|
|
|
|
{
|
|
|
|
cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
|
2011-11-28 14:45:17 -02:00
|
|
|
u64 *cpustat = kcpustat_this_cpu->cpustat;
|
2010-12-21 17:09:03 -08:00
|
|
|
|
2011-07-11 15:28:17 -04:00
|
|
|
if (steal_account_process_tick())
|
|
|
|
return;
|
|
|
|
|
2010-12-21 17:09:03 -08:00
|
|
|
if (irqtime_account_hi_update()) {
|
2011-12-19 19:23:15 +01:00
|
|
|
cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
|
2010-12-21 17:09:03 -08:00
|
|
|
} else if (irqtime_account_si_update()) {
|
2011-12-19 19:23:15 +01:00
|
|
|
cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
|
2010-12-21 17:09:04 -08:00
|
|
|
} else if (this_cpu_ksoftirqd() == p) {
|
|
|
|
/*
|
|
|
|
* ksoftirqd time do not get accounted in cpu_softirq_time.
|
|
|
|
* So, we have to handle it separately here.
|
|
|
|
* Also, p->stime needs to be updated for ksoftirqd.
|
|
|
|
*/
|
|
|
|
__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
|
2011-11-28 14:45:17 -02:00
|
|
|
CPUTIME_SOFTIRQ);
|
2010-12-21 17:09:03 -08:00
|
|
|
} else if (user_tick) {
|
|
|
|
account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
|
|
|
|
} else if (p == rq->idle) {
|
|
|
|
account_idle_time(cputime_one_jiffy);
|
|
|
|
} else if (p->flags & PF_VCPU) { /* System time or guest time */
|
|
|
|
account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
|
|
|
|
} else {
|
|
|
|
__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
|
2011-11-28 14:45:17 -02:00
|
|
|
CPUTIME_SYSTEM);
|
2010-12-21 17:09:03 -08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void irqtime_account_idle_ticks(int ticks)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
struct rq *rq = this_rq();
|
|
|
|
|
|
|
|
for (i = 0; i < ticks; i++)
|
|
|
|
irqtime_account_process_tick(current, 0, rq);
|
|
|
|
}
|
2011-02-25 15:13:16 -08:00
|
|
|
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
|
2010-12-21 17:09:03 -08:00
|
|
|
/* Empty variants used when CONFIG_IRQ_TIME_ACCOUNTING is not set. */
static void irqtime_account_idle_ticks(int ticks)
{
}

static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
					 struct rq *rq)
{
}
|
2011-02-25 15:13:16 -08:00
|
|
|
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
|
2008-12-31 15:11:38 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Account a single tick of cpu time.
|
|
|
|
* @p: the process that the cpu time gets accounted to
|
|
|
|
* @user_tick: indicates if the tick is a user or a system tick
|
|
|
|
*/
|
|
|
|
void account_process_tick(struct task_struct *p, int user_tick)
|
|
|
|
{
|
2009-07-29 12:15:29 +02:00
|
|
|
cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
|
2008-12-31 15:11:38 +01:00
|
|
|
struct rq *rq = this_rq();
|
|
|
|
|
2010-12-21 17:09:03 -08:00
|
|
|
if (sched_clock_irqtime) {
|
|
|
|
irqtime_account_process_tick(p, user_tick, rq);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2011-07-11 15:28:17 -04:00
|
|
|
if (steal_account_process_tick())
|
|
|
|
return;
|
|
|
|
|
2008-12-31 15:11:38 +01:00
|
|
|
if (user_tick)
|
2009-07-29 12:15:29 +02:00
|
|
|
account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
|
2009-04-29 14:44:49 +02:00
|
|
|
else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
|
2009-07-29 12:15:29 +02:00
|
|
|
account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
|
2008-12-31 15:11:38 +01:00
|
|
|
one_jiffy_scaled);
|
|
|
|
else
|
2009-07-29 12:15:29 +02:00
|
|
|
account_idle_time(cputime_one_jiffy);
|
2008-12-31 15:11:38 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Account multiple ticks of steal time.
|
|
|
|
* @p: the process from which the cpu time has been stolen
|
|
|
|
* @ticks: number of stolen ticks
|
|
|
|
*/
|
|
|
|
void account_steal_ticks(unsigned long ticks)
|
|
|
|
{
|
|
|
|
account_steal_time(jiffies_to_cputime(ticks));
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Account multiple ticks of idle time.
|
|
|
|
* @ticks: number of stolen ticks
|
|
|
|
*/
|
|
|
|
void account_idle_ticks(unsigned long ticks)
|
|
|
|
{
|
2010-12-21 17:09:03 -08:00
|
|
|
|
|
|
|
if (sched_clock_irqtime) {
|
|
|
|
irqtime_account_idle_ticks(ticks);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2008-12-31 15:11:38 +01:00
|
|
|
account_idle_time(jiffies_to_cputime(ticks));
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2008-12-31 15:11:38 +01:00
|
|
|
#endif
|
|
|
|
|
2008-09-05 18:12:23 +02:00
|
|
|
/*
|
|
|
|
* Use precise platform statistics if available:
|
|
|
|
*/
|
|
|
|
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
|
2009-11-26 14:48:30 +09:00
|
|
|
/*
 * CONFIG_VIRT_CPU_ACCOUNTING variant: report the task's utime and stime
 * fields as they are, with no scaling or adjustment.
 * @p: the task to read the times from
 * @ut: where to store the user time
 * @st: where to store the system time
 */
void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
	*ut = p->utime;
	*st = p->stime;
}
|
|
|
|
|
sched, cputime: Introduce thread_group_times()
This is a real fix for problem of utime/stime values decreasing
described in the thread:
http://lkml.org/lkml/2009/11/3/522
Now cputime is accounted in the following way:
- {u,s}time in task_struct are increased every time when the thread
is interrupted by a tick (timer interrupt).
- When a thread exits, its {u,s}time are added to signal->{u,s}time,
after adjusted by task_times().
- When all threads in a thread_group exits, accumulated {u,s}time
(and also c{u,s}time) in signal struct are added to c{u,s}time
in signal struct of the group's parent.
So {u,s}time in task struct are "raw" tick count, while
{u,s}time and c{u,s}time in signal struct are "adjusted" values.
And accounted values are used by:
- task_times(), to get cputime of a thread:
This function returns adjusted values that originates from raw
{u,s}time and scaled by sum_exec_runtime that accounted by CFS.
- thread_group_cputime(), to get cputime of a thread group:
This function returns sum of all {u,s}time of living threads in
the group, plus {u,s}time in the signal struct that is sum of
adjusted cputimes of all exited threads belonged to the group.
The problem is the return value of thread_group_cputime(),
because it is mixed sum of "raw" value and "adjusted" value:
group's {u,s}time = foreach(thread){{u,s}time} + exited({u,s}time)
This misbehavior can break {u,s}time monotonicity.
Assume that if there is a thread that have raw values greater
than adjusted values (e.g. interrupted by 1000Hz ticks 50 times
but only runs 45ms) and if it exits, cputime will decrease (e.g.
-5ms).
To fix this, we could do:
group's {u,s}time = foreach(t){task_times(t)} + exited({u,s}time)
But task_times() contains hard divisions, so applying it for
every thread should be avoided.
This patch fixes the above problem in the following way:
- Modify thread's exit (= __exit_signal()) not to use task_times().
It means {u,s}time in signal struct accumulates raw values instead
of adjusted values. As the result it makes thread_group_cputime()
to return pure sum of "raw" values.
- Introduce a new function thread_group_times(*task, *utime, *stime)
that converts "raw" values of thread_group_cputime() to "adjusted"
values, in same calculation procedure as task_times().
- Modify group's exit (= wait_task_zombie()) to use this introduced
thread_group_times(). It make c{u,s}time in signal struct to
have adjusted values like before this patch.
- Replace some thread_group_cputime() by thread_group_times().
This replacements are only applied where conveys the "adjusted"
cputime to users, and where already uses task_times() near by it.
(i.e. sys_times(), getrusage(), and /proc/<PID>/stat.)
This patch have a positive side effect:
- Before this patch, if a group contains many short-life threads
(e.g. runs 0.9ms and not interrupted by ticks), the group's
cputime could be invisible since thread's cputime was accumulated
after adjusted: imagine adjustment function as adj(ticks, runtime),
{adj(0, 0.9) + adj(0, 0.9) + ....} = {0 + 0 + ....} = 0.
After this patch it will not happen because the adjustment is
applied after accumulated.
v2:
- remove if()s, put new variables into signal_struct.
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Spencer Candland <spencer@bluehost.com>
Cc: Americo Wang <xiyou.wangcong@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
LKML-Reference: <4B162517.8040909@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-12-02 17:28:07 +09:00
|
|
|
/*
 * CONFIG_VIRT_CPU_ACCOUNTING variant: report the utime and stime that
 * thread_group_cputime() computes for @p, with no further adjustment.
 * @p: the task whose thread group is queried
 * @ut: where to store the user time
 * @st: where to store the system time
 */
void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
	struct task_cputime totals;

	thread_group_cputime(p, &totals);

	*ut = totals.utime;
	*st = totals.stime;
}
|
|
|
|
#else
|
2009-11-12 13:33:45 +09:00
|
|
|
|
|
|
|
/*
 * Fallback used only when nsecs_to_cputime() has not been defined
 * already: convert nanoseconds with nsecs_to_jiffies().
 */
#ifndef nsecs_to_cputime
# define nsecs_to_cputime(__nsecs)	nsecs_to_jiffies(__nsecs)
#endif
|
|
|
|
|
2012-08-08 11:27:15 +02:00
|
|
|
/*
 * Compute utime * rtime / total using 64-bit intermediate arithmetic.
 * @utime: tick-based user time
 * @rtime: runtime to distribute
 * @total: divisor; task_times() only calls this with a non-zero total
 *
 * A 32-bit cputime_t uses the cheaper 64-by-32 division helper.
 * NOTE(review): the product is not guarded against u64 overflow - confirm
 * whether large utime/rtime values can reach this.
 */
static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
{
	u64 product = (__force u64) rtime * (__force u64) utime;
	u64 quotient;

	if (sizeof(cputime_t) == 4)
		quotient = div_u64(product, (__force u32) total);
	else
		quotient = div64_u64(product, (__force u64) total);

	return (__force cputime_t) quotient;
}
|
|
|
|
|
2009-11-26 14:48:30 +09:00
|
|
|
/*
 * Report adjusted user and system time for a task.
 * @p: the task to read the times from
 * @ut: where to store the adjusted user time
 * @st: where to store the adjusted system time
 *
 * The runtime taken from p->se.sum_exec_runtime is split in the
 * utime : (utime + stime) ratio of the task's tick-based counters; with
 * no ticks recorded at all, the whole runtime is treated as user time.
 * The results are cached in p->prev_utime and p->prev_stime and never
 * move backwards.
 */
void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
	cputime_t tick_utime = p->utime;
	cputime_t tick_total = tick_utime + p->stime;
	cputime_t runtime, adj_utime;

	/*
	 * Use CFS's precise accounting:
	 */
	runtime = nsecs_to_cputime(p->se.sum_exec_runtime);

	adj_utime = tick_total ?
		scale_utime(tick_utime, runtime, tick_total) : runtime;

	/*
	 * Compare with previous values, to keep monotonicity:
	 */
	p->prev_utime = max(p->prev_utime, adj_utime);
	p->prev_stime = max(p->prev_stime, runtime - p->prev_utime);

	*ut = p->prev_utime;
	*st = p->prev_stime;
}
|
|
|
|
|
sched, cputime: Introduce thread_group_times()
This is a real fix for problem of utime/stime values decreasing
described in the thread:
http://lkml.org/lkml/2009/11/3/522
Now cputime is accounted in the following way:
- {u,s}time in task_struct are increased every time when the thread
is interrupted by a tick (timer interrupt).
- When a thread exits, its {u,s}time are added to signal->{u,s}time,
after adjusted by task_times().
- When all threads in a thread_group exits, accumulated {u,s}time
(and also c{u,s}time) in signal struct are added to c{u,s}time
in signal struct of the group's parent.
So {u,s}time in task struct are "raw" tick count, while
{u,s}time and c{u,s}time in signal struct are "adjusted" values.
And accounted values are used by:
- task_times(), to get cputime of a thread:
This function returns adjusted values that originates from raw
{u,s}time and scaled by sum_exec_runtime that accounted by CFS.
- thread_group_cputime(), to get cputime of a thread group:
This function returns sum of all {u,s}time of living threads in
the group, plus {u,s}time in the signal struct that is sum of
adjusted cputimes of all exited threads belonged to the group.
The problem is the return value of thread_group_cputime(),
because it is mixed sum of "raw" value and "adjusted" value:
group's {u,s}time = foreach(thread){{u,s}time} + exited({u,s}time)
This misbehavior can break {u,s}time monotonicity.
Assume that if there is a thread that have raw values greater
than adjusted values (e.g. interrupted by 1000Hz ticks 50 times
but only runs 45ms) and if it exits, cputime will decrease (e.g.
-5ms).
To fix this, we could do:
group's {u,s}time = foreach(t){task_times(t)} + exited({u,s}time)
But task_times() contains hard divisions, so applying it for
every thread should be avoided.
This patch fixes the above problem in the following way:
- Modify thread's exit (= __exit_signal()) not to use task_times().
It means {u,s}time in signal struct accumulates raw values instead
of adjusted values. As the result it makes thread_group_cputime()
to return pure sum of "raw" values.
- Introduce a new function thread_group_times(*task, *utime, *stime)
that converts "raw" values of thread_group_cputime() to "adjusted"
values, in same calculation procedure as task_times().
- Modify group's exit (= wait_task_zombie()) to use this introduced
thread_group_times(). It make c{u,s}time in signal struct to
have adjusted values like before this patch.
- Replace some thread_group_cputime() by thread_group_times().
This replacements are only applied where conveys the "adjusted"
cputime to users, and where already uses task_times() near by it.
(i.e. sys_times(), getrusage(), and /proc/<PID>/stat.)
This patch have a positive side effect:
- Before this patch, if a group contains many short-life threads
(e.g. runs 0.9ms and not interrupted by ticks), the group's
cputime could be invisible since thread's cputime was accumulated
after adjusted: imagine adjustment function as adj(ticks, runtime),
{adj(0, 0.9) + adj(0, 0.9) + ....} = {0 + 0 + ....} = 0.
After this patch it will not happen because the adjustment is
applied after accumulated.
v2:
- remove if()s, put new variables into signal_struct.
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Spencer Candland <spencer@bluehost.com>
Cc: Americo Wang <xiyou.wangcong@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
LKML-Reference: <4B162517.8040909@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-12-02 17:28:07 +09:00
|
|
|
/*
|
|
|
|
* Must be called with siglock held.
|
|
|
|
*/
|
|
|
|
void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
|
2008-09-05 18:12:23 +02:00
|
|
|
{
|
sched, cputime: Introduce thread_group_times()
This is a real fix for problem of utime/stime values decreasing
described in the thread:
http://lkml.org/lkml/2009/11/3/522
Now cputime is accounted in the following way:
- {u,s}time in task_struct are increased every time when the thread
is interrupted by a tick (timer interrupt).
- When a thread exits, its {u,s}time are added to signal->{u,s}time,
after adjusted by task_times().
- When all threads in a thread_group exits, accumulated {u,s}time
(and also c{u,s}time) in signal struct are added to c{u,s}time
in signal struct of the group's parent.
So {u,s}time in task struct are "raw" tick count, while
{u,s}time and c{u,s}time in signal struct are "adjusted" values.
And accounted values are used by:
- task_times(), to get cputime of a thread:
This function returns adjusted values that originates from raw
{u,s}time and scaled by sum_exec_runtime that accounted by CFS.
- thread_group_cputime(), to get cputime of a thread group:
This function returns sum of all {u,s}time of living threads in
the group, plus {u,s}time in the signal struct that is sum of
adjusted cputimes of all exited threads belonged to the group.
The problem is the return value of thread_group_cputime(),
because it is mixed sum of "raw" value and "adjusted" value:
group's {u,s}time = foreach(thread){{u,s}time} + exited({u,s}time)
This misbehavior can break {u,s}time monotonicity.
Assume that if there is a thread that have raw values greater
than adjusted values (e.g. interrupted by 1000Hz ticks 50 times
but only runs 45ms) and if it exits, cputime will decrease (e.g.
-5ms).
To fix this, we could do:
group's {u,s}time = foreach(t){task_times(t)} + exited({u,s}time)
But task_times() contains hard divisions, so applying it for
every thread should be avoided.
This patch fixes the above problem in the following way:
- Modify thread's exit (= __exit_signal()) not to use task_times().
It means {u,s}time in signal struct accumulates raw values instead
of adjusted values. As the result it makes thread_group_cputime()
to return pure sum of "raw" values.
- Introduce a new function thread_group_times(*task, *utime, *stime)
that converts "raw" values of thread_group_cputime() to "adjusted"
values, in same calculation procedure as task_times().
- Modify group's exit (= wait_task_zombie()) to use this introduced
thread_group_times(). It make c{u,s}time in signal struct to
have adjusted values like before this patch.
- Replace some thread_group_cputime() by thread_group_times().
This replacements are only applied where conveys the "adjusted"
cputime to users, and where already uses task_times() near by it.
(i.e. sys_times(), getrusage(), and /proc/<PID>/stat.)
This patch have a positive side effect:
- Before this patch, if a group contains many short-life threads
(e.g. runs 0.9ms and not interrupted by ticks), the group's
cputime could be invisible since thread's cputime was accumulated
after adjusted: imagine adjustment function as adj(ticks, runtime),
{adj(0, 0.9) + adj(0, 0.9) + ....} = {0 + 0 + ....} = 0.
After this patch it will not happen because the adjustment is
applied after accumulated.
v2:
- remove if()s, put new variables into signal_struct.
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Spencer Candland <spencer@bluehost.com>
Cc: Americo Wang <xiyou.wangcong@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
LKML-Reference: <4B162517.8040909@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-12-02 17:28:07 +09:00
|
|
|
struct signal_struct *sig = p->signal;
|
|
|
|
struct task_cputime cputime;
|
|
|
|
cputime_t rtime, utime, total;
|
2008-09-05 18:12:23 +02:00
|
|
|
|
sched, cputime: Introduce thread_group_times()
This is a real fix for problem of utime/stime values decreasing
described in the thread:
http://lkml.org/lkml/2009/11/3/522
Now cputime is accounted in the following way:
- {u,s}time in task_struct are increased every time when the thread
is interrupted by a tick (timer interrupt).
- When a thread exits, its {u,s}time are added to signal->{u,s}time,
after adjusted by task_times().
- When all threads in a thread_group exits, accumulated {u,s}time
(and also c{u,s}time) in signal struct are added to c{u,s}time
in signal struct of the group's parent.
So {u,s}time in task struct are "raw" tick count, while
{u,s}time and c{u,s}time in signal struct are "adjusted" values.
And accounted values are used by:
- task_times(), to get cputime of a thread:
This function returns adjusted values that originates from raw
{u,s}time and scaled by sum_exec_runtime that accounted by CFS.
- thread_group_cputime(), to get cputime of a thread group:
This function returns sum of all {u,s}time of living threads in
the group, plus {u,s}time in the signal struct that is sum of
adjusted cputimes of all exited threads belonged to the group.
The problem is the return value of thread_group_cputime(),
because it is mixed sum of "raw" value and "adjusted" value:
group's {u,s}time = foreach(thread){{u,s}time} + exited({u,s}time)
This misbehavior can break {u,s}time monotonicity.
Assume that if there is a thread that have raw values greater
than adjusted values (e.g. interrupted by 1000Hz ticks 50 times
but only runs 45ms) and if it exits, cputime will decrease (e.g.
-5ms).
To fix this, we could do:
group's {u,s}time = foreach(t){task_times(t)} + exited({u,s}time)
But task_times() contains hard divisions, so applying it for
every thread should be avoided.
This patch fixes the above problem in the following way:
- Modify thread's exit (= __exit_signal()) not to use task_times().
It means {u,s}time in signal struct accumulates raw values instead
of adjusted values. As the result it makes thread_group_cputime()
to return pure sum of "raw" values.
- Introduce a new function thread_group_times(*task, *utime, *stime)
that converts "raw" values of thread_group_cputime() to "adjusted"
values, in same calculation procedure as task_times().
- Modify group's exit (= wait_task_zombie()) to use this introduced
thread_group_times(). It make c{u,s}time in signal struct to
have adjusted values like before this patch.
- Replace some thread_group_cputime() by thread_group_times().
This replacements are only applied where conveys the "adjusted"
cputime to users, and where already uses task_times() near by it.
(i.e. sys_times(), getrusage(), and /proc/<PID>/stat.)
This patch have a positive side effect:
- Before this patch, if a group contains many short-life threads
(e.g. runs 0.9ms and not interrupted by ticks), the group's
cputime could be invisible since thread's cputime was accumulated
after adjusted: imagine adjustment function as adj(ticks, runtime),
{adj(0, 0.9) + adj(0, 0.9) + ....} = {0 + 0 + ....} = 0.
After this patch it will not happen because the adjustment is
applied after accumulated.
v2:
- remove if()s, put new variables into signal_struct.
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Spencer Candland <spencer@bluehost.com>
Cc: Americo Wang <xiyou.wangcong@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
LKML-Reference: <4B162517.8040909@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-12-02 17:28:07 +09:00
|
|
|
thread_group_cputime(p, &cputime);
|
2008-09-05 18:12:23 +02:00
|
|
|
|
2011-12-15 14:56:09 +01:00
|
|
|
total = cputime.utime + cputime.stime;
|
sched, cputime: Introduce thread_group_times()
This is a real fix for problem of utime/stime values decreasing
described in the thread:
http://lkml.org/lkml/2009/11/3/522
Now cputime is accounted in the following way:
- {u,s}time in task_struct are increased every time when the thread
is interrupted by a tick (timer interrupt).
- When a thread exits, its {u,s}time are added to signal->{u,s}time,
after adjusted by task_times().
- When all threads in a thread_group exits, accumulated {u,s}time
(and also c{u,s}time) in signal struct are added to c{u,s}time
in signal struct of the group's parent.
So {u,s}time in task struct are "raw" tick count, while
{u,s}time and c{u,s}time in signal struct are "adjusted" values.
And accounted values are used by:
- task_times(), to get cputime of a thread:
This function returns adjusted values that originates from raw
{u,s}time and scaled by sum_exec_runtime that accounted by CFS.
- thread_group_cputime(), to get cputime of a thread group:
This function returns sum of all {u,s}time of living threads in
the group, plus {u,s}time in the signal struct that is sum of
adjusted cputimes of all exited threads belonged to the group.
The problem is the return value of thread_group_cputime(),
because it is mixed sum of "raw" value and "adjusted" value:
group's {u,s}time = foreach(thread){{u,s}time} + exited({u,s}time)
This misbehavior can break {u,s}time monotonicity.
Assume that if there is a thread that have raw values greater
than adjusted values (e.g. interrupted by 1000Hz ticks 50 times
but only runs 45ms) and if it exits, cputime will decrease (e.g.
-5ms).
To fix this, we could do:
group's {u,s}time = foreach(t){task_times(t)} + exited({u,s}time)
But task_times() contains hard divisions, so applying it for
every thread should be avoided.
This patch fixes the above problem in the following way:
- Modify thread's exit (= __exit_signal()) not to use task_times().
It means {u,s}time in signal struct accumulates raw values instead
of adjusted values. As the result it makes thread_group_cputime()
to return pure sum of "raw" values.
- Introduce a new function thread_group_times(*task, *utime, *stime)
that converts "raw" values of thread_group_cputime() to "adjusted"
values, in same calculation procedure as task_times().
- Modify group's exit (= wait_task_zombie()) to use this introduced
thread_group_times(). It make c{u,s}time in signal struct to
have adjusted values like before this patch.
- Replace some thread_group_cputime() by thread_group_times().
This replacements are only applied where conveys the "adjusted"
cputime to users, and where already uses task_times() near by it.
(i.e. sys_times(), getrusage(), and /proc/<PID>/stat.)
This patch have a positive side effect:
- Before this patch, if a group contains many short-life threads
(e.g. runs 0.9ms and not interrupted by ticks), the group's
cputime could be invisible since thread's cputime was accumulated
after adjusted: imagine adjustment function as adj(ticks, runtime),
{adj(0, 0.9) + adj(0, 0.9) + ....} = {0 + 0 + ....} = 0.
After this patch it will not happen because the adjustment is
applied after accumulated.
v2:
- remove if()s, put new variables into signal_struct.
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Spencer Candland <spencer@bluehost.com>
Cc: Americo Wang <xiyou.wangcong@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
LKML-Reference: <4B162517.8040909@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-12-02 17:28:07 +09:00
|
|
|
rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
|
2008-09-05 18:12:23 +02:00
|
|
|
|
2012-08-08 11:27:15 +02:00
|
|
|
if (total)
|
|
|
|
utime = scale_utime(cputime.utime, rtime, total);
|
|
|
|
else
|
sched, cputime: Introduce thread_group_times()
This is a real fix for problem of utime/stime values decreasing
described in the thread:
http://lkml.org/lkml/2009/11/3/522
Now cputime is accounted in the following way:
- {u,s}time in task_struct are increased every time when the thread
is interrupted by a tick (timer interrupt).
- When a thread exits, its {u,s}time are added to signal->{u,s}time,
after adjusted by task_times().
- When all threads in a thread_group exits, accumulated {u,s}time
(and also c{u,s}time) in signal struct are added to c{u,s}time
in signal struct of the group's parent.
So {u,s}time in task struct are "raw" tick count, while
{u,s}time and c{u,s}time in signal struct are "adjusted" values.
And accounted values are used by:
- task_times(), to get cputime of a thread:
This function returns adjusted values that originates from raw
{u,s}time and scaled by sum_exec_runtime that accounted by CFS.
- thread_group_cputime(), to get cputime of a thread group:
This function returns sum of all {u,s}time of living threads in
the group, plus {u,s}time in the signal struct that is sum of
adjusted cputimes of all exited threads belonged to the group.
The problem is the return value of thread_group_cputime(),
because it is mixed sum of "raw" value and "adjusted" value:
group's {u,s}time = foreach(thread){{u,s}time} + exited({u,s}time)
This misbehavior can break {u,s}time monotonicity.
Assume that if there is a thread that have raw values greater
than adjusted values (e.g. interrupted by 1000Hz ticks 50 times
but only runs 45ms) and if it exits, cputime will decrease (e.g.
-5ms).
To fix this, we could do:
group's {u,s}time = foreach(t){task_times(t)} + exited({u,s}time)
But task_times() contains hard divisions, so applying it for
every thread should be avoided.
This patch fixes the above problem in the following way:
- Modify thread's exit (= __exit_signal()) not to use task_times().
It means {u,s}time in signal struct accumulates raw values instead
of adjusted values. As the result it makes thread_group_cputime()
to return pure sum of "raw" values.
- Introduce a new function thread_group_times(*task, *utime, *stime)
that converts "raw" values of thread_group_cputime() to "adjusted"
values, in same calculation procedure as task_times().
- Modify group's exit (= wait_task_zombie()) to use this introduced
thread_group_times(). It make c{u,s}time in signal struct to
have adjusted values like before this patch.
- Replace some thread_group_cputime() by thread_group_times().
This replacements are only applied where conveys the "adjusted"
cputime to users, and where already uses task_times() near by it.
(i.e. sys_times(), getrusage(), and /proc/<PID>/stat.)
This patch have a positive side effect:
- Before this patch, if a group contains many short-life threads
(e.g. runs 0.9ms and not interrupted by ticks), the group's
cputime could be invisible since thread's cputime was accumulated
after adjusted: imagine adjustment function as adj(ticks, runtime),
{adj(0, 0.9) + adj(0, 0.9) + ....} = {0 + 0 + ....} = 0.
After this patch it will not happen because the adjustment is
applied after accumulated.
v2:
- remove if()s, put new variables into signal_struct.
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Spencer Candland <spencer@bluehost.com>
Cc: Americo Wang <xiyou.wangcong@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
LKML-Reference: <4B162517.8040909@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-12-02 17:28:07 +09:00
|
|
|
utime = rtime;
|
|
|
|
|
|
|
|
sig->prev_utime = max(sig->prev_utime, utime);
|
2011-12-15 14:56:09 +01:00
|
|
|
sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime);
|
sched, cputime: Introduce thread_group_times()
This is a real fix for problem of utime/stime values decreasing
described in the thread:
http://lkml.org/lkml/2009/11/3/522
Now cputime is accounted in the following way:
- {u,s}time in task_struct are increased every time when the thread
is interrupted by a tick (timer interrupt).
- When a thread exits, its {u,s}time are added to signal->{u,s}time,
after adjusted by task_times().
- When all threads in a thread_group exits, accumulated {u,s}time
(and also c{u,s}time) in signal struct are added to c{u,s}time
in signal struct of the group's parent.
So {u,s}time in task struct are "raw" tick count, while
{u,s}time and c{u,s}time in signal struct are "adjusted" values.
And accounted values are used by:
- task_times(), to get cputime of a thread:
This function returns adjusted values that originates from raw
{u,s}time and scaled by sum_exec_runtime that accounted by CFS.
- thread_group_cputime(), to get cputime of a thread group:
This function returns sum of all {u,s}time of living threads in
the group, plus {u,s}time in the signal struct that is sum of
adjusted cputimes of all exited threads belonged to the group.
The problem is the return value of thread_group_cputime(),
because it is mixed sum of "raw" value and "adjusted" value:
group's {u,s}time = foreach(thread){{u,s}time} + exited({u,s}time)
This misbehavior can break {u,s}time monotonicity.
Assume that if there is a thread that have raw values greater
than adjusted values (e.g. interrupted by 1000Hz ticks 50 times
but only runs 45ms) and if it exits, cputime will decrease (e.g.
-5ms).
To fix this, we could do:
group's {u,s}time = foreach(t){task_times(t)} + exited({u,s}time)
But task_times() contains hard divisions, so applying it for
every thread should be avoided.
This patch fixes the above problem in the following way:
- Modify thread's exit (= __exit_signal()) not to use task_times().
It means {u,s}time in signal struct accumulates raw values instead
of adjusted values. As the result it makes thread_group_cputime()
to return pure sum of "raw" values.
- Introduce a new function thread_group_times(*task, *utime, *stime)
that converts "raw" values of thread_group_cputime() to "adjusted"
values, in same calculation procedure as task_times().
- Modify group's exit (= wait_task_zombie()) to use this introduced
thread_group_times(). It make c{u,s}time in signal struct to
have adjusted values like before this patch.
- Replace some thread_group_cputime() by thread_group_times().
This replacements are only applied where conveys the "adjusted"
cputime to users, and where already uses task_times() near by it.
(i.e. sys_times(), getrusage(), and /proc/<PID>/stat.)
This patch have a positive side effect:
- Before this patch, if a group contains many short-life threads
(e.g. runs 0.9ms and not interrupted by ticks), the group's
cputime could be invisible since thread's cputime was accumulated
after adjusted: imagine adjustment function as adj(ticks, runtime),
{adj(0, 0.9) + adj(0, 0.9) + ....} = {0 + 0 + ....} = 0.
After this patch it will not happen because the adjustment is
applied after accumulated.
v2:
- remove if()s, put new variables into signal_struct.
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Spencer Candland <spencer@bluehost.com>
Cc: Americo Wang <xiyou.wangcong@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
LKML-Reference: <4B162517.8040909@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-12-02 17:28:07 +09:00
|
|
|
|
|
|
|
*ut = sig->prev_utime;
|
|
|
|
*st = sig->prev_stime;
|
2008-09-05 18:12:23 +02:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2006-12-10 02:20:22 -08:00
|
|
|
/*
|
|
|
|
* This function gets called by the timer code, with HZ frequency.
|
|
|
|
* We call it with interrupts disabled.
|
|
|
|
*/
|
|
|
|
void scheduler_tick(void)
|
|
|
|
{
|
|
|
|
int cpu = smp_processor_id();
|
|
|
|
struct rq *rq = cpu_rq(cpu);
|
2007-07-09 18:51:59 +02:00
|
|
|
struct task_struct *curr = rq->curr;
|
2008-05-03 18:29:28 +02:00
|
|
|
|
|
|
|
sched_clock_tick();
|
2007-07-09 18:51:59 +02:00
|
|
|
|
2009-11-17 14:28:38 +01:00
|
|
|
raw_spin_lock(&rq->lock);
|
2008-05-03 18:29:28 +02:00
|
|
|
update_rq_clock(rq);
|
2010-05-17 18:14:43 -07:00
|
|
|
update_cpu_load_active(rq);
|
2008-01-25 21:08:29 +01:00
|
|
|
curr->sched_class->task_tick(rq, curr, 0);
|
2009-11-17 14:28:38 +01:00
|
|
|
raw_spin_unlock(&rq->lock);
|
2006-12-10 02:20:22 -08:00
|
|
|
|
2010-09-17 11:28:50 +02:00
|
|
|
perf_event_task_tick();
|
2009-05-23 18:28:55 +02:00
|
|
|
|
2006-12-10 02:20:23 -08:00
|
|
|
#ifdef CONFIG_SMP
|
2011-10-03 15:09:01 -07:00
|
|
|
rq->idle_balance = idle_cpu(cpu);
|
2007-07-09 18:51:59 +02:00
|
|
|
trigger_load_balance(rq, cpu);
|
2006-12-10 02:20:23 -08:00
|
|
|
#endif
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2009-04-02 14:18:25 +08:00
|
|
|
notrace unsigned long get_parent_ip(unsigned long addr)
|
2008-05-12 21:20:42 +02:00
|
|
|
{
|
|
|
|
if (in_lock_functions(addr)) {
|
|
|
|
addr = CALLER_ADDR2;
|
|
|
|
if (in_lock_functions(addr))
|
|
|
|
addr = CALLER_ADDR3;
|
|
|
|
}
|
|
|
|
return addr;
|
|
|
|
}
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2009-01-22 19:01:40 -05:00
|
|
|
#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
|
|
|
|
defined(CONFIG_PREEMPT_TRACER))
|
|
|
|
|
2008-02-23 15:24:04 -08:00
|
|
|
/*
 * Add @val to the preempt count.
 *
 * With CONFIG_DEBUG_PREEMPT, warn and bail out if the count is already
 * negative, and warn when the PREEMPT_MASK portion gets within 10 of
 * PREEMPT_MASK.  When the count after the addition equals @val (i.e. it
 * was zero before), the preempt-off tracepoint is emitted.
 */
void __kprobes add_preempt_count(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
	/* Has the count already underflowed? */
	if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) {
		return;
	}
#endif
	preempt_count() += val;
#ifdef CONFIG_DEBUG_PREEMPT
	/* Is the spinlock portion about to overflow? */
	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
				PREEMPT_MASK - 10);
#endif
	if (preempt_count() == val) {
		trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
	}
}
EXPORT_SYMBOL(add_preempt_count);
|
|
|
|
|
2008-02-23 15:24:04 -08:00
|
|
|
/*
 * Subtract @val from the preempt count.
 *
 * With CONFIG_DEBUG_PREEMPT, warn and bail out if @val exceeds the
 * current count, or if @val is below PREEMPT_MASK while the
 * PREEMPT_MASK portion of the count is already zero.  When the count
 * before the subtraction equals @val (i.e. it is about to reach zero),
 * the preempt-on tracepoint is emitted first.
 */
void __kprobes sub_preempt_count(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
	/* Would the whole count underflow? */
	if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) {
		return;
	}
	/* Would the spinlock portion underflow? */
	if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
			!(preempt_count() & PREEMPT_MASK))) {
		return;
	}
#endif

	if (preempt_count() == val) {
		trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
	}
	preempt_count() -= val;
}
EXPORT_SYMBOL(sub_preempt_count);
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/*
|
2007-07-09 18:51:59 +02:00
|
|
|
* Print scheduling while atomic bug:
|
2005-04-16 15:20:36 -07:00
|
|
|
*/
|
2007-07-09 18:51:59 +02:00
|
|
|
static noinline void __schedule_bug(struct task_struct *prev)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2011-12-22 16:39:30 -05:00
|
|
|
if (oops_in_progress)
|
|
|
|
return;
|
|
|
|
|
2009-12-20 14:23:57 +01:00
|
|
|
printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
|
|
|
|
prev->comm, prev->pid, preempt_count());
|
2007-10-24 18:23:50 +02:00
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
debug_show_held_locks(prev);
|
2008-05-23 09:05:58 -07:00
|
|
|
print_modules();
|
2007-07-09 18:51:59 +02:00
|
|
|
if (irqs_disabled())
|
|
|
|
print_irqtrace_events(prev);
|
2012-03-28 17:10:47 -07:00
|
|
|
dump_stack();
|
2012-05-10 16:20:04 +04:00
|
|
|
add_taint(TAINT_WARN);
|
2007-07-09 18:51:59 +02:00
|
|
|
}
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
/*
|
|
|
|
* Various schedule()-time debugging checks and statistics:
|
|
|
|
*/
|
|
|
|
static inline void schedule_debug(struct task_struct *prev)
|
|
|
|
{
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
2007-12-05 15:46:09 +01:00
|
|
|
* Test if we are atomic. Since do_exit() needs to call into
|
2005-04-16 15:20:36 -07:00
|
|
|
* schedule() atomically, we ignore that path for now.
|
|
|
|
* Otherwise, whine if we are scheduling when we should not be.
|
|
|
|
*/
|
2008-05-13 23:44:11 +02:00
|
|
|
if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
|
2007-07-09 18:51:59 +02:00
|
|
|
__schedule_bug(prev);
|
2011-05-24 08:31:09 -07:00
|
|
|
rcu_sleep_check();
|
2007-07-09 18:51:59 +02:00
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
profile_hit(SCHED_PROFILING, __builtin_return_address(0));
|
|
|
|
|
2007-10-15 17:00:12 +02:00
|
|
|
schedstat_inc(this_rq(), sched_count);
|
2007-07-09 18:51:59 +02:00
|
|
|
}
|
|
|
|
|
2009-11-30 13:00:37 +01:00
|
|
|
static void put_prev_task(struct rq *rq, struct task_struct *prev)
|
2009-03-10 19:08:11 +01:00
|
|
|
{
|
2011-04-29 08:36:50 +02:00
|
|
|
if (prev->on_rq || rq->skip_clock_update < 0)
|
2010-03-11 17:16:20 +01:00
|
|
|
update_rq_clock(rq);
|
2009-11-30 13:00:37 +01:00
|
|
|
prev->sched_class->put_prev_task(rq, prev);
|
2009-03-10 19:08:11 +01:00
|
|
|
}
|
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
/*
|
|
|
|
* Pick up the highest-prio task:
|
|
|
|
*/
|
|
|
|
static inline struct task_struct *
|
2009-03-02 13:55:26 +08:00
|
|
|
pick_next_task(struct rq *rq)
|
2007-07-09 18:51:59 +02:00
|
|
|
{
|
2007-10-15 17:00:12 +02:00
|
|
|
const struct sched_class *class;
|
2007-07-09 18:51:59 +02:00
|
|
|
struct task_struct *p;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
/*
|
2007-07-09 18:51:59 +02:00
|
|
|
* Optimization: we know that if all tasks are in
|
|
|
|
* the fair class we can call that function directly:
|
2005-04-16 15:20:36 -07:00
|
|
|
*/
|
2011-07-21 09:43:27 -07:00
|
|
|
if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
|
2007-08-09 11:16:48 +02:00
|
|
|
p = fair_sched_class.pick_next_task(rq);
|
2007-07-09 18:51:59 +02:00
|
|
|
if (likely(p))
|
|
|
|
return p;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2010-09-22 13:53:15 +02:00
|
|
|
for_each_class(class) {
|
2007-08-09 11:16:48 +02:00
|
|
|
p = class->pick_next_task(rq);
|
2007-07-09 18:51:59 +02:00
|
|
|
if (p)
|
|
|
|
return p;
|
|
|
|
}
|
2010-09-22 13:53:15 +02:00
|
|
|
|
|
|
|
BUG(); /* the idle class will always have a runnable task */
|
2007-07-09 18:51:59 +02:00
|
|
|
}
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
/*
|
2011-06-22 19:47:00 +02:00
|
|
|
* __schedule() is the main scheduler function.
|
2007-07-09 18:51:59 +02:00
|
|
|
*/
|
2011-06-22 19:47:00 +02:00
|
|
|
static void __sched __schedule(void)
|
2007-07-09 18:51:59 +02:00
|
|
|
{
|
|
|
|
struct task_struct *prev, *next;
|
2008-02-15 09:56:36 -08:00
|
|
|
unsigned long *switch_count;
|
2007-07-09 18:51:59 +02:00
|
|
|
struct rq *rq;
|
2008-07-18 18:01:23 +02:00
|
|
|
int cpu;
|
2007-07-09 18:51:59 +02:00
|
|
|
|
2009-03-13 12:21:26 +01:00
|
|
|
need_resched:
|
|
|
|
preempt_disable();
|
2007-07-09 18:51:59 +02:00
|
|
|
cpu = smp_processor_id();
|
|
|
|
rq = cpu_rq(cpu);
|
rcu: refactor RCU's context-switch handling
The addition of preemptible RCU to treercu resulted in a bit of
confusion and inefficiency surrounding the handling of context switches
for RCU-sched and for RCU-preempt. For RCU-sched, a context switch
is a quiescent state, pure and simple, just like it always has been.
For RCU-preempt, a context switch is in no way a quiescent state, but
special handling is required when a task blocks in an RCU read-side
critical section.
However, the callout from the scheduler and the outer loop in ksoftirqd
still calls something named rcu_sched_qs(), whose name is no longer
accurate. Furthermore, when rcu_check_callbacks() notes an RCU-sched
quiescent state, it ends up unnecessarily (though harmlessly, aside
from the performance hit) enqueuing the current task if it happens to
be running in an RCU-preempt read-side critical section. This not only
increases the maximum latency of scheduler_tick(), it also needlessly
increases the overhead of the next outermost rcu_read_unlock() invocation.
This patch addresses this situation by separating the notion of RCU's
context-switch handling from that of RCU-sched's quiescent states.
The context-switch handling is covered by rcu_note_context_switch() in
general and by rcu_preempt_note_context_switch() for preemptible RCU.
This permits rcu_sched_qs() to handle quiescent states and only quiescent
states. It also reduces the maximum latency of scheduler_tick(), though
probably by much less than a microsecond. Finally, it means that tasks
within preemptible-RCU read-side critical sections avoid incurring the
overhead of queuing unless there really is a context switch.
Suggested-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <peterz@infradead.org>
2010-04-01 17:37:01 -07:00
|
|
|
rcu_note_context_switch(cpu);
|
2007-07-09 18:51:59 +02:00
|
|
|
prev = rq->curr;
|
|
|
|
|
|
|
|
schedule_debug(prev);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2008-07-18 18:01:23 +02:00
|
|
|
if (sched_feat(HRTICK))
|
2008-05-12 21:20:55 +02:00
|
|
|
hrtick_clear(rq);
|
2008-01-25 21:08:29 +01:00
|
|
|
|
2009-11-17 14:28:38 +01:00
|
|
|
raw_spin_lock_irq(&rq->lock);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2010-05-19 14:57:11 +02:00
|
|
|
switch_count = &prev->nivcsw;
|
2005-04-16 15:20:36 -07:00
|
|
|
if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
|
2010-06-08 21:40:37 +02:00
|
|
|
if (unlikely(signal_pending_state(prev->state, prev))) {
|
2005-04-16 15:20:36 -07:00
|
|
|
prev->state = TASK_RUNNING;
|
2010-06-08 21:40:37 +02:00
|
|
|
} else {
|
2011-04-05 17:23:50 +02:00
|
|
|
deactivate_task(rq, prev, DEQUEUE_SLEEP);
|
|
|
|
prev->on_rq = 0;
|
|
|
|
|
2010-06-08 21:40:37 +02:00
|
|
|
/*
|
2011-04-05 17:23:50 +02:00
|
|
|
* If a worker went to sleep, notify and ask workqueue
|
|
|
|
* whether it wants to wake up a task to maintain
|
|
|
|
* concurrency.
|
2010-06-08 21:40:37 +02:00
|
|
|
*/
|
|
|
|
if (prev->flags & PF_WQ_WORKER) {
|
|
|
|
struct task_struct *to_wakeup;
|
|
|
|
|
|
|
|
to_wakeup = wq_worker_sleeping(prev, cpu);
|
|
|
|
if (to_wakeup)
|
|
|
|
try_to_wake_up_local(to_wakeup);
|
|
|
|
}
|
|
|
|
}
|
2007-07-09 18:51:59 +02:00
|
|
|
switch_count = &prev->nvcsw;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2009-07-29 11:08:47 -04:00
|
|
|
pre_schedule(rq, prev);
|
2008-01-25 21:08:07 +01:00
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
if (unlikely(!rq->nr_running))
|
2005-04-16 15:20:36 -07:00
|
|
|
idle_balance(cpu, rq);
|
|
|
|
|
2009-03-10 19:08:11 +01:00
|
|
|
put_prev_task(rq, prev);
|
2009-03-02 13:55:26 +08:00
|
|
|
next = pick_next_task(rq);
|
2010-12-08 11:05:42 +01:00
|
|
|
clear_tsk_need_resched(prev);
|
|
|
|
rq->skip_clock_update = 0;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
if (likely(prev != next)) {
|
|
|
|
rq->nr_switches++;
|
|
|
|
rq->curr = next;
|
|
|
|
++*switch_count;
|
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
context_switch(rq, prev, next); /* unlocks the rq */
|
2008-01-25 21:08:29 +01:00
|
|
|
/*
|
2010-05-19 14:57:11 +02:00
|
|
|
* The context switch have flipped the stack from under us
|
|
|
|
* and restored the local variables which were saved when
|
|
|
|
* this task called schedule() in the past. prev == current
|
|
|
|
* is still correct, but it can be moved to another cpu/rq.
|
2008-01-25 21:08:29 +01:00
|
|
|
*/
|
|
|
|
cpu = smp_processor_id();
|
|
|
|
rq = cpu_rq(cpu);
|
2005-04-16 15:20:36 -07:00
|
|
|
} else
|
2009-11-17 14:28:38 +01:00
|
|
|
raw_spin_unlock_irq(&rq->lock);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2009-07-29 11:08:47 -04:00
|
|
|
post_schedule(rq);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2011-03-21 13:32:17 +01:00
|
|
|
sched_preempt_enable_no_resched();
|
2009-03-13 12:21:26 +01:00
|
|
|
if (need_resched())
|
2005-04-16 15:20:36 -07:00
|
|
|
goto need_resched;
|
|
|
|
}
|
2011-06-22 19:47:00 +02:00
|
|
|
|
2011-06-22 19:47:01 +02:00
|
|
|
static inline void sched_submit_work(struct task_struct *tsk)
|
|
|
|
{
|
2011-07-17 20:46:52 +02:00
|
|
|
if (!tsk->state || tsk_is_pi_blocked(tsk))
|
2011-06-22 19:47:01 +02:00
|
|
|
return;
|
|
|
|
/*
|
|
|
|
* If we are going to sleep and we have plugged IO queued,
|
|
|
|
* make sure to submit it to avoid deadlocks.
|
|
|
|
*/
|
|
|
|
if (blk_needs_flush_plug(tsk))
|
|
|
|
blk_schedule_flush_plug(tsk);
|
|
|
|
}
|
|
|
|
|
2011-09-22 17:03:46 -07:00
|
|
|
/*
 * schedule() - submit any pending work of the current task via
 * sched_submit_work(), then run the main scheduler function.
 */
asmlinkage void __sched schedule(void)
{
	sched_submit_work(current);
	__schedule();
}
EXPORT_SYMBOL(schedule);
|
|
|
|
|
2011-03-21 12:09:35 +01:00
|
|
|
/**
|
|
|
|
* schedule_preempt_disabled - called with preemption disabled
|
|
|
|
*
|
|
|
|
* Returns with preemption disabled. Note: preempt_count must be 1
|
|
|
|
*/
|
|
|
|
void __sched schedule_preempt_disabled(void)
|
|
|
|
{
|
2011-03-21 13:32:17 +01:00
|
|
|
sched_preempt_enable_no_resched();
|
2011-03-21 12:09:35 +01:00
|
|
|
schedule();
|
|
|
|
preempt_disable();
|
|
|
|
}
|
|
|
|
|
2009-12-02 20:49:17 +01:00
|
|
|
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
|
2009-01-12 14:01:47 +01:00
|
|
|
|
2011-04-05 17:23:41 +02:00
|
|
|
static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
|
|
|
|
{
|
|
|
|
if (lock->owner != owner)
|
2011-06-10 15:08:55 +02:00
|
|
|
return false;
|
2009-01-12 14:01:47 +01:00
|
|
|
|
|
|
|
/*
|
2011-04-05 17:23:41 +02:00
|
|
|
* Ensure we emit the owner->on_cpu, dereference _after_ checking
|
|
|
|
* lock->owner still matches owner, if that fails, owner might
|
|
|
|
* point to free()d memory, if it still matches, the rcu_read_lock()
|
|
|
|
* ensures the memory stays valid.
|
2009-01-12 14:01:47 +01:00
|
|
|
*/
|
2011-04-05 17:23:41 +02:00
|
|
|
barrier();
|
2009-01-12 14:01:47 +01:00
|
|
|
|
2011-06-10 15:08:55 +02:00
|
|
|
return owner->on_cpu;
|
2011-04-05 17:23:41 +02:00
|
|
|
}
|
2009-01-12 14:01:47 +01:00
|
|
|
|
2011-04-05 17:23:41 +02:00
|
|
|
/*
|
|
|
|
* Look out! "owner" is an entirely speculative pointer
|
|
|
|
* access and not reliable.
|
|
|
|
*/
|
|
|
|
int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
|
|
|
|
{
|
|
|
|
if (!sched_feat(OWNER_SPIN))
|
|
|
|
return 0;
|
2009-01-12 14:01:47 +01:00
|
|
|
|
2011-06-10 15:08:55 +02:00
|
|
|
rcu_read_lock();
|
2011-04-05 17:23:41 +02:00
|
|
|
while (owner_running(lock, owner)) {
|
|
|
|
if (need_resched())
|
2011-06-10 15:08:55 +02:00
|
|
|
break;
|
2009-01-12 14:01:47 +01:00
|
|
|
|
2010-11-22 15:47:36 +01:00
|
|
|
arch_mutex_cpu_relax();
|
2009-01-12 14:01:47 +01:00
|
|
|
}
|
2011-06-10 15:08:55 +02:00
|
|
|
rcu_read_unlock();
|
2010-04-16 23:20:00 +02:00
|
|
|
|
2011-04-05 17:23:41 +02:00
|
|
|
/*
|
2011-06-10 15:08:55 +02:00
|
|
|
* We break out the loop above on need_resched() and when the
|
|
|
|
* owner changed, which is a sign for heavy contention. Return
|
|
|
|
* success only when lock->owner is NULL.
|
2011-04-05 17:23:41 +02:00
|
|
|
*/
|
2011-06-10 15:08:55 +02:00
|
|
|
return lock->owner == NULL;
|
2009-01-12 14:01:47 +01:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
#ifdef CONFIG_PREEMPT
|
|
|
|
/*
|
2006-07-10 04:43:52 -07:00
|
|
|
* this is the entry point to schedule() from in-kernel preemption
|
2007-12-05 15:46:09 +01:00
|
|
|
* off of preempt_enable. Kernel preemptions off return from interrupt
|
2005-04-16 15:20:36 -07:00
|
|
|
* occur there and call schedule directly.
|
|
|
|
*/
|
2010-06-02 21:52:29 -04:00
|
|
|
asmlinkage void __sched notrace preempt_schedule(void)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
|
|
|
struct thread_info *ti = current_thread_info();
|
2008-01-25 21:08:33 +01:00
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
|
|
|
* If there is a non-zero preempt_count or interrupts are disabled,
|
2007-12-05 15:46:09 +01:00
|
|
|
* we do not want to preempt the current task. Just return..
|
2005-04-16 15:20:36 -07:00
|
|
|
*/
|
2006-10-11 01:21:52 -07:00
|
|
|
if (likely(ti->preempt_count || irqs_disabled()))
|
2005-04-16 15:20:36 -07:00
|
|
|
return;
|
|
|
|
|
2007-10-15 17:00:14 +02:00
|
|
|
do {
|
2010-06-02 21:52:29 -04:00
|
|
|
add_preempt_count_notrace(PREEMPT_ACTIVE);
|
2011-06-22 19:47:00 +02:00
|
|
|
__schedule();
|
2010-06-02 21:52:29 -04:00
|
|
|
sub_preempt_count_notrace(PREEMPT_ACTIVE);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2007-10-15 17:00:14 +02:00
|
|
|
/*
|
|
|
|
* Check again in case we missed a preemption opportunity
|
|
|
|
* between schedule and now.
|
|
|
|
*/
|
|
|
|
barrier();
|
2009-03-06 19:40:20 +08:00
|
|
|
} while (need_resched());
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(preempt_schedule);
|
|
|
|
|
|
|
|
/*
|
2006-07-10 04:43:52 -07:00
|
|
|
* this is the entry point to schedule() from kernel preemption
|
2005-04-16 15:20:36 -07:00
|
|
|
* off of irq context.
|
|
|
|
* Note, that this is called and return with irqs disabled. This will
|
|
|
|
* protect us against recursive calling from irq.
|
|
|
|
*/
|
|
|
|
asmlinkage void __sched preempt_schedule_irq(void)
|
|
|
|
{
|
|
|
|
struct thread_info *ti = current_thread_info();
|
2008-01-25 21:08:33 +01:00
|
|
|
|
2006-07-10 04:43:52 -07:00
|
|
|
/* Catch callers which need to be fixed */
|
2005-04-16 15:20:36 -07:00
|
|
|
BUG_ON(ti->preempt_count || !irqs_disabled());
|
|
|
|
|
2012-07-11 20:26:36 +02:00
|
|
|
rcu_user_exit();
|
2007-10-15 17:00:14 +02:00
|
|
|
do {
|
|
|
|
add_preempt_count(PREEMPT_ACTIVE);
|
|
|
|
local_irq_enable();
|
2011-06-22 19:47:00 +02:00
|
|
|
__schedule();
|
2007-10-15 17:00:14 +02:00
|
|
|
local_irq_disable();
|
|
|
|
sub_preempt_count(PREEMPT_ACTIVE);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2007-10-15 17:00:14 +02:00
|
|
|
/*
|
|
|
|
* Check again in case we missed a preemption opportunity
|
|
|
|
* between schedule and now.
|
|
|
|
*/
|
|
|
|
barrier();
|
2009-03-06 19:40:20 +08:00
|
|
|
} while (need_resched());
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
#endif /* CONFIG_PREEMPT */
|
|
|
|
|
2009-09-15 19:14:42 +02:00
|
|
|
/*
 * Wake-up callback that hands the task stored in curr->private to
 * try_to_wake_up() together with @mode and @wake_flags, and returns
 * whatever try_to_wake_up() returns. @key is not used here.
 */
int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
			  void *key)
{
	struct task_struct *task = curr->private;

	return try_to_wake_up(task, mode, wake_flags);
}
EXPORT_SYMBOL(default_wake_function);
|
|
|
|
|
|
|
|
/*
|
2007-12-05 15:46:09 +01:00
|
|
|
* The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
|
|
|
|
* wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
|
2005-04-16 15:20:36 -07:00
|
|
|
* number) then we wake all the non-exclusive tasks and one exclusive task.
|
|
|
|
*
|
|
|
|
* There are circumstances in which we can try to wake a task which has already
|
2007-12-05 15:46:09 +01:00
|
|
|
* started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
|
2005-04-16 15:20:36 -07:00
|
|
|
* zero in this (rare) case, and we handle it by continuing to scan the queue.
|
|
|
|
*/
|
2009-04-14 16:53:05 +02:00
|
|
|
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
|
2009-09-15 19:14:42 +02:00
|
|
|
int nr_exclusive, int wake_flags, void *key)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2007-10-15 17:00:02 +02:00
|
|
|
wait_queue_t *curr, *next;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2007-10-15 17:00:02 +02:00
|
|
|
list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
|
2006-07-03 00:25:40 -07:00
|
|
|
unsigned flags = curr->flags;
|
|
|
|
|
2009-09-15 19:14:42 +02:00
|
|
|
if (curr->func(curr, mode, wake_flags, key) &&
|
2006-07-03 00:25:40 -07:00
|
|
|
(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
|
2005-04-16 15:20:36 -07:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* __wake_up - wake up threads blocked on a waitqueue.
|
|
|
|
* @q: the waitqueue
|
|
|
|
* @mode: which threads
|
|
|
|
* @nr_exclusive: how many wake-one or wake-many threads to wake up
|
2005-05-01 08:59:26 -07:00
|
|
|
* @key: is directly passed to the wakeup function
|
2009-04-28 15:01:38 +01:00
|
|
|
*
|
|
|
|
* It may be assumed that this function implies a write memory barrier before
|
|
|
|
* changing the task state if and only if any tasks are woken up.
|
2005-04-16 15:20:36 -07:00
|
|
|
*/
|
2008-02-08 04:19:53 -08:00
|
|
|
void __wake_up(wait_queue_head_t *q, unsigned int mode,
|
2005-09-10 00:26:11 -07:00
|
|
|
int nr_exclusive, void *key)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
spin_lock_irqsave(&q->lock, flags);
|
|
|
|
__wake_up_common(q, mode, nr_exclusive, 0, key);
|
|
|
|
spin_unlock_irqrestore(&q->lock, flags);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(__wake_up);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Same as __wake_up but called with the spinlock in wait_queue_head_t held.
|
|
|
|
*/
|
2011-12-01 00:04:00 +01:00
|
|
|
void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2011-12-01 00:04:00 +01:00
|
|
|
__wake_up_common(q, mode, nr, 0, NULL);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
2010-05-05 12:53:11 +02:00
|
|
|
EXPORT_SYMBOL_GPL(__wake_up_locked);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
epoll keyed wakeups: add __wake_up_locked_key() and __wake_up_sync_key()
This patchset introduces wakeup hints for some of the most popular (from
epoll POV) devices, so that epoll code can avoid spurious wakeups on its
waiters.
The problem with epoll is that the callback-based wakeups do not, ATM,
carry any information about the events the wakeup is related to. So the
only choice epoll has (not being able to call f_op->poll() from inside the
callback), is to add the file* to a ready-list and resolve the real events
later on, at epoll_wait() (or its own f_op->poll()) time. This can cause
spurious wakeups, since the wake_up() itself might be for an event the
caller is not interested into.
The rate of these spurious wakeup can be pretty high in case of many
network sockets being monitored.
By allowing devices to report the events the wakeups refer to (at least
the two major classes - POLLIN/POLLOUT), we are able to spare useless
wakeups by proper handling inside the epoll's poll callback.
Epoll will have in any case to call f_op->poll() on the file* later on,
since the change to be done in order to have the full event set sent via
wakeup, is too invasive for the way our f_op->poll() system works (the
full event set is calculated inside the poll function - there are too many
of them to even start thinking the change - also poll/select would need
change too).
Epoll is changed in a way that both devices which send event hints, and
the ones that don't, are correctly handled. The former will gain some
efficiency though.
As a general rule for devices, would be to add an event mask by using
key-aware wakeup macros, when making up poll wait queues. I tested it
(together with the epoll's poll fix patch Andrew has in -mm) and wakeups
for the supported devices are correctly filtered.
Test program available here:
http://www.xmailserver.org/epoll_test.c
This patch:
Nothing revolutionary here. Just using the available "key" that our
wakeup core already support. The __wake_up_locked_key() was no brainer,
since both __wake_up_locked() and __wake_up_locked_key() are thin wrappers
around __wake_up_common().
The __wake_up_sync() function had a body, so the choice was between
borrowing the body for __wake_up_sync_key() and calling it from
__wake_up_sync(), or make an inline and calling it from both. I chose the
former since in most archs it all resolves to "mov $0, REG; jmp ADDR".
Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
Cc: William Lee Irwin III <wli@movementarian.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-03-31 15:24:20 -07:00
|
|
|
/*
 * Keyed wakeup that does no locking of its own: forwards @key to
 * __wake_up_common() with an exclusive-wakeup count of one and no
 * wake flags.
 */
void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
{
	const int nr_exclusive = 1;

	__wake_up_common(q, mode, nr_exclusive, 0, key);
}
EXPORT_SYMBOL_GPL(__wake_up_locked_key);
|
epoll keyed wakeups: add __wake_up_locked_key() and __wake_up_sync_key()
This patchset introduces wakeup hints for some of the most popular (from
epoll POV) devices, so that epoll code can avoid spurious wakeups on its
waiters.
The problem with epoll is that the callback-based wakeups do not, ATM,
carry any information about the events the wakeup is related to. So the
only choice epoll has (not being able to call f_op->poll() from inside the
callback), is to add the file* to a ready-list and resolve the real events
later on, at epoll_wait() (or its own f_op->poll()) time. This can cause
spurious wakeups, since the wake_up() itself might be for an event the
caller is not interested into.
The rate of these spurious wakeup can be pretty high in case of many
network sockets being monitored.
By allowing devices to report the events the wakeups refer to (at least
the two major classes - POLLIN/POLLOUT), we are able to spare useless
wakeups by proper handling inside the epoll's poll callback.
Epoll will have in any case to call f_op->poll() on the file* later on,
since the change to be done in order to have the full event set sent via
wakeup, is too invasive for the way our f_op->poll() system works (the
full event set is calculated inside the poll function - there are too many
of them to even start thinking the change - also poll/select would need
change too).
Epoll is changed in a way that both devices which send event hints, and
the ones that don't, are correctly handled. The former will gain some
efficiency though.
As a general rule for devices, would be to add an event mask by using
key-aware wakeup macros, when making up poll wait queues. I tested it
(together with the epoll's poll fix patch Andrew has in -mm) and wakeups
for the supported devices are correctly filtered.
Test program available here:
http://www.xmailserver.org/epoll_test.c
This patch:
Nothing revolutionary here. Just using the available "key" that our
wakeup core already support. The __wake_up_locked_key() was no brainer,
since both __wake_up_locked() and __wake_up_locked_key() are thin wrappers
around __wake_up_common().
The __wake_up_sync() function had a body, so the choice was between
borrowing the body for __wake_up_sync_key() and calling it from
__wake_up_sync(), or make an inline and calling it from both. I chose the
former since in most archs it all resolves to "mov $0, REG; jmp ADDR".
Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
Cc: William Lee Irwin III <wli@movementarian.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-03-31 15:24:20 -07:00
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/**
|
epoll keyed wakeups: add __wake_up_locked_key() and __wake_up_sync_key()
This patchset introduces wakeup hints for some of the most popular (from
epoll POV) devices, so that epoll code can avoid spurious wakeups on its
waiters.
The problem with epoll is that the callback-based wakeups do not, ATM,
carry any information about the events the wakeup is related to. So the
only choice epoll has (not being able to call f_op->poll() from inside the
callback), is to add the file* to a ready-list and resolve the real events
later on, at epoll_wait() (or its own f_op->poll()) time. This can cause
spurious wakeups, since the wake_up() itself might be for an event the
caller is not interested into.
The rate of these spurious wakeup can be pretty high in case of many
network sockets being monitored.
By allowing devices to report the events the wakeups refer to (at least
the two major classes - POLLIN/POLLOUT), we are able to spare useless
wakeups by proper handling inside the epoll's poll callback.
Epoll will have in any case to call f_op->poll() on the file* later on,
since the change to be done in order to have the full event set sent via
wakeup, is too invasive for the way our f_op->poll() system works (the
full event set is calculated inside the poll function - there are too many
of them to even start thinking the change - also poll/select would need
change too).
Epoll is changed in a way that both devices which send event hints, and
the ones that don't, are correctly handled. The former will gain some
efficiency though.
As a general rule for devices, would be to add an event mask by using
key-aware wakeup macros, when making up poll wait queues. I tested it
(together with the epoll's poll fix patch Andrew has in -mm) and wakeups
for the supported devices are correctly filtered.
Test program available here:
http://www.xmailserver.org/epoll_test.c
This patch:
Nothing revolutionary here. Just using the available "key" that our
wakeup core already support. The __wake_up_locked_key() was no brainer,
since both __wake_up_locked() and __wake_up_locked_key() are thin wrappers
around __wake_up_common().
The __wake_up_sync() function had a body, so the choice was between
borrowing the body for __wake_up_sync_key() and calling it from
__wake_up_sync(), or make an inline and calling it from both. I chose the
former since in most archs it all resolves to "mov $0, REG; jmp ADDR".
Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
Cc: William Lee Irwin III <wli@movementarian.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-03-31 15:24:20 -07:00
|
|
|
* __wake_up_sync_key - wake up threads blocked on a waitqueue.
|
2005-04-16 15:20:36 -07:00
|
|
|
* @q: the waitqueue
|
|
|
|
* @mode: which threads
|
|
|
|
* @nr_exclusive: how many wake-one or wake-many threads to wake up
|
epoll keyed wakeups: add __wake_up_locked_key() and __wake_up_sync_key()
This patchset introduces wakeup hints for some of the most popular (from
epoll POV) devices, so that epoll code can avoid spurious wakeups on its
waiters.
The problem with epoll is that the callback-based wakeups do not, ATM,
carry any information about the events the wakeup is related to. So the
only choice epoll has (not being able to call f_op->poll() from inside the
callback), is to add the file* to a ready-list and resolve the real events
later on, at epoll_wait() (or its own f_op->poll()) time. This can cause
spurious wakeups, since the wake_up() itself might be for an event the
caller is not interested into.
The rate of these spurious wakeup can be pretty high in case of many
network sockets being monitored.
By allowing devices to report the events the wakeups refer to (at least
the two major classes - POLLIN/POLLOUT), we are able to spare useless
wakeups by proper handling inside the epoll's poll callback.
Epoll will have in any case to call f_op->poll() on the file* later on,
since the change to be done in order to have the full event set sent via
wakeup, is too invasive for the way our f_op->poll() system works (the
full event set is calculated inside the poll function - there are too many
of them to even start thinking the change - also poll/select would need
change too).
Epoll is changed in a way that both devices which send event hints, and
the ones that don't, are correctly handled. The former will gain some
efficiency though.
As a general rule for devices, would be to add an event mask by using
key-aware wakeup macros, when making up poll wait queues. I tested it
(together with the epoll's poll fix patch Andrew has in -mm) and wakeups
for the supported devices are correctly filtered.
Test program available here:
http://www.xmailserver.org/epoll_test.c
This patch:
Nothing revolutionary here. Just using the available "key" that our
wakeup core already support. The __wake_up_locked_key() was no brainer,
since both __wake_up_locked() and __wake_up_locked_key() are thin wrappers
around __wake_up_common().
The __wake_up_sync() function had a body, so the choice was between
borrowing the body for __wake_up_sync_key() and calling it from
__wake_up_sync(), or make an inline and calling it from both. I chose the
former since in most archs it all resolves to "mov $0, REG; jmp ADDR".
Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
Cc: William Lee Irwin III <wli@movementarian.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-03-31 15:24:20 -07:00
|
|
|
* @key: opaque value to be passed to wakeup targets
|
2005-04-16 15:20:36 -07:00
|
|
|
*
|
|
|
|
* The sync wakeup differs that the waker knows that it will schedule
|
|
|
|
* away soon, so while the target thread will be woken up, it will not
|
|
|
|
* be migrated to another CPU - ie. the two threads are 'synchronized'
|
|
|
|
* with each other. This can prevent needless bouncing between CPUs.
|
|
|
|
*
|
|
|
|
* On UP it can prevent extra preemption.
|
2009-04-28 15:01:38 +01:00
|
|
|
*
|
|
|
|
* It may be assumed that this function implies a write memory barrier before
|
|
|
|
* changing the task state if and only if any tasks are woken up.
|
2005-04-16 15:20:36 -07:00
|
|
|
*/
|
epoll keyed wakeups: add __wake_up_locked_key() and __wake_up_sync_key()
This patchset introduces wakeup hints for some of the most popular (from
epoll POV) devices, so that epoll code can avoid spurious wakeups on its
waiters.
The problem with epoll is that the callback-based wakeups do not, ATM,
carry any information about the events the wakeup is related to. So the
only choice epoll has (not being able to call f_op->poll() from inside the
callback), is to add the file* to a ready-list and resolve the real events
later on, at epoll_wait() (or its own f_op->poll()) time. This can cause
spurious wakeups, since the wake_up() itself might be for an event the
caller is not interested into.
The rate of these spurious wakeup can be pretty high in case of many
network sockets being monitored.
By allowing devices to report the events the wakeups refer to (at least
the two major classes - POLLIN/POLLOUT), we are able to spare useless
wakeups by proper handling inside the epoll's poll callback.
Epoll will have in any case to call f_op->poll() on the file* later on,
since the change to be done in order to have the full event set sent via
wakeup, is too invasive for the way our f_op->poll() system works (the
full event set is calculated inside the poll function - there are too many
of them to even start thinking the change - also poll/select would need
change too).
Epoll is changed in a way that both devices which send event hints, and
the ones that don't, are correctly handled. The former will gain some
efficiency though.
As a general rule for devices, would be to add an event mask by using
key-aware wakeup macros, when making up poll wait queues. I tested it
(together with the epoll's poll fix patch Andrew has in -mm) and wakeups
for the supported devices are correctly filtered.
Test program available here:
http://www.xmailserver.org/epoll_test.c
This patch:
Nothing revolutionary here. Just using the available "key" that our
wakeup core already support. The __wake_up_locked_key() was no brainer,
since both __wake_up_locked() and __wake_up_locked_key() are thin wrappers
around __wake_up_common().
The __wake_up_sync() function had a body, so the choice was between
borrowing the body for __wake_up_sync_key() and calling it from
__wake_up_sync(), or make an inline and calling it from both. I chose the
former since in most archs it all resolves to "mov $0, REG; jmp ADDR".
Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
Cc: William Lee Irwin III <wli@movementarian.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-03-31 15:24:20 -07:00
|
|
|
void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
|
|
|
|
int nr_exclusive, void *key)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
|
|
|
unsigned long flags;
|
2009-09-14 19:55:44 +02:00
|
|
|
int wake_flags = WF_SYNC;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
if (unlikely(!q))
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (unlikely(!nr_exclusive))
|
2009-09-14 19:55:44 +02:00
|
|
|
wake_flags = 0;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
spin_lock_irqsave(&q->lock, flags);
|
2009-09-14 19:55:44 +02:00
|
|
|
__wake_up_common(q, mode, nr_exclusive, wake_flags, key);
|
2005-04-16 15:20:36 -07:00
|
|
|
spin_unlock_irqrestore(&q->lock, flags);
|
|
|
|
}
|
epoll keyed wakeups: add __wake_up_locked_key() and __wake_up_sync_key()
This patchset introduces wakeup hints for some of the most popular (from
epoll POV) devices, so that epoll code can avoid spurious wakeups on its
waiters.
The problem with epoll is that the callback-based wakeups do not, ATM,
carry any information about the events the wakeup is related to. So the
only choice epoll has (not being able to call f_op->poll() from inside the
callback), is to add the file* to a ready-list and resolve the real events
later on, at epoll_wait() (or its own f_op->poll()) time. This can cause
spurious wakeups, since the wake_up() itself might be for an event the
caller is not interested into.
The rate of these spurious wakeup can be pretty high in case of many
network sockets being monitored.
By allowing devices to report the events the wakeups refer to (at least
the two major classes - POLLIN/POLLOUT), we are able to spare useless
wakeups by proper handling inside the epoll's poll callback.
Epoll will have in any case to call f_op->poll() on the file* later on,
since the change to be done in order to have the full event set sent via
wakeup, is too invasive for the way our f_op->poll() system works (the
full event set is calculated inside the poll function - there are too many
of them to even start thinking the change - also poll/select would need
change too).
Epoll is changed in a way that both devices which send event hints, and
the ones that don't, are correctly handled. The former will gain some
efficiency though.
As a general rule for devices, would be to add an event mask by using
key-aware wakeup macros, when making up poll wait queues. I tested it
(together with the epoll's poll fix patch Andrew has in -mm) and wakeups
for the supported devices are correctly filtered.
Test program available here:
http://www.xmailserver.org/epoll_test.c
This patch:
Nothing revolutionary here. Just using the available "key" that our
wakeup core already support. The __wake_up_locked_key() was no brainer,
since both __wake_up_locked() and __wake_up_locked_key() are thin wrappers
around __wake_up_common().
The __wake_up_sync() function had a body, so the choice was between
borrowing the body for __wake_up_sync_key() and calling it from
__wake_up_sync(), or make an inline and calling it from both. I chose the
former since in most archs it all resolves to "mov $0, REG; jmp ADDR".
Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
Cc: William Lee Irwin III <wli@movementarian.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-03-31 15:24:20 -07:00
|
|
|
EXPORT_SYMBOL_GPL(__wake_up_sync_key);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* __wake_up_sync - see __wake_up_sync_key()
|
|
|
|
*/
|
|
|
|
void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
|
|
|
|
{
|
|
|
|
__wake_up_sync_key(q, mode, nr_exclusive, NULL);
|
|
|
|
}
|
2005-04-16 15:20:36 -07:00
|
|
|
EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
|
|
|
|
|
2008-08-26 10:26:54 +02:00
|
|
|
/**
|
|
|
|
* complete: - signals a single thread waiting on this completion
|
|
|
|
* @x: holds the state of this particular completion
|
|
|
|
*
|
|
|
|
* This will wake up a single thread waiting on this completion. Threads will be
|
|
|
|
* awakened in the same order in which they were queued.
|
|
|
|
*
|
|
|
|
* See also complete_all(), wait_for_completion() and related routines.
|
2009-04-28 15:01:38 +01:00
|
|
|
*
|
|
|
|
* It may be assumed that this function implies a write memory barrier before
|
|
|
|
* changing the task state if and only if any tasks are woken up.
|
2008-08-26 10:26:54 +02:00
|
|
|
*/
|
2007-10-24 18:23:48 +02:00
|
|
|
void complete(struct completion *x)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
spin_lock_irqsave(&x->wait.lock, flags);
|
|
|
|
x->done++;
|
2007-12-06 11:07:07 -05:00
|
|
|
__wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
|
2005-04-16 15:20:36 -07:00
|
|
|
spin_unlock_irqrestore(&x->wait.lock, flags);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(complete);
|
|
|
|
|
2008-08-26 10:26:54 +02:00
|
|
|
/**
|
|
|
|
* complete_all: - signals all threads waiting on this completion
|
|
|
|
* @x: holds the state of this particular completion
|
|
|
|
*
|
|
|
|
* This will wake up all threads waiting on this particular completion event.
|
2009-04-28 15:01:38 +01:00
|
|
|
*
|
|
|
|
* It may be assumed that this function implies a write memory barrier before
|
|
|
|
* changing the task state if and only if any tasks are woken up.
|
2008-08-26 10:26:54 +02:00
|
|
|
*/
|
2007-10-24 18:23:48 +02:00
|
|
|
void complete_all(struct completion *x)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
spin_lock_irqsave(&x->wait.lock, flags);
|
|
|
|
x->done += UINT_MAX/2;
|
2007-12-06 11:07:07 -05:00
|
|
|
__wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
|
2005-04-16 15:20:36 -07:00
|
|
|
spin_unlock_irqrestore(&x->wait.lock, flags);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(complete_all);
|
|
|
|
|
2007-10-15 17:00:14 +02:00
|
|
|
static inline long __sched
|
|
|
|
do_wait_for_common(struct completion *x, long timeout, int state)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
|
|
|
if (!x->done) {
|
|
|
|
DECLARE_WAITQUEUE(wait, current);
|
|
|
|
|
2010-05-07 14:33:26 +08:00
|
|
|
__add_wait_queue_tail_exclusive(&x->wait, &wait);
|
2005-04-16 15:20:36 -07:00
|
|
|
do {
|
2008-08-20 16:54:41 -07:00
|
|
|
if (signal_pending_state(state, current)) {
|
2008-06-20 18:32:20 +04:00
|
|
|
timeout = -ERESTARTSYS;
|
|
|
|
break;
|
2007-10-15 17:00:14 +02:00
|
|
|
}
|
|
|
|
__set_current_state(state);
|
2005-04-16 15:20:36 -07:00
|
|
|
spin_unlock_irq(&x->wait.lock);
|
|
|
|
timeout = schedule_timeout(timeout);
|
|
|
|
spin_lock_irq(&x->wait.lock);
|
2008-06-20 18:32:20 +04:00
|
|
|
} while (!x->done && timeout);
|
2005-04-16 15:20:36 -07:00
|
|
|
__remove_wait_queue(&x->wait, &wait);
|
2008-06-20 18:32:20 +04:00
|
|
|
if (!x->done)
|
|
|
|
return timeout;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
x->done--;
|
2008-06-20 18:32:20 +04:00
|
|
|
return timeout ?: 1;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2007-10-15 17:00:14 +02:00
|
|
|
static long __sched
|
|
|
|
wait_for_common(struct completion *x, long timeout, int state)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
|
|
|
might_sleep();
|
|
|
|
|
|
|
|
spin_lock_irq(&x->wait.lock);
|
2007-10-15 17:00:14 +02:00
|
|
|
timeout = do_wait_for_common(x, timeout, state);
|
2005-04-16 15:20:36 -07:00
|
|
|
spin_unlock_irq(&x->wait.lock);
|
2007-10-15 17:00:14 +02:00
|
|
|
return timeout;
|
|
|
|
}
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2008-08-26 10:26:54 +02:00
|
|
|
/**
|
|
|
|
* wait_for_completion: - waits for completion of a task
|
|
|
|
* @x: holds the state of this particular completion
|
|
|
|
*
|
|
|
|
* This waits to be signaled for completion of a specific task. It is NOT
|
|
|
|
* interruptible and there is no timeout.
|
|
|
|
*
|
|
|
|
* See also similar routines (i.e. wait_for_completion_timeout()) with timeout
|
|
|
|
* and interrupt capability. Also see complete().
|
|
|
|
*/
|
2007-10-24 18:23:48 +02:00
|
|
|
void __sched wait_for_completion(struct completion *x)
|
2007-10-15 17:00:14 +02:00
|
|
|
{
|
|
|
|
wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
2007-10-15 17:00:14 +02:00
|
|
|
EXPORT_SYMBOL(wait_for_completion);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2008-08-26 10:26:54 +02:00
|
|
|
/**
|
|
|
|
* wait_for_completion_timeout: - waits for completion of a task (w/timeout)
|
|
|
|
* @x: holds the state of this particular completion
|
|
|
|
* @timeout: timeout value in jiffies
|
|
|
|
*
|
|
|
|
* This waits for either a completion of a specific task to be signaled or for a
|
|
|
|
* specified timeout to expire. The timeout is in jiffies. It is not
|
|
|
|
* interruptible.
|
2011-10-06 15:22:46 -04:00
|
|
|
*
|
|
|
|
* The return value is 0 if timed out, and positive (at least 1, or number of
|
|
|
|
* jiffies left till timeout) if completed.
|
2008-08-26 10:26:54 +02:00
|
|
|
*/
|
2007-10-24 18:23:48 +02:00
|
|
|
unsigned long __sched
|
2007-10-15 17:00:14 +02:00
|
|
|
wait_for_completion_timeout(struct completion *x, unsigned long timeout)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2007-10-15 17:00:14 +02:00
|
|
|
return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
2007-10-15 17:00:14 +02:00
|
|
|
EXPORT_SYMBOL(wait_for_completion_timeout);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2008-08-26 10:26:54 +02:00
|
|
|
/**
|
|
|
|
* wait_for_completion_interruptible: - waits for completion of a task (w/intr)
|
|
|
|
* @x: holds the state of this particular completion
|
|
|
|
*
|
|
|
|
* This waits for completion of a specific task to be signaled. It is
|
|
|
|
* interruptible.
|
2011-10-06 15:22:46 -04:00
|
|
|
*
|
|
|
|
* The return value is -ERESTARTSYS if interrupted, 0 if completed.
|
2008-08-26 10:26:54 +02:00
|
|
|
*/
|
2007-10-15 17:00:14 +02:00
|
|
|
int __sched wait_for_completion_interruptible(struct completion *x)
|
2007-07-09 18:52:01 +02:00
|
|
|
{
|
2007-10-18 21:32:55 +02:00
|
|
|
long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
|
|
|
|
if (t == -ERESTARTSYS)
|
|
|
|
return t;
|
|
|
|
return 0;
|
2007-07-09 18:52:01 +02:00
|
|
|
}
|
2007-10-15 17:00:14 +02:00
|
|
|
EXPORT_SYMBOL(wait_for_completion_interruptible);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2008-08-26 10:26:54 +02:00
|
|
|
/**
|
|
|
|
* wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
|
|
|
|
* @x: holds the state of this particular completion
|
|
|
|
* @timeout: timeout value in jiffies
|
|
|
|
*
|
|
|
|
* This waits for either a completion of a specific task to be signaled or for a
|
|
|
|
* specified timeout to expire. It is interruptible. The timeout is in jiffies.
|
2011-10-06 15:22:46 -04:00
|
|
|
*
|
|
|
|
* The return value is -ERESTARTSYS if interrupted, 0 if timed out,
|
|
|
|
* positive (at least 1, or number of jiffies left till timeout) if completed.
|
2008-08-26 10:26:54 +02:00
|
|
|
*/
|
2011-01-05 12:50:16 +11:00
|
|
|
long __sched
|
2007-10-15 17:00:14 +02:00
|
|
|
wait_for_completion_interruptible_timeout(struct completion *x,
|
|
|
|
unsigned long timeout)
|
2007-07-09 18:52:01 +02:00
|
|
|
{
|
2007-10-15 17:00:14 +02:00
|
|
|
return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
|
2007-07-09 18:52:01 +02:00
|
|
|
}
|
2007-10-15 17:00:14 +02:00
|
|
|
EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2008-08-26 10:26:54 +02:00
|
|
|
/**
|
|
|
|
* wait_for_completion_killable: - waits for completion of a task (killable)
|
|
|
|
* @x: holds the state of this particular completion
|
|
|
|
*
|
|
|
|
* This waits to be signaled for completion of a specific task. It can be
|
|
|
|
* interrupted by a kill signal.
|
2011-10-06 15:22:46 -04:00
|
|
|
*
|
|
|
|
* The return value is -ERESTARTSYS if interrupted, 0 if completed.
|
2008-08-26 10:26:54 +02:00
|
|
|
*/
|
2007-12-06 12:29:54 -05:00
|
|
|
int __sched wait_for_completion_killable(struct completion *x)
|
|
|
|
{
|
|
|
|
long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
|
|
|
|
if (t == -ERESTARTSYS)
|
|
|
|
return t;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(wait_for_completion_killable);
|
|
|
|
|
2010-05-29 09:12:30 -07:00
|
|
|
/**
|
|
|
|
* wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
|
|
|
|
* @x: holds the state of this particular completion
|
|
|
|
* @timeout: timeout value in jiffies
|
|
|
|
*
|
|
|
|
* This waits for either a completion of a specific task to be
|
|
|
|
* signaled or for a specified timeout to expire. It can be
|
|
|
|
* interrupted by a kill signal. The timeout is in jiffies.
|
2011-10-06 15:22:46 -04:00
|
|
|
*
|
|
|
|
* The return value is -ERESTARTSYS if interrupted, 0 if timed out,
|
|
|
|
* positive (at least 1, or number of jiffies left till timeout) if completed.
|
2010-05-29 09:12:30 -07:00
|
|
|
*/
|
2011-01-05 12:50:16 +11:00
|
|
|
long __sched
|
2010-05-29 09:12:30 -07:00
|
|
|
wait_for_completion_killable_timeout(struct completion *x,
|
|
|
|
unsigned long timeout)
|
|
|
|
{
|
|
|
|
return wait_for_common(x, timeout, TASK_KILLABLE);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(wait_for_completion_killable_timeout);
|
|
|
|
|
2008-08-15 00:40:44 -07:00
|
|
|
/**
|
|
|
|
* try_wait_for_completion - try to decrement a completion without blocking
|
|
|
|
* @x: completion structure
|
|
|
|
*
|
|
|
|
* Returns: 0 if a decrement cannot be done without blocking
|
|
|
|
* 1 if a decrement succeeded.
|
|
|
|
*
|
|
|
|
* If a completion is being used as a counting completion,
|
|
|
|
* attempt to decrement the counter without blocking. This
|
|
|
|
* enables us to avoid waiting if the resource the completion
|
|
|
|
* is protecting is not available.
|
|
|
|
*/
|
|
|
|
bool try_wait_for_completion(struct completion *x)
|
|
|
|
{
|
2009-12-13 00:07:30 +01:00
|
|
|
unsigned long flags;
|
2008-08-15 00:40:44 -07:00
|
|
|
int ret = 1;
|
|
|
|
|
2009-12-13 00:07:30 +01:00
|
|
|
spin_lock_irqsave(&x->wait.lock, flags);
|
2008-08-15 00:40:44 -07:00
|
|
|
if (!x->done)
|
|
|
|
ret = 0;
|
|
|
|
else
|
|
|
|
x->done--;
|
2009-12-13 00:07:30 +01:00
|
|
|
spin_unlock_irqrestore(&x->wait.lock, flags);
|
2008-08-15 00:40:44 -07:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(try_wait_for_completion);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* completion_done - Test to see if a completion has any waiters
|
|
|
|
* @x: completion structure
|
|
|
|
*
|
|
|
|
* Returns: 0 if there are waiters (wait_for_completion() in progress)
|
|
|
|
* 1 if there are no waiters.
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
bool completion_done(struct completion *x)
|
|
|
|
{
|
2009-12-13 00:07:30 +01:00
|
|
|
unsigned long flags;
|
2008-08-15 00:40:44 -07:00
|
|
|
int ret = 1;
|
|
|
|
|
2009-12-13 00:07:30 +01:00
|
|
|
spin_lock_irqsave(&x->wait.lock, flags);
|
2008-08-15 00:40:44 -07:00
|
|
|
if (!x->done)
|
|
|
|
ret = 0;
|
2009-12-13 00:07:30 +01:00
|
|
|
spin_unlock_irqrestore(&x->wait.lock, flags);
|
2008-08-15 00:40:44 -07:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(completion_done);
|
|
|
|
|
2007-10-15 17:00:14 +02:00
|
|
|
static long __sched
|
|
|
|
sleep_on_common(wait_queue_head_t *q, int state, long timeout)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2007-07-09 18:52:01 +02:00
|
|
|
unsigned long flags;
|
|
|
|
wait_queue_t wait;
|
|
|
|
|
|
|
|
init_waitqueue_entry(&wait, current);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2007-10-15 17:00:14 +02:00
|
|
|
__set_current_state(state);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2007-10-15 17:00:14 +02:00
|
|
|
spin_lock_irqsave(&q->lock, flags);
|
|
|
|
__add_wait_queue(q, &wait);
|
|
|
|
spin_unlock(&q->lock);
|
|
|
|
timeout = schedule_timeout(timeout);
|
|
|
|
spin_lock_irq(&q->lock);
|
|
|
|
__remove_wait_queue(q, &wait);
|
|
|
|
spin_unlock_irqrestore(&q->lock, flags);
|
|
|
|
|
|
|
|
return timeout;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * interruptible_sleep_on - sleep on @q in TASK_INTERRUPTIBLE state with
 * no timeout (MAX_SCHEDULE_TIMEOUT). The helper's return value is ignored.
 */
void __sched interruptible_sleep_on(wait_queue_head_t *q)
{
	sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
|
|
|
|
EXPORT_SYMBOL(interruptible_sleep_on);
|
|
|
|
|
2007-07-09 18:52:01 +02:00
|
|
|
long __sched
|
2005-09-10 00:26:11 -07:00
|
|
|
interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2007-10-15 17:00:14 +02:00
|
|
|
return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(interruptible_sleep_on_timeout);
|
|
|
|
|
2007-07-09 18:52:01 +02:00
|
|
|
/*
 * sleep_on - sleep on @q in TASK_UNINTERRUPTIBLE state with no timeout
 * (MAX_SCHEDULE_TIMEOUT). The helper's return value is ignored.
 */
void __sched sleep_on(wait_queue_head_t *q)
{
	sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
|
|
|
|
EXPORT_SYMBOL(sleep_on);
|
|
|
|
|
2007-07-09 18:52:01 +02:00
|
|
|
/*
 * sleep_on_timeout - sleep on @q in TASK_UNINTERRUPTIBLE state for at most
 * @timeout. Returns the value produced by sleep_on_common().
 */
long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
{
	long remaining = sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);

	return remaining;
}
|
|
|
|
EXPORT_SYMBOL(sleep_on_timeout);
|
|
|
|
|
2006-06-27 02:54:51 -07:00
|
|
|
#ifdef CONFIG_RT_MUTEXES
|
|
|
|
|
|
|
|
/*
|
|
|
|
* rt_mutex_setprio - set the current priority of a task
|
|
|
|
* @p: task
|
|
|
|
* @prio: prio value (kernel-internal form)
|
|
|
|
*
|
|
|
|
* This function changes the 'effective' priority of a task. It does
|
|
|
|
* not touch ->normal_prio like __setscheduler().
|
|
|
|
*
|
|
|
|
* Used by the rt_mutex code to implement priority inheritance logic.
|
|
|
|
*/
|
2006-07-03 00:25:41 -07:00
|
|
|
void rt_mutex_setprio(struct task_struct *p, int prio)
|
2006-06-27 02:54:51 -07:00
|
|
|
{
|
2007-10-15 17:00:08 +02:00
|
|
|
int oldprio, on_rq, running;
|
2006-07-03 00:25:42 -07:00
|
|
|
struct rq *rq;
|
2010-02-17 09:05:48 +01:00
|
|
|
const struct sched_class *prev_class;
|
2006-06-27 02:54:51 -07:00
|
|
|
|
|
|
|
BUG_ON(prio < 0 || prio > MAX_PRIO);
|
|
|
|
|
2011-04-05 17:23:51 +02:00
|
|
|
rq = __task_rq_lock(p);
|
2006-06-27 02:54:51 -07:00
|
|
|
|
2011-06-06 20:07:38 +02:00
|
|
|
/*
|
|
|
|
* Idle task boosting is a nono in general. There is one
|
|
|
|
* exception, when PREEMPT_RT and NOHZ is active:
|
|
|
|
*
|
|
|
|
* The idle task calls get_next_timer_interrupt() and holds
|
|
|
|
* the timer wheel base->lock on the CPU and another CPU wants
|
|
|
|
* to access the timer (probably to cancel it). We can safely
|
|
|
|
* ignore the boosting request, as the idle CPU runs this code
|
|
|
|
* with interrupts disabled and will complete the lock
|
|
|
|
* protected section without being interrupted. So there is no
|
|
|
|
* real need to boost.
|
|
|
|
*/
|
|
|
|
if (unlikely(p == rq->idle)) {
|
|
|
|
WARN_ON(p != rq->curr);
|
|
|
|
WARN_ON(p->pi_blocked_on);
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
|
|
|
|
2010-09-20 15:13:34 -04:00
|
|
|
trace_sched_pi_setprio(p, prio);
|
2007-05-08 20:27:06 -07:00
|
|
|
oldprio = p->prio;
|
2010-02-17 09:05:48 +01:00
|
|
|
prev_class = p->sched_class;
|
2011-04-05 17:23:44 +02:00
|
|
|
on_rq = p->on_rq;
|
2007-12-18 15:21:13 +01:00
|
|
|
running = task_current(rq, p);
|
2008-03-10 11:01:20 -07:00
|
|
|
if (on_rq)
|
2007-08-09 11:16:49 +02:00
|
|
|
dequeue_task(rq, p, 0);
|
2008-03-10 11:01:20 -07:00
|
|
|
if (running)
|
|
|
|
p->sched_class->put_prev_task(rq, p);
|
2007-07-09 18:51:59 +02:00
|
|
|
|
|
|
|
if (rt_prio(prio))
|
|
|
|
p->sched_class = &rt_sched_class;
|
|
|
|
else
|
|
|
|
p->sched_class = &fair_sched_class;
|
|
|
|
|
2006-06-27 02:54:51 -07:00
|
|
|
p->prio = prio;
|
|
|
|
|
2008-03-10 11:01:20 -07:00
|
|
|
if (running)
|
|
|
|
p->sched_class->set_curr_task(rq);
|
2011-01-17 17:03:27 +01:00
|
|
|
if (on_rq)
|
2010-03-24 16:38:48 +01:00
|
|
|
enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
|
2008-01-25 21:08:22 +01:00
|
|
|
|
2011-01-17 17:03:27 +01:00
|
|
|
check_class_changed(rq, p, prev_class, oldprio);
|
2011-06-06 20:07:38 +02:00
|
|
|
out_unlock:
|
2011-04-05 17:23:51 +02:00
|
|
|
__task_rq_unlock(rq);
|
2006-06-27 02:54:51 -07:00
|
|
|
}
|
|
|
|
#endif
|
2006-07-03 00:25:41 -07:00
|
|
|
void set_user_nice(struct task_struct *p, long nice)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2007-07-09 18:51:59 +02:00
|
|
|
int old_prio, delta, on_rq;
|
2005-04-16 15:20:36 -07:00
|
|
|
unsigned long flags;
|
2006-07-03 00:25:42 -07:00
|
|
|
struct rq *rq;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
|
|
|
|
return;
|
|
|
|
/*
|
|
|
|
* We have to be careful, if called from sys_setpriority(),
|
|
|
|
* the task might be in the middle of scheduling on another CPU.
|
|
|
|
*/
|
|
|
|
rq = task_rq_lock(p, &flags);
|
|
|
|
/*
|
|
|
|
* The RT priorities are set via sched_setscheduler(), but we still
|
|
|
|
* allow the 'normal' nice value to be set - but as expected
|
|
|
|
* it wont have any effect on scheduling until the task is
|
2007-07-09 18:51:59 +02:00
|
|
|
* SCHED_FIFO/SCHED_RR:
|
2005-04-16 15:20:36 -07:00
|
|
|
*/
|
2007-07-09 18:51:59 +02:00
|
|
|
if (task_has_rt_policy(p)) {
|
2005-04-16 15:20:36 -07:00
|
|
|
p->static_prio = NICE_TO_PRIO(nice);
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
2011-04-05 17:23:44 +02:00
|
|
|
on_rq = p->on_rq;
|
2008-06-27 13:41:14 +02:00
|
|
|
if (on_rq)
|
2007-08-09 11:16:49 +02:00
|
|
|
dequeue_task(rq, p, 0);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
p->static_prio = NICE_TO_PRIO(nice);
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the users
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose that (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice=19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it find ones whose contributions to the
weighted load are less than this amount it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice=19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_BALANCE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be a valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it is no longer to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when it's nice
value and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups with in the same domain. This patch fixes some
of the listed down scenarios by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. but with current
smpnice extra normal priority task keeps jumping from one cpu to another
cpu having the normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the
cpu with high priority task in max_load calculations but including that in
total and avg_load calcuations.. leading to max_load < avg_load and load
balance between cpus running normal priority tasks(2 Vs 1) will always show
imbalanace as one normal priority and the extra normal priority task will
keep moving from one cpu to another cpu having normal priority task..
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it need
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 02:54:34 -07:00
|
|
|
set_load_weight(p);
|
2006-06-27 02:54:51 -07:00
|
|
|
old_prio = p->prio;
|
|
|
|
p->prio = effective_prio(p);
|
|
|
|
delta = p->prio - old_prio;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
if (on_rq) {
|
2010-03-24 16:38:48 +01:00
|
|
|
enqueue_task(rq, p, 0);
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
2007-05-08 20:27:06 -07:00
|
|
|
* If the task increased its priority or is running and
|
|
|
|
* lowered its priority, then reschedule its CPU:
|
2005-04-16 15:20:36 -07:00
|
|
|
*/
|
2007-05-08 20:27:06 -07:00
|
|
|
if (delta < 0 || (delta > 0 && task_running(rq, p)))
|
2005-04-16 15:20:36 -07:00
|
|
|
resched_task(rq->curr);
|
|
|
|
}
|
|
|
|
out_unlock:
|
2011-04-05 17:23:51 +02:00
|
|
|
task_rq_unlock(rq, p, &flags);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(set_user_nice);
|
|
|
|
|
2005-05-01 08:59:00 -07:00
|
|
|
/*
|
|
|
|
* can_nice - check if a task can reduce its nice value
|
|
|
|
* @p: task
|
|
|
|
* @nice: nice value
|
|
|
|
*/
|
2006-07-03 00:25:41 -07:00
|
|
|
int can_nice(const struct task_struct *p, const int nice)
|
2005-05-01 08:59:00 -07:00
|
|
|
{
|
2005-08-18 11:24:19 -07:00
|
|
|
/* convert nice value [19,-20] to rlimit style value [1,40] */
|
|
|
|
int nice_rlim = 20 - nice;
|
2006-07-03 00:25:40 -07:00
|
|
|
|
2010-03-05 13:42:54 -08:00
|
|
|
return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
|
2005-05-01 08:59:00 -07:00
|
|
|
capable(CAP_SYS_NICE));
|
|
|
|
}
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
#ifdef __ARCH_WANT_SYS_NICE
|
|
|
|
|
|
|
|
/*
|
|
|
|
* sys_nice - change the priority of the current process.
|
|
|
|
* @increment: priority increment
|
|
|
|
*
|
|
|
|
* sys_setpriority is a more generic, but much slower function that
|
|
|
|
* does similar things.
|
|
|
|
*/
|
2009-01-14 14:14:08 +01:00
|
|
|
SYSCALL_DEFINE1(nice, int, increment)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2006-07-03 00:25:40 -07:00
|
|
|
long nice, retval;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Setpriority might change our priority at the same moment.
|
|
|
|
* We don't have to worry. Conceptually one call occurs first
|
|
|
|
* and we have a single winner.
|
|
|
|
*/
|
2005-05-01 08:59:00 -07:00
|
|
|
if (increment < -40)
|
|
|
|
increment = -40;
|
2005-04-16 15:20:36 -07:00
|
|
|
if (increment > 40)
|
|
|
|
increment = 40;
|
|
|
|
|
2009-02-16 18:54:21 +08:00
|
|
|
nice = TASK_NICE(current) + increment;
|
2005-04-16 15:20:36 -07:00
|
|
|
if (nice < -20)
|
|
|
|
nice = -20;
|
|
|
|
if (nice > 19)
|
|
|
|
nice = 19;
|
|
|
|
|
2005-05-01 08:59:00 -07:00
|
|
|
if (increment < 0 && !can_nice(current, nice))
|
|
|
|
return -EPERM;
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
retval = security_task_setnice(current, nice);
|
|
|
|
if (retval)
|
|
|
|
return retval;
|
|
|
|
|
|
|
|
set_user_nice(current, nice);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/**
|
|
|
|
* task_prio - return the priority value of a given task.
|
|
|
|
* @p: the task in question.
|
|
|
|
*
|
|
|
|
* This is the priority value as seen by users in /proc.
|
|
|
|
* RT tasks are offset by -200. Normal tasks are centered
|
|
|
|
* around 0, value goes from -16 to +15.
|
|
|
|
*/
|
2006-07-03 00:25:41 -07:00
|
|
|
int task_prio(const struct task_struct *p)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
|
|
|
return p->prio - MAX_RT_PRIO;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
 * task_nice - return the nice value of a given task.
 * @p: the task in question.
 *
 * Function form of the TASK_NICE() macro.
 */
int task_nice(const struct task_struct *p)
{
	int nice = TASK_NICE(p);

	return nice;
}
|
2008-03-05 16:56:37 -05:00
|
|
|
EXPORT_SYMBOL(task_nice);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
/**
|
|
|
|
* idle_cpu - is a given cpu idle currently?
|
|
|
|
* @cpu: the processor in question.
|
|
|
|
*/
|
|
|
|
int idle_cpu(int cpu)
|
|
|
|
{
|
2011-09-15 15:32:06 +02:00
|
|
|
struct rq *rq = cpu_rq(cpu);
|
|
|
|
|
|
|
|
if (rq->curr != rq->idle)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (rq->nr_running)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
if (!llist_empty(&rq->wake_list))
|
|
|
|
return 0;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
return 1;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* idle_task - return the idle task for a given cpu.
|
|
|
|
* @cpu: the processor in question.
|
|
|
|
*/
|
2006-07-03 00:25:41 -07:00
|
|
|
struct task_struct *idle_task(int cpu)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
|
|
|
return cpu_rq(cpu)->idle;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* find_process_by_pid - find a process with a matching PID value.
|
|
|
|
* @pid: the pid in question.
|
|
|
|
*/
|
2007-10-15 17:00:13 +02:00
|
|
|
static struct task_struct *find_process_by_pid(pid_t pid)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2007-10-18 23:40:16 -07:00
|
|
|
return pid ? find_task_by_vpid(pid) : current;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Actually do priority change: must hold rq lock. */
|
2007-07-09 18:51:59 +02:00
|
|
|
static void
|
|
|
|
__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
|
|
|
p->policy = policy;
|
|
|
|
p->rt_priority = prio;
|
2006-06-27 02:54:51 -07:00
|
|
|
p->normal_prio = normal_prio(p);
|
|
|
|
/* we are holding p->pi_lock already */
|
|
|
|
p->prio = rt_mutex_getprio(p);
|
2009-11-10 20:12:01 +01:00
|
|
|
if (rt_prio(p->prio))
|
|
|
|
p->sched_class = &rt_sched_class;
|
|
|
|
else
|
|
|
|
p->sched_class = &fair_sched_class;
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the users
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose that (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice=19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it finds ones whose contributions to the
weighted load are less than this amount it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice=19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_SCALE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it is no longer to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when its nice
value changes, and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups within the same domain. This patch fixes some
of the listed down scenarios by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. but with current
smpnice extra normal priority task keeps jumping from one cpu to another
cpu having the normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the
cpu with high priority task in max_load calculations but including that in
total and avg_load calculations.. leading to max_load < avg_load and load
balance between cpus running normal priority tasks(2 Vs 1) will always show
imbalance as one normal priority and the extra normal priority task will
keep moving from one cpu to another cpu having normal priority task..
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it needs
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 02:54:34 -07:00
|
|
|
set_load_weight(p);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2008-11-14 10:39:19 +11:00
|
|
|
/*
|
|
|
|
* check the target process has a UID that matches the current process's
|
|
|
|
*/
|
|
|
|
static bool check_same_owner(struct task_struct *p)
|
|
|
|
{
|
|
|
|
const struct cred *cred = current_cred(), *pcred;
|
|
|
|
bool match;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
pcred = __task_cred(p);
|
2012-02-02 18:54:02 -08:00
|
|
|
match = (uid_eq(cred->euid, pcred->euid) ||
|
|
|
|
uid_eq(cred->euid, pcred->uid));
|
2008-11-14 10:39:19 +11:00
|
|
|
rcu_read_unlock();
|
|
|
|
return match;
|
|
|
|
}
|
|
|
|
|
2008-06-23 13:55:38 +10:00
|
|
|
static int __sched_setscheduler(struct task_struct *p, int policy,
|
2010-10-20 16:01:12 -07:00
|
|
|
const struct sched_param *param, bool user)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2007-10-15 17:00:08 +02:00
|
|
|
int retval, oldprio, oldpolicy = -1, on_rq, running;
|
2005-04-16 15:20:36 -07:00
|
|
|
unsigned long flags;
|
2010-02-17 09:05:48 +01:00
|
|
|
const struct sched_class *prev_class;
|
2006-07-03 00:25:42 -07:00
|
|
|
struct rq *rq;
|
2009-06-15 17:17:47 +02:00
|
|
|
int reset_on_fork;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2006-06-27 02:54:44 -07:00
|
|
|
/* may grab non-irq protected spin_locks */
|
|
|
|
BUG_ON(in_interrupt());
|
2005-04-16 15:20:36 -07:00
|
|
|
recheck:
|
|
|
|
/* double check policy once rq lock held */
|
2009-06-15 17:17:47 +02:00
|
|
|
if (policy < 0) {
|
|
|
|
reset_on_fork = p->sched_reset_on_fork;
|
2005-04-16 15:20:36 -07:00
|
|
|
policy = oldpolicy = p->policy;
|
2009-06-15 17:17:47 +02:00
|
|
|
} else {
|
|
|
|
reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
|
|
|
|
policy &= ~SCHED_RESET_ON_FORK;
|
|
|
|
|
|
|
|
if (policy != SCHED_FIFO && policy != SCHED_RR &&
|
|
|
|
policy != SCHED_NORMAL && policy != SCHED_BATCH &&
|
|
|
|
policy != SCHED_IDLE)
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
|
|
|
* Valid priorities for SCHED_FIFO and SCHED_RR are
|
2007-07-09 18:51:59 +02:00
|
|
|
* 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
|
|
|
|
* SCHED_BATCH and SCHED_IDLE is 0.
|
2005-04-16 15:20:36 -07:00
|
|
|
*/
|
|
|
|
if (param->sched_priority < 0 ||
|
2005-09-10 00:26:11 -07:00
|
|
|
(p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
|
2005-07-25 16:28:39 -04:00
|
|
|
(!p->mm && param->sched_priority > MAX_RT_PRIO-1))
|
2005-04-16 15:20:36 -07:00
|
|
|
return -EINVAL;
|
2007-07-09 18:51:59 +02:00
|
|
|
if (rt_policy(policy) != (param->sched_priority != 0))
|
2005-04-16 15:20:36 -07:00
|
|
|
return -EINVAL;
|
|
|
|
|
[PATCH] Changing RT priority without CAP_SYS_NICE
Presently, a process without the capability CAP_SYS_NICE can not change
its own policy, which is OK.
But it can also not decrease its RT priority (if scheduled with policy
SCHED_RR or SCHED_FIFO), which is what this patch changes.
The rationale is the same as for the nice value: a process should be
able to require less priority for itself. Increasing the priority is
still not allowed.
This is for example useful if you give a multithreaded user process a RT
priority, and the process would like to organize its internal threads
using priorities also. Then you can give the process the highest
priority needed N, and the process starts its threads with lower
priorities: N-1, N-2...
The POSIX norm says that the permissions are implementation specific, so
I think we can do that.
In a sense, it makes the permissions consistent whatever the policy is:
with this patch, process scheduled by SCHED_FIFO, SCHED_RR and
SCHED_OTHER can all decrease their priority.
From: Ingo Molnar <mingo@elte.hu>
cleaned up and merged to -mm.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-25 14:57:32 -07:00
|
|
|
/*
|
|
|
|
* Allow unprivileged RT tasks to decrease priority:
|
|
|
|
*/
|
2008-06-23 13:55:38 +10:00
|
|
|
if (user && !capable(CAP_SYS_NICE)) {
|
2007-07-09 18:51:59 +02:00
|
|
|
if (rt_policy(policy)) {
|
2010-06-11 01:09:44 +02:00
|
|
|
unsigned long rlim_rtprio =
|
|
|
|
task_rlimit(p, RLIMIT_RTPRIO);
|
2006-09-29 02:00:50 -07:00
|
|
|
|
|
|
|
/* can't set/change the rt policy */
|
|
|
|
if (policy != p->policy && !rlim_rtprio)
|
|
|
|
return -EPERM;
|
|
|
|
|
|
|
|
/* can't increase priority */
|
|
|
|
if (param->sched_priority > p->rt_priority &&
|
|
|
|
param->sched_priority > rlim_rtprio)
|
|
|
|
return -EPERM;
|
|
|
|
}
|
2011-02-17 15:37:07 -08:00
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
/*
|
2011-02-17 15:37:07 -08:00
|
|
|
* Treat SCHED_IDLE as nice 20. Only allow a switch to
|
|
|
|
* SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
|
2007-07-09 18:51:59 +02:00
|
|
|
*/
|
2011-02-17 15:37:07 -08:00
|
|
|
if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
|
|
|
|
if (!can_nice(p, TASK_NICE(p)))
|
|
|
|
return -EPERM;
|
|
|
|
}
|
2006-09-29 02:00:48 -07:00
|
|
|
|
[PATCH] Changing RT priority without CAP_SYS_NICE
Presently, a process without the capability CAP_SYS_NICE can not change
its own policy, which is OK.
But it can also not decrease its RT priority (if scheduled with policy
SCHED_RR or SCHED_FIFO), which is what this patch changes.
The rationale is the same as for the nice value: a process should be
able to require less priority for itself. Increasing the priority is
still not allowed.
This is for example useful if you give a multithreaded user process a RT
priority, and the process would like to organize its internal threads
using priorities also. Then you can give the process the highest
priority needed N, and the process starts its threads with lower
priorities: N-1, N-2...
The POSIX norm says that the permissions are implementation specific, so
I think we can do that.
In a sense, it makes the permissions consistent whatever the policy is:
with this patch, process scheduled by SCHED_FIFO, SCHED_RR and
SCHED_OTHER can all decrease their priority.
From: Ingo Molnar <mingo@elte.hu>
cleaned up and merged to -mm.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-25 14:57:32 -07:00
|
|
|
/* can't change other user's priorities */
|
2008-11-14 10:39:19 +11:00
|
|
|
if (!check_same_owner(p))
|
[PATCH] Changing RT priority without CAP_SYS_NICE
Presently, a process without the capability CAP_SYS_NICE can not change
its own policy, which is OK.
But it can also not decrease its RT priority (if scheduled with policy
SCHED_RR or SCHED_FIFO), which is what this patch changes.
The rationale is the same as for the nice value: a process should be
able to require less priority for itself. Increasing the priority is
still not allowed.
This is for example useful if you give a multithreaded user process a RT
priority, and the process would like to organize its internal threads
using priorities also. Then you can give the process the highest
priority needed N, and the process starts its threads with lower
priorities: N-1, N-2...
The POSIX norm says that the permissions are implementation specific, so
I think we can do that.
In a sense, it makes the permissions consistent whatever the policy is:
with this patch, process scheduled by SCHED_FIFO, SCHED_RR and
SCHED_OTHER can all decrease their priority.
From: Ingo Molnar <mingo@elte.hu>
cleaned up and merged to -mm.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-25 14:57:32 -07:00
|
|
|
return -EPERM;
|
2009-06-15 17:17:47 +02:00
|
|
|
|
|
|
|
/* Normal users shall not reset the sched_reset_on_fork flag */
|
|
|
|
if (p->sched_reset_on_fork && !reset_on_fork)
|
|
|
|
return -EPERM;
|
[PATCH] Changing RT priority without CAP_SYS_NICE
Presently, a process without the capability CAP_SYS_NICE can not change
its own policy, which is OK.
But it can also not decrease its RT priority (if scheduled with policy
SCHED_RR or SCHED_FIFO), which is what this patch changes.
The rationale is the same as for the nice value: a process should be
able to require less priority for itself. Increasing the priority is
still not allowed.
This is for example useful if you give a multithreaded user process a RT
priority, and the process would like to organize its internal threads
using priorities also. Then you can give the process the highest
priority needed N, and the process starts its threads with lower
priorities: N-1, N-2...
The POSIX norm says that the permissions are implementation specific, so
I think we can do that.
In a sense, it makes the permissions consistent whatever the policy is:
with this patch, process scheduled by SCHED_FIFO, SCHED_RR and
SCHED_OTHER can all decrease their priority.
From: Ingo Molnar <mingo@elte.hu>
cleaned up and merged to -mm.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-25 14:57:32 -07:00
|
|
|
}
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2008-08-03 09:33:03 -07:00
|
|
|
if (user) {
|
2010-10-15 04:21:18 +09:00
|
|
|
retval = security_task_setscheduler(p);
|
2008-08-03 09:33:03 -07:00
|
|
|
if (retval)
|
|
|
|
return retval;
|
|
|
|
}
|
|
|
|
|
2006-06-27 02:54:51 -07:00
|
|
|
/*
|
|
|
|
* make sure no PI-waiters arrive (or leave) while we are
|
|
|
|
* changing the priority of the task:
|
2011-04-05 17:23:51 +02:00
|
|
|
*
|
2011-03-30 22:57:33 -03:00
|
|
|
* To be able to change p->policy safely, the appropriate
|
2005-04-16 15:20:36 -07:00
|
|
|
* runqueue lock must be held.
|
|
|
|
*/
|
2011-04-05 17:23:51 +02:00
|
|
|
rq = task_rq_lock(p, &flags);
|
2010-06-08 11:40:42 +02:00
|
|
|
|
2010-09-22 13:53:15 +02:00
|
|
|
/*
|
|
|
|
* Changing the policy of the stop threads is a very bad idea
|
|
|
|
*/
|
|
|
|
if (p == rq->stop) {
|
2011-04-05 17:23:51 +02:00
|
|
|
task_rq_unlock(rq, p, &flags);
|
2010-09-22 13:53:15 +02:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
sched: Leave sched_setscheduler() earlier if possible, do not disturb SCHED_FIFO tasks
sched_setscheduler() (in sched.c) is called in order to change the
scheduling policy and/or the real-time priority of a task. Thus,
if we find out that neither of those are actually being modified, it
is possible to return earlier and save the overhead of a full
deactivate+activate cycle of the task in question.
Beside that, if we have more than one SCHED_FIFO task with the same
priority on the same rq (which means they share the same priority queue)
having one of them changing its position in the priority queue because of
a sched_setscheduler (as it happens by means of the deactivate+activate)
that does not actually change the priority violates POSIX which states,
for SCHED_FIFO:
"If a thread whose policy or priority has been modified by
pthread_setschedprio() is a running thread or is runnable, the effect on
its position in the thread list depends on the direction of the
modification, as follows: a. <...> b. If the priority is unchanged, the
thread does not change position in the thread list. c. <...>"
http://pubs.opengroup.org/onlinepubs/009695399/functions/xsh_chap02_08.html
(ed: And the POSIX specification here does, briefly and somewhat unexpectedly,
match what common sense tells us as well. )
Signed-off-by: Dario Faggioli <raistlin@linux.it>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1300971618.3960.82.camel@Palantir>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2011-03-24 14:00:18 +01:00
|
|
|
/*
|
|
|
|
* If not changing anything there's no need to proceed further:
|
|
|
|
*/
|
|
|
|
if (unlikely(policy == p->policy && (!rt_policy(policy) ||
|
|
|
|
param->sched_priority == p->rt_priority))) {
|
2012-07-07 16:49:02 +09:00
|
|
|
task_rq_unlock(rq, p, &flags);
|
sched: Leave sched_setscheduler() earlier if possible, do not disturb SCHED_FIFO tasks
sched_setscheduler() (in sched.c) is called in order to change the
scheduling policy and/or the real-time priority of a task. Thus,
if we find out that neither of those are actually being modified, it
is possible to return earlier and save the overhead of a full
deactivate+activate cycle of the task in question.
Beside that, if we have more than one SCHED_FIFO task with the same
priority on the same rq (which means they share the same priority queue)
having one of them changing its position in the priority queue because of
a sched_setscheduler (as it happens by means of the deactivate+activate)
that does not actually change the priority violates POSIX which states,
for SCHED_FIFO:
"If a thread whose policy or priority has been modified by
pthread_setschedprio() is a running thread or is runnable, the effect on
its position in the thread list depends on the direction of the
modification, as follows: a. <...> b. If the priority is unchanged, the
thread does not change position in the thread list. c. <...>"
http://pubs.opengroup.org/onlinepubs/009695399/functions/xsh_chap02_08.html
(ed: And the POSIX specification here does, briefly and somewhat unexpectedly,
match what common sense tells us as well. )
Signed-off-by: Dario Faggioli <raistlin@linux.it>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1300971618.3960.82.camel@Palantir>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2011-03-24 14:00:18 +01:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2010-06-08 11:40:42 +02:00
|
|
|
#ifdef CONFIG_RT_GROUP_SCHED
|
|
|
|
if (user) {
|
|
|
|
/*
|
|
|
|
* Do not allow realtime tasks into groups that have no runtime
|
|
|
|
* assigned.
|
|
|
|
*/
|
|
|
|
if (rt_bandwidth_enabled() && rt_policy(policy) &&
|
2011-01-13 04:54:50 +01:00
|
|
|
task_group(p)->rt_bandwidth.rt_runtime == 0 &&
|
|
|
|
!task_group_is_autogroup(task_group(p))) {
|
2011-04-05 17:23:51 +02:00
|
|
|
task_rq_unlock(rq, p, &flags);
|
2010-06-08 11:40:42 +02:00
|
|
|
return -EPERM;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/* recheck policy now with rq lock held */
|
|
|
|
if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
|
|
|
|
policy = oldpolicy = -1;
|
2011-04-05 17:23:51 +02:00
|
|
|
task_rq_unlock(rq, p, &flags);
|
2005-04-16 15:20:36 -07:00
|
|
|
goto recheck;
|
|
|
|
}
|
2011-04-05 17:23:44 +02:00
|
|
|
on_rq = p->on_rq;
|
2007-12-18 15:21:13 +01:00
|
|
|
running = task_current(rq, p);
|
2008-03-10 11:01:20 -07:00
|
|
|
if (on_rq)
|
2012-01-25 11:50:51 +01:00
|
|
|
dequeue_task(rq, p, 0);
|
2008-03-10 11:01:20 -07:00
|
|
|
if (running)
|
|
|
|
p->sched_class->put_prev_task(rq, p);
|
2007-10-15 17:00:08 +02:00
|
|
|
|
2009-06-15 17:17:47 +02:00
|
|
|
p->sched_reset_on_fork = reset_on_fork;
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
oldprio = p->prio;
|
2010-02-17 09:05:48 +01:00
|
|
|
prev_class = p->sched_class;
|
2007-07-09 18:51:59 +02:00
|
|
|
__setscheduler(rq, p, policy, param->sched_priority);
|
2007-10-15 17:00:08 +02:00
|
|
|
|
2008-03-10 11:01:20 -07:00
|
|
|
if (running)
|
|
|
|
p->sched_class->set_curr_task(rq);
|
2011-01-17 17:03:27 +01:00
|
|
|
if (on_rq)
|
2012-01-25 11:50:51 +01:00
|
|
|
enqueue_task(rq, p, 0);
|
2008-01-25 21:08:22 +01:00
|
|
|
|
2011-01-17 17:03:27 +01:00
|
|
|
check_class_changed(rq, p, prev_class, oldprio);
|
2011-04-05 17:23:51 +02:00
|
|
|
task_rq_unlock(rq, p, &flags);
|
2006-06-27 02:54:51 -07:00
|
|
|
|
2006-06-27 02:55:02 -07:00
|
|
|
rt_mutex_adjust_pi(p);
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
return 0;
|
|
|
|
}
|
2008-06-23 13:55:38 +10:00
|
|
|
|
|
|
|
/**
|
|
|
|
* sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
|
|
|
|
* @p: the task in question.
|
|
|
|
* @policy: new policy.
|
|
|
|
* @param: structure containing the new RT priority.
|
|
|
|
*
|
|
|
|
* NOTE that the task may be already dead.
|
|
|
|
*/
|
|
|
|
int sched_setscheduler(struct task_struct *p, int policy,
|
2010-10-20 16:01:12 -07:00
|
|
|
const struct sched_param *param)
|
2008-06-23 13:55:38 +10:00
|
|
|
{
|
|
|
|
return __sched_setscheduler(p, policy, param, true);
|
|
|
|
}
|
2005-04-16 15:20:36 -07:00
|
|
|
EXPORT_SYMBOL_GPL(sched_setscheduler);
|
|
|
|
|
2008-06-23 13:55:38 +10:00
|
|
|
/**
|
|
|
|
* sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
|
|
|
|
* @p: the task in question.
|
|
|
|
* @policy: new policy.
|
|
|
|
* @param: structure containing the new RT priority.
|
|
|
|
*
|
|
|
|
* Just like sched_setscheduler, only don't bother checking if the
|
|
|
|
* current context has permission. For example, this is needed in
|
|
|
|
* stop_machine(): we create temporary high priority worker threads,
|
|
|
|
* but our caller might not have that capability.
|
|
|
|
*/
|
|
|
|
int sched_setscheduler_nocheck(struct task_struct *p, int policy,
|
2010-10-20 16:01:12 -07:00
|
|
|
const struct sched_param *param)
|
2008-06-23 13:55:38 +10:00
|
|
|
{
|
|
|
|
return __sched_setscheduler(p, policy, param, false);
|
|
|
|
}
|
|
|
|
|
2005-09-10 00:26:11 -07:00
|
|
|
static int
|
|
|
|
do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
|
|
|
struct sched_param lparam;
|
|
|
|
struct task_struct *p;
|
2006-07-03 00:25:41 -07:00
|
|
|
int retval;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
if (!param || pid < 0)
|
|
|
|
return -EINVAL;
|
|
|
|
if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
|
|
|
|
return -EFAULT;
|
2006-09-29 02:00:48 -07:00
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
retval = -ESRCH;
|
2005-04-16 15:20:36 -07:00
|
|
|
p = find_process_by_pid(pid);
|
2006-09-29 02:00:48 -07:00
|
|
|
if (p != NULL)
|
|
|
|
retval = sched_setscheduler(p, policy, &lparam);
|
|
|
|
rcu_read_unlock();
|
2006-07-03 00:25:41 -07:00
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
return retval;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* sys_sched_setscheduler - set/change the scheduler policy and RT priority
|
|
|
|
* @pid: the pid in question.
|
|
|
|
* @policy: new policy.
|
|
|
|
* @param: structure containing the new RT priority.
|
|
|
|
*/
|
2009-01-14 14:14:08 +01:00
|
|
|
SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
|
|
|
|
struct sched_param __user *, param)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2006-01-18 17:43:03 -08:00
|
|
|
/* negative values for policy are not valid */
|
|
|
|
if (policy < 0)
|
|
|
|
return -EINVAL;
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
return do_sched_setscheduler(pid, policy, param);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* sys_sched_setparam - set/change the RT priority of a thread
|
|
|
|
* @pid: the pid in question.
|
|
|
|
* @param: structure containing the new RT priority.
|
|
|
|
*/
|
2009-01-14 14:14:08 +01:00
|
|
|
SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
|
|
|
return do_sched_setscheduler(pid, -1, param);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* sys_sched_getscheduler - get the policy (scheduling class) of a thread
|
|
|
|
* @pid: the pid in question.
|
|
|
|
*/
|
2009-01-14 14:14:08 +01:00
|
|
|
SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2006-07-03 00:25:41 -07:00
|
|
|
struct task_struct *p;
|
2007-10-15 17:00:14 +02:00
|
|
|
int retval;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
if (pid < 0)
|
2007-10-15 17:00:14 +02:00
|
|
|
return -EINVAL;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
retval = -ESRCH;
|
2009-12-09 10:14:58 +00:00
|
|
|
rcu_read_lock();
|
2005-04-16 15:20:36 -07:00
|
|
|
p = find_process_by_pid(pid);
|
|
|
|
if (p) {
|
|
|
|
retval = security_task_getscheduler(p);
|
|
|
|
if (!retval)
|
2009-06-15 17:17:47 +02:00
|
|
|
retval = p->policy
|
|
|
|
| (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
2009-12-09 10:14:58 +00:00
|
|
|
rcu_read_unlock();
|
2005-04-16 15:20:36 -07:00
|
|
|
return retval;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
2009-06-15 17:17:47 +02:00
|
|
|
* sys_sched_getparam - get the RT priority of a thread
|
2005-04-16 15:20:36 -07:00
|
|
|
* @pid: the pid in question.
|
|
|
|
* @param: structure containing the RT priority.
|
|
|
|
*/
|
2009-01-14 14:14:08 +01:00
|
|
|
SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
|
|
|
struct sched_param lp;
|
2006-07-03 00:25:41 -07:00
|
|
|
struct task_struct *p;
|
2007-10-15 17:00:14 +02:00
|
|
|
int retval;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
if (!param || pid < 0)
|
2007-10-15 17:00:14 +02:00
|
|
|
return -EINVAL;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2009-12-09 10:14:58 +00:00
|
|
|
rcu_read_lock();
|
2005-04-16 15:20:36 -07:00
|
|
|
p = find_process_by_pid(pid);
|
|
|
|
retval = -ESRCH;
|
|
|
|
if (!p)
|
|
|
|
goto out_unlock;
|
|
|
|
|
|
|
|
retval = security_task_getscheduler(p);
|
|
|
|
if (retval)
|
|
|
|
goto out_unlock;
|
|
|
|
|
|
|
|
lp.sched_priority = p->rt_priority;
|
2009-12-09 10:14:58 +00:00
|
|
|
rcu_read_unlock();
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* This one might sleep, we cannot do it with a spinlock held ...
|
|
|
|
*/
|
|
|
|
retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
|
|
|
|
|
|
|
|
return retval;
|
|
|
|
|
|
|
|
out_unlock:
|
2009-12-09 10:14:58 +00:00
|
|
|
rcu_read_unlock();
|
2005-04-16 15:20:36 -07:00
|
|
|
return retval;
|
|
|
|
}
|
|
|
|
|
2008-11-25 02:35:14 +10:30
|
|
|
/*
 * Restrict the task identified by @pid to the CPUs in @in_mask that its
 * cpuset also allows.
 *
 * Returns the result of set_cpus_allowed_ptr() on the normal path, or
 * -ESRCH when no task matches @pid, -ENOMEM when a scratch cpumask
 * cannot be allocated, -EPERM when the caller neither passes
 * check_same_owner() nor has CAP_SYS_NICE in the task's user namespace,
 * or the non-zero value reported by security_task_setscheduler().
 */
long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
{
	cpumask_var_t cpuset_mask, wanted;
	struct task_struct *task;
	int ret;

	get_online_cpus();
	rcu_read_lock();

	task = find_process_by_pid(pid);
	if (!task) {
		rcu_read_unlock();
		put_online_cpus();
		return -ESRCH;
	}

	/* Pin the task so it cannot go away once we leave the RCU section. */
	get_task_struct(task);
	rcu_read_unlock();

	ret = -ENOMEM;
	if (!alloc_cpumask_var(&cpuset_mask, GFP_KERNEL))
		goto out_put_task;
	if (!alloc_cpumask_var(&wanted, GFP_KERNEL))
		goto out_free_cpuset_mask;

	/* ns_capable() is only consulted when the ownership check fails. */
	if (!(check_same_owner(task) ||
	      ns_capable(task_user_ns(task), CAP_SYS_NICE))) {
		ret = -EPERM;
		goto out_free_wanted;
	}

	ret = security_task_setscheduler(task);
	if (ret)
		goto out_free_wanted;

	cpuset_cpus_allowed(task, cpuset_mask);
	cpumask_and(wanted, in_mask, cpuset_mask);

	for (;;) {
		ret = set_cpus_allowed_ptr(task, wanted);
		if (ret)
			break;

		/* Re-read the cpuset: it may have changed in the meantime. */
		cpuset_cpus_allowed(task, cpuset_mask);
		if (cpumask_subset(wanted, cpuset_mask))
			break;

		/*
		 * We must have raced with a concurrent cpuset update.
		 * Fall back to the cpuset's own cpus_allowed and retry.
		 */
		cpumask_copy(wanted, cpuset_mask);
	}

out_free_wanted:
	free_cpumask_var(wanted);
out_free_cpuset_mask:
	free_cpumask_var(cpuset_mask);
out_put_task:
	put_task_struct(task);
	put_online_cpus();
	return ret;
}
|
|
|
|
|
|
|
|
/*
 * Copy a CPU mask of @len bytes from user space into @new_mask.
 *
 * When the user buffer is shorter than cpumask_size(), @new_mask is
 * cleared first so the bytes not covered by the copy read as zero. When
 * it is longer, only cpumask_size() bytes are copied.
 *
 * Returns 0 on success or -EFAULT when the copy from user space fails.
 */
static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
			     struct cpumask *new_mask)
{
	const size_t mask_bytes = cpumask_size();

	if (len < mask_bytes)
		cpumask_clear(new_mask);
	else if (len > mask_bytes)
		len = mask_bytes;

	if (copy_from_user(new_mask, user_mask_ptr, len))
		return -EFAULT;

	return 0;
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* sys_sched_setaffinity - set the cpu affinity of a process
|
|
|
|
* @pid: pid of the process
|
|
|
|
* @len: length in bytes of the bitmask pointed to by user_mask_ptr
|
|
|
|
* @user_mask_ptr: user-space pointer to the new cpu mask
|
|
|
|
*/
|
2009-01-14 14:14:08 +01:00
|
|
|
SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
|
|
|
|
unsigned long __user *, user_mask_ptr)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2008-11-25 02:35:11 +10:30
|
|
|
cpumask_var_t new_mask;
|
2005-04-16 15:20:36 -07:00
|
|
|
int retval;
|
|
|
|
|
2008-11-25 02:35:11 +10:30
|
|
|
if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
|
|
|
|
return -ENOMEM;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2008-11-25 02:35:11 +10:30
|
|
|
retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
|
|
|
|
if (retval == 0)
|
|
|
|
retval = sched_setaffinity(pid, new_mask);
|
|
|
|
free_cpumask_var(new_mask);
|
|
|
|
return retval;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2008-11-25 02:35:14 +10:30
|
|
|
/*
 * Store in @mask the CPUs that the task identified by @pid is allowed to
 * run on and that are currently online.
 *
 * Returns 0 on success, -ESRCH when no task matches @pid, or the
 * non-zero value reported by security_task_getscheduler(). @mask is only
 * written on success.
 */
long sched_getaffinity(pid_t pid, struct cpumask *mask)
{
	struct task_struct *task;
	unsigned long irqflags;
	int ret;

	get_online_cpus();
	rcu_read_lock();

	task = find_process_by_pid(pid);
	if (!task) {
		ret = -ESRCH;
	} else {
		ret = security_task_getscheduler(task);
		if (!ret) {
			/* cpus_allowed is read with the task's pi_lock held. */
			raw_spin_lock_irqsave(&task->pi_lock, irqflags);
			cpumask_and(mask, &task->cpus_allowed, cpu_online_mask);
			raw_spin_unlock_irqrestore(&task->pi_lock, irqflags);
		}
	}

	rcu_read_unlock();
	put_online_cpus();

	return ret;
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* sys_sched_getaffinity - get the cpu affinity of a process
|
|
|
|
* @pid: pid of the process
|
|
|
|
* @len: length in bytes of the bitmask pointed to by user_mask_ptr
|
|
|
|
* @user_mask_ptr: user-space pointer to hold the current cpu mask
|
|
|
|
*/
|
2009-01-14 14:14:08 +01:00
|
|
|
SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
|
|
|
|
unsigned long __user *, user_mask_ptr)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
|
|
|
int ret;
|
2008-11-25 02:35:11 +10:30
|
|
|
cpumask_var_t mask;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2010-04-06 17:02:19 +10:00
|
|
|
if ((len * BITS_PER_BYTE) < nr_cpu_ids)
|
sched: sched_getaffinity(): Allow less than NR_CPUS length
[ Note, this commit changes the syscall ABI for > 1024 CPUs systems. ]
Recently, some distro decided to use NR_CPUS=4096 for mysterious reasons.
Unfortunately, glibc sched interface has the following definition:
# define __CPU_SETSIZE 1024
# define __NCPUBITS (8 * sizeof (__cpu_mask))
typedef unsigned long int __cpu_mask;
typedef struct
{
__cpu_mask __bits[__CPU_SETSIZE / __NCPUBITS];
} cpu_set_t;
It means, if NR_CPUS is bigger than 1024, cpu_set_t makes an
ABI issue ...
More recently, Sharyathi Nagesh reported that the following test program produces a
mysterious syscall failure:
-----------------------------------------------------------------------
#define _GNU_SOURCE
#include<stdio.h>
#include<errno.h>
#include<sched.h>
int main()
{
cpu_set_t set;
if (sched_getaffinity(0, sizeof(cpu_set_t), &set) < 0)
printf("\n Call is failing with:%d", errno);
}
-----------------------------------------------------------------------
Because the kernel assumes len argument of sched_getaffinity() is bigger
than NR_CPUS. But now it is not correct.
Now we are faced with the following annoying dilemma, due to
the limitations of the glibc interface built in years ago:
(1) if we change glibc's __CPU_SETSIZE definition, we lost
binary compatibility of _all_ application.
(2) if we don't change it, we also lost binary compatibility of
Sharyathi's use case.
Then, I would propse to change the rule of the len argument of
sched_getaffinity().
Old:
len should be bigger than NR_CPUS
New:
len should be bigger than maximum possible cpu id
This creates the following behavior:
(A) In the real 4096 cpus machine, the above test program still
return -EINVAL.
(B) NR_CPUS=4096 but the machine have less than 1024 cpus (almost
all machines in the world), the above can run successfully.
Fortunatelly, BIG SGI machine is mainly used for HPC use case. It means
they can rebuild their programs.
IOW we hope they are not annoyed by this issue ...
Reported-by: Sharyathi Nagesh <sharyath@in.ibm.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: Ulrich Drepper <drepper@redhat.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jack Steiner <steiner@sgi.com>
Cc: Russ Anderson <rja@sgi.com>
Cc: Mike Travis <travis@sgi.com>
LKML-Reference: <20100312161316.9520.A69D9226@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-03-12 16:15:36 +09:00
|
|
|
return -EINVAL;
|
|
|
|
if (len & (sizeof(unsigned long)-1))
|
2005-04-16 15:20:36 -07:00
|
|
|
return -EINVAL;
|
|
|
|
|
2008-11-25 02:35:11 +10:30
|
|
|
if (!alloc_cpumask_var(&mask, GFP_KERNEL))
|
|
|
|
return -ENOMEM;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2008-11-25 02:35:11 +10:30
|
|
|
ret = sched_getaffinity(pid, mask);
|
|
|
|
if (ret == 0) {
|
2010-03-17 09:36:58 +09:00
|
|
|
size_t retlen = min_t(size_t, len, cpumask_size());
|
sched: sched_getaffinity(): Allow less than NR_CPUS length
[ Note, this commit changes the syscall ABI for > 1024 CPUs systems. ]
Recently, some distro decided to use NR_CPUS=4096 for mysterious reasons.
Unfortunately, glibc sched interface has the following definition:
# define __CPU_SETSIZE 1024
# define __NCPUBITS (8 * sizeof (__cpu_mask))
typedef unsigned long int __cpu_mask;
typedef struct
{
__cpu_mask __bits[__CPU_SETSIZE / __NCPUBITS];
} cpu_set_t;
It mean, if NR_CPUS is bigger than 1024, cpu_set_t makes an
ABI issue ...
More recently, Sharyathi Nagesh reported following test program makes
misterious syscall failure:
-----------------------------------------------------------------------
#define _GNU_SOURCE
#include<stdio.h>
#include<errno.h>
#include<sched.h>
int main()
{
cpu_set_t set;
if (sched_getaffinity(0, sizeof(cpu_set_t), &set) < 0)
printf("\n Call is failing with:%d", errno);
}
-----------------------------------------------------------------------
Because the kernel assumes len argument of sched_getaffinity() is bigger
than NR_CPUS. But now it is not correct.
Now we are faced with the following annoying dilemma, due to
the limitations of the glibc interface built in years ago:
(1) if we change glibc's __CPU_SETSIZE definition, we lost
binary compatibility of _all_ application.
(2) if we don't change it, we also lost binary compatibility of
Sharyathi's use case.
Then, I would propse to change the rule of the len argument of
sched_getaffinity().
Old:
len should be bigger than NR_CPUS
New:
len should be bigger than maximum possible cpu id
This creates the following behavior:
(A) In the real 4096 cpus machine, the above test program still
return -EINVAL.
(B) NR_CPUS=4096 but the machine have less than 1024 cpus (almost
all machines in the world), the above can run successfully.
Fortunatelly, BIG SGI machine is mainly used for HPC use case. It means
they can rebuild their programs.
IOW we hope they are not annoyed by this issue ...
Reported-by: Sharyathi Nagesh <sharyath@in.ibm.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: Ulrich Drepper <drepper@redhat.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jack Steiner <steiner@sgi.com>
Cc: Russ Anderson <rja@sgi.com>
Cc: Mike Travis <travis@sgi.com>
LKML-Reference: <20100312161316.9520.A69D9226@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-03-12 16:15:36 +09:00
|
|
|
|
|
|
|
if (copy_to_user(user_mask_ptr, mask, retlen))
|
2008-11-25 02:35:11 +10:30
|
|
|
ret = -EFAULT;
|
|
|
|
else
|
sched: sched_getaffinity(): Allow less than NR_CPUS length
[ Note, this commit changes the syscall ABI for > 1024 CPUs systems. ]
Recently, some distro decided to use NR_CPUS=4096 for mysterious reasons.
Unfortunately, glibc sched interface has the following definition:
# define __CPU_SETSIZE 1024
# define __NCPUBITS (8 * sizeof (__cpu_mask))
typedef unsigned long int __cpu_mask;
typedef struct
{
__cpu_mask __bits[__CPU_SETSIZE / __NCPUBITS];
} cpu_set_t;
It mean, if NR_CPUS is bigger than 1024, cpu_set_t makes an
ABI issue ...
More recently, Sharyathi Nagesh reported following test program makes
misterious syscall failure:
-----------------------------------------------------------------------
#define _GNU_SOURCE
#include<stdio.h>
#include<errno.h>
#include<sched.h>
int main()
{
cpu_set_t set;
if (sched_getaffinity(0, sizeof(cpu_set_t), &set) < 0)
printf("\n Call is failing with:%d", errno);
}
-----------------------------------------------------------------------
Because the kernel assumes len argument of sched_getaffinity() is bigger
than NR_CPUS. But now it is not correct.
Now we are faced with the following annoying dilemma, due to
the limitations of the glibc interface built in years ago:
(1) if we change glibc's __CPU_SETSIZE definition, we lost
binary compatibility of _all_ application.
(2) if we don't change it, we also lost binary compatibility of
Sharyathi's use case.
Then, I would propse to change the rule of the len argument of
sched_getaffinity().
Old:
len should be bigger than NR_CPUS
New:
len should be bigger than maximum possible cpu id
This creates the following behavior:
(A) In the real 4096 cpus machine, the above test program still
return -EINVAL.
(B) NR_CPUS=4096 but the machine have less than 1024 cpus (almost
all machines in the world), the above can run successfully.
Fortunatelly, BIG SGI machine is mainly used for HPC use case. It means
they can rebuild their programs.
IOW we hope they are not annoyed by this issue ...
Reported-by: Sharyathi Nagesh <sharyath@in.ibm.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: Ulrich Drepper <drepper@redhat.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jack Steiner <steiner@sgi.com>
Cc: Russ Anderson <rja@sgi.com>
Cc: Mike Travis <travis@sgi.com>
LKML-Reference: <20100312161316.9520.A69D9226@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-03-12 16:15:36 +09:00
|
|
|
ret = retlen;
|
2008-11-25 02:35:11 +10:30
|
|
|
}
|
|
|
|
free_cpumask_var(mask);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2008-11-25 02:35:11 +10:30
|
|
|
return ret;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* sys_sched_yield - yield the current processor to other threads.
|
|
|
|
*
|
2007-07-09 18:51:59 +02:00
|
|
|
* This function yields the current CPU to other tasks. If there are no
|
|
|
|
* other threads running on this CPU then this function will return.
|
2005-04-16 15:20:36 -07:00
|
|
|
*/
|
2009-01-14 14:14:08 +01:00
|
|
|
SYSCALL_DEFINE0(sched_yield)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2006-07-03 00:25:42 -07:00
|
|
|
struct rq *rq = this_rq_lock();
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2007-10-15 17:00:12 +02:00
|
|
|
schedstat_inc(rq, yld_count);
|
2007-10-15 17:00:08 +02:00
|
|
|
current->sched_class->yield_task(rq);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Since we are going to call schedule() anyway, there's
|
|
|
|
* no need to preempt or enable interrupts:
|
|
|
|
*/
|
|
|
|
__release(rq->lock);
|
2006-07-03 00:24:54 -07:00
|
|
|
spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
|
2009-12-03 20:55:53 +01:00
|
|
|
do_raw_spin_unlock(&rq->lock);
|
2011-03-21 13:32:17 +01:00
|
|
|
sched_preempt_enable_no_resched();
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
schedule();
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2009-07-10 14:57:57 +02:00
|
|
|
static inline int should_resched(void)
|
|
|
|
{
|
|
|
|
return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
|
|
|
|
}
|
|
|
|
|
2006-06-30 01:56:00 -07:00
|
|
|
static void __cond_resched(void)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2009-07-16 15:44:29 +02:00
|
|
|
add_preempt_count(PREEMPT_ACTIVE);
|
2011-06-22 19:47:00 +02:00
|
|
|
__schedule();
|
2009-07-16 15:44:29 +02:00
|
|
|
sub_preempt_count(PREEMPT_ACTIVE);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2008-01-25 21:08:28 +01:00
|
|
|
/*
 * Reschedule if should_resched() says one is due.
 *
 * Returns 1 if the scheduler was invoked, 0 otherwise.
 */
int __sched _cond_resched(void)
{
	if (!should_resched())
		return 0;

	__cond_resched();
	return 1;
}
EXPORT_SYMBOL(_cond_resched);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
/*
|
2009-07-16 15:44:29 +02:00
|
|
|
* __cond_resched_lock() - if a reschedule is pending, drop the given lock,
|
2005-04-16 15:20:36 -07:00
|
|
|
* call schedule, and on return reacquire the lock.
|
|
|
|
*
|
2007-12-05 15:46:09 +01:00
|
|
|
* This works OK both with and without CONFIG_PREEMPT. We do strange low-level
|
2005-04-16 15:20:36 -07:00
|
|
|
* operations here to prevent schedule() from being called twice (once via
|
|
|
|
* spin_unlock(), once by hand).
|
|
|
|
*/
|
2009-07-16 15:44:29 +02:00
|
|
|
int __cond_resched_lock(spinlock_t *lock)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2009-07-10 14:57:57 +02:00
|
|
|
int resched = should_resched();
|
2005-06-13 15:52:32 -07:00
|
|
|
int ret = 0;
|
|
|
|
|
2009-07-20 19:16:29 +02:00
|
|
|
lockdep_assert_held(lock);
|
|
|
|
|
2008-01-30 13:31:20 +01:00
|
|
|
if (spin_needbreak(lock) || resched) {
|
2005-04-16 15:20:36 -07:00
|
|
|
spin_unlock(lock);
|
2009-07-10 14:57:57 +02:00
|
|
|
if (resched)
|
2008-01-30 13:31:20 +01:00
|
|
|
__cond_resched();
|
|
|
|
else
|
|
|
|
cpu_relax();
|
2005-06-13 15:52:32 -07:00
|
|
|
ret = 1;
|
2005-04-16 15:20:36 -07:00
|
|
|
spin_lock(lock);
|
|
|
|
}
|
2005-06-13 15:52:32 -07:00
|
|
|
return ret;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
2009-07-16 15:44:29 +02:00
|
|
|
EXPORT_SYMBOL(__cond_resched_lock);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2009-07-16 15:44:29 +02:00
|
|
|
/*
 * Conditional reschedule for callers running with bottom halves disabled;
 * calling it outside of in_softirq() context is a BUG.
 *
 * Bottom halves are re-enabled around the reschedule and disabled again
 * afterwards. Returns 1 if the scheduler was invoked, 0 otherwise.
 */
int __sched __cond_resched_softirq(void)
{
	BUG_ON(!in_softirq());

	if (!should_resched())
		return 0;

	local_bh_enable();
	__cond_resched();
	local_bh_disable();

	return 1;
}
EXPORT_SYMBOL(__cond_resched_softirq);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
/**
|
|
|
|
* yield - yield the current processor to other threads.
|
|
|
|
*
|
2012-03-06 18:54:26 +01:00
|
|
|
* Do not ever use this function, there's a 99% chance you're doing it wrong.
|
|
|
|
*
|
|
|
|
* The scheduler is at all times free to pick the calling task as the most
|
|
|
|
* eligible task to run, if removing the yield() call from your code breaks
|
|
|
|
* it, its already broken.
|
|
|
|
*
|
|
|
|
* Typical broken usage is:
|
|
|
|
*
|
|
|
|
* while (!event)
|
|
|
|
* yield();
|
|
|
|
*
|
|
|
|
* where one assumes that yield() will let 'the other' process run that will
|
|
|
|
* make event true. If the current task is a SCHED_FIFO task that will never
|
|
|
|
* happen. Never use yield() as a progress guarantee!!
|
|
|
|
*
|
|
|
|
* If you want to use yield() to wait for something, use wait_event().
|
|
|
|
* If you want to use yield() to be 'nice' for others, use cond_resched().
|
|
|
|
* If you still want to use yield(), do not!
|
2005-04-16 15:20:36 -07:00
|
|
|
*/
|
|
|
|
void __sched yield(void)
|
|
|
|
{
|
|
|
|
set_current_state(TASK_RUNNING);
|
|
|
|
sys_sched_yield();
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(yield);
|
|
|
|
|
2011-02-01 09:50:51 -05:00
|
|
|
/**
|
|
|
|
* yield_to - yield the current processor to another thread in
|
|
|
|
* your thread group, or accelerate that thread toward the
|
|
|
|
* processor it's on.
|
2011-03-18 09:34:53 -07:00
|
|
|
* @p: target task
|
|
|
|
* @preempt: whether task preemption is allowed or not
|
2011-02-01 09:50:51 -05:00
|
|
|
*
|
|
|
|
* It's the caller's job to ensure that the target task struct
|
|
|
|
* can't go away on us before we can do any checks.
|
|
|
|
*
|
|
|
|
* Returns true if we indeed boosted the target task.
|
|
|
|
*/
|
|
|
|
bool __sched yield_to(struct task_struct *p, bool preempt)
|
|
|
|
{
|
|
|
|
struct task_struct *curr = current;
|
|
|
|
struct rq *rq, *p_rq;
|
|
|
|
unsigned long flags;
|
|
|
|
bool yielded = 0;
|
|
|
|
|
|
|
|
local_irq_save(flags);
|
|
|
|
rq = this_rq();
|
|
|
|
|
|
|
|
again:
|
|
|
|
p_rq = task_rq(p);
|
|
|
|
double_rq_lock(rq, p_rq);
|
|
|
|
while (task_rq(p) != p_rq) {
|
|
|
|
double_rq_unlock(rq, p_rq);
|
|
|
|
goto again;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!curr->sched_class->yield_to_task)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
if (curr->sched_class != p->sched_class)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
if (task_running(p_rq, p) || p->state)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
yielded = curr->sched_class->yield_to_task(rq, p, preempt);
|
2011-03-01 16:28:21 -08:00
|
|
|
if (yielded) {
|
2011-02-01 09:50:51 -05:00
|
|
|
schedstat_inc(rq, yld_count);
|
2011-03-01 16:28:21 -08:00
|
|
|
/*
|
|
|
|
* Make p's CPU reschedule; pick_next_entity takes care of
|
|
|
|
* fairness.
|
|
|
|
*/
|
|
|
|
if (preempt && rq != p_rq)
|
|
|
|
resched_task(p_rq->curr);
|
2011-11-22 15:21:26 +01:00
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* We might have set it in task_yield_fair(), but are
|
|
|
|
* not going to schedule(), so don't want to skip
|
|
|
|
* the next update.
|
|
|
|
*/
|
|
|
|
rq->skip_clock_update = 0;
|
2011-03-01 16:28:21 -08:00
|
|
|
}
|
2011-02-01 09:50:51 -05:00
|
|
|
|
|
|
|
out:
|
|
|
|
double_rq_unlock(rq, p_rq);
|
|
|
|
local_irq_restore(flags);
|
|
|
|
|
|
|
|
if (yielded)
|
|
|
|
schedule();
|
|
|
|
|
|
|
|
return yielded;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(yield_to);
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
2007-12-05 15:46:09 +01:00
|
|
|
* This task is about to go to sleep on IO. Increment rq->nr_iowait so
|
2005-04-16 15:20:36 -07:00
|
|
|
* that process accounting knows that this is a task in IO wait state.
|
|
|
|
*/
|
|
|
|
void __sched io_schedule(void)
|
|
|
|
{
|
2009-06-29 14:44:57 +09:00
|
|
|
struct rq *rq = raw_rq();
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2006-07-14 00:24:37 -07:00
|
|
|
delayacct_blkio_start();
|
2005-04-16 15:20:36 -07:00
|
|
|
atomic_inc(&rq->nr_iowait);
|
2011-03-08 13:19:51 +01:00
|
|
|
blk_flush_plug(current);
|
2009-07-20 11:26:58 -07:00
|
|
|
current->in_iowait = 1;
|
2005-04-16 15:20:36 -07:00
|
|
|
schedule();
|
2009-07-20 11:26:58 -07:00
|
|
|
current->in_iowait = 0;
|
2005-04-16 15:20:36 -07:00
|
|
|
atomic_dec(&rq->nr_iowait);
|
2006-07-14 00:24:37 -07:00
|
|
|
delayacct_blkio_end();
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(io_schedule);
|
|
|
|
|
|
|
|
/*
 * Like io_schedule(), but sleeps through schedule_timeout(@timeout) and
 * returns whatever schedule_timeout() returned.
 */
long __sched io_schedule_timeout(long timeout)
{
	struct rq *iowait_rq = raw_rq();
	long remaining;

	delayacct_blkio_start();
	atomic_inc(&iowait_rq->nr_iowait);
	/* Flush any plugged block IO of this task before sleeping. */
	blk_flush_plug(current);

	current->in_iowait = 1;
	remaining = schedule_timeout(timeout);
	current->in_iowait = 0;

	atomic_dec(&iowait_rq->nr_iowait);
	delayacct_blkio_end();

	return remaining;
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* sys_sched_get_priority_max - return maximum RT priority.
|
|
|
|
* @policy: scheduling class.
|
|
|
|
*
|
|
|
|
* this syscall returns the maximum rt_priority that can be used
|
|
|
|
* by a given scheduling class.
|
|
|
|
*/
|
2009-01-14 14:14:08 +01:00
|
|
|
SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
|
|
|
int ret = -EINVAL;
|
|
|
|
|
|
|
|
switch (policy) {
|
|
|
|
case SCHED_FIFO:
|
|
|
|
case SCHED_RR:
|
|
|
|
ret = MAX_USER_RT_PRIO-1;
|
|
|
|
break;
|
|
|
|
case SCHED_NORMAL:
|
2006-01-14 13:20:41 -08:00
|
|
|
case SCHED_BATCH:
|
2007-07-09 18:51:59 +02:00
|
|
|
case SCHED_IDLE:
|
2005-04-16 15:20:36 -07:00
|
|
|
ret = 0;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* sys_sched_get_priority_min - return minimum RT priority.
|
|
|
|
* @policy: scheduling class.
|
|
|
|
*
|
|
|
|
* this syscall returns the minimum rt_priority that can be used
|
|
|
|
* by a given scheduling class.
|
|
|
|
*/
|
2009-01-14 14:14:08 +01:00
|
|
|
SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
|
|
|
int ret = -EINVAL;
|
|
|
|
|
|
|
|
switch (policy) {
|
|
|
|
case SCHED_FIFO:
|
|
|
|
case SCHED_RR:
|
|
|
|
ret = 1;
|
|
|
|
break;
|
|
|
|
case SCHED_NORMAL:
|
2006-01-14 13:20:41 -08:00
|
|
|
case SCHED_BATCH:
|
2007-07-09 18:51:59 +02:00
|
|
|
case SCHED_IDLE:
|
2005-04-16 15:20:36 -07:00
|
|
|
ret = 0;
|
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* sys_sched_rr_get_interval - return the default timeslice of a process.
|
|
|
|
* @pid: pid of the process.
|
|
|
|
* @interval: userspace pointer to the timeslice value.
|
|
|
|
*
|
|
|
|
* this syscall writes the default timeslice value of a given process
|
|
|
|
* into the user-space timespec buffer. A value of '0' means infinity.
|
|
|
|
*/
|
2009-01-14 14:14:10 +01:00
|
|
|
SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
|
2009-01-14 14:14:09 +01:00
|
|
|
struct timespec __user *, interval)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2006-07-03 00:25:41 -07:00
|
|
|
struct task_struct *p;
|
2007-10-15 17:00:13 +02:00
|
|
|
unsigned int time_slice;
|
2009-12-09 09:32:03 +01:00
|
|
|
unsigned long flags;
|
|
|
|
struct rq *rq;
|
2007-10-15 17:00:14 +02:00
|
|
|
int retval;
|
2005-04-16 15:20:36 -07:00
|
|
|
struct timespec t;
|
|
|
|
|
|
|
|
if (pid < 0)
|
2007-10-15 17:00:14 +02:00
|
|
|
return -EINVAL;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
retval = -ESRCH;
|
2009-12-09 10:15:11 +00:00
|
|
|
rcu_read_lock();
|
2005-04-16 15:20:36 -07:00
|
|
|
p = find_process_by_pid(pid);
|
|
|
|
if (!p)
|
|
|
|
goto out_unlock;
|
|
|
|
|
|
|
|
retval = security_task_getscheduler(p);
|
|
|
|
if (retval)
|
|
|
|
goto out_unlock;
|
|
|
|
|
2009-12-09 09:32:03 +01:00
|
|
|
rq = task_rq_lock(p, &flags);
|
|
|
|
time_slice = p->sched_class->get_rr_interval(rq, p);
|
2011-04-05 17:23:51 +02:00
|
|
|
task_rq_unlock(rq, p, &flags);
|
2007-10-15 17:00:13 +02:00
|
|
|
|
2009-12-09 10:15:11 +00:00
|
|
|
rcu_read_unlock();
|
2007-10-15 17:00:13 +02:00
|
|
|
jiffies_to_timespec(time_slice, &t);
|
2005-04-16 15:20:36 -07:00
|
|
|
retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
|
|
|
|
return retval;
|
2007-10-15 17:00:14 +02:00
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
out_unlock:
|
2009-12-09 10:15:11 +00:00
|
|
|
rcu_read_unlock();
|
2005-04-16 15:20:36 -07:00
|
|
|
return retval;
|
|
|
|
}
|
|
|
|
|
2008-05-12 21:20:41 +02:00
|
|
|
static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
|
2006-07-03 00:25:41 -07:00
|
|
|
|
2008-01-25 21:08:02 +01:00
|
|
|
void sched_show_task(struct task_struct *p)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
|
|
|
unsigned long free = 0;
|
2006-07-03 00:25:41 -07:00
|
|
|
unsigned state;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
state = p->state ? __ffs(p->state) + 1 : 0;
|
2010-11-19 18:08:51 -08:00
|
|
|
printk(KERN_INFO "%-15.15s %c", p->comm,
|
2006-07-10 04:43:52 -07:00
|
|
|
state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
|
2007-07-11 21:21:47 +02:00
|
|
|
#if BITS_PER_LONG == 32
|
2005-04-16 15:20:36 -07:00
|
|
|
if (state == TASK_RUNNING)
|
2009-12-20 14:23:57 +01:00
|
|
|
printk(KERN_CONT " running ");
|
2005-04-16 15:20:36 -07:00
|
|
|
else
|
2009-12-20 14:23:57 +01:00
|
|
|
printk(KERN_CONT " %08lx ", thread_saved_pc(p));
|
2005-04-16 15:20:36 -07:00
|
|
|
#else
|
|
|
|
if (state == TASK_RUNNING)
|
2009-12-20 14:23:57 +01:00
|
|
|
printk(KERN_CONT " running task ");
|
2005-04-16 15:20:36 -07:00
|
|
|
else
|
2009-12-20 14:23:57 +01:00
|
|
|
printk(KERN_CONT " %016lx ", thread_saved_pc(p));
|
2005-04-16 15:20:36 -07:00
|
|
|
#endif
|
|
|
|
#ifdef CONFIG_DEBUG_STACK_USAGE
|
2008-04-22 16:38:23 -05:00
|
|
|
free = stack_not_used(p);
|
2005-04-16 15:20:36 -07:00
|
|
|
#endif
|
2009-12-20 14:23:57 +01:00
|
|
|
printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
|
2011-12-15 08:49:18 -08:00
|
|
|
task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)),
|
2009-05-04 01:38:05 -07:00
|
|
|
(unsigned long)task_thread_info(p)->flags);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2008-01-25 21:08:34 +01:00
|
|
|
show_stack(p, NULL);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2006-12-06 20:35:59 -08:00
|
|
|
void show_state_filter(unsigned long state_filter)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2006-07-03 00:25:41 -07:00
|
|
|
struct task_struct *g, *p;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2007-07-11 21:21:47 +02:00
|
|
|
#if BITS_PER_LONG == 32
|
2009-12-20 14:23:57 +01:00
|
|
|
printk(KERN_INFO
|
|
|
|
" task PC stack pid father\n");
|
2005-04-16 15:20:36 -07:00
|
|
|
#else
|
2009-12-20 14:23:57 +01:00
|
|
|
printk(KERN_INFO
|
|
|
|
" task PC stack pid father\n");
|
2005-04-16 15:20:36 -07:00
|
|
|
#endif
|
2011-07-17 20:47:54 +02:00
|
|
|
rcu_read_lock();
|
2005-04-16 15:20:36 -07:00
|
|
|
do_each_thread(g, p) {
|
|
|
|
/*
|
|
|
|
* reset the NMI-timeout, listing all files on a slow
|
2011-03-30 22:57:33 -03:00
|
|
|
* console might take a lot of time:
|
2005-04-16 15:20:36 -07:00
|
|
|
*/
|
|
|
|
touch_nmi_watchdog();
|
2007-04-25 20:50:03 -07:00
|
|
|
if (!state_filter || (p->state & state_filter))
|
2008-01-25 21:08:02 +01:00
|
|
|
sched_show_task(p);
|
2005-04-16 15:20:36 -07:00
|
|
|
} while_each_thread(g, p);
|
|
|
|
|
2007-05-08 00:28:05 -07:00
|
|
|
touch_all_softlockup_watchdogs();
|
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
#ifdef CONFIG_SCHED_DEBUG
|
|
|
|
sysrq_sched_debug_show();
|
|
|
|
#endif
|
2011-07-17 20:47:54 +02:00
|
|
|
rcu_read_unlock();
|
2006-12-06 20:35:59 -08:00
|
|
|
/*
|
|
|
|
* Only show locks if all tasks are dumped:
|
|
|
|
*/
|
2009-11-25 15:23:41 +02:00
|
|
|
if (!state_filter)
|
2006-12-06 20:35:59 -08:00
|
|
|
debug_show_all_locks();
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2007-07-09 18:51:58 +02:00
|
|
|
/*
 * init_idle_bootup_task - put @idle into the idle scheduling class.
 */
void __cpuinit init_idle_bootup_task(struct task_struct *idle)
{
	idle->sched_class = &idle_sched_class;
}
|
|
|
|
|
2005-06-28 16:40:42 +02:00
|
|
|
/**
|
|
|
|
* init_idle - set up an idle thread for a given CPU
|
|
|
|
* @idle: task in question
|
|
|
|
* @cpu: cpu the idle task belongs to
|
|
|
|
*
|
|
|
|
* NOTE: this function does not set the idle thread's NEED_RESCHED
|
|
|
|
* flag, to make booting more robust.
|
|
|
|
*/
|
2006-10-03 01:14:04 -07:00
|
|
|
void __cpuinit init_idle(struct task_struct *idle, int cpu)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2006-07-03 00:25:42 -07:00
|
|
|
struct rq *rq = cpu_rq(cpu);
|
2005-04-16 15:20:36 -07:00
|
|
|
unsigned long flags;
|
|
|
|
|
2009-11-17 14:28:38 +01:00
|
|
|
raw_spin_lock_irqsave(&rq->lock, flags);
|
2008-11-12 20:05:50 +01:00
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
__sched_fork(idle);
|
2009-12-16 18:04:35 +01:00
|
|
|
idle->state = TASK_RUNNING;
|
2007-07-09 18:51:59 +02:00
|
|
|
idle->se.exec_start = sched_clock();
|
|
|
|
|
2011-05-19 15:08:58 +09:00
|
|
|
do_set_cpus_allowed(idle, cpumask_of(cpu));
|
2010-09-16 17:50:31 +02:00
|
|
|
/*
|
|
|
|
* We're having a chicken and egg problem, even though we are
|
|
|
|
* holding rq->lock, the cpu isn't yet set to this cpu so the
|
|
|
|
* lockdep check in task_group() will fail.
|
|
|
|
*
|
|
|
|
* Similar case to sched_fork(). / Alternatively we could
|
|
|
|
* use task_rq_lock() here and obtain the other rq->lock.
|
|
|
|
*
|
|
|
|
* Silence PROVE_RCU
|
|
|
|
*/
|
|
|
|
rcu_read_lock();
|
2007-07-09 18:51:59 +02:00
|
|
|
__set_task_cpu(idle, cpu);
|
2010-09-16 17:50:31 +02:00
|
|
|
rcu_read_unlock();
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
rq->curr = rq->idle = idle;
|
2011-04-05 17:23:40 +02:00
|
|
|
#if defined(CONFIG_SMP)
|
|
|
|
idle->on_cpu = 1;
|
2005-06-25 14:57:23 -07:00
|
|
|
#endif
|
2009-11-17 14:28:38 +01:00
|
|
|
raw_spin_unlock_irqrestore(&rq->lock, flags);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
/* Set the preempt count _outside_ the spinlocks! */
|
2005-11-13 16:06:55 -08:00
|
|
|
task_thread_info(idle)->preempt_count = 0;
|
2008-08-04 08:54:26 +02:00
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
/*
|
|
|
|
* The idle tasks have their own, simple scheduling class:
|
|
|
|
*/
|
|
|
|
idle->sched_class = &idle_sched_class;
|
ftrace: Fix memory leak with function graph and cpu hotplug
When the fuction graph tracer starts, it needs to make a special
stack for each task to save the real return values of the tasks.
All running tasks have this stack created, as well as any new
tasks.
On CPU hot plug, the new idle task will allocate a stack as well
when init_idle() is called. The problem is that cpu hotplug does
not create a new idle_task. Instead it uses the idle task that
existed when the cpu went down.
ftrace_graph_init_task() will add a new ret_stack to the task
that is given to it. Because a clone will make the task
have a stack of its parent it does not check if the task's
ret_stack is already NULL or not. When the CPU hotplug code
starts a CPU up again, it will allocate a new stack even
though one already existed for it.
The solution is to treat the idle_task specially. In fact, the
function_graph code already does, just not at init_idle().
Instead of using the ftrace_graph_init_task() for the idle task,
which that function expects the task to be a clone, have a
separate ftrace_graph_init_idle_task(). Also, we will create a
per_cpu ret_stack that is used by the idle task. When we call
ftrace_graph_init_idle_task() it will check if the idle task's
ret_stack is NULL, if it is, then it will assign it the per_cpu
ret_stack.
Reported-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Suggested-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stable Tree <stable@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2011-02-10 21:26:13 -05:00
|
|
|
ftrace_graph_init_idle_task(idle, cpu);
|
2011-10-26 23:14:16 +02:00
|
|
|
#if defined(CONFIG_SMP)
|
|
|
|
sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
|
|
|
|
#endif
|
2007-11-09 22:39:38 +01:00
|
|
|
}
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
#ifdef CONFIG_SMP
|
2011-05-19 15:08:58 +09:00
|
|
|
void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
|
|
|
|
{
|
|
|
|
if (p->sched_class && p->sched_class->set_cpus_allowed)
|
|
|
|
p->sched_class->set_cpus_allowed(p, new_mask);
|
2011-06-25 15:45:46 +02:00
|
|
|
|
|
|
|
cpumask_copy(&p->cpus_allowed, new_mask);
|
2012-04-23 12:11:21 +02:00
|
|
|
p->nr_cpus_allowed = cpumask_weight(new_mask);
|
2011-05-19 15:08:58 +09:00
|
|
|
}
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
|
|
|
* This is how migration works:
|
|
|
|
*
|
2010-05-06 18:49:21 +02:00
|
|
|
* 1) we invoke migration_cpu_stop() on the target CPU using
|
|
|
|
* stop_one_cpu().
|
|
|
|
* 2) stopper starts to run (implicitly forcing the migrated thread
|
|
|
|
* off the CPU)
|
|
|
|
* 3) it checks whether the migrated task is still in the wrong runqueue.
|
|
|
|
* 4) if it's in the wrong runqueue then the migration thread removes
|
2005-04-16 15:20:36 -07:00
|
|
|
* it and puts it into the right queue.
|
2010-05-06 18:49:21 +02:00
|
|
|
* 5) stopper completes and stop_one_cpu() returns and the migration
|
|
|
|
* is done.
|
2005-04-16 15:20:36 -07:00
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Change a given task's CPU affinity. Migrate the thread to a
|
|
|
|
* proper CPU and schedule it away if the CPU it's executing on
|
|
|
|
* is removed from the allowed bitmask.
|
|
|
|
*
|
|
|
|
* NOTE: the caller must have a valid reference to the task, the
|
2007-12-05 15:46:09 +01:00
|
|
|
* task must not exit() & deallocate itself prematurely. The
|
2005-04-16 15:20:36 -07:00
|
|
|
* call is not atomic; no spinlocks may be held.
|
|
|
|
*/
|
2008-11-25 02:35:14 +10:30
|
|
|
int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
|
|
|
unsigned long flags;
|
2006-07-03 00:25:42 -07:00
|
|
|
struct rq *rq;
|
2010-05-06 18:49:21 +02:00
|
|
|
unsigned int dest_cpu;
|
2006-07-03 00:25:40 -07:00
|
|
|
int ret = 0;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
rq = task_rq_lock(p, &flags);
|
2009-12-16 18:04:36 +01:00
|
|
|
|
2011-05-09 22:07:05 +08:00
|
|
|
if (cpumask_equal(&p->cpus_allowed, new_mask))
|
|
|
|
goto out;
|
|
|
|
|
2009-11-25 13:31:39 +01:00
|
|
|
if (!cpumask_intersects(new_mask, cpu_active_mask)) {
|
2005-04-16 15:20:36 -07:00
|
|
|
ret = -EINVAL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2011-05-09 22:07:05 +08:00
|
|
|
if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
|
2008-06-05 12:57:11 -07:00
|
|
|
ret = -EINVAL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2011-05-19 15:08:58 +09:00
|
|
|
do_set_cpus_allowed(p, new_mask);
|
2008-01-25 21:08:07 +01:00
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/* Can the task run on the task's current CPU? If so, we're done */
|
2008-11-25 02:35:14 +10:30
|
|
|
if (cpumask_test_cpu(task_cpu(p), new_mask))
|
2005-04-16 15:20:36 -07:00
|
|
|
goto out;
|
|
|
|
|
2010-05-06 18:49:21 +02:00
|
|
|
dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
|
2011-04-05 17:23:59 +02:00
|
|
|
if (p->on_rq) {
|
2010-05-06 18:49:21 +02:00
|
|
|
struct migration_arg arg = { p, dest_cpu };
|
2005-04-16 15:20:36 -07:00
|
|
|
/* Need help from migration thread: drop lock and wait. */
|
2011-04-05 17:23:51 +02:00
|
|
|
task_rq_unlock(rq, p, &flags);
|
2010-05-06 18:49:21 +02:00
|
|
|
stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
|
2005-04-16 15:20:36 -07:00
|
|
|
tlb_migrate_finish(p->mm);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
out:
|
2011-04-05 17:23:51 +02:00
|
|
|
task_rq_unlock(rq, p, &flags);
|
2006-07-03 00:25:40 -07:00
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
return ret;
|
|
|
|
}
|
2008-03-26 14:23:49 -07:00
|
|
|
EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
/*
|
2007-12-05 15:46:09 +01:00
|
|
|
* Move (not current) task off this cpu, onto dest cpu. We're doing
|
2005-04-16 15:20:36 -07:00
|
|
|
* this because either it can't run here any more (set_cpus_allowed()
|
|
|
|
* away from this CPU, or CPU going down), or because we're
|
|
|
|
* attempting to rebalance this task on exec (sched_exec).
|
|
|
|
*
|
|
|
|
* So we race with normal scheduler movements, but that's OK, as long
|
|
|
|
* as the task is no longer on this CPU.
|
2006-06-27 02:54:32 -07:00
|
|
|
*
|
|
|
|
* Returns non-zero if task was successfully migrated.
|
2005-04-16 15:20:36 -07:00
|
|
|
*/
|
2006-06-27 02:54:32 -07:00
|
|
|
static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2006-07-03 00:25:42 -07:00
|
|
|
struct rq *rq_dest, *rq_src;
|
2009-12-16 18:04:36 +01:00
|
|
|
int ret = 0;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2008-07-15 04:43:49 -07:00
|
|
|
if (unlikely(!cpu_active(dest_cpu)))
|
2006-06-27 02:54:32 -07:00
|
|
|
return ret;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
rq_src = cpu_rq(src_cpu);
|
|
|
|
rq_dest = cpu_rq(dest_cpu);
|
|
|
|
|
2011-04-05 17:23:51 +02:00
|
|
|
raw_spin_lock(&p->pi_lock);
|
2005-04-16 15:20:36 -07:00
|
|
|
double_rq_lock(rq_src, rq_dest);
|
|
|
|
/* Already moved. */
|
|
|
|
if (task_cpu(p) != src_cpu)
|
2008-07-10 11:25:03 -07:00
|
|
|
goto done;
|
2005-04-16 15:20:36 -07:00
|
|
|
/* Affinity changed (again). */
|
2011-06-16 12:23:22 +02:00
|
|
|
if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
|
2008-07-10 11:25:03 -07:00
|
|
|
goto fail;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2009-12-16 18:04:36 +01:00
|
|
|
/*
|
|
|
|
* If we're not on a rq, the next wake-up will ensure we're
|
|
|
|
* placed properly.
|
|
|
|
*/
|
2011-04-05 17:23:44 +02:00
|
|
|
if (p->on_rq) {
|
2012-01-25 11:50:51 +01:00
|
|
|
dequeue_task(rq_src, p, 0);
|
2009-12-16 18:04:36 +01:00
|
|
|
set_task_cpu(p, dest_cpu);
|
2012-01-25 11:50:51 +01:00
|
|
|
enqueue_task(rq_dest, p, 0);
|
2008-09-20 23:38:02 +02:00
|
|
|
check_preempt_curr(rq_dest, p, 0);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
2008-07-10 11:25:03 -07:00
|
|
|
done:
|
2006-06-27 02:54:32 -07:00
|
|
|
ret = 1;
|
2008-07-10 11:25:03 -07:00
|
|
|
fail:
|
2005-04-16 15:20:36 -07:00
|
|
|
double_rq_unlock(rq_src, rq_dest);
|
2011-04-05 17:23:51 +02:00
|
|
|
raw_spin_unlock(&p->pi_lock);
|
2006-06-27 02:54:32 -07:00
|
|
|
return ret;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2010-05-06 18:49:21 +02:00
|
|
|
* migration_cpu_stop - this will be executed by a highprio stopper thread
|
|
|
|
* and performs thread migration by bumping thread off CPU then
|
|
|
|
* 'pushing' onto another runqueue.
|
2005-04-16 15:20:36 -07:00
|
|
|
*/
|
2010-05-06 18:49:21 +02:00
|
|
|
static int migration_cpu_stop(void *data)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2010-05-06 18:49:21 +02:00
|
|
|
struct migration_arg *arg = data;
|
2007-10-16 23:30:56 -07:00
|
|
|
|
2010-05-06 18:49:21 +02:00
|
|
|
/*
|
|
|
|
* The original target cpu might have gone down and we might
|
|
|
|
* be on another cpu but it doesn't matter.
|
|
|
|
*/
|
2007-10-16 23:30:56 -07:00
|
|
|
local_irq_disable();
|
2010-05-06 18:49:21 +02:00
|
|
|
__migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
|
2007-10-16 23:30:56 -07:00
|
|
|
local_irq_enable();
|
2005-04-16 15:20:36 -07:00
|
|
|
return 0;
|
2007-10-16 23:30:56 -07:00
|
|
|
}
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
#ifdef CONFIG_HOTPLUG_CPU
|
2010-11-13 19:32:29 +01:00
|
|
|
|
2006-12-10 02:20:11 -08:00
|
|
|
/*
|
2010-11-13 19:32:29 +01:00
|
|
|
* Ensures that the idle task is using init_mm right before its cpu goes
|
|
|
|
* offline.
|
2006-12-10 02:20:11 -08:00
|
|
|
*/
|
2010-11-13 19:32:29 +01:00
|
|
|
void idle_task_exit(void)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2010-11-13 19:32:29 +01:00
|
|
|
struct mm_struct *mm = current->active_mm;
|
2008-11-25 02:35:11 +10:30
|
|
|
|
2010-11-13 19:32:29 +01:00
|
|
|
BUG_ON(cpu_online(smp_processor_id()));
|
2008-11-25 02:35:11 +10:30
|
|
|
|
2010-11-13 19:32:29 +01:00
|
|
|
if (mm != &init_mm)
|
|
|
|
switch_mm(mm, &init_mm, current);
|
|
|
|
mmdrop(mm);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2012-08-20 11:26:57 +02:00
|
|
|
* Since this CPU is going 'away' for a while, fold any nr_active delta
|
|
|
|
* we might have. Assumes we're called after migrate_tasks() so that the
|
|
|
|
* nr_active count is stable.
|
|
|
|
*
|
|
|
|
* Also see the comment "Global load-average calculations".
|
2005-04-16 15:20:36 -07:00
|
|
|
*/
|
2012-08-20 11:26:57 +02:00
|
|
|
static void calc_load_migrate(struct rq *rq)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2012-08-20 11:26:57 +02:00
|
|
|
long delta = calc_load_fold_active(rq);
|
|
|
|
if (delta)
|
|
|
|
atomic_long_add(delta, &calc_load_tasks);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2006-07-03 00:25:40 -07:00
|
|
|
/*
|
2010-11-13 19:32:29 +01:00
|
|
|
* Migrate all tasks from the rq, sleeping tasks will be migrated by
|
|
|
|
* try_to_wake_up()->select_task_rq().
|
|
|
|
*
|
|
|
|
* Called with rq->lock held even though we're in stop_machine() and
|
|
|
|
* there's no concurrency possible, we hold the required locks anyway
|
|
|
|
* because of lock validation efforts.
|
2005-04-16 15:20:36 -07:00
|
|
|
*/
|
2010-11-13 19:32:29 +01:00
|
|
|
static void migrate_tasks(unsigned int dead_cpu)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2006-07-03 00:25:42 -07:00
|
|
|
struct rq *rq = cpu_rq(dead_cpu);
|
2010-11-13 19:32:29 +01:00
|
|
|
struct task_struct *next, *stop = rq->stop;
|
|
|
|
int dest_cpu;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
/*
|
2010-11-13 19:32:29 +01:00
|
|
|
* Fudge the rq selection such that the below task selection loop
|
|
|
|
* doesn't get stuck on the currently eligible stop task.
|
|
|
|
*
|
|
|
|
* We're currently inside stop_machine() and the rq is either stuck
|
|
|
|
* in the stop_machine_cpu_stop() loop, or we're executing this code,
|
|
|
|
* either way we should never end up calling schedule() until we're
|
|
|
|
* done here.
|
2005-04-16 15:20:36 -07:00
|
|
|
*/
|
2010-11-13 19:32:29 +01:00
|
|
|
rq->stop = NULL;
|
2006-07-03 00:25:40 -07:00
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
for ( ; ; ) {
|
2010-11-13 19:32:29 +01:00
|
|
|
/*
|
|
|
|
* There's this thread running, bail when that's the only
|
|
|
|
* remaining thread.
|
|
|
|
*/
|
|
|
|
if (rq->nr_running == 1)
|
2007-07-09 18:51:59 +02:00
|
|
|
break;
|
2010-11-13 19:32:29 +01:00
|
|
|
|
2009-03-02 13:55:26 +08:00
|
|
|
next = pick_next_task(rq);
|
2010-11-13 19:32:29 +01:00
|
|
|
BUG_ON(!next);
|
2008-06-29 00:16:56 +02:00
|
|
|
next->sched_class->put_prev_task(rq, next);
|
2007-07-26 13:40:43 +02:00
|
|
|
|
2010-11-13 19:32:29 +01:00
|
|
|
/* Find suitable destination for @next, with force if needed. */
|
|
|
|
dest_cpu = select_fallback_rq(dead_cpu, next);
|
|
|
|
raw_spin_unlock(&rq->lock);
|
|
|
|
|
|
|
|
__migrate_task(next, dead_cpu, dest_cpu);
|
|
|
|
|
|
|
|
raw_spin_lock(&rq->lock);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
2009-04-11 10:43:41 +02:00
|
|
|
|
2010-11-13 19:32:29 +01:00
|
|
|
rq->stop = stop;
|
2009-04-11 10:43:41 +02:00
|
|
|
}
|
2010-11-13 19:32:29 +01:00
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
#endif /* CONFIG_HOTPLUG_CPU */
|
|
|
|
|
2007-07-26 13:40:43 +02:00
|
|
|
#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
|
|
|
|
|
|
|
|
/*
 * The "sched_domain" directory entry.  Its ->child pointer is filled in
 * by register_sched_domain_sysctl() and cleared again on unregister.
 */
static struct ctl_table sd_ctl_dir[] = {
	{
		.mode		= 0555,
		.procname	= "sched_domain",
	},
	{ }	/* terminator */
};
|
|
|
|
|
|
|
|
static struct ctl_table sd_ctl_root[] = {
|
2007-08-09 11:16:46 +02:00
|
|
|
{
|
|
|
|
.procname = "kernel",
|
2007-08-23 15:18:02 +02:00
|
|
|
.mode = 0555,
|
2007-08-09 11:16:46 +02:00
|
|
|
.child = sd_ctl_dir,
|
|
|
|
},
|
2009-11-05 15:38:40 -08:00
|
|
|
{}
|
2007-07-26 13:40:43 +02:00
|
|
|
};
|
|
|
|
|
|
|
|
static struct ctl_table *sd_alloc_ctl_entry(int n)
|
|
|
|
{
|
|
|
|
struct ctl_table *entry =
|
2007-10-15 17:00:19 +02:00
|
|
|
kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
|
2007-07-26 13:40:43 +02:00
|
|
|
|
|
|
|
return entry;
|
|
|
|
}
|
|
|
|
|
2007-10-15 17:00:19 +02:00
|
|
|
static void sd_free_ctl_entry(struct ctl_table **tablep)
|
|
|
|
{
|
2007-10-17 16:55:11 +02:00
|
|
|
struct ctl_table *entry;
|
2007-10-15 17:00:19 +02:00
|
|
|
|
2007-10-17 16:55:11 +02:00
|
|
|
/*
|
|
|
|
* In the intermediate directories, both the child directory and
|
|
|
|
* procname are dynamically allocated and could fail but the mode
|
2007-12-05 15:46:09 +01:00
|
|
|
* will always be set. In the lowest directory the names are
|
2007-10-17 16:55:11 +02:00
|
|
|
* static strings and all have proc handlers.
|
|
|
|
*/
|
|
|
|
for (entry = *tablep; entry->mode; entry++) {
|
2007-10-15 17:00:19 +02:00
|
|
|
if (entry->child)
|
|
|
|
sd_free_ctl_entry(&entry->child);
|
2007-10-17 16:55:11 +02:00
|
|
|
if (entry->proc_handler == NULL)
|
|
|
|
kfree(entry->procname);
|
|
|
|
}
|
2007-10-15 17:00:19 +02:00
|
|
|
|
|
|
|
kfree(*tablep);
|
|
|
|
*tablep = NULL;
|
|
|
|
}
|
|
|
|
|
2007-07-26 13:40:43 +02:00
|
|
|
static void
|
2007-08-09 11:16:46 +02:00
|
|
|
set_table_entry(struct ctl_table *entry,
|
2007-07-26 13:40:43 +02:00
|
|
|
const char *procname, void *data, int maxlen,
|
2011-07-26 03:47:31 -04:00
|
|
|
umode_t mode, proc_handler *proc_handler)
|
2007-07-26 13:40:43 +02:00
|
|
|
{
|
|
|
|
entry->procname = procname;
|
|
|
|
entry->data = data;
|
|
|
|
entry->maxlen = maxlen;
|
|
|
|
entry->mode = mode;
|
|
|
|
entry->proc_handler = proc_handler;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct ctl_table *
|
|
|
|
sd_alloc_ctl_domain_table(struct sched_domain *sd)
|
|
|
|
{
|
2008-10-09 11:35:51 +02:00
|
|
|
struct ctl_table *table = sd_alloc_ctl_entry(13);
|
2007-07-26 13:40:43 +02:00
|
|
|
|
2007-10-15 17:00:19 +02:00
|
|
|
if (table == NULL)
|
|
|
|
return NULL;
|
|
|
|
|
2007-08-09 11:16:46 +02:00
|
|
|
set_table_entry(&table[0], "min_interval", &sd->min_interval,
|
2007-07-26 13:40:43 +02:00
|
|
|
sizeof(long), 0644, proc_doulongvec_minmax);
|
2007-08-09 11:16:46 +02:00
|
|
|
set_table_entry(&table[1], "max_interval", &sd->max_interval,
|
2007-07-26 13:40:43 +02:00
|
|
|
sizeof(long), 0644, proc_doulongvec_minmax);
|
2007-08-09 11:16:46 +02:00
|
|
|
set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
|
2007-07-26 13:40:43 +02:00
|
|
|
sizeof(int), 0644, proc_dointvec_minmax);
|
2007-08-09 11:16:46 +02:00
|
|
|
set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
|
2007-07-26 13:40:43 +02:00
|
|
|
sizeof(int), 0644, proc_dointvec_minmax);
|
2007-08-09 11:16:46 +02:00
|
|
|
set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
|
2007-07-26 13:40:43 +02:00
|
|
|
sizeof(int), 0644, proc_dointvec_minmax);
|
2007-08-09 11:16:46 +02:00
|
|
|
set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
|
2007-07-26 13:40:43 +02:00
|
|
|
sizeof(int), 0644, proc_dointvec_minmax);
|
2007-08-09 11:16:46 +02:00
|
|
|
set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
|
2007-07-26 13:40:43 +02:00
|
|
|
sizeof(int), 0644, proc_dointvec_minmax);
|
2007-08-09 11:16:46 +02:00
|
|
|
set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
|
2007-07-26 13:40:43 +02:00
|
|
|
sizeof(int), 0644, proc_dointvec_minmax);
|
2007-08-09 11:16:46 +02:00
|
|
|
set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
|
2007-07-26 13:40:43 +02:00
|
|
|
sizeof(int), 0644, proc_dointvec_minmax);
|
2007-10-15 17:00:14 +02:00
|
|
|
set_table_entry(&table[9], "cache_nice_tries",
|
2007-07-26 13:40:43 +02:00
|
|
|
&sd->cache_nice_tries,
|
|
|
|
sizeof(int), 0644, proc_dointvec_minmax);
|
2007-10-15 17:00:14 +02:00
|
|
|
set_table_entry(&table[10], "flags", &sd->flags,
|
2007-07-26 13:40:43 +02:00
|
|
|
sizeof(int), 0644, proc_dointvec_minmax);
|
2008-10-09 11:35:51 +02:00
|
|
|
set_table_entry(&table[11], "name", sd->name,
|
|
|
|
CORENAME_MAX_SIZE, 0444, proc_dostring);
|
|
|
|
/* &table[12] is terminator */
|
2007-07-26 13:40:43 +02:00
|
|
|
|
|
|
|
return table;
|
|
|
|
}
|
|
|
|
|
2007-11-28 15:52:56 +01:00
|
|
|
static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
|
2007-07-26 13:40:43 +02:00
|
|
|
{
|
|
|
|
struct ctl_table *entry, *table;
|
|
|
|
struct sched_domain *sd;
|
|
|
|
int domain_num = 0, i;
|
|
|
|
char buf[32];
|
|
|
|
|
|
|
|
for_each_domain(cpu, sd)
|
|
|
|
domain_num++;
|
|
|
|
entry = table = sd_alloc_ctl_entry(domain_num + 1);
|
2007-10-15 17:00:19 +02:00
|
|
|
if (table == NULL)
|
|
|
|
return NULL;
|
2007-07-26 13:40:43 +02:00
|
|
|
|
|
|
|
i = 0;
|
|
|
|
for_each_domain(cpu, sd) {
|
|
|
|
snprintf(buf, 32, "domain%d", i);
|
|
|
|
entry->procname = kstrdup(buf, GFP_KERNEL);
|
2007-08-23 15:18:02 +02:00
|
|
|
entry->mode = 0555;
|
2007-07-26 13:40:43 +02:00
|
|
|
entry->child = sd_alloc_ctl_domain_table(sd);
|
|
|
|
entry++;
|
|
|
|
i++;
|
|
|
|
}
|
|
|
|
return table;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct ctl_table_header *sd_sysctl_header;
|
2007-10-15 17:00:19 +02:00
|
|
|
static void register_sched_domain_sysctl(void)
|
2007-07-26 13:40:43 +02:00
|
|
|
{
|
2009-11-25 13:31:39 +01:00
|
|
|
int i, cpu_num = num_possible_cpus();
|
2007-07-26 13:40:43 +02:00
|
|
|
struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
|
|
|
|
char buf[32];
|
|
|
|
|
2007-10-24 18:23:48 +02:00
|
|
|
WARN_ON(sd_ctl_dir[0].child);
|
|
|
|
sd_ctl_dir[0].child = entry;
|
|
|
|
|
2007-10-15 17:00:19 +02:00
|
|
|
if (entry == NULL)
|
|
|
|
return;
|
|
|
|
|
2009-11-25 13:31:39 +01:00
|
|
|
for_each_possible_cpu(i) {
|
2007-07-26 13:40:43 +02:00
|
|
|
snprintf(buf, 32, "cpu%d", i);
|
|
|
|
entry->procname = kstrdup(buf, GFP_KERNEL);
|
2007-08-23 15:18:02 +02:00
|
|
|
entry->mode = 0555;
|
2007-07-26 13:40:43 +02:00
|
|
|
entry->child = sd_alloc_ctl_cpu_table(i);
|
2007-10-15 17:00:19 +02:00
|
|
|
entry++;
|
2007-07-26 13:40:43 +02:00
|
|
|
}
|
2007-10-24 18:23:48 +02:00
|
|
|
|
|
|
|
WARN_ON(sd_sysctl_header);
|
2007-07-26 13:40:43 +02:00
|
|
|
sd_sysctl_header = register_sysctl_table(sd_ctl_root);
|
|
|
|
}
|
2007-10-15 17:00:19 +02:00
|
|
|
|
2007-10-24 18:23:48 +02:00
|
|
|
/* may be called multiple times per register */
|
2007-10-15 17:00:19 +02:00
|
|
|
static void unregister_sched_domain_sysctl(void)
|
|
|
|
{
|
2007-10-24 18:23:48 +02:00
|
|
|
if (sd_sysctl_header)
|
|
|
|
unregister_sysctl_table(sd_sysctl_header);
|
2007-10-15 17:00:19 +02:00
|
|
|
sd_sysctl_header = NULL;
|
2007-10-24 18:23:48 +02:00
|
|
|
if (sd_ctl_dir[0].child)
|
|
|
|
sd_free_ctl_entry(&sd_ctl_dir[0].child);
|
2007-10-15 17:00:19 +02:00
|
|
|
}
|
2007-07-26 13:40:43 +02:00
|
|
|
#else
|
2007-10-15 17:00:19 +02:00
|
|
|
/* No-op variant used when the sysctl debug interface is compiled out. */
static void register_sched_domain_sysctl(void) { }
|
|
|
|
/* No-op variant used when the sysctl debug interface is compiled out. */
static void unregister_sched_domain_sysctl(void) { }
|
|
|
|
#endif
|
|
|
|
|
2008-06-04 15:04:05 -04:00
|
|
|
static void set_rq_online(struct rq *rq)
|
|
|
|
{
|
|
|
|
if (!rq->online) {
|
|
|
|
const struct sched_class *class;
|
|
|
|
|
2008-11-25 02:35:05 +10:30
|
|
|
cpumask_set_cpu(rq->cpu, rq->rd->online);
|
2008-06-04 15:04:05 -04:00
|
|
|
rq->online = 1;
|
|
|
|
|
|
|
|
for_each_class(class) {
|
|
|
|
if (class->rq_online)
|
|
|
|
class->rq_online(rq);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void set_rq_offline(struct rq *rq)
|
|
|
|
{
|
|
|
|
if (rq->online) {
|
|
|
|
const struct sched_class *class;
|
|
|
|
|
|
|
|
for_each_class(class) {
|
|
|
|
if (class->rq_offline)
|
|
|
|
class->rq_offline(rq);
|
|
|
|
}
|
|
|
|
|
2008-11-25 02:35:05 +10:30
|
|
|
cpumask_clear_cpu(rq->cpu, rq->rd->online);
|
2008-06-04 15:04:05 -04:00
|
|
|
rq->online = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
|
|
|
* migration_call - callback that gets triggered when a CPU is added.
|
|
|
|
* Here we can start up the necessary migration thread for the new CPU.
|
|
|
|
*/
|
2006-07-03 00:25:40 -07:00
|
|
|
static int __cpuinit
|
|
|
|
migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2006-07-03 00:25:40 -07:00
|
|
|
int cpu = (long)hcpu;
|
2005-04-16 15:20:36 -07:00
|
|
|
unsigned long flags;
|
2010-05-06 18:49:21 +02:00
|
|
|
struct rq *rq = cpu_rq(cpu);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2010-11-13 19:32:29 +01:00
|
|
|
switch (action & ~CPU_TASKS_FROZEN) {
|
2007-05-09 02:34:04 -07:00
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
case CPU_UP_PREPARE:
|
2009-07-17 14:15:46 +02:00
|
|
|
rq->calc_load_update = calc_load_update;
|
2005-04-16 15:20:36 -07:00
|
|
|
break;
|
2006-07-03 00:25:40 -07:00
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
case CPU_ONLINE:
|
2008-03-10 16:52:41 -04:00
|
|
|
/* Update our root-domain */
|
2009-11-17 14:28:38 +01:00
|
|
|
raw_spin_lock_irqsave(&rq->lock, flags);
|
2008-03-10 16:52:41 -04:00
|
|
|
if (rq->rd) {
|
2008-11-25 02:35:05 +10:30
|
|
|
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
|
2008-06-04 15:04:05 -04:00
|
|
|
|
|
|
|
set_rq_online(rq);
|
2008-03-10 16:52:41 -04:00
|
|
|
}
|
2009-11-17 14:28:38 +01:00
|
|
|
raw_spin_unlock_irqrestore(&rq->lock, flags);
|
2005-04-16 15:20:36 -07:00
|
|
|
break;
|
2006-07-03 00:25:40 -07:00
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
#ifdef CONFIG_HOTPLUG_CPU
|
2008-03-10 17:59:11 -04:00
|
|
|
case CPU_DYING:
|
2011-04-05 17:23:58 +02:00
|
|
|
sched_ttwu_pending();
|
2008-01-25 21:08:18 +01:00
|
|
|
/* Update our root-domain */
|
2009-11-17 14:28:38 +01:00
|
|
|
raw_spin_lock_irqsave(&rq->lock, flags);
|
2008-01-25 21:08:18 +01:00
|
|
|
if (rq->rd) {
|
2008-11-25 02:35:05 +10:30
|
|
|
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
|
2008-06-04 15:04:05 -04:00
|
|
|
set_rq_offline(rq);
|
2008-01-25 21:08:18 +01:00
|
|
|
}
|
2010-11-13 19:32:29 +01:00
|
|
|
migrate_tasks(cpu);
|
|
|
|
BUG_ON(rq->nr_running != 1); /* the migration thread */
|
2009-11-17 14:28:38 +01:00
|
|
|
raw_spin_unlock_irqrestore(&rq->lock, flags);
|
2012-08-20 11:26:57 +02:00
|
|
|
break;
|
2010-11-13 19:32:29 +01:00
|
|
|
|
2012-08-20 11:26:57 +02:00
|
|
|
case CPU_DEAD:
|
2012-08-20 11:26:57 +02:00
|
|
|
calc_load_migrate(rq);
|
2008-01-25 21:08:18 +01:00
|
|
|
break;
|
2005-04-16 15:20:36 -07:00
|
|
|
#endif
|
|
|
|
}
|
2011-04-05 10:14:25 +02:00
|
|
|
|
|
|
|
update_max_interval();
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
return NOTIFY_OK;
|
|
|
|
}
|
|
|
|
|
2009-06-02 21:05:16 +10:00
|
|
|
/*
|
|
|
|
* Register at high priority so that task migration (migrate_tasks)
|
|
|
|
* happens before everything else. This has to be lower priority than
|
perf: Do the big rename: Performance Counters -> Performance Events
Bye-bye Performance Counters, welcome Performance Events!
In the past few months the perfcounters subsystem has grown out its
initial role of counting hardware events, and has become (and is
becoming) a much broader generic event enumeration, reporting, logging,
monitoring, analysis facility.
Naming its core object 'perf_counter' and naming the subsystem
'perfcounters' has become more and more of a misnomer. With pending
code like hw-breakpoints support the 'counter' name is less and
less appropriate.
All in one, we've decided to rename the subsystem to 'performance
events' and to propagate this rename through all fields, variables
and API names. (in an ABI compatible fashion)
The word 'event' is also a bit shorter than 'counter' - which makes
it slightly more convenient to write/handle as well.
Thanks goes to Stephane Eranian who first observed this misnomer and
suggested a rename.
User-space tooling and ABI compatibility is not affected - this patch
should be function-invariant. (Also, defconfigs were not touched to
keep the size down.)
This patch has been generated via the following script:
FILES=$(find * -type f | grep -vE 'oprofile|[^K]config')
sed -i \
-e 's/PERF_EVENT_/PERF_RECORD_/g' \
-e 's/PERF_COUNTER/PERF_EVENT/g' \
-e 's/perf_counter/perf_event/g' \
-e 's/nb_counters/nb_events/g' \
-e 's/swcounter/swevent/g' \
-e 's/tpcounter_event/tp_event/g' \
$FILES
for N in $(find . -name perf_counter.[ch]); do
M=$(echo $N | sed 's/perf_counter/perf_event/g')
mv $N $M
done
FILES=$(find . -name perf_event.*)
sed -i \
-e 's/COUNTER_MASK/REG_MASK/g' \
-e 's/COUNTER/EVENT/g' \
-e 's/\<event\>/event_id/g' \
-e 's/counter/event/g' \
-e 's/Counter/Event/g' \
$FILES
... to keep it as correct as possible. This script can also be
used by anyone who has pending perfcounters patches - it converts
a Linux kernel tree over to the new naming. We tried to time this
change to the point in time where the amount of pending patches
is the smallest: the end of the merge window.
Namespace clashes were fixed up in a preparatory patch - and some
stylistic fallout will be fixed up in a subsequent patch.
( NOTE: 'counters' are still the proper terminology when we deal
with hardware registers - and these sed scripts are a bit
over-eager in renaming them. I've undone some of that, but
in case there's something left where 'counter' would be
better than 'event' we can undo that on an individual basis
instead of touching an otherwise nicely automated patch. )
Suggested-by: Stephane Eranian <eranian@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Reviewed-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: <linux-arch@vger.kernel.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-21 12:02:48 +02:00
|
|
|
* the notifier in the perf_event subsystem, though.
|
2005-04-16 15:20:36 -07:00
|
|
|
*/
|
2006-06-27 02:54:10 -07:00
|
|
|
static struct notifier_block __cpuinitdata migration_notifier = {
	.priority	= CPU_PRI_MIGRATION,
	.notifier_call	= migration_call,
};
|
|
|
|
|
2010-06-08 21:40:36 +02:00
|
|
|
/*
 * Hotplug notifier: mark the cpu active again on CPU_STARTING and on
 * CPU_DOWN_FAILED (frozen variants included).  Every other action is
 * answered with NOTIFY_DONE.
 */
static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
				      unsigned long action, void *hcpu)
{
	unsigned long event = action & ~CPU_TASKS_FROZEN;

	if (event != CPU_STARTING && event != CPU_DOWN_FAILED)
		return NOTIFY_DONE;

	set_cpu_active((long)hcpu, true);
	return NOTIFY_OK;
}
|
|
|
|
|
|
|
|
/*
 * Hotplug notifier: mark the cpu inactive on CPU_DOWN_PREPARE (frozen
 * variant included).  Every other action is answered with NOTIFY_DONE.
 */
static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
					unsigned long action, void *hcpu)
{
	unsigned long event = action & ~CPU_TASKS_FROZEN;

	if (event != CPU_DOWN_PREPARE)
		return NOTIFY_DONE;

	set_cpu_active((long)hcpu, false);
	return NOTIFY_OK;
}
|
|
|
|
|
2008-07-25 19:45:11 -07:00
|
|
|
/*
 * Early initcall: run the migration callback by hand for the cpu we are
 * on, then register the hotplug notifiers.
 */
static int __init migration_init(void)
{
	void *boot_cpu = (void *)(long)smp_processor_id();
	int rc;

	/* Initialize migration for the boot CPU */
	rc = migration_call(&migration_notifier, CPU_UP_PREPARE, boot_cpu);
	BUG_ON(rc == NOTIFY_BAD);
	migration_call(&migration_notifier, CPU_ONLINE, boot_cpu);
	register_cpu_notifier(&migration_notifier);

	/* Register cpu active notifiers */
	cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
	cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);

	return 0;
}
|
2008-07-25 19:45:11 -07:00
|
|
|
early_initcall(migration_init);
|
2005-04-16 15:20:36 -07:00
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifdef CONFIG_SMP
|
2007-05-06 14:48:58 -07:00
|
|
|
|
2011-04-07 14:09:58 +02:00
|
|
|
static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
|
|
|
|
|
2007-10-15 17:00:13 +02:00
|
|
|
#ifdef CONFIG_SCHED_DEBUG
|
2007-10-24 18:23:48 +02:00
|
|
|
|
2012-05-31 21:20:16 +02:00
|
|
|
static __read_mostly int sched_debug_enabled;
|
2009-11-17 18:22:15 -06:00
|
|
|
|
2012-05-31 21:20:16 +02:00
|
|
|
/*
 * Handler for the "sched_debug" early parameter: its mere presence
 * switches the debug output on; @str is not looked at.
 */
static int __init sched_debug_setup(char *str)
{
	sched_debug_enabled = 1;
	return 0;
}
|
2012-05-31 21:20:16 +02:00
|
|
|
early_param("sched_debug", sched_debug_setup);
|
|
|
|
|
|
|
|
static inline bool sched_debug(void)
|
|
|
|
{
|
|
|
|
return sched_debug_enabled;
|
|
|
|
}
|
2009-11-17 18:22:15 -06:00
|
|
|
|
2008-04-04 18:11:11 -07:00
|
|
|
static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
|
2008-11-25 02:35:14 +10:30
|
|
|
struct cpumask *groupmask)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2007-10-24 18:23:48 +02:00
|
|
|
struct sched_group *group = sd->groups;
|
2008-04-04 18:11:04 -07:00
|
|
|
char str[256];
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2008-12-13 21:55:51 +10:30
|
|
|
cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
|
2008-11-25 02:35:14 +10:30
|
|
|
cpumask_clear(groupmask);
|
2007-10-24 18:23:48 +02:00
|
|
|
|
|
|
|
printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
|
|
|
|
|
|
|
|
if (!(sd->flags & SD_LOAD_BALANCE)) {
|
2009-12-20 14:23:57 +01:00
|
|
|
printk("does not load-balance\n");
|
2007-10-24 18:23:48 +02:00
|
|
|
if (sd->parent)
|
2009-12-20 14:23:57 +01:00
|
|
|
printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
|
|
|
|
" has parent");
|
2007-10-24 18:23:48 +02:00
|
|
|
return -1;
|
2005-06-25 14:57:24 -07:00
|
|
|
}
|
|
|
|
|
2009-12-20 14:23:57 +01:00
|
|
|
printk(KERN_CONT "span %s level %s\n", str, sd->name);
|
2007-10-24 18:23:48 +02:00
|
|
|
|
2008-11-25 02:35:04 +10:30
|
|
|
if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
|
2009-12-20 14:23:57 +01:00
|
|
|
printk(KERN_ERR "ERROR: domain->span does not contain "
|
|
|
|
"CPU%d\n", cpu);
|
2007-10-24 18:23:48 +02:00
|
|
|
}
|
2008-11-25 02:35:04 +10:30
|
|
|
if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
|
2009-12-20 14:23:57 +01:00
|
|
|
printk(KERN_ERR "ERROR: domain->groups does not contain"
|
|
|
|
" CPU%d\n", cpu);
|
2007-10-24 18:23:48 +02:00
|
|
|
}
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2007-10-24 18:23:48 +02:00
|
|
|
printk(KERN_DEBUG "%*s groups:", level + 1, "");
|
2005-04-16 15:20:36 -07:00
|
|
|
do {
|
2007-10-24 18:23:48 +02:00
|
|
|
if (!group) {
|
2009-12-20 14:23:57 +01:00
|
|
|
printk("\n");
|
|
|
|
printk(KERN_ERR "ERROR: group is NULL\n");
|
2005-04-16 15:20:36 -07:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2012-05-31 12:05:32 +02:00
|
|
|
/*
|
|
|
|
* Even though we initialize ->power to something semi-sane,
|
|
|
|
* we leave power_orig unset. This allows us to detect if
|
|
|
|
* domain iteration is still funny without causing /0 traps.
|
|
|
|
*/
|
|
|
|
if (!group->sgp->power_orig) {
|
2009-12-20 14:23:57 +01:00
|
|
|
printk(KERN_CONT "\n");
|
|
|
|
printk(KERN_ERR "ERROR: domain->cpu_power not "
|
|
|
|
"set\n");
|
2007-10-24 18:23:48 +02:00
|
|
|
break;
|
|
|
|
}
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2008-11-25 02:35:04 +10:30
|
|
|
if (!cpumask_weight(sched_group_cpus(group))) {
|
2009-12-20 14:23:57 +01:00
|
|
|
printk(KERN_CONT "\n");
|
|
|
|
printk(KERN_ERR "ERROR: empty group\n");
|
2007-10-24 18:23:48 +02:00
|
|
|
break;
|
|
|
|
}
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2012-04-17 15:49:36 +02:00
|
|
|
if (!(sd->flags & SD_OVERLAP) &&
|
|
|
|
cpumask_intersects(groupmask, sched_group_cpus(group))) {
|
2009-12-20 14:23:57 +01:00
|
|
|
printk(KERN_CONT "\n");
|
|
|
|
printk(KERN_ERR "ERROR: repeated CPUs\n");
|
2007-10-24 18:23:48 +02:00
|
|
|
break;
|
|
|
|
}
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2008-11-25 02:35:04 +10:30
|
|
|
cpumask_or(groupmask, groupmask, sched_group_cpus(group));
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2008-12-13 21:55:51 +10:30
|
|
|
cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
|
2009-04-14 09:09:36 +05:30
|
|
|
|
2009-12-20 14:23:57 +01:00
|
|
|
printk(KERN_CONT " %s", str);
|
2011-07-14 13:00:06 +02:00
|
|
|
if (group->sgp->power != SCHED_POWER_SCALE) {
|
2009-12-20 14:23:57 +01:00
|
|
|
printk(KERN_CONT " (cpu_power = %d)",
|
2011-07-14 13:00:06 +02:00
|
|
|
group->sgp->power);
|
2009-04-14 09:09:36 +05:30
|
|
|
}
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2007-10-24 18:23:48 +02:00
|
|
|
group = group->next;
|
|
|
|
} while (group != sd->groups);
|
2009-12-20 14:23:57 +01:00
|
|
|
printk(KERN_CONT "\n");
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2008-11-25 02:35:04 +10:30
|
|
|
if (!cpumask_equal(sched_domain_span(sd), groupmask))
|
2009-12-20 14:23:57 +01:00
|
|
|
printk(KERN_ERR "ERROR: groups don't span domain->span\n");
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2008-11-25 02:35:04 +10:30
|
|
|
if (sd->parent &&
|
|
|
|
!cpumask_subset(groupmask, sched_domain_span(sd->parent)))
|
2009-12-20 14:23:57 +01:00
|
|
|
printk(KERN_ERR "ERROR: parent span is not a superset "
|
|
|
|
"of domain->span\n");
|
2007-10-24 18:23:48 +02:00
|
|
|
return 0;
|
|
|
|
}
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2007-10-24 18:23:48 +02:00
|
|
|
static void sched_domain_debug(struct sched_domain *sd, int cpu)
|
|
|
|
{
|
|
|
|
int level = 0;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2012-05-31 21:20:16 +02:00
|
|
|
if (!sched_debug_enabled)
|
2009-11-17 18:22:15 -06:00
|
|
|
return;
|
|
|
|
|
2007-10-24 18:23:48 +02:00
|
|
|
if (!sd) {
|
|
|
|
printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
|
|
|
|
return;
|
|
|
|
}
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2007-10-24 18:23:48 +02:00
|
|
|
printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
|
|
|
|
|
|
|
|
for (;;) {
|
2011-04-07 14:09:58 +02:00
|
|
|
if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
|
2007-10-24 18:23:48 +02:00
|
|
|
break;
|
2005-04-16 15:20:36 -07:00
|
|
|
level++;
|
|
|
|
sd = sd->parent;
|
2006-12-10 02:20:38 -08:00
|
|
|
if (!sd)
|
2007-10-24 18:23:48 +02:00
|
|
|
break;
|
|
|
|
}
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
2008-05-30 14:23:45 +02:00
|
|
|
#else /* !CONFIG_SCHED_DEBUG */
|
2006-07-03 00:25:40 -07:00
|
|
|
# define sched_domain_debug(sd, cpu) do { } while (0)
|
2012-05-31 21:20:16 +02:00
|
|
|
/* Without CONFIG_SCHED_DEBUG the debug output is never enabled. */
static inline bool sched_debug(void) { return false; }
|
2008-05-30 14:23:45 +02:00
|
|
|
#endif /* CONFIG_SCHED_DEBUG */
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2005-06-25 14:57:33 -07:00
|
|
|
static int sd_degenerate(struct sched_domain *sd)
|
2005-06-25 14:57:25 -07:00
|
|
|
{
|
2008-11-25 02:35:04 +10:30
|
|
|
if (cpumask_weight(sched_domain_span(sd)) == 1)
|
2005-06-25 14:57:25 -07:00
|
|
|
return 1;
|
|
|
|
|
|
|
|
/* Following flags need at least 2 groups */
|
|
|
|
if (sd->flags & (SD_LOAD_BALANCE |
|
|
|
|
SD_BALANCE_NEWIDLE |
|
|
|
|
SD_BALANCE_FORK |
|
2006-10-03 01:14:09 -07:00
|
|
|
SD_BALANCE_EXEC |
|
|
|
|
SD_SHARE_CPUPOWER |
|
|
|
|
SD_SHARE_PKG_RESOURCES)) {
|
2005-06-25 14:57:25 -07:00
|
|
|
if (sd->groups != sd->groups->next)
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Following flags don't use groups */
|
sched: Merge select_task_rq_fair() and sched_balance_self()
The problem with wake_idle() is that is doesn't respect things like
cpu_power, which means it doesn't deal well with SMT nor the recent
RT interaction.
To cure this, it needs to do what sched_balance_self() does, which
leads to the possibility of merging select_task_rq_fair() and
sched_balance_self().
Modify sched_balance_self() to:
- update_shares() when walking up the domain tree,
(it only called it for the top domain, but it should
have done this anyway), which allows us to remove
this ugly bit from try_to_wake_up().
- do wake_affine() on the smallest domain that contains
both this (the waking) and the prev (the wakee) cpu for
WAKE invocations.
Then use the top-down balance steps it had to replace wake_idle().
This leads to the dissapearance of SD_WAKE_BALANCE and
SD_WAKE_IDLE_FAR, with SD_WAKE_IDLE replaced with SD_BALANCE_WAKE.
SD_WAKE_AFFINE needs SD_BALANCE_WAKE to be effective.
Touch all topology bits to replace the old with new SD flags --
platforms might need re-tuning, enabling SD_BALANCE_WAKE
conditionally on a NUMA distance seems like a good additional
feature, magny-core and small nehalem systems would want this
enabled, systems with slow interconnects would not.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-10 13:50:02 +02:00
|
|
|
if (sd->flags & (SD_WAKE_AFFINE))
|
2005-06-25 14:57:25 -07:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2006-07-03 00:25:40 -07:00
|
|
|
static int
|
|
|
|
sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
|
2005-06-25 14:57:25 -07:00
|
|
|
{
|
|
|
|
unsigned long cflags = sd->flags, pflags = parent->flags;
|
|
|
|
|
|
|
|
if (sd_degenerate(parent))
|
|
|
|
return 1;
|
|
|
|
|
2008-11-25 02:35:04 +10:30
|
|
|
if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
|
2005-06-25 14:57:25 -07:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
/* Flags needing groups don't count if only 1 group in parent */
|
|
|
|
if (parent->groups == parent->groups->next) {
|
|
|
|
pflags &= ~(SD_LOAD_BALANCE |
|
|
|
|
SD_BALANCE_NEWIDLE |
|
|
|
|
SD_BALANCE_FORK |
|
2006-10-03 01:14:09 -07:00
|
|
|
SD_BALANCE_EXEC |
|
|
|
|
SD_SHARE_CPUPOWER |
|
|
|
|
SD_SHARE_PKG_RESOURCES);
|
2008-12-07 18:47:37 -08:00
|
|
|
if (nr_node_ids == 1)
|
|
|
|
pflags &= ~SD_SERIALIZE;
|
2005-06-25 14:57:25 -07:00
|
|
|
}
|
|
|
|
if (~cflags & pflags)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2011-04-07 14:09:50 +02:00
|
|
|
static void free_rootdomain(struct rcu_head *rcu)
|
2008-11-25 02:35:05 +10:30
|
|
|
{
|
2011-04-07 14:09:50 +02:00
|
|
|
struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
|
2009-11-16 10:28:09 +01:00
|
|
|
|
2008-11-25 02:35:13 +10:30
|
|
|
cpupri_cleanup(&rd->cpupri);
|
2008-11-25 02:35:05 +10:30
|
|
|
free_cpumask_var(rd->rto_mask);
|
|
|
|
free_cpumask_var(rd->online);
|
|
|
|
free_cpumask_var(rd->span);
|
|
|
|
kfree(rd);
|
|
|
|
}
|
|
|
|
|
2008-01-25 21:08:18 +01:00
|
|
|
static void rq_attach_root(struct rq *rq, struct root_domain *rd)
|
|
|
|
{
|
2009-02-12 11:35:40 +01:00
|
|
|
struct root_domain *old_rd = NULL;
|
2008-01-25 21:08:18 +01:00
|
|
|
unsigned long flags;
|
|
|
|
|
2009-11-17 14:28:38 +01:00
|
|
|
raw_spin_lock_irqsave(&rq->lock, flags);
|
2008-01-25 21:08:18 +01:00
|
|
|
|
|
|
|
if (rq->rd) {
|
2009-02-12 11:35:40 +01:00
|
|
|
old_rd = rq->rd;
|
2008-01-25 21:08:18 +01:00
|
|
|
|
2008-11-25 02:35:05 +10:30
|
|
|
if (cpumask_test_cpu(rq->cpu, old_rd->online))
|
2008-06-04 15:04:05 -04:00
|
|
|
set_rq_offline(rq);
|
2008-01-25 21:08:18 +01:00
|
|
|
|
2008-11-25 02:35:05 +10:30
|
|
|
cpumask_clear_cpu(rq->cpu, old_rd->span);
|
2008-01-25 21:08:26 +01:00
|
|
|
|
2009-02-12 11:35:40 +01:00
|
|
|
/*
|
|
|
|
* If we dont want to free the old_rt yet then
|
|
|
|
* set old_rd to NULL to skip the freeing later
|
|
|
|
* in this function:
|
|
|
|
*/
|
|
|
|
if (!atomic_dec_and_test(&old_rd->refcount))
|
|
|
|
old_rd = NULL;
|
2008-01-25 21:08:18 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
atomic_inc(&rd->refcount);
|
|
|
|
rq->rd = rd;
|
|
|
|
|
2008-11-25 02:35:05 +10:30
|
|
|
cpumask_set_cpu(rq->cpu, rd->span);
|
2009-07-30 10:57:23 -04:00
|
|
|
if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
|
2008-06-04 15:04:05 -04:00
|
|
|
set_rq_online(rq);
|
2008-01-25 21:08:18 +01:00
|
|
|
|
2009-11-17 14:28:38 +01:00
|
|
|
raw_spin_unlock_irqrestore(&rq->lock, flags);
|
2009-02-12 11:35:40 +01:00
|
|
|
|
|
|
|
if (old_rd)
|
2011-04-07 14:09:50 +02:00
|
|
|
call_rcu_sched(&old_rd->rcu, free_rootdomain);
|
2008-01-25 21:08:18 +01:00
|
|
|
}
|
|
|
|
|
2010-07-15 23:18:22 +03:00
|
|
|
static int init_rootdomain(struct root_domain *rd)
|
2008-01-25 21:08:18 +01:00
|
|
|
{
|
|
|
|
memset(rd, 0, sizeof(*rd));
|
|
|
|
|
2010-07-15 23:18:22 +03:00
|
|
|
if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
|
2009-01-06 17:39:06 +08:00
|
|
|
goto out;
|
2010-07-15 23:18:22 +03:00
|
|
|
if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
|
2008-11-25 02:35:05 +10:30
|
|
|
goto free_span;
|
2010-07-15 23:18:22 +03:00
|
|
|
if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
|
2008-11-25 02:35:05 +10:30
|
|
|
goto free_online;
|
2008-05-12 21:21:01 +02:00
|
|
|
|
2010-07-15 23:18:22 +03:00
|
|
|
if (cpupri_init(&rd->cpupri) != 0)
|
2008-11-25 02:35:13 +10:30
|
|
|
goto free_rto_mask;
|
2008-11-25 02:35:05 +10:30
|
|
|
return 0;
|
2008-05-12 21:21:01 +02:00
|
|
|
|
2008-11-25 02:35:13 +10:30
|
|
|
free_rto_mask:
|
|
|
|
free_cpumask_var(rd->rto_mask);
|
2008-11-25 02:35:05 +10:30
|
|
|
free_online:
|
|
|
|
free_cpumask_var(rd->online);
|
|
|
|
free_span:
|
|
|
|
free_cpumask_var(rd->span);
|
2009-01-06 17:39:06 +08:00
|
|
|
out:
|
2008-11-25 02:35:05 +10:30
|
|
|
return -ENOMEM;
|
2008-01-25 21:08:18 +01:00
|
|
|
}
|
|
|
|
|
2011-10-25 10:00:11 +02:00
|
|
|
/*
|
|
|
|
* By default the system creates a single root-domain with all cpus as
|
|
|
|
* members (mimicking the global state we have today).
|
|
|
|
*/
|
|
|
|
struct root_domain def_root_domain;
|
|
|
|
|
2008-01-25 21:08:18 +01:00
|
|
|
static void init_defrootdomain(void)
|
|
|
|
{
|
2010-07-15 23:18:22 +03:00
|
|
|
init_rootdomain(&def_root_domain);
|
2008-11-25 02:35:05 +10:30
|
|
|
|
2008-01-25 21:08:18 +01:00
|
|
|
atomic_set(&def_root_domain.refcount, 1);
|
|
|
|
}
|
|
|
|
|
2008-01-25 21:08:26 +01:00
|
|
|
static struct root_domain *alloc_rootdomain(void)
|
2008-01-25 21:08:18 +01:00
|
|
|
{
|
|
|
|
struct root_domain *rd;
|
|
|
|
|
|
|
|
rd = kmalloc(sizeof(*rd), GFP_KERNEL);
|
|
|
|
if (!rd)
|
|
|
|
return NULL;
|
|
|
|
|
2010-07-15 23:18:22 +03:00
|
|
|
if (init_rootdomain(rd) != 0) {
|
2008-11-25 02:35:05 +10:30
|
|
|
kfree(rd);
|
|
|
|
return NULL;
|
|
|
|
}
|
2008-01-25 21:08:18 +01:00
|
|
|
|
|
|
|
return rd;
|
|
|
|
}
|
|
|
|
|
2011-07-15 10:35:52 +02:00
|
|
|
static void free_sched_groups(struct sched_group *sg, int free_sgp)
|
|
|
|
{
|
|
|
|
struct sched_group *tmp, *first;
|
|
|
|
|
|
|
|
if (!sg)
|
|
|
|
return;
|
|
|
|
|
|
|
|
first = sg;
|
|
|
|
do {
|
|
|
|
tmp = sg->next;
|
|
|
|
|
|
|
|
if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
|
|
|
|
kfree(sg->sgp);
|
|
|
|
|
|
|
|
kfree(sg);
|
|
|
|
sg = tmp;
|
|
|
|
} while (sg != first);
|
|
|
|
}
|
|
|
|
|
2011-04-07 14:09:50 +02:00
|
|
|
static void free_sched_domain(struct rcu_head *rcu)
|
|
|
|
{
|
|
|
|
struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
|
2011-07-15 10:35:52 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If its an overlapping domain it has private groups, iterate and
|
|
|
|
* nuke them all.
|
|
|
|
*/
|
|
|
|
if (sd->flags & SD_OVERLAP) {
|
|
|
|
free_sched_groups(sd->groups, 1);
|
|
|
|
} else if (atomic_dec_and_test(&sd->groups->ref)) {
|
2011-07-14 13:00:06 +02:00
|
|
|
kfree(sd->groups->sgp);
|
2011-04-07 14:09:50 +02:00
|
|
|
kfree(sd->groups);
|
2011-07-14 13:00:06 +02:00
|
|
|
}
|
2011-04-07 14:09:50 +02:00
|
|
|
kfree(sd);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void destroy_sched_domain(struct sched_domain *sd, int cpu)
|
|
|
|
{
|
|
|
|
call_rcu(&sd->rcu, free_sched_domain);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void destroy_sched_domains(struct sched_domain *sd, int cpu)
|
|
|
|
{
|
|
|
|
for (; sd; sd = sd->parent)
|
|
|
|
destroy_sched_domain(sd, cpu);
|
|
|
|
}
|
|
|
|
|
2011-12-07 15:07:31 +01:00
|
|
|
/*
|
|
|
|
* Keep a special pointer to the highest sched_domain that has
|
|
|
|
* SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain); this
|
|
|
|
* allows us to avoid some pointer chasing in select_idle_sibling().
|
|
|
|
*
|
|
|
|
* Also keep a unique ID per domain (we use the first cpu number in
|
|
|
|
* the cpumask of the domain), this allows us to quickly tell if
|
2012-01-26 12:44:34 +01:00
|
|
|
* two cpus are in the same cache domain, see cpus_share_cache().
|
2011-12-07 15:07:31 +01:00
|
|
|
*/
|
|
|
|
DEFINE_PER_CPU(struct sched_domain *, sd_llc);
|
|
|
|
DEFINE_PER_CPU(int, sd_llc_id);
|
|
|
|
|
|
|
|
static void update_top_cache_domain(int cpu)
|
|
|
|
{
|
|
|
|
struct sched_domain *sd;
|
|
|
|
int id = cpu;
|
|
|
|
|
|
|
|
sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
|
2012-09-16 12:29:43 -07:00
|
|
|
if (sd)
|
2011-12-07 15:07:31 +01:00
|
|
|
id = cpumask_first(sched_domain_span(sd));
|
|
|
|
|
|
|
|
rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
|
|
|
|
per_cpu(sd_llc_id, cpu) = id;
|
|
|
|
}
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
2008-01-25 21:08:19 +01:00
|
|
|
* Attach the domain 'sd' to 'cpu' as its base domain. Callers must
|
2005-04-16 15:20:36 -07:00
|
|
|
* hold the hotplug lock.
|
|
|
|
*/
|
2008-01-25 21:08:19 +01:00
|
|
|
static void
|
|
|
|
cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2006-07-03 00:25:42 -07:00
|
|
|
struct rq *rq = cpu_rq(cpu);
|
2005-06-25 14:57:25 -07:00
|
|
|
struct sched_domain *tmp;
|
|
|
|
|
|
|
|
/* Remove the sched domains which do not contribute to scheduling. */
|
2008-11-06 09:45:16 +08:00
|
|
|
for (tmp = sd; tmp; ) {
|
2005-06-25 14:57:25 -07:00
|
|
|
struct sched_domain *parent = tmp->parent;
|
|
|
|
if (!parent)
|
|
|
|
break;
|
2008-11-06 09:45:16 +08:00
|
|
|
|
2006-10-03 01:14:08 -07:00
|
|
|
if (sd_parent_degenerate(tmp, parent)) {
|
2005-06-25 14:57:25 -07:00
|
|
|
tmp->parent = parent->parent;
|
2006-10-03 01:14:08 -07:00
|
|
|
if (parent->parent)
|
|
|
|
parent->parent->child = tmp;
|
2011-04-07 14:09:50 +02:00
|
|
|
destroy_sched_domain(parent, cpu);
|
2008-11-06 09:45:16 +08:00
|
|
|
} else
|
|
|
|
tmp = tmp->parent;
|
2005-06-25 14:57:25 -07:00
|
|
|
}
|
|
|
|
|
2006-10-03 01:14:08 -07:00
|
|
|
if (sd && sd_degenerate(sd)) {
|
2011-04-07 14:09:50 +02:00
|
|
|
tmp = sd;
|
2005-06-25 14:57:25 -07:00
|
|
|
sd = sd->parent;
|
2011-04-07 14:09:50 +02:00
|
|
|
destroy_sched_domain(tmp, cpu);
|
2006-10-03 01:14:08 -07:00
|
|
|
if (sd)
|
|
|
|
sd->child = NULL;
|
|
|
|
}
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2011-04-07 14:09:58 +02:00
|
|
|
sched_domain_debug(sd, cpu);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2008-01-25 21:08:18 +01:00
|
|
|
rq_attach_root(rq, rd);
|
2011-04-07 14:09:50 +02:00
|
|
|
tmp = rq->sd;
|
2005-06-25 14:57:27 -07:00
|
|
|
rcu_assign_pointer(rq->sd, sd);
|
2011-04-07 14:09:50 +02:00
|
|
|
destroy_sched_domains(tmp, cpu);
|
2011-12-07 15:07:31 +01:00
|
|
|
|
|
|
|
update_top_cache_domain(cpu);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
/* cpus with isolated domains */
|
2008-11-25 02:35:12 +10:30
|
|
|
static cpumask_var_t cpu_isolated_map;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
/* Setup the mask of cpus configured for isolated domains */
|
|
|
|
/*
 * Parse the "isolcpus=" boot parameter into cpu_isolated_map.
 *
 * @str: the CPU list given on the command line.
 *
 * Returns 1 when the list was parsed. If cpulist_parse() rejects the list,
 * a warning is printed, the mask is cleared so that a half-parsed list
 * cannot take effect, and 0 is returned.
 */
static int __init isolated_cpu_setup(char *str)
{
	int ret;

	alloc_bootmem_cpumask_var(&cpu_isolated_map);

	ret = cpulist_parse(str, cpu_isolated_map);
	if (ret) {
		pr_warn("sched: invalid isolcpus= list '%s' (error %d), ignoring\n",
			str, ret);
		cpumask_clear(cpu_isolated_map);
		return 0;
	}

	return 1;
}
|
|
|
|
|
2007-10-15 17:00:13 +02:00
|
|
|
__setup("isolcpus=", isolated_cpu_setup);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2011-04-07 14:09:59 +02:00
|
|
|
/* Return the cpumask of the NUMA node that @cpu belongs to. */
static const struct cpumask *cpu_cpu_mask(int cpu)
{
	int node = cpu_to_node(cpu);

	return cpumask_of_node(node);
}
|
|
|
|
|
2011-04-07 14:09:50 +02:00
|
|
|
struct sd_data {
|
|
|
|
struct sched_domain **__percpu sd;
|
|
|
|
struct sched_group **__percpu sg;
|
2011-07-14 13:00:06 +02:00
|
|
|
struct sched_group_power **__percpu sgp;
|
2011-04-07 14:09:50 +02:00
|
|
|
};
|
|
|
|
|
2009-08-18 12:51:52 +02:00
|
|
|
struct s_data {
|
2011-04-07 14:09:48 +02:00
|
|
|
struct sched_domain ** __percpu sd;
|
2009-08-18 12:51:52 +02:00
|
|
|
struct root_domain *rd;
|
|
|
|
};
|
|
|
|
|
2009-08-18 12:53:00 +02:00
|
|
|
/*
 * Allocation stage marker consumed by __free_domain_allocs(): each value
 * frees its own resource and then everything listed below it.
 */
enum s_alloc {
	sa_rootdomain,		/* root domain, percpu sd array and sd storage */
	sa_sd,			/* percpu sd array and sd storage */
	sa_sd_storage,		/* sd storage only (__sdt_free()) */
	sa_none,		/* nothing to free */
};
|
|
|
|
|
2011-04-07 14:10:03 +02:00
|
|
|
struct sched_domain_topology_level;
|
|
|
|
|
|
|
|
typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
|
2011-04-07 14:10:00 +02:00
|
|
|
typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
|
|
|
|
|
2011-07-15 10:35:52 +02:00
|
|
|
#define SDTL_OVERLAP 0x01
|
|
|
|
|
2011-04-07 14:10:00 +02:00
|
|
|
struct sched_domain_topology_level {
|
2011-04-07 14:10:01 +02:00
|
|
|
sched_domain_init_f init;
|
|
|
|
sched_domain_mask_f mask;
|
2011-07-15 10:35:52 +02:00
|
|
|
int flags;
|
2012-04-17 15:49:36 +02:00
|
|
|
int numa_level;
|
2011-04-07 14:10:03 +02:00
|
|
|
struct sd_data data;
|
2011-04-07 14:10:00 +02:00
|
|
|
};
|
|
|
|
|
2012-05-31 14:47:33 +02:00
|
|
|
/*
|
|
|
|
* Build an iteration mask that can exclude certain CPUs from the upwards
|
|
|
|
* domain traversal.
|
|
|
|
*
|
|
|
|
* Asymmetric node setups can result in situations where the domain tree is of
|
|
|
|
* unequal depth, make sure to skip domains that already cover the entire
|
|
|
|
* range.
|
|
|
|
*
|
|
|
|
* In that case build_sched_domains() will have terminated the iteration early
|
|
|
|
* and our sibling sd spans will be empty. Domains should always include the
|
|
|
|
* cpu they're built on, so check that.
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
|
|
|
|
{
|
|
|
|
const struct cpumask *span = sched_domain_span(sd);
|
|
|
|
struct sd_data *sdd = sd->private;
|
|
|
|
struct sched_domain *sibling;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for_each_cpu(i, span) {
|
|
|
|
sibling = *per_cpu_ptr(sdd->sd, i);
|
|
|
|
if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
cpumask_set_cpu(i, sched_group_mask(sg));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Return the canonical balance cpu for this group, this is the first cpu
|
|
|
|
* of this group that's also in the iteration mask.
|
|
|
|
*/
|
|
|
|
int group_balance_cpu(struct sched_group *sg)
{
	const struct cpumask *members = sched_group_cpus(sg);
	const struct cpumask *iter_mask = sched_group_mask(sg);

	/* First CPU set in both the group's CPUs and its iteration mask. */
	return cpumask_first_and(members, iter_mask);
}
|
|
|
|
|
2011-07-15 10:35:52 +02:00
|
|
|
static int
|
|
|
|
build_overlap_sched_groups(struct sched_domain *sd, int cpu)
|
|
|
|
{
|
|
|
|
struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
|
|
|
|
const struct cpumask *span = sched_domain_span(sd);
|
|
|
|
struct cpumask *covered = sched_domains_tmpmask;
|
|
|
|
struct sd_data *sdd = sd->private;
|
|
|
|
struct sched_domain *child;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
cpumask_clear(covered);
|
|
|
|
|
|
|
|
for_each_cpu(i, span) {
|
|
|
|
struct cpumask *sg_span;
|
|
|
|
|
|
|
|
if (cpumask_test_cpu(i, covered))
|
|
|
|
continue;
|
|
|
|
|
2012-05-31 14:47:33 +02:00
|
|
|
child = *per_cpu_ptr(sdd->sd, i);
|
|
|
|
|
|
|
|
/* See the comment near build_group_mask(). */
|
|
|
|
if (!cpumask_test_cpu(i, sched_domain_span(child)))
|
|
|
|
continue;
|
|
|
|
|
2011-07-15 10:35:52 +02:00
|
|
|
sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
|
2011-11-18 15:03:29 -08:00
|
|
|
GFP_KERNEL, cpu_to_node(cpu));
|
2011-07-15 10:35:52 +02:00
|
|
|
|
|
|
|
if (!sg)
|
|
|
|
goto fail;
|
|
|
|
|
|
|
|
sg_span = sched_group_cpus(sg);
|
|
|
|
if (child->child) {
|
|
|
|
child = child->child;
|
|
|
|
cpumask_copy(sg_span, sched_domain_span(child));
|
|
|
|
} else
|
|
|
|
cpumask_set_cpu(i, sg_span);
|
|
|
|
|
|
|
|
cpumask_or(covered, covered, sg_span);
|
|
|
|
|
2012-05-23 18:00:43 +02:00
|
|
|
sg->sgp = *per_cpu_ptr(sdd->sgp, i);
|
2012-05-31 14:47:33 +02:00
|
|
|
if (atomic_inc_return(&sg->sgp->ref) == 1)
|
|
|
|
build_group_mask(sd, sg);
|
|
|
|
|
2012-05-31 12:05:32 +02:00
|
|
|
/*
|
|
|
|
* Initialize sgp->power such that even if we mess up the
|
|
|
|
* domains and no possible iteration will get us here, we won't
|
|
|
|
* die on a /0 trap.
|
|
|
|
*/
|
|
|
|
sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
|
2011-07-15 10:35:52 +02:00
|
|
|
|
2012-05-31 14:47:33 +02:00
|
|
|
/*
|
|
|
|
* Make sure the first group of this domain contains the
|
|
|
|
* canonical balance cpu. Otherwise the sched_domain iteration
|
|
|
|
* breaks. See update_sg_lb_stats().
|
|
|
|
*/
|
2012-05-23 18:00:43 +02:00
|
|
|
if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
|
2012-05-31 14:47:33 +02:00
|
|
|
group_balance_cpu(sg) == cpu)
|
2011-07-15 10:35:52 +02:00
|
|
|
groups = sg;
|
|
|
|
|
|
|
|
if (!first)
|
|
|
|
first = sg;
|
|
|
|
if (last)
|
|
|
|
last->next = sg;
|
|
|
|
last = sg;
|
|
|
|
last->next = first;
|
|
|
|
}
|
|
|
|
sd->groups = groups;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
fail:
|
|
|
|
free_sched_groups(first, 0);
|
|
|
|
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
2011-04-07 14:09:50 +02:00
|
|
|
static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2011-04-07 14:09:50 +02:00
|
|
|
struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
|
|
|
|
struct sched_domain *child = sd->child;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2011-04-07 14:09:50 +02:00
|
|
|
if (child)
|
|
|
|
cpu = cpumask_first(sched_domain_span(child));
|
2006-03-27 01:15:22 -08:00
|
|
|
|
2011-07-14 13:00:06 +02:00
|
|
|
if (sg) {
|
2011-04-07 14:09:50 +02:00
|
|
|
*sg = *per_cpu_ptr(sdd->sg, cpu);
|
2011-07-14 13:00:06 +02:00
|
|
|
(*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
|
2011-07-15 10:35:52 +02:00
|
|
|
atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
|
2011-07-14 13:00:06 +02:00
|
|
|
}
|
2011-04-07 14:09:50 +02:00
|
|
|
|
|
|
|
return cpu;
|
2006-03-27 01:15:22 -08:00
|
|
|
}
|
|
|
|
|
2010-08-31 10:28:16 +02:00
|
|
|
/*
|
2011-04-07 14:09:50 +02:00
|
|
|
* build_sched_groups will build a circular linked list of the groups
|
|
|
|
* covered by the given span, and will set each group's ->cpumask correctly,
|
|
|
|
* and ->sgp->power to 0.
|
2011-07-15 10:35:52 +02:00
|
|
|
*
|
|
|
|
* Assumes the sched_domain tree is fully constructed
|
2010-08-31 10:28:16 +02:00
|
|
|
*/
|
2011-07-15 10:35:52 +02:00
|
|
|
static int
|
|
|
|
build_sched_groups(struct sched_domain *sd, int cpu)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2011-04-07 14:09:50 +02:00
|
|
|
struct sched_group *first = NULL, *last = NULL;
|
|
|
|
struct sd_data *sdd = sd->private;
|
|
|
|
const struct cpumask *span = sched_domain_span(sd);
|
2011-04-07 14:09:57 +02:00
|
|
|
struct cpumask *covered;
|
2011-04-07 14:09:50 +02:00
|
|
|
int i;
|
2005-09-06 15:18:14 -07:00
|
|
|
|
2011-07-15 10:35:52 +02:00
|
|
|
get_group(cpu, sdd, &sd->groups);
|
|
|
|
atomic_inc(&sd->groups->ref);
|
|
|
|
|
|
|
|
if (cpu != cpumask_first(sched_domain_span(sd)))
|
|
|
|
return 0;
|
|
|
|
|
2011-04-07 14:09:57 +02:00
|
|
|
lockdep_assert_held(&sched_domains_mutex);
|
|
|
|
covered = sched_domains_tmpmask;
|
|
|
|
|
2011-04-07 14:09:50 +02:00
|
|
|
cpumask_clear(covered);
|
2006-12-10 02:20:07 -08:00
|
|
|
|
2011-04-07 14:09:50 +02:00
|
|
|
for_each_cpu(i, span) {
|
|
|
|
struct sched_group *sg;
|
|
|
|
int group = get_group(i, sdd, &sg);
|
|
|
|
int j;
|
2006-12-10 02:20:07 -08:00
|
|
|
|
2011-04-07 14:09:50 +02:00
|
|
|
if (cpumask_test_cpu(i, covered))
|
|
|
|
continue;
|
2006-12-10 02:20:07 -08:00
|
|
|
|
2011-04-07 14:09:50 +02:00
|
|
|
cpumask_clear(sched_group_cpus(sg));
|
2011-07-14 13:00:06 +02:00
|
|
|
sg->sgp->power = 0;
|
2012-05-31 14:47:33 +02:00
|
|
|
cpumask_setall(sched_group_mask(sg));
|
2009-08-18 13:01:11 +02:00
|
|
|
|
2011-04-07 14:09:50 +02:00
|
|
|
for_each_cpu(j, span) {
|
|
|
|
if (get_group(j, sdd, NULL) != group)
|
|
|
|
continue;
|
2009-08-18 13:01:11 +02:00
|
|
|
|
2011-04-07 14:09:50 +02:00
|
|
|
cpumask_set_cpu(j, covered);
|
|
|
|
cpumask_set_cpu(j, sched_group_cpus(sg));
|
|
|
|
}
|
2009-08-18 13:01:11 +02:00
|
|
|
|
2011-04-07 14:09:50 +02:00
|
|
|
if (!first)
|
|
|
|
first = sg;
|
|
|
|
if (last)
|
|
|
|
last->next = sg;
|
|
|
|
last = sg;
|
|
|
|
}
|
|
|
|
last->next = first;
|
2011-07-15 10:35:52 +02:00
|
|
|
|
|
|
|
return 0;
|
2009-08-18 13:01:11 +02:00
|
|
|
}
|
2006-06-27 02:54:38 -07:00
|
|
|
|
2006-10-03 01:14:09 -07:00
|
|
|
/*
|
|
|
|
* Initialize sched groups cpu_power.
|
|
|
|
*
|
|
|
|
* cpu_power indicates the capacity of sched group, which is used while
|
|
|
|
* distributing the load between different sched groups in a sched domain.
|
|
|
|
* Typically cpu_power for all the groups in a sched domain will be same unless
|
|
|
|
* there are asymmetries in the topology. If there are asymmetries, group
|
|
|
|
* having more cpu_power will pickup more load compared to the group having
|
|
|
|
* less cpu_power.
|
|
|
|
*/
|
|
|
|
static void init_sched_groups_power(int cpu, struct sched_domain *sd)
|
|
|
|
{
|
2011-07-15 10:35:52 +02:00
|
|
|
struct sched_group *sg = sd->groups;
|
2006-10-03 01:14:09 -07:00
|
|
|
|
2011-07-15 10:35:52 +02:00
|
|
|
WARN_ON(!sd || !sg);
|
|
|
|
|
|
|
|
do {
|
|
|
|
sg->group_weight = cpumask_weight(sched_group_cpus(sg));
|
|
|
|
sg = sg->next;
|
|
|
|
} while (sg != sd->groups);
|
2006-10-03 01:14:09 -07:00
|
|
|
|
2012-05-31 14:47:33 +02:00
|
|
|
if (cpu != group_balance_cpu(sg))
|
2011-07-15 10:35:52 +02:00
|
|
|
return;
|
2010-09-17 15:02:32 -07:00
|
|
|
|
2011-04-07 14:09:43 +02:00
|
|
|
update_group_power(sd, cpu);
|
2011-12-01 17:07:33 -08:00
|
|
|
atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
|
2006-10-03 01:14:09 -07:00
|
|
|
}
|
|
|
|
|
2011-10-25 10:00:11 +02:00
|
|
|
/*
 * Weak default: evaluates to 0, i.e. SD_ASYM_PACKING is not requested.
 * Being __weak, an architecture can supply its own definition.
 */
int __weak arch_sd_sibling_asym_packing(void)
{
	return 0 * SD_ASYM_PACKING;
}
|
|
|
|
|
2008-04-04 18:11:11 -07:00
|
|
|
/*
|
|
|
|
* Initializers for schedule domains
|
|
|
|
* Non-inlined to reduce accumulated stack pressure in build_sched_domains()
|
|
|
|
*/
|
|
|
|
|
2008-10-09 11:35:51 +02:00
|
|
|
/*
 * SD_INIT_NAME(sd, type): with CONFIG_SCHED_DEBUG, store the stringified
 * @type in sd->name; otherwise expand to an empty statement.
 */
#ifdef CONFIG_SCHED_DEBUG
# define SD_INIT_NAME(sd, type)		sd->name = #type
#else
# define SD_INIT_NAME(sd, type)		do { } while (0)
#endif
|
|
|
|
|
2011-04-07 14:10:03 +02:00
|
|
|
/*
 * SD_INIT_FUNC(type) defines sd_init_<type>(): it fetches the per-cpu
 * sched_domain of @cpu from the topology level's storage, initialises it
 * from SD_<type>_INIT, names it, and points ->private back at that storage.
 */
#define SD_INIT_FUNC(type)						\
static noinline struct sched_domain *					\
sd_init_##type(struct sched_domain_topology_level *tl, int cpu)	\
{									\
	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);	\
									\
	*sd = SD_##type##_INIT;						\
	SD_INIT_NAME(sd, type);						\
	sd->private = &tl->data;					\
									\
	return sd;							\
}
|
|
|
|
|
|
|
|
SD_INIT_FUNC(CPU)
|
|
|
|
#ifdef CONFIG_SCHED_SMT
|
|
|
|
SD_INIT_FUNC(SIBLING)
|
|
|
|
#endif
|
|
|
|
#ifdef CONFIG_SCHED_MC
|
|
|
|
SD_INIT_FUNC(MC)
|
|
|
|
#endif
|
2010-08-31 10:28:16 +02:00
|
|
|
#ifdef CONFIG_SCHED_BOOK
|
|
|
|
SD_INIT_FUNC(BOOK)
|
|
|
|
#endif
|
2008-04-04 18:11:11 -07:00
|
|
|
|
2008-04-15 14:04:23 +09:00
|
|
|
/*
 * Default relax level, written by the "relax_domain_level=" boot parameter.
 * A negative value (the initial -1) means no default has been requested.
 */
static int default_relax_domain_level = -1;
|
2011-04-07 14:10:04 +02:00
|
|
|
int sched_domain_level_max;
|
2008-04-15 14:04:23 +09:00
|
|
|
|
|
|
|
/*
 * Handler for the "relax_domain_level=" boot parameter: parse @str as an
 * integer into default_relax_domain_level, warning if that fails.
 * Always returns 1.
 */
static int __init setup_relax_domain_level(char *str)
{
	int err = kstrtoint(str, 0, &default_relax_domain_level);

	if (err)
		pr_warn("Unable to set relax_domain_level\n");

	return 1;
}
|
|
|
|
__setup("relax_domain_level=", setup_relax_domain_level);
|
|
|
|
|
|
|
|
static void set_domain_attribute(struct sched_domain *sd,
|
|
|
|
struct sched_domain_attr *attr)
|
|
|
|
{
|
|
|
|
int request;
|
|
|
|
|
|
|
|
if (!attr || attr->relax_domain_level < 0) {
|
|
|
|
if (default_relax_domain_level < 0)
|
|
|
|
return;
|
|
|
|
else
|
|
|
|
request = default_relax_domain_level;
|
|
|
|
} else
|
|
|
|
request = attr->relax_domain_level;
|
|
|
|
if (request < sd->level) {
|
|
|
|
/* turn off idle balance on this domain */
|
sched: Merge select_task_rq_fair() and sched_balance_self()
The problem with wake_idle() is that is doesn't respect things like
cpu_power, which means it doesn't deal well with SMT nor the recent
RT interaction.
To cure this, it needs to do what sched_balance_self() does, which
leads to the possibility of merging select_task_rq_fair() and
sched_balance_self().
Modify sched_balance_self() to:
- update_shares() when walking up the domain tree,
(it only called it for the top domain, but it should
have done this anyway), which allows us to remove
this ugly bit from try_to_wake_up().
- do wake_affine() on the smallest domain that contains
both this (the waking) and the prev (the wakee) cpu for
WAKE invocations.
Then use the top-down balance steps it had to replace wake_idle().
This leads to the dissapearance of SD_WAKE_BALANCE and
SD_WAKE_IDLE_FAR, with SD_WAKE_IDLE replaced with SD_BALANCE_WAKE.
SD_WAKE_AFFINE needs SD_BALANCE_WAKE to be effective.
Touch all topology bits to replace the old with new SD flags --
platforms might need re-tuning, enabling SD_BALANCE_WAKE
conditionally on a NUMA distance seems like a good additional
feature, magny-core and small nehalem systems would want this
enabled, systems with slow interconnects would not.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-10 13:50:02 +02:00
|
|
|
sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
|
2008-04-15 14:04:23 +09:00
|
|
|
} else {
|
|
|
|
/* turn on idle balance on this domain */
|
sched: Merge select_task_rq_fair() and sched_balance_self()
The problem with wake_idle() is that is doesn't respect things like
cpu_power, which means it doesn't deal well with SMT nor the recent
RT interaction.
To cure this, it needs to do what sched_balance_self() does, which
leads to the possibility of merging select_task_rq_fair() and
sched_balance_self().
Modify sched_balance_self() to:
- update_shares() when walking up the domain tree,
(it only called it for the top domain, but it should
have done this anyway), which allows us to remove
this ugly bit from try_to_wake_up().
- do wake_affine() on the smallest domain that contains
both this (the waking) and the prev (the wakee) cpu for
WAKE invocations.
Then use the top-down balance steps it had to replace wake_idle().
This leads to the dissapearance of SD_WAKE_BALANCE and
SD_WAKE_IDLE_FAR, with SD_WAKE_IDLE replaced with SD_BALANCE_WAKE.
SD_WAKE_AFFINE needs SD_BALANCE_WAKE to be effective.
Touch all topology bits to replace the old with new SD flags --
platforms might need re-tuning, enabling SD_BALANCE_WAKE
conditionally on a NUMA distance seems like a good additional
feature, magny-core and small nehalem systems would want this
enabled, systems with slow interconnects would not.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-10 13:50:02 +02:00
|
|
|
sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
|
2008-04-15 14:04:23 +09:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-04-07 14:10:03 +02:00
|
|
|
static void __sdt_free(const struct cpumask *cpu_map);
|
|
|
|
static int __sdt_alloc(const struct cpumask *cpu_map);
|
|
|
|
|
2009-08-18 12:53:00 +02:00
|
|
|
static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
|
|
|
|
const struct cpumask *cpu_map)
|
|
|
|
{
|
|
|
|
switch (what) {
|
|
|
|
case sa_rootdomain:
|
2011-04-07 14:09:51 +02:00
|
|
|
if (!atomic_read(&d->rd->refcount))
|
|
|
|
free_rootdomain(&d->rd->rcu); /* fall through */
|
2011-04-07 14:09:48 +02:00
|
|
|
case sa_sd:
|
|
|
|
free_percpu(d->sd); /* fall through */
|
2011-04-07 14:09:50 +02:00
|
|
|
case sa_sd_storage:
|
2011-04-07 14:10:03 +02:00
|
|
|
__sdt_free(cpu_map); /* fall through */
|
2009-08-18 12:53:00 +02:00
|
|
|
case sa_none:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2008-11-25 02:35:03 +10:30
|
|
|
|
2009-08-18 12:53:00 +02:00
|
|
|
static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
|
|
|
|
const struct cpumask *cpu_map)
|
|
|
|
{
|
2011-04-07 14:09:50 +02:00
|
|
|
memset(d, 0, sizeof(*d));
|
|
|
|
|
2011-04-07 14:10:03 +02:00
|
|
|
if (__sdt_alloc(cpu_map))
|
|
|
|
return sa_sd_storage;
|
2011-04-07 14:09:50 +02:00
|
|
|
d->sd = alloc_percpu(struct sched_domain *);
|
|
|
|
if (!d->sd)
|
|
|
|
return sa_sd_storage;
|
2009-08-18 12:53:00 +02:00
|
|
|
d->rd = alloc_rootdomain();
|
2011-04-07 14:09:50 +02:00
|
|
|
if (!d->rd)
|
2011-04-07 14:09:48 +02:00
|
|
|
return sa_sd;
|
2009-08-18 12:53:00 +02:00
|
|
|
return sa_rootdomain;
|
|
|
|
}
|
2008-01-25 21:08:18 +01:00
|
|
|
|
2011-04-07 14:09:50 +02:00
|
|
|
/*
|
|
|
|
* NULL the sd_data elements we've used to build the sched_domain and
|
|
|
|
* sched_group structure so that the subsequent __free_domain_allocs()
|
|
|
|
* will not free the data we're using.
|
|
|
|
*/
|
|
|
|
static void claim_allocations(int cpu, struct sched_domain *sd)
|
|
|
|
{
|
|
|
|
struct sd_data *sdd = sd->private;
|
|
|
|
|
|
|
|
WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
|
|
|
|
*per_cpu_ptr(sdd->sd, cpu) = NULL;
|
|
|
|
|
2011-07-15 10:35:52 +02:00
|
|
|
if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
|
2011-04-07 14:09:50 +02:00
|
|
|
*per_cpu_ptr(sdd->sg, cpu) = NULL;
|
2011-07-15 10:35:52 +02:00
|
|
|
|
|
|
|
if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
|
2011-07-14 13:00:06 +02:00
|
|
|
*per_cpu_ptr(sdd->sgp, cpu) = NULL;
|
2011-04-07 14:09:50 +02:00
|
|
|
}
|
|
|
|
|
2011-04-07 14:10:01 +02:00
|
|
|
#ifdef CONFIG_SCHED_SMT
/* Mask callback for the SIBLING level: the thread siblings of @cpu. */
static const struct cpumask *cpu_smt_mask(int cpu)
{
	const struct cpumask *siblings = topology_thread_cpumask(cpu);

	return siblings;
}
#endif
|
2009-08-18 12:54:06 +02:00
|
|
|
|
2011-04-07 14:10:02 +02:00
|
|
|
/*
|
|
|
|
* Topology list, bottom-up.
|
|
|
|
*/
|
2011-04-07 14:10:01 +02:00
|
|
|
static struct sched_domain_topology_level default_topology[] = {
|
2011-04-07 14:10:02 +02:00
|
|
|
#ifdef CONFIG_SCHED_SMT
|
|
|
|
{ sd_init_SIBLING, cpu_smt_mask, },
|
2010-08-31 10:28:16 +02:00
|
|
|
#endif
|
2006-03-27 01:15:22 -08:00
|
|
|
#ifdef CONFIG_SCHED_MC
|
2011-04-07 14:10:01 +02:00
|
|
|
{ sd_init_MC, cpu_coregroup_mask, },
|
2006-03-27 01:15:22 -08:00
|
|
|
#endif
|
2011-04-07 14:10:02 +02:00
|
|
|
#ifdef CONFIG_SCHED_BOOK
|
|
|
|
{ sd_init_BOOK, cpu_book_mask, },
|
|
|
|
#endif
|
|
|
|
{ sd_init_CPU, cpu_cpu_mask, },
|
2011-04-07 14:10:00 +02:00
|
|
|
{ NULL, },
|
|
|
|
};
|
|
|
|
|
|
|
|
static struct sched_domain_topology_level *sched_domain_topology = default_topology;
|
|
|
|
|
2012-04-17 15:49:36 +02:00
|
|
|
#ifdef CONFIG_NUMA
|
|
|
|
|
|
|
|
static int sched_domains_numa_levels;
|
|
|
|
static int *sched_domains_numa_distance;
|
|
|
|
static struct cpumask ***sched_domains_numa_masks;
|
|
|
|
static int sched_domains_curr_level;
|
|
|
|
|
|
|
|
static inline int sd_local_flags(int level)
|
|
|
|
{
|
sched/numa: Load balance between remote nodes
Commit cb83b629b ("sched/numa: Rewrite the CONFIG_NUMA sched
domain support") removed the NODE sched domain and started checking
if the node distance in SLIT table is farther than REMOTE_DISTANCE,
if so, it will lose the load balance chance at exec/fork/wake_affine
points.
But actually, even the node distance is farther than REMOTE_DISTANCE.
Modern CPUs also has QPI like connections, which ensures that memory
access is not too slow between nodes. So the above change in behavior
on NUMA machine causes a performance regression on various benchmarks:
hackbench, tbench, netperf, oltp, etc.
This patch will recover the scheduler behavior to old mode on all my
Intel platforms: NHM EP/EX, WSM EP, SNB EP/EP4S, and thus fixes the
perfromance regressions. (all of them just have 2 kinds distance, 10, 21)
Signed-off-by: Alex Shi <alex.shi@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1338965571-9812-1-git-send-email-alex.shi@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2012-06-06 14:52:51 +08:00
|
|
|
if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
|
2012-04-17 15:49:36 +02:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct sched_domain *
|
|
|
|
sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
|
|
|
|
{
|
|
|
|
struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
|
|
|
|
int level = tl->numa_level;
|
|
|
|
int sd_weight = cpumask_weight(
|
|
|
|
sched_domains_numa_masks[level][cpu_to_node(cpu)]);
|
|
|
|
|
|
|
|
*sd = (struct sched_domain){
|
|
|
|
.min_interval = sd_weight,
|
|
|
|
.max_interval = 2*sd_weight,
|
|
|
|
.busy_factor = 32,
|
2012-05-11 00:26:27 +02:00
|
|
|
.imbalance_pct = 125,
|
2012-04-17 15:49:36 +02:00
|
|
|
.cache_nice_tries = 2,
|
|
|
|
.busy_idx = 3,
|
|
|
|
.idle_idx = 2,
|
|
|
|
.newidle_idx = 0,
|
|
|
|
.wake_idx = 0,
|
|
|
|
.forkexec_idx = 0,
|
|
|
|
|
|
|
|
.flags = 1*SD_LOAD_BALANCE
|
|
|
|
| 1*SD_BALANCE_NEWIDLE
|
|
|
|
| 0*SD_BALANCE_EXEC
|
|
|
|
| 0*SD_BALANCE_FORK
|
|
|
|
| 0*SD_BALANCE_WAKE
|
|
|
|
| 0*SD_WAKE_AFFINE
|
|
|
|
| 0*SD_PREFER_LOCAL
|
|
|
|
| 0*SD_SHARE_CPUPOWER
|
|
|
|
| 0*SD_SHARE_PKG_RESOURCES
|
|
|
|
| 1*SD_SERIALIZE
|
|
|
|
| 0*SD_PREFER_SIBLING
|
|
|
|
| sd_local_flags(level)
|
|
|
|
,
|
|
|
|
.last_balance = jiffies,
|
|
|
|
.balance_interval = sd_weight,
|
|
|
|
};
|
|
|
|
SD_INIT_NAME(sd, NUMA);
|
|
|
|
sd->private = &tl->data;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Ugly hack to pass state to sd_numa_mask()...
|
|
|
|
*/
|
|
|
|
sched_domains_curr_level = tl->numa_level;
|
|
|
|
|
|
|
|
return sd;
|
|
|
|
}
|
|
|
|
|
|
|
|
static const struct cpumask *sd_numa_mask(int cpu)
|
|
|
|
{
|
|
|
|
return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
|
|
|
|
}
|
|
|
|
|
2012-05-31 21:20:16 +02:00
|
|
|
static void sched_numa_warn(const char *str)
|
|
|
|
{
|
|
|
|
static int done = false;
|
|
|
|
int i,j;
|
|
|
|
|
|
|
|
if (done)
|
|
|
|
return;
|
|
|
|
|
|
|
|
done = true;
|
|
|
|
|
|
|
|
printk(KERN_WARNING "ERROR: %s\n\n", str);
|
|
|
|
|
|
|
|
for (i = 0; i < nr_node_ids; i++) {
|
|
|
|
printk(KERN_WARNING " ");
|
|
|
|
for (j = 0; j < nr_node_ids; j++)
|
|
|
|
printk(KERN_CONT "%02d ", node_distance(i,j));
|
|
|
|
printk(KERN_CONT "\n");
|
|
|
|
}
|
|
|
|
printk(KERN_WARNING "\n");
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool find_numa_distance(int distance)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
if (distance == node_distance(0, 0))
|
|
|
|
return true;
|
|
|
|
|
|
|
|
for (i = 0; i < sched_domains_numa_levels; i++) {
|
|
|
|
if (sched_domains_numa_distance[i] == distance)
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2012-04-17 15:49:36 +02:00
|
|
|
static void sched_init_numa(void)
|
|
|
|
{
|
|
|
|
int next_distance, curr_distance = node_distance(0, 0);
|
|
|
|
struct sched_domain_topology_level *tl;
|
|
|
|
int level = 0;
|
|
|
|
int i, j, k;
|
|
|
|
|
|
|
|
sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
|
|
|
|
if (!sched_domains_numa_distance)
|
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* O(nr_nodes^2) deduplicating selection sort -- in order to find the
|
|
|
|
* unique distances in the node_distance() table.
|
|
|
|
*
|
|
|
|
* Assumes node_distance(0,j) includes all distances in
|
|
|
|
* node_distance(i,j) in order to avoid cubic time.
|
|
|
|
*/
|
|
|
|
next_distance = curr_distance;
|
|
|
|
for (i = 0; i < nr_node_ids; i++) {
|
|
|
|
for (j = 0; j < nr_node_ids; j++) {
|
2012-05-31 21:20:16 +02:00
|
|
|
for (k = 0; k < nr_node_ids; k++) {
|
|
|
|
int distance = node_distance(i, k);
|
|
|
|
|
|
|
|
if (distance > curr_distance &&
|
|
|
|
(distance < next_distance ||
|
|
|
|
next_distance == curr_distance))
|
|
|
|
next_distance = distance;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* While not a strong assumption it would be nice to know
|
|
|
|
* about cases where if node A is connected to B, B is not
|
|
|
|
* equally connected to A.
|
|
|
|
*/
|
|
|
|
if (sched_debug() && node_distance(k, i) != distance)
|
|
|
|
sched_numa_warn("Node-distance not symmetric");
|
|
|
|
|
|
|
|
if (sched_debug() && i && !find_numa_distance(distance))
|
|
|
|
sched_numa_warn("Node-0 not representative");
|
|
|
|
}
|
|
|
|
if (next_distance != curr_distance) {
|
|
|
|
sched_domains_numa_distance[level++] = next_distance;
|
|
|
|
sched_domains_numa_levels = level;
|
|
|
|
curr_distance = next_distance;
|
|
|
|
} else break;
|
2012-04-17 15:49:36 +02:00
|
|
|
}
|
2012-05-31 21:20:16 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* In case of sched_debug() we verify the above assumption.
|
|
|
|
*/
|
|
|
|
if (!sched_debug())
|
|
|
|
break;
|
2012-04-17 15:49:36 +02:00
|
|
|
}
|
|
|
|
/*
|
|
|
|
* 'level' contains the number of unique distances, excluding the
|
|
|
|
* identity distance node_distance(i,i).
|
|
|
|
*
|
|
|
|
* The sched_domains_nume_distance[] array includes the actual distance
|
|
|
|
* numbers.
|
|
|
|
*/
|
|
|
|
|
|
|
|
sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
|
|
|
|
if (!sched_domains_numa_masks)
|
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Now for each level, construct a mask per node which contains all
|
|
|
|
* cpus of nodes that are that many hops away from us.
|
|
|
|
*/
|
|
|
|
for (i = 0; i < level; i++) {
|
|
|
|
sched_domains_numa_masks[i] =
|
|
|
|
kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
|
|
|
|
if (!sched_domains_numa_masks[i])
|
|
|
|
return;
|
|
|
|
|
|
|
|
for (j = 0; j < nr_node_ids; j++) {
|
2012-05-25 09:26:43 +02:00
|
|
|
struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
|
2012-04-17 15:49:36 +02:00
|
|
|
if (!mask)
|
|
|
|
return;
|
|
|
|
|
|
|
|
sched_domains_numa_masks[i][j] = mask;
|
|
|
|
|
|
|
|
for (k = 0; k < nr_node_ids; k++) {
|
2012-05-11 00:56:20 +02:00
|
|
|
if (node_distance(j, k) > sched_domains_numa_distance[i])
|
2012-04-17 15:49:36 +02:00
|
|
|
continue;
|
|
|
|
|
|
|
|
cpumask_or(mask, mask, cpumask_of_node(k));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
|
|
|
|
sizeof(struct sched_domain_topology_level), GFP_KERNEL);
|
|
|
|
if (!tl)
|
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Copy the default topology bits..
|
|
|
|
*/
|
|
|
|
for (i = 0; default_topology[i].init; i++)
|
|
|
|
tl[i] = default_topology[i];
|
|
|
|
|
|
|
|
/*
|
|
|
|
* .. and append 'j' levels of NUMA goodness.
|
|
|
|
*/
|
|
|
|
for (j = 0; j < level; i++, j++) {
|
|
|
|
tl[i] = (struct sched_domain_topology_level){
|
|
|
|
.init = sd_numa_init,
|
|
|
|
.mask = sd_numa_mask,
|
|
|
|
.flags = SDTL_OVERLAP,
|
|
|
|
.numa_level = j,
|
|
|
|
};
|
|
|
|
}
|
|
|
|
|
|
|
|
sched_domain_topology = tl;
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
/* !CONFIG_NUMA: there are no NUMA levels to set up. */
static inline void sched_init_numa(void)
{
}
|
|
|
|
#endif /* CONFIG_NUMA */
|
|
|
|
|
2011-04-07 14:10:03 +02:00
|
|
|
static int __sdt_alloc(const struct cpumask *cpu_map)
|
|
|
|
{
|
|
|
|
struct sched_domain_topology_level *tl;
|
|
|
|
int j;
|
|
|
|
|
|
|
|
for (tl = sched_domain_topology; tl->init; tl++) {
|
|
|
|
struct sd_data *sdd = &tl->data;
|
|
|
|
|
|
|
|
sdd->sd = alloc_percpu(struct sched_domain *);
|
|
|
|
if (!sdd->sd)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
sdd->sg = alloc_percpu(struct sched_group *);
|
|
|
|
if (!sdd->sg)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2011-07-14 13:00:06 +02:00
|
|
|
sdd->sgp = alloc_percpu(struct sched_group_power *);
|
|
|
|
if (!sdd->sgp)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2011-04-07 14:10:03 +02:00
|
|
|
for_each_cpu(j, cpu_map) {
|
|
|
|
struct sched_domain *sd;
|
|
|
|
struct sched_group *sg;
|
2011-07-14 13:00:06 +02:00
|
|
|
struct sched_group_power *sgp;
|
2011-04-07 14:10:03 +02:00
|
|
|
|
|
|
|
sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
|
|
|
|
GFP_KERNEL, cpu_to_node(j));
|
|
|
|
if (!sd)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
*per_cpu_ptr(sdd->sd, j) = sd;
|
|
|
|
|
|
|
|
sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
|
|
|
|
GFP_KERNEL, cpu_to_node(j));
|
|
|
|
if (!sg)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2012-05-09 12:38:28 +02:00
|
|
|
sg->next = sg;
|
|
|
|
|
2011-04-07 14:10:03 +02:00
|
|
|
*per_cpu_ptr(sdd->sg, j) = sg;
|
2011-07-14 13:00:06 +02:00
|
|
|
|
2012-05-31 14:47:33 +02:00
|
|
|
sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
|
2011-07-14 13:00:06 +02:00
|
|
|
GFP_KERNEL, cpu_to_node(j));
|
|
|
|
if (!sgp)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
*per_cpu_ptr(sdd->sgp, j) = sgp;
|
2011-04-07 14:10:03 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void __sdt_free(const struct cpumask *cpu_map)
|
|
|
|
{
|
|
|
|
struct sched_domain_topology_level *tl;
|
|
|
|
int j;
|
|
|
|
|
|
|
|
for (tl = sched_domain_topology; tl->init; tl++) {
|
|
|
|
struct sd_data *sdd = &tl->data;
|
|
|
|
|
|
|
|
for_each_cpu(j, cpu_map) {
|
2012-04-25 19:59:21 +08:00
|
|
|
struct sched_domain *sd;
|
|
|
|
|
|
|
|
if (sdd->sd) {
|
|
|
|
sd = *per_cpu_ptr(sdd->sd, j);
|
|
|
|
if (sd && (sd->flags & SD_OVERLAP))
|
|
|
|
free_sched_groups(sd->groups, 0);
|
|
|
|
kfree(*per_cpu_ptr(sdd->sd, j));
|
|
|
|
}
|
|
|
|
|
|
|
|
if (sdd->sg)
|
|
|
|
kfree(*per_cpu_ptr(sdd->sg, j));
|
|
|
|
if (sdd->sgp)
|
|
|
|
kfree(*per_cpu_ptr(sdd->sgp, j));
|
2011-04-07 14:10:03 +02:00
|
|
|
}
|
|
|
|
free_percpu(sdd->sd);
|
2012-04-25 19:59:21 +08:00
|
|
|
sdd->sd = NULL;
|
2011-04-07 14:10:03 +02:00
|
|
|
free_percpu(sdd->sg);
|
2012-04-25 19:59:21 +08:00
|
|
|
sdd->sg = NULL;
|
2011-07-14 13:00:06 +02:00
|
|
|
free_percpu(sdd->sgp);
|
2012-04-25 19:59:21 +08:00
|
|
|
sdd->sgp = NULL;
|
2011-04-07 14:10:03 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-04-07 14:10:01 +02:00
|
|
|
struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
|
|
|
|
struct s_data *d, const struct cpumask *cpu_map,
|
2011-04-07 14:10:02 +02:00
|
|
|
struct sched_domain_attr *attr, struct sched_domain *child,
|
2011-04-07 14:10:01 +02:00
|
|
|
int cpu)
|
|
|
|
{
|
2011-04-07 14:10:03 +02:00
|
|
|
struct sched_domain *sd = tl->init(tl, cpu);
|
2011-04-07 14:10:01 +02:00
|
|
|
if (!sd)
|
2011-04-07 14:10:02 +02:00
|
|
|
return child;
|
2011-04-07 14:10:01 +02:00
|
|
|
|
|
|
|
cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
|
2011-04-07 14:10:04 +02:00
|
|
|
if (child) {
|
|
|
|
sd->level = child->level + 1;
|
|
|
|
sched_domain_level_max = max(sched_domain_level_max, sd->level);
|
2011-04-07 14:10:02 +02:00
|
|
|
child->parent = sd;
|
2011-04-07 14:10:04 +02:00
|
|
|
}
|
2011-04-07 14:10:02 +02:00
|
|
|
sd->child = child;
|
2012-06-05 13:44:36 -05:00
|
|
|
set_domain_attribute(sd, attr);
|
2011-04-07 14:10:01 +02:00
|
|
|
|
|
|
|
return sd;
|
|
|
|
}
|
|
|
|
|
2009-08-18 12:53:00 +02:00
|
|
|
/*
|
|
|
|
* Build sched domains for a given set of cpus and attach the sched domains
|
|
|
|
* to the individual cpus
|
|
|
|
*/
|
2011-04-07 14:09:50 +02:00
|
|
|
static int build_sched_domains(const struct cpumask *cpu_map,
|
|
|
|
struct sched_domain_attr *attr)
|
2009-08-18 12:53:00 +02:00
|
|
|
{
|
|
|
|
enum s_alloc alloc_state = sa_none;
|
2011-04-07 14:09:50 +02:00
|
|
|
struct sched_domain *sd;
|
2009-08-18 12:53:00 +02:00
|
|
|
struct s_data d;
|
2011-04-07 14:09:51 +02:00
|
|
|
int i, ret = -ENOMEM;
|
2005-09-06 15:18:14 -07:00
|
|
|
|
2009-08-18 12:53:00 +02:00
|
|
|
alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
|
|
|
|
if (alloc_state != sa_rootdomain)
|
|
|
|
goto error;
|
2005-09-06 15:18:14 -07:00
|
|
|
|
2011-04-07 14:09:50 +02:00
|
|
|
/* Set up domains for cpus specified by the cpu_map. */
|
2008-11-25 02:35:02 +10:30
|
|
|
for_each_cpu(i, cpu_map) {
|
2011-04-07 14:10:00 +02:00
|
|
|
struct sched_domain_topology_level *tl;
|
|
|
|
|
2011-04-07 14:09:54 +02:00
|
|
|
sd = NULL;
|
2011-07-15 10:35:52 +02:00
|
|
|
for (tl = sched_domain_topology; tl->init; tl++) {
|
2011-04-07 14:10:01 +02:00
|
|
|
sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
|
2011-07-15 10:35:52 +02:00
|
|
|
if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
|
|
|
|
sd->flags |= SD_OVERLAP;
|
2011-07-20 18:42:57 +02:00
|
|
|
if (cpumask_equal(cpu_map, sched_domain_span(sd)))
|
|
|
|
break;
|
2011-07-15 10:35:52 +02:00
|
|
|
}
|
2011-04-07 14:09:43 +02:00
|
|
|
|
2011-04-07 14:10:02 +02:00
|
|
|
while (sd->child)
|
|
|
|
sd = sd->child;
|
|
|
|
|
2011-04-07 14:09:48 +02:00
|
|
|
*per_cpu_ptr(d.sd, i) = sd;
|
2011-04-07 14:09:50 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Build the groups for the domains */
|
|
|
|
for_each_cpu(i, cpu_map) {
|
|
|
|
for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
|
|
|
|
sd->span_weight = cpumask_weight(sched_domain_span(sd));
|
2011-07-15 10:35:52 +02:00
|
|
|
if (sd->flags & SD_OVERLAP) {
|
|
|
|
if (build_overlap_sched_groups(sd, i))
|
|
|
|
goto error;
|
|
|
|
} else {
|
|
|
|
if (build_sched_groups(sd, i))
|
|
|
|
goto error;
|
|
|
|
}
|
2011-04-07 14:09:47 +02:00
|
|
|
}
|
2011-04-07 14:09:44 +02:00
|
|
|
}
|
2005-09-06 15:18:14 -07:00
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/* Calculate CPU power for physical packages and nodes */
|
2011-04-07 14:09:49 +02:00
|
|
|
for (i = nr_cpumask_bits-1; i >= 0; i--) {
|
|
|
|
if (!cpumask_test_cpu(i, cpu_map))
|
|
|
|
continue;
|
2005-09-06 15:18:14 -07:00
|
|
|
|
2011-04-07 14:09:50 +02:00
|
|
|
for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
|
|
|
|
claim_allocations(i, sd);
|
2011-04-07 14:09:45 +02:00
|
|
|
init_sched_groups_power(i, sd);
|
2011-04-07 14:09:50 +02:00
|
|
|
}
|
2006-07-30 03:02:59 -07:00
|
|
|
}
|
2005-09-06 15:18:14 -07:00
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/* Attach the domains */
|
2011-04-07 14:09:50 +02:00
|
|
|
rcu_read_lock();
|
2008-11-25 02:35:02 +10:30
|
|
|
for_each_cpu(i, cpu_map) {
|
2011-04-07 14:09:48 +02:00
|
|
|
sd = *per_cpu_ptr(d.sd, i);
|
2009-08-18 12:51:52 +02:00
|
|
|
cpu_attach_domain(sd, d.rd, i);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
2011-04-07 14:09:50 +02:00
|
|
|
rcu_read_unlock();
|
2006-06-27 02:54:38 -07:00
|
|
|
|
2011-04-07 14:09:51 +02:00
|
|
|
ret = 0;
|
2006-06-27 02:54:38 -07:00
|
|
|
error:
|
2009-08-18 12:53:00 +02:00
|
|
|
__free_domain_allocs(&d, alloc_state, cpu_map);
|
2011-04-07 14:09:51 +02:00
|
|
|
return ret;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
2007-10-18 23:40:20 -07:00
|
|
|
|
2009-11-03 14:53:40 +10:30
|
|
|
static cpumask_var_t *doms_cur; /* current sched domains */
|
2007-10-18 23:40:20 -07:00
|
|
|
static int ndoms_cur; /* number of sched domains in 'doms_cur' */
|
2008-05-16 17:47:14 +02:00
|
|
|
static struct sched_domain_attr *dattr_cur;
|
|
|
|
/* attributes of custom domains in 'doms_cur' */
|
2007-10-18 23:40:20 -07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Special case: If a kmalloc of a doms_cur partition (array of
|
2008-11-25 02:35:12 +10:30
|
|
|
* cpumask) fails, then fallback to a single sched domain,
|
|
|
|
* as determined by the single cpumask fallback_doms.
|
2007-10-18 23:40:20 -07:00
|
|
|
*/
|
2008-11-25 02:35:12 +10:30
|
|
|
static cpumask_var_t fallback_doms;
|
2007-10-18 23:40:20 -07:00
|
|
|
|
2008-12-09 18:49:50 +01:00
|
|
|
/*
 * Hook through which virtualized architectures can refresh the cpu core
 * maps.  An implementation is expected to return 1 when the topology
 * changed and 0 when it did not.
 *
 * This weak default does nothing and reports "unchanged".
 */
int __attribute__((weak)) arch_update_cpu_topology(void)
{
	return 0;
}
|
|
|
|
|
2009-11-03 14:53:40 +10:30
|
|
|
cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
cpumask_var_t *doms;
|
|
|
|
|
|
|
|
doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
|
|
|
|
if (!doms)
|
|
|
|
return NULL;
|
|
|
|
for (i = 0; i < ndoms; i++) {
|
|
|
|
if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
|
|
|
|
free_sched_domains(doms, i);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return doms;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Free the first @ndoms cpumasks of @doms and then the array itself.
 */
void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
{
	unsigned int idx;

	for (idx = 0; idx < ndoms; idx++) {
		free_cpumask_var(doms[idx]);
	}

	kfree(doms);
}
|
|
|
|
|
2005-06-25 14:57:33 -07:00
|
|
|
/*
|
2007-12-05 15:46:09 +01:00
|
|
|
* Set up scheduler domains and groups. Callers must hold the hotplug lock.
|
2007-10-18 23:40:20 -07:00
|
|
|
* For now this just excludes isolated cpus, but could be used to
|
|
|
|
* exclude other special cases in the future.
|
2005-06-25 14:57:33 -07:00
|
|
|
*/
|
2011-04-07 14:09:42 +02:00
|
|
|
static int init_sched_domains(const struct cpumask *cpu_map)
|
2005-06-25 14:57:33 -07:00
|
|
|
{
|
2007-10-24 18:23:48 +02:00
|
|
|
int err;
|
|
|
|
|
2008-03-12 18:31:59 +01:00
|
|
|
arch_update_cpu_topology();
|
2007-10-18 23:40:20 -07:00
|
|
|
ndoms_cur = 1;
|
2009-11-03 14:53:40 +10:30
|
|
|
doms_cur = alloc_sched_domains(ndoms_cur);
|
2007-10-18 23:40:20 -07:00
|
|
|
if (!doms_cur)
|
2009-11-03 14:53:40 +10:30
|
|
|
doms_cur = &fallback_doms;
|
|
|
|
cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
|
2011-04-07 14:09:50 +02:00
|
|
|
err = build_sched_domains(doms_cur[0], NULL);
|
2007-10-15 17:00:19 +02:00
|
|
|
register_sched_domain_sysctl();
|
2007-10-24 18:23:48 +02:00
|
|
|
|
|
|
|
return err;
|
2005-06-25 14:57:33 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Detach sched domains from a group of cpus specified in cpu_map
|
|
|
|
* These cpus will now be attached to the NULL domain
|
|
|
|
*/
|
2008-11-25 02:35:14 +10:30
|
|
|
static void detach_destroy_domains(const struct cpumask *cpu_map)
|
2005-06-25 14:57:33 -07:00
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
2011-04-07 14:09:50 +02:00
|
|
|
rcu_read_lock();
|
2008-11-25 02:35:02 +10:30
|
|
|
for_each_cpu(i, cpu_map)
|
2008-01-25 21:08:18 +01:00
|
|
|
cpu_attach_domain(NULL, &def_root_domain, i);
|
2011-04-07 14:09:50 +02:00
|
|
|
rcu_read_unlock();
|
2005-06-25 14:57:33 -07:00
|
|
|
}
|
|
|
|
|
2008-04-15 14:04:23 +09:00
|
|
|
/* handle null as "default" */
|
|
|
|
static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
|
|
|
|
struct sched_domain_attr *new, int idx_new)
|
|
|
|
{
|
|
|
|
struct sched_domain_attr tmp;
|
|
|
|
|
|
|
|
/* fast path */
|
|
|
|
if (!new && !cur)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
tmp = SD_ATTR_INIT;
|
|
|
|
return !memcmp(cur ? (cur + idx_cur) : &tmp,
|
|
|
|
new ? (new + idx_new) : &tmp,
|
|
|
|
sizeof(struct sched_domain_attr));
|
|
|
|
}
|
|
|
|
|
2007-10-18 23:40:20 -07:00
|
|
|
/*
|
|
|
|
* Partition sched domains as specified by the 'ndoms_new'
|
2007-12-05 15:46:09 +01:00
|
|
|
* cpumasks in the array doms_new[] of cpumasks. This compares
|
2007-10-18 23:40:20 -07:00
|
|
|
* doms_new[] to the current sched domain partitioning, doms_cur[].
|
|
|
|
* It destroys each deleted domain and builds each new domain.
|
|
|
|
*
|
2009-11-03 14:53:40 +10:30
|
|
|
* 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
|
2007-12-05 15:46:09 +01:00
|
|
|
* The masks don't intersect (don't overlap.) We should setup one
|
|
|
|
* sched domain for each mask. CPUs not in any of the cpumasks will
|
|
|
|
* not be load balanced. If the same cpumask appears both in the
|
2007-10-18 23:40:20 -07:00
|
|
|
* current 'doms_cur' domains and in the new 'doms_new', we can leave
|
|
|
|
* it as it is.
|
|
|
|
*
|
2009-11-03 14:53:40 +10:30
|
|
|
* The passed in 'doms_new' should be allocated using
|
|
|
|
* alloc_sched_domains. This routine takes ownership of it and will
|
|
|
|
* free_sched_domains it when done with it. If the caller failed the
|
|
|
|
* alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
|
|
|
|
* and partition_sched_domains() will fallback to the single partition
|
|
|
|
* 'fallback_doms', it also forces the domains to be rebuilt.
|
2007-10-18 23:40:20 -07:00
|
|
|
*
|
2008-11-25 02:35:14 +10:30
|
|
|
* If doms_new == NULL it will be replaced with cpu_online_mask.
|
2008-11-18 14:02:03 +08:00
|
|
|
* ndoms_new == 0 is a special case for destroying existing domains,
|
|
|
|
* and it will not create the default domain.
|
2008-08-29 13:11:41 -07:00
|
|
|
*
|
2007-10-18 23:40:20 -07:00
|
|
|
* Call with hotplug lock held
|
|
|
|
*/
|
2009-11-03 14:53:40 +10:30
|
|
|
void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
|
2008-04-15 14:04:23 +09:00
|
|
|
struct sched_domain_attr *dattr_new)
|
2007-10-18 23:40:20 -07:00
|
|
|
{
|
2008-08-29 13:11:41 -07:00
|
|
|
int i, j, n;
|
2008-12-09 18:49:51 +01:00
|
|
|
int new_topology;
|
2007-10-18 23:40:20 -07:00
|
|
|
|
2008-04-28 11:33:07 +02:00
|
|
|
mutex_lock(&sched_domains_mutex);
|
2008-01-25 21:08:00 +01:00
|
|
|
|
2007-10-24 18:23:48 +02:00
|
|
|
/* always unregister in case we don't destroy any domains */
|
|
|
|
unregister_sched_domain_sysctl();
|
|
|
|
|
2008-12-09 18:49:51 +01:00
|
|
|
/* Let architecture update cpu core mappings. */
|
|
|
|
new_topology = arch_update_cpu_topology();
|
|
|
|
|
2008-08-29 13:11:41 -07:00
|
|
|
n = doms_new ? ndoms_new : 0;
|
2007-10-18 23:40:20 -07:00
|
|
|
|
|
|
|
/* Destroy deleted domains */
|
|
|
|
for (i = 0; i < ndoms_cur; i++) {
|
2008-12-09 18:49:51 +01:00
|
|
|
for (j = 0; j < n && !new_topology; j++) {
|
2009-11-03 14:53:40 +10:30
|
|
|
if (cpumask_equal(doms_cur[i], doms_new[j])
|
2008-04-15 14:04:23 +09:00
|
|
|
&& dattrs_equal(dattr_cur, i, dattr_new, j))
|
2007-10-18 23:40:20 -07:00
|
|
|
goto match1;
|
|
|
|
}
|
|
|
|
/* no match - a current sched domain not in new doms_new[] */
|
2009-11-03 14:53:40 +10:30
|
|
|
detach_destroy_domains(doms_cur[i]);
|
2007-10-18 23:40:20 -07:00
|
|
|
match1:
|
|
|
|
;
|
|
|
|
}
|
|
|
|
|
2008-07-15 04:43:49 -07:00
|
|
|
if (doms_new == NULL) {
|
|
|
|
ndoms_cur = 0;
|
2009-11-03 14:53:40 +10:30
|
|
|
doms_new = &fallback_doms;
|
2009-11-25 13:31:39 +01:00
|
|
|
cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
|
2008-11-04 16:20:23 +08:00
|
|
|
WARN_ON_ONCE(dattr_new);
|
2008-07-15 04:43:49 -07:00
|
|
|
}
|
|
|
|
|
2007-10-18 23:40:20 -07:00
|
|
|
/* Build new domains */
|
|
|
|
for (i = 0; i < ndoms_new; i++) {
|
2008-12-09 18:49:51 +01:00
|
|
|
for (j = 0; j < ndoms_cur && !new_topology; j++) {
|
2009-11-03 14:53:40 +10:30
|
|
|
if (cpumask_equal(doms_new[i], doms_cur[j])
|
2008-04-15 14:04:23 +09:00
|
|
|
&& dattrs_equal(dattr_new, i, dattr_cur, j))
|
2007-10-18 23:40:20 -07:00
|
|
|
goto match2;
|
|
|
|
}
|
|
|
|
/* no match - add a new doms_new */
|
2011-04-07 14:09:50 +02:00
|
|
|
build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
|
2007-10-18 23:40:20 -07:00
|
|
|
match2:
|
|
|
|
;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Remember the new sched domains */
|
2009-11-03 14:53:40 +10:30
|
|
|
if (doms_cur != &fallback_doms)
|
|
|
|
free_sched_domains(doms_cur, ndoms_cur);
|
2008-04-15 14:04:23 +09:00
|
|
|
kfree(dattr_cur); /* kfree(NULL) is safe */
|
2007-10-18 23:40:20 -07:00
|
|
|
doms_cur = doms_new;
|
2008-04-15 14:04:23 +09:00
|
|
|
dattr_cur = dattr_new;
|
2007-10-18 23:40:20 -07:00
|
|
|
ndoms_cur = ndoms_new;
|
2007-10-24 18:23:48 +02:00
|
|
|
|
|
|
|
register_sched_domain_sysctl();
|
2008-01-25 21:08:00 +01:00
|
|
|
|
2008-04-28 11:33:07 +02:00
|
|
|
mutex_unlock(&sched_domains_mutex);
|
2007-10-18 23:40:20 -07:00
|
|
|
}
|
|
|
|
|
CPU hotplug, cpusets, suspend: Don't modify cpusets during suspend/resume
In the event of CPU hotplug, the kernel modifies the cpusets' cpus_allowed
masks as and when necessary to ensure that the tasks belonging to the cpusets
have some place (online CPUs) to run on. And regular CPU hotplug is
destructive in the sense that the kernel doesn't remember the original cpuset
configurations set by the user, across hotplug operations.
However, suspend/resume (which uses CPU hotplug) is a special case in which
the kernel has the responsibility to restore the system (during resume), to
exactly the same state it was in before suspend.
In order to achieve that, do the following:
1. Don't modify cpusets during suspend/resume. At all.
In particular, don't move the tasks from one cpuset to another, and
don't modify any cpuset's cpus_allowed mask. So, simply ignore cpusets
during the CPU hotplug operations that are carried out in the
suspend/resume path.
2. However, cpusets and sched domains are related. We just want to avoid
altering cpusets alone. So, to keep the sched domains updated, build
a single sched domain (containing all active cpus) during each of the
CPU hotplug operations carried out in s/r path, effectively ignoring
the cpusets' cpus_allowed masks.
(Since userspace is frozen while doing all this, it will go unnoticed.)
3. During the last CPU online operation during resume, build the sched
domains by looking up the (unaltered) cpusets' cpus_allowed masks.
That will bring back the system to the same original state as it was in
before suspend.
Ultimately, this will not only solve the cpuset problem related to suspend
resume (ie., restores the cpusets to exactly what it was before suspend, by
not touching it at all) but also speeds up suspend/resume because we avoid
running cpuset update code for every CPU being offlined/onlined.
Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20120524141611.3692.20155.stgit@srivatsabhat.in.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2012-05-24 19:46:26 +05:30
|
|
|
static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
2010-06-08 21:40:36 +02:00
|
|
|
* Update cpusets according to cpu_active mask. If cpusets are
|
|
|
|
* disabled, cpuset_update_active_cpus() becomes a simple wrapper
|
|
|
|
* around partition_sched_domains().
|
CPU hotplug, cpusets, suspend: Don't modify cpusets during suspend/resume
In the event of CPU hotplug, the kernel modifies the cpusets' cpus_allowed
masks as and when necessary to ensure that the tasks belonging to the cpusets
have some place (online CPUs) to run on. And regular CPU hotplug is
destructive in the sense that the kernel doesn't remember the original cpuset
configurations set by the user, across hotplug operations.
However, suspend/resume (which uses CPU hotplug) is a special case in which
the kernel has the responsibility to restore the system (during resume), to
exactly the same state it was in before suspend.
In order to achieve that, do the following:
1. Don't modify cpusets during suspend/resume. At all.
In particular, don't move the tasks from one cpuset to another, and
don't modify any cpuset's cpus_allowed mask. So, simply ignore cpusets
during the CPU hotplug operations that are carried out in the
suspend/resume path.
2. However, cpusets and sched domains are related. We just want to avoid
altering cpusets alone. So, to keep the sched domains updated, build
a single sched domain (containing all active cpus) during each of the
CPU hotplug operations carried out in s/r path, effectively ignoring
the cpusets' cpus_allowed masks.
(Since userspace is frozen while doing all this, it will go unnoticed.)
3. During the last CPU online operation during resume, build the sched
domains by looking up the (unaltered) cpusets' cpus_allowed masks.
That will bring back the system to the same original state as it was in
before suspend.
Ultimately, this will not only solve the cpuset problem related to suspend
resume (ie., restores the cpusets to exactly what it was before suspend, by
not touching it at all) but also speeds up suspend/resume because we avoid
running cpuset update code for every CPU being offlined/onlined.
Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20120524141611.3692.20155.stgit@srivatsabhat.in.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2012-05-24 19:46:26 +05:30
|
|
|
*
|
|
|
|
* If we come here as part of a suspend/resume, don't touch cpusets because we
|
|
|
|
* want to restore it back to its original state upon resume anyway.
|
2005-04-16 15:20:36 -07:00
|
|
|
*/
|
2010-06-21 23:53:31 +02:00
|
|
|
static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
|
|
|
|
void *hcpu)
|
2008-07-15 04:43:49 -07:00
|
|
|
{
|
CPU hotplug, cpusets, suspend: Don't modify cpusets during suspend/resume
In the event of CPU hotplug, the kernel modifies the cpusets' cpus_allowed
masks as and when necessary to ensure that the tasks belonging to the cpusets
have some place (online CPUs) to run on. And regular CPU hotplug is
destructive in the sense that the kernel doesn't remember the original cpuset
configurations set by the user, across hotplug operations.
However, suspend/resume (which uses CPU hotplug) is a special case in which
the kernel has the responsibility to restore the system (during resume), to
exactly the same state it was in before suspend.
In order to achieve that, do the following:
1. Don't modify cpusets during suspend/resume. At all.
In particular, don't move the tasks from one cpuset to another, and
don't modify any cpuset's cpus_allowed mask. So, simply ignore cpusets
during the CPU hotplug operations that are carried out in the
suspend/resume path.
2. However, cpusets and sched domains are related. We just want to avoid
altering cpusets alone. So, to keep the sched domains updated, build
a single sched domain (containing all active cpus) during each of the
CPU hotplug operations carried out in s/r path, effectively ignoring
the cpusets' cpus_allowed masks.
(Since userspace is frozen while doing all this, it will go unnoticed.)
3. During the last CPU online operation during resume, build the sched
domains by looking up the (unaltered) cpusets' cpus_allowed masks.
That will bring back the system to the same original state as it was in
before suspend.
Ultimately, this will not only solve the cpuset problem related to suspend
resume (ie., restores the cpusets to exactly what it was before suspend, by
not touching it at all) but also speeds up suspend/resume because we avoid
running cpuset update code for every CPU being offlined/onlined.
Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20120524141611.3692.20155.stgit@srivatsabhat.in.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2012-05-24 19:46:26 +05:30
|
|
|
switch (action) {
|
|
|
|
case CPU_ONLINE_FROZEN:
|
|
|
|
case CPU_DOWN_FAILED_FROZEN:
|
|
|
|
|
|
|
|
/*
|
|
|
|
* num_cpus_frozen tracks how many CPUs are involved in suspend
|
|
|
|
* resume sequence. As long as this is not the last online
|
|
|
|
* operation in the resume sequence, just build a single sched
|
|
|
|
* domain, ignoring cpusets.
|
|
|
|
*/
|
|
|
|
num_cpus_frozen--;
|
|
|
|
if (likely(num_cpus_frozen)) {
|
|
|
|
partition_sched_domains(1, NULL, NULL);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This is the last CPU online operation. So fall through and
|
|
|
|
* restore the original sched domains by considering the
|
|
|
|
* cpuset configurations.
|
|
|
|
*/
|
|
|
|
|
2008-07-15 04:43:49 -07:00
|
|
|
case CPU_ONLINE:
|
2009-11-25 13:31:39 +01:00
|
|
|
case CPU_DOWN_FAILED:
|
2012-05-24 19:46:55 +05:30
|
|
|
cpuset_update_active_cpus(true);
|
CPU hotplug, cpusets, suspend: Don't modify cpusets during suspend/resume
In the event of CPU hotplug, the kernel modifies the cpusets' cpus_allowed
masks as and when necessary to ensure that the tasks belonging to the cpusets
have some place (online CPUs) to run on. And regular CPU hotplug is
destructive in the sense that the kernel doesn't remember the original cpuset
configurations set by the user, across hotplug operations.
However, suspend/resume (which uses CPU hotplug) is a special case in which
the kernel has the responsibility to restore the system (during resume), to
exactly the same state it was in before suspend.
In order to achieve that, do the following:
1. Don't modify cpusets during suspend/resume. At all.
In particular, don't move the tasks from one cpuset to another, and
don't modify any cpuset's cpus_allowed mask. So, simply ignore cpusets
during the CPU hotplug operations that are carried out in the
suspend/resume path.
2. However, cpusets and sched domains are related. We just want to avoid
altering cpusets alone. So, to keep the sched domains updated, build
a single sched domain (containing all active cpus) during each of the
CPU hotplug operations carried out in s/r path, effectively ignoring
the cpusets' cpus_allowed masks.
(Since userspace is frozen while doing all this, it will go unnoticed.)
3. During the last CPU online operation during resume, build the sched
domains by looking up the (unaltered) cpusets' cpus_allowed masks.
That will bring back the system to the same original state as it was in
before suspend.
Ultimately, this will not only solve the cpuset problem related to suspend
resume (ie., restores the cpusets to exactly what it was before suspend, by
not touching it at all) but also speeds up suspend/resume because we avoid
running cpuset update code for every CPU being offlined/onlined.
Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20120524141611.3692.20155.stgit@srivatsabhat.in.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2012-05-24 19:46:26 +05:30
|
|
|
break;
|
2010-06-08 21:40:36 +02:00
|
|
|
default:
|
|
|
|
return NOTIFY_DONE;
|
|
|
|
}
|
CPU hotplug, cpusets, suspend: Don't modify cpusets during suspend/resume
In the event of CPU hotplug, the kernel modifies the cpusets' cpus_allowed
masks as and when necessary to ensure that the tasks belonging to the cpusets
have some place (online CPUs) to run on. And regular CPU hotplug is
destructive in the sense that the kernel doesn't remember the original cpuset
configurations set by the user, across hotplug operations.
However, suspend/resume (which uses CPU hotplug) is a special case in which
the kernel has the responsibility to restore the system (during resume), to
exactly the same state it was in before suspend.
In order to achieve that, do the following:
1. Don't modify cpusets during suspend/resume. At all.
In particular, don't move the tasks from one cpuset to another, and
don't modify any cpuset's cpus_allowed mask. So, simply ignore cpusets
during the CPU hotplug operations that are carried out in the
suspend/resume path.
2. However, cpusets and sched domains are related. We just want to avoid
altering cpusets alone. So, to keep the sched domains updated, build
a single sched domain (containing all active cpus) during each of the
CPU hotplug operations carried out in s/r path, effectively ignoring
the cpusets' cpus_allowed masks.
(Since userspace is frozen while doing all this, it will go unnoticed.)
3. During the last CPU online operation during resume, build the sched
domains by looking up the (unaltered) cpusets' cpus_allowed masks.
That will bring back the system to the same original state as it was in
before suspend.
Ultimately, this will not only solve the cpuset problem related to suspend
resume (ie., restores the cpusets to exactly what it was before suspend, by
not touching it at all) but also speeds up suspend/resume because we avoid
running cpuset update code for every CPU being offlined/onlined.
Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20120524141611.3692.20155.stgit@srivatsabhat.in.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2012-05-24 19:46:26 +05:30
|
|
|
return NOTIFY_OK;
|
2010-06-08 21:40:36 +02:00
|
|
|
}
|
2008-07-15 04:43:49 -07:00
|
|
|
|
2010-06-21 23:53:31 +02:00
|
|
|
static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
|
|
|
|
void *hcpu)
|
2010-06-08 21:40:36 +02:00
|
|
|
{
|
CPU hotplug, cpusets, suspend: Don't modify cpusets during suspend/resume
In the event of CPU hotplug, the kernel modifies the cpusets' cpus_allowed
masks as and when necessary to ensure that the tasks belonging to the cpusets
have some place (online CPUs) to run on. And regular CPU hotplug is
destructive in the sense that the kernel doesn't remember the original cpuset
configurations set by the user, across hotplug operations.
However, suspend/resume (which uses CPU hotplug) is a special case in which
the kernel has the responsibility to restore the system (during resume), to
exactly the same state it was in before suspend.
In order to achieve that, do the following:
1. Don't modify cpusets during suspend/resume. At all.
In particular, don't move the tasks from one cpuset to another, and
don't modify any cpuset's cpus_allowed mask. So, simply ignore cpusets
during the CPU hotplug operations that are carried out in the
suspend/resume path.
2. However, cpusets and sched domains are related. We just want to avoid
altering cpusets alone. So, to keep the sched domains updated, build
a single sched domain (containing all active cpus) during each of the
CPU hotplug operations carried out in s/r path, effectively ignoring
the cpusets' cpus_allowed masks.
(Since userspace is frozen while doing all this, it will go unnoticed.)
3. During the last CPU online operation during resume, build the sched
domains by looking up the (unaltered) cpusets' cpus_allowed masks.
That will bring back the system to the same original state as it was in
before suspend.
Ultimately, this will not only solve the cpuset problem related to suspend
resume (ie., restores the cpusets to exactly what it was before suspend, by
not touching it at all) but also speeds up suspend/resume because we avoid
running cpuset update code for every CPU being offlined/onlined.
Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20120524141611.3692.20155.stgit@srivatsabhat.in.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2012-05-24 19:46:26 +05:30
|
|
|
switch (action) {
|
2010-06-08 21:40:36 +02:00
|
|
|
case CPU_DOWN_PREPARE:
|
2012-05-24 19:46:55 +05:30
|
|
|
cpuset_update_active_cpus(false);
|
CPU hotplug, cpusets, suspend: Don't modify cpusets during suspend/resume
In the event of CPU hotplug, the kernel modifies the cpusets' cpus_allowed
masks as and when necessary to ensure that the tasks belonging to the cpusets
have some place (online CPUs) to run on. And regular CPU hotplug is
destructive in the sense that the kernel doesn't remember the original cpuset
configurations set by the user, across hotplug operations.
However, suspend/resume (which uses CPU hotplug) is a special case in which
the kernel has the responsibility to restore the system (during resume), to
exactly the same state it was in before suspend.
In order to achieve that, do the following:
1. Don't modify cpusets during suspend/resume. At all.
In particular, don't move the tasks from one cpuset to another, and
don't modify any cpuset's cpus_allowed mask. So, simply ignore cpusets
during the CPU hotplug operations that are carried out in the
suspend/resume path.
2. However, cpusets and sched domains are related. We just want to avoid
altering cpusets alone. So, to keep the sched domains updated, build
a single sched domain (containing all active cpus) during each of the
CPU hotplug operations carried out in s/r path, effectively ignoring
the cpusets' cpus_allowed masks.
(Since userspace is frozen while doing all this, it will go unnoticed.)
3. During the last CPU online operation during resume, build the sched
domains by looking up the (unaltered) cpusets' cpus_allowed masks.
That will bring back the system to the same original state as it was in
before suspend.
Ultimately, this will not only solve the cpuset problem related to suspend
resume (ie., restores the cpusets to exactly what it was before suspend, by
not touching it at all) but also speeds up suspend/resume because we avoid
running cpuset update code for every CPU being offlined/onlined.
Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20120524141611.3692.20155.stgit@srivatsabhat.in.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2012-05-24 19:46:26 +05:30
|
|
|
break;
|
|
|
|
case CPU_DOWN_PREPARE_FROZEN:
|
|
|
|
num_cpus_frozen++;
|
|
|
|
partition_sched_domains(1, NULL, NULL);
|
|
|
|
break;
|
2008-07-15 04:43:49 -07:00
|
|
|
default:
|
|
|
|
return NOTIFY_DONE;
|
|
|
|
}
|
CPU hotplug, cpusets, suspend: Don't modify cpusets during suspend/resume
In the event of CPU hotplug, the kernel modifies the cpusets' cpus_allowed
masks as and when necessary to ensure that the tasks belonging to the cpusets
have some place (online CPUs) to run on. And regular CPU hotplug is
destructive in the sense that the kernel doesn't remember the original cpuset
configurations set by the user, across hotplug operations.
However, suspend/resume (which uses CPU hotplug) is a special case in which
the kernel has the responsibility to restore the system (during resume), to
exactly the same state it was in before suspend.
In order to achieve that, do the following:
1. Don't modify cpusets during suspend/resume. At all.
In particular, don't move the tasks from one cpuset to another, and
don't modify any cpuset's cpus_allowed mask. So, simply ignore cpusets
during the CPU hotplug operations that are carried out in the
suspend/resume path.
2. However, cpusets and sched domains are related. We just want to avoid
altering cpusets alone. So, to keep the sched domains updated, build
a single sched domain (containing all active cpus) during each of the
CPU hotplug operations carried out in s/r path, effectively ignoring
the cpusets' cpus_allowed masks.
(Since userspace is frozen while doing all this, it will go unnoticed.)
3. During the last CPU online operation during resume, build the sched
domains by looking up the (unaltered) cpusets' cpus_allowed masks.
That will bring back the system to the same original state as it was in
before suspend.
Ultimately, this will not only solve the cpuset problem related to suspend
resume (ie., restores the cpusets to exactly what it was before suspend, by
not touching it at all) but also speeds up suspend/resume because we avoid
running cpuset update code for every CPU being offlined/onlined.
Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20120524141611.3692.20155.stgit@srivatsabhat.in.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2012-05-24 19:46:26 +05:30
|
|
|
return NOTIFY_OK;
|
2008-07-15 04:43:49 -07:00
|
|
|
}
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
 * sched_init_smp - SMP part of scheduler initialisation.
 *
 * Builds the initial sched domains over cpu_active_mask, computes the set
 * of non-isolated CPUs (possible CPUs minus cpu_isolated_map, falling back
 * to the current CPU if that set is empty), registers the hotplug
 * notifiers and finally restricts the current task to the non-isolated
 * CPUs.
 *
 * Allocation failures are fatal: both masks are dereferenced right
 * afterwards, so a failed allocation cannot be tolerated here.
 */
void __init sched_init_smp(void)
{
	cpumask_var_t non_isolated_cpus;

	/*
	 * With off-stack cpumasks these can fail; non_isolated_cpus is used
	 * unconditionally below and fallback_doms is handed to the domain
	 * setup code, so stop here rather than chase a NULL mask later.
	 */
	if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL))
		BUG();
	if (!alloc_cpumask_var(&fallback_doms, GFP_KERNEL))
		BUG();

	sched_init_numa();

	get_online_cpus();
	mutex_lock(&sched_domains_mutex);
	init_sched_domains(cpu_active_mask);
	cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
	/* Every CPU isolated: keep at least the one we are running on. */
	if (cpumask_empty(non_isolated_cpus))
		cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
	mutex_unlock(&sched_domains_mutex);
	put_online_cpus();

	hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
	hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);

	/* RT runtime code needs to handle some hotplug events */
	hotcpu_notifier(update_runtime, 0);

	init_hrtick();

	/* Move init over to a non-isolated CPU */
	if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
		BUG();
	sched_init_granularity();
	free_cpumask_var(non_isolated_cpus);

	init_sched_rt_class();
}
|
|
|
|
#else
|
|
|
|
/*
 * sched_init_smp - !CONFIG_SMP variant.
 *
 * Without SMP there are no domains or hotplug notifiers to set up; only
 * the scheduler granularity is initialised.
 */
void __init sched_init_smp(void)
{
	sched_init_granularity();
}
|
|
|
|
#endif /* CONFIG_SMP */
|
|
|
|
|
2009-04-16 12:15:34 +05:30
|
|
|
/* NOTE(review): presumably the timer-migration sysctl knob, on (1) by default - confirm against the sysctl table. */
const_debug unsigned int sysctl_timer_migration = 1;
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
int in_sched_functions(unsigned long addr)
|
|
|
|
{
|
|
|
|
return in_lock_functions(addr) ||
|
|
|
|
(addr >= (unsigned long)__sched_text_start
|
|
|
|
&& addr < (unsigned long)__sched_text_end);
|
|
|
|
}
|
|
|
|
|
2011-10-25 10:00:11 +02:00
|
|
|
#ifdef CONFIG_CGROUP_SCHED
|
|
|
|
/* The root task group; sched_init() fills in its per-CPU arrays, bandwidth and list heads. */
struct task_group root_task_group;
|
2012-08-07 05:00:13 +02:00
|
|
|
/* List of task groups; sched_init() puts root_task_group on it. */
LIST_HEAD(task_groups);
|
2008-02-13 15:45:40 +01:00
|
|
|
#endif
|
2008-01-25 21:08:30 +01:00
|
|
|
|
2011-10-25 10:00:11 +02:00
|
|
|
/* Per-CPU cpumask; with CONFIG_CPUMASK_OFFSTACK sched_init() points each CPU's copy into its boot-time allocation. */
DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
|
2008-01-25 21:08:30 +01:00
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
 * sched_init - boot-time initialisation of the scheduler's core state.
 *
 * Carves the root task group's per-CPU pointer arrays (and, with
 * CONFIG_CPUMASK_OFFSTACK, the storage behind each CPU's
 * load_balance_tmpmask) out of a single zeroed allocation, initialises the
 * runqueue of every possible CPU, turns the current (boot) task into the
 * idle thread of this CPU and finally sets scheduler_running.
 */
void __init sched_init(void)
{
	int i, j;
	unsigned long alloc_size = 0, ptr;

	/* Two pointer arrays (entity + runqueue) per group-scheduling class. */
#ifdef CONFIG_FAIR_GROUP_SCHED
	alloc_size += 2 * nr_cpu_ids * sizeof(void **);
#endif
#ifdef CONFIG_RT_GROUP_SCHED
	alloc_size += 2 * nr_cpu_ids * sizeof(void **);
#endif
#ifdef CONFIG_CPUMASK_OFFSTACK
	alloc_size += num_possible_cpus() * cpumask_size();
#endif
	if (alloc_size) {
		ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
		/*
		 * Too early, not expected to fail.  Everything below slices
		 * this buffer up and stores into it, so do not carry on with
		 * a NULL base.
		 */
		BUG_ON(!ptr);

#ifdef CONFIG_FAIR_GROUP_SCHED
		root_task_group.se = (struct sched_entity **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);

		root_task_group.cfs_rq = (struct cfs_rq **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);

#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
		root_task_group.rt_se = (struct sched_rt_entity **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);

		root_task_group.rt_rq = (struct rt_rq **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);

#endif /* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_CPUMASK_OFFSTACK
		for_each_possible_cpu(i) {
			per_cpu(load_balance_tmpmask, i) = (void *)ptr;
			ptr += cpumask_size();
		}
#endif /* CONFIG_CPUMASK_OFFSTACK */
	}

#ifdef CONFIG_SMP
	init_defrootdomain();
#endif

	init_rt_bandwidth(&def_rt_bandwidth,
			global_rt_period(), global_rt_runtime());

#ifdef CONFIG_RT_GROUP_SCHED
	init_rt_bandwidth(&root_task_group.rt_bandwidth,
			global_rt_period(), global_rt_runtime());
#endif /* CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_CGROUP_SCHED
	list_add(&root_task_group.list, &task_groups);
	INIT_LIST_HEAD(&root_task_group.children);
	INIT_LIST_HEAD(&root_task_group.siblings);
	autogroup_init(&init_task);

#endif /* CONFIG_CGROUP_SCHED */

#ifdef CONFIG_CGROUP_CPUACCT
	root_cpuacct.cpustat = &kernel_cpustat;
	root_cpuacct.cpuusage = alloc_percpu(u64);
	/* Too early, not expected to fail */
	BUG_ON(!root_cpuacct.cpuusage);
#endif

#ifdef CONFIG_FAIR_GROUP_SCHED
	/*
	 * These belong to the root group, not to any one runqueue, so set
	 * them up once here instead of once per possible CPU.
	 */
	root_task_group.shares = ROOT_TASK_GROUP_LOAD;
	init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
#endif

	for_each_possible_cpu(i) {
		struct rq *rq;

		rq = cpu_rq(i);
		raw_spin_lock_init(&rq->lock);
		rq->nr_running = 0;
		rq->calc_load_active = 0;
		rq->calc_load_update = jiffies + LOAD_FREQ;
		init_cfs_rq(&rq->cfs);
		init_rt_rq(&rq->rt, rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
		/*
		 * How much cpu bandwidth does root_task_group get?
		 *
		 * In case of task-groups formed through the cgroup filesystem,
		 * it gets 100% of the cpu resources in the system. This overall
		 * system cpu resource is divided among the tasks of
		 * root_task_group and its child task-groups in a fair manner,
		 * based on each entity's (task or task-group's) weight
		 * (se->load.weight).
		 *
		 * In other words, if root_task_group has 10 tasks (each of
		 * weight 1024) and two child groups A0 and A1 (of weight 1024
		 * each), then A0's share of the cpu resource is:
		 *
		 *	A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
		 *
		 * We achieve this by letting root_task_group's tasks sit
		 * directly in rq->cfs (i.e. root_task_group->se[] = NULL).
		 */
		init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */

		rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
#ifdef CONFIG_RT_GROUP_SCHED
		INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
		init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
#endif

		for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
			rq->cpu_load[j] = 0;

		rq->last_load_update_tick = jiffies;

#ifdef CONFIG_SMP
		rq->sd = NULL;
		rq->rd = NULL;
		rq->cpu_power = SCHED_POWER_SCALE;
		rq->post_schedule = 0;
		rq->active_balance = 0;
		rq->next_balance = jiffies;
		rq->push_cpu = 0;
		rq->cpu = i;
		rq->online = 0;
		rq->idle_stamp = 0;
		rq->avg_idle = 2*sysctl_sched_migration_cost;

		INIT_LIST_HEAD(&rq->cfs_tasks);

		rq_attach_root(rq, &def_root_domain);
#ifdef CONFIG_NO_HZ
		rq->nohz_flags = 0;
#endif
#endif
		init_rq_hrtick(rq);
		atomic_set(&rq->nr_iowait, 0);
	}

	set_load_weight(&init_task);

#ifdef CONFIG_PREEMPT_NOTIFIERS
	INIT_HLIST_HEAD(&init_task.preempt_notifiers);
#endif

#ifdef CONFIG_RT_MUTEXES
	plist_head_init(&init_task.pi_waiters);
#endif

	/*
	 * The boot idle thread does lazy MMU switching as well:
	 */
	atomic_inc(&init_mm.mm_count);
	enter_lazy_tlb(&init_mm, current);

	/*
	 * Make us the idle thread. Technically, schedule() should not be
	 * called from this thread, however somewhere below it might be,
	 * but because we are the idle thread, we just pick up running again
	 * when this runqueue becomes "idle".
	 */
	init_idle(current, smp_processor_id());

	calc_load_update = jiffies + LOAD_FREQ;

	/*
	 * During early bootup we pretend to be a normal task:
	 */
	current->sched_class = &fair_sched_class;

#ifdef CONFIG_SMP
	zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
	/* May be allocated at isolcpus cmdline parse time */
	if (cpu_isolated_map == NULL)
		zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
	idle_thread_set_boot_cpu();
#endif
	init_sched_fair_class();

	scheduler_running = 1;
}
|
|
|
|
|
2011-06-08 19:31:56 +02:00
|
|
|
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
|
2009-07-16 15:44:29 +02:00
|
|
|
static inline int preempt_count_equals(int preempt_offset)
|
|
|
|
{
|
2009-12-16 20:21:05 +01:00
|
|
|
int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
|
2009-07-16 15:44:29 +02:00
|
|
|
|
2011-01-25 22:52:22 +01:00
|
|
|
return (nested == preempt_offset);
|
2009-07-16 15:44:29 +02:00
|
|
|
}
|
|
|
|
|
2009-12-23 11:08:18 +01:00
|
|
|
void __might_sleep(const char *file, int line, int preempt_offset)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
|
|
|
static unsigned long prev_jiffy; /* ratelimiting */
|
|
|
|
|
2011-05-24 08:31:09 -07:00
|
|
|
rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
|
2009-07-16 15:44:29 +02:00
|
|
|
if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
|
|
|
|
system_state != SYSTEM_RUNNING || oops_in_progress)
|
2008-08-28 11:34:43 +02:00
|
|
|
return;
|
|
|
|
if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
|
|
|
|
return;
|
|
|
|
prev_jiffy = jiffies;
|
|
|
|
|
2009-12-20 14:23:57 +01:00
|
|
|
printk(KERN_ERR
|
|
|
|
"BUG: sleeping function called from invalid context at %s:%d\n",
|
|
|
|
file, line);
|
|
|
|
printk(KERN_ERR
|
|
|
|
"in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
|
|
|
|
in_atomic(), irqs_disabled(),
|
|
|
|
current->pid, current->comm);
|
2008-08-28 11:34:43 +02:00
|
|
|
|
|
|
|
debug_show_held_locks(current);
|
|
|
|
if (irqs_disabled())
|
|
|
|
print_irqtrace_events(current);
|
|
|
|
dump_stack();
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(__might_sleep);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifdef CONFIG_MAGIC_SYSRQ
|
2007-10-15 17:00:15 +02:00
|
|
|
static void normalize_task(struct rq *rq, struct task_struct *p)
|
|
|
|
{
|
2011-01-17 17:03:27 +01:00
|
|
|
const struct sched_class *prev_class = p->sched_class;
|
|
|
|
int old_prio = p->prio;
|
2007-10-15 17:00:15 +02:00
|
|
|
int on_rq;
|
2008-05-03 18:29:28 +02:00
|
|
|
|
2011-04-05 17:23:44 +02:00
|
|
|
on_rq = p->on_rq;
|
2007-10-15 17:00:15 +02:00
|
|
|
if (on_rq)
|
2012-01-25 11:50:51 +01:00
|
|
|
dequeue_task(rq, p, 0);
|
2007-10-15 17:00:15 +02:00
|
|
|
__setscheduler(rq, p, SCHED_NORMAL, 0);
|
|
|
|
if (on_rq) {
|
2012-01-25 11:50:51 +01:00
|
|
|
enqueue_task(rq, p, 0);
|
2007-10-15 17:00:15 +02:00
|
|
|
resched_task(rq->curr);
|
|
|
|
}
|
2011-01-17 17:03:27 +01:00
|
|
|
|
|
|
|
check_class_changed(rq, p, prev_class, old_prio);
|
2007-10-15 17:00:15 +02:00
|
|
|
}
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
 * Walk every thread and bring user tasks back to default scheduling:
 * real-time tasks are handed to normalize_task(), and non-RT tasks with a
 * negative nice level are reniced to 0.  Threads without an mm are skipped.
 */
void normalize_rt_tasks(void)
{
	struct task_struct *g, *p;
	unsigned long flags;
	struct rq *rq;

	/* tasklist_lock is read-held, with irq state saved, for the whole walk. */
	read_lock_irqsave(&tasklist_lock, flags);
	do_each_thread(g, p) {
		/*
		 * Only normalize user tasks:
		 */
		if (!p->mm)
			continue;

		/* Clear the start timestamps recorded in the scheduling entity. */
		p->se.exec_start = 0;
#ifdef CONFIG_SCHEDSTATS
		p->se.statistics.wait_start = 0;
		p->se.statistics.sleep_start = 0;
		p->se.statistics.block_start = 0;
#endif

		if (!rt_task(p)) {
			/*
			 * Renice negative nice level userspace
			 * tasks back to 0:
			 */
			if (TASK_NICE(p) < 0 && p->mm)
				set_user_nice(p, 0);
			continue;
		}

		/*
		 * p->pi_lock is taken before __task_rq_lock() and dropped
		 * after __task_rq_unlock(); keep this nesting intact.
		 */
		raw_spin_lock(&p->pi_lock);
		rq = __task_rq_lock(p);

		normalize_task(rq, p);

		__task_rq_unlock(rq);
		raw_spin_unlock(&p->pi_lock);
	} while_each_thread(g, p);

	read_unlock_irqrestore(&tasklist_lock, flags);
}
|
|
|
|
|
|
|
|
#endif /* CONFIG_MAGIC_SYSRQ */
|
2005-09-12 07:59:21 -07:00
|
|
|
|
2010-05-20 21:04:21 -05:00
|
|
|
#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
|
2005-09-12 07:59:21 -07:00
|
|
|
/*
|
2010-05-20 21:04:21 -05:00
|
|
|
* These functions are only useful for the IA64 MCA handling, or kdb.
|
2005-09-12 07:59:21 -07:00
|
|
|
*
|
|
|
|
* They can only be called when the whole system has been
|
|
|
|
* stopped - every CPU needs to be quiescent, and no scheduling
|
|
|
|
* activity can take place. Using them for anything else would
|
|
|
|
* be a serious bug, and as a result, they aren't even visible
|
|
|
|
* under any other configuration.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/**
 * curr_task - return the current task for a given cpu.
 * @cpu: the processor in question.
 *
 * Returns the value of cpu_curr(@cpu).
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 */
struct task_struct *curr_task(int cpu)
{
	return cpu_curr(cpu);
}
|
|
|
|
|
2010-05-20 21:04:21 -05:00
|
|
|
#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
|
|
|
|
|
|
|
|
#ifdef CONFIG_IA64
|
2005-09-12 07:59:21 -07:00
|
|
|
/**
 * set_curr_task - set the current task for a given cpu.
 * @cpu: the processor in question.
 * @p: the task pointer to set.
 *
 * Description: This function must only be used when non-maskable interrupts
 * are serviced on a separate stack. It allows the architecture to switch the
 * notion of the current task on a cpu in a non-blocking manner. This function
 * must be called with all CPUs synchronized and interrupts disabled, and the
 * caller must save the original value of the current task (see
 * curr_task() above) and restore that value before reenabling interrupts and
 * re-starting the system.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 */
void set_curr_task(int cpu, struct task_struct *p)
{
	/* Plain store into cpu_curr(@cpu); no locking is done here. */
	cpu_curr(cpu) = p;
}
|
|
|
|
|
|
|
|
#endif
|
2007-10-15 17:00:07 +02:00
|
|
|
|
2010-01-20 13:26:18 +01:00
|
|
|
#ifdef CONFIG_CGROUP_SCHED
|
2011-10-25 10:00:11 +02:00
|
|
|
/* task_group_lock serializes the addition/removal of task groups */
|
|
|
|
static DEFINE_SPINLOCK(task_group_lock);
|
|
|
|
|
2008-02-13 15:45:40 +01:00
|
|
|
/*
 * Release everything attached to @tg and then @tg itself: the fair and RT
 * per-group state, the autogroup reference, and finally the task_group
 * allocation.  @tg must not be used after this returns.
 */
static void free_sched_group(struct task_group *tg)
{
	free_fair_sched_group(tg);
	free_rt_sched_group(tg);
	autogroup_free(tg);
	/* Must stay last: the calls above still dereference @tg. */
	kfree(tg);
}
|
|
|
|
|
|
|
|
/* allocate runqueue etc for a new task group */
|
2008-04-19 19:44:59 +02:00
|
|
|
struct task_group *sched_create_group(struct task_group *parent)
|
2008-02-13 15:45:40 +01:00
|
|
|
{
|
|
|
|
struct task_group *tg;
|
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
tg = kzalloc(sizeof(*tg), GFP_KERNEL);
|
|
|
|
if (!tg)
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
2008-04-19 19:44:59 +02:00
|
|
|
if (!alloc_fair_sched_group(tg, parent))
|
2008-02-13 15:45:40 +01:00
|
|
|
goto err;
|
|
|
|
|
2008-04-19 19:44:59 +02:00
|
|
|
if (!alloc_rt_sched_group(tg, parent))
|
2008-02-13 15:45:40 +01:00
|
|
|
goto err;
|
|
|
|
|
2008-02-13 15:45:39 +01:00
|
|
|
spin_lock_irqsave(&task_group_lock, flags);
|
2008-01-25 21:08:30 +01:00
|
|
|
list_add_rcu(&tg->list, &task_groups);
|
2008-04-19 19:45:00 +02:00
|
|
|
|
|
|
|
WARN_ON(!parent); /* root should already exist */
|
|
|
|
|
|
|
|
tg->parent = parent;
|
|
|
|
INIT_LIST_HEAD(&tg->children);
|
2030-08-14 15:56:40 +08:00
|
|
|
list_add_rcu(&tg->siblings, &parent->children);
|
2008-02-13 15:45:39 +01:00
|
|
|
spin_unlock_irqrestore(&task_group_lock, flags);
|
2007-10-15 17:00:07 +02:00
|
|
|
|
2007-10-15 17:00:09 +02:00
|
|
|
return tg;
|
2007-10-15 17:00:07 +02:00
|
|
|
|
|
|
|
err:
|
2008-01-25 21:08:30 +01:00
|
|
|
free_sched_group(tg);
|
2007-10-15 17:00:07 +02:00
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
}
|
|
|
|
|
2007-10-15 17:00:09 +02:00
|
|
|
/* rcu callback to free various structures associated with a task group */
|
2008-01-25 21:08:30 +01:00
|
|
|
static void free_sched_group_rcu(struct rcu_head *rhp)
|
2007-10-15 17:00:07 +02:00
|
|
|
{
|
|
|
|
/* now it should be safe to free those cfs_rqs */
|
2008-01-25 21:08:30 +01:00
|
|
|
free_sched_group(container_of(rhp, struct task_group, rcu));
|
2007-10-15 17:00:07 +02:00
|
|
|
}
|
|
|
|
|
2007-10-15 17:00:09 +02:00
|
|
|
/*
 * Destroy runqueue etc associated with a task group.
 *
 * The group is unregistered from fair scheduling on every possible cpu,
 * unlinked from the global and sibling lists under task_group_lock, and
 * then freed from an RCU callback rather than directly.
 */
void sched_destroy_group(struct task_group *tg)
{
	unsigned long flags;
	int i;

	/* end participation in shares distribution */
	for_each_possible_cpu(i)
		unregister_fair_sched_group(tg, i);

	/* Same lock that sched_create_group() holds while linking the group. */
	spin_lock_irqsave(&task_group_lock, flags);
	list_del_rcu(&tg->list);
	list_del_rcu(&tg->siblings);
	spin_unlock_irqrestore(&task_group_lock, flags);

	/* wait for possible concurrent references to cfs_rqs complete */
	call_rcu(&tg->rcu, free_sched_group_rcu);
}
|
|
|
|
|
2007-10-15 17:00:09 +02:00
|
|
|
/* change task's runqueue when it moves between groups.
 *	The caller of this function should have put the task in its new group
 *	by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
 *	reflect its new group.
 *
 *	The whole update runs with the task's runqueue locked via
 *	task_rq_lock()/task_rq_unlock().
 */
void sched_move_task(struct task_struct *tsk)
{
	struct task_group *tg;
	int on_rq, running;
	unsigned long flags;
	struct rq *rq;

	rq = task_rq_lock(tsk, &flags);

	running = task_current(rq, tsk);
	on_rq = tsk->on_rq;

	/* Take the task off the runqueue while its group pointers change. */
	if (on_rq)
		dequeue_task(rq, tsk, 0);
	if (unlikely(running))
		tsk->sched_class->put_prev_task(rq, tsk);

	/* Look up the group from the cpu cgroup state, then apply autogroup. */
	tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id,
				lockdep_is_held(&tsk->sighand->siglock)),
			  struct task_group, css);
	tg = autogroup_task_group(tsk, tg);
	tsk->sched_task_group = tg;

	/*
	 * Note the dangling "else": without CONFIG_FAIR_GROUP_SCHED the
	 * set_task_rq() call below is unconditional.
	 */
#ifdef CONFIG_FAIR_GROUP_SCHED
	if (tsk->sched_class->task_move_group)
		tsk->sched_class->task_move_group(tsk, on_rq);
	else
#endif
		set_task_rq(tsk, task_cpu(tsk));

	/* Undo the put_prev_task()/dequeue_task() above, in reverse order. */
	if (unlikely(running))
		tsk->sched_class->set_curr_task(rq);
	if (on_rq)
		enqueue_task(rq, tsk, 0);

	task_rq_unlock(rq, tsk, &flags);
}
|
2010-01-20 13:26:18 +01:00
|
|
|
#endif /* CONFIG_CGROUP_SCHED */
|
2007-10-15 17:00:07 +02:00
|
|
|
|
2011-07-21 09:43:29 -07:00
|
|
|
#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
|
2008-02-13 15:45:39 +01:00
|
|
|
/*
 * to_ratio - express @runtime as a fraction of @period, scaled by 2^20.
 * @period: length of the period.
 * @runtime: runtime allowed within @period, or RUNTIME_INF.
 *
 * Returns 1 << 20 when @runtime is RUNTIME_INF, 0 when @period is 0, and
 * (@runtime << 20) / @period otherwise.
 */
static unsigned long to_ratio(u64 period, u64 runtime)
{
	if (runtime == RUNTIME_INF)
		return 1ULL << 20;

	/*
	 * A zero period would make div64_u64() divide by zero.  Checking it
	 * here covers every caller, including those that pass a period read
	 * from a sysctl that may not have been validated yet.
	 */
	if (period == 0)
		return 0;

	return div64_u64(runtime << 20, period);
}
|
2011-07-21 09:43:29 -07:00
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifdef CONFIG_RT_GROUP_SCHED
|
|
|
|
/*
|
|
|
|
* Ensure that the real time constraints are schedulable.
|
|
|
|
*/
|
|
|
|
static DEFINE_MUTEX(rt_constraints_mutex);
|
2008-02-13 15:45:39 +01:00
|
|
|
|
2008-08-19 12:33:06 +02:00
|
|
|
/* Must be called with tasklist_lock held */
|
|
|
|
static inline int tg_has_rt_tasks(struct task_group *tg)
|
2008-04-19 19:45:00 +02:00
|
|
|
{
|
2008-08-19 12:33:06 +02:00
|
|
|
struct task_struct *g, *p;
|
2008-04-19 19:45:00 +02:00
|
|
|
|
2008-08-19 12:33:06 +02:00
|
|
|
do_each_thread(g, p) {
|
2011-10-25 10:00:11 +02:00
|
|
|
if (rt_task(p) && task_rq(p)->rt.tg == tg)
|
2008-08-19 12:33:06 +02:00
|
|
|
return 1;
|
|
|
|
} while_each_thread(g, p);
|
2008-04-19 19:45:00 +02:00
|
|
|
|
2008-08-19 12:33:06 +02:00
|
|
|
return 0;
|
|
|
|
}
|
2008-04-19 19:45:00 +02:00
|
|
|
|
2008-08-19 12:33:06 +02:00
|
|
|
/*
 * Argument block that __rt_schedulable() passes to tg_rt_schedulable()
 * through walk_tg_tree().
 */
struct rt_schedulable_data {
	/* Group whose settings are being changed; callers may pass NULL. */
	struct task_group *tg;
	/* Candidate period in ns, used in place of @tg's current one. */
	u64 rt_period;
	/* Candidate runtime in ns, used in place of @tg's current one. */
	u64 rt_runtime;
};
|
2008-04-19 19:45:00 +02:00
|
|
|
|
2011-07-21 09:43:29 -07:00
|
|
|
static int tg_rt_schedulable(struct task_group *tg, void *data)
|
2008-08-19 12:33:06 +02:00
|
|
|
{
|
|
|
|
struct rt_schedulable_data *d = data;
|
|
|
|
struct task_group *child;
|
|
|
|
unsigned long total, sum = 0;
|
|
|
|
u64 period, runtime;
|
2008-04-19 19:45:00 +02:00
|
|
|
|
2008-08-19 12:33:06 +02:00
|
|
|
period = ktime_to_ns(tg->rt_bandwidth.rt_period);
|
|
|
|
runtime = tg->rt_bandwidth.rt_runtime;
|
2008-04-19 19:45:00 +02:00
|
|
|
|
2008-08-19 12:33:06 +02:00
|
|
|
if (tg == d->tg) {
|
|
|
|
period = d->rt_period;
|
|
|
|
runtime = d->rt_runtime;
|
2008-04-19 19:45:00 +02:00
|
|
|
}
|
|
|
|
|
2008-09-23 15:33:44 +02:00
|
|
|
/*
|
|
|
|
* Cannot have more runtime than the period.
|
|
|
|
*/
|
|
|
|
if (runtime > period && runtime != RUNTIME_INF)
|
|
|
|
return -EINVAL;
|
2008-01-25 21:08:30 +01:00
|
|
|
|
2008-09-23 15:33:44 +02:00
|
|
|
/*
|
|
|
|
* Ensure we don't starve existing RT tasks.
|
|
|
|
*/
|
2008-08-19 12:33:06 +02:00
|
|
|
if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
|
|
|
|
return -EBUSY;
|
2008-01-25 21:08:30 +01:00
|
|
|
|
2008-08-19 12:33:06 +02:00
|
|
|
total = to_ratio(period, runtime);
|
2008-01-25 21:08:30 +01:00
|
|
|
|
2008-09-23 15:33:44 +02:00
|
|
|
/*
|
|
|
|
* Nobody can have more than the global setting allows.
|
|
|
|
*/
|
|
|
|
if (total > to_ratio(global_rt_period(), global_rt_runtime()))
|
|
|
|
return -EINVAL;
|
2008-01-25 21:08:30 +01:00
|
|
|
|
2008-09-23 15:33:44 +02:00
|
|
|
/*
|
|
|
|
* The sum of our children's runtime should not exceed our own.
|
|
|
|
*/
|
2008-08-19 12:33:06 +02:00
|
|
|
list_for_each_entry_rcu(child, &tg->children, siblings) {
|
|
|
|
period = ktime_to_ns(child->rt_bandwidth.rt_period);
|
|
|
|
runtime = child->rt_bandwidth.rt_runtime;
|
2008-01-25 21:08:30 +01:00
|
|
|
|
2008-08-19 12:33:06 +02:00
|
|
|
if (child == d->tg) {
|
|
|
|
period = d->rt_period;
|
|
|
|
runtime = d->rt_runtime;
|
|
|
|
}
|
2008-01-25 21:08:30 +01:00
|
|
|
|
2008-08-19 12:33:06 +02:00
|
|
|
sum += to_ratio(period, runtime);
|
2008-02-13 15:45:39 +01:00
|
|
|
}
|
2008-01-25 21:08:30 +01:00
|
|
|
|
2008-08-19 12:33:06 +02:00
|
|
|
if (sum > total)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
return 0;
|
2008-01-25 21:08:30 +01:00
|
|
|
}
|
|
|
|
|
2008-08-19 12:33:06 +02:00
|
|
|
/*
 * Run tg_rt_schedulable() over the task group tree, under
 * rcu_read_lock(), as if @tg had the given @period and @runtime.
 * Returns whatever walk_tg_tree() returns.
 */
static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
{
	struct rt_schedulable_data req = {
		.tg = tg,
		.rt_period = period,
		.rt_runtime = runtime,
	};
	int err;

	rcu_read_lock();
	err = walk_tg_tree(tg_rt_schedulable, tg_nop, &req);
	rcu_read_unlock();

	return err;
}
|
|
|
|
|
2011-07-21 09:43:28 -07:00
|
|
|
/*
 * tg_set_rt_bandwidth - validate and install new RT limits for @tg.
 * @tg: group to update.
 * @rt_period: new period in ns.
 * @rt_runtime: new runtime in ns.
 *
 * Returns 0 on success or the error from __rt_schedulable(), in which
 * case nothing is modified.
 */
static int tg_set_rt_bandwidth(struct task_group *tg,
		u64 rt_period, u64 rt_runtime)
{
	int i, err = 0;

	/* Lock order: rt_constraints_mutex, then tasklist_lock (read). */
	mutex_lock(&rt_constraints_mutex);
	read_lock(&tasklist_lock);
	err = __rt_schedulable(tg, rt_period, rt_runtime);
	if (err)
		goto unlock;

	/*
	 * The group-wide rt_runtime_lock is held, with irqs off, around the
	 * per-cpu rt_rq locks taken in the loop below.
	 */
	raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
	tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
	tg->rt_bandwidth.rt_runtime = rt_runtime;

	/* Copy the new runtime into the group's rt_rq on every possible cpu. */
	for_each_possible_cpu(i) {
		struct rt_rq *rt_rq = tg->rt_rq[i];

		raw_spin_lock(&rt_rq->rt_runtime_lock);
		rt_rq->rt_runtime = rt_runtime;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
	}
	raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
unlock:
	read_unlock(&tasklist_lock);
	mutex_unlock(&rt_constraints_mutex);

	return err;
}
|
|
|
|
|
2008-04-19 19:44:57 +02:00
|
|
|
int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
|
|
|
|
{
|
|
|
|
u64 rt_runtime, rt_period;
|
|
|
|
|
|
|
|
rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
|
|
|
|
rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
|
|
|
|
if (rt_runtime_us < 0)
|
|
|
|
rt_runtime = RUNTIME_INF;
|
|
|
|
|
2011-07-21 09:43:28 -07:00
|
|
|
return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
|
2008-04-19 19:44:57 +02:00
|
|
|
}
|
|
|
|
|
2008-02-13 15:45:39 +01:00
|
|
|
long sched_group_rt_runtime(struct task_group *tg)
|
|
|
|
{
|
|
|
|
u64 rt_runtime_us;
|
|
|
|
|
2008-04-19 19:44:57 +02:00
|
|
|
if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
|
2008-02-13 15:45:39 +01:00
|
|
|
return -1;
|
|
|
|
|
2008-04-19 19:44:57 +02:00
|
|
|
rt_runtime_us = tg->rt_bandwidth.rt_runtime;
|
2008-02-13 15:45:39 +01:00
|
|
|
do_div(rt_runtime_us, NSEC_PER_USEC);
|
|
|
|
return rt_runtime_us;
|
|
|
|
}
|
2008-04-19 19:44:57 +02:00
|
|
|
|
|
|
|
int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
|
|
|
|
{
|
|
|
|
u64 rt_runtime, rt_period;
|
|
|
|
|
|
|
|
rt_period = (u64)rt_period_us * NSEC_PER_USEC;
|
|
|
|
rt_runtime = tg->rt_bandwidth.rt_runtime;
|
|
|
|
|
2008-06-26 18:54:09 +02:00
|
|
|
if (rt_period == 0)
|
|
|
|
return -EINVAL;
|
|
|
|
|
2011-07-21 09:43:28 -07:00
|
|
|
return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
|
2008-04-19 19:44:57 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
long sched_group_rt_period(struct task_group *tg)
|
|
|
|
{
|
|
|
|
u64 rt_period_us;
|
|
|
|
|
|
|
|
rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
|
|
|
|
do_div(rt_period_us, NSEC_PER_USEC);
|
|
|
|
return rt_period_us;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int sched_rt_global_constraints(void)
|
|
|
|
{
|
2008-09-23 15:33:44 +02:00
|
|
|
u64 runtime, period;
|
2008-04-19 19:44:57 +02:00
|
|
|
int ret = 0;
|
|
|
|
|
2008-09-10 17:00:19 -07:00
|
|
|
if (sysctl_sched_rt_period <= 0)
|
|
|
|
return -EINVAL;
|
|
|
|
|
2008-09-23 15:33:44 +02:00
|
|
|
runtime = global_rt_runtime();
|
|
|
|
period = global_rt_period();
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Sanity check on the sysctl variables.
|
|
|
|
*/
|
|
|
|
if (runtime > period && runtime != RUNTIME_INF)
|
|
|
|
return -EINVAL;
|
2008-06-19 14:22:27 +02:00
|
|
|
|
2008-04-19 19:44:57 +02:00
|
|
|
mutex_lock(&rt_constraints_mutex);
|
2008-08-19 12:33:06 +02:00
|
|
|
read_lock(&tasklist_lock);
|
2008-09-23 15:33:44 +02:00
|
|
|
ret = __rt_schedulable(NULL, 0, 0);
|
2008-08-19 12:33:06 +02:00
|
|
|
read_unlock(&tasklist_lock);
|
2008-04-19 19:44:57 +02:00
|
|
|
mutex_unlock(&rt_constraints_mutex);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
2009-02-27 15:13:54 +05:30
|
|
|
|
|
|
|
int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
|
|
|
|
{
|
|
|
|
/* Don't accept realtime tasks when there is no way for them to run */
|
|
|
|
if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2008-05-30 14:23:45 +02:00
|
|
|
#else /* !CONFIG_RT_GROUP_SCHED */
|
2008-04-19 19:44:57 +02:00
|
|
|
static int sched_rt_global_constraints(void)
|
|
|
|
{
|
2008-04-19 19:44:58 +02:00
|
|
|
unsigned long flags;
|
|
|
|
int i;
|
|
|
|
|
2008-09-10 17:00:19 -07:00
|
|
|
if (sysctl_sched_rt_period <= 0)
|
|
|
|
return -EINVAL;
|
|
|
|
|
2009-05-05 17:50:21 +02:00
|
|
|
/*
|
|
|
|
* There's always some RT tasks in the root group
|
|
|
|
* -- migration, kstopmachine etc..
|
|
|
|
*/
|
|
|
|
if (sysctl_sched_rt_runtime == 0)
|
|
|
|
return -EBUSY;
|
|
|
|
|
2009-11-17 15:32:06 +01:00
|
|
|
raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
|
2008-04-19 19:44:58 +02:00
|
|
|
for_each_possible_cpu(i) {
|
|
|
|
struct rt_rq *rt_rq = &cpu_rq(i)->rt;
|
|
|
|
|
2009-11-17 15:32:06 +01:00
|
|
|
raw_spin_lock(&rt_rq->rt_runtime_lock);
|
2008-04-19 19:44:58 +02:00
|
|
|
rt_rq->rt_runtime = global_rt_runtime();
|
2009-11-17 15:32:06 +01:00
|
|
|
raw_spin_unlock(&rt_rq->rt_runtime_lock);
|
2008-04-19 19:44:58 +02:00
|
|
|
}
|
2009-11-17 15:32:06 +01:00
|
|
|
raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
|
2008-04-19 19:44:58 +02:00
|
|
|
|
2008-04-19 19:44:57 +02:00
|
|
|
return 0;
|
|
|
|
}
|
2008-05-30 14:23:45 +02:00
|
|
|
#endif /* CONFIG_RT_GROUP_SCHED */
|
2008-04-19 19:44:57 +02:00
|
|
|
|
|
|
|
int sched_rt_handler(struct ctl_table *table, int write,
|
2009-09-23 15:57:19 -07:00
|
|
|
void __user *buffer, size_t *lenp,
|
2008-04-19 19:44:57 +02:00
|
|
|
loff_t *ppos)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
int old_period, old_runtime;
|
|
|
|
static DEFINE_MUTEX(mutex);
|
|
|
|
|
|
|
|
mutex_lock(&mutex);
|
|
|
|
old_period = sysctl_sched_rt_period;
|
|
|
|
old_runtime = sysctl_sched_rt_runtime;
|
|
|
|
|
2009-09-23 15:57:19 -07:00
|
|
|
ret = proc_dointvec(table, write, buffer, lenp, ppos);
|
2008-04-19 19:44:57 +02:00
|
|
|
|
|
|
|
if (!ret && write) {
|
|
|
|
ret = sched_rt_global_constraints();
|
|
|
|
if (ret) {
|
|
|
|
sysctl_sched_rt_period = old_period;
|
|
|
|
sysctl_sched_rt_runtime = old_runtime;
|
|
|
|
} else {
|
|
|
|
def_rt_bandwidth.rt_runtime = global_rt_runtime();
|
|
|
|
def_rt_bandwidth.rt_period =
|
|
|
|
ns_to_ktime(global_rt_period());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
mutex_unlock(&mutex);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
2007-10-18 23:41:03 -07:00
|
|
|
|
2008-02-13 15:45:40 +01:00
|
|
|
#ifdef CONFIG_CGROUP_SCHED
|
2007-10-18 23:41:03 -07:00
|
|
|
|
|
|
|
/* return corresponding task_group object of a cgroup */
|
2007-10-24 18:23:50 +02:00
|
|
|
static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
|
2007-10-18 23:41:03 -07:00
|
|
|
{
|
2007-10-24 18:23:50 +02:00
|
|
|
return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
|
|
|
|
struct task_group, css);
|
2007-10-18 23:41:03 -07:00
|
|
|
}
|
|
|
|
|
2012-01-31 13:47:36 +08:00
|
|
|
static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp)
|
2007-10-18 23:41:03 -07:00
|
|
|
{
|
2008-04-19 19:44:59 +02:00
|
|
|
struct task_group *tg, *parent;
|
2007-10-18 23:41:03 -07:00
|
|
|
|
2007-10-24 18:23:50 +02:00
|
|
|
if (!cgrp->parent) {
|
2007-10-18 23:41:03 -07:00
|
|
|
/* This is early initialization for the top cgroup */
|
2011-01-07 15:17:36 +08:00
|
|
|
return &root_task_group.css;
|
2007-10-18 23:41:03 -07:00
|
|
|
}
|
|
|
|
|
2008-04-19 19:44:59 +02:00
|
|
|
parent = cgroup_tg(cgrp->parent);
|
|
|
|
tg = sched_create_group(parent);
|
2007-10-18 23:41:03 -07:00
|
|
|
if (IS_ERR(tg))
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
|
|
|
return &tg->css;
|
|
|
|
}
|
|
|
|
|
2012-01-31 13:47:36 +08:00
|
|
|
/* Tear down the task group that belongs to @cgrp. */
static void cpu_cgroup_destroy(struct cgroup *cgrp)
{
	sched_destroy_group(cgroup_tg(cgrp));
}
|
|
|
|
|
2012-01-31 13:47:36 +08:00
|
|
|
static int cpu_cgroup_can_attach(struct cgroup *cgrp,
|
2011-12-12 18:12:21 -08:00
|
|
|
struct cgroup_taskset *tset)
|
2007-10-18 23:41:03 -07:00
|
|
|
{
|
2011-12-12 18:12:21 -08:00
|
|
|
struct task_struct *task;
|
|
|
|
|
|
|
|
cgroup_taskset_for_each(task, cgrp, tset) {
|
2008-02-13 15:45:40 +01:00
|
|
|
#ifdef CONFIG_RT_GROUP_SCHED
|
2011-12-12 18:12:21 -08:00
|
|
|
if (!sched_rt_can_attach(cgroup_tg(cgrp), task))
|
|
|
|
return -EINVAL;
|
2008-02-13 15:45:40 +01:00
|
|
|
#else
|
2011-12-12 18:12:21 -08:00
|
|
|
/* We don't support RT-tasks being in separate groups */
|
|
|
|
if (task->sched_class != &fair_sched_class)
|
|
|
|
return -EINVAL;
|
2008-02-13 15:45:40 +01:00
|
|
|
#endif
|
2011-12-12 18:12:21 -08:00
|
|
|
}
|
2009-09-23 15:56:31 -07:00
|
|
|
return 0;
|
|
|
|
}
|
2007-10-18 23:41:03 -07:00
|
|
|
|
2012-01-31 13:47:36 +08:00
|
|
|
static void cpu_cgroup_attach(struct cgroup *cgrp,
|
2011-12-12 18:12:21 -08:00
|
|
|
struct cgroup_taskset *tset)
|
2007-10-18 23:41:03 -07:00
|
|
|
{
|
2011-12-12 18:12:21 -08:00
|
|
|
struct task_struct *task;
|
|
|
|
|
|
|
|
cgroup_taskset_for_each(task, cgrp, tset)
|
|
|
|
sched_move_task(task);
|
2007-10-18 23:41:03 -07:00
|
|
|
}
|
|
|
|
|
2011-01-19 12:26:11 +01:00
|
|
|
static void
|
2012-01-31 13:47:36 +08:00
|
|
|
cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
|
|
|
|
struct task_struct *task)
|
2011-01-19 12:26:11 +01:00
|
|
|
{
|
|
|
|
/*
|
|
|
|
* cgroup_exit() is called in the copy_process() failure path.
|
|
|
|
* Ignore this case since the task hasn't ran yet, this avoids
|
|
|
|
* trying to poke a half freed task state from generic code.
|
|
|
|
*/
|
|
|
|
if (!(task->flags & PF_EXITING))
|
|
|
|
return;
|
|
|
|
|
|
|
|
sched_move_task(task);
|
|
|
|
}
|
|
|
|
|
2008-02-13 15:45:40 +01:00
|
|
|
#ifdef CONFIG_FAIR_GROUP_SCHED
|
2008-04-29 00:59:56 -07:00
|
|
|
static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
|
2007-10-24 18:23:50 +02:00
|
|
|
u64 shareval)
|
2007-10-18 23:41:03 -07:00
|
|
|
{
|
sched: Increase SCHED_LOAD_SCALE resolution
Introduce SCHED_LOAD_RESOLUTION, which scales is added to
SCHED_LOAD_SHIFT and increases the resolution of
SCHED_LOAD_SCALE. This patch sets the value of
SCHED_LOAD_RESOLUTION to 10, scaling up the weights for all
sched entities by a factor of 1024. With this extra resolution,
we can handle deeper cgroup hiearchies and the scheduler can do
better shares distribution and load load balancing on larger
systems (especially for low weight task groups).
This does not change the existing user interface, the scaled
weights are only used internally. We do not modify
prio_to_weight values or inverses, but use the original weights
when calculating the inverse which is used to scale execution
time delta in calc_delta_mine(). This ensures we do not lose
accuracy when accounting time to the sched entities. Thanks to
Nikunj Dadhania for fixing an bug in c_d_m() that broken fairness.
Below is some analysis of the performance costs/improvements of
this patch.
1. Micro-arch performance costs:
Experiment was to run Ingo's pipe_test_100k 200 times with the
task pinned to one cpu. I measured instruction, cycles and
stalled-cycles for the runs. See:
http://thread.gmane.org/gmane.linux.kernel/1129232/focus=1129389
for more info.
-tip (baseline):
Performance counter stats for '/root/load-scale/pipe-test-100k' (200 runs):
964,991,769 instructions # 0.82 insns per cycle
# 0.33 stalled cycles per insn
# ( +- 0.05% )
1,171,186,635 cycles # 0.000 GHz ( +- 0.08% )
306,373,664 stalled-cycles-backend # 26.16% backend cycles idle ( +- 0.28% )
314,933,621 stalled-cycles-frontend # 26.89% frontend cycles idle ( +- 0.34% )
1.122405684 seconds time elapsed ( +- 0.05% )
-tip+patches:
Performance counter stats for './load-scale/pipe-test-100k' (200 runs):
963,624,821 instructions # 0.82 insns per cycle
# 0.33 stalled cycles per insn
# ( +- 0.04% )
1,175,215,649 cycles # 0.000 GHz ( +- 0.08% )
315,321,126 stalled-cycles-backend # 26.83% backend cycles idle ( +- 0.28% )
316,835,873 stalled-cycles-frontend # 26.96% frontend cycles idle ( +- 0.29% )
1.122238659 seconds time elapsed ( +- 0.06% )
With this patch, instructions decrease by ~0.10% and cycles
increase by 0.27%. This doesn't look statistically significant.
The number of stalled cycles in the backend increased from
26.16% to 26.83%. This can be attributed to the shifts we do in
c_d_m() and other places. The fraction of stalled cycles in the
frontend remains about the same, at 26.96% compared to 26.89% in -tip.
2. Balancing low-weight task groups
Test setup: run 50 tasks with random sleep/busy times (biased
around 100ms) in a low weight container (with cpu.shares = 2).
Measure %idle as reported by mpstat over a 10s window.
-tip (baseline):
06:47:48 PM CPU %usr %nice %sys %iowait %irq %soft %steal %guest %idle intr/s
06:47:49 PM all 94.32 0.00 0.06 0.00 0.00 0.00 0.00 0.00 5.62 15888.00
06:47:50 PM all 94.57 0.00 0.62 0.00 0.00 0.00 0.00 0.00 4.81 16180.00
06:47:51 PM all 94.69 0.00 0.06 0.00 0.00 0.00 0.00 0.00 5.25 15966.00
06:47:52 PM all 95.81 0.00 0.00 0.00 0.00 0.00 0.00 0.00 4.19 16053.00
06:47:53 PM all 94.88 0.06 0.00 0.00 0.00 0.00 0.00 0.00 5.06 15984.00
06:47:54 PM all 93.31 0.00 0.00 0.00 0.00 0.00 0.00 0.00 6.69 15806.00
06:47:55 PM all 94.19 0.00 0.06 0.00 0.00 0.00 0.00 0.00 5.75 15896.00
06:47:56 PM all 92.87 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.13 15716.00
06:47:57 PM all 94.88 0.00 0.00 0.00 0.00 0.00 0.00 0.00 5.12 15982.00
06:47:58 PM all 95.44 0.00 0.00 0.00 0.00 0.00 0.00 0.00 4.56 16075.00
Average: all 94.49 0.01 0.08 0.00 0.00 0.00 0.00 0.00 5.42 15954.60
-tip+patches:
06:47:03 PM CPU %usr %nice %sys %iowait %irq %soft %steal %guest %idle intr/s
06:47:04 PM all 100.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 16630.00
06:47:05 PM all 99.69 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.31 16580.20
06:47:06 PM all 99.69 0.00 0.06 0.00 0.00 0.00 0.00 0.00 0.25 16596.00
06:47:07 PM all 99.20 0.00 0.74 0.00 0.00 0.06 0.00 0.00 0.00 17838.61
06:47:08 PM all 100.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 16540.00
06:47:09 PM all 100.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 16575.00
06:47:10 PM all 100.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 16614.00
06:47:11 PM all 99.94 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.06 16588.00
06:47:12 PM all 99.94 0.00 0.06 0.00 0.00 0.00 0.00 0.00 0.00 16593.00
06:47:13 PM all 99.94 0.00 0.06 0.00 0.00 0.00 0.00 0.00 0.00 16551.00
Average: all 99.84 0.00 0.09 0.00 0.00 0.01 0.00 0.00 0.06 16711.58
We see an improvement in idle% on the system (drops from 5.42% on -tip to 0.06%
with the patches).
We see an improvement in idle% on the system (drops from 5.42%
on -tip to 0.06% with the patches).
Signed-off-by: Nikhil Rao <ncrao@google.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Nikunj A. Dadhania <nikunj@linux.vnet.ibm.com>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Stephan Barwolf <stephan.baerwolf@tu-ilmenau.de>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/1305754668-18792-1-git-send-email-ncrao@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2011-05-18 14:37:48 -07:00
|
|
|
return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
|
2007-10-18 23:41:03 -07:00
|
|
|
}
|
|
|
|
|
2008-04-29 00:59:56 -07:00
|
|
|
static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
|
2007-10-18 23:41:03 -07:00
|
|
|
{
|
2007-10-24 18:23:50 +02:00
|
|
|
struct task_group *tg = cgroup_tg(cgrp);
|
2007-10-18 23:41:03 -07:00
|
|
|
|
sched: Increase SCHED_LOAD_SCALE resolution
Introduce SCHED_LOAD_RESOLUTION, which is added to
SCHED_LOAD_SHIFT and increases the resolution of
SCHED_LOAD_SCALE. This patch sets the value of
SCHED_LOAD_RESOLUTION to 10, scaling up the weights for all
sched entities by a factor of 1024. With this extra resolution,
we can handle deeper cgroup hierarchies and the scheduler can do
better shares distribution and load balancing on larger
systems (especially for low weight task groups).
This does not change the existing user interface, the scaled
weights are only used internally. We do not modify
prio_to_weight values or inverses, but use the original weights
when calculating the inverse which is used to scale execution
time delta in calc_delta_mine(). This ensures we do not lose
accuracy when accounting time to the sched entities. Thanks to
Nikunj Dadhania for fixing a bug in c_d_m() that broke fairness.
Below is some analysis of the performance costs/improvements of
this patch.
1. Micro-arch performance costs:
Experiment was to run Ingo's pipe_test_100k 200 times with the
task pinned to one cpu. I measured instruction, cycles and
stalled-cycles for the runs. See:
http://thread.gmane.org/gmane.linux.kernel/1129232/focus=1129389
for more info.
-tip (baseline):
Performance counter stats for '/root/load-scale/pipe-test-100k' (200 runs):
964,991,769 instructions # 0.82 insns per cycle
# 0.33 stalled cycles per insn
# ( +- 0.05% )
1,171,186,635 cycles # 0.000 GHz ( +- 0.08% )
306,373,664 stalled-cycles-backend # 26.16% backend cycles idle ( +- 0.28% )
314,933,621 stalled-cycles-frontend # 26.89% frontend cycles idle ( +- 0.34% )
1.122405684 seconds time elapsed ( +- 0.05% )
-tip+patches:
Performance counter stats for './load-scale/pipe-test-100k' (200 runs):
963,624,821 instructions # 0.82 insns per cycle
# 0.33 stalled cycles per insn
# ( +- 0.04% )
1,175,215,649 cycles # 0.000 GHz ( +- 0.08% )
315,321,126 stalled-cycles-backend # 26.83% backend cycles idle ( +- 0.28% )
316,835,873 stalled-cycles-frontend # 26.96% frontend cycles idle ( +- 0.29% )
1.122238659 seconds time elapsed ( +- 0.06% )
With this patch, instructions decrease by ~0.10% and cycles
increase by 0.27%. This doesn't look statistically significant.
The number of stalled cycles in the backend increased from
26.16% to 26.83%. This can be attributed to the shifts we do in
c_d_m() and other places. The fraction of stalled cycles in the
frontend remains about the same, at 26.96% compared to 26.89% in -tip.
2. Balancing low-weight task groups
Test setup: run 50 tasks with random sleep/busy times (biased
around 100ms) in a low weight container (with cpu.shares = 2).
Measure %idle as reported by mpstat over a 10s window.
-tip (baseline):
06:47:48 PM CPU %usr %nice %sys %iowait %irq %soft %steal %guest %idle intr/s
06:47:49 PM all 94.32 0.00 0.06 0.00 0.00 0.00 0.00 0.00 5.62 15888.00
06:47:50 PM all 94.57 0.00 0.62 0.00 0.00 0.00 0.00 0.00 4.81 16180.00
06:47:51 PM all 94.69 0.00 0.06 0.00 0.00 0.00 0.00 0.00 5.25 15966.00
06:47:52 PM all 95.81 0.00 0.00 0.00 0.00 0.00 0.00 0.00 4.19 16053.00
06:47:53 PM all 94.88 0.06 0.00 0.00 0.00 0.00 0.00 0.00 5.06 15984.00
06:47:54 PM all 93.31 0.00 0.00 0.00 0.00 0.00 0.00 0.00 6.69 15806.00
06:47:55 PM all 94.19 0.00 0.06 0.00 0.00 0.00 0.00 0.00 5.75 15896.00
06:47:56 PM all 92.87 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.13 15716.00
06:47:57 PM all 94.88 0.00 0.00 0.00 0.00 0.00 0.00 0.00 5.12 15982.00
06:47:58 PM all 95.44 0.00 0.00 0.00 0.00 0.00 0.00 0.00 4.56 16075.00
Average: all 94.49 0.01 0.08 0.00 0.00 0.00 0.00 0.00 5.42 15954.60
-tip+patches:
06:47:03 PM CPU %usr %nice %sys %iowait %irq %soft %steal %guest %idle intr/s
06:47:04 PM all 100.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 16630.00
06:47:05 PM all 99.69 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.31 16580.20
06:47:06 PM all 99.69 0.00 0.06 0.00 0.00 0.00 0.00 0.00 0.25 16596.00
06:47:07 PM all 99.20 0.00 0.74 0.00 0.00 0.06 0.00 0.00 0.00 17838.61
06:47:08 PM all 100.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 16540.00
06:47:09 PM all 100.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 16575.00
06:47:10 PM all 100.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 16614.00
06:47:11 PM all 99.94 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.06 16588.00
06:47:12 PM all 99.94 0.00 0.06 0.00 0.00 0.00 0.00 0.00 0.00 16593.00
06:47:13 PM all 99.94 0.00 0.06 0.00 0.00 0.00 0.00 0.00 0.00 16551.00
Average: all 99.84 0.00 0.09 0.00 0.00 0.01 0.00 0.00 0.06 16711.58
We see an improvement in idle% on the system (drops from 5.42% on -tip to 0.06%
with the patches).
We see an improvement in idle% on the system (drops from 5.42%
on -tip to 0.06% with the patches).
Signed-off-by: Nikhil Rao <ncrao@google.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Nikunj A. Dadhania <nikunj@linux.vnet.ibm.com>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Stephan Barwolf <stephan.baerwolf@tu-ilmenau.de>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/1305754668-18792-1-git-send-email-ncrao@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2011-05-18 14:37:48 -07:00
|
|
|
return (u64) scale_load_down(tg->shares);
|
2007-10-18 23:41:03 -07:00
|
|
|
}
|
2011-07-21 09:43:28 -07:00
|
|
|
|
|
|
|
#ifdef CONFIG_CFS_BANDWIDTH
|
2011-07-21 09:43:29 -07:00
|
|
|
static DEFINE_MUTEX(cfs_constraints_mutex);
|
|
|
|
|
2011-07-21 09:43:28 -07:00
|
|
|
const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
|
|
|
|
const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
|
|
|
|
|
2011-07-21 09:43:29 -07:00
|
|
|
static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
|
|
|
|
|
2011-07-21 09:43:28 -07:00
|
|
|
/*
 * Install a new period/quota pair (both in nanoseconds) on @tg.
 *
 * Returns -EINVAL for the root task group and for values outside the
 * accepted bounds; otherwise returns the result of __cfs_schedulable(),
 * and the new settings are only applied when that result is 0.
 */
static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
{
	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
	int cpu, ret = 0, runtime_enabled, runtime_was_enabled;

	if (tg == &root_task_group)
		return -EINVAL;

	/*
	 * Ensure we have at least some amount of bandwidth every period.
	 * This is to prevent reaching a state of large arrears when throttled
	 * via entity_tick() resulting in prolonged exit starvation.
	 */
	if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
		return -EINVAL;

	/*
	 * Likewise, bound things on the other side by preventing insane quota
	 * periods.  This also allows us to normalize in computing quota
	 * feasibility.
	 */
	if (period > max_cfs_quota_period)
		return -EINVAL;

	mutex_lock(&cfs_constraints_mutex);

	ret = __cfs_schedulable(tg, period, quota);
	if (ret)
		goto out_unlock;

	runtime_enabled = (quota != RUNTIME_INF);
	runtime_was_enabled = (cfs_b->quota != RUNTIME_INF);
	account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled);

	raw_spin_lock_irq(&cfs_b->lock);
	cfs_b->period = ns_to_ktime(period);
	cfs_b->quota = quota;

	__refill_cfs_bandwidth_runtime(cfs_b);

	/* restart the period timer (if active) to handle new period expiry */
	if (runtime_enabled && cfs_b->timer_active) {
		/* force a reprogram */
		cfs_b->timer_active = 0;
		__start_cfs_bandwidth(cfs_b);
	}
	raw_spin_unlock_irq(&cfs_b->lock);

	/* Push the new state to every per-cpu cfs_rq of the group. */
	for_each_possible_cpu(cpu) {
		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
		struct rq *rq = cfs_rq->rq;

		raw_spin_lock_irq(&rq->lock);
		cfs_rq->runtime_enabled = runtime_enabled;
		cfs_rq->runtime_remaining = 0;

		if (cfs_rq->throttled)
			unthrottle_cfs_rq(cfs_rq);
		raw_spin_unlock_irq(&rq->lock);
	}

out_unlock:
	mutex_unlock(&cfs_constraints_mutex);

	return ret;
}
|
|
|
|
|
|
|
|
int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
|
|
|
|
{
|
|
|
|
u64 quota, period;
|
|
|
|
|
2011-10-25 10:00:11 +02:00
|
|
|
period = ktime_to_ns(tg->cfs_bandwidth.period);
|
2011-07-21 09:43:28 -07:00
|
|
|
if (cfs_quota_us < 0)
|
|
|
|
quota = RUNTIME_INF;
|
|
|
|
else
|
|
|
|
quota = (u64)cfs_quota_us * NSEC_PER_USEC;
|
|
|
|
|
|
|
|
return tg_set_cfs_bandwidth(tg, period, quota);
|
|
|
|
}
|
|
|
|
|
|
|
|
long tg_get_cfs_quota(struct task_group *tg)
|
|
|
|
{
|
|
|
|
u64 quota_us;
|
|
|
|
|
2011-10-25 10:00:11 +02:00
|
|
|
if (tg->cfs_bandwidth.quota == RUNTIME_INF)
|
2011-07-21 09:43:28 -07:00
|
|
|
return -1;
|
|
|
|
|
2011-10-25 10:00:11 +02:00
|
|
|
quota_us = tg->cfs_bandwidth.quota;
|
2011-07-21 09:43:28 -07:00
|
|
|
do_div(quota_us, NSEC_PER_USEC);
|
|
|
|
|
|
|
|
return quota_us;
|
|
|
|
}
|
|
|
|
|
|
|
|
int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
|
|
|
|
{
|
|
|
|
u64 quota, period;
|
|
|
|
|
|
|
|
period = (u64)cfs_period_us * NSEC_PER_USEC;
|
2011-10-25 10:00:11 +02:00
|
|
|
quota = tg->cfs_bandwidth.quota;
|
2011-07-21 09:43:28 -07:00
|
|
|
|
|
|
|
return tg_set_cfs_bandwidth(tg, period, quota);
|
|
|
|
}
|
|
|
|
|
|
|
|
long tg_get_cfs_period(struct task_group *tg)
|
|
|
|
{
|
|
|
|
u64 cfs_period_us;
|
|
|
|
|
2011-10-25 10:00:11 +02:00
|
|
|
cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
|
2011-07-21 09:43:28 -07:00
|
|
|
do_div(cfs_period_us, NSEC_PER_USEC);
|
|
|
|
|
|
|
|
return cfs_period_us;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* cgroup read handler for "cfs_quota_us": forwards to tg_get_cfs_quota(). */
static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
{
	struct task_group *tg = cgroup_tg(cgrp);

	return tg_get_cfs_quota(tg);
}
|
|
|
|
|
|
|
|
static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
|
|
|
|
s64 cfs_quota_us)
|
|
|
|
{
|
|
|
|
return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* cgroup read handler for "cfs_period_us": forwards to tg_get_cfs_period(). */
static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
{
	struct task_group *tg = cgroup_tg(cgrp);

	return tg_get_cfs_period(tg);
}
|
|
|
|
|
|
|
|
static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
|
|
|
|
u64 cfs_period_us)
|
|
|
|
{
|
|
|
|
return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
|
|
|
|
}
|
|
|
|
|
2011-07-21 09:43:29 -07:00
|
|
|
struct cfs_schedulable_data {
|
|
|
|
struct task_group *tg;
|
|
|
|
u64 period, quota;
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* normalize group quota/period to be quota/max_period
|
|
|
|
* note: units are usecs
|
|
|
|
*/
|
|
|
|
static u64 normalize_cfs_quota(struct task_group *tg,
|
|
|
|
struct cfs_schedulable_data *d)
|
|
|
|
{
|
|
|
|
u64 quota, period;
|
|
|
|
|
|
|
|
if (tg == d->tg) {
|
|
|
|
period = d->period;
|
|
|
|
quota = d->quota;
|
|
|
|
} else {
|
|
|
|
period = tg_get_cfs_period(tg);
|
|
|
|
quota = tg_get_cfs_quota(tg);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* note: these should typically be equivalent */
|
|
|
|
if (quota == RUNTIME_INF || quota == -1)
|
|
|
|
return RUNTIME_INF;
|
|
|
|
|
|
|
|
return to_ratio(period, quota);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
|
|
|
|
{
|
|
|
|
struct cfs_schedulable_data *d = data;
|
2011-10-25 10:00:11 +02:00
|
|
|
struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
|
2011-07-21 09:43:29 -07:00
|
|
|
s64 quota = 0, parent_quota = -1;
|
|
|
|
|
|
|
|
if (!tg->parent) {
|
|
|
|
quota = RUNTIME_INF;
|
|
|
|
} else {
|
2011-10-25 10:00:11 +02:00
|
|
|
struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
|
2011-07-21 09:43:29 -07:00
|
|
|
|
|
|
|
quota = normalize_cfs_quota(tg, d);
|
|
|
|
parent_quota = parent_b->hierarchal_quota;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* ensure max(child_quota) <= parent_quota, inherit when no
|
|
|
|
* limit is set
|
|
|
|
*/
|
|
|
|
if (quota == RUNTIME_INF)
|
|
|
|
quota = parent_quota;
|
|
|
|
else if (parent_quota != RUNTIME_INF && quota > parent_quota)
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
cfs_b->hierarchal_quota = quota;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Check whether giving @tg the proposed @period and @quota (nanoseconds)
 * keeps the task-group tree consistent.  Returns the result of the tree
 * walk with tg_cfs_schedulable_down().
 */
static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
{
	struct cfs_schedulable_data data = {
		.tg = tg,
		.period = period,
		.quota = quota,
	};
	int ret;

	/* The walk works in usecs; an infinite quota is passed through as is. */
	if (quota != RUNTIME_INF) {
		do_div(data.period, NSEC_PER_USEC);
		do_div(data.quota, NSEC_PER_USEC);
	}

	rcu_read_lock();
	ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
	rcu_read_unlock();

	return ret;
}
|
2011-07-21 09:43:40 -07:00
|
|
|
|
|
|
|
static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
|
|
|
|
struct cgroup_map_cb *cb)
|
|
|
|
{
|
|
|
|
struct task_group *tg = cgroup_tg(cgrp);
|
2011-10-25 10:00:11 +02:00
|
|
|
struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
|
2011-07-21 09:43:40 -07:00
|
|
|
|
|
|
|
cb->fill(cb, "nr_periods", cfs_b->nr_periods);
|
|
|
|
cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
|
|
|
|
cb->fill(cb, "throttled_time", cfs_b->throttled_time);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
2011-07-21 09:43:28 -07:00
|
|
|
#endif /* CONFIG_CFS_BANDWIDTH */
|
2008-05-30 14:23:45 +02:00
|
|
|
#endif /* CONFIG_FAIR_GROUP_SCHED */
|
2007-10-18 23:41:03 -07:00
|
|
|
|
2008-02-13 15:45:40 +01:00
|
|
|
#ifdef CONFIG_RT_GROUP_SCHED
|
2008-05-14 16:05:46 -07:00
|
|
|
static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
|
2008-04-29 01:00:06 -07:00
|
|
|
s64 val)
|
2008-01-25 21:08:30 +01:00
|
|
|
{
|
2008-04-29 01:00:06 -07:00
|
|
|
return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
|
2008-01-25 21:08:30 +01:00
|
|
|
}
|
|
|
|
|
2008-04-29 01:00:06 -07:00
|
|
|
/* cgroup read handler for "rt_runtime_us". */
static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
{
	struct task_group *tg = cgroup_tg(cgrp);

	return sched_group_rt_runtime(tg);
}
|
2008-04-19 19:44:57 +02:00
|
|
|
|
|
|
|
static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
|
|
|
|
u64 rt_period_us)
|
|
|
|
{
|
|
|
|
return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* cgroup read handler for "rt_period_us". */
static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
{
	struct task_group *tg = cgroup_tg(cgrp);

	return sched_group_rt_period(tg);
}
|
2008-05-30 14:23:45 +02:00
|
|
|
#endif /* CONFIG_RT_GROUP_SCHED */
|
2008-01-25 21:08:30 +01:00
|
|
|
|
2007-10-29 21:18:11 +01:00
|
|
|
/*
 * Control files of the "cpu" cgroup subsystem.  Which entries exist
 * depends on the group-scheduling options the kernel was built with.
 */
static struct cftype cpu_files[] = {
#ifdef CONFIG_FAIR_GROUP_SCHED
	{
		.name		= "shares",
		.read_u64	= cpu_shares_read_u64,
		.write_u64	= cpu_shares_write_u64,
	},
#endif
#ifdef CONFIG_CFS_BANDWIDTH
	{
		.name		= "cfs_quota_us",
		.read_s64	= cpu_cfs_quota_read_s64,
		.write_s64	= cpu_cfs_quota_write_s64,
	},
	{
		.name		= "cfs_period_us",
		.read_u64	= cpu_cfs_period_read_u64,
		.write_u64	= cpu_cfs_period_write_u64,
	},
	{
		.name		= "stat",
		.read_map	= cpu_stats_show,
	},
#endif
#ifdef CONFIG_RT_GROUP_SCHED
	{
		.name		= "rt_runtime_us",
		.read_s64	= cpu_rt_runtime_read,
		.write_s64	= cpu_rt_runtime_write,
	},
	{
		.name		= "rt_period_us",
		.read_u64	= cpu_rt_period_read_uint,
		.write_u64	= cpu_rt_period_write_uint,
	},
#endif
	{ }	/* terminate */
};
|
|
|
|
|
|
|
|
struct cgroup_subsys cpu_cgroup_subsys = {
|
2007-10-29 21:18:11 +01:00
|
|
|
.name = "cpu",
|
|
|
|
.create = cpu_cgroup_create,
|
|
|
|
.destroy = cpu_cgroup_destroy,
|
2011-12-12 18:12:21 -08:00
|
|
|
.can_attach = cpu_cgroup_can_attach,
|
|
|
|
.attach = cpu_cgroup_attach,
|
2011-01-19 12:26:11 +01:00
|
|
|
.exit = cpu_cgroup_exit,
|
2007-10-29 21:18:11 +01:00
|
|
|
.subsys_id = cpu_cgroup_subsys_id,
|
2012-04-01 12:09:55 -07:00
|
|
|
.base_cftypes = cpu_files,
|
2007-10-18 23:41:03 -07:00
|
|
|
.early_init = 1,
|
|
|
|
};
|
|
|
|
|
2008-02-13 15:45:40 +01:00
|
|
|
#endif /* CONFIG_CGROUP_SCHED */
|
2007-12-02 20:04:49 +01:00
|
|
|
|
|
|
|
#ifdef CONFIG_CGROUP_CPUACCT
|
|
|
|
|
|
|
|
/*
|
|
|
|
* CPU accounting code for task groups.
|
|
|
|
*
|
|
|
|
* Based on the work by Paul Menage (menage@google.com) and Balbir Singh
|
|
|
|
* (balbir@in.ibm.com).
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* create a new cpu accounting group */
|
2012-01-31 13:47:36 +08:00
|
|
|
static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp)
|
2007-12-02 20:04:49 +01:00
|
|
|
{
|
2011-11-28 14:45:19 -02:00
|
|
|
struct cpuacct *ca;
|
2007-12-02 20:04:49 +01:00
|
|
|
|
2011-11-28 14:45:19 -02:00
|
|
|
if (!cgrp->parent)
|
|
|
|
return &root_cpuacct.css;
|
|
|
|
|
|
|
|
ca = kzalloc(sizeof(*ca), GFP_KERNEL);
|
2007-12-02 20:04:49 +01:00
|
|
|
if (!ca)
|
2009-03-31 10:02:22 +05:30
|
|
|
goto out;
|
2007-12-02 20:04:49 +01:00
|
|
|
|
|
|
|
ca->cpuusage = alloc_percpu(u64);
|
2009-03-31 10:02:22 +05:30
|
|
|
if (!ca->cpuusage)
|
|
|
|
goto out_free_ca;
|
|
|
|
|
2011-11-28 14:45:19 -02:00
|
|
|
ca->cpustat = alloc_percpu(struct kernel_cpustat);
|
|
|
|
if (!ca->cpustat)
|
|
|
|
goto out_free_cpuusage;
|
2008-11-10 20:41:13 +05:30
|
|
|
|
2007-12-02 20:04:49 +01:00
|
|
|
return &ca->css;
|
2009-03-31 10:02:22 +05:30
|
|
|
|
2011-11-28 14:45:19 -02:00
|
|
|
out_free_cpuusage:
|
2009-03-31 10:02:22 +05:30
|
|
|
free_percpu(ca->cpuusage);
|
|
|
|
out_free_ca:
|
|
|
|
kfree(ca);
|
|
|
|
out:
|
|
|
|
return ERR_PTR(-ENOMEM);
|
2007-12-02 20:04:49 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
/* destroy an existing cpu accounting group */
|
2012-01-31 13:47:36 +08:00
|
|
|
static void cpuacct_destroy(struct cgroup *cgrp)
|
2007-12-02 20:04:49 +01:00
|
|
|
{
|
2008-02-29 10:02:43 +05:30
|
|
|
struct cpuacct *ca = cgroup_ca(cgrp);
|
2007-12-02 20:04:49 +01:00
|
|
|
|
2011-11-28 14:45:19 -02:00
|
|
|
free_percpu(ca->cpustat);
|
2007-12-02 20:04:49 +01:00
|
|
|
free_percpu(ca->cpuusage);
|
|
|
|
kfree(ca);
|
|
|
|
}
|
|
|
|
|
2008-12-15 22:02:01 -08:00
|
|
|
static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
|
|
|
|
{
|
2009-02-20 16:29:08 +09:00
|
|
|
u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
|
2008-12-15 22:02:01 -08:00
|
|
|
u64 data;
|
|
|
|
|
|
|
|
#ifndef CONFIG_64BIT
|
|
|
|
/*
|
|
|
|
* Take rq->lock to make 64-bit read safe on 32-bit platforms.
|
|
|
|
*/
|
2009-11-17 14:28:38 +01:00
|
|
|
raw_spin_lock_irq(&cpu_rq(cpu)->lock);
|
2008-12-15 22:02:01 -08:00
|
|
|
data = *cpuusage;
|
2009-11-17 14:28:38 +01:00
|
|
|
raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
|
2008-12-15 22:02:01 -08:00
|
|
|
#else
|
|
|
|
data = *cpuusage;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
return data;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Store @val into the usage counter of @ca for @cpu. */
static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
{
	u64 *counter = per_cpu_ptr(ca->cpuusage, cpu);

#ifndef CONFIG_64BIT
	/*
	 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
	 */
	raw_spin_lock_irq(&cpu_rq(cpu)->lock);
#endif
	*counter = val;
#ifndef CONFIG_64BIT
	raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
#endif
}
|
|
|
|
|
2007-12-02 20:04:49 +01:00
|
|
|
/* return total cpu usage (in nanoseconds) of a group */
|
2008-02-29 10:02:43 +05:30
|
|
|
static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
|
2007-12-02 20:04:49 +01:00
|
|
|
{
|
2008-02-29 10:02:43 +05:30
|
|
|
struct cpuacct *ca = cgroup_ca(cgrp);
|
2007-12-02 20:04:49 +01:00
|
|
|
u64 totalcpuusage = 0;
|
|
|
|
int i;
|
|
|
|
|
2008-12-15 22:02:01 -08:00
|
|
|
for_each_present_cpu(i)
|
|
|
|
totalcpuusage += cpuacct_cpuusage_read(ca, i);
|
2007-12-02 20:04:49 +01:00
|
|
|
|
|
|
|
return totalcpuusage;
|
|
|
|
}
|
|
|
|
|
2008-02-29 10:02:44 +05:30
|
|
|
static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
|
|
|
|
u64 reset)
|
|
|
|
{
|
|
|
|
struct cpuacct *ca = cgroup_ca(cgrp);
|
|
|
|
int err = 0;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
if (reset) {
|
|
|
|
err = -EINVAL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2008-12-15 22:02:01 -08:00
|
|
|
for_each_present_cpu(i)
|
|
|
|
cpuacct_cpuusage_write(ca, i, 0);
|
2008-02-29 10:02:44 +05:30
|
|
|
|
|
|
|
out:
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2008-12-15 22:04:15 -08:00
|
|
|
static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
|
|
|
|
struct seq_file *m)
|
|
|
|
{
|
|
|
|
struct cpuacct *ca = cgroup_ca(cgroup);
|
|
|
|
u64 percpu;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for_each_present_cpu(i) {
|
|
|
|
percpu = cpuacct_cpuusage_read(ca, i);
|
|
|
|
seq_printf(m, "%llu ", (unsigned long long) percpu);
|
|
|
|
}
|
|
|
|
seq_printf(m, "\n");
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2009-03-31 10:02:22 +05:30
|
|
|
static const char *cpuacct_stat_desc[] = {
|
|
|
|
[CPUACCT_STAT_USER] = "user",
|
|
|
|
[CPUACCT_STAT_SYSTEM] = "system",
|
|
|
|
};
|
|
|
|
|
|
|
|
static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
|
2011-11-28 14:45:19 -02:00
|
|
|
struct cgroup_map_cb *cb)
|
2009-03-31 10:02:22 +05:30
|
|
|
{
|
|
|
|
struct cpuacct *ca = cgroup_ca(cgrp);
|
2011-11-28 14:45:19 -02:00
|
|
|
int cpu;
|
|
|
|
s64 val = 0;
|
2009-03-31 10:02:22 +05:30
|
|
|
|
2011-11-28 14:45:19 -02:00
|
|
|
for_each_online_cpu(cpu) {
|
|
|
|
struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
|
|
|
|
val += kcpustat->cpustat[CPUTIME_USER];
|
|
|
|
val += kcpustat->cpustat[CPUTIME_NICE];
|
2009-03-31 10:02:22 +05:30
|
|
|
}
|
2011-11-28 14:45:19 -02:00
|
|
|
val = cputime64_to_clock_t(val);
|
|
|
|
cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
|
2009-03-31 10:02:22 +05:30
|
|
|
|
2011-11-28 14:45:19 -02:00
|
|
|
val = 0;
|
|
|
|
for_each_online_cpu(cpu) {
|
|
|
|
struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
|
|
|
|
val += kcpustat->cpustat[CPUTIME_SYSTEM];
|
|
|
|
val += kcpustat->cpustat[CPUTIME_IRQ];
|
|
|
|
val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
|
2009-03-31 10:02:22 +05:30
|
|
|
}
|
2011-11-28 14:45:19 -02:00
|
|
|
|
|
|
|
val = cputime64_to_clock_t(val);
|
|
|
|
cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
|
|
|
|
|
2009-03-31 10:02:22 +05:30
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2007-12-02 20:04:49 +01:00
|
|
|
static struct cftype files[] = {
|
|
|
|
{
|
|
|
|
.name = "usage",
|
2008-04-29 00:59:56 -07:00
|
|
|
.read_u64 = cpuusage_read,
|
|
|
|
.write_u64 = cpuusage_write,
|
2007-12-02 20:04:49 +01:00
|
|
|
},
|
2008-12-15 22:04:15 -08:00
|
|
|
{
|
|
|
|
.name = "usage_percpu",
|
|
|
|
.read_seq_string = cpuacct_percpu_seq_read,
|
|
|
|
},
|
2009-03-31 10:02:22 +05:30
|
|
|
{
|
|
|
|
.name = "stat",
|
|
|
|
.read_map = cpuacct_stats_show,
|
|
|
|
},
|
2012-04-01 12:09:55 -07:00
|
|
|
{ } /* terminate */
|
2007-12-02 20:04:49 +01:00
|
|
|
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* charge this task's execution time to its accounting group.
|
|
|
|
*
|
|
|
|
* called with rq->lock held.
|
|
|
|
*/
|
2011-10-25 10:00:11 +02:00
|
|
|
void cpuacct_charge(struct task_struct *tsk, u64 cputime)
|
2007-12-02 20:04:49 +01:00
|
|
|
{
|
|
|
|
struct cpuacct *ca;
|
2008-11-10 20:41:13 +05:30
|
|
|
int cpu;
|
2007-12-02 20:04:49 +01:00
|
|
|
|
2009-02-26 15:40:15 +08:00
|
|
|
if (unlikely(!cpuacct_subsys.active))
|
2007-12-02 20:04:49 +01:00
|
|
|
return;
|
|
|
|
|
2008-11-10 20:41:13 +05:30
|
|
|
cpu = task_cpu(tsk);
|
2009-03-23 10:02:53 +05:30
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
|
2007-12-02 20:04:49 +01:00
|
|
|
ca = task_ca(tsk);
|
|
|
|
|
2011-11-28 14:45:18 -02:00
|
|
|
for (; ca; ca = parent_ca(ca)) {
|
2009-02-20 16:29:08 +09:00
|
|
|
u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
|
2007-12-02 20:04:49 +01:00
|
|
|
*cpuusage += cputime;
|
|
|
|
}
|
2009-03-23 10:02:53 +05:30
|
|
|
|
|
|
|
rcu_read_unlock();
|
2007-12-02 20:04:49 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
struct cgroup_subsys cpuacct_subsys = {
|
|
|
|
.name = "cpuacct",
|
|
|
|
.create = cpuacct_create,
|
|
|
|
.destroy = cpuacct_destroy,
|
|
|
|
.subsys_id = cpuacct_subsys_id,
|
2012-04-01 12:09:55 -07:00
|
|
|
.base_cftypes = files,
|
2007-12-02 20:04:49 +01:00
|
|
|
};
|
|
|
|
#endif /* CONFIG_CGROUP_CPUACCT */
|