2005-04-16 15:20:36 -07:00
|
|
|
#ifndef _LINUX_TIMER_H
|
|
|
|
#define _LINUX_TIMER_H
|
|
|
|
|
|
|
|
#include <linux/list.h>
|
[PATCH] Add debugging feature /proc/timer_stat
Add /proc/timer_stats support: debugging feature to profile timer expiration.
Both the starting site (process/PID) and the expiration function are captured.
This allows the quick identification of timer event sources in a system.
Sample output:
# echo 1 > /proc/timer_stats
# cat /proc/timer_stats
Timer Stats Version: v0.1
Sample period: 4.010 s
24, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick)
11, 0 swapper sk_reset_timer (tcp_delack_timer)
6, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick)
2, 1 swapper queue_delayed_work_on (delayed_work_timer_fn)
17, 0 swapper hrtimer_restart_sched_tick (hrtimer_sched_tick)
2, 1 swapper queue_delayed_work_on (delayed_work_timer_fn)
4, 2050 pcscd do_nanosleep (hrtimer_wakeup)
5, 4179 sshd sk_reset_timer (tcp_write_timer)
4, 2248 yum-updatesd schedule_timeout (process_timeout)
18, 0 swapper hrtimer_restart_sched_tick (hrtimer_sched_tick)
3, 0 swapper sk_reset_timer (tcp_delack_timer)
1, 1 swapper neigh_table_init_no_netlink (neigh_periodic_timer)
2, 1 swapper e1000_up (e1000_watchdog)
1, 1 init schedule_timeout (process_timeout)
100 total events, 25.24 events/sec
[ cleanups and hrtimers support from Thomas Gleixner <tglx@linutronix.de> ]
[bunk@stusta.de: nr_entries can become static]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Roman Zippel <zippel@linux-m68k.org>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-02-16 01:28:13 -08:00
|
|
|
#include <linux/ktime.h>
|
2005-04-16 15:20:36 -07:00
|
|
|
#include <linux/stddef.h>
|
2008-04-30 00:55:03 -07:00
|
|
|
#include <linux/debugobjects.h>
|
2009-01-29 16:03:20 +01:00
|
|
|
#include <linux/stringify.h>
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2008-01-30 13:30:00 +01:00
|
|
|
struct tvec_base;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
struct timer_list {
|
2010-03-11 14:04:36 -08:00
|
|
|
/*
|
|
|
|
* All fields that change during normal runtime grouped to the
|
|
|
|
* same cacheline
|
|
|
|
*/
|
2015-05-26 22:50:29 +00:00
|
|
|
struct hlist_node entry;
|
|
|
|
unsigned long expires;
|
|
|
|
void (*function)(unsigned long);
|
|
|
|
unsigned long data;
|
|
|
|
u32 flags;
|
2010-03-11 14:04:36 -08:00
|
|
|
|
2009-01-29 16:03:20 +01:00
|
|
|
#ifdef CONFIG_LOCKDEP
|
2015-05-26 22:50:29 +00:00
|
|
|
struct lockdep_map lockdep_map;
|
2009-01-29 16:03:20 +01:00
|
|
|
#endif
|
2005-04-16 15:20:36 -07:00
|
|
|
};
|
|
|
|
|
2009-01-29 16:03:20 +01:00
|
|
|
#ifdef CONFIG_LOCKDEP
/*
 * NB: because we have to copy the lockdep_map, setting the lockdep_map key
 * (second argument) here is required, otherwise it could be initialised to
 * the copy of the lockdep_map later! We use the string "<file>:<line>" as
 * the name of the lockdep_map and a pointer to that string as its key.
 *
 * The expansion ends with a comma so it can be dropped into a designated
 * initializer list; without CONFIG_LOCKDEP it expands to nothing.
 */
#define __TIMER_LOCKDEP_MAP_INITIALIZER(_kn)				\
	.lockdep_map = STATIC_LOCKDEP_MAP_INIT(_kn, &_kn),
#else
#define __TIMER_LOCKDEP_MAP_INITIALIZER(_kn)
#endif
|
|
|
|
|
2010-10-20 15:57:33 -07:00
|
|
|
/*
 * A deferrable timer will work normally when the system is busy, but
 * will not cause a CPU to come out of idle just to service it; instead,
 * the timer will be serviced when the CPU eventually wakes up with a
 * subsequent non-deferrable timer.
 *
 * An irqsafe timer is executed with IRQ disabled and it's safe to wait for
 * the completion of the running instance from IRQ handlers, for example,
 * by calling del_timer_sync().
 *
 * Note: The irq disabled callback execution is a special case for
 * workqueue locking issues. It's not meant for executing random crap
 * with interrupts disabled. Abuse is monitored!
 *
 * Bit layout of the values below: bits 0-17 TIMER_CPUMASK, bit 18
 * TIMER_MIGRATING, bit 19 TIMER_DEFERRABLE, bit 20 TIMER_PINNED,
 * bit 21 TIMER_IRQSAFE.
 */
#define TIMER_CPUMASK		0x0003FFFF
#define TIMER_MIGRATING		0x00040000
#define TIMER_BASEMASK		(TIMER_CPUMASK | TIMER_MIGRATING)
#define TIMER_DEFERRABLE	0x00080000
#define TIMER_PINNED		0x00100000
#define TIMER_IRQSAFE		0x00200000
|
timers: Switch to a non-cascading wheel
The current timer wheel has some drawbacks:
1) Cascading:
Cascading can be an unbound operation and is completely pointless in most
cases because the vast majority of the timer wheel timers are canceled or
rearmed before expiration. (They are used as timeout safeguards, not as
real timers to measure time.)
2) No fast lookup of the next expiring timer:
In NOHZ scenarios the first timer soft interrupt after a long NOHZ period
must fast forward the base time to the current value of jiffies. As we
have no way to find the next expiring timer fast, the code loops linearly
and increments the base time one by one and checks for expired timers
in each step. This causes unbound overhead spikes exactly in the moment
when we should wake up as fast as possible.
After a thorough analysis of real world data gathered on laptops,
workstations, webservers and other machines (thanks Chris!) I came to the
conclusion that the current 'classic' timer wheel implementation can be
modified to address the above issues.
The vast majority of timer wheel timers is canceled or rearmed before
expiry. Most of them are timeouts for networking and other I/O tasks. The
nature of timeouts is to catch the exception from normal operation (TCP ack
timed out, disk does not respond, etc.). For these kinds of timeouts the
accuracy of the timeout is not really a concern. Timeouts are very often
approximate worst-case values and in case the timeout fires, we already
waited for a long time and performance is down the drain already.
The few timers which actually expire can be split into two categories:
1) Short expiry times which expect halfways accurate expiry
2) Long term expiry times are inaccurate today already due to the
batching which is done for NOHZ automatically and also via the
set_timer_slack() API.
So for long term expiry timers we can avoid the cascading property and just
leave them in the less granular outer wheels until expiry or
cancelation. Timers which are armed with a timeout larger than the wheel
capacity are no longer cascaded. We expire them with the longest possible
timeout (6+ days). We have not observed such timeouts in our data collection,
but at least we handle them, applying the rule of the least surprise.
To avoid extending the wheel levels for HZ=1000 so we can accommodate the
longest observed timeouts (5 days in the network conntrack code) we reduce the
first level granularity on HZ=1000 to 4ms, which effectively is the same as
the HZ=250 behaviour. From our data analysis there is nothing which relies on
that 1ms granularity and as a side effect we get better batching and timer
locality for the networking code as well.
Contrary to the classic wheel the granularity of the next wheel is not the
capacity of the first wheel. The granularities of the wheels are in the
currently chosen setting 8 times the granularity of the previous wheel.
So for HZ=250 we end up with the following granularity levels:
Level Offset Granularity Range
0 0 4 ms 0 ms - 252 ms
1 64 32 ms 256 ms - 2044 ms (256ms - ~2s)
2 128 256 ms 2048 ms - 16380 ms (~2s - ~16s)
3 192 2048 ms (~2s) 16384 ms - 131068 ms (~16s - ~2m)
4 256 16384 ms (~16s) 131072 ms - 1048572 ms (~2m - ~17m)
5 320 131072 ms (~2m) 1048576 ms - 8388604 ms (~17m - ~2h)
6 384 1048576 ms (~17m) 8388608 ms - 67108863 ms (~2h - ~18h)
7 448 8388608 ms (~2h) 67108864 ms - 536870911 ms (~18h - ~6d)
That's a worst case inaccuracy of 12.5% for the timers which are queued at the
beginning of a level.
So the new wheel concept addresses the old issues:
1) Cascading is avoided completely
2) By keeping the timers in the bucket until expiry/cancelation we can track
the buckets which have timers enqueued in a bucket bitmap and therefore can
look up the next expiring timer very fast and O(1).
A further benefit of the concept is that the slack calculation which is done
on every timer start is no longer necessary because the granularity levels
provide natural batching already.
Our extensive testing with various loads did not show any performance
degradation vs. the current wheel implementation.
This patch does not address the 'fast lookup' issue as we wanted to make sure
that there is no regression introduced by the wheel redesign. The
optimizations are in follow up patches.
This patch contains fixes from Anna-Maria Gleixner and Richard Cochran.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Chris Mason <clm@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: George Spelvin <linux@sciencehorizons.net>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: rt@linutronix.de
Link: http://lkml.kernel.org/r/20160704094342.108621834@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-07-04 09:50:30 +00:00
|
|
|
/*
 * TIMER_ARRAYMASK covers bits 22-31, i.e. everything at and above
 * TIMER_ARRAYSHIFT.
 * NOTE(review): presumably these bits hold the timer's wheel array index -
 * confirm against the timer wheel implementation.
 */
#define TIMER_ARRAYSHIFT	22
#define TIMER_ARRAYMASK		0xFFC00000

/* The flag bits included in TIMER_TRACE_FLAGMASK. */
#define TIMER_TRACE_FLAGMASK						\
	(TIMER_MIGRATING | TIMER_DEFERRABLE | TIMER_PINNED | TIMER_IRQSAFE)
|
|
|
|
|
2017-10-04 16:27:05 -07:00
|
|
|
/*
 * __TIMER_INITIALIZER - static initializer for a struct timer_list.
 * @_function: stored in .function
 * @_data:     stored in .data
 * @_flags:    stored in .flags
 *
 * .entry.next is set to TIMER_ENTRY_STATIC; the lockdep map (if configured)
 * is named "<file>:<line>" of the expansion site.
 */
#define __TIMER_INITIALIZER(_function, _data, _flags) {		\
		.entry = { .next = TIMER_ENTRY_STATIC },	\
		.function = (_function),			\
		.data = (_data),				\
		.flags = (_flags),				\
		__TIMER_LOCKDEP_MAP_INITIALIZER(		\
			__FILE__ ":" __stringify(__LINE__))	\
	}
|
|
|
|
|
2017-10-04 16:27:04 -07:00
|
|
|
/*
 * DEFINE_TIMER - define a statically initialized struct timer_list.
 * @_name:     name of the variable to define
 * @_function: callback stored in the timer
 *
 * The data and flags passed to __TIMER_INITIALIZER() are both 0.
 */
#define DEFINE_TIMER(_name, _function)				\
	struct timer_list _name =				\
		__TIMER_INITIALIZER(_function, 0, 0)
|
2005-09-09 13:10:40 -07:00
|
|
|
|
2012-08-08 11:10:27 -07:00
|
|
|
/*
 * init_timer_key - initialize a timer (declaration only; defined elsewhere).
 * @timer: the timer to initialize
 * @flags: timer flags
 * @name:  lockdep name; __init_timer() passes the stringified timer
 *         expression under CONFIG_LOCKDEP and NULL otherwise
 * @key:   lock class key; __init_timer() passes a per-call-site static key
 *         under CONFIG_LOCKDEP and NULL otherwise
 */
void init_timer_key(struct timer_list *timer, unsigned int flags,
		    const char *name, struct lock_class_key *key);
|
2009-01-29 16:03:20 +01:00
|
|
|
|
2012-08-08 11:10:26 -07:00
|
|
|
/*
 * On-stack timer helpers. With CONFIG_DEBUG_OBJECTS_TIMERS both functions
 * are provided out of line; without it destroy_timer_on_stack() is a no-op
 * and init_timer_on_stack_key() simply forwards to init_timer_key().
 */
#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
extern void init_timer_on_stack_key(struct timer_list *timer,
				    unsigned int flags, const char *name,
				    struct lock_class_key *key);
extern void destroy_timer_on_stack(struct timer_list *timer);
#else
static inline void destroy_timer_on_stack(struct timer_list *timer) { }
static inline void init_timer_on_stack_key(struct timer_list *timer,
					   unsigned int flags, const char *name,
					   struct lock_class_key *key)
{
	init_timer_key(timer, flags, name, key);
}
#endif
|
|
|
|
|
2009-01-29 16:03:20 +01:00
|
|
|
/*
 * __init_timer / __init_timer_on_stack - initialize a timer with @_flags.
 *
 * Under CONFIG_LOCKDEP each expansion declares its own static
 * lock_class_key and passes the stringified timer expression as the name,
 * so every call site gets a separate key. Without LOCKDEP, name and key
 * are NULL.
 */
#ifdef CONFIG_LOCKDEP
#define __init_timer(_timer, _flags)					\
	do {								\
		static struct lock_class_key __key;			\
		init_timer_key((_timer), (_flags), #_timer, &__key);	\
	} while (0)

#define __init_timer_on_stack(_timer, _flags)				\
	do {								\
		static struct lock_class_key __key;			\
		init_timer_on_stack_key((_timer), (_flags), #_timer, &__key); \
	} while (0)
#else
#define __init_timer(_timer, _flags)					\
	init_timer_key((_timer), (_flags), NULL, NULL)
#define __init_timer_on_stack(_timer, _flags)				\
	init_timer_on_stack_key((_timer), (_flags), NULL, NULL)
#endif
|
2009-01-29 16:03:20 +01:00
|
|
|
|
2012-08-08 11:10:27 -07:00
|
|
|
/* init_timer - initialize @timer with no flags set. */
#define init_timer(timer)						\
	__init_timer((timer), 0)
|
|
|
|
|
|
|
|
/*
 * __setup_timer - initialize a timer and set its callback and data.
 * @_timer: pointer to the timer (evaluated more than once)
 * @_fn:    stored in ->function
 * @_data:  stored in ->data
 * @_flags: passed to __init_timer()
 */
#define __setup_timer(_timer, _fn, _data, _flags)			\
	do {								\
		__init_timer((_timer), (_flags));			\
		(_timer)->function = (_fn);				\
		(_timer)->data = (_data);				\
	} while (0)
|
|
|
|
|
2012-08-08 11:10:27 -07:00
|
|
|
/*
 * __setup_timer_on_stack - on-stack variant of __setup_timer().
 * @_timer: pointer to the timer (evaluated more than once)
 * @_fn:    stored in ->function
 * @_data:  stored in ->data
 * @_flags: passed to __init_timer_on_stack()
 */
#define __setup_timer_on_stack(_timer, _fn, _data, _flags)		\
	do {								\
		__init_timer_on_stack((_timer), (_flags));		\
		(_timer)->function = (_fn);				\
		(_timer)->data = (_data);				\
	} while (0)
|
|
|
|
|
2012-08-08 11:10:27 -07:00
|
|
|
/*
 * Convenience wrappers around __setup_timer() / __setup_timer_on_stack().
 * They differ only in the flags they pass: none, TIMER_PINNED,
 * TIMER_DEFERRABLE, or both.
 */
#define setup_timer(timer, fn, data)					\
	__setup_timer((timer), (fn), (data), 0)
#define setup_pinned_timer(timer, fn, data)				\
	__setup_timer((timer), (fn), (data), TIMER_PINNED)
#define setup_deferrable_timer(timer, fn, data)				\
	__setup_timer((timer), (fn), (data), TIMER_DEFERRABLE)
#define setup_pinned_deferrable_timer(timer, fn, data)			\
	__setup_timer((timer), (fn), (data), TIMER_DEFERRABLE | TIMER_PINNED)
#define setup_timer_on_stack(timer, fn, data)				\
	__setup_timer_on_stack((timer), (fn), (data), 0)
#define setup_pinned_timer_on_stack(timer, fn, data)			\
	__setup_timer_on_stack((timer), (fn), (data), TIMER_PINNED)
#define setup_deferrable_timer_on_stack(timer, fn, data)		\
	__setup_timer_on_stack((timer), (fn), (data), TIMER_DEFERRABLE)
#define setup_pinned_deferrable_timer_on_stack(timer, fn, data)		\
	__setup_timer_on_stack((timer), (fn), (data), TIMER_DEFERRABLE | TIMER_PINNED)
|
2010-05-10 14:26:20 -07:00
|
|
|
|
timer: Prepare to change timer callback argument type
Modern kernel callback systems pass the structure associated with a
given callback to the callback function. The timer callback remains one
of the legacy cases where an arbitrary unsigned long argument continues
to be passed as the callback argument. This has several problems:
- This bloats the timer_list structure with a normally redundant
.data field.
- No type checking is being performed, forcing callbacks to do
explicit type casts of the unsigned long argument into the object
that was passed, rather than using container_of(), as done in most
of the other callback infrastructure.
- Neighboring buffer overflows can overwrite both the .function and
the .data field, providing attackers with a way to elevate from a buffer
overflow into a simplistic ROP-like mechanism that allows calling
arbitrary functions with a controlled first argument.
- For future Control Flow Integrity work, this creates a unique function
prototype for timer callbacks, instead of allowing them to continue to
be clustered with other void functions that take a single unsigned long
argument.
This adds a new timer initialization API, which will ultimately replace
the existing setup_timer(), setup_{deferrable,pinned,etc}_timer() family,
named timer_setup() (to mirror hrtimer_setup(), making instances of its
use much easier to grep for).
In order to support the migration of existing timers into the new
callback arguments, timer_setup() casts its arguments to the existing
legacy types, and explicitly passes the timer pointer as the legacy
data argument. Once all setup_*timer() callers have been replaced with
timer_setup(), the casts can be removed, and the data argument can be
dropped with the timer expiration code changed to just pass the timer
to the callback directly.
Since the regular pattern of using container_of() during local variable
declaration repeats the need for the variable type declaration
to be included, this adds a helper modeled after other from_*()
helpers that wrap container_of(), named from_timer(). This helper uses
typeof(*variable), removing the type redundancy and minimizing the need
for line wraps in forthcoming conversions from "unsigned long data" to
"struct timer_list *" in the timer callbacks:
-void callback(unsigned long data)
+void callback(struct timer_list *t)
{
- struct some_data_structure *local = (struct some_data_structure *)data;
+ struct some_data_structure *local = from_timer(local, t, timer);
Finally, in order to support the handful of timer users that perform
open-coded assignments of the .function (and .data) fields, provide
cast macros (TIMER_FUNC_TYPE and TIMER_DATA_TYPE) that can be used
temporarily. Once conversion has been completed, these can be globally
trivially removed.
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/20170928133817.GA113410@beast
2017-09-28 06:38:17 -07:00
|
|
|
/*
 * Legacy callback types: the argument type of a timer callback and the
 * matching function-pointer type. They are used as cast targets, e.g.
 * (TIMER_FUNC_TYPE)callback in timer_setup().
 */
#define TIMER_DATA_TYPE		unsigned long
#define TIMER_FUNC_TYPE		void (*)(TIMER_DATA_TYPE)
|
|
|
|
|
timer: Provide wrappers safe for use with LOCKDEP
Under LOCKDEP, the timer lock_class_key (set up in __setup_timer) needs
to be tied to the caller's context, so an inline for timer_setup()
won't work. We do, however, want to keep the inline version around for
argument type checking, though, so this provides macro wrappers in the
LOCKDEP case.
This fixes the case of different timers sharing the same LOCKDEP instance,
and producing a false positive warning:
[ 580.840858] ======================================================
[ 580.842299] WARNING: possible circular locking dependency detected
[ 580.843684] 4.14.0-rc4+ #17 Not tainted
[ 580.844554] ------------------------------------------------------
[ 580.845945] swapper/9/0 is trying to acquire lock:
[ 580.847024] (slock-AF_INET){+.-.}, at: [<ffffffff84ea4c34>] tcp_write_timer+0x24/0xd0
[ 580.848834]
but task is already holding lock:
[ 580.850107] ((timer)#2){+.-.}, at: [<ffffffff846df7c0>] call_timer_fn+0x0/0x300
[ 580.851663]
which lock already depends on the new lock.
[ 580.853439]
the existing dependency chain (in reverse order) is:
[ 580.855311]
-> #1 ((timer)#2){+.-.}:
[ 580.856538] __lock_acquire+0x114d/0x11a0
[ 580.857506] lock_acquire+0xb0/0x1d0
[ 580.858373] del_timer_sync+0x3c/0xb0
[ 580.859260] inet_csk_reqsk_queue_drop+0x7f/0x1b0
...
-> #0 (slock-AF_INET){+.-.}:
[ 580.884980] check_prev_add+0x666/0x700
[ 580.885790] __lock_acquire+0x114d/0x11a0
[ 580.886575] lock_acquire+0xb0/0x1d0
[ 580.887289] _raw_spin_lock+0x2c/0x40
[ 580.888021] tcp_write_timer+0x24/0xd0
...
[ 580.900055] Possible unsafe locking scenario:
[ 580.901043] CPU0 CPU1
[ 580.901797] ---- ----
[ 580.902540] lock((timer)#2);
[ 580.903046] lock(slock-AF_INET);
[ 580.904006] lock((timer)#2);
[ 580.904915] lock(slock-AF_INET);
[ 580.905502]
In this report, del_timer_sync() is from:
inet_csk_reqsk_queue_drop()
reqsk_queue_unlink()
del_timer_sync(&req->rsk_timer)
but tcp_write_timer()'s timer is attached to icsk_retransmit_timer. Both
had the same lock_class_key, since they were using timer_setup(). Switching
to a macro allows for a separate context, avoiding the false positive.
Fixes: 686fef928bba ("timer: Prepare to change timer callback argument type")
Reported-by: Craig Gallek <cgallek@google.com>
Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: netdev@vger.kernel.org
Cc: "David S. Miller" <davem@davemloft.net>
Link: https://lkml.kernel.org/r/20171019202838.GA43223@beast
2017-10-19 13:28:38 -07:00
|
|
|
#ifndef CONFIG_LOCKDEP
|
timer: Prepare to change timer callback argument type
Modern kernel callback systems pass the structure associated with a
given callback to the callback function. The timer callback remains one
of the legacy cases where an arbitrary unsigned long argument continues
to be passed as the callback argument. This has several problems:
- This bloats the timer_list structure with a normally redundant
.data field.
- No type checking is being performed, forcing callbacks to do
explicit type casts of the unsigned long argument into the object
that was passed, rather than using container_of(), as done in most
of the other callback infrastructure.
- Neighboring buffer overflows can overwrite both the .function and
the .data field, providing attackers with a way to elevate from a buffer
overflow into a simplistic ROP-like mechanism that allows calling
arbitrary functions with a controlled first argument.
- For future Control Flow Integrity work, this creates a unique function
prototype for timer callbacks, instead of allowing them to continue to
be clustered with other void functions that take a single unsigned long
argument.
This adds a new timer initialization API, which will ultimately replace
the existing setup_timer(), setup_{deferrable,pinned,etc}_timer() family,
named timer_setup() (to mirror hrtimer_setup(), making instances of its
use much easier to grep for).
In order to support the migration of existing timers into the new
callback arguments, timer_setup() casts its arguments to the existing
legacy types, and explicitly passes the timer pointer as the legacy
data argument. Once all setup_*timer() callers have been replaced with
timer_setup(), the casts can be removed, and the data argument can be
dropped with the timer expiration code changed to just pass the timer
to the callback directly.
Since the regular pattern of using container_of() during local variable
declaration repeats the need for the variable type declaration
to be included, this adds a helper modeled after other from_*()
helpers that wrap container_of(), named from_timer(). This helper uses
typeof(*variable), removing the type redundancy and minimizing the need
for line wraps in forthcoming conversions from "unsigned long data" to
"struct timer_list *" in the timer callbacks:
-void callback(unsigned long data)
+void callback(struct timer_list *t)
{
- struct some_data_structure *local = (struct some_data_structure *)data;
+ struct some_data_structure *local = from_timer(local, t, timer);
Finally, in order to support the handful of timer users that perform
open-coded assignments of the .function (and .data) fields, provide
cast macros (TIMER_FUNC_TYPE and TIMER_DATA_TYPE) that can be used
temporarily. Once conversion has been completed, these can be globally
trivially removed.
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/20170928133817.GA113410@beast
2017-09-28 06:38:17 -07:00
|
|
|
static inline void timer_setup(struct timer_list *timer,
|
|
|
|
void (*callback)(struct timer_list *),
|
|
|
|
unsigned int flags)
|
|
|
|
{
|
|
|
|
__setup_timer(timer, (TIMER_FUNC_TYPE)callback,
|
|
|
|
(TIMER_DATA_TYPE)timer, flags);
|
|
|
|
}
|
|
|
|
|
2017-10-04 16:26:55 -07:00
|
|
|
static inline void timer_setup_on_stack(struct timer_list *timer,
|
|
|
|
void (*callback)(struct timer_list *),
|
|
|
|
unsigned int flags)
|
|
|
|
{
|
|
|
|
__setup_timer_on_stack(timer, (TIMER_FUNC_TYPE)callback,
|
|
|
|
(TIMER_DATA_TYPE)timer, flags);
|
|
|
|
}
|
timer: Provide wrappers safe for use with LOCKDEP
Under LOCKDEP, the timer lock_class_key (set up in __setup_timer) needs
to be tied to the caller's context, so an inline for timer_setup()
won't work. We do, however, want to keep the inline version around for
argument type checking, though, so this provides macro wrappers in the
LOCKDEP case.
This fixes the case of different timers sharing the same LOCKDEP instance,
and producing a false positive warning:
[ 580.840858] ======================================================
[ 580.842299] WARNING: possible circular locking dependency detected
[ 580.843684] 4.14.0-rc4+ #17 Not tainted
[ 580.844554] ------------------------------------------------------
[ 580.845945] swapper/9/0 is trying to acquire lock:
[ 580.847024] (slock-AF_INET){+.-.}, at: [<ffffffff84ea4c34>] tcp_write_timer+0x24/0xd0
[ 580.848834]
but task is already holding lock:
[ 580.850107] ((timer)#2){+.-.}, at: [<ffffffff846df7c0>] call_timer_fn+0x0/0x300
[ 580.851663]
which lock already depends on the new lock.
[ 580.853439]
the existing dependency chain (in reverse order) is:
[ 580.855311]
-> #1 ((timer)#2){+.-.}:
[ 580.856538] __lock_acquire+0x114d/0x11a0
[ 580.857506] lock_acquire+0xb0/0x1d0
[ 580.858373] del_timer_sync+0x3c/0xb0
[ 580.859260] inet_csk_reqsk_queue_drop+0x7f/0x1b0
...
-> #0 (slock-AF_INET){+.-.}:
[ 580.884980] check_prev_add+0x666/0x700
[ 580.885790] __lock_acquire+0x114d/0x11a0
[ 580.886575] lock_acquire+0xb0/0x1d0
[ 580.887289] _raw_spin_lock+0x2c/0x40
[ 580.888021] tcp_write_timer+0x24/0xd0
...
[ 580.900055] Possible unsafe locking scenario:
[ 580.901043] CPU0 CPU1
[ 580.901797] ---- ----
[ 580.902540] lock((timer)#2);
[ 580.903046] lock(slock-AF_INET);
[ 580.904006] lock((timer)#2);
[ 580.904915] lock(slock-AF_INET);
[ 580.905502]
In this report, del_timer_sync() is from:
inet_csk_reqsk_queue_drop()
reqsk_queue_unlink()
del_timer_sync(&req->rsk_timer)
but tcp_write_timer()'s timer is attached to icsk_retransmit_timer. Both
had the same lock_class_key, since they were using timer_setup(). Switching
to a macro allows for a separate context, avoiding the false positive.
Fixes: 686fef928bba ("timer: Prepare to change timer callback argument type")
Reported-by: Craig Gallek <cgallek@google.com>
Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: netdev@vger.kernel.org
Cc: "David S. Miller" <davem@davemloft.net>
Link: https://lkml.kernel.org/r/20171019202838.GA43223@beast
2017-10-19 13:28:38 -07:00
|
|
|
#else
|
|
|
|
/*
 * Under LOCKDEP, the timer lock_class_key (set up in __init_timer) needs
 * to be tied to the caller's context, so an inline (above) won't work. We
 * do want to keep the inline for argument type checking, though.
 *
 * The casts are applied to parenthesized arguments so that an expression
 * argument (e.g. "timers + i") is cast as a whole rather than only its
 * first operand. @timer is evaluated more than once; avoid arguments with
 * side effects.
 */
# define timer_setup(timer, callback, flags)				\
	__setup_timer((timer), (TIMER_FUNC_TYPE)(callback),		\
		      (TIMER_DATA_TYPE)(timer), (flags))
|
|
|
|
/*
 * LOCKDEP variant of timer_setup_on_stack(): a macro so that the
 * lock_class_key declared by __init_timer_on_stack() belongs to the call
 * site. Arguments are parenthesized before casting so expression arguments
 * are cast as a whole. @timer is evaluated more than once.
 */
# define timer_setup_on_stack(timer, callback, flags)			\
	__setup_timer_on_stack((timer), (TIMER_FUNC_TYPE)(callback),	\
			       (TIMER_DATA_TYPE)(timer), (flags))
|
|
|
|
#endif
|
2017-10-04 16:26:55 -07:00
|
|
|
|
timer: Prepare to change timer callback argument type
Modern kernel callback systems pass the structure associated with a
given callback to the callback function. The timer callback remains one
of the legacy cases where an arbitrary unsigned long argument continues
to be passed as the callback argument. This has several problems:
- This bloats the timer_list structure with a normally redundant
.data field.
- No type checking is being performed, forcing callbacks to do
explicit type casts of the unsigned long argument into the object
that was passed, rather than using container_of(), as done in most
of the other callback infrastructure.
- Neighboring buffer overflows can overwrite both the .function and
the .data field, providing attackers with a way to elevate from a buffer
overflow into a simplistic ROP-like mechanism that allows calling
arbitrary functions with a controlled first argument.
- For future Control Flow Integrity work, this creates a unique function
prototype for timer callbacks, instead of allowing them to continue to
be clustered with other void functions that take a single unsigned long
argument.
This adds a new timer initialization API, which will ultimately replace
the existing setup_timer(), setup_{deferrable,pinned,etc}_timer() family,
named timer_setup() (to mirror hrtimer_setup(), making instances of its
use much easier to grep for).
In order to support the migration of existing timers into the new
callback arguments, timer_setup() casts its arguments to the existing
legacy types, and explicitly passes the timer pointer as the legacy
data argument. Once all setup_*timer() callers have been replaced with
timer_setup(), the casts can be removed, and the data argument can be
dropped with the timer expiration code changed to just pass the timer
to the callback directly.
Since the regular pattern of using container_of() during local variable
declaration repeats the need for the variable type declaration
to be included, this adds a helper modeled after other from_*()
helpers that wrap container_of(), named from_timer(). This helper uses
typeof(*variable), removing the type redundancy and minimizing the need
for line wraps in forthcoming conversions from "unsigned long data" to
"struct timer_list *" in the timer callbacks:
-void callback(unsigned long data)
+void callback(struct timer_list *t)
{
- struct some_data_structure *local = (struct some_data_structure *)data;
+ struct some_data_structure *local = from_timer(local, t, timer);
Finally, in order to support the handful of timer users that perform
open-coded assignments of the .function (and .data) fields, provide
cast macros (TIMER_FUNC_TYPE and TIMER_DATA_TYPE) that can be used
temporarily. Once conversion has been completed, these can be globally
trivially removed.
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/20170928133817.GA113410@beast
2017-09-28 06:38:17 -07:00
|
|
|
/*
 * from_timer - get the structure that embeds a timer.
 * @var:             pointer variable whose pointed-to type is the
 *                   containing structure (only its type is used)
 * @callback_timer:  the struct timer_list pointer passed to the callback
 * @timer_fieldname: name of the timer_list member in the structure
 *
 * Wraps container_of(); typical use in a callback:
 *	struct foo *local = from_timer(local, t, timer);
 */
#define from_timer(var, callback_timer, timer_fieldname)		\
	container_of(callback_timer, typeof(*var), timer_fieldname)
|
|
|
|
|
2007-01-26 00:57:09 -08:00
|
|
|
/**
|
2005-04-16 15:20:36 -07:00
|
|
|
* timer_pending - is a timer pending?
|
|
|
|
* @timer: the timer in question
|
|
|
|
*
|
|
|
|
* timer_pending will tell whether a given timer is currently pending,
|
|
|
|
* or not. Callers must ensure serialization wrt. other operations done
|
|
|
|
* to this timer, eg. interrupt contexts, or other CPUs on SMP.
|
|
|
|
*
|
|
|
|
* return value: 1 if the timer is pending, 0 if not.
|
|
|
|
*/
|
|
|
|
static inline int timer_pending(const struct timer_list * timer)
|
|
|
|
{
|
2015-05-26 22:50:28 +00:00
|
|
|
return timer->entry.pprev != NULL;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Timer operations declared here; their definitions are outside this header. */
extern void add_timer_on(struct timer_list *timer, int cpu);
extern int del_timer(struct timer_list *timer);
extern int mod_timer(struct timer_list *timer, unsigned long expires);
extern int mod_timer_pending(struct timer_list *timer, unsigned long expires);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2007-05-29 23:47:39 +02:00
|
|
|
/*
 * The jiffies value which is added to now, when there is no timer
 * in the timer wheel (2^30 - 1):
 */
#define NEXT_TIMER_MAX_DELTA	((1UL << 30) - 1)
|
|
|
|
|
2009-02-18 12:23:29 +01:00
|
|
|
/* Declaration only; add_timer() is defined outside this header. */
extern void add_timer(struct timer_list *timer);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2010-10-20 15:57:31 -07:00
|
|
|
/* Declaration only; try_to_del_timer_sync() is defined outside this header. */
extern int try_to_del_timer_sync(struct timer_list *timer);
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
 * del_timer_sync - with CONFIG_SMP this is a real function defined
 * elsewhere; without it the macro below maps it directly onto del_timer().
 */
#ifdef CONFIG_SMP
extern int del_timer_sync(struct timer_list *timer);
#else
# define del_timer_sync(t)		del_timer(t)
#endif
|
|
|
|
|
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can fire again after
del_timer_sync() has checked all CPUs and before it rechecks
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. Timers are not serialized with respect to themselves.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes its base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 00:08:56 -07:00
|
|
|
/* Alias: expands to del_timer_sync(t), passing the argument through unchanged. */
#define del_singleshot_timer_sync(t) del_timer_sync(t)
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
extern void init_timers(void);
|
|
|
|
extern void run_local_timers(void);
|
2006-03-26 01:38:12 -08:00
|
|
|
struct hrtimer;
|
2007-02-16 01:27:49 -08:00
|
|
|
extern enum hrtimer_restart it_real_fn(struct hrtimer *);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2015-05-26 22:50:33 +00:00
|
|
|
/*
 * Timer-migration declarations; visible only when both CONFIG_SMP and
 * CONFIG_NO_HZ_COMMON are defined.
 */
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
struct ctl_table;	/* forward declaration: used by pointer only below */

extern unsigned int sysctl_timer_migration;
int timer_migration_handler(struct ctl_table *table, int write,
			    void __user *buffer, size_t *lenp, loff_t *ppos);
#endif /* CONFIG_SMP && CONFIG_NO_HZ_COMMON */
|
|
|
|
|
2006-12-10 02:21:24 -08:00
|
|
|
unsigned long __round_jiffies(unsigned long j, int cpu);
|
|
|
|
unsigned long __round_jiffies_relative(unsigned long j, int cpu);
|
|
|
|
unsigned long round_jiffies(unsigned long j);
|
|
|
|
unsigned long round_jiffies_relative(unsigned long j);
|
|
|
|
|
2008-11-06 08:42:48 +01:00
|
|
|
unsigned long __round_jiffies_up(unsigned long j, int cpu);
|
|
|
|
unsigned long __round_jiffies_up_relative(unsigned long j, int cpu);
|
|
|
|
unsigned long round_jiffies_up(unsigned long j);
|
|
|
|
unsigned long round_jiffies_up_relative(unsigned long j);
|
|
|
|
|
2016-07-13 17:16:59 +00:00
|
|
|
/*
 * timers_dead_cpu is a real function only when CONFIG_HOTPLUG_CPU is set;
 * otherwise the name expands to NULL.
 */
#ifndef CONFIG_HOTPLUG_CPU
#define timers_dead_cpu NULL
#else
int timers_dead_cpu(unsigned int cpu);
#endif
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
#endif
|