mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
synced 2025-01-15 02:05:33 +00:00
bc7a34b8b9
Eric reported that the timer_migration sysctl is not really nice performance wise as it needs to check at every timer insertion whether the feature is enabled or not. Further the check does not live in the timer code, so we have an extra function call which checks an extra cache line to figure out that it is disabled. We can do better and store that information in the per cpu (hr)timer bases. I pondered to use a static key, but that's a nightmare to update from the nohz code and the timer base cache line is hot anyway when we select a timer base. The old logic enabled the timer migration unconditionally if CONFIG_NO_HZ was set even if nohz was disabled on the kernel command line. With this modification, we start off with migration disabled. The user visible sysctl is still set to enabled. If the kernel switches to NOHZ migration is enabled, if the user did not disable it via the sysctl prior to the switch. If nohz=off is on the kernel command line, migration stays disabled no matter what. Before: 47.76% hog [.] main 14.84% [kernel] [k] _raw_spin_lock_irqsave 9.55% [kernel] [k] _raw_spin_unlock_irqrestore 6.71% [kernel] [k] mod_timer 6.24% [kernel] [k] lock_timer_base.isra.38 3.76% [kernel] [k] detach_if_pending 3.71% [kernel] [k] del_timer 2.50% [kernel] [k] internal_add_timer 1.51% [kernel] [k] get_nohz_timer_target 1.28% [kernel] [k] __internal_add_timer 0.78% [kernel] [k] timerfn 0.48% [kernel] [k] wake_up_nohz_cpu After: 48.10% hog [.] main 15.25% [kernel] [k] _raw_spin_lock_irqsave 9.76% [kernel] [k] _raw_spin_unlock_irqrestore 6.50% [kernel] [k] mod_timer 6.44% [kernel] [k] lock_timer_base.isra.38 3.87% [kernel] [k] detach_if_pending 3.80% [kernel] [k] del_timer 2.67% [kernel] [k] internal_add_timer 1.33% [kernel] [k] __internal_add_timer 0.73% [kernel] [k] timerfn 0.54% [kernel] [k] wake_up_nohz_cpu Reported-by: Eric Dumazet <edumazet@google.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Paul McKenney <paulmck@linux.vnet.ibm.com> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Viresh Kumar <viresh.kumar@linaro.org> Cc: John Stultz <john.stultz@linaro.org> Cc: Joonwoo Park <joonwoop@codeaurora.org> Cc: Wenbo Wang <wenbo.wang@memblaze.com> Link: http://lkml.kernel.org/r/20150526224512.127050787@linutronix.de Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
99 lines
3.2 KiB
C
99 lines
3.2 KiB
C
#ifndef _SCHED_SYSCTL_H
|
|
#define _SCHED_SYSCTL_H
|
|
|
|
#ifdef CONFIG_DETECT_HUNG_TASK
|
|
extern int sysctl_hung_task_check_count;
|
|
extern unsigned int sysctl_hung_task_panic;
|
|
extern unsigned long sysctl_hung_task_timeout_secs;
|
|
extern int sysctl_hung_task_warnings;
|
|
extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
|
|
void __user *buffer,
|
|
size_t *lenp, loff_t *ppos);
|
|
#else
|
|
/* Avoid need for ifdefs elsewhere in the code */
|
|
enum { sysctl_hung_task_timeout_secs = 0 };
|
|
#endif
|
|
|
|
/*
|
|
* Default maximum number of active map areas, this limits the number of vmas
|
|
* per mm struct. Users can overwrite this number by sysctl but there is a
|
|
* problem.
|
|
*
|
|
* When a program's coredump is generated as ELF format, a section is created
|
|
* per a vma. In ELF, the number of sections is represented in unsigned short.
|
|
* This means the number of sections should be smaller than 65535 at coredump.
|
|
* Because the kernel adds some informative sections to a image of program at
|
|
* generating coredump, we need some margin. The number of extra sections is
|
|
* 1-3 now and depends on arch. We use "5" as safe margin, here.
|
|
*
|
|
* ELF extended numbering allows more than 65535 sections, so 16-bit bound is
|
|
* not a hard limit any more. Although some userspace tools can be surprised by
|
|
* that.
|
|
*/
|
|
#define MAPCOUNT_ELF_CORE_MARGIN (5)
|
|
#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN)
|
|
|
|
extern int sysctl_max_map_count;
|
|
|
|
extern unsigned int sysctl_sched_latency;
|
|
extern unsigned int sysctl_sched_min_granularity;
|
|
extern unsigned int sysctl_sched_wakeup_granularity;
|
|
extern unsigned int sysctl_sched_child_runs_first;
|
|
|
|
enum sched_tunable_scaling {
|
|
SCHED_TUNABLESCALING_NONE,
|
|
SCHED_TUNABLESCALING_LOG,
|
|
SCHED_TUNABLESCALING_LINEAR,
|
|
SCHED_TUNABLESCALING_END,
|
|
};
|
|
extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
|
|
|
|
extern unsigned int sysctl_numa_balancing_scan_delay;
|
|
extern unsigned int sysctl_numa_balancing_scan_period_min;
|
|
extern unsigned int sysctl_numa_balancing_scan_period_max;
|
|
extern unsigned int sysctl_numa_balancing_scan_size;
|
|
|
|
#ifdef CONFIG_SCHED_DEBUG
|
|
extern unsigned int sysctl_sched_migration_cost;
|
|
extern unsigned int sysctl_sched_nr_migrate;
|
|
extern unsigned int sysctl_sched_time_avg;
|
|
extern unsigned int sysctl_sched_shares_window;
|
|
|
|
int sched_proc_update_handler(struct ctl_table *table, int write,
|
|
void __user *buffer, size_t *length,
|
|
loff_t *ppos);
|
|
#endif
|
|
|
|
/*
|
|
* control realtime throttling:
|
|
*
|
|
* /proc/sys/kernel/sched_rt_period_us
|
|
* /proc/sys/kernel/sched_rt_runtime_us
|
|
*/
|
|
extern unsigned int sysctl_sched_rt_period;
|
|
extern int sysctl_sched_rt_runtime;
|
|
|
|
#ifdef CONFIG_CFS_BANDWIDTH
|
|
extern unsigned int sysctl_sched_cfs_bandwidth_slice;
|
|
#endif
|
|
|
|
#ifdef CONFIG_SCHED_AUTOGROUP
|
|
extern unsigned int sysctl_sched_autogroup_enabled;
|
|
#endif
|
|
|
|
extern int sched_rr_timeslice;
|
|
|
|
extern int sched_rr_handler(struct ctl_table *table, int write,
|
|
void __user *buffer, size_t *lenp,
|
|
loff_t *ppos);
|
|
|
|
extern int sched_rt_handler(struct ctl_table *table, int write,
|
|
void __user *buffer, size_t *lenp,
|
|
loff_t *ppos);
|
|
|
|
extern int sysctl_numa_balancing(struct ctl_table *table, int write,
|
|
void __user *buffer, size_t *lenp,
|
|
loff_t *ppos);
|
|
|
|
#endif /* _SCHED_SYSCTL_H */
|