From 78a01feb4024ffb6c6321e45dc2bfcafb2d1d1e5 Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Tue, 25 Oct 2022 15:39:23 +0000 Subject: [PATCH 01/44] ftrace: Clean comments related to FTRACE_OPS_FL_PER_CPU Commit b3a88803ac5b ("ftrace: Kill FTRACE_OPS_FL_PER_CPU") didn't completely remove the comments related to FTRACE_OPS_FL_PER_CPU. Link: https://lkml.kernel.org/r/20221025153923.1995973-1-zhengyejian1@huawei.com Fixes: b3a88803ac5b ("ftrace: Kill FTRACE_OPS_FL_PER_CPU") Signed-off-by: Zheng Yejian Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 33236241f236..65a5d36463e0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -163,7 +163,7 @@ static void ftrace_sync_ipi(void *data) static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops) { /* - * If this is a dynamic, RCU, or per CPU ops, or we force list func, + * If this is a dynamic or RCU ops, or we force list func, * then it needs to call the list anyway. */ if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_RCU) || @@ -3071,8 +3071,6 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command) /* * Dynamic ops may be freed, we must make sure that all * callers are done before leaving this function. - * The same goes for freeing the per_cpu data of the per_cpu - * ops. */ if (ops->flags & FTRACE_OPS_FL_DYNAMIC) { /* @@ -7519,8 +7517,6 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, /* * Check the following for each ops before calling their func: * if RCU flag is set, then rcu_is_watching() must be true - * if PER_CPU is set, then ftrace_function_local_disable() - * must be false * Otherwise test if the ip matches the ops filter * * If any of the above fails then the op->func() is not executed. @@ -7570,8 +7566,8 @@ NOKPROBE_SYMBOL(arch_ftrace_ops_list_func); /* * If there's only one function registered but it does not support - * recursion, needs RCU protection and/or requires per cpu handling, then - * this function will be called by the mcount trampoline. + * recursion, needs RCU protection, then this function will be called + * by the mcount trampoline. */ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs) From 8230f27b1ccc4b8976c137e3d6d690f9d4ffca8d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Fri, 14 Oct 2022 08:04:56 -0400 Subject: [PATCH 02/44] tracing: Add __cpumask to denote a trace event field that is a cpumask_t The trace events have a __bitmask field that can be used for anything that requires bitmasks. Although currently it is only used for CPU masks, it could be used in the future for any type of bitmasks. There is some user space tooling that wants to know if a field is a CPU mask and not just some random unsigned long bitmask. Introduce "__cpumask()" helper functions that work the same as the current __bitmask() helpers but displays in the format file: field:__data_loc cpumask_t *[] mask; offset:36; size:4; signed:0; Instead of: field:__data_loc unsigned long[] mask; offset:32; size:4; signed:0; The main difference is the type. Instead of "unsigned long" it is "cpumask_t *". Note, this type field needs to be a real type in the __dynamic_array() logic that both __cpumask and__bitmask use, but the comparison field requires it to be a scalar type whereas cpumask_t is a structure (non-scalar). But everything works when making it a pointer. Valentin added changes to remove the need of passing in "nr_bits" and the __cpumask will always use nr_cpumask_bits as its size. Link: https://lkml.kernel.org/r/20221014080456.1d32b989@rorschach.local.home Requested-by: Valentin Schneider Reviewed-by: Valentin Schneider Signed-off-by: Valentin Schneider Signed-off-by: Steven Rostedt (Google) --- include/trace/bpf_probe.h | 6 ++++ include/trace/perf.h | 6 ++++ include/trace/stages/stage1_struct_define.h | 6 ++++ include/trace/stages/stage2_data_offsets.h | 6 ++++ include/trace/stages/stage3_trace_output.h | 6 ++++ include/trace/stages/stage4_event_fields.h | 6 ++++ include/trace/stages/stage5_get_offsets.h | 6 ++++ include/trace/stages/stage6_event_callback.h | 20 ++++++++++++ include/trace/stages/stage7_class_define.h | 2 ++ samples/trace_events/trace-events-sample.c | 2 +- samples/trace_events/trace-events-sample.h | 34 +++++++++++++++----- 11 files changed, 91 insertions(+), 9 deletions(-) diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h index 6a13220d2d27..155c495b89ea 100644 --- a/include/trace/bpf_probe.h +++ b/include/trace/bpf_probe.h @@ -21,6 +21,9 @@ #undef __get_bitmask #define __get_bitmask(field) (char *)__get_dynamic_array(field) +#undef __get_cpumask +#define __get_cpumask(field) (char *)__get_dynamic_array(field) + #undef __get_sockaddr #define __get_sockaddr(field) ((struct sockaddr *)__get_dynamic_array(field)) @@ -40,6 +43,9 @@ #undef __get_rel_bitmask #define __get_rel_bitmask(field) (char *)__get_rel_dynamic_array(field) +#undef __get_rel_cpumask +#define __get_rel_cpumask(field) (char *)__get_rel_dynamic_array(field) + #undef __get_rel_sockaddr #define __get_rel_sockaddr(field) ((struct sockaddr *)__get_rel_dynamic_array(field)) diff --git a/include/trace/perf.h b/include/trace/perf.h index 5800d13146c3..8f3bf1e17707 100644 --- a/include/trace/perf.h +++ b/include/trace/perf.h @@ -21,6 +21,9 @@ #undef __get_bitmask #define __get_bitmask(field) (char *)__get_dynamic_array(field) +#undef __get_cpumask +#define __get_cpumask(field) (char *)__get_dynamic_array(field) + #undef __get_sockaddr #define __get_sockaddr(field) ((struct sockaddr *)__get_dynamic_array(field)) @@ -41,6 +44,9 @@ #undef __get_rel_bitmask #define __get_rel_bitmask(field) (char *)__get_rel_dynamic_array(field) +#undef __get_rel_cpumask +#define __get_rel_cpumask(field) (char *)__get_rel_dynamic_array(field) + #undef __get_rel_sockaddr #define __get_rel_sockaddr(field) ((struct sockaddr *)__get_rel_dynamic_array(field)) diff --git a/include/trace/stages/stage1_struct_define.h b/include/trace/stages/stage1_struct_define.h index 1b7bab60434c..69e0dae453bf 100644 --- a/include/trace/stages/stage1_struct_define.h +++ b/include/trace/stages/stage1_struct_define.h @@ -32,6 +32,9 @@ #undef __bitmask #define __bitmask(item, nr_bits) __dynamic_array(char, item, -1) +#undef __cpumask +#define __cpumask(item) __dynamic_array(char, item, -1) + #undef __sockaddr #define __sockaddr(field, len) __dynamic_array(u8, field, len) @@ -47,6 +50,9 @@ #undef __rel_bitmask #define __rel_bitmask(item, nr_bits) __rel_dynamic_array(char, item, -1) +#undef __rel_cpumask +#define __rel_cpumask(item) __rel_dynamic_array(char, item, -1) + #undef __rel_sockaddr #define __rel_sockaddr(field, len) __rel_dynamic_array(u8, field, len) diff --git a/include/trace/stages/stage2_data_offsets.h b/include/trace/stages/stage2_data_offsets.h index 1b7a8f764fdd..469b6a64293d 100644 --- a/include/trace/stages/stage2_data_offsets.h +++ b/include/trace/stages/stage2_data_offsets.h @@ -38,6 +38,9 @@ #undef __bitmask #define __bitmask(item, nr_bits) __dynamic_array(unsigned long, item, -1) +#undef __cpumask +#define __cpumask(item) __dynamic_array(unsigned long, item, -1) + #undef __sockaddr #define __sockaddr(field, len) __dynamic_array(u8, field, len) @@ -53,5 +56,8 @@ #undef __rel_bitmask #define __rel_bitmask(item, nr_bits) __rel_dynamic_array(unsigned long, item, -1) +#undef __rel_cpumask +#define __rel_cpumask(item) __rel_dynamic_array(unsigned long, item, -1) + #undef __rel_sockaddr #define __rel_sockaddr(field, len) __rel_dynamic_array(u8, field, len) diff --git a/include/trace/stages/stage3_trace_output.h b/include/trace/stages/stage3_trace_output.h index e3b183e9d18e..66374df61ed3 100644 --- a/include/trace/stages/stage3_trace_output.h +++ b/include/trace/stages/stage3_trace_output.h @@ -42,6 +42,9 @@ trace_print_bitmask_seq(p, __bitmask, __bitmask_size); \ }) +#undef __get_cpumask +#define __get_cpumask(field) __get_bitmask(field) + #undef __get_rel_bitmask #define __get_rel_bitmask(field) \ ({ \ @@ -51,6 +54,9 @@ trace_print_bitmask_seq(p, __bitmask, __bitmask_size); \ }) +#undef __get_rel_cpumask +#define __get_rel_cpumask(field) __get_rel_bitmask(field) + #undef __get_sockaddr #define __get_sockaddr(field) ((struct sockaddr *)__get_dynamic_array(field)) diff --git a/include/trace/stages/stage4_event_fields.h b/include/trace/stages/stage4_event_fields.h index a8fb25f39a99..f2990d22313c 100644 --- a/include/trace/stages/stage4_event_fields.h +++ b/include/trace/stages/stage4_event_fields.h @@ -46,6 +46,9 @@ #undef __bitmask #define __bitmask(item, nr_bits) __dynamic_array(unsigned long, item, -1) +#undef __cpumask +#define __cpumask(item) __dynamic_array(cpumask_t *, item, -1) + #undef __sockaddr #define __sockaddr(field, len) __dynamic_array(u8, field, len) @@ -64,5 +67,8 @@ #undef __rel_bitmask #define __rel_bitmask(item, nr_bits) __rel_dynamic_array(unsigned long, item, -1) +#undef __rel_cpumask +#define __rel_cpumask(item) __rel_dynamic_array(cpumask_t *, item, -1) + #undef __rel_sockaddr #define __rel_sockaddr(field, len) __rel_dynamic_array(u8, field, len) diff --git a/include/trace/stages/stage5_get_offsets.h b/include/trace/stages/stage5_get_offsets.h index fba4c24ed9e6..ac5c24d3beeb 100644 --- a/include/trace/stages/stage5_get_offsets.h +++ b/include/trace/stages/stage5_get_offsets.h @@ -82,10 +82,16 @@ #define __bitmask(item, nr_bits) __dynamic_array(unsigned long, item, \ __bitmask_size_in_longs(nr_bits)) +#undef __cpumask +#define __cpumask(item) __bitmask(item, nr_cpumask_bits) + #undef __rel_bitmask #define __rel_bitmask(item, nr_bits) __rel_dynamic_array(unsigned long, item, \ __bitmask_size_in_longs(nr_bits)) +#undef __rel_cpumask +#define __rel_cpumask(item) __rel_bitmask(item, nr_cpumask_bits) + #undef __sockaddr #define __sockaddr(field, len) __dynamic_array(u8, field, len) diff --git a/include/trace/stages/stage6_event_callback.h b/include/trace/stages/stage6_event_callback.h index 3c554a585320..49c32394b53f 100644 --- a/include/trace/stages/stage6_event_callback.h +++ b/include/trace/stages/stage6_event_callback.h @@ -57,6 +57,16 @@ #define __assign_bitmask(dst, src, nr_bits) \ memcpy(__get_bitmask(dst), (src), __bitmask_size_in_bytes(nr_bits)) +#undef __cpumask +#define __cpumask(item) __dynamic_array(unsigned long, item, -1) + +#undef __get_cpumask +#define __get_cpumask(field) (char *)__get_dynamic_array(field) + +#undef __assign_cpumask +#define __assign_cpumask(dst, src) \ + memcpy(__get_cpumask(dst), (src), __bitmask_size_in_bytes(nr_cpumask_bits)) + #undef __sockaddr #define __sockaddr(field, len) __dynamic_array(u8, field, len) @@ -98,6 +108,16 @@ #define __assign_rel_bitmask(dst, src, nr_bits) \ memcpy(__get_rel_bitmask(dst), (src), __bitmask_size_in_bytes(nr_bits)) +#undef __rel_cpumask +#define __rel_cpumask(item) __rel_dynamic_array(unsigned long, item, -1) + +#undef __get_rel_cpumask +#define __get_rel_cpumask(field) (char *)__get_rel_dynamic_array(field) + +#undef __assign_rel_cpumask +#define __assign_rel_cpumask(dst, src) \ + memcpy(__get_rel_cpumask(dst), (src), __bitmask_size_in_bytes(nr_cpumask_bits)) + #undef __rel_sockaddr #define __rel_sockaddr(field, len) __rel_dynamic_array(u8, field, len) diff --git a/include/trace/stages/stage7_class_define.h b/include/trace/stages/stage7_class_define.h index 8a7ec24c246d..8795429f388b 100644 --- a/include/trace/stages/stage7_class_define.h +++ b/include/trace/stages/stage7_class_define.h @@ -13,11 +13,13 @@ #undef __get_dynamic_array_len #undef __get_str #undef __get_bitmask +#undef __get_cpumask #undef __get_sockaddr #undef __get_rel_dynamic_array #undef __get_rel_dynamic_array_len #undef __get_rel_str #undef __get_rel_bitmask +#undef __get_rel_cpumask #undef __get_rel_sockaddr #undef __print_array #undef __print_hex_dump diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c index 608c4ae3b08a..ecc7db237f2e 100644 --- a/samples/trace_events/trace-events-sample.c +++ b/samples/trace_events/trace-events-sample.c @@ -50,7 +50,7 @@ static void do_simple_thread_func(int cnt, const char *fmt, ...) trace_foo_with_template_print("I have to be different", cnt); - trace_foo_rel_loc("Hello __rel_loc", cnt, bitmask); + trace_foo_rel_loc("Hello __rel_loc", cnt, bitmask, current->cpus_ptr); } static void simple_thread_func(int cnt) diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h index 1a92226202fc..fb4548a44153 100644 --- a/samples/trace_events/trace-events-sample.h +++ b/samples/trace_events/trace-events-sample.h @@ -200,6 +200,16 @@ * * __assign_bitmask(target_cpus, cpumask_bits(bar), nr_cpumask_bits); * + * __cpumask: This is pretty much the same as __bitmask but is specific for + * CPU masks. The type displayed to the user via the format files will + * be "cpumaks_t" such that user space may deal with them differently + * if they choose to do so, and the bits is always set to nr_cpumask_bits. + * + * __cpumask(target_cpu) + * + * To assign a cpumask, use the __assign_cpumask() helper macro. + * + * __assign_cpumask(target_cpus, cpumask_bits(bar)); * * fast_assign: This is a C like function that is used to store the items * into the ring buffer. A special variable called "__entry" will be the @@ -212,8 +222,8 @@ * This is also used to print out the data from the trace files. * Again, the __entry macro is used to access the data from the ring buffer. * - * Note, __dynamic_array, __string, and __bitmask require special helpers - * to access the data. + * Note, __dynamic_array, __string, __bitmask and __cpumask require special + * helpers to access the data. * * For __dynamic_array(int, foo, bar) use __get_dynamic_array(foo) * Use __get_dynamic_array_len(foo) to get the length of the array @@ -226,6 +236,8 @@ * * For __bitmask(target_cpus, nr_cpumask_bits) use __get_bitmask(target_cpus) * + * For __cpumask(target_cpus) use __get_cpumask(target_cpus) + * * * Note, that for both the assign and the printk, __entry is the handler * to the data structure in the ring buffer, and is defined by the @@ -288,6 +300,7 @@ TRACE_EVENT(foo_bar, __dynamic_array(int, list, __length_of(lst)) __string( str, string ) __bitmask( cpus, num_possible_cpus() ) + __cpumask( cpum ) __vstring( vstr, fmt, va ) ), @@ -299,9 +312,10 @@ TRACE_EVENT(foo_bar, __assign_str(str, string); __assign_vstr(vstr, fmt, va); __assign_bitmask(cpus, cpumask_bits(mask), num_possible_cpus()); + __assign_cpumask(cpum, cpumask_bits(mask)); ), - TP_printk("foo %s %d %s %s %s %s (%s) %s", __entry->foo, __entry->bar, + TP_printk("foo %s %d %s %s %s %s (%s) (%s) %s", __entry->foo, __entry->bar, /* * Notice here the use of some helper functions. This includes: @@ -345,7 +359,8 @@ TRACE_EVENT(foo_bar, __print_array(__get_dynamic_array(list), __get_dynamic_array_len(list) / sizeof(int), sizeof(int)), - __get_str(str), __get_bitmask(cpus), __get_str(vstr)) + __get_str(str), __get_bitmask(cpus), __get_cpumask(cpus), + __get_str(vstr)) ); /* @@ -542,15 +557,16 @@ DEFINE_EVENT_PRINT(foo_template, foo_with_template_print, TRACE_EVENT(foo_rel_loc, - TP_PROTO(const char *foo, int bar, unsigned long *mask), + TP_PROTO(const char *foo, int bar, unsigned long *mask, const cpumask_t *cpus), - TP_ARGS(foo, bar, mask), + TP_ARGS(foo, bar, mask, cpus), TP_STRUCT__entry( __rel_string( foo, foo ) __field( int, bar ) __rel_bitmask( bitmask, BITS_PER_BYTE * sizeof(unsigned long) ) + __rel_cpumask( cpumask ) ), TP_fast_assign( @@ -558,10 +574,12 @@ TRACE_EVENT(foo_rel_loc, __entry->bar = bar; __assign_rel_bitmask(bitmask, mask, BITS_PER_BYTE * sizeof(unsigned long)); + __assign_rel_cpumask(cpumask, cpus); ), - TP_printk("foo_rel_loc %s, %d, %s", __get_rel_str(foo), __entry->bar, - __get_rel_bitmask(bitmask)) + TP_printk("foo_rel_loc %s, %d, %s, %s", __get_rel_str(foo), __entry->bar, + __get_rel_bitmask(bitmask), + __get_rel_cpumask(cpumask)) ); #endif From a01fdc897fa56ffb596d0a0cd7ea2ab3bd8398c5 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 20 Oct 2022 21:00:56 -0400 Subject: [PATCH 03/44] tracing: Add trace_trigger kernel command line option Allow triggers to be enabled at kernel boot up. For example: trace_trigger="sched_switch.stacktrace if prev_state == 2" The above will enable the stacktrace trigger on top of the sched_switch event and only trigger if its prev_state is 2 (TASK_UNINTERRUPTIBLE). Then at boot up, a stacktrace will trigger and be recorded in the tracing ring buffer every time the sched_switch happens where the previous state is TASK_INTERRUPTIBLE. Another useful trigger would be "traceoff" which can stop tracing on an event if a field of the event matches a certain value defined by the filter ("if" statement). Link: https://lore.kernel.org/linux-trace-kernel/20221020210056.0d8d0a5b@gandalf.local.home Signed-off-by: Steven Rostedt (Google) --- .../admin-guide/kernel-parameters.txt | 19 +++++ kernel/trace/trace_events.c | 72 ++++++++++++++++++- 2 files changed, 89 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a465d5242774..ccf91a4bf113 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6257,6 +6257,25 @@ See also Documentation/trace/ftrace.rst "trace options" section. + trace_trigger=[trigger-list] + [FTRACE] Add a event trigger on specific events. + Set a trigger on top of a specific event, with an optional + filter. + + The format is is "trace_trigger=.[ if ],..." + Where more than one trigger may be specified that are comma deliminated. + + For example: + + trace_trigger="sched_switch.stacktrace if prev_state == 2" + + The above will enable the "stacktrace" trigger on the "sched_switch" + event but only trigger it if the "prev_state" of the "sched_switch" + event is "2" (TASK_UNINTERUPTIBLE). + + See also "Event triggers" in Documentation/trace/events.rst + + traceoff_on_warning [FTRACE] enable this option to disable tracing when a warning is hit. This turns off "tracing_on". Tracing can diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f71ea6e79b3c..3bfaf560ecc4 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2796,6 +2796,44 @@ trace_create_new_event(struct trace_event_call *call, return file; } +#ifdef CONFIG_HIST_TRIGGERS +#define MAX_BOOT_TRIGGERS 32 + +static struct boot_triggers { + const char *event; + char *trigger; +} bootup_triggers[MAX_BOOT_TRIGGERS]; + +static char bootup_trigger_buf[COMMAND_LINE_SIZE]; +static int nr_boot_triggers; + +static __init int setup_trace_triggers(char *str) +{ + char *trigger; + char *buf; + int i; + + strlcpy(bootup_trigger_buf, str, COMMAND_LINE_SIZE); + ring_buffer_expanded = true; + disable_tracing_selftest("running event triggers"); + + buf = bootup_trigger_buf; + for (i = 0; i < MAX_BOOT_TRIGGERS; i++) { + trigger = strsep(&buf, ","); + if (!trigger) + break; + bootup_triggers[i].event = strsep(&trigger, "."); + bootup_triggers[i].trigger = strsep(&trigger, "."); + if (!bootup_triggers[i].trigger) + break; + } + + nr_boot_triggers = i; + return 1; +} +__setup("trace_trigger=", setup_trace_triggers); +#endif + /* Add an event to a trace directory */ static int __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr) @@ -2812,6 +2850,28 @@ __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr) return event_define_fields(call); } +#ifdef CONFIG_HIST_TRIGGERS +static void trace_early_triggers(struct trace_event_file *file, const char *name) +{ + int ret; + int i; + + for (i = 0; i < nr_boot_triggers; i++) { + if (strcmp(name, bootup_triggers[i].event)) + continue; + mutex_lock(&event_mutex); + ret = trigger_process_regex(file, bootup_triggers[i].trigger); + mutex_unlock(&event_mutex); + if (ret) + pr_err("Failed to register trigger '%s' on event %s\n", + bootup_triggers[i].trigger, + bootup_triggers[i].event); + } +} +#else +static inline void trace_early_triggers(struct trace_event_file *file, const char *name) { } +#endif + /* * Just create a descriptor for early init. A descriptor is required * for enabling events at boot. We want to enable events before @@ -2822,12 +2882,19 @@ __trace_early_add_new_event(struct trace_event_call *call, struct trace_array *tr) { struct trace_event_file *file; + int ret; file = trace_create_new_event(call, tr); if (!file) return -ENOMEM; - return event_define_fields(call); + ret = event_define_fields(call); + if (ret) + return ret; + + trace_early_triggers(file, trace_event_name(call)); + + return 0; } struct ftrace_module_file_ops; @@ -3735,6 +3802,8 @@ static __init int event_trace_enable(void) list_add(&call->list, &ftrace_events); } + register_trigger_cmds(); + /* * We need the top trace array to have a working set of trace * points at early init, before the debug files and directories @@ -3749,7 +3818,6 @@ static __init int event_trace_enable(void) register_event_cmds(); - register_trigger_cmds(); return 0; } From 04aabc32fb677f91d676fd306bca1043805e78d5 Mon Sep 17 00:00:00 2001 From: Song Chen Date: Thu, 20 Oct 2022 22:06:51 +0800 Subject: [PATCH 04/44] ring_buffer: Remove unused "event" parameter After commit a389d86f7fd0 ("ring-buffer: Have nested events still record running time stamp"), the "event" parameter is no longer used in either ring_buffer_unlock_commit() or rb_commit(). Best to remove it. Link: https://lkml.kernel.org/r/1666274811-24138-1-git-send-email-chensong_2000@189.cn Signed-off-by: Song Chen Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 3 +-- kernel/trace/ring_buffer.c | 12 +++++------- kernel/trace/ring_buffer_benchmark.c | 2 +- kernel/trace/trace.c | 2 +- 4 files changed, 8 insertions(+), 11 deletions(-) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 3c7d295746f6..782e14f62201 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -113,8 +113,7 @@ void ring_buffer_change_overwrite(struct trace_buffer *buffer, int val); struct ring_buffer_event *ring_buffer_lock_reserve(struct trace_buffer *buffer, unsigned long length); -int ring_buffer_unlock_commit(struct trace_buffer *buffer, - struct ring_buffer_event *event); +int ring_buffer_unlock_commit(struct trace_buffer *buffer); int ring_buffer_write(struct trace_buffer *buffer, unsigned long length, void *data); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b21bf14bae9b..843818ee4814 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3180,8 +3180,7 @@ static inline void rb_event_discard(struct ring_buffer_event *event) event->time_delta = 1; } -static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) +static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer) { local_inc(&cpu_buffer->entries); rb_end_commit(cpu_buffer); @@ -3383,15 +3382,14 @@ void ring_buffer_nest_end(struct trace_buffer *buffer) * * Must be paired with ring_buffer_lock_reserve. */ -int ring_buffer_unlock_commit(struct trace_buffer *buffer, - struct ring_buffer_event *event) +int ring_buffer_unlock_commit(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; int cpu = raw_smp_processor_id(); cpu_buffer = buffer->buffers[cpu]; - rb_commit(cpu_buffer, event); + rb_commit(cpu_buffer); rb_wakeups(buffer, cpu_buffer); @@ -3977,7 +3975,7 @@ int ring_buffer_write(struct trace_buffer *buffer, memcpy(body, data, length); - rb_commit(cpu_buffer, event); + rb_commit(cpu_buffer); rb_wakeups(buffer, cpu_buffer); @@ -5998,7 +5996,7 @@ static __init int rb_write_something(struct rb_test_data *data, bool nested) } out: - ring_buffer_unlock_commit(data->buffer, event); + ring_buffer_unlock_commit(data->buffer); return 0; } diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 78e576575b79..aef34673d79d 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -258,7 +258,7 @@ static void ring_buffer_producer(void) hit++; entry = ring_buffer_event_data(event); *entry = smp_processor_id(); - ring_buffer_unlock_commit(buffer, event); + ring_buffer_unlock_commit(buffer); } } end_time = ktime_get(); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5cfc95a52bc3..5c97dbef741b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -999,7 +999,7 @@ __buffer_unlock_commit(struct trace_buffer *buffer, struct ring_buffer_event *ev /* ring_buffer_unlock_commit() enables preemption */ preempt_enable_notrace(); } else - ring_buffer_unlock_commit(buffer, event); + ring_buffer_unlock_commit(buffer); } /** From b179d48b6aab21f3999f5006685ea4254c0618a9 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Thu, 17 Nov 2022 14:46:18 +0100 Subject: [PATCH 05/44] tracing/osnoise: Add osnoise/options file Add the tracing/osnoise/options file to control osnoise/timerlat tracer features. It is a single file to contain multiple features, similar to the sched/features file. Reading the file displays a list of options. Writing the OPTION_NAME enables it, writing NO_OPTION_NAME disables it. The DEAFULTS is a particular option that resets the options to the default ones. It uses a bitmask to keep track of the status of the option. When needed, we can add a list of static keys, but for now it does not justify the memory increase. Link: https://lkml.kernel.org/r/f8d34aefdb225d2603fcb4c02a120832a0cd3339.1668692096.git.bristot@kernel.org Cc: Daniel Bristot de Oliveira Cc: Masami Hiramatsu Cc: Jonathan Corbet Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_osnoise.c | 170 +++++++++++++++++++++++++++++++++++ 1 file changed, 170 insertions(+) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 4300c5dc4e5d..17b77fe3950b 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -48,6 +48,19 @@ #define DEFAULT_TIMERLAT_PERIOD 1000 /* 1ms */ #define DEFAULT_TIMERLAT_PRIO 95 /* FIFO 95 */ +/* + * osnoise/options entries. + */ +enum osnoise_options_index { + OSN_DEFAULTS = 0, + OSN_MAX +}; + +static const char * const osnoise_options_str[OSN_MAX] = { "DEFAULTS" }; + +#define OSN_DEFAULT_OPTIONS 0 +unsigned long osnoise_options = OSN_DEFAULT_OPTIONS; + /* * trace_array of the enabled osnoise/timerlat instances. */ @@ -1860,6 +1873,150 @@ static void osnoise_init_hotplug_support(void) } #endif /* CONFIG_HOTPLUG_CPU */ +/* + * seq file functions for the osnoise/options file. + */ +static void *s_options_start(struct seq_file *s, loff_t *pos) +{ + int option = *pos; + + mutex_lock(&interface_lock); + + if (option >= OSN_MAX) + return NULL; + + return pos; +} + +static void *s_options_next(struct seq_file *s, void *v, loff_t *pos) +{ + int option = ++(*pos); + + if (option >= OSN_MAX) + return NULL; + + return pos; +} + +static int s_options_show(struct seq_file *s, void *v) +{ + loff_t *pos = v; + int option = *pos; + + if (option == OSN_DEFAULTS) { + if (osnoise_options == OSN_DEFAULT_OPTIONS) + seq_printf(s, "%s", osnoise_options_str[option]); + else + seq_printf(s, "NO_%s", osnoise_options_str[option]); + goto out; + } + + if (test_bit(option, &osnoise_options)) + seq_printf(s, "%s", osnoise_options_str[option]); + else + seq_printf(s, "NO_%s", osnoise_options_str[option]); + +out: + if (option != OSN_MAX) + seq_puts(s, " "); + + return 0; +} + +static void s_options_stop(struct seq_file *s, void *v) +{ + seq_puts(s, "\n"); + mutex_unlock(&interface_lock); +} + +static const struct seq_operations osnoise_options_seq_ops = { + .start = s_options_start, + .next = s_options_next, + .show = s_options_show, + .stop = s_options_stop +}; + +static int osnoise_options_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &osnoise_options_seq_ops); +}; + +/** + * osnoise_options_write - Write function for "options" entry + * @filp: The active open file structure + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in @file + * + * Writing the option name sets the option, writing the "NO_" + * prefix in front of the option name disables it. + * + * Writing "DEFAULTS" resets the option values to the default ones. + */ +static ssize_t osnoise_options_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + int running, option, enable, retval; + char buf[256], *option_str; + + if (cnt >= 256) + return -EINVAL; + + if (copy_from_user(buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + if (strncmp(buf, "NO_", 3)) { + option_str = strstrip(buf); + enable = true; + } else { + option_str = strstrip(&buf[3]); + enable = false; + } + + option = match_string(osnoise_options_str, OSN_MAX, option_str); + if (option < 0) + return -EINVAL; + + /* + * trace_types_lock is taken to avoid concurrency on start/stop. + */ + mutex_lock(&trace_types_lock); + running = osnoise_has_registered_instances(); + if (running) + stop_per_cpu_kthreads(); + + mutex_lock(&interface_lock); + /* + * avoid CPU hotplug operations that might read options. + */ + cpus_read_lock(); + + retval = cnt; + + if (enable) { + if (option == OSN_DEFAULTS) + osnoise_options = OSN_DEFAULT_OPTIONS; + else + set_bit(option, &osnoise_options); + } else { + if (option == OSN_DEFAULTS) + retval = -EINVAL; + else + clear_bit(option, &osnoise_options); + } + + cpus_read_unlock(); + mutex_unlock(&interface_lock); + + if (running) + start_per_cpu_kthreads(); + mutex_unlock(&trace_types_lock); + + return retval; +} + /* * osnoise_cpus_read - Read function for reading the "cpus" file * @filp: The active open file structure @@ -2042,6 +2199,14 @@ static const struct file_operations cpus_fops = { .llseek = generic_file_llseek, }; +static const struct file_operations osnoise_options_fops = { + .open = osnoise_options_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .write = osnoise_options_write +}; + #ifdef CONFIG_TIMERLAT_TRACER #ifdef CONFIG_STACKTRACE static int init_timerlat_stack_tracefs(struct dentry *top_dir) @@ -2128,6 +2293,11 @@ static int init_tracefs(void) if (!tmp) goto err; + tmp = trace_create_file("options", TRACE_MODE_WRITE, top_dir, NULL, + &osnoise_options_fops); + if (!tmp) + goto err; + ret = init_timerlat_tracefs(top_dir); if (ret) goto err; From 30838fcd81078d078b10209bc18a6357ba4dd5fa Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Thu, 17 Nov 2022 14:46:19 +0100 Subject: [PATCH 06/44] tracing/osnoise: Add OSNOISE_WORKLOAD option The osnoise tracer is not only a tracer, and a set of tracepoints, but also a workload dispatcher. In preparation for having other workloads, e.g., in user-space, add an option to avoid dispatching the workload. By not dispatching the workload, the osnoise: tracepoints become generic events to measure the execution time of *any* task on Linux. For example: # cd /sys/kernel/tracing/ # cat osnoise/options DEFAULTS OSNOISE_WORKLOAD # echo NO_OSNOISE_WORKLOAD > osnoise/options # cat osnoise/options NO_DEFAULTS NO_OSNOISE_WORKLOAD # echo osnoise > set_event # echo osnoise > current_tracer # tail -8 trace make-94722 [002] d..3. 1371.794507: thread_noise: make:94722 start 1371.794302286 duration 200897 ns sh-121042 [020] d..3. 1371.794534: thread_noise: sh:121042 start 1371.781610976 duration 8943683 ns make-121097 [005] d..3. 1371.794542: thread_noise: make:121097 start 1371.794481522 duration 60444 ns <...>-40 [005] d..3. 1371.794550: thread_noise: migration/5:40 start 1371.794542256 duration 7154 ns -0 [018] dNh2. 1371.794554: irq_noise: reschedule:253 start 1371.794553547 duration 40 ns -0 [018] dNh2. 1371.794561: irq_noise: local_timer:236 start 1371.794556222 duration 4890 ns -0 [018] .Ns2. 1371.794563: softirq_noise: SCHED:7 start 1371.794561803 duration 992 ns -0 [018] d..3. 1371.794566: thread_noise: swapper/18:0 start 1371.781368110 duration 13191798 ns In preparation for the rtla exec_time tracer/tool and rtla osnoise --user option. Link: https://lkml.kernel.org/r/f5cfbd37aefd419eefe9243b4d2fc38ed5753fe4.1668692096.git.bristot@kernel.org Cc: Daniel Bristot de Oliveira Cc: Masami Hiramatsu Cc: Jonathan Corbet Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_osnoise.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 17b77fe3950b..3f10dd1f2f1c 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -53,12 +53,13 @@ */ enum osnoise_options_index { OSN_DEFAULTS = 0, + OSN_WORKLOAD, OSN_MAX }; -static const char * const osnoise_options_str[OSN_MAX] = { "DEFAULTS" }; +static const char * const osnoise_options_str[OSN_MAX] = { "DEFAULTS", "OSNOISE_WORKLOAD" }; -#define OSN_DEFAULT_OPTIONS 0 +#define OSN_DEFAULT_OPTIONS 0x2 unsigned long osnoise_options = OSN_DEFAULT_OPTIONS; /* @@ -1186,11 +1187,12 @@ trace_sched_switch_callback(void *data, bool preempt, unsigned int prev_state) { struct osnoise_variables *osn_var = this_cpu_osn_var(); + int workload = test_bit(OSN_WORKLOAD, &osnoise_options); - if (p->pid != osn_var->pid) + if ((p->pid != osn_var->pid) || !workload) thread_exit(osn_var, p); - if (n->pid != osn_var->pid) + if ((n->pid != osn_var->pid) || !workload) thread_entry(osn_var, n); } @@ -1723,9 +1725,16 @@ static void stop_kthread(unsigned int cpu) struct task_struct *kthread; kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread; - if (kthread) + if (kthread) { kthread_stop(kthread); - per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL; + per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL; + } else { + if (!test_bit(OSN_WORKLOAD, &osnoise_options)) { + per_cpu(per_cpu_osnoise_var, cpu).sampling = false; + barrier(); + return; + } + } } /* @@ -1759,6 +1768,13 @@ static int start_kthread(unsigned int cpu) snprintf(comm, 24, "timerlat/%d", cpu); main = timerlat_main; } else { + /* if no workload, just return */ + if (!test_bit(OSN_WORKLOAD, &osnoise_options)) { + per_cpu(per_cpu_osnoise_var, cpu).sampling = true; + barrier(); + return 0; + } + snprintf(comm, 24, "osnoise/%d", cpu); } From 67543cd6b8eee53959e624b9ce420ca4d47be0c8 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Thu, 17 Nov 2022 14:46:20 +0100 Subject: [PATCH 07/44] Documentation/osnoise: Add osnoise/options documentation Add the documentation about the osnoise/options file, along with an explanation about the OSNOISE_WORKLOAD option. Link: https://lkml.kernel.org/r/777af8f3d87beedd304805f98eff6c8291d64226.1668692096.git.bristot@kernel.org Cc: Daniel Bristot de Oliveira Cc: Masami Hiramatsu Cc: Jonathan Corbet Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- Documentation/trace/osnoise-tracer.rst | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Documentation/trace/osnoise-tracer.rst b/Documentation/trace/osnoise-tracer.rst index 963def9f97c6..3c675ed82b27 100644 --- a/Documentation/trace/osnoise-tracer.rst +++ b/Documentation/trace/osnoise-tracer.rst @@ -109,6 +109,11 @@ The tracer has a set of options inside the osnoise directory, they are: - tracing_threshold: the minimum delta between two time() reads to be considered as noise, in us. When set to 0, the default value will be used, which is currently 5 us. + - osnoise/options: a set of on/off options that can be enabled by + writing the option name to the file or disabled by writing the option + name preceded with the 'NO_' prefix. For example, writing + NO_OSNOISE_WORKLOAD disables the OSNOISE_WORKLOAD option. The + special DEAFAULTS option resets all options to the default value. Additional Tracing ------------------ @@ -150,3 +155,10 @@ tracepoints is smaller than eight us reported in the sample_threshold. The reason roots in the overhead of the entry and exit code that happens before and after any interference execution. This justifies the dual approach: measuring thread and tracing. + +Running osnoise tracer without workload +--------------------------------------- + +By enabling the osnoise tracer with the NO_OSNOISE_WORKLOAD option set, +the osnoise: tracepoints serve to measure the execution time of +any type of Linux task, free from the interference of other tasks. From 9430cd62b6ccdf1f06915cc06561f0e364809604 Mon Sep 17 00:00:00 2001 From: Chuang Wang Date: Mon, 21 Nov 2022 16:08:31 +0800 Subject: [PATCH 08/44] tracing/perf: Use strndup_user instead of kzalloc/strncpy_from_user This patch uses strndup_user instead of kzalloc + strncpy_from_user, which makes the code more concise. Link: https://lkml.kernel.org/r/20221121080831.707409-1-nashuiliang@gmail.com Signed-off-by: Chuang Wang Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_event_perf.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 61e3a2620fa3..05e791241812 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -251,16 +251,12 @@ int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe) struct trace_event_call *tp_event; if (p_event->attr.kprobe_func) { - func = kzalloc(KSYM_NAME_LEN, GFP_KERNEL); - if (!func) - return -ENOMEM; - ret = strncpy_from_user( - func, u64_to_user_ptr(p_event->attr.kprobe_func), - KSYM_NAME_LEN); - if (ret == KSYM_NAME_LEN) - ret = -E2BIG; - if (ret < 0) - goto out; + func = strndup_user(u64_to_user_ptr(p_event->attr.kprobe_func), + KSYM_NAME_LEN); + if (IS_ERR(func)) { + ret = PTR_ERR(func); + return (ret == -EINVAL) ? -E2BIG : ret; + } if (func[0] == '\0') { kfree(func); From a76d4648a0bbd624a1322c15073b5032a5135f01 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Tue, 22 Nov 2022 17:14:56 +0800 Subject: [PATCH 09/44] tracing: Make tracepoint_print_iter static After change in commit 4239174570da ("tracing: Make tracepoint_printk a static_key"), this symbol is not used outside of the file, so mark it static. Link: https://lkml.kernel.org/r/20221122091456.72055-1-xiujianfeng@huawei.com Signed-off-by: Xiu Jianfeng Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 2 +- kernel/trace/trace.h | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5c97dbef741b..93a75a97118f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -85,7 +85,7 @@ void __init disable_tracing_selftest(const char *reason) #endif /* Pipe tracepoints to printk */ -struct trace_iterator *tracepoint_print_iter; +static struct trace_iterator *tracepoint_print_iter; int tracepoint_printk; static bool tracepoint_printk_stop_on_boot __initdata; static DEFINE_STATIC_KEY_FALSE(tracepoint_printk_key); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d42e24507152..48643f07bc01 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1942,8 +1942,6 @@ static inline void tracer_hardirqs_on(unsigned long a0, unsigned long a1) { } static inline void tracer_hardirqs_off(unsigned long a0, unsigned long a1) { } #endif -extern struct trace_iterator *tracepoint_print_iter; - /* * Reset the state of the trace_iterator so that it can read consumed data. * Normally, the trace_iterator is used for reading the data when it is not From 96e6122cb79616c622ae0d025eb9f981120b568d Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Thu, 10 Nov 2022 10:03:19 +0800 Subject: [PATCH 10/44] tracing: Optimize event type allocation with IDA MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After commit 060fa5c83e67 ("tracing/events: reuse trace event ids after overflow"), trace events with dynamic type are linked up in list 'ftrace_event_list' through field 'trace_event.list'. Then when max event type number used up, it's possible to reuse type number of some freed one by traversing 'ftrace_event_list'. As instead, using IDA to manage available type numbers can make codes simpler and then the field 'trace_event.list' can be dropped. Since 'struct trace_event' is used in static tracepoints, drop 'trace_event.list' can make vmlinux smaller. Local test with about 2000 tracepoints, vmlinux reduced about 64KB: before:-rwxrwxr-x 1 root root 76669448 Nov 8 17:14 vmlinux after: -rwxrwxr-x 1 root root 76604176 Nov 8 17:15 vmlinux Link: https://lkml.kernel.org/r/20221110020319.1259291-1-zhengyejian1@huawei.com Signed-off-by: Zheng Yejian Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 1 - kernel/trace/trace_output.c | 66 +++++++++--------------------------- 2 files changed, 16 insertions(+), 51 deletions(-) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 20749bd9db71..bb2053246d6a 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -136,7 +136,6 @@ struct trace_event_functions { struct trace_event { struct hlist_node node; - struct list_head list; int type; struct trace_event_functions *funcs; }; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 67f47ea27921..f0ba97121345 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "trace_output.h" @@ -21,8 +22,6 @@ DECLARE_RWSEM(trace_event_sem); static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; -static int next_event_type = __TRACE_LAST_TYPE; - enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; @@ -688,38 +687,23 @@ struct trace_event *ftrace_find_event(int type) return NULL; } -static LIST_HEAD(ftrace_event_list); +static DEFINE_IDA(trace_event_ida); -static int trace_search_list(struct list_head **list) +static void free_trace_event_type(int type) { - struct trace_event *e = NULL, *iter; - int next = __TRACE_LAST_TYPE; + if (type >= __TRACE_LAST_TYPE) + ida_free(&trace_event_ida, type); +} - if (list_empty(&ftrace_event_list)) { - *list = &ftrace_event_list; - return next; - } +static int alloc_trace_event_type(void) +{ + int next; - /* - * We used up all possible max events, - * lets see if somebody freed one. - */ - list_for_each_entry(iter, &ftrace_event_list, list) { - if (iter->type != next) { - e = iter; - break; - } - next++; - } - - /* Did we used up all 65 thousand events??? */ - if (next > TRACE_EVENT_TYPE_MAX) + /* Skip static defined type numbers */ + next = ida_alloc_range(&trace_event_ida, __TRACE_LAST_TYPE, + TRACE_EVENT_TYPE_MAX, GFP_KERNEL); + if (next < 0) return 0; - - if (e) - *list = &e->list; - else - *list = &ftrace_event_list; return next; } @@ -761,28 +745,10 @@ int register_trace_event(struct trace_event *event) if (WARN_ON(!event->funcs)) goto out; - INIT_LIST_HEAD(&event->list); - if (!event->type) { - struct list_head *list = NULL; - - if (next_event_type > TRACE_EVENT_TYPE_MAX) { - - event->type = trace_search_list(&list); - if (!event->type) - goto out; - - } else { - - event->type = next_event_type++; - list = &ftrace_event_list; - } - - if (WARN_ON(ftrace_find_event(event->type))) + event->type = alloc_trace_event_type(); + if (!event->type) goto out; - - list_add_tail(&event->list, list); - } else if (WARN(event->type > __TRACE_LAST_TYPE, "Need to add type to trace.h")) { goto out; @@ -819,7 +785,7 @@ EXPORT_SYMBOL_GPL(register_trace_event); int __unregister_trace_event(struct trace_event *event) { hlist_del(&event->node); - list_del(&event->list); + free_trace_event_type(event->type); return 0; } From bd604f3db49c5b21171abea0414a2020dcbf2646 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 22 Nov 2022 18:09:05 -0500 Subject: [PATCH 11/44] ftrace: Avoid needless updates of the ftrace function call Song Shuai reported: The list func (ftrace_ops_list_func) will be patched first before the transition between old and new calls are set, which fixed the race described in this commit `59338f75`. While ftrace_trace_function changes from the list func to a ftrace_ops func, like unregistering the klp_ops to leave the only global_ops in ftrace_ops_list, the ftrace_[regs]_call will be replaced with the list func although it already exists. So there should be a condition to avoid this. And suggested using another variable to keep track of what the ftrace function is set to. But this could be simplified by using a helper function that does the same with a static variable. Link: https://lore.kernel.org/lkml/20221026132039.2236233-1-suagrfillet@gmail.com/ Link: https://lore.kernel.org/linux-trace-kernel/20221122180905.737b6f52@gandalf.local.home Reported-by: Song Shuai Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 65a5d36463e0..d04552c0c275 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2763,6 +2763,19 @@ void __weak ftrace_arch_code_modify_post_process(void) { } +static int update_ftrace_func(ftrace_func_t func) +{ + static ftrace_func_t save_func; + + /* Avoid updating if it hasn't changed */ + if (func == save_func) + return 0; + + save_func = func; + + return ftrace_update_ftrace_func(func); +} + void ftrace_modify_all_code(int command) { int update = command & FTRACE_UPDATE_TRACE_FUNC; @@ -2783,7 +2796,7 @@ void ftrace_modify_all_code(int command) * traced. */ if (update) { - err = ftrace_update_ftrace_func(ftrace_ops_list_func); + err = update_ftrace_func(ftrace_ops_list_func); if (FTRACE_WARN_ON(err)) return; } @@ -2799,7 +2812,7 @@ void ftrace_modify_all_code(int command) /* If irqs are disabled, we are in stop machine */ if (!irqs_disabled()) smp_call_function(ftrace_sync_ipi, NULL, 1); - err = ftrace_update_ftrace_func(ftrace_trace_function); + err = update_ftrace_func(ftrace_trace_function); if (FTRACE_WARN_ON(err)) return; } From 4bded7af8b9af6e97514b0521004f90267905aef Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Wed, 23 Nov 2022 10:32:48 -0800 Subject: [PATCH 12/44] tracing/user_events: Fix call print_fmt leak If user_event_trace_register() fails within user_event_parse() the call's print_fmt member is not freed. Add kfree call to fix this. Link: https://lkml.kernel.org/r/20221123183248.554-1-beaub@linux.microsoft.com Fixes: aa3b2b4c6692 ("user_events: Add print_fmt generation support for basic types") Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 539b08ae7020..9cb53182bb31 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -1359,6 +1359,7 @@ static int user_event_parse(struct user_event_group *group, char *name, put_user: user_event_destroy_fields(user); user_event_destroy_validators(user); + kfree(user->call.print_fmt); kfree(user); return ret; } From 46a5cd8c8dcacc58912b4bf04ca4d84d2977bbbc Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Mon, 17 Oct 2022 14:05:13 -0400 Subject: [PATCH 13/44] tracing: Update MAINTAINERS file for new patchwork and mailing list The tracing subsystem now has its own mailing list (although patches should also be sent to LKML) as well as a new patchwork entry for kernel related tracing patches. Update the MAINTAINERS file to reflect the changes. Link: https://lore.kernel.org/linux-trace-kernel/20221017140513.14b9ce2e@gandalf.local.home Signed-off-by: Steven Rostedt (Google) --- MAINTAINERS | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 2585e7edc335..d12576150a70 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8461,6 +8461,9 @@ FUNCTION HOOKS (FTRACE) M: Steven Rostedt M: Masami Hiramatsu R: Mark Rutland +L: linux-kernel@vger.kernel.org +L: linux-trace-kernel@vger.kernel.org +Q: https://patchwork.kernel.org/project/linux-trace-kernel/list/ S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git F: Documentation/trace/ftrace* @@ -11483,6 +11486,9 @@ M: Naveen N. Rao M: Anil S Keshavamurthy M: "David S. Miller" M: Masami Hiramatsu +L: linux-kernel@vger.kernel.org +L: linux-trace-kernel@vger.kernel.org +Q: https://patchwork.kernel.org/project/linux-trace-kernel/list/ S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git F: Documentation/trace/kprobes.rst @@ -20862,6 +20868,9 @@ F: drivers/hwmon/pmbus/tps546d24.c TRACING M: Steven Rostedt M: Masami Hiramatsu +L: linux-kernel@vger.kernel.org +L: linux-trace-kernel@vger.kernel.org +Q: https://patchwork.kernel.org/project/linux-trace-kernel/list/ S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git F: Documentation/trace/* From fd3dc56253acbe9c641a66d312d8393cd55eb04c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Fri, 9 Dec 2022 10:52:47 -0500 Subject: [PATCH 14/44] ftrace/x86: Add back ftrace_expected for ftrace bug reports After someone reported a bug report with a failed modification due to the expected value not matching what was found, it came to my attention that the ftrace_expected is no longer set when that happens. This makes for debugging the issue a bit more difficult. Set ftrace_expected to the expected code before calling ftrace_bug, so that it shows what was expected and why it failed. Link: https://lore.kernel.org/all/CA+wXwBQ-VhK+hpBtYtyZP-NiX4g8fqRRWithFOHQW-0coQ3vLg@mail.gmail.com/ Link: https://lore.kernel.org/linux-trace-kernel/20221209105247.01d4e51d@gandalf.local.home Cc: Masami Hiramatsu Cc: Andrew Morton Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: "x86@kernel.org" Cc: Borislav Petkov Cc: Ingo Molnar Cc: stable@vger.kernel.org Fixes: 768ae4406a5c ("x86/ftrace: Use text_poke()") Signed-off-by: Steven Rostedt (Google) --- arch/x86/kernel/ftrace.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index bd165004776d..e07234ec7e23 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -217,7 +217,9 @@ void ftrace_replace_code(int enable) ret = ftrace_verify_code(rec->ip, old); if (ret) { + ftrace_expected = old; ftrace_bug(ret, rec); + ftrace_expected = NULL; return; } } From 5f2e094ed2592abb3d1f49e263957188e121c2ac Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 20 Oct 2022 00:31:55 +0900 Subject: [PATCH 15/44] tracing: Allow multiple hitcount values in histograms The hitcount is treated specially in the histograms - since it's always expected to be there regardless of whether the user specified anything or not, it's always added as the first histogram value. Currently the code doesn't allow it to be added more than once as a value, which is inconsistent with all the other possible values. It would seem to be a pointless thing to want to do, but other features being added such as percent and graph modifiers don't work properly with the current hitcount restrictions. Fix this by allowing multiple hitcounts to be added. Link: https://lore.kernel.org/linux-trace-kernel/166610812248.56030.16754785928712505251.stgit@devnote2 Signed-off-by: Tom Zanussi Signed-off-by: Masami Hiramatsu (Google) Tested-by: Tom Zanussi --- kernel/trace/trace_events_hist.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 1c82478e8dff..31d58ddcc1d9 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1356,6 +1356,8 @@ static const char *hist_field_name(struct hist_field *field, field_name = field->name; } else if (field->flags & HIST_FIELD_FL_TIMESTAMP) field_name = "common_timestamp"; + else if (field->flags & HIST_FIELD_FL_HITCOUNT) + field_name = "hitcount"; if (field_name == NULL) field_name = ""; @@ -2328,6 +2330,8 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, hist_data->attrs->ts_in_usecs = true; } else if (strcmp(field_name, "common_cpu") == 0) *flags |= HIST_FIELD_FL_CPU; + else if (strcmp(field_name, "hitcount") == 0) + *flags |= HIST_FIELD_FL_HITCOUNT; else { field = trace_find_event_field(file->event_call, field_name); if (!field || !field->size) { @@ -4328,8 +4332,8 @@ static int create_var_field(struct hist_trigger_data *hist_data, static int create_val_fields(struct hist_trigger_data *hist_data, struct trace_event_file *file) { + unsigned int i, j = 1, n_hitcount = 0; char *fields_str, *field_str; - unsigned int i, j = 1; int ret; ret = create_hitcount_val(hist_data); @@ -4346,8 +4350,10 @@ static int create_val_fields(struct hist_trigger_data *hist_data, if (!field_str) break; - if (strcmp(field_str, "hitcount") == 0) - continue; + if (strcmp(field_str, "hitcount") == 0) { + if (!n_hitcount++) + continue; + } ret = create_val_field(hist_data, j++, file, field_str); if (ret) From abaa5258ce5e5887a9de049f50a85dc023391a1c Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 20 Oct 2022 00:31:55 +0900 Subject: [PATCH 16/44] tracing: Add .percent suffix option to histogram values Add .percent suffix option to show the histogram values in percentage. This feature is useful when we need yo undersntand the overall trend for the histograms of large values. E.g. this shows the runtime percentage for each tasks. ------ # cd /sys/kernel/debug/tracing/ # echo hist:keys=pid:vals=hitcount,runtime.percent:sort=pid > \ events/sched/sched_stat_runtime/trigger # sleep 10 # cat events/sched/sched_stat_runtime/hist # event histogram # # trigger info: hist:keys=pid:vals=hitcount,runtime.percent:sort=pid:size=2048 [active] # { pid: 8 } hitcount: 7 runtime (%): 4.14 { pid: 14 } hitcount: 5 runtime (%): 3.69 { pid: 16 } hitcount: 11 runtime (%): 3.41 { pid: 61 } hitcount: 41 runtime (%): 19.75 { pid: 65 } hitcount: 4 runtime (%): 1.48 { pid: 70 } hitcount: 6 runtime (%): 3.60 { pid: 72 } hitcount: 2 runtime (%): 1.10 { pid: 144 } hitcount: 10 runtime (%): 32.01 { pid: 151 } hitcount: 8 runtime (%): 22.66 { pid: 152 } hitcount: 2 runtime (%): 8.10 Totals: Hits: 96 Entries: 10 Dropped: 0 ----- Link: https://lore.kernel.org/linux-trace-kernel/166610813077.56030.4238090506973562347.stgit@devnote2 Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi --- kernel/trace/trace.c | 3 +- kernel/trace/trace_events_hist.c | 90 +++++++++++++++++++++++++++----- 2 files changed, 78 insertions(+), 15 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 93a75a97118f..08e9568849b1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5724,7 +5724,8 @@ static const char readme_msg[] = "\t .syscall display a syscall id as a syscall name\n" "\t .log2 display log2 value rather than raw number\n" "\t .buckets=size display values in groups of size rather than raw number\n" - "\t .usecs display a common_timestamp in microseconds\n\n" + "\t .usecs display a common_timestamp in microseconds\n" + "\t .percent display a number of percentage value\n\n" "\t The 'pause' parameter can be used to pause an existing hist\n" "\t trigger or to start a hist trigger but not log any events\n" "\t until told to do so. 'continue' can be used to start or\n" diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 31d58ddcc1d9..35b0e956f06e 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -506,6 +506,7 @@ enum hist_field_flags { HIST_FIELD_FL_ALIAS = 1 << 16, HIST_FIELD_FL_BUCKET = 1 << 17, HIST_FIELD_FL_CONST = 1 << 18, + HIST_FIELD_FL_PERCENT = 1 << 19, }; struct var_defs { @@ -1707,6 +1708,8 @@ static const char *get_hist_field_flags(struct hist_field *hist_field) flags_str = "buckets"; else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP_USECS) flags_str = "usecs"; + else if (hist_field->flags & HIST_FIELD_FL_PERCENT) + flags_str = "percent"; return flags_str; } @@ -2315,6 +2318,10 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, if (ret || !(*buckets)) goto error; *flags |= HIST_FIELD_FL_BUCKET; + } else if (strncmp(modifier, "percent", 7) == 0) { + if (*flags & (HIST_FIELD_FL_VAR | HIST_FIELD_FL_KEY)) + goto error; + *flags |= HIST_FIELD_FL_PERCENT; } else { error: hist_err(tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(modifier)); @@ -5291,33 +5298,69 @@ static void hist_trigger_print_key(struct seq_file *m, seq_puts(m, "}"); } +/* Get the 100 times of the percentage of @val in @total */ +static inline unsigned int __get_percentage(u64 val, u64 total) +{ + if (!total) + goto div0; + + if (val < (U64_MAX / 10000)) + return (unsigned int)div64_ul(val * 10000, total); + + total = div64_u64(total, 10000); + if (!total) + goto div0; + + return (unsigned int)div64_ul(val, total); +div0: + return val ? UINT_MAX : 0; +} + +static void hist_trigger_print_val(struct seq_file *m, unsigned int idx, + const char *field_name, unsigned long flags, + u64 *totals, struct tracing_map_elt *elt) +{ + u64 val = tracing_map_read_sum(elt, idx); + unsigned int pc; + + if (flags & HIST_FIELD_FL_PERCENT) { + pc = __get_percentage(val, totals[idx]); + if (pc == UINT_MAX) + seq_printf(m, " %s (%%):[ERROR]", field_name); + else + seq_printf(m, " %s (%%): %3u.%02u", field_name, + pc / 100, pc % 100); + } else if (flags & HIST_FIELD_FL_HEX) { + seq_printf(m, " %s: %10llx", field_name, val); + } else { + seq_printf(m, " %s: %10llu", field_name, val); + } +} + static void hist_trigger_entry_print(struct seq_file *m, struct hist_trigger_data *hist_data, + u64 *totals, void *key, struct tracing_map_elt *elt) { const char *field_name; - unsigned int i; + unsigned int i = HITCOUNT_IDX; + unsigned long flags; hist_trigger_print_key(m, hist_data, key, elt); - seq_printf(m, " hitcount: %10llu", - tracing_map_read_sum(elt, HITCOUNT_IDX)); + /* At first, show the raw hitcount always */ + hist_trigger_print_val(m, i, "hitcount", 0, totals, elt); for (i = 1; i < hist_data->n_vals; i++) { field_name = hist_field_name(hist_data->fields[i], 0); + flags = hist_data->fields[i]->flags; - if (hist_data->fields[i]->flags & HIST_FIELD_FL_VAR || - hist_data->fields[i]->flags & HIST_FIELD_FL_EXPR) + if (flags & HIST_FIELD_FL_VAR || flags & HIST_FIELD_FL_EXPR) continue; - if (hist_data->fields[i]->flags & HIST_FIELD_FL_HEX) { - seq_printf(m, " %s: %10llx", field_name, - tracing_map_read_sum(elt, i)); - } else { - seq_printf(m, " %s: %10llu", field_name, - tracing_map_read_sum(elt, i)); - } + seq_puts(m, " "); + hist_trigger_print_val(m, i, field_name, flags, totals, elt); } print_actions(m, hist_data, elt); @@ -5330,7 +5373,8 @@ static int print_entries(struct seq_file *m, { struct tracing_map_sort_entry **sort_entries = NULL; struct tracing_map *map = hist_data->map; - int i, n_entries; + int i, j, n_entries; + u64 *totals = NULL; n_entries = tracing_map_sort_entries(map, hist_data->sort_keys, hist_data->n_sort_keys, @@ -5338,11 +5382,29 @@ static int print_entries(struct seq_file *m, if (n_entries < 0) return n_entries; + for (j = 0; j < hist_data->n_vals; j++) { + if (!(hist_data->fields[j]->flags & HIST_FIELD_FL_PERCENT)) + continue; + if (!totals) { + totals = kcalloc(hist_data->n_vals, sizeof(u64), + GFP_KERNEL); + if (!totals) { + n_entries = -ENOMEM; + goto out; + } + } + for (i = 0; i < n_entries; i++) + totals[j] += tracing_map_read_sum( + sort_entries[i]->elt, j); + } + for (i = 0; i < n_entries; i++) - hist_trigger_entry_print(m, hist_data, + hist_trigger_entry_print(m, hist_data, totals, sort_entries[i]->key, sort_entries[i]->elt); + kfree(totals); +out: tracing_map_destroy_sort_entries(sort_entries, n_entries); return n_entries; From a2c54256dec7510477e2b4f4db187e638f7cac37 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 20 Oct 2022 00:31:55 +0900 Subject: [PATCH 17/44] tracing: Add .graph suffix option to histogram value Add the .graph suffix which shows the bar graph of the histogram value. For example, the below example shows that the bar graph of the histogram of the runtime for each tasks. ------ # cd /sys/kernel/debug/tracing/ # echo hist:keys=pid:vals=runtime.graph:sort=pid > \ events/sched/sched_stat_runtime/trigger # sleep 10 # cat events/sched/sched_stat_runtime/hist # event histogram # # trigger info: hist:keys=pid:vals=hitcount,runtime.graph:sort=pid:size=2048 [active] # { pid: 14 } hitcount: 2 runtime: { pid: 16 } hitcount: 8 runtime: { pid: 26 } hitcount: 1 runtime: { pid: 57 } hitcount: 3 runtime: { pid: 61 } hitcount: 20 runtime: ### { pid: 66 } hitcount: 2 runtime: { pid: 70 } hitcount: 3 runtime: { pid: 72 } hitcount: 2 runtime: { pid: 145 } hitcount: 14 runtime: #################### { pid: 152 } hitcount: 5 runtime: ####### { pid: 153 } hitcount: 2 runtime: #### Totals: Hits: 62 Entries: 11 Dropped: 0 ------- Link: https://lore.kernel.org/linux-trace-kernel/166610813953.56030.10944148382315789485.stgit@devnote2 Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi --- kernel/trace/trace.c | 3 +- kernel/trace/trace_events_hist.c | 77 +++++++++++++++++++++++++------- 2 files changed, 63 insertions(+), 17 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 08e9568849b1..55aec4616d8b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5725,7 +5725,8 @@ static const char readme_msg[] = "\t .log2 display log2 value rather than raw number\n" "\t .buckets=size display values in groups of size rather than raw number\n" "\t .usecs display a common_timestamp in microseconds\n" - "\t .percent display a number of percentage value\n\n" + "\t .percent display a number of percentage value\n" + "\t .graph display a bar-graph of a value\n\n" "\t The 'pause' parameter can be used to pause an existing hist\n" "\t trigger or to start a hist trigger but not log any events\n" "\t until told to do so. 'continue' can be used to start or\n" diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 35b0e956f06e..946b2b8f0f2c 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -507,6 +507,7 @@ enum hist_field_flags { HIST_FIELD_FL_BUCKET = 1 << 17, HIST_FIELD_FL_CONST = 1 << 18, HIST_FIELD_FL_PERCENT = 1 << 19, + HIST_FIELD_FL_GRAPH = 1 << 20, }; struct var_defs { @@ -1710,6 +1711,8 @@ static const char *get_hist_field_flags(struct hist_field *hist_field) flags_str = "usecs"; else if (hist_field->flags & HIST_FIELD_FL_PERCENT) flags_str = "percent"; + else if (hist_field->flags & HIST_FIELD_FL_GRAPH) + flags_str = "graph"; return flags_str; } @@ -2322,6 +2325,10 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, if (*flags & (HIST_FIELD_FL_VAR | HIST_FIELD_FL_KEY)) goto error; *flags |= HIST_FIELD_FL_PERCENT; + } else if (strncmp(modifier, "graph", 5) == 0) { + if (*flags & (HIST_FIELD_FL_VAR | HIST_FIELD_FL_KEY)) + goto error; + *flags |= HIST_FIELD_FL_GRAPH; } else { error: hist_err(tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(modifier)); @@ -5316,20 +5323,52 @@ static inline unsigned int __get_percentage(u64 val, u64 total) return val ? UINT_MAX : 0; } +#define BAR_CHAR '#' + +static inline const char *__fill_bar_str(char *buf, int size, u64 val, u64 max) +{ + unsigned int len = __get_percentage(val, max); + int i; + + if (len == UINT_MAX) { + snprintf(buf, size, "[ERROR]"); + return buf; + } + + len = len * size / 10000; + for (i = 0; i < len && i < size; i++) + buf[i] = BAR_CHAR; + while (i < size) + buf[i++] = ' '; + buf[size] = '\0'; + + return buf; +} + +struct hist_val_stat { + u64 max; + u64 total; +}; + static void hist_trigger_print_val(struct seq_file *m, unsigned int idx, const char *field_name, unsigned long flags, - u64 *totals, struct tracing_map_elt *elt) + struct hist_val_stat *stats, + struct tracing_map_elt *elt) { u64 val = tracing_map_read_sum(elt, idx); unsigned int pc; + char bar[21]; if (flags & HIST_FIELD_FL_PERCENT) { - pc = __get_percentage(val, totals[idx]); + pc = __get_percentage(val, stats[idx].total); if (pc == UINT_MAX) seq_printf(m, " %s (%%):[ERROR]", field_name); else seq_printf(m, " %s (%%): %3u.%02u", field_name, pc / 100, pc % 100); + } else if (flags & HIST_FIELD_FL_GRAPH) { + seq_printf(m, " %s: %20s", field_name, + __fill_bar_str(bar, 20, val, stats[idx].max)); } else if (flags & HIST_FIELD_FL_HEX) { seq_printf(m, " %s: %10llx", field_name, val); } else { @@ -5339,7 +5378,7 @@ static void hist_trigger_print_val(struct seq_file *m, unsigned int idx, static void hist_trigger_entry_print(struct seq_file *m, struct hist_trigger_data *hist_data, - u64 *totals, + struct hist_val_stat *stats, void *key, struct tracing_map_elt *elt) { @@ -5350,7 +5389,7 @@ static void hist_trigger_entry_print(struct seq_file *m, hist_trigger_print_key(m, hist_data, key, elt); /* At first, show the raw hitcount always */ - hist_trigger_print_val(m, i, "hitcount", 0, totals, elt); + hist_trigger_print_val(m, i, "hitcount", 0, stats, elt); for (i = 1; i < hist_data->n_vals; i++) { field_name = hist_field_name(hist_data->fields[i], 0); @@ -5360,7 +5399,7 @@ static void hist_trigger_entry_print(struct seq_file *m, continue; seq_puts(m, " "); - hist_trigger_print_val(m, i, field_name, flags, totals, elt); + hist_trigger_print_val(m, i, field_name, flags, stats, elt); } print_actions(m, hist_data, elt); @@ -5374,7 +5413,8 @@ static int print_entries(struct seq_file *m, struct tracing_map_sort_entry **sort_entries = NULL; struct tracing_map *map = hist_data->map; int i, j, n_entries; - u64 *totals = NULL; + struct hist_val_stat *stats = NULL; + u64 val; n_entries = tracing_map_sort_entries(map, hist_data->sort_keys, hist_data->n_sort_keys, @@ -5382,28 +5422,33 @@ static int print_entries(struct seq_file *m, if (n_entries < 0) return n_entries; + /* Calculate the max and the total for each field if needed. */ for (j = 0; j < hist_data->n_vals; j++) { - if (!(hist_data->fields[j]->flags & HIST_FIELD_FL_PERCENT)) + if (!(hist_data->fields[j]->flags & + (HIST_FIELD_FL_PERCENT | HIST_FIELD_FL_GRAPH))) continue; - if (!totals) { - totals = kcalloc(hist_data->n_vals, sizeof(u64), - GFP_KERNEL); - if (!totals) { + if (!stats) { + stats = kcalloc(hist_data->n_vals, sizeof(*stats), + GFP_KERNEL); + if (!stats) { n_entries = -ENOMEM; goto out; } } - for (i = 0; i < n_entries; i++) - totals[j] += tracing_map_read_sum( - sort_entries[i]->elt, j); + for (i = 0; i < n_entries; i++) { + val = tracing_map_read_sum(sort_entries[i]->elt, j); + stats[j].total += val; + if (stats[j].max < val) + stats[j].max = val; + } } for (i = 0; i < n_entries; i++) - hist_trigger_entry_print(m, hist_data, totals, + hist_trigger_entry_print(m, hist_data, stats, sort_entries[i]->key, sort_entries[i]->elt); - kfree(totals); + kfree(stats); out: tracing_map_destroy_sort_entries(sort_entries, n_entries); From ccf47f5cc4cebdba7444cf1c04fc1b02d4b3cd15 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 20 Oct 2022 00:31:55 +0900 Subject: [PATCH 18/44] tracing: Add nohitcount option for suppressing display of raw hitcount Add 'nohitcount' ('NOHC' for short) option for suppressing display of the raw hitcount column in the histogram. Note that you must specify at least one value except raw 'hitcount' when you specify this nohitcount option. # cd /sys/kernel/debug/tracing/ # echo hist:keys=pid:vals=runtime.percent,runtime.graph:sort=pid:NOHC > \ events/sched/sched_stat_runtime/trigger # sleep 10 # cat events/sched/sched_stat_runtime/hist # event histogram # # trigger info: hist:keys=pid:vals=runtime.percent,runtime.graph:sort=pid:size=2048:nohitcount [active] # { pid: 8 } runtime (%): 3.02 runtime: # { pid: 14 } runtime (%): 2.25 runtime: { pid: 16 } runtime (%): 2.25 runtime: { pid: 26 } runtime (%): 0.17 runtime: { pid: 61 } runtime (%): 11.52 runtime: #### { pid: 67 } runtime (%): 1.56 runtime: { pid: 68 } runtime (%): 0.84 runtime: { pid: 76 } runtime (%): 0.92 runtime: { pid: 117 } runtime (%): 2.50 runtime: # { pid: 146 } runtime (%): 49.88 runtime: #################### { pid: 157 } runtime (%): 16.63 runtime: ###### { pid: 158 } runtime (%): 8.38 runtime: ### Link: https://lore.kernel.org/linux-trace-kernel/166610814787.56030.4980636083486339906.stgit@devnote2 Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi --- kernel/trace/trace.c | 3 +++ kernel/trace/trace_events_hist.c | 34 ++++++++++++++++++++++++-------- 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 55aec4616d8b..948f321b9df1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5678,6 +5678,7 @@ static const char readme_msg[] = "\t [:size=#entries]\n" "\t [:pause][:continue][:clear]\n" "\t [:name=histname1]\n" + "\t [:nohitcount]\n" "\t [:.]\n" "\t [if ]\n\n" "\t Note, special fields can be used as well:\n" @@ -5734,6 +5735,8 @@ static const char readme_msg[] = "\t The 'clear' parameter will clear the contents of a running\n" "\t hist trigger and leave its current paused/active state\n" "\t unchanged.\n\n" + "\t The 'nohitcount' (or NOHC) parameter will suppress display of\n" + "\t raw hitcount in the histogram.\n\n" "\t The enable_hist and disable_hist triggers can be used to\n" "\t have one event conditionally start and stop another event's\n" "\t already-attached hist trigger. The syntax is analogous to\n" diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 946b2b8f0f2c..a0cd118af527 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -69,7 +69,8 @@ C(INVALID_STR_OPERAND, "String type can not be an operand in expression"), \ C(EXPECT_NUMBER, "Expecting numeric literal"), \ C(UNARY_MINUS_SUBEXPR, "Unary minus not supported in sub-expressions"), \ - C(DIVISION_BY_ZERO, "Division by zero"), + C(DIVISION_BY_ZERO, "Division by zero"), \ + C(NEED_NOHC_VAL, "Non-hitcount value is required for 'nohitcount'"), #undef C #define C(a, b) HIST_ERR_##a @@ -526,6 +527,7 @@ struct hist_trigger_attrs { bool cont; bool clear; bool ts_in_usecs; + bool no_hitcount; unsigned int map_bits; char *assignment_str[TRACING_MAP_VARS_MAX]; @@ -1550,7 +1552,10 @@ parse_hist_trigger_attrs(struct trace_array *tr, char *trigger_str) ret = parse_assignment(tr, str, attrs); if (ret) goto free; - } else if (strcmp(str, "pause") == 0) + } else if (strcmp(str, "nohitcount") == 0 || + strcmp(str, "NOHC") == 0) + attrs->no_hitcount = true; + else if (strcmp(str, "pause") == 0) attrs->pause = true; else if ((strcmp(str, "cont") == 0) || (strcmp(str, "continue") == 0)) @@ -4377,6 +4382,12 @@ static int create_val_fields(struct hist_trigger_data *hist_data, if (fields_str && (strcmp(fields_str, "hitcount") != 0)) ret = -EINVAL; out: + /* There is only raw hitcount but nohitcount suppresses it. */ + if (j == 1 && hist_data->attrs->no_hitcount) { + hist_err(hist_data->event_file->tr, HIST_ERR_NEED_NOHC_VAL, 0); + ret = -ENOENT; + } + return ret; } @@ -5388,13 +5399,13 @@ static void hist_trigger_entry_print(struct seq_file *m, hist_trigger_print_key(m, hist_data, key, elt); - /* At first, show the raw hitcount always */ - hist_trigger_print_val(m, i, "hitcount", 0, stats, elt); + /* At first, show the raw hitcount if !nohitcount */ + if (!hist_data->attrs->no_hitcount) + hist_trigger_print_val(m, i, "hitcount", 0, stats, elt); for (i = 1; i < hist_data->n_vals; i++) { field_name = hist_field_name(hist_data->fields[i], 0); flags = hist_data->fields[i]->flags; - if (flags & HIST_FIELD_FL_VAR || flags & HIST_FIELD_FL_EXPR) continue; @@ -5839,6 +5850,7 @@ static int event_hist_trigger_print(struct seq_file *m, struct hist_trigger_data *hist_data = data->private_data; struct hist_field *field; bool have_var = false; + bool show_val = false; unsigned int i; seq_puts(m, HIST_PREFIX); @@ -5869,12 +5881,16 @@ static int event_hist_trigger_print(struct seq_file *m, continue; } - if (i == HITCOUNT_IDX) + if (i == HITCOUNT_IDX) { + if (hist_data->attrs->no_hitcount) + continue; seq_puts(m, "hitcount"); - else { - seq_puts(m, ","); + } else { + if (show_val) + seq_puts(m, ","); hist_field_print(m, field); } + show_val = true; } if (have_var) { @@ -5925,6 +5941,8 @@ static int event_hist_trigger_print(struct seq_file *m, seq_printf(m, ":size=%u", (1 << hist_data->map->map_bits)); if (hist_data->enable_timestamps) seq_printf(m, ":clock=%s", hist_data->attrs->clock); + if (hist_data->attrs->no_hitcount) + seq_puts(m, ":nohitcount"); print_actions_spec(m, hist_data); From 8c2b99790196d34a58a2b00a4c9862d2de3af3e2 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 20 Oct 2022 00:31:56 +0900 Subject: [PATCH 19/44] tracing: docs: Update histogram doc for .percent/.graph and 'nohitcount' Update histogram document for .percent/.graph suffixes and 'nohitcount' option. Link: https://lore.kernel.org/linux-trace-kernel/166610815604.56030.4124933216911828519.stgit@devnote2 Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi --- Documentation/trace/histogram.rst | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/Documentation/trace/histogram.rst b/Documentation/trace/histogram.rst index 87bd772836c0..f95459aa984f 100644 --- a/Documentation/trace/histogram.rst +++ b/Documentation/trace/histogram.rst @@ -25,7 +25,7 @@ Documentation written by Tom Zanussi hist:keys=[:values=] [:sort=][:size=#entries][:pause][:continue] - [:clear][:name=histname1][:.] [if ] + [:clear][:name=histname1][:nohitcount][:.] [if ] When a matching event is hit, an entry is added to a hash table using the key(s) and value(s) named. Keys and values correspond to @@ -79,6 +79,8 @@ Documentation written by Tom Zanussi .log2 display log2 value rather than raw number .buckets=size display grouping of values rather than raw number .usecs display a common_timestamp in microseconds + .percent display a number of percentage value + .graph display a bar-graph of a value ============= ================================================= Note that in general the semantics of a given field aren't @@ -137,6 +139,12 @@ Documentation written by Tom Zanussi existing trigger, rather than via the '>' operator, which will cause the trigger to be removed through truncation. + The 'nohitcount' (or NOHC) parameter will suppress display of + raw hitcount in the histogram. This option requires at least one + value field which is not a 'raw hitcount'. For example, + 'hist:...:vals=hitcount:nohitcount' is rejected, but + 'hist:...:vals=hitcount.percent:nohitcount' is OK. + - enable_hist/disable_hist The enable_hist and disable_hist triggers can be used to have one From fff1787adaeebe66f27c01d5c40d8d2e4d79d5ee Mon Sep 17 00:00:00 2001 From: Song Chen Date: Wed, 19 Oct 2022 10:11:18 +0800 Subject: [PATCH 20/44] trace/kprobe: remove duplicated calls of ring_buffer_event_data Function __kprobe_trace_func calls ring_buffer_event_data to get a ring buffer, however, it has been done in above call trace_event_buffer_reserve. So does __kretprobe_trace_func. This patch removes those duplicated calls. Link: https://lore.kernel.org/all/1666145478-4706-1-git-send-email-chensong_2000@189.cn/ Reviewed-by: Steven Rostedt (Google) Acked-by: Masami Hiramatsu (Google) Signed-off-by: Song Chen Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_kprobe.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5a75b039e586..ee77c8203bd5 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1344,7 +1344,6 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, return; fbuffer.regs = regs; - entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); entry->ip = (unsigned long)tk->rp.kp.addr; store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize); @@ -1385,7 +1384,6 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, return; fbuffer.regs = regs; - entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); entry->func = (unsigned long)tk->rp.kp.addr; entry->ret_ip = get_kretprobe_retaddr(ri); store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize); From 575b76cb885532aae13a9d979fd476bb2b156cb9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 22 Nov 2022 12:23:45 -0500 Subject: [PATCH 21/44] tracing/probes: Handle system names with hyphens When creating probe names, a check is done to make sure it matches basic C standard variable naming standards. Basically, starts with alphabetic or underline, and then the rest of the characters have alpha-numeric or underline in them. But system names do not have any true naming conventions, as they are created by the TRACE_SYSTEM macro and nothing tests to see what they are. The "xhci-hcd" trace events has a '-' in the system name. When trying to attach a eprobe to one of these trace points, it fails because the system name does not follow the variable naming convention because of the hyphen, and the eprobe checks fail on this. Allow hyphens in the system name so that eprobes can attach to the "xhci-hcd" trace events. Link: https://lore.kernel.org/all/Y3eJ8GiGnEvVd8%2FN@macondo/ Link: https://lore.kernel.org/linux-trace-kernel/20221122122345.160f5077@gandalf.local.home Cc: Masami Hiramatsu Cc: stable@vger.kernel.org Fixes: 5b7a96220900e ("tracing/probe: Check event/group naming rule at parsing") Reported-by: Rafael Mendonca Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.h | 19 ++++++++++++++++--- kernel/trace/trace_probe.c | 2 +- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 48643f07bc01..8f37ff032b4f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1954,17 +1954,30 @@ static __always_inline void trace_iterator_reset(struct trace_iterator *iter) } /* Check the name is good for event/group/fields */ -static inline bool is_good_name(const char *name) +static inline bool __is_good_name(const char *name, bool hash_ok) { - if (!isalpha(*name) && *name != '_') + if (!isalpha(*name) && *name != '_' && (!hash_ok || *name != '-')) return false; while (*++name != '\0') { - if (!isalpha(*name) && !isdigit(*name) && *name != '_') + if (!isalpha(*name) && !isdigit(*name) && *name != '_' && + (!hash_ok || *name != '-')) return false; } return true; } +/* Check the name is good for event/group/fields */ +static inline bool is_good_name(const char *name) +{ + return __is_good_name(name, false); +} + +/* Check the name is good for system */ +static inline bool is_good_system_name(const char *name) +{ + return __is_good_name(name, true); +} + /* Convert certain expected symbols into '_' when generating event names */ static inline void sanitize_event_name(char *name) { diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 36dff277de46..bb2f95d7175c 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -246,7 +246,7 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, return -EINVAL; } strlcpy(buf, event, slash - event + 1); - if (!is_good_name(buf)) { + if (!is_good_system_name(buf)) { trace_probe_log_err(offset, BAD_GROUP_NAME); return -EINVAL; } From e25e43a4e5d8cb2323553d8b6a7ba08d2ebab21f Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 6 Dec 2022 23:18:01 +0900 Subject: [PATCH 22/44] tracing: Fix complicated dependency of CONFIG_TRACER_MAX_TRACE Both CONFIG_OSNOISE_TRACER and CONFIG_HWLAT_TRACER partially enables the CONFIG_TRACER_MAX_TRACE code, but that is complicated and has introduced a bug; It declares tracing_max_lat_fops data structure outside of #ifdefs, but since it is defined only when CONFIG_TRACER_MAX_TRACE=y or CONFIG_HWLAT_TRACER=y, if only CONFIG_OSNOISE_TRACER=y, that declaration comes to a definition(!). To fix this issue, and do not repeat the similar problem, makes CONFIG_OSNOISE_TRACER and CONFIG_HWLAT_TRACER enables the CONFIG_TRACER_MAX_TRACE always. It has there benefits; - Fix the tracing_max_lat_fops bug - Simplify the #ifdefs - CONFIG_TRACER_MAX_TRACE code is fully enabled, or not. Link: https://lore.kernel.org/linux-trace-kernel/167033628155.4111793.12185405690820208159.stgit@devnote3 Fixes: 424b650f35c7 ("tracing: Fix missing osnoise tracer on max_latency") Cc: Daniel Bristot de Oliveira Cc: stable@vger.kernel.org Reported-by: David Howells Reported-by: kernel test robot Signed-off-by: Masami Hiramatsu (Google) Link: https://lore.kernel.org/all/166992525941.1716618.13740663757583361463.stgit@warthog.procyon.org.uk/ (original thread and v1) Link: https://lore.kernel.org/all/202212052253.VuhZ2ulJ-lkp@intel.com/T/#u (v1 error report) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/Kconfig | 2 ++ kernel/trace/trace.c | 23 +++++++++++++---------- kernel/trace/trace.h | 8 +++----- 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e9e95c790b8e..93d724996283 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -375,6 +375,7 @@ config SCHED_TRACER config HWLAT_TRACER bool "Tracer to detect hardware latencies (like SMIs)" select GENERIC_TRACER + select TRACER_MAX_TRACE help This tracer, when enabled will create one or more kernel threads, depending on what the cpumask file is set to, which each thread @@ -410,6 +411,7 @@ config HWLAT_TRACER config OSNOISE_TRACER bool "OS Noise tracer" select GENERIC_TRACER + select TRACER_MAX_TRACE help In the context of high-performance computing (HPC), the Operating System Noise (osnoise) refers to the interference experienced by an diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 948f321b9df1..664619b3f1e1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1421,6 +1421,7 @@ int tracing_snapshot_cond_disable(struct trace_array *tr) return false; } EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable); +#define free_snapshot(tr) do { } while (0) #endif /* CONFIG_TRACER_SNAPSHOT */ void tracer_tracing_off(struct trace_array *tr) @@ -1692,6 +1693,8 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) } unsigned long __read_mostly tracing_thresh; + +#ifdef CONFIG_TRACER_MAX_TRACE static const struct file_operations tracing_max_lat_fops; #ifdef LATENCY_FS_NOTIFY @@ -1748,18 +1751,14 @@ void latency_fsnotify(struct trace_array *tr) irq_work_queue(&tr->fsnotify_irqwork); } -#elif defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) \ - || defined(CONFIG_OSNOISE_TRACER) +#else /* !LATENCY_FS_NOTIFY */ #define trace_create_maxlat_file(tr, d_tracer) \ trace_create_file("tracing_max_latency", TRACE_MODE_WRITE, \ d_tracer, &tr->max_latency, &tracing_max_lat_fops) -#else -#define trace_create_maxlat_file(tr, d_tracer) do { } while (0) #endif -#ifdef CONFIG_TRACER_MAX_TRACE /* * Copy the new maximum trace into the separate maximum-trace * structure. (this way the maximum trace is permanently saved, @@ -1834,14 +1833,15 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu, ring_buffer_record_off(tr->max_buffer.buffer); #ifdef CONFIG_TRACER_SNAPSHOT - if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data)) - goto out_unlock; + if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data)) { + arch_spin_unlock(&tr->max_lock); + return; + } #endif swap(tr->array_buffer.buffer, tr->max_buffer.buffer); __update_max_tr(tr, tsk, cpu); - out_unlock: arch_spin_unlock(&tr->max_lock); } @@ -1888,6 +1888,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) __update_max_tr(tr, tsk, cpu); arch_spin_unlock(&tr->max_lock); } + #endif /* CONFIG_TRACER_MAX_TRACE */ static int wait_on_pipe(struct trace_iterator *iter, int full) @@ -6577,7 +6578,7 @@ tracing_thresh_write(struct file *filp, const char __user *ubuf, return ret; } -#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) +#ifdef CONFIG_TRACER_MAX_TRACE static ssize_t tracing_max_lat_read(struct file *filp, char __user *ubuf, @@ -7592,7 +7593,7 @@ static const struct file_operations tracing_thresh_fops = { .llseek = generic_file_llseek, }; -#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) +#ifdef CONFIG_TRACER_MAX_TRACE static const struct file_operations tracing_max_lat_fops = { .open = tracing_open_generic, .read = tracing_max_lat_read, @@ -9606,7 +9607,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) create_trace_options_dir(tr); +#ifdef CONFIG_TRACER_MAX_TRACE trace_create_maxlat_file(tr, d_tracer); +#endif if (ftrace_create_function_files(tr, d_tracer)) MEM_FAIL(1, "Could not allocate function filter files"); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8f37ff032b4f..9dc920b01c17 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -308,8 +308,7 @@ struct trace_array { struct array_buffer max_buffer; bool allocated_snapshot; #endif -#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) \ - || defined(CONFIG_OSNOISE_TRACER) +#ifdef CONFIG_TRACER_MAX_TRACE unsigned long max_latency; #ifdef CONFIG_FSNOTIFY struct dentry *d_max_latency; @@ -688,12 +687,11 @@ void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu, void *cond_data); void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); -#endif /* CONFIG_TRACER_MAX_TRACE */ -#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) \ - || defined(CONFIG_OSNOISE_TRACER)) && defined(CONFIG_FSNOTIFY) +#ifdef CONFIG_FSNOTIFY #define LATENCY_FS_NOTIFY #endif +#endif /* CONFIG_TRACER_MAX_TRACE */ #ifdef LATENCY_FS_NOTIFY void latency_fsnotify(struct trace_array *tr); From 4994e387d7332f03cd4eab55f7896ddf04cab1c0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 6 Dec 2022 14:12:02 -0500 Subject: [PATCH 23/44] x86/mm/kmmio: Switch to arch_spin_lock() The mmiotrace tracer is "special". The purpose is to help reverse engineer binary drivers by removing the memory allocated by the driver and when the driver goes to access it, a fault occurs, the mmiotracer will record what the driver was doing and then do the work on its behalf by single stepping through the process. But to achieve this ability, it must do some special things. One is it needs to grab a lock while in the breakpoint handler. This is considered an NMI state, and then lockdep warns that the lock is being held in both an NMI state (really a breakpoint handler) and also in normal context. As the breakpoint/NMI state only happens when the driver is accessing memory, there's no concern of a race condition against the setup and tear-down of mmiotracer. To make lockdep and mmiotrace work together, convert the locks used in the breakpoint handler into arch_spin_lock(). Link: https://lkml.kernel.org/r/20221206191229.656244029@goodmis.org Link: https://lore.kernel.org/lkml/20221201213126.620b7dd3@gandalf.local.home/ Cc: Masami Hiramatsu Cc: Andrew Morton Cc: Karol Herbst Cc: Pekka Paalanen Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Borislav Petkov Suggested-by: Thomas Gleixner Signed-off-by: Steven Rostedt (Google) --- arch/x86/mm/kmmio.c | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index d3efbc5b3449..edb486450158 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -62,7 +62,13 @@ struct kmmio_context { int active; }; -static DEFINE_SPINLOCK(kmmio_lock); +/* + * The kmmio_lock is taken in int3 context, which is treated as NMI context. + * This causes lockdep to complain about it bein in both NMI and normal + * context. Hide it from lockdep, as it should not have any other locks + * taken under it, and this is only enabled for debugging mmio anyway. + */ +static arch_spinlock_t kmmio_lock = __ARCH_SPIN_LOCK_UNLOCKED; /* Protected by kmmio_lock */ unsigned int kmmio_count; @@ -346,10 +352,10 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) ctx->probe->post_handler(ctx->probe, condition, regs); /* Prevent racing against release_kmmio_fault_page(). */ - spin_lock(&kmmio_lock); + arch_spin_lock(&kmmio_lock); if (ctx->fpage->count) arm_kmmio_fault_page(ctx->fpage); - spin_unlock(&kmmio_lock); + arch_spin_unlock(&kmmio_lock); regs->flags &= ~X86_EFLAGS_TF; regs->flags |= ctx->saved_flags; @@ -440,7 +446,8 @@ int register_kmmio_probe(struct kmmio_probe *p) unsigned int l; pte_t *pte; - spin_lock_irqsave(&kmmio_lock, flags); + local_irq_save(flags); + arch_spin_lock(&kmmio_lock); if (get_kmmio_probe(addr)) { ret = -EEXIST; goto out; @@ -460,7 +467,9 @@ int register_kmmio_probe(struct kmmio_probe *p) size += page_level_size(l); } out: - spin_unlock_irqrestore(&kmmio_lock, flags); + arch_spin_unlock(&kmmio_lock); + local_irq_restore(flags); + /* * XXX: What should I do here? * Here was a call to global_flush_tlb(), but it does not exist @@ -494,7 +503,8 @@ static void remove_kmmio_fault_pages(struct rcu_head *head) struct kmmio_fault_page **prevp = &dr->release_list; unsigned long flags; - spin_lock_irqsave(&kmmio_lock, flags); + local_irq_save(flags); + arch_spin_lock(&kmmio_lock); while (f) { if (!f->count) { list_del_rcu(&f->list); @@ -506,7 +516,8 @@ static void remove_kmmio_fault_pages(struct rcu_head *head) } f = *prevp; } - spin_unlock_irqrestore(&kmmio_lock, flags); + arch_spin_unlock(&kmmio_lock); + local_irq_restore(flags); /* This is the real RCU destroy call. */ call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages); @@ -540,14 +551,16 @@ void unregister_kmmio_probe(struct kmmio_probe *p) if (!pte) return; - spin_lock_irqsave(&kmmio_lock, flags); + local_irq_save(flags); + arch_spin_lock(&kmmio_lock); while (size < size_lim) { release_kmmio_fault_page(addr + size, &release_list); size += page_level_size(l); } list_del_rcu(&p->list); kmmio_count--; - spin_unlock_irqrestore(&kmmio_lock, flags); + arch_spin_unlock(&kmmio_lock); + local_irq_restore(flags); if (!release_list) return; From 20fb6c997600012209699564a79911584dd5876b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 9 Dec 2022 13:41:44 -0500 Subject: [PATCH 24/44] x86/mm/kmmio: Use rcu_read_lock_sched_notrace() The mmiotrace tracer is "special". The purpose is to help reverse engineer binary drivers by removing the memory allocated by the driver and when the driver goes to access it, a fault occurs, the mmiotracer will record what the driver was doing and then do the work on its behalf by single stepping through the process. But to achieve this ability, it must do some special things. One is to take the rcu_read_lock() when the fault occurs, and then release it in the breakpoint that is single stepping. This makes lockdep unhappy, as it changes the state of RCU from within an exception that is not contained in that exception, and we get a nasty splat from lockdep. Instead, switch to rcu_read_lock_sched_notrace() as the RCU sched variant has the same grace period as normal RCU. This is basically the same as rcu_read_lock() but does not make lockdep complain about it. Note, the preempt_disable() is still needed as it uses preempt_enable_no_resched(). Link: https://lore.kernel.org/linux-trace-kernel/20221209134144.04f33626@gandalf.local.home Cc: Masami Hiramatsu Cc: Andrew Morton Cc: Karol Herbst Cc: Pekka Paalanen Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Acked-by: Paul E. McKenney Signed-off-by: Steven Rostedt (Google) --- arch/x86/mm/kmmio.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index edb486450158..853c49877c16 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -254,7 +254,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) * again. */ preempt_disable(); - rcu_read_lock(); + rcu_read_lock_sched_notrace(); faultpage = get_kmmio_fault_page(page_base); if (!faultpage) { @@ -323,7 +323,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) return 1; /* fault handled */ no_kmmio: - rcu_read_unlock(); + rcu_read_unlock_sched_notrace(); preempt_enable_no_resched(); return ret; } @@ -363,7 +363,7 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) /* These were acquired in kmmio_handler(). */ ctx->active--; BUG_ON(ctx->active); - rcu_read_unlock(); + rcu_read_unlock_sched_notrace(); preempt_enable_no_resched(); /* From 2cc6a528882d0e0ccbc1bca5f95b8c963cedac54 Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Wed, 7 Dec 2022 11:46:35 +0800 Subject: [PATCH 25/44] tracing/hist: Fix wrong return value in parse_action_params() When number of synth fields is more than SYNTH_FIELDS_MAX, parse_action_params() should return -EINVAL. Link: https://lore.kernel.org/linux-trace-kernel/20221207034635.2253990-1-zhengyejian1@huawei.com Cc: Cc: Cc: stable@vger.kernel.org Fixes: c282a386a397 ("tracing: Add 'onmatch' hist trigger action support") Signed-off-by: Zheng Yejian Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index a0cd118af527..b4ad86c22b43 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -3609,6 +3609,7 @@ static int parse_action_params(struct trace_array *tr, char *params, while (params) { if (data->n_params >= SYNTH_FIELDS_MAX) { hist_err(tr, HIST_ERR_TOO_MANY_PARAMS, 0); + ret = -EINVAL; goto out; } From 82470f7d9044842618c847a7166de2b7458157a7 Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Wed, 7 Dec 2022 11:51:43 +0800 Subject: [PATCH 26/44] tracing/hist: Fix out-of-bound write on 'action_data.var_ref_idx' When generate a synthetic event with many params and then create a trace action for it [1], kernel panic happened [2]. It is because that in trace_action_create() 'data->n_params' is up to SYNTH_FIELDS_MAX (current value is 64), and array 'data->var_ref_idx' keeps indices into array 'hist_data->var_refs' for each synthetic event param, but the length of 'data->var_ref_idx' is TRACING_MAP_VARS_MAX (current value is 16), so out-of-bound write happened when 'data->n_params' more than 16. In this case, 'data->match_data.event' is overwritten and eventually cause the panic. To solve the issue, adjust the length of 'data->var_ref_idx' to be SYNTH_FIELDS_MAX and add sanity checks to avoid out-of-bound write. [1] # cd /sys/kernel/tracing/ # echo "my_synth_event int v1; int v2; int v3; int v4; int v5; int v6;\ int v7; int v8; int v9; int v10; int v11; int v12; int v13; int v14;\ int v15; int v16; int v17; int v18; int v19; int v20; int v21; int v22;\ int v23; int v24; int v25; int v26; int v27; int v28; int v29; int v30;\ int v31; int v32; int v33; int v34; int v35; int v36; int v37; int v38;\ int v39; int v40; int v41; int v42; int v43; int v44; int v45; int v46;\ int v47; int v48; int v49; int v50; int v51; int v52; int v53; int v54;\ int v55; int v56; int v57; int v58; int v59; int v60; int v61; int v62;\ int v63" >> synthetic_events # echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="bash"' >> \ events/sched/sched_waking/trigger # echo "hist:keys=next_pid:onmatch(sched.sched_waking).my_synth_event(\ pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,\ pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,\ pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,pid,\ pid,pid,pid,pid,pid,pid,pid,pid,pid)" >> events/sched/sched_switch/trigger [2] BUG: unable to handle page fault for address: ffff91c900000000 PGD 61001067 P4D 61001067 PUD 0 Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 2 PID: 322 Comm: bash Tainted: G W 6.1.0-rc8+ #229 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.15.0-0-g2dd4b9b3f840-prebuilt.qemu.org 04/01/2014 RIP: 0010:strcmp+0xc/0x30 Code: 75 f7 31 d2 44 0f b6 04 16 44 88 04 11 48 83 c2 01 45 84 c0 75 ee c3 cc cc cc cc 0f 1f 00 31 c0 eb 08 48 83 c0 01 84 d2 74 13 <0f> b6 14 07 3a 14 06 74 ef 19 c0 83 c8 01 c3 cc cc cc cc 31 c3 RSP: 0018:ffff9b3b00f53c48 EFLAGS: 00000246 RAX: 0000000000000000 RBX: ffffffffba958a68 RCX: 0000000000000000 RDX: 0000000000000010 RSI: ffff91c943d33a90 RDI: ffff91c900000000 RBP: ffff91c900000000 R08: 00000018d604b529 R09: 0000000000000000 R10: ffff91c9483eddb1 R11: ffff91ca483eddab R12: ffff91c946171580 R13: ffff91c9479f0538 R14: ffff91c9457c2848 R15: ffff91c9479f0538 FS: 00007f1d1cfbe740(0000) GS:ffff91c9bdc80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffff91c900000000 CR3: 0000000006316000 CR4: 00000000000006e0 Call Trace: __find_event_file+0x55/0x90 action_create+0x76c/0x1060 event_hist_trigger_parse+0x146d/0x2060 ? event_trigger_write+0x31/0xd0 trigger_process_regex+0xbb/0x110 event_trigger_write+0x6b/0xd0 vfs_write+0xc8/0x3e0 ? alloc_fd+0xc0/0x160 ? preempt_count_add+0x4d/0xa0 ? preempt_count_add+0x70/0xa0 ksys_write+0x5f/0xe0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7f1d1d0cf077 Code: 64 89 02 48 c7 c0 ff ff ff ff eb bb 0f 1f 80 00 00 00 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 48 89 54 24 18 48 89 74 RSP: 002b:00007ffcebb0e568 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 0000000000000143 RCX: 00007f1d1d0cf077 RDX: 0000000000000143 RSI: 00005639265aa7e0 RDI: 0000000000000001 RBP: 00005639265aa7e0 R08: 000000000000000a R09: 0000000000000142 R10: 000056392639c017 R11: 0000000000000246 R12: 0000000000000143 R13: 00007f1d1d1ae6a0 R14: 00007f1d1d1aa4a0 R15: 00007f1d1d1a98a0 Modules linked in: CR2: ffff91c900000000 ---[ end trace 0000000000000000 ]--- RIP: 0010:strcmp+0xc/0x30 Code: 75 f7 31 d2 44 0f b6 04 16 44 88 04 11 48 83 c2 01 45 84 c0 75 ee c3 cc cc cc cc 0f 1f 00 31 c0 eb 08 48 83 c0 01 84 d2 74 13 <0f> b6 14 07 3a 14 06 74 ef 19 c0 83 c8 01 c3 cc cc cc cc 31 c3 RSP: 0018:ffff9b3b00f53c48 EFLAGS: 00000246 RAX: 0000000000000000 RBX: ffffffffba958a68 RCX: 0000000000000000 RDX: 0000000000000010 RSI: ffff91c943d33a90 RDI: ffff91c900000000 RBP: ffff91c900000000 R08: 00000018d604b529 R09: 0000000000000000 R10: ffff91c9483eddb1 R11: ffff91ca483eddab R12: ffff91c946171580 R13: ffff91c9479f0538 R14: ffff91c9457c2848 R15: ffff91c9479f0538 FS: 00007f1d1cfbe740(0000) GS:ffff91c9bdc80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffff91c900000000 CR3: 0000000006316000 CR4: 00000000000006e0 Link: https://lore.kernel.org/linux-trace-kernel/20221207035143.2278781-1-zhengyejian1@huawei.com Cc: Cc: Cc: stable@vger.kernel.org Fixes: d380dcde9a07 ("tracing: Fix now invalid var_ref_vals assumption in trace action") Signed-off-by: Zheng Yejian Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index b4ad86c22b43..8264b28d5a57 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -621,7 +621,7 @@ struct action_data { * event param, and is passed to the synthetic event * invocation. */ - unsigned int var_ref_idx[TRACING_MAP_VARS_MAX]; + unsigned int var_ref_idx[SYNTH_FIELDS_MAX]; struct synth_event *synth_event; bool use_trace_keyword; char *synth_event_name; @@ -2186,7 +2186,9 @@ static struct hist_field *create_var_ref(struct hist_trigger_data *hist_data, return ref_field; } } - + /* Sanity check to avoid out-of-bound write on 'hist_data->var_refs' */ + if (hist_data->n_var_refs >= TRACING_MAP_VARS_MAX) + return NULL; ref_field = create_hist_field(var_field->hist_data, NULL, flags, NULL); if (ref_field) { if (init_var_ref(ref_field, var_field, system, event_name)) { @@ -3946,6 +3948,10 @@ static int trace_action_create(struct hist_trigger_data *hist_data, lockdep_assert_held(&event_mutex); + /* Sanity check to avoid out-of-bound write on 'data->var_ref_idx' */ + if (data->n_params > SYNTH_FIELDS_MAX) + return -EINVAL; + if (data->use_trace_keyword) synth_event_name = data->synth_event_name; else From ff4837f7fe59ff018eca4705a70eca5e0b486b97 Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Wed, 7 Dec 2022 17:15:57 +0800 Subject: [PATCH 27/44] tracing: Fix issue of missing one synthetic field The maximum number of synthetic fields supported is defined as SYNTH_FIELDS_MAX which value currently is 64, but it actually fails when try to generate a synthetic event with 64 fields by executing like: # echo "my_synth_event int v1; int v2; int v3; int v4; int v5; int v6;\ int v7; int v8; int v9; int v10; int v11; int v12; int v13; int v14;\ int v15; int v16; int v17; int v18; int v19; int v20; int v21; int v22;\ int v23; int v24; int v25; int v26; int v27; int v28; int v29; int v30;\ int v31; int v32; int v33; int v34; int v35; int v36; int v37; int v38;\ int v39; int v40; int v41; int v42; int v43; int v44; int v45; int v46;\ int v47; int v48; int v49; int v50; int v51; int v52; int v53; int v54;\ int v55; int v56; int v57; int v58; int v59; int v60; int v61; int v62;\ int v63; int v64" >> /sys/kernel/tracing/synthetic_events Correct the field counting to fix it. Link: https://lore.kernel.org/linux-trace-kernel/20221207091557.3137904-1-zhengyejian1@huawei.com Cc: Cc: Cc: stable@vger.kernel.org Fixes: c9e759b1e845 ("tracing: Rework synthetic event command parsing") Signed-off-by: Zheng Yejian Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_synth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index c3b582d19b62..67592eed0be8 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -1282,12 +1282,12 @@ static int __create_synth_event(const char *name, const char *raw_fields) goto err_free_arg; } - fields[n_fields++] = field; if (n_fields == SYNTH_FIELDS_MAX) { synth_err(SYNTH_ERR_TOO_MANY_FIELDS, 0); ret = -EINVAL; goto err_free_arg; } + fields[n_fields++] = field; n_fields_this_loop++; } From 608c6ed3337850c767ab0dd6c583477922233e29 Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Wed, 7 Dec 2022 21:53:26 +0800 Subject: [PATCH 28/44] tracing/hist: Fix issue of losting command info in error_log When input some constructed invalid 'trigger' command, command info in 'error_log' are lost [1]. The root cause is that there is a path that event_hist_trigger_parse() is recursely called once and 'last_cmd' which save origin command is cleared, then later calling of hist_err() will no longer record origin command info: event_hist_trigger_parse() { last_cmd_set() // <1> 'last_cmd' save origin command here at first create_actions() { onmatch_create() { action_create() { trace_action_create() { trace_action_create_field_var() { create_field_var_hist() { event_hist_trigger_parse() { // <2> recursely called once hist_err_clear() // <3> 'last_cmd' is cleared here } hist_err() // <4> No longer find origin command!!! Since 'glob' is empty string while running into the recurse call, we can trickly check it and bypass the call of hist_err_clear() to solve it. [1] # cd /sys/kernel/tracing # echo "my_synth_event int v1; int v2; int v3;" >> synthetic_events # echo 'hist:keys=pid' >> events/sched/sched_waking/trigger # echo "hist:keys=next_pid:onmatch(sched.sched_waking).my_synth_event(\ pid,pid1)" >> events/sched/sched_switch/trigger # cat error_log [ 8.405018] hist:sched:sched_switch: error: Couldn't find synthetic event Command: hist:keys=next_pid:onmatch(sched.sched_waking).my_synth_event(pid,pid1) ^ [ 8.816902] hist:sched:sched_switch: error: Couldn't find field Command: hist:keys=next_pid:onmatch(sched.sched_waking).my_synth_event(pid,pid1) ^ [ 8.816902] hist:sched:sched_switch: error: Couldn't parse field variable Command: hist:keys=next_pid:onmatch(sched.sched_waking).my_synth_event(pid,pid1) ^ [ 8.999880] : error: Couldn't find field Command: ^ [ 8.999880] : error: Couldn't parse field variable Command: ^ [ 8.999880] : error: Couldn't find field Command: ^ [ 8.999880] : error: Couldn't create histogram for field Command: ^ Link: https://lore.kernel.org/linux-trace-kernel/20221207135326.3483216-1-zhengyejian1@huawei.com Cc: Cc: Fixes: f404da6e1d46 ("tracing: Add 'last error' error facility for hist triggers") Signed-off-by: Zheng Yejian Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 8264b28d5a57..fcaf226b7744 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -6576,7 +6576,7 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, if (se) se->ref++; out: - if (ret == 0) + if (ret == 0 && glob[0]) hist_err_clear(); return ret; From 88ca6a71dcab4a4ba6e6e2ff66415a5c4f86e874 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 9 Dec 2022 10:11:51 -0500 Subject: [PATCH 29/44] ring-buffer: Handle resize in early boot up With the new command line option that allows trace event triggers to be added at boot, the "snapshot" trigger will allocate the snapshot buffer very early, when interrupts can not be enabled. Allocating the ring buffer is not the problem, but it also resizes it, which is, as the resize code does synchronization that can not be preformed at early boot. To handle this, first change the raw_spin_lock_irq() in rb_insert_pages() to raw_spin_lock_irqsave(), such that the unlocking of that spin lock will not enable interrupts. Next, where it calls schedule_work_on(), disable migration and check if the CPU to update is the current CPU, and if so, perform the work directly, otherwise re-enable migration and call the schedule_work_on() to the CPU that is being updated. The rb_insert_pages() just needs to be run on the CPU that it is updating, and does not need preemption nor interrupts disabled when calling it. Link: https://lore.kernel.org/lkml/Y5J%2FCajlNh1gexvo@google.com/ Link: https://lore.kernel.org/linux-trace-kernel/20221209101151.1fec1167@gandalf.local.home Cc: Masami Hiramatsu Cc: Andrew Morton Fixes: a01fdc897fa5 ("tracing: Add trace_trigger kernel command line option") Reported-by: Ross Zwisler Signed-off-by: Steven Rostedt Tested-by: Ross Zwisler Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 843818ee4814..c366a0a9ddba 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2062,8 +2062,10 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer) { struct list_head *pages = &cpu_buffer->new_pages; int retries, success; + unsigned long flags; - raw_spin_lock_irq(&cpu_buffer->reader_lock); + /* Can be called at early boot up, where interrupts must not been enabled */ + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); /* * We are holding the reader lock, so the reader page won't be swapped * in the ring buffer. Now we are racing with the writer trying to @@ -2120,7 +2122,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer) * tracing */ RB_WARN_ON(cpu_buffer, !success); - raw_spin_unlock_irq(&cpu_buffer->reader_lock); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); /* free pages if they weren't inserted */ if (!success) { @@ -2248,8 +2250,16 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, rb_update_pages(cpu_buffer); cpu_buffer->nr_pages_to_update = 0; } else { - schedule_work_on(cpu, - &cpu_buffer->update_pages_work); + /* Run directly if possible. */ + migrate_disable(); + if (cpu != smp_processor_id()) { + migrate_enable(); + schedule_work_on(cpu, + &cpu_buffer->update_pages_work); + } else { + update_pages_handler(&cpu_buffer->update_pages_work); + migrate_enable(); + } } } @@ -2298,9 +2308,17 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, if (!cpu_online(cpu_id)) rb_update_pages(cpu_buffer); else { - schedule_work_on(cpu_id, - &cpu_buffer->update_pages_work); - wait_for_completion(&cpu_buffer->update_done); + /* Run directly if possible. */ + migrate_disable(); + if (cpu_id == smp_processor_id()) { + rb_update_pages(cpu_buffer); + migrate_enable(); + } else { + migrate_enable(); + schedule_work_on(cpu_id, + &cpu_buffer->update_pages_work); + wait_for_completion(&cpu_buffer->update_done); + } } cpu_buffer->nr_pages_to_update = 0; From 98629dadcd020cba690ce0c49a564a1166444646 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Thu, 8 Dec 2022 17:33:10 -0700 Subject: [PATCH 30/44] tracing: remove unnecessary trace_trigger ifdef The trace_trigger command line option introduced by commit a01fdc897fa5 ("tracing: Add trace_trigger kernel command line option") doesn't need to depend on the CONFIG_HIST_TRIGGERS kernel config option. This code doesn't depend on the histogram code, and the run-time selection of triggers is usable without CONFIG_HIST_TRIGGERS. Link: https://lore.kernel.org/linux-trace-kernel/20221209003310.1737039-1-zwisler@google.com Cc: Masami Hiramatsu Cc: Andrew Morton Cc: Mathieu Desnoyers Cc: Joel Fernandes Cc: Tom Zanussi Fixes: a01fdc897fa5 ("tracing: Add trace_trigger kernel command line option") Signed-off-by: Ross Zwisler Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 3bfaf560ecc4..33e0b4f8ebe6 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2796,7 +2796,6 @@ trace_create_new_event(struct trace_event_call *call, return file; } -#ifdef CONFIG_HIST_TRIGGERS #define MAX_BOOT_TRIGGERS 32 static struct boot_triggers { @@ -2832,7 +2831,6 @@ static __init int setup_trace_triggers(char *str) return 1; } __setup("trace_trigger=", setup_trace_triggers); -#endif /* Add an event to a trace directory */ static int @@ -2850,7 +2848,6 @@ __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr) return event_define_fields(call); } -#ifdef CONFIG_HIST_TRIGGERS static void trace_early_triggers(struct trace_event_file *file, const char *name) { int ret; @@ -2868,9 +2865,6 @@ static void trace_early_triggers(struct trace_event_file *file, const char *name bootup_triggers[i].event); } } -#else -static inline void trace_early_triggers(struct trace_event_file *file, const char *name) { } -#endif /* * Just create a descriptor for early init. A descriptor is required From ec370890f92ba8ad5476a34068655b06ba48def7 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Mon, 5 Dec 2022 09:27:38 +0100 Subject: [PATCH 31/44] tracing/osnoise: Make osnoise_options static Make osnoise_options static, as reported by the kernel test robot. Link: https://lkml.kernel.org/r/63255826485400d7a2270e9c5e66111079671e7a.1670228712.git.bristot@kernel.org Reported-by: kernel test robot Cc: Daniel Bristot de Oliveira Cc: Steven Rostedt Cc: Masami Hiramatsu Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_osnoise.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 3f10dd1f2f1c..8ba82c71268f 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -59,8 +59,8 @@ enum osnoise_options_index { static const char * const osnoise_options_str[OSN_MAX] = { "DEFAULTS", "OSNOISE_WORKLOAD" }; -#define OSN_DEFAULT_OPTIONS 0x2 -unsigned long osnoise_options = OSN_DEFAULT_OPTIONS; +#define OSN_DEFAULT_OPTIONS 0x2 +static unsigned long osnoise_options = OSN_DEFAULT_OPTIONS; /* * trace_array of the enabled osnoise/timerlat instances. From bfd5a5e82d22da43afa0e2bb9fb72339aa79c6cc Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 5 Dec 2022 10:21:52 +0000 Subject: [PATCH 32/44] tracing: Fix some checker warnings Fix some checker warnings in the trace code by adding __printf attributes to a number of trace functions and their declarations. Changes: ======== ver #2) - Dropped the fix for the unconditional tracing_max_lat_fops decl[1]. Link: https://lore.kernel.org/r/20221205180617.9b9d3971cbe06ee536603523@kernel.org/ [1] Link: https://lore.kernel.org/r/166992525941.1716618.13740663757583361463.stgit@warthog.procyon.org.uk/ # v1 Link: https://lkml.kernel.org/r/167023571258.382307.15314866482834835192.stgit@warthog.procyon.org.uk Signed-off-by: David Howells Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 3 ++- include/linux/trace_seq.h | 3 ++- kernel/trace/trace.h | 2 +- kernel/trace/trace_output.c | 5 +++-- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index bb2053246d6a..4342e996bcdb 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -234,7 +234,8 @@ void tracing_record_taskinfo_sched_switch(struct task_struct *prev, void tracing_record_cmdline(struct task_struct *task); void tracing_record_tgid(struct task_struct *task); -int trace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...); +int trace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...) + __printf(3, 4); struct event_filter; diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index 5a2c650d9e1c..0c4c7587d6c3 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -97,7 +97,8 @@ extern int trace_seq_hex_dump(struct trace_seq *s, const char *prefix_str, const void *buf, size_t len, bool ascii); #else /* CONFIG_TRACING */ -static inline void trace_seq_printf(struct trace_seq *s, const char *fmt, ...) +static inline __printf(2, 3) +void trace_seq_printf(struct trace_seq *s, const char *fmt, ...) { } static inline void diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9dc920b01c17..e46a49269be2 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -614,7 +614,7 @@ void trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer, bool trace_is_tracepoint_string(const char *str); const char *trace_event_format(struct trace_iterator *iter, const char *fmt); void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, - va_list ap); + va_list ap) __printf(2, 0); int trace_empty(struct trace_iterator *iter); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index f0ba97121345..57a13b61f186 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -322,8 +322,9 @@ void trace_event_printf(struct trace_iterator *iter, const char *fmt, ...) } EXPORT_SYMBOL(trace_event_printf); -static int trace_output_raw(struct trace_iterator *iter, char *name, - char *fmt, va_list ap) +static __printf(3, 0) +int trace_output_raw(struct trace_iterator *iter, char *name, + char *fmt, va_list ap) { struct trace_seq *s = &iter->seq; From 0e162c6f1c8f7f502b3978f5a37ed6fa2e4d21f9 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Fri, 25 Nov 2022 10:43:00 +0700 Subject: [PATCH 33/44] Documentation/osnoise: Escape underscore of NO_ prefix kernel test robot reported unknown target name warning: Documentation/trace/osnoise-tracer.rst:112: WARNING: Unknown target name: "no". The warning causes NO_ prefix to be rendered as link text instead, which points to non-existent link target. Escape the prefix underscore to fix the warning. Link: https://lkml.kernel.org/r/20221125034300.24168-1-bagasdotme@gmail.com Cc: Masami Hiramatsu Cc: Jonathan Corbet Cc: Ammar Faizi Cc: GNU/Weeb Mailing List Link: https://lore.kernel.org/linux-doc/202211240447.HxRNftE5-lkp@intel.com/ Fixes: 67543cd6b8eee5 ("Documentation/osnoise: Add osnoise/options documentation") Reported-by: kernel test robot Signed-off-by: Bagas Sanjaya Acked-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- Documentation/trace/osnoise-tracer.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/trace/osnoise-tracer.rst b/Documentation/trace/osnoise-tracer.rst index 3c675ed82b27..fdd562d7c22d 100644 --- a/Documentation/trace/osnoise-tracer.rst +++ b/Documentation/trace/osnoise-tracer.rst @@ -111,7 +111,7 @@ The tracer has a set of options inside the osnoise directory, they are: be used, which is currently 5 us. - osnoise/options: a set of on/off options that can be enabled by writing the option name to the file or disabled by writing the option - name preceded with the 'NO_' prefix. For example, writing + name preceded with the 'NO\_' prefix. For example, writing NO_OSNOISE_WORKLOAD disables the OSNOISE_WORKLOAD option. The special DEAFAULTS option resets all options to the default value. From 1603dda47714cebe8a29b2154407da7a929d13f4 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 9 Dec 2022 23:05:52 +0100 Subject: [PATCH 34/44] tracing/osnoise: Add PANIC_ON_STOP option Often the latency observed in a CPU is not caused by the work being done in the CPU itself, but by work done on another CPU that causes the hardware to stall all CPUs. In this case, it is interesting to know what is happening on ALL CPUs, and the best way to do this is via crash dump analysis. Add the PANIC_ON_STOP option to osnoise/timerlat tracers. The default behavior is having this option off. When enabled by the user, the system will panic after hitting a stop tracing condition. This option was motivated by a real scenario that Juri Lelli and I were debugging. Link: https://lkml.kernel.org/r/249ce4287c6725543e6db845a6e0df621dc67db5.1670623111.git.bristot@kernel.org Cc: Juri Lelli Cc: Clark Williams Cc: Bagas Sanjaya Cc: Daniel Bristot de Oliveira Cc: Masami Hiramatsu Cc: Jonathan Corbet Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_osnoise.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 8ba82c71268f..5a7613942223 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -54,10 +54,11 @@ enum osnoise_options_index { OSN_DEFAULTS = 0, OSN_WORKLOAD, + OSN_PANIC_ON_STOP, OSN_MAX }; -static const char * const osnoise_options_str[OSN_MAX] = { "DEFAULTS", "OSNOISE_WORKLOAD" }; +static const char * const osnoise_options_str[OSN_MAX] = { "DEFAULTS", "OSNOISE_WORKLOAD", "PANIC_ON_STOP" }; #define OSN_DEFAULT_OPTIONS 0x2 static unsigned long osnoise_options = OSN_DEFAULT_OPTIONS; @@ -1270,6 +1271,9 @@ static __always_inline void osnoise_stop_tracing(void) trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_, "stop tracing hit on cpu %d\n", smp_processor_id()); + if (test_bit(OSN_PANIC_ON_STOP, &osnoise_options)) + panic("tracer hit stop condition on CPU %d\n", smp_processor_id()); + tracer_tracing_off(tr); } rcu_read_unlock(); From b5dce2002567a9b1a83ef3e3a8678d8c32be2a78 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 9 Dec 2022 23:05:53 +0100 Subject: [PATCH 35/44] tracing/osnoise: Add preempt and/or irq disabled options The osnoise workload runs with preemption and IRQs enabled in such a way as to allow all sorts of noise to disturb osnoise's execution. hwlat tracer has a similar workload but works with irq disabled, allowing only NMIs and the hardware to generate noise. While thinking about adding an options file to hwlat tracer to allow the system to panic, and other features I was thinking to add, like having a tracepoint at each noise detection, it came to my mind that is easier to make osnoise and also do hardware latency detection than making hwlat "feature compatible" with osnoise. Other points are: - osnoise already has an independent cpu file. - osnoise has a more intuitive interface, e.g., runtime/period vs. window/width (and people often need help remembering what it is). - osnoise: tracepoints - osnoise stop options - osnoise options file itself Moreover, the user-space side (in rtla) is simplified by reusing the existing osnoise code. Finally, people have been asking me about using osnoise for hw latency detection, and I have to explain that it was sufficient but not necessary. These options make it sufficient and necessary. Adding a Suggested-by Clark, as he often asked me about this possibility. Link: https://lkml.kernel.org/r/d9c6c19135497054986900f94c8e47410b15316a.1670623111.git.bristot@kernel.org Cc: Suggested-by: Clark Williams Cc: Juri Lelli Cc: Bagas Sanjaya Cc: Daniel Bristot de Oliveira Cc: Masami Hiramatsu Cc: Jonathan Corbet Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_osnoise.c | 48 ++++++++++++++++++++++++++++++++---- 1 file changed, 43 insertions(+), 5 deletions(-) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 5a7613942223..94c1b5eb1dc0 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -55,10 +55,17 @@ enum osnoise_options_index { OSN_DEFAULTS = 0, OSN_WORKLOAD, OSN_PANIC_ON_STOP, + OSN_PREEMPT_DISABLE, + OSN_IRQ_DISABLE, OSN_MAX }; -static const char * const osnoise_options_str[OSN_MAX] = { "DEFAULTS", "OSNOISE_WORKLOAD", "PANIC_ON_STOP" }; +static const char * const osnoise_options_str[OSN_MAX] = { + "DEFAULTS", + "OSNOISE_WORKLOAD", + "PANIC_ON_STOP", + "OSNOISE_PREEMPT_DISABLE", + "OSNOISE_IRQ_DISABLE" }; #define OSN_DEFAULT_OPTIONS 0x2 static unsigned long osnoise_options = OSN_DEFAULT_OPTIONS; @@ -1308,18 +1315,26 @@ static void notify_new_max_latency(u64 latency) */ static int run_osnoise(void) { + bool disable_irq = test_bit(OSN_IRQ_DISABLE, &osnoise_options); struct osnoise_variables *osn_var = this_cpu_osn_var(); u64 start, sample, last_sample; u64 last_int_count, int_count; s64 noise = 0, max_noise = 0; s64 total, last_total = 0; struct osnoise_sample s; + bool disable_preemption; unsigned int threshold; u64 runtime, stop_in; u64 sum_noise = 0; int hw_count = 0; int ret = -1; + /* + * Disabling preemption is only required if IRQs are enabled, + * and the options is set on. + */ + disable_preemption = !disable_irq && test_bit(OSN_PREEMPT_DISABLE, &osnoise_options); + /* * Considers the current thread as the workload. */ @@ -1335,6 +1350,15 @@ static int run_osnoise(void) */ threshold = tracing_thresh ? : 5000; + /* + * Apply PREEMPT and IRQ disabled options. + */ + if (disable_irq) + local_irq_disable(); + + if (disable_preemption) + preempt_disable(); + /* * Make sure NMIs see sampling first */ @@ -1422,16 +1446,21 @@ static int run_osnoise(void) * cond_resched() */ if (IS_ENABLED(CONFIG_PREEMPT_RCU)) { - local_irq_disable(); + if (!disable_irq) + local_irq_disable(); + rcu_momentary_dyntick_idle(); - local_irq_enable(); + + if (!disable_irq) + local_irq_enable(); } /* * For the non-preemptive kernel config: let threads runs, if - * they so wish. + * they so wish, unless set not do to so. */ - cond_resched(); + if (!disable_irq && !disable_preemption) + cond_resched(); last_sample = sample; last_int_count = int_count; @@ -1450,6 +1479,15 @@ static int run_osnoise(void) */ barrier(); + /* + * Return to the preemptive state. + */ + if (disable_preemption) + preempt_enable(); + + if (disable_irq) + local_irq_enable(); + /* * Save noise info. */ From d358dfe60b7724ad0acb8cf8375a608b983e2b59 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 9 Dec 2022 23:05:54 +0100 Subject: [PATCH 36/44] Documentation/osnoise: Add osnoise/options documentation Add the documentation about the osnoise/options file, the options, and some additional explanation about the OSNOISE_WORKLOAD option. Link: https://lkml.kernel.org/r/fde5567a4bae364f67fd1e9a644d1d62862618a6.1670623111.git.bristot@kernel.org Cc: Daniel Bristot de Oliveira Cc: Juri Lelli Cc: Clark Williams Cc: Masami Hiramatsu Cc: Jonathan Corbet Cc: Bagas Sanjaya Signed-off-by: Daniel Bristot de Oliveira Reviewed-by: Bagas Sanjaya Signed-off-by: Steven Rostedt (Google) --- Documentation/trace/osnoise-tracer.rst | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/Documentation/trace/osnoise-tracer.rst b/Documentation/trace/osnoise-tracer.rst index fdd562d7c22d..140ef2533d26 100644 --- a/Documentation/trace/osnoise-tracer.rst +++ b/Documentation/trace/osnoise-tracer.rst @@ -92,8 +92,8 @@ Note that the example above shows a high number of HW noise samples. The reason being is that this sample was taken on a virtual machine, and the host interference is detected as a hardware interference. -Tracer options ---------------------- +Tracer Configuration +-------------------- The tracer has a set of options inside the osnoise directory, they are: @@ -115,6 +115,22 @@ The tracer has a set of options inside the osnoise directory, they are: NO_OSNOISE_WORKLOAD disables the OSNOISE_WORKLOAD option. The special DEAFAULTS option resets all options to the default value. +Tracer Options +-------------- + +The osnoise/options file exposes a set of on/off configuration options for +the osnoise tracer. These options are: + + - DEFAULTS: reset the options to the default value. + - OSNOISE_WORKLOAD: do not dispatch osnoise workload (see dedicated + section below). + - PANIC_ON_STOP: call panic() if the tracer stops. This option serves to + capture a vmcore. + - OSNOISE_PREEMPT_DISABLE: disable preemption while running the osnoise + workload, allowing only IRQ and hardware-related noise. + - OSNOISE_IRQ_DISABLE: disable IRQs while running the osnoise workload, + allowing only NMIs and hardware-related noise, like hwlat tracer. + Additional Tracing ------------------ From c1ac03af6ed45d05786c219d102f37eb44880f28 Mon Sep 17 00:00:00 2001 From: Yang Jihong Date: Tue, 29 Nov 2022 19:30:09 +0800 Subject: [PATCH 37/44] tracing: Fix infinite loop in tracing_read_pipe on overflowed print_trace_line print_trace_line may overflow seq_file buffer. If the event is not consumed, the while loop keeps peeking this event, causing a infinite loop. Link: https://lkml.kernel.org/r/20221129113009.182425-1-yangjihong1@huawei.com Cc: Masami Hiramatsu Cc: stable@vger.kernel.org Fixes: 088b1e427dbba ("ftrace: pipe fixes") Signed-off-by: Yang Jihong Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 664619b3f1e1..548890c7c0f5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6802,7 +6802,20 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, ret = print_trace_line(iter); if (ret == TRACE_TYPE_PARTIAL_LINE) { - /* don't print partial lines */ + /* + * If one print_trace_line() fills entire trace_seq in one shot, + * trace_seq_to_user() will returns -EBUSY because save_len == 0, + * In this case, we need to consume it, otherwise, loop will peek + * this event next time, resulting in an infinite loop. + */ + if (save_len == 0) { + iter->seq.full = 0; + trace_seq_puts(&iter->seq, "[LINE TOO BIG]\n"); + trace_consume(iter); + break; + } + + /* In other cases, don't print partial lines */ iter->seq.seq.len = save_len; break; } From 3e12758392bee50135301b0189c064ab80980aca Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Mon, 12 Dec 2022 10:37:03 -0500 Subject: [PATCH 38/44] x86/mm/kmmio: Remove redundant preempt_disable() Now that kmmio uses rcu_read_lock_sched_notrace() there's no reason to call preempt_disable() as the read_lock_sched_notrace() already does that and is redundant. This also removes the preempt_enable_no_resched() as the "no_resched()" portion was bogus as there's no reason to do that. Link: https://lkml.kernel.org/r/20221212103703.7129cc5d@gandalf.local.home Cc: Masami Hiramatsu Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Borislav Petkov Cc: "x86@kernel.org" Cc: Karol Herbst Cc: Pekka Paalanen Cc: Dave Hansen Cc: Andy Lutomirski Cc: Ingo Molnar Signed-off-by: Steven Rostedt (Google) --- arch/x86/mm/kmmio.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 853c49877c16..9f82019179e1 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -246,14 +246,13 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) page_base &= page_level_mask(l); /* - * Preemption is now disabled to prevent process switch during - * single stepping. We can only handle one active kmmio trace + * Hold the RCU read lock over single stepping to avoid looking + * up the probe and kmmio_fault_page again. The rcu_read_lock_sched() + * also disables preemption and prevents process switch during + * the single stepping. We can only handle one active kmmio trace * per cpu, so ensure that we finish it before something else - * gets to run. We also hold the RCU read lock over single - * stepping to avoid looking up the probe and kmmio_fault_page - * again. + * gets to run. */ - preempt_disable(); rcu_read_lock_sched_notrace(); faultpage = get_kmmio_fault_page(page_base); @@ -324,7 +323,6 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) no_kmmio: rcu_read_unlock_sched_notrace(); - preempt_enable_no_resched(); return ret; } @@ -364,7 +362,6 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) ctx->active--; BUG_ON(ctx->active); rcu_read_unlock_sched_notrace(); - preempt_enable_no_resched(); /* * if somebody else is singlestepping across a probe point, flags From a785736d7e587abbfd59df44e9a815d8d109c28c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 13 Dec 2022 09:56:02 -0500 Subject: [PATCH 39/44] tracing: Have trigger filter parsing errors show up in error_log It is annoying that the filter parsing of triggers do not show up in the error_log. Trying to figure out what is incorrect in the input is difficult when it fails for a typo. Have the errors of filter parsing show up in error_log as well. Link: https://lore.kernel.org/linux-trace-kernel/20221213095602.083fa9fd@gandalf.local.home Cc: Masami Hiramatsu Cc: Tom Zanussi Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_trigger.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 918730d74932..19ce9d22bfd7 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1067,7 +1067,14 @@ int set_trigger_filter(char *filter_str, /* The filter is for the 'trigger' event, not the triggered event */ ret = create_event_filter(file->tr, file->event_call, - filter_str, false, &filter); + filter_str, true, &filter); + + /* Only enabled set_str for error handling */ + if (filter) { + kfree(filter->filter_string); + filter->filter_string = NULL; + } + /* * If create_event_filter() fails, filter still needs to be freed. * Which the calling code will do with data->filter. From fab89a09c86f948adfc7e20a7d608bd9f323bbe1 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Mon, 12 Dec 2022 19:38:14 -0500 Subject: [PATCH 40/44] tracing: Remove pointer (asterisk) and brackets from cpumask_t field To differentiate between long arrays and cpumasks, the __cpumask() field was created. Part of the TRACE_EVENT() macros test if the type is signed or not by using the is_signed_type() macro. The __cpumask() field used the __dynamic_array() helper but because cpumask_t is a structure, it could not be used in the is_signed_type() macro as that would fail to build, so instead it passed in the pointer to cpumask_t. Unfortunately, that creates in the format file: field:__data_loc cpumask_t *[] mask; offset:36; size:4; signed:0; Which looks like an array of pointers to cpumask_t and not a cpumask_t type, which is misleading to user space parsers. Douglas Raillard pointed out that the "[]" are also misleading, as cpumask_t is not an array. Since cpumask() hasn't been created yet, and the parsers currently fail on it (but will still produce the raw output), make it be: field:__data_loc cpumask_t mask; offset:36; size:4; signed:0; Which is the correct type of the field. Then the parsers can be updated to handle this. Link: https://lore.kernel.org/lkml/6dda5e1d-9416-b55e-88f3-31d148bc925f@arm.com/ Link: https://lore.kernel.org/linux-trace-kernel/20221212193814.0e3f1e43@gandalf.local.home Cc: Masami Hiramatsu Cc: Valentin Schneider Cc: Andrew Morton Fixes: 8230f27b1ccc ("tracing: Add __cpumask to denote a trace event field that is a cpumask_t") Reported-by: Douglas Raillard Signed-off-by: Steven Rostedt (Google) --- include/trace/stages/stage4_event_fields.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/include/trace/stages/stage4_event_fields.h b/include/trace/stages/stage4_event_fields.h index f2990d22313c..affd541fd25e 100644 --- a/include/trace/stages/stage4_event_fields.h +++ b/include/trace/stages/stage4_event_fields.h @@ -47,7 +47,10 @@ #define __bitmask(item, nr_bits) __dynamic_array(unsigned long, item, -1) #undef __cpumask -#define __cpumask(item) __dynamic_array(cpumask_t *, item, -1) +#define __cpumask(item) { \ + .type = "__data_loc cpumask_t", .name = #item, \ + .size = 4, .align = 4, \ + .is_signed = 0, .filter_type = FILTER_OTHER }, #undef __sockaddr #define __sockaddr(field, len) __dynamic_array(u8, field, len) @@ -68,7 +71,10 @@ #define __rel_bitmask(item, nr_bits) __rel_dynamic_array(unsigned long, item, -1) #undef __rel_cpumask -#define __rel_cpumask(item) __rel_dynamic_array(cpumask_t *, item, -1) +#define __rel_cpumask(item) { \ + .type = "__rel_loc cpumask_t", .name = #item, \ + .size = 4, .align = 4, \ + .is_signed = 0, .filter_type = FILTER_OTHER }, #undef __rel_sockaddr #define __rel_sockaddr(field, len) __rel_dynamic_array(u8, field, len) From fb9f5ee9bf5cf7ebc8731a7033e57d98832dc650 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 13 Dec 2022 17:24:29 -0500 Subject: [PATCH 41/44] tracing: Do not synchronize freeing of trigger filter on boot up If a trigger filter on the kernel command line fails to apply (due to syntax error), it will be freed. The freeing will call tracepoint_synchronize_unregister(), but this is not needed during early boot up, and will even trigger a lockdep splat. Avoid calling the synchronization function when system_state is SYSTEM_BOOTING. Link: https://lore.kernel.org/linux-trace-kernel/20221213172429.7774f4ba@gandalf.local.home Cc: Andrew Morton Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_trigger.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 19ce9d22bfd7..e535959939d3 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1085,8 +1085,14 @@ int set_trigger_filter(char *filter_str, rcu_assign_pointer(data->filter, filter); if (tmp) { - /* Make sure the call is done with the filter */ - tracepoint_synchronize_unregister(); + /* + * Make sure the call is done with the filter. + * It is possible that a filter could fail at boot up, + * and then this path will be called. Avoid the synchronization + * in that case. + */ + if (system_state != SYSTEM_BOOTING) + tracepoint_synchronize_unregister(); free_event_filter(tmp); } From d0b24b4e91fcb8408c4979888547f86be514e337 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Tue, 15 Nov 2022 17:48:47 -0300 Subject: [PATCH 42/44] ftrace: Prevent RCU stall on PREEMPT_VOLUNTARY kernels The function match_records() may take a while due to a large number of string comparisons, so when in PREEMPT_VOLUNTARY kernels we could face RCU stalls due to that. Add a cond_resched() to prevent that. Link: https://lkml.kernel.org/r/20221115204847.593616-1-gpiccoli@igalia.com Cc: Mark Rutland Suggested-by: Steven Rostedt Acked-by: Paul E. McKenney # from RCU CPU stall warning perspective Signed-off-by: Guilherme G. Piccoli Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d04552c0c275..b8e374a372e5 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4204,6 +4204,7 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod) } found = 1; } + cond_resched(); } while_for_each_ftrace_rec(); out_unlock: mutex_unlock(&ftrace_lock); From ea47666ca4288becf82c51eb3da6d3ff891470a4 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Fri, 19 Aug 2022 19:17:26 -0300 Subject: [PATCH 43/44] tracing: Improve panic/die notifiers Currently the tracing dump_on_oops feature is implemented through separate notifiers, one for die/oops and the other for panic; given they have the same functionality, let's unify them. Also improve the function comment and change the priority of the notifier to make it execute earlier, avoiding showing useless trace data (like the callback names for the other notifiers); finally, we also removed an unnecessary header inclusion. Link: https://lkml.kernel.org/r/20220819221731.480795-7-gpiccoli@igalia.com Cc: Petr Mladek Cc: Sergei Shtylyov Signed-off-by: Guilherme G. Piccoli Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 55 ++++++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 28 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 548890c7c0f5..6d7ef130f57e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -9876,40 +9875,40 @@ static __init int tracer_init_tracefs(void) fs_initcall(tracer_init_tracefs); -static int trace_panic_handler(struct notifier_block *this, - unsigned long event, void *unused) -{ - if (ftrace_dump_on_oops) - ftrace_dump(ftrace_dump_on_oops); - return NOTIFY_OK; -} +static int trace_die_panic_handler(struct notifier_block *self, + unsigned long ev, void *unused); static struct notifier_block trace_panic_notifier = { - .notifier_call = trace_panic_handler, - .next = NULL, - .priority = 150 /* priority: INT_MAX >= x >= 0 */ + .notifier_call = trace_die_panic_handler, + .priority = INT_MAX - 1, }; -static int trace_die_handler(struct notifier_block *self, - unsigned long val, - void *data) -{ - switch (val) { - case DIE_OOPS: - if (ftrace_dump_on_oops) - ftrace_dump(ftrace_dump_on_oops); - break; - default: - break; - } - return NOTIFY_OK; -} - static struct notifier_block trace_die_notifier = { - .notifier_call = trace_die_handler, - .priority = 200 + .notifier_call = trace_die_panic_handler, + .priority = INT_MAX - 1, }; +/* + * The idea is to execute the following die/panic callback early, in order + * to avoid showing irrelevant information in the trace (like other panic + * notifier functions); we are the 2nd to run, after hung_task/rcu_stall + * warnings get disabled (to prevent potential log flooding). + */ +static int trace_die_panic_handler(struct notifier_block *self, + unsigned long ev, void *unused) +{ + if (!ftrace_dump_on_oops) + return NOTIFY_DONE; + + /* The die notifier requires DIE_OOPS to trigger */ + if (self == &trace_die_notifier && ev != DIE_OOPS) + return NOTIFY_DONE; + + ftrace_dump(ftrace_dump_on_oops); + + return NOTIFY_DONE; +} + /* * printk is set to max of 1024, we really don't need it that big. * Nothing should be printing 1000 characters anyway. From eb9d58947d40699d93e5e69e1ddc54e41da7e132 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 13 Dec 2022 22:12:27 -0500 Subject: [PATCH 44/44] tracing: Fix cpumask() example typo The sample code for using cpumask used the wrong field for the __get_cpumask() helper. It used "cpus" which is the bitmask (but would still give a proper example) instead of the "cpum" that was there to be used. Although it produces the same output, fix it, because it's an example and is confusing in how to properly use the cpumask() macro. Link: https://lore.kernel.org/linux-trace-kernel/20221213221227.56560374@gandalf.local.home Cc: Andrew Morton Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- samples/trace_events/trace-events-sample.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h index fb4548a44153..1c6b843b8c4e 100644 --- a/samples/trace_events/trace-events-sample.h +++ b/samples/trace_events/trace-events-sample.h @@ -359,7 +359,7 @@ TRACE_EVENT(foo_bar, __print_array(__get_dynamic_array(list), __get_dynamic_array_len(list) / sizeof(int), sizeof(int)), - __get_str(str), __get_bitmask(cpus), __get_cpumask(cpus), + __get_str(str), __get_bitmask(cpus), __get_cpumask(cpum), __get_str(vstr)) );