2018-08-16 15:23:53 +00:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
2009-02-24 15:21:36 +00:00
|
|
|
/*
|
|
|
|
* event tracer
|
|
|
|
*
|
|
|
|
* Copyright (C) 2008 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
|
|
|
|
*
|
2009-03-02 18:53:59 +00:00
|
|
|
* - Added format output of fields of the trace point.
|
|
|
|
* This was based off of work by Tom Zanussi <tzanussi@gmail.com>.
|
|
|
|
*
|
2009-02-24 15:21:36 +00:00
|
|
|
*/
|
|
|
|
|
2014-06-07 11:43:08 +00:00
|
|
|
#define pr_fmt(fmt) fmt
|
|
|
|
|
2009-04-15 17:36:40 +00:00
|
|
|
#include <linux/workqueue.h>
|
2019-10-11 21:22:50 +00:00
|
|
|
#include <linux/security.h>
|
2009-04-15 17:36:40 +00:00
|
|
|
#include <linux/spinlock.h>
|
|
|
|
#include <linux/kthread.h>
|
2015-01-20 17:13:40 +00:00
|
|
|
#include <linux/tracefs.h>
|
2009-02-24 15:21:36 +00:00
|
|
|
#include <linux/uaccess.h>
|
|
|
|
#include <linux/module.h>
|
|
|
|
#include <linux/ctype.h>
|
2015-09-24 15:33:26 +00:00
|
|
|
#include <linux/sort.h>
|
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h
percpu.h is included by sched.h and module.h and thus ends up being
included when building most .c files. percpu.h includes slab.h which
in turn includes gfp.h making everything defined by the two files
universally available and complicating inclusion dependencies.
percpu.h -> slab.h dependency is about to be removed. Prepare for
this change by updating users of gfp and slab facilities include those
headers directly instead of assuming availability. As this conversion
needs to touch large number of source files, the following script is
used as the basis of conversion.
http://userweb.kernel.org/~tj/misc/slabh-sweep.py
The script does the following.
* Scan files for gfp and slab usages and update includes such that
only the necessary includes are there. ie. if only gfp is used,
gfp.h, if slab is used, slab.h.
* When the script inserts a new include, it looks at the include
blocks and try to put the new include such that its order conforms
to its surrounding. It's put in the include block which contains
core kernel includes, in the same order that the rest are ordered -
alphabetical, Christmas tree, rev-Xmas-tree or at the end if there
doesn't seem to be any matching order.
* If the script can't find a place to put a new include (mostly
because the file doesn't have fitting include block), it prints out
an error message indicating which .h file needs to be added to the
file.
The conversion was done in the following steps.
1. The initial automatic conversion of all .c files updated slightly
over 4000 files, deleting around 700 includes and adding ~480 gfp.h
and ~3000 slab.h inclusions. The script emitted errors for ~400
files.
2. Each error was manually checked. Some didn't need the inclusion,
some needed manual addition while adding it to implementation .h or
embedding .c file was more appropriate for others. This step added
inclusions to around 150 files.
3. The script was run again and the output was compared to the edits
from #2 to make sure no file was left behind.
4. Several build tests were done and a couple of problems were fixed.
e.g. lib/decompress_*.c used malloc/free() wrappers around slab
APIs requiring slab.h to be added manually.
5. The script was run on all .h files but without automatically
editing them as sprinkling gfp.h and slab.h inclusions around .h
files could easily lead to inclusion dependency hell. Most gfp.h
inclusion directives were ignored as stuff from gfp.h was usually
wildly available and often used in preprocessor macros. Each
slab.h inclusion directive was examined and added manually as
necessary.
6. percpu.h was updated not to include slab.h.
7. Build tests were done on the following configurations and failures
were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my
distributed build env didn't work with gcov compiles) and a few
more options had to be turned off depending on archs to make things
build (like ipr on powerpc/64 which failed due to missing writeq).
* x86 and x86_64 UP and SMP allmodconfig and a custom test config.
* powerpc and powerpc64 SMP allmodconfig
* sparc and sparc64 SMP allmodconfig
* ia64 SMP allmodconfig
* s390 SMP allmodconfig
* alpha SMP allmodconfig
* um on x86_64 SMP allmodconfig
8. percpu.h modifications were reverted so that it could be applied as
a separate patch and serve as bisection point.
Given the fact that I had only a couple of failures from tests on step
6, I'm fairly confident about the coverage of this conversion patch.
If there is a breakage, it's likely to be something in one of the arch
headers which should be easily discoverable on most builds of
the specific arch.
Signed-off-by: Tejun Heo <tj@kernel.org>
Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 08:04:11 +00:00
|
|
|
#include <linux/slab.h>
|
2009-04-15 17:36:40 +00:00
|
|
|
#include <linux/delay.h>
|
2009-02-24 15:21:36 +00:00
|
|
|
|
2015-09-25 16:58:44 +00:00
|
|
|
#include <trace/events/sched.h>
|
2019-10-24 20:26:59 +00:00
|
|
|
#include <trace/syscall.h>
|
2015-09-25 16:58:44 +00:00
|
|
|
|
2009-07-01 02:47:05 +00:00
|
|
|
#include <asm/setup.h>
|
|
|
|
|
2009-03-02 20:03:01 +00:00
|
|
|
#include "trace_output.h"
|
2009-02-24 15:21:36 +00:00
|
|
|
|
2009-09-12 23:26:21 +00:00
|
|
|
#undef TRACE_SYSTEM
|
2009-02-28 04:32:58 +00:00
|
|
|
#define TRACE_SYSTEM "TRACE_SYSTEM"
|
|
|
|
|
2009-05-06 02:33:45 +00:00
|
|
|
/*
 * Mutex for the event code.
 * NOTE(review): presumably serializes updates to the event lists declared
 * below; the lock's users are not visible in this part of the file - confirm.
 */
DEFINE_MUTEX(event_mutex);
|
2009-03-02 16:49:04 +00:00
|
|
|
|
2009-04-10 17:52:20 +00:00
|
|
|
/*
 * Non-static list head.
 * NOTE(review): presumably the list of registered trace events - confirm
 * against the code that adds to it.
 */
LIST_HEAD(ftrace_events);
|
2015-08-10 12:35:46 +00:00
|
|
|
/*
 * Fallback field list: trace_find_event_field() searches it when the name is
 * not found among the event's own fields.
 */
static LIST_HEAD(ftrace_generic_fields);
|
2013-03-11 07:13:42 +00:00
|
|
|
/*
 * Last-resort field list: trace_find_event_field() searches it after both the
 * event's own fields and ftrace_generic_fields have missed.
 */
static LIST_HEAD(ftrace_common_fields);
|
2020-09-10 12:38:58 +00:00
|
|
|
/*
 * NOTE(review): the name suggests this records whether the event directory
 * has been set up; no reader or writer is visible here - confirm.
 */
static bool eventdir_initialized;
|
2009-04-10 17:52:20 +00:00
|
|
|
|
tracing: Have type enum modifications copy the strings
When an enum is used in the visible parts of a trace event that is
exported to user space, the user space applications like perf and
trace-cmd do not have a way to know what the value of the enum is. To
solve this, at boot up (or module load) the printk formats are modified to
replace the enum with their numeric value in the string output.
Array fields of the event are defined by [<nr-elements>] in the type
portion of the format file so that the user space parsers can correctly
parse the array into the appropriate size chunks. But in some trace
events, an enum is used in defining the size of the array, which once
again breaks the parsing of user space tooling.
This was solved the same way as the print formats were, but it modified
the type strings of the trace event. This caused crashes in some
architectures because the type string, as opposed to the print string, is a const string
value. This was not detected on x86, as it appears that const strings are
still writable (at least in boot up), but other architectures this is not
the case, and writing to a const string will cause a kernel fault.
To fix this, use kstrdup() to copy the type before modifying it. If the
trace event is for the core kernel there's no need to free it because the
string will be in use for the life of the machine being on line. For
modules, create a link list to store all the strings being allocated for
modules and when the module is removed, free them.
Link: https://lore.kernel.org/all/yt9dr1706b4i.fsf@linux.ibm.com/
Link: https://lkml.kernel.org/r/20220318153432.3984b871@gandalf.local.home
Tested-by: Marc Zyngier <maz@kernel.org>
Tested-by: Sven Schnelle <svens@linux.ibm.com>
Reported-by: Sven Schnelle <svens@linux.ibm.com>
Fixes: b3bc8547d3be ("tracing: Have TRACE_DEFINE_ENUM affect trace event types as well")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2022-03-18 19:34:32 +00:00
|
|
|
/*
 * List of struct module_string entries: copies of event type strings that
 * were allocated on behalf of modules, kept so they can be freed when the
 * owning module is removed.
 */
static LIST_HEAD(module_strings);
|
|
|
|
|
|
|
|
/* One allocated string tracked on the module_strings list. */
struct module_string {
|
|
|
|
/* Linkage on the module_strings list. */
struct list_head next;
|
|
|
|
/* Module the string was allocated for. */
struct module *module;
|
|
|
|
/* The allocated string itself. */
char *str;
|
|
|
|
};
|
|
|
|
|
2013-02-28 01:23:57 +00:00
|
|
|
/* Allocation flags used in this file: GFP_KERNEL with __GFP_ZERO added. */
#define GFP_TRACE (GFP_KERNEL | __GFP_ZERO)
|
|
|
|
|
|
|
|
/* Cache that __trace_define_field() allocates struct ftrace_event_field from. */
static struct kmem_cache *field_cachep;
|
|
|
|
/*
 * NOTE(review): presumably the cache for struct trace_event_file objects; no
 * user is visible in this part of the file - confirm.
 */
static struct kmem_cache *file_cachep;
|
|
|
|
|
2013-06-27 14:58:31 +00:00
|
|
|
static inline int system_refcount(struct event_subsystem *system)
|
|
|
|
{
|
2015-09-09 21:24:01 +00:00
|
|
|
return system->ref_count;
|
2013-06-27 14:58:31 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static int system_refcount_inc(struct event_subsystem *system)
|
|
|
|
{
|
2015-09-09 21:24:01 +00:00
|
|
|
return system->ref_count++;
|
2013-06-27 14:58:31 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static int system_refcount_dec(struct event_subsystem *system)
|
|
|
|
{
|
2015-09-09 21:24:01 +00:00
|
|
|
return --system->ref_count;
|
2013-06-27 14:58:31 +00:00
|
|
|
}
|
|
|
|
|
2012-05-04 03:09:03 +00:00
|
|
|
/*
 * Iterate over every event file of every trace array.
 *
 * do_for_each_event_file*() opens an outer loop over ftrace_trace_arrays and
 * an inner loop over tr->events; while_for_each_event_file() supplies the
 * closing brace of the outer loop, so the two must always be used as a pair.
 *
 * Double loops, do not use break, only goto's work: a "break" in the body
 * only leaves the inner loop.
 */
|
|
|
|
/* Plain variant: both loops use list_for_each_entry(). */
#define do_for_each_event_file(tr, file) \
|
|
|
|
list_for_each_entry(tr, &ftrace_trace_arrays, list) { \
|
|
|
|
list_for_each_entry(file, &tr->events, list)
|
|
|
|
|
|
|
|
|
|
/*
 * Safe variant: the inner loop uses list_for_each_entry_safe() with the
 * macro-local cursor ___n declared inside the outer loop's block.
 */
#define do_for_each_event_file_safe(tr, file) \
|
|
|
|
list_for_each_entry(tr, &ftrace_trace_arrays, list) { \
|
2015-05-05 14:09:53 +00:00
|
|
|
struct trace_event_file *___n; \
|
2012-05-04 03:09:03 +00:00
|
|
|
list_for_each_entry_safe(file, ___n, &tr->events, list)
|
|
|
|
|
|
|
|
|
|
/* Closes the outer loop's block opened by either do_for_each_event_file*(). */
#define while_for_each_event_file() \
|
|
|
|
}
|
|
|
|
|
2013-03-11 07:13:42 +00:00
|
|
|
static struct ftrace_event_field *
|
tracing: Check "%s" dereference via the field and not the TP_printk format
The TP_printk() portion of a trace event is executed at the time a event
is read from the trace. This can happen seconds, minutes, hours, days,
months, years possibly later since the event was recorded. If the print
format contains a dereference to a string via "%s", and that string was
allocated, there's a chance that string could be freed before it is read
by the trace file.
To protect against such bugs, there are two functions that verify the
event. The first one is test_event_printk(), which is called when the
event is created. It reads the TP_printk() format as well as its arguments
to make sure nothing may be dereferencing a pointer that was not copied
into the ring buffer along with the event. If it is, it will trigger a
WARN_ON().
For strings that use "%s", it is not so easy. The string may not reside in
the ring buffer but may still be valid. Strings that are static and part
of the kernel proper which will not be freed for the life of the running
system, are safe to dereference. But to know if it is a pointer to a
static string or to something on the heap can not be determined until the
event is triggered.
This brings us to the second function that tests for the bad dereferencing
of strings, trace_check_vprintf(). It would walk through the printf format
looking for "%s", and when it finds it, it would validate that the pointer
is safe to read. If not, it would produces a WARN_ON() as well and write
into the ring buffer "[UNSAFE-MEMORY]".
The problem with this is how it used va_list to have vsnprintf() handle
all the cases that it didn't need to check. Instead of re-implementing
vsnprintf(), it would make a copy of the format up to the %s part, and
call vsnprintf() with the current va_list ap variable, where the ap would
then be ready to point at the string in question.
For architectures that passed va_list by reference this was possible. For
architectures that passed it by copy it was not. A test_can_verify()
function was used to differentiate between the two, and if it wasn't
possible, it would disable it.
Even for architectures where this was feasible, it was a stretch to rely
on such a method that is undocumented, and could cause issues later on
with new optimizations of the compiler.
Instead, the first function test_event_printk() was updated to look at
"%s" as well. If the "%s" argument is a pointer outside the event in the
ring buffer, it would find the field type of the event that is the problem
and mark the structure with a new flag called "needs_test". The event
itself will be marked by TRACE_EVENT_FL_TEST_STR to let it be known that
this event has a field that needs to be verified before the event can be
printed using the printf format.
When the event fields are created from the field type structure, the
fields would copy the field type's "needs_test" value.
Finally, before being printed, a new function ignore_event() is called
which will check if the event has the TEST_STR flag set (if not, it
returns false). If the flag is set, it then iterates through the events
fields looking for the ones that have the "needs_test" flag set.
Then it uses the offset field from the field structure to find the pointer
in the ring buffer event. It runs the tests to make sure that pointer is
safe to print and if not, it triggers the WARN_ON() and also adds to the
trace output that the event in question has an unsafe memory access.
The ignore_event() makes the trace_check_vprintf() obsolete so it is
removed.
Link: https://lore.kernel.org/all/CAHk-=wh3uOnqnZPpR0PeLZZtyWbZLboZ7cHLCKRWsocvs9Y7hQ@mail.gmail.com/
Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/20241217024720.848621576@goodmis.org
Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2024-12-17 02:41:22 +00:00
|
|
|
__find_event_field(struct list_head *head, const char *name)
|
2013-03-11 07:13:42 +00:00
|
|
|
{
|
|
|
|
struct ftrace_event_field *field;
|
|
|
|
|
|
|
|
list_for_each_entry(field, head, link) {
|
|
|
|
if (!strcmp(field->name, name))
|
|
|
|
return field;
|
|
|
|
}
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
struct ftrace_event_field *
|
2015-05-05 15:45:27 +00:00
|
|
|
trace_find_event_field(struct trace_event_call *call, char *name)
|
2013-03-11 07:13:42 +00:00
|
|
|
{
|
|
|
|
struct ftrace_event_field *field;
|
|
|
|
struct list_head *head;
|
|
|
|
|
2016-03-03 22:18:20 +00:00
|
|
|
head = trace_get_fields(call);
|
|
|
|
field = __find_event_field(head, name);
|
2015-08-10 12:35:46 +00:00
|
|
|
if (field)
|
|
|
|
return field;
|
|
|
|
|
2016-03-03 22:18:20 +00:00
|
|
|
field = __find_event_field(&ftrace_generic_fields, name);
|
2013-03-11 07:13:42 +00:00
|
|
|
if (field)
|
|
|
|
return field;
|
|
|
|
|
2016-03-03 22:18:20 +00:00
|
|
|
return __find_event_field(&ftrace_common_fields, name);
|
2013-03-11 07:13:42 +00:00
|
|
|
}
|
|
|
|
|
2010-05-24 08:22:49 +00:00
|
|
|
static int __trace_define_field(struct list_head *head, const char *type,
|
|
|
|
const char *name, int offset, int size,
|
tracing: Check "%s" dereference via the field and not the TP_printk format
The TP_printk() portion of a trace event is executed at the time a event
is read from the trace. This can happen seconds, minutes, hours, days,
months, years possibly later since the event was recorded. If the print
format contains a dereference to a string via "%s", and that string was
allocated, there's a chance that string could be freed before it is read
by the trace file.
To protect against such bugs, there are two functions that verify the
event. The first one is test_event_printk(), which is called when the
event is created. It reads the TP_printk() format as well as its arguments
to make sure nothing may be dereferencing a pointer that was not copied
into the ring buffer along with the event. If it is, it will trigger a
WARN_ON().
For strings that use "%s", it is not so easy. The string may not reside in
the ring buffer but may still be valid. Strings that are static and part
of the kernel proper which will not be freed for the life of the running
system, are safe to dereference. But to know if it is a pointer to a
static string or to something on the heap can not be determined until the
event is triggered.
This brings us to the second function that tests for the bad dereferencing
of strings, trace_check_vprintf(). It would walk through the printf format
looking for "%s", and when it finds it, it would validate that the pointer
is safe to read. If not, it would produces a WARN_ON() as well and write
into the ring buffer "[UNSAFE-MEMORY]".
The problem with this is how it used va_list to have vsnprintf() handle
all the cases that it didn't need to check. Instead of re-implementing
vsnprintf(), it would make a copy of the format up to the %s part, and
call vsnprintf() with the current va_list ap variable, where the ap would
then be ready to point at the string in question.
For architectures that passed va_list by reference this was possible. For
architectures that passed it by copy it was not. A test_can_verify()
function was used to differentiate between the two, and if it wasn't
possible, it would disable it.
Even for architectures where this was feasible, it was a stretch to rely
on such a method that is undocumented, and could cause issues later on
with new optimizations of the compiler.
Instead, the first function test_event_printk() was updated to look at
"%s" as well. If the "%s" argument is a pointer outside the event in the
ring buffer, it would find the field type of the event that is the problem
and mark the structure with a new flag called "needs_test". The event
itself will be marked by TRACE_EVENT_FL_TEST_STR to let it be known that
this event has a field that needs to be verified before the event can be
printed using the printf format.
When the event fields are created from the field type structure, the
fields would copy the field type's "needs_test" value.
Finally, before being printed, a new function ignore_event() is called
which will check if the event has the TEST_STR flag set (if not, it
returns false). If the flag is set, it then iterates through the events
fields looking for the ones that have the "needs_test" flag set.
Then it uses the offset field from the field structure to find the pointer
in the ring buffer event. It runs the tests to make sure that pointer is
safe to print and if not, it triggers the WARN_ON() and also adds to the
trace output that the event in question has an unsafe memory access.
The ignore_event() makes the trace_check_vprintf() obsolete so it is
removed.
Link: https://lore.kernel.org/all/CAHk-=wh3uOnqnZPpR0PeLZZtyWbZLboZ7cHLCKRWsocvs9Y7hQ@mail.gmail.com/
Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/20241217024720.848621576@goodmis.org
Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2024-12-17 02:41:22 +00:00
|
|
|
int is_signed, int filter_type, int len,
|
|
|
|
int need_test)
|
2009-03-22 08:30:39 +00:00
|
|
|
{
|
|
|
|
struct ftrace_event_field *field;
|
|
|
|
|
2013-02-28 01:23:57 +00:00
|
|
|
field = kmem_cache_alloc(field_cachep, GFP_TRACE);
|
2009-03-22 08:30:39 +00:00
|
|
|
if (!field)
|
2013-06-07 06:07:48 +00:00
|
|
|
return -ENOMEM;
|
2009-03-22 17:41:59 +00:00
|
|
|
|
2013-02-28 01:41:37 +00:00
|
|
|
field->name = name;
|
|
|
|
field->type = type;
|
2009-03-22 17:41:59 +00:00
|
|
|
|
2009-08-07 02:33:22 +00:00
|
|
|
if (filter_type == FILTER_OTHER)
|
|
|
|
field->filter_type = filter_assign_type(type);
|
|
|
|
else
|
|
|
|
field->filter_type = filter_type;
|
|
|
|
|
2009-03-22 08:30:39 +00:00
|
|
|
field->offset = offset;
|
|
|
|
field->size = size;
|
2009-04-28 08:04:53 +00:00
|
|
|
field->is_signed = is_signed;
|
tracing: Check "%s" dereference via the field and not the TP_printk format
The TP_printk() portion of a trace event is executed at the time a event
is read from the trace. This can happen seconds, minutes, hours, days,
months, years possibly later since the event was recorded. If the print
format contains a dereference to a string via "%s", and that string was
allocated, there's a chance that string could be freed before it is read
by the trace file.
To protect against such bugs, there are two functions that verify the
event. The first one is test_event_printk(), which is called when the
event is created. It reads the TP_printk() format as well as its arguments
to make sure nothing may be dereferencing a pointer that was not copied
into the ring buffer along with the event. If it is, it will trigger a
WARN_ON().
For strings that use "%s", it is not so easy. The string may not reside in
the ring buffer but may still be valid. Strings that are static and part
of the kernel proper which will not be freed for the life of the running
system, are safe to dereference. But to know if it is a pointer to a
static string or to something on the heap can not be determined until the
event is triggered.
This brings us to the second function that tests for the bad dereferencing
of strings, trace_check_vprintf(). It would walk through the printf format
looking for "%s", and when it finds it, it would validate that the pointer
is safe to read. If not, it would produces a WARN_ON() as well and write
into the ring buffer "[UNSAFE-MEMORY]".
The problem with this is how it used va_list to have vsnprintf() handle
all the cases that it didn't need to check. Instead of re-implementing
vsnprintf(), it would make a copy of the format up to the %s part, and
call vsnprintf() with the current va_list ap variable, where the ap would
then be ready to point at the string in question.
For architectures that passed va_list by reference this was possible. For
architectures that passed it by copy it was not. A test_can_verify()
function was used to differentiate between the two, and if it wasn't
possible, it would disable it.
Even for architectures where this was feasible, it was a stretch to rely
on such a method that is undocumented, and could cause issues later on
with new optimizations of the compiler.
Instead, the first function test_event_printk() was updated to look at
"%s" as well. If the "%s" argument is a pointer outside the event in the
ring buffer, it would find the field type of the event that is the problem
and mark the structure with a new flag called "needs_test". The event
itself will be marked by TRACE_EVENT_FL_TEST_STR to let it be known that
this event has a field that needs to be verified before the event can be
printed using the printf format.
When the event fields are created from the field type structure, the
fields would copy the field type's "needs_test" value.
Finally, before being printed, a new function ignore_event() is called
which will check if the event has the TEST_STR flag set (if not, it
returns false). If the flag is set, it then iterates through the events
fields looking for the ones that have the "needs_test" flag set.
Then it uses the offset field from the field structure to find the pointer
in the ring buffer event. It runs the tests to make sure that pointer is
safe to print and if not, it triggers the WARN_ON() and also adds to the
trace output that the event in question has an unsafe memory access.
The ignore_event() makes the trace_check_vprintf() obsolete so it is
removed.
Link: https://lore.kernel.org/all/CAHk-=wh3uOnqnZPpR0PeLZZtyWbZLboZ7cHLCKRWsocvs9Y7hQ@mail.gmail.com/
Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/20241217024720.848621576@goodmis.org
Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2024-12-17 02:41:22 +00:00
|
|
|
field->needs_test = need_test;
|
2023-02-12 15:13:03 +00:00
|
|
|
field->len = len;
|
2009-08-07 02:33:02 +00:00
|
|
|
|
2010-04-22 14:35:55 +00:00
|
|
|
list_add(&field->link, head);
|
2009-03-22 08:30:39 +00:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
2010-05-24 08:22:49 +00:00
|
|
|
|
2015-05-05 15:45:27 +00:00
|
|
|
int trace_define_field(struct trace_event_call *call, const char *type,
|
2010-05-24 08:22:49 +00:00
|
|
|
const char *name, int offset, int size, int is_signed,
|
|
|
|
int filter_type)
|
|
|
|
{
|
|
|
|
struct list_head *head;
|
|
|
|
|
|
|
|
if (WARN_ON(!call->class))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
head = trace_get_fields(call);
|
|
|
|
return __trace_define_field(head, type, name, offset, size,
|
tracing: Check "%s" dereference via the field and not the TP_printk format
The TP_printk() portion of a trace event is executed at the time a event
is read from the trace. This can happen seconds, minutes, hours, days,
months, years possibly later since the event was recorded. If the print
format contains a dereference to a string via "%s", and that string was
allocated, there's a chance that string could be freed before it is read
by the trace file.
To protect against such bugs, there are two functions that verify the
event. The first one is test_event_printk(), which is called when the
event is created. It reads the TP_printk() format as well as its arguments
to make sure nothing may be dereferencing a pointer that was not copied
into the ring buffer along with the event. If it is, it will trigger a
WARN_ON().
For strings that use "%s", it is not so easy. The string may not reside in
the ring buffer but may still be valid. Strings that are static and part
of the kernel proper which will not be freed for the life of the running
system, are safe to dereference. But to know if it is a pointer to a
static string or to something on the heap can not be determined until the
event is triggered.
This brings us to the second function that tests for the bad dereferencing
of strings, trace_check_vprintf(). It would walk through the printf format
looking for "%s", and when it finds it, it would validate that the pointer
is safe to read. If not, it would produces a WARN_ON() as well and write
into the ring buffer "[UNSAFE-MEMORY]".
The problem with this is how it used va_list to have vsnprintf() handle
all the cases that it didn't need to check. Instead of re-implementing
vsnprintf(), it would make a copy of the format up to the %s part, and
call vsnprintf() with the current va_list ap variable, where the ap would
then be ready to point at the string in question.
For architectures that passed va_list by reference this was possible. For
architectures that passed it by copy it was not. A test_can_verify()
function was used to differentiate between the two, and if it wasn't
possible, it would disable it.
Even for architectures where this was feasible, it was a stretch to rely
on such a method that is undocumented, and could cause issues later on
with new optimizations of the compiler.
Instead, the first function test_event_printk() was updated to look at
"%s" as well. If the "%s" argument is a pointer outside the event in the
ring buffer, it would find the field type of the event that is the problem
and mark the structure with a new flag called "needs_test". The event
itself will be marked by TRACE_EVENT_FL_TEST_STR to let it be known that
this event has a field that needs to be verified before the event can be
printed using the printf format.
When the event fields are created from the field type structure, the
fields would copy the field type's "needs_test" value.
Finally, before being printed, a new function ignore_event() is called
which will check if the event has the TEST_STR flag set (if not, it
returns false). If the flag is set, it then iterates through the events
fields looking for the ones that have the "needs_test" flag set.
Then it uses the offset field from the field structure to find the pointer
in the ring buffer event. It runs the tests to make sure that pointer is
safe to print and if not, it triggers the WARN_ON() and also adds to the
trace output that the event in question has an unsafe memory access.
The ignore_event() makes the trace_check_vprintf() obsolete so it is
removed.
Link: https://lore.kernel.org/all/CAHk-=wh3uOnqnZPpR0PeLZZtyWbZLboZ7cHLCKRWsocvs9Y7hQ@mail.gmail.com/
Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/20241217024720.848621576@goodmis.org
Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2024-12-17 02:41:22 +00:00
|
|
|
is_signed, filter_type, 0, 0);
|
2010-05-24 08:22:49 +00:00
|
|
|
}
|
2009-04-10 22:12:50 +00:00
|
|
|
EXPORT_SYMBOL_GPL(trace_define_field);
|
2009-03-22 08:30:39 +00:00
|
|
|
|
2023-02-13 01:11:42 +00:00
|
|
|
static int trace_define_field_ext(struct trace_event_call *call, const char *type,
|
2023-02-12 15:13:03 +00:00
|
|
|
const char *name, int offset, int size, int is_signed,
|
tracing: Check "%s" dereference via the field and not the TP_printk format
The TP_printk() portion of a trace event is executed at the time a event
is read from the trace. This can happen seconds, minutes, hours, days,
months, years possibly later since the event was recorded. If the print
format contains a dereference to a string via "%s", and that string was
allocated, there's a chance that string could be freed before it is read
by the trace file.
To protect against such bugs, there are two functions that verify the
event. The first one is test_event_printk(), which is called when the
event is created. It reads the TP_printk() format as well as its arguments
to make sure nothing may be dereferencing a pointer that was not copied
into the ring buffer along with the event. If it is, it will trigger a
WARN_ON().
For strings that use "%s", it is not so easy. The string may not reside in
the ring buffer but may still be valid. Strings that are static and part
of the kernel proper which will not be freed for the life of the running
system, are safe to dereference. But to know if it is a pointer to a
static string or to something on the heap can not be determined until the
event is triggered.
This brings us to the second function that tests for the bad dereferencing
of strings, trace_check_vprintf(). It would walk through the printf format
looking for "%s", and when it finds it, it would validate that the pointer
is safe to read. If not, it would produces a WARN_ON() as well and write
into the ring buffer "[UNSAFE-MEMORY]".
The problem with this is how it used va_list to have vsnprintf() handle
all the cases that it didn't need to check. Instead of re-implementing
vsnprintf(), it would make a copy of the format up to the %s part, and
call vsnprintf() with the current va_list ap variable, where the ap would
then be ready to point at the string in question.
For architectures that passed va_list by reference this was possible. For
architectures that passed it by copy it was not. A test_can_verify()
function was used to differentiate between the two, and if it wasn't
possible, it would disable it.
Even for architectures where this was feasible, it was a stretch to rely
on such a method that is undocumented, and could cause issues later on
with new optimizations of the compiler.
Instead, the first function test_event_printk() was updated to look at
"%s" as well. If the "%s" argument is a pointer outside the event in the
ring buffer, it would find the field type of the event that is the problem
and mark the structure with a new flag called "needs_test". The event
itself will be marked by TRACE_EVENT_FL_TEST_STR to let it be known that
this event has a field that needs to be verified before the event can be
printed using the printf format.
When the event fields are created from the field type structure, the
fields would copy the field type's "needs_test" value.
Finally, before being printed, a new function ignore_event() is called
which will check if the event has the TEST_STR flag set (if not, it
returns false). If the flag is set, it then iterates through the event's
fields looking for the ones that have the "needs_test" flag set.
Then it uses the offset field from the field structure to find the pointer
in the ring buffer event. It runs the tests to make sure that pointer is
safe to print and if not, it triggers the WARN_ON() and also adds to the
trace output that the event in question has an unsafe memory access.
The ignore_event() makes the trace_check_vprintf() obsolete so it is
removed.
Link: https://lore.kernel.org/all/CAHk-=wh3uOnqnZPpR0PeLZZtyWbZLboZ7cHLCKRWsocvs9Y7hQ@mail.gmail.com/
Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/20241217024720.848621576@goodmis.org
Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2024-12-17 02:41:22 +00:00
|
|
|
int filter_type, int len, int need_test)
|
2023-02-12 15:13:03 +00:00
|
|
|
{
|
|
|
|
struct list_head *head;
|
|
|
|
|
|
|
|
if (WARN_ON(!call->class))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
head = trace_get_fields(call);
|
|
|
|
return __trace_define_field(head, type, name, offset, size,
|
tracing: Check "%s" dereference via the field and not the TP_printk format
The TP_printk() portion of a trace event is executed at the time an event
is read from the trace. This can happen seconds, minutes, hours, days,
months, years possibly later since the event was recorded. If the print
format contains a dereference to a string via "%s", and that string was
allocated, there's a chance that string could be freed before it is read
by the trace file.
To protect against such bugs, there are two functions that verify the
event. The first one is test_event_printk(), which is called when the
event is created. It reads the TP_printk() format as well as its arguments
to make sure nothing may be dereferencing a pointer that was not copied
into the ring buffer along with the event. If it is, it will trigger a
WARN_ON().
For strings that use "%s", it is not so easy. The string may not reside in
the ring buffer but may still be valid. Strings that are static and part
of the kernel proper which will not be freed for the life of the running
system, are safe to dereference. But to know if it is a pointer to a
static string or to something on the heap can not be determined until the
event is triggered.
This brings us to the second function that tests for the bad dereferencing
of strings, trace_check_vprintf(). It would walk through the printf format
looking for "%s", and when it finds it, it would validate that the pointer
is safe to read. If not, it would produce a WARN_ON() as well and write
into the ring buffer "[UNSAFE-MEMORY]".
The problem with this is how it used va_list to have vsnprintf() handle
all the cases that it didn't need to check. Instead of re-implementing
vsnprintf(), it would make a copy of the format up to the %s part, and
call vsnprintf() with the current va_list ap variable, where the ap would
then be ready to point at the string in question.
For architectures that passed va_list by reference this was possible. For
architectures that passed it by copy it was not. A test_can_verify()
function was used to differentiate between the two, and if it wasn't
possible, it would disable it.
Even for architectures where this was feasible, it was a stretch to rely
on such a method that is undocumented, and could cause issues later on
with new optimizations of the compiler.
Instead, the first function test_event_printk() was updated to look at
"%s" as well. If the "%s" argument is a pointer outside the event in the
ring buffer, it would find the field type of the event that is the problem
and mark the structure with a new flag called "needs_test". The event
itself will be marked by TRACE_EVENT_FL_TEST_STR to let it be known that
this event has a field that needs to be verified before the event can be
printed using the printf format.
When the event fields are created from the field type structure, the
fields would copy the field type's "needs_test" value.
Finally, before being printed, a new function ignore_event() is called
which will check if the event has the TEST_STR flag set (if not, it
returns false). If the flag is set, it then iterates through the event's
fields looking for the ones that have the "needs_test" flag set.
Then it uses the offset field from the field structure to find the pointer
in the ring buffer event. It runs the tests to make sure that pointer is
safe to print and if not, it triggers the WARN_ON() and also adds to the
trace output that the event in question has an unsafe memory access.
The ignore_event() makes the trace_check_vprintf() obsolete so it is
removed.
Link: https://lore.kernel.org/all/CAHk-=wh3uOnqnZPpR0PeLZZtyWbZLboZ7cHLCKRWsocvs9Y7hQ@mail.gmail.com/
Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/20241217024720.848621576@goodmis.org
Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2024-12-17 02:41:22 +00:00
|
|
|
is_signed, filter_type, len, need_test);
|
2023-02-12 15:13:03 +00:00
|
|
|
}
|
|
|
|
|
2015-08-10 12:35:46 +00:00
|
|
|
/*
 * Register one generic field on the ftrace_generic_fields list.
 *
 * @type:        C type of the field; stringified for the field's type name and
 *               passed to is_signed_type() to derive its signedness
 * @item:        field name (stringified)
 * @filter_type: filter type handed to __trace_define_field()
 *
 * The field is registered with offset, size, len and need_test all set to 0.
 *
 * NOTE: this macro relies on a local "int ret" in the calling function and,
 * on failure, executes "return ret" in that *calling* function. It is wrapped
 * in do { } while (0) so that it always behaves as a single statement.
 */
#define __generic_field(type, item, filter_type)			\
	do {								\
		ret = __trace_define_field(&ftrace_generic_fields,	\
					   #type, #item, 0, 0,		\
					   is_signed_type(type),	\
					   filter_type, 0, 0);		\
		if (ret)						\
			return ret;					\
	} while (0)
|
|
|
|
|
2009-08-19 07:54:32 +00:00
|
|
|
/*
 * Register one common field on the ftrace_common_fields list.
 *
 * @type: C type of the field; stringified for the field's type name and
 *        passed to is_signed_type() to derive its signedness
 * @item: member name inside the local variable "ent"; the registered field
 *        name is "common_" followed by @item
 *
 * Offset and size are taken from the member @item of "ent". The field is
 * registered with FILTER_OTHER and with len and need_test set to 0.
 *
 * NOTE: this macro relies on locals "int ret" and "ent" in the calling
 * function and, on failure, executes "return ret" in that *calling*
 * function. It is wrapped in do { } while (0) so that it always behaves as a
 * single statement.
 */
#define __common_field(type, item)					\
	do {								\
		ret = __trace_define_field(&ftrace_common_fields,	\
					   #type, "common_" #item,	\
					   offsetof(typeof(ent), item),	\
					   sizeof(ent.item),		\
					   is_signed_type(type),	\
					   FILTER_OTHER, 0, 0);		\
		if (ret)						\
			return ret;					\
	} while (0)
|
|
|
|
|
2015-08-10 12:35:46 +00:00
|
|
|
static int trace_define_generic_fields(void)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
2016-03-03 22:18:20 +00:00
|
|
|
__generic_field(int, CPU, FILTER_CPU);
|
|
|
|
__generic_field(int, cpu, FILTER_CPU);
|
2022-08-20 13:43:22 +00:00
|
|
|
__generic_field(int, common_cpu, FILTER_CPU);
|
2016-03-03 22:18:20 +00:00
|
|
|
__generic_field(char *, COMM, FILTER_COMM);
|
|
|
|
__generic_field(char *, comm, FILTER_COMM);
|
2023-05-24 03:09:13 +00:00
|
|
|
__generic_field(char *, stacktrace, FILTER_STACKTRACE);
|
|
|
|
__generic_field(char *, STACKTRACE, FILTER_STACKTRACE);
|
2015-08-10 12:35:46 +00:00
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2010-05-24 08:22:49 +00:00
|
|
|
static int trace_define_common_fields(void)
|
2009-08-19 07:54:32 +00:00
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
struct trace_entry ent;
|
|
|
|
|
|
|
|
__common_field(unsigned short, type);
|
|
|
|
__common_field(unsigned char, flags);
|
2021-08-10 13:26:25 +00:00
|
|
|
/* Holds both preempt_count and migrate_disable */
|
2009-08-19 07:54:32 +00:00
|
|
|
__common_field(unsigned char, preempt_count);
|
|
|
|
__common_field(int, pid);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2015-05-05 15:45:27 +00:00
|
|
|
static void trace_destroy_fields(struct trace_event_call *call)
|
2009-05-06 02:33:04 +00:00
|
|
|
{
|
|
|
|
struct ftrace_event_field *field, *next;
|
2010-04-22 14:35:55 +00:00
|
|
|
struct list_head *head;
|
2009-05-06 02:33:04 +00:00
|
|
|
|
2010-04-22 14:35:55 +00:00
|
|
|
head = trace_get_fields(call);
|
|
|
|
list_for_each_entry_safe(field, next, head, link) {
|
2009-05-06 02:33:04 +00:00
|
|
|
list_del(&field->link);
|
2013-02-28 01:23:57 +00:00
|
|
|
kmem_cache_free(field_cachep, field);
|
2009-05-06 02:33:04 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-04-07 01:43:28 +00:00
|
|
|
/*
|
|
|
|
* run-time version of trace_event_get_offsets_<call>() that returns the last
|
|
|
|
* accessible offset of trace fields excluding __dynamic_array bytes
|
|
|
|
*/
|
|
|
|
int trace_event_get_offsets(struct trace_event_call *call)
|
|
|
|
{
|
|
|
|
struct ftrace_event_field *tail;
|
|
|
|
struct list_head *head;
|
|
|
|
|
|
|
|
head = trace_get_fields(call);
|
|
|
|
/*
|
|
|
|
* head->next points to the last field with the largest offset,
|
|
|
|
* since it was added last by trace_define_field()
|
|
|
|
*/
|
|
|
|
tail = list_first_entry(head, struct ftrace_event_field, link);
|
|
|
|
return tail->offset + tail->size;
|
|
|
|
}
|
|
|
|
|
2024-12-17 02:41:21 +00:00
|
|
|
|
|
|
|
static struct trace_event_fields *find_event_field(const char *fmt,
|
|
|
|
struct trace_event_call *call)
|
2021-02-25 21:51:23 +00:00
|
|
|
{
|
|
|
|
struct trace_event_fields *field = call->class->fields_array;
|
|
|
|
const char *p = fmt;
|
|
|
|
int len;
|
|
|
|
|
|
|
|
if (!(len = str_has_prefix(fmt, "REC->")))
|
2024-12-17 02:41:21 +00:00
|
|
|
return NULL;
|
2021-02-25 21:51:23 +00:00
|
|
|
fmt += len;
|
|
|
|
for (p = fmt; *p; p++) {
|
|
|
|
if (!isalnum(*p) && *p != '_')
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
len = p - fmt;
|
|
|
|
|
|
|
|
for (; field->type; field++) {
|
tracing: Fix test_event_printk() to process entire print argument
The test_event_printk() analyzes print formats of trace events looking for
cases where it may dereference a pointer that is not in the ring buffer
which can possibly be a bug when the trace event is read from the ring
buffer and the content of that pointer no longer exists.
The function needs to accurately go from one print format argument to the
next. It handles quotes and parenthesis that may be included in an
argument. When it finds the start of the next argument, it uses a simple
"c = strstr(fmt + i, ',')" to find the end of that argument!
In order to include "%s" dereferencing, it needs to process the entire
content of the print format argument and not just the content of the first
',' it finds. As there may be content like:
({ const char *saved_ptr = trace_seq_buffer_ptr(p); static const char
*access_str[] = { "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux"
}; union kvm_mmu_page_role role; role.word = REC->role;
trace_seq_printf(p, "sp gen %u gfn %llx l%u %u-byte q%u%s %s%s" " %snxe
%sad root %u %s%c", REC->mmu_valid_gen, REC->gfn, role.level,
role.has_4_byte_gpte ? 4 : 8, role.quadrant, role.direct ? " direct" : "",
access_str[role.access], role.invalid ? " invalid" : "", role.efer_nx ? ""
: "!", role.ad_disabled ? "!" : "", REC->root_count, REC->unsync ?
"unsync" : "sync", 0); saved_ptr; })
Which is an example of a full argument of an existing event. As the code
already handles finding the next print format argument, process the
argument at the end of it and not the start of it. This way it has both
the start of the argument as well as the end of it.
Add a helper function "process_pointer()" that will do the processing during
the loop as well as at the end. It also makes the code cleaner and easier
to read.
Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/20241217024720.362271189@goodmis.org
Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2024-12-17 02:41:19 +00:00
|
|
|
if (strncmp(field->name, fmt, len) || field->name[len])
|
2021-02-25 21:51:23 +00:00
|
|
|
continue;
|
2024-12-17 02:41:21 +00:00
|
|
|
|
|
|
|
return field;
|
2021-02-25 21:51:23 +00:00
|
|
|
}
|
2024-12-17 02:41:21 +00:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check if the referenced field is an array and return true,
|
|
|
|
* as arrays are OK to dereference.
|
|
|
|
*/
|
|
|
|
static bool test_field(const char *fmt, struct trace_event_call *call)
|
|
|
|
{
|
|
|
|
struct trace_event_fields *field;
|
|
|
|
|
|
|
|
field = find_event_field(fmt, call);
|
|
|
|
if (!field)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
/* This is an array and is OK to dereference. */
|
|
|
|
return strchr(field->type, '[') != NULL;
|
2021-02-25 21:51:23 +00:00
|
|
|
}
|
|
|
|
|
2024-12-17 02:41:20 +00:00
|
|
|
/*
 * Return true if @str occurs in @arg and the first occurrence starts
 * before @end (the end of the argument being examined).
 */
static bool find_print_string(const char *arg, const char *str, const char *end)
{
	const char *hit = strstr(arg, str);

	if (!hit)
		return false;

	/* Only the first occurrence is considered. */
	return hit < end;
}
|
|
|
|
|
tracing: Fix test_event_printk() to process entire print argument
The test_event_printk() analyzes print formats of trace events looking for
cases where it may dereference a pointer that is not in the ring buffer
which can possibly be a bug when the trace event is read from the ring
buffer and the content of that pointer no longer exists.
The function needs to accurately go from one print format argument to the
next. It handles quotes and parenthesis that may be included in an
argument. When it finds the start of the next argument, it uses a simple
"c = strstr(fmt + i, ',')" to find the end of that argument!
In order to include "%s" dereferencing, it needs to process the entire
content of the print format argument and not just the content of the first
',' it finds. As there may be content like:
({ const char *saved_ptr = trace_seq_buffer_ptr(p); static const char
*access_str[] = { "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux"
}; union kvm_mmu_page_role role; role.word = REC->role;
trace_seq_printf(p, "sp gen %u gfn %llx l%u %u-byte q%u%s %s%s" " %snxe
%sad root %u %s%c", REC->mmu_valid_gen, REC->gfn, role.level,
role.has_4_byte_gpte ? 4 : 8, role.quadrant, role.direct ? " direct" : "",
access_str[role.access], role.invalid ? " invalid" : "", role.efer_nx ? ""
: "!", role.ad_disabled ? "!" : "", REC->root_count, REC->unsync ?
"unsync" : "sync", 0); saved_ptr; })
Which is an example of a full argument of an existing event. As the code
already handles finding the next print format argument, process the
argument at the end of it and not the start of it. This way it has both
the start of the argument as well as the end of it.
Add a helper function "process_pointer()" that will do the processing during
the loop as well as at the end. It also makes the code cleaner and easier
to read.
Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/20241217024720.362271189@goodmis.org
Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2024-12-17 02:41:19 +00:00
|
|
|
/*
 * Return true if the pointer argument is safe to dereference.
 *
 * @fmt:  start of the argument text
 * @len:  length of the argument; matches starting at or after @fmt + @len
 *        are ignored
 * @call: event the argument belongs to
 */
static bool process_pointer(const char *fmt, int len, struct trace_event_call *call)
{
	const char *end = fmt + len;
	const char *rec, *amp;

	/* Does the argument reference the event record? */
	rec = strstr(fmt, "REC->");
	if (rec && rec < end) {
		/*
		 * Taking the address of something in the record ('&' ahead of
		 * the REC->), or referencing an array that lives in the
		 * record, is OK to dereference. This can be fooled, but the
		 * aim is to catch common mistakes, not malicious code.
		 */
		amp = strchr(fmt, '&');
		if (amp && amp < rec)
			return true;

		return test_field(rec, call);
	}

	/* Otherwise only the dynamic array / sockaddr helpers are accepted. */
	return find_print_string(fmt, "__get_dynamic_array(", end) ||
	       find_print_string(fmt, "__get_rel_dynamic_array(", end) ||
	       find_print_string(fmt, "__get_dynamic_array_len(", end) ||
	       find_print_string(fmt, "__get_rel_dynamic_array_len(", end) ||
	       find_print_string(fmt, "__get_sockaddr(", end) ||
	       find_print_string(fmt, "__get_rel_sockaddr(", end);
}
|
|
|
|
|
2024-12-17 02:41:21 +00:00
|
|
|
/* Return true if the string is safe */
|
|
|
|
static bool process_string(const char *fmt, int len, struct trace_event_call *call)
|
|
|
|
{
|
tracing: Check "%s" dereference via the field and not the TP_printk format
The TP_printk() portion of a trace event is executed at the time an event
is read from the trace. This can happen seconds, minutes, hours, days,
months, years possibly later since the event was recorded. If the print
format contains a dereference to a string via "%s", and that string was
allocated, there's a chance that string could be freed before it is read
by the trace file.
To protect against such bugs, there are two functions that verify the
event. The first one is test_event_printk(), which is called when the
event is created. It reads the TP_printk() format as well as its arguments
to make sure nothing may be dereferencing a pointer that was not copied
into the ring buffer along with the event. If it is, it will trigger a
WARN_ON().
For strings that use "%s", it is not so easy. The string may not reside in
the ring buffer but may still be valid. Strings that are static and part
of the kernel proper which will not be freed for the life of the running
system, are safe to dereference. But to know if it is a pointer to a
static string or to something on the heap can not be determined until the
event is triggered.
This brings us to the second function that tests for the bad dereferencing
of strings, trace_check_vprintf(). It would walk through the printf format
looking for "%s", and when it finds it, it would validate that the pointer
is safe to read. If not, it would produces a WARN_ON() as well and write
into the ring buffer "[UNSAFE-MEMORY]".
The problem with this is how it used va_list to have vsnprintf() handle
all the cases that it didn't need to check. Instead of re-implementing
vsnprintf(), it would make a copy of the format up to the %s part, and
call vsnprintf() with the current va_list ap variable, where the ap would
then be ready to point at the string in question.
For architectures that passed va_list by reference this was possible. For
architectures that passed it by copy it was not. A test_can_verify()
function was used to differentiate between the two, and if it wasn't
possible, it would disable it.
Even for architectures where this was feasible, it was a stretch to rely
on such a method that is undocumented, and could cause issues later on
with new optimizations of the compiler.
Instead, the first function test_event_printk() was updated to look at
"%s" as well. If the "%s" argument is a pointer outside the event in the
ring buffer, it would find the field type of the event that is the problem
and mark the structure with a new flag called "needs_test". The event
itself will be marked by TRACE_EVENT_FL_TEST_STR to let it be known that
this event has a field that needs to be verified before the event can be
printed using the printf format.
When the event fields are created from the field type structure, the
fields would copy the field type's "needs_test" value.
Finally, before being printed, a new function ignore_event() is called
which will check if the event has the TEST_STR flag set (if not, it
returns false). If the flag is set, it then iterates through the events
fields looking for the ones that have the "needs_test" flag set.
Then it uses the offset field from the field structure to find the pointer
in the ring buffer event. It runs the tests to make sure that pointer is
safe to print and if not, it triggers the WARN_ON() and also adds to the
trace output that the event in question has an unsafe memory access.
The ignore_event() makes the trace_check_vprintf() obsolete so it is
removed.
Link: https://lore.kernel.org/all/CAHk-=wh3uOnqnZPpR0PeLZZtyWbZLboZ7cHLCKRWsocvs9Y7hQ@mail.gmail.com/
Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/20241217024720.848621576@goodmis.org
Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2024-12-17 02:41:22 +00:00
|
|
|
struct trace_event_fields *field;
|
2024-12-17 02:41:21 +00:00
|
|
|
const char *r, *e, *s;
|
|
|
|
|
|
|
|
e = fmt + len;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* There are several helper functions that return strings.
|
|
|
|
* If the argument contains a function, then assume its field is valid.
|
|
|
|
* It is considered that the argument has a function if it has:
|
|
|
|
* alphanumeric or '_' before a parenthesis.
|
|
|
|
*/
|
|
|
|
s = fmt;
|
|
|
|
do {
|
|
|
|
r = strstr(s, "(");
|
|
|
|
if (!r || r >= e)
|
|
|
|
break;
|
|
|
|
for (int i = 1; r - i >= s; i++) {
|
|
|
|
char ch = *(r - i);
|
|
|
|
if (isspace(ch))
|
|
|
|
continue;
|
|
|
|
if (isalnum(ch) || ch == '_')
|
|
|
|
return true;
|
|
|
|
/* Anything else, this isn't a function */
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
/* A function could be wrapped in parethesis, try the next one */
|
|
|
|
s = r + 1;
|
|
|
|
} while (s < e);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If there's any strings in the argument consider this arg OK as it
|
|
|
|
* could be: REC->field ? "foo" : "bar" and we don't want to get into
|
|
|
|
* verifying that logic here.
|
|
|
|
*/
|
|
|
|
if (find_print_string(fmt, "\"", e))
|
|
|
|
return true;
|
|
|
|
|
|
|
|
/* Dereferenced strings are also valid like any other pointer */
|
|
|
|
if (process_pointer(fmt, len, call))
|
|
|
|
return true;
|
|
|
|
|
tracing: Check "%s" dereference via the field and not the TP_printk format
The TP_printk() portion of a trace event is executed at the time a event
is read from the trace. This can happen seconds, minutes, hours, days,
months, years possibly later since the event was recorded. If the print
format contains a dereference to a string via "%s", and that string was
allocated, there's a chance that string could be freed before it is read
by the trace file.
To protect against such bugs, there are two functions that verify the
event. The first one is test_event_printk(), which is called when the
event is created. It reads the TP_printk() format as well as its arguments
to make sure nothing may be dereferencing a pointer that was not copied
into the ring buffer along with the event. If it is, it will trigger a
WARN_ON().
For strings that use "%s", it is not so easy. The string may not reside in
the ring buffer but may still be valid. Strings that are static and part
of the kernel proper which will not be freed for the life of the running
system, are safe to dereference. But to know if it is a pointer to a
static string or to something on the heap can not be determined until the
event is triggered.
This brings us to the second function that tests for the bad dereferencing
of strings, trace_check_vprintf(). It would walk through the printf format
looking for "%s", and when it finds it, it would validate that the pointer
is safe to read. If not, it would produces a WARN_ON() as well and write
into the ring buffer "[UNSAFE-MEMORY]".
The problem with this is how it used va_list to have vsnprintf() handle
all the cases that it didn't need to check. Instead of re-implementing
vsnprintf(), it would make a copy of the format up to the %s part, and
call vsnprintf() with the current va_list ap variable, where the ap would
then be ready to point at the string in question.
For architectures that passed va_list by reference this was possible. For
architectures that passed it by copy it was not. A test_can_verify()
function was used to differentiate between the two, and if it wasn't
possible, it would disable it.
Even for architectures where this was feasible, it was a stretch to rely
on such a method that is undocumented, and could cause issues later on
with new optimizations of the compiler.
Instead, the first function test_event_printk() was updated to look at
"%s" as well. If the "%s" argument is a pointer outside the event in the
ring buffer, it would find the field type of the event that is the problem
and mark the structure with a new flag called "needs_test". The event
itself will be marked by TRACE_EVENT_FL_TEST_STR to let it be known that
this event has a field that needs to be verified before the event can be
printed using the printf format.
When the event fields are created from the field type structure, the
fields would copy the field type's "needs_test" value.
Finally, before being printed, a new function ignore_event() is called
which will check if the event has the TEST_STR flag set (if not, it
returns false). If the flag is set, it then iterates through the events
fields looking for the ones that have the "needs_test" flag set.
Then it uses the offset field from the field structure to find the pointer
in the ring buffer event. It runs the tests to make sure that pointer is
safe to print and if not, it triggers the WARN_ON() and also adds to the
trace output that the event in question has an unsafe memory access.
The ignore_event() makes the trace_check_vprintf() obsolete so it is
removed.
Link: https://lore.kernel.org/all/CAHk-=wh3uOnqnZPpR0PeLZZtyWbZLboZ7cHLCKRWsocvs9Y7hQ@mail.gmail.com/
Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/20241217024720.848621576@goodmis.org
Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2024-12-17 02:41:22 +00:00
|
|
|
/* Make sure the field is found */
|
|
|
|
field = find_event_field(fmt, call);
|
|
|
|
if (!field)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
/* Test this field's string before printing the event */
|
|
|
|
call->flags |= TRACE_EVENT_FL_TEST_STR;
|
|
|
|
field->needs_test = 1;
|
|
|
|
|
|
|
|
return true;
|
2024-12-17 02:41:21 +00:00
|
|
|
}
|
|
|
|
|
2021-02-25 21:51:23 +00:00
|
|
|
/*
|
|
|
|
* Examine the print fmt of the event looking for unsafe dereference
|
|
|
|
* pointers using %p* that could be recorded in the trace event and
|
|
|
|
* much later referenced after the pointer was freed. Dereferencing
|
|
|
|
* pointers are OK, if it is dereferenced into the event itself.
|
|
|
|
*/
|
|
|
|
static void test_event_printk(struct trace_event_call *call)
|
|
|
|
{
|
|
|
|
u64 dereference_flags = 0;
|
2024-12-17 02:41:21 +00:00
|
|
|
u64 string_flags = 0;
|
2021-02-25 21:51:23 +00:00
|
|
|
bool first = true;
|
tracing: Fix test_event_printk() to process entire print argument
The test_event_printk() analyzes print formats of trace events looking for
cases where it may dereference a pointer that is not in the ring buffer
which can possibly be a bug when the trace event is read from the ring
buffer and the content of that pointer no longer exists.
The function needs to accurately go from one print format argument to the
next. It handles quotes and parenthesis that may be included in an
argument. When it finds the start of the next argument, it uses a simple
"c = strstr(fmt + i, ',')" to find the end of that argument!
In order to include "%s" dereferencing, it needs to process the entire
content of the print format argument and not just the content of the first
',' it finds. As there may be content like:
({ const char *saved_ptr = trace_seq_buffer_ptr(p); static const char
*access_str[] = { "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux"
}; union kvm_mmu_page_role role; role.word = REC->role;
trace_seq_printf(p, "sp gen %u gfn %llx l%u %u-byte q%u%s %s%s" " %snxe
%sad root %u %s%c", REC->mmu_valid_gen, REC->gfn, role.level,
role.has_4_byte_gpte ? 4 : 8, role.quadrant, role.direct ? " direct" : "",
access_str[role.access], role.invalid ? " invalid" : "", role.efer_nx ? ""
: "!", role.ad_disabled ? "!" : "", REC->root_count, REC->unsync ?
"unsync" : "sync", 0); saved_ptr; })
Which is an example of a full argument of an existing event. As the code
already handles finding the next print format argument, process the
argument at the end of it and not the start of it. This way it has both
the start of the argument as well as the end of it.
Add a helper function "process_pointer()" that will do the processing during
the loop as well as at the end. It also makes the code cleaner and easier
to read.
Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/20241217024720.362271189@goodmis.org
Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2024-12-17 02:41:19 +00:00
|
|
|
const char *fmt;
|
2021-02-25 21:51:23 +00:00
|
|
|
int parens = 0;
|
|
|
|
char in_quote = 0;
|
|
|
|
int start_arg = 0;
|
|
|
|
int arg = 0;
|
tracing: Fix test_event_printk() to process entire print argument
The test_event_printk() analyzes print formats of trace events looking for
cases where it may dereference a pointer that is not in the ring buffer
which can possibly be a bug when the trace event is read from the ring
buffer and the content of that pointer no longer exists.
The function needs to accurately go from one print format argument to the
next. It handles quotes and parenthesis that may be included in an
argument. When it finds the start of the next argument, it uses a simple
"c = strstr(fmt + i, ',')" to find the end of that argument!
In order to include "%s" dereferencing, it needs to process the entire
content of the print format argument and not just the content of the first
',' it finds. As there may be content like:
({ const char *saved_ptr = trace_seq_buffer_ptr(p); static const char
*access_str[] = { "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux"
}; union kvm_mmu_page_role role; role.word = REC->role;
trace_seq_printf(p, "sp gen %u gfn %llx l%u %u-byte q%u%s %s%s" " %snxe
%sad root %u %s%c", REC->mmu_valid_gen, REC->gfn, role.level,
role.has_4_byte_gpte ? 4 : 8, role.quadrant, role.direct ? " direct" : "",
access_str[role.access], role.invalid ? " invalid" : "", role.efer_nx ? ""
: "!", role.ad_disabled ? "!" : "", REC->root_count, REC->unsync ?
"unsync" : "sync", 0); saved_ptr; })
Which is an example of a full argument of an existing event. As the code
already handles finding the next print format argument, process the
argument at the end of it and not the start of it. This way it has both
the start of the argument as well as the end of it.
Add a helper function "process_pointer()" that will do the processing during
the loop as well as at the end. It also makes the code cleaner and easier
to read.
Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/20241217024720.362271189@goodmis.org
Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2024-12-17 02:41:19 +00:00
|
|
|
int i, e;
|
2021-02-25 21:51:23 +00:00
|
|
|
|
|
|
|
fmt = call->print_fmt;
|
|
|
|
|
|
|
|
if (!fmt)
|
|
|
|
return;
|
|
|
|
|
|
|
|
for (i = 0; fmt[i]; i++) {
|
|
|
|
switch (fmt[i]) {
|
|
|
|
case '\\':
|
|
|
|
i++;
|
|
|
|
if (!fmt[i])
|
|
|
|
return;
|
|
|
|
continue;
|
|
|
|
case '"':
|
|
|
|
case '\'':
|
|
|
|
/*
|
|
|
|
* The print fmt starts with a string that
|
|
|
|
* is processed first to find %p* usage,
|
|
|
|
* then after the first string, the print fmt
|
|
|
|
* contains arguments that are used to check
|
|
|
|
* if the dereferenced %p* usage is safe.
|
|
|
|
*/
|
|
|
|
if (first) {
|
|
|
|
if (fmt[i] == '\'')
|
|
|
|
continue;
|
|
|
|
if (in_quote) {
|
|
|
|
arg = 0;
|
|
|
|
first = false;
|
|
|
|
/*
|
|
|
|
* If there was no %p* uses
|
|
|
|
* the fmt is OK.
|
|
|
|
*/
|
|
|
|
if (!dereference_flags)
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (in_quote) {
|
|
|
|
if (in_quote == fmt[i])
|
|
|
|
in_quote = 0;
|
|
|
|
} else {
|
|
|
|
in_quote = fmt[i];
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
case '%':
|
|
|
|
if (!first || !in_quote)
|
|
|
|
continue;
|
|
|
|
i++;
|
|
|
|
if (!fmt[i])
|
|
|
|
return;
|
|
|
|
switch (fmt[i]) {
|
|
|
|
case '%':
|
|
|
|
continue;
|
|
|
|
case 'p':
|
|
|
|
/* Find dereferencing fields */
|
|
|
|
switch (fmt[i + 1]) {
|
|
|
|
case 'B': case 'R': case 'r':
|
|
|
|
case 'b': case 'M': case 'm':
|
|
|
|
case 'I': case 'i': case 'E':
|
|
|
|
case 'U': case 'V': case 'N':
|
|
|
|
case 'a': case 'd': case 'D':
|
|
|
|
case 'g': case 't': case 'C':
|
|
|
|
case 'O': case 'f':
|
|
|
|
if (WARN_ONCE(arg == 63,
|
|
|
|
"Too many args for event: %s",
|
|
|
|
trace_event_name(call)))
|
|
|
|
return;
|
|
|
|
dereference_flags |= 1ULL << arg;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
{
|
|
|
|
bool star = false;
|
|
|
|
int j;
|
|
|
|
|
|
|
|
/* Increment arg if %*s exists. */
|
|
|
|
for (j = 0; fmt[i + j]; j++) {
|
|
|
|
if (isdigit(fmt[i + j]) ||
|
|
|
|
fmt[i + j] == '.')
|
|
|
|
continue;
|
|
|
|
if (fmt[i + j] == '*') {
|
|
|
|
star = true;
|
|
|
|
continue;
|
|
|
|
}
|
2024-12-17 02:41:21 +00:00
|
|
|
if ((fmt[i + j] == 's')) {
|
|
|
|
if (star)
|
|
|
|
arg++;
|
|
|
|
if (WARN_ONCE(arg == 63,
|
|
|
|
"Too many args for event: %s",
|
|
|
|
trace_event_name(call)))
|
|
|
|
return;
|
|
|
|
dereference_flags |= 1ULL << arg;
|
|
|
|
string_flags |= 1ULL << arg;
|
|
|
|
}
|
2021-02-25 21:51:23 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
} /* default */
|
|
|
|
|
|
|
|
} /* switch */
|
|
|
|
arg++;
|
|
|
|
continue;
|
|
|
|
case '(':
|
|
|
|
if (in_quote)
|
|
|
|
continue;
|
|
|
|
parens++;
|
|
|
|
continue;
|
|
|
|
case ')':
|
|
|
|
if (in_quote)
|
|
|
|
continue;
|
|
|
|
parens--;
|
|
|
|
if (WARN_ONCE(parens < 0,
|
|
|
|
"Paren mismatch for event: %s\narg='%s'\n%*s",
|
|
|
|
trace_event_name(call),
|
|
|
|
fmt + start_arg,
|
|
|
|
(i - start_arg) + 5, "^"))
|
|
|
|
return;
|
|
|
|
continue;
|
|
|
|
case ',':
|
|
|
|
if (in_quote || parens)
|
|
|
|
continue;
|
tracing: Fix test_event_printk() to process entire print argument
The test_event_printk() analyzes print formats of trace events looking for
cases where it may dereference a pointer that is not in the ring buffer
which can possibly be a bug when the trace event is read from the ring
buffer and the content of that pointer no longer exists.
The function needs to accurately go from one print format argument to the
next. It handles quotes and parenthesis that may be included in an
argument. When it finds the start of the next argument, it uses a simple
"c = strstr(fmt + i, ',')" to find the end of that argument!
In order to include "%s" dereferencing, it needs to process the entire
content of the print format argument and not just the content of the first
',' it finds. As there may be content like:
({ const char *saved_ptr = trace_seq_buffer_ptr(p); static const char
*access_str[] = { "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux"
}; union kvm_mmu_page_role role; role.word = REC->role;
trace_seq_printf(p, "sp gen %u gfn %llx l%u %u-byte q%u%s %s%s" " %snxe
%sad root %u %s%c", REC->mmu_valid_gen, REC->gfn, role.level,
role.has_4_byte_gpte ? 4 : 8, role.quadrant, role.direct ? " direct" : "",
access_str[role.access], role.invalid ? " invalid" : "", role.efer_nx ? ""
: "!", role.ad_disabled ? "!" : "", REC->root_count, REC->unsync ?
"unsync" : "sync", 0); saved_ptr; })
Which is an example of a full argument of an existing event. As the code
already handles finding the next print format argument, process the
argument at the end of it and not the start of it. This way it has both
the start of the argument as well as the end of it.
Add a helper function "process_pointer()" that will do the processing during
the loop as well as at the end. It also makes the code cleaner and easier
to read.
Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/20241217024720.362271189@goodmis.org
Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2024-12-17 02:41:19 +00:00
|
|
|
e = i;
|
2021-02-25 21:51:23 +00:00
|
|
|
i++;
|
|
|
|
while (isspace(fmt[i]))
|
|
|
|
i++;
|
tracing: Fix test_event_printk() to process entire print argument
The test_event_printk() analyzes print formats of trace events looking for
cases where it may dereference a pointer that is not in the ring buffer
which can possibly be a bug when the trace event is read from the ring
buffer and the content of that pointer no longer exists.
The function needs to accurately go from one print format argument to the
next. It handles quotes and parenthesis that may be included in an
argument. When it finds the start of the next argument, it uses a simple
"c = strstr(fmt + i, ',')" to find the end of that argument!
In order to include "%s" dereferencing, it needs to process the entire
content of the print format argument and not just the content of the first
',' it finds. As there may be content like:
({ const char *saved_ptr = trace_seq_buffer_ptr(p); static const char
*access_str[] = { "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux"
}; union kvm_mmu_page_role role; role.word = REC->role;
trace_seq_printf(p, "sp gen %u gfn %llx l%u %u-byte q%u%s %s%s" " %snxe
%sad root %u %s%c", REC->mmu_valid_gen, REC->gfn, role.level,
role.has_4_byte_gpte ? 4 : 8, role.quadrant, role.direct ? " direct" : "",
access_str[role.access], role.invalid ? " invalid" : "", role.efer_nx ? ""
: "!", role.ad_disabled ? "!" : "", REC->root_count, REC->unsync ?
"unsync" : "sync", 0); saved_ptr; })
Which is an example of a full argument of an existing event. As the code
already handles finding the next print format argument, process the
argument at the end of it and not the start of it. This way it has both
the start of the argument as well as the end of it.
Add a helper function "process_pointer()" that will do the processing during
the loop as well as at the end. It also makes the code cleaner and easier
to read.
Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/20241217024720.362271189@goodmis.org
Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2024-12-17 02:41:19 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If start_arg is zero, then this is the start of the
|
|
|
|
* first argument. The processing of the argument happens
|
|
|
|
* when the end of the argument is found, as it needs to
|
|
|
|
* handle paranthesis and such.
|
|
|
|
*/
|
|
|
|
if (!start_arg) {
|
|
|
|
start_arg = i;
|
|
|
|
/* Balance out the i++ in the for loop */
|
|
|
|
i--;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (dereference_flags & (1ULL << arg)) {
|
2024-12-17 02:41:21 +00:00
|
|
|
if (string_flags & (1ULL << arg)) {
|
|
|
|
if (process_string(fmt + start_arg, e - start_arg, call))
|
|
|
|
dereference_flags &= ~(1ULL << arg);
|
|
|
|
} else if (process_pointer(fmt + start_arg, e - start_arg, call))
|
2021-02-25 21:51:23 +00:00
|
|
|
dereference_flags &= ~(1ULL << arg);
|
|
|
|
}
|
2022-04-07 18:56:32 +00:00
|
|
|
|
tracing: Fix test_event_printk() to process entire print argument
The test_event_printk() analyzes print formats of trace events looking for
cases where it may dereference a pointer that is not in the ring buffer
which can possibly be a bug when the trace event is read from the ring
buffer and the content of that pointer no longer exists.
The function needs to accurately go from one print format argument to the
next. It handles quotes and parenthesis that may be included in an
argument. When it finds the start of the next argument, it uses a simple
"c = strstr(fmt + i, ',')" to find the end of that argument!
In order to include "%s" dereferencing, it needs to process the entire
content of the print format argument and not just the content of the first
',' it finds. As there may be content like:
({ const char *saved_ptr = trace_seq_buffer_ptr(p); static const char
*access_str[] = { "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux"
}; union kvm_mmu_page_role role; role.word = REC->role;
trace_seq_printf(p, "sp gen %u gfn %llx l%u %u-byte q%u%s %s%s" " %snxe
%sad root %u %s%c", REC->mmu_valid_gen, REC->gfn, role.level,
role.has_4_byte_gpte ? 4 : 8, role.quadrant, role.direct ? " direct" : "",
access_str[role.access], role.invalid ? " invalid" : "", role.efer_nx ? ""
: "!", role.ad_disabled ? "!" : "", REC->root_count, REC->unsync ?
"unsync" : "sync", 0); saved_ptr; })
Which is an example of a full argument of an existing event. As the code
already handles finding the next print format argument, process the
argument at the end of it and not the start of it. This way it has both
the start of the argument as well as the end of it.
Add a helper function "process_pointer()" that will do the processing during
the loop as well as at the end. It also makes the code cleaner and easier
to read.
Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/20241217024720.362271189@goodmis.org
Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2024-12-17 02:41:19 +00:00
|
|
|
start_arg = i;
|
2021-02-25 21:51:23 +00:00
|
|
|
arg++;
|
tracing: Fix test_event_printk() to process entire print argument
The test_event_printk() analyzes print formats of trace events looking for
cases where it may dereference a pointer that is not in the ring buffer
which can possibly be a bug when the trace event is read from the ring
buffer and the content of that pointer no longer exists.
The function needs to accurately go from one print format argument to the
next. It handles quotes and parenthesis that may be included in an
argument. When it finds the start of the next argument, it uses a simple
"c = strstr(fmt + i, ',')" to find the end of that argument!
In order to include "%s" dereferencing, it needs to process the entire
content of the print format argument and not just the content of the first
',' it finds. As there may be content like:
({ const char *saved_ptr = trace_seq_buffer_ptr(p); static const char
*access_str[] = { "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux"
}; union kvm_mmu_page_role role; role.word = REC->role;
trace_seq_printf(p, "sp gen %u gfn %llx l%u %u-byte q%u%s %s%s" " %snxe
%sad root %u %s%c", REC->mmu_valid_gen, REC->gfn, role.level,
role.has_4_byte_gpte ? 4 : 8, role.quadrant, role.direct ? " direct" : "",
access_str[role.access], role.invalid ? " invalid" : "", role.efer_nx ? ""
: "!", role.ad_disabled ? "!" : "", REC->root_count, REC->unsync ?
"unsync" : "sync", 0); saved_ptr; })
Which is an example of a full argument of an existing event. As the code
already handles finding the next print format argument, process the
argument at the end of it and not the start of it. This way it has both
the start of the argument as well as the end of it.
Add a helper function "process_pointer()" that will do the processing during
the loop as well as at the end. It also makes the code cleaner and easier
to read.
Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/20241217024720.362271189@goodmis.org
Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2024-12-17 02:41:19 +00:00
|
|
|
/* Balance out the i++ in the for loop */
|
|
|
|
i--;
|
2021-02-25 21:51:23 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
tracing: Fix test_event_printk() to process entire print argument
The test_event_printk() analyzes print formats of trace events looking for
cases where it may dereference a pointer that is not in the ring buffer
which can possibly be a bug when the trace event is read from the ring
buffer and the content of that pointer no longer exists.
The function needs to accurately go from one print format argument to the
next. It handles quotes and parenthesis that may be included in an
argument. When it finds the start of the next argument, it uses a simple
"c = strstr(fmt + i, ',')" to find the end of that argument!
In order to include "%s" dereferencing, it needs to process the entire
content of the print format argument and not just the content of the first
',' it finds. As there may be content like:
({ const char *saved_ptr = trace_seq_buffer_ptr(p); static const char
*access_str[] = { "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux"
}; union kvm_mmu_page_role role; role.word = REC->role;
trace_seq_printf(p, "sp gen %u gfn %llx l%u %u-byte q%u%s %s%s" " %snxe
%sad root %u %s%c", REC->mmu_valid_gen, REC->gfn, role.level,
role.has_4_byte_gpte ? 4 : 8, role.quadrant, role.direct ? " direct" : "",
access_str[role.access], role.invalid ? " invalid" : "", role.efer_nx ? ""
: "!", role.ad_disabled ? "!" : "", REC->root_count, REC->unsync ?
"unsync" : "sync", 0); saved_ptr; })
Which is an example of a full argument of an existing event. As the code
already handles finding the next print format argument, process the
argument at the end of it and not the start of it. This way it has both
the start of the argument as well as the end of it.
Add a helper function "process_pointer()" that will do the processing during
the loop as well as at the end. It also makes the code cleaner and easier
to read.
Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/20241217024720.362271189@goodmis.org
Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2024-12-17 02:41:19 +00:00
|
|
|
if (dereference_flags & (1ULL << arg)) {
|
2024-12-17 02:41:21 +00:00
|
|
|
if (string_flags & (1ULL << arg)) {
|
|
|
|
if (process_string(fmt + start_arg, i - start_arg, call))
|
|
|
|
dereference_flags &= ~(1ULL << arg);
|
|
|
|
} else if (process_pointer(fmt + start_arg, i - start_arg, call))
|
tracing: Fix test_event_printk() to process entire print argument
The test_event_printk() analyzes print formats of trace events looking for
cases where it may dereference a pointer that is not in the ring buffer
which can possibly be a bug when the trace event is read from the ring
buffer and the content of that pointer no longer exists.
The function needs to accurately go from one print format argument to the
next. It handles quotes and parenthesis that may be included in an
argument. When it finds the start of the next argument, it uses a simple
"c = strstr(fmt + i, ',')" to find the end of that argument!
In order to include "%s" dereferencing, it needs to process the entire
content of the print format argument and not just the content of the first
',' it finds. As there may be content like:
({ const char *saved_ptr = trace_seq_buffer_ptr(p); static const char
*access_str[] = { "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux"
}; union kvm_mmu_page_role role; role.word = REC->role;
trace_seq_printf(p, "sp gen %u gfn %llx l%u %u-byte q%u%s %s%s" " %snxe
%sad root %u %s%c", REC->mmu_valid_gen, REC->gfn, role.level,
role.has_4_byte_gpte ? 4 : 8, role.quadrant, role.direct ? " direct" : "",
access_str[role.access], role.invalid ? " invalid" : "", role.efer_nx ? ""
: "!", role.ad_disabled ? "!" : "", REC->root_count, REC->unsync ?
"unsync" : "sync", 0); saved_ptr; })
Which is an example of a full argument of an existing event. As the code
already handles finding the next print format argument, process the
argument at the end of it and not the start of it. This way it has both
the start of the argument as well as the end of it.
Add a helper function "process_pointer()" that will do the processing during
the loop as well as at the end. It also makes the code cleaner and easier
to read.
Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/20241217024720.362271189@goodmis.org
Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2024-12-17 02:41:19 +00:00
|
|
|
dereference_flags &= ~(1ULL << arg);
|
|
|
|
}
|
|
|
|
|
2021-02-25 21:51:23 +00:00
|
|
|
/*
|
|
|
|
* If you triggered the below warning, the trace event reported
|
|
|
|
* uses an unsafe dereference pointer %p*. As the data stored
|
|
|
|
* at the trace event time may no longer exist when the trace
|
|
|
|
* event is printed, dereferencing to the original source is
|
|
|
|
* unsafe. The source of the dereference must be copied into the
|
|
|
|
* event itself, and the dereference must access the copy instead.
|
|
|
|
*/
|
|
|
|
if (WARN_ON_ONCE(dereference_flags)) {
|
|
|
|
arg = 1;
|
|
|
|
while (!(dereference_flags & 1)) {
|
|
|
|
dereference_flags >>= 1;
|
|
|
|
arg++;
|
|
|
|
}
|
|
|
|
pr_warn("event %s has unsafe dereference of argument %d\n",
|
|
|
|
trace_event_name(call), arg);
|
|
|
|
pr_warn("print_fmt: %s\n", fmt);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-05-05 15:45:27 +00:00
|
|
|
int trace_event_raw_init(struct trace_event_call *call)
|
2009-12-08 03:14:20 +00:00
|
|
|
{
|
|
|
|
int id;
|
|
|
|
|
2015-05-05 13:39:12 +00:00
|
|
|
id = register_trace_event(&call->event);
|
2009-12-08 03:14:20 +00:00
|
|
|
if (!id)
|
|
|
|
return -ENODEV;
|
|
|
|
|
2021-02-25 21:51:23 +00:00
|
|
|
test_event_printk(call);
|
|
|
|
|
2009-12-08 03:14:20 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(trace_event_raw_init);
|
|
|
|
|
2015-09-25 16:58:44 +00:00
|
|
|
bool trace_event_ignore_this_pid(struct trace_event_file *trace_file)
|
|
|
|
{
|
|
|
|
struct trace_array *tr = trace_file->tr;
|
|
|
|
struct trace_array_cpu *data;
|
2020-03-25 23:51:19 +00:00
|
|
|
struct trace_pid_list *no_pid_list;
|
2015-09-25 16:58:44 +00:00
|
|
|
struct trace_pid_list *pid_list;
|
|
|
|
|
2018-08-03 02:34:07 +00:00
|
|
|
pid_list = rcu_dereference_raw(tr->filtered_pids);
|
2020-03-25 23:51:19 +00:00
|
|
|
no_pid_list = rcu_dereference_raw(tr->filtered_no_pids);
|
|
|
|
|
|
|
|
if (!pid_list && !no_pid_list)
|
2015-09-25 16:58:44 +00:00
|
|
|
return false;
|
|
|
|
|
2020-01-09 23:53:48 +00:00
|
|
|
data = this_cpu_ptr(tr->array_buffer.data);
|
2015-09-25 16:58:44 +00:00
|
|
|
|
|
|
|
return data->ignore_pid;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(trace_event_ignore_this_pid);
|
|
|
|
|
2015-05-05 17:18:46 +00:00
|
|
|
void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
|
|
|
|
struct trace_event_file *trace_file,
|
|
|
|
unsigned long len)
|
2012-08-10 02:42:57 +00:00
|
|
|
{
|
2015-05-05 15:45:27 +00:00
|
|
|
struct trace_event_call *event_call = trace_file->event_call;
|
2012-08-10 02:42:57 +00:00
|
|
|
|
2015-09-25 16:58:44 +00:00
|
|
|
if ((trace_file->flags & EVENT_FILE_FL_PID_FILTER) &&
|
|
|
|
trace_event_ignore_this_pid(trace_file))
|
|
|
|
return NULL;
|
|
|
|
|
2016-06-17 21:40:58 +00:00
|
|
|
/*
|
2019-07-26 21:19:40 +00:00
|
|
|
* If CONFIG_PREEMPTION is enabled, then the tracepoint itself disables
|
2016-06-17 21:40:58 +00:00
|
|
|
* preemption (adding one to the preempt_count). Since we are
|
|
|
|
* interested in the preempt_count at the time the tracepoint was
|
|
|
|
* hit, we need to subtract one to offset the increment.
|
|
|
|
*/
|
2021-01-25 19:45:08 +00:00
|
|
|
fbuffer->trace_ctx = tracing_gen_ctx_dec();
|
2015-05-05 14:09:53 +00:00
|
|
|
fbuffer->trace_file = trace_file;
|
2012-08-10 02:42:57 +00:00
|
|
|
|
|
|
|
fbuffer->event =
|
2015-05-05 14:09:53 +00:00
|
|
|
trace_event_buffer_lock_reserve(&fbuffer->buffer, trace_file,
|
2012-08-10 02:42:57 +00:00
|
|
|
event_call->event.type, len,
|
2021-01-25 19:45:08 +00:00
|
|
|
fbuffer->trace_ctx);
|
2012-08-10 02:42:57 +00:00
|
|
|
if (!fbuffer->event)
|
|
|
|
return NULL;
|
|
|
|
|
2020-01-10 16:05:31 +00:00
|
|
|
fbuffer->regs = NULL;
|
2012-08-10 02:42:57 +00:00
|
|
|
fbuffer->entry = ring_buffer_event_data(fbuffer->event);
|
|
|
|
return fbuffer->entry;
|
|
|
|
}
|
2015-05-05 17:18:46 +00:00
|
|
|
EXPORT_SYMBOL_GPL(trace_event_buffer_reserve);
|
2012-08-10 02:42:57 +00:00
|
|
|
|
2015-05-05 15:45:27 +00:00
|
|
|
int trace_event_reg(struct trace_event_call *call,
|
2015-05-05 13:39:12 +00:00
|
|
|
enum trace_reg type, void *data)
|
2010-06-08 15:22:06 +00:00
|
|
|
{
|
2015-05-05 14:09:53 +00:00
|
|
|
struct trace_event_file *file = data;
|
2012-05-04 03:09:03 +00:00
|
|
|
|
2014-04-08 21:26:21 +00:00
|
|
|
WARN_ON(!(call->flags & TRACE_EVENT_FL_TRACEPOINT));
|
2010-06-08 15:22:06 +00:00
|
|
|
switch (type) {
|
|
|
|
case TRACE_REG_REGISTER:
|
2014-04-08 21:26:21 +00:00
|
|
|
return tracepoint_probe_register(call->tp,
|
2010-06-08 15:22:06 +00:00
|
|
|
call->class->probe,
|
2012-05-04 03:09:03 +00:00
|
|
|
file);
|
2010-06-08 15:22:06 +00:00
|
|
|
case TRACE_REG_UNREGISTER:
|
2014-04-08 21:26:21 +00:00
|
|
|
tracepoint_probe_unregister(call->tp,
|
2010-06-08 15:22:06 +00:00
|
|
|
call->class->probe,
|
2012-05-04 03:09:03 +00:00
|
|
|
file);
|
2010-06-08 15:22:06 +00:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
#ifdef CONFIG_PERF_EVENTS
|
|
|
|
case TRACE_REG_PERF_REGISTER:
|
2014-04-08 21:26:21 +00:00
|
|
|
return tracepoint_probe_register(call->tp,
|
2010-06-08 15:22:06 +00:00
|
|
|
call->class->perf_probe,
|
|
|
|
call);
|
|
|
|
case TRACE_REG_PERF_UNREGISTER:
|
2014-04-08 21:26:21 +00:00
|
|
|
tracepoint_probe_unregister(call->tp,
|
2010-06-08 15:22:06 +00:00
|
|
|
call->class->perf_probe,
|
|
|
|
call);
|
|
|
|
return 0;
|
2012-02-15 14:51:49 +00:00
|
|
|
case TRACE_REG_PERF_OPEN:
|
|
|
|
case TRACE_REG_PERF_CLOSE:
|
2012-02-15 14:51:50 +00:00
|
|
|
case TRACE_REG_PERF_ADD:
|
|
|
|
case TRACE_REG_PERF_DEL:
|
2012-02-15 14:51:49 +00:00
|
|
|
return 0;
|
2010-06-08 15:22:06 +00:00
|
|
|
#endif
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
2015-05-05 13:39:12 +00:00
|
|
|
EXPORT_SYMBOL_GPL(trace_event_reg);
|
2010-06-08 15:22:06 +00:00
|
|
|
|
2010-07-02 03:07:32 +00:00
|
|
|
/*
 * Start (@enable true) or stop (@enable false) cmdline recording for every
 * event file that currently has EVENT_FILE_FL_ENABLED set, and mirror that
 * in the file's RECORDED_CMD bit. Files that are not enabled are left alone.
 *
 * The caller must hold event_mutex.
 */
void trace_event_enable_cmd_record(bool enable)
{
	struct trace_array *tr;
	struct trace_event_file *file;

	lockdep_assert_held(&event_mutex);

	do_for_each_event_file(tr, file) {

		if (file->flags & EVENT_FILE_FL_ENABLED) {
			if (enable) {
				tracing_start_cmdline_record();
				set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
			} else {
				tracing_stop_cmdline_record();
				clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
			}
		}
	} while_for_each_event_file();
}
|
|
|
|
|
2017-06-27 02:01:55 +00:00
|
|
|
/*
 * Start (@enable true) or stop (@enable false) tgid recording for every
 * event file that currently has EVENT_FILE_FL_ENABLED set, and mirror that
 * in the file's RECORDED_TGID bit. Files that are not enabled are left alone.
 *
 * The caller must hold event_mutex.
 */
void trace_event_enable_tgid_record(bool enable)
{
	struct trace_array *tr;
	struct trace_event_file *file;

	lockdep_assert_held(&event_mutex);

	do_for_each_event_file(tr, file) {

		if (file->flags & EVENT_FILE_FL_ENABLED) {
			if (enable) {
				tracing_start_tgid_record();
				set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags);
			} else {
				tracing_stop_tgid_record();
				clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT,
					  &file->flags);
			}
		}
	} while_for_each_event_file();
}
|
|
|
|
|
2015-05-05 14:09:53 +00:00
|
|
|
static int __ftrace_event_enable_disable(struct trace_event_file *file,
|
2013-03-12 17:26:18 +00:00
|
|
|
int enable, int soft_disable)
|
2009-02-28 07:41:25 +00:00
|
|
|
{
|
2015-05-05 15:45:27 +00:00
|
|
|
struct trace_event_call *call = file->event_call;
|
2015-09-30 13:42:05 +00:00
|
|
|
struct trace_array *tr = file->tr;
|
2009-12-08 03:14:52 +00:00
|
|
|
int ret = 0;
|
2013-03-12 17:26:18 +00:00
|
|
|
int disable;
|
2009-12-08 03:14:52 +00:00
|
|
|
|
2009-02-28 07:41:25 +00:00
|
|
|
switch (enable) {
|
|
|
|
case 0:
|
2013-03-12 17:26:18 +00:00
|
|
|
/*
|
2013-05-09 05:44:29 +00:00
|
|
|
* When soft_disable is set and enable is cleared, the sm_ref
|
|
|
|
* reference counter is decremented. If it reaches 0, we want
|
2013-03-12 17:26:18 +00:00
|
|
|
* to clear the SOFT_DISABLED flag but leave the event in the
|
|
|
|
* state that it was. That is, if the event was enabled and
|
|
|
|
* SOFT_DISABLED isn't set, then do nothing. But if SOFT_DISABLED
|
|
|
|
* is set we do not want the event to be enabled before we
|
|
|
|
* clear the bit.
|
|
|
|
*
|
|
|
|
* When soft_disable is not set but the SOFT_MODE flag is,
|
|
|
|
* we do nothing. Do not disable the tracepoint, otherwise
|
|
|
|
* "soft enable"s (clearing the SOFT_DISABLED bit) wont work.
|
|
|
|
*/
|
|
|
|
if (soft_disable) {
|
2013-05-09 05:44:29 +00:00
|
|
|
if (atomic_dec_return(&file->sm_ref) > 0)
|
|
|
|
break;
|
2015-05-13 19:12:33 +00:00
|
|
|
disable = file->flags & EVENT_FILE_FL_SOFT_DISABLED;
|
|
|
|
clear_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags);
|
tracing: Fix warning in trace_buffered_event_disable()
Warning happened in trace_buffered_event_disable() at
WARN_ON_ONCE(!trace_buffered_event_ref)
Call Trace:
? __warn+0xa5/0x1b0
? trace_buffered_event_disable+0x189/0x1b0
__ftrace_event_enable_disable+0x19e/0x3e0
free_probe_data+0x3b/0xa0
unregister_ftrace_function_probe_func+0x6b8/0x800
event_enable_func+0x2f0/0x3d0
ftrace_process_regex.isra.0+0x12d/0x1b0
ftrace_filter_write+0xe6/0x140
vfs_write+0x1c9/0x6f0
[...]
The cause of the warning is in __ftrace_event_enable_disable(),
trace_buffered_event_enable() was called once while
trace_buffered_event_disable() was called twice.
Reproduction script show as below, for analysis, see the comments:
```
#!/bin/bash
cd /sys/kernel/tracing/
# 1. Register a 'disable_event' command, then:
# 1) SOFT_DISABLED_BIT was set;
# 2) trace_buffered_event_enable() was called first time;
echo 'cmdline_proc_show:disable_event:initcall:initcall_finish' > \
set_ftrace_filter
# 2. Enable the event registered, then:
# 1) SOFT_DISABLED_BIT was cleared;
# 2) trace_buffered_event_disable() was called first time;
echo 1 > events/initcall/initcall_finish/enable
# 3. Try to call into cmdline_proc_show(), then SOFT_DISABLED_BIT was
# set again!!!
cat /proc/cmdline
# 4. Unregister the 'disable_event' command, then:
# 1) SOFT_DISABLED_BIT was cleared again;
# 2) trace_buffered_event_disable() was called second time!!!
echo '!cmdline_proc_show:disable_event:initcall:initcall_finish' > \
set_ftrace_filter
```
To fix it, IIUC, we can change to call trace_buffered_event_enable() at
fist time soft-mode enabled, and call trace_buffered_event_disable() at
last time soft-mode disabled.
Link: https://lore.kernel.org/linux-trace-kernel/20230726095804.920457-1-zhengyejian1@huawei.com
Cc: <mhiramat@kernel.org>
Fixes: 0fc1b09ff1ff ("tracing: Use temp buffer when filtering events")
Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-07-26 09:58:04 +00:00
|
|
|
/* Disable use of trace_buffered_event */
|
|
|
|
trace_buffered_event_disable();
|
2013-03-12 17:26:18 +00:00
|
|
|
} else
|
2015-05-13 19:12:33 +00:00
|
|
|
disable = !(file->flags & EVENT_FILE_FL_SOFT_MODE);
|
2013-03-12 17:26:18 +00:00
|
|
|
|
2015-05-13 19:12:33 +00:00
|
|
|
if (disable && (file->flags & EVENT_FILE_FL_ENABLED)) {
|
|
|
|
clear_bit(EVENT_FILE_FL_ENABLED_BIT, &file->flags);
|
|
|
|
if (file->flags & EVENT_FILE_FL_RECORDED_CMD) {
|
2010-07-02 03:07:32 +00:00
|
|
|
tracing_stop_cmdline_record();
|
2015-05-13 19:12:33 +00:00
|
|
|
clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
|
2010-07-02 03:07:32 +00:00
|
|
|
}
|
2017-06-27 02:01:55 +00:00
|
|
|
|
|
|
|
if (file->flags & EVENT_FILE_FL_RECORDED_TGID) {
|
|
|
|
tracing_stop_tgid_record();
|
2017-09-05 05:36:46 +00:00
|
|
|
clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags);
|
2017-06-27 02:01:55 +00:00
|
|
|
}
|
|
|
|
|
2012-05-04 03:09:03 +00:00
|
|
|
call->class->reg(call, TRACE_REG_UNREGISTER, file);
|
2009-02-28 07:41:25 +00:00
|
|
|
}
|
2013-06-29 05:08:07 +00:00
|
|
|
/* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */
|
2015-05-13 19:12:33 +00:00
|
|
|
if (file->flags & EVENT_FILE_FL_SOFT_MODE)
|
|
|
|
set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
|
2013-06-29 05:08:07 +00:00
|
|
|
else
|
2015-05-13 19:12:33 +00:00
|
|
|
clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
|
2009-02-28 07:41:25 +00:00
|
|
|
break;
|
|
|
|
case 1:
|
2013-03-12 17:26:18 +00:00
|
|
|
/*
|
|
|
|
* When soft_disable is set and enable is set, we want to
|
|
|
|
* register the tracepoint for the event, but leave the event
|
|
|
|
* as is. That means, if the event was already enabled, we do
|
|
|
|
* nothing (but set SOFT_MODE). If the event is disabled, we
|
|
|
|
* set SOFT_DISABLED before enabling the event tracepoint, so
|
|
|
|
* it still seems to be disabled.
|
|
|
|
*/
|
|
|
|
if (!soft_disable)
|
2015-05-13 19:12:33 +00:00
|
|
|
clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
|
2013-05-09 05:44:29 +00:00
|
|
|
else {
|
|
|
|
if (atomic_inc_return(&file->sm_ref) > 1)
|
|
|
|
break;
|
2015-05-13 19:12:33 +00:00
|
|
|
set_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags);
|
tracing: Fix warning in trace_buffered_event_disable()
Warning happened in trace_buffered_event_disable() at
WARN_ON_ONCE(!trace_buffered_event_ref)
Call Trace:
? __warn+0xa5/0x1b0
? trace_buffered_event_disable+0x189/0x1b0
__ftrace_event_enable_disable+0x19e/0x3e0
free_probe_data+0x3b/0xa0
unregister_ftrace_function_probe_func+0x6b8/0x800
event_enable_func+0x2f0/0x3d0
ftrace_process_regex.isra.0+0x12d/0x1b0
ftrace_filter_write+0xe6/0x140
vfs_write+0x1c9/0x6f0
[...]
The cause of the warning is in __ftrace_event_enable_disable(),
trace_buffered_event_enable() was called once while
trace_buffered_event_disable() was called twice.
Reproduction script show as below, for analysis, see the comments:
```
#!/bin/bash
cd /sys/kernel/tracing/
# 1. Register a 'disable_event' command, then:
# 1) SOFT_DISABLED_BIT was set;
# 2) trace_buffered_event_enable() was called first time;
echo 'cmdline_proc_show:disable_event:initcall:initcall_finish' > \
set_ftrace_filter
# 2. Enable the event registered, then:
# 1) SOFT_DISABLED_BIT was cleared;
# 2) trace_buffered_event_disable() was called first time;
echo 1 > events/initcall/initcall_finish/enable
# 3. Try to call into cmdline_proc_show(), then SOFT_DISABLED_BIT was
# set again!!!
cat /proc/cmdline
# 4. Unregister the 'disable_event' command, then:
# 1) SOFT_DISABLED_BIT was cleared again;
# 2) trace_buffered_event_disable() was called second time!!!
echo '!cmdline_proc_show:disable_event:initcall:initcall_finish' > \
set_ftrace_filter
```
To fix it, IIUC, we can change to call trace_buffered_event_enable() at
fist time soft-mode enabled, and call trace_buffered_event_disable() at
last time soft-mode disabled.
Link: https://lore.kernel.org/linux-trace-kernel/20230726095804.920457-1-zhengyejian1@huawei.com
Cc: <mhiramat@kernel.org>
Fixes: 0fc1b09ff1ff ("tracing: Use temp buffer when filtering events")
Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-07-26 09:58:04 +00:00
|
|
|
/* Enable use of trace_buffered_event */
|
|
|
|
trace_buffered_event_enable();
|
2013-05-09 05:44:29 +00:00
|
|
|
}
|
2013-03-12 17:26:18 +00:00
|
|
|
|
2015-05-13 19:12:33 +00:00
|
|
|
if (!(file->flags & EVENT_FILE_FL_ENABLED)) {
|
2017-06-27 02:01:55 +00:00
|
|
|
bool cmd = false, tgid = false;
|
2013-03-12 17:26:18 +00:00
|
|
|
|
|
|
|
/* Keep the event disabled, when going to SOFT_MODE. */
|
|
|
|
if (soft_disable)
|
2015-05-13 19:12:33 +00:00
|
|
|
set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
|
2013-03-12 17:26:18 +00:00
|
|
|
|
2015-09-30 13:42:05 +00:00
|
|
|
if (tr->trace_flags & TRACE_ITER_RECORD_CMD) {
|
2017-06-27 02:01:55 +00:00
|
|
|
cmd = true;
|
2010-07-02 03:07:32 +00:00
|
|
|
tracing_start_cmdline_record();
|
2015-05-13 19:12:33 +00:00
|
|
|
set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
|
2010-07-02 03:07:32 +00:00
|
|
|
}
|
2017-06-27 02:01:55 +00:00
|
|
|
|
|
|
|
if (tr->trace_flags & TRACE_ITER_RECORD_TGID) {
|
|
|
|
tgid = true;
|
|
|
|
tracing_start_tgid_record();
|
|
|
|
set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags);
|
|
|
|
}
|
|
|
|
|
2012-05-04 03:09:03 +00:00
|
|
|
ret = call->class->reg(call, TRACE_REG_REGISTER, file);
|
2009-12-08 03:14:52 +00:00
|
|
|
if (ret) {
|
2017-06-27 02:01:55 +00:00
|
|
|
if (cmd)
|
|
|
|
tracing_stop_cmdline_record();
|
|
|
|
if (tgid)
|
|
|
|
tracing_stop_tgid_record();
|
2009-12-08 03:14:52 +00:00
|
|
|
pr_info("event trace: Could not enable event "
|
2015-05-13 18:20:14 +00:00
|
|
|
"%s\n", trace_event_name(call));
|
2009-12-08 03:14:52 +00:00
|
|
|
break;
|
|
|
|
}
|
2015-05-13 19:12:33 +00:00
|
|
|
set_bit(EVENT_FILE_FL_ENABLED_BIT, &file->flags);
|
2013-03-05 04:05:12 +00:00
|
|
|
|
|
|
|
/* WAS_ENABLED gets set but never cleared. */
|
tracing: Only have rmmod clear buffers that its events were active in
Currently, when a module event is enabled, when that module is removed, it
clears all ring buffers. This is to prevent another module from being loaded
and having one of its trace event IDs from reusing a trace event ID of the
removed module. This could cause undesirable effects as the trace event of
the new module would be using its own processing algorithms to process raw
data of another event. To prevent this, when a module is loaded, if any of
its events have been used (signified by the WAS_ENABLED event call flag,
which is never cleared), all ring buffers are cleared, just in case any one
of them contains event data of the removed event.
The problem is, there's no reason to clear all ring buffers if only one (or
less than all of them) uses one of the events. Instead, only clear the ring
buffers that recorded the events of a module that is being removed.
To do this, instead of keeping the WAS_ENABLED flag with the trace event
call, move it to the per instance (per ring buffer) event file descriptor.
The event file descriptor maps each event to a separate ring buffer
instance. Then when the module is removed, only the ring buffers that
activated one of the module's events get cleared. The rest are not touched.
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-08-31 21:03:47 +00:00
|
|
|
set_bit(EVENT_FILE_FL_WAS_ENABLED_BIT, &file->flags);
|
2009-02-28 07:41:25 +00:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
2009-12-08 03:14:52 +00:00
|
|
|
|
|
|
|
return ret;
|
2009-02-28 07:41:25 +00:00
|
|
|
}
|
|
|
|
|
2015-05-05 14:09:53 +00:00
|
|
|
/*
 * Non-static entry point that forwards all three arguments unchanged to
 * __ftrace_event_enable_disable() and returns its result.
 */
int trace_event_enable_disable(struct trace_event_file *file,
			       int enable, int soft_disable)
{
	int ret = __ftrace_event_enable_disable(file, enable, soft_disable);

	return ret;
}
|
|
|
|
|
2015-05-05 14:09:53 +00:00
|
|
|
/*
 * Enable or disable @file without soft mode: calls
 * __ftrace_event_enable_disable() with soft_disable fixed at 0.
 */
static int ftrace_event_enable_disable(struct trace_event_file *file,
				       int enable)
{
	const int soft_disable = 0;

	return __ftrace_event_enable_disable(file, enable, soft_disable);
}
|
|
|
|
|
2012-05-04 03:09:03 +00:00
|
|
|
static void ftrace_clear_events(struct trace_array *tr)
|
2009-05-25 10:13:59 +00:00
|
|
|
{
|
2015-05-05 14:09:53 +00:00
|
|
|
struct trace_event_file *file;
|
2009-05-25 10:13:59 +00:00
|
|
|
|
|
|
|
mutex_lock(&event_mutex);
|
2012-05-04 03:09:03 +00:00
|
|
|
list_for_each_entry(file, &tr->events, list) {
|
|
|
|
ftrace_event_enable_disable(file, 0);
|
2009-05-25 10:13:59 +00:00
|
|
|
}
|
|
|
|
mutex_unlock(&event_mutex);
|
|
|
|
}
|
|
|
|
|
2016-04-13 20:59:18 +00:00
|
|
|
static void
|
|
|
|
event_filter_pid_sched_process_exit(void *data, struct task_struct *task)
|
|
|
|
{
|
|
|
|
struct trace_pid_list *pid_list;
|
|
|
|
struct trace_array *tr = data;
|
|
|
|
|
2018-08-03 02:34:07 +00:00
|
|
|
pid_list = rcu_dereference_raw(tr->filtered_pids);
|
2016-04-14 11:38:13 +00:00
|
|
|
trace_filter_add_remove_task(pid_list, NULL, task);
|
2020-03-25 23:51:19 +00:00
|
|
|
|
|
|
|
pid_list = rcu_dereference_raw(tr->filtered_no_pids);
|
|
|
|
trace_filter_add_remove_task(pid_list, NULL, task);
|
2016-04-13 20:59:18 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
event_filter_pid_sched_process_fork(void *data,
|
|
|
|
struct task_struct *self,
|
|
|
|
struct task_struct *task)
|
|
|
|
{
|
|
|
|
struct trace_pid_list *pid_list;
|
|
|
|
struct trace_array *tr = data;
|
|
|
|
|
|
|
|
pid_list = rcu_dereference_sched(tr->filtered_pids);
|
2016-04-14 11:38:13 +00:00
|
|
|
trace_filter_add_remove_task(pid_list, self, task);
|
2020-03-25 23:51:19 +00:00
|
|
|
|
|
|
|
pid_list = rcu_dereference_sched(tr->filtered_no_pids);
|
|
|
|
trace_filter_add_remove_task(pid_list, self, task);
|
2016-04-13 20:59:18 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Attach (@enable true) or detach (@enable false) the pid-filter probes on
 * the sched_process_fork and sched_process_free tracepoints, with @tr as
 * their data. On attach the fork probe is registered with priority INT_MIN
 * and the free probe with priority INT_MAX.
 */
void trace_event_follow_fork(struct trace_array *tr, bool enable)
{
	if (!enable) {
		unregister_trace_sched_process_fork(event_filter_pid_sched_process_fork,
						    tr);
		unregister_trace_sched_process_free(event_filter_pid_sched_process_exit,
						    tr);
		return;
	}

	register_trace_prio_sched_process_fork(event_filter_pid_sched_process_fork,
					       tr, INT_MIN);
	register_trace_prio_sched_process_free(event_filter_pid_sched_process_exit,
					       tr, INT_MAX);
}
|
|
|
|
|
|
|
|
static void
|
2015-11-06 21:30:20 +00:00
|
|
|
event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
|
2022-01-20 16:25:19 +00:00
|
|
|
struct task_struct *prev,
|
2022-05-11 18:28:36 +00:00
|
|
|
struct task_struct *next,
|
|
|
|
unsigned int prev_state)
|
2015-09-25 16:58:44 +00:00
|
|
|
{
|
|
|
|
struct trace_array *tr = data;
|
2020-03-25 23:51:19 +00:00
|
|
|
struct trace_pid_list *no_pid_list;
|
2015-09-25 16:58:44 +00:00
|
|
|
struct trace_pid_list *pid_list;
|
2020-03-25 23:51:19 +00:00
|
|
|
bool ret;
|
2015-09-25 16:58:44 +00:00
|
|
|
|
|
|
|
pid_list = rcu_dereference_sched(tr->filtered_pids);
|
2020-03-25 23:51:19 +00:00
|
|
|
no_pid_list = rcu_dereference_sched(tr->filtered_no_pids);
|
2015-09-25 16:58:44 +00:00
|
|
|
|
2020-03-25 23:51:19 +00:00
|
|
|
/*
|
|
|
|
* Sched switch is funny, as we only want to ignore it
|
|
|
|
* in the notrace case if both prev and next should be ignored.
|
|
|
|
*/
|
|
|
|
ret = trace_ignore_this_task(NULL, no_pid_list, prev) &&
|
|
|
|
trace_ignore_this_task(NULL, no_pid_list, next);
|
|
|
|
|
|
|
|
this_cpu_write(tr->array_buffer.data->ignore_pid, ret ||
|
|
|
|
(trace_ignore_this_task(pid_list, NULL, prev) &&
|
|
|
|
trace_ignore_this_task(pid_list, NULL, next)));
|
2015-09-25 16:58:44 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2015-11-06 21:30:20 +00:00
|
|
|
event_filter_pid_sched_switch_probe_post(void *data, bool preempt,
|
2022-01-20 16:25:19 +00:00
|
|
|
struct task_struct *prev,
|
2022-05-11 18:28:36 +00:00
|
|
|
struct task_struct *next,
|
|
|
|
unsigned int prev_state)
|
2015-09-25 16:58:44 +00:00
|
|
|
{
|
|
|
|
struct trace_array *tr = data;
|
2020-03-25 23:51:19 +00:00
|
|
|
struct trace_pid_list *no_pid_list;
|
2015-09-25 16:58:44 +00:00
|
|
|
struct trace_pid_list *pid_list;
|
|
|
|
|
|
|
|
pid_list = rcu_dereference_sched(tr->filtered_pids);
|
2020-03-25 23:51:19 +00:00
|
|
|
no_pid_list = rcu_dereference_sched(tr->filtered_no_pids);
|
2015-09-25 16:58:44 +00:00
|
|
|
|
2020-01-09 23:53:48 +00:00
|
|
|
this_cpu_write(tr->array_buffer.data->ignore_pid,
|
2020-03-25 23:51:19 +00:00
|
|
|
trace_ignore_this_task(pid_list, no_pid_list, next));
|
2015-09-25 16:58:44 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task)
|
|
|
|
{
|
|
|
|
struct trace_array *tr = data;
|
2020-03-25 23:51:19 +00:00
|
|
|
struct trace_pid_list *no_pid_list;
|
2015-09-25 16:58:44 +00:00
|
|
|
struct trace_pid_list *pid_list;
|
|
|
|
|
|
|
|
/* Nothing to do if we are already tracing */
|
2020-01-09 23:53:48 +00:00
|
|
|
if (!this_cpu_read(tr->array_buffer.data->ignore_pid))
|
2015-09-25 16:58:44 +00:00
|
|
|
return;
|
|
|
|
|
|
|
|
pid_list = rcu_dereference_sched(tr->filtered_pids);
|
2020-03-25 23:51:19 +00:00
|
|
|
no_pid_list = rcu_dereference_sched(tr->filtered_no_pids);
|
2015-09-25 16:58:44 +00:00
|
|
|
|
2020-01-09 23:53:48 +00:00
|
|
|
this_cpu_write(tr->array_buffer.data->ignore_pid,
|
2020-03-25 23:51:19 +00:00
|
|
|
trace_ignore_this_task(pid_list, no_pid_list, task));
|
2015-09-25 16:58:44 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task)
|
|
|
|
{
|
|
|
|
struct trace_array *tr = data;
|
2020-03-25 23:51:19 +00:00
|
|
|
struct trace_pid_list *no_pid_list;
|
2015-09-25 16:58:44 +00:00
|
|
|
struct trace_pid_list *pid_list;
|
|
|
|
|
|
|
|
/* Nothing to do if we are not tracing */
|
2020-01-09 23:53:48 +00:00
|
|
|
if (this_cpu_read(tr->array_buffer.data->ignore_pid))
|
2015-09-25 16:58:44 +00:00
|
|
|
return;
|
|
|
|
|
|
|
|
pid_list = rcu_dereference_sched(tr->filtered_pids);
|
2020-03-25 23:51:19 +00:00
|
|
|
no_pid_list = rcu_dereference_sched(tr->filtered_no_pids);
|
2015-09-25 16:58:44 +00:00
|
|
|
|
|
|
|
/* Set tracing if current is enabled */
|
2020-01-09 23:53:48 +00:00
|
|
|
this_cpu_write(tr->array_buffer.data->ignore_pid,
|
2020-03-25 23:51:19 +00:00
|
|
|
trace_ignore_this_task(pid_list, no_pid_list, current));
|
2015-09-25 16:58:44 +00:00
|
|
|
}
|
|
|
|
|
2020-03-25 23:51:19 +00:00
|
|
|
static void unregister_pid_events(struct trace_array *tr)
|
2015-09-24 15:33:26 +00:00
|
|
|
{
|
2015-09-25 16:58:44 +00:00
|
|
|
unregister_trace_sched_switch(event_filter_pid_sched_switch_probe_pre, tr);
|
|
|
|
unregister_trace_sched_switch(event_filter_pid_sched_switch_probe_post, tr);
|
|
|
|
|
|
|
|
unregister_trace_sched_wakeup(event_filter_pid_sched_wakeup_probe_pre, tr);
|
|
|
|
unregister_trace_sched_wakeup(event_filter_pid_sched_wakeup_probe_post, tr);
|
|
|
|
|
2015-12-01 21:08:05 +00:00
|
|
|
unregister_trace_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_pre, tr);
|
|
|
|
unregister_trace_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_post, tr);
|
|
|
|
|
|
|
|
unregister_trace_sched_waking(event_filter_pid_sched_wakeup_probe_pre, tr);
|
|
|
|
unregister_trace_sched_waking(event_filter_pid_sched_wakeup_probe_post, tr);
|
2020-03-25 23:51:19 +00:00
|
|
|
}
|
2015-12-01 21:08:05 +00:00
|
|
|
|
2020-03-25 23:51:19 +00:00
|
|
|
static void __ftrace_clear_event_pids(struct trace_array *tr, int type)
|
|
|
|
{
|
|
|
|
struct trace_pid_list *pid_list;
|
|
|
|
struct trace_pid_list *no_pid_list;
|
|
|
|
struct trace_event_file *file;
|
|
|
|
int cpu;
|
|
|
|
|
|
|
|
pid_list = rcu_dereference_protected(tr->filtered_pids,
|
|
|
|
lockdep_is_held(&event_mutex));
|
|
|
|
no_pid_list = rcu_dereference_protected(tr->filtered_no_pids,
|
|
|
|
lockdep_is_held(&event_mutex));
|
|
|
|
|
|
|
|
/* Make sure there's something to do */
|
|
|
|
if (!pid_type_enabled(type, pid_list, no_pid_list))
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (!still_need_pid_events(type, pid_list, no_pid_list)) {
|
|
|
|
unregister_pid_events(tr);
|
|
|
|
|
|
|
|
list_for_each_entry(file, &tr->events, list) {
|
|
|
|
clear_bit(EVENT_FILE_FL_PID_FILTER_BIT, &file->flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
for_each_possible_cpu(cpu)
|
|
|
|
per_cpu_ptr(tr->array_buffer.data, cpu)->ignore_pid = false;
|
2015-09-25 16:58:44 +00:00
|
|
|
}
|
|
|
|
|
2020-03-25 23:51:19 +00:00
|
|
|
if (type & TRACE_PIDS)
|
|
|
|
rcu_assign_pointer(tr->filtered_pids, NULL);
|
2015-09-25 16:58:44 +00:00
|
|
|
|
2020-03-25 23:51:19 +00:00
|
|
|
if (type & TRACE_NO_PIDS)
|
|
|
|
rcu_assign_pointer(tr->filtered_no_pids, NULL);
|
2015-09-24 15:33:26 +00:00
|
|
|
|
|
|
|
/* Wait till all users are no longer using pid filtering */
|
2018-08-09 19:31:48 +00:00
|
|
|
tracepoint_synchronize_unregister();
|
2015-09-24 15:33:26 +00:00
|
|
|
|
2020-03-25 23:51:19 +00:00
|
|
|
if ((type & TRACE_PIDS) && pid_list)
|
2021-09-24 01:03:49 +00:00
|
|
|
trace_pid_list_free(pid_list);
|
2020-03-25 23:51:19 +00:00
|
|
|
|
|
|
|
if ((type & TRACE_NO_PIDS) && no_pid_list)
|
2021-09-24 01:03:49 +00:00
|
|
|
trace_pid_list_free(no_pid_list);
|
2015-09-24 15:33:26 +00:00
|
|
|
}
|
|
|
|
|
2020-03-25 23:51:19 +00:00
|
|
|
static void ftrace_clear_event_pids(struct trace_array *tr, int type)
|
2015-09-24 15:33:26 +00:00
|
|
|
{
|
|
|
|
mutex_lock(&event_mutex);
|
2020-03-25 23:51:19 +00:00
|
|
|
__ftrace_clear_event_pids(tr, type);
|
2015-09-24 15:33:26 +00:00
|
|
|
mutex_unlock(&event_mutex);
|
|
|
|
}
|
|
|
|
|
2011-07-05 15:36:06 +00:00
|
|
|
static void __put_system(struct event_subsystem *system)
|
|
|
|
{
|
|
|
|
struct event_filter *filter = system->filter;
|
|
|
|
|
2013-06-27 14:58:31 +00:00
|
|
|
WARN_ON_ONCE(system_refcount(system) == 0);
|
|
|
|
if (system_refcount_dec(system))
|
2011-07-05 15:36:06 +00:00
|
|
|
return;
|
|
|
|
|
2012-05-04 03:09:03 +00:00
|
|
|
list_del(&system->list);
|
|
|
|
|
2011-07-05 15:36:06 +00:00
|
|
|
if (filter) {
|
|
|
|
kfree(filter->filter_string);
|
|
|
|
kfree(filter);
|
|
|
|
}
|
2015-09-09 21:24:01 +00:00
|
|
|
kfree_const(system->name);
|
2011-07-05 15:36:06 +00:00
|
|
|
kfree(system);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Take one more reference on @system; warns once if the count was already zero. */
static void __get_system(struct event_subsystem *system)
{
	WARN_ON_ONCE(system_refcount(system) == 0);

	system_refcount_inc(system);
}
|
|
|
|
|
2015-05-13 18:59:40 +00:00
|
|
|
static void __get_system_dir(struct trace_subsystem_dir *dir)
|
2012-05-04 03:09:03 +00:00
|
|
|
{
|
|
|
|
WARN_ON_ONCE(dir->ref_count == 0);
|
|
|
|
dir->ref_count++;
|
|
|
|
__get_system(dir->subsystem);
|
|
|
|
}
|
|
|
|
|
2015-05-13 18:59:40 +00:00
|
|
|
static void __put_system_dir(struct trace_subsystem_dir *dir)
|
2012-05-04 03:09:03 +00:00
|
|
|
{
|
|
|
|
WARN_ON_ONCE(dir->ref_count == 0);
|
|
|
|
/* If the subsystem is about to be freed, the dir must be too */
|
2013-06-27 14:58:31 +00:00
|
|
|
WARN_ON_ONCE(system_refcount(dir->subsystem) == 1 && dir->ref_count != 1);
|
2012-05-04 03:09:03 +00:00
|
|
|
|
|
|
|
__put_system(dir->subsystem);
|
|
|
|
if (!--dir->ref_count)
|
|
|
|
kfree(dir);
|
|
|
|
}
|
|
|
|
|
2015-05-13 18:59:40 +00:00
|
|
|
static void put_system(struct trace_subsystem_dir *dir)
|
2011-07-05 15:36:06 +00:00
|
|
|
{
|
|
|
|
mutex_lock(&event_mutex);
|
2012-05-04 03:09:03 +00:00
|
|
|
__put_system_dir(dir);
|
2011-07-05 15:36:06 +00:00
|
|
|
mutex_unlock(&event_mutex);
|
|
|
|
}
|
|
|
|
|
2015-05-13 18:59:40 +00:00
|
|
|
static void remove_subsystem(struct trace_subsystem_dir *dir)
|
2013-07-26 17:25:47 +00:00
|
|
|
{
|
|
|
|
if (!dir)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (!--dir->nr_events) {
|
eventfs: Remove eventfs_file and just use eventfs_inode
Instead of having a descriptor for every file represented in the eventfs
directory, only have the directory itself represented. Change the API to
send in a list of entries that represent all the files in the directory
(but not other directories). The entry list contains a name and a callback
function that will be used to create the files when they are accessed.
struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent,
const struct eventfs_entry *entries,
int size, void *data);
is used for the top level eventfs directory, and returns an eventfs_inode
that will be used by:
struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent,
const struct eventfs_entry *entries,
int size, void *data);
where both of the above take an array of struct eventfs_entry entries for
every file that is in the directory.
The entries are defined by:
typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data,
const struct file_operations **fops);
struct eventfs_entry {
const char *name;
eventfs_callback callback;
};
Where the name is the name of the file and the callback gets called when
the file is being created. The callback passes in the name (in case the
same callback is used for multiple files), a pointer to the mode, data and
fops. The data will be pointing to the data that was passed in
eventfs_create_dir() or eventfs_create_events_dir() but may be overridden
to point to something else, as it will be used to point to the
inode->i_private that is created. The information passed back from the
callback is used to create the dentry/inode.
If the callback fills the data and the file should be created, it must
return a positive number. On zero or negative, the file is ignored.
This logic may also be used as a prototype to convert entire pseudo file
systems into just-in-time allocation.
The "show_events_dentry" file has been updated to show the directories,
and any files they have.
With just the eventfs_file allocations:
Before after deltas for meminfo (in kB):
MemFree: -14360
MemAvailable: -14260
Buffers: 40
Cached: 24
Active: 44
Inactive: 48
Inactive(anon): 28
Active(file): 44
Inactive(file): 20
Dirty: -4
AnonPages: 28
Mapped: 4
KReclaimable: 132
Slab: 1604
SReclaimable: 132
SUnreclaim: 1472
Committed_AS: 12
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
ext4_inode_cache 27 [* 1184 = 31968 ]
extent_status 102 [* 40 = 4080 ]
tracefs_inode_cache 144 [* 656 = 94464 ]
buffer_head 39 [* 104 = 4056 ]
shmem_inode_cache 49 [* 800 = 39200 ]
filp -53 [* 256 = -13568 ]
dentry 251 [* 192 = 48192 ]
lsm_file_cache 277 [* 32 = 8864 ]
vm_area_struct -14 [* 184 = -2576 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k 35 [* 1024 = 35840 ]
kmalloc-256 49 [* 256 = 12544 ]
kmalloc-192 -28 [* 192 = -5376 ]
kmalloc-128 -30 [* 128 = -3840 ]
kmalloc-96 10581 [* 96 = 1015776 ]
kmalloc-64 3056 [* 64 = 195584 ]
kmalloc-32 1291 [* 32 = 41312 ]
kmalloc-16 2310 [* 16 = 36960 ]
kmalloc-8 9216 [* 8 = 73728 ]
Free memory dropped by 14,360 kB
Available memory dropped by 14,260 kB
Total slab additions in size: 1,771,032 bytes
With this change:
Before after deltas for meminfo (in kB):
MemFree: -12084
MemAvailable: -11976
Buffers: 32
Cached: 32
Active: 72
Inactive: 168
Inactive(anon): 176
Active(file): 72
Inactive(file): -8
Dirty: 24
AnonPages: 196
Mapped: 8
KReclaimable: 148
Slab: 836
SReclaimable: 148
SUnreclaim: 688
Committed_AS: 324
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
tracefs_inode_cache 144 [* 656 = 94464 ]
shmem_inode_cache -23 [* 800 = -18400 ]
filp -92 [* 256 = -23552 ]
dentry 179 [* 192 = 34368 ]
lsm_file_cache -3 [* 32 = -96 ]
vm_area_struct -13 [* 184 = -2392 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k -49 [* 1024 = -50176 ]
kmalloc-256 -27 [* 256 = -6912 ]
kmalloc-128 1864 [* 128 = 238592 ]
kmalloc-64 4685 [* 64 = 299840 ]
kmalloc-32 -72 [* 32 = -2304 ]
kmalloc-16 256 [* 16 = 4096 ]
total = 721352
Free memory dropped by 12,084 kB
Available memory dropped by 11,976 kB
Total slab additions in size: 721,352 bytes
That's over 2 MB in savings per instance for free and available memory,
and over 1 MB in savings per instance of slab memory.
Link: https://lore.kernel.org/linux-trace-kernel/20231003184059.4924468e@gandalf.local.home
Link: https://lore.kernel.org/linux-trace-kernel/20231004165007.43d79161@gandalf.local.home
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ajay Kaher <akaher@vmware.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-10-04 20:50:07 +00:00
|
|
|
eventfs_remove_dir(dir->ei);
|
2013-07-26 17:25:47 +00:00
|
|
|
list_del(&dir->list);
|
|
|
|
__put_system_dir(dir);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-10-31 16:24:53 +00:00
|
|
|
void event_file_get(struct trace_event_file *file)
|
|
|
|
{
|
2024-07-26 18:42:08 +00:00
|
|
|
refcount_inc(&file->ref);
|
2023-10-31 16:24:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void event_file_put(struct trace_event_file *file)
|
|
|
|
{
|
2024-07-26 18:42:08 +00:00
|
|
|
if (WARN_ON_ONCE(!refcount_read(&file->ref))) {
|
2023-10-31 16:24:53 +00:00
|
|
|
if (file->flags & EVENT_FILE_FL_FREED)
|
|
|
|
kmem_cache_free(file_cachep, file);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2024-07-26 18:42:08 +00:00
|
|
|
if (refcount_dec_and_test(&file->ref)) {
|
2023-10-31 16:24:53 +00:00
|
|
|
/* Count should only go to zero when it is freed */
|
|
|
|
if (WARN_ON_ONCE(!(file->flags & EVENT_FILE_FL_FREED)))
|
|
|
|
return;
|
|
|
|
kmem_cache_free(file_cachep, file);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-05-05 14:09:53 +00:00
|
|
|
static void remove_event_file_dir(struct trace_event_file *file)
|
2013-07-26 17:25:47 +00:00
|
|
|
{
|
eventfs: Remove eventfs_file and just use eventfs_inode
Instead of having a descriptor for every file represented in the eventfs
directory, only have the directory itself represented. Change the API to
send in a list of entries that represent all the files in the directory
(but not other directories). The entry list contains a name and a callback
function that will be used to create the files when they are accessed.
struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent,
const struct eventfs_entry *entries,
int size, void *data);
is used for the top level eventfs directory, and returns an eventfs_inode
that will be used by:
struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent,
const struct eventfs_entry *entries,
int size, void *data);
where both of the above take an array of struct eventfs_entry entries for
every file that is in the directory.
The entries are defined by:
typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data,
const struct file_operations **fops);
struct eventfs_entry {
const char *name;
eventfs_callback callback;
};
Where the name is the name of the file and the callback gets called when
the file is being created. The callback passes in the name (in case the
same callback is used for multiple files), a pointer to the mode, data and
fops. The data will be pointing to the data that was passed in
eventfs_create_dir() or eventfs_create_events_dir() but may be overridden
to point to something else, as it will be used to point to the
inode->i_private that is created. The information passed back from the
callback is used to create the dentry/inode.
If the callback fills the data and the file should be created, it must
return a positive number. On zero or negative, the file is ignored.
This logic may also be used as a prototype to convert entire pseudo file
systems into just-in-time allocation.
The "show_events_dentry" file has been updated to show the directories,
and any files they have.
With just the eventfs_file allocations:
Before after deltas for meminfo (in kB):
MemFree: -14360
MemAvailable: -14260
Buffers: 40
Cached: 24
Active: 44
Inactive: 48
Inactive(anon): 28
Active(file): 44
Inactive(file): 20
Dirty: -4
AnonPages: 28
Mapped: 4
KReclaimable: 132
Slab: 1604
SReclaimable: 132
SUnreclaim: 1472
Committed_AS: 12
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
ext4_inode_cache 27 [* 1184 = 31968 ]
extent_status 102 [* 40 = 4080 ]
tracefs_inode_cache 144 [* 656 = 94464 ]
buffer_head 39 [* 104 = 4056 ]
shmem_inode_cache 49 [* 800 = 39200 ]
filp -53 [* 256 = -13568 ]
dentry 251 [* 192 = 48192 ]
lsm_file_cache 277 [* 32 = 8864 ]
vm_area_struct -14 [* 184 = -2576 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k 35 [* 1024 = 35840 ]
kmalloc-256 49 [* 256 = 12544 ]
kmalloc-192 -28 [* 192 = -5376 ]
kmalloc-128 -30 [* 128 = -3840 ]
kmalloc-96 10581 [* 96 = 1015776 ]
kmalloc-64 3056 [* 64 = 195584 ]
kmalloc-32 1291 [* 32 = 41312 ]
kmalloc-16 2310 [* 16 = 36960 ]
kmalloc-8 9216 [* 8 = 73728 ]
Free memory dropped by 14,360 kB
Available memory dropped by 14,260 kB
Total slab additions in size: 1,771,032 bytes
With this change:
Before after deltas for meminfo (in kB):
MemFree: -12084
MemAvailable: -11976
Buffers: 32
Cached: 32
Active: 72
Inactive: 168
Inactive(anon): 176
Active(file): 72
Inactive(file): -8
Dirty: 24
AnonPages: 196
Mapped: 8
KReclaimable: 148
Slab: 836
SReclaimable: 148
SUnreclaim: 688
Committed_AS: 324
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
tracefs_inode_cache 144 [* 656 = 94464 ]
shmem_inode_cache -23 [* 800 = -18400 ]
filp -92 [* 256 = -23552 ]
dentry 179 [* 192 = 34368 ]
lsm_file_cache -3 [* 32 = -96 ]
vm_area_struct -13 [* 184 = -2392 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k -49 [* 1024 = -50176 ]
kmalloc-256 -27 [* 256 = -6912 ]
kmalloc-128 1864 [* 128 = 238592 ]
kmalloc-64 4685 [* 64 = 299840 ]
kmalloc-32 -72 [* 32 = -2304 ]
kmalloc-16 256 [* 16 = 4096 ]
total = 721352
Free memory dropped by 12,084 kB
Available memory dropped by 11,976 kB
Total slab additions in size: 721,352 bytes
That's over 2 MB in savings per instance for free and available memory,
and over 1 MB in savings per instance of slab memory.
Link: https://lore.kernel.org/linux-trace-kernel/20231003184059.4924468e@gandalf.local.home
Link: https://lore.kernel.org/linux-trace-kernel/20231004165007.43d79161@gandalf.local.home
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ajay Kaher <akaher@vmware.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-10-04 20:50:07 +00:00
|
|
|
eventfs_remove_dir(file->ei);
|
2013-07-26 17:25:47 +00:00
|
|
|
list_del(&file->list);
|
|
|
|
remove_subsystem(file->system);
|
2014-07-11 19:06:38 +00:00
|
|
|
free_event_filter(file->filter);
|
2023-10-31 16:24:53 +00:00
|
|
|
file->flags |= EVENT_FILE_FL_FREED;
|
|
|
|
event_file_put(file);
|
2013-07-26 17:25:47 +00:00
|
|
|
}
|
|
|
|
|
2009-05-08 02:31:42 +00:00
|
|
|
/*
|
|
|
|
* __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.
|
|
|
|
*/
|
2013-07-02 18:48:23 +00:00
|
|
|
static int
|
|
|
|
__ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match,
|
|
|
|
const char *sub, const char *event, int set)
|
2009-02-24 15:21:36 +00:00
|
|
|
{
|
2015-05-05 14:09:53 +00:00
|
|
|
struct trace_event_file *file;
|
2015-05-05 15:45:27 +00:00
|
|
|
struct trace_event_call *call;
|
2014-04-08 21:26:21 +00:00
|
|
|
const char *name;
|
2009-05-08 20:06:47 +00:00
|
|
|
int ret = -EINVAL;
|
2016-11-28 18:54:57 +00:00
|
|
|
int eret = 0;
|
2009-05-08 02:31:42 +00:00
|
|
|
|
2012-05-04 03:09:03 +00:00
|
|
|
list_for_each_entry(file, &tr->events, list) {
|
|
|
|
|
|
|
|
call = file->event_call;
|
2015-05-13 18:20:14 +00:00
|
|
|
name = trace_event_name(call);
|
2009-05-08 02:31:42 +00:00
|
|
|
|
2014-04-08 21:26:21 +00:00
|
|
|
if (!name || !call->class || !call->class->reg)
|
2009-05-08 02:31:42 +00:00
|
|
|
continue;
|
|
|
|
|
2012-05-10 19:55:43 +00:00
|
|
|
if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)
|
|
|
|
continue;
|
|
|
|
|
2009-05-08 02:31:42 +00:00
|
|
|
if (match &&
|
2014-04-08 21:26:21 +00:00
|
|
|
strcmp(match, name) != 0 &&
|
2010-04-20 14:47:33 +00:00
|
|
|
strcmp(match, call->class->system) != 0)
|
2009-05-08 02:31:42 +00:00
|
|
|
continue;
|
|
|
|
|
2010-04-20 14:47:33 +00:00
|
|
|
if (sub && strcmp(sub, call->class->system) != 0)
|
2009-05-08 02:31:42 +00:00
|
|
|
continue;
|
|
|
|
|
2014-04-08 21:26:21 +00:00
|
|
|
if (event && strcmp(event, name) != 0)
|
2009-05-08 02:31:42 +00:00
|
|
|
continue;
|
|
|
|
|
2016-11-28 18:54:57 +00:00
|
|
|
ret = ftrace_event_enable_disable(file, set);
|
2009-05-08 02:31:42 +00:00
|
|
|
|
2016-11-28 18:54:57 +00:00
|
|
|
/*
|
|
|
|
* Save the first error and return that. Some events
|
|
|
|
* may still have been enabled, but let the user
|
|
|
|
* know that something went wrong.
|
|
|
|
*/
|
|
|
|
if (ret && !eret)
|
|
|
|
eret = ret;
|
|
|
|
|
|
|
|
ret = eret;
|
2009-05-08 02:31:42 +00:00
|
|
|
}
|
2013-07-02 18:48:23 +00:00
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int __ftrace_set_clr_event(struct trace_array *tr, const char *match,
|
|
|
|
const char *sub, const char *event, int set)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
mutex_lock(&event_mutex);
|
|
|
|
ret = __ftrace_set_clr_event_nolock(tr, match, sub, event, set);
|
2009-05-08 02:31:42 +00:00
|
|
|
mutex_unlock(&event_mutex);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2019-07-04 17:21:10 +00:00
|
|
|
/*
 * Parse @buf and enable (@set != 0) or disable (@set == 0) the events it
 * selects in @tr. @buf is split in place at the first ':' and restored
 * before returning. Returns -ENOENT when @tr is NULL, otherwise the result
 * of __ftrace_set_clr_event().
 */
int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
{
	char *match;
	char *sub = NULL;
	char *event = NULL;
	int ret;

	if (!tr)
		return -ENOENT;

	/*
	 * The buf format can be <subsystem>:<event-name>
	 *  *:<event-name> means any event by that name.
	 *  :<event-name> is the same.
	 *
	 *  <subsystem>:* means all events in that subsystem
	 *  <subsystem>: means the same.
	 *
	 *  <name> (no ':') means all events in a subsystem with
	 *  the name <name> or any event that matches <name>
	 */
	match = strsep(&buf, ":");
	if (buf) {
		/* A ':' was found: left side is the subsystem, right the event */
		sub = match;
		event = buf;
		match = NULL;

		/* Empty or "*" on either side is a wildcard */
		if (*sub == '\0' || !strcmp(sub, "*"))
			sub = NULL;
		if (*event == '\0' || !strcmp(event, "*"))
			event = NULL;
	}

	ret = __ftrace_set_clr_event(tr, match, sub, event, set);

	/* Put back the colon to allow this to be called again */
	if (buf)
		buf[-1] = ':';

	return ret;
}
|
|
|
|
|
2009-05-08 20:27:41 +00:00
|
|
|
/**
|
|
|
|
* trace_set_clr_event - enable or disable an event
|
|
|
|
* @system: system name to match (NULL for any system)
|
|
|
|
* @event: event name to match (NULL for all events, within system)
|
|
|
|
* @set: 1 to enable, 0 to disable
|
|
|
|
*
|
|
|
|
* This is a way for other parts of the kernel to enable or disable
|
|
|
|
* event recording.
|
|
|
|
*
|
|
|
|
* Returns 0 on success, -EINVAL if the parameters do not match any
|
|
|
|
* registered events.
|
|
|
|
*/
|
|
|
|
int trace_set_clr_event(const char *system, const char *event, int set)
|
|
|
|
{
|
2012-05-04 03:09:03 +00:00
|
|
|
struct trace_array *tr = top_trace_array();
|
|
|
|
|
2014-06-05 22:35:17 +00:00
|
|
|
if (!tr)
|
|
|
|
return -ENODEV;
|
|
|
|
|
2012-05-04 03:09:03 +00:00
|
|
|
return __ftrace_set_clr_event(tr, NULL, system, event, set);
|
2009-05-08 20:27:41 +00:00
|
|
|
}
|
2010-11-08 06:05:12 +00:00
|
|
|
EXPORT_SYMBOL_GPL(trace_set_clr_event);
|
2009-05-08 20:27:41 +00:00
|
|
|
|
2019-11-20 19:08:38 +00:00
|
|
|
/**
|
|
|
|
* trace_array_set_clr_event - enable or disable an event for a trace array.
|
|
|
|
* @tr: concerned trace array.
|
|
|
|
* @system: system name to match (NULL for any system)
|
|
|
|
* @event: event name to match (NULL for all events, within system)
|
|
|
|
* @enable: true to enable, false to disable
|
|
|
|
*
|
|
|
|
* This is a way for other parts of the kernel to enable or disable
|
|
|
|
* event recording.
|
|
|
|
*
|
|
|
|
* Returns 0 on success, -EINVAL if the parameters do not match any
|
|
|
|
* registered events.
|
|
|
|
*/
|
|
|
|
int trace_array_set_clr_event(struct trace_array *tr, const char *system,
|
|
|
|
const char *event, bool enable)
|
|
|
|
{
|
|
|
|
int set;
|
|
|
|
|
|
|
|
if (!tr)
|
|
|
|
return -ENOENT;
|
|
|
|
|
|
|
|
set = (enable == true) ? 1 : 0;
|
|
|
|
return __ftrace_set_clr_event(tr, NULL, system, event, set);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(trace_array_set_clr_event);
|
|
|
|
|
2009-02-24 15:21:36 +00:00
|
|
|
/* EVENT_BUF_SIZE + 1 = 128 bytes should be much more than enough */
|
|
|
|
#define EVENT_BUF_SIZE 127
|
|
|
|
|
|
|
|
static ssize_t
|
|
|
|
ftrace_event_write(struct file *file, const char __user *ubuf,
|
|
|
|
size_t cnt, loff_t *ppos)
|
|
|
|
{
|
2009-09-11 15:29:28 +00:00
|
|
|
struct trace_parser parser;
|
2012-05-04 03:09:03 +00:00
|
|
|
struct seq_file *m = file->private_data;
|
|
|
|
struct trace_array *tr = m->private;
|
2009-09-22 05:52:20 +00:00
|
|
|
ssize_t read, ret;
|
2009-02-24 15:21:36 +00:00
|
|
|
|
2009-09-22 05:52:20 +00:00
|
|
|
if (!cnt)
|
2009-02-24 15:21:36 +00:00
|
|
|
return 0;
|
|
|
|
|
2023-09-06 09:18:37 +00:00
|
|
|
ret = tracing_update_buffers(tr);
|
2009-03-11 18:33:00 +00:00
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
|
|
|
|
2009-09-11 15:29:28 +00:00
|
|
|
if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1))
|
2009-02-24 15:21:36 +00:00
|
|
|
return -ENOMEM;
|
|
|
|
|
2009-09-11 15:29:28 +00:00
|
|
|
read = trace_get_user(&parser, ubuf, cnt, ppos);
|
|
|
|
|
2009-09-22 05:52:20 +00:00
|
|
|
if (read >= 0 && trace_parser_loaded((&parser))) {
|
2009-09-11 15:29:28 +00:00
|
|
|
int set = 1;
|
2009-02-24 15:21:36 +00:00
|
|
|
|
2009-09-11 15:29:28 +00:00
|
|
|
if (*parser.buffer == '!')
|
2009-02-24 15:21:36 +00:00
|
|
|
set = 0;
|
|
|
|
|
2012-05-04 03:09:03 +00:00
|
|
|
ret = ftrace_set_clr_event(tr, parser.buffer + !set, set);
|
2009-02-24 15:21:36 +00:00
|
|
|
if (ret)
|
2009-09-11 15:29:28 +00:00
|
|
|
goto out_put;
|
2009-02-24 15:21:36 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
ret = read;
|
|
|
|
|
2009-09-11 15:29:28 +00:00
|
|
|
out_put:
|
|
|
|
trace_parser_put(&parser);
|
2009-02-24 15:21:36 +00:00
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void *
|
|
|
|
t_next(struct seq_file *m, void *v, loff_t *pos)
|
|
|
|
{
|
2015-05-05 14:09:53 +00:00
|
|
|
struct trace_event_file *file = v;
|
2015-05-05 15:45:27 +00:00
|
|
|
struct trace_event_call *call;
|
2012-05-04 03:09:03 +00:00
|
|
|
struct trace_array *tr = m->private;
|
2009-02-24 15:21:36 +00:00
|
|
|
|
|
|
|
(*pos)++;
|
|
|
|
|
2012-05-04 03:09:03 +00:00
|
|
|
list_for_each_entry_continue(file, &tr->events, list) {
|
|
|
|
call = file->event_call;
|
2009-03-10 15:32:40 +00:00
|
|
|
/*
|
|
|
|
* The ftrace subsystem is for showing formats only.
|
|
|
|
* They can not be enabled or disabled via the event files.
|
|
|
|
*/
|
2016-02-24 14:04:24 +00:00
|
|
|
if (call->class && call->class->reg &&
|
|
|
|
!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
|
2012-05-04 03:09:03 +00:00
|
|
|
return file;
|
2009-03-10 15:32:40 +00:00
|
|
|
}
|
2009-02-24 15:21:36 +00:00
|
|
|
|
2009-09-18 06:07:05 +00:00
|
|
|
return NULL;
|
2009-02-24 15:21:36 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Take event_mutex and walk tr->events with t_next() until the entry at
 * position *@pos is reached. Returns that entry, or NULL if the list ends
 * first. The mutex is left held on return; t_stop() releases it.
 */
static void *t_start(struct seq_file *m, loff_t *pos)
{
	struct trace_array *tr = m->private;
	struct trace_event_file *file;
	loff_t l = 0;

	mutex_lock(&event_mutex);

	/* Start from the list head so the first t_next() yields the first entry */
	file = list_entry(&tr->events, struct trace_event_file, list);
	while (l <= *pos) {
		file = t_next(m, file, &l);
		if (!file)
			break;
	}

	return file;
}
|
|
|
|
|
|
|
|
static void *
|
|
|
|
s_next(struct seq_file *m, void *v, loff_t *pos)
|
|
|
|
{
|
2015-05-05 14:09:53 +00:00
|
|
|
struct trace_event_file *file = v;
|
2012-05-04 03:09:03 +00:00
|
|
|
struct trace_array *tr = m->private;
|
2009-02-24 15:21:36 +00:00
|
|
|
|
|
|
|
(*pos)++;
|
|
|
|
|
2012-05-04 03:09:03 +00:00
|
|
|
list_for_each_entry_continue(file, &tr->events, list) {
|
2015-05-13 19:12:33 +00:00
|
|
|
if (file->flags & EVENT_FILE_FL_ENABLED)
|
2012-05-04 03:09:03 +00:00
|
|
|
return file;
|
2009-02-24 15:21:36 +00:00
|
|
|
}
|
|
|
|
|
2009-09-18 06:07:05 +00:00
|
|
|
return NULL;
|
2009-02-24 15:21:36 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Take event_mutex and walk tr->events with s_next() until the entry at
 * position *@pos is reached. Returns that entry, or NULL if the list ends
 * first. The mutex is left held on return.
 */
static void *s_start(struct seq_file *m, loff_t *pos)
{
	struct trace_array *tr = m->private;
	struct trace_event_file *file;
	loff_t l = 0;

	mutex_lock(&event_mutex);

	/* Start from the list head so the first s_next() yields the first entry */
	file = list_entry(&tr->events, struct trace_event_file, list);
	while (l <= *pos) {
		file = s_next(m, file, &l);
		if (!file)
			break;
	}

	return file;
}
|
|
|
|
|
|
|
|
static int t_show(struct seq_file *m, void *v)
|
|
|
|
{
|
2015-05-05 14:09:53 +00:00
|
|
|
struct trace_event_file *file = v;
|
2015-05-05 15:45:27 +00:00
|
|
|
struct trace_event_call *call = file->event_call;
|
2009-02-24 15:21:36 +00:00
|
|
|
|
2010-04-20 14:47:33 +00:00
|
|
|
if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
|
|
|
|
seq_printf(m, "%s:", call->class->system);
|
2015-05-13 18:20:14 +00:00
|
|
|
seq_printf(m, "%s\n", trace_event_name(call));
|
2009-02-24 15:21:36 +00:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void t_stop(struct seq_file *m, void *p)
|
|
|
|
{
|
2009-05-06 02:33:45 +00:00
|
|
|
mutex_unlock(&event_mutex);
|
2009-02-24 15:21:36 +00:00
|
|
|
}
|
|
|
|
|
2016-04-13 20:27:49 +00:00
|
|
|
static void *
|
2020-03-25 23:51:19 +00:00
|
|
|
__next(struct seq_file *m, void *v, loff_t *pos, int type)
|
2016-04-13 20:27:49 +00:00
|
|
|
{
|
|
|
|
struct trace_array *tr = m->private;
|
2020-03-25 23:51:19 +00:00
|
|
|
struct trace_pid_list *pid_list;
|
|
|
|
|
|
|
|
if (type == TRACE_PIDS)
|
|
|
|
pid_list = rcu_dereference_sched(tr->filtered_pids);
|
|
|
|
else
|
|
|
|
pid_list = rcu_dereference_sched(tr->filtered_no_pids);
|
2016-04-13 20:27:49 +00:00
|
|
|
|
2016-04-20 19:19:54 +00:00
|
|
|
return trace_pid_next(pid_list, v, pos);
|
2016-04-13 20:27:49 +00:00
|
|
|
}
|
|
|
|
|
2020-03-25 23:51:19 +00:00
|
|
|
static void *
|
|
|
|
p_next(struct seq_file *m, void *v, loff_t *pos)
|
|
|
|
{
|
|
|
|
return __next(m, v, pos, TRACE_PIDS);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void *
|
|
|
|
np_next(struct seq_file *m, void *v, loff_t *pos)
|
|
|
|
{
|
|
|
|
return __next(m, v, pos, TRACE_NO_PIDS);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Common "start" step for the pid list iterators. Takes event_mutex and
 * rcu_read_lock_sched(), both of which stay held on return (p_stop()
 * releases them). Returns NULL when the list selected by @type is not set,
 * otherwise the result of trace_pid_start().
 */
static void *__start(struct seq_file *m, loff_t *pos, int type)
	__acquires(RCU)
{
	struct trace_array *tr = m->private;
	struct trace_pid_list *pid_list;

	/*
	 * Grab the mutex, to keep calls to p_next() having the same
	 * tr->filtered_pids as p_start() has.
	 * If we just passed the tr->filtered_pids around, then RCU would
	 * have been enough, but doing that makes things more complex.
	 */
	mutex_lock(&event_mutex);
	rcu_read_lock_sched();

	pid_list = (type == TRACE_PIDS) ?
		rcu_dereference_sched(tr->filtered_pids) :
		rcu_dereference_sched(tr->filtered_no_pids);

	return pid_list ? trace_pid_start(pid_list, pos) : NULL;
}
|
|
|
|
|
2020-03-25 23:51:19 +00:00
|
|
|
/* "start" step over tr->filtered_pids (TRACE_PIDS); see __start(). */
static void *p_start(struct seq_file *m, loff_t *pos)
	__acquires(RCU)
{
	return __start(m, pos, TRACE_PIDS);
}
|
|
|
|
|
|
|
|
/* "start" step over tr->filtered_no_pids (TRACE_NO_PIDS); see __start(). */
static void *np_start(struct seq_file *m, loff_t *pos)
	__acquires(RCU)
{
	return __start(m, pos, TRACE_NO_PIDS);
}
|
|
|
|
|
2015-09-24 15:33:26 +00:00
|
|
|
static void p_stop(struct seq_file *m, void *p)
|
2015-10-26 07:45:22 +00:00
|
|
|
__releases(RCU)
|
2015-09-24 15:33:26 +00:00
|
|
|
{
|
|
|
|
rcu_read_unlock_sched();
|
|
|
|
mutex_unlock(&event_mutex);
|
|
|
|
}
|
|
|
|
|
2009-02-24 19:15:08 +00:00
|
|
|
static ssize_t
|
|
|
|
event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
|
|
|
|
loff_t *ppos)
|
|
|
|
{
|
2015-05-05 14:09:53 +00:00
|
|
|
struct trace_event_file *file;
|
2013-07-26 17:25:36 +00:00
|
|
|
unsigned long flags;
|
2013-06-29 05:08:04 +00:00
|
|
|
char buf[4] = "0";
|
|
|
|
|
2013-07-26 17:25:36 +00:00
|
|
|
mutex_lock(&event_mutex);
|
tracing: Have format file honor EVENT_FILE_FL_FREED
When eventfs was introduced, special care had to be done to coordinate the
freeing of the file meta data with the files that are exposed to user
space. The file meta data would have a ref count that is set when the file
is created and would be decremented and freed after the last user that
opened the file closed it. When the file meta data was to be freed, it
would set a flag (EVENT_FILE_FL_FREED) to denote that the file is freed,
and any new references made (like new opens or reads) would fail as it is
marked freed. This allowed other meta data to be freed after this flag was
set (under the event_mutex).
All the files that were dynamically created in the events directory had a
pointer to the file meta data and would call event_release() when the last
reference to the user space file was closed. This would be the time that it
is safe to free the file meta data.
A shortcut was made for the "format" file. It's i_private would point to
the "call" entry directly and not point to the file's meta data. This is
because all format files are the same for the same "call", so it was
thought there was no reason to differentiate them. The other files
maintain state (like the "enable", "trigger", etc). But this meant if the
file were to disappear, the "format" file would be unaware of it.
This caused a race that could be trigger via the user_events test (that
would create dynamic events and free them), and running a loop that would
read the user_events format files:
In one console run:
# cd tools/testing/selftests/user_events
# while true; do ./ftrace_test; done
And in another console run:
# cd /sys/kernel/tracing/
# while true; do cat events/user_events/__test_event/format; done 2>/dev/null
With KASAN memory checking, it would trigger a use-after-free bug report
(which was a real bug). This was because the format file was not checking
the file's meta data flag "EVENT_FILE_FL_FREED", so it would access the
event that the file meta data pointed to after the event was freed.
After inspection, there are other locations that were found to not check
the EVENT_FILE_FL_FREED flag when accessing the trace_event_file. Add a
new helper function: event_file_file() that will make sure that the
event_mutex is held, and will return NULL if the trace_event_file has the
EVENT_FILE_FL_FREED flag set. Have the first reference of the struct file
pointer use event_file_file() and check for NULL. Later uses can still use
the event_file_data() helper function if the event_mutex is still held and
was not released since the event_file_file() call.
Link: https://lore.kernel.org/all/20240719204701.1605950-1-minipli@grsecurity.net/
Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Ajay Kaher <ajay.kaher@broadcom.com>
Cc: Ilkka Naulapää <digirigawa@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Dan Carpenter <dan.carpenter@linaro.org>
Cc: Beau Belgrave <beaub@linux.microsoft.com>
Cc: Florian Fainelli <florian.fainelli@broadcom.com>
Cc: Alexey Makhalov <alexey.makhalov@broadcom.com>
Cc: Vasavi Sirnapalli <vasavi.sirnapalli@broadcom.com>
Link: https://lore.kernel.org/20240730110657.3b69d3c1@gandalf.local.home
Fixes: b63db58e2fa5d ("eventfs/tracing: Add callback for release of an eventfs_inode")
Reported-by: Mathias Krause <minipli@grsecurity.net>
Tested-by: Mathias Krause <minipli@grsecurity.net>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2024-07-30 15:06:57 +00:00
|
|
|
file = event_file_file(filp);
|
2013-07-26 17:25:36 +00:00
|
|
|
if (likely(file))
|
|
|
|
flags = file->flags;
|
|
|
|
mutex_unlock(&event_mutex);
|
|
|
|
|
tracing: Have format file honor EVENT_FILE_FL_FREED
When eventfs was introduced, special care had to be done to coordinate the
freeing of the file meta data with the files that are exposed to user
space. The file meta data would have a ref count that is set when the file
is created and would be decremented and freed after the last user that
opened the file closed it. When the file meta data was to be freed, it
would set a flag (EVENT_FILE_FL_FREED) to denote that the file is freed,
and any new references made (like new opens or reads) would fail as it is
marked freed. This allowed other meta data to be freed after this flag was
set (under the event_mutex).
All the files that were dynamically created in the events directory had a
pointer to the file meta data and would call event_release() when the last
reference to the user space file was closed. This would be the time that it
is safe to free the file meta data.
A shortcut was made for the "format" file. It's i_private would point to
the "call" entry directly and not point to the file's meta data. This is
because all format files are the same for the same "call", so it was
thought there was no reason to differentiate them. The other files
maintain state (like the "enable", "trigger", etc). But this meant if the
file were to disappear, the "format" file would be unaware of it.
This caused a race that could be trigger via the user_events test (that
would create dynamic events and free them), and running a loop that would
read the user_events format files:
In one console run:
# cd tools/testing/selftests/user_events
# while true; do ./ftrace_test; done
And in another console run:
# cd /sys/kernel/tracing/
# while true; do cat events/user_events/__test_event/format; done 2>/dev/null
With KASAN memory checking, it would trigger a use-after-free bug report
(which was a real bug). This was because the format file was not checking
the file's meta data flag "EVENT_FILE_FL_FREED", so it would access the
event that the file meta data pointed to after the event was freed.
After inspection, there are other locations that were found to not check
the EVENT_FILE_FL_FREED flag when accessing the trace_event_file. Add a
new helper function: event_file_file() that will make sure that the
event_mutex is held, and will return NULL if the trace_event_file has the
EVENT_FILE_FL_FREED flag set. Have the first reference of the struct file
pointer use event_file_file() and check for NULL. Later uses can still use
the event_file_data() helper function if the event_mutex is still held and
was not released since the event_file_file() call.
Link: https://lore.kernel.org/all/20240719204701.1605950-1-minipli@grsecurity.net/
Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Ajay Kaher <ajay.kaher@broadcom.com>
Cc: Ilkka Naulapää <digirigawa@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Dan Carpenter <dan.carpenter@linaro.org>
Cc: Beau Belgrave <beaub@linux.microsoft.com>
Cc: Florian Fainelli <florian.fainelli@broadcom.com>
Cc: Alexey Makhalov <alexey.makhalov@broadcom.com>
Cc: Vasavi Sirnapalli <vasavi.sirnapalli@broadcom.com>
Link: https://lore.kernel.org/20240730110657.3b69d3c1@gandalf.local.home
Fixes: b63db58e2fa5d ("eventfs/tracing: Add callback for release of an eventfs_inode")
Reported-by: Mathias Krause <minipli@grsecurity.net>
Tested-by: Mathias Krause <minipli@grsecurity.net>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2024-07-30 15:06:57 +00:00
|
|
|
if (!file)
|
2013-07-26 17:25:36 +00:00
|
|
|
return -ENODEV;
|
|
|
|
|
2015-05-13 19:12:33 +00:00
|
|
|
if (flags & EVENT_FILE_FL_ENABLED &&
|
|
|
|
!(flags & EVENT_FILE_FL_SOFT_DISABLED))
|
2013-06-29 05:08:04 +00:00
|
|
|
strcpy(buf, "1");
|
|
|
|
|
2015-05-13 19:12:33 +00:00
|
|
|
if (flags & EVENT_FILE_FL_SOFT_DISABLED ||
|
|
|
|
flags & EVENT_FILE_FL_SOFT_MODE)
|
2013-06-29 05:08:04 +00:00
|
|
|
strcat(buf, "*");
|
|
|
|
|
|
|
|
strcat(buf, "\n");
|
2009-02-24 19:15:08 +00:00
|
|
|
|
2013-03-12 17:26:18 +00:00
|
|
|
return simple_read_from_buffer(ubuf, cnt, ppos, buf, strlen(buf));
|
2009-02-24 19:15:08 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static ssize_t
|
|
|
|
event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
|
|
|
|
loff_t *ppos)
|
|
|
|
{
|
2015-05-05 14:09:53 +00:00
|
|
|
struct trace_event_file *file;
|
2009-02-24 19:15:08 +00:00
|
|
|
unsigned long val;
|
|
|
|
int ret;
|
|
|
|
|
2011-06-07 19:58:27 +00:00
|
|
|
ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
|
|
|
|
if (ret)
|
2009-02-24 19:15:08 +00:00
|
|
|
return ret;
|
|
|
|
|
|
|
|
switch (val) {
|
|
|
|
case 0:
|
|
|
|
case 1:
|
2013-07-26 17:25:36 +00:00
|
|
|
ret = -ENODEV;
|
2009-03-02 16:49:04 +00:00
|
|
|
mutex_lock(&event_mutex);
|
tracing: Have format file honor EVENT_FILE_FL_FREED
When eventfs was introduced, special care had to be done to coordinate the
freeing of the file meta data with the files that are exposed to user
space. The file meta data would have a ref count that is set when the file
is created and would be decremented and freed after the last user that
opened the file closed it. When the file meta data was to be freed, it
would set a flag (EVENT_FILE_FL_FREED) to denote that the file is freed,
and any new references made (like new opens or reads) would fail as it is
marked freed. This allowed other meta data to be freed after this flag was
set (under the event_mutex).
All the files that were dynamically created in the events directory had a
pointer to the file meta data and would call event_release() when the last
reference to the user space file was closed. This would be the time that it
is safe to free the file meta data.
A shortcut was made for the "format" file. It's i_private would point to
the "call" entry directly and not point to the file's meta data. This is
because all format files are the same for the same "call", so it was
thought there was no reason to differentiate them. The other files
maintain state (like the "enable", "trigger", etc). But this meant if the
file were to disappear, the "format" file would be unaware of it.
This caused a race that could be trigger via the user_events test (that
would create dynamic events and free them), and running a loop that would
read the user_events format files:
In one console run:
# cd tools/testing/selftests/user_events
# while true; do ./ftrace_test; done
And in another console run:
# cd /sys/kernel/tracing/
# while true; do cat events/user_events/__test_event/format; done 2>/dev/null
With KASAN memory checking, it would trigger a use-after-free bug report
(which was a real bug). This was because the format file was not checking
the file's meta data flag "EVENT_FILE_FL_FREED", so it would access the
event that the file meta data pointed to after the event was freed.
After inspection, there are other locations that were found to not check
the EVENT_FILE_FL_FREED flag when accessing the trace_event_file. Add a
new helper function: event_file_file() that will make sure that the
event_mutex is held, and will return NULL if the trace_event_file has the
EVENT_FILE_FL_FREED flag set. Have the first reference of the struct file
pointer use event_file_file() and check for NULL. Later uses can still use
the event_file_data() helper function if the event_mutex is still held and
was not released since the event_file_file() call.
Link: https://lore.kernel.org/all/20240719204701.1605950-1-minipli@grsecurity.net/
Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Ajay Kaher <ajay.kaher@broadcom.com>
Cc: Ilkka Naulapää <digirigawa@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Dan Carpenter <dan.carpenter@linaro.org>
Cc: Beau Belgrave <beaub@linux.microsoft.com>
Cc: Florian Fainelli <florian.fainelli@broadcom.com>
Cc: Alexey Makhalov <alexey.makhalov@broadcom.com>
Cc: Vasavi Sirnapalli <vasavi.sirnapalli@broadcom.com>
Link: https://lore.kernel.org/20240730110657.3b69d3c1@gandalf.local.home
Fixes: b63db58e2fa5d ("eventfs/tracing: Add callback for release of an eventfs_inode")
Reported-by: Mathias Krause <minipli@grsecurity.net>
Tested-by: Mathias Krause <minipli@grsecurity.net>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2024-07-30 15:06:57 +00:00
|
|
|
file = event_file_file(filp);
|
|
|
|
if (likely(file)) {
|
2023-09-06 09:18:37 +00:00
|
|
|
ret = tracing_update_buffers(file->tr);
|
|
|
|
if (ret < 0) {
|
|
|
|
mutex_unlock(&event_mutex);
|
|
|
|
return ret;
|
|
|
|
}
|
2013-07-26 17:25:36 +00:00
|
|
|
ret = ftrace_event_enable_disable(file, val);
|
2023-09-06 09:18:37 +00:00
|
|
|
}
|
2009-03-02 16:49:04 +00:00
|
|
|
mutex_unlock(&event_mutex);
|
2009-02-24 19:15:08 +00:00
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
*ppos += cnt;
|
|
|
|
|
2009-12-08 03:14:52 +00:00
|
|
|
return ret ? ret : cnt;
|
2009-02-24 19:15:08 +00:00
|
|
|
}
|
|
|
|
|
2009-05-07 02:52:15 +00:00
|
|
|
static ssize_t
|
|
|
|
system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
|
|
|
|
loff_t *ppos)
|
|
|
|
{
|
2009-05-08 02:32:05 +00:00
|
|
|
const char set_to_char[4] = { '?', '0', '1', 'X' };
|
2015-05-13 18:59:40 +00:00
|
|
|
struct trace_subsystem_dir *dir = filp->private_data;
|
2012-05-04 03:09:03 +00:00
|
|
|
struct event_subsystem *system = dir->subsystem;
|
2015-05-05 15:45:27 +00:00
|
|
|
struct trace_event_call *call;
|
2015-05-05 14:09:53 +00:00
|
|
|
struct trace_event_file *file;
|
2012-05-04 03:09:03 +00:00
|
|
|
struct trace_array *tr = dir->tr;
|
2009-05-07 02:52:15 +00:00
|
|
|
char buf[2];
|
2009-05-08 02:32:05 +00:00
|
|
|
int set = 0;
|
2009-05-07 02:52:15 +00:00
|
|
|
int ret;
|
|
|
|
|
|
|
|
mutex_lock(&event_mutex);
|
2012-05-04 03:09:03 +00:00
|
|
|
list_for_each_entry(file, &tr->events, list) {
|
|
|
|
call = file->event_call;
|
tracing: Do not count ftrace events in top level enable output
The file /sys/kernel/tracing/events/enable is used to enable all events by
echoing in "1", or disabling all events when echoing in "0". To know if all
events are enabled, disabled, or some are enabled but not all of them,
cating the file should show either "1" (all enabled), "0" (all disabled), or
"X" (some enabled but not all of them). This works the same as the "enable"
files in the individule system directories (like tracing/events/sched/enable).
But when all events are enabled, the top level "enable" file shows "X". The
reason is that its checking the "ftrace" events, which are special events
that only exist for their format files. These include the format for the
function tracer events, that are enabled when the function tracer is
enabled, but not by the "enable" file. The check includes these events,
which will always be disabled, and even though all true events are enabled,
the top level "enable" file will show "X" instead of "1".
To fix this, have the check test the event's flags to see if it has the
"IGNORE_ENABLE" flag set, and if so, not test it.
Cc: stable@vger.kernel.org
Fixes: 553552ce1796c ("tracing: Combine event filter_active and enable into single flags field")
Reported-by: "Yordan Karadzhov (VMware)" <y.karadz@gmail.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2021-02-05 20:40:04 +00:00
|
|
|
if ((call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) ||
|
|
|
|
!trace_event_name(call) || !call->class || !call->class->reg)
|
2009-05-07 02:52:15 +00:00
|
|
|
continue;
|
|
|
|
|
2011-07-05 18:32:51 +00:00
|
|
|
if (system && strcmp(call->class->system, system->name) != 0)
|
2009-05-07 02:52:15 +00:00
|
|
|
continue;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We need to find out if all the events are set
|
|
|
|
* or if all events or cleared, or if we have
|
|
|
|
* a mixture.
|
|
|
|
*/
|
2015-05-13 19:12:33 +00:00
|
|
|
set |= (1 << !!(file->flags & EVENT_FILE_FL_ENABLED));
|
2009-05-08 02:32:05 +00:00
|
|
|
|
2009-05-07 02:52:15 +00:00
|
|
|
/*
|
|
|
|
* If we have a mixture, no need to look further.
|
|
|
|
*/
|
2009-05-08 02:32:05 +00:00
|
|
|
if (set == 3)
|
2009-05-07 02:52:15 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
mutex_unlock(&event_mutex);
|
|
|
|
|
2009-05-08 02:32:05 +00:00
|
|
|
buf[0] = set_to_char[set];
|
2009-05-07 02:52:15 +00:00
|
|
|
buf[1] = '\n';
|
|
|
|
|
|
|
|
ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, 2);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static ssize_t
|
|
|
|
system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
|
|
|
|
loff_t *ppos)
|
|
|
|
{
|
2015-05-13 18:59:40 +00:00
|
|
|
struct trace_subsystem_dir *dir = filp->private_data;
|
2012-05-04 03:09:03 +00:00
|
|
|
struct event_subsystem *system = dir->subsystem;
|
2011-07-05 18:32:51 +00:00
|
|
|
const char *name = NULL;
|
2009-05-07 02:52:15 +00:00
|
|
|
unsigned long val;
|
|
|
|
ssize_t ret;
|
|
|
|
|
2011-06-07 19:58:27 +00:00
|
|
|
ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
|
|
|
|
if (ret)
|
2009-05-07 02:52:15 +00:00
|
|
|
return ret;
|
|
|
|
|
2023-09-06 09:18:37 +00:00
|
|
|
ret = tracing_update_buffers(dir->tr);
|
2009-05-07 02:52:15 +00:00
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
|
|
|
|
2009-05-08 02:31:42 +00:00
|
|
|
if (val != 0 && val != 1)
|
2009-05-07 02:52:15 +00:00
|
|
|
return -EINVAL;
|
|
|
|
|
2011-07-05 18:32:51 +00:00
|
|
|
/*
|
|
|
|
* Opening of "enable" adds a ref count to system,
|
|
|
|
* so the name is safe to use.
|
|
|
|
*/
|
|
|
|
if (system)
|
|
|
|
name = system->name;
|
|
|
|
|
2012-05-04 03:09:03 +00:00
|
|
|
ret = __ftrace_set_clr_event(dir->tr, NULL, name, NULL, val);
|
2009-05-07 02:52:15 +00:00
|
|
|
if (ret)
|
2009-05-08 02:31:42 +00:00
|
|
|
goto out;
|
2009-05-07 02:52:15 +00:00
|
|
|
|
|
|
|
ret = cnt;
|
|
|
|
|
2009-05-08 02:31:42 +00:00
|
|
|
out:
|
2009-05-07 02:52:15 +00:00
|
|
|
*ppos += cnt;
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2010-06-03 19:21:34 +00:00
|
|
|
/*
 * Sentinel values used by the format file iterator. f_next() casts
 * them to and from the iterator's void * cookie, so they stand in for
 * positions that are not list nodes. Values are 1, 2 and 3.
 */
enum {
	FORMAT_HEADER = 1,		/* first position */
	FORMAT_FIELD_SEPERATOR,		/* between common and event fields */
	FORMAT_PRINTFMT,		/* last position before the end */
};
|
|
|
|
|
|
|
|
/*
 * seq_file next callback for the format file.
 *
 * @m:   seq_file; the event file is looked up from m->private
 * @v:   current cookie: one of the FORMAT_* sentinels or a list node
 * @pos: position counter, always incremented by one
 *
 * Iteration order: FORMAT_HEADER, then the ftrace_common_fields list
 * walked through ->prev, then FORMAT_FIELD_SEPERATOR, then the list
 * from trace_get_fields() walked through ->prev, then FORMAT_PRINTFMT,
 * then NULL. An empty list simply advances to the following sentinel.
 *
 * Returns the next cookie, or NULL once FORMAT_PRINTFMT was the
 * current one.
 */
static void *f_next(struct seq_file *m, void *v, loff_t *pos)
{
	struct trace_event_file *file = event_file_data(m->private);
	struct trace_event_call *call = file->event_call;
	struct list_head *common_head = &ftrace_common_fields;
	struct list_head *head = trace_get_fields(call);
	unsigned long token = (unsigned long)v;
	struct list_head *cur;

	(*pos)++;

	/* Nothing follows the print format. */
	if (token == FORMAT_PRINTFMT)
		return NULL;

	/*
	 * A sentinel means "start of the next list": begin at that
	 * list's head so the ->prev step below lands on its first
	 * entry to report. Otherwise continue from the current node.
	 */
	if (token == FORMAT_HEADER)
		cur = common_head;
	else if (token == FORMAT_FIELD_SEPERATOR)
		cur = head;
	else
		cur = v;

	cur = cur->prev;

	/* Wrapping back to a list head means that list is exhausted. */
	if (cur == common_head)
		return (void *)FORMAT_FIELD_SEPERATOR;
	if (cur == head)
		return (void *)FORMAT_PRINTFMT;

	return cur;
}
|
|
|
|
|
|
|
|
static int f_show(struct seq_file *m, void *v)
|
|
|
|
{
|
tracing: Have format file honor EVENT_FILE_FL_FREED
When eventfs was introduced, special care had to be done to coordinate the
freeing of the file meta data with the files that are exposed to user
space. The file meta data would have a ref count that is set when the file
is created and would be decremented and freed after the last user that
opened the file closed it. When the file meta data was to be freed, it
would set a flag (EVENT_FILE_FL_FREED) to denote that the file is freed,
and any new references made (like new opens or reads) would fail as it is
marked freed. This allowed other meta data to be freed after this flag was
set (under the event_mutex).
All the files that were dynamically created in the events directory had a
pointer to the file meta data and would call event_release() when the last
reference to the user space file was closed. This would be the time that it
is safe to free the file meta data.
A shortcut was made for the "format" file. It's i_private would point to
the "call" entry directly and not point to the file's meta data. This is
because all format files are the same for the same "call", so it was
thought there was no reason to differentiate them. The other files
maintain state (like the "enable", "trigger", etc). But this meant if the
file were to disappear, the "format" file would be unaware of it.
This caused a race that could be trigger via the user_events test (that
would create dynamic events and free them), and running a loop that would
read the user_events format files:
In one console run:
# cd tools/testing/selftests/user_events
# while true; do ./ftrace_test; done
And in another console run:
# cd /sys/kernel/tracing/
# while true; do cat events/user_events/__test_event/format; done 2>/dev/null
With KASAN memory checking, it would trigger a use-after-free bug report
(which was a real bug). This was because the format file was not checking
the file's meta data flag "EVENT_FILE_FL_FREED", so it would access the
event that the file meta data pointed to after the event was freed.
After inspection, there are other locations that were found to not check
the EVENT_FILE_FL_FREED flag when accessing the trace_event_file. Add a
new helper function: event_file_file() that will make sure that the
event_mutex is held, and will return NULL if the trace_event_file has the
EVENT_FILE_FL_FREED flag set. Have the first reference of the struct file
pointer use event_file_file() and check for NULL. Later uses can still use
the event_file_data() helper function if the event_mutex is still held and
was not released since the event_file_file() call.
Link: https://lore.kernel.org/all/20240719204701.1605950-1-minipli@grsecurity.net/
Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Ajay Kaher <ajay.kaher@broadcom.com>
Cc: Ilkka Naulapää <digirigawa@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Dan Carpenter <dan.carpenter@linaro.org>
Cc: Beau Belgrave <beaub@linux.microsoft.com>
Cc: Florian Fainelli <florian.fainelli@broadcom.com>
Cc: Alexey Makhalov <alexey.makhalov@broadcom.com>
Cc: Vasavi Sirnapalli <vasavi.sirnapalli@broadcom.com>
Link: https://lore.kernel.org/20240730110657.3b69d3c1@gandalf.local.home
Fixes: b63db58e2fa5d ("eventfs/tracing: Add callback for release of an eventfs_inode")
Reported-by: Mathias Krause <minipli@grsecurity.net>
Tested-by: Mathias Krause <minipli@grsecurity.net>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2024-07-30 15:06:57 +00:00
|
|
|
struct trace_event_file *file = event_file_data(m->private);
|
|
|
|
struct trace_event_call *call = file->event_call;
|
2010-06-03 19:21:34 +00:00
|
|
|
struct ftrace_event_field *field;
|
|
|
|
const char *array_descriptor;
|
|
|
|
|
|
|
|
switch ((unsigned long)v) {
|
|
|
|
case FORMAT_HEADER:
|
2015-05-13 18:20:14 +00:00
|
|
|
seq_printf(m, "name: %s\n", trace_event_name(call));
|
2010-06-03 19:21:34 +00:00
|
|
|
seq_printf(m, "ID: %d\n", call->event.type);
|
2014-11-08 20:42:10 +00:00
|
|
|
seq_puts(m, "format:\n");
|
2010-05-24 08:22:49 +00:00
|
|
|
return 0;
|
2009-12-15 07:39:53 +00:00
|
|
|
|
2010-08-17 05:53:06 +00:00
|
|
|
case FORMAT_FIELD_SEPERATOR:
|
|
|
|
seq_putc(m, '\n');
|
|
|
|
return 0;
|
|
|
|
|
2010-06-03 19:21:34 +00:00
|
|
|
case FORMAT_PRINTFMT:
|
|
|
|
seq_printf(m, "\nprint fmt: %s\n",
|
|
|
|
call->print_fmt);
|
|
|
|
return 0;
|
2009-03-02 18:53:59 +00:00
|
|
|
}
|
2010-05-24 08:22:49 +00:00
|
|
|
|
tracing: Simplify the iteration logic in f_start/f_next
f_next() looks overcomplicated, and it is not strictly correct
even if this doesn't matter.
Say, FORMAT_FIELD_SEPERATOR should not return NULL (means EOF)
if trace_get_fields() returns an empty list, we should simply
advance to FORMAT_PRINTFMT as we do when we find the end of list.
1. Change f_next() to return "struct list_head *" rather than
"ftrace_event_field *", and change f_show() to do list_entry().
This simplifies the code a bit, only f_show() needs to know
about ftrace_event_field, and f_next() can play with ->prev
directly
2. Change f_next() to not play with ->prev / return inside the
switch() statement. It can simply set node = head/common_head,
the prev-or-advance-to-the-next-magic below does all work.
While at it. f_start() looks overcomplicated too. I don't think
*pos == 0 makes sense as a separate case, just change this code
to do "while" instead of "do/while".
The patch also moves f_start() down, close to f_stop(). This is
purely cosmetic, just to make the locking added by the next patch
more clear/visible.
Link: http://lkml.kernel.org/r/20130718184710.GA4783@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2013-07-18 18:47:10 +00:00
|
|
|
field = list_entry(v, struct ftrace_event_field, link);
|
2010-06-03 19:21:34 +00:00
|
|
|
/*
|
|
|
|
* Smartly shows the array type(except dynamic array).
|
|
|
|
* Normal:
|
|
|
|
* field:TYPE VAR
|
|
|
|
* If TYPE := TYPE[LEN], it is shown:
|
|
|
|
* field:TYPE VAR[LEN]
|
|
|
|
*/
|
|
|
|
array_descriptor = strchr(field->type, '[');
|
2010-05-24 08:22:49 +00:00
|
|
|
|
2018-12-20 18:20:07 +00:00
|
|
|
if (str_has_prefix(field->type, "__data_loc"))
|
2010-06-03 19:21:34 +00:00
|
|
|
array_descriptor = NULL;
|
2010-05-24 08:22:49 +00:00
|
|
|
|
2010-06-03 19:21:34 +00:00
|
|
|
if (!array_descriptor)
|
|
|
|
seq_printf(m, "\tfield:%s %s;\toffset:%u;\tsize:%u;\tsigned:%d;\n",
|
|
|
|
field->type, field->name, field->offset,
|
|
|
|
field->size, !!field->is_signed);
|
2023-02-12 15:13:03 +00:00
|
|
|
else if (field->len)
|
|
|
|
seq_printf(m, "\tfield:%.*s %s[%d];\toffset:%u;\tsize:%u;\tsigned:%d;\n",
|
2010-06-03 19:21:34 +00:00
|
|
|
(int)(array_descriptor - field->type),
|
|
|
|
field->type, field->name,
|
2023-02-12 15:13:03 +00:00
|
|
|
field->len, field->offset,
|
2010-06-03 19:21:34 +00:00
|
|
|
field->size, !!field->is_signed);
|
2023-02-12 15:13:03 +00:00
|
|
|
else
|
|
|
|
seq_printf(m, "\tfield:%.*s %s[];\toffset:%u;\tsize:%u;\tsigned:%d;\n",
|
|
|
|
(int)(array_descriptor - field->type),
|
|
|
|
field->type, field->name,
|
|
|
|
field->offset, field->size, !!field->is_signed);
|
2010-05-24 08:22:49 +00:00
|
|
|
|
2010-06-03 19:21:34 +00:00
|
|
|
return 0;
|
|
|
|
}
|
2009-12-15 07:39:53 +00:00
|
|
|
|
tracing: Simplify the iteration logic in f_start/f_next
f_next() looks overcomplicated, and it is not strictly correct
even if this doesn't matter.
Say, FORMAT_FIELD_SEPERATOR should not return NULL (means EOF)
if trace_get_fields() returns an empty list, we should simply
advance to FORMAT_PRINTFMT as we do when we find the end of list.
1. Change f_next() to return "struct list_head *" rather than
"ftrace_event_field *", and change f_show() to do list_entry().
This simplifies the code a bit, only f_show() needs to know
about ftrace_event_field, and f_next() can play with ->prev
directly
2. Change f_next() to not play with ->prev / return inside the
switch() statement. It can simply set node = head/common_head,
the prev-or-advance-to-the-next-magic below does all work.
While at it. f_start() looks overcomplicated too. I don't think
*pos == 0 makes sense as a separate case, just change this code
to do "while" instead of "do/while".
The patch also moves f_start() down, close to f_stop(). This is
purely cosmetic, just to make the locking added by the next patch
more clear/visible.
Link: http://lkml.kernel.org/r/20130718184710.GA4783@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2013-07-18 18:47:10 +00:00
|
|
|
/*
 * seq_file ->start() handler for an event's "format" file.
 *
 * Takes event_mutex (dropped again in f_stop()) and checks that the
 * event file can still be looked up.  It then walks from FORMAT_HEADER
 * with f_next() until position *pos is reached or the iteration ends.
 *
 * Returns the iterator token for *pos, NULL when *pos lies beyond the
 * last element, or ERR_PTR(-ENODEV) when event_file_file() finds no
 * usable event file.
 */
static void *f_start(struct seq_file *m, loff_t *pos)
{
	void *cursor;
	loff_t idx;

	/*
	 * The mutex is taken before anything can fail because ->stop()
	 * is called even if ->start() fails, and ->stop() unlocks it.
	 */
	mutex_lock(&event_mutex);

	if (!event_file_file(m->private))
		return ERR_PTR(-ENODEV);

	/* f_next() advances idx itself, hence no increment expression. */
	cursor = (void *)FORMAT_HEADER;
	for (idx = 0; cursor && idx < *pos; )
		cursor = f_next(m, cursor, &idx);

	return cursor;
}
|
|
|
|
|
2010-06-03 19:21:34 +00:00
|
|
|
static void f_stop(struct seq_file *m, void *p)
|
|
|
|
{
|
2013-07-26 17:25:43 +00:00
|
|
|
mutex_unlock(&event_mutex);
|
2010-06-03 19:21:34 +00:00
|
|
|
}
|
2009-03-02 18:53:59 +00:00
|
|
|
|
2010-06-03 19:21:34 +00:00
|
|
|
static const struct seq_operations trace_format_seq_ops = {
|
|
|
|
.start = f_start,
|
|
|
|
.next = f_next,
|
|
|
|
.stop = f_stop,
|
|
|
|
.show = f_show,
|
|
|
|
};
|
|
|
|
|
|
|
|
static int trace_format_open(struct inode *inode, struct file *file)
|
|
|
|
{
|
|
|
|
struct seq_file *m;
|
|
|
|
int ret;
|
|
|
|
|
2019-10-11 21:22:50 +00:00
|
|
|
/* Do we want to hide event format files on tracefs lockdown? */
|
|
|
|
|
2010-06-03 19:21:34 +00:00
|
|
|
ret = seq_open(file, &trace_format_seq_ops);
|
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
m = file->private_data;
|
2013-07-26 17:25:43 +00:00
|
|
|
m->private = file;
|
2010-06-03 19:21:34 +00:00
|
|
|
|
|
|
|
return 0;
|
2009-03-02 18:53:59 +00:00
|
|
|
}
|
|
|
|
|
2024-04-03 08:06:24 +00:00
|
|
|
#ifdef CONFIG_PERF_EVENTS
|
2009-03-19 19:26:13 +00:00
|
|
|
static ssize_t
|
|
|
|
event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
|
|
|
|
{
|
2013-07-26 17:25:32 +00:00
|
|
|
int id = (long)event_file_data(filp);
|
2013-07-18 18:47:12 +00:00
|
|
|
char buf[32];
|
|
|
|
int len;
|
2009-03-19 19:26:13 +00:00
|
|
|
|
2013-07-26 17:25:32 +00:00
|
|
|
if (unlikely(!id))
|
|
|
|
return -ENODEV;
|
|
|
|
|
|
|
|
len = sprintf(buf, "%d\n", id);
|
|
|
|
|
2013-07-18 18:47:12 +00:00
|
|
|
return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
|
2009-03-19 19:26:13 +00:00
|
|
|
}
|
2024-04-03 08:06:24 +00:00
|
|
|
#endif
|
2009-03-19 19:26:13 +00:00
|
|
|
|
2009-03-22 08:31:04 +00:00
|
|
|
static ssize_t
|
|
|
|
event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
|
|
|
|
loff_t *ppos)
|
|
|
|
{
|
2015-05-05 14:09:53 +00:00
|
|
|
struct trace_event_file *file;
|
2009-03-22 08:31:04 +00:00
|
|
|
struct trace_seq *s;
|
2013-07-26 17:25:40 +00:00
|
|
|
int r = -ENODEV;
|
2009-03-22 08:31:04 +00:00
|
|
|
|
|
|
|
if (*ppos)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
s = kmalloc(sizeof(*s), GFP_KERNEL);
|
2013-07-26 17:25:40 +00:00
|
|
|
|
2009-03-22 08:31:04 +00:00
|
|
|
if (!s)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
trace_seq_init(s);
|
|
|
|
|
2013-07-26 17:25:40 +00:00
|
|
|
mutex_lock(&event_mutex);
|
tracing: Have format file honor EVENT_FILE_FL_FREED
When eventfs was introduced, special care had to be done to coordinate the
freeing of the file meta data with the files that are exposed to user
space. The file meta data would have a ref count that is set when the file
is created and would be decremented and freed after the last user that
opened the file closed it. When the file meta data was to be freed, it
would set a flag (EVENT_FILE_FL_FREED) to denote that the file is freed,
and any new references made (like new opens or reads) would fail as it is
marked freed. This allowed other meta data to be freed after this flag was
set (under the event_mutex).
All the files that were dynamically created in the events directory had a
pointer to the file meta data and would call event_release() when the last
reference to the user space file was closed. This would be the time that it
is safe to free the file meta data.
A shortcut was made for the "format" file. It's i_private would point to
the "call" entry directly and not point to the file's meta data. This is
because all format files are the same for the same "call", so it was
thought there was no reason to differentiate them. The other files
maintain state (like the "enable", "trigger", etc). But this meant if the
file were to disappear, the "format" file would be unaware of it.
This caused a race that could be trigger via the user_events test (that
would create dynamic events and free them), and running a loop that would
read the user_events format files:
In one console run:
# cd tools/testing/selftests/user_events
# while true; do ./ftrace_test; done
And in another console run:
# cd /sys/kernel/tracing/
# while true; do cat events/user_events/__test_event/format; done 2>/dev/null
With KASAN memory checking, it would trigger a use-after-free bug report
(which was a real bug). This was because the format file was not checking
the file's meta data flag "EVENT_FILE_FL_FREED", so it would access the
event that the file meta data pointed to after the event was freed.
After inspection, there are other locations that were found to not check
the EVENT_FILE_FL_FREED flag when accessing the trace_event_file. Add a
new helper function: event_file_file() that will make sure that the
event_mutex is held, and will return NULL if the trace_event_file has the
EVENT_FILE_FL_FREED flag set. Have the first reference of the struct file
pointer use event_file_file() and check for NULL. Later uses can still use
the event_file_data() helper function if the event_mutex is still held and
was not released since the event_file_file() call.
Link: https://lore.kernel.org/all/20240719204701.1605950-1-minipli@grsecurity.net/
Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Ajay Kaher <ajay.kaher@broadcom.com>
Cc: Ilkka Naulapää <digirigawa@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Dan Carpenter <dan.carpenter@linaro.org>
Cc: Beau Belgrave <beaub@linux.microsoft.com>
Cc: Florian Fainelli <florian.fainelli@broadcom.com>
Cc: Alexey Makhalov <alexey.makhalov@broadcom.com>
Cc: Vasavi Sirnapalli <vasavi.sirnapalli@broadcom.com>
Link: https://lore.kernel.org/20240730110657.3b69d3c1@gandalf.local.home
Fixes: b63db58e2fa5d ("eventfs/tracing: Add callback for release of an eventfs_inode")
Reported-by: Mathias Krause <minipli@grsecurity.net>
Tested-by: Mathias Krause <minipli@grsecurity.net>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2024-07-30 15:06:57 +00:00
|
|
|
file = event_file_file(filp);
|
|
|
|
if (file)
|
2013-10-24 13:34:17 +00:00
|
|
|
print_event_filter(file, s);
|
2013-07-26 17:25:40 +00:00
|
|
|
mutex_unlock(&event_mutex);
|
|
|
|
|
2013-10-24 13:34:17 +00:00
|
|
|
if (file)
|
2014-11-14 20:49:41 +00:00
|
|
|
r = simple_read_from_buffer(ubuf, cnt, ppos,
|
|
|
|
s->buffer, trace_seq_used(s));
|
2009-03-22 08:31:04 +00:00
|
|
|
|
|
|
|
kfree(s);
|
|
|
|
|
|
|
|
return r;
|
|
|
|
}
|
|
|
|
|
|
|
|
static ssize_t
|
|
|
|
event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
|
|
|
|
loff_t *ppos)
|
|
|
|
{
|
2015-05-05 14:09:53 +00:00
|
|
|
struct trace_event_file *file;
|
tracing/filters: a better event parser
Replace the current event parser hack with a better one. Filters are
no longer specified predicate by predicate, but all at once and can
use parens and any of the following operators:
numeric fields:
==, !=, <, <=, >, >=
string fields:
==, !=
predicates can be combined with the logical operators:
&&, ||
examples:
"common_preempt_count > 4" > filter
"((sig >= 10 && sig < 15) || sig == 17) && comm != bash" > filter
If there was an error, the erroneous string along with an error
message can be seen by looking at the filter e.g.:
((sig >= 10 && sig < 15) || dsig == 17) && comm != bash
^
parse_error: Field not found
Currently the caret for an error always appears at the beginning of
the filter; a real position should be used, but the error message
should be useful even without it.
To clear a filter, '0' can be written to the filter file.
Filters can also be set or cleared for a complete subsystem by writing
the same filter as would be written to an individual event to the
filter file at the root of the subsytem. Note however, that if any
event in the subsystem lacks a field specified in the filter being
set, the set will fail and all filters in the subsytem are
automatically cleared. This change from the previous version was made
because using only the fields that happen to exist for a given event
would most likely result in a meaningless filter.
Because the logical operators are now implemented as predicates, the
maximum number of predicates in a filter was increased from 8 to 16.
[ Impact: add new, extended trace-filter implementation ]
Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: fweisbec@gmail.com
Cc: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <1240905899.6416.121.camel@tropicana>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-04-28 08:04:59 +00:00
|
|
|
char *buf;
|
2013-07-26 17:25:40 +00:00
|
|
|
int err = -ENODEV;
|
2009-03-22 08:31:04 +00:00
|
|
|
|
tracing/filters: a better event parser
Replace the current event parser hack with a better one. Filters are
no longer specified predicate by predicate, but all at once and can
use parens and any of the following operators:
numeric fields:
==, !=, <, <=, >, >=
string fields:
==, !=
predicates can be combined with the logical operators:
&&, ||
examples:
"common_preempt_count > 4" > filter
"((sig >= 10 && sig < 15) || sig == 17) && comm != bash" > filter
If there was an error, the erroneous string along with an error
message can be seen by looking at the filter e.g.:
((sig >= 10 && sig < 15) || dsig == 17) && comm != bash
^
parse_error: Field not found
Currently the caret for an error always appears at the beginning of
the filter; a real position should be used, but the error message
should be useful even without it.
To clear a filter, '0' can be written to the filter file.
Filters can also be set or cleared for a complete subsystem by writing
the same filter as would be written to an individual event to the
filter file at the root of the subsytem. Note however, that if any
event in the subsystem lacks a field specified in the filter being
set, the set will fail and all filters in the subsytem are
automatically cleared. This change from the previous version was made
because using only the fields that happen to exist for a given event
would most likely result in a meaningless filter.
Because the logical operators are now implemented as predicates, the
maximum number of predicates in a filter was increased from 8 to 16.
[ Impact: add new, extended trace-filter implementation ]
Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: fweisbec@gmail.com
Cc: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <1240905899.6416.121.camel@tropicana>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-04-28 08:04:59 +00:00
|
|
|
if (cnt >= PAGE_SIZE)
|
2009-03-22 08:31:04 +00:00
|
|
|
return -EINVAL;
|
|
|
|
|
2015-12-24 05:13:10 +00:00
|
|
|
buf = memdup_user_nul(ubuf, cnt);
|
|
|
|
if (IS_ERR(buf))
|
|
|
|
return PTR_ERR(buf);
|
2009-03-22 08:31:04 +00:00
|
|
|
|
2013-07-26 17:25:40 +00:00
|
|
|
mutex_lock(&event_mutex);
|
tracing: Have format file honor EVENT_FILE_FL_FREED
When eventfs was introduced, special care had to be done to coordinate the
freeing of the file meta data with the files that are exposed to user
space. The file meta data would have a ref count that is set when the file
is created and would be decremented and freed after the last user that
opened the file closed it. When the file meta data was to be freed, it
would set a flag (EVENT_FILE_FL_FREED) to denote that the file is freed,
and any new references made (like new opens or reads) would fail as it is
marked freed. This allowed other meta data to be freed after this flag was
set (under the event_mutex).
All the files that were dynamically created in the events directory had a
pointer to the file meta data and would call event_release() when the last
reference to the user space file was closed. This would be the time that it
is safe to free the file meta data.
A shortcut was made for the "format" file. It's i_private would point to
the "call" entry directly and not point to the file's meta data. This is
because all format files are the same for the same "call", so it was
thought there was no reason to differentiate them. The other files
maintain state (like the "enable", "trigger", etc). But this meant if the
file were to disappear, the "format" file would be unaware of it.
This caused a race that could be trigger via the user_events test (that
would create dynamic events and free them), and running a loop that would
read the user_events format files:
In one console run:
# cd tools/testing/selftests/user_events
# while true; do ./ftrace_test; done
And in another console run:
# cd /sys/kernel/tracing/
# while true; do cat events/user_events/__test_event/format; done 2>/dev/null
With KASAN memory checking, it would trigger a use-after-free bug report
(which was a real bug). This was because the format file was not checking
the file's meta data flag "EVENT_FILE_FL_FREED", so it would access the
event that the file meta data pointed to after the event was freed.
After inspection, there are other locations that were found to not check
the EVENT_FILE_FL_FREED flag when accessing the trace_event_file. Add a
new helper function: event_file_file() that will make sure that the
event_mutex is held, and will return NULL if the trace_event_file has the
EVENT_FILE_FL_FREED flag set. Have the first reference of the struct file
pointer use event_file_file() and check for NULL. Later uses can still use
the event_file_data() helper function if the event_mutex is still held and
was not released since the event_file_file() call.
Link: https://lore.kernel.org/all/20240719204701.1605950-1-minipli@grsecurity.net/
Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Ajay Kaher <ajay.kaher@broadcom.com>
Cc: Ilkka Naulapää <digirigawa@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Dan Carpenter <dan.carpenter@linaro.org>
Cc: Beau Belgrave <beaub@linux.microsoft.com>
Cc: Florian Fainelli <florian.fainelli@broadcom.com>
Cc: Alexey Makhalov <alexey.makhalov@broadcom.com>
Cc: Vasavi Sirnapalli <vasavi.sirnapalli@broadcom.com>
Link: https://lore.kernel.org/20240730110657.3b69d3c1@gandalf.local.home
Fixes: b63db58e2fa5d ("eventfs/tracing: Add callback for release of an eventfs_inode")
Reported-by: Mathias Krause <minipli@grsecurity.net>
Tested-by: Mathias Krause <minipli@grsecurity.net>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2024-07-30 15:06:57 +00:00
|
|
|
file = event_file_file(filp);
|
|
|
|
if (file) {
|
|
|
|
if (file->flags & EVENT_FILE_FL_FREED)
|
|
|
|
err = -ENODEV;
|
|
|
|
else
|
|
|
|
err = apply_event_filter(file, buf);
|
|
|
|
}
|
2013-07-26 17:25:40 +00:00
|
|
|
mutex_unlock(&event_mutex);
|
|
|
|
|
2015-12-24 05:13:10 +00:00
|
|
|
kfree(buf);
|
tracing/filters: a better event parser
Replace the current event parser hack with a better one. Filters are
no longer specified predicate by predicate, but all at once and can
use parens and any of the following operators:
numeric fields:
==, !=, <, <=, >, >=
string fields:
==, !=
predicates can be combined with the logical operators:
&&, ||
examples:
"common_preempt_count > 4" > filter
"((sig >= 10 && sig < 15) || sig == 17) && comm != bash" > filter
If there was an error, the erroneous string along with an error
message can be seen by looking at the filter e.g.:
((sig >= 10 && sig < 15) || dsig == 17) && comm != bash
^
parse_error: Field not found
Currently the caret for an error always appears at the beginning of
the filter; a real position should be used, but the error message
should be useful even without it.
To clear a filter, '0' can be written to the filter file.
Filters can also be set or cleared for a complete subsystem by writing
the same filter as would be written to an individual event to the
filter file at the root of the subsystem. Note however, that if any
event in the subsystem lacks a field specified in the filter being
set, the set will fail and all filters in the subsystem are
automatically cleared. This change from the previous version was made
because using only the fields that happen to exist for a given event
would most likely result in a meaningless filter.
Because the logical operators are now implemented as predicates, the
maximum number of predicates in a filter was increased from 8 to 16.
[ Impact: add new, extended trace-filter implementation ]
Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: fweisbec@gmail.com
Cc: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <1240905899.6416.121.camel@tropicana>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-04-28 08:04:59 +00:00
|
|
|
if (err < 0)
|
2009-04-11 07:55:28 +00:00
|
|
|
return err;
|
tracing/filters: allow on-the-fly filter switching
This patch allows event filters to be safely removed or switched
on-the-fly while avoiding the use of rcu or the suspension of tracing of
previous versions.
It does it by adding a new filter_pred_none() predicate function which
does nothing and by never deallocating either the predicates or any of
the filter_pred members used in matching; the predicate lists are
allocated and initialized during ftrace_event_calls initialization.
Whenever a filter is removed or replaced, the filter_pred_* functions
currently in use by the affected ftrace_event_call are immediately
switched over to the filter_pred_none() function, while the rest of
the filter_pred members are left intact, allowing any currently
executing filter_pred_* functions to finish up, using the values they're
currently using.
In the case of filter replacement, the new predicate values are copied
into the old predicates after the above step, and the filter_pred_none()
functions are replaced by the filter_pred_* functions for the new
filter. In this case, it is possible though very unlikely that a
previous filter_pred_* is still running even after the
filter_pred_none() switch and the switch to the new filter_pred_*. In
that case, however, because nothing has been deallocated in the
filter_pred, the worst that can happen is that the old filter_pred_*
function sees the new values and as a result produces either a false
positive or a false negative, depending on the values it finds.
So one downside to this method is that rarely, it can produce a bad
match during the filter switch, but it should be possible to live with
that, IMHO.
The other downside is that at least in this patch the predicate lists
are always pre-allocated, taking up memory from the start. They could
probably be allocated on first-use, and de-allocated when tracing is
completely stopped - if this patch makes sense, I could create another
one to do that later on.
Oh, and it also places a restriction on the size of __arrays in events,
currently set to 128, since they can't be larger than the now embedded
str_val arrays in the filter_pred struct.
Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: paulmck@linux.vnet.ibm.com
LKML-Reference: <1239610670.6660.49.camel@tropicana>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-04-13 08:17:50 +00:00
|
|
|
|
2009-03-22 08:31:04 +00:00
|
|
|
*ppos += cnt;
|
|
|
|
|
|
|
|
return cnt;
|
|
|
|
}
|
|
|
|
|
2011-07-05 15:36:06 +00:00
|
|
|
/* File-local list head; presumably links every event_subsystem - confirm against its users. */
static LIST_HEAD(event_subsystems);
|
|
|
|
|
|
|
|
static int subsystem_open(struct inode *inode, struct file *filp)
|
|
|
|
{
|
2022-04-27 17:07:32 +00:00
|
|
|
struct trace_subsystem_dir *dir = NULL, *iter_dir;
|
|
|
|
struct trace_array *tr = NULL, *iter_tr;
|
2011-07-05 15:36:06 +00:00
|
|
|
struct event_subsystem *system = NULL;
|
|
|
|
int ret;
|
|
|
|
|
2013-11-06 19:02:51 +00:00
|
|
|
if (tracing_is_disabled())
|
|
|
|
return -ENODEV;
|
|
|
|
|
2011-07-05 15:36:06 +00:00
|
|
|
/* Make sure the system still exists */
|
|
|
|
mutex_lock(&event_mutex);
|
2017-09-21 20:22:49 +00:00
|
|
|
mutex_lock(&trace_types_lock);
|
2022-04-27 17:07:32 +00:00
|
|
|
list_for_each_entry(iter_tr, &ftrace_trace_arrays, list) {
|
|
|
|
list_for_each_entry(iter_dir, &iter_tr->systems, list) {
|
|
|
|
if (iter_dir == inode->i_private) {
|
2012-05-04 03:09:03 +00:00
|
|
|
/* Don't open systems with no events */
|
2022-04-27 17:07:32 +00:00
|
|
|
tr = iter_tr;
|
|
|
|
dir = iter_dir;
|
2012-05-04 03:09:03 +00:00
|
|
|
if (dir->nr_events) {
|
|
|
|
__get_system_dir(dir);
|
|
|
|
system = dir->subsystem;
|
|
|
|
}
|
|
|
|
goto exit_loop;
|
2011-07-05 15:36:06 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2012-05-04 03:09:03 +00:00
|
|
|
exit_loop:
|
2013-07-02 02:37:54 +00:00
|
|
|
mutex_unlock(&trace_types_lock);
|
2017-09-21 20:22:49 +00:00
|
|
|
mutex_unlock(&event_mutex);
|
2011-07-05 15:36:06 +00:00
|
|
|
|
2012-05-04 03:09:03 +00:00
|
|
|
if (!system)
|
2011-07-05 15:36:06 +00:00
|
|
|
return -ENODEV;
|
|
|
|
|
2013-07-02 19:30:53 +00:00
|
|
|
/* Still need to increment the ref count of the system */
|
|
|
|
if (trace_array_get(tr) < 0) {
|
|
|
|
put_system(dir);
|
|
|
|
return -ENODEV;
|
|
|
|
}
|
|
|
|
|
2011-07-05 15:36:06 +00:00
|
|
|
ret = tracing_open_generic(inode, filp);
|
2013-07-02 19:30:53 +00:00
|
|
|
if (ret < 0) {
|
|
|
|
trace_array_put(tr);
|
2012-05-04 03:09:03 +00:00
|
|
|
put_system(dir);
|
2013-07-02 19:30:53 +00:00
|
|
|
}
|
2012-05-04 03:09:03 +00:00
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int system_tr_open(struct inode *inode, struct file *filp)
|
|
|
|
{
|
2015-05-13 18:59:40 +00:00
|
|
|
struct trace_subsystem_dir *dir;
|
2012-05-04 03:09:03 +00:00
|
|
|
struct trace_array *tr = inode->i_private;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
/* Make a temporary dir that has no system but points to tr */
|
|
|
|
dir = kzalloc(sizeof(*dir), GFP_KERNEL);
|
2019-10-11 23:12:21 +00:00
|
|
|
if (!dir)
|
2012-05-04 03:09:03 +00:00
|
|
|
return -ENOMEM;
|
|
|
|
|
2019-10-11 23:12:21 +00:00
|
|
|
ret = tracing_open_generic_tr(inode, filp);
|
2013-07-02 19:30:53 +00:00
|
|
|
if (ret < 0) {
|
2012-05-04 03:09:03 +00:00
|
|
|
kfree(dir);
|
2013-11-06 19:02:51 +00:00
|
|
|
return ret;
|
2013-07-02 19:30:53 +00:00
|
|
|
}
|
2019-10-11 23:12:21 +00:00
|
|
|
dir->tr = tr;
|
2012-05-04 03:09:03 +00:00
|
|
|
filp->private_data = dir;
|
2011-07-05 15:36:06 +00:00
|
|
|
|
2013-11-06 19:02:51 +00:00
|
|
|
return 0;
|
2011-07-05 15:36:06 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static int subsystem_release(struct inode *inode, struct file *file)
|
|
|
|
{
|
2015-05-13 18:59:40 +00:00
|
|
|
struct trace_subsystem_dir *dir = file->private_data;
|
2011-07-05 15:36:06 +00:00
|
|
|
|
2013-07-02 19:30:53 +00:00
|
|
|
trace_array_put(dir->tr);
|
|
|
|
|
2012-05-04 03:09:03 +00:00
|
|
|
/*
|
|
|
|
* If dir->subsystem is NULL, then this is a temporary
|
|
|
|
* descriptor that was made for a trace_array to enable
|
|
|
|
* all subsystems.
|
|
|
|
*/
|
|
|
|
if (dir->subsystem)
|
|
|
|
put_system(dir);
|
|
|
|
else
|
|
|
|
kfree(dir);
|
2011-07-05 15:36:06 +00:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2009-03-22 08:31:17 +00:00
|
|
|
static ssize_t
|
|
|
|
subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
|
|
|
|
loff_t *ppos)
|
|
|
|
{
|
2015-05-13 18:59:40 +00:00
|
|
|
struct trace_subsystem_dir *dir = filp->private_data;
|
2012-05-04 03:09:03 +00:00
|
|
|
struct event_subsystem *system = dir->subsystem;
|
2009-03-22 08:31:17 +00:00
|
|
|
struct trace_seq *s;
|
|
|
|
int r;
|
|
|
|
|
|
|
|
if (*ppos)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
s = kmalloc(sizeof(*s), GFP_KERNEL);
|
|
|
|
if (!s)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
trace_seq_init(s);
|
|
|
|
|
tracing/filters: a better event parser
Replace the current event parser hack with a better one. Filters are
no longer specified predicate by predicate, but all at once and can
use parens and any of the following operators:
numeric fields:
==, !=, <, <=, >, >=
string fields:
==, !=
predicates can be combined with the logical operators:
&&, ||
examples:
"common_preempt_count > 4" > filter
"((sig >= 10 && sig < 15) || sig == 17) && comm != bash" > filter
If there was an error, the erroneous string along with an error
message can be seen by looking at the filter e.g.:
((sig >= 10 && sig < 15) || dsig == 17) && comm != bash
^
parse_error: Field not found
Currently the caret for an error always appears at the beginning of
the filter; a real position should be used, but the error message
should be useful even without it.
To clear a filter, '0' can be written to the filter file.
Filters can also be set or cleared for a complete subsystem by writing
the same filter as would be written to an individual event to the
filter file at the root of the subsytem. Note however, that if any
event in the subsystem lacks a field specified in the filter being
set, the set will fail and all filters in the subsytem are
automatically cleared. This change from the previous version was made
because using only the fields that happen to exist for a given event
would most likely result in a meaningless filter.
Because the logical operators are now implemented as predicates, the
maximum number of predicates in a filter was increased from 8 to 16.
[ Impact: add new, extended trace-filter implementation ]
Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: fweisbec@gmail.com
Cc: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <1240905899.6416.121.camel@tropicana>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-04-28 08:04:59 +00:00
|
|
|
print_subsystem_event_filter(system, s);
|
2014-11-14 20:49:41 +00:00
|
|
|
r = simple_read_from_buffer(ubuf, cnt, ppos,
|
|
|
|
s->buffer, trace_seq_used(s));
|
2009-03-22 08:31:17 +00:00
|
|
|
|
|
|
|
kfree(s);
|
|
|
|
|
|
|
|
return r;
|
|
|
|
}
|
|
|
|
|
|
|
|
static ssize_t
|
|
|
|
subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
|
|
|
|
loff_t *ppos)
|
|
|
|
{
|
2015-05-13 18:59:40 +00:00
|
|
|
struct trace_subsystem_dir *dir = filp->private_data;
|
tracing/filters: a better event parser
Replace the current event parser hack with a better one. Filters are
no longer specified predicate by predicate, but all at once and can
use parens and any of the following operators:
numeric fields:
==, !=, <, <=, >, >=
string fields:
==, !=
predicates can be combined with the logical operators:
&&, ||
examples:
"common_preempt_count > 4" > filter
"((sig >= 10 && sig < 15) || sig == 17) && comm != bash" > filter
If there was an error, the erroneous string along with an error
message can be seen by looking at the filter e.g.:
((sig >= 10 && sig < 15) || dsig == 17) && comm != bash
^
parse_error: Field not found
Currently the caret for an error always appears at the beginning of
the filter; a real position should be used, but the error message
should be useful even without it.
To clear a filter, '0' can be written to the filter file.
Filters can also be set or cleared for a complete subsystem by writing
the same filter as would be written to an individual event to the
filter file at the root of the subsytem. Note however, that if any
event in the subsystem lacks a field specified in the filter being
set, the set will fail and all filters in the subsytem are
automatically cleared. This change from the previous version was made
because using only the fields that happen to exist for a given event
would most likely result in a meaningless filter.
Because the logical operators are now implemented as predicates, the
maximum number of predicates in a filter was increased from 8 to 16.
[ Impact: add new, extended trace-filter implementation ]
Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: fweisbec@gmail.com
Cc: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <1240905899.6416.121.camel@tropicana>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-04-28 08:04:59 +00:00
|
|
|
char *buf;
|
2009-03-22 08:31:17 +00:00
|
|
|
int err;
|
|
|
|
|
tracing/filters: a better event parser
Replace the current event parser hack with a better one. Filters are
no longer specified predicate by predicate, but all at once and can
use parens and any of the following operators:
numeric fields:
==, !=, <, <=, >, >=
string fields:
==, !=
predicates can be combined with the logical operators:
&&, ||
examples:
"common_preempt_count > 4" > filter
"((sig >= 10 && sig < 15) || sig == 17) && comm != bash" > filter
If there was an error, the erroneous string along with an error
message can be seen by looking at the filter e.g.:
((sig >= 10 && sig < 15) || dsig == 17) && comm != bash
^
parse_error: Field not found
Currently the caret for an error always appears at the beginning of
the filter; a real position should be used, but the error message
should be useful even without it.
To clear a filter, '0' can be written to the filter file.
Filters can also be set or cleared for a complete subsystem by writing
the same filter as would be written to an individual event to the
filter file at the root of the subsytem. Note however, that if any
event in the subsystem lacks a field specified in the filter being
set, the set will fail and all filters in the subsytem are
automatically cleared. This change from the previous version was made
because using only the fields that happen to exist for a given event
would most likely result in a meaningless filter.
Because the logical operators are now implemented as predicates, the
maximum number of predicates in a filter was increased from 8 to 16.
[ Impact: add new, extended trace-filter implementation ]
Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: fweisbec@gmail.com
Cc: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <1240905899.6416.121.camel@tropicana>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-04-28 08:04:59 +00:00
|
|
|
if (cnt >= PAGE_SIZE)
|
2009-03-22 08:31:17 +00:00
|
|
|
return -EINVAL;
|
|
|
|
|
2015-12-24 05:13:10 +00:00
|
|
|
buf = memdup_user_nul(ubuf, cnt);
|
|
|
|
if (IS_ERR(buf))
|
|
|
|
return PTR_ERR(buf);
|
2009-03-22 08:31:17 +00:00
|
|
|
|
2012-05-04 03:09:03 +00:00
|
|
|
err = apply_subsystem_event_filter(dir, buf);
|
2015-12-24 05:13:10 +00:00
|
|
|
kfree(buf);
|
tracing/filters: a better event parser
Replace the current event parser hack with a better one. Filters are
no longer specified predicate by predicate, but all at once and can
use parens and any of the following operators:
numeric fields:
==, !=, <, <=, >, >=
string fields:
==, !=
predicates can be combined with the logical operators:
&&, ||
examples:
"common_preempt_count > 4" > filter
"((sig >= 10 && sig < 15) || sig == 17) && comm != bash" > filter
If there was an error, the erroneous string along with an error
message can be seen by looking at the filter e.g.:
((sig >= 10 && sig < 15) || dsig == 17) && comm != bash
^
parse_error: Field not found
Currently the caret for an error always appears at the beginning of
the filter; a real position should be used, but the error message
should be useful even without it.
To clear a filter, '0' can be written to the filter file.
Filters can also be set or cleared for a complete subsystem by writing
the same filter as would be written to an individual event to the
filter file at the root of the subsytem. Note however, that if any
event in the subsystem lacks a field specified in the filter being
set, the set will fail and all filters in the subsytem are
automatically cleared. This change from the previous version was made
because using only the fields that happen to exist for a given event
would most likely result in a meaningless filter.
Because the logical operators are now implemented as predicates, the
maximum number of predicates in a filter was increased from 8 to 16.
[ Impact: add new, extended trace-filter implementation ]
Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: fweisbec@gmail.com
Cc: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <1240905899.6416.121.camel@tropicana>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-04-28 08:04:59 +00:00
|
|
|
if (err < 0)
|
2009-04-11 07:55:28 +00:00
|
|
|
return err;
|
2009-03-22 08:31:17 +00:00
|
|
|
|
|
|
|
*ppos += cnt;
|
|
|
|
|
|
|
|
return cnt;
|
|
|
|
}
|
|
|
|
|
2009-04-15 20:53:47 +00:00
|
|
|
static ssize_t
|
2023-12-19 18:54:16 +00:00
|
|
|
show_header_page_file(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
|
2009-04-15 20:53:47 +00:00
|
|
|
{
|
2023-12-19 18:54:16 +00:00
|
|
|
struct trace_array *tr = filp->private_data;
|
2009-04-15 20:53:47 +00:00
|
|
|
struct trace_seq *s;
|
|
|
|
int r;
|
|
|
|
|
|
|
|
if (*ppos)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
s = kmalloc(sizeof(*s), GFP_KERNEL);
|
|
|
|
if (!s)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
trace_seq_init(s);
|
|
|
|
|
2023-12-19 18:54:16 +00:00
|
|
|
ring_buffer_print_page_header(tr->array_buffer.buffer, s);
|
|
|
|
r = simple_read_from_buffer(ubuf, cnt, ppos,
|
|
|
|
s->buffer, trace_seq_used(s));
|
|
|
|
|
|
|
|
kfree(s);
|
|
|
|
|
|
|
|
return r;
|
|
|
|
}
|
|
|
|
|
|
|
|
static ssize_t
|
|
|
|
show_header_event_file(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
|
|
|
|
{
|
|
|
|
struct trace_seq *s;
|
|
|
|
int r;
|
|
|
|
|
|
|
|
if (*ppos)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
s = kmalloc(sizeof(*s), GFP_KERNEL);
|
|
|
|
if (!s)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
trace_seq_init(s);
|
|
|
|
|
|
|
|
ring_buffer_print_entry_header(s);
|
2014-11-14 20:49:41 +00:00
|
|
|
r = simple_read_from_buffer(ubuf, cnt, ppos,
|
|
|
|
s->buffer, trace_seq_used(s));
|
2009-04-15 20:53:47 +00:00
|
|
|
|
|
|
|
kfree(s);
|
|
|
|
|
|
|
|
return r;
|
|
|
|
}
|
|
|
|
|
2015-10-21 19:27:36 +00:00
|
|
|
static void ignore_task_cpu(void *data)
|
|
|
|
{
|
|
|
|
struct trace_array *tr = data;
|
|
|
|
struct trace_pid_list *pid_list;
|
2020-03-25 23:51:19 +00:00
|
|
|
struct trace_pid_list *no_pid_list;
|
2015-10-21 19:27:36 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* This function is called by on_each_cpu() while the
|
|
|
|
* event_mutex is held.
|
|
|
|
*/
|
|
|
|
pid_list = rcu_dereference_protected(tr->filtered_pids,
|
|
|
|
mutex_is_locked(&event_mutex));
|
2020-03-25 23:51:19 +00:00
|
|
|
no_pid_list = rcu_dereference_protected(tr->filtered_no_pids,
|
|
|
|
mutex_is_locked(&event_mutex));
|
2015-10-21 19:27:36 +00:00
|
|
|
|
2020-01-09 23:53:48 +00:00
|
|
|
this_cpu_write(tr->array_buffer.data->ignore_pid,
|
2020-03-25 23:51:19 +00:00
|
|
|
trace_ignore_this_task(pid_list, no_pid_list, current));
|
|
|
|
}
|
|
|
|
|
|
|
|
static void register_pid_events(struct trace_array *tr)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Register a probe that is called before all other probes
|
|
|
|
* to set ignore_pid if next or prev do not match.
|
|
|
|
* Register a probe this is called after all other probes
|
|
|
|
* to only keep ignore_pid set if next pid matches.
|
|
|
|
*/
|
|
|
|
register_trace_prio_sched_switch(event_filter_pid_sched_switch_probe_pre,
|
|
|
|
tr, INT_MAX);
|
|
|
|
register_trace_prio_sched_switch(event_filter_pid_sched_switch_probe_post,
|
|
|
|
tr, 0);
|
|
|
|
|
|
|
|
register_trace_prio_sched_wakeup(event_filter_pid_sched_wakeup_probe_pre,
|
|
|
|
tr, INT_MAX);
|
|
|
|
register_trace_prio_sched_wakeup(event_filter_pid_sched_wakeup_probe_post,
|
|
|
|
tr, 0);
|
|
|
|
|
|
|
|
register_trace_prio_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_pre,
|
|
|
|
tr, INT_MAX);
|
|
|
|
register_trace_prio_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_post,
|
|
|
|
tr, 0);
|
|
|
|
|
|
|
|
register_trace_prio_sched_waking(event_filter_pid_sched_wakeup_probe_pre,
|
|
|
|
tr, INT_MAX);
|
|
|
|
register_trace_prio_sched_waking(event_filter_pid_sched_wakeup_probe_post,
|
|
|
|
tr, 0);
|
2015-10-21 19:27:36 +00:00
|
|
|
}
|
|
|
|
|
2015-09-24 15:33:26 +00:00
|
|
|
static ssize_t
|
2020-03-25 23:51:19 +00:00
|
|
|
event_pid_write(struct file *filp, const char __user *ubuf,
|
|
|
|
size_t cnt, loff_t *ppos, int type)
|
2015-09-24 15:33:26 +00:00
|
|
|
{
|
2015-09-25 16:58:44 +00:00
|
|
|
struct seq_file *m = filp->private_data;
|
2015-09-24 15:33:26 +00:00
|
|
|
struct trace_array *tr = m->private;
|
|
|
|
struct trace_pid_list *filtered_pids = NULL;
|
2020-03-25 23:51:19 +00:00
|
|
|
struct trace_pid_list *other_pids = NULL;
|
2016-04-13 20:27:49 +00:00
|
|
|
struct trace_pid_list *pid_list;
|
2015-09-25 16:58:44 +00:00
|
|
|
struct trace_event_file *file;
|
2016-04-21 15:35:30 +00:00
|
|
|
ssize_t ret;
|
2015-09-24 15:33:26 +00:00
|
|
|
|
|
|
|
if (!cnt)
|
|
|
|
return 0;
|
|
|
|
|
2023-09-06 09:18:37 +00:00
|
|
|
ret = tracing_update_buffers(tr);
|
2015-09-24 15:33:26 +00:00
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
mutex_lock(&event_mutex);
|
2016-04-21 15:35:30 +00:00
|
|
|
|
2020-03-25 23:51:19 +00:00
|
|
|
if (type == TRACE_PIDS) {
|
|
|
|
filtered_pids = rcu_dereference_protected(tr->filtered_pids,
|
|
|
|
lockdep_is_held(&event_mutex));
|
|
|
|
other_pids = rcu_dereference_protected(tr->filtered_no_pids,
|
|
|
|
lockdep_is_held(&event_mutex));
|
|
|
|
} else {
|
|
|
|
filtered_pids = rcu_dereference_protected(tr->filtered_no_pids,
|
|
|
|
lockdep_is_held(&event_mutex));
|
|
|
|
other_pids = rcu_dereference_protected(tr->filtered_pids,
|
|
|
|
lockdep_is_held(&event_mutex));
|
|
|
|
}
|
2016-04-13 20:27:49 +00:00
|
|
|
|
2016-04-21 15:35:30 +00:00
|
|
|
ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt);
|
|
|
|
if (ret < 0)
|
2016-04-13 20:27:49 +00:00
|
|
|
goto out;
|
2015-09-24 15:33:26 +00:00
|
|
|
|
2020-03-25 23:51:19 +00:00
|
|
|
if (type == TRACE_PIDS)
|
|
|
|
rcu_assign_pointer(tr->filtered_pids, pid_list);
|
|
|
|
else
|
|
|
|
rcu_assign_pointer(tr->filtered_no_pids, pid_list);
|
2015-09-24 15:33:26 +00:00
|
|
|
|
2015-09-25 16:58:44 +00:00
|
|
|
list_for_each_entry(file, &tr->events, list) {
|
|
|
|
set_bit(EVENT_FILE_FL_PID_FILTER_BIT, &file->flags);
|
|
|
|
}
|
2015-09-24 15:33:26 +00:00
|
|
|
|
|
|
|
if (filtered_pids) {
|
2018-08-09 19:31:48 +00:00
|
|
|
tracepoint_synchronize_unregister();
|
2021-09-24 01:03:49 +00:00
|
|
|
trace_pid_list_free(filtered_pids);
|
2020-03-25 23:51:19 +00:00
|
|
|
} else if (pid_list && !other_pids) {
|
|
|
|
register_pid_events(tr);
|
2015-09-24 15:33:26 +00:00
|
|
|
}
|
|
|
|
|
2015-11-02 18:08:26 +00:00
|
|
|
/*
|
|
|
|
* Ignoring of pids is done at task switch. But we have to
|
|
|
|
* check for those tasks that are currently running.
|
|
|
|
* Always do this in case a pid was appended or removed.
|
|
|
|
*/
|
|
|
|
on_each_cpu(ignore_task_cpu, tr, 1);
|
|
|
|
|
2016-04-13 20:27:49 +00:00
|
|
|
out:
|
2015-09-25 16:58:44 +00:00
|
|
|
mutex_unlock(&event_mutex);
|
|
|
|
|
2016-04-21 15:35:30 +00:00
|
|
|
if (ret > 0)
|
|
|
|
*ppos += ret;
|
2015-09-24 15:33:26 +00:00
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2020-03-25 23:51:19 +00:00
|
|
|
static ssize_t
|
|
|
|
ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
|
|
|
|
size_t cnt, loff_t *ppos)
|
|
|
|
{
|
|
|
|
return event_pid_write(filp, ubuf, cnt, ppos, TRACE_PIDS);
|
|
|
|
}
|
|
|
|
|
|
|
|
static ssize_t
|
|
|
|
ftrace_event_npid_write(struct file *filp, const char __user *ubuf,
|
|
|
|
size_t cnt, loff_t *ppos)
|
|
|
|
{
|
|
|
|
return event_pid_write(filp, ubuf, cnt, ppos, TRACE_NO_PIDS);
|
|
|
|
}
|
|
|
|
|
2012-05-03 18:57:28 +00:00
|
|
|
/*
 * Forward declarations of the open and release handlers that the
 * file_operations tables following these declarations refer to.
 */

/* open handlers */
static int ftrace_event_avail_open(struct inode *inode, struct file *file);
static int ftrace_event_set_open(struct inode *inode, struct file *file);
static int ftrace_event_set_pid_open(struct inode *inode, struct file *file);
static int ftrace_event_set_npid_open(struct inode *inode, struct file *file);

/* release handler */
static int ftrace_event_release(struct inode *inode, struct file *file);
|
2012-05-03 18:57:28 +00:00
|
|
|
|
2009-02-24 15:21:36 +00:00
|
|
|
static const struct seq_operations show_event_seq_ops = {
|
|
|
|
.start = t_start,
|
|
|
|
.next = t_next,
|
|
|
|
.show = t_show,
|
|
|
|
.stop = t_stop,
|
|
|
|
};
|
|
|
|
|
|
|
|
static const struct seq_operations show_set_event_seq_ops = {
|
|
|
|
.start = s_start,
|
|
|
|
.next = s_next,
|
|
|
|
.show = t_show,
|
|
|
|
.stop = t_stop,
|
|
|
|
};
|
|
|
|
|
2015-09-24 15:33:26 +00:00
|
|
|
static const struct seq_operations show_set_pid_seq_ops = {
|
|
|
|
.start = p_start,
|
|
|
|
.next = p_next,
|
2016-04-20 19:19:54 +00:00
|
|
|
.show = trace_pid_show,
|
2015-09-24 15:33:26 +00:00
|
|
|
.stop = p_stop,
|
|
|
|
};
|
|
|
|
|
2020-03-25 23:51:19 +00:00
|
|
|
static const struct seq_operations show_set_no_pid_seq_ops = {
|
|
|
|
.start = np_start,
|
|
|
|
.next = np_next,
|
|
|
|
.show = trace_pid_show,
|
|
|
|
.stop = p_stop,
|
|
|
|
};
|
|
|
|
|
2009-03-10 16:04:02 +00:00
|
|
|
static const struct file_operations ftrace_avail_fops = {
|
2012-05-03 18:57:28 +00:00
|
|
|
.open = ftrace_event_avail_open,
|
2009-03-10 16:04:02 +00:00
|
|
|
.read = seq_read,
|
|
|
|
.llseek = seq_lseek,
|
|
|
|
.release = seq_release,
|
|
|
|
};
|
|
|
|
|
2009-02-24 15:21:36 +00:00
|
|
|
static const struct file_operations ftrace_set_event_fops = {
|
2012-05-03 18:57:28 +00:00
|
|
|
.open = ftrace_event_set_open,
|
2009-02-24 15:21:36 +00:00
|
|
|
.read = seq_read,
|
|
|
|
.write = ftrace_event_write,
|
|
|
|
.llseek = seq_lseek,
|
2013-07-18 18:18:44 +00:00
|
|
|
.release = ftrace_event_release,
|
2009-02-24 15:21:36 +00:00
|
|
|
};
|
|
|
|
|
2015-09-24 15:33:26 +00:00
|
|
|
static const struct file_operations ftrace_set_event_pid_fops = {
|
|
|
|
.open = ftrace_event_set_pid_open,
|
|
|
|
.read = seq_read,
|
|
|
|
.write = ftrace_event_pid_write,
|
|
|
|
.llseek = seq_lseek,
|
|
|
|
.release = ftrace_event_release,
|
|
|
|
};
|
|
|
|
|
2020-03-25 23:51:19 +00:00
|
|
|
static const struct file_operations ftrace_set_event_notrace_pid_fops = {
|
|
|
|
.open = ftrace_event_set_npid_open,
|
|
|
|
.read = seq_read,
|
|
|
|
.write = ftrace_event_npid_write,
|
|
|
|
.llseek = seq_lseek,
|
|
|
|
.release = ftrace_event_release,
|
|
|
|
};
|
|
|
|
|
2009-02-24 19:15:08 +00:00
|
|
|
static const struct file_operations ftrace_enable_fops = {
|
2023-09-07 02:47:12 +00:00
|
|
|
.open = tracing_open_file_tr,
|
2009-02-24 19:15:08 +00:00
|
|
|
.read = event_enable_read,
|
|
|
|
.write = event_enable_write,
|
2023-09-07 02:47:12 +00:00
|
|
|
.release = tracing_release_file_tr,
|
llseek: automatically add .llseek fop
All file_operations should get a .llseek operation so we can make
nonseekable_open the default for future file operations without a
.llseek pointer.
The three cases that we can automatically detect are no_llseek, seq_lseek
and default_llseek. For cases where we can automatically prove that
the file offset is always ignored, we use noop_llseek, which maintains
the current behavior of not returning an error from a seek.
New drivers should normally not use noop_llseek but instead use no_llseek
and call nonseekable_open at open time. Existing drivers can be converted
to do the same when the maintainer knows for certain that no user code
relies on calling seek on the device file.
The generated code is often incorrectly indented and right now contains
comments that clarify for each added line why a specific variant was
chosen. In the version that gets submitted upstream, the comments will
be gone and I will manually fix the indentation, because there does not
seem to be a way to do that using coccinelle.
Some amount of new code is currently sitting in linux-next that should get
the same modifications, which I will do at the end of the merge window.
Many thanks to Julia Lawall for helping me learn to write a semantic
patch that does all this.
===== begin semantic patch =====
// This adds an llseek= method to all file operations,
// as a preparation for making no_llseek the default.
//
// The rules are
// - use no_llseek explicitly if we do nonseekable_open
// - use seq_lseek for sequential files
// - use default_llseek if we know we access f_pos
// - use noop_llseek if we know we don't access f_pos,
// but we still want to allow users to call lseek
//
@ open1 exists @
identifier nested_open;
@@
nested_open(...)
{
<+...
nonseekable_open(...)
...+>
}
@ open exists@
identifier open_f;
identifier i, f;
identifier open1.nested_open;
@@
int open_f(struct inode *i, struct file *f)
{
<+...
(
nonseekable_open(...)
|
nested_open(...)
)
...+>
}
@ read disable optional_qualifier exists @
identifier read_f;
identifier f, p, s, off;
type ssize_t, size_t, loff_t;
expression E;
identifier func;
@@
ssize_t read_f(struct file *f, char *p, size_t s, loff_t *off)
{
<+...
(
*off = E
|
*off += E
|
func(..., off, ...)
|
E = *off
)
...+>
}
@ read_no_fpos disable optional_qualifier exists @
identifier read_f;
identifier f, p, s, off;
type ssize_t, size_t, loff_t;
@@
ssize_t read_f(struct file *f, char *p, size_t s, loff_t *off)
{
... when != off
}
@ write @
identifier write_f;
identifier f, p, s, off;
type ssize_t, size_t, loff_t;
expression E;
identifier func;
@@
ssize_t write_f(struct file *f, const char *p, size_t s, loff_t *off)
{
<+...
(
*off = E
|
*off += E
|
func(..., off, ...)
|
E = *off
)
...+>
}
@ write_no_fpos @
identifier write_f;
identifier f, p, s, off;
type ssize_t, size_t, loff_t;
@@
ssize_t write_f(struct file *f, const char *p, size_t s, loff_t *off)
{
... when != off
}
@ fops0 @
identifier fops;
@@
struct file_operations fops = {
...
};
@ has_llseek depends on fops0 @
identifier fops0.fops;
identifier llseek_f;
@@
struct file_operations fops = {
...
.llseek = llseek_f,
...
};
@ has_read depends on fops0 @
identifier fops0.fops;
identifier read_f;
@@
struct file_operations fops = {
...
.read = read_f,
...
};
@ has_write depends on fops0 @
identifier fops0.fops;
identifier write_f;
@@
struct file_operations fops = {
...
.write = write_f,
...
};
@ has_open depends on fops0 @
identifier fops0.fops;
identifier open_f;
@@
struct file_operations fops = {
...
.open = open_f,
...
};
// use no_llseek if we call nonseekable_open
////////////////////////////////////////////
@ nonseekable1 depends on !has_llseek && has_open @
identifier fops0.fops;
identifier nso ~= "nonseekable_open";
@@
struct file_operations fops = {
... .open = nso, ...
+.llseek = no_llseek, /* nonseekable */
};
@ nonseekable2 depends on !has_llseek @
identifier fops0.fops;
identifier open.open_f;
@@
struct file_operations fops = {
... .open = open_f, ...
+.llseek = no_llseek, /* open uses nonseekable */
};
// use seq_lseek for sequential files
/////////////////////////////////////
@ seq depends on !has_llseek @
identifier fops0.fops;
identifier sr ~= "seq_read";
@@
struct file_operations fops = {
... .read = sr, ...
+.llseek = seq_lseek, /* we have seq_read */
};
// use default_llseek if there is a readdir
///////////////////////////////////////////
@ fops1 depends on !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier readdir_e;
@@
// any other fop is used that changes pos
struct file_operations fops = {
... .readdir = readdir_e, ...
+.llseek = default_llseek, /* readdir is present */
};
// use default_llseek if at least one of read/write touches f_pos
/////////////////////////////////////////////////////////////////
@ fops2 depends on !fops1 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier read.read_f;
@@
// read fops use offset
struct file_operations fops = {
... .read = read_f, ...
+.llseek = default_llseek, /* read accesses f_pos */
};
@ fops3 depends on !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier write.write_f;
@@
// write fops use offset
struct file_operations fops = {
... .write = write_f, ...
+ .llseek = default_llseek, /* write accesses f_pos */
};
// Use noop_llseek if neither read nor write accesses f_pos
///////////////////////////////////////////////////////////
@ fops4 depends on !fops1 && !fops2 && !fops3 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier read_no_fpos.read_f;
identifier write_no_fpos.write_f;
@@
// neither read nor write fops use offset
struct file_operations fops = {
...
.write = write_f,
.read = read_f,
...
+.llseek = noop_llseek, /* read and write both use no f_pos */
};
@ depends on has_write && !has_read && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier write_no_fpos.write_f;
@@
struct file_operations fops = {
... .write = write_f, ...
+.llseek = noop_llseek, /* write uses no f_pos */
};
@ depends on has_read && !has_write && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier read_no_fpos.read_f;
@@
struct file_operations fops = {
... .read = read_f, ...
+.llseek = noop_llseek, /* read uses no f_pos */
};
@ depends on !has_read && !has_write && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
@@
struct file_operations fops = {
...
+.llseek = noop_llseek, /* no read or write fn */
};
===== End semantic patch =====
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Julia Lawall <julia@diku.dk>
Cc: Christoph Hellwig <hch@infradead.org>
2010-08-15 16:52:59 +00:00
|
|
|
.llseek = default_llseek,
|
2009-02-24 19:15:08 +00:00
|
|
|
};
|
|
|
|
|
2009-03-02 18:53:59 +00:00
|
|
|
static const struct file_operations ftrace_event_format_fops = {
|
2010-06-03 19:21:34 +00:00
|
|
|
.open = trace_format_open,
|
|
|
|
.read = seq_read,
|
|
|
|
.llseek = seq_lseek,
|
|
|
|
.release = seq_release,
|
2009-03-02 18:53:59 +00:00
|
|
|
};
|
|
|
|
|
2024-04-03 08:06:24 +00:00
|
|
|
#ifdef CONFIG_PERF_EVENTS
|
2009-03-19 19:26:13 +00:00
|
|
|
static const struct file_operations ftrace_event_id_fops = {
|
|
|
|
.read = event_id_read,
|
llseek: automatically add .llseek fop
All file_operations should get a .llseek operation so we can make
nonseekable_open the default for future file operations without a
.llseek pointer.
The three cases that we can automatically detect are no_llseek, seq_lseek
and default_llseek. For cases where we can automatically prove that
the file offset is always ignored, we use noop_llseek, which maintains
the current behavior of not returning an error from a seek.
New drivers should normally not use noop_llseek but instead use no_llseek
and call nonseekable_open at open time. Existing drivers can be converted
to do the same when the maintainer knows for certain that no user code
relies on calling seek on the device file.
The generated code is often incorrectly indented and right now contains
comments that clarify for each added line why a specific variant was
chosen. In the version that gets submitted upstream, the comments will
be gone and I will manually fix the indentation, because there does not
seem to be a way to do that using coccinelle.
Some amount of new code is currently sitting in linux-next that should get
the same modifications, which I will do at the end of the merge window.
Many thanks to Julia Lawall for helping me learn to write a semantic
patch that does all this.
===== begin semantic patch =====
// This adds an llseek= method to all file operations,
// as a preparation for making no_llseek the default.
//
// The rules are
// - use no_llseek explicitly if we do nonseekable_open
// - use seq_lseek for sequential files
// - use default_llseek if we know we access f_pos
// - use noop_llseek if we know we don't access f_pos,
// but we still want to allow users to call lseek
//
@ open1 exists @
identifier nested_open;
@@
nested_open(...)
{
<+...
nonseekable_open(...)
...+>
}
@ open exists@
identifier open_f;
identifier i, f;
identifier open1.nested_open;
@@
int open_f(struct inode *i, struct file *f)
{
<+...
(
nonseekable_open(...)
|
nested_open(...)
)
...+>
}
@ read disable optional_qualifier exists @
identifier read_f;
identifier f, p, s, off;
type ssize_t, size_t, loff_t;
expression E;
identifier func;
@@
ssize_t read_f(struct file *f, char *p, size_t s, loff_t *off)
{
<+...
(
*off = E
|
*off += E
|
func(..., off, ...)
|
E = *off
)
...+>
}
@ read_no_fpos disable optional_qualifier exists @
identifier read_f;
identifier f, p, s, off;
type ssize_t, size_t, loff_t;
@@
ssize_t read_f(struct file *f, char *p, size_t s, loff_t *off)
{
... when != off
}
@ write @
identifier write_f;
identifier f, p, s, off;
type ssize_t, size_t, loff_t;
expression E;
identifier func;
@@
ssize_t write_f(struct file *f, const char *p, size_t s, loff_t *off)
{
<+...
(
*off = E
|
*off += E
|
func(..., off, ...)
|
E = *off
)
...+>
}
@ write_no_fpos @
identifier write_f;
identifier f, p, s, off;
type ssize_t, size_t, loff_t;
@@
ssize_t write_f(struct file *f, const char *p, size_t s, loff_t *off)
{
... when != off
}
@ fops0 @
identifier fops;
@@
struct file_operations fops = {
...
};
@ has_llseek depends on fops0 @
identifier fops0.fops;
identifier llseek_f;
@@
struct file_operations fops = {
...
.llseek = llseek_f,
...
};
@ has_read depends on fops0 @
identifier fops0.fops;
identifier read_f;
@@
struct file_operations fops = {
...
.read = read_f,
...
};
@ has_write depends on fops0 @
identifier fops0.fops;
identifier write_f;
@@
struct file_operations fops = {
...
.write = write_f,
...
};
@ has_open depends on fops0 @
identifier fops0.fops;
identifier open_f;
@@
struct file_operations fops = {
...
.open = open_f,
...
};
// use no_llseek if we call nonseekable_open
////////////////////////////////////////////
@ nonseekable1 depends on !has_llseek && has_open @
identifier fops0.fops;
identifier nso ~= "nonseekable_open";
@@
struct file_operations fops = {
... .open = nso, ...
+.llseek = no_llseek, /* nonseekable */
};
@ nonseekable2 depends on !has_llseek @
identifier fops0.fops;
identifier open.open_f;
@@
struct file_operations fops = {
... .open = open_f, ...
+.llseek = no_llseek, /* open uses nonseekable */
};
// use seq_lseek for sequential files
/////////////////////////////////////
@ seq depends on !has_llseek @
identifier fops0.fops;
identifier sr ~= "seq_read";
@@
struct file_operations fops = {
... .read = sr, ...
+.llseek = seq_lseek, /* we have seq_read */
};
// use default_llseek if there is a readdir
///////////////////////////////////////////
@ fops1 depends on !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier readdir_e;
@@
// any other fop is used that changes pos
struct file_operations fops = {
... .readdir = readdir_e, ...
+.llseek = default_llseek, /* readdir is present */
};
// use default_llseek if at least one of read/write touches f_pos
/////////////////////////////////////////////////////////////////
@ fops2 depends on !fops1 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier read.read_f;
@@
// read fops use offset
struct file_operations fops = {
... .read = read_f, ...
+.llseek = default_llseek, /* read accesses f_pos */
};
@ fops3 depends on !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier write.write_f;
@@
// write fops use offset
struct file_operations fops = {
... .write = write_f, ...
+ .llseek = default_llseek, /* write accesses f_pos */
};
// Use noop_llseek if neither read nor write accesses f_pos
///////////////////////////////////////////////////////////
@ fops4 depends on !fops1 && !fops2 && !fops3 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier read_no_fpos.read_f;
identifier write_no_fpos.write_f;
@@
// neither read nor write fops use offset
struct file_operations fops = {
...
.write = write_f,
.read = read_f,
...
+.llseek = noop_llseek, /* read and write both use no f_pos */
};
@ depends on has_write && !has_read && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier write_no_fpos.write_f;
@@
struct file_operations fops = {
... .write = write_f, ...
+.llseek = noop_llseek, /* write uses no f_pos */
};
@ depends on has_read && !has_write && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier read_no_fpos.read_f;
@@
struct file_operations fops = {
... .read = read_f, ...
+.llseek = noop_llseek, /* read uses no f_pos */
};
@ depends on !has_read && !has_write && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
@@
struct file_operations fops = {
...
+.llseek = noop_llseek, /* no read or write fn */
};
===== End semantic patch =====
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Julia Lawall <julia@diku.dk>
Cc: Christoph Hellwig <hch@infradead.org>
2010-08-15 16:52:59 +00:00
|
|
|
.llseek = default_llseek,
|
2009-03-19 19:26:13 +00:00
|
|
|
};
|
2024-04-03 08:06:24 +00:00
|
|
|
#endif
|
2009-03-19 19:26:13 +00:00
|
|
|
|
2009-03-22 08:31:04 +00:00
|
|
|
static const struct file_operations ftrace_event_filter_fops = {
|
2023-09-07 02:47:12 +00:00
|
|
|
.open = tracing_open_file_tr,
|
2009-03-22 08:31:04 +00:00
|
|
|
.read = event_filter_read,
|
|
|
|
.write = event_filter_write,
|
2023-09-07 02:47:12 +00:00
|
|
|
.release = tracing_release_file_tr,
|
llseek: automatically add .llseek fop
All file_operations should get a .llseek operation so we can make
nonseekable_open the default for future file operations without a
.llseek pointer.
The three cases that we can automatically detect are no_llseek, seq_lseek
and default_llseek. For cases where we can automatically prove that
the file offset is always ignored, we use noop_llseek, which maintains
the current behavior of not returning an error from a seek.
New drivers should normally not use noop_llseek but instead use no_llseek
and call nonseekable_open at open time. Existing drivers can be converted
to do the same when the maintainer knows for certain that no user code
relies on calling seek on the device file.
The generated code is often incorrectly indented and right now contains
comments that clarify for each added line why a specific variant was
chosen. In the version that gets submitted upstream, the comments will
be gone and I will manually fix the indentation, because there does not
seem to be a way to do that using coccinelle.
Some amount of new code is currently sitting in linux-next that should get
the same modifications, which I will do at the end of the merge window.
Many thanks to Julia Lawall for helping me learn to write a semantic
patch that does all this.
===== begin semantic patch =====
// This adds an llseek= method to all file operations,
// as a preparation for making no_llseek the default.
//
// The rules are
// - use no_llseek explicitly if we do nonseekable_open
// - use seq_lseek for sequential files
// - use default_llseek if we know we access f_pos
// - use noop_llseek if we know we don't access f_pos,
// but we still want to allow users to call lseek
//
@ open1 exists @
identifier nested_open;
@@
nested_open(...)
{
<+...
nonseekable_open(...)
...+>
}
@ open exists@
identifier open_f;
identifier i, f;
identifier open1.nested_open;
@@
int open_f(struct inode *i, struct file *f)
{
<+...
(
nonseekable_open(...)
|
nested_open(...)
)
...+>
}
@ read disable optional_qualifier exists @
identifier read_f;
identifier f, p, s, off;
type ssize_t, size_t, loff_t;
expression E;
identifier func;
@@
ssize_t read_f(struct file *f, char *p, size_t s, loff_t *off)
{
<+...
(
*off = E
|
*off += E
|
func(..., off, ...)
|
E = *off
)
...+>
}
@ read_no_fpos disable optional_qualifier exists @
identifier read_f;
identifier f, p, s, off;
type ssize_t, size_t, loff_t;
@@
ssize_t read_f(struct file *f, char *p, size_t s, loff_t *off)
{
... when != off
}
@ write @
identifier write_f;
identifier f, p, s, off;
type ssize_t, size_t, loff_t;
expression E;
identifier func;
@@
ssize_t write_f(struct file *f, const char *p, size_t s, loff_t *off)
{
<+...
(
*off = E
|
*off += E
|
func(..., off, ...)
|
E = *off
)
...+>
}
@ write_no_fpos @
identifier write_f;
identifier f, p, s, off;
type ssize_t, size_t, loff_t;
@@
ssize_t write_f(struct file *f, const char *p, size_t s, loff_t *off)
{
... when != off
}
@ fops0 @
identifier fops;
@@
struct file_operations fops = {
...
};
@ has_llseek depends on fops0 @
identifier fops0.fops;
identifier llseek_f;
@@
struct file_operations fops = {
...
.llseek = llseek_f,
...
};
@ has_read depends on fops0 @
identifier fops0.fops;
identifier read_f;
@@
struct file_operations fops = {
...
.read = read_f,
...
};
@ has_write depends on fops0 @
identifier fops0.fops;
identifier write_f;
@@
struct file_operations fops = {
...
.write = write_f,
...
};
@ has_open depends on fops0 @
identifier fops0.fops;
identifier open_f;
@@
struct file_operations fops = {
...
.open = open_f,
...
};
// use no_llseek if we call nonseekable_open
////////////////////////////////////////////
@ nonseekable1 depends on !has_llseek && has_open @
identifier fops0.fops;
identifier nso ~= "nonseekable_open";
@@
struct file_operations fops = {
... .open = nso, ...
+.llseek = no_llseek, /* nonseekable */
};
@ nonseekable2 depends on !has_llseek @
identifier fops0.fops;
identifier open.open_f;
@@
struct file_operations fops = {
... .open = open_f, ...
+.llseek = no_llseek, /* open uses nonseekable */
};
// use seq_lseek for sequential files
/////////////////////////////////////
@ seq depends on !has_llseek @
identifier fops0.fops;
identifier sr ~= "seq_read";
@@
struct file_operations fops = {
... .read = sr, ...
+.llseek = seq_lseek, /* we have seq_read */
};
// use default_llseek if there is a readdir
///////////////////////////////////////////
@ fops1 depends on !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier readdir_e;
@@
// any other fop is used that changes pos
struct file_operations fops = {
... .readdir = readdir_e, ...
+.llseek = default_llseek, /* readdir is present */
};
// use default_llseek if at least one of read/write touches f_pos
/////////////////////////////////////////////////////////////////
@ fops2 depends on !fops1 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier read.read_f;
@@
// read fops use offset
struct file_operations fops = {
... .read = read_f, ...
+.llseek = default_llseek, /* read accesses f_pos */
};
@ fops3 depends on !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier write.write_f;
@@
// write fops use offset
struct file_operations fops = {
... .write = write_f, ...
+ .llseek = default_llseek, /* write accesses f_pos */
};
// Use noop_llseek if neither read nor write accesses f_pos
///////////////////////////////////////////////////////////
@ fops4 depends on !fops1 && !fops2 && !fops3 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier read_no_fpos.read_f;
identifier write_no_fpos.write_f;
@@
// neither read nor write fops use offset
struct file_operations fops = {
...
.write = write_f,
.read = read_f,
...
+.llseek = noop_llseek, /* read and write both use no f_pos */
};
@ depends on has_write && !has_read && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier write_no_fpos.write_f;
@@
struct file_operations fops = {
... .write = write_f, ...
+.llseek = noop_llseek, /* write uses no f_pos */
};
@ depends on has_read && !has_write && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier read_no_fpos.read_f;
@@
struct file_operations fops = {
... .read = read_f, ...
+.llseek = noop_llseek, /* read uses no f_pos */
};
@ depends on !has_read && !has_write && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
@@
struct file_operations fops = {
...
+.llseek = noop_llseek, /* no read or write fn */
};
===== End semantic patch =====
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Julia Lawall <julia@diku.dk>
Cc: Christoph Hellwig <hch@infradead.org>
2010-08-15 16:52:59 +00:00
|
|
|
.llseek = default_llseek,
|
2009-03-22 08:31:04 +00:00
|
|
|
};
|
|
|
|
|
2009-03-22 08:31:17 +00:00
|
|
|
static const struct file_operations ftrace_subsystem_filter_fops = {
|
2011-07-05 15:36:06 +00:00
|
|
|
.open = subsystem_open,
|
2009-03-22 08:31:17 +00:00
|
|
|
.read = subsystem_filter_read,
|
|
|
|
.write = subsystem_filter_write,
|
llseek: automatically add .llseek fop
All file_operations should get a .llseek operation so we can make
nonseekable_open the default for future file operations without a
.llseek pointer.
The three cases that we can automatically detect are no_llseek, seq_lseek
and default_llseek. For cases where we can automatically prove that
the file offset is always ignored, we use noop_llseek, which maintains
the current behavior of not returning an error from a seek.
New drivers should normally not use noop_llseek but instead use no_llseek
and call nonseekable_open at open time. Existing drivers can be converted
to do the same when the maintainer knows for certain that no user code
relies on calling seek on the device file.
The generated code is often incorrectly indented and right now contains
comments that clarify for each added line why a specific variant was
chosen. In the version that gets submitted upstream, the comments will
be gone and I will manually fix the indentation, because there does not
seem to be a way to do that using coccinelle.
Some amount of new code is currently sitting in linux-next that should get
the same modifications, which I will do at the end of the merge window.
Many thanks to Julia Lawall for helping me learn to write a semantic
patch that does all this.
===== begin semantic patch =====
// This adds an llseek= method to all file operations,
// as a preparation for making no_llseek the default.
//
// The rules are
// - use no_llseek explicitly if we do nonseekable_open
// - use seq_lseek for sequential files
// - use default_llseek if we know we access f_pos
// - use noop_llseek if we know we don't access f_pos,
// but we still want to allow users to call lseek
//
@ open1 exists @
identifier nested_open;
@@
nested_open(...)
{
<+...
nonseekable_open(...)
...+>
}
@ open exists@
identifier open_f;
identifier i, f;
identifier open1.nested_open;
@@
int open_f(struct inode *i, struct file *f)
{
<+...
(
nonseekable_open(...)
|
nested_open(...)
)
...+>
}
@ read disable optional_qualifier exists @
identifier read_f;
identifier f, p, s, off;
type ssize_t, size_t, loff_t;
expression E;
identifier func;
@@
ssize_t read_f(struct file *f, char *p, size_t s, loff_t *off)
{
<+...
(
*off = E
|
*off += E
|
func(..., off, ...)
|
E = *off
)
...+>
}
@ read_no_fpos disable optional_qualifier exists @
identifier read_f;
identifier f, p, s, off;
type ssize_t, size_t, loff_t;
@@
ssize_t read_f(struct file *f, char *p, size_t s, loff_t *off)
{
... when != off
}
@ write @
identifier write_f;
identifier f, p, s, off;
type ssize_t, size_t, loff_t;
expression E;
identifier func;
@@
ssize_t write_f(struct file *f, const char *p, size_t s, loff_t *off)
{
<+...
(
*off = E
|
*off += E
|
func(..., off, ...)
|
E = *off
)
...+>
}
@ write_no_fpos @
identifier write_f;
identifier f, p, s, off;
type ssize_t, size_t, loff_t;
@@
ssize_t write_f(struct file *f, const char *p, size_t s, loff_t *off)
{
... when != off
}
@ fops0 @
identifier fops;
@@
struct file_operations fops = {
...
};
@ has_llseek depends on fops0 @
identifier fops0.fops;
identifier llseek_f;
@@
struct file_operations fops = {
...
.llseek = llseek_f,
...
};
@ has_read depends on fops0 @
identifier fops0.fops;
identifier read_f;
@@
struct file_operations fops = {
...
.read = read_f,
...
};
@ has_write depends on fops0 @
identifier fops0.fops;
identifier write_f;
@@
struct file_operations fops = {
...
.write = write_f,
...
};
@ has_open depends on fops0 @
identifier fops0.fops;
identifier open_f;
@@
struct file_operations fops = {
...
.open = open_f,
...
};
// use no_llseek if we call nonseekable_open
////////////////////////////////////////////
@ nonseekable1 depends on !has_llseek && has_open @
identifier fops0.fops;
identifier nso ~= "nonseekable_open";
@@
struct file_operations fops = {
... .open = nso, ...
+.llseek = no_llseek, /* nonseekable */
};
@ nonseekable2 depends on !has_llseek @
identifier fops0.fops;
identifier open.open_f;
@@
struct file_operations fops = {
... .open = open_f, ...
+.llseek = no_llseek, /* open uses nonseekable */
};
// use seq_lseek for sequential files
/////////////////////////////////////
@ seq depends on !has_llseek @
identifier fops0.fops;
identifier sr ~= "seq_read";
@@
struct file_operations fops = {
... .read = sr, ...
+.llseek = seq_lseek, /* we have seq_read */
};
// use default_llseek if there is a readdir
///////////////////////////////////////////
@ fops1 depends on !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier readdir_e;
@@
// any other fop is used that changes pos
struct file_operations fops = {
... .readdir = readdir_e, ...
+.llseek = default_llseek, /* readdir is present */
};
// use default_llseek if at least one of read/write touches f_pos
/////////////////////////////////////////////////////////////////
@ fops2 depends on !fops1 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier read.read_f;
@@
// read fops use offset
struct file_operations fops = {
... .read = read_f, ...
+.llseek = default_llseek, /* read accesses f_pos */
};
@ fops3 depends on !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier write.write_f;
@@
// write fops use offset
struct file_operations fops = {
... .write = write_f, ...
+ .llseek = default_llseek, /* write accesses f_pos */
};
// Use noop_llseek if neither read nor write accesses f_pos
///////////////////////////////////////////////////////////
@ fops4 depends on !fops1 && !fops2 && !fops3 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier read_no_fpos.read_f;
identifier write_no_fpos.write_f;
@@
// neither read nor write fops use offset
struct file_operations fops = {
...
.write = write_f,
.read = read_f,
...
+.llseek = noop_llseek, /* read and write both use no f_pos */
};
@ depends on has_write && !has_read && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier write_no_fpos.write_f;
@@
struct file_operations fops = {
... .write = write_f, ...
+.llseek = noop_llseek, /* write uses no f_pos */
};
@ depends on has_read && !has_write && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier read_no_fpos.read_f;
@@
struct file_operations fops = {
... .read = read_f, ...
+.llseek = noop_llseek, /* read uses no f_pos */
};
@ depends on !has_read && !has_write && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
@@
struct file_operations fops = {
...
+.llseek = noop_llseek, /* no read or write fn */
};
===== End semantic patch =====
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Julia Lawall <julia@diku.dk>
Cc: Christoph Hellwig <hch@infradead.org>
2010-08-15 16:52:59 +00:00
|
|
|
.llseek = default_llseek,
|
2011-07-05 15:36:06 +00:00
|
|
|
.release = subsystem_release,
|
2009-03-22 08:31:17 +00:00
|
|
|
};
|
|
|
|
|
2009-05-07 02:52:15 +00:00
|
|
|
static const struct file_operations ftrace_system_enable_fops = {
|
2011-07-05 18:32:51 +00:00
|
|
|
.open = subsystem_open,
|
2009-05-07 02:52:15 +00:00
|
|
|
.read = system_enable_read,
|
|
|
|
.write = system_enable_write,
|
llseek: automatically add .llseek fop
All file_operations should get a .llseek operation so we can make
nonseekable_open the default for future file operations without a
.llseek pointer.
The three cases that we can automatically detect are no_llseek, seq_lseek
and default_llseek. For cases where we can automatically prove that
the file offset is always ignored, we use noop_llseek, which maintains
the current behavior of not returning an error from a seek.
New drivers should normally not use noop_llseek but instead use no_llseek
and call nonseekable_open at open time. Existing drivers can be converted
to do the same when the maintainer knows for certain that no user code
relies on calling seek on the device file.
The generated code is often incorrectly indented and right now contains
comments that clarify for each added line why a specific variant was
chosen. In the version that gets submitted upstream, the comments will
be gone and I will manually fix the indentation, because there does not
seem to be a way to do that using coccinelle.
Some amount of new code is currently sitting in linux-next that should get
the same modifications, which I will do at the end of the merge window.
Many thanks to Julia Lawall for helping me learn to write a semantic
patch that does all this.
===== begin semantic patch =====
// This adds an llseek= method to all file operations,
// as a preparation for making no_llseek the default.
//
// The rules are
// - use no_llseek explicitly if we do nonseekable_open
// - use seq_lseek for sequential files
// - use default_llseek if we know we access f_pos
// - use noop_llseek if we know we don't access f_pos,
// but we still want to allow users to call lseek
//
@ open1 exists @
identifier nested_open;
@@
nested_open(...)
{
<+...
nonseekable_open(...)
...+>
}
@ open exists@
identifier open_f;
identifier i, f;
identifier open1.nested_open;
@@
int open_f(struct inode *i, struct file *f)
{
<+...
(
nonseekable_open(...)
|
nested_open(...)
)
...+>
}
@ read disable optional_qualifier exists @
identifier read_f;
identifier f, p, s, off;
type ssize_t, size_t, loff_t;
expression E;
identifier func;
@@
ssize_t read_f(struct file *f, char *p, size_t s, loff_t *off)
{
<+...
(
*off = E
|
*off += E
|
func(..., off, ...)
|
E = *off
)
...+>
}
@ read_no_fpos disable optional_qualifier exists @
identifier read_f;
identifier f, p, s, off;
type ssize_t, size_t, loff_t;
@@
ssize_t read_f(struct file *f, char *p, size_t s, loff_t *off)
{
... when != off
}
@ write @
identifier write_f;
identifier f, p, s, off;
type ssize_t, size_t, loff_t;
expression E;
identifier func;
@@
ssize_t write_f(struct file *f, const char *p, size_t s, loff_t *off)
{
<+...
(
*off = E
|
*off += E
|
func(..., off, ...)
|
E = *off
)
...+>
}
@ write_no_fpos @
identifier write_f;
identifier f, p, s, off;
type ssize_t, size_t, loff_t;
@@
ssize_t write_f(struct file *f, const char *p, size_t s, loff_t *off)
{
... when != off
}
@ fops0 @
identifier fops;
@@
struct file_operations fops = {
...
};
@ has_llseek depends on fops0 @
identifier fops0.fops;
identifier llseek_f;
@@
struct file_operations fops = {
...
.llseek = llseek_f,
...
};
@ has_read depends on fops0 @
identifier fops0.fops;
identifier read_f;
@@
struct file_operations fops = {
...
.read = read_f,
...
};
@ has_write depends on fops0 @
identifier fops0.fops;
identifier write_f;
@@
struct file_operations fops = {
...
.write = write_f,
...
};
@ has_open depends on fops0 @
identifier fops0.fops;
identifier open_f;
@@
struct file_operations fops = {
...
.open = open_f,
...
};
// use no_llseek if we call nonseekable_open
////////////////////////////////////////////
@ nonseekable1 depends on !has_llseek && has_open @
identifier fops0.fops;
identifier nso ~= "nonseekable_open";
@@
struct file_operations fops = {
... .open = nso, ...
+.llseek = no_llseek, /* nonseekable */
};
@ nonseekable2 depends on !has_llseek @
identifier fops0.fops;
identifier open.open_f;
@@
struct file_operations fops = {
... .open = open_f, ...
+.llseek = no_llseek, /* open uses nonseekable */
};
// use seq_lseek for sequential files
/////////////////////////////////////
@ seq depends on !has_llseek @
identifier fops0.fops;
identifier sr ~= "seq_read";
@@
struct file_operations fops = {
... .read = sr, ...
+.llseek = seq_lseek, /* we have seq_read */
};
// use default_llseek if there is a readdir
///////////////////////////////////////////
@ fops1 depends on !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier readdir_e;
@@
// any other fop is used that changes pos
struct file_operations fops = {
... .readdir = readdir_e, ...
+.llseek = default_llseek, /* readdir is present */
};
// use default_llseek if at least one of read/write touches f_pos
/////////////////////////////////////////////////////////////////
@ fops2 depends on !fops1 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier read.read_f;
@@
// read fops use offset
struct file_operations fops = {
... .read = read_f, ...
+.llseek = default_llseek, /* read accesses f_pos */
};
@ fops3 depends on !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier write.write_f;
@@
// write fops use offset
struct file_operations fops = {
... .write = write_f, ...
+ .llseek = default_llseek, /* write accesses f_pos */
};
// Use noop_llseek if neither read nor write accesses f_pos
///////////////////////////////////////////////////////////
@ fops4 depends on !fops1 && !fops2 && !fops3 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier read_no_fpos.read_f;
identifier write_no_fpos.write_f;
@@
// neither read nor write fops use offset
struct file_operations fops = {
...
.write = write_f,
.read = read_f,
...
+.llseek = noop_llseek, /* read and write both use no f_pos */
};
@ depends on has_write && !has_read && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier write_no_fpos.write_f;
@@
struct file_operations fops = {
... .write = write_f, ...
+.llseek = noop_llseek, /* write uses no f_pos */
};
@ depends on has_read && !has_write && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier read_no_fpos.read_f;
@@
struct file_operations fops = {
... .read = read_f, ...
+.llseek = noop_llseek, /* read uses no f_pos */
};
@ depends on !has_read && !has_write && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
@@
struct file_operations fops = {
...
+.llseek = noop_llseek, /* no read or write fn */
};
===== End semantic patch =====
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Julia Lawall <julia@diku.dk>
Cc: Christoph Hellwig <hch@infradead.org>
2010-08-15 16:52:59 +00:00
|
|
|
.llseek = default_llseek,
|
2011-07-05 18:32:51 +00:00
|
|
|
.release = subsystem_release,
|
2009-05-07 02:52:15 +00:00
|
|
|
};
|
|
|
|
|
2012-05-04 03:09:03 +00:00
|
|
|
static const struct file_operations ftrace_tr_enable_fops = {
|
|
|
|
.open = system_tr_open,
|
|
|
|
.read = system_enable_read,
|
|
|
|
.write = system_enable_write,
|
|
|
|
.llseek = default_llseek,
|
|
|
|
.release = subsystem_release,
|
|
|
|
};
|
|
|
|
|
2023-12-19 18:54:16 +00:00
|
|
|
/*
 * File operations for a read-only file: opened with
 * tracing_open_generic_tr(), read through show_header_page_file(),
 * seekable via default_llseek() and released with
 * tracing_release_generic_tr().  No .write handler is installed.
 */
static const struct file_operations ftrace_show_header_page_fops = {
|
|
|
|
.open = tracing_open_generic_tr,
|
|
|
|
.read = show_header_page_file,
|
llseek: automatically add .llseek fop
All file_operations should get a .llseek operation so we can make
nonseekable_open the default for future file operations without a
.llseek pointer.
The three cases that we can automatically detect are no_llseek, seq_lseek
and default_llseek. For cases where we can we can automatically prove that
the file offset is always ignored, we use noop_llseek, which maintains
the current behavior of not returning an error from a seek.
New drivers should normally not use noop_llseek but instead use no_llseek
and call nonseekable_open at open time. Existing drivers can be converted
to do the same when the maintainer knows for certain that no user code
relies on calling seek on the device file.
The generated code is often incorrectly indented and right now contains
comments that clarify for each added line why a specific variant was
chosen. In the version that gets submitted upstream, the comments will
be gone and I will manually fix the indentation, because there does not
seem to be a way to do that using coccinelle.
Some amount of new code is currently sitting in linux-next that should get
the same modifications, which I will do at the end of the merge window.
Many thanks to Julia Lawall for helping me learn to write a semantic
patch that does all this.
===== begin semantic patch =====
// This adds an llseek= method to all file operations,
// as a preparation for making no_llseek the default.
//
// The rules are
// - use no_llseek explicitly if we do nonseekable_open
// - use seq_lseek for sequential files
// - use default_llseek if we know we access f_pos
// - use noop_llseek if we know we don't access f_pos,
// but we still want to allow users to call lseek
//
@ open1 exists @
identifier nested_open;
@@
nested_open(...)
{
<+...
nonseekable_open(...)
...+>
}
@ open exists@
identifier open_f;
identifier i, f;
identifier open1.nested_open;
@@
int open_f(struct inode *i, struct file *f)
{
<+...
(
nonseekable_open(...)
|
nested_open(...)
)
...+>
}
@ read disable optional_qualifier exists @
identifier read_f;
identifier f, p, s, off;
type ssize_t, size_t, loff_t;
expression E;
identifier func;
@@
ssize_t read_f(struct file *f, char *p, size_t s, loff_t *off)
{
<+...
(
*off = E
|
*off += E
|
func(..., off, ...)
|
E = *off
)
...+>
}
@ read_no_fpos disable optional_qualifier exists @
identifier read_f;
identifier f, p, s, off;
type ssize_t, size_t, loff_t;
@@
ssize_t read_f(struct file *f, char *p, size_t s, loff_t *off)
{
... when != off
}
@ write @
identifier write_f;
identifier f, p, s, off;
type ssize_t, size_t, loff_t;
expression E;
identifier func;
@@
ssize_t write_f(struct file *f, const char *p, size_t s, loff_t *off)
{
<+...
(
*off = E
|
*off += E
|
func(..., off, ...)
|
E = *off
)
...+>
}
@ write_no_fpos @
identifier write_f;
identifier f, p, s, off;
type ssize_t, size_t, loff_t;
@@
ssize_t write_f(struct file *f, const char *p, size_t s, loff_t *off)
{
... when != off
}
@ fops0 @
identifier fops;
@@
struct file_operations fops = {
...
};
@ has_llseek depends on fops0 @
identifier fops0.fops;
identifier llseek_f;
@@
struct file_operations fops = {
...
.llseek = llseek_f,
...
};
@ has_read depends on fops0 @
identifier fops0.fops;
identifier read_f;
@@
struct file_operations fops = {
...
.read = read_f,
...
};
@ has_write depends on fops0 @
identifier fops0.fops;
identifier write_f;
@@
struct file_operations fops = {
...
.write = write_f,
...
};
@ has_open depends on fops0 @
identifier fops0.fops;
identifier open_f;
@@
struct file_operations fops = {
...
.open = open_f,
...
};
// use no_llseek if we call nonseekable_open
////////////////////////////////////////////
@ nonseekable1 depends on !has_llseek && has_open @
identifier fops0.fops;
identifier nso ~= "nonseekable_open";
@@
struct file_operations fops = {
... .open = nso, ...
+.llseek = no_llseek, /* nonseekable */
};
@ nonseekable2 depends on !has_llseek @
identifier fops0.fops;
identifier open.open_f;
@@
struct file_operations fops = {
... .open = open_f, ...
+.llseek = no_llseek, /* open uses nonseekable */
};
// use seq_lseek for sequential files
/////////////////////////////////////
@ seq depends on !has_llseek @
identifier fops0.fops;
identifier sr ~= "seq_read";
@@
struct file_operations fops = {
... .read = sr, ...
+.llseek = seq_lseek, /* we have seq_read */
};
// use default_llseek if there is a readdir
///////////////////////////////////////////
@ fops1 depends on !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier readdir_e;
@@
// any other fop is used that changes pos
struct file_operations fops = {
... .readdir = readdir_e, ...
+.llseek = default_llseek, /* readdir is present */
};
// use default_llseek if at least one of read/write touches f_pos
/////////////////////////////////////////////////////////////////
@ fops2 depends on !fops1 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier read.read_f;
@@
// read fops use offset
struct file_operations fops = {
... .read = read_f, ...
+.llseek = default_llseek, /* read accesses f_pos */
};
@ fops3 depends on !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier write.write_f;
@@
// write fops use offset
struct file_operations fops = {
... .write = write_f, ...
+ .llseek = default_llseek, /* write accesses f_pos */
};
// Use noop_llseek if neither read nor write accesses f_pos
///////////////////////////////////////////////////////////
@ fops4 depends on !fops1 && !fops2 && !fops3 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier read_no_fpos.read_f;
identifier write_no_fpos.write_f;
@@
// write fops use offset
struct file_operations fops = {
...
.write = write_f,
.read = read_f,
...
+.llseek = noop_llseek, /* read and write both use no f_pos */
};
@ depends on has_write && !has_read && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier write_no_fpos.write_f;
@@
struct file_operations fops = {
... .write = write_f, ...
+.llseek = noop_llseek, /* write uses no f_pos */
};
@ depends on has_read && !has_write && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier read_no_fpos.read_f;
@@
struct file_operations fops = {
... .read = read_f, ...
+.llseek = noop_llseek, /* read uses no f_pos */
};
@ depends on !has_read && !has_write && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
@@
struct file_operations fops = {
...
+.llseek = noop_llseek, /* no read or write fn */
};
===== End semantic patch =====
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Julia Lawall <julia@diku.dk>
Cc: Christoph Hellwig <hch@infradead.org>
2010-08-15 16:52:59 +00:00
|
|
|
.llseek = default_llseek,
|
2023-12-19 18:54:16 +00:00
|
|
|
.release = tracing_release_generic_tr,
|
|
|
|
};
|
|
|
|
|
|
|
|
static const struct file_operations ftrace_show_header_event_fops = {
|
|
|
|
.open = tracing_open_generic_tr,
|
|
|
|
.read = show_header_event_file,
|
|
|
|
.llseek = default_llseek,
|
|
|
|
.release = tracing_release_generic_tr,
|
2009-04-15 20:53:47 +00:00
|
|
|
};
|
|
|
|
|
2012-05-04 03:09:03 +00:00
|
|
|
static int
|
|
|
|
ftrace_event_open(struct inode *inode, struct file *file,
|
|
|
|
const struct seq_operations *seq_ops)
|
2009-02-24 19:15:08 +00:00
|
|
|
{
|
2012-05-04 03:09:03 +00:00
|
|
|
struct seq_file *m;
|
|
|
|
int ret;
|
2009-02-24 19:15:08 +00:00
|
|
|
|
2019-10-11 21:22:50 +00:00
|
|
|
ret = security_locked_down(LOCKDOWN_TRACEFS);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
2012-05-04 03:09:03 +00:00
|
|
|
ret = seq_open(file, seq_ops);
|
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
|
|
|
m = file->private_data;
|
|
|
|
/* copy tr over to seq ops */
|
|
|
|
m->private = inode->i_private;
|
2009-02-24 19:15:08 +00:00
|
|
|
|
2012-05-04 03:09:03 +00:00
|
|
|
return ret;
|
2009-02-24 19:15:08 +00:00
|
|
|
}
|
|
|
|
|
2013-07-18 18:18:44 +00:00
|
|
|
static int ftrace_event_release(struct inode *inode, struct file *file)
|
|
|
|
{
|
|
|
|
struct trace_array *tr = inode->i_private;
|
|
|
|
|
|
|
|
trace_array_put(tr);
|
|
|
|
|
|
|
|
return seq_release(inode, file);
|
|
|
|
}
|
|
|
|
|
2012-05-03 18:57:28 +00:00
|
|
|
static int
|
|
|
|
ftrace_event_avail_open(struct inode *inode, struct file *file)
|
|
|
|
{
|
|
|
|
const struct seq_operations *seq_ops = &show_event_seq_ops;
|
|
|
|
|
2019-10-11 21:22:50 +00:00
|
|
|
/* Checks for tracefs lockdown */
|
2012-05-04 03:09:03 +00:00
|
|
|
return ftrace_event_open(inode, file, seq_ops);
|
2012-05-03 18:57:28 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Open handler using show_set_event_seq_ops.
 *
 * Runs tracing_check_open_get_tr() on the trace_array found in
 * inode->i_private and bails out with its error if it fails.  When the
 * file is opened for writing with O_TRUNC, ftrace_clear_events() is called
 * before the seq_file is set up through ftrace_event_open().  If that
 * final step fails, trace_array_put() undoes the earlier get.
 *
 * Returns the ftrace_event_open() result, or the error from
 * tracing_check_open_get_tr().
 */
static int
|
|
|
|
ftrace_event_set_open(struct inode *inode, struct file *file)
|
|
|
|
{
|
|
|
|
const struct seq_operations *seq_ops = &show_set_event_seq_ops;
|
2012-05-04 03:09:03 +00:00
|
|
|
struct trace_array *tr = inode->i_private;
|
2013-07-18 18:18:44 +00:00
|
|
|
int ret;
|
|
|
|
|
tracing: Add tracing_check_open_get_tr()
Currently, most files in the tracefs directory test if tracing_disabled is
set. If so, it should return -ENODEV. The tracing_disabled is called when
tracing is found to be broken. Originally it was done in case the ring
buffer was found to be corrupted, and we wanted to prevent reading it from
crashing the kernel. But it's also called if a tracing selftest fails on
boot. It's a one way switch. That is, once it is triggered, tracing is
disabled until reboot.
As most tracefs files can also be used by instances in the tracefs
directory, they need to be carefully done. Each instance has a trace_array
associated to it, and when the instance is removed, the trace_array is
freed. But if an instance is opened with a reference to the trace_array,
then it requires looking up the trace_array to get its ref counter (as there
could be a race with it being deleted and the open itself). Once it is
found, a reference is added to prevent the instance from being removed (and
the trace_array associated with it freed).
Combine the two checks (tracing_disabled and trace_array_get()) into a
single helper function. This will also make it easier to add lockdown to
tracefs later.
Link: http://lkml.kernel.org/r/20191011135458.7399da44@gandalf.local.home
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2019-10-11 21:39:57 +00:00
|
|
|
ret = tracing_check_open_get_tr(tr);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
2012-05-03 18:57:28 +00:00
|
|
|
|
|
|
|
if ((file->f_mode & FMODE_WRITE) &&
|
|
|
|
(file->f_flags & O_TRUNC))
|
2012-05-04 03:09:03 +00:00
|
|
|
ftrace_clear_events(tr);
|
2012-05-03 18:57:28 +00:00
|
|
|
|
2013-07-18 18:18:44 +00:00
|
|
|
ret = ftrace_event_open(inode, file, seq_ops);
|
|
|
|
if (ret < 0)
|
|
|
|
trace_array_put(tr);
|
|
|
|
return ret;
|
2012-05-04 03:09:03 +00:00
|
|
|
}
|
|
|
|
|
2015-09-24 15:33:26 +00:00
|
|
|
/*
 * Open handler using show_set_pid_seq_ops.
 *
 * Runs tracing_check_open_get_tr() on the trace_array found in
 * inode->i_private and bails out with its error if it fails.  When the
 * file is opened for writing with O_TRUNC, ftrace_clear_event_pids() is
 * called with TRACE_PIDS before the seq_file is set up through
 * ftrace_event_open().  If that final step fails, trace_array_put() undoes
 * the earlier get.
 *
 * Returns the ftrace_event_open() result, or the error from
 * tracing_check_open_get_tr().
 */
static int
|
|
|
|
ftrace_event_set_pid_open(struct inode *inode, struct file *file)
|
|
|
|
{
|
|
|
|
const struct seq_operations *seq_ops = &show_set_pid_seq_ops;
|
|
|
|
struct trace_array *tr = inode->i_private;
|
|
|
|
int ret;
|
|
|
|
|
tracing: Add tracing_check_open_get_tr()
Currently, most files in the tracefs directory test if tracing_disabled is
set. If so, it should return -ENODEV. The tracing_disabled is called when
tracing is found to be broken. Originally it was done in case the ring
buffer was found to be corrupted, and we wanted to prevent reading it from
crashing the kernel. But it's also called if a tracing selftest fails on
boot. It's a one way switch. That is, once it is triggered, tracing is
disabled until reboot.
As most tracefs files can also be used by instances in the tracefs
directory, they need to be carefully done. Each instance has a trace_array
associated to it, and when the instance is removed, the trace_array is
freed. But if an instance is opened with a reference to the trace_array,
then it requires looking up the trace_array to get its ref counter (as there
could be a race with it being deleted and the open itself). Once it is
found, a reference is added to prevent the instance from being removed (and
the trace_array associated with it freed).
Combine the two checks (tracing_disabled and trace_array_get()) into a
single helper function. This will also make it easier to add lockdown to
tracefs later.
Link: http://lkml.kernel.org/r/20191011135458.7399da44@gandalf.local.home
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2019-10-11 21:39:57 +00:00
|
|
|
ret = tracing_check_open_get_tr(tr);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
2015-09-24 15:33:26 +00:00
|
|
|
|
|
|
|
if ((file->f_mode & FMODE_WRITE) &&
|
|
|
|
(file->f_flags & O_TRUNC))
|
2020-03-25 23:51:19 +00:00
|
|
|
ftrace_clear_event_pids(tr, TRACE_PIDS);
|
|
|
|
|
|
|
|
ret = ftrace_event_open(inode, file, seq_ops);
|
|
|
|
if (ret < 0)
|
|
|
|
trace_array_put(tr);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
ftrace_event_set_npid_open(struct inode *inode, struct file *file)
|
|
|
|
{
|
|
|
|
const struct seq_operations *seq_ops = &show_set_no_pid_seq_ops;
|
|
|
|
struct trace_array *tr = inode->i_private;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = tracing_check_open_get_tr(tr);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
if ((file->f_mode & FMODE_WRITE) &&
|
|
|
|
(file->f_flags & O_TRUNC))
|
|
|
|
ftrace_clear_event_pids(tr, TRACE_NO_PIDS);
|
2015-09-24 15:33:26 +00:00
|
|
|
|
|
|
|
ret = ftrace_event_open(inode, file, seq_ops);
|
|
|
|
if (ret < 0)
|
|
|
|
trace_array_put(tr);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2012-05-04 03:09:03 +00:00
|
|
|
static struct event_subsystem *
|
|
|
|
create_new_subsystem(const char *name)
|
|
|
|
{
|
|
|
|
struct event_subsystem *system;
|
|
|
|
|
|
|
|
/* need to create new entry */
|
|
|
|
system = kmalloc(sizeof(*system), GFP_KERNEL);
|
|
|
|
if (!system)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
system->ref_count = 1;
|
2013-06-27 14:58:31 +00:00
|
|
|
|
|
|
|
/* Only allocate if dynamic (kprobes and modules) */
|
2015-09-09 21:24:01 +00:00
|
|
|
system->name = kstrdup_const(name, GFP_KERNEL);
|
|
|
|
if (!system->name)
|
|
|
|
goto out_free;
|
2012-05-04 03:09:03 +00:00
|
|
|
|
|
|
|
system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL);
|
|
|
|
if (!system->filter)
|
|
|
|
goto out_free;
|
|
|
|
|
|
|
|
list_add(&system->list, &event_subsystems);
|
|
|
|
|
|
|
|
return system;
|
|
|
|
|
|
|
|
out_free:
|
2015-09-09 21:24:01 +00:00
|
|
|
kfree_const(system->name);
|
2012-05-04 03:09:03 +00:00
|
|
|
kfree(system);
|
|
|
|
return NULL;
|
2012-05-03 18:57:28 +00:00
|
|
|
}
|
|
|
|
|
2023-10-05 14:47:45 +00:00
|
|
|
/*
 * eventfs entry callback for the per-subsystem "filter" and "enable" files.
 *
 * Picks the file_operations from @name: "filter" selects
 * ftrace_subsystem_filter_fops and "enable" selects
 * ftrace_system_enable_fops.  For those two names *mode is set to
 * TRACE_MODE_WRITE and 1 is returned.  Any other name returns 0 and leaves
 * *fops and *mode untouched.  @data is not modified here.
 */
static int system_callback(const char *name, umode_t *mode, void **data,
|
eventfs: Remove eventfs_file and just use eventfs_inode
Instead of having a descriptor for every file represented in the eventfs
directory, only have the directory itself represented. Change the API to
send in a list of entries that represent all the files in the directory
(but not other directories). The entry list contains a name and a callback
function that will be used to create the files when they are accessed.
struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent,
const struct eventfs_entry *entries,
int size, void *data);
is used for the top level eventfs directory, and returns an eventfs_inode
that will be used by:
struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent,
const struct eventfs_entry *entries,
int size, void *data);
where both of the above take an array of struct eventfs_entry entries for
every file that is in the directory.
The entries are defined by:
typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data,
const struct file_operations **fops);
struct eventfs_entry {
const char *name;
eventfs_callback callback;
};
Where the name is the name of the file and the callback gets called when
the file is being created. The callback passes in the name (in case the
same callback is used for multiple files), a pointer to the mode, data and
fops. The data will be pointing to the data that was passed in
eventfs_create_dir() or eventfs_create_events_dir() but may be overridden
to point to something else, as it will be used to point to the
inode->i_private that is created. The information passed back from the
callback is used to create the dentry/inode.
If the callback fills the data and the file should be created, it must
return a positive number. On zero or negative, the file is ignored.
This logic may also be used as a prototype to convert entire pseudo file
systems into just-in-time allocation.
The "show_events_dentry" file has been updated to show the directories,
and any files they have.
With just the eventfs_file allocations:
Before after deltas for meminfo (in kB):
MemFree: -14360
MemAvailable: -14260
Buffers: 40
Cached: 24
Active: 44
Inactive: 48
Inactive(anon): 28
Active(file): 44
Inactive(file): 20
Dirty: -4
AnonPages: 28
Mapped: 4
KReclaimable: 132
Slab: 1604
SReclaimable: 132
SUnreclaim: 1472
Committed_AS: 12
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
ext4_inode_cache 27 [* 1184 = 31968 ]
extent_status 102 [* 40 = 4080 ]
tracefs_inode_cache 144 [* 656 = 94464 ]
buffer_head 39 [* 104 = 4056 ]
shmem_inode_cache 49 [* 800 = 39200 ]
filp -53 [* 256 = -13568 ]
dentry 251 [* 192 = 48192 ]
lsm_file_cache 277 [* 32 = 8864 ]
vm_area_struct -14 [* 184 = -2576 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k 35 [* 1024 = 35840 ]
kmalloc-256 49 [* 256 = 12544 ]
kmalloc-192 -28 [* 192 = -5376 ]
kmalloc-128 -30 [* 128 = -3840 ]
kmalloc-96 10581 [* 96 = 1015776 ]
kmalloc-64 3056 [* 64 = 195584 ]
kmalloc-32 1291 [* 32 = 41312 ]
kmalloc-16 2310 [* 16 = 36960 ]
kmalloc-8 9216 [* 8 = 73728 ]
Free memory dropped by 14,360 kB
Available memory dropped by 14,260 kB
Total slab additions in size: 1,771,032 bytes
With this change:
Before after deltas for meminfo (in kB):
MemFree: -12084
MemAvailable: -11976
Buffers: 32
Cached: 32
Active: 72
Inactive: 168
Inactive(anon): 176
Active(file): 72
Inactive(file): -8
Dirty: 24
AnonPages: 196
Mapped: 8
KReclaimable: 148
Slab: 836
SReclaimable: 148
SUnreclaim: 688
Committed_AS: 324
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
tracefs_inode_cache 144 [* 656 = 94464 ]
shmem_inode_cache -23 [* 800 = -18400 ]
filp -92 [* 256 = -23552 ]
dentry 179 [* 192 = 34368 ]
lsm_file_cache -3 [* 32 = -96 ]
vm_area_struct -13 [* 184 = -2392 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k -49 [* 1024 = -50176 ]
kmalloc-256 -27 [* 256 = -6912 ]
kmalloc-128 1864 [* 128 = 238592 ]
kmalloc-64 4685 [* 64 = 299840 ]
kmalloc-32 -72 [* 32 = -2304 ]
kmalloc-16 256 [* 16 = 4096 ]
total = 721352
Free memory dropped by 12,084 kB
Available memory dropped by 11,976 kB
Total slab additions in size: 721,352 bytes
That's over 2 MB in savings per instance for free and available memory,
and over 1 MB in savings per instance of slab memory.
Link: https://lore.kernel.org/linux-trace-kernel/20231003184059.4924468e@gandalf.local.home
Link: https://lore.kernel.org/linux-trace-kernel/20231004165007.43d79161@gandalf.local.home
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ajay Kaher <akaher@vmware.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-10-04 20:50:07 +00:00
|
|
|
const struct file_operations **fops)
|
|
|
|
{
|
|
|
|
if (strcmp(name, "filter") == 0)
|
|
|
|
*fops = &ftrace_subsystem_filter_fops;
|
|
|
|
|
|
|
|
else if (strcmp(name, "enable") == 0)
|
|
|
|
*fops = &ftrace_system_enable_fops;
|
|
|
|
|
|
|
|
else
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
*mode = TRACE_MODE_WRITE;
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct eventfs_inode *
|
2012-05-04 03:09:03 +00:00
|
|
|
event_subsystem_dir(struct trace_array *tr, const char *name,
|
eventfs: Remove eventfs_file and just use eventfs_inode
Instead of having a descriptor for every file represented in the eventfs
directory, only have the directory itself represented. Change the API to
send in a list of entries that represent all the files in the directory
(but not other directories). The entry list contains a name and a callback
function that will be used to create the files when they are accessed.
struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent,
const struct eventfs_entry *entries,
int size, void *data);
is used for the top level eventfs directory, and returns an eventfs_inode
that will be used by:
struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent,
const struct eventfs_entry *entries,
int size, void *data);
where both of the above take an array of struct eventfs_entry entries for
every file that is in the directory.
The entries are defined by:
typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data,
const struct file_operations **fops);
struct eventfs_entry {
const char *name;
eventfs_callback callback;
};
Where the name is the name of the file and the callback gets called when
the file is being created. The callback passes in the name (in case the
same callback is used for multiple files), a pointer to the mode, data and
fops. The data will be pointing to the data that was passed in
eventfs_create_dir() or eventfs_create_events_dir() but may be overridden
to point to something else, as it will be used to point to the
inode->i_private that is created. The information passed back from the
callback is used to create the dentry/inode.
If the callback fills the data and the file should be created, it must
return a positive number. On zero or negative, the file is ignored.
This logic may also be used as a prototype to convert entire pseudo file
systems into just-in-time allocation.
The "show_events_dentry" file has been updated to show the directories,
and any files they have.
With just the eventfs_file allocations:
Before after deltas for meminfo (in kB):
MemFree: -14360
MemAvailable: -14260
Buffers: 40
Cached: 24
Active: 44
Inactive: 48
Inactive(anon): 28
Active(file): 44
Inactive(file): 20
Dirty: -4
AnonPages: 28
Mapped: 4
KReclaimable: 132
Slab: 1604
SReclaimable: 132
SUnreclaim: 1472
Committed_AS: 12
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
ext4_inode_cache 27 [* 1184 = 31968 ]
extent_status 102 [* 40 = 4080 ]
tracefs_inode_cache 144 [* 656 = 94464 ]
buffer_head 39 [* 104 = 4056 ]
shmem_inode_cache 49 [* 800 = 39200 ]
filp -53 [* 256 = -13568 ]
dentry 251 [* 192 = 48192 ]
lsm_file_cache 277 [* 32 = 8864 ]
vm_area_struct -14 [* 184 = -2576 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k 35 [* 1024 = 35840 ]
kmalloc-256 49 [* 256 = 12544 ]
kmalloc-192 -28 [* 192 = -5376 ]
kmalloc-128 -30 [* 128 = -3840 ]
kmalloc-96 10581 [* 96 = 1015776 ]
kmalloc-64 3056 [* 64 = 195584 ]
kmalloc-32 1291 [* 32 = 41312 ]
kmalloc-16 2310 [* 16 = 36960 ]
kmalloc-8 9216 [* 8 = 73728 ]
Free memory dropped by 14,360 kB
Available memory dropped by 14,260 kB
Total slab additions in size: 1,771,032 bytes
With this change:
Before after deltas for meminfo (in kB):
MemFree: -12084
MemAvailable: -11976
Buffers: 32
Cached: 32
Active: 72
Inactive: 168
Inactive(anon): 176
Active(file): 72
Inactive(file): -8
Dirty: 24
AnonPages: 196
Mapped: 8
KReclaimable: 148
Slab: 836
SReclaimable: 148
SUnreclaim: 688
Committed_AS: 324
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
tracefs_inode_cache 144 [* 656 = 94464 ]
shmem_inode_cache -23 [* 800 = -18400 ]
filp -92 [* 256 = -23552 ]
dentry 179 [* 192 = 34368 ]
lsm_file_cache -3 [* 32 = -96 ]
vm_area_struct -13 [* 184 = -2392 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k -49 [* 1024 = -50176 ]
kmalloc-256 -27 [* 256 = -6912 ]
kmalloc-128 1864 [* 128 = 238592 ]
kmalloc-64 4685 [* 64 = 299840 ]
kmalloc-32 -72 [* 32 = -2304 ]
kmalloc-16 256 [* 16 = 4096 ]
total = 721352
Free memory dropped by 12,084 kB
Available memory dropped by 11,976 kB
Total slab additions in size: 721,352 bytes
That's over 2 MB in savings per instance for free and available memory,
and over 1 MB in savings per instance of slab memory.
Link: https://lore.kernel.org/linux-trace-kernel/20231003184059.4924468e@gandalf.local.home
Link: https://lore.kernel.org/linux-trace-kernel/20231004165007.43d79161@gandalf.local.home
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ajay Kaher <akaher@vmware.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-10-04 20:50:07 +00:00
|
|
|
struct trace_event_file *file, struct eventfs_inode *parent)
|
2009-02-28 02:33:02 +00:00
|
|
|
{
|
2022-04-27 17:07:34 +00:00
|
|
|
struct event_subsystem *system, *iter;
|
2015-05-13 18:59:40 +00:00
|
|
|
struct trace_subsystem_dir *dir;
|
eventfs: Remove eventfs_file and just use eventfs_inode
Instead of having a descriptor for every file represented in the eventfs
directory, only have the directory itself represented. Change the API to
send in a list of entries that represent all the files in the directory
(but not other directories). The entry list contains a name and a callback
function that will be used to create the files when they are accessed.
struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent,
const struct eventfs_entry *entries,
int size, void *data);
is used for the top level eventfs directory, and returns an eventfs_inode
that will be used by:
struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent,
const struct eventfs_entry *entries,
int size, void *data);
where both of the above take an array of struct eventfs_entry entries for
every file that is in the directory.
The entries are defined by:
typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data,
const struct file_operations **fops);
struct eventfs_entry {
const char *name;
eventfs_callback callback;
};
Where the name is the name of the file and the callback gets called when
the file is being created. The callback passes in the name (in case the
same callback is used for multiple files), a pointer to the mode, data and
fops. The data will be pointing to the data that was passed in
eventfs_create_dir() or eventfs_create_events_dir() but may be overridden
to point to something else, as it will be used to point to the
inode->i_private that is created. The information passed back from the
callback is used to create the dentry/inode.
If the callback fills the data and the file should be created, it must
return a positive number. On zero or negative, the file is ignored.
This logic may also be used as a prototype to convert entire pseudo file
systems into just-in-time allocation.
The "show_events_dentry" file has been updated to show the directories,
and any files they have.
With just the eventfs_file allocations:
Before after deltas for meminfo (in kB):
MemFree: -14360
MemAvailable: -14260
Buffers: 40
Cached: 24
Active: 44
Inactive: 48
Inactive(anon): 28
Active(file): 44
Inactive(file): 20
Dirty: -4
AnonPages: 28
Mapped: 4
KReclaimable: 132
Slab: 1604
SReclaimable: 132
SUnreclaim: 1472
Committed_AS: 12
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
ext4_inode_cache 27 [* 1184 = 31968 ]
extent_status 102 [* 40 = 4080 ]
tracefs_inode_cache 144 [* 656 = 94464 ]
buffer_head 39 [* 104 = 4056 ]
shmem_inode_cache 49 [* 800 = 39200 ]
filp -53 [* 256 = -13568 ]
dentry 251 [* 192 = 48192 ]
lsm_file_cache 277 [* 32 = 8864 ]
vm_area_struct -14 [* 184 = -2576 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k 35 [* 1024 = 35840 ]
kmalloc-256 49 [* 256 = 12544 ]
kmalloc-192 -28 [* 192 = -5376 ]
kmalloc-128 -30 [* 128 = -3840 ]
kmalloc-96 10581 [* 96 = 1015776 ]
kmalloc-64 3056 [* 64 = 195584 ]
kmalloc-32 1291 [* 32 = 41312 ]
kmalloc-16 2310 [* 16 = 36960 ]
kmalloc-8 9216 [* 8 = 73728 ]
Free memory dropped by 14,360 kB
Available memory dropped by 14,260 kB
Total slab additions in size: 1,771,032 bytes
With this change:
Before after deltas for meminfo (in kB):
MemFree: -12084
MemAvailable: -11976
Buffers: 32
Cached: 32
Active: 72
Inactive: 168
Inactive(anon): 176
Active(file): 72
Inactive(file): -8
Dirty: 24
AnonPages: 196
Mapped: 8
KReclaimable: 148
Slab: 836
SReclaimable: 148
SUnreclaim: 688
Committed_AS: 324
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
tracefs_inode_cache 144 [* 656 = 94464 ]
shmem_inode_cache -23 [* 800 = -18400 ]
filp -92 [* 256 = -23552 ]
dentry 179 [* 192 = 34368 ]
lsm_file_cache -3 [* 32 = -96 ]
vm_area_struct -13 [* 184 = -2392 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k -49 [* 1024 = -50176 ]
kmalloc-256 -27 [* 256 = -6912 ]
kmalloc-128 1864 [* 128 = 238592 ]
kmalloc-64 4685 [* 64 = 299840 ]
kmalloc-32 -72 [* 32 = -2304 ]
kmalloc-16 256 [* 16 = 4096 ]
total = 721352
Free memory dropped by 12,084 kB
Available memory dropped by 11,976 kB
Total slab additions in size: 721,352 bytes
That's over 2 MB in savings per instance for free and available memory,
and over 1 MB in savings per instance of slab memory.
Link: https://lore.kernel.org/linux-trace-kernel/20231003184059.4924468e@gandalf.local.home
Link: https://lore.kernel.org/linux-trace-kernel/20231004165007.43d79161@gandalf.local.home
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ajay Kaher <akaher@vmware.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-10-04 20:50:07 +00:00
|
|
|
struct eventfs_inode *ei;
|
|
|
|
int nr_entries;
|
|
|
|
static struct eventfs_entry system_entries[] = {
|
|
|
|
{
|
|
|
|
.name = "filter",
|
|
|
|
.callback = system_callback,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.name = "enable",
|
|
|
|
.callback = system_callback,
|
|
|
|
}
|
|
|
|
};
|
2009-02-28 02:33:02 +00:00
|
|
|
|
|
|
|
/* First see if we did not already create this dir */
|
2012-05-04 03:09:03 +00:00
|
|
|
list_for_each_entry(dir, &tr->systems, list) {
|
|
|
|
system = dir->subsystem;
|
2009-07-09 08:22:22 +00:00
|
|
|
if (strcmp(system->name, name) == 0) {
|
2012-05-04 03:09:03 +00:00
|
|
|
dir->nr_events++;
|
|
|
|
file->system = dir;
|
eventfs: Remove eventfs_file and just use eventfs_inode
Instead of having a descriptor for every file represented in the eventfs
directory, only have the directory itself represented. Change the API to
send in a list of entries that represent all the files in the directory
(but not other directories). The entry list contains a name and a callback
function that will be used to create the files when they are accessed.
struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent,
const struct eventfs_entry *entries,
int size, void *data);
is used for the top level eventfs directory, and returns an eventfs_inode
that will be used by:
struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent,
const struct eventfs_entry *entries,
int size, void *data);
where both of the above take an array of struct eventfs_entry entries for
every file that is in the directory.
The entries are defined by:
typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data,
const struct file_operations **fops);
struct eventfs_entry {
const char *name;
eventfs_callback callback;
};
Where the name is the name of the file and the callback gets called when
the file is being created. The callback passes in the name (in case the
same callback is used for multiple files), a pointer to the mode, data and
fops. The data will be pointing to the data that was passed in
eventfs_create_dir() or eventfs_create_events_dir() but may be overridden
to point to something else, as it will be used to point to the
inode->i_private that is created. The information passed back from the
callback is used to create the dentry/inode.
If the callback fills the data and the file should be created, it must
return a positive number. On zero or negative, the file is ignored.
This logic may also be used as a prototype to convert entire pseudo file
systems into just-in-time allocation.
The "show_events_dentry" file has been updated to show the directories,
and any files they have.
With just the eventfs_file allocations:
Before after deltas for meminfo (in kB):
MemFree: -14360
MemAvailable: -14260
Buffers: 40
Cached: 24
Active: 44
Inactive: 48
Inactive(anon): 28
Active(file): 44
Inactive(file): 20
Dirty: -4
AnonPages: 28
Mapped: 4
KReclaimable: 132
Slab: 1604
SReclaimable: 132
SUnreclaim: 1472
Committed_AS: 12
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
ext4_inode_cache 27 [* 1184 = 31968 ]
extent_status 102 [* 40 = 4080 ]
tracefs_inode_cache 144 [* 656 = 94464 ]
buffer_head 39 [* 104 = 4056 ]
shmem_inode_cache 49 [* 800 = 39200 ]
filp -53 [* 256 = -13568 ]
dentry 251 [* 192 = 48192 ]
lsm_file_cache 277 [* 32 = 8864 ]
vm_area_struct -14 [* 184 = -2576 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k 35 [* 1024 = 35840 ]
kmalloc-256 49 [* 256 = 12544 ]
kmalloc-192 -28 [* 192 = -5376 ]
kmalloc-128 -30 [* 128 = -3840 ]
kmalloc-96 10581 [* 96 = 1015776 ]
kmalloc-64 3056 [* 64 = 195584 ]
kmalloc-32 1291 [* 32 = 41312 ]
kmalloc-16 2310 [* 16 = 36960 ]
kmalloc-8 9216 [* 8 = 73728 ]
Free memory dropped by 14,360 kB
Available memory dropped by 14,260 kB
Total slab additions in size: 1,771,032 bytes
With this change:
Before after deltas for meminfo (in kB):
MemFree: -12084
MemAvailable: -11976
Buffers: 32
Cached: 32
Active: 72
Inactive: 168
Inactive(anon): 176
Active(file): 72
Inactive(file): -8
Dirty: 24
AnonPages: 196
Mapped: 8
KReclaimable: 148
Slab: 836
SReclaimable: 148
SUnreclaim: 688
Committed_AS: 324
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
tracefs_inode_cache 144 [* 656 = 94464 ]
shmem_inode_cache -23 [* 800 = -18400 ]
filp -92 [* 256 = -23552 ]
dentry 179 [* 192 = 34368 ]
lsm_file_cache -3 [* 32 = -96 ]
vm_area_struct -13 [* 184 = -2392 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k -49 [* 1024 = -50176 ]
kmalloc-256 -27 [* 256 = -6912 ]
kmalloc-128 1864 [* 128 = 238592 ]
kmalloc-64 4685 [* 64 = 299840 ]
kmalloc-32 -72 [* 32 = -2304 ]
kmalloc-16 256 [* 16 = 4096 ]
total = 721352
Free memory dropped by 12,084 kB
Available memory dropped by 11,976 kB
Total slab additions in size: 721,352 bytes
That's over 2 MB in savings per instance for free and available memory,
and over 1 MB in savings per instance of slab memory.
Link: https://lore.kernel.org/linux-trace-kernel/20231003184059.4924468e@gandalf.local.home
Link: https://lore.kernel.org/linux-trace-kernel/20231004165007.43d79161@gandalf.local.home
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ajay Kaher <akaher@vmware.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-10-04 20:50:07 +00:00
|
|
|
return dir->ei;
|
2009-07-09 08:22:22 +00:00
|
|
|
}
|
2009-02-28 02:33:02 +00:00
|
|
|
}
|
|
|
|
|
2012-05-04 03:09:03 +00:00
|
|
|
/* Now see if the system itself exists. */
|
2022-04-27 17:07:34 +00:00
|
|
|
system = NULL;
|
|
|
|
list_for_each_entry(iter, &event_subsystems, list) {
|
|
|
|
if (strcmp(iter->name, name) == 0) {
|
|
|
|
system = iter;
|
2012-05-04 03:09:03 +00:00
|
|
|
break;
|
2022-04-27 17:07:34 +00:00
|
|
|
}
|
2009-02-28 02:33:02 +00:00
|
|
|
}
|
|
|
|
|
2012-05-04 03:09:03 +00:00
|
|
|
dir = kmalloc(sizeof(*dir), GFP_KERNEL);
|
|
|
|
if (!dir)
|
|
|
|
goto out_fail;
|
2009-02-28 02:33:02 +00:00
|
|
|
|
2012-05-04 03:09:03 +00:00
|
|
|
if (!system) {
|
|
|
|
system = create_new_subsystem(name);
|
|
|
|
if (!system)
|
|
|
|
goto out_free;
|
|
|
|
} else
|
|
|
|
__get_system(system);
|
|
|
|
|
eventfs: Remove eventfs_file and just use eventfs_inode
Instead of having a descriptor for every file represented in the eventfs
directory, only have the directory itself represented. Change the API to
send in a list of entries that represent all the files in the directory
(but not other directories). The entry list contains a name and a callback
function that will be used to create the files when they are accessed.
struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent,
const struct eventfs_entry *entries,
int size, void *data);
is used for the top level eventfs directory, and returns an eventfs_inode
that will be used by:
struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent,
const struct eventfs_entry *entries,
int size, void *data);
where both of the above take an array of struct eventfs_entry entries for
every file that is in the directory.
The entries are defined by:
typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data,
const struct file_operations **fops);
struct eventfs_entry {
const char *name;
eventfs_callback callback;
};
Where the name is the name of the file and the callback gets called when
the file is being created. The callback passes in the name (in case the
same callback is used for multiple files), a pointer to the mode, data and
fops. The data will be pointing to the data that was passed in
eventfs_create_dir() or eventfs_create_events_dir() but may be overridden
to point to something else, as it will be used to point to the
inode->i_private that is created. The information passed back from the
callback is used to create the dentry/inode.
If the callback fills the data and the file should be created, it must
return a positive number. On zero or negative, the file is ignored.
This logic may also be used as a prototype to convert entire pseudo file
systems into just-in-time allocation.
The "show_events_dentry" file has been updated to show the directories,
and any files they have.
With just the eventfs_file allocations:
Before after deltas for meminfo (in kB):
MemFree: -14360
MemAvailable: -14260
Buffers: 40
Cached: 24
Active: 44
Inactive: 48
Inactive(anon): 28
Active(file): 44
Inactive(file): 20
Dirty: -4
AnonPages: 28
Mapped: 4
KReclaimable: 132
Slab: 1604
SReclaimable: 132
SUnreclaim: 1472
Committed_AS: 12
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
ext4_inode_cache 27 [* 1184 = 31968 ]
extent_status 102 [* 40 = 4080 ]
tracefs_inode_cache 144 [* 656 = 94464 ]
buffer_head 39 [* 104 = 4056 ]
shmem_inode_cache 49 [* 800 = 39200 ]
filp -53 [* 256 = -13568 ]
dentry 251 [* 192 = 48192 ]
lsm_file_cache 277 [* 32 = 8864 ]
vm_area_struct -14 [* 184 = -2576 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k 35 [* 1024 = 35840 ]
kmalloc-256 49 [* 256 = 12544 ]
kmalloc-192 -28 [* 192 = -5376 ]
kmalloc-128 -30 [* 128 = -3840 ]
kmalloc-96 10581 [* 96 = 1015776 ]
kmalloc-64 3056 [* 64 = 195584 ]
kmalloc-32 1291 [* 32 = 41312 ]
kmalloc-16 2310 [* 16 = 36960 ]
kmalloc-8 9216 [* 8 = 73728 ]
Free memory dropped by 14,360 kB
Available memory dropped by 14,260 kB
Total slab additions in size: 1,771,032 bytes
With this change:
Before after deltas for meminfo (in kB):
MemFree: -12084
MemAvailable: -11976
Buffers: 32
Cached: 32
Active: 72
Inactive: 168
Inactive(anon): 176
Active(file): 72
Inactive(file): -8
Dirty: 24
AnonPages: 196
Mapped: 8
KReclaimable: 148
Slab: 836
SReclaimable: 148
SUnreclaim: 688
Committed_AS: 324
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
tracefs_inode_cache 144 [* 656 = 94464 ]
shmem_inode_cache -23 [* 800 = -18400 ]
filp -92 [* 256 = -23552 ]
dentry 179 [* 192 = 34368 ]
lsm_file_cache -3 [* 32 = -96 ]
vm_area_struct -13 [* 184 = -2392 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k -49 [* 1024 = -50176 ]
kmalloc-256 -27 [* 256 = -6912 ]
kmalloc-128 1864 [* 128 = 238592 ]
kmalloc-64 4685 [* 64 = 299840 ]
kmalloc-32 -72 [* 32 = -2304 ]
kmalloc-16 256 [* 16 = 4096 ]
total = 721352
Free memory dropped by 12,084 kB
Available memory dropped by 11,976 kB
Total slab additions in size: 721,352 bytes
That's over 2 MB in savings per instance for free and available memory,
and over 1 MB in savings per instance of slab memory.
Link: https://lore.kernel.org/linux-trace-kernel/20231003184059.4924468e@gandalf.local.home
Link: https://lore.kernel.org/linux-trace-kernel/20231004165007.43d79161@gandalf.local.home
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ajay Kaher <akaher@vmware.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-10-04 20:50:07 +00:00
|
|
|
/* ftrace only has directories no files */
|
|
|
|
if (strcmp(name, "ftrace") == 0)
|
|
|
|
nr_entries = 0;
|
|
|
|
else
|
|
|
|
nr_entries = ARRAY_SIZE(system_entries);
|
|
|
|
|
|
|
|
ei = eventfs_create_dir(name, parent, system_entries, nr_entries, dir);
|
2023-10-20 13:52:45 +00:00
|
|
|
if (IS_ERR(ei)) {
|
2014-06-07 11:43:08 +00:00
|
|
|
pr_warn("Failed to create system directory %s\n", name);
|
2012-05-04 03:09:03 +00:00
|
|
|
__put_system(system);
|
|
|
|
goto out_free;
|
2009-04-10 18:53:50 +00:00
|
|
|
}
|
|
|
|
|
eventfs: Remove eventfs_file and just use eventfs_inode
Instead of having a descriptor for every file represented in the eventfs
directory, only have the directory itself represented. Change the API to
send in a list of entries that represent all the files in the directory
(but not other directories). The entry list contains a name and a callback
function that will be used to create the files when they are accessed.
struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent,
const struct eventfs_entry *entries,
int size, void *data);
is used for the top level eventfs directory, and returns an eventfs_inode
that will be used by:
struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent,
const struct eventfs_entry *entries,
int size, void *data);
where both of the above take an array of struct eventfs_entry entries for
every file that is in the directory.
The entries are defined by:
typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data,
const struct file_operations **fops);
struct eventfs_entry {
const char *name;
eventfs_callback callback;
};
Where the name is the name of the file and the callback gets called when
the file is being created. The callback passes in the name (in case the
same callback is used for multiple files), a pointer to the mode, data and
fops. The data will be pointing to the data that was passed in
eventfs_create_dir() or eventfs_create_events_dir() but may be overridden
to point to something else, as it will be used to point to the
inode->i_private that is created. The information passed back from the
callback is used to create the dentry/inode.
If the callback fills the data and the file should be created, it must
return a positive number. On zero or negative, the file is ignored.
This logic may also be used as a prototype to convert entire pseudo file
systems into just-in-time allocation.
The "show_events_dentry" file has been updated to show the directories,
and any files they have.
With just the eventfs_file allocations:
Before after deltas for meminfo (in kB):
MemFree: -14360
MemAvailable: -14260
Buffers: 40
Cached: 24
Active: 44
Inactive: 48
Inactive(anon): 28
Active(file): 44
Inactive(file): 20
Dirty: -4
AnonPages: 28
Mapped: 4
KReclaimable: 132
Slab: 1604
SReclaimable: 132
SUnreclaim: 1472
Committed_AS: 12
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
ext4_inode_cache 27 [* 1184 = 31968 ]
extent_status 102 [* 40 = 4080 ]
tracefs_inode_cache 144 [* 656 = 94464 ]
buffer_head 39 [* 104 = 4056 ]
shmem_inode_cache 49 [* 800 = 39200 ]
filp -53 [* 256 = -13568 ]
dentry 251 [* 192 = 48192 ]
lsm_file_cache 277 [* 32 = 8864 ]
vm_area_struct -14 [* 184 = -2576 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k 35 [* 1024 = 35840 ]
kmalloc-256 49 [* 256 = 12544 ]
kmalloc-192 -28 [* 192 = -5376 ]
kmalloc-128 -30 [* 128 = -3840 ]
kmalloc-96 10581 [* 96 = 1015776 ]
kmalloc-64 3056 [* 64 = 195584 ]
kmalloc-32 1291 [* 32 = 41312 ]
kmalloc-16 2310 [* 16 = 36960 ]
kmalloc-8 9216 [* 8 = 73728 ]
Free memory dropped by 14,360 kB
Available memory dropped by 14,260 kB
Total slab additions in size: 1,771,032 bytes
With this change:
Before after deltas for meminfo (in kB):
MemFree: -12084
MemAvailable: -11976
Buffers: 32
Cached: 32
Active: 72
Inactive: 168
Inactive(anon): 176
Active(file): 72
Inactive(file): -8
Dirty: 24
AnonPages: 196
Mapped: 8
KReclaimable: 148
Slab: 836
SReclaimable: 148
SUnreclaim: 688
Committed_AS: 324
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
tracefs_inode_cache 144 [* 656 = 94464 ]
shmem_inode_cache -23 [* 800 = -18400 ]
filp -92 [* 256 = -23552 ]
dentry 179 [* 192 = 34368 ]
lsm_file_cache -3 [* 32 = -96 ]
vm_area_struct -13 [* 184 = -2392 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k -49 [* 1024 = -50176 ]
kmalloc-256 -27 [* 256 = -6912 ]
kmalloc-128 1864 [* 128 = 238592 ]
kmalloc-64 4685 [* 64 = 299840 ]
kmalloc-32 -72 [* 32 = -2304 ]
kmalloc-16 256 [* 16 = 4096 ]
total = 721352
Free memory dropped by 12,084 kB
Available memory dropped by 11,976 kB
Total slab additions in size: 721,352 bytes
That's over 2 MB in savings per instance for free and available memory,
and over 1 MB in savings per instance of slab memory.
Link: https://lore.kernel.org/linux-trace-kernel/20231003184059.4924468e@gandalf.local.home
Link: https://lore.kernel.org/linux-trace-kernel/20231004165007.43d79161@gandalf.local.home
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ajay Kaher <akaher@vmware.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-10-04 20:50:07 +00:00
|
|
|
dir->ei = ei;
|
2012-05-04 03:09:03 +00:00
|
|
|
dir->tr = tr;
|
|
|
|
dir->ref_count = 1;
|
|
|
|
dir->nr_events = 1;
|
|
|
|
dir->subsystem = system;
|
|
|
|
file->system = dir;
|
tracing/filters: a better event parser
Replace the current event parser hack with a better one. Filters are
no longer specified predicate by predicate, but all at once and can
use parens and any of the following operators:
numeric fields:
==, !=, <, <=, >, >=
string fields:
==, !=
predicates can be combined with the logical operators:
&&, ||
examples:
"common_preempt_count > 4" > filter
"((sig >= 10 && sig < 15) || sig == 17) && comm != bash" > filter
If there was an error, the erroneous string along with an error
message can be seen by looking at the filter e.g.:
((sig >= 10 && sig < 15) || dsig == 17) && comm != bash
^
parse_error: Field not found
Currently the caret for an error always appears at the beginning of
the filter; a real position should be used, but the error message
should be useful even without it.
To clear a filter, '0' can be written to the filter file.
Filters can also be set or cleared for a complete subsystem by writing
the same filter as would be written to an individual event to the
filter file at the root of the subsystem. Note however, that if any
event in the subsystem lacks a field specified in the filter being
set, the set will fail and all filters in the subsystem are
automatically cleared. This change from the previous version was made
because using only the fields that happen to exist for a given event
would most likely result in a meaningless filter.
Because the logical operators are now implemented as predicates, the
maximum number of predicates in a filter was increased from 8 to 16.
[ Impact: add new, extended trace-filter implementation ]
Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: fweisbec@gmail.com
Cc: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <1240905899.6416.121.camel@tropicana>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-04-28 08:04:59 +00:00
|
|
|
|
2012-05-04 03:09:03 +00:00
|
|
|
list_add(&dir->list, &tr->systems);
|
|
|
|
|
eventfs: Remove eventfs_file and just use eventfs_inode
Instead of having a descriptor for every file represented in the eventfs
directory, only have the directory itself represented. Change the API to
send in a list of entries that represent all the files in the directory
(but not other directories). The entry list contains a name and a callback
function that will be used to create the files when they are accessed.
struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent,
const struct eventfs_entry *entries,
int size, void *data);
is used for the top level eventfs directory, and returns an eventfs_inode
that will be used by:
struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent,
const struct eventfs_entry *entries,
int size, void *data);
where both of the above take an array of struct eventfs_entry entries for
every file that is in the directory.
The entries are defined by:
typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data,
const struct file_operations **fops);
struct eventfs_entry {
const char *name;
eventfs_callback callback;
};
Where the name is the name of the file and the callback gets called when
the file is being created. The callback passes in the name (in case the
same callback is used for multiple files), a pointer to the mode, data and
fops. The data will be pointing to the data that was passed in
eventfs_create_dir() or eventfs_create_events_dir() but may be overridden
to point to something else, as it will be used to point to the
inode->i_private that is created. The information passed back from the
callback is used to create the dentry/inode.
If the callback fills the data and the file should be created, it must
return a positive number. On zero or negative, the file is ignored.
This logic may also be used as a prototype to convert entire pseudo file
systems into just-in-time allocation.
The "show_events_dentry" file has been updated to show the directories,
and any files they have.
With just the eventfs_file allocations:
Before after deltas for meminfo (in kB):
MemFree: -14360
MemAvailable: -14260
Buffers: 40
Cached: 24
Active: 44
Inactive: 48
Inactive(anon): 28
Active(file): 44
Inactive(file): 20
Dirty: -4
AnonPages: 28
Mapped: 4
KReclaimable: 132
Slab: 1604
SReclaimable: 132
SUnreclaim: 1472
Committed_AS: 12
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
ext4_inode_cache 27 [* 1184 = 31968 ]
extent_status 102 [* 40 = 4080 ]
tracefs_inode_cache 144 [* 656 = 94464 ]
buffer_head 39 [* 104 = 4056 ]
shmem_inode_cache 49 [* 800 = 39200 ]
filp -53 [* 256 = -13568 ]
dentry 251 [* 192 = 48192 ]
lsm_file_cache 277 [* 32 = 8864 ]
vm_area_struct -14 [* 184 = -2576 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k 35 [* 1024 = 35840 ]
kmalloc-256 49 [* 256 = 12544 ]
kmalloc-192 -28 [* 192 = -5376 ]
kmalloc-128 -30 [* 128 = -3840 ]
kmalloc-96 10581 [* 96 = 1015776 ]
kmalloc-64 3056 [* 64 = 195584 ]
kmalloc-32 1291 [* 32 = 41312 ]
kmalloc-16 2310 [* 16 = 36960 ]
kmalloc-8 9216 [* 8 = 73728 ]
Free memory dropped by 14,360 kB
Available memory dropped by 14,260 kB
Total slab additions in size: 1,771,032 bytes
With this change:
Before after deltas for meminfo (in kB):
MemFree: -12084
MemAvailable: -11976
Buffers: 32
Cached: 32
Active: 72
Inactive: 168
Inactive(anon): 176
Active(file): 72
Inactive(file): -8
Dirty: 24
AnonPages: 196
Mapped: 8
KReclaimable: 148
Slab: 836
SReclaimable: 148
SUnreclaim: 688
Committed_AS: 324
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
tracefs_inode_cache 144 [* 656 = 94464 ]
shmem_inode_cache -23 [* 800 = -18400 ]
filp -92 [* 256 = -23552 ]
dentry 179 [* 192 = 34368 ]
lsm_file_cache -3 [* 32 = -96 ]
vm_area_struct -13 [* 184 = -2392 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k -49 [* 1024 = -50176 ]
kmalloc-256 -27 [* 256 = -6912 ]
kmalloc-128 1864 [* 128 = 238592 ]
kmalloc-64 4685 [* 64 = 299840 ]
kmalloc-32 -72 [* 32 = -2304 ]
kmalloc-16 256 [* 16 = 4096 ]
total = 721352
Free memory dropped by 12,084 kB
Available memory dropped by 11,976 kB
Total slab additions in size: 721,352 bytes
That's over 2 MB in savings per instance for free and available memory,
and over 1 MB in savings per instance of slab memory.
Link: https://lore.kernel.org/linux-trace-kernel/20231003184059.4924468e@gandalf.local.home
Link: https://lore.kernel.org/linux-trace-kernel/20231004165007.43d79161@gandalf.local.home
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ajay Kaher <akaher@vmware.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-10-04 20:50:07 +00:00
|
|
|
return dir->ei;
|
2012-05-04 03:09:03 +00:00
|
|
|
|
|
|
|
out_free:
|
|
|
|
kfree(dir);
|
|
|
|
out_fail:
|
|
|
|
/* Only print this message if failed on memory allocation */
|
|
|
|
if (!dir || !system)
|
2014-06-07 11:43:08 +00:00
|
|
|
pr_warn("No memory to create event subsystem %s\n", name);
|
2012-05-04 03:09:03 +00:00
|
|
|
return NULL;
|
2009-02-28 02:33:02 +00:00
|
|
|
}
|
|
|
|
|
2020-09-10 12:38:48 +00:00
|
|
|
static int
|
|
|
|
event_define_fields(struct trace_event_call *call)
|
|
|
|
{
|
|
|
|
struct list_head *head;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Other events may have the same class. Only update
|
|
|
|
* the fields if they are not already defined.
|
|
|
|
*/
|
|
|
|
head = trace_get_fields(call);
|
|
|
|
if (list_empty(head)) {
|
|
|
|
struct trace_event_fields *field = call->class->fields_array;
|
|
|
|
unsigned int offset = sizeof(struct trace_entry);
|
|
|
|
|
|
|
|
for (; field->type; field++) {
|
|
|
|
if (field->type == TRACE_FUNCTION_TYPE) {
|
|
|
|
field->define_fields(call);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
offset = ALIGN(offset, field->align);
|
2023-02-12 15:13:03 +00:00
|
|
|
ret = trace_define_field_ext(call, field->type, field->name,
|
2020-09-10 12:38:48 +00:00
|
|
|
offset, field->size,
|
2023-02-12 15:13:03 +00:00
|
|
|
field->is_signed, field->filter_type,
|
tracing: Check "%s" dereference via the field and not the TP_printk format
The TP_printk() portion of a trace event is executed at the time a event
is read from the trace. This can happen seconds, minutes, hours, days,
months, years possibly later since the event was recorded. If the print
format contains a dereference to a string via "%s", and that string was
allocated, there's a chance that string could be freed before it is read
by the trace file.
To protect against such bugs, there are two functions that verify the
event. The first one is test_event_printk(), which is called when the
event is created. It reads the TP_printk() format as well as its arguments
to make sure nothing may be dereferencing a pointer that was not copied
into the ring buffer along with the event. If it is, it will trigger a
WARN_ON().
For strings that use "%s", it is not so easy. The string may not reside in
the ring buffer but may still be valid. Strings that are static and part
of the kernel proper which will not be freed for the life of the running
system, are safe to dereference. But to know if it is a pointer to a
static string or to something on the heap can not be determined until the
event is triggered.
This brings us to the second function that tests for the bad dereferencing
of strings, trace_check_vprintf(). It would walk through the printf format
looking for "%s", and when it finds it, it would validate that the pointer
is safe to read. If not, it would produces a WARN_ON() as well and write
into the ring buffer "[UNSAFE-MEMORY]".
The problem with this is how it used va_list to have vsnprintf() handle
all the cases that it didn't need to check. Instead of re-implementing
vsnprintf(), it would make a copy of the format up to the %s part, and
call vsnprintf() with the current va_list ap variable, where the ap would
then be ready to point at the string in question.
For architectures that passed va_list by reference this was possible. For
architectures that passed it by copy it was not. A test_can_verify()
function was used to differentiate between the two, and if it wasn't
possible, it would disable it.
Even for architectures where this was feasible, it was a stretch to rely
on such a method that is undocumented, and could cause issues later on
with new optimizations of the compiler.
Instead, the first function test_event_printk() was updated to look at
"%s" as well. If the "%s" argument is a pointer outside the event in the
ring buffer, it would find the field type of the event that is the problem
and mark the structure with a new flag called "needs_test". The event
itself will be marked by TRACE_EVENT_FL_TEST_STR to let it be known that
this event has a field that needs to be verified before the event can be
printed using the printf format.
When the event fields are created from the field type structure, the
fields would copy the field type's "needs_test" value.
Finally, before being printed, a new function ignore_event() is called
which will check if the event has the TEST_STR flag set (if not, it
returns false). If the flag is set, it then iterates through the events
fields looking for the ones that have the "needs_test" flag set.
Then it uses the offset field from the field structure to find the pointer
in the ring buffer event. It runs the tests to make sure that pointer is
safe to print and if not, it triggers the WARN_ON() and also adds to the
trace output that the event in question has an unsafe memory access.
The ignore_event() makes the trace_check_vprintf() obsolete so it is
removed.
Link: https://lore.kernel.org/all/CAHk-=wh3uOnqnZPpR0PeLZZtyWbZLboZ7cHLCKRWsocvs9Y7hQ@mail.gmail.com/
Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/20241217024720.848621576@goodmis.org
Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2024-12-17 02:41:22 +00:00
|
|
|
field->len, field->needs_test);
|
2020-09-10 12:38:48 +00:00
|
|
|
if (WARN_ON_ONCE(ret)) {
|
|
|
|
pr_err("error code is %d\n", ret);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
offset += field->size;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
eventfs: Remove eventfs_file and just use eventfs_inode
Instead of having a descriptor for every file represented in the eventfs
directory, only have the directory itself represented. Change the API to
send in a list of entries that represent all the files in the directory
(but not other directories). The entry list contains a name and a callback
function that will be used to create the files when they are accessed.
struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent,
const struct eventfs_entry *entries,
int size, void *data);
is used for the top level eventfs directory, and returns an eventfs_inode
that will be used by:
struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent,
const struct eventfs_entry *entries,
int size, void *data);
where both of the above take an array of struct eventfs_entry entries for
every file that is in the directory.
The entries are defined by:
typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data,
const struct file_operations **fops);
struct eventfs_entry {
const char *name;
eventfs_callback callback;
};
Where the name is the name of the file and the callback gets called when
the file is being created. The callback passes in the name (in case the
same callback is used for multiple files), a pointer to the mode, data and
fops. The data will be pointing to the data that was passed in
eventfs_create_dir() or eventfs_create_events_dir() but may be overridden
to point to something else, as it will be used to point to the
inode->i_private that is created. The information passed back from the
callback is used to create the dentry/inode.
If the callback fills the data and the file should be created, it must
return a positive number. On zero or negative, the file is ignored.
This logic may also be used as a prototype to convert entire pseudo file
systems into just-in-time allocation.
The "show_events_dentry" file has been updated to show the directories,
and any files they have.
With just the eventfs_file allocations:
Before after deltas for meminfo (in kB):
MemFree: -14360
MemAvailable: -14260
Buffers: 40
Cached: 24
Active: 44
Inactive: 48
Inactive(anon): 28
Active(file): 44
Inactive(file): 20
Dirty: -4
AnonPages: 28
Mapped: 4
KReclaimable: 132
Slab: 1604
SReclaimable: 132
SUnreclaim: 1472
Committed_AS: 12
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
ext4_inode_cache 27 [* 1184 = 31968 ]
extent_status 102 [* 40 = 4080 ]
tracefs_inode_cache 144 [* 656 = 94464 ]
buffer_head 39 [* 104 = 4056 ]
shmem_inode_cache 49 [* 800 = 39200 ]
filp -53 [* 256 = -13568 ]
dentry 251 [* 192 = 48192 ]
lsm_file_cache 277 [* 32 = 8864 ]
vm_area_struct -14 [* 184 = -2576 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k 35 [* 1024 = 35840 ]
kmalloc-256 49 [* 256 = 12544 ]
kmalloc-192 -28 [* 192 = -5376 ]
kmalloc-128 -30 [* 128 = -3840 ]
kmalloc-96 10581 [* 96 = 1015776 ]
kmalloc-64 3056 [* 64 = 195584 ]
kmalloc-32 1291 [* 32 = 41312 ]
kmalloc-16 2310 [* 16 = 36960 ]
kmalloc-8 9216 [* 8 = 73728 ]
Free memory dropped by 14,360 kB
Available memory dropped by 14,260 kB
Total slab additions in size: 1,771,032 bytes
With this change:
Before after deltas for meminfo (in kB):
MemFree: -12084
MemAvailable: -11976
Buffers: 32
Cached: 32
Active: 72
Inactive: 168
Inactive(anon): 176
Active(file): 72
Inactive(file): -8
Dirty: 24
AnonPages: 196
Mapped: 8
KReclaimable: 148
Slab: 836
SReclaimable: 148
SUnreclaim: 688
Committed_AS: 324
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
tracefs_inode_cache 144 [* 656 = 94464 ]
shmem_inode_cache -23 [* 800 = -18400 ]
filp -92 [* 256 = -23552 ]
dentry 179 [* 192 = 34368 ]
lsm_file_cache -3 [* 32 = -96 ]
vm_area_struct -13 [* 184 = -2392 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k -49 [* 1024 = -50176 ]
kmalloc-256 -27 [* 256 = -6912 ]
kmalloc-128 1864 [* 128 = 238592 ]
kmalloc-64 4685 [* 64 = 299840 ]
kmalloc-32 -72 [* 32 = -2304 ]
kmalloc-16 256 [* 16 = 4096 ]
total = 721352
Free memory dropped by 12,084 kB
Available memory dropped by 11,976 kB
Total slab additions in size: 721,352 bytes
That's over 2 MB in savings per instance for free and available memory,
and over 1 MB in savings per instance of slab memory.
Link: https://lore.kernel.org/linux-trace-kernel/20231003184059.4924468e@gandalf.local.home
Link: https://lore.kernel.org/linux-trace-kernel/20231004165007.43d79161@gandalf.local.home
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ajay Kaher <akaher@vmware.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-10-04 20:50:07 +00:00
|
|
|
/*
 * event_callback - decide whether a file of an event directory is created
 * @name: name of the file being asked about ("enable", "filter", ...)
 * @mode: filled in with the file mode when the file is to be created
 * @data: on entry points to the trace_event_file of the event; for the
 *        "id" file it is replaced by the event type
 * @fops: filled in with the file operations when the file is to be created
 *
 * Returns 1 when the file is to be created (@mode and @fops are then set),
 * 0 when this event does not have a file called @name.
 */
static int event_callback(const char *name, umode_t *mode, void **data,
			  const struct file_operations **fops)
{
	struct trace_event_file *event_file = *data;
	struct trace_event_call *call = event_file->event_call;
	/* Non-zero unless the event is flagged TRACE_EVENT_FL_IGNORE_ENABLE */
	int can_enable = !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE);

	if (!strcmp(name, "format")) {
		*fops = &ftrace_event_format_fops;
		*mode = TRACE_MODE_READ;
		return 1;
	}

	/*
	 * Only event directories that can be enabled get an "enable" or
	 * a "filter" file; "enable" also requires a non-NULL class->reg.
	 */
	if (can_enable && call->class->reg && !strcmp(name, "enable")) {
		*fops = &ftrace_enable_fops;
		*mode = TRACE_MODE_WRITE;
		return 1;
	}

	if (can_enable && !strcmp(name, "filter")) {
		*fops = &ftrace_event_filter_fops;
		*mode = TRACE_MODE_WRITE;
		return 1;
	}

	/*
	 * The "trigger" file follows the same rule, with the exception of
	 * the "print" event, which gets one even though it cannot be enabled.
	 */
	if (!strcmp(name, "trigger") &&
	    (can_enable || !strcmp(trace_event_name(call), "print"))) {
		*fops = &event_trigger_fops;
		*mode = TRACE_MODE_WRITE;
		return 1;
	}

#ifdef CONFIG_PERF_EVENTS
	if (call->event.type && call->class->reg && !strcmp(name, "id")) {
		/* The "id" file carries the event type instead of the file */
		*data = (void *)(long)call->event.type;
		*fops = &ftrace_event_id_fops;
		*mode = TRACE_MODE_READ;
		return 1;
	}
#endif

#ifdef CONFIG_HIST_TRIGGERS
	if (!strcmp(name, "hist")) {
		*fops = &event_hist_fops;
		*mode = TRACE_MODE_READ;
		return 1;
	}
#endif
#ifdef CONFIG_HIST_TRIGGERS_DEBUG
	if (!strcmp(name, "hist_debug")) {
		*fops = &event_hist_debug_fops;
		*mode = TRACE_MODE_READ;
		return 1;
	}
#endif
#ifdef CONFIG_TRACE_EVENT_INJECT
	if (call->event.type && call->class->reg && !strcmp(name, "inject")) {
		*fops = &event_inject_fops;
		*mode = 0200;
		return 1;
	}
#endif
	/* No file with this name for this event */
	return 0;
}
|
|
|
|
|
2024-05-02 13:03:15 +00:00
|
|
|
/*
 * The event file's count is incremented on creation; when the "enable"
 * file is freed, this release hook drops it again via event_file_put().
 * @name is not used here.
 */
static void event_release(const char *name, void *data)
{
	struct trace_event_file *event_file = data;

	event_file_put(event_file);
}
|
|
|
|
|
2009-02-24 19:15:08 +00:00
|
|
|
static int
|
eventfs: Remove eventfs_file and just use eventfs_inode
Instead of having a descriptor for every file represented in the eventfs
directory, only have the directory itself represented. Change the API to
send in a list of entries that represent all the files in the directory
(but not other directories). The entry list contains a name and a callback
function that will be used to create the files when they are accessed.
struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent,
const struct eventfs_entry *entries,
int size, void *data);
is used for the top level eventfs directory, and returns an eventfs_inode
that will be used by:
struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent,
const struct eventfs_entry *entries,
int size, void *data);
where both of the above take an array of struct eventfs_entry entries for
every file that is in the directory.
The entries are defined by:
typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data,
const struct file_operations **fops);
struct eventfs_entry {
const char *name;
eventfs_callback callback;
};
Where the name is the name of the file and the callback gets called when
the file is being created. The callback passes in the name (in case the
same callback is used for multiple files), a pointer to the mode, data and
fops. The data will be pointing to the data that was passed in
eventfs_create_dir() or eventfs_create_events_dir() but may be overridden
to point to something else, as it will be used to point to the
inode->i_private that is created. The information passed back from the
callback is used to create the dentry/inode.
If the callback fills the data and the file should be created, it must
return a positive number. On zero or negative, the file is ignored.
This logic may also be used as a prototype to convert entire pseudo file
systems into just-in-time allocation.
The "show_events_dentry" file has been updated to show the directories,
and any files they have.
With just the eventfs_file allocations:
Before after deltas for meminfo (in kB):
MemFree: -14360
MemAvailable: -14260
Buffers: 40
Cached: 24
Active: 44
Inactive: 48
Inactive(anon): 28
Active(file): 44
Inactive(file): 20
Dirty: -4
AnonPages: 28
Mapped: 4
KReclaimable: 132
Slab: 1604
SReclaimable: 132
SUnreclaim: 1472
Committed_AS: 12
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
ext4_inode_cache 27 [* 1184 = 31968 ]
extent_status 102 [* 40 = 4080 ]
tracefs_inode_cache 144 [* 656 = 94464 ]
buffer_head 39 [* 104 = 4056 ]
shmem_inode_cache 49 [* 800 = 39200 ]
filp -53 [* 256 = -13568 ]
dentry 251 [* 192 = 48192 ]
lsm_file_cache 277 [* 32 = 8864 ]
vm_area_struct -14 [* 184 = -2576 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k 35 [* 1024 = 35840 ]
kmalloc-256 49 [* 256 = 12544 ]
kmalloc-192 -28 [* 192 = -5376 ]
kmalloc-128 -30 [* 128 = -3840 ]
kmalloc-96 10581 [* 96 = 1015776 ]
kmalloc-64 3056 [* 64 = 195584 ]
kmalloc-32 1291 [* 32 = 41312 ]
kmalloc-16 2310 [* 16 = 36960 ]
kmalloc-8 9216 [* 8 = 73728 ]
Free memory dropped by 14,360 kB
Available memory dropped by 14,260 kB
Total slab additions in size: 1,771,032 bytes
With this change:
Before after deltas for meminfo (in kB):
MemFree: -12084
MemAvailable: -11976
Buffers: 32
Cached: 32
Active: 72
Inactive: 168
Inactive(anon): 176
Active(file): 72
Inactive(file): -8
Dirty: 24
AnonPages: 196
Mapped: 8
KReclaimable: 148
Slab: 836
SReclaimable: 148
SUnreclaim: 688
Committed_AS: 324
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
tracefs_inode_cache 144 [* 656 = 94464 ]
shmem_inode_cache -23 [* 800 = -18400 ]
filp -92 [* 256 = -23552 ]
dentry 179 [* 192 = 34368 ]
lsm_file_cache -3 [* 32 = -96 ]
vm_area_struct -13 [* 184 = -2392 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k -49 [* 1024 = -50176 ]
kmalloc-256 -27 [* 256 = -6912 ]
kmalloc-128 1864 [* 128 = 238592 ]
kmalloc-64 4685 [* 64 = 299840 ]
kmalloc-32 -72 [* 32 = -2304 ]
kmalloc-16 256 [* 16 = 4096 ]
total = 721352
Free memory dropped by 12,084 kB
Available memory dropped by 11,976 kB
Total slab additions in size: 721,352 bytes
That's over 2 MB in savings per instance for free and available memory,
and over 1 MB in savings per instance of slab memory.
Link: https://lore.kernel.org/linux-trace-kernel/20231003184059.4924468e@gandalf.local.home
Link: https://lore.kernel.org/linux-trace-kernel/20231004165007.43d79161@gandalf.local.home
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ajay Kaher <akaher@vmware.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-10-04 20:50:07 +00:00
|
|
|
event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file)
|
2009-02-24 19:15:08 +00:00
|
|
|
{
|
2015-05-05 15:45:27 +00:00
|
|
|
struct trace_event_call *call = file->event_call;
|
2012-05-04 03:09:03 +00:00
|
|
|
struct trace_array *tr = file->tr;
|
eventfs: Remove eventfs_file and just use eventfs_inode
Instead of having a descriptor for every file represented in the eventfs
directory, only have the directory itself represented. Change the API to
send in a list of entries that represent all the files in the directory
(but not other directories). The entry list contains a name and a callback
function that will be used to create the files when they are accessed.
struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent,
const struct eventfs_entry *entries,
int size, void *data);
is used for the top level eventfs directory, and returns an eventfs_inode
that will be used by:
struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent,
const struct eventfs_entry *entries,
int size, void *data);
where both of the above take an array of struct eventfs_entry entries for
every file that is in the directory.
The entries are defined by:
typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data,
const struct file_operations **fops);
struct eventfs_entry {
const char *name;
eventfs_callback callback;
};
Where the name is the name of the file and the callback gets called when
the file is being created. The callback passes in the name (in case the
same callback is used for multiple files), a pointer to the mode, data and
fops. The data will be pointing to the data that was passed in
eventfs_create_dir() or eventfs_create_events_dir() but may be overridden
to point to something else, as it will be used to point to the
inode->i_private that is created. The information passed back from the
callback is used to create the dentry/inode.
If the callback fills the data and the file should be created, it must
return a positive number. On zero or negative, the file is ignored.
This logic may also be used as a prototype to convert entire pseudo file
systems into just-in-time allocation.
The "show_events_dentry" file has been updated to show the directories,
and any files they have.
With just the eventfs_file allocations:
Before after deltas for meminfo (in kB):
MemFree: -14360
MemAvailable: -14260
Buffers: 40
Cached: 24
Active: 44
Inactive: 48
Inactive(anon): 28
Active(file): 44
Inactive(file): 20
Dirty: -4
AnonPages: 28
Mapped: 4
KReclaimable: 132
Slab: 1604
SReclaimable: 132
SUnreclaim: 1472
Committed_AS: 12
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
ext4_inode_cache 27 [* 1184 = 31968 ]
extent_status 102 [* 40 = 4080 ]
tracefs_inode_cache 144 [* 656 = 94464 ]
buffer_head 39 [* 104 = 4056 ]
shmem_inode_cache 49 [* 800 = 39200 ]
filp -53 [* 256 = -13568 ]
dentry 251 [* 192 = 48192 ]
lsm_file_cache 277 [* 32 = 8864 ]
vm_area_struct -14 [* 184 = -2576 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k 35 [* 1024 = 35840 ]
kmalloc-256 49 [* 256 = 12544 ]
kmalloc-192 -28 [* 192 = -5376 ]
kmalloc-128 -30 [* 128 = -3840 ]
kmalloc-96 10581 [* 96 = 1015776 ]
kmalloc-64 3056 [* 64 = 195584 ]
kmalloc-32 1291 [* 32 = 41312 ]
kmalloc-16 2310 [* 16 = 36960 ]
kmalloc-8 9216 [* 8 = 73728 ]
Free memory dropped by 14,360 kB
Available memory dropped by 14,260 kB
Total slab additions in size: 1,771,032 bytes
With this change:
Before after deltas for meminfo (in kB):
MemFree: -12084
MemAvailable: -11976
Buffers: 32
Cached: 32
Active: 72
Inactive: 168
Inactive(anon): 176
Active(file): 72
Inactive(file): -8
Dirty: 24
AnonPages: 196
Mapped: 8
KReclaimable: 148
Slab: 836
SReclaimable: 148
SUnreclaim: 688
Committed_AS: 324
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
tracefs_inode_cache 144 [* 656 = 94464 ]
shmem_inode_cache -23 [* 800 = -18400 ]
filp -92 [* 256 = -23552 ]
dentry 179 [* 192 = 34368 ]
lsm_file_cache -3 [* 32 = -96 ]
vm_area_struct -13 [* 184 = -2392 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k -49 [* 1024 = -50176 ]
kmalloc-256 -27 [* 256 = -6912 ]
kmalloc-128 1864 [* 128 = 238592 ]
kmalloc-64 4685 [* 64 = 299840 ]
kmalloc-32 -72 [* 32 = -2304 ]
kmalloc-16 256 [* 16 = 4096 ]
total = 721352
Free memory dropped by 12,084 kB
Available memory dropped by 11,976 kB
Total slab additions in size: 721,352 bytes
That's over 2 MB in savings per instance for free and available memory,
and over 1 MB in savings per instance of slab memory.
Link: https://lore.kernel.org/linux-trace-kernel/20231003184059.4924468e@gandalf.local.home
Link: https://lore.kernel.org/linux-trace-kernel/20231004165007.43d79161@gandalf.local.home
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ajay Kaher <akaher@vmware.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-10-04 20:50:07 +00:00
|
|
|
struct eventfs_inode *e_events;
|
|
|
|
struct eventfs_inode *ei;
|
2014-04-08 21:26:21 +00:00
|
|
|
const char *name;
|
eventfs: Remove eventfs_file and just use eventfs_inode
Instead of having a descriptor for every file represented in the eventfs
directory, only have the directory itself represented. Change the API to
send in a list of entries that represent all the files in the directory
(but not other directories). The entry list contains a name and a callback
function that will be used to create the files when they are accessed.
struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent,
const struct eventfs_entry *entries,
int size, void *data);
is used for the top level eventfs directory, and returns an eventfs_inode
that will be used by:
struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent,
const struct eventfs_entry *entries,
int size, void *data);
where both of the above take an array of struct eventfs_entry entries for
every file that is in the directory.
The entries are defined by:
typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data,
const struct file_operations **fops);
struct eventfs_entry {
const char *name;
eventfs_callback callback;
};
Where the name is the name of the file and the callback gets called when
the file is being created. The callback passes in the name (in case the
same callback is used for multiple files), a pointer to the mode, data and
fops. The data will be pointing to the data that was passed in
eventfs_create_dir() or eventfs_create_events_dir() but may be overridden
to point to something else, as it will be used to point to the
inode->i_private that is created. The information passed back from the
callback is used to create the dentry/inode.
If the callback fills the data and the file should be created, it must
return a positive number. On zero or negative, the file is ignored.
This logic may also be used as a prototype to convert entire pseudo file
systems into just-in-time allocation.
The "show_events_dentry" file has been updated to show the directories,
and any files they have.
With just the eventfs_file allocations:
Before after deltas for meminfo (in kB):
MemFree: -14360
MemAvailable: -14260
Buffers: 40
Cached: 24
Active: 44
Inactive: 48
Inactive(anon): 28
Active(file): 44
Inactive(file): 20
Dirty: -4
AnonPages: 28
Mapped: 4
KReclaimable: 132
Slab: 1604
SReclaimable: 132
SUnreclaim: 1472
Committed_AS: 12
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
ext4_inode_cache 27 [* 1184 = 31968 ]
extent_status 102 [* 40 = 4080 ]
tracefs_inode_cache 144 [* 656 = 94464 ]
buffer_head 39 [* 104 = 4056 ]
shmem_inode_cache 49 [* 800 = 39200 ]
filp -53 [* 256 = -13568 ]
dentry 251 [* 192 = 48192 ]
lsm_file_cache 277 [* 32 = 8864 ]
vm_area_struct -14 [* 184 = -2576 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k 35 [* 1024 = 35840 ]
kmalloc-256 49 [* 256 = 12544 ]
kmalloc-192 -28 [* 192 = -5376 ]
kmalloc-128 -30 [* 128 = -3840 ]
kmalloc-96 10581 [* 96 = 1015776 ]
kmalloc-64 3056 [* 64 = 195584 ]
kmalloc-32 1291 [* 32 = 41312 ]
kmalloc-16 2310 [* 16 = 36960 ]
kmalloc-8 9216 [* 8 = 73728 ]
Free memory dropped by 14,360 kB
Available memory dropped by 14,260 kB
Total slab additions in size: 1,771,032 bytes
With this change:
Before after deltas for meminfo (in kB):
MemFree: -12084
MemAvailable: -11976
Buffers: 32
Cached: 32
Active: 72
Inactive: 168
Inactive(anon): 176
Active(file): 72
Inactive(file): -8
Dirty: 24
AnonPages: 196
Mapped: 8
KReclaimable: 148
Slab: 836
SReclaimable: 148
SUnreclaim: 688
Committed_AS: 324
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
tracefs_inode_cache 144 [* 656 = 94464 ]
shmem_inode_cache -23 [* 800 = -18400 ]
filp -92 [* 256 = -23552 ]
dentry 179 [* 192 = 34368 ]
lsm_file_cache -3 [* 32 = -96 ]
vm_area_struct -13 [* 184 = -2392 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k -49 [* 1024 = -50176 ]
kmalloc-256 -27 [* 256 = -6912 ]
kmalloc-128 1864 [* 128 = 238592 ]
kmalloc-64 4685 [* 64 = 299840 ]
kmalloc-32 -72 [* 32 = -2304 ]
kmalloc-16 256 [* 16 = 4096 ]
total = 721352
Free memory dropped by 12,084 kB
Available memory dropped by 11,976 kB
Total slab additions in size: 721,352 bytes
That's over 2 MB in savings per instance for free and available memory,
and over 1 MB in savings per instance of slab memory.
Link: https://lore.kernel.org/linux-trace-kernel/20231003184059.4924468e@gandalf.local.home
Link: https://lore.kernel.org/linux-trace-kernel/20231004165007.43d79161@gandalf.local.home
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ajay Kaher <akaher@vmware.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-10-04 20:50:07 +00:00
|
|
|
int nr_entries;
|
2009-02-28 07:41:25 +00:00
|
|
|
int ret;
|
eventfs: Remove eventfs_file and just use eventfs_inode
Instead of having a descriptor for every file represented in the eventfs
directory, only have the directory itself represented. Change the API to
send in a list of entries that represent all the files in the directory
(but not other directories). The entry list contains a name and a callback
function that will be used to create the files when they are accessed.
struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent,
const struct eventfs_entry *entries,
int size, void *data);
is used for the top level eventfs directory, and returns an eventfs_inode
that will be used by:
struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent,
const struct eventfs_entry *entries,
int size, void *data);
where both of the above take an array of struct eventfs_entry entries for
every file that is in the directory.
The entries are defined by:
typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data,
const struct file_operations **fops);
struct eventfs_entry {
const char *name;
eventfs_callback callback;
};
Where the name is the name of the file and the callback gets called when
the file is being created. The callback passes in the name (in case the
same callback is used for multiple files), a pointer to the mode, data and
fops. The data will be pointing to the data that was passed in
eventfs_create_dir() or eventfs_create_events_dir() but may be overridden
to point to something else, as it will be used to point to the
inode->i_private that is created. The information passed back from the
callback is used to create the dentry/inode.
If the callback fills the data and the file should be created, it must
return a positive number. On zero or negative, the file is ignored.
This logic may also be used as a prototype to convert entire pseudo file
systems into just-in-time allocation.
The "show_events_dentry" file has been updated to show the directories,
and any files they have.
With just the eventfs_file allocations:
Before after deltas for meminfo (in kB):
MemFree: -14360
MemAvailable: -14260
Buffers: 40
Cached: 24
Active: 44
Inactive: 48
Inactive(anon): 28
Active(file): 44
Inactive(file): 20
Dirty: -4
AnonPages: 28
Mapped: 4
KReclaimable: 132
Slab: 1604
SReclaimable: 132
SUnreclaim: 1472
Committed_AS: 12
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
ext4_inode_cache 27 [* 1184 = 31968 ]
extent_status 102 [* 40 = 4080 ]
tracefs_inode_cache 144 [* 656 = 94464 ]
buffer_head 39 [* 104 = 4056 ]
shmem_inode_cache 49 [* 800 = 39200 ]
filp -53 [* 256 = -13568 ]
dentry 251 [* 192 = 48192 ]
lsm_file_cache 277 [* 32 = 8864 ]
vm_area_struct -14 [* 184 = -2576 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k 35 [* 1024 = 35840 ]
kmalloc-256 49 [* 256 = 12544 ]
kmalloc-192 -28 [* 192 = -5376 ]
kmalloc-128 -30 [* 128 = -3840 ]
kmalloc-96 10581 [* 96 = 1015776 ]
kmalloc-64 3056 [* 64 = 195584 ]
kmalloc-32 1291 [* 32 = 41312 ]
kmalloc-16 2310 [* 16 = 36960 ]
kmalloc-8 9216 [* 8 = 73728 ]
Free memory dropped by 14,360 kB
Available memory dropped by 14,260 kB
Total slab additions in size: 1,771,032 bytes
With this change:
Before after deltas for meminfo (in kB):
MemFree: -12084
MemAvailable: -11976
Buffers: 32
Cached: 32
Active: 72
Inactive: 168
Inactive(anon): 176
Active(file): 72
Inactive(file): -8
Dirty: 24
AnonPages: 196
Mapped: 8
KReclaimable: 148
Slab: 836
SReclaimable: 148
SUnreclaim: 688
Committed_AS: 324
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
tracefs_inode_cache 144 [* 656 = 94464 ]
shmem_inode_cache -23 [* 800 = -18400 ]
filp -92 [* 256 = -23552 ]
dentry 179 [* 192 = 34368 ]
lsm_file_cache -3 [* 32 = -96 ]
vm_area_struct -13 [* 184 = -2392 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k -49 [* 1024 = -50176 ]
kmalloc-256 -27 [* 256 = -6912 ]
kmalloc-128 1864 [* 128 = 238592 ]
kmalloc-64 4685 [* 64 = 299840 ]
kmalloc-32 -72 [* 32 = -2304 ]
kmalloc-16 256 [* 16 = 4096 ]
total = 721352
Free memory dropped by 12,084 kB
Available memory dropped by 11,976 kB
Total slab additions in size: 721,352 bytes
That's over 2 MB in savings per instance for free and available memory,
and over 1 MB in savings per instance of slab memory.
Link: https://lore.kernel.org/linux-trace-kernel/20231003184059.4924468e@gandalf.local.home
Link: https://lore.kernel.org/linux-trace-kernel/20231004165007.43d79161@gandalf.local.home
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ajay Kaher <akaher@vmware.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-10-04 20:50:07 +00:00
|
|
|
static struct eventfs_entry event_entries[] = {
|
|
|
|
{
|
|
|
|
.name = "enable",
|
|
|
|
.callback = event_callback,
|
2024-05-02 13:03:15 +00:00
|
|
|
.release = event_release,
|
eventfs: Remove eventfs_file and just use eventfs_inode
Instead of having a descriptor for every file represented in the eventfs
directory, only have the directory itself represented. Change the API to
send in a list of entries that represent all the files in the directory
(but not other directories). The entry list contains a name and a callback
function that will be used to create the files when they are accessed.
struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent,
const struct eventfs_entry *entries,
int size, void *data);
is used for the top level eventfs directory, and returns an eventfs_inode
that will be used by:
struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent,
const struct eventfs_entry *entries,
int size, void *data);
where both of the above take an array of struct eventfs_entry entries for
every file that is in the directory.
The entries are defined by:
typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data,
const struct file_operations **fops);
struct eventfs_entry {
const char *name;
eventfs_callback callback;
};
Where the name is the name of the file and the callback gets called when
the file is being created. The callback passes in the name (in case the
same callback is used for multiple files), a pointer to the mode, data and
fops. The data will be pointing to the data that was passed in
eventfs_create_dir() or eventfs_create_events_dir() but may be overridden
to point to something else, as it will be used to point to the
inode->i_private that is created. The information passed back from the
callback is used to create the dentry/inode.
If the callback fills the data and the file should be created, it must
return a positive number. On zero or negative, the file is ignored.
This logic may also be used as a prototype to convert entire pseudo file
systems into just-in-time allocation.
The "show_events_dentry" file has been updated to show the directories,
and any files they have.
With just the eventfs_file allocations:
Before after deltas for meminfo (in kB):
MemFree: -14360
MemAvailable: -14260
Buffers: 40
Cached: 24
Active: 44
Inactive: 48
Inactive(anon): 28
Active(file): 44
Inactive(file): 20
Dirty: -4
AnonPages: 28
Mapped: 4
KReclaimable: 132
Slab: 1604
SReclaimable: 132
SUnreclaim: 1472
Committed_AS: 12
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
ext4_inode_cache 27 [* 1184 = 31968 ]
extent_status 102 [* 40 = 4080 ]
tracefs_inode_cache 144 [* 656 = 94464 ]
buffer_head 39 [* 104 = 4056 ]
shmem_inode_cache 49 [* 800 = 39200 ]
filp -53 [* 256 = -13568 ]
dentry 251 [* 192 = 48192 ]
lsm_file_cache 277 [* 32 = 8864 ]
vm_area_struct -14 [* 184 = -2576 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k 35 [* 1024 = 35840 ]
kmalloc-256 49 [* 256 = 12544 ]
kmalloc-192 -28 [* 192 = -5376 ]
kmalloc-128 -30 [* 128 = -3840 ]
kmalloc-96 10581 [* 96 = 1015776 ]
kmalloc-64 3056 [* 64 = 195584 ]
kmalloc-32 1291 [* 32 = 41312 ]
kmalloc-16 2310 [* 16 = 36960 ]
kmalloc-8 9216 [* 8 = 73728 ]
Free memory dropped by 14,360 kB
Available memory dropped by 14,260 kB
Total slab additions in size: 1,771,032 bytes
With this change:
Before after deltas for meminfo (in kB):
MemFree: -12084
MemAvailable: -11976
Buffers: 32
Cached: 32
Active: 72
Inactive: 168
Inactive(anon): 176
Active(file): 72
Inactive(file): -8
Dirty: 24
AnonPages: 196
Mapped: 8
KReclaimable: 148
Slab: 836
SReclaimable: 148
SUnreclaim: 688
Committed_AS: 324
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
tracefs_inode_cache 144 [* 656 = 94464 ]
shmem_inode_cache -23 [* 800 = -18400 ]
filp -92 [* 256 = -23552 ]
dentry 179 [* 192 = 34368 ]
lsm_file_cache -3 [* 32 = -96 ]
vm_area_struct -13 [* 184 = -2392 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k -49 [* 1024 = -50176 ]
kmalloc-256 -27 [* 256 = -6912 ]
kmalloc-128 1864 [* 128 = 238592 ]
kmalloc-64 4685 [* 64 = 299840 ]
kmalloc-32 -72 [* 32 = -2304 ]
kmalloc-16 256 [* 16 = 4096 ]
total = 721352
Free memory dropped by 12,084 kB
Available memory dropped by 11,976 kB
Total slab additions in size: 721,352 bytes
That's over 2 MB in savings per instance for free and available memory,
and over 1 MB in savings per instance of slab memory.
Link: https://lore.kernel.org/linux-trace-kernel/20231003184059.4924468e@gandalf.local.home
Link: https://lore.kernel.org/linux-trace-kernel/20231004165007.43d79161@gandalf.local.home
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ajay Kaher <akaher@vmware.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-10-04 20:50:07 +00:00
|
|
|
},
|
|
|
|
{
|
|
|
|
.name = "filter",
|
|
|
|
.callback = event_callback,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.name = "trigger",
|
|
|
|
.callback = event_callback,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.name = "format",
|
|
|
|
.callback = event_callback,
|
|
|
|
},
|
|
|
|
#ifdef CONFIG_PERF_EVENTS
|
|
|
|
{
|
|
|
|
.name = "id",
|
|
|
|
.callback = event_callback,
|
|
|
|
},
|
|
|
|
#endif
|
|
|
|
#ifdef CONFIG_HIST_TRIGGERS
|
|
|
|
{
|
|
|
|
.name = "hist",
|
|
|
|
.callback = event_callback,
|
|
|
|
},
|
|
|
|
#endif
|
|
|
|
#ifdef CONFIG_HIST_TRIGGERS_DEBUG
|
|
|
|
{
|
|
|
|
.name = "hist_debug",
|
|
|
|
.callback = event_callback,
|
|
|
|
},
|
|
|
|
#endif
|
|
|
|
#ifdef CONFIG_TRACE_EVENT_INJECT
|
|
|
|
{
|
|
|
|
.name = "inject",
|
|
|
|
.callback = event_callback,
|
|
|
|
},
|
|
|
|
#endif
|
|
|
|
};
|
2009-02-24 19:15:08 +00:00
|
|
|
|
2009-02-28 02:33:02 +00:00
|
|
|
/*
|
|
|
|
* If the trace point header did not define TRACE_SYSTEM
|
2023-07-28 18:20:43 +00:00
|
|
|
* then the system would be called "TRACE_SYSTEM". This should
|
|
|
|
* never happen.
|
2009-02-28 02:33:02 +00:00
|
|
|
*/
|
2023-07-28 18:20:43 +00:00
|
|
|
if (WARN_ON_ONCE(strcmp(call->class->system, TRACE_SYSTEM) == 0))
|
|
|
|
return -ENODEV;
|
|
|
|
|
eventfs: Remove eventfs_file and just use eventfs_inode
Instead of having a descriptor for every file represented in the eventfs
directory, only have the directory itself represented. Change the API to
send in a list of entries that represent all the files in the directory
(but not other directories). The entry list contains a name and a callback
function that will be used to create the files when they are accessed.
struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent,
const struct eventfs_entry *entries,
int size, void *data);
is used for the top level eventfs directory, and returns an eventfs_inode
that will be used by:
struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent,
const struct eventfs_entry *entries,
int size, void *data);
where both of the above take an array of struct eventfs_entry entries for
every file that is in the directory.
The entries are defined by:
typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data,
const struct file_operations **fops);
struct eventfs_entry {
const char *name;
eventfs_callback callback;
};
Where the name is the name of the file and the callback gets called when
the file is being created. The callback passes in the name (in case the
same callback is used for multiple files), a pointer to the mode, data and
fops. The data will be pointing to the data that was passed in
eventfs_create_dir() or eventfs_create_events_dir() but may be overridden
to point to something else, as it will be used to point to the
inode->i_private that is created. The information passed back from the
callback is used to create the dentry/inode.
If the callback fills the data and the file should be created, it must
return a positive number. On zero or negative, the file is ignored.
This logic may also be used as a prototype to convert entire pseudo file
systems into just-in-time allocation.
The "show_events_dentry" file has been updated to show the directories,
and any files they have.
With just the eventfs_file allocations:
Before after deltas for meminfo (in kB):
MemFree: -14360
MemAvailable: -14260
Buffers: 40
Cached: 24
Active: 44
Inactive: 48
Inactive(anon): 28
Active(file): 44
Inactive(file): 20
Dirty: -4
AnonPages: 28
Mapped: 4
KReclaimable: 132
Slab: 1604
SReclaimable: 132
SUnreclaim: 1472
Committed_AS: 12
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
ext4_inode_cache 27 [* 1184 = 31968 ]
extent_status 102 [* 40 = 4080 ]
tracefs_inode_cache 144 [* 656 = 94464 ]
buffer_head 39 [* 104 = 4056 ]
shmem_inode_cache 49 [* 800 = 39200 ]
filp -53 [* 256 = -13568 ]
dentry 251 [* 192 = 48192 ]
lsm_file_cache 277 [* 32 = 8864 ]
vm_area_struct -14 [* 184 = -2576 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k 35 [* 1024 = 35840 ]
kmalloc-256 49 [* 256 = 12544 ]
kmalloc-192 -28 [* 192 = -5376 ]
kmalloc-128 -30 [* 128 = -3840 ]
kmalloc-96 10581 [* 96 = 1015776 ]
kmalloc-64 3056 [* 64 = 195584 ]
kmalloc-32 1291 [* 32 = 41312 ]
kmalloc-16 2310 [* 16 = 36960 ]
kmalloc-8 9216 [* 8 = 73728 ]
Free memory dropped by 14,360 kB
Available memory dropped by 14,260 kB
Total slab additions in size: 1,771,032 bytes
With this change:
Before after deltas for meminfo (in kB):
MemFree: -12084
MemAvailable: -11976
Buffers: 32
Cached: 32
Active: 72
Inactive: 168
Inactive(anon): 176
Active(file): 72
Inactive(file): -8
Dirty: 24
AnonPages: 196
Mapped: 8
KReclaimable: 148
Slab: 836
SReclaimable: 148
SUnreclaim: 688
Committed_AS: 324
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
tracefs_inode_cache 144 [* 656 = 94464 ]
shmem_inode_cache -23 [* 800 = -18400 ]
filp -92 [* 256 = -23552 ]
dentry 179 [* 192 = 34368 ]
lsm_file_cache -3 [* 32 = -96 ]
vm_area_struct -13 [* 184 = -2392 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k -49 [* 1024 = -50176 ]
kmalloc-256 -27 [* 256 = -6912 ]
kmalloc-128 1864 [* 128 = 238592 ]
kmalloc-64 4685 [* 64 = 299840 ]
kmalloc-32 -72 [* 32 = -2304 ]
kmalloc-16 256 [* 16 = 4096 ]
total = 721352
Free memory dropped by 12,084 kB
Available memory dropped by 11,976 kB
Total slab additions in size: 721,352 bytes
That's over 2 MB in savings per instance for free and available memory,
and over 1 MB in savings per instance of slab memory.
Link: https://lore.kernel.org/linux-trace-kernel/20231003184059.4924468e@gandalf.local.home
Link: https://lore.kernel.org/linux-trace-kernel/20231004165007.43d79161@gandalf.local.home
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ajay Kaher <akaher@vmware.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-10-04 20:50:07 +00:00
|
|
|
e_events = event_subsystem_dir(tr, call->class->system, file, parent);
|
|
|
|
if (!e_events)
|
2023-07-28 18:20:43 +00:00
|
|
|
return -ENOMEM;
|
2012-05-04 03:09:03 +00:00
|
|
|
|
eventfs: Remove eventfs_file and just use eventfs_inode
Instead of having a descriptor for every file represented in the eventfs
directory, only have the directory itself represented. Change the API to
send in a list of entries that represent all the files in the directory
(but not other directories). The entry list contains a name and a callback
function that will be used to create the files when they are accessed.
struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent,
const struct eventfs_entry *entries,
int size, void *data);
is used for the top level eventfs directory, and returns an eventfs_inode
that will be used by:
struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent,
const struct eventfs_entry *entries,
int size, void *data);
where both of the above take an array of struct eventfs_entry entries for
every file that is in the directory.
The entries are defined by:
typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data,
const struct file_operations **fops);
struct eventfs_entry {
const char *name;
eventfs_callback callback;
};
Where the name is the name of the file and the callback gets called when
the file is being created. The callback passes in the name (in case the
same callback is used for multiple files), a pointer to the mode, data and
fops. The data will be pointing to the data that was passed in
eventfs_create_dir() or eventfs_create_events_dir() but may be overridden
to point to something else, as it will be used to point to the
inode->i_private that is created. The information passed back from the
callback is used to create the dentry/inode.
If the callback fills the data and the file should be created, it must
return a positive number. On zero or negative, the file is ignored.
This logic may also be used as a prototype to convert entire pseudo file
systems into just-in-time allocation.
The "show_events_dentry" file has been updated to show the directories,
and any files they have.
With just the eventfs_file allocations:
Before after deltas for meminfo (in kB):
MemFree: -14360
MemAvailable: -14260
Buffers: 40
Cached: 24
Active: 44
Inactive: 48
Inactive(anon): 28
Active(file): 44
Inactive(file): 20
Dirty: -4
AnonPages: 28
Mapped: 4
KReclaimable: 132
Slab: 1604
SReclaimable: 132
SUnreclaim: 1472
Committed_AS: 12
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
ext4_inode_cache 27 [* 1184 = 31968 ]
extent_status 102 [* 40 = 4080 ]
tracefs_inode_cache 144 [* 656 = 94464 ]
buffer_head 39 [* 104 = 4056 ]
shmem_inode_cache 49 [* 800 = 39200 ]
filp -53 [* 256 = -13568 ]
dentry 251 [* 192 = 48192 ]
lsm_file_cache 277 [* 32 = 8864 ]
vm_area_struct -14 [* 184 = -2576 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k 35 [* 1024 = 35840 ]
kmalloc-256 49 [* 256 = 12544 ]
kmalloc-192 -28 [* 192 = -5376 ]
kmalloc-128 -30 [* 128 = -3840 ]
kmalloc-96 10581 [* 96 = 1015776 ]
kmalloc-64 3056 [* 64 = 195584 ]
kmalloc-32 1291 [* 32 = 41312 ]
kmalloc-16 2310 [* 16 = 36960 ]
kmalloc-8 9216 [* 8 = 73728 ]
Free memory dropped by 14,360 kB
Available memory dropped by 14,260 kB
Total slab additions in size: 1,771,032 bytes
With this change:
Before after deltas for meminfo (in kB):
MemFree: -12084
MemAvailable: -11976
Buffers: 32
Cached: 32
Active: 72
Inactive: 168
Inactive(anon): 176
Active(file): 72
Inactive(file): -8
Dirty: 24
AnonPages: 196
Mapped: 8
KReclaimable: 148
Slab: 836
SReclaimable: 148
SUnreclaim: 688
Committed_AS: 324
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
tracefs_inode_cache 144 [* 656 = 94464 ]
shmem_inode_cache -23 [* 800 = -18400 ]
filp -92 [* 256 = -23552 ]
dentry 179 [* 192 = 34368 ]
lsm_file_cache -3 [* 32 = -96 ]
vm_area_struct -13 [* 184 = -2392 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k -49 [* 1024 = -50176 ]
kmalloc-256 -27 [* 256 = -6912 ]
kmalloc-128 1864 [* 128 = 238592 ]
kmalloc-64 4685 [* 64 = 299840 ]
kmalloc-32 -72 [* 32 = -2304 ]
kmalloc-16 256 [* 16 = 4096 ]
total = 721352
Free memory dropped by 12,084 kB
Available memory dropped by 11,976 kB
Total slab additions in size: 721,352 bytes
That's over 2 MB in savings per instance for free and available memory,
and over 1 MB in savings per instance of slab memory.
Link: https://lore.kernel.org/linux-trace-kernel/20231003184059.4924468e@gandalf.local.home
Link: https://lore.kernel.org/linux-trace-kernel/20231004165007.43d79161@gandalf.local.home
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ajay Kaher <akaher@vmware.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-10-04 20:50:07 +00:00
|
|
|
nr_entries = ARRAY_SIZE(event_entries);
|
|
|
|
|
2015-05-13 18:20:14 +00:00
|
|
|
name = trace_event_name(call);
|
eventfs: Remove eventfs_file and just use eventfs_inode
Instead of having a descriptor for every file represented in the eventfs
directory, only have the directory itself represented. Change the API to
send in a list of entries that represent all the files in the directory
(but not other directories). The entry list contains a name and a callback
function that will be used to create the files when they are accessed.
struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent,
const struct eventfs_entry *entries,
int size, void *data);
is used for the top level eventfs directory, and returns an eventfs_inode
that will be used by:
struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent,
const struct eventfs_entry *entries,
int size, void *data);
where both of the above take an array of struct eventfs_entry entries for
every file that is in the directory.
The entries are defined by:
typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data,
const struct file_operations **fops);
struct eventfs_entry {
const char *name;
eventfs_callback callback;
};
Where the name is the name of the file and the callback gets called when
the file is being created. The callback passes in the name (in case the
same callback is used for multiple files), a pointer to the mode, data and
fops. The data will be pointing to the data that was passed in
eventfs_create_dir() or eventfs_create_events_dir() but may be overridden
to point to something else, as it will be used to point to the
inode->i_private that is created. The information passed back from the
callback is used to create the dentry/inode.
If the callback fills the data and the file should be created, it must
return a positive number. On zero or negative, the file is ignored.
This logic may also be used as a prototype to convert entire pseudo file
systems into just-in-time allocation.
The "show_events_dentry" file has been updated to show the directories,
and any files they have.
With just the eventfs_file allocations:
Before after deltas for meminfo (in kB):
MemFree: -14360
MemAvailable: -14260
Buffers: 40
Cached: 24
Active: 44
Inactive: 48
Inactive(anon): 28
Active(file): 44
Inactive(file): 20
Dirty: -4
AnonPages: 28
Mapped: 4
KReclaimable: 132
Slab: 1604
SReclaimable: 132
SUnreclaim: 1472
Committed_AS: 12
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
ext4_inode_cache 27 [* 1184 = 31968 ]
extent_status 102 [* 40 = 4080 ]
tracefs_inode_cache 144 [* 656 = 94464 ]
buffer_head 39 [* 104 = 4056 ]
shmem_inode_cache 49 [* 800 = 39200 ]
filp -53 [* 256 = -13568 ]
dentry 251 [* 192 = 48192 ]
lsm_file_cache 277 [* 32 = 8864 ]
vm_area_struct -14 [* 184 = -2576 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k 35 [* 1024 = 35840 ]
kmalloc-256 49 [* 256 = 12544 ]
kmalloc-192 -28 [* 192 = -5376 ]
kmalloc-128 -30 [* 128 = -3840 ]
kmalloc-96 10581 [* 96 = 1015776 ]
kmalloc-64 3056 [* 64 = 195584 ]
kmalloc-32 1291 [* 32 = 41312 ]
kmalloc-16 2310 [* 16 = 36960 ]
kmalloc-8 9216 [* 8 = 73728 ]
Free memory dropped by 14,360 kB
Available memory dropped by 14,260 kB
Total slab additions in size: 1,771,032 bytes
With this change:
Before after deltas for meminfo (in kB):
MemFree: -12084
MemAvailable: -11976
Buffers: 32
Cached: 32
Active: 72
Inactive: 168
Inactive(anon): 176
Active(file): 72
Inactive(file): -8
Dirty: 24
AnonPages: 196
Mapped: 8
KReclaimable: 148
Slab: 836
SReclaimable: 148
SUnreclaim: 688
Committed_AS: 324
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
tracefs_inode_cache 144 [* 656 = 94464 ]
shmem_inode_cache -23 [* 800 = -18400 ]
filp -92 [* 256 = -23552 ]
dentry 179 [* 192 = 34368 ]
lsm_file_cache -3 [* 32 = -96 ]
vm_area_struct -13 [* 184 = -2392 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k -49 [* 1024 = -50176 ]
kmalloc-256 -27 [* 256 = -6912 ]
kmalloc-128 1864 [* 128 = 238592 ]
kmalloc-64 4685 [* 64 = 299840 ]
kmalloc-32 -72 [* 32 = -2304 ]
kmalloc-16 256 [* 16 = 4096 ]
total = 721352
Free memory dropped by 12,084 kB
Available memory dropped by 11,976 kB
Total slab additions in size: 721,352 bytes
That's over 2 MB in savings per instance for free and available memory,
and over 1 MB in savings per instance of slab memory.
Link: https://lore.kernel.org/linux-trace-kernel/20231003184059.4924468e@gandalf.local.home
Link: https://lore.kernel.org/linux-trace-kernel/20231004165007.43d79161@gandalf.local.home
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ajay Kaher <akaher@vmware.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-10-04 20:50:07 +00:00
|
|
|
ei = eventfs_create_dir(name, e_events, event_entries, nr_entries, file);
|
|
|
|
if (IS_ERR(ei)) {
|
2015-01-20 17:13:40 +00:00
|
|
|
pr_warn("Could not create tracefs '%s' directory\n", name);
|
2009-02-24 19:15:08 +00:00
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
eventfs: Remove eventfs_file and just use eventfs_inode
Instead of having a descriptor for every file represented in the eventfs
directory, only have the directory itself represented. Change the API to
send in a list of entries that represent all the files in the directory
(but not other directories). The entry list contains a name and a callback
function that will be used to create the files when they are accessed.
struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent,
const struct eventfs_entry *entries,
int size, void *data);
is used for the top level eventfs directory, and returns an eventfs_inode
that will be used by:
struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent,
const struct eventfs_entry *entries,
int size, void *data);
where both of the above take an array of struct eventfs_entry entries for
every file that is in the directory.
The entries are defined by:
typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data,
const struct file_operations **fops);
struct eventfs_entry {
const char *name;
eventfs_callback callback;
};
Where the name is the name of the file and the callback gets called when
the file is being created. The callback passes in the name (in case the
same callback is used for multiple files), a pointer to the mode, data and
fops. The data will be pointing to the data that was passed in
eventfs_create_dir() or eventfs_create_events_dir() but may be overridden
to point to something else, as it will be used to point to the
inode->i_private that is created. The information passed back from the
callback is used to create the dentry/inode.
If the callback fills the data and the file should be created, it must
return a positive number. On zero or negative, the file is ignored.
This logic may also be used as a prototype to convert entire pseudo file
systems into just-in-time allocation.
The "show_events_dentry" file has been updated to show the directories,
and any files they have.
With just the eventfs_file allocations:
Before after deltas for meminfo (in kB):
MemFree: -14360
MemAvailable: -14260
Buffers: 40
Cached: 24
Active: 44
Inactive: 48
Inactive(anon): 28
Active(file): 44
Inactive(file): 20
Dirty: -4
AnonPages: 28
Mapped: 4
KReclaimable: 132
Slab: 1604
SReclaimable: 132
SUnreclaim: 1472
Committed_AS: 12
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
ext4_inode_cache 27 [* 1184 = 31968 ]
extent_status 102 [* 40 = 4080 ]
tracefs_inode_cache 144 [* 656 = 94464 ]
buffer_head 39 [* 104 = 4056 ]
shmem_inode_cache 49 [* 800 = 39200 ]
filp -53 [* 256 = -13568 ]
dentry 251 [* 192 = 48192 ]
lsm_file_cache 277 [* 32 = 8864 ]
vm_area_struct -14 [* 184 = -2576 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k 35 [* 1024 = 35840 ]
kmalloc-256 49 [* 256 = 12544 ]
kmalloc-192 -28 [* 192 = -5376 ]
kmalloc-128 -30 [* 128 = -3840 ]
kmalloc-96 10581 [* 96 = 1015776 ]
kmalloc-64 3056 [* 64 = 195584 ]
kmalloc-32 1291 [* 32 = 41312 ]
kmalloc-16 2310 [* 16 = 36960 ]
kmalloc-8 9216 [* 8 = 73728 ]
Free memory dropped by 14,360 kB
Available memory dropped by 14,260 kB
Total slab additions in size: 1,771,032 bytes
With this change:
Before after deltas for meminfo (in kB):
MemFree: -12084
MemAvailable: -11976
Buffers: 32
Cached: 32
Active: 72
Inactive: 168
Inactive(anon): 176
Active(file): 72
Inactive(file): -8
Dirty: 24
AnonPages: 196
Mapped: 8
KReclaimable: 148
Slab: 836
SReclaimable: 148
SUnreclaim: 688
Committed_AS: 324
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
tracefs_inode_cache 144 [* 656 = 94464 ]
shmem_inode_cache -23 [* 800 = -18400 ]
filp -92 [* 256 = -23552 ]
dentry 179 [* 192 = 34368 ]
lsm_file_cache -3 [* 32 = -96 ]
vm_area_struct -13 [* 184 = -2392 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k -49 [* 1024 = -50176 ]
kmalloc-256 -27 [* 256 = -6912 ]
kmalloc-128 1864 [* 128 = 238592 ]
kmalloc-64 4685 [* 64 = 299840 ]
kmalloc-32 -72 [* 32 = -2304 ]
kmalloc-16 256 [* 16 = 4096 ]
total = 721352
Free memory dropped by 12,084 kB
Available memory dropped by 11,976 kB
Total slab additions in size: 721,352 bytes
That's over 2 MB in savings per instance for free and available memory,
and over 1 MB in savings per instance of slab memory.
Link: https://lore.kernel.org/linux-trace-kernel/20231003184059.4924468e@gandalf.local.home
Link: https://lore.kernel.org/linux-trace-kernel/20231004165007.43d79161@gandalf.local.home
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ajay Kaher <akaher@vmware.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-10-04 20:50:07 +00:00
|
|
|
file->ei = ei;
|
2009-03-19 19:26:13 +00:00
|
|
|
|
2020-09-10 12:38:48 +00:00
|
|
|
ret = event_define_fields(call);
|
|
|
|
if (ret < 0) {
|
|
|
|
pr_warn("Could not initialize trace point events/%s\n", name);
|
|
|
|
return ret;
|
2009-03-22 08:30:39 +00:00
|
|
|
}
|
|
|
|
|
2024-05-02 13:03:15 +00:00
|
|
|
/* Gets decremented on freeing of the "enable" file */
|
|
|
|
event_file_get(file);
|
|
|
|
|
2009-04-10 18:53:50 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2015-05-05 15:45:27 +00:00
|
|
|
static void remove_event_from_tracers(struct trace_event_call *call)
|
2012-05-04 03:09:03 +00:00
|
|
|
{
|
2015-05-05 14:09:53 +00:00
|
|
|
struct trace_event_file *file;
|
2012-05-04 03:09:03 +00:00
|
|
|
struct trace_array *tr;
|
|
|
|
|
|
|
|
do_for_each_event_file_safe(tr, file) {
|
|
|
|
if (file->event_call != call)
|
|
|
|
continue;
|
|
|
|
|
2013-07-26 17:25:47 +00:00
|
|
|
remove_event_file_dir(file);
|
2012-05-04 03:09:03 +00:00
|
|
|
/*
|
|
|
|
* The do_for_each_event_file_safe() is
|
|
|
|
* a double loop. After finding the call for this
|
|
|
|
* trace_array, we use break to jump to the next
|
|
|
|
* trace_array.
|
|
|
|
*/
|
|
|
|
break;
|
|
|
|
} while_for_each_event_file();
|
|
|
|
}
|
|
|
|
|
2015-05-05 15:45:27 +00:00
|
|
|
static void event_remove(struct trace_event_call *call)
|
2012-09-12 14:47:57 +00:00
|
|
|
{
|
2012-05-04 03:09:03 +00:00
|
|
|
struct trace_array *tr;
|
2015-05-05 14:09:53 +00:00
|
|
|
struct trace_event_file *file;
|
2012-05-04 03:09:03 +00:00
|
|
|
|
|
|
|
do_for_each_event_file(tr, file) {
|
|
|
|
if (file->event_call != call)
|
|
|
|
continue;
|
tracing: Only have rmmod clear buffers that its events were active in
Currently, when a module event is enabled, when that module is removed, it
clears all ring buffers. This is to prevent another module from being loaded
and having one of its trace event IDs from reusing a trace event ID of the
removed module. This could cause undesirable effects as the trace event of
the new module would be using its own processing algorithms to process raw
data of another event. To prevent this, when a module is loaded, if any of
its events have been used (signified by the WAS_ENABLED event call flag,
which is never cleared), all ring buffers are cleared, just in case any one
of them contains event data of the removed event.
The problem is, there's no reason to clear all ring buffers if only one (or
less than all of them) uses one of the events. Instead, only clear the ring
buffers that recorded the events of a module that is being removed.
To do this, instead of keeping the WAS_ENABLED flag with the trace event
call, move it to the per instance (per ring buffer) event file descriptor.
The event file descriptor maps each event to a separate ring buffer
instance. Then when the module is removed, only the ring buffers that
activated one of the module's events get cleared. The rest are not touched.
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-08-31 21:03:47 +00:00
|
|
|
|
|
|
|
if (file->flags & EVENT_FILE_FL_WAS_ENABLED)
|
|
|
|
tr->clear_trace = true;
|
|
|
|
|
2012-05-04 03:09:03 +00:00
|
|
|
ftrace_event_enable_disable(file, 0);
|
|
|
|
/*
|
|
|
|
* The do_for_each_event_file() is
|
|
|
|
* a double loop. After finding the call for this
|
|
|
|
* trace_array, we use break to jump to the next
|
|
|
|
* trace_array.
|
|
|
|
*/
|
|
|
|
break;
|
|
|
|
} while_for_each_event_file();
|
|
|
|
|
2012-09-12 14:47:57 +00:00
|
|
|
if (call->event.funcs)
|
2015-05-05 13:39:12 +00:00
|
|
|
__unregister_trace_event(&call->event);
|
2012-05-04 03:09:03 +00:00
|
|
|
remove_event_from_tracers(call);
|
2012-09-12 14:47:57 +00:00
|
|
|
list_del(&call->list);
|
|
|
|
}
|
|
|
|
|
2015-05-05 15:45:27 +00:00
|
|
|
static int event_init(struct trace_event_call *call)
|
2012-09-12 14:47:57 +00:00
|
|
|
{
|
|
|
|
int ret = 0;
|
2014-04-08 21:26:21 +00:00
|
|
|
const char *name;
|
2012-09-12 14:47:57 +00:00
|
|
|
|
2015-05-13 18:20:14 +00:00
|
|
|
name = trace_event_name(call);
|
2014-04-08 21:26:21 +00:00
|
|
|
if (WARN_ON(!name))
|
2012-09-12 14:47:57 +00:00
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
if (call->class->raw_init) {
|
|
|
|
ret = call->class->raw_init(call);
|
|
|
|
if (ret < 0 && ret != -ENOSYS)
|
2014-06-07 11:43:08 +00:00
|
|
|
pr_warn("Could not initialize trace events/%s\n", name);
|
2012-09-12 14:47:57 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2010-05-24 08:25:13 +00:00
|
|
|
static int
|
2015-05-05 15:45:27 +00:00
|
|
|
__register_event(struct trace_event_call *call, struct module *mod)
|
2009-08-13 20:34:53 +00:00
|
|
|
{
|
|
|
|
int ret;
|
2009-04-10 18:53:50 +00:00
|
|
|
|
2012-09-12 14:47:57 +00:00
|
|
|
ret = event_init(call);
|
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
2009-04-25 03:11:22 +00:00
|
|
|
|
2012-05-04 03:09:03 +00:00
|
|
|
list_add(&call->list, &ftrace_events);
|
2021-08-17 03:42:57 +00:00
|
|
|
if (call->flags & TRACE_EVENT_FL_DYNAMIC)
|
|
|
|
atomic_set(&call->refcnt, 0);
|
|
|
|
else
|
|
|
|
call->module = mod;
|
2009-09-25 18:20:54 +00:00
|
|
|
|
2012-05-04 03:09:03 +00:00
|
|
|
return 0;
|
2009-08-13 20:34:53 +00:00
|
|
|
}
|
|
|
|
|
2017-05-31 21:56:49 +00:00
|
|
|
static char *eval_replace(char *ptr, struct trace_eval_map *map, int len)
|
tracing: Add TRACE_DEFINE_ENUM() macro to map enums to their values
Several tracepoints use the helper functions __print_symbolic() or
__print_flags() and pass in enums that do the mapping between the
binary data stored and the value to print. This works well for reading
the ASCII trace files, but when the data is read via userspace tools
such as perf and trace-cmd, the conversion of the binary value to a
human string format is lost if an enum is used, as userspace does not
have access to what the ENUM is.
For example, the tracepoint trace_tlb_flush() has:
__print_symbolic(REC->reason,
{ TLB_FLUSH_ON_TASK_SWITCH, "flush on task switch" },
{ TLB_REMOTE_SHOOTDOWN, "remote shootdown" },
{ TLB_LOCAL_SHOOTDOWN, "local shootdown" },
{ TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" })
Which maps the enum values to the strings they represent. But perf and
trace-cmd do no know what value TLB_LOCAL_MM_SHOOTDOWN is, and would
not be able to map it.
With TRACE_DEFINE_ENUM(), developers can place these in the event header
files and ftrace will convert the enums to their values:
By adding:
TRACE_DEFINE_ENUM(TLB_FLUSH_ON_TASK_SWITCH);
TRACE_DEFINE_ENUM(TLB_REMOTE_SHOOTDOWN);
TRACE_DEFINE_ENUM(TLB_LOCAL_SHOOTDOWN);
TRACE_DEFINE_ENUM(TLB_LOCAL_MM_SHOOTDOWN);
$ cat /sys/kernel/debug/tracing/events/tlb/tlb_flush/format
[...]
__print_symbolic(REC->reason,
{ 0, "flush on task switch" },
{ 1, "remote shootdown" },
{ 2, "local shootdown" },
{ 3, "local mm shootdown" })
The above is what userspace expects to see, and tools do not need to
be modified to parse them.
Link: http://lkml.kernel.org/r/20150403013802.220157513@goodmis.org
Cc: Guilherme Cox <cox@computer.org>
Cc: Tony Luck <tony.luck@gmail.com>
Cc: Xie XiuQi <xiexiuqi@huawei.com>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2015-03-24 21:58:09 +00:00
|
|
|
{
|
|
|
|
int rlen;
|
|
|
|
int elen;
|
|
|
|
|
2017-05-31 21:56:49 +00:00
|
|
|
/* Find the length of the eval value as a string */
|
2017-05-31 21:56:43 +00:00
|
|
|
elen = snprintf(ptr, 0, "%ld", map->eval_value);
|
tracing: Add TRACE_DEFINE_ENUM() macro to map enums to their values
Several tracepoints use the helper functions __print_symbolic() or
__print_flags() and pass in enums that do the mapping between the
binary data stored and the value to print. This works well for reading
the ASCII trace files, but when the data is read via userspace tools
such as perf and trace-cmd, the conversion of the binary value to a
human string format is lost if an enum is used, as userspace does not
have access to what the ENUM is.
For example, the tracepoint trace_tlb_flush() has:
__print_symbolic(REC->reason,
{ TLB_FLUSH_ON_TASK_SWITCH, "flush on task switch" },
{ TLB_REMOTE_SHOOTDOWN, "remote shootdown" },
{ TLB_LOCAL_SHOOTDOWN, "local shootdown" },
{ TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" })
Which maps the enum values to the strings they represent. But perf and
trace-cmd do no know what value TLB_LOCAL_MM_SHOOTDOWN is, and would
not be able to map it.
With TRACE_DEFINE_ENUM(), developers can place these in the event header
files and ftrace will convert the enums to their values:
By adding:
TRACE_DEFINE_ENUM(TLB_FLUSH_ON_TASK_SWITCH);
TRACE_DEFINE_ENUM(TLB_REMOTE_SHOOTDOWN);
TRACE_DEFINE_ENUM(TLB_LOCAL_SHOOTDOWN);
TRACE_DEFINE_ENUM(TLB_LOCAL_MM_SHOOTDOWN);
$ cat /sys/kernel/debug/tracing/events/tlb/tlb_flush/format
[...]
__print_symbolic(REC->reason,
{ 0, "flush on task switch" },
{ 1, "remote shootdown" },
{ 2, "local shootdown" },
{ 3, "local mm shootdown" })
The above is what userspace expects to see, and tools do not need to
be modified to parse them.
Link: http://lkml.kernel.org/r/20150403013802.220157513@goodmis.org
Cc: Guilherme Cox <cox@computer.org>
Cc: Tony Luck <tony.luck@gmail.com>
Cc: Xie XiuQi <xiexiuqi@huawei.com>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2015-03-24 21:58:09 +00:00
|
|
|
/* Make sure there's enough room to replace the string with the value */
|
|
|
|
if (len < elen)
|
|
|
|
return NULL;
|
|
|
|
|
2017-05-31 21:56:43 +00:00
|
|
|
snprintf(ptr, elen + 1, "%ld", map->eval_value);
|
tracing: Add TRACE_DEFINE_ENUM() macro to map enums to their values
Several tracepoints use the helper functions __print_symbolic() or
__print_flags() and pass in enums that do the mapping between the
binary data stored and the value to print. This works well for reading
the ASCII trace files, but when the data is read via userspace tools
such as perf and trace-cmd, the conversion of the binary value to a
human string format is lost if an enum is used, as userspace does not
have access to what the ENUM is.
For example, the tracepoint trace_tlb_flush() has:
__print_symbolic(REC->reason,
{ TLB_FLUSH_ON_TASK_SWITCH, "flush on task switch" },
{ TLB_REMOTE_SHOOTDOWN, "remote shootdown" },
{ TLB_LOCAL_SHOOTDOWN, "local shootdown" },
{ TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" })
Which maps the enum values to the strings they represent. But perf and
trace-cmd do no know what value TLB_LOCAL_MM_SHOOTDOWN is, and would
not be able to map it.
With TRACE_DEFINE_ENUM(), developers can place these in the event header
files and ftrace will convert the enums to their values:
By adding:
TRACE_DEFINE_ENUM(TLB_FLUSH_ON_TASK_SWITCH);
TRACE_DEFINE_ENUM(TLB_REMOTE_SHOOTDOWN);
TRACE_DEFINE_ENUM(TLB_LOCAL_SHOOTDOWN);
TRACE_DEFINE_ENUM(TLB_LOCAL_MM_SHOOTDOWN);
$ cat /sys/kernel/debug/tracing/events/tlb/tlb_flush/format
[...]
__print_symbolic(REC->reason,
{ 0, "flush on task switch" },
{ 1, "remote shootdown" },
{ 2, "local shootdown" },
{ 3, "local mm shootdown" })
The above is what userspace expects to see, and tools do not need to
be modified to parse them.
Link: http://lkml.kernel.org/r/20150403013802.220157513@goodmis.org
Cc: Guilherme Cox <cox@computer.org>
Cc: Tony Luck <tony.luck@gmail.com>
Cc: Xie XiuQi <xiexiuqi@huawei.com>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2015-03-24 21:58:09 +00:00
|
|
|
|
|
|
|
/* Get the rest of the string of ptr */
|
|
|
|
rlen = strlen(ptr + len);
|
|
|
|
memmove(ptr + elen, ptr + len, rlen);
|
|
|
|
/* Make sure we end the new string */
|
|
|
|
ptr[elen + rlen] = 0;
|
|
|
|
|
|
|
|
return ptr + elen;
|
|
|
|
}
|
|
|
|
|
2015-05-05 15:45:27 +00:00
|
|
|
static void update_event_printk(struct trace_event_call *call,
|
2017-05-31 21:56:43 +00:00
|
|
|
struct trace_eval_map *map)
|
tracing: Add TRACE_DEFINE_ENUM() macro to map enums to their values
Several tracepoints use the helper functions __print_symbolic() or
__print_flags() and pass in enums that do the mapping between the
binary data stored and the value to print. This works well for reading
the ASCII trace files, but when the data is read via userspace tools
such as perf and trace-cmd, the conversion of the binary value to a
human string format is lost if an enum is used, as userspace does not
have access to what the ENUM is.
For example, the tracepoint trace_tlb_flush() has:
__print_symbolic(REC->reason,
{ TLB_FLUSH_ON_TASK_SWITCH, "flush on task switch" },
{ TLB_REMOTE_SHOOTDOWN, "remote shootdown" },
{ TLB_LOCAL_SHOOTDOWN, "local shootdown" },
{ TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" })
Which maps the enum values to the strings they represent. But perf and
trace-cmd do not know what value TLB_LOCAL_MM_SHOOTDOWN is, and would
not be able to map it.
With TRACE_DEFINE_ENUM(), developers can place these in the event header
files and ftrace will convert the enums to their values:
By adding:
TRACE_DEFINE_ENUM(TLB_FLUSH_ON_TASK_SWITCH);
TRACE_DEFINE_ENUM(TLB_REMOTE_SHOOTDOWN);
TRACE_DEFINE_ENUM(TLB_LOCAL_SHOOTDOWN);
TRACE_DEFINE_ENUM(TLB_LOCAL_MM_SHOOTDOWN);
$ cat /sys/kernel/debug/tracing/events/tlb/tlb_flush/format
[...]
__print_symbolic(REC->reason,
{ 0, "flush on task switch" },
{ 1, "remote shootdown" },
{ 2, "local shootdown" },
{ 3, "local mm shootdown" })
The above is what userspace expects to see, and tools do not need to
be modified to parse them.
Link: http://lkml.kernel.org/r/20150403013802.220157513@goodmis.org
Cc: Guilherme Cox <cox@computer.org>
Cc: Tony Luck <tony.luck@gmail.com>
Cc: Xie XiuQi <xiexiuqi@huawei.com>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2015-03-24 21:58:09 +00:00
|
|
|
{
|
|
|
|
char *ptr;
|
|
|
|
int quote = 0;
|
2017-05-31 21:56:43 +00:00
|
|
|
int len = strlen(map->eval_string);
|
tracing: Add TRACE_DEFINE_ENUM() macro to map enums to their values
Several tracepoints use the helper functions __print_symbolic() or
__print_flags() and pass in enums that do the mapping between the
binary data stored and the value to print. This works well for reading
the ASCII trace files, but when the data is read via userspace tools
such as perf and trace-cmd, the conversion of the binary value to a
human string format is lost if an enum is used, as userspace does not
have access to what the ENUM is.
For example, the tracepoint trace_tlb_flush() has:
__print_symbolic(REC->reason,
{ TLB_FLUSH_ON_TASK_SWITCH, "flush on task switch" },
{ TLB_REMOTE_SHOOTDOWN, "remote shootdown" },
{ TLB_LOCAL_SHOOTDOWN, "local shootdown" },
{ TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" })
Which maps the enum values to the strings they represent. But perf and
trace-cmd do no know what value TLB_LOCAL_MM_SHOOTDOWN is, and would
not be able to map it.
With TRACE_DEFINE_ENUM(), developers can place these in the event header
files and ftrace will convert the enums to their values:
By adding:
TRACE_DEFINE_ENUM(TLB_FLUSH_ON_TASK_SWITCH);
TRACE_DEFINE_ENUM(TLB_REMOTE_SHOOTDOWN);
TRACE_DEFINE_ENUM(TLB_LOCAL_SHOOTDOWN);
TRACE_DEFINE_ENUM(TLB_LOCAL_MM_SHOOTDOWN);
$ cat /sys/kernel/debug/tracing/events/tlb/tlb_flush/format
[...]
__print_symbolic(REC->reason,
{ 0, "flush on task switch" },
{ 1, "remote shootdown" },
{ 2, "local shootdown" },
{ 3, "local mm shootdown" })
The above is what userspace expects to see, and tools do not need to
be modified to parse them.
Link: http://lkml.kernel.org/r/20150403013802.220157513@goodmis.org
Cc: Guilherme Cox <cox@computer.org>
Cc: Tony Luck <tony.luck@gmail.com>
Cc: Xie XiuQi <xiexiuqi@huawei.com>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2015-03-24 21:58:09 +00:00
|
|
|
|
|
|
|
for (ptr = call->print_fmt; *ptr; ptr++) {
|
|
|
|
if (*ptr == '\\') {
|
|
|
|
ptr++;
|
|
|
|
/* paranoid */
|
|
|
|
if (!*ptr)
|
|
|
|
break;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (*ptr == '"') {
|
|
|
|
quote ^= 1;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (quote)
|
|
|
|
continue;
|
|
|
|
if (isdigit(*ptr)) {
|
|
|
|
/* skip numbers */
|
|
|
|
do {
|
|
|
|
ptr++;
|
|
|
|
/* Check for alpha chars like ULL */
|
|
|
|
} while (isalnum(*ptr));
|
2015-04-17 14:27:57 +00:00
|
|
|
if (!*ptr)
|
|
|
|
break;
|
tracing: Add TRACE_DEFINE_ENUM() macro to map enums to their values
Several tracepoints use the helper functions __print_symbolic() or
__print_flags() and pass in enums that do the mapping between the
binary data stored and the value to print. This works well for reading
the ASCII trace files, but when the data is read via userspace tools
such as perf and trace-cmd, the conversion of the binary value to a
human string format is lost if an enum is used, as userspace does not
have access to what the ENUM is.
For example, the tracepoint trace_tlb_flush() has:
__print_symbolic(REC->reason,
{ TLB_FLUSH_ON_TASK_SWITCH, "flush on task switch" },
{ TLB_REMOTE_SHOOTDOWN, "remote shootdown" },
{ TLB_LOCAL_SHOOTDOWN, "local shootdown" },
{ TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" })
Which maps the enum values to the strings they represent. But perf and
trace-cmd do no know what value TLB_LOCAL_MM_SHOOTDOWN is, and would
not be able to map it.
With TRACE_DEFINE_ENUM(), developers can place these in the event header
files and ftrace will convert the enums to their values:
By adding:
TRACE_DEFINE_ENUM(TLB_FLUSH_ON_TASK_SWITCH);
TRACE_DEFINE_ENUM(TLB_REMOTE_SHOOTDOWN);
TRACE_DEFINE_ENUM(TLB_LOCAL_SHOOTDOWN);
TRACE_DEFINE_ENUM(TLB_LOCAL_MM_SHOOTDOWN);
$ cat /sys/kernel/debug/tracing/events/tlb/tlb_flush/format
[...]
__print_symbolic(REC->reason,
{ 0, "flush on task switch" },
{ 1, "remote shootdown" },
{ 2, "local shootdown" },
{ 3, "local mm shootdown" })
The above is what userspace expects to see, and tools do not need to
be modified to parse them.
Link: http://lkml.kernel.org/r/20150403013802.220157513@goodmis.org
Cc: Guilherme Cox <cox@computer.org>
Cc: Tony Luck <tony.luck@gmail.com>
Cc: Xie XiuQi <xiexiuqi@huawei.com>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2015-03-24 21:58:09 +00:00
|
|
|
/*
|
|
|
|
* A number must have some kind of delimiter after
|
|
|
|
* it, and we can ignore that too.
|
|
|
|
*/
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (isalpha(*ptr) || *ptr == '_') {
|
2017-05-31 21:56:43 +00:00
|
|
|
if (strncmp(map->eval_string, ptr, len) == 0 &&
|
tracing: Add TRACE_DEFINE_ENUM() macro to map enums to their values
Several tracepoints use the helper functions __print_symbolic() or
__print_flags() and pass in enums that do the mapping between the
binary data stored and the value to print. This works well for reading
the ASCII trace files, but when the data is read via userspace tools
such as perf and trace-cmd, the conversion of the binary value to a
human string format is lost if an enum is used, as userspace does not
have access to what the ENUM is.
For example, the tracepoint trace_tlb_flush() has:
__print_symbolic(REC->reason,
{ TLB_FLUSH_ON_TASK_SWITCH, "flush on task switch" },
{ TLB_REMOTE_SHOOTDOWN, "remote shootdown" },
{ TLB_LOCAL_SHOOTDOWN, "local shootdown" },
{ TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" })
Which maps the enum values to the strings they represent. But perf and
trace-cmd do no know what value TLB_LOCAL_MM_SHOOTDOWN is, and would
not be able to map it.
With TRACE_DEFINE_ENUM(), developers can place these in the event header
files and ftrace will convert the enums to their values:
By adding:
TRACE_DEFINE_ENUM(TLB_FLUSH_ON_TASK_SWITCH);
TRACE_DEFINE_ENUM(TLB_REMOTE_SHOOTDOWN);
TRACE_DEFINE_ENUM(TLB_LOCAL_SHOOTDOWN);
TRACE_DEFINE_ENUM(TLB_LOCAL_MM_SHOOTDOWN);
$ cat /sys/kernel/debug/tracing/events/tlb/tlb_flush/format
[...]
__print_symbolic(REC->reason,
{ 0, "flush on task switch" },
{ 1, "remote shootdown" },
{ 2, "local shootdown" },
{ 3, "local mm shootdown" })
The above is what userspace expects to see, and tools do not need to
be modified to parse them.
Link: http://lkml.kernel.org/r/20150403013802.220157513@goodmis.org
Cc: Guilherme Cox <cox@computer.org>
Cc: Tony Luck <tony.luck@gmail.com>
Cc: Xie XiuQi <xiexiuqi@huawei.com>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2015-03-24 21:58:09 +00:00
|
|
|
!isalnum(ptr[len]) && ptr[len] != '_') {
|
2017-05-31 21:56:49 +00:00
|
|
|
ptr = eval_replace(ptr, map, len);
|
|
|
|
/* enum/sizeof string smaller than value */
|
tracing: Add TRACE_DEFINE_ENUM() macro to map enums to their values
Several tracepoints use the helper functions __print_symbolic() or
__print_flags() and pass in enums that do the mapping between the
binary data stored and the value to print. This works well for reading
the ASCII trace files, but when the data is read via userspace tools
such as perf and trace-cmd, the conversion of the binary value to a
human string format is lost if an enum is used, as userspace does not
have access to what the ENUM is.
For example, the tracepoint trace_tlb_flush() has:
__print_symbolic(REC->reason,
{ TLB_FLUSH_ON_TASK_SWITCH, "flush on task switch" },
{ TLB_REMOTE_SHOOTDOWN, "remote shootdown" },
{ TLB_LOCAL_SHOOTDOWN, "local shootdown" },
{ TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" })
Which maps the enum values to the strings they represent. But perf and
trace-cmd do no know what value TLB_LOCAL_MM_SHOOTDOWN is, and would
not be able to map it.
With TRACE_DEFINE_ENUM(), developers can place these in the event header
files and ftrace will convert the enums to their values:
By adding:
TRACE_DEFINE_ENUM(TLB_FLUSH_ON_TASK_SWITCH);
TRACE_DEFINE_ENUM(TLB_REMOTE_SHOOTDOWN);
TRACE_DEFINE_ENUM(TLB_LOCAL_SHOOTDOWN);
TRACE_DEFINE_ENUM(TLB_LOCAL_MM_SHOOTDOWN);
$ cat /sys/kernel/debug/tracing/events/tlb/tlb_flush/format
[...]
__print_symbolic(REC->reason,
{ 0, "flush on task switch" },
{ 1, "remote shootdown" },
{ 2, "local shootdown" },
{ 3, "local mm shootdown" })
The above is what userspace expects to see, and tools do not need to
be modified to parse them.
Link: http://lkml.kernel.org/r/20150403013802.220157513@goodmis.org
Cc: Guilherme Cox <cox@computer.org>
Cc: Tony Luck <tony.luck@gmail.com>
Cc: Xie XiuQi <xiexiuqi@huawei.com>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2015-03-24 21:58:09 +00:00
|
|
|
if (WARN_ON_ONCE(!ptr))
|
|
|
|
return;
|
|
|
|
/*
|
2017-05-31 21:56:49 +00:00
|
|
|
* No need to decrement here, as eval_replace()
|
tracing: Add TRACE_DEFINE_ENUM() macro to map enums to their values
Several tracepoints use the helper functions __print_symbolic() or
__print_flags() and pass in enums that do the mapping between the
binary data stored and the value to print. This works well for reading
the ASCII trace files, but when the data is read via userspace tools
such as perf and trace-cmd, the conversion of the binary value to a
human string format is lost if an enum is used, as userspace does not
have access to what the ENUM is.
For example, the tracepoint trace_tlb_flush() has:
__print_symbolic(REC->reason,
{ TLB_FLUSH_ON_TASK_SWITCH, "flush on task switch" },
{ TLB_REMOTE_SHOOTDOWN, "remote shootdown" },
{ TLB_LOCAL_SHOOTDOWN, "local shootdown" },
{ TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" })
Which maps the enum values to the strings they represent. But perf and
trace-cmd do no know what value TLB_LOCAL_MM_SHOOTDOWN is, and would
not be able to map it.
With TRACE_DEFINE_ENUM(), developers can place these in the event header
files and ftrace will convert the enums to their values:
By adding:
TRACE_DEFINE_ENUM(TLB_FLUSH_ON_TASK_SWITCH);
TRACE_DEFINE_ENUM(TLB_REMOTE_SHOOTDOWN);
TRACE_DEFINE_ENUM(TLB_LOCAL_SHOOTDOWN);
TRACE_DEFINE_ENUM(TLB_LOCAL_MM_SHOOTDOWN);
$ cat /sys/kernel/debug/tracing/events/tlb/tlb_flush/format
[...]
__print_symbolic(REC->reason,
{ 0, "flush on task switch" },
{ 1, "remote shootdown" },
{ 2, "local shootdown" },
{ 3, "local mm shootdown" })
The above is what userspace expects to see, and tools do not need to
be modified to parse them.
Link: http://lkml.kernel.org/r/20150403013802.220157513@goodmis.org
Cc: Guilherme Cox <cox@computer.org>
Cc: Tony Luck <tony.luck@gmail.com>
Cc: Xie XiuQi <xiexiuqi@huawei.com>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2015-03-24 21:58:09 +00:00
|
|
|
* returns the pointer to the character passed
|
2017-05-31 21:56:49 +00:00
|
|
|
* the eval, and two evals can not be placed
|
tracing: Add TRACE_DEFINE_ENUM() macro to map enums to their values
Several tracepoints use the helper functions __print_symbolic() or
__print_flags() and pass in enums that do the mapping between the
binary data stored and the value to print. This works well for reading
the ASCII trace files, but when the data is read via userspace tools
such as perf and trace-cmd, the conversion of the binary value to a
human string format is lost if an enum is used, as userspace does not
have access to what the ENUM is.
For example, the tracepoint trace_tlb_flush() has:
__print_symbolic(REC->reason,
{ TLB_FLUSH_ON_TASK_SWITCH, "flush on task switch" },
{ TLB_REMOTE_SHOOTDOWN, "remote shootdown" },
{ TLB_LOCAL_SHOOTDOWN, "local shootdown" },
{ TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" })
Which maps the enum values to the strings they represent. But perf and
trace-cmd do no know what value TLB_LOCAL_MM_SHOOTDOWN is, and would
not be able to map it.
With TRACE_DEFINE_ENUM(), developers can place these in the event header
files and ftrace will convert the enums to their values:
By adding:
TRACE_DEFINE_ENUM(TLB_FLUSH_ON_TASK_SWITCH);
TRACE_DEFINE_ENUM(TLB_REMOTE_SHOOTDOWN);
TRACE_DEFINE_ENUM(TLB_LOCAL_SHOOTDOWN);
TRACE_DEFINE_ENUM(TLB_LOCAL_MM_SHOOTDOWN);
$ cat /sys/kernel/debug/tracing/events/tlb/tlb_flush/format
[...]
__print_symbolic(REC->reason,
{ 0, "flush on task switch" },
{ 1, "remote shootdown" },
{ 2, "local shootdown" },
{ 3, "local mm shootdown" })
The above is what userspace expects to see, and tools do not need to
be modified to parse them.
Link: http://lkml.kernel.org/r/20150403013802.220157513@goodmis.org
Cc: Guilherme Cox <cox@computer.org>
Cc: Tony Luck <tony.luck@gmail.com>
Cc: Xie XiuQi <xiexiuqi@huawei.com>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2015-03-24 21:58:09 +00:00
|
|
|
* back to back without something in between.
|
|
|
|
* We can skip that something in between.
|
|
|
|
*/
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
skip_more:
|
|
|
|
do {
|
|
|
|
ptr++;
|
|
|
|
} while (isalnum(*ptr) || *ptr == '_');
|
2015-04-17 14:27:57 +00:00
|
|
|
if (!*ptr)
|
|
|
|
break;
|
tracing: Add TRACE_DEFINE_ENUM() macro to map enums to their values
Several tracepoints use the helper functions __print_symbolic() or
__print_flags() and pass in enums that do the mapping between the
binary data stored and the value to print. This works well for reading
the ASCII trace files, but when the data is read via userspace tools
such as perf and trace-cmd, the conversion of the binary value to a
human string format is lost if an enum is used, as userspace does not
have access to what the ENUM is.
For example, the tracepoint trace_tlb_flush() has:
__print_symbolic(REC->reason,
{ TLB_FLUSH_ON_TASK_SWITCH, "flush on task switch" },
{ TLB_REMOTE_SHOOTDOWN, "remote shootdown" },
{ TLB_LOCAL_SHOOTDOWN, "local shootdown" },
{ TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" })
Which maps the enum values to the strings they represent. But perf and
trace-cmd do no know what value TLB_LOCAL_MM_SHOOTDOWN is, and would
not be able to map it.
With TRACE_DEFINE_ENUM(), developers can place these in the event header
files and ftrace will convert the enums to their values:
By adding:
TRACE_DEFINE_ENUM(TLB_FLUSH_ON_TASK_SWITCH);
TRACE_DEFINE_ENUM(TLB_REMOTE_SHOOTDOWN);
TRACE_DEFINE_ENUM(TLB_LOCAL_SHOOTDOWN);
TRACE_DEFINE_ENUM(TLB_LOCAL_MM_SHOOTDOWN);
$ cat /sys/kernel/debug/tracing/events/tlb/tlb_flush/format
[...]
__print_symbolic(REC->reason,
{ 0, "flush on task switch" },
{ 1, "remote shootdown" },
{ 2, "local shootdown" },
{ 3, "local mm shootdown" })
The above is what userspace expects to see, and tools do not need to
be modified to parse them.
Link: http://lkml.kernel.org/r/20150403013802.220157513@goodmis.org
Cc: Guilherme Cox <cox@computer.org>
Cc: Tony Luck <tony.luck@gmail.com>
Cc: Xie XiuQi <xiexiuqi@huawei.com>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2015-03-24 21:58:09 +00:00
|
|
|
/*
|
|
|
|
* If what comes after this variable is a '.' or
|
|
|
|
* '->' then we can continue to ignore that string.
|
|
|
|
*/
|
|
|
|
if (*ptr == '.' || (ptr[0] == '-' && ptr[1] == '>')) {
|
|
|
|
ptr += *ptr == '.' ? 1 : 2;
|
2015-04-17 14:27:57 +00:00
|
|
|
if (!*ptr)
|
|
|
|
break;
|
tracing: Add TRACE_DEFINE_ENUM() macro to map enums to their values
Several tracepoints use the helper functions __print_symbolic() or
__print_flags() and pass in enums that do the mapping between the
binary data stored and the value to print. This works well for reading
the ASCII trace files, but when the data is read via userspace tools
such as perf and trace-cmd, the conversion of the binary value to a
human string format is lost if an enum is used, as userspace does not
have access to what the ENUM is.
For example, the tracepoint trace_tlb_flush() has:
__print_symbolic(REC->reason,
{ TLB_FLUSH_ON_TASK_SWITCH, "flush on task switch" },
{ TLB_REMOTE_SHOOTDOWN, "remote shootdown" },
{ TLB_LOCAL_SHOOTDOWN, "local shootdown" },
{ TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" })
Which maps the enum values to the strings they represent. But perf and
trace-cmd do no know what value TLB_LOCAL_MM_SHOOTDOWN is, and would
not be able to map it.
With TRACE_DEFINE_ENUM(), developers can place these in the event header
files and ftrace will convert the enums to their values:
By adding:
TRACE_DEFINE_ENUM(TLB_FLUSH_ON_TASK_SWITCH);
TRACE_DEFINE_ENUM(TLB_REMOTE_SHOOTDOWN);
TRACE_DEFINE_ENUM(TLB_LOCAL_SHOOTDOWN);
TRACE_DEFINE_ENUM(TLB_LOCAL_MM_SHOOTDOWN);
$ cat /sys/kernel/debug/tracing/events/tlb/tlb_flush/format
[...]
__print_symbolic(REC->reason,
{ 0, "flush on task switch" },
{ 1, "remote shootdown" },
{ 2, "local shootdown" },
{ 3, "local mm shootdown" })
The above is what userspace expects to see, and tools do not need to
be modified to parse them.
Link: http://lkml.kernel.org/r/20150403013802.220157513@goodmis.org
Cc: Guilherme Cox <cox@computer.org>
Cc: Tony Luck <tony.luck@gmail.com>
Cc: Xie XiuQi <xiexiuqi@huawei.com>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2015-03-24 21:58:09 +00:00
|
|
|
goto skip_more;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Once again, we can skip the delimiter that came
|
|
|
|
* after the string.
|
|
|
|
*/
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
tracing: Have type enum modifications copy the strings
When an enum is used in the visible parts of a trace event that is
exported to user space, the user space applications like perf and
trace-cmd do not have a way to know what the value of the enum is. To
solve this, at boot up (or module load) the printk formats are modified to
replace the enum with their numeric value in the string output.
Array fields of the event are defined by [<nr-elements>] in the type
portion of the format file so that the user space parsers can correctly
parse the array into the appropriate size chunks. But in some trace
events, an enum is used in defining the size of the array, which once
again breaks the parsing of user space tooling.
This was solved the same way as the print formats were, but it modified
the type strings of the trace event. This caused crashes in some
architectures because the type string, as opposed to the print string, is a const string
value. This was not detected on x86, as it appears that const strings are
still writable (at least in boot up), but other architectures this is not
the case, and writing to a const string will cause a kernel fault.
To fix this, use kstrdup() to copy the type before modifying it. If the
trace event is for the core kernel there's no need to free it because the
string will be in use for the life of the machine being on line. For
modules, create a link list to store all the strings being allocated for
modules and when the module is removed, free them.
Link: https://lore.kernel.org/all/yt9dr1706b4i.fsf@linux.ibm.com/
Link: https://lkml.kernel.org/r/20220318153432.3984b871@gandalf.local.home
Tested-by: Marc Zyngier <maz@kernel.org>
Tested-by: Sven Schnelle <svens@linux.ibm.com>
Reported-by: Sven Schnelle <svens@linux.ibm.com>
Fixes: b3bc8547d3be ("tracing: Have TRACE_DEFINE_ENUM affect trace event types as well")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2022-03-18 19:34:32 +00:00
|
|
|
static void add_str_to_module(struct module *module, char *str)
|
|
|
|
{
|
|
|
|
struct module_string *modstr;
|
|
|
|
|
|
|
|
modstr = kmalloc(sizeof(*modstr), GFP_KERNEL);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we failed to allocate memory here, then we'll just
|
|
|
|
* let the str memory leak when the module is removed.
|
|
|
|
* If this fails to allocate, there's worse problems than
|
|
|
|
* a leaked string on module removal.
|
|
|
|
*/
|
|
|
|
if (WARN_ON_ONCE(!modstr))
|
|
|
|
return;
|
|
|
|
|
|
|
|
modstr->module = module;
|
|
|
|
modstr->str = str;
|
|
|
|
|
|
|
|
list_add(&modstr->next, &module_strings);
|
|
|
|
}
|
|
|
|
|
2022-03-11 04:27:38 +00:00
|
|
|
static void update_event_fields(struct trace_event_call *call,
|
|
|
|
struct trace_eval_map *map)
|
|
|
|
{
|
|
|
|
struct ftrace_event_field *field;
|
|
|
|
struct list_head *head;
|
|
|
|
char *ptr;
|
tracing: Have type enum modifications copy the strings
When an enum is used in the visible parts of a trace event that is
exported to user space, the user space applications like perf and
trace-cmd do not have a way to know what the value of the enum is. To
solve this, at boot up (or module load) the printk formats are modified to
replace the enum with their numeric value in the string output.
Array fields of the event are defined by [<nr-elements>] in the type
portion of the format file so that the user space parsers can correctly
parse the array into the appropriate size chunks. But in some trace
events, an enum is used in defining the size of the array, which once
again breaks the parsing of user space tooling.
This was solved the same way as the print formats were, but it modified
the type strings of the trace event. This caused crashes in some
architectures because, as supposed to the print string, is a const string
value. This was not detected on x86, as it appears that const strings are
still writable (at least in boot up), but other architectures this is not
the case, and writing to a const string will cause a kernel fault.
To fix this, use kstrdup() to copy the type before modifying it. If the
trace event is for the core kernel there's no need to free it because the
string will be in use for the life of the machine being on line. For
modules, create a link list to store all the strings being allocated for
modules and when the module is removed, free them.
Link: https://lore.kernel.org/all/yt9dr1706b4i.fsf@linux.ibm.com/
Link: https://lkml.kernel.org/r/20220318153432.3984b871@gandalf.local.home
Tested-by: Marc Zyngier <maz@kernel.org>
Tested-by: Sven Schnelle <svens@linux.ibm.com>
Reported-by: Sven Schnelle <svens@linux.ibm.com>
Fixes: b3bc8547d3be ("tracing: Have TRACE_DEFINE_ENUM affect trace event types as well")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2022-03-18 19:34:32 +00:00
|
|
|
char *str;
|
2022-03-11 04:27:38 +00:00
|
|
|
int len = strlen(map->eval_string);
|
|
|
|
|
tracing: Have type enum modifications copy the strings
When an enum is used in the visible parts of a trace event that is
exported to user space, the user space applications like perf and
trace-cmd do not have a way to know what the value of the enum is. To
solve this, at boot up (or module load) the printk formats are modified to
replace the enum with their numeric value in the string output.
Array fields of the event are defined by [<nr-elements>] in the type
portion of the format file so that the user space parsers can correctly
parse the array into the appropriate size chunks. But in some trace
events, an enum is used in defining the size of the array, which once
again breaks the parsing of user space tooling.
This was solved the same way as the print formats were, but it modified
the type strings of the trace event. This caused crashes in some
architectures because, as supposed to the print string, is a const string
value. This was not detected on x86, as it appears that const strings are
still writable (at least in boot up), but other architectures this is not
the case, and writing to a const string will cause a kernel fault.
To fix this, use kstrdup() to copy the type before modifying it. If the
trace event is for the core kernel there's no need to free it because the
string will be in use for the life of the machine being on line. For
modules, create a link list to store all the strings being allocated for
modules and when the module is removed, free them.
Link: https://lore.kernel.org/all/yt9dr1706b4i.fsf@linux.ibm.com/
Link: https://lkml.kernel.org/r/20220318153432.3984b871@gandalf.local.home
Tested-by: Marc Zyngier <maz@kernel.org>
Tested-by: Sven Schnelle <svens@linux.ibm.com>
Reported-by: Sven Schnelle <svens@linux.ibm.com>
Fixes: b3bc8547d3be ("tracing: Have TRACE_DEFINE_ENUM affect trace event types as well")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2022-03-18 19:34:32 +00:00
|
|
|
/* Dynamic events should never have field maps */
|
|
|
|
if (WARN_ON_ONCE(call->flags & TRACE_EVENT_FL_DYNAMIC))
|
|
|
|
return;
|
|
|
|
|
2022-03-11 04:27:38 +00:00
|
|
|
head = trace_get_fields(call);
|
|
|
|
list_for_each_entry(field, head, link) {
|
|
|
|
ptr = strchr(field->type, '[');
|
|
|
|
if (!ptr)
|
|
|
|
continue;
|
|
|
|
ptr++;
|
|
|
|
|
|
|
|
if (!isalpha(*ptr) && *ptr != '_')
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (strncmp(map->eval_string, ptr, len) != 0)
|
|
|
|
continue;
|
|
|
|
|
tracing: Have type enum modifications copy the strings
When an enum is used in the visible parts of a trace event that is
exported to user space, the user space applications like perf and
trace-cmd do not have a way to know what the value of the enum is. To
solve this, at boot up (or module load) the printk formats are modified to
replace the enum with their numeric value in the string output.
Array fields of the event are defined by [<nr-elements>] in the type
portion of the format file so that the user space parsers can correctly
parse the array into the appropriate size chunks. But in some trace
events, an enum is used in defining the size of the array, which once
again breaks the parsing of user space tooling.
This was solved the same way as the print formats were, but it modified
the type strings of the trace event. This caused crashes in some
architectures because, as supposed to the print string, is a const string
value. This was not detected on x86, as it appears that const strings are
still writable (at least in boot up), but other architectures this is not
the case, and writing to a const string will cause a kernel fault.
To fix this, use kstrdup() to copy the type before modifying it. If the
trace event is for the core kernel there's no need to free it because the
string will be in use for the life of the machine being on line. For
modules, create a link list to store all the strings being allocated for
modules and when the module is removed, free them.
Link: https://lore.kernel.org/all/yt9dr1706b4i.fsf@linux.ibm.com/
Link: https://lkml.kernel.org/r/20220318153432.3984b871@gandalf.local.home
Tested-by: Marc Zyngier <maz@kernel.org>
Tested-by: Sven Schnelle <svens@linux.ibm.com>
Reported-by: Sven Schnelle <svens@linux.ibm.com>
Fixes: b3bc8547d3be ("tracing: Have TRACE_DEFINE_ENUM affect trace event types as well")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2022-03-18 19:34:32 +00:00
|
|
|
str = kstrdup(field->type, GFP_KERNEL);
|
|
|
|
if (WARN_ON_ONCE(!str))
|
|
|
|
return;
|
|
|
|
ptr = str + (ptr - field->type);
|
2022-03-11 04:27:38 +00:00
|
|
|
ptr = eval_replace(ptr, map, len);
|
|
|
|
/* enum/sizeof string smaller than value */
|
tracing: Have type enum modifications copy the strings
When an enum is used in the visible parts of a trace event that is
exported to user space, the user space applications like perf and
trace-cmd do not have a way to know what the value of the enum is. To
solve this, at boot up (or module load) the printk formats are modified to
replace the enum with their numeric value in the string output.
Array fields of the event are defined by [<nr-elements>] in the type
portion of the format file so that the user space parsers can correctly
parse the array into the appropriate size chunks. But in some trace
events, an enum is used in defining the size of the array, which once
again breaks the parsing of user space tooling.
This was solved the same way as the print formats were, but it modified
the type strings of the trace event. This caused crashes in some
architectures because, as supposed to the print string, is a const string
value. This was not detected on x86, as it appears that const strings are
still writable (at least in boot up), but other architectures this is not
the case, and writing to a const string will cause a kernel fault.
To fix this, use kstrdup() to copy the type before modifying it. If the
trace event is for the core kernel there's no need to free it because the
string will be in use for the life of the machine being on line. For
modules, create a link list to store all the strings being allocated for
modules and when the module is removed, free them.
Link: https://lore.kernel.org/all/yt9dr1706b4i.fsf@linux.ibm.com/
Link: https://lkml.kernel.org/r/20220318153432.3984b871@gandalf.local.home
Tested-by: Marc Zyngier <maz@kernel.org>
Tested-by: Sven Schnelle <svens@linux.ibm.com>
Reported-by: Sven Schnelle <svens@linux.ibm.com>
Fixes: b3bc8547d3be ("tracing: Have TRACE_DEFINE_ENUM affect trace event types as well")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2022-03-18 19:34:32 +00:00
|
|
|
if (WARN_ON_ONCE(!ptr)) {
|
|
|
|
kfree(str);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the event is part of a module, then we need to free the string
|
|
|
|
* when the module is removed. Otherwise, it will stay allocated
|
|
|
|
* until a reboot.
|
|
|
|
*/
|
|
|
|
if (call->module)
|
|
|
|
add_str_to_module(call->module, str);
|
|
|
|
|
|
|
|
field->type = str;
|
2022-03-11 04:27:38 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-05-31 21:56:48 +00:00
|
|
|
void trace_event_eval_update(struct trace_eval_map **map, int len)
|
tracing: Add TRACE_DEFINE_ENUM() macro to map enums to their values
Several tracepoints use the helper functions __print_symbolic() or
__print_flags() and pass in enums that do the mapping between the
binary data stored and the value to print. This works well for reading
the ASCII trace files, but when the data is read via userspace tools
such as perf and trace-cmd, the conversion of the binary value to a
human string format is lost if an enum is used, as userspace does not
have access to what the ENUM is.
For example, the tracepoint trace_tlb_flush() has:
__print_symbolic(REC->reason,
{ TLB_FLUSH_ON_TASK_SWITCH, "flush on task switch" },
{ TLB_REMOTE_SHOOTDOWN, "remote shootdown" },
{ TLB_LOCAL_SHOOTDOWN, "local shootdown" },
{ TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" })
Which maps the enum values to the strings they represent. But perf and
trace-cmd do no know what value TLB_LOCAL_MM_SHOOTDOWN is, and would
not be able to map it.
With TRACE_DEFINE_ENUM(), developers can place these in the event header
files and ftrace will convert the enums to their values:
By adding:
TRACE_DEFINE_ENUM(TLB_FLUSH_ON_TASK_SWITCH);
TRACE_DEFINE_ENUM(TLB_REMOTE_SHOOTDOWN);
TRACE_DEFINE_ENUM(TLB_LOCAL_SHOOTDOWN);
TRACE_DEFINE_ENUM(TLB_LOCAL_MM_SHOOTDOWN);
$ cat /sys/kernel/debug/tracing/events/tlb/tlb_flush/format
[...]
__print_symbolic(REC->reason,
{ 0, "flush on task switch" },
{ 1, "remote shootdown" },
{ 2, "local shootdown" },
{ 3, "local mm shootdown" })
The above is what userspace expects to see, and tools do not need to
be modified to parse them.
Link: http://lkml.kernel.org/r/20150403013802.220157513@goodmis.org
Cc: Guilherme Cox <cox@computer.org>
Cc: Tony Luck <tony.luck@gmail.com>
Cc: Xie XiuQi <xiexiuqi@huawei.com>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2015-03-24 21:58:09 +00:00
|
|
|
{
|
2015-05-05 15:45:27 +00:00
|
|
|
struct trace_event_call *call, *p;
|
tracing: Add TRACE_DEFINE_ENUM() macro to map enums to their values
Several tracepoints use the helper functions __print_symbolic() or
__print_flags() and pass in enums that do the mapping between the
binary data stored and the value to print. This works well for reading
the ASCII trace files, but when the data is read via userspace tools
such as perf and trace-cmd, the conversion of the binary value to a
human string format is lost if an enum is used, as userspace does not
have access to what the ENUM is.
For example, the tracepoint trace_tlb_flush() has:
__print_symbolic(REC->reason,
{ TLB_FLUSH_ON_TASK_SWITCH, "flush on task switch" },
{ TLB_REMOTE_SHOOTDOWN, "remote shootdown" },
{ TLB_LOCAL_SHOOTDOWN, "local shootdown" },
{ TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" })
Which maps the enum values to the strings they represent. But perf and
trace-cmd do not know what value TLB_LOCAL_MM_SHOOTDOWN is, and would
not be able to map it.
With TRACE_DEFINE_ENUM(), developers can place these in the event header
files and ftrace will convert the enums to their values:
By adding:
TRACE_DEFINE_ENUM(TLB_FLUSH_ON_TASK_SWITCH);
TRACE_DEFINE_ENUM(TLB_REMOTE_SHOOTDOWN);
TRACE_DEFINE_ENUM(TLB_LOCAL_SHOOTDOWN);
TRACE_DEFINE_ENUM(TLB_LOCAL_MM_SHOOTDOWN);
$ cat /sys/kernel/debug/tracing/events/tlb/tlb_flush/format
[...]
__print_symbolic(REC->reason,
{ 0, "flush on task switch" },
{ 1, "remote shootdown" },
{ 2, "local shootdown" },
{ 3, "local mm shootdown" })
The above is what userspace expects to see, and tools do not need to
be modified to parse them.
Link: http://lkml.kernel.org/r/20150403013802.220157513@goodmis.org
Cc: Guilherme Cox <cox@computer.org>
Cc: Tony Luck <tony.luck@gmail.com>
Cc: Xie XiuQi <xiexiuqi@huawei.com>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2015-03-24 21:58:09 +00:00
|
|
|
const char *last_system = NULL;
|
tracing: Fix converting enum's from the map in trace_event_eval_update()
Since enums do not get converted by the TRACE_EVENT macro into their values,
the event format displays the enum name and not the value. This breaks
tools like perf and trace-cmd that need to interpret the raw binary data. To
solve this, an enum map was created to convert these enums into their actual
numbers on boot up. This is done by TRACE_EVENTS() adding a
TRACE_DEFINE_ENUM() macro.
Some enums were not being converted. This was caused by an optimization that
had a bug in it.
All calls get checked against this enum map to see if it should be converted
or not, and it compares the call's system to the system that the enum map
was created under. If they match, then the call is processed.
To cut down on the number of iterations needed to find the maps with a
matching system, since calls and maps are grouped by system, when a match is
made, the index into the map array is saved, so that the next call, if it
belongs to the same system as the previous call, could start right at that
array index and not have to scan all the previous arrays.
The problem was, the saved index was used as the variable to know if this is
a call in a new system or not. If the index was zero, it was assumed that
the call is in a new system and would keep incrementing the saved index
until it found a matching system. The issue arises when the first matching
system was at index zero. The next map, if it belonged to the same system,
would then think it was the first match and increment the index to one. If
the next call belongs to the same system, it would begin its search of the
maps off by one, and miss the first enum that should be converted. This left
a single enum not converted properly.
Also add a comment to describe exactly what that index was for. It took me a
bit too long to figure out what I was thinking when debugging this issue.
Link: http://lkml.kernel.org/r/717BE572-2070-4C1E-9902-9F2E0FEDA4F8@oracle.com
Cc: stable@vger.kernel.org
Fixes: 0c564a538aa93 ("tracing: Add TRACE_DEFINE_ENUM() macro to map enums to their values")
Reported-by: Chuck Lever <chuck.lever@oracle.com>
Teste-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2018-01-18 20:53:10 +00:00
|
|
|
bool first = false;
|
tracing: Add TRACE_DEFINE_ENUM() macro to map enums to their values
Several tracepoints use the helper functions __print_symbolic() or
__print_flags() and pass in enums that do the mapping between the
binary data stored and the value to print. This works well for reading
the ASCII trace files, but when the data is read via userspace tools
such as perf and trace-cmd, the conversion of the binary value to a
human string format is lost if an enum is used, as userspace does not
have access to what the ENUM is.
For example, the tracepoint trace_tlb_flush() has:
__print_symbolic(REC->reason,
{ TLB_FLUSH_ON_TASK_SWITCH, "flush on task switch" },
{ TLB_REMOTE_SHOOTDOWN, "remote shootdown" },
{ TLB_LOCAL_SHOOTDOWN, "local shootdown" },
{ TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" })
Which maps the enum values to the strings they represent. But perf and
trace-cmd do not know what value TLB_LOCAL_MM_SHOOTDOWN is, and would
not be able to map it.
With TRACE_DEFINE_ENUM(), developers can place these in the event header
files and ftrace will convert the enums to their values:
By adding:
TRACE_DEFINE_ENUM(TLB_FLUSH_ON_TASK_SWITCH);
TRACE_DEFINE_ENUM(TLB_REMOTE_SHOOTDOWN);
TRACE_DEFINE_ENUM(TLB_LOCAL_SHOOTDOWN);
TRACE_DEFINE_ENUM(TLB_LOCAL_MM_SHOOTDOWN);
$ cat /sys/kernel/debug/tracing/events/tlb/tlb_flush/format
[...]
__print_symbolic(REC->reason,
{ 0, "flush on task switch" },
{ 1, "remote shootdown" },
{ 2, "local shootdown" },
{ 3, "local mm shootdown" })
The above is what userspace expects to see, and tools do not need to
be modified to parse them.
Link: http://lkml.kernel.org/r/20150403013802.220157513@goodmis.org
Cc: Guilherme Cox <cox@computer.org>
Cc: Tony Luck <tony.luck@gmail.com>
Cc: Xie XiuQi <xiexiuqi@huawei.com>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2015-03-24 21:58:09 +00:00
|
|
|
int last_i;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
down_write(&trace_event_sem);
|
|
|
|
list_for_each_entry_safe(call, p, &ftrace_events, list) {
|
|
|
|
/* events are usually grouped together with systems */
|
|
|
|
if (!last_system || call->class->system != last_system) {
|
tracing: Fix converting enum's from the map in trace_event_eval_update()
Since enums do not get converted by the TRACE_EVENT macro into their values,
the event format displays the enum name and not the value. This breaks
tools like perf and trace-cmd that need to interpret the raw binary data. To
solve this, an enum map was created to convert these enums into their actual
numbers on boot up. This is done by TRACE_EVENTS() adding a
TRACE_DEFINE_ENUM() macro.
Some enums were not being converted. This was caused by an optimization that
had a bug in it.
All calls get checked against this enum map to see if it should be converted
or not, and it compares the call's system to the system that the enum map
was created under. If they match, then the call is processed.
To cut down on the number of iterations needed to find the maps with a
matching system, since calls and maps are grouped by system, when a match is
made, the index into the map array is saved, so that the next call, if it
belongs to the same system as the previous call, could start right at that
array index and not have to scan all the previous arrays.
The problem was, the saved index was used as the variable to know if this is
a call in a new system or not. If the index was zero, it was assumed that
the call is in a new system and would keep incrementing the saved index
until it found a matching system. The issue arises when the first matching
system was at index zero. The next map, if it belonged to the same system,
would then think it was the first match and increment the index to one. If
the next call belongs to the same system, it would begin its search of the
maps off by one, and miss the first enum that should be converted. This left
a single enum not converted properly.
Also add a comment to describe exactly what that index was for. It took me a
bit too long to figure out what I was thinking when debugging this issue.
Link: http://lkml.kernel.org/r/717BE572-2070-4C1E-9902-9F2E0FEDA4F8@oracle.com
Cc: stable@vger.kernel.org
Fixes: 0c564a538aa93 ("tracing: Add TRACE_DEFINE_ENUM() macro to map enums to their values")
Reported-by: Chuck Lever <chuck.lever@oracle.com>
Teste-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2018-01-18 20:53:10 +00:00
|
|
|
first = true;
|
tracing: Add TRACE_DEFINE_ENUM() macro to map enums to their values
Several tracepoints use the helper functions __print_symbolic() or
__print_flags() and pass in enums that do the mapping between the
binary data stored and the value to print. This works well for reading
the ASCII trace files, but when the data is read via userspace tools
such as perf and trace-cmd, the conversion of the binary value to a
human string format is lost if an enum is used, as userspace does not
have access to what the ENUM is.
For example, the tracepoint trace_tlb_flush() has:
__print_symbolic(REC->reason,
{ TLB_FLUSH_ON_TASK_SWITCH, "flush on task switch" },
{ TLB_REMOTE_SHOOTDOWN, "remote shootdown" },
{ TLB_LOCAL_SHOOTDOWN, "local shootdown" },
{ TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" })
Which maps the enum values to the strings they represent. But perf and
trace-cmd do not know what value TLB_LOCAL_MM_SHOOTDOWN is, and would
not be able to map it.
With TRACE_DEFINE_ENUM(), developers can place these in the event header
files and ftrace will convert the enums to their values:
By adding:
TRACE_DEFINE_ENUM(TLB_FLUSH_ON_TASK_SWITCH);
TRACE_DEFINE_ENUM(TLB_REMOTE_SHOOTDOWN);
TRACE_DEFINE_ENUM(TLB_LOCAL_SHOOTDOWN);
TRACE_DEFINE_ENUM(TLB_LOCAL_MM_SHOOTDOWN);
$ cat /sys/kernel/debug/tracing/events/tlb/tlb_flush/format
[...]
__print_symbolic(REC->reason,
{ 0, "flush on task switch" },
{ 1, "remote shootdown" },
{ 2, "local shootdown" },
{ 3, "local mm shootdown" })
The above is what userspace expects to see, and tools do not need to
be modified to parse them.
Link: http://lkml.kernel.org/r/20150403013802.220157513@goodmis.org
Cc: Guilherme Cox <cox@computer.org>
Cc: Tony Luck <tony.luck@gmail.com>
Cc: Xie XiuQi <xiexiuqi@huawei.com>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2015-03-24 21:58:09 +00:00
|
|
|
last_i = 0;
|
|
|
|
last_system = call->class->system;
|
|
|
|
}
|
|
|
|
|
tracing: Fix converting enum's from the map in trace_event_eval_update()
Since enums do not get converted by the TRACE_EVENT macro into their values,
the event format displays the enum name and not the value. This breaks
tools like perf and trace-cmd that need to interpret the raw binary data. To
solve this, an enum map was created to convert these enums into their actual
numbers on boot up. This is done by TRACE_EVENTS() adding a
TRACE_DEFINE_ENUM() macro.
Some enums were not being converted. This was caused by an optimization that
had a bug in it.
All calls get checked against this enum map to see if it should be converted
or not, and it compares the call's system to the system that the enum map
was created under. If they match, then the call is processed.
To cut down on the number of iterations needed to find the maps with a
matching system, since calls and maps are grouped by system, when a match is
made, the index into the map array is saved, so that the next call, if it
belongs to the same system as the previous call, could start right at that
array index and not have to scan all the previous arrays.
The problem was, the saved index was used as the variable to know if this is
a call in a new system or not. If the index was zero, it was assumed that
the call is in a new system and would keep incrementing the saved index
until it found a matching system. The issue arises when the first matching
system was at index zero. The next map, if it belonged to the same system,
would then think it was the first match and increment the index to one. If
the next call belongs to the same system, it would begin its search of the
maps off by one, and miss the first enum that should be converted. This left
a single enum not converted properly.
Also add a comment to describe exactly what that index was for. It took me a
bit too long to figure out what I was thinking when debugging this issue.
Link: http://lkml.kernel.org/r/717BE572-2070-4C1E-9902-9F2E0FEDA4F8@oracle.com
Cc: stable@vger.kernel.org
Fixes: 0c564a538aa93 ("tracing: Add TRACE_DEFINE_ENUM() macro to map enums to their values")
Reported-by: Chuck Lever <chuck.lever@oracle.com>
Teste-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2018-01-18 20:53:10 +00:00
|
|
|
/*
|
2021-03-23 17:49:35 +00:00
|
|
|
* Since calls are grouped by systems, the likelihood that the
|
tracing: Fix converting enum's from the map in trace_event_eval_update()
Since enums do not get converted by the TRACE_EVENT macro into their values,
the event format displays the enum name and not the value. This breaks
tools like perf and trace-cmd that need to interpret the raw binary data. To
solve this, an enum map was created to convert these enums into their actual
numbers on boot up. This is done by TRACE_EVENTS() adding a
TRACE_DEFINE_ENUM() macro.
Some enums were not being converted. This was caused by an optimization that
had a bug in it.
All calls get checked against this enum map to see if it should be converted
or not, and it compares the call's system to the system that the enum map
was created under. If they match, then the call is processed.
To cut down on the number of iterations needed to find the maps with a
matching system, since calls and maps are grouped by system, when a match is
made, the index into the map array is saved, so that the next call, if it
belongs to the same system as the previous call, could start right at that
array index and not have to scan all the previous arrays.
The problem was, the saved index was used as the variable to know if this is
a call in a new system or not. If the index was zero, it was assumed that
the call is in a new system and would keep incrementing the saved index
until it found a matching system. The issue arises when the first matching
system was at index zero. The next map, if it belonged to the same system,
would then think it was the first match and increment the index to one. If
the next call belongs to the same system, it would begin its search of the
maps off by one, and miss the first enum that should be converted. This left
a single enum not converted properly.
Also add a comment to describe exactly what that index was for. It took me a
bit too long to figure out what I was thinking when debugging this issue.
Link: http://lkml.kernel.org/r/717BE572-2070-4C1E-9902-9F2E0FEDA4F8@oracle.com
Cc: stable@vger.kernel.org
Fixes: 0c564a538aa93 ("tracing: Add TRACE_DEFINE_ENUM() macro to map enums to their values")
Reported-by: Chuck Lever <chuck.lever@oracle.com>
Teste-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2018-01-18 20:53:10 +00:00
|
|
|
* next call in the iteration belongs to the same system as the
|
2020-10-29 15:05:54 +00:00
|
|
|
* previous call is high. As an optimization, we skip searching
|
tracing: Fix converting enum's from the map in trace_event_eval_update()
Since enums do not get converted by the TRACE_EVENT macro into their values,
the event format displays the enum name and not the value. This breaks
tools like perf and trace-cmd that need to interpret the raw binary data. To
solve this, an enum map was created to convert these enums into their actual
numbers on boot up. This is done by TRACE_EVENTS() adding a
TRACE_DEFINE_ENUM() macro.
Some enums were not being converted. This was caused by an optimization that
had a bug in it.
All calls get checked against this enum map to see if it should be converted
or not, and it compares the call's system to the system that the enum map
was created under. If they match, then the call is processed.
To cut down on the number of iterations needed to find the maps with a
matching system, since calls and maps are grouped by system, when a match is
made, the index into the map array is saved, so that the next call, if it
belongs to the same system as the previous call, could start right at that
array index and not have to scan all the previous arrays.
The problem was, the saved index was used as the variable to know if this is
a call in a new system or not. If the index was zero, it was assumed that
the call is in a new system and would keep incrementing the saved index
until it found a matching system. The issue arises when the first matching
system was at index zero. The next map, if it belonged to the same system,
would then think it was the first match and increment the index to one. If
the next call belongs to the same system, it would begin its search of the
maps off by one, and miss the first enum that should be converted. This left
a single enum not converted properly.
Also add a comment to describe exactly what that index was for. It took me a
bit too long to figure out what I was thinking when debugging this issue.
Link: http://lkml.kernel.org/r/717BE572-2070-4C1E-9902-9F2E0FEDA4F8@oracle.com
Cc: stable@vger.kernel.org
Fixes: 0c564a538aa93 ("tracing: Add TRACE_DEFINE_ENUM() macro to map enums to their values")
Reported-by: Chuck Lever <chuck.lever@oracle.com>
Teste-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2018-01-18 20:53:10 +00:00
|
|
|
* for a map[] that matches the call's system if the last call
|
|
|
|
* was from the same system. That's what last_i is for. If the
|
|
|
|
* call has the same system as the previous call, then last_i
|
|
|
|
* will be the index of the first map[] that has a matching
|
|
|
|
* system.
|
|
|
|
*/
|
tracing: Add TRACE_DEFINE_ENUM() macro to map enums to their values
Several tracepoints use the helper functions __print_symbolic() or
__print_flags() and pass in enums that do the mapping between the
binary data stored and the value to print. This works well for reading
the ASCII trace files, but when the data is read via userspace tools
such as perf and trace-cmd, the conversion of the binary value to a
human string format is lost if an enum is used, as userspace does not
have access to what the ENUM is.
For example, the tracepoint trace_tlb_flush() has:
__print_symbolic(REC->reason,
{ TLB_FLUSH_ON_TASK_SWITCH, "flush on task switch" },
{ TLB_REMOTE_SHOOTDOWN, "remote shootdown" },
{ TLB_LOCAL_SHOOTDOWN, "local shootdown" },
{ TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" })
Which maps the enum values to the strings they represent. But perf and
trace-cmd do not know what value TLB_LOCAL_MM_SHOOTDOWN is, and would
not be able to map it.
With TRACE_DEFINE_ENUM(), developers can place these in the event header
files and ftrace will convert the enums to their values:
By adding:
TRACE_DEFINE_ENUM(TLB_FLUSH_ON_TASK_SWITCH);
TRACE_DEFINE_ENUM(TLB_REMOTE_SHOOTDOWN);
TRACE_DEFINE_ENUM(TLB_LOCAL_SHOOTDOWN);
TRACE_DEFINE_ENUM(TLB_LOCAL_MM_SHOOTDOWN);
$ cat /sys/kernel/debug/tracing/events/tlb/tlb_flush/format
[...]
__print_symbolic(REC->reason,
{ 0, "flush on task switch" },
{ 1, "remote shootdown" },
{ 2, "local shootdown" },
{ 3, "local mm shootdown" })
The above is what userspace expects to see, and tools do not need to
be modified to parse them.
Link: http://lkml.kernel.org/r/20150403013802.220157513@goodmis.org
Cc: Guilherme Cox <cox@computer.org>
Cc: Tony Luck <tony.luck@gmail.com>
Cc: Xie XiuQi <xiexiuqi@huawei.com>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2015-03-24 21:58:09 +00:00
|
|
|
for (i = last_i; i < len; i++) {
|
|
|
|
if (call->class->system == map[i]->system) {
|
|
|
|
/* Save the first system if need be */
|
tracing: Fix converting enum's from the map in trace_event_eval_update()
Since enums do not get converted by the TRACE_EVENT macro into their values,
the event format displays the enum name and not the value. This breaks
tools like perf and trace-cmd that need to interpret the raw binary data. To
solve this, an enum map was created to convert these enums into their actual
numbers on boot up. This is done by TRACE_EVENTS() adding a
TRACE_DEFINE_ENUM() macro.
Some enums were not being converted. This was caused by an optimization that
had a bug in it.
All calls get checked against this enum map to see if it should be converted
or not, and it compares the call's system to the system that the enum map
was created under. If they match, then the call is processed.
To cut down on the number of iterations needed to find the maps with a
matching system, since calls and maps are grouped by system, when a match is
made, the index into the map array is saved, so that the next call, if it
belongs to the same system as the previous call, could start right at that
array index and not have to scan all the previous arrays.
The problem was, the saved index was used as the variable to know if this is
a call in a new system or not. If the index was zero, it was assumed that
the call is in a new system and would keep incrementing the saved index
until it found a matching system. The issue arises when the first matching
system was at index zero. The next map, if it belonged to the same system,
would then think it was the first match and increment the index to one. If
the next call belongs to the same system, it would begin its search of the
maps off by one, and miss the first enum that should be converted. This left
a single enum not converted properly.
Also add a comment to describe exactly what that index was for. It took me a
bit too long to figure out what I was thinking when debugging this issue.
Link: http://lkml.kernel.org/r/717BE572-2070-4C1E-9902-9F2E0FEDA4F8@oracle.com
Cc: stable@vger.kernel.org
Fixes: 0c564a538aa93 ("tracing: Add TRACE_DEFINE_ENUM() macro to map enums to their values")
Reported-by: Chuck Lever <chuck.lever@oracle.com>
Teste-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2018-01-18 20:53:10 +00:00
|
|
|
if (first) {
|
tracing: Add TRACE_DEFINE_ENUM() macro to map enums to their values
Several tracepoints use the helper functions __print_symbolic() or
__print_flags() and pass in enums that do the mapping between the
binary data stored and the value to print. This works well for reading
the ASCII trace files, but when the data is read via userspace tools
such as perf and trace-cmd, the conversion of the binary value to a
human string format is lost if an enum is used, as userspace does not
have access to what the ENUM is.
For example, the tracepoint trace_tlb_flush() has:
__print_symbolic(REC->reason,
{ TLB_FLUSH_ON_TASK_SWITCH, "flush on task switch" },
{ TLB_REMOTE_SHOOTDOWN, "remote shootdown" },
{ TLB_LOCAL_SHOOTDOWN, "local shootdown" },
{ TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" })
Which maps the enum values to the strings they represent. But perf and
trace-cmd do not know what value TLB_LOCAL_MM_SHOOTDOWN is, and would
not be able to map it.
With TRACE_DEFINE_ENUM(), developers can place these in the event header
files and ftrace will convert the enums to their values:
By adding:
TRACE_DEFINE_ENUM(TLB_FLUSH_ON_TASK_SWITCH);
TRACE_DEFINE_ENUM(TLB_REMOTE_SHOOTDOWN);
TRACE_DEFINE_ENUM(TLB_LOCAL_SHOOTDOWN);
TRACE_DEFINE_ENUM(TLB_LOCAL_MM_SHOOTDOWN);
$ cat /sys/kernel/debug/tracing/events/tlb/tlb_flush/format
[...]
__print_symbolic(REC->reason,
{ 0, "flush on task switch" },
{ 1, "remote shootdown" },
{ 2, "local shootdown" },
{ 3, "local mm shootdown" })
The above is what userspace expects to see, and tools do not need to
be modified to parse them.
Link: http://lkml.kernel.org/r/20150403013802.220157513@goodmis.org
Cc: Guilherme Cox <cox@computer.org>
Cc: Tony Luck <tony.luck@gmail.com>
Cc: Xie XiuQi <xiexiuqi@huawei.com>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2015-03-24 21:58:09 +00:00
|
|
|
last_i = i;
|
tracing: Fix converting enum's from the map in trace_event_eval_update()
Since enums do not get converted by the TRACE_EVENT macro into their values,
the event format displays the enum name and not the value. This breaks
tools like perf and trace-cmd that need to interpret the raw binary data. To
solve this, an enum map was created to convert these enums into their actual
numbers on boot up. This is done by TRACE_EVENTS() adding a
TRACE_DEFINE_ENUM() macro.
Some enums were not being converted. This was caused by an optimization that
had a bug in it.
All calls get checked against this enum map to see if it should be converted
or not, and it compares the call's system to the system that the enum map
was created under. If they match, then the call is processed.
To cut down on the number of iterations needed to find the maps with a
matching system, since calls and maps are grouped by system, when a match is
made, the index into the map array is saved, so that the next call, if it
belongs to the same system as the previous call, could start right at that
array index and not have to scan all the previous arrays.
The problem was, the saved index was used as the variable to know if this is
a call in a new system or not. If the index was zero, it was assumed that
the call is in a new system and would keep incrementing the saved index
until it found a matching system. The issue arises when the first matching
system was at index zero. The next map, if it belonged to the same system,
would then think it was the first match and increment the index to one. If
the next call belongs to the same system, it would begin its search of the
maps off by one, and miss the first enum that should be converted. This left
a single enum not converted properly.
Also add a comment to describe exactly what that index was for. It took me a
bit too long to figure out what I was thinking when debugging this issue.
Link: http://lkml.kernel.org/r/717BE572-2070-4C1E-9902-9F2E0FEDA4F8@oracle.com
Cc: stable@vger.kernel.org
Fixes: 0c564a538aa93 ("tracing: Add TRACE_DEFINE_ENUM() macro to map enums to their values")
Reported-by: Chuck Lever <chuck.lever@oracle.com>
Teste-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2018-01-18 20:53:10 +00:00
|
|
|
first = false;
|
|
|
|
}
|
tracing: Add TRACE_DEFINE_ENUM() macro to map enums to their values
Several tracepoints use the helper functions __print_symbolic() or
__print_flags() and pass in enums that do the mapping between the
binary data stored and the value to print. This works well for reading
the ASCII trace files, but when the data is read via userspace tools
such as perf and trace-cmd, the conversion of the binary value to a
human string format is lost if an enum is used, as userspace does not
have access to what the ENUM is.
For example, the tracepoint trace_tlb_flush() has:
__print_symbolic(REC->reason,
{ TLB_FLUSH_ON_TASK_SWITCH, "flush on task switch" },
{ TLB_REMOTE_SHOOTDOWN, "remote shootdown" },
{ TLB_LOCAL_SHOOTDOWN, "local shootdown" },
{ TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" })
Which maps the enum values to the strings they represent. But perf and
trace-cmd do not know what value TLB_LOCAL_MM_SHOOTDOWN is, and would
not be able to map it.
With TRACE_DEFINE_ENUM(), developers can place these in the event header
files and ftrace will convert the enums to their values:
By adding:
TRACE_DEFINE_ENUM(TLB_FLUSH_ON_TASK_SWITCH);
TRACE_DEFINE_ENUM(TLB_REMOTE_SHOOTDOWN);
TRACE_DEFINE_ENUM(TLB_LOCAL_SHOOTDOWN);
TRACE_DEFINE_ENUM(TLB_LOCAL_MM_SHOOTDOWN);
$ cat /sys/kernel/debug/tracing/events/tlb/tlb_flush/format
[...]
__print_symbolic(REC->reason,
{ 0, "flush on task switch" },
{ 1, "remote shootdown" },
{ 2, "local shootdown" },
{ 3, "local mm shootdown" })
The above is what userspace expects to see, and tools do not need to
be modified to parse them.
Link: http://lkml.kernel.org/r/20150403013802.220157513@goodmis.org
Cc: Guilherme Cox <cox@computer.org>
Cc: Tony Luck <tony.luck@gmail.com>
Cc: Xie XiuQi <xiexiuqi@huawei.com>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2015-03-24 21:58:09 +00:00
|
|
|
update_event_printk(call, map[i]);
|
2022-03-11 04:27:38 +00:00
|
|
|
update_event_fields(call, map[i]);
|
tracing: Add TRACE_DEFINE_ENUM() macro to map enums to their values
Several tracepoints use the helper functions __print_symbolic() or
__print_flags() and pass in enums that do the mapping between the
binary data stored and the value to print. This works well for reading
the ASCII trace files, but when the data is read via userspace tools
such as perf and trace-cmd, the conversion of the binary value to a
human string format is lost if an enum is used, as userspace does not
have access to what the ENUM is.
For example, the tracepoint trace_tlb_flush() has:
__print_symbolic(REC->reason,
{ TLB_FLUSH_ON_TASK_SWITCH, "flush on task switch" },
{ TLB_REMOTE_SHOOTDOWN, "remote shootdown" },
{ TLB_LOCAL_SHOOTDOWN, "local shootdown" },
{ TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" })
Which maps the enum values to the strings they represent. But perf and
trace-cmd do not know what value TLB_LOCAL_MM_SHOOTDOWN is, and would
not be able to map it.
With TRACE_DEFINE_ENUM(), developers can place these in the event header
files and ftrace will convert the enums to their values:
By adding:
TRACE_DEFINE_ENUM(TLB_FLUSH_ON_TASK_SWITCH);
TRACE_DEFINE_ENUM(TLB_REMOTE_SHOOTDOWN);
TRACE_DEFINE_ENUM(TLB_LOCAL_SHOOTDOWN);
TRACE_DEFINE_ENUM(TLB_LOCAL_MM_SHOOTDOWN);
$ cat /sys/kernel/debug/tracing/events/tlb/tlb_flush/format
[...]
__print_symbolic(REC->reason,
{ 0, "flush on task switch" },
{ 1, "remote shootdown" },
{ 2, "local shootdown" },
{ 3, "local mm shootdown" })
The above is what userspace expects to see, and tools do not need to
be modified to parse them.
Link: http://lkml.kernel.org/r/20150403013802.220157513@goodmis.org
Cc: Guilherme Cox <cox@computer.org>
Cc: Tony Luck <tony.luck@gmail.com>
Cc: Xie XiuQi <xiexiuqi@huawei.com>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2015-03-24 21:58:09 +00:00
|
|
|
}
|
|
|
|
}
|
2023-09-29 19:16:37 +00:00
|
|
|
cond_resched();
|
tracing: Add TRACE_DEFINE_ENUM() macro to map enums to their values
Several tracepoints use the helper functions __print_symbolic() or
__print_flags() and pass in enums that do the mapping between the
binary data stored and the value to print. This works well for reading
the ASCII trace files, but when the data is read via userspace tools
such as perf and trace-cmd, the conversion of the binary value to a
human string format is lost if an enum is used, as userspace does not
have access to what the ENUM is.
For example, the tracepoint trace_tlb_flush() has:
__print_symbolic(REC->reason,
{ TLB_FLUSH_ON_TASK_SWITCH, "flush on task switch" },
{ TLB_REMOTE_SHOOTDOWN, "remote shootdown" },
{ TLB_LOCAL_SHOOTDOWN, "local shootdown" },
{ TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" })
Which maps the enum values to the strings they represent. But perf and
trace-cmd do not know what value TLB_LOCAL_MM_SHOOTDOWN is, and would
not be able to map it.
With TRACE_DEFINE_ENUM(), developers can place these in the event header
files and ftrace will convert the enums to their values:
By adding:
TRACE_DEFINE_ENUM(TLB_FLUSH_ON_TASK_SWITCH);
TRACE_DEFINE_ENUM(TLB_REMOTE_SHOOTDOWN);
TRACE_DEFINE_ENUM(TLB_LOCAL_SHOOTDOWN);
TRACE_DEFINE_ENUM(TLB_LOCAL_MM_SHOOTDOWN);
$ cat /sys/kernel/debug/tracing/events/tlb/tlb_flush/format
[...]
__print_symbolic(REC->reason,
{ 0, "flush on task switch" },
{ 1, "remote shootdown" },
{ 2, "local shootdown" },
{ 3, "local mm shootdown" })
The above is what userspace expects to see, and tools do not need to
be modified to parse them.
Link: http://lkml.kernel.org/r/20150403013802.220157513@goodmis.org
Cc: Guilherme Cox <cox@computer.org>
Cc: Tony Luck <tony.luck@gmail.com>
Cc: Xie XiuQi <xiexiuqi@huawei.com>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2015-03-24 21:58:09 +00:00
|
|
|
}
|
|
|
|
up_write(&trace_event_sem);
|
|
|
|
}
|
|
|
|
|
2023-12-13 14:37:01 +00:00
|
|
|
static bool event_in_systems(struct trace_event_call *call,
|
|
|
|
const char *systems)
|
|
|
|
{
|
|
|
|
const char *system;
|
|
|
|
const char *p;
|
|
|
|
|
|
|
|
if (!systems)
|
|
|
|
return true;
|
|
|
|
|
|
|
|
system = call->class->system;
|
|
|
|
p = strstr(systems, system);
|
|
|
|
if (!p)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if (p != systems && !isspace(*(p - 1)) && *(p - 1) != ',')
|
|
|
|
return false;
|
|
|
|
|
|
|
|
p += strlen(system);
|
|
|
|
return !*p || isspace(*p) || *p == ',';
|
|
|
|
}
|
|
|
|
|
2015-05-05 14:09:53 +00:00
|
|
|
static struct trace_event_file *
|
2015-05-05 15:45:27 +00:00
|
|
|
trace_create_new_event(struct trace_event_call *call,
|
2013-05-09 19:00:07 +00:00
|
|
|
struct trace_array *tr)
|
|
|
|
{
|
2021-11-26 18:35:26 +00:00
|
|
|
struct trace_pid_list *no_pid_list;
|
|
|
|
struct trace_pid_list *pid_list;
|
2015-05-05 14:09:53 +00:00
|
|
|
struct trace_event_file *file;
|
2021-11-26 18:35:26 +00:00
|
|
|
unsigned int first;
|
2013-05-09 19:00:07 +00:00
|
|
|
|
2023-12-13 14:37:01 +00:00
|
|
|
if (!event_in_systems(call, tr->system_names))
|
|
|
|
return NULL;
|
|
|
|
|
2013-05-09 19:00:07 +00:00
|
|
|
file = kmem_cache_alloc(file_cachep, GFP_TRACE);
|
|
|
|
if (!file)
|
2023-12-13 14:37:01 +00:00
|
|
|
return ERR_PTR(-ENOMEM);
|
2013-05-09 19:00:07 +00:00
|
|
|
|
2021-11-26 18:35:26 +00:00
|
|
|
pid_list = rcu_dereference_protected(tr->filtered_pids,
|
|
|
|
lockdep_is_held(&event_mutex));
|
|
|
|
no_pid_list = rcu_dereference_protected(tr->filtered_no_pids,
|
|
|
|
lockdep_is_held(&event_mutex));
|
|
|
|
|
|
|
|
if (!trace_pid_list_first(pid_list, &first) ||
|
2021-11-27 21:45:26 +00:00
|
|
|
!trace_pid_list_first(no_pid_list, &first))
|
2021-11-26 18:35:26 +00:00
|
|
|
file->flags |= EVENT_FILE_FL_PID_FILTER;
|
|
|
|
|
2013-05-09 19:00:07 +00:00
|
|
|
file->event_call = call;
|
|
|
|
file->tr = tr;
|
|
|
|
atomic_set(&file->sm_ref, 0);
|
tracing: Add basic event trigger framework
Add a 'trigger' file for each trace event, enabling 'trace event
triggers' to be set for trace events.
'trace event triggers' are patterned after the existing 'ftrace
function triggers' implementation except that triggers are written to
per-event 'trigger' files instead of to a single file such as the
'set_ftrace_filter' used for ftrace function triggers.
The implementation is meant to be entirely separate from ftrace
function triggers, in order to keep the respective implementations
relatively simple and to allow them to diverge.
The event trigger functionality is built on top of SOFT_DISABLE
functionality. It adds a TRIGGER_MODE bit to the ftrace_event_file
flags which is checked when any trace event fires. Triggers set for a
particular event need to be checked regardless of whether that event
is actually enabled or not - getting an event to fire even if it's not
enabled is what's already implemented by SOFT_DISABLE mode, so trigger
mode directly reuses that. Event trigger essentially inherit the soft
disable logic in __ftrace_event_enable_disable() while adding a bit of
logic and trigger reference counting via tm_ref on top of that in a
new trace_event_trigger_enable_disable() function. Because the base
__ftrace_event_enable_disable() code now needs to be invoked from
outside trace_events.c, a wrapper is also added for those usages.
The triggers for an event are actually invoked via a new function,
event_triggers_call(), and code is also added to invoke them for
ftrace_raw_event calls as well as syscall events.
The main part of the patch creates a new trace_events_trigger.c file
to contain the trace event triggers implementation.
The standard open, read, and release file operations are implemented
here.
The open() implementation sets up for the various open modes of the
'trigger' file. It creates and attaches the trigger iterator and sets
up the command parser. If opened for reading set up the trigger
seq_ops.
The read() implementation parses the event trigger written to the
'trigger' file, looks up the trigger command, and passes it along to
that event_command's func() implementation for command-specific
processing.
The release() implementation does whatever cleanup is needed to
release the 'trigger' file, like releasing the parser and trigger
iterator, etc.
A couple of functions for event command registration and
unregistration are added, along with a list to add them to and a mutex
to protect them, as well as an (initially empty) registration function
to add the set of commands that will be added by future commits, and
call to it from the trace event initialization code.
Also added are a couple trigger-specific data structures needed for
these implementations such as a trigger iterator and a struct for
trigger-specific data.
A couple structs consisting mostly of functions meant to be implemented
in command-specific ways, event_command and event_trigger_ops, are
used by the generic event trigger command implementations. They're
being put into trace.h alongside the other trace_event data structures
and functions, in the expectation that they'll be needed in several
trace_event-related files such as trace_events_trigger.c and
trace_events.c.
The event_command.func() function is meant to be called by the trigger
parsing code in order to add a trigger instance to the corresponding
event. It essentially coordinates adding a live trigger instance to
the event, and arming the triggering event.
Every event_command func() implementation essentially does the
same thing for any command:
- choose ops - use the value of param to choose either a number or
count version of event_trigger_ops specific to the command
- do the register or unregister of those ops
- associate a filter, if specified, with the triggering event
The reg() and unreg() ops allow command-specific implementations for
event_trigger_op registration and unregistration, and the
get_trigger_ops() op allows command-specific event_trigger_ops
selection to be parameterized. When a trigger instance is added, the
reg() op essentially adds that trigger to the triggering event and
arms it, while unreg() does the opposite. The set_filter() function
is used to associate a filter with the trigger - if the command
doesn't specify a set_filter() implementation, the command will ignore
filters.
Each command has an associated trigger_type, which serves double duty,
both as a unique identifier for the command as well as a value that
can be used for setting a trigger mode bit during trigger invocation.
The signature of func() adds a pointer to the event_command struct,
used to invoke those functions, along with a command_data param that
can be passed to the reg/unreg functions. This allows func()
implementations to use command-specific blobs and supports code
re-use.
The event_trigger_ops.func() command corresponds to the trigger 'probe'
function that gets called when the triggering event is actually
invoked. The other functions are used to list the trigger when
needed, along with a couple mundane book-keeping functions.
This also moves event_file_data() into trace.h so it can be used
outside of trace_events.c.
Link: http://lkml.kernel.org/r/316d95061accdee070aac8e5750afba0192fa5b9.1382622043.git.tom.zanussi@linux.intel.com
Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Idea-by: Steve Rostedt <rostedt@goodmis.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2013-10-24 13:59:24 +00:00
|
|
|
atomic_set(&file->tm_ref, 0);
|
|
|
|
INIT_LIST_HEAD(&file->triggers);
|
2013-05-09 19:00:07 +00:00
|
|
|
list_add(&file->list, &tr->events);
|
2024-07-26 18:42:08 +00:00
|
|
|
refcount_set(&file->ref, 1);
|
2013-05-09 19:00:07 +00:00
|
|
|
|
|
|
|
return file;
|
|
|
|
}
|
|
|
|
|
2022-10-21 01:00:56 +00:00
|
|
|
#define MAX_BOOT_TRIGGERS 32
|
|
|
|
|
|
|
|
static struct boot_triggers {
|
|
|
|
const char *event;
|
|
|
|
char *trigger;
|
|
|
|
} bootup_triggers[MAX_BOOT_TRIGGERS];
|
|
|
|
|
|
|
|
static char bootup_trigger_buf[COMMAND_LINE_SIZE];
|
|
|
|
static int nr_boot_triggers;
|
|
|
|
|
|
|
|
static __init int setup_trace_triggers(char *str)
|
|
|
|
{
|
|
|
|
char *trigger;
|
|
|
|
char *buf;
|
|
|
|
int i;
|
|
|
|
|
2023-05-16 14:39:56 +00:00
|
|
|
strscpy(bootup_trigger_buf, str, COMMAND_LINE_SIZE);
|
2023-09-06 09:18:37 +00:00
|
|
|
trace_set_ring_buffer_expanded(NULL);
|
2022-10-21 01:00:56 +00:00
|
|
|
disable_tracing_selftest("running event triggers");
|
|
|
|
|
|
|
|
buf = bootup_trigger_buf;
|
|
|
|
for (i = 0; i < MAX_BOOT_TRIGGERS; i++) {
|
|
|
|
trigger = strsep(&buf, ",");
|
|
|
|
if (!trigger)
|
|
|
|
break;
|
|
|
|
bootup_triggers[i].event = strsep(&trigger, ".");
|
2022-12-19 18:31:07 +00:00
|
|
|
bootup_triggers[i].trigger = trigger;
|
2022-10-21 01:00:56 +00:00
|
|
|
if (!bootup_triggers[i].trigger)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
nr_boot_triggers = i;
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
__setup("trace_trigger=", setup_trace_triggers);
|
|
|
|
|
2012-05-04 03:09:03 +00:00
|
|
|
/* Add an event to a trace directory */
|
|
|
|
static int
|
2015-05-05 15:45:27 +00:00
|
|
|
__trace_add_new_event(struct trace_event_call *call, struct trace_array *tr)
|
2012-05-04 03:09:03 +00:00
|
|
|
{
|
2015-05-05 14:09:53 +00:00
|
|
|
struct trace_event_file *file;
|
2012-05-04 03:09:03 +00:00
|
|
|
|
2013-05-09 19:00:07 +00:00
|
|
|
file = trace_create_new_event(call, tr);
|
2023-12-13 14:37:01 +00:00
|
|
|
/*
|
|
|
|
* trace_create_new_event() returns ERR_PTR(-ENOMEM) if failed
|
|
|
|
* allocation, or NULL if the event is not part of the tr->system_names.
|
|
|
|
* When the event is not part of the tr->system_names, return zero, not
|
|
|
|
* an error.
|
|
|
|
*/
|
2012-05-04 03:09:03 +00:00
|
|
|
if (!file)
|
2023-12-13 14:37:01 +00:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (IS_ERR(file))
|
|
|
|
return PTR_ERR(file);
|
2012-05-04 03:09:03 +00:00
|
|
|
|
2020-09-10 12:38:58 +00:00
|
|
|
if (eventdir_initialized)
|
|
|
|
return event_create_dir(tr->event_dir, file);
|
|
|
|
else
|
|
|
|
return event_define_fields(call);
|
2012-05-04 03:09:03 +00:00
|
|
|
}
|
|
|
|
|
2022-10-21 01:00:56 +00:00
|
|
|
static void trace_early_triggers(struct trace_event_file *file, const char *name)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < nr_boot_triggers; i++) {
|
|
|
|
if (strcmp(name, bootup_triggers[i].event))
|
|
|
|
continue;
|
|
|
|
mutex_lock(&event_mutex);
|
|
|
|
ret = trigger_process_regex(file, bootup_triggers[i].trigger);
|
|
|
|
mutex_unlock(&event_mutex);
|
|
|
|
if (ret)
|
|
|
|
pr_err("Failed to register trigger '%s' on event %s\n",
|
|
|
|
bootup_triggers[i].trigger,
|
|
|
|
bootup_triggers[i].event);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-02-27 21:28:06 +00:00
|
|
|
/*
 * Early-init variant: only a descriptor is created, because events must
 * be enabled at boot before the filesystem is initialized and a
 * descriptor is what enabling requires.
 *
 * After the event's fields are defined, any matching boot-time triggers
 * are applied to the new file.
 *
 * Returns 0 when the event is not part of tr->system_names or on
 * success, otherwise the error from trace_create_new_event() or
 * event_define_fields().
 */
static int
__trace_early_add_new_event(struct trace_event_call *call,
			    struct trace_array *tr)
{
	struct trace_event_file *file = trace_create_new_event(call, tr);
	int err;

	/*
	 * NULL means the event is not listed in tr->system_names, which is
	 * not an error; an ERR_PTR (-ENOMEM) means the allocation failed.
	 */
	if (!file)
		return 0;
	if (IS_ERR(file))
		return PTR_ERR(file);

	err = event_define_fields(call);
	if (!err)
		trace_early_triggers(file, trace_event_name(call));

	return err;
}
|
|
|
|
|
2012-05-04 03:09:03 +00:00
|
|
|
struct ftrace_module_file_ops;
|
2015-05-05 15:45:27 +00:00
|
|
|
static void __add_event_to_tracers(struct trace_event_call *call);
|
2012-05-04 03:09:03 +00:00
|
|
|
|
2018-12-04 18:35:45 +00:00
|
|
|
/* Add an additional event_call dynamically */
|
|
|
|
int trace_add_event_call(struct trace_event_call *call)
|
2009-08-13 20:34:53 +00:00
|
|
|
{
|
|
|
|
int ret;
|
tracing: Lock event_mutex before synth_event_mutex
synthetic event is using synth_event_mutex for protecting
synth_event_list, and event_trigger_write() path acquires
locks as below order.
event_trigger_write(event_mutex)
->trigger_process_regex(trigger_cmd_mutex)
->event_hist_trigger_func(synth_event_mutex)
On the other hand, synthetic event creation and deletion paths
call trace_add_event_call() and trace_remove_event_call()
which acquires event_mutex. In that case, if we keep the
synth_event_mutex locked while registering/unregistering synthetic
events, its dependency will be inversed.
To avoid this issue, current synthetic event is using a 2 phase
process to create/delete events. For example, it searches existing
events under synth_event_mutex to check for event-name conflicts, and
unlocks synth_event_mutex, then registers a new event under event_mutex
locked. Finally, it locks synth_event_mutex and tries to add the
new event to the list. But it can introduce complexity and a chance
for name conflicts.
To solve this simpler, this introduces trace_add_event_call_nolock()
and trace_remove_event_call_nolock() which don't acquire
event_mutex inside. synthetic event can lock event_mutex before
synth_event_mutex to solve the lock dependency issue simpler.
Link: http://lkml.kernel.org/r/154140844377.17322.13781091165954002713.stgit@devbox
Reviewed-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2018-11-05 09:00:43 +00:00
|
|
|
lockdep_assert_held(&event_mutex);
|
|
|
|
|
2017-09-21 20:22:49 +00:00
|
|
|
mutex_lock(&trace_types_lock);
|
2009-04-25 03:11:22 +00:00
|
|
|
|
2012-05-04 03:09:03 +00:00
|
|
|
ret = __register_event(call, NULL);
|
|
|
|
if (ret >= 0)
|
2013-07-31 17:31:32 +00:00
|
|
|
__add_event_to_tracers(call);
|
2009-08-06 05:32:21 +00:00
|
|
|
|
2013-07-02 02:37:54 +00:00
|
|
|
mutex_unlock(&trace_types_lock);
|
tracing: Lock event_mutex before synth_event_mutex
synthetic event is using synth_event_mutex for protecting
synth_event_list, and event_trigger_write() path acquires
locks as below order.
event_trigger_write(event_mutex)
->trigger_process_regex(trigger_cmd_mutex)
->event_hist_trigger_func(synth_event_mutex)
On the other hand, synthetic event creation and deletion paths
call trace_add_event_call() and trace_remove_event_call()
which acquires event_mutex. In that case, if we keep the
synth_event_mutex locked while registering/unregistering synthetic
events, its dependency will be inversed.
To avoid this issue, current synthetic event is using a 2 phase
process to create/delete events. For example, it searches existing
events under synth_event_mutex to check for event-name conflicts, and
unlocks synth_event_mutex, then registers a new event under event_mutex
locked. Finally, it locks synth_event_mutex and tries to add the
new event to the list. But it can introduce complexity and a chance
for name conflicts.
To solve this simpler, this introduces trace_add_event_call_nolock()
and trace_remove_event_call_nolock() which don't acquire
event_mutex inside. synthetic event can lock event_mutex before
synth_event_mutex to solve the lock dependency issue simpler.
Link: http://lkml.kernel.org/r/154140844377.17322.13781091165954002713.stgit@devbox
Reviewed-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2018-11-05 09:00:43 +00:00
|
|
|
return ret;
|
|
|
|
}
|
2022-03-03 22:05:31 +00:00
|
|
|
EXPORT_SYMBOL_GPL(trace_add_event_call);
|
tracing: Lock event_mutex before synth_event_mutex
synthetic event is using synth_event_mutex for protecting
synth_event_list, and event_trigger_write() path acquires
locks as below order.
event_trigger_write(event_mutex)
->trigger_process_regex(trigger_cmd_mutex)
->event_hist_trigger_func(synth_event_mutex)
On the other hand, synthetic event creation and deletion paths
call trace_add_event_call() and trace_remove_event_call()
which acquires event_mutex. In that case, if we keep the
synth_event_mutex locked while registering/unregistering synthetic
events, its dependency will be inversed.
To avoid this issue, current synthetic event is using a 2 phase
process to create/delete events. For example, it searches existing
events under synth_event_mutex to check for event-name conflicts, and
unlocks synth_event_mutex, then registers a new event under event_mutex
locked. Finally, it locks synth_event_mutex and tries to add the
new event to the list. But it can introduce complexity and a chance
for name conflicts.
To solve this simpler, this introduces trace_add_event_call_nolock()
and trace_remove_event_call_nolock() which don't acquire
event_mutex inside. synthetic event can lock event_mutex before
synth_event_mutex to solve the lock dependency issue simpler.
Link: http://lkml.kernel.org/r/154140844377.17322.13781091165954002713.stgit@devbox
Reviewed-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2018-11-05 09:00:43 +00:00
|
|
|
|
2009-09-14 20:49:12 +00:00
|
|
|
/*
 * Tear down @call by calling event_remove() and then
 * trace_destroy_fields() on it.
 *
 * The caller must hold trace_types_lock, event_mutex and
 * trace_event_sem.
 */
static void __trace_remove_event_call(struct trace_event_call *call)
{
	event_remove(call);
	trace_destroy_fields(call);
}
|
|
|
|
|
2015-05-05 15:45:27 +00:00
|
|
|
static int probe_remove_event_call(struct trace_event_call *call)
|
2013-07-29 17:50:33 +00:00
|
|
|
{
|
|
|
|
struct trace_array *tr;
|
2015-05-05 14:09:53 +00:00
|
|
|
struct trace_event_file *file;
|
2013-07-29 17:50:33 +00:00
|
|
|
|
|
|
|
#ifdef CONFIG_PERF_EVENTS
|
|
|
|
if (call->perf_refcount)
|
|
|
|
return -EBUSY;
|
|
|
|
#endif
|
|
|
|
do_for_each_event_file(tr, file) {
|
|
|
|
if (file->event_call != call)
|
|
|
|
continue;
|
|
|
|
/*
|
|
|
|
* We can't rely on ftrace_event_enable_disable(enable => 0)
|
2015-05-13 19:12:33 +00:00
|
|
|
* we are going to do, EVENT_FILE_FL_SOFT_MODE can suppress
|
2013-07-29 17:50:33 +00:00
|
|
|
* TRACE_REG_UNREGISTER.
|
|
|
|
*/
|
2015-05-13 19:12:33 +00:00
|
|
|
if (file->flags & EVENT_FILE_FL_ENABLED)
|
tracing: Free buffers when a used dynamic event is removed
After 65536 dynamic events have been added and removed, the "type" field
of the event then uses the first type number that is available (not
currently used by other events). A type number is the identifier of the
binary blobs in the tracing ring buffer (known as events) to map them to
logic that can parse the binary blob.
The issue is that if a dynamic event (like a kprobe event) is traced and
is in the ring buffer, and then that event is removed (because it is
dynamic, which means it can be created and destroyed), if another dynamic
event is created that has the same number that new event's logic on
parsing the binary blob will be used.
To show how this can be an issue, the following can crash the kernel:
# cd /sys/kernel/tracing
# for i in `seq 65536`; do
echo 'p:kprobes/foo do_sys_openat2 $arg1:u32' > kprobe_events
# done
For every iteration of the above, the writing to the kprobe_events will
remove the old event and create a new one (with the same format) and
increase the type number to the next available one until the type number
reaches over 65535, which is the max number for the 16 bit type. After it
reaches that number, the logic to allocate a new number simply looks for
the next available number. When a dynamic event is removed, that number
is then available to be reused by the next dynamic event created. That is,
once the above reaches the max number, the number assigned to the event in
that loop will remain the same.
Now that means deleting one dynamic event and creating another will reuse
the previous events type number. This is where bad things can happen.
After the above loop finishes, the kprobes/foo event which reads the
do_sys_openat2 function call's first parameter as an integer.
# echo 1 > kprobes/foo/enable
# cat /etc/passwd > /dev/null
# cat trace
cat-2211 [005] .... 2007.849603: foo: (do_sys_openat2+0x0/0x130) arg1=4294967196
cat-2211 [005] .... 2007.849620: foo: (do_sys_openat2+0x0/0x130) arg1=4294967196
cat-2211 [005] .... 2007.849838: foo: (do_sys_openat2+0x0/0x130) arg1=4294967196
cat-2211 [005] .... 2007.849880: foo: (do_sys_openat2+0x0/0x130) arg1=4294967196
# echo 0 > kprobes/foo/enable
Now if we delete the kprobe and create a new one that reads a string:
# echo 'p:kprobes/foo do_sys_openat2 +0($arg2):string' > kprobe_events
And now we can read the trace:
# cat trace
sendmail-1942 [002] ..... 530.136320: foo: (do_sys_openat2+0x0/0x240) arg1= cat-2046 [004] ..... 530.930817: foo: (do_sys_openat2+0x0/0x240) arg1="������������������������������������������������������������������������������������������������"
cat-2046 [004] ..... 530.930961: foo: (do_sys_openat2+0x0/0x240) arg1="������������������������������������������������������������������������������������������������"
cat-2046 [004] ..... 530.934278: foo: (do_sys_openat2+0x0/0x240) arg1="������������������������������������������������������������������������������������������������"
cat-2046 [004] ..... 530.934563: foo: (do_sys_openat2+0x0/0x240) arg1="������������������������������������������������������������������������������������������������"
bash-1515 [007] ..... 534.299093: foo: (do_sys_openat2+0x0/0x240) arg1="kkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkk���������@��4Z����;Y�����U
And dmesg has:
==================================================================
BUG: KASAN: use-after-free in string+0xd4/0x1c0
Read of size 1 at addr ffff88805fdbbfa0 by task cat/2049
CPU: 0 PID: 2049 Comm: cat Not tainted 6.1.0-rc6-test+ #641
Hardware name: Hewlett-Packard HP Compaq Pro 6300 SFF/339A, BIOS K01 v03.03 07/14/2016
Call Trace:
<TASK>
dump_stack_lvl+0x5b/0x77
print_report+0x17f/0x47b
kasan_report+0xad/0x130
string+0xd4/0x1c0
vsnprintf+0x500/0x840
seq_buf_vprintf+0x62/0xc0
trace_seq_printf+0x10e/0x1e0
print_type_string+0x90/0xa0
print_kprobe_event+0x16b/0x290
print_trace_line+0x451/0x8e0
s_show+0x72/0x1f0
seq_read_iter+0x58e/0x750
seq_read+0x115/0x160
vfs_read+0x11d/0x460
ksys_read+0xa9/0x130
do_syscall_64+0x3a/0x90
entry_SYSCALL_64_after_hwframe+0x63/0xcd
RIP: 0033:0x7fc2e972ade2
Code: c0 e9 b2 fe ff ff 50 48 8d 3d b2 3f 0a 00 e8 05 f0 01 00 0f 1f 44 00 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 0f 05 <48> 3d 00 f0 ff ff 77 56 c3 0f 1f 44 00 00 48 83 ec 28 48 89 54 24
RSP: 002b:00007ffc64e687c8 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
RAX: ffffffffffffffda RBX: 0000000000020000 RCX: 00007fc2e972ade2
RDX: 0000000000020000 RSI: 00007fc2e980d000 RDI: 0000000000000003
RBP: 00007fc2e980d000 R08: 00007fc2e980c010 R09: 0000000000000000
R10: 0000000000000022 R11: 0000000000000246 R12: 0000000000020f00
R13: 0000000000000003 R14: 0000000000020000 R15: 0000000000020000
</TASK>
The buggy address belongs to the physical page:
page:ffffea00017f6ec0 refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x5fdbb
flags: 0xfffffc0000000(node=0|zone=1|lastcpupid=0x1fffff)
raw: 000fffffc0000000 0000000000000000 ffffea00017f6ec8 0000000000000000
raw: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000
page dumped because: kasan: bad access detected
Memory state around the buggy address:
ffff88805fdbbe80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
ffff88805fdbbf00: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
>ffff88805fdbbf80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
^
ffff88805fdbc000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
ffff88805fdbc080: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
==================================================================
This was found when Zheng Yejian sent a patch to convert the event type
number assignment to use IDA, which gives the next available number, and
this bug showed up in the fuzz testing by Yujie Liu and the kernel test
robot. But after further analysis, I found that this behavior is the same
as when the event type numbers go past the 16bit max (and the above shows
that).
As modules have a similar issue, but is dealt with by setting a
"WAS_ENABLED" flag when a module event is enabled, and when the module is
freed, if any of its events were enabled, the ring buffer that holds that
event is also cleared, to prevent reading stale events. The same can be
done for dynamic events.
If any dynamic event that is being removed was enabled, then make sure the
buffers they were enabled in are now cleared.
Link: https://lkml.kernel.org/r/20221123171434.545706e3@gandalf.local.home
Link: https://lore.kernel.org/all/20221110020319.1259291-1-zhengyejian1@huawei.com/
Cc: stable@vger.kernel.org
Cc: Andrew Morton <akpm@linux-foundation.org>
Depends-on: e18eb8783ec49 ("tracing: Add tracing_reset_all_online_cpus_unlocked() function")
Depends-on: 5448d44c38557 ("tracing: Add unified dynamic event framework")
Depends-on: 6212dd29683ee ("tracing/kprobes: Use dyn_event framework for kprobe events")
Depends-on: 065e63f951432 ("tracing: Only have rmmod clear buffers that its events were active in")
Depends-on: 575380da8b469 ("tracing: Only clear trace buffer on module unload if event was traced")
Fixes: 77b44d1b7c283 ("tracing/kprobes: Rename Kprobe-tracer to kprobe-event")
Reported-by: Zheng Yejian <zhengyejian1@huawei.com>
Reported-by: Yujie Liu <yujie.liu@intel.com>
Reported-by: kernel test robot <yujie.liu@intel.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2022-11-23 22:14:34 +00:00
|
|
|
goto busy;
|
|
|
|
|
|
|
|
if (file->flags & EVENT_FILE_FL_WAS_ENABLED)
|
|
|
|
tr->clear_trace = true;
|
2013-07-31 17:16:22 +00:00
|
|
|
/*
|
|
|
|
* The do_for_each_event_file_safe() is
|
|
|
|
* a double loop. After finding the call for this
|
|
|
|
* trace_array, we use break to jump to the next
|
|
|
|
* trace_array.
|
|
|
|
*/
|
2013-07-29 17:50:33 +00:00
|
|
|
break;
|
|
|
|
} while_for_each_event_file();
|
|
|
|
|
|
|
|
__trace_remove_event_call(call);
|
|
|
|
|
|
|
|
return 0;
|
tracing: Free buffers when a used dynamic event is removed
After 65536 dynamic events have been added and removed, the "type" field
of the event then uses the first type number that is available (not
currently used by other events). A type number is the identifier of the
binary blobs in the tracing ring buffer (known as events) to map them to
logic that can parse the binary blob.
The issue is that if a dynamic event (like a kprobe event) is traced and
is in the ring buffer, and then that event is removed (because it is
dynamic, which means it can be created and destroyed), if another dynamic
event is created that has the same number that new event's logic on
parsing the binary blob will be used.
To show how this can be an issue, the following can crash the kernel:
# cd /sys/kernel/tracing
# for i in `seq 65536`; do
echo 'p:kprobes/foo do_sys_openat2 $arg1:u32' > kprobe_events
# done
For every iteration of the above, the writing to the kprobe_events will
remove the old event and create a new one (with the same format) and
increase the type number to the next available one until the type number
reaches over 65535, which is the max number for the 16 bit type. After it
reaches that number, the logic to allocate a new number simply looks for
the next available number. When a dynamic event is removed, that number
is then available to be reused by the next dynamic event created. That is,
once the above reaches the max number, the number assigned to the event in
that loop will remain the same.
Now that means deleting one dynamic event and creating another will reuse
the previous events type number. This is where bad things can happen.
After the above loop finishes, the kprobes/foo event which reads the
do_sys_openat2 function call's first parameter as an integer.
# echo 1 > kprobes/foo/enable
# cat /etc/passwd > /dev/null
# cat trace
cat-2211 [005] .... 2007.849603: foo: (do_sys_openat2+0x0/0x130) arg1=4294967196
cat-2211 [005] .... 2007.849620: foo: (do_sys_openat2+0x0/0x130) arg1=4294967196
cat-2211 [005] .... 2007.849838: foo: (do_sys_openat2+0x0/0x130) arg1=4294967196
cat-2211 [005] .... 2007.849880: foo: (do_sys_openat2+0x0/0x130) arg1=4294967196
# echo 0 > kprobes/foo/enable
Now if we delete the kprobe and create a new one that reads a string:
# echo 'p:kprobes/foo do_sys_openat2 +0($arg2):string' > kprobe_events
And now we can read the trace:
# cat trace
sendmail-1942 [002] ..... 530.136320: foo: (do_sys_openat2+0x0/0x240) arg1= cat-2046 [004] ..... 530.930817: foo: (do_sys_openat2+0x0/0x240) arg1="������������������������������������������������������������������������������������������������"
cat-2046 [004] ..... 530.930961: foo: (do_sys_openat2+0x0/0x240) arg1="������������������������������������������������������������������������������������������������"
cat-2046 [004] ..... 530.934278: foo: (do_sys_openat2+0x0/0x240) arg1="������������������������������������������������������������������������������������������������"
cat-2046 [004] ..... 530.934563: foo: (do_sys_openat2+0x0/0x240) arg1="������������������������������������������������������������������������������������������������"
bash-1515 [007] ..... 534.299093: foo: (do_sys_openat2+0x0/0x240) arg1="kkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkk���������@��4Z����;Y�����U
And dmesg has:
==================================================================
BUG: KASAN: use-after-free in string+0xd4/0x1c0
Read of size 1 at addr ffff88805fdbbfa0 by task cat/2049
CPU: 0 PID: 2049 Comm: cat Not tainted 6.1.0-rc6-test+ #641
Hardware name: Hewlett-Packard HP Compaq Pro 6300 SFF/339A, BIOS K01 v03.03 07/14/2016
Call Trace:
<TASK>
dump_stack_lvl+0x5b/0x77
print_report+0x17f/0x47b
kasan_report+0xad/0x130
string+0xd4/0x1c0
vsnprintf+0x500/0x840
seq_buf_vprintf+0x62/0xc0
trace_seq_printf+0x10e/0x1e0
print_type_string+0x90/0xa0
print_kprobe_event+0x16b/0x290
print_trace_line+0x451/0x8e0
s_show+0x72/0x1f0
seq_read_iter+0x58e/0x750
seq_read+0x115/0x160
vfs_read+0x11d/0x460
ksys_read+0xa9/0x130
do_syscall_64+0x3a/0x90
entry_SYSCALL_64_after_hwframe+0x63/0xcd
RIP: 0033:0x7fc2e972ade2
Code: c0 e9 b2 fe ff ff 50 48 8d 3d b2 3f 0a 00 e8 05 f0 01 00 0f 1f 44 00 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 0f 05 <48> 3d 00 f0 ff ff 77 56 c3 0f 1f 44 00 00 48 83 ec 28 48 89 54 24
RSP: 002b:00007ffc64e687c8 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
RAX: ffffffffffffffda RBX: 0000000000020000 RCX: 00007fc2e972ade2
RDX: 0000000000020000 RSI: 00007fc2e980d000 RDI: 0000000000000003
RBP: 00007fc2e980d000 R08: 00007fc2e980c010 R09: 0000000000000000
R10: 0000000000000022 R11: 0000000000000246 R12: 0000000000020f00
R13: 0000000000000003 R14: 0000000000020000 R15: 0000000000020000
</TASK>
The buggy address belongs to the physical page:
page:ffffea00017f6ec0 refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x5fdbb
flags: 0xfffffc0000000(node=0|zone=1|lastcpupid=0x1fffff)
raw: 000fffffc0000000 0000000000000000 ffffea00017f6ec8 0000000000000000
raw: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000
page dumped because: kasan: bad access detected
Memory state around the buggy address:
ffff88805fdbbe80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
ffff88805fdbbf00: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
>ffff88805fdbbf80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
^
ffff88805fdbc000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
ffff88805fdbc080: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
==================================================================
This was found when Zheng Yejian sent a patch to convert the event type
number assignment to use IDA, which gives the next available number, and
this bug showed up in the fuzz testing by Yujie Liu and the kernel test
robot. But after further analysis, I found that this behavior is the same
as when the event type numbers go past the 16bit max (and the above shows
that).
As modules have a similar issue, but is dealt with by setting a
"WAS_ENABLED" flag when a module event is enabled, and when the module is
freed, if any of its events were enabled, the ring buffer that holds that
event is also cleared, to prevent reading stale events. The same can be
done for dynamic events.
If any dynamic event that is being removed was enabled, then make sure the
buffers they were enabled in are now cleared.
Link: https://lkml.kernel.org/r/20221123171434.545706e3@gandalf.local.home
Link: https://lore.kernel.org/all/20221110020319.1259291-1-zhengyejian1@huawei.com/
Cc: stable@vger.kernel.org
Cc: Andrew Morton <akpm@linux-foundation.org>
Depends-on: e18eb8783ec49 ("tracing: Add tracing_reset_all_online_cpus_unlocked() function")
Depends-on: 5448d44c38557 ("tracing: Add unified dynamic event framework")
Depends-on: 6212dd29683ee ("tracing/kprobes: Use dyn_event framework for kprobe events")
Depends-on: 065e63f951432 ("tracing: Only have rmmod clear buffers that its events were active in")
Depends-on: 575380da8b469 ("tracing: Only clear trace buffer on module unload if event was traced")
Fixes: 77b44d1b7c283 ("tracing/kprobes: Rename Kprobe-tracer to kprobe-event")
Reported-by: Zheng Yejian <zhengyejian1@huawei.com>
Reported-by: Yujie Liu <yujie.liu@intel.com>
Reported-by: kernel test robot <yujie.liu@intel.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2022-11-23 22:14:34 +00:00
|
|
|
busy:
|
|
|
|
/* No need to clear the trace now */
|
|
|
|
list_for_each_entry(tr, &ftrace_trace_arrays, list) {
|
|
|
|
tr->clear_trace = false;
|
|
|
|
}
|
|
|
|
return -EBUSY;
|
2013-07-29 17:50:33 +00:00
|
|
|
}
|
|
|
|
|
2018-12-04 18:35:45 +00:00
|
|
|
/* Remove an event_call */
|
|
|
|
int trace_remove_event_call(struct trace_event_call *call)
|
2009-08-13 20:34:53 +00:00
|
|
|
{
|
2013-07-29 17:50:33 +00:00
|
|
|
int ret;
|
|
|
|
|
tracing: Lock event_mutex before synth_event_mutex
synthetic event is using synth_event_mutex for protecting
synth_event_list, and event_trigger_write() path acquires
locks as below order.
event_trigger_write(event_mutex)
->trigger_process_regex(trigger_cmd_mutex)
->event_hist_trigger_func(synth_event_mutex)
On the other hand, synthetic event creation and deletion paths
call trace_add_event_call() and trace_remove_event_call()
which acquires event_mutex. In that case, if we keep the
synth_event_mutex locked while registering/unregistering synthetic
events, its dependency will be inversed.
To avoid this issue, current synthetic event is using a 2 phase
process to create/delete events. For example, it searches existing
events under synth_event_mutex to check for event-name conflicts, and
unlocks synth_event_mutex, then registers a new event under event_mutex
locked. Finally, it locks synth_event_mutex and tries to add the
new event to the list. But it can introduce complexity and a chance
for name conflicts.
To solve this simpler, this introduces trace_add_event_call_nolock()
and trace_remove_event_call_nolock() which don't acquire
event_mutex inside. synthetic event can lock event_mutex before
synth_event_mutex to solve the lock dependency issue simpler.
Link: http://lkml.kernel.org/r/154140844377.17322.13781091165954002713.stgit@devbox
Reviewed-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2018-11-05 09:00:43 +00:00
|
|
|
lockdep_assert_held(&event_mutex);
|
|
|
|
|
2017-09-21 20:22:49 +00:00
|
|
|
mutex_lock(&trace_types_lock);
|
2013-03-11 07:14:03 +00:00
|
|
|
down_write(&trace_event_sem);
|
2013-07-29 17:50:33 +00:00
|
|
|
ret = probe_remove_event_call(call);
|
2013-03-11 07:14:03 +00:00
|
|
|
up_write(&trace_event_sem);
|
2013-07-02 02:37:54 +00:00
|
|
|
mutex_unlock(&trace_types_lock);
|
tracing: Lock event_mutex before synth_event_mutex
synthetic event is using synth_event_mutex for protecting
synth_event_list, and event_trigger_write() path acquires
locks as below order.
event_trigger_write(event_mutex)
->trigger_process_regex(trigger_cmd_mutex)
->event_hist_trigger_func(synth_event_mutex)
On the other hand, synthetic event creation and deletion paths
call trace_add_event_call() and trace_remove_event_call()
which acquires event_mutex. In that case, if we keep the
synth_event_mutex locked while registering/unregistering synthetic
events, its dependency will be inversed.
To avoid this issue, current synthetic event is using a 2 phase
process to create/delete events. For example, it searches existing
events under synth_event_mutex to check for event-name conflicts, and
unlocks synth_event_mutex, then registers a new event under event_mutex
locked. Finally, it locks synth_event_mutex and tries to add the
new event to the list. But it can introduce complexity and a chance
for name conflicts.
To solve this simpler, this introduces trace_add_event_call_nolock()
and trace_remove_event_call_nolock() which don't acquire
event_mutex inside. synthetic event can lock event_mutex before
synth_event_mutex to solve the lock dependency issue simpler.
Link: http://lkml.kernel.org/r/154140844377.17322.13781091165954002713.stgit@devbox
Reviewed-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2018-11-05 09:00:43 +00:00
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
2022-03-03 22:05:31 +00:00
|
|
|
EXPORT_SYMBOL_GPL(trace_remove_event_call);
|
tracing: Lock event_mutex before synth_event_mutex
synthetic event is using synth_event_mutex for protecting
synth_event_list, and event_trigger_write() path acquires
locks as below order.
event_trigger_write(event_mutex)
->trigger_process_regex(trigger_cmd_mutex)
->event_hist_trigger_func(synth_event_mutex)
On the other hand, synthetic event creation and deletion paths
call trace_add_event_call() and trace_remove_event_call(),
which acquire event_mutex. In that case, if we keep
synth_event_mutex locked while registering/unregistering synthetic
events, the lock dependency will be inverted.
To avoid this issue, current synthetic event is using a 2 phase
process to create/delete events. For example, it searches existing
events under synth_event_mutex to check for event-name conflicts, and
unlocks synth_event_mutex, then registers a new event under event_mutex
locked. Finally, it locks synth_event_mutex and tries to add the
new event to the list. But it can introduce complexity and a chance
for name conflicts.
To solve this more simply, this introduces trace_add_event_call_nolock()
and trace_remove_event_call_nolock(), which don't acquire
event_mutex internally. Synthetic events can then lock event_mutex before
synth_event_mutex, which resolves the lock dependency issue.
Link: http://lkml.kernel.org/r/154140844377.17322.13781091165954002713.stgit@devbox
Reviewed-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2018-11-05 09:00:43 +00:00
|
|
|
|
2009-08-13 20:34:53 +00:00
|
|
|
/*
 * Iterate @event over the pointer range [@start, @end).
 *
 * Every argument is parenthesized in the expansion so that an expression
 * argument (for example a conditional expression) is not re-associated
 * with the casts or the comparison in the loop condition. @event is
 * expanded more than once, so it must be a plain lvalue without side
 * effects.
 */
#define for_each_event(event, start, end)			\
	for ((event) = (start);					\
	     (unsigned long)(event) < (unsigned long)(end);	\
	     (event)++)
|
|
|
|
|
|
|
|
#ifdef CONFIG_MODULES
|
|
|
|
|
2009-04-10 18:53:50 +00:00
|
|
|
static void trace_module_add_events(struct module *mod)
|
|
|
|
{
|
2015-05-05 15:45:27 +00:00
|
|
|
struct trace_event_call **call, **start, **end;
|
2009-04-10 18:53:50 +00:00
|
|
|
|
2014-02-26 18:37:38 +00:00
|
|
|
if (!mod->num_trace_events)
|
|
|
|
return;
|
|
|
|
|
|
|
|
/* Don't add infrastructure for mods without tracepoints */
|
|
|
|
if (trace_module_has_bad_taint(mod)) {
|
|
|
|
pr_err("%s: module has bad taint, not creating trace events\n",
|
|
|
|
mod->name);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2009-04-10 18:53:50 +00:00
|
|
|
start = mod->trace_events;
|
|
|
|
end = mod->trace_events + mod->num_trace_events;
|
|
|
|
|
|
|
|
for_each_event(call, start, end) {
|
2012-05-04 03:09:03 +00:00
|
|
|
__register_event(*call, mod);
|
2013-07-31 17:31:32 +00:00
|
|
|
__add_event_to_tracers(*call);
|
2009-04-10 18:53:50 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void trace_module_remove_events(struct module *mod)
|
|
|
|
{
|
2015-05-05 15:45:27 +00:00
|
|
|
struct trace_event_call *call, *p;
|
tracing: Have type enum modifications copy the strings
When an enum is used in the visible parts of a trace event that is
exported to user space, the user space applications like perf and
trace-cmd do not have a way to know what the value of the enum is. To
solve this, at boot up (or module load) the printk formats are modified to
replace the enum with their numeric value in the string output.
Array fields of the event are defined by [<nr-elements>] in the type
portion of the format file so that the user space parsers can correctly
parse the array into the appropriate size chunks. But in some trace
events, an enum is used in defining the size of the array, which once
again breaks the parsing of user space tooling.
This was solved the same way as the print formats were, but it modified
the type strings of the trace event. This caused crashes in some
architectures because, as supposed to the print string, is a const string
value. This was not detected on x86, as it appears that const strings are
still writable (at least in boot up), but other architectures this is not
the case, and writing to a const string will cause a kernel fault.
To fix this, use kstrdup() to copy the type before modifying it. If the
trace event is for the core kernel there's no need to free it because the
string will be in use for the life of the machine being on line. For
modules, create a link list to store all the strings being allocated for
modules and when the module is removed, free them.
Link: https://lore.kernel.org/all/yt9dr1706b4i.fsf@linux.ibm.com/
Link: https://lkml.kernel.org/r/20220318153432.3984b871@gandalf.local.home
Tested-by: Marc Zyngier <maz@kernel.org>
Tested-by: Sven Schnelle <svens@linux.ibm.com>
Reported-by: Sven Schnelle <svens@linux.ibm.com>
Fixes: b3bc8547d3be ("tracing: Have TRACE_DEFINE_ENUM affect trace event types as well")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2022-03-18 19:34:32 +00:00
|
|
|
struct module_string *modstr, *m;
|
2009-04-10 18:53:50 +00:00
|
|
|
|
2013-03-11 07:14:03 +00:00
|
|
|
down_write(&trace_event_sem);
|
2009-04-10 18:53:50 +00:00
|
|
|
list_for_each_entry_safe(call, p, &ftrace_events, list) {
|
2021-08-17 03:42:57 +00:00
|
|
|
if ((call->flags & TRACE_EVENT_FL_DYNAMIC) || !call->module)
|
|
|
|
continue;
|
|
|
|
if (call->module == mod)
|
2009-08-13 20:34:53 +00:00
|
|
|
__trace_remove_event_call(call);
|
2009-04-10 18:53:50 +00:00
|
|
|
}
|
tracing: Have type enum modifications copy the strings
When an enum is used in the visible parts of a trace event that is
exported to user space, the user space applications like perf and
trace-cmd do not have a way to know what the value of the enum is. To
solve this, at boot up (or module load) the printk formats are modified to
replace the enum with their numeric value in the string output.
Array fields of the event are defined by [<nr-elements>] in the type
portion of the format file so that the user space parsers can correctly
parse the array into the appropriate size chunks. But in some trace
events, an enum is used in defining the size of the array, which once
again breaks the parsing of user space tooling.
This was solved the same way as the print formats were, but it modified
the type strings of the trace event. This caused crashes in some
architectures because, as supposed to the print string, is a const string
value. This was not detected on x86, as it appears that const strings are
still writable (at least in boot up), but other architectures this is not
the case, and writing to a const string will cause a kernel fault.
To fix this, use kstrdup() to copy the type before modifying it. If the
trace event is for the core kernel there's no need to free it because the
string will be in use for the life of the machine being on line. For
modules, create a link list to store all the strings being allocated for
modules and when the module is removed, free them.
Link: https://lore.kernel.org/all/yt9dr1706b4i.fsf@linux.ibm.com/
Link: https://lkml.kernel.org/r/20220318153432.3984b871@gandalf.local.home
Tested-by: Marc Zyngier <maz@kernel.org>
Tested-by: Sven Schnelle <svens@linux.ibm.com>
Reported-by: Sven Schnelle <svens@linux.ibm.com>
Fixes: b3bc8547d3be ("tracing: Have TRACE_DEFINE_ENUM affect trace event types as well")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2022-03-18 19:34:32 +00:00
|
|
|
/* Check for any strings allocade for this module */
|
|
|
|
list_for_each_entry_safe(modstr, m, &module_strings, next) {
|
|
|
|
if (modstr->module != mod)
|
|
|
|
continue;
|
|
|
|
list_del(&modstr->next);
|
|
|
|
kfree(modstr->str);
|
|
|
|
kfree(modstr);
|
|
|
|
}
|
2013-03-11 07:14:03 +00:00
|
|
|
up_write(&trace_event_sem);
|
2009-05-07 01:54:09 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* It is safest to reset the ring buffer if the module being unloaded
|
2013-03-05 04:26:06 +00:00
|
|
|
* registered any events that were used. The only worry is if
|
|
|
|
* a new module gets loaded, and takes on the same id as the events
|
|
|
|
* of this module. When printing out the buffer, traced events left
|
|
|
|
* over from this module may be passed to the new module events and
|
|
|
|
* unexpected results may occur.
|
2009-05-07 01:54:09 +00:00
|
|
|
*/
|
2022-11-23 19:25:57 +00:00
|
|
|
tracing_reset_all_online_cpus_unlocked();
|
2009-04-10 18:53:50 +00:00
|
|
|
}
|
|
|
|
|
2009-04-14 22:22:32 +00:00
|
|
|
static int trace_module_notify(struct notifier_block *self,
|
|
|
|
unsigned long val, void *data)
|
2009-04-10 18:53:50 +00:00
|
|
|
{
|
|
|
|
struct module *mod = data;
|
|
|
|
|
|
|
|
mutex_lock(&event_mutex);
|
2017-09-21 20:22:49 +00:00
|
|
|
mutex_lock(&trace_types_lock);
|
2009-04-10 18:53:50 +00:00
|
|
|
switch (val) {
|
|
|
|
case MODULE_STATE_COMING:
|
|
|
|
trace_module_add_events(mod);
|
|
|
|
break;
|
|
|
|
case MODULE_STATE_GOING:
|
|
|
|
trace_module_remove_events(mod);
|
|
|
|
break;
|
|
|
|
}
|
2013-07-02 02:37:54 +00:00
|
|
|
mutex_unlock(&trace_types_lock);
|
2017-09-21 20:22:49 +00:00
|
|
|
mutex_unlock(&event_mutex);
|
2009-02-28 07:41:25 +00:00
|
|
|
|
2020-08-18 13:57:37 +00:00
|
|
|
return NOTIFY_OK;
|
2009-02-24 19:15:08 +00:00
|
|
|
}
|
2013-03-02 22:37:14 +00:00
|
|
|
|
2013-07-31 17:31:37 +00:00
|
|
|
static struct notifier_block trace_module_nb = {
|
|
|
|
.notifier_call = trace_module_notify,
|
2015-03-25 19:44:21 +00:00
|
|
|
.priority = 1, /* higher than trace.c module notify */
|
2013-07-31 17:31:37 +00:00
|
|
|
};
|
2009-04-14 22:22:32 +00:00
|
|
|
#endif /* CONFIG_MODULES */
|
2009-02-24 19:15:08 +00:00
|
|
|
|
2012-05-04 03:09:03 +00:00
|
|
|
/* Create a new event directory structure for a trace directory. */
|
|
|
|
static void
|
|
|
|
__trace_add_event_dirs(struct trace_array *tr)
|
|
|
|
{
|
2015-05-05 15:45:27 +00:00
|
|
|
struct trace_event_call *call;
|
2012-05-04 03:09:03 +00:00
|
|
|
int ret;
|
|
|
|
|
|
|
|
list_for_each_entry(call, &ftrace_events, list) {
|
2013-07-31 17:31:35 +00:00
|
|
|
ret = __trace_add_new_event(call, tr);
|
2012-05-04 03:09:03 +00:00
|
|
|
if (ret < 0)
|
2014-06-07 11:43:08 +00:00
|
|
|
pr_warn("Could not create directory for event %s\n",
|
2015-05-13 18:20:14 +00:00
|
|
|
trace_event_name(call));
|
2012-05-04 03:09:03 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-05-08 19:06:38 +00:00
|
|
|
/* Returns any file that matches the system and event */
|
2015-05-05 14:09:53 +00:00
|
|
|
struct trace_event_file *
|
2018-05-08 19:06:38 +00:00
|
|
|
__find_event_file(struct trace_array *tr, const char *system, const char *event)
|
2013-03-12 23:35:13 +00:00
|
|
|
{
|
2015-05-05 14:09:53 +00:00
|
|
|
struct trace_event_file *file;
|
2015-05-05 15:45:27 +00:00
|
|
|
struct trace_event_call *call;
|
2014-04-08 21:26:21 +00:00
|
|
|
const char *name;
|
2013-03-12 23:35:13 +00:00
|
|
|
|
|
|
|
list_for_each_entry(file, &tr->events, list) {
|
|
|
|
|
|
|
|
call = file->event_call;
|
2015-05-13 18:20:14 +00:00
|
|
|
name = trace_event_name(call);
|
2013-03-12 23:35:13 +00:00
|
|
|
|
2018-05-08 19:06:38 +00:00
|
|
|
if (!name || !call->class)
|
2013-03-12 23:35:13 +00:00
|
|
|
continue;
|
|
|
|
|
2014-04-08 21:26:21 +00:00
|
|
|
if (strcmp(event, name) == 0 &&
|
2013-03-12 23:35:13 +00:00
|
|
|
strcmp(system, call->class->system) == 0)
|
|
|
|
return file;
|
|
|
|
}
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2018-05-08 19:06:38 +00:00
|
|
|
/* Returns valid trace event files that match system and event */
|
|
|
|
struct trace_event_file *
|
|
|
|
find_event_file(struct trace_array *tr, const char *system, const char *event)
|
|
|
|
{
|
|
|
|
struct trace_event_file *file;
|
|
|
|
|
|
|
|
file = __find_event_file(tr, system, event);
|
|
|
|
if (!file || !file->event_call->class->reg ||
|
|
|
|
file->event_call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
return file;
|
|
|
|
}
|
|
|
|
|
2020-01-29 18:59:22 +00:00
|
|
|
/**
|
|
|
|
* trace_get_event_file - Find and return a trace event file
|
|
|
|
* @instance: The name of the trace instance containing the event
|
|
|
|
* @system: The name of the system containing the event
|
|
|
|
* @event: The name of the event
|
|
|
|
*
|
|
|
|
* Return a trace event file given the trace instance name, trace
|
|
|
|
* system, and trace event name. If the instance name is NULL, it
|
|
|
|
* refers to the top-level trace array.
|
|
|
|
*
|
|
|
|
* This function will look it up and return it if found, after calling
|
|
|
|
* trace_array_get() to prevent the instance from going away, and
|
|
|
|
* increment the event's module refcount to prevent it from being
|
|
|
|
* removed.
|
|
|
|
*
|
|
|
|
* To release the file, call trace_put_event_file(), which will call
|
|
|
|
* trace_array_put() and decrement the event's module refcount.
|
|
|
|
*
|
|
|
|
* Return: The trace event on success, ERR_PTR otherwise.
|
|
|
|
*/
|
|
|
|
struct trace_event_file *trace_get_event_file(const char *instance,
|
|
|
|
const char *system,
|
|
|
|
const char *event)
|
|
|
|
{
|
|
|
|
struct trace_array *tr = top_trace_array();
|
|
|
|
struct trace_event_file *file = NULL;
|
|
|
|
int ret = -EINVAL;
|
|
|
|
|
|
|
|
if (instance) {
|
|
|
|
tr = trace_array_find_get(instance);
|
|
|
|
if (!tr)
|
|
|
|
return ERR_PTR(-ENOENT);
|
|
|
|
} else {
|
|
|
|
ret = trace_array_get(tr);
|
|
|
|
if (ret)
|
|
|
|
return ERR_PTR(ret);
|
|
|
|
}
|
|
|
|
|
|
|
|
mutex_lock(&event_mutex);
|
|
|
|
|
|
|
|
file = find_event_file(tr, system, event);
|
|
|
|
if (!file) {
|
|
|
|
trace_array_put(tr);
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Don't let event modules unload while in use */
|
2021-08-17 03:42:57 +00:00
|
|
|
ret = trace_event_try_get_ref(file->event_call);
|
2020-01-29 18:59:22 +00:00
|
|
|
if (!ret) {
|
|
|
|
trace_array_put(tr);
|
|
|
|
ret = -EBUSY;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = 0;
|
|
|
|
out:
|
|
|
|
mutex_unlock(&event_mutex);
|
|
|
|
|
|
|
|
if (ret)
|
|
|
|
file = ERR_PTR(ret);
|
|
|
|
|
|
|
|
return file;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(trace_get_event_file);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* trace_put_event_file - Release a file from trace_get_event_file()
|
|
|
|
* @file: The trace event file
|
|
|
|
*
|
|
|
|
* If a file was retrieved using trace_get_event_file(), this should
|
|
|
|
* be called when it's no longer needed. It will cancel the previous
|
|
|
|
* trace_array_get() called by that function, and decrement the
|
|
|
|
* event's module refcount.
|
|
|
|
*/
|
|
|
|
void trace_put_event_file(struct trace_event_file *file)
|
|
|
|
{
|
|
|
|
mutex_lock(&event_mutex);
|
2021-08-17 03:42:57 +00:00
|
|
|
trace_event_put_ref(file->event_call);
|
2020-01-29 18:59:22 +00:00
|
|
|
mutex_unlock(&event_mutex);
|
|
|
|
|
|
|
|
trace_array_put(file->tr);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(trace_put_event_file);
|
|
|
|
|
2013-12-21 04:23:05 +00:00
|
|
|
#ifdef CONFIG_DYNAMIC_FTRACE
|
|
|
|
|
|
|
|
/* Avoid typos */
|
|
|
|
#define ENABLE_EVENT_STR "enable_event"
|
|
|
|
#define DISABLE_EVENT_STR "disable_event"
|
|
|
|
|
|
|
|
struct event_probe_data {
|
2015-05-05 14:09:53 +00:00
|
|
|
struct trace_event_file *file;
|
2013-12-21 04:23:05 +00:00
|
|
|
unsigned long count;
|
|
|
|
int ref;
|
|
|
|
bool enable;
|
|
|
|
};
|
|
|
|
|
2017-04-04 00:58:35 +00:00
|
|
|
static void update_event_probe(struct event_probe_data *data)
|
|
|
|
{
|
|
|
|
if (data->enable)
|
|
|
|
clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &data->file->flags);
|
|
|
|
else
|
|
|
|
set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &data->file->flags);
|
|
|
|
}
|
|
|
|
|
2013-03-12 23:35:13 +00:00
|
|
|
/*
 * Function probe callback: look up the event_probe_data mapped to @ip in
 * the ftrace_func_mapper passed as @data and apply it with
 * update_event_probe(). Does nothing when @ip has no entry or the entry
 * is empty.
 */
static void
event_enable_probe(unsigned long ip, unsigned long parent_ip,
		   struct trace_array *tr, struct ftrace_probe_ops *ops,
		   void *data)
{
	struct ftrace_func_mapper *mapper = data;
	void **slot = ftrace_func_mapper_find_ip(mapper, ip);

	if (slot && *slot)
		update_event_probe(*slot);
}
|
|
|
|
|
|
|
|
static void
|
2017-04-03 22:18:47 +00:00
|
|
|
event_enable_count_probe(unsigned long ip, unsigned long parent_ip,
|
2017-04-11 02:30:05 +00:00
|
|
|
struct trace_array *tr, struct ftrace_probe_ops *ops,
|
tracing/ftrace: Add a better way to pass data via the probe functions
With the redesign of the registration and execution of the function probes
(triggers), data can now be passed from the setup of the probe to the probe
callers that are specific to the trace_array it is on. Although, all probes
still only affect the toplevel trace array, this change will allow for
instances to have their own probes separated from other instances and the
top array.
That is, something like the stacktrace probe can be set to trace only in an
instance and not the toplevel trace array. This isn't implemented yet, but
this change lays the groundwork for it.
When a probe callback is triggered (someone writes the probe format into
set_ftrace_filter), it calls register_ftrace_function_probe() passing in
init_data that will be used to initialize the probe. Then for every matching
function, register_ftrace_function_probe() will call the probe_ops->init()
function with the init data that was passed to it, as well as an address to
a place holder that is associated with the probe and the instance. The first
occurrence will have a NULL in the pointer. The init() function will then
initialize it. If other probes are added, or more functions are part of the
probe, the place holder will be passed to the init() function with the place
holder data that it was initialized to the last time.
Then this place_holder is passed to each of the other probe_ops functions,
where it can be used in the function callback. When the probe_ops free()
function is called, it can be called either with the rip of the function
that is being removed from the probe, or zero, indicating that there are no
more functions attached to the probe, and the place holder is about to be
freed. This gives the probe_ops a way to free the data it assigned to the
place holder if it was allocated during the first init call.
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-04-20 02:39:44 +00:00
|
|
|
void *data)
|
2013-03-12 23:35:13 +00:00
|
|
|
{
|
tracing/ftrace: Add a better way to pass data via the probe functions
With the redesign of the registration and execution of the function probes
(triggers), data can now be passed from the setup of the probe to the probe
callers that are specific to the trace_array it is on. Although, all probes
still only affect the toplevel trace array, this change will allow for
instances to have their own probes separated from other instances and the
top array.
That is, something like the stacktrace probe can be set to trace only in an
instance and not the toplevel trace array. This isn't implemented yet, but
this change lays the groundwork for it.
When a probe callback is triggered (someone writes the probe format into
set_ftrace_filter), it calls register_ftrace_function_probe() passing in
init_data that will be used to initialize the probe. Then for every matching
function, register_ftrace_function_probe() will call the probe_ops->init()
function with the init data that was passed to it, as well as an address to
a place holder that is associated with the probe and the instance. The first
occurrence will have a NULL in the pointer. The init() function will then
initialize it. If other probes are added, or more functions are part of the
probe, the place holder will be passed to the init() function with the place
holder data that it was initialized to the last time.
Then this place_holder is passed to each of the other probe_ops functions,
where it can be used in the function callback. When the probe_ops free()
function is called, it can be called either with the rip of the function
that is being removed from the probe, or zero, indicating that there are no
more functions attached to the probe, and the place holder is about to be
freed. This gives the probe_ops a way to free the data it assigned to the
place holder if it was allocated during the first init call.
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-04-20 02:39:44 +00:00
|
|
|
struct ftrace_func_mapper *mapper = data;
|
|
|
|
struct event_probe_data *edata;
|
2017-04-04 00:58:35 +00:00
|
|
|
void **pdata;
|
2013-03-12 23:35:13 +00:00
|
|
|
|
2017-04-04 00:58:35 +00:00
|
|
|
pdata = ftrace_func_mapper_find_ip(mapper, ip);
|
|
|
|
if (!pdata || !*pdata)
|
2013-03-12 23:35:13 +00:00
|
|
|
return;
|
|
|
|
|
tracing/ftrace: Add a better way to pass data via the probe functions
With the redesign of the registration and execution of the function probes
(triggers), data can now be passed from the setup of the probe to the probe
callers that are specific to the trace_array it is on. Although, all probes
still only affect the toplevel trace array, this change will allow for
instances to have their own probes separated from other instances and the
top array.
That is, something like the stacktrace probe can be set to trace only in an
instance and not the toplevel trace array. This isn't implemented yet, but
this change lays the groundwork for it.
When a probe callback is triggered (someone writes the probe format into
set_ftrace_filter), it calls register_ftrace_function_probe() passing in
init_data that will be used to initialize the probe. Then for every matching
function, register_ftrace_function_probe() will call the probe_ops->init()
function with the init data that was passed to it, as well as an address to
a place holder that is associated with the probe and the instance. The first
occurrence will have a NULL in the pointer. The init() function will then
initialize it. If other probes are added, or more functions are part of the
probe, the place holder will be passed to the init() function with the place
holder data that it was initialized to the last time.
Then this place_holder is passed to each of the other probe_ops functions,
where it can be used in the function callback. When the probe_ops free()
function is called, it can be called either with the rip of the function
that is being removed from the probe, or zero, indicating that there are no
more functions attached to the probe, and the place holder is about to be
freed. This gives the probe_ops a way to free the data it assigned to the
place holder if it was allocated during the first init call.
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-04-20 02:39:44 +00:00
|
|
|
edata = *pdata;
|
2017-04-04 00:58:35 +00:00
|
|
|
|
tracing/ftrace: Add a better way to pass data via the probe functions
With the redesign of the registration and execution of the function probes
(triggers), data can now be passed from the setup of the probe to the probe
callers that are specific to the trace_array it is on. Although, all probes
still only affect the toplevel trace array, this change will allow for
instances to have their own probes separated from other instances and the
top array.
That is, something like the stacktrace probe can be set to trace only in an
instance and not the toplevel trace array. This isn't implemented yet, but
this change sets the ground work for the change.
When a probe callback is triggered (someone writes the probe format into
set_ftrace_filter), it calls register_ftrace_function_probe() passing in
init_data that will be used to initialize the probe. Then for every matching
function, register_ftrace_function_probe() will call the probe_ops->init()
function with the init data that was passed to it, as well as an address to
a place holder that is associated with the probe and the instance. The first
occurrence will have a NULL in the pointer. The init() function will then
initialize it. If other probes are added, or more functions are part of the
probe, the place holder will be passed to the init() function with the place
holder data that it was initialized to the last time.
Then this place_holder is passed to each of the other probe_ops functions,
where it can be used in the function callback. When the probe_ops free()
function is called, it can be called either with the rip of the function
that is being removed from the probe, or zero, indicating that there are no
more functions attached to the probe, and the place holder is about to be
freed. This gives the probe_ops a way to free the data it assigned to the
place holder if it was allocated during the first init call.
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-04-20 02:39:44 +00:00
|
|
|
if (!edata->count)
|
2013-03-12 23:35:13 +00:00
|
|
|
return;
|
|
|
|
|
|
|
|
/* Skip if the event is in a state we want to switch to */
|
tracing/ftrace: Add a better way to pass data via the probe functions
With the redesign of the registration and execution of the function probes
(triggers), data can now be passed from the setup of the probe to the probe
callers that are specific to the trace_array it is on. Although, all probes
still only affect the toplevel trace array, this change will allow for
instances to have their own probes separated from other instances and the
top array.
That is, something like the stacktrace probe can be set to trace only in an
instance and not the toplevel trace array. This isn't implemented yet, but
this change sets the ground work for the change.
When a probe callback is triggered (someone writes the probe format into
set_ftrace_filter), it calls register_ftrace_function_probe() passing in
init_data that will be used to initialize the probe. Then for every matching
function, register_ftrace_function_probe() will call the probe_ops->init()
function with the init data that was passed to it, as well as an address to
a place holder that is associated with the probe and the instance. The first
occurrence will have a NULL in the pointer. The init() function will then
initialize it. If other probes are added, or more functions are part of the
probe, the place holder will be passed to the init() function with the place
holder data that it was initialized to the last time.
Then this place_holder is passed to each of the other probe_ops functions,
where it can be used in the function callback. When the probe_ops free()
function is called, it can be called either with the rip of the function
that is being removed from the probe, or zero, indicating that there are no
more functions attached to the probe, and the place holder is about to be
freed. This gives the probe_ops a way to free the data it assigned to the
place holder if it was allocated during the first init call.
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-04-20 02:39:44 +00:00
|
|
|
if (edata->enable == !(edata->file->flags & EVENT_FILE_FL_SOFT_DISABLED))
|
2013-03-12 23:35:13 +00:00
|
|
|
return;
|
|
|
|
|
tracing/ftrace: Add a better way to pass data via the probe functions
With the redesign of the registration and execution of the function probes
(triggers), data can now be passed from the setup of the probe to the probe
callers that are specific to the trace_array it is on. Although, all probes
still only affect the toplevel trace array, this change will allow for
instances to have their own probes separated from other instances and the
top array.
That is, something like the stacktrace probe can be set to trace only in an
instance and not the toplevel trace array. This isn't implemented yet, but
this change sets the ground work for the change.
When a probe callback is triggered (someone writes the probe format into
set_ftrace_filter), it calls register_ftrace_function_probe() passing in
init_data that will be used to initialize the probe. Then for every matching
function, register_ftrace_function_probe() will call the probe_ops->init()
function with the init data that was passed to it, as well as an address to
a place holder that is associated with the probe and the instance. The first
occurrence will have a NULL in the pointer. The init() function will then
initialize it. If other probes are added, or more functions are part of the
probe, the place holder will be passed to the init() function with the place
holder data that it was initialized to the last time.
Then this place_holder is passed to each of the other probe_ops functions,
where it can be used in the function callback. When the probe_ops free()
function is called, it can be called either with the rip of the function
that is being removed from the probe, or zero, indicating that there are no
more functions attached to the probe, and the place holder is about to be
freed. This gives the probe_ops a way to free the data it assigned to the
place holder if it was allocated during the first init call.
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-04-20 02:39:44 +00:00
|
|
|
if (edata->count != -1)
|
|
|
|
(edata->count)--;
|
2013-03-12 23:35:13 +00:00
|
|
|
|
tracing/ftrace: Add a better way to pass data via the probe functions
With the redesign of the registration and execution of the function probes
(triggers), data can now be passed from the setup of the probe to the probe
callers that are specific to the trace_array it is on. Although, all probes
still only affect the toplevel trace array, this change will allow for
instances to have their own probes separated from other instances and the
top array.
That is, something like the stacktrace probe can be set to trace only in an
instance and not the toplevel trace array. This isn't implemented yet, but
this change sets the ground work for the change.
When a probe callback is triggered (someone writes the probe format into
set_ftrace_filter), it calls register_ftrace_function_probe() passing in
init_data that will be used to initialize the probe. Then for every matching
function, register_ftrace_function_probe() will call the probe_ops->init()
function with the init data that was passed to it, as well as an address to
a place holder that is associated with the probe and the instance. The first
occurrence will have a NULL in the pointer. The init() function will then
initialize it. If other probes are added, or more functions are part of the
probe, the place holder will be passed to the init() function with the place
holder data that it was initialized to the last time.
Then this place_holder is passed to each of the other probe_ops functions,
where it can be used in the function callback. When the probe_ops free()
function is called, it can be called either with the rip of the function
that is being removed from the probe, or zero, indicating that there are no
more functions attached to the probe, and the place holder is about to be
freed. This gives the probe_ops a way to free the data it assigned to the
place holder if it was allocated during the first init call.
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-04-20 02:39:44 +00:00
|
|
|
update_event_probe(edata);
|
2013-03-12 23:35:13 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
event_enable_print(struct seq_file *m, unsigned long ip,
|
tracing/ftrace: Add a better way to pass data via the probe functions
With the redesign of the registration and execution of the function probes
(triggers), data can now be passed from the setup of the probe to the probe
callers that are specific to the trace_array it is on. Although, all probes
still only affect the toplevel trace array, this change will allow for
instances to have their own probes separated from other instances and the
top array.
That is, something like the stacktrace probe can be set to trace only in an
instance and not the toplevel trace array. This isn't implement yet, but
this change sets the ground work for the change.
When a probe callback is triggered (someone writes the probe format into
set_ftrace_filter), it calls register_ftrace_function_probe() passing in
init_data that will be used to initialize the probe. Then for every matching
function, register_ftrace_function_probe() will call the probe_ops->init()
function with the init data that was passed to it, as well as an address to
a place holder that is associated with the probe and the instance. The first
occurrence will have a NULL in the pointer. The init() function will then
initialize it. If other probes are added, or more functions are part of the
probe, the place holder will be passed to the init() function with the place
holder data that it was initialized to the last time.
Then this place_holder is passed to each of the other probe_ops functions,
where it can be used in the function callback. When the probe_ops free()
function is called, it can be called either with the rip of the function
that is being removed from the probe, or zero, indicating that there are no
more functions attached to the probe, and the place holder is about to be
freed. This gives the probe_ops a way to free the data it assigned to the
place holder if it was allocade during the first init call.
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-04-20 02:39:44 +00:00
|
|
|
struct ftrace_probe_ops *ops, void *data)
|
2013-03-12 23:35:13 +00:00
|
|
|
{
|
tracing/ftrace: Add a better way to pass data via the probe functions
With the redesign of the registration and execution of the function probes
(triggers), data can now be passed from the setup of the probe to the probe
callers that are specific to the trace_array it is on. Although, all probes
still only affect the toplevel trace array, this change will allow for
instances to have their own probes separated from other instances and the
top array.
That is, something like the stacktrace probe can be set to trace only in an
instance and not the toplevel trace array. This isn't implement yet, but
this change sets the ground work for the change.
When a probe callback is triggered (someone writes the probe format into
set_ftrace_filter), it calls register_ftrace_function_probe() passing in
init_data that will be used to initialize the probe. Then for every matching
function, register_ftrace_function_probe() will call the probe_ops->init()
function with the init data that was passed to it, as well as an address to
a place holder that is associated with the probe and the instance. The first
occurrence will have a NULL in the pointer. The init() function will then
initialize it. If other probes are added, or more functions are part of the
probe, the place holder will be passed to the init() function with the place
holder data that it was initialized to the last time.
Then this place_holder is passed to each of the other probe_ops functions,
where it can be used in the function callback. When the probe_ops free()
function is called, it can be called either with the rip of the function
that is being removed from the probe, or zero, indicating that there are no
more functions attached to the probe, and the place holder is about to be
freed. This gives the probe_ops a way to free the data it assigned to the
place holder if it was allocade during the first init call.
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-04-20 02:39:44 +00:00
|
|
|
struct ftrace_func_mapper *mapper = data;
|
|
|
|
struct event_probe_data *edata;
|
2017-04-04 00:58:35 +00:00
|
|
|
void **pdata;
|
|
|
|
|
|
|
|
pdata = ftrace_func_mapper_find_ip(mapper, ip);
|
|
|
|
|
|
|
|
if (WARN_ON_ONCE(!pdata || !*pdata))
|
|
|
|
return 0;
|
|
|
|
|
tracing/ftrace: Add a better way to pass data via the probe functions
With the redesign of the registration and execution of the function probes
(triggers), data can now be passed from the setup of the probe to the probe
callers that are specific to the trace_array it is on. Although, all probes
still only affect the toplevel trace array, this change will allow for
instances to have their own probes separated from other instances and the
top array.
That is, something like the stacktrace probe can be set to trace only in an
instance and not the toplevel trace array. This isn't implement yet, but
this change sets the ground work for the change.
When a probe callback is triggered (someone writes the probe format into
set_ftrace_filter), it calls register_ftrace_function_probe() passing in
init_data that will be used to initialize the probe. Then for every matching
function, register_ftrace_function_probe() will call the probe_ops->init()
function with the init data that was passed to it, as well as an address to
a place holder that is associated with the probe and the instance. The first
occurrence will have a NULL in the pointer. The init() function will then
initialize it. If other probes are added, or more functions are part of the
probe, the place holder will be passed to the init() function with the place
holder data that it was initialized to the last time.
Then this place_holder is passed to each of the other probe_ops functions,
where it can be used in the function callback. When the probe_ops free()
function is called, it can be called either with the rip of the function
that is being removed from the probe, or zero, indicating that there are no
more functions attached to the probe, and the place holder is about to be
freed. This gives the probe_ops a way to free the data it assigned to the
place holder if it was allocade during the first init call.
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-04-20 02:39:44 +00:00
|
|
|
edata = *pdata;
|
2013-03-12 23:35:13 +00:00
|
|
|
|
|
|
|
seq_printf(m, "%ps:", (void *)ip);
|
|
|
|
|
|
|
|
seq_printf(m, "%s:%s:%s",
|
tracing/ftrace: Add a better way to pass data via the probe functions
With the redesign of the registration and execution of the function probes
(triggers), data can now be passed from the setup of the probe to the probe
callers that are specific to the trace_array it is on. Although, all probes
still only affect the toplevel trace array, this change will allow for
instances to have their own probes separated from other instances and the
top array.
That is, something like the stacktrace probe can be set to trace only in an
instance and not the toplevel trace array. This isn't implement yet, but
this change sets the ground work for the change.
When a probe callback is triggered (someone writes the probe format into
set_ftrace_filter), it calls register_ftrace_function_probe() passing in
init_data that will be used to initialize the probe. Then for every matching
function, register_ftrace_function_probe() will call the probe_ops->init()
function with the init data that was passed to it, as well as an address to
a place holder that is associated with the probe and the instance. The first
occurrence will have a NULL in the pointer. The init() function will then
initialize it. If other probes are added, or more functions are part of the
probe, the place holder will be passed to the init() function with the place
holder data that it was initialized to the last time.
Then this place_holder is passed to each of the other probe_ops functions,
where it can be used in the function callback. When the probe_ops free()
function is called, it can be called either with the rip of the function
that is being removed from the probe, or zero, indicating that there are no
more functions attached to the probe, and the place holder is about to be
freed. This gives the probe_ops a way to free the data it assigned to the
place holder if it was allocade during the first init call.
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-04-20 02:39:44 +00:00
|
|
|
edata->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR,
|
|
|
|
edata->file->event_call->class->system,
|
|
|
|
trace_event_name(edata->file->event_call));
|
2013-03-12 23:35:13 +00:00
|
|
|
|
tracing/ftrace: Add a better way to pass data via the probe functions
With the redesign of the registration and execution of the function probes
(triggers), data can now be passed from the setup of the probe to the probe
callers that are specific to the trace_array it is on. Although, all probes
still only affect the toplevel trace array, this change will allow for
instances to have their own probes separated from other instances and the
top array.
That is, something like the stacktrace probe can be set to trace only in an
instance and not the toplevel trace array. This isn't implement yet, but
this change sets the ground work for the change.
When a probe callback is triggered (someone writes the probe format into
set_ftrace_filter), it calls register_ftrace_function_probe() passing in
init_data that will be used to initialize the probe. Then for every matching
function, register_ftrace_function_probe() will call the probe_ops->init()
function with the init data that was passed to it, as well as an address to
a place holder that is associated with the probe and the instance. The first
occurrence will have a NULL in the pointer. The init() function will then
initialize it. If other probes are added, or more functions are part of the
probe, the place holder will be passed to the init() function with the place
holder data that it was initialized to the last time.
Then this place_holder is passed to each of the other probe_ops functions,
where it can be used in the function callback. When the probe_ops free()
function is called, it can be called either with the rip of the function
that is being removed from the probe, or zero, indicating that there are no
more functions attached to the probe, and the place holder is about to be
freed. This gives the probe_ops a way to free the data it assigned to the
place holder if it was allocade during the first init call.
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-04-20 02:39:44 +00:00
|
|
|
if (edata->count == -1)
|
2014-11-08 20:42:10 +00:00
|
|
|
seq_puts(m, ":unlimited\n");
|
2013-03-12 23:35:13 +00:00
|
|
|
else
|
tracing/ftrace: Add a better way to pass data via the probe functions
With the redesign of the registration and execution of the function probes
(triggers), data can now be passed from the setup of the probe to the probe
callers that are specific to the trace_array it is on. Although, all probes
still only affect the toplevel trace array, this change will allow for
instances to have their own probes separated from other instances and the
top array.
That is, something like the stacktrace probe can be set to trace only in an
instance and not the toplevel trace array. This isn't implement yet, but
this change sets the ground work for the change.
When a probe callback is triggered (someone writes the probe format into
set_ftrace_filter), it calls register_ftrace_function_probe() passing in
init_data that will be used to initialize the probe. Then for every matching
function, register_ftrace_function_probe() will call the probe_ops->init()
function with the init data that was passed to it, as well as an address to
a place holder that is associated with the probe and the instance. The first
occurrence will have a NULL in the pointer. The init() function will then
initialize it. If other probes are added, or more functions are part of the
probe, the place holder will be passed to the init() function with the place
holder data that it was initialized to the last time.
Then this place_holder is passed to each of the other probe_ops functions,
where it can be used in the function callback. When the probe_ops free()
function is called, it can be called either with the rip of the function
that is being removed from the probe, or zero, indicating that there are no
more functions attached to the probe, and the place holder is about to be
freed. This gives the probe_ops a way to free the data it assigned to the
place holder if it was allocade during the first init call.
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-04-20 02:39:44 +00:00
|
|
|
seq_printf(m, ":count=%ld\n", edata->count);
|
2013-03-12 23:35:13 +00:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
2017-04-11 02:30:05 +00:00
|
|
|
event_enable_init(struct ftrace_probe_ops *ops, struct trace_array *tr,
|
tracing/ftrace: Add a better way to pass data via the probe functions
With the redesign of the registration and execution of the function probes
(triggers), data can now be passed from the setup of the probe to the probe
callers that are specific to the trace_array it is on. Although, all probes
still only affect the toplevel trace array, this change will allow for
instances to have their own probes separated from other instances and the
top array.
That is, something like the stacktrace probe can be set to trace only in an
instance and not the toplevel trace array. This isn't implement yet, but
this change sets the ground work for the change.
When a probe callback is triggered (someone writes the probe format into
set_ftrace_filter), it calls register_ftrace_function_probe() passing in
init_data that will be used to initialize the probe. Then for every matching
function, register_ftrace_function_probe() will call the probe_ops->init()
function with the init data that was passed to it, as well as an address to
a place holder that is associated with the probe and the instance. The first
occurrence will have a NULL in the pointer. The init() function will then
initialize it. If other probes are added, or more functions are part of the
probe, the place holder will be passed to the init() function with the place
holder data that it was initialized to the last time.
Then this place_holder is passed to each of the other probe_ops functions,
where it can be used in the function callback. When the probe_ops free()
function is called, it can be called either with the rip of the function
that is being removed from the probe, or zero, indicating that there are no
more functions attached to the probe, and the place holder is about to be
freed. This gives the probe_ops a way to free the data it assigned to the
place holder if it was allocade during the first init call.
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-04-20 02:39:44 +00:00
|
|
|
unsigned long ip, void *init_data, void **data)
|
2013-03-12 23:35:13 +00:00
|
|
|
{
|
tracing/ftrace: Add a better way to pass data via the probe functions
With the redesign of the registration and execution of the function probes
(triggers), data can now be passed from the setup of the probe to the probe
callers that are specific to the trace_array it is on. Although, all probes
still only affect the toplevel trace array, this change will allow for
instances to have their own probes separated from other instances and the
top array.
That is, something like the stacktrace probe can be set to trace only in an
instance and not the toplevel trace array. This isn't implement yet, but
this change sets the ground work for the change.
When a probe callback is triggered (someone writes the probe format into
set_ftrace_filter), it calls register_ftrace_function_probe() passing in
init_data that will be used to initialize the probe. Then for every matching
function, register_ftrace_function_probe() will call the probe_ops->init()
function with the init data that was passed to it, as well as an address to
a place holder that is associated with the probe and the instance. The first
occurrence will have a NULL in the pointer. The init() function will then
initialize it. If other probes are added, or more functions are part of the
probe, the place holder will be passed to the init() function with the place
holder data that it was initialized to the last time.
Then this place_holder is passed to each of the other probe_ops functions,
where it can be used in the function callback. When the probe_ops free()
function is called, it can be called either with the rip of the function
that is being removed from the probe, or zero, indicating that there are no
more functions attached to the probe, and the place holder is about to be
freed. This gives the probe_ops a way to free the data it assigned to the
place holder if it was allocade during the first init call.
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-04-20 02:39:44 +00:00
|
|
|
struct ftrace_func_mapper *mapper = *data;
|
|
|
|
struct event_probe_data *edata = init_data;
|
2017-04-04 00:58:35 +00:00
|
|
|
int ret;
|
|
|
|
|
tracing/ftrace: Add a better way to pass data via the probe functions
With the redesign of the registration and execution of the function probes
(triggers), data can now be passed from the setup of the probe to the probe
callers that are specific to the trace_array it is on. Although, all probes
still only affect the toplevel trace array, this change will allow for
instances to have their own probes separated from other instances and the
top array.
That is, something like the stacktrace probe can be set to trace only in an
instance and not the toplevel trace array. This isn't implement yet, but
this change sets the ground work for the change.
When a probe callback is triggered (someone writes the probe format into
set_ftrace_filter), it calls register_ftrace_function_probe() passing in
init_data that will be used to initialize the probe. Then for every matching
function, register_ftrace_function_probe() will call the probe_ops->init()
function with the init data that was passed to it, as well as an address to
a place holder that is associated with the probe and the instance. The first
occurrence will have a NULL in the pointer. The init() function will then
initialize it. If other probes are added, or more functions are part of the
probe, the place holder will be passed to the init() function with the place
holder data that it was initialized to the last time.
Then this place_holder is passed to each of the other probe_ops functions,
where it can be used in the function callback. When the probe_ops free()
function is called, it can be called either with the rip of the function
that is being removed from the probe, or zero, indicating that there are no
more functions attached to the probe, and the place holder is about to be
freed. This gives the probe_ops a way to free the data it assigned to the
place holder if it was allocade during the first init call.
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-04-20 02:39:44 +00:00
|
|
|
if (!mapper) {
|
|
|
|
mapper = allocate_ftrace_func_mapper();
|
|
|
|
if (!mapper)
|
|
|
|
return -ENODEV;
|
|
|
|
*data = mapper;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = ftrace_func_mapper_add_ip(mapper, ip, edata);
|
2017-04-04 00:58:35 +00:00
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
2013-03-12 23:35:13 +00:00
|
|
|
|
tracing/ftrace: Add a better way to pass data via the probe functions
With the redesign of the registration and execution of the function probes
(triggers), data can now be passed from the setup of the probe to the probe
callers that are specific to the trace_array it is on. Although, all probes
still only affect the toplevel trace array, this change will allow for
instances to have their own probes separated from other instances and the
top array.
That is, something like the stacktrace probe can be set to trace only in an
instance and not the toplevel trace array. This isn't implement yet, but
this change sets the ground work for the change.
When a probe callback is triggered (someone writes the probe format into
set_ftrace_filter), it calls register_ftrace_function_probe() passing in
init_data that will be used to initialize the probe. Then for every matching
function, register_ftrace_function_probe() will call the probe_ops->init()
function with the init data that was passed to it, as well as an address to
a place holder that is associated with the probe and the instance. The first
occurrence will have a NULL in the pointer. The init() function will then
initialize it. If other probes are added, or more functions are part of the
probe, the place holder will be passed to the init() function with the place
holder data that it was initialized to the last time.
Then this place_holder is passed to each of the other probe_ops functions,
where it can be used in the function callback. When the probe_ops free()
function is called, it can be called either with the rip of the function
that is being removed from the probe, or zero, indicating that there are no
more functions attached to the probe, and the place holder is about to be
freed. This gives the probe_ops a way to free the data it assigned to the
place holder if it was allocade during the first init call.
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-04-20 02:39:44 +00:00
|
|
|
edata->ref++;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int free_probe_data(void *data)
|
|
|
|
{
|
|
|
|
struct event_probe_data *edata = data;
|
2017-04-04 00:58:35 +00:00
|
|
|
|
tracing/ftrace: Add a better way to pass data via the probe functions
With the redesign of the registration and execution of the function probes
(triggers), data can now be passed from the setup of the probe to the probe
callers that are specific to the trace_array it is on. Although, all probes
still only affect the toplevel trace array, this change will allow for
instances to have their own probes separated from other instances and the
top array.
That is, something like the stacktrace probe can be set to trace only in an
instance and not the toplevel trace array. This isn't implemented yet, but
this change sets the ground work for the change.
When a probe callback is triggered (someone writes the probe format into
set_ftrace_filter), it calls register_ftrace_function_probe() passing in
init_data that will be used to initialize the probe. Then for every matching
function, register_ftrace_function_probe() will call the probe_ops->init()
function with the init data that was passed to it, as well as an address to
a place holder that is associated with the probe and the instance. The first
occurrence will have a NULL in the pointer. The init() function will then
initialize it. If other probes are added, or more functions are part of the
probe, the place holder will be passed to the init() function with the place
holder data that it was initialized to the last time.
Then this place_holder is passed to each of the other probe_ops functions,
where it can be used in the function callback. When the probe_ops free()
function is called, it can be called either with the rip of the function
that is being removed from the probe, or zero, indicating that there are no
more functions attached to the probe, and the place holder is about to be
freed. This gives the probe_ops a way to free the data it assigned to the
place holder if it was allocade during the first init call.
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-04-20 02:39:44 +00:00
|
|
|
edata->ref--;
|
|
|
|
if (!edata->ref) {
|
|
|
|
/* Remove the SOFT_MODE flag */
|
|
|
|
__ftrace_event_enable_disable(edata->file, 0, 1);
|
2021-08-17 03:42:57 +00:00
|
|
|
trace_event_put_ref(edata->file->event_call);
|
tracing/ftrace: Add a better way to pass data via the probe functions
With the redesign of the registration and execution of the function probes
(triggers), data can now be passed from the setup of the probe to the probe
callers that are specific to the trace_array it is on. Although, all probes
still only affect the toplevel trace array, this change will allow for
instances to have their own probes separated from other instances and the
top array.
That is, something like the stacktrace probe can be set to trace only in an
instance and not the toplevel trace array. This isn't implement yet, but
this change sets the ground work for the change.
When a probe callback is triggered (someone writes the probe format into
set_ftrace_filter), it calls register_ftrace_function_probe() passing in
init_data that will be used to initialize the probe. Then for every matching
function, register_ftrace_function_probe() will call the probe_ops->init()
function with the init data that was passed to it, as well as an address to
a place holder that is associated with the probe and the instance. The first
occurrence will have a NULL in the pointer. The init() function will then
initialize it. If other probes are added, or more functions are part of the
probe, the place holder will be passed to the init() function with the place
holder data that it was initialized to the last time.
Then this place_holder is passed to each of the other probe_ops functions,
where it can be used in the function callback. When the probe_ops free()
function is called, it can be called either with the rip of the function
that is being removed from the probe, or zero, indicating that there are no
more functions attached to the probe, and the place holder is about to be
freed. This gives the probe_ops a way to free the data it assigned to the
place holder if it was allocade during the first init call.
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-04-20 02:39:44 +00:00
|
|
|
kfree(edata);
|
|
|
|
}
|
2013-03-12 23:35:13 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2017-04-11 02:30:05 +00:00
|
|
|
event_enable_free(struct ftrace_probe_ops *ops, struct trace_array *tr,
|
tracing/ftrace: Add a better way to pass data via the probe functions
With the redesign of the registration and execution of the function probes
(triggers), data can now be passed from the setup of the probe to the probe
callers that are specific to the trace_array it is on. Although, all probes
still only affect the toplevel trace array, this change will allow for
instances to have their own probes separated from other instances and the
top array.
That is, something like the stacktrace probe can be set to trace only in an
instance and not the toplevel trace array. This isn't implement yet, but
this change sets the ground work for the change.
When a probe callback is triggered (someone writes the probe format into
set_ftrace_filter), it calls register_ftrace_function_probe() passing in
init_data that will be used to initialize the probe. Then for every matching
function, register_ftrace_function_probe() will call the probe_ops->init()
function with the init data that was passed to it, as well as an address to
a place holder that is associated with the probe and the instance. The first
occurrence will have a NULL in the pointer. The init() function will then
initialize it. If other probes are added, or more functions are part of the
probe, the place holder will be passed to the init() function with the place
holder data that it was initialized to the last time.
Then this place_holder is passed to each of the other probe_ops functions,
where it can be used in the function callback. When the probe_ops free()
function is called, it can be called either with the rip of the function
that is being removed from the probe, or zero, indicating that there are no
more functions attached to the probe, and the place holder is about to be
freed. This gives the probe_ops a way to free the data it assigned to the
place holder if it was allocade during the first init call.
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-04-20 02:39:44 +00:00
|
|
|
unsigned long ip, void *data)
|
2013-03-12 23:35:13 +00:00
|
|
|
{
|
tracing/ftrace: Add a better way to pass data via the probe functions
With the redesign of the registration and execution of the function probes
(triggers), data can now be passed from the setup of the probe to the probe
callers that are specific to the trace_array it is on. Although, all probes
still only affect the toplevel trace array, this change will allow for
instances to have their own probes separated from other instances and the
top array.
That is, something like the stacktrace probe can be set to trace only in an
instance and not the toplevel trace array. This isn't implement yet, but
this change sets the ground work for the change.
When a probe callback is triggered (someone writes the probe format into
set_ftrace_filter), it calls register_ftrace_function_probe() passing in
init_data that will be used to initialize the probe. Then for every matching
function, register_ftrace_function_probe() will call the probe_ops->init()
function with the init data that was passed to it, as well as an address to
a place holder that is associated with the probe and the instance. The first
occurrence will have a NULL in the pointer. The init() function will then
initialize it. If other probes are added, or more functions are part of the
probe, the place holder will be passed to the init() function with the place
holder data that it was initialized to the last time.
Then this place_holder is passed to each of the other probe_ops functions,
where it can be used in the function callback. When the probe_ops free()
function is called, it can be called either with the rip of the function
that is being removed from the probe, or zero, indicating that there are no
more functions attached to the probe, and the place holder is about to be
freed. This gives the probe_ops a way to free the data it assigned to the
place holder if it was allocade during the first init call.
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-04-20 02:39:44 +00:00
|
|
|
struct ftrace_func_mapper *mapper = data;
|
|
|
|
struct event_probe_data *edata;
|
|
|
|
|
|
|
|
if (!ip) {
|
|
|
|
if (!mapper)
|
|
|
|
return;
|
|
|
|
free_ftrace_func_mapper(mapper, free_probe_data);
|
|
|
|
return;
|
|
|
|
}
|
2017-04-04 00:58:35 +00:00
|
|
|
|
tracing/ftrace: Add a better way to pass data via the probe functions
With the redesign of the registration and execution of the function probes
(triggers), data can now be passed from the setup of the probe to the probe
callers that are specific to the trace_array it is on. Although, all probes
still only affect the toplevel trace array, this change will allow for
instances to have their own probes separated from other instances and the
top array.
That is, something like the stacktrace probe can be set to trace only in an
instance and not the toplevel trace array. This isn't implement yet, but
this change sets the ground work for the change.
When a probe callback is triggered (someone writes the probe format into
set_ftrace_filter), it calls register_ftrace_function_probe() passing in
init_data that will be used to initialize the probe. Then for every matching
function, register_ftrace_function_probe() will call the probe_ops->init()
function with the init data that was passed to it, as well as an address to
a place holder that is associated with the probe and the instance. The first
occurrence will have a NULL in the pointer. The init() function will then
initialize it. If other probes are added, or more functions are part of the
probe, the place holder will be passed to the init() function with the place
holder data that it was initialized to the last time.
Then this place_holder is passed to each of the other probe_ops functions,
where it can be used in the function callback. When the probe_ops free()
function is called, it can be called either with the rip of the function
that is being removed from the probe, or zero, indicating that there are no
more functions attached to the probe, and the place holder is about to be
freed. This gives the probe_ops a way to free the data it assigned to the
place holder if it was allocade during the first init call.
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-04-20 02:39:44 +00:00
|
|
|
edata = ftrace_func_mapper_remove_ip(mapper, ip);
|
2017-04-04 00:58:35 +00:00
|
|
|
|
tracing/ftrace: Add a better way to pass data via the probe functions
With the redesign of the registration and execution of the function probes
(triggers), data can now be passed from the setup of the probe to the probe
callers that are specific to the trace_array it is on. Although, all probes
still only affect the toplevel trace array, this change will allow for
instances to have their own probes separated from other instances and the
top array.
That is, something like the stacktrace probe can be set to trace only in an
instance and not the toplevel trace array. This isn't implement yet, but
this change sets the ground work for the change.
When a probe callback is triggered (someone writes the probe format into
set_ftrace_filter), it calls register_ftrace_function_probe() passing in
init_data that will be used to initialize the probe. Then for every matching
function, register_ftrace_function_probe() will call the probe_ops->init()
function with the init data that was passed to it, as well as an address to
a place holder that is associated with the probe and the instance. The first
occurrence will have a NULL in the pointer. The init() function will then
initialize it. If other probes are added, or more functions are part of the
probe, the place holder will be passed to the init() function with the place
holder data that it was initialized to the last time.
Then this place_holder is passed to each of the other probe_ops functions,
where it can be used in the function callback. When the probe_ops free()
function is called, it can be called either with the rip of the function
that is being removed from the probe, or zero, indicating that there are no
more functions attached to the probe, and the place holder is about to be
freed. This gives the probe_ops a way to free the data it assigned to the
place holder if it was allocade during the first init call.
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-04-20 02:39:44 +00:00
|
|
|
if (WARN_ON_ONCE(!edata))
|
2017-04-04 00:58:35 +00:00
|
|
|
return;
|
2013-03-12 23:35:13 +00:00
|
|
|
|
tracing/ftrace: Add a better way to pass data via the probe functions
With the redesign of the registration and execution of the function probes
(triggers), data can now be passed from the setup of the probe to the probe
callers that are specific to the trace_array it is on. Although, all probes
still only affect the toplevel trace array, this change will allow for
instances to have their own probes separated from other instances and the
top array.
That is, something like the stacktrace probe can be set to trace only in an
instance and not the toplevel trace array. This isn't implement yet, but
this change sets the ground work for the change.
When a probe callback is triggered (someone writes the probe format into
set_ftrace_filter), it calls register_ftrace_function_probe() passing in
init_data that will be used to initialize the probe. Then for every matching
function, register_ftrace_function_probe() will call the probe_ops->init()
function with the init data that was passed to it, as well as an address to
a place holder that is associated with the probe and the instance. The first
occurrence will have a NULL in the pointer. The init() function will then
initialize it. If other probes are added, or more functions are part of the
probe, the place holder will be passed to the init() function with the place
holder data that it was initialized to the last time.
Then this place_holder is passed to each of the other probe_ops functions,
where it can be used in the function callback. When the probe_ops free()
function is called, it can be called either with the rip of the function
that is being removed from the probe, or zero, indicating that there are no
more functions attached to the probe, and the place holder is about to be
freed. This gives the probe_ops a way to free the data it assigned to the
place holder if it was allocade during the first init call.
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-04-20 02:39:44 +00:00
|
|
|
if (WARN_ON_ONCE(edata->ref <= 0))
|
2013-03-12 23:35:13 +00:00
|
|
|
return;
|
|
|
|
|
tracing/ftrace: Add a better way to pass data via the probe functions
With the redesign of the registration and execution of the function probes
(triggers), data can now be passed from the setup of the probe to the probe
callers that are specific to the trace_array it is on. Although, all probes
still only affect the toplevel trace array, this change will allow for
instances to have their own probes separated from other instances and the
top array.
That is, something like the stacktrace probe can be set to trace only in an
instance and not the toplevel trace array. This isn't implement yet, but
this change sets the ground work for the change.
When a probe callback is triggered (someone writes the probe format into
set_ftrace_filter), it calls register_ftrace_function_probe() passing in
init_data that will be used to initialize the probe. Then for every matching
function, register_ftrace_function_probe() will call the probe_ops->init()
function with the init data that was passed to it, as well as an address to
a place holder that is associated with the probe and the instance. The first
occurrence will have a NULL in the pointer. The init() function will then
initialize it. If other probes are added, or more functions are part of the
probe, the place holder will be passed to the init() function with the place
holder data that it was initialized to the last time.
Then this place_holder is passed to each of the other probe_ops functions,
where it can be used in the function callback. When the probe_ops free()
function is called, it can be called either with the rip of the function
that is being removed from the probe, or zero, indicating that there are no
more functions attached to the probe, and the place holder is about to be
freed. This gives the probe_ops a way to free the data it assigned to the
place holder if it was allocade during the first init call.
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-04-20 02:39:44 +00:00
|
|
|
free_probe_data(edata);
|
2013-03-12 23:35:13 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static struct ftrace_probe_ops event_enable_probe_ops = {
|
|
|
|
.func = event_enable_probe,
|
|
|
|
.print = event_enable_print,
|
|
|
|
.init = event_enable_init,
|
|
|
|
.free = event_enable_free,
|
|
|
|
};
|
|
|
|
|
|
|
|
static struct ftrace_probe_ops event_enable_count_probe_ops = {
|
|
|
|
.func = event_enable_count_probe,
|
|
|
|
.print = event_enable_print,
|
|
|
|
.init = event_enable_init,
|
|
|
|
.free = event_enable_free,
|
|
|
|
};
|
|
|
|
|
|
|
|
static struct ftrace_probe_ops event_disable_probe_ops = {
|
|
|
|
.func = event_enable_probe,
|
|
|
|
.print = event_enable_print,
|
|
|
|
.init = event_enable_init,
|
|
|
|
.free = event_enable_free,
|
|
|
|
};
|
|
|
|
|
|
|
|
static struct ftrace_probe_ops event_disable_count_probe_ops = {
|
|
|
|
.func = event_enable_count_probe,
|
|
|
|
.print = event_enable_print,
|
|
|
|
.init = event_enable_init,
|
|
|
|
.free = event_enable_free,
|
|
|
|
};
|
|
|
|
|
|
|
|
static int
|
2017-04-05 17:12:55 +00:00
|
|
|
event_enable_func(struct trace_array *tr, struct ftrace_hash *hash,
|
2013-03-12 23:35:13 +00:00
|
|
|
char *glob, char *cmd, char *param, int enabled)
|
|
|
|
{
|
2015-05-05 14:09:53 +00:00
|
|
|
struct trace_event_file *file;
|
2013-03-12 23:35:13 +00:00
|
|
|
struct ftrace_probe_ops *ops;
|
|
|
|
struct event_probe_data *data;
|
|
|
|
const char *system;
|
|
|
|
const char *event;
|
|
|
|
char *number;
|
|
|
|
bool enable;
|
|
|
|
int ret;
|
|
|
|
|
2014-06-05 22:35:17 +00:00
|
|
|
if (!tr)
|
|
|
|
return -ENODEV;
|
|
|
|
|
2013-03-12 23:35:13 +00:00
|
|
|
/* hash funcs only work with set_ftrace_filter */
|
2013-05-24 07:22:17 +00:00
|
|
|
if (!enabled || !param)
|
2013-03-12 23:35:13 +00:00
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
system = strsep(¶m, ":");
|
|
|
|
if (!param)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
event = strsep(¶m, ":");
|
|
|
|
|
|
|
|
mutex_lock(&event_mutex);
|
|
|
|
|
|
|
|
ret = -EINVAL;
|
|
|
|
file = find_event_file(tr, system, event);
|
|
|
|
if (!file)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
enable = strcmp(cmd, ENABLE_EVENT_STR) == 0;
|
|
|
|
|
|
|
|
if (enable)
|
|
|
|
ops = param ? &event_enable_count_probe_ops : &event_enable_probe_ops;
|
|
|
|
else
|
|
|
|
ops = param ? &event_disable_count_probe_ops : &event_disable_probe_ops;
|
|
|
|
|
|
|
|
if (glob[0] == '!') {
|
2017-04-18 18:50:39 +00:00
|
|
|
ret = unregister_ftrace_function_probe_func(glob+1, tr, ops);
|
2013-03-12 23:35:13 +00:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = -ENOMEM;
|
2017-04-04 00:58:35 +00:00
|
|
|
|
2013-03-12 23:35:13 +00:00
|
|
|
data = kzalloc(sizeof(*data), GFP_KERNEL);
|
|
|
|
if (!data)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
data->enable = enable;
|
|
|
|
data->count = -1;
|
|
|
|
data->file = file;
|
|
|
|
|
|
|
|
if (!param)
|
|
|
|
goto out_reg;
|
|
|
|
|
|
|
|
number = strsep(¶m, ":");
|
|
|
|
|
|
|
|
ret = -EINVAL;
|
|
|
|
if (!strlen(number))
|
|
|
|
goto out_free;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We use the callback data field (which is a pointer)
|
|
|
|
* as our counter.
|
|
|
|
*/
|
|
|
|
ret = kstrtoul(number, 0, &data->count);
|
|
|
|
if (ret)
|
|
|
|
goto out_free;
|
|
|
|
|
|
|
|
out_reg:
|
|
|
|
/* Don't let event modules unload while probe registered */
|
2021-08-17 03:42:57 +00:00
|
|
|
ret = trace_event_try_get_ref(file->event_call);
|
2013-05-16 11:48:49 +00:00
|
|
|
if (!ret) {
|
|
|
|
ret = -EBUSY;
|
2013-03-12 23:35:13 +00:00
|
|
|
goto out_free;
|
2013-05-16 11:48:49 +00:00
|
|
|
}
|
2013-03-12 23:35:13 +00:00
|
|
|
|
|
|
|
ret = __ftrace_event_enable_disable(file, 1, 1);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out_put;
|
2017-04-04 00:58:35 +00:00
|
|
|
|
2017-04-05 17:12:55 +00:00
|
|
|
ret = register_ftrace_function_probe(glob, tr, ops, data);
|
2013-05-09 15:30:26 +00:00
|
|
|
/*
|
|
|
|
* The above returns on success the # of functions enabled,
|
|
|
|
* but if it didn't find any functions it returns zero.
|
|
|
|
* Consider no functions a failure too.
|
|
|
|
*/
|
2013-05-09 05:44:14 +00:00
|
|
|
if (!ret) {
|
|
|
|
ret = -ENOENT;
|
2013-03-12 23:35:13 +00:00
|
|
|
goto out_disable;
|
2013-05-09 15:30:26 +00:00
|
|
|
} else if (ret < 0)
|
|
|
|
goto out_disable;
|
|
|
|
/* Just return zero, not the number of enabled functions */
|
|
|
|
ret = 0;
|
2013-03-12 23:35:13 +00:00
|
|
|
out:
|
|
|
|
mutex_unlock(&event_mutex);
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
out_disable:
|
|
|
|
__ftrace_event_enable_disable(file, 0, 1);
|
|
|
|
out_put:
|
2021-08-17 03:42:57 +00:00
|
|
|
trace_event_put_ref(file->event_call);
|
2013-03-12 23:35:13 +00:00
|
|
|
out_free:
|
|
|
|
kfree(data);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct ftrace_func_command event_enable_cmd = {
|
|
|
|
.name = ENABLE_EVENT_STR,
|
|
|
|
.func = event_enable_func,
|
|
|
|
};
|
|
|
|
|
|
|
|
static struct ftrace_func_command event_disable_cmd = {
|
|
|
|
.name = DISABLE_EVENT_STR,
|
|
|
|
.func = event_enable_func,
|
|
|
|
};
|
|
|
|
|
|
|
|
static __init int register_event_cmds(void)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = register_ftrace_command(&event_enable_cmd);
|
|
|
|
if (WARN_ON(ret < 0))
|
|
|
|
return ret;
|
|
|
|
ret = register_ftrace_command(&event_disable_cmd);
|
|
|
|
if (WARN_ON(ret < 0))
|
|
|
|
unregister_ftrace_command(&event_enable_cmd);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
/* Stub for builds without CONFIG_DYNAMIC_FTRACE: nothing to register. */
static inline int register_event_cmds(void)
{
	return 0;
}
|
|
|
|
#endif /* CONFIG_DYNAMIC_FTRACE */
|
|
|
|
|
2013-02-27 21:28:06 +00:00
|
|
|
/*
|
2020-09-24 16:40:08 +00:00
|
|
|
* The top level array and trace arrays created by boot-time tracing
|
|
|
|
* have already had its trace_event_file descriptors created in order
|
|
|
|
* to allow for early events to be recorded.
|
|
|
|
* This function is called after the tracefs has been initialized,
|
|
|
|
* and we now have to create the files associated to the events.
|
2013-02-27 21:28:06 +00:00
|
|
|
*/
|
2020-09-24 16:40:08 +00:00
|
|
|
static void __trace_early_add_event_dirs(struct trace_array *tr)
|
2013-02-27 21:28:06 +00:00
|
|
|
{
|
2015-05-05 14:09:53 +00:00
|
|
|
struct trace_event_file *file;
|
2013-02-27 21:28:06 +00:00
|
|
|
int ret;
|
|
|
|
|
|
|
|
|
|
|
|
list_for_each_entry(file, &tr->events, list) {
|
2013-07-31 17:31:35 +00:00
|
|
|
ret = event_create_dir(tr->event_dir, file);
|
2013-02-27 21:28:06 +00:00
|
|
|
if (ret < 0)
|
2014-06-07 11:43:08 +00:00
|
|
|
pr_warn("Could not create directory for event %s\n",
|
2015-05-13 18:20:14 +00:00
|
|
|
trace_event_name(file->event_call));
|
2013-02-27 21:28:06 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2020-09-24 16:40:08 +00:00
|
|
|
* For early boot up, the top trace array and the trace arrays created
|
|
|
|
* by boot-time tracing require to have a list of events that can be
|
|
|
|
* enabled. This must be done before the filesystem is set up in order
|
|
|
|
* to allow events to be traced early.
|
2013-02-27 21:28:06 +00:00
|
|
|
*/
|
2020-09-24 16:40:08 +00:00
|
|
|
void __trace_early_add_events(struct trace_array *tr)
|
2013-02-27 21:28:06 +00:00
|
|
|
{
|
2015-05-05 15:45:27 +00:00
|
|
|
struct trace_event_call *call;
|
2013-02-27 21:28:06 +00:00
|
|
|
int ret;
|
|
|
|
|
|
|
|
list_for_each_entry(call, &ftrace_events, list) {
|
|
|
|
/* Early boot up should not have any modules loaded */
|
2021-08-17 03:42:57 +00:00
|
|
|
if (!(call->flags & TRACE_EVENT_FL_DYNAMIC) &&
|
|
|
|
WARN_ON_ONCE(call->module))
|
2013-02-27 21:28:06 +00:00
|
|
|
continue;
|
|
|
|
|
|
|
|
ret = __trace_early_add_new_event(call, tr);
|
|
|
|
if (ret < 0)
|
2014-06-07 11:43:08 +00:00
|
|
|
pr_warn("Could not create early event %s\n",
|
2015-05-13 18:20:14 +00:00
|
|
|
trace_event_name(call));
|
2013-02-27 21:28:06 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-08-07 20:14:16 +00:00
|
|
|
/* Remove the event directory structure for a trace directory. */
|
|
|
|
static void
|
|
|
|
__trace_remove_event_dirs(struct trace_array *tr)
|
|
|
|
{
|
2015-05-05 14:09:53 +00:00
|
|
|
struct trace_event_file *file, *next;
|
2012-08-07 20:14:16 +00:00
|
|
|
|
2013-07-26 17:25:47 +00:00
|
|
|
list_for_each_entry_safe(file, next, &tr->events, list)
|
|
|
|
remove_event_file_dir(file);
|
2012-08-07 20:14:16 +00:00
|
|
|
}
|
|
|
|
|
2015-05-05 15:45:27 +00:00
|
|
|
static void __add_event_to_tracers(struct trace_event_call *call)
|
2012-05-04 03:09:03 +00:00
|
|
|
{
|
|
|
|
struct trace_array *tr;
|
|
|
|
|
2013-07-31 17:31:35 +00:00
|
|
|
list_for_each_entry(tr, &ftrace_trace_arrays, list)
|
|
|
|
__trace_add_new_event(call, tr);
|
2012-05-04 03:09:03 +00:00
|
|
|
}
|
|
|
|
|
2015-05-05 15:45:27 +00:00
|
|
|
/* NOTE(review): presumably the start of the built-in trace_event_call pointer array; defined outside this file - confirm. */
extern struct trace_event_call *__start_ftrace_events[];
|
|
|
|
/* NOTE(review): presumably the end marker of the built-in trace_event_call pointer array; defined outside this file - confirm. */
extern struct trace_event_call *__stop_ftrace_events[];
|
2009-04-10 17:52:20 +00:00
|
|
|
|
2009-07-01 02:47:05 +00:00
|
|
|
/* Copy of the "trace_event=" boot parameter string, filled in by setup_trace_event(). */
static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
|
|
|
|
|
|
|
|
static __init int setup_trace_event(char *str)
|
|
|
|
{
|
2023-05-16 14:39:56 +00:00
|
|
|
strscpy(bootup_event_buf, str, COMMAND_LINE_SIZE);
|
2023-09-06 09:18:37 +00:00
|
|
|
trace_set_ring_buffer_expanded(NULL);
|
2020-12-08 08:54:09 +00:00
|
|
|
disable_tracing_selftest("running event tracing");
|
2009-07-01 02:47:05 +00:00
|
|
|
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
__setup("trace_event=", setup_trace_event);
|
|
|
|
|
eventfs: Remove eventfs_file and just use eventfs_inode
Instead of having a descriptor for every file represented in the eventfs
directory, only have the directory itself represented. Change the API to
send in a list of entries that represent all the files in the directory
(but not other directories). The entry list contains a name and a callback
function that will be used to create the files when they are accessed.
struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent,
const struct eventfs_entry *entries,
int size, void *data);
is used for the top level eventfs directory, and returns an eventfs_inode
that will be used by:
struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent,
const struct eventfs_entry *entries,
int size, void *data);
where both of the above take an array of struct eventfs_entry entries for
every file that is in the directory.
The entries are defined by:
typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data,
const struct file_operations **fops);
struct eventfs_entry {
const char *name;
eventfs_callback callback;
};
Where the name is the name of the file and the callback gets called when
the file is being created. The callback passes in the name (in case the
same callback is used for multiple files), a pointer to the mode, data and
fops. The data will be pointing to the data that was passed in
eventfs_create_dir() or eventfs_create_events_dir() but may be overridden
to point to something else, as it will be used to point to the
inode->i_private that is created. The information passed back from the
callback is used to create the dentry/inode.
If the callback fills the data and the file should be created, it must
return a positive number. On zero or negative, the file is ignored.
This logic may also be used as a prototype to convert entire pseudo file
systems into just-in-time allocation.
The "show_events_dentry" file has been updated to show the directories,
and any files they have.
With just the eventfs_file allocations:
Before after deltas for meminfo (in kB):
MemFree: -14360
MemAvailable: -14260
Buffers: 40
Cached: 24
Active: 44
Inactive: 48
Inactive(anon): 28
Active(file): 44
Inactive(file): 20
Dirty: -4
AnonPages: 28
Mapped: 4
KReclaimable: 132
Slab: 1604
SReclaimable: 132
SUnreclaim: 1472
Committed_AS: 12
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
ext4_inode_cache 27 [* 1184 = 31968 ]
extent_status 102 [* 40 = 4080 ]
tracefs_inode_cache 144 [* 656 = 94464 ]
buffer_head 39 [* 104 = 4056 ]
shmem_inode_cache 49 [* 800 = 39200 ]
filp -53 [* 256 = -13568 ]
dentry 251 [* 192 = 48192 ]
lsm_file_cache 277 [* 32 = 8864 ]
vm_area_struct -14 [* 184 = -2576 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k 35 [* 1024 = 35840 ]
kmalloc-256 49 [* 256 = 12544 ]
kmalloc-192 -28 [* 192 = -5376 ]
kmalloc-128 -30 [* 128 = -3840 ]
kmalloc-96 10581 [* 96 = 1015776 ]
kmalloc-64 3056 [* 64 = 195584 ]
kmalloc-32 1291 [* 32 = 41312 ]
kmalloc-16 2310 [* 16 = 36960 ]
kmalloc-8 9216 [* 8 = 73728 ]
Free memory dropped by 14,360 kB
Available memory dropped by 14,260 kB
Total slab additions in size: 1,771,032 bytes
With this change:
Before after deltas for meminfo (in kB):
MemFree: -12084
MemAvailable: -11976
Buffers: 32
Cached: 32
Active: 72
Inactive: 168
Inactive(anon): 176
Active(file): 72
Inactive(file): -8
Dirty: 24
AnonPages: 196
Mapped: 8
KReclaimable: 148
Slab: 836
SReclaimable: 148
SUnreclaim: 688
Committed_AS: 324
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
tracefs_inode_cache 144 [* 656 = 94464 ]
shmem_inode_cache -23 [* 800 = -18400 ]
filp -92 [* 256 = -23552 ]
dentry 179 [* 192 = 34368 ]
lsm_file_cache -3 [* 32 = -96 ]
vm_area_struct -13 [* 184 = -2392 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k -49 [* 1024 = -50176 ]
kmalloc-256 -27 [* 256 = -6912 ]
kmalloc-128 1864 [* 128 = 238592 ]
kmalloc-64 4685 [* 64 = 299840 ]
kmalloc-32 -72 [* 32 = -2304 ]
kmalloc-16 256 [* 16 = 4096 ]
total = 721352
Free memory dropped by 12,084 kB
Available memory dropped by 11,976 kB
Total slab additions in size: 721,352 bytes
That's over 2 MB in savings per instance for free and available memory,
and over 1 MB in savings per instance of slab memory.
Link: https://lore.kernel.org/linux-trace-kernel/20231003184059.4924468e@gandalf.local.home
Link: https://lore.kernel.org/linux-trace-kernel/20231004165007.43d79161@gandalf.local.home
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ajay Kaher <akaher@vmware.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-10-04 20:50:07 +00:00
|
|
|
/*
 * eventfs callback for the files that live directly in the top level
 * "events" directory.
 *
 * @name selects which file is being instantiated.  For a known name the
 * permission bits are stored in @mode and the file operations in @fops.
 * @data is never modified here.
 *
 * Returns 1 when @name was recognized (the file should be created) and
 * 0 when it was not (the file is ignored).
 */
static int events_callback(const char *name, umode_t *mode, void **data,
			   const struct file_operations **fops)
{
	if (!strcmp(name, "enable")) {
		*mode = TRACE_MODE_WRITE;
		*fops = &ftrace_tr_enable_fops;
		return 1;
	}

	if (!strcmp(name, "header_page")) {
		*mode = TRACE_MODE_READ;
		*fops = &ftrace_show_header_page_fops;
		return 1;
	}

	if (!strcmp(name, "header_event")) {
		*mode = TRACE_MODE_READ;
		*fops = &ftrace_show_header_event_fops;
		return 1;
	}

	/* Unknown name: report that nothing should be created. */
	return 0;
}
|
|
|
|
|
2013-02-27 21:28:06 +00:00
|
|
|
/* Expects to have event_mutex held when called */
|
|
|
|
static int
|
|
|
|
create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
|
2012-05-04 03:09:03 +00:00
|
|
|
{
|
eventfs: Remove eventfs_file and just use eventfs_inode
Instead of having a descriptor for every file represented in the eventfs
directory, only have the directory itself represented. Change the API to
send in a list of entries that represent all the files in the directory
(but not other directories). The entry list contains a name and a callback
function that will be used to create the files when they are accessed.
struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent,
const struct eventfs_entry *entries,
int size, void *data);
is used for the top level eventfs directory, and returns an eventfs_inode
that will be used by:
struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent,
const struct eventfs_entry *entries,
int size, void *data);
where both of the above take an array of struct eventfs_entry entries for
every file that is in the directory.
The entries are defined by:
typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data,
const struct file_operations **fops);
struct eventfs_entry {
const char *name;
eventfs_callback callback;
};
Where the name is the name of the file and the callback gets called when
the file is being created. The callback passes in the name (in case the
same callback is used for multiple files), a pointer to the mode, data and
fops. The data will be pointing to the data that was passed in
eventfs_create_dir() or eventfs_create_events_dir() but may be overridden
to point to something else, as it will be used to point to the
inode->i_private that is created. The information passed back from the
callback is used to create the dentry/inode.
If the callback fills the data and the file should be created, it must
return a positive number. On zero or negative, the file is ignored.
This logic may also be used as a prototype to convert entire pseudo file
systems into just-in-time allocation.
The "show_events_dentry" file has been updated to show the directories,
and any files they have.
With just the eventfs_file allocations:
Before after deltas for meminfo (in kB):
MemFree: -14360
MemAvailable: -14260
Buffers: 40
Cached: 24
Active: 44
Inactive: 48
Inactive(anon): 28
Active(file): 44
Inactive(file): 20
Dirty: -4
AnonPages: 28
Mapped: 4
KReclaimable: 132
Slab: 1604
SReclaimable: 132
SUnreclaim: 1472
Committed_AS: 12
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
ext4_inode_cache 27 [* 1184 = 31968 ]
extent_status 102 [* 40 = 4080 ]
tracefs_inode_cache 144 [* 656 = 94464 ]
buffer_head 39 [* 104 = 4056 ]
shmem_inode_cache 49 [* 800 = 39200 ]
filp -53 [* 256 = -13568 ]
dentry 251 [* 192 = 48192 ]
lsm_file_cache 277 [* 32 = 8864 ]
vm_area_struct -14 [* 184 = -2576 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k 35 [* 1024 = 35840 ]
kmalloc-256 49 [* 256 = 12544 ]
kmalloc-192 -28 [* 192 = -5376 ]
kmalloc-128 -30 [* 128 = -3840 ]
kmalloc-96 10581 [* 96 = 1015776 ]
kmalloc-64 3056 [* 64 = 195584 ]
kmalloc-32 1291 [* 32 = 41312 ]
kmalloc-16 2310 [* 16 = 36960 ]
kmalloc-8 9216 [* 8 = 73728 ]
Free memory dropped by 14,360 kB
Available memory dropped by 14,260 kB
Total slab additions in size: 1,771,032 bytes
With this change:
Before after deltas for meminfo (in kB):
MemFree: -12084
MemAvailable: -11976
Buffers: 32
Cached: 32
Active: 72
Inactive: 168
Inactive(anon): 176
Active(file): 72
Inactive(file): -8
Dirty: 24
AnonPages: 196
Mapped: 8
KReclaimable: 148
Slab: 836
SReclaimable: 148
SUnreclaim: 688
Committed_AS: 324
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
tracefs_inode_cache 144 [* 656 = 94464 ]
shmem_inode_cache -23 [* 800 = -18400 ]
filp -92 [* 256 = -23552 ]
dentry 179 [* 192 = 34368 ]
lsm_file_cache -3 [* 32 = -96 ]
vm_area_struct -13 [* 184 = -2392 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k -49 [* 1024 = -50176 ]
kmalloc-256 -27 [* 256 = -6912 ]
kmalloc-128 1864 [* 128 = 238592 ]
kmalloc-64 4685 [* 64 = 299840 ]
kmalloc-32 -72 [* 32 = -2304 ]
kmalloc-16 256 [* 16 = 4096 ]
total = 721352
Free memory dropped by 12,084 kB
Available memory dropped by 11,976 kB
Total slab additions in size: 721,352 bytes
That's over 2 MB in savings per instance for free and available memory,
and over 1 MB in savings per instance of slab memory.
Link: https://lore.kernel.org/linux-trace-kernel/20231003184059.4924468e@gandalf.local.home
Link: https://lore.kernel.org/linux-trace-kernel/20231004165007.43d79161@gandalf.local.home
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ajay Kaher <akaher@vmware.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-10-04 20:50:07 +00:00
|
|
|
struct eventfs_inode *e_events;
|
2012-05-04 03:09:03 +00:00
|
|
|
struct dentry *entry;
|
eventfs: Remove eventfs_file and just use eventfs_inode
Instead of having a descriptor for every file represented in the eventfs
directory, only have the directory itself represented. Change the API to
send in a list of entries that represent all the files in the directory
(but not other directories). The entry list contains a name and a callback
function that will be used to create the files when they are accessed.
struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent,
const struct eventfs_entry *entries,
int size, void *data);
is used for the top level eventfs directory, and returns an eventfs_inode
that will be used by:
struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent,
const struct eventfs_entry *entries,
int size, void *data);
where both of the above take an array of struct eventfs_entry entries for
every file that is in the directory.
The entries are defined by:
typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data,
const struct file_operations **fops);
struct eventfs_entry {
const char *name;
eventfs_callback callback;
};
Where the name is the name of the file and the callback gets called when
the file is being created. The callback passes in the name (in case the
same callback is used for multiple files), a pointer to the mode, data and
fops. The data will be pointing to the data that was passed in
eventfs_create_dir() or eventfs_create_events_dir() but may be overridden
to point to something else, as it will be used to point to the
inode->i_private that is created. The information passed back from the
callback is used to create the dentry/inode.
If the callback fills the data and the file should be created, it must
return a positive number. On zero or negative, the file is ignored.
This logic may also be used as a prototype to convert entire pseudo file
systems into just-in-time allocation.
The "show_events_dentry" file has been updated to show the directories,
and any files they have.
With just the eventfs_file allocations:
Before after deltas for meminfo (in kB):
MemFree: -14360
MemAvailable: -14260
Buffers: 40
Cached: 24
Active: 44
Inactive: 48
Inactive(anon): 28
Active(file): 44
Inactive(file): 20
Dirty: -4
AnonPages: 28
Mapped: 4
KReclaimable: 132
Slab: 1604
SReclaimable: 132
SUnreclaim: 1472
Committed_AS: 12
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
ext4_inode_cache 27 [* 1184 = 31968 ]
extent_status 102 [* 40 = 4080 ]
tracefs_inode_cache 144 [* 656 = 94464 ]
buffer_head 39 [* 104 = 4056 ]
shmem_inode_cache 49 [* 800 = 39200 ]
filp -53 [* 256 = -13568 ]
dentry 251 [* 192 = 48192 ]
lsm_file_cache 277 [* 32 = 8864 ]
vm_area_struct -14 [* 184 = -2576 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k 35 [* 1024 = 35840 ]
kmalloc-256 49 [* 256 = 12544 ]
kmalloc-192 -28 [* 192 = -5376 ]
kmalloc-128 -30 [* 128 = -3840 ]
kmalloc-96 10581 [* 96 = 1015776 ]
kmalloc-64 3056 [* 64 = 195584 ]
kmalloc-32 1291 [* 32 = 41312 ]
kmalloc-16 2310 [* 16 = 36960 ]
kmalloc-8 9216 [* 8 = 73728 ]
Free memory dropped by 14,360 kB
Available memory dropped by 14,260 kB
Total slab additions in size: 1,771,032 bytes
With this change:
Before after deltas for meminfo (in kB):
MemFree: -12084
MemAvailable: -11976
Buffers: 32
Cached: 32
Active: 72
Inactive: 168
Inactive(anon): 176
Active(file): 72
Inactive(file): -8
Dirty: 24
AnonPages: 196
Mapped: 8
KReclaimable: 148
Slab: 836
SReclaimable: 148
SUnreclaim: 688
Committed_AS: 324
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
tracefs_inode_cache 144 [* 656 = 94464 ]
shmem_inode_cache -23 [* 800 = -18400 ]
filp -92 [* 256 = -23552 ]
dentry 179 [* 192 = 34368 ]
lsm_file_cache -3 [* 32 = -96 ]
vm_area_struct -13 [* 184 = -2392 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k -49 [* 1024 = -50176 ]
kmalloc-256 -27 [* 256 = -6912 ]
kmalloc-128 1864 [* 128 = 238592 ]
kmalloc-64 4685 [* 64 = 299840 ]
kmalloc-32 -72 [* 32 = -2304 ]
kmalloc-16 256 [* 16 = 4096 ]
total = 721352
Free memory dropped by 12,084 kB
Available memory dropped by 11,976 kB
Total slab additions in size: 721,352 bytes
That's over 2 MB in savings per instance for free and available memory,
and over 1 MB in savings per instance of slab memory.
Link: https://lore.kernel.org/linux-trace-kernel/20231003184059.4924468e@gandalf.local.home
Link: https://lore.kernel.org/linux-trace-kernel/20231004165007.43d79161@gandalf.local.home
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ajay Kaher <akaher@vmware.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-10-04 20:50:07 +00:00
|
|
|
int nr_entries;
|
|
|
|
static struct eventfs_entry events_entries[] = {
|
|
|
|
{
|
|
|
|
.name = "enable",
|
|
|
|
.callback = events_callback,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.name = "header_page",
|
|
|
|
.callback = events_callback,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.name = "header_event",
|
|
|
|
.callback = events_callback,
|
|
|
|
},
|
|
|
|
};
|
2012-05-04 03:09:03 +00:00
|
|
|
|
2022-01-14 13:10:52 +00:00
|
|
|
entry = trace_create_file("set_event", TRACE_MODE_WRITE, parent,
|
|
|
|
tr, &ftrace_set_event_fops);
|
|
|
|
if (!entry)
|
2012-05-04 03:09:03 +00:00
|
|
|
return -ENOMEM;
|
|
|
|
|
eventfs: Remove eventfs_file and just use eventfs_inode
Instead of having a descriptor for every file represented in the eventfs
directory, only have the directory itself represented. Change the API to
send in a list of entries that represent all the files in the directory
(but not other directories). The entry list contains a name and a callback
function that will be used to create the files when they are accessed.
struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent,
const struct eventfs_entry *entries,
int size, void *data);
is used for the top level eventfs directory, and returns an eventfs_inode
that will be used by:
struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent,
const struct eventfs_entry *entries,
int size, void *data);
where both of the above take an array of struct eventfs_entry entries for
every file that is in the directory.
The entries are defined by:
typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data,
const struct file_operations **fops);
struct eventfs_entry {
const char *name;
eventfs_callback callback;
};
Where the name is the name of the file and the callback gets called when
the file is being created. The callback passes in the name (in case the
same callback is used for multiple files), a pointer to the mode, data and
fops. The data will be pointing to the data that was passed in
eventfs_create_dir() or eventfs_create_events_dir() but may be overridden
to point to something else, as it will be used to point to the
inode->i_private that is created. The information passed back from the
callback is used to create the dentry/inode.
If the callback fills the data and the file should be created, it must
return a positive number. On zero or negative, the file is ignored.
This logic may also be used as a prototype to convert entire pseudo file
systems into just-in-time allocation.
The "show_events_dentry" file has been updated to show the directories,
and any files they have.
With just the eventfs_file allocations:
Before after deltas for meminfo (in kB):
MemFree: -14360
MemAvailable: -14260
Buffers: 40
Cached: 24
Active: 44
Inactive: 48
Inactive(anon): 28
Active(file): 44
Inactive(file): 20
Dirty: -4
AnonPages: 28
Mapped: 4
KReclaimable: 132
Slab: 1604
SReclaimable: 132
SUnreclaim: 1472
Committed_AS: 12
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
ext4_inode_cache 27 [* 1184 = 31968 ]
extent_status 102 [* 40 = 4080 ]
tracefs_inode_cache 144 [* 656 = 94464 ]
buffer_head 39 [* 104 = 4056 ]
shmem_inode_cache 49 [* 800 = 39200 ]
filp -53 [* 256 = -13568 ]
dentry 251 [* 192 = 48192 ]
lsm_file_cache 277 [* 32 = 8864 ]
vm_area_struct -14 [* 184 = -2576 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k 35 [* 1024 = 35840 ]
kmalloc-256 49 [* 256 = 12544 ]
kmalloc-192 -28 [* 192 = -5376 ]
kmalloc-128 -30 [* 128 = -3840 ]
kmalloc-96 10581 [* 96 = 1015776 ]
kmalloc-64 3056 [* 64 = 195584 ]
kmalloc-32 1291 [* 32 = 41312 ]
kmalloc-16 2310 [* 16 = 36960 ]
kmalloc-8 9216 [* 8 = 73728 ]
Free memory dropped by 14,360 kB
Available memory dropped by 14,260 kB
Total slab additions in size: 1,771,032 bytes
With this change:
Before after deltas for meminfo (in kB):
MemFree: -12084
MemAvailable: -11976
Buffers: 32
Cached: 32
Active: 72
Inactive: 168
Inactive(anon): 176
Active(file): 72
Inactive(file): -8
Dirty: 24
AnonPages: 196
Mapped: 8
KReclaimable: 148
Slab: 836
SReclaimable: 148
SUnreclaim: 688
Committed_AS: 324
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
tracefs_inode_cache 144 [* 656 = 94464 ]
shmem_inode_cache -23 [* 800 = -18400 ]
filp -92 [* 256 = -23552 ]
dentry 179 [* 192 = 34368 ]
lsm_file_cache -3 [* 32 = -96 ]
vm_area_struct -13 [* 184 = -2392 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k -49 [* 1024 = -50176 ]
kmalloc-256 -27 [* 256 = -6912 ]
kmalloc-128 1864 [* 128 = 238592 ]
kmalloc-64 4685 [* 64 = 299840 ]
kmalloc-32 -72 [* 32 = -2304 ]
kmalloc-16 256 [* 16 = 4096 ]
total = 721352
Free memory dropped by 12,084 kB
Available memory dropped by 11,976 kB
Total slab additions in size: 721,352 bytes
That's over 2 MB in savings per instance for free and available memory,
and over 1 MB in savings per instance of slab memory.
Link: https://lore.kernel.org/linux-trace-kernel/20231003184059.4924468e@gandalf.local.home
Link: https://lore.kernel.org/linux-trace-kernel/20231004165007.43d79161@gandalf.local.home
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ajay Kaher <akaher@vmware.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-10-04 20:50:07 +00:00
|
|
|
nr_entries = ARRAY_SIZE(events_entries);
|
|
|
|
|
|
|
|
e_events = eventfs_create_events_dir("events", parent, events_entries,
|
|
|
|
nr_entries, tr);
|
|
|
|
if (IS_ERR(e_events)) {
|
2015-01-20 17:13:40 +00:00
|
|
|
pr_warn("Could not create tracefs 'events' directory\n");
|
2012-08-03 20:10:49 +00:00
|
|
|
return -ENOMEM;
|
|
|
|
}
|
2012-05-04 03:09:03 +00:00
|
|
|
|
2016-11-22 23:32:03 +00:00
|
|
|
/* There are not as crucial, just warn if they are not created */
|
|
|
|
|
2022-01-14 13:10:52 +00:00
|
|
|
trace_create_file("set_event_pid", TRACE_MODE_WRITE, parent,
|
|
|
|
tr, &ftrace_set_event_pid_fops);
|
2015-09-24 15:33:26 +00:00
|
|
|
|
2022-01-14 13:10:52 +00:00
|
|
|
trace_create_file("set_event_notrace_pid",
|
|
|
|
TRACE_MODE_WRITE, parent, tr,
|
|
|
|
&ftrace_set_event_notrace_pid_fops);
|
2020-03-25 23:51:19 +00:00
|
|
|
|
eventfs: Remove eventfs_file and just use eventfs_inode
Instead of having a descriptor for every file represented in the eventfs
directory, only have the directory itself represented. Change the API to
send in a list of entries that represent all the files in the directory
(but not other directories). The entry list contains a name and a callback
function that will be used to create the files when they are accessed.
struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent,
const struct eventfs_entry *entries,
int size, void *data);
is used for the top level eventfs directory, and returns an eventfs_inode
that will be used by:
struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent,
const struct eventfs_entry *entries,
int size, void *data);
where both of the above take an array of struct eventfs_entry entries for
every file that is in the directory.
The entries are defined by:
typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data,
const struct file_operations **fops);
struct eventfs_entry {
const char *name;
eventfs_callback callback;
};
Where the name is the name of the file and the callback gets called when
the file is being created. The callback passes in the name (in case the
same callback is used for multiple files), a pointer to the mode, data and
fops. The data will be pointing to the data that was passed in
eventfs_create_dir() or eventfs_create_events_dir() but may be overridden
to point to something else, as it will be used to point to the
inode->i_private that is created. The information passed back from the
callback is used to create the dentry/inode.
If the callback fills the data and the file should be created, it must
return a positive number. On zero or negative, the file is ignored.
This logic may also be used as a prototype to convert entire pseudo file
systems into just-in-time allocation.
The "show_events_dentry" file has been updated to show the directories,
and any files they have.
With just the eventfs_file allocations:
Before after deltas for meminfo (in kB):
MemFree: -14360
MemAvailable: -14260
Buffers: 40
Cached: 24
Active: 44
Inactive: 48
Inactive(anon): 28
Active(file): 44
Inactive(file): 20
Dirty: -4
AnonPages: 28
Mapped: 4
KReclaimable: 132
Slab: 1604
SReclaimable: 132
SUnreclaim: 1472
Committed_AS: 12
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
ext4_inode_cache 27 [* 1184 = 31968 ]
extent_status 102 [* 40 = 4080 ]
tracefs_inode_cache 144 [* 656 = 94464 ]
buffer_head 39 [* 104 = 4056 ]
shmem_inode_cache 49 [* 800 = 39200 ]
filp -53 [* 256 = -13568 ]
dentry 251 [* 192 = 48192 ]
lsm_file_cache 277 [* 32 = 8864 ]
vm_area_struct -14 [* 184 = -2576 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k 35 [* 1024 = 35840 ]
kmalloc-256 49 [* 256 = 12544 ]
kmalloc-192 -28 [* 192 = -5376 ]
kmalloc-128 -30 [* 128 = -3840 ]
kmalloc-96 10581 [* 96 = 1015776 ]
kmalloc-64 3056 [* 64 = 195584 ]
kmalloc-32 1291 [* 32 = 41312 ]
kmalloc-16 2310 [* 16 = 36960 ]
kmalloc-8 9216 [* 8 = 73728 ]
Free memory dropped by 14,360 kB
Available memory dropped by 14,260 kB
Total slab additions in size: 1,771,032 bytes
With this change:
Before after deltas for meminfo (in kB):
MemFree: -12084
MemAvailable: -11976
Buffers: 32
Cached: 32
Active: 72
Inactive: 168
Inactive(anon): 176
Active(file): 72
Inactive(file): -8
Dirty: 24
AnonPages: 196
Mapped: 8
KReclaimable: 148
Slab: 836
SReclaimable: 148
SUnreclaim: 688
Committed_AS: 324
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
tracefs_inode_cache 144 [* 656 = 94464 ]
shmem_inode_cache -23 [* 800 = -18400 ]
filp -92 [* 256 = -23552 ]
dentry 179 [* 192 = 34368 ]
lsm_file_cache -3 [* 32 = -96 ]
vm_area_struct -13 [* 184 = -2392 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k -49 [* 1024 = -50176 ]
kmalloc-256 -27 [* 256 = -6912 ]
kmalloc-128 1864 [* 128 = 238592 ]
kmalloc-64 4685 [* 64 = 299840 ]
kmalloc-32 -72 [* 32 = -2304 ]
kmalloc-16 256 [* 16 = 4096 ]
total = 721352
Free memory dropped by 12,084 kB
Available memory dropped by 11,976 kB
Total slab additions in size: 721,352 bytes
That's over 2 MB in savings per instance for free and available memory,
and over 1 MB in savings per instance of slab memory.
Link: https://lore.kernel.org/linux-trace-kernel/20231003184059.4924468e@gandalf.local.home
Link: https://lore.kernel.org/linux-trace-kernel/20231004165007.43d79161@gandalf.local.home
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ajay Kaher <akaher@vmware.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-10-04 20:50:07 +00:00
|
|
|
tr->event_dir = e_events;
|
2013-02-27 21:28:06 +00:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* event_trace_add_tracer - add a instance of a trace_array to events
|
|
|
|
* @parent: The parent dentry to place the files/directories for events in
|
|
|
|
* @tr: The trace array associated with these events
|
|
|
|
*
|
|
|
|
* When a new instance is created, it needs to set up its events
|
|
|
|
* directory, as well as other files associated with events. It also
|
2020-10-29 15:05:54 +00:00
|
|
|
* creates the event hierarchy in the @parent/events directory.
|
2013-02-27 21:28:06 +00:00
|
|
|
*
|
|
|
|
* Returns 0 on success.
|
2017-09-21 20:22:49 +00:00
|
|
|
*
|
|
|
|
* Must be called with event_mutex held.
|
2013-02-27 21:28:06 +00:00
|
|
|
*/
|
|
|
|
int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
2017-09-21 20:22:49 +00:00
|
|
|
lockdep_assert_held(&event_mutex);
|
2013-02-27 21:28:06 +00:00
|
|
|
|
|
|
|
ret = create_event_toplevel_files(parent, tr);
|
|
|
|
if (ret)
|
2017-09-21 20:22:49 +00:00
|
|
|
goto out;
|
2013-02-27 21:28:06 +00:00
|
|
|
|
2013-03-11 07:14:03 +00:00
|
|
|
down_write(&trace_event_sem);
|
2020-09-24 16:40:08 +00:00
|
|
|
/* If tr already has the event list, it is initialized in early boot. */
|
|
|
|
if (unlikely(!list_empty(&tr->events)))
|
|
|
|
__trace_early_add_event_dirs(tr);
|
|
|
|
else
|
|
|
|
__trace_add_event_dirs(tr);
|
2013-03-11 07:14:03 +00:00
|
|
|
up_write(&trace_event_sem);
|
2012-08-03 20:10:49 +00:00
|
|
|
|
2017-09-21 20:22:49 +00:00
|
|
|
out:
|
2013-02-27 21:28:06 +00:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The top trace array already had its file descriptors created.
|
|
|
|
* Now the files themselves need to be created.
|
|
|
|
*/
|
|
|
|
static __init int
|
|
|
|
early_event_add_tracer(struct dentry *parent, struct trace_array *tr)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
mutex_lock(&event_mutex);
|
|
|
|
|
|
|
|
ret = create_event_toplevel_files(parent, tr);
|
|
|
|
if (ret)
|
|
|
|
goto out_unlock;
|
|
|
|
|
2013-03-11 07:14:03 +00:00
|
|
|
down_write(&trace_event_sem);
|
2013-02-27 21:28:06 +00:00
|
|
|
__trace_early_add_event_dirs(tr);
|
2013-03-11 07:14:03 +00:00
|
|
|
up_write(&trace_event_sem);
|
2013-02-27 21:28:06 +00:00
|
|
|
|
|
|
|
out_unlock:
|
|
|
|
mutex_unlock(&event_mutex);
|
|
|
|
|
|
|
|
return ret;
|
2012-05-04 03:09:03 +00:00
|
|
|
}
|
|
|
|
|
2017-09-21 20:22:49 +00:00
|
|
|
/* Must be called with event_mutex held */
|
2012-08-07 20:14:16 +00:00
|
|
|
int event_trace_del_tracer(struct trace_array *tr)
|
|
|
|
{
|
2017-09-21 20:22:49 +00:00
|
|
|
lockdep_assert_held(&event_mutex);
|
2012-08-07 20:14:16 +00:00
|
|
|
|
tracing: Add basic event trigger framework
Add a 'trigger' file for each trace event, enabling 'trace event
triggers' to be set for trace events.
'trace event triggers' are patterned after the existing 'ftrace
function triggers' implementation except that triggers are written to
per-event 'trigger' files instead of to a single file such as the
'set_ftrace_filter' used for ftrace function triggers.
The implementation is meant to be entirely separate from ftrace
function triggers, in order to keep the respective implementations
relatively simple and to allow them to diverge.
The event trigger functionality is built on top of SOFT_DISABLE
functionality. It adds a TRIGGER_MODE bit to the ftrace_event_file
flags which is checked when any trace event fires. Triggers set for a
particular event need to be checked regardless of whether that event
is actually enabled or not - getting an event to fire even if it's not
enabled is what's already implemented by SOFT_DISABLE mode, so trigger
mode directly reuses that. Event trigger essentially inherit the soft
disable logic in __ftrace_event_enable_disable() while adding a bit of
logic and trigger reference counting via tm_ref on top of that in a
new trace_event_trigger_enable_disable() function. Because the base
__ftrace_event_enable_disable() code now needs to be invoked from
outside trace_events.c, a wrapper is also added for those usages.
The triggers for an event are actually invoked via a new function,
event_triggers_call(), and code is also added to invoke them for
ftrace_raw_event calls as well as syscall events.
The main part of the patch creates a new trace_events_trigger.c file
to contain the trace event triggers implementation.
The standard open, read, and release file operations are implemented
here.
The open() implementation sets up for the various open modes of the
'trigger' file. It creates and attaches the trigger iterator and sets
up the command parser. If opened for reading set up the trigger
seq_ops.
The read() implementation parses the event trigger written to the
'trigger' file, looks up the trigger command, and passes it along to
that event_command's func() implementation for command-specific
processing.
The release() implementation does whatever cleanup is needed to
release the 'trigger' file, like releasing the parser and trigger
iterator, etc.
A couple of functions for event command registration and
unregistration are added, along with a list to add them to and a mutex
to protect them, as well as an (initially empty) registration function
to add the set of commands that will be added by future commits, and
call to it from the trace event initialization code.
also added are a couple trigger-specific data structures needed for
these implementations such as a trigger iterator and a struct for
trigger-specific data.
A couple structs consisting mostly of function meant to be implemented
in command-specific ways, event_command and event_trigger_ops, are
used by the generic event trigger command implementations. They're
being put into trace.h alongside the other trace_event data structures
and functions, in the expectation that they'll be needed in several
trace_event-related files such as trace_events_trigger.c and
trace_events.c.
The event_command.func() function is meant to be called by the trigger
parsing code in order to add a trigger instance to the corresponding
event. It essentially coordinates adding a live trigger instance to
the event, and arming the triggering the event.
Every event_command func() implementation essentially does the
same thing for any command:
- choose ops - use the value of param to choose either a number or
count version of event_trigger_ops specific to the command
- do the register or unregister of those ops
- associate a filter, if specified, with the triggering event
The reg() and unreg() ops allow command-specific implementations for
event_trigger_op registration and unregistration, and the
get_trigger_ops() op allows command-specific event_trigger_ops
selection to be parameterized. When a trigger instance is added, the
reg() op essentially adds that trigger to the triggering event and
arms it, while unreg() does the opposite. The set_filter() function
is used to associate a filter with the trigger - if the command
doesn't specify a set_filter() implementation, the command will ignore
filters.
Each command has an associated trigger_type, which serves double duty,
both as a unique identifier for the command as well as a value that
can be used for setting a trigger mode bit during trigger invocation.
The signature of func() adds a pointer to the event_command struct,
used to invoke those functions, along with a command_data param that
can be passed to the reg/unreg functions. This allows func()
implementations to use command-specific blobs and supports code
re-use.
The event_trigger_ops.func() command corrsponds to the trigger 'probe'
function that gets called when the triggering event is actually
invoked. The other functions are used to list the trigger when
needed, along with a couple mundane book-keeping functions.
This also moves event_file_data() into trace.h so it can be used
outside of trace_events.c.
Link: http://lkml.kernel.org/r/316d95061accdee070aac8e5750afba0192fa5b9.1382622043.git.tom.zanussi@linux.intel.com
Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Idea-by: Steve Rostedt <rostedt@goodmis.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2013-10-24 13:59:24 +00:00
|
|
|
/* Disable any event triggers and associated soft-disabled events */
|
|
|
|
clear_event_triggers(tr);
|
|
|
|
|
2015-09-24 15:33:26 +00:00
|
|
|
/* Clear the pid list */
|
2020-03-25 23:51:19 +00:00
|
|
|
__ftrace_clear_event_pids(tr, TRACE_PIDS | TRACE_NO_PIDS);
|
2015-09-24 15:33:26 +00:00
|
|
|
|
2013-07-02 18:48:23 +00:00
|
|
|
/* Disable any running events */
|
|
|
|
__ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0);
|
|
|
|
|
2018-08-09 19:31:48 +00:00
|
|
|
/* Make sure no more events are being executed */
|
|
|
|
tracepoint_synchronize_unregister();
|
2013-12-03 17:41:20 +00:00
|
|
|
|
2013-03-11 07:14:03 +00:00
|
|
|
down_write(&trace_event_sem);
|
2012-08-07 20:14:16 +00:00
|
|
|
__trace_remove_event_dirs(tr);
|
2023-10-05 13:13:48 +00:00
|
|
|
eventfs_remove_events_dir(tr->event_dir);
|
2013-03-11 07:14:03 +00:00
|
|
|
up_write(&trace_event_sem);
|
2012-08-07 20:14:16 +00:00
|
|
|
|
|
|
|
tr->event_dir = NULL;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2013-02-28 01:23:57 +00:00
|
|
|
static __init int event_trace_memsetup(void)
|
|
|
|
{
|
|
|
|
field_cachep = KMEM_CACHE(ftrace_event_field, SLAB_PANIC);
|
2015-05-05 14:09:53 +00:00
|
|
|
file_cachep = KMEM_CACHE(trace_event_file, SLAB_PANIC);
|
2013-02-28 01:23:57 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2023-02-07 17:28:51 +00:00
|
|
|
__init void
|
|
|
|
early_enable_events(struct trace_array *tr, char *buf, bool disable_first)
|
2015-01-14 17:53:45 +00:00
|
|
|
{
|
|
|
|
char *token;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
while (true) {
|
|
|
|
token = strsep(&buf, ",");
|
|
|
|
|
|
|
|
if (!token)
|
|
|
|
break;
|
|
|
|
|
tracing: Put back comma for empty fields in boot string parsing
Both early_enable_events() and apply_trace_boot_options() parse a boot
string that may get parsed later on. They both use strsep() which converts a
comma into a nul character. To still allow the boot string to be parsed
again the same way, the nul character gets converted back to a comma after
the token is processed.
The problem is that these two functions check for an empty parameter (two
commas in a row ",,"), and continue the loop if the parameter is empty, but
fails to place the comma back. In this case, the second parsing will end at
this blank field, and not process fields afterward.
In most cases, users should not have an empty field, but if its going to be
checked, the code might as well be correct.
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2015-11-04 03:15:14 +00:00
|
|
|
if (*token) {
|
|
|
|
/* Restarting syscalls requires that we stop them first */
|
|
|
|
if (disable_first)
|
|
|
|
ftrace_set_clr_event(tr, token, 0);
|
2015-01-14 17:53:45 +00:00
|
|
|
|
tracing: Put back comma for empty fields in boot string parsing
Both early_enable_events() and apply_trace_boot_options() parse a boot
string that may get parsed later on. They both use strsep() which converts a
comma into a nul character. To still allow the boot string to be parsed
again the same way, the nul character gets converted back to a comma after
the token is processed.
The problem is that these two functions check for an empty parameter (two
commas in a row ",,"), and continue the loop if the parameter is empty, but
fails to place the comma back. In this case, the second parsing will end at
this blank field, and not process fields afterward.
In most cases, users should not have an empty field, but if its going to be
checked, the code might as well be correct.
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2015-11-04 03:15:14 +00:00
|
|
|
ret = ftrace_set_clr_event(tr, token, 1);
|
|
|
|
if (ret)
|
|
|
|
pr_warn("Failed to enable trace event: %s\n", token);
|
|
|
|
}
|
2015-01-14 17:53:45 +00:00
|
|
|
|
|
|
|
/* Put back the comma to allow this to be called again */
|
|
|
|
if (buf)
|
|
|
|
*(buf - 1) = ',';
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-09-12 14:47:57 +00:00
|
|
|
static __init int event_trace_enable(void)
|
|
|
|
{
|
2012-05-04 03:09:03 +00:00
|
|
|
struct trace_array *tr = top_trace_array();
|
2015-05-05 15:45:27 +00:00
|
|
|
struct trace_event_call **iter, *call;
|
2012-09-12 14:47:57 +00:00
|
|
|
int ret;
|
|
|
|
|
2014-06-05 22:35:17 +00:00
|
|
|
if (!tr)
|
|
|
|
return -ENODEV;
|
|
|
|
|
2012-09-12 14:47:57 +00:00
|
|
|
for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) {
|
|
|
|
|
|
|
|
call = *iter;
|
|
|
|
ret = event_init(call);
|
|
|
|
if (!ret)
|
|
|
|
list_add(&call->list, &ftrace_events);
|
|
|
|
}
|
|
|
|
|
2022-10-21 01:00:56 +00:00
|
|
|
register_trigger_cmds();
|
|
|
|
|
2013-02-27 21:28:06 +00:00
|
|
|
/*
|
|
|
|
* We need the top trace array to have a working set of trace
|
|
|
|
* points at early init, before the debug files and directories
|
|
|
|
* are created. Create the file entries now, and attach them
|
|
|
|
* to the actual file dentries later.
|
|
|
|
*/
|
|
|
|
__trace_early_add_events(tr);
|
|
|
|
|
2023-02-07 17:28:51 +00:00
|
|
|
early_enable_events(tr, bootup_event_buf, false);
|
2012-10-11 14:15:05 +00:00
|
|
|
|
|
|
|
trace_printk_start_comm();
|
|
|
|
|
2013-03-12 23:35:13 +00:00
|
|
|
register_event_cmds();
|
|
|
|
|
tracing: Add basic event trigger framework
Add a 'trigger' file for each trace event, enabling 'trace event
triggers' to be set for trace events.
'trace event triggers' are patterned after the existing 'ftrace
function triggers' implementation except that triggers are written to
per-event 'trigger' files instead of to a single file such as the
'set_ftrace_filter' used for ftrace function triggers.
The implementation is meant to be entirely separate from ftrace
function triggers, in order to keep the respective implementations
relatively simple and to allow them to diverge.
The event trigger functionality is built on top of SOFT_DISABLE
functionality. It adds a TRIGGER_MODE bit to the ftrace_event_file
flags which is checked when any trace event fires. Triggers set for a
particular event need to be checked regardless of whether that event
is actually enabled or not - getting an event to fire even if it's not
enabled is what's already implemented by SOFT_DISABLE mode, so trigger
mode directly reuses that. Event trigger essentially inherit the soft
disable logic in __ftrace_event_enable_disable() while adding a bit of
logic and trigger reference counting via tm_ref on top of that in a
new trace_event_trigger_enable_disable() function. Because the base
__ftrace_event_enable_disable() code now needs to be invoked from
outside trace_events.c, a wrapper is also added for those usages.
The triggers for an event are actually invoked via a new function,
event_triggers_call(), and code is also added to invoke them for
ftrace_raw_event calls as well as syscall events.
The main part of the patch creates a new trace_events_trigger.c file
to contain the trace event triggers implementation.
The standard open, read, and release file operations are implemented
here.
The open() implementation sets up for the various open modes of the
'trigger' file. It creates and attaches the trigger iterator and sets
up the command parser. If opened for reading set up the trigger
seq_ops.
The read() implementation parses the event trigger written to the
'trigger' file, looks up the trigger command, and passes it along to
that event_command's func() implementation for command-specific
processing.
The release() implementation does whatever cleanup is needed to
release the 'trigger' file, like releasing the parser and trigger
iterator, etc.
A couple of functions for event command registration and
unregistration are added, along with a list to add them to and a mutex
to protect them, as well as an (initially empty) registration function
to add the set of commands that will be added by future commits, and
call to it from the trace event initialization code.
also added are a couple trigger-specific data structures needed for
these implementations such as a trigger iterator and a struct for
trigger-specific data.
A couple structs consisting mostly of function meant to be implemented
in command-specific ways, event_command and event_trigger_ops, are
used by the generic event trigger command implementations. They're
being put into trace.h alongside the other trace_event data structures
and functions, in the expectation that they'll be needed in several
trace_event-related files such as trace_events_trigger.c and
trace_events.c.
The event_command.func() function is meant to be called by the trigger
parsing code in order to add a trigger instance to the corresponding
event. It essentially coordinates adding a live trigger instance to
the event, and arming the triggering the event.
Every event_command func() implementation essentially does the
same thing for any command:
- choose ops - use the value of param to choose either a number or
count version of event_trigger_ops specific to the command
- do the register or unregister of those ops
- associate a filter, if specified, with the triggering event
The reg() and unreg() ops allow command-specific implementations for
event_trigger_op registration and unregistration, and the
get_trigger_ops() op allows command-specific event_trigger_ops
selection to be parameterized. When a trigger instance is added, the
reg() op essentially adds that trigger to the triggering event and
arms it, while unreg() does the opposite. The set_filter() function
is used to associate a filter with the trigger - if the command
doesn't specify a set_filter() implementation, the command will ignore
filters.
Each command has an associated trigger_type, which serves double duty,
both as a unique identifier for the command as well as a value that
can be used for setting a trigger mode bit during trigger invocation.
The signature of func() adds a pointer to the event_command struct,
used to invoke those functions, along with a command_data param that
can be passed to the reg/unreg functions. This allows func()
implementations to use command-specific blobs and supports code
re-use.
The event_trigger_ops.func() command corrsponds to the trigger 'probe'
function that gets called when the triggering event is actually
invoked. The other functions are used to list the trigger when
needed, along with a couple mundane book-keeping functions.
This also moves event_file_data() into trace.h so it can be used
outside of trace_events.c.
Link: http://lkml.kernel.org/r/316d95061accdee070aac8e5750afba0192fa5b9.1382622043.git.tom.zanussi@linux.intel.com
Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Idea-by: Steve Rostedt <rostedt@goodmis.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2013-10-24 13:59:24 +00:00
|
|
|
|
2012-09-12 14:47:57 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2015-01-14 17:53:45 +00:00
|
|
|
/*
|
|
|
|
* event_trace_enable() is called from trace_event_init() first to
|
|
|
|
* initialize events and perhaps start any events that are on the
|
|
|
|
* command line. Unfortunately, there are some events that will not
|
|
|
|
* start this early, like the system call tracepoints that need
|
2020-11-16 17:42:01 +00:00
|
|
|
* to set the %SYSCALL_WORK_SYSCALL_TRACEPOINT flag of pid 1. But
|
|
|
|
* event_trace_enable() is called before pid 1 starts, and this flag
|
|
|
|
* is never set, making the syscall tracepoint never get reached, but
|
|
|
|
* the event is enabled regardless (and not doing anything).
|
2015-01-14 17:53:45 +00:00
|
|
|
*/
|
|
|
|
static __init int event_trace_enable_again(void)
|
|
|
|
{
|
|
|
|
struct trace_array *tr;
|
|
|
|
|
|
|
|
tr = top_trace_array();
|
|
|
|
if (!tr)
|
|
|
|
return -ENODEV;
|
|
|
|
|
2023-02-07 17:28:51 +00:00
|
|
|
early_enable_events(tr, bootup_event_buf, true);
|
2015-01-14 17:53:45 +00:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
early_initcall(event_trace_enable_again);
|
|
|
|
|
2020-09-10 12:38:48 +00:00
|
|
|
/* Init fields which doesn't related to the tracefs */
|
|
|
|
static __init int event_trace_init_fields(void)
|
|
|
|
{
|
|
|
|
if (trace_define_generic_fields())
|
|
|
|
pr_warn("tracing: Failed to allocated generic fields");
|
|
|
|
|
|
|
|
if (trace_define_common_fields())
|
|
|
|
pr_warn("tracing: Failed to allocate common fields");
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-05-08 19:09:27 +00:00
|
|
|
__init int event_trace_init(void)
|
2009-02-24 15:21:36 +00:00
|
|
|
{
|
2012-05-04 03:09:03 +00:00
|
|
|
struct trace_array *tr;
|
2009-04-10 18:53:50 +00:00
|
|
|
int ret;
|
2009-02-24 15:21:36 +00:00
|
|
|
|
2012-05-04 03:09:03 +00:00
|
|
|
tr = top_trace_array();
|
2014-06-05 22:35:17 +00:00
|
|
|
if (!tr)
|
|
|
|
return -ENODEV;
|
2012-05-04 03:09:03 +00:00
|
|
|
|
2022-01-14 13:10:52 +00:00
|
|
|
trace_create_file("available_events", TRACE_MODE_READ,
|
|
|
|
NULL, tr, &ftrace_avail_fops);
|
2009-03-10 16:04:02 +00:00
|
|
|
|
2020-07-12 01:10:35 +00:00
|
|
|
ret = early_event_add_tracer(NULL, tr);
|
2012-05-04 03:09:03 +00:00
|
|
|
if (ret)
|
|
|
|
return ret;
|
2009-07-01 02:47:05 +00:00
|
|
|
|
2013-07-31 17:31:37 +00:00
|
|
|
#ifdef CONFIG_MODULES
|
2009-04-10 18:53:50 +00:00
|
|
|
ret = register_module_notifier(&trace_module_nb);
|
2009-05-18 15:04:46 +00:00
|
|
|
if (ret)
|
2014-06-07 11:43:08 +00:00
|
|
|
pr_warn("Failed to register trace events module notifier\n");
|
2013-07-31 17:31:37 +00:00
|
|
|
#endif
|
2020-09-10 12:38:58 +00:00
|
|
|
|
|
|
|
eventdir_initialized = true;
|
|
|
|
|
2009-02-24 15:21:36 +00:00
|
|
|
return 0;
|
|
|
|
}
|
2014-12-13 01:05:10 +00:00
|
|
|
|
|
|
|
/*
 * Run the event setup steps in order: caches, syscall events, event
 * initialization and enabling, and finally the field definitions.
 * The return values of the individual steps are not looked at.
 */
void __init trace_event_init(void)
{
	event_trace_memsetup();
	init_ftrace_syscalls();
	event_trace_enable();
	event_trace_init_fields();
}
|
|
|
|
|
2019-05-23 23:40:17 +00:00
|
|
|
#ifdef CONFIG_EVENT_TRACE_STARTUP_TEST
|
2009-04-15 17:36:40 +00:00
|
|
|
|
|
|
|
static DEFINE_SPINLOCK(test_spinlock);
|
|
|
|
static DEFINE_SPINLOCK(test_spinlock_irq);
|
|
|
|
static DEFINE_MUTEX(test_mutex);
|
|
|
|
|
|
|
|
static __init void test_work(struct work_struct *dummy)
|
|
|
|
{
|
|
|
|
spin_lock(&test_spinlock);
|
|
|
|
spin_lock_irq(&test_spinlock_irq);
|
|
|
|
udelay(1);
|
|
|
|
spin_unlock_irq(&test_spinlock_irq);
|
|
|
|
spin_unlock(&test_spinlock);
|
|
|
|
|
|
|
|
mutex_lock(&test_mutex);
|
|
|
|
msleep(1);
|
|
|
|
mutex_unlock(&test_mutex);
|
|
|
|
}
|
|
|
|
|
|
|
|
static __init int event_test_thread(void *unused)
|
|
|
|
{
|
|
|
|
void *test_malloc;
|
|
|
|
|
|
|
|
test_malloc = kmalloc(1234, GFP_KERNEL);
|
|
|
|
if (!test_malloc)
|
|
|
|
pr_info("failed to kmalloc\n");
|
|
|
|
|
|
|
|
schedule_on_each_cpu(test_work);
|
|
|
|
|
|
|
|
kfree(test_malloc);
|
|
|
|
|
|
|
|
set_current_state(TASK_INTERRUPTIBLE);
|
2014-10-08 16:51:10 +00:00
|
|
|
while (!kthread_should_stop()) {
|
2009-04-15 17:36:40 +00:00
|
|
|
schedule();
|
2014-10-08 16:51:10 +00:00
|
|
|
set_current_state(TASK_INTERRUPTIBLE);
|
|
|
|
}
|
|
|
|
__set_current_state(TASK_RUNNING);
|
2009-04-15 17:36:40 +00:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Do various things that may trigger events.
|
|
|
|
*/
|
|
|
|
static __init void event_test_stuff(void)
|
|
|
|
{
|
|
|
|
struct task_struct *test_thread;
|
|
|
|
|
|
|
|
test_thread = kthread_run(event_test_thread, NULL, "test-events");
|
|
|
|
msleep(1);
|
|
|
|
kthread_stop(test_thread);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* For every trace event defined, we will test each trace point separately,
|
|
|
|
* and then by groups, and finally all trace points.
|
|
|
|
*/
|
2009-04-16 16:15:44 +00:00
|
|
|
static __init void event_trace_self_tests(void)
|
2009-04-15 17:36:40 +00:00
|
|
|
{
|
2015-05-13 18:59:40 +00:00
|
|
|
struct trace_subsystem_dir *dir;
|
2015-05-05 14:09:53 +00:00
|
|
|
struct trace_event_file *file;
|
2015-05-05 15:45:27 +00:00
|
|
|
struct trace_event_call *call;
|
2009-04-15 17:36:40 +00:00
|
|
|
struct event_subsystem *system;
|
2012-05-04 03:09:03 +00:00
|
|
|
struct trace_array *tr;
|
2009-04-15 17:36:40 +00:00
|
|
|
int ret;
|
|
|
|
|
2012-05-04 03:09:03 +00:00
|
|
|
tr = top_trace_array();
|
2014-06-05 22:35:17 +00:00
|
|
|
if (!tr)
|
|
|
|
return;
|
2012-05-04 03:09:03 +00:00
|
|
|
|
2009-04-15 17:36:40 +00:00
|
|
|
pr_info("Running tests on trace events:\n");
|
|
|
|
|
2012-05-04 03:09:03 +00:00
|
|
|
list_for_each_entry(file, &tr->events, list) {
|
|
|
|
|
|
|
|
call = file->event_call;
|
2009-04-15 17:36:40 +00:00
|
|
|
|
2010-04-21 16:27:06 +00:00
|
|
|
/* Only test those that have a probe */
|
|
|
|
if (!call->class || !call->class->probe)
|
2009-04-15 17:36:40 +00:00
|
|
|
continue;
|
|
|
|
|
2009-09-14 15:58:24 +00:00
|
|
|
/*
|
|
|
|
* Testing syscall events here is pretty useless, but
|
|
|
|
* we still do it if configured. But this is time consuming.
|
|
|
|
* What we really need is a user thread to perform the
|
|
|
|
* syscalls as we test.
|
|
|
|
*/
|
|
|
|
#ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS
|
2010-04-20 14:47:33 +00:00
|
|
|
if (call->class->system &&
|
|
|
|
strcmp(call->class->system, "syscalls") == 0)
|
2009-09-14 15:58:24 +00:00
|
|
|
continue;
|
|
|
|
#endif
|
|
|
|
|
2015-05-13 18:20:14 +00:00
|
|
|
pr_info("Testing event %s: ", trace_event_name(call));
|
2009-04-15 17:36:40 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If an event is already enabled, someone is using
|
|
|
|
* it and the self test should not be on.
|
|
|
|
*/
|
2015-05-13 19:12:33 +00:00
|
|
|
if (file->flags & EVENT_FILE_FL_ENABLED) {
|
2014-06-07 11:43:08 +00:00
|
|
|
pr_warn("Enabled event during self test!\n");
|
2009-04-15 17:36:40 +00:00
|
|
|
WARN_ON_ONCE(1);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2012-05-04 03:09:03 +00:00
|
|
|
ftrace_event_enable_disable(file, 1);
|
2009-04-15 17:36:40 +00:00
|
|
|
event_test_stuff();
|
2012-05-04 03:09:03 +00:00
|
|
|
ftrace_event_enable_disable(file, 0);
|
2009-04-15 17:36:40 +00:00
|
|
|
|
|
|
|
pr_cont("OK\n");
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Now test at the sub system level */
|
|
|
|
|
|
|
|
pr_info("Running tests on trace event systems:\n");
|
|
|
|
|
2012-05-04 03:09:03 +00:00
|
|
|
list_for_each_entry(dir, &tr->systems, list) {
|
|
|
|
|
|
|
|
system = dir->subsystem;
|
2009-04-15 17:36:40 +00:00
|
|
|
|
|
|
|
/* the ftrace system is special, skip it */
|
|
|
|
if (strcmp(system->name, "ftrace") == 0)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
pr_info("Testing event system %s: ", system->name);
|
|
|
|
|
2012-05-04 03:09:03 +00:00
|
|
|
ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1);
|
2009-04-15 17:36:40 +00:00
|
|
|
if (WARN_ON_ONCE(ret)) {
|
2014-06-07 11:43:08 +00:00
|
|
|
pr_warn("error enabling system %s\n",
|
|
|
|
system->name);
|
2009-04-15 17:36:40 +00:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
event_test_stuff();
|
|
|
|
|
2012-05-04 03:09:03 +00:00
|
|
|
ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0);
|
2012-08-27 07:13:45 +00:00
|
|
|
if (WARN_ON_ONCE(ret)) {
|
2014-06-07 11:43:08 +00:00
|
|
|
pr_warn("error disabling system %s\n",
|
|
|
|
system->name);
|
2012-08-27 07:13:45 +00:00
|
|
|
continue;
|
|
|
|
}
|
2009-04-15 17:36:40 +00:00
|
|
|
|
|
|
|
pr_cont("OK\n");
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Test with all events enabled */
|
|
|
|
|
|
|
|
pr_info("Running tests on all trace events:\n");
|
|
|
|
pr_info("Testing all events: ");
|
|
|
|
|
2012-05-04 03:09:03 +00:00
|
|
|
ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1);
|
2009-04-15 17:36:40 +00:00
|
|
|
if (WARN_ON_ONCE(ret)) {
|
2014-06-07 11:43:08 +00:00
|
|
|
pr_warn("error enabling all events\n");
|
2009-04-16 16:15:44 +00:00
|
|
|
return;
|
2009-04-15 17:36:40 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
event_test_stuff();
|
|
|
|
|
|
|
|
/* reset sysname */
|
2012-05-04 03:09:03 +00:00
|
|
|
ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0);
|
2009-04-15 17:36:40 +00:00
|
|
|
if (WARN_ON_ONCE(ret)) {
|
2014-06-07 11:43:08 +00:00
|
|
|
pr_warn("error disabling all events\n");
|
2009-04-16 16:15:44 +00:00
|
|
|
return;
|
2009-04-15 17:36:40 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
pr_cont("OK\n");
|
2009-04-16 16:15:44 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef CONFIG_FUNCTION_TRACER
|
|
|
|
|
2009-06-24 06:13:48 +00:00
|
|
|
static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable);
|
2009-04-16 16:15:44 +00:00
|
|
|
|
2016-04-29 22:10:21 +00:00
|
|
|
static struct trace_event_file event_trace_file __initdata;
|
2015-09-25 21:38:44 +00:00
|
|
|
|
|
|
|
static void __init
|
2011-08-08 20:57:47 +00:00
|
|
|
function_test_events_call(unsigned long ip, unsigned long parent_ip,
|
2020-10-28 21:42:17 +00:00
|
|
|
struct ftrace_ops *op, struct ftrace_regs *regs)
|
2009-04-16 16:15:44 +00:00
|
|
|
{
|
2019-12-13 18:58:57 +00:00
|
|
|
struct trace_buffer *buffer;
|
2009-04-16 16:15:44 +00:00
|
|
|
struct ring_buffer_event *event;
|
|
|
|
struct ftrace_entry *entry;
|
2021-01-25 19:45:08 +00:00
|
|
|
unsigned int trace_ctx;
|
2009-04-16 16:15:44 +00:00
|
|
|
long disabled;
|
|
|
|
int cpu;
|
|
|
|
|
2021-01-25 19:45:08 +00:00
|
|
|
trace_ctx = tracing_gen_ctx();
|
2010-06-03 13:36:50 +00:00
|
|
|
preempt_disable_notrace();
|
2009-04-16 16:15:44 +00:00
|
|
|
cpu = raw_smp_processor_id();
|
2009-06-24 06:13:48 +00:00
|
|
|
disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu));
|
2009-04-16 16:15:44 +00:00
|
|
|
|
|
|
|
if (disabled != 1)
|
|
|
|
goto out;
|
|
|
|
|
2016-04-29 22:10:21 +00:00
|
|
|
event = trace_event_buffer_lock_reserve(&buffer, &event_trace_file,
|
|
|
|
TRACE_FN, sizeof(*entry),
|
2021-01-25 19:45:08 +00:00
|
|
|
trace_ctx);
|
2009-04-16 16:15:44 +00:00
|
|
|
if (!event)
|
|
|
|
goto out;
|
|
|
|
entry = ring_buffer_event_data(event);
|
|
|
|
entry->ip = ip;
|
|
|
|
entry->parent_ip = parent_ip;
|
|
|
|
|
2016-04-29 22:10:21 +00:00
|
|
|
event_trigger_unlock_commit(&event_trace_file, buffer, event,
|
2021-01-25 19:45:08 +00:00
|
|
|
entry, trace_ctx);
|
2009-04-16 16:15:44 +00:00
|
|
|
out:
|
2009-06-24 06:13:48 +00:00
|
|
|
atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
|
2010-06-03 13:36:50 +00:00
|
|
|
preempt_enable_notrace();
|
2009-04-16 16:15:44 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static struct ftrace_ops trace_ops __initdata =
|
|
|
|
{
|
|
|
|
.func = function_test_events_call,
|
|
|
|
};
|
|
|
|
|
|
|
|
static __init void event_trace_self_test_with_function(void)
|
|
|
|
{
|
2011-05-23 19:27:46 +00:00
|
|
|
int ret;
|
2016-04-29 22:10:21 +00:00
|
|
|
|
|
|
|
event_trace_file.tr = top_trace_array();
|
|
|
|
if (WARN_ON(!event_trace_file.tr))
|
2015-09-30 15:45:22 +00:00
|
|
|
return;
|
2016-04-29 22:10:21 +00:00
|
|
|
|
2011-05-23 19:27:46 +00:00
|
|
|
ret = register_ftrace_function(&trace_ops);
|
|
|
|
if (WARN_ON(ret < 0)) {
|
|
|
|
pr_info("Failed to enable function tracer for event tests\n");
|
|
|
|
return;
|
|
|
|
}
|
2009-04-16 16:15:44 +00:00
|
|
|
pr_info("Running tests again, along with the function tracer\n");
|
|
|
|
event_trace_self_tests();
|
|
|
|
unregister_ftrace_function(&trace_ops);
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
/* Stub for builds without CONFIG_FUNCTION_TRACER: nothing to run. */
static __init void event_trace_self_test_with_function(void) { }
|
|
|
|
#endif
|
|
|
|
|
|
|
|
static __init int event_trace_self_tests_init(void)
|
|
|
|
{
|
2009-07-01 02:47:05 +00:00
|
|
|
if (!tracing_selftest_disabled) {
|
|
|
|
event_trace_self_tests();
|
|
|
|
event_trace_self_test_with_function();
|
|
|
|
}
|
2009-04-15 17:36:40 +00:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2009-04-20 16:12:44 +00:00
|
|
|
late_initcall(event_trace_self_tests_init);
|
2009-04-15 17:36:40 +00:00
|
|
|
|
|
|
|
#endif
|