2018-08-16 15:23:53 +00:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
2012-04-11 10:30:43 +00:00
|
|
|
/*
|
|
|
|
* uprobes-based tracing events
|
|
|
|
*
|
|
|
|
* Copyright (C) IBM Corporation, 2010-2012
|
|
|
|
* Author: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
|
|
|
|
*/
|
2019-01-17 13:30:23 +00:00
|
|
|
#define pr_fmt(fmt) "trace_uprobe: " fmt
|
2012-04-11 10:30:43 +00:00
|
|
|
|
2021-12-16 02:55:37 +00:00
|
|
|
#include <linux/bpf-cgroup.h>
|
2019-10-11 21:22:50 +00:00
|
|
|
#include <linux/security.h>
|
2018-11-05 09:03:04 +00:00
|
|
|
#include <linux/ctype.h>
|
2012-04-11 10:30:43 +00:00
|
|
|
#include <linux/module.h>
|
|
|
|
#include <linux/uaccess.h>
|
|
|
|
#include <linux/uprobes.h>
|
|
|
|
#include <linux/namei.h>
|
2012-12-18 00:01:27 +00:00
|
|
|
#include <linux/string.h>
|
2017-02-04 00:27:20 +00:00
|
|
|
#include <linux/rculist.h>
|
bpf: implement sleepable uprobes by chaining gps
uprobes work by raising a trap, setting a task flag from within the
interrupt handler, and processing the actual work for the uprobe on the
way back to userspace. As a result, uprobe handlers already execute in a
might_fault/_sleep context. The primary obstacle to sleepable bpf uprobe
programs is therefore on the bpf side.
Namely, the bpf_prog_array attached to the uprobe is protected by normal
rcu. In order for uprobe bpf programs to become sleepable, it has to be
protected by the tasks_trace rcu flavor instead (and kfree() called after
a corresponding grace period).
Therefore, the free path for bpf_prog_array now chains a tasks_trace and
normal grace periods one after the other.
Users who iterate under tasks_trace read section would
be safe, as would users who iterate under normal read sections (from
non-sleepable locations).
The downside is that the tasks_trace latency affects all perf_event-attached
bpf programs (and not just uprobe ones). This is deemed safe given the
possible attach rates for kprobe/uprobe/tp programs.
Separately, non-sleepable programs need access to dynamically sized
rcu-protected maps, so bpf_run_prog_array_sleepables now conditionally takes
an rcu read section, in addition to the overarching tasks_trace section.
Signed-off-by: Delyan Kratunov <delyank@fb.com>
Link: https://lore.kernel.org/r/ce844d62a2fd0443b08c5ab02e95bc7149f9aeb1.1655248076.git.delyank@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2022-06-14 23:10:46 +00:00
|
|
|
#include <linux/filter.h>
|
2024-08-13 20:34:09 +00:00
|
|
|
#include <linux/percpu.h>
|
2012-04-11 10:30:43 +00:00
|
|
|
|
2018-11-05 09:03:04 +00:00
|
|
|
#include "trace_dynevent.h"
|
2012-04-11 10:30:43 +00:00
|
|
|
#include "trace_probe.h"
|
2018-04-25 12:18:03 +00:00
|
|
|
#include "trace_probe_tmpl.h"
|
2012-04-11 10:30:43 +00:00
|
|
|
|
|
|
|
#define UPROBE_EVENT_SYSTEM "uprobes"
|
|
|
|
|
2013-03-29 17:26:51 +00:00
|
|
|
/*
 * Ring-buffer record header for a uprobe event: the common trace entry
 * followed by one address slot (probe) or two (return probe) -- see
 * SIZEOF_TRACE_ENTRY() below.
 */
struct uprobe_trace_entry_head {
	struct trace_entry	ent;
	unsigned long		vaddr[];	/* flexible array of instruction/return addresses */
};
|
|
|
|
|
|
|
|
/*
 * Fixed-size portion of an entry: the common header plus two vaddr
 * slots for a return probe (function + return address), one otherwise.
 */
#define SIZEOF_TRACE_ENTRY(is_return)			\
	(sizeof(struct uprobe_trace_entry_head) +	\
	 sizeof(unsigned long) * (is_return ? 2 : 1))

/* Start of the variable-size fetched-argument data following the entry. */
#define DATAOF_TRACE_ENTRY(entry, is_return)		\
	((void*)(entry) + SIZEOF_TRACE_ENTRY(is_return))
|
|
|
|
|
2021-02-01 19:48:11 +00:00
|
|
|
/* dyn_event operation callbacks; definitions follow later in this file. */
static int trace_uprobe_create(const char *raw_command);
static int trace_uprobe_show(struct seq_file *m, struct dyn_event *ev);
static int trace_uprobe_release(struct dyn_event *ev);
static bool trace_uprobe_is_busy(struct dyn_event *ev);
static bool trace_uprobe_match(const char *system, const char *event,
			int argc, const char **argv, struct dyn_event *ev);
|
2018-11-05 09:03:04 +00:00
|
|
|
|
|
|
|
/* Hooks registering uprobe events with the dynamic-event framework. */
static struct dyn_event_operations trace_uprobe_ops = {
	.create = trace_uprobe_create,
	.show = trace_uprobe_show,
	.is_busy = trace_uprobe_is_busy,
	.free = trace_uprobe_release,
	.match = trace_uprobe_match,
};
|
|
|
|
|
2012-04-11 10:30:43 +00:00
|
|
|
/*
 * uprobe event core functions
 */
struct trace_uprobe {
	struct dyn_event		devent;		/* dynamic-event list linkage */
	struct uprobe_consumer		consumer;	/* handler/ret_handler passed to uprobe core */
	struct path			path;		/* resolved path of the probed file */
	char				*filename;	/* user-supplied path string */
	struct uprobe			*uprobe;	/* registered uprobe handle */
	unsigned long			offset;		/* probe offset within the file */
	unsigned long			ref_ctr_offset;	/* SDT reference counter offset, 0 if unused */
	unsigned long __percpu		*nhits;		/* per-CPU hit counter */
	struct trace_probe		tp;		/* common probe state (event, args) -- keep last */
};
|
|
|
|
|
2018-11-05 09:03:04 +00:00
|
|
|
/* True iff @ev is backed by the uprobe dyn_event implementation. */
static bool is_trace_uprobe(struct dyn_event *ev)
{
	return ev->ops == &trace_uprobe_ops;
}
|
|
|
|
|
|
|
|
/* Map a dyn_event back to its containing trace_uprobe (caller must check type). */
static struct trace_uprobe *to_trace_uprobe(struct dyn_event *ev)
{
	return container_of(ev, struct trace_uprobe, devent);
}
|
|
|
|
|
|
|
|
/**
 * for_each_trace_uprobe - iterate over the trace_uprobe list
 * @pos: the struct trace_uprobe * for each entry
 * @dpos: the struct dyn_event * to use as a loop cursor
 *
 * Walks the global dyn_event list, skipping events of other types.
 */
#define for_each_trace_uprobe(pos, dpos)	\
	for_each_dyn_event(dpos)		\
		if (is_trace_uprobe(dpos) && (pos = to_trace_uprobe(dpos)))
|
|
|
|
|
2012-04-11 10:30:43 +00:00
|
|
|
/* Event registration and uprobe-core dispatch callbacks, defined below. */
static int register_uprobe_event(struct trace_uprobe *tu);
static int unregister_uprobe_event(struct trace_uprobe *tu);

static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs,
			     __u64 *data);
static int uretprobe_dispatcher(struct uprobe_consumer *con,
				unsigned long func, struct pt_regs *regs,
				__u64 *data);
|
2012-04-11 10:30:43 +00:00
|
|
|
|
2013-11-26 06:21:04 +00:00
|
|
|
/*
 * Compute the address of the n-th word on the user stack, relative to
 * @addr (the stack pointer). Entries lie in the direction opposite to
 * stack growth, so the sign flips when the stack grows upward.
 */
#ifdef CONFIG_STACK_GROWSUP
static unsigned long adjust_stack_addr(unsigned long addr, unsigned int n)
{
	unsigned long delta = (unsigned long)n * sizeof(long);

	return addr - delta;
}
#else
static unsigned long adjust_stack_addr(unsigned long addr, unsigned int n)
{
	unsigned long delta = (unsigned long)n * sizeof(long);

	return addr + delta;
}
#endif
|
|
|
|
|
|
|
|
static unsigned long get_user_stack_nth(struct pt_regs *regs, unsigned int n)
|
|
|
|
{
|
|
|
|
unsigned long ret;
|
|
|
|
unsigned long addr = user_stack_pointer(regs);
|
|
|
|
|
|
|
|
addr = adjust_stack_addr(addr, n);
|
|
|
|
|
|
|
|
if (copy_from_user(&ret, (void __force __user *) addr, sizeof(ret)))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Uprobes-specific fetch functions
 */
/*
 * Copy @size bytes from the probed (user-space) address @src into @dest.
 * Returns 0 on success, -EFAULT if any byte could not be copied.
 */
static nokprobe_inline int
probe_mem_read(void *dest, void *src, size_t size)
{
	void __user *vaddr = (void __force __user *)src;

	return copy_from_user(dest, vaddr, size) ? -EFAULT : 0;
}
|
2019-05-15 05:38:42 +00:00
|
|
|
|
|
|
|
/*
 * For uprobes every probed address is already user-space, so the "user"
 * variant is identical to probe_mem_read().
 */
static nokprobe_inline int
probe_mem_read_user(void *dest, void *src, size_t size)
{
	return probe_mem_read(dest, src, size);
}
|
|
|
|
|
2013-11-26 06:21:04 +00:00
|
|
|
/*
 * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
 * length and relative data location.
 *
 * On return, *(u32 *)dest is rewritten to a data_loc encoding the stored
 * length (including the NUL) and offset; returns the stored length, or a
 * negative errno on failure.
 */
static nokprobe_inline int
fetch_store_string(unsigned long addr, void *dest, void *base)
{
	long ret;
	u32 loc = *(u32 *)dest;		/* caller-provided data_loc: max len + offset */
	int maxlen = get_loc_len(loc);
	u8 *dst = get_loc_data(dest, base);
	void __user *src = (void __force __user *) addr;

	if (unlikely(!maxlen))
		return -ENOMEM;

	/* FETCH_TOKEN_COMM is a sentinel meaning "fetch current->comm". */
	if (addr == FETCH_TOKEN_COMM)
		ret = strscpy(dst, current->comm, maxlen);
	else
		ret = strncpy_from_user(dst, src, maxlen);
	if (ret >= 0) {
		if (ret == maxlen)
			/* Truncated: force termination in the last byte. */
			dst[ret - 1] = '\0';
		else
			/*
			 * Include the terminating null byte. In this case it
			 * was copied by strncpy_from_user but not accounted
			 * for in ret.
			 */
			ret++;
		*(u32 *)dest = make_data_loc(ret, (void *)dst - base);
	} else
		/* Copy failed: record a zero-length data location. */
		*(u32 *)dest = make_data_loc(0, (void *)dst - base);

	return ret;
}
|
|
|
|
|
2019-05-15 05:38:30 +00:00
|
|
|
/* User-space variant; identical to fetch_store_string() for uprobes. */
static nokprobe_inline int
fetch_store_string_user(unsigned long addr, void *dest, void *base)
{
	return fetch_store_string(addr, dest, base);
}
|
|
|
|
|
2018-04-25 12:18:03 +00:00
|
|
|
/* Return the length of string -- including null terminal byte */
|
2018-04-25 12:19:01 +00:00
|
|
|
static nokprobe_inline int
|
|
|
|
fetch_store_strlen(unsigned long addr)
|
2013-11-26 06:21:04 +00:00
|
|
|
{
|
|
|
|
int len;
|
|
|
|
void __user *vaddr = (void __force __user *) addr;
|
|
|
|
|
2019-05-07 13:55:41 +00:00
|
|
|
if (addr == FETCH_TOKEN_COMM)
|
|
|
|
len = strlen(current->comm) + 1;
|
|
|
|
else
|
|
|
|
len = strnlen_user(vaddr, MAX_STRING_SIZE);
|
2013-11-26 06:21:04 +00:00
|
|
|
|
2018-04-25 12:19:01 +00:00
|
|
|
return (len > MAX_STRING_SIZE) ? 0 : len;
|
2013-11-26 06:21:04 +00:00
|
|
|
}
|
2013-11-26 06:21:04 +00:00
|
|
|
|
2019-05-15 05:38:30 +00:00
|
|
|
/* User-space variant; identical to fetch_store_strlen() for uprobes. */
static nokprobe_inline int
fetch_store_strlen_user(unsigned long addr)
{
	return fetch_store_strlen(addr);
}
|
|
|
|
|
2018-04-25 12:18:03 +00:00
|
|
|
/*
 * Translate a file offset into the virtual address it is mapped at in
 * the current task, using the dispatch data stashed in current->utask
 * by the dispatcher (bp_addr is where tu->offset landed in memory).
 */
static unsigned long translate_user_vaddr(unsigned long file_offset)
{
	unsigned long base_addr;
	struct uprobe_dispatch_data *udd;

	udd = (void *) current->utask->vaddr;

	/* load base = trap address minus the probe's file offset */
	base_addr = udd->bp_addr - udd->tu->offset;
	return base_addr + file_offset;
}
|
|
|
|
|
2018-04-25 12:18:03 +00:00
|
|
|
/* Note that we don't verify it, since the code does not come from user space */
/*
 * Execute one fetch "instruction" against the probed context (@rec is
 * the pt_regs), producing the first-stage value, then hand off to the
 * common bottom half for dereferences/stores. Returns the bottom half's
 * result, or a negative errno from the common fetch path.
 */
static int
process_fetch_insn(struct fetch_insn *code, void *rec, void *edata,
		   void *dest, void *base)
{
	struct pt_regs *regs = rec;
	unsigned long val;
	int ret;

	/* 1st stage: get value from context */
	switch (code->op) {
	case FETCH_OP_REG:
		val = regs_get_register(regs, code->param);
		break;
	case FETCH_OP_STACK:
		val = get_user_stack_nth(regs, code->param);
		break;
	case FETCH_OP_STACKP:
		val = user_stack_pointer(regs);
		break;
	case FETCH_OP_RETVAL:
		val = regs_return_value(regs);
		break;
	case FETCH_OP_COMM:
		/* Sentinel consumed later by the string fetchers. */
		val = FETCH_TOKEN_COMM;
		break;
	case FETCH_OP_FOFFS:
		/* File offset -> virtual address in the current task. */
		val = translate_user_vaddr(code->immediate);
		break;
	default:
		ret = process_common_fetch_insn(code, &val);
		if (ret < 0)
			return ret;
	}
	code++;

	return process_fetch_insn_bottom(code, val, dest, base);
}
NOKPROBE_SYMBOL(process_fetch_insn)
|
|
|
|
|
2013-02-03 19:58:35 +00:00
|
|
|
/* Initialize the perf-event filter: empty list, no system-wide users. */
static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter)
{
	rwlock_init(&filter->rwlock);
	filter->nr_systemwide = 0;
	INIT_LIST_HEAD(&filter->perf_events);
}
|
|
|
|
|
|
|
|
/* True when no perf event (per-task or system-wide) references the filter. */
static inline bool uprobe_filter_is_empty(struct trace_uprobe_filter *filter)
{
	return !filter->nr_systemwide && list_empty(&filter->perf_events);
}
|
|
|
|
|
2013-03-30 17:25:23 +00:00
|
|
|
/* Return probes are distinguished by having a ret_handler installed. */
static inline bool is_ret_probe(struct trace_uprobe *tu)
{
	return tu->consumer.ret_handler != NULL;
}
|
|
|
|
|
2018-11-05 09:03:04 +00:00
|
|
|
/* dyn_event .is_busy callback: busy while the probe's event is enabled. */
static bool trace_uprobe_is_busy(struct dyn_event *ev)
{
	struct trace_uprobe *tu = to_trace_uprobe(ev);

	return trace_probe_is_enabled(&tu->tp);
}
|
|
|
|
|
2019-06-19 15:08:18 +00:00
|
|
|
/*
 * Match the command-line form of a probe against @tu: argv[0] must be
 * "FILENAME:OFFSET" (with an optional "(REF_CTR_OFFSET)" suffix) and the
 * remaining args must match the probe's fetch arguments. An empty argv
 * matches everything.
 */
static bool trace_uprobe_match_command_head(struct trace_uprobe *tu,
					    int argc, const char **argv)
{
	char buf[MAX_ARGSTR_LEN + 1];
	int len;

	if (!argc)
		return true;

	/* argv[0] must be "<filename>:<rest>" for this probe's file. */
	len = strlen(tu->filename);
	if (strncmp(tu->filename, argv[0], len) || argv[0][len] != ':')
		return false;

	/* Render our offset (and optional ref counter) the way argv would. */
	if (tu->ref_ctr_offset == 0)
		snprintf(buf, sizeof(buf), "0x%0*lx",
				(int)(sizeof(void *) * 2), tu->offset);
	else
		snprintf(buf, sizeof(buf), "0x%0*lx(0x%lx)",
				(int)(sizeof(void *) * 2), tu->offset,
				tu->ref_ctr_offset);
	if (strcmp(buf, &argv[0][len + 1]))
		return false;

	argc--; argv++;

	return trace_probe_match_command_args(&tu->tp, argc, argv);
}
|
|
|
|
|
2018-11-05 09:03:04 +00:00
|
|
|
/*
 * dyn_event .match callback. An empty event name or NULL system acts as
 * a wildcard for that field; any remaining argv must match the command.
 */
static bool trace_uprobe_match(const char *system, const char *event,
			int argc, const char **argv, struct dyn_event *ev)
{
	struct trace_uprobe *tu = to_trace_uprobe(ev);

	return (event[0] == '\0' ||
		strcmp(trace_probe_name(&tu->tp), event) == 0) &&
	   (!system || strcmp(trace_probe_group_name(&tu->tp), system) == 0) &&
	   trace_uprobe_match_command_head(tu, argc, argv);
}
|
|
|
|
|
2019-06-19 15:07:20 +00:00
|
|
|
/*
 * Resolve the primary trace_uprobe behind an event call; NULL (with a
 * one-shot warning) if the call has no associated trace_probe.
 */
static nokprobe_inline struct trace_uprobe *
trace_uprobe_primary_from_call(struct trace_event_call *call)
{
	struct trace_probe *tp;

	tp = trace_probe_primary_from_call(call);
	if (WARN_ON_ONCE(!tp))
		return NULL;

	return container_of(tp, struct trace_uprobe, tp);
}
|
|
|
|
|
2012-04-11 10:30:43 +00:00
|
|
|
/*
 * Allocate new trace_uprobe and initialize it (including uprobes).
 *
 * @group/@event name the probe event, @nargs sizes the trailing arg
 * array, @is_ret selects a return probe (installs the ret_handler).
 * Returns the new probe or an ERR_PTR; never NULL.
 */
static struct trace_uprobe *
alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
{
	struct trace_uprobe *tu;
	int ret;

	/* tp.args is a trailing array, hence struct_size(). */
	tu = kzalloc(struct_size(tu, tp.args, nargs), GFP_KERNEL);
	if (!tu)
		return ERR_PTR(-ENOMEM);

	tu->nhits = alloc_percpu(unsigned long);
	if (!tu->nhits) {
		ret = -ENOMEM;
		goto error;
	}

	ret = trace_probe_init(&tu->tp, event, group, true, nargs);
	if (ret < 0)
		goto error;

	dyn_event_init(&tu->devent, &trace_uprobe_ops);
	tu->consumer.handler = uprobe_dispatcher;
	if (is_ret)
		tu->consumer.ret_handler = uretprobe_dispatcher;
	init_trace_uprobe_filter(tu->tp.event->filter);
	return tu;

error:
	/* free_percpu(NULL) is a no-op, so this covers both failure paths. */
	free_percpu(tu->nhits);
	kfree(tu);

	return ERR_PTR(ret);
}
|
|
|
|
|
|
|
|
/* Release everything alloc_trace_uprobe()/creation acquired. NULL is a no-op. */
static void free_trace_uprobe(struct trace_uprobe *tu)
{
	if (!tu)
		return;

	path_put(&tu->path);
	trace_probe_cleanup(&tu->tp);
	kfree(tu->filename);
	free_percpu(tu->nhits);
	kfree(tu);
}
|
|
|
|
|
|
|
|
/*
 * Look up a registered trace_uprobe by event and group name.
 * Caller must hold event_mutex; returns NULL if not found.
 */
static struct trace_uprobe *find_probe_event(const char *event, const char *group)
{
	struct dyn_event *pos;
	struct trace_uprobe *tu;

	for_each_trace_uprobe(tu, pos)
		if (strcmp(trace_probe_name(&tu->tp), event) == 0 &&
		    strcmp(trace_probe_group_name(&tu->tp), group) == 0)
			return tu;

	return NULL;
}
|
|
|
|
|
2018-11-05 09:03:04 +00:00
|
|
|
/* Unregister a trace_uprobe and probe_event */
/*
 * If @tu shares its probe_event with siblings, only unlink this probe;
 * otherwise the event itself is unregistered first. Returns -EBUSY when
 * the event is still referenced, or the unregister error.
 */
static int unregister_trace_uprobe(struct trace_uprobe *tu)
{
	int ret;

	/* Siblings keep the event alive; just remove this probe. */
	if (trace_probe_has_sibling(&tu->tp))
		goto unreg;

	/* If there's a reference to the dynamic event */
	if (trace_event_dyn_busy(trace_probe_event_call(&tu->tp)))
		return -EBUSY;

	ret = unregister_uprobe_event(tu);
	if (ret)
		return ret;

unreg:
	dyn_event_remove(&tu->devent);
	trace_probe_unlink(&tu->tp);
	free_trace_uprobe(tu);
	return 0;
}
|
|
|
|
|
2019-09-18 08:55:46 +00:00
|
|
|
/*
 * Check whether @comp duplicates a probe already attached to @orig's
 * event: same inode, same offset, and identical argument source
 * expressions (arg types were already verified equal by the caller).
 */
static bool trace_uprobe_has_same_uprobe(struct trace_uprobe *orig,
					 struct trace_uprobe *comp)
{
	struct trace_probe_event *tpe = orig->tp.event;
	struct inode *comp_inode = d_real_inode(comp->path.dentry);
	int i;

	list_for_each_entry(orig, &tpe->probes, tp.list) {
		if (comp_inode != d_real_inode(orig->path.dentry) ||
		    comp->offset != orig->offset)
			continue;

		/*
		 * trace_probe_compare_arg_type() ensured that nr_args and
		 * each argument name and type are same. Let's compare comm.
		 */
		for (i = 0; i < orig->tp.nr_args; i++) {
			if (strcmp(orig->tp.args[i].comm,
				   comp->tp.args[i].comm))
				break;
		}

		/* All args matched too -> genuine duplicate. */
		if (i == orig->tp.nr_args)
			return true;
	}

	return false;
}
|
|
|
|
|
2019-06-19 15:07:58 +00:00
|
|
|
/*
 * Attach @tu as an additional probe of @to's existing event. Fails with
 * -EEXIST (and a probe-log diagnostic) on argument-type mismatch or an
 * exact duplicate probe.
 */
static int append_trace_uprobe(struct trace_uprobe *tu, struct trace_uprobe *to)
{
	int ret;

	ret = trace_probe_compare_arg_type(&tu->tp, &to->tp);
	if (ret) {
		/* Note that argument starts index = 2 */
		trace_probe_log_set_index(ret + 1);
		trace_probe_log_err(0, DIFF_ARG_TYPE);
		return -EEXIST;
	}
	if (trace_uprobe_has_same_uprobe(to, tu)) {
		trace_probe_log_set_index(0);
		trace_probe_log_err(0, SAME_PROBE);
		return -EEXIST;
	}

	/* Append to existing event */
	ret = trace_probe_append(&tu->tp, &to->tp);
	if (!ret)
		dyn_event_add(&tu->devent, trace_probe_event_call(&tu->tp));

	return ret;
}
|
|
|
|
|
2018-08-20 04:42:49 +00:00
|
|
|
/*
 * Uprobe with multiple reference counter is not allowed. i.e.
 * If inode and offset matches, reference counter offset *must*
 * match as well. Though, there is one exception: If user is
 * replacing old trace_uprobe with new one(same group/event),
 * then we allow same uprobe with new reference counter as far
 * as the new one does not conflict with any other existing
 * ones.
 */
static int validate_ref_ctr_offset(struct trace_uprobe *new)
{
	struct dyn_event *pos;
	struct trace_uprobe *tmp;
	struct inode *new_inode = d_real_inode(new->path.dentry);

	/* Reject any existing probe on the same inode:offset whose
	 * ref_ctr_offset disagrees with the new one.
	 */
	for_each_trace_uprobe(tmp, pos) {
		if (new_inode == d_real_inode(tmp->path.dentry) &&
		    new->offset == tmp->offset &&
		    new->ref_ctr_offset != tmp->ref_ctr_offset) {
			pr_warn("Reference counter offset mismatch.");
			return -EINVAL;
		}
	}
	return 0;
}
|
|
|
|
|
2012-04-11 10:30:43 +00:00
|
|
|
/* Register a trace_uprobe and probe_event */
/*
 * Under event_mutex: validate the reference-counter offset, then either
 * append to an existing event of the same group/name (probe types must
 * match) or register a brand-new event and add it to the dyn_event list.
 * Returns 0 on success or a negative errno (with probe-log diagnostics
 * for the -EEXIST cases).
 */
static int register_trace_uprobe(struct trace_uprobe *tu)
{
	struct trace_uprobe *old_tu;
	int ret;

	mutex_lock(&event_mutex);

	ret = validate_ref_ctr_offset(tu);
	if (ret)
		goto end;

	/* register as an event */
	old_tu = find_probe_event(trace_probe_name(&tu->tp),
				  trace_probe_group_name(&tu->tp));
	if (old_tu) {
		/* Cannot mix entry probes and return probes in one event. */
		if (is_ret_probe(tu) != is_ret_probe(old_tu)) {
			trace_probe_log_set_index(0);
			trace_probe_log_err(0, DIFF_PROBE_TYPE);
			ret = -EEXIST;
		} else {
			ret = append_trace_uprobe(tu, old_tu);
		}
		goto end;
	}

	ret = register_uprobe_event(tu);
	if (ret) {
		if (ret == -EEXIST) {
			trace_probe_log_set_index(0);
			trace_probe_log_err(0, EVENT_EXIST);
		} else
			pr_warn("Failed to register probe event(%d)\n", ret);
		goto end;
	}

	dyn_event_add(&tu->devent, trace_probe_event_call(&tu->tp));

end:
	mutex_unlock(&event_mutex);

	return ret;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Argument syntax:
|
2022-06-27 02:19:07 +00:00
|
|
|
* - Add uprobe: p|r[:[GRP/][EVENT]] PATH:OFFSET[%return][(REF)] [FETCHARGS]
|
2012-04-11 10:30:43 +00:00
|
|
|
*/
|
2021-02-01 19:48:11 +00:00
|
|
|
static int __trace_uprobe_create(int argc, const char **argv)
|
2012-04-11 10:30:43 +00:00
|
|
|
{
|
|
|
|
struct trace_uprobe *tu;
|
2018-11-05 09:03:04 +00:00
|
|
|
const char *event = NULL, *group = UPROBE_EVENT_SYSTEM;
|
|
|
|
char *arg, *filename, *rctr, *rctr_end, *tmp;
|
2012-04-11 10:30:43 +00:00
|
|
|
char buf[MAX_EVENT_NAME_LEN];
|
2022-06-27 02:19:07 +00:00
|
|
|
char gbuf[MAX_EVENT_NAME_LEN];
|
2021-08-19 04:13:27 +00:00
|
|
|
enum probe_print_type ptype;
|
2012-04-11 10:30:43 +00:00
|
|
|
struct path path;
|
uprobes: Support SDT markers having reference count (semaphore)
Userspace Statically Defined Tracepoints[1] are dtrace style markers
inside userspace applications. Applications like PostgreSQL, MySQL,
Pthread, Perl, Python, Java, Ruby, Node.js, libvirt, QEMU, glib etc
have these markers embedded in them. These markers are added by developer
at important places in the code. Each marker source expands to a single
nop instruction in the compiled code but there may be additional
overhead for computing the marker arguments which expands to couple of
instructions. In case the overhead is more, execution of it can be
omitted by runtime if() condition when no one is tracing on the marker:
if (reference_counter > 0) {
Execute marker instructions;
}
Default value of reference counter is 0. Tracer has to increment the
reference counter before tracing on a marker and decrement it when
done with the tracing.
Implement the reference counter logic in core uprobe. User will be
able to use it from trace_uprobe as well as from kernel module. New
trace_uprobe definition with reference counter will now be:
<path>:<offset>[(ref_ctr_offset)]
where ref_ctr_offset is an optional field. For kernel module, new
variant of uprobe_register() has been introduced:
uprobe_register_refctr(inode, offset, ref_ctr_offset, consumer)
No new variant for uprobe_unregister() because it's assumed to have
only one reference counter for one uprobe.
[1] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation
Note: 'reference counter' is called as 'semaphore' in original Dtrace
(or Systemtap, bcc and even in ELF) documentation and code. But the
term 'semaphore' is misleading in this context. This is just a counter
used to hold number of tracers tracing on a marker. This is not really
used for any synchronization. So we are calling it a 'reference counter'
in kernel / perf code.
Link: http://lkml.kernel.org/r/20180820044250.11659-2-ravi.bangoria@linux.ibm.com
Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
[Only trace_uprobe.c]
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Song Liu <songliubraving@fb.com>
Tested-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2018-08-20 04:42:47 +00:00
|
|
|
unsigned long offset, ref_ctr_offset;
|
2018-11-05 09:03:04 +00:00
|
|
|
bool is_return = false;
|
2012-04-11 10:30:43 +00:00
|
|
|
int i, ret;
|
|
|
|
|
uprobes: Support SDT markers having reference count (semaphore)
Userspace Statically Defined Tracepoints[1] are dtrace style markers
inside userspace applications. Applications like PostgreSQL, MySQL,
Pthread, Perl, Python, Java, Ruby, Node.js, libvirt, QEMU, glib etc
have these markers embedded in them. These markers are added by developer
at important places in the code. Each marker source expands to a single
nop instruction in the compiled code but there may be additional
overhead for computing the marker arguments which expands to couple of
instructions. In case the overhead is more, execution of it can be
omitted by runtime if() condition when no one is tracing on the marker:
if (reference_counter > 0) {
Execute marker instructions;
}
Default value of reference counter is 0. Tracer has to increment the
reference counter before tracing on a marker and decrement it when
done with the tracing.
Implement the reference counter logic in core uprobe. User will be
able to use it from trace_uprobe as well as from kernel module. New
trace_uprobe definition with reference counter will now be:
<path>:<offset>[(ref_ctr_offset)]
where ref_ctr_offset is an optional field. For kernel module, new
variant of uprobe_register() has been introduced:
uprobe_register_refctr(inode, offset, ref_ctr_offset, consumer)
No new variant for uprobe_unregister() because it's assumed to have
only one reference counter for one uprobe.
[1] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation
Note: 'reference counter' is called as 'semaphore' in original Dtrace
(or Systemtap, bcc and even in ELF) documentation and code. But the
term 'semaphore' is misleading in this context. This is just a counter
used to hold number of tracers tracing on a marker. This is not really
used for any synchronization. So we are calling it a 'reference counter'
in kernel / perf code.
Link: http://lkml.kernel.org/r/20180820044250.11659-2-ravi.bangoria@linux.ibm.com
Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
[Only trace_uprobe.c]
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Song Liu <songliubraving@fb.com>
Tested-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2018-08-20 04:42:47 +00:00
|
|
|
ref_ctr_offset = 0;
|
2012-04-11 10:30:43 +00:00
|
|
|
|
2019-06-14 07:40:25 +00:00
|
|
|
switch (argv[0][0]) {
|
|
|
|
case 'r':
|
2013-03-30 19:28:15 +00:00
|
|
|
is_return = true;
|
2019-06-14 07:40:25 +00:00
|
|
|
break;
|
|
|
|
case 'p':
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
return -ECANCELED;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (argc < 2)
|
2018-11-05 09:03:04 +00:00
|
|
|
return -ECANCELED;
|
2024-09-30 20:26:54 +00:00
|
|
|
if (argc - 2 > MAX_TRACE_ARGS)
|
|
|
|
return -E2BIG;
|
2012-04-11 10:30:43 +00:00
|
|
|
|
2018-11-05 09:03:04 +00:00
|
|
|
if (argv[0][1] == ':')
|
2012-04-11 10:30:43 +00:00
|
|
|
event = &argv[0][2];
|
|
|
|
|
2018-11-05 09:03:04 +00:00
|
|
|
if (!strchr(argv[1], '/'))
|
|
|
|
return -ECANCELED;
|
2012-04-11 10:30:43 +00:00
|
|
|
|
2018-11-05 09:03:04 +00:00
|
|
|
filename = kstrdup(argv[1], GFP_KERNEL);
|
|
|
|
if (!filename)
|
|
|
|
return -ENOMEM;
|
2012-04-11 10:30:43 +00:00
|
|
|
|
2017-01-13 16:58:34 +00:00
|
|
|
/* Find the last occurrence, in case the path contains ':' too. */
|
2018-11-05 09:03:04 +00:00
|
|
|
arg = strrchr(filename, ':');
|
|
|
|
if (!arg || !isdigit(arg[1])) {
|
|
|
|
kfree(filename);
|
|
|
|
return -ECANCELED;
|
|
|
|
}
|
2012-04-11 10:30:43 +00:00
|
|
|
|
2019-03-31 23:48:19 +00:00
|
|
|
trace_probe_log_init("trace_uprobe", argc, argv);
|
|
|
|
trace_probe_log_set_index(1); /* filename is the 2nd argument */
|
|
|
|
|
2012-04-11 10:30:43 +00:00
|
|
|
*arg++ = '\0';
|
|
|
|
ret = kern_path(filename, LOOKUP_FOLLOW, &path);
|
2018-11-05 09:03:04 +00:00
|
|
|
if (ret) {
|
2019-03-31 23:48:19 +00:00
|
|
|
trace_probe_log_err(0, FILE_NOT_FOUND);
|
2018-11-05 09:03:04 +00:00
|
|
|
kfree(filename);
|
2019-03-31 23:48:19 +00:00
|
|
|
trace_probe_log_clear();
|
2018-04-23 17:21:34 +00:00
|
|
|
return ret;
|
2018-11-05 09:03:04 +00:00
|
|
|
}
|
2018-04-23 17:21:34 +00:00
|
|
|
if (!d_is_reg(path.dentry)) {
|
2019-03-31 23:48:19 +00:00
|
|
|
trace_probe_log_err(0, NO_REGULAR_FILE);
|
2012-07-18 10:16:44 +00:00
|
|
|
ret = -EINVAL;
|
|
|
|
goto fail_address_parse;
|
|
|
|
}
|
2012-04-11 10:30:43 +00:00
|
|
|
|
uprobes: Support SDT markers having reference count (semaphore)
Userspace Statically Defined Tracepoints[1] are dtrace style markers
inside userspace applications. Applications like PostgreSQL, MySQL,
Pthread, Perl, Python, Java, Ruby, Node.js, libvirt, QEMU, glib etc
have these markers embedded in them. These markers are added by developer
at important places in the code. Each marker source expands to a single
nop instruction in the compiled code but there may be additional
overhead for computing the marker arguments which expands to couple of
instructions. In case the overhead is more, execution of it can be
omitted by runtime if() condition when no one is tracing on the marker:
if (reference_counter > 0) {
Execute marker instructions;
}
Default value of reference counter is 0. Tracer has to increment the
reference counter before tracing on a marker and decrement it when
done with the tracing.
Implement the reference counter logic in core uprobe. User will be
able to use it from trace_uprobe as well as from kernel module. New
trace_uprobe definition with reference counter will now be:
<path>:<offset>[(ref_ctr_offset)]
where ref_ctr_offset is an optional field. For kernel module, new
variant of uprobe_register() has been introduced:
uprobe_register_refctr(inode, offset, ref_ctr_offset, consumer)
No new variant for uprobe_unregister() because it's assumed to have
only one reference counter for one uprobe.
[1] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation
Note: 'reference counter' is called as 'semaphore' in original Dtrace
(or Systemtap, bcc and even in ELF) documentation and code. But the
term 'semaphore' is misleading in this context. This is just a counter
used to hold number of tracers tracing on a marker. This is not really
used for any synchronization. So we are calling it a 'reference counter'
in kernel / perf code.
Link: http://lkml.kernel.org/r/20180820044250.11659-2-ravi.bangoria@linux.ibm.com
Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
[Only trace_uprobe.c]
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Song Liu <songliubraving@fb.com>
Tested-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2018-08-20 04:42:47 +00:00
|
|
|
/* Parse reference counter offset if specified. */
|
|
|
|
rctr = strchr(arg, '(');
|
|
|
|
if (rctr) {
|
|
|
|
rctr_end = strchr(rctr, ')');
|
2019-03-31 23:48:19 +00:00
|
|
|
if (!rctr_end) {
|
|
|
|
ret = -EINVAL;
|
|
|
|
rctr_end = rctr + strlen(rctr);
|
|
|
|
trace_probe_log_err(rctr_end - filename,
|
|
|
|
REFCNT_OPEN_BRACE);
|
|
|
|
goto fail_address_parse;
|
|
|
|
} else if (rctr_end[1] != '\0') {
|
uprobes: Support SDT markers having reference count (semaphore)
Userspace Statically Defined Tracepoints[1] are dtrace style markers
inside userspace applications. Applications like PostgreSQL, MySQL,
Pthread, Perl, Python, Java, Ruby, Node.js, libvirt, QEMU, glib etc
have these markers embedded in them. These markers are added by developer
at important places in the code. Each marker source expands to a single
nop instruction in the compiled code but there may be additional
overhead for computing the marker arguments which expands to couple of
instructions. In case the overhead is more, execution of it can be
omitted by runtime if() condition when no one is tracing on the marker:
if (reference_counter > 0) {
Execute marker instructions;
}
Default value of reference counter is 0. Tracer has to increment the
reference counter before tracing on a marker and decrement it when
done with the tracing.
Implement the reference counter logic in core uprobe. User will be
able to use it from trace_uprobe as well as from kernel module. New
trace_uprobe definition with reference counter will now be:
<path>:<offset>[(ref_ctr_offset)]
where ref_ctr_offset is an optional field. For kernel module, new
variant of uprobe_register() has been introduced:
uprobe_register_refctr(inode, offset, ref_ctr_offset, consumer)
No new variant for uprobe_unregister() because it's assumed to have
only one reference counter for one uprobe.
[1] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation
Note: 'reference counter' is called as 'semaphore' in original Dtrace
(or Systemtap, bcc and even in ELF) documentation and code. But the
term 'semaphore' is misleading in this context. This is just a counter
used to hold number of tracers tracing on a marker. This is not really
used for any synchronization. So we are calling it a 'reference counter'
in kernel / perf code.
Link: http://lkml.kernel.org/r/20180820044250.11659-2-ravi.bangoria@linux.ibm.com
Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
[Only trace_uprobe.c]
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Song Liu <songliubraving@fb.com>
Tested-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2018-08-20 04:42:47 +00:00
|
|
|
ret = -EINVAL;
|
2019-03-31 23:48:19 +00:00
|
|
|
trace_probe_log_err(rctr_end + 1 - filename,
|
|
|
|
BAD_REFCNT_SUFFIX);
|
uprobes: Support SDT markers having reference count (semaphore)
Userspace Statically Defined Tracepoints[1] are dtrace style markers
inside userspace applications. Applications like PostgreSQL, MySQL,
Pthread, Perl, Python, Java, Ruby, Node.js, libvirt, QEMU, glib etc
have these markers embedded in them. These markers are added by developer
at important places in the code. Each marker source expands to a single
nop instruction in the compiled code but there may be additional
overhead for computing the marker arguments which expands to couple of
instructions. In case the overhead is more, execution of it can be
omitted by runtime if() condition when no one is tracing on the marker:
if (reference_counter > 0) {
Execute marker instructions;
}
Default value of reference counter is 0. Tracer has to increment the
reference counter before tracing on a marker and decrement it when
done with the tracing.
Implement the reference counter logic in core uprobe. User will be
able to use it from trace_uprobe as well as from kernel module. New
trace_uprobe definition with reference counter will now be:
<path>:<offset>[(ref_ctr_offset)]
where ref_ctr_offset is an optional field. For kernel module, new
variant of uprobe_register() has been introduced:
uprobe_register_refctr(inode, offset, ref_ctr_offset, consumer)
No new variant for uprobe_unregister() because it's assumed to have
only one reference counter for one uprobe.
[1] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation
Note: 'reference counter' is called as 'semaphore' in original Dtrace
(or Systemtap, bcc and even in ELF) documentation and code. But the
term 'semaphore' is misleading in this context. This is just a counter
used to hold number of tracers tracing on a marker. This is not really
used for any synchronization. So we are calling it a 'reference counter'
in kernel / perf code.
Link: http://lkml.kernel.org/r/20180820044250.11659-2-ravi.bangoria@linux.ibm.com
Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
[Only trace_uprobe.c]
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Song Liu <songliubraving@fb.com>
Tested-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2018-08-20 04:42:47 +00:00
|
|
|
goto fail_address_parse;
|
|
|
|
}
|
|
|
|
|
|
|
|
*rctr++ = '\0';
|
|
|
|
*rctr_end = '\0';
|
|
|
|
ret = kstrtoul(rctr, 0, &ref_ctr_offset);
|
|
|
|
if (ret) {
|
2019-03-31 23:48:19 +00:00
|
|
|
trace_probe_log_err(rctr - filename, BAD_REFCNT);
|
uprobes: Support SDT markers having reference count (semaphore)
Userspace Statically Defined Tracepoints[1] are dtrace style markers
inside userspace applications. Applications like PostgreSQL, MySQL,
Pthread, Perl, Python, Java, Ruby, Node.js, libvirt, QEMU, glib etc
have these markers embedded in them. These markers are added by developer
at important places in the code. Each marker source expands to a single
nop instruction in the compiled code but there may be additional
overhead for computing the marker arguments which expands to couple of
instructions. In case the overhead is more, execution of it can be
omitted by runtime if() condition when no one is tracing on the marker:
if (reference_counter > 0) {
Execute marker instructions;
}
Default value of reference counter is 0. Tracer has to increment the
reference counter before tracing on a marker and decrement it when
done with the tracing.
Implement the reference counter logic in core uprobe. User will be
able to use it from trace_uprobe as well as from kernel module. New
trace_uprobe definition with reference counter will now be:
<path>:<offset>[(ref_ctr_offset)]
where ref_ctr_offset is an optional field. For kernel module, new
variant of uprobe_register() has been introduced:
uprobe_register_refctr(inode, offset, ref_ctr_offset, consumer)
No new variant for uprobe_unregister() because it's assumed to have
only one reference counter for one uprobe.
[1] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation
Note: 'reference counter' is called as 'semaphore' in original Dtrace
(or Systemtap, bcc and even in ELF) documentation and code. But the
term 'semaphore' is misleading in this context. This is just a counter
used to hold number of tracers tracing on a marker. This is not really
used for any synchronization. So we are calling it a 'reference counter'
in kernel / perf code.
Link: http://lkml.kernel.org/r/20180820044250.11659-2-ravi.bangoria@linux.ibm.com
Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
[Only trace_uprobe.c]
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Song Liu <songliubraving@fb.com>
Tested-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2018-08-20 04:42:47 +00:00
|
|
|
goto fail_address_parse;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-09-10 08:55:46 +00:00
|
|
|
/* Check if there is %return suffix */
|
|
|
|
tmp = strchr(arg, '%');
|
|
|
|
if (tmp) {
|
|
|
|
if (!strcmp(tmp, "%return")) {
|
|
|
|
*tmp = '\0';
|
|
|
|
is_return = true;
|
|
|
|
} else {
|
|
|
|
trace_probe_log_err(tmp - filename, BAD_ADDR_SUFFIX);
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto fail_address_parse;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
uprobes: Support SDT markers having reference count (semaphore)
Userspace Statically Defined Tracepoints[1] are dtrace style markers
inside userspace applications. Applications like PostgreSQL, MySQL,
Pthread, Perl, Python, Java, Ruby, Node.js, libvirt, QEMU, glib etc
have these markers embedded in them. These markers are added by developer
at important places in the code. Each marker source expands to a single
nop instruction in the compiled code but there may be additional
overhead for computing the marker arguments which expands to couple of
instructions. In case the overhead is more, execution of it can be
omitted by runtime if() condition when no one is tracing on the marker:
if (reference_counter > 0) {
Execute marker instructions;
}
Default value of reference counter is 0. Tracer has to increment the
reference counter before tracing on a marker and decrement it when
done with the tracing.
Implement the reference counter logic in core uprobe. User will be
able to use it from trace_uprobe as well as from kernel module. New
trace_uprobe definition with reference counter will now be:
<path>:<offset>[(ref_ctr_offset)]
where ref_ctr_offset is an optional field. For kernel module, new
variant of uprobe_register() has been introduced:
uprobe_register_refctr(inode, offset, ref_ctr_offset, consumer)
No new variant for uprobe_unregister() because it's assumed to have
only one reference counter for one uprobe.
[1] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation
Note: 'reference counter' is called as 'semaphore' in original Dtrace
(or Systemtap, bcc and even in ELF) documentation and code. But the
term 'semaphore' is misleading in this context. This is just a counter
used to hold number of tracers tracing on a marker. This is not really
used for any synchronization. So we are calling it a 'reference counter'
in kernel / perf code.
Link: http://lkml.kernel.org/r/20180820044250.11659-2-ravi.bangoria@linux.ibm.com
Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
[Only trace_uprobe.c]
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Song Liu <songliubraving@fb.com>
Tested-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2018-08-20 04:42:47 +00:00
|
|
|
/* Parse uprobe offset. */
|
2013-01-27 17:20:45 +00:00
|
|
|
ret = kstrtoul(arg, 0, &offset);
|
2019-03-31 23:48:19 +00:00
|
|
|
if (ret) {
|
|
|
|
trace_probe_log_err(arg - filename, BAD_UPROBE_OFFS);
|
2013-01-27 17:20:45 +00:00
|
|
|
goto fail_address_parse;
|
2019-03-31 23:48:19 +00:00
|
|
|
}
|
2012-04-11 10:30:43 +00:00
|
|
|
|
|
|
|
/* setup a probe */
|
2019-03-31 23:48:19 +00:00
|
|
|
trace_probe_log_set_index(0);
|
2018-11-05 09:03:04 +00:00
|
|
|
if (event) {
|
2022-06-27 02:19:07 +00:00
|
|
|
ret = traceprobe_parse_event_name(&event, &group, gbuf,
|
2019-03-31 23:48:19 +00:00
|
|
|
event - argv[0]);
|
2018-11-05 09:03:04 +00:00
|
|
|
if (ret)
|
|
|
|
goto fail_address_parse;
|
2022-06-27 02:19:07 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if (!event) {
|
2012-12-18 00:01:27 +00:00
|
|
|
char *tail;
|
2012-04-11 10:30:43 +00:00
|
|
|
char *ptr;
|
|
|
|
|
2012-12-18 00:01:27 +00:00
|
|
|
tail = kstrdup(kbasename(filename), GFP_KERNEL);
|
|
|
|
if (!tail) {
|
2012-04-11 10:30:43 +00:00
|
|
|
ret = -ENOMEM;
|
|
|
|
goto fail_address_parse;
|
|
|
|
}
|
|
|
|
|
|
|
|
ptr = strpbrk(tail, ".-_");
|
|
|
|
if (ptr)
|
|
|
|
*ptr = '\0';
|
|
|
|
|
|
|
|
snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_0x%lx", 'p', tail, offset);
|
|
|
|
event = buf;
|
|
|
|
kfree(tail);
|
|
|
|
}
|
|
|
|
|
2019-03-31 23:48:19 +00:00
|
|
|
argc -= 2;
|
|
|
|
argv += 2;
|
|
|
|
|
2013-03-30 19:28:15 +00:00
|
|
|
tu = alloc_trace_uprobe(group, event, argc, is_return);
|
2012-04-11 10:30:43 +00:00
|
|
|
if (IS_ERR(tu)) {
|
|
|
|
ret = PTR_ERR(tu);
|
2019-03-14 04:30:50 +00:00
|
|
|
/* This must return -ENOMEM otherwise there is a bug */
|
|
|
|
WARN_ON_ONCE(ret != -ENOMEM);
|
2012-04-11 10:30:43 +00:00
|
|
|
goto fail_address_parse;
|
|
|
|
}
|
|
|
|
tu->offset = offset;
|
uprobes: Support SDT markers having reference count (semaphore)
Userspace Statically Defined Tracepoints[1] are dtrace style markers
inside userspace applications. Applications like PostgreSQL, MySQL,
Pthread, Perl, Python, Java, Ruby, Node.js, libvirt, QEMU, glib etc
have these markers embedded in them. These markers are added by developer
at important places in the code. Each marker source expands to a single
nop instruction in the compiled code but there may be additional
overhead for computing the marker arguments which expands to couple of
instructions. In case the overhead is more, execution of it can be
omitted by runtime if() condition when no one is tracing on the marker:
if (reference_counter > 0) {
Execute marker instructions;
}
Default value of reference counter is 0. Tracer has to increment the
reference counter before tracing on a marker and decrement it when
done with the tracing.
Implement the reference counter logic in core uprobe. User will be
able to use it from trace_uprobe as well as from kernel module. New
trace_uprobe definition with reference counter will now be:
<path>:<offset>[(ref_ctr_offset)]
where ref_ctr_offset is an optional field. For kernel module, new
variant of uprobe_register() has been introduced:
uprobe_register_refctr(inode, offset, ref_ctr_offset, consumer)
No new variant for uprobe_unregister() because it's assumed to have
only one reference counter for one uprobe.
[1] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation
Note: 'reference counter' is called as 'semaphore' in original Dtrace
(or Systemtap, bcc and even in ELF) documentation and code. But the
term 'semaphore' is misleading in this context. This is just a counter
used to hold number of tracers tracing on a marker. This is not really
used for any synchronization. So we are calling it a 'reference counter'
in kernel / perf code.
Link: http://lkml.kernel.org/r/20180820044250.11659-2-ravi.bangoria@linux.ibm.com
Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
[Only trace_uprobe.c]
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Song Liu <songliubraving@fb.com>
Tested-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2018-08-20 04:42:47 +00:00
|
|
|
tu->ref_ctr_offset = ref_ctr_offset;
|
2018-04-23 17:21:34 +00:00
|
|
|
tu->path = path;
|
2018-11-05 09:03:04 +00:00
|
|
|
tu->filename = filename;
|
2012-04-11 10:30:43 +00:00
|
|
|
|
|
|
|
/* parse arguments */
|
2024-09-30 20:26:54 +00:00
|
|
|
for (i = 0; i < argc; i++) {
|
2023-06-06 12:39:56 +00:00
|
|
|
struct traceprobe_parse_context ctx = {
|
|
|
|
.flags = (is_return ? TPARG_FL_RETURN : 0) | TPARG_FL_USER,
|
|
|
|
};
|
|
|
|
|
2019-03-31 23:48:19 +00:00
|
|
|
trace_probe_log_set_index(i + 2);
|
2023-06-06 12:39:56 +00:00
|
|
|
ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], &ctx);
|
2023-08-22 16:25:42 +00:00
|
|
|
traceprobe_finish_parse(&ctx);
|
2018-11-05 09:01:40 +00:00
|
|
|
if (ret)
|
2012-04-11 10:30:43 +00:00
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
|
2021-08-19 04:13:27 +00:00
|
|
|
ptype = is_ret_probe(tu) ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL;
|
|
|
|
ret = traceprobe_set_print_fmt(&tu->tp, ptype);
|
2019-05-31 15:16:56 +00:00
|
|
|
if (ret < 0)
|
|
|
|
goto error;
|
|
|
|
|
2012-04-11 10:30:43 +00:00
|
|
|
ret = register_trace_uprobe(tu);
|
2019-03-31 23:48:19 +00:00
|
|
|
if (!ret)
|
|
|
|
goto out;
|
2012-04-11 10:30:43 +00:00
|
|
|
|
|
|
|
error:
|
|
|
|
free_trace_uprobe(tu);
|
2019-03-31 23:48:19 +00:00
|
|
|
out:
|
|
|
|
trace_probe_log_clear();
|
2012-04-11 10:30:43 +00:00
|
|
|
return ret;
|
|
|
|
|
|
|
|
fail_address_parse:
|
2019-03-31 23:48:19 +00:00
|
|
|
trace_probe_log_clear();
|
2018-04-23 17:21:34 +00:00
|
|
|
path_put(&path);
|
2018-11-05 09:03:04 +00:00
|
|
|
kfree(filename);
|
2012-04-11 10:30:43 +00:00
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2021-02-01 19:48:11 +00:00
|
|
|
int trace_uprobe_create(const char *raw_command)
|
|
|
|
{
|
|
|
|
return trace_probe_create(raw_command, __trace_uprobe_create);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int create_or_delete_trace_uprobe(const char *raw_command)
|
2012-04-11 10:30:43 +00:00
|
|
|
{
|
2018-11-05 09:03:04 +00:00
|
|
|
int ret;
|
2012-04-11 10:30:43 +00:00
|
|
|
|
2021-02-01 19:48:11 +00:00
|
|
|
if (raw_command[0] == '-')
|
|
|
|
return dyn_event_release(raw_command, &trace_uprobe_ops);
|
2012-04-11 10:30:43 +00:00
|
|
|
|
2021-02-01 19:48:11 +00:00
|
|
|
ret = trace_uprobe_create(raw_command);
|
2018-11-05 09:03:04 +00:00
|
|
|
return ret == -ECANCELED ? -EINVAL : ret;
|
2012-04-11 10:30:43 +00:00
|
|
|
}
|
|
|
|
|
2018-11-05 09:03:04 +00:00
|
|
|
static int trace_uprobe_release(struct dyn_event *ev)
|
2012-04-11 10:30:43 +00:00
|
|
|
{
|
2018-11-05 09:03:04 +00:00
|
|
|
struct trace_uprobe *tu = to_trace_uprobe(ev);
|
2012-04-11 10:30:43 +00:00
|
|
|
|
2018-11-05 09:03:04 +00:00
|
|
|
return unregister_trace_uprobe(tu);
|
2012-04-11 10:30:43 +00:00
|
|
|
}
|
|
|
|
|
2018-11-05 09:03:04 +00:00
|
|
|
/* Probes listing interfaces */
|
|
|
|
static int trace_uprobe_show(struct seq_file *m, struct dyn_event *ev)
|
2012-04-11 10:30:43 +00:00
|
|
|
{
|
2018-11-05 09:03:04 +00:00
|
|
|
struct trace_uprobe *tu = to_trace_uprobe(ev);
|
2013-03-30 18:48:09 +00:00
|
|
|
char c = is_ret_probe(tu) ? 'r' : 'p';
|
2012-04-11 10:30:43 +00:00
|
|
|
int i;
|
|
|
|
|
2019-05-31 15:17:47 +00:00
|
|
|
seq_printf(m, "%c:%s/%s %s:0x%0*lx", c, trace_probe_group_name(&tu->tp),
|
|
|
|
trace_probe_name(&tu->tp), tu->filename,
|
2018-03-15 08:27:56 +00:00
|
|
|
(int)(sizeof(void *) * 2), tu->offset);
|
2012-04-11 10:30:43 +00:00
|
|
|
|
uprobes: Support SDT markers having reference count (semaphore)
Userspace Statically Defined Tracepoints[1] are dtrace style markers
inside userspace applications. Applications like PostgreSQL, MySQL,
Pthread, Perl, Python, Java, Ruby, Node.js, libvirt, QEMU, glib etc
have these markers embedded in them. These markers are added by developer
at important places in the code. Each marker source expands to a single
nop instruction in the compiled code but there may be additional
overhead for computing the marker arguments which expands to couple of
instructions. In case the overhead is more, execution of it can be
omitted by runtime if() condition when no one is tracing on the marker:
if (reference_counter > 0) {
Execute marker instructions;
}
Default value of reference counter is 0. Tracer has to increment the
reference counter before tracing on a marker and decrement it when
done with the tracing.
Implement the reference counter logic in core uprobe. User will be
able to use it from trace_uprobe as well as from kernel module. New
trace_uprobe definition with reference counter will now be:
<path>:<offset>[(ref_ctr_offset)]
where ref_ctr_offset is an optional field. For kernel module, new
variant of uprobe_register() has been introduced:
uprobe_register_refctr(inode, offset, ref_ctr_offset, consumer)
No new variant for uprobe_unregister() because it's assumed to have
only one reference counter for one uprobe.
[1] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation
Note: 'reference counter' is called as 'semaphore' in original Dtrace
(or Systemtap, bcc and even in ELF) documentation and code. But the
term 'semaphore' is misleading in this context. This is just a counter
used to hold number of tracers tracing on a marker. This is not really
used for any synchronization. So we are calling it a 'reference counter'
in kernel / perf code.
Link: http://lkml.kernel.org/r/20180820044250.11659-2-ravi.bangoria@linux.ibm.com
Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
[Only trace_uprobe.c]
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Song Liu <songliubraving@fb.com>
Tested-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2018-08-20 04:42:47 +00:00
|
|
|
if (tu->ref_ctr_offset)
|
|
|
|
seq_printf(m, "(0x%lx)", tu->ref_ctr_offset);
|
|
|
|
|
2013-07-03 06:42:53 +00:00
|
|
|
for (i = 0; i < tu->tp.nr_args; i++)
|
|
|
|
seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm);
|
2012-04-11 10:30:43 +00:00
|
|
|
|
2014-11-08 20:42:10 +00:00
|
|
|
seq_putc(m, '\n');
|
2012-04-11 10:30:43 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-11-05 09:03:04 +00:00
|
|
|
static int probes_seq_show(struct seq_file *m, void *v)
|
|
|
|
{
|
|
|
|
struct dyn_event *ev = v;
|
|
|
|
|
|
|
|
if (!is_trace_uprobe(ev))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
return trace_uprobe_show(m, ev);
|
|
|
|
}
|
|
|
|
|
2012-04-11 10:30:43 +00:00
|
|
|
static const struct seq_operations probes_seq_op = {
|
2018-11-05 09:03:04 +00:00
|
|
|
.start = dyn_event_seq_start,
|
|
|
|
.next = dyn_event_seq_next,
|
|
|
|
.stop = dyn_event_seq_stop,
|
|
|
|
.show = probes_seq_show
|
2012-04-11 10:30:43 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
static int probes_open(struct inode *inode, struct file *file)
|
|
|
|
{
|
2013-07-04 03:33:51 +00:00
|
|
|
int ret;
|
|
|
|
|
2019-10-11 21:22:50 +00:00
|
|
|
ret = security_locked_down(LOCKDOWN_TRACEFS);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
2013-07-04 03:33:51 +00:00
|
|
|
if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
|
2018-11-05 09:03:04 +00:00
|
|
|
ret = dyn_events_release_all(&trace_uprobe_ops);
|
2013-07-04 03:33:51 +00:00
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
}
|
2012-04-11 10:30:43 +00:00
|
|
|
|
|
|
|
return seq_open(file, &probes_seq_op);
|
|
|
|
}
|
|
|
|
|
|
|
|
static ssize_t probes_write(struct file *file, const char __user *buffer,
|
|
|
|
size_t count, loff_t *ppos)
|
|
|
|
{
|
2018-11-05 09:03:04 +00:00
|
|
|
return trace_parse_run_command(file, buffer, count, ppos,
|
|
|
|
create_or_delete_trace_uprobe);
|
2012-04-11 10:30:43 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static const struct file_operations uprobe_events_ops = {
|
|
|
|
.owner = THIS_MODULE,
|
|
|
|
.open = probes_open,
|
|
|
|
.read = seq_read,
|
|
|
|
.llseek = seq_lseek,
|
|
|
|
.release = seq_release,
|
|
|
|
.write = probes_write,
|
|
|
|
};
|
|
|
|
|
|
|
|
/* Probes profiling interfaces */
|
|
|
|
static int probes_profile_seq_show(struct seq_file *m, void *v)
|
|
|
|
{
|
2018-11-05 09:03:04 +00:00
|
|
|
struct dyn_event *ev = v;
|
|
|
|
struct trace_uprobe *tu;
|
2024-08-13 20:34:09 +00:00
|
|
|
unsigned long nhits;
|
|
|
|
int cpu;
|
2018-11-05 09:03:04 +00:00
|
|
|
|
|
|
|
if (!is_trace_uprobe(ev))
|
|
|
|
return 0;
|
2012-04-11 10:30:43 +00:00
|
|
|
|
2018-11-05 09:03:04 +00:00
|
|
|
tu = to_trace_uprobe(ev);
|
2024-08-13 20:34:09 +00:00
|
|
|
|
|
|
|
nhits = 0;
|
|
|
|
for_each_possible_cpu(cpu) {
|
|
|
|
nhits += per_cpu(*tu->nhits, cpu);
|
|
|
|
}
|
|
|
|
|
2014-04-08 21:26:21 +00:00
|
|
|
seq_printf(m, " %s %-44s %15lu\n", tu->filename,
|
2024-08-13 20:34:09 +00:00
|
|
|
trace_probe_name(&tu->tp), nhits);
|
2012-04-11 10:30:43 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static const struct seq_operations profile_seq_op = {
|
2018-11-05 09:03:04 +00:00
|
|
|
.start = dyn_event_seq_start,
|
|
|
|
.next = dyn_event_seq_next,
|
|
|
|
.stop = dyn_event_seq_stop,
|
2012-04-11 10:30:43 +00:00
|
|
|
.show = probes_profile_seq_show
|
|
|
|
};
|
|
|
|
|
|
|
|
static int profile_open(struct inode *inode, struct file *file)
|
|
|
|
{
|
2019-10-11 21:22:50 +00:00
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = security_locked_down(LOCKDOWN_TRACEFS);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
2012-04-11 10:30:43 +00:00
|
|
|
return seq_open(file, &profile_seq_op);
|
|
|
|
}
|
|
|
|
|
|
|
|
static const struct file_operations uprobe_profile_ops = {
|
|
|
|
.owner = THIS_MODULE,
|
|
|
|
.open = profile_open,
|
|
|
|
.read = seq_read,
|
|
|
|
.llseek = seq_lseek,
|
|
|
|
.release = seq_release,
|
|
|
|
};
|
|
|
|
|
2013-07-03 07:40:28 +00:00
|
|
|
struct uprobe_cpu_buffer {
|
|
|
|
struct mutex mutex;
|
|
|
|
void *buf;
|
2024-03-18 18:17:26 +00:00
|
|
|
int dsize;
|
2013-07-03 07:40:28 +00:00
|
|
|
};
|
|
|
|
static struct uprobe_cpu_buffer __percpu *uprobe_cpu_buffer;
|
|
|
|
static int uprobe_buffer_refcnt;
|
uprobe: avoid out-of-bounds memory access of fetching args
Uprobe needs to fetch args into a percpu buffer, and then copy to ring
buffer to avoid non-atomic context problem.
Sometimes user-space strings, arrays can be very large, but the size of
percpu buffer is only page size. And store_trace_args() won't check
whether these data exceeds a single page or not, caused out-of-bounds
memory access.
It could be reproduced by following steps:
1. build kernel with CONFIG_KASAN enabled
2. save follow program as test.c
```
\#include <stdio.h>
\#include <stdlib.h>
\#include <string.h>
// If string length large than MAX_STRING_SIZE, the fetch_store_strlen()
// will return 0, cause __get_data_size() return shorter size, and
// store_trace_args() will not trigger out-of-bounds access.
// So make string length less than 4096.
\#define STRLEN 4093
void generate_string(char *str, int n)
{
int i;
for (i = 0; i < n; ++i)
{
char c = i % 26 + 'a';
str[i] = c;
}
str[n-1] = '\0';
}
void print_string(char *str)
{
printf("%s\n", str);
}
int main()
{
char tmp[STRLEN];
generate_string(tmp, STRLEN);
print_string(tmp);
return 0;
}
```
3. compile program
`gcc -o test test.c`
4. get the offset of `print_string()`
```
objdump -t test | grep -w print_string
0000000000401199 g F .text 000000000000001b print_string
```
5. configure uprobe with offset 0x1199
```
off=0x1199
cd /sys/kernel/debug/tracing/
echo "p /root/test:${off} arg1=+0(%di):ustring arg2=\$comm arg3=+0(%di):ustring"
> uprobe_events
echo 1 > events/uprobes/enable
echo 1 > tracing_on
```
6. run `test`, and kasan will report error.
==================================================================
BUG: KASAN: use-after-free in strncpy_from_user+0x1d6/0x1f0
Write of size 8 at addr ffff88812311c004 by task test/499CPU: 0 UID: 0 PID: 499 Comm: test Not tainted 6.12.0-rc3+ #18
Hardware name: Red Hat KVM, BIOS 1.16.0-4.al8 04/01/2014
Call Trace:
<TASK>
dump_stack_lvl+0x55/0x70
print_address_description.constprop.0+0x27/0x310
kasan_report+0x10f/0x120
? strncpy_from_user+0x1d6/0x1f0
strncpy_from_user+0x1d6/0x1f0
? rmqueue.constprop.0+0x70d/0x2ad0
process_fetch_insn+0xb26/0x1470
? __pfx_process_fetch_insn+0x10/0x10
? _raw_spin_lock+0x85/0xe0
? __pfx__raw_spin_lock+0x10/0x10
? __pte_offset_map+0x1f/0x2d0
? unwind_next_frame+0xc5f/0x1f80
? arch_stack_walk+0x68/0xf0
? is_bpf_text_address+0x23/0x30
? kernel_text_address.part.0+0xbb/0xd0
? __kernel_text_address+0x66/0xb0
? unwind_get_return_address+0x5e/0xa0
? __pfx_stack_trace_consume_entry+0x10/0x10
? arch_stack_walk+0xa2/0xf0
? _raw_spin_lock_irqsave+0x8b/0xf0
? __pfx__raw_spin_lock_irqsave+0x10/0x10
? depot_alloc_stack+0x4c/0x1f0
? _raw_spin_unlock_irqrestore+0xe/0x30
? stack_depot_save_flags+0x35d/0x4f0
? kasan_save_stack+0x34/0x50
? kasan_save_stack+0x24/0x50
? mutex_lock+0x91/0xe0
? __pfx_mutex_lock+0x10/0x10
prepare_uprobe_buffer.part.0+0x2cd/0x500
uprobe_dispatcher+0x2c3/0x6a0
? __pfx_uprobe_dispatcher+0x10/0x10
? __kasan_slab_alloc+0x4d/0x90
handler_chain+0xdd/0x3e0
handle_swbp+0x26e/0x3d0
? __pfx_handle_swbp+0x10/0x10
? uprobe_pre_sstep_notifier+0x151/0x1b0
irqentry_exit_to_user_mode+0xe2/0x1b0
asm_exc_int3+0x39/0x40
RIP: 0033:0x401199
Code: 01 c2 0f b6 45 fb 88 02 83 45 fc 01 8b 45 fc 3b 45 e4 7c b7 8b 45 e4 48 98 48 8d 50 ff 48 8b 45 e8 48 01 d0 ce
RSP: 002b:00007ffdf00576a8 EFLAGS: 00000206
RAX: 00007ffdf00576b0 RBX: 0000000000000000 RCX: 0000000000000ff2
RDX: 0000000000000ffc RSI: 0000000000000ffd RDI: 00007ffdf00576b0
RBP: 00007ffdf00586b0 R08: 00007feb2f9c0d20 R09: 00007feb2f9c0d20
R10: 0000000000000001 R11: 0000000000000202 R12: 0000000000401040
R13: 00007ffdf0058780 R14: 0000000000000000 R15: 0000000000000000
</TASK>
This commit enforces the buffer's maxlen less than a page-size to avoid
store_trace_args() out-of-memory access.
Link: https://lore.kernel.org/all/20241015060148.1108331-1-mqaio@linux.alibaba.com/
Fixes: dcad1a204f72 ("tracing/uprobes: Fetch args before reserving a ring buffer")
Signed-off-by: Qiao Ma <mqaio@linux.alibaba.com>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2024-10-15 06:01:48 +00:00
|
|
|
/* Each per-CPU buffer is one page; fetched args must never exceed this. */
#define MAX_UCB_BUFFER_SIZE	PAGE_SIZE
|
2013-07-03 07:40:28 +00:00
|
|
|
|
|
|
|
/*
 * Allocate the per-CPU argument buffers: the percpu control structures
 * plus one NUMA-local page per possible CPU.
 *
 * Returns 0 on success or -ENOMEM; on failure everything already
 * allocated is freed before returning.
 */
static int uprobe_buffer_init(void)
{
	int cpu, err_cpu;

	uprobe_cpu_buffer = alloc_percpu(struct uprobe_cpu_buffer);
	if (uprobe_cpu_buffer == NULL)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		/* Allocate the data page on the CPU's own node when possible. */
		struct page *p = alloc_pages_node(cpu_to_node(cpu),
						  GFP_KERNEL, 0);
		if (p == NULL) {
			err_cpu = cpu;
			goto err;
		}
		per_cpu_ptr(uprobe_cpu_buffer, cpu)->buf = page_address(p);
		mutex_init(&per_cpu_ptr(uprobe_cpu_buffer, cpu)->mutex);
	}

	return 0;

err:
	/* Unwind only the CPUs that were populated before the failure. */
	for_each_possible_cpu(cpu) {
		if (cpu == err_cpu)
			break;
		free_page((unsigned long)per_cpu_ptr(uprobe_cpu_buffer, cpu)->buf);
	}

	free_percpu(uprobe_cpu_buffer);
	return -ENOMEM;
}
|
|
|
|
|
|
|
|
static int uprobe_buffer_enable(void)
|
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
BUG_ON(!mutex_is_locked(&event_mutex));
|
|
|
|
|
|
|
|
if (uprobe_buffer_refcnt++ == 0) {
|
|
|
|
ret = uprobe_buffer_init();
|
|
|
|
if (ret < 0)
|
|
|
|
uprobe_buffer_refcnt--;
|
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void uprobe_buffer_disable(void)
|
|
|
|
{
|
2014-04-17 08:05:19 +00:00
|
|
|
int cpu;
|
|
|
|
|
2013-07-03 07:40:28 +00:00
|
|
|
BUG_ON(!mutex_is_locked(&event_mutex));
|
|
|
|
|
|
|
|
if (--uprobe_buffer_refcnt == 0) {
|
2014-04-17 08:05:19 +00:00
|
|
|
for_each_possible_cpu(cpu)
|
|
|
|
free_page((unsigned long)per_cpu_ptr(uprobe_cpu_buffer,
|
|
|
|
cpu)->buf);
|
|
|
|
|
2013-07-03 07:40:28 +00:00
|
|
|
free_percpu(uprobe_cpu_buffer);
|
|
|
|
uprobe_cpu_buffer = NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct uprobe_cpu_buffer *uprobe_buffer_get(void)
|
|
|
|
{
|
|
|
|
struct uprobe_cpu_buffer *ucb;
|
|
|
|
int cpu;
|
|
|
|
|
|
|
|
cpu = raw_smp_processor_id();
|
|
|
|
ucb = per_cpu_ptr(uprobe_cpu_buffer, cpu);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Use per-cpu buffers for fastest access, but we might migrate
|
|
|
|
* so the mutex makes sure we have sole access to it.
|
|
|
|
*/
|
|
|
|
mutex_lock(&ucb->mutex);
|
|
|
|
|
|
|
|
return ucb;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb)
|
|
|
|
{
|
2024-03-18 18:17:27 +00:00
|
|
|
if (!ucb)
|
|
|
|
return;
|
2013-07-03 07:40:28 +00:00
|
|
|
mutex_unlock(&ucb->mutex);
|
|
|
|
}
|
|
|
|
|
2024-03-18 18:17:26 +00:00
|
|
|
/*
 * Lazily fetch the probe's arguments into a locked per-CPU buffer.
 *
 * On first call for a given dispatch, *ucbp is NULL: the args are fetched
 * and the buffer is cached in *ucbp so that subsequent handlers for the
 * same hit reuse it. The caller is responsible for the final
 * uprobe_buffer_put(). Must not be called under rcu_read_lock() since
 * uprobe_buffer_get() takes a mutex.
 */
static struct uprobe_cpu_buffer *prepare_uprobe_buffer(struct trace_uprobe *tu,
						       struct pt_regs *regs,
						       struct uprobe_cpu_buffer **ucbp)
{
	struct uprobe_cpu_buffer *ucb;
	int dsize, esize;

	/* Already prepared earlier in this dispatch: reuse it. */
	if (*ucbp)
		return *ucbp;

	esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
	dsize = __get_data_size(&tu->tp, regs, NULL);

	ucb = uprobe_buffer_get();
	ucb->dsize = tu->tp.size + dsize;

	/*
	 * The buffer is only one page; clamp so store_trace_args() cannot
	 * write past it (dsize shrinks by the same amount as ucb->dsize).
	 */
	if (WARN_ON_ONCE(ucb->dsize > MAX_UCB_BUFFER_SIZE)) {
		ucb->dsize = MAX_UCB_BUFFER_SIZE;
		dsize = MAX_UCB_BUFFER_SIZE - tu->tp.size;
	}

	store_trace_args(ucb->buf, &tu->tp, regs, NULL, esize, dsize);

	*ucbp = ucb;
	return ucb;
}
|
|
|
|
|
2014-01-17 08:08:36 +00:00
|
|
|
/*
 * Write one trace event for @trace_file: reserve ring-buffer space for the
 * entry header plus the pre-fetched args in @ucb, fill in the probe (and,
 * for uretprobes, return) address, and commit. @func is only meaningful
 * when is_ret_probe(tu).
 */
static void __uprobe_trace_func(struct trace_uprobe *tu,
				unsigned long func, struct pt_regs *regs,
				struct uprobe_cpu_buffer *ucb,
				struct trace_event_file *trace_file)
{
	struct uprobe_trace_entry_head *entry;
	struct trace_event_buffer fbuffer;
	void *data;
	int size, esize;
	struct trace_event_call *call = trace_probe_event_call(&tu->tp);

	WARN_ON(call != trace_file->event_call);

	if (trace_trigger_soft_disabled(trace_file))
		return;

	/* Header size differs between entry and return probes. */
	esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
	size = esize + ucb->dsize;
	entry = trace_event_buffer_reserve(&fbuffer, trace_file, size);
	if (!entry)
		return;

	if (is_ret_probe(tu)) {
		/* Return probe records both call site and return site. */
		entry->vaddr[0] = func;
		entry->vaddr[1] = instruction_pointer(regs);
		data = DATAOF_TRACE_ENTRY(entry, true);
	} else {
		entry->vaddr[0] = instruction_pointer(regs);
		data = DATAOF_TRACE_ENTRY(entry, false);
	}

	/* Copy the args that prepare_uprobe_buffer() already fetched. */
	memcpy(data, ucb->buf, ucb->dsize);

	trace_event_buffer_commit(&fbuffer);
}
|
2013-02-04 16:48:34 +00:00
|
|
|
|
2013-03-30 17:02:12 +00:00
|
|
|
/* uprobe handler */
static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs,
			     struct uprobe_cpu_buffer **ucbp)
{
	struct event_file_link *link;
	struct uprobe_cpu_buffer *ucb;

	/* Return probes are handled by uretprobe_trace_func() instead. */
	if (is_ret_probe(tu))
		return 0;

	/*
	 * Prepare the buffer before taking rcu_read_lock():
	 * prepare_uprobe_buffer() may sleep on the buffer mutex.
	 */
	ucb = prepare_uprobe_buffer(tu, regs, ucbp);

	rcu_read_lock();
	trace_probe_for_each_link_rcu(link, &tu->tp)
		__uprobe_trace_func(tu, 0, regs, ucb, link->file);
	rcu_read_unlock();

	return 0;
}
|
|
|
|
|
2013-03-30 17:25:23 +00:00
|
|
|
/* uretprobe handler: emit one event per linked trace file, recording @func. */
static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func,
				 struct pt_regs *regs,
				 struct uprobe_cpu_buffer **ucbp)
{
	struct event_file_link *link;
	struct uprobe_cpu_buffer *ucb;

	/* Must run before rcu_read_lock(): may sleep on the buffer mutex. */
	ucb = prepare_uprobe_buffer(tu, regs, ucbp);

	rcu_read_lock();
	trace_probe_for_each_link_rcu(link, &tu->tp)
		__uprobe_trace_func(tu, func, regs, ucb, link->file);
	rcu_read_unlock();
}
|
|
|
|
|
2012-04-11 10:30:43 +00:00
|
|
|
/* Event entry printers */
static enum print_line_t
print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event)
{
	struct uprobe_trace_entry_head *entry;
	struct trace_seq *s = &iter->seq;
	struct trace_uprobe *tu;
	u8 *data;

	entry = (struct uprobe_trace_entry_head *)iter->ent;
	/* Map the generic trace_event back to its owning trace_uprobe. */
	tu = trace_uprobe_primary_from_call(
		container_of(event, struct trace_event_call, event));
	if (unlikely(!tu))
		goto out;

	if (is_ret_probe(tu)) {
		/* Return probe: print "name: (retaddr <- funcaddr)". */
		trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)",
				 trace_probe_name(&tu->tp),
				 entry->vaddr[1], entry->vaddr[0]);
		data = DATAOF_TRACE_ENTRY(entry, true);
	} else {
		/* Entry probe: print "name: (addr)". */
		trace_seq_printf(s, "%s: (0x%lx)",
				 trace_probe_name(&tu->tp),
				 entry->vaddr[0]);
		data = DATAOF_TRACE_ENTRY(entry, false);
	}

	/* Append the fetched probe arguments after the address info. */
	if (trace_probe_print_args(s, tu->tp.args, tu->tp.nr_args, data, entry) < 0)
		goto out;

	trace_seq_putc(s, '\n');

 out:
	return trace_handle_return(s);
}
|
|
|
|
|
2024-09-03 17:45:58 +00:00
|
|
|
/* Per-consumer mm filter: return true if @mm should be probed. */
typedef bool (*filter_func_t)(struct uprobe_consumer *self, struct mm_struct *mm);
|
2013-02-04 16:11:58 +00:00
|
|
|
|
2019-06-19 15:07:20 +00:00
|
|
|
static int trace_uprobe_enable(struct trace_uprobe *tu, filter_func_t filter)
|
2012-04-11 10:30:43 +00:00
|
|
|
{
|
2024-08-01 13:27:34 +00:00
|
|
|
struct inode *inode = d_real_inode(tu->path.dentry);
|
|
|
|
struct uprobe *uprobe;
|
2014-01-17 08:08:38 +00:00
|
|
|
|
2019-06-19 15:07:20 +00:00
|
|
|
tu->consumer.filter = filter;
|
2024-08-01 13:27:34 +00:00
|
|
|
uprobe = uprobe_register(inode, tu->offset, tu->ref_ctr_offset, &tu->consumer);
|
|
|
|
if (IS_ERR(uprobe))
|
|
|
|
return PTR_ERR(uprobe);
|
2019-06-19 15:07:20 +00:00
|
|
|
|
2024-08-01 13:27:34 +00:00
|
|
|
tu->uprobe = uprobe;
|
|
|
|
return 0;
|
2019-06-19 15:07:20 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Unregister every uprobe attached to this trace_probe (which may be a
 * group of trace_uprobes sharing one event).
 *
 * Uses the two-phase teardown: uprobe_unregister_nosync() for each
 * registered consumer, then a single uprobe_unregister_sync() to wait for
 * in-flight handlers — batching the (expensive) synchronization once per
 * event instead of once per probe.
 */
static void __probe_event_disable(struct trace_probe *tp)
{
	struct trace_uprobe *tu;
	bool sync = false;

	tu = container_of(tp, struct trace_uprobe, tp);
	/* By now the perf filter list must have been drained. */
	WARN_ON(!uprobe_filter_is_empty(tu->tp.event->filter));

	list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) {
		if (!tu->uprobe)
			continue;	/* never enabled or already torn down */

		uprobe_unregister_nosync(tu->uprobe, &tu->consumer);
		sync = true;
		tu->uprobe = NULL;
	}
	if (sync)
		uprobe_unregister_sync();
}
|
|
|
|
|
|
|
|
/*
 * Enable the event either for ftrace (@file != NULL) or for perf
 * (@file == NULL, with a perf @filter).  The two modes are mutually
 * exclusive: enabling one while the other holds the event fails with
 * -EINTR (long-standing return value for this conflict).
 *
 * Only the first enabler actually registers the uprobes; later enablers
 * just join via the TP file list / PROFILE flag.  On failure everything
 * done here is rolled back.  Returns 0 or -errno.
 */
static int probe_event_enable(struct trace_event_call *call,
			struct trace_event_file *file, filter_func_t filter)
{
	struct trace_probe *tp;
	struct trace_uprobe *tu;
	bool enabled;
	int ret;

	tp = trace_probe_primary_from_call(call);
	if (WARN_ON_ONCE(!tp))
		return -ENODEV;
	enabled = trace_probe_is_enabled(tp);

	/* This may also change "enabled" state */
	if (file) {
		/* ftrace path: refuse if perf already owns the event. */
		if (trace_probe_test_flag(tp, TP_FLAG_PROFILE))
			return -EINTR;

		ret = trace_probe_add_file(tp, file);
		if (ret < 0)
			return ret;
	} else {
		/* perf path: refuse if ftrace already owns the event. */
		if (trace_probe_test_flag(tp, TP_FLAG_TRACE))
			return -EINTR;

		trace_probe_set_flag(tp, TP_FLAG_PROFILE);
	}

	tu = container_of(tp, struct trace_uprobe, tp);
	WARN_ON(!uprobe_filter_is_empty(tu->tp.event->filter));

	/* Someone else already registered the uprobes; we just joined. */
	if (enabled)
		return 0;

	ret = uprobe_buffer_enable();
	if (ret)
		goto err_flags;

	/* Register every probe in the group; unwind all on any failure. */
	list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) {
		ret = trace_uprobe_enable(tu, filter);
		if (ret) {
			__probe_event_disable(tp);
			goto err_buffer;
		}
	}

	return 0;

 err_buffer:
	uprobe_buffer_disable();

 err_flags:
	/* Undo whichever enable-mode bookkeeping we did above. */
	if (file)
		trace_probe_remove_file(tp, file);
	else
		trace_probe_clear_flag(tp, TP_FLAG_PROFILE);

	return ret;
}
|
|
|
|
|
2019-06-19 15:07:20 +00:00
|
|
|
/*
 * Disable the event for ftrace (@file != NULL) or perf (@file == NULL).
 * The uprobes are only torn down once the last user (file or PROFILE
 * flag) is gone; intermediate removals return early.
 */
static void probe_event_disable(struct trace_event_call *call,
				struct trace_event_file *file)
{
	struct trace_probe *tp;

	tp = trace_probe_primary_from_call(call);
	if (WARN_ON_ONCE(!tp))
		return;

	if (!trace_probe_is_enabled(tp))
		return;

	if (file) {
		if (trace_probe_remove_file(tp, file) < 0)
			return;

		/* Other trace instances still use the event — keep it live. */
		if (trace_probe_is_enabled(tp))
			return;
	} else
		trace_probe_clear_flag(tp, TP_FLAG_PROFILE);

	__probe_event_disable(tp);
	uprobe_buffer_disable();
}
|
|
|
|
|
2015-05-05 15:45:27 +00:00
|
|
|
/*
 * Describe the event's fixed fields to the tracing core so the format
 * file and filters work.  Return probes expose FUNC and RETIP; plain
 * probes expose only IP.  Probe-argument fields follow the fixed header.
 *
 * Note: DEFINE_FIELD expands using the local 'ret' and 'field' variables
 * and returns on error — do not remove them even though they look unused.
 */
static int uprobe_event_define_fields(struct trace_event_call *event_call)
{
	int ret, size;
	struct uprobe_trace_entry_head field;
	struct trace_uprobe *tu;

	tu = trace_uprobe_primary_from_call(event_call);
	if (unlikely(!tu))
		return -ENODEV;

	if (is_ret_probe(tu)) {
		DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_FUNC, 0);
		DEFINE_FIELD(unsigned long, vaddr[1], FIELD_STRING_RETIP, 0);
		size = SIZEOF_TRACE_ENTRY(true);
	} else {
		DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_IP, 0);
		size = SIZEOF_TRACE_ENTRY(false);
	}

	return traceprobe_define_arg_fields(event_call, size, &tu->tp);
}
|
|
|
|
|
|
|
|
#ifdef CONFIG_PERF_EVENTS
|
2013-02-04 16:11:58 +00:00
|
|
|
static bool
|
|
|
|
__uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm)
|
|
|
|
{
|
|
|
|
struct perf_event *event;
|
|
|
|
|
|
|
|
list_for_each_entry(event, &filter->perf_events, hw.tp_list) {
|
2015-03-05 21:10:19 +00:00
|
|
|
if (event->hw.target->mm == mm)
|
2013-02-04 16:11:58 +00:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2013-02-04 18:05:43 +00:00
|
|
|
static inline bool
|
2020-01-10 01:45:39 +00:00
|
|
|
trace_uprobe_filter_event(struct trace_uprobe_filter *filter,
|
|
|
|
struct perf_event *event)
|
2013-02-04 18:05:43 +00:00
|
|
|
{
|
2020-01-10 01:45:39 +00:00
|
|
|
return __uprobe_perf_filter(filter, event->hw.target->mm);
|
2013-02-04 18:05:43 +00:00
|
|
|
}
|
|
|
|
|
2020-01-10 01:45:39 +00:00
|
|
|
/*
 * Detach @event from the filter.  Returns true ("done") when the
 * remaining filter state still covers the event's target, so the caller
 * does not need to call uprobe_apply() to strip breakpoints.
 */
static bool trace_uprobe_filter_remove(struct trace_uprobe_filter *filter,
				       struct perf_event *event)
{
	bool done;

	write_lock(&filter->rwlock);
	if (event->hw.target) {
		/* per-task event */
		list_del(&event->hw.tp_list);
		done = filter->nr_systemwide ||
			(event->hw.target->flags & PF_EXITING) ||
			trace_uprobe_filter_event(filter, event);
	} else {
		/* system-wide event: drop the counter; done unless it hit 0 */
		filter->nr_systemwide--;
		done = filter->nr_systemwide;
	}
	write_unlock(&filter->rwlock);

	return done;
}
|
|
|
|
|
2020-01-10 01:45:39 +00:00
|
|
|
/* This returns true if the filter always covers target mm */
static bool trace_uprobe_filter_add(struct trace_uprobe_filter *filter,
				    struct perf_event *event)
{
	bool done;

	write_lock(&filter->rwlock);
	if (event->hw.target) {
		/*
		 * event->parent != NULL means copy_process(), we can avoid
		 * uprobe_apply(). current->mm must be probed and we can rely
		 * on dup_mmap() which preserves the already installed bp's.
		 *
		 * attr.enable_on_exec means that exec/mmap will install the
		 * breakpoints we need.
		 */
		done = filter->nr_systemwide ||
			event->parent || event->attr.enable_on_exec ||
			trace_uprobe_filter_event(filter, event);
		list_add(&event->hw.tp_list, &filter->perf_events);
	} else {
		/* system-wide event: "done" if one was already counted */
		done = filter->nr_systemwide;
		filter->nr_systemwide++;
	}
	write_unlock(&filter->rwlock);

	return done;
}
|
|
|
|
|
2020-01-10 01:45:39 +00:00
|
|
|
/*
 * TRACE_REG_PERF_CLOSE: a perf event detaches from the uprobe event.
 * If the filter no longer covers the event's target, ask the uprobes
 * core (uprobe_apply(..., false)) to remove now-unneeded breakpoints
 * for every probe in the group.
 */
static int uprobe_perf_close(struct trace_event_call *call,
			     struct perf_event *event)
{
	struct trace_probe *tp;
	struct trace_uprobe *tu;
	int ret = 0;

	tp = trace_probe_primary_from_call(call);
	if (WARN_ON_ONCE(!tp))
		return -ENODEV;

	tu = container_of(tp, struct trace_uprobe, tp);
	if (trace_uprobe_filter_remove(tu->tp.event->filter, event))
		return 0;	/* remaining consumers still need the bp's */

	list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) {
		ret = uprobe_apply(tu->uprobe, &tu->consumer, false);
		if (ret)
			break;
	}

	return ret;
}
|
2020-01-10 01:45:39 +00:00
|
|
|
|
|
|
|
/*
 * TRACE_REG_PERF_OPEN: a perf event attaches to the uprobe event.
 * If the filter did not already cover the event's target, install the
 * breakpoints (uprobe_apply(..., true)) for each probe in the group,
 * rolling back via uprobe_perf_close() on failure.
 */
static int uprobe_perf_open(struct trace_event_call *call,
			    struct perf_event *event)
{
	struct trace_probe *tp;
	struct trace_uprobe *tu;
	int err = 0;

	tp = trace_probe_primary_from_call(call);
	if (WARN_ON_ONCE(!tp))
		return -ENODEV;

	tu = container_of(tp, struct trace_uprobe, tp);
	if (trace_uprobe_filter_add(tu->tp.event->filter, event))
		return 0;	/* filter already covered this target */

	list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) {
		err = uprobe_apply(tu->uprobe, &tu->consumer, true);
		if (err) {
			uprobe_perf_close(call, event);
			break;
		}
	}

	return err;
}
|
|
|
|
|
2024-09-03 17:45:58 +00:00
|
|
|
/*
 * Consumer filter callback handed to uprobe_register() on the perf path:
 * decide whether the breakpoint should fire for tasks using @mm.
 */
static bool uprobe_perf_filter(struct uprobe_consumer *uc, struct mm_struct *mm)
{
	struct trace_uprobe_filter *filter;
	struct trace_uprobe *tu;
	int ret;

	tu = container_of(uc, struct trace_uprobe, consumer);
	filter = tu->tp.event->filter;

	/*
	 * speculative short-circuiting check to avoid unnecessarily taking
	 * filter->rwlock below, if the uprobe has system-wide consumer
	 */
	if (READ_ONCE(filter->nr_systemwide))
		return true;

	read_lock(&filter->rwlock);
	ret = __uprobe_perf_filter(filter, mm);
	read_unlock(&filter->rwlock);

	return ret;
}
|
|
|
|
|
2014-01-17 08:08:36 +00:00
|
|
|
/*
 * Common perf-path handler for uprobes and uretprobes.  Runs any attached
 * BPF programs first (which may veto the sample), then emits a perf
 * sample containing the fixed header plus the probe-argument data from
 * the per-cpu uprobe buffer.
 *
 * @func is the probed function address for return probes, 0 otherwise.
 * @ucbp is the lazily-filled per-cpu argument buffer slot shared with the
 * trace path.
 */
static void __uprobe_perf_func(struct trace_uprobe *tu,
			       unsigned long func, struct pt_regs *regs,
			       struct uprobe_cpu_buffer **ucbp)
{
	struct trace_event_call *call = trace_probe_event_call(&tu->tp);
	struct uprobe_trace_entry_head *entry;
	struct uprobe_cpu_buffer *ucb;
	struct hlist_head *head;
	void *data;
	int size, esize;
	int rctx;

#ifdef CONFIG_BPF_EVENTS
	if (bpf_prog_array_valid(call)) {
		const struct bpf_prog_array *array;
		u32 ret;

		/*
		 * uprobe BPF programs may sleep, so the prog array is
		 * protected by tasks-trace RCU rather than plain RCU.
		 */
		rcu_read_lock_trace();
		array = rcu_dereference_check(call->prog_array, rcu_read_lock_trace_held());
		ret = bpf_prog_run_array_uprobe(array, regs, bpf_prog_run);
		rcu_read_unlock_trace();
		if (!ret)
			return;	/* BPF filtered this event out */
	}
#endif /* CONFIG_BPF_EVENTS */

	esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));

	ucb = prepare_uprobe_buffer(tu, regs, ucbp);
	size = esize + ucb->dsize;
	/* Round the record up to u64 alignment as perf requires. */
	size = ALIGN(size + sizeof(u32), sizeof(u64)) - sizeof(u32);
	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
		return;

	preempt_disable();
	head = this_cpu_ptr(call->perf_events);
	if (hlist_empty(head))
		goto out;	/* no perf consumers on this CPU */

	entry = perf_trace_buf_alloc(size, NULL, &rctx);
	if (!entry)
		goto out;

	if (is_ret_probe(tu)) {
		entry->vaddr[0] = func;
		entry->vaddr[1] = instruction_pointer(regs);
		data = DATAOF_TRACE_ENTRY(entry, true);
	} else {
		entry->vaddr[0] = instruction_pointer(regs);
		data = DATAOF_TRACE_ENTRY(entry, false);
	}

	memcpy(data, ucb->buf, ucb->dsize);

	/* Zero the alignment padding so no stack garbage leaks to perf. */
	if (size - esize > ucb->dsize)
		memset(data + ucb->dsize, 0, size - esize - ucb->dsize);

	perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
			      head, NULL);
 out:
	preempt_enable();
}
|
|
|
|
|
|
|
|
/* uprobe profile handler */
static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs,
			    struct uprobe_cpu_buffer **ucbp)
{
	/*
	 * If no perf consumer wants current->mm, tell the uprobes core to
	 * remove the breakpoint from this mm entirely.
	 */
	if (!uprobe_perf_filter(&tu->consumer, current->mm))
		return UPROBE_HANDLER_REMOVE;

	/* Return probes are sampled at function exit, not here. */
	if (!is_ret_probe(tu))
		__uprobe_perf_func(tu, 0, regs, ucbp);
	return 0;
}
|
2013-03-30 17:25:23 +00:00
|
|
|
|
|
|
|
/* uretprobe profile handler: sample at function return, @func = entry addr. */
static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func,
				struct pt_regs *regs,
				struct uprobe_cpu_buffer **ucbp)
{
	__uprobe_perf_func(tu, func, regs, ucbp);
}
|
bpf: introduce bpf subcommand BPF_TASK_FD_QUERY
Currently, suppose a userspace application has loaded a bpf program
and attached it to a tracepoint/kprobe/uprobe, and a bpf
introspection tool, e.g., bpftool, wants to show which bpf program
is attached to which tracepoint/kprobe/uprobe. Such attachment
information will be really useful to understand the overall bpf
deployment in the system.
There is a name field (16 bytes) for each program, which could
be used to encode the attachment point. There are some drawbacks
for this approaches. First, bpftool user (e.g., an admin) may not
really understand the association between the name and the
attachment point. Second, if one program is attached to multiple
places, encoding a proper name which can imply all these
attachments becomes difficult.
This patch introduces a new bpf subcommand BPF_TASK_FD_QUERY.
Given a pid and fd, if the <pid, fd> is associated with a
tracepoint/kprobe/uprobe perf event, BPF_TASK_FD_QUERY will return
. prog_id
. tracepoint name, or
. k[ret]probe funcname + offset or kernel addr, or
. u[ret]probe filename + offset
to the userspace.
The user can use "bpftool prog" to find more information about
bpf program itself with prog_id.
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-05-24 18:21:09 +00:00
|
|
|
|
|
|
|
/*
 * BPF_TASK_FD_QUERY support: report what a perf event's uprobe is
 * attached to (probe type, binary path, and file offset).
 *
 * @perf_type_tracepoint selects lookup by event name/group; otherwise the
 * trace_uprobe is resolved directly from the perf event's tp_event.
 * *probe_addr is always 0 for uprobes — only the file offset is
 * meaningful.  Returns 0 or -EINVAL if no matching probe exists.
 */
int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type,
			const char **filename, u64 *probe_offset,
			u64 *probe_addr, bool perf_type_tracepoint)
{
	const char *pevent = trace_event_name(event->tp_event);
	const char *group = event->tp_event->class->system;
	struct trace_uprobe *tu;

	if (perf_type_tracepoint)
		tu = find_probe_event(pevent, group);
	else
		tu = trace_uprobe_primary_from_call(event->tp_event);
	if (!tu)
		return -EINVAL;

	*fd_type = is_ret_probe(tu) ? BPF_FD_TYPE_URETPROBE
				    : BPF_FD_TYPE_UPROBE;
	*filename = tu->filename;
	*probe_offset = tu->offset;
	*probe_addr = 0;
	return 0;
}
|
2012-04-11 10:30:43 +00:00
|
|
|
#endif /* CONFIG_PERF_EVENTS */
|
|
|
|
|
2014-01-17 08:08:38 +00:00
|
|
|
/*
 * trace_event_call ->reg callback: dispatch enable/disable requests from
 * the ftrace side (REGISTER/UNREGISTER, @data is the trace_event_file)
 * and the perf side (PERF_* ops, @data is the perf_event).
 */
static int
trace_uprobe_register(struct trace_event_call *event, enum trace_reg type,
		      void *data)
{
	struct trace_event_file *file = data;

	switch (type) {
	case TRACE_REG_REGISTER:
		return probe_event_enable(event, file, NULL);

	case TRACE_REG_UNREGISTER:
		probe_event_disable(event, file);
		return 0;

#ifdef CONFIG_PERF_EVENTS
	case TRACE_REG_PERF_REGISTER:
		/* perf supplies a task filter; ftrace (above) does not. */
		return probe_event_enable(event, NULL, uprobe_perf_filter);

	case TRACE_REG_PERF_UNREGISTER:
		probe_event_disable(event, NULL);
		return 0;

	case TRACE_REG_PERF_OPEN:
		return uprobe_perf_open(event, data);

	case TRACE_REG_PERF_CLOSE:
		return uprobe_perf_close(event, data);

#endif
	default:
		return 0;
	}
}
|
|
|
|
|
2024-10-18 20:22:51 +00:00
|
|
|
/*
 * uprobe consumer handler: called by the uprobes core when the probed
 * instruction traps.  Bumps the per-cpu hit counter, stashes dispatch
 * data for fetch-arg resolution, then forwards the hit to the ftrace
 * and/or perf paths depending on which flags are set.
 */
static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs,
			     __u64 *data)
{
	struct trace_uprobe *tu;
	struct uprobe_dispatch_data udd;
	struct uprobe_cpu_buffer *ucb = NULL;
	int ret = 0;

	tu = container_of(con, struct trace_uprobe, consumer);

	this_cpu_inc(*tu->nhits);

	/* Made visible to the fetch-arg code via current->utask->vaddr. */
	udd.tu = tu;
	udd.bp_addr = instruction_pointer(regs);

	current->utask->vaddr = (unsigned long) &udd;

	if (WARN_ON_ONCE(!uprobe_cpu_buffer))
		return 0;

	if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE))
		ret |= uprobe_trace_func(tu, regs, &ucb);

#ifdef CONFIG_PERF_EVENTS
	if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE))
		ret |= uprobe_perf_func(tu, regs, &ucb);
#endif
	/* ucb stays NULL if neither handler needed the argument buffer. */
	uprobe_buffer_put(ucb);
	return ret;
}
|
|
|
|
|
2013-03-30 17:25:23 +00:00
|
|
|
/*
 * uretprobe consumer handler: like uprobe_dispatcher() but invoked on
 * function return.  @func is the address of the probed function entry;
 * return value is always 0 (return probes cannot request removal).
 */
static int uretprobe_dispatcher(struct uprobe_consumer *con,
				unsigned long func, struct pt_regs *regs,
				__u64 *data)
{
	struct trace_uprobe *tu;
	struct uprobe_dispatch_data udd;
	struct uprobe_cpu_buffer *ucb = NULL;

	tu = container_of(con, struct trace_uprobe, consumer);

	/* Made visible to the fetch-arg code via current->utask->vaddr. */
	udd.tu = tu;
	udd.bp_addr = func;

	current->utask->vaddr = (unsigned long) &udd;

	if (WARN_ON_ONCE(!uprobe_cpu_buffer))
		return 0;

	if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE))
		uretprobe_trace_func(tu, func, regs, &ucb);

#ifdef CONFIG_PERF_EVENTS
	if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE))
		uretprobe_perf_func(tu, func, regs, &ucb);
#endif
	uprobe_buffer_put(ucb);
	return 0;
}
|
|
|
|
|
2012-04-11 10:30:43 +00:00
|
|
|
static struct trace_event_functions uprobe_funcs = {
|
|
|
|
.trace = print_uprobe_event
|
|
|
|
};
|
|
|
|
|
2019-10-24 20:26:59 +00:00
|
|
|
static struct trace_event_fields uprobe_fields_array[] = {
|
|
|
|
{ .type = TRACE_FUNCTION_TYPE,
|
|
|
|
.define_fields = uprobe_event_define_fields },
|
|
|
|
{}
|
|
|
|
};
|
|
|
|
|
2019-05-31 15:17:57 +00:00
|
|
|
static inline void init_trace_event_call(struct trace_uprobe *tu)
|
2012-04-11 10:30:43 +00:00
|
|
|
{
|
2019-05-31 15:17:57 +00:00
|
|
|
struct trace_event_call *call = trace_probe_event_call(&tu->tp);
|
2012-04-11 10:30:43 +00:00
|
|
|
call->event.funcs = &uprobe_funcs;
|
2019-10-24 20:26:59 +00:00
|
|
|
call->class->fields_array = uprobe_fields_array;
|
2012-04-11 10:30:43 +00:00
|
|
|
|
2019-05-07 16:15:45 +00:00
|
|
|
call->flags = TRACE_EVENT_FL_UPROBE | TRACE_EVENT_FL_CAP_ANY;
|
2017-12-06 22:45:16 +00:00
|
|
|
call->class->reg = trace_uprobe_register;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int register_uprobe_event(struct trace_uprobe *tu)
|
|
|
|
{
|
2019-05-31 15:17:57 +00:00
|
|
|
init_trace_event_call(tu);
|
2012-04-11 10:30:43 +00:00
|
|
|
|
2019-05-31 15:17:16 +00:00
|
|
|
return trace_probe_register_event_call(&tu->tp);
|
2012-04-11 10:30:43 +00:00
|
|
|
}
|
|
|
|
|
2013-07-04 03:33:51 +00:00
|
|
|
static int unregister_uprobe_event(struct trace_uprobe *tu)
|
2012-04-11 10:30:43 +00:00
|
|
|
{
|
2019-05-31 15:17:16 +00:00
|
|
|
return trace_probe_unregister_event_call(&tu->tp);
|
2012-04-11 10:30:43 +00:00
|
|
|
}
|
|
|
|
|
2017-12-06 22:45:16 +00:00
|
|
|
#ifdef CONFIG_PERF_EVENTS
|
|
|
|
struct trace_event_call *
|
2018-10-02 05:36:36 +00:00
|
|
|
create_local_trace_uprobe(char *name, unsigned long offs,
|
|
|
|
unsigned long ref_ctr_offset, bool is_return)
|
2017-12-06 22:45:16 +00:00
|
|
|
{
|
2021-08-19 04:13:27 +00:00
|
|
|
enum probe_print_type ptype;
|
2017-12-06 22:45:16 +00:00
|
|
|
struct trace_uprobe *tu;
|
|
|
|
struct path path;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = kern_path(name, LOOKUP_FOLLOW, &path);
|
|
|
|
if (ret)
|
|
|
|
return ERR_PTR(ret);
|
|
|
|
|
2018-04-23 17:21:34 +00:00
|
|
|
if (!d_is_reg(path.dentry)) {
|
|
|
|
path_put(&path);
|
2017-12-06 22:45:16 +00:00
|
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2018-11-05 09:03:04 +00:00
|
|
|
* local trace_kprobes are not added to dyn_event, so they are never
|
2017-12-06 22:45:16 +00:00
|
|
|
* searched in find_trace_kprobe(). Therefore, there is no concern of
|
|
|
|
* duplicated name "DUMMY_EVENT" here.
|
|
|
|
*/
|
|
|
|
tu = alloc_trace_uprobe(UPROBE_EVENT_SYSTEM, "DUMMY_EVENT", 0,
|
|
|
|
is_return);
|
|
|
|
|
|
|
|
if (IS_ERR(tu)) {
|
|
|
|
pr_info("Failed to allocate trace_uprobe.(%d)\n",
|
|
|
|
(int)PTR_ERR(tu));
|
2018-04-23 17:21:34 +00:00
|
|
|
path_put(&path);
|
2017-12-06 22:45:16 +00:00
|
|
|
return ERR_CAST(tu);
|
|
|
|
}
|
|
|
|
|
|
|
|
tu->offset = offs;
|
2018-04-23 17:21:34 +00:00
|
|
|
tu->path = path;
|
2018-10-02 05:36:36 +00:00
|
|
|
tu->ref_ctr_offset = ref_ctr_offset;
|
2017-12-06 22:45:16 +00:00
|
|
|
tu->filename = kstrdup(name, GFP_KERNEL);
|
2021-12-14 01:28:02 +00:00
|
|
|
if (!tu->filename) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
|
2019-05-31 15:17:57 +00:00
|
|
|
init_trace_event_call(tu);
|
2017-12-06 22:45:16 +00:00
|
|
|
|
2021-08-19 04:13:27 +00:00
|
|
|
ptype = is_ret_probe(tu) ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL;
|
|
|
|
if (traceprobe_set_print_fmt(&tu->tp, ptype) < 0) {
|
2017-12-06 22:45:16 +00:00
|
|
|
ret = -ENOMEM;
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
|
2019-05-31 15:17:57 +00:00
|
|
|
return trace_probe_event_call(&tu->tp);
|
2017-12-06 22:45:16 +00:00
|
|
|
error:
|
|
|
|
free_trace_uprobe(tu);
|
|
|
|
return ERR_PTR(ret);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Release a perf-local uprobe event previously created by
 * create_local_trace_uprobe().
 */
void destroy_local_trace_uprobe(struct trace_event_call *event_call)
{
	struct trace_uprobe *tu = trace_uprobe_primary_from_call(event_call);

	free_trace_uprobe(tu);
}
|
|
|
|
#endif /* CONFIG_PERF_EVENTS */
|
|
|
|
|
2021-01-12 04:50:08 +00:00
|
|
|
/* Make a trace interface for controlling probe points */
|
2012-04-11 10:30:43 +00:00
|
|
|
static __init int init_uprobe_trace(void)
|
|
|
|
{
|
2018-11-05 09:03:04 +00:00
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = dyn_event_register(&trace_uprobe_ops);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
2012-04-11 10:30:43 +00:00
|
|
|
|
2020-07-12 01:10:36 +00:00
|
|
|
ret = tracing_init_dentry();
|
|
|
|
if (ret)
|
2012-04-11 10:30:43 +00:00
|
|
|
return 0;
|
|
|
|
|
2021-08-18 15:24:51 +00:00
|
|
|
trace_create_file("uprobe_events", TRACE_MODE_WRITE, NULL,
|
2012-04-11 10:30:43 +00:00
|
|
|
NULL, &uprobe_events_ops);
|
|
|
|
/* Profile interface */
|
2021-08-18 15:24:51 +00:00
|
|
|
trace_create_file("uprobe_profile", TRACE_MODE_READ, NULL,
|
2012-04-11 10:30:43 +00:00
|
|
|
NULL, &uprobe_profile_ops);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
fs_initcall(init_uprobe_trace);
|