2022-03-15 23:00:38 +09:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
|
|
|
/*
|
|
|
|
* fprobe - Simple ftrace probe wrapper for function entry.
|
|
|
|
*/
|
|
|
|
#define pr_fmt(fmt) "fprobe: " fmt
|
|
|
|
|
|
|
|
#include <linux/err.h>
|
|
|
|
#include <linux/fprobe.h>
|
|
|
|
#include <linux/kallsyms.h>
|
|
|
|
#include <linux/kprobes.h>
|
2022-03-15 23:01:48 +09:00
|
|
|
#include <linux/rethook.h>
|
2022-03-15 23:00:38 +09:00
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/sort.h>
|
|
|
|
|
2022-03-15 23:01:48 +09:00
|
|
|
#include "trace.h"
|
|
|
|
|
|
|
|
struct fprobe_rethook_node {
|
|
|
|
struct rethook_node node;
|
|
|
|
unsigned long entry_ip;
|
2023-05-17 11:45:08 +08:00
|
|
|
unsigned long entry_parent_ip;
|
2023-02-02 00:56:01 +09:00
|
|
|
char data[];
|
2022-03-15 23:01:48 +09:00
|
|
|
};
|
|
|
|
|
2023-05-17 11:45:07 +08:00
|
|
|
static inline void __fprobe_handler(unsigned long ip, unsigned long parent_ip,
|
|
|
|
struct ftrace_ops *ops, struct ftrace_regs *fregs)
|
2022-03-15 23:00:38 +09:00
|
|
|
{
|
2022-03-15 23:01:48 +09:00
|
|
|
struct fprobe_rethook_node *fpr;
|
2023-02-02 00:56:01 +09:00
|
|
|
struct rethook_node *rh = NULL;
|
2022-03-15 23:00:38 +09:00
|
|
|
struct fprobe *fp;
|
2023-02-02 00:56:01 +09:00
|
|
|
void *entry_data = NULL;
|
2023-05-17 11:45:07 +08:00
|
|
|
int ret = 0;
|
2022-03-15 23:00:38 +09:00
|
|
|
|
|
|
|
fp = container_of(ops, struct fprobe, ops);
|
|
|
|
|
2022-03-15 23:01:48 +09:00
|
|
|
if (fp->exit_handler) {
|
|
|
|
rh = rethook_try_get(fp->rethook);
|
|
|
|
if (!rh) {
|
|
|
|
fp->nmissed++;
|
2023-05-17 11:45:07 +08:00
|
|
|
return;
|
2022-03-15 23:01:48 +09:00
|
|
|
}
|
|
|
|
fpr = container_of(rh, struct fprobe_rethook_node, node);
|
|
|
|
fpr->entry_ip = ip;
|
2023-05-17 11:45:08 +08:00
|
|
|
fpr->entry_parent_ip = parent_ip;
|
2023-02-02 00:56:01 +09:00
|
|
|
if (fp->entry_data_size)
|
|
|
|
entry_data = fpr->data;
|
2022-03-15 23:01:48 +09:00
|
|
|
}
|
|
|
|
|
2023-02-02 00:56:01 +09:00
|
|
|
if (fp->entry_handler)
|
2023-06-06 21:39:55 +09:00
|
|
|
ret = fp->entry_handler(fp, ip, parent_ip, ftrace_get_regs(fregs), entry_data);
|
2023-02-02 00:56:01 +09:00
|
|
|
|
2023-02-02 00:56:38 +09:00
|
|
|
/* If entry_handler returns !0, nmissed is not counted. */
|
|
|
|
if (rh) {
|
|
|
|
if (ret)
|
|
|
|
rethook_recycle(rh);
|
|
|
|
else
|
|
|
|
rethook_hook(rh, ftrace_get_regs(fregs), true);
|
|
|
|
}
|
2023-05-17 11:45:07 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void fprobe_handler(unsigned long ip, unsigned long parent_ip,
|
|
|
|
struct ftrace_ops *ops, struct ftrace_regs *fregs)
|
|
|
|
{
|
|
|
|
struct fprobe *fp;
|
|
|
|
int bit;
|
|
|
|
|
|
|
|
fp = container_of(ops, struct fprobe, ops);
|
|
|
|
if (fprobe_disabled(fp))
|
|
|
|
return;
|
|
|
|
|
|
|
|
/* recursion detection has to go before any traceable function and
|
|
|
|
* all functions before this point should be marked as notrace
|
|
|
|
*/
|
|
|
|
bit = ftrace_test_recursion_trylock(ip, parent_ip);
|
|
|
|
if (bit < 0) {
|
|
|
|
fp->nmissed++;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
__fprobe_handler(ip, parent_ip, ops, fregs);
|
2022-03-15 23:00:38 +09:00
|
|
|
ftrace_test_recursion_unlock(bit);
|
2023-05-17 11:45:07 +08:00
|
|
|
|
2022-03-15 23:00:38 +09:00
|
|
|
}
|
|
|
|
NOKPROBE_SYMBOL(fprobe_handler);
|
|
|
|
|
2022-03-15 23:02:11 +09:00
|
|
|
static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip,
|
|
|
|
struct ftrace_ops *ops, struct ftrace_regs *fregs)
|
|
|
|
{
|
2023-05-17 11:45:07 +08:00
|
|
|
struct fprobe *fp;
|
|
|
|
int bit;
|
|
|
|
|
|
|
|
fp = container_of(ops, struct fprobe, ops);
|
|
|
|
if (fprobe_disabled(fp))
|
|
|
|
return;
|
|
|
|
|
|
|
|
/* recursion detection has to go before any traceable function and
|
|
|
|
* all functions called before this point should be marked as notrace
|
|
|
|
*/
|
|
|
|
bit = ftrace_test_recursion_trylock(ip, parent_ip);
|
|
|
|
if (bit < 0) {
|
|
|
|
fp->nmissed++;
|
|
|
|
return;
|
|
|
|
}
|
2022-03-15 23:02:11 +09:00
|
|
|
|
2023-07-08 01:38:03 +09:00
|
|
|
/*
|
|
|
|
* This user handler is shared with other kprobes and is not expected to be
|
|
|
|
* called recursively. So if any other kprobe handler is running, this will
|
|
|
|
* exit as kprobe does. See the section 'Share the callbacks with kprobes'
|
|
|
|
* in Documentation/trace/fprobe.rst for more information.
|
|
|
|
*/
|
2022-03-15 23:02:11 +09:00
|
|
|
if (unlikely(kprobe_running())) {
|
|
|
|
fp->nmissed++;
|
2023-07-03 17:23:36 +08:00
|
|
|
goto recursion_unlock;
|
2022-03-15 23:02:11 +09:00
|
|
|
}
|
2023-05-17 11:45:07 +08:00
|
|
|
|
2022-03-15 23:02:11 +09:00
|
|
|
kprobe_busy_begin();
|
2023-05-17 11:45:07 +08:00
|
|
|
__fprobe_handler(ip, parent_ip, ops, fregs);
|
2022-03-15 23:02:11 +09:00
|
|
|
kprobe_busy_end();
|
2023-07-03 17:23:36 +08:00
|
|
|
|
|
|
|
recursion_unlock:
|
2023-05-17 11:45:07 +08:00
|
|
|
ftrace_test_recursion_unlock(bit);
|
2022-03-15 23:02:11 +09:00
|
|
|
}
|
|
|
|
|
2022-03-15 23:01:48 +09:00
|
|
|
static void fprobe_exit_handler(struct rethook_node *rh, void *data,
|
2023-06-06 21:39:55 +09:00
|
|
|
unsigned long ret_ip, struct pt_regs *regs)
|
2022-03-15 23:01:48 +09:00
|
|
|
{
|
|
|
|
struct fprobe *fp = (struct fprobe *)data;
|
|
|
|
struct fprobe_rethook_node *fpr;
|
2023-05-17 11:45:08 +08:00
|
|
|
int bit;
|
2022-03-15 23:01:48 +09:00
|
|
|
|
|
|
|
if (!fp || fprobe_disabled(fp))
|
|
|
|
return;
|
|
|
|
|
|
|
|
fpr = container_of(rh, struct fprobe_rethook_node, node);
|
|
|
|
|
2023-05-17 11:45:08 +08:00
|
|
|
/*
|
|
|
|
* we need to assure no calls to traceable functions in-between the
|
|
|
|
* end of fprobe_handler and the beginning of fprobe_exit_handler.
|
|
|
|
*/
|
|
|
|
bit = ftrace_test_recursion_trylock(fpr->entry_ip, fpr->entry_parent_ip);
|
|
|
|
if (bit < 0) {
|
|
|
|
fp->nmissed++;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2023-06-06 21:39:55 +09:00
|
|
|
fp->exit_handler(fp, fpr->entry_ip, ret_ip, regs,
|
2023-02-02 00:56:01 +09:00
|
|
|
fp->entry_data_size ? (void *)fpr->data : NULL);
|
2023-05-17 11:45:08 +08:00
|
|
|
ftrace_test_recursion_unlock(bit);
|
2022-03-15 23:01:48 +09:00
|
|
|
}
|
|
|
|
NOKPROBE_SYMBOL(fprobe_exit_handler);
|
|
|
|
|
2022-05-10 14:26:14 +02:00
|
|
|
/*
 * sort() comparator for an array of C-string pointers: @a and @b each point
 * at one array element (a "const char *"), and the strings they refer to are
 * ordered with strcmp().
 */
static int symbols_cmp(const void *a, const void *b)
{
	const char *const *lhs = a;
	const char *const *rhs = b;

	return strcmp(*lhs, *rhs);
}
|
|
|
|
|
2022-03-15 23:00:38 +09:00
|
|
|
/* Convert ftrace location address from symbols */
|
|
|
|
static unsigned long *get_ftrace_locations(const char **syms, int num)
|
|
|
|
{
|
|
|
|
unsigned long *addrs;
|
|
|
|
|
|
|
|
/* Convert symbols to symbol address */
|
|
|
|
addrs = kcalloc(num, sizeof(*addrs), GFP_KERNEL);
|
|
|
|
if (!addrs)
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
2022-05-10 14:26:14 +02:00
|
|
|
/* ftrace_lookup_symbols expects sorted symbols */
|
|
|
|
sort(syms, num, sizeof(*syms), symbols_cmp, NULL);
|
2022-03-15 23:00:38 +09:00
|
|
|
|
2022-05-10 14:26:14 +02:00
|
|
|
if (!ftrace_lookup_symbols(syms, num, addrs))
|
|
|
|
return addrs;
|
2022-03-15 23:00:38 +09:00
|
|
|
|
|
|
|
kfree(addrs);
|
|
|
|
return ERR_PTR(-ENOENT);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void fprobe_init(struct fprobe *fp)
|
|
|
|
{
|
|
|
|
fp->nmissed = 0;
|
2022-03-15 23:02:11 +09:00
|
|
|
if (fprobe_shared_with_kprobes(fp))
|
|
|
|
fp->ops.func = fprobe_kprobe_handler;
|
|
|
|
else
|
|
|
|
fp->ops.func = fprobe_handler;
|
2022-03-15 23:00:38 +09:00
|
|
|
fp->ops.flags |= FTRACE_OPS_FL_SAVE_REGS;
|
|
|
|
}
|
|
|
|
|
2022-03-15 23:01:48 +09:00
|
|
|
static int fprobe_init_rethook(struct fprobe *fp, int num)
|
|
|
|
{
|
kprobes: kretprobe scalability improvement
kretprobe is using freelist to manage return-instances, but freelist,
as LIFO queue based on singly linked list, scales badly and reduces
the overall throughput of kretprobed routines, especially for high
contention scenarios.
Here's a typical throughput test of sys_prctl (counts in 10 seconds,
measured with perf stat -a -I 10000 -e syscalls:sys_enter_prctl):
OS: Debian 10 X86_64, Linux 6.5rc7 with freelist
HW: XEON 8336C x 2, 64 cores/128 threads, DDR4 3200MT/s
1T 2T 4T 8T 16T 24T
24150045 29317964 15446741 12494489 18287272 17708768
32T 48T 64T 72T 96T 128T
16200682 13737658 11645677 11269858 10470118 9931051
This patch introduces objpool to replace freelist. objpool is a
high performance queue, which can bring near-linear scalability
to kretprobed routines. Tests of kretprobe throughput show the
biggest ratio as 159x of original freelist. Here's the result:
1T 2T 4T 8T 16T
native: 41186213 82336866 164250978 328662645 658810299
freelist: 24150045 29317964 15446741 12494489 18287272
objpool: 23926730 48010314 96125218 191782984 385091769
32T 48T 64T 96T 128T
native: 1330338351 1969957941 2512291791 2615754135 2671040914
freelist: 16200682 13737658 11645677 10470118 9931051
objpool: 764481096 1147149781 1456220214 1502109662 1579015050
Testings on 96-core ARM64 output similarly, but with the biggest
ratio up to 448x:
OS: Debian 10 AARCH64, Linux 6.5rc7
HW: Kunpeng-920 96 cores/2 sockets/4 NUMA nodes, DDR4 2933 MT/s
1T 2T 4T 8T 16T
native: . 30066096 63569843 126194076 257447289 505800181
freelist: 16152090 11064397 11124068 7215768 5663013
objpool: 13997541 28032100 55726624 110099926 221498787
24T 32T 48T 64T 96T
native: 763305277 1015925192 1521075123 2033009392 3021013752
freelist: 5015810 4602893 3766792 3382478 2945292
objpool: 328192025 439439564 668534502 887401381 1319972072
Link: https://lore.kernel.org/all/20231017135654.82270-4-wuqiang.matt@bytedance.com/
Signed-off-by: wuqiang.matt <wuqiang.matt@bytedance.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2023-10-17 21:56:52 +08:00
|
|
|
int size;
|
2022-03-15 23:01:48 +09:00
|
|
|
|
|
|
|
if (!fp->exit_handler) {
|
|
|
|
fp->rethook = NULL;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Initialize rethook if needed */
|
2023-02-02 00:56:19 +09:00
|
|
|
if (fp->nr_maxactive)
|
2024-03-01 09:18:24 +09:00
|
|
|
num = fp->nr_maxactive;
|
2023-02-02 00:56:19 +09:00
|
|
|
else
|
2024-03-01 09:18:24 +09:00
|
|
|
num *= num_possible_cpus() * 2;
|
|
|
|
if (num <= 0)
|
2023-10-17 08:49:45 +09:00
|
|
|
return -EINVAL;
|
2022-03-15 23:01:48 +09:00
|
|
|
|
2024-03-01 09:18:24 +09:00
|
|
|
size = sizeof(struct fprobe_rethook_node) + fp->entry_data_size;
|
|
|
|
|
kprobes: kretprobe scalability improvement
kretprobe is using freelist to manage return-instances, but freelist,
as LIFO queue based on singly linked list, scales badly and reduces
the overall throughput of kretprobed routines, especially for high
contention scenarios.
Here's a typical throughput test of sys_prctl (counts in 10 seconds,
measured with perf stat -a -I 10000 -e syscalls:sys_enter_prctl):
OS: Debian 10 X86_64, Linux 6.5rc7 with freelist
HW: XEON 8336C x 2, 64 cores/128 threads, DDR4 3200MT/s
1T 2T 4T 8T 16T 24T
24150045 29317964 15446741 12494489 18287272 17708768
32T 48T 64T 72T 96T 128T
16200682 13737658 11645677 11269858 10470118 9931051
This patch introduces objpool to replace freelist. objpool is a
high performance queue, which can bring near-linear scalability
to kretprobed routines. Tests of kretprobe throughput show the
biggest ratio as 159x of original freelist. Here's the result:
1T 2T 4T 8T 16T
native: 41186213 82336866 164250978 328662645 658810299
freelist: 24150045 29317964 15446741 12494489 18287272
objpool: 23926730 48010314 96125218 191782984 385091769
32T 48T 64T 96T 128T
native: 1330338351 1969957941 2512291791 2615754135 2671040914
freelist: 16200682 13737658 11645677 10470118 9931051
objpool: 764481096 1147149781 1456220214 1502109662 1579015050
Testings on 96-core ARM64 output similarly, but with the biggest
ratio up to 448x:
OS: Debian 10 AARCH64, Linux 6.5rc7
HW: Kunpeng-920 96 cores/2 sockets/4 NUMA nodes, DDR4 2933 MT/s
1T 2T 4T 8T 16T
native: . 30066096 63569843 126194076 257447289 505800181
freelist: 16152090 11064397 11124068 7215768 5663013
objpool: 13997541 28032100 55726624 110099926 221498787
24T 32T 48T 64T 96T
native: 763305277 1015925192 1521075123 2033009392 3021013752
freelist: 5015810 4602893 3766792 3382478 2945292
objpool: 328192025 439439564 668534502 887401381 1319972072
Link: https://lore.kernel.org/all/20231017135654.82270-4-wuqiang.matt@bytedance.com/
Signed-off-by: wuqiang.matt <wuqiang.matt@bytedance.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2023-10-17 21:56:52 +08:00
|
|
|
/* Initialize rethook */
|
2024-03-01 09:18:24 +09:00
|
|
|
fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler, size, num);
|
kprobes: kretprobe scalability improvement
kretprobe is using freelist to manage return-instances, but freelist,
as LIFO queue based on singly linked list, scales badly and reduces
the overall throughput of kretprobed routines, especially for high
contention scenarios.
Here's a typical throughput test of sys_prctl (counts in 10 seconds,
measured with perf stat -a -I 10000 -e syscalls:sys_enter_prctl):
OS: Debian 10 X86_64, Linux 6.5rc7 with freelist
HW: XEON 8336C x 2, 64 cores/128 threads, DDR4 3200MT/s
1T 2T 4T 8T 16T 24T
24150045 29317964 15446741 12494489 18287272 17708768
32T 48T 64T 72T 96T 128T
16200682 13737658 11645677 11269858 10470118 9931051
This patch introduces objpool to replace freelist. objpool is a
high performance queue, which can bring near-linear scalability
to kretprobed routines. Tests of kretprobe throughput show the
biggest ratio as 159x of original freelist. Here's the result:
1T 2T 4T 8T 16T
native: 41186213 82336866 164250978 328662645 658810299
freelist: 24150045 29317964 15446741 12494489 18287272
objpool: 23926730 48010314 96125218 191782984 385091769
32T 48T 64T 96T 128T
native: 1330338351 1969957941 2512291791 2615754135 2671040914
freelist: 16200682 13737658 11645677 10470118 9931051
objpool: 764481096 1147149781 1456220214 1502109662 1579015050
Testings on 96-core ARM64 output similarly, but with the biggest
ratio up to 448x:
OS: Debian 10 AARCH64, Linux 6.5rc7
HW: Kunpeng-920 96 cores/2 sockets/4 NUMA nodes, DDR4 2933 MT/s
1T 2T 4T 8T 16T
native: . 30066096 63569843 126194076 257447289 505800181
freelist: 16152090 11064397 11124068 7215768 5663013
objpool: 13997541 28032100 55726624 110099926 221498787
24T 32T 48T 64T 96T
native: 763305277 1015925192 1521075123 2033009392 3021013752
freelist: 5015810 4602893 3766792 3382478 2945292
objpool: 328192025 439439564 668534502 887401381 1319972072
Link: https://lore.kernel.org/all/20231017135654.82270-4-wuqiang.matt@bytedance.com/
Signed-off-by: wuqiang.matt <wuqiang.matt@bytedance.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2023-10-17 21:56:52 +08:00
|
|
|
if (IS_ERR(fp->rethook))
|
|
|
|
return PTR_ERR(fp->rethook);
|
|
|
|
|
2022-03-15 23:01:48 +09:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void fprobe_fail_cleanup(struct fprobe *fp)
|
|
|
|
{
|
kprobes: kretprobe scalability improvement
kretprobe is using freelist to manage return-instances, but freelist,
as LIFO queue based on singly linked list, scales badly and reduces
the overall throughput of kretprobed routines, especially for high
contention scenarios.
Here's a typical throughput test of sys_prctl (counts in 10 seconds,
measured with perf stat -a -I 10000 -e syscalls:sys_enter_prctl):
OS: Debian 10 X86_64, Linux 6.5rc7 with freelist
HW: XEON 8336C x 2, 64 cores/128 threads, DDR4 3200MT/s
1T 2T 4T 8T 16T 24T
24150045 29317964 15446741 12494489 18287272 17708768
32T 48T 64T 72T 96T 128T
16200682 13737658 11645677 11269858 10470118 9931051
This patch introduces objpool to replace freelist. objpool is a
high performance queue, which can bring near-linear scalability
to kretprobed routines. Tests of kretprobe throughput show the
biggest ratio as 159x of original freelist. Here's the result:
1T 2T 4T 8T 16T
native: 41186213 82336866 164250978 328662645 658810299
freelist: 24150045 29317964 15446741 12494489 18287272
objpool: 23926730 48010314 96125218 191782984 385091769
32T 48T 64T 96T 128T
native: 1330338351 1969957941 2512291791 2615754135 2671040914
freelist: 16200682 13737658 11645677 10470118 9931051
objpool: 764481096 1147149781 1456220214 1502109662 1579015050
Testings on 96-core ARM64 output similarly, but with the biggest
ratio up to 448x:
OS: Debian 10 AARCH64, Linux 6.5rc7
HW: Kunpeng-920 96 cores/2 sockets/4 NUMA nodes, DDR4 2933 MT/s
1T 2T 4T 8T 16T
native: . 30066096 63569843 126194076 257447289 505800181
freelist: 16152090 11064397 11124068 7215768 5663013
objpool: 13997541 28032100 55726624 110099926 221498787
24T 32T 48T 64T 96T
native: 763305277 1015925192 1521075123 2033009392 3021013752
freelist: 5015810 4602893 3766792 3382478 2945292
objpool: 328192025 439439564 668534502 887401381 1319972072
Link: https://lore.kernel.org/all/20231017135654.82270-4-wuqiang.matt@bytedance.com/
Signed-off-by: wuqiang.matt <wuqiang.matt@bytedance.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2023-10-17 21:56:52 +08:00
|
|
|
if (!IS_ERR_OR_NULL(fp->rethook)) {
|
2022-03-15 23:01:48 +09:00
|
|
|
/* Don't need to cleanup rethook->handler because this is not used. */
|
|
|
|
rethook_free(fp->rethook);
|
|
|
|
fp->rethook = NULL;
|
|
|
|
}
|
|
|
|
ftrace_free_filter(&fp->ops);
|
|
|
|
}
|
|
|
|
|
2022-03-15 23:00:38 +09:00
|
|
|
/**
|
|
|
|
* register_fprobe() - Register fprobe to ftrace by pattern.
|
|
|
|
* @fp: A fprobe data structure to be registered.
|
|
|
|
* @filter: A wildcard pattern of probed symbols.
|
|
|
|
* @notfilter: A wildcard pattern of NOT probed symbols.
|
|
|
|
*
|
|
|
|
* Register @fp to ftrace for enabling the probe on the symbols matched to @filter.
|
|
|
|
* If @notfilter is not NULL, the symbols matched the @notfilter are not probed.
|
|
|
|
*
|
|
|
|
* Return 0 if @fp is registered successfully, -errno if not.
|
|
|
|
*/
|
|
|
|
int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter)
|
|
|
|
{
|
2022-03-15 23:01:48 +09:00
|
|
|
struct ftrace_hash *hash;
|
2022-03-15 23:00:38 +09:00
|
|
|
unsigned char *str;
|
|
|
|
int ret, len;
|
|
|
|
|
|
|
|
if (!fp || !filter)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
fprobe_init(fp);
|
|
|
|
|
|
|
|
len = strlen(filter);
|
|
|
|
str = kstrdup(filter, GFP_KERNEL);
|
|
|
|
ret = ftrace_set_filter(&fp->ops, str, len, 0);
|
|
|
|
kfree(str);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
if (notfilter) {
|
|
|
|
len = strlen(notfilter);
|
|
|
|
str = kstrdup(notfilter, GFP_KERNEL);
|
|
|
|
ret = ftrace_set_notrace(&fp->ops, str, len, 0);
|
|
|
|
kfree(str);
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2022-03-15 23:01:48 +09:00
|
|
|
/* TODO:
|
|
|
|
* correctly calculate the total number of filtered symbols
|
|
|
|
* from both filter and notfilter.
|
|
|
|
*/
|
2022-03-23 16:35:36 +09:00
|
|
|
hash = rcu_access_pointer(fp->ops.local_hash.filter_hash);
|
2022-03-15 23:01:48 +09:00
|
|
|
if (WARN_ON_ONCE(!hash))
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
ret = fprobe_init_rethook(fp, (int)hash->count);
|
|
|
|
if (!ret)
|
|
|
|
ret = register_ftrace_function(&fp->ops);
|
|
|
|
|
2022-03-15 23:00:38 +09:00
|
|
|
out:
|
|
|
|
if (ret)
|
2022-03-15 23:01:48 +09:00
|
|
|
fprobe_fail_cleanup(fp);
|
2022-03-15 23:00:38 +09:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(register_fprobe);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* register_fprobe_ips() - Register fprobe to ftrace by address.
|
|
|
|
* @fp: A fprobe data structure to be registered.
|
|
|
|
* @addrs: An array of target ftrace location addresses.
|
|
|
|
* @num: The number of entries of @addrs.
|
|
|
|
*
|
|
|
|
* Register @fp to ftrace for enabling the probe on the address given by @addrs.
|
|
|
|
* The @addrs must be the addresses of ftrace location address, which may be
|
|
|
|
* the symbol address + arch-dependent offset.
|
|
|
|
* If you unsure what this mean, please use other registration functions.
|
|
|
|
*
|
|
|
|
* Return 0 if @fp is registered successfully, -errno if not.
|
|
|
|
*/
|
|
|
|
int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
if (!fp || !addrs || num <= 0)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
fprobe_init(fp);
|
|
|
|
|
|
|
|
ret = ftrace_set_filter_ips(&fp->ops, addrs, num, 0, 0);
|
2022-03-15 23:01:48 +09:00
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
ret = fprobe_init_rethook(fp, num);
|
2022-03-15 23:00:38 +09:00
|
|
|
if (!ret)
|
|
|
|
ret = register_ftrace_function(&fp->ops);
|
|
|
|
|
|
|
|
if (ret)
|
2022-03-15 23:01:48 +09:00
|
|
|
fprobe_fail_cleanup(fp);
|
2022-03-15 23:00:38 +09:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(register_fprobe_ips);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* register_fprobe_syms() - Register fprobe to ftrace by symbols.
|
|
|
|
* @fp: A fprobe data structure to be registered.
|
|
|
|
* @syms: An array of target symbols.
|
|
|
|
* @num: The number of entries of @syms.
|
|
|
|
*
|
|
|
|
* Register @fp to the symbols given by @syms array. This will be useful if
|
|
|
|
* you are sure the symbols exist in the kernel.
|
|
|
|
*
|
|
|
|
* Return 0 if @fp is registered successfully, -errno if not.
|
|
|
|
*/
|
|
|
|
int register_fprobe_syms(struct fprobe *fp, const char **syms, int num)
|
|
|
|
{
|
|
|
|
unsigned long *addrs;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
if (!fp || !syms || num <= 0)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
addrs = get_ftrace_locations(syms, num);
|
|
|
|
if (IS_ERR(addrs))
|
|
|
|
return PTR_ERR(addrs);
|
|
|
|
|
|
|
|
ret = register_fprobe_ips(fp, addrs, num);
|
|
|
|
|
|
|
|
kfree(addrs);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(register_fprobe_syms);
|
|
|
|
|
2023-06-06 21:39:55 +09:00
|
|
|
bool fprobe_is_registered(struct fprobe *fp)
|
|
|
|
{
|
|
|
|
if (!fp || (fp->ops.saved_func != fprobe_handler &&
|
|
|
|
fp->ops.saved_func != fprobe_kprobe_handler))
|
|
|
|
return false;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2022-03-15 23:00:38 +09:00
|
|
|
/**
|
|
|
|
* unregister_fprobe() - Unregister fprobe from ftrace
|
|
|
|
* @fp: A fprobe data structure to be unregistered.
|
|
|
|
*
|
|
|
|
* Unregister fprobe (and remove ftrace hooks from the function entries).
|
|
|
|
*
|
|
|
|
* Return 0 if @fp is unregistered successfully, -errno if not.
|
|
|
|
*/
|
|
|
|
int unregister_fprobe(struct fprobe *fp)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
2023-06-06 21:39:55 +09:00
|
|
|
if (!fprobe_is_registered(fp))
|
2022-03-15 23:00:38 +09:00
|
|
|
return -EINVAL;
|
|
|
|
|
kprobes: kretprobe scalability improvement
kretprobe is using freelist to manage return-instances, but freelist,
as LIFO queue based on singly linked list, scales badly and reduces
the overall throughput of kretprobed routines, especially for high
contention scenarios.
Here's a typical throughput test of sys_prctl (counts in 10 seconds,
measured with perf stat -a -I 10000 -e syscalls:sys_enter_prctl):
OS: Debian 10 X86_64, Linux 6.5rc7 with freelist
HW: XEON 8336C x 2, 64 cores/128 threads, DDR4 3200MT/s
1T 2T 4T 8T 16T 24T
24150045 29317964 15446741 12494489 18287272 17708768
32T 48T 64T 72T 96T 128T
16200682 13737658 11645677 11269858 10470118 9931051
This patch introduces objpool to replace freelist. objpool is a
high performance queue, which can bring near-linear scalability
to kretprobed routines. Tests of kretprobe throughput show the
biggest ratio as 159x of original freelist. Here's the result:
1T 2T 4T 8T 16T
native: 41186213 82336866 164250978 328662645 658810299
freelist: 24150045 29317964 15446741 12494489 18287272
objpool: 23926730 48010314 96125218 191782984 385091769
32T 48T 64T 96T 128T
native: 1330338351 1969957941 2512291791 2615754135 2671040914
freelist: 16200682 13737658 11645677 10470118 9931051
objpool: 764481096 1147149781 1456220214 1502109662 1579015050
Testings on 96-core ARM64 output similarly, but with the biggest
ratio up to 448x:
OS: Debian 10 AARCH64, Linux 6.5rc7
HW: Kunpeng-920 96 cores/2 sockets/4 NUMA nodes, DDR4 2933 MT/s
1T 2T 4T 8T 16T
native: . 30066096 63569843 126194076 257447289 505800181
freelist: 16152090 11064397 11124068 7215768 5663013
objpool: 13997541 28032100 55726624 110099926 221498787
24T 32T 48T 64T 96T
native: 763305277 1015925192 1521075123 2033009392 3021013752
freelist: 5015810 4602893 3766792 3382478 2945292
objpool: 328192025 439439564 668534502 887401381 1319972072
Link: https://lore.kernel.org/all/20231017135654.82270-4-wuqiang.matt@bytedance.com/
Signed-off-by: wuqiang.matt <wuqiang.matt@bytedance.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2023-10-17 21:56:52 +08:00
|
|
|
if (!IS_ERR_OR_NULL(fp->rethook))
|
2023-07-07 23:03:19 +09:00
|
|
|
rethook_stop(fp->rethook);
|
2022-03-15 23:01:48 +09:00
|
|
|
|
2022-03-15 23:00:38 +09:00
|
|
|
ret = unregister_ftrace_function(&fp->ops);
|
2022-03-15 23:01:48 +09:00
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
2022-03-15 23:00:38 +09:00
|
|
|
|
kprobes: kretprobe scalability improvement
kretprobe is using freelist to manage return-instances, but freelist,
as LIFO queue based on singly linked list, scales badly and reduces
the overall throughput of kretprobed routines, especially for high
contention scenarios.
Here's a typical throughput test of sys_prctl (counts in 10 seconds,
measured with perf stat -a -I 10000 -e syscalls:sys_enter_prctl):
OS: Debian 10 X86_64, Linux 6.5rc7 with freelist
HW: XEON 8336C x 2, 64 cores/128 threads, DDR4 3200MT/s
1T 2T 4T 8T 16T 24T
24150045 29317964 15446741 12494489 18287272 17708768
32T 48T 64T 72T 96T 128T
16200682 13737658 11645677 11269858 10470118 9931051
This patch introduces objpool to replace freelist. objpool is a
high performance queue, which can bring near-linear scalability
to kretprobed routines. Tests of kretprobe throughput show the
biggest ratio as 159x of original freelist. Here's the result:
1T 2T 4T 8T 16T
native: 41186213 82336866 164250978 328662645 658810299
freelist: 24150045 29317964 15446741 12494489 18287272
objpool: 23926730 48010314 96125218 191782984 385091769
32T 48T 64T 96T 128T
native: 1330338351 1969957941 2512291791 2615754135 2671040914
freelist: 16200682 13737658 11645677 10470118 9931051
objpool: 764481096 1147149781 1456220214 1502109662 1579015050
Testings on 96-core ARM64 output similarly, but with the biggest
ratio up to 448x:
OS: Debian 10 AARCH64, Linux 6.5rc7
HW: Kunpeng-920 96 cores/2 sockets/4 NUMA nodes, DDR4 2933 MT/s
1T 2T 4T 8T 16T
native: . 30066096 63569843 126194076 257447289 505800181
freelist: 16152090 11064397 11124068 7215768 5663013
objpool: 13997541 28032100 55726624 110099926 221498787
24T 32T 48T 64T 96T
native: 763305277 1015925192 1521075123 2033009392 3021013752
freelist: 5015810 4602893 3766792 3382478 2945292
objpool: 328192025 439439564 668534502 887401381 1319972072
Link: https://lore.kernel.org/all/20231017135654.82270-4-wuqiang.matt@bytedance.com/
Signed-off-by: wuqiang.matt <wuqiang.matt@bytedance.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2023-10-17 21:56:52 +08:00
|
|
|
if (!IS_ERR_OR_NULL(fp->rethook))
|
2023-06-15 13:52:36 +02:00
|
|
|
rethook_free(fp->rethook);
|
|
|
|
|
2022-03-15 23:01:48 +09:00
|
|
|
ftrace_free_filter(&fp->ops);
|
2022-03-15 23:00:38 +09:00
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(unregister_fprobe);
|