2022-03-15 23:00:50 +09:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
|
|
|
|
|
|
|
#define pr_fmt(fmt) "rethook: " fmt
|
|
|
|
|
|
|
|
#include <linux/bug.h>
|
|
|
|
#include <linux/kallsyms.h>
|
|
|
|
#include <linux/kprobes.h>
|
|
|
|
#include <linux/preempt.h>
|
|
|
|
#include <linux/rethook.h>
|
|
|
|
#include <linux/slab.h>
|
|
|
|
|
|
|
|
/* Return hook list (shadow stack by list) */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This function is called from delayed_put_task_struct() when a task is
|
|
|
|
* dead and cleaned up to recycle any kretprobe instances associated with
|
|
|
|
* this task. These left over instances represent probed functions that
|
|
|
|
* have been called but will never return.
|
|
|
|
*/
|
|
|
|
void rethook_flush_task(struct task_struct *tk)
|
|
|
|
{
|
|
|
|
struct rethook_node *rhn;
|
|
|
|
struct llist_node *node;
|
|
|
|
|
|
|
|
node = __llist_del_all(&tk->rethooks);
|
|
|
|
while (node) {
|
|
|
|
rhn = container_of(node, struct rethook_node, llist);
|
|
|
|
node = node->next;
|
|
|
|
preempt_disable();
|
|
|
|
rethook_recycle(rhn);
|
|
|
|
preempt_enable();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void rethook_free_rcu(struct rcu_head *head)
|
|
|
|
{
|
|
|
|
struct rethook *rh = container_of(head, struct rethook, rcu);
|
kprobes: kretprobe scalability improvement
kretprobe is using freelist to manage return-instances, but freelist,
as LIFO queue based on singly linked list, scales badly and reduces
the overall throughput of kretprobed routines, especially for high
contention scenarios.
Here's a typical throughput test of sys_prctl (counts in 10 seconds,
measured with perf stat -a -I 10000 -e syscalls:sys_enter_prctl):
OS: Debian 10 X86_64, Linux 6.5rc7 with freelist
HW: XEON 8336C x 2, 64 cores/128 threads, DDR4 3200MT/s
1T 2T 4T 8T 16T 24T
24150045 29317964 15446741 12494489 18287272 17708768
32T 48T 64T 72T 96T 128T
16200682 13737658 11645677 11269858 10470118 9931051
This patch introduces objpool to replace freelist. objpool is a
high performance queue, which can bring near-linear scalability
to kretprobed routines. Tests of kretprobe throughput show the
biggest ratio as 159x of original freelist. Here's the result:
1T 2T 4T 8T 16T
native: 41186213 82336866 164250978 328662645 658810299
freelist: 24150045 29317964 15446741 12494489 18287272
objpool: 23926730 48010314 96125218 191782984 385091769
32T 48T 64T 96T 128T
native: 1330338351 1969957941 2512291791 2615754135 2671040914
freelist: 16200682 13737658 11645677 10470118 9931051
objpool: 764481096 1147149781 1456220214 1502109662 1579015050
Testings on 96-core ARM64 output similarly, but with the biggest
ratio up to 448x:
OS: Debian 10 AARCH64, Linux 6.5rc7
HW: Kunpeng-920 96 cores/2 sockets/4 NUMA nodes, DDR4 2933 MT/s
1T 2T 4T 8T 16T
native: . 30066096 63569843 126194076 257447289 505800181
freelist: 16152090 11064397 11124068 7215768 5663013
objpool: 13997541 28032100 55726624 110099926 221498787
24T 32T 48T 64T 96T
native: 763305277 1015925192 1521075123 2033009392 3021013752
freelist: 5015810 4602893 3766792 3382478 2945292
objpool: 328192025 439439564 668534502 887401381 1319972072
Link: https://lore.kernel.org/all/20231017135654.82270-4-wuqiang.matt@bytedance.com/
Signed-off-by: wuqiang.matt <wuqiang.matt@bytedance.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2023-10-17 21:56:52 +08:00
|
|
|
objpool_fini(&rh->pool);
|
2022-03-15 23:00:50 +09:00
|
|
|
}
|
|
|
|
|
2023-07-07 23:03:19 +09:00
|
|
|
/**
|
|
|
|
* rethook_stop() - Stop using a rethook.
|
|
|
|
* @rh: the struct rethook to stop.
|
|
|
|
*
|
|
|
|
* Stop using a rethook to prepare for freeing it. If you want to wait for
|
|
|
|
* all running rethook handler before calling rethook_free(), you need to
|
|
|
|
* call this first and wait RCU, and call rethook_free().
|
|
|
|
*/
|
|
|
|
void rethook_stop(struct rethook *rh)
|
|
|
|
{
|
|
|
|
WRITE_ONCE(rh->handler, NULL);
|
|
|
|
}
|
|
|
|
|
2022-03-15 23:00:50 +09:00
|
|
|
/**
|
|
|
|
* rethook_free() - Free struct rethook.
|
|
|
|
* @rh: the struct rethook to be freed.
|
|
|
|
*
|
|
|
|
* Free the rethook. Before calling this function, user must ensure the
|
|
|
|
* @rh::data is cleaned if needed (or, the handler can access it after
|
|
|
|
* calling this function.) This function will set the @rh to be freed
|
|
|
|
* after all rethook_node are freed (not soon). And the caller must
|
|
|
|
* not touch @rh after calling this.
|
|
|
|
*/
|
|
|
|
void rethook_free(struct rethook *rh)
|
|
|
|
{
|
2022-03-31 10:11:17 +09:00
|
|
|
WRITE_ONCE(rh->handler, NULL);
|
2022-03-15 23:00:50 +09:00
|
|
|
|
|
|
|
call_rcu(&rh->rcu, rethook_free_rcu);
|
|
|
|
}
|
|
|
|
|
kprobes: kretprobe scalability improvement
kretprobe uses a freelist to manage return-instances, but the freelist,
being a LIFO queue based on a singly linked list, scales badly and reduces
the overall throughput of kretprobed routines, especially in
high-contention scenarios.
Here's a typical throughput test of sys_prctl (counts in 10 seconds,
measured with perf stat -a -I 10000 -e syscalls:sys_enter_prctl):
OS: Debian 10 X86_64, Linux 6.5rc7 with freelist
HW: XEON 8336C x 2, 64 cores/128 threads, DDR4 3200MT/s
1T 2T 4T 8T 16T 24T
24150045 29317964 15446741 12494489 18287272 17708768
32T 48T 64T 72T 96T 128T
16200682 13737658 11645677 11269858 10470118 9931051
This patch introduces objpool to replace freelist. objpool is a
high performance queue, which can bring near-linear scalability
to kretprobed routines. Tests of kretprobe throughput show the
biggest ratio as 159x of original freelist. Here's the result:
1T 2T 4T 8T 16T
native: 41186213 82336866 164250978 328662645 658810299
freelist: 24150045 29317964 15446741 12494489 18287272
objpool: 23926730 48010314 96125218 191782984 385091769
32T 48T 64T 96T 128T
native: 1330338351 1969957941 2512291791 2615754135 2671040914
freelist: 16200682 13737658 11645677 10470118 9931051
objpool: 764481096 1147149781 1456220214 1502109662 1579015050
Testing on a 96-core ARM64 machine shows similar results, with the
biggest ratio up to 448x:
OS: Debian 10 AARCH64, Linux 6.5rc7
HW: Kunpeng-920 96 cores/2 sockets/4 NUMA nodes, DDR4 2933 MT/s
1T 2T 4T 8T 16T
native: 30066096 63569843 126194076 257447289 505800181
freelist: 16152090 11064397 11124068 7215768 5663013
objpool: 13997541 28032100 55726624 110099926 221498787
24T 32T 48T 64T 96T
native: 763305277 1015925192 1521075123 2033009392 3021013752
freelist: 5015810 4602893 3766792 3382478 2945292
objpool: 328192025 439439564 668534502 887401381 1319972072
Link: https://lore.kernel.org/all/20231017135654.82270-4-wuqiang.matt@bytedance.com/
Signed-off-by: wuqiang.matt <wuqiang.matt@bytedance.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2023-10-17 21:56:52 +08:00
|
|
|
static int rethook_init_node(void *nod, void *context)
|
|
|
|
{
|
|
|
|
struct rethook_node *node = nod;
|
|
|
|
|
|
|
|
node->rethook = context;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Pool release callback handed to objpool_init(): free @context, which
 * rethook_alloc() sets to the struct rethook that embeds the pool.
 * @head is unused. Always returns 0.
 */
static int rethook_fini_pool(struct objpool_head *head, void *context)
{
	kfree(context);

	return 0;
}
|
|
|
|
|
2022-03-15 23:00:50 +09:00
|
|
|
/**
|
|
|
|
* rethook_alloc() - Allocate struct rethook.
|
|
|
|
* @data: a data to pass the @handler when hooking the return.
|
kprobes: kretprobe scalability improvement
kretprobe is using freelist to manage return-instances, but freelist,
as LIFO queue based on singly linked list, scales badly and reduces
the overall throughput of kretprobed routines, especially for high
contention scenarios.
Here's a typical throughput test of sys_prctl (counts in 10 seconds,
measured with perf stat -a -I 10000 -e syscalls:sys_enter_prctl):
OS: Debian 10 X86_64, Linux 6.5rc7 with freelist
HW: XEON 8336C x 2, 64 cores/128 threads, DDR4 3200MT/s
1T 2T 4T 8T 16T 24T
24150045 29317964 15446741 12494489 18287272 17708768
32T 48T 64T 72T 96T 128T
16200682 13737658 11645677 11269858 10470118 9931051
This patch introduces objpool to replace freelist. objpool is a
high performance queue, which can bring near-linear scalability
to kretprobed routines. Tests of kretprobe throughput show the
biggest ratio as 159x of original freelist. Here's the result:
1T 2T 4T 8T 16T
native: 41186213 82336866 164250978 328662645 658810299
freelist: 24150045 29317964 15446741 12494489 18287272
objpool: 23926730 48010314 96125218 191782984 385091769
32T 48T 64T 96T 128T
native: 1330338351 1969957941 2512291791 2615754135 2671040914
freelist: 16200682 13737658 11645677 10470118 9931051
objpool: 764481096 1147149781 1456220214 1502109662 1579015050
Testings on 96-core ARM64 output similarly, but with the biggest
ratio up to 448x:
OS: Debian 10 AARCH64, Linux 6.5rc7
HW: Kunpeng-920 96 cores/2 sockets/4 NUMA nodes, DDR4 2933 MT/s
1T 2T 4T 8T 16T
native: . 30066096 63569843 126194076 257447289 505800181
freelist: 16152090 11064397 11124068 7215768 5663013
objpool: 13997541 28032100 55726624 110099926 221498787
24T 32T 48T 64T 96T
native: 763305277 1015925192 1521075123 2033009392 3021013752
freelist: 5015810 4602893 3766792 3382478 2945292
objpool: 328192025 439439564 668534502 887401381 1319972072
Link: https://lore.kernel.org/all/20231017135654.82270-4-wuqiang.matt@bytedance.com/
Signed-off-by: wuqiang.matt <wuqiang.matt@bytedance.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2023-10-17 21:56:52 +08:00
|
|
|
* @handler: the return hook callback function, must NOT be NULL
|
|
|
|
* @size: node size: rethook node and additional data
|
|
|
|
* @num: number of rethook nodes to be preallocated
|
2022-03-15 23:00:50 +09:00
|
|
|
*
|
|
|
|
* Allocate and initialize a new rethook with @data and @handler.
|
kprobes: kretprobe scalability improvement
kretprobe is using freelist to manage return-instances, but freelist,
as LIFO queue based on singly linked list, scales badly and reduces
the overall throughput of kretprobed routines, especially for high
contention scenarios.
Here's a typical throughput test of sys_prctl (counts in 10 seconds,
measured with perf stat -a -I 10000 -e syscalls:sys_enter_prctl):
OS: Debian 10 X86_64, Linux 6.5rc7 with freelist
HW: XEON 8336C x 2, 64 cores/128 threads, DDR4 3200MT/s
1T 2T 4T 8T 16T 24T
24150045 29317964 15446741 12494489 18287272 17708768
32T 48T 64T 72T 96T 128T
16200682 13737658 11645677 11269858 10470118 9931051
This patch introduces objpool to replace freelist. objpool is a
high performance queue, which can bring near-linear scalability
to kretprobed routines. Tests of kretprobe throughput show the
biggest ratio as 159x of original freelist. Here's the result:
1T 2T 4T 8T 16T
native: 41186213 82336866 164250978 328662645 658810299
freelist: 24150045 29317964 15446741 12494489 18287272
objpool: 23926730 48010314 96125218 191782984 385091769
32T 48T 64T 96T 128T
native: 1330338351 1969957941 2512291791 2615754135 2671040914
freelist: 16200682 13737658 11645677 10470118 9931051
objpool: 764481096 1147149781 1456220214 1502109662 1579015050
Testings on 96-core ARM64 output similarly, but with the biggest
ratio up to 448x:
OS: Debian 10 AARCH64, Linux 6.5rc7
HW: Kunpeng-920 96 cores/2 sockets/4 NUMA nodes, DDR4 2933 MT/s
1T 2T 4T 8T 16T
native: . 30066096 63569843 126194076 257447289 505800181
freelist: 16152090 11064397 11124068 7215768 5663013
objpool: 13997541 28032100 55726624 110099926 221498787
24T 32T 48T 64T 96T
native: 763305277 1015925192 1521075123 2033009392 3021013752
freelist: 5015810 4602893 3766792 3382478 2945292
objpool: 328192025 439439564 668534502 887401381 1319972072
Link: https://lore.kernel.org/all/20231017135654.82270-4-wuqiang.matt@bytedance.com/
Signed-off-by: wuqiang.matt <wuqiang.matt@bytedance.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2023-10-17 21:56:52 +08:00
|
|
|
* Return pointer of new rethook, or error codes for failures.
|
|
|
|
*
|
2022-03-15 23:00:50 +09:00
|
|
|
* Note that @handler == NULL means this rethook is going to be freed.
|
|
|
|
*/
|
kprobes: kretprobe scalability improvement
kretprobe is using freelist to manage return-instances, but freelist,
as LIFO queue based on singly linked list, scales badly and reduces
the overall throughput of kretprobed routines, especially for high
contention scenarios.
Here's a typical throughput test of sys_prctl (counts in 10 seconds,
measured with perf stat -a -I 10000 -e syscalls:sys_enter_prctl):
OS: Debian 10 X86_64, Linux 6.5rc7 with freelist
HW: XEON 8336C x 2, 64 cores/128 threads, DDR4 3200MT/s
1T 2T 4T 8T 16T 24T
24150045 29317964 15446741 12494489 18287272 17708768
32T 48T 64T 72T 96T 128T
16200682 13737658 11645677 11269858 10470118 9931051
This patch introduces objpool to replace freelist. objpool is a
high performance queue, which can bring near-linear scalability
to kretprobed routines. Tests of kretprobe throughput show the
biggest ratio as 159x of original freelist. Here's the result:
1T 2T 4T 8T 16T
native: 41186213 82336866 164250978 328662645 658810299
freelist: 24150045 29317964 15446741 12494489 18287272
objpool: 23926730 48010314 96125218 191782984 385091769
32T 48T 64T 96T 128T
native: 1330338351 1969957941 2512291791 2615754135 2671040914
freelist: 16200682 13737658 11645677 10470118 9931051
objpool: 764481096 1147149781 1456220214 1502109662 1579015050
Testings on 96-core ARM64 output similarly, but with the biggest
ratio up to 448x:
OS: Debian 10 AARCH64, Linux 6.5rc7
HW: Kunpeng-920 96 cores/2 sockets/4 NUMA nodes, DDR4 2933 MT/s
1T 2T 4T 8T 16T
native: . 30066096 63569843 126194076 257447289 505800181
freelist: 16152090 11064397 11124068 7215768 5663013
objpool: 13997541 28032100 55726624 110099926 221498787
24T 32T 48T 64T 96T
native: 763305277 1015925192 1521075123 2033009392 3021013752
freelist: 5015810 4602893 3766792 3382478 2945292
objpool: 328192025 439439564 668534502 887401381 1319972072
Link: https://lore.kernel.org/all/20231017135654.82270-4-wuqiang.matt@bytedance.com/
Signed-off-by: wuqiang.matt <wuqiang.matt@bytedance.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2023-10-17 21:56:52 +08:00
|
|
|
struct rethook *rethook_alloc(void *data, rethook_handler_t handler,
|
|
|
|
int size, int num)
|
2022-03-15 23:00:50 +09:00
|
|
|
{
|
kprobes: kretprobe scalability improvement
kretprobe is using freelist to manage return-instances, but freelist,
as LIFO queue based on singly linked list, scales badly and reduces
the overall throughput of kretprobed routines, especially for high
contention scenarios.
Here's a typical throughput test of sys_prctl (counts in 10 seconds,
measured with perf stat -a -I 10000 -e syscalls:sys_enter_prctl):
OS: Debian 10 X86_64, Linux 6.5rc7 with freelist
HW: XEON 8336C x 2, 64 cores/128 threads, DDR4 3200MT/s
1T 2T 4T 8T 16T 24T
24150045 29317964 15446741 12494489 18287272 17708768
32T 48T 64T 72T 96T 128T
16200682 13737658 11645677 11269858 10470118 9931051
This patch introduces objpool to replace freelist. objpool is a
high performance queue, which can bring near-linear scalability
to kretprobed routines. Tests of kretprobe throughput show the
biggest ratio as 159x of original freelist. Here's the result:
1T 2T 4T 8T 16T
native: 41186213 82336866 164250978 328662645 658810299
freelist: 24150045 29317964 15446741 12494489 18287272
objpool: 23926730 48010314 96125218 191782984 385091769
32T 48T 64T 96T 128T
native: 1330338351 1969957941 2512291791 2615754135 2671040914
freelist: 16200682 13737658 11645677 10470118 9931051
objpool: 764481096 1147149781 1456220214 1502109662 1579015050
Testings on 96-core ARM64 output similarly, but with the biggest
ratio up to 448x:
OS: Debian 10 AARCH64, Linux 6.5rc7
HW: Kunpeng-920 96 cores/2 sockets/4 NUMA nodes, DDR4 2933 MT/s
1T 2T 4T 8T 16T
native: . 30066096 63569843 126194076 257447289 505800181
freelist: 16152090 11064397 11124068 7215768 5663013
objpool: 13997541 28032100 55726624 110099926 221498787
24T 32T 48T 64T 96T
native: 763305277 1015925192 1521075123 2033009392 3021013752
freelist: 5015810 4602893 3766792 3382478 2945292
objpool: 328192025 439439564 668534502 887401381 1319972072
Link: https://lore.kernel.org/all/20231017135654.82270-4-wuqiang.matt@bytedance.com/
Signed-off-by: wuqiang.matt <wuqiang.matt@bytedance.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2023-10-17 21:56:52 +08:00
|
|
|
struct rethook *rh;
|
2022-03-15 23:00:50 +09:00
|
|
|
|
kprobes: kretprobe scalability improvement
kretprobe is using freelist to manage return-instances, but freelist,
as LIFO queue based on singly linked list, scales badly and reduces
the overall throughput of kretprobed routines, especially for high
contention scenarios.
Here's a typical throughput test of sys_prctl (counts in 10 seconds,
measured with perf stat -a -I 10000 -e syscalls:sys_enter_prctl):
OS: Debian 10 X86_64, Linux 6.5rc7 with freelist
HW: XEON 8336C x 2, 64 cores/128 threads, DDR4 3200MT/s
1T 2T 4T 8T 16T 24T
24150045 29317964 15446741 12494489 18287272 17708768
32T 48T 64T 72T 96T 128T
16200682 13737658 11645677 11269858 10470118 9931051
This patch introduces objpool to replace freelist. objpool is a
high performance queue, which can bring near-linear scalability
to kretprobed routines. Tests of kretprobe throughput show the
biggest ratio as 159x of original freelist. Here's the result:
1T 2T 4T 8T 16T
native: 41186213 82336866 164250978 328662645 658810299
freelist: 24150045 29317964 15446741 12494489 18287272
objpool: 23926730 48010314 96125218 191782984 385091769
32T 48T 64T 96T 128T
native: 1330338351 1969957941 2512291791 2615754135 2671040914
freelist: 16200682 13737658 11645677 10470118 9931051
objpool: 764481096 1147149781 1456220214 1502109662 1579015050
Testings on 96-core ARM64 output similarly, but with the biggest
ratio up to 448x:
OS: Debian 10 AARCH64, Linux 6.5rc7
HW: Kunpeng-920 96 cores/2 sockets/4 NUMA nodes, DDR4 2933 MT/s
1T 2T 4T 8T 16T
native: . 30066096 63569843 126194076 257447289 505800181
freelist: 16152090 11064397 11124068 7215768 5663013
objpool: 13997541 28032100 55726624 110099926 221498787
24T 32T 48T 64T 96T
native: 763305277 1015925192 1521075123 2033009392 3021013752
freelist: 5015810 4602893 3766792 3382478 2945292
objpool: 328192025 439439564 668534502 887401381 1319972072
Link: https://lore.kernel.org/all/20231017135654.82270-4-wuqiang.matt@bytedance.com/
Signed-off-by: wuqiang.matt <wuqiang.matt@bytedance.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2023-10-17 21:56:52 +08:00
|
|
|
if (!handler || num <= 0 || size < sizeof(struct rethook_node))
|
|
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
|
|
|
|
rh = kzalloc(sizeof(struct rethook), GFP_KERNEL);
|
|
|
|
if (!rh)
|
|
|
|
return ERR_PTR(-ENOMEM);
|
2022-03-15 23:00:50 +09:00
|
|
|
|
|
|
|
rh->data = data;
|
|
|
|
rh->handler = handler;
|
|
|
|
|
kprobes: kretprobe scalability improvement
kretprobe is using freelist to manage return-instances, but freelist,
as LIFO queue based on singly linked list, scales badly and reduces
the overall throughput of kretprobed routines, especially for high
contention scenarios.
Here's a typical throughput test of sys_prctl (counts in 10 seconds,
measured with perf stat -a -I 10000 -e syscalls:sys_enter_prctl):
OS: Debian 10 X86_64, Linux 6.5rc7 with freelist
HW: XEON 8336C x 2, 64 cores/128 threads, DDR4 3200MT/s
1T 2T 4T 8T 16T 24T
24150045 29317964 15446741 12494489 18287272 17708768
32T 48T 64T 72T 96T 128T
16200682 13737658 11645677 11269858 10470118 9931051
This patch introduces objpool to replace freelist. objpool is a
high performance queue, which can bring near-linear scalability
to kretprobed routines. Tests of kretprobe throughput show the
biggest ratio as 159x of original freelist. Here's the result:
1T 2T 4T 8T 16T
native: 41186213 82336866 164250978 328662645 658810299
freelist: 24150045 29317964 15446741 12494489 18287272
objpool: 23926730 48010314 96125218 191782984 385091769
32T 48T 64T 96T 128T
native: 1330338351 1969957941 2512291791 2615754135 2671040914
freelist: 16200682 13737658 11645677 10470118 9931051
objpool: 764481096 1147149781 1456220214 1502109662 1579015050
Testings on 96-core ARM64 output similarly, but with the biggest
ratio up to 448x:
OS: Debian 10 AARCH64, Linux 6.5rc7
HW: Kunpeng-920 96 cores/2 sockets/4 NUMA nodes, DDR4 2933 MT/s
1T 2T 4T 8T 16T
native: . 30066096 63569843 126194076 257447289 505800181
freelist: 16152090 11064397 11124068 7215768 5663013
objpool: 13997541 28032100 55726624 110099926 221498787
24T 32T 48T 64T 96T
native: 763305277 1015925192 1521075123 2033009392 3021013752
freelist: 5015810 4602893 3766792 3382478 2945292
objpool: 328192025 439439564 668534502 887401381 1319972072
Link: https://lore.kernel.org/all/20231017135654.82270-4-wuqiang.matt@bytedance.com/
Signed-off-by: wuqiang.matt <wuqiang.matt@bytedance.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2023-10-17 21:56:52 +08:00
|
|
|
/* initialize the objpool for rethook nodes */
|
|
|
|
if (objpool_init(&rh->pool, num, size, GFP_KERNEL, rh,
|
|
|
|
rethook_init_node, rethook_fini_pool)) {
|
|
|
|
kfree(rh);
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
}
|
2022-03-15 23:00:50 +09:00
|
|
|
return rh;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void free_rethook_node_rcu(struct rcu_head *head)
|
|
|
|
{
|
|
|
|
struct rethook_node *node = container_of(head, struct rethook_node, rcu);
|
kprobes: kretprobe scalability improvement
kretprobe is using freelist to manage return-instances, but freelist,
as LIFO queue based on singly linked list, scales badly and reduces
the overall throughput of kretprobed routines, especially for high
contention scenarios.
Here's a typical throughput test of sys_prctl (counts in 10 seconds,
measured with perf stat -a -I 10000 -e syscalls:sys_enter_prctl):
OS: Debian 10 X86_64, Linux 6.5rc7 with freelist
HW: XEON 8336C x 2, 64 cores/128 threads, DDR4 3200MT/s
1T 2T 4T 8T 16T 24T
24150045 29317964 15446741 12494489 18287272 17708768
32T 48T 64T 72T 96T 128T
16200682 13737658 11645677 11269858 10470118 9931051
This patch introduces objpool to replace freelist. objpool is a
high performance queue, which can bring near-linear scalability
to kretprobed routines. Tests of kretprobe throughput show the
biggest ratio as 159x of original freelist. Here's the result:
1T 2T 4T 8T 16T
native: 41186213 82336866 164250978 328662645 658810299
freelist: 24150045 29317964 15446741 12494489 18287272
objpool: 23926730 48010314 96125218 191782984 385091769
32T 48T 64T 96T 128T
native: 1330338351 1969957941 2512291791 2615754135 2671040914
freelist: 16200682 13737658 11645677 10470118 9931051
objpool: 764481096 1147149781 1456220214 1502109662 1579015050
Testings on 96-core ARM64 output similarly, but with the biggest
ratio up to 448x:
OS: Debian 10 AARCH64, Linux 6.5rc7
HW: Kunpeng-920 96 cores/2 sockets/4 NUMA nodes, DDR4 2933 MT/s
1T 2T 4T 8T 16T
native: . 30066096 63569843 126194076 257447289 505800181
freelist: 16152090 11064397 11124068 7215768 5663013
objpool: 13997541 28032100 55726624 110099926 221498787
24T 32T 48T 64T 96T
native: 763305277 1015925192 1521075123 2033009392 3021013752
freelist: 5015810 4602893 3766792 3382478 2945292
objpool: 328192025 439439564 668534502 887401381 1319972072
Link: https://lore.kernel.org/all/20231017135654.82270-4-wuqiang.matt@bytedance.com/
Signed-off-by: wuqiang.matt <wuqiang.matt@bytedance.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2023-10-17 21:56:52 +08:00
|
|
|
struct rethook *rh = node->rethook;
|
2022-03-15 23:00:50 +09:00
|
|
|
|
kprobes: kretprobe scalability improvement
kretprobe is using freelist to manage return-instances, but freelist,
as LIFO queue based on singly linked list, scales badly and reduces
the overall throughput of kretprobed routines, especially for high
contention scenarios.
Here's a typical throughput test of sys_prctl (counts in 10 seconds,
measured with perf stat -a -I 10000 -e syscalls:sys_enter_prctl):
OS: Debian 10 X86_64, Linux 6.5rc7 with freelist
HW: XEON 8336C x 2, 64 cores/128 threads, DDR4 3200MT/s
1T 2T 4T 8T 16T 24T
24150045 29317964 15446741 12494489 18287272 17708768
32T 48T 64T 72T 96T 128T
16200682 13737658 11645677 11269858 10470118 9931051
This patch introduces objpool to replace freelist. objpool is a
high performance queue, which can bring near-linear scalability
to kretprobed routines. Tests of kretprobe throughput show the
biggest ratio as 159x of original freelist. Here's the result:
1T 2T 4T 8T 16T
native: 41186213 82336866 164250978 328662645 658810299
freelist: 24150045 29317964 15446741 12494489 18287272
objpool: 23926730 48010314 96125218 191782984 385091769
32T 48T 64T 96T 128T
native: 1330338351 1969957941 2512291791 2615754135 2671040914
freelist: 16200682 13737658 11645677 10470118 9931051
objpool: 764481096 1147149781 1456220214 1502109662 1579015050
Testings on 96-core ARM64 output similarly, but with the biggest
ratio up to 448x:
OS: Debian 10 AARCH64, Linux 6.5rc7
HW: Kunpeng-920 96 cores/2 sockets/4 NUMA nodes, DDR4 2933 MT/s
1T 2T 4T 8T 16T
native: . 30066096 63569843 126194076 257447289 505800181
freelist: 16152090 11064397 11124068 7215768 5663013
objpool: 13997541 28032100 55726624 110099926 221498787
24T 32T 48T 64T 96T
native: 763305277 1015925192 1521075123 2033009392 3021013752
freelist: 5015810 4602893 3766792 3382478 2945292
objpool: 328192025 439439564 668534502 887401381 1319972072
Link: https://lore.kernel.org/all/20231017135654.82270-4-wuqiang.matt@bytedance.com/
Signed-off-by: wuqiang.matt <wuqiang.matt@bytedance.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2023-10-17 21:56:52 +08:00
|
|
|
objpool_drop(node, &rh->pool);
|
2022-03-15 23:00:50 +09:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* rethook_recycle() - return the node to rethook.
|
|
|
|
* @node: The struct rethook_node to be returned.
|
|
|
|
*
|
|
|
|
* Return back the @node to @node::rethook. If the @node::rethook is already
|
|
|
|
* marked as freed, this will free the @node.
|
|
|
|
*/
|
|
|
|
void rethook_recycle(struct rethook_node *node)
|
|
|
|
{
|
|
|
|
lockdep_assert_preemption_disabled();
|
|
|
|
|
|
|
|
if (likely(READ_ONCE(node->rethook->handler)))
|
kprobes: kretprobe scalability improvement
kretprobe is using freelist to manage return-instances, but freelist,
as LIFO queue based on singly linked list, scales badly and reduces
the overall throughput of kretprobed routines, especially for high
contention scenarios.
Here's a typical throughput test of sys_prctl (counts in 10 seconds,
measured with perf stat -a -I 10000 -e syscalls:sys_enter_prctl):
OS: Debian 10 X86_64, Linux 6.5rc7 with freelist
HW: XEON 8336C x 2, 64 cores/128 threads, DDR4 3200MT/s
1T 2T 4T 8T 16T 24T
24150045 29317964 15446741 12494489 18287272 17708768
32T 48T 64T 72T 96T 128T
16200682 13737658 11645677 11269858 10470118 9931051
This patch introduces objpool to replace freelist. objpool is a
high performance queue, which can bring near-linear scalability
to kretprobed routines. Tests of kretprobe throughput show the
biggest ratio as 159x of original freelist. Here's the result:
1T 2T 4T 8T 16T
native: 41186213 82336866 164250978 328662645 658810299
freelist: 24150045 29317964 15446741 12494489 18287272
objpool: 23926730 48010314 96125218 191782984 385091769
32T 48T 64T 96T 128T
native: 1330338351 1969957941 2512291791 2615754135 2671040914
freelist: 16200682 13737658 11645677 10470118 9931051
objpool: 764481096 1147149781 1456220214 1502109662 1579015050
Testings on 96-core ARM64 output similarly, but with the biggest
ratio up to 448x:
OS: Debian 10 AARCH64, Linux 6.5rc7
HW: Kunpeng-920 96 cores/2 sockets/4 NUMA nodes, DDR4 2933 MT/s
1T 2T 4T 8T 16T
native: . 30066096 63569843 126194076 257447289 505800181
freelist: 16152090 11064397 11124068 7215768 5663013
objpool: 13997541 28032100 55726624 110099926 221498787
24T 32T 48T 64T 96T
native: 763305277 1015925192 1521075123 2033009392 3021013752
freelist: 5015810 4602893 3766792 3382478 2945292
objpool: 328192025 439439564 668534502 887401381 1319972072
Link: https://lore.kernel.org/all/20231017135654.82270-4-wuqiang.matt@bytedance.com/
Signed-off-by: wuqiang.matt <wuqiang.matt@bytedance.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2023-10-17 21:56:52 +08:00
|
|
|
objpool_push(node, &node->rethook->pool);
|
2022-03-15 23:00:50 +09:00
|
|
|
else
|
|
|
|
call_rcu(&node->rcu, free_rethook_node_rcu);
|
|
|
|
}
|
|
|
|
NOKPROBE_SYMBOL(rethook_recycle);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* rethook_try_get() - get an unused rethook node.
|
|
|
|
* @rh: The struct rethook which pools the nodes.
|
|
|
|
*
|
|
|
|
* Get an unused rethook node from @rh. If the node pool is empty, this
|
|
|
|
* will return NULL. Caller must disable preemption.
|
|
|
|
*/
|
|
|
|
struct rethook_node *rethook_try_get(struct rethook *rh)
|
|
|
|
{
|
|
|
|
rethook_handler_t handler = READ_ONCE(rh->handler);
|
|
|
|
|
|
|
|
lockdep_assert_preemption_disabled();
|
|
|
|
|
|
|
|
/* Check whether @rh is going to be freed. */
|
|
|
|
if (unlikely(!handler))
|
|
|
|
return NULL;
|
|
|
|
|
2022-06-08 01:11:12 +09:00
|
|
|
/*
|
|
|
|
* This expects the caller will set up a rethook on a function entry.
|
|
|
|
* When the function returns, the rethook will eventually be reclaimed
|
|
|
|
* or released in the rethook_recycle() with call_rcu().
|
|
|
|
* This means the caller must be run in the RCU-availabe context.
|
|
|
|
*/
|
|
|
|
if (unlikely(!rcu_is_watching()))
|
|
|
|
return NULL;
|
|
|
|
|
kprobes: kretprobe scalability improvement
kretprobe is using freelist to manage return-instances, but freelist,
as LIFO queue based on singly linked list, scales badly and reduces
the overall throughput of kretprobed routines, especially for high
contention scenarios.
Here's a typical throughput test of sys_prctl (counts in 10 seconds,
measured with perf stat -a -I 10000 -e syscalls:sys_enter_prctl):
OS: Debian 10 X86_64, Linux 6.5rc7 with freelist
HW: XEON 8336C x 2, 64 cores/128 threads, DDR4 3200MT/s
1T 2T 4T 8T 16T 24T
24150045 29317964 15446741 12494489 18287272 17708768
32T 48T 64T 72T 96T 128T
16200682 13737658 11645677 11269858 10470118 9931051
This patch introduces objpool to replace freelist. objpool is a
high performance queue, which can bring near-linear scalability
to kretprobed routines. Tests of kretprobe throughput show the
biggest ratio as 159x of original freelist. Here's the result:
1T 2T 4T 8T 16T
native: 41186213 82336866 164250978 328662645 658810299
freelist: 24150045 29317964 15446741 12494489 18287272
objpool: 23926730 48010314 96125218 191782984 385091769
32T 48T 64T 96T 128T
native: 1330338351 1969957941 2512291791 2615754135 2671040914
freelist: 16200682 13737658 11645677 10470118 9931051
objpool: 764481096 1147149781 1456220214 1502109662 1579015050
Testings on 96-core ARM64 output similarly, but with the biggest
ratio up to 448x:
OS: Debian 10 AARCH64, Linux 6.5rc7
HW: Kunpeng-920 96 cores/2 sockets/4 NUMA nodes, DDR4 2933 MT/s
1T 2T 4T 8T 16T
native: . 30066096 63569843 126194076 257447289 505800181
freelist: 16152090 11064397 11124068 7215768 5663013
objpool: 13997541 28032100 55726624 110099926 221498787
24T 32T 48T 64T 96T
native: 763305277 1015925192 1521075123 2033009392 3021013752
freelist: 5015810 4602893 3766792 3382478 2945292
objpool: 328192025 439439564 668534502 887401381 1319972072
Link: https://lore.kernel.org/all/20231017135654.82270-4-wuqiang.matt@bytedance.com/
Signed-off-by: wuqiang.matt <wuqiang.matt@bytedance.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2023-10-17 21:56:52 +08:00
|
|
|
return (struct rethook_node *)objpool_pop(&rh->pool);
|
2022-03-15 23:00:50 +09:00
|
|
|
}
|
|
|
|
NOKPROBE_SYMBOL(rethook_try_get);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* rethook_hook() - Hook the current function return.
|
|
|
|
* @node: The struct rethook node to hook the function return.
|
|
|
|
* @regs: The struct pt_regs for the function entry.
|
|
|
|
* @mcount: True if this is called from mcount(ftrace) context.
|
|
|
|
*
|
|
|
|
* Hook the current running function return. This must be called when the
|
|
|
|
* function entry (or at least @regs must be the registers of the function
|
|
|
|
* entry.) @mcount is used for identifying the context. If this is called
|
|
|
|
* from ftrace (mcount) callback, @mcount must be set true. If this is called
|
|
|
|
* from the real function entry (e.g. kprobes) @mcount must be set false.
|
|
|
|
* This is because the way to hook the function return depends on the context.
|
|
|
|
*/
|
|
|
|
void rethook_hook(struct rethook_node *node, struct pt_regs *regs, bool mcount)
|
|
|
|
{
|
|
|
|
arch_rethook_prepare(node, regs, mcount);
|
|
|
|
__llist_add(&node->llist, ¤t->rethooks);
|
|
|
|
}
|
|
|
|
NOKPROBE_SYMBOL(rethook_hook);
|
|
|
|
|
|
|
|
/* This assumes the 'tsk' is the current task or is not running. */
|
|
|
|
static unsigned long __rethook_find_ret_addr(struct task_struct *tsk,
|
|
|
|
struct llist_node **cur)
|
|
|
|
{
|
|
|
|
struct rethook_node *rh = NULL;
|
|
|
|
struct llist_node *node = *cur;
|
|
|
|
|
|
|
|
if (!node)
|
|
|
|
node = tsk->rethooks.first;
|
|
|
|
else
|
|
|
|
node = node->next;
|
|
|
|
|
|
|
|
while (node) {
|
|
|
|
rh = container_of(node, struct rethook_node, llist);
|
|
|
|
if (rh->ret_addr != (unsigned long)arch_rethook_trampoline) {
|
|
|
|
*cur = node;
|
|
|
|
return rh->ret_addr;
|
|
|
|
}
|
|
|
|
node = node->next;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
NOKPROBE_SYMBOL(__rethook_find_ret_addr);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* rethook_find_ret_addr -- Find correct return address modified by rethook
|
|
|
|
* @tsk: Target task
|
|
|
|
* @frame: A frame pointer
|
|
|
|
* @cur: a storage of the loop cursor llist_node pointer for next call
|
|
|
|
*
|
|
|
|
* Find the correct return address modified by a rethook on @tsk in unsigned
|
|
|
|
* long type.
|
|
|
|
* The @tsk must be 'current' or a task which is not running. @frame is a hint
|
|
|
|
* to get the currect return address - which is compared with the
|
|
|
|
* rethook::frame field. The @cur is a loop cursor for searching the
|
|
|
|
* kretprobe return addresses on the @tsk. The '*@cur' should be NULL at the
|
|
|
|
* first call, but '@cur' itself must NOT NULL.
|
|
|
|
*
|
|
|
|
* Returns found address value or zero if not found.
|
|
|
|
*/
|
|
|
|
unsigned long rethook_find_ret_addr(struct task_struct *tsk, unsigned long frame,
|
|
|
|
struct llist_node **cur)
|
|
|
|
{
|
|
|
|
struct rethook_node *rhn = NULL;
|
|
|
|
unsigned long ret;
|
|
|
|
|
|
|
|
if (WARN_ON_ONCE(!cur))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (WARN_ON_ONCE(tsk != current && task_is_running(tsk)))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
do {
|
|
|
|
ret = __rethook_find_ret_addr(tsk, cur);
|
|
|
|
if (!ret)
|
|
|
|
break;
|
|
|
|
rhn = container_of(*cur, struct rethook_node, llist);
|
|
|
|
} while (rhn->frame != frame);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
NOKPROBE_SYMBOL(rethook_find_ret_addr);
|
|
|
|
|
|
|
|
/*
 * Weak default for the architecture return-address fixup: does nothing.
 *
 * An architecture which uses a frame pointer to record the real return
 * address on the stack should provide its own version of this function
 * to fix up the return address, so that stacktrace works from the
 * rethook handler.
 */
void __weak arch_rethook_fixup_return(struct pt_regs *regs,
				      unsigned long correct_ret_addr)
{
}
|
|
|
|
|
|
|
|
/* This function will be called from each arch-defined trampoline. */
|
|
|
|
unsigned long rethook_trampoline_handler(struct pt_regs *regs,
|
|
|
|
unsigned long frame)
|
|
|
|
{
|
|
|
|
struct llist_node *first, *node = NULL;
|
|
|
|
unsigned long correct_ret_addr;
|
|
|
|
rethook_handler_t handler;
|
|
|
|
struct rethook_node *rhn;
|
|
|
|
|
|
|
|
correct_ret_addr = __rethook_find_ret_addr(current, &node);
|
|
|
|
if (!correct_ret_addr) {
|
|
|
|
pr_err("rethook: Return address not found! Maybe there is a bug in the kernel\n");
|
|
|
|
BUG_ON(1);
|
|
|
|
}
|
|
|
|
|
|
|
|
instruction_pointer_set(regs, correct_ret_addr);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* These loops must be protected from rethook_free_rcu() because those
|
|
|
|
* are accessing 'rhn->rethook'.
|
|
|
|
*/
|
2023-05-17 11:45:06 +08:00
|
|
|
preempt_disable_notrace();
|
2022-03-15 23:00:50 +09:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Run the handler on the shadow stack. Do not unlink the list here because
|
|
|
|
* stackdump inside the handlers needs to decode it.
|
|
|
|
*/
|
|
|
|
first = current->rethooks.first;
|
|
|
|
while (first) {
|
|
|
|
rhn = container_of(first, struct rethook_node, llist);
|
|
|
|
if (WARN_ON_ONCE(rhn->frame != frame))
|
|
|
|
break;
|
|
|
|
handler = READ_ONCE(rhn->rethook->handler);
|
|
|
|
if (handler)
|
2023-06-06 21:39:55 +09:00
|
|
|
handler(rhn, rhn->rethook->data,
|
|
|
|
correct_ret_addr, regs);
|
2022-03-15 23:00:50 +09:00
|
|
|
|
|
|
|
if (first == node)
|
|
|
|
break;
|
|
|
|
first = first->next;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Fixup registers for returning to correct address. */
|
|
|
|
arch_rethook_fixup_return(regs, correct_ret_addr);
|
|
|
|
|
|
|
|
/* Unlink used shadow stack */
|
|
|
|
first = current->rethooks.first;
|
|
|
|
current->rethooks.first = node->next;
|
|
|
|
node->next = NULL;
|
|
|
|
|
|
|
|
while (first) {
|
|
|
|
rhn = container_of(first, struct rethook_node, llist);
|
|
|
|
first = first->next;
|
|
|
|
rethook_recycle(rhn);
|
|
|
|
}
|
2023-05-17 11:45:06 +08:00
|
|
|
preempt_enable_notrace();
|
2022-03-15 23:00:50 +09:00
|
|
|
|
|
|
|
return correct_ret_addr;
|
|
|
|
}
|
|
|
|
NOKPROBE_SYMBOL(rethook_trampoline_handler);
|