mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
ac4ad5c09b
v2->v1:
1. Remove the simulation of STP and the related bits.
2. Use arm64_skip_faulting_instruction for the single-stepping or FEAT_BTI scenario.

As Andrii pointed out, the uprobe/uretprobe selftest bench runs into a
counterintuitive result: the nop and push variants are much slower than the
ret variant [0]. The root cause lies in arch_probe_analyse_insn(), which
excludes 'nop' and 'stp' from the list of emulatable instructions. This
forces the kernel to return to userspace and execute them out-of-line, then
trap back into the kernel to run the uprobe callback functions, which incurs
a significant performance overhead compared to the 'ret' variant, which is
already emulated.

Typically, a uprobe is installed on a 'nop' for USDT, or on a function entry
that starts with the instruction 'stp x29, x30, [sp, #imm]!' to push lr and
fp onto the stack, regardless of whether the binary is kernel or userspace.
To improve the performance of uprobe handling for these common use cases,
this patch adds emulation of the arm64 equivalents of 'nop' and 'push'. The
benchmark results below show that the performance gain from emulation is
substantial.

On Kunpeng916 (Hi1616), 4 NUMA nodes, 64 arm64 cores @ 2.4GHz:

xol (1 cpus)
------------
uprobe-nop:     0.916 ± 0.001M/s (0.916M/prod)
uprobe-push:    0.908 ± 0.001M/s (0.908M/prod)
uprobe-ret:     1.855 ± 0.000M/s (1.855M/prod)
uretprobe-nop:  0.640 ± 0.000M/s (0.640M/prod)
uretprobe-push: 0.633 ± 0.001M/s (0.633M/prod)
uretprobe-ret:  0.978 ± 0.003M/s (0.978M/prod)

emulation (1 cpus)
------------------
uprobe-nop:     1.862 ± 0.002M/s (1.862M/prod)
uprobe-push:    1.743 ± 0.006M/s (1.743M/prod)
uprobe-ret:     1.840 ± 0.001M/s (1.840M/prod)
uretprobe-nop:  0.964 ± 0.004M/s (0.964M/prod)
uretprobe-push: 0.936 ± 0.004M/s (0.936M/prod)
uretprobe-ret:  0.940 ± 0.001M/s (0.940M/prod)

As shown above, the performance gap between the 'nop'/'push' and 'ret'
variants is significantly reduced. Because emulating the 'push' instruction
needs to access userspace memory, it costs more cycles than the others.

As Mark suggested [1], it is painful to emulate the correct atomicity and
ordering properties of STP, especially when it interacts with MTE, POE, etc.,
so this patch focuses only on the simulation of 'nop'. The simulation of STP
and the related changes will be addressed in a separate patch.

[0] https://lore.kernel.org/all/CAEf4BzaO4eG6hr2hzXYpn+7Uer4chS0R99zLn02ezZ5YruVuQw@mail.gmail.com/
[1] https://lore.kernel.org/all/Zr3RN4zxF5XPgjEB@J2N7QTR9R3/

CC: Andrii Nakryiko <andrii.nakryiko@gmail.com>
CC: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Liao Chang <liaochang1@huawei.com>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Link: https://lore.kernel.org/r/20240909071114.1150053-1-liaochang1@huawei.com
[catalin.marinas@arm.com: small tweaks following MarkR's comments]
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
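For context, a minimal sketch of the decode-side hook this simulation relies
on. The exact hunk lives in the arm64 probe decoder
(arch/arm64/kernel/probes/decode-insn.c) and may differ in detail;
aarch64_insn_is_nop() is the assumed encoding check and api is the
arch_probe_insn descriptor being filled in:

	/*
	 * Sketch only: classify NOP as "good, no out-of-line slot needed"
	 * and route it to simulate_nop(), so a uprobe on a NOP is handled
	 * entirely in the kernel instead of single-stepping an XOL copy.
	 */
	if (aarch64_insn_is_nop(insn)) {
		api->handler = simulate_nop;
		return INSN_GOOD_NO_SLOT;
	}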
205 lines
4.6 KiB
C
// SPDX-License-Identifier: GPL-2.0-only
/*
 * arch/arm64/kernel/probes/simulate-insn.c
 *
 * Copyright (C) 2013 Linaro Limited.
 */

#include <linux/bitops.h>
#include <linux/kernel.h>
#include <linux/kprobes.h>

#include <asm/ptrace.h>
#include <asm/traps.h>

#include "simulate-insn.h"
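/*
 * Branch displacement helpers: each extracts the PC-relative immediate from
 * its instruction encoding (imm26 for B/BL, imm19 for B.cond, CBZ/CBNZ and
 * the literal loads, imm14 for TBZ/TBNZ), scales it by the 4-byte
 * instruction size and sign-extends the result.
 */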
#define bbl_displacement(insn)		\
	sign_extend32(((insn) & 0x3ffffff) << 2, 27)

#define bcond_displacement(insn)	\
	sign_extend32(((insn >> 5) & 0x7ffff) << 2, 20)

#define cbz_displacement(insn)	\
	sign_extend32(((insn >> 5) & 0x7ffff) << 2, 20)

#define tbz_displacement(insn)	\
	sign_extend32(((insn >> 5) & 0x3fff) << 2, 15)

#define ldr_displacement(insn)	\
	sign_extend32(((insn >> 5) & 0x7ffff) << 2, 20)
static inline void set_x_reg(struct pt_regs *regs, int reg, u64 val)
{
	pt_regs_write_reg(regs, reg, val);
}

static inline void set_w_reg(struct pt_regs *regs, int reg, u64 val)
{
	pt_regs_write_reg(regs, reg, lower_32_bits(val));
}

static inline u64 get_x_reg(struct pt_regs *regs, int reg)
{
	return pt_regs_read_reg(regs, reg);
}

static inline u32 get_w_reg(struct pt_regs *regs, int reg)
{
	return lower_32_bits(pt_regs_read_reg(regs, reg));
}
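/*
 * CBZ/CBNZ: bit 31 (sf) selects a 64-bit Xn or a 32-bit Wn comparison; the
 * register number lives in bits [4:0].
 */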
static bool __kprobes check_cbz(u32 opcode, struct pt_regs *regs)
{
	int xn = opcode & 0x1f;

	return (opcode & (1 << 31)) ?
	       (get_x_reg(regs, xn) == 0) : (get_w_reg(regs, xn) == 0);
}

static bool __kprobes check_cbnz(u32 opcode, struct pt_regs *regs)
{
	int xn = opcode & 0x1f;

	return (opcode & (1 << 31)) ?
	       (get_x_reg(regs, xn) != 0) : (get_w_reg(regs, xn) != 0);
}
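/*
 * TBZ/TBNZ: the tested bit number is split across the encoding as b5:b40,
 * i.e. bit 31 supplies bit 5 of the bit number and bits [23:19] supply
 * bits [4:0].
 */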
static bool __kprobes check_tbz(u32 opcode, struct pt_regs *regs)
{
	int xn = opcode & 0x1f;
	int bit_pos = ((opcode & (1 << 31)) >> 26) | ((opcode >> 19) & 0x1f);

	return ((get_x_reg(regs, xn) >> bit_pos) & 0x1) == 0;
}

static bool __kprobes check_tbnz(u32 opcode, struct pt_regs *regs)
{
	int xn = opcode & 0x1f;
	int bit_pos = ((opcode & (1 << 31)) >> 26) | ((opcode >> 19) & 0x1f);

	return ((get_x_reg(regs, xn) >> bit_pos) & 0x1) != 0;
}
/*
 * instruction simulation functions
 */
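/*
 * ADR/ADRP: the 21-bit immediate is split into immlo (bits [30:29]) and
 * immhi (bits [23:5]); bit 31 distinguishes ADRP, which scales the
 * immediate by 4KiB and applies it to the page-aligned PC.
 */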
void __kprobes
simulate_adr_adrp(u32 opcode, long addr, struct pt_regs *regs)
{
	long imm, xn, val;

	xn = opcode & 0x1f;
	imm = ((opcode >> 3) & 0x1ffffc) | ((opcode >> 29) & 0x3);
	imm = sign_extend64(imm, 20);
	if (opcode & 0x80000000)
		val = (imm<<12) + (addr & 0xfffffffffffff000);
	else
		val = imm + addr;

	set_x_reg(regs, xn, val);

	instruction_pointer_set(regs, instruction_pointer(regs) + 4);
}
void __kprobes
simulate_b_bl(u32 opcode, long addr, struct pt_regs *regs)
{
	int disp = bbl_displacement(opcode);

	/* Link register is x30 */
	if (opcode & (1 << 31))
		set_x_reg(regs, 30, addr + 4);

	instruction_pointer_set(regs, addr + disp);
}
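/*
 * B.cond: bits [3:0] hold the condition code; when the condition fails,
 * fall through to the next instruction (disp stays 4).
 */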
void __kprobes
simulate_b_cond(u32 opcode, long addr, struct pt_regs *regs)
{
	int disp = 4;

	if (aarch32_opcode_cond_checks[opcode & 0xf](regs->pstate & 0xffffffff))
		disp = bcond_displacement(opcode);

	instruction_pointer_set(regs, addr + disp);
}
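/*
 * BR/BLR/RET share an encoding in which bits [22:21] are 0b00, 0b01 and
 * 0b10 respectively; only BLR writes the link register.
 */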
void __kprobes
simulate_br_blr_ret(u32 opcode, long addr, struct pt_regs *regs)
{
	int xn = (opcode >> 5) & 0x1f;

	/* update pc first in case we're doing a "blr lr" */
	instruction_pointer_set(regs, get_x_reg(regs, xn));

	/* Link register is x30 */
	if (((opcode >> 21) & 0x3) == 1)
		set_x_reg(regs, 30, addr + 4);
}
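/*
 * For both the CBZ/CBNZ and TBZ/TBNZ pairs below, bit 24 selects the
 * "not zero" variant of the instruction.
 */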
void __kprobes
simulate_cbz_cbnz(u32 opcode, long addr, struct pt_regs *regs)
{
	int disp = 4;

	if (opcode & (1 << 24)) {
		if (check_cbnz(opcode, regs))
			disp = cbz_displacement(opcode);
	} else {
		if (check_cbz(opcode, regs))
			disp = cbz_displacement(opcode);
	}
	instruction_pointer_set(regs, addr + disp);
}
void __kprobes
simulate_tbz_tbnz(u32 opcode, long addr, struct pt_regs *regs)
{
	int disp = 4;

	if (opcode & (1 << 24)) {
		if (check_tbnz(opcode, regs))
			disp = tbz_displacement(opcode);
	} else {
		if (check_tbz(opcode, regs))
			disp = tbz_displacement(opcode);
	}
	instruction_pointer_set(regs, addr + disp);
}
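/*
 * LDR (literal): load directly from PC + imm19 offset; bit 30 selects a
 * 64-bit Xn or a 32-bit Wn destination.
 */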
void __kprobes
simulate_ldr_literal(u32 opcode, long addr, struct pt_regs *regs)
{
	unsigned long load_addr;
	int xn = opcode & 0x1f;

	load_addr = addr + ldr_displacement(opcode);

	if (opcode & (1 << 30))	/* x0-x30 */
		set_x_reg(regs, xn, READ_ONCE(*(u64 *)load_addr));
	else			/* w0-w30 */
		set_w_reg(regs, xn, READ_ONCE(*(u32 *)load_addr));

	instruction_pointer_set(regs, instruction_pointer(regs) + 4);
}
void __kprobes
simulate_ldrsw_literal(u32 opcode, long addr, struct pt_regs *regs)
{
	unsigned long load_addr;
	int xn = opcode & 0x1f;

	load_addr = addr + ldr_displacement(opcode);

	set_x_reg(regs, xn, READ_ONCE(*(s32 *)load_addr));

	instruction_pointer_set(regs, instruction_pointer(regs) + 4);
}
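/*
 * NOP has no architectural effect beyond advancing the PC, so use
 * arm64_skip_faulting_instruction(), which also fast-forwards any pending
 * single-step and clears PSTATE.BTYPE for the FEAT_BTI case.
 */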
void __kprobes
simulate_nop(u32 opcode, long addr, struct pt_regs *regs)
{
	arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE);
}