kprobe是什么
kprobe 是一种动态调试机制,用于debugging,动态跟踪,性能分析,动态修改内核行为等,2004年由IBM发布,是名为Dprobes工具集的底层实现机制[1][2],2005年合入Linux kernel。probe的含义是像一个探针,可以不修改分析对象源码的情况下,获取Kernel的运行时信息。
kprobe的实现原理是把指定地址(探测点)的指令替换成一个可以让cpu进入debug模式的指令,使执行路径暂停,跳转到probe 处理函数后收集、修改信息,再跳转回来继续执行。
X86中使用的是int3指令,ARM64中使用的是BRK指令进入debug monitor模式。
这种指令替换机制使得kprobe可以在大部分kernel的代码段插入探测点,除了一些分支类的指令如br、exception、eret等。另外kprobe模块本身的代码不能probe。在pre_handler和post_handler中可以访问、修改所有寄存器和全局变量,其变体jprobe可以检查传入参数,kretprobe可以检查返回值。且不需要修改探测目标的源码,方便用于生产系统的debug、性能分析、log记录等。
kprobe的注册
kprobe的触发和处理
static struct fault_info __refdata debug_fault_info[] = {
{ do_bad, SIGTRAP, TRAP_HWBKPT, "hardware breakpoint" },
{ do_bad, SIGTRAP, TRAP_HWBKPT, "hardware single-step" },
{ do_bad, SIGTRAP, TRAP_HWBKPT, "hardware watchpoint" },
{ do_bad, SIGKILL, SI_KERNEL, "unknown 3" },
{ do_bad, SIGTRAP, TRAP_BRKPT, "aarch32 BKPT" },
{ do_bad, SIGKILL, SI_KERNEL, "aarch32 vector catch" },
{ early_brk64, SIGTRAP, TRAP_BRKPT, "aarch64 BRK" },
{ do_bad, SIGKILL, SI_KERNEL, "unknown 7" },
};
void __init hook_debug_fault_code(int nr,
int (*fn)(unsigned long, unsigned int, struct pt_regs *),
int sig, int code, const char *name)
{
BUG_ON(nr < 0 || nr >= ARRAY_SIZE(debug_fault_info));
debug_fault_info[nr].fn = fn;
debug_fault_info[nr].sig = sig;
debug_fault_info[nr].code = code;
debug_fault_info[nr].name = name;
}
// brk指令异常处理 -> brk_handler
// 单步调试异常处理 -> single_step_handler
static int __init debug_traps_init(void)
{
hook_debug_fault_code(DBG_ESR_EVT_HWSS, single_step_handler, SIGTRAP,
TRAP_TRACE, "single-step handler");
hook_debug_fault_code(DBG_ESR_EVT_BRK, brk_handler, SIGTRAP,
TRAP_BRKPT, "ptrace BRK handler");
return 0;
}
point
1.被替换的指令放在哪里?
- slot page,使用了module_alloc分配可以执行的内存页
2.一个探测点由多个probe注册怎么处理
- aggrprobe
3.SMP、中断、抢占时可能有kprobe重入,如何处理
- 实现了reenter检查机制,允许probe嵌套
4.kprobe的性能
- break指令导致CPU执行停止,时间开销较大
- x86实现了优化机制,使用jmp指令替换int3 这种break指令,速度提升10倍;ARM64中未实现。
5.kprobe实现方式
- 之前的实现方式:通过gcc -pg可以向函数头插入mcount指令,linux启动的时候将mcount指令替换为nop指令,需要probe的时候再替换回来,这样只能实现函数头的trace,现在的方案可以实现任意指令的trace
添加kprobe驱动
/* samples/kprobes/kprobe_example.c */
/*
* NOTE: This example is works on x86 and powerpc.
* Here's a sample kernel module showing the use of kprobes to dump a
* stack trace and selected registers when _do_fork() is called.
*
* For more information on theory of operation of kprobes, see
* Documentation/kprobes.txt
*
* You will see the trace data in /var/log/messages and on the console
* whenever _do_fork() is invoked to create a new process.
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#define MAX_SYMBOL_LEN 64
static char symbol[MAX_SYMBOL_LEN] = "_do_fork";
module_param_string(symbol, symbol, sizeof(symbol), 0644);
/* For each probe you need to allocate a kprobe structure */
static struct kprobe kp = {
.symbol_name = symbol,
};
/* kprobe pre_handler: called just before the probed instruction is executed */
static int handler_pre(struct kprobe *p, struct pt_regs *regs)
{
#ifdef CONFIG_ARM64
pr_info("<%s> pre_handler: p->addr = 0x%p, pc = 0x%lx,"
" pstate = 0x%lx\n",
p->symbol_name, p->addr, (long)regs->pc, (long)regs->pstate);
#endif
/* A dump_stack() here will give a stack backtrace */
return 0;
}
/* kprobe post_handler: called after the probed instruction is executed */
static void handler_post(struct kprobe *p, struct pt_regs *regs,
unsigned long flags)
{
#ifdef CONFIG_ARM64
pr_info("<%s> post_handler: p->addr = 0x%p, pstate = 0x%lx\n",
p->symbol_name, p->addr, (long)regs->pstate);
#endif
}
/*
* fault_handler: this is called if an exception is generated for any
* instruction within the pre- or post-handler, or when Kprobes
* single-steps the probed instruction.
*/
static int handler_fault(struct kprobe *p, struct pt_regs *regs, int trapnr)
{
pr_info("fault_handler: p->addr = 0x%p, trap #%dn", p->addr, trapnr);
/* Return 0 because we don't handle the fault. */
return 0;
}
static int __init kprobe_init(void)
{
int ret;
kp.pre_handler = handler_pre;
kp.post_handler = handler_post;
kp.fault_handler = handler_fault;
ret = register_kprobe(&kp);
if (ret < 0) {
pr_err("register_kprobe failed, returned %d\n", ret);
return ret;
}
pr_info("Planted kprobe at %p\n", kp.addr);
return 0;
}
static void __exit kprobe_exit(void)
{
unregister_kprobe(&kp);
pr_info("kprobe at %p unregistered\n", kp.addr);
}
module_init(kprobe_init)
module_exit(kprobe_exit)
MODULE_LICENSE("GPL");
在ftrace中使用kprobe
p[:[GRP/]EVENT] [MOD:]SYM[+offs]|MEMADDR [FETCHARGS] : Set a probe
r[MAXACTIVE][:[GRP/]EVENT] [MOD:]SYM[+0] [FETCHARGS] : Set a return probe
p:[GRP/]EVENT] [MOD:]SYM[+0]%return [FETCHARGS] : Set a return probe
-:[GRP/]EVENT : Clear a probe
GRP : Group name. If omitted, use "kprobes" for it.
EVENT : Event name. If omitted, the event name is generated
based on SYM+offs or MEMADDR.
MOD : Module name which has given SYM.
SYM[+offs] : Symbol+offset where the probe is inserted.
SYM%return : Return address of the symbol
MEMADDR : Address where the probe is inserted.
MAXACTIVE : Maximum number of instances of the specified function that
can be probed simultaneously, or 0 for the default value
as defined in Documentation/trace/kprobes.rst section 1.3.1.
FETCHARGS : Arguments. Each probe can have up to 128 args.
%REG : Fetch register REG
@ADDR : Fetch memory at ADDR (ADDR should be in kernel)
@SYM[+|-offs] : Fetch memory at SYM +|- offs (SYM should be a data symbol)
$stackN : Fetch Nth entry of stack (N >= 0)
$stack : Fetch stack address.
$argN : Fetch the Nth function argument. (N >= 1) (\*1)
$retval : Fetch return value.(\*2)
$comm : Fetch current task comm.
+|-[u]OFFS(FETCHARG) : Fetch memory at FETCHARG +|- OFFS address.(\*3)(\*4)
\IMM : Store an immediate value to the argument.
NAME=FETCHARG : Set NAME as the argument name of FETCHARG.
FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types
(u8/u16/u32/u64/s8/s16/s32/s64), hexadecimal types
(x8/x16/x32/x64), "string", "ustring" and bitfield
are supported.
(\*1) only for the probe on function entry (offs == 0).
(\*2) only for return probe.
(\*3) this is useful for fetching a field of data structures.
(\*4) "u" means user-space dereference. See :ref:`user_mem_access`.
例:
如果想trace __set_task_comm函数,并且打印出来更新后线程的名字
/* buf是线程的新名字 */
void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec)
cd /sys/kernel/debug/tracing
echo 'p:myprobe __set_task_comm task_struct=%x0 name=+0(%x1):string exec=+0(%x2) comm=$comm' > kprobe_events
echo 1 > events/kprobes/myprobe/enable
cat trace
echo 0 > events/kprobes/myprobe/enable
echo > kprobe_events
跟踪谁使用了高精度定时器,并打印调用栈
echo 0 > tracing_on
echo 0 > events/kprobes/myprobe/enable
echo 'p:myprobe hrtimer_start_range_ns ktime=%x1:s64 comm=$comm' > kprobe_events
echo stacktrace > trace_options
echo 1 > tracing_on
echo 1 > events/kprobes/myprobe/enable
参考资料
- 主要参考文章
- https://www.cnblogs.com/hpyu/p/14257305.html
- ftrace kprobe使用说明
- https://docs.kernel.org/trace/kprobetrace.html
- https://blog.csdn.net/jasonactions/article/details/122299418