一、内核编译配置
如果你所在的环境不支持kprobe,可以尝试自查下以下编译选项是否打开。
Configuring Kprobes
-
CONFIG_KPROBES
When configuring the kernel using make menuconfig/xconfig/oldconfig,
ensure that CONFIG_KPROBES is set to “y”, look for “Kprobes” under
“General architecture-dependent options”.
-
CONFIG_MODULES
So that you can load and unload Kprobes-based instrumentation modules,
make sure “Loadable module support” (CONFIG_MODULES) and “Module
unloading” (CONFIG_MODULE_UNLOAD) are set to “y”.
-
CONFIG_KALLSYMS
Also make sure that CONFIG_KALLSYMS and perhaps even CONFIG_KALLSYMS_ALL
are set to “y”, since kallsyms_lookup_name() is used by the in-kernel
kprobe address resolution code.
If you need to insert a probe in the middle of a function, you may find
it useful to “Compile the kernel with debug info” (CONFIG_DEBUG_INFO),
so you can use “objdump -d -l vmlinux” to see the source-to-object
code mapping.
在menuconfig中输入/进入配置文件搜索模式并搜索关键词
二、使用场景
- 监控某个内核函数是否被调用
- 获取某个内核函数在调用栈上耗费的时间
- 获取某个内核函数的入参
- 获取某个内核函数的调用栈(
dump_stack()
) - 获取某个内核函数的返回值
三、参数传递规则
x86平台对pt_regs的定义
arch/x86/include/asm/ptrace.h
// i386架构
#ifdef __i386__
struct pt_regs {
/*
* NB: 32-bit x86 CPUs are inconsistent as what happens in the
* following cases (where %seg represents a segment register):
*
* - pushl %seg: some do a 16-bit write and leave the high
* bits alone
* - movl %seg, [mem]: some do a 16-bit write despite the movl
* - IDT entry: some (e.g. 486) will leave the high bits of CS
* and (if applicable) SS undefined.
*
* Fortunately, x86-32 doesn't read the high bits on POP or IRET,
* so we can just treat all of the segment registers as 16-bit
* values.
*/
unsigned long bx;
unsigned long cx;
unsigned long dx;
unsigned long si;
unsigned long di;
unsigned long bp;
unsigned long ax;
unsigned short ds;
unsigned short __dsh;
unsigned short es;
unsigned short __esh;
unsigned short fs;
unsigned short __fsh;
/* On interrupt, gs and __gsh store the vector number. */
unsigned short gs;
unsigned short __gsh;
/* On interrupt, this is the error code. */
unsigned long orig_ax;
unsigned long ip;
unsigned short cs;
unsigned short __csh;
unsigned long flags;
unsigned long sp;
unsigned short ss;
unsigned short __ssh;
};
#else /* __i386__ */
// ia64
struct pt_regs {
/*
* C ABI says these regs are callee-preserved. They aren't saved on kernel entry
* unless syscall needs a complete, fully filled "struct pt_regs".
*/
unsigned long r15;
unsigned long r14;
unsigned long r13;
unsigned long r12;
unsigned long bp;
unsigned long bx;
/* These regs are callee-clobbered. Always saved on kernel entry. */
unsigned long r11;
unsigned long r10;
unsigned long r9;
unsigned long r8;
unsigned long ax;
unsigned long cx;
unsigned long dx;
unsigned long si;
unsigned long di;
/*
* On syscall entry, this is syscall#. On CPU exception, this is error code.
* On hw interrupt, it's IRQ number:
*/
unsigned long orig_ax;
/* Return frame for iretq */
unsigned long ip;
unsigned long cs;
unsigned long flags;
unsigned long sp;
unsigned long ss;
/* top of stack page */
};
#endif /* !__i386__ */
从4.18的内核版本bpf
的相关源码/tools/testing/selftests/bpf/bpf_helpers.h
中可以窥探x86
结构和arm
架构函数参数传递规则。
#if defined(bpf_target_x86)
#define PT_REGS_PARM1(x) ((x)->di)
#define PT_REGS_PARM2(x) ((x)->si)
#define PT_REGS_PARM3(x) ((x)->dx)
#define PT_REGS_PARM4(x) ((x)->cx)
#define PT_REGS_PARM5(x) ((x)->r8)
#define PT_REGS_RET(x) ((x)->sp)
#define PT_REGS_FP(x) ((x)->bp)
#define PT_REGS_RC(x) ((x)->ax)
#define PT_REGS_SP(x) ((x)->sp)
#define PT_REGS_IP(x) ((x)->ip)
#elif defined(bpf_target_arm64)
#define PT_REGS_PARM1(x) ((x)->regs[0])
#define PT_REGS_PARM2(x) ((x)->regs[1])
#define PT_REGS_PARM3(x) ((x)->regs[2])
#define PT_REGS_PARM4(x) ((x)->regs[3])
#define PT_REGS_PARM5(x) ((x)->regs[4])
#define PT_REGS_RET(x) ((x)->regs[30])
#define PT_REGS_FP(x) ((x)->regs[29]) /* Works only with CONFIG_FRAME_POINTER */
#define PT_REGS_RC(x) ((x)->regs[0])
#define PT_REGS_SP(x) ((x)->sp)
#define PT_REGS_IP(x) ((x)->pc)
/samples/bpf/test_overhead_kprobe_kern.c
// 使用示例
SEC("kprobe/__set_task_comm")
int prog(struct pt_regs *ctx)
{
struct signal_struct *signal;
struct task_struct *tsk;
char oldcomm[16] = {};
char newcomm[16] = {};
u16 oom_score_adj;
u32 pid;
tsk = (void *)PT_REGS_PARM1(ctx);
pid = _(tsk->pid);
bpf_probe_read(oldcomm, sizeof(oldcomm), &tsk->comm);
bpf_probe_read(newcomm, sizeof(newcomm), (void *)PT_REGS_PARM2(ctx));
signal = _(tsk->signal);
oom_score_adj = _(signal->oom_score_adj);
return 0;
}
// 函数原型
/*
* These functions flushes out all traces of the currently running executable
* so that a new one can be started
*/
void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec)
{
task_lock(tsk);
trace_task_rename(tsk, buf);
strlcpy(tsk->comm, buf, sizeof(tsk->comm));
task_unlock(tsk);
perf_event_comm(tsk, exec);
}
-
x86架构寄存器约定与函数参数传递
在 X86_64 架构中,当调用一个函数的时候,RDI 寄存器用于传递第一个参数,RSI 寄存器用于传递第二个寄存器(RDI/RSI/RDX/RCS/R8/R9),R9 寄存器传递第六个参数, 函数返回值保存在 RAX 寄存器中。那么如果函数的参数超过六个,那么多余的参数参数如何传递? 在 X86_64 架构中,函数大于 6 个参数的参数通过堆栈进行传输。寄存器 描述 RDI 传递第1个参数 RSI 传递第2个参数 RDX 传递第3个参数 RCX 传递第4个参数 R8 传递第5个参数 R9 传递第6个参数 RAX 临时寄存器或者第一个返回值 RSP SP寄存器 RBP 栈帧寄存器
其中RDI对应pt_regs结构体中的di,其他寄存器依次类推。
- ARM架构寄存器约定与函数参数传递
在 ARM64 架构中,使用 X0-X7 寄存器传递参数,第一个参数通过 X0 寄存器传递,第二个参数通过 X1 寄存器传递,以此类推. 返回值存储在 X0 寄存器中。
四、使用实例
- 内核简单使用示例
/samples/kprobes/kprobe_example.c
/*
* NOTE: This example is works on x86 and powerpc.
* Here's a sample kernel module showing the use of kprobes to dump a
* stack trace and selected registers when _do_fork() is called.
*
* For more information on theory of operation of kprobes, see
* Documentation/kprobes.txt
*
* You will see the trace data in /var/log/messages and on the console
* whenever _do_fork() is invoked to create a new process.
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#define MAX_SYMBOL_LEN 64
static char symbol[MAX_SYMBOL_LEN] = "_do_fork";
module_param_string(symbol, symbol, sizeof(symbol), 0644);
/* For each probe you need to allocate a kprobe structure */
static struct kprobe kp = {
.symbol_name = symbol,
};
/* kprobe pre_handler: called just before the probed instruction is executed */
static int handler_pre(struct kprobe *p, struct pt_regs *regs)
{
#ifdef CONFIG_X86
pr_info("<%s> pre_handler: p->addr = 0x%p, ip = %lx, flags = 0x%lx\n",
p->symbol_name, p->addr, regs->ip, regs->flags);
#endif
#ifdef CONFIG_PPC
pr_info("<%s> pre_handler: p->addr = 0x%p, nip = 0x%lx, msr = 0x%lx\n",
p->symbol_name, p->addr, regs->nip, regs->msr);
#endif
#ifdef CONFIG_MIPS
pr_info("<%s> pre_handler: p->addr = 0x%p, epc = 0x%lx, status = 0x%lx\n",
p->symbol_name, p->addr, regs->cp0_epc, regs->cp0_status);
#endif
#ifdef CONFIG_ARM64
pr_info("<%s> pre_handler: p->addr = 0x%p, pc = 0x%lx,"
" pstate = 0x%lx\n",
p->symbol_name, p->addr, (long)regs->pc, (long)regs->pstate);
#endif
#ifdef CONFIG_S390
pr_info("<%s> pre_handler: p->addr, 0x%p, ip = 0x%lx, flags = 0x%lx\n",
p->symbol_name, p->addr, regs->psw.addr, regs->flags);
#endif
/* A dump_stack() here will give a stack backtrace */
return 0;
}
/* kprobe post_handler: called after the probed instruction is executed */
static void handler_post(struct kprobe *p, struct pt_regs *regs,
unsigned long flags)
{
#ifdef CONFIG_X86
pr_info("<%s> post_handler: p->addr = 0x%p, flags = 0x%lx\n",
p->symbol_name, p->addr, regs->flags);
#endif
#ifdef CONFIG_PPC
pr_info("<%s> post_handler: p->addr = 0x%p, msr = 0x%lx\n",
p->symbol_name, p->addr, regs->msr);
#endif
#ifdef CONFIG_MIPS
pr_info("<%s> post_handler: p->addr = 0x%p, status = 0x%lx\n",
p->symbol_name, p->addr, regs->cp0_status);
#endif
#ifdef CONFIG_ARM64
pr_info("<%s> post_handler: p->addr = 0x%p, pstate = 0x%lx\n",
p->symbol_name, p->addr, (long)regs->pstate);
#endif
#ifdef CONFIG_S390
pr_info("<%s> pre_handler: p->addr, 0x%p, flags = 0x%lx\n",
p->symbol_name, p->addr, regs->flags);
#endif
}
/*
* fault_handler: this is called if an exception is generated for any
* instruction within the pre- or post-handler, or when Kprobes
* single-steps the probed instruction.
*/
static int handler_fault(struct kprobe *p, struct pt_regs *regs, int trapnr)
{
pr_info("fault_handler: p->addr = 0x%p, trap #%dn", p->addr, trapnr);
/* Return 0 because we don't handle the fault. */
return 0;
}
static int __init kprobe_init(void)
{
int ret;
kp.pre_handler = handler_pre;
kp.post_handler = handler_post;
kp.fault_handler = handler_fault;
ret = register_kprobe(&kp);
if (ret < 0) {
pr_err("register_kprobe failed, returned %d\n", ret);
return ret;
}
pr_info("Planted kprobe at %s\n", kp.symbol);
return 0;
}
static void __exit kprobe_exit(void)
{
unregister_kprobe(&kp);
pr_info("kprobe at %s unregistered\n", kp.symbol);
}
module_init(kprobe_init)
module_exit(kprobe_exit)
MODULE_LICENSE("GPL");
obj-m := kprobe.o
kprobe-y += kprobe_example.o
BASEINCLUDE ?= /lib/modules/`uname -r`/build
all:
$(MAKE) -C $(BASEINCLUDE) M=$(PWD) modules;
clean:
$(MAKE) -C $(BASEINCLUDE) M=$(PWD) clean;
rm -f *.ko;
五、原理解析
直接从感官上来看,在没有进行kprobe时,看看vfs_read函数的汇编代码,第一条指令为NOP指令,占5个字节。
使用kprobe插桩vfs_read函数之后,可以看到第一条指令被替换程call指令。