linux kprobe使用

本文详细介绍了kprobe的内核编译配置,包括确保CONFIG_KPROBES等选项开启,以及模块支持的设置。讨论了kprobe在监控内核函数调用、获取参数、调用栈信息和返回值等方面的应用。此外,文章还阐述了x86和ARM64平台上的函数参数传递规则,并给出了x86和ARM64架构下pt_regs结构体中寄存器与函数参数的对应关系。最后,提供了一个内核kprobe使用示例,展示了如何在_do_fork函数调用前后打印相关信息。
摘要由CSDN通过智能技术生成

一、内核编译配置

如果你所在的环境不支持kprobe,可以尝试自查下以下编译选项是否打开。
Configuring Kprobes

  • CONFIG_KPROBES
    When configuring the kernel using make menuconfig/xconfig/oldconfig,
    ensure that CONFIG_KPROBES is set to “y”, look for “Kprobes” under
    “General architecture-dependent options”.
    在这里插入图片描述在这里插入图片描述

  • CONFIG_MODULES
    So that you can load and unload Kprobes-based instrumentation modules,
    make sure “Loadable module support” (CONFIG_MODULES) and “Module
    unloading” (CONFIG_MODULE_UNLOAD) are set to “y”.
    在这里插入图片描述
    在这里插入图片描述

  • CONFIG_KALLSYMS
    Also make sure that CONFIG_KALLSYMS and perhaps even CONFIG_KALLSYMS_ALL
    are set to “y”, since kallsyms_lookup_name() is used by the in-kernel
    kprobe address resolution code.
    在这里插入图片描述
    在这里插入图片描述

If you need to insert a probe in the middle of a function, you may find
it useful to “Compile the kernel with debug info” (CONFIG_DEBUG_INFO),
so you can use “objdump -d -l vmlinux” to see the source-to-object
code mapping.

在menuconfig中输入/进入配置文件搜索模式并搜索关键词

在这里插入图片描述
在这里插入图片描述

二、使用场景

  • 监控某个内核函数是否被调用
  • 获取某个内核函数在调用栈上耗费的时间
  • 获取某个内核函数的入参
  • 获取某个内核函数的调用栈(dump_stack())
  • 获取某个内核函数的返回值

三、参数传递规则

x86平台对pt_regs的定义
arch/x86/include/asm/ptrace.h

// i386架构
#ifdef __i386__

struct pt_regs {
        /*
         * NB: 32-bit x86 CPUs are inconsistent as what happens in the
         * following cases (where %seg represents a segment register):
         *
         * - pushl %seg: some do a 16-bit write and leave the high
         *   bits alone
         * - movl %seg, [mem]: some do a 16-bit write despite the movl
         * - IDT entry: some (e.g. 486) will leave the high bits of CS
         *   and (if applicable) SS undefined.
         *
         * Fortunately, x86-32 doesn't read the high bits on POP or IRET,
         * so we can just treat all of the segment registers as 16-bit
         * values.
         */
        unsigned long bx;
        unsigned long cx;
        unsigned long dx;
        unsigned long si;
        unsigned long di;
        unsigned long bp;
        unsigned long ax;
        unsigned short ds;
        unsigned short __dsh;
        unsigned short es;
        unsigned short __esh;
        unsigned short fs;
        unsigned short __fsh;
        /* On interrupt, gs and __gsh store the vector number. */
        unsigned short gs;
        unsigned short __gsh;
        /* On interrupt, this is the error code. */
        unsigned long orig_ax;
        unsigned long ip;
        unsigned short cs;
        unsigned short __csh;
        unsigned long flags;
        unsigned long sp;
        unsigned short ss; 
        unsigned short __ssh;
};

#else /* __i386__ */
// ia64
struct pt_regs {
/*
 * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
 * unless syscall needs a complete, fully filled "struct pt_regs".
 */
        unsigned long r15;
        unsigned long r14;
        unsigned long r13;
        unsigned long r12;
        unsigned long bp;
        unsigned long bx;
/* These regs are callee-clobbered. Always saved on kernel entry. */
        unsigned long r11;
        unsigned long r10;
        unsigned long r9;
        unsigned long r8;
        unsigned long ax;
        unsigned long cx;
        unsigned long dx;
        unsigned long si;
        unsigned long di;
/*
 * On syscall entry, this is syscall#. On CPU exception, this is error code.
 * On hw interrupt, it's IRQ number:
 */
        unsigned long orig_ax;
/* Return frame for iretq */
        unsigned long ip;
        unsigned long cs;
        unsigned long flags;
        unsigned long sp;
        unsigned long ss;
/* top of stack page */
};

#endif /* !__i386__ */

从4.18版本内核的bpf相关源码 /tools/testing/selftests/bpf/bpf_helpers.h 中,可以窥探x86架构和arm64架构的函数参数传递规则。

#if defined(bpf_target_x86)
#define PT_REGS_PARM1(x) ((x)->di)
#define PT_REGS_PARM2(x) ((x)->si)
#define PT_REGS_PARM3(x) ((x)->dx)
#define PT_REGS_PARM4(x) ((x)->cx)
#define PT_REGS_PARM5(x) ((x)->r8)
#define PT_REGS_RET(x) ((x)->sp)
#define PT_REGS_FP(x) ((x)->bp)
#define PT_REGS_RC(x) ((x)->ax)
#define PT_REGS_SP(x) ((x)->sp)
#define PT_REGS_IP(x) ((x)->ip)

#elif defined(bpf_target_arm64)
#define PT_REGS_PARM1(x) ((x)->regs[0])
#define PT_REGS_PARM2(x) ((x)->regs[1])
#define PT_REGS_PARM3(x) ((x)->regs[2])
#define PT_REGS_PARM4(x) ((x)->regs[3])
#define PT_REGS_PARM5(x) ((x)->regs[4])
#define PT_REGS_RET(x) ((x)->regs[30])
#define PT_REGS_FP(x) ((x)->regs[29]) /* Works only with CONFIG_FRAME_POINTER */
#define PT_REGS_RC(x) ((x)->regs[0])
#define PT_REGS_SP(x) ((x)->sp)
#define PT_REGS_IP(x) ((x)->pc)

/samples/bpf/test_overhead_kprobe_kern.c

// 使用示例
SEC("kprobe/__set_task_comm")
int prog(struct pt_regs *ctx)
{
	struct signal_struct *signal;
	struct task_struct *tsk;
	char oldcomm[16] = {};
	char newcomm[16] = {};
	u16 oom_score_adj;
	u32 pid;

	tsk = (void *)PT_REGS_PARM1(ctx);

	pid = _(tsk->pid);
	bpf_probe_read(oldcomm, sizeof(oldcomm), &tsk->comm);
	bpf_probe_read(newcomm, sizeof(newcomm), (void *)PT_REGS_PARM2(ctx));
	signal = _(tsk->signal);
	oom_score_adj = _(signal->oom_score_adj);
	return 0;
}

// 函数原型
/*
 * These functions flushes out all traces of the currently running executable
 * so that a new one can be started
 */
void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec)
{
	task_lock(tsk);
	trace_task_rename(tsk, buf);
	strlcpy(tsk->comm, buf, sizeof(tsk->comm));
	task_unlock(tsk);
	perf_event_comm(tsk, exec);
}
  • x86架构寄存器约定与函数参数传递
    在 X86_64 架构中,当调用一个函数的时候,RDI 寄存器用于传递第一个参数,RSI 寄存器用于传递第二个参数,依此类推(RDI/RSI/RDX/RCX/R8/R9),R9 寄存器传递第六个参数,函数返回值保存在 RAX 寄存器中。那么如果函数的参数超过六个,多余的参数如何传递?在 X86_64 架构中,第 6 个之后的参数通过堆栈进行传递。

    寄存器 | 描述
    RDI | 传递第1个参数
    RSI | 传递第2个参数
    RDX | 传递第3个参数
    RCX | 传递第4个参数
    R8  | 传递第5个参数
    R9  | 传递第6个参数
    RAX | 临时寄存器或者第一个返回值
    RSP | SP寄存器(栈指针)
    RBP | 栈帧寄存器

其中RDI对应pt_regs结构体中的di,其他寄存器依次类推。

  • ARM64架构寄存器约定与函数参数传递
    在 ARM64 架构中,使用 X0-X7 寄存器传递参数,第一个参数通过 X0 寄存器传递,第二个参数通过 X1 寄存器传递,以此类推。返回值存储在 X0 寄存器中。

四、使用实例

  • 内核简单使用示例
    /samples/kprobes/kprobe_example.c
/*
 * NOTE: This example works on x86, powerpc, mips, arm64 and s390.
 * Here's a sample kernel module showing the use of kprobes to dump a
 * stack trace and selected registers when _do_fork() is called.
 *
 * For more information on theory of operation of kprobes, see
 * Documentation/kprobes.txt
 *
 * You will see the trace data in /var/log/messages and on the console
 * whenever _do_fork() is invoked to create a new process.
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>

#define MAX_SYMBOL_LEN	64
static char symbol[MAX_SYMBOL_LEN] = "_do_fork";
module_param_string(symbol, symbol, sizeof(symbol), 0644);

/* For each probe you need to allocate a kprobe structure */
static struct kprobe kp = {
	.symbol_name	= symbol,
};

/* kprobe pre_handler: called just before the probed instruction is executed */
static int handler_pre(struct kprobe *p, struct pt_regs *regs)
{
#ifdef CONFIG_X86
	pr_info("<%s> pre_handler: p->addr = 0x%p, ip = %lx, flags = 0x%lx\n",
		p->symbol_name, p->addr, regs->ip, regs->flags);
#endif
#ifdef CONFIG_PPC
	pr_info("<%s> pre_handler: p->addr = 0x%p, nip = 0x%lx, msr = 0x%lx\n",
		p->symbol_name, p->addr, regs->nip, regs->msr);
#endif
#ifdef CONFIG_MIPS
	pr_info("<%s> pre_handler: p->addr = 0x%p, epc = 0x%lx, status = 0x%lx\n",
		p->symbol_name, p->addr, regs->cp0_epc, regs->cp0_status);
#endif
#ifdef CONFIG_ARM64
	pr_info("<%s> pre_handler: p->addr = 0x%p, pc = 0x%lx,"
			" pstate = 0x%lx\n",
		p->symbol_name, p->addr, (long)regs->pc, (long)regs->pstate);
#endif
#ifdef CONFIG_S390
	pr_info("<%s> pre_handler: p->addr, 0x%p, ip = 0x%lx, flags = 0x%lx\n",
		p->symbol_name, p->addr, regs->psw.addr, regs->flags);
#endif

	/* A dump_stack() here will give a stack backtrace */
	return 0;
}

/* kprobe post_handler: called after the probed instruction is executed */
static void handler_post(struct kprobe *p, struct pt_regs *regs,
				unsigned long flags)
{
#ifdef CONFIG_X86
	pr_info("<%s> post_handler: p->addr = 0x%p, flags = 0x%lx\n",
		p->symbol_name, p->addr, regs->flags);
#endif
#ifdef CONFIG_PPC
	pr_info("<%s> post_handler: p->addr = 0x%p, msr = 0x%lx\n",
		p->symbol_name, p->addr, regs->msr);
#endif
#ifdef CONFIG_MIPS
	pr_info("<%s> post_handler: p->addr = 0x%p, status = 0x%lx\n",
		p->symbol_name, p->addr, regs->cp0_status);
#endif
#ifdef CONFIG_ARM64
	pr_info("<%s> post_handler: p->addr = 0x%p, pstate = 0x%lx\n",
		p->symbol_name, p->addr, (long)regs->pstate);
#endif
#ifdef CONFIG_S390
	pr_info("<%s> pre_handler: p->addr, 0x%p, flags = 0x%lx\n",
		p->symbol_name, p->addr, regs->flags);
#endif
}

/*
 * fault_handler: this is called if an exception is generated for any
 * instruction within the pre- or post-handler, or when Kprobes
 * single-steps the probed instruction.
 */
static int handler_fault(struct kprobe *p, struct pt_regs *regs, int trapnr)
{
	pr_info("fault_handler: p->addr = 0x%p, trap #%dn", p->addr, trapnr);
	/* Return 0 because we don't handle the fault. */
	return 0;
}

static int __init kprobe_init(void)
{
	int ret;
	kp.pre_handler = handler_pre;
	kp.post_handler = handler_post;
	kp.fault_handler = handler_fault;

	ret = register_kprobe(&kp);
	if (ret < 0) {
		pr_err("register_kprobe failed, returned %d\n", ret);
		return ret;
	}
	pr_info("Planted kprobe at %s\n", kp.symbol);
	return 0;
}

static void __exit kprobe_exit(void)
{
	unregister_kprobe(&kp);
	pr_info("kprobe at %s unregistered\n", kp.symbol);
}

module_init(kprobe_init)
module_exit(kprobe_exit)
MODULE_LICENSE("GPL");
obj-m := kprobe.o

kprobe-y += kprobe_example.o

BASEINCLUDE ?= /lib/modules/`uname -r`/build

all:
        $(MAKE) -C $(BASEINCLUDE) M=$(PWD) modules;

clean:
        $(MAKE) -C $(BASEINCLUDE) M=$(PWD) clean;
        rm -f *.ko;

五、原理解析

直接从感官上来看,在没有进行kprobe时,看看vfs_read函数的汇编代码,第一条指令为NOP指令,占5个字节。
在这里插入图片描述
使用kprobe插桩vfs_read函数之后,可以看到第一条指令被替换成call指令。
在这里插入图片描述

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值