Linux kprobe原理

2401_87197303

于 2024-09-18 21:44:03 发布

阅读量967

点赞数 8

文章标签： linux 哈希算法运维

本文链接：https://blog.csdn.net/2401_87197303/article/details/142344620

版权

3.1 init_kprobes

kprobes作为一个内核中的一个模块，init_kprobes函数用来初始化kprobes模块：

// linux-3.10/kernel/kprobes.c

#define KPROBE\_HASH\_BITS 6
#define KPROBE\_TABLE\_SIZE (1 << KPROBE\_HASH\_BITS)

static struct hlist\_head kprobe_table[KPROBE_TABLE_SIZE];
static struct hlist\_head kretprobe_inst_table[KPROBE_TABLE_SIZE];

static struct {
	raw\_spinlock\_t lock ____cacheline_aligned_in_smp;
} kretprobe_table_locks[KPROBE_TABLE_SIZE];

static int __init init\_kprobes(void)
{
	int i, err = 0;
	unsigned long offset = 0, size = 0;
	char \*modname, namebuf[128];
	const char \*symbol_name;
	void \*addr;
	struct kprobe\_blackpoint \*kb;

	/\* FIXME allocate the probe table, currently defined statically \*/
	/\* initialize all list heads \*/
	for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
		INIT\_HLIST\_HEAD(&kprobe_table[i]);
		INIT\_HLIST\_HEAD(&kretprobe_inst_table[i]);
		raw\_spin\_lock\_init(&(kretprobe_table_locks[i].lock));
	}

	/\*
 \* Lookup and populate the kprobe\_blacklist.
 \*
 \* Unlike the kretprobe blacklist, we'll need to determine
 \* the range of addresses that belong to the said functions,
 \* since a kprobe need not necessarily be at the beginning
 \* of a function.
 \*/
	for (kb = kprobe_blacklist; kb->name != NULL; kb++) {
		kprobe\_lookup\_name(kb->name, addr);
		if (!addr)
			continue;

		kb->start_addr = (unsigned long)addr;
		symbol_name = kallsyms\_lookup(kb->start_addr,
				&size, &offset, &modname, namebuf);
		if (!symbol_name)
			kb->range = 0;
		else
			kb->range = size;
	}

	if (kretprobe_blacklist_size) {
		/\* lookup the function address from its name \*/
		for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
			kprobe\_lookup\_name(kretprobe_blacklist[i].name,
					   kretprobe_blacklist[i].addr);
			if (!kretprobe_blacklist[i].addr)
				printk("kretprobe: lookup failed: %s\n",
				       kretprobe_blacklist[i].name);
		}
	}

#if defined(CONFIG\_OPTPROBES)
#if defined(\_\_ARCH\_WANT\_KPROBES\_INSN\_SLOT)
	/\* Init kprobe\_optinsn\_slots \*/
	kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE;
#endif
	/\* By default, kprobes can be optimized \*/
	kprobes_allow_optimization = true;
#endif

	/\* By default, kprobes are armed \*/
	kprobes_all_disarmed = false;

	err = arch\_init\_kprobes();
	if (!err)
		err = register\_die\_notifier(&kprobe_exceptions_nb);
	if (!err)
		err = register\_module\_notifier(&kprobe_module_nb);

	kprobes_initialized = (err == 0);

	if (!err)
		init\_test\_probes();
	return err;
}

module\_init(init_kprobes);

（1）探测表目前是静态定义的（源码中的FIXME注释表示将来应改为动态分配），这里初始化所有的哈希链表头，并初始化kretprobe用到的自旋锁。
（2）查找并填充kprobe_blacklist，与kretprobe blacklist不同的是，我们需要确定属于所述函数的地址范围，因为kprobe不一定位于函数的开头（kprobe可以插入到内核中的任何指令上，不一定是函数开头）。

函数前面加__kprobes修饰的不能被探测，比如：

/\*
 \* This routine is called either:
 \* - under the kprobe\_mutex - during kprobe\_[un]register()
 \* OR
 \* - with preemption disabled - from arch/xxx/kernel/kprobes.c
 \*/
struct kprobe __kprobes \*get\_kprobe(void \*addr)
{
	struct hlist\_head \*head;
	struct kprobe \*p;

	head = &kprobe_table[hash\_ptr(addr, KPROBE_HASH_BITS)];
	hlist\_for\_each\_entry\_rcu(p, head, hlist) {
		if (p->addr == addr)
			return p;
	}

	return NULL;
}

下面这一些函数也不能被探测：

/\*
 \* Normally, functions that we'd want to prohibit kprobes in, are marked
 \* \_\_kprobes. But, there are cases where such functions already belong to
 \* a different section (\_\_sched for preempt\_schedule)
 \*
 \* For such cases, we now have a blacklist
 \*/
static struct kprobe\_blackpoint kprobe_blacklist[] = {
	{"preempt\_schedule",},
	{"native\_get\_debugreg",},
	{"irq\_entries\_start",},
	{"common\_interrupt",},
	{"mcount",},	/\* mcount can be called from everywhere \*/
	{NULL}    /\* Terminator \*/
};

（3）查找并填充kretprobe_blacklist。

struct kretprobe\_blackpoint kretprobe_blacklist[] = {
	{"\_\_switch\_to", }, /\* This function switches only current task, but
 doesn't switch kernel stack.\*/
	{NULL, NULL}	/\* Terminator \*/
};

const int kretprobe_blacklist_size = ARRAY\_SIZE(kretprobe_blacklist);

（4）注册 die 通知链

注册内核通知链：kprobe_exceptions_nb。其优先级被设置为最大值0x7fffffff（注释说明需要最先得到通知），因此最先被调用；执行被探测指令期间若发生了异常（例如执行int3指令触发断点异常），将最优先调用kprobe_exceptions_notify函数。

static struct notifier\_block kprobe_exceptions_nb = {
	.notifier_call = kprobe_exceptions_notify,
	.priority = 0x7fffffff /\* we need to be notified first \*/
};

register\_die\_notifier(&kprobe_exceptions_nb);

/\*
 \* Wrapper routine for handling exceptions.
 \*/
int __kprobes
kprobe\_exceptions\_notify(struct notifier\_block \*self, unsigned long val, void \*data)
{
	struct die\_args \*args = data;
	int ret = NOTIFY_DONE;

	if (args->regs && user\_mode\_vm(args->regs))
		return ret;

	switch (val) {
	case DIE_INT3:
		if (kprobe\_handler(args->regs))
			ret = NOTIFY_STOP;
		break;
	case DIE_DEBUG:
		if (post\_kprobe\_handler(args->regs)) {
			/\*
 \* Reset the BS bit in dr6 (pointed by args->err) to
 \* denote completion of processing
 \*/
			(\*(unsigned long \*)ERR\_PTR(args->err)) &= ~DR_STEP;
			ret = NOTIFY_STOP;
		}
		break;
	case DIE_GPF:
		/\*
 \* To be potentially processing a kprobe fault and to
 \* trust the result from kprobe\_running(), we have
 \* be non-preemptible.
 \*/
		if (!preemptible() && kprobe\_running() &&
		    kprobe\_fault\_handler(args->regs, args->trapnr))
			ret = NOTIFY_STOP;
		break;
	default:
		break;
	}
	return ret;
}

（5）注册模块通知链

除了内核中的代码段函数外，还有模块中的代码段，我们可以给模块中的函数添加 kprobe点。当模块被卸载时，模块的.text 和.init.text sections都被释放，需要移除模块中的 kprobe点；当模块加载时，可以给模块的.text添加kprobe点，但是模块的.init.text sections在加载后就被释放，因此要禁止.init.text sections的kprobe点。

模块正常运行（已经完成了模块的初始化）的状态是MODULE_STATE_LIVE。
模块卸载时的状态是MODULE_STATE_GOING。

/\* Module notifier call back, checking kprobes on the module \*/
static int __kprobes kprobes\_module\_callback(struct notifier\_block \*nb,
					     unsigned long val, void \*data)
{
	struct module \*mod = data;
	struct hlist\_head \*head;
	struct kprobe \*p;
	unsigned int i;
	int checkcore = (val == MODULE_STATE_GOING);

	if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE)
		return NOTIFY_DONE;

	/\*
 \* When MODULE\_STATE\_GOING was notified, both of module .text and
 \* .init.text sections would be freed. When MODULE\_STATE\_LIVE was
 \* notified, only .init.text section would be freed. We need to
 \* disable kprobes which have been inserted in the sections.
 \*/
	mutex\_lock(&kprobe_mutex);
	for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
		head = &kprobe_table[i];
		hlist\_for\_each\_entry\_rcu(p, head, hlist)
			if (within\_module\_init((unsigned long)p->addr, mod) ||
			    (checkcore &&
			     within\_module\_core((unsigned long)p->addr, mod))) {
				/\*
 \* The vaddr this probe is installed will soon
 \* be vfreed buy not synced to disk. Hence,
 \* disarming the breakpoint isn't needed.
 \*/
				kill\_kprobe(p);
			}
	}
	mutex\_unlock(&kprobe_mutex);
	return NOTIFY_DONE;
}

static struct notifier\_block kprobe_module_nb = {
	.notifier_call = kprobes_module_callback,
	.priority = 0
};

register\_module\_notifier(&kprobe_module_nb)

注册module notify回调kprobes_module_callback函数的作用是若当某个内核模块发生卸载操作时有必要检测并移除注册到该模块函数的探测点。
当模块处于加载状态时，由于模块的.init.text节在加载后就被释放，不会存留在内存中，因此不能在.init.text节添加 kprobe点。

当模块的状态等于MODULE_STATE_GOING时，模块的.text 和.init.text sections都要禁用kprobe点。

	val = MODULE_STATE_GOING
	
	if (within\_module\_init((unsigned long)p->addr, mod) ||
	     within\_module\_core((unsigned long)p->addr, mod)) {
		/\*
 \* The vaddr this probe is installed will soon
 \* be vfreed buy not synced to disk. Hence,
 \* disarming the breakpoint isn't needed.
 \*/
		kill\_kprobe(p);
	}

当模块的状态等于MODULE_STATE_LIVE时，模块的.init.text sections要禁用kprobe点。

    val = MODULE_STATE_LIVE
    
	if (within\_module\_init((unsigned long)p->addr, mod)) {
		/\*
 \* The vaddr this probe is installed will soon
 \* be vfreed buy not synced to disk. Hence,
 \* disarming the breakpoint isn't needed.
 \*/
		kill\_kprobe(p);
	}

3.2 do_int3

前面说到系统执行到探测点以后，系统会陷入断点异常int3，触发了一个trap，也就是执行 do_int3 函数：

// linux-3.10/arch/x86/include/asm/kdebug.h

/*
 * Grossly misnamed.
 *
 * Event codes passed as the first argument of notify_die(). Numbering
 * starts at 1 and increases by one per entry, so DIE_INT3 == 2,
 * DIE_DEBUG == 3 and DIE_GPF == 9.
 */
enum die_val {
	DIE_OOPS = 1,
	DIE_INT3,
	DIE_DEBUG,
	DIE_PANIC,
	DIE_NMI,
	DIE_DIE,
	DIE_KERNELDEBUG,
	DIE_TRAP,
	DIE_GPF,
	DIE_CALL,
	DIE_PAGE_FAULT,
	DIE_NMIUNKNOWN,
};

// linux-3.10/arch/x86/include/asm/traps.h
/*
 * Interrupts/Exceptions
 *
 * Each constant equals the x86 vector number given in its trailing comment.
 */
enum {
	X86_TRAP_DE = 0,	/*  0, Divide-by-zero */
	X86_TRAP_DB,		/*  1, Debug */
	X86_TRAP_NMI,		/*  2, Non-maskable Interrupt */
	X86_TRAP_BP,		/*  3, Breakpoint */
	X86_TRAP_OF,		/*  4, Overflow */
	X86_TRAP_BR,		/*  5, Bound Range Exceeded */
	X86_TRAP_UD,		/*  6, Invalid Opcode */
	X86_TRAP_NM,		/*  7, Device Not Available */
	X86_TRAP_DF,		/*  8, Double Fault */
	X86_TRAP_OLD_MF,	/*  9, Coprocessor Segment Overrun */
	X86_TRAP_TS,		/* 10, Invalid TSS */
	X86_TRAP_NP,		/* 11, Segment Not Present */
	X86_TRAP_SS,		/* 12, Stack Segment Fault */
	X86_TRAP_GP,		/* 13, General Protection Fault */
	X86_TRAP_PF,		/* 14, Page Fault */
	X86_TRAP_SPURIOUS,	/* 15, Spurious Interrupt */
	X86_TRAP_MF,		/* 16, x87 Floating-Point Exception */
	X86_TRAP_AC,		/* 17, Alignment Check */
	X86_TRAP_MC,		/* 18, Machine Check */
	X86_TRAP_XF,		/* 19, SIMD Floating-Point Exception */
	X86_TRAP_IRET = 32,	/* 32, IRET Exception */
};

// linux-3.10/arch/x86/kernel/traps.c

/\* May run on IST stack. \*/
dotraplinkage void __kprobes notrace do\_int3(struct pt\_regs \*regs, long error_code)
{
	......
	//当 CPU 遇到断点指令时，会发生陷阱，保存 CPU 的寄存器，并通过 notifier\_call\_chain 机制将控制权传递给 Kprobes。
	if (notify\_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
			SIGTRAP) == NOTIFY_STOP)
		goto exit;
	......
}

之后会执行通知链机制上注册的回调函数：kprobe_exceptions_notify，对于int 3 指令就是kprobe_handler函数：

int3
	-->do_int3
		-->notify\_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
			SIGTRAP) == NOTIFY_STOP)
				-->kprobe\_exceptions\_notify(){
					case DIE_INT3:
						if (kprobe\_handler(args->regs))
							ret = NOTIFY_STOP;
						break;
					}

/\*
 \* Interrupts are disabled on entry as trap3 is an interrupt gate and they
 \* remain disabled throughout this function.
 \*/
static int __kprobes kprobe\_handler(struct pt\_regs \*regs)
{
	kprobe\_opcode\_t \*addr;
	struct kprobe \*p;
	struct kprobe\_ctlblk \*kcb;

	addr = (kprobe\_opcode\_t \*)(regs->ip - sizeof(kprobe\_opcode\_t));
	/\*
 \* We don't want to be preempted for the entire
 \* duration of kprobe processing. We conditionally
 \* re-enable preemption at the end of this function,
 \* and also in reenter\_kprobe() and setup\_singlestep().
 \*/
	preempt\_disable();

	kcb = get\_kprobe\_ctlblk();
	p = get\_kprobe(addr);

	if (p) {
		if (kprobe\_running()) {
			if (reenter\_kprobe(p, regs, kcb))
				return 1;
		} else {
			set\_current\_kprobe(p, regs, kcb);
			kcb->kprobe_status = KPROBE_HIT_ACTIVE;

			/\*
 \* If we have no pre-handler or it returned 0, we
 \* continue with normal processing. If we have a
 \* pre-handler and it returned non-zero, it prepped
 \* for calling the break\_handler below on re-entry
 \* for jprobe processing, so get out doing nothing
 \* more here.
 \*/
			if (!p->pre_handler || !p->pre\_handler(p, regs))
				setup\_singlestep(p, regs, kcb, 0);
			return 1;
		}
	} else if (\*addr != BREAKPOINT_INSTRUCTION) {
		/\*
 \* The breakpoint instruction was removed right
 \* after we hit it. Another cpu has removed
 \* either a probepoint or a debugger breakpoint
 \* at this address. In either case, no further
 \* handling of this interrupt is appropriate.
 \* Back up over the (now missing) int3 and run
 \* the original instruction.
 \*/
		regs->ip = (unsigned long)addr;
		preempt\_enable\_no\_resched();
		return 1;
	} else if (kprobe\_running()) {
		p = \_\_this\_cpu\_read(current_kprobe);
		if (p->break_handler && p->break\_handler(p, regs)) {
			if (!skip\_singlestep(p, regs, kcb))
				setup\_singlestep(p, regs, kcb, 0);
			return 1;
		}
	} /\* else: not a kprobe fault; let the kernel handle it \*/

	preempt\_enable\_no\_resched();
	return 0;
}

对于kprobe我们主要分析这一部分：

与x86_64有关的EFLAGS 寄存器的flag位：
在这里插入图片描述
（1）TF Trap (bit 8)：设置启用单步模式进行调试；清除以禁用单步模式。在单步模式下，处理器在每条指令后生成一个调试异常。这允许在每条指令之后检查程序的执行状态。如果应用程序使用 POPF、POPFD 或 IRET 指令设置 TF 标志，则会在 POPF、POPFD 或 IRET 之后的指令之后生成调试异常。

（2）IF Interrupt enable (bit 9)：控制处理器对可屏蔽硬件中断请求的响应，该标志设置为响应可屏蔽的硬件中断；清除以禁止可屏蔽的硬件中断。 IF 标志不影响异常或不可屏蔽中断（NMI 中断）的生成。控制寄存器 CR4 中的 CPL、IOPL 和 VME 标志的状态决定了 IF 标志是否可以被 CLI、STI、POPF、POPFD 和 IRET 修改

set_current_kprobe设置struct kprobe *p为当前正在处理的 probe点。

// linux-3.10/arch/x86/kernel/kprobes/core.c

static void __kprobes set\_current\_kprobe(struct kprobe \*p, struct pt\_regs \*regs,
				struct kprobe\_ctlblk \*kcb)
{
	\_\_this\_cpu\_write(current_kprobe, p);
	kcb->kprobe_saved_flags = kcb->kprobe_old_flags
		= (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
	if (p->ainsn.if_modifier)
		kcb->kprobe_saved_flags &= ~X86_EFLAGS_IF;
}

	set\_current\_kprobe(p, regs, kcb);
	kcb->kprobe_status = KPROBE_HIT_ACTIVE;

	/\*
 \* If we have no pre-handler or it returned 0, we
 \* continue with normal processing. If we have a
 \* pre-handler and it returned non-zero, it prepped
 \* for calling the break\_handler below on re-entry
 \* for jprobe processing, so get out doing nothing
 \* more here.
 \*/
	if (!p->pre_handler || !p->pre\_handler(p, regs))
		setup\_singlestep(p, regs, kcb, 0);
	return 1;

这里在设置per-CPU变量current_kprobe的同时，还会设置kprobe_saved_flags和kprobe_old_flags的flag值，它们用于具体的架构指令相关处理。接下来处理pre_handler回调函数：有注册的话就调用执行；如果没有注册pre_handler，或者pre_handler返回0，则调用setup_singlestep启动单步执行。无论哪种情况，处理完成后都直接返回1。

static void __kprobes
setup\_singlestep(struct kprobe \*p, struct pt\_regs \*regs, struct kprobe\_ctlblk \*kcb, int reenter)
{
	if (setup\_detour\_execution(p, regs, reenter))
		return;

#if !defined(CONFIG\_PREEMPT)
	if (p->ainsn.boostable == 1 && !p->post_handler) {
		/\* Boost up -- we can execute copied instructions directly \*/
		if (!reenter)
			reset\_current\_kprobe();
		/\*
 \* Reentering boosted probe doesn't reset current\_kprobe,
 \* nor set current\_kprobe, because it doesn't use single
 \* stepping.
 \*/
		regs->ip = (unsigned long)p->ainsn.insn;
		preempt\_enable\_no\_resched();
		return;
	}
#endif
	if (reenter) {
		save\_previous\_kprobe(kcb);
		set\_current\_kprobe(p, regs, kcb);
		kcb->kprobe_status = KPROBE_REENTER;
	} else
		kcb->kprobe_status = KPROBE_HIT_SS;
	/\* Prepare real single stepping \*/
	clear\_btf();
	
	//设置regs->flags中的TF位，开启单步调试
	regs->flags |= X86_EFLAGS_TF;
	
	//屏蔽regs->flags中的IF位，屏蔽中断
	regs->flags &= ~X86_EFLAGS_IF;
	
	/\* single step inline if the instruction is an int3 \*/
	//指令寄存器地址改为前面保存的被探测指令（备份的原始指令）
	if (p->opcode == BREAKPOINT_INSTRUCTION)
		regs->ip = (unsigned long)p->addr;
	else
		regs->ip = (unsigned long)p->ainsn.insn;
}

单步执行，首先设置EFLAGS 寄存器flags中的TF位，并屏蔽IF位，同时把int3异常返回的指令寄存器地址改为前面保存的被探测指令，当int3异常返回时这些设置就会生效，即立即执行保存的原始指令（注意这里是在触发int3之前原来的上下文中执行，因此直接执行原始指令即可，无需特别的模拟操作）。该函数返回后do_int3函数立即返回，由于EFLAGS 寄存器TF位被设置，在单步执行完被探测指令后立即触发debug异常，进入debug异常处理函数do_debug，执行post_kprobe_handler函数，即post_handler()。

3.3 do_debug

dotraplinkage void __kprobes do\_debug(struct pt\_regs \*regs, long error_code)
{
	......
	if (notify\_die(DIE_DEBUG, "debug", regs, PTR\_ERR(&dr6), error_code,
							SIGTRAP) == NOTIFY_STOP)
	......
}

由于初始化时注册了内核通知链：kprobe_exceptions_nb，单步执行完被探测指令后触发debug异常时，do_debug通过notify_die(DIE_DEBUG, ...)最优先调用kprobe_exceptions_notify函数。

/\*
 \* Wrapper routine for handling exceptions.
 \*/
int __kprobes
kprobe\_exceptions\_notify(struct notifier\_block \*self, unsigned long val, void \*data)
{
	......
	case DIE_DEBUG:
		if (post\_kprobe\_handler(args->regs)) {
			/\*
 \* Reset the BS bit in dr6 (pointed by args->err) to
 \* denote completion of processing
 \*/
			(\*(unsigned long \*)ERR\_PTR(args->err)) &= ~DR_STEP;
			ret = NOTIFY_STOP;
		}
		break;
	......
}

/\*
 \* Interrupts are disabled on entry as trap1 is an interrupt gate and they
 \* remain disabled throughout this function.
 \*/
static int __kprobes post\_kprobe\_handler(struct pt\_regs \*regs)
{
	struct kprobe \*cur = kprobe\_running();
	struct kprobe\_ctlblk \*kcb = get\_kprobe\_ctlblk();

	if (!cur)
		return 0;

	resume\_execution(cur, regs, kcb);
	regs->flags |= kcb->kprobe_saved_flags;

	if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
		kcb->kprobe_status = KPROBE_HIT_SSDONE;
		cur->post\_handler(cur, regs, 0);
	}

	/\* Restore back the original saved kprobes variables and continue. \*/
	if (kcb->kprobe_status == KPROBE_REENTER) {
		restore\_previous\_kprobe(kcb);
		goto out;
	}
	reset\_current\_kprobe();
out:
	preempt\_enable\_no\_resched();

	/\*
 \* if somebody else is singlestepping across a probe point, flags
 \* will have TF set, in which case, continue the remaining processing
 \* of do\_debug, as if this is not a probe hit.
 \*/
	if (regs->flags & X86_EFLAGS_TF)
		return 0;

	return 1;
}

首先调用resume_execution函数将debug异常返回的下一条指令设置为被探测之后的指令，这样异常返回后程序的流程就会按正常的流程继续执行；然后恢复kprobe执行前保存的flags标识；接下来如果kprobe不是重入的并且设置了post_handler回调函数，就设置kprobe_status状态为KPROBE_HIT_SSDONE并调用post_handler函数，即调用用户注册的post_handler回调函数（该回调运行在内核态）。

四、Changing Execution Path

由于 kprobes 可以探测正在运行的内核代码，它可以更改寄存器集，包括指令指针。此操作需要非常小心，例如保留堆栈帧，恢复执行路径等。因为它在运行的内核上运行并且需要深入了解计算机体系结构。

如果您更改 pre_handler 中的指令指针（并设置其他相关寄存器），则必须返回 !0 以便 kprobes 停止单步执行并返回到给定地址。这也意味着不应再调用 post_handler。

请注意，在某些使用 TOC（Table of Contents）进行函数调用的架构上，此操作可能会更难，因为您必须在模块中为您的函数设置一个新的 TOC，并在从它返回后恢复旧的 TOC。

五、Return Probes

5.1 How Does a Return Probe Work

当您调用 register_kretprobe() 时，Kprobes 在函数的入口处建立一个 kprobe。当被探测的函数被调用并且这个探测被命中时，Kprobes 会保存一份返回地址的副本，并将返回地址替换为“trampoline”的地址。trampoline是一段任意代码——通常只是一条 nop 指令。在启动时，Kprobes 在 trampoline 上注册一个 kprobe。

当被探测的函数执行它的 return instruction时，控制权传递给trampoline并且该探测被命中。 Kprobes 的 trampoline 处理程序调用与 kretprobe 关联的用户指定的返回处理程序，然后将保存的指令指针设置为保存的返回地址，这就是从陷阱返回后恢复执行的地方。

当被探测函数正在执行时，它的返回地址存储在一个 kretprobe_instance 类型的对象中。在调用 register_kretprobe() 之前，用户设置 kretprobe 结构的 maxactive 字段来指定可以同时探测多少个指定函数的实例。 register_kretprobe() 预分配指定数量的 kretprobe_instance 对象。

例如，如果函数是非递归的并且在调用时持有自旋锁，那么 maxactive = 1 就足够了。如果函数是非递归的并且永远不会放弃 CPU（例如，通过信号量或抢占），则 NR_CPUS 应该足够了。如果 maxactive <= 0，则设置为默认值。如果启用了 CONFIG_PREEMPT，则默认值为 max(10, 2*NR_CPUS)。否则，默认值为 NR_CPUS。

如果你将 maxactive 设置得太低，这不是一场灾难；你只会错过一些探测。在 kretprobe 结构中，nmissed 字段在注册返回探针时设置为零，并且每次进入被探测函数但没有可用于建立返回探针的 kretprobe_instance 对象时递增。

5.2 Kretprobe entry-handler

Kretprobes 还提供了一个可选的用户指定的处理程序，它在函数入口上运行。该处理程序是通过设置 kretprobe 结构的 entry_handler 字段来指定的。每当 kretprobe 放置在函数入口处的 kprobe 被命中时，都会调用用户定义的 entry_handler，如果有的话。如果 entry_handler 返回 0（成功），则保证在函数返回时调用相应的返回处理程序。如果 entry_handler 返回非零错误，则 Kprobes 将返回地址保持原样，并且 kretprobe 对该特定函数实例没有进一步的影响。

使用与它们关联的唯一 kretprobe_instance 对象来匹配多个入口和返回处理程序调用。此外，用户还可以将每个返回实例的私有数据指定为每个 kretprobe_instance 对象的一部分。这在相应的用户条目和返回处理程序之间共享私有数据时特别有用。每个私有数据对象的大小可以在 kretprobe 注册时通过设置 kretprobe 结构的 data_size 字段来指定。可以通过每个 kretprobe_instance 对象的数据字段访问此数据。

如果输入了探测函数但没有可用的 kretprobe_instance 对象，则除了增加 nmissed 计数外，还会跳过用户 entry_handler 调用。

六、How Does Jump Optimization Work

关于kprobe的优化可以参考这篇文章：linux kprobe实现原理

如果Linux 内核是使用 CONFIG_OPTPROBES=y 构建的（目前此标志在 x86/x86-64 非抢占式内核上自动设置为 ‘y’）并且“debug.kprobes_optimization”内核参数设置为 1 ，Kprobes 会尝试在每个探测点使用跳转指令而不是断点指令，以降低探测命中时的开销。

int 3 指令会产生一个 trap ，比较耗时，可以用跳转指令替换断点指令，优化成jmp指令跳转到kprobe探测点。

当前的机器默认配置了 CONFIG_OPTPROBES 选项：

[root@localhost ~]# cat /etc/centos-release
CentOS Linux release 7.6.1810 (Core)

[root@localhost ~]# uname -r
3.10.0-957.el7.x86_64

# Kernel Performance Events And Counters
#
CONFIG\_SLUB=y
CONFIG_PROFILING=y
CONFIG_TRACEPOINTS=y
CONFIG_CRASH_CORE=y
CONFIG_KEXEC_CORE=y
CONFIG_HOTPLUG_SMT=y
CONFIG_OPROFILE=m
CONFIG_OPROFILE_EVENT_MULTIPLEX=y
CONFIG_HAVE_OPROFILE=y
CONFIG_OPROFILE_NMI_TIMER=y
CONFIG_KPROBES=y
CONFIG_JUMP_LABEL=y

CONFIG_OPTPROBES=y  //当前的机器配置了 CONFIG\_OPTPROBES 选项

debug.kprobes_optimization内核参数同样也设置为 1：

[root@localhost ~]# cat /proc/sys/debug/kprobes-optimization
1
[root@localhost ~]#

6.1 Init a Kprobe

注册一个 probe 后，在尝试此优化之前，Kprobes会在指定地址插入一个基于断点的普通kprobe。因此，即使无法优化这个特定的probepoint，也会有一个探针。

6.2 Safety Check

在优化探针之前，Kprobes会执行以下安全检查，不符合条件不可以进行优化：
（1）Kprobes 验证将被跳转指令替换的区域（“优化区域”）是否完全位于一个函数中。（跳转指令是5个字节：near relative jump，因此可能会覆盖多个指令。）
（2）Kprobes 分析整个函数并验证没有跳转到优化区域，不能有跳转到这块要被优化区域的指令，这块区域将会被jmp覆盖，具体如下：
a：函数中不包含间接跳转（indirect jump）;
b：该函数不包含导致异常的指令（因为由异常触发的修复代码可以跳回优化区域 - Kprobes 检查异常表以验证这一点）；
c：没有到优化区域的近跳转（near jump）（除了第一个字节）。
（3）对于优化区域中的每条指令，Kprobes将验证该指令是否可以单独执行。

使用如下跳转指令（near jump）形式：

JMP 跳转指令： 
0xE9（E9 cd）：Jump near（相对近跳转）。1个字节保存jmp本身的操作码（0xE9），后面的4个字节保存32位相对偏移  -->总共5个字节

6.3 Preparing Detour Buffer

接下来，Kprobes准备了一个 Detour 缓冲区，其中包含以下指令序列:
（1）能够将cpu寄存器压栈(模拟int3的trap过程)。
（2）调用用户的探测处理程序的蹦床代码（trampoline code）。
（3）恢复寄存器的代码。
（4）来自优化区域的指令。
（5）跳转回原来的执行路径。

6.4 Pre-optimization

准备 Detour 缓冲区后，Kprobes验证以下情况是否存在：
（1）探针有一个 post_handler。
（2）优化区域中的其他指令已被探测。
（3）探针被禁用。
在上述任何一种情况下，Kprobes 都不会开始优化探针。由于这些是临时情况，如果情况发生变化，Kprobes 会尝试再次开始优化。

如果可以优化 kprobe，则 Kprobes 将 kprobe 排入优化列表，并启动 kprobe-optimizer 工作队列以对其进行优化。如果要优化的probepoint在优化之前被命中，则Kprobes通过将CPU的指令指针设置为 the detour buffer 中复制的代码，将控制权返回到原始指令路径，从而至少避免了单步执行。

6.5 Optimization

Kprobe-optimizer 不会立即插入跳转指令；相反，它首先出于安全考虑调用 synchronize_rcu()，因为 CPU 在执行优化区域的过程中可能会被中断。 synchronize_rcu() 可以确保在调用 synchronize_rcu() 时处于活动状态的所有中断都已完成，但前提是 CONFIG_PREEMPT=n。因此，此版本的 kprobe 优化仅支持具有 CONFIG_PREEMPT=n 的内核。

centos 7.6 :3.10.0默认没有开启 CONFIG_PREEMPT选项：

# CONFIG\_PREEMPT is not set

之后，Kprobe优化器调用stop_machine（），使用text_poke_smp（）将优化区域替换为一条跳转到 Detour 缓冲区的跳转指令。

6.6 Unoptimization

当一个已优化的kprobe被注销、被禁用或被另一个kprobe阻塞时，它将被取消优化。如果在优化完成之前发生这种情况，则只需将该kprobe从优化列表中移除。如果优化已经完成，则使用text_poke_smp（）将跳转指令替换回原始代码（第一个字节中的int3断点除外）。之所以需要如此谨慎，可以设想：第二条指令执行时被中断，而优化器在中断处理程序运行期间用跳转指令的地址部分覆盖了第二条指令；当中断返回到原地址时，那里已不是一条有效的指令，这将会导致意外的结果。

注意：跳转优化会更改kprobe的pre_handler行为。如果不进行优化，pre_handler可以通过更改regs->ip并返回1来更改内核的执行路径，完成内核函数的hook。但是，当探针被优化后，该更改将会被忽略，无法再实现内核函数的hook。

因此，如果要调整内核的执行路径，即hook，需要使用以下技术之一禁止优化：
（1）为kprobe的post_handler指定一个空函数。
（2）执行“sysctl -w debug.kprobes_optimization=n”

6.7 Blacklist

Kprobes可以探测除Kprobes本身之外的大部分内核函数。这意味着有些函数kprobes无法探测。探测（捕获）此类函数可能会导致递归陷阱（例如 double fault），或者嵌套的探测处理程序可能永远不会被调用。Kprobes使用 a blacklist 来管理该功能，如果要将函数添加到 blacklist中，只需包含linux/kprobes.h并使用NOKPROBE_SYMBOL（）宏指定一个 blacklisted 函数即可。Kprobes根据 blacklist 检查给定的探测地址，如果给定地址在 blacklist 中，则拒绝注册。

// linux-4.10.1/include/linux/kprobes.h

#ifdef CONFIG_KPROBES
/*
 * Blacklist generating macro. Specify functions which is not probed
 * by using this macro.
 *
 * NOKPROBE_SYMBOL(fname) defines a static unsigned long named
 * _kbl_addr_<fname>, initialised with the address of fname and placed in
 * the "_kprobe_blacklist" section. The extra __NOKPROBE_SYMBOL level lets
 * a macro argument be expanded before it is pasted into the name.
 */
#define __NOKPROBE_SYMBOL(fname)			\
static unsigned long __used				\
	__attribute__((section("_kprobe_blacklist")))	\
	_kbl_addr_##fname = (unsigned long)fname;
#define NOKPROBE_SYMBOL(fname)	__NOKPROBE_SYMBOL(fname)
#else
/* Without kprobes there is no blacklist: the marker expands to nothing. */
#define NOKPROBE_SYMBOL(fname)
#endif

可以通过 NOKPROBE_SYMBOL 宏在内核源码中查询内核哪些函数不能被探测：

......
NOKPROBE\_SYMBOL(__context_tracking_enter);
NOKPROBE\_SYMBOL(get_kprobe);
NOKPROBE\_SYMBOL(notifier_call_chain);
NOKPROBE\_SYMBOL(preempt_count_add);
NOKPROBE\_SYMBOL(perf_trace_buf_alloc);
NOKPROBE\_SYMBOL(FETCH\_FUNC\_NAME(stack, type));
NOKPROBE\_SYMBOL(PRINT\_TYPE\_FUNC\_NAME(tname));
......

6.8 try_to_optimize_kprobe

int register\_kprobe(struct kprobe \*p)
{	
	......
	/\* Try to optimize kprobe \*/
	try\_to\_optimize\_kprobe(p);
	......
}
EXPORT\_SYMBOL\_GPL(register_kprobe);

/\*
 \* Prepare an optimized\_kprobe and optimize it
 \* NOTE: p must be a normal registered kprobe
 \*/
static void try\_to\_optimize\_kprobe(struct kprobe \*p)
{
	struct kprobe \*ap;
	struct optimized\_kprobe \*op;

	/\* Impossible to optimize ftrace-based kprobe \*/
	if (kprobe\_ftrace(p))
		return;

	/\* For preparing optimization, jump\_label\_text\_reserved() is called \*/
	jump\_label\_lock();
	mutex\_lock(&text_mutex);

	(1)分配新的 optimized_kprobe 并尝试准备优化的指令
	ap = alloc\_aggr\_kprobe(p);
	if (!ap)
		goto out;

	op = container\_of(ap, struct optimized\_kprobe, kp);
	if (!arch\_prepared\_optinsn(&op->optinsn)) {
		/\* If failed to setup optimizing, fallback to kprobe \*/
		arch\_remove\_optimized\_kprobe(op);
		kfree(op);
		goto out;
	}

	(2)将hlist中较早的kprobe替换为manager kprobe
	init\_aggr\_kprobe(ap, p);
	
	(3)开始优化 kprobe 点
	optimize\_kprobe(ap);	/\* This just kicks optimizer thread \*/

out:
	mutex\_unlock(&text_mutex);
	jump\_label\_unlock();
}

CONFIG_OPTPROBES=y

// linux-4.10.1/arch/x86/include/asm/kprobes.h

struct arch\_optimized\_insn {
	/\* copy of the original instructions \*/
	kprobe\_opcode\_t copied_insn[RELATIVE_ADDR_SIZE];
	/\* detour code buffer \*/
	kprobe\_opcode\_t \*insn;
	/\* the size of instructions copied to detour code buffer \*/
	size\_t size;
};

为kprobe关联一个optimized_kprobe对象，它有一个detour buffer,保存有一段指令，之后是通过jmp跳转回原始函数。

#ifdef CONFIG_OPTPROBES
/*
 * Internal structure for direct jump optimized probe
 *
 * The kprobe is embedded as the kp member, which is what allows
 * container_of(ap, struct optimized_kprobe, kp) to recover this structure
 * from a struct kprobe pointer.
 */
struct optimized_kprobe {
	struct kprobe kp;
	struct list_head list;	/* list for optimizing queue */
	struct arch_optimized_insn optinsn;
};
#endif /* CONFIG_OPTPROBES */

(1) alloc_aggr_kprobe：分配新的 optimized_kprobe 并尝试准备优化的指令

/\* Allocate new optimized\_kprobe and try to prepare optimized instructions \*/
static struct kprobe \*alloc\_aggr\_kprobe(struct kprobe \*p)
{
	struct optimized\_kprobe \*op;

	op = kzalloc(sizeof(struct optimized\_kprobe), GFP_KERNEL);
	if (!op)
		return NULL;

	INIT\_LIST\_HEAD(&op->list);
	op->kp.addr = p->addr;
	arch\_prepare\_optimized\_kprobe(op, p);

	return &op->kp;
}