ebpf中的bpf_probe_read_kernel和pt_regs

最新推荐文章于 2024-07-02 11:36:08 发布

VirtualMask

最新推荐文章于 2024-07-02 11:36:08 发布

阅读量1.7k

点赞数

分类专栏：bpf入门代码　文章标签：linux c语言 github

本文链接：https://blog.csdn.net/qq_44927248/article/details/127455302

版权

bpf入门代码专栏收录该内容

3 篇文章 2 订阅

订阅专栏

运行队列

/*
 * Record the current run-queue length.
 *
 * kprobe handler for update_rq_clock(): copies the struct rq that the probed
 * function receives as its first argument into the rq_map scratch entry
 * (key 0), then stores that copy's nr_running in runqlen under key 0.
 *
 * ctx: register snapshot at the probe point; PT_REGS_PARM1(ctx) is the
 *      probed function's first argument.
 *
 * Always returns 0. runqlen is left untouched when the scratch entry is
 * missing or when the kernel read fails.
 */
// SEC("kprobe/update_rq_clock")
int update_rq_clock(struct pt_regs *ctx) {
	u32 key   = 0;
	u32 rqKey = 0;
	struct rq *p_rq = (struct rq *)rq_map.lookup(&rqKey);

	if (!p_rq) { // map entry not created yet; per the original note it is created and initialised later
		return 0;
	}

	// bpf_probe_read_kernel() returns a negative error on failure; do not
	// publish a run-queue length that came from a failed read.
	if (bpf_probe_read_kernel(p_rq, sizeof(struct rq), (void *)PT_REGS_PARM1(ctx)) != 0) {
		return 0;
	}

	u64 val = p_rq->nr_running;

	runqlen.update(&key, &val);
	return 0;
}

bpf_probe_read_kernel

 long bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr)

Description
      Safely attempt to read size bytes from kernel space address unsafe_ptr and store the data in dst.

      Return 0 on success, or a negative error in case of failure.

从内核空间的指针中安全地读取数据：从地址unsafe_ptr开始读取size个字节，存放到dst指向的缓冲区；成功时返回0，失败时返回负的错误码。

PT_REGS_PARM*(x)宏

bpf_uprobe_sys_write()
    
long sys_write(unsigned int fd, const char __user *buf, size_t count);

PT_REGS_PARM1(x)中的PARM代表“参数”。通过这些宏，可以访问kprobe或tracepoint所挂接的函数的参数。例如，PT_REGS_PARM1(ctx)，其中ctx是作为参数传递给eBPF程序的struct pt_regs *ctx上下文，将允许您访问第一个参数，即文件描述符fd。类似地，PT_REGS_PARM3(ctx)将为您提供count，您可以通过查看这个内核示例（write_size）来确认这一点。

PT_REGS_RC()宏

与kretprobes探针配合使用，获取函数的返回值。

/*
 * Example kretprobe handler: PT_REGS_RC(ctx) reads the probed function's
 * return value out of the saved registers. "[...]" marks code elided from
 * this excerpt.
 */
int kretprobe__tcp_v4_connect(struct pt_regs *ctx)

{     int ret = PT_REGS_RC(ctx);     [...]

}

struct pt_regs

linux kernel 使用它来格式化内核栈

//arch/x86/include/asm/ptrace.h
struct pt_regs {
	/*
	 * Callee-preserved registers under the C ABI. Kernel entry leaves
	 * these slots unsaved unless the syscall needs a complete, fully
	 * filled struct pt_regs.
	 */
	unsigned long r15, r14, r13, r12;
	unsigned long rbp, rbx;

	/* Callee-clobbered registers; always saved on kernel entry. */
	unsigned long r11, r10, r9, r8;
	unsigned long ax, cx, dx;
	unsigned long si, di;
	unsigned long orig_ax;

	/* Return frame consumed by iretq. */
	unsigned long ip, cs, flags;
	unsigned long sp, ss;
	/* The top of the stack page follows this member. */
};

内核栈按照这个顺序缓存各个寄存器存储的用户空间数据/地址，下面会结合源码详细分析。

内核SYSCALL 入口代码在entry_64.S中，了解进程栈结构，需要看在陷入内核后，CPU都做了哪些堆栈操作。下面看下入口处部分汇编源码：

//arch/x86/entry/entry_64.S
/*
 * 64-bit SYSCALL entry (excerpt): switches from the user stack to the
 * kernel stack, pushes a struct pt_regs frame, then calls do_syscall_64.
 */
ENTRY(entry_SYSCALL_64)
	UNWIND_HINT_EMPTY
	/* Interrupts are off on entry. */
	swapgs
	// Save the user-space stack pointer in the per-CPU variable rsp_scratch
	movq	%rsp, PER_CPU_VAR(rsp_scratch)
	// Switch to the task's kernel stack
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp

	/* Build struct pt_regs on the stack, last member first */
	pushq	$__USER_DS			/* pt_regs->ss */
	pushq	PER_CPU_VAR(rsp_scratch)	/* pt_regs->sp */
	pushq	%r11				/* pt_regs->flags */
	pushq	$__USER_CS			/* pt_regs->cs */
	pushq	%rcx				/* pt_regs->ip */
GLOBAL(entry_SYSCALL_64_after_hwframe)
	// rax holds the system call number
	pushq	%rax				/* pt_regs->orig_ax */

	PUSH_AND_CLEAR_REGS rax=$-ENOSYS

	TRACE_IRQS_OFF

	/* Load the arguments (rdi = rax, rsi = rsp, i.e. the frame just pushed) and call do_syscall_64 */
	movq	%rax, %rdi
	movq	%rsp, %rsi
call	do_syscall_64		/* returns with IRQs disabled */

（1）指令“movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp”使栈顶寄存器载入进程内核栈地址，实现了用户栈到进程内核栈的切换；

（2）后续依次将用户空间寄存器压栈，和上面的数据结构struct pt_regs 成员一一对应（顺序固定且是倒序）。有三点需要注意：

1）%rcx寄存器保存在了pt_regs->ip 位置，是因为根据 Intel SDM，syscall 会将当前 rip 存到 rcx ，然后将 IA32_LSTAR 加载到 rip 。因此用户空间下一条指令就是从%rcx寄存器中获取；
2）系统调用号（sys_call_table索引号）保存在%rax中；
3）PUSH_AND_CLEAR_REGS 宏包含剩余寄存器入栈指令，展开如下：

//arch/x86/entry/calling.h
/*
 * Pushes registers into their struct pt_regs slots (see the per-line slot
 * comments). Each xorl zeroes the register that was just saved (annotated
 * "nospec" in the original comments). The rdx and rax arguments replace
 * the value pushed for pt_regs->dx / pt_regs->ax; save_ret selects the
 * variant that keeps a return address in %rsi.
 * Excerpt only: the end of the macro is not shown here.
 */
.macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax save_ret=0
        .if \save_ret
        pushq   %rsi            /* pt_regs->si */
        movq    8(%rsp), %rsi   /* temporarily store the return address in %rsi */
        movq    %rdi, 8(%rsp)   /* pt_regs->di (overwriting original return address) */
        .else
        pushq   %rdi            /* pt_regs->di */
        pushq   %rsi            /* pt_regs->si */
        .endif
        pushq   \rdx            /* pt_regs->dx */
        xorl    %edx, %edx      /* nospec   dx */
        pushq   %rcx            /* pt_regs->cx */
        xorl    %ecx, %ecx      /* nospec   cx */
        pushq   \rax            /* pt_regs->ax */
      pushq   %r8             /* pt_regs->r8 */
        xorl    %r8d, %r8d      /* nospec   r8 */
        pushq   %r9             /* pt_regs->r9 */
        xorl    %r9d, %r9d      /* nospec   r9 */
        pushq   %r10            /* pt_regs->r10 */
        xorl    %r10d, %r10d    /* nospec   r10 */
        pushq   %r11            /* pt_regs->r11 */
        xorl    %r11d, %r11d    /* nospec   r11*/
// The registers below are callee-preserved under the C ABI (not caller-saved); this macro pushes them unconditionally
        pushq   %rbx            /* pt_regs->rbx */
        xorl    %ebx, %ebx      /* nospec   rbx*/
        pushq   %rbp            /* pt_regs->rbp */
        xorl    %ebp, %ebp      /* nospec   rbp*/
        pushq   %r12            /* pt_regs->r12 */
        xorl    %r12d, %r12d    /* nospec   r12*/
        pushq   %r13            /* pt_regs->r13 */
        xorl    %r13d, %r13d    /* nospec   r13*/
        pushq   %r14            /* pt_regs->r14 */
        xorl    %r14d, %r14d    /* nospec   r14*/
        pushq   %r15            /* pt_regs->r15 */
        xorl    %r15d, %r15d    /* nospec   r15*/

在x86_64中，在内核栈中，rbx rbp r12 r13 r14 r15不是必须保存的项（为了访问不越界相应空间必须保留），根据需要保存，linux后续版本采取都保存方式；

（3）和IA32相比，x86_64内核栈起始位置没有预留填充空间（TOP_OF_KERNEL_STACK_PADDING，32位下为8字节，启用VM86时为16字节），是因为在x86_64中，SYSCALL过程内核栈所有寄存器都由软件压栈保存，不存在硬件可能没有压栈、需要预留位置防止越界的情况。在这里贴上内核中关于TOP_OF_KERNEL_STACK_PADDING的定义：

/*
 * Padding kept at the top of the kernel stack. Only 32-bit x86 reserves
 * any (more when VM86 is enabled); x86_64 has a fixed-length stack frame
 * and reserves none.
 */
#if defined(CONFIG_X86_32) && defined(CONFIG_VM86)
# define TOP_OF_KERNEL_STACK_PADDING 16
#elif defined(CONFIG_X86_32)
# define TOP_OF_KERNEL_STACK_PADDING 8
#else
# define TOP_OF_KERNEL_STACK_PADDING 0
#endif

在x86_64中，linux内核栈、struct pt_regs、current宏、struct task_struct关系总结如下图：

在这里插入图片描述

进程内核栈

在每一个进程的生命周期中，经常会通过系统调用（SYSCALL）陷入内核。在执行系统调用陷入内核之后，这些内核代码所使用的栈并不是原先用户空间中的栈，而是一个内核空间的栈，这个称作进程的“内核栈”。

每个task的栈分成用户栈和内核栈两部分，进程内核栈在kernel中的定义是：

/*
 * Kernel stack area of a task: thread_info and the stack words overlay the
 * same THREAD_SIZE bytes.
 */
union thread_union {
	struct thread_info thread_info;
	unsigned long stack[THREAD_SIZE / sizeof(unsigned long)];
};

每个task的内核栈大小THREAD_SIZE ：

x86：
	#define THREAD_SIZE_ORDER	1
	#define THREAD_SIZE		(PAGE_SIZE << THREAD_SIZE_ORDER)
	因此是8K
x86_64：
	#define THREAD_SIZE_ORDER	(2 + KASAN_STACK_ORDER)
	#define THREAD_SIZE  (PAGE_SIZE << THREAD_SIZE_ORDER)
	PAGE_SIZE默认4K，KASAN_STACK_ORDER没有定义时为0，因此是16K
	
ARM：
	8k
ARM64：
        16K

在32位系统是8KB，64位系统里是16KB。

thread_info

在linux kernel中，task_struct、thread_info都用来保存进程相关信息，即进程PCB信息。然而不同的体系结构里，进程需要存储的信息不尽相同，linux使用task_struct存储通用的信息，将体系结构相关的部分存储在thread_info中。这也是为什么struct task_struct在include/linux/sched.h中定义，而thread_info 在arch/ 下体系结构相关头文件里。

thread_info 、内核栈、task_struct 关联

三者都是密切相关的，服务于进程的关键数据结构，在内核中定义截取如下：

/* Excerpt of struct task_struct; "… …" marks members elided from this excerpt. */
struct task_struct {
#ifdef CONFIG_THREAD_INFO_IN_TASK
	/* Embedded here only when CONFIG_THREAD_INFO_IN_TASK is defined. */
	struct thread_info	thread_info;
#endif
… …
	/* NOTE(review): the accompanying text says this points to the task's kernel stack. */
	void			*stack;
… …
}

/* * */
/*
 * Storage overlaid on a task's kernel stack. The two CONFIG_* options
 * below decide at build time which members share the stack words.
 */
union thread_union {
#if !defined(CONFIG_ARCH_TASK_STRUCT_ON_STACK)
	struct task_struct task;
#endif
#if !defined(CONFIG_THREAD_INFO_IN_TASK)
	/* Lives on the stack only when it is not embedded in task_struct. */
	struct thread_info thread_info;
#endif
	unsigned long stack[THREAD_SIZE / sizeof(long)];
};

/* x86 */
/* Per-thread state for this architecture; as shown it holds two flag words and no task_struct pointer. */
struct thread_info {
	unsigned long		flags;		/* low level flags */
	u32			status;		/* thread synchronous flags */
};

/* ARM */
/*
 * Per-thread state for this architecture (excerpt; "… …" marks elided
 * members). Includes a pointer back to the owning task_struct.
 */
struct thread_info {
	unsigned long		flags;		/* low level flags */
	int			preempt_count;	/* 0 => preemptable, <0 => bug */
	mm_segment_t		addr_limit;	/* address limit */
	struct task_struct	*task;		/* main task structure */
… …
};

根据宏“CONFIG_THREAD_INFO_IN_TASK”的存在与否，三者在内核中存在两种不同关联：

（1）thread_info 结构在进程内核栈中

即当“CONFIG_THREAD_INFO_IN_TASK = N”时，thread_info和栈stack 在一个联合体thread_union内，共享一块内存，即thread_info在栈所在物理页框上。

进程描述符task_struct 中的成员“void *stack”指向内核栈。不同的是，在ARM中，struct thread_info 结构体有成员“struct task_struct *task”指向进程描述符task_struct，而x86文件中没有。实际上早期内核3.X版本中，x86下的 thread_info 里也有task_struct的指针，后续版本被删除。

至此三者关系可以描述如下（x86中没有info.task指针这条线）：

在这里插入图片描述