一、eBPF 最大的特点就是动态加载
当你把一个 eBPF 程序加载到内核中时,它会立刻生效;当你终止其运行时,它的检测功能也会立刻终止。eBPF 程序在事件触发时由内核运行,所以可以被看作是一种函数挂钩或事件驱动的编程形式。
事件可由 kprobes/uprobes、tracepoints、dtrace probes、socket 等产生。这允许在内核和用户进程的指令中钩住(hook)和检查任何函数的内存、拦截文件操作、检查特定的网络数据包等等。
事件触发了附加的 eBPF 程序的执行,后续可以将信息保存至 map 和环形缓冲区(ringbuffer)或调用一些特定 API 定义的内核函数的子集。一个 eBPF 程序可以链接到多个事件,不同的 eBPF 程序也可以访问相同的 map 以共享数据。一个被称为 “program array” 的特殊读/写 map 存储了对通过 bpf() 系统调用加载的其他 eBPF 程序的引用,在该 map 中成功的查找则会触发一个跳转,而且并不返回到原来的 eBPF 程序。这种 eBPF 嵌套也有限制,以避免无限的递归循环。
运行 eBPF 程序的步骤:
1.用户空间将字节码和程序类型一起发送到内核,程序类型决定了可以访问的内核区域(主要是 BPF 辅助函数的各种子集)。
2.内核在字节码上运行验证器,以确保程序可以安全运行(kernel/bpf/verifier.c)。
3.内核将字节码编译为本地代码,并将其插入(或附加到)指定的代码位置。(如果启用了 JIT 功能,字节码编译为本地代码)。
4.插入的代码将数据写入环形缓冲区或通用键值 map。
5.用户空间从共享 map 或环形缓冲区中读取结果值。
二、execsnoop(监控系统进程exec执行)源码解析
源码位置:/usr/share/bcc/tools/execsnoop (需具备ebpf+bcc环境)
#!/usr/libexec/platform-python
# @lint-avoid-python-3-compatibility-imports
#
# execsnoop Trace new processes via exec() syscalls.
# For Linux, uses BCC, eBPF. Embedded C.
#
# USAGE: execsnoop [-h] [-T] [-t] [-x] [-q] [-n NAME] [-l LINE]
# [--max-args MAX_ARGS]
#
# This currently will print up to a maximum of 19 arguments, plus the process
# name, so 20 fields in total (MAXARG).
#
# This won't catch all new processes: an application may fork() but not exec().
#
# Copyright 2016 Netflix, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 07-Feb-2016 Brendan Gregg Created this.
#加载依赖
from __future__ import print_function
from bcc import BPF
from bcc.containers import filter_by_containers
from bcc.utils import ArgString, printb
import bcc.utils as utils
import argparse
import re
import time
import pwd
from collections import defaultdict
from time import strftime
#
def parse_uid(user):
try:
result = int(user)
except ValueError:
try:
user_info = pwd.getpwnam(user)
except KeyError:
raise argparse.ArgumentTypeError(
"{0!r} is not valid UID or user entry".format(user))
else:
return user_info.pw_uid
else:
# Maybe validate if UID < 0 ?
return result
# arguments参数
examples = """examples:
./execsnoop # trace all exec() syscalls
./execsnoop -x # include failed exec()s
./execsnoop -T # include time (HH:MM:SS)
./execsnoop -U # include UID
./execsnoop -u 1000 # only trace UID 1000
./execsnoop -u user # get user UID and trace only them
./execsnoop -t # include timestamps
./execsnoop -q # add "quotemarks" around arguments
./execsnoop -n main # only print command lines containing "main"
./execsnoop -l tpkg # only print command where arguments contains "tpkg"
./execsnoop --cgroupmap mappath # only trace cgroups in this BPF map
./execsnoop --mntnsmap mappath # only trace mount namespaces in the map
"""
#初始化命令项选项与参数解析
parser = argparse.ArgumentParser(
description="Trace exec() syscalls",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=examples)
parser.add_argument("-T", "--time", action="store_true",
help="include time column on output (HH:MM:SS)")
parser.add_argument("-t", "--timestamp", action="store_true",
help="include timestamp on output")
parser.add_argument("-x", "--fails", action="store_true",
help="include failed exec()s")
parser.add_argument("--cgroupmap",
help="trace cgroups in this BPF map only")
parser.add_argument("--mntnsmap",
help="trace mount namespaces in this BPF map only")
parser.add_argument("-u", "--uid", type=parse_uid, metavar='USER',
help="trace this UID only")
parser.add_argument("-q", "--quote", action="store_true",
help="Add quotemarks (\") around arguments."
)
parser.add_argument("-n", "--name",
type=ArgString,
help="only print commands matching this name (regex), any arg")
parser.add_argument("-l", "--line",
type=ArgString,
help="only print commands where arg contains this line (regex)")
parser.add_argument("-U", "--print-uid", action="store_true",
help="print UID column")
parser.add_argument("--max-args", default="20",
help="maximum number of arguments parsed and displayed, defaults to 20")
parser.add_argument("--ebpf", action="store_true",
help=argparse.SUPPRESS)
args = parser.parse_args()
#定义BPF程序(C语言)
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>
#include <linux/fs.h>
#define ARGSIZE 128
enum event_type {
EVENT_ARG,
EVENT_RET,
};
//定义了内核Probe程序与用户空间程序通信的结构体data_t
struct data_t {
u32 pid; // PID as in the userspace term (i.e. task->tgid in kernel)
u32 ppid; // Parent PID as in the userspace term (i.e task->real_parent->tgid in kernel)
u32 uid;
char comm[TASK_COMM_LEN];
enum event_type type;
char argv[ARGSIZE];
int retval;
};
//通过BBC宏定义内核中events变量
BPF_PERF_OUTPUT(events);
//函数bpf_probe_read_user:u64 res = bpf_probe_read_user(&buf, sizeof(buf), (void *)PT_REGS_PARM1(ctx)); 从用户空间&buf读取数据到变量ctx
//events.perf_submit:将event数据发送至用户空间
static int __submit_arg(struct pt_regs *ctx, void *ptr, struct data_t *data)
{
bpf_probe_read_user(data->argv, sizeof(data->argv), ptr);
events.perf_submit(ctx, data, sizeof(struct data_t));
return 1;
}
//初始化args参数
static int submit_arg(struct pt_regs *ctx, void *ptr, struct data_t *data)
{
const char *argp = NULL;
bpf_probe_read_user(&argp, sizeof(argp), ptr);
//
if (argp) {
return __submit_arg(ctx, (void *)(argp), data);
}
return 0;
}
int syscall__execve(struct pt_regs *ctx,
const char __user *filename,
const char __user *const __user *__argv,
const char __user *const __user *__envp)
{
//获取当前用户uid和组tgid,高32位为GID,低32位为UID
u32 uid = bpf_get_current_uid_gid() & 0xffffffff;
UID_FILTER
if (container_should_be_filtered()) {
return 0;
}
// create data here and pass to submit_arg to save stack space (#555)
struct data_t data = {};
struct task_struct *task;
//获取进程pid和组tgid,低32位为当前进程ID,高32位是组ID。
data.pid = bpf_get_current_pid_tgid() >> 32;
//获取当前任务的task struct结构体,通过task获取ppid
task = (struct task_struct *)bpf_get_current_task();
// Some kernels, like Ubuntu 4.13.0-generic, return 0
// as the real_parent->tgid.
// We use the get_ppid function as a fallback in those cases. (#1883)
data.ppid = task->real_parent->tgid;
//获取当前进程名字,并填充第一个参数地址。bpf_get_current_comm(char *buf, int size_of_buf)
bpf_get_current_comm(&data.comm, sizeof(data.comm));
data.type = EVENT_ARG;
__submit_arg(ctx, (void *)filename, &data);
// skip first arg, as we submitted filename
#pragma unroll
for (int i = 1; i < MAXARG; i++) {
if (submit_arg(ctx, (void *)&__argv[i], &data) == 0)
goto out;
}
// handle truncated argument list
char ellipsis[] = "...";
__submit_arg(ctx, (void *)ellipsis, &data);
out:
return 0;
}
int do_ret_sys_execve(struct pt_regs *ctx)
{
if (container_should_be_filtered()) {
return 0;
}
struct data_t data = {};
struct task_struct *task;
u32 uid = bpf_get_current_uid_gid() & 0xffffffff;
UID_FILTER
data.pid = bpf_get_current_pid_tgid() >> 32;
data.uid = uid;
task = (struct task_struct *)bpf_get_current_task();
// Some kernels, like Ubuntu 4.13.0-generic, return 0
// as the real_parent->tgid.
// We use the get_ppid function as a fallback in those cases. (#1883)
data.ppid = task->real_parent->tgid;
bpf_get_current_comm(&data.comm, sizeof(data.comm));
data.type = EVENT_RET;
data.retval = PT_REGS_RC(ctx);
events.perf_submit(ctx, &data, sizeof(data));
return 0;
}
"""
bpf_text = bpf_text.replace("MAXARG", args.max_args)
#是否指定uid参数
if args.uid:
bpf_text = bpf_text.replace('UID_FILTER',
'if (uid != %s) { return 0; }' % args.uid)
else:
bpf_text = bpf_text.replace('UID_FILTER', '')
bpf_text = filter_by_containers(args) + bpf_text
if args.ebpf:
print(bpf_text)
exit()
#初始化BPF
b = BPF(text=bpf_text)
#获取需要跟踪的系统调用函数名
execve_fnname = b.get_syscall_fnname("execve")
#在eBPF用户态程序中,可以通过attach_kprobe函数将内核态eBPF程序通过kprobes机制附加到某个内核函数中。attach_kprobe 函数会创建一个 perf event,再将 eBPF 内核态程序附加到 perf event。每个 perf event 的 kprobe probe handler 都是 kprobe_dispatch 函数,他会去 perf event 中获取注册在当前 perf event 的回调函数列表并依次执行,同时将指向 perf ringbuffer 的指针的传递给 eBPF 程序,eBPF 程序可以通过 libbpf 封装好的 PT_REGS_PARAMx 宏定义来获取缓冲区中的数据。(tracepoint 类型的eBPF程序需要定义好tracepoint关联的函数的参数的数据结构 /sys/kernel/tracing/events)。该kprobe会执行自定义的trace_stack()函数。可以通过多次执行attach_kprobe() ,将自定义的函数附加到多个内核函数上。
b.attach_kprobe(event=execve_fnname, fn_name="syscall__execve")
b.attach_kretprobe(event=execve_fnname, fn_name="do_ret_sys_execve")
# header
if args.time:
print("%-9s" % ("TIME"), end="")
if args.timestamp:
print("%-8s" % ("TIME(s)"), end="")
if args.print_uid:
print("%-6s" % ("UID"), end="")
print("%-16s %-6s %-6s %3s %s" % ("PCOMM", "PID", "PPID", "RET", "ARGS"))
class EventType(object):
EVENT_ARG = 0
EVENT_RET = 1
start_ts = time.time()
argv = defaultdict(list)
# This is best-effort PPID matching. Short-lived processes may exit
# before we get a chance to read the PPID.
# This is a fallback for when fetching the PPID from task->real_parent->tgip
# returns 0, which happens in some kernel versions.
def get_ppid(pid):
try:
with open("/proc/%d/status" % pid) as status:
for line in status:
if line.startswith("PPid:"):
return int(line.split()[1])
except IOError:
pass
return 0
# process event
def print_event(cpu, data, size):
event = b["events"].event(data)
skip = False
if event.type == EventType.EVENT_ARG:
argv[event.pid].append(event.argv)
elif event.type == EventType.EVENT_RET:
if event.retval != 0 and not args.fails:
skip = True
if args.name and not re.search(bytes(args.name), event.comm):
skip = True
if args.line and not re.search(bytes(args.line),
b' '.join(argv[event.pid])):
skip = True
if args.quote:
argv[event.pid] = [
b"\"" + arg.replace(b"\"", b"\\\"") + b"\""
for arg in argv[event.pid]
]
if not skip:
if args.time:
printb(b"%-9s" % strftime("%H:%M:%S").encode('ascii'), nl="")
if args.timestamp:
printb(b"%-8.3f" % (time.time() - start_ts), nl="")
if args.print_uid:
printb(b"%-6d" % event.uid, nl="")
ppid = event.ppid if event.ppid > 0 else get_ppid(event.pid)
ppid = b"%d" % ppid if ppid > 0 else b"?"
argv_text = b' '.join(argv[event.pid]).replace(b'\n', b'\\n')
printb(b"%-16s %-6d %-6s %3d %s" % (event.comm, event.pid,
ppid, event.retval, argv_text))
try:
del(argv[event.pid])
except Exception:
pass
# loop with callback to print_event
b["events"].open_perf_buffer(print_event)
while 1:
try:
b.perf_buffer_poll()
except KeyboardInterrupt:
exit()
#./execsnoop