参考资料
-
http://man7.org/conf/lpc2015/limiting_kernel_attack_surface_with_seccomp-LPC_2015-Kerrisk.pdf
-
https://www.freebsd.org/cgi/man.cgi?query=bpf&sektion=4&manpath=FreeBSD+4.7-RELEASE
配置
- CONFIG_SECCOMP
- CONFIG_SECCOMP_FILTER
内核态
seccomp 以 task_struct 为单位
struct task_struct {
...
struct seccomp seccomp;
...
}
/*
 * Per-task seccomp state; embedded in task_struct as the `seccomp` member.
 */
struct seccomp {
int mode; /* one of the SECCOMP_MODE_* values */
struct seccomp_filter *filter; /* filter chain that seccomp_run_filters() starts walking from */
};
用户态每次调用 seccomp 注册的 sock_fprog->filter 都会对应一个 seccomp_filter 节点,并通过 prev 指针串成链表(链表头保存在 task_struct 的 seccomp.filter 中)
/*
 * One installed seccomp filter. Filters are chained through @prev, and
 * seccomp_run_filters() walks that chain for every checked system call.
 */
struct seccomp_filter {
atomic_t usage; /* NOTE(review): looks like a reference count -- confirm against kernel source */
struct seccomp_filter *prev; /* next filter to evaluate; NULL terminates the walk */
struct bpf_prog *prog; /* program executed through BPF_PROG_RUN() */
};
/*
 * Descriptor of a BPF program, as referenced by seccomp_filter.prog.
 * The instruction array is stored inline at the end of the structure and
 * can be viewed either as sock_filter or as bpf_insn entries through the
 * anonymous union.
 */
struct bpf_prog {
u16 pages; /* Number of allocated pages */
kmemcheck_bitfield_begin(meta);
u16 jited:1, /* Is our filter JIT'ed? */
gpl_compatible:1, /* Is filter GPL compatible? */
cb_access:1, /* Is control block accessed? */
dst_needed:1; /* Do we need dst entry? */
kmemcheck_bitfield_end(meta);
u32 len; /* Number of filter blocks */
enum bpf_prog_type type; /* Type of BPF program */
struct bpf_prog_aux *aux; /* Auxiliary fields */
struct sock_fprog_kern *orig_prog; /* Original BPF program */
unsigned int (*bpf_func)(const struct sk_buff *skb,
const struct bpf_insn *filter);
/* Instructions for interpreter */
union {
struct sock_filter insns[0];
struct bpf_insn insnsi[0];
};
};
sock_filter 是用户态提交的经典 BPF(cBPF)指令格式,seccomp 用它描述过滤规则,操作对象是系统调用(seccomp_data);bpf_insn 是内核内部使用的 BPF 指令格式。BPF 最初的操作对象是网络 packet,seccomp 复用了执行 insn 的这套虚拟机
sock_filter 执行流程:
el0_svc - __sys_trace - syscall_trace_enter - secure_computing - __seccomp_filter
对于每个系统调用,执行的时候都需要把该 task_struct 上的 filter 全部执行一遍,并返回一个值,表明内核是否允许执行该系统调用
/**
 * struct seccomp_data - the format the BPF program executes over.
 * @nr: the system call number
 * @arch: indicates system call convention as an AUDIT_ARCH_* value
 * as defined in <linux/audit.h>.
 * @instruction_pointer: at the time of the system call.
 * @args: up to 6 system call arguments always stored as 64-bit values
 * regardless of the architecture.
 *
 * A filter reads a field by loading from its offset in this structure,
 * e.g. BPF_LD | BPF_W | BPF_ABS with offsetof(struct seccomp_data, nr).
 * seccomp_run_filters() fills a local copy via populate_seccomp_data()
 * when its caller does not supply one.
 */
struct seccomp_data {
int nr;
__u32 arch;
__u64 instruction_pointer;
__u64 args[6];
};
/**
 * __seccomp_filter - run the current task's filters and act on the verdict
 * @this_syscall: number of the system call being entered
 * @sd: seccomp data handed to seccomp_run_filters(); NULL makes that
 *      function populate a fresh copy from the current registers
 * @recheck_after_trace: true when this is the re-evaluation performed
 *      after a tracer was notified, so that SECCOMP_RET_TRACE is accepted
 *      instead of being reported again
 *
 * Returns 0 when the system call may proceed and -1 when it must be
 * skipped. For SECCOMP_RET_KILL and any unrecognised action the function
 * does not return: it audits the event and calls do_exit(SIGSYS).
 */
static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
const bool recheck_after_trace)
{
u32 filter_ret, action;
int data;
/*
* Make sure that any changes to mode from another thread have
* been seen after TIF_SECCOMP was seen.
*/
rmb();
// Run the filters, then split the result into its action and data parts.
filter_ret = seccomp_run_filters(sd);
data = filter_ret & SECCOMP_RET_DATA;
action = filter_ret & SECCOMP_RET_ACTION;
switch (action) {
case SECCOMP_RET_ERRNO:
/* Set low-order bits as an errno, capped at MAX_ERRNO. */
if (data > MAX_ERRNO)
data = MAX_ERRNO;
syscall_set_return_value(current, task_pt_regs(current),
-data, 0);
goto skip;
case SECCOMP_RET_TRAP:
/* Show the handler the original registers. */
syscall_rollback(current, task_pt_regs(current));
/* Let the filter pass back 16 bits of data. */
seccomp_send_sigsys(this_syscall, data);
goto skip;
case SECCOMP_RET_TRACE:
/* We've been put in this state by the ptracer already. */
if (recheck_after_trace)
return 0;
/* ENOSYS these calls if there is no tracer attached. */
if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
syscall_set_return_value(current,
task_pt_regs(current),
-ENOSYS, 0);
goto skip;
}
/* Allow the BPF to provide the event message */
ptrace_event(PTRACE_EVENT_SECCOMP, data);
/*
* The delivery of a fatal signal during event
* notification may silently skip tracer notification,
* which could leave us with a potentially unmodified
* syscall that the tracer would have liked to have
* changed. Since the process is about to die, we just
* force the syscall to be skipped and let the signal
* kill the process and correctly handle any tracer exit
* notifications.
*/
if (fatal_signal_pending(current))
goto skip;
/* Check if the tracer forced the syscall to be skipped. */
this_syscall = syscall_get_nr(current, task_pt_regs(current));
if (this_syscall < 0)
goto skip;
/*
* Recheck the syscall, since it may have changed. This
* intentionally uses a NULL struct seccomp_data to force
* a reload of all registers. This does not goto skip since
* a skip would have already been reported.
*/
if (__seccomp_filter(this_syscall, NULL, true))
return -1;
return 0;
case SECCOMP_RET_ALLOW:
return 0;
case SECCOMP_RET_KILL:
default:
/* Unknown actions are treated exactly like SECCOMP_RET_KILL. */
audit_seccomp(this_syscall, SIGSYS, action);
do_exit(SIGSYS);
}
unreachable();
skip:
/* Common exit for every "do not run the syscall" verdict. */
audit_seccomp(this_syscall, 0, action);
return -1;
}
/**
 * seccomp_run_filters - evaluates all seccomp filters against @sd
 * @sd: seccomp data to run the filters over; when NULL, a local copy is
 *      filled in with populate_seccomp_data() and used instead
 *
 * Returns valid seccomp BPF response codes: the filter result whose action
 * part is numerically lowest, starting from SECCOMP_RET_ALLOW. Because the
 * comparison is strict, a later result with an equal action does not
 * replace the one already chosen. Returns SECCOMP_RET_KILL (after a
 * warning) if the task has no filter installed.
 */
static u32 seccomp_run_filters(const struct seccomp_data *sd)
{
struct seccomp_data sd_local;
u32 ret = SECCOMP_RET_ALLOW;
/* Make sure cross-thread synced filter points somewhere sane. */
struct seccomp_filter *f =
lockless_dereference(current->seccomp.filter);
/* Ensure unexpected behavior doesn't result in failing open. */
if (unlikely(WARN_ON(f == NULL)))
return SECCOMP_RET_KILL;
if (!sd) {
populate_seccomp_data(&sd_local); // fill in seccomp_data for the current syscall
sd = &sd_local;
}
/*
* All filters in the list are evaluated and the lowest BPF return
* value always takes priority (ignoring the DATA).
*/
for (; f; f = f->prev) {
u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)sd);
if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
ret = cur_ret;
}
return ret;
}
用户态
#include <linux/seccomp.h>
#include <linux/filter.h>
#include <linux/audit.h>
#include <linux/signal.h>
#include <sys/ptrace.h>
方法1
int seccomp(unsigned int operation, unsigned int flags, void *args);
方法2 使用 libseccomp 库
方法3:
prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, args);
/* Valid values for seccomp.mode and prctl(PR_SET_SECCOMP, <mode>) */
#define SECCOMP_MODE_DISABLED 0 /* seccomp is not in use. */
#define SECCOMP_MODE_STRICT 1 /* uses hard-coded filter. */
#define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */
/* Valid operations for seccomp syscall. */
#define SECCOMP_SET_MODE_STRICT 0
#define SECCOMP_SET_MODE_FILTER 1
#define SECCOMP_GET_ACTION_AVAIL 2 // since Linux 4.14
/* Usage (a return value of 0 means success):
unsigned int action = SECCOMP_RET_ALLOW;
seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &action);
*/
/* Valid flags for SECCOMP_SET_MODE_FILTER */
#define SECCOMP_FILTER_FLAG_TSYNC (1UL << 0)
#define SECCOMP_FILTER_FLAG_LOG (1UL << 1)
#define SECCOMP_FILTER_FLAG_SPEC_ALLOW (1UL << 2)
/*
 * All BPF programs must return a 32-bit value.
 * The bottom 16-bits are for optional return data.
 * The upper 16-bits are ordered from least permissive values to most,
 * as a signed value (so 0x80000000 is negative).
 *
 * The ordering ensures that a min_t() over composed return values always
 * selects the least permissive choice.
 */
#define SECCOMP_RET_KILL_PROCESS 0x80000000U /* kill the process */
#define SECCOMP_RET_KILL_THREAD 0x00000000U /* kill the thread */
#define SECCOMP_RET_KILL SECCOMP_RET_KILL_THREAD
#define SECCOMP_RET_TRAP 0x00030000U /* disallow and force a SIGSYS */
#define SECCOMP_RET_ERRNO 0x00050000U /* returns an errno */
#define SECCOMP_RET_TRACE 0x7ff00000U /* pass to a tracer or disallow */
#define SECCOMP_RET_LOG 0x7ffc0000U /* allow after logging */
#define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */
fork/clone/exec 会继承 sock_filter,使用 SECCOMP_SET_MODE_FILTER 时需要设置:
prctl(PR_SET_NO_NEW_PRIVS, 1);
该设置会被 fork/clone/exec 继承,且不可撤销。它用来防止下面这种情况:正常情况下,普通进程 exec 一个 setuid 的
程序后,该程序在完成需要特权的操作之后会调用 setuid(uid) 降权;然而,普通进程可以在 exec 之前装好 filter,让 setuid 调用被跳过(不真正执行),导致进程
一直保持以 root 权限执行。设置 PR_SET_NO_NEW_PRIVS 位之后,exec 时不会再获取更多的权限。
当 operation 为 SECCOMP_SET_MODE_FILTER 时,args 是一个指向 sock_fprog 的指针
/*
 * Filter program handed to the kernel: an array of sock_filter
 * instructions together with its element count.
 */
struct sock_fprog {
unsigned short len; /* Number of BPF instructions */
struct sock_filter *filter; /* Pointer to array of
BPF instructions */
};
sock_fprog 至少包含一条 BPF 指令(BPF instruction):
/*
 * A single filter instruction. The BPF_STMT() and BPF_JUMP() macros expand
 * to initializers for this structure in field order: code, jt, jf, k.
 */
struct sock_filter { /* Filter block */
__u16 code; /* Actual filter code */
__u8 jt; /* Jump true */
__u8 jf; /* Jump false */
__u32 k; /* Generic multiuse field */
};
每条指令执行的时候,其操作对象为 seccomp_data
/*
 * Data a seccomp filter executes over, as seen from user space. Filters
 * address its fields by offset, e.g. offsetof(struct seccomp_data, nr).
 */
struct seccomp_data {
int nr; /* System call number */
__u32 arch; /* AUDIT_ARCH_* value
(see <linux/audit.h>) */
__u64 instruction_pointer; /* CPU instruction pointer */
__u64 args[6]; /* Up to 6 system call arguments */
};
内核交互
- /proc/sys/kernel/seccomp
- /proc/$pid/status
- /proc/sys/net/core/bpf_jit_enable
Filter虚拟机
Filter是一组指令,指令只能向前跳转,并且以一个RET指令结束
A表示累加器,X是索引寄存器,K表示立即数
/* Instruction classes */
#define BPF_CLASS(code) ((code) & 0x07)
#define BPF_LD 0x00 // copy a value into the accumulator
#define BPF_LDX 0x01 // copy a value into the index register
#define BPF_ST 0x02
#define BPF_STX 0x03
#define BPF_ALU 0x04 // arithmetic on the accumulator with the index register or a constant
#define BPF_JMP 0x05 // forward jump
#define BPF_RET 0x06 // end the filter and return a value
#define BPF_MISC 0x07
/* ld/ldx fields */
#define BPF_SIZE(code) ((code) & 0x18)
#define BPF_W 0x00 /* 32-bit */ // operand size
#define BPF_MODE(code) ((code) & 0xe0)
#define BPF_IMM 0x00
#define BPF_ABS 0x20 // access the seccomp_data buffer
#define BPF_IND 0x40
#define BPF_MEM 0x60
#define BPF_LEN 0x80
#define BPF_MSH 0xa0
/* alu/jmp fields */
#define BPF_OP(code) ((code) & 0xf0)
#define BPF_ADD 0x00
#define BPF_SUB 0x10
#define BPF_MUL 0x20
#define BPF_DIV 0x30
#define BPF_OR 0x40
#define BPF_AND 0x50
#define BPF_LSH 0x60
#define BPF_RSH 0x70
#define BPF_NEG 0x80
#define BPF_MOD 0x90
#define BPF_XOR 0xa0
#define BPF_JA 0x00 // unconditional jump, 32-bit range
#define BPF_JEQ 0x10 // conditional jump, 8-bit range
#define BPF_JGT 0x20
#define BPF_JGE 0x30
#define BPF_JSET 0x40
#define BPF_SRC(code) ((code) & 0x08)
#define BPF_K 0x00 // the jump comparison is made against the constant K
#define BPF_X 0x08
#ifndef BPF_MAXINSNS
#define BPF_MAXINSNS 4096
#endif
/*
 * Macros for filter block array initializers. struct sock_filter
 */
#ifndef BPF_STMT
#define BPF_STMT(code, k) { (unsigned short)(code), 0, 0, k }
#endif
#ifndef BPF_JUMP
#define BPF_JUMP(code, k, jt, jf) { (unsigned short)(code), jt, jf, k }
#endif
BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, nr))) // load the system call number
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K , __NR_open , 1 , 0) // compare it with __NR_open (jt = 1, jf = 0)
BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW) // not equal: allow the system call
BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL) // equal: forbid the system call
实例
+ struct sock_filter filter[] = {
+ /* Grab the system call number */
// BPF_LD  : copy a value into the accumulator
// BPF_W   : word-sized operand; BPF_H : half-word operand
// BPF_IND : variable offset
// BPF_ABS : fixed offset
// BPF_K   : constant operand; BPF_A : accumulator
// BPF_JEQ : test for equality
//#define BPF_STMT(code, k) { (unsigned short)(code), 0, 0, k }
//#define BPF_JUMP(code, k, jt, jf) { (unsigned short)(code), jt, jf, k }
+ BPF_STMT(BPF_LD+BPF_W+BPF_IND, regoffset(orig_eax)),
// Loads the word located regoffset(orig_eax) bytes into the data the filter
// runs over; per the comment above, this is the system call number.
// NOTE(review): regoffset() is not defined in this excerpt -- confirm.
+ //* Jump table for the allowed syscalls */
// Compare with __NR_rt_sigreturn: when equal, jump ahead by 10 (jt = 10);
// when not equal, continue with the next comparison (jf = 0).
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_rt_sigreturn, 10, 0),
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_sigreturn, 9, 0),
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_exit_group, 8, 0),
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_exit, 7, 0),
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_read, 1, 0),
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_write, 2, 6),
+
+ /* Check that read is only using stdin. */
+ BPF_STMT(BPF_LD+BPF_W+BPF_IND, regoffset(ebx)),
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, STDIN_FILENO, 3, 4),
+
+ /* Check that write is only using stdout/stderr */
+ BPF_STMT(BPF_LD+BPF_W+BPF_IND, regoffset(ebx)),
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, STDOUT_FILENO, 1, 0),
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, STDERR_FILENO, 0, 1),
+
+ /* Put the "accept" value in A */
+ BPF_STMT(BPF_LD+BPF_W+BPF_LEN, 0),
+
+ BPF_STMT(BPF_RET+BPF_A,0),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])),
+ .filter = filter,
+ };