seccomp - Secure Computing

最新推荐文章于 2024-05-03 00:10:07 发布

xialiangzhao

最新推荐文章于 2024-05-03 00:10:07 发布

阅读量871

点赞数

本文链接：https://blog.csdn.net/happykillerxxx/article/details/86634703

版权

参考资料

配置

CONFIG_SECCOMP
CONFIG_SECCOMP_FILTER

内核态

seccomp 以 task_struct 为单位

/* Excerpt only: "..." stands for omitted members; the seccomp state is embedded per task. */
struct task_struct {
    ...
    struct seccomp seccomp;
    ...
}

/*
 * Seccomp state embedded in struct task_struct.
 * @mode:   one of the SECCOMP_MODE_* values.
 * @filter: head of the task's filter list; seccomp_run_filters() starts here
 *          and reaches the remaining filters through seccomp_filter.prev.
 */
struct seccomp {
    int mode;
    struct seccomp_filter *filter;
};

用户态每次调用 seccomp 时注册的 sock_fprog->filter 都加入链表

/*
 * One node in a task's list of attached filters.
 * @usage: atomic counter (presumably a reference count -- confirm against
 *         the kernel source).
 * @prev:  next node to evaluate; seccomp_run_filters() follows this link
 *         until it is NULL.
 * @prog:  the BPF program that is run for this node.
 */
struct seccomp_filter {
    atomic_t usage;
    struct seccomp_filter *prev;
    struct bpf_prog *prog;
};

/*
 * Kernel-side representation of a BPF program. seccomp_filter.prog points at
 * one of these, and seccomp_run_filters() hands it to BPF_PROG_RUN().
 */
struct bpf_prog {
    u16         pages;      /* Number of allocated pages */
    kmemcheck_bitfield_begin(meta);
    u16         jited:1,    /* Is our filter JIT'ed? */
                gpl_compatible:1, /* Is filter GPL compatible? */
                cb_access:1,    /* Is control block accessed? */
                dst_needed:1;   /* Do we need dst entry? */
    kmemcheck_bitfield_end(meta);
    u32         len;        /* Number of filter blocks */
    enum bpf_prog_type  type;       /* Type of BPF program */
    struct bpf_prog_aux *aux;       /* Auxiliary fields */
    struct sock_fprog_kern  *orig_prog; /* Original BPF program */
    unsigned int        (*bpf_func)(const struct sk_buff *skb,
                        const struct bpf_insn *filter);
    /*
     * Instructions for interpreter: the trailing array can be viewed either
     * as struct sock_filter entries or as struct bpf_insn entries.
     */
    union {
        struct sock_filter  insns[0];
        struct bpf_insn     insnsi[0];
    };
};

sock_filter 用于 seccomp，操作对象是系统调用，bpf_insn 用于 BPF，操作对象是 packet。seccomp 复用了执行 insn 的虚拟机

sock_filter 执行流程：
el0_svc - __sys_trace - syscall_trace_enter - secure_computing - __seccomp_filter

对于每个系统调用，执行的时候都需要把该 task_struct 上的 filter 都执行一遍，并且返回一个值表明内核是否允许执行该系统调用

/**
 * struct seccomp_data - the format the BPF program executes over.
 * @nr: the system call number
 * @arch: indicates system call convention as an AUDIT_ARCH_* value
 *        as defined in <linux/audit.h>.
 * @instruction_pointer: at the time of the system call.
 * @args: up to 6 system call arguments always stored as 64-bit values
 *        regardless of the architecture.
 *
 * seccomp_run_filters() passes a pointer to this structure to every filter
 * program it runs.
 */
struct seccomp_data {
    int nr;
    __u32 arch;
    __u64 instruction_pointer;
    __u64 args[6];
};

/**
 * __seccomp_filter - run the task's filters and act on the combined verdict
 * @this_syscall: number of the system call being checked
 * @sd: seccomp data forwarded to seccomp_run_filters(); when NULL that
 *      function fills in the data itself
 * @recheck_after_trace: true for the re-evaluation that follows a
 *      SECCOMP_RET_TRACE stop; a second TRACE verdict then returns 0
 *      immediately instead of notifying the tracer again
 *
 * Returns 0 when the system call may proceed and -1 when it must be
 * skipped. The SECCOMP_RET_KILL/default case ends in do_exit(SIGSYS).
 */
static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
                const bool recheck_after_trace)
{
    u32 filter_ret, action;
    int data;

    /*
     * Make sure that any changes to mode from another thread have
     * been seen after TIF_SECCOMP was seen.
     */
    rmb();
    
    // Split the filters' combined return value into its action and data parts
    filter_ret = seccomp_run_filters(sd);
    data = filter_ret & SECCOMP_RET_DATA;
    action = filter_ret & SECCOMP_RET_ACTION;

    switch (action) {
    case SECCOMP_RET_ERRNO:
        /* Set low-order bits as an errno, capped at MAX_ERRNO. */
        if (data > MAX_ERRNO)
            data = MAX_ERRNO;
        syscall_set_return_value(current, task_pt_regs(current),
                     -data, 0);
        goto skip;

    case SECCOMP_RET_TRAP:
        /* Show the handler the original registers. */
        syscall_rollback(current, task_pt_regs(current));
        /* Let the filter pass back 16 bits of data. */
        seccomp_send_sigsys(this_syscall, data);
        goto skip;

    case SECCOMP_RET_TRACE:
        /* We've been put in this state by the ptracer already. */
        if (recheck_after_trace)
            return 0;

        /* ENOSYS these calls if there is no tracer attached. */
        if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
            syscall_set_return_value(current,
                         task_pt_regs(current),
                         -ENOSYS, 0);
            goto skip;
        }

        /* Allow the BPF to provide the event message */
        ptrace_event(PTRACE_EVENT_SECCOMP, data);
        /*
         * The delivery of a fatal signal during event
         * notification may silently skip tracer notification,
         * which could leave us with a potentially unmodified
         * syscall that the tracer would have liked to have
         * changed. Since the process is about to die, we just
         * force the syscall to be skipped and let the signal
         * kill the process and correctly handle any tracer exit
         * notifications.
         */
        if (fatal_signal_pending(current))
            goto skip;
        /* Check if the tracer forced the syscall to be skipped. */
        this_syscall = syscall_get_nr(current, task_pt_regs(current));
        if (this_syscall < 0)
            goto skip;

        /*
         * Recheck the syscall, since it may have changed. This
         * intentionally uses a NULL struct seccomp_data to force
         * a reload of all registers. This does not goto skip since
         * a skip would have already been reported.
         */
        if (__seccomp_filter(this_syscall, NULL, true))
            return -1;

        return 0;

    case SECCOMP_RET_ALLOW:
        return 0;

    case SECCOMP_RET_KILL:
    default:
        audit_seccomp(this_syscall, SIGSYS, action);
        do_exit(SIGSYS);
    }

    unreachable();

skip:
    audit_seccomp(this_syscall, 0, action);
    return -1;
}

/**
 * seccomp_run_filters - evaluates all seccomp filters against @sd
 * @sd: seccomp data passed to every filter; when NULL it is filled in
 *      locally with populate_seccomp_data()
 *
 * Returns valid seccomp BPF response codes: the filter result whose action
 * part is lowest, or SECCOMP_RET_KILL when the task has no filter attached.
 */
static u32 seccomp_run_filters(const struct seccomp_data *sd)
{
    struct seccomp_data sd_local;
    u32 ret = SECCOMP_RET_ALLOW;
    /* Make sure cross-thread synced filter points somewhere sane. */
    struct seccomp_filter *f =
            lockless_dereference(current->seccomp.filter);

    /* Ensure unexpected behavior doesn't result in failing open. */
    if (unlikely(WARN_ON(f == NULL)))
        return SECCOMP_RET_KILL;

    if (!sd) {
        populate_seccomp_data(&sd_local); // no data supplied: fill in a local seccomp_data
        sd = &sd_local;
    }

    /*
     * All filters in the list are evaluated and the lowest BPF return
     * value always takes priority (ignoring the DATA).
     */
    for (; f; f = f->prev) {
        u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)sd);

        if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
            ret = cur_ret;
    }
    return ret;
}

用户态

#include <linux/seccomp.h>
#include <linux/filter.h>
#include <linux/audit.h>
#include <linux/signal.h>
#include <sys/ptrace.h>

方法1

int seccomp(unsigned int operation, unsigned int flags, void *args);

方法2 使用 libseccomp 库

方法3：
prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, args);

/* Valid values for seccomp.mode and prctl(PR_SET_SECCOMP, <mode>) */
#define SECCOMP_MODE_DISABLED	0 /* seccomp is not in use. */
#define SECCOMP_MODE_STRICT	1 /* uses hard-coded filter. */
#define SECCOMP_MODE_FILTER	2 /* uses user-supplied filter. */

/* Valid operations for seccomp syscall. */
#define SECCOMP_SET_MODE_STRICT     0
#define SECCOMP_SET_MODE_FILTER     1
#define SECCOMP_GET_ACTION_AVAIL    2 // since Linux 4.14

/* Usage of SECCOMP_GET_ACTION_AVAIL; a return value of 0 means success:
unsigned int action = SECCOMP_RET_ALLOW;
seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &action);
*/

/* Valid flags for SECCOMP_SET_MODE_FILTER */
#define SECCOMP_FILTER_FLAG_TSYNC   (1UL << 0)
#define SECCOMP_FILTER_FLAG_LOG     (1UL << 1)
#define SECCOMP_FILTER_FLAG_SPEC_ALLOW  (1UL << 2)

/*
 * All BPF programs must return a 32-bit value.
 * The bottom 16-bits are for optional return data.
 * The upper 16-bits are ordered from least permissive values to most,
 * as a signed value (so 0x80000000 is negative).
 *
 * The ordering ensures that a min_t() over composed return values always
 * selects the least permissive choice.
 */
#define SECCOMP_RET_KILL_PROCESS 0x80000000U /* kill the process */
#define SECCOMP_RET_KILL_THREAD	 0x00000000U /* kill the thread */
#define SECCOMP_RET_KILL	 SECCOMP_RET_KILL_THREAD
#define SECCOMP_RET_TRAP	 0x00030000U /* disallow and force a SIGSYS */
#define SECCOMP_RET_ERRNO	 0x00050000U /* returns an errno */
#define SECCOMP_RET_TRACE	 0x7ff00000U /* pass to a tracer or disallow */
#define SECCOMP_RET_LOG		 0x7ffc0000U /* allow after logging */
#define SECCOMP_RET_ALLOW	 0x7fff0000U /* allow */

fork/clone/exec 会继承 sock_filter，使用 SECCOMP_SET_MODE_FILTER 时需要设置：

prctl(PR_SET_NO_NEW_PRIVS, 1);

该设置会被fork/clone/exec继承，且不可撤销。此设置为了防止一种情况：正常情况下，普通进程 exec 一个setuid的

程序时，会执行完一些操作后 setuid(uid)降权，然而，普通进程可以在exec前设置 filter 来跳过 setuid 执行，导致进程

保持以root权限执行。设置 PR_SET_NO_NEW_PRIVS bit后，使得exec时不会获取更多的权限。

当 operation 为 SECCOMP_SET_MODE_FILTER 时，args 是一个 sock_fprog 指针

/* Filter program descriptor: an instruction count plus a pointer to the instruction array. */
struct sock_fprog {
   unsigned short      len;    /* Number of BPF instructions */
   struct sock_filter *filter; /* Pointer to array of
                                  BPF instructions */
};

sock_fprog 至少包含一个 BPF instructions:

/*
 * One BPF instruction. The BPF_STMT()/BPF_JUMP() initializer macros fill the
 * members in declaration order: code, jt, jf, k.
 */
struct sock_filter {            /* Filter block */
   __u16 code;                 /* Actual filter code */
   __u8  jt;                   /* Jump true */
   __u8  jf;                   /* Jump false */
   __u32 k;                    /* Generic multiuse field */
};

每条指令执行的时候，其操作对象为 seccomp_data

/* The data that every filter instruction operates on. */
struct seccomp_data {
   int   nr;                   /* System call number */
   __u32 arch;                 /* AUDIT_ARCH_* value
                                  (see <linux/audit.h>) */
   __u64 instruction_pointer;  /* CPU instruction pointer */
   __u64 args[6];              /* Up to 6 system call arguments */
};

内核交互

/proc/sys/kernel/seccomp
/proc/$pid/status
/proc/sys/net/core/bpf_jit_enable

Filter虚拟机

Filter是一组指令，指令只能向前跳转，并且以一个RET指令结束

A表示累加器，X是索引寄存器，K表示立即数

/* Instruction classes */
#define BPF_CLASS(code) ((code) & 0x07)
#define		BPF_LD		0x00    // copy a value into the accumulator
#define		BPF_LDX		0x01    // copy a value into the index register
#define		BPF_ST		0x02
#define		BPF_STX		0x03
#define		BPF_ALU		0x04    // arithmetic on the accumulator with the index register or a constant
#define		BPF_JMP		0x05    // jump forward
#define		BPF_RET		0x06    // end the filter and return a value
#define		BPF_MISC        0x07

/* ld/ldx fields */
#define BPF_SIZE(code)  ((code) & 0x18)
#define		BPF_W		0x00 /* 32-bit */    // operand size

#define BPF_MODE(code)  ((code) & 0xe0)
#define		BPF_IMM		0x00
#define		BPF_ABS		0x20    // access the seccomp_data buffer
#define		BPF_IND		0x40
#define		BPF_MEM		0x60
#define		BPF_LEN		0x80
#define		BPF_MSH		0xa0

/* alu/jmp fields */
#define BPF_OP(code)    ((code) & 0xf0)
#define		BPF_ADD		0x00
#define		BPF_SUB		0x10
#define		BPF_MUL		0x20
#define		BPF_DIV		0x30
#define		BPF_OR		0x40
#define		BPF_AND		0x50
#define		BPF_LSH		0x60
#define		BPF_RSH		0x70
#define		BPF_NEG		0x80
#define		BPF_MOD		0x90
#define		BPF_XOR		0xa0

#define		BPF_JA		0x00    // unconditional jump, 32-bit range
#define		BPF_JEQ		0x10    // conditional jump, 8-bit range
#define		BPF_JGT		0x20
#define		BPF_JGE		0x30
#define		BPF_JSET    0x40

#define BPF_SRC(code)   ((code) & 0x08)
#define		BPF_K		0x00    // jump comparisons test against K
#define		BPF_X		0x08

#ifndef BPF_MAXINSNS
#define BPF_MAXINSNS 4096
#endif

/*
 * Macros for filter block array initializers. struct sock_filter
 */
#ifndef BPF_STMT
#define BPF_STMT(code, k) { (unsigned short)(code), 0, 0, k }
#endif
#ifndef BPF_JUMP
#define BPF_JUMP(code, k, jt, jf) { (unsigned short)(code), jt, jf, k }
#endif

BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, nr))) // load the system call number
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K , __NR_open , 1 , 0) // compare with __NR_open: equal skips one instruction (jt = 1), otherwise continue (jf = 0)
BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW)    // not equal: allow the system call
BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL)    // equal: forbid the system call

实例

+	struct sock_filter filter[] = {
		// NOTE(review): this example loads through regoffset()/BPF_IND rather than
		// offsetof(struct seccomp_data, ...)/BPF_ABS; regoffset() is not defined in
		// this excerpt -- confirm which interface it targets.
+		/* Grab the system call number */
			// BPF_LD: copy a value into the accumulator
			// BPF_W: word; BPF_H: half word
			// BPF_IND: variable offset
			// BPF_ABS: fixed offset
			// BPF_K: constant; BPF_A: accumulator
			// BPF_JEQ: test for equality
	// #define BPF_STMT(code, k) { (unsigned short)(code), 0, 0, k }
	// #define BPF_JUMP(code, k, jt, jf) { (unsigned short)(code), jt, jf, k }
 
+		BPF_STMT(BPF_LD+BPF_W+BPF_IND, regoffset(orig_eax)),
		// Load the word located at offset regoffset(orig_eax).
+		/* Jump table for the allowed syscalls */
		// Compare with __NR_rt_sigreturn: on a match jump ahead 10 instructions (jt = 10), otherwise fall through to the next comparison (jf = 0).
+		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_rt_sigreturn, 10, 0),
+		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_sigreturn, 9, 0),
+		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_exit_group, 8, 0),
+		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_exit, 7, 0),
+		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_read, 1, 0),
+		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_write, 2, 6),
+
+		/* Check that read is only using stdin. */
 
+		BPF_STMT(BPF_LD+BPF_W+BPF_IND, regoffset(ebx)),
+		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, STDIN_FILENO, 3, 4),
+
+		/* Check that write is only using stdout/stderr */
+		BPF_STMT(BPF_LD+BPF_W+BPF_IND, regoffset(ebx)),
+		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, STDOUT_FILENO, 1, 0),
+		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, STDERR_FILENO, 0, 1),
+
+		/* Put the "accept" value in A */
+		BPF_STMT(BPF_LD+BPF_W+BPF_LEN, 0),
+
+		BPF_STMT(BPF_RET+BPF_A,0),
+	};
+	struct sock_fprog prog = {
+		.len = (unsigned short)(sizeof(filter)/sizeof(filter[0])),
+		.filter = filter,
+	};

xialiangzhao

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
seccomp - Secure Computing

参考资料 http://www.man7.org/linux/man-pages/man2/seccomp.2.html http://man7.org/conf/lpc2015/limiting_kernel_attack_surface_with_seccomp-LPC_2015-Kerrisk.pdf https://www.freebsd.org/cgi/man....
复制链接

扫一扫