seccomp - Secure Computing

参考资料

 

配置

  • CONFIG_SECCOMP
  • CONFIG_SECCOMP_FILTER


内核态


seccomp 以 task_struct 为单位

struct task_struct {
    ...
    struct seccomp seccomp;
    ...
}

/*
 * Per-task seccomp state; embedded in struct task_struct as the
 * `seccomp` member.
 */
struct seccomp {
    int mode;   /* one of the SECCOMP_MODE_* values */
    struct seccomp_filter *filter;  /* head of the filter chain, walked through ->prev */
};

用户态每次调用 seccomp 时注册的 sock_fprog->filter 都加入链表

/*
 * One attached filter program. Filters form a singly linked chain through
 * @prev; seccomp_run_filters() walks that chain and runs each @prog.
 */
struct seccomp_filter {
    atomic_t usage;                 /* presumably a reference count -- confirm against its users */
    struct seccomp_filter *prev;    /* next filter to evaluate in the chain; NULL ends the walk */
    struct bpf_prog *prog;          /* BPF program run against the seccomp_data */
};

struct bpf_prog {
    u16         pages;      /* Number of allocated pages */
    kmemcheck_bitfield_begin(meta);
    u16         jited:1,    /* Is our filter JIT'ed? */
                gpl_compatible:1, /* Is filter GPL compatible? */
                cb_access:1,    /* Is control block accessed? */
                dst_needed:1;   /* Do we need dst entry? */
    kmemcheck_bitfield_end(meta);
    u32         len;        /* Number of filter blocks */
    enum bpf_prog_type  type;       /* Type of BPF program */
    struct bpf_prog_aux *aux;       /* Auxiliary fields */
    struct sock_fprog_kern  *orig_prog; /* Original BPF program */
    unsigned int        (*bpf_func)(const struct sk_buff *skb,
                        const struct bpf_insn *filter);
    /* Instructions for interpreter */
    union {
        struct sock_filter  insns[0];
        struct bpf_insn     insnsi[0];
    };
};

sock_filter 用于 seccomp,操作对象是系统调用;bpf_insn 用于 BPF,操作对象是 packet。seccomp 复用了执行 insn 的虚拟机


sock_filter 执行流程:
el0_svc - __sys_trace - syscall_trace_enter - secure_computing - __seccomp_filter

对于每个系统调用,执行的时候都需要把该 task_struct 上的 filter 都执行一遍,并且返回一个值表明内核是否允许执行该系统调用

/**
 * struct seccomp_data - the format the BPF program executes over.
 * @nr: the system call number
 * @arch: indicates system call convention as an AUDIT_ARCH_* value
 *        as defined in <linux/audit.h>.
 * @instruction_pointer: at the time of the system call.
 * @args: up to 6 system call arguments always stored as 64-bit values
 *        regardless of the architecture.
 */
struct seccomp_data {
    int nr;
    __u32 arch;
    __u64 instruction_pointer;
    __u64 args[6];
};

/*
 * __seccomp_filter - run the task's filters and act on the resulting action
 * @this_syscall: system call number, used for audit and SIGSYS reporting;
 *                re-read from the registers after a tracer event
 * @sd: data handed to seccomp_run_filters(); NULL makes it repopulate
 * @recheck_after_trace: true on the recursive call made after a tracer
 *                       event, so a second SECCOMP_RET_TRACE returns 0
 *
 * Returns 0 for SECCOMP_RET_ALLOW (and for a passed post-trace recheck),
 * -1 on every path that reaches the skip label or fails the recheck.
 * SECCOMP_RET_KILL and any unrecognised action end in do_exit(SIGSYS).
 */
static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
                const bool recheck_after_trace)
{
    u32 filter_ret, action;
    int data;

    /*
     * Make sure that any changes to mode from another thread have
     * been seen after TIF_SECCOMP was seen.
     */
    rmb();
    
    // Run the filters, then split the result into its action and data parts
    filter_ret = seccomp_run_filters(sd);
    data = filter_ret & SECCOMP_RET_DATA;
    action = filter_ret & SECCOMP_RET_ACTION;

    switch (action) {
    case SECCOMP_RET_ERRNO:
        /* Set low-order bits as an errno, capped at MAX_ERRNO. */
        if (data > MAX_ERRNO)
            data = MAX_ERRNO;
        syscall_set_return_value(current, task_pt_regs(current),
                     -data, 0);
        goto skip;

    case SECCOMP_RET_TRAP:
        /* Show the handler the original registers. */
        syscall_rollback(current, task_pt_regs(current));
        /* Let the filter pass back 16 bits of data. */
        seccomp_send_sigsys(this_syscall, data);
        goto skip;

    case SECCOMP_RET_TRACE:
        /* We've been put in this state by the ptracer already. */
        if (recheck_after_trace)
            return 0;

        /* ENOSYS these calls if there is no tracer attached. */
        if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
            syscall_set_return_value(current,
                         task_pt_regs(current),
                         -ENOSYS, 0);
            goto skip;
        }

        /* Allow the BPF to provide the event message */
        ptrace_event(PTRACE_EVENT_SECCOMP, data);
        /*
         * The delivery of a fatal signal during event
         * notification may silently skip tracer notification,
         * which could leave us with a potentially unmodified
         * syscall that the tracer would have liked to have
         * changed. Since the process is about to die, we just
         * force the syscall to be skipped and let the signal
         * kill the process and correctly handle any tracer exit
         * notifications.
         */
        if (fatal_signal_pending(current))
            goto skip;
        /* Check if the tracer forced the syscall to be skipped. */
        this_syscall = syscall_get_nr(current, task_pt_regs(current));
        if (this_syscall < 0)
            goto skip;

        /*
         * Recheck the syscall, since it may have changed. This
         * intentionally uses a NULL struct seccomp_data to force
         * a reload of all registers. This does not goto skip since
         * a skip would have already been reported.
         */
        if (__seccomp_filter(this_syscall, NULL, true))
            return -1;

        return 0;

    case SECCOMP_RET_ALLOW:
        return 0;

    case SECCOMP_RET_KILL:
    default:
        /* Unknown actions are treated exactly like SECCOMP_RET_KILL. */
        audit_seccomp(this_syscall, SIGSYS, action);
        do_exit(SIGSYS);
    }

    unreachable();

skip:
    /* Common exit for ERRNO, TRAP and the TRACE skip cases. */
    audit_seccomp(this_syscall, 0, action);
    return -1;
}

/**
 * seccomp_run_filters - evaluates all seccomp filters against @sd
 * @sd: seccomp data passed to every filter; when NULL, a local copy is
 *      filled in via populate_seccomp_data() and used instead
 *
 * Returns valid seccomp BPF response codes: the filter result whose
 * action part is numerically lowest across the whole chain (starting
 * from SECCOMP_RET_ALLOW), or SECCOMP_RET_KILL when the task has no
 * filter attached.
 */
static u32 seccomp_run_filters(const struct seccomp_data *sd)
{
    struct seccomp_data sd_local;
    u32 ret = SECCOMP_RET_ALLOW;
    /* Make sure cross-thread synced filter points somewhere sane. */
    struct seccomp_filter *f =
            lockless_dereference(current->seccomp.filter);

    /* Ensure unexpected behavior doesn't result in failing open. */
    if (unlikely(WARN_ON(f == NULL)))
        return SECCOMP_RET_KILL;

    if (!sd) {
        populate_seccomp_data(&sd_local); // populate the local seccomp_data
        sd = &sd_local;
    }

    /*
     * All filters in the list are evaluated and the lowest BPF return
     * value always takes priority (ignoring the DATA).
     */
    for (; f; f = f->prev) {
        u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)sd);

        /* Strict '<': on equal actions the earlier result is kept. */
        if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
            ret = cur_ret;
    }
    return ret;
}

 

用户态

 

#include <linux/seccomp.h>
#include <linux/filter.h>
#include <linux/audit.h>
#include <linux/signal.h>
#include <sys/ptrace.h>

方法1

int seccomp(unsigned int operation, unsigned int flags, void *args);

方法2 使用 libseccomp 库

方法3:
prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, args);
/* Valid values for seccomp.mode and prctl(PR_SET_SECCOMP, <mode>) */
#define SECCOMP_MODE_DISABLED	0 /* seccomp is not in use. */
#define SECCOMP_MODE_STRICT	1 /* uses hard-coded filter. */
#define SECCOMP_MODE_FILTER	2 /* uses user-supplied filter. */

/* Valid operations for seccomp syscall. */
#define SECCOMP_SET_MODE_STRICT     0
#define SECCOMP_SET_MODE_FILTER     1
#define SECCOMP_GET_ACTION_AVAIL    2 // since Linux 4.14

/* 使用方法,返回值为0为成功
unsigned int action = SECCOMP_RET_ALLOW
seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &action);
*/

/* Valid flags for SECCOMP_SET_MODE_FILTER */
#define SECCOMP_FILTER_FLAG_TSYNC   (1UL << 0)
#define SECCOMP_FILTER_FLAG_LOG     (1UL << 1)
#define SECCOMP_FILTER_FLAG_SPEC_ALLOW  (1UL << 2)

/*
 * All BPF programs must return a 32-bit value.
 * The bottom 16-bits are for optional return data.
 * The upper 16-bits are ordered from least permissive values to most,
 * as a signed value (so 0x8000000 is negative).
 *
 * The ordering ensures that a min_t() over composed return values always
 * selects the least permissive choice.
 */
#define SECCOMP_RET_KILL_PROCESS 0x80000000U /* kill the process */
#define SECCOMP_RET_KILL_THREAD	 0x00000000U /* kill the thread */
#define SECCOMP_RET_KILL	 SECCOMP_RET_KILL_THREAD
#define SECCOMP_RET_TRAP	 0x00030000U /* disallow and force a SIGSYS */
#define SECCOMP_RET_ERRNO	 0x00050000U /* returns an errno */
#define SECCOMP_RET_TRACE	 0x7ff00000U /* pass to a tracer or disallow */
#define SECCOMP_RET_LOG		 0x7ffc0000U /* allow after logging */
#define SECCOMP_RET_ALLOW	 0x7fff0000U /* allow */

fork/clone/exec 会继承 sock_filter,使用 SECCOMP_SET_MODE_FILTER 时需要设置:

prctl(PR_SET_NO_NEW_PRIVS, 1);

该设置会被fork/clone/exec继承,且不可撤销。此设置为了防止一种情况:正常情况下,普通进程 exec 一个setuid的

程序时,会执行完一些操作后 setuid(uid)降权,然而,普通进程可以在exec前设置 filter 来跳过 setuid 执行,导致进程

保持以root权限执行。设置 PR_SET_NO_NEW_PRIVS bit后,使得exec时不会获取更多的权限。

 

当 operation 为 SECCOMP_SET_MODE_FILTER 时,args 是一个指向 sock_fprog 的指针

struct sock_fprog {
   unsigned short      len;    /* Number of BPF instructions */
   struct sock_filter *filter; /* Pointer to array of
                                  BPF instructions */
};

sock_fprog 至少包含一个 BPF instructions:

struct sock_filter {            /* Filter block */
   __u16 code;                 /* Actual filter code */
   __u8  jt;                   /* Jump true */
   __u8  jf;                   /* Jump false */
   __u32 k;                    /* Generic multiuse field */
};

每条指令执行的时候,其操作对象为 seccomp_data

struct seccomp_data {
   int   nr;                   /* System call number */
   __u32 arch;                 /* AUDIT_ARCH_* value
                                  (see <linux/audit.h>) */
   __u64 instruction_pointer;  /* CPU instruction pointer */
   __u64 args[6];              /* Up to 6 system call arguments */
};

 

内核交互

  • /proc/sys/kernel/seccomp
  • /proc/$pid/status
  • /proc/sys/net/core/bpf_jit_enable

 

Filter虚拟机

 

Filter是一组指令,指令只能向前跳转,并且以一个RET指令结束

A表示累加器,X是索引寄存器,K表示立即数

 

 

/* Instruction classes */
#define BPF_CLASS(code) ((code) & 0x07)
#define		BPF_LD		0x00    // 拷贝值到累加器
#define		BPF_LDX		0x01    // 拷贝值到索引寄存器
#define		BPF_ST		0x02
#define		BPF_STX		0x03
#define		BPF_ALU		0x04    // 累加器和索引寄存器或者常量计算
#define		BPF_JMP		0x05    // 向前跳转
#define		BPF_RET		0x06    // 结束filter,并返回值 
#define		BPF_MISC        0x07

/* ld/ldx fields */
#define BPF_SIZE(code)  ((code) & 0x18)
#define		BPF_W		0x00 /* 32-bit */    // 操作数size

#define BPF_MODE(code)  ((code) & 0xe0)
#define		BPF_IMM		0x00
#define		BPF_ABS		0x20    // 访问 seccomp_data buffer
#define		BPF_IND		0x40
#define		BPF_MEM		0x60
#define		BPF_LEN		0x80
#define		BPF_MSH		0xa0

/* alu/jmp fields */
#define BPF_OP(code)    ((code) & 0xf0)
#define		BPF_ADD		0x00
#define		BPF_SUB		0x10
#define		BPF_MUL		0x20
#define		BPF_DIV		0x30
#define		BPF_OR		0x40
#define		BPF_AND		0x50
#define		BPF_LSH		0x60
#define		BPF_RSH		0x70
#define		BPF_NEG		0x80
#define		BPF_MOD		0x90
#define		BPF_XOR		0xa0

#define		BPF_JA		0x00    // 无条件跳转,范围32bit
#define		BPF_JEQ		0x10    // 条件跳转,范围8bit
#define		BPF_JGT		0x20
#define		BPF_JGE		0x30
#define		BPF_JSET    0x40

#define BPF_SRC(code)   ((code) & 0x08)
#define		BPF_K		0x00    // 跳转比较测试对象是K
#define		BPF_X		0x08

#ifndef BPF_MAXINSNS
#define BPF_MAXINSNS 4096
#endif

/*
 * Macros for filter block array initializers. struct sock_filter
 */
#ifndef BPF_STMT
#define BPF_STMT(code, k) { (unsigned short)(code), 0, 0, k }
#endif
#ifndef BPF_JUMP
#define BPF_JUMP(code, k, jt, jf) { (unsigned short)(code), jt, jf, k }
#endif

 

BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, nr))) // 获取系统调用号
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K , __NR_open , 1 , 0) // 和__NR_open比较是否相等
BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW)    // 不相等则允许该系统调用
BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL)    // 相等禁止该系统调用

实例

+	struct sock_filter filter[] = {
+		/* Grab the system call number */
			//BPF_LD 将值拷贝进寄存器(accumulator)
			//BPF_W Word BPF_H half Word
			//BPF_IND 可变的偏移 
			//BPF_ABS 固定的偏移
			//BPF_K 常数 BPF_A 累加器
			//BPF_JEQ 判断是否相等
	//#define BPF_STMT(code, k) { (unsigned short)(code), 0, 0, k }  
	//#define BPF_JUMP(code, k, jt, jf) { (unsigned short)(code), jt, jf, k }  
 
+		BPF_STMT(BPF_LD+BPF_W+BPF_IND, regoffset(orig_eax)),
		//物理头,偏移regoffset(orig_eax) byte后,指向type*。
+		//* Jump table for the allowed syscalls */
		//进行比较,是否为 __NR_rt_sigreturn。true 的话向前跳过 10 条指令(jt=10),false 的话跳过 0 条、继续执行下一条(jf=0)
+		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_rt_sigreturn, 10, 0),
+		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_sigreturn, 9, 0),
+		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_exit_group, 8, 0),
+		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_exit, 7, 0),
+		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_read, 1, 0),
+		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_write, 2, 6),
+
+		/* Check that read is only using stdin. */
 
+		BPF_STMT(BPF_LD+BPF_W+BPF_IND, regoffset(ebx)),
+		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, STDIN_FILENO, 3, 4),
+
+		/* Check that write is only using stdout/stderr */
+		BPF_STMT(BPF_LD+BPF_W+BPF_IND, regoffset(ebx)),
+		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, STDOUT_FILENO, 1, 0),
+		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, STDERR_FILENO, 0, 1),
+
+		/* Put the "accept" value in A */
+		BPF_STMT(BPF_LD+BPF_W+BPF_LEN, 0),
+
+		BPF_STMT(BPF_RET+BPF_A,0),
+	};
+	struct sock_fprog prog = {
+		.len = (unsigned short)(sizeof(filter)/sizeof(filter[0])),
+		.filter = filter,
+	};

 

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值