以dhcpcd使用bpf为例进行分析
通过PF_PACKET、SOCK_DGRAM类型的socket,可以直接从内核网卡设备层把收到的原始数据包读到用户空间(SOCK_DGRAM方式下链路层头部由内核去除)。为了只读取感兴趣的数据包类型(例如ARP包),可以通过配置bpf进行过滤。
用户空间 attach bpf
/*
 * Open a PF_PACKET/SOCK_DGRAM socket for `protocol` on interface `iface`,
 * attach the matching BPF filter, make it close-on-exec and non-blocking,
 * and bind it to the interface.
 *
 * On success the descriptor is stored in iface->arp_fd (ETHERTYPE_ARP) or
 * iface->raw_fd (any other protocol), replacing and closing any descriptor
 * already stored there, and is also returned.
 * Returns -1 on failure; no descriptor is leaked.
 */
int
open_socket(struct interface *iface, int protocol)
{
	int s;
	union sockunion {
		struct sockaddr sa;
		struct sockaddr_in sin;
		struct sockaddr_ll sll;
		struct sockaddr_storage ss;
	} su;
	struct sock_fprog pf;
	int *fd;

	/* Create the packet socket for the requested ethertype
	 * (e.g. ETHERTYPE_ARP).  Nothing to clean up yet on failure. */
	if ((s = socket(PF_PACKET, SOCK_DGRAM, htons(protocol))) == -1)
		return -1;

	memset(&su, 0, sizeof(su));
	su.sll.sll_family = PF_PACKET;
	su.sll.sll_protocol = htons(protocol);
	/* Resolve the interface name to its index; 0 means "no such interface". */
	if (!(su.sll.sll_ifindex = if_nametoindex(iface->name))) {
		errno = ENOENT;
		goto eexit;
	}

	/* Install the BPF filter that matches the protocol (ARP or DHCP). */
	memset(&pf, 0, sizeof(pf));
	if (protocol == ETHERTYPE_ARP) {
		pf.filter = UNCONST(arp_bpf_filter);
		pf.len = arp_bpf_filter_len;
	} else {
		pf.filter = UNCONST(dhcp_bpf_filter);
		pf.len = dhcp_bpf_filter_len;
	}
	/* Attach the filter program to the socket. */
	if (setsockopt(s, SOL_SOCKET, SO_ATTACH_FILTER, &pf, sizeof(pf)) != 0)
		goto eexit;
	if (set_cloexec(s) == -1)
		goto eexit;
	if (set_nonblock(s) == -1)
		goto eexit;
	/* Bind the socket to the chosen interface. */
	if (bind(s, &su.sa, sizeof(su)) == -1)
		goto eexit;

	/* Remember the descriptor on the interface, closing a previous one. */
	if (protocol == ETHERTYPE_ARP)
		fd = &iface->arp_fd;
	else
		fd = &iface->raw_fd;
	if (*fd != -1)
		close(*fd);
	*fd = s;
	return s;

eexit:
	close(s);
	return -1;
}
BPF指令码
arp过滤配置为例,
/*
* Try and keep these values and structures similar to BSD, especially
* the BPF code definitions which need to match so you can share filters
*/
struct sock_filter { /* Filter block */
__u16 code; /* Actual filter code */ 指令码
__u8 jt; /* Jump true */ 跳转指令时,如果满足判断条件,跳转到jt偏移指令处
__u8 jf; /* Jump false */跳转指令时,如果不满足判断条件,跳转到jf偏移指令处
__u32 k; /* Generic multiuse field */ 存放用于判断的值
};
/* Filter program descriptor passed to setsockopt(SO_ATTACH_FILTER). */
struct sock_fprog { /* Required for SO_ATTACH_FILTER. */
unsigned short len; /* Number of filter blocks */
/* Pointer to the array of `len` filter blocks; annotated __user. */
struct sock_filter __user *filter;
};
/*
 * BPF program that accepts ARP requests and ARP replies and drops
 * everything else.
 *
 * Jump offsets (jt/jf) are counted from the instruction that follows the
 * jump: 0 means "continue with the next instruction".
 */
static const struct bpf_insn arp_bpf_filter [] = {
#ifndef BPF_SKIPTYPE
	/* Make sure this is an ARP packet... */
	/* Load the 16-bit packet type at byte offset 12 into the accumulator. */
	BPF_STMT(BPF_LD + BPF_H + BPF_ABS, 12),
	/* ETHERTYPE_ARP: continue with the next instruction (jt 0).
	 * Anything else: skip the five instructions that follow and land
	 * on the final "drop" return (jf 5).  A smaller offset would land
	 * on the ARPOP_REPLY comparison with the packet type still in the
	 * accumulator. */
	BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, ETHERTYPE_ARP, 0, 5),
#endif
	/* Make sure this is an ARP REQUEST... */
	/* Load the 16-bit ARP operation at offset 20 (adjusted by BPF_ETHCOOK). */
	BPF_STMT(BPF_LD + BPF_H + BPF_ABS, 20 + BPF_ETHCOOK),
	/* ARPOP_REQUEST: skip two instructions to the "accept" return (jt 2);
	 * otherwise continue with the reply test (jf 0). */
	BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, ARPOP_REQUEST, 2, 0),
	/* or ARP REPLY... */
	/* Reload the ARP operation field. */
	BPF_STMT(BPF_LD + BPF_H + BPF_ABS, 20 + BPF_ETHCOOK),
	/* ARPOP_REPLY: continue with the "accept" return (jt 0);
	 * otherwise skip it and reach the "drop" return (jf 1). */
	BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, ARPOP_REPLY, 0, 1),
	/* If we passed all the tests, ask for the whole packet. */
	BPF_STMT(BPF_RET + BPF_K, BPF_WHOLEPACKET),
	/* Otherwise, drop it: returning 0 means no bytes are kept. */
	BPF_STMT(BPF_RET + BPF_K, 0),
};
SOCK_PACKET
packet socket的创建:用户空间调用socket(PF_PACKET, ...)时,内核会调用packet_create,这里只看关注的点
/*
 * Create a packet socket (abridged listing: only the steps relevant to
 * packet delivery are shown).
 *
 * Allocates the sock, selects the ops table and receive callback for the
 * socket type, and, when a protocol was given, registers the protocol hook.
 *
 * Returns 0 on success or -ENOBUFS when the sock cannot be allocated.
 */
static int packet_create(struct net *net, struct socket *sock, int protocol,
			 int kern)
{
	struct sock *sk;
	struct packet_sock *po;
	__be16 proto = (__force __be16)protocol; /* weird, but documented */

	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
	if (!sk)
		return -ENOBUFS;	/* pkt_sk(sk) below would dereference NULL */

	sock->ops = &packet_ops;
	if (sock->type == SOCK_PACKET)
		sock->ops = &packet_ops_spkt;

	sock_init_data(sock, sk);

	po = pkt_sk(sk);
	sk->sk_family = PF_PACKET;
	po->num = proto;

	/*
	 *	Attach a protocol block
	 */
	spin_lock_init(&po->bind_lock);
	mutex_init(&po->pg_vec_lock);
	/* Receive callback invoked from the device layer
	 * (__netif_receive_skb_core -> deliver_skb). */
	po->prot_hook.func = packet_rcv;

	if (sock->type == SOCK_PACKET)
		po->prot_hook.func = packet_rcv_spkt;

	po->prot_hook.af_packet_priv = sk;

	if (proto) {
		po->prot_hook.type = proto;
		/* Register the hook with the network device layer. */
		register_prot_hook(sk);
	}

	return 0;
}
register_prot_hook--》dev_add_pack--》list_add_rcu(&pt->list, head);其中head由协议类型决定:协议为ETH_P_ALL时是ptype_all,否则是ptype_base中按协议号哈希得到的链表
用户空间bind
static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
struct sock *sk = sock->sk;
struct net_device *dev = NULL;
int err;
/*
* Check legality
*/
if (sll->sll_ifindex) {
err = -ENODEV;
dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
if (dev == NULL)
goto out;
}
err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
}
/*
 * Rebind the socket's protocol hook to `dev` and `protocol`.
 *
 * NOTE(review): this listing looks abridged (no locking or error paths are
 * shown) - confirm against the full kernel source before reuse.
 *
 * Always returns 0.
 */
static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
{
	struct packet_sock *po = pkt_sk(sk);

	/* Remove the previous registration first. */
	unregister_prot_hook(sk, true);

	po->num = protocol;
	po->prot_hook.type = protocol;
	po->prot_hook.dev = dev;		/* bind the hook to this device */
	po->ifindex = dev ? dev->ifindex : 0;	/* 0 when no device is bound */
	packet_cached_dev_assign(po, dev);

	/* Register again, now against the device given to bind();
	 * skipped when a device is given but it is not up. */
	if (!dev || (dev->flags & IFF_UP)) {
		register_prot_hook(sk);
	}

	return 0;
}
网卡设备收到数据
/*
 * Abridged fragment of the receive path: only the loop that hands the skb
 * to the handlers registered on ptype_all is shown.
 */
__netif_receive_skb_core
{
list_for_each_entry_rcu(ptype, &ptype_all, list) { // walk the registered packet_type entries
/* A handler matches when it is not bound to a device or is bound to
 * the device the skb arrived on. */
if (!ptype->dev || ptype->dev == skb->dev) {
/* Delivery runs one entry behind: the previously matched handler is
 * called here and the current one is remembered in pt_prev. */
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
}
}
/*
 * Hand `skb` to the handler `pt_prev`.
 *
 * Takes an extra reference on the skb before invoking the handler's
 * callback and returns the callback's result, or -ENOMEM when
 * skb_orphan_frags() fails.
 */
static inline int deliver_skb(struct sk_buff *skb,
			      struct packet_type *pt_prev,
			      struct net_device *orig_dev)
{
	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
		return -ENOMEM;
	atomic_inc(&skb->users);
	/* Invoke the registered callback; for PF_PACKET sockets this is
	 * packet_rcv. */
	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}
/*
 * Receive callback for packet sockets (abridged listing: only the
 * filtering step is shown).
 *
 * Runs the socket's filter on the skb.  A result of 0 drops the packet;
 * otherwise the result caps the number of bytes (snaplen) kept.
 */
static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
		      struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_ll *sll;
	struct packet_sock *po;
	u8 *skb_head = skb->data;
	int skb_len = skb->len;
	unsigned int snaplen, res;

	/* The owning socket was stored in the hook by packet_create. */
	sk = pt->af_packet_priv;
	po = pkt_sk(sk);
	skb->dev = dev;
	snaplen = skb->len;

	/* Apply the filter attached to the socket, i.e. the BPF program
	 * configured from user space. */
	res = run_filter(skb, sk, snaplen);
	if (!res)	/* 0: drop the packet; otherwise pass it on */
		goto drop_n_restore;
	if (snaplen > res)
		snaplen = res;

	/* ... remainder of the function, including the drop_n_restore label,
	 * is omitted from this listing ... */
}
内核bpf配置流程
用户空间通过setsockopt(s, SOL_SOCKET, SO_ATTACH_FILTER, &pf, sizeof(pf)) 配置bpf
sock_setsockopt–》sk_attach_filter
/**
 *	sk_attach_filter - attach a socket filter
 *	@fprog: the filter program
 *	@sk: the socket to use
 *
 * Attach the user's filter code. We first run some sanity checks on
 * it to make sure it does not explode on us later. If an error
 * occurs or there is insufficient memory for the filter a negative
 * errno code is returned. On success the return is zero.
 *
 * NOTE(review): this listing does not release a filter that was already
 * attached to the socket; the full kernel source presumably does - confirm.
 */
int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
	struct sk_filter *fp;
	unsigned int fsize = sizeof(struct sock_filter) * fprog->len;
	int err;

	/* Make sure a filter program was actually supplied. */
	if (!fprog->filter)
		return -EINVAL;

	/* Allocate the sk_filter header plus room for the instructions. */
	fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL);
	if (!fp)
		return -ENOMEM;

	/* Copy the user-space instructions into fp->insns. */
	if (copy_from_user(fp->insns, fprog->filter, fsize)) {
		sock_kfree_s(sk, fp, fsize+sizeof(*fp));
		return -EFAULT;
	}

	atomic_set(&fp->refcnt, 1);
	fp->len = fprog->len;	/* number of BPF instructions */

	/* Validate the program and select the function that will run it. */
	err = __sk_prepare_filter(fp);
	if (err) {
		sock_kfree_s(sk, fp, fsize+sizeof(*fp));
		return err;
	}

	/* Publish the filter on the socket. */
	rcu_assign_pointer(sk->sk_filter, fp);
	return 0;
}
/*
 * Prepare a freshly copied filter for execution.
 *
 * Installs the interpreter (sk_run_filter) as the default run function,
 * validates the instructions with sk_chk_filter(), and then gives the JIT
 * a chance to replace the run function with native code.
 *
 * Returns 0 on success or the error reported by sk_chk_filter(); an
 * invalid program is never passed to the JIT.
 */
static int __sk_prepare_filter(struct sk_filter *fp)
{
	int err;

	/* Default: run the program with the interpreter. */
	fp->bpf_func = sk_run_filter;

	/* Check the instruction stream (this also rewrites the opcodes
	 * stored in fp->insns). */
	err = sk_chk_filter(fp->insns, fp->len);
	if (err)
		return err;

	/* Translate the BPF bytecode into instructions for the CPU
	 * architecture, when a JIT is available. */
	bpf_jit_compile(fp);
	return 0;
}
bpf字节码编译
bpf字节码在内核编译(转换)为对应cpu架构指令;
linux处理bpf字节码有两种方式:如果定义了CONFIG_BPF_JIT,会在内核中把bpf字节码编译(转换)为对应cpu架构的指令;如果没有定义CONFIG_BPF_JIT,则由sk_run_filter逐条解释执行bpf指令。
#ifdef CONFIG_BPF_JIT
#include <stdarg.h>
#include <linux/linkage.h>
#include <linux/printk.h>

/* JIT entry points; the definitions live outside this header. */
extern void bpf_jit_compile(struct sk_filter *fp);
extern void bpf_jit_free(struct sk_filter *fp);

/*
 * Log the size figures of a JIT run and, when an image exists, hex-dump
 * `proglen` bytes of it.
 */
static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen,
				u32 pass, void *image)
{
	pr_err("flen=%u proglen=%u pass=%u image=%p\n",
	       flen, proglen, pass, image);
	if (!image)
		return;
	print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_ADDRESS,
		       16, 1, image, proglen, false);
}

/* With the JIT configured, run the filter through its bpf_func pointer. */
#define SK_RUN_FILTER(FILTER, SKB) (*FILTER->bpf_func)(SKB, FILTER->insns)
#else
/* Without the JIT the compile/free hooks are empty stubs ... */
static inline void bpf_jit_compile(struct sk_filter *fp) { }
static inline void bpf_jit_free(struct sk_filter *fp) { }

/* ... and filters always run in the interpreter. */
#define SK_RUN_FILTER(FILTER, SKB) sk_run_filter(SKB, FILTER->insns)
#endif
CONFIG_BPF_JIT定义时:arm指令
/*
 * Translate the BPF program in `fp` into ARM instructions (abridged
 * listing).
 *
 * Does nothing when bpf_jit_enable is 0.  A first pass over the program
 * determines how much code will be emitted; memory is then allocated and
 * the code is generated for real.  On success fp->bpf_func is pointed at
 * the generated code.
 */
void bpf_jit_compile(struct sk_filter *fp)
{
	struct jit_ctx ctx;
	unsigned tmp_idx;
	unsigned alloc_size;

	if (!bpf_jit_enable)
		return;

	memset(&ctx, 0, sizeof(ctx));
	ctx.skf		= fp;
	ctx.ret0_fp_idx = -1;

	ctx.offsets = kzalloc(4 * (ctx.skf->len + 1), GFP_KERNEL);
	if (ctx.offsets == NULL)
		return;

	/* fake pass to fill in the ctx->seen */
	if (unlikely(build_body(&ctx)))
		goto out;

	tmp_idx = ctx.idx;
	build_prologue(&ctx);
	ctx.prologue_bytes = (ctx.idx - tmp_idx) * 4;

	/* there's nothing after the epilogue on ARMv7 */
	build_epilogue(&ctx);

	alloc_size = 4 * ctx.idx;
	/* Allocate the memory that will hold the ARM instructions generated
	 * from the BPF bytecode. */
	ctx.target = module_alloc(max(sizeof(struct work_struct),
				      alloc_size));
	if (unlikely(ctx.target == NULL))
		goto out;

	ctx.idx = 0;
	/* Second pass: emit the ARM instructions into ctx.target. */
	build_prologue(&ctx);
	build_body(&ctx);
	build_epilogue(&ctx);

	flush_icache_range((u32)ctx.target, (u32)(ctx.target + ctx.idx));

	if (bpf_jit_enable > 1)
		/* there are 2 passes here */
		bpf_jit_dump(fp->len, alloc_size, 2, ctx.target);

	/* Replace the run function so it points at the generated ARM code. */
	fp->bpf_func = (void *)ctx.target;

	/* ... remainder of the function, including the `out` label, is
	 * omitted from this listing ... */
}
过滤过程
packet_rcv–》run_filter—》SK_RUN_FILTER(filter, skb);
对于SK_RUN_FILTER,如果定义了CONFIG_BPF_JIT,那么调用fp->bpf_func:JIT编译成功时它指向编译生成的arm指令,否则(例如bpf_jit_enable为0时)仍指向默认的sk_run_filter。
如果没有定义CONFIG_BPF_JIT,则直接调用sk_run_filter。
#ifdef CONFIG_BPF_JIT
#include <stdarg.h>
#include <linux/linkage.h>
#include <linux/printk.h>

/* JIT entry points; the definitions live outside this header. */
extern void bpf_jit_compile(struct sk_filter *fp);
extern void bpf_jit_free(struct sk_filter *fp);

/*
 * Log the size figures of a JIT run and, when an image exists, hex-dump
 * `proglen` bytes of it.
 */
static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen,
				u32 pass, void *image)
{
	pr_err("flen=%u proglen=%u pass=%u image=%p\n",
	       flen, proglen, pass, image);
	if (!image)
		return;
	print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_ADDRESS,
		       16, 1, image, proglen, false);
}

/* With the JIT configured, run the filter through its bpf_func pointer. */
#define SK_RUN_FILTER(FILTER, SKB) (*FILTER->bpf_func)(SKB, FILTER->insns)
#else
/* Without the JIT the compile/free hooks are empty stubs ... */
static inline void bpf_jit_compile(struct sk_filter *fp) { }
static inline void bpf_jit_free(struct sk_filter *fp) { }

/* ... and filters always run in the interpreter. */
#define SK_RUN_FILTER(FILTER, SKB) sk_run_filter(SKB, FILTER->insns)
#endif
arm指令,就不看了。
看一下没有定义CONFIG_BPF_JIT时,sk_run_filter
/**
* sk_run_filter - run a filter on a socket
* @skb: buffer to run the filter on
* @fentry: filter to apply
*
* Decode and apply filter instructions to the skb->data.
* Return length to keep, 0 for none. @skb is the data we are
* filtering, @fentry is the array of filter instructions.
* Because all jumps are guaranteed to be before last instruction,
* and last instruction guaranteed to be a RET, we dont need to check
* flen. (We used to pass to this function the length of filter)
*
* The interpreter keeps an accumulator A, an index register X and a small
* scratch store mem[]. Any failed packet load, a division or modulo by a
* zero X, and an unknown opcode all end the run with 0 (keep nothing).
*/
unsigned int sk_run_filter(const struct sk_buff *skb,
const struct sock_filter *fentry)
{
void *ptr;
u32 A = 0; /* Accumulator */
u32 X = 0; /* Index Register */
u32 mem[BPF_MEMWORDS]; /* Scratch Memory Store */
u32 tmp;
int k;
/*
* Process array of filter instructions.
*/
// Interpret the BPF instructions one at a time. Each `continue` advances
// to the next instruction through the loop's fentry++.
for (;; fentry++) {
/* K is the current instruction's operand: read straight from the
 * instruction on 32-bit x86, cached in a local elsewhere. */
#if defined(CONFIG_X86_32)
#define K (fentry->k)
#else
const u32 K = fentry->k;
#endif
switch (fentry->code) {
/* --- ALU: A = A <op> X, or A = A <op> K --- */
case BPF_S_ALU_ADD_X:
A += X;
continue;
case BPF_S_ALU_ADD_K:
A += K;
continue;
case BPF_S_ALU_SUB_X:
A -= X;
continue;
case BPF_S_ALU_SUB_K:
A -= K;
continue;
case BPF_S_ALU_MUL_X:
A *= X;
continue;
case BPF_S_ALU_MUL_K:
A *= K;
continue;
case BPF_S_ALU_DIV_X:
/* Division by a zero X ends the run with 0. */
if (X == 0)
return 0;
A /= X;
continue;
case BPF_S_ALU_DIV_K:
/* No zero check here. NOTE(review): presumably a zero K is rejected
 * when the filter is validated - confirm. */
A /= K;
continue;
case BPF_S_ALU_MOD_X:
/* Modulo by a zero X ends the run with 0. */
if (X == 0)
return 0;
A %= X;
continue;
case BPF_S_ALU_MOD_K:
A %= K;
continue;
case BPF_S_ALU_AND_X:
A &= X;
continue;
case BPF_S_ALU_AND_K:
A &= K;
continue;
case BPF_S_ALU_OR_X:
A |= X;
continue;
case BPF_S_ALU_OR_K:
A |= K;
continue;
/* The ancillary XOR opcode shares the implementation of ALU XOR X. */
case BPF_S_ANC_ALU_XOR_X:
case BPF_S_ALU_XOR_X:
A ^= X;
continue;
case BPF_S_ALU_XOR_K:
A ^= K;
continue;
case BPF_S_ALU_LSH_X:
A <<= X;
continue;
case BPF_S_ALU_LSH_K:
A <<= K;
continue;
case BPF_S_ALU_RSH_X:
A >>= X;
continue;
case BPF_S_ALU_RSH_K:
A >>= K;
continue;
case BPF_S_ALU_NEG:
A = -A;
continue;
/* --- Jumps: the offset is added to fentry and the loop's fentry++ then
 * applies, so offsets are relative to the following instruction. --- */
case BPF_S_JMP_JA:
fentry += K;
continue;
case BPF_S_JMP_JGT_K:
fentry += (A > K) ? fentry->jt : fentry->jf;
continue;
case BPF_S_JMP_JGE_K:
fentry += (A >= K) ? fentry->jt : fentry->jf;
continue;
case BPF_S_JMP_JEQ_K: // compare-and-jump: take jt when A == K, else jf
fentry += (A == K) ? fentry->jt : fentry->jf;
continue;
case BPF_S_JMP_JSET_K:
fentry += (A & K) ? fentry->jt : fentry->jf;
continue;
case BPF_S_JMP_JGT_X:
fentry += (A > X) ? fentry->jt : fentry->jf;
continue;
case BPF_S_JMP_JGE_X:
fentry += (A >= X) ? fentry->jt : fentry->jf;
continue;
case BPF_S_JMP_JEQ_X:
fentry += (A == X) ? fentry->jt : fentry->jf;
continue;
case BPF_S_JMP_JSET_X:
fentry += (A & X) ? fentry->jt : fentry->jf;
continue;
/* --- Packet loads. The *_ABS cases use offset K; the *_IND cases further
 * down compute X + K and jump to the load_w/load_h/load_b labels here.
 * A NULL from load_pointer() ends the run with 0. --- */
case BPF_S_LD_W_ABS: // load the value at the given offset of the skb data into A
k = K;
load_w:
ptr = load_pointer(skb, k, 4, &tmp);
if (ptr != NULL) {
A = get_unaligned_be32(ptr);
continue;
}
return 0;
case BPF_S_LD_H_ABS:
k = K;
load_h:
ptr = load_pointer(skb, k, 2, &tmp);
if (ptr != NULL) {
A = get_unaligned_be16(ptr);
continue;
}
return 0;
case BPF_S_LD_B_ABS:
k = K;
load_b:
ptr = load_pointer(skb, k, 1, &tmp);
if (ptr != NULL) {
A = *(u8 *)ptr;
continue;
}
return 0;
case BPF_S_LD_W_LEN:
A = skb->len;
continue;
case BPF_S_LDX_W_LEN:
X = skb->len;
continue;
case BPF_S_LD_W_IND:
k = X + K;
goto load_w;
case BPF_S_LD_H_IND:
k = X + K;
goto load_h;
case BPF_S_LD_B_IND:
k = X + K;
goto load_b;
case BPF_S_LDX_B_MSH:
/* X = (low nibble of the byte at offset K) * 4. */
ptr = load_pointer(skb, K, 1, &tmp);
if (ptr != NULL) {
X = (*(u8 *)ptr & 0xf) << 2;
continue;
}
return 0;
/* --- Immediates, scratch memory and register transfers --- */
case BPF_S_LD_IMM:
A = K;
continue;
case BPF_S_LDX_IMM:
X = K;
continue;
case BPF_S_LD_MEM:
A = mem[K];
continue;
case BPF_S_LDX_MEM:
X = mem[K];
continue;
case BPF_S_MISC_TAX:
X = A;
continue;
case BPF_S_MISC_TXA:
A = X;
continue;
/* --- Returns: the value is the length to keep, 0 for none --- */
case BPF_S_RET_K:
return K;
case BPF_S_RET_A:
return A;
case BPF_S_ST:
mem[K] = A;
continue;
case BPF_S_STX:
mem[K] = X;
continue;
/* --- Ancillary loads: skb or system metadata into A --- */
case BPF_S_ANC_PROTOCOL:
A = ntohs(skb->protocol);
continue;
case BPF_S_ANC_PKTTYPE:
A = skb->pkt_type;
continue;
case BPF_S_ANC_IFINDEX:
/* No device attached to the skb: end the run with 0. */
if (!skb->dev)
return 0;
A = skb->dev->ifindex;
continue;
case BPF_S_ANC_MARK:
A = skb->mark;
continue;
case BPF_S_ANC_QUEUE:
A = skb->queue_mapping;
continue;
case BPF_S_ANC_HATYPE:
if (!skb->dev)
return 0;
A = skb->dev->type;
continue;
case BPF_S_ANC_RXHASH:
A = skb->rxhash;
continue;
case BPF_S_ANC_CPU:
A = raw_smp_processor_id();
continue;
case BPF_S_ANC_VLAN_TAG:
A = vlan_tx_tag_get(skb);
continue;
case BPF_S_ANC_VLAN_TAG_PRESENT:
A = !!vlan_tx_tag_present(skb);
continue;
case BPF_S_ANC_PAY_OFFSET:
A = __skb_get_poff(skb);
continue;
case BPF_S_ANC_NLATTR: {
/* Search for an attribute of type X starting at offset A; A becomes
 * the attribute's offset from skb->data, or 0 when none is found. */
struct nlattr *nla;
if (skb_is_nonlinear(skb))
return 0;
/* Both checks keep a whole struct nlattr at offset A inside the skb. */
if (skb->len < sizeof(struct nlattr))
return 0;
if (A > skb->len - sizeof(struct nlattr))
return 0;
nla = nla_find((struct nlattr *)&skb->data[A],
skb->len - A, X);
if (nla)
A = (void *)nla - (void *)skb->data;
else
A = 0;
continue;
}
case BPF_S_ANC_NLATTR_NEST: {
/* As above, but searches inside the attribute located at offset A. */
struct nlattr *nla;
if (skb_is_nonlinear(skb))
return 0;
if (skb->len < sizeof(struct nlattr))
return 0;
if (A > skb->len - sizeof(struct nlattr))
return 0;
nla = (struct nlattr *)&skb->data[A];
/* The attribute's declared length must fit in the rest of the skb. */
if (nla->nla_len > skb->len - A)
return 0;
nla = nla_find_nested(nla, X);
if (nla)
A = (void *)nla - (void *)skb->data;
else
A = 0;
continue;
}
#ifdef CONFIG_SECCOMP_FILTER
case BPF_S_ANC_SECCOMP_LD_W:
A = seccomp_bpf_load(fentry->k);
continue;
#endif
default:
/* Unknown opcode: warn (rate limited) and end the run with 0. */
WARN_RATELIMIT(1, "Unknown code:%u jt:%u tf:%u k:%u\n",
fentry->code, fentry->jt,
fentry->jf, fentry->k);
return 0;
}
}
/* Not reached: every case above ends in continue, goto or return. */
return 0;
}
EXPORT_SYMBOL(sk_run_filter);