背景
本人从事网络安全工作,具体为防火墙产品的开发。对Linux内核而言,防火墙功能由Netfilter框架实现,因此产生了学习研究Linux内核Netfilter实现逻辑的兴趣,也想借此平台和大家一起交流学习。
概念
Netfilter 是 Linux 内核中用于进行网络包过滤和操作的框架。在报文收发的处理路径上,针对不同的协议,在不同的Hook位置调用相应的Hook函数(钩子函数),实现对报文的处理。协议类型标明了钩子函数在处理网络包时所关注的协议,目前Netfilter根据不同的协议,对报文处理的支持分为以下几类:
1)NFPROTO_BRIDGE,用于桥接设备相关的协议类型标识。桥接设备是用于连接两个或多个网段的设备,工作在数据链路层,通过MAC地址转发或过滤数据包。
2)NFPROTO_ARP,用于注册与ARP协议相关的钩子函数,以便在ARP报文处理过程中执行特定的操作。
3)NFPROTO_IPV4,ipv4相关报文处理
4)NFPROTO_IPV6, ipv6相关报文处理
与之相对应,存在ebtables、arptables、iptables、ip6tables用于配置相应的规则,从而对具有特定特征的报文执行特定的动作处理,关于xxtables的内容另外讨论。
Netfilter处理
从Linux 内核实现上看,Netfilter处理分为以下几部分:
1)nf_hook_ops注册
2)NF_HOOK 钩子函数处理调用
3)钩子函数处理后的后续处理
nf_hook_ops的注册
钩子函数的注册通常随着模块的加载,在模块初始化过程中通过直接或间接调用nf_register_net_hooks或nf_register_net_hook(二者的区别在于注册一组还是一个钩子函数)完成(不绝对,有的可能在函数处理逻辑过程中注册)。下面以连接跟踪(conntrack)模块的钩子函数注册为例,具体调用过程如下:
这里从nf_register_net_hooks的上层调用函数nf_ct_netns_do_get入手,具体实现如下:
/*
 * nf_ct_netns_do_get - take one conntrack user reference for @nfproto in
 * @net and, for the first user of that family, register its conntrack hooks.
 * @net:     network namespace whose per-net conntrack state (cnet) is updated
 * @nfproto: NFPROTO_IPV4, NFPROTO_IPV6 (only when CONFIG_IPV6 is enabled) or
 *           NFPROTO_BRIDGE; any other value yields -EPROTO
 *
 * Returns 0 on success (including the case where the family already had
 * users and nothing needed registering), -EPROTO for an unsupported family
 * or an unavailable bridge module, or the error returned by the defrag
 * enable / hook registration helpers.
 */
static int nf_ct_netns_do_get(struct net *net, u8 nfproto)
{
struct nf_conntrack_net *cnet = nf_ct_pernet(net);
bool fixup_needed = false, retry = true;
int err = 0;
retry:
/* The per-family user counters below are updated under nf_ct_proto_mutex. */
mutex_lock(&nf_ct_proto_mutex);
/* Netfilter supports several protocol families; branch on the requested one. */
switch (nfproto) {
case NFPROTO_IPV4:
cnet->users4++;
/* Not the first user: skip registration and return with err == 0. */
if (cnet->users4 > 1)
goto out_unlock;
err = nf_defrag_ipv4_enable(net);
if (err) {
/* Reset the counter that was just raised to 1 before reporting the error. */
cnet->users4 = 0;
goto out_unlock;
}
/* Register the IPv4 conntrack hook functions. */
err = nf_register_net_hooks(net, ipv4_conntrack_ops,
ARRAY_SIZE(ipv4_conntrack_ops));
if (err)
cnet->users4 = 0;
else
fixup_needed = true;
break;
#if IS_ENABLED(CONFIG_IPV6)
case NFPROTO_IPV6:
cnet->users6++;
/* Not the first user: skip registration and return with err == 0. */
if (cnet->users6 > 1)
goto out_unlock;
err = nf_defrag_ipv6_enable(net);
if (err < 0) {
cnet->users6 = 0;
goto out_unlock;
}
/* Register the IPv6 conntrack hook functions. */
err = nf_register_net_hooks(net, ipv6_conntrack_ops,
ARRAY_SIZE(ipv6_conntrack_ops));
if (err)
cnet->users6 = 0;
else
fixup_needed = true;
break;
#endif
case NFPROTO_BRIDGE:
if (!nf_ct_bridge_info) {
/* Second pass and the bridge info is still missing: give up. */
if (!retry) {
err = -EPROTO;
goto out_unlock;
}
/*
 * First pass: drop the mutex, request the bridge conntrack module,
 * then jump back to retake the mutex and look again (once only).
 */
mutex_unlock(&nf_ct_proto_mutex);
request_module("nf_conntrack_bridge");
retry = false;
goto retry;
}
if (!try_module_get(nf_ct_bridge_info->me)) {
err = -EPROTO;
goto out_unlock;
}
cnet->users_bridge++;
if (cnet->users_bridge > 1)
goto out_unlock;
/*
 * Register the bridge conntrack hook functions
 * (per the article: for packets forwarded at layer 3).
 */
err = nf_register_net_hooks(net, nf_ct_bridge_info->ops,
nf_ct_bridge_info->ops_size);
/*
 * NOTE(review): the module reference taken by try_module_get() above is
 * not dropped in this function when registration fails - confirm that
 * the caller releases it.
 */
if (err)
cnet->users_bridge = 0;
else
fixup_needed = true;
break;
default:
err = -EPROTO;
break;
}
out_unlock:
mutex_unlock(&nf_ct_proto_mutex);
/*
 * Hooks were newly registered for this family: after releasing the mutex,
 * call nf_ct_iterate_cleanup_net() with nf_ct_tcp_fixup as the callback,
 * passing the namespace and the family (packed into .data) as iterator data.
 */
if (fixup_needed) {
struct nf_ct_iter_data iter_data = {
.net = net,
.data = (void *)(unsigned long)nfproto,
};
nf_ct_iterate_cleanup_net(nf_ct_tcp_fixup, &iter_data);
}
return err;
}
从nf_ct_netns_do_get函数的实现看,连接跟踪模块针对不同的协议报文注册了不同的钩子函数,以ipv4协议报文为例,对应ipv4_conntrack_ops,具体定义如下:
/*
 * IPv4 conntrack hook table. Every entry binds one hook function to a hook
 * point of the NFPROTO_IPV4 family:
 *   .pf       - protocol family the hook applies to
 *   .hooknum  - hook point, i.e. the position on the packet rx/tx path
 *   .priority - determines the call order among hooks at the same point
 *   .hook     - the hook function itself
 */
static const struct nf_hook_ops ipv4_conntrack_ops[] = {
	{	/* PRE_ROUTING */
		.pf		= NFPROTO_IPV4,
		.hooknum	= NF_INET_PRE_ROUTING,
		.priority	= NF_IP_PRI_CONNTRACK,
		.hook		= ipv4_conntrack_in,
	},
	{	/* LOCAL_OUT */
		.pf		= NFPROTO_IPV4,
		.hooknum	= NF_INET_LOCAL_OUT,
		.priority	= NF_IP_PRI_CONNTRACK,
		.hook		= ipv4_conntrack_local,
	},
	{	/* POST_ROUTING */
		.pf		= NFPROTO_IPV4,
		.hooknum	= NF_INET_POST_ROUTING,
		.priority	= NF_IP_PRI_CONNTRACK_CONFIRM,
		.hook		= nf_confirm,
	},
	{	/* LOCAL_IN */
		.pf		= NFPROTO_IPV4,
		.hooknum	= NF_INET_LOCAL_IN,
		.priority	= NF_IP_PRI_CONNTRACK_CONFIRM,
		.hook		= nf_confirm,
	},
};
这里不围绕连接跟踪展开,只是通过此示例了解nf_hook_ops结构体变量中各字段的含义,nf_hook_ops结构体定义如下:
/*
 * struct nf_hook_ops - description of one netfilter hook to be registered:
 * which function to call, for which protocol family, at which hook point,
 * and with which priority.
 */
struct nf_hook_ops {
nf_hookfn *hook; /* pointer to the hook function */
struct net_device *dev; /* net device associated with the hook */
void *priv; /* private data */
u8 pf; /* protocol family (NFPROTO_*) */
/* Hook operation type, indicating the context the hook is used in.
   Three values are currently known: 0: NF_HOOK_OP_UNDEFINED,
   1: NF_HOOK_OP_NF_TABLES, 2: NF_HOOK_OP_BPF */
enum nf_hook_ops_type hook_ops_type:8;
/* Hook point, i.e. the position on the packet-processing path:
   PRE_ROUTING, LOCAL_IN, FORWARD, LOCAL_OUT or POST_ROUTING */
unsigned int hooknum;
/* Hooks are ordered in ascending priority. */
/* Several modules may register hook functions for the same hook point of
   the same protocol; the priority decides their call order. The
   nf_hook_ops are kept sorted by ascending numeric priority, so note that
   a numerically larger value means a lower priority: the smaller the
   value, the earlier the hook is called. */
int priority;
};
nf_hook_ops注册的实质,就是依据nf_hook_ops对应的协议和hook点位置,按照优先级的大小将其放到指定的nf_hook_entry数组中组织起来,方便后续报文处理过程中依据协议、位置,并以优先级决定的顺序调用hook函数处理,具体流程如下:
1)nf_register_net_hooks 函数注册nf_hook_ops组
2)调用nf_register_net_hook逐个注册nf_hook_ops
3)__nf_register_net_hook函数处理
a)pp = nf_hook_entry_head(net, pf, reg->hooknum, reg->dev); 依据网络命名空间、协议、hook点,找到该协议、该位置nf_hook_entries数组起始地址,网络命名空间实现资源隔离。
b)nf_hook_entries_grow依据优先级以升序的方式将nf_hook_ops转换成nf_hook_entry条目信息放入nf_hook_entries内部nf_hook_entry数组对应位置。
NF_HOOK处理
NF_HOOK处理即在报文处理路径上的特定位置调用已注册的相应钩子函数,并根据钩子函数处理后的判定结果决定是否对报文执行后续处理,具体实现及流程如下:
/*
 * NF_HOOK - run the netfilter hooks for (@pf, @hook) on @skb and, if they
 * let the packet through, continue with @okfn.
 *
 * Returns the nf_hook() result unchanged unless it is 1 (i.e. NF_ACCEPT),
 * in which case the return value of okfn(net, sk, skb) is returned instead.
 */
static inline int
NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb,
	struct net_device *in, struct net_device *out,
	int (*okfn)(struct net *, struct sock *, struct sk_buff *))
{
	/* Invoke the registered nf_hook_ops hook functions. */
	int verdict = nf_hook(pf, hook, net, sk, skb, in, out, okfn);

	/* Anything other than 1 (NF_ACCEPT) ends processing here. */
	if (verdict != 1)
		return verdict;

	/* Hooks accepted the packet: run the follow-up processing. */
	return okfn(net, sk, skb);
}
1)调用nf_hook函数,进一步调用对应特定协议、特定位置的钩子函数。
/*
 * nf_hook - look up the hook entries registered in @net for protocol
 * family @pf at hook point @hook and run them on @skb.
 *
 * Returns 1 when there is nothing to run (static-key fast path, unknown
 * family, or no entries for this family/hook point); otherwise returns the
 * result of nf_hook_slow().
 */
static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
struct sock *sk, struct sk_buff *skb,
struct net_device *indev, struct net_device *outdev,
int (*okfn)(struct net *, struct sock *, struct sk_buff *))
{
struct nf_hook_entries *hook_head = NULL;
int ret = 1;
#ifdef CONFIG_JUMP_LABEL
/*
 * Fast path: when pf and hook are compile-time constants and the static
 * key for this pf/hook pair is off, return 1 without entering the RCU
 * read-side section.
 */
if (__builtin_constant_p(pf) &&
__builtin_constant_p(hook) &&
!static_key_false(&nf_hooks_needed[pf][hook]))
return 1;
#endif
rcu_read_lock();
/* Pick the per-netns nf_hook_entries head pointer for this protocol family and hook point. */
switch (pf) {
case NFPROTO_IPV4:
hook_head = rcu_dereference(net->nf.hooks_ipv4[hook]);
break;
case NFPROTO_IPV6:
hook_head = rcu_dereference(net->nf.hooks_ipv6[hook]);
break;
case NFPROTO_ARP:
#ifdef CONFIG_NETFILTER_FAMILY_ARP
/* Warn once and leave hook_head NULL if hook is out of range for hooks_arp. */
if (WARN_ON_ONCE(hook >= ARRAY_SIZE(net->nf.hooks_arp)))
break;
hook_head = rcu_dereference(net->nf.hooks_arp[hook]);
#endif
break;
case NFPROTO_BRIDGE:
#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
hook_head = rcu_dereference(net->nf.hooks_bridge[hook]);
#endif
break;
default:
/* Unknown family: warn once; hook_head stays NULL so 1 is returned. */
WARN_ON_ONCE(1);
break;
}
if (hook_head) {
struct nf_hook_state state;
/* Initialise nf_hook_state from the arguments for use by the hook
   functions. The hook field of nf_hook_ops is an nf_hookfn pointer,
   defined as:
   typedef unsigned int nf_hookfn(void *priv, struct sk_buff *skb,
   const struct nf_hook_state *state);
   so each hook needs an nf_hook_state argument. */
nf_hook_state_init(&state, hook, pf, indev, outdev,
sk, net, okfn);
/* Call the hook functions one by one, starting at index 0 of hook_head. */
ret = nf_hook_slow(skb, &state, hook_head, 0);
}
rcu_read_unlock();
return ret;
}
通过上述实现可知,钩子函数调用,最终通过nf_hook_slow函数实现。
/*
 * nf_hook_slow - call the hook entries in @e on @skb, starting at index @s.
 *
 * Entries are stored in ascending numeric priority, so lower values run
 * first; this is why a numerically larger priority means "lower priority".
 *
 * Returns:
 *   1  - every hook from @s onward let the packet continue
 *   NF_DROP_GETERR(verdict), or -EPERM if that is 0 - a hook returned NF_DROP
 *        (the skb has been freed)
 *   the nf_queue() result - a hook returned NF_QUEUE and nf_queue() did not
 *        return 1
 *   0  - any other verdict (NF_STOLEN and non-conventional verdicts)
 */
int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
		 const struct nf_hook_entries *e, unsigned int s)
{
	while (s < e->num_hook_entries) {
		unsigned int verdict = nf_hook_entry_hookfn(&e->hooks[s], skb, state);
		unsigned int action = verdict & NF_VERDICT_MASK;
		int ret;

		if (action == NF_DROP) {
			kfree_skb_reason(skb,
					 SKB_DROP_REASON_NETFILTER_DROP);
			ret = NF_DROP_GETERR(verdict);
			return ret ? ret : -EPERM;
		}

		if (action == NF_QUEUE) {
			/* nf_queue() returning 1 means: move on to the next hook. */
			ret = nf_queue(skb, state, s, verdict);
			if (ret != 1)
				return ret;
		} else if (action != NF_ACCEPT) {
			/* Implicit handling for NF_STOLEN, as well as any other
			 * non conventional verdicts.
			 */
			return 0;
		}

		s++;
	}

	return 1;
}
HOOK点处理位置
Netfilter框架针对每种协议报文,提供多个HOOK点处理位置以及相应的后续处理,以IPv4报文处理为例,报文处理流程及路径如图所示:
NF_HOOK处理调用具体位置及实现如下。
PRE_ROUTING
/*
 * Excerpt of ip_rcv(); most of the body is elided ("......") by the article.
 * The visible tail hands the skb to the NFPROTO_IPV4 hooks at
 * NF_INET_PRE_ROUTING, with dev as the input device, no output device, and
 * ip_rcv_finish as the continuation (okfn).
 */
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
struct net_device *orig_dev)
{
......
/* Per the article: DNAT processing is invoked at this hook point. */
return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL,
skb, dev, NULL, ip_rcv_finish);
}
LOCAL_IN
/*
 * Excerpt of ip_local_deliver(); the leading part of the body is elided
 * ("......") by the article. The visible tail runs the NFPROTO_IPV4 hooks
 * at NF_INET_LOCAL_IN with skb->dev as the input device, no output device,
 * and ip_local_deliver_finish as the continuation (okfn).
 */
int ip_local_deliver(struct sk_buff *skb)
{
......
return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN,
net, NULL, skb, skb->dev, NULL, ip_local_deliver_finish);
}
EXPORT_SYMBOL(ip_local_deliver);
FORWARD
/*
 * Excerpt of ip_forward(); the leading part of the body is elided ("......")
 * by the article, including the code that jumps to the error labels below.
 * On the normal path the skb is passed to the NFPROTO_IPV4 hooks at
 * NF_INET_FORWARD with skb->dev as input device, rt->dst.dev as output
 * device, and ip_forward_finish as the continuation (okfn).
 * The error paths free the skb and return NET_RX_DROP.
 */
int ip_forward(struct sk_buff *skb)
{
......
/* Netfilter processing (rule matching / filtering) for forwarded packets. */
return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD,
net, NULL, skb, skb->dev, rt->dst.dev, ip_forward_finish);
sr_failed:
/*
 * Strict routing permits no gatewaying
 */
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0);
goto drop;
too_many_hops:
/* Tell the sender its packet died... */
__IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
SKB_DR_SET(reason, IP_INHDR);
drop:
/* Common drop exit: free the skb with the recorded reason. */
kfree_skb_reason(skb, reason);
return NET_RX_DROP;
}
LOCAL_OUT
/*
 * Depending on the specific protocol of the outgoing packet, this hook
 * point may be invoked from several places. Here the NFPROTO_IPV4 hooks at
 * NF_INET_LOCAL_OUT are run with no input device, rt->dst.dev as the output
 * device, and dst_output as the continuation (okfn).
 */
err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
net, sk, skb, NULL, rt->dst.dev, dst_output);
POST_ROUTING
/*
 * Excerpt of ip_output(); part of the body is elided ("......") by the
 * article. The visible tail passes the skb to NF_HOOK_COND for the
 * NFPROTO_IPV4 / NF_INET_POST_ROUTING hook point, with ip_finish_output as
 * the continuation (okfn).
 */
int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;/* indev: inbound interface of the packet */
......
/*
 * Per the article: SNAT can be performed at POST_ROUTING.
 * The last argument is the condition handed to NF_HOOK_COND: true when
 * IPSKB_REROUTED is not set in the skb's IP control-block flags.
 */
return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
net, sk, skb, indev, dev, ip_finish_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}