A Shallow Dive into iptables: Having Some Fun with netfilter in the Kernel~
Source Code Analysis | How Netfilter and iptables Work in the Linux Kernel

Netfilter exposes five hook points along the kernel's IPv4 packet path, and everything iptables does hangs off them. The hook points are defined as an enum:
//kernel/msm-5.4/include/uapi/linux/netfilter.h
enum nf_inet_hooks {
	NF_INET_PRE_ROUTING,	/* just entered the IP stack, before routing */
	NF_INET_LOCAL_IN,	/* routed to a local socket */
	NF_INET_FORWARD,	/* routed onward to another host */
	NF_INET_LOCAL_OUT,	/* locally generated, before routing */
	NF_INET_POST_ROUTING,	/* about to leave, after routing */
	NF_INET_NUMHOOKS
};
The call sites below show where each hook fires on the IPv4 path:
//kernel/msm-5.4/net/ipv4/ip_input.c
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
struct net_device *orig_dev)
{
//...
return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
net, NULL, skb, dev, NULL,
ip_rcv_finish);
}
//kernel/msm-5.4/net/ipv4/ip_input.c
int ip_local_deliver(struct sk_buff *skb)
{
//...
return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN,
net, NULL, skb, skb->dev, NULL,
ip_local_deliver_finish);
}
//kernel/msm-5.4/net/ipv4/ip_forward.c
int ip_forward(struct sk_buff *skb)
{
//...
return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD,
net, NULL, skb, skb->dev, rt->dst.dev,
ip_forward_finish);
//...
}
//kernel/msm-5.4/net/ipv4/ip_output.c
int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
//...
err = __ip_local_out(net, sk, skb);
//...
}
int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
//...
return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
net, sk, skb, NULL, skb_dst(skb)->dev,
dst_output);
}
//kernel/msm-5.4/net/ipv4/ip_output.c
int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
//...
return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
net, sk, skb, NULL, dev,
ip_finish_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
//kernel/msm-5.4/include/linux/netfilter.h
static inline int
NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb,
struct net_device *in, struct net_device *out,
int (*okfn)(struct net *, struct sock *, struct sk_buff *))
{
int ret = nf_hook(pf, hook, net, sk, skb, in, out, okfn);
if (ret == 1)
ret = okfn(net, sk, skb);
return ret;
}
Parameters:
pf: protocol family, e.g. NFPROTO_IPV4, NFPROTO_IPV6, NFPROTO_ARP, NFPROTO_BRIDGE.
hook: the hook point, one of the five listed above.
net: the network namespace the packet belongs to.
sk: the socket associated with the packet, if any.
skb: the packet itself (socket buffer).
in: the network device the packet arrived on.
out: the network device the packet will leave through.
okfn: the callback to run next if the hook chain does not drop the packet.
NF_HOOK first calls nf_hook to run the hook chain; if nf_hook returns 1 (the packet passed all hooks), okfn is invoked to continue processing.
//kernel/msm-5.4/include/linux/netfilter.h
static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
struct sock *sk, struct sk_buff *skb,
struct net_device *indev, struct net_device *outdev,
int (*okfn)(struct net *, struct sock *, struct sk_buff *))
{
struct nf_hook_entries *hook_head = NULL;
int ret = 1;
//...
	rcu_read_lock();
	switch (pf) {
	case NFPROTO_IPV4:
		hook_head = rcu_dereference(net->nf.hooks_ipv4[hook]);
		break;
	//... (other protocol families)
	}
	if (hook_head) {
struct nf_hook_state state;
nf_hook_state_init(&state, hook, pf, indev, outdev,
sk, net, okfn);
ret = nf_hook_slow(skb, &state, hook_head, 0);
}
rcu_read_unlock();
return ret;
}
struct net contains a struct netns_nf member, which holds one nf_hook_entries list per hook point — each list being that hook point's registered hook functions. Using the hook argument, nf_hook looks up hook_head, the function list for that hook point.
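For reference, the per-namespace hook lists live in struct netns_nf; an abridged look (the exact fields depend on kernel configuration):
//kernel/msm-5.4/include/net/netns/netfilter.h
struct netns_nf {
	//...
	struct nf_hook_entries __rcu *hooks_ipv4[NF_INET_NUMHOOKS];
	struct nf_hook_entries __rcu *hooks_ipv6[NF_INET_NUMHOOKS];
	//...
};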
It then initializes an nf_hook_state object:
//kernel/msm-5.4/include/linux/netfilter.h
struct nf_hook_state {
unsigned int hook;
u_int8_t pf;
struct net_device *in;
struct net_device *out;
struct sock *sk;
struct net *net;
int (*okfn)(struct net *, struct sock *, struct sk_buff *);
};
struct nf_hook_entries {
u16 num_hook_entries;
struct nf_hook_entry hooks[];
};
struct nf_hook_entry {
nf_hookfn *hook;
void *priv;
};
nf_hook_state bundles the parameters a hook function needs in order to run.
The hooks array inside nf_hook_entries holds every filter function registered at a hook point; the concrete hook is the nf_hookfn pointer inside each nf_hook_entry.
Next, nf_hook_slow runs the chain:
//kernel/msm-5.4/net/netfilter/core.c
int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
const struct nf_hook_entries *e, unsigned int s)
{
unsigned int verdict;
int ret;
for (; s < e->num_hook_entries; s++) {
verdict = nf_hook_entry_hookfn(&e->hooks[s], skb, state);
switch (verdict & NF_VERDICT_MASK) {
case NF_ACCEPT:
break;
case NF_DROP:
kfree_skb(skb);
ret = NF_DROP_GETERR(verdict);
if (ret == 0)
ret = -EPERM;
return ret;
case NF_QUEUE:
ret = nf_queue(skb, state, s, verdict);
if (ret == 1)
continue;
return ret;
default:
/* Implicit handling for NF_STOLEN, as well as any other
* non conventional verdicts.
*/
return 0;
}
}
return 1;
}
nf_hook_slow walks the hook functions in nf_hook_entries, executing each in turn and acting on its verdict: NF_ACCEPT moves on to the next hook, NF_DROP frees the packet and aborts, and NF_QUEUE hands the packet off to userspace.
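The verdict values come from the same uapi header as the hook points:
//kernel/msm-5.4/include/uapi/linux/netfilter.h
/* Responses from hook functions. */
#define NF_DROP 0
#define NF_ACCEPT 1
#define NF_STOLEN 2
#define NF_QUEUE 3
#define NF_REPEAT 4
#define NF_STOP 5	/* Deprecated, for userspace nf_queue compatibility. */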
The next question is what these hook functions actually are, and where they get registered.
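Before digging into how iptables wires up its tables, here is a minimal sketch of how any kernel module hangs a function on one of the five hook points — the module and function names are made up, but nf_register_net_hook is the real registration path that ends up in net->nf.hooks_ipv4:
//a minimal out-of-tree sketch (hypothetical module, ~5.4 APIs)
#include <linux/module.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <net/net_namespace.h>

static unsigned int my_hookfn(void *priv, struct sk_buff *skb,
			      const struct nf_hook_state *state)
{
	/* Inspect or rewrite skb here; the return value is the verdict. */
	return NF_ACCEPT;
}

static struct nf_hook_ops my_ops = {
	.hook     = my_hookfn,
	.pf       = NFPROTO_IPV4,
	.hooknum  = NF_INET_PRE_ROUTING,
	.priority = NF_IP_PRI_FIRST,	/* smallest value: runs first */
};

static int __init my_init(void)
{
	/* Inserts my_ops into init_net's hooks_ipv4[NF_INET_PRE_ROUTING]. */
	return nf_register_net_hook(&init_net, &my_ops);
}

static void __exit my_exit(void)
{
	nf_unregister_net_hook(&init_net, &my_ops);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");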
The so-called "four tables" are four sets of hook functions, each serving a different purpose, that iptables defines and distributes across the five hook points:
- nat table: performs network address translation; it can rewrite the source/destination IP address or port of a packet. It has chains on PREROUTING, INPUT, OUTPUT, and POSTROUTING.
- filter table: does packet filtering, i.e. decides whether to drop a packet. It has chains on INPUT, FORWARD, and OUTPUT.
- raw table: decides whether a packet should be connection-tracked at all. It has chains on PREROUTING and OUTPUT.
- mangle table: marks and tweaks packets. It has chains on all five points: PREROUTING, INPUT, FORWARD, OUTPUT, and POSTROUTING.
Let's start with the nat table:
//kernel/msm-5.4/net/ipv4/netfilter/iptable_nat.c
static const struct nf_hook_ops nf_nat_ipv4_ops[] = {
{
.hook = iptable_nat_do_chain,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP_PRI_NAT_DST,
},
{
.hook = iptable_nat_do_chain,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP_PRI_NAT_SRC,
},
{
.hook = iptable_nat_do_chain,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_NAT_DST,
},
{
.hook = iptable_nat_do_chain,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_NAT_SRC,
},
};
Here hook is the hook function; pf is the protocol family (there is an IPv6 counterpart alongside IPv4); hooknum says which hook point — i.e. which chain — the hook is inserted into; and priority is the hook's priority: the smaller the number, the higher the priority and the earlier it runs.
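For reference, the IPv4 priority levels are defined as follows (an abridged listing; values as of 5.4-era kernels):
//kernel/msm-5.4/include/uapi/linux/netfilter_ipv4.h
enum nf_ip_hook_priorities {
	NF_IP_PRI_FIRST = INT_MIN,
	NF_IP_PRI_RAW_BEFORE_DEFRAG = -450,
	NF_IP_PRI_CONNTRACK_DEFRAG = -400,
	NF_IP_PRI_RAW = -300,
	NF_IP_PRI_SELINUX_FIRST = -225,
	NF_IP_PRI_CONNTRACK = -200,
	NF_IP_PRI_MANGLE = -150,
	NF_IP_PRI_NAT_DST = -100,
	NF_IP_PRI_FILTER = 0,
	NF_IP_PRI_SECURITY = 50,
	NF_IP_PRI_NAT_SRC = 100,
	NF_IP_PRI_SELINUX_LAST = 225,
	NF_IP_PRI_CONNTRACK_HELPER = 300,
	NF_IP_PRI_CONNTRACK_CONFIRM = INT_MAX,
	NF_IP_PRI_LAST = INT_MAX,
};
Note how the ordering encodes the pipeline: raw runs before conntrack (which is how it can opt packets out of tracking), destination NAT runs before filter, and source NAT after it.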
//kernel/msm-5.4/net/ipv4/netfilter/iptable_nat.c
static int ipt_nat_register_lookups(struct net *net)
{
int i, ret;
for (i = 0; i < ARRAY_SIZE(nf_nat_ipv4_ops); i++) {
ret = nf_nat_ipv4_register_fn(net, &nf_nat_ipv4_ops[i]);
if (ret) {
while (i)
nf_nat_ipv4_unregister_fn(net, &nf_nat_ipv4_ops[--i]);
return ret;
}
}
return 0;
}
ipt_nat_register_lookups then walks nf_nat_ipv4_ops and registers each member into net one by one. Reading further into the source, however, it turns out the nf_nat_ipv4_ops members listed above are not the hook functions that actually get installed.
//kernel/msm-5.4/net/netfilter/nf_nat_proto.c
static const struct nf_hook_ops nf_nat_ipv4_ops[] = {
/* Before packet filtering, change destination */
{
.hook = nf_nat_ipv4_pre_routing,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP_PRI_NAT_DST,
},
/* After packet filtering, change source */
{
.hook = nf_nat_ipv4_out,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP_PRI_NAT_SRC,
},
/* Before packet filtering, change destination */
{
.hook = nf_nat_ipv4_local_fn,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_NAT_DST,
},
/* After packet filtering, change source */
{
.hook = nf_nat_ipv4_local_in,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_NAT_SRC,
},
};
This array holds the hook functions that really are registered at the corresponding hook points in the kernel.
//kernel/msm-5.4/net/netfilter/nf_nat_proto.c
int nf_nat_ipv4_register_fn(struct net *net, const struct nf_hook_ops *ops)
{
return nf_nat_register_fn(net, ops->pf, ops, nf_nat_ipv4_ops,
ARRAY_SIZE(nf_nat_ipv4_ops));
}
//kernel/msm-5.4/net/netfilter/nf_nat_core.c
int nf_nat_register_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops,
const struct nf_hook_ops *orig_nat_ops, unsigned int ops_count)
{
struct nat_net *nat_net = net_generic(net, nat_net_id);
struct nf_nat_hooks_net *nat_proto_net;
struct nf_nat_lookup_hook_priv *priv;
unsigned int hooknum = ops->hooknum;
struct nf_hook_ops *nat_ops;
int i, ret;
nat_proto_net = &nat_net->nat_proto_net[pf];
for (i = 0; i < ops_count; i++) {
if (orig_nat_ops[i].hooknum == hooknum) {
hooknum = i;
break;
}
}
mutex_lock(&nf_nat_proto_mutex);
if (!nat_proto_net->nat_hook_ops) {
nat_ops = kmemdup(orig_nat_ops, sizeof(*orig_nat_ops) * ops_count, GFP_KERNEL);
for (i = 0; i < ops_count; i++) {
priv = kzalloc(sizeof(*priv), GFP_KERNEL);
if (priv) {
nat_ops[i].priv = priv;
continue;
}
}
ret = nf_register_net_hooks(net, nat_ops, ops_count);
nat_proto_net->nat_hook_ops = nat_ops;
}
nat_ops = nat_proto_net->nat_hook_ops;
priv = nat_ops[hooknum].priv;
ret = nf_hook_entries_insert_raw(&priv->entries, ops);
if (ret == 0)
nat_proto_net->users++;
mutex_unlock(&nf_nat_proto_mutex);
return ret;
}
nf_nat_register_fn walks the given hook array orig_nat_ops, finds the entry whose hook number matches hooknum, and stores that entry's index back into hooknum. It then checks whether nat_proto_net already has registered hooks; if not, it:
- duplicates orig_nat_ops and allocates an nf_nat_lookup_hook_priv structure as each hook's private data;
- registers the duplicated nat_ops into the kernel with nf_register_net_hooks.
Finally, nf_hook_entries_insert_raw inserts the given hook ops into that private data's priv->entries.
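Putting it together, registration is two-level. A rough sketch of the resulting dispatch for the PRE_ROUTING case (an illustration, not kernel code):
/*
 * net->nf.hooks_ipv4[NF_INET_PRE_ROUTING]
 *     -> nf_nat_ipv4_pre_routing       (from nf_nat_proto.c, installed once
 *          |                            via nf_register_net_hooks)
 *          +-> priv->entries           (a second-level nf_hook_entries)
 *                -> iptable_nat_do_chain  (from iptable_nat.c, inserted via
 *                                          nf_hook_entries_insert_raw)
 */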
So how does a hook like nf_nat_ipv4_pre_routing actually execute? It, too, ends up calling the functions stored in its priv->entries. Taking nf_nat_ipv4_pre_routing as an example, the work lands in nf_nat_inet_fn:
//kernel/msm-5.4/net/netfilter/nf_nat_core.c
unsigned int
nf_nat_inet_fn(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
struct nf_conn *ct;
enum ip_conntrack_info ctinfo;
struct nf_conn_nat *nat;
/* maniptype == SRC for postrouting. */
enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook);
ct = nf_ct_get(skb, &ctinfo);
/* Can't track? It's not due to stress, or conntrack would
* have dropped it. Hence it's the user's responsibilty to
* packet filter it out, or implement conntrack/NAT for that
* protocol. 8) --RR
*/
if (!ct)
return NF_ACCEPT;
nat = nfct_nat(ct);
switch (ctinfo) {
case IP_CT_RELATED:
case IP_CT_RELATED_REPLY:
/* Only ICMPs can be IP_CT_IS_REPLY. Fallthrough */
case IP_CT_NEW:
/* Seen it before? This can happen for loopback, retrans,
* or local packets.
*/
if (!nf_nat_initialized(ct, maniptype)) {
struct nf_nat_lookup_hook_priv *lpriv = priv;
struct nf_hook_entries *e = rcu_dereference(lpriv->entries);
unsigned int ret;
int i;
if (!e)
goto null_bind;
for (i = 0; i < e->num_hook_entries; i++) {
ret = e->hooks[i].hook(e->hooks[i].priv, skb,
state);
if (ret != NF_ACCEPT)
return ret;
if (nf_nat_initialized(ct, maniptype))
goto do_nat;
}
null_bind:
ret = nf_nat_alloc_null_binding(ct, state->hook);
if (ret != NF_ACCEPT)
return ret;
} else {
pr_debug("Already setup manip %s for ct %p (status bits 0x%lx)\n",
maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
ct, ct->status);
if (nf_nat_oif_changed(state->hook, ctinfo, nat,
state->out))
goto oif_changed;
}
break;
default:
/* ESTABLISHED */
WARN_ON(ctinfo != IP_CT_ESTABLISHED &&
ctinfo != IP_CT_ESTABLISHED_REPLY);
if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out))
goto oif_changed;
}
do_nat:
return nf_nat_packet(ct, ctinfo, state->hook, skb);
oif_changed:
nf_ct_kill_acct(ct, ctinfo, skb);
return NF_DROP;
}
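The HOOK2MANIP macro used above maps the hook point to the NAT manipulation type — source NAT at POST_ROUTING/LOCAL_IN, destination NAT at PRE_ROUTING/LOCAL_OUT. Its definition (reproduced from the NAT headers; the exact file may vary slightly by kernel version):
/* Evaluates to 0 (NF_NAT_MANIP_SRC) for POST_ROUTING and LOCAL_IN,
 * and to 1 (NF_NAT_MANIP_DST) for PRE_ROUTING and LOCAL_OUT. */
#define HOOK2MANIP(hooknum) ((hooknum) != NF_INET_POST_ROUTING && \
			     (hooknum) != NF_INET_LOCAL_IN)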
The other three tables — filter, raw, and mangle — work differently from the nat table. The nat table's hook array is hard-coded, whereas the other three generate their hook arrays dynamically from the parameters of their xt_table, via xt_hook_ops_alloc:
//kernel/msm-5.4/net/netfilter/x_tables.c
struct nf_hook_ops *
xt_hook_ops_alloc(const struct xt_table *table, nf_hookfn *fn)
{
unsigned int hook_mask = table->valid_hooks;
uint8_t i, num_hooks = hweight32(hook_mask);
uint8_t hooknum;
struct nf_hook_ops *ops;
if (!num_hooks)
return ERR_PTR(-EINVAL);
ops = kcalloc(num_hooks, sizeof(*ops), GFP_KERNEL);
if (ops == NULL)
return ERR_PTR(-ENOMEM);
for (i = 0, hooknum = 0; i < num_hooks && hook_mask != 0;
hook_mask >>= 1, ++hooknum) {
if (!(hook_mask & 1))
continue;
ops[i].hook = fn;
ops[i].pf = table->af;
ops[i].hooknum = hooknum;
ops[i].priority = table->priority;
++i;
}
return ops;
}
xt_hook_ops_alloc creates a hook_op for every hook point the table marks as valid, copying the xt_table's protocol family (af) and priority into each one; the hook function itself is the same fn for all of them.
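For example, the filter table's xt_table (reproduced from iptable_filter.c; the field layout may differ slightly across kernel versions) declares three valid hook points, the filter priority, and IPv4 as its family — exactly the inputs xt_hook_ops_alloc consumes:
//kernel/msm-5.4/net/ipv4/netfilter/iptable_filter.c
#define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \
			    (1 << NF_INET_FORWARD) | \
			    (1 << NF_INET_LOCAL_OUT))

static const struct xt_table packet_filter = {
	.name		= "filter",
	.valid_hooks	= FILTER_VALID_HOOKS,
	.me		= THIS_MODULE,
	.af		= NFPROTO_IPV4,
	.priority	= NF_IP_PRI_FILTER,
	.table_init	= iptable_filter_table_init,
};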
Once the hook_ops have been created, they are registered into the kernel together with the table itself (the snippet below shows the nat table's registration in iptable_nat.c):
//kernel/msm-5.4/net/ipv4/netfilter/iptable_nat.c
repl = ipt_alloc_initial_table(&nf_nat_ipv4_table);
ret = ipt_register_table(net, &nf_nat_ipv4_table, repl,
NULL, &net->ipv4.nat_table);
//kernel/msm-5.4/net/ipv4/netfilter/ip_tables.c
int ipt_register_table(struct net *net, const struct xt_table *table,
const struct ipt_replace *repl,
const struct nf_hook_ops *ops, struct xt_table **res)
{
int ret;
struct xt_table_info *newinfo;
struct xt_table_info bootstrap = {0};
void *loc_cpu_entry;
struct xt_table *new_table;
newinfo = xt_alloc_table_info(repl->size);
if (!newinfo)
return -ENOMEM;
loc_cpu_entry = newinfo->entries;
memcpy(loc_cpu_entry, repl->entries, repl->size);
ret = translate_table(net, newinfo, loc_cpu_entry, repl);
if (ret != 0)
goto out_free;
new_table = xt_register_table(net, table, &bootstrap, newinfo);
//...
return ret;
out_free:
xt_free_table_info(newinfo);
return ret;
}
Whichever table it is, the registered hook function ultimately calls ipt_do_table, which walks the table's rules and returns a verdict:
//kernel/msm-5.4/net/ipv4/netfilter/ip_tables.c
/* Returns one of the generic firewall policies, like NF_ACCEPT. */
unsigned int
ipt_do_table(struct sk_buff *skb,
const struct nf_hook_state *state,
struct xt_table *table)
{
unsigned int hook = state->hook;
static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
const struct iphdr *ip;
/* Initializing verdict to NF_DROP keeps gcc happy. */
unsigned int verdict = NF_DROP;
const char *indev, *outdev;
const void *table_base;
struct ipt_entry *e, **jumpstack;
unsigned int stackidx, cpu;
const struct xt_table_info *private;
struct xt_action_param acpar;
unsigned int addend;
/* Initialization */
stackidx = 0;
ip = ip_hdr(skb);
indev = state->in ? state->in->name : nulldevname;
outdev = state->out ? state->out->name : nulldevname;
/* We handle fragments by dealing with the first fragment as
* if it was a normal packet. All other fragments are treated
* normally, except that they will NEVER match rules that ask
* things we don't know, ie. tcp syn flag or ports). If the
* rule is also a fragment-specific rule, non-fragments won't
* match it. */
acpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET;
acpar.thoff = ip_hdrlen(skb);
acpar.hotdrop = false;
acpar.state = state;
WARN_ON(!(table->valid_hooks & (1 << hook)));
local_bh_disable();
addend = xt_write_recseq_begin();
private = READ_ONCE(table->private); /* Address dependency. */
cpu = smp_processor_id();
table_base = private->entries;
jumpstack = (struct ipt_entry **)private->jumpstack[cpu];
/* Switch to alternate jumpstack if we're being invoked via TEE.
* TEE issues XT_CONTINUE verdict on original skb so we must not
* clobber the jumpstack.
*
* For recursion via REJECT or SYNPROXY the stack will be clobbered
* but it is no problem since absolute verdict is issued by these.
*/
if (static_key_false(&xt_tee_enabled))
jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated);
e = get_entry(table_base, private->hook_entry[hook]);
do {
const struct xt_entry_target *t;
const struct xt_entry_match *ematch;
struct xt_counters *counter;
WARN_ON(!e);
if (!ip_packet_match(ip, indev, outdev,
&e->ip, acpar.fragoff)) {
no_match:
e = ipt_next_entry(e);
continue;
}
xt_ematch_foreach(ematch, e) {
acpar.match = ematch->u.kernel.match;
acpar.matchinfo = ematch->data;
if (!acpar.match->match(skb, &acpar))
goto no_match;
}
counter = xt_get_this_cpu_counter(&e->counters);
ADD_COUNTER(*counter, skb->len, 1);
t = ipt_get_target_c(e);
WARN_ON(!t->u.kernel.target);
#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
/* The packet is traced: log it */
if (unlikely(skb->nf_trace))
trace_packet(state->net, skb, hook, state->in,
state->out, table->name, private, e);
#endif
/* Standard target? */
if (!t->u.kernel.target->target) {
int v;
v = ((struct xt_standard_target *)t)->verdict;
if (v < 0) {
/* Pop from stack? */
if (v != XT_RETURN) {
verdict = (unsigned int)(-v) - 1;
break;
}
if (stackidx == 0) {
e = get_entry(table_base,
private->underflow[hook]);
} else {
e = jumpstack[--stackidx];
e = ipt_next_entry(e);
}
continue;
}
if (table_base + v != ipt_next_entry(e) &&
!(e->ip.flags & IPT_F_GOTO)) {
if (unlikely(stackidx >= private->stacksize)) {
verdict = NF_DROP;
break;
}
jumpstack[stackidx++] = e;
}
e = get_entry(table_base, v);
continue;
}
acpar.target = t->u.kernel.target;
acpar.targinfo = t->data;
verdict = t->u.kernel.target->target(skb, &acpar);
if (verdict == XT_CONTINUE) {
/* Target might have changed stuff. */
ip = ip_hdr(skb);
e = ipt_next_entry(e);
} else {
/* Verdict */
break;
}
} while (!acpar.hotdrop);
xt_write_recseq_end(addend);
local_bh_enable();
if (acpar.hotdrop)
return NF_DROP;
else return verdict;
}
Walking through ipt_do_table:
- First, take in the packet skb, the hook state, and the rule table passed as parameters.
- Sanity-check that the hook number is among the table's valid hooks.
- Initialize per-packet data: the IP header pointer and the input/output device names.
- Fill in the acpar structure members used by the match functions later on.
- Disable local bottom halves.
- Begin a write-sequence section so that reads of the table's private data stay consistent.
- Read the table's private data.
- Get the current CPU's ID.
- From the private data, obtain the rule blob's base address, the per-CPU jump stack, and the entry offset for this hook.
- If invoked via TEE (the xt_TEE target, which duplicates packets), switch to the alternate jump stack so the duplicate's traversal does not clobber the original's.
- Fetch the first rule entry e for this hook.
- Loop over the rule entries, matching and processing each one:
- Check the IP-level match (addresses, interfaces, protocol, fragment offset); on a mismatch, skip to the next rule.
- Walk the rule's extension matches, calling each match function; any failure also skips to the next rule.
- On a full match, update the rule's packet and byte counters.
- Fetch the rule's target and check whether it is a standard target.
- A standard target carries a plain verdict: a negative value is either an absolute verdict (end the loop) or XT_RETURN (pop the jump stack); a non-negative value is a jump, so the current entry is pushed onto the jump stack before continuing at the jump destination.
- A non-standard target has its target() callback invoked; XT_CONTINUE moves on to the next rule, while any other verdict ends the loop.
- Finally, return NF_DROP if a match flagged hotdrop, otherwise return the verdict.
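One subtle point in the standard-target branch above is the verdict encoding: absolute verdicts are stored as negative numbers, v = -verdict - 1, while non-negative values are byte offsets into the table (i.e. jumps). A tiny userspace sketch of the decoding — the two NF_* constants and XT_RETURN are copied from the uapi headers, the rest is purely illustrative:
#include <stdio.h>

#define NF_ACCEPT 1
#define NF_REPEAT 4
#define XT_RETURN (-NF_REPEAT - 1)	/* -5: pop back to the calling chain */

int main(void)
{
	int v = -NF_ACCEPT - 1;	/* how "-j ACCEPT" is stored: -2 */

	if (v < 0) {
		if (v == XT_RETURN)
			printf("return from a user-defined chain\n");
		else	/* the same recovery ipt_do_table performs */
			printf("absolute verdict: %u\n", (unsigned int)(-v) - 1);
	} else {
		printf("jump to byte offset %d in the table\n", v);
	}
	return 0;
}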