目录
一、概述
二、netfilter
2.1 结构
第一个层次是table,共有四个,分别是filter,nat,mangle,raw
每个table对应一种协议,每种协议的HOOK点可能不同,如IPV4的HOOK点有五个,如下:
其在协议栈的位置如下:
每个hook点由一条chain构成,chain是规则的集合,报文进入HOOK点会按照顺序匹配,如果命中规则就跳出,没有命中执行默认规则,如下图:
2.2 数据抽象
2.2.1 table的表示
/* One netfilter table (for example "filter") belonging to one address family. */
struct xt_table {
/* Links the table into the per-namespace, per-family table list. */
struct list_head list;
/* Bitmask of the hooks this table is attached to (bit n = hook number n). */
unsigned int valid_hooks;
/* Rule data (xt_table_info) currently bound to this table. */
struct xt_table_info *private;
/* Owning module (set to THIS_MODULE by the table definition). */
struct module *me;
/* Address family, one of the NFPROTO_* values. */
u_int8_t af;
/* Priority value supplied by the table definition (e.g. NF_IP_PRI_FILTER). */
int priority;
/* Callback taking the network namespace; used to set the table up for it. */
int (*table_init)(struct net *net);
/* Table name, at most XT_TABLE_MAXNAMELEN bytes. */
const char name[XT_TABLE_MAXNAMELEN];
};
- name
- valid_hooks HOOK type,掩码
- af address family
协议类型:
/* Netfilter protocol-family identifiers (the values stored in xt_table.af). */
enum {
NFPROTO_UNSPEC = 0,
NFPROTO_INET = 1,
NFPROTO_IPV4 = 2,
NFPROTO_ARP = 3,
NFPROTO_NETDEV = 5,
NFPROTO_BRIDGE = 7,
NFPROTO_IPV6 = 10,
NFPROTO_DECNET = 12,
/* One past the highest family value; used to size per-family arrays. */
NFPROTO_NUMPROTO,
};
hook类型:
/* Hook numbers for the inet families; also the bit positions in valid_hooks. */
enum nf_inet_hooks {
NF_INET_PRE_ROUTING,
NF_INET_LOCAL_IN,
NF_INET_FORWARD,
NF_INET_LOCAL_OUT,
NF_INET_POST_ROUTING,
/* Number of hooks; used to size per-hook arrays. */
NF_INET_NUMHOOKS
};
2.2.2 nf_hook_ops
/*
 * Signature of a hook handler: receives the registered private pointer,
 * the packet and the hook state, and returns a verdict.
 */
typedef unsigned int nf_hookfn(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state);
/* Describes one hook handler to register for a (protocol family, hook) pair. */
struct nf_hook_ops {
/* User fills in from here down. */
/* Handler function invoked at the hook. */
nf_hookfn *hook;
/* Device associated with the hook, if any. */
struct net_device *dev;
/* Opaque argument passed back to the handler as its first parameter. */
void *priv;
/* Protocol family (NFPROTO_*). */
u_int8_t pf;
bool nat_hook;
/* Hook number (e.g. an nf_inet_hooks value). */
unsigned int hooknum;
/* Hooks are ordered in ascending priority. */
int priority;
};
hook入口处理函数,用户自行注册,通过(协议,HOOK类型) 确定hook处理函数hook和参数priv
- pf 协议类型
- hooknum hook类型
- hook hook处理函数
- priv hook处理函数参数
2.3 接口
注册table
- int ipt_register_table(struct net *net, const struct xt_table *table, const struct ipt_replace *repl,
const struct nf_hook_ops *ops, struct xt_table **res)
注册target
- int xt_register_target(struct xt_target *target);
2.4 实现
2.4.1 初始化
xt_init 初始化管理target和match的结构:
[net/netfilter/x_tables.c]
/*
 * Allocate and initialise the per-protocol-family bookkeeping array `xt`:
 * every NFPROTO_* slot gets a mutex plus empty target and match lists.
 *
 * Returns 0 on success, or -ENOMEM if the array cannot be allocated.
 * NOTE(review): abridged excerpt; any further set-up performed by the real
 * kernel function after this loop is not shown here.
 */
static int __init xt_init(void)
{
	unsigned int i;

	xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL);
	if (!xt)
		return -ENOMEM;

	/* kmalloc() does not zero memory, so initialise every slot explicitly. */
	for (i = 0; i < NFPROTO_NUMPROTO; i++) {
		mutex_init(&xt[i].mutex);
		INIT_LIST_HEAD(&xt[i].target);
		INIT_LIST_HEAD(&xt[i].match);
	}

	return 0;
}
xt_target结构是通过xt_register_target接口进行注册的,下图可表示结构间的关系:
2.4.2 table的注册
table通过ipt_register_table向系统进行注册,这里以filter table的初始化过程为例:
filter table结构如下,其包括三个HOOK:LOCAL_IN,FORWARD,LOCAL_OUT:
[net/ipv4/netfilter/iptable_filter.c]
/* Hook bitmask of the filter table: LOCAL_IN, FORWARD and LOCAL_OUT. */
#define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \
(1 << NF_INET_FORWARD) | \
(1 << NF_INET_LOCAL_OUT))
/* Static description of the IPv4 "filter" table passed to ipt_register_table(). */
static const struct xt_table packet_filter = {
.name = "filter",
.valid_hooks = FILTER_VALID_HOOKS,
.me = THIS_MODULE,
.af = NFPROTO_IPV4,
.priority = NF_IP_PRI_FILTER,
/* Per-namespace set-up: builds the initial rules and registers the table. */
.table_init = iptable_filter_table_init,
};
[net/ipv4/netfilter/iptable_filter.c]
/*
 * Module init for the IPv4 "filter" table.
 *
 * Allocates the nf_hook_ops for packet_filter (all hooks share the handler
 * iptable_filter_hook) and then registers the per-namespace operations.
 *
 * Returns the error encoded in the xt_hook_ops_alloc() result if allocation
 * failed, otherwise the value returned by register_pernet_subsys().
 */
static int __init iptable_filter_init(void)
{
	int ret;

	filter_ops = xt_hook_ops_alloc(&packet_filter, iptable_filter_hook);
	if (IS_ERR(filter_ops))
		return PTR_ERR(filter_ops);

	ret = register_pernet_subsys(&iptable_filter_net_ops);
	if (ret < 0)
		kfree(filter_ops);	/* registration failed: release the hook ops */

	/* Unconditional: reached on both the success and the failure path. */
	return ret;
}
先看register_pernet_subsys注册的初始化流程iptable_filter_net_init->iptable_filter_table_init
/*
 * Create the filter table for network namespace @net.
 *
 * Does nothing if the namespace already has a filter table. Otherwise it
 * builds the initial replace blob for packet_filter, sets the policy of the
 * FORWARD entry from the `forward` setting, and registers the table together
 * with filter_ops.
 *
 * Returns 0 if the table already exists, -ENOMEM if the initial blob cannot
 * be allocated, otherwise the result of ipt_register_table().
 */
static int __net_init iptable_filter_table_init(struct net *net)
{
	struct ipt_standard *hook_rules;
	struct ipt_replace *repl;
	int err;

	if (net->ipv4.iptable_filter)
		return 0;

	repl = ipt_alloc_initial_table(&packet_filter);
	if (!repl)
		return -ENOMEM;

	/* Entry 1 is the FORWARD hook */
	hook_rules = (struct ipt_standard *)repl->entries;
	if (forward)
		hook_rules[1].target.verdict = -NF_ACCEPT - 1;
	else
		hook_rules[1].target.verdict = -NF_DROP - 1;

	err = ipt_register_table(net, &packet_filter, repl, filter_ops,
				 &net->ipv4.iptable_filter);

	/* The blob is freed here regardless of the registration result. */
	kfree(repl);
	return err;
}
先来看table注册ipt_register_table参数的构造过程:
1)ipt_alloc_initial_table分配ipt_replace结构准备注册,这个结构如下图所示,这里先概括的给出一张结构图:
ipt_standard对应一个hook点的描述,ipt_entry使用target_offset和next_offset实现一个基于数组的链表结构:
2)xt_hook_ops_alloc为每个hook分配nf_hook_ops,每个HOOK点对应一个nf_hook_ops,这里packet_filter在IPV4协议下的3个hook注册的hook处理函数都是同一个iptable_filter_hook.此时分配的结构如下:
3)table注册的参数准备好之后,开始注册,接下来分析注册过程:
[net/ipv4/netfilter/ip_tables.c]
/*
 * Register an IPv4 table in namespace @net: copy the initial entries from
 * @repl into a freshly allocated xt_table_info, run translate_table() on
 * them, register the table, publish it through @res and finally register
 * the hook ops in @ops.
 *
 * NOTE(review): abridged excerpt; several local declarations, all error
 * handling and the return statement are omitted.
 */
int ipt_register_table(struct net *net, const struct xt_table *table,
const struct ipt_replace *repl,
const struct nf_hook_ops *ops, struct xt_table **res)
{
struct xt_table_info *newinfo;
/* Allocate table info sized for the entries in @repl. */
newinfo = xt_alloc_table_info(repl->size);
loc_cpu_entry = newinfo->entries;
/* Copy the initial entries into the new table info. */
memcpy(loc_cpu_entry, repl->entries, repl->size);
/* Check each entry and resolve its target (see translate_table()). */
ret = translate_table(net, newinfo, loc_cpu_entry, repl);
/* Add the table to the namespace's table list with newinfo bound to it. */
new_table = xt_register_table(net, table, &bootstrap, newinfo);
/* Publish the registered table to the caller's slot. */
WRITE_ONCE(*res, new_table);
/* Register as many hook ops as there are bits set in valid_hooks. */
ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks));
}
3-1 table注册首先会分配一个xt_table_info,将ipt_replace对应的ipt_standard结构copy到其中
3-2 translate_table首先做参数检查,它会进一步构造xt_table_info参数:
/*
 * Walk every entry in the blob at @entry0 and run find_check_entry() on it,
 * stopping at the first entry that fails.
 *
 * NOTE(review): abridged excerpt; local declarations (i, iter, ret,
 * alloc_state), the earlier checks and the return statement are omitted.
 */
static int
translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
const struct ipt_replace *repl) {
/* Counts the entries that passed find_check_entry(). */
i = 0;
xt_entry_foreach(iter, entry0, newinfo->size) {
ret = find_check_entry(iter, net, repl->name, repl->size,
&alloc_state);
if (ret != 0)
break;
++i;
}
}
看下find_check_entry
/*
 * Resolve the target of rule @e: look up the registered xt_target by
 * (NFPROTO_IPV4, name, revision), attach it to the rule's xt_entry_target
 * and then run check_target() on the rule.
 *
 * NOTE(review): abridged excerpt; match handling, error handling, the
 * declaration of ret and the return statement are omitted.
 */
static int
find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
unsigned int size,
struct xt_percpu_counter_alloc_state *alloc_state)
{
struct xt_entry_target *t;
struct xt_target *target;
/* Locate the target record stored inside the rule. */
t = ipt_get_target(e);
target = xt_request_find_target(NFPROTO_IPV4, t->u.user.name,
t->u.user.revision);
/* From here on the entry refers to the kernel-side target object. */
t->u.kernel.target = target;
ret = check_target(e, net, name);
}
上面的代码片段显示:find_check_entry按(协议, name, revision)查找到匹配的xt_target,并将其挂到xt_entry_target的u.kernel.target上,而这个xt_target就是通过xt_register_target注册的。也就是说,对于一个table来说,它的target是在table注册时按(协议, name, revision)匹配得到的:
-
如果target 值为空,表示是标准target,根据verdict值来处理报文。
-
如果target不为空,表示不是标准target,就使用target的target函数返回值来处理报文
这一步的图示:
更详细的图【1】:
其中xt_entry_match的构造和xt_entry_target结构相同,它代表扩展match
ipt_entry实际上对应防火墙的rule,用户将新的rule通过do_replace传递下来,替换table中的info。注意,Netfilter把rule的排序交由配置工具来管理和完成:每次使用iptables下发一条rule时,iptables首先要从内核中把对应表的所有rule拷贝一份到用户空间,再把新的rule插入,排序后重新下发到内核中,替换表中旧的rule。
3-3 接下来xt_register_table实际上是将table注册到系统并管理起来,相关的结构netns_xt:
/*
 * Add a table to the namespace's per-family table list, refusing a second
 * table with the same name, and bind @newinfo to it.
 *
 * NOTE(review): abridged excerpt; `table`, `t`, `private` and `ret` are
 * used without visible declarations (presumably `table` is a writable copy
 * of @input_table; confirm against the full source), and locking, the
 * `unlock` label and the return statement are omitted.
 */
struct xt_table *xt_register_table(struct net *net,
const struct xt_table *input_table,
struct xt_table_info *bootstrap,
struct xt_table_info *newinfo) {
/* Reject a duplicate: a table with this name already exists for the family. */
list_for_each_entry(t, &net->xt.tables[table->af], list) {
if (strcmp(t->name, table->name) == 0) {
ret = -EEXIST;
goto unlock;
}
}
/* Start from the bootstrap info, then swap in the real one. */
table->private = bootstrap;
if (!xt_replace_table(table, 0, newinfo, &ret)) // bind newinfo to the table
goto unlock;
private = table->private;
/* Record the entry count at registration time as the initial count. */
private->initial_entries = private->number;
/* Make the table visible in the namespace's list for this family. */
list_add(&table->list, &net->xt.tables[table->af]);
}
这里给出一个图描述上述三个步骤:
此时table的注册完成。
3-4 最后通过nf_register_net_hooks将filter_ops注册
/*
 * Register hook @reg for protocol family @pf in namespace @net.
 *
 * NOTE(review): abridged excerpt; error handling, the store of new_hooks
 * back into the slot and the return statement are omitted.
 */
static int __nf_register_net_hook(struct net *net, int pf,
const struct nf_hook_ops *reg)
{
struct nf_hook_entries *p, *new_hooks;
struct nf_hook_entries __rcu **pp;
/* Locate the slot for this (family, hook number, device). */
pp = nf_hook_entry_head(net, pf, reg->hooknum, reg->dev);
/* Updates to the hook entries are serialised by nf_hook_mutex. */
mutex_lock(&nf_hook_mutex);
p = nf_entry_dereference(*pp);
/* Presumably builds a new entries set containing the old hooks plus @reg; helper not shown. */
new_hooks = nf_hook_entries_grow(p, reg);
mutex_unlock(&nf_hook_mutex);
hooks_validate(new_hooks);
}
nf对应下面的结构,其实就是一个以HOOK类型(hooknum)为下标的hook数组
/*
 * Per-namespace netfilter hook state (abridged to the IPv4 member):
 * one RCU-annotated nf_hook_entries pointer per hook number.
 */
struct netns_nf {
	struct nf_hook_entries __rcu *hooks_ipv4[NF_INET_NUMHOOKS];
};
综上,可以看出,table和钩子函数的注册管理上集中在下面两个结构上:
/* Network namespace, abridged to the two netfilter-related members. */
struct net {
	struct netns_nf nf;	/* registered hook entries, e.g. nf.hooks_ipv4[hooknum] */
	struct netns_xt xt;	/* registered tables, e.g. xt.tables[af] */
};
- net->xt.tables[table->af]
- net->nf.hooks_ipv4 + hooknum
至此table和nf_hook_ops的注册管理就完成了,示意如下:
2.4.3 HOOK入口点分析
linux通过nf_hook指定协议和HOOK类型进入HOOK点:
/*
 * Example call site: enter the IPv4 LOCAL_OUT hook with no input device,
 * the route's device as output device and dst_output as the okfn callback.
 * NOTE(review): the enclosing function is not shown in this excerpt.
 */
return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
net, sk, skb, NULL, skb_dst(skb)->dev,
dst_output);
/*
 * Enter netfilter hook @hook of protocol family @pf for packet @skb.
 *
 * Selects the namespace's hook entries for (@pf, @hook) under the RCU read
 * lock and, if any are registered, initialises an nf_hook_state and runs
 * them through nf_hook_slow().
 *
 * Returns 1 when no hook entries are registered for the pair (the initial
 * value of ret); otherwise the value returned by nf_hook_slow().
 */
static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
			  struct sock *sk, struct sk_buff *skb,
			  struct net_device *indev, struct net_device *outdev,
			  int (*okfn)(struct net *, struct sock *, struct sk_buff *))
{
	struct nf_hook_entries *hook_head = NULL;
	int ret = 1;

	rcu_read_lock();
	switch (pf) {
	case NFPROTO_IPV4:
		hook_head = rcu_dereference(net->nf.hooks_ipv4[hook]);
		break;
	case NFPROTO_IPV6:
		hook_head = rcu_dereference(net->nf.hooks_ipv6[hook]);
		break;
	case NFPROTO_ARP:
#ifdef CONFIG_NETFILTER_FAMILY_ARP
		hook_head = rcu_dereference(net->nf.hooks_arp[hook]);
#endif
		break;
	default:
		/* Unknown family: warn once and leave hook_head NULL. */
		WARN_ON_ONCE(1);
		break;
	}

	if (hook_head) {
		struct nf_hook_state state;

		nf_hook_state_init(&state, hook, pf, indev, outdev,
				   sk, net, okfn);

		/* Start at index 0, i.e. with the first registered entry. */
		ret = nf_hook_slow(skb, &state, hook_head, 0);
	}
	/* Pairs with rcu_read_lock() above; hook_head is not used past here. */
	rcu_read_unlock();

	return ret;
}
- 函数通过协议类型和HOOK类型找到HOOK管理节点(如net->nf.hooks_ipv4[hook]),这个在前面已经说明过了
/*
 * Run the hook entries in @e on @skb, starting at index @s.
 *
 * Each entry's handler returns a verdict; its NF_VERDICT_MASK bits decide
 * what happens next:
 *   NF_ACCEPT - move on to the next entry;
 *   NF_DROP   - free the skb and return NF_DROP_GETERR(verdict), or -EPERM
 *               when that value is 0;
 *   NF_QUEUE  - hand the packet to nf_queue(); a result of 1 moves on to the
 *               next entry, any other result is returned;
 *   other     - return 0.
 *
 * Returns 1 when every entry from @s onwards let the packet continue.
 */
int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
		 const struct nf_hook_entries *e, unsigned int s)
{
	for (; s < e->num_hook_entries; s++) {
		unsigned int verdict = nf_hook_entry_hookfn(&e->hooks[s], skb, state);
		unsigned int action = verdict & NF_VERDICT_MASK;
		int ret;

		if (action == NF_ACCEPT)
			continue;

		if (action == NF_DROP) {
			kfree_skb(skb);
			ret = NF_DROP_GETERR(verdict);
			return ret == 0 ? -EPERM : ret;
		}

		if (action == NF_QUEUE) {
			ret = nf_queue(skb, state, e, s, verdict);
			if (ret != 1)
				return ret;
			continue;
		}

		/* Implicit handling for NF_STOLEN, as well as any other
		 * non conventional verdicts.
		 */
		return 0;
	}

	return 1;
}
- 遍历hook点上注册的所有hook处理函数(nf_hook_entry),这些处理函数可能属于不同的table,对每一个执行如下函数
/*
 * Invoke the handler stored in @entry on @skb, passing the entry's private
 * pointer as the first argument, and return the handler's verdict.
 */
static inline int
nf_hook_entry_hookfn(const struct nf_hook_entry *entry, struct sk_buff *skb,
		     struct nf_hook_state *state)
{
	void *hook_priv = entry->priv;

	return entry->hook(hook_priv, skb, state);
}
对于filter table来说,会执行:
/*
 * Hook handler shared by all filter-table hooks: run the packet through the
 * filter table of the namespace recorded in @state and return the verdict
 * from ipt_do_table(). @priv is unused.
 */
static unsigned int
iptable_filter_hook(void *priv, struct sk_buff *skb,
		    const struct nf_hook_state *state)
{
	struct xt_table *filter_table = state->net->ipv4.iptable_filter;

	return ipt_do_table(skb, state, filter_table);
}
下面具体分析一下:
流程上基本就是逐条匹配rule(ipt_entry):每条rule先进行标准match,再进行扩展match;如果匹配成功,则执行标准target或扩展target
/* Start at the entry recorded in hook_entry[] for this hook. */
e = get_entry(table_base, private->hook_entry[hook]);
do {
const struct xt_entry_target *t;
const struct xt_entry_match *ematch;
struct xt_counters *counter;
/* Built-in IP match; on a miss, advance to the next rule. */
if (!ip_packet_match(ip, indev, outdev,
&e->ip, acpar.fragoff)) {
/* Also the landing point when an extension match below fails. */
no_match:
e = ipt_next_entry(e);
continue;
}
/* Run every extension match of the rule; the first miss rejects the rule. */
xt_ematch_foreach(ematch, e) {
acpar.match = ematch->u.kernel.match;
acpar.matchinfo = ematch->data;
if (!acpar.match->match(skb, &acpar))
goto no_match;
}
/* NOTE(review): target handling for a matched rule is elided in this excerpt. */
} while (!acpar.hotdrop);
标准的匹配ip_packet_match
扩展match对应于用户自定义,其结构和上面分析的target一样
如果匹配成功,会执行target
/* Standard target? */
/* A target object without a target() function is handled inline via its verdict. */
if (!t->u.kernel.target->target) {
int v;
v = ((struct xt_standard_target *)t)->verdict;
if (v < 0) {
/* Pop from stack? */
if (v != XT_RETURN) {
/* Final verdict: decode it from its stored form (-verdict - 1). */
verdict = (unsigned int)(-v) - 1;
break;
}
if (stackidx == 0) {
/* Jump stack is empty: continue at the hook's underflow entry. */
e = get_entry(table_base,
private->underflow[hook]);
} else {
/* Return to the rule after the one that was pushed on the jump stack. */
e = jumpstack[--stackidx];
e = ipt_next_entry(e);
}
continue;
}
/* v >= 0 is an offset from table_base; push e unless v is just the next rule or IPT_F_GOTO is set. */
if (table_base + v != ipt_next_entry(e) &&
!(e->ip.flags & IPT_F_GOTO)) {
/* Jump stack is full: drop the packet instead of overflowing. */
if (unlikely(stackidx >= private->stacksize)) {
verdict = NF_DROP;
break;
}
jumpstack[stackidx++] = e;
}
e = get_entry(table_base, v);
continue;
}
/* Extension target: call its target() function with the rule's target data. */
acpar.target = t->u.kernel.target;
acpar.targinfo = t->data;
verdict = t->u.kernel.target->target(skb, &acpar);
if (verdict == XT_CONTINUE) {
/* Target might have changed stuff. */
ip = ip_hdr(skb);
e = ipt_next_entry(e);
} else {
/* Verdict */
break;
}