上一节讲了钩子函数如何用表中的规则处理报文,从而实现防火墙功能;这一节描述表的初始化流程。为了与上一节保持连贯,我们先回到filter表钩子函数的初始化入口:
所在文件net/ipv4/netfilter/iptable_filter.c
/*
 * Module entry point for the filter table: register the per-net-namespace
 * operations first, then link the packet hooks. If linking the hooks fails,
 * the per-net registration is rolled back and the error is returned.
 */
static int __init iptable_filter_init(void)
{
	int err = register_pernet_subsys(&iptable_filter_net_ops);

	if (err < 0)
		return err;

	/* Register hooks */
	filter_ops = xt_hook_link(&packet_filter, iptable_filter_hook);
	if (!IS_ERR(filter_ops))
		return err;

	/* Hook registration failed: undo the per-net registration. */
	err = PTR_ERR(filter_ops);
	unregister_pernet_subsys(&iptable_filter_net_ops);
	return err;
}
上一节讲了函数xt_hook_link(&packet_filter, iptable_filter_hook);会将钩子函数iptable_filter_hook注册到挂载点,其中packet_filter就是filter对应的表。
表结构定义如下:
所在文件include/linux/netfilter/x_tables.h
/* Descriptor of one xtables table: identity, owning module, hooks and current rule set. */
struct xt_table {
struct list_head list;
/* What hooks you will enter on */
unsigned int valid_hooks;
/* Man behind the curtain... */
struct xt_table_info *private; /* ultimately holds the table's rule information */
/* Set this to THIS_MODULE if you are a module, otherwise NULL */
struct module *me;
u_int8_t af; /* address/protocol family */
int priority; /* hook order */
/* A unique name... */
const char name[XT_TABLE_MAXNAMELEN];
};
packet_filter的声明如下:
/*
 * Template descriptor for the IPv4 "filter" table. Only the static identity
 * fields are set here; the initializers follow the member order of
 * struct xt_table.
 */
static const struct xt_table packet_filter = {
	.valid_hooks	= FILTER_VALID_HOOKS,	/* hooks this table attaches to */
	.me		= THIS_MODULE,		/* owning module */
	.af		= NFPROTO_IPV4,		/* address family */
	.priority	= NF_IP_PRI_FILTER,	/* hook ordering */
	.name		= "filter",		/* unique table name */
};
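上一节讲了以上声明的对象会用于钩子函数的初始化流程,那么表中其他成员(例如private)是如何初始化的呢?
和钩子函数的初始化流程类似,表也有自己的初始化入口函数。该函数属于iptable_filter_init中通过register_pernet_subsys注册的iptable_filter_net_ops,会在模块加载时针对已有的网络命名空间被调用(此后每新建一个网络命名空间也会调用一次),定义如下: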
所在文件net/ipv4/netfilter/iptable_filter.c
/*
 * Per-net-namespace initialisation of the filter table: build the initial
 * rule blob, set the FORWARD chain's verdict from the `forward` setting,
 * register the table for this namespace and free the temporary blob.
 *
 * Returns 0 on success, -ENOMEM if the blob cannot be allocated, or the
 * error encoded in the pointer returned by ipt_register_table().
 */
static int __net_init iptable_filter_net_init(struct net *net)
{
	struct ipt_replace *initial = ipt_alloc_initial_table(&packet_filter);
	struct ipt_standard *chains;

	if (!initial)
		return -ENOMEM;

	/* Entry 1 is the FORWARD hook */
	chains = (struct ipt_standard *)initial->entries;
	chains[1].target.verdict = forward ? -NF_ACCEPT - 1 : -NF_DROP - 1;

	net->ipv4.iptable_filter = ipt_register_table(net, &packet_filter,
						      initial);

	/* The blob is freed on both the success and the failure path. */
	kfree(initial);

	return PTR_ERR_OR_ZERO(net->ipv4.iptable_filter);
}
iptable_filter_net_init首先通过ipt_alloc_initial_table分配并初始化一个ipt_replace结构,该结构定义在文件include/uapi/linux/netfilter_ipv4/ip_tables.h中,如下所示:
/* Temporary container describing a complete rule set to be installed into a table. */
struct ipt_replace {
/* Which table. */
char name[XT_TABLE_MAXNAMELEN];
/* Which hook entry points are valid: bitmask. You can't
change this. */
unsigned int valid_hooks;
/* Number of entries */
unsigned int num_entries;
/* Total size of new entries */
unsigned int size;
/* Hook entry points. */
/* Per hook: offset, relative to entries, of the chain's first rule. */
unsigned int hook_entry[NF_INET_NUMHOOKS];
/* Underflow points. */
/* Per hook: offset, relative to entries, of the chain's last rule. */
unsigned int underflow[NF_INET_NUMHOOKS];
/* Information about old entries: */
/* Number of counters (must be equal to current number of entries). */
unsigned int num_counters;
/* The old entries' counters. */
struct xt_counters __user *counters;
/* The entries (hang off end: not really an array). */
struct ipt_entry entries[0];
};
该结构临时存储要注册到表中的规则信息。
在文件net/ipv4/netfilter/ip_tables.c中ipt_alloc_initial_table定义如下:
/*
 * Allocate and fill the initial ipt_replace blob for the given table by
 * instantiating the xt_alloc_initial_table() macro for the ipt/IPT types.
 *
 * NOTE: the macro reads the table through the identifier `info`
 * (info->valid_hooks, info->name) and contains its own `return NULL` on
 * allocation failure, so the parameter must keep this exact name.
 *
 * Returns the allocated blob, or NULL if the allocation fails.
 */
void *ipt_alloc_initial_table(const struct xt_table *info)
{
return xt_alloc_initial_table(ipt, IPT);
}
xt_alloc_initial_table 是一个宏,定义在文件net/netfilter/xt_repldata.h中,如下所示:
/*
 * xt_alloc_initial_table(type, typ2) - build the initial rule blob of a table.
 *
 * Must be expanded inside a function that has a variable named `info`
 * pointing to the table descriptor: the macro reads info->valid_hooks and
 * info->name, and on allocation failure it executes `return NULL` in the
 * enclosing function.
 *
 * `type` selects the struct prefix (type##_replace, type##_standard,
 * type##_error); `typ2` selects the initializer prefix (typ2##_STANDARD_INIT,
 * typ2##_ERROR_INIT).
 *
 * Layout of the zero-initialised allocation:
 *   - one type##_replace header;
 *   - one type##_standard entry per bit set in valid_hooks, each initialised
 *     with an NF_ACCEPT verdict; hook_entry[] and underflow[] of that hook
 *     both hold the entry's byte offset;
 *   - one type##_error terminator, placed at the first offset after the
 *     standard entries that satisfies its alignment.
 *
 * repl.num_entries is nhooks + 1, and repl.size is the size of nhooks
 * standard entries plus one error entry. The value of the statement
 * expression is the pointer to the allocation.
 */
#define xt_alloc_initial_table(type, typ2) ({ \
unsigned int hook_mask = info->valid_hooks; \
unsigned int nhooks = hweight32(hook_mask); \
unsigned int bytes = 0, hooknum = 0, i = 0; \
struct { \
struct type##_replace repl; \
struct type##_standard entries[]; \
} *tbl; \
struct type##_error *term; \
size_t term_offset = (offsetof(typeof(*tbl), entries[nhooks]) + \
__alignof__(*term) - 1) & ~(__alignof__(*term) - 1); \
tbl = kzalloc(term_offset + sizeof(*term), GFP_KERNEL); \
if (tbl == NULL) \
return NULL; \
term = (struct type##_error *)&(((char *)tbl)[term_offset]); \
strncpy(tbl->repl.name, info->name, sizeof(tbl->repl.name)); \
*term = (struct type##_error)typ2##_ERROR_INIT; \
tbl->repl.valid_hooks = hook_mask; \
tbl->repl.num_entries = nhooks + 1; \
tbl->repl.size = nhooks * sizeof(struct type##_standard) + \
sizeof(struct type##_error); \
for (; hook_mask != 0; hook_mask >>= 1, ++hooknum) { \
if (!(hook_mask & 1)) \
continue; \
tbl->repl.hook_entry[hooknum] = bytes; \
tbl->repl.underflow[hooknum] = bytes; \
tbl->entries[i++] = (struct type##_standard) \
typ2##_STANDARD_INIT(NF_ACCEPT); \
bytes += sizeof(struct type##_standard); \
} \
tbl; \
})
刚开始看这段代码可能会有点困难,但只要把type替换成ipt、typ2替换成IPT后就会明了很多,其实就是申请了一个
struct {
struct ipt_replace repl;
struct ipt_standard entries[nhooks + 1]; /* nhooks = hweight32(hook_mask); simplified view: in the macro the final slot is a struct ipt_error terminator, not an ipt_standard */
} *tbl;
结构体空间,并对其初始化。其中nhooks是valid_hooks中置位的挂载点个数,对filter表而言nhooks=3。
经过该宏初始化后返回的tbl表结构如下:
ipt_replace初始化后通过ipt_register_table将表注册到内核。该函数定义在文件
net/ipv4/netfilter/ip_tables.c中,定义如下:
/*
 * Register an IPv4 table in a network namespace.
 *
 * @net:   namespace the table is registered in
 * @table: template descriptor of the table
 * @repl:  initial rule blob; repl->size bytes of rules are copied from it
 *
 * Returns the registered table, or an ERR_PTR() carrying -ENOMEM or the error
 * reported by translate_table() / xt_register_table().
 */
struct xt_table *ipt_register_table(struct net *net,
				    const struct xt_table *table,
				    const struct ipt_replace *repl)
{
	/* All-zero placeholder handed to xt_register_table(). */
	struct xt_table_info bootstrap = {0};
	struct xt_table_info *info;	/* holds the table's rule information */
	struct xt_table *registered;
	void *rules;
	int err;

	/* Obtain a table info object sized for repl->size bytes of rules. */
	info = xt_alloc_table_info(repl->size);
	if (info == NULL)
		return ERR_PTR(-ENOMEM);

	/* Copy the rules out of the temporary ipt_replace blob. */
	rules = info->entries;
	memcpy(rules, repl->entries, repl->size);

	/* Fill in the info object and validate every rule. */
	err = translate_table(net, info, rules, repl);
	if (err == 0) {
		/* Attach the info object to the table and register it. */
		registered = xt_register_table(net, table, &bootstrap, info);
		if (!IS_ERR(registered))
			return registered;
		err = PTR_ERR(registered);
	}

	xt_free_table_info(info);
	return ERR_PTR(err);
}
translate_table函数还是有点复杂,我们重点分析下,该函数定义如下:
/*
 * Fill in @newinfo from @repl and validate the rule blob at @entry0.
 *
 * Steps, in order:
 *   1. copy size / entry count and reset all hook offsets to 0xFFFFFFFF;
 *   2. walk every entry through check_entry_size_and_hooks(), counting
 *      entries and bumping newinfo->stacksize for each XT_ERROR_TARGET entry;
 *   3. require the entry count to match repl->num_entries;
 *   4. require hook_entry/underflow to be assigned for every valid hook;
 *   5. run mark_source_chains() over the chains (-ELOOP if it fails);
 *   6. run find_check_entry() on every entry; on failure, call
 *      cleanup_entry() on the entries that had already passed.
 *
 * Returns 0 on success or a negative errno.
 */
static int
translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
		const struct ipt_replace *repl)
{
	struct ipt_entry *rule;
	unsigned int hook;
	unsigned int walked = 0;	/* entries seen by the offset check */
	unsigned int checked = 0;	/* entries accepted by find_check_entry() */
	int err = 0;

	newinfo->size = repl->size;
	newinfo->number = repl->num_entries;

	/* Init all hooks to impossible value. */
	for (hook = 0; hook < NF_INET_NUMHOOKS; hook++) {
		newinfo->hook_entry[hook] = 0xFFFFFFFF;
		newinfo->underflow[hook] = 0xFFFFFFFF;
	}

	duprintf("translate_table: size %u\n", newinfo->size);

	/* Walk through entries, checking offsets. */
	xt_entry_foreach(rule, entry0, newinfo->size) {
		err = check_entry_size_and_hooks(rule, newinfo, entry0,
						 entry0 + repl->size,
						 repl->hook_entry,
						 repl->underflow,
						 repl->valid_hooks);
		if (err != 0)
			return err;
		walked++;
		if (strcmp(ipt_get_target(rule)->u.user.name,
			   XT_ERROR_TARGET) == 0)
			++newinfo->stacksize;
	}

	if (walked != repl->num_entries) {
		duprintf("translate_table: %u not %u entries\n",
			 walked, repl->num_entries);
		return -EINVAL;
	}

	/* Check hooks all assigned */
	for (hook = 0; hook < NF_INET_NUMHOOKS; hook++) {
		/* Only hooks which are valid */
		if (!(repl->valid_hooks & (1 << hook)))
			continue;
		if (newinfo->hook_entry[hook] == 0xFFFFFFFF) {
			duprintf("Invalid hook entry %u %u\n",
				 hook, repl->hook_entry[hook]);
			return -EINVAL;
		}
		if (newinfo->underflow[hook] == 0xFFFFFFFF) {
			duprintf("Invalid underflow %u %u\n",
				 hook, repl->underflow[hook]);
			return -EINVAL;
		}
	}

	/* Reject rule sets whose chains contain a loop. */
	if (!mark_source_chains(newinfo, repl->valid_hooks, entry0))
		return -ELOOP;

	/* Finally, each sanity check must pass */
	xt_entry_foreach(rule, entry0, newinfo->size) {
		err = find_check_entry(rule, net, repl->name, repl->size);
		if (err != 0)
			break;
		checked++;
	}
	if (err == 0)
		return 0;

	/* Undo only the entries that passed before the failing one. */
	xt_entry_foreach(rule, entry0, newinfo->size) {
		if (checked == 0)
			break;
		checked--;
		cleanup_entry(rule, net);
	}
	return err;
}
在分析mark_source_chains前为了更加直观我们先列出规则保存的链表结构如下:
规则与各规则链首条规则地址记录成员(hook_entry[])关系如下:
表3中的xt_table_info就是linux 防火墙架构中的“表-table”,hook_entry[]就是所谓的“链-chain”,ipt_entry就是所谓的“规则-rule”。
现在我们对表、链、规则有了一定认识后再对mark_source_chains函数继续分析会容易很多,有了地图才不会容易迷路。函数定义如下:
/*
 * Walk every valid hook's chain, following fall-throughs and jumps, and
 * detect loops without recursion.
 *
 * While a walk is in progress two fields of each rule are borrowed:
 *   - counters.pcnt holds a back pointer (offset of the rule we came from);
 *     it is reset to 0 while backtracking;
 *   - comefrom collects the bitmask of hooks that reach the rule, plus bit
 *     NF_INET_NUMHOOKS as an "on the current path" marker.
 *
 * Returns 1 if all chains are fine, 0 on a loop or an invalid verdict/offset.
 */
static int
mark_source_chains(const struct xt_table_info *newinfo,
unsigned int valid_hooks, void *entry0)
{
unsigned int hook;
/* No recursion; use packet counter to save back ptrs (reset
to 0 as we leave), and comefrom to save source hook bitmask */
/* Iterate over every hook, i.e. every built-in chain of the table. */
for (hook = 0; hook < NF_INET_NUMHOOKS; hook++) {
/* hook_entry[hook] is the offset of the chain's first rule from the rule base address entry0. */
unsigned int pos = newinfo->hook_entry[hook];
struct ipt_entry *e = (struct ipt_entry *)(entry0 + pos);
/* Skip hooks that are not valid for this table. */
if (!(valid_hooks & (1 << hook)))
continue;
/* Set initial back pointer. */
/* next_offset leads to the following rule; counters.pcnt is used here as
the link in the opposite direction. For the first rule of a chain it holds
the rule's own offset, which is how the backtracking loop below recognises
that it has returned to the start of the chain. */
e->counters.pcnt = pos;
/* Walk the rules reachable from this hook. */
for (;;) {
/* Target of the current rule. */
const struct xt_standard_target *t
= (void *)ipt_get_target_c(e);
/* Non-zero if this rule was already reached from this hook (its hook bit is set in comefrom). */
int visited = e->comefrom & (1 << hook);
/* Bit NF_INET_NUMHOOKS still set means the rule is on the path currently being walked: a loop. */
if (e->comefrom & (1 << NF_INET_NUMHOOKS)) {
pr_err("iptables: loop hook %u pos %u %08X.\n",
hook, pos, e->comefrom);
return 0;
}
/* Mark the rule as reached from this hook and as being on the current path. */
e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS));
/* Unconditional return/END. */
/* Taken when the rule is unconditional with a standard target and a
negative verdict, or when the rule was already visited from this hook. */
if ((unconditional(e) &&
(strcmp(t->target.u.user.name,
XT_STANDARD_TARGET) == 0) &&
t->verdict < 0) || visited) {
unsigned int oldpos, size;
/* Reject a standard target whose negative verdict is below -NF_MAX_VERDICT - 1. */
if ((strcmp(t->target.u.user.name,
XT_STANDARD_TARGET) == 0) &&
t->verdict < -NF_MAX_VERDICT - 1) {
duprintf("mark_source_chains: bad "
"negative verdict (%i)\n",
t->verdict);
return 0;
}
/* Return: backtrack through the last
big jump. */
/* Follow the back pointers in reverse. */
do {
/* Toggle bit NF_INET_NUMHOOKS; it was set on the way in, so this clears the "on the current path" marker. */
e->comefrom ^= (1<<NF_INET_NUMHOOKS);
#ifdef DEBUG_IP_FIREWALL_USER
if (e->comefrom
& (1 << NF_INET_NUMHOOKS)) {
duprintf("Back unset "
"on hook %u "
"rule %u\n",
hook, pos);
}
#endif
/* Offset of the current rule. */
oldpos = pos;
/* Offset of the rule we came from. */
pos = e->counters.pcnt;
/* Reset the borrowed counter field. */
e->counters.pcnt = 0;
/* We're at the start. */
/* A back pointer equal to the rule's own offset marks the first rule of
the chain: this hook is done, move on to the next one. */
if (pos == oldpos)
goto next;
e = (struct ipt_entry *)
(entry0 + pos);
} while (oldpos == pos + e->next_offset);
/* Move along one */
/* The loop above stopped because the rule at oldpos does not directly
follow the rule at pos, i.e. it was reached through a jump. Continue the
walk with the rule that follows the rule at pos. */
size = e->next_offset;
e = (struct ipt_entry *)
(entry0 + pos + size);
if (pos + size >= newinfo->size)
return 0;
e->counters.pcnt = pos;
pos += size;
} else {/* Move forward along the chain. */
int newpos = t->verdict;
/* A standard target with a non-negative verdict is a jump. */
if (strcmp(t->target.u.user.name,
XT_STANDARD_TARGET) == 0 &&
newpos >= 0) {
/* The verdict is used as an offset; reject it if it lies beyond the last position an ipt_entry could start at. */
if (newpos > newinfo->size -
sizeof(struct ipt_entry)) {
duprintf("mark_source_chains: "
"bad verdict (%i)\n",
newpos);
return 0;
}
/* This a jump; chase it. */
duprintf("Jump rule %u -> %u\n",
pos, newpos);
e = (struct ipt_entry *)
(entry0 + newpos);
/* Fail if find_jump_target() does not accept the jump destination. */
if (!find_jump_target(newinfo, e))
return 0;
} else {
/* ... this is a fallthru */
/* Not a jump: continue with the rule that follows in memory. */
newpos = pos + e->next_offset;
if (newpos >= newinfo->size)
return 0;
}
/* Step to the next rule. */
e = (struct ipt_entry *)
(entry0 + newpos);
/* Store the offset of the rule we came from, for backtracking. */
e->counters.pcnt = pos;
/* pos now tracks the offset of the current rule. */
pos = newpos;
}
}
next:
duprintf("Finished chain %u\n", hook);
}
return 1;
}
规则通过next_offset和counters.pcnt分别记录下一条规则和上一条规则的偏移形成双向链表如下所示:
到此mark_source_chains函数就分析完了。这个函数实现虽然有些复杂,但是其主要功能其实很简单,就是通过遍历表中的各个规则链来判断是否形成环。
xt_table_info 对象准备好后通过调用xt_register_table将表最终注册到net对应表的链表中,这个函数除了xt_replace_table函数不是很好理解,其他部分都很简单。
该函数定义在文件
net/netfilter/x_tables.c中,定义如下:
struct xt_table *xt_register_table(struct net *net,
const struct xt_table *input_table,
struct xt_table_info *bootstrap,
struct xt_table_info *newinfo)
{
int ret;
struct xt_table_info *private;
struct xt_table *t, *table;
/* Don't add one object to multiple lists. */
/*申请xt_table表空间,并将input_table内容拷贝到新空间 */
table = kmemdup(input_table, sizeof(struct xt_table), GFP_KERNEL);
if (!table) {
ret = -ENOMEM;
goto out;
}
/*注册前上锁*/
mutex_lock(&xt[table->af].mutex);
/* Don't autoload: we'd eat our tail... */
/*判断表是否已经注册过,如果注册过则返回对应错误*/
list_for_each_entry(t, &net->xt.tables[table->af], list) {
if (strcmp(t->name, table->name) == 0) {
ret = -EEXIST;
goto unlock;
}
}
/* Simplifies replace_table code. */
/*将table->private指向一个初始值都为0的xt_table_info变量,这个变量的意义是为了实现xt_replace_table在多核CPU中的复用,为了描述方便后面就叫初始化参考量 */
table->private = bootstrap;
/*这个函数功能是将 table ->private指针指向newinfo ,功能非常简单,但函数实现不是很好理解,下面我们再展开分析下*/
if (!xt_replace_table(table, 0, newinfo, &ret))
goto unlock;
/*打印规则数*/
private = table->private;
pr_debug("table->private->number = %u\n", private->number);
/* save number of initial entries */
/*记录表中初始化的规则数*/
private->initial_entries = private->number;
/*将包注册到net->xt.tables */
list_add(&table->list, &net->xt.tables[table->af]);
/*注册完后需解锁*/
mutex_unlock(&xt[table->af].mutex);
return table;
unlock:
mutex_unlock(&xt[table->af].mutex);
kfree(table);
out:
return ERR_PTR(ret);
}
上面说了xt_replace_table函数可能会在多核CPU中同时被调用,现在我们来看下具体实现,该函数也是在net/netfilter/x_tables.c中定义的,函数定义如下:
/*
 * Install newinfo as the rule set of table.
 *
 * @table:        table whose `private` pointer is switched
 * @num_counters: entry count the caller expects the current rule set to have
 * @newinfo:      rule set to install
 * @error:        receives the error code when NULL is returned
 *
 * Returns the previously installed rule set on success. Returns NULL with
 * *error set to the xt_jumpstack_alloc() error, or to -EAGAIN when
 * num_counters does not match the current rule set's entry count.
 */
struct xt_table_info *
xt_replace_table(struct xt_table *table,
unsigned int num_counters,
struct xt_table_info *newinfo,
int *error)
{
struct xt_table_info *private;
int ret;
/* Allocate the jump stack for newinfo (its purpose is not covered here). */
ret = xt_jumpstack_alloc(newinfo);
if (ret < 0) {
*error = ret;
return NULL;
}
/* Do the substitution. */
/* Disable bottom-half processing for the duration of the swap. */
local_bh_disable();
/* Currently installed rule set; on the xt_register_table() path this is the zeroed bootstrap placeholder. */
private = table->private;
/* Check inside lock: is the old number correct? */
/* Bail out with -EAGAIN unless the caller's num_counters equals the current rule set's entry count. */
if (num_counters != private->number) {
pr_debug("num_counters != table->private->number (%u/%u)\n",
num_counters, private->number);
local_bh_enable();
*error = -EAGAIN;
return NULL;
}
/* Carry the old rule set's initial_entries over to the new one.
NOTE(review): on the xt_register_table() path this value is overwritten
right after this function returns; presumably it matters for other callers
that replace an existing rule set - confirm against those callers. */
newinfo->initial_entries = private->initial_entries;
/*
* Ensure contents of newinfo are visible before assigning to
* private.
*/
/* Write barrier: orders the stores that filled in newinfo before the pointer store below. */
smp_wmb();
/* Publish the new rule set. */
table->private = newinfo;
/*
* Even though table entries have now been swapped, other CPU's
* may still be using the old entries. This is okay, because
* resynchronization happens because of the locking done
* during the get_counters() routine.
*/
/* Re-enable bottom-half processing. */
local_bh_enable();
#ifdef CONFIG_AUDIT
if (audit_enabled) {
struct audit_buffer *ab;
ab = audit_log_start(current->audit_context, GFP_KERNEL,
AUDIT_NETFILTER_CFG);
if (ab) {
/* Note: the entry count logged here is that of the old rule set. */
audit_log_format(ab, "table=%s family=%u entries=%u",
table->name, table->af,
private->number);
audit_log_end(ab);
}
}
#endif
return private;
}
到这里filter表的初始化流程就结束了。
最后说明下分析内核复杂代码最重要的是心态,先了解代码要实现的功能、然后了解架构、最后用一种平和的心态分析细节,分析代码细节时不要图快,力图把每一处读懂。当复杂代码看得多了才能快。