author: jonathan
Copyright of this document belongs to jonathan. It may be freely redistributed; please keep the document intact when doing so.
/*----------------------------------------------------------------------------------------------------------------------------*/
Notes on the Linux connection tracking flow (linux-2.6.31)
0 Main data structures
0.1
/* The kernel abstracts a struct net representing a network namespace; it holds the per-protocol-family state. For PF_INET, the part we care about here is the connection tracking table. */
struct net {
atomic_t count; /* To decide when the network
* namespace should be freed.
*/
...
/* As the #ifdef below shows, connection tracking is built on top of NETFILTER */
#ifdef CONFIG_NETFILTER
struct netns_xt xt;
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
struct netns_ct ct; /* the connection tracking table */
#endif
#endif
#ifdef CONFIG_XFRM
struct netns_xfrm xfrm; /* IPsec and other tunnel-related state */
#endif
struct net_generic *gen;
};
struct netns_ct {
atomic_t count;
unsigned int expect_count;
struct hlist_nulls_head *hash; /* the main conntrack hash table */
struct hlist_head *expect_hash;
struct hlist_nulls_head unconfirmed; /* conntracks created but not yet confirmed */
struct hlist_nulls_head dying; /* conntracks being torn down */
struct ip_conntrack_stat *stat;
....
int hash_vmalloc; /* nonzero if the hash table was vmalloc()ed */
int expect_vmalloc; /* likewise for the expectation table */
};
/* So what kind of list is hlist_nulls_node? */
/* An ordinary hlist is terminated by a NULL pointer, whereas an hlist_nulls list ends with a tagged marker value. This works because kernel objects are aligned to at least 4 or 8 bytes, so the low bits of a real pointer are always 0; the lowest bit can therefore be reused as an end-of-list flag: 1 means "this is the end marker", 0 means "this is a real node". The remaining bits of the marker usually encode a value identifying the list, as the example below shows: */
struct hlist_nulls_node {
struct hlist_nulls_node *next, **pprev;
};
#define INIT_HLIST_NULLS_HEAD(ptr, nulls) \
((ptr)->first = (struct hlist_nulls_node *) (1UL | (((long)nulls) << 1)))
/* for example:
#define UNCONFIRMED_NULLS_VAL ((1<<30)+0)
#define DYING_NULLS_VAL ((1<<30)+1)
INIT_HLIST_NULLS_HEAD(&net->ct.dying, DYING_NULLS_VAL);
*/
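To see the trick in action, here is a minimal userspace sketch. is_a_nulls() and get_nulls_value() mirror the real helpers in include/linux/list_nulls.h; the main() harness is ours, purely for illustration. In conntrack, each hash bucket's nulls value is its bucket index, so a lockless lookup that walks off the end can check get_nulls_value() to verify it finished on the chain it started on, and restart otherwise.

#include <stdio.h>

struct hlist_nulls_node {
	struct hlist_nulls_node *next, **pprev;
};

struct hlist_nulls_head {
	struct hlist_nulls_node *first;
};

#define INIT_HLIST_NULLS_HEAD(ptr, nulls) \
	((ptr)->first = (struct hlist_nulls_node *) (1UL | (((long)nulls) << 1)))

/* Low bit set => this "pointer" is really the end-of-list marker. */
static int is_a_nulls(const struct hlist_nulls_node *ptr)
{
	return (unsigned long)ptr & 1;
}

/* Recover the value encoded in the marker (the bits above the tag bit). */
static unsigned long get_nulls_value(const struct hlist_nulls_node *ptr)
{
	return (unsigned long)ptr >> 1;
}

int main(void)
{
	struct hlist_nulls_head head;

	INIT_HLIST_NULLS_HEAD(&head, 42);

	if (is_a_nulls(head.first))	/* true: the list is empty */
		printf("end marker, nulls value = %lu\n",
		       get_nulls_value(head.first));	/* prints 42 */
	return 0;
}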
/* Back to struct net: its conntrack resources are initialized through nf_conntrack_init */
static struct pernet_operations nf_conntrack_net_ops = {
.init = nf_conntrack_net_init, /* calls nf_conntrack_init; invoked once for every net namespace */
.exit = nf_conntrack_net_exit,
};
static int __init nf_conntrack_standalone_init(void)
{
return register_pernet_subsys(&nf_conntrack_net_ops);
}
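For reference, nf_conntrack_net_init (nf_conntrack_standalone.c) looks roughly like the following; this is a simplified sketch, with the per-namespace proc and sysctl registration elided:

static int nf_conntrack_net_init(struct net *net)
{
	int ret;

	ret = nf_conntrack_init(net); /* allocates net->ct.hash, the unconfirmed/dying lists, stats */
	if (ret < 0)
		return ret;
	/* ... proc and sysctl registration elided ... */
	return 0;
}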
0.2
/* Records a packet's identifying fields; for TCP/IP this is the 5-tuple */
struct nf_conntrack_tuple
{
struct nf_conntrack_man src; /* the source side: protocol family, address, port, etc. */
/* These are the parts of the tuple which are fixed. */
struct {
union nf_inet_addr u3; /* destination address */
/* destination port: a per-protocol union (u.tcp.port, u.udp.port, ...) is elided here */
/* The protocol. */
u_int8_t protonum; /* L4 protocol: TCP, UDP, GRE, ... */
/* The direction (for tuplehash) */
u_int8_t dir; /* direction: IP_CT_DIR_ORIGINAL or IP_CT_DIR_REPLY */
} dst;
};
0.3
struct nf_conntrack_tuple_hash {
struct hlist_nulls_node hnnode; /* links this tuple into net->ct.hash */
struct nf_conntrack_tuple tuple;
};
0.4
struct sk_buff {
....
struct nf_conntrack *nfct; /* the packet's conntrack state; it actually points at the nf_conn header (see nf_ct_get() after struct nf_conn below) */
....
};
struct nf_conntrack {
atomic_t use;
};
struct nf_conn {
/* Usage count in here is 1 for hash table/destruct timer, 1 per skb,
plus 1 for any connection(s) we are `master' for */
struct nf_conntrack ct_general;
spinlock_t lock;
/* XXX should I move this to the tail ? - Y.K */
/* These are my tuples; original and reply */
struct nf_conntrack_tuple_hash tuplehash[IP_CT_DIR_MAX];
/* Have we seen traffic both ways yet? (bitset) */
unsigned long status;
/* If we were expected by an expectation, this will be it */
struct nf_conn *master;
/* Timer function; drops refcnt when it goes off. */
struct timer_list timeout;
...
/* Storage reserved for other modules: */
union nf_conntrack_proto proto;
/* Extensions */
struct nf_ct_ext *ext;
#ifdef CONFIG_NET_NS
struct net *ct_net;
#endif
};
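The claim that skb->nfct is "really the head of an nf_conn" is exactly what nf_ct_get() (include/net/netfilter/nf_conntrack.h) relies on; it just casts the pointer back, which is safe because ct_general is the first member of struct nf_conn:

/* Return conntrack and conntrack_info for the given skb. */
static inline struct nf_conn *
nf_ct_get(const struct sk_buff *skb, enum ip_conntrack_info *ctinfo)
{
	*ctinfo = skb->nfctinfo;
	return (struct nf_conn *)skb->nfct; /* ct_general is nf_conn's first member */
}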
0.5 Summary of the data structure relationships
sk_buff.nfct ------------------------------------------------------> nf_conn
net.ct.hash[hash] -> nf_conntrack_tuple_hash (hnnode) -container_of-> nf_conn -> tuplehash[orig/reply] -> nf_conntrack_tuple
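The container_of step in the diagram is nf_ct_tuplehash_to_ctrack() (include/net/netfilter/nf_conntrack.h): the hash buckets chain the embedded tuplehash nodes, and the dir byte stored in the tuple tells the helper which slot of the two-entry array the node occupies, so the owning nf_conn can be recovered:

static inline struct nf_conn *
nf_ct_tuplehash_to_ctrack(const struct nf_conntrack_tuple_hash *hash)
{
	/* hash is &ct->tuplehash[dir]; subtracting that member's offset yields ct */
	return container_of(hash, struct nf_conn,
			    tuplehash[hash->tuple.dst.dir]);
}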
1 Initialization
static int __init nf_conntrack_l3proto_ipv4_init(void)
{
int ret = 0;
need_conntrack();
nf_defrag_ipv4_enable();
ret = nf_register_sockopt(&so_getorigdst);
if (ret < 0) {
printk(KERN_ERR "Unable to register netfilter socket option\n");
return ret;
}
/* Register the L4 protocols */
/* All protocol-specific handling is registered through this interface; protocol-independent processing lives in the generic code path */
/* The L4 handlers live in the global two-dimensional array nf_ct_protos, whose entries default to the nf_conntrack_l4proto_generic handler */
/* struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 __read_mostly =
{
.l3proto = PF_INET,
.l4proto = IPPROTO_TCP,
.name = "tcp",
....
} */
/* Registration completes in nf_conntrack_l4proto_register via rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto], l4proto) */
ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp4);
if (ret < 0) {
printk("nf_conntrack_ipv4: can't register tcp.\n");
goto cleanup_sockopt;
}
ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp4);
if (ret < 0) {
printk("nf_conntrack_ipv4: can't register udp.\n");
goto cleanup_tcp;
}
ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_icmp);
if (ret < 0) {
printk("nf_conntrack_ipv4: can't register icmp.\n");
goto cleanup_udp;
}
/* Register the L3 protocol */
/* As with L4, all protocol-specific handling is registered through this interface */
/* The L3 handlers live in the global array nf_ct_l3protos, whose entries default to the nf_conntrack_l3proto_generic handler */
/* extern struct nf_conntrack_l3proto *nf_ct_l3protos[AF_MAX]; */
/* struct nf_conntrack_l3proto nf_conntrack_l3proto_generic __read_mostly = {
.l3proto = PF_UNSPEC,
.name = "unknown",
.pkt_to_tuple = generic_pkt_to_tuple, /* the generic handlers are essentially no-ops */
.invert_tuple = generic_invert_tuple,
.print_tuple = generic_print_tuple,
.get_l4proto = generic_get_l4proto,
}; */
/* Registration completes in nf_conntrack_l3proto_register via rcu_assign_pointer(nf_ct_l3protos[proto->l3proto], proto) */
ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv4);
if (ret < 0) {
printk("nf_conntrack_ipv4: can't register ipv4\n");
goto cleanup_icmp;
}
/* Register the netfilter hooks themselves; this topic is widely documented elsewhere, so it is not covered in detail here */
ret = nf_register_hooks(ipv4_conntrack_ops,
ARRAY_SIZE(ipv4_conntrack_ops));
...
return ret;
... /* error handling */
return ret;
}
In short, this function fills in three arrays: the L4 protocol array, the L3 protocol array, and the netfilter hook array.
Every packet is then processed in the order: netfilter hook functions -> L3 protocol handlers -> L4 protocol handlers.
Looking up a handler in those arrays is a plain RCU-protected array read, as the excerpt below shows. With that, let's trace the flow.
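From nf_conntrack_proto.c (2.6.31), lightly trimmed; note the fallback to the generic handler when nothing specific is registered:

struct nf_conntrack_l4proto *
__nf_ct_l4proto_find(u_int16_t l3proto, u_int8_t l4proto)
{
	/* unknown family, or no L4 table registered yet: fall back to generic */
	if (unlikely(l3proto >= AF_MAX || nf_ct_protos[l3proto] == NULL))
		return &nf_conntrack_l4proto_generic;

	return rcu_dereference(nf_ct_protos[l3proto][l4proto]);
}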
2 ipv4_conntrack_in
static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {
{
.hook = ipv4_conntrack_in,
.owner = THIS_MODULE,
.pf = PF_INET,
.hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP_PRI_CONNTRACK,
},
{
.hook = ipv4_conntrack_local,
.owner = THIS_MODULE,
.pf = PF_INET,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_CONNTRACK,
},
{
.hook = ipv4_confirm,
.owner = THIS_MODULE,
.pf = PF_INET,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP_PRI_CONNTRACK_CONFIRM,
},
{
.hook = ipv4_confirm,
.owner = THIS_MODULE,
.pf = PF_INET,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_CONNTRACK_CONFIRM,
},
};
From this array we can see the hooks fall into two groups: those that create a new connection (ipv4_conntrack_in, ipv4_conntrack_local) and those that confirm it (ipv4_confirm). Since connection tracking is the foundation for much other processing, the creation hooks run at a very high priority (NF_IP_PRI_CONNTRACK), while the confirmation hooks run at the lowest possible one (NF_IP_PRI_CONNTRACK_CONFIRM), i.e. last; see the enum excerpt below.
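Abridged from enum nf_ip_hook_priorities in include/linux/netfilter_ipv4.h (only the values relevant here):

enum nf_ip_hook_priorities {
	...
	NF_IP_PRI_CONNTRACK_DEFRAG = -400,	/* defragment before conntrack sees the packet */
	NF_IP_PRI_CONNTRACK = -200,		/* ipv4_conntrack_in / ipv4_conntrack_local */
	NF_IP_PRI_FILTER = 0,			/* the iptables filter table runs in between */
	NF_IP_PRI_CONNTRACK_CONFIRM = INT_MAX,	/* ipv4_confirm runs dead last */
	...
};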
ipv4_conntrack_in itself is trivial; it just calls nf_conntrack_in:
unsigned int
nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
struct sk_buff *skb)
{
...
/* Previously seen (loopback or untracked)? Ignore. */
if (skb->nfct) {
NF_CT_STAT_INC_ATOMIC(net, ignore);
return NF_ACCEPT;
}
/* rcu_read_lock()ed by nf_hook_slow */
/* look up the L3 protocol handler */
l3proto = __nf_ct_l3proto_find(pf);
/* extract the L4 protocol number */
ret = l3proto->get_l4proto(skb, skb_network_offset(skb),
&dataoff, &protonum);
if (ret <= 0) {
....
return -ret;
}
/* look up the L4 protocol handler */
l4proto = __nf_ct_l4proto_find(pf, protonum);
....
/* the main conntrack entry point: look up or create the connection */
ct = resolve_normal_ct(net, skb, dataoff, pf, protonum,
l3proto, l4proto, &set_reply, &ctinfo);
...
/* protocol-specific handling: essentially per-protocol state tracking (e.g. the TCP state machine) */
ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, hooknum);
...
if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
nf_conntrack_event_cache(IPCT_STATUS, ct);
return ret;
}
static inline struct nf_conn *
resolve_normal_ct(struct net *net,
struct sk_buff *skb,
unsigned int dataoff,
u_int16_t l3num,
u_int8_t protonum,
struct nf_conntrack_l3proto *l3proto,
struct nf_conntrack_l4proto *l4proto,
int *set_reply,
enum ip_conntrack_info *ctinfo)
{
struct nf_conntrack_tuple tuple;
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;
/* Call the L3 and L4 pkt_to_tuple hooks to build the packet's 5-tuple */
/* For PF_INET, the L3 hook just extracts the source and destination addresses */
/* For TCP, the L4 hook just extracts the source and destination ports (see the excerpt after this function) */
if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
dataoff, l3num, protonum, &tuple, l3proto,
l4proto)) {
pr_debug("resolve_normal_ct: Can't get tuple\n");
return NULL;
}
/* look for tuple match */
/* the tuple hash is computed with jhash2 */
h = nf_conntrack_find_get(net, &tuple);
if (!h) {/* a new connection: create a conntrack entry and add it to the per-net unconfirmed list */
h = init_conntrack(net, &tuple, l3proto, l4proto, skb, dataoff);
if (!h)
return NULL;
if (IS_ERR(h))
return (void *)h;
}
/* recover the actual conntrack entry from the tuple-hash node */
ct = nf_ct_tuplehash_to_ctrack(h);
/* update the conntrack state info */
/* It exists; we have (non-exclusive) reference. */
if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
*ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
/* Please set reply bit if this packet OK */
*set_reply = 1;
} else {
/* Once we've had two way comms, always ESTABLISHED. */
if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
pr_debug("nf_conntrack_in: normal packet for %p\n", ct);
*ctinfo = IP_CT_ESTABLISHED;
} else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
pr_debug("nf_conntrack_in: related packet for %p\n",
ct);
*ctinfo = IP_CT_RELATED;
} else {
pr_debug("nf_conntrack_in: new packet for %p\n", ct);
*ctinfo = IP_CT_NEW;
}
*set_reply = 0;
}
skb->nfct = &ct->ct_general;
skb->nfctinfo = *ctinfo;
return ct;
}
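As promised above, the TCP pkt_to_tuple hook really does nothing more than copy the two port numbers; from nf_conntrack_proto_tcp.c (2.6.31):

static bool tcp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
			     struct nf_conntrack_tuple *tuple)
{
	const struct tcphdr *hp;
	struct tcphdr _hdr;

	/* Actually only need first 8 bytes. */
	hp = skb_header_pointer(skb, dataoff, 8, &_hdr);
	if (hp == NULL)
		return false;

	tuple->src.u.tcp.port = hp->source;
	tuple->dst.u.tcp.port = hp->dest;

	return true;
}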
/* This function is mostly straightforward, but one line in it is quite clever; can you see why it works? tuplehash[IP_CT_DIR_MAX] is one-past-the-end of the two-entry tuplehash array, so the memset zeroes everything after the tuples while deliberately leaving ct_general, lock and the tuplehash nodes alone: the conntrack cache uses SLAB_DESTROY_BY_RCU, so lockless readers may still be walking the old hnnode pointers. */
struct nf_conn *nf_conntrack_alloc(struct net *net,
const struct nf_conntrack_tuple *orig,
const struct nf_conntrack_tuple *repl,
gfp_t gfp)
{
...
memset(&ct->tuplehash[IP_CT_DIR_MAX], 0,
sizeof(*ct) - offsetof(struct nf_conn, tuplehash[IP_CT_DIR_MAX]));
...
}
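A tiny userspace illustration of the same one-past-the-end idiom (struct demo and its fields are made up for this example):

#include <stdio.h>
#include <stddef.h>
#include <string.h>

struct demo {
	int keep[2];	/* preserved, like ct_general/lock/tuplehash in nf_conn */
	int a;		/* zeroed */
	long b;		/* zeroed */
};

int main(void)
{
	struct demo d = { {1, 2}, 3, 4L };

	/* &d.keep[2] is one-past-the-end of keep[]; zero everything after it */
	memset(&d.keep[2], 0, sizeof(d) - offsetof(struct demo, keep[2]));

	printf("%d %d %d %ld\n", d.keep[0], d.keep[1], d.a, d.b); /* 1 2 0 0 */
	return 0;
}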
3 ipv4_confirm
static unsigned int ipv4_confirm(unsigned int hooknum,
struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
...
return nf_conntrack_confirm(skb);
}
int
__nf_conntrack_confirm(struct sk_buff *skb)
{
unsigned int hash, repl_hash;
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;
struct nf_conn_help *help;
struct hlist_nulls_node *n;
enum ip_conntrack_info ctinfo;
struct net *net;
ct = nf_ct_get(skb, &ctinfo);
net = nf_ct_net(ct);
/* ipt_REJECT uses nf_conntrack_attach to attach related
ICMP/TCP RST packets in other direction. Actual packet
which created connection will be IP_CT_NEW or for an
expected connection, IP_CT_RELATED. */
if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
return NF_ACCEPT;
hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
/* We're not in hash table, and we refuse to set up related
connections for unconfirmed conns. But packet copies and
REJECT will give spurious warnings here. */
/* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
/* No external references means noone else could have
confirmed us. */
NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
pr_debug("Confirming conntrack %p\n", ct);
spin_lock_bh(&nf_conntrack_lock);
/* See if there's one in the list already, including reverse:
NAT could have grabbed it without realizing, since we're
not in the hash. If there is, we lost race. */
hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode)
if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
&h->tuple))
goto out;
hlist_nulls_for_each_entry(h, n, &net->ct.hash[repl_hash], hnnode)
if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
&h->tuple))
goto out;
/* Remove from unconfirmed list */
hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
/* Timer relative to confirmation time, not original
setting time, otherwise we'd get timer wrap in
weird delay cases. */
ct->timeout.expires += jiffies;
add_timer(&ct->timeout);
atomic_inc(&ct->ct_general.use);
set_bit(IPS_CONFIRMED_BIT, &ct->status);
/* Since the lookup is lockless, hash insertion must be done after
* starting the timer and setting the CONFIRMED bit. The RCU barriers
* guarantee that no other CPU can find the conntrack before the above
* stores are visible.
*/
__nf_conntrack_hash_insert(ct, hash, repl_hash);
NF_CT_STAT_INC(net, insert);
spin_unlock_bh(&nf_conntrack_lock);
help = nfct_help(ct);
if (help && help->helper)
nf_conntrack_event_cache(IPCT_HELPER, ct);
nf_conntrack_event_cache(master_ct(ct) ?
IPCT_RELATED : IPCT_NEW, ct);
return NF_ACCEPT;
out:
NF_CT_STAT_INC(net, insert_failed);
spin_unlock_bh(&nf_conntrack_lock);
return NF_DROP;
}
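For completeness, the final insertion helper from nf_conntrack_core.c: both directions' tuplehash nodes are added, so later lookups match the connection by either its original or its reply tuple:

void __nf_conntrack_hash_insert(struct nf_conn *ct,
				unsigned int hash,
				unsigned int repl_hash)
{
	struct net *net = nf_ct_net(ct);

	/* one node per direction: a lookup can hit either tuple */
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
				 &net->ct.hash[hash]);
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
				 &net->ct.hash[repl_hash]);
}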