关于连接跟踪
连接跟踪(CONNTRACK),就是跟踪并且记录连接状态。Linux为每一个经过网络堆栈的数据包,生成一个新的连接记录项(Connection entry)。此后,所有属于此连接的数据包都被唯一地分配给这个连接,并标识连接的状态。连接跟踪是防火墙模块的状态检测的基础,同时也是地址转换中实现SNAT和DNAT的前提。
1. 连接跟踪的相关结构
Netfilter中的连接记录项,即是对一个连接的产生,传输及终止进行跟踪记录。由所有记录项产生的表则称为连接跟踪表,这个表将存储所有连接的状态,它在算法上采用了hash算法。整个hash表用一个ip_conntrack_hash的全局指针来表示:
unsigned int ip_conntrack_htable_size = 0; //number of buckets (chains) in the hash table
int ip_conntrack_max = 0; //max number of tracked connections; computed later from the machine's memory size
struct list_head *ip_conntrack_hash; //the global conntrack hash table
每个hash的节点同时又是一个链表的头部。节点的结构体为ip_conntrack_tuple_hash,
/* Connections have two entries in the hash table: one for each way */
struct ip_conntrack_tuple_hash
{
struct list_head list;//chain link within a hash bucket
struct ip_conntrack_tuple tuple;//the tuple (addresses/ports/protocol) for this direction
/* this == &ctrack->tuplehash[DIRECTION(this)]. */
struct ip_conntrack *ctrack;//back-pointer to the owning connection entry
};
从socket编程的角度来说,使用 ip地址+端口的方式来标识一个连接。Netfilter延续了这种方式,使用ip_conntrack_tuple封装了连接的目的和来源。可以通过一个skbuff(网络数据包)得到这个元组。
/* The protocol-specific manipulable parts of the tuple: always in
   network order! */
union ip_conntrack_manip_proto
{
	/* Add other protocols here. */
	u_int32_t all;
	struct {
		u_int16_t port;
	} tcp;
	struct {
		u_int16_t port;
	} udp;
	struct {
		u_int16_t id;	/* ICMP has no ports; the echo id plays that role */
	} icmp;
	struct {
		u_int32_t key;
	} gre;
};

/* The manipulable part of the tuple: the half that NAT is allowed
   to rewrite (address plus per-protocol identifier). */
struct ip_conntrack_manip
{
	u_int32_t ip;				/* IPv4 address, network order */
	union ip_conntrack_manip_proto u;	/* port / icmp id / gre key */
};
- /* This contains the information to distinguish a connection. */
- struct ip_conntrack_tuple
- {
- struct ip_conntrack_manip src; //源端
- /* These are the parts of the tuple which are fixed. */
- struct {
- u_int32_t ip;
- union {
- /* Add other protocols here. */
- u_int32_t all;
- struct {
- u_int16_t port;
- } tcp;
- struct {
- u_int16_t port;
- } udp;
- struct {
- u_int8_t type, code;
- } icmp;
- struct {
- u_int32_t key;
- } gre;
- } u;
- /* The protocol. */
- u_int16_t protonum;
- } dst;//目的端
- };
注意tuple可以完全标识一个唯一的连接,但是它不能完整的描述一个连接的信息,描述一个连接的信息,使用ip_conntrack结构体,在ip_conntrack中有tuplehash[IP_CT_DIR_MAX],这个数组中包含“初始”和“应答”两个成员(tuplehash[IP_CT_DIR_ORIGINAL]和tuplehash[IP_CT_DIR_REPLY]),所以,当一个数据包进入连接跟踪模块后,先根据这个数据包的套接字对转换成一个“初始的”tuple,赋值给tuplehash[IP_CT_DIR_ORIGINAL],然后对这个数据包“取反”,计算出“应答”的tuple,赋值给tuplehash[IP_CT_DIR_REPLY]。
struct ip_conntrack
{
//… (other fields elided in this excerpt)
/* These are my tuples; original and reply */
struct ip_conntrack_tuple_hash tuplehash[IP_CT_DIR_MAX];
//… (other fields elided in this excerpt)
};
2. 连接跟踪的具体过程
连接跟踪是在ip_conntrack_standalone.c 中进行初始化的,并完成向Netfilter注册相关的钩子。
模块的init函数会调用init_or_cleanup(),在init_or_cleanup()中首先调用了ip_conntrack_init(),这个函数主要负责初始化连接跟踪的变量及相关的数据结构例如,连接跟踪表的大小。而后init_or_cleanup()函数又注册了几个非常重要的钩子。如下:
/* NOTE(review): simplified excerpt — the real kernel function also
   handles the cleanup path, checks each registration for failure and
   returns an error code; all of that is elided here. */
static int init_or_cleanup(int init)
{
ip_conntrack_init();    /* sizes the table, sets up hash and caches */
nf_register_hook(&ip_conntrack_in_ops);        /* NF_IP_PRE_ROUTING */
nf_register_hook(&ip_conntrack_local_out_ops); /* NF_IP_LOCAL_OUT */
nf_register_hook(&ip_conntrack_out_ops);       /* NF_IP_POST_ROUTING */
nf_register_hook(&ip_conntrack_local_in_ops);  /* NF_IP_LOCAL_IN */
}
钩子的结构体具体如下:
/* Connection tracking may drop packets, but never alters them, so
make it the first hook. */
static struct nf_hook_ops ip_conntrack_in_ops
= { { NULL, NULL }, ip_conntrack_in, PF_INET, NF_IP_PRE_ROUTING,
NF_IP_PRI_CONNTRACK };
/* Same early classification for locally generated packets. */
static struct nf_hook_ops ip_conntrack_local_out_ops
= { { NULL, NULL }, ip_conntrack_local, PF_INET, NF_IP_LOCAL_OUT,
NF_IP_PRI_CONNTRACK };
/* Refragmenter; last chance. */
static struct nf_hook_ops ip_conntrack_out_ops
= { { NULL, NULL }, ip_refrag, PF_INET, NF_IP_POST_ROUTING, NF_IP_PRI_LAST };
/* ip_confirm — presumably confirms the conntrack for locally
delivered packets (runs just before NF_IP_PRI_LAST). */
static struct nf_hook_ops ip_conntrack_local_in_ops
= { { NULL, NULL }, ip_confirm, PF_INET, NF_IP_LOCAL_IN, NF_IP_PRI_LAST-1 };
由上面的代码可以看出,输入的数据包将首先经过ip_conntrack_in()函数(NF_IP_PRE_ROUTING上NF_IP_PRI_CONNTRACK的优先级较高)。ip_conntrack_in()的主要任务是判断该数据包是否已经有连接跟踪记录,如果没有,则为这个数据包分配ip_conntrack结构,并初始化它。ip_conntrack_in()会进一步调用resolve_normal_ct()函数,它是连接跟踪中最重要的一个函数,上述功能主要就是由它完成的。
- /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
- //nfct字段将在ip_conntrack_in中被检测,表示当前数据包是否已被处理过
- static inline struct ip_conntrack *
- resolve_normal_ct(struct sk_buff *skb,
- struct ip_conntrack_protocol *proto,
- int *set_reply,
- unsigned int hooknum,
- enum ip_conntrack_info *ctinfo)
- {
- struct ip_conntrack_tuple tuple;
- struct ip_conntrack_tuple_hash *h;
- IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
- //从Skbuff中提取tuple信息
- if (!ip_ct_get_tuple(skb->nh.iph, skb->len, &tuple, proto))
- return NULL;
- /* look for tuple match */
- //在全局的连接跟踪链表中查询该tuple
- h = ip_conntrack_find_get(&tuple, NULL);
- if (!h) {
- //如果不存在,初始化一个
- h = init_conntrack(&tuple, proto, skb);
- if (!h)
- return NULL;
- if (IS_ERR(h))
- return (void *)h;
- if (ipctinit_callback)
- ipctinit_callback(h->ctrack);
- }
- /* It exists; we have (non-exclusive) reference. */
- //判断连接方向
- if (DIRECTION(h) == IP_CT_DIR_REPLY) {
- *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
- /* Please set reply bit if this packet OK */
- *set_reply = 1;
- } else {
- /* Once we've had two way comms, always ESTABLISHED. */
- if (test_bit(IPS_SEEN_REPLY_BIT, &h->ctrack->status)) {
- DEBUGP("ip_conntrack_in: normal packet for %p/n",
- h->ctrack);
- *ctinfo = IP_CT_ESTABLISHED;
- } else if (test_bit(IPS_EXPECTED_BIT, &h->ctrack->status)) {
- DEBUGP("ip_conntrack_in: related packet for %p/n",
- h->ctrack);
- *ctinfo = IP_CT_RELATED;
- } else {
- DEBUGP("ip_conntrack_in: new packet for %p/n",
- h->ctrack);
- *ctinfo = IP_CT_NEW;
- }
- *set_reply = 0;
- }
- //每个sk_buff都将与ip_conntrack的一个状态关联,所以从sk_buff可以得到相应ip_conntrack的状态,即数据包的状态
- skb->nfct = &h->ctrack->infos[*ctinfo];
- return h->ctrack;
- }
初始化一个连接跟踪时,会调用init_conntrack()函数
- /* Allocate a new conntrack: we return -ENOMEM if classification
- failed due to stress. Otherwise it really is unclassifiable. */
- static struct ip_conntrack_tuple_hash *
- init_conntrack(const struct ip_conntrack_tuple *tuple,
- struct ip_conntrack_protocol *protocol,
- struct sk_buff *skb)
- {
- struct ip_conntrack *conntrack;
- struct ip_conntrack_tuple repl_tuple;
- size_t hash;
- struct ip_conntrack_expect *expected;
- int i;
- static unsigned int drop_next = 0;
- //计算hash值的随机数种子
- if (!ip_conntrack_hash_rnd_initted) {
- get_random_bytes(&ip_conntrack_hash_rnd, 4);
- ip_conntrack_hash_rnd_initted = 1;
- }
- //计算hash值
- hash = hash_conntrack(tuple);
- //判断连接跟踪表是否已满
- if (ip_conntrack_max &
- atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
- /* Try dropping from random chain, or else from the
- chain about to put into (in case they're trying to
- bomb one hash chain). */
- unsigned int next = (drop_next++)%ip_conntrack_htable_size;
- if (!early_drop(&ip_conntrack_hash[next])
- && !early_drop(&ip_conntrack_hash[hash])) {
- if (net_ratelimit())
- printk(KERN_WARNING
- "ip_conntrack: table full, dropping"
- " packet./n");
- return ERR_PTR(-ENOMEM);
- }
- }
- //计算反向的tuple值
- if (!invert_tuple(&repl_tuple, tuple, protocol)) {
- DEBUGP("Can't invert tuple./n");
- return NULL;
- }
- //为连接分配空间
- conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
- if (!conntrack) {
- DEBUGP("Can't allocate conntrack./n");
- return ERR_PTR(-ENOMEM);
- }
- memset(conntrack, 0, sizeof(*conntrack));
- //设置计数器
- atomic_set(&conntrack->ct_general.use, 1);
- conntrack->ct_general.destroy = destroy_conntrack;
- //设置正反方向的tuple
- conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
- conntrack->tuplehash[IP_CT_DIR_ORIGINAL].ctrack = conntrack;
- conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
- conntrack->tuplehash[IP_CT_DIR_REPLY].ctrack = conntrack;
- for (i=0; i < IP_CT_NUMBER; i++)
- conntrack->infos[i].master = &conntrack->ct_general;
- if (!protocol->new(conntrack, skb->nh.iph, skb->len)) {
- kmem_cache_free(ip_conntrack_cachep, conntrack);
- return NULL;
- }
- /* Don't set timer yet: wait for confirmation */
- //初始化计数器
- init_timer(&conntrack->timeout);
- conntrack->timeout.data = (unsigned long)conntrack;
- conntrack->timeout.function = death_by_timeout;
- INIT_LIST_HEAD(&conntrack->sibling_list);
- WRITE_LOCK(&ip_conntrack_lock);
- /* Need finding and deleting of expected ONLY if we win race */
- //判断是不是期待的子连接
- READ_LOCK(&ip_conntrack_expect_tuple_lock);
- expected = LIST_FIND(&ip_conntrack_expect_list, expect_cmp,
- struct ip_conntrack_expect *, tuple);
- READ_UNLOCK(&ip_conntrack_expect_tuple_lock);
- /* If master is not in hash table yet (ie. packet hasn't left
- this machine yet), how can other end know about expected?
- Hence these are not the droids you are looking for (if
- master ct never got confirmed, we'd hold a reference to it
- and weird things would happen to future packets). */
- //主连接不在hash表中,这应该不是个正确被期待的子连接
- if (expected && !is_confirmed(expected->expectant))
- expected = NULL;
- //为一个主连接查找是否由注册的的helper函数
- /* Look up the conntrack helper for master connections only */
- if (!expected)
- conntrack->helper = ip_ct_find_helper(&repl_tuple);
- //等待已经超时了
- /* If the expectation is dying, then this is a looser. */
- if (expected
- && expected->expectant->helper->timeout
- && ! del_timer(&expected->timeout))
- expected = NULL;
- //是个被期待的子连接
- if (expected) {
- DEBUGP("conntrack: expectation arrives ct=%p exp=%p/n",
- conntrack, expected);
- /* Welcome, Mr. Bond. We've been expecting you... */
- __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
- conntrack->master = expected;
- expected->sibling = conntrack;
- //已经等到了,需要了从等待链表中删掉该连接
- LIST_DELETE(&ip_conntrack_expect_list, expected);
- expected->expectant->expecting--;
- nf_conntrack_get(&master_ct(conntrack)->infos[0]);
- }
- //没有直接将该连接加入hash表,而是加入unconfirmed链表
- /* Overload tuple linked list to put us in unconfirmed list. */
- list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list,
- &unconfirmed);
- atomic_inc(&ip_conntrack_count);
- WRITE_UNLOCK(&ip_conntrack_lock);
- if (expected && expected->expectfn)
- expected->expectfn(conntrack);
- return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
- }
由上面的 list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
可以看出,在init_conntrack函数中,并没有将一个新建连接的tuple放入全局的连接跟踪表(ip_conntrack_hash)中,那是在什么地方完成的这个任务呢?答案是__ip_conntrack_confirm函数:init_or_cleanup函数在NF_IP_POST_ROUTING中注册了ip_refrag函数,而在这个函数中最终会调用__ip_conntrack_confirm函数将tuple加入全局的连接跟踪表,同时把对应项从unconfirmed队列中删除,代码如下:
- /* Confirm a connection given skb->nfct; places it in hash table */
- Int __ip_conntrack_confirm(struct nf_ct_info *nfct)
- {
- unsigned int hash, repl_hash;
- struct ip_conntrack *ct;
- enum ip_conntrack_info ctinfo;
- ct = __ip_conntrack_get(nfct, &ctinfo);
- /* ipt_REJECT uses ip_conntrack_attach to attach related
- ICMP/TCP RST packets in other direction. Actual packet
- which created connection will be IP_CT_NEW or for an
- expected connection, IP_CT_RELATED. */
- if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
- return NF_ACCEPT;
- //计算正反tuple的hash值
- hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
- repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
- /* We're not in hash table, and we refuse to set up related
- connections for unconfirmed conns. But packet copies and
- REJECT will give spurious warnings here. */
- /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
- /* No external references means noone else could have
- confirmed us. */
- IP_NF_ASSERT(!is_confirmed(ct));
- DEBUGP("Confirming conntrack %p/n", ct);
- //给全局连接跟踪表加写锁
- WRITE_LOCK(&ip_conntrack_lock);
- /* See if there's one in the list already, including reverse:
- NAT could have grabbed it without realizing, since we're
- not in the hash. If there is, we lost race. */
- if (!LIST_FIND(&ip_conntrack_hash[hash],
- conntrack_tuple_cmp,
- struct ip_conntrack_tuple_hash *,
- &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
- && !LIST_FIND(&ip_conntrack_hash[repl_hash],
- conntrack_tuple_cmp,
- struct ip_conntrack_tuple_hash *,
- &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
- /* Remove from unconfirmed list */
- list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
- //将正反方向的TUPLE都加入全局跟踪表中
- list_prepend(&ip_conntrack_hash[hash],
- &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
- list_prepend(&ip_conntrack_hash[repl_hash],
- &ct->tuplehash[IP_CT_DIR_REPLY]);
- /* Timer relative to confirmation time, not original
- setting time, otherwise we'd get timer wrap in
- weird delay cases. */
- ct->timeout.expires += jiffies;
- add_timer(&ct->timeout);
- atomic_inc(&ct->ct_general.use);
- set_bit(IPS_CONFIRMED_BIT, &ct->status);
- WRITE_UNLOCK(&ip_conntrack_lock);
- return NF_ACCEPT;
- }
- WRITE_UNLOCK(&ip_conntrack_lock);
- return NF_DROP;
- }