最近走读学习Linux内核关于连接跟踪相关处理,上一篇中记录了连接建立的过程,感兴趣的同学可以参考: nf_conntrack(一)-连接建立 ;一个连接的完整性需要经过确认才能建立,因此这里学习连接确认相关处理;
如前所述,连接处理依附于Linux内核Netfilter框架,连接处理函数即HOOK处理函数,关于HOOK处理函数(以ipv4为例)有如下定义:
/* Netfilter hook registrations for IPv4 connection tracking.
 * Two entries create/track connections early in the path
 * (PRE_ROUTING for received packets, LOCAL_OUT for locally
 * generated ones); two entries confirm connections at the very
 * end of the path (POST_ROUTING for forwarded packets, LOCAL_IN
 * for packets delivered to this host). */
static const struct nf_hook_ops ipv4_conntrack_ops[] = {
{
/* track packets arriving from the network, before routing */
.hook = ipv4_conntrack_in,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP_PRI_CONNTRACK,
},
{
/* track locally generated packets */
.hook = ipv4_conntrack_local,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_CONNTRACK,
},
{
/* confirm connections for forwarded / outgoing packets */
.hook = nf_confirm,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP_PRI_CONNTRACK_CONFIRM,
},
{
/* confirm connections for packets delivered to this host */
.hook = nf_confirm,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_CONNTRACK_CONFIRM,
},
};
从上述nf_hook_ops的定义来看,涉及连接确认的HOOK点有两个:NF_INET_LOCAL_IN和NF_INET_POST_ROUTING;分别对应对到本机报文建立连接的确认处理和转发报文建立的连接的确认处理,后面分别简称为本地会话和转发会话,但对应的处理函数均为nf_confirm,我们稍后讨论。
已知同一HOOK点可以由不同模块挂载多个处理函数,函数依据优先级顺序逐个处理,优先级定义如下:
/* Hook priorities for IPv4: within one hook point, handlers run in
 * ascending numeric order, so smaller values execute first.
 * NF_IP_PRI_CONNTRACK_CONFIRM == INT_MAX guarantees connection
 * confirmation runs after every other handler at the same hook. */
enum nf_ip_hook_priorities {
NF_IP_PRI_FIRST = INT_MIN,
NF_IP_PRI_RAW_BEFORE_DEFRAG = -450,
NF_IP_PRI_CONNTRACK_DEFRAG = -400,
NF_IP_PRI_RAW = -300,
NF_IP_PRI_SELINUX_FIRST = -225,
NF_IP_PRI_CONNTRACK = -200,
NF_IP_PRI_MANGLE = -150,
NF_IP_PRI_NAT_DST = -100,
NF_IP_PRI_FILTER = 0,
NF_IP_PRI_SECURITY = 50,
NF_IP_PRI_NAT_SRC = 100,
NF_IP_PRI_SELINUX_LAST = 225,
NF_IP_PRI_CONNTRACK_HELPER = 300,
/* confirmation always runs last at its hook point */
NF_IP_PRI_CONNTRACK_CONFIRM = INT_MAX,
NF_IP_PRI_LAST = INT_MAX,
};
可见同一HOOK点,连接确认处理函数最后被执行。
连接确认
如上连接确认处理函数为nf_confirm(),依据HOOK点(又称钩子),首先确认下被调用执行的位置,本机报文和转发报文分别讨论。
在此之前,首先要对内核协议栈中报文转发路径的抉择有个基本的认知,主要过程如下:
1)网卡接收报文,通过DMA技术,将报文存放到网卡指定缓冲区(通常存在一个ring_buffer队列)
2)产生硬件中断,通知CPU有报文到达
3)硬中断处理,并触发收报软中断。
5)软中断处理,将报文从ring_buffer中拷贝到sk_buff中并上送协议栈。
5)协议栈报文合法性、完整性校验,
6)根据目的mac是否到本地确定走L2转发还是L3转发
7)L3转发,根据目的IP查路由,若目的IP是本地IP则为到本机的报文,本地报文上送
8)否则,根据路由信息查找对应的邻居信息确认出接口,报文转发
本机报文
ip_local_deliver函数内部有如下调用:
return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN,
net, NULL, skb, skb->dev, NULL,
ip_local_deliver_finish);
转发报文
ip_output函数内部有如下调用:
return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
net, sk, skb, indev, dev, ip_finish_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
nf_confirm函数
nf_confirm函数代码如下所示:
/* Hook handler registered at LOCAL_IN / POST_ROUTING: runs any conntrack
 * helper and sequence adjustment for the packet, then confirms the
 * connection (inserts it into the global hash) via nf_conntrack_confirm().
 * Returns a netfilter verdict (NF_ACCEPT / NF_DROP / helper verdict). */
unsigned int nf_confirm(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
const struct nf_conn_help *help;
enum ip_conntrack_info ctinfo;
unsigned int protoff;
struct nf_conn *ct;
bool seqadj_needed;
__be16 frag_off;
int start;
u8 pnum;
/* Fetch the conntrack entry and its state info attached to the skb */
ct = nf_ct_get(skb, &ctinfo);
if (!ct || in_vrf_postrouting(state))
return NF_ACCEPT;
/* Fetch the helper extension; helpers handle special or
 * application-layer protocols, e.g. FTP (parent/child connections) */
help = nfct_help(ct);
seqadj_needed = test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && !nf_is_loopback_packet(skb);
if (!help && !seqadj_needed)
return nf_conntrack_confirm(skb);
/* helper->help() do not expect ICMP packets */
/* a related reply (e.g. ICMP error for an existing connection):
 * skip helper/seqadj processing and confirm directly */
if (ctinfo == IP_CT_RELATED_REPLY)
return nf_conntrack_confirm(skb);
switch (nf_ct_l3num(ct)) {
case NFPROTO_IPV4:
/* transport header offset = network offset + IPv4 header length */
protoff = skb_network_offset(skb) + ip_hdrlen(skb);
break;
case NFPROTO_IPV6:
/* walk IPv6 extension headers to locate the transport header;
 * bail out (just confirm) on parse failure or non-first fragment */
pnum = ipv6_hdr(skb)->nexthdr;
start = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, &frag_off);
if (start < 0 || (frag_off & htons(~0x7)) != 0)
return nf_conntrack_confirm(skb);
protoff = start;
break;
default:
return nf_conntrack_confirm(skb);
}
/* run the helper extension, if one is attached */
if (help) {
const struct nf_conntrack_helper *helper;
int ret;
/* rcu_read_lock()ed by nf_hook */
helper = rcu_dereference(help->helper);
if (helper) {
ret = helper->help(skb,
protoff,
ct, ctinfo);
if (ret != NF_ACCEPT)
return ret;
}
}
/* apply TCP sequence number adjustment (e.g. after NAT mangling);
 * drop the packet if the adjustment fails */
if (seqadj_needed &&
!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) {
NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
return NF_DROP;
}
/* We've seen it coming out the other side: confirm it */
return nf_conntrack_confirm(skb);
}
确认的核心处理函数为nf_conntrack_confirm();代码走读发现该函数核心调用__nf_conntrack_confirm()函数,因此连接确认处理集中在__nf_conntrack_confirm函数中。
__nf_conntrack_confirm函数
/* Confirm a connection given skb; places it in hash table */
int
__nf_conntrack_confirm(struct sk_buff *skb)
{
unsigned int chainlen = 0, sequence, max_chainlen;
const struct nf_conntrack_zone *zone;
unsigned int hash, reply_hash;
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;
struct nf_conn_help *help;
struct hlist_nulls_node *n;
enum ip_conntrack_info ctinfo;
struct net *net;
int ret = NF_DROP;
ct = nf_ct_get(skb, &ctinfo);
net = nf_ct_net(ct);
/* ipt_REJECT uses nf_conntrack_attach to attach related
ICMP/TCP RST packets in other direction. Actual packet
which created connection will be IP_CT_NEW or for an
expected connection, IP_CT_RELATED. */
/* only the ORIGINAL direction confirms; reply-direction packets
 * attached by REJECT etc. are accepted without confirming */
if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
return NF_ACCEPT;
zone = nf_ct_zone(ct);
local_bh_disable();
/* retry hash computation until it is consistent with the
 * conntrack generation seqcount (table may be resized) */
do {
sequence = read_seqcount_begin(&nf_conntrack_generation);
/* reuse the hash saved before */
hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
hash = scale_hash(hash);
reply_hash = hash_conntrack(net,
&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
/* We are not in hash table, and we refuse to set up related
 * connections for unconfirmed conns. But packet copies and
 * REJECT will give spurious warnings here.
 *
 * Another skb with the same unconfirmed conntrack may
 * win the race. This may happen for bridge(br_flood)
 * or broadcast/multicast packets do skb_clone with
 * unconfirmed conntrack.
 */
if (unlikely(nf_ct_is_confirmed(ct))) {
WARN_ON_ONCE(1);
nf_conntrack_double_unlock(hash, reply_hash);
local_bh_enable();
return NF_DROP;
}
if (!nf_ct_ext_valid_pre(ct->ext)) {
NF_CT_STAT_INC(net, insert_failed);
goto dying;
}
/* We have to check the DYING flag after unlink to prevent
 * a race against nf_ct_get_next_corpse() possibly called from
 * user context, else we insert an already 'dead' hash, blocking
 * further use of that particular connection -JM.
 */
/* mark the connection as confirmed */
ct->status |= IPS_CONFIRMED;
if (unlikely(nf_ct_is_dying(ct))) {
NF_CT_STAT_INC(net, insert_failed);
goto dying;
}
/* randomized chain-length cap, mitigates hash-flooding attacks */
max_chainlen = MIN_CHAINLEN + get_random_u32_below(MAX_CHAINLEN);
/* See if there's one in the list already, including reverse:
NAT could have grabbed it without realizing, since we are
not in the hash. If there is, we lost race. */
/* the two directions are two half-connections; each direction's
 * tuple goes into its own hash chain, so before inserting, scan the
 * existing chains to check whether an equal entry already exists */
hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
zone, net))
goto out;
if (chainlen++ > max_chainlen)
goto chaintoolong;
}
chainlen = 0;
hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) {
if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
zone, net))
goto out;
if (chainlen++ > max_chainlen) {
chaintoolong:
NF_CT_STAT_INC(net, chaintoolong);
NF_CT_STAT_INC(net, insert_failed);
ret = NF_DROP;
goto dying;
}
}
/* Timer relative to confirmation time, not original
setting time, otherwise we had get timer wrap in weird delay cases. */
/* the connection timeout is relative to confirmation time, not the
 * time the connection was first created, so rebase it here */
ct->timeout += nfct_time_stamp;
__nf_conntrack_insert_prepare(ct);
/* Since the lookup is lockless, hash insertion must be done after
 * starting the timer and setting the CONFIRMED bit. The RCU barriers
 * guarantee that no other CPU can find the conntrack before the above
 * stores are visible.
 */
/* insert both direction tuples into their hash chains */
__nf_conntrack_hash_insert(ct, hash, reply_hash);
nf_conntrack_double_unlock(hash, reply_hash);
local_bh_enable();
/* ext area is still valid (rcu read lock is held,
 * but will go out of scope soon, we need to remove
 * this conntrack again.
 */
if (!nf_ct_ext_valid_post(ct->ext)) {
nf_ct_kill(ct);
NF_CT_STAT_INC_ATOMIC(net, drop);
return NF_DROP;
}
/* raise conntrack events for listeners (ctnetlink etc.) */
help = nfct_help(ct);
if (help && help->helper)
nf_conntrack_event_cache(IPCT_HELPER, ct);
nf_conntrack_event_cache(master_ct(ct) ?
IPCT_RELATED : IPCT_NEW, ct);
return NF_ACCEPT;
out:
/* lost a race with a concurrent insert: try to resolve the clash */
ret = nf_ct_resolve_clash(skb, h, reply_hash);
dying:
nf_conntrack_double_unlock(hash, reply_hash);
local_bh_enable();
return ret;
}
由此可以看出连接确认的目的主要作了如下几件事:
1)设置连接确认状态位。
2)设置连接超时时间
3)将正反方向五元组信息节点插入对应的hash链表中。
从连接确认处理函数被执行的位置及处理过程来看,连接确认在原始报文内核协议栈处理过程已经完成了,而不是等待有回复报文到来后才确认。
连接建立到连接确认的处理函数执行位置可以得出这样一个结论:入口报文建立连接,出口报文确认连接,这里的出口报文指的是被转发出去或被上送到应用层的报文。
本着学习的态度写下了这篇博客,可能存在误解或不清楚的地方,欢迎多提意见,共同学习讨论。