这篇笔记记录的是服务器端收到TCP第三次握手的ACK包后的行为。
1. 数据包入口
在《TCP之服务器端接收SYN请求段》中,就有提到TCP对ACK包的处理是由tcp_v4_do_rcv()完成的,这里再次列出相关的核心代码:
/*
 * Abridged excerpt of tcp_v4_do_rcv(): only the TCP_LISTEN branch that is
 * relevant to the handshake is listed. The code for other socket states and
 * the "discard" label targeted below are omitted from this listing.
 *
 * @sk:  socket the segment was matched to (the listening socket in this path)
 * @skb: incoming segment
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
struct sock *rsk;
if (sk->sk_state == TCP_LISTEN) {
//Possible results of tcp_v4_hnd_req():
//  NULL:      nothing more to do with this segment, it is discarded
//  nsk == sk: the listening socket itself came back (e.g. a first-handshake SYN)
//  nsk != sk: another socket came back (e.g. the child created for the third-handshake ACK)
struct sock *nsk = tcp_v4_hnd_req(sk, skb);
if (!nsk)
goto discard;
if (nsk != sk) {
//Hand the segment to the child socket; a non-zero result means a RST
//must be sent, and rsk records which socket the reset is sent for
if (tcp_child_process(sk, nsk, skb)) {
rsk = nsk;
goto reset;
}
return 0;
}
}
reset:
tcp_v4_send_reset(rsk, skb);
}
2. tcp_v4_hnd_req()
该函数先搜索监听套接字的连接请求队列(半连接队列),未命中时再查找ehash表(以及可选的SYN Cookie校验),据此判断收到的是第一次握手的SYN包还是第三次握手的ACK包。
/*
 * Decide which socket should handle a segment that arrived on a listening
 * socket.
 *
 * @sk:  listening socket
 * @skb: incoming segment
 *
 * Returns:
 *   - whatever tcp_check_req() returns when a pending connection request
 *     matches the segment;
 *   - an established socket, locked with bh_lock_sock(), when one is found
 *     in the established hash;
 *   - NULL when the hash hit is a TIME_WAIT entry;
 *   - otherwise the listening socket itself (or, with CONFIG_SYN_COOKIES and
 *     a pure ACK, whatever cookie_v4_check() returns).
 */
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	const struct iphdr *ip4 = ip_hdr(skb);
	struct tcphdr *tcph = tcp_hdr(skb);
	struct request_sock **link;
	struct request_sock *pending;
	struct sock *found;

	/*
	 * A hit in the SYN request queue means the first two handshake steps
	 * already happened, so this is most likely the final ACK.  The segment
	 * still has to be validated, which is tcp_check_req()'s job.
	 */
	pending = inet_csk_search_req(sk, &link, tcph->source,
				      ip4->saddr, ip4->daddr);
	if (pending != NULL)
		return tcp_check_req(sk, skb, pending, link);

	/*
	 * NOTE(review): presumably this lookup covers the case where the child
	 * socket was already created and hashed by the time we get here (see
	 * the "Alas, it is possible again" remark in tcp_child_process()) -
	 * confirm against the callers.
	 */
	found = inet_lookup_established(sk->sk_net, &tcp_hashinfo, ip4->saddr,
					tcph->source, ip4->daddr, tcph->dest,
					inet_iif(skb));
	if (found != NULL) {
		if (found->sk_state == TCP_TIME_WAIT) {
			/*
			 * The entry is handled as a timewait socket: drop the
			 * reference taken by the lookup and give up on the
			 * segment.
			 */
			inet_twsk_put(inet_twsk(found));
			return NULL;
		}
		/* Returned locked; tcp_child_process() unlocks it. */
		bh_lock_sock(found);
		return found;
	}

#ifdef CONFIG_SYN_COOKIES
	/* Only a pure ACK (no RST, no SYN) is checked as a SYN cookie. */
	if (tcph->ack && !(tcph->rst || tcph->syn))
		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
	/* Everything else is handled by the listening socket. */
	return sk;
}
2.1 SYN请求队列的搜索
/*
 * Look up a pending connection request in the listening socket's SYN queue.
 *
 * @sk:    listening socket
 * @prevp: on a match, receives the address of the link pointer that points at
 *         the returned request (the hash bucket slot, or the dl_next field of
 *         the entry in front of it); left untouched when nothing matches
 * @rport: source port of the incoming segment
 * @raddr: source IP address of the incoming segment
 * @laddr: destination IP address of the incoming segment
 *
 * Returns the matching request_sock, or NULL when there is none.
 */
struct request_sock *inet_csk_search_req(const struct sock *sk,
					 struct request_sock ***prevp,
					 const __be16 rport, const __be32 raddr,
					 const __be32 laddr)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	/* SYN request queue of the listener */
	struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
	struct request_sock **link;
	struct request_sock *req;

	/* The bucket is chosen from the peer's address and port. */
	link = &lopt->syn_table[inet_synq_hash(raddr, rport, lopt->hash_rnd,
					       lopt->nr_table_entries)];

	while ((req = *link) != NULL) {
		const struct inet_request_sock *ireq = inet_rsk(req);

		/* Match on peer port, peer address, local address and family. */
		if (ireq->rmt_port == rport &&
		    ireq->rmt_addr == raddr &&
		    ireq->loc_addr == laddr &&
		    AF_INET_FAMILY(req->rsk_ops->family)) {
			/* The argument is spelled as before in case BUG_TRAP prints it. */
			BUG_TRAP(!req->sk);
			*prevp = link;
			return req;
		}
		link = &req->dl_next;
	}

	return NULL;
}
2.2 tcp_check_req()
正如注释所说,该函数处理属于SYN_RECV状态的套接字的数据包,在该状态下,最期望收到的是来自客户端的ACK报文,这样就可以完成三次握手了。
这里关于SYN_RECV状态要多说一句:监听套接字在收到SYN包时,自身并不会从TCP_LISTEN迁移到TCP_SYN_RECV,而是始终保持TCP_LISTEN。也就是说,在实现上,SYN_RECV状态是由SYN请求队列中的连接请求块(request_sock)来代表的。
/*
 * Process an incoming packet for SYN_RECV sockets represented
 * as a request_sock.
 */
//@sk: listening socket
//@skb: incoming segment
//@req: connection request found in the SYN request queue
//@prev: address of the link pointer that points at req inside the SYN
//       request queue (as filled in by inet_csk_search_req()); needed to
//       unlink req
//Returns the newly created child socket on success, the listening socket sk
//when the ACK number is unacceptable, or NULL when the segment needs no
//further processing.
struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
struct request_sock *req,
struct request_sock **prev)
{
const struct tcphdr *th = tcp_hdr(skb);
//Keep only the RST/SYN/ACK flag bits of the incoming segment
__be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
int paws_reject = 0;
struct tcp_options_received tmp_opt;
struct sock *child;
//If the header is longer than the fixed TCP header, parse the options; when a
//timestamp is present, run the PAWS check against the ts_recent kept in req
tmp_opt.saw_tstamp = 0;
if (th->doff > (sizeof(struct tcphdr)>>2)) {
tcp_parse_options(skb, &tmp_opt, 0);
if (tmp_opt.saw_tstamp) {
tmp_opt.ts_recent = req->ts_recent;
/* We do not store true stamp, but it is not required,
 * it can be estimated (approximately)
 * from another data.
 */
tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans);
paws_reject = tcp_paws_check(&tmp_opt, th->rst);
}
}
//A pure SYN carrying the peer's initial sequence number is a retransmitted
//SYN: answer by retransmitting the SYN+ACK through the request's
//rtx_syn_ack() callback (per the accompanying article, tcp_v4_send_synack()
//for IPv4 TCP) and return NULL, which ends processing of this segment
if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn &&
flg == TCP_FLAG_SYN && !paws_reject) {
req->rsk_ops->rtx_syn_ack(sk, req, NULL);
return NULL;
}
/* Further reproduces section "SEGMENT ARRIVES"
for state SYN-RECEIVED of RFC793.
It is broken, however, it does not work only
when SYNs are crossed.
You would think that SYN crossing is impossible here, since
we should have a SYN_SENT socket (from connect()) on our end,
but this is not true if the crossed SYNs were sent to both
ends by a malicious third party. We must defend against this,
and to do that we first verify the ACK (as per RFC793, page
36) and reset if it is invalid. Is this a true full defense?
To convince ourselves, let us consider a way in which the ACK
test can still pass in this 'malicious crossed SYNs' case.
Malicious sender sends identical SYNs (and thus identical sequence
numbers) to both A and B:
A: gets SYN, seq=7
B: gets SYN, seq=7
By our good fortune, both A and B select the same initial
send sequence number of seven :-)
A: sends SYN|ACK, seq=7, ack_seq=8
B: sends SYN|ACK, seq=7, ack_seq=8
So we are now A eating this SYN|ACK, ACK test passes. So
does sequence test, SYN is truncated, and thus we consider
it a bare ACK.
If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this
bare ACK. Otherwise, we create an established connection. Both
ends (listening sockets) accept the new incoming connection and try
to talk to each other. 8-)
Note: This case is both harmless, and rare. Possibility is about the
same as us discovering intelligent life on another planet tomorrow.
But generally, we should (RFC lies!) to accept ACK
from SYNACK both here and in tcp_rcv_state_process().
tcp_rcv_state_process() does not, hence, we do not too.
Note that the case is absolutely generic:
we cannot optimize anything here without
violating protocol. All the checks must be made
before attempt to create socket.
*/
/* RFC793 page 36: "If the connection is in any non-synchronized state ...
 * and the incoming segment acknowledges something not yet
 * sent (the segment carries an unacceptable ACK) ...
 * a reset is sent."
 *
 * Invalid ACK: reset will be sent by listening socket
 */
//The segment has ACK set but does not acknowledge exactly our SYN+ACK
//(snt_isn + 1). Nothing is done here: the listening socket is returned and,
//per the accompanying article, the RST is sent later from
//tcp_rcv_state_process()
if ((flg & TCP_FLAG_ACK) &&
(TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1))
return sk;
/* Also, it would be not so bad idea to check rcv_tsecr, which
 * is essentially ACK extension and too early or too late values
 * should cause reset in unsynchronized states.
 */
/* RFC793: "first check sequence number". */
//Segment rejected by PAWS or outside the receive window. Per the accompanying
//article, tcp_in_window(a,b,c,d) tests whether segment [a,b] lies within
//window [c,d]; the window used here starts at rcv_isn + 1
if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) {
/* Out of window: send ACK and drop. */
//Reply with an ACK (unless the segment is a RST) so the sender learns the
//expected sequence number quickly; per the accompanying article the callback
//is tcp_v4_reqsk_send_ack()
if (!(flg & TCP_FLAG_RST))
req->rsk_ops->send_ack(skb, req);
if (paws_reject)
NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
//Return NULL: processing of this segment ends here
return NULL;
}
/* In sequence, PAWS is OK. */
//Remember the peer's timestamp value in the request
if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1))
req->ts_recent = tmp_opt.rcv_tsval;
//The segment starts exactly at the peer's ISN, i.e. at the sequence number
//the SYN itself occupied. The window starts at rcv_isn + 1, so the SYN lies
//outside it and is treated as trimmed off: the SYN bit is cleared in the
//local copy flg only (the skb is not modified), which keeps the SYN check
//below from resetting the connection because of it
if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
/* Truncate SYN, it is out of window starting
at tcp_rsk(req)->rcv_isn + 1. */
flg &= ~TCP_FLAG_SYN;
}
/* RFC793: "second check the RST bit" and
 * "fourth, check the SYN bit"
 */
if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) {
TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
goto embryonic_reset;
}
/* ACK sequence verified above, just make sure ACK is
 * set. If ACK not set, just silently drop the packet.
 */
//No ACK flag: do nothing and return NULL, ending processing of this segment
if (!(flg & TCP_FLAG_ACK))
return NULL;
/* If TCP_DEFER_ACCEPT is set, drop bare ACK. */
//With TCP_DEFER_ACCEPT set, a bare ACK (end_seq == rcv_isn + 1, i.e. no
//payload) is not accepted; the request is only marked as acked and NULL is
//returned, so the handshake completes once a segment with data arrives.
//NOTE (article author's opinion): this option may hurt highly concurrent
//servers, since the SYN request queue / accept queue may fill up and new
//connection requests could not be taken in time
if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
inet_rsk(req)->acked = 1;
return NULL;
}
/* OK, ACK is valid, create big socket and
 * feed this segment to it. It will repeat all
 * the tests. THIS SEGMENT MUST MOVE SOCKET TO
 * ESTABLISHED STATE. If it will be dropped after
 * socket is created, wait for troubles.
 */
//All checks passed: create the child socket through the listening socket's
//syn_recv_sock() callback
child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb,
req, NULL);
//NULL means the child could not be created; in tcp_v4_syn_recv_sock() that
//happens when the accept queue is full, the route lookup fails, or the new
//socket cannot be allocated
if (child == NULL)
goto listen_overflow;
//Remove the request from the SYN request queue
inet_csk_reqsk_queue_unlink(sk, req, prev);
inet_csk_reqsk_queue_removed(sk, req);
//Queue the request together with its child on the accept queue, where it
//waits for the application to accept() it
inet_csk_reqsk_queue_add(sk, req, child);
//Return the new child socket
return child;
listen_overflow:
//The sysctl tcp_abort_on_overflow (/proc/sys/net/ipv4/tcp_abort_on_overflow)
//selects whether a RST is sent to the peer when the child cannot be created.
//When it is 0 (the default according to the accompanying article) no RST is
//sent, because the server may soon be able to serve the connection; the
//request is only marked as acked. For how acked is used, see the
//retransmission-timer discussion in the companion article on sending SYN+ACK
if (!sysctl_tcp_abort_on_overflow) {
inet_rsk(req)->acked = 1;
return NULL;
}
embryonic_reset:
NET_INC_STATS_BH(LINUX_MIB_EMBRYONICRSTS);
//Send a RST to the peer unless the incoming segment was itself a RST, i.e.
//a RST is never answered with a RST
if (!(flg & TCP_FLAG_RST))
req->rsk_ops->send_reset(sk, skb);
//On error the request is dropped from the SYN request queue
inet_csk_reqsk_queue_drop(sk, req, prev);
return NULL;
}
2.3 创建新的socket
上面看到,收到ACK后会调用监听套接字的inet_csk(sk)->icsk_af_ops->syn_recv_sock(),对于TCP,实际上是tcp_v4_syn_recv_sock(),参考socket创建过程中init()函数的调用可以确认这一点。
/*
 * The three way handshake has completed - the final ACK was accepted by the
 * caller - now create the new (child) socket for the connection.
 *
 * @sk:  listening socket
 * @skb: segment that completed the handshake
 * @req: connection request describing the peer
 * @dst: route to use, or NULL to look one up from @req
 *
 * Returns the new socket, already inserted into the established hash and
 * attached to the listener's bind bucket, or NULL when the accept queue is
 * full, no route is available, or the socket cannot be allocated.  On every
 * failure path @dst is released.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif

	/* Refuse to create the child when the accept queue is already full. */
	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	/* Look up a route unless the caller supplied one. */
	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
		goto exit;

	/* Allocate the child socket. */
	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(newsk, dst);

	/* Fill in the child's addressing and IP options from the request. */
	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	newinet->daddr	      = ireq->rmt_addr;
	newinet->rcv_saddr    = ireq->loc_addr;
	newinet->saddr	      = ireq->loc_addr;
	/* The options now belong to the child; detach them from the request. */
	newinet->opt	      = ireq->opt;
	ireq->opt	      = NULL;
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (newinet->opt)
		inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
	newinet->id = newtp->write_seq ^ jiffies;

	/* Path MTU / MSS initialisation. */
	tcp_mtup_init(newsk);
	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);

		/*
		 * Install the key under the peer's address, the same address
		 * it was looked up with above.  The listening socket's own
		 * daddr is not the peer of this connection, so a key stored
		 * under it would not be found for this peer on the child.
		 */
		if (newkey != NULL)
			tcp_v4_md5_do_add(newsk, newinet->daddr,
					  newkey, key->keylen);
	}
#endif

	/* Insert the child into the established (ehash) table. */
	__inet_hash_nolisten(newsk);
	/* Attach the child to the listener's bind bucket (shared local port). */
	__inet_inherit_port(sk, newsk);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
exit:
	NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
	dst_release(dst);
	return NULL;
}
tcp_create_openreq_child()会创建新的TCB,并且对其中大多数字段做基本的初始化,这里不再一一罗列,需要时到该函数中查询具体参数的值是如何确定的即可。
2.3.1 新套接字端口号的生成
/*
 * Attach a newly created child socket to the bind bucket of its listening
 * parent, so that both are owners of the same local port.
 *
 * @sk:    listening (parent) socket
 * @child: new socket to attach
 *
 * Caller must disable local BH processing.
 */
static inline void __inet_inherit_port(struct sock *sk, struct sock *child)
{
	struct inet_hashinfo *hinfo = sk->sk_prot->hashinfo;
	const int slot = inet_bhashfn(inet_sk(child)->num, hinfo->bhash_size);
	struct inet_bind_hashbucket *bucket = &hinfo->bhash[slot];
	struct inet_bind_bucket *parent_tb;

	spin_lock(&bucket->lock);

	/* Reuse the parent's bind bucket instead of creating a new one. */
	parent_tb = inet_csk(sk)->icsk_bind_hash;
	sk_add_bind_node(child, &parent_tb->owners);
	inet_csk(child)->icsk_bind_hash = parent_tb;

	spin_unlock(&bucket->lock);
}
3. tcp_child_process()
/*
 * Feed the handshake-completing segment to the child socket.
 *
 * @parent: listening socket
 * @child:  socket that should process the segment; it is unlocked with
 *          bh_unlock_sock() and one reference is dropped before returning
 * @skb:    incoming segment
 *
 * Returns the result of tcp_rcv_state_process() when the segment was
 * processed immediately, or 0 when it was queued on the child's backlog.
 */
int tcp_child_process(struct sock *parent, struct sock *child,
		      struct sk_buff *skb)
{
	const int old_state = child->sk_state;
	int rc = 0;

	if (sock_owned_by_user(child)) {
		/* Alas, it is possible again, because we do lookup
		 * in main socket hash table and lock on listening
		 * socket does not protect us more.
		 */
		/* The owner holds the child: queue the skb for later processing. */
		sk_add_backlog(child, skb);
	} else {
		/*
		 * Let the child process the ACK itself; this is what can move
		 * it from TCP_SYN_RECV to TCP_ESTABLISHED.
		 */
		rc = tcp_rcv_state_process(child, skb, tcp_hdr(skb),
					   skb->len);

		/* Wakeup parent, send SIGIO */
		/*
		 * The child left TCP_SYN_RECV: notify the listening socket,
		 * where a process may be blocked (e.g. in accept()).
		 */
		if (old_state == TCP_SYN_RECV &&
		    child->sk_state != TCP_SYN_RECV)
			parent->sk_data_ready(parent, 0);
	}

	bh_unlock_sock(child);
	sock_put(child);
	return rc;
}
关于队列操作、唤醒等操作在数据接收部分再来细谈,下面看看tcp_rcv_state_process()对ACK报文的处理。
3.1 tcp_rcv_state_process()
这里我们只关注其中TCP_SYN_RECV状态下对ACK报文的处理过程。
/*
 * Abridged excerpt of tcp_rcv_state_process(): only the handling of an ACK
 * received in TCP_SYN_RECV is listed. The "..." lines mark code left out of
 * this listing, which is also why the braces shown here do not balance.
 *
 * @sk:  socket processing the segment (the child socket in this path)
 * @skb: incoming segment
 * @th:  TCP header of the segment
 * @len: length passed by the caller (skb->len in tcp_child_process())
 */
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
struct tcphdr *th, unsigned len)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
int queued = 0;
...
/* step 5: check the ACK field */
if (th->ack) {
int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH);
switch (sk->sk_state) {
case TCP_SYN_RECV:
//Apart from the state change, the code below mostly initialises fields of
//the new connection; look them up individually when needed
if (acceptable) {
tp->copied_seq = tp->rcv_nxt;
smp_mb();
//Move from TCP_SYN_RECV to TCP_ESTABLISHED
tcp_set_state(sk, TCP_ESTABLISHED);
sk->sk_state_change(sk);
/* Note, that this wakeup is only for marginal
 * crossed SYN case. Passively open sockets
 * are not waked up, because sk->sk_sleep ==
 * NULL and sk->sk_socket == NULL.
 */
if (sk->sk_socket)
sk_wake_async(sk,
SOCK_WAKE_IO, POLL_OUT);
tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
tp->snd_wnd = ntohs(th->window) <<
tp->rx_opt.snd_wscale;
tcp_init_wl(tp, TCP_SKB_CB(skb)->ack_seq,
TCP_SKB_CB(skb)->seq);
/* tcp_ack considers this ACK as duplicate
 * and does not calculate rtt.
 * Fix it at least with timestamps.
 */
if (tp->rx_opt.saw_tstamp &&
tp->rx_opt.rcv_tsecr && !tp->srtt)
tcp_ack_saw_tstamp(sk, 0);
if (tp->rx_opt.tstamp_ok)
tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
/* Make sure socket is routed, for
 * correct metrics.
 */
icsk->icsk_af_ops->rebuild_header(sk);
tcp_init_metrics(sk);
tcp_init_congestion_control(sk);
/* Prevent spurious tcp_cwnd_restart() on
 * first data packet.
 */
tp->lsndtime = tcp_time_stamp;
tcp_mtup_init(sk);
tcp_initialize_rcv_mss(sk);
tcp_init_buffer_space(sk);
tcp_fast_path_on(tp);
} else {
//Unacceptable ACK: the non-zero return makes tcp_child_process() report
//failure, and tcp_v4_do_rcv() then sends a RST
return 1;
}
break;
} else
goto discard;
...
return 0;
}