1. The TCP four-way close goes through the following state transitions:
1) The client initiates the first FIN via the close() system call and moves itself to TCP_FIN_WAIT1;
2) On receiving the FIN, the server moves to TCP_CLOSE_WAIT, arms a delayed ACK, and notifies the server application;
3) The server application calls close(), which sends the server's FIN together with the ACK for the client's earlier FIN; the server moves to TCP_LAST_ACK;
4) On receiving the FIN+ACK, the client first moves to TCP_FIN_WAIT2, replies with the final ACK, and enters TCP_TIME_WAIT: a tw socket is created that takes over the old socket's identity, the old socket is destroyed, and a timeout timer is started; only when the TIME_WAIT timeout fires is the tw socket reclaimed;
5) On receiving the final ACK, the server reclaims its socket resources.
application                            application
     | close() syscall                      | close() syscall
     v                                      v
  client                                 server

                    FIN
              ----------------->
TCP_FIN_WAIT1                        TCP_CLOSE_WAIT
                    ACK
              <-----------------
TCP_FIN_WAIT2
                    FIN              (close() called)
              <-----------------     TCP_LAST_ACK
TCP_TIME_WAIT
                    ACK
              ----------------->     tcp_done() (socket resources freed)
(2MSL timeout,
 tw socket freed)
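These transitions are visible from user space. Below is a minimal loopback sketch (Linux-only, error handling omitted): the client end is closed, and the server socket's state is then read back via getsockopt(TCP_INFO); tcpi_state should report 8, which is TCP_CLOSE_WAIT in the kernel's state enum.

#include <stdio.h>
#include <unistd.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

int main(void)
{
    struct sockaddr_in a = { .sin_family = AF_INET };
    a.sin_addr.s_addr = htonl(INADDR_LOOPBACK);

    int ls = socket(AF_INET, SOCK_STREAM, 0);
    bind(ls, (struct sockaddr *)&a, sizeof(a));
    listen(ls, 1);
    socklen_t alen = sizeof(a);
    getsockname(ls, (struct sockaddr *)&a, &alen); /* learn the ephemeral port */

    int c = socket(AF_INET, SOCK_STREAM, 0);
    connect(c, (struct sockaddr *)&a, sizeof(a));
    int s = accept(ls, NULL, NULL);

    close(c);           /* first FIN: client enters TCP_FIN_WAIT1/2 */
    usleep(100 * 1000); /* let the FIN reach the server */

    struct tcp_info ti;
    socklen_t tlen = sizeof(ti);
    getsockopt(s, IPPROTO_TCP, TCP_INFO, &ti, &tlen);
    printf("server tcpi_state = %u (TCP_CLOSE_WAIT = 8)\n", ti.tcpi_state);

    close(s);           /* second FIN: server moves to TCP_LAST_ACK */
    return 0;
}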
From the flow above, the side that initiates the teardown does not simply free its socket after the last step: it creates a time-wait socket that inherits the old socket's identity and is only reclaimed when the timer expires, whereas the server frees its socket immediately on receiving the last ACK, with no waiting. Why, then, does the closing side need the TIME_WAIT state?
1) To survive loss of the final ACK: if the server never receives an ACK for its FIN it retransmits the FIN, and if the client's socket had already been freed the retransmission would find nobody to handle it and would be answered with an error (a RST);
2) To keep a new connection on the same 4-tuple from being set up while segments of the old connection may still be in flight.
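A practical consequence of point 2): the local port stays busy while a socket lingers in TIME_WAIT, which is why a restarted server often fails to bind. A minimal loopback sketch (error handling omitted); the final bind() should fail with EADDRINUSE while the tw socket exists:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <netinet/in.h>
#include <sys/socket.h>

int main(void)
{
    struct sockaddr_in a = { .sin_family = AF_INET };
    a.sin_addr.s_addr = htonl(INADDR_LOOPBACK);

    int ls = socket(AF_INET, SOCK_STREAM, 0);
    bind(ls, (struct sockaddr *)&a, sizeof(a));
    listen(ls, 1);
    socklen_t alen = sizeof(a);
    getsockname(ls, (struct sockaddr *)&a, &alen); /* remember the port */

    int c = socket(AF_INET, SOCK_STREAM, 0);
    connect(c, (struct sockaddr *)&a, sizeof(a));
    int s = accept(ls, NULL, NULL);

    close(s);             /* the server closes first ...             */
    close(c);             /* ... so its 4-tuple ends up in TIME_WAIT */
    close(ls);
    usleep(100 * 1000);   /* let the FIN/ACK exchange finish */

    int ls2 = socket(AF_INET, SOCK_STREAM, 0);
    /* without SO_REUSEADDR this bind fails while TIME_WAIT lasts: */
    if (bind(ls2, (struct sockaddr *)&a, sizeof(a)) < 0)
        printf("bind: %s\n", strerror(errno)); /* EADDRINUSE */
    return 0;
}

Setting SO_REUSEADDR on ls2 before the bind makes it succeed even though the tw socket is still alive.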
2. The client-side close() flow
sys_close
__close_fd
fput
task_work_add (queues a work item on the current task; on return to user space the flag is checked and the real handler ____fput runs via do_notify_resume -> tracehook_notify_resume -> task_work_run)
____fput
file->f_op->release
sock_close
inet_release
tcp_close
void tcp_close(struct sock *sk, long timeout)
{
struct sk_buff *skb;
int data_was_unread = 0;
int state;
lock_sock(sk);
sk->sk_shutdown = SHUTDOWN_MASK;
//if still in LISTEN state, drop the pending connection requests queued on icsk_accept_queue
if (sk->sk_state == TCP_LISTEN) {
tcp_set_state(sk, TCP_CLOSE);
/* Special case. */
inet_csk_listen_stop(sk);
goto adjudge_to_death;
}
/* We need to flush the recv. buffs. We do this only on the
* descriptor close, not protocol-sourced closes, because the
* reader process may not have drained the data yet!
*/
//drain the skbs buffered on the receive queue
while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq;
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
len--;
data_was_unread += len;
__kfree_skb(skb);
}
sk_mem_reclaim(sk);
/* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
if (sk->sk_state == TCP_CLOSE)
goto adjudge_to_death;
/* As outlined in RFC 2525, section 2.17, we send a RST here because
* data was lost. To witness the awful effects of the old behavior of
* always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk
* GET in an FTP client, suspend the process, wait for the client to
* advertise a zero window, then kill -9 the FTP client, wheee...
* Note: timeout is always zero in such a case.
*/
if (unlikely(tcp_sk(sk)->repair)) {
sk->sk_prot->disconnect(sk, 0);
} else if (data_was_unread) {
//unread data still sits in the receive queue at close() time: send a RST
/* Unread data was tossed, zap the connection. */
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
tcp_set_state(sk, TCP_CLOSE);
tcp_send_active_reset(sk, sk->sk_allocation);
} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
/* Check zero linger _after_ checking for unread data. */
sk->sk_prot->disconnect(sk, 0);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
} else if (tcp_close_state(sk)) { //first state transition here: TCP_ESTABLISHED ---> TCP_FIN_WAIT1
/* We FIN if the application ate all the data before
* zapping the connection.
*/
/* RED-PEN. Formally speaking, we have broken TCP state
* machine. State transitions:
*
* TCP_ESTABLISHED -> TCP_FIN_WAIT1
* TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
* TCP_CLOSE_WAIT -> TCP_LAST_ACK
*
* are legal only when FIN has been sent (i.e. in window),
* rather than queued out of window. Purists blame.
*
* F.e. "RFC state" is ESTABLISHED,
* if Linux state is FIN-WAIT-1, but FIN is still not sent.
*
* The visible declinations are that sometimes
* we enter time-wait state, when it is not required really
* (harmless), do not send active resets, when they are
* required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
* they look as CLOSING or LAST_ACK for Linux)
* Probably, I missed some more holelets.
* --ANK
* XXX (TFO) - To start off we don't support SYN+ACK+FIN
* in a single packet! (May consider it later but will
* probably need API support or TCP_CORK SYN-ACK until
* data is written and socket is closed.)
*/
tcp_send_fin(sk);
}
...
}
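The data_was_unread branch above is easy to trigger from user space: close a socket while received data is still unread and the kernel sends a RST instead of a FIN, so the connection is aborted and no TIME_WAIT is entered. A loopback sketch (error handling omitted); the client's read() should fail with ECONNRESET:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <netinet/in.h>
#include <sys/socket.h>

int main(void)
{
    struct sockaddr_in a = { .sin_family = AF_INET };
    a.sin_addr.s_addr = htonl(INADDR_LOOPBACK);

    int ls = socket(AF_INET, SOCK_STREAM, 0);
    bind(ls, (struct sockaddr *)&a, sizeof(a));
    listen(ls, 1);
    socklen_t alen = sizeof(a);
    getsockname(ls, (struct sockaddr *)&a, &alen);

    int c = socket(AF_INET, SOCK_STREAM, 0);
    connect(c, (struct sockaddr *)&a, sizeof(a));
    int s = accept(ls, NULL, NULL);

    write(c, "x", 1);        /* queue one byte the server never reads */
    usleep(100 * 1000);
    close(s);                /* data_was_unread -> tcp_send_active_reset() */

    usleep(100 * 1000);
    char buf;
    if (read(c, &buf, 1) < 0)
        printf("read: %s (peer aborted with RST)\n", strerror(errno));
    return 0;
}

The zero-linger branch right below data_was_unread gives the same abortive close explicitly: setsockopt(SO_LINGER) with l_onoff = 1 and l_linger = 0 also makes close() send a RST.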
3. The server receives the FIN and first moves to TCP_CLOSE_WAIT
tcp_data_queue
tcp_fin
void tcp_fin(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
inet_csk_schedule_ack(sk);
sk->sk_shutdown |= RCV_SHUTDOWN;
sock_set_flag(sk, SOCK_DONE);
switch (sk->sk_state) {
case TCP_SYN_RECV:
case TCP_ESTABLISHED:
/* Move to CLOSE_WAIT */
//move from TCP_ESTABLISHED to TCP_CLOSE_WAIT and enable delayed ack (pingpong = 1)
tcp_set_state(sk, TCP_CLOSE_WAIT);
inet_csk(sk)->icsk_ack.pingpong = 1;
break;
case TCP_CLOSE_WAIT:
case TCP_CLOSING:
/* Received a retransmission of the FIN, do
* nothing.
*/
break;
case TCP_LAST_ACK:
/* RFC793: Remain in the LAST-ACK state. */
break;
case TCP_FIN_WAIT1:
/* This case occurs when a simultaneous close
* happens, we must ack the received FIN and
* enter the CLOSING state.
*/
tcp_send_ack(sk);
tcp_set_state(sk, TCP_CLOSING);
break;
case TCP_FIN_WAIT2:
/* Received a FIN -- send ACK and enter TIME_WAIT. */
tcp_send_ack(sk);
tcp_time_wait(sk, TCP_TIME_WAIT, 0);
break;
default:
/* Only TCP_LISTEN and TCP_CLOSE are left, in these
* cases we should never reach this piece of code.
*/
pr_err("%s: Impossible, sk->sk_state=%d\n",
__func__, sk->sk_state);
break;
}
/* It _is_ possible, that we have something out-of-order _after_ FIN.
* Probably, we should reset in this case. For now drop them.
*/
skb_rbtree_purge(&tp->out_of_order_queue);
if (tcp_is_sack(tp))
tcp_sack_reset(&tp->rx_opt);
sk_mem_reclaim(sk);
if (!sock_flag(sk, SOCK_DEAD)) {
//wake the user process; once it resumes, tcp_recvmsg() notices the FIN, and eventually the application calls close() to send its own FIN
sk->sk_state_change(sk);
/* Do not send POLL_HUP for half duplex close. */
if (sk->sk_shutdown == SHUTDOWN_MASK ||
sk->sk_state == TCP_CLOSE)
sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
else
sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
}
}
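The sk_state_change()/sk_wake_async() calls at the end are what a blocked server process actually observes: the FIN makes the socket readable, and the pending read() returns 0 (EOF). That is the application's cue to call close(), which is exactly the transition in section 4 below. A minimal sketch (loopback, error handling omitted):

#include <poll.h>
#include <stdio.h>
#include <unistd.h>
#include <netinet/in.h>
#include <sys/socket.h>

int main(void)
{
    struct sockaddr_in a = { .sin_family = AF_INET };
    a.sin_addr.s_addr = htonl(INADDR_LOOPBACK);

    int ls = socket(AF_INET, SOCK_STREAM, 0);
    bind(ls, (struct sockaddr *)&a, sizeof(a));
    listen(ls, 1);
    socklen_t alen = sizeof(a);
    getsockname(ls, (struct sockaddr *)&a, &alen);

    int c = socket(AF_INET, SOCK_STREAM, 0);
    connect(c, (struct sockaddr *)&a, sizeof(a));
    int s = accept(ls, NULL, NULL);

    close(c);                              /* client FIN -> server CLOSE_WAIT */

    struct pollfd pfd = { .fd = s, .events = POLLIN };
    poll(&pfd, 1, -1);                     /* tcp_fin()'s wakeup makes s readable */

    char buf;
    ssize_t n = read(s, &buf, 1);          /* returns 0: EOF, the peer closed */
    printf("read() = %zd -> peer closed, calling close()\n", n);
    close(s);                              /* CLOSE_WAIT -> LAST_ACK */
    return 0;
}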
4. The server application calls close() and sends its FIN to the client
tcp_close
tcp_close_state (moves the server from TCP_CLOSE_WAIT to TCP_LAST_ACK)
tcp_send_fin
5. The client receives the server's FIN+ACK
tcp_v4_do_rcv
tcp_rcv_state_process (moves the state from TCP_FIN_WAIT1 to TCP_FIN_WAIT2)
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
{
...
/* step 7: process the segment text */
switch (sk->sk_state) {
case TCP_CLOSE_WAIT:
case TCP_CLOSING:
case TCP_LAST_ACK:
if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
break;
case TCP_FIN_WAIT1:
case TCP_FIN_WAIT2:
/* RFC 793 says to queue data in these states,
* RFC 1122 says we MUST send a reset.
* BSD 4.4 also does reset.
*/
if (sk->sk_shutdown & RCV_SHUTDOWN) {
if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
tcp_reset(sk);
return 1;
}
}
/* Fall through */
//note: the TCP_FIN_WAIT1/TCP_FIN_WAIT2 cases above do not break, so tcp_data_queue() also runs in those states
case TCP_ESTABLISHED:
tcp_data_queue(sk, skb);
queued = 1;
break;
}
/* tcp_data could move socket to TIME-WAIT */
if (sk->sk_state != TCP_CLOSE) {
tcp_data_snd_check(sk);
tcp_ack_snd_check(sk);
}
if (!queued) {
discard:
tcp_drop(sk, skb);
}
return 0;
}
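The deliberately missing breaks mean a socket in TCP_FIN_WAIT1/TCP_FIN_WAIT2 still queues incoming segments: sending our FIN only half-closes the connection. From user space the cleanest way to see this is shutdown(SHUT_WR), which sends the FIN but keeps the descriptor readable. Sketch (loopback, error handling omitted):

#include <stdio.h>
#include <unistd.h>
#include <netinet/in.h>
#include <sys/socket.h>

int main(void)
{
    struct sockaddr_in a = { .sin_family = AF_INET };
    a.sin_addr.s_addr = htonl(INADDR_LOOPBACK);

    int ls = socket(AF_INET, SOCK_STREAM, 0);
    bind(ls, (struct sockaddr *)&a, sizeof(a));
    listen(ls, 1);
    socklen_t alen = sizeof(a);
    getsockname(ls, (struct sockaddr *)&a, &alen);

    int c = socket(AF_INET, SOCK_STREAM, 0);
    connect(c, (struct sockaddr *)&a, sizeof(a));
    int s = accept(ls, NULL, NULL);

    shutdown(c, SHUT_WR);                  /* our FIN: c -> TCP_FIN_WAIT1/2 */
    write(s, "late", 4);                   /* the server may still send */

    char buf[8];
    ssize_t n = read(c, buf, sizeof(buf)); /* queued by tcp_data_queue() in FIN_WAIT2 */
    printf("read %zd bytes after sending our FIN\n", n);
    return 0;
}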
tcp_data_queue
tcp_fin
tcp_send_ack (replies the final ack to the server)
tcp_time_wait (moves the state from TCP_FIN_WAIT2 to TCP_TIME_WAIT and starts the timeout timer; the tcp resources are reclaimed on expiry)
void tcp_time_wait(struct sock *sk, int state, int timeo)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcp_sock *tp = tcp_sk(sk);
struct inet_timewait_sock *tw;
struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
//allocate a time-wait socket, already in state TCP_TIME_WAIT, and copy the old sk's identity into it
tw = inet_twsk_alloc(sk, tcp_death_row, state);
if (tw) {
struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
struct inet_sock *inet = inet_sk(sk);
tw->tw_transparent = inet->transparent;
tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
tcptw->tw_rcv_nxt = tp->rcv_nxt;
tcptw->tw_snd_nxt = tp->snd_nxt;
tcptw->tw_rcv_wnd = tcp_receive_window(tp);
tcptw->tw_ts_recent = tp->rx_opt.ts_recent;
tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
tcptw->tw_ts_offset = tp->tsoffset;
tcptw->tw_last_oow_ack_time = 0;
#if IS_ENABLED(CONFIG_IPV6)
if (tw->tw_family == PF_INET6) {
struct ipv6_pinfo *np = inet6_sk(sk);
tw->tw_v6_daddr = sk->sk_v6_daddr;
tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
tw->tw_tclass = np->tclass;
tw->tw_flowlabel = be32_to_cpu(np->flow_label & IPV6_FLOWLABEL_MASK);
tw->tw_ipv6only = sk->sk_ipv6only;
}
#endif
#ifdef CONFIG_TCP_MD5SIG
/*
* The timewait bucket does not have the key DB from the
* sock structure. We just make a quick copy of the
* md5 key being used (if indeed we are using one)
* so the timewait ack generating code has the key.
*/
do {
struct tcp_md5sig_key *key;
tcptw->tw_md5_key = NULL;
key = tp->af_specific->md5_lookup(sk, sk);
if (key) {
tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC);
if (tcptw->tw_md5_key && !tcp_alloc_md5sig_pool())
BUG();
}
} while (0);
#endif
/* Get the TIME_WAIT timeout firing. */
if (timeo < rto)
timeo = rto;
tw->tw_timeout = TCP_TIMEWAIT_LEN;
if (state == TCP_TIME_WAIT)
timeo = TCP_TIMEWAIT_LEN;
//arm the timer (60s when the state is TCP_TIME_WAIT)
inet_twsk_schedule(tw, timeo);
/* Linkage updates. */
__inet_twsk_hashdance(tw, sk, &tcp_hashinfo);
inet_twsk_put(tw);
} else {
/* Sorry, if we're out of memory, just CLOSE this
* socket up. We've got bigger problems than
* non-graceful socket closings.
*/
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW);
}
tcp_update_metrics(sk);
//release the old sk's resources
tcp_done(sk);
}
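One detail worth decoding in tcp_time_wait(): the rto expression (icsk_rto << 2) - (icsk_rto >> 1) is just 4*RTO - RTO/2 = 3.5*RTO, used as a lower bound on the timeout when the function is entered with a short timeo (i.e. not in the plain TCP_TIME_WAIT case, where timeo is forced to TCP_TIMEWAIT_LEN). A quick standalone check of the arithmetic:

#include <stdio.h>

int main(void)
{
    /* (rto << 2) - (rto >> 1) == 4*rto - rto/2 == 3.5*rto */
    for (int rto = 200; rto <= 1000; rto += 200)
        printf("rto=%4d -> %4d (3.5 * rto = %.1f)\n",
               rto, (rto << 2) - (rto >> 1), 3.5 * rto);
    return 0;
}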
Finally, the time-wait timer expires and its handler tw_timer_handler reclaims the time-wait socket:
static void tw_timer_handler(unsigned long data)
{
struct inet_timewait_sock *tw = (struct inet_timewait_sock *)data;
if (tw->tw_kill)
__NET_INC_STATS(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED);
else
__NET_INC_STATS(twsk_net(tw), LINUX_MIB_TIMEWAITED);
inet_twsk_kill(tw);
}
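Between __inet_twsk_hashdance() and the moment tw_timer_handler() fires, the tw socket is visible system-wide; the kernel exports the current count in the "tw" field of /proc/net/sockstat. A minimal reader, assuming the usual "TCP: inuse ... tw N ..." line layout:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
    FILE *f = fopen("/proc/net/sockstat", "r");
    if (!f)
        return 1;

    char line[256];
    while (fgets(line, sizeof(line), f)) {
        char *p;
        /* only the "TCP:" line carries the timewait counter */
        if (strncmp(line, "TCP:", 4) == 0 && (p = strstr(line, " tw ")))
            printf("sockets in TIME_WAIT: %ld\n", strtol(p + 4, NULL, 10));
    }
    fclose(f);
    return 0;
}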
6. The server receives the ACK and reclaims the socket
tcp_rcv_state_process
tcp_done (in TCP_LAST_ACK the socket is released immediately)