在tcp socket初始化的时候,会初始化设置三个定时器:icsk_retransmit_timer、icsk_delack_timer、sk_timer,本文主要描述下这三种tcp定时器。
/* Called at TCP socket initialisation: installs the three TCP timers
 * (retransmit, delayed-ACK, keepalive) with the handlers supplied by
 * the caller. Each timer gets the socket itself as callback argument. */
void inet_csk_init_xmit_timers(struct sock *sk,
void (*retransmit_handler)(unsigned long),
void (*delack_handler)(unsigned long),
void (*keepalive_handler)(unsigned long))
{
struct inet_connection_sock *icsk = inet_csk(sk);
setup_timer(&icsk->icsk_retransmit_timer, retransmit_handler,
(unsigned long)sk);
setup_timer(&icsk->icsk_delack_timer, delack_handler,
(unsigned long)sk);
setup_timer(&sk->sk_timer, keepalive_handler, (unsigned long)sk);
/* No timer event is pending yet. */
icsk->icsk_pending = icsk->icsk_ack.pending = 0;
}
1、icsk_retransmit_timer
先看下retransmit_timer的超时处理函数tcp_write_timer_handler,可以看到会根据当前定时器的event pending类型,执行不同的处理函数;
/* Expiry handler behind icsk_retransmit_timer. Several logical timers
 * are multiplexed onto this one hardware timer; the pending event type
 * (icsk_pending) selects which handler runs: RACK reordering timeout,
 * tail loss probe, RTO retransmit, or zero-window probe. */
void tcp_write_timer_handler(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
int event;
/* Nothing to do on CLOSE/LISTEN sockets or with no event pending. */
if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) ||
!icsk->icsk_pending)
goto out;
/* Fired before the deadline (e.g. rescheduled while the socket was
 * locked): re-arm for the remaining time instead of handling now. */
if (time_after(icsk->icsk_timeout, jiffies)) {
sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
goto out;
}
event = icsk->icsk_pending;
switch (event) {
case ICSK_TIME_REO_TIMEOUT:
tcp_rack_reo_timeout(sk);
break;
case ICSK_TIME_LOSS_PROBE:
tcp_send_loss_probe(sk);
break;
case ICSK_TIME_RETRANS:
icsk->icsk_pending = 0;
tcp_retransmit_timer(sk);
break;
case ICSK_TIME_PROBE0:
icsk->icsk_pending = 0;
tcp_probe_timer(sk);
break;
}
out:
sk_mem_reclaim(sk);
}
pending event主要包括以下几种:
/* Pending-event types multiplexed onto the connection's xmit timers. */
#define ICSK_TIME_RETRANS 1 /* Retransmit timer */
#define ICSK_TIME_DACK 2 /* Delayed ack timer */
#define ICSK_TIME_PROBE0 3 /* Zero window probe timer */
#define ICSK_TIME_EARLY_RETRANS 4 /* Early retransmit timer */
#define ICSK_TIME_LOSS_PROBE 5 /* Tail loss probe timer */
#define ICSK_TIME_REO_TIMEOUT 6 /* Reordering timer */
ICSK_TIME_RETRANS
作用:发送完skb后,启用超时定时器,定时器周期为rto,如果定时器超时还未收到ack,则认为丢包,重传数据包;
启动时机:
1)、tcp_event_new_data_sent
tcp_write_xmit每次发送一个新数据时都会进入tcp_event_new_data_sent流程,这里判断新发送的skb是否为write队列上的第一个skb,如果是的话那就重新启动retrans定时器;
/* Bookkeeping after a new (never-before-sent) data skb goes out:
 * advance send_head, update snd_nxt and packets_out, and re-arm the
 * retransmit timer when this skb (re)opens the in-flight window. */
static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
unsigned int prior_packets = tp->packets_out;
tcp_advance_send_head(sk, skb);
tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
tp->packets_out += tcp_skb_pcount(skb);
//If packets_out was 0 before, this skb is the first in-flight skb on
//the write queue, so the retrans timer must be restarted: the
//previously armed timer was already cleared on the
//tcp_ack->tcp_clean_rtx_queue->tcp_rearm_rto path.
if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
tcp_rearm_rto(sk);
NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
tcp_skb_pcount(skb));
}
2)、tcp_rearm_rto
tcp_ack->tcp_clean_rtx_queue->tcp_rearm_rto流程里,判断当前write队列还有数据未ack,则重置retrans定时器;
/* (Re)arm or clear the RTO retransmit timer depending on whether data
 * is still in flight. Called e.g. from tcp_ack->tcp_clean_rtx_queue. */
void tcp_rearm_rto(struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
/* If the retrans timer is currently being used by Fast Open
 * for SYN-ACK retrans purpose, stay put.
 */
if (tp->fastopen_rsk)
return;
//No packets in flight: nothing on the write queue is waiting for an
//ack any more, so clear the timer.
if (!tp->packets_out) {
inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
} else {
//Re-arm the retrans timer. packets_out is non-zero here, so the
//write queue definitely still holds skbs: take the head skb and use
//the gap between its RTO deadline and the current time as the new
//timeout value.
u32 rto = inet_csk(sk)->icsk_rto;
/* Offset the time elapsed after installing regular RTO */
if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
struct sk_buff *skb = tcp_write_queue_head(sk);
const u32 rto_time_stamp =
tcp_skb_timestamp(skb) + rto;
s32 delta = (s32)(rto_time_stamp - tcp_time_stamp);
/* delta may not be positive if the socket is locked
 * when the retrans timer fires and is rescheduled.
 */
if (delta > 0)
rto = delta;
}
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
TCP_RTO_MAX);
}
}
关闭时机:
tcp_ack->tcp_clean_rtx_queue->tcp_rearm_rto流程里,判断当前write队列已经没有数据了,则清除retrans定时器;
ICSK_TIME_PROBE0
作用:当接收端通告接收窗口为0,发送端会暂停发送数据包,接收端等有足够的接收窗口时会重新通知发送端,这个通知可能是放在一个ack包也可能是放在一个数据包里,如果是ack包,由于是不可靠的,因此可能丢失,这样的话发送端就有可能会收不到接收端发送的接收窗口更新消息,导致无法发送新数据。因此发送端判断接收端接收窗口为0时,本地启动一个定时器,定时器超时发送一个探测包到接收端,等接收端回复时就自然会携带新的接收窗口信息。
启动时机:
1)、tcp_write_xmit返回失败,数据包发送不出去;
/* Push any pending frames out; if queued data could not be sent at
 * all, arm the zero-window probe (probe0) timer. */
void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
int nonagle)
{
/* If we are closed, the bytes will have to remain here.
 * In time closedown will finish, we empty the write queue and
 * all will be happy.
 */
if (unlikely(sk->sk_state == TCP_CLOSE))
return;
//A non-zero return from tcp_write_xmit means packets_out is 0 while
//sk_send_head is set: this end has data to send but cannot send it,
//likely because the peer's receive window is closed, so start the
//probe0 timer.
if (tcp_write_xmit(sk, cur_mss, nonagle, 0,
sk_gfp_mask(sk, GFP_ATOMIC)))
tcp_check_probe_timer(sk);
}
2)、tcp_ack时,判断packets_out为0,但是sk_send_head不为0,表示本端数据发送不出去;
/* Excerpt (elided) of the incoming-ACK handler: when the ACK leaves
 * nothing in flight but send_head still points at unsent data, control
 * reaches tcp_ack_probe(), which manages the zero-window probe timer. */
static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
{
...
/* We passed data and got it acked, remove any soft error
 * log. Something worked...
 */
sk->sk_err_soft = 0;
icsk->icsk_probes_out = 0;
tp->rcv_tstamp = tcp_time_stamp;
/* No packets were in flight before this ACK: jump to the
 * zero-window/probe handling below. */
if (!prior_packets)
goto no_queue;
...
no_queue:
/* If data was DSACKed, see if we can undo a cwnd reduction. */
if (flag & FLAG_DSACKING_ACK)
tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
/* If this ack opens up a zero window, clear backoff. It was
 * being used to time the probes, and is probably far higher than
 * it needs to be for normal retransmission.
 */
if (tcp_send_head(sk))
tcp_ack_probe(sk);
...
}
ICSK_TIME_LOSS_PROBE
作用:在拥塞窗口比较小或者最后一个skb丢包,导致无法收到足够的sack,然后通过启动loss probe定时器,定时发送write队列的最后一个skb或者当前send_head(下一个需要发送的数据),loss_probe的超时时间不会超过rto时间;
启用时机:
1)、tcp_write_xmit
当tcp发送数据时,这里判断push_one不为2就会启动loss probe定时器,push_one=2的场景是发送的tcp数据本身是loss probe的数据(tcp_send_loss_probe),这种就不在这里重复启定时器;
/* Excerpt (elided) of the main transmit loop: after sending new data,
 * schedule a tail loss probe unless this call was itself the loss
 * probe transmission (push_one == 2). */
static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
int push_one, gfp_t gfp)
{
...
max_segs = tcp_tso_segs(sk, mss_now);
//Walk the queue starting from the next skb to be sent.
while ((skb = tcp_send_head(sk))) {
...
}
...
if (likely(sent_pkts)) {
if (tcp_in_cwnd_reduction(sk))
tp->prr_out += sent_pkts;
/* Send one loss probe per tail loss episode. */
//push_one != 2 means the skb just sent was not a loss-probe skb;
//tcp_send_loss_probe() passes push_one == 2 when transmitting the
//probe itself, and does not re-arm here. In other words, whenever
//ordinary new data goes out, the tail of the queue is (re)covered by
//arming the loss probe timer via tcp_schedule_loss_probe().
if (push_one != 2)
tcp_schedule_loss_probe(sk);
is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd);
tcp_cwnd_validate(sk, is_cwnd_limited);
return false;
}
return !tp->packets_out && tcp_send_head(sk);
}
2)、tcp_ack
在tcp_ack,判断如果当前定时器类型为rto,则修改成loss probe;
/* Excerpt (elided) of tcp_ack: if the pending timer is the RTO
 * retransmit timer, try to replace it with the shorter loss probe. */
static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
{
...
//If the pending timer event is an RTO retransmit, reschedule it as a
//loss probe instead.
if (icsk->icsk_pending == ICSK_TIME_RETRANS)
tcp_schedule_loss_probe(sk);
...
return 0;
}
超时处理函数:
先看write队列是否还有未发送的数据,如果有,则发送下一个数据,如果没有则重传write队列最后一个数据;
/* Tail Loss Probe (TLP) timer handler: if unsent data exists, send the
 * next new segment as the probe; otherwise retransmit the last segment
 * on the write queue. In all cases finish by re-arming via
 * tcp_rearm_rto(). */
void tcp_send_loss_probe(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
int pcount;
int mss = tcp_current_mss(sk);
//sk_send_head is the next skb waiting to be sent; prefer sending new
//data as the probe when the send window allows it.
skb = tcp_send_head(sk);
if (skb) {
if (tcp_snd_wnd_test(tp, skb, mss)) {
pcount = tp->packets_out;
/* push_one == 2 marks this transmission as the loss probe. */
tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
if (tp->packets_out > pcount)
goto probe_sent;
goto rearm_timer;
}
skb = tcp_write_queue_prev(sk, skb);
} else {
//No new data to send: retransmit the last skb on the write queue.
skb = tcp_write_queue_tail(sk);
}
/* At most one outstanding TLP retransmission. */
if (tp->tlp_high_seq)
goto rearm_timer;
/* Retransmit last segment. */
if (WARN_ON(!skb))
goto rearm_timer;
if (skb_still_in_host_queue(sk, skb))
goto rearm_timer;
pcount = tcp_skb_pcount(skb);
if (WARN_ON(!pcount))
goto rearm_timer;
/* Probe only the final MSS-sized piece; split off the rest first. */
if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss,
GFP_ATOMIC)))
goto rearm_timer;
skb = tcp_write_queue_next(sk, skb);
}
if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
goto rearm_timer;
if (__tcp_retransmit_skb(sk, skb, 1))
goto rearm_timer;
/* Record snd_nxt for loss detection. */
tp->tlp_high_seq = tp->snd_nxt;
probe_sent:
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES);
/* Reset s.t. tcp_rearm_rto will restart timer from now */
inet_csk(sk)->icsk_pending = 0;
rearm_timer:
tcp_rearm_rto(sk);
}
ICSK_TIME_REO_TIMEOUT
作用:rack标记loss数据包的过程,如果遍历数据包还未过期,则按过期的时间设置超时定时器,定时器超时后重新调用rack流程标记loss数据包;
启动时机:
收到新的ack,进入rack标记流程时,判断当前数据包还未rack超时,则根据剩余超时时间启动reo_timeout定时器;
/* RACK loss marking: scan for lost packets; if some packets have not
 * yet reached their RACK reorder deadline, arm the reo_timeout timer
 * for the remaining time so they get re-examined later. */
void tcp_rack_mark_lost(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 timeout;
//If the advanced flag is not set, no (S)ACK progressed the RACK
//state, so there is nothing to scan.
if (!tp->rack.advanced)
return;
/* Reset the advanced flag to avoid unnecessary queue scanning */
tp->rack.advanced = 0;
tcp_rack_detect_loss(sk, &timeout);
//A non-zero timeout means some packets were not yet past their RACK
//deadline during the scan; use that remaining time to arm the
//reo_timeout timer.
if (timeout) {
timeout = usecs_to_jiffies(timeout + TCP_REO_TIMEOUT_MIN);
inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT,
timeout, inet_csk(sk)->icsk_rto);
}
}
关闭时机:
reo_timeout超时后,重新标记loss,然后通过tcp_rearm_rto将定时器还原成retrans;
/* Handler for the RACK reorder (reo) timeout: re-run loss detection,
 * enter recovery and retransmit if new losses were marked, then put
 * the timer back to the plain RTO retransmit type via tcp_rearm_rto(). */
void tcp_rack_reo_timeout(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 timeout, prior_inflight;
prior_inflight = tcp_packets_in_flight(tp);
skb_mstamp_get(&tp->tcp_mstamp);
tcp_rack_detect_loss(sk, &timeout);
/* A changed in-flight count means packets were newly marked lost. */
if (prior_inflight != tcp_packets_in_flight(tp)) {
if (inet_csk(sk)->icsk_ca_state != TCP_CA_Recovery) {
tcp_enter_recovery(sk, false);
if (!inet_csk(sk)->icsk_ca_ops->cong_control)
tcp_cwnd_reduction(sk, 1, 0);
}
tcp_xmit_retransmit_queue(sk);
}
if (inet_csk(sk)->icsk_pending != ICSK_TIME_RETRANS)
tcp_rearm_rto(sk);
}
总结:
retrans_timer的几种定时器类型变化过程大致如下图所示:
2、ICSK_TIME_DACK
之前的几种timer都是针对非ack数据的,icsk_delack_timer是针对ack数据包的,表示用于延时发送ack的定时器类型;
启用时机:
1)、正常发送ack时,申请不到内存;
/* Send an immediate (non-delayed) pure ACK. If no skb can be
 * allocated, fall back to scheduling a delayed ACK so that the ACK is
 * still sent once memory pressure eases. */
void tcp_send_ack(struct sock *sk)
{
struct sk_buff *buff;
/* If we have been reset, we may not send again. */
if (sk->sk_state == TCP_CLOSE)
return;
tcp_ca_event(sk, CA_EVENT_NON_DELAYED_ACK);
/* We are not putting this on the write queue, so
 * tcp_transmit_skb() will set the ownership to this
 * sock.
 */
buff = alloc_skb(MAX_TCP_HEADER,
sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
/* Allocation failed: arm the delayed-ACK timer instead of sending. */
if (unlikely(!buff)) {
inet_csk_schedule_ack(sk);
inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
TCP_DELACK_MAX, TCP_RTO_MAX);
return;
}
/* Reserve space for headers and prepare control bits. */
skb_reserve(buff, MAX_TCP_HEADER);
tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);
/* We do not want pure acks influencing TCP Small Queues or fq/pacing
 * too much.
 * SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784
 */
skb_set_tcp_pure_ack(buff);
/* Send it off, this clears delayed acks for us. */
skb_mstamp_get(&buff->skb_mstamp);
tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0);
}
2)、接收端每次接收到新数据都会进入__tcp_ack_snd_check,该函数会判断是否需要立即ack,以下几种场景会立即ack;
/* Decide whether a freshly received segment should be ACKed
 * immediately or whether the ACK can be delayed. */
static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
{
struct tcp_sock *tp = tcp_sk(sk);
/* More than one full frame received... */
//Several conditions force an immediate ACK:
//1. Out-of-order data has been received.
//2. We are in quickack mode (e.g. icsk_ack.pingpong == 0: the
//   traffic is not interactive, data may be one-way, so there is no
//   outgoing data to piggy-back a delayed ACK on).
//3. Every segment we transmit (ACKs included) goes through
//   tcp_select_window(), which updates rcv_wup to the then-current
//   rcv_nxt. So rcv_nxt - rcv_wup > icsk_ack.rcv_mss means at least
//   one full MSS of data arrived since the last ACK was sent -- ACK
//   now, provided the advertised window is not shrinking.
if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
/* ... and right edge of window advances far enough.
 * (tcp_recvmsg() will send ACK otherwise). Or...
 */
__tcp_select_window(sk) >= tp->rcv_wnd) ||
/* We ACK each frame or... */
tcp_in_quickack_mode(sk) ||
/* We have out of order data. */
(ofo_possible && !RB_EMPTY_ROOT(&tp->out_of_order_queue))) {
/* Then ack it now */
tcp_send_ack(sk);
} else {
/* Else, send delayed ack. */
tcp_send_delayed_ack(sk);
}
}
上面判断是否延时ack有个条件就是距离上次ack,至少有接收到一个mss长度的skb,那如果假设发送端发送的是小包呢?也就是说接收端这时候收到2个小包了还未回复ack,那这种情况会不会立即ack呢?这种情况下也会回复ack,主要是依赖于tcp接收数据清理rbuf的流程,tcp在将接收到的数据放到receive_queue后会通过sk->sk_data_ready->sock_def_readable流程唤醒tcp接收数据,在tcp_recvmsg里会进入tcp_cleanup_rbuf清理接收队列的内存,在这里也会判断是否需要回复ack;
1)、每接收到一个skb,进入tcp_measure_rcv_mss,如果skb长度小于mss,则标记ICSK_ACK_PUSHED或ICSK_ACK_PUSHED2;
/* Estimate the peer's effective MSS from received segment sizes, and
 * mark sub-MSS segments (ICSK_ACK_PUSHED / ICSK_ACK_PUSHED2) so that
 * tcp_cleanup_rbuf() can decide to ACK small-packet trains promptly. */
static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
{
struct inet_connection_sock *icsk = inet_csk(sk);
const unsigned int lss = icsk->icsk_ack.last_seg_size;
unsigned int len;
icsk->icsk_ack.last_seg_size = 0;
/* skb->len may jitter because of SACKs, even if peer
 * sends good full-sized frames.
 */
len = skb_shinfo(skb)->gso_size ? : skb->len;
if (len >= icsk->icsk_ack.rcv_mss) {
icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
tcp_sk(sk)->advmss);
/* Account for possibly-removed options */
if (unlikely(len > icsk->icsk_ack.rcv_mss +
MAX_TCP_OPTION_SPACE))
tcp_gro_dev_warn(sk, skb, len);
} else {
/* Otherwise, we make more careful check taking into account,
 * that SACKs block is variable.
 *
 * "len" is invariant segment length, including TCP header.
 */
len += skb->data - skb_transport_header(skb);
if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) ||
/* If PSH is not set, packet should be
 * full sized, provided peer TCP is not badly broken.
 * This observation (if it is correct 8)) allows
 * to handle super-low mtu links fairly.
 */
(len >= TCP_MIN_MSS + sizeof(struct tcphdr) &&
!(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) {
/* Subtract also invariant (if peer is RFC compliant),
 * tcp header plus fixed timestamp option length.
 * Resulting "len" is MSS free of SACK jitter.
 */
len -= tcp_sk(sk)->tcp_header_len;
icsk->icsk_ack.last_seg_size = len;
if (len == lss) {
icsk->icsk_ack.rcv_mss = len;
return;
}
}
//First sub-MSS skb received: mark ICSK_ACK_PUSHED.
//Second one: additionally mark ICSK_ACK_PUSHED2.
//tcp_cleanup_rbuf() then replies with an ACK on ICSK_ACK_PUSHED2.
if (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)
icsk->icsk_ack.pending |= ICSK_ACK_PUSHED2;
icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
}
}
2)、在tcp_cleanup_rbuf根据pending标志位决定是否回复ack;
/* Called after the user copied data out of the receive queue: decide
 * whether an ACK should be sent now, either because a delayed ACK is
 * overdue or because the read freed enough window space to advertise. */
static void tcp_cleanup_rbuf(struct sock *sk, int copied)
{
struct tcp_sock *tp = tcp_sk(sk);
bool time_to_ack = false;
struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
"cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
//ICSK_ACK_SCHED still set means an ACK is owed but has not been sent
//yet; actually sending an ACK clears the pending flags.
if (inet_csk_ack_scheduled(sk)) {
const struct inet_connection_sock *icsk = inet_csk(sk);
/* Delayed ACKs frequently hit locked sockets during bulk
 * receive. */
if (icsk->icsk_ack.blocked ||
/* Once-per-two-segments ACK was not sent by tcp_input.c */
tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
/*
 * If this read emptied read buffer, we send ACK, if
 * connection is not bidirectional, user drained
 * receive buffer and there was a small segment
 * in queue.
 */
//ACK here when pending has ICSK_ACK_PUSHED2, or when it has
//ICSK_ACK_PUSHED and the session is not interactive (pingpong == 0).
//tcp_measure_rcv_mss() sets ICSK_ACK_PUSHED for the first sub-MSS
//segment received and ICSK_ACK_PUSHED2 for the second; PUSHED2 thus
//means two small segments are still unacknowledged, which also calls
//for an immediate ACK.
(copied > 0 &&
((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
!icsk->icsk_ack.pingpong)) &&
!atomic_read(&sk->sk_rmem_alloc)))
time_to_ack = true;
}
/* We send an ACK if we can now advertise a non-zero window
 * which has been raised "significantly".
 *
 * Even if window raised up to infinity, do not send window open ACK
 * in states, where we will not receive more. It is useless.
 */
if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
__u32 rcv_window_now = tcp_receive_window(tp);
/* Optimize, __tcp_select_window() is not cheap. */
if (2*rcv_window_now <= tp->window_clamp) {
__u32 new_window = __tcp_select_window(sk);
/* Send ACK now, if this read freed lots of space
 * in our buffer. Certainly, new_window is new window.
 * We can advertise it now, if it is not less than current one.
 * "Lots" means "at least twice" here.
 */
if (new_window && new_window >= 2 * rcv_window_now)
time_to_ack = true;
}
}
if (time_to_ack)
tcp_send_ack(sk);
}
清除时机:
当正常发送一个ack数据时,会清除延时发送ack定时器;
/* Excerpt (elided) of the final transmit path: every segment carrying
 * the ACK flag passes through tcp_event_ack_sent(), which clears the
 * delayed-ACK timer. */
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
gfp_t gfp_mask)
{
...
if (likely(tcb->tcp_flags & TCPHDR_ACK))
tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
}
/* Bookkeeping after an ACK has actually been sent: consume quickack
 * budget and cancel any pending delayed-ACK timer. */
static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
{
tcp_dec_quickack_mode(sk, pkts);
inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
}
3、Keepalive timer
作用:用于检测tcp的一端是否已经断开,如A给B发一个tcp keepalive的探测包,如果B发生过重启,那B本地找不到对应socket连接信息,给A回复rst。