1. The sliding window
The sliding window is the mechanism by which the sender paces its transmissions according to the receiver's advertised receive window. The sender's sliding window can be divided into the following four parts: the leftmost, purple part covers sequence numbers that have been sent and already acknowledged by the receiver; the blue part covers sequence numbers that have been sent but not yet acknowledged; the green part covers sequence numbers that may be sent but have not been sent yet (the unused portion of the advertised window); the blue and green parts together make up the receive window advertised by the receiver, which the sender records in its snd_wnd field; the rightmost part covers sequence numbers the sender is not allowed to send. The boundary between the purple and blue regions corresponds to tcp's snd_una, and the boundary between the blue and green regions corresponds to snd_nxt. As data is transmitted and acknowledged, snd_una and snd_nxt keep increasing, so the whole window keeps sliding to the right.
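As a rough sketch (not kernel code; usable_send_window() is a made-up helper and sequence-number wraparound is ignored), the room the sender still has follows directly from these three fields:
static inline u32 usable_send_window(u32 snd_una, u32 snd_nxt, u32 snd_wnd)
{
	u32 wnd_end = snd_una + snd_wnd;	/* right edge of the advertised window */

	/* snd_una..snd_nxt is the blue part (sent, not yet acked);
	 * snd_nxt..wnd_end is the green part (may still be sent now). */
	return wnd_end > snd_nxt ? wnd_end - snd_nxt : 0;
}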
Advertising the receive window:
When the receiver replies to the sender, the transmit path computes a receive window from the socket's current receive-buffer memory and fills it into the window field of the TCP header:
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
gfp_t gfp_mask)
{
...
tcp_options_write((__be32 *)(th + 1), tp, &opts);
skb_shinfo(skb)->gso_type = sk->sk_gso_type;
if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) {
th->window = htons(tcp_select_window(sk));
tcp_ecn_send(sk, skb, th, tcp_header_size);
} else {
/* RFC1323: The window in SYN & SYN/ACK segments
* is never scaled.
*/
th->window = htons(min(tp->rcv_wnd, 65535U));
}
}
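tcp_select_window() starts from how much of the previously advertised window is still open, given by the small helper tcp_receive_window() (quoted roughly from include/net/tcp.h of kernels in this range); the new advertisement is derived from the free receive-buffer space, is not allowed to shrink below this remaining value, and is shifted right by rcv_wscale before it is written into the 16-bit header field.
/* Amount of the advertised receive window the peer may still use:
 * everything up to rcv_wup + rcv_wnd has already been promised to the sender. */
static inline u32 tcp_receive_window(const struct tcp_sock *tp)
{
	s32 win = tp->rcv_wup + tp->rcv_wnd - tp->rcv_nxt;

	if (win < 0)
		win = 0;
	return (u32) win;
}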
When the sender updates its copy of the receive window:
The sender records the receiver's advertised receive window in tcp's snd_wnd field; it is updated mainly at two points:
1) Connection establishment
When the TCP server receives the third packet of the handshake (the ACK), it records the peer's advertised window:
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
{
...
switch (sk->sk_state) {
case TCP_SYN_RECV:
if (!acceptable)
return 1;
if (!tp->srtt_us)
tcp_synack_rtt_meas(sk, req);
/* Once we leave TCP_SYN_RECV, we no longer need req
* so release it.
*/
if (req) {
inet_csk(sk)->icsk_retransmits = 0;
reqsk_fastopen_remove(sk, req, false);
} else {
/* Make sure socket is routed, for correct metrics. */
icsk->icsk_af_ops->rebuild_header(sk);
tcp_init_congestion_control(sk);
tcp_mtup_init(sk);
tp->copied_seq = tp->rcv_nxt;
tcp_init_buffer_space(sk);
}
smp_mb();
tcp_set_state(sk, TCP_ESTABLISHED);
sk->sk_state_change(sk);
/* Note, that this wakeup is only for marginal crossed SYN case.
* Passively open sockets are not waked up, because
* sk->sk_sleep == NULL and sk->sk_socket == NULL.
*/
if (sk->sk_socket)
sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
if (tp->rx_opt.tstamp_ok)
tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
if (req) {
/* Re-arm the timer because data may have been sent out.
* This is similar to the regular data transmission case
* when new data has just been ack'ed.
*
* (TFO) - we could try to be more aggressive and
* retransmitting any data sooner based on when they
* are sent out.
*/
tcp_rearm_rto(sk);
} else
tcp_init_metrics(sk);
if (!inet_csk(sk)->icsk_ca_ops->cong_control)
tcp_update_pacing_rate(sk);
/* Prevent spurious tcp_cwnd_restart() on first data packet */
tp->lsndtime = tcp_time_stamp;
tcp_initialize_rcv_mss(sk);
tcp_fast_path_on(tp);
break;
}
...
}
2) The TCP receive slow path
static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 ack,
u32 ack_seq)
{
struct tcp_sock *tp = tcp_sk(sk);
int flag = 0;
u32 nwin = ntohs(tcp_hdr(skb)->window);
if (likely(!tcp_hdr(skb)->syn))
nwin <<= tp->rx_opt.snd_wscale;
	//ack is the sequence number the peer acknowledges in this packet; ack_seq is the
	//sequence number the peer used when sending this ACK. The window may be updated if:
	//1. this ACK acknowledges new data (ack is beyond snd_una), or
	//2. the peer's sequence number is newer than the one used at the last window update
	//   (ack_seq is after snd_wl1), or
	//3. the peer's sequence number equals the one used at the last update but the advertised
	//   window is larger (e.g. a retransmitted segment carrying a bigger window)
if (tcp_may_update_window(tp, ack, ack_seq, nwin)) {
flag |= FLAG_WIN_UPDATE;
tcp_update_wl(tp, ack_seq);
if (tp->snd_wnd != nwin) {
tp->snd_wnd = nwin;
/* Note, it is the only place, where
* fast path is recovered for sending TCP.
*/
tp->pred_flags = 0;
tcp_fast_path_check(sk);
if (nwin > tp->max_window) {
tp->max_window = nwin;
tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie);
}
}
}
tcp_snd_una_update(tp, ack);
return flag;
}
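For reference, the three conditions in the comment correspond to tcp_may_update_window(), which in kernels of this vintage is roughly:
static inline bool tcp_may_update_window(const struct tcp_sock *tp,
					 const u32 ack, const u32 ack_seq,
					 const u32 nwin)
{
	return	after(ack, tp->snd_una) ||		/* new data ACKed */
		after(ack_seq, tp->snd_wl1) ||		/* newer segment than at last update */
		(ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd); /* same segment, bigger window */
}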
Flow control on the sending side
When the sender is about to transmit, it checks whether the receiver still has window left. As the code below shows, at least one MSS-sized segment must fit inside the peer's window for transmission to proceed; when the receiver has no window available, the sender stops sending. This also shows that the send window (snd_wnd) is primarily a flow-control mechanism: it keeps the sender from outrunning the rate at which the receiver can absorb data.
static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
int push_one, gfp_t gfp)
{
...
max_segs = tcp_tso_segs(sk, mss_now);
	//walk the write queue, starting from the next skb to be sent
while ((skb = tcp_send_head(sk))) {
...
if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) {
is_rwnd_limited = true;
break;
}
...
}
...
}
static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
const struct sk_buff *skb,
unsigned int cur_mss)
{
u32 end_seq = TCP_SKB_CB(skb)->end_seq;
if (skb->len > cur_mss)
end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
return !after(end_seq, tcp_wnd_end(tp));
}
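tcp_wnd_end() used in the test above is simply the right edge of the send window (again roughly as it appears in include/net/tcp.h):
/* Returns end of send window, i.e. the highest sequence the peer currently allows. */
static inline u32 tcp_wnd_end(const struct tcp_sock *tp)
{
	return tp->snd_una + tp->snd_wnd;
}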
2. The congestion window (snd_cwnd)
The congestion window limits how much data is in flight on the path, so that the sender does not push more packets into the network than it can carry and cause queueing and loss; it corresponds to tcp's snd_cwnd field. The congestion control algorithm adjusts the congestion window according to the observed state of the path (traditional algorithms react mainly to loss and reordering, while BBR works from the measured bottleneck bandwidth and minimum delay). At initialization, each TCP connection is given an initial congestion window of 10:
void tcp_init_sock(struct sock *sk)
{
...
/* So many TCP implementations out there (incorrectly) count the
* initial SYN frame in their delayed-ACK and congestion control
* algorithms that we must have the following bandaid to talk
* efficiently to them. -DaveM
*/
tp->snd_cwnd = TCP_INIT_CWND;
...
}
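TCP_INIT_CWND is defined in include/net/tcp.h; the value of 10 segments follows RFC 6928:
/* TCP initial congestion window as per rfc6928 */
#define TCP_INIT_CWND		10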
3. Congestion control
A congestion control algorithm moves between several phases: slow start, congestion avoidance, fast recovery, fast retransmit, and so on.
Slow start && congestion avoidance:
TCP keeps a slow-start threshold, snd_ssthresh, for each connection. While snd_cwnd is below snd_ssthresh, snd_cwnd grows by 1 for every packet that is ACKed, so the congestion window grows quickly during this phase.
1) snd_ssthresh initialization
snd_ssthresh is initialized to 0x7fffffff (TCP_INFINITE_SSTHRESH):
void tcp_init_sock(struct sock *sk)
{
...
/* See draft-stevens-tcpca-spec-01 for discussion of the
* initialization of these values.
*/
tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
...
}
2) snd_ssthresh updates
Slow start is defined by snd_cwnd being below snd_ssthresh. Since snd_ssthresh starts out at a huge value, comparing snd_cwnd against that initial value alone would never end slow start. When data is ACKed normally, the pkts_acked callback of the congestion ops runs, which for CUBIC ends up in bictcp_acked(). There, while the connection is in slow start and snd_cwnd is at least hystart_low_window (16), hystart_update() is called; once HyStart detects either a long enough ACK train or an RTT increase, it sets snd_ssthresh to the current snd_cwnd, ending slow start early.
static void bictcp_acked(struct sock *sk, const struct ack_sample *sample)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
u32 delay;
/* Some calls are for duplicates without timetamps */
if (sample->rtt_us < 0)
return;
/* Discard delay samples right after fast recovery */
if (ca->epoch_start && (s32)(tcp_time_stamp - ca->epoch_start) < HZ)
return;
delay = (sample->rtt_us << 3) / USEC_PER_MSEC;
if (delay == 0)
delay = 1;
/* first time call or link delay decreases */
if (ca->delay_min == 0 || ca->delay_min > delay)
ca->delay_min = delay;
/* hystart triggers when cwnd is larger than some threshold */
if (hystart && tcp_in_slow_start(tp) &&
tp->snd_cwnd >= hystart_low_window)
hystart_update(sk, delay);
}
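The actual ssthresh update happens inside hystart_update() (not shown above). A much simplified sketch of its two exit triggers, with the per-round bookkeeping and SNMP counters stripped out and the helper names ack_train_longer_than() and round_min_rtt() made up purely for illustration:
/* Simplified sketch, not the kernel function: HyStart ends slow start early
 * by setting ssthresh to the current cwnd when either trigger fires. */
static void hystart_update_sketch(struct sock *sk, u32 delay)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);

	/* Trigger 1 - ACK train: closely spaced ACKs have kept arriving for
	 * longer than about half of the minimum RTT, so the pipe looks full. */
	if (ack_train_longer_than(ca, ca->delay_min >> 4))
		tp->snd_ssthresh = tp->snd_cwnd;

	/* Trigger 2 - delay increase: the smallest RTT sampled in this round
	 * clearly exceeds the minimum RTT (delay_min plus a small threshold). */
	if (round_min_rtt(ca) > ca->delay_min + HYSTART_DELAY_THRESH(ca->delay_min >> 3))
		tp->snd_ssthresh = tp->snd_cwnd;
}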
When the sender receives an ACK, if the connection is not in fast recovery or loss and the data is ACKed normally, it enters the tcp_cong_avoid path, where the relationship between snd_cwnd and snd_ssthresh decides how snd_cwnd grows:
static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
if (!tcp_is_cwnd_limited(sk))
return;
if (tp->snd_cwnd <= tp->snd_ssthresh) {
if (hystart && after(ack, ca->end_seq))
bictcp_hystart_reset(sk);
		//tcp_slow_start() returns non-zero when, after growing, the window has reached the
		//threshold snd_ssthresh; in plain slow start it returns 0 and snd_cwnd += acked,
		//i.e. the congestion window grows by 1 for every ACKed packet
acked = tcp_slow_start(tp, acked);
if (!acked)
return;
}
	//getting here means slow start is finished and we are in congestion avoidance;
	//first compute the threshold ca->cnt: how many ACKed packets it takes to grow the window by 1
bictcp_update(ca, tp->snd_cwnd, acked);
	//update the congestion window
tcp_cong_avoid_ai(tp, ca->cnt, acked);
}
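tcp_slow_start() and tcp_cong_avoid_ai() are the generic helpers shared by all congestion modules; in kernels of this era they look roughly like this (net/ipv4/tcp_cong.c):
u32 tcp_slow_start(struct tcp_sock *tp, u32 acked)
{
	u32 cwnd = min(tp->snd_cwnd + acked, tp->snd_ssthresh);

	/* Returns the number of ACKed packets left over once cwnd hits
	 * ssthresh; 0 while we are still purely in slow start. */
	acked -= cwnd - tp->snd_cwnd;
	tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);

	return acked;
}

/* Roughly cwnd += 1/w per ACKed packet: w ACKed packets grow cwnd by one. */
void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked)
{
	/* If credits accumulated at a higher w, apply them gently now. */
	if (tp->snd_cwnd_cnt >= w) {
		tp->snd_cwnd_cnt = 0;
		tp->snd_cwnd++;
	}

	tp->snd_cwnd_cnt += acked;
	if (tp->snd_cwnd_cnt >= w) {
		u32 delta = tp->snd_cwnd_cnt / w;

		tp->snd_cwnd_cnt -= delta * w;
		tp->snd_cwnd += delta;
	}
	tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_cwnd_clamp);
}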
Fast recovery:
When packets are reordered, the receiver sends duplicate ACKs. When the sender receives a dubious or duplicate ACK, it calls tcp_fastretrans_alert() to adjust the congestion state; there, if the connection is in the Open state and the amount of out-of-order data exceeds the reordering threshold, a loss is assumed and the connection enters fast recovery via tcp_enter_recovery().
On entering fast recovery, CUBIC first lowers snd_ssthresh to 717/1024 of the current congestion window; the congestion window itself is cut back to roughly the number of packets currently in the network (tcp_packets_in_flight() + 1 at first, to force the fast retransmit) and is then rebuilt step by step as new data is ACKed.
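The 717/1024 figure comes from CUBIC's ssthresh callback, bictcp_recalc_ssthresh() (beta is 717 and BICTCP_BETA_SCALE is 1024 in tcp_cubic.c; quoted roughly):
static u32 bictcp_recalc_ssthresh(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);

	ca->epoch_start = 0;	/* end of epoch */

	/* Wmax and fast convergence */
	if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
		ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
			/ (2 * BICTCP_BETA_SCALE);
	else
		ca->last_max_cwnd = tp->snd_cwnd;

	ca->loss_cwnd = tp->snd_cwnd;

	/* new ssthresh = cwnd * 717/1024, never below 2 */
	return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
}
The per-ACK window reduction itself is then done by tcp_cwnd_reduction() (PRR):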
void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
int sndcnt = 0;
int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd))
return;
tp->prr_delivered += newly_acked_sacked;
if (delta < 0) {
u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
tp->prior_cwnd - 1;
sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
} else if ((flag & FLAG_RETRANS_DATA_ACKED) &&
!(flag & FLAG_LOST_RETRANS)) {
sndcnt = min_t(int, delta,
max_t(int, tp->prr_delivered - tp->prr_out,
newly_acked_sacked) + 1);
} else {
sndcnt = min(delta, newly_acked_sacked);
}
/* Force a fast retransmit upon entering fast recovery */
sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1));
tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
}
Exiting fast recovery:
When entering fast recovery, TCP saves the snd_nxt of that moment in high_seq. On every ACK, if the connection is in a non-Open state it goes through tcp_fastretrans_alert(); there, if snd_una is not before high_seq, all data that was outstanding when recovery started has been ACKed and there are no holes left, so the connection may leave fast recovery and return to the Open state:
static void tcp_fastretrans_alert(struct sock *sk, const int acked,
const int prior_unsacked,
bool is_dupack, int flag)
{
...
/* D. Check state exit conditions. State can be terminated
* when high_seq is ACKed. */
if (icsk->icsk_ca_state == TCP_CA_Open) {
WARN_ON(tp->retrans_out != 0);
tp->retrans_stamp = 0;
} else if (!before(tp->snd_una, tp->high_seq)) {
		/* high_seq is the snd_nxt saved when fast recovery was entered; snd_una reaching it
		   means the previously unacknowledged data has all been ACKed, so the code below
		   checks whether the fast recovery state can be left */
switch (icsk->icsk_ca_state) {
case TCP_CA_CWR:
/* CWR is to be held something *above* high_seq
* is ACKed for CWR bit to reach receiver. */
if (tp->snd_una != tp->high_seq) {
tcp_end_cwnd_reduction(sk);
tcp_set_ca_state(sk, TCP_CA_Open);
}
break;
case TCP_CA_Recovery:
if (tcp_is_reno(tp))
tcp_reset_reno_sack(tp);
if (tcp_try_undo_recovery(sk))
return;
tcp_end_cwnd_reduction(sk);
break;
}
}
...
}
When TCP backs out of the window reduction on leaving fast recovery (the undo path), snd_cwnd and snd_ssthresh each become the larger of their current value and the value saved when recovery was entered:
static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
{
struct tcp_sock *tp = tcp_sk(sk);
if (unmark_loss) {
struct sk_buff *skb;
tcp_for_write_queue(skb, sk) {
if (skb == tcp_send_head(sk))
break;
TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
}
tp->lost_out = 0;
tcp_clear_all_retrans_hints(tp);
}
if (tp->prior_ssthresh) {
const struct inet_connection_sock *icsk = inet_csk(sk);
if (icsk->icsk_ca_ops->undo_cwnd)
			//set the cwnd to use after leaving fast recovery
tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
else
tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1);
		//set the snd_ssthresh to use after leaving fast recovery
if (tp->prior_ssthresh > tp->snd_ssthresh) {
tp->snd_ssthresh = tp->prior_ssthresh;
TCP_ECN_withdraw_cwr(tp);
}
} else {
tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh);
}
tp->snd_cwnd_stamp = tcp_time_stamp;
tp->undo_marker = 0;
}
The icsk->icsk_ca_ops->undo_cwnd call in the code above ends up in bictcp_undo_cwnd():
static u32 bictcp_undo_cwnd(struct sock *sk)
{
struct bictcp *ca = inet_csk_ca(sk);
	//on leaving loss or recovery, snd_cwnd becomes the larger of the current snd_cwnd
	//and the snd_cwnd saved when loss/recovery was entered
return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
}
Fast retransmit:
The condition for triggering fast retransmit differs depending on the loss-detection mechanism in use:
1) reno
Three duplicate ACKs are received;
static void tcp_fastretrans_alert(struct sock *sk, const int acked,
const int prior_unsacked,
bool is_dupack, int flag)
{
...
/* E. Process state. */
switch (icsk->icsk_ca_state) {
case TCP_CA_Recovery:
if (!(flag & FLAG_SND_UNA_ADVANCED)) {
if (tcp_is_reno(tp) && is_dupack)
tcp_add_reno_sack(sk);
} else {
if (tcp_try_undo_partial(sk, acked, prior_unsacked))
return;
/* Partial ACK arrived. Force fast retransmit. */
do_lost = tcp_is_reno(tp) ||
tcp_fackets_out(tp) > tp->reordering;
}
if (tcp_try_undo_dsack(sk)) {
tcp_try_keep_open(sk);
return;
}
break;
case TCP_CA_Loss:
tcp_process_loss(sk, flag, is_dupack);
if (icsk->icsk_ca_state != TCP_CA_Open)
return;
/* Fall through to processing in Open state. */
default:
if (tcp_is_reno(tp)) {
if (flag & FLAG_SND_UNA_ADVANCED)
tcp_reset_reno_sack(tp);
if (is_dupack)
			//with reno, each duplicate ACK increments sacked_out by 1; once sacked_out
			//reaches the reordering threshold (3), the retransmit is triggered
tcp_add_reno_sack(sk);
}
...
}
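tcp_add_reno_sack() simply counts each duplicate ACK into sacked_out; tcp_time_to_recover() then compares tcp_dupack_heuristics() (sacked_out + 1 for reno) against tp->reordering to decide when to enter recovery. Roughly:
/* Emulate SACKs for a SACKless connection: account for a new dupack. */
static void tcp_add_reno_sack(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tp->sacked_out++;
	tcp_check_reno_reordering(sk, 0);
	tcp_verify_left_out(tp);
}

static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
{
	return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;
}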
Verification:
Craft the packets with packetdrill and watch the connection with ss:
Disable RACK: echo 0 > /proc/sys/net/ipv4/tcp_recovery
Disable FACK: echo 0 > /proc/sys/net/ipv4/tcp_fack
Disable SACK: echo 0 > /proc/sys/net/ipv4/tcp_sack
packetdrill script:
// Test fast retransmit with 4 packets outstanding, receiver sending SACKs.
// In this variant the receiver supports SACK.
// Establish a connection.
0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0 bind(3, ..., ...) = 0
+0 listen(3, 1) = 0
+0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+0 > S. 0:0(0) ack 1 <...>
+.1 < . 1:1(0) ack 1 win 257
+0 accept(3, ..., ...) = 4
// Send 1 data segment and get an ACK, so cwnd is now 4.
+0 write(4, ..., 1000) = 1000
+0 > P. 1:1001(1000) ack 1
+.1 < . 1:1(0) ack 1001 win 257
// Write 10 data segments.
+0 write(4, ..., 10000) = 10000
+0 > P. 1001:10001(9000) ack 1
//three duplicate ACKs arrive in a row
+.1 < . 1:1(0) ack 1001 win 257 <sack 3001:4001,nop,nop>
+.1 < . 1:1(0) ack 1001 win 257 <sack 4001:5001,nop,nop>
+.1 < . 1:1(0) ack 1001 win 257 <sack 5001:6001,nop,nop>
//the 1001:2001 segment is retransmitted
+.0 > . 1001:2001(1000) ack 1
// Receiver ACKs all data.
//everything is ACKed and the connection can be closed
+1 < . 1:1(0) ack 10001 win 257
Capturing the ss output at short intervals shows that while fewer than 3 duplicate ACKs have been received, TCP stays in the disorder state; on the 3rd duplicate ACK the connection moves into fast recovery, marks the first unacknowledged segment as lost, and retransmits it.
2) sack
When the number of SACKed segments reaches the reordering threshold (default 3), TCP starts marking segments as lost. Marking starts from the skb at the head of the write queue and continues until there are 3 SACKed segments to the right of the last skb marked lost. For example, if 10 skbs are sent and the 4th, 7th, 9th and 10th are SACKed, TCP computes fackets_out = 10, sacked_out = 4 and lost_out = 5.
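For reference, the marking described here, and the fack variant described next, are both driven by tcp_update_scoreboard(), which in kernels of this era looks roughly like this (tcp_mark_head_lost() then walks the write queue from the head as described above):
static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (tcp_is_reno(tp)) {
		tcp_mark_head_lost(sk, 1, 1);
	} else if (tcp_is_fack(tp)) {
		/* fack: mark everything more than reordering below the highest SACKed skb */
		int lost = tp->fackets_out - tp->reordering;
		if (lost <= 0)
			lost = 1;
		tcp_mark_head_lost(sk, lost, 0);
	} else {
		/* plain sack: sacked_out must reach the reordering threshold first */
		int sacked_upto = tp->sacked_out - tp->reordering;
		if (sacked_upto >= 0)
			tcp_mark_head_lost(sk, sacked_upto, 0);
		else if (fast_rexmit)
			tcp_mark_head_lost(sk, 1, 1);
	}
}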
3) fack
Enable FACK: echo 1 > /proc/sys/net/ipv4/tcp_fack
FACK is a further refinement on top of SACK. With FACK the number of SACKed segments no longer has to exceed the reordering threshold; what matters is whether the distance between the highest SACKed segment and snd_una exceeds it. When it does, loss marking starts from the left end of the write queue and stops once the skb being considered is within reordering - 1 of the highest SACKed skb.
For example, with 10 skbs sent and the 4th and 9th SACKed, fackets_out = 9 and lost_out = 5.
4) rack
Enable RACK: echo 1 > /proc/sys/net/ipv4/tcp_recovery
RACK goes a step further than FACK: it decides whether a segment is lost from its transmit time. Every skb TCP sends is stamped with a transmit timestamp; when a SACK arrives, RACK looks at segments that were sent before the SACKed one, and once the gap between their send times reaches a certain margin, the older segment is declared lost, the connection enters fast recovery, and the segments marked lost are retransmitted.
When TCP transmits a packet, it records the current send time:
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
gfp_t gfp_mask)
{
...
if (clone_it) {
		//record the packet's transmit time
skb_mstamp_get(&skb->skb_mstamp);
TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
- tp->snd_una;
tcp_rate_skb_sent(sk, skb);
if (unlikely(skb_cloned(skb)))
skb = pskb_copy(skb, gfp_mask);
else
skb = skb_clone(skb, gfp_mask);
if (unlikely(!skb))
return -ENOBUFS;
}
...
}
When the sender receives a SACK and marks the covered segments as SACKed, it also runs the RACK update:
static u8 tcp_sacktag_one(struct sock *sk,
struct tcp_sacktag_state *state, u8 sacked,
u32 start_seq, u32 end_seq,
int dup_sack, int pcount,
const struct skb_mstamp *xmit_time)
{
...
if (!(sacked & TCPCB_SACKED_ACKED)) {
tcp_rack_advance(tp, sacked, end_seq, xmit_time);
...
}
The RACK update simply records the transmit time and sequence number of the most recently SACKed segment:
/* Called when the sender receives a SACK: record the transmit time and end sequence of the SACKed segment */
void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
const struct skb_mstamp *xmit_time)
{
u32 rtt_us;
	//if this skb was sent earlier than (or has a smaller sequence than) the skb RACK last recorded, do nothing
if (tp->rack.mstamp.v64 &&
!tcp_rack_sent_after(xmit_time, &tp->rack.mstamp,
end_seq, tp->rack.end_seq))
return;
rtt_us = skb_mstamp_us_delta(&tp->tcp_mstamp, xmit_time);
if (sacked & TCPCB_RETRANS) {
/* If the sacked packet was retransmitted, it's ambiguous
* whether the retransmission or the original (or the prior
* retransmission) was sacked.
*
* If the original is lost, there is no ambiguity. Otherwise
* we assume the original can be delayed up to aRTT + min_rtt.
* the aRTT term is bounded by the fast recovery or timeout,
* so it's at least one RTT (i.e., retransmission is at least
* an RTT later).
*/
if (rtt_us < tcp_min_rtt(tp))
return;
}
	//record the SACKed segment's transmit time and sequence number
tp->rack.rtt_us = rtt_us;
tp->rack.mstamp = *xmit_time;
tp->rack.end_seq = end_seq;
tp->rack.advanced = 1;
}
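Both tcp_rack_advance() above and tcp_rack_detect_loss() below use tcp_rack_sent_after() to decide which of two skbs was sent later, falling back to the sequence number when the timestamps are equal; it is roughly:
static bool tcp_rack_sent_after(const struct skb_mstamp *t1,
				const struct skb_mstamp *t2,
				u32 seq1, u32 seq2)
{
	return skb_mstamp_after(t1, t2) ||
	       (t1->v64 == t2->v64 && after(seq1, seq2));
}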
In tcp_fastretrans_alert(), if RACK is enabled, the RACK loss-marking path tcp_rack_identify_loss() is entered.
There, the transmit timestamps of SACKed segments are used to decide whether the not-yet-SACKed segments are lost. The marking path updates tp->lost_out; back in tcp_fastretrans_alert(), tcp_time_to_recover() sees a non-zero lost_out and the connection enters the recovery state. If nothing can be marked lost yet, a timer is armed with the remaining time and the check is repeated when it expires.
static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
u32 reo_wnd;
*reo_timeout = 0;
/* To be more reordering resilient, allow min_rtt/4 settling delay
* (lower-bounded to 1000uS). We use min_rtt instead of the smoothed
* RTT because reordering is often a path property and less related
* to queuing or delayed ACKs.
*/
reo_wnd = 1000;
if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U)
reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd);
tcp_for_write_queue(skb, sk) {
struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
if (skb == tcp_send_head(sk))
break;
/* Skip ones already (s)acked */
if (!after(scb->end_seq, tp->snd_una) ||
scb->sacked & TCPCB_SACKED_ACKED)
continue;
		//tp->rack.mstamp is the transmit time of the SACKed segment, tp->rack.end_seq its end sequence;
		//if this skb was sent earlier (or has a smaller sequence) than the SACKed one, it may well be lost
if (tcp_rack_sent_after(&tp->rack.mstamp, &skb->skb_mstamp,
tp->rack.end_seq, scb->end_seq)) {
/* Step 3 in draft-cheng-tcpm-rack-00.txt:
* A packet is lost if its elapsed time is beyond
* the recent RTT plus the reordering window.
*/
			//tp->tcp_mstamp is the timestamp taken while processing the latest ACK/SACK (roughly "now");
			//compute how long ago this skb was sent
u32 elapsed = skb_mstamp_us_delta(&tp->tcp_mstamp,
&skb->skb_mstamp);
			//a later-sent skb has been SACKed while this older one has not; if the elapsed time
			//exceeds the SACKed segment's RTT plus the reordering window, the older packet is
			//considered lost and RACK marks it.
			//tp->rack.rtt_us is the RTT measured for the SACKed segment;
			//remaining < 0 means the gap between the SACKed segment and this earlier-sent,
			//still-unSACKed skb exceeds the larger of 1000us and min_rtt/4, so the skb is
			//presumed lost and marked accordingly
s32 remaining = tp->rack.rtt_us + reo_wnd - elapsed;
if (remaining < 0) {
tcp_rack_mark_skb_lost(sk, skb);
continue;
}
/* Skip ones marked lost but not yet retransmitted */
if ((scb->sacked & TCPCB_LOST) &&
!(scb->sacked & TCPCB_SACKED_RETRANS))
continue;
			//not timed out yet: arm the reorder timer; when it fires, the handler runs tcp_rack_detect_loss() again
/* Record maximum wait time (+1 to avoid 0) */
*reo_timeout = max_t(u32, *reo_timeout, 1 + remaining);
} else if (!(scb->sacked & TCPCB_RETRANS)) {
/* Original data are sent sequentially so stop early
* b/c the rest are all sent after rack_sent
*/
break;
}
}
}
Verification:
Enable RACK: echo 1 > /proc/sys/net/ipv4/tcp_recovery
Send one segment, then after a short delay send a second one; have packetdrill SACK the second segment directly. ss then shows the sender marking one segment as lost and entering the recovery state, which means the loss mark came from RACK rather than from an RTO expiry.
packetdrill script:
// Establish a connection.
0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0 bind(3, ..., ...) = 0
+0 listen(3, 1) = 0
+0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+0 > S. 0:0(0) ack 1 <...>
+.8 < . 1:1(0) ack 1 win 257
+0 accept(3, ..., ...) = 4
// send the 1st skb
+0 write(4, ..., 1000) = 1000
+0 > P. 1:1001(1000) ack 1
//the 2nd skb is sent after a delay
+.1 write(4, ..., 1000) = 1000
//the 2nd skb is SACKed
+0 < . 1:1(0) ack 1 win 257 <sack 1001:2001, nop,nop>
The LOSS state
When the RTO timer expires, the connection enters the loss state (tcp_enter_loss). On entry, snd_ssthresh is recalculated first, and cwnd is then dropped to 1:
void tcp_enter_loss(struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
struct sk_buff *skb;
bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
bool is_reneg; /* is receiver reneging on SACKs? */
bool mark_lost;
/* Reduce ssthresh if it has not yet been made inside this window. */
if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
!after(tp->high_seq, tp->snd_una) ||
(icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
tp->prior_ssthresh = tcp_current_ssthresh(sk);
		//entering loss: recompute the slow-start threshold snd_ssthresh
tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
tcp_ca_event(sk, CA_EVENT_LOSS);
tcp_init_undo(tp);
}
	//after entering loss, drop the congestion window cwnd to 1
tp->snd_cwnd = 1;
tp->snd_cwnd_cnt = 0;
tp->snd_cwnd_stamp = tcp_time_stamp;
...
}