Linux TCP congestion control

1. Sliding window

The sliding window is the mechanism by which the sender paces its transmissions according to the receiver's advertised window. The sender's sliding window can be divided into four parts: the leftmost (purple) region holds sequence numbers that have been sent and already acknowledged by the receiver; the blue region holds sequence numbers that have been sent but not yet acknowledged; the green region holds sequence numbers that may be sent but have not been sent yet (the usable portion of the advertised window); the sum of the blue and green regions is the receive window advertised by the receiver, which the sender stores in its snd_wnd field; the rightmost region holds sequence numbers that must not be sent yet. The boundary between the purple and blue regions corresponds to TCP's snd_una, and the boundary between the blue and green regions corresponds to snd_nxt. As data is transmitted, snd_una and snd_nxt keep increasing, so the whole window keeps sliding to the right.
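
As a minimal illustration (not kernel code; the helper name is made up), the usable part of the send window follows directly from these three values:

#include <stdint.h>

/* Hypothetical helper, for illustration only: how many more bytes the sender
 * may still put on the wire. snd_una .. snd_una + snd_wnd is the advertised
 * window; snd_nxt is the next byte to send, so the green "usable" region is
 * whatever remains of the window. Unsigned arithmetic handles wraparound.
 */
static uint32_t usable_send_window(uint32_t snd_una, uint32_t snd_nxt,
				   uint32_t snd_wnd)
{
	uint32_t in_flight = snd_nxt - snd_una;		/* blue region */

	return in_flight >= snd_wnd ? 0 : snd_wnd - in_flight;	/* green region */
}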

Advertising the receive window:

When the receiver replies to the sender, the transmit path computes a receive window from the socket's current receive-buffer state and fills it into the window field of the TCP header;

static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
			    gfp_t gfp_mask)
{
	...
	tcp_options_write((__be32 *)(th + 1), tp, &opts);
	skb_shinfo(skb)->gso_type = sk->sk_gso_type;
	if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) {
		th->window      = htons(tcp_select_window(sk));
		tcp_ecn_send(sk, skb, th, tcp_header_size);
	} else {
		/* RFC1323: The window in SYN & SYN/ACK segments
		 * is never scaled.
		 */
		th->window	= htons(min(tp->rcv_wnd, 65535U));
	}
}

When the sender updates snd_wnd:

The sender records the receiver's advertised window in the snd_wnd field. It is updated mainly at two points:

1) At connection establishment

When the TCP server receives the third segment of the handshake (the final ACK), it records the peer's advertised window;

int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
{
	...
	switch (sk->sk_state) {
	case TCP_SYN_RECV:
		if (!acceptable)
			return 1;

		if (!tp->srtt_us)
			tcp_synack_rtt_meas(sk, req);

		/* Once we leave TCP_SYN_RECV, we no longer need req
		 * so release it.
		 */
		if (req) {
			inet_csk(sk)->icsk_retransmits = 0;
			reqsk_fastopen_remove(sk, req, false);
		} else {
			/* Make sure socket is routed, for correct metrics. */
			icsk->icsk_af_ops->rebuild_header(sk);
			tcp_init_congestion_control(sk);

			tcp_mtup_init(sk);
			tp->copied_seq = tp->rcv_nxt;
			tcp_init_buffer_space(sk);
		}
		smp_mb();
		tcp_set_state(sk, TCP_ESTABLISHED);
		sk->sk_state_change(sk);

		/* Note, that this wakeup is only for marginal crossed SYN case.
		 * Passively open sockets are not waked up, because
		 * sk->sk_sleep == NULL and sk->sk_socket == NULL.
		 */
		if (sk->sk_socket)
			sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);

		tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
		tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);

		if (tp->rx_opt.tstamp_ok)
			tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;

		if (req) {
			/* Re-arm the timer because data may have been sent out.
			 * This is similar to the regular data transmission case
			 * when new data has just been ack'ed.
			 *
			 * (TFO) - we could try to be more aggressive and
			 * retransmitting any data sooner based on when they
			 * are sent out.
			 */
			tcp_rearm_rto(sk);
		} else
			tcp_init_metrics(sk);

		if (!inet_csk(sk)->icsk_ca_ops->cong_control)
			tcp_update_pacing_rate(sk);

		/* Prevent spurious tcp_cwnd_restart() on first data packet */
		tp->lsndtime = tcp_time_stamp;

		tcp_initialize_rcv_mss(sk);
		tcp_fast_path_on(tp);
		break;
	}
	...
}

2) In the TCP receive slow path, when processing an incoming ACK

static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 ack,
				 u32 ack_seq)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int flag = 0;
	u32 nwin = ntohs(tcp_hdr(skb)->window);

	if (likely(!tcp_hdr(skb)->syn))
		nwin <<= tp->rx_opt.snd_wscale;

	//ack is the sequence number acknowledged by this ACK; ack_seq is the sequence
	//number the peer used for the segment carrying this ACK. The window may be updated if:
	//1. this ACK acknowledges new data, or
	//2. the segment's sequence number is newer than the one used for the last window update, or
	//3. the sequence number equals the one used for the last update but the advertised
	//   window grew (e.g. a retransmitted segment advertising a larger window)
	if (tcp_may_update_window(tp, ack, ack_seq, nwin)) {
		flag |= FLAG_WIN_UPDATE;
		tcp_update_wl(tp, ack_seq);

		if (tp->snd_wnd != nwin) {
			tp->snd_wnd = nwin;

			/* Note, it is the only place, where
			 * fast path is recovered for sending TCP.
			 */
			tp->pred_flags = 0;
			tcp_fast_path_check(sk);

			if (nwin > tp->max_window) {
				tp->max_window = nwin;
				tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie);
			}
		}
	}

	tcp_snd_una_update(tp, ack);

	return flag;
}
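
For reference, tcp_may_update_window() used above encodes exactly the three conditions from the comment (helper as found in net/ipv4/tcp_input.c of kernels from this era):

static inline bool tcp_may_update_window(const struct tcp_sock *tp,
					 const u32 ack, const u32 ack_seq,
					 const u32 nwin)
{
	return	after(ack, tp->snd_una) ||
		after(ack_seq, tp->snd_wl1) ||
		(ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd);
}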

Flow control at the sender

When the sender is about to transmit, it checks whether the segment still fits inside the receiver's advertised window: tcp_snd_wnd_test() below verifies that the segment's end sequence does not pass the right edge of the window. If the receiver has no room left, the sender stops transmitting and marks itself receive-window limited. This shows that the send window (snd_wnd) is primarily a flow-control mechanism: it keeps the sender from outrunning the receiver's ability to consume data.

static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
			   int push_one, gfp_t gfp)
{
	...
	max_segs = tcp_tso_segs(sk, mss_now);
	//walk the send queue starting from the next segment to transmit
	while ((skb = tcp_send_head(sk))) {
		...

		if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) {
			is_rwnd_limited = true;
			break;
		}
		...
	}
	...
}

static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
			     const struct sk_buff *skb,
			     unsigned int cur_mss)
{
	u32 end_seq = TCP_SKB_CB(skb)->end_seq;

	if (skb->len > cur_mss)
		end_seq = TCP_SKB_CB(skb)->seq + cur_mss;

	return !after(end_seq, tcp_wnd_end(tp));
}
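
tcp_wnd_end() used above is simply the right edge of the send window, i.e. snd_una plus the advertised window (helper from include/net/tcp.h):

/* Right edge of the send window: the highest sequence the peer currently allows. */
static inline u32 tcp_wnd_end(const struct tcp_sock *tp)
{
	return tp->snd_una + tp->snd_wnd;
}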

 

2. Congestion window (snd_cwnd)

The congestion window governs how much data may be in flight on the path, so that the sender does not inject more packets than the network can carry and cause congestive loss; it corresponds to TCP's snd_cwnd field. A congestion control algorithm adjusts the congestion window according to the observed state of the path (classic algorithms react mainly to loss and reordering, while BBR estimates the path's bandwidth and minimum RTT). At initialization, every TCP connection is given an initial congestion window of 10 segments (TCP_INIT_CWND).

void tcp_init_sock(struct sock *sk)
{
    ...

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them.  -DaveM
	 */
	tp->snd_cwnd = TCP_INIT_CWND;
	...
}

3. Congestion control

Congestion control moves through several phases: slow start, congestion avoidance, fast recovery and fast retransmit;

Slow start && congestion avoidance:

TCP keeps a slow-start threshold snd_ssthresh for each connection. While snd_cwnd is below snd_ssthresh, every ACKed packet increases snd_cwnd by one, so during this phase snd_cwnd grows quickly, roughly doubling every round trip.

1) snd_ssthresh initialization

snd_ssthresh is initialized to 0x7fffffff (TCP_INFINITE_SSTHRESH);

void tcp_init_sock(struct sock *sk)
{
	...
	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
	...
}

2) snd_ssthresh updates

A connection is in slow start while snd_cwnd is below snd_ssthresh. Since snd_ssthresh starts out at a huge value, comparing snd_cwnd against that initial value never ends slow start by itself. Whenever data is ACKed normally, the pkts_acked hook runs; for CUBIC this is bictcp_acked(). There, if the connection is still in slow start and snd_cwnd has reached hystart_low_window (16), HyStart is consulted, and when its heuristics (ACK-train length or rising delay) indicate that the path is filling up, hystart_update() sets snd_ssthresh to the current snd_cwnd, which ends slow start;

static void bictcp_acked(struct sock *sk, const struct ack_sample *sample)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);
	u32 delay;

	/* Some calls are for duplicates without timetamps */
	if (sample->rtt_us < 0)
		return;

	/* Discard delay samples right after fast recovery */
	if (ca->epoch_start && (s32)(tcp_time_stamp - ca->epoch_start) < HZ)
		return;

	delay = (sample->rtt_us << 3) / USEC_PER_MSEC;
	if (delay == 0)
		delay = 1;

	/* first time call or link delay decreases */
	if (ca->delay_min == 0 || ca->delay_min > delay)
		ca->delay_min = delay;

	/* hystart triggers when cwnd is larger than some threshold */
	if (hystart && tcp_in_slow_start(tp) &&
	    tp->snd_cwnd >= hystart_low_window)
		hystart_update(sk, delay);
}
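
hystart_update() (abridged here; MIB counters and minor details omitted) is where snd_ssthresh is actually lowered to the current snd_cwnd once one of HyStart's two exit heuristics fires:

static void hystart_update(struct sock *sk, u32 delay)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);

	if (ca->found & hystart_detect)
		return;

	if (hystart_detect & HYSTART_ACK_TRAIN) {
		u32 now = bictcp_clock();

		/* ACK-train detection: the train of closely spaced ACKs in this
		 * round has grown longer than a fraction of the minimum RTT.
		 */
		if ((s32)(now - ca->last_ack) <= hystart_ack_delta) {
			ca->last_ack = now;
			if ((s32)(now - ca->round_start) > ca->delay_min >> 4) {
				ca->found |= HYSTART_ACK_TRAIN;
				tp->snd_ssthresh = tp->snd_cwnd;	/* leave slow start */
			}
		}
	}

	if (hystart_detect & HYSTART_DELAY) {
		/* Delay detection: the minimum RTT sampled in this round exceeds
		 * the overall minimum delay by more than a threshold.
		 */
		if (ca->sample_cnt < HYSTART_MIN_SAMPLES) {
			if (ca->curr_rtt == 0 || ca->curr_rtt > delay)
				ca->curr_rtt = delay;
			ca->sample_cnt++;
		} else if (ca->curr_rtt > ca->delay_min +
			   HYSTART_DELAY_THRESH(ca->delay_min >> 3)) {
			ca->found |= HYSTART_DELAY;
			tp->snd_ssthresh = tp->snd_cwnd;		/* leave slow start */
		}
	}
}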

When the sender receives an ACK, if the connection is not currently in the Recovery or Loss state and the ACK acknowledges data normally, the cong_avoid path is entered; there, the relation between snd_cwnd and snd_ssthresh decides how snd_cwnd grows;

static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);

	if (!tcp_is_cwnd_limited(sk))
		return;

	if (tp->snd_cwnd <= tp->snd_ssthresh) {
		if (hystart && after(ack, ca->end_seq))
			bictcp_hystart_reset(sk);
		//a non-zero return from tcp_slow_start() means the grown cwnd crossed snd_ssthresh;
		//in the normal slow-start case it returns 0: snd_cwnd += acked, i.e. cwnd grows by one per ACKed packet
		acked = tcp_slow_start(tp, acked);
		if (!acked)
			return;
	}
	//reaching this point means slow start is over and we are in congestion avoidance
	//bictcp_update() first computes the threshold ca->cnt: how many ACKed packets it takes to grow cwnd by one
	bictcp_update(ca, tp->snd_cwnd, acked);
	//then the congestion window is advanced accordingly
	tcp_cong_avoid_ai(tp, ca->cnt, acked);
}
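
For reference, the two generic helpers called above look roughly as follows in net/ipv4/tcp_cong.c of this kernel generation: tcp_slow_start() grows cwnd by the number of newly ACKed packets but caps it at snd_ssthresh and returns the unused credit, and tcp_cong_avoid_ai() adds one to cwnd for every w (here ca->cnt) ACKed packets:

u32 tcp_slow_start(struct tcp_sock *tp, u32 acked)
{
	u32 cwnd = min(tp->snd_cwnd + acked, tp->snd_ssthresh);

	acked -= cwnd - tp->snd_cwnd;		/* credit left after hitting ssthresh */
	tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);

	return acked;
}

void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked)
{
	/* If credit accumulated at a higher w, apply it gently now. */
	if (tp->snd_cwnd_cnt >= w) {
		tp->snd_cwnd_cnt = 0;
		tp->snd_cwnd++;
	}

	tp->snd_cwnd_cnt += acked;
	if (tp->snd_cwnd_cnt >= w) {
		u32 delta = tp->snd_cwnd_cnt / w;

		tp->snd_cwnd_cnt -= delta * w;
		tp->snd_cwnd += delta;
	}
	tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_cwnd_clamp);
}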

Fast recovery:

When packets are reordered, the receiver replies with duplicate ACKs. When the sender receives a dubious or duplicate ACK, it calls tcp_fastretrans_alert() to adjust the congestion state; there, if the connection is currently in the Open state and the number of out-of-order segments has exceeded the reordering threshold, TCP concludes that packets have been lost and enters fast recovery via tcp_enter_recovery();

On entering fast recovery, CUBIC first drops snd_ssthresh to 717/1024 (about 0.7) of the current congestion window; the congestion window itself is cut to the number of packets currently on the wire plus one (tcp_packets_in_flight() + 1) and is then rebuilt gradually by proportional rate reduction, implemented in tcp_cwnd_reduction() further below;
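
The 717/1024 factor comes from CUBIC's ssthresh callback (sketch based on net/ipv4/tcp_cubic.c, where beta = 717 and BICTCP_BETA_SCALE = 1024):

static u32 bictcp_recalc_ssthresh(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);

	ca->epoch_start = 0;	/* end of epoch */

	/* Remember Wmax; with fast convergence, release some room early */
	if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
		ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
			/ (2 * BICTCP_BETA_SCALE);
	else
		ca->last_max_cwnd = tp->snd_cwnd;

	ca->loss_cwnd = tp->snd_cwnd;

	/* beta = 717, BICTCP_BETA_SCALE = 1024, i.e. ssthresh ~ 0.7 * cwnd */
	return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
}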

void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int sndcnt = 0;
	int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);

	if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd))
		return;

	tp->prr_delivered += newly_acked_sacked;
	if (delta < 0) {
		u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
			       tp->prior_cwnd - 1;
		sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
	} else if ((flag & FLAG_RETRANS_DATA_ACKED) &&
		   !(flag & FLAG_LOST_RETRANS)) {
		sndcnt = min_t(int, delta,
			       max_t(int, tp->prr_delivered - tp->prr_out,
				     newly_acked_sacked) + 1);
	} else {
		sndcnt = min(delta, newly_acked_sacked);
	}
	/* Force a fast retransmit upon entering fast recovery */
	sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1));
	tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
}

Exiting fast recovery:

When entering fast recovery, TCP saves the snd_nxt of that moment in high_seq. On every ACK, if the connection is not in the Open state, tcp_fastretrans_alert() runs; there, once snd_una is no longer below high_seq, all data that was outstanding when recovery started has been acknowledged and no holes remain, so the connection can leave fast recovery and return to the Open state;

static void tcp_fastretrans_alert(struct sock *sk, const int acked,
				  const int prior_unsacked,
				  bool is_dupack, int flag)
{
	...
	/* D. Check state exit conditions. State can be terminated
	 *    when high_seq is ACKed. */
	if (icsk->icsk_ca_state == TCP_CA_Open) {
		WARN_ON(tp->retrans_out != 0);
		tp->retrans_stamp = 0;
	} else if (!before(tp->snd_una, tp->high_seq)) {  
		/* high_seq is the snd_nxt value recorded when recovery was entered; once
		   snd_una has caught up with high_seq, the previously outstanding data has
		   been ACKed, so the cases below decide whether recovery can be exited */
		switch (icsk->icsk_ca_state) {
		case TCP_CA_CWR:
			/* CWR is to be held something *above* high_seq
			 * is ACKed for CWR bit to reach receiver. */
			if (tp->snd_una != tp->high_seq) {
				tcp_end_cwnd_reduction(sk);
				tcp_set_ca_state(sk, TCP_CA_Open);
			}
			break;

		case TCP_CA_Recovery:
			if (tcp_is_reno(tp))
				tcp_reset_reno_sack(tp);
			if (tcp_try_undo_recovery(sk))
				return;
			tcp_end_cwnd_reduction(sk);
			break;
		}
	}
	...
}

When TCP leaves fast recovery through the undo path, snd_cwnd becomes the larger of its current value and the value saved when recovery was entered, and snd_ssthresh is likewise restored to the saved value if that one is larger;

static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (unmark_loss) {
		struct sk_buff *skb;

		tcp_for_write_queue(skb, sk) {
			if (skb == tcp_send_head(sk))
				break;
			TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
		}
		tp->lost_out = 0;
		tcp_clear_all_retrans_hints(tp);
	}

	if (tp->prior_ssthresh) {
		const struct inet_connection_sock *icsk = inet_csk(sk);

		if (icsk->icsk_ca_ops->undo_cwnd)
			//set the cwnd to use after leaving recovery
			tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
		else
			tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1);

		//set the snd_ssthresh to use after leaving recovery
		if (tp->prior_ssthresh > tp->snd_ssthresh) {
			tp->snd_ssthresh = tp->prior_ssthresh;
			TCP_ECN_withdraw_cwr(tp);
		}
	} else {
		tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh);
	}
	tp->snd_cwnd_stamp = tcp_time_stamp;
	tp->undo_marker = 0;
}

The icsk->icsk_ca_ops->undo_cwnd callback above ultimately ends up in bictcp_undo_cwnd;

static u32 bictcp_undo_cwnd(struct sock *sk)
{
	struct bictcp *ca = inet_csk_ca(sk);

	//when leaving Loss or Recovery, snd_cwnd becomes the larger of the current snd_cwnd and the snd_cwnd recorded on entering Loss/Recovery
	return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
}

Fast retransmit:

The condition that triggers fast retransmit depends on which loss-detection mechanism is in use:

1) reno

Three duplicate ACKs have been received;

static void tcp_fastretrans_alert(struct sock *sk, const int acked,
				  const int prior_unsacked,
				  bool is_dupack, int flag)
{
	...
	/* E. Process state. */
	switch (icsk->icsk_ca_state) {
	case TCP_CA_Recovery:
		if (!(flag & FLAG_SND_UNA_ADVANCED)) {
			if (tcp_is_reno(tp) && is_dupack)
				tcp_add_reno_sack(sk);
		} else {
			if (tcp_try_undo_partial(sk, acked, prior_unsacked))
				return;
			/* Partial ACK arrived. Force fast retransmit. */
			do_lost = tcp_is_reno(tp) ||
				  tcp_fackets_out(tp) > tp->reordering;
		}
		if (tcp_try_undo_dsack(sk)) {
			tcp_try_keep_open(sk);
			return;
		}
		break;
	case TCP_CA_Loss:
		tcp_process_loss(sk, flag, is_dupack);
		if (icsk->icsk_ca_state != TCP_CA_Open)
			return;
		/* Fall through to processing in Open state. */
	default:
		if (tcp_is_reno(tp)) {
			if (flag & FLAG_SND_UNA_ADVANCED)
				tcp_reset_reno_sack(tp);
			if (is_dupack)
				//with plain reno, every duplicate ACK increments sacked_out; once
				//sacked_out reaches the reordering threshold (3), fast retransmit is triggered
				tcp_add_reno_sack(sk);
		}
	...
}
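
The decision to enter Recovery is then made by tcp_time_to_recover(); condensed (several additional heuristics omitted), the classic rule compares the duplicate-ACK/SACK count against the reordering threshold:

/* fackets_out when FACK is in use, otherwise sacked_out + 1 (the +1 accounts
 * for the segment the duplicate ACKs are pointing at).
 */
static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
{
	return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;
}

static bool tcp_time_to_recover(struct sock *sk, int flag)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* Trick#1: the loss is already proven (e.g. segments marked lost by RACK). */
	if (tp->lost_out)
		return true;

	/* Classic rule: enough duplicate ACKs / SACKed segments have arrived. */
	if (tcp_dupack_heuristics(tp) > tp->reordering)
		return true;

	/* ... further heuristics omitted ... */
	return false;
}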

Verification:

Build the packet sequence with packetdrill and watch the connection state with ss:

Disable rack: echo 0 > /proc/sys/net/ipv4/tcp_recovery

Disable fack: echo 0 > /proc/sys/net/ipv4/tcp_fack

Disable sack: echo 0 > /proc/sys/net/ipv4/tcp_sack

packetdrill script:

// Test fast retransmit with 4 packets outstanding, receiver sending SACKs.
// In this variant the receiver supports SACK.

// Establish a connection.
0   socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0  setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0

+0  bind(3, ..., ...) = 0
+0  listen(3, 1) = 0

+0  < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+0  > S. 0:0(0) ack 1 <...>

+.1 < . 1:1(0) ack 1 win 257
+0  accept(3, ..., ...) = 4

// Send 1 data segment and get an ACK, so cwnd is now 4.
+0  write(4, ..., 1000) = 1000
+0  > P. 1:1001(1000) ack 1

+.1 < . 1:1(0) ack 1001 win 257

// Write 10 data segments.
+0  write(4, ..., 10000) = 10000
+0  > P. 1001:10001(9000) ack 1

//three duplicate ACKs arrive in a row
+.1 < . 1:1(0) ack 1001 win 257 <sack 3001:4001,nop,nop>
+.1 < . 1:1(0) ack 1001 win 257 <sack 4001:5001,nop,nop>
+.1 < . 1:1(0) ack 1001 win 257 <sack 5001:6001,nop,nop>

//the 1001:2001 segment is retransmitted
+.0  > . 1001:2001(1000) ack 1

// Receiver ACKs all data.
//once all segments are acknowledged, the connection can be closed
+1 < . 1:1(0) ack 10001 win 257

Capturing the ss output at short intervals shows that while fewer than three duplicate ACKs have arrived, TCP stays in the disorder state (1---open); when the third duplicate ACK comes in, the state switches to fast recovery, the first unacknowledged segment is marked lost, and it is retransmitted;

2) sack

With SACK, once the number of SACKed segments reaches the reordering threshold (default 3), TCP starts marking segments as lost. Marking starts from the head of the write queue and continues until the skb being marked has three SACKed segments to its right. For example, if 10 skbs are sent and the 4th, 7th, 9th and 10th are SACKed, TCP ends up with fackets_out = 10, sacked_out = 4 and lost_out = 5;

3) fack

Enable fack: echo 1 > /proc/sys/net/ipv4/tcp_fack

FACK is a further refinement of SACK. With FACK the number of SACKed segments no longer has to exceed the reordering threshold; FACK only cares whether the distance between the highest SACKed segment and snd_una exceeds the reordering threshold. When it does, loss marking starts from the head of the write queue and continues until the skb being marked is within reordering - 1 of the highest SACKed skb;

For example, if 10 skbs are sent and the 4th and 9th are SACKed, then fackets_out = 9 and lost_out = 5;
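
The reno/SACK/FACK differences described above are all driven by tcp_update_scoreboard(), which decides how much of the head of the write queue to hand to tcp_mark_head_lost() (sketch based on kernels that still carry FACK):

static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (tcp_is_reno(tp)) {
		/* plain reno: only the head segment is assumed lost */
		tcp_mark_head_lost(sk, 1, 1);
	} else if (tcp_is_fack(tp)) {
		/* FACK: everything more than 'reordering' below the highest SACK */
		int lost = tp->fackets_out - tp->reordering;
		if (lost <= 0)
			lost = 1;
		tcp_mark_head_lost(sk, lost, 0);
	} else {
		/* SACK: require 'reordering' SACKed segments above the marked one */
		int sacked_upto = tp->sacked_out - tp->reordering;
		if (sacked_upto >= 0)
			tcp_mark_head_lost(sk, sacked_upto, 0);
		else if (fast_rexmit)
			tcp_mark_head_lost(sk, 1, 1);
	}
}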

 

4) rack

Enable rack: echo 1 > /proc/sys/net/ipv4/tcp_recovery

RACK goes one step beyond FACK: it uses each skb's transmission time to decide whether it has been lost. Every time TCP sends an skb, it stamps it with the send time. When a SACK arrives, RACK treats any segment that was sent before the SACKed one, and whose extra age relative to that segment exceeds a small reordering window, as lost; the connection then enters fast recovery and retransmits the segments marked lost.

When TCP transmits a segment, it records the current send time:

static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
			    gfp_t gfp_mask)
{
	...
	if (clone_it) {
		//stamp the segment with its transmission time
		skb_mstamp_get(&skb->skb_mstamp);
		TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
			- tp->snd_una;
		tcp_rate_skb_sent(sk, skb);

		if (unlikely(skb_cloned(skb)))
			skb = pskb_copy(skb, gfp_mask);
		else
			skb = skb_clone(skb, gfp_mask);
		if (unlikely(!skb))
			return -ENOBUFS;
	}
	...
}

When the sender receives a SACK and tags the corresponding skbs, it advances the RACK state:

static u8 tcp_sacktag_one(struct sock *sk,
			  struct tcp_sacktag_state *state, u8 sacked,
			  u32 start_seq, u32 end_seq,
			  int dup_sack, int pcount,
			  const struct skb_mstamp *xmit_time)
{
	...
	if (!(sacked & TCPCB_SACKED_ACKED)) {
		tcp_rack_advance(tp, sacked, end_seq, xmit_time);
	...
}

The RACK update records the transmission time and sequence number of the most recently sent segment that has been SACKed:

/* Called when the sender processes a SACK: remember the send time and
 * sequence number of the SACKed segment */
void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
		      const struct skb_mstamp *xmit_time)
{
	u32 rtt_us;

	//if this skb was sent earlier (or carries a lower sequence) than the one RACK currently remembers, ignore it
	if (tp->rack.mstamp.v64 &&
	    !tcp_rack_sent_after(xmit_time, &tp->rack.mstamp,
				 end_seq, tp->rack.end_seq))
		return;

	rtt_us = skb_mstamp_us_delta(&tp->tcp_mstamp, xmit_time);
	if (sacked & TCPCB_RETRANS) {
		/* If the sacked packet was retransmitted, it's ambiguous
		 * whether the retransmission or the original (or the prior
		 * retransmission) was sacked.
		 *
		 * If the original is lost, there is no ambiguity. Otherwise
		 * we assume the original can be delayed up to aRTT + min_rtt.
		 * the aRTT term is bounded by the fast recovery or timeout,
		 * so it's at least one RTT (i.e., retransmission is at least
		 * an RTT later).
		 */
		if (rtt_us < tcp_min_rtt(tp))
			return;
	}
	//record the SACKed segment's transmission time and sequence number
	tp->rack.rtt_us = rtt_us;
	tp->rack.mstamp = *xmit_time;
	tp->rack.end_seq = end_seq;
	tp->rack.advanced = 1;
}
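
Both tcp_rack_advance() and tcp_rack_detect_loss() compare transmission times with tcp_rack_sent_after(), which falls back to comparing sequence numbers when the two timestamps are equal (helper from net/ipv4/tcp_recovery.c of this era):

static bool tcp_rack_sent_after(const struct skb_mstamp *t1,
				const struct skb_mstamp *t2,
				u32 seq1, u32 seq2)
{
	return skb_mstamp_after(t1, t2) ||
	       (t1->v64 == t2->v64 && after(seq1, seq2));
}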

If RACK is enabled, tcp_fastretrans_alert() enters the RACK loss-marking path, tcp_rack_identify_loss();

There, the transmission timestamps of the SACKed segments are used to decide whether segments that have not been SACKed are lost. Marking a segment lost increments tp->lost_out, so when control returns to tcp_fastretrans_alert(), tcp_time_to_recover() sees a non-zero lost_out and the connection enters the Recovery state. If no segment qualifies yet, a timer is armed with the remaining time, and when it expires the detection is run again.

static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	u32 reo_wnd;

	*reo_timeout = 0;
	/* To be more reordering resilient, allow min_rtt/4 settling delay
	 * (lower-bounded to 1000uS). We use min_rtt instead of the smoothed
	 * RTT because reordering is often a path property and less related
	 * to queuing or delayed ACKs.
	 */
	reo_wnd = 1000;
	if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U)
		reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd);

	tcp_for_write_queue(skb, sk) {
		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);

		if (skb == tcp_send_head(sk))
			break;

		/* Skip ones already (s)acked */
		if (!after(scb->end_seq, tp->snd_una) ||
		    scb->sacked & TCPCB_SACKED_ACKED)
			continue;

		//tp->rack.mstamp is the send time of the SACKed segment and tp->rack.end_seq its end sequence;
		//if this skb was sent earlier than the SACKed one (or has a lower sequence), it may have been lost
		if (tcp_rack_sent_after(&tp->rack.mstamp, &skb->skb_mstamp,
					tp->rack.end_seq, scb->end_seq)) {
			/* Step 3 in draft-cheng-tcpm-rack-00.txt:
			 * A packet is lost if its elapsed time is beyond
			 * the recent RTT plus the reordering window.
			 */
			//tp->tcp_mstamp is the timestamp taken when the latest (S)ACK was processed;
			//elapsed is how long ago this skb was sent
			u32 elapsed = skb_mstamp_us_delta(&tp->tcp_mstamp,
							  &skb->skb_mstamp);
			//tp->rack.rtt_us is the RTT measured for the SACKed segment; if a later
			//skb has already been SACKed while this older one has not, and the extra
			//time exceeds the reordering window reo_wnd (the larger of 1000us and
			//min_rtt/4), the older skb is assumed lost and is marked accordingly
			s32 remaining = tp->rack.rtt_us + reo_wnd - elapsed;

			if (remaining < 0) {
				tcp_rack_mark_skb_lost(sk, skb);
				continue;
			}

			/* Skip ones marked lost but not yet retransmitted */
			if ((scb->sacked & TCPCB_LOST) &&
			    !(scb->sacked & TCPCB_SACKED_RETRANS))
				continue;
			//not lost yet: record the remaining wait; a reordering timer will be armed and its handler re-enters tcp_rack_detect_loss() to check again
			/* Record maximum wait time (+1 to avoid 0) */
			*reo_timeout = max_t(u32, *reo_timeout, 1 + remaining);

		} else if (!(scb->sacked & TCPCB_RETRANS)) {
			/* Original data are sent sequentially so stop early
			 * b/c the rest are all sent after rack_sent
			 */
			break;
		}
	}
}

Verification:

Enable rack: echo 1 > /proc/sys/net/ipv4/tcp_recovery

Send one segment, wait a little, then send a second one; packetdrill then SACKs the second segment directly. At that point ss shows that the sender has marked one segment lost and entered the recovery state, which means the loss was marked by RACK rather than by an RTO;

packetdrill script:

// Establish a connection.
0   socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0  setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0

+0  bind(3, ..., ...) = 0
+0  listen(3, 1) = 0

+0  < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+0  > S. 0:0(0) ack 1 <...>

+.8 < . 1:1(0) ack 1 win 257
+0  accept(3, ..., ...) = 4

// send the 1st skb
+0  write(4, ..., 1000) = 1000
+0  > P. 1:1001(1000) ack 1

//the 2nd skb is sent after a delay
+.1  write(4, ..., 1000) = 1000

//the second skb is SACKed
+0 < . 1:1(0) ack 1 win 257 <sack 1001:2001, nop,nop>

Loss state

When the RTO timer fires, the connection enters the Loss state (tcp_enter_loss): snd_ssthresh is adjusted first, then the congestion window is dropped to 1;

void tcp_enter_loss(struct sock *sk)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct net *net = sock_net(sk);
	struct sk_buff *skb;
	bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
	bool is_reneg;			/* is receiver reneging on SACKs? */
	bool mark_lost;

	/* Reduce ssthresh if it has not yet been made inside this window. */
	if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
	    !after(tp->high_seq, tp->snd_una) ||
	    (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
		tp->prior_ssthresh = tcp_current_ssthresh(sk);
		//entering loss: adjust the slow-start threshold snd_ssthresh
		tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
		tcp_ca_event(sk, CA_EVENT_LOSS);
		tcp_init_undo(tp);
	}
	//after entering loss, drop the congestion window cwnd to 1
	tp->snd_cwnd	   = 1;
	tp->snd_cwnd_cnt   = 0;
	tp->snd_cwnd_stamp = tcp_time_stamp;
	...
}

 
