TCP bbr简介

1、背景

现有的拥塞控制算法(如cubic)基于丢包检测,存在以下问题:

1)、网络设备buffer大,导致bufferbloat:只要不丢包,发送端就会持续加大发送量,这样就容易把网络设备的buffer填满,导致延时增加;

2)、网络设备buffer小,容易丢包,拥塞算法根据丢包控制发包速率,导致整体带宽吞吐小;

2、BBR四个阶段流程

start up

进入start up有两个时机:

1)、初始化

bbr_init,先将bbr->mode初始化成BBR_STARTUP;

/* Initialize BBR state for a socket: reset to the startup mode and request
 * pacing for this socket.
 */
static void bbr_init(struct sock *sk)
{
	/* Helper is not shown in this excerpt; presumably it sets
	 * bbr->mode = BBR_STARTUP -- confirm against tcp_bbr.c.
	 */
	bbr_reset_startup_mode(sk);

	/* Compare-and-exchange: move sk_pacing_status from SK_PACING_NONE to
	 * SK_PACING_NEEDED, leaving any other existing value untouched.
	 */
	cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED);
}

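2)、收到CA_EVENT_TX_START事件(连接空闲后重新开始发送)并且处于app_limited状态时,如果当前处于probe rtt阶段且probe rtt的持续时间已经到期,则通过bbr_check_probe_rtt_done退出probe rtt:如果还没探测到最大带宽,则重新进入start up阶段,否则进入probe bw阶段;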

/* Congestion-control event callback. The only case handled here is
 * CA_EVENT_TX_START on an app-limited connection, i.e. transmission is
 * restarting after the flow went idle; every other event is ignored.
 */
static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);

	if (event == CA_EVENT_TX_START && tp->app_limited) {
		/* Remember that we are restarting from idle. */
		bbr->idle_restart = 1;
		/* Avoid pointless buffer overflows: pace at est. bw if we don't
		 * need more speed (we're restarting from idle and app-limited).
		 */
		// In PROBE_BW: set the pacing rate from the current bw estimate with a
		// gain of BBR_UNIT (1.0), so the restart does not send faster than that.
		if (bbr->mode == BBR_PROBE_BW)
			bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT);
		else if (bbr->mode == BBR_PROBE_RTT)
			// In PROBE_RTT: let bbr_check_probe_rtt_done() decide whether the
			// PROBE_RTT period has expired and, if so, leave PROBE_RTT.
			// NOTE(review): the next mode (STARTUP if full bw was never reached,
			// otherwise PROBE_BW) is chosen by bbr_reset_mode(), not shown here.
			bbr_check_probe_rtt_done(sk);
	}
}

在start up阶段,bbr会以较大的pacing gain和cwnd gain发送数据包;

/* Select the pacing and cwnd gains for the current BBR mode.
 * Excerpt: only the BBR_STARTUP case is shown.
 */
static void bbr_update_gains(struct sock *sk)
{
	struct bbr *bbr = inet_csk_ca(sk);

	switch (bbr->mode) {
	case BBR_STARTUP:
		/* Both gains use the high gain while starting up.
		 * NOTE(review): bbr_high_gain is defined elsewhere; upstream it is
		 * about 2.89x (2/ln 2), not 2.95x -- confirm for your kernel.
		 */
		bbr->pacing_gain = bbr_high_gain;
		bbr->cwnd_gain	 = bbr_high_gain;
		break;
	/* ... remaining modes omitted from this excerpt ... */
	}
}

drain排空

在start up阶段,以较大的pacing、cwnd速率发送消息包,因此探测到的bw也越来越大,当bbr发现已经探测到最大带宽时,此时链路上的设备buffer已经被填充满,开始进入drain阶段,将链路设备的buffer排空;

/* Handle the STARTUP -> DRAIN and DRAIN -> PROBE_BW transitions.
 * @sk: socket whose BBR state is examined
 * @rs: rate sample for the current ACK (not read in this function body)
 */
static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs)
{
	struct bbr *bbr = inet_csk_ca(sk);

	/* Leave STARTUP once bbr_full_bw_reached() reports true, and set
	 * snd_ssthresh to the in-flight target computed at a gain of BBR_UNIT.
	 */
	if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) {
		bbr->mode = BBR_DRAIN;	/* drain queue we created */
		tcp_sk(sk)->snd_ssthresh =
				bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT);
	}	/* fall through to check if in-flight is already small: */
	/* In DRAIN, switch to PROBE_BW as soon as packets in flight are at or
	 * below that same in-flight target.
	 */
	if (bbr->mode == BBR_DRAIN &&
	    tcp_packets_in_flight(tcp_sk(sk)) <=
	    bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT))
		bbr_reset_probe_bw_mode(sk);  /* we estimate queue is drained */
}

在drain排空阶段,bbr会控制减少pacing的速率;

/* Select the pacing and cwnd gains for the current BBR mode.
 * Excerpt: only the BBR_DRAIN case is shown.
 */
static void bbr_update_gains(struct sock *sk)
{
	struct bbr *bbr = inet_csk_ca(sk);

	switch (bbr->mode) {
	/* ... other modes omitted from this excerpt ... */
	case BBR_DRAIN:
		/* Pace slowly so the queue drains.
		 * NOTE(review): bbr_drain_gain is defined elsewhere; the original
		 * note gives it as roughly 1/2.9 -- confirm against tcp_bbr.c.
		 */
		bbr->pacing_gain = bbr_drain_gain;	/* slow, to drain */
		bbr->cwnd_gain	 = bbr_high_gain;	/* keep cwnd */
		break;
	}
}

等到检测到in_flight的数据包个数少于BDP时,进入probe_bw阶段。

/* Handle the STARTUP -> DRAIN and DRAIN -> PROBE_BW transitions.
 * @sk: socket whose BBR state is examined
 * @rs: rate sample for the current ACK (not read in this function body)
 */
static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs)
{
	struct bbr *bbr = inet_csk_ca(sk);

	/* STARTUP ends once bbr_full_bw_reached() reports true. */
	if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) {
		bbr->mode = BBR_DRAIN;	/* drain queue we created */
		tcp_sk(sk)->snd_ssthresh =
				bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT);
	}	/* fall through to check if in-flight is already small: */
	/* DRAIN ends when packets in flight are <= the in-flight target
	 * computed from the max bw at a gain of BBR_UNIT (1.0).
	 */
	if (bbr->mode == BBR_DRAIN &&
	    tcp_packets_in_flight(tcp_sk(sk)) <=
	    bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT))
		bbr_reset_probe_bw_mode(sk);  /* we estimate queue is drained */
}

Probe bw

如何探测?

进入probe_bw阶段,会使用以下的pacing增益系数不断循环探测;

/* Pacing gains, expressed as multiples of BBR_UNIT, for the eight phases of
 * the gain cycle: one phase at 5/4, one at 3/4, then six at 1.0.
 */
static const int bbr_pacing_gain[] = {
	BBR_UNIT * 5 / 4,	/* probe for more available bw */
	BBR_UNIT * 3 / 4,	/* drain queue and/or yield bw to other flows */
	BBR_UNIT, BBR_UNIT, BBR_UNIT,	/* cruise at 1.0*bw to utilize pipe, */
	BBR_UNIT, BBR_UNIT, BBR_UNIT	/* without creating excess queue... */
};

当处在某个phase下,判断当前满足条件,进入下一个phase;

/* Decide whether the current gain-cycle phase should end.
 * @sk: socket being processed
 * @rs: rate sample for the current ACK (prior_in_flight and losses are read)
 * Returns true when it is time to advance to the next pacing gain.
 */
static bool bbr_is_next_cycle_phase(struct sock *sk,
				    const struct rate_sample *rs)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);
	/* True once more than min_rtt_us has elapsed since cycle_mstamp. */
	bool is_full_length =
		tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) >
		bbr->min_rtt_us;
	u32 inflight, bw;

	/* The pacing_gain of 1.0 paces at the estimated bw to try to fully
	 * use the pipe without increasing the queue.
	 */
	// Gain == 1.0: the phase ends purely on elapsed time (more than one min_rtt).
	if (bbr->pacing_gain == BBR_UNIT)
		return is_full_length;		/* just use wall clock time */

	inflight = rs->prior_in_flight;  /* what was in-flight before ACK? */
	bw = bbr_max_bw(sk);

	/* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at
	 * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is
	 * small (e.g. on a LAN). We do not persist if packets are lost, since
	 * a path with small buffers may not hold that much.
	 */
	// Gain > 1.0: the phase ends when more than one min_rtt has elapsed AND
	// (losses were seen OR inflight reached the in-flight target computed at
	// this gain). The intent of the positive gain is to push link utilization up.
	if (bbr->pacing_gain > BBR_UNIT)
		return is_full_length &&
			(rs->losses ||  /* perhaps pacing_gain*BDP won't fit */
			 inflight >= bbr_inflight(sk, bw, bbr->pacing_gain));

	/* A pacing_gain < 1.0 tries to drain extra queue we added if bw
	 * probing didn't find more bw. If inflight falls to match BDP then we
	 * estimate queue is drained; persisting would underutilize the pipe.
	 */
	// Gain < 1.0: the phase ends when more than one min_rtt has elapsed OR
	// inflight has fallen to the in-flight target computed at a gain of 1.0,
	// whichever happens first. The intent is to give back queue/bandwidth.
	return is_full_length ||
		inflight <= bbr_inflight(sk, bw, BBR_UNIT);
}

探测何时结束?

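在probe_bw模式下,经过若干轮按phase定义的不同pacing增益发包后,bbr同时进行long-term(lt)带宽采样:从链路开始出现丢包起进入采样区间,当区间内的丢包率达到阈值(bbr_lt_loss_thresh)后,计算该区间的平均投递速率bw;如果连续两个采样区间得到的bw在固定误差范围内,则认为链路上存在限速(见代码注释"estimate we're policed"),把lt_bw设置为二者的平均值,然后进入lt_use_bw状态,并将pacing_gain固定为1倍以尽量避免丢包;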

/* Finish one long-term (lt) bw sampling interval.
 * @sk: socket being processed
 * @bw: delivery rate measured over the interval that just ended
 *
 * If a previous interval's lt_bw exists and the new bw is close to it, start
 * using the averaged long-term rate. Otherwise remember this bw and begin a
 * new sampling interval.
 */
static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw)
{
	struct bbr *bbr = inet_csk_ca(sk);
	u32 diff;

	// "Close" means either:
	//   - the relative difference is at most bbr_lt_bw_ratio / BBR_UNIT of the
	//     previous lt_bw (1/8 per the original note; constant defined elsewhere), or
	//   - the absolute difference, converted to bytes/sec, is at most bbr_lt_bw_diff.
	// On a match: lt_bw becomes the average of the two intervals, lt_use_bw is
	// set, pacing_gain is pinned to BBR_UNIT (1.0) and the lt round counter is
	// cleared.
	if (bbr->lt_bw) {  /* do we have bw from a previous interval? */
		/* Is new bw close to the lt_bw from the previous interval? */
		diff = abs(bw - bbr->lt_bw);
		if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) ||
		    (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <=
		     bbr_lt_bw_diff)) {
			/* All criteria are met; estimate we're policed. */
			bbr->lt_bw = (bw + bbr->lt_bw) >> 1;  /* avg 2 intvls */
			bbr->lt_use_bw = 1;
			bbr->pacing_gain = BBR_UNIT;  /* try to avoid drops */
			bbr->lt_rtt_cnt = 0;
			return;
		}
	}
	/* No previous bw, or no match: keep this bw for the next comparison. */
	bbr->lt_bw = bw;
	bbr_reset_lt_bw_sampling_interval(sk);
}

进入lt_use_bw状态后,当经过固定周期(bbr_lt_bw_max_rtts),probe_bw模式重新生效;

/* Long-term (lt) bw sampling, run per rate sample.
 * @sk: socket being processed
 * @rs: rate sample for the current ACK (losses and is_app_limited are read)
 *
 * While lt_use_bw is set, only counts rounds until the long-term rate expires.
 * Otherwise it manages a loss-delimited sampling interval and, when the
 * interval qualifies, hands its average delivery rate to
 * bbr_lt_bw_interval_done().
 */
static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);
	u32 lost, delivered;
	u64 bw;
	u32 t;

	// While lt_use_bw is set and the mode is PROBE_BW, count round starts; after
	// bbr_lt_bw_max_rtts rounds, stop using lt_bw and restart PROBE_BW gain
	// cycling. Note that mode == BBR_PROBE_BW does not by itself mean gain
	// cycling is active: while lt_use_bw is set this function returns early
	// without sampling.
	if (bbr->lt_use_bw) {	/* already using long-term rate, lt_bw? */
		if (bbr->mode == BBR_PROBE_BW && bbr->round_start &&
		    ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) {
			bbr_reset_lt_bw_sampling(sk);    /* stop using lt_bw */
			bbr_reset_probe_bw_mode(sk);  /* restart gain cycling */
		}
		return;
	}

	/* Wait for the first loss before sampling, to let the policer exhaust
	 * its tokens and estimate the steady-state rate allowed by the policer.
	 * Starting samples earlier includes bursts that over-estimate the bw.
	 */
	if (!bbr->lt_is_sampling) {
		if (!rs->losses)
			return;
		bbr_reset_lt_bw_sampling_interval(sk);
		bbr->lt_is_sampling = true;
	}

	// An app-limited sample aborts the current sampling attempt entirely.
	/* To avoid underestimates, reset sampling if we run out of data. */
	if (rs->is_app_limited) {
		bbr_reset_lt_bw_sampling(sk);
		return;
	}

	if (bbr->round_start)
		bbr->lt_rtt_cnt++;	/* count round trips in this interval */
	if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts)
		return;		/* sampling interval needs to be longer */
	if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) {
		bbr_reset_lt_bw_sampling(sk);  /* interval is too long */
		return;
	}

	/* End sampling interval when a packet is lost, so we estimate the
	 * policer tokens were exhausted. Stopping the sampling before the
	 * tokens are exhausted under-estimates the policed rate.
	 */
	// A bw is only computed on a sample that reports losses.
	if (!rs->losses)
		return;

	/* Calculate packets lost and delivered in sampling interval. */
	lost = tp->lost - bbr->lt_last_lost;
	delivered = tp->delivered - bbr->lt_last_delivered;
	/* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */

	// Require lost/delivered >= bbr_lt_loss_thresh / 2^BBR_SCALE.
	// NOTE(review): the original note puts this threshold at about 20%; the
	// constant is defined elsewhere -- confirm.
	if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered)
		return;

	/* Find average delivery rate in this sampling interval. */
	t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp;
	if ((s32)t < 1)
		return;		/* interval is less than one ms, so wait */
	/* Check if can multiply without overflow */
	if (t >= ~0U / USEC_PER_MSEC) {
		bbr_reset_lt_bw_sampling(sk);  /* interval too long; reset */
		return;
	}
	/* t was in ms; convert to us, then bw = delivered * BW_UNIT / t. */
	t *= USEC_PER_MSEC;
	bw = (u64)delivered * BW_UNIT;
	do_div(bw, t);
	bbr_lt_bw_interval_done(sk, bw);
}

Delivered速率怎么计算?

如上图,t1时刻发送序列号seq=2,t2时刻发送3号skb,t3时刻发送4号skb,然后t4时刻ack4号skb;bbr在计算带宽时的原理就是计算某个skb从发送到被ack时一共delivered了多少个数据以及用了的时间interval_us,然后两者相除就得到当前的带宽值;

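由于存在延时ack、ack压缩等因素,计算出来的带宽值可能会有偏差,因此bbr在计算interval_us时会同时计算发送阶段耗时send_us和ack阶段耗时ack_us,然后取两者中的较大值(内核tcp_rate_gen中为max(snd_us, ack_us)),以避免高估带宽,因此bw的计算方法如下:

delivered = tp->delivered - prior_delivered(prior_delivered为该skb发送时记录的tp->delivered)

interval_us = max(send_us, ack_us)

bw = delivered / interval_us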

Probe rtt

每隔10s,bbr都会进行一轮min_rtt探测,探测周期为200ms,在probe_rtt阶段,bbr会降低发包速率,保证链路不会出现拥堵;

/* Select the pacing and cwnd gains for the current BBR mode.
 * Excerpt: only the BBR_PROBE_RTT case and the default are shown.
 */
static void bbr_update_gains(struct sock *sk)
{
	struct bbr *bbr = inet_csk_ca(sk);

	switch (bbr->mode) {
	case BBR_PROBE_RTT:
		/* Both gains are exactly 1.0 while probing for min RTT. */
		bbr->pacing_gain = BBR_UNIT;
		bbr->cwnd_gain	 = BBR_UNIT;
		break;
	default:
		/* Unknown mode: warn once and leave the gains unchanged. */
		WARN_ONCE(1, "BBR bad mode: %u\n", bbr->mode);
		break;
	}
}
/* Maintain the min RTT estimate and drive the PROBE_RTT mode.
 * @sk: socket being processed
 * @rs: rate sample for the current ACK (rtt_us, is_ack_delayed and delivered
 *      are read)
 */
static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);
	bool filter_expired;

	/* Track min RTT seen in the min_rtt_win_sec filter window: */
	// filter_expired: more than bbr_min_rtt_win_sec seconds have passed since
	// min_rtt_stamp (10 s per the original note; constant defined elsewhere).
	filter_expired = after(tcp_jiffies32,
			       bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ);
	// Take this sample as the new min_rtt when it is valid (>= 0) and either
	//   - it is no larger than the current min_rtt, or
	//   - the window has expired and the sample did not come from a delayed ACK.
	if (rs->rtt_us >= 0 &&
	    (rs->rtt_us <= bbr->min_rtt_us ||
	     (filter_expired && !rs->is_ack_delayed))) {
		bbr->min_rtt_us = rs->rtt_us;
		bbr->min_rtt_stamp = tcp_jiffies32;
	}

	// Window expired, not restarting from idle, and not already in PROBE_RTT:
	// enter PROBE_RTT, saving cwnd first so it can be restored on exit.
	if (bbr_probe_rtt_mode_ms > 0 && filter_expired &&
	    !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) {
		bbr->mode = BBR_PROBE_RTT;  /* dip, drain queue */
		bbr_save_cwnd(sk);  /* note cwnd so we can restore it */
		bbr->probe_rtt_done_stamp = 0;
	}

	if (bbr->mode == BBR_PROBE_RTT) {
		/* Ignore low rate samples during this mode. */
		/* GNU "?:" keeps the value non-zero: falls back to 1 when the sum is 0. */
		tp->app_limited =
			(tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
		/* Maintain min packets in flight for max(200 ms, 1 round). */
		// Once packets in flight have dropped to bbr_cwnd_min_target, arm the
		// deadline (now + bbr_probe_rtt_mode_ms) and start tracking one round.
		// After the deadline is armed, wait for a round start and then let
		// bbr_check_probe_rtt_done() test whether the deadline has passed.
		if (!bbr->probe_rtt_done_stamp &&
		    tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) {
			bbr->probe_rtt_done_stamp = tcp_jiffies32 +
				msecs_to_jiffies(bbr_probe_rtt_mode_ms);
			bbr->probe_rtt_round_done = 0;
			bbr->next_rtt_delivered = tp->delivered;
		} else if (bbr->probe_rtt_done_stamp) {
			if (bbr->round_start)
				bbr->probe_rtt_round_done = 1;
			if (bbr->probe_rtt_round_done)
				bbr_check_probe_rtt_done(sk);
		}
	}
	/* Restart after idle ends only once we process a new S/ACK for data */
	if (rs->delivered > 0)
		bbr->idle_restart = 0;
}

当一轮rtt探测结束后,通过bbr_reset_mode重新进入probe_bw或start up阶段;

/* Leave PROBE_RTT once its deadline has passed.
 * Does nothing unless probe_rtt_done_stamp is set and already in the past.
 */
static void bbr_check_probe_rtt_done(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);

	/* Deadline not armed yet, or not reached: stay in the current mode. */
	if (!(bbr->probe_rtt_done_stamp &&
	      after(tcp_jiffies32, bbr->probe_rtt_done_stamp)))
		return;

	bbr->min_rtt_stamp = tcp_jiffies32;  /* wait a while until PROBE_RTT */
	/* Restore cwnd to at least the value recorded in prior_cwnd. */
	tp->snd_cwnd = max(tp->snd_cwnd, bbr->prior_cwnd);
	/* NOTE(review): bbr_reset_mode() is not shown here; per the
	 * accompanying text it selects PROBE_BW or STARTUP -- confirm.
	 */
	bbr_reset_mode(sk);
}

BBR的输出

数据包ack后进入bbr模块处理,处理完成后,bbr模块会有两个输出,一个是根据bw设置的pacing速率,一个是根据bdp计算得到的cwnd;

Pacing速率

Pacing处理

pacing速率的计算:

bbr的pacing处理有两种方式:

1)、依赖于tc-fq的pacing

当使用tc-fq时,qdisc默认会使能rate_enable限速,这个流程也会利用bbr算法计算得到的sk_pacing_rate完成pacing功能;

/* Excerpt of the fq qdisc dequeue path showing how the next transmit time of
 * a flow is derived from the socket's pacing rate. The declarations and
 * setup of q, f, skb, now, rate and plen are elided from this excerpt.
 */
static struct sk_buff *fq_dequeue(struct Qdisc *sch)
{
	// Pacing is applied only when rate_enable is set; otherwise skip to out.
	if (!q->rate_enable)
		goto out;

	/* Do not pace locally generated ack packets */
	if (skb_is_tcp_pure_ack(skb))
		goto out;

	/* Effective rate: the qdisc's flow_max_rate, further limited by the
	 * socket's sk_pacing_rate when the skb has a socket attached.
	 */
	rate = q->flow_max_rate;
	if (skb->sk)
		rate = min(skb->sk->sk_pacing_rate, rate);

	if (rate <= q->low_rate_threshold) {
		f->credit = 0;
		plen = qdisc_pkt_len(skb);
	} else {
		plen = max(qdisc_pkt_len(skb), q->quantum);
		/* Flow still has credit: no delay is computed for this packet. */
		if (f->credit > 0)
			goto out;
	}
	/* ~0U means "no rate limit": leave time_next_packet untouched. */
	if (rate != ~0U) {
		/* Delay in ns = plen * NSEC_PER_SEC / rate (division skipped if rate is 0). */
		u64 len = (u64)plen * NSEC_PER_SEC;

		if (likely(rate))
			do_div(len, rate);
		/* Since socket rate can change later,
		 * clamp the delay to 1 second.
		 * Really, providers of too big packets should be fixed !
		 */
		if (unlikely(len > NSEC_PER_SEC)) {
			len = NSEC_PER_SEC;
			q->stat_pkts_too_long++;
		}
		/* Account for schedule/timers drifts.
		 * f->time_next_packet was set when prior packet was sent,
		 * and current time (@now) can be too late by tens of us.
		 */
		if (f->time_next_packet)
			len -= min(len/2, now - f->time_next_packet);
		f->time_next_packet = now + len;
	}
out:
	qdisc_bstats_update(sch, skb);
	return skb;
}

2)、tcp主动pacing

bbr_init时,默认使能SK_PACING_NEEDED;

/* Excerpt of bbr_init() showing only the pacing request. */
static void bbr_init(struct sock *sk)
{
	/* Compare-and-exchange: if sk_pacing_status is SK_PACING_NONE, change
	 * it to SK_PACING_NEEDED; any other value is left as is.
	 */
	cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED);
}

tcp_write_xmit的时候,通过tcp_pacing_check判断当前是否已经启动pacing高精度定时器,如果已经启动,则退出xmit流程;

/* Excerpt of the TCP transmit loop, truncated after the pacing check.
 * Shows optional MTU probing followed by the start of the per-skb send loop.
 */
static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
			   int push_one, gfp_t gfp)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	unsigned int tso_segs, sent_pkts;
	int cwnd_quota;
	int result;
	bool is_cwnd_limited = false, is_rwnd_limited = false;
	u32 max_segs;

	sent_pkts = 0;

	tcp_mstamp_refresh(tp);
	if (!push_one) {
		/* Do MTU probing. */
		result = tcp_mtu_probe(sk);
		/* 0: return false immediately; > 0: count one packet as sent. */
		if (!result) {
			return false;
		} else if (result > 0) {
			sent_pkts = 1;
		}
	}

	max_segs = tcp_tso_segs(sk, mss_now);
	while ((skb = tcp_send_head(sk))) {
		unsigned int limit;

		/* Leave the send loop when tcp_pacing_check() returns true.
		 * NOTE(review): per the accompanying text this means the pacing
		 * hrtimer is already armed -- helper not shown here.
		 */
		if (tcp_pacing_check(sk))
			break;
    ...
}

__tcp_transmit_skb判断是否需要tcp层做pacing,需要的话就启动高精度定时器;

/* Excerpt of the per-skb transmit routine showing where TCP-internal pacing
 * is triggered; the code before and after is elided.
 */
static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
			      int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
{
	...
	/* Only skbs longer than the TCP header (i.e. carrying payload) update
	 * the data-sent accounting and go through tcp_internal_pacing().
	 */
	if (skb->len != tcp_header_size) {
		tcp_event_data_sent(tp, sk);
		tp->data_segs_out += tcp_skb_pcount(skb);
		tp->bytes_sent += skb->len - tcp_header_size;
		tcp_internal_pacing(sk, skb);
	}
	...
}
/* Arm the per-socket pacing hrtimer for the time this skb takes to send at
 * the socket's pacing rate.
 * @sk:  sending socket
 * @skb: packet just handed to the lower layer; only skb->len is read
 */
static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
{
	u64 len_ns;
	u32 rate;

	if (!tcp_needs_internal_pacing(sk))
		return;
	rate = sk->sk_pacing_rate;
	/* A rate of 0 or ~0U: no timer is armed. */
	if (!rate || rate == ~0U)
		return;

	/* Should account for header sizes as sch_fq does,
	 * but lets make things simple.
	 */
	// sk_pacing_rate is in bytes per second (not per minute): the arithmetic
	// below scales by NSEC_PER_SEC.
	// skb->len / rate is the time, in seconds, needed to send skb->len bytes;
	// len_ns = skb->len * NSEC_PER_SEC / rate is that time in nanoseconds.
	// The pacing hrtimer is then armed to fire len_ns from now.
	len_ns = (u64)skb->len * NSEC_PER_SEC;
	do_div(len_ns, rate);
	hrtimer_start(&tcp_sk(sk)->pacing_timer,
		      ktime_add_ns(ktime_get(), len_ns),
		      HRTIMER_MODE_ABS_PINNED_SOFT);
	/* NOTE(review): presumably pins the socket while the timer is pending;
	 * the matching release is not shown here.
	 */
	sock_hold(sk);
}

拥塞窗口cwnd

/* Compute and apply the congestion window.
 * @sk:    socket being processed
 * @rs:    rate sample for the current ACK (passed through to the helper)
 * @acked: number of packets newly acknowledged; 0 means only the caps apply
 * @bw:    bandwidth estimate used for the BDP computation
 * @gain:  cwnd gain applied to the BDP
 */
static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs,
			 u32 acked, u32 bw, int gain)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);
	u32 cwnd = tp->snd_cwnd, target_cwnd = 0;

	if (!acked)
		goto done;  /* no packet fully ACKed; just apply caps */

	// When the helper returns true it has already chosen cwnd, so the normal
	// computation below is skipped.
	// NOTE(review): the helper is not shown here. Per the original notes, on
	// first entering recovery it uses tcp_packets_in_flight(tp) + acked (packet
	// conservation: send as many packets as were ACKed), and on leaving
	// recovery/loss it restores the cwnd held on entry -- confirm.
	if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd))
		goto done;

	/* If we're below target cwnd, slow start cwnd toward target cwnd. */
	// Step 1: derive target_cwnd from the BDP at the given bw and gain.
	target_cwnd = bbr_bdp(sk, bw, gain);
	target_cwnd = bbr_quantization_budget(sk, target_cwnd);
	// cwnd + acked: ACKing p packets already frees p slots (in-flight dropped
	// by p), and growing cwnd by p on top of that allows up to 2*p packets to
	// be sent.
	if (bbr_full_bw_reached(sk))  /* only cut cwnd if we filled the pipe */
		cwnd = min(cwnd + acked, target_cwnd);
	else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND)
		cwnd = cwnd + acked;
	/* Never go below the minimum target. */
	cwnd = max(cwnd, bbr_cwnd_min_target);

done:
	tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);	/* apply global cap */
	if (bbr->mode == BBR_PROBE_RTT)  /* drain queue, refresh min_rtt */
		tp->snd_cwnd = min(tp->snd_cwnd, bbr_cwnd_min_target);
}

 

  • 4
    点赞
  • 14
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值