本文分析linux-4.19.12代码的htcp拥塞算法
与reno拥塞算法不同处,慢启动超过阈值ssthresh的ack直接忽略了; 拥塞避免阶段多了个参数ca->alpha.
开始ca->alpha=1<<7,与标准的reno增长速度一样; 拥塞乘性减窗口的时候beta不是固定的0.5,而是一个动态计算的值.
/*
 * H-TCP congestion avoidance.
 *
 * Slow start is plain Reno slow start.  In congestion avoidance the
 * additive-increase step is scaled by ca->alpha (fixed point, unit 1<<7):
 * conceptually snd_cwnd += alpha / snd_cwnd per ACK, so Reno's
 * "cnt >= cwnd" test becomes "(cnt * alpha) >> 7 >= cwnd".
 */
static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct htcp *ca = inet_csk_ca(sk);

	if (!tcp_is_cwnd_limited(sk))
		return;

	if (tcp_in_slow_start(tp)) {
		tcp_slow_start(tp, acked);
		return;
	}

	if ((tp->snd_cwnd_cnt * ca->alpha) >> 7 < tp->snd_cwnd) {
		/* Not enough credit yet: keep accumulating ACKed packets. */
		tp->snd_cwnd_cnt += ca->pkts_acked;
	} else {
		/* A full window's worth of credit: grow cwnd by one
		 * (bounded by the clamp) and refresh alpha. */
		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
			tp->snd_cwnd++;
		tp->snd_cwnd_cnt = 0;
		htcp_alpha_update(ca);
	}
	ca->pkts_acked = 1;
}
/*
 * New slow-start threshold on a congestion event.
 *
 * Unlike Reno's fixed factor of 1/2, H-TCP backs off by the variable
 * factor ca->beta (fixed point, unit 1<<7), floored at 2 segments.
 * htcp_param_update() refreshes beta (and alpha) first.
 */
static u32 htcp_recalc_ssthresh(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct htcp *ca = inet_csk_ca(sk);
	u32 reduced;

	htcp_param_update(sk);
	reduced = (tp->snd_cwnd * ca->beta) >> 7;
	return max(reduced, 2U);
}
本算法的重点是alpha和beta的更新.
按照论文的说法,每收到一个ack就更新alpha(代码里实际是增加窗口的时候才更新),每次发生拥塞就更新beta(同时也会更新alpha). 论文并补充道: 考虑到非常小的队列,限制beta的值在区间[0.5, 0.8];为了不受rtt干扰,用rtt缩放alpha.
alpha的二次曲线如下图. 当距离上次丢包时间越久,认为BDP越大, 增长越激进.
/*
 * Recompute the additive-increase factor ca->alpha (unit 1<<7).
 *
 * Per the H-TCP paper, alpha grows quadratically with the time elapsed
 * since the last congestion event; time is measured in jiffies, hence
 * the HZ terms (the formula is HZ-independent once converted to
 * seconds).  The longer we have gone without loss, the larger the
 * presumed BDP and the more aggressively we probe.  With RTT scaling
 * enabled, alpha is additionally scaled by ~1/(10*minRTT), clamped to
 * [0.5, 10], to reduce unfairness between flows with different RTTs.
 */
static inline void htcp_alpha_update(struct htcp *ca)
{
	u32 rtt_min = ca->minRTT;
	u32 factor = 1;
	u32 since_loss = htcp_cong_time(ca);

	/* Quadratic ramp once more than one second has passed:
	 * factor = 1 + 10*t + (t/2)^2, with t in seconds. */
	if (since_loss > HZ) {
		since_loss -= HZ;
		factor = 1 + (10 * since_loss +
			      ((since_loss / 2) * (since_loss / 2) / HZ)) / HZ;
	}

	if (use_rtt_scaling && rtt_min) {
		/* scale is 1/(10*minRTT) in <<3 fixed point:
		 * 0.5 for minRTT >= 200ms up to 10 for minRTT <= 10ms. */
		u32 scale = (HZ << 3) / (10 * rtt_min);

		/* clamp ratio to interval [0.5, 10] << 3 */
		scale = min(max(scale, 1U << 2), 10U << 3);
		factor = (factor << 3) / scale;
		if (!factor)
			factor = 1;
	}

	ca->alpha = 2 * factor * ((1 << 7) - ca->beta);
	if (!ca->alpha)
		ca->alpha = ALPHA_BASE;
}
/*
 * Jiffies elapsed since the end of the last congestion event
 * (ca->last_cong is stamped when a congestion event ends).
 */
static inline u32 htcp_cong_time(const struct htcp *ca)
{
	const u32 now = jiffies;

	return now - ca->last_cong;
}
拥塞发生会调用htcp_param_update函数更新参数.
/*
 * Refresh beta and alpha on a congestion event.
 *
 * This is deliberately called only when we hit congestion: that is the
 * one point where maxRTT is a real signal, because the queues along
 * the path were just at their fullest.
 */
static void htcp_param_update(struct sock *sk)
{
	struct htcp *ca = inet_csk_ca(sk);
	u32 rtt_min = ca->minRTT;
	u32 rtt_max = ca->maxRTT;

	htcp_beta_update(ca, rtt_min, rtt_max);
	htcp_alpha_update(ca);

	/* Decay maxRTT by 5% toward minRTT so a stale maximum fades
	 * out after routing changes. */
	if (rtt_min > 0 && rtt_max > rtt_min)
		ca->maxRTT = rtt_min + ((rtt_max - rtt_min) * 95) / 100;
}
/*
 * Recompute the multiplicative-decrease factor ca->beta (unit 1<<7).
 *
 * With the bandwidth switch enabled, a swing of more than 20% between
 * consecutive maxB samples forces beta back to BETA_MIN and clears
 * modeswitch, so the next update starts again from conservative
 * backoff.
 *
 * Otherwise, once modeswitch is armed and the RTT samples are usable
 * (minRTT above 10ms, maxRTT non-zero), beta = minRTT/maxRTT clamped
 * to [BETA_MIN, BETA_MAX]; until then beta stays at BETA_MIN and
 * modeswitch is armed for the next call.
 */
static inline void htcp_beta_update(struct htcp *ca, u32 minRTT, u32 maxRTT)
{
	if (use_bandwidth_switch) {
		u32 maxB = ca->maxB;
		u32 old_maxB = ca->old_maxB;

		ca->old_maxB = ca->maxB;
		/* did throughput change by more than 20%?
		 * i.e. |maxB - old_maxB| / old_maxB > 0.2 */
		if (!between(5 * maxB, 4 * old_maxB, 6 * old_maxB)) {
			ca->beta = BETA_MIN;
			ca->modeswitch = 0;
			return;
		}
	}

	if (!ca->modeswitch || minRTT <= msecs_to_jiffies(10) || !maxRTT) {
		ca->beta = BETA_MIN;
		ca->modeswitch = 1;
		return;
	}

	ca->beta = (minRTT << 7) / maxRTT;
	if (ca->beta < BETA_MIN)
		ca->beta = BETA_MIN;
	else if (ca->beta > BETA_MAX)
		ca->beta = BETA_MAX;
}
前面更新参数用到了带宽和rtt,从htcp结构体可以看出,每次收到ack都会调用measure_achieved_throughput来计算吞吐量.
/* H-TCP congestion-control operations table. */
static struct tcp_congestion_ops htcp __read_mostly = {
	.name		= "htcp",
	.owner		= THIS_MODULE,
	.init		= htcp_init,
	.ssthresh	= htcp_recalc_ssthresh,	/* back off by variable beta */
	.cong_avoid	= htcp_cong_avoid,	/* increase by variable alpha */
	.set_state	= htcp_state,
	.undo_cwnd	= htcp_cwnd_undo,
	.pkts_acked	= measure_achieved_throughput,	/* per-ACK sampling */
};
static void measure_achieved_throughput(struct sock *sk,
const struct ack_sample *sample)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcp_sock *tp = tcp_sk(sk);
struct htcp *ca = inet_csk_ca(sk);
u32 now = tcp_jiffies32;
if (icsk->icsk_ca_state == TCP_CA_Open)
ca->pkts_acked = sample->pkts_acked;
if (sample->rtt_us > 0)
measure_rtt(sk, usecs_to_jiffies(sample->rtt_us));
if (!use_bandwidth_switch)
return;
/* achieved throughput calculations */
/* 只有在open或disorder状态才计算吞吐量 */
if (!((1 << icsk->icsk_ca_state) & (TCPF_CA_Open | TCPF_CA_Disorder))) {
ca->packetcount = 0;
ca->lasttime = now;
return;
}
ca->packetcount += sample->pkts_acked;
/*接近一个rtt周期(packetcount大于cwnd_alpha, delta_t>minRTT)计算一次*/
if (ca->packetcount >= tp->snd_cwnd - (ca->alpha >> 7 ? : 1) &&
now - ca->lasttime >= ca->minRTT &&
ca->minRTT > 0) {
// 计算当前吞吐量
__u32 cur_Bi = ca->packetcount * HZ / (now - ca->lasttime);
/*
如果距上次丢包的时间小于3个minRTT
带宽值都用cur_Bi覆盖
否则
Bi用3/4*Bi+1/4*cur_Bi平滑处理
maxB=max(maxB, ca->Bi)
minB=min(minB, maxB)
*/
if (htcp_ccount(ca) <= 3) {
/* just after backoff */
ca->minB = ca->maxB = ca->Bi = cur_Bi;
} else {
ca->Bi = (3 * ca->Bi + cur_Bi) / 4;
if (ca->Bi > ca->maxB)
ca->maxB = ca->Bi;
if (ca->minB > ca->maxB)
ca->minB = ca->maxB;
}
ca->packetcount = 0;
ca->lasttime = now;
}
}
/*
 * Update minRTT/maxRTT from a smoothed RTT sample (in jiffies).
 *
 * minRTT tracks the smallest sample seen so far (0 means "no sample
 * yet").  maxRTT is only raised while the connection is in
 * TCP_CA_Open, and a sample may lift it by at most 20ms at a time
 * (srtt <= maxRTT + 20ms), so a single outlier cannot inflate it.
 */
static inline void measure_rtt(struct sock *sk, u32 srtt)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct htcp *ca = inet_csk_ca(sk);

	if (!ca->minRTT || srtt < ca->minRTT)
		ca->minRTT = srtt;

	if (icsk->icsk_ca_state != TCP_CA_Open)
		return;

	if (ca->maxRTT < ca->minRTT)
		ca->maxRTT = ca->minRTT;
	if (srtt > ca->maxRTT && srtt <= ca->maxRTT + msecs_to_jiffies(20))
		ca->maxRTT = srtt;
}