SACK是接收方用来向发送方通知已经接收到哪些序列号段的一种机制,这样发送方在重传时就只需要重传接收方真正未收到的部分即可。
初始化
sack提供了proc接口用来控制是否支持sack能力(/proc/sys/net/ipv4/tcp_sack),该选项默认为1,即使能sack能力。
1)、发送端发送SYN报文时,判断本地是否开启sack选项,如果开启,则options选项置上OPTION_SACK_ADVERTISE标志
/* When the local tcp_sack sysctl is enabled, mark the outgoing options
 * with OPTION_SACK_ADVERTISE so that SACK-permitted is announced.
 */
if (likely(sysctl_tcp_sack)) {
opts->options |= OPTION_SACK_ADVERTISE;
/* Option space is only charged when the timestamp option is absent.
 * NOTE(review): presumably SACK-permitted shares a word with the
 * timestamp option when both are sent - confirm against the option writer.
 */
if (unlikely(!(OPTION_TS & opts->options)))
remaining -= TCPOLEN_SACKPERM_ALIGNED;
}
2)、填充SACK选项内容时,置上TCPOPT_SACK_PERM,用于接收端解析;
/* Emit the SACK-permitted option as one 32-bit word in network byte order:
 * two NOP bytes, then the TCPOPT_SACK_PERM kind and its length byte.
 * ptr is advanced past the word that was written.
 */
if (unlikely(OPTION_SACK_ADVERTISE & options)) {
*ptr++ = htonl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
(TCPOPT_SACK_PERM << 8) |
TCPOLEN_SACK_PERM);
}
3)、接收端在解析tcp option选项时,根据TCPOPT_SACK_PERM选项以及本地的sysctl_tcp_sack值判断是否支持sack,如果支持,则opt_rx->sack_ok置上TCP_SACK_SEEN标志,后续协议栈会通过tcp_is_sack函数来判断是否支持sack能力;
/* SACK-permitted option seen while parsing the peer's TCP options.
 * It is honoured only when the option length is exactly
 * TCPOLEN_SACK_PERM, the segment carries SYN, estab is false and the
 * local tcp_sack sysctl is enabled.
 */
case TCPOPT_SACK_PERM:
if (opsize == TCPOLEN_SACK_PERM && th->syn &&
!estab && sysctl_tcp_sack) {
/* Record that SACK was negotiated and reset the receive-side SACK state. */
opt_rx->sack_ok = TCP_SACK_SEEN;
tcp_sack_reset(opt_rx);
}
break;
接收端流程:
tcp_data_queue_ofo
接收端收到消息包后进入tcp_rcv_established,该函数分为快路径和慢路径两种情况,当接收到的数据包序列号不是期望接收的下一个序列号(rcv_nxt)时,判断为乱序,乱序最终会通过慢路径走到函数tcp_data_queue_ofo将乱序数据包放到队列tp->out_of_order_queue里,放队列前会先进行skb的合并、排序等常规操作。
/*
 * Queue an out-of-order segment on tp->out_of_order_queue, keeping the
 * queue ordered by sequence number, and update the SACK / D-SACK state
 * that is later reported to the sender.
 *
 * sk:  socket the segment was received on.
 * skb: the out-of-order segment. It is either linked into the queue,
 *      merged into the current tail skb, or freed (receive-memory
 *      failure, or all of its bytes are already queued).
 */
static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb1;
u32 seq, end_seq;
TCP_ECN_check_ce(tp, skb);
/* No receive memory could be scheduled for this skb: count the drop and free it. */
if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP);
__kfree_skb(skb);
return;
}
/* Disable header prediction. */
tp->pred_flags = 0;
inet_csk_schedule_ack(sk);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
/* Start from the tail of the out-of-order queue. */
skb1 = skb_peek_tail(&tp->out_of_order_queue);
if (!skb1) {
/* Initial out of order segment, build 1 SACK. */
if (tcp_is_sack(tp)) {
tp->rx_opt.num_sacks = 1;
/* First out-of-order segment: SACK block 0 covers exactly this skb. */
tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
tp->selective_acks[0].end_seq =
TCP_SKB_CB(skb)->end_seq;
}
__skb_queue_head(&tp->out_of_order_queue, skb);
goto end;
}
seq = TCP_SKB_CB(skb)->seq;
end_seq = TCP_SKB_CB(skb)->end_seq;
/* The new skb starts exactly where the queue tail ends: try to merge. */
if (seq == TCP_SKB_CB(skb1)->end_seq) {
bool fragstolen;
/* Try to coalesce skb's data into skb1; if that fails, link skb
 * into the queue right after skb1 instead.
 */
if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
__skb_queue_after(&tp->out_of_order_queue, skb1, skb);
} else {
/* Merged: release the now-redundant skb. skb = NULL makes the
 * "end" label skip its per-skb accounting.
 */
tcp_grow_window(sk, skb);
kfree_skb_partial(skb, fragstolen);
skb = NULL;
}
/* If SACK block 0 does not end at seq (or there are no blocks),
 * take the generic SACK update path.
 */
if (!tp->rx_opt.num_sacks ||
tp->selective_acks[0].end_seq != seq)
goto add_sack;
/* Common case: data arrive in order after hole. */
tp->selective_acks[0].end_seq = end_seq;
goto end;
}
/* Find place to insert this segment. */
/* Walk backwards from the tail until skb1 does not start after seq;
 * skb1 becomes NULL when every queued segment starts after seq.
 */
while (1) {
if (!after(TCP_SKB_CB(skb1)->seq, seq))
break;
if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) {
skb1 = NULL;
break;
}
skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1);
}
/* Do skb overlap to previous one? */
if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
/* All the bits are present. Drop. */
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
__kfree_skb(skb);
skb = NULL;
/* Report the whole duplicate range through D-SACK. */
tcp_dsack_set(sk, seq, end_seq);
goto add_sack;
}
if (after(seq, TCP_SKB_CB(skb1)->seq)) {
/* Partial overlap. */
tcp_dsack_set(sk, seq,
TCP_SKB_CB(skb1)->end_seq);
} else {
/* seq is not after skb1's seq (and the search loop ensured skb1's
 * seq is not after seq), so step back one entry: the new skb is
 * then linked in front of the old skb1, which the cleanup loop
 * below examines as the following segment.
 */
if (skb_queue_is_first(&tp->out_of_order_queue,
skb1))
skb1 = NULL;
else
skb1 = skb_queue_prev(
&tp->out_of_order_queue,
skb1);
}
}
/* No predecessor found: the new skb goes at the head of the queue. */
if (!skb1)
__skb_queue_head(&tp->out_of_order_queue, skb);
/* Otherwise link it right after the predecessor skb1. */
else
__skb_queue_after(&tp->out_of_order_queue, skb1, skb);
/* And clean segments covered by new one as whole. */
while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) {
skb1 = skb_queue_next(&tp->out_of_order_queue, skb);
/* Next segment starts at or after our end: nothing more overlaps. */
if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
break;
/* Next segment is only partly covered: report the overlapping
 * part via D-SACK and keep the segment.
 */
if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
end_seq);
break;
}
/* Next segment is fully covered: unlink it, report it via D-SACK, free it. */
__skb_unlink(skb1, &tp->out_of_order_queue);
tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
TCP_SKB_CB(skb1)->end_seq);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
__kfree_skb(skb1);
}
add_sack:
/* Record [seq, end_seq) in the SACK blocks that will be sent back. */
if (tcp_is_sack(tp))
tcp_sack_new_ofo_skb(sk, seq, end_seq);
end:
/* skb is still non-NULL only when it was linked into the queue. */
if (skb) {
tcp_grow_window(sk, skb);
skb_set_owner_r(skb, sk);
}
}
在tcp_data_queue_ofo里最终会调用tcp_sack_new_ofo_skb添加回复发送端的sack信息;
/*
 * Record the newly queued out-of-order range [seq, end_seq) in the
 * socket's SACK block array (tp->selective_acks).
 *
 * If an existing block can be extended by the range (tcp_sack_extend()
 * succeeds), that block is moved to slot 0 and, when more than one block
 * exists, tcp_sack_maybe_coalesce() is called. Otherwise a new block is
 * placed in slot 0 and the existing blocks shift down by one; when the
 * array already holds TCP_NUM_SACKS blocks the last one is discarded.
 */
static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_sack_block *blocks = tp->selective_acks;
	int count = tp->rx_opt.num_sacks;
	int idx;

	/* Look for an existing block that the new range extends. */
	for (idx = 0; idx < count; idx++) {
		if (!tcp_sack_extend(&blocks[idx], seq, end_seq))
			continue;

		/* Bubble the extended block up to the first slot. */
		while (idx > 0) {
			swap(blocks[idx], blocks[idx - 1]);
			idx--;
		}
		if (count > 1)
			tcp_sack_maybe_coalesce(tp);
		return;
	}

	/* Nothing could be extended. If the array is full, forget the
	 * last block so that the shift below stays within bounds.
	 */
	if (count >= TCP_NUM_SACKS) {
		count--;
		tp->rx_opt.num_sacks--;
	}

	/* Shift every remaining block down one slot to free slot 0. */
	for (idx = count; idx > 0; idx--)
		blocks[idx] = blocks[idx - 1];

	/* Build the new head SACK block. */
	blocks[0].start_seq = seq;
	blocks[0].end_seq = end_seq;
	tp->rx_opt.num_sacks++;
}
__tcp_ack_snd_check
当接收端接收到乱序包时,会立即回复ack,这里的ofo_possible设置为1;
/*
 * Decide whether to acknowledge right away or to fall back to a
 * delayed ACK.
 *
 * sk:           socket to acknowledge on.
 * ofo_possible: non-zero when the caller allows queued out-of-order
 *               data to force an immediate ACK.
 *
 * NOTE(review): the original annotation states that the immediate ACK
 * carries seq = snd_nxt and ack_seq = rcv_nxt, filled in by
 * tcp_transmit_skb - confirm against that function.
 */
static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* More than one full frame received, and the right edge of the
	 * window advances far enough (tcp_recvmsg() sends the ACK otherwise).
	 */
	if ((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
	    __tcp_select_window(sk) >= tp->rcv_wnd)
		goto ack_now;

	/* Quick-ack mode: every frame is acknowledged. */
	if (tcp_in_quickack_mode(sk))
		goto ack_now;

	/* Out-of-order data is queued and the caller asked us to react to it. */
	if (ofo_possible && skb_peek(&tp->out_of_order_queue))
		goto ack_now;

	/* None of the above: a delayed ACK is enough. */
	tcp_send_delayed_ack(sk);
	return;

ack_now:
	tcp_send_ack(sk);
}
ack流程最终通过tcp_send_ack->tcp_transmit_skb走到tcp_options_write函数,在tcp_options_write里,将之前构建的sack信息填充到skb的TCP头部选项里(下面代码片段开头对th各字段的赋值实际位于tcp_transmit_skb中,这里一并列出)。另外,由于本次收到的是乱序包,并没有按序到达的新数据,所以tp->rcv_nxt不会更新,因此本次回复的ack_seq仍为旧的序列号。
/*
 * Write the SACK option into the TCP header option area.
 *
 * ptr:  write position in the option area; advanced one 32-bit word at
 *       a time as data is written.
 * tp:   TCP socket whose SACK / D-SACK blocks are emitted.
 * opts: outgoing option description; opts->num_sack_blocks is the number
 *       of blocks to write.
 *
 * NOTE(review): this looks like an abridged excerpt that shows only the
 * SACK-related part of the function.
 */
static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
struct tcp_out_options *opts)
{
/* NOTE(review): th, skb, inet and tcb are neither parameters nor locals
 * of this function; the next five lines were presumably excerpted from
 * the caller tcp_transmit_skb to show that ack_seq is taken from
 * tp->rcv_nxt - confirm against the kernel source.
 */
th = tcp_hdr(skb);
th->source = inet->inet_sport;
th->dest = inet->inet_dport;
th->seq = htonl(tcb->seq);
th->ack_seq = htonl(tp->rcv_nxt);
if (unlikely(opts->num_sack_blocks)) {
/* Start from the D-SACK block when one is pending, otherwise from the
 * regular SACK blocks.
 */
struct tcp_sack_block *sp = tp->rx_opt.dsack ?
tp->duplicate_sack : tp->selective_acks;
int this_sack;
/* Option header word: NOP, NOP, kind = TCPOPT_SACK, then the length
 * computed from the base size plus the per-block size for each block.
 */
*ptr++ = htonl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
(TCPOPT_SACK << 8) |
(TCPOLEN_SACK_BASE + (opts->num_sack_blocks *
TCPOLEN_SACK_PERBLOCK)));
/* Each block is written as two network-order words: start_seq, end_seq. */
for (this_sack = 0; this_sack < opts->num_sack_blocks;
++this_sack) {
*ptr++ = htonl(sp[this_sack].start_seq);
*ptr++ = htonl(sp[this_sack].end_seq);
}
/* Clear the pending D-SACK flag once the option has been written. */
tp->rx_opt.dsack = 0;
}
}