从《TCP数据接收之入口》一文中可以看到,无论是哪个队列,最终都是调用tcp_v4_do_rcv()处理输入数据包;对于已建立连接的状态,则由tcp_rcv_established()函数完成处理。这篇笔记就来看看该函数的实现。
1. 慢速路径执行
/*
 * tcp_rcv_established - input processing for a connection; only the
 * slow-path portion is shown in this excerpt (the code before the
 * slow_path label is elided).
 *
 * @sk:  socket the segment was received on
 * @skb: the incoming segment
 * @th:  TCP header of the segment
 * @len: length that is checked against the header length (th->doff << 2)
 *
 * Returns 1 when a SYN at or beyond rcv_nxt triggers a reset, and 0 on
 * every other path visible here. On the csum_error/discard paths the skb
 * is released with __kfree_skb().
 * NOTE(review): on the "return 1" path the skb is not freed inside this
 * function; presumably the caller deals with it - confirm.
 */
int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
struct tcphdr *th, unsigned len)
{
struct tcp_sock *tp = tcp_sk(sk);
/* NOTE: the excerpt elides the code that precedes slow_path here. */
...
slow_path:
// Header-length sanity check and checksum verification
if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb))
goto csum_error;
// PAWS handling (not covered in detail in this note)
/*
* RFC1323: H1. Apply PAWS check first.
*/
if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
tcp_paws_discard(sk, skb)) {
if (!th->rst) {
NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
tcp_send_dupack(sk, skb);
goto discard;
}
/* Resets are accepted even if PAWS failed.
ts_recent update must be made after we are sure
that the packet is in window.
*/
}
/*
* Standard slow path.
*/
// A segment whose sequence range is not inside the receive window must be discarded
if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
/* RFC793, page 37: "In all states except SYN-SENT, all reset
* (RST) segments are validated by checking their SEQ-fields."
* And page 69: "If an incoming segment is not acceptable,
* an acknowledgment should be sent in reply (unless the RST bit
* is set, if so drop the segment and return)".
*/
// If the segment does not carry RST, reply to the sender with a duplicate ACK
if (!th->rst)
tcp_send_dupack(sk, skb);
goto discard;
}
// RST received (and in window): perform the reset, then drop the segment.
// Per the original note, tcp_reset() sets the error indication according to the TCP state.
if (th->rst) {
tcp_reset(sk);
goto discard;
}
// Update the remembered timestamp (ts_recent); done only after the window check above
tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
// A SYN (not before rcv_nxt) on an established connection means the peer
// hit an error; reset the current connection and report it with return 1
if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
TCP_INC_STATS_BH(TCP_MIB_INERRS);
NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN);
tcp_reset(sk);
return 1;
}
step5:
// Process the ACK if present (it almost always is: carrying an ACK costs nothing)
if (th->ack)
tcp_ack(sk, skb, FLAG_SLOWPATH);
// Sample and update the RTT estimate
tcp_rcv_rtt_measure_ts(sk, skb);
// Urgent data handling (not covered in this note)
/* Process urgent data. */
tcp_urg(sk, skb, th);
// Process the data part of the segment: memory checks, queueing, etc.
/* step 7: process the segment text */
tcp_data_queue(sk, skb);
// Try to send pending data
tcp_data_snd_check(sk);
// Try to send an ACK
tcp_ack_snd_check(sk);
return 0;
// Checksum/length failure: count the input error, then fall through to discard
csum_error:
TCP_INC_STATS_BH(TCP_MIB_INERRS);
// Common exit for dropped segments: free the skb and return 0
discard:
__kfree_skb(skb);
return 0;
}
2. tcp_data_queue()
/*
 * tcp_data_queue - handle the data part of an incoming segment.
 *
 * @sk:  socket the segment belongs to
 * @skb: the incoming segment
 *
 * Depending on where the segment's sequence range lies relative to
 * tp->rcv_nxt and the receive window, the skb is
 *   - copied directly to the waiting user task and/or appended to
 *     sk->sk_receive_queue (in-order segment),
 *   - dropped (no data, pure duplicate, out of window, or no receive
 *     memory), or
 *   - inserted into tp->out_of_order_queue, kept ordered by sequence
 *     number, with the SACK/D-SACK bookkeeping updated.
 *
 * Several labels (queue_and_out, out_of_window, drop, add_sack) sit inside
 * nested blocks and are reached by goto from elsewhere in the function, so
 * statement order here is significant.
 */
static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
{
struct tcphdr *th = tcp_hdr(skb);
struct tcp_sock *tp = tcp_sk(sk);
// -1: nothing copied to user space yet; set to 0/1 after a direct copy below
int eaten = -1;
// This function handles segments that carry data; a segment with no data
// part (seq == end_seq) is dropped right away
if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
goto drop;
// Advance the data pointer past the TCP header so it points at the payload
__skb_pull(skb, th->doff * 4);
// ECN handling
TCP_ECN_accept_cwr(tp, skb);
// D-SACK handling (not delayed ACK): clear the pending dsack flag and
// recompute eff_sacks from num_sacks, capped at 4 - tstamp_ok
if (tp->rx_opt.dsack) {
tp->rx_opt.dsack = 0;
tp->rx_opt.eff_sacks = min_t(unsigned int, tp->rx_opt.num_sacks,
4 - tp->rx_opt.tstamp_ok);
}
/* Queue data for delivery to the user.
* Packets in sequence go to the receive queue.
* Out of sequence packets to the out_of_order_queue.
*/
// Even though this is the slow path, the segment may still be the expected
// in-order one, so it is put on the receive queue or copied straight to
// user space; this closely mirrors the fast-path handling
if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
// Receive window is zero: there is no room to accept data, so take the
// out_of_window path (quick-ACK mode, schedule an ACK, drop the skb)
if (tcp_receive_window(tp) == 0)
goto out_of_window;
// If the user task is waiting for data and this is exactly the data it
// would read next, copy it directly to user space
if (tp->ucopy.task == current &&
tp->copied_seq == tp->rcv_nxt && tp->ucopy.len &&
sock_owned_by_user(sk) && !tp->urg_data) {
int chunk = min_t(unsigned int, skb->len,
tp->ucopy.len);
__set_current_state(TASK_RUNNING);
local_bh_enable();
// Perform the copy
if (!skb_copy_datagram_iovec(skb, 0, tp->ucopy.iov, chunk)) {
tp->ucopy.len -= chunk;
tp->copied_seq += chunk;
// 1 only if the whole skb was consumed and it carries no FIN; otherwise 0
eaten = (chunk == skb->len && !th->fin);
tcp_rcv_space_adjust(sk);
}
local_bh_disable();
}
// The skb was not fully consumed by a direct copy (no copy attempted, copy
// failed, only part copied, or FIN present): it must be queued
if (eaten <= 0) {
// Also the goto target for the partial-overlap case further below
// (eaten is still -1 there, so the memory check applies)
queue_and_out:
// Receive memory could not be scheduled: drop the segment
// (checked only when nothing was copied, i.e. eaten < 0)
if (eaten < 0 &&
tcp_try_rmem_schedule(sk, skb->truesize))
goto drop;
// Append the segment to the socket receive queue
skb_set_owner_r(skb, sk);
__skb_queue_tail(&sk->sk_receive_queue, skb);
}
// Advance rcv_nxt past this segment
tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
// New data arrived: run the data-received event handling
if (skb->len)
tcp_event_data_recv(sk, skb);
// The segment carries FIN: handle connection teardown
if (th->fin)
tcp_fin(skb, sk, th);
// If the out-of-order queue is not empty, the new data may have made some
// queued segments contiguous, so they need to move to the receive queue
if (!skb_queue_empty(&tp->out_of_order_queue)) {
// Process the out-of-order queue
tcp_ofo_queue(sk);
/* RFC2581. 4.2. SHOULD send immediate ACK, when
* gap in queue is filled.
*/
if (skb_queue_empty(&tp->out_of_order_queue))
inet_csk(sk)->icsk_ack.pingpong = 0;
}
// SACK handling
if (tp->rx_opt.num_sacks)
tcp_sack_remove(tp);
// Re-evaluate the header prediction flags
tcp_fast_path_check(sk);
// If the data was entirely copied to user space, free the skb; otherwise
// notify the reader that data is available (unless the socket is dead)
if (eaten > 0)
__kfree_skb(skb);
else if (!sock_flag(sk, SOCK_DEAD))
sk->sk_data_ready(sk, 0);
return;
}
// From here on: segments that are not the expected next segment.
// end_seq is not after rcv_nxt, so the whole segment is duplicate data
if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
/* A retransmit, 2nd most common case. Force an immediate ack. */
NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST);
tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
// Shared exit for zero-window / out-of-window segments: enter quick-ACK
// mode and schedule an ACK, then fall through to drop
out_of_window:
tcp_enter_quickack_mode(sk);
inet_csk_schedule_ack(sk);
// Shared exit that frees the skb and returns
drop:
__kfree_skb(skb);
return;
}
// The segment starts at or beyond the right edge of the receive window
if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
goto out_of_window;
// At this point the segment overlaps the receive window but is not the
// expected in-order segment.
// Enter quick-ACK mode
tcp_enter_quickack_mode(sk);
// True when the segment starts before rcv_nxt: its head is duplicate and
// only its tail is new data
if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
/* Partial packet, seq < rcv_next < end_seq */
SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
TCP_SKB_CB(skb)->end_seq);
tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, tp->rcv_nxt);
/* If window is closed, drop tail of packet. But after
* remembering D-SACK for its head made in previous line.
*/
if (!tcp_receive_window(tp))
goto out_of_window;
goto queue_and_out;
}
// ECN handling
TCP_ECN_check_ce(tp, skb);
// Receive memory check
if (tcp_try_rmem_schedule(sk, skb->truesize))
goto drop;
// Out-of-order data arrived, so header prediction is switched off
tp->pred_flags = 0;
inet_csk_schedule_ack(sk);
SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
skb_set_owner_r(skb, sk);
// The out-of-order queue is currently empty
if (!skb_peek(&tp->out_of_order_queue)) {
// SACK handling
/* Initial out of order segment, build 1 SACK. */
if (tcp_is_sack(tp)) {
tp->rx_opt.num_sacks = 1;
tp->rx_opt.dsack = 0;
tp->rx_opt.eff_sacks = 1;
tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
tp->selective_acks[0].end_seq =
TCP_SKB_CB(skb)->end_seq;
}
// Add the skb to the out-of-order queue
__skb_queue_head(&tp->out_of_order_queue, skb);
} else {
// The code below inserts the skb into the out-of-order queue. Although the
// queue holds out-of-order data, entries are kept sorted by sequence number,
// which simplifies moving them to the receive queue later. The logic looks
// complicated because SACK handling and performance shortcuts are mixed in.
// Start from the tail of the queue
struct sk_buff *skb1 = tp->out_of_order_queue.prev;
// Local copies: still usable after the skb itself has been freed below
u32 seq = TCP_SKB_CB(skb)->seq;
u32 end_seq = TCP_SKB_CB(skb)->end_seq;
// Shortcut: the new segment directly follows the last queued segment
if (seq == TCP_SKB_CB(skb1)->end_seq) {
__skb_append(skb1, skb, &tp->out_of_order_queue);
if (!tp->rx_opt.num_sacks ||
tp->selective_acks[0].end_seq != seq)
goto add_sack;
/* Common case: data arrive in order after hole. */
tp->selective_acks[0].end_seq = end_seq;
return;
}
/* Find place to insert this segment. */
// Walk backwards until an entry whose seq is not after ours, or until the
// queue head sentinel is reached
do {
if (!after(TCP_SKB_CB(skb1)->seq, seq))
break;
} while ((skb1 = skb1->prev) !=
(struct sk_buff *)&tp->out_of_order_queue);
/* Do skb overlap to previous one? */
if (skb1 != (struct sk_buff *)&tp->out_of_order_queue &&
before(seq, TCP_SKB_CB(skb1)->end_seq)) {
if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
/* All the bits are present. Drop. */
// Only the local seq/end_seq copies are used after this free
__kfree_skb(skb);
tcp_dsack_set(tp, seq, end_seq);
goto add_sack;
}
if (after(seq, TCP_SKB_CB(skb1)->seq)) {
/* Partial overlap. */
tcp_dsack_set(tp, seq,
TCP_SKB_CB(skb1)->end_seq);
} else {
// Same starting seq as skb1 but extends further: insert before skb1 so the
// cleanup loop below can remove the entries the new skb fully covers
skb1 = skb1->prev;
}
}
// Link the new skb in right after skb1
__skb_insert(skb, skb1, skb1->next, &tp->out_of_order_queue);
/* And clean segments covered by new one as whole. */
while ((skb1 = skb->next) !=
(struct sk_buff *)&tp->out_of_order_queue &&
after(end_seq, TCP_SKB_CB(skb1)->seq)) {
// The following entry extends past our end: only a partial overlap, stop here
if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq,
end_seq);
break;
}
// The following entry is fully covered by the new skb: unlink and free it
__skb_unlink(skb1, &tp->out_of_order_queue);
tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq,
TCP_SKB_CB(skb1)->end_seq);
__kfree_skb(skb1);
}
// Record the new out-of-order range in the SACK state when SACK is in use
add_sack:
if (tcp_is_sack(tp))
tcp_sack_new_ofo_skb(sk, seq, end_seq);
}
}