在上一篇笔记TCP数据发送之tcp_sendmsg()中介绍了TCP发送相关系统调用的内核核心处理函数tcp_sendmsg(),可以看出该函数做的核心工作就是将待发送的数据组织成一个个的skb,并且将这些skb按照先后顺序放入到发送队列sk_write_queue中。并且该函数也会尝试调用tcp_push()(以及其它两个接口)进行一次新数据发送。
此外,在收到确认后,TCP会调用tcp_data_snd_check()检查是否可以发送数据,这里也会有机会发送新数据。
这篇笔记记录了这些新数据发送过程(注意是新数据,而不是重传数据)。
1. tcp_push()
从下面的实现中可以看出,tcp_push()在判断了是否需要设置PUSH标记位之后,会调用__tcp_push_pending_frames()。
/*
 * Mark the queue tail with PSH/URG as appropriate, then try to push
 * any pending frames out via __tcp_push_pending_frames().
 */
static inline void tcp_push(struct sock *sk, int flags, int mss_now,
			    int nonagle)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *tail;

	/* Nothing unsent is queued: nothing to do. */
	if (!tcp_send_head(sk))
		return;

	tail = tcp_write_queue_tail(sk);

	/* Set PSH unless the caller announced more data (MSG_MORE),
	 * or anyway when enough unpushed data piled up (forced_push()).
	 */
	if (forced_push(tp) || !(flags & MSG_MORE))
		tcp_mark_push(tp, tail);

	/* MSG_OOB (urgent data) handling. */
	tcp_mark_urg(tp, flags, tail);

	/* MSG_MORE temporarily behaves like TCP_CORK. */
	__tcp_push_pending_frames(sk, mss_now,
				  (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
}
2. __tcp_push_pending_frames()
该函数调用tcp_write_xmit()完成发送。
/* Push out any pending frames which were held back due to
 * TCP_CORK or attempt at coalescing tiny packets.
 * The socket must be locked by the caller.
 */
void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
			       int nonagle)
{
	/* Bail out when there is no unsent data queued. */
	if (!tcp_send_head(sk))
		return;

	/* tcp_write_xmit() returns non-zero when data is queued but
	 * nothing could be sent; arm the probe timer in that case so
	 * we eventually retry (zero-window / PMTU probing).
	 */
	if (tcp_write_xmit(sk, cur_mss, nonagle))
		tcp_check_probe_timer(sk);
}
2.1 tcp_write_xmit()
该函数是TCP发送新数据的核心函数,包括发送窗口判断、拥塞控制判断等核心操作都是在该函数中完成。
/* This routine writes packets to the network. It advances the
 * send_head. This happens as incoming acks open up the remote
 * window for us.
 *
 * Returns 1, if no segments are in flight and we have queued segments, but
 * cannot send anything now because of SWS or another problem.
 */
static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	unsigned int tso_segs, sent_pkts;
	int cwnd_quota;
	int result;

	/* If we are closed, the bytes will have to remain here.
	 * In time closedown will finish, we empty the write queue and all
	 * will be happy.
	 */
	if (unlikely(sk->sk_state == TCP_CLOSE))
		return 0;

	/* sent_pkts counts the segments transmitted by this call. */
	sent_pkts = 0;

	/* Path-MTU probing; a transmitted probe counts as one segment. */
	if ((result = tcp_mtu_probe(sk)) == 0) {
		return 0;
	} else if (result > 0) {
		sent_pkts = 1;
	}

	/* Walk the not-yet-sent part of the write queue. */
	while ((skb = tcp_send_head(sk))) {
		unsigned int limit;

		/* Set up GSO info; tso_segs is the number of segments this
		 * skb will be cut into on the wire. */
		tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
		BUG_ON(!tso_segs);

		/* Segments the congestion window still permits; zero means
		 * cwnd is exhausted, stop sending. */
		cwnd_quota = tcp_cwnd_test(tp, skb);
		if (!cwnd_quota)
			break;

		/* Stop if the receive window cannot take even one segment
		 * of this skb. */
		if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
			break;

		if (tso_segs == 1) {
			/* Single segment, possibly shorter than one MSS:
			 * ask the Nagle test whether this small packet may
			 * go out now. */
			if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
						     (tcp_skb_is_last(sk, skb) ?
						      nonagle : TCP_NAGLE_PUSH))))
				break;
		} else {
			/* Multiple segments (TSO): possibly defer so more
			 * data can be batched into one large GSO frame. */
			if (tcp_tso_should_defer(sk, skb))
				break;
		}

		/* Both windows allow at least one segment; now compute how
		 * many bytes of this skb may actually go out (limit). */
		limit = mss_now;
		if (tso_segs > 1)
			/* Bytes allowed by send and congestion windows; may
			 * exceed skb->len, see tcp_mss_split_point(). */
			limit = tcp_mss_split_point(sk, skb, mss_now, cwnd_quota);

		/* skb exceeds what the windows allow: split it. Only
		 * possible with TSO (a non-TSO skb never exceeds one MSS);
		 * the split is forced purely by flow/congestion control,
		 * not by TSO itself. */
		if (skb->len > limit &&
		    unlikely(tso_fragment(sk, skb, limit, mss_now)))
			break;

		/* Timestamp the transmission. */
		TCP_SKB_CB(skb)->when = tcp_time_stamp;

		/* Transmit. Non-zero means the send failed (e.g. qdisc
		 * full) and we stop. clone_it=1: tcp_transmit_skb() works
		 * on a clone, the original stays queued for retransmit. */
		if (unlikely(tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC)))
			break;

		/* New data went out: advance send_head, snd_nxt, etc. */
		tcp_event_new_data_sent(sk, skb);

		/* Nagle bookkeeping: record snd_sml if this packet was
		 * smaller than one MSS ("small" packet). */
		tcp_minshall_update(tp, mss_now, skb);

		sent_pkts++;
	}	/* end of while ((skb = tcp_send_head(sk))) */

	/* Something was sent: let congestion control validate cwnd. */
	if (likely(sent_pkts)) {
		tcp_cwnd_validate(sk);
		return 0;
	}

	/* Nothing was sent. Per the header comment this returns 1 only
	 * when nothing is in flight yet data is still queued (blocked by
	 * SWS or a similar problem); otherwise 0. */
	return !tp->packets_out && tcp_send_head(sk);
}
2.1.1 拥塞窗口检测tcp_cwnd_test()
该函数检测拥塞窗口是否允许发送数据段,如果允许,返回在拥塞窗口限制范围内,可用于发送的段数(注意:不是字节数)。
/* Can at least one segment of SKB be sent right now, according to the
 * congestion window rules? If so, return how many segments are allowed.
 */
static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb)
{
	u32 in_flight;

	/* Don't be strict about the congestion window for the final FIN:
	 * a lone FIN segment is always allowed through. */
	if (tcp_skb_pcount(skb) == 1 &&
	    (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN))
		return 1;

	/* The quota is whatever part of snd_cwnd (counted in segments)
	 * is not already occupied by in-flight packets. */
	in_flight = tcp_packets_in_flight(tp);
	if (in_flight >= tp->snd_cwnd)
		return 0;	/* cwnd exhausted: no sending allowed */

	return tp->snd_cwnd - in_flight;
}
/* Segments that were sent (first transmission or retransmission) but
 * are believed to have left the network already: those the peer SACKed
 * plus those we estimate to be lost.
 *
 * sacked_out: with SACK, segments acknowledged by SACK blocks; without
 *             SACK, the count of duplicate ACKs received (each dupack
 *             implies the peer got some out-of-order segment).
 * lost_out:   segments judged lost in transit -- TCP cannot know this
 *             for certain, so it is an algorithmic estimate.
 */
static inline unsigned int tcp_left_out(const struct tcp_sock *tp)
{
	return tp->lost_out + tp->sacked_out;
}
/* This determines how many packets are "in the network" to the best
 * of our knowledge. In many cases it is conservative, but where
 * detailed information is available from the receiver (via SACK
 * blocks etc.) we can make more aggressive calculations.
 *
 * Use this for decisions involving congestion control, use just
 * tp->packets_out to determine if the send queue is empty or not.
 *
 * Read this equation as:
 *
 *	"Packets sent once on transmission queue" MINUS
 *	"Packets left network, but not honestly ACKed yet" PLUS
 *	"Packets fast retransmitted"
 */
static inline unsigned int tcp_packets_in_flight(const struct tcp_sock *tp)
{
	/* packets_out:  sent from the queue, not yet acknowledged
	 * retrans_out:  retransmitted, not yet acknowledged
	 * tcp_left_out(): sent but already gone from the network
	 */
	return tp->packets_out + tp->retrans_out - tcp_left_out(tp);
}
可以看出,拥塞窗口的检测实际上非常的简单,就是看当前网络中还在传输的报文(即飞行报文)数量是否超过了拥塞窗口的限制。拥塞控制的核心在于如何在各种情况下合理的设定拥塞窗口tp->snd_cwnd的值。
2.1.2 发送窗口检测tcp_snd_wnd_test()
该函数判断当前发送窗口是否至少允许发送一个段,如果允许,返回1,否则返回0。如果skb的大小超过了一个MSS,那么只要允许发送一个MSS,就返回1;如果skb的大小小于一个MSS,那么只要允许发送所需的数据量就会返回1。
/* Does at least the first segment of SKB fit into the send window? */
static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss)
{
	/* Only the first cur_mss bytes need to fit: an oversized skb
	 * will be split before transmission anyway. */
	u32 end_seq = (skb->len > cur_mss) ?
		      TCP_SKB_CB(skb)->seq + cur_mss :
		      TCP_SKB_CB(skb)->end_seq;

	/* It fits iff that end sequence does not pass the right edge
	 * of the send window. */
	return !after(end_seq, tcp_wnd_end(tp));
}
/* Right edge of the send window: the oldest unacknowledged sequence
 * number (snd_una) plus the window the receiver advertised (snd_wnd).
 */
static inline u32 tcp_wnd_end(const struct tcp_sock *tp)
{
	return tp->snd_wnd + tp->snd_una;
}
2.1.3 tcp_mss_split_point()
该函数综合skb中数据长度、发送窗口允许发送数据量、拥塞窗口允许发送数据量,计算本次允许当前skb发送的数据量,以字节为单位。
/* Returns the portion of skb which can be sent right away without
* introducing MSS oddities to segment boundaries. In rare cases where
* mss_now != mss_cache, we will request caller to create a small skb
* per input skb which could be mostly avoided here (if desired).
*
* We explicitly want to create a request for splitting write queue tail
* to a small skb for Nagle purposes while avoiding unnecessary modulos,
* thus all the complexity (cwnd_len is always MSS multiple which we
* return whenever allowed by the other factors). Basically we need the
* modulo only when the receiver window alone is the limiting factor or
* when we would be allowed to send the split-due-to-Nagle skb fully.
*/
@skb:待判断的skb
@mss_now:当前MSS
@cwnd:拥塞窗口允许发送的段数,cwnd*mss_now即拥塞窗口允许发送的字节数;
static unsigned int tcp_mss_split_point(struct sock *sk, struct sk_buff *skb,
					unsigned int mss_now, unsigned int cwnd)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 needed, window, cwnd_len;

	/* Bytes the send window permits starting at this skb's first
	 * sequence number (may exceed skb->len). */
	window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
	/* Bytes the congestion window permits (always an MSS multiple). */
	cwnd_len = mss_now * cwnd;

	/* Common case: cwnd is the limiting factor and this is not the
	 * queue tail -- return the cwnd allowance without any modulo. */
	if (likely(cwnd_len <= window && skb != tcp_write_queue_tail(sk)))
		return cwnd_len;

	/* What we actually want to send, clipped to the receive window. */
	needed = min(skb->len, window);

	/* Tail skb still limited by cwnd: send the cwnd allowance. */
	if (skb == tcp_write_queue_tail(sk) && cwnd_len <= needed)
		return cwnd_len;

	/* Receiver window (or Nagle split) is the limit: round down to an
	 * MSS boundary so no odd-sized mid-stream segment is created. */
	return needed - needed % mss_now;
}
上面的实现不是很好理解,实际上该函数的逻辑如下:
- 最后一个skb、拥塞窗口受限-----返回拥塞窗口允许发送的数据量;
- 最后一个skb、拥塞窗口不受限-----返回min(发送窗口允许的数据量,实际要发送的数据量skb->len);
- 不是最后一个skb、拥塞窗口受限-----返回拥塞窗口允许发送的数据量,这种情况返回的允许值可能会大于skb中要发送的数据量。因为可能是这样的关系skb->len < cwnd_len <= window.
- 不是最后一个skb、拥塞窗口不受限-----返回min(发送窗口允许的数据量,实际要发送的数据量skb->len)。
2.1.4 tso_fragment()
在tcp_write_xmit()中,如果skb中数据量过大,超过了发送窗口和拥塞窗口的限定,只允许发送skb的一部分,那么就需要将skb拆分成两段,前半段长度为len,本次可以发送,后半段保存在新分配的skb中,在发送队列sk_write_queue中将后半段插入到前半段的后面,这样可以保证数据的顺序发送。
注:由于这种分割只是修改struct skb_shared_info的frags[]中的指针关系,不涉及内存拷贝,所以速度是很快的。
/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
 * which is put after SKB on the list. It is very much like
 * tcp_fragment() except that it may make several kinds of assumptions
 * in order to speed up the splitting operation. In particular, we
 * know that all the data is in scatter-gather pages, and that the
 * packet has never been sent out before (and thus is not cloned).
 */
static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
			unsigned int mss_now)
{
	struct sk_buff *buff;
	/* Bytes that move to the new (second) skb. */
	int nlen = skb->len - len;
	u16 flags;

	/* If skb carries linear data, fall back to tcp_fragment(). With
	 * TSO enabled, tcp_sendmsg() normally puts all payload into page
	 * frags (see select_size()), so this path is the exception. */
	if (skb->len != skb->data_len)
		return tcp_fragment(sk, skb, len, mss_now);

	/* Allocate the second skb with an empty linear area. */
	buff = sk_stream_alloc_skb(sk, 0, GFP_ATOMIC);
	if (unlikely(buff == NULL))
		return -ENOMEM;

	/* Transfer memory accounting for the bytes that move over. */
	sk->sk_wmem_queued += buff->truesize;
	sk_mem_charge(sk, buff->truesize);
	buff->truesize += nlen;
	skb->truesize -= nlen;

	/* Sequence bookkeeping: buff now covers [seq + len, end_seq). */
	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;

	/* PSH and FIN should only be set in the second packet. */
	flags = TCP_SKB_CB(skb)->flags;
	TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH);
	TCP_SKB_CB(buff)->flags = flags;

	/* This packet was never sent out yet, so no SACK bits. */
	TCP_SKB_CB(buff)->sacked = 0;

	buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL;

	/* Re-point the scatter-gather frags; no payload copy happens. */
	skb_split(skb, buff, len);

	/* Fix up tso_factor for both original and new SKB. */
	tcp_set_skb_tso_segs(sk, skb, mss_now);
	tcp_set_skb_tso_segs(sk, buff, mss_now);

	/* Insert buff right after skb so byte order stays intact. */
	skb_header_release(buff);
	tcp_insert_write_queue_after(skb, buff, sk);

	return 0;
}
2.1.5 tcp_event_new_data_sent()
发送队列中有新数据被发送出去时,调用该函数更新数据段统计信息。
/* Called after previously-unsent data has been transmitted: advance
 * the send head, update snd_nxt/packets_out, and arm the retransmit
 * timer for the first packet put in flight.
 */
static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned int prior_packets = tp->packets_out;

	/* Move sk_send_head past the skb just transmitted. */
	tcp_advance_send_head(sk, skb);

	/* Sequence number of the next new byte to send. */
	tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;

	/* Don't override Nagle indefinitely with F-RTO */
	if (tp->frto_counter == 2)
		tp->frto_counter = 3;

	/* Account for the newly outstanding (sent, unacked) segments. */
	tp->packets_out += tcp_skb_pcount(skb);

	/* Nothing was in flight before: start the retransmit timer. */
	if (!prior_packets)
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
}
3. tcp_push_one()
看了上面tcp_write_xmit()的处理后,再来看看tcp_push_one()的实现,会发现二者基本一致,区别正如函数名,本函数只尝试发送一包数据,而tcp_push()会尝试遍历整个发送队列,直到无法继续发送为止。
/* Send _single_ skb sitting at the send head. This function requires
 * true push pending frames to setup probe timer etc.
 */
void tcp_push_one(struct sock *sk, unsigned int mss_now)
{
	struct sk_buff *skb = tcp_send_head(sk);
	unsigned int tso_segs, cwnd_quota;

	BUG_ON(!skb || skb->len < mss_now);

	/* GSO setup, then the combined cwnd/rwnd/Nagle check.
	 * TCP_NAGLE_PUSH forces the Nagle test to pass for this packet. */
	tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
	cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH);

	if (likely(cwnd_quota)) {
		unsigned int limit;

		BUG_ON(!tso_segs);

		/* How many bytes of this skb may go out now; see
		 * tcp_mss_split_point() for the multi-segment case. */
		limit = mss_now;
		if (tso_segs > 1)
			limit = tcp_mss_split_point(sk, skb, mss_now,
						    cwnd_quota);

		/* Windows allow only part of the skb: split it first. */
		if (skb->len > limit &&
		    unlikely(tso_fragment(sk, skb, limit, mss_now)))
			return;

		/* Send it out now. */
		TCP_SKB_CB(skb)->when = tcp_time_stamp;

		/* clone_it=1: transmit a clone, keep the original queued. */
		if (likely(!tcp_transmit_skb(sk, skb, 1, sk->sk_allocation))) {
			/* Sent successfully: advance the queue state and
			 * let congestion control validate cwnd. */
			tcp_event_new_data_sent(sk, skb);
			tcp_cwnd_validate(sk);
			return;
		}
	}
}
4. tcp_transmit_skb()
该函数为传入的skb构造TCP首部,然后调用IP层的输出接口完成数据发送。
/* This routine actually transmits TCP packets queued in by
 * tcp_do_sendmsg(). This is used by both the initial
 * transmission and possible later retransmissions.
 * All SKB's seen here are completely headerless. It is our
 * job to build the TCP header, and pass the packet down to
 * IP so it can do the same plus pass the packet off to the
 * device.
 *
 * We are working here with either a clone of the original
 * SKB, or a fresh unique copy made by the retransmit engine.
 */
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
			    gfp_t gfp_mask)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_sock *inet;
	struct tcp_sock *tp;
	struct tcp_skb_cb *tcb;
	int tcp_header_size;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *md5;
	__u8 *md5_hash_location;
#endif
	struct tcphdr *th;
	int sysctl_flags;
	int err;

	BUG_ON(!skb || !tcp_skb_pcount(skb));

	/* If congestion control is doing timestamping, we must
	 * take such a timestamp before we potentially clone/copy.
	 */
	if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP)
		__net_timestamp(skb);

	/* When the caller asked for it, work on a clone (or a copy if the
	 * skb is already cloned), so the queued original stays available
	 * for a later retransmission. */
	if (likely(clone_it)) {
		if (unlikely(skb_cloned(skb)))
			skb = pskb_copy(skb, gfp_mask);
		else
			skb = skb_clone(skb, gfp_mask);
		if (unlikely(!skb))
			return -ENOBUFS;
	}

	inet = inet_sk(sk);
	tp = tcp_sk(sk);
	tcb = TCP_SKB_CB(skb);
	tcp_header_size = tp->tcp_header_len;

#define SYSCTL_FLAG_TSTAMPS 0x1
#define SYSCTL_FLAG_WSCALE 0x2
#define SYSCTL_FLAG_SACK 0x4

	/* Compute the header size by segment type: some options (MSS,
	 * window scale, SACK-permitted) may only be carried in SYN
	 * segments, hence the separate accounting below. */
	sysctl_flags = 0;
	if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
		tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
		if (sysctl_tcp_timestamps) {
			tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
			sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
		}
		if (sysctl_tcp_window_scaling) {
			tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
			sysctl_flags |= SYSCTL_FLAG_WSCALE;
		}
		if (sysctl_tcp_sack) {
			sysctl_flags |= SYSCTL_FLAG_SACK;
			if (!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
				tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
		}
	} else if (unlikely(tp->rx_opt.eff_sacks)) {
		/* A SACK is 2 pad bytes, a 2 byte header, plus
		 * 2 32-bit sequence numbers for each SACK block.
		 */
		tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
				   (tp->rx_opt.eff_sacks *
				    TCPOLEN_SACK_PERBLOCK));
	}

	/* Nothing was in flight, so this starts a new transmission burst:
	 * notify congestion control with CA_EVENT_TX_START. */
	if (tcp_packets_in_flight(tp) == 0)
		tcp_ca_event(sk, CA_EVENT_TX_START);

#ifdef CONFIG_TCP_MD5SIG
	/*
	 * Are we doing MD5 on this segment? If so - make
	 * room for it.
	 */
	md5 = tp->af_specific->md5_lookup(sk, sk);
	if (md5)
		tcp_header_size += TCPOLEN_MD5SIG_ALIGNED;
#endif

	/* Reserve room for the TCP header and fill in its fields. */
	skb_push(skb, tcp_header_size);
	skb_reset_transport_header(skb);
	skb_set_owner_w(skb, sk);

	/* Build TCP header and checksum it. */
	th = tcp_hdr(skb);
	th->source = inet->sport;
	th->dest = inet->dport;
	th->seq = htonl(tcb->seq);
	th->ack_seq = htonl(tp->rcv_nxt);
	/* Data offset (header length in words) and the flag bits share
	 * one 16-bit field at offset 12. */
	*(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
				      tcb->flags);

	if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
		/* RFC1323: The window in SYN & SYN/ACK segments
		 * is never scaled.
		 */
		th->window = htons(min(tp->rcv_wnd, 65535U));
	} else {
		th->window = htons(tcp_select_window(sk));
	}
	th->check = 0;
	th->urg_ptr = 0;

	/* Urgent mode: point URG at snd_up if it lies within range. */
	if (unlikely(tp->urg_mode &&
		     between(tp->snd_up, tcb->seq + 1, tcb->seq + 0xFFFF))) {
		th->urg_ptr = htons(tp->snd_up - tcb->seq);
		th->urg = 1;
	}

	/* Build the option block right after the fixed header. */
	if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
		tcp_syn_build_options((__be32 *)(th + 1),
				      tcp_advertise_mss(sk),
				      (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
				      (sysctl_flags & SYSCTL_FLAG_SACK),
				      (sysctl_flags & SYSCTL_FLAG_WSCALE),
				      tp->rx_opt.rcv_wscale,
				      tcb->when,
				      tp->rx_opt.ts_recent,
#ifdef CONFIG_TCP_MD5SIG
				      md5 ? &md5_hash_location :
#endif
				      NULL);
	} else {
		tcp_build_and_update_options((__be32 *)(th + 1),
					     tp, tcb->when,
#ifdef CONFIG_TCP_MD5SIG
					     md5 ? &md5_hash_location :
#endif
					     NULL);
		TCP_ECN_send(sk, skb, tcp_header_size);
	}

#ifdef CONFIG_TCP_MD5SIG
	/* Calculate the MD5 hash, as we have all we need now */
	if (md5) {
		tp->af_specific->calc_md5_hash(md5_hash_location,
					       md5,
					       sk, NULL, NULL,
					       tcp_hdr(skb),
					       sk->sk_protocol,
					       skb->len);
	}
#endif

	/* Checksum hook: tcp_v4_send_check() for TCPv4. */
	icsk->icsk_af_ops->send_check(sk, skb->len, skb);

	/* Segment carries an ACK: update delayed-ACK bookkeeping. */
	if (likely(tcb->flags & TCPCB_FLAG_ACK))
		tcp_event_ack_sent(sk, tcp_skb_pcount(skb));

	/* Segment carries payload: congestion-control bookkeeping. */
	if (skb->len != tcp_header_size)
		tcp_event_data_sent(tp, skb, sk);

	if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
		TCP_INC_STATS(TCP_MIB_OUTSEGS);

	/* Hand the packet down via queue_xmit -- ip_queue_xmit() for
	 * TCPv4. Values <= 0 are returned to the caller as-is. */
	err = icsk->icsk_af_ops->queue_xmit(skb, 0);
	if (likely(err <= 0))
		return err;

	/* Local congestion (e.g. qdisc drop): enter CWR state. */
	tcp_enter_cwr(sk, 1);

	/* Map NET_XMIT_* codes to the value the caller expects. */
	return net_xmit_eval(err);

#undef SYSCTL_FLAG_TSTAMPS
#undef SYSCTL_FLAG_WSCALE
#undef SYSCTL_FLAG_SACK
}
注:上面tcp_transmit_skb()中有些内容涉及到TCP的其它机制,后面有时间再来分析。
5. tcp_data_snd_check()
接收过程中,在收到ACK后,更新了发送窗口、拥塞窗口之后,也会调用tcp_data_snd_check()检查是否可以发送新数据。
/* Called from the receive path after an ACK has updated the send and
 * congestion windows: try to transmit new data, then check whether
 * send-buffer space has been freed.
 */
static inline void tcp_data_snd_check(struct sock *sk)
{
	/* Thin wrapper around __tcp_push_pending_frames(). */
	tcp_push_pending_frames(sk);
	/* Send-buffer memory management / writer wakeup. */
	tcp_check_space(sk);
}
static inline void tcp_push_pending_frames(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
__tcp_push_pending_frames(sk, tcp_current_mss(sk, 1), tp->nonagle);
}