TCP Data Transmission: Sending New Data

The previous note, TCP数据发送之tcp_sendmsg(), looked at tcp_sendmsg(), the core kernel handler behind the TCP send-related system calls. Its central job is to organize the data to be sent into skbs and append those skbs, in order, to the send queue sk_write_queue. The function also attempts to transmit new data by calling tcp_push() (or one of two other interfaces).

In addition, after an acknowledgment arrives, TCP calls tcp_data_snd_check() to see whether data can be sent, which is another opportunity to transmit new data.

This note walks through these transmission paths for new data (new data, as opposed to retransmissions).
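Putting the pieces together first, the call chains covered below are:

tcp_sendmsg()
    -> tcp_push()
         -> __tcp_push_pending_frames()
              -> tcp_write_xmit()
                   -> tcp_transmit_skb()
                        -> icsk->icsk_af_ops->queue_xmit()   //ip_queue_xmit() for IPv4
    -> tcp_push_one()   //single-skb variant; calls tcp_transmit_skb() directly

receive path, after an ACK has been processed:
    tcp_data_snd_check()
         -> tcp_push_pending_frames()
              -> __tcp_push_pending_frames() -> tcp_write_xmit() -> ...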

1. tcp_push()

As the implementation below shows, tcp_push() decides whether the PSH flag needs to be set and then calls __tcp_push_pending_frames().

static inline void tcp_push(struct sock *sk, int flags, int mss_now,
			    int nonagle)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (tcp_send_head(sk)) {
		//decide whether the PSH flag should be set
		struct sk_buff *skb = tcp_write_queue_tail(sk);
		if (!(flags & MSG_MORE) || forced_push(tp))
			tcp_mark_push(tp, skb);
		//MSG_OOB (urgent data) related; ignored here
		tcp_mark_urg(tp, flags, skb);
		//call __tcp_push_pending_frames() to attempt the transmission
		__tcp_push_pending_frames(sk, mss_now,
					  (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
	}
}
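For reference, tcp_mark_push() and forced_push() are small helpers; in this kernel generation they look roughly like the following (quoted from memory, so treat the exact form as an approximation):

static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
{
	TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
	tp->pushed_seq = tp->write_seq;
}

static inline int forced_push(struct tcp_sock *tp)
{
	//force a PSH once more than half of the largest window the peer
	//has ever advertised has accumulated since the last marked PSH
	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
}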

2. __tcp_push_pending_frames()

This function simply calls tcp_write_xmit() to do the actual sending.

/* Push out any pending frames which were held back due to
 * TCP_CORK or attempt at coalescing tiny packets.
 * The socket must be locked by the caller.
 */
void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
			       int nonagle)
{
	struct sk_buff *skb = tcp_send_head(sk);
	//if there is new data available, call tcp_write_xmit() to send it
	if (skb) {
		if (tcp_write_xmit(sk, cur_mss, nonagle))
			//if nothing could go out, arm the zero-window probe timer (window probing)
			tcp_check_probe_timer(sk);
	}
}
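For reference, tcp_check_probe_timer() arms the zero-window probe timer only when nothing is in flight and no other timer is pending; in this kernel generation it is roughly:

static inline void tcp_check_probe_timer(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);

	//with no packets in flight there will be no ACKs to trigger further
	//sends, so the probe timer is the only way to learn the window opened
	if (!tp->packets_out && !icsk->icsk_pending)
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
					  icsk->icsk_rto, TCP_RTO_MAX);
}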

2.1 tcp_write_xmit()

This is the core function for transmitting new TCP data; the key checks, including the send-window test and the congestion-control test, all happen here.

/* This routine writes packets to the network.  It advances the
 * send_head.  This happens as incoming acks open up the remote
 * window for us.
 *
 * Returns 1, if no segments are in flight and we have queued segments, but
 * cannot send anything now because of SWS or another problem.
 */
static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	unsigned int tso_segs, sent_pkts;
	int cwnd_quota;
	int result;

	/* If we are closed, the bytes will have to remain here.
	 * In time closedown will finish, we empty the write queue and all
	 * will be happy.
	 */
	//check the TCB state
	if (unlikely(sk->sk_state == TCP_CLOSE))
		return 0;

	//sent_pkts counts the segments sent during this invocation
	sent_pkts = 0;

	//PMTU probing: if a probe segment was sent, count it in sent_pkts
	if ((result = tcp_mtu_probe(sk)) == 0) {
		return 0;
	} else if (result > 0) {
		sent_pkts = 1;
	}

	//loop, sending segments that have not been transmitted yet
	while ((skb = tcp_send_head(sk))) {
		unsigned int limit;
		//set up the GSO segmentation info in the skb; the returned tso_segs is the number of segments the skb's data must be sent as
		tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
		BUG_ON(!tso_segs);
		//number of segments the congestion window still allows; 0 means the congestion window forbids sending, so stop
		cwnd_quota = tcp_cwnd_test(tp, skb);
		if (!cwnd_quota)
			break;
		//check that the send window allows at least one segment of this skb; if not, stop
		if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
			break;

		if (tso_segs == 1) {
			//tso_segs == 1 means the skb holds a single segment whose length may be below
			//the MSS, i.e. a small packet, so check whether Nagle allows sending it
			if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
						     (tcp_skb_is_last(sk, skb) ?
						      nonagle : TCP_NAGLE_PUSH))))
				break;
		} else {
			//tso_segs > 1 means TSO segmentation is needed; check whether the send should
			//be deferred, a delay aimed mainly at improving GSO performance
			if (tcp_tso_should_defer(sk, skb))
				break;
		}

		//having passed the congestion-window and send-window tests above, at least one
		//TCP segment can be sent now, possibly more, so adjust limit accordingly below

		//if the skb spans several segments, work out how much data may actually be sent
		limit = mss_now;
		if (tso_segs > 1)
			//tcp_mss_split_point() returns the maximum number of bytes the send window and
			//congestion window allow, which may exceed the skb's own data; see below
			limit = tcp_mss_split_point(sk, skb, mss_now, cwnd_quota);

		//the skb holds more data than the limit, so it must be split. This can only happen
		//in the TSO case, since without TSO an skb never exceeds the MSS. The split is forced
		//purely by congestion control and flow control capping the send size; it has nothing
		//to do with TSO itself
		if (skb->len > limit &&
		    unlikely(tso_fragment(sk, skb, limit, mss_now)))
			break;
		//record the transmission timestamp of the packet
		TCP_SKB_CB(skb)->when = tcp_time_stamp;
		//transmit; a non-zero return means this send failed (e.g. the qdisc queue is full), so stop.
		//the third argument of 1 tells tcp_transmit_skb() to clone the skb header before sending
		if (unlikely(tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC)))
			break;
		//new data went out: advance the send queue and update the related statistics
		tcp_event_new_data_sent(sk, skb);
		//Nagle bookkeeping: if less than an MSS was just sent it counts as a small packet, so update snd_sml
		tcp_minshall_update(tp, mss_now, skb);
		//count the sent packet
		sent_pkts++;
	}//end of while((skb = tcp_send_head(sk)))

	//if any packets were sent, update the congestion-control state
	if (likely(sent_pkts)) {
		tcp_cwnd_validate(sk);
		return 0;
	}
	//either of the following two cases also counts as success (a return value of 0 means success):
	//1. there are already unacknowledged packets in flight;
	//2. sk->sk_send_head is NULL, i.e. there is no new data left to send
	return !tp->packets_out && tcp_send_head(sk);
}

2.1.1 Congestion window check: tcp_cwnd_test()

This function checks whether the congestion window permits sending; if it does, it returns how many segments (note: segments, not bytes) may still be sent within the congestion window.

/* Can at least one segment of SKB be sent right now, according to the
 * congestion window rules?  If so, return how many segments are allowed.
 */
static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb)
{
	u32 in_flight, cwnd;

	/* Don't be strict about the congestion window for the final FIN.  */
	//a segment carrying FIN that counts as a single segment may always be sent (a FIN may
	//also carry data); it is never blocked by the congestion window
	if ((TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) && tcp_skb_pcount(skb) == 1)
		return 1;
	//estimate the number of TCP segments still travelling through the network
	in_flight = tcp_packets_in_flight(tp);
	//snd_cwnd is the current congestion window, measured in segments
	cwnd = tp->snd_cwnd;
	//compare the congestion window with the number of in-flight segments; the surplus is how many more segments congestion control allows
	if (in_flight < cwnd)
		return (cwnd - in_flight);
	//the congestion window is exhausted; return 0 to forbid sending
	return 0;
}

//this function estimates the number of segments that were transmitted (first transmission
//or retransmission) and have since left the network, chiefly SACKed segments plus those judged lost
static inline unsigned int tcp_left_out(const struct tcp_sock *tp)
{
	//sacked_out: with SACK enabled, the number of segments acknowledged by SACK options;
	//            without SACK, the count of duplicate ACKs received, since a duplicate ACK is
	//            never sent spontaneously but only after the peer received a data packet;
	//lost_out: the number of segments deemed lost in transit; TCP has no mechanism to know for
	//          certain that a transmitted segment was really lost, so this is an algorithmic estimate
	//either way, both kinds of segments were sent but can be assumed to be gone from the network
	return tp->sacked_out + tp->lost_out;
}

/* This determines how many packets are "in the network" to the best
 * of our knowledge.  In many cases it is conservative, but where
 * detailed information is available from the receiver (via SACK
 * blocks etc.) we can make more aggressive calculations.
 *
 * Use this for decisions involving congestion control, use just
 * tp->packets_out to determine if the send queue is empty or not.
 *
 * Read this equation as:
 *
 *	"Packets sent once on transmission queue" MINUS
 *	"Packets left network, but not honestly ACKed yet" PLUS
 *	"Packets fast retransmitted"
 */
static inline unsigned int tcp_packets_in_flight(const struct tcp_sock *tp)
{
	//packets_out: segments sent from the send queue but not yet acknowledged (excluding retransmissions)
	//retrans_out: segments that were sent as retransmissions and are not yet acknowledged
	//tcp_left_out(): segments that were sent but have already left the network
	return tp->packets_out - tcp_left_out(tp) + tp->retrans_out;
}

As the code shows, the congestion-window test is actually very simple: it merely checks whether the number of packets still travelling through the network (the in-flight packets) has reached the congestion window. The real substance of congestion control lies in how tp->snd_cwnd is set under the various conditions.
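A quick worked example with hypothetical numbers (not taken from the source): suppose packets_out = 10, sacked_out = 2, lost_out = 1, retrans_out = 1 and snd_cwnd = 10. Then tcp_left_out() = 2 + 1 = 3, tcp_packets_in_flight() = 10 - 3 + 1 = 8, and tcp_cwnd_test() returns 10 - 8 = 2: congestion control currently permits two more segments.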

2.1.2 Send window check: tcp_snd_wnd_test()

This function checks whether the send window allows at least one segment to go out, returning 1 if so and 0 otherwise. If the skb holds more than one MSS of data, it returns 1 as long as a single MSS may be sent; if the skb holds less than one MSS, it returns 1 as long as the window covers the data the skb actually carries.

/* Does at least the first segment of SKB fit into the send window? */
static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss)
{
	u32 end_seq = TCP_SKB_CB(skb)->end_seq;
	//if the skb holds more than one segment's worth of data, clamp end_seq to the end of the first segment
	if (skb->len > cur_mss)
		end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
	//check whether the end of that segment lies beyond the right edge of the send window
	return !after(end_seq, tcp_wnd_end(tp));
}

//returns the right edge of the send window
static inline u32 tcp_wnd_end(const struct tcp_sock *tp)
{
	//snd_una: the smallest sequence number that has been sent but not yet acknowledged
	//snd_wnd: the current send window, i.e. the receive window last advertised by the peer
	return tp->snd_una + tp->snd_wnd;
}
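Again with hypothetical numbers: if snd_una = 1000 and snd_wnd = 3000, then tcp_wnd_end() = 4000. For an skb with seq = 3000, len = 2920 and cur_mss = 1460, end_seq is clamped to 3000 + 1460 = 4460, which is after 4000, so tcp_snd_wnd_test() returns 0 and tcp_write_xmit() stops. Had the skb started at seq = 2000, the clamped end_seq of 3460 would fit inside the window and the test would pass.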

2.1.3 tcp_mss_split_point()

Taking into account the skb's data length, the amount the send window allows, and the amount the congestion window allows, this function computes how many bytes of the current skb may be sent now.

/* Returns the portion of skb which can be sent right away without
 * introducing MSS oddities to segment boundaries. In rare cases where
 * mss_now != mss_cache, we will request caller to create a small skb
 * per input skb which could be mostly avoided here (if desired).
 *
 * We explicitly want to create a request for splitting write queue tail
 * to a small skb for Nagle purposes while avoiding unnecessary modulos,
 * thus all the complexity (cwnd_len is always MSS multiple which we
 * return whenever allowed by the other factors). Basically we need the
 * modulo only when the receiver window alone is the limiting factor or
 * when we would be allowed to send the split-due-to-Nagle skb fully.
 */
@skb: the skb being examined
@mss_now: the current MSS
@cwnd: the number of segments the congestion window allows; cwnd * mss_now is the congestion window's byte allowance
static unsigned int tcp_mss_split_point(struct sock *sk, struct sk_buff *skb,
					unsigned int mss_now, unsigned int cwnd)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 needed, window, cwnd_len;
	//window is the maximum number of bytes the send window allows this skb to send (it may exceed skb->len)
	window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
	//cwnd_len is the number of bytes the congestion window allows
	cwnd_len = mss_now * cwnd;

	//see the notes below the function for the effect this logic achieves
	if (likely(cwnd_len <= window && skb != tcp_write_queue_tail(sk)))
		return cwnd_len;
	//needed is the amount actually to be sent after clamping to the send window
	needed = min(skb->len, window);

	if (skb == tcp_write_queue_tail(sk) && cwnd_len <= needed)
		return cwnd_len;
	//the final return value is a whole multiple of the MSS, still counted in bytes
	return needed - needed % mss_now;
}

The implementation above is not easy to follow at first; its actual logic is the following (a small sketch of the same decision logic appears after this list):

  • last skb in the queue, congestion-window limited: return the amount the congestion window allows;
  • last skb, not congestion-window limited: return min(amount the send window allows, skb->len), rounded down to a multiple of the MSS;
  • not the last skb, congestion-window limited: return the amount the congestion window allows; in this case the returned allowance may exceed the data in the skb, since the relationship may be skb->len < cwnd_len <= window;
  • not the last skb, not congestion-window limited: return min(amount the send window allows, skb->len), rounded down to a multiple of the MSS.
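To make the four cases concrete, here is a minimal userspace model of the same decision logic (a hypothetical stand-alone sketch, not kernel code; skb_len, window, cwnd_len and is_tail stand in for skb->len, the send-window allowance, mss_now * cwnd and the tail-of-queue test):

#include <stdio.h>

static unsigned int mss_split_point(unsigned int skb_len, unsigned int window,
				    unsigned int cwnd_len, unsigned int mss,
				    int is_tail)
{
	unsigned int needed;

	//congestion-window limited and not the last skb
	if (cwnd_len <= window && !is_tail)
		return cwnd_len;

	//clamp to the send window and the skb's own data
	needed = skb_len < window ? skb_len : window;

	//last skb and congestion-window limited
	if (is_tail && cwnd_len <= needed)
		return cwnd_len;

	//send-window/length limited: round down to an MSS multiple
	return needed - needed % mss;
}

int main(void)
{
	//cwnd_len = 4380 <= window = 8000, not the tail: prints 4380 (cwnd limited)
	printf("%u\n", mss_split_point(6000, 8000, 4380, 1460, 0));
	//cwnd_len = 14600 > window = 8000: needed = min(6000, 8000) = 6000,
	//prints 5840 = 4 * 1460 (window/length limited, MSS-aligned)
	printf("%u\n", mss_split_point(6000, 8000, 14600, 1460, 0));
	return 0;
}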

2.1.4 tso_fragment()

In tcp_write_xmit(), when the skb carries more data than the send window and congestion window permit, only part of it may go out, so the skb must be split in two: the first part, of length len, is sent now; the remainder is placed into a newly allocated skb which is inserted right after the first part in the send queue sk_write_queue, preserving in-order transmission of the data.

Note: this split only rearranges the pointers in frags[] of struct skb_shared_info and involves no memory copying, so it is very fast.

/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
 * which is put after SKB on the list.  It is very much like
 * tcp_fragment() except that it may make several kinds of assumptions
 * in order to speed up the splitting operation.  In particular, we
 * know that all the data is in scatter-gather pages, and that the
 * packet has never been sent out before (and thus is not cloned).
 */
static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
			unsigned int mss_now)
{
	struct sk_buff *buff;
	//the new skb's length is the remaining part
	int nlen = skb->len - len;
	u16 flags;

	//if the skb has data in its linear area, delegate the split to tcp_fragment(); with TSO
	//enabled, tcp_sendmsg() never puts data in the linear area when building skbs, see select_size()
	if (skb->len != skb->data_len)
		return tcp_fragment(sk, skb, len, mss_now);

	//allocate a new skb with a zero-length linear area
	buff = sk_stream_alloc_skb(sk, 0, GFP_ATOMIC);
	if (unlikely(buff == NULL))
		return -ENOMEM;
	//update the memory accounting
	sk->sk_wmem_queued += buff->truesize;
	sk_mem_charge(sk, buff->truesize);
	buff->truesize += nlen;
	skb->truesize -= nlen;

	//set the sequence numbers of the newly allocated skb
	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;

	/* PSH and FIN should only be set in the second packet. */
	//flags of the newly allocated skb; FIN and PSH stay only on the second part
	flags = TCP_SKB_CB(skb)->flags;
	TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH);
	TCP_SKB_CB(buff)->flags = flags;

	/* This packet was never sent out yet, so no SACK bits. */
	TCP_SKB_CB(buff)->sacked = 0;

	buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL;
	//re-distribute the scatter-gather page pointers of the skb between the two
	skb_split(skb, buff, len);

	/* Fix up tso_factor for both original and new SKB.  */
	//recompute the TSO info of both the old and the new skb
	tcp_set_skb_tso_segs(sk, skb, mss_now);
	tcp_set_skb_tso_segs(sk, buff, mss_now);

	//insert the new skb into the send queue
	skb_header_release(buff);
	tcp_insert_write_queue_after(skb, buff, sk);

	return 0;
}

2.1.5 tcp_event_new_data_sent()

Whenever new data from the send queue is transmitted, this function is called to update the segment accounting.

static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned int prior_packets = tp->packets_out;
	//advance the send-queue pointer sk_send_head
	tcp_advance_send_head(sk, skb);
	//update the sequence number of the next segment to be sent
	tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;

	/* Don't override Nagle indefinately with F-RTO */
	//F-RTO algorithm
	if (tp->frto_counter == 2)
		tp->frto_counter = 3;
	//add to the count of segments sent but not yet acknowledged
	tp->packets_out += tcp_skb_pcount(skb);
	//if nothing had been sent before, start the retransmission timer
	if (!prior_packets)
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
}

3. tcp_push_one()

Having walked through tcp_write_xmit(), tcp_push_one() turns out to be essentially the same. The difference is exactly what the name suggests: this function tries to send a single packet, whereas tcp_push() keeps walking the send queue until nothing more can be sent.

/* Send _single_ skb sitting at the send head. This function requires
 * true push pending frames to setup probe timer etc.
 */
void tcp_push_one(struct sock *sk, unsigned int mss_now)
{
	struct sk_buff *skb = tcp_send_head(sk);
	unsigned int tso_segs, cwnd_quota;

	BUG_ON(!skb || skb->len < mss_now);

	tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
	cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH);

	if (likely(cwnd_quota)) {
		unsigned int limit;

		BUG_ON(!tso_segs);

		limit = mss_now;
		if (tso_segs > 1)
			limit = tcp_mss_split_point(sk, skb, mss_now,
						    cwnd_quota);

		if (skb->len > limit &&
		    unlikely(tso_fragment(sk, skb, limit, mss_now)))
			return;

		/* Send it out now. */
		TCP_SKB_CB(skb)->when = tcp_time_stamp;

		if (likely(!tcp_transmit_skb(sk, skb, 1, sk->sk_allocation))) {
			tcp_event_new_data_sent(sk, skb);
			tcp_cwnd_validate(sk);
			return;
		}
	}
}
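tcp_snd_test(), used above, does not appear elsewhere in this note; in this kernel generation it bundles the Nagle, congestion-window and send-window checks and returns the congestion-window quota, roughly as follows (quoted from memory, so treat the exact form as an approximation):

static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,
				 unsigned int cur_mss, int nonagle)
{
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned int cwnd_quota;

	tcp_init_tso_segs(sk, skb, cur_mss);

	//Nagle check first; TCP_NAGLE_PUSH, as passed by tcp_push_one(),
	//makes this test succeed unconditionally
	if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
		return 0;

	//the quota is what the congestion window allows, zeroed if the
	//send window does not admit even the first segment
	cwnd_quota = tcp_cwnd_test(tp, skb);
	if (cwnd_quota && !tcp_snd_wnd_test(tp, skb, cur_mss))
		cwnd_quota = 0;

	return cwnd_quota;
}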

4. tcp_transmit_skb()

This function builds the TCP header for the given skb and then calls the IP layer's output interface to transmit it.

/* This routine actually transmits TCP packets queued in by
 * tcp_do_sendmsg().  This is used by both the initial
 * transmission and possible later retransmissions.
 * All SKB's seen here are completely headerless.  It is our
 * job to build the TCP header, and pass the packet down to
 * IP so it can do the same plus pass the packet off to the
 * device.
 *
 * We are working here with either a clone of the original
 * SKB, or a fresh unique copy made by the retransmit engine.
 */
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
			    gfp_t gfp_mask)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_sock *inet;
	struct tcp_sock *tp;
	struct tcp_skb_cb *tcb;
	int tcp_header_size;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *md5;
	__u8 *md5_hash_location;
#endif
	struct tcphdr *th;
	int sysctl_flags;
	int err;

	BUG_ON(!skb || !tcp_skb_pcount(skb));

	/* If congestion control is doing timestamping, we must
	 * take such a timestamp before we potentially clone/copy.
	 */
	//congestion-control related: take the timestamp before any clone/copy
	if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP)
		__net_timestamp(skb);
	//if the caller asked for the skb to be cloned before sending, clone it here
	if (likely(clone_it)) {
		if (unlikely(skb_cloned(skb)))
			skb = pskb_copy(skb, gfp_mask);
		else
			skb = skb_clone(skb, gfp_mask);
		if (unlikely(!skb))
			return -ENOBUFS;
	}

	inet = inet_sk(sk);
	tp = tcp_sk(sk);
	tcb = TCP_SKB_CB(skb);
	tcp_header_size = tp->tcp_header_len;

#define SYSCTL_FLAG_TSTAMPS	0x1
#define SYSCTL_FLAG_WSCALE	0x2
#define SYSCTL_FLAG_SACK	0x4

	//determine the TCP header length according to the segment type; some options may only be
	//carried in SYN segments, so the two cases are computed separately
	sysctl_flags = 0;
	if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
		tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
		if (sysctl_tcp_timestamps) {
			tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
			sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
		}
		if (sysctl_tcp_window_scaling) {
			tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
			sysctl_flags |= SYSCTL_FLAG_WSCALE;
		}
		if (sysctl_tcp_sack) {
			sysctl_flags |= SYSCTL_FLAG_SACK;
			if (!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
				tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
		}
	} else if (unlikely(tp->rx_opt.eff_sacks)) {
		/* A SACK is 2 pad bytes, a 2 byte header, plus
		 * 2 32-bit sequence numbers for each SACK block.
		 */
		tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
				    (tp->rx_opt.eff_sacks *
				     TCPOLEN_SACK_PERBLOCK));
	}

	//congestion control: if nothing was in flight before, this transmission starts things off,
	//so notify the congestion-control algorithm with the CA_EVENT_TX_START event
	if (tcp_packets_in_flight(tp) == 0)
		tcp_ca_event(sk, CA_EVENT_TX_START);

#ifdef CONFIG_TCP_MD5SIG
	/*
	 * Are we doing MD5 on this segment? If so - make
	 * room for it.
	 */
	md5 = tp->af_specific->md5_lookup(sk, sk);
	if (md5)
		tcp_header_size += TCPOLEN_MD5SIG_ALIGNED;
#endif
	//fill in the fields of the TCP header
	skb_push(skb, tcp_header_size);
	skb_reset_transport_header(skb);
	skb_set_owner_w(skb, sk);

	/* Build TCP header and checksum it. */
	th = tcp_hdr(skb);
	th->source		= inet->sport;
	th->dest		= inet->dport;
	th->seq			= htonl(tcb->seq);
	th->ack_seq		= htonl(tp->rcv_nxt);
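	/* the 16-bit word at byte offset 12 of the TCP header holds the 4-bit
	 * data offset (header length in 32-bit words), the reserved bits and
	 * the flag bits; the next line stores doff and the flags in one write */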
	*(((__be16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) |
					tcb->flags);

	if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
		/* RFC1323: The window in SYN & SYN/ACK segments
		 * is never scaled.
		 */
		th->window	= htons(min(tp->rcv_wnd, 65535U));
	} else {
		th->window	= htons(tcp_select_window(sk));
	}
	th->check		= 0;
	th->urg_ptr		= 0;

	if (unlikely(tp->urg_mode &&
		     between(tp->snd_up, tcb->seq + 1, tcb->seq + 0xFFFF))) {
		th->urg_ptr		= htons(tp->snd_up - tcb->seq);
		th->urg			= 1;
	}
	//build the options part of the TCP header
	if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
		tcp_syn_build_options((__be32 *)(th + 1),
				      tcp_advertise_mss(sk),
				      (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
				      (sysctl_flags & SYSCTL_FLAG_SACK),
				      (sysctl_flags & SYSCTL_FLAG_WSCALE),
				      tp->rx_opt.rcv_wscale,
				      tcb->when,
				      tp->rx_opt.ts_recent,

#ifdef CONFIG_TCP_MD5SIG
				      md5 ? &md5_hash_location :
#endif
				      NULL);
	} else {
		tcp_build_and_update_options((__be32 *)(th + 1),
					     tp, tcb->when,
#ifdef CONFIG_TCP_MD5SIG
					     md5 ? &md5_hash_location :
#endif
					     NULL);
		TCP_ECN_send(sk, skb, tcp_header_size);
	}

#ifdef CONFIG_TCP_MD5SIG
	/* Calculate the MD5 hash, as we have all we need now */
	if (md5) {
		tp->af_specific->calc_md5_hash(md5_hash_location,
					       md5,
					       sk, NULL, NULL,
					       tcp_hdr(skb),
					       sk->sk_protocol,
					       skb->len);
	}
#endif
	//checksum handling; for TCPv4 this is tcp_v4_send_check()
	icsk->icsk_af_ops->send_check(sk, skb->len, skb);
	//the segment carries an ACK, so the delayed-ACK machinery needs some updating
	if (likely(tcb->flags & TCPCB_FLAG_ACK))
		tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
	//the segment carries payload, so do some congestion-control bookkeeping
	if (skb->len != tcp_header_size)
		tcp_event_data_sent(tp, skb, sk);

	if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
		TCP_INC_STATS(TCP_MIB_OUTSEGS);
	//hand the packet to the queue_xmit interface, which takes it into the IP layer, returning an
	//error code on failure; for TCP over IPv4 this interface is implemented by ip_queue_xmit()
	err = icsk->icsk_af_ops->queue_xmit(skb, 0);
	if (likely(err <= 0))
		return err;
	//local congestion was signalled (e.g. by the qdisc); enter the CWR state
	tcp_enter_cwr(sk, 1);
	//map the error code to the function's return value
	return net_xmit_eval(err);

#undef SYSCTL_FLAG_TSTAMPS
#undef SYSCTL_FLAG_WSCALE
#undef SYSCTL_FLAG_SACK
}

Note: parts of tcp_transmit_skb() above touch other TCP mechanisms; those will be analyzed separately when time permits.

5. tcp_data_snd_check()

On the receive side, after an ACK has been processed and the send window and congestion window have been updated, tcp_data_snd_check() is also called to check whether new data can be sent.

static inline void tcp_data_snd_check(struct sock *sk)
{
	//a wrapper around __tcp_push_pending_frames(), shown above
	tcp_push_pending_frames(sk);
	//memory-management related
	tcp_check_space(sk);
}

static inline void tcp_push_pending_frames(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	__tcp_push_pending_frames(sk, tcp_current_mss(sk, 1), tp->nonagle);
}

6. References

  1. TCP数据发送之TSO处理 (TCP data transmission: TSO handling)
  2. TCP特性之Nagle算法 (TCP features: the Nagle algorithm; unfinished)