Linux Protocol Stack: TCP Timers, the Retransmission Timeout Timer

This article examines the retransmission timeout (RTO) mechanism in the Linux TCP stack in detail: the relevant data structure icsk_retransmit_timer and its initialization, when the timer is armed via inet_csk_reset_xmit_timer, and the timeout handler tcp_write_timer() together with its helpers such as tcp_retransmit_timer(). It also covers how system parameters such as tcp_retries1 and tcp_retries2 shape the timeout policy.

Contents

1 Related data structure: icsk_retransmit_timer

2 Initialization: tcp_init_xmit_timers

3 Arming the timer: inet_csk_reset_xmit_timer

3.1 When the timer is armed

3.1.1 Sending new data: tcp_event_new_data_sent

3.1.2 Client sending a SYN segment

4 Timeout handling: tcp_write_timer()

4.1 Handling a retransmission timeout: tcp_retransmit_timer()

4.1.1 Deciding when retransmission must stop: tcp_write_timeout()

4.1.2 Maximum retries for orphan sockets: tcp_orphan_retries()

4.1.3 Resource-exhaustion check for orphan sockets: tcp_out_of_resources()

5 Retransmitting a packet: tcp_retransmit_skb()

6 System parameters

6.1 sysctl_tcp_retries1(INTEGER)

6.2 sysctl_tcp_retries2(INTEGER)

6.3 sysctl_tcp_orphan_retries(INTEGER)

6.4 sysctl_tcp_max_orphans(INTEGER)

6.5 sysctl_tcp_retrans_collapse(BOOLEAN)


1 Related data structure: icsk_retransmit_timer

struct inet_connection_sock {
...
	//Expiry time of icsk_retransmit_timer; the timer fires once jiffies passes this value
	unsigned long		  icsk_timeout;
	//Shared by the retransmission timer, the persist (zero-window probe) timer, and others
 	struct timer_list	  icsk_retransmit_timer;
...
	//Congestion-control state
	__u8			  icsk_ca_state;
	//Number of retransmission timeouts so far; cleared when leaving the LOSS state
	__u8			  icsk_retransmits;
	//The handler behind icsk_retransmit_timer services several timer events (see the
	//ICSK_TIME_* codes below), so the pending event must also be recorded here;
	//0 means no event is pending
	__u8			  icsk_pending;
...
};
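
The event codes stored in icsk_pending come from include/net/inet_connection_sock.h. For reference, the values below are from a 2.6-era kernel and may differ in other versions:

#define ICSK_TIME_RETRANS	1	/* Retransmit timer */
#define ICSK_TIME_DACK		2	/* Delayed ack timer */
#define ICSK_TIME_PROBE0	3	/* Zero window probe timer */
#define ICSK_TIME_KEEPOPEN	4	/* Keepalive timer */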

2 Initialization: tcp_init_xmit_timers

The retransmission timer is initialized during the socket() system call. Specifically, after the TCB has been created, tcp_v4_init_sock() is called to initialize its TCP-specific fields:

static int tcp_v4_init_sock(struct sock *sk)
{
...
	tcp_init_xmit_timers(sk);
...
}

void tcp_init_xmit_timers(struct sock *sk)
{
	//The retransmission timer's handler is tcp_write_timer()
	inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
				  &tcp_keepalive_timer);
}

/*
 * Using different timers for retransmit, delayed acks and probes
 * We may wish use just one timer maintaining a list of expire jiffies
 * to optimize.
 */
//This function installs all three timers
void inet_csk_init_xmit_timers(struct sock *sk,
			       void (*retransmit_handler)(unsigned long),
			       void (*delack_handler)(unsigned long),
			       void (*keepalive_handler)(unsigned long))
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	setup_timer(&icsk->icsk_retransmit_timer, retransmit_handler,
			(unsigned long)sk);
	setup_timer(&icsk->icsk_delack_timer, delack_handler,
			(unsigned long)sk);
	setup_timer(&sk->sk_timer, keepalive_handler, (unsigned long)sk);
	//Since one timer handler may service several timer events, the events must be
	//distinguished; the pending fields record which event is due, 0 meaning none
	icsk->icsk_pending = icsk->icsk_ack.pending = 0;
}

//setup_timer() only initializes the timer's fields; it does not start the timer
static inline void setup_timer(struct timer_list * timer,
				void (*function)(unsigned long),
				unsigned long data)
{
	timer->function = function;
	//The data argument is the pointer to the TCB
	timer->data = data;
	init_timer(timer);
}

3 Arming the timer: inet_csk_reset_xmit_timer

The timer is armed by the function inet_csk_reset_xmit_timer().

/*
 *	Reset the retransmission timer
 *
 *	@what: which timer to reset; for the retransmission timer this is ICSK_TIME_RETRANS
 *	@when: number of ticks from now until the timer expires
 *	@max_when: upper bound on @when; if @when exceeds it, max_when is used instead
 */
static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what,
					     unsigned long when,
					     const unsigned long max_when)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	//Clamp the when parameter
	if (when > max_when) {
		when = max_when;
	}
	//We focus on the case where what is ICSK_TIME_RETRANS
	if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0) {
		//Record the event in icsk_pending to mark the retransmission timer as armed
		icsk->icsk_pending = what;
		//Record the expiry time in icsk_timeout
		icsk->icsk_timeout = jiffies + when;
		//(Re)start the timer
		sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
	} else if (what == ICSK_TIME_DACK) {
		icsk->icsk_ack.pending |= ICSK_ACK_TIMER;
		icsk->icsk_ack.timeout = jiffies + when;
		sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout);
	}
}

void sk_reset_timer(struct sock *sk, struct timer_list* timer,
		    unsigned long expires)
{
	//While the timer is armed it holds a reference on the TCB, preventing it from being freed
	if (!mod_timer(timer, expires))
		sock_hold(sk);
}
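
Note that mod_timer() returns 0 only when the timer was not already pending, so the sock_hold() reference is taken exactly once, when the timer goes from idle to armed. The counterpart that releases the reference is sk_stop_timer(); for reference, its 2.6-era implementation:

void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	//del_timer() returns nonzero only if the timer was still pending, so the
	//reference taken in sk_reset_timer() is dropped exactly once
	if (timer_pending(timer) && del_timer(timer))
		__sock_put(sk);
}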

3.1 When the timer is armed

The retransmission timer is armed in many scenarios; two representative ones follow:

3.1.1 Sending new data: tcp_event_new_data_sent

tcp_event_new_data_sent
	--tcp_write_xmit
		--__tcp_push_pending_frames
			--tcp_push
	--tcp_push_one

static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned int prior_packets = tp->packets_out;

	tcp_advance_send_head(sk, skb);
	tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;

	/* Don't override Nagle indefinately with F-RTO */
	if (tp->frto_counter == 2)
		tp->frto_counter = 3;

	tp->packets_out += tcp_skb_pcount(skb);
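	//Arm the retransmission timer only when there was previously no
	//unacknowledged data in flight; otherwise the timer is already running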
	if (!prior_packets)
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
}

3.1.2 Client sending a SYN segment

/* Build a SYN and send it off. */
int tcp_connect(struct sock *sk)
{
    ...
    /* Initialize the connection-related fields of the transmission
       control block, including the initial retransmission timeout
       (RTO): for the SYN segment it is 1s here, though the value
       differs across kernel versions; on kernel 3.0.101 (SUSE11SP3)
       it is 3s.
    */
    tcp_connect_init(sk);
    ...

    /* Timer for repeating the SYN until an answer.
       This is where the retransmission timer is armed;
       TCP_RTO_MAX here is 120s.
    */
    inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
                  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
    return 0;
}
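
For context, the timeout constants used here are defined in include/net/tcp.h. The values below are from a 2.6-era tree; as the comment above notes, TCP_TIMEOUT_INIT is version-dependent (3*HZ in 2.6/3.0-era kernels, reduced to 1*HZ in later kernels per RFC 6298):

#define TCP_RTO_MAX	((unsigned)(120*HZ))	/* upper bound on the RTO: 120 s */
#define TCP_RTO_MIN	((unsigned)(HZ/5))	/* lower bound on the RTO: 200 ms */
#define TCP_TIMEOUT_INIT ((unsigned)(3*HZ))	/* initial RTO for the SYN; 1*HZ in later kernels */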

4 Timeout handling: tcp_write_timer()

When the RTO expires, processing starts in the timer handler tcp_write_timer(). In what follows we only look at the handling done for connections in the established state.

static void tcp_write_timer(unsigned long data)
{
	struct sock *sk = (struct sock*)data;
	struct inet_connection_sock *icsk = inet_csk(sk);
	int event;

	//Timer handlers run in softirq context, so lock the socket first
	bh_lock_sock(sk);

	//If the TCB is locked by process context, retry 50ms later
	if (sock_owned_by_user(sk)) {
		sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20));
		goto out_unlock;
	}
	//If the socket is already closed or no timer event is actually pending, bail out
	if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending)
		goto out;
	//Not yet expired: re-arm the timer with the recorded expiry time
	if (time_after(icsk->icsk_timeout, jiffies)) {
		sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
		goto out;
	}
	//Everything above is defensive checking

	event = icsk->icsk_pending;
	//About to process the event: clear icsk_pending; it will be set again later if needed
	icsk->icsk_pending = 0;
	switch (event) {
	case ICSK_TIME_RETRANS:
		//Retransmission timeouts are handled by this function
		tcp_retransmit_timer(sk);
		break;
	case ICSK_TIME_PROBE0:
		tcp_probe_timer(sk);
		break;
	}
	TCP_CHECK_TIMER(sk);

out:
	//Reclaim memory
	sk_mem_reclaim(sk);
out_unlock:
	bh_unlock_sock(sk);
	sock_put(sk);
}

4.1 Handling a retransmission timeout: tcp_retransmit_timer()

The core logic of the retransmission timeout handling is:

  1. First check whether retransmission is still allowed, weighing the maximum-retry limits, system-wide resource pressure, the socket state, and so on;
  2. If retransmission is allowed, resend only the first packet in the send queue, then restart the timer according to the exponential backoff algorithm.

/*
 *	The TCP retransmit timer.
 */
static void tcp_retransmit_timer(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);

	//packets_out counts segments sent but not yet acknowledged by the peer; if there
	//are none, a timeout is meaningless, since nothing was sent. Defensive check
	if (!tp->packets_out)
		goto out;
	//Similarly defensive: the send queue should not be empty here
	BUG_TRAP(!tcp_write_queue_empty(sk));

	//Send window is zero, the socket is not closed, and we are not in the three-way
	//handshake (i.e. the connection is established). A timeout in this situation
	//requires checking whether the socket should be closed
	if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) &&
	    !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
		/* Receiver dastardly shrinks window. Our retransmits
		 * become zero probes, but we should not timeout this
		 * connection. If the socket is an orphan, time it out,
		 * we cannot allow such beasts to hang infinitely.
		 */
		//If no acknowledgment has been received from the peer for a long time (more
		//than TCP_RTO_MAX = 120s), consider the connection broken and close the socket
		if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) {
			tcp_write_err(sk);
			goto out;
		}
		//Otherwise the peer is presumably heavily congested; enter the LOSS state
		tcp_enter_loss(sk, 0);
		//Retransmit the first packet in the queue
		tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
		//Reset the cached route, since the route itself may be at fault
		__sk_dst_reset(sk);
		goto out_reset_timer;
	}

	//Retransmission cannot continue indefinitely; there has to be a cutoff.
	//This function decides whether to keep retransmitting
	if (tcp_write_timeout(sk))
		goto out;

	//If this is the first retransmission timeout
	if (icsk->icsk_retransmits == 0) {
		if (icsk->icsk_ca_state == TCP_CA_Disorder ||
		    icsk->icsk_ca_state == TCP_CA_Recovery) {
			if (tcp_is_sack(tp)) {
				if (icsk->icsk_ca_state == TCP_CA_Recovery)
					NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERYFAIL);
				else
					NET_INC_STATS_BH(LINUX_MIB_TCPSACKFAILURES);
			} else {
				if (icsk->icsk_ca_state == TCP_CA_Recovery)
					NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERYFAIL);
				else
					NET_INC_STATS_BH(LINUX_MIB_TCPRENOFAILURES);
			}
		} else if (icsk->icsk_ca_state == TCP_CA_Loss) {
			NET_INC_STATS_BH(LINUX_MIB_TCPLOSSFAILURES);
		} else {
			NET_INC_STATS_BH(LINUX_MIB_TCPTIMEOUTS);
		}
	}

	//Ignoring the F-RTO algorithm for now, take the else branch and switch the congestion state to LOSS
	if (tcp_use_frto(sk)) {
		tcp_enter_frto(sk);
	} else {
		tcp_enter_loss(sk, 0);
	}
	//Try to retransmit the first packet; failure indicates local congestion, in which case exponential backoff is not applied
	if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) {
		/* Retransmission failed because of local congestion,
		 * do not backoff.
		 */
		if (!icsk->icsk_retransmits)
			icsk->icsk_retransmits = 1;
		//Restart the retransmission timer with the smaller of the current RTO
		//and 500ms (the local-resource probe interval)
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL),
					  TCP_RTO_MAX);
		goto out;
	}

	/* Increase the timeout each time we retransmit.  Note that
	 * we do not increase the rtt estimate.  rto is initialized
	 * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
	 * that doubling rto each time is the least we can get away with.
	 * In KA9Q, Karn uses this for the first few times, and then
	 * goes to quadratic.  netBSD doubles, but only goes up to *64,
	 * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
	 * defined in the protocol as the maximum possible RTT.  I guess
	 * we'll have to use something other than TCP to talk to the
	 * University of Mars.
	 *
	 * PAWS allows us longer timeouts and large windows, so once
	 * implemented ftp to mars will work nicely. We will have to fix
	 * the 120 second clamps though!
	 */
	//Bump the exponential backoff count and the retransmission timeout count
	icsk->icsk_backoff++;
	icsk->icsk_retransmits++;

out_reset_timer:
	//Apply exponential backoff: record the doubled (and capped) timeout in icsk_rto
	icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
	if (icsk->icsk_retransmits > sysctl_tcp_retries1)
		__sk_dst_reset(sk);

out:;
}

4.1.1 Deciding when retransmission must stop: tcp_write_timeout()

/* A write timeout has occurred. Process the after effects. */
static int tcp_write_timeout(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	//retry_until caps how many retransmission timeouts may occur
	int retry_until;

	//Cutoff decision for SYN segments
	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
		if (icsk->icsk_retransmits)
			dst_negative_advice(&sk->sk_dst_cache);
		retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
	} else {
		//For non-SYN segments, once the retransmission count reaches the
		//sysctl_tcp_retries1 threshold, try MTU probing and rerouting
		if (icsk->icsk_retransmits >= sysctl_tcp_retries1) {
			/* Black hole detection */
			tcp_mtu_probing(icsk, sk);
			dst_negative_advice(&sk->sk_dst_cache);
		}
		//The maximum number of retransmissions allowed is set by sysctl_tcp_retries2
		retry_until = sysctl_tcp_retries2;
		//A socket with the DEAD flag set is called an "orphan socket". Check whether
		//such sockets exceed the system limits; if so, stop retransmitting
		if (sock_flag(sk, SOCK_DEAD)) {
			//If this socket's RTO has not yet reached the 120s maximum, consider it
			//still alive, i.e. it may yet terminate normally, so keep retransmitting
			//as long as possible
			const int alive = (icsk->icsk_rto < TCP_RTO_MAX);
			//Derive the socket's maximum retry count from the alive flag
			retry_until = tcp_orphan_retries(sk, alive);
			//If orphan sockets exceed the configured limit and system resources are
			//tight, this socket is closed; if it is not beyond saving, a RST is also
			//sent so that the peer learns of the close
			if (tcp_out_of_resources(sk, alive || icsk->icsk_retransmits < retry_until))
				return 1;
		}
	}
	//If the retransmission count has finally exceeded the allowed maximum, close the
	//socket; the application will see -1 with errno set to ETIMEDOUT
	if (icsk->icsk_retransmits >= retry_until) {
		/* Has it gone just too far? */
		tcp_write_err(sk);
		return 1;
	}
	return 0;
}

4.1.2 Maximum retries for orphan sockets: tcp_orphan_retries()

/* Calculate maximal number of retries on an orphaned socket. */
static int tcp_orphan_retries(struct sock *sk, int alive)
{
	//By default an orphan gets the system-wide maximum retry count
	int retries = sysctl_tcp_orphan_retries; /* May be zero. */

	//The socket has already recorded a definite error and its RTO has exceeded
	//TCP_RTO_MAX; retrying is pointless, so allow 0 retries
	if (sk->sk_err_soft && !alive)
		retries = 0;

	/* However, if socket sent something recently, select some safe
	 * number of retries. 8 corresponds to >100 seconds with minimal
	 * RTO of 200msec. */
	//The socket is worth keeping and sysctl_tcp_orphan_retries is unset (0): use 8 retries
	if (retries == 0 && alive)
		retries = 8;
	return retries;
}
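
As a sanity check on the kernel comment above: with the minimal RTO of 200ms, 8 retransmissions plus the final expiry add up to 0.2 * (2^9 - 1) = 102.2 seconds, which is indeed just over 100 seconds.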

4.1.3 Resource-exhaustion check for orphan sockets: tcp_out_of_resources()

To keep orphan sockets from accumulating and eating all system resources, TCP cleans them up promptly.

/* Do not allow orphaned sockets to eat all our resources.
 * This is direct violation of TCP specs, but it is required
 * to prevent DoS attacks. It is called when a retransmission timeout
 * or zero probe timeout occurs on orphaned socket.
 *
 * Criteria is still not confirmed experimentally and may change.
 * We kill the socket, if:
 * 1. If number of orphaned sockets exceeds an administratively configured
 *    limit.
 * 2. If we have strong memory pressure.
 */
static int tcp_out_of_resources(struct sock *sk, int do_reset)
{
	struct tcp_sock *tp = tcp_sk(sk);
	//The global tcp_orphan_count records how many orphan sockets the system currently has
	int orphans = atomic_read(&tcp_orphan_count);

	/* If peer does not open window for long time, or did not transmit
	 * anything for long time, penalize it. */
	//If the socket really has not transmitted anything for a long time, penalize it:
	//double orphans so that tcp_too_many_orphans() triggers more easily
	if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset)
		orphans <<= 1;

	//Same idea: penalize if a soft error has been recorded
	if (sk->sk_err_soft)
		orphans <<= 1;

	//Decide whether this socket must be closed, based on the system parameters
	//and the socket's memory usage
	if (tcp_too_many_orphans(sk, orphans)) {
		if (net_ratelimit())
			printk(KERN_INFO "Out of socket memory\n");

		/* Catch exceptional cases, when connection requires reset.
		 *      1. Last segment was sent recently. */
		if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
		    /*  2. Window is closed. */
		    (!tp->snd_wnd && !tp->packets_out))
			do_reset = 1;
		//Send a RST segment to the peer
		if (do_reset)
			tcp_send_active_reset(sk, GFP_ATOMIC);
		//Close this orphan socket
		tcp_done(sk);
		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
		return 1;
	}
	return 0;
}

static inline int tcp_too_many_orphans(struct sock *sk, int num)
{
	return (num > sysctl_tcp_max_orphans) ||
		(sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
		 atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2]);
}

5 Retransmitting a packet: tcp_retransmit_skb()

Both timeout retransmission and fast retransmission end up calling this function. Each call attempts to send one skb; if it is sent successfully (merely handed down to the lower layer), the function returns 0, otherwise an error code.

/* This retransmits one SKB.  Policy decisions and retransmit queue
 * state updates are done by the caller.  Returns non-zero if an
 * error occurred which prevented the send.
 */
int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	unsigned int cur_mss = tcp_current_mss(sk, 0);
	int err;

	/* Inconclusive MTU probe */
	if (icsk->icsk_mtup.probe_size) {
		icsk->icsk_mtup.probe_size = 0;
	}

	/* Do not send more than we queued. 1/4 is reserved for possible
	 * copying overhead: fragmentation, tunneling, mangling etc.
	 */
	//Retransmission needs to clone the skb header, which allocates memory, so check memory usage first
	if (atomic_read(&sk->sk_wmem_alloc) >
	    min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
		return -EAGAIN;
	//If the leading part of the skb has already been acknowledged, the skb must be trimmed
	if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
		//If the trailing part turns out to be acknowledged too yet we are asked to retransmit, that is a BUG
		if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
			BUG();
		//Drop the given number of bytes, all already acknowledged, from the head of the skb
		if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
			return -ENOMEM;
	}

	/* If receiver has shrunk his window, and skb is out of
	 * new window, do not retransmit it. The exception is the
	 * case, when window is shrunk to zero. In this case
	 * our retransmit serves as a zero window probe.
	 */
	//The send window does not allow this retransmission
	if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))
	    && TCP_SKB_CB(skb)->seq != tp->snd_una)
		return -EAGAIN;

	//If the skb is longer than the MSS, split it. Note that with TSO one skb
	//holds many segments; once such an skb must be retransmitted, only one
	//segment can be sent, so the skb has to be split
	if (skb->len > cur_mss) {
		if (tcp_fragment(sk, skb, cur_mss, cur_mss))
			return -ENOMEM; /* We'll try again later. */
	}

	/* Collapse two adjacent packets if worthwhile and we can. */
	//Well... for bug-compatibility with certain printers, and governed by the system
	//parameter sysctl_tcp_retrans_collapse, try to coalesce adjacent skbs
	if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
	    (skb->len < (cur_mss >> 1)) &&
	    (tcp_write_queue_next(sk, skb) != tcp_send_head(sk)) &&
	    (!tcp_skb_is_last(sk, skb)) &&
	    (skb_shinfo(skb)->nr_frags == 0 &&
	     skb_shinfo(tcp_write_queue_next(sk, skb))->nr_frags == 0) &&
	    (tcp_skb_pcount(skb) == 1 &&
	     tcp_skb_pcount(tcp_write_queue_next(sk, skb)) == 1) &&
	    (sysctl_tcp_retrans_collapse != 0))
		tcp_retrans_try_collapse(sk, skb, cur_mss);
	//Redo the route lookup; see inet_sk_rebuild_header() for details
	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
		return -EHOSTUNREACH; /* Routing failure or similar. */

	/* Some Solaris stacks overoptimize and ignore the FIN on a
	 * retransmit when old data is attached.  So strip it off
	 * since it is cheap to do so and saves bytes on the network.
	 */
	if (skb->len > 0 && (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
	    tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
		if (!pskb_trim(skb, 0)) {
			/* Reuse, even though it does some unnecessary work */
			tcp_init_nondata_skb(skb, TCP_SKB_CB(skb)->end_seq - 1, TCP_SKB_CB(skb)->flags);
			skb->ip_summed = CHECKSUM_NONE;
		}
	}

	/* Make a copy, if the first transmission SKB clone we made
	 * is still in somebody's hands, else make a clone.
	 */
	//Record the skb's send time; note that when is refreshed on each retransmission
	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	//Send the skb; the third argument asks for the skb header to be cloned
	err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
	//On success, update the retransmission statistics
	if (err == 0) {
		/* Update global TCP statistics. */
		TCP_INC_STATS(TCP_MIB_RETRANSSEGS);
		//Bump the total retransmission count
		tp->total_retrans++;
		//If no packet in [snd_una, snd_nxt) had been retransmitted yet, this is the
		//first one: record snd_nxt at retransmission time in lost_retrans_low, which
		//congestion control uses when leaving the Recovery and LOSS states
		if (!tp->retrans_out)
			tp->lost_retrans_low = tp->snd_nxt;
		//Mark the skb's scoreboard as retransmitted
		TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
		//Add the number of segments retransmitted
		tp->retrans_out += tcp_skb_pcount(skb);

		/* Save stamp of the first retransmit. */
		//retrans_stamp records the time of the first retransmission. It is reset to 0
		//when an ACK acknowledges new data or when retrans_out drops back to 0
		if (!tp->retrans_stamp)
			tp->retrans_stamp = TCP_SKB_CB(skb)->when;

		tp->undo_retrans++;
		/* snd_nxt is stored to detect loss of retransmitted segment,
		 * see tcp_input.c tcp_sacktag_write_queue().
		 */
		//For a retransmitted segment, ack_seq no longer carries its usual meaning;
		//it records snd_nxt at the time of retransmission
		TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
	}
	return err;
}

6 System parameters

6.1 sysctl_tcp_retries1(INTEGER)

Corresponds to /proc/sys/net/ipv4/tcp_retries1. The kernel documentation explains it as: This value influences the time, after which TCP decides, that something is wrong due to unacknowledged RTO retransmissions, and reports this suspicion to the network layer. See tcp_retries2 for more details.

RFC 1122 recommends at least 3 retransmissions, which is the default.

In other words, for a connection in the established state this parameter decides after how many consecutive RTOs TCP reports the suspected problem to the network layer. As the code above shows, the Linux response is to perform PMTU probing and redo the route selection.

6.2 sysctl_tcp_retries2(INTEGER)

Corresponds to /proc/sys/net/ipv4/tcp_retries2. The kernel documentation explains it as: This value influences the timeout of an alive TCP connection, when RTO retransmissions remain unacknowledged. Given a value of N, a hypothetical TCP connection following exponential backoff with an initial RTO of TCP_RTO_MIN would retransmit N times before killing the connection at the (N+1)th RTO.

The default value of 15 yields a hypothetical timeout of 924.6 seconds and is a lower bound for the effective timeout. TCP will effectively time out at the first RTO which exceeds the hypothetical timeout.

RFC 1122 recommends at least 100 seconds for the timeout, which corresponds to a value of at least 8.

In short, once the RTO has fired the specified number of times, TCP stops retransmitting and abandons the connection.
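
The 924.6s figure quoted above can be reproduced with a small standalone calculator. This is just a sketch, assuming TCP_RTO_MIN = 200ms and TCP_RTO_MAX = 120s, with the connection dying when the (N+1)th RTO expires:

#include <stdio.h>

int main(void)
{
	double rto = 0.2;             /* TCP_RTO_MIN: 200 ms */
	const double rto_max = 120.0; /* TCP_RTO_MAX: 120 s */
	double total = 0.0;
	int n = 15;                   /* tcp_retries2 default */
	int i;

	/* N retransmissions fire after RTO_1..RTO_N; the connection is
	 * killed when the (N+1)th RTO expires, so sum N+1 intervals. */
	for (i = 0; i <= n; i++) {
		total += rto;
		rto = (2 * rto < rto_max) ? 2 * rto : rto_max;
	}
	printf("hypothetical timeout: %.1f s\n", total); /* prints 924.6 */
	return 0;
}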

6.3 sysctl_tcp_orphan_retries(INTEGER)

Corresponds to /proc/sys/net/ipv4/tcp_orphan_retries. The kernel documentation explains it as: This value influences the timeout of a locally closed TCP connection, when RTO retransmissions remain unacknowledged. See tcp_retries2 for more details.

The default value is 8. If your machine is a loaded WEB server, you should think about lowering this value, such sockets may consume significant resources. Cf. tcp_max_orphans.

In other words, it sets how many retransmission timeouts a locally closed socket (one with the DEAD flag in the code above) may still go through; such sockets consume system resources.

6.4 sysctl_tcp_max_orphans(INTEGER)

Corresponds to /proc/sys/net/ipv4/tcp_max_orphans. The kernel documentation explains it as: Maximal number of TCP sockets not attached to any user file handle, held by system. If this number is exceeded orphaned connections are reset immediately and warning is printed. This limit exists only to prevent simple DoS attacks, you must not rely on this or lower the limit artificially, but rather increase it(probably, after increasing installed memory), if network conditions require more than default value, and tune network services to linger and kill such states more aggressively. Let me to remind again: each orphan eats up to ~64K of unswappable memory.

In other words, orphan sockets consume system memory and therefore must not accumulate; this value simply caps their number.

6.5 sysctl_tcp_retrans_collapse(BOOLEAN)

Corresponds to /proc/sys/net/ipv4/tcp_retrans_collapse. The kernel documentation describes it as: Bug-to-bug compatibility with some broken printers. On retransmit try to send bigger packets to work around bugs in certain TCP stacks.
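
All of these tunables live under /proc/sys/net/ipv4 and can be inspected (or, with root privileges, written) through procfs. A minimal userspace sketch that just reads the current values, assuming the standard paths:

#include <stdio.h>

/* Read a single integer sysctl value from procfs; return -1 on error. */
static long read_sysctl(const char *path)
{
	FILE *f = fopen(path, "r");
	long val = -1;

	if (f) {
		if (fscanf(f, "%ld", &val) != 1)
			val = -1;
		fclose(f);
	}
	return val;
}

int main(void)
{
	printf("tcp_retries1       = %ld\n",
	       read_sysctl("/proc/sys/net/ipv4/tcp_retries1"));
	printf("tcp_retries2       = %ld\n",
	       read_sysctl("/proc/sys/net/ipv4/tcp_retries2"));
	printf("tcp_orphan_retries = %ld\n",
	       read_sysctl("/proc/sys/net/ipv4/tcp_orphan_retries"));
	printf("tcp_max_orphans    = %ld\n",
	       read_sysctl("/proc/sys/net/ipv4/tcp_max_orphans"));
	return 0;
}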
