Table of Contents
2 SYN+ACK报文发送 tcp_v4_send_synack()
2.1 SYN+ACK报文构造 tcp_make_synack()
2.1.1 SYN+ACK报文内存分配 sock_wmalloc
3 SYN+ACK报文超时处理 tcp_synack_timer
3.1 inet_csk_reqsk_queue_prune()
1 SYN+ACK报文发送场景概述
当 tcp 服务器端收到SYN包后,将会调用 tcp_v4_send_synack() 向客户端发送SYN+ACK报文,同时启动 SYN+ACK 超时重传机制最终也会调用 tcp_v4_send_synack() 接口推送 SYN+ACK 报文,堆栈信息如下:
tcp_v4_conn_request //收到SYN立刻回复SYN+ACK
--tcp_v4_send_synack
tcp_synack_timer //超时重传
--inet_csk_reqsk_queue_prune
--tcp_v4_rtx_synack //req->rsk_ops->rtx_syn_ack(parent, req, NULL)
--tcp_v4_send_synack
2 SYN+ACK报文发送 tcp_v4_send_synack()
- 获取发送路由信息 //首次收到SYN 已经在 tcp_v4_conn_request 获取过了
- 构造 SYN+ACK 报文
- 计算 checksum 校验和
- 通过 ip 层将 tcp 报文发送出去
/*
 * Send a SYN-ACK after having received a SYN.
 * This still operates on a request_sock only, not on a big
 * socket.
 *
 * Called both for the initial SYN+ACK (from tcp_v4_conn_request) and for
 * retransmissions (via req->rsk_ops->rtx_syn_ack from the SYN-ACK timer).
 * Returns 0 on success, a negative value on failure.
 */
static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
			      struct dst_entry *dst)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	int err = -1;
	struct sk_buff * skb;

	/* Look up a route unless the caller supplied one (on the first
	 * SYN, tcp_v4_conn_request has already done the lookup). */
	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
		goto out;

	/* Build the SYN+ACK segment from the listening socket, the
	 * connection request block and the route. */
	skb = tcp_make_synack(sk, dst, req);

	if (skb) {
		struct tcphdr *th = tcp_hdr(skb);

		/* Compute the TCP checksum over the whole segment. */
		th->check = tcp_v4_check(skb->len,
					 ireq->loc_addr,
					 ireq->rmt_addr,
					 csum_partial((char *)th, skb->len,
						      skb->csum));

		/* Build the IP packet and hand it straight to the IP
		 * layer; note the skb is NOT queued on the TCP write
		 * queue. */
		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
					    ireq->rmt_addr,
					    ireq->opt);
		err = net_xmit_eval(err);
	}

out:
	dst_release(dst);
	return err;
}
从上面的代码可以看出,TCP构造出SYN+ACK报文后,会直接发送给IP层,并且不会将该数据包加入TCP的发送队列。
2.1 SYN+ACK报文构造 tcp_make_synack()
/*
 * Build a SYN+ACK skb for the given connection request.  Returns the
 * skb (charged to the listening socket) or NULL on allocation failure.
 */
struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
				struct request_sock *req)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcphdr *th;
	int tcp_header_size;
	struct sk_buff *skb;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *md5;
	__u8 *md5_hash_location;
#endif

	/* Allocate the packet memory; force=1 means allocate even if this
	 * would exceed the socket's write-memory (send buffer) limit. */
	skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
	if (skb == NULL)
		return NULL;

	/* Reserve headroom for all protocol headers. */
	skb_reserve(skb, MAX_TCP_HEADER);

	skb->dst = dst_clone(dst);

	/* Size the TCP header from the options to be included; note the
	 * MSS option is always present. */
	tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
			   (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
			   (ireq->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
			   /* SACK_PERM is in the place of NOP NOP of TS */
			   ((ireq->sack_ok && !ireq->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));

#ifdef CONFIG_TCP_MD5SIG
	/* Are we doing MD5 on this segment? If so - make room for it */
	md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req);
	if (md5)
		tcp_header_size += TCPOLEN_MD5SIG_ALIGNED;
#endif
	skb_push(skb, tcp_header_size);
	skb_reset_transport_header(skb);

	/* Fill in the fixed part of the TCP header. */
	th = tcp_hdr(skb);
	memset(th, 0, sizeof(struct tcphdr));
	/* Set the SYN and ACK flag bits. */
	th->syn = 1;
	th->ack = 1;
	TCP_ECN_make_synack(req, th);
	/* Source and destination ports. */
	th->source = inet_sk(sk)->sport;
	th->dest = ireq->rmt_port;
	/* Setting of flags are superfluous here for callers (and ECE is
	 * not even correctly set)
	 */
	tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn,
			     TCPCB_FLAG_SYN | TCPCB_FLAG_ACK);
	/* Server-side initial sequence number (chosen when the client's
	 * SYN was processed). */
	th->seq = htonl(TCP_SKB_CB(skb)->seq);
	/* Acknowledge the client's initial sequence number + 1. */
	th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1);
	/* Choose the advertised receive window (depends on the local
	 * receive buffer size and the window-scale option). */
	if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
		__u8 rcv_wscale;
		/* Set this up on the first call only */
		req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);

		/* tcp_full_space because it is guaranteed to be the first packet */
		tcp_select_initial_window(tcp_full_space(sk),
			dst_metric(dst, RTAX_ADVMSS) - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
			&req->rcv_wnd,
			&req->window_clamp,
			ireq->wscale_ok,
			&rcv_wscale);
		ireq->rcv_wscale = rcv_wscale;
	}

	/* Advertise the receive window in the TCP header. */
	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
	th->window = htons(min(req->rcv_wnd, 65535U));

	/* Record the transmit timestamp in the skb control block. */
	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	/* Write the TCP option bytes that follow the fixed header. */
	tcp_syn_build_options((__be32 *)(th + 1), dst_metric(dst, RTAX_ADVMSS), ireq->tstamp_ok,
			      ireq->sack_ok, ireq->wscale_ok, ireq->rcv_wscale,
			      TCP_SKB_CB(skb)->when,
			      req->ts_recent,
			      (
#ifdef CONFIG_TCP_MD5SIG
			       md5 ? &md5_hash_location :
#endif
			       NULL)
			      );

	th->doff = (tcp_header_size >> 2);
	TCP_INC_STATS(TCP_MIB_OUTSEGS);

#ifdef CONFIG_TCP_MD5SIG
	/* Okay, we have all we need - do the md5 hash if needed */
	if (md5) {
		tp->af_specific->calc_md5_hash(md5_hash_location,
					       md5,
					       NULL, dst, req,
					       tcp_hdr(skb), sk->sk_protocol,
					       skb->len);
	}
#endif
	return skb;
}
2.1.1 SYN+ACK报文内存分配 sock_wmalloc
/*
 * Allocate an skb charged against the socket's send buffer.
 *
 * @force: non-zero means allocate unconditionally, ignoring the
 *         send-buffer quota check against sk->sk_sndbuf.
 * Returns the skb with sk set as its write-side owner, or NULL when
 * the quota is exhausted (and !force) or allocation fails.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority)
{
	struct sk_buff *skb;

	/* Refuse when not forced and the write-memory quota is used up. */
	if (!force && atomic_read(&sk->sk_wmem_alloc) >= sk->sk_sndbuf)
		return NULL;

	skb = alloc_skb(size, priority);
	if (skb == NULL)
		return NULL;

	/* Charge the skb to sk's write-memory accounting. */
	skb_set_owner_w(skb, sk);
	return skb;
}
3 SYN+ACK报文超时处理 tcp_synack_timer
SYN+ACK报文的超时处理函数为tcp_synack_timer(),下面看其实现:
#define TCP_SYNQ_INTERVAL (HZ/5) /* Period of SYNACK timer */
#define TCP_TIMEOUT_INIT ((unsigned)(3*HZ)) /* RFC 1122 initial RTO value */
#define TCP_RTO_MAX ((unsigned)(120*HZ))

/*
 * SYN-ACK retransmission timer handler: prune the listening socket's
 * SYN queue every TCP_SYNQ_INTERVAL (HZ/5, i.e. 200ms), retransmitting
 * with an initial 3s timeout backed off up to a 120s cap.
 */
static void tcp_synack_timer(struct sock *sk)
{
	inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL,
				   TCP_TIMEOUT_INIT, TCP_RTO_MAX);
}
3.1 inet_csk_reqsk_queue_prune()
- 获取半连接队列
- 根据最大重传次数,判断超时,依次重传半连接请求的SYN+ACK
- 重传次数完毕删除半连接请求
- 重设定时器,+200ms之后继续调用
/*
 * Scan (part of) the listening socket's SYN queue: retransmit SYN+ACK
 * for requests whose timer has expired, drop requests that exhausted
 * their retry budget, and re-arm the timer while requests remain.
 */
void inet_csk_reqsk_queue_prune(struct sock *parent,
				const unsigned long interval,
				const unsigned long timeout,
				const unsigned long max_rto)
{
	/* Connection-oriented state of the listening socket. */
	struct inet_connection_sock *icsk = inet_csk(parent);
	/* Accept queue (established requests not yet accept()ed). */
	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
	/* SYN (half-open request) queue. */
	struct listen_sock *lopt = queue->listen_opt;
	/* Configured maximum SYN+ACK retransmissions
	 * (/proc/sys/net/ipv4/tcp_synack_retries). */
	int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
	/* Effective retry threshold; may be lowered below under load. */
	int thresh = max_retries;
	unsigned long now = jiffies;
	struct request_sock **reqp, *req;
	int i, budget;

	/* Nothing to do when the SYN queue is empty. */
	if (lopt == NULL || lopt->qlen == 0)
		return;

	/* To keep the listener responsive under SYN-queue pressure,
	 * requests that already needed a SYN+ACK retransmission should be
	 * reaped promptly: a retransmitted SYN+ACK is the least likely to
	 * ever be ACKed (lossy link, lost ACK, excessive RTO).  So when
	 * the queue is more than half full, shrink the retry threshold
	 * instead of using a fixed value; the exact tuning below is a
	 * heuristic and need not be studied in detail. */
	if (lopt->qlen>>(lopt->max_qlen_log-1)) {
		int young = (lopt->qlen_young<<1);
		while (thresh > 2) {
			if (lopt->qlen < young)
				break;
			thresh--;
			young <<= 1;
		}
	}

	/* TCP_DEFER_ACCEPT overrides the retry limit for ACKed requests. */
	if (queue->rskq_defer_accept)
		max_retries = queue->rskq_defer_accept;

	/* budget is the number of hash buckets to visit this run; the
	 * table is scanned incrementally across timer ticks rather than
	 * in full, presumably to bound the time spent per tick. */
	budget = 2 * (lopt->nr_table_entries / (timeout / interval));
	/* clock_hand is the bucket index where the last scan stopped. */
	i = lopt->clock_hand;

	do {
		/* Head of the collision chain of bucket i. */
		reqp=&lopt->syn_table[i];
		/* Walk the collision chain. */
		while ((req = *reqp) != NULL) {
			/* Has this request's retransmission timer expired? */
			if (time_after_eq(now, req->expires)) {
				/* Retransmit the SYN+ACK via rtx_syn_ack() if either:
				 * cond1: the retransmission count is still below thresh;
				 * cond2: the request was already ACKed but creating the
				 *        child socket failed (e.g. resource pressure) --
				 *        such connections are likely to succeed soon, so
				 *        they get the full system max_retries budget
				 *        (acked is set when the final ACK is received). */
				if ((req->retrans < thresh ||
				     (inet_rsk(req)->acked && req->retrans < max_retries))
				    && !req->rsk_ops->rtx_syn_ack(parent, req, NULL)) {
					unsigned long timeo;

					/* First retransmission: the request no
					 * longer counts as "young". */
					if (req->retrans++ == 0)
						lopt->qlen_young--;
					/* Exponential backoff: 3*2^retrans, i.e.
					 * 3s, 6s, 12s..., capped at max_rto (120s). */
					timeo = min((timeout << req->retrans), max_rto);
					req->expires = now + timeo;
					reqp = &req->dl_next;
					continue;
				}

				/* Retry budget exhausted: drop the request
				 * from the SYN queue. */
				inet_csk_reqsk_queue_unlink(parent, req, reqp);
				reqsk_queue_removed(queue, req);
				reqsk_free(req);
				continue;
			}
			reqp = &req->dl_next;
		}

		/* Advance to the next bucket; the AND with the power-of-two
		 * mask makes the index wrap around automatically. */
		i = (i + 1) & (lopt->nr_table_entries - 1);

	} while (--budget > 0);

	/* Remember where to resume on the next tick. */
	lopt->clock_hand = i;

	/* Finally, while the SYN queue is non-empty, re-arm the timer to
	 * fire again after `interval` (200ms). */
	if (lopt->qlen)
		inet_csk_reset_keepalive_timer(parent, interval);
}
从《linux内核 TCP服务器端接收SYN请求段Ⅰ》可以知道,回调函数rtx_syn_ack()就是tcp_v4_send_synack(),即SYN+ACK的重传和初传使用的是同一个函数。