Table of Contents
1 Overview of TCP server SYN reception
2 TCP server SYN processing flow
2.1 TCP-layer packet input interface: tcp_v4_rcv
2.2 tcp_v4_do_rcv()
2.3 tcp_rcv_state_process()
2.4 The SYN request handler tcp_v4_conn_request() (core)
2.4.1 Is the SYN request (half-open) queue full: inet_csk_reqsk_queue_is_full
2.4.2 Is the accept (fully established) queue full: sk_acceptq_is_full
2.5 Allocating and initializing the connection request block
2.5.1 Allocating the connection request block: reqsk_alloc / inet_reqsk_alloc
2.5.2 Initializing the connection request block
2.6 Adding the request block to the SYN request queue: inet_csk_reqsk_queue_hash_add
1 Overview of TCP server SYN reception
- Look up the local socket in tcp_hashinfo using the 5-tuple of the incoming segment
- Check whether the local socket's request queues are full: both the SYN (half-open) queue and the accept (fully established) queue
- Send a SYN+ACK segment to the client
- Add the newly created request sock to the SYN queue and start the SYN+ACK retransmission timer (initial value 3s)
Note: on receiving the SYN, the Linux kernel stack does not move any socket to SYN_RECV. A new sock is created only after the client's ACK arrives; that sock is set to TCP_SYN_RECV, and tcp_rcv_state_process then moves it to TCP_ESTABLISHED.
2 TCP server SYN processing flow
tcp_v4_rcv
--tcp_v4_do_rcv
----tcp_rcv_state_process
------tcp_v4_conn_request
--------inet_csk_reqsk_queue_is_full
--------sk_acceptq_is_full
--------reqsk_alloc
--------tcp_v4_send_synack
--------inet_csk_reqsk_queue_hash_add
2.1 TCP-layer packet input interface: tcp_v4_rcv
- Validate the incoming TCP segment
- Look up the local socket in tcp_hashinfo using the 5-tuple
- Call tcp_v4_do_rcv() to process the packet
int tcp_v4_rcv(struct sk_buff *skb)
{
struct tcphdr *th;
struct iphdr *iph;
struct sock *sk;
int ret;
//Get the TCP header pointer
th = tcp_hdr(skb);
//Get the IP header pointer
iph = ip_hdr(skb);
//Look up in the TCP hash tables which socket should handle this segment
//(based on the tcp/ip header fields of the incoming segment).
//For an incoming SYN request, the socket found here is the listening socket.
sk = __inet_lookup(skb->dev->nd_net, &tcp_hashinfo, iph->saddr,
th->source, iph->daddr, th->dest, inet_iif(skb));
if (!sk)
goto no_tcp_socket;
process:
bh_lock_sock_nested(sk);
ret = 0;
//The branches below involve the three queues TCP uses on receive for performance reasons;
//not our focus here, go straight to tcp_v4_do_rcv()
if (!sock_owned_by_user(sk)) {
if (!tcp_prequeue(sk, skb))
//Call tcp_v4_do_rcv() to process the packet
ret = tcp_v4_do_rcv(sk, skb);
} else
sk_add_backlog(sk, skb);
bh_unlock_sock(sk);
sock_put(sk);
return ret;
}
2.2 tcp_v4_do_rcv()
- Call tcp_v4_hnd_req to look for a request socket; for the first SYN nothing is found and sk itself is returned (a trimmed sketch of tcp_v4_hnd_req follows the code below)
- Call tcp_rcv_state_process to handle the SYN segment
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
struct sock *rsk;
if (sk->sk_state == TCP_LISTEN) {
//Return NULL: error
//nsk == sk: no new TCB was found, so this is a first-handshake SYN (the case covered in this note)
//nsk != sk: a new TCB was found, so this is a third-handshake ACK
struct sock *nsk = tcp_v4_hnd_req(sk, skb);
if (!nsk)
goto discard;
//The ACK is handled by tcp_child_process
if (nsk != sk) {
if (tcp_child_process(sk, nsk, skb)) {
rsk = nsk;
goto reset;
}
return 0;
}
}
if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
//A non-zero return means an unexpected packet was received; a RST is sent to the peer
rsk = sk;
goto reset;
}
return 0;
}
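For context, tcp_v4_hnd_req() searches the listening socket's SYN request queue for a request block matching the incoming segment; for the very first SYN nothing matches yet and the listening socket itself is returned. The following is only a trimmed sketch, not the full function (error paths and the TIME_WAIT lookup are omitted, and details may differ slightly between kernel versions):
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
struct tcphdr *th = tcp_hdr(skb);
const struct iphdr *iph = ip_hdr(skb);
struct request_sock **prev;
//Look for a pending connection request matching this segment's 4-tuple
struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
iph->saddr, iph->daddr);
//Third-handshake ACK: validate it and create the child socket
if (req)
return tcp_check_req(sk, skb, req, prev);
//(lookup in the established/TIME_WAIT hash omitted here)
//First SYN: nothing matches yet, return the listening socket itself
return sk;
}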
2.3 tcp_rcv_state_process()
- Call tcp_v4_conn_request to handle the SYN connection request
/*
sk: the TCP socket that received this segment
skb: the input segment
th: pointer to the segment's TCP header
len: length of the segment
*/
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
struct tcphdr *th, unsigned len)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
int queued = 0;
switch (sk->sk_state) {
case TCP_LISTEN:
//In LISTEN state this function only handles SYN segments; if the ACK bit is set, the packet is unexpected
//and returning 1 causes a RST to be sent to the peer
if (th->ack)
return 1;
//A RST segment is simply ignored
if (th->rst)
goto discard;
if (th->syn) {
//A SYN was received; hand it to tcp_v4_conn_request().
//The conn_request pointer is set when the transport control block is initialized, see tcp_v4_init_sock
if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
return 1;
/* Now we have several options: In theory there is
* nothing else in the frame. KA9Q has an option to
* send data with the syn, BSD accepts data with the
* syn up to the [to be] advertised window and
* Solaris 2.1 gives you a protocol error. For now
* we just ignore it, that fits the spec precisely
* and avoids incompatibilities. It would be nice in
* future to drop through and process the data.
*
* Now that TTCP is starting to be used we ought to
* queue this data.
* But, this leaves one open to an easy denial of
* service attack, and SYN cookies can't defend
* against this problem. So, we drop the data
* in the interest of security over speed unless
* it's still in use.
*/
//The comment above discusses whether the first SYN may carry data; the current implementation does not accept such data
kfree_skb(skb);
return 0;
}
goto discard;
}
}
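For reference, the conn_request callback used above is wired up when tcp_v4_init_sock() sets icsk->icsk_af_ops = &ipv4_specific. A trimmed excerpt of that ops table (most callbacks omitted; exact members vary by kernel version):
struct inet_connection_sock_af_ops ipv4_specific = {
.queue_xmit = ip_queue_xmit,
.send_check = tcp_v4_send_check,
//Called from tcp_rcv_state_process() when a SYN arrives on a LISTEN socket
.conn_request = tcp_v4_conn_request,
.syn_recv_sock = tcp_v4_syn_recv_sock,
...
};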
2.4 The SYN request handler tcp_v4_conn_request() (core)
The main job of this function is to create a connection request block, i.e. a struct tcp_request_sock, and add it to the listening socket's SYN request queue (the half-open queue, listen_sock.syn_table). Its core steps are:
- Check whether the SYN request queue and the accept queue can still take this SYN; if not, drop the SYN segment (SYN cookies aside). No RST is sent back, so if the client retries and the server-side queues have freed up, the request can still be served;
- Allocate a struct tcp_request_sock connection request block;
- Parse the TCP options carried in the SYN segment (not analyzed here);
- Initialize the newly allocated request block from the received options;
- Generate the sequence number to be carried in the SYN+ACK, i.e. the server-side initial sequence number;
- Send the SYN+ACK segment to the client (see "TCP: server sending SYN+ACK");
- Add the request block to the listening socket's SYN request queue and start the SYN+ACK timeout timer.
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
struct inet_request_sock *ireq;
struct tcp_options_received tmp_opt;
struct request_sock *req;
//Record the source and destination addresses of the SYN segment
__be32 saddr = ip_hdr(skb)->saddr;
__be32 daddr = ip_hdr(skb)->daddr;
__u32 isn = TCP_SKB_CB(skb)->when;
struct dst_entry *dst = NULL;
//SYN cookie related, ignored here
#ifdef CONFIG_SYN_COOKIES
int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif
//Drop SYN segments sent to broadcast or multicast addresses; TCP does not support them
if (((struct rtable *)skb->dst)->rt_flags &
(RTCF_BROADCAST | RTCF_MULTICAST))
goto drop;
//If the SYN request queue is full, drop the request (SYN cookies aside); the client will retransmit the SYN
/* TW buckets are converted to open requests without
* limitations, they conserve resources and peer is
* evidently real one.
*/
//isn is non-zero only when the SYN hit a live TIME_WAIT bucket (see the kernel comment further down);
//such requests bypass the queue-full check
if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
if (sysctl_tcp_syncookies) {
want_cookie = 1;
} else
#endif
goto drop;
}
//If the accept queue is full and the SYN request queue still holds more than one "young" request
//(one whose SYN+ACK has not yet been retransmitted), drop this new SYN.
//My understanding of the rationale: those young requests are likely to complete the three-way handshake soon
//and will then need a slot in the already-full accept queue; accepting a new SYN now would risk failing
//connections that have already finished the handshake because they cannot be added to the accept queue
/* Accept backlog is full. If we have already queued enough
* of warm entries in syn queue, drop request. It is better than
* clogging syn queue with openreqs with exponentially increasing
* timeout.
*/
if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
goto drop;
//Allocate a struct tcp_request_sock object and set its rsk_ops to tcp_request_sock_ops;
//the functions in that ops table are called later during connection establishment
req = reqsk_alloc(&tcp_request_sock_ops);
if (!req)
goto drop;
#ifdef CONFIG_TCP_MD5SIG
tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
#endif
//Parse the TCP options carried in the SYN; option handling is not covered here
tcp_clear_options(&tmp_opt);
tmp_opt.mss_clamp = 536;
tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;
tcp_parse_options(skb, &tmp_opt, 0);
//SYN cookie related, ignored
if (want_cookie) {
tcp_clear_options(&tmp_opt);
tmp_opt.saw_tstamp = 0;
}
//Timestamp option handling
if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
/* Some OSes (unknown ones, but I see them on web server, which
* contains information interesting only for windows'
* users) do not send their stamp in SYN. It is easy case.
* We simply do not advertise TS support.
*/
tmp_opt.saw_tstamp = 0;
tmp_opt.tstamp_ok = 0;
}
tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
//Initialize the connection request block from the fields and options of the SYN segment
tcp_openreq_init(req, &tmp_opt, skb);
if (security_inet_conn_request(sk, skb, req))
goto drop_and_free;
//Record the local and remote addresses; saddr/daddr are the source/destination IPs of the incoming skb, hence the swapped assignment
ireq = inet_rsk(req);
ireq->loc_addr = daddr;
ireq->rmt_addr = saddr;
//Save the IP options of the SYN segment into the request block
ireq->opt = tcp_v4_save_options(sk, skb);
if (!want_cookie)
TCP_ECN_create_request(req, tcp_hdr(skb));
//Generate the server-side initial sequence number, depending on the case
if (want_cookie) {
#ifdef CONFIG_SYN_COOKIES
syn_flood_warning(skb);
#endif
isn = cookie_v4_init_sequence(sk, skb, &req->mss);
} else if (!isn) {
struct inet_peer *peer = NULL;
/* VJ's idea. We save last timestamp seen
* from the destination in peer table, when entering
* state TIME-WAIT, and check against it before
* accepting new connection request.
*
* If "isn" is not zero, this request hit alive
* timewait bucket, so that all the necessary checks
* are made in the function processing timewait state.
*/
if (tmp_opt.saw_tstamp &&
tcp_death_row.sysctl_tw_recycle &&
(dst = inet_csk_route_req(sk, req)) != NULL &&
(peer = rt_get_peer((struct rtable *)dst)) != NULL &&
peer->v4daddr == saddr) {
if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
(s32)(peer->tcp_ts - req->ts_recent) >
TCP_PAWS_WINDOW) {
NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
dst_release(dst);
goto drop_and_free;
}
}
/* Kill the following clause, if you dislike this way. */
else if (!sysctl_tcp_syncookies &&
(sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
(sysctl_max_syn_backlog >> 2)) &&
(!peer || !peer->tcp_ts_stamp) &&
(!dst || !dst_metric(dst, RTAX_RTT))) {
/* Without syncookies last quarter of
* backlog is filled with destinations,
* proven to be alive.
* It means that we continue to communicate
* to destinations, already remembered
* to the moment of synflood.
*/
LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
"request from %u.%u.%u.%u/%u\n",
NIPQUAD(saddr),
ntohs(tcp_hdr(skb)->source));
dst_release(dst);
goto drop_and_free;
}
isn = tcp_v4_init_sequence(skb);
}
//Record the chosen initial sequence number in the request block
tcp_rsk(req)->snt_isn = isn;
//Send the SYN+ACK segment
if (tcp_v4_send_synack(sk, req, dst))
goto drop_and_free;
if (want_cookie) {
reqsk_free(req);
} else {
//Add the request block to the SYN request queue and start the SYN+ACK retransmission timer (initial value 3s)
inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
}
return 0;
drop_and_free:
reqsk_free(req);
drop:
return 0;
}
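The 3-second initial timeout passed as TCP_TIMEOUT_INIT above comes from a constant in include/net/tcp.h; in kernels of this era it is defined as follows (later kernels lowered it to 1 second per RFC 6298):
//RFC 1122 initial RTO value: 3 seconds
#define TCP_TIMEOUT_INIT ((unsigned)(3*HZ))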
The next two subsections look at the checks that decide whether the accept queue and the SYN request queue are full.
2.4.1 Is the SYN request (half-open) queue full: inet_csk_reqsk_queue_is_full
static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk)
{
return reqsk_queue_is_full(&inet_csk(sk)->icsk_accept_queue);
}
static inline int reqsk_queue_is_full(const struct request_sock_queue *queue)
{
//If the number of queued SYN requests (qlen) has reached the maximum queue length (2^max_qlen_log,
//derived from nr_table_entries), the SYN request queue is considered full. A right shift is used here
//instead of a comparison: qlen >> max_qlen_log is non-zero exactly when qlen >= 2^max_qlen_log
return queue->listen_opt->qlen >> queue->listen_opt->max_qlen_log;
}
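A small standalone demo of the shift trick. The limit below (max_qlen_log = 8, i.e. 256 entries) is only an example; in the kernel, max_qlen_log is computed in reqsk_queue_alloc() from the listen() backlog, roughly capped by the net.ipv4.tcp_max_syn_backlog sysctl and rounded up to a power of two:
#include <stdio.h>

int main(void)
{
unsigned int max_qlen_log = 8; /* example: queue limit = 1 << 8 = 256 */
unsigned int qlen;

for (qlen = 254; qlen <= 257; qlen++)
/* qlen >> max_qlen_log is non-zero exactly when qlen >= 256 */
printf("qlen=%u full=%u\n", qlen, qlen >> max_qlen_log);
return 0;
}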
2.4.2 Is the accept (fully established) queue full: sk_acceptq_is_full
static inline int sk_acceptq_is_full(struct sock *sk)
{
//Directly compare the number of sockets that have completed the three-way handshake with the allowed maximum.
//This is where the backlog argument of listen() (which is stored in sk_max_ack_backlog) takes effect
return sk->sk_ack_backlog > sk->sk_max_ack_backlog;
}
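A minimal user-space sketch for reference (port 8080 is arbitrary): the backlog value passed to listen() is what ends up in sk_max_ack_backlog (after being capped by the net.core.somaxconn sysctl), and that is the limit sk_acceptq_is_full() checks:
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
struct sockaddr_in addr;
int fd = socket(AF_INET, SOCK_STREAM, 0);

memset(&addr, 0, sizeof(addr));
addr.sin_family = AF_INET;
addr.sin_port = htons(8080);
addr.sin_addr.s_addr = htonl(INADDR_ANY);
bind(fd, (struct sockaddr *)&addr, sizeof(addr));

//backlog = 128 becomes sk_max_ack_backlog, the accept-queue limit
listen(fd, 128);

//accept() would drain the accept queue here
close(fd);
return 0;
}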
2.5 Allocating and initializing the connection request block
2.5.1 Allocating the connection request block: reqsk_alloc / inet_reqsk_alloc
static inline struct request_sock *reqsk_alloc(const struct request_sock_ops *ops)
{
//Allocate a connection request block; what is actually allocated here is a struct tcp_request_sock
struct request_sock *req = kmem_cache_alloc(ops->slab, GFP_ATOMIC);
//Store the ops table in the request block's rsk_ops member
if (req != NULL)
req->rsk_ops = ops;
return req;
}
The ops passed to reqsk_alloc() is tcp_request_sock_ops, defined as follows:
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
.family = PF_INET,
//The object created from the slab is a struct tcp_request_sock
.obj_size = sizeof(struct tcp_request_sock),
.rtx_syn_ack = tcp_v4_send_synack,
.send_ack = tcp_v4_reqsk_send_ack,
.destructor = tcp_v4_reqsk_destructor,
.send_reset = tcp_v4_send_reset,
};
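The reason obj_size is sizeof(struct tcp_request_sock) while reqsk_alloc() returns a struct request_sock * is that the structures are nested, with request_sock as the first member; the inet_rsk()/tcp_rsk() helpers are just casts. A trimmed excerpt of the layout (most fields omitted, names as in this era of kernels):
struct inet_request_sock {
struct request_sock req; /* must be the first member so the casts work */
__be32 loc_addr;
__be32 rmt_addr;
__be16 rmt_port;
/* option flags: tstamp_ok, sack_ok, wscale_ok, ecn_ok, acked, ... */
struct ip_options *opt;
};

struct tcp_request_sock {
struct inet_request_sock req; /* again the first member */
u32 rcv_isn; /* client's initial sequence number */
u32 snt_isn; /* our initial sequence number, set in tcp_v4_conn_request() */
};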
The slab cache ops->slab is created when the AF_INET protocol family is initialized:
struct proto tcp_prot = {
...
.rsk_prot = &tcp_request_sock_ops,
...
};
static int __init inet_init(void)
{
...
rc = proto_register(&tcp_prot, 1);
if (rc)
goto out;
...
}
int proto_register(struct proto *prot, int alloc_slab)
{
...
prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
prot->rsk_prot->obj_size, 0,
SLAB_HWCACHE_ALIGN, NULL);
...
}
2.5.2 Initializing the connection request block
Initialization of the request block depends on the TCP options carried in the SYN segment, so it runs after option parsing:
static inline void tcp_openreq_init(struct request_sock *req,
struct tcp_options_received *rx_opt,
struct sk_buff *skb)
{
struct inet_request_sock *ireq = inet_rsk(req);
req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */
tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; /* client's initial sequence number */
req->mss = rx_opt->mss_clamp;
req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
ireq->tstamp_ok = rx_opt->tstamp_ok;
ireq->sack_ok = rx_opt->sack_ok;
ireq->snd_wscale = rx_opt->snd_wscale;
ireq->wscale_ok = rx_opt->wscale_ok;
ireq->acked = 0;
ireq->ecn_ok = 0;
ireq->rmt_port = tcp_hdr(skb)->source; /* client's (remote) source port */
}
2.6 Adding the request block to the SYN request queue: inet_csk_reqsk_queue_hash_add
void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
unsigned long timeout)
{
struct inet_connection_sock *icsk = inet_csk(sk);
//Get the SYN request queue (struct listen_sock)
struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
//Compute a hash from the peer IP address, peer port and the listener's random hash seed
const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port,
lopt->hash_rnd, lopt->nr_table_entries);
//Insert the request block into the SYN request queue and record the timeout in it
reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
//Update the queue counters qlen and qlen_young, and start the SYN+ACK retransmission timer
inet_csk_reqsk_queue_added(sk, timeout);
}
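For reference, inet_synq_hash() folds the peer address, the peer port and the per-listener random seed into a bucket index. A sketch of its typical shape in this era of kernels (assuming jhash_2words() from <linux/jhash.h> and nr_table_entries being a power of two):
static u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
const u32 rnd, const u16 synq_hsize)
{
//Hash the remote address and port with the random seed, then mask down
//to a bucket index (synq_hsize is a power of two)
return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1);
}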
static inline void reqsk_queue_hash_req(struct request_sock_queue *queue,
u32 hash, struct request_sock *req,
unsigned long timeout)
{
struct listen_sock *lopt = queue->listen_opt;
//Record the expiry time
req->expires = jiffies + timeout;
//Initialize the SYN+ACK retransmission count to 0
req->retrans = 0;
req->sk = NULL;
//Insert the new request block at the head of its hash bucket in the SYN request queue
req->dl_next = lopt->syn_table[hash];
write_lock(&queue->syn_wait_lock);
lopt->syn_table[hash] = req;
write_unlock(&queue->syn_wait_lock);
}
static inline void inet_csk_reqsk_queue_added(struct sock *sk,
const unsigned long timeout)
{
//Update the listen_sock counters. A return value of 0 means the SYN request queue was empty before,
//in which case the SYN+ACK retransmission timer needs to be armed (a listening socket reuses the keepalive timer slot for this)
if (reqsk_queue_added(&inet_csk(sk)->icsk_accept_queue) == 0)
inet_csk_reset_keepalive_timer(sk, timeout);
}
static inline int reqsk_queue_added(struct request_sock_queue *queue)
{
struct listen_sock *lopt = queue->listen_opt;
const int prev_qlen = lopt->qlen;
//Update qlen and qlen_young
lopt->qlen_young++;
lopt->qlen++;
//Return the previous length of the SYN request queue
return prev_qlen;
}