Linux TCP那些事儿之被动套接字(一)

1. 概述

github项目地址:https://github.com/superwujc

尊重原创,欢迎转载,注明出处:https://my.oschina.net/superwjc/blog/1816385

Linux下创建TCP服务端的流程为依次调用socket(2),bind(2),listen(2),accept(2),其中,listen(2)将本端套接字设置为被动套接字(passive socket),使之进入监听状态,以等待客户端发起的主动连接。

#include <sys/types.h>
#include <sys/socket.h>

int listen(int sockfd, int backlog);

对于listen(2)的backlog参数,网上大多数文章表意不清,或有歧义,且缺乏有力的实证;本文将从内核源码的角度分析被动套接字的创建流程,包括backlog参数的具体含义,以及与之相关的内核参数somaxconn,tcp_max_syn_backlog,tcp_syncookies。

2. 示例程序

操作系统版本为CentOS Linux release 7.5.1804 (Core),内核版本为3.10.0-862.2.3.el7.x86_64。

服务端:tcp_server.c,以命令行选项-b指定listen(2)的backlog参数

/* tcp_server.c */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <string.h>
#include <errno.h>

/* Shorthand for the generic socket address type used when casting for bind(2). */
#define SA struct sockaddr
/*
 * Print a printf-style message to stderr and terminate the process with
 * EXIT_FAILURE. `msg` is the format string; `##__VA_ARGS__` (a GNU
 * extension) lets the macro be used with or without extra arguments.
 */
#define ERR_EXIT(msg, ...) \
	do { fprintf(stderr, msg, ##__VA_ARGS__); exit(EXIT_FAILURE); } while (0)

/*
 * Test server for observing listen(2) backlog handling.
 *
 * Usage: tcp_server -b <backlog>
 *
 * Binds a TCP socket to 0.0.0.0:8888, puts it into the listening state with
 * the requested backlog and then sleeps forever. accept(2) is deliberately
 * never called.
 *
 * Exits with EXIT_FAILURE on bad arguments or if any socket call fails.
 */
int main(int argc, char *argv[])
{
	if (argc < 3)
		ERR_EXIT("Usage: %s -b <backlog>\n", argv[0]);

	int lfd, c;
	int backlog = 0;
	int have_backlog = 0;	/* set once -b has been parsed successfully */
	struct sockaddr_in svaddr;
	socklen_t svaddrlen;

	while ((c = getopt(argc, argv, "b:")) != -1) {
		char *end;
		long val;

		if (c != 'b')
			exit(EXIT_FAILURE);

		/* strtol() instead of atoi() so that non-numeric input is rejected. */
		errno = 0;
		val = strtol(optarg, &end, 10);
		if (end == optarg || *end != '\0' || errno == ERANGE ||
		    (long)(int)val != val)	/* value must round-trip through int */
			ERR_EXIT("invalid backlog: %s\n", optarg);

		backlog = (int)val;
		have_backlog = 1;
	}

	/*
	 * argc >= 3 does not guarantee that -b was given (e.g. two plain
	 * arguments); without this check backlog would be used unset.
	 */
	if (!have_backlog)
		ERR_EXIT("Usage: %s -b <backlog>\n", argv[0]);

	lfd = socket(AF_INET, SOCK_STREAM, 0);
	if (lfd == -1)
		ERR_EXIT("socket() failed: %s\n", strerror(errno));

	svaddrlen = sizeof(svaddr);

	memset(&svaddr, 0, svaddrlen);
	svaddr.sin_family = AF_INET;
	svaddr.sin_addr.s_addr = htonl(INADDR_ANY);
	svaddr.sin_port = htons(8888);

	if (bind(lfd, (SA *)&svaddr, svaddrlen) == -1)
		ERR_EXIT("bind() failed: %s\n", strerror(errno));

	if (listen(lfd, backlog) == -1)
		ERR_EXIT("listen() failed: %s\n", strerror(errno));

	/* Keep the listening socket open; the process runs until it is killed. */
	for ( ; ; )
		sleep(1);
}

客户端程序:tcp_client.c,命令行选项-i/-p/-n分别指定服务端ip,端口,以及发起连接的数量

/* tcp_client.c */
#include <errno.h>
#include <limits.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <arpa/inet.h>

/* Shorthand for the generic socket address type used when casting for connect(2). */
#define SA struct sockaddr
/*
 * Print a printf-style message to stderr and terminate the process with
 * EXIT_FAILURE. `msg` is the format string; `##__VA_ARGS__` (a GNU
 * extension) lets the macro be used with or without extra arguments.
 */
#define ERR_EXIT(msg, ...) \
	do { fprintf(stderr, msg, ##__VA_ARGS__); exit(EXIT_FAILURE); } while (0)

/* Forward declaration; arguments are the server IP string and the server port. */
static void connect_to_server(char *, int);

/*
 * Test client that opens many concurrent TCP connections.
 *
 * Usage: tcp_client -i <ip> -p <port> -n <count>
 *
 * Forks <count> children (default 1); each child calls connect_to_server()
 * and never returns from it. SIGCHLD is ignored in the parent. The parent
 * then sleeps forever.
 *
 * Exits with EXIT_FAILURE on bad arguments or if sigaction()/fork() fails.
 */
int main(int argc, char *argv[])
{
	if (argc < 5)
		ERR_EXIT("Usage: %s -i <ip> -p <port> -n <count>\n", argv[0]);

	int i, c;
	int nr_children = 1;
	char *svip = NULL;
	in_port_t svport = 0;
	struct sigaction sa;

	/* Zero the whole struct so that sa_flags is not left indeterminate. */
	memset(&sa, 0, sizeof(sa));
	sigemptyset(&sa.sa_mask);
	sa.sa_handler = SIG_IGN;
	if (sigaction(SIGCHLD, &sa, NULL) == -1)
		ERR_EXIT("sigaction() failed: %s\n", strerror(errno));

	while ((c = getopt(argc, argv, "i:p:n:")) != -1) {
		char *end;
		long val;

		switch (c) {
		case 'i':
			svip = optarg;
			break;
		case 'p':
			/* Reject garbage and values outside the 16-bit port range. */
			errno = 0;
			val = strtol(optarg, &end, 10);
			if (end == optarg || *end != '\0' || errno == ERANGE ||
			    val < 1 || val > 65535)
				ERR_EXIT("invalid dst port\n");
			svport = (in_port_t)val;
			break;
		case 'n':
			errno = 0;
			val = strtol(optarg, &end, 10);
			if (end == optarg || *end != '\0' || errno == ERANGE ||
			    val < 0 || val > INT_MAX)
				ERR_EXIT("invalid connection count: %s\n", optarg);
			nr_children = (int)val;
			break;
		default:
			exit(EXIT_FAILURE);
		}
	}

	if (!svip)
		ERR_EXIT("invalid dst ip\n");

	/* Still needed for the case where -p was not given at all. */
	if (!svport)
		ERR_EXIT("invalid dst port\n");

	for (i = 0; i < nr_children; i++) {
		switch (fork()) {
		case -1:
			ERR_EXIT("fork() failed:%s\n", strerror(errno));
		case 0:
			connect_to_server(svip, svport);
			_exit(EXIT_SUCCESS);
		default:
			/*
			 * The parent owns no socket: each child creates its own
			 * inside connect_to_server(), so there is nothing to
			 * close here.
			 */
			break;
		}
	}

	/* Stay alive alongside the children; the process runs until killed. */
	for ( ; ; )
		sleep(1);
}

/*
 * Open one TCP connection to svip:svport and hold it open forever.
 *
 * svip   - server address as a dotted-quad IPv4 string
 * svport - server port in host byte order
 *
 * Does not return: on success it sleeps in an endless loop with the
 * connection open; on any failure it terminates the process via ERR_EXIT.
 */
static void connect_to_server(char *svip, int svport)
{
	int cfd;
	struct sockaddr_in svaddr;
	socklen_t svaddrlen = sizeof(svaddr);

	memset(&svaddr, 0, svaddrlen);
	svaddr.sin_family = AF_INET;
	svaddr.sin_port = htons((in_port_t)svport);

	/*
	 * inet_pton() returns 1 only when the string was converted. Without
	 * this check a malformed address would leave sin_addr zeroed and the
	 * connect() below would target the wrong host.
	 */
	if (inet_pton(AF_INET, svip, &svaddr.sin_addr) != 1)
		ERR_EXIT("invalid dst ip: %s\n", svip);

	cfd = socket(AF_INET, SOCK_STREAM, 0);
	if (cfd == -1)
		ERR_EXIT("socket() failed: %s\n", strerror(errno));

	if (connect(cfd, (SA *)&svaddr, svaddrlen) == -1)
		ERR_EXIT("connect() failed: %s\n", strerror(errno));

	/* Keep the connection open; cfd is intentionally never closed. */
	for ( ; ; )
		sleep(1);
}

3. 内核数据结构

内核为TCP的连接与传输维持2个队列:已完成队列与未完成队列;已完成队列在逻辑上又可划分为半连接队列与全连接队列。

半连接队列通过request_sock_queue结构表示,该结构中包括request_sock结构与listen_sock结构,其中,listen_sock结构的max_qlen_log与qlen字段共同决定半连接队列允许的最大长度:qlen表示半连接队列的当前长度,max_qlen_log表示以2为底数的半连接队列最大长度的对数。

/* include/net/request_sock.h */

151 struct request_sock_queue {
152     struct request_sock *rskq_accept_head;
153     struct request_sock *rskq_accept_tail;
		...
157     struct listen_sock  *listen_opt;
		...
164 };


 50 struct request_sock {
 51     struct sock_common      __req_common;
 52     struct request_sock     *dl_next;
 53     u16             mss;
 54     u8              num_retrans; /* number of retransmits */
 55     u8              cookie_ts:1; /* syncookie: encode tcpopts in timestamp */
 56     u8              num_timeout:7; /* number of timeouts */
 57     /* The following two fields can be easily recomputed I think -AK */
 58     u32             window_clamp; /* window clamp at creation time */
 59     u32             rcv_wnd;      /* rcv_wnd offered first time */
 60     u32             ts_recent;
 61     unsigned long           expires;
 62     const struct request_sock_ops   *rsk_ops;
 63     struct sock         *sk;
 64     u32             secid;
 65     u32             peer_secid;
 66 };


 91 /** struct listen_sock - listen state
 92  *
 93  * @max_qlen_log - log_2 of maximal queued SYNs/REQUESTs
 94  */
 95 struct listen_sock {
 96     u8          max_qlen_log;
 97     u8          synflood_warned;
 98     /* 2 bytes hole, try to use */
 99     int         qlen;
100     int         qlen_young;
101     int         clock_hand;
102     u32         hash_rnd;
103     u32         nr_table_entries;
104     struct request_sock *syn_table[0];
105 };

全连接队列通过sock结构中的sk_ack_backlog与sk_max_ack_backlog字段共同表示,分别表示全连接队列的当前长度与最大长度。

/* include/net/sock.h */
303 struct sock {
		...
408 	unsigned short      sk_ack_backlog;
409 	unsigned short      sk_max_ack_backlog;
		...
460 };

对于每一个传入的TCP连接请求,内核将先后检查半连接与全连接队列的当前长度,根据队列是否已满选择相应的处理方式。

/* include/net/request_sock.h */
258 static inline int reqsk_queue_is_full(const struct request_sock_queue *queue)
259 {
260     return queue->listen_opt->qlen >> queue->listen_opt->max_qlen_log;
261 }


/* include/net/sock.h */
 815 static inline bool sk_acceptq_is_full(const struct sock *sk)
 816 {
 817     return sk->sk_ack_backlog > sk->sk_max_ack_backlog;
 818 }

4. 内核处理流程

4.1 - TCP协议初始化

TCP协议的源码以静态的方式编译至内核映像,随系统启动而完成初始化,包括流套接字创建,接收与发送等操作时执行的一系列回调函数,以及ipv4/ipv6支持的特定操作等。

首先通过tcp_prot结构指定TCP套接字的内核回调函数,包括用于初始化的tcp_v4_init_sock()以及用于接收数据包的函数tcp_v4_do_rcv()等,封装在ipv4中的TCP另包含特定的处理函数tcp_v4_conn_request()与tcp_v4_syn_recv_sock()等:

/* net/ipv4/tcp_ipv4.c */
2428 struct proto tcp_prot = {
2429     .name           = "TCP",
2430     .owner          = THIS_MODULE,
		 ...
2436     .init           = tcp_v4_init_sock,
		 ...
2444     .backlog_rcv        = tcp_v4_do_rcv,
		 ...
2473 };

/* net/ipv4/tcp_ipv4.c */
1843 static int tcp_v4_init_sock(struct sock *sk)
1844 {
		 ...
1849     icsk->icsk_af_ops = &ipv4_specific;
		 ...
1855     return 0;
1856 };

/* net/ipv4/tcp_ipv4.c */
1811 const struct inet_connection_sock_af_ops ipv4_specific = {
		 ...
1816     .conn_request      = tcp_v4_conn_request,
1817     .syn_recv_sock     = tcp_v4_syn_recv_sock,
		 ...
1829 };

然后通过inet_stream_ops结构指定TCP套接字支持的方法,这些内核函数分别对应于用户空间的bind(2),listen(2)等:

/* net/ipv4/af_inet.c */
 904 const struct proto_ops inet_stream_ops = {
 905     .family        = PF_INET,
 906     .owner         = THIS_MODULE,
 907     .release       = inet_release,
 908     .bind          = inet_bind,
 909     .connect       = inet_stream_connect,
 910     .socketpair    = sock_no_socketpair,
 911     .accept        = inet_accept,
 912     .getname       = inet_getname,
 913     .poll          = tcp_poll,
 914     .ioctl         = inet_ioctl,
 915     .listen        = inet_listen,
 916     .shutdown      = inet_shutdown,
 917     .setsockopt    = sock_common_setsockopt,
 918     .getsockopt    = sock_common_getsockopt,
 919     .sendmsg       = inet_sendmsg,
 920     .recvmsg       = inet_recvmsg,
 921     .mmap          = sock_no_mmap,
 922     .sendpage      = inet_sendpage,
 923     .splice_read       = tcp_splice_read,
 924 #ifdef CONFIG_COMPAT
 925     .compat_setsockopt = compat_sock_common_setsockopt,
 926     .compat_getsockopt = compat_sock_common_getsockopt,
 927     .compat_ioctl      = inet_compat_ioctl,
 928 #endif
 929 };

包含tcp_prot与inet_stream_ops的套接字类型数组inetsw_array:

/* net/ipv4/af_inet.c */
 998 static struct inet_protosw inetsw_array[] =
 999 {
1000     {
1001         .type =       SOCK_STREAM,
1002         .protocol =   IPPROTO_TCP,
1003         .prot =       &tcp_prot,
1004         .ops =        &inet_stream_ops,
1005         .flags =      INET_PROTOSW_PERMANENT |
1006                   INET_PROTOSW_ICSK,
1007     },
         ...
1032 };

用于注册协议的tcp_protocol,其中包含接收数据包时调用的内核函数tcp_v4_rcv():

/* net/ipv4/af_inet.c */
1544 static const struct net_protocol tcp_protocol = {
1545     .early_demux    =   tcp_v4_early_demux,
1546     .handler    =   tcp_v4_rcv,
1547     .err_handler    =   tcp_v4_err,
1548     .no_policy  =   1,
1549     .netns_ok   =   1,
1550 };

初始化操作最终通过inet_init()函数实现,将以上设置的tcp_prot,tcp_protocol,inetsw_array[]进行整合,最后调用tcp_init()函数,打印Hash tables configured (established %u bind %u)信息,通过dmesg可以查看:

/* net/ipv4/af_inet.c */
1683 static int __init inet_init(void)
1684 {
		...
1695    rc = proto_register(&tcp_prot, 1);
		...
1731    if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
		...
1742    for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
1743        inet_register_protosw(q);
		...
1757    tcp_v4_init();
		...
1760    tcp_init();
		...
1881 };


/* net/ipv4/tcp.c */
3098 void __init tcp_init(void)
3099 {
		 ...
3171     pr_info("Hash tables configured (established %u bind %u)\n",
3172         tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
		 ...
3177 };
# dmesg | grep 'Hash tables configured'
[    1.511233] TCP: Hash tables configured (established 8192 bind 8192)

4.2 - listen(2)系统调用

listen(2)系统调用对应的内核函数为sys_listen():

# stap -L 'kernel.function("sys_listen")'
kernel.function("SyS_listen@net/socket.c:1576") $fd:long int $backlog:long int
/* net/socket.c */
1576 SYSCALL_DEFINE2(listen, int, fd, int, backlog)
1577 {
1578     struct socket *sock;
1579     int err, fput_needed;
1580     int somaxconn;
1581
1582     sock = sockfd_lookup_light(fd, &err, &fput_needed);
1583     if (sock) {
1584         somaxconn = sock_net(sock->sk)->core.sysctl_somaxconn;
1585         if ((unsigned int)backlog > somaxconn)
1586             backlog = somaxconn;
1587
1588         err = security_socket_listen(sock, backlog);
1589         if (!err)
1590             err = sock->ops->listen(sock, backlog);
1591
1592         fput_light(sock->file, fput_needed);
1593     }
1594     return err;
1595 }

1584 - 1586行,将手动指定的backlog与系统的somaxconn进行比较,取最小值。

1590行,调用sock->ops->listen()回调,即初始化阶段在inet_stream_ops的.listen字段中设置的inet_listen()函数。

somaxconn值可以通过sysctl接口的net.core.somaxconn参数,或proc接口的/proc/sys/net/core/somaxconn伪文件指定;proc项与内核参数的对应关系定义在net/ipv4/sysctl_net_ipv4.c与net/core/sysctl_net_core.c中。

/* net/core/sysctl_net_core.c */
297 static struct ctl_table netns_core_table[] = {
298     {
299         .procname   = "somaxconn",
300         .data       = &init_net.core.sysctl_somaxconn,
301         .maxlen     = sizeof(int),
302         .mode       = 0644,
303         .extra1     = &zero,
304         .extra2     = &ushort_max,
305         .proc_handler   = proc_dointvec_minmax
306     },
307     { }
308 };
# stap -L 'kernel.function("inet_listen")'
kernel.function("inet_listen@net/ipv4/af_inet.c:193") $sock:struct socket* $backlog:int
/* net/ipv4/af_inet.c */
 193 int inet_listen(struct socket *sock, int backlog)
 194 {
 195     struct sock *sk = sock->sk;
 196     unsigned char old_state;
 197     int err;
 198
 199     lock_sock(sk);
 200
 201     err = -EINVAL;
 202     if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
 203         goto out;
 204
 205     old_state = sk->sk_state;
 206     if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
 207         goto out;
 208
 209     /* Really, if the socket is already in listen state
 210      * we can only allow the backlog to be adjusted.
 211      */
 212     if (old_state != TCP_LISTEN) {
 213         /* Check special setups for testing purpose to enable TFO w/o
 214          * requiring TCP_FASTOPEN sockopt.
 215          * Note that only TCP sockets (SOCK_STREAM) will reach here.
 216          * Also fastopenq may already been allocated because this
 217          * socket was in TCP_LISTEN state previously but was
 218          * shutdown() (rather than close()).
 219          */
 220         if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 &&
 221             inet_csk(sk)->icsk_accept_queue.fastopenq == NULL) {
 222             if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0)
 223                 err = fastopen_init_queue(sk, backlog);
 224             else if ((sysctl_tcp_fastopen &
 225                   TFO_SERVER_WO_SOCKOPT2) != 0)
 226                 err = fastopen_init_queue(sk,
 227                     ((uint)sysctl_tcp_fastopen) >> 16);
 228             else
 229                 err = 0;
 230             if (err)
 231                 goto out;
 232
 233             tcp_fastopen_init_key_once(true);
 234         }
 235         err = inet_csk_listen_start(sk, backlog);
 236         if (err)
 237             goto out;
 238     }
 239     sk->sk_max_ack_backlog = backlog;
 240     err = 0;
 241
 242 out:
 243     release_sock(sk);
 244     return err;
 245 }

通过systemtap追踪该函数的执行流程:

# stap -c './tcp_server -b 3' -e 'probe kernel.statement("inet_listen@net/ipv4/af_inet.c:*") {printf("%s\n", pp())}'
WARNING: probe kernel.statement("inet_listen@net/ipv4/af_inet.c:220") (address 0xffffffff89a6ef47) registration error (rc -22)
kernel.statement("inet_listen@net/ipv4/af_inet.c:194")
kernel.statement("inet_listen@net/ipv4/af_inet.c:195")
kernel.statement("inet_listen@net/ipv4/af_inet.c:202")
kernel.statement("inet_listen@net/ipv4/af_inet.c:201")
kernel.statement("inet_listen@net/ipv4/af_inet.c:205")
kernel.statement("inet_listen@net/ipv4/af_inet.c:206")
kernel.statement("inet_listen@net/ipv4/af_inet.c:212")
kernel.statement("inet_listen@net/ipv4/af_inet.c:235")
kernel.statement("inet_listen@net/ipv4/af_inet.c:236")
kernel.statement("inet_listen@net/ipv4/af_inet.c:239")
kernel.statement("inet_listen@net/ipv4/af_inet.c:240")
kernel.statement("inet_listen@net/ipv4/af_inet.c:243")
kernel.statement("inet_listen@net/ipv4/af_inet.c:245")

可以得出该函数执行的主要操作为:

235行,调用inet_csk_listen_start()函数

239行,inet_csk_listen_start()函数返回后,将backlog作为全连接队列长度的最大值sk_max_ack_backlog。

# stap -L 'kernel.function("inet_csk_listen_start")'
kernel.function("inet_csk_listen_start@net/ipv4/inet_connection_sock.c:758") $sk:struct sock* $nr_table_entries:int const
/* net/ipv4/inet_connection_sock.c */
758 int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
759 {
760     struct inet_sock *inet = inet_sk(sk);
761     struct inet_connection_sock *icsk = inet_csk(sk);
762     int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
763
764     if (rc != 0)
765         return rc;
766
767     sk->sk_max_ack_backlog = 0;
768     sk->sk_ack_backlog = 0;
769     inet_csk_delack_init(sk);
770
771     /* There is race window here: we announce ourselves listening,
772      * but this transition is still not validated by get_port().
773      * It is OK, because this socket enters to hash table only
774      * after validation is complete.
775      */
776     sk->sk_state = TCP_LISTEN;
777     if (!sk->sk_prot->get_port(sk, inet->inet_num)) {
778         inet->inet_sport = htons(inet->inet_num);
779
780         sk_dst_reset(sk);
781         sk->sk_prot->hash(sk);
782
783         return 0;
784     }
785
786     sk->sk_state = TCP_CLOSE;
787     __reqsk_queue_destroy(&icsk->icsk_accept_queue);
788     return -EADDRINUSE;
789 }

inet_csk_listen_start()函数的nr_table_entries参数对应于inet_listen()函数的backlog参数。

762行,调用reqsk_queue_alloc()函数,分配已完成队列(请求连接队列)

767 - 768行,reqsk_queue_alloc()函数返回后,设置已完成队列sk_max_ack_backlog与sk_ack_backlog的初始值为0。

776行,将套接字的状态设置为TCP_LISTEN,即监听状态。

# stap -L 'kernel.function("reqsk_queue_alloc")'
kernel.function("reqsk_queue_alloc@net/core/request_sock.c:40") $queue:struct request_sock_queue* $nr_table_entries:unsigned int $lopt_size:size_t
/* net/core/request_sock.c */
 37 int sysctl_max_syn_backlog = 256;
 38 EXPORT_SYMBOL(sysctl_max_syn_backlog);
 39
 40 int reqsk_queue_alloc(struct request_sock_queue *queue,
 41               unsigned int nr_table_entries)
 42 {
 43     size_t lopt_size = sizeof(struct listen_sock);
 44     struct listen_sock *lopt;
 45
 46     nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
 47     nr_table_entries = max_t(u32, nr_table_entries, 8);
 48     nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
 49     lopt_size += nr_table_entries * sizeof(struct request_sock *);
 50     if (lopt_size > PAGE_SIZE)
 51         lopt = vzalloc(lopt_size);
 52     else
 53         lopt = kzalloc(lopt_size, GFP_KERNEL);
 54     if (lopt == NULL)
 55         return -ENOMEM;
 56
 57     for (lopt->max_qlen_log = 3;
 58          (1 << lopt->max_qlen_log) < nr_table_entries;
 59          lopt->max_qlen_log++);
 60
 61     get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
 62     rwlock_init(&queue->syn_wait_lock);
 63     queue->rskq_accept_head = NULL;
 64     lopt->nr_table_entries = nr_table_entries;
 65
 66     write_lock_bh(&queue->syn_wait_lock);
 67     queue->listen_opt = lopt;
 68     write_unlock_bh(&queue->syn_wait_lock);
 69
 70     return 0;
 71 }

37行,将sysctl_max_syn_backlog的值显式设置为256,但使用该变量的所有内核函数,包括reqsk_queue_alloc(),仍按照/proc/sys/net/ipv4/tcp_max_syn_backlog或net.ipv4.tcp_max_syn_backlog指定的值执行。可通过以下systemtap脚本验证:

# cat /proc/sys/net/ipv4/tcp_max_syn_backlog
128
#
# stap -e 'probe kernel.statement("reqsk_queue_alloc@net/core/request_sock.c:71") {printf("%d\n", $sysctl_max_syn_backlog)}' -c './tcp_server -b 3'
128

46 - 48行,设置nr_table_entries参数的最终值,并在64行赋予struct request_sock_queue中struct listen_sock的nr_table_entries字段。

min_t(u32, nr_table_entries, sysctl_max_syn_backlog):在nr_table_entries与sysctl_max_syn_backlog之间取最小值。
max_t(u32, nr_table_entries, 8):在nr_table_entries与8之间取最大值。
roundup_pow_of_two(nr_table_entries + 1):按(nr_table_entries + 1)向上取2的正整数次幂。

经过前2步,可以确定nr_table_entries的最小值为8,因此执行第3步后,nr_table_entries的最小值为(8 + 1)向上取2的正整数次幂,即16;而nr_table_entries的最大值由listen(2)的backlog参数、somaxconn与tcp_max_syn_backlog三者中的最小值决定。

49 - 55行,为半连接队列struct listen_sock分配空间,长度为sizeof(struct listen_sock) + nr_table_entries * sizeof(struct request_sock *);struct request_sock的空间此时并未分配,直到接收到客户端的第一个syn包。

57 - 59行,设置max_qlen_log的最终值;qlen所在的listen_sock结构由vzalloc()/kzalloc()分配并清零,因此qlen的初始值为0。

max_qlen_log表示"log of max q len",即以2为底数,半连接队列长度最大值的对数,该值用于测试半连接队列是否已满。

首先设置初始值为3,然后计算1算术左移max_qlen_log的结果,即2的max_qlen_log次幂,若小于nr_table_entries,则max_qlen_log自增;由于nr_table_entries的最小值为16,因此max_qlen_log的最小值为初始值3自增1次,即4,此时满足2 ^ max_qlen_log == nr_table_entries,而判断半连接队列是否已满的条件为qlen >> max_qlen_log的结果是否非0(即qlen是否已达到2 ^ max_qlen_log),因此半连接队列的长度为(nr_table_entries - 1)。

至此,TCP被动套接字的设置完毕,函数调用与返回的顺序相反,依次为:

reqsk_queue_alloc() -> inet_csk_listen_start() -> inet_listen() -> sys_listen()

此过程根据listen(2)的backlog参数,系统的somaxconn以及tcp_max_syn_backlog参数,设置了已完成连接队列的长度,包括用于控制半连接队列的qlen与max_qlen_log,以及用于控制全连接队列的sk_ack_backlog与sk_max_ack_backlog,并将套接字的状态设置为LISTEN。

4.3 - 接收连接

负责对新传入的连接进行处理的函数为TCP协议初始化阶段设置的tcp_v4_rcv() -> tcp_v4_do_rcv() -> tcp_rcv_state_process() -> tcp_v4_conn_request() -> tcp_conn_request()。

/* net/ipv4/tcp_ipv4.c */
1287 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1288     .mss_clamp  =   TCP_MSS_DEFAULT,
1289 #ifdef CONFIG_TCP_MD5SIG
1290     .md5_lookup =   tcp_v4_reqsk_md5_lookup,
1291     .calc_md5_hash  =   tcp_v4_md5_hash_skb,
1292 #endif
1293     .init_req   =   tcp_v4_init_req,
1294 #ifdef CONFIG_SYN_COOKIES
1295     .cookie_init_seq =  cookie_v4_init_sequence,
1296 #endif
1297     .route_req  =   tcp_v4_route_req,
1298     .init_seq   =   tcp_v4_init_sequence,
1299     .send_synack    =   tcp_v4_send_synack,
1300     .queue_hash_add =   inet_csk_reqsk_queue_hash_add,
1301 };
1302
1303 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1304 {
1305     /* Never answer to SYNs send to broadcast or multicast */
1306     if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1307         goto drop;
1308
1309     return tcp_conn_request(&tcp_request_sock_ops,
1310                 &tcp_request_sock_ipv4_ops, sk, skb);
1311 drop:
1312     NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1313     return 0;
1314 }
/* net/ipv4/tcp_input.c */
5998 int tcp_conn_request(struct request_sock_ops *rsk_ops,
5999              const struct tcp_request_sock_ops *af_ops,
6000              struct sock *sk, struct sk_buff *skb)
6001 {
         ...
6017     if ((sysctl_tcp_syncookies == 2 ||
6018          inet_csk_reqsk_queue_is_full(sk)) && !isn) {
6019         want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
6020         if (!want_cookie)
6021             goto drop;
6022     }
         ...
6030     if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
6031         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
6032         goto drop;
6033     }
6034
6035     req = inet_reqsk_alloc(rsk_ops);
6036     if (!req)
6037         goto drop;
         ...
6081         /* Kill the following clause, if you dislike this way. */
6082         else if (!sysctl_tcp_syncookies &&
6083              (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
6084               (sysctl_max_syn_backlog >> 2)) &&
6085              !tcp_peer_is_proven(req, dst, false,
6086                          tmp_opt.saw_tstamp)) {
6087             /* Without syncookies last quarter of
6088              * backlog is filled with destinations,
6089              * proven to be alive.
6090              * It means that we continue to communicate
6091              * to destinations, already remembered
6092              * to the moment of synflood.
6093              */
6094             pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
6095                     rsk_ops->family);
6096             goto drop_and_release;
6097         }
6098
6099         isn = af_ops->init_seq(skb);
6100     }
         ...
6120     err = af_ops->send_synack(sk, dst, &fl, req,
6121                   skb_get_queue_mapping(skb), &foc);
6122     if (!fastopen) {
6123         if (err || want_cookie)
6124             goto drop_and_free;
6125
6126         tcp_rsk(req)->listener = NULL;
6127         af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
6128     }
6129
6130     return 0;
6131
6132 drop_and_release:
6133     dst_release(dst);
6134 drop_and_free:
6135     reqsk_free(req);
6136 drop:
6137     NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
6138     return 0;
6139 }

该函数中的isn为TCP控制块struct tcp_skb_cb中的一个字段,表示在套接字的TIME-WAIT阶段设置的初始序列号(initial sequence number),首次建立连接时,该字段值为0。

6017 - 6022行,若tcp_syncookies的值为2,或半连接队列已满,且isn为0,则调用tcp_syn_flood_action():开启了tcp_syncookies时该函数返回true,改为发送cookie;未开启时返回false,直接丢弃数据包且不对客户端做出响应。

/* include/net/inet_connection_sock.h */
306 static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk)
307 {
308     return reqsk_queue_is_full(&inet_csk(sk)->icsk_accept_queue);
309 }

/* include/net/request_sock.h */
258 static inline int reqsk_queue_is_full(const struct request_sock_queue *queue)
259 {
260     return queue->listen_opt->qlen >> queue->listen_opt->max_qlen_log;
261 }
/* net/ipv4/tcp_ipv4.c */
 865 bool tcp_syn_flood_action(struct sock *sk,
 866              const struct sk_buff *skb,
 867              const char *proto)
 868 {
 869     const char *msg = "Dropping request";
 870     bool want_cookie = false;
 871     struct listen_sock *lopt;
 872
 873
 874
 875 #ifdef CONFIG_SYN_COOKIES
 876     if (sysctl_tcp_syncookies) {
 877         msg = "Sending cookies";
 878         want_cookie = true;
 879         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
 880     } else
 881 #endif
 882         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
 883
 884     lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
 885     if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) {
 886         lopt->synflood_warned = 1;
 887         pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
 888             proto, ntohs(tcp_hdr(skb)->dest), msg);
 889     }
 890     return want_cookie;
 891 }

6030 - 6033行,检查全连接队列是否已满,且半连接队列中的年轻(young)连接数(qlen_young)是否大于1,若是,则直接丢弃且不对客户端做出任何响应。年轻的连接表示三次握手的第二阶段(TCP服务端向客户端发送初始SYN+ACK)未超时的连接。

6035行,首次为新传入的连接请求分配struct request_sock结构实例。

6082 - 6097行,未开启tcp_syncookies,且达到未完成队列规定的上限值时,TCP服务端向客户端发送的初始SYN+ACK仍未收到客户端的确认,则调用pr_drop_req打印信息,并丢弃该连接。

未开启tcp_syncookies时,内核参数sysctl_max_syn_backlog除了决定半连接队列的最大长度,还根据半连接队列的当前长度决定未完成连接的上限值,即

(sysctl_max_syn_backlog - qlen) < (sysctl_max_syn_backlog >> 2),qlen的数值大于sysctl_max_syn_backlog的3/4时,将丢弃当前连接请求。

/* Documentation/networking/ip-sysctl.txt */

tcp_max_syn_backlog - INTEGER
    Maximal number of remembered connection requests, which have not
    received an acknowledgment from connecting client.

    The minimal value is 128 for low memory machines, and it will
    increase in proportion to the memory of machine.
    If server suffers from overload, try increasing this number.

/* include/net/inet_connection_sock.h */
296 static inline int inet_csk_reqsk_queue_len(const struct sock *sk)
297 {
298     return reqsk_queue_len(&inet_csk(sk)->icsk_accept_queue);
299 }

/* include/net/request_sock.h */
248 static inline int reqsk_queue_len(const struct request_sock_queue *queue)
249 {
250     return queue->listen_opt != NULL ? queue->listen_opt->qlen : 0;
251 }
/* net/ipv4/tcp_input.c */
5949 static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
5950 {
5951     struct inet_request_sock *ireq = inet_rsk(req);
5952
5953     if (family == AF_INET)
5954         LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
5955                    &ireq->ir_rmt_addr, port);
5956 #if IS_ENABLED(CONFIG_IPV6)
5957     else if (family == AF_INET6)
5958         LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI6/%u\n"),
5959                    &ireq->ir_v6_rmt_addr, port);
5960 #endif
5961 }

6120 - 6121行,向客户端发送SYN+ACK,执行三次握手的第二阶段。

6127行,执行queue_hash_add()回调函数,即inet_csk_reqsk_queue_hash_add(),更新半连接队列的当前长度qlen,并设置超时计时器。

/* net/ipv4/inet_connection_sock.c */
526 void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
527                    unsigned long timeout)
528 {
529     struct inet_connection_sock *icsk = inet_csk(sk);
530     struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
531     const u32 h = inet_synq_hash(inet_rsk(req)->ir_rmt_addr,
532                      inet_rsk(req)->ir_rmt_port,
533                      lopt->hash_rnd, lopt->nr_table_entries);
534
535     reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
536     inet_csk_reqsk_queue_added(sk, timeout);
537 }

/* include/net/inet_connection_sock.h */
289 static inline void inet_csk_reqsk_queue_added(struct sock *sk,
290                           const unsigned long timeout)
291 {
292     if (reqsk_queue_added(&inet_csk(sk)->icsk_accept_queue) == 0)
293         inet_csk_reset_keepalive_timer(sk, timeout);
294 }

/* include/net/request_sock.h */
238 static inline int reqsk_queue_added(struct request_sock_queue *queue)
239 {
240     struct listen_sock *lopt = queue->listen_opt;
241     const int prev_qlen = lopt->qlen;
242
243     lopt->qlen_young++;
244     lopt->qlen++;
245     return prev_qlen;
246 }

5. 杂项&总结

各种文章使用了各种不同的术语来描述backlog, somaxconn与tcp_max_syn_backlog,包括半连接队列和已连接队列,请求连接队列与等待连接队列等,导致理解方面的困难;本文根据listen(2)的手册页,将这些队列划分为已完成队列与未完成队列;而listen(2)的backlog同时参与了sk_max_ack_backlog与max_qlen_log的设置,因此又将已完成队列内部划分为半连接队列与全连接队列。

The behavior of the backlog argument on TCP sockets changed with Linux 2.2.  Now it
specifies the queue length for completely established sockets waiting to be accepted,
instead of the number of incomplete connection requests.  The maximum length of the
queue for incomplete sockets can be set using /proc/sys/net/ipv4/tcp_max_syn_backlog.
When syncookies are enabled there is no logical maximum length and this setting is
ignored.  See tcp(7) for more information.

 不论术语如何称谓,以下规则总是生效:

  1. 全连接队列的长度由listen(2)的backlog参数与somaxconn共同决定:二者间的最小值作为全连接队列的长度(sk_max_ack_backlog),全连接队列长度的当前值(sk_ack_backlog)初始为0
  2. 半连接队列的长度由listen(2)的backlog参数,somaxconn与tcp_max_syn_backlog共同决定:取三者间的最小值(若小于8则取8),+1后向上取2的正整数次幂,可以得出nr_table_entries的最终值,(nr_table_entries - 1)即为半连接队列的最大长度(qlen的上限)
  3. 仅当未开启tcp_syncookies时,tcp_max_syn_backlog才会决定未完成队列的长度,即未被客户端确认的SYN+ACK数量:当 qlen > 3/4 * tcp_max_syn_backlog 时,来自未经验证的对端(tcp_peer_is_proven()返回false)的新连接请求将被丢弃
  4. Linux下TCP的三次握手过程发生在accept(2)系统调用之前:成功建立连接的客户端连接保存在全连接队列中

6. 示例

服务端IP为22.99.22.111,客户端IP为22.99.22.101

服务端关闭tcp_syncookies,将somaxconn与tcp_max_syn_backlog分别设置为128与8,listen(2)的backlog参数设置为0,通过systemtap脚本观察内核参数:

# echo 0 > /proc/sys/net/ipv4/tcp_syncookies
# echo 128 > /proc/sys/net/core/somaxconn
# echo 8 > /proc/sys/net/ipv4/tcp_max_syn_backlog
# vi tcp_conn_request.stp

#! /usr/bin/stap

# Running count of probe hits, used as the sequence number in the output.
global i = 1;
# Fires at net/ipv4/tcp_input.c:6001 inside tcp_conn_request() and prints the
# listener's queue counters as seen at that point.
probe kernel.statement("tcp_conn_request@net/ipv4/tcp_input.c:6001")
{
    # $sk is reinterpreted as struct inet_connection_sock to reach
    # icsk_accept_queue.listen_opt and its qlen / max_qlen_log /
    # nr_table_entries fields.
    qlen = @cast($sk, "struct inet_connection_sock")->icsk_accept_queue->listen_opt->qlen
    max_qlen_log = @cast($sk, "struct inet_connection_sock")->icsk_accept_queue->listen_opt->max_qlen_log
    nr_table_entries = @cast($sk, "struct inet_connection_sock")->icsk_accept_queue->listen_opt->nr_table_entries
    # These two counters are read directly from $sk.
    sk_ack_backlog = $sk->sk_ack_backlog
    sk_max_ack_backlog = $sk->sk_max_ack_backlog

    printf("%02d:  qlen = %d, max_qlen_log = %d, nr_table_entries = %d, sk_ack_backlog = %d, sk_max_ack_backlog = %d\n",
            i, qlen, max_qlen_log, nr_table_entries, sk_ack_backlog, sk_max_ack_backlog)
    i++
}

服务端运行

# ./tcp_conn_request.stp -c './tcp_server -b 0' &

客户端发起20个初始连接

# ./tcp_client -i 22.99.22.111 -p 8888 -n 20

服务端输出为

01:  qlen = 0, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 0, sk_max_ack_backlog = 0
02:  qlen = 1, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 0, sk_max_ack_backlog = 0
03:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 0, sk_max_ack_backlog = 0
04:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
05:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
06:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
07:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
08:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
09:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
10:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
11:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
12:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
13:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
14:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
15:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
16:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
17:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
18:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
19:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
20:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
21:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
22:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
23:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
24:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
25:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
26:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
27:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
28:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
29:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
30:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
31:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
32:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
33:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
34:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
35:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
36:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
37:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
38:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
39:  qlen = 3, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
40:  qlen = 4, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
41:  qlen = 4, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
42:  qlen = 4, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
43:  qlen = 4, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
44:  qlen = 4, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
45:  qlen = 4, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
46:  qlen = 4, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
47:  qlen = 4, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
48:  qlen = 4, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
49:  qlen = 4, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
50:  qlen = 4, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
51:  qlen = 4, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
52:  qlen = 4, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
53:  qlen = 4, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
54:  qlen = 4, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
55:  qlen = 4, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
56:  qlen = 5, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
57:  qlen = 6, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
58:  qlen = 6, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
59:  qlen = 6, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
60:  qlen = 6, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
61:  qlen = 6, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
62:  qlen = 6, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
63:  qlen = 6, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
64:  qlen = 6, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
65:  qlen = 6, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
66:  qlen = 6, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
67:  qlen = 6, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
68:  qlen = 6, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
69:  qlen = 6, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
70:  qlen = 6, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
71:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
72:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
73:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
74:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
75:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
76:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
77:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
78:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
79:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
80:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
81:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
82:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
83:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
84:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
85:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
86:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
87:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
88:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
89:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
90:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
91:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
92:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
93:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
94:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
95:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
96:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
97:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
98:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
99:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
100:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
101:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
102:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
103:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
104:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
105:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
106:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
# ss -atn | grep :8888
LISTEN     1      0            *:8888                     *:*
SYN-RECV   0      0      22.99.22.111:8888               22.99.22.101:59896
SYN-RECV   0      0      22.99.22.111%if355557656:8888               22.99.22.101:59898
SYN-RECV   0      0      22.99.22.111:8888               22.99.22.101:59904
SYN-RECV   0      0      22.99.22.111:8888               22.99.22.101:59894
SYN-RECV   0      0      22.99.22.111%if355557655:8888               22.99.22.101:59892
SYN-RECV   0      0      22.99.22.111%if355557655:8888               22.99.22.101:59888
SYN-RECV   0      0      22.99.22.111:8888               22.99.22.101:59890
ESTAB      0      0      22.99.22.111:8888               22.99.22.101:59886
# dmesg
[39630.522777] TCP: drop open request from 22.99.22.101/59906
[39630.522886] TCP: drop open request from 22.99.22.101/59900
[39630.522972] TCP: drop open request from 22.99.22.101/59902
[39630.523002] TCP: drop open request from 22.99.22.101/59908
[39630.523023] TCP: drop open request from 22.99.22.101/59914
[39630.523042] TCP: drop open request from 22.99.22.101/59916
[39630.523062] TCP: drop open request from 22.99.22.101/59924
[39630.523081] TCP: drop open request from 22.99.22.101/59910
[39630.523762] TCP: drop open request from 22.99.22.101/59912
[39630.523771] TCP: drop open request from 22.99.22.101/59918

客户端输出为

# ss -atn | grep 8888
SYN-SENT   0      1      22.99.22.101:59900              22.99.22.111:8888
SYN-SENT   0      1      22.99.22.101:59918              22.99.22.111:8888
SYN-SENT   0      1      22.99.22.101:59906              22.99.22.111:8888
SYN-SENT   0      1      22.99.22.101:59910              22.99.22.111:8888
SYN-SENT   0      1      22.99.22.101:59914              22.99.22.111:8888
ESTAB      0      0      22.99.22.101:59904              22.99.22.111:8888
ESTAB      0      0      22.99.22.101:59894              22.99.22.111:8888
ESTAB      0      0      22.99.22.101:59890              22.99.22.111:8888
SYN-SENT   0      1      22.99.22.101:59922              22.99.22.111:8888
SYN-SENT   0      1      22.99.22.101:59902              22.99.22.111:8888
ESTAB      0      0      22.99.22.101:59888              22.99.22.111:8888
ESTAB      0      0      22.99.22.101:59896              22.99.22.111:8888
ESTAB      0      0      22.99.22.101:59892              22.99.22.111:8888
SYN-SENT   0      1      22.99.22.101:59916              22.99.22.111:8888
SYN-SENT   0      1      22.99.22.101:59920              22.99.22.111:8888
SYN-SENT   0      1      22.99.22.101:59908              22.99.22.111:8888
SYN-SENT   0      1      22.99.22.101:59912              22.99.22.111:8888
SYN-SENT   0      1      22.99.22.101:59924              22.99.22.111:8888
ESTAB      0      0      22.99.22.101:59898              22.99.22.111:8888

tcp_max_syn_backlog(8) * 3/4 = 6,当qlen大于6且未开启tcp_syncookies时,触发pr_drop_req()函数的执行,因此dmesg输出了pr_drop_req()打印的"...drop open request from..."信息。

将tcp_max_syn_backlog设置为12,再次观察

# echo 12 > /proc/sys/net/ipv4/tcp_max_syn_backlog
# ./tcp_conn_request.stp -c './tcp_server -b 0' &
# 01:  qlen = 0, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 0, sk_max_ack_backlog = 0
02:  qlen = 1, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 0, sk_max_ack_backlog = 0
03:  qlen = 1, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
04:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
05:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
06:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
07:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
08:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
09:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
10:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
11:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
12:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
13:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
14:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
15:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
16:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
17:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
18:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
19:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
20:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
21:  qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
22:  qlen = 3, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
23:  qlen = 3, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
24:  qlen = 3, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
25:  qlen = 3, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
26:  qlen = 3, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
27:  qlen = 3, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
28:  qlen = 3, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
29:  qlen = 3, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
30:  qlen = 3, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
31:  qlen = 3, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
32:  qlen = 3, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
33:  qlen = 3, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
34:  qlen = 3, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
35:  qlen = 3, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
36:  qlen = 3, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
37:  qlen = 3, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
38:  qlen = 3, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
39:  qlen = 4, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
40:  qlen = 5, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
41:  qlen = 5, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
42:  qlen = 5, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
43:  qlen = 5, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
44:  qlen = 5, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
45:  qlen = 5, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
46:  qlen = 5, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
47:  qlen = 5, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
48:  qlen = 5, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
49:  qlen = 5, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
50:  qlen = 5, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
51:  qlen = 5, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
52:  qlen = 5, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
53:  qlen = 5, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
54:  qlen = 5, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
55:  qlen = 6, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
56:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
57:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
58:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
59:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
60:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
61:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
62:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
63:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
64:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
65:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
66:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
67:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
68:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
69:  qlen = 8, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
70:  qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
71:  qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
72:  qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
73:  qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
74:  qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
75:  qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
76:  qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
77:  qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
78:  qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
79:  qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
80:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
81:  qlen = 8, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
82:  qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
83:  qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
84:  qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
85:  qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
86:  qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
87:  qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
88:  qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
89:  qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
90:  qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
91:  qlen = 8, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
92:  qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
93:  qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
94:  qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
95:  qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
96:  qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
97:  qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0

tcp_max_syn_backlog(12) * 3/4 = 9,qlen不大于9且未开启tcp_syncookies时,不满足pr_drop_req()的触发条件,因此dmesg不会输出pr_drop_req()打印的"...drop open request from..."信息;但由于全连接队列已满(sk_ack_backlog(1)大于sk_max_ack_backlog(0)),新的连接请求仍被丢弃,客户端超时重传。

未完待续。

转载于:https://my.oschina.net/superwjc/blog/1816385

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值