1. 概述
github项目地址:https://github.com/superwujc
尊重原创,欢迎转载,注明出处:https://my.oschina.net/superwjc/blog/1816385
Linux下创建TCP服务端的流程为依次调用socket(2),bind(2),listen(2),accept(2),其中,listen(2)将本端套接字设置为被动套接字(passive socket),使之进入监听状态,以等待客户端发起的主动连接。
#include <sys/types.h>
#include <sys/socket.h>
int listen(int sockfd, int backlog);
对于listen(2)的backlog参数,网上大多数文章表意不清,或有歧义,且缺乏有力的实证;本文将从内核源码的角度分析被动套接字的创建流程,包括backlog参数的具体含义,以及与之相关的内核参数somaxconn,tcp_max_syn_backlog,tcp_syncookies。
2. 示例程序
操作系统版本为CentOS Linux release 7.5.1804 (Core),内核版本为3.10.0-862.2.3.el7.x86_64。
服务端:tcp_server.c,以命令行选项-b指定listen(2)的backlog参数
/* tcp_server.c */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <string.h>
#include <errno.h>
#define SA struct sockaddr
#define ERR_EXIT(msg, ...) \
do { fprintf(stderr, msg, ##__VA_ARGS__); exit(EXIT_FAILURE); } while (0)
/*
 * tcp_server: create a TCP socket, bind it to INADDR_ANY:8888, and put it
 * into the listening state with the backlog given by -b. The program then
 * idles forever and never calls accept(2), so incoming connections stay in
 * the kernel queues where they can be observed.
 *
 * Usage: tcp_server -b <backlog>
 *
 * Exits with EXIT_FAILURE on a usage error, a malformed backlog value, or
 * when socket(2), bind(2) or listen(2) fails.
 */
int main(int argc, char *argv[])
{
    int lfd, c;
    int backlog = 0;
    int backlog_set = 0;    /* "-b 0" is a valid request, so track presence separately */
    struct sockaddr_in svaddr;
    socklen_t svaddrlen;

    if (argc < 3)
        ERR_EXIT("Usage: %s -b <backlog>\n", argv[0]);

    while ((c = getopt(argc, argv, "b:")) != -1) {
        char *end;
        long val;

        if (c != 'b')
            ERR_EXIT("Usage: %s -b <backlog>\n", argv[0]);

        /* strtol() instead of atoi(): reject empty, non-numeric or trailing-junk input. */
        errno = 0;
        val = strtol(optarg, &end, 10);
        if (end == optarg || *end != '\0' || errno == ERANGE)
            ERR_EXIT("invalid backlog: %s\n", optarg);
        /* Round-trip through int to reject values that do not fit listen(2)'s int argument. */
        if ((long)(int)val != val)
            ERR_EXIT("invalid backlog: %s\n", optarg);

        backlog = (int)val;
        backlog_set = 1;
    }

    /* argc >= 3 does not guarantee that -b was actually supplied. */
    if (!backlog_set)
        ERR_EXIT("Usage: %s -b <backlog>\n", argv[0]);

    lfd = socket(AF_INET, SOCK_STREAM, 0);
    if (lfd == -1)
        ERR_EXIT("socket() failed: %s\n", strerror(errno));

    svaddrlen = sizeof(svaddr);
    memset(&svaddr, 0, svaddrlen);
    svaddr.sin_family = AF_INET;
    svaddr.sin_addr.s_addr = htonl(INADDR_ANY);
    svaddr.sin_port = htons(8888);

    if (bind(lfd, (SA *)&svaddr, svaddrlen) == -1)
        ERR_EXIT("bind() failed: %s\n", strerror(errno));

    if (listen(lfd, backlog) == -1)
        ERR_EXIT("listen() failed: %s\n", strerror(errno));

    /* Deliberately never accept(2): keep the listening socket alive and idle. */
    for ( ; ; )
        sleep(1);

    exit(EXIT_SUCCESS);    /* not reached */
}
客户端程序:tcp_client.c,命令行选项-i/-p/-n分别指定服务端ip,端口,以及发起连接的数量
/* tcp_client.c */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <signal.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <string.h>
#include <errno.h>
#define SA struct sockaddr
#define ERR_EXIT(msg, ...) \
do { fprintf(stderr, msg, ##__VA_ARGS__); exit(EXIT_FAILURE); } while (0)
static void connect_to_server(char *, int);
/*
 * Parse a complete decimal string into an int.
 *
 * Returns 0 and stores the value in *out on success; returns -1 if the
 * string is empty, has trailing characters, or does not fit in an int.
 */
static int parse_int(const char *s, int *out)
{
    char *end;
    long v;

    errno = 0;
    v = strtol(s, &end, 10);
    if (end == s || *end != '\0' || errno == ERANGE)
        return -1;
    /* Round-trip through int to detect values outside the int range. */
    if ((long)(int)v != v)
        return -1;

    *out = (int)v;
    return 0;
}

/*
 * tcp_client: fork <count> children, each of which opens one TCP connection
 * to <ip>:<port> and then holds it open indefinitely. The parent idles
 * forever so the whole process group can be stopped together.
 *
 * Usage: tcp_client -i <ip> -p <port> -n <count>
 *
 * Exits with EXIT_FAILURE on a usage error, an invalid port or count, or
 * when sigaction(2) or fork(2) fails.
 */
int main(int argc, char *argv[])
{
    int i, c, val;
    int nr_children = 1;
    char *svip = NULL;
    in_port_t svport = 0;
    struct sigaction sa;

    if (argc < 5)
        ERR_EXIT("Usage: %s -i <ip> -p <port> -n <count>\n", argv[0]);

    /*
     * Ignore SIGCHLD so terminated children are not left as zombies.
     * Zero the whole structure first: sa_flags must not be left indeterminate.
     */
    memset(&sa, 0, sizeof(sa));
    sigemptyset(&sa.sa_mask);
    sa.sa_handler = SIG_IGN;
    if (sigaction(SIGCHLD, &sa, NULL) == -1)
        ERR_EXIT("sigaction() failed: %s\n", strerror(errno));

    while ((c = getopt(argc, argv, "i:p:n:")) != -1) {
        switch (c) {
        case 'i':
            svip = optarg;
            break;
        case 'p':
            /* A TCP port must fit in 16 bits; port 0 is not connectable. */
            if (parse_int(optarg, &val) == -1 || val < 1 || val > 65535)
                ERR_EXIT("invalid dst port\n");
            svport = (in_port_t)val;
            break;
        case 'n':
            if (parse_int(optarg, &val) == -1 || val < 1)
                ERR_EXIT("invalid connection count: %s\n", optarg);
            nr_children = val;
            break;
        default:
            exit(EXIT_FAILURE);
        }
    }

    if (!svip)
        ERR_EXIT("invalid dst ip\n");
    if (!svport)
        ERR_EXIT("invalid dst port\n");

    for (i = 0; i < nr_children; i++) {
        switch (fork()) {
        case -1:
            ERR_EXIT("fork() failed:%s\n", strerror(errno));
        case 0:
            /* Child: connect and hold the connection; does not return normally. */
            connect_to_server(svip, svport);
            _exit(EXIT_SUCCESS);
        default:
            /* Parent owns no socket: each child creates its own descriptor. */
            break;
        }
    }

    for ( ; ; )
        sleep(1);

    exit(EXIT_SUCCESS);    /* not reached */
}
/*
 * Open one TCP connection to svip:svport and keep it open forever.
 *
 * svip   - server address in IPv4 dotted-decimal notation
 * svport - server port in host byte order
 *
 * Does not return: on success it sleeps in an endless loop holding the
 * connection; on any failure it prints a message and terminates the
 * process with EXIT_FAILURE.
 */
static void connect_to_server(char *svip, int svport)
{
    int cfd, rc;
    struct sockaddr_in svaddr;
    socklen_t svaddrlen;

    svaddrlen = sizeof(svaddr);
    memset(&svaddr, 0, svaddrlen);
    svaddr.sin_family = AF_INET;
    svaddr.sin_port = htons((in_port_t)svport);

    /*
     * inet_pton() returns 1 on success, 0 for a malformed address and -1 on
     * error. Without this check a bad address would leave sin_addr zeroed
     * and the connect() below would silently target 0.0.0.0.
     */
    rc = inet_pton(AF_INET, svip, &svaddr.sin_addr);
    if (rc == 0)
        ERR_EXIT("invalid dst ip: %s\n", svip);
    if (rc == -1)
        ERR_EXIT("inet_pton() failed: %s\n", strerror(errno));

    cfd = socket(AF_INET, SOCK_STREAM, 0);
    if (cfd == -1)
        ERR_EXIT("socket() failed: %s\n", strerror(errno));

    if (connect(cfd, (SA *)&svaddr, svaddrlen) == -1)
        ERR_EXIT("connect() failed: %s\n", strerror(errno));

    /* Hold the connection open so it stays visible in the server's queues. */
    for ( ; ; )
        sleep(1);
}
3. 内核数据结构
内核为TCP的连接与传输维持2个队列:已完成队列与未完成队列;已完成队列在逻辑上又可划分为半连接队列与全连接队列。
半连接队列通过request_sock_queue结构表示,该结构中包括request_sock结构与listen_sock结构,其中,listen_sock结构的max_qlen_log与qlen字段共同决定半连接队列允许的最大长度:qlen表示半连接队列的当前长度,max_qlen_log表示以2为底数的半连接队列最大长度的对数。
/* include/net/request_sock.h */
151 struct request_sock_queue {
152 struct request_sock *rskq_accept_head;
153 struct request_sock *rskq_accept_tail;
...
157 struct listen_sock *listen_opt;
...
164 };
50 struct request_sock {
51 struct sock_common __req_common;
52 struct request_sock *dl_next;
53 u16 mss;
54 u8 num_retrans; /* number of retransmits */
55 u8 cookie_ts:1; /* syncookie: encode tcpopts in timestamp */
56 u8 num_timeout:7; /* number of timeouts */
57 /* The following two fields can be easily recomputed I think -AK */
58 u32 window_clamp; /* window clamp at creation time */
59 u32 rcv_wnd; /* rcv_wnd offered first time */
60 u32 ts_recent;
61 unsigned long expires;
62 const struct request_sock_ops *rsk_ops;
63 struct sock *sk;
64 u32 secid;
65 u32 peer_secid;
66 };
91 /** struct listen_sock - listen state
92 *
93 * @max_qlen_log - log_2 of maximal queued SYNs/REQUESTs
94 */
95 struct listen_sock {
96 u8 max_qlen_log;
97 u8 synflood_warned;
98 /* 2 bytes hole, try to use */
99 int qlen;
100 int qlen_young;
101 int clock_hand;
102 u32 hash_rnd;
103 u32 nr_table_entries;
104 struct request_sock *syn_table[0];
105 };
全连接队列通过sock结构中的sk_ack_backlog与sk_max_ack_backlog字段共同表示,分别表示全连接队列的当前长度与最大长度。
/* include/net/sock.h */
303 struct sock {
...
408 unsigned short sk_ack_backlog;
409 unsigned short sk_max_ack_backlog;
...
460 };
对于每一个传入的TCP连接请求,内核将先后检查半连接与全连接队列的当前长度,根据队列是否已满选择相应的处理方式。
/* include/net/request_sock.h */
258 static inline int reqsk_queue_is_full(const struct request_sock_queue *queue)
259 {
260 return queue->listen_opt->qlen >> queue->listen_opt->max_qlen_log;
261 }
/* include/net/sock.h */
815 static inline bool sk_acceptq_is_full(const struct sock *sk)
816 {
817 return sk->sk_ack_backlog > sk->sk_max_ack_backlog;
818 }
4. 内核处理流程
4.1 - TCP协议初始化
TCP协议的源码以静态的方式编译至内核映像,随系统启动而完成初始化,包括流套接字创建,接收与发送等操作时执行的一系列回调函数,以及ipv4/ipv6支持的特定操作等。
首先通过tcp_prot结构指定TCP套接字的内核回调函数,包括用于初始化的tcp_v4_init_sock()以及用于接收数据包的函数tcp_v4_do_rcv()等,封装在ipv4中的TCP另包含特定的处理函数tcp_v4_conn_request()与tcp_v4_syn_recv_sock()等:
/* net/ipv4/tcp_ipv4.c */
2428 struct proto tcp_prot = {
2429 .name = "TCP",
2430 .owner = THIS_MODULE,
...
2436 .init = tcp_v4_init_sock,
...
2444 .backlog_rcv = tcp_v4_do_rcv,
...
2473 };
/* net/ipv4/tcp_ipv4.c */
1843 static int tcp_v4_init_sock(struct sock *sk)
1844 {
...
1849 icsk->icsk_af_ops = &ipv4_specific;
...
1855 return 0;
1856 };
/* net/ipv4/tcp_ipv4.c */
1811 const struct inet_connection_sock_af_ops ipv4_specific = {
...
1816 .conn_request = tcp_v4_conn_request,
1817 .syn_recv_sock = tcp_v4_syn_recv_sock,
...
1829 };
然后通过inet_stream_ops结构指定TCP套接字支持的方法,这些内核函数分别对应于用户空间的bind(2),listen(2)等:
/* net/ipv4/af_inet.c */
904 const struct proto_ops inet_stream_ops = {
905 .family = PF_INET,
906 .owner = THIS_MODULE,
907 .release = inet_release,
908 .bind = inet_bind,
909 .connect = inet_stream_connect,
910 .socketpair = sock_no_socketpair,
911 .accept = inet_accept,
912 .getname = inet_getname,
913 .poll = tcp_poll,
914 .ioctl = inet_ioctl,
915 .listen = inet_listen,
916 .shutdown = inet_shutdown,
917 .setsockopt = sock_common_setsockopt,
918 .getsockopt = sock_common_getsockopt,
919 .sendmsg = inet_sendmsg,
920 .recvmsg = inet_recvmsg,
921 .mmap = sock_no_mmap,
922 .sendpage = inet_sendpage,
923 .splice_read = tcp_splice_read,
924 #ifdef CONFIG_COMPAT
925 .compat_setsockopt = compat_sock_common_setsockopt,
926 .compat_getsockopt = compat_sock_common_getsockopt,
927 .compat_ioctl = inet_compat_ioctl,
928 #endif
929 };
包含tcp_prot与inet_stream_ops的套接字类型数组inetsw_array:
/* net/ipv4/af_inet.c */
998 static struct inet_protosw inetsw_array[] =
999 {
1000 {
1001 .type = SOCK_STREAM,
1002 .protocol = IPPROTO_TCP,
1003 .prot = &tcp_prot,
1004 .ops = &inet_stream_ops,
1005 .flags = INET_PROTOSW_PERMANENT |
1006 INET_PROTOSW_ICSK,
1007 },
...
1032 };
用于注册协议的tcp_protocol,其中包含接收数据包时调用的内核函数tcp_v4_rcv():
/* net/ipv4/af_inet.c */
1544 static const struct net_protocol tcp_protocol = {
1545 .early_demux = tcp_v4_early_demux,
1546 .handler = tcp_v4_rcv,
1547 .err_handler = tcp_v4_err,
1548 .no_policy = 1,
1549 .netns_ok = 1,
1550 };
初始化操作最终通过inet_init()函数实现,将以上设置的tcp_prot,tcp_protocol,inetsw_array[]进行整合,最后调用tcp_init()函数,打印Hash tables configured (established %u bind %u)信息,通过dmesg可以查看:
/* net/ipv4/af_inet.c */
1683 static int __init inet_init(void)
1684 {
...
1695 rc = proto_register(&tcp_prot, 1);
...
1731 if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
...
1742 for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
1743 inet_register_protosw(q);
...
1757 tcp_v4_init();
...
1760 tcp_init();
...
1881 };
/* net/ipv4/tcp.c */
3098 void __init tcp_init(void)
3099 {
...
3171 pr_info("Hash tables configured (established %u bind %u)\n",
3172 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
...
3177 };
# dmesg | grep 'Hash tables configured'
[ 1.511233] TCP: Hash tables configured (established 8192 bind 8192)
4.2 - listen(2)系统调用
listen(2)系统调用对应的内核函数为sys_listen():
# stap -L 'kernel.function("sys_listen")'
kernel.function("SyS_listen@net/socket.c:1576") $fd:long int $backlog:long int
/* net/socket.c */
1576 SYSCALL_DEFINE2(listen, int, fd, int, backlog)
1577 {
1578 struct socket *sock;
1579 int err, fput_needed;
1580 int somaxconn;
1581
1582 sock = sockfd_lookup_light(fd, &err, &fput_needed);
1583 if (sock) {
1584 somaxconn = sock_net(sock->sk)->core.sysctl_somaxconn;
1585 if ((unsigned int)backlog > somaxconn)
1586 backlog = somaxconn;
1587
1588 err = security_socket_listen(sock, backlog);
1589 if (!err)
1590 err = sock->ops->listen(sock, backlog);
1591
1592 fput_light(sock->file, fput_needed);
1593 }
1594 return err;
1595 }
1584 - 1586行,将手动指定的backlog与系统的somaxconn进行比较,取最小值。
1590行,调用sock->ops->listen()回调函数,即初始化阶段通过inet_stream_ops.listen设置的inet_listen()函数。
somaxconn值可以通过sysctl接口的net.core.somaxconn参数,或proc接口的/proc/sys/net/core/somaxconn伪文件指定;proc项与内核参数的对应关系定义在net/ipv4/sysctl_net_ipv4.c与net/core/sysctl_net_core.c中
/* net/core/sysctl_net_core.c */
297 static struct ctl_table netns_core_table[] = {
298 {
299 .procname = "somaxconn",
300 .data = &init_net.core.sysctl_somaxconn,
301 .maxlen = sizeof(int),
302 .mode = 0644,
303 .extra1 = &zero,
304 .extra2 = &ushort_max,
305 .proc_handler = proc_dointvec_minmax
306 },
307 { }
308 };
# stap -L 'kernel.function("inet_listen")'
kernel.function("inet_listen@net/ipv4/af_inet.c:193") $sock:struct socket* $backlog:int
/* net/ipv4/af_inet.c */
193 int inet_listen(struct socket *sock, int backlog)
194 {
195 struct sock *sk = sock->sk;
196 unsigned char old_state;
197 int err;
198
199 lock_sock(sk);
200
201 err = -EINVAL;
202 if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
203 goto out;
204
205 old_state = sk->sk_state;
206 if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
207 goto out;
208
209 /* Really, if the socket is already in listen state
210 * we can only allow the backlog to be adjusted.
211 */
212 if (old_state != TCP_LISTEN) {
213 /* Check special setups for testing purpose to enable TFO w/o
214 * requiring TCP_FASTOPEN sockopt.
215 * Note that only TCP sockets (SOCK_STREAM) will reach here.
216 * Also fastopenq may already been allocated because this
217 * socket was in TCP_LISTEN state previously but was
218 * shutdown() (rather than close()).
219 */
220 if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 &&
221 inet_csk(sk)->icsk_accept_queue.fastopenq == NULL) {
222 if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0)
223 err = fastopen_init_queue(sk, backlog);
224 else if ((sysctl_tcp_fastopen &
225 TFO_SERVER_WO_SOCKOPT2) != 0)
226 err = fastopen_init_queue(sk,
227 ((uint)sysctl_tcp_fastopen) >> 16);
228 else
229 err = 0;
230 if (err)
231 goto out;
232
233 tcp_fastopen_init_key_once(true);
234 }
235 err = inet_csk_listen_start(sk, backlog);
236 if (err)
237 goto out;
238 }
239 sk->sk_max_ack_backlog = backlog;
240 err = 0;
241
242 out:
243 release_sock(sk);
244 return err;
245 }
通过systemtap追踪该函数的执行流程:
# stap -c './tcp_server -b 3' -e 'probe kernel.statement("inet_listen@net/ipv4/af_inet.c:*") {printf("%s\n", pp())}'
WARNING: probe kernel.statement("inet_listen@net/ipv4/af_inet.c:220") (address 0xffffffff89a6ef47) registration error (rc -22)
kernel.statement("inet_listen@net/ipv4/af_inet.c:194")
kernel.statement("inet_listen@net/ipv4/af_inet.c:195")
kernel.statement("inet_listen@net/ipv4/af_inet.c:202")
kernel.statement("inet_listen@net/ipv4/af_inet.c:201")
kernel.statement("inet_listen@net/ipv4/af_inet.c:205")
kernel.statement("inet_listen@net/ipv4/af_inet.c:206")
kernel.statement("inet_listen@net/ipv4/af_inet.c:212")
kernel.statement("inet_listen@net/ipv4/af_inet.c:235")
kernel.statement("inet_listen@net/ipv4/af_inet.c:236")
kernel.statement("inet_listen@net/ipv4/af_inet.c:239")
kernel.statement("inet_listen@net/ipv4/af_inet.c:240")
kernel.statement("inet_listen@net/ipv4/af_inet.c:243")
kernel.statement("inet_listen@net/ipv4/af_inet.c:245")
可以得出该函数执行的主要操作为:
235行,调用inet_csk_listen_start()函数
239行,inet_csk_listen_start()函数返回后,将backlog作为全连接队列长度的最大值sk_max_ack_backlog。
# stap -L 'kernel.function("inet_csk_listen_start")'
kernel.function("inet_csk_listen_start@net/ipv4/inet_connection_sock.c:758") $sk:struct sock* $nr_table_entries:int const
/* net/ipv4/inet_connection_sock.c */
758 int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
759 {
760 struct inet_sock *inet = inet_sk(sk);
761 struct inet_connection_sock *icsk = inet_csk(sk);
762 int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
763
764 if (rc != 0)
765 return rc;
766
767 sk->sk_max_ack_backlog = 0;
768 sk->sk_ack_backlog = 0;
769 inet_csk_delack_init(sk);
770
771 /* There is race window here: we announce ourselves listening,
772 * but this transition is still not validated by get_port().
773 * It is OK, because this socket enters to hash table only
774 * after validation is complete.
775 */
776 sk->sk_state = TCP_LISTEN;
777 if (!sk->sk_prot->get_port(sk, inet->inet_num)) {
778 inet->inet_sport = htons(inet->inet_num);
779
780 sk_dst_reset(sk);
781 sk->sk_prot->hash(sk);
782
783 return 0;
784 }
785
786 sk->sk_state = TCP_CLOSE;
787 __reqsk_queue_destroy(&icsk->icsk_accept_queue);
788 return -EADDRINUSE;
789 }
inet_csk_listen_start()函数的nr_table_entries参数对应于inet_listen()函数的backlog参数。
762行,调用reqsk_queue_alloc()函数,分配已完成队列(请求连接队列)
767 - 768行,reqsk_queue_alloc()函数返回后,将全连接队列的sk_max_ack_backlog与sk_ack_backlog的初始值设置为0。
776行,将套接字的状态设置为TCP_LISTEN,即监听状态。
# stap -L 'kernel.function("reqsk_queue_alloc")'
kernel.function("reqsk_queue_alloc@net/core/request_sock.c:40") $queue:struct request_sock_queue* $nr_table_entries:unsigned int $lopt_size:size_t
/* net/core/request_sock.c */
37 int sysctl_max_syn_backlog = 256;
38 EXPORT_SYMBOL(sysctl_max_syn_backlog);
39
40 int reqsk_queue_alloc(struct request_sock_queue *queue,
41 unsigned int nr_table_entries)
42 {
43 size_t lopt_size = sizeof(struct listen_sock);
44 struct listen_sock *lopt;
45
46 nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
47 nr_table_entries = max_t(u32, nr_table_entries, 8);
48 nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
49 lopt_size += nr_table_entries * sizeof(struct request_sock *);
50 if (lopt_size > PAGE_SIZE)
51 lopt = vzalloc(lopt_size);
52 else
53 lopt = kzalloc(lopt_size, GFP_KERNEL);
54 if (lopt == NULL)
55 return -ENOMEM;
56
57 for (lopt->max_qlen_log = 3;
58 (1 << lopt->max_qlen_log) < nr_table_entries;
59 lopt->max_qlen_log++);
60
61 get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
62 rwlock_init(&queue->syn_wait_lock);
63 queue->rskq_accept_head = NULL;
64 lopt->nr_table_entries = nr_table_entries;
65
66 write_lock_bh(&queue->syn_wait_lock);
67 queue->listen_opt = lopt;
68 write_unlock_bh(&queue->syn_wait_lock);
69
70 return 0;
71 }
37行,将sysctl_max_syn_backlog的值显式设置为256,但使用该变量的所有内核函数,包括reqsk_queue_alloc(),仍按照/proc/sys/net/ipv4/tcp_max_syn_backlog或net.ipv4.tcp_max_syn_backlog指定的值执行。可通过以下systemtap脚本验证:
# cat /proc/sys/net/ipv4/tcp_max_syn_backlog
128
#
# stap -e 'probe kernel.statement("reqsk_queue_alloc@net/core/request_sock.c:71") {printf("%d\n", $sysctl_max_syn_backlog)}' -c './tcp_server -b 3'
128
46 - 48行,设置nr_table_entries参数的最终值,并在64行赋予struct request_sock_queue中struct listen_sock的nr_table_entries字段。
min_t(u32, nr_table_entries, sysctl_max_syn_backlog):在nr_table_entries与sysctl_max_syn_backlog之间取最小值。
max_t(u32, nr_table_entries, 8):在nr_table_entries与8之间取最大值。
roundup_pow_of_two(nr_table_entries + 1):按(nr_table_entries + 1)向上取2的正整数次幂。经过前2步,可以确定nr_table_entries的最小值为8,因此执行第3步后,nr_table_entries的最小值为(8 + 1)向上取2的正整数次幂,即16;而nr_table_entries的最大值由somaxconn,tcp_max_syn_backlog以及listen(2)的backlog参数三者中的最小值决定。
49 - 55行,为半连接队列struct listen_sock分配空间,长度为sizeof(struct listen_sock) + nr_table_entries * sizeof(struct request_sock *);struct request_sock的空间此时并未分配,直到接收到客户端的第一个syn包。
57 - 59行,设置max_qlen_log的最终值;qlen未在此处显式赋值,由于listen_sock通过vzalloc()/kzalloc()分配并清零,其初始值为0。
max_qlen_log表示"log of max q len",即以2为底数,半连接队列长度最大值的对数,该值用于测试半连接队列是否已满。
首先设置初始值为3,然后计算1算术左移max_qlen_log位的结果,即2的max_qlen_log次幂,若小于nr_table_entries,则max_qlen_log自增;由于nr_table_entries的最小值为16,因此max_qlen_log的最小值为初始值3自增1次,即4,此时满足2 ^ max_qlen_log == nr_table_entries,而判断半连接队列是否已满的条件为qlen >> max_qlen_log返回1或0,因此半连接队列的长度为(nr_table_entries - 1)
至此,TCP被动套接字的设置完毕,各函数按与调用相反的顺序依次返回:
reqsk_queue_alloc() -> inet_csk_listen_start() -> inet_listen() -> sys_listen()
此过程根据listen(2)的backlog参数,系统的somaxconn以及tcp_max_syn_backlog参数,设置了已完成连接队列的长度,包括用于控制半连接队列的qlen与max_qlen_log,以及用于控制全连接队列的sk_ack_backlog与sk_max_ack_backlog,并将套接字的状态设置为LISTEN。
4.3 - 接收连接
负责对新传入的连接进行处理的函数为TCP协议初始化阶段设置的tcp_v4_rcv() -> tcp_v4_do_rcv() -> tcp_rcv_state_process() -> tcp_v4_conn_request() -> tcp_conn_request()。
/* net/ipv4/tcp_ipv4.c */
1287 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1288 .mss_clamp = TCP_MSS_DEFAULT,
1289 #ifdef CONFIG_TCP_MD5SIG
1290 .md5_lookup = tcp_v4_reqsk_md5_lookup,
1291 .calc_md5_hash = tcp_v4_md5_hash_skb,
1292 #endif
1293 .init_req = tcp_v4_init_req,
1294 #ifdef CONFIG_SYN_COOKIES
1295 .cookie_init_seq = cookie_v4_init_sequence,
1296 #endif
1297 .route_req = tcp_v4_route_req,
1298 .init_seq = tcp_v4_init_sequence,
1299 .send_synack = tcp_v4_send_synack,
1300 .queue_hash_add = inet_csk_reqsk_queue_hash_add,
1301 };
1302
1303 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1304 {
1305 /* Never answer to SYNs send to broadcast or multicast */
1306 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1307 goto drop;
1308
1309 return tcp_conn_request(&tcp_request_sock_ops,
1310 &tcp_request_sock_ipv4_ops, sk, skb);
1311 drop:
1312 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1313 return 0;
1314 }
/* net/ipv4/tcp_input.c */
5998 int tcp_conn_request(struct request_sock_ops *rsk_ops,
5999 const struct tcp_request_sock_ops *af_ops,
6000 struct sock *sk, struct sk_buff *skb)
6001 {
...
6017 if ((sysctl_tcp_syncookies == 2 ||
6018 inet_csk_reqsk_queue_is_full(sk)) && !isn) {
6019 want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
6020 if (!want_cookie)
6021 goto drop;
6022 }
...
6030 if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
6031 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
6032 goto drop;
6033 }
6034
6035 req = inet_reqsk_alloc(rsk_ops);
6036 if (!req)
6037 goto drop;
...
6081 /* Kill the following clause, if you dislike this way. */
6082 else if (!sysctl_tcp_syncookies &&
6083 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
6084 (sysctl_max_syn_backlog >> 2)) &&
6085 !tcp_peer_is_proven(req, dst, false,
6086 tmp_opt.saw_tstamp)) {
6087 /* Without syncookies last quarter of
6088 * backlog is filled with destinations,
6089 * proven to be alive.
6090 * It means that we continue to communicate
6091 * to destinations, already remembered
6092 * to the moment of synflood.
6093 */
6094 pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
6095 rsk_ops->family);
6096 goto drop_and_release;
6097 }
6098
6099 isn = af_ops->init_seq(skb);
6100 }
...
6120 err = af_ops->send_synack(sk, dst, &fl, req,
6121 skb_get_queue_mapping(skb), &foc);
6122 if (!fastopen) {
6123 if (err || want_cookie)
6124 goto drop_and_free;
6125
6126 tcp_rsk(req)->listener = NULL;
6127 af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
6128 }
6129
6130 return 0;
6131
6132 drop_and_release:
6133 dst_release(dst);
6134 drop_and_free:
6135 reqsk_free(req);
6136 drop:
6137 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
6138 return 0;
6139 }
该函数中的isn为TCP控制块struct tcp_skb_cb中的一个字段,表示在套接字的TIME-WAIT阶段设置的初始序列号(initial sequence number),首次建立连接时,该字段值为0。
6017 - 6022行,若tcp_syncookies的值为2,或半连接队列已满,且isn为0,则调用tcp_syn_flood_action():若开启了tcp_syncookies,该函数返回true,后续以cookie方式应答;否则直接丢弃数据包且不对客户端做出响应。
/* include/net/inet_connection_sock.h */
306 static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk)
307 {
308 return reqsk_queue_is_full(&inet_csk(sk)->icsk_accept_queue);
309 }
/* include/net/request_sock.h */
258 static inline int reqsk_queue_is_full(const struct request_sock_queue *queue)
259 {
260 return queue->listen_opt->qlen >> queue->listen_opt->max_qlen_log;
261 }
/* net/ipv4/tcp_ipv4.c */
865 bool tcp_syn_flood_action(struct sock *sk,
866 const struct sk_buff *skb,
867 const char *proto)
868 {
869 const char *msg = "Dropping request";
870 bool want_cookie = false;
871 struct listen_sock *lopt;
872
873
874
875 #ifdef CONFIG_SYN_COOKIES
876 if (sysctl_tcp_syncookies) {
877 msg = "Sending cookies";
878 want_cookie = true;
879 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
880 } else
881 #endif
882 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
883
884 lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
885 if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) {
886 lopt->synflood_warned = 1;
887 pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
888 proto, ntohs(tcp_hdr(skb)->dest), msg);
889 }
890 return want_cookie;
891 }
6030 - 6033行,检查全连接队列是否已满,且半连接队列中的年轻(young)连接数是否大于1,若是,则直接丢弃且不对客户端做出任何响应。年轻的连接表示三次握手的第二阶段(TCP服务端向客户端发送初始SYN+ACK)未超时的连接。
6035行,首次为新传入的连接请求分配struct request_sock结构实例。
6082 - 6097行,若未开启tcp_syncookies,半连接队列的剩余空间(sysctl_max_syn_backlog - qlen)已不足sysctl_max_syn_backlog的1/4,且tcp_peer_is_proven()未能证实对端存活,则调用pr_drop_req()打印信息,并丢弃该连接请求。
未开启tcp_syncookies时,内核参数sysctl_max_syn_backlog除了决定半连接队列的最大长度,还根据半连接队列的当前长度决定未完成连接的上限值,即
(sysctl_max_syn_backlog - qlen) < (sysctl_max_syn_backlog >> 2),qlen的数值大于sysctl_max_syn_backlog的3/4时,将丢弃当前连接请求。
/* Documentation/networking/ip-sysctl.txt */
tcp_max_syn_backlog - INTEGER
Maximal number of remembered connection requests, which have not
received an acknowledgment from connecting client.
The minimal value is 128 for low memory machines, and it will
increase in proportion to the memory of machine.
If server suffers from overload, try increasing this number.
/* include/net/inet_connection_sock.h */
296 static inline int inet_csk_reqsk_queue_len(const struct sock *sk)
297 {
298 return reqsk_queue_len(&inet_csk(sk)->icsk_accept_queue);
299 }
/* include/net/request_sock.h */
248 static inline int reqsk_queue_len(const struct request_sock_queue *queue)
249 {
250 return queue->listen_opt != NULL ? queue->listen_opt->qlen : 0;
251 }
/* net/ipv4/tcp_input.c */
5949 static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
5950 {
5951 struct inet_request_sock *ireq = inet_rsk(req);
5952
5953 if (family == AF_INET)
5954 LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
5955 &ireq->ir_rmt_addr, port);
5956 #if IS_ENABLED(CONFIG_IPV6)
5957 else if (family == AF_INET6)
5958 LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI6/%u\n"),
5959 &ireq->ir_v6_rmt_addr, port);
5960 #endif
5961 }
6120 - 6121行,向客户端发送SYN+ACK,执行三次握手的第二阶段。
6127行,执行queue_hash_add()回调函数,即inet_csk_reqsk_queue_hash_add(),更新半连接队列的当前长度qlen,并设置超时计时器。
/* net/ipv4/inet_connection_sock.c */
526 void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
527 unsigned long timeout)
528 {
529 struct inet_connection_sock *icsk = inet_csk(sk);
530 struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
531 const u32 h = inet_synq_hash(inet_rsk(req)->ir_rmt_addr,
532 inet_rsk(req)->ir_rmt_port,
533 lopt->hash_rnd, lopt->nr_table_entries);
534
535 reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
536 inet_csk_reqsk_queue_added(sk, timeout);
537 }
/* include/net/inet_connection_sock.h */
289 static inline void inet_csk_reqsk_queue_added(struct sock *sk,
290 const unsigned long timeout)
291 {
292 if (reqsk_queue_added(&inet_csk(sk)->icsk_accept_queue) == 0)
293 inet_csk_reset_keepalive_timer(sk, timeout);
294 }
/* include/net/request_sock.h */
238 static inline int reqsk_queue_added(struct request_sock_queue *queue)
239 {
240 struct listen_sock *lopt = queue->listen_opt;
241 const int prev_qlen = lopt->qlen;
242
243 lopt->qlen_young++;
244 lopt->qlen++;
245 return prev_qlen;
246 }
5. 杂项&总结
各种文章使用了各种不同的术语来描述backlog, somaxconn与tcp_max_syn_backlog,包括半连接队列和已连接队列,请求连接队列与等待连接队列等,导致理解方面的困难;本文根据listen(2)的手册页,将这些队列划分为已完成队列与未完成队列;而listen(2)的backlog同时参与了sk_max_ack_backlog与max_qlen_log的设置,因此又将已完成队列内部划分为半连接队列与全连接队列。
The behavior of the backlog argument on TCP sockets changed with Linux 2.2. Now it spec‐
ifies the queue length for completely established sockets waiting to be accepted, instead
of the number of incomplete connection requests. The maximum length of the queue for
incomplete sockets can be set using /proc/sys/net/ipv4/tcp_max_syn_backlog. When syn‐
cookies are enabled there is no logical maximum length and this setting is ignored. See
tcp(7) for more information.
不论术语如何称谓,以下规则总是生效:
- 全连接队列的长度由listen(2)的backlog参数与somaxconn共同决定:二者间的最小值作为全连接队列的长度(sk_max_ack_backlog),全连接队列长度的当前值(sk_ack_backlog)初始为0
- 半连接队列的长度由listen(2)的backlog参数,somaxconn与tcp_max_syn_backlog共同决定:三者间的最小值(且不小于8)+1后向上取2的正整数次幂,可以得出nr_table_entries的最终值,(nr_table_entries - 1)即为半连接队列的最大长度(qlen为其当前长度)
- 仅当未开启tcp_syncookies时,tcp_max_syn_backlog才会决定未完成队列的长度,即未被客户端确认的SYN+ACK数量:当 qlen > 3/4 * tcp_max_syn_backlog 时,新的连接请求将被丢弃
- Linux下TCP的三次握手过程发生在accept(2)系统调用之前:成功建立连接的客户端连接保存在全连接队列中
6. 示例
服务端IP为22.99.22.111,客户端IP为22.99.22.101
服务端关闭tcp_syncookies,将somaxconn与tcp_max_syn_backlog分别设置为128与8,listen(2)的backlog参数设置为0,通过systemtap脚本观察内核参数:
# echo 0 > /proc/sys/net/ipv4/tcp_syncookies
# echo 128 > /proc/sys/net/core/somaxconn
# echo 8 > /proc/sys/net/ipv4/tcp_max_syn_backlog
# vi tcp_conn_request.stp
#! /usr/bin/stap
# Sequence number printed in front of every sample; starts at 1.
global i = 1;
# Fires each time execution reaches net/ipv4/tcp_input.c:6001 inside
# tcp_conn_request() and prints one line describing the listening socket's
# queue counters at that moment.
probe kernel.statement("tcp_conn_request@net/ipv4/tcp_input.c:6001")
{
# $sk is cast to struct inet_connection_sock to reach
# icsk_accept_queue.listen_opt, which holds the request-queue fields.
qlen = @cast($sk, "struct inet_connection_sock")->icsk_accept_queue->listen_opt->qlen
max_qlen_log = @cast($sk, "struct inet_connection_sock")->icsk_accept_queue->listen_opt->max_qlen_log
nr_table_entries = @cast($sk, "struct inet_connection_sock")->icsk_accept_queue->listen_opt->nr_table_entries
# Accept-queue counters are read directly from struct sock.
sk_ack_backlog = $sk->sk_ack_backlog
sk_max_ack_backlog = $sk->sk_max_ack_backlog
printf("%02d: qlen = %d, max_qlen_log = %d, nr_table_entries = %d, sk_ack_backlog = %d, sk_max_ack_backlog = %d\n",
i, qlen, max_qlen_log, nr_table_entries, sk_ack_backlog, sk_max_ack_backlog)
i++
}
服务端运行
# ./tcp_conn_request.stp -c './tcp_server -b 0' &
客户端发起20个初始连接
# ./tcp_client -i 22.99.22.111 -p 8888 -n 20
服务端输出为
01: qlen = 0, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 0, sk_max_ack_backlog = 0
02: qlen = 1, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 0, sk_max_ack_backlog = 0
03: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 0, sk_max_ack_backlog = 0
04: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
05: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
06: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
07: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
08: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
09: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
10: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
11: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
12: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
13: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
14: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
15: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
16: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
17: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
18: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
19: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
20: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
21: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
22: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
23: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
24: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
25: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
26: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
27: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
28: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
29: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
30: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
31: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
32: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
33: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
34: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
35: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
36: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
37: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
38: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
39: qlen = 3, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
40: qlen = 4, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
41: qlen = 4, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
42: qlen = 4, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
43: qlen = 4, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
44: qlen = 4, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
45: qlen = 4, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
46: qlen = 4, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
47: qlen = 4, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
48: qlen = 4, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
49: qlen = 4, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
50: qlen = 4, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
51: qlen = 4, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
52: qlen = 4, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
53: qlen = 4, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
54: qlen = 4, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
55: qlen = 4, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
56: qlen = 5, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
57: qlen = 6, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
58: qlen = 6, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
59: qlen = 6, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
60: qlen = 6, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
61: qlen = 6, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
62: qlen = 6, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
63: qlen = 6, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
64: qlen = 6, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
65: qlen = 6, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
66: qlen = 6, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
67: qlen = 6, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
68: qlen = 6, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
69: qlen = 6, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
70: qlen = 6, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
71: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
72: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
73: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
74: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
75: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
76: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
77: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
78: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
79: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
80: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
81: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
82: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
83: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
84: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
85: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
86: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
87: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
88: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
89: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
90: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
91: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
92: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
93: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
94: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
95: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
96: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
97: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
98: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
99: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
100: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
101: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
102: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
103: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
104: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
105: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
106: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
# ss -atn | grep :8888
LISTEN 1 0 *:8888 *:*
SYN-RECV 0 0 22.99.22.111:8888 22.99.22.101:59896
SYN-RECV 0 0 22.99.22.111%if355557656:8888 22.99.22.101:59898
SYN-RECV 0 0 22.99.22.111:8888 22.99.22.101:59904
SYN-RECV 0 0 22.99.22.111:8888 22.99.22.101:59894
SYN-RECV 0 0 22.99.22.111%if355557655:8888 22.99.22.101:59892
SYN-RECV 0 0 22.99.22.111%if355557655:8888 22.99.22.101:59888
SYN-RECV 0 0 22.99.22.111:8888 22.99.22.101:59890
ESTAB 0 0 22.99.22.111:8888 22.99.22.101:59886
# dmesg
[39630.522777] TCP: drop open request from 22.99.22.101/59906
[39630.522886] TCP: drop open request from 22.99.22.101/59900
[39630.522972] TCP: drop open request from 22.99.22.101/59902
[39630.523002] TCP: drop open request from 22.99.22.101/59908
[39630.523023] TCP: drop open request from 22.99.22.101/59914
[39630.523042] TCP: drop open request from 22.99.22.101/59916
[39630.523062] TCP: drop open request from 22.99.22.101/59924
[39630.523081] TCP: drop open request from 22.99.22.101/59910
[39630.523762] TCP: drop open request from 22.99.22.101/59912
[39630.523771] TCP: drop open request from 22.99.22.101/59918
客户端输出为
# ss -atn | grep 8888
SYN-SENT 0 1 22.99.22.101:59900 22.99.22.111:8888
SYN-SENT 0 1 22.99.22.101:59918 22.99.22.111:8888
SYN-SENT 0 1 22.99.22.101:59906 22.99.22.111:8888
SYN-SENT 0 1 22.99.22.101:59910 22.99.22.111:8888
SYN-SENT 0 1 22.99.22.101:59914 22.99.22.111:8888
ESTAB 0 0 22.99.22.101:59904 22.99.22.111:8888
ESTAB 0 0 22.99.22.101:59894 22.99.22.111:8888
ESTAB 0 0 22.99.22.101:59890 22.99.22.111:8888
SYN-SENT 0 1 22.99.22.101:59922 22.99.22.111:8888
SYN-SENT 0 1 22.99.22.101:59902 22.99.22.111:8888
ESTAB 0 0 22.99.22.101:59888 22.99.22.111:8888
ESTAB 0 0 22.99.22.101:59896 22.99.22.111:8888
ESTAB 0 0 22.99.22.101:59892 22.99.22.111:8888
SYN-SENT 0 1 22.99.22.101:59916 22.99.22.111:8888
SYN-SENT 0 1 22.99.22.101:59920 22.99.22.111:8888
SYN-SENT 0 1 22.99.22.101:59908 22.99.22.111:8888
SYN-SENT 0 1 22.99.22.101:59912 22.99.22.111:8888
SYN-SENT 0 1 22.99.22.101:59924 22.99.22.111:8888
ESTAB 0 0 22.99.22.101:59898 22.99.22.111:8888
tcp_max_syn_backlog(8) * 3/4 = 6,当qlen大于6且未开启tcp_syncookies时,触发pr_drop_req()函数的执行,即上述dmesg输出中的"TCP: drop open request from ..."信息。
将tcp_max_syn_backlog设置为12,再次观察
# echo 12 > /proc/sys/net/ipv4/tcp_max_syn_backlog
# ./tcp_conn_request.stp -c './tcp_server -b 0' &
# 01: qlen = 0, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 0, sk_max_ack_backlog = 0
02: qlen = 1, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 0, sk_max_ack_backlog = 0
03: qlen = 1, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
04: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
05: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
06: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
07: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
08: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
09: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
10: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
11: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
12: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
13: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
14: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
15: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
16: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
17: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
18: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
19: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
20: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
21: qlen = 2, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
22: qlen = 3, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
23: qlen = 3, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
24: qlen = 3, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
25: qlen = 3, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
26: qlen = 3, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
27: qlen = 3, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
28: qlen = 3, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
29: qlen = 3, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
30: qlen = 3, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
31: qlen = 3, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
32: qlen = 3, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
33: qlen = 3, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
34: qlen = 3, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
35: qlen = 3, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
36: qlen = 3, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
37: qlen = 3, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
38: qlen = 3, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
39: qlen = 4, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
40: qlen = 5, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
41: qlen = 5, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
42: qlen = 5, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
43: qlen = 5, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
44: qlen = 5, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
45: qlen = 5, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
46: qlen = 5, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
47: qlen = 5, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
48: qlen = 5, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
49: qlen = 5, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
50: qlen = 5, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
51: qlen = 5, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
52: qlen = 5, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
53: qlen = 5, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
54: qlen = 5, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
55: qlen = 6, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
56: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
57: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
58: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
59: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
60: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
61: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
62: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
63: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
64: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
65: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
66: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
67: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
68: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
69: qlen = 8, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
70: qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
71: qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
72: qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
73: qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
74: qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
75: qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
76: qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
77: qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
78: qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
79: qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
80: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
81: qlen = 8, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
82: qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
83: qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
84: qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
85: qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
86: qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
87: qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
88: qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
89: qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
90: qlen = 7, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
91: qlen = 8, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
92: qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
93: qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
94: qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
95: qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
96: qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
97: qlen = 9, max_qlen_log = 4, nr_table_entries = 16, sk_ack_backlog = 1, sk_max_ack_backlog = 0
tcp_max_syn_backlog(12) * 3/4 = 9,qlen不大于9且未开启tcp_syncookies时,半连接队列未达到丢弃阈值,不会触发pr_drop_req()函数的执行,因此dmesg不会输出pr_drop_req()打印的"...drop open request from..."信息;但由于全连接队列已满(sk_ack_backlog(1) > sk_max_ack_backlog(0)),新的连接请求仍会被丢弃,导致客户端超时重传。