在server端完成listen系统调用之后,就处于可以接受请求的状态,这时client端就可以发送SYN请求给server从而开始“三次握手”。SYN请求的发送是通过connect系统调用实现的:
int connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen);
其中,sockfd是使用socket系统调用创建的socket的文件描述符,addr为要连接的服务器端地址,addrlen为地址长度。addr的内容与addrlen的大小必须与服务器端bind系统调用的addr和addrlen参数相符,否则连接无法建立。
connect系统调用对应的内核代码如下:
1658 SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr,
1659 int, addrlen)
1660 {
1661 struct socket *sock;
1662 struct sockaddr_storage address;
1663 int err, fput_needed;
1664
1665 sock = sockfd_lookup_light(fd, &err, &fput_needed); //根据描述符查找socket
1666 if (!sock)
1667 goto out;
1668 err = move_addr_to_kernel(uservaddr, addrlen, &address); //将地址信息由用户态copy到内核
1669 if (err < 0)
1670 goto out_put;
1671
1672 err =
1673 security_socket_connect(sock, (struct sockaddr *)&address, addrlen); //安全控制
1674 if (err)
1675 goto out_put;
1676
1677 err = sock->ops->connect(sock, (struct sockaddr *)&address, addrlen, //connect系统调用核心函数
1678 sock->file->f_flags);
1679 out_put:
1680 fput_light(sock->file, fput_needed);
1681 out:
1682 return err;
1683 }
sock->ops->connect指向的函数为inet_stream_connect,它封装了__inet_stream_connect函数:
599 int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
600 int addr_len, int flags)
601 {
602 struct sock *sk = sock->sk;
603 int err;
604 long timeo;
...
631 err = sk->sk_prot->connect(sk, uaddr, addr_len); //调用TCP协议指定的connect函数,TCPv4是tcp_v4_connect,TCPv6是tcp_v6_connect
632 if (err < 0)
633 goto out;
634
635 sock->state = SS_CONNECTING;
636
637 /* Just entered SS_CONNECTING state; the only
638 * difference is that return value in non-blocking
639 * case is EINPROGRESS, rather than EALREADY.
640 */
641 err = -EINPROGRESS;
642 break;
643 }
644
645 timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); //如果socket是非阻塞的,则timeo为0;否则获取socket的发送超时时间
646
647 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
648 int writebias = (sk->sk_protocol == IPPROTO_TCP) &&
649 tcp_sk(sk)->fastopen_req &&
650 tcp_sk(sk)->fastopen_req->data ? 1 : 0;
651
652 /* Error code is set above */
653 if (!timeo || !inet_wait_for_connect(sk, timeo, writebias)) //如果timeo不为0,则会调用inet_wait_for_connect一直等待到连接成功或超时
654 goto out;
655
656 err = sock_intr_errno(timeo);
657 if (signal_pending(current))
658 goto out;
659 }
660
661 /* Connection was closed by RST, timeout, ICMP error
662 * or another process disconnected us.
663 */
664 if (sk->sk_state == TCP_CLOSE)
665 goto sock_error;
666
667 /* sk->sk_err may be not zero now, if RECVERR was ordered by user
668 * and error was received after socket entered established state.
669 * Hence, it is handled normally after connect() return successfully.
670 */
671
672 sock->state = SS_CONNECTED;
673 err = 0;
674 out:
675 return err;
676
677 sock_error:
678 err = sock_error(sk) ? : -ECONNABORTED;
679 sock->state = SS_UNCONNECTED;
680 if (sk->sk_prot->disconnect(sk, flags))
681 sock->state = SS_DISCONNECTING;
682 goto out;
683 }
__inet_stream_connect函数除了调用TCP指定的connect函数完成connect系统调用的核心功能外,还会调用sock_sndtimeo获取阻塞时间timeo。如果socket是非阻塞的,则timeo是0;否则为可以通过setsockopt函数的SO_SNDTIMEO选项来设置的值,这个值默认是MAX_SCHEDULE_TIMEOUT。socket默认都是阻塞的。如果timeo非零则connect系统调用会睡眠,直到超时,或被对端的消息(SYN|ACK,ICMP等)唤醒。
下面来看核心函数 tcp_v4_connect:
142 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
143 {
144 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
145 struct inet_sock *inet = inet_sk(sk);
146 struct tcp_sock *tp = tcp_sk(sk);
147 __be16 orig_sport, orig_dport;
148 __be32 daddr, nexthop;
...
160 nexthop = daddr = usin->sin_addr.s_addr;
161 inet_opt = rcu_dereference_protected(inet->inet_opt,
162 sock_owned_by_user(sk));
163 if (inet_opt && inet_opt->opt.srr) {
164 if (!daddr)
165 return -EINVAL;
166 nexthop = inet_opt->opt.faddr;
167 }
168
169 orig_sport = inet->inet_sport;
170 orig_dport = usin->sin_port;
171 fl4 = &inet->cork.fl.u.ip4;
172 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
173 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
174 IPPROTO_TCP,
175 orig_sport, orig_dport, sk, true);
...
191 if (!inet->inet_saddr) //如果源IP地址没有确定,则使用路由查询的结果
192 inet->inet_saddr = fl4->saddr;
193 inet->inet_rcv_saddr = inet->inet_saddr;
...
207 inet->inet_dport = usin->sin_port;
208 inet->inet_daddr = daddr;
...
221 tcp_set_state(sk, TCP_SYN_SENT);
222 err = inet_hash_connect(&tcp_death_row, sk); //绑定IP地址和端口,并将socket加入到连接表中
223 if (err)
224 goto failure;
225
226 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
227 inet->inet_sport, inet->inet_dport, sk); //用新的源端口再次做路由查询
228 if (IS_ERR(rt)) {
229 err = PTR_ERR(rt);
230 rt = NULL;
231 goto failure;
232 }
233 /* OK, now commit destination to socket. */
234 sk->sk_gso_type = SKB_GSO_TCPV4;
235 sk_setup_caps(sk, &rt->dst);
236
237 if (!tp->write_seq && likely(!tp->repair))
238 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
239 inet->inet_daddr,
240 inet->inet_sport,
241 usin->sin_port); //使用加密的方法生成初始序列号
...
245 err = tcp_connect(sk); //发送SYN请求
246
247 rt = NULL;
248 if (err)
249 goto failure;
250
251 return 0;
此函数小于208行的部分是确定socket的源/目的IP和源/目的端口。目的IP和目的端口是由connect系统调用的入参指定,如果没有调用过bind的话,源IP是由路由查询的结果决定。影响路由查询结果的主要因素是目的IP,端口如何影响查询结果目前尚不清楚。源端口如果是0(没有bind)则会在inet_hash_connect函数中指定,然后用新的端口再次进行路由查询(226-227行)。而新的查询结果不影响源IP的选取,由此看来源端口的值不会影响源IP的选择。
secure_tcp_sequence_number函数用加密并与时间关联的方法生成初始序列号,以保证一个TCP连接的初始序列号很难被猜到,而且即使先后多次发起源/目的IP和源/目的端口都一样的连接,它们的起始序列号也都不一样。前者是为了避免黑客攻击,后者是为了减小旧的报文混入新连接的概率。
第245行的tcp_connect函数用于构建并发送一个SYN请求。
下面总结一下connect系统调用的功能:
1、设置socket的源/目的IP和源/目的端口;
2、设置socket的状态为SYN_SENT;
3、将socket加入到连接表中等待后续TCP报文到来时进行匹配;
4、选取初始序列号;
5、构建并发送SYN请求报文。
至此,还有两个关键函数:inet_hash_connect和tcp_connect函数我们没有分析。抱着喜欢探究秘密的好奇心,我们一起来看看这两个函数是怎样工作的:
589 int inet_hash_connect(struct inet_timewait_death_row *death_row,
590 struct sock *sk)
591 {
592 return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
593 __inet_check_established, __inet_hash_nolisten);
594 }
__inet_hash_connect函数:
477 int __inet_hash_connect(struct inet_timewait_death_row *death_row,
478 struct sock *sk, u32 port_offset,
479 int (*check_established)(struct inet_timewait_death_row *,
480 struct sock *, __u16, struct inet_timewait_sock **),
481 int (*hash)(struct sock *sk, struct inet_timewait_sock *twp))
482 {
483 struct inet_hashinfo *hinfo = death_row->hashinfo;
484 const unsigned short snum = inet_sk(sk)->inet_num;
485 struct inet_bind_hashbucket *head;
486 struct inet_bind_bucket *tb;
487 int ret;
488 struct net *net = sock_net(sk);
489 int twrefcnt = 1;
490
491 if (!snum) { //端口未绑定
492 int i, remaining, low, high, port;
493 static u32 hint;
494 u32 offset = hint + port_offset;
495 struct inet_timewait_sock *tw = NULL;
496
497 inet_get_local_port_range(&low, &high);
498 remaining = (high - low) + 1;
499
500 local_bh_disable();
501 for (i = 1; i <= remaining; i++) {
502 port = low + (i + offset) % remaining;
503 if (inet_is_reserved_local_port(port))
504 continue;
505 head = &hinfo->bhash[inet_bhashfn(net, port,
506 hinfo->bhash_size)];
507 spin_lock(&head->lock);
508
509 /* Does not bother with rcv_saddr checks,
510 * because the established check is already
511 * unique enough.
512 */ //绑定到一个port的socket可能是通过bind 系统调用,也可能是调用connect系统调用时__inet_hash_connect函数选取的。
513 inet_bind_bucket_for_each(tb, &head->chain) { //遍历该哈希链上的所有bind bucket(tb),查找与此端口对应的tb
514 if (net_eq(ib_net(tb), net) &&
515 tb->port == port) {
516 if (tb->fastreuse >= 0 ||
517 tb->fastreuseport >= 0)
518 goto next_port;
519 WARN_ON(hlist_empty(&tb->owners));
520 if (!check_established(death_row, sk,
521 port, &tw))
522 goto ok; //没有冲突
523 goto next_port;
524 }
525 }
526 //当前端口没有被使用
527 tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
528 net, head, port);
529 if (!tb) {
530 spin_unlock(&head->lock);
531 break;
532 }
533 tb->fastreuse = -1;
534 tb->fastreuseport = -1;
535 goto ok;
536
537 next_port:
538 spin_unlock(&head->lock);
539 }
540 local_bh_enable();
541
542 return -EADDRNOTAVAIL;
543
544 ok:
545 hint += i;
546
547 /* Head lock still held and bh's disabled */
548 inet_bind_hash(sk, tb, port); //将socket加入port对应的tb的socket队列中,即将此socket与port相关联
549 if (sk_unhashed(sk)) { //如果socket没有被加入到“已建立连接”的连接表中
550 inet_sk(sk)->inet_sport = htons(port);
551 twrefcnt += hash(sk, tw); //将socket加入到“已建立连接”的连接表中
552 }
553 if (tw)
554 twrefcnt += inet_twsk_bind_unhash(tw, hinfo);
555 spin_unlock(&head->lock);
556
557 if (tw) {
558 inet_twsk_deschedule(tw, death_row);
559 while (twrefcnt) {
560 twrefcnt--;
561 inet_twsk_put(tw);
562 }
563 }
564
565 ret = 0;
566 goto out;
567 }
568
569 head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)];
570 tb = inet_csk(sk)->icsk_bind_hash; //取出socket中记录的icsk_bind_hash(即该socket所关联的tb)
571 spin_lock_bh(&head->lock);
572 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { //有且仅有一个socket绑定到这个端口,无需冲突检查
573 hash(sk, NULL); //将socket加入到“已建立连接”的连接表中
574 spin_unlock_bh(&head->lock);
575 return 0;
576 } else { //否则检查是否冲突,
577 spin_unlock(&head->lock);
578 /* No definite answer... Walk to established hash table */
579 ret = check_established(death_row, sk, snum, NULL); //这个检查与520行不重合,因为能走到这里说明端口一定是非零的
580 out:
581 local_bh_enable();
582 return ret;
583 }
584 }
__inet_hash_connect的主要功能与bind系统调用中的inet_csk_get_port类似,都是:
1、如果没有选取端口则选定一个;
2、将socket与端口绑定;
3、将socket加入到连接表中(这个功能inet_csk_get_port没有)。
另外一点不同是:inet_csk_get_port进行冲突检查时关注的是绑定冲突,而__inet_hash_connect检查的是当前socket是否与“已建立连接的socket”冲突。__inet_hash_connect检查冲突的函数是__inet_check_established:
311 static int __inet_check_established(struct inet_timewait_death_row *death_row,
312 struct sock *sk, __u16 lport,
313 struct inet_timewait_sock **twp)
314 {
315 struct inet_hashinfo *hinfo = death_row->hashinfo;
316 struct inet_sock *inet = inet_sk(sk);
317 __be32 daddr = inet->inet_rcv_saddr;
318 __be32 saddr = inet->inet_daddr;
319 int dif = sk->sk_bound_dev_if;
320 INET_ADDR_COOKIE(acookie, saddr, daddr)
321 const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
322 struct net *net = sock_net(sk);
323 unsigned int hash = inet_ehashfn(net, daddr, lport,
324 saddr, inet->inet_dport);
325 struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); //找到连接表中的表项
...
334 /* Check TIME-WAIT sockets first. */
335 sk_nulls_for_each(sk2, node, &head->twchain) {
336 if (sk2->sk_hash != hash)
337 continue;
338
339 if (likely(INET_TW_MATCH(sk2, net, acookie,
340 saddr, daddr, ports, dif))) {
341 tw = inet_twsk(sk2);
342 if (twsk_unique(sk, sk2, twp))
343 goto unique;
344 else
345 goto not_unique;
346 }
347 }
348 tw = NULL;
349
350 /* And established part... */
351 sk_nulls_for_each(sk2, node, &head->chain) {
352 if (sk2->sk_hash != hash)
353 continue;
354 if (likely(INET_MATCH(sk2, net, acookie,
355 saddr, daddr, ports, dif)))
356 goto not_unique;
357 }
358
359 unique:
360 /* Must record num and sport now. Otherwise we will see
361 * in hash table socket with a funny identity. */
362 inet->inet_num = lport;
363 inet->inet_sport = htons(lport);
364 sk->sk_hash = hash;
...
可见__inet_check_established先检查TIME_WAIT表,然后再检查establish表。与establish表中的socket冲突是不允许的;与TIME_WAIT表中的socket冲突时,只有twsk_unique函数返回真才会跳转到unique按没有冲突处理,否则同样不允许。后面的章节中会介绍TIME_WAIT功能。
在listen系统调用中,inet_hash函数会将socket加入到listen连接表中:
426 static void __inet_hash(struct sock *sk)
427 {
428 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
429 struct inet_listen_hashbucket *ilb;
430
431 if (sk->sk_state != TCP_LISTEN) {
432 __inet_hash_nolisten(sk, NULL);
433 return;
434 }
435
436 WARN_ON(!sk_unhashed(sk));
437 ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
438
439 spin_lock(&ilb->lock);
440 __sk_nulls_add_node_rcu(sk, &ilb->head);
441 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
442 spin_unlock(&ilb->lock);
443 }
444
445 void inet_hash(struct sock *sk)
446 {
447 if (sk->sk_state != TCP_CLOSE) {
448 local_bh_disable();
449 __inet_hash(sk);
450 local_bh_enable();
451 }
452 }
在__inet_hash_connect函数中,加入socket到连接表的函数是__inet_hash_nolisten:
399 int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)
400 {
401 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
402 struct hlist_nulls_head *list;
403 spinlock_t *lock;
404 struct inet_ehash_bucket *head;
405 int twrefcnt = 0;
406
407 WARN_ON(!sk_unhashed(sk));
408
409 sk->sk_hash = inet_sk_ehashfn(sk);
410 head = inet_ehash_bucket(hashinfo, sk->sk_hash);
411 list = &head->chain;
412 lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
413
414 spin_lock(lock);
415 __sk_nulls_add_node_rcu(sk, list);
416 if (tw) {
417 WARN_ON(sk->sk_hash != tw->tw_hash);
418 twrefcnt = inet_twsk_unhash(tw);
419 }
420 spin_unlock(lock);
421 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
422 return twrefcnt;
423 }
159 static inline struct inet_ehash_bucket *inet_ehash_bucket(
160 struct inet_hashinfo *hashinfo,
161 unsigned int hash)
162 {
163 return &hashinfo->ehash[hash & hashinfo->ehash_mask];
164 }
可见server端的socket在进行listen系统调用后被加入到sk->sk_prot->h.hashinfo->listening_hash中,client端的socket在进行connect系统调用后被加入到sk->sk_prot->h.hashinfo->ehash中,而对于TCPv4和TCPv6,sk->sk_prot->h.hashinfo指向的都是tcp_hashinfo。
再看tcp_connect函数:
2925 int tcp_connect(struct sock *sk)
2926 {
2927 struct tcp_sock *tp = tcp_sk(sk);
2928 struct sk_buff *buff;
2929 int err;
2930
2931 tcp_connect_init(sk); //初始化TCP连接信息
2932
2933 if (unlikely(tp->repair)) {
2934 tcp_finish_connect(sk, NULL);
2935 return 0;
2936 }
2937
2938 buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation); //构建SKB用于承载TCP报文
2939 if (unlikely(buff == NULL))
2940 return -ENOBUFS;
2941
2942 /* Reserve space for headers. */
2943 skb_reserve(buff, MAX_TCP_HEADER);
2944
2945 tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); //初始化不带数据的TCP SYN报文,SYN包的seq为tp->write_seq
2946 tp->retrans_stamp = TCP_SKB_CB(buff)->when = tcp_time_stamp;
2947 tcp_connect_queue_skb(sk, buff); //将报文放入发送队列中
2948 TCP_ECN_send_syn(sk, buff);
2949
2950 /* Send off SYN; include data in Fast Open. */
2951 err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) : //如果TFO功能开启,则用tcp_send_syn_data函数发送带数据的SYN
2952 tcp_transmit_skb(sk, buff, 1, sk->sk_allocation); //否则用tcp_transmit_skb发送之前构建好的不带数据的SYN
2953 if (err == -ECONNREFUSED)
2954 return err;
2955
2956 /* We change tp->snd_nxt after the tcp_transmit_skb() call
2957 * in order to make this packet get counted in tcpOutSegs.
2958 */
2959 tp->snd_nxt = tp->write_seq; //snd_nxt = seq + 1
2960 tp->pushed_seq = tp->write_seq;
2961 TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);
2962
2963 /* Timer for repeating the SYN until an answer. */
2964 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
2965 inet_csk(sk)->icsk_rto, TCP_RTO_MAX); //设置重传定时器,如果在超时之前没有收到SYN|ACK,则重传SYN
2966 return 0;
2967 }
关于TFO的内容后续章节有详细分析。现在看TFO不开启的情况,这时SYN的发送靠tcp_transmit_skb函数完成:
828 static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
829 gfp_t gfp_mask)
830 {
831 const struct inet_connection_sock *icsk = inet_csk(sk);
832 struct inet_sock *inet;
833 struct tcp_sock *tp;
834 struct tcp_skb_cb *tcb;
835 struct tcp_out_options opts;
836 unsigned int tcp_options_size, tcp_header_size;
837 struct tcp_md5sig_key *md5;
838 struct tcphdr *th;
839 int err;
... //clone_it大多数情况下都会为非零,因为TCP的大部分报文都是先放到发送队列中,发送的时候再复制一份
849 if (likely(clone_it)) {
850 const struct sk_buff *fclone = skb + 1;
851
852 if (unlikely(skb->fclone == SKB_FCLONE_ORIG &&
853 fclone->fclone == SKB_FCLONE_CLONE))
854 NET_INC_STATS_BH(sock_net(sk),
855 LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
856
857 if (unlikely(skb_cloned(skb)))
858 skb = pskb_copy(skb, gfp_mask);
859 else
860 skb = skb_clone(skb, gfp_mask);
861 if (unlikely(!skb))
862 return -ENOBUFS;
863 }
864
865 inet = inet_sk(sk);
866 tp = tcp_sk(sk);
867 tcb = TCP_SKB_CB(skb);
868 memset(&opts, 0, sizeof(opts));
869
870 if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
871 tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5); //构建SYN报文的选项字段
872 else
873 tcp_options_size = tcp_established_options(sk, skb, &opts,
874 &md5);
875 tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
...
885 skb_push(skb, tcp_header_size); //将skb数据指针向报文的开始部分移动TCP头的大小
886 skb_reset_transport_header(skb); //设置TCP首部指针
887
888 skb_orphan(skb);
889 skb->sk = sk;
890 skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ?
891 tcp_wfree : sock_wfree;
892 atomic_add(skb->truesize, &sk->sk_wmem_alloc);
893
894 /* Build TCP header and checksum it. */
895 th = tcp_hdr(skb);
896 th->source = inet->inet_sport;
897 th->dest = inet->inet_dport;
898 th->seq = htonl(tcb->seq);
899 th->ack_seq = htonl(tp->rcv_nxt);
900 *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
901 tcb->tcp_flags);
902 //设置窗口大小
903 if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
904 /* RFC1323: The window in SYN & SYN/ACK segments
905 * is never scaled.
906 */
907 th->window = htons(min(tp->rcv_wnd, 65535U));
908 } else {
909 th->window = htons(tcp_select_window(sk));
910 }
911 th->check = 0;
912 th->urg_ptr = 0;
... //将选项信息写入TCP报文段
925 tcp_options_write((__be32 *)(th + 1), tp, &opts);
...
938 icsk->icsk_af_ops->send_check(sk, skb); //调用tcp_v4_send_check函数计算检验和
...
950 err = icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl); //调用ip_queue_xmit函数将报文段向下层传递
951 if (likely(err <= 0))
952 return err;
953
954 tcp_enter_cwr(sk, 1);
955
956 return net_xmit_eval(err);
957 }
下层函数ip_queue_xmit的功能是构建IP报头、二层报头,并将报文直接发送到网卡驱动中或放入队列中等待发送。限于本文的目的,TCP下层的数据处理就不做分析。
发送完SYN后,connect系统调用会直接返回,或等待SYN|ACK的到来。server端在收到SYN请求后会如何处理呢?下回分解!