作者:gfree.wind@gmail.com
博客:blog.focus-linux.net   linuxfocus.blog.chinaunix.net
 
 
本文的copyleft归gfree.wind@gmail.com所有,使用GPL发布,可以自由拷贝,转载。但转载请保持文档的完整性,注明原作者及原链接,严禁用于任何商业用途。
======================================================================================================
在以前的文章中,学习了UDP数据包的接收和发送。今天开始研究一下TCP数据包的接收。与UDP数据包类似,当IP数据包到达ip_local_deliver_finish函数时,根据四层协议从inet_protos数组中得到TCP协议对应的tcp_protocol。
  1. static const struct net_protocol tcp_protocol = {
  2.     .handler = tcp_v4_rcv,
  3.     .err_handler = tcp_v4_err,
  4.     .gso_send_check = tcp_v4_gso_send_check,
  5.     .gso_segment = tcp_tso_segment,
  6.     .gro_receive = tcp4_gro_receive,
  7.     .gro_complete = tcp4_gro_complete,
  8.     .no_policy = 1,
  9.     .netns_ok = 1,
  10. };
那么TCP数据包的接收函数入口即为tcp_v4_rcv
  1. int tcp_v4_rcv(struct sk_buff *skb)
  2. {
  3.     const struct iphdr *iph;
  4.     const struct tcphdr *th;
  5.     struct sock *sk;
  6.     int ret;
  7.     struct net *net = dev_net(skb->dev);
     
     /* 检测该包是否为发给本机的 */
  1.     if (skb->pkt_type != PACKET_HOST)
  2.         goto discard_it;

  3.     /* Count it even if it's bad */
  4.     TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
     
     /* 检查包长是否至少为TCP首部的长度 */
  1.     if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
  2.         goto discard_it;

  3.     th = tcp_hdr(skb);

      /* 检查TCP首部 */
  1.     if (th->doff < sizeof(struct tcphdr) / 4)
  2.         goto bad_packet;
  3.     if (!pskb_may_pull(skb, th->doff * 4))
  4.         goto discard_it;

  5.     /* An explanation is required here, I think.
  6.      * Packet length and doff are validated by header prediction,
  7.      * provided case of th->doff==0 is eliminated.
  8.      * So, we defer the checks. */
  9.     if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
  10.         goto bad_packet;
     /* 将sequence,ack等保存到skb的TCP控制块(TCP_SKB_CB)中 */
  1.     th = tcp_hdr(skb);
  2.     iph = ip_hdr(skb);
  3.     TCP_SKB_CB(skb)->seq = ntohl(th->seq);
  4.     TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
  5.                  skb->len - th->doff * 4);
  6.     TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
  7.     TCP_SKB_CB(skb)->when     = 0;
  8.     TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
  9.     TCP_SKB_CB(skb)->sacked     = 0;
     
     /* 
     通过源IP,目的IP,源端口,目的端口,和接收到的interface来查找socket。
     这里一共涉及两个hash表,一个保存已建立连接的TCP session,一个保存处于listening状态的TCP session
     关于这两个hash,以后再分析。
     */
  1.     sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
  2.     if (!sk)
  3.         goto no_tcp_socket;

  4. process:
  5.     /* TIME_WAIT的处理,以后再学习 */
  6.     if (sk->sk_state == TCP_TIME_WAIT)
  7.         goto do_time_wait;

  8.     if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
  9.         NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
  10.         goto discard_and_relse;
  11.     }
     /* IPsec的检查 */
  1.     if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
  2.         goto discard_and_relse;
  3.     nf_reset(skb);
     
     /* socket filter没有用过。。。  */
  1.     if (sk_filter(sk, skb))
  2.         goto discard_and_relse;

  3.     skb->dev = NULL;

  4.     bh_lock_sock_nested(sk);
  5.     ret = 0;
     /* 
     检查该socket当前是否正被用户进程锁定占用(sock_owned_by_user)。如果没有被占用,可以继续处理该skb,
     如果已被占用,那么就将skb加到当前socket的sk_backlog上。
     这样的处理与UDP不同,因为TCP是有内部状态的,当处理一个TCP报文的时候,在中间又处理另外一个TCP报文的时候,可能会改变TCP的状态,导致被打断的TCP报文处理失败。
     这里保证TCP的一个报文处理不会被打断
     */
  1.     if (!sock_owned_by_user(sk)) {
  2. #ifdef CONFIG_NET_DMA
  3.         struct tcp_sock *tp = tcp_sk(sk);
  4.         if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
  5.             tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
  6.         if (tp->ucopy.dma_chan)
  7.             ret = tcp_v4_do_rcv(sk, skb);
  8.         else
  9. #endif
  10.         {
  11.             if (!tcp_prequeue(sk, skb))
  12.                 ret = tcp_v4_do_rcv(sk, skb);
  13.         }
  14.     } else if (unlikely(sk_add_backlog(sk, skb))) {
  15.         bh_unlock_sock(sk);
  16.         NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
  17.         goto discard_and_relse;
  18.     }

  19.     ...... ......
进入tcp_v4_do_rcv

  1. int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
  2. {
  3.     struct sock *rsk;
  4. #ifdef CONFIG_TCP_MD5SIG
  5.     /*
  6.      * We really want to reject the packet as early as possible
  7.      * if:
  8.      * o We're expecting an MD5'd packet and this is no MD5 tcp option
  9.      * o There is an MD5 option and we're not expecting one
  10.      */
  11.     if (tcp_v4_inbound_md5_hash(sk, skb))
  12.         goto discard;
  13. #endif

  14.     if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
  15.         /* 该TCP处于已连接状态,留作以后学习 */
  16.         sock_rps_save_rxhash(sk, skb);
  17.         if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
  18.             rsk = sk;
  19.             goto reset;
  20.         }
  21.         return 0;
  22.     }
     
  1.     if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
  2.         goto csum_err;

  3.     if (sk->sk_state == TCP_LISTEN) {
  4.         /* 
  5.         处理TCP request包,即请求连接本机TCP端口的TCP报文,并返回应处理该skb的socket。
  6.         对于第一个SYN包,返回的nsk就是sk。
  7.         */
  8.         struct sock *nsk = tcp_v4_hnd_req(sk, skb);
  9.         if (!nsk)
  10.             goto discard;

         /* 如前面所说,对于第一个SYN包,nsk就是sk,于是继续往下执行 */
  1.         if (nsk != sk) {
  2.             sock_rps_save_rxhash(nsk, skb);
  3.             if (tcp_child_process(sk, nsk, skb)) {
  4.                 rsk = nsk;
  5.                 goto reset;
  6.             }
  7.             return 0;
  8.         }
  9.     } else
  10.         sock_rps_save_rxhash(sk, skb);

  1.     if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
  2.         rsk = sk;
  3.         goto reset;
  4.     }
  5.     return 0;

  6.     ...... ...... 
  7. }
进入tcp_rcv_state_process

  1. int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
  2.              const struct tcphdr *th, unsigned int len)
  3. {
  4.     struct tcp_sock *tp = tcp_sk(sk);
  5.     struct inet_connection_sock *icsk = inet_csk(sk);
  6.     int queued = 0;
  7.     int res;

  8.     tp->rx_opt.saw_tstamp = 0;

  9.     switch (sk->sk_state) {
  10.     case TCP_CLOSE:
  11.         goto discard;

  12.     case TCP_LISTEN:
  13.         /* 本文的重点,第一个SYN包会到这里 */
  14.          
  15.         /* 非法的TCP包,LISTEN状态只处理SYN包 */
  16.         if (th->ack)
  17.             return 1;
  1.         if (th->rst)
  2.             goto discard;

  3.         if (th->syn) {
  4.             /* 第一个syn包 */
  5.             if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
  6.                 return 1;

  7.             /* Now we have several options: In theory there is
  8.              * nothing else in the frame. KA9Q has an option to
  9.              * send data with the syn, BSD accepts data with the
  10.              * syn up to the [to be] advertised window and
  11.              * Solaris 2.1 gives you a protocol error. For now
  12.              * we just ignore it, that fits the spec precisely
  13.              * and avoids incompatibilities. It would be nice in
  14.              * future to drop through and process the data.
  15.              *
  16.              * Now that TTCP is starting to be used we ought to
  17.              * queue this data.
  18.              * But, this leaves one open to an easy denial of
  19.              * service attack, and SYN cookies can't defend
  20.              * against this problem. So, we drop the data
  21.              * in the interest of security over speed unless
  22.              * it's still in use.
  23.              */
  24.             kfree_skb(skb);
  25.             return 0;
  26.         }
  27.         goto discard;
     ......  ......
     ......  ......
  1. }
对于IPv4的TCP数据包,conn_request为tcp_v4_conn_request
  1. int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
  2. {
  3.     struct tcp_extend_values tmp_ext;
  4.     struct tcp_options_received tmp_opt;
  5.     const u8 *hash_location;
  6.     struct request_sock *req;
  7.     struct inet_request_sock *ireq;
  8.     struct tcp_sock *tp = tcp_sk(sk);
  9.     struct dst_entry *dst = NULL;
  10.     __be32 saddr = ip_hdr(skb)->saddr;
  11.     __be32 daddr = ip_hdr(skb)->daddr;
  12.     __u32 isn = TCP_SKB_CB(skb)->when;
  13.     int want_cookie = 0;

  14.     /* Never answer to SYNs send to broadcast or multicast */
  15.     if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
  16.         goto drop;

  17.     /* TW buckets are converted to open requests without
  18.      * limitations, they conserve resources and peer is
  19.      * evidently real one.
  20.      */
  21.     //检查syn queue是否已满,即request queue是否已满
  22.     if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
  23.         /* 是否使用SYN cookie */
  24.         want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
  25.         if (!want_cookie)
  26.             goto drop;
  27.     }

  28.     /* Accept backlog is full. If we have already queued enough
  29.      * of warm entries in syn queue, drop request. It is better than
  30.      * clogging syn queue with openreqs with exponentially increasing
  31.      * timeout.
  32.      */
  33.     //检查accept queue是否已满
  34.     if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
  35.         goto drop;
     
     //申请一个新的request_sock
  1.     req = inet_reqsk_alloc(&tcp_request_sock_ops);
  2.     if (!req)
  3.         goto drop;

  4. #ifdef CONFIG_TCP_MD5SIG
  5.     tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
  6. #endif
     //解析TCP的option
  1.     tcp_clear_options(&tmp_opt);
  2.     tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
  3.     tmp_opt.user_mss = tp->rx_opt.user_mss;
  4.     tcp_parse_options(skb, &tmp_opt, &hash_location, 0);

  1.     if (tmp_opt.cookie_plus > 0 &&
  2.      tmp_opt.saw_tstamp &&
  3.      !tp->rx_opt.cookie_out_never &&
  4.      (sysctl_tcp_cookie_size > 0 ||
  5.      (tp->cookie_values != NULL &&
  6.      tp->cookie_values->cookie_desired > 0))) {
  7.         /* 
  8.         这部分代码与SYN cookie无关(下面的注释"not our kind of cookie"以及want_cookie被清零都说明了这一点),
  9.         处理的应该是TCP Cookie Transactions(TCPCT)扩展选项中携带的cookie,具体细节尚待确认。
  10.         */
  11.         u8 *c;
  12.         u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
  13.         int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;

  14.         if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
  15.             goto drop_and_release;

  16.         /* Secret recipe starts with IP addresses */
  17.         *mess++ ^= (__force u32)daddr;
  18.         *mess++ ^= (__force u32)saddr;

  19.         /* plus variable length Initiator Cookie */
  20.         c = (u8 *)mess;
  21.         while (l-- > 0)
  22.             *c++ ^= *hash_location++;

  23.         want_cookie = 0;    /* not our kind of cookie */
  24.         tmp_ext.cookie_out_never = 0; /* false */
  25.         tmp_ext.cookie_plus = tmp_opt.cookie_plus;
  26.     } else if (!tp->rx_opt.cookie_in_always) {
  27.         /* redundant indications, but ensure initialization. */
  28.         tmp_ext.cookie_out_never = 1; /* true */
  29.         tmp_ext.cookie_plus = 0;
  30.     } else {
  31.         goto drop_and_release;
  32.     }
  33.     tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;

  34.     if (want_cookie && !tmp_opt.saw_tstamp)
  35.         tcp_clear_options(&tmp_opt);

  36.     tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
  37.     tcp_openreq_init(req, &tmp_opt, skb);

  38.     ireq = inet_rsk(req);
  39.     ireq->loc_addr = daddr;
  40.     ireq->rmt_addr = saddr;
  41.     ireq->no_srccheck = inet_sk(sk)->transparent;
  42.     ireq->opt = tcp_v4_save_options(sk, skb);

  43.     if (security_inet_conn_request(sk, skb, req))
  44.         goto drop_and_free;

  45.     if (!want_cookie || tmp_opt.tstamp_ok)
  46.         TCP_ECN_create_request(req, tcp_hdr(skb));

  47.     if (want_cookie) {
  48.         /* 生成SYN cookie使用的Initial sequence number */
  49.         isn = cookie_v4_init_sequence(sk, skb, &req->mss);
  50.         req->cookie_ts = tmp_opt.tstamp_ok;
  51.     } else if (!isn) {
  52.         struct inet_peer *peer = NULL;
  53.         struct flowi4 fl4;

  54.         /* VJ's idea. We save last timestamp seen
  55.          * from the destination in peer table, when entering
  56.          * state TIME-WAIT, and check against it before
  57.          * accepting new connection request.
  58.          *
  59.          * If "isn" is not zero, this request hit alive
  60.          * timewait bucket, so that all the necessary checks
  61.          * are made in the function processing timewait state.
  62.          */
  63.         /* 还是不懂这块的检查是为了什么。。。*/
  64.         if (tmp_opt.saw_tstamp &&
  65.          tcp_death_row.sysctl_tw_recycle &&
  66.          (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
  67.          fl4.daddr == saddr &&
  68.          (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) {
  69.             inet_peer_refcheck(peer);
  70.             if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
  71.              (s32)(peer->tcp_ts - req->ts_recent) >
  72.                             TCP_PAWS_WINDOW) {
  73.                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
  74.                 goto drop_and_release;
  75.             }
  76.         }
  77.         /* Kill the following clause, if you dislike this way. */
  78.         else if (!sysctl_tcp_syncookies &&
  79.              (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
  80.              (sysctl_max_syn_backlog >> 2)) &&
  81.              (!peer || !peer->tcp_ts_stamp) &&
  82.              (!dst || !dst_metric(dst, RTAX_RTT))) {
  83.             /* Without syncookies last quarter of
  84.              * backlog is filled with destinations,
  85.              * proven to be alive.
  86.              * It means that we continue to communicate
  87.              * to destinations, already remembered
  88.              * to the moment of synflood.
  89.              */
  90.             LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
  91.                  &saddr, ntohs(tcp_hdr(skb)->source));
  92.             goto drop_and_release;
  93.         }

         /* 生成Initial Sequence Number */
  1.         isn = tcp_v4_init_sequence(skb);
  2.     }
  3.     tcp_rsk(req)->snt_isn = isn;
  4.     tcp_rsk(req)->snt_synack = tcp_time_stamp;
     /* 回复syn+ack包 */
  1.     if (tcp_v4_send_synack(sk, dst, req,
  2.              (struct request_values *)&tmp_ext) ||
  3.      want_cookie)
  4.         goto drop_and_free;
     /* 将该request_sock添加到父socket的icsk_accept_queue中的 listen_opt上  */
  1.     inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
  2.     return 0;

  3. drop_and_release:
  4.     dst_release(dst);
  5. drop_and_free:
  6.     reqsk_free(req);
  7. drop:
  8.     return 0;
  9. }

今天仅仅学习了一下TCP处理第一个SYN包的过程,就发现了很多不明白的地方,还需要继续努力啊。争取早日把TCP的这些细节搞懂。