dpvs synproxy

  • TCP协议开辟了一个比较大的内存空间请求连接队列来存储连接请求块,当SYN请求不断增加,请求连接数目到达上限时,会致使系统丢弃SYN连接请求。SYN cookies技术就可以使服务器在半连接队列已满的情况下仍能处理新的SYN请求

  • 当半连接队列满时,SYN cookies并不丢弃SYN请求,而是通过加密技术来标识半连接状态。在TCP实现中,当收到客户端的SYN请求时,服务器需回复SYN+ACK包给客户端,然后客户端再发送确认包给服务器。通常服务器的初始序列号是由服务器按照一定的规律计算得到或采用随机数,而在SYN cookies中,服务器的初始序列号是由客户端IP地址,客户端端口,服务器IP地址和服务器端口,接收到的客户端初始序列号以及其他一些安全数值进行hash运算,并加密后得到的,称之为cookie。当服务器遭受SYN攻击使得请求连接队列满时,服务器并不拒绝新的SYN请求,而是回复一个初始序列号为cookie的SYN+ACK包给客户端。如果随后收到客户端的ACK段,服务器将该ACK段中的确认号(ack_seq)减1,与用上述那些元素重新做hash运算得到的值比较,如果相等,则直接完成三次握手。注意:此时并不必查看此连接是否属于请求连接队列

  • linux内核启用SYN cookies是通过在启动环境中设置以下命令来完成

    echo 1 > /proc/sys/net/ipv4/tcp_syncookies
    
  • 三次握手示例图

     

  • synproxy说明

    1. client发送syn,LB代理了第一次握手,不转发给rs. LB返回syn+ack数据包时,seq由syn cookies算法生成,并且将rcv_wnd设置为0,不允许在握手阶段携带数据,由此得知不支持tcp fast open
    2. 当client返回ack时,取ack_seq减1还原出cookie,并用syn cookies算法校验,如果匹配,那么就是正常流量。此时LB与后端RS开启三次握手,并透传win size,由于经过LB代理,还需要记录seq差值delta
    3. 数据交互通信,lb除了正常的full-nat工作,还要补偿seq delta
    4. 连接关闭,正常清理

 client第一次握手

  • __dp_vs_pre_routing

    /* Pre-routing hook (excerpt; the code that fills in `iph` is elided).
     * For TCP packets it gives synproxy the chance to answer a client's first
     * handshake packet (SYN) before any further processing. */
    static int __dp_vs_pre_routing(void *priv, struct rte_mbuf *mbuf,
                                   const struct inet_hook_state *state, int af)
    {
       ...
        /* Synproxy: defence synflood */
        /* Only TCP is handed to synproxy; this is where the client's first
         * handshake packet (SYN) is processed. */
        if (IPPROTO_TCP == iph.proto) {
            int v = INET_ACCEPT;
            /* A return of 0 means synproxy took a decision on the packet and
             * stored the verdict to return in v. */
            if (0 == dp_vs_synproxy_syn_rcv(af, mbuf, &iph, &v))
                return v;
        }
    
        return INET_ACCEPT;
    }
    
  • dp_vs_synproxy_syn_rcv

    • 处理client侧第一次握手数据包(syn包)
    /* Syn-proxy step 1 logic: receive the client's SYN.
     * Check whether synproxy applies to this mbuf and, if so, answer the
     * client with a SYN/ACK built in place from the received mbuf.
     *
     * Synproxy is applied when:
     * 1) mbuf is a pure SYN packet (no ACK/RST/FIN),
     * 2) and the matching service has DP_VS_SVC_F_SYNPROXY set,
     * 3) and ip_vs_todrop returns false (not supported now).
     *
     * @param af       address family of the packet (AF_INET or AF_INET6)
     * @param mbuf     received packet; reused as the SYN/ACK on success
     * @param iph      parsed IP header info (addresses, proto, header length)
     * @param verdict  out: hook verdict, only meaningful when 0 is returned
     *
     * @return 0 means the caller should return at once and use *verdict as
     * its return value (INET_STOLEN once the mbuf was handed to netif_xmit,
     * INET_DROP otherwise); 1 means the packet is not handled by synproxy.
     */
    int dp_vs_synproxy_syn_rcv(int af, struct rte_mbuf *mbuf,
            const struct dp_vs_iphdr *iph, int *verdict)
    {
        int ret;
        struct dp_vs_service *svc = NULL;
        struct tcphdr *th, _tcph;
        struct dp_vs_synproxy_opt tcp_opt;
        struct netif_port *dev;
        struct ether_hdr *eth;
        struct ether_addr ethaddr;

        /* th points at the TCP header, located iph->len bytes into the mbuf */
        th = mbuf_header_pointer(mbuf, iph->len, sizeof(_tcph), &_tcph);
        if (unlikely(NULL == th))
            goto syn_rcv_out;

        /* Only a pure SYN aimed at a service with synproxy enabled is
         * handled here; everything else is left to the caller. */
        if (th->syn && !th->ack && !th->rst && !th->fin &&
                (svc = dp_vs_service_lookup(af, iph->proto, &iph->daddr, th->dest, 0,
                                            NULL, NULL, NULL, rte_lcore_id())) &&
                (svc->flags & DP_VS_SVC_F_SYNPROXY)) {
            /* if service's weight is zero (non-active realserver),
             * do nothing and drop the packet */
            if (svc->weight == 0) {
                dp_vs_estats_inc(SYNPROXY_NO_DEST);
                goto syn_rcv_out;
            }

            /* drop packet from blacklist */
            if (dp_vs_blklst_lookup(iph->af, iph->proto, &iph->daddr,
                        th->dest, &iph->saddr)) {
                goto syn_rcv_out;
            }
        } else {
            return 1;
        }

        /* mbuf will be reused and ether header will be set.
         * FIXME: to support non-ether packets. */
        if (mbuf->l2_len != sizeof(struct ether_hdr))
            goto syn_rcv_out;

        /* update statistics */
        dp_vs_estats_inc(SYNPROXY_SYN_CNT);

        /* look up the port the packet arrived on; the reply leaves through it */
        assert(mbuf->port <= NETIF_MAX_PORTS);
        dev = netif_port_get(mbuf->port);
        if (unlikely(!dev)) {
            RTE_LOG(ERR, IPVS, "%s: device eth%d not found\n",
                    __func__, mbuf->port);
            goto syn_rcv_out;
        }

        /* set tx offload flags when the port advertises TCP checksum offload
         * (dev is known to be non-NULL here) */
        if (likely(dev->flag & NETIF_PORT_FLAG_TX_TCP_CSUM_OFFLOAD)) {
            if (af == AF_INET)
                mbuf->ol_flags |= (PKT_TX_TCP_CKSUM | PKT_TX_IP_CKSUM | PKT_TX_IPV4);
            else
                mbuf->ol_flags |= (PKT_TX_TCP_CKSUM | PKT_TX_IPV6);
        }

        /* reuse mbuf: the received SYN is rewritten in place into the SYN/ACK
         * that goes back to the client.
         * NOTE(review): syn_proxy_reuse_mbuf() returns void, so a failure
         * inside it cannot be detected here -- confirm this is acceptable. */
        syn_proxy_reuse_mbuf(af, mbuf, th, &tcp_opt);

        /* set L2 header and send the packet out
         * It is noted that "ipv4_xmit" should not be used here,
         * because mbuf is reused. */
        eth = (struct ether_hdr *)rte_pktmbuf_prepend(mbuf, mbuf->l2_len);
        if (unlikely(!eth)) {
            RTE_LOG(ERR, IPVS, "%s: no memory\n", __func__);
            goto syn_rcv_out;
        }
        /* swap source and destination MAC addresses */
        memcpy(&ethaddr, &eth->s_addr, sizeof(struct ether_addr));
        memcpy(&eth->s_addr, &eth->d_addr, sizeof(struct ether_addr));
        memcpy(&eth->d_addr, &ethaddr, sizeof(struct ether_addr));

        if (unlikely(EDPVS_OK != (ret = netif_xmit(mbuf, dev)))) {
            RTE_LOG(ERR, IPVS, "%s: netif_xmit failed -- %s\n",
                    __func__, dpvs_strerror(ret));
            /* should not set verdict to INET_DROP since netif_xmit
             * always consumes the mbuf while INET_DROP means mbuf'll
             * be freed in INET_HOOK. */
        }
        *verdict = INET_STOLEN;
        return 0;

    syn_rcv_out:
        /* drop and destroy the packet */
        *verdict = INET_DROP;
        return 0;
    }
    
  • syn_proxy_reuse_mbuf

    • 设置tcp选项
    • 计算syn+ack包的seq,syn cookies计算
    • 设置syn+ack包的seq和ack_seq
    • 交换ip和tcp首部的源,目的信息
    • 计算ip首部和tcp首部校验和
    /* Reuse the received SYN mbuf as the SYN/ACK reply; called by
     * dp_vs_synproxy_syn_rcv(). It does the following, in this order:
     * 1) rewrite tcp options,
     * 2) compute the initial sequence number with the cookie function,
     * 3) set the SYN|ACK flags, swap tcp ports, zero the window,
     * 4) set tcp seq (cookie) and ack_seq (client seq + 1),
     * 5) swap ip addresses and set ttl/hop limit,
     * 6) prepare checksums: pseudo-header checksum when PKT_TX_TCP_CKSUM is
     *    set in mbuf->ol_flags, software checksum otherwise.
     *
     * @param af    address family (AF_INET6 selects the IPv6 path)
     * @param mbuf  packet to rewrite in place
     * @param th    TCP header of the packet
     * @param opt   out: options parsed from the SYN, also fed to the cookie
     *
     * NOTE(review): the function returns void. The first mbuf_may_pull()
     * failure returns before anything is modified; a failure in the software
     * checksum path returns after the headers were already rewritten. The
     * caller cannot tell either case from success.
     */
    static void syn_proxy_reuse_mbuf(int af, struct rte_mbuf *mbuf,
                                     struct tcphdr *th,
                                     struct dp_vs_synproxy_opt *opt)
    {
        uint32_t isn;
        uint16_t tmpport;
        int      iphlen;

        /* IP header length */
        if (AF_INET6 == af)
        {
            iphlen = sizeof(struct ip6_hdr);
        }
        else
        {
            iphlen = ip4_hdrlen(mbuf);
        }

        /* the IP header plus the full TCP header (with options) must be
         * available before the options are parsed and rewritten */
        if (mbuf_may_pull(mbuf, iphlen + (th->doff << 2)) != 0)
        {
            return;
        }

        /* deal with tcp options */
        syn_proxy_parse_set_opts(mbuf, th, opt);

        /* get cookie: it becomes the seq of the SYN/ACK */
        if (AF_INET6 == af)
        {
            isn = syn_proxy_cookie_v6_init_sequence(mbuf, th, opt);
        }
        else
        {
            isn = syn_proxy_cookie_v4_init_sequence(mbuf, th, opt);
        }

        /* set syn-ack flag: byte 13 of the TCP header holds the flag bits,
         * 0x12 = SYN | ACK */
        ((uint8_t *)th)[13] = 0x12;

        /* exchange ports */
        tmpport    = th->dest;
        th->dest   = th->source;
        th->source = tmpport;

        /* set window size to zero */
        th->window = 0;

        /* set seq(cookie) and ack_seq; ack_seq must be derived from the
         * client's seq before th->seq is overwritten */
        th->ack_seq = htonl(ntohl(th->seq) + 1);
        th->seq     = htonl(isn);

        /* exchange addresses and prepare checksums */
        if (AF_INET6 == af)
        {
            struct in6_addr tmpaddr;
            struct ip6_hdr *ip6h = ip6_hdr(mbuf);

            tmpaddr        = ip6h->ip6_src;
            ip6h->ip6_src  = ip6h->ip6_dst;
            ip6h->ip6_dst  = tmpaddr;
            ip6h->ip6_hlim = dp_vs_synproxy_ctrl_synack_ttl;

            if (likely(mbuf->ol_flags & PKT_TX_TCP_CKSUM))
            {
                /* byte distance from the IPv6 header to the TCP header;
                 * computed on byte pointers because arithmetic on void
                 * pointers is a compiler extension, not ISO C */
                mbuf->l3_len = (uint8_t *)th - (uint8_t *)ip6h;
                mbuf->l4_len = ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr) - mbuf->l3_len;
                th->check    = ip6_phdr_cksum(ip6h, mbuf->ol_flags, mbuf->l3_len, IPPROTO_TCP);
            }
            else
            {
                if (mbuf_may_pull(mbuf, mbuf->pkt_len) != 0)
                {
                    return;
                }
                tcp6_send_csum((struct ipv6_hdr *)ip6h, th);
            }
        }
        else
        {
            uint32_t      tmpaddr;
            struct iphdr *iph = (struct iphdr *)ip4_hdr(mbuf);

            tmpaddr    = iph->saddr;
            iph->saddr = iph->daddr;
            iph->daddr = tmpaddr;
            iph->ttl   = dp_vs_synproxy_ctrl_synack_ttl;
            iph->tos   = 0;

            /* compute TCP checksum */
            if (likely(mbuf->ol_flags & PKT_TX_TCP_CKSUM))
            {
                mbuf->l3_len = iphlen;
                mbuf->l4_len = ntohs(iph->tot_len) - iphlen;
                th->check    = ip4_phdr_cksum((struct ipv4_hdr *)iph, mbuf->ol_flags);
            }
            else
            {
                if (mbuf_may_pull(mbuf, mbuf->pkt_len) != 0)
                {
                    return;
                }
                tcp4_send_csum((struct ipv4_hdr *)iph, th);
            }

            /* IP header checksum: cleared when PKT_TX_IP_CKSUM is set,
             * otherwise computed by ip4_send_csum() */
            if (likely(mbuf->ol_flags & PKT_TX_IP_CKSUM))
            {
                iph->check = 0;
            }
            else
            {
                ip4_send_csum((struct ipv4_hdr *)iph);
            }
        }
    }
    
  • syn_proxy_parse_set_opts

    /* Replace tcp options in tcp header, called by syn_proxy_reuse_mbuf() */
    static void syn_proxy_parse_set_opts(struct rte_mbuf *mbuf, struct tcphdr *th,
                                         struct dp_vs_synproxy_opt *opt)
    {
        /* mss in received packet */
        uint16_t        in_mss;
        uint32_t *      tmp;
        unsigned char * ptr;
        //计算tcp选项长度
        int             length   = (th->doff * 4) - sizeof(struct tcphdr);
        uint16_t        user_mss = dp_vs_synproxy_ctrl_init_mss;
        struct timespec tsp_now;
    
        memset(opt, '\\0', sizeof(struct dp_vs_synproxy_opt));
        opt->mss_clamp = 536;
        ptr            = (unsigned char *)(th + 1);
    
        while (length > 0)
        {
            unsigned char *tmp_opcode = ptr;
            int            opcode     = *ptr++;
            int            opsize;
    
            switch (opcode)
            {
            //选项结束,直接返回
            case TCPOPT_EOL:
                return;
            //NOP选项,只作填充用,因此选项长度减1,进入下一个循环处理下一个选项
            case TCPOPT_NOP:
                length--;
                continue;
    
            default:
                opsize = *ptr++;
                //如果不是选项表结束标志也不是空操作,则选取选项长度,并检测其合法性
                if (opsize < 2)  /* silly options */
                {
                    return;
                }
                //选项长度校验
                if (opsize > length)
                {
                    return; /* don't parse partial options */
                }
                switch (opcode)
                {
                case TCPOPT_MAXSEG:
                    //用来通告最大段长度,最大段长度选项格式如下
                    //kind=2|len=4|最大段长度
                    //该选项只能出现在SYN段片段中
                    if (opsize == TCPOLEN_MAXSEG)
                    {
                        in_mss = ntohs(*(uint16_t *)ptr);
                        if (in_mss)
                        {
                            //如果系统设置的mss小于对端通告的mss,使用较小值回复
                            if (user_mss < in_mss)
                            {
                                in_mss = user_mss;
                            }
                            opt->mss_clamp = in_mss;
                        }
                        //字节序转换
                        *(uint16_t *)ptr = htons(opt->mss_clamp);
                    }
                    break;
                //窗口选项
                case TCPOPT_WINDOW:
                    /**
                     * kind=3|len=3|位移数
                     * 去窗口扩大因子选项中的位移数,将标识SYN段中包含窗口扩大因子选项的wscale_ok置为1,
                     * 如果选项中位移数大于14则警告
                     */
                    if (opsize == TCPOLEN_WINDOW)
                    {
                        if (dp_vs_synproxy_ctrl_wscale)
                        {
                            opt->wscale_ok  = 1;
                            opt->snd_wscale = *(uint8_t *)ptr;
                            if (opt->snd_wscale > DP_VS_SYNPROXY_WSCALE_MAX)
                            {
                                RTE_LOG(INFO, IPVS, "tcp_parse_options: Illegal window "
                                        "scaling value %d > %d received.",
                                        opt->snd_wscale, DP_VS_SYNPROXY_WSCALE_MAX);
                                opt->snd_wscale = DP_VS_SYNPROXY_WSCALE_MAX;
                            }
                            *(uint8_t *)ptr = (uint8_t)dp_vs_synproxy_ctrl_wscale;
                        }
                        else
                        {
                            //不支持以NOP选项填充
                            memset(tmp_opcode, TCPOPT_NOP, TCPOLEN_WINDOW);
                        }
                    }
                    break;
                //时间戳选项
                case TCPOPT_TIMESTAMP:
                    if (opsize == TCPOLEN_TIMESTAMP)
                    {
                        if (dp_vs_synproxy_ctrl_timestamp)
                        {
                            memset(&tsp_now, 0, sizeof(tsp_now));
                            clock_gettime(CLOCK_REALTIME, &tsp_now);
                            opt->tstamp_ok = 1;
                            tmp            = (uint32_t *)ptr;
                            *(tmp + 1)     = *tmp;
                            *tmp           = htonl((uint32_t)(TCP_OPT_TIMESTAMP(tsp_now)));
                        }
                        else
                        {
                            memset(tmp_opcode, TCPOPT_NOP, TCPOLEN_TIMESTAMP);
                        }
                    }
                    break;
                
                case TCPOPT_SACK_PERMITTED:
                     //允许SACK选项,只能出现在SYN段中,将sack_ok置为1,标识syn中允许sack选项.
                    if (opsize == TCPOLEN_SACK_PERMITTED)
                    {
                        if (dp_vs_synproxy_ctrl_sack)
                        {
                            opt->sack_ok = 1;
                        }
                        else
                        {
                            memset(tmp_opcode, TCPOPT_NOP, TCPOLEN_SACK_PERMITTED);
                        }
                    }
                    break;
                }
                ptr    += opsize - 2;
                length -= opsize;
            }
        }
    }
    

 client第三次握手包应答

  • __dp_vs_in

    • client侧第三次握手包(ACK),在__dp_vs_pre_routing中肯定会返回ACCEPT,继续在__dp_vs_in中处理
    • 查找连接时不会命中,调用tcp传输层tcp_conn_sched函数进行新连接的调度
    /* Excerpt of the inbound hook: the part that runs when no existing
     * connection matched the packet (the lookup itself is elided). */
    static int __dp_vs_in(void *priv, struct rte_mbuf *mbuf,
                          const struct inet_hook_state *state, int af)
    {
    		....
    		/* No connection was found for this packet: let the protocol's
    		 * conn_sched pick a real server and create the connection. */
        if (unlikely(!conn))
        {
            /* try schedule RS and create new connection */
            /* Any result other than EDPVS_OK ends processing here and returns
             * the verdict filled in by conn_sched. */
            if (prot->conn_sched(prot, &iph, mbuf, &conn, &verdict) != EDPVS_OK)
            {
                /* RTE_LOG(DEBUG, IPVS, "%s: fail to schedule.\\n", __func__); */
                return(verdict);
            }
    
            /* only SNAT triggers connection by inside-outside traffic. */
            /* In SNAT mode the new connection is opened from the inside
             * (internal host -> dpvs -> external server), so the direction is
             * DPVS_CONN_DIR_OUTBOUND. */
            if (conn->dest->fwdmode == DPVS_FWD_MODE_SNAT)
            {
                dir = DPVS_CONN_DIR_OUTBOUND;
            }
            else
            {
                /* every other forwarding mode uses DPVS_CONN_DIR_INBOUND */
                dir = DPVS_CONN_DIR_INBOUND;
            }
        }
    		...
    }
    
  • tcp_conn_sched

    /* Excerpt of the TCP conn_sched callback: only the synproxy step 2 part
     * is shown, the rest is elided. */
    static int tcp_conn_sched(struct dp_vs_proto *proto,
                              const struct dp_vs_iphdr *iph,
                              struct rte_mbuf *mbuf,
                              struct dp_vs_conn **conn,
                              int *verdict)
    {
    		...
    		/* Syn-proxy step 2 logic: receive client's 3-handshake ack packet */
    
        /* When synproxy disabled, only SYN packets can arrive here.
         * So don't judge SYNPROXY flag here! If SYNPROXY flag judged, and syn_proxy
         * got disabled and keepalived reloaded, SYN packets for RS may never be sent. */
        /* dp_vs_synproxy_ack_rcv() returning 0 means it took a decision on the
         * packet and set *verdict; report EDPVS_PKTSTOLEN to the caller. */
        if (dp_vs_synproxy_ack_rcv(iph->af, mbuf, th, proto, conn, iph, verdict) == 0)
        {
            /* Attention: First ACK packet is also stored in conn->ack_mbuf */
            return(EDPVS_PKTSTOLEN);
        }
    		...
    }
    
  • dp_vs_synproxy_ack_rcv

    • syn cookies校验
    • dp_vs_schedule 新建立连接后端调度,选择一个real server
    • syn_proxy_send_rs_syn进行LB与RS的第一次握手
    /* Syn-proxy step 2 logic: receive the client's ACK.
     * Receive the client's 3-way-handshake ack packet, check the cookie and,
     * when it is valid, create a session and send a SYN to the real server.
     *
     * @param af       address family (AF_INET6 selects the IPv6 cookie check)
     * @param mbuf     the received ACK packet
     * @param th       TCP header of the packet
     * @param pp       protocol handler, passed on to syn_proxy_send_rs_syn()
     * @param cpp      out: connection created by dp_vs_schedule()
     * @param iph      parsed IP header info
     * @param verdict  out: hook verdict, only meaningful when 0 is returned
     *
     * @return 0 when the packet was handled here (*verdict is INET_DROP or
     * INET_STOLEN); 1 when it is not a pure ACK for a known service and the
     * caller should continue with it.
     */
    int dp_vs_synproxy_ack_rcv(int af, struct rte_mbuf *mbuf,
                               struct tcphdr *th, struct dp_vs_proto *pp,
                               struct dp_vs_conn **cpp,
                               const struct dp_vs_iphdr *iph, int *verdict)
    {
        int res;
        struct dp_vs_synproxy_opt opt;
        struct dp_vs_service *    svc;
        int res_cookie_check;

        /* Do not check svc syn-proxy flag, as it may be changed after syn-proxy step 1. */

        if (!th->syn && th->ack && !th->rst && !th->fin &&
            (svc = dp_vs_service_lookup(af, iph->proto, &iph->daddr,
                                        th->dest, 0, NULL, NULL, NULL, rte_lcore_id())))
        {
            if (dp_vs_synproxy_ctrl_defer &&
                !syn_proxy_ack_has_data(mbuf, iph, th))
            {
                /* Update statistics */
                dp_vs_estats_inc(SYNPROXY_NULL_ACK);

                /* We get a pure ack when expecting ack packet with payload, so
                 * have to drop it */
                *verdict = INET_DROP;
                return(0);
            }

            /* Cookie check: the cookie was sent as our seq, so the client
             * acknowledges cookie + 1. A mismatch is dropped below. */
            if (AF_INET6 == af)
            {
                res_cookie_check = syn_proxy_v6_cookie_check(mbuf,
                                                             ntohl(th->ack_seq) - 1, &opt);
            }
            else
            {
                res_cookie_check = syn_proxy_v4_cookie_check(mbuf,
                                                             ntohl(th->ack_seq) - 1, &opt);
            }
            if (!res_cookie_check)
            {
                /* Update statistics */
                dp_vs_estats_inc(SYNPROXY_BAD_ACK);
                /* Cookie check failed, drop the packet */
                RTE_LOG(DEBUG, IPVS, "%s: syn_cookie check failed seq=%u\n", __func__,
                        ntohl(th->ack_seq) - 1);
                *verdict = INET_DROP;
                return(0);
            }

            /* Update statistics */
            dp_vs_estats_inc(SYNPROXY_OK_ACK);

            /* Let the virtual server select a real server for the incoming connection,
             * and create a connection entry */
            *cpp = dp_vs_schedule(svc, iph, mbuf, 1, 0);
            if (unlikely(!*cpp))
            {
                RTE_LOG(WARNING, IPVS, "%s: dp_vs_schedule failed\n", __func__);

                /* FIXME: What to do when virtual service is available but no destination
                 * available for a new connection: send an icmp UNREACHABLE ? */
                *verdict = INET_DROP;
                return(0);
            }

            /* Start the handshake between the load balancer and the real
             * server. Do nothing but print an error msg on failure, because
             * the session will be correctly freed in dp_vs_conn_expire */
            if (EDPVS_OK != (res = syn_proxy_send_rs_syn(af, th, *cpp, mbuf, pp, &opt)))
            {
                RTE_LOG(ERR, IPVS, "%s: syn_proxy_send_rs_syn failed -- %s\n",
                        __func__, dpvs_strerror(res));
            }

            /* Count in the ack packet (STOLEN by synproxy) */
            dp_vs_stats_in(*cpp, mbuf);

            /* Active session timer, and dec refcnt.
             * Also steal the mbuf, and let caller return immediately */
            dp_vs_conn_put(*cpp);
            *verdict = INET_STOLEN;
            return(0);
        }

        return(1);
    }
    
  • syn_proxy_send_rs_syn

    /* Create a SYN packet from the client's ACK and send it to the real
     * server. A copy of the SYN is also stored in cp->syn_mbuf when syn
     * retransmission is turned on.
     *
     * @param af    address family (AF_INET6 builds an IPv6 header)
     * @param th    TCP header of the client's ACK (ports and seq are taken
     *              from it)
     * @param cp    connection the SYN belongs to
     * @param mbuf  the client's ACK packet (source of the IP addresses)
     * @param pp    protocol handler passed to cp->packet_xmit
     * @param opt   TCP options recovered from the cookie
     *
     * @return EDPVS_OK once the SYN was handed to cp->packet_xmit (its result
     * is not checked); EDPVS_INVAL when packet_xmit is NULL; EDPVS_NOROUTE
     * when no mbuf pool is found; EDPVS_NOMEM / EDPVS_NOROOM on allocation or
     * headroom failure.
     */
    static int syn_proxy_send_rs_syn(int af, const struct tcphdr *th,
                                     struct dp_vs_conn *cp, struct rte_mbuf *mbuf,
                                     struct dp_vs_proto *pp, struct dp_vs_synproxy_opt *opt)
    {
        int tcp_hdr_size;
        struct rte_mbuf *   syn_mbuf, *syn_mbuf_cloned;
        struct rte_mempool *pool;
        struct tcphdr *     syn_th;

        if (!cp->packet_xmit)
        {
            RTE_LOG(WARNING, IPVS, "%s: packet_xmit is null\n", __func__);
            return(EDPVS_INVAL);
        }

        /* Allocate mbuf from device mempool */
        pool = get_mbuf_pool(cp, DPVS_CONN_DIR_INBOUND);
        if (unlikely(!pool))
        {
            return(EDPVS_NOROUTE);
        }

        syn_mbuf = rte_pktmbuf_alloc(pool);
        if (unlikely(!syn_mbuf))
        {
            return(EDPVS_NOMEM);
        }
        syn_mbuf->userdata = NULL; /* make sure "no route info" */

        /* Reserve space for tcp header (fixed header plus the options that
         * will be written), taken from the mbuf headroom */
        tcp_hdr_size = (sizeof(struct tcphdr) + TCPOLEN_MAXSEG
                        + (opt->tstamp_ok ? TCPOLEN_TSTAMP_APPA : 0)
                        + (opt->wscale_ok ? TCP_OLEN_WSCALE_ALIGNED : 0)
                        /* SACK_PERM is in the place of NOP NOP of TS */
                        + ((opt->sack_ok && !opt->tstamp_ok) ? TCP_OLEN_SACKPERMITTED_ALIGNED : 0));
        syn_th = (struct tcphdr *)rte_pktmbuf_prepend(syn_mbuf, tcp_hdr_size);
        if (!syn_th)
        {
            rte_pktmbuf_free(syn_mbuf);
            return(EDPVS_NOROOM);
        }

        /* Set up tcp header */
        memset(syn_th, 0, tcp_hdr_size);
        syn_th->source              = th->source;
        syn_th->dest                = th->dest;
        /* the client's ACK carries its initial seq + 1; the SYN sent to the
         * real server uses that value minus 1 */
        syn_th->seq                 = htonl(ntohl(th->seq) - 1);
        syn_th->ack_seq             = 0;
        /* 16-bit word at byte offset 12: data offset in the top 4 bits,
         * flags in the low bits (0x02 = SYN) */
        *(((uint16_t *)syn_th) + 6) = htons(((tcp_hdr_size >> 2) << 12) | /*TH_SYN*/ 0x02);
        /* FIXME: what window should we use */
        syn_th->window  = htons(5000);
        syn_th->check   = 0;
        syn_th->urg_ptr = 0;
        syn_th->urg     = 0;

        /* write the tcp options right after the fixed header */
        syn_proxy_syn_build_options((uint32_t *)(syn_th + 1), opt);

        /* Build the IP header */
        if (AF_INET6 == af)
        {
            struct ip6_hdr *ack_ip6h;
            struct ip6_hdr *syn_ip6h;

            /* Reserve space for ipv6 header */
            syn_ip6h = (struct ip6_hdr *)rte_pktmbuf_prepend(syn_mbuf,
                                                             sizeof(struct ip6_hdr));
            if (!syn_ip6h)
            {
                rte_pktmbuf_free(syn_mbuf);
                return(EDPVS_NOROOM);
            }

            ack_ip6h = (struct ip6_hdr *)ip6_hdr(mbuf);

            /* Zero the header first: only the version byte is set below, so
             * the remaining traffic-class/flow-label bytes would otherwise
             * carry whatever was left in the mbuf. */
            memset(syn_ip6h, 0, sizeof(*syn_ip6h));
            syn_ip6h->ip6_vfc  = 0x60; /* IPv6 */
            syn_ip6h->ip6_src  = ack_ip6h->ip6_src;
            syn_ip6h->ip6_dst  = ack_ip6h->ip6_dst;
            syn_ip6h->ip6_plen = htons(tcp_hdr_size);
            syn_ip6h->ip6_nxt  = NEXTHDR_TCP;
            syn_ip6h->ip6_hlim = IPV6_DEFAULT_HOPLIMIT;

            syn_mbuf->l3_len = sizeof(*syn_ip6h);
        }
        else
        {
            struct iphdr *ack_iph;
            struct iphdr *syn_iph;

            /* Reserve space for ipv4 header */
            syn_iph = (struct iphdr *)rte_pktmbuf_prepend(syn_mbuf, sizeof(struct ipv4_hdr));
            if (!syn_iph)
            {
                rte_pktmbuf_free(syn_mbuf);
                return(EDPVS_NOROOM);
            }

            ack_iph = (struct iphdr *)ip4_hdr(mbuf);

            /* Zero the header first so that fields not assigned below (the
             * identification field) do not carry stale mbuf contents. */
            memset(syn_iph, 0, sizeof(struct ipv4_hdr));
            /* first 16-bit word: version 4, header length 5 words, and the
             * client's tos masked with 0x1E */
            *((uint16_t *)syn_iph) = htons((4 << 12) | (5 << 8) | (ack_iph->tos & 0x1E));
            syn_iph->tot_len       = htons(syn_mbuf->pkt_len);
            syn_iph->frag_off      = htons(IPV4_HDR_DF_FLAG);
            syn_iph->ttl           = 64;
            syn_iph->protocol      = IPPROTO_TCP;
            syn_iph->saddr         = ack_iph->saddr;
            syn_iph->daddr         = ack_iph->daddr;

            syn_mbuf->l3_len = sizeof(*syn_iph);

            /* checksum is done by fnat_in_handler */
            syn_iph->check = 0;
        }

        /* Save a copy of syn_mbuf if syn retransmission is on */
        if (dp_vs_synproxy_ctrl_syn_retry > 0)
        {
            syn_mbuf_cloned = mbuf_copy(syn_mbuf, pool);
            if (unlikely(!syn_mbuf_cloned))
            {
                rte_pktmbuf_free(syn_mbuf);
                return(EDPVS_NOMEM);
            }

            syn_mbuf_cloned->userdata = NULL;
            cp->syn_mbuf = syn_mbuf_cloned;
            sp_dbg_stats32_inc(sp_syn_saved);
            rte_atomic32_set(&cp->syn_retry_max, dp_vs_synproxy_ctrl_syn_retry);
        }

        /* TODO: Save info for fast_response_xmit */

        /* Count in the syn packet.
         * NOTE(review): this passes the client's ACK mbuf, not syn_mbuf, and
         * dp_vs_synproxy_ack_rcv() counts the same mbuf again after this call
         * -- confirm whether that is intended. */
        dp_vs_stats_in(cp, mbuf);

        /* If xmit failed, syn_mbuf will be freed correctly */
        cp->packet_xmit(pp, cp, syn_mbuf);

        return(EDPVS_OK);
    }
    

 rs端syn+ack应答

  • __dp_vs_in

    • 方向为DPVS_CONN_DIR_OUTBOUND
    • 此时能够查找到连接,最终会进入dp_vs_synproxy_synack_rcv逻辑
    /* Excerpt of the inbound hook: synproxy handling for a packet that matched
     * an existing connection. The surrounding code that sets conn, dir, iph,
     * prot and verdict is not shown. */
    static int __dp_vs_in(void *priv, struct rte_mbuf *mbuf,
                          const struct inet_hook_state *state, int af)
    {
    		if (conn->flags & DPVS_CONN_F_SYNPROXY)
        {
            if (dir == DPVS_CONN_DIR_INBOUND)
            {
                /* Filter out-in ack packet when cp is at SYN_SENT state.
                 * Drop it if not a valid packet, store it otherwise */
                /* A return of 0 means the filter decided the packet's fate:
                 * account for it, release the connection reference and
                 * return the verdict it set. */
                if (0 == dp_vs_synproxy_filter_ack(mbuf, conn, prot,
                                                   &iph, &verdict))
                {
                    dp_vs_stats_in(conn, mbuf);
                    dp_vs_conn_put(conn);
                    return(verdict);
                }
    
                /* "Reuse" synproxy sessions.
                 * "Reuse" means update syn_proxy_seq struct
                 * and clean ack_mbuf etc. */
                if (0 != dp_vs_synproxy_ctrl_conn_reuse)
                {
                    if (0 == dp_vs_synproxy_reuse_conn(af, mbuf, conn, prot,
                                                       &iph, &verdict))
                    {
                        dp_vs_stats_in(conn, mbuf);
                        dp_vs_conn_put(conn);
                        return(verdict);
                    }
                }
            }
            else
            {
                /* Syn-proxy 3 logic: receive syn-ack from rs */
                /* Packets in the other direction go to step 3; a return of 0
                 * means the packet was handled there, so it is counted as
                 * outbound and the verdict is returned. */
                if (dp_vs_synproxy_synack_rcv(mbuf, conn, prot,
                                              iph.len, &verdict) == 0)
                {
                    dp_vs_stats_out(conn, mbuf);
                    dp_vs_conn_put(conn);
                    return(verdict);
                }
            }
        }
    }
    
  • dp_vs_synproxy_synack_rcv

    /* Syn-proxy step 3 logic: receive the real server's SYN/ACK.
     * Update syn_proxy_seq.delta and send the stored ack mbufs to the rs.
     *
     * @param mbuf       packet received from the real server
     * @param cp         connection the packet belongs to
     * @param pp         protocol handler (timeout table, passed to xmit)
     * @param th_offset  offset of the TCP header inside mbuf
     * @param verdict    out: hook verdict, only meaningful when 0 is returned
     *
     * @return 0 when the packet was consumed here (*verdict is INET_DROP);
     * 1 when the caller should go on processing it. An RST received in
     * SYN_SENT state updates the connection and still returns 1.
     */
    int dp_vs_synproxy_synack_rcv(struct rte_mbuf *mbuf, struct dp_vs_conn *cp,
                                  struct dp_vs_proto *pp, int th_offset, int *verdict)
    {
        struct tcphdr _tcph, *th;
        struct dp_vs_synproxy_ack_pakcet *tmbuf, *tmbuf2;
        struct list_head   save_mbuf;
        struct dp_vs_dest *dest         = cp->dest;
        unsigned           conn_timeout = 0;

        /* th points at the TCP header */
        th = mbuf_header_pointer(mbuf, th_offset, sizeof(_tcph), &_tcph);
        if (unlikely(!th))
        {
            *verdict = INET_DROP;
            return(0);
        }

    #ifdef CONFIG_DPVS_IPVS_DEBUG
        RTE_LOG(DEBUG, IPVS, "%s: seq = %u ack_seq = %u %c%c%c cp->is_synproxy = %u "
                "cp->state = %u\n", __func__, ntohl(th->seq), ntohl(th->ack_seq),
                (th->syn) ? 'S' : '-',
                (th->ack) ? 'A' : '-',
                (th->rst) ? 'R' : '-',
                cp->flags & DPVS_CONN_F_SYNPROXY, cp->state);
    #endif

        INIT_LIST_HEAD(&save_mbuf);

        /* Expected case: a SYN/ACK (no RST) on a synproxy connection that is
         * still in SYN_SENT state */
        if ((th->syn) && (th->ack) && (!th->rst) &&
            (cp->flags & DPVS_CONN_F_SYNPROXY) &&
            (cp->state == DPVS_TCP_S_SYN_SENT))
        {
            /* difference between the seq given to the client and the one
             * chosen by the real server */
            cp->syn_proxy_seq.delta = ntohl(cp->syn_proxy_seq.isn) - ntohl(th->seq);

            /* the connection is now established; pick its timeout */
            cp->state = DPVS_TCP_S_ESTABLISHED;
            conn_timeout = dp_vs_get_conn_timeout(cp);
            if (unlikely((conn_timeout != 0) && (cp->proto == IPPROTO_TCP)))
            {
                cp->timeout.tv_sec = conn_timeout;
            }
            else
            {
                cp->timeout.tv_sec = pp->timeout_table[cp->state];
            }
            dpvs_time_rand_delay(&cp->timeout, 1000000);

            /* move the connection from inactive to active on the dest */
            if (dest)
            {
                rte_atomic32_inc(&dest->actconns);
                rte_atomic32_dec(&dest->inactconns);
                cp->flags &= ~DPVS_CONN_F_INACTIVE;
            }

            /* Save tcp sequence for fullnat/nat, inside to outside.
             * dest is checked here as well, matching the guard above. */
            if (dest &&
                (DPVS_FWD_MODE_NAT == dest->fwdmode ||
                 DPVS_FWD_MODE_FNAT == dest->fwdmode))
            {
                cp->rs_end_seq = htonl(ntohl(th->seq) + 1);
                cp->rs_end_ack = th->ack_seq;
    #ifdef CONFIG_DPVS_IPVS_DEBUG
                RTE_LOG(DEBUG, IPVS, "%s: packet from rs, seq = %u, ack_seq = %u, port %u => %u\n",
                        __func__, ntohl(th->seq), ntohl(th->ack_seq),
                        ntohs(th->source), ntohs(th->dest));
    #endif
            }

            /* TODO: ip_vs_synproxy_save_fast_xmit_info ? */

            /* Free stored syn mbuf, no need for retransmission any more */
            if (cp->syn_mbuf)
            {
                rte_pktmbuf_free(cp->syn_mbuf);
                cp->syn_mbuf = NULL;
                sp_dbg_stats32_dec(sp_syn_saved);
            }

            /* There must be at least one stored ack mbuf to forward */
            if (list_empty(&cp->ack_mbuf))
            {
                /*
                 * FIXME: Maybe a bug here, print err msg and go.
                 * Attention: cp->state has been changed and we
                 * should still DROP the syn/ack mbuf.
                 */
                RTE_LOG(ERR, IPVS, "%s: got ack_mbuf NULL pointer: ack-saved = %u\n",
                        __func__, cp->ack_num);
                *verdict = INET_DROP;
                return(0);
            }

            /* Window size has been set to zero in the syn-ack packet to Client.
             * If get more than one ack packet here,
             * it means client has sent a window probe after one RTO.
             * The probe will be forward to RS and RS will respond a window update.
             * So DPVS has no need to send a window update.
             */
            if (cp->ack_num == 1)
            {
                syn_proxy_send_window_update(tuplehash_out(cp).af, mbuf, cp, pp, th);
            }

            /* move every stored ack from the connection onto a local list */
            list_for_each_entry_safe(tmbuf, tmbuf2, &cp->ack_mbuf, list)
            {
                list_del_init(&tmbuf->list);
                cp->ack_num--;
                list_add_tail(&tmbuf->list, &save_mbuf);
            }
            assert(cp->ack_num == 0);

            /* send the stored packets to the real server; this includes the
             * client's third-handshake ack */
            list_for_each_entry_safe(tmbuf, tmbuf2, &save_mbuf, list)
            {
                list_del_init(&tmbuf->list);
                /* syn_mbuf will be freed correctly if xmit failed */
                cp->packet_xmit(pp, cp, tmbuf->mbuf);
                /* free dp_vs_synproxy_ack_pakcet */
                rte_mempool_put(this_ack_mbufpool, tmbuf);
                sp_dbg_stats32_dec(sp_ack_saved);
            }

            /* the real server's SYN/ACK itself is not forwarded to the
             * client, so it is dropped here */
            *verdict = INET_DROP;
            return(0);
        }
        else if ((th->rst) &&
                 (cp->flags & DPVS_CONN_F_SYNPROXY) &&
                 (cp->state == DPVS_TCP_S_SYN_SENT))
        {
            RTE_LOG(DEBUG, IPVS, "%s: get rst from rs, seq = %u ack_seq = %u\n",
                    __func__, ntohl(th->seq), ntohl(th->ack_seq));

            /* Count the delta of seq and move the connection to CLOSE */
            cp->syn_proxy_seq.delta = ntohl(cp->syn_proxy_seq.isn) - ntohl(th->seq);
            cp->state          = DPVS_TCP_S_CLOSE;
            cp->timeout.tv_sec = pp->timeout_table[cp->state];
            dpvs_time_rand_delay(&cp->timeout, 1000000);
            /* NOTE(review): if mbuf_header_pointer() handed back the local
             * copy _tcph, this write does not reach the packet -- confirm. */
            th->seq = htonl(ntohl(th->seq) + 1);
            //syn_proxy_seq_csum_update ?

            return(1);
        }
        return(1);
    }
    
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值