ovs+dpdk场景下的tx checksum offload

一、checksum:
tcp checksum包括三部分:
1)、伪头部校验和;
   伪头部包括: 源ip、目的ip、协议号、tcp长度,主要用于校验数据包是否被正确的目的主机接收
2)、tcp头部校验和;
3)、数据部分校验和;

当硬件有checksum offload能力时,可以通过ethtool -K <网卡名> tx on|off设置是否将checksum offload到硬件处理;
如果将checksum offload到硬件,那协议栈只需要计算伪头部的校验和,然后将其存放在tcp->check里;并同时
将csum_start、csum_offset告诉硬件,csum_start表示硬件需要计算checksum的起始位置(tcp头部起始位置),
csum_offset表示硬件计算完checksum后将值存放的位置。

 

二、offload流程

1、传输层

tcp_sendmsg
    if (sk->sk_route_caps & NETIF_F_CSUM_MASK)(判断网卡是否有checksum offload功能,如果有,则将ip_summed置为CHECKSUM_PARTIAL)
        skb->ip_summed = CHECKSUM_PARTIAL;
        
    skb_copy_to_page_nocache
        skb_do_copy_data_nocache  (根据ip_summed值填充skb->csum;如果网卡没有offload能力,则这里先计算数据部分的checksum值,然后保存在skb->csum里;如果硬件有offload能力,仅仅只是将数据从用户态拷贝过来,不计算校验和)

        tcp_push_one
            tcp_write_xmit
                tcp_transmit_skb
                    icsk->icsk_af_ops->send_check
                        tcp_v4_send_check
                            __tcp_v4_send_check
                            void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)      

/*
 * Fill in the TCP checksum field of an outgoing IPv4 segment.
 *
 * skb:   segment whose TCP header is updated
 * saddr: source IPv4 address fed into the pseudo-header sum
 * daddr: destination IPv4 address fed into the pseudo-header sum
 */
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
    struct tcphdr *th = tcp_hdr(skb);

    if (skb->ip_summed != CHECKSUM_PARTIAL) {
        /*
         * No offload: finish the checksum here.
         *  1) skb->csum is the running sum passed in as the seed;
         *  2) csum_partial(th, th->doff << 2, skb->csum) adds the TCP
         *     header on top of that seed;
         *  3) tcp_v4_check() combines the result with the pseudo-header.
         */
        th->check = tcp_v4_check(skb->len, saddr, daddr,
                                 csum_partial(th, th->doff << 2, skb->csum));
        return;
    }

    /*
     * Offload requested: th->check only carries the (inverted)
     * pseudo-header sum; the TCP header and payload are left for later.
     */
    th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);

    /* Offset from skb->head at which checksumming starts: the TCP header. */
    skb->csum_start = skb_transport_header(skb) - skb->head;

    /* Offset inside the TCP header at which the result is stored. */
    skb->csum_offset = offsetof(struct tcphdr, check);
}

 

2、ip层
__ip_local_out
    __ip_local_out_sk
        ip_send_check(计算ip头的checksum,保存在iph->check)

3、virtio驱动层
start_xmit(virtio_net)    
    xmit_skb

static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
    ...
    if (skb->ip_summed == CHECKSUM_PARTIAL) {
        //消息头设置需要硬件checksum标志
        hdr->hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
        //将csum_start、csum_offset保存在virtio_net_hdr里
        hdr->hdr.csum_start = cpu_to_virtio16(vi->vdev,
        skb_checksum_start_offset(skb));
        hdr->hdr.csum_offset = cpu_to_virtio16(vi->vdev, skb->csum_offset);
    } else {
        hdr->hdr.flags = 0;
        hdr->hdr.csum_offset = hdr->hdr.csum_start = 0;
    }
}

4、dpdk层vhost_user口收包

virtio_dev_tx_split    
    copy_desc_to_mbuf
        vhost_dequeue_offload

static __rte_always_inline void
vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
{
    uint16_t l4_proto = 0;
    void *l4_hdr = NULL;
    struct tcp_hdr *tcp_hdr = NULL;

    if (hdr->flags == 0 && hdr->gso_type == VIRTIO_NET_HDR_GSO_NONE)
        return;
    
    parse_ethernet(m, &l4_proto, &l4_hdr);
    //判断virtio驱动是否有置为需要硬件做checksum
    if (hdr->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
        //这里csum_start如果等于m->l2_len + m->l3_len,表明csum_start
        //为L4的起始位置,然后根据csum_offset决定是tcp offload还是udp offload,或者sctp offload
        if (hdr->csum_start == (m->l2_len + m->l3_len)) {
            switch (hdr->csum_offset) {
                case (offsetof(struct tcp_hdr, cksum)):
                    if (l4_proto == IPPROTO_TCP)
                        m->ol_flags |= PKT_TX_TCP_CKSUM;
                    break;
                case (offsetof(struct udp_hdr, dgram_cksum)):
                    if (l4_proto == IPPROTO_UDP)
                        m->ol_flags |= PKT_TX_UDP_CKSUM;
                    break;
                case (offsetof(struct sctp_hdr, cksum)):
                    if (l4_proto == IPPROTO_SCTP)
                        m->ol_flags |= PKT_TX_SCTP_CKSUM;
                    break;
                default:
                    break;
            }
        }
    }

    if (l4_hdr && hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
        switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
            case VIRTIO_NET_HDR_GSO_TCPV6:
                tcp_hdr = l4_hdr;
                m->ol_flags |= PKT_TX_TCP_SEG;
                m->tso_segsz = hdr->gso_size;
                m->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
                break;
            case VIRTIO_NET_HDR_GSO_TCPV4:
                tcp_hdr = l4_hdr;
                m->ol_flags |= PKT_TX_TCP_SEG;
                m->tso_segsz = hdr->gso_size;
                m->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
							
                //tso offload场景下,硬件需要重新拆分数据包,这个标志区分是ipv4还是ipv6
                m->ol_flags |= PKT_TX_IP_CKSUM;
                break;
            case VIRTIO_NET_HDR_GSO_UDP:
                m->ol_flags |= PKT_TX_UDP_SEG;
                m->tso_segsz = hdr->gso_size;
                m->l4_len = sizeof(struct udp_hdr);
                break;
            default:
                RTE_LOG(WARNING, VHOST_DATA,
                    "unsupported gso type %u.\n", hdr->gso_type);
                break;
            }
        }			
        RTE_LOG(ERR, VHOST_DATA, "vhost_dequeue_offload  ol_flags %llu.\n", m->ol_flags);
    }
}

5、ovs将数据包转给dpdk口

/*
 * Tries to transmit the 'cnt' packets in 'pkts' on TX queue 'qid' of 'dev'.
 *
 * Packets are first handed to rte_eth_tx_prepare(); the ones it accepts
 * are pushed with rte_eth_tx_burst() until everything is sent or the
 * driver stops accepting packets.  Every packet that was not transmitted,
 * whether rejected by prepare or not taken by the burst call, is freed
 * here.
 *
 * Returns the number of packets that were NOT transmitted.
 */
static inline int
netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
                         struct rte_mbuf **pkts, int cnt)
{
    uint32_t nb_tx = 0;
    uint32_t nb_tx_prep;

    /* Run the DPDK prepare hook first (the original note says it computes
     * the pseudo-header checksum needed when TSO offload is enabled).
     * The result may be smaller than 'cnt', so it is kept separate:
     * 'cnt' must stay intact so rejected packets are still freed and
     * counted below. */
    nb_tx_prep = rte_eth_tx_prepare(dev->port_id, qid, pkts, cnt);

    while (nb_tx != nb_tx_prep) {
        uint32_t ret;

        ret = rte_eth_tx_burst(dev->port_id, qid, pkts + nb_tx,
                               nb_tx_prep - nb_tx);
        if (!ret) {
            break;
        }

        nb_tx += ret;
    }

    if (OVS_UNLIKELY(nb_tx != cnt)) {
        /* Free buffers, which we couldn't transmit, one at a time (each
         * packet could come from a different mempool) */
        int i;

        for (i = nb_tx; i < cnt; i++) {
            rte_pktmbuf_free(pkts[i]);
        }

        /* NOTE(review): debugging aid that forces a port reconfigure
         * after a failed transmit.  Confirm that 'netdev' is embedded in
         * struct dp_netdev_port (not a pointer); otherwise CONTAINER_OF
         * yields a bogus address. */
        struct dp_netdev_port *dp_port = CONTAINER_OF(&dev->up, struct dp_netdev_port, netdev);
        dp_port->need_reconfigure = true;
        int result = port_reconfigure(dp_port);
        VLOG_ERR_RL(&rl, "zzzz after port reconfigure ret: %d \n", result);
    }
    return cnt - nb_tx;
}

6、dpdk,ixgbe驱动将数据转给硬件

ixgbe_xmit_pkts

    ixgbe_set_xmit_ctx

/*
 * Fill one advanced TX context descriptor from a packet's offload request
 * and record the same parameters in the queue's context cache.
 *
 * txq:        TX queue; txq->ctx_curr selects the context slot that is
 *             programmed and cached.
 * ctx_txd:    context descriptor to write (type_tucmd_mlhl, vlan_macip_lens,
 *             mss_l4len_idx and seqnum_seed are stored).
 * ol_flags:   PKT_TX_* offload flags of the packet.
 * tx_offload: header lengths, TSO segment size and VLAN TCI of the packet.
 * mdata:      only read when RTE_LIBRTE_SECURITY is defined and
 *             PKT_TX_SEC_OFFLOAD is set; then interpreted as
 *             union ixgbe_crypto_tx_desc_md.
 */
static inline void  ixgbe_set_xmit_ctx(struct ixgbe_tx_queue *txq,
			volatile struct ixgbe_adv_tx_context_desc *ctx_txd,
			uint64_t ol_flags, union ixgbe_tx_offload tx_offload,
			__rte_unused uint64_t *mdata)
{
		uint32_t type_tucmd_mlhl;
		uint32_t mss_l4len_idx = 0;
		uint32_t ctx_idx;
		uint32_t vlan_macip_lens;
		union ixgbe_tx_offload tx_offload_mask;
		uint32_t seqnum_seed = 0;

		ctx_idx = txq->ctx_curr;
		// Start from an empty mask; each branch below turns on the
		// tx_offload fields that are relevant for this context.
		tx_offload_mask.data[0] = 0;
		tx_offload_mask.data[1] = 0;
		type_tucmd_mlhl = 0;

		/* Specify which HW CTX to upload. */
		mss_l4len_idx |= (ctx_idx << IXGBE_ADVTXD_IDX_SHIFT);

		if (ol_flags & PKT_TX_VLAN_PKT) {
			tx_offload_mask.vlan_tci |= ~0;
		}

		/* check if TCP segmentation required for this packet */
		// i.e. whether the hardware is asked to perform TSO
		if (ol_flags & PKT_TX_TCP_SEG) {
			/* implies IP cksum in IPv4 */
			// With hardware TSO the TCP and IP checksums have to be
			// recomputed, so the only decision here is whether the
			// context is programmed as IPv4 or IPv6.
			if (ol_flags & PKT_TX_IP_CKSUM)
				type_tucmd_mlhl = IXGBE_ADVTXD_TUCMD_IPV4 |
					IXGBE_ADVTXD_TUCMD_L4T_TCP |
					IXGBE_ADVTXD_DTYP_CTXT | IXGBE_ADVTXD_DCMD_DEXT;
			else
				type_tucmd_mlhl = IXGBE_ADVTXD_TUCMD_IPV6 |
					IXGBE_ADVTXD_TUCMD_L4T_TCP |
					IXGBE_ADVTXD_DTYP_CTXT | IXGBE_ADVTXD_DCMD_DEXT;

			// Pass the L2/L3/L4 lengths and the segment size on to the hardware
			tx_offload_mask.l2_len |= ~0;
			tx_offload_mask.l3_len |= ~0;
			tx_offload_mask.l4_len |= ~0;
			tx_offload_mask.tso_segsz |= ~0;
			mss_l4len_idx |= tx_offload.tso_segsz << IXGBE_ADVTXD_MSS_SHIFT;
			mss_l4len_idx |= tx_offload.l4_len << IXGBE_ADVTXD_L4LEN_SHIFT;
		} else { /* no TSO, check if hardware checksum is needed */
			// Without TSO the hardware only has to checksum the L4 header
			// plus payload; the IP checksum is redone only on request, so
			// test for that request here.
			if (ol_flags & PKT_TX_IP_CKSUM) {
				type_tucmd_mlhl = IXGBE_ADVTXD_TUCMD_IPV4;
				tx_offload_mask.l2_len |= ~0;
				tx_offload_mask.l3_len |= ~0;
			}

			// Select the L4 type and a fixed L4 header length per protocol.
			switch (ol_flags & PKT_TX_L4_MASK) {
			case PKT_TX_UDP_CKSUM:
				type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_UDP |
					IXGBE_ADVTXD_DTYP_CTXT | IXGBE_ADVTXD_DCMD_DEXT;
				mss_l4len_idx |= sizeof(struct udp_hdr) << IXGBE_ADVTXD_L4LEN_SHIFT;
				tx_offload_mask.l2_len |= ~0;
				tx_offload_mask.l3_len |= ~0;
				break;
			// Checksum-only (non-TSO) request for TCP
			case PKT_TX_TCP_CKSUM:
				type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_TCP |
					IXGBE_ADVTXD_DTYP_CTXT | IXGBE_ADVTXD_DCMD_DEXT;
				mss_l4len_idx |= sizeof(struct tcp_hdr) << IXGBE_ADVTXD_L4LEN_SHIFT;
				tx_offload_mask.l2_len |= ~0;
				tx_offload_mask.l3_len |= ~0;
				break;
			case PKT_TX_SCTP_CKSUM:
				type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_SCTP |
					IXGBE_ADVTXD_DTYP_CTXT | IXGBE_ADVTXD_DCMD_DEXT;
				mss_l4len_idx |= sizeof(struct sctp_hdr) << IXGBE_ADVTXD_L4LEN_SHIFT;
				tx_offload_mask.l2_len |= ~0;
				tx_offload_mask.l3_len |= ~0;
				break;
			default:
				// No L4 checksum requested
				type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_RSV |
					IXGBE_ADVTXD_DTYP_CTXT | IXGBE_ADVTXD_DCMD_DEXT;
				break;
			}
		}

		// Outer IP checksum request: the outer L3 length and l2_len are
		// packed into seqnum_seed.
		if (ol_flags & PKT_TX_OUTER_IP_CKSUM) {
			tx_offload_mask.outer_l2_len |= ~0;
			tx_offload_mask.outer_l3_len |= ~0;
			tx_offload_mask.l2_len |= ~0;
			seqnum_seed |= tx_offload.outer_l3_len
					   << IXGBE_ADVTXD_OUTER_IPLEN;
			seqnum_seed |= tx_offload.l2_len
					   << IXGBE_ADVTXD_TUNNEL_LEN;
		}
	#ifdef RTE_LIBRTE_SECURITY
		// Security offload: take SA index, encrypt flag and pad length
		// from the per-packet metadata.
		if (ol_flags & PKT_TX_SEC_OFFLOAD) {
			union ixgbe_crypto_tx_desc_md *md =
					(union ixgbe_crypto_tx_desc_md *)mdata;
			seqnum_seed |=
				(IXGBE_ADVTXD_IPSEC_SA_INDEX_MASK & md->sa_idx);
			type_tucmd_mlhl |= md->enc ?
					(IXGBE_ADVTXD_TUCMD_IPSEC_TYPE_ESP |
					IXGBE_ADVTXD_TUCMD_IPSEC_ENCRYPT_EN) : 0;
			type_tucmd_mlhl |=
				(md->pad_len & IXGBE_ADVTXD_IPSEC_ESP_LEN_MASK);
			tx_offload_mask.sa_idx |= ~0;
			tx_offload_mask.sec_pad_len |= ~0;
		}
	#endif

		// Remember what this context slot was programmed with: the flags,
		// the masked offload values and the mask itself.
		txq->ctx_cache[ctx_idx].flags = ol_flags;
		txq->ctx_cache[ctx_idx].tx_offload.data[0]  =
			tx_offload_mask.data[0] & tx_offload.data[0];
		txq->ctx_cache[ctx_idx].tx_offload.data[1]  =
			tx_offload_mask.data[1] & tx_offload.data[1];
		txq->ctx_cache[ctx_idx].tx_offload_mask    = tx_offload_mask;

		// Write the descriptor words; the MAC length field carries the
		// outer L2 length when an outer IP checksum is requested.
		ctx_txd->type_tucmd_mlhl = rte_cpu_to_le_32(type_tucmd_mlhl);
		vlan_macip_lens = tx_offload.l3_len;
		if (ol_flags & PKT_TX_OUTER_IP_CKSUM)
			vlan_macip_lens |= (tx_offload.outer_l2_len <<
						IXGBE_ADVTXD_MACLEN_SHIFT);
		else
			vlan_macip_lens |= (tx_offload.l2_len <<
						IXGBE_ADVTXD_MACLEN_SHIFT);
		vlan_macip_lens |= ((uint32_t)tx_offload.vlan_tci << IXGBE_ADVTXD_VLAN_SHIFT);
		ctx_txd->vlan_macip_lens = rte_cpu_to_le_32(vlan_macip_lens);
		ctx_txd->mss_l4len_idx   = rte_cpu_to_le_32(mss_l4len_idx);
		// NOTE(review): seqnum_seed is stored without rte_cpu_to_le_32(),
		// unlike the other three fields — confirm this is intended.
		ctx_txd->seqnum_seed     = seqnum_seed;
	}

 

 

  • 0
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值