DPVS L2 packet processing

  • L2 packet processing entry function
void lcore_process_packets(struct netif_queue_conf *qconf, struct rte_mbuf **mbufs,
                           lcoreid_t cid, uint16_t count, bool pkts_from_ring)
{
    int i, t;
    struct ether_hdr *eth_hdr;
    struct rte_mbuf * mbuf_copied = NULL;

    /* prefetch packets */
    // prefetch a batch of mbufs up front, mainly to improve cache hit rate and throughput
    for (t = 0; t < count && t < NETIF_PKT_PREFETCH_OFFSET; t++)
    {
        rte_prefetch0(rte_pktmbuf_mtod(mbufs[t], void *));
    }

    /* L2 filter */
    for (i = 0; i < count; i++)
    {
        struct rte_mbuf *mbuf = mbufs[i];
        // look up the netif_port (NIC) this packet was received on
        struct netif_port *dev = netif_port_get(mbuf->port);
        // no matching net_device-level device: free the mbuf and count it as dropped
        if (unlikely(!dev))
        {
            rte_pktmbuf_free(mbuf);
            lcore_stats[cid].dropped++;
            continue;
        }
        // if the NIC is a bond slave, switch dev (and mbuf->port) to its bond master
        if (dev->type == PORT_TYPE_BOND_SLAVE)
        {
            dev        = dev->bond->slave.master;
            mbuf->port = dev->id;
        }
        // prefetch upcoming packets while the current one is being processed
        if (t < count)
        {
            rte_prefetch0(rte_pktmbuf_mtod(mbufs[t], void *));
            t++;
        }
        // get the Ethernet header and classify the L2 packet type: to this host (ETH_PKT_HOST),
        // broadcast, multicast, etc.
        eth_hdr = rte_pktmbuf_mtod(mbuf, struct ether_hdr *);
        /* reuse mbuf.packet_type, it was RTE_PTYPE_XXX */
        mbuf->packet_type = eth_type_parse(eth_hdr, dev);

        /*
         * In NETIF_PORT_FLAG_FORWARD2KNI mode,
         * all received packets are deep-copied and sent to KNI
         * for the purpose of capturing forwarded packets. Since the
         * rte_mbuf will be modified in the following procedure,
         * we should use mbuf_copy instead of rte_pktmbuf_clone.
         */
        // in FORWARD2KNI mode every packet is passed through to the kernel, so deep-copy it here
        if (dev->flag & NETIF_PORT_FLAG_FORWARD2KNI)
        {
            if (likely(NULL != (mbuf_copied = mbuf_copy(mbuf,
                                                        pktmbuf_pool[dev->socket]))))
            {
                kni_ingress(mbuf_copied, dev, qconf);
            }
            else
            {
                RTE_LOG(WARNING, NETIF, "%s: Failed to copy mbuf\n",
                        __func__);
            }
        }

        /*
         * do not drop pkt to other hosts (ETH_PKT_OTHERHOST)
         * since virtual devices may have a different MAC from the
         * underlying device.
         */

        /*
         * handle VLAN
         * even when HW offload strips the VLAN tag, the vlan
         * module is still needed to act as a VLAN filter.
         */
        // VLAN compatibility: if the ether type is 802.1Q, or the tag has already been stripped
        // by HW offload, run VLAN processing
        if (eth_hdr->ether_type == htons(ETH_P_8021Q) ||
            mbuf->ol_flags & PKT_RX_VLAN_STRIPPED)
        {
            // VLAN processing failed: free the mbuf and count it as dropped
            if (vlan_rcv(mbuf, netif_port_get(mbuf->port)) != EDPVS_OK)
            {
                rte_pktmbuf_free(mbuf);
                lcore_stats[cid].dropped++;
                continue;
            }
            // vlan_rcv() changes mbuf->port to the VLAN device's port id, so re-fetch dev here
            dev = netif_port_get(mbuf->port);
            if (unlikely(!dev))
            {
                rte_pktmbuf_free(mbuf);
                lcore_stats[cid].dropped++;
                continue;
            }
            // re-fetch eth_hdr: without HW VLAN stripping, vlan_rcv() removes the tag in
            // software, which moves the header and changes eth_hdr->ether_type
            eth_hdr = rte_pktmbuf_mtod(mbuf, struct ether_hdr *);
        }
        /* handler should free mbuf */
        // hand the packet off to the L2 delivery function
        netif_deliver_mbuf(mbuf, eth_hdr->ether_type, dev, qconf,
                           (dev->flag & NETIF_PORT_FLAG_FORWARD2KNI) ? true : false,
                           cid, pkts_from_ring);
        // update receive statistics
        lcore_stats[cid].ibytes += mbuf->pkt_len;
        lcore_stats[cid].ipackets++;
    }
}
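
  • eth_type_parse (illustrative sketch)

    The entry function above relies on eth_type_parse() to classify each frame by its destination
    MAC before any L3 work is done. The snippet below is a minimal, self-contained sketch of that
    kind of classification, for illustration only: the type names eth_addr/eth_hdr/eth_dev and the
    function eth_type_classify are simplified stand-ins, not the DPVS or DPDK definitions.

    #include <stdint.h>
    #include <string.h>

    #define ETH_ALEN 6

    /* simplified packet-type values, mirroring the kernel's PACKET_* semantics */
    enum {
        ETH_PKT_HOST,       /* destined to this host */
        ETH_PKT_BROADCAST,  /* broadcast frame */
        ETH_PKT_MULTICAST,  /* multicast frame */
        ETH_PKT_OTHERHOST   /* unicast to another host */
    };

    struct eth_addr { uint8_t bytes[ETH_ALEN]; };
    struct eth_hdr  { struct eth_addr dst, src; uint16_t type; };
    struct eth_dev  { struct eth_addr addr; };

    static int addr_equal(const struct eth_addr *a, const struct eth_addr *b)
    {
        return memcmp(a->bytes, b->bytes, ETH_ALEN) == 0;
    }

    static int addr_is_broadcast(const struct eth_addr *a)
    {
        static const struct eth_addr bc = { { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff } };
        return addr_equal(a, &bc);
    }

    static int addr_is_multicast(const struct eth_addr *a)
    {
        return a->bytes[0] & 0x01;  /* group bit of the first octet */
    }

    /* classify a frame the way mbuf->packet_type is set above */
    static int eth_type_classify(const struct eth_hdr *eh, const struct eth_dev *dev)
    {
        if (addr_equal(&eh->dst, &dev->addr))
            return ETH_PKT_HOST;
        if (addr_is_broadcast(&eh->dst))    /* broadcast also has the group bit, so test it first */
            return ETH_PKT_BROADCAST;
        if (addr_is_multicast(&eh->dst))
            return ETH_PKT_MULTICAST;
        return ETH_PKT_OTHERHOST;
    }

    Note that ETH_PKT_OTHERHOST frames are intentionally not dropped, because (as the comment in
    the function above explains) virtual devices may use a MAC address different from the
    underlying device's.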
  • netif_deliver_mbuf

    • L2 packet dispatch: KNI pass-through, ARP broadcast to other lcores, and pkt_type lookup
    static inline int netif_deliver_mbuf(struct rte_mbuf *mbuf,
                                         uint16_t eth_type,
                                         struct netif_port *dev,
                                         struct netif_queue_conf *qconf,
                                         bool forward2kni,
                                         lcoreid_t cid,
                                         bool pkts_from_ring)
    {
        struct pkt_type *pt;
        int      err;
        uint16_t data_off;
        // sanity checks
        assert(mbuf->port <= NETIF_MAX_PORTS);
        assert(dev != NULL);
        // pkt_type maps to the L3 handler, e.g. arp_pkt_type or ipv4_pkt_type
        pt = pkt_type_get(eth_type, dev);
        // pt == NULL means there is no handler registered for this protocol
        if (NULL == pt)
        {
            // if the packet has not been forwarded to KNI yet, send it there; otherwise free it
            if (!forward2kni)
            {
                kni_ingress(mbuf, dev, qconf);
            }
            else
            {
                rte_pktmbuf_free(mbuf);
            }
            return(EDPVS_OK);
        }
    
        /* clone arp pkt to every queue */
        // Why replicate ARP packets to all lcores? Presumably because DPVS keeps per-lcore,
        // lock-free state, so each lcore's neighbour subsystem needs its own full copy of the
        // ARP information. The !pkts_from_ring check keeps packets that already arrived via the
        // ring from being broadcast a second time.
        if (pt->type == rte_cpu_to_be_16(ETHER_TYPE_ARP) && !pkts_from_ring)
        {
            struct rte_mempool *mbuf_pool;
            struct rte_mbuf *   mbuf_clone;
            uint8_t             i;
            struct arp_hdr *    arp;
            unsigned            socket_id;
    
            socket_id = rte_socket_id();
            mbuf_pool = pktmbuf_pool[socket_id];
    
            rte_pktmbuf_adj(mbuf, sizeof(struct ether_hdr));
            arp = rte_pktmbuf_mtod(mbuf, struct arp_hdr *);
            rte_pktmbuf_prepend(mbuf, (uint16_t)sizeof(struct ether_hdr));
            // for ARP replies, clone the mbuf and rte_ring_enqueue() a copy to every other
            // forwarding lcore
            if (rte_be_to_cpu_16(arp->arp_op) == ARP_OP_REPLY)
            {
                for (i = 0; i < DPVS_MAX_LCORE; i++)
                {
                    if ((i == cid) || (!is_lcore_id_fwd(i)) ||
                        (i == rte_get_master_lcore()))
                    {
                        continue;
                    }
                    /*rte_pktmbuf_clone will not clone pkt.data, just copy pointer!*/
                    mbuf_clone = rte_pktmbuf_clone(mbuf, mbuf_pool);
                    if (mbuf_clone)
                    {
                        int ret = rte_ring_enqueue(arp_ring[i], mbuf_clone);
                        if (unlikely(-EDQUOT == ret))
                        {
                            RTE_LOG(WARNING, NETIF, "%s: arp ring of lcore %d quota exceeded\n",
                                    __func__, i);
                        }
                        else if (ret < 0)
                        {
                            RTE_LOG(WARNING, NETIF, "%s: arp ring of lcore %d enqueue failed\n",
                                    __func__, i);
                            rte_pktmbuf_free(mbuf_clone);
                        }
                    }
                }
            }
        }
        // record the L2 header length
        mbuf->l2_len = sizeof(struct ether_hdr);
        /* Remove ether_hdr at the beginning of an mbuf */
        data_off = mbuf->data_off;
        // advance the mbuf data pointer past the Ethernet header, so the L3 handler starts right
        // at its own header and lower layers never need to be re-parsed
        // note: if this length check fails, the mbuf does not appear to be freed (possible leak?)
        if (unlikely(NULL == rte_pktmbuf_adj(mbuf, sizeof(struct ether_hdr))))
        {
            return(EDPVS_INVPKT);
        }
        // invoke the pkt_type handler, e.g. ipv4_rcv for IPv4
        err = pt->func(mbuf, dev);
        // does the packet need to be handed to the Linux kernel via KNI?
        if (err == EDPVS_KNICONTINUE)
        {
            // if the packet came from another lcore's ring, or has already been copied to KNI,
            // free it to avoid delivering duplicate packets to the kernel
            if (pkts_from_ring || forward2kni)
            {
                rte_pktmbuf_free(mbuf);
                return(EDPVS_OK);
            }
            // restore the original start of the packet data: data_off recorded the mbuf data
            // offset before the Ethernet header was stripped, so prepend the difference here
            if (likely(NULL != rte_pktmbuf_prepend(mbuf,
                                                   (mbuf->data_off - data_off))))
            {
                kni_ingress(mbuf, dev, qconf);
            }
            else
            {
                rte_pktmbuf_free(mbuf);
            }
        }
    
        return(EDPVS_OK);
    }
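
  • pkt_type registration and lookup (illustrative sketch)

    netif_deliver_mbuf() dispatches on pkt_type_get(eth_type, dev): each L3 protocol registers a
    struct pkt_type whose func field is its receive handler (e.g. ipv4_rcv for IPv4), and the L2
    code looks the handler up per frame, falling back to KNI when nothing matches. The sketch
    below only illustrates that register/lookup pattern; the flat table, the names
    pkt_type_register/pkt_type_lookup, and the PKT_TYPE_MAX bound are hypothetical and much
    simpler than DPVS's real per-device registration.

    #include <stddef.h>
    #include <stdint.h>

    struct rte_mbuf;    /* opaque for this sketch */
    struct netif_port;  /* opaque for this sketch */

    struct pkt_type {
        uint16_t type;  /* ether type, network byte order */
        int (*func)(struct rte_mbuf *mbuf, struct netif_port *dev);  /* L3 receive handler */
    };

    #define PKT_TYPE_MAX 8
    static struct pkt_type *pkt_type_table[PKT_TYPE_MAX];

    /* called once at init time by each L3 protocol */
    static int pkt_type_register(struct pkt_type *pt)
    {
        for (int i = 0; i < PKT_TYPE_MAX; i++) {
            if (pkt_type_table[i] == NULL) {
                pkt_type_table[i] = pt;
                return 0;
            }
        }
        return -1;  /* table full */
    }

    /* called per frame from the L2 path; NULL means "no handler, hand it to KNI or drop" */
    static struct pkt_type *pkt_type_lookup(uint16_t eth_type)
    {
        for (int i = 0; i < PKT_TYPE_MAX; i++) {
            if (pkt_type_table[i] != NULL && pkt_type_table[i]->type == eth_type)
                return pkt_type_table[i];
        }
        return NULL;
    }

    In DPVS itself the lookup also takes the receiving device into account, which is why
    pkt_type_get() is called with dev as its second argument; the fallback to KNI when the lookup
    fails is exactly the NULL == pt branch shown above.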
    