- Entry function for layer-2 (link-layer) packet processing
void lcore_process_packets(struct netif_queue_conf *qconf, struct rte_mbuf **mbufs,
lcoreid_t cid, uint16_t count, bool pkts_from_ring)
{
int i, t;
struct ether_hdr *eth_hdr;
    struct rte_mbuf *mbuf_copied = NULL;
/* prefetch packets */
    // prefetch a batch of mbufs up front to raise the cache hit rate and improve efficiency
for (t = 0; t < count && t < NETIF_PKT_PREFETCH_OFFSET; t++)
{
rte_prefetch0(rte_pktmbuf_mtod(mbufs[t], void *));
}
/* L2 filter */
for (i = 0; i < count; i++)
{
struct rte_mbuf *mbuf = mbufs[i];
        // look up the NIC device (netif_port) this packet arrived on
struct netif_port *dev = netif_port_get(mbuf->port);
        // if no matching net_device-level device is found, free the mbuf and update drop stats
if (unlikely(!dev))
{
rte_pktmbuf_free(mbuf);
lcore_stats[cid].dropped++;
continue;
}
        // handle bonding: if the NIC is a bond slave, switch to the master device it is bound to
if (dev->type == PORT_TYPE_BOND_SLAVE)
{
dev = dev->bond->slave.master;
mbuf->port = dev->id;
}
        // while handling the current packet, prefetch the upcoming ones
if (t < count)
{
rte_prefetch0(rte_pktmbuf_mtod(mbufs[t], void *));
t++;
}
        // get the Ethernet header and classify the L2 packet type:
        // destined to this host (ETH_PKT_HOST), broadcast, multicast, etc.
eth_hdr = rte_pktmbuf_mtod(mbuf, struct ether_hdr *);
/* reuse mbuf.packet_type, it was RTE_PTYPE_XXX */
mbuf->packet_type = eth_type_parse(eth_hdr, dev);
        /*
         * In NETIF_PORT_FLAG_FORWARD2KNI mode, every received packet is
         * deep-copied and passed through to the kernel via KNI, so that
         * forwarded packets can be captured. Since the rte_mbuf will be
         * modified in the following procedure, we must use mbuf_copy
         * instead of rte_pktmbuf_clone (which only copies metadata and
         * shares the packet data).
         */
if (dev->flag & NETIF_PORT_FLAG_FORWARD2KNI)
{
if (likely(NULL != (mbuf_copied = mbuf_copy(mbuf,
pktmbuf_pool[dev->socket]))))
{
kni_ingress(mbuf_copied, dev, qconf);
}
else
{
                RTE_LOG(WARNING, NETIF, "%s: Failed to copy mbuf\n",
                        __func__);
}
}
        /*
         * Do not drop packets addressed to other hosts (ETH_PKT_OTHERHOST),
         * since a virtual device may have a different MAC address than its
         * underlying device.
         */
        /*
         * Handle VLAN.
         * Even when the hardware offloads VLAN stripping, the vlan module
         * is still needed to act as a VLAN filter.
         */
        // VLAN compatibility: run VLAN processing if the next protocol is 802.1Q,
        // or if the VLAN tag was already stripped by hardware offload
if (eth_hdr->ether_type == htons(ETH_P_8021Q) ||
mbuf->ol_flags & PKT_RX_VLAN_STRIPPED)
{
            // if VLAN processing fails, free the mbuf and update drop stats
if (vlan_rcv(mbuf, netif_port_get(mbuf->port)) != EDPVS_OK)
{
rte_pktmbuf_free(mbuf);
lcore_stats[cid].dropped++;
continue;
}
            // re-fetch the device: vlan_rcv changes mbuf->port to the port id of the matching VLAN device
dev = netif_port_get(mbuf->port);
if (unlikely(!dev))
{
rte_pktmbuf_free(mbuf);
lcore_stats[cid].dropped++;
continue;
}
            // re-fetch eth_hdr: when the hardware VLAN-strip offload is not enabled,
            // vlan_rcv removes the VLAN header in software, which changes eth_hdr->ether_type
eth_hdr = rte_pktmbuf_mtod(mbuf, struct ether_hdr *);
}
        // update RX statistics before handing the mbuf off: the handler is
        // expected to free the mbuf, so it must not be dereferenced afterwards
        lcore_stats[cid].ibytes += mbuf->pkt_len;
        lcore_stats[cid].ipackets++;
        /* handler should free mbuf */
        // deliver the packet up the stack
        netif_deliver_mbuf(mbuf, eth_hdr->ether_type, dev, qconf,
                           (dev->flag & NETIF_PORT_FLAG_FORWARD2KNI) ? true : false,
                           cid, pkts_from_ring);
}
}
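
The mbuf_copy-vs-clone point above deserves a concrete illustration: rte_pktmbuf_clone() only duplicates the mbuf metadata and takes a reference on the shared data buffer, so a clone handed to KNI would observe every modification made to the packet later in the pipeline. Below is a minimal single-segment sketch of a deep copy; the helper name mbuf_deep_copy and the exact metadata fields carried over are illustrative assumptions, not the actual DPVS mbuf_copy.

#include <rte_mbuf.h>
#include <rte_memcpy.h>

/* Illustrative single-segment deep copy (an assumption, NOT the real DPVS
 * mbuf_copy): allocate a fresh mbuf from `mp` and copy the payload so the
 * copy owns its own data buffer. rte_pktmbuf_clone() would only duplicate
 * the metadata and take a reference on the shared data, which is unsafe
 * here because the original mbuf is modified further down the pipeline. */
static struct rte_mbuf *mbuf_deep_copy(struct rte_mbuf *md, struct rte_mempool *mp)
{
    struct rte_mbuf *mc;
    char *dst;

    if (!rte_pktmbuf_is_contiguous(md))
        return NULL;                        /* keep the sketch single-segment */

    mc = rte_pktmbuf_alloc(mp);
    if (mc == NULL)
        return NULL;

    dst = rte_pktmbuf_append(mc, md->data_len);
    if (dst == NULL)                        /* payload larger than the data room */
    {
        rte_pktmbuf_free(mc);
        return NULL;
    }
    rte_memcpy(dst, rte_pktmbuf_mtod(md, const void *), md->data_len);

    /* carry over the receive-path metadata used above */
    mc->port = md->port;
    mc->packet_type = md->packet_type;
    mc->ol_flags = md->ol_flags;
    return mc;
}

A production version additionally has to walk the source mbuf's segment chain and preserve per-segment headroom, which is omitted here for brevity.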
netif_deliver_mbuf
- L2 packet dispatch: KNI pass-through, ARP broadcast to the other lcores, and pkt_type handler lookup
static inline int netif_deliver_mbuf(struct rte_mbuf *mbuf, uint16_t eth_type,
                                     struct netif_port *dev,
                                     struct netif_queue_conf *qconf,
                                     bool forward2kni, lcoreid_t cid,
                                     bool pkts_from_ring)
{
    struct pkt_type *pt;
    int err;
    uint16_t data_off;

    // sanity checks
    assert(mbuf->port <= NETIF_MAX_PORTS);
    assert(dev != NULL);

    // pkt_type maps to an L3 handler, e.g. arp_pkt_type or ipv4_pkt_type
    pt = pkt_type_get(eth_type, dev);
    // a NULL pt means no handler is registered for this protocol
    if (NULL == pt)
    {
        // if not already forwarded to KNI, pass the packet there; otherwise free it
        if (!forward2kni)
        {
            kni_ingress(mbuf, dev, qconf);
        }
        else
        {
            rte_pktmbuf_free(mbuf);
        }
        return EDPVS_OK;
    }

    /* clone arp pkt to every queue */
    // for ARP packets, copy to every queue. Why handle it this way? My guess:
    // a DPDK program keeps per-lcore, lock-free state, so the neighbour
    // subsystem must hold the full table on every core as well
    if (pt->type == rte_cpu_to_be_16(ETHER_TYPE_ARP) && !pkts_from_ring)
    {
        struct rte_mempool *mbuf_pool;
        struct rte_mbuf *mbuf_clone;
        uint8_t i;
        struct arp_hdr *arp;
        unsigned socket_id;

        socket_id = rte_socket_id();
        mbuf_pool = pktmbuf_pool[socket_id];

        rte_pktmbuf_adj(mbuf, sizeof(struct ether_hdr));
        arp = rte_pktmbuf_mtod(mbuf, struct arp_hdr *);
        rte_pktmbuf_prepend(mbuf, (uint16_t)sizeof(struct ether_hdr));

        // if arp_op is ARP_OP_REPLY, clone the mbuf and rte_ring_enqueue
        // one copy to every other forwarding lcore
        if (rte_be_to_cpu_16(arp->arp_op) == ARP_OP_REPLY)
        {
            for (i = 0; i < DPVS_MAX_LCORE; i++)
            {
                if ((i == cid) || (!is_lcore_id_fwd(i))
                        || (i == rte_get_master_lcore()))
                {
                    continue;
                }
                /* rte_pktmbuf_clone will not clone pkt.data, just copy pointer! */
                mbuf_clone = rte_pktmbuf_clone(mbuf, mbuf_pool);
                if (mbuf_clone)
                {
                    int ret = rte_ring_enqueue(arp_ring[i], mbuf_clone);
                    if (unlikely(-EDQUOT == ret))
                    {
                        RTE_LOG(WARNING, NETIF,
                                "%s: arp ring of lcore %d quota exceeded\n",
                                __func__, i);
                    }
                    else if (ret < 0)
                    {
                        RTE_LOG(WARNING, NETIF,
                                "%s: arp ring of lcore %d enqueue failed\n",
                                __func__, i);
                        rte_pktmbuf_free(mbuf_clone);
                    }
                }
            }
        }
    }

    // record the L2 header length
    mbuf->l2_len = sizeof(struct ether_hdr);

    /* Remove ether_hdr at the beginning of an mbuf */
    data_off = mbuf->data_off;
    // advance the mbuf to the L3 header; as the packet moves up the stack,
    // each layer points straight at its own header and avoids re-parsing.
    // length check; the original code returned without freeing the mbuf here,
    // a likely leak, so free it before returning
    if (unlikely(NULL == rte_pktmbuf_adj(mbuf, sizeof(struct ether_hdr))))
    {
        rte_pktmbuf_free(mbuf);
        return EDPVS_INVPKT;
    }

    // run the pkt_type handler, e.g. ipv4_rcv for IPv4
    err = pt->func(mbuf, dev);

    // does the packet still need to go to the Linux kernel via KNI?
    if (err == EDPVS_KNICONTINUE)
    {
        // if the packet came from another lcore's ring, or was already
        // forwarded to KNI, free it to prevent duplicates reaching the kernel
        if (pkts_from_ring || forward2kni)
        {
            rte_pktmbuf_free(mbuf);
            return EDPVS_OK;
        }
        // restore the original start of packet data; data_off saved the
        // original mbuf data offset before the packet was handed to L3
        if (likely(NULL != rte_pktmbuf_prepend(mbuf,
                        (mbuf->data_off - data_off))))
        {
            kni_ingress(mbuf, dev, qconf);
        }
        else
        {
            rte_pktmbuf_free(mbuf);
        }
    }
    return EDPVS_OK;
}
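
Two things complete the picture of the ARP broadcast above. First, the pkts_from_ring guard: the clones reach the other lcores through arp_ring[i], and when a worker dequeues them it re-enters this path with pkts_from_ring = true, which suppresses another round of cloning. A sketch of what the consumer side plausibly looks like; the burst-size constant and the exact loop shape are assumptions, not the verbatim DPVS code.

/* Sketch of the ring consumer (assumed shape; the real DPVS loop may differ
 * in detail). Each worker drains its own arp_ring and re-injects the cloned
 * ARP replies with pkts_from_ring = true, which is exactly why the broadcast
 * branch above tests !pkts_from_ring: a packet that arrived via the ring
 * must not be cloned and enqueued again, or it would circulate forever. */
static void lcore_process_arp_ring(struct netif_queue_conf *qconf, lcoreid_t cid)
{
    struct rte_mbuf *mbufs[NETIF_MAX_PKT_BURST];    /* burst size: assumption */
    unsigned nb;

    nb = rte_ring_dequeue_burst(arp_ring[cid], (void **)mbufs,
                                NETIF_MAX_PKT_BURST, NULL);
    if (nb > 0)
        lcore_process_packets(qconf, mbufs, cid, (uint16_t)nb, true);
}

Second, pt->func resolves to whatever L3 handler registered itself for the ether type. The sketch below shows the plausible shape of the IPv4 registration via netif_register_pkt; the field set is abbreviated and assumed, not copied from the DPVS source.

/* Plausible shape of an L3 handler registration (abbreviated; the real
 * struct pkt_type carries more fields). pkt_type_get(eth_type, dev) in
 * netif_deliver_mbuf returns this entry for IPv4 frames, and pt->func
 * then dispatches into ipv4_rcv. */
static struct pkt_type ip4_pkt_type = {
    .type = rte_cpu_to_be_16(ETHER_TYPE_IPv4), /* stored big-endian, matching the ARP test above */
    .func = ipv4_rcv,
    .port = NULL,                              /* NULL means "match any device" */
};

netif_register_pkt(&ip4_pkt_type);  /* typically called from the module's init */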