一、接收数据
DPDK中对数据的处理简化一下就如现在分析的这个程序,其实就是一个收发的过程,只要把基础的收发搞明白,再加上其它逻辑其实就是一个丰富的应用。在这里首先看一下此程序中DPDK如何接收数据。
接收数据包括下面的发送数据,其实都包含了非常多的知识点,除了前面已经分析过的相关知识外,还有:
1、DMA的具体知识
这个需要注意的是,现在IO设备自身也增加了DMA控制系统即现在DMA系统分两类,一类是内部DMA,一类是系统DMA(IO系统)。其它具体的DMA相关技术需要自己去学习相关的知识,包括软件方面的当然也有关联到的硬件方面的。
2、网卡与内存通信的协议
这个就包括相关一些说明文件和格式,特别是后者,包括一些描述符的性质等,应该是实际用到的需要认真的弄明白。
从宏观上讲,数据包会先通过物理途径到达网卡,然后网卡DMA将数据包写入到系统Ring收包队列(进入内存),上层应用mmap内存映射拿到数据。具体的流程如下:
1、CPU与DMA通信配置相关队列的信息(含头地址和大小等,物理地址),DMA会通过队列中描述符得到上层应用的虚拟地址,并将把收到数据包保存到buf地址内存(虚拟地址)即DMA会从接收方的描述符中得到数据缓冲地址指向上层应用的mbuf中的指定内存。
2、网卡读取接收Ring接收端的描述符同时获取上层应用的缓冲buf地址。
3、网卡接收网络数据到网卡本地FIFO缓冲区。
4、网卡的DMA控制器将数据写到指定的上层应用内存缓冲区mbuf。
5、网卡更新描述符队列(物理地址)中相关DD标志为1,即数据接收完成。
6、CPU读取DD标志,为1则表示可以读取网卡数据。
7、CPU读取相关系统队列(虚拟地址)中数据并申请新的mbuf替换该描述符(同1),转换相关地址并更新DMA物理地址和描述符中ring的DD标志为0,表示可以继续接收数据。
8、处理DMA寄存器的接收队列ring,此处是更新tail寄存器而不是将新的mbuf填充到tail寄存器。
此处的重点其实就是在于两个队列,即上层应用的系统内存队列(虚拟地址)与描述符接收队列(物理地址)一个动态的转换操作。有点类似于map映射,而不是强硬的进行增加和删除操作。
二、发送数据
发送数据也是如此,上层应用写入到内存然后mmap到DMA访问的内存拷贝到网卡,然后网卡将数据发送即可。其具体的流程如下:
1、mmap的内存地址写入数据。
2、CPU读取发送端的Ring队列(物理)中的DD标志是否为1,1表示发送完成。
3、将发送完成的描述符对应的缓冲区mbuf释放(虚拟)。
4、CPU将mbuf发送缓冲区(虚拟)填充到描述符的系统队列(虚拟)。
5、CPU将mbuf中的data物理地址转换到到后填充到发送端队列(物理)并置DD标志为0.
6、DMA控制器读取base寄存器得到发送端描述符并获取发送队列地址(物理),根据DD标志为0则读取mbuf数据(虚拟)到网卡缓存并发送。
7、DMA控制器置DD标志位为1并触发CPU数据发送成功消息。
发送相对接收似乎看上去简单一点啊。
其实从上面的分析可以看到很多细节其实需要对硬件和DMA及相关网卡中的控制器等需要有一个相对了解的过程,这也是前面提示大家需要对一些基础准备知识要有一个心理准备一样。
三、源码分析
对数据的收发进行分析后,结合源码来看一下:
1、接收数据
//在DPDK中有很多类似下面网卡的相关定义
static const struct eth_dev_ops ixgbe_eth_dev_ops = {
.dev_configure = ixgbe_dev_configure,
.dev_start = ixgbe_dev_start,
.dev_stop = ixgbe_dev_stop,
.dev_set_link_up = ixgbe_dev_set_link_up,
.dev_set_link_down = ixgbe_dev_set_link_down,
.dev_close = ixgbe_dev_close,
......
//注意下面的函数注册
.rx_queue_start = ixgbe_dev_rx_queue_start,
.rx_queue_stop = ixgbe_dev_rx_queue_stop,
.tx_queue_start = ixgbe_dev_tx_queue_start,
.tx_queue_stop = ixgbe_dev_tx_queue_stop,
.rx_queue_setup = ixgbe_dev_rx_queue_setup,
.rx_queue_intr_enable = ixgbe_dev_rx_queue_intr_enable,
.rx_queue_intr_disable = ixgbe_dev_rx_queue_intr_disable,
.rx_queue_release = ixgbe_dev_rx_queue_release,
.rx_queue_count = ixgbe_dev_rx_queue_count,
.rx_descriptor_done = ixgbe_dev_rx_descriptor_done,
.rx_descriptor_status = ixgbe_dev_rx_descriptor_status,
.tx_descriptor_status = ixgbe_dev_tx_descriptor_status,
.tx_queue_setup = ixgbe_dev_tx_queue_setup,
.tx_queue_release = ixgbe_dev_tx_queue_release,
......
.tm_ops_get = ixgbe_tm_ops_get,
};
int
rte_eth_rx_queue_setup(uint16_t port_id, uint16_t rx_queue_id,
uint16_t nb_rx_desc, unsigned int socket_id,
const struct rte_eth_rxconf *rx_conf,
struct rte_mempool *mp)
{
int ret;
......
//dev_ops->rx_queue_setup和上面的数据结构体的函数指针注册结合起来
ret = (*dev->dev_ops->rx_queue_setup)(dev, rx_queue_id, nb_rx_desc,
socket_id, &local_conf, mp);
if (!ret) {
if (!dev->data->min_rx_buf_size ||
dev->data->min_rx_buf_size > mbp_buf_size)
dev->data->min_rx_buf_size = mbp_buf_size;
}
return eth_err(port_id, ret);
}
//注册函数调用下面的函数,注意每个设备的注册可能都有不同
int __attribute__((cold))
ixgbe_dev_rx_queue_setup(struct rte_eth_dev *dev,
uint16_t queue_idx,
uint16_t nb_desc,
unsigned int socket_id,
const struct rte_eth_rxconf *rx_conf,
struct rte_mempool *mp)
{
const struct rte_memzone *rz;
struct ixgbe_rx_queue *rxq;
struct ixgbe_hw *hw;
uint16_t len;
struct ixgbe_adapter *adapter = dev->data->dev_private;
uint64_t offloads;
PMD_INIT_FUNC_TRACE();
hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
offloads = rx_conf->offloads | dev->data->dev_conf.rxmode.offloads;
/*
* Validate number of receive descriptors.
* It must not exceed hardware maximum, and must be multiple
* of IXGBE_ALIGN.
*/
if (nb_desc % IXGBE_RXD_ALIGN != 0 ||
(nb_desc > IXGBE_MAX_RING_DESC) ||
(nb_desc < IXGBE_MIN_RING_DESC)) {
return -EINVAL;
}
/* Free memory prior to re-allocation if needed... */
if (dev->data->rx_queues[queue_idx] != NULL) {
ixgbe_rx_queue_release(dev->data->rx_queues[queue_idx]);
dev->data->rx_queues[queue_idx] = NULL;
}
/* First allocate the rx queue data structure */
rxq = rte_zmalloc_socket("ethdev RX queue", sizeof(struct ixgbe_rx_queue),
RTE_CACHE_LINE_SIZE, socket_id);
if (rxq == NULL)
return -ENOMEM;
rxq->mb_pool = mp;
rxq->nb_rx_desc = nb_desc;
rxq->rx_free_thresh = rx_conf->rx_free_thresh;
rxq->queue_id = queue_idx;
rxq->reg_idx = (uint16_t)((RTE_ETH_DEV_SRIOV(dev).active == 0) ?
queue_idx : RTE_ETH_DEV_SRIOV(dev).def_pool_q_idx + queue_idx);
rxq->port_id = dev->data->port_id;
if (dev->data->dev_conf.rxmode.offloads & DEV_RX_OFFLOAD_KEEP_CRC)
rxq->crc_len = RTE_ETHER_CRC_LEN;
else
rxq->crc_len = 0;
rxq->drop_en = rx_conf->rx_drop_en;
rxq->rx_deferred_start = rx_conf->rx_deferred_start;
rxq->offloads = offloads;
/*
* The packet type in RX descriptor is different for different NICs.
* Some bits are used for x550 but reserved for other NICS.
* So set different masks for different NICs.
*/
if (hw->mac.type == ixgbe_mac_X550 ||
hw->mac.type == ixgbe_mac_X550EM_x ||
hw->mac.type == ixgbe_mac_X550EM_a ||
hw->mac.type == ixgbe_mac_X550_vf ||
hw->mac.type == ixgbe_mac_X550EM_x_vf ||
hw->mac.type == ixgbe_mac_X550EM_a_vf)
rxq->pkt_type_mask = IXGBE_PACKET_TYPE_MASK_X550;
else
rxq->pkt_type_mask = IXGBE_PACKET_TYPE_MASK_82599;
/*
* 82599 errata, UDP frames with a 0 checksum can be marked as checksum
* errors.
*/
if (hw->mac.type == ixgbe_mac_82599EB)
rxq->rx_udp_csum_zero_err = 1;
/*
* Allocate RX ring hardware descriptors. A memzone large enough to
* handle the maximum ring size is allocated in order to allow for
* resizing in later calls to the queue setup function.
*/
rz = rte_eth_dma_zone_reserve(dev, "rx_ring", queue_idx,
RX_RING_SZ, IXGBE_ALIGN, socket_id);
if (rz == NULL) {
ixgbe_rx_queue_release(rxq);
return -ENOMEM;
}
/*
* Zero init all the descriptors in the ring.
*/
memset(rz->addr, 0, RX_RING_SZ);
/*
* Modified to setup VFRDT for Virtual Function
*/
if (hw->mac.type == ixgbe_mac_82599_vf ||
hw->mac.type == ixgbe_mac_X540_vf ||
hw->mac.type == ixgbe_mac_X550_vf ||
hw->mac.type == ixgbe_mac_X550EM_x_vf ||
hw->mac.type == ixgbe_mac_X550EM_a_vf) {
rxq->rdt_reg_addr =
IXGBE_PCI_REG_ADDR(hw, IXGBE_VFRDT(queue_idx));
rxq->rdh_reg_addr =
IXGBE_PCI_REG_ADDR(hw, IXGBE_VFRDH(queue_idx));
} else {
rxq->rdt_reg_addr =
IXGBE_PCI_REG_ADDR(hw, IXGBE_RDT(rxq->reg_idx));
rxq->rdh_reg_addr =
IXGBE_PCI_REG_ADDR(hw, IXGBE_RDH(rxq->reg_idx));
}
rxq->rx_ring_phys_addr = rz->iova;
rxq->rx_ring = (union ixgbe_adv_rx_desc *) rz->addr;
/*
* Certain constraints must be met in order to use the bulk buffer
* allocation Rx burst function. If any of Rx queues doesn't meet them
* the feature should be disabled for the whole port.
*/
if (check_rx_burst_bulk_alloc_preconditions(rxq)) {
PMD_INIT_LOG(DEBUG, "queue[%d] doesn't meet Rx Bulk Alloc "
"preconditions - canceling the feature for "
"the whole port[%d]",
rxq->queue_id, rxq->port_id);
adapter->rx_bulk_alloc_allowed = false;
}
/*
* Allocate software ring. Allow for space at the end of the
* S/W ring to make sure look-ahead logic in bulk alloc Rx burst
* function does not access an invalid memory region.
*/
len = nb_desc;
if (adapter->rx_bulk_alloc_allowed)
len += RTE_PMD_IXGBE_RX_MAX_BURST;
rxq->sw_ring = rte_zmalloc_socket("rxq->sw_ring",
sizeof(struct ixgbe_rx_entry) * len,
RTE_CACHE_LINE_SIZE, socket_id);
if (!rxq->sw_ring) {
ixgbe_rx_queue_release(rxq);
return -ENOMEM;
}
/*
* Always allocate even if it's not going to be needed in order to
* simplify the code.
*
* This ring is used in LRO and Scattered Rx cases and Scattered Rx may
* be requested in ixgbe_dev_rx_init(), which is called later from
* dev_start() flow.
*/
rxq->sw_sc_ring =
rte_zmalloc_socket("rxq->sw_sc_ring",
sizeof(struct ixgbe_scattered_rx_entry) * len,
RTE_CACHE_LINE_SIZE, socket_id);
if (!rxq->sw_sc_ring) {
ixgbe_rx_queue_release(rxq);
return -ENOMEM;
}
PMD_INIT_LOG(DEBUG, "sw_ring=%p sw_sc_ring=%p hw_ring=%p "
"dma_addr=0x%"PRIx64,
rxq->sw_ring, rxq->sw_sc_ring, rxq->rx_ring,
rxq->rx_ring_phys_addr);
if (!rte_is_power_of_2(nb_desc)) {
PMD_INIT_LOG(DEBUG, "queue[%d] doesn't meet Vector Rx "
"preconditions - canceling the feature for "
"the whole port[%d]",
rxq->queue_id, rxq->port_id);
adapter->rx_vec_allowed = false;
} else
ixgbe_rxq_vec_setup(rxq);
dev->data->rx_queues[queue_idx] = rxq;
ixgbe_reset_rx_queue(adapter, rxq);
return 0;
}
在上面的注册函数中还有很多的函数,需要注意的是不同的设备可能注册的函数都有所不同,不要盲目的僵化套路这些功能。在数据结构体中注册的ixgbe_dev_start函数中,调用了ixgbe_dev_rx_init:
int __attribute__((cold))
ixgbe_dev_rx_init(struct rte_eth_dev *dev)
{
struct ixgbe_hw *hw;
struct ixgbe_rx_queue *rxq;
uint64_t bus_addr;
uint32_t rxctrl;
uint32_t fctrl;
uint32_t hlreg0;
uint32_t maxfrs;
uint32_t srrctl;
uint32_t rdrxctl;
uint32_t rxcsum;
uint16_t buf_size;
uint16_t i;
struct rte_eth_rxmode *rx_conf = &dev->data->dev_conf.rxmode;
int rc;
PMD_INIT_FUNC_TRACE();
hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
/*
* Make sure receives are disabled while setting
* up the RX context (registers, descriptor rings, etc.).
*/
rxctrl = IXGBE_READ_REG(hw, IXGBE_RXCTRL);
IXGBE_WRITE_REG(hw, IXGBE_RXCTRL, rxctrl & ~IXGBE_RXCTRL_RXEN);
/* Enable receipt of broadcasted frames */
fctrl = IXGBE_READ_REG(hw, IXGBE_FCTRL);
fctrl |= IXGBE_FCTRL_BAM;
fctrl |= IXGBE_FCTRL_DPF;
fctrl |= IXGBE_FCTRL_PMCF;
IXGBE_WRITE_REG(hw, IXGBE_FCTRL, fctrl);
/*
* Configure CRC stripping, if any.
*/
hlreg0 = IXGBE_READ_REG(hw, IXGBE_HLREG0);
if (rx_conf->offloads & DEV_RX_OFFLOAD_KEEP_CRC)
hlreg0 &= ~IXGBE_HLREG0_RXCRCSTRP;
else
hlreg0 |= IXGBE_HLREG0_RXCRCSTRP;
/*
* Configure jumbo frame support, if any.
*/
if (rx_conf->offloads & DEV_RX_OFFLOAD_JUMBO_FRAME) {
hlreg0 |= IXGBE_HLREG0_JUMBOEN;
maxfrs = IXGBE_READ_REG(hw, IXGBE_MAXFRS);
maxfrs &= 0x0000FFFF;
maxfrs |= (rx_conf->max_rx_pkt_len << 16);
IXGBE_WRITE_REG(hw, IXGBE_MAXFRS, maxfrs);
} else
hlreg0 &= ~IXGBE_HLREG0_JUMBOEN;
/*
* If loopback mode is configured, set LPBK bit.
*/
if (dev->data->dev_conf.lpbk_mode != 0) {
rc = ixgbe_check_supported_loopback_mode(dev);
if (rc < 0) {
PMD_INIT_LOG(ERR, "Unsupported loopback mode");
return rc;
}
hlreg0 |= IXGBE_HLREG0_LPBK;
} else {
hlreg0 &= ~IXGBE_HLREG0_LPBK;
}
IXGBE_WRITE_REG(hw, IXGBE_HLREG0, hlreg0);
/*
* Assume no header split and no VLAN strip support
* on any Rx queue first .
*/
rx_conf->offloads &= ~DEV_RX_OFFLOAD_VLAN_STRIP;
/* Setup RX queues */
for (i = 0; i < dev->data->nb_rx_queues; i++) {
rxq = dev->data->rx_queues[i];
/*
* Reset crc_len in case it was changed after queue setup by a
* call to configure.
*/
if (rx_conf->offloads & DEV_RX_OFFLOAD_KEEP_CRC)
rxq->crc_len = RTE_ETHER_CRC_LEN;
else
rxq->crc_len = 0;
/* Setup the Base and Length of the Rx Descriptor Rings */
bus_addr = rxq->rx_ring_phys_addr;
IXGBE_WRITE_REG(hw, IXGBE_RDBAL(rxq->reg_idx),
(uint32_t)(bus_addr & 0x00000000ffffffffULL));
IXGBE_WRITE_REG(hw, IXGBE_RDBAH(rxq->reg_idx),
(uint32_t)(bus_addr >> 32));
IXGBE_WRITE_REG(hw, IXGBE_RDLEN(rxq->reg_idx),
rxq->nb_rx_desc * sizeof(union ixgbe_adv_rx_desc));
IXGBE_WRITE_REG(hw, IXGBE_RDH(rxq->reg_idx), 0);
IXGBE_WRITE_REG(hw, IXGBE_RDT(rxq->reg_idx), 0);
/* Configure the SRRCTL register */
srrctl = IXGBE_SRRCTL_DESCTYPE_ADV_ONEBUF;
/* Set if packets are dropped when no descriptors available */
if (rxq->drop_en)
srrctl |= IXGBE_SRRCTL_DROP_EN;
/*
* Configure the RX buffer size in the BSIZEPACKET field of
* the SRRCTL register of the queue.
* The value is in 1 KB resolution. Valid values can be from
* 1 KB to 16 KB.
*/
buf_size = (uint16_t)(rte_pktmbuf_data_room_size(rxq->mb_pool) -
RTE_PKTMBUF_HEADROOM);
srrctl |= ((buf_size >> IXGBE_SRRCTL_BSIZEPKT_SHIFT) &
IXGBE_SRRCTL_BSIZEPKT_MASK);
IXGBE_WRITE_REG(hw, IXGBE_SRRCTL(rxq->reg_idx), srrctl);
buf_size = (uint16_t) ((srrctl & IXGBE_SRRCTL_BSIZEPKT_MASK) <<
IXGBE_SRRCTL_BSIZEPKT_SHIFT);
/* It adds dual VLAN length for supporting dual VLAN */
if (dev->data->dev_conf.rxmode.max_rx_pkt_len +
2 * IXGBE_VLAN_TAG_SIZE > buf_size)
dev->data->scattered_rx = 1;
if (rxq->offloads & DEV_RX_OFFLOAD_VLAN_STRIP)
rx_conf->offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
}
if (rx_conf->offloads & DEV_RX_OFFLOAD_SCATTER)
dev->data->scattered_rx = 1;
/*
* Device configured with multiple RX queues.
*/
ixgbe_dev_mq_rx_configure(dev);
/*
* Setup the Checksum Register.
* Disable Full-Packet Checksum which is mutually exclusive with RSS.
* Enable IP/L4 checkum computation by hardware if requested to do so.
*/
rxcsum = IXGBE_READ_REG(hw, IXGBE_RXCSUM);
rxcsum |= IXGBE_RXCSUM_PCSD;
if (rx_conf->offloads & DEV_RX_OFFLOAD_CHECKSUM)
rxcsum |= IXGBE_RXCSUM_IPPCSE;
else
rxcsum &= ~IXGBE_RXCSUM_IPPCSE;
IXGBE_WRITE_REG(hw, IXGBE_RXCSUM, rxcsum);
if (hw->mac.type == ixgbe_mac_82599EB ||
hw->mac.type == ixgbe_mac_X540) {
rdrxctl = IXGBE_READ_REG(hw, IXGBE_RDRXCTL);
if (rx_conf->offloads & DEV_RX_OFFLOAD_KEEP_CRC)
rdrxctl &= ~IXGBE_RDRXCTL_CRCSTRIP;
else
rdrxctl |= IXGBE_RDRXCTL_CRCSTRIP;
rdrxctl &= ~IXGBE_RDRXCTL_RSCFRSTSIZE;
IXGBE_WRITE_REG(hw, IXGBE_RDRXCTL, rdrxctl);
}
rc = ixgbe_set_rsc(dev);
if (rc)
return rc;
ixgbe_set_rx_function(dev);
return 0;
}
然后启动队列:
/*
* Start Receive Units for specified queue.
*/
int __attribute__((cold))
ixgbe_dev_rx_queue_start(struct rte_eth_dev *dev, uint16_t rx_queue_id)
{
struct ixgbe_hw *hw;
struct ixgbe_rx_queue *rxq;
uint32_t rxdctl;
int poll_ms;
PMD_INIT_FUNC_TRACE();
hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
rxq = dev->data->rx_queues[rx_queue_id];
/* Allocate buffers for descriptor rings */
if (ixgbe_alloc_rx_queue_mbufs(rxq) != 0) {
PMD_INIT_LOG(ERR, "Could not alloc mbuf for queue:%d",
rx_queue_id);
return -1;
}
rxdctl = IXGBE_READ_REG(hw, IXGBE_RXDCTL(rxq->reg_idx));
rxdctl |= IXGBE_RXDCTL_ENABLE;
IXGBE_WRITE_REG(hw, IXGBE_RXDCTL(rxq->reg_idx), rxdctl);
/* Wait until RX Enable ready */
poll_ms = RTE_IXGBE_REGISTER_POLL_WAIT_10_MS;
do {
rte_delay_ms(1);
rxdctl = IXGBE_READ_REG(hw, IXGBE_RXDCTL(rxq->reg_idx));
} while (--poll_ms && !(rxdctl & IXGBE_RXDCTL_ENABLE));
if (!poll_ms)
PMD_INIT_LOG(ERR, "Could not enable Rx Queue %d", rx_queue_id);
rte_wmb();
IXGBE_WRITE_REG(hw, IXGBE_RDH(rxq->reg_idx), 0);
IXGBE_WRITE_REG(hw, IXGBE_RDT(rxq->reg_idx), rxq->nb_rx_desc - 1);
dev->data->rx_queue_state[rx_queue_id] = RTE_ETH_QUEUE_STATE_STARTED;
return 0;
}
然后就可以真正接收数据了,在接收数据前得看看如何将函数指针挂接到相关的设备函数中:
static int
eth_ixgbevf_dev_init(struct rte_eth_dev *eth_dev)
{
int diag;
uint32_t tc, tcs;
struct ixgbe_adapter *ad = eth_dev->data->dev_private;
struct rte_pci_device *pci_dev = RTE_ETH_DEV_TO_PCI(eth_dev);
struct rte_intr_handle *intr_handle = &pci_dev->intr_handle;
struct ixgbe_hw *hw =
IXGBE_DEV_PRIVATE_TO_HW(eth_dev->data->dev_private);
struct ixgbe_vfta *shadow_vfta =
IXGBE_DEV_PRIVATE_TO_VFTA(eth_dev->data->dev_private);
struct ixgbe_hwstrip *hwstrip =
IXGBE_DEV_PRIVATE_TO_HWSTRIP_BITMAP(eth_dev->data->dev_private);
struct rte_ether_addr *perm_addr =
(struct rte_ether_addr *)hw->mac.perm_addr;
PMD_INIT_FUNC_TRACE();
eth_dev->dev_ops = &ixgbevf_eth_dev_ops;
eth_dev->rx_pkt_burst = &ixgbe_recv_pkts;//此处是RX
eth_dev->tx_pkt_burst = &ixgbe_xmit_pkts;//上处是TX
}
uint16_t
ixgbe_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
uint16_t nb_pkts)
{
struct ixgbe_rx_queue *rxq;
volatile union ixgbe_adv_rx_desc *rx_ring;
volatile union ixgbe_adv_rx_desc *rxdp;
struct ixgbe_rx_entry *sw_ring;
struct ixgbe_rx_entry *rxe;
struct rte_mbuf *rxm;
struct rte_mbuf *nmb;
union ixgbe_adv_rx_desc rxd;
uint64_t dma_addr;
uint32_t staterr;
uint32_t pkt_info;
uint16_t pkt_len;
uint16_t rx_id;
uint16_t nb_rx;
uint16_t nb_hold;
uint64_t pkt_flags;
uint64_t vlan_flags;
nb_rx = 0;
nb_hold = 0;
rxq = rx_queue;
rx_id = rxq->rx_tail;
rx_ring = rxq->rx_ring;
sw_ring = rxq->sw_ring;
vlan_flags = rxq->vlan_flags;
while (nb_rx < nb_pkts) {
/*
* The order of operations here is important as the DD status
* bit must not be read after any other descriptor fields.
* rx_ring and rxdp are pointing to volatile data so the order
* of accesses cannot be reordered by the compiler. If they were
* not volatile, they could be reordered which could lead to
* using invalid descriptor fields when read from rxd.
*/
rxdp = &rx_ring[rx_id];
staterr = rxdp->wb.upper.status_error;
if (!(staterr & rte_cpu_to_le_32(IXGBE_RXDADV_STAT_DD)))
break;
rxd = *rxdp;
/*
* End of packet.
*
* If the IXGBE_RXDADV_STAT_EOP flag is not set, the RX packet
* is likely to be invalid and to be dropped by the various
* validation checks performed by the network stack.
*
* Allocate a new mbuf to replenish the RX ring descriptor.
* If the allocation fails:
* - arrange for that RX descriptor to be the first one
* being parsed the next time the receive function is
* invoked [on the same queue].
*
* - Stop parsing the RX ring and return immediately.
*
* This policy do not drop the packet received in the RX
* descriptor for which the allocation of a new mbuf failed.
* Thus, it allows that packet to be later retrieved if
* mbuf have been freed in the mean time.
* As a side effect, holding RX descriptors instead of
* systematically giving them back to the NIC may lead to
* RX ring exhaustion situations.
* However, the NIC can gracefully prevent such situations
* to happen by sending specific "back-pressure" flow control
* frames to its peer(s).
*/
PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_id=%u "
"ext_err_stat=0x%08x pkt_len=%u",
(unsigned) rxq->port_id, (unsigned) rxq->queue_id,
(unsigned) rx_id, (unsigned) staterr,
(unsigned) rte_le_to_cpu_16(rxd.wb.upper.length));
nmb = rte_mbuf_raw_alloc(rxq->mb_pool);
if (nmb == NULL) {
PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u "
"queue_id=%u", (unsigned) rxq->port_id,
(unsigned) rxq->queue_id);
rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
break;
}
nb_hold++;
rxe = &sw_ring[rx_id];
rx_id++;
if (rx_id == rxq->nb_rx_desc)
rx_id = 0;
/* Prefetch next mbuf while processing current one. */
rte_ixgbe_prefetch(sw_ring[rx_id].mbuf);
/*
* When next RX descriptor is on a cache-line boundary,
* prefetch the next 4 RX descriptors and the next 8 pointers
* to mbufs.
*/
if ((rx_id & 0x3) == 0) {
rte_ixgbe_prefetch(&rx_ring[rx_id]);
rte_ixgbe_prefetch(&sw_ring[rx_id]);
}
rxm = rxe->mbuf;
rxe->mbuf = nmb;
dma_addr =
rte_cpu_to_le_64(rte_mbuf_data_iova_default(nmb));
rxdp->read.hdr_addr = 0;
rxdp->read.pkt_addr = dma_addr;
/*
* Initialize the returned mbuf.
* 1) setup generic mbuf fields:
* - number of segments,
* - next segment,
* - packet length,
* - RX port identifier.
* 2) integrate hardware offload data, if any:
* - RSS flag & hash,
* - IP checksum flag,
* - VLAN TCI, if any,
* - error flags.
*/
pkt_len = (uint16_t) (rte_le_to_cpu_16(rxd.wb.upper.length) -
rxq->crc_len);
rxm->data_off = RTE_PKTMBUF_HEADROOM;
rte_packet_prefetch((char *)rxm->buf_addr + rxm->data_off);
rxm->nb_segs = 1;
rxm->next = NULL;
rxm->pkt_len = pkt_len;
rxm->data_len = pkt_len;
rxm->port = rxq->port_id;
pkt_info = rte_le_to_cpu_32(rxd.wb.lower.lo_dword.data);
/* Only valid if PKT_RX_VLAN set in pkt_flags */
rxm->vlan_tci = rte_le_to_cpu_16(rxd.wb.upper.vlan);
pkt_flags = rx_desc_status_to_pkt_flags(staterr, vlan_flags);
pkt_flags = pkt_flags |
rx_desc_error_to_pkt_flags(staterr, (uint16_t)pkt_info,
rxq->rx_udp_csum_zero_err);
pkt_flags = pkt_flags |
ixgbe_rxd_pkt_info_to_pkt_flags((uint16_t)pkt_info);
rxm->ol_flags = pkt_flags;
rxm->packet_type =
ixgbe_rxd_pkt_info_to_pkt_type(pkt_info,
rxq->pkt_type_mask);
if (likely(pkt_flags & PKT_RX_RSS_HASH))
rxm->hash.rss = rte_le_to_cpu_32(
rxd.wb.lower.hi_dword.rss);
else if (pkt_flags & PKT_RX_FDIR) {
rxm->hash.fdir.hash = rte_le_to_cpu_16(
rxd.wb.lower.hi_dword.csum_ip.csum) &
IXGBE_ATR_HASH_MASK;
rxm->hash.fdir.id = rte_le_to_cpu_16(
rxd.wb.lower.hi_dword.csum_ip.ip_id);
}
/*
* Store the mbuf address into the next entry of the array
* of returned packets.
*/
rx_pkts[nb_rx++] = rxm;
}
rxq->rx_tail = rx_id;
/*
* If the number of free RX descriptors is greater than the RX free
* threshold of the queue, advance the Receive Descriptor Tail (RDT)
* register.
* Update the RDT with the value of the last processed RX descriptor
* minus 1, to guarantee that the RDT register is never equal to the
* RDH register, which creates a "full" ring situtation from the
* hardware point of view...
*/
nb_hold = (uint16_t) (nb_hold + rxq->nb_rx_hold);
if (nb_hold > rxq->rx_free_thresh) {
PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_tail=%u "
"nb_hold=%u nb_rx=%u",
(unsigned) rxq->port_id, (unsigned) rxq->queue_id,
(unsigned) rx_id, (unsigned) nb_hold,
(unsigned) nb_rx);
rx_id = (uint16_t) ((rx_id == 0) ?
(rxq->nb_rx_desc - 1) : (rx_id - 1));
IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
nb_hold = 0;
}
rxq->nb_rx_hold = nb_hold;
return nb_rx;
}
IXGBE_RXDADV_STAT_DD就是那个总提到的DD标志位,大家对应着相关的上面的说明并结合注释就基本弄明白了整个细节。
2、发送数据
发送的前的准备和接收前的准备有些类似,在上面的注册函数中都有,这里就不再拷贝进来,可以直接下载源码观看,重点看一下发送的代码:
uint16_t
ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
uint16_t nb_pkts)
{
struct ixgbe_tx_queue *txq;
struct ixgbe_tx_entry *sw_ring;
struct ixgbe_tx_entry *txe, *txn;
volatile union ixgbe_adv_tx_desc *txr;
volatile union ixgbe_adv_tx_desc *txd, *txp;
struct rte_mbuf *tx_pkt;
struct rte_mbuf *m_seg;
uint64_t buf_dma_addr;
uint32_t olinfo_status;
uint32_t cmd_type_len;
uint32_t pkt_len;
uint16_t slen;
uint64_t ol_flags;
uint16_t tx_id;
uint16_t tx_last;
uint16_t nb_tx;
uint16_t nb_used;
uint64_t tx_ol_req;
uint32_t ctx = 0;
uint32_t new_ctx;
union ixgbe_tx_offload tx_offload;
#ifdef RTE_LIBRTE_SECURITY
uint8_t use_ipsec;
#endif
tx_offload.data[0] = 0;
tx_offload.data[1] = 0;
txq = tx_queue;
sw_ring = txq->sw_ring;
txr = txq->tx_ring;
tx_id = txq->tx_tail;
txe = &sw_ring[tx_id];
txp = NULL;
/* Determine if the descriptor ring needs to be cleaned. */
if (txq->nb_tx_free < txq->tx_free_thresh)
ixgbe_xmit_cleanup(txq);
rte_prefetch0(&txe->mbuf->pool);
/* TX loop */
for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {
new_ctx = 0;
tx_pkt = *tx_pkts++;
pkt_len = tx_pkt->pkt_len;
/*
* Determine how many (if any) context descriptors
* are needed for offload functionality.
*/
ol_flags = tx_pkt->ol_flags;
#ifdef RTE_LIBRTE_SECURITY
use_ipsec = txq->using_ipsec && (ol_flags & PKT_TX_SEC_OFFLOAD);
#endif
/* If hardware offload required */
tx_ol_req = ol_flags & IXGBE_TX_OFFLOAD_MASK;
if (tx_ol_req) {
tx_offload.l2_len = tx_pkt->l2_len;
tx_offload.l3_len = tx_pkt->l3_len;
tx_offload.l4_len = tx_pkt->l4_len;
tx_offload.vlan_tci = tx_pkt->vlan_tci;
tx_offload.tso_segsz = tx_pkt->tso_segsz;
tx_offload.outer_l2_len = tx_pkt->outer_l2_len;
tx_offload.outer_l3_len = tx_pkt->outer_l3_len;
#ifdef RTE_LIBRTE_SECURITY
if (use_ipsec) {
union ixgbe_crypto_tx_desc_md *ipsec_mdata =
(union ixgbe_crypto_tx_desc_md *)
&tx_pkt->udata64;
tx_offload.sa_idx = ipsec_mdata->sa_idx;
tx_offload.sec_pad_len = ipsec_mdata->pad_len;
}
#endif
/* If new context need be built or reuse the exist ctx. */
ctx = what_advctx_update(txq, tx_ol_req,
tx_offload);
/* Only allocate context descriptor if required*/
new_ctx = (ctx == IXGBE_CTX_NUM);
ctx = txq->ctx_curr;
}
/*
* Keep track of how many descriptors are used this loop
* This will always be the number of segments + the number of
* Context descriptors required to transmit the packet
*/
nb_used = (uint16_t)(tx_pkt->nb_segs + new_ctx);
if (txp != NULL &&
nb_used + txq->nb_tx_used >= txq->tx_rs_thresh)
/* set RS on the previous packet in the burst */
txp->read.cmd_type_len |=
rte_cpu_to_le_32(IXGBE_TXD_CMD_RS);
/*
* The number of descriptors that must be allocated for a
* packet is the number of segments of that packet, plus 1
* Context Descriptor for the hardware offload, if any.
* Determine the last TX descriptor to allocate in the TX ring
* for the packet, starting from the current position (tx_id)
* in the ring.
*/
tx_last = (uint16_t) (tx_id + nb_used - 1);
/* Circular ring */
if (tx_last >= txq->nb_tx_desc)
tx_last = (uint16_t) (tx_last - txq->nb_tx_desc);
PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u pktlen=%u"
" tx_first=%u tx_last=%u",
(unsigned) txq->port_id,
(unsigned) txq->queue_id,
(unsigned) pkt_len,
(unsigned) tx_id,
(unsigned) tx_last);
/*
* Make sure there are enough TX descriptors available to
* transmit the entire packet.
* nb_used better be less than or equal to txq->tx_rs_thresh
*/
if (nb_used > txq->nb_tx_free) {
PMD_TX_FREE_LOG(DEBUG,
"Not enough free TX descriptors "
"nb_used=%4u nb_free=%4u "
"(port=%d queue=%d)",
nb_used, txq->nb_tx_free,
txq->port_id, txq->queue_id);
if (ixgbe_xmit_cleanup(txq) != 0) {
/* Could not clean any descriptors */
if (nb_tx == 0)
return 0;
goto end_of_tx;
}
/* nb_used better be <= txq->tx_rs_thresh */
if (unlikely(nb_used > txq->tx_rs_thresh)) {
PMD_TX_FREE_LOG(DEBUG,
"The number of descriptors needed to "
"transmit the packet exceeds the "
"RS bit threshold. This will impact "
"performance."
"nb_used=%4u nb_free=%4u "
"tx_rs_thresh=%4u. "
"(port=%d queue=%d)",
nb_used, txq->nb_tx_free,
txq->tx_rs_thresh,
txq->port_id, txq->queue_id);
/*
* Loop here until there are enough TX
* descriptors or until the ring cannot be
* cleaned.
*/
while (nb_used > txq->nb_tx_free) {
if (ixgbe_xmit_cleanup(txq) != 0) {
/*
* Could not clean any
* descriptors
*/
if (nb_tx == 0)
return 0;
goto end_of_tx;
}
}
}
}
/*
* By now there are enough free TX descriptors to transmit
* the packet.
*/
/*
* Set common flags of all TX Data Descriptors.
*
* The following bits must be set in all Data Descriptors:
* - IXGBE_ADVTXD_DTYP_DATA
* - IXGBE_ADVTXD_DCMD_DEXT
*
* The following bits must be set in the first Data Descriptor
* and are ignored in the other ones:
* - IXGBE_ADVTXD_DCMD_IFCS
* - IXGBE_ADVTXD_MAC_1588
* - IXGBE_ADVTXD_DCMD_VLE
*
* The following bits must only be set in the last Data
* Descriptor:
* - IXGBE_TXD_CMD_EOP
*
* The following bits can be set in any Data Descriptor, but
* are only set in the last Data Descriptor:
* - IXGBE_TXD_CMD_RS
*/
cmd_type_len = IXGBE_ADVTXD_DTYP_DATA |
IXGBE_ADVTXD_DCMD_IFCS | IXGBE_ADVTXD_DCMD_DEXT;
#ifdef RTE_LIBRTE_IEEE1588
if (ol_flags & PKT_TX_IEEE1588_TMST)
cmd_type_len |= IXGBE_ADVTXD_MAC_1588;
#endif
olinfo_status = 0;
if (tx_ol_req) {
if (ol_flags & PKT_TX_TCP_SEG) {
/* when TSO is on, paylen in descriptor is the
* not the packet len but the tcp payload len */
pkt_len -= (tx_offload.l2_len +
tx_offload.l3_len + tx_offload.l4_len);
}
/*
* Setup the TX Advanced Context Descriptor if required
*/
if (new_ctx) {
volatile struct ixgbe_adv_tx_context_desc *
ctx_txd;
ctx_txd = (volatile struct
ixgbe_adv_tx_context_desc *)
&txr[tx_id];
txn = &sw_ring[txe->next_id];
rte_prefetch0(&txn->mbuf->pool);
if (txe->mbuf != NULL) {
rte_pktmbuf_free_seg(txe->mbuf);
txe->mbuf = NULL;
}
ixgbe_set_xmit_ctx(txq, ctx_txd, tx_ol_req,
tx_offload, &tx_pkt->udata64);
txe->last_id = tx_last;
tx_id = txe->next_id;
txe = txn;
}
/*
* Setup the TX Advanced Data Descriptor,
* This path will go through
* whatever new/reuse the context descriptor
*/
cmd_type_len |= tx_desc_ol_flags_to_cmdtype(ol_flags);
olinfo_status |= tx_desc_cksum_flags_to_olinfo(ol_flags);
olinfo_status |= ctx << IXGBE_ADVTXD_IDX_SHIFT;
}
olinfo_status |= (pkt_len << IXGBE_ADVTXD_PAYLEN_SHIFT);
#ifdef RTE_LIBRTE_SECURITY
if (use_ipsec)
olinfo_status |= IXGBE_ADVTXD_POPTS_IPSEC;
#endif
m_seg = tx_pkt;
do {
txd = &txr[tx_id];
txn = &sw_ring[txe->next_id];
rte_prefetch0(&txn->mbuf->pool);
if (txe->mbuf != NULL)
rte_pktmbuf_free_seg(txe->mbuf);
txe->mbuf = m_seg;
/*
* Set up Transmit Data Descriptor.
*/
slen = m_seg->data_len;
buf_dma_addr = rte_mbuf_data_iova(m_seg);
txd->read.buffer_addr =
rte_cpu_to_le_64(buf_dma_addr);
txd->read.cmd_type_len =
rte_cpu_to_le_32(cmd_type_len | slen);
txd->read.olinfo_status =
rte_cpu_to_le_32(olinfo_status);
txe->last_id = tx_last;
tx_id = txe->next_id;
txe = txn;
m_seg = m_seg->next;
} while (m_seg != NULL);
/*
* The last packet data descriptor needs End Of Packet (EOP)
*/
cmd_type_len |= IXGBE_TXD_CMD_EOP;
txq->nb_tx_used = (uint16_t)(txq->nb_tx_used + nb_used);
txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_used);
/* Set RS bit only on threshold packets' last descriptor */
if (txq->nb_tx_used >= txq->tx_rs_thresh) {
PMD_TX_FREE_LOG(DEBUG,
"Setting RS bit on TXD id="
"%4u (port=%d queue=%d)",
tx_last, txq->port_id, txq->queue_id);
cmd_type_len |= IXGBE_TXD_CMD_RS;
/* Update txq RS bit counters */
txq->nb_tx_used = 0;
txp = NULL;
} else
txp = txd;
txd->read.cmd_type_len |= rte_cpu_to_le_32(cmd_type_len);
}
end_of_tx:
/* set RS on last packet in the burst */
if (txp != NULL)
txp->read.cmd_type_len |= rte_cpu_to_le_32(IXGBE_TXD_CMD_RS);
rte_wmb();
/*
* Set the Transmit Descriptor Tail (TDT)
*/
PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u tx_tail=%u nb_tx=%u",
(unsigned) txq->port_id, (unsigned) txq->queue_id,
(unsigned) tx_id, (unsigned) nb_tx);
IXGBE_PCI_REG_WRITE_RELAXED(txq->tdt_reg_addr, tx_id);
txq->tx_tail = tx_id;
return nb_tx;
}
有了源码,有了流程,其实再学习整个功能就没有不可逾越的坎了。
3、系统ring和描述符ring映射代码
上文分析过这两个队列挺重要,并做了一个转换的动作,看一下其趁着的源码:
static int __attribute__((cold))
ixgbe_alloc_rx_queue_mbufs(struct ixgbe_rx_queue *rxq)
{
struct ixgbe_rx_entry *rxe = rxq->sw_ring;
uint64_t dma_addr;
unsigned int i;
/* Initialize software ring entries */
for (i = 0; i < rxq->nb_rx_desc; i++) {
volatile union ixgbe_adv_rx_desc *rxd;
struct rte_mbuf *mbuf = rte_mbuf_raw_alloc(rxq->mb_pool);
if (mbuf == NULL) {
PMD_INIT_LOG(ERR, "RX mbuf alloc failed queue_id=%u",
(unsigned) rxq->queue_id);
return -ENOMEM;
}
mbuf->data_off = RTE_PKTMBUF_HEADROOM;
mbuf->port = rxq->port_id;
dma_addr =
rte_cpu_to_le_64(rte_mbuf_data_iova_default(mbuf));
rxd = &rxq->rx_ring[i];
rxd->read.hdr_addr = 0;
rxd->read.pkt_addr = dma_addr;
rxe[i].mbuf = mbuf;
}
return 0;
}
在ixgbe_dev_rx_queue_setup可以看到rx_ring和sw_ring的创建,但其真正的关联是在上面的函数中完成的,当然,不同的设备可能还是有所不同请大家注意。
四、分析汇总
通过上面的源码分析和流程说明,其实发现,DPDK的应用可以分成两个层次,一个层次是偏向于上层应用的,可以用软件的编译思维去解决;另外一个是硬件层次的,毕竟DPDK是一种数据平台框架,它一定最终落实到某个型号的网卡上。那么这其实就是类似于驱动层次了,需要对硬件的各种情况非常了解,这就需要对Datasheet进行详细的掌握。当然可能对于常见的几种网卡和常见的应用形式,其实第二个层次基本已经成熟,仍然是聚集到第一个层次。
但是,无论从学习的角度还是从最终应用的角度,这两个层次的知识都要深入掌握。其实还有个重要的方向,就是内核,包括虚拟化的方向。
路漫漫其修远兮!
五、总结
虽然侯教师说“源码面前,了无秘密”。可对于大的框架型软件来说,阅读代码确实相当吃力,特别是国内的编程水平和编程风格和国外的还有很大的不同,特别是一些具体的处理手段,有些高明之处不是一下两下就能看明白的。还有就是可能应用的具体的技术和国内也有不同,这就导致相关的代码阅读对于一些硬上的同学们非常有难度。
但还是那个字,干!