dpdk驱动根据Intel网卡驱动进行修改简化而来,减少内存拷贝;用轮询模式取代Intel网卡驱动的中断取数据模式。
内存初始化
首先来看内存的初始化:int ret = rte_eal_init(argc, argv);
/*
 * Initialize the DPDK Environment Abstraction Layer (EAL).
 *
 * Parses command-line arguments, sets up hugepage-backed memory, the
 * PCI subsystem, logging, timers and interrupt handling, then spawns
 * one worker thread per slave lcore and finally probes PCI devices.
 *
 * Returns the number of arguments consumed (fctret) on success.
 * Returns -1 if called more than once; panics on most init failures.
 */
int
rte_eal_init(int argc, char **argv)
{
int i, fctret, ret;
pthread_t thread_id;
static rte_atomic32_t run_once = RTE_ATOMIC32_INIT(0);
struct shared_driver *solib = NULL;
const char *logid;
char cpuset[RTE_CPU_AFFINITY_STR_LEN];
/* guard: EAL may only be initialized once per process */
if (!rte_atomic32_test_and_set(&run_once))
return -1;
/* use the executable's basename as the log identifier */
logid = strrchr(argv[0], '/');
logid = strdup(logid ? logid + 1: argv[0]);
thread_id = pthread_self();
if (rte_eal_log_early_init() < 0)
rte_panic("Cannot init early logs\n");
if (rte_eal_cpu_init() < 0)
rte_panic("Cannot detect lcores\n");
fctret = eal_parse_args(argc, argv);
if (fctret < 0)
exit(1);
/* set log level as early as possible */
rte_set_log_level(internal_config.log_level);
if (internal_config.no_hugetlbfs == 0 &&
internal_config.process_type != RTE_PROC_SECONDARY &&
internal_config.xen_dom0_support == 0 &&
eal_hugepage_info_init() < 0)/* collect hugepage info and pick a usable hugetlbfs mount */
rte_panic("Cannot get hugepage information\n");
if (internal_config.memory == 0 && internal_config.force_sockets == 0) {
if (internal_config.no_hugetlbfs)
internal_config.memory = MEMSIZE_IF_NO_HUGE_PAGE;
else
internal_config.memory = eal_get_hugepage_mem_size();
}
if (internal_config.vmware_tsc_map == 1) {
#ifdef RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT
rte_cycles_vmware_tsc_map = 1;
RTE_LOG (DEBUG, EAL, "Using VMWARE TSC MAP, "
"you must have monitor_control.pseudo_perfctr = TRUE\n");
#else
RTE_LOG (WARNING, EAL, "Ignoring --vmware-tsc-map because "
"RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT is not set\n");
#endif
}
rte_srand(rte_rdtsc());
rte_config_init();/* mmap the shared runtime configuration */
if (rte_eal_pci_init() < 0)
rte_panic("Cannot init PCI\n");/* scan the PCI bus */
#ifdef RTE_LIBRTE_IVSHMEM
if (rte_eal_ivshmem_init() < 0)
rte_panic("Cannot init IVSHMEM\n");
#endif
if (rte_eal_memory_init() < 0)/* allocate and map memory according to the hugepage configuration */
rte_panic("Cannot init memory\n");
/* the directories are locked during eal_hugepage_info_init */
eal_hugedirs_unlock();
if (rte_eal_memzone_init() < 0)/* set up the memzone allocator on top of the memsegs */
rte_panic("Cannot init memzone\n");
if (rte_eal_tailqs_init() < 0)
rte_panic("Cannot init tail queues for objects\n");
#ifdef RTE_LIBRTE_IVSHMEM
if (rte_eal_ivshmem_obj_init() < 0)
rte_panic("Cannot init IVSHMEM objects\n");
#endif
if (rte_eal_log_init(logid, internal_config.syslog_facility) < 0)
rte_panic("Cannot init logs\n");
if (rte_eal_alarm_init() < 0)
rte_panic("Cannot init interrupt-handling thread\n");
if (rte_eal_intr_init() < 0)/* interrupt-handling thread (I/O multiplexing) */
rte_panic("Cannot init interrupt-handling thread\n");
if (rte_eal_timer_init() < 0)
rte_panic("Cannot init HPET or TSC timers\n");
eal_check_mem_on_local_socket();
rte_eal_mcfg_complete();
/* dlopen any shared driver libraries requested on the command line */
TAILQ_FOREACH(solib, &solib_list, next) {
RTE_LOG(INFO, EAL, "open shared lib %s\n", solib->name);
solib->lib_handle = dlopen(solib->name, RTLD_NOW);
if (solib->lib_handle == NULL)
RTE_LOG(WARNING, EAL, "%s\n", dlerror());
}
eal_thread_init_master(rte_config.master_lcore);
ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%x;cpuset=[%s%s])\n",
rte_config.master_lcore, (int)thread_id, cpuset,
ret == 0 ? "" : "...");
if (rte_eal_dev_init() < 0)/* NOTE(review): presumably initializes registered PMD/virtual devices — confirm */
rte_panic("Cannot init pmd devices\n");
RTE_LCORE_FOREACH_SLAVE(i) {
/*
* create communication pipes between master thread
* and children
*/
if (pipe(lcore_config[i].pipe_master2slave) < 0)
rte_panic("Cannot create pipe\n");
if (pipe(lcore_config[i].pipe_slave2master) < 0)
rte_panic("Cannot create pipe\n");
lcore_config[i].state = WAIT;
/* create a thread for each lcore */
ret = pthread_create(&lcore_config[i].thread_id, NULL,
eal_thread_loop, NULL);
if (ret != 0)
rte_panic("Cannot create thread\n");
}
/*
* Launch a dummy function on all slave lcores, so that master lcore
* knows they are all ready when this function returns.
*/
rte_eal_mp_remote_launch(sync_func, NULL, SKIP_MASTER);
rte_eal_mp_wait_lcore();
/* Probe & Initialize PCI devices */
if (rte_eal_pci_probe())
rte_panic("Cannot probe PCI\n");
return fctret;
}
其中空间分配最主要还是rte_eal_memory_init()这个函数,根据选取的大页配置信息进行空间的初始化以及映射。
/*
 * Set up the EAL memory subsystem.
 *
 * The primary process creates and maps the hugepage memory
 * (rte_eal_hugepage_init); secondary processes attach to the mapping
 * the primary already published (rte_eal_hugepage_attach).  Unless
 * shared configuration is disabled, the memory device info is then
 * initialized as well.
 *
 * Returns 0 on success, -1 on failure.
 */
int
rte_eal_memory_init(void)
{
	int rc;

	RTE_LOG(INFO, EAL, "Setting up memory...\n");

	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
		rc = rte_eal_hugepage_init();   /* primary: create the mappings */
	else
		rc = rte_eal_hugepage_attach(); /* secondary: attach to them */

	if (rc < 0)
		return -1;

	if (internal_config.no_shconf == 0 && rte_eal_memdevice_init() < 0)
		return -1;

	return 0;
}
/*
 * Prepare hugepage memory for the primary process (excerpt — parts of
 * the function were elided by the original author, marked below).
 *
 * For each configured hugepage size: map every available hugepage,
 * resolve each page's physical address and NUMA socket, sort pages by
 * physical address, then remap them so virtually contiguous runs back
 * physically contiguous memory, and unmap the first-pass mappings.
 * The resulting hugepage_file table is published through shared
 * memory for secondary processes, and contiguous runs are coalesced
 * into the mcfg->memseg[] array.
 */
static int
rte_eal_hugepage_init(void)
{
。。。。。。。。
/* temporary table holding one hugepage_file entry per hugepage */
tmp_hp = malloc(nr_hugepages * sizeof(struct hugepage_file));
if (tmp_hp == NULL)
goto fail;
memset(tmp_hp, 0, nr_hugepages * sizeof(struct hugepage_file));
hp_offset = 0; /* where we start the current page size entries */
/* map all hugepages and sort them */
for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
struct hugepage_info *hpi;
/*
* we don't yet mark hugepages as used at this stage, so
* we just map all hugepages available to the system
* all hugepages are still located on socket 0
*/
hpi = &internal_config.hugepage_info[i];
if (hpi->num_pages[0] == 0)
continue;
/* map all hugepages available (first-pass mapping of the contiguous space) */
if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 1) < 0){
RTE_LOG(DEBUG, EAL, "Failed to mmap %u MB hugepages\n",
(unsigned)(hpi->hugepage_sz / 0x100000));
goto fail;
}
/* find physical addresses and sockets for each hugepage */
/* resolve the physical address backing each mapped page */
if (find_physaddrs(&tmp_hp[hp_offset], hpi) < 0){
RTE_LOG(DEBUG, EAL, "Failed to find phys addr for %u MB pages\n",
(unsigned)(hpi->hugepage_sz / 0x100000));
goto fail;
}
/* find which NUMA node each mapped hugepage resides on */
if (find_numasocket(&tmp_hp[hp_offset], hpi) < 0){
RTE_LOG(DEBUG, EAL, "Failed to find NUMA socket for %u MB pages\n",
(unsigned)(hpi->hugepage_sz / 0x100000));
goto fail;
}
if (sort_by_physaddr(&tmp_hp[hp_offset], hpi) < 0)/* sort by physical address */
goto fail;
/* remap the virtual space a second time and release the first-pass mappings */
#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
/* remap all hugepages into single file segments */
new_pages_count[i] = remap_all_hugepages(&tmp_hp[hp_offset], hpi);
if (new_pages_count[i] < 0){
RTE_LOG(DEBUG, EAL, "Failed to remap %u MB pages\n",
(unsigned)(hpi->hugepage_sz / 0x100000));
goto fail;
}
/* we have processed a num of hugepages of this size, so inc offset */
hp_offset += new_pages_count[i];
#else
/* remap all hugepages */
if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 0) < 0){
RTE_LOG(DEBUG, EAL, "Failed to remap %u MB pages\n",
(unsigned)(hpi->hugepage_sz / 0x100000));
goto fail;
}
/* unmap original mappings */
if (unmap_all_hugepages_orig(&tmp_hp[hp_offset], hpi) < 0)
goto fail;
/* we have processed a num of hugepages of this size, so inc offset */
hp_offset += hpi->num_pages[0];
#endif
}
。。。。。。。。。
/* create shared memory holding the virt/phys address table, so that
 * primary or secondary processes can later look up these mappings */
hugepage = create_shared_memory(eal_hugepage_info_path(),
nr_hugefiles * sizeof(struct hugepage_file));
if (hugepage == NULL) {
RTE_LOG(ERR, EAL, "Failed to create shared memory!\n");
goto fail;
}
memset(hugepage, 0, nr_hugefiles * sizeof(struct hugepage_file));
/* drop mappings that are no longer needed:
* unmap pages that we won't need (looks at used_hp).
* also, sets final_va to NULL on pages that were unmapped.
*/
if (unmap_unneeded_hugepages(tmp_hp, used_hp,
internal_config.num_hugepage_sizes) < 0) {
RTE_LOG(ERR, EAL, "Unmapping and locking hugepages failed!\n");
goto fail;
}
/* publish the virt/phys address info into the shared memory:
* copy stuff from malloc'd hugepage* to the actual shared memory.
* this procedure only copies those hugepages that have final_va
* not NULL. has overflow protection.
*/
if (copy_hugepages_to_shared_mem(hugepage, nr_hugefiles,
tmp_hp, nr_hugefiles) < 0) {
RTE_LOG(ERR, EAL, "Copying tables to shared memory failed!\n");
goto fail;
}
/* free the temporary hugepage table */
free(tmp_hp);
tmp_hp = NULL;
/* find earliest free memseg - this is needed because in case of IVSHMEM,
* segments might have already been initialized */
for (j = 0; j < RTE_MAX_MEMSEG; j++)
if (mcfg->memseg[j].addr == NULL) {
/* move to previous segment and exit loop */
j--;
break;
}
/* coalesce contiguous hugepages into memseg array entries */
for (i = 0; i < nr_hugefiles; i++) {
new_memseg = 0;
/* if this is a new section, create a new memseg */
if (i == 0)
new_memseg = 1;
else if (hugepage[i].socket_id != hugepage[i-1].socket_id)
new_memseg = 1;
else if (hugepage[i].size != hugepage[i-1].size)
new_memseg = 1;
#ifdef RTE_ARCH_PPC_64
/* On PPC64 architecture, the mmap always start from higher
* virtual address to lower address. Here, both the physical
* address and virtual address are in descending order */
else if ((hugepage[i-1].physaddr - hugepage[i].physaddr) !=
hugepage[i].size)
new_memseg = 1;
else if (((unsigned long)hugepage[i-1].final_va -
(unsigned long)hugepage[i].final_va) != hugepage[i].size)
new_memseg = 1;
#else
else if ((hugepage[i].physaddr - hugepage[i-1].physaddr) !=
hugepage[i].size)
new_memseg = 1;
else if (((unsigned long)hugepage[i].final_va -
(unsigned long)hugepage[i-1].final_va) != hugepage[i].size)
new_memseg = 1;
#endif
if (new_memseg) {
j += 1;
if (j == RTE_MAX_MEMSEG)
break;
mcfg->memseg[j].phys_addr = hugepage[i].physaddr;
mcfg->memseg[j].addr = hugepage[i].final_va;
#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
mcfg->memseg[j].len = hugepage[i].size * hugepage[i].repeated;
#else
mcfg->memseg[j].len = hugepage[i].size;
#endif
mcfg->memseg[j].socket_id = hugepage[i].socket_id;
mcfg->memseg[j].hugepage_sz = hugepage[i].size;
}
/* continuation of previous memseg */
else {
#ifdef RTE_ARCH_PPC_64
/* Use the phy and virt address of the last page as segment
* address for IBM Power architecture */
mcfg->memseg[j].phys_addr = hugepage[i].physaddr;
mcfg->memseg[j].addr = hugepage[i].final_va;
#endif
mcfg->memseg[j].len += mcfg->memseg[j].hugepage_sz;
}
hugepage[i].memseg_id = j;
}
/* if the loop above broke out early, we ran out of memsegs */
if (i < nr_hugefiles) {
RTE_LOG(ERR, EAL, "Can only reserve %d pages "
"from %d requested\n"
"Current %s=%d is not enough\n"
"Please either increase it or request less amount "
"of memory.\n",
i, nr_hugefiles, RTE_STR(CONFIG_RTE_MAX_MEMSEG),
RTE_MAX_MEMSEG);
return (-ENOMEM);
}
return 0;
}
空间分配主要还是这个函数,将分配出来的空间进行映射尽量使其虚拟空间连续,其中需要映射两次,第一次映射出来的虚拟空间作为物理地址的关联,第二次mmap会在phy addr连续的基础上,尽量也保证virt addr也是连续的,同时,本次mmap,会尽量保证virt addr在用户传进来的baseaddr基础上增长。然后释放掉第一次mmap的空间。
这些空间将分配到不同的socket上,通过find_numasocket函数可以找到映射的大页内存被放在哪个NUMA node上。后面还会进行重新分配。这样我们的数据就可以直接到用户态了。
这些映射出来的空间都有一个对应的hugepage file结构体保存对应的virt addr/phy addr等信息,通过共享内存,将这些结构体进行保存和共享,后面primary进程或者secondary进程就可以很方便的使用这些地址。
由于页的数量很多(1024个2M),所以不可能全部的空间都是连续的所以使用全局的数组将这些空间连续起来。
参考:https://www.cnblogs.com/yhp-smarthome/p/6995292.html
https://www.cnblogs.com/jiayy/p/3429725.html
内存分配
这个时候有人就会问道:根据大页内存分配内存到底干了些什么事?
问的好,我当初也在纠结大页内存到底干了些什么,怎么存储数据的?
上面说了,分配内存之后就把这些内存分配到不同的socket上了嘛,接下来在内存管理的时候在进行其他操作。
我们先看rte_eal_memzone_init(void)函数:
先看malloc_elm结构体吧
/*
 * Header describing one element managed by a malloc heap.  The
 * element pointer is handed out as &elem[1], i.e. user data starts
 * immediately after this header (see malloc_heap_alloc below).
 */
struct malloc_elem {
struct malloc_heap *heap; /* heap this element belongs to */
struct malloc_elem *volatile prev; /* points to prev elem in memseg */
LIST_ENTRY(malloc_elem) free_list; /* list of free elements in heap */
const struct rte_memseg *ms; /* memseg backing this element */
volatile enum elem_state state; /* allocation state (see enum elem_state) */
uint32_t pad; /* NOTE(review): presumably padding byte count for alignment — confirm */
size_t size; /* element size; presumably includes this header — confirm */
#ifdef RTE_LIBRTE_MALLOC_DEBUG
uint64_t header_cookie; /* Cookie marking start of data */
/* trailer cookie at start + size */
#endif
} __rte_cache_aligned;
/*内存管理模块
* Init the memzone subsystem
*/
/*
 * Init the memzone subsystem.
 *
 * Mirrors the runtime memsegs from the shared configuration into the
 * free_memseg table, sanitizes every entry for cache alignment, and
 * clears the memzone descriptor array.  Secondary processes skip all
 * of this, since the primary has already done it.
 *
 * Returns 0 on success, -1 on error.
 */
int
rte_eal_memzone_init(void)
{
	struct rte_mem_config *mcfg;
	const struct rte_memseg *memseg;
	unsigned idx;

	/* get pointer to global configuration */
	mcfg = rte_eal_get_configuration()->mem_config;

	/* mirror the runtime memsegs from config */
	free_memseg = mcfg->free_memseg;

	/* secondary processes don't need to initialise anything */
	if (rte_eal_process_type() == RTE_PROC_SECONDARY)
		return 0;

	memseg = rte_eal_get_physmem_layout();
	if (memseg == NULL) {
		RTE_LOG(ERR, EAL, "%s(): Cannot get physical layout\n", __func__);
		return -1;
	}

	rte_rwlock_write_lock(&mcfg->mlock);

	/* fill in uninitialized free_memsegs */
	for (idx = 0; idx < RTE_MAX_MEMSEG; idx++) {
		if (memseg[idx].addr == NULL)
			break;
		if (free_memseg[idx].addr == NULL)
			memcpy(&free_memseg[idx], &memseg[idx],
			    sizeof(struct rte_memseg));
	}

	/* make all zones cache-aligned */
	for (idx = 0;
	    idx < RTE_MAX_MEMSEG && free_memseg[idx].addr != NULL;
	    idx++) {
		if (memseg_sanitize(&free_memseg[idx]) < 0) {
			RTE_LOG(ERR, EAL, "%s(): Sanity check failed\n", __func__);
			rte_rwlock_write_unlock(&mcfg->mlock);
			return -1;
		}
	}

	/* delete all zones */
	mcfg->memzone_idx = 0;
	memset(mcfg->memzone, 0, sizeof(mcfg->memzone));

	rte_rwlock_write_unlock(&mcfg->mlock);

	return 0;
}
这个函数主要就是把内存放到空闲链表中,等需要的时候,能够分配出去。
接下来再需要内存的时候就进行分配:
内存存放的地方当然是内存池了,将所有的内存进行管理,虽然这些内存已经分配到具体的socket上去了。在创建内存池时,会创建一个ring来存储分配的对象,同时,为了减少多核之间对同一个ring的访问,每一个核都维护着一个cache,优先从cache中取。
内存分配有一系列的接口:大多定义在rte_malloc.c
文件中。我们重点挑两个来看一下。
rte_malloc_socket()
这个是一个基础函数,可以在这个函数的基础上进行封装,主要参数是类型,大小,对齐,以及从哪个socket上分。一般来说,分配内存从当前线程运行的socket上分配,可以避免内存跨socket访问,提高性能。
ret = malloc_heap_alloc(&mcfg->malloc_heaps[socket], type,
size, 0, align == 0 ? 1 : align, 0);
if (ret != NULL || socket_arg != SOCKET_ID_ANY)
return ret;
先在指定的socket上分配,如果不能成功,再去尝试其他的socket。我们接着看函数malloc_heap_alloc()
:
/*
 * Allocate a block of at least 'size' bytes from the given heap.
 *
 * The request size and alignment are rounded up to cache-line
 * multiples.  Under the heap spinlock, a suitable free element is
 * located and the allocation carved out of it.  Returns a pointer to
 * the usable data area (just past the malloc_elem header), or NULL
 * if no free element can satisfy the request.
 *
 * NOTE(review): this excerpt is missing the function's closing brace.
 */
void *
malloc_heap_alloc(struct malloc_heap *heap,
const char *type __attribute__((unused)), size_t size, unsigned flags,
size_t align, size_t bound)
{
struct malloc_elem *elem;
/* round request and alignment up to cache-line multiples */
size = RTE_CACHE_LINE_ROUNDUP(size);
align = RTE_CACHE_LINE_ROUNDUP(align);
rte_spinlock_lock(&heap->lock);
elem = find_suitable_element(heap, size, flags, align, bound);
if (elem != NULL) {
elem = malloc_elem_alloc(elem, size, align, bound);
/* increase heap's count of allocated elements */
heap->alloc_count++;
}
rte_spinlock_unlock(&heap->lock);
/* &elem[1] is the first byte after the element header */
return elem == NULL ? NULL : (void *)(&elem[1]);
先去空闲链表中找是否有满足需求的内存块,如果找到,就进行分配,否则返回失败。进一步的,在函数malloc_elem_alloc()
分配的的时候,如果存在的内存大于需要的内存时,会对内存进行切割,然后把用不完的重新挂在空闲链表上。就不细致的代码分析了。
rte_memzone_reserve_aligned()
这个函数的返回值类型是struct rte_memzone
型的,这是和上一个分配接口的不同之处,同时注意分配时的flag的不同。分配出来的memzone可以直接使用名字索引到。这个函数最终也是会调用到malloc_heap_alloc()
,就不多说了,可以看看代码。
分配队列
看示例代码
/* Allocate and set up 4 RX queue per Ethernet port. */
for (q = 0; q < rx_rings; q++) {
retval = rte_eth_rx_queue_setup(port, q, RX_RING_SIZE,
rte_eth_dev_socket_id(port), NULL, mbuf_pool);
if (retval < 0)
return retval;
}
这个地方我需要分配4个队列给网口。
ret = (*dev->dev_ops->rx_queue_setup)(dev, rx_queue_id, nb_rx_desc,
socket_id, rx_conf, mp);
设置队列以及初始化。由于这个地方是封装了的函数指针,所以适配dpdk上所有的兼容的驱动。这个地方以ixgbe驱动为例:
/*
 * Set up one RX queue of an ixgbe port.
 *
 * Validates the descriptor count, (re)allocates the queue structure,
 * reserves a DMA-able memzone for the hardware descriptor ring,
 * records the head/tail register addresses (VF or PF variants), and
 * allocates the software ring of mbuf-entry slots.  Finally resets
 * the queue state and installs it in dev->data->rx_queues[].
 *
 * Returns 0 on success, -EINVAL for a bad descriptor count, or
 * -ENOMEM on allocation failure.
 */
int
ixgbe_dev_rx_queue_setup(struct rte_eth_dev *dev,
uint16_t queue_idx,
uint16_t nb_desc,
unsigned int socket_id,
const struct rte_eth_rxconf *rx_conf,
struct rte_mempool *mp)
{
const struct rte_memzone *rz;
struct ixgbe_rx_queue *rxq;
struct ixgbe_hw *hw;
uint16_t len;
PMD_INIT_FUNC_TRACE();
hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
/*
* Validate number of receive descriptors.
* It must not exceed hardware maximum, and must be multiple
* of IXGBE_ALIGN.
*/
if (((nb_desc * sizeof(union ixgbe_adv_rx_desc)) % IXGBE_ALIGN) != 0 ||
(nb_desc > IXGBE_MAX_RING_DESC) ||
(nb_desc < IXGBE_MIN_RING_DESC)) {
return (-EINVAL);
}
/* Free memory prior to re-allocation if needed... */
if (dev->data->rx_queues[queue_idx] != NULL) {
ixgbe_rx_queue_release(dev->data->rx_queues[queue_idx]);
dev->data->rx_queues[queue_idx] = NULL;
}
/* First allocate the rx queue data structure */
rxq = rte_zmalloc_socket("ethdev RX queue", sizeof(struct ixgbe_rx_queue),
RTE_CACHE_LINE_SIZE, socket_id);
if (rxq == NULL)
return (-ENOMEM);
/* fill in mempool, descriptor count, queue/port identity, offloads */
rxq->mb_pool = mp;
rxq->nb_rx_desc = nb_desc;
rxq->rx_free_thresh = rx_conf->rx_free_thresh;
rxq->queue_id = queue_idx;
rxq->reg_idx = (uint16_t)((RTE_ETH_DEV_SRIOV(dev).active == 0) ?
queue_idx : RTE_ETH_DEV_SRIOV(dev).def_pool_q_idx + queue_idx);
rxq->port_id = dev->data->port_id;
rxq->crc_len = (uint8_t) ((dev->data->dev_conf.rxmode.hw_strip_crc) ?
0 : ETHER_CRC_LEN);
rxq->drop_en = rx_conf->rx_drop_en;
rxq->rx_deferred_start = rx_conf->rx_deferred_start;
/*
* Allocate RX ring hardware descriptors. A memzone large enough to
* handle the maximum ring size is allocated in order to allow for
* resizing in later calls to the queue setup function.
*/
rz = ring_dma_zone_reserve(dev, "rx_ring", queue_idx,
RX_RING_SZ, socket_id);
if (rz == NULL) {
ixgbe_rx_queue_release(rxq);
return (-ENOMEM);
}
/*
* Zero init all the descriptors in the ring.
*/
memset (rz->addr, 0, RX_RING_SZ);
/*
* Modified to setup VFRDT for Virtual Function
*/
if (hw->mac.type == ixgbe_mac_82599_vf ||
hw->mac.type == ixgbe_mac_X540_vf ||
hw->mac.type == ixgbe_mac_X550_vf ||
hw->mac.type == ixgbe_mac_X550EM_x_vf) {
rxq->rdt_reg_addr =
IXGBE_PCI_REG_ADDR(hw, IXGBE_VFRDT(queue_idx));
rxq->rdh_reg_addr =
IXGBE_PCI_REG_ADDR(hw, IXGBE_VFRDH(queue_idx));
}
else {
/* PF: tail (RDT) and head (RDH) registers of this ring */
rxq->rdt_reg_addr =
IXGBE_PCI_REG_ADDR(hw, IXGBE_RDT(rxq->reg_idx));
rxq->rdh_reg_addr =
IXGBE_PCI_REG_ADDR(hw, IXGBE_RDH(rxq->reg_idx));
}
#ifndef RTE_LIBRTE_XEN_DOM0
rxq->rx_ring_phys_addr = (uint64_t) rz->phys_addr;
#else
rxq->rx_ring_phys_addr = rte_mem_phy2mch(rz->memseg_id, rz->phys_addr);
#endif
/* virtual address of the hardware descriptor ring */
rxq->rx_ring = (union ixgbe_adv_rx_desc *) rz->addr;
/*
* Certain constraints must be met in order to use the bulk buffer
* allocation Rx burst function. If any of Rx queues doesn't meet them
* the feature should be disabled for the whole port.
*/
if (check_rx_burst_bulk_alloc_preconditions(rxq)) {
PMD_INIT_LOG(DEBUG, "queue[%d] doesn't meet Rx Bulk Alloc "
"preconditions - canceling the feature for "
"the whole port[%d]",
rxq->queue_id, rxq->port_id);
hw->rx_bulk_alloc_allowed = false;
}
/*
* Allocate software ring. Allow for space at the end of the
* S/W ring to make sure look-ahead logic in bulk alloc Rx burst
* function does not access an invalid memory region.
*/
len = nb_desc;
if (hw->rx_bulk_alloc_allowed)
len += RTE_PMD_IXGBE_RX_MAX_BURST;
rxq->sw_ring = rte_zmalloc_socket("rxq->sw_ring",
sizeof(struct ixgbe_rx_entry) * len,
RTE_CACHE_LINE_SIZE, socket_id);
if (rxq->sw_ring == NULL) {
ixgbe_rx_queue_release(rxq);
return (-ENOMEM);
}
PMD_INIT_LOG(DEBUG, "sw_ring=%p hw_ring=%p dma_addr=0x%"PRIx64,
rxq->sw_ring, rxq->rx_ring, rxq->rx_ring_phys_addr);
/* vector Rx requires a power-of-two descriptor count */
if (!rte_is_power_of_2(nb_desc)) {
PMD_INIT_LOG(DEBUG, "queue[%d] doesn't meet Vector Rx "
"preconditions - canceling the feature for "
"the whole port[%d]",
rxq->queue_id, rxq->port_id);
hw->rx_vec_allowed = false;
} else
ixgbe_rxq_vec_setup(rxq);
dev->data->rx_queues[queue_idx] = rxq;
ixgbe_reset_rx_queue(hw, rxq);
return 0;
}
接下来的都是重点咯:
<1>.分配队列结构体,并填充结构
rxq = rte_zmalloc_socket("ethdev RX queue", sizeof(struct ixgbe_rx_queue),
RTE_CACHE_LINE_SIZE, socket_id);
填充结构体的所属内存池,描述符个数,队列号,队列所属接口号等成员。
<2>.分配描述符队列的空间,按照最大的描述符个数进行分配
rz = rte_eth_dma_zone_reserve(dev, "rx_ring", queue_idx,
RX_RING_SZ, IXGBE_ALIGN, socket_id);
接着获取描述符队列的头和尾寄存器的地址,在收发包后,软件要对这个寄存器进行处理。
rxq->rdt_reg_addr =
IXGBE_PCI_REG_ADDR(hw, IXGBE_RDT(rxq->reg_idx));
rxq->rdh_reg_addr =
IXGBE_PCI_REG_ADDR(hw, IXGBE_RDH(rxq->reg_idx));
设置队列的接收描述符ring的物理地址和虚拟地址。
rxq->rx_ring_phys_addr = rte_mem_phy2mch(rz->memseg_id, rz->phys_addr);
rxq->rx_ring = (union ixgbe_adv_rx_desc *) rz->addr;
<3>分配sw_ring,这个ring中存储的对象是struct ixgbe_rx_entry
,其实里面就是数据包mbuf的指针。
rxq->sw_ring = rte_zmalloc_socket("rxq->sw_ring",
sizeof(struct ixgbe_rx_entry) * len,
RTE_CACHE_LINE_SIZE, socket_id);
以上三步做完以后,新分配的队列结构体重要的部分就已经填充完了,下面需要重置一下其他成员
ixgbe_reset_rx_queue()
先把分配的描述符队列清空,其实清空在分配的时候就已经做了,没必要重复做
for (i = 0; i < len; i++) {
rxq->rx_ring[i] = zeroed_desc;
}
然后初始化队列中一下其他成员
rxq->rx_nb_avail = 0;
rxq->rx_next_avail = 0;
rxq->rx_free_trigger = (uint16_t)(rxq->rx_free_thresh - 1);
rxq->rx_tail = 0;
rxq->nb_rx_hold = 0;
rxq->pkt_first_seg = NULL;
rxq->pkt_last_seg = NULL;
这样,接收队列就初始化完了。
收数据包
参考一下:https://www.cnblogs.com/yhp-smarthome/p/6705638.html
dpdk在初始化的时候就根据大页的配置信息初始化好了内存,这些内存片又被分配到不同的socket上去了,最后通过共享内存和内存池来进行管理。
在建立内存池的时候就将这内存片的信息结构体放入到内存池的一个ring队列中去了。
在多队列的模式下,将内存池中的内存信息进行多队列的分配,将这些内存的信息再次进行填充。如果当前线程所在socket上分配的内存足够的话,那么当前队列直接使用当前socket上的内存;如果不够则取其他socket上的内存使用,这样也减少了跨socket的访问。
每个队列中都有一个DMA寄存器,这些寄存器的工作就是将数据包拷贝到这些内存中去。这些地址通过各种转换之后就可以直接让DMA寄存器来使用,当DMA寄存器把这些数据放到这些内存之后CPU就可以直接取对应地址的数据了。