DPDK memory management analysis

This article examines DPDK's memory management in detail: configuration initialization, hugepage discovery, memzone initialization, memory initialization, and malloc heap initialization. DPDK relies on hugepages to improve TLB efficiency, maps memory regions through a shared memory-configuration file, and allocates memory per NUMA node. During startup it creates and maps the hugepages, assigns them to memory segments, and finally attaches those segments to the malloc heaps so that allocations can be served efficiently.


1.1 Overview

DPDK builds its memory management on hugepages and NUMA awareness. Hugepages keep the backing memory pinned (it cannot be swapped out) and also significantly improve TLB hit rates.

The memory-management part of initialization, simplified, looks as follows (the other initialization steps are covered in separate articles):

int
rte_eal_init(int argc, char **argv)
{
    // initialize rte_config and mmap rte_config->mem_config
	if (rte_config_init() < 0) {
		rte_eal_init_alert("Cannot init config");
		return -1;
	}

	if (internal_conf->no_hugetlbfs == 0) {
		/* rte_config isn't initialized yet */
		ret = internal_conf->process_type == RTE_PROC_PRIMARY ?
            	// collect the system's available hugepages into internal_config->hugepage_info[]
				eal_hugepage_info_init() :
				eal_hugepage_info_read();
		if (ret < 0) {
			rte_eal_init_alert("Cannot get hugepage information.");
			rte_errno = EACCES;
			__atomic_store_n(&run_once, 0, __ATOMIC_RELAXED);
			return -1;
		}
	}

    // memzone initialization
	if (rte_eal_memzone_init() < 0) {
		rte_eal_init_alert("Cannot init memzone");
		rte_errno = ENODEV;
		return -1;
	}

    // map the hugepages and record them in rte_config->mem_config->memsegs
	if (rte_eal_memory_init() < 0) {
		rte_eal_init_alert("Cannot init memory");
		rte_errno = ENOMEM;
		return -1;
	}

    // attach rte_config->mem_config->memsegs to rte_config->mem_config->malloc_heaps[]
	if (rte_eal_malloc_heap_init() < 0) {
		rte_eal_init_alert("Cannot init malloc heap");
		rte_errno = ENODEV;
		return -1;
	}
}
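
For reference, once rte_eal_init() has completed the chain above, applications allocate from the heaps it set up. A minimal usage sketch (names such as "example_buf" are arbitrary, error handling trimmed):

#include <rte_eal.h>
#include <rte_malloc.h>

int main(int argc, char **argv)
{
	/* runs the whole initialization chain analysed in this article */
	if (rte_eal_init(argc, argv) < 0)
		return -1;

	/* served from the malloc heap of the caller's NUMA node; align = 0
	 * means default (cache line) alignment */
	void *buf = rte_malloc("example_buf", 4096, 0);

	/* or pin the allocation to an explicit socket (socket 0 here) */
	void *buf0 = rte_malloc_socket("example_buf0", 4096, 0, 0);

	rte_free(buf0);
	rte_free(buf);
	rte_eal_cleanup();
	return 0;
}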

1.2 Analysis of rte_config_init

Primary process: maps /var/run/config and copies config->mem_config into it, then records the mapping address in config->mem_config->mem_cfg_addr so that secondary processes can map the same virtual address.

Secondary process: the first attach only reads the primary's mem_cfg_addr; a second attach then maps config->mem_config at that exact address.

static int rte_config_init(void)
{
	struct rte_config *config = rte_eal_get_configuration();
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	// determined earlier in eal_parse_args()
	config->process_type = internal_conf->process_type;

	switch (config->process_type) {
	case RTE_PROC_PRIMARY: // primary process
		if (rte_eal_config_create() < 0) // create the config file, mmap it, lock it, record a few fields
			return -1;
		eal_mcfg_update_from_internal(); // copy selected fields from internal_conf
										 // into rte_config->mem_config
		break;
	case RTE_PROC_SECONDARY: // secondary process
		if (rte_eal_config_attach() < 0) // attach to the primary's config
			return -1;
		eal_mcfg_wait_complete(); // wait until rte_config->mem_config->magic becomes RTE_MAGIC
		if (eal_mcfg_check_version() < 0) { // version check
			RTE_LOG(ERR, EAL, "Primary and secondary process DPDK version mismatch\n");
			return -1;
		}
		if (rte_eal_config_reattach() < 0) // re-attach at the primary's address
			return -1;
		if (!__rte_mp_enable()) { // enable the multi-process channel
			RTE_LOG(ERR, EAL, "Primary process refused secondary attachment\n");
			return -1;
		}
		eal_mcfg_update_internal(); // copy selected fields from rte_config->mem_config back into internal_conf
		break;
	case RTE_PROC_AUTO:
	case RTE_PROC_INVALID:
		RTE_LOG(ERR, EAL, "Invalid process type %d\n",
			config->process_type);
		return -1;
	}

	return 0;
}
/* create memory configuration in shared/mmap memory. Take out
 * a write lock on the memsegs, so we can auto-detect primary/secondary.
 * This means we never close the file while running (auto-close on exit).
 * We also don't lock the whole file, so that in future we can use read-locks
 * on other parts, e.g. memzones, to detect if there are running secondary
 * processes. */
static int
rte_eal_config_create(void)
{
	struct rte_config *config = rte_eal_get_configuration();
	size_t page_sz = sysconf(_SC_PAGE_SIZE);
	size_t cfg_len = sizeof(*config->mem_config);
	size_t cfg_len_aligned = RTE_ALIGN(cfg_len, page_sz);
	void *rte_mem_cfg_addr, *mapped_mem_cfg_addr;
	int retval;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	// path of /var/run/config
	const char *pathname = eal_runtime_config_path();

	if (internal_conf->no_shconf)
		return 0;

	/* map the config before hugepage address so that we don't waste a page */
	if (internal_conf->base_virtaddr != 0)
		rte_mem_cfg_addr = (void *)
			RTE_ALIGN_FLOOR(internal_conf->base_virtaddr -
			sizeof(struct rte_mem_config), page_sz);
	else
		rte_mem_cfg_addr = NULL;

	// create /var/run/config
	if (mem_cfg_fd < 0){
		mem_cfg_fd = open(pathname, O_RDWR | O_CREAT, 0600);
		if (mem_cfg_fd < 0) {
			RTE_LOG(ERR, EAL, "Cannot open '%s' for rte_mem_config\n",
				pathname);
			return -1;
		}
	}

	// resize the file to sizeof(*config->mem_config)
	retval = ftruncate(mem_cfg_fd, cfg_len);
	if (retval < 0){
		close(mem_cfg_fd);
		mem_cfg_fd = -1;
		RTE_LOG(ERR, EAL, "Cannot resize '%s' for rte_mem_config\n",
			pathname);
		return -1;
	}

	// take the write lock
	retval = fcntl(mem_cfg_fd, F_SETLK, &wr_lock);
	if (retval < 0){
		close(mem_cfg_fd);
		mem_cfg_fd = -1;
		RTE_LOG(ERR, EAL, "Cannot create lock on '%s'. Is another primary "
			"process running?\n", pathname);
		return -1;
	}

	/* reserve space for config */
	rte_mem_cfg_addr = eal_get_virtual_area(rte_mem_cfg_addr,
			&cfg_len_aligned, page_sz, 0, 0);
	if (rte_mem_cfg_addr == NULL) {
		RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config\n");
		close(mem_cfg_fd);
		mem_cfg_fd = -1;
		return -1;
	}

	/* remap the actual file into the space we've just reserved */
	mapped_mem_cfg_addr = mmap(rte_mem_cfg_addr,
			cfg_len_aligned, PROT_READ | PROT_WRITE,
			MAP_SHARED | MAP_FIXED, mem_cfg_fd, 0);
	if (mapped_mem_cfg_addr == MAP_FAILED) {
		munmap(rte_mem_cfg_addr, cfg_len);
		close(mem_cfg_fd);
		mem_cfg_fd = -1;
		RTE_LOG(ERR, EAL, "Cannot remap memory for rte_config\n");
		return -1;
	}

	// copy the current config->mem_config contents into the mapping at rte_mem_cfg_addr
	memcpy(rte_mem_cfg_addr, config->mem_config, sizeof(struct rte_mem_config));
	// point config->mem_config at the mapped area
	config->mem_config = rte_mem_cfg_addr;

	/* store address of the config in the config itself so that secondary
	 * processes could later map the config into this exact location
	 */
	// store the address so that secondary processes can map at the same location
	config->mem_config->mem_cfg_addr = (uintptr_t) rte_mem_cfg_addr;
	config->mem_config->dma_maskbits = 0;

	return 0;
}
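
To make the two-step attach clearer, here is a simplified sketch of what the secondary side does with mem_cfg_addr. This is not the literal rte_eal_config_attach()/rte_eal_config_reattach() code, and it assumes the internal struct rte_mem_config definition is visible:

/* Sketch of the secondary-side attach logic (simplified): map the file once
 * to read mem_cfg_addr, then map it again at that exact address so that
 * pointers stored inside the shared struct stay valid across processes. */
#include <fcntl.h>
#include <stdint.h>
#include <sys/mman.h>
#include <unistd.h>

static struct rte_mem_config *
attach_mem_config(const char *path, size_t len)
{
	int fd = open(path, O_RDWR);
	if (fd < 0)
		return NULL;

	/* first pass: map anywhere, just to learn the primary's address */
	struct rte_mem_config *tmp =
		mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
	if (tmp == MAP_FAILED) {
		close(fd);
		return NULL;
	}
	void *want = (void *)(uintptr_t)tmp->mem_cfg_addr;
	munmap(tmp, len);

	/* second pass: same VA as the primary, so embedded pointers work */
	struct rte_mem_config *cfg = mmap(want, len, PROT_READ | PROT_WRITE,
			MAP_SHARED | MAP_FIXED, fd, 0);
	close(fd);
	return cfg == MAP_FAILED ? NULL : cfg;
}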

1.3 Analysis of eal_hugepage_info_init

Collects every hugepage size exposed under /sys/kernel/mm/hugepages.

/*
 * when we initialize the hugepage info, everything goes
 * to socket 0 by default. it will later get sorted by memory
 * initialization procedure.
 */
int
eal_hugepage_info_init(void)
{
	struct hugepage_info *hpi, *tmp_hpi;
	unsigned int i;
	struct internal_config *internal_conf =
		eal_get_internal_configuration();

	// collect all hugepage sizes
	if (hugepage_info_init() < 0)
		return -1;

	/* for no shared files mode, we're done */
	if (internal_conf->no_shconf)
		return 0;

	hpi = &internal_conf->hugepage_info[0];

	// create the shared file and mmap it
	tmp_hpi = create_shared_memory(eal_hugepage_info_path(),
			sizeof(internal_conf->hugepage_info));
	if (tmp_hpi == NULL) {
		RTE_LOG(ERR, EAL, "Failed to create shared memory!\n");
		return -1;
	}

	// copy hpi into the shared memory
	memcpy(tmp_hpi, hpi, sizeof(internal_conf->hugepage_info));

	/* we've copied file descriptors along with everything else, but they
	 * will be invalid in secondary process, so overwrite them
	 */
	for (i = 0; i < RTE_DIM(internal_conf->hugepage_info); i++) {
		struct hugepage_info *tmp = &tmp_hpi[i];
		tmp->lock_descriptor = -1;
	}

	if (munmap(tmp_hpi, sizeof(internal_conf->hugepage_info)) < 0) {
		RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n");
		return -1;
	}
	return 0;
}
static int
hugepage_info_init(void)
{
	const char dirent_start_text[] = "hugepages-";
	const size_t dirent_start_len = sizeof(dirent_start_text) - 1;
	unsigned int i, num_sizes = 0;
	uint64_t reusable_bytes;
	unsigned int reusable_pages;
	DIR *dir;
	struct dirent *dirent;
	struct internal_config *internal_conf =
		eal_get_internal_configuration();

	// open /sys/kernel/mm/hugepages
	dir = opendir(sys_dir_path);
	if (dir == NULL) {
		RTE_LOG(ERR, EAL,
			"Cannot open directory %s to read system hugepage info\n",
			sys_dir_path);
		return -1;
	}

	// iterate over the directory entries
	for (dirent = readdir(dir); dirent != NULL; dirent = readdir(dir)) {
		struct hugepage_info *hpi;

		// skip entries that do not start with "hugepages-"
		if (strncmp(dirent->d_name, dirent_start_text,
			    dirent_start_len) != 0)
			continue;

		// stop once MAX_HUGEPAGE_SIZES page sizes have been recorded
		if (num_sizes >= MAX_HUGEPAGE_SIZES)
			break;

		hpi = &internal_conf->hugepage_info[num_sizes];
		// parse the page size from the directory name into internal_conf->hugepage_info[]
		hpi->hugepage_sz =
			rte_str_to_size(&dirent->d_name[dirent_start_len]);

		/* first, check if we have a mountpoint */
		// look in /proc/mounts for a hugetlbfs mountpoint of this size; if there is none, handle it below and skip this size
		if (get_hugepage_dir(hpi->hugepage_sz,
			hpi->hugedir, sizeof(hpi->hugedir)) < 0) {
			uint32_t num_pages;

			// no mountpoint: log a notice if pages of this size are reserved
			num_pages = get_num_hugepages(dirent->d_name,
					hpi->hugepage_sz, 0);
			if (num_pages > 0)
				RTE_LOG(NOTICE, EAL,
					"%" PRIu32 " hugepages of size "
					"%" PRIu64 " reserved, but no mounted "
					"hugetlbfs found for that size\n",
					num_pages, hpi->hugepage_sz);
			/* if we have kernel support for reserving hugepages
			 * through mmap, and we're in in-memory mode, treat this
			 * page size as valid. we cannot be in legacy mode at
			 * this point because we've checked this earlier in the
			 * init process.
			 */
#ifdef MAP_HUGE_SHIFT
			if (internal_conf->in_memory) {
				RTE_LOG(DEBUG, EAL, "In-memory mode enabled, "
					"hugepages of size %" PRIu64 " bytes "
					"will be allocated anonymously\n",
					hpi->hugepage_sz);
				calc_num_pages(hpi, dirent, 0);
				num_sizes++;
			}
#endif
			continue;
		}

		/* try to obtain a writelock */
		hpi->lock_descriptor = open(hpi->hugedir, O_RDONLY);

		/* if blocking lock failed */
		if (flock(hpi->lock_descriptor, LOCK_EX) == -1) {
			RTE_LOG(CRIT, EAL,
				"Failed to lock hugepage directory!\n");
			break;
		}

		/*
		 * Check for existing hugepage files and either remove them
		 * or count how many of them can be reused.
		 */
		// count how many existing pages can be reused
		reusable_pages = 0;
		if (!internal_conf->hugepage_file.unlink_existing) {
			reusable_bytes = 0;
			if (inspect_hugedir(hpi->hugedir,
					&reusable_bytes) < 0)
				break;
			RTE_ASSERT(reusable_bytes % hpi->hugepage_sz == 0);
			reusable_pages = reusable_bytes / hpi->hugepage_sz;
		} else if (clear_hugedir(hpi->hugedir) < 0) {
			break;
		}

		// calc_num_pages() tries to attribute pages to their NUMA sockets;
		// if that fails, all pages are counted under hpi->num_pages[0]
		calc_num_pages(hpi, dirent, reusable_pages);
		num_sizes++;
	}
	closedir(dir);

	/* something went wrong, and we broke from the for loop above */
	if (dirent != NULL)
		return -1;

	internal_conf->num_hugepage_sizes = num_sizes;

	/* sort the page directory entries by size, largest to smallest */
	// sort from largest page size to smallest
	qsort(&internal_conf->hugepage_info[0], num_sizes,
	      sizeof(internal_conf->hugepage_info[0]), compare_hpi);

	/* now we have all info, check we have at least one valid size */
	// return 0 if at least one size has usable pages
	for (i = 0; i < num_sizes; i++) {
		/* pages may no longer all be on socket 0, so check all */
		unsigned int j, num_pages = 0;
		struct hugepage_info *hpi = &internal_conf->hugepage_info[i];

		for (j = 0; j < RTE_MAX_NUMA_NODES; j++)
			num_pages += hpi->num_pages[j];
		if (num_pages > 0)
			return 0;
	}

	/* no valid hugepage mounts available, return error */
	return -1;
}
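
A stand-alone sketch of the directory scan performed above: it lists every page size the kernel exposes under /sys/kernel/mm/hugepages. The directory names look like "hugepages-2048kB", so the size is parsed from the suffix (this is illustrative code, not the EAL implementation):

#include <dirent.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	const char *sys_dir = "/sys/kernel/mm/hugepages";
	const char prefix[] = "hugepages-";
	DIR *dir = opendir(sys_dir);
	if (dir == NULL) {
		perror("opendir");
		return 1;
	}

	struct dirent *ent;
	while ((ent = readdir(dir)) != NULL) {
		if (strncmp(ent->d_name, prefix, strlen(prefix)) != 0)
			continue;
		/* suffix is "<number>kB"; strtoull stops at the 'k' */
		uint64_t size_kb = strtoull(ent->d_name + strlen(prefix),
				NULL, 10);
		printf("hugepage size: %" PRIu64 " kB\n", size_kb);
	}
	closedir(dir);
	return 0;
}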

1.4 Analysis of rte_eal_memzone_init

Initializes the memzone list in rte_config->mem_config and allocates the memory backing rte_config->mem_config->memzones.

int rte_eal_memzone_init(void)
{
	struct rte_mem_config *mcfg;
	int ret = 0;

	/* get pointer to global configuration */
	mcfg = rte_eal_get_configuration()->mem_config;

	rte_rwlock_write_lock(&mcfg->mlock);

	// primary process: rte_fbarray_init; secondary process: rte_fbarray_attach
	if (rte_eal_process_type() == RTE_PROC_PRIMARY &&
			rte_fbarray_init(&mcfg->memzones, "memzone",
			RTE_MAX_MEMZONE, sizeof(struct rte_memzone))) {
		RTE_LOG(ERR, EAL, "Cannot allocate memzone list\n");
		ret = -1;
	} else if (rte_eal_process_type() == RTE_PROC_SECONDARY &&
			rte_fbarray_attach(&mcfg->memzones)) {
		RTE_LOG(ERR, EAL, "Cannot attach to memzone list\n");
		ret = -1;
	}

	rte_rwlock_write_unlock(&mcfg->mlock);

	return ret;
}
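
rte_eal_memzone_init only prepares the fbarray that backs the memzone list; the entries themselves are created later through the public memzone API. A small sketch of that API, assuming rte_eal_init() has already run (the name "demo_mz" is arbitrary):

#include <rte_errno.h>
#include <rte_memory.h>
#include <rte_memzone.h>

static int memzone_demo(void)
{
	/* reserve 1 MB of hugepage-backed memory on any NUMA node; the entry
	 * lands in the mcfg->memzones fbarray initialised above */
	const struct rte_memzone *mz = rte_memzone_reserve("demo_mz",
			1 << 20, SOCKET_ID_ANY, 0);
	if (mz == NULL)
		return -rte_errno;

	/* any process in the same DPDK instance can find it again by name */
	const struct rte_memzone *found = rte_memzone_lookup("demo_mz");

	return found == mz ? rte_memzone_free(mz) : -1;
}

rte_fbarray_init, which the primary process calls above, is listed next: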
int rte_fbarray_init(struct rte_fbarray *arr, const char *name, unsigned int len,
		unsigned int elt_sz)
{
	size_t page_sz, mmap_len;
	char path[PATH_MAX];
	struct used_mask *msk;
	struct mem_area *ma = NULL;
	void *data = NULL;
	int fd = -1;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (arr == NULL) {
		rte_errno = EINVAL;
		return -1;
	}

	// validate parameters
	if (fully_validate(name, elt_sz, len))
		return -1;

	/* allocate mem area before doing anything */
	// so it can be linked into mem_area_tailq
	ma = malloc(sizeof(*ma));
	if (ma == NULL) {
		rte_errno = ENOMEM;
		return -1;
	}

	// get the page size
	page_sz = rte_mem_page_size();
	if (page_sz == (size_t)-1) {
		free(ma);
		return -1;
	}

	/* calculate our memory limits */
	mmap_len = calc_data_size(page_sz, elt_sz, len);

	// eal_get_virtual_area mmaps (reserves) mmap_len bytes of address space
	data = eal_get_virtual_area(NULL, &mmap_len, page_sz, 0, 0);
	if (data == NULL) {
		free(ma);
		return -1;
	}

	rte_spinlock_lock(&mem_area_lock);

	fd = -1;

	if (internal_conf->no_shconf) {
		/* remap virtual area as writable */
		static const int flags = RTE_MAP_FORCE_ADDRESS |
			RTE_MAP_PRIVATE | RTE_MAP_ANONYMOUS;

		// remap the mmap_len area as writable anonymous memory
		void *new_data = rte_mem_map(data, mmap_len,
			RTE_PROT_READ | RTE_PROT_WRITE, flags, fd, 0);
		if (new_data == NULL) {
			RTE_LOG(DEBUG, EAL, "%s(): couldn't remap anonymous memory: %s\n",
					__func__, rte_strerror(rte_errno));
			goto fail;
		}
	} else {
		eal_get_fbarray_path(path, sizeof(path), name);

		/*
		 * Each fbarray is unique to process namespace, i.e. the
		 * filename depends on process prefix. Try to take out a lock
		 * and see if we succeed. If we don't, someone else is using it
		 * already.
		 */
		fd = eal_file_open(path, EAL_OPEN_CREATE | EAL_OPEN_READWRITE);
		if (fd < 0) {
			RTE_LOG(DEBUG, EAL, "%s(): couldn't open %s: %s\n",
				__func__, path, rte_strerror(rte_errno));
			goto fail;
		} else if (eal_file_lock(
				fd, EAL_FLOCK_EXCLUSIVE, EAL_FLOCK_RETURN)) {
			RTE_LOG(DEBUG, EAL, "%s(): couldn't lock %s: %s\n",
				__func__, path, rte_strerror(rte_errno));
			rte_errno = EBUSY;
			goto fail;
		}

		/* take out a non-exclusive lock, so that other processes could
		 * still attach to it, but no other process could reinitialize
		 * it.
		 */
		if (eal_file_lock(fd, EAL_FLOCK_SHARED, EAL_FLOCK_RETURN))
			goto fail;

		if (resize_and_map(fd, path, data, mmap_len))
			goto fail;
	}
	// fill in ma so it can be linked into the tailq
	ma->addr = data;
	ma->len = mmap_len;
	ma->fd = fd;

	/* do not close fd - keep it until detach/destroy */
	TAILQ_INSERT_TAIL(&mem_area_tailq, ma, next);

	/* initialize the data */
	// zero the area
	memset(data, 0, mmap_len);

	/* populate data structure */
	// attach the mapped memory to the rte_fbarray
	strlcpy(arr->name, name, sizeof(arr->name));
	arr->data = data;
	arr->len = len;
	arr->elt_sz = elt_sz;
	arr->count = 0;

	msk = get_used_mask(data, elt_sz, len);
	msk->n_masks = MASK_LEN_TO_IDX(RTE_ALIGN_CEIL(len, MASK_ALIGN));

	rte_rwlock_init(&arr->rwlock);

	rte_spinlock_unlock(&mem_area_lock);

	return 0;
fail:
	if (data)
		rte_mem_unmap(data, mmap_len);
	if (fd >= 0)
		close(fd);
	free(ma);

	rte_spinlock_unlock(&mem_area_lock);
	return -1;
}
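
The fbarray is DPDK's generic file-backed, multi-process-shareable array; both memzones and memsegs live in one. A sketch of how EAL code typically consumes an initialized fbarray (struct my_elem is a placeholder and assumes the array was created with elt_sz == sizeof(struct my_elem)):

#include <rte_fbarray.h>

struct my_elem { int value; };

static int fbarray_demo(struct rte_fbarray *arr, int value)
{
	/* find the first free slot */
	int idx = rte_fbarray_find_next_free(arr, 0);
	if (idx < 0)
		return -1;                       /* array is full */

	struct my_elem *e = rte_fbarray_get(arr, idx);
	if (e == NULL)
		return -1;
	e->value = value;

	/* mark the slot used so other attached processes see it */
	return rte_fbarray_set_used(arr, idx);
}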

1.5 Analysis of rte_eal_memory_init

int
rte_eal_memory_init(void)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	int retval;
	RTE_LOG(DEBUG, EAL, "Setting up physically contiguous memory...\n");

	if (!mcfg)
		return -1;

	/* lock mem hotplug here, to prevent races while we init */
	rte_mcfg_mem_read_lock();

	// set up the memseg lists
	if (rte_eal_memseg_init() < 0)
		goto fail;

	if (eal_memalloc_init() < 0)
		goto fail;

	retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?
			rte_eal_hugepage_init() :
			rte_eal_hugepage_attach();
	if (retval < 0)
		goto fail;

	if (internal_conf->no_shconf == 0 && rte_eal_memdevice_init() < 0)
		goto fail;

	return 0;
fail:
	rte_mcfg_mem_read_unlock();
	return -1;
}

For rte_eal_memseg_init, the analysis below follows the 64-bit primary-process path, which ends up in eal_dynmem_memseg_lists_init:

int
eal_dynmem_memseg_lists_init(void)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct memtype {
		uint64_t page_sz;
		int socket_id;
	} *memtypes = NULL;
	int i, hpi_idx, msl_idx, ret = -1; /* fail unless told to succeed */
	struct rte_memseg_list *msl;
	uint64_t max_mem, max_mem_per_type;
	unsigned int max_seglists_per_type;
	unsigned int n_memtypes, cur_type;
	struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* no-huge does not need this at all */
	if (internal_conf->no_hugetlbfs)
		return 0;

	/*
	 * figuring out amount of memory we're going to have is a long and very
	 * involved process. the basic element we're operating with is a memory
	 * type, defined as a combination of NUMA node ID and page size (so that
	 * e.g. 2 sockets with 2 page sizes yield 4 memory types in total).
	 *
	 * deciding amount of memory going towards each memory type is a
	 * balancing act between maximum segments per type, maximum memory per
	 * type, and number of detected NUMA nodes. the goal is to make sure
	 * each memory type gets at least one memseg list.
	 *
	 * the total amount of memory is limited by RTE_MAX_MEM_MB value.
	 *
	 * the total amount of memory per type is limited by either
	 * RTE_MAX_MEM_MB_PER_TYPE, or by RTE_MAX_MEM_MB divided by the number
	 * of detected NUMA nodes. additionally, maximum number of segments per
	 * type is also limited by RTE_MAX_MEMSEG_PER_TYPE. this is because for
	 * smaller page sizes, it can take hundreds of thousands of segments to
	 * reach the above specified per-type memory limits.
	 *
	 * additionally, each type may have multiple memseg lists associated
	 * with it, each limited by either RTE_MAX_MEM_MB_PER_LIST for bigger
	 * page sizes, or RTE_MAX_MEMSEG_PER_LIST segments for smaller ones.
	 *
	 * the number of memseg lists per type is decided based on the above
	 * limits, and also taking number of detected NUMA nodes, to make sure
	 * that we don't run out of memseg lists before we populate all NUMA
	 * nodes with memory.
	 *
	 * we do this in three stages. first, we collect the number of types.
	 * then, we figure out memory constraints and populate the list of
	 * would-be memseg lists. then, we go ahead and allocate the memseg
	 * lists.
	 */

	/* create space for mem types */
	// one entry per (page size, NUMA socket) combination
	n_memtypes = internal_conf->num_hugepage_sizes * rte_socket_count();
	memtypes = calloc(n_memtypes, sizeof(*memtypes));
	if (memtypes == NULL) {
		RTE_LOG(ERR, EAL, "Cannot allocate space for memory types\n");
		return -1;
	}

	/* populate mem types */
	// fill in each (page size, NUMA socket) combination
	cur_type = 0;
	for (hpi_idx = 0; hpi_idx < (int) internal_conf->num_hugepage_sizes;
			hpi_idx++) {
		struct hugepage_info *hpi;
		uint64_t hugepage_sz;

		hpi = &internal_conf->hugepage_info[hpi_idx];
		hugepage_sz = hpi->hugepage_sz;

		for (i = 0; i < (int) rte_socket_count(); i++, cur_type++) {
			int socket_id = rte_socket_id_by_idx(i);

#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES
			/* we can still sort pages by socket in legacy mode */
			if (!internal_conf->legacy_mem && socket_id > 0)
				break;
#endif
			memtypes[cur_type].page_sz = hugepage_sz;
			memtypes[cur_type].socket_id = socket_id;

			RTE_LOG(DEBUG, EAL, "Detected memory type: "
				"socket_id:%u hugepage_sz:%" PRIu64 "\n",
				socket_id, hugepage_sz);
		}
	}
	/* number of memtypes could have been lower due to no NUMA support */
	n_memtypes = cur_type;

	/* set up limits for types */
	max_mem = (uint64_t)RTE_MAX_MEM_MB << 20;
	max_mem_per_type = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20,
			max_mem / n_memtypes);
	/*
	 * limit maximum number of segment lists per type to ensure there's
	 * space for memseg lists for all NUMA nodes with all page sizes
	 */
	max_seglists_per_type = RTE_MAX_MEMSEG_LISTS / n_memtypes;

	if (max_seglists_per_type == 0) {
		RTE_LOG(ERR, EAL, "Cannot accommodate all memory types, please increase %s\n",
			RTE_STR(RTE_MAX_MEMSEG_LISTS));
		goto out;
	}

	/* go through all mem types and create segment lists */
	msl_idx = 0;
	for (cur_type = 0; cur_type < n_memtypes; cur_type++) {
		unsigned int cur_seglist, n_seglists, n_segs;
		unsigned int max_segs_per_type, max_segs_per_list;
		struct memtype *type = &memtypes[cur_type];
		uint64_t max_mem_per_list, pagesz;
		int socket_id;

		pagesz = type->page_sz;
		socket_id = type->socket_id;

		/*
		 * we need to create segment lists for this type. we must take
		 * into account the following things:
		 *
		 * 1. total amount of memory we can use for this memory type
		 * 2. total amount of memory per memseg list allowed
		 * 3. number of segments needed to fit the amount of memory
		 * 4. number of segments allowed per type
		 * 5. number of segments allowed per memseg list
		 * 6. number of memseg lists we are allowed to take up
		 */

		/* calculate how much segments we will need in total */
		max_segs_per_type = max_mem_per_type / pagesz;
		/* limit number of segments to maximum allowed per type */
		max_segs_per_type = RTE_MIN(max_segs_per_type,
				(unsigned int)RTE_MAX_MEMSEG_PER_TYPE);
		/* limit number of segments to maximum allowed per list */
		max_segs_per_list = RTE_MIN(max_segs_per_type,
				(unsigned int)RTE_MAX_MEMSEG_PER_LIST);

		/* calculate how much memory we can have per segment list */
		max_mem_per_list = RTE_MIN(max_segs_per_list * pagesz,
				(uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20);

		/* calculate how many segments each segment list will have */
		n_segs = RTE_MIN(max_segs_per_list, max_mem_per_list / pagesz);

		/* calculate how many segment lists we can have */
		n_seglists = RTE_MIN(max_segs_per_type / n_segs,
				max_mem_per_type / max_mem_per_list);

		/* limit number of segment lists according to our maximum */
		n_seglists = RTE_MIN(n_seglists, max_seglists_per_type);

		RTE_LOG(DEBUG, EAL, "Creating %i segment lists: "
				"n_segs:%i socket_id:%i hugepage_sz:%" PRIu64 "\n",
			n_seglists, n_segs, socket_id, pagesz);

		/* create all segment lists */
		for (cur_seglist = 0; cur_seglist < n_seglists; cur_seglist++) {
			if (msl_idx >= RTE_MAX_MEMSEG_LISTS) {
				RTE_LOG(ERR, EAL,
					"No more space in memseg lists, please increase %s\n",
					RTE_STR(RTE_MAX_MEMSEG_LISTS));
				goto out;
			}
			msl = &mcfg->memsegs[msl_idx++];

			if (eal_memseg_list_init(msl, pagesz, n_segs,
					socket_id, cur_seglist, true))
				goto out;

			if (eal_memseg_list_alloc(msl, 0)) {
				RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n");
				goto out;
			}
		}
	}
	/* we're successful */
	ret = 0;
out:
	free(memtypes);
	return ret;
}
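
As a worked example of the sizing logic above, assume 2 NUMA sockets and a single 2 MB page size, with the common 64-bit default build constants (these are configurable, so your values may differ): RTE_MAX_MEM_MB = 524288 (512 GB), RTE_MAX_MEM_MB_PER_TYPE = 65536 (64 GB), RTE_MAX_MEMSEG_PER_TYPE = 32768, RTE_MAX_MEMSEG_PER_LIST = 8192, RTE_MAX_MEM_MB_PER_LIST = 32768 (32 GB), RTE_MAX_MEMSEG_LISTS = 128.

    n_memtypes        = 1 page size * 2 sockets             = 2
    max_mem_per_type  = min(64 GB, 512 GB / 2)               = 64 GB
    max_segs_per_type = min(64 GB / 2 MB, 32768)             = 32768
    max_segs_per_list = min(32768, 8192)                     = 8192
    max_mem_per_list  = min(8192 * 2 MB, 32 GB)              = 16 GB
    n_segs            = min(8192, 16 GB / 2 MB)              = 8192
    n_seglists        = min(32768 / 8192, 64 GB / 16 GB)     = 4

So each (socket, 2 MB) memory type gets 4 memseg lists of 8192 segments, covering up to 64 GB of potential hugepage memory per type.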

eal_memseg_list_init in turn calls rte_fbarray_init (via eal_memseg_list_init_named):

int
eal_memseg_list_init_named(struct rte_memseg_list *msl, const char *name,
		uint64_t page_sz, int n_segs, int socket_id, bool heap)
{
	// this mmaps n_segs * sizeof(struct rte_memseg) bytes
	if (rte_fbarray_init(&msl->memseg_arr, name, n_segs,
			sizeof(struct rte_memseg))) {
		RTE_LOG(ERR, EAL, "Cannot allocate memseg list: %s\n",
			rte_strerror(rte_errno));
		return -1;
	}

	msl->page_sz = page_sz;
	msl->socket_id = socket_id;
	msl->base_va = NULL;
	msl->heap = heap;

	RTE_LOG(DEBUG, EAL,
		"Memseg list allocated at socket %i, page size 0x%"PRIx64"kB\n",
		socket_id, page_sz >> 10);

	return 0;
}

eal_memseg_list_alloc then reserves the VA space for the segments:

int
eal_memseg_list_alloc(struct rte_memseg_list *msl, int reserve_flags)
{
	size_t page_sz, mem_sz;
	void *addr;

	page_sz = msl->page_sz;
	mem_sz = page_sz * msl->memseg_arr.len;

	// internally reserves the range with another mmap
	addr = eal_get_virtual_area(
		msl->base_va, &mem_sz, page_sz, 0, reserve_flags);
	if (addr == NULL) {
#ifndef RTE_EXEC_ENV_WINDOWS
		/* The hint would be misleading on Windows, because address
		 * is by default system-selected (base VA = 0).
		 * However, this function is called from many places,
		 * including common code, so don't duplicate the message.
		 */
		if (rte_errno == EADDRNOTAVAIL)
			RTE_LOG(ERR, EAL, "Cannot reserve %llu bytes at [%p] - "
				"please use '--" OPT_BASE_VIRTADDR "' option\n",
				(unsigned long long)mem_sz, msl->base_va);
#endif
		return -1;
	}
	msl->base_va = addr;
	msl->len = mem_sz;

	RTE_LOG(DEBUG, EAL, "VA reserved for memseg list at %p, size %zx\n",
			addr, mem_sz);

	return 0;
}

eal_memalloc_init effectively runs alloc_list for each memseg list; fd_list is a global array:

static int
alloc_list(int list_idx, int len)
{
	int *data;
	int i;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* single-file segments mode does not need fd list */
	if (!internal_conf->single_file_segments) {
		/* ensure we have space to store fd per each possible segment */
		data = malloc(sizeof(int) * len);
		if (data == NULL) {
			RTE_LOG(ERR, EAL, "Unable to allocate space for file descriptors\n");
			return -1;
		}
		/* set all fd's as invalid */
		for (i = 0; i < len; i++)
			data[i] = -1;
		fd_list[list_idx].fds = data;
		fd_list[list_idx].len = len;
	} else {
		fd_list[list_idx].fds = NULL;
		fd_list[list_idx].len = 0;
	}

	fd_list[list_idx].count = 0;
	fd_list[list_idx].memseg_list_fd = -1;

	return 0;
}

rte_eal_hugepage_init is where the hugepages are actually mapped.

Let us start with legacy mode (eal_legacy_hugepage_init):

static int
eal_legacy_hugepage_init(void)
{
	struct rte_mem_config *mcfg;
	struct hugepage_file *hugepage = NULL, *tmp_hp = NULL;
	struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES];
	struct internal_config *internal_conf =
		eal_get_internal_configuration();

	uint64_t memory[RTE_MAX_NUMA_NODES];

	unsigned hp_offset;
	int i, j;
	int nr_hugefiles, nr_hugepages = 0;
	void *addr;

	memset(used_hp, 0, sizeof(used_hp));

	/* get pointer to global configuration */
	// get the global mem_config; this region is shared between processes
	mcfg = rte_eal_get_configuration()->mem_config;

	/* hugetlbfs can be disabled */
	if (internal_conf->no_hugetlbfs) {
		void *prealloc_addr;
		size_t mem_sz;
		struct rte_memseg_list *msl;
		int n_segs, fd, flags;
#ifdef MEMFD_SUPPORTED
		int memfd;
#endif
		uint64_t page_sz;

		/* nohuge mode is legacy mode */
		internal_conf->legacy_mem = 1;

		/* nohuge mode is single-file segments mode */
		internal_conf->single_file_segments = 1;

		/* create a memseg list */
		msl = &mcfg->memsegs[0];

		mem_sz = internal_conf->memory;
		page_sz = RTE_PGSIZE_4K;
		n_segs = mem_sz / page_sz;

		if (eal_memseg_list_init_named(
				msl, "nohugemem", page_sz, n_segs, 0, true)) {
			return -1;
		}

		/* set up parameters for anonymous mmap */
		fd = -1;
		flags = MAP_PRIVATE | MAP_ANONYMOUS;

#ifdef MEMFD_SUPPORTED
		/* create a memfd and store it in the segment fd table */
		memfd = memfd_create("nohuge", 0);
		if (memfd < 0) {
			RTE_LOG(DEBUG, EAL, "Cannot create memfd: %s\n",
					strerror(errno));
			RTE_LOG(DEBUG, EAL, "Falling back to anonymous map\n");
		} else {
			/* we got an fd - now resize it */
			if (ftruncate(memfd, internal_conf->memory) < 0) {
				RTE_LOG(ERR, EAL, "Cannot resize memfd: %s\n",
						strerror(errno));
				RTE_LOG(ERR, EAL, "Falling back to anonymous map\n");
				close(memfd);
			} else {
				/* creating memfd-backed file was successful.
				 * we want changes to memfd to be visible to
				 * other processes (such as vhost backend), so
				 * map it as shared memory.
				 */
				RTE_LOG(DEBUG, EAL, "Using memfd for anonymous memory\n");
				fd = memfd;
				flags = MAP_SHARED;
			}
		}
#endif
		/* preallocate address space for the memory, so that it can be
		 * fit into the DMA mask.
		 */
		if (eal_memseg_list_alloc(msl, 0)) {
			RTE_LOG(ERR, EAL, "Cannot preallocate VA space for hugepage memory\n");
			return -1;
		}

		prealloc_addr = msl->base_va;
		addr = mmap(prealloc_addr, mem_sz, PROT_READ | PROT_WRITE,
				flags | MAP_FIXED, fd, 0);
		if (addr == MAP_FAILED || addr != prealloc_addr) {
			RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n", __func__,
					strerror(errno));
			munmap(prealloc_addr, mem_sz);
			return -1;
		}

		/* we're in single-file segments mode, so only the segment list
		 * fd needs to be set up.
		 */
		if (fd != -1) {
			if (eal_memalloc_set_seg_list_fd(0, fd) < 0) {
				RTE_LOG(ERR, EAL, "Cannot set up segment list fd\n");
				/* not a serious error, proceed */
			}
		}

		eal_memseg_list_populate(msl, addr, n_segs);

		if (mcfg->dma_maskbits &&
		    rte_mem_check_dma_mask_thread_unsafe(mcfg->dma_maskbits)) {
			RTE_LOG(ERR, EAL,
				"%s(): couldn't allocate memory due to IOVA exceeding limits of current DMA mask.\n",
				__func__);
			if (rte_eal_iova_mode() == RTE_IOVA_VA &&
			    rte_eal_using_phys_addrs())
				RTE_LOG(ERR, EAL,
					"%s(): Please try initializing EAL with --iova-mode=pa parameter.\n",
					__func__);
			goto fail;
		}
		return 0;
	}

	/* calculate total number of hugepages available. at this point we haven't
	 * yet started sorting them so they all are on socket 0 */
	/* calc_num_pages() earlier put every count into num_pages[0] */
	for (i = 0; i < (int) internal_conf->num_hugepage_sizes; i++) {
		/* meanwhile, also initialize used_hp hugepage sizes in used_hp */
		used_hp[i].hugepage_sz = internal_conf->hugepage_info[i].hugepage_sz;

		nr_hugepages += internal_conf->hugepage_info[i].num_pages[0];
	}

	/*
	 * allocate a memory area for hugepage table.
	 * this isn't shared memory yet. due to the fact that we need some
	 * processing done on these pages, shared memory will be created
	 * at a later stage.
	 */
	tmp_hp = malloc(nr_hugepages * sizeof(struct hugepage_file));
	if (tmp_hp == NULL)
		goto fail;

	memset(tmp_hp, 0, nr_hugepages * sizeof(struct hugepage_file));

	hp_offset = 0; /* where we start the current page size entries */

	huge_register_sigbus();

	/* make a copy of socket_mem, needed for balanced allocation. */
	for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
		memory[i] = internal_conf->socket_mem[i];

	/* map all hugepages and sort them */
	for (i = 0; i < (int)internal_conf->num_hugepage_sizes; i++) {
		unsigned pages_old, pages_new;
		struct hugepage_info *hpi;

		/*
		 * we don't yet mark hugepages as used at this stage, so
		 * we just map all hugepages available to the system
		 * all hugepages are still located on socket 0
		 */
		hpi = &internal_conf->hugepage_info[i];

		if (hpi->num_pages[0] == 0)
			continue;

		/* map all hugepages available */
		pages_old = hpi->num_pages[0];
		// map every available hugepage of this size
		pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, memory);
		if (pages_new < pages_old) {
			RTE_LOG(DEBUG, EAL,
				"%d not %d hugepages of size %u MB allocated\n",
				pages_new, pages_old,
				(unsigned)(hpi->hugepage_sz / 0x100000));

			int pages = pages_old - pages_new;

			nr_hugepages -= pages;
			hpi->num_pages[0] = pages_new;
			if (pages_new == 0)
				continue;
		}

		// record the physical / IOVA addresses
		if (rte_eal_using_phys_addrs() &&
				rte_eal_iova_mode() != RTE_IOVA_VA) {
			/* find physical addresses for each hugepage */
			if (find_physaddrs(&tmp_hp[hp_offset], hpi) < 0) {
				RTE_LOG(DEBUG, EAL, "Failed to find phys addr "
					"for %u MB pages\n",
					(unsigned int)(hpi->hugepage_sz / 0x100000));
				goto fail;
			}
		} else {
			/* set physical addresses for each hugepage */
			if (set_physaddrs(&tmp_hp[hp_offset], hpi) < 0) {
				RTE_LOG(DEBUG, EAL, "Failed to set phys addr "
					"for %u MB pages\n",
					(unsigned int)(hpi->hugepage_sz / 0x100000));
				goto fail;
			}
		}

		// parse the maps file, compare the addresses and set each page's socket id
		if (find_numasocket(&tmp_hp[hp_offset], hpi) < 0){
			RTE_LOG(DEBUG, EAL, "Failed to find NUMA socket for %u MB pages\n",
					(unsigned)(hpi->hugepage_sz / 0x100000));
			goto fail;
		}

		qsort(&tmp_hp[hp_offset], hpi->num_pages[0],
		      sizeof(struct hugepage_file), cmp_physaddr);

		/* we have processed a num of hugepages of this size, so inc offset */
		// advance the offset past the pages just mapped
		hp_offset += hpi->num_pages[0];
	}

	huge_recover_sigbus();

	// determine the total hugepage memory size
	if (internal_conf->memory == 0 && internal_conf->force_sockets == 0)
		internal_conf->memory = eal_get_hugepage_mem_size();

	// record the number of hugepage files
	nr_hugefiles = nr_hugepages;


	/* clean out the numbers of pages */
	// reset the counters
	for (i = 0; i < (int) internal_conf->num_hugepage_sizes; i++)
		for (j = 0; j < RTE_MAX_NUMA_NODES; j++)
			internal_conf->hugepage_info[i].num_pages[j] = 0;

	/* get hugepages for each socket */
	for (i = 0; i < nr_hugefiles; i++) {
		int socket = tmp_hp[i].socket_id;

		/* find a hugepage info with right size and increment num_pages */
		const int nb_hpsizes = RTE_MIN(MAX_HUGEPAGE_SIZES,
				(int)internal_conf->num_hugepage_sizes);
		for (j = 0; j < nb_hpsizes; j++) {
			if (tmp_hp[i].size ==
					internal_conf->hugepage_info[j].hugepage_sz) {
				internal_conf->hugepage_info[j].num_pages[socket]++;
			}
		}
	}

	/* make a copy of socket_mem, needed for number of pages calculation */
	for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
		memory[i] = internal_conf->socket_mem[i];

	/* calculate final number of pages */
	nr_hugepages = eal_dynmem_calc_num_pages_per_socket(memory,
			internal_conf->hugepage_info, used_hp,
			internal_conf->num_hugepage_sizes);

	/* error if not enough memory available */
	if (nr_hugepages < 0)
		goto fail;

	/* reporting in! */
	// just log what will be requested
	for (i = 0; i < (int) internal_conf->num_hugepage_sizes; i++) {
		for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
			if (used_hp[i].num_pages[j] > 0) {
				RTE_LOG(DEBUG, EAL,
					"Requesting %u pages of size %uMB"
					" from socket %i\n",
					used_hp[i].num_pages[j],
					(unsigned)
					(used_hp[i].hugepage_sz / 0x100000),
					j);
			}
		}
	}

	/* create shared memory */
	hugepage = create_shared_memory(eal_hugepage_data_path(),
			nr_hugefiles * sizeof(struct hugepage_file));

	if (hugepage == NULL) {
		RTE_LOG(ERR, EAL, "Failed to create shared memory!\n");
		goto fail;
	}
	memset(hugepage, 0, nr_hugefiles * sizeof(struct hugepage_file));

	/*
	 * unmap pages that we won't need (looks at used_hp).
	 * also, sets final_va to NULL on pages that were unmapped.
	 */
	if (unmap_unneeded_hugepages(tmp_hp, used_hp,
			internal_conf->num_hugepage_sizes) < 0) {
		RTE_LOG(ERR, EAL, "Unmapping and locking hugepages failed!\n");
		goto fail;
	}

	/*
	 * copy stuff from malloc'd hugepage* to the actual shared memory.
	 * this procedure only copies those hugepages that have orig_va
	 * not NULL. has overflow protection.
	 */
	// copy into the shared memory
	if (copy_hugepages_to_shared_mem(hugepage, nr_hugefiles,
			tmp_hp, nr_hugefiles) < 0) {
		RTE_LOG(ERR, EAL, "Copying tables to shared memory failed!\n");
		goto fail;
	}

#ifndef RTE_ARCH_64
	/* for legacy 32-bit mode, we did not preallocate VA space, so do it */
	if (internal_conf->legacy_mem &&
			prealloc_segments(hugepage, nr_hugefiles)) {
		RTE_LOG(ERR, EAL, "Could not preallocate VA space for hugepages\n");
		goto fail;
	}
#endif

	/* remap all pages we do need into memseg list VA space, so that those
	 * pages become first-class citizens in DPDK memory subsystem
	 */
	// remap the needed pages into the memseg lists
	if (remap_needed_hugepages(hugepage, nr_hugefiles)) {
		RTE_LOG(ERR, EAL, "Couldn't remap hugepage files into memseg lists\n");
		goto fail;
	}

	/* free the hugepage backing files */
	if (internal_conf->hugepage_file.unlink_before_mapping &&
		unlink_hugepage_files(tmp_hp, internal_conf->num_hugepage_sizes) < 0) {
		RTE_LOG(ERR, EAL, "Unlinking hugepage files failed!\n");
		goto fail;
	}

	/* free the temporary hugepage table */
	free(tmp_hp);
	tmp_hp = NULL;

	munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file));
	hugepage = NULL;

	/* we're not going to allocate more pages, so release VA space for
	 * unused memseg lists
	 */
	// release unused rte_memseg_list entries
	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
		struct rte_memseg_list *msl = &mcfg->memsegs[i];
		size_t mem_sz;

		/* skip inactive lists */
		if (msl->base_va == NULL)
			continue;
		/* skip lists where there is at least one page allocated */
		if (msl->memseg_arr.count > 0)
			continue;
		/* this is an unused list, deallocate it */
		mem_sz = msl->len;
		munmap(msl->base_va, mem_sz);
		msl->base_va = NULL;
		msl->heap = 0;

		/* destroy backing fbarray */
		rte_fbarray_destroy(&msl->memseg_arr);
	}

	if (mcfg->dma_maskbits &&
	    rte_mem_check_dma_mask_thread_unsafe(mcfg->dma_maskbits)) {
		RTE_LOG(ERR, EAL,
			"%s(): couldn't allocate memory due to IOVA exceeding limits of current DMA mask.\n",
			__func__);
		goto fail;
	}

	return 0;

fail:
	huge_recover_sigbus();
	free(tmp_hp);
	if (hugepage != NULL)
		munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file));

	return -1;
}
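
Once this returns, the mapped hugepages are visible through the public memseg API. A small sketch, assuming rte_eal_init() has already run, that walks and prints every populated segment:

#include <inttypes.h>
#include <stdio.h>
#include <rte_common.h>
#include <rte_memory.h>

static int
print_seg(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
		void *arg __rte_unused)
{
	printf("socket %d: va %p iova 0x%" PRIx64 " len %zu page_sz %" PRIu64 "\n",
			ms->socket_id, ms->addr, ms->iova, ms->len, msl->page_sz);
	return 0; /* returning 0 keeps the walk going */
}

/* somewhere after rte_eal_init(): */
/* rte_memseg_walk(print_seg, NULL); */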

1.6 Analysis of rte_eal_malloc_heap_init

int rte_eal_malloc_heap_init(void)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	unsigned int i;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (internal_conf->match_allocations)
		RTE_LOG(DEBUG, EAL, "Hugepages will be freed exactly as allocated.\n");

	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
		/* assign min socket ID to external heaps */
		mcfg->next_socket_id = EXTERNAL_HEAP_MIN_SOCKET_ID;

		/* assign names to default DPDK heaps */
		// copy the heap names
		for (i = 0; i < rte_socket_count(); i++) {
			struct malloc_heap *heap = &mcfg->malloc_heaps[i];
			char heap_name[RTE_HEAP_NAME_MAX_LEN];
			int socket_id = rte_socket_id_by_idx(i);

			snprintf(heap_name, sizeof(heap_name),
					"socket_%i", socket_id);
			strlcpy(heap->name, heap_name, RTE_HEAP_NAME_MAX_LEN);
			heap->socket_id = socket_id;
		}
	}

	// register the multi-process request handlers
	if (register_mp_requests()) {
		RTE_LOG(ERR, EAL, "Couldn't register malloc multiprocess actions\n");
		rte_mcfg_mem_read_unlock();
		return -1;
	}

	/* unlock mem hotplug here. it's safe for primary as no requests can
	 * even come before primary itself is fully initialized, and secondaries
	 * do not need to initialize the heap.
	 */
	rte_mcfg_mem_read_unlock();

	/* secondary process does not need to initialize anything */
	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
		return 0;

	/* add all IOVA-contiguous areas to the heap */
	// each memseg goes into a malloc_heap: a malloc_elem is set up at the head
	// and tail, the tail points back to the head, and the head is inserted
	// into the free list
    /* rte_memseg_contig_walk() runs malloc_add_seg; its core loop is:
        struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
        int i, ms_idx, ret = 0;

        for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
            struct rte_memseg_list *msl = &mcfg->memsegs[i];
            const struct rte_memseg *ms;
            struct rte_fbarray *arr;

            if (msl->memseg_arr.count == 0)
                continue;

            arr = &msl->memseg_arr;

            ms_idx = rte_fbarray_find_next_used(arr, 0);
            while (ms_idx >= 0) {
                int n_segs;
                size_t len;

                ms = rte_fbarray_get(arr, ms_idx);

                /* find how many more segments there are, starting with
                 * this one.
                 */
                n_segs = rte_fbarray_find_contig_used(arr, ms_idx);
                len = n_segs * msl->page_sz;
				
    			// this operates on rte_config->mem_config->memsegs[i]; ms = rte_fbarray_get(arr, ms_idx)
                ret = func(msl, ms, len, arg);
                if (ret)
                    return ret;
                ms_idx = rte_fbarray_find_next_used(arr,
                        ms_idx + n_segs);
            }
        }
    */
	return rte_memseg_contig_walk(malloc_add_seg, NULL);
}
/*
 * Expand the heap with a memory area.
 */
static struct malloc_elem *
malloc_heap_add_memory(struct malloc_heap *heap, struct rte_memseg_list *msl,
		void *start, size_t len, bool dirty)
{
	struct malloc_elem *elem = start;
	
	// initialize elem (header and trailer); note the msl: later allocations
	// will come from this same msl
	malloc_elem_init(elem, heap, msl, len, elem, len, dirty);

	// insert into the heap's element list
	malloc_elem_insert(elem);

	// merge with adjacent free elements
	elem = malloc_elem_join_adjacent_free(elem);

	// insert into the appropriate free list
	malloc_elem_free_list_insert(elem);

	return elem;
}

static int malloc_add_seg(const struct rte_memseg_list *msl,
		const struct rte_memseg *ms, size_t len, void *arg __rte_unused)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *found_msl;
	struct malloc_heap *heap;
	int msl_idx, heap_idx;

	if (msl->external)
		return 0;

	// find the index of the heap in mcfg->malloc_heaps whose socket id matches msl->socket_id
	heap_idx = malloc_socket_to_heap_id(msl->socket_id);
	if (heap_idx < 0) {
		RTE_LOG(ERR, EAL, "Memseg list has invalid socket id\n");
		return -1;
	}
	heap = &mcfg->malloc_heaps[heap_idx];

	/* msl is const, so find it */
	msl_idx = msl - mcfg->memsegs;

	if (msl_idx < 0 || msl_idx >= RTE_MAX_MEMSEG_LISTS)
		return -1;

	// note the msl (looked up again by index, since the parameter is const)
	found_msl = &mcfg->memsegs[msl_idx];

	// add this memseg's memory to the malloc_heap
	malloc_heap_add_memory(heap, found_msl, ms->addr, len,
			ms->flags & RTE_MEMSEG_FLAG_DIRTY);

	heap->total_size += len;

	RTE_LOG(DEBUG, EAL, "Added %zuM to heap on socket %i\n", len >> 20,
			msl->socket_id);
	return 0;
}
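
After malloc_add_seg has run for every contiguous area, the resulting heaps can be inspected through the public stats API; a quick sketch (assuming a heap exists for socket 0):

#include <stdio.h>
#include <rte_malloc.h>

static void
dump_socket0_heap(void)
{
	struct rte_malloc_socket_stats stats;

	/* stats for the heap that malloc_add_seg populated for socket 0 */
	if (rte_malloc_get_socket_stats(0, &stats) == 0)
		printf("heap 0: total %zu, free %zu, largest free block %zu\n",
				stats.heap_totalsz_bytes,
				stats.heap_freesz_bytes,
				stats.greatest_free_size);

	/* or dump every heap */
	rte_malloc_dump_stats(stdout, NULL);
}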