rte_eal_init: Memory Space Initialization

Contents

1 rte_eal_memzone_init

1.1 memzone storage architecture diagram

1.2 Function analysis

1.3 Summary

2 rte_eal_memory_init

2.1 memseg storage architecture diagram

2.2 Function analysis

2.3 Summary


1 rte_eal_memzone_init

1.1 memzone storage architecture diagram

1.2 Function analysis

int
rte_eal_memzone_init(void)
{
	struct rte_mem_config *mcfg;
	int ret = 0;

	/* get pointer to global configuration */
	mcfg = rte_eal_get_configuration()->mem_config;

	rte_rwlock_write_lock(&mcfg->mlock);

	if (rte_eal_process_type() == RTE_PROC_PRIMARY &&
			rte_fbarray_init(&mcfg->memzones, "memzone",
			RTE_MAX_MEMZONE, sizeof(struct rte_memzone))) {
		RTE_LOG(ERR, EAL, "Cannot allocate memzone list\n");
		ret = -1;
	} else if (rte_eal_process_type() == RTE_PROC_SECONDARY &&
			rte_fbarray_attach(&mcfg->memzones)) {
		RTE_LOG(ERR, EAL, "Cannot attach to memzone list\n");
		ret = -1;
	}

	rte_rwlock_write_unlock(&mcfg->mlock);

	return ret;
}

static size_t
calc_data_size(size_t page_sz, unsigned int elt_sz, unsigned int len)
{
	/* len is the number of elements; the result is the page-aligned size
	 * of the data area plus the used-mask (see the diagram above) */
	size_t data_sz = elt_sz * len;
	size_t msk_sz = calc_mask_size(len);
	return RTE_ALIGN_CEIL(data_sz + msk_sz, page_sz);
}
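
To make the layout concrete, here is a small standalone sketch of the same math, assuming 4 KiB system pages, DPDK's default RTE_MAX_MEMZONE of 2560, and an assumed element size of 72 bytes for struct rte_memzone; mask_size() approximates the EAL's calc_mask_size (one bit per element packed into 64-bit words, plus the used_mask header):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SZ 4096u
#define ALIGN_CEIL(v, a) ((((v) + (a) - 1) / (a)) * (a))

struct used_mask {              /* same shape as the EAL's used_mask */
	unsigned int n_masks;
	uint64_t data[];
};

static size_t mask_size(unsigned int len)
{
	unsigned int n_masks = ALIGN_CEIL(len, 64) / 64; /* one bit per element */
	return sizeof(struct used_mask) + n_masks * sizeof(uint64_t);
}

int main(void)
{
	unsigned int len = 2560;    /* default RTE_MAX_MEMZONE */
	unsigned int elt_sz = 72;   /* assumed sizeof(struct rte_memzone) */
	size_t data_sz = (size_t)elt_sz * len;
	size_t mmap_len = ALIGN_CEIL(data_sz + mask_size(len), PAGE_SZ);

	/* 184320 B data + 328 B mask -> 188416 B (46 pages) */
	printf("data=%zu mask=%zu mmap_len=%zu\n",
			data_sz, mask_size(len), mmap_len);
	return 0;
}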

int
rte_fbarray_init(struct rte_fbarray *arr, const char *name, unsigned int len,
		unsigned int elt_sz)
{
	size_t page_sz, mmap_len;
	char path[PATH_MAX];
	struct used_mask *msk;
	struct mem_area *ma = NULL;
	void *data = NULL;
	int fd = -1;

	if (arr == NULL) {
		rte_errno = EINVAL;
		return -1;
	}

	if (fully_validate(name, elt_sz, len))
		return -1;

	/* allocate mem area before doing anything */
	ma = malloc(sizeof(*ma));
	if (ma == NULL) {
		rte_errno = ENOMEM;
		return -1;
	}

	page_sz = sysconf(_SC_PAGESIZE);
	if (page_sz == (size_t)-1) {
		free(ma);
		return -1;
	}

	/* calculate our memory limits */
	/* compute how much memory we need to reserve */
	mmap_len = calc_data_size(page_sz, elt_sz, len);

	/* anonymously reserve a virtual area of mmap_len bytes */
	data = eal_get_virtual_area(NULL, &mmap_len, page_sz, 0, 0);
	if (data == NULL) {
		free(ma);
		return -1;
	}

	rte_spinlock_lock(&mem_area_lock);

	fd = -1;

	if (internal_config.no_shconf) {
		/* remap virtual area as writable */
		void *new_data = mmap(data, mmap_len, PROT_READ | PROT_WRITE,
				MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS, fd, 0);
		if (new_data == MAP_FAILED) {
			RTE_LOG(DEBUG, EAL, "%s(): couldn't remap anonymous memory: %s\n",
					__func__, strerror(errno));
			goto fail;
		}
	} else {
		/* e.g. path: /var/run/dpdk/pg1/fbarray_memzone */
		eal_get_fbarray_path(path, sizeof(path), name);

		/*
		 * Each fbarray is unique to process namespace, i.e. the
		 * filename depends on process prefix. Try to take out a lock
		 * and see if we succeed. If we don't, someone else is using it
		 * already.
		 */
		fd = open(path, O_CREAT | O_RDWR, 0600);
		if (fd < 0) {
			RTE_LOG(DEBUG, EAL, "%s(): couldn't open %s: %s\n",
					__func__, path, strerror(errno));
			rte_errno = errno;
			goto fail;
		} else if (flock(fd, LOCK_EX | LOCK_NB)) {
			RTE_LOG(DEBUG, EAL, "%s(): couldn't lock %s: %s\n",
					__func__, path, strerror(errno));
			rte_errno = EBUSY;
			goto fail;
		}

		/* take out a non-exclusive lock, so that other processes could
		 * still attach to it, but no other process could reinitialize
		 * it.
		 */
		if (flock(fd, LOCK_SH | LOCK_NB)) {
			rte_errno = errno;
			goto fail;
		}

		/* resize the file to the computed size and map fd at data */
		if (resize_and_map(fd, data, mmap_len))
			goto fail;
	}
	ma->addr = data;
	ma->len = mmap_len; /* data size + mask size */
	ma->fd = fd;

	/* do not close fd - keep it until detach/destroy */
	/* add ma to the global mem_area list */
	TAILQ_INSERT_TAIL(&mem_area_tailq, ma, next);

	/* initialize the data */
	memset(data, 0, mmap_len);

	/* populate data structure */
	/* fill in the caller's fbarray */
	strlcpy(arr->name, name, sizeof(arr->name));
	arr->data = data;
	arr->len = len;
	arr->elt_sz = elt_sz;
	arr->count = 0;
    
	/* get the mask's start address: offset past the n data elements */
	msk = get_used_mask(data, elt_sz, len);
	/* record how many 64-bit mask words cover len elements */
	msk->n_masks = MASK_LEN_TO_IDX(RTE_ALIGN_CEIL(len, MASK_ALIGN));

	rte_rwlock_init(&arr->rwlock);

	rte_spinlock_unlock(&mem_area_lock);

	return 0;
fail:
	if (data)
		munmap(data, mmap_len);
	if (fd >= 0)
		close(fd);
	free(ma);

	rte_spinlock_unlock(&mem_area_lock);
	return -1;
}
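
For reference, this is roughly how the fbarray is consumed afterwards; rte_fbarray_find_next_free, rte_fbarray_set_used and rte_fbarray_get are the real EAL API, while the wrapper itself is only an illustrative sketch:

#include <rte_fbarray.h>
#include <rte_memzone.h>

/* sketch: claim the first free slot and return a pointer into the data area */
static struct rte_memzone *
claim_free_slot(struct rte_fbarray *arr)
{
	int idx = rte_fbarray_find_next_free(arr, 0);
	if (idx < 0)
		return NULL;                  /* no free slot left */
	rte_fbarray_set_used(arr, idx);       /* flips a bit in the used_mask */
	return rte_fbarray_get(arr, idx);     /* data + idx * elt_sz */
}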

1.3 Summary

This code does two main things:

1. Reserves a block of virtual memory for the memzone fbarray and maps the backing fd onto it.

2. Allocates a mem_area descriptor and stores it on the mem_area_tailq list.
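
Once the fbarray is initialized, rte_memzone_reserve hands out entries from it. A minimal usage sketch (the zone name and size are arbitrary):

#include <stdio.h>
#include <inttypes.h>
#include <rte_memzone.h>

/* sketch: reserve 1 MiB from the memzone list set up above */
static const struct rte_memzone *
reserve_example(void)
{
	const struct rte_memzone *mz = rte_memzone_reserve("example_zone",
			1 << 20, SOCKET_ID_ANY, 0);
	if (mz != NULL)
		printf("va=%p iova=0x%" PRIx64 " len=%zu\n",
				mz->addr, mz->iova, mz->len);
	return mz;
}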

2 rte_eal_memory_init

2.1 memseg storage architecture diagram

2.2 Function analysis

Call chain:

rte_eal_memory_init->rte_eal_memseg_init->memseg_primary_init

static int __rte_unused
memseg_primary_init(void)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct memtype {
		uint64_t page_sz;
		int socket_id;
	} *memtypes = NULL;
	int i, hpi_idx, msl_idx, ret = -1; /* fail unless told to succeed */
	struct rte_memseg_list *msl;
	uint64_t max_mem, max_mem_per_type;
	unsigned int max_seglists_per_type;
	unsigned int n_memtypes, cur_type;

	/* no-huge does not need this at all */
	if (internal_config.no_hugetlbfs)
		return 0;

	/*
	 * figuring out amount of memory we're going to have is a long and very
	 * involved process. the basic element we're operating with is a memory
	 * type, defined as a combination of NUMA node ID and page size (so that
	 * e.g. 2 sockets with 2 page sizes yield 4 memory types in total).
	 *
	 * deciding amount of memory going towards each memory type is a
	 * balancing act between maximum segments per type, maximum memory per
	 * type, and number of detected NUMA nodes. the goal is to make sure
	 * each memory type gets at least one memseg list.
	 *
	 * the total amount of memory is limited by RTE_MAX_MEM_MB value.
	 *
	 * the total amount of memory per type is limited by either
	 * RTE_MAX_MEM_MB_PER_TYPE, or by RTE_MAX_MEM_MB divided by the number
	 * of detected NUMA nodes. additionally, maximum number of segments per
	 * type is also limited by RTE_MAX_MEMSEG_PER_TYPE. this is because for
	 * smaller page sizes, it can take hundreds of thousands of segments to
	 * reach the above specified per-type memory limits.
	 *
	 * additionally, each type may have multiple memseg lists associated
	 * with it, each limited by either RTE_MAX_MEM_MB_PER_LIST for bigger
	 * page sizes, or RTE_MAX_MEMSEG_PER_LIST segments for smaller ones.
	 *
	 * the number of memseg lists per type is decided based on the above
	 * limits, and also taking number of detected NUMA nodes, to make sure
	 * that we don't run out of memseg lists before we populate all NUMA
	 * nodes with memory.
	 *
	 * we do this in three stages. first, we collect the number of types.
	 * then, we figure out memory constraints and populate the list of
	 * would-be memseg lists. then, we go ahead and allocate the memseg
	 * lists.
	 */

	/* create space for mem types */
	/* on the author's machine only one socket and one hugepage size are in
	 * use, so this evaluates to 1 */
	n_memtypes = internal_config.num_hugepage_sizes * rte_socket_count();
	memtypes = calloc(n_memtypes, sizeof(*memtypes));
	if (memtypes == NULL) {
		RTE_LOG(ERR, EAL, "Cannot allocate space for memory types\n");
		return -1;
	}

	/* populate mem types */
	cur_type = 0;
	for (hpi_idx = 0; hpi_idx < (int) internal_config.num_hugepage_sizes;
			hpi_idx++) {
		struct hugepage_info *hpi;
		uint64_t hugepage_sz;

		hpi = &internal_config.hugepage_info[hpi_idx];
		hugepage_sz = hpi->hugepage_sz;

		for (i = 0; i < (int) rte_socket_count(); i++, cur_type++) {
			int socket_id = rte_socket_id_by_idx(i);

#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES
			/* we can still sort pages by socket in legacy mode */
			if (!internal_config.legacy_mem && socket_id > 0)
				break;
#endif
			memtypes[cur_type].page_sz = hugepage_sz;
			memtypes[cur_type].socket_id = socket_id;

			RTE_LOG(DEBUG, EAL, "Detected memory type: "
				"socket_id:%u hugepage_sz:%" PRIu64 "\n",
				socket_id, hugepage_sz);
		}
	}
	/* number of memtypes could have been lower due to no NUMA support */
	n_memtypes = cur_type;

	/* set up limits for types */
	max_mem = (uint64_t)RTE_MAX_MEM_MB << 20;
	/* with one memtype: max_mem_per_type = 128 G */
	max_mem_per_type = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20,
			max_mem / n_memtypes);

	/*
	 * limit maximum number of segment lists per type to ensure there's
	 * space for memseg lists for all NUMA nodes with all page sizes
	 */
	/* in the author's build: max_seglists_per_type = 64 */
	max_seglists_per_type = RTE_MAX_MEMSEG_LISTS / n_memtypes;

	if (max_seglists_per_type == 0) {
		RTE_LOG(ERR, EAL, "Cannot accommodate all memory types, please increase %s\n",
			RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));
		goto out;
	}

	/* go through all mem types and create segment lists */
	msl_idx = 0;
	for (cur_type = 0; cur_type < n_memtypes; cur_type++) {
		unsigned int cur_seglist, n_seglists, n_segs;
		unsigned int max_segs_per_type, max_segs_per_list;
		struct memtype *type = &memtypes[cur_type];
		uint64_t max_mem_per_list, pagesz;
		int socket_id;

		pagesz = type->page_sz;
		socket_id = type->socket_id;

		/*
		 * we need to create segment lists for this type. we must take
		 * into account the following things:
		 *
		 * 1. total amount of memory we can use for this memory type
		 * 2. total amount of memory per memseg list allowed
		 * 3. number of segments needed to fit the amount of memory
		 * 4. number of segments allowed per type
		 * 5. number of segments allowed per memseg list
		 * 6. number of memseg lists we are allowed to take up
		 */

		/* calculate how many segments we will need in total */
		/*
		 * what follows splits the memory for this type into lists:
		 * how many lists, how many pages per list, and how large each
		 * page is; the annotated values are for 2 MB pages on the
		 * author's machine (see the diagram above)
		 */
		/* max_segs_per_type: 65536 pages */
		max_segs_per_type = max_mem_per_type / pagesz;

		/* limit number of segments to maximum allowed per type */
		/* max_segs_per_type: 32768 pages */
		max_segs_per_type = RTE_MIN(max_segs_per_type,
				(unsigned int)RTE_MAX_MEMSEG_PER_TYPE);

		/* limit number of segments to maximum allowed per list */
		/* max_segs_per_list: 8192 pages */
		max_segs_per_list = RTE_MIN(max_segs_per_type,
				(unsigned int)RTE_MAX_MEMSEG_PER_LIST);

		/* calculate how much memory we can have per segment list */
		/* max_mem_per_list: 17179869184 (16 G) */
		max_mem_per_list = RTE_MIN(max_segs_per_list * pagesz,
				(uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20);

		/* calculate how many segments each segment list will have */
		/* n_segs: 8192 */
		n_segs = RTE_MIN(max_segs_per_list, max_mem_per_list / pagesz);

		/* calculate how many segment lists we can have */
		/* n_seglists: 4 */
		n_seglists = RTE_MIN(max_segs_per_type / n_segs,
				max_mem_per_type / max_mem_per_list);

		/* limit number of segment lists according to our maximum */
		/* n_seglists: 4 */
		n_seglists = RTE_MIN(n_seglists, max_seglists_per_type);
        
		RTE_LOG(DEBUG, EAL, "Creating %i segment lists: "
				"n_segs:%i socket_id:%i hugepage_sz:%" PRIu64 "\n",
			n_seglists, n_segs, socket_id, pagesz);

		/* create all segment lists */
		for (cur_seglist = 0; cur_seglist < n_seglists; cur_seglist++) {
			if (msl_idx >= RTE_MAX_MEMSEG_LISTS) {
				RTE_LOG(ERR, EAL,
					"No more space in memseg lists, please increase %s\n",
					RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));
				goto out;
			}
			msl = &mcfg->memsegs[msl_idx++];
			/* the memory for this type was split into four lists
			 * here, and each must be set up: base virtual address,
			 * length, number of pages, etc.; see the next two
			 * functions */

			if (alloc_memseg_list(msl, pagesz, n_segs,
					socket_id, cur_seglist))
				goto out;

			if (alloc_va_space(msl)) {
				RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n");
				goto out;
			}
		}
	}
	/* we're successful */
	ret = 0;
out:
	free(memtypes);
	return ret;
}
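
A standalone sketch of the same arithmetic, assuming one memory type with 2 MB pages and the limits from the author's build (RTE_MAX_MEM_MB = 524288, RTE_MAX_MEM_MB_PER_TYPE = 131072, RTE_MAX_MEMSEG_PER_TYPE = 32768, RTE_MAX_MEMSEG_PER_LIST = 8192, RTE_MAX_MEM_MB_PER_LIST = 32768, RTE_MAX_MEMSEG_LISTS = 64); it reproduces the values annotated in the comments above:

#include <stdio.h>
#include <stdint.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
	const uint64_t pagesz = 2ULL << 20;                       /* 2 MB   */
	const uint64_t max_mem = 524288ULL << 20;                 /* 512 GB */
	const uint64_t max_mem_per_type =
			MIN(131072ULL << 20, max_mem / 1);        /* 128 GB */
	const uint64_t max_seglists_per_type = 64 / 1;            /* 64     */

	uint64_t max_segs_per_type = max_mem_per_type / pagesz;   /* 65536  */
	max_segs_per_type = MIN(max_segs_per_type, 32768);        /* 32768  */
	uint64_t max_segs_per_list = MIN(max_segs_per_type, 8192);/*  8192  */
	uint64_t max_mem_per_list = MIN(max_segs_per_list * pagesz,
			32768ULL << 20);                          /* 16 GB  */
	uint64_t n_segs = MIN(max_segs_per_list,
			max_mem_per_list / pagesz);               /*  8192  */
	uint64_t n_seglists = MIN(max_segs_per_type / n_segs,
			max_mem_per_type / max_mem_per_list);     /*  4     */
	n_seglists = MIN(n_seglists, max_seglists_per_type);      /*  4     */

	printf("n_seglists=%llu n_segs=%llu\n",
			(unsigned long long)n_seglists,
			(unsigned long long)n_segs);
	return 0;
}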

static int
alloc_memseg_list(struct rte_memseg_list *msl, uint64_t page_sz,
		int n_segs, int socket_id, int type_msl_idx)
{
	char name[RTE_FBARRAY_NAME_LEN];

	snprintf(name, sizeof(name), MEMSEG_LIST_FMT, page_sz >> 10, socket_id,
		 type_msl_idx);
	/* reserve a virtual area sized for n_segs elements of
	 * sizeof(struct rte_memseg), hang it off msl->memseg_arr, and register
	 * a mem_area on mem_area_tailq (see rte_fbarray_init above); the name
	 * is e.g. "memseg-2048k-0-0" for 2 MB pages, socket 0, list 0 */
	if (rte_fbarray_init(&msl->memseg_arr, name, n_segs,
			sizeof(struct rte_memseg))) {
		RTE_LOG(ERR, EAL, "Cannot allocate memseg list: %s\n",
			rte_strerror(rte_errno));
		return -1;
	}

	/* record the page size and socket id of this list */
	msl->page_sz = page_sz;
	msl->socket_id = socket_id;
	msl->base_va = NULL;
	msl->heap = 1; /* mark it as a heap segment */

	RTE_LOG(DEBUG, EAL, "Memseg list allocated: 0x%zxkB at socket %i\n",
			(size_t)page_sz >> 10, socket_id);

	return 0;
}

static int
alloc_va_space(struct rte_memseg_list *msl)
{
	uint64_t page_sz;
	size_t mem_sz;
	void *addr;
	int flags = 0;

	page_sz = msl->page_sz;
	mem_sz = page_sz * msl->memseg_arr.len; /* e.g. 8192 pages */

	/* msl->base_va is NULL at this point, so this reserves a fresh
	 * anonymous virtual area */
	addr = eal_get_virtual_area(msl->base_va, &mem_sz, page_sz, 0, flags);
	if (addr == NULL) {
		if (rte_errno == EADDRNOTAVAIL)
			RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p] - "
				"please use '--" OPT_BASE_VIRTADDR "' option\n",
				(unsigned long long)mem_sz, msl->base_va);
		else
			RTE_LOG(ERR, EAL, "Cannot reserve memory\n");
		return -1;
	}
	/* record the reserved area's start address and size in the list */
	msl->base_va = addr;
	msl->len = mem_sz;

	return 0;
}
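
eal_get_virtual_area boils down to an anonymous mmap used purely as an address-space reservation; nothing is committed until the range is remapped later with MAP_FIXED. A minimal sketch of the idea (not the EAL's exact flags):

#include <stddef.h>
#include <sys/mman.h>

/* sketch: reserve len bytes of address space without backing memory */
static void *
reserve_va(size_t len)
{
	void *va = mmap(NULL, len, PROT_NONE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (va == MAP_FAILED)
		return NULL;
	/* later code remaps pieces of [va, va+len) with MAP_FIXED, real
	 * protections, and a backing file (see alloc_seg below) */
	return va;
}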

Call chain:

eal_memalloc_init->fd_list_create_walk

static int
fd_list_create_walk(const struct rte_memseg_list *msl,
		void *arg __rte_unused)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	unsigned int len;
	int msl_idx;

	if (msl->external)
		return 0;

	msl_idx = msl - mcfg->memsegs;
	len = msl->memseg_arr.len;

	return alloc_list(msl_idx, len);
}

static int
alloc_list(int list_idx, int len)
{
	int *data;
	int i;

	/* single-file segments mode does not need fd list */
	if (!internal_config.single_file_segments) {
		/* ensure we have space to store fd per each possible segment */
		data = malloc(sizeof(int) * len);
		if (data == NULL) {
			RTE_LOG(ERR, EAL, "Unable to allocate space for file descriptors\n");
			return -1;
		}
		/* set all fd's as invalid */
		for (i = 0; i < len; i++)
			data[i] = -1;
		/* one fd slot per possible segment in this list */
		fd_list[list_idx].fds = data;
		fd_list[list_idx].len = len;
	} else {
		fd_list[list_idx].fds = NULL;
		fd_list[list_idx].len = 0;
	}

	fd_list[list_idx].count = 0;
	fd_list[list_idx].memseg_list_fd = -1;

	return 0;
}

Call chain:

rte_eal_hugepage_init->eal_hugepage_init

static int
eal_hugepage_init(void)
{
	struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES];
	uint64_t memory[RTE_MAX_NUMA_NODES];
	int hp_sz_idx, socket_id;

	memset(used_hp, 0, sizeof(used_hp));

	for (hp_sz_idx = 0;
			hp_sz_idx < (int) internal_config.num_hugepage_sizes;
			hp_sz_idx++) {
#ifndef RTE_ARCH_64
		struct hugepage_info dummy;
		unsigned int i;
#endif
		/* also initialize used_hp hugepage sizes in used_hp */
		struct hugepage_info *hpi;
		hpi = &internal_config.hugepage_info[hp_sz_idx];
		used_hp[hp_sz_idx].hugepage_sz = hpi->hugepage_sz;

#ifndef RTE_ARCH_64
		/* for 32-bit, limit number of pages on socket to whatever we've
		 * preallocated, as we cannot allocate more.
		 */
		memset(&dummy, 0, sizeof(dummy));
		dummy.hugepage_sz = hpi->hugepage_sz;
		if (rte_memseg_list_walk(hugepage_count_walk, &dummy) < 0)
			return -1;

		for (i = 0; i < RTE_DIM(dummy.num_pages); i++) {
			hpi->num_pages[i] = RTE_MIN(hpi->num_pages[i],
					dummy.num_pages[i]);
            printf("%s[%d] numpage:%u\r\n", __func__, __LINE__, hpi->num_pages[i]);
		}
#endif
	}

	/* make a copy of socket_mem, needed for balanced allocation. */
	for (hp_sz_idx = 0; hp_sz_idx < RTE_MAX_NUMA_NODES; hp_sz_idx++)
		memory[hp_sz_idx] = internal_config.socket_mem[hp_sz_idx];

	/* calculate final number of pages */
	/* compute how many hugepages of each size go into used_hp; on return,
	 * memory[] holds what is left over */
	if (calc_num_pages_per_socket(memory,
			internal_config.hugepage_info, used_hp,
			internal_config.num_hugepage_sizes) < 0)
		return -1;

	for (hp_sz_idx = 0;
			hp_sz_idx < (int)internal_config.num_hugepage_sizes;
			hp_sz_idx++) {
		for (socket_id = 0; socket_id < RTE_MAX_NUMA_NODES;
				socket_id++) {
			struct rte_memseg **pages;
			struct hugepage_info *hpi = &used_hp[hp_sz_idx];
			unsigned int num_pages = hpi->num_pages[socket_id];
			unsigned int num_pages_alloc;

			if (num_pages == 0)
				continue;

			RTE_LOG(DEBUG, EAL, "Allocating %u pages of size %" PRIu64 "M on socket %i\n",
				num_pages, hpi->hugepage_sz >> 20, socket_id);

			/* we may not be able to allocate all pages in one go,
			 * because we break up our memory map into multiple
			 * memseg lists. therefore, try allocating multiple
			 * times and see if we can get the desired number of
			 * pages from multiple allocations.
			 */

			num_pages_alloc = 0;
			do {
				int i, cur_pages, needed;

				needed = num_pages - num_pages_alloc;
  
				/* scratch array of memseg pointers (freed below) */
				pages = malloc(sizeof(*pages) * needed);
                
				/* do not request exact number of pages */
				/* map hugepage-backed memory for each rte_memseg */
				cur_pages = eal_memalloc_alloc_seg_bulk(pages,
						needed, hpi->hugepage_sz,
						socket_id, false);
				if (cur_pages <= 0) {
					free(pages);
					return -1;
				}

				/* mark preallocated pages as unfreeable */
				for (i = 0; i < cur_pages; i++) {
					struct rte_memseg *ms = pages[i];
					ms->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE;
				}
				/* frees only the pointer array itself; the
				 * segments it pointed to stay mapped */
				free(pages);

				num_pages_alloc += cur_pages;
			} while (num_pages_alloc != num_pages);
		}
	}
	/* if socket limits were specified, set them */
	if (internal_config.force_socket_limits) {
		unsigned int i;
		for (i = 0; i < RTE_MAX_NUMA_NODES; i++) {
			uint64_t limit = internal_config.socket_limit[i];
			if (limit == 0)
				continue;
			if (rte_mem_alloc_validator_register("socket-limit",
					limits_callback, i, limit))
				RTE_LOG(ERR, EAL, "Failed to register socket limits validator callback\n");
		}
	}
	return 0;
}
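
Since the free(pages) in the loop above is easy to misread, here is the lifetime in isolation: a toy sketch (not EAL code) showing that freeing the pointer array leaves the objects it pointed to untouched:

#include <stdlib.h>

struct seg { int do_not_free; };

static struct seg g_segs[4];        /* stands in for the memseg lists */

static void
bulk_alloc_demo(int needed)
{
	/* scratch array, same role as 'pages' above */
	struct seg **pages = malloc(sizeof(*pages) * needed);
	if (pages == NULL)
		return;
	for (int i = 0; i < needed && i < 4; i++) {
		pages[i] = &g_segs[i];      /* "allocate" a segment */
		pages[i]->do_not_free = 1;  /* like RTE_MEMSEG_FLAG_DO_NOT_FREE */
	}
	free(pages);                        /* g_segs is untouched by this */
}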

int
eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms, int n_segs, size_t page_sz,
		int socket, bool exact)
{
	int i, ret = -1;
#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
	bool have_numa = false;
	int oldpolicy;
	struct bitmask *oldmask;
#endif
	struct alloc_walk_param wa;
	struct hugepage_info *hi = NULL;

	memset(&wa, 0, sizeof(wa));

	/* dynamic allocation not supported in legacy mode */
	if (internal_config.legacy_mem)
		return -1;

	for (i = 0; i < (int) RTE_DIM(internal_config.hugepage_info); i++) {
		if (page_sz ==
				internal_config.hugepage_info[i].hugepage_sz) {
			hi = &internal_config.hugepage_info[i];
			break;
		}
	}
	if (!hi) {
		RTE_LOG(ERR, EAL, "%s(): can't find relevant hugepage_info entry\n",
			__func__);
		return -1;
	}

#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
	if (check_numa()) {
		oldmask = numa_allocate_nodemask();
		prepare_numa(&oldpolicy, oldmask, socket);
		have_numa = true;
	}
#endif

	wa.exact = exact;
	wa.hi = hi;
	/* the caller's pages array rides along in wa; alloc_seg_walk fills it */
	wa.ms = ms;
	wa.n_segs = n_segs;
	wa.page_sz = page_sz;
	wa.socket = socket;
	wa.segs_allocated = 0;

  
	/* memalloc is locked, so it's safe to use thread-unsafe version */
	/* walk every memseg list looking for a match */
	ret = rte_memseg_list_walk_thread_unsafe(alloc_seg_walk, &wa);
	if (ret == 0) {
		RTE_LOG(ERR, EAL, "%s(): couldn't find suitable memseg_list\n",
			__func__);
		ret = -1;
	} else if (ret > 0) {
		ret = (int)wa.segs_allocated;
	}

#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
	if (have_numa)
		restore_numa(&oldpolicy, oldmask);
#endif
	return ret;
}

static int
alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct alloc_walk_param *wa = arg;
	struct rte_memseg_list *cur_msl;
	size_t page_sz;
	int cur_idx, start_idx, j, dir_fd = -1;
	unsigned int msl_idx, need, i;

	if (msl->page_sz != wa->page_sz)
		return 0;
	if (msl->socket_id != wa->socket)
		return 0;

	page_sz = (size_t)msl->page_sz;

	msl_idx = msl - mcfg->memsegs;
	cur_msl = &mcfg->memsegs[msl_idx];

	need = wa->n_segs;

	/* try finding space in memseg list */
	if (wa->exact) {
		/* if we require exact number of pages in a list, find them */
		cur_idx = rte_fbarray_find_next_n_free(&cur_msl->memseg_arr, 0,
				need);
		if (cur_idx < 0)
			return 0;
		start_idx = cur_idx;
	} else {
		int cur_len;

		/* we don't require exact number of pages, so we're going to go
		 * for best-effort allocation. that means finding the biggest
		 * unused block, and going with that.
		 */
		cur_idx = rte_fbarray_find_biggest_free(&cur_msl->memseg_arr,
				0);
		if (cur_idx < 0)
			return 0;
		start_idx = cur_idx;
		/* adjust the size to possibly be smaller than original
		 * request, but do not allow it to be bigger.
		 */
		cur_len = rte_fbarray_find_contig_free(&cur_msl->memseg_arr,
				cur_idx);
		need = RTE_MIN(need, (unsigned int)cur_len);
	}

	/* do not allow any page allocations during the time we're allocating,
	 * because file creation and locking operations are not atomic,
	 * and we might be the first or the last ones to use a particular page,
	 * so we need to ensure atomicity of every operation.
	 *
	 * during init, we already hold a write lock, so don't try to take out
	 * another one.
	 */
	if (wa->hi->lock_descriptor == -1 && !internal_config.in_memory) {
		dir_fd = open(wa->hi->hugedir, O_RDONLY);
		if (dir_fd < 0) {
			RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n",
				__func__, wa->hi->hugedir, strerror(errno));
			return -1;
		}
		/* blocking writelock */
		if (flock(dir_fd, LOCK_EX)) {
			RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n",
				__func__, wa->hi->hugedir, strerror(errno));
			close(dir_fd);
			return -1;
		}
	}

	for (i = 0; i < need; i++, cur_idx++) {
		struct rte_memseg *cur;
		void *map_addr;
		/* fetch the memseg entry at this index */
		cur = rte_fbarray_get(&cur_msl->memseg_arr, cur_idx);
		map_addr = RTE_PTR_ADD(cur_msl->base_va,
				cur_idx * page_sz);
		/* map_addr advances one page per index: the start address of
		 * each rte_memseg */
		if (alloc_seg(cur, map_addr, wa->socket, wa->hi,
				msl_idx, cur_idx)) {
			RTE_LOG(DEBUG, EAL, "attempted to allocate %i segments, but only %i were allocated\n",
				need, i);

			/* if exact number wasn't requested, stop */
			if (!wa->exact)
				goto out;

			/* clean up */
			for (j = start_idx; j < cur_idx; j++) {
				struct rte_memseg *tmp;
				struct rte_fbarray *arr =
						&cur_msl->memseg_arr;

				tmp = rte_fbarray_get(arr, j);
				rte_fbarray_set_free(arr, j);

				/* free_seg may attempt to create a file, which
				 * may fail.
				 */
				if (free_seg(tmp, wa->hi, msl_idx, j))
					RTE_LOG(DEBUG, EAL, "Cannot free page\n");
			}
			/* clear the list */
			if (wa->ms)
				memset(wa->ms, 0, sizeof(*wa->ms) * wa->n_segs);

			if (dir_fd >= 0)
				close(dir_fd);
			return -1;
		}
		if (wa->ms)
			wa->ms[i] = cur;

		rte_fbarray_set_used(&cur_msl->memseg_arr, cur_idx);
	}
out:
	wa->segs_allocated = i;
	if (i > 0)
		cur_msl->version++;
	if (dir_fd >= 0)
		close(dir_fd);
	/* if we didn't allocate any segments, move on to the next list */
	return i > 0;
}

static int
alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
		struct hugepage_info *hi, unsigned int list_idx,
		unsigned int seg_idx)
{
#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
	int cur_socket_id = 0;
#endif
	uint64_t map_offset;
	rte_iova_t iova;
	void *va;
	char path[PATH_MAX];
	int ret = 0;
	int fd;
	size_t alloc_sz;
	int flags;
	void *new_addr;

	alloc_sz = hi->hugepage_sz;

	/* these are checked at init, but code analyzers don't know that */
	if (internal_config.in_memory && !anonymous_hugepages_supported) {
		RTE_LOG(ERR, EAL, "Anonymous hugepages not supported, in-memory mode cannot allocate memory\n");
		return -1;
	}
	if (internal_config.in_memory && !memfd_create_supported &&
			internal_config.single_file_segments) {
		RTE_LOG(ERR, EAL, "Single-file segments are not supported without memfd support\n");
		return -1;
	}

	/* in-memory without memfd is a special case */
	int mmap_flags;

	if (internal_config.in_memory && !memfd_create_supported) {
		const int in_memory_flags = MAP_HUGETLB | MAP_FIXED |
				MAP_PRIVATE | MAP_ANONYMOUS;
		int pagesz_flag;

		pagesz_flag = pagesz_flags(alloc_sz);
		fd = -1;
		mmap_flags = in_memory_flags | pagesz_flag;

		/* single-file segments codepath will never be active
		 * here because in-memory mode is incompatible with the
		 * fallback path, and it's stopped at EAL initialization
		 * stage.
		 */
		map_offset = 0;
	} else {
		/* takes out a read lock on segment or segment list */
		/* obtain an fd for this segment (or segment list) */
		fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx);
		if (fd < 0) {
			RTE_LOG(ERR, EAL, "Couldn't get fd on hugepage file\n");
			return -1;
		}
		if (internal_config.single_file_segments) {
			map_offset = seg_idx * alloc_sz;
			ret = resize_hugefile(fd, map_offset, alloc_sz, true);
			if (ret < 0)
				goto resized;

			fd_list[list_idx].count++;
		} else {
			map_offset = 0;
			if (ftruncate(fd, alloc_sz) < 0) {
				RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
					__func__, strerror(errno));
				goto resized;
			}
			if (internal_config.hugepage_unlink &&
					!internal_config.in_memory) {
				if (unlink(path)) {
					RTE_LOG(DEBUG, EAL, "%s(): unlink() failed: %s\n",
						__func__, strerror(errno));
					goto resized;
				}
			}
		}
		mmap_flags = MAP_SHARED | MAP_POPULATE | MAP_FIXED;
	}

	/*
	 * map the segment, and populate page tables, the kernel fills
	 * this segment with zeros if it's a new page.
	 */
	/* map the hugepage at the address pre-reserved for this segment */
	va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, mmap_flags, fd,
			map_offset);

	if (va == MAP_FAILED) {
		RTE_LOG(DEBUG, EAL, "%s(): mmap() failed: %s\n", __func__,
			strerror(errno));
		/* mmap failed, but the previous region might have been
		 * unmapped anyway. try to remap it
		 */
		goto unmapped;
	}
	if (va != addr) {
		RTE_LOG(DEBUG, EAL, "%s(): wrong mmap() address\n", __func__);
		munmap(va, alloc_sz);
		goto resized;
	}

	/* In linux, hugetlb limitations, like cgroup, are
	 * enforced at fault time instead of mmap(), even
	 * with the option of MAP_POPULATE. Kernel will send
	 * a SIGBUS signal. To avoid to be killed, save stack
	 * environment here, if SIGBUS happens, we can jump
	 * back here.
	 */
	if (huge_wrap_sigsetjmp()) {
		RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more hugepages of size %uMB\n",
			(unsigned int)(alloc_sz >> 20));
		goto mapped;
	}

	/* we need to trigger a write to the page to enforce page fault and
	 * ensure that page is accessible to us, but we can't overwrite value
	 * that is already there, so read the old value, and write it back.
	 * kernel populates the page with zeroes initially.
	 */
	*(volatile int *)addr = *(volatile int *)addr;

	/* resolve the IOVA address (covered in a later article) */
	iova = rte_mem_virt2iova(addr);
	if (iova == RTE_BAD_PHYS_ADDR) {
		RTE_LOG(DEBUG, EAL, "%s(): can't get IOVA addr\n",
			__func__);
		goto mapped;
	}

#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
	ret = get_mempolicy(&cur_socket_id, NULL, 0, addr,
			    MPOL_F_NODE | MPOL_F_ADDR);
	if (ret < 0) {
		RTE_LOG(DEBUG, EAL, "%s(): get_mempolicy: %s\n",
			__func__, strerror(errno));
		goto mapped;
	} else if (cur_socket_id != socket_id) {
		RTE_LOG(DEBUG, EAL,
				"%s(): allocation happened on wrong socket (wanted %d, got %d)\n",
			__func__, socket_id, cur_socket_id);
		goto mapped;
	}
#else
	if (rte_socket_count() > 1)
		RTE_LOG(DEBUG, EAL, "%s(): not checking hugepage NUMA node.\n",
				__func__);
#endif

	/* record the mapping's details in the memseg */
	ms->addr = addr;
	ms->hugepage_sz = alloc_sz;
	ms->len = alloc_sz;
	ms->nchannel = rte_memory_get_nchannel();
	ms->nrank = rte_memory_get_nrank();
	ms->iova = iova;
	ms->socket_id = socket_id;

	return 0;

mapped:
	munmap(addr, alloc_sz);
unmapped:
	flags = MAP_FIXED;
	new_addr = eal_get_virtual_area(addr, &alloc_sz, alloc_sz, 0, flags);
	if (new_addr != addr) {
		if (new_addr != NULL)
			munmap(new_addr, alloc_sz);
		/* we're leaving a hole in our virtual address space. if
		 * somebody else maps this hole now, we could accidentally
		 * override it in the future.
		 */
		RTE_LOG(CRIT, EAL, "Can't mmap holes in our virtual address space\n");
	}
	/* roll back the ref count */
	if (internal_config.single_file_segments)
		fd_list[list_idx].count--;
resized:
	/* some codepaths will return negative fd, so exit early */
	if (fd < 0)
		return -1;

	if (internal_config.single_file_segments) {
		resize_hugefile(fd, map_offset, alloc_sz, false);
		/* ignore failure, can't make it any worse */

		/* if refcount is at zero, close the file */
		if (fd_list[list_idx].count == 0)
			close_hugefile(fd, path, list_idx);
	} else {
		/* only remove file if we can take out a write lock */
		if (internal_config.hugepage_unlink == 0 &&
				internal_config.in_memory == 0 &&
				lock(fd, LOCK_EX) == 1)
			unlink(path);
		close(fd);
		fd_list[list_idx].fds[seg_idx] = -1;
	}
	return -1;
}
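
Stripped of error paths and the single-file-segments branch, the per-segment mapping above is an ftruncate of a hugetlbfs file plus a MAP_FIXED mmap at the pre-reserved address. A minimal sketch under those assumptions (the path is illustrative; the real code goes through get_seg_fd):

#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

/* sketch: back one segment at a pre-reserved address with a hugepage file */
static int
map_one_seg(void *addr, const char *path, size_t sz)
{
	int fd = open(path, O_CREAT | O_RDWR, 0600);
	if (fd < 0)
		return -1;
	if (ftruncate(fd, sz) < 0) {
		close(fd);
		return -1;
	}
	void *va = mmap(addr, sz, PROT_READ | PROT_WRITE,
			MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd, 0);
	if (va == MAP_FAILED) {
		close(fd);
		return -1;
	}
	*(volatile int *)va = *(volatile int *)va; /* force the page fault */
	return fd; /* kept open until the segment is freed, as in alloc_seg */
}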

2.3 Summary

(1) rte_eal_memseg_init divides memory into segment lists and decides how many pages each list holds; for every list it reserves a block of virtual memory and maps the fd of the corresponding fbarray file under /var/run/dpdk/pg1/ onto it.

(2) eal_memalloc_init sets up the fd bookkeeping (fd_list) for each memseg list.

(3) The interesting part of rte_eal_hugepage_init is that it allocates an array of rte_memseg pointers for each batch of segments and then frees it: the free only releases the outer pointer array, while the hugepage mappings the entries pointed to stay in place. Allocating the pages up front this way verifies that the requested memory can actually be obtained.
