DPDK18.11.11内存初始化流程总结

前言

本篇主要是对DPDK的EAL(Environment Abstraction Layer)中内存的初始化流程进行总结,由于DPDK支持多进程应用,此篇总结主要针对primary process主进程流程进行跟踪总结,先了解下主次进程概念,如下:

1,在DPDK中,初始化由primary process完成。而其他process统称为secondary process,其可以通过读取一些文件来获取primary process的初始化信息,从而使得自身与primary process保持相同的内存映像。

2, DPDK采用了一种集中式控制的方式,比如在多进程的场景中,若一个secondary process要申请内存,则向primary process发起请求,由primary process完成相应操作后再通知secondary process。

一、初始化相关的代码调用流程
从lib/librte_eal/linuxapp/eal/eal.c中的函数int rte_eal_init(int argc, char **argv)开始,内存的初始化调用栈依次为:
int rte_eal_init()
----eal_reset_internal_config()
----rte_config_init()
----eal_hugepage_info_init()
----rte_eal_memzone_init()
----rte_eal_memory_init()
----rte_eal_malloc_heap_init()
下面依次对这几个方面进行解析:

/*
 * Excerpt of rte_eal_init() reduced to the memory-related calls. Lines made
 * of dots mark code that the excerpt leaves out.
 */
int
rte_eal_init(int argc, char **argv){
	······
	/* put the global internal_config back to its default values */
	eal_reset_internal_config(&internal_config);
	
	rte_config_init();
	if (internal_config.no_hugetlbfs == 0) {
		/* rte_config isn't initialized yet */
		/* a primary process initializes the hugepage info, any other
		 * process type reads it instead */
		ret = internal_config.process_type == RTE_PROC_PRIMARY ?
				eal_hugepage_info_init() :
				eal_hugepage_info_read();
		······
	}
	······
	/* each of the three steps below reports failure with a negative value */
	if (rte_eal_memzone_init() < 0) { ······ }
	if (rte_eal_memory_init() < 0) { ······ }
    if (rte_eal_malloc_heap_init() < 0) { ······  }
}

1、eal_reset_internal_config()初始化全局变量internal_config;

结构体主要成员定义如下:

struct internal_config { // global DPDK configuration (excerpt; dotted lines mark omitted fields)
	volatile size_t memory;           /**< amount of asked memory */ 	
	·······
	volatile unsigned no_hugetlbfs;   /**< true to disable hugetlbfs */  
	// rte_eal_init() only sets up hugepage info while this is 0
	
	unsigned hugepage_unlink;         /**< true to unlink backing files */ 
	// presumably each hugepage is backed by one file during memalloc — confirm
	·······
	volatile unsigned no_shconf;      /**< true if there is no shared config */ 
	// presumably the primary process then writes no init info to files — confirm
	
	volatile enum rte_proc_type_t process_type; /**< multi-process proc type */  
	// distinguishes the primary process from secondary processes
	
	/** true to try allocating memory on specific sockets */
	volatile unsigned force_sockets;
	volatile uint64_t socket_mem[RTE_MAX_NUMA_NODES]; /**< amount of memory per socket */  
	
	volatile unsigned force_socket_limits; // whether the per-socket limits below are enforced
	volatile uint64_t socket_limit[RTE_MAX_NUMA_NODES]; /**< limit amount of memory per socket */  
	
	uintptr_t base_virtaddr;          /**< base address to try and reserve memory from */
	
	volatile unsigned legacy_mem;
	// selects legacy memory mode as opposed to dynamic memory mode
	
	volatile unsigned single_file_segments; 
	/**< true if storing all pages within single files (per-page-size, per-node) non-legacy mode only. */
	// selects single-file-segments mode as opposed to page-per-file mode
	 
	unsigned num_hugepage_sizes;      /**< how many sizes on this system */
	// number of distinct hugepage sizes found (e.g. 2M, 1G)
	
	struct hugepage_info hugepage_info[MAX_HUGEPAGE_SIZES];
	// one entry of hugepage information per page size
};

对应的初始化函数把主要成员给初始值:

/*
 * Puts the fields of *internal_cfg back to their default values.
 * Excerpt: the dotted line marks assignments that are left out here.
 * NOTE(review): the return type is missing in this excerpt — confirm against
 * the upstream source.
 */
eal_reset_internal_config(struct internal_config *internal_cfg)
{
	int i;

	internal_cfg->memory = 0;
	internal_cfg->force_nrank = 0;
	internal_cfg->force_nchannel = 0;
	internal_cfg->hugefile_prefix = NULL;
	internal_cfg->hugepage_dir = NULL;
	............
	internal_cfg->create_uio_dev = 0;
	internal_cfg->iova_mode = RTE_IOVA_DC;
	internal_cfg->user_mbuf_pool_ops_name = NULL;
	/* start from an empty control-thread CPU set */
	CPU_ZERO(&internal_cfg->ctrl_cpuset);
	internal_cfg->init_complete = 0;
}

GDB看到的初始化值内容:

(gdb) p internal_config
$1 = {memory = 0, force_nchannel = 0, force_nrank = 0, no_hugetlbfs = 0, hugepage_unlink = 0, no_pci = 0,
no_hpet = 1, vmware_tsc_map = 0, no_shconf = 0, in_memory = 0, create_uio_dev = 0,
process_type = RTE_PROC_PRIMARY, force_sockets = 0, socket_mem = {0, 0, 0, 0, 0, 0, 0, 0},
force_socket_limits = 0, socket_limit = {0, 0, 0, 0, 0, 0, 0, 0}, base_virtaddr = 0, legacy_mem = 0,
single_file_segments = 0, syslog_facility = 24, vfio_intr_mode = RTE_INTR_MODE_NONE, hugefile_prefix = 0x0,
hugepage_dir = 0x0, user_mbuf_pool_ops_name = 0x0, num_hugepage_sizes = 0, hugepage_info = {{
hugepage_sz = 0, hugedir = '\000' <repeats 4095 times>, num_pages = {0, 0, 0, 0, 0, 0, 0, 0},
lock_descriptor = -1}, {hugepage_sz = 0, hugedir = '\000' <repeats 4095 times>, num_pages = {0, 0, 0,
0, 0, 0, 0, 0}, lock_descriptor = -1}, {hugepage_sz = 0, hugedir = '\000' <repeats 4095 times>,
num_pages = {0, 0, 0, 0, 0, 0, 0, 0}, lock_descriptor = -1}, {hugepage_sz = 0,
hugedir = '\000' <repeats 4095 times>, num_pages = {0, 0, 0, 0, 0, 0, 0, 0}, lock_descriptor = -1}},
iova_mode = RTE_IOVA_DC, ctrl_cpuset = {__bits = {1, 0 <repeats 15 times>}}, init_complete = 0}
(gdb)
(gdb) p internal_config.process_type
$2 = RTE_PROC_PRIMARY

也就是后续初始化按照主流程RTE_PROC_PRIMARY进行初始化内存;

2、rte_config_init() :初始化内存配置

涉及结构体:

struct rte_config { // run-time environment configuration (excerpt)
	······
	/** PA or VA mapping mode */
	enum rte_iova_mode iova_mode; 
	// i.e. whether IO addresses are virtual addresses (VA) or physical addresses (PA)

	/**
	 * Pointer to memory configuration, which may be shared across multiple
	 * DPDK instances
	 */
	struct rte_mem_config *mem_config;
    // the pointed-to area describes the memory layout of one DPDK instance;
	// memory initialization mostly consists of filling in struct rte_mem_config
} __attribute__((__packed__));

/* Memory configuration reached through rte_config.mem_config (excerpt). */
struct rte_mem_config {
	volatile uint32_t magic;   /**< Magic number - Sanity check. */
	/* memory topology */
	uint32_t nchannel;    /**< Number of channels (0 if unknown). */
	uint32_t nrank;       /**< Number of ranks (0 if unknown). */
	······
	/* memory segments and zones */
	struct rte_fbarray memzones; /**< Memzone descriptors. */
	
	// each struct rte_memseg_list carries a socket id and a page size;
	// several lists in memsegs may share the same <socket id, page size> pair
	struct rte_memseg_list memsegs[RTE_MAX_MEMSEG_LISTS];
	/**< list of dynamic arrays holding memsegs */
	······
	/* Heaps of Malloc */
	struct malloc_heap malloc_heaps[RTE_MAX_HEAPS];
    ······

	uint64_t mem_cfg_addr; // holds the same address as rte_config.mem_config

	/* legacy mem and single file segments options are shared */
	uint32_t legacy_mem;	
	// legacy memory mode as opposed to dynamic memory mode
	
	uint32_t single_file_segments; 
	// single-file-segments mode as opposed to page-per-file mode
    ······
} __attribute__((__packed__));

涉及的代码:

/*
 * Copy the process type from internal_config into rte_config and, when this
 * is the primary process, create the shared memory configuration.
 */
static void
rte_config_init(void)
{
	rte_config.process_type = internal_config.process_type;

	/* only the primary-process path is handled here */
	if (rte_config.process_type == RTE_PROC_PRIMARY)
		rte_eal_config_create();
}

这个函数主要是为struct rte_config中的struct rte_mem_config *mem_config(简称mcfg)申请一块内存空间,并且在运行时目录下创建一个名字为config的文件,并且将mcfg的内容写进此文件。这样,secondary process在初始化时就能通过读取config文件来创建和primary process一样的内存映像。

/* create memory configuration in shared/mmap memory. Take out
 * a write lock on the memsegs, so we can auto-detect primary/secondary.
 * This means we never close the file while running (auto-close on exit).
 * We also don't lock the whole file, so that in future we can use read-locks
 * on other parts, e.g. memzones, to detect if there are running secondary
 * processes.
 *
 * Excerpt: the two dotted lines in the body mark code that is left out here.
 */

static void
rte_eal_config_create(void)
{
	void *rte_mem_cfg_addr;
	int retval;

	const char *pathname = eal_runtime_config_path();

	/* map the config before hugepage address so that we don't waste a page */
	if (internal_config.base_virtaddr != 0)
		rte_mem_cfg_addr = (void *)
			RTE_ALIGN_FLOOR(internal_config.base_virtaddr -
			sizeof(struct rte_mem_config), sysconf(_SC_PAGE_SIZE));
	else
		rte_mem_cfg_addr = NULL;

	/* open (creating if necessary) the config file unless an fd already exists */
	if (mem_cfg_fd < 0){
		mem_cfg_fd = open(pathname, O_RDWR | O_CREAT, 0600);
		if (mem_cfg_fd < 0)
			rte_panic("Cannot open '%s' for rte_mem_config\n", pathname);
	}
	。。。。。。
	。。。。。。
	/* NOTE(review): the address hint computed above is not passed to this
	 * mmap() call (its first argument is NULL) in the visible code — confirm
	 * against the upstream source whether the hint is meant to be used. */
	rte_mem_cfg_addr = mmap(NULL, sizeof(*rte_config.mem_config),
				PROT_READ | PROT_WRITE, MAP_SHARED, mem_cfg_fd, 0);

	if (rte_mem_cfg_addr == MAP_FAILED){
		rte_panic("Cannot mmap memory for rte_config\n");
	}
	/* seed the file-backed mapping from early_mem_config, then publish it */
	memcpy(rte_mem_cfg_addr, &early_mem_config, sizeof(early_mem_config));
	rte_config.mem_config = rte_mem_cfg_addr;
}

在这里采用了mmap()的方式将config文件和mcfg进行了映射,所以在后面的初始化操作中,一旦对mcfg进行了写操作,也能够立刻反映到映射了同一个config文件的其他进程中(类似于使用共享内存通信);

# ls /var/run/dpdk/rte/config 
/var/run/dpdk/rte/config

gdb看到的数据如下:

(gdb) p rte_config 
$2 = {master_lcore = 1, lcore_count = 3, numa_node_count = 1, numa_nodes = {0, 0, 0, 0, 0, 0, 0, 0}, 
  service_lcore_count = 0, lcore_role = {ROLE_OFF, ROLE_RTE, ROLE_RTE, ROLE_RTE, 
    ROLE_OFF <repeats 252 times>}, process_type = RTE_PROC_PRIMARY, iova_mode = RTE_IOVA_DC, 
  mem_config = 0x7fb4a0e000}
  
(gdb) p rte_config.mem_config 
$3 = (struct rte_mem_config *) 0x7fb4a0e000

(gdb) p /x rte_config.mem_config.mem_cfg_addr 
$7 = 0x7fb4a0e000

3、eal_hugepage_info_init() : 读取系统中的hugepage的信息。

涉及结构体:

/*
 * internal configuration structure for the number, size and
 * mount points of hugepages
 */
struct hugepage_info {
	uint64_t hugepage_sz;   /**< size of a huge page */
	
	char hugedir[PATH_MAX];    /**< dir where hugetlbfs is mounted */
	
	uint32_t num_pages[RTE_MAX_NUMA_NODES];
	/**< number of hugepages of that size on each socket */
	
	int lock_descriptor;    /**< file descriptor for hugepage dir */
	// obtained by open()ing hugedir in hugepage_info_init()
};

涉及代码块:

/*
 * Scans the directory sys_dir_path and fills one internal_config.hugepage_info
 * entry per directory entry that has a hugetlbfs mount point, then sorts the
 * entries by page size.
 *
 * Returns -1 when the directory cannot be opened.
 * Excerpt: dotted lines mark code that is left out here, including the
 * declarations of num_sizes and dirent_start_len.
 * NOTE(review): no return statement is visible on the success path although
 * the function returns int — confirm against the upstream source.
 */
static int
hugepage_info_init(void)
{	
	DIR *dir;
	struct dirent *dirent;

	dir = opendir(sys_dir_path);
	if (dir == NULL) {
		RTE_LOG(ERR, EAL,
			"Cannot open directory %s to read system hugepage info\n",
			sys_dir_path);
		return -1;
	}

	for (dirent = readdir(dir); dirent != NULL; dirent = readdir(dir)) {
		struct hugepage_info *hpi;
		。。。
		hpi = &internal_config.hugepage_info[num_sizes];
		/* the page size is parsed from the tail of the directory entry name */
		hpi->hugepage_sz =
			rte_str_to_size(&dirent->d_name[dirent_start_len]);

		/* first, check if we have a mountpoint */
		if (get_hugepage_dir(hpi->hugepage_sz,
			hpi->hugedir, sizeof(hpi->hugedir)) < 0) {
			uint32_t num_pages;

			/* no mount point for this size: the entry is skipped
			 * (num_sizes is not advanced) */
			num_pages = get_num_hugepages(dirent->d_name);
			if (num_pages > 0)
			......
			continue;
		}

		/* try to obtain a writelock */
		hpi->lock_descriptor = open(hpi->hugedir, O_RDONLY);

		/* if blocking lock failed */
		/* NOTE(review): the failure branch is empty in this excerpt —
		 * presumably its handling was left out */
		if (flock(hpi->lock_descriptor, LOCK_EX) == -1) {
		}

		calc_num_pages(hpi, dirent);

		num_sizes++;
	}
	closedir(dir);

	internal_config.num_hugepage_sizes = num_sizes;

	/* sort the page directory entries by size, largest to smallest */
	qsort(&internal_config.hugepage_info[0], num_sizes,
	      sizeof(internal_config.hugepage_info[0]), compare_hpi);
}

在linux系统中,会打开系统目录/sys/kernel/mm/hugepages,遍历其中的每一个目录项,获取系统支持的hugepage size。然后根据hugepage size从/proc/mounts中获取对应的挂载点(mount point),再计算每一种hugepage在不同socket上的free page数量。每一种大页的相关信息存放在internal_config.hugepage_info中。最后会在runtime dir下创建一个名字为hugepage_info的文件,将internal_config.hugepage_info写入到该文件。

gdb下看到的信息:

(gdb) p internal_config.hugepage_info 
$9 = {hugepage_sz = 2097152, hugedir = "/mnt/hugetlbfs", '\000' <repeats 4081 times>, num_pages = {6667, 0, 0, 0, 0, 0, 0, 0}, lock_descriptor = 10}

(gdb) p 2097152/1024/1024
$12 = 2

# ls /sys/kernel/mm/hugepages/
hugepages-2048kB

# cat /proc/mounts | grep hugetlbfs
none /mnt/hugetlbfs hugetlbfs rw,relatime 0 0

# cat /proc/meminfo | grep Huge
AnonHugePages:     88064 kB
HugePages_Total:    6667
HugePages_Free:     6667
HugePages_Rsvd:        0
HugePages_Surp:        0
Hugepagesize:       2048 kB

# ls  /var/run/dpdk/rte/
config         hugepage_info  mp_socket

# cat  /var/run/dpdk/rte/hugepage_info 
 /mnt/hugetlbfs

4、rte_eal_memzone_init()初始化内存域

涉及的结构体:

/*
 * Descriptor of one memory zone; rte_eal_memzone_init() sizes the elements of
 * mcfg->memzones with sizeof(struct rte_memzone).
 */
struct rte_memzone {

#define RTE_MEMZONE_NAMESIZE 32       /**< Maximum length of memory zone name.*/
	char name[RTE_MEMZONE_NAMESIZE];  /**< Name of the memory zone. */
	size_t len;                       /**< Length of the memzone. */
	uint64_t hugepage_sz;             /**< The page size of underlying memory */
	int32_t socket_id;                /**< NUMA socket ID. */
	uint32_t flags;                   /**< Characteristics of this memzone. */
} __attribute__((__packed__));

/*
 * Array of fixed-size elements; used in this file for mcfg->memzones and for
 * the memseg_arr member of struct rte_memseg_list.
 */
struct rte_fbarray {
	char name[RTE_FBARRAY_NAME_LEN]; /**< name associated with an array */
	unsigned int count;              /**< number of entries stored */
	unsigned int len;                /**< current length of the array */
	unsigned int elt_sz;             /**< size of each element */
	void *data;                      /**< data pointer */
	rte_rwlock_t rwlock;             /**< multiprocess lock */
};

涉及的代码块:

/*
 * Set up the memzone descriptor array kept in the shared memory config.
 *
 * A primary process creates the array (RTE_MAX_MEMZONE elements of
 * struct rte_memzone, named "memzone"); a secondary process attaches to it.
 *
 * Returns 0 on success and -1 when creating or attaching the array fails, so
 * that the "< 0" check in rte_eal_init() sees a defined value.
 */
int
rte_eal_memzone_init(void)
{
	struct rte_mem_config *mcfg;
	int ret = 0;

	/* get pointer to global configuration */
	mcfg = rte_eal_get_configuration()->mem_config;

	if (rte_eal_process_type() == RTE_PROC_PRIMARY &&
			rte_fbarray_init(&mcfg->memzones, "memzone",
			RTE_MAX_MEMZONE, sizeof(struct rte_memzone))) {
		/* a non-zero result from rte_fbarray_init() is a failure */
		ret = -1;
	} else if (rte_eal_process_type() == RTE_PROC_SECONDARY &&
			rte_fbarray_attach(&mcfg->memzones)) {
		/* a non-zero result from rte_fbarray_attach() is a failure */
		ret = -1;
	}

	return ret;
}

初始化mcfg->memzones:申请一块内存空间,用于保存以后内存分配时使用到的struct rte_memzone;后面memzone所使用的内存空间也是从rte_heap中分配的;

GDB下看到的信息:

(gdb) p rte_config.mem_config.memzones 
$17 = {name = "memzone", '\000' <repeats 56 times>, count = 0, len = 2560, elt_sz = 72, data = 0x100000000, 
  rwlock = {cnt = 0}}

5、rte_eal_memory_init() : 内存初始化过程的核心
先后调用了:
----memseg_primary_init()
----eal_memalloc_init()
----rte_eal_hugepage_init()
----rte_eal_memdevice_init().

1) memseg_primary_init() 初始化memsegs list

涉及的结构体:

/* One list of memory segments that share a page size and a socket (excerpt). */
struct rte_memseg_list {
	RTE_STD_C11
	union {
		void *base_va;
		/**< Base virtual address for this memseg list. */
		uint64_t addr_64;
		/**< Makes sure addr is always 64-bits */
	};
	
	uint64_t page_sz; /**< Page size for all memsegs in this list. */
	int socket_id; /**< Socket ID for all memsegs in this list. */
	······
	size_t len; /**< Length of memory area covered by this memseg list. */
	// i.e. the number of bytes of the area that starts at base_va
	······
	struct rte_fbarray memseg_arr; 
	// array of struct rte_memseg entries describing the area at base_va
};

涉及的代码块:

/* limit number of segment lists according to our maximum */
n_seglists = RTE_MIN(n_seglists, max_seglists_per_type);

/* create all segment lists */
for (cur_seglist = 0; cur_seglist < n_seglists; cur_seglist++) {
	/* bail out once msl_idx has run past the last slot of mcfg->memsegs;
	 * indexing the array in that state would be out of bounds */
	if (msl_idx >= RTE_MAX_MEMSEG_LISTS)
		goto out;

	/* claim the next slot and advance the index */
	msl = &mcfg->memsegs[msl_idx++];

	if (alloc_memseg_list(msl, pagesz, n_segs,
			socket_id, cur_seglist))
		goto out;
}

确定每一种类型(由socket id和page sz确定)的struct rte_memseg_list的数量,及其所包含的mem segment的数量。然后,根据确定的数量为mcfg->memsegs中的struct rte_memseg_list分配虚拟内存空间。

GDB下看到的信息:

(gdb) p rte_config.mem_config.memsegs[0]
$1 = {{base_va = 0x100200000, addr_64 = 4297064448}, page_sz = 2097152, socket_id = 0, version = 0, 
  len = 17179869184, external = 0, memseg_arr = {name = "memseg-2048k-0-0", '\000' <repeats 47 times>, 
    count = 0, len = 8192, elt_sz = 48, data = 0x10002e000, rwlock = {cnt = 0}}}

(gdb) p rte_config.mem_config.memsegs[1]
$2 = {{base_va = 0x500400000, addr_64 = 21479030784}, page_sz = 2097152, socket_id = 0, version = 0, 
  len = 17179869184, external = 0, memseg_arr = {name = "memseg-2048k-0-1", '\000' <repeats 47 times>, 
    count = 0, len = 8192, elt_sz = 48, data = 0x500200000, rwlock = {cnt = 0}}}
    
(gdb) p rte_config.mem_config.memsegs[2]
$4 = {{base_va = 0x900600000, addr_64 = 38660997120}, page_sz = 2097152, socket_id = 0, version = 0, 
  len = 17179869184, external = 0, memseg_arr = {name = "memseg-2048k-0-2", '\000' <repeats 47 times>, 
    count = 0, len = 8192, elt_sz = 48, data = 0x900400000, rwlock = {cnt = 0}}}
    
(gdb) p rte_config.mem_config.memsegs[3]
$5 = {{base_va = 0xd00800000, addr_64 = 55842963456}, page_sz = 2097152, socket_id = 0, version = 0, 
  len = 17179869184, external = 0, memseg_arr = {name = "memseg-2048k-0-3", '\000' <repeats 47 times>, 
    count = 0, len = 8192, elt_sz = 48, data = 0xd00600000, rwlock = {cnt = 0}}}

2)eal_memalloc_init() :初始化memseg list 的fd

涉及的结构体:

/*
 * File-descriptor bookkeeping, one entry per memseg list; entries are filled
 * in by alloc_list().
 */
static struct {
	int *fds; /**< dynamically allocated array of segment lock fd's */
	int memseg_list_fd; /**< memseg list fd */
	int len; /**< total length of the array */
	int count; /**< entries used in an array */
} fd_list[RTE_MAX_MEMSEG_LISTS];

涉及的代码块:

/* Defined further down in this file; declared here because it is called first. */
static int alloc_list(int list_idx, int len);

/*
 * Memseg-list walk callback that prepares the fd_list entry for one list.
 *
 * msl must point into mcfg->memsegs: the fd_list index is derived from its
 * position in that array. arg is unused.
 *
 * Returns 0 without doing anything for external lists, otherwise the result
 * of alloc_list() (0 on success, -1 on allocation failure).
 */
static int
fd_list_create_walk(const struct rte_memseg_list *msl,
		void *arg __rte_unused)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	unsigned int len;
	int msl_idx;

	if (msl->external)
		return 0;

	/* position of this list inside the shared memsegs array */
	msl_idx = msl - mcfg->memsegs;
	/* one fd slot per possible segment of the list */
	len = msl->memseg_arr.len;

	return alloc_list(msl_idx, len);
}

/*
 * Prepare fd_list[list_idx] to track len file descriptors.
 *
 * Allocates an array of len ints with every slot set to -1, stores it in the
 * entry and resets the entry's count and memseg_list_fd.
 *
 * Returns 0 on success, -1 if the array cannot be allocated.
 */
static int
alloc_list(int list_idx, int len)
{
	int *fds;
	int pos;

	/* one slot per possible segment */
	fds = malloc(sizeof(int) * len);
	if (fds == NULL) {
		RTE_LOG(ERR, EAL, "Unable to allocate space for file descriptors\n");
		return -1;
	}

	/* -1 marks a slot that holds no descriptor */
	pos = 0;
	while (pos < len) {
		fds[pos] = -1;
		pos++;
	}

	fd_list[list_idx].memseg_list_fd = -1;
	fd_list[list_idx].count = 0;
	fd_list[list_idx].len = len;
	fd_list[list_idx].fds = fds;

	return 0;
}

如果是single-file-segments mode, 则对于一个rte_memseg_list,只使用一个file descriptor(fd_list中的memseg_list_fd);
如果是page-per-file mode, 则对于一个rte_memseg_list中的每一个mem segment, 都会使用一个file descriptor(fd_list中的fds)。

GDB下看到的信息:

(gdb) p fd_list[0]
$19 = {fds = 0x7453fa0, memseg_list_fd = -1, len = 8192, count = 0}
(gdb) p fd_list[1]
$20 = {fds = 0x745bfb0, memseg_list_fd = -1, len = 8192, count = 0}
(gdb) p fd_list[2]
$21 = {fds = 0x7463fc0, memseg_list_fd = -1, len = 8192, count = 0}
(gdb) p fd_list[3]
$22 = {fds = 0x746bfd0, memseg_list_fd = -1, len = 8192, count = 0}

3)rte_eal_hugepage_init()初始化大页内存

涉及的结构体:

struct rte_memseg { // per the article, one rte_memseg corresponds to one hugepage
	RTE_STD_C11
	union {
		phys_addr_t phys_addr;  /**< deprecated - Start physical address. */
		rte_iova_t iova;        /**< Start IO address. */
	};
	RTE_STD_C11
	union {
		void *addr;             /**< Start virtual address. */
		uint64_t addr_64;       /**< Makes sure addr is always 64 bits */
	};
	size_t len;                 /**< Length of the segment. */
	uint64_t hugepage_sz;       /**< The pagesize of underlying memory */
	int32_t socket_id;          /**< NUMA socket ID. */
	uint32_t nchannel;          /**< Number of channels. */
	uint32_t nrank;             /**< Number of ranks. */
	uint32_t flags;             /**< Memseg-specific flags */
} __rte_packed;

/* Bookkeeping for one hugepage and the file that backs it. */
struct hugepage_file {
	void *orig_va;      /**< virtual addr of first mmap() */
	void *final_va;     /**< virtual addr of 2nd mmap() */
	uint64_t physaddr;  /**< physical addr */
	size_t size;        /**< the page size */
	int socket_id;      /**< NUMA socket ID */
	int file_id;        /**< the '%d' in HUGEFILE_FMT */  	
	// i.e. this is hugepage number file_id among the pages of this size
	
	char filepath[MAX_HUGEPAGE_PATH]; /**< path to backing file on filesystem */
};

涉及的legacy代码块:
如果是legacy mem, 则调用eal_legacy_hugepage_init()

        /* create a memseg list */
		msl = &mcfg->memsegs[0];

		page_sz = RTE_PGSIZE_4K;
		n_segs = internal_config.memory / page_sz;
		
		addr = mmap(NULL, internal_config.memory, PROT_READ | PROT_WRITE,
				MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		msl->base_va = addr;
		msl->page_sz = page_sz;
		msl->socket_id = 0;
		msl->len = internal_config.memory;

		/* populate memsegs. each memseg is one page long */
		for (cur_seg = 0; cur_seg < n_segs; cur_seg++) {
			arr = &msl->memseg_arr;

			ms = rte_fbarray_get(arr, cur_seg);
			if (rte_eal_iova_mode() == RTE_IOVA_VA)
				ms->iova = (uintptr_t)addr;
			else
				ms->iova = RTE_BAD_IOVA;
			ms->addr = addr;
			ms->hugepage_sz = page_sz;
			ms->socket_id = 0;
			ms->len = page_sz;

			rte_fbarray_set_used(arr, cur_seg);

			addr = RTE_PTR_ADD(addr, (size_t)page_sz);
		}

a、根据internal_config.hugepage_info初始化hugepage_file,并将这些hugepage_file映射到<socket id, pagesz>对应的rte_memseg_list中的rte_memseg;

	/* map all hugepages and sort them */
	for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
		unsigned pages_old, pages_new;
		struct hugepage_info *hpi;

		/*
		 * we don't yet mark hugepages as used at this stage, so
		 * we just map all hugepages available to the system
		 * all hugepages are still located on socket 0
		 */
		hpi = &internal_config.hugepage_info[i];

		/* map all hugepages available */
		pages_old = hpi->num_pages[0];
		pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, memory);

		if (phys_addrs_available &&
				rte_eal_iova_mode() != RTE_IOVA_VA) {
			/* find physical addresses for each hugepage */
			if (find_physaddrs(&tmp_hp[hp_offset], hpi) < 0) {
				RTE_LOG(DEBUG, EAL, "Failed to find phys addr "
					"for %u MB pages\n",
					(unsigned int)(hpi->hugepage_sz / 0x100000));
				goto fail;
			}
		} else {
			/* set physical addresses for each hugepage */
			if (set_physaddrs(&tmp_hp[hp_offset], hpi) < 0) {
				RTE_LOG(DEBUG, EAL, "Failed to set phys addr "
					"for %u MB pages\n",
					(unsigned int)(hpi->hugepage_sz / 0x100000));
				goto fail;
			}
		}

		if (find_numasocket(&tmp_hp[hp_offset], hpi) < 0){
			RTE_LOG(DEBUG, EAL, "Failed to find NUMA socket for %u MB pages\n",
					(unsigned)(hpi->hugepage_sz / 0x100000));
			goto fail;
		}

		qsort(&tmp_hp[hp_offset], hpi->num_pages[0],
		      sizeof(struct hugepage_file), cmp_physaddr);

		/* we have processed a num of hugepages of this size, so inc offset */
		hp_offset += hpi->num_pages[0];
	}

b、然后qsort对hugepage_file排序(使得按照页的size降序排序,同一种size按照物理地址升序排序);

c、find_numasocket用于确定每个hugepage所在的NUMA socket(在上面的循环中先于qsort调用);之后calc_num_pages_per_socket再根据internal_config.socket_mem计算hugepage在不同socket的分布;

d、之后remap_needed_hugepages循环调用remap_segment对所有的hugepage_file进行重映射,使得虚拟内存连续的mem segments在物理内存上也是连续的,并且同一个rte_memseg_list中所有mem segment的虚拟地址和物理地址都是单调递增的。
e、接着,设置fd_list中对应的file descriptor。
这个方法会将hugepage_file写入到hugepage_data文件。

在实现的过程中采用read-ahead,目的是为了保证虚拟内存连续的mem segments在物理内存上也是连续的,同时也能够提前载入物理页,提高系统的性能。而对于nohugepage的情况,将其视为legacy, single-file mode,采用的页的大小为4K。

如果是dynamic mem, 则调用eal_hugepage_init()

涉及代码块:

	for (hp_sz_idx = 0;
			hp_sz_idx < (int)internal_config.num_hugepage_sizes;
			hp_sz_idx++) {
		for (socket_id = 0; socket_id < RTE_MAX_NUMA_NODES;
				socket_id++) {
			struct rte_memseg **pages;
			struct hugepage_info *hpi = &used_hp[hp_sz_idx];
			unsigned int num_pages = hpi->num_pages[socket_id];
			int num_pages_alloc, i;
			pages = malloc(sizeof(*pages) * num_pages);
			num_pages_alloc = eal_memalloc_alloc_seg_bulk(pages,
					num_pages, hpi->hugepage_sz,
					socket_id, true);
			if (num_pages_alloc < 0) {
				free(pages);
				return -1;
			}
			
	/* memalloc is locked, so it's safe to use thread-unsafe version */
	ret = rte_memseg_list_walk_thread_unsafe(alloc_seg_walk, &wa);

	page_sz = (size_t)msl->page_sz;

	msl_idx = msl - mcfg->memsegs;
	cur_msl = &mcfg->memsegs[msl_idx];

	need = wa->n_segs;

	/* try finding space in memseg list */
	cur_idx = rte_fbarray_find_next_n_free(&cur_msl->memseg_arr, 0, need);
	for (i = 0; i < need; i++, cur_idx++) {
		struct rte_memseg *cur;
		void *map_addr;

		cur = rte_fbarray_get(&cur_msl->memseg_arr, cur_idx);
		map_addr = RTE_PTR_ADD(cur_msl->base_va,
				cur_idx * page_sz);

	if (alloc_seg(cur, map_addr, wa->socket, wa->hi)
	
	ms->addr = addr;
	ms->hugepage_sz = alloc_sz;
	ms->len = alloc_sz;
	ms->nchannel = rte_memory_get_nchannel();
	ms->nrank = rte_memory_get_nrank();
	ms->iova = iova;
	ms->socket_id = socket_id;

根据socket_mem的需求,计算hugepage在不同socket的分布。然后通过eal_memalloc_alloc_seg_bulk → alloc_seg_walk → alloc_seg的调用链进行分配,
由于这个方法是一个一个mem segment进行分配,所以不能保证分配完成后,虚拟空间上连续的mem segments在物理上也是连续的.
采用了pre-allocate,能够提高系统的性能。

4) rte_eal_memdevice_init()
设置mcfg->nchannel, mcfg->nrank

(gdb) p (struct rte_memseg)(rte_config.mem_config.memsegs.memseg_arr.data)
$32 = {{phys_addr = 4295155712, iova = 4295155712}, {addr = 0x0, addr_64 = 0}, len = 21479030784, 
  hugepage_sz = 2097152, socket_id = 0, nchannel = 0, nrank = 0, flags = 4}

6、rte_eal_malloc_heap_init()
初始化mcfg->malloc_heaps;并且注册进程间通信的handle,用于多进程环境下的内存分配;初始化heap的结构。其中每一个socket会对应一个heap。
初始化完成后heap的结构如下(一个例子):
在这里插入图片描述
假设系统支持两种大小的hugepage(2MB, 1GB)
上图的heap包含两个rte_memseg_list, 每一个都包含3个contiguous mem segments(其中可能包含一个或多个hugepage), 总共有6个contiguous mem segments(图中浅黄色的部分)。每一个contiguous mem segment都包含一个malloc_elem,用于记录此contiguous mem segment的元数据。每一个struct malloc_heap都会指向第一个malloc_elem和最后一个malloc_elem;并且一个heap中,所有的malloc_elem会组成一个双向链表。

二、对于secondary process的内存初始化过程:
1、rte_config_init()
使用mmap()将config文件映射到此进程的mcfg,这样可以直接读取primary process的内存映像.

2、eal_hugepage_info_read()
secondary process不调用eal_hugepage_info_init(),而是调用eal_hugepage_info_read():读取hugepage_info文件的内容,并保存在internal_config.hugepage_info中。

3、rte_eal_memzone_init()
根据config文件中关于memzones的内容, 创建一个和primary process具有相同内存映像的mcfg->memzones

4、rte_eal_memory_init()
这是内存初始化过程的核心,其中包括了
----memseg_secondary_init(),
----eal_memalloc_init(),
----rte_eal_hugepage_attach(),
----rte_eal_memdevice_init().

1) memseg_secondary_init() :
直接根据config文件的内容创建和primary process相同的虚拟内存空间视图。

2)eal_memalloc_init() :
对mcfg中的struct rte_memseg_list创建一个本地副本(即local_memsegs),用于同步memory hotplug;然后初始化struct fd_list:如果是single-file-segments mode,则对于一个rte_memseg_list只使用一个file descriptor(fd_list中的memseg_list_fd);如果是page-per-file mode,则对于一个rte_memseg_list中的每一个mem segment都会使用一个file descriptor(fd_list中的fds)。

3)rte_eal_hugepage_attach():
如果是legacy mem, eal_legacy_hugepage_attach()
读取hugepage_data文件,根据文件的内容建立与primary process相应的内存映像,并且设置fd_list中相应的file descriptor。如果是dynamic mem, eal_hugepage_attach()调用eal_memalloc_sync_with_primary(), 将primary process的mcfg->memsegs同步到此进程的local_memsegs。

4)rte_eal_memdevice_init() :
不做任何操作。

5、rte_eal_malloc_heap_init()
初始化mcfg->malloc_heaps;并且注册进程间通信的handle,用于多进程环境下的内存分配;初始化heap的结构。

三、总结
1、如果没有采用hugetlbfs,则默认采用系统页(大小为4K)
2、DPDK有两种内存模式 :

legacy mode : 保证虚拟空间连续的contiguous mem segments在物理空间上也是连续的
dynamic mode : 分配hugepage时是一个一个分配的,不能和legacy mode有一样的保证

3、DPDK在memalloc时有两种模式single-file-segments, page-per-file, 每一种都在hugetlbfs的挂载点上有相应的文件形式(即存在于内存中的文件),这样在内存分配时可以使用对file descriptor操作的系统调用对内存进行操作。

4、每一个socket有一个heap, 每一个heap包含若干个rte_memseg_list, 每一个rte_memseg_list包含若干rte_memseg, 一个rte_memseg对应于一个memory page。

5、在分配内存时,采用了read-ahead, pre-allocated等方法,能够减少由于页错误而阻塞的情况,提高系统的性能。

  • 2
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值