dpdk内存管理之calc_num_pages_per_socket()函数分析

背景

最近在看dpdk内存管理相关的代码,但是看到calc_num_pages_per_socket()函数时,有点发懵,搞不懂为什么这个函数中的代码是这么写的,还有为什么需要调用这个函数,这两个问题困扰了我很久,终于有一天让我想明白了,所以决定记录一下。(可以先看结论,再看分析过程)

结论(这个结论隔了一个晚上我才想明白)

该函数是用于算出最终需要的大页数,比如:

1、请求内存比较大,实际的大页内存不够。这种情况下,会返回-1;

2、请求的内存比较小,实际的大页内存足够多,分三种:

        请求1G,那么就分配一个1G的大页;

        请求1000M,分配1G会有点多,看2M尺寸的大页够不够,如果够就分配2M的大页尺寸500个;如果不够,那就只能分配一个1G的大页;

直接上代码

// 代码路径: lib/librte_eal/linuxapp/eal/eal_memory.c
/**
 * Decide how many huge pages of each size to take from each socket in order
 * to cover the requested amount of memory.
 *
 * @param memory
 *   Per-socket requested amount, indexed by socket id. The values are divided
 *   by 0x100000 to print megabytes, so they are byte counts. When
 *   internal_config.force_sockets is 0 the array is first filled in by this
 *   function; in every case it is then decremented as pages are assigned, so
 *   it is modified in place.
 * @param hp_info
 *   Array of num_hp_info entries describing what is available: one entry per
 *   page size, with the page count available on each socket.
 * @param hp_used
 *   Array of num_hp_info entries receiving the result: hugedir is copied from
 *   hp_info and num_pages[socket] is set to the number of pages chosen.
 *   NOTE(review): hp_used[i].hugepage_sz is read below but never written in
 *   this function, so the caller presumably initialises it to match
 *   hp_info[i].hugepage_sz -- confirm against the caller.
 * @param num_hp_info
 *   Number of entries in hp_info / hp_used.
 * @return
 *   Total number of pages selected across all sockets and sizes, or -1 when
 *   num_hp_info is 0 or the request cannot be satisfied.
 */
static int
calc_num_pages_per_socket(uint64_t * memory,
		struct hugepage_info *hp_info,
		struct hugepage_info *hp_used,
		unsigned num_hp_info)
{
	unsigned socket, j, i = 0;
	unsigned requested, available;
	int total_num_pages = 0;
	uint64_t remaining_mem, cur_mem;
	/* overall amount still to be covered; counted down alongside memory[] */
	uint64_t total_mem = internal_config.memory;

	/* no page size described at all: nothing can be allocated */
	if (num_hp_info == 0)
		return -1;

	/* if specific memory amounts per socket weren't requested */
	if (internal_config.force_sockets == 0) {
		int cpu_per_socket[RTE_MAX_NUMA_NODES];
		size_t default_size, total_size;
		unsigned lcore_id;

		/* Compute number of cores per socket */
		memset(cpu_per_socket, 0, sizeof(cpu_per_socket));
		RTE_LCORE_FOREACH(lcore_id) {
			cpu_per_socket[rte_lcore_to_socket_id(lcore_id)]++;
		}

		/*
		 * Automatically spread requested memory amongst detected sockets according
		 * to number of cores from cpu mask present on each socket
		 */
		total_size = internal_config.memory;
		for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; socket++)                         
        {

			/* Set memory amount per socket: share proportional to its lcore count */
			default_size = (internal_config.memory * cpu_per_socket[socket])
			                / rte_lcore_count();

			/* Limit to maximum available memory on socket */
			default_size = RTE_MIN(default_size, get_socket_mem_size(socket));

			/* Update sizes: total_size keeps what has not been placed yet */
			memory[socket] = default_size;
			total_size -= default_size;
		}

		/*
		 * If some memory is remaining, try to allocate it by getting all
		 * available memory from sockets, one after the other
		 */
		for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; socket++) 
        {
			/* take whatever is available beyond what this socket already got */
			default_size = RTE_MIN(get_socket_mem_size(socket) - memory[socket],
			                       total_size);

			/* Update sizes */
			memory[socket] += default_size;
			total_size -= default_size;
		}
	}

	/*
	 * Turn each socket's byte request into page counts, walking the page
	 * sizes in array order.
	 * NOTE(review): the comments below speak of remaining pages being "bigger
	 * than the memory we want", which suggests hp_info is ordered from the
	 * largest page size to the smallest -- confirm against the caller.
	 */
	for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_mem != 0; socket++) {
		/* skips if the memory on specific socket wasn't requested */
		for (i = 0; i < num_hp_info && memory[socket] != 0; i++){
			hp_used[i].hugedir = hp_info[i].hugedir;
			/* whole pages of this size that fit in the request, capped by availability */
			hp_used[i].num_pages[socket] = RTE_MIN(
					memory[socket] / hp_info[i].hugepage_sz,
					hp_info[i].num_pages[socket]);

			/* bytes covered by the pages just chosen (relies on hp_used[i].hugepage_sz) */
			cur_mem = hp_used[i].num_pages[socket] *
					hp_used[i].hugepage_sz;

			memory[socket] -= cur_mem;
			total_mem -= cur_mem;

			total_num_pages += hp_used[i].num_pages[socket];

			/* check if we have met all memory requests */
			if (memory[socket] == 0)
				break;

			/* if every available page of this size has been taken, nothing
			 * more can come from it: move on to next size */
			if (hp_used[i].num_pages[socket] == hp_info[i].num_pages[socket])
				continue;
			/* At this point we know that there are more pages available that are
			 * bigger than the memory we want, so lets see if we can get enough
			 * from other page sizes.
			 */
			remaining_mem = 0;
			for (j = i+1; j < num_hp_info; j++)
				remaining_mem += hp_info[j].hugepage_sz *
				hp_info[j].num_pages[socket];

			/* is there enough other memory, if not allocate another page and quit */
			if (remaining_mem < memory[socket]){
				/* the leftover is smaller than one page of this size here,
				 * so this normally clears memory[socket] */
				cur_mem = RTE_MIN(memory[socket],
						hp_info[i].hugepage_sz);
				memory[socket] -= cur_mem;
				total_mem -= cur_mem;
				hp_used[i].num_pages[socket]++;
				total_num_pages++;
				break; /* we are done with this socket*/
			}
			/* otherwise fall through: the next page sizes can cover the rest */
		}
		/* if we didn't satisfy all memory requirements per socket */
		if (memory[socket] > 0) {
			/* to prevent icc errors */
			/*
			 * NOTE(review): "requested" is taken from
			 * internal_config.socket_mem[socket] even when memory[] was
			 * computed above (force_sockets == 0); in that case the printed
			 * figures may not match what was actually sought -- confirm.
			 */
			requested = (unsigned) (internal_config.socket_mem[socket] /
					0x100000);
			available = requested -
					((unsigned) (memory[socket] / 0x100000));
			RTE_LOG(ERR, EAL, "Not enough memory available on socket %u! "
					"Requested: %uMB, available: %uMB\n", socket,
					requested, available);
			return -1;
		}
	}

	/* if we didn't satisfy total memory requirements */
	if (total_mem > 0) {
		/* values converted to megabytes for the log message */
		requested = (unsigned) (internal_config.memory / 0x100000);
		available = requested - (unsigned) (total_mem / 0x100000);
		RTE_LOG(ERR, EAL, "Not enough memory available! Requested: %uMB,"
				" available: %uMB\n", requested, available);
		return -1;
	}
	return total_num_pages;
}

涉及到的结构体及变量

1、internal_config:

struct internal_config {
    ...
    volatile size_t memory; // 请求的内存大小
    volatile uint64_t socket_mem[RTE_MAX_NUMA_NODES]; // 每个socket上的内存大小
    unsigned num_hugepage_size; // 大页尺寸的个数
    struct hugepage_info hugepage_info[MAX_HUGEPAGE_SIZE];
    ...
}

该变量中只需要关注几个相关的字段就行:

1)num_hugepage_size:大页尺寸的个数,比如我的系统上支持两种尺寸的大页,2M和1G,所以这里是2。

2)hugepage_info:该字段中保存的是检测到的大页的信息,对应的结构体如下:

struct hugepage_info {
    ...               
    uint64_t page_size;
    uint32_t num_pages[RTE_MAX_NUMA_NODES]; // 一般是4个socket
    ... 
}

比如在我的系统上,存在两种尺寸的大页,2M和1G。其中2M的大页配置了20个,1G的大页配了有2个,所以该字段中的信息如下:

// 1G
hugepage_info[0] = {
    page_size = 1G;
    num_pages[] = {1, 1}; // socket0上1个,socket1上有1个。
}

// 2M
hugepage_info[1] = {
    page_size = 2M;
    num_pages[] = {10, 10}; // socket0上10个,socket1上有10个。
}

3)socket_mem:每个socket上请求的内存大小,由参数--socket-mem 填充。

4)memory:该字段表示运行程序时请求的内存大小。如果没有手动指定--socket-mem参数,那么就是检测到的所有大页的内存总和;如果手动指定了--socket-mem 参数,那么就是该参数的总和。

比如:

        启动时增加参数"--socket-mem 256,256",表示在socket 0和socket 1上各需要256M的内存,那么internal_config.memory = 256+256 = 512M。

        如果没有指定--socket-mem 参数,那么internal_config.memory 就是检测到的所有大页的内存总和,比如2M大页内存有20个,1G大页内存有2个,那么memory = 1G * 2 + 2M * 20。

2、memory 变量

该变量的类型是 uint64_t *, 实际上是一个一维数组,表示的是每个socket上请求的内存大小,是internal_config.socket_mem的一份拷贝。

代码讲解

有了上面这些基础信息后,我们就可以开始calc_num_pages_per_socket()函数的讲解了。

1、填充 memory 数组

如果没有指定--socket-mem 参数,那么在进入calc_num_pages_per_socket()函数时,memory[] 是全0的状态,并且internal_config.force_sockets == 0,所以会进入如下代码:

// 没有指定 --socket-mem 参数
if (internal_config.force_sockets == 0) {
    int cpu_per_socket[RTE_MAX_NUMA_NODES];
	size_t default_size, total_size;
	unsigned lcore_id;

	// 计算每个socket上启动了几个lcore
	memset(cpu_per_socket, 0, sizeof(cpu_per_socket));
	RTE_LCORE_FOREACH(lcore_id) {
		cpu_per_socket[rte_lcore_to_socket_id(lcore_id)]++;
	}

	// 根据lcore的占比,给每个socket上自动分配内存大小。
    // 比如socket0上只启动了一个lcore,而socket1上启动了2个lcore,
    // 所以memory[0] = total_size * 1 / 3, 而memory[1] = total_size * 2 / 3
    // 最后在跟 get_socket_mem_size(socket)获取的大小进行比较,然后取较小的那个。
	total_size = internal_config.memory;
	for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; socket++)                         
    {
	    /* Set memory amount per socket */
		default_size = (internal_config.memory * cpu_per_socket[socket])
	                / rte_lcore_count();

		/* Limit to maximum available memory on socket */
		default_size = RTE_MIN(default_size, get_socket_mem_size(socket));

		/* Update sizes */
		memory[socket] = default_size;
		total_size -= default_size;
	}

	// 经过上面的自动分配,很有可能有剩余的内存
	for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; socket++) 
    {
		/* take whatever is available */
		default_size = RTE_MIN(get_socket_mem_size(socket) - memory[socket],
	                       total_size);

		/* Update sizes */
		memory[socket] += default_size;
		total_size -= default_size;
	}
}

2、计算每个socket上的大页数量,填充到hp_used 变量中。(这也是我迷惑的地方)

for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_mem != 0; socket++) {
	// 如果memory[socket] == 0 跳过
    // 每种大页尺寸
	for (i = 0; i < num_hp_info && memory[socket] != 0; i++){
		hp_used[i].hugedir = hp_info[i].hugedir;
		hp_used[i].num_pages[socket] = RTE_MIN(
				memory[socket] / hp_info[i].hugepage_sz,
				hp_info[i].num_pages[socket]);

		cur_mem = hp_used[i].num_pages[socket] * hp_used[i].hugepage_sz;
		memory[socket] -= cur_mem;
		total_mem -= cur_mem;

		total_num_pages += hp_used[i].num_pages[socket];

		// 检查是否已经满足内存要求,如果满足则退出,不满足继续。
		if (memory[socket] == 0)
			break;

		/* check if we have any more pages left at this size, if so
		 * move on to next size */
        // 我认为这里只会出现两种情况:
        // 1、 hp_used[i].num_pages = hp_info[i].num_pages,请求的内存比实际的内存大或者正好相等
        // 2、hp_used[i].num_pages < hp_info[i].num_pages, 请求的内存比实际的内存小,并且不是该size的整数倍
		if (hp_used[i].num_pages[socket] == hp_info[i].num_pages[socket])
			continue;

		/* At this point we know that there are more pages available that are
		 * bigger than the memory we want, so lets see if we can get enough
		 * from other page sizes.
		 */
        // 到了这里,说明的请求的内存比实际内存小,并且不是该size的整数倍。
        // 1、其他的size没有足够的内存,那么就分配一个较大的内存,比如请求1000M,这里会分配一个1024M的内存页
        // 2、其他的size 有足够的内存。比如请求1000M,不是1024的倍数,但是是2M的倍数,所以2M的就会分配500页。
		remaining_mem = 0;
		for (j = i+1; j < num_hp_info; j++)
			remaining_mem += hp_info[j].hugepage_sz *
			hp_info[j].num_pages[socket];
	
    	/* is there enough other memory, if not allocate another page and quit */
		if (remaining_mem < memory[socket]){
			cur_mem = RTE_MIN(memory[socket], hp_info[i].hugepage_sz);
			memory[socket] -= cur_mem;
			total_mem -= cur_mem;
			hp_used[i].num_pages[socket]++;
			total_num_pages++;
			break; /* we are done with this socket*/
		}
	}

	/* if we didn't satisfy all memory requirements per socket */
    if (memory[socket] > 0) {
	    /* to prevent icc errors */
		requested = (unsigned) (internal_config.socket_mem[socket] / 0x100000);
		available = requested - ((unsigned) (memory[socket] / 0x100000));
		RTE_LOG(ERR, EAL, "Not enough memory available on socket %u! "
				"Requested: %uMB, available: %uMB\n", socket,
				requested, available);
		return -1;
	}
}

具体情况: 

第一种情况:
	请求的内存多,但是实际拥有的大页内存比较少
	比如,socket 0上1G的大页配置了2个, 那么该socket上总的内存为get_socket_mem_size = 2 * 1G  = 2G;
	启动程序时手动指定socket 0上需要的内存为3G

	calc_num_pages_per_socket:
		hp_info[0] : page_size = 1G; num_pages[0] = 2
		memory[0] =3G

		所以:
		hp_used[0].num_pages[0] = rte_min( memory[0] / hp_used[0].page_size, hp_info[0].num_pages) = 2
		cur_mem = hp_used[0].num_pages[0]  * hp_used[0].page_size;
		memory[0] -= cur_mem = 1G
		total_mem -= cur_mem = 1G

		memory[0] != 0 
		hp_used[0].num_pages[0] == hp_info[0].num_pages[0]


	memory[0] != 0 , return -1;

	再比如:socket 0 上1G的大页配置了2个, 2M的大页配置了2个,但是我请求了2054M的内存

	memory[0] = 2054M = 1G * 2 + 2M * 3;
	hp_info[0].page_size = 1G, hp_info[0].num_pages[0] = 2
	hp_info[1].page_size = 2M, hp_info[1].num_pages[0] = 2

	// 1G
	{
		hp_used[0].page_size = 1G
		hp_used[0].num_pages[0] = rte_min(memory[0] / hp_used[0].page_size, hp_info[0].num_pages[0]) = 2
		
		cur_mem = 1G * 2
		
		total_mem -= cur_mem; // 6M
		memory[0] -= cur_mem; // 6M

		memory[0] != 0
		hp_used[0].num_pages[0] == hp_info[0].num_pages
	}

	// 2M
	{
		hp_used[1].page_size = 2M
		hp_used[1].num_pages[0] = rte_min(memory[0] / hp_used[1].page_size, hp_info[1].num_pages[0]) = 2

		cur_mem = 2M * 2 = 4M 

		total_mem -= cur_mem ; // 2M
		memory[0] -= cur_mem; // 2M


		memory[0] != 0
		hp_used[1].num_pages[0] == hp_info[1].num_pages[0]
		// 所有尺寸的大页都已用完,memory[0] 仍剩 2M,所以最终 return -1
	}

第二种情况:
	只有一种大页尺寸(1G), 请求的内存较少,并且不是该尺寸的整数倍。
	比如:socket 0上 1G 的大页内存有2个,但是程序启动的时候只需要1000M的内存

	memory[0]  = 1000M
	total_mem = 1000M

	hp_info[0].page_size = 1G, hp_info[0].num_pages[0] = 2

	// 1G
	{
		hp_used[0].page_size = 1G
		hp_used[0].num_pages[0] = rte_min (memory[0] / hp_used[0].page_size, hp_info[0].num_pages[0]) = 0;
		
		cur_mem = 0;
		total_mem -= 0 ;
		memory[0] -= 0 ;

		memory[0] != 0;
		hp_used[0].num_pages[0] != hp_info[0].num_pages[0]

		remaining_mem = 0;

		remaining_mem < memory[0]
		
		cur_mem = rte_min(memory[0], hp_info[0].page_size) = memory[0];

		memory[0] -= cur_mem;
		total_mem -= cur_mem;
		
		num_pages ++
	}

	有两种大页尺寸,请求的内存大小不是第一个的整数倍,是第二个的整数倍,但是第二个预留的内存不够
	再比如:socket 0 上1G的大页内存有2个,2M的大页内存有2个,但是程序请求1000M的内存

	memory[0]  = 1000M
	total_mem = 1000M

	hp_info[0].page_size = 1G, hp_info[0].num_pages[0] = 2
	hp_info[1].page_size = 2M, hp_info[1].num_pages[0] = 2

	// 1G
	{
		hp_used[0].page_size = 1G
		hp_used[0].num_pages[0] = rte_min (memory[0] / hp_used[0].page_size, hp_info[0].num_pages[0]) = 0;
		
		cur_mem = 0;
		total_mem -= 0 ;
		memory[0] -= 0 ;

		memory[0] != 0;
		hp_used[0].num_pages[0] != hp_info[0].num_pages[0]

		remaining_mem = 2M * 2 = 4M

		remaining_mem < memory[0]
		
		cur_mem = rte_min(memory[0], hp_info[0].page_size) = memory[0];

		memory[0] -= cur_mem;
		total_mem -= cur_mem;
		
		num_pages ++
	}

第三种情况:
	大页尺寸有两个(1G、2M),请求的内存不是第一个的整数倍,但是第二个尺寸的大小足够该请求的内存大小
	再比如:socket 0 上1G的大页内存有2个,2M的大页内存有2000个,但是程序请求1000M的内存

	memory[0]  = 1000M
	total_mem = 1000M

	hp_info[0].page_size = 1G, hp_info[0].num_pages[0] = 2
	hp_info[1].page_size = 2M, hp_info[1].num_pages[0] = 2000

	// 1G
	{
		hp_used[0].page_size = 1G
		hp_used[0].num_pages[0] = rte_min (memory[0] / hp_used[0].page_size, hp_info[0].num_pages[0]) = 0;
		
		cur_mem = 0;
		total_mem -= 0 ;
		memory[0] -= 0 ;

		memory[0] != 0;
		hp_used[0].num_pages[0] != hp_info[0].num_pages[0]

		remaining_mem = 2M * 2000 = 4000M

		remaining_mem > memory[0],所以不额外分配1G的大页,继续尝试下一种尺寸(2M)
	}

	// 2M
	{
		hp_used[1].page_size = 2M
		hp_used[1].num_pages[0] = rte_min (memory[0] / hp_used[1].page_size, hp_info[1].num_pages[0]) = 500;
		
		cur_mem = 2M * 500 = 1000M;
		total_mem -= 1000M;
		memory[0] -= 1000M;

		memory[0] == 0;
	}
	

(真是绕)        

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值