背景
最近在看dpdk内存管理相关的代码,但是看到calc_num_pages_per_socket()函数时,有点发懵,搞不懂为什么这个函数中的代码是这么写的,还有为什么需要调用这个函数,这两个问题困扰了我很久,终于有一天让我想明白了,所以决定记录一下。(可以先看结论,再看分析过程)
结论(这个结论隔了一个晚上我才想明白)
该函数是用于算出最终需要的大页数,比如:
1、请求内存比较大,实际的大页内存不够。这种情况下,会返回-1;
2、请求的内存比较小,实际的大页内存足够多,分三种:
请求1G,那么就分配一个1G的大页;
请求1000M,分配1G会有点多,先看2M尺寸的大页够不够:如果够,就分配500个2M的大页;如果不够,那就只能分配一个1G的大页;
直接上代码
// 代码路径: lib/librte_eal/linuxapp/eal/eal_memory.c
/**
 * Decide how many hugepages of each size to take from each socket.
 *
 * Phase 1 (only when internal_config.force_sockets == 0): fill memory[] by
 * splitting internal_config.memory across sockets in proportion to the number
 * of enabled lcores on each socket, capped by get_socket_mem_size(socket);
 * whatever is still unassigned is then taken from the sockets in index order.
 *
 * Phase 2: for every socket, walk the page sizes in hp_info[] in array order.
 * Take as many whole pages of the current size as fit into the request. If a
 * remainder is left and pages of this size are still free, check whether the
 * later sizes can cover the remainder; if they cannot, take one extra page of
 * the current size and stop for this socket.
 *
 * @param memory       per-socket request in bytes, indexed by socket id;
 *                     decremented in place as pages are assigned
 * @param hp_info      available page sizes and per-socket page counts
 *                     (NOTE(review): the fallback logic reads as if entries
 *                     are ordered largest size first - confirm with caller)
 * @param hp_used      output; hugedir and num_pages[socket] are written for
 *                     every size that is visited
 * @param num_hp_info  number of entries in hp_info / hp_used
 * @return total number of pages selected, or -1 when num_hp_info is 0 or the
 *         request cannot be satisfied
 */
static int
calc_num_pages_per_socket(uint64_t * memory,
struct hugepage_info *hp_info,
struct hugepage_info *hp_used,
unsigned num_hp_info)
{
unsigned socket, j, i = 0;
unsigned requested, available;
int total_num_pages = 0;
uint64_t remaining_mem, cur_mem;
/* bytes of the overall request that have not been covered by pages yet */
uint64_t total_mem = internal_config.memory;
/* nothing to choose from without at least one page size */
if (num_hp_info == 0)
return -1;
/* if specific memory amounts per socket weren't requested */
if (internal_config.force_sockets == 0) {
int cpu_per_socket[RTE_MAX_NUMA_NODES];
size_t default_size, total_size;
unsigned lcore_id;
/* Compute number of cores per socket */
memset(cpu_per_socket, 0, sizeof(cpu_per_socket));
RTE_LCORE_FOREACH(lcore_id) {
cpu_per_socket[rte_lcore_to_socket_id(lcore_id)]++;
}
/*
 * Automatically spread requested memory amongst detected sockets according
 * to number of cores from cpu mask present on each socket
 */
/* total_size tracks the part of the request not yet assigned to a socket */
total_size = internal_config.memory;
for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; socket++)
{
/* Set memory amount per socket: share proportional to its lcore count */
default_size = (internal_config.memory * cpu_per_socket[socket])
/ rte_lcore_count();
/* Limit to maximum available memory on socket */
default_size = RTE_MIN(default_size, get_socket_mem_size(socket));
/* Update sizes */
memory[socket] = default_size;
total_size -= default_size;
}
/*
 * If some memory is remaining, try to allocate it by getting all
 * available memory from sockets, one after the other
 */
for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; socket++)
{
/* take whatever is available: the socket's spare capacity or the rest
 * of the request, whichever is smaller */
default_size = RTE_MIN(get_socket_mem_size(socket) - memory[socket],
total_size);
/* Update sizes */
memory[socket] += default_size;
total_size -= default_size;
}
}
/* Phase 2: convert each socket's byte request into page counts per size.
 * The outer loop also stops as soon as the overall request is covered. */
for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_mem != 0; socket++) {
/* skips if the memory on specific socket wasn't requested */
for (i = 0; i < num_hp_info && memory[socket] != 0; i++){
hp_used[i].hugedir = hp_info[i].hugedir;
/* whole pages of this size that fit into the request, limited to the
 * pages actually present on this socket */
hp_used[i].num_pages[socket] = RTE_MIN(
memory[socket] / hp_info[i].hugepage_sz,
hp_info[i].num_pages[socket]);
/* NOTE(review): the page size is read from hp_used[i] here, but this
 * function never assigns hp_used[i].hugepage_sz; presumably the caller
 * pre-fills it with hp_info[i].hugepage_sz - confirm. */
cur_mem = hp_used[i].num_pages[socket] *
hp_used[i].hugepage_sz;
memory[socket] -= cur_mem;
total_mem -= cur_mem;
total_num_pages += hp_used[i].num_pages[socket];
/* check if we have met all memory requests */
if (memory[socket] == 0)
break;
/* every page of this size on this socket has been taken and the request
 * is still not covered: move on to the next size */
if (hp_used[i].num_pages[socket] == hp_info[i].num_pages[socket])
continue;
/* At this point pages of this size are still free, so what is left of the
 * request is smaller than one page of this size. See whether the later
 * page sizes can cover that remainder instead.
 */
remaining_mem = 0;
for (j = i+1; j < num_hp_info; j++)
remaining_mem += hp_info[j].hugepage_sz *
hp_info[j].num_pages[socket];
/* the later sizes cannot cover the remainder: take one more page of this
 * size (more than was asked for) and finish with this socket; otherwise
 * fall through to the next size */
if (remaining_mem < memory[socket]){
cur_mem = RTE_MIN(memory[socket],
hp_info[i].hugepage_sz);
memory[socket] -= cur_mem;
total_mem -= cur_mem;
hp_used[i].num_pages[socket]++;
total_num_pages++;
break; /* we are done with this socket*/
}
}
/* if we didn't satisfy all memory requirements per socket */
if (memory[socket] > 0) {
/* to prevent icc errors */
/* NOTE(review): "requested" comes from internal_config.socket_mem[], not
 * from the memory[] value used above; the two may differ when
 * force_sockets == 0 - confirm the message is meaningful in that mode. */
requested = (unsigned) (internal_config.socket_mem[socket] /
0x100000);
available = requested -
((unsigned) (memory[socket] / 0x100000));
RTE_LOG(ERR, EAL, "Not enough memory available on socket %u! "
"Requested: %uMB, available: %uMB\n", socket,
requested, available);
return -1;
}
}
/* if we didn't satisfy total memory requirements */
if (total_mem > 0) {
/* both values are converted from bytes to MB for the log message */
requested = (unsigned) (internal_config.memory / 0x100000);
available = requested - (unsigned) (total_mem / 0x100000);
RTE_LOG(ERR, EAL, "Not enough memory available! Requested: %uMB,"
" available: %uMB\n", requested, available);
return -1;
}
return total_num_pages;
}
涉及到的结构体及变量
1、internal_config:
/* Abridged listing of the configuration structure; members that are not
 * relevant to calc_num_pages_per_socket() are elided as "...". */
struct internal_config {
...
volatile size_t memory; // amount of memory requested
volatile uint64_t socket_mem[RTE_MAX_NUMA_NODES]; // amount of memory for each socket
unsigned num_hugepage_size; // number of distinct hugepage sizes
struct hugepage_info hugepage_info[MAX_HUGEPAGE_SIZE]; // one entry per hugepage size
...
}
该变量中只需要关注几个相关的字段就行:
1)num_hugepage_size:大页尺寸的个数,比如我的系统上支持两种尺寸的大页,2M和1G,所以这里是2。
2)hugepage_info:该字段中保存的是检测到的大页的信息,对应的结构体如下:
/* Abridged listing of the per-size hugepage descriptor; members that are not
 * relevant here are elided as "...". */
struct hugepage_info {
...
uint64_t page_size; // size of one page; NOTE(review): calc_num_pages_per_socket() accesses this member as hugepage_sz - confirm the real member name
uint32_t num_pages[RTE_MAX_NUMA_NODES]; // page count on each socket (author's note: usually 4 sockets)
...
}
比如在我的系统上,存在两种尺寸的大页,2M和1G。其中2M的大页配置了20个,1G的大页配了有2个,所以该字段中的信息如下:
// 1G
hugepage_info[0] = {
page_size = 1G;
num_pages[] = {1, 1}; // socket0上1个,socket1上有1个。
}
// 2M
hugepage_info[1] = {
page_size = 2M;
num_pages[] = {10, 10}; // socket0上10个,socket1上有10个。
}
3)socket_mem:每个socket上请求的内存大小,由参数--socket-mem 填充。
4)memory:该字段表示运行程序时请求的内存大小。如果没有手动指定--socket-mem参数,那么就是检测到的所有大页的内存总和;如果手动指定了--socket-mem 参数,那么就是该参数的总和。
比如:
启动时增加参数"--socket-mem 256,256",表示在socket 0和socket 1上各需要256M的内存,那么internal_config.memory = 256+256 = 512M。
如果没有指定--socket-mem 参数,那么internal_config.memory 就是检测到的所有大页的内存总和,比如2M大页内存有20个,1G大页内存有2个,那么memory = 1G * 2 + 2M * 20。
2、memory 变量
该变量的类型是 uint64_t *, 实际上是一个一维数组,表示的是每个socket上请求的内存大小,是internal_config.socket_mem的一份拷贝。
代码讲解
有了上面这些基础信息后,我们就可以开始calc_num_pages_per_socket()函数的讲解了。
1、填充 memory 数组
如果没有指定--socket-mem 参数,那么在进入calc_num_pages_per_socket()函数时,memory[] 是全0的状态,并且internal_config.force_sockets == 0,所以会进入如下代码:
// Branch taken when --socket-mem was not given (force_sockets == 0):
// memory[] is derived here instead of being supplied by the user.
if (internal_config.force_sockets == 0) {
int cpu_per_socket[RTE_MAX_NUMA_NODES];
size_t default_size, total_size;
unsigned lcore_id;
// Count how many lcores are enabled on each socket.
memset(cpu_per_socket, 0, sizeof(cpu_per_socket));
RTE_LCORE_FOREACH(lcore_id) {
cpu_per_socket[rte_lcore_to_socket_id(lcore_id)]++;
}
// Assign each socket a share of the request proportional to its lcore count.
// Example: one lcore enabled on socket 0 and two on socket 1 gives
// memory[0] = total_size * 1 / 3 and memory[1] = total_size * 2 / 3.
// Each share is then compared with get_socket_mem_size(socket) and the
// smaller of the two is kept.
total_size = internal_config.memory;
for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; socket++)
{
/* Set memory amount per socket */
default_size = (internal_config.memory * cpu_per_socket[socket])
/ rte_lcore_count();
/* Limit to maximum available memory on socket */
default_size = RTE_MIN(default_size, get_socket_mem_size(socket));
/* Update sizes */
memory[socket] = default_size;
total_size -= default_size;
}
// After the proportional split part of the request may still be unassigned;
// hand it out socket by socket, in index order.
for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; socket++)
{
/* take whatever is available */
default_size = RTE_MIN(get_socket_mem_size(socket) - memory[socket],
total_size);
/* Update sizes */
memory[socket] += default_size;
total_size -= default_size;
}
}
2、计算每个socket上的大页数量,填充到hp_used 变量中。(这也是我迷惑的地方)
// For each socket, turn the requested bytes into page counts per page size.
for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_mem != 0; socket++) {
// A socket with memory[socket] == 0 is skipped by the inner loop condition.
// Visit every hugepage size in array order.
for (i = 0; i < num_hp_info && memory[socket] != 0; i++){
hp_used[i].hugedir = hp_info[i].hugedir;
// Whole pages of this size that fit into the request, limited to the
// pages present on this socket.
hp_used[i].num_pages[socket] = RTE_MIN(
memory[socket] / hp_info[i].hugepage_sz,
hp_info[i].num_pages[socket]);
cur_mem = hp_used[i].num_pages[socket] * hp_used[i].hugepage_sz;
memory[socket] -= cur_mem;
total_mem -= cur_mem;
total_num_pages += hp_used[i].num_pages[socket];
// Stop as soon as this socket's request is fully covered; otherwise go on.
if (memory[socket] == 0)
break;
/* every page of this size on this socket has been taken: move on to the
 * next size */
// Only two situations can reach this point:
// 1. hp_used[i].num_pages == hp_info[i].num_pages: the request is at least
//    as large as everything available at this size.
// 2. hp_used[i].num_pages < hp_info[i].num_pages: the request is smaller than
//    what is available and is not a whole multiple of this page size.
if (hp_used[i].num_pages[socket] == hp_info[i].num_pages[socket])
continue;
/* At this point pages of this size are still free, so what is left of the
 * request is smaller than one page of this size. See whether the later
 * page sizes can cover it.
 */
// Situation 2 from above, with two possible outcomes:
// a. The other sizes do not hold enough memory: one larger page is taken,
//    e.g. a 1000M request ends up with a single 1024M page.
// b. The other sizes hold enough memory: e.g. 1000M is not a multiple of
//    1024M but is a multiple of 2M, so 500 pages of 2M are taken later.
remaining_mem = 0;
for (j = i+1; j < num_hp_info; j++)
remaining_mem += hp_info[j].hugepage_sz *
hp_info[j].num_pages[socket];
/* the later sizes cannot cover the remainder: take one more page of this
 * size and finish with this socket */
if (remaining_mem < memory[socket]){
cur_mem = RTE_MIN(memory[socket], hp_info[i].hugepage_sz);
memory[socket] -= cur_mem;
total_mem -= cur_mem;
hp_used[i].num_pages[socket]++;
total_num_pages++;
break; /* we are done with this socket*/
}
}
/* if we didn't satisfy all memory requirements per socket */
if (memory[socket] > 0) {
/* to prevent icc errors */
requested = (unsigned) (internal_config.socket_mem[socket] / 0x100000);
available = requested - ((unsigned) (memory[socket] / 0x100000));
RTE_LOG(ERR, EAL, "Not enough memory available on socket %u! "
"Requested: %uMB, available: %uMB\n", socket,
requested, available);
return -1;
}
}
具体情况:
第一种情况:
请求的内存多,但是实际拥有的大页内存比较少
比如,socket 0上1G的大页配置了2个, 那么该socket上总的内存为get_socket_mem_size = 2 * 1G = 2G;
启动程序时手动指定socket 0上需要的内存为3G
calc_num_pages_per_socket:
hp_info[0] : page_size = 1G; num_pages[0] = 2
memory[0] =3G
所以:
hp_used[0].num_pages[0] = rte_min( memory[0] / hp_used[0].page_size, hp_info[0].num_pages) = 2
cur_mem = hp_used[0].num_pages[0] * hp_used[0].page_size;
memory[0] -= cur_mem = 1G
total_mem -= cur_mem = 1G
memory[0] != 0
hp_used[0].num_pages[0] == hp_info[0].num_pages[0]
memory[0] != 0 , return -1;
再比如:socket 0 上1G的大页配置了2个, 2M的大页配置了2个,但是我请求了2054M的内存
memory[0] = 2054M = 1G * 2 + 2M * 3;
hp_info[0].page_size = 1G, hp_info[0].num_pages[0] = 2
hp_info[1].page_size = 2M, hp_info[1].num_pages[0] = 2
// 1G
{
hp_used[0].page_size = 1G
hp_used[0].num_pages[0] = rte_min(memory[0] / hp_used[0].page_size, hp_info[0].num_pages[0]) = 2
cur_mem = 1G * 2
total_mem -= cur_mem; // 6M
memory[0] -= cur_mem; // 6M
memory[0] != 0
hp_used[0].num_pages[0] == hp_info[0].num_pages
}
// 2M
{
hp_used[1].page_size = 2M
hp_used[1].num_pages[0] = rte_min(memory[0] / hp_used[1].page_size, hp_info[1].num_pages[0]) = 2
cur_mem = 2M * 2 = 4M
total_mem -= cur_mem ; // 2M
memory[0] -= cur_mem; // 2M
memory[0] != 0
hp_used[1].num_pages[0] == hp_info[1].num_pages[0]
// 所有尺寸的大页都已用完,但 memory[0] 仍不为0(还差2M),所以最终 return -1;
}
第二种情况:
只有一种大页尺寸(1G), 请求的内存较少,并且不是该尺寸的整数倍。
比如:socket 0上 1G 的大页内存有2个,但是程序启动的时候只需要1000M的内存
memory[0] = 1000M
total_mem = 1000M
hp_info[0].page_size = 1G, hp_info[0].num_pages[0] = 2
// 1G
{
hp_used[0].page_size = 1G
hp_used[0].num_pages[0] = rte_min (memory[0] / hp_used[0].page_size, hp_info[0].num_pages[0]) = 0;
cur_mem = 0;
total_mem -= 0 ;
memory[0] -= 0 ;
memory[0] != 0;
hp_used[0].num_pages[0] != hp_info[0].num_pages[0]
remaining_mem = 0;
remaining_mem < memory[0]
cur_mem = rte_min(memory[0], hp_info[0].page_size) = memory[0];
memory[0] -= cur_mem;
total_mem -= cur_mem;
num_pages ++
}
有两种大页尺寸,请求的内存大小不是第一个的整数倍,是第二个的整数倍,但是第二个预留的内存不够
再比如:socket 0 上1G的大页内存有2个,2M的大页内存有2个,但是程序请求1000M的内存
memory[0] = 1000M
total_mem = 1000M
hp_info[0].page_size = 1G, hp_info[0].num_pages[0] = 2
hp_info[1].page_size = 2M, hp_info[1].num_pages[0] = 2
// 1G
{
hp_used[0].page_size = 1G
hp_used[0].num_pages[0] = rte_min (memory[0] / hp_used[0].page_size, hp_info[0].num_pages[0]) = 0;
cur_mem = 0;
total_mem -= 0 ;
memory[0] -= 0 ;
memory[0] != 0;
hp_used[0].num_pages[0] != hp_info[0].num_pages[0]
remaining_mem = 2M * 2 = 4M
remaining_mem < memory[0]
cur_mem = rte_min(memory[0], hp_info[0].page_size) = memory[0];
memory[0] -= cur_mem;
total_mem -= cur_mem;
num_pages ++
}
第三种情况:
大页尺寸有两个(1G、2M),请求的内存不是第一个的整数倍,但是第二个尺寸的大小足够该请求的内存大小
再比如:socket 0 上1G的大页内存有2个,2M的大页内存有2000个,但是程序请求1000M的内存
memory[0] = 1000M
total_mem = 1000M
hp_info[0].page_size = 1G, hp_info[0].num_pages[0] = 2
hp_info[1].page_size = 2M, hp_info[1].num_pages[0] = 2000
// 1G
{
hp_used[0].page_size = 1G
hp_used[0].num_pages[0] = rte_min (memory[0] / hp_used[0].page_size, hp_info[0].num_pages[0]) = 0;
cur_mem = 0;
total_mem -= 0 ;
memory[0] -= 0 ;
memory[0] != 0;
hp_used[0].num_pages[0] != hp_info[0].num_pages[0]
remaining_mem = 2M * 2000 = 4000M
remaining_mem > memory[0]  // 其他尺寸的内存足够,不额外分配1G的大页,继续看下一个尺寸
}
// 2M
{
hp_used[1].page_size = 2M
hp_used[1].num_pages[0] = rte_min (memory[0] / hp_used[1].page_size, hp_info[1].num_pages[0]) = 500;
cur_mem = 2M * 500 = 1000M;
total_mem -= 1000M ;
memory[0] -= 1000M ;
memory[0] == 0;
}
(真是绕)