Bootmem分配器是Linux内核启动初期使用的内存管理系统,在页分配器初始化好之前,都需要借助于bootmem分配器来分配、释放内存。页分配器初始化好之后,将不再使用bootmem分配器。其原理是通过位图来记录页面的状态,位图比特位为1表示页面已经分配出去,为0表示页面是空闲的。
Bootmem分配器管理的是低端内存(对MIPS来说,就是物理地址低于HIGHMEM_START(0x20000000,即512MB)的内存;在我的开发板上,256MB至512MB是IO空间,因此实际可用的低端RAM位于物理地址0至256MB之间),这部分内存是直接映射的,分配时返回的是低端内存的虚拟地址。
初始化
这部分是体系结构相关的,我的开发板是MIPS的,下面的代码来自于MIPS,其他体系的原理基本相同,只是实现上会有差异。
bootmem_init
bootmem_init负责bootmem分配器的初始化,其调用轨迹是:
start_kernel->setup_arch->arch_mem_init->bootmem_init
此处我们提到的高端内存、低端内存更确切的说是高端物理地址和低端物理地址,参见地址空间一节。
/*
 * bootmem_init - set up the boot-time (bootmem) allocator on MIPS.
 *
 * Reached via start_kernel -> setup_arch -> arch_mem_init -> bootmem_init.
 *
 * Steps, in order:
 *   1. work out where the reserved area (kernel image / initrd) ends;
 *   2. scan boot_mem_map for the lowest and highest usable pfn and pick
 *      the page that will hold the bootmem bitmap (mapstart);
 *   3. split the pfn range into low and high memory;
 *   4. initialise the bootmem bitmap for low memory on node 0;
 *   5. register every range with add_active_range();
 *   6. free the usable low-memory ranges into bootmem, then reserve the
 *      bitmap itself and the initrd again.
 *
 * The example values quoted in the comments below come from one specific
 * MIPS board with 2GB of RAM and 4KB pages; they are illustrations only.
 */
static void __init bootmem_init(void)
{
unsigned long reserved_end;
unsigned long mapstart = ~0UL;
unsigned long bootmap_size;
int i;
/*
* Init any data related to initrd. It's a nop if INITRD is
* not selected. Once that done we can determine the low bound
* of usable memory.
*/
/*
 * init_initrd() yields the pfn of the first free page after the initrd;
 * _end is the end address of the kernel image itself (text, data, bss,
 * ...). Everything below that is already occupied, so the local variable
 * reserved_end records the pfn of the first free page after the
 * reserved area.
 */
reserved_end = max(init_initrd(),
(unsigned long) PFN_UP(__pa_symbol(&_end)));
/*
* max_low_pfn is not a number of pages. The number of pages
* of the system is given by 'max_low_pfn - min_low_pfn'.
*/
/*
 * The globals min_low_pfn / max_low_pfn will end up as the lowest and
 * highest pfn of usable low memory. Start from the extreme values so
 * the scan below can narrow them down.
 */
min_low_pfn = ~0UL;
max_low_pfn = 0;
/*
* Find the highest page frame number we have available.
*/
/*
 * boot_mem_map holds the usable memory regions handed over by the boot
 * loader. Scan it for the lowest and highest usable pfn. Note that this
 * covers high memory as well as low memory; on MIPS, high memory is the
 * physical range above 512MB.
 *
 * Example board (2GB RAM), boot_mem_map contents:
 *   0:Determined physical RAM map:
 *   0: memory: 0b700000 @ 00100000 0:(usable)
 *   0: memory: 04000000 @ 0c000000 0:(usable)
 *   0: memory: 6f000000 @ 20000000 0:(usable)
 * i.e. the following ranges:
 *   0: 0000100000 (1MB)   --> 000b7fffff (184MB)
 *   0: 000c000000 (192MB) --> 000fffffff (256MB)
 *   0: 0020000000 (512MB) --> 008effffff (2288MB)
 * The lowest 1MB is kept by the boot loader for the CPU exception
 * vectors, 184MB-192MB is kept by the boot loader for other purposes,
 * and 256MB-512MB is I/O space. reserved_end is 0x5216, so the reserved
 * area ends inside page 0x5215 (just past 82MB with 4KB pages).
 *
 * Values after the scan on that board:
 *   min_low_pfn: 0x100 (1MB)
 *   max_low_pfn: 0x8f000 (2288MB)
 *   mapstart:    0x5216 (== reserved_end)
 * At this point max_low_pfn is the end of ALL memory, not yet the end
 * of low memory.
 */
for (i = 0; i < boot_mem_map.nr_map; i++) {
unsigned long start, end;
/* Not usable RAM: skip. */
if (boot_mem_map.map[i].type != BOOT_MEM_RAM)
continue;
/* First and end pfn of this region (start rounded up, end rounded down). */
start = PFN_UP(boot_mem_map.map[i].addr);
end = PFN_DOWN(boot_mem_map.map[i].addr
+ boot_mem_map.map[i].size);
/* Track the highest pfn seen. */
if (end > max_low_pfn)
max_low_pfn = end;
/* Track the lowest pfn seen. */
if (start < min_low_pfn)
min_low_pfn = start;
/*
 * The rest of the loop body picks where the bootmem bitmap goes; it
 * must sit at or above reserved_end. A region that lies entirely
 * below the end of the reserved area cannot host it.
 */
if (end <= reserved_end)
continue;
/* A lower candidate has already been found: keep it. */
if (start >= mapstart)
continue;
/*
 * mapstart is the pfn of the first usable page above the reserved
 * area; the bitmap is stored starting at that page.
 */
mapstart = max(reserved_end, start);
}
/* The lowest pfn must be strictly below the highest one. */
if (min_low_pfn >= max_low_pfn)
panic("Incorrect memory mapping !!!");
if (min_low_pfn > ARCH_PFN_OFFSET) {
/*
 * ARCH_PFN_OFFSET is the architecture-defined offset of the first
 * usable pfn (0 on the example board, i.e. pages are usable from
 * pfn 0). min_low_pfn is set to ARCH_PFN_OFFSET further down, so when
 * min_low_pfn is larger the boot loader's first usable address is
 * above the architectural one. The pages in between are not valid
 * memory but still get a struct page each, so what is wasted is the
 * space for those struct page entries (the article points to
 * alloc_node_mem_map for this; not visible here).
 */
pr_info("Wasting %lu bytes for tracking %lu unused pages\n",
(min_low_pfn - ARCH_PFN_OFFSET) * sizeof(struct page),
min_low_pfn - ARCH_PFN_OFFSET);
} else if (min_low_pfn < ARCH_PFN_OFFSET) {
/*
 * The boot loader's first usable address is below the architectural
 * one: the pages in between are valid but will not be used, wasting
 * (ARCH_PFN_OFFSET - min_low_pfn) pages.
 */
pr_info("%lu free pages won't be used\n",
ARCH_PFN_OFFSET - min_low_pfn);
}
/* Pin min_low_pfn to the architecture's lowest pfn (0 on the example board). */
min_low_pfn = ARCH_PFN_OFFSET;
/*
* Determine low and high memory ranges
*/
/* max_low_pfn still holds the end pfn of all memory here. */
max_pfn = max_low_pfn;
/* Work out the high-memory pfn range, if any. */
if (max_low_pfn > PFN_DOWN(HIGHMEM_START)) {
/*
 * HIGHMEM_START is where MIPS high memory begins (reported as
 * 0x20000000, 512MB). Memory above it means high memory exists, so
 * record its pfn range.
 * Example board:
 *   highstart_pfn = 0x20000 (512MB)
 *   highend_pfn   = 0x8f000 (2288MB)
 */
#ifdef CONFIG_HIGHMEM
highstart_pfn = PFN_DOWN(HIGHMEM_START);
highend_pfn = max_low_pfn;
#endif
/*
 * Clamp max_low_pfn to the end of low memory.
 * Example board: max_low_pfn = 0x20000 (512MB).
 */
max_low_pfn = PFN_DOWN(HIGHMEM_START);
}
/*
* Initialize the boot-time allocator with low memory only.
*/
/*
 * Set up the bootmem allocator; it manages low memory only. All bootmem
 * comes from a single node here, hence NODE_DATA(0). Other
 * architectures may spread bootmem over several nodes.
 */
bootmap_size = init_bootmem_node(NODE_DATA(0), mapstart,
min_low_pfn, max_low_pfn);
/*
 * Walk boot_mem_map and register the memory ranges.
 * NOTE(review): unlike the other two loops, this one does not skip
 * entries whose type is not BOOT_MEM_RAM - confirm this is intended.
 */
for (i = 0; i < boot_mem_map.nr_map; i++) {
unsigned long start, end;
/* First and end pfn of this region. */
start = PFN_UP(boot_mem_map.map[i].addr);
end = PFN_DOWN(boot_mem_map.map[i].addr
+ boot_mem_map.map[i].size);
/* The start pfn may not go below min_low_pfn. */
if (start <= min_low_pfn)
start = min_low_pfn;
/* Empty or inverted range after clamping: skip. */
if (start >= end)
continue;
#ifndef CONFIG_HIGHMEM
/* Without high-memory support, end may not exceed max_low_pfn. */
if (end > max_low_pfn)
end = max_low_pfn;
/*
* ... finally, is the area going away?
*/
/* Re-check after clamping the end. */
if (end <= start)
continue;
#endif
/* Register the range as active on node 0. */
add_active_range(0, start, end);
}
/*
* Register fully available low RAM pages with the bootmem allocator.
*/
/*
 * Walk boot_mem_map again and free the usable ranges; init_bootmem_core
 * marked every page as reserved (bits set to 1) earlier.
 * Example board, with reserved_end = 0x5216, two ranges are freed:
 *   start: 0x5216  end: 0xb800   size: 0x65ea
 *   start: 0xc000  end: 0x10000  size: 0x4000
 */
for (i = 0; i < boot_mem_map.nr_map; i++) {
unsigned long start, end, size;
/*
* Reserve usable memory.
*/
/* Not usable RAM: skip. */
if (boot_mem_map.map[i].type != BOOT_MEM_RAM)
continue;
/* First and end pfn of this region. */
start = PFN_UP(boot_mem_map.map[i].addr);
end= PFN_DOWN(boot_mem_map.map[i].addr
+ boot_mem_map.map[i].size);
/*
* We are rounding up the start address of usable memory
* and at the end of the usable range downwards.
*/
/* bootmem manages low memory only: skip regions entirely in high memory. */
if (start >= max_low_pfn)
continue;
/* Free pages below reserved_end are treated as reserved too. */
if (start < reserved_end)
start = reserved_end;
/* bootmem manages low memory only: cut the region at max_low_pfn. */
if (end > max_low_pfn)
end = max_low_pfn;
/*
* ... finally, is the area going away?
*/
/* Nothing left after clamping: skip. */
if (end <= start)
continue;
size = end - start;
/* Register lowmem ranges */
/* Hand this low-memory range to the bootmem allocator as free. */
free_bootmem(PFN_PHYS(start), size << PAGE_SHIFT);
/*
 * The article describes this as CONFIG_DISCONTIGMEM / CONFIG_SPARSEMEM
 * bookkeeping; memory_present() is not visible here.
 */
memory_present(0, start, end);
}
/*
* Reserve the bootmap memory.
*/
/* Re-reserve the pages that hold the bootmem bitmap itself. */
reserve_bootmem(PFN_PHYS(mapstart), bootmap_size, BOOTMEM_DEFAULT);
/*
* Reserve initrd memory if needed.
*/
/* Re-reserve the memory occupied by the initrd. */
finalize_initrd();
}
我的开发板Bootmem分配器初始化完成后,低端内存的分配情况如图所示,标记为“usable”的区域为bootmem管理的可用内存区:
init_bootmem_node
初始化某个节点的bootmem。
参数:
1)pgdat:此节点的struct pglist_data结构。每个struct pglist_data对象表示NUMA的一个内存节点,每个内存节点的bootmem用struct bootmem_data表示。
2)freepfn:保存bootmem位图的页块的首页面pfn
3)startpfn:bootmem内存区的起始pfn
4)endpfn:bootmem内存区的截止pfn
/*
 * init_bootmem_node - initialise the bootmem allocator of one node.
 * @pgdat:    descriptor of the node; its bdata member is initialised
 * @freepfn:  pfn of the first page of the area that stores the bitmap
 * @startpfn: first pfn of the range managed by bootmem
 * @endpfn:   end pfn of the range managed by bootmem
 *
 * Returns the value produced by init_bootmem_core(), i.e. the number of
 * bytes taken by the bitmap.
 */
unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
				unsigned long startpfn, unsigned long endpfn)
{
	unsigned long bitmap_bytes;

	/* All the real work is done on the node's bootmem descriptor. */
	bitmap_bytes = init_bootmem_core(pgdat->bdata, freepfn,
					 startpfn, endpfn);

	return bitmap_bytes;
}
init_bootmem_core
初始化某内存节点的bootmem。
参数:
1)bdata:待初始化的某节点的bootmem
2)mapstart:保存bootmem位图的页块的首页面pfn
3)start:bootmem内存区的起始pfn
4)end:bootmem内存区的截止pfn
/*
 * init_bootmem_core - initialise one node's bootmem descriptor and bitmap.
 * @bdata:    bootmem descriptor of the node being set up
 * @mapstart: pfn of the first page of the area that stores the bitmap
 * @start:    first pfn of the range managed by bootmem
 * @end:      end pfn of the range managed by bootmem
 *
 * Every page in [start, end), holes included, starts out reserved (bit
 * set to 1); the caller must free the usable RAM ranges explicitly.
 *
 * Returns the size of the bitmap in bytes.
 */
static unsigned long __init init_bootmem_core(bootmem_data_t *bdata,
	unsigned long mapstart, unsigned long start, unsigned long end)
{
	unsigned long bitmap_bytes;

	/* May adjust start/end (described as CONFIG_SPARSEMEM related). */
	mminit_validate_memmodel_limits(&start, &end);

	/* Record the managed pfn range and the bitmap's virtual address. */
	bdata->node_min_pfn = start;
	bdata->node_low_pfn = end;
	bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart));

	/*
	 * Register this descriptor with the others; presumably the list is
	 * kept sorted by pfn (link_bootmem is not visible here).
	 */
	link_bootmem(bdata);

	/*
	 * One bit per page. E.g. start = 0 and end = 0x20000 (0-512MB) gives
	 * 128K bits = 16KB = 0x4000 bytes, i.e. four 4KB pages of bitmap.
	 */
	bitmap_bytes = bootmap_bytes(end - start);
	memset(bdata->node_bootmem_map, 0xff, bitmap_bytes);

	bdebug("nid=%td start=%lx map=%lx end=%lx mapsize=%lx\n",
		bdata - bootmem_node_data, start, mapstart, end, bitmap_bytes);

	return bitmap_bytes;
}
add_active_range
添加一个可用的内存区域。
/*
 * add_active_range - register a range of usable pfns for a node.
 * @nid:       node the range belongs to
 * @start_pfn: first pfn of the range
 * @end_pfn:   end pfn of the range
 *
 * The range is merged into an existing early_node_map entry of the same
 * node when it is already covered by it, or when it extends it at one
 * end; otherwise a new entry is appended. If the table is full the range
 * is dropped with a KERN_CRIT message.
 *
 * NOTE(review): a new range that completely encloses an existing entry
 * is not handled specially; the forward merge only moves end_pfn.
 */
void __init add_active_range(unsigned int nid, unsigned long start_pfn,
						unsigned long end_pfn)
{
	int idx;

	mminit_dprintk(MMINIT_TRACE, "memory_register",
			"Entering add_active_range(%d, %#lx, %#lx) "
			"%d entries of %d used\n",
			nid, start_pfn, end_pfn,
			nr_nodemap_entries, MAX_ACTIVE_REGIONS);

	/* May adjust the limits (described as CONFIG_SPARSEMEM related). */
	mminit_validate_memmodel_limits(&start_pfn, &end_pfn);

	/* Try to fold the new range into an entry that is already there. */
	for (idx = 0; idx < nr_nodemap_entries; idx++) {
		const unsigned long cur_start = early_node_map[idx].start_pfn;
		const unsigned long cur_end = early_node_map[idx].end_pfn;

		/* Entries of other nodes can never be merged with. */
		if (early_node_map[idx].nid != nid)
			continue;

		/* Already fully covered by this entry: nothing to do. */
		if (cur_start <= start_pfn && end_pfn <= cur_end)
			return;

		/* Touches or overlaps the tail: grow the entry upwards. */
		if (start_pfn <= cur_end && cur_end < end_pfn) {
			early_node_map[idx].end_pfn = end_pfn;
			return;
		}

		/* Touches or overlaps the head: grow the entry downwards. */
		if (start_pfn < cur_start && cur_start <= end_pfn) {
			early_node_map[idx].start_pfn = start_pfn;
			return;
		}
	}

	/* No merge possible; idx now indexes the first unused slot. */
	if (idx >= MAX_ACTIVE_REGIONS) {
		printk(KERN_CRIT "More than %d memory regions, truncating\n",
							MAX_ACTIVE_REGIONS);
		return;
	}

	/* Append the new entry and bump the entry count. */
	early_node_map[idx].nid = nid;
	early_node_map[idx].start_pfn = start_pfn;
	early_node_map[idx].end_pfn = end_pfn;
	nr_nodemap_entries = idx + 1;
}
reserve_bootmem
保留一块bootmem内存。
/*
 * reserve_bootmem - mark a physical address range as reserved in bootmem.
 * @addr:  physical start address of the range
 * @size:  length of the range in bytes
 * @flags: passed through unchanged to mark_bootmem()
 *
 * The range is widened to whole pages: the start is rounded down and the
 * end is rounded up. With CONFIG_NO_BOOTMEM the function panics.
 *
 * Returns the result of mark_bootmem().
 */
int __init reserve_bootmem(unsigned long addr, unsigned long size,
			    int flags)
{
#ifdef CONFIG_NO_BOOTMEM
	panic("no bootmem");
	return 0;
#else
	unsigned long first_pfn = PFN_DOWN(addr);
	unsigned long end_pfn = PFN_UP(addr + size);

	/* Third argument 1 = reserve, i.e. set the bitmap bits to 1. */
	return mark_bootmem(first_pfn, end_pfn, 1, flags);
#endif
}