bootmem是内核启动时使用的物理内存分配器,根据e820map中的可用内存来初始化bootmem可用内存;
bootmem启动之前分配的内存放入early_res预留内存区间中,初始化bootmem时将已经分配出去的内存在bootmem系统中标识为已分配
bootmem管理的是页帧,而e820map和early_res管理的是区间
e820map注册进bootmem时会对区间做页对齐操作;区间起始地址roundup,区间终止地址rounddown,见e820_register_active_regions->e820_find_active_region
early_res注入bootmem时会对区间做页对齐操作;区间起始地址rounddown,区间终止地址roundup,见early_res_to_bootmem->reserve_bootmem_generic->reserve_bootmem
mm/bootmem.c:
1 /*
2 * bootmem - A boot-time physical memory allocator and configurator
3 *
4 * Copyright (C) 1999 Ingo Molnar
5 * 1999 Kanoj Sarcar, SGI
6 * 2008 Johannes Weiner
7 *
8 * Access to this subsystem has to be serialized externally (which is true
9 * for the boot process anyway).
10 */
I、bootmem数据结构
include/linux/bootmem.h:
26 /*
27 * node_bootmem_map is a map pointer - the bits represent all physical
28 * memory pages (including holes) on the node.
29 */
30 typedef struct bootmem_data {
31 unsigned long node_min_pfn;
32 unsigned long node_low_pfn;
33 void *node_bootmem_map;
34 unsigned long last_end_off;
35 unsigned long hint_idx;
36 struct list_head list;
37 } bootmem_data_t;
mm/bootmem.c:
35 bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata;
36
37 static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
bootmem_node_data通过list组成链表,链表头为bdata_list;每个NUMA node对应一个链表节点。若x86使用UMA内存模型,则只有一个节点。
node_min_pfn:起始页
node_low_pfn:终止页(不包含在内,即节点最后一页的下一个页帧号)
node_bootmem_map:页帧位图
last_end_off:上次分配的内存末尾相对于节点起始地址的字节偏移;若它未按页对齐,说明上次分配的最后一页还有剩余空间
hint_idx:分配内存起始位置索引,分配内存时从第hint_idx开始查找空闲位图块;如果没找到才从头开始查找
list:组成链表
bootmem使用位图表示页帧的使用情况,bit-1表示保留(不可用)内存,bit-0表示可用内存
bdata_list是按node_min_pfn大小递增的有序链表
II、bootmem启动
1.节点bootmem初始化
bootmem初始化时将位图所有位置1,表示所有的页都已经保留;由setup_arch显式地清零对应位来表示可用内存,如setup_arch->initmem_init->setup_bootmem_allocator->setup_node_bootmem->free_bootmem_with_active_regions将可用内存页帧位置0
90 /*
91 * Called once to set up the allocator itself.
92 */
93 static unsigned long __init init_bootmem_core(bootmem_data_t *bdata,
94 unsigned long mapstart, unsigned long start, unsigned long end)
95 {
96 unsigned long mapsize;
97
98 mminit_validate_memmodel_limits(&start, &end);
99 bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart));
100 bdata->node_min_pfn = start;
101 bdata->node_low_pfn = end;
102 link_bootmem(bdata);
103
104 /*
105 * Initially all pages are reserved - setup_arch() has to
106 * register free RAM areas explicitly.
107 */
108 mapsize = bootmap_bytes(end - start);
109 memset(bdata->node_bootmem_map, 0xff, mapsize);
110
111 bdebug("nid=%td start=%lx map=%lx end=%lx mapsize=%lx\n",
112 bdata - bootmem_node_data, start, mapstart, end, mapsize);
113
114 return mapsize;
115 }
116
117 /**
118 * init_bootmem_node - register a node as boot memory
119 * @pgdat: node to register
120 * @freepfn: pfn where the bitmap for this node is to be placed
121 * @startpfn: first pfn on the node
122 * @endpfn: first pfn after the node
123 *
124 * Returns the number of bytes needed to hold the bitmap for this node.
125 */
126 unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
127 unsigned long startpfn, unsigned long endpfn)
128 {
129 return init_bootmem_core(pgdat->bdata, freepfn, startpfn, endpfn);
130 }
2.e820map可用内存注入到bootmem中
当节点bootmem初始化完成后,所有页帧标识为已用;将e820map中可用内存注入到bootmem中;e820map先放入active_regions,再从active_regions注入到bootmem中
a.e820map->active_regions
setup_arch->initmem_init->e820_register_active_regions
1154 /*
1155 * Finds an active region in the address range from start_pfn to last_pfn and
1156 * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
1157 */
1158 int __init e820_find_active_region(const struct e820entry *ei,
1159 unsigned long start_pfn,
1160 unsigned long last_pfn,
1161 unsigned long *ei_startpfn,
1162 unsigned long *ei_endpfn)
1163 {
1164 u64 align = PAGE_SIZE;
1165
1166 *ei_startpfn = round_up(ei->addr, align) >> PAGE_SHIFT;
1167 *ei_endpfn = round_down(ei->addr + ei->size, align) >> PAGE_SHIFT;
1168
1169 /* Skip map entries smaller than a page */
1170 if (*ei_startpfn >= *ei_endpfn)
1171 return 0;
1172
1173 /* Skip if map is outside the node */
1174 if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
1175 *ei_startpfn >= last_pfn)
1176 return 0;
1177
1178 /* Check for overlaps */
1179 if (*ei_startpfn < start_pfn)
1180 *ei_startpfn = start_pfn;
1181 if (*ei_endpfn > last_pfn)
1182 *ei_endpfn = last_pfn;
1183
1184 return 1;
1185 }
1186
1187 /* Walk the e820 map and register active regions within a node */
1188 void __init e820_register_active_regions(int nid, unsigned long start_pfn,
1189 unsigned long last_pfn)
1190 {
1191 unsigned long ei_startpfn;
1192 unsigned long ei_endpfn;
1193 int i;
1194
1195 for (i = 0; i < e820.nr_map; i++)
1196 if (e820_find_active_region(&e820.map[i],
1197 start_pfn, last_pfn,
1198 &ei_startpfn, &ei_endpfn))
1199 add_active_range(nid, ei_startpfn, ei_endpfn);
1200 }
b.active_regions->bootmem
setup_arch->initmem_init->setup_bootmem_allocator->setup_node_bootmem->free_bootmem_with_active_regions
3410 /**
3411 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
3412 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
3413 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
3414 *
3415 * If an architecture guarantees that all ranges registered with
3416 * add_active_ranges() contain no holes and may be freed, this
3417 * this function may be used instead of calling free_bootmem() manually.
3418 */
3419 void __init free_bootmem_with_active_regions(int nid,
3420 unsigned long max_low_pfn)
3421 {
3422 int i;
3423
3424 for_each_active_range_index_in_nid(i, nid) {
3425 unsigned long size_pages = 0;
3426 unsigned long end_pfn = early_node_map[i].end_pfn;
3427
3428 if (early_node_map[i].start_pfn >= max_low_pfn)
3429 continue;
3430
3431 if (end_pfn > max_low_pfn)
3432 end_pfn = max_low_pfn;
3433
3434 size_pages = end_pfn - early_node_map[i].start_pfn;
3435 free_bootmem_node(NODE_DATA(early_node_map[i].nid),
3436 PFN_PHYS(early_node_map[i].start_pfn),
3437 size_pages << PAGE_SHIFT);
3438 }
3439 }
3.early_res预留内存注入到bootmem中
将bootmem启动之前分配的内存放入early_res中,bootmem初始化时将预留的内存在bootmem中标识为1,表示已经分配。
setup_arch->initmem_init->setup_bootmem_allocator->setup_node_bootmem->early_res_to_bootmem
arch/x86/kernel/e820.c:
917 void __init early_res_to_bootmem(u64 start, u64 end)
918 {
919 int i, count;
920 u64 final_start, final_end;
921
922 count = 0;
923 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++)
924 count++;
925
926 printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n",
927 count, start, end);
928 for (i = 0; i < count; i++) {
929 struct early_res *r = &early_res[i];
930 printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
931 r->start, r->end, r->name);
932 final_start = max(start, r->start);
933 final_end = min(end, r->end);
934 if (final_start >= final_end) {
935 printk(KERN_CONT "\n");
936 continue;
937 }
938 printk(KERN_CONT " ==> [%010llx - %010llx]\n",
939 final_start, final_end);
940 reserve_bootmem_generic(final_start, final_end - final_start,
941 BOOTMEM_DEFAULT);
942 }
943 }
III、bootmem内存分配
1、alloc_bootmem_core是核心bootmem分配内存函数;alloc_bootmem、alloc_bootmem_pages、alloc_bootmem_node等都是对它的封装
434 static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
435 unsigned long size, unsigned long align,
436 unsigned long goal, unsigned long limit)
437 {
438 unsigned long fallback = 0;
439 unsigned long min, max, start, sidx, midx, step;
440
441 bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n",
442 bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT,
443 align, goal, limit);
444
445 BUG_ON(!size);
446 BUG_ON(align & (align - 1));
447 BUG_ON(limit && goal + size > limit);
448
449 if (!bdata->node_bootmem_map)
450 return NULL;
451
452 min = bdata->node_min_pfn;
453 max = bdata->node_low_pfn;
454
455 goal >>= PAGE_SHIFT;
456 limit >>= PAGE_SHIFT;
457
458 if (limit && max > limit)
459 max = limit;
460 if (max <= min)
461 return NULL;
462
463 step = max(align >> PAGE_SHIFT, 1UL);
464
465 if (goal && min < goal && goal < max)
466 start = ALIGN(goal, step);
467 else
468 start = ALIGN(min, step);
469
470 sidx = start - bdata->node_min_pfn;
471 midx = max - bdata->node_min_pfn;
472
473 if (bdata->hint_idx > sidx) {
474 /*
475 * Handle the valid case of sidx being zero and still
476 * catch the fallback below.
477 */
478 fallback = sidx + 1;
479 sidx = align_idx(bdata, bdata->hint_idx, step);
480 }
481
482 while (1) {
483 int merge;
484 void *region;
485 unsigned long eidx, i, start_off, end_off;
486 find_block:
487 sidx = find_next_zero_bit(bdata->node_bootmem_map, midx, sidx);
488 sidx = align_idx(bdata, sidx, step);
489 eidx = sidx + PFN_UP(size);
490
491 if (sidx >= midx || eidx > midx)
492 break;
493
494 for (i = sidx; i < eidx; i++)
495 if (test_bit(i, bdata->node_bootmem_map)) {
496 sidx = align_idx(bdata, i, step);
497 if (sidx == i)
498 sidx += step;
499 goto find_block;
500 }
501
502 if (bdata->last_end_off & (PAGE_SIZE - 1) &&
503 PFN_DOWN(bdata->last_end_off) + 1 == sidx)
504 start_off = align_off(bdata, bdata->last_end_off, align);
505 else
506 start_off = PFN_PHYS(sidx);
507
508 merge = PFN_DOWN(start_off) < sidx;
509 end_off = start_off + size;
510
511 bdata->last_end_off = end_off;
512 bdata->hint_idx = PFN_UP(end_off);
513
514 /*
515 * Reserve the area now:
516 */
517 if (__reserve(bdata, PFN_DOWN(start_off) + merge,
518 PFN_UP(end_off), BOOTMEM_EXCLUSIVE))
519 BUG();
520
521 region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) +
522 start_off);
523 memset(region, 0, size);
524 /*
525 * The min_count is set to 0 so that bootmem allocated blocks
526 * are never reported as leaks.
527 */
528 kmemleak_alloc(region, size, 0, 0);
529 return region;
530 }
531
532 if (fallback) {
533 sidx = align_idx(bdata, fallback - 1, step);
534 fallback = 0;
535 goto find_block;
536 }
537
538 return NULL;
539 }
a、先确定起始索引sidx:goal落在(node_min_pfn, 上限)之间时取goal,否则取node_min_pfn,并按step对齐;若hint_idx大于sidx,则先从hint_idx开始查找空闲内存块,没有找到再回退到sidx重新查找
b、查找PFN_UP(size)个连续空闲的页帧;若其中第i页已被保留,则将起始位置移到i按step对齐后的位置(若对齐后仍等于i则再后移step)继续查找。
c、如果last_end_off未按页对齐,且它所在的页正好是查找到的空闲内存块的上一页,则返回地址从last_end_off(align对齐后)开始,以复用上一页的剩余空间;否则返回地址为空闲块起始页地址
d、保留页帧
e、申请内存块清空为全0
IV、bootmem内存回收
347 /**
348 * free_bootmem - mark a page range as usable
349 * @addr: starting address of the range
350 * @size: size of the range in bytes
351 *
352 * Partial pages will be considered reserved and left as they are.
353 *
354 * The range must be contiguous but may span node boundaries.
355 */
356 void __init free_bootmem(unsigned long addr, unsigned long size)
357 {
358 unsigned long start, end;
359
360 kmemleak_free_part(__va(addr), size);
361
362 start = PFN_UP(addr);
363 end = PFN_DOWN(addr + size);
364
365 mark_bootmem(start, end, 0, 0);
366 }
将起始地址向上页对齐,终止地址向下页对齐后,将这些完整的页帧标为可用(即释放);首尾不足一页的部分仍保持保留状态
V、bootmem销毁
在伙伴系统启动后,将bootmem中的空闲内存释放到buddy系统中;在buddy系统启用后不再使用bootmem
146 static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
147 {
148 int aligned;
149 struct page *page;
150 unsigned long start, end, pages, count = 0;
151
152 if (!bdata->node_bootmem_map)
153 return 0;
154
155 start = bdata->node_min_pfn;
156 end = bdata->node_low_pfn;
157
158 /*
159 * If the start is aligned to the machines wordsize, we might
160 * be able to free pages in bulks of that order.
161 */
162 aligned = !(start & (BITS_PER_LONG - 1));
163
164 bdebug("nid=%td start=%lx end=%lx aligned=%d\n",
165 bdata - bootmem_node_data, start, end, aligned);
166
167 while (start < end) {
168 unsigned long *map, idx, vec;
169
170 map = bdata->node_bootmem_map;
171 idx = start - bdata->node_min_pfn;
172 vec = ~map[idx / BITS_PER_LONG];
173
174 if (aligned && vec == ~0UL && start + BITS_PER_LONG < end) {
175 int order = ilog2(BITS_PER_LONG);
176
177 __free_pages_bootmem(pfn_to_page(start), order);
178 count += BITS_PER_LONG;
179 } else {
180 unsigned long off = 0;
181
182 while (vec && off < BITS_PER_LONG) {
183 if (vec & 1) {
184 page = pfn_to_page(start + off);
185 __free_pages_bootmem(page, 0);
186 count++;
187 }
188 vec >>= 1;
189 off++;
190 }
191 }
192 start += BITS_PER_LONG;
193 }
194
195 page = virt_to_page(bdata->node_bootmem_map);
196 pages = bdata->node_low_pfn - bdata->node_min_pfn;
197 pages = bootmem_bootmap_pages(pages);
198 count += pages;
199 while (pages--)
200 __free_pages_bootmem(page++, 0);
201
202 bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
203
204 return count;
205 }
219 /**
220 * free_all_bootmem - release free pages to the buddy allocator
221 *
222 * Returns the number of pages actually released.
223 */
224 unsigned long __init free_all_bootmem(void)
225 {
226 return free_all_bootmem_core(NODE_DATA(0)->bdata);
227 }
1.将bootmem中的空闲内存释放到buddy系统中;如果bootmem起始页帧号按BITS_PER_LONG对齐,且位图中当前整个字对应的BITS_PER_LONG个页全部空闲(并且未到达末尾),则按order=ilog2(BITS_PER_LONG)一次性批量注入到buddy系统,否则一页一页地注入到buddy系统
2.将bootmem用于标识内存使用情况的位图内存释放到buddy系统中