1. bootmem_init
内核代码像是一个美丽的女人,把最美丽的地方放在最神秘的位置。在跋山涉水之后,终于到了揭开面纱的时刻。
arch/arm64/mm/init.c
180 void __init bootmem_init(void)
181 {
182 unsigned long min, max;
183
/* PFN range covered by all memblock memory regions */
184 min = PFN_UP(memblock_start_of_DRAM());
185 max = PFN_DOWN(memblock_end_of_DRAM());
186
187 /*
188 * Sparsemem tries to allocate bootmem in memory_present(), so must be
189 * done after the fixed reservations.
190 */
/* mark every memblock memory region as "present" in the sparsemem model */
191 arm64_memory_present();
192
/* allocate the mem_map (struct page array) for each present section */
193 sparse_init();
/* compute zone_size/zhole_size and initialise node 0's pgdat and zones */
194 zone_sizes_init(min, max);
195
/* first virtual address past the end of RAM */
196 high_memory = __va((max << PAGE_SHIFT) - 1) + 1;
197 max_pfn = max_low_pfn = max;
198 }
199
bootmem_init的主要功能,由191、193、194行的函数实现。
CONFIG_SPARSEMEM在内核编译中有设置。
124 static void arm64_memory_present(void)
125 {
126 struct memblock_region *reg;
127
/* record each memblock memory region against node id 0 */
128 for_each_memblock(memory, reg)
129 memory_present(0, memblock_region_memory_base_pfn(reg),
130 memblock_region_memory_end_pfn(reg));
131 }
mm/sparse.c
/* Record a memory area against a node. */
void __init memory_present(int nid, unsigned long start, unsigned long end)
{
unsigned long pfn;
/* round start down to a section boundary, then clamp to model limits */
start &= PAGE_SECTION_MASK;
mminit_validate_memmodel_limits(&start, &end);
/* walk the PFN range one sparsemem section at a time */
for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
unsigned long section = pfn_to_section_nr(pfn);
struct mem_section *ms;
/* make sure the mem_section entry for this index exists */
sparse_index_init(section, nid);
set_section_nid(section, nid);
ms = __nr_to_section(section);
/* encode the early node id and mark the section present */
if (!ms->section_mem_map)
ms->section_mem_map = sparse_encode_early_nid(nid) |
SECTION_MARKED_PRESENT;
}
}
2. sparse_init
468 void __init sparse_init(void)
469 {
/* NOTE: kernel lines 470-578 are omitted in this excerpt */
579
/* for every present section, attach its usemap and mem_map */
580 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
581 if (!present_section_nr(pnum))
582 continue;
583
584 usemap = usemap_map[pnum];
585 if (!usemap)
586 continue;
587
588 #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
589 map = map_map[pnum];
590 #else
/* allocate this section's mem_map individually */
591 map = sparse_early_mem_map_alloc(pnum);
592 #endif
593 if (!map)
594 continue;
595
/* record map and usemap in the mem_section entry */
596 sparse_init_one_section(__nr_to_section(pnum), pnum, map,
597 usemap);
598 }
599
600 vmemmap_populate_print_last();
601
/* release the temporary bootmem arrays used above */
602 #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
603 free_bootmem(__pa(map_map), size2);
604 #endif
605 free_bootmem(__pa(usemap_map), size);
606 }
607
sparse_init是一个相当复杂的函数(470-578行在此省略),591行分配mem_map。
443 static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
444 {
445 struct page *map;
446 struct mem_section *ms = __nr_to_section(pnum);
/* node id was encoded into section_mem_map by memory_present() */
447 int nid = sparse_early_nid(ms);
448
/* build the vmemmap backing for this section */
449 map = sparse_mem_map_populate(pnum, nid);
450 if (map)
451 return map;
452
453 printk(KERN_ERR "%s: sparsemem memory map backing failed "
454 "some memory will not be available.\n", __func__);
/* failure: clear the entry (drops the PRESENT mark set earlier) */
455 ms->section_mem_map = 0;
456 return NULL;
457 }
mm/sparse-vmemmap.c
struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid)
{
unsigned long start;
unsigned long end;
struct page *map;
/* virtual address of this section's slot in the vmemmap array */
map = pfn_to_page(pnum * PAGES_PER_SECTION);
start = (unsigned long)map;
end = (unsigned long)(map + PAGES_PER_SECTION);
/* back [start, end) of the vmemmap with real memory; NULL on failure */
if (vmemmap_populate(start, end, nid))
return NULL;
return map;
}
arch/arm64/mm/mmu.c
399 int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
400 {
401 unsigned long addr = start;
402 unsigned long next;
403 pgd_t *pgd;
404 pud_t *pud;
405 pmd_t *pmd;
406
/* walk [start, end) in PMD-sized steps */
407 do {
408 next = pmd_addr_end(addr, end);
409
/* ensure the pgd and pud entries covering addr exist */
410 pgd = vmemmap_pgd_populate(addr, node);
411 if (!pgd)
412 return -ENOMEM;
413
414 pud = vmemmap_pud_populate(pgd, addr, node);
415 if (!pud)
416 return -ENOMEM;
417
418 pmd = pmd_offset(pud, addr);
419 if (pmd_none(*pmd)) {
420 void *p = NULL;
421
/* allocate a PMD_SIZE chunk to back this piece of the vmemmap */
422 p = vmemmap_alloc_block_buf(PMD_SIZE, node);
423 if (!p)
424 return -ENOMEM;
425
/* install its physical address as a section (block) mapping */
426 set_pmd(pmd, __pmd(__pa(p) | prot_sect_kernel));
427 } else
/* already populated */
428 vmemmap_verify((pte_t *)pmd, node, addr, next);
429 } while (addr = next, addr != end);
430
431 return 0;
432 }
422行分配一块PMD_SIZE大小的内存,426行把它的物理地址以section(块)映射的形式写入PMD中。
49 void * __meminit vmemmap_alloc_block(unsigned long size, int node)
50 {
51 /* If the main allocator is up use that, fallback to bootmem. */
52 if (slab_is_available()) {
53 struct page *page;
54
/* prefer the requested node when it actually has memory */
55 if (node_state(node, N_HIGH_MEMORY))
56 page = alloc_pages_node(
57 node, GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT,
58 get_order(size));
59 else
60 page = alloc_pages(
61 GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT,
62 get_order(size));
63 if (page)
64 return page_address(page);
65 return NULL;
66 } else
/* buddy allocator not up yet: use the early boot allocator */
67 return __earlyonly_bootmem_alloc(node, size, size,
68 __pa(MAX_DMA_ADDRESS));
69 }
70
71 /* need to make sure size is all the same during early stage */
72 void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node)
73 {
74 void *ptr;
75
/* no preallocated buffer: allocate directly */
76 if (!vmemmap_buf)
77 return vmemmap_alloc_block(size, node);
78
79 /* take the from buf */
80 ptr = (void *)ALIGN((unsigned long)vmemmap_buf, size);
/* buffer exhausted: fall back to a direct allocation */
81 if (ptr + size > vmemmap_buf_end)
82 return vmemmap_alloc_block(size, node);
83
/* advance the buffer cursor past the chunk just handed out */
84 vmemmap_buf = ptr + size;
85
86 return ptr;
87 }
/* thin early-boot wrapper; ultimately ends up in memblock_alloc (mm/nobootmem.c) */
38 static void * __init_refok __earlyonly_bootmem_alloc(int node,
39 unsigned long size,
40 unsigned long align,
41 unsigned long goal)
42 {
43 return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal);
44 }
如果你对这个内存分配十分感兴趣,在mm/nobootmem.c中可以找到函数定义,最终会调用到memblock_alloc。
sparse_init完成后,每个present的内存section都建立了对应的mem_map(struct page数组)及其页表映射。
3. zone_sizes_init
arch/arm64/mm/init.c
72 static void __init zone_sizes_init(unsigned long min, unsigned long max)
73 {
74 struct memblock_region *reg;
75 unsigned long zone_size[MAX_NR_ZONES], zhole_size[MAX_NR_ZONES];
76 unsigned long max_dma32 = min;
77
78 memset(zone_size, 0, sizeof(zone_size));
79
80 #ifdef CONFIG_ZONE_DMA32
81 /* 4GB maximum for 32-bit only capable devices */
82 max_dma32 = max(min, min(max, MAX_DMA32_PFN));
83 zone_size[ZONE_DMA32] = max_dma32 - min;
84 #endif
85 zone_size[ZONE_NORMAL] = max - max_dma32;
86
/* start zhole_size at the full span; subtract real memory below so only holes remain */
87 memcpy(zhole_size, zone_size, sizeof(zhole_size));
88
89 for_each_memblock(memory, reg) {
90 unsigned long start = memblock_region_memory_base_pfn(reg);
91 unsigned long end = memblock_region_memory_end_pfn(reg);
92
93 if (start >= max)
94 continue;
/* part of this region that falls inside ZONE_DMA32 */
95 #ifdef CONFIG_ZONE_DMA32
96 if (start < max_dma32) {
97 unsigned long dma_end = min(end, max_dma32);
98 zhole_size[ZONE_DMA32] -= dma_end - start;
99 }
100 #endif
/* part of this region that falls inside ZONE_NORMAL */
101 if (end > max_dma32) {
102 unsigned long normal_end = min(end, max);
103 unsigned long normal_start = max(start, max_dma32);
104 zhole_size[ZONE_NORMAL] -= normal_end - normal_start;
105 }
106 }
107
/* single node: initialise node 0 with the computed sizes */
108 free_area_init_node(0, zone_size, min, zhole_size);
109 }
110
73-107行,主要是设置zone_size, zhole_size两个变量。
mm/page_alloc.c
4887 void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4888 unsigned long node_start_pfn, unsigned long *zholes_size)
4889 {
4890 pg_data_t *pgdat = NODE_DATA(nid);
4891
4892 /* pg_data_t should be reset to zero when it's allocated */
4893 WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
4894
4895 pgdat->node_id = nid;
4896 pgdat->node_start_pfn = node_start_pfn;
4897 init_zone_allows_reclaim(nid);
/* sum spanned/present pages of this node from the zone sizes and holes */
4898 calculate_node_totalpages(pgdat, zones_size, zholes_size);
4899
4900 alloc_node_mem_map(pgdat);
4901 #ifdef CONFIG_FLAT_NODE_MEM_MAP
4902 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
4903 nid, (unsigned long)pgdat,
4904 (unsigned long)pgdat->node_mem_map);
4905 #endif
4906
/* the bulk of the work: initialise every zone of this node */
4907 free_area_init_core(pgdat, zones_size, zholes_size);
4908 }
4888-4906对Node的pgdat的部分变量进行设置。4907完成主要工作。
4754 static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4755 unsigned long *zones_size, unsigned long *zholes_size)
4756 {
4757 enum zone_type j;
4758 int nid = pgdat->node_id;
4759 unsigned long zone_start_pfn = pgdat->node_start_pfn;
4760 int ret;
4761
/* per-node state: resize lock, kswapd/pfmemalloc waitqueues, page cgroup */
4762 pgdat_resize_init(pgdat);
4763 #ifdef CONFIG_NUMA_BALANCING
4764 spin_lock_init(&pgdat->numabalancing_migrate_lock);
4765 pgdat->numabalancing_migrate_nr_pages = 0;
4766 pgdat->numabalancing_migrate_next_window = jiffies;
4767 #endif
4768 init_waitqueue_head(&pgdat->kswapd_wait);
4769 init_waitqueue_head(&pgdat->pfmemalloc_wait);
4770 pgdat_page_cgroup_init(pgdat);
4771
/* initialise each zone of this node in turn */
4772 for (j = 0; j < MAX_NR_ZONES; j++) {
4773 struct zone *zone = pgdat->node_zones + j;
4774 unsigned long size, realsize, freesize, memmap_pages;
4775
/* size = spanned pages; realsize = spanned minus holes */
4776 size = zone_spanned_pages_in_node(nid, j, zones_size);
4777 realsize = freesize = size - zone_absent_pages_in_node(nid, j,
4778 zholes_size);
4779
4780 /*
4781 * Adjust freesize so that it accounts for how much memory
4782 * is used by this zone for memmap. This affects the watermark
4783 * and per-cpu initialisations
4784 */
4785 memmap_pages = calc_memmap_size(size, realsize);
4786 if (freesize >= memmap_pages) {
4787 freesize -= memmap_pages;
4788 if (memmap_pages)
4789 printk(KERN_DEBUG
4790 " %s zone: %lu pages used for memmap\n",
4791 zone_names[j], memmap_pages);
4792 } else
4793 printk(KERN_WARNING
4794 " %s zone: %lu pages exceeds freesize %lu\n",
4795 zone_names[j], memmap_pages, freesize);
4796
4797 /* Account for reserved pages */
4798 if (j == 0 && freesize > dma_reserve) {
4799 freesize -= dma_reserve;
4800 printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
4801 zone_names[0], dma_reserve);
4802 }
4803
/* update global page accounting */
4804 if (!is_highmem_idx(j))
4805 nr_kernel_pages += freesize;
4806 /* Charge for highmem memmap if there are enough kernel pages */
4807 else if (nr_kernel_pages > memmap_pages * 2)
4808 nr_kernel_pages -= memmap_pages;
4809 nr_all_pages += freesize;
4810
4811 zone->spanned_pages = size;
4812 zone->present_pages = realsize;
4813 /*
4814 * Set an approximate value for lowmem here, it will be adjusted
4815 * when the bootmem allocator frees pages into the buddy system.
4816 * And all highmem pages will be managed by the buddy system.
4817 */
4818 zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
4819 #ifdef CONFIG_NUMA
4820 zone->node = nid;
4821 zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
4822 / 100;
4823 zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
4824 #endif
/* basic zone fields, locks, per-cpu pagesets and LRU vectors */
4825 zone->name = zone_names[j];
4826 spin_lock_init(&zone->lock);
4827 spin_lock_init(&zone->lru_lock);
4828 zone_seqlock_init(zone);
4829 zone->zone_pgdat = pgdat;
4830
4831 zone_pcp_init(zone);
4832 lruvec_init(&zone->lruvec);
/* an empty zone needs none of the setup below */
4833 if (!size)
4834 continue;
4835
/* allocate the pageblock usemap, init free lists, then the zone's struct pages */
4836 set_pageblock_order();
4837 setup_usemap(pgdat, zone, zone_start_pfn, size);
4838 ret = init_currently_empty_zone(zone, zone_start_pfn,
4839 size, MEMMAP_EARLY);
4840 BUG_ON(ret);
4841 memmap_init(size, nid, j, zone_start_pfn);
/* zones are laid out back to back in PFN space */
4842 zone_start_pfn += size;
4843 }
4844 }
至此Node的pgdat和Zone设置完毕。