Linux 内存管理(4)paging_init(2)

1. bootmem_init

内核代码像是一个美丽的女人,把最美丽的地方放在最神秘的位置。在跋山涉水之后,终于到了揭开面纱的时刻。


arch/arm64/mm/init.c

/*
 * bootmem_init() - early memory-management setup on arm64.
 * Marks all memblock-managed RAM present for sparsemem, builds the
 * sparse mem_map, and computes per-zone sizes and holes.
 * (Kernel source quoted with its original line numbers.)
 */
180 void __init bootmem_init(void)
181 {
182         unsigned long min, max;
183
184         min = PFN_UP(memblock_start_of_DRAM());    /* first fully-contained page frame */
185         max = PFN_DOWN(memblock_end_of_DRAM());    /* one past the last full page frame */
186
187         /*
188          * Sparsemem tries to allocate bootmem in memory_present(), so must be
189          * done after the fixed reservations.
190          */
191         arm64_memory_present();
192
193         sparse_init();
194         zone_sizes_init(min, max);
195
196         high_memory = __va((max << PAGE_SHIFT) - 1) + 1;    /* first VA above RAM */
197         max_pfn = max_low_pfn = max;
198 }
199

bootmem_init的主要功能,由191,193,194行调用的三个函数实现。

CONFIG_SPARSEMEM在内核编译中有设置。


/*
 * arm64_memory_present() - register every memblock "memory" region with
 * sparsemem so the covering mem_section entries are marked present.
 * The node id is hard-coded to 0 (single-node configuration here).
 */
124 static void arm64_memory_present(void)
125 {
126         struct memblock_region *reg;
127
128         for_each_memblock(memory, reg)
129                 memory_present(0, memblock_region_memory_base_pfn(reg),
130                                memblock_region_memory_end_pfn(reg));
131 }

mm/sparse.c

/* Record a memory area against a node. */
/* Record a memory area against a node. */
/*
 * For each section-sized chunk of PFNs in [start, end), initialise the
 * mem_section entry and mark it present, temporarily encoding the node
 * id in section_mem_map until the real mem_map is attached later.
 */
void __init memory_present(int nid, unsigned long start, unsigned long end)
{
        unsigned long pfn;

        start &= PAGE_SECTION_MASK;    /* round start down to a section boundary */
        mminit_validate_memmodel_limits(&start, &end);
        for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
                unsigned long section = pfn_to_section_nr(pfn);
                struct mem_section *ms;

                sparse_index_init(section, nid);
                set_section_nid(section, nid);

                ms = __nr_to_section(section);
                if (!ms->section_mem_map)
                        /* not yet present: stash the early nid and set PRESENT */
                        ms->section_mem_map = sparse_encode_early_nid(nid) |
                                                        SECTION_MARKED_PRESENT;
        }
}

2 sparse_init

/*
 * sparse_init() - excerpt; the article elides original lines 470-578
 * (presumably the allocation of the temporary usemap_map[]/map_map[]
 * arrays -- verify against mm/sparse.c of this kernel version).
 * The loop below attaches a usemap and a mem_map to every present section.
 */
468 void __init sparse_init(void)
469 {

/* ... original lines 470-578 elided by the article ... */

579
580         for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
581                 if (!present_section_nr(pnum))
582                         continue;
583
584                 usemap = usemap_map[pnum];
585                 if (!usemap)
586                         continue;
587
588 #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
589                 map = map_map[pnum];
590 #else
591                 map = sparse_early_mem_map_alloc(pnum);    /* build this section's mem_map */
592 #endif
593                 if (!map)
594                         continue;
595
596                 sparse_init_one_section(__nr_to_section(pnum), pnum, map,
597                                                                 usemap);
598         }
599
600         vmemmap_populate_print_last();
601
602 #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
603         free_bootmem(__pa(map_map), size2);
604 #endif
605         free_bootmem(__pa(usemap_map), size);    /* temporary pointer array no longer needed */
606 }
607
607

sparse_init是一个相当复杂的函数,591行(未配置CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER时)为每个present的section分配mem_map。


/*
 * sparse_early_mem_map_alloc() - build the struct page array (mem_map)
 * for one section during early boot.  On failure the section's
 * section_mem_map is cleared so it is no longer considered present.
 */
443 static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
444 {
445         struct page *map;
446         struct mem_section *ms = __nr_to_section(pnum);
447         int nid = sparse_early_nid(ms);    /* nid encoded earlier by memory_present() */
448
449         map = sparse_mem_map_populate(pnum, nid);
450         if (map)
451                 return map;
452
453         printk(KERN_ERR "%s: sparsemem memory map backing failed "
454                         "some memory will not be available.\n", __func__);
455         ms->section_mem_map = 0;    /* drop the PRESENT mark on failure */
456         return NULL;
457 }

mm/sparse-vmemmap.c

/*
 * sparse_mem_map_populate() - vmemmap flavour: the section's mem_map has
 * a fixed virtual address (pfn_to_page() of the section's first pfn
 * inside the vmemmap region); only the backing page tables and physical
 * pages need to be populated.
 */
struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid)
{
        unsigned long start;
        unsigned long end;
        struct page *map;

        map = pfn_to_page(pnum * PAGES_PER_SECTION);    /* VA in the vmemmap region */
        start = (unsigned long)map;
        end = (unsigned long)(map + PAGES_PER_SECTION);

        if (vmemmap_populate(start, end, nid))
                return NULL;    /* could not back the range with memory */

        return map;
}

arch/arm64/mm/mmu.c

/*
 * vmemmap_populate() - arm64: back the vmemmap range [start, end) with
 * PMD-level section mappings, allocating a PMD_SIZE chunk of physical
 * memory for each empty PMD.  Returns 0 on success, -ENOMEM on failure.
 */
399 int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
400 {
401         unsigned long addr = start;
402         unsigned long next;
403         pgd_t *pgd;
404         pud_t *pud;
405         pmd_t *pmd;
406
407         do {
408                 next = pmd_addr_end(addr, end);    /* advance one PMD span per iteration */
409
410                 pgd = vmemmap_pgd_populate(addr, node);
411                 if (!pgd)
412                         return -ENOMEM;
413
414                 pud = vmemmap_pud_populate(pgd, addr, node);
415                 if (!pud)
416                         return -ENOMEM;
417
418                 pmd = pmd_offset(pud, addr);
419                 if (pmd_none(*pmd)) {
420                         void *p = NULL;
421
422                         p = vmemmap_alloc_block_buf(PMD_SIZE, node);
423                         if (!p)
424                                 return -ENOMEM;
425
426                         /* install a section (block) mapping, no PTE level */
427                         set_pmd(pmd, __pmd(__pa(p) | prot_sect_kernel));
428                 } else
429                         vmemmap_verify((pte_t *)pmd, node, addr, next);
430         } while (addr = next, addr != end);
431
432         return 0;
433 }

422行分配一块PMD_SIZE大小的物理内存(用作一段vmemmap的后备存储),426行把它的物理地址写入PMD,建立section映射。

/*
 * vmemmap_alloc_block() - allocate a zeroed block for vmemmap backing:
 * from the page allocator once slab is up, otherwise from bootmem.
 */
 49 void * __meminit vmemmap_alloc_block(unsigned long size, int node)
 50 {
 51         /* If the main allocator is up use that, fallback to bootmem. */
 52         if (slab_is_available()) {
 53                 struct page *page;
 54
 55                 if (node_state(node, N_HIGH_MEMORY))
 56                         page = alloc_pages_node(
 57                                 node, GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT,
 58                                 get_order(size));
 59                 else
 60                         page = alloc_pages(
 61                                 GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT,
 62                                 get_order(size));
 63                 if (page)
 64                         return page_address(page);
 65                 return NULL;
 66         } else
 67                 return __earlyonly_bootmem_alloc(node, size, size,
 68                                 __pa(MAX_DMA_ADDRESS));
 69 }
 70
 70
 71 /* need to make sure size is all the same during early stage */
 72 void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node)
 73 {
 74         void *ptr;
 75
 76         if (!vmemmap_buf)
 77                 return vmemmap_alloc_block(size, node);    /* no buffer: allocate directly */
 78
 79         /* carve the allocation out of the preallocated buffer */
 80         ptr = (void *)ALIGN((unsigned long)vmemmap_buf, size);
 81         if (ptr + size > vmemmap_buf_end)
 82                 return vmemmap_alloc_block(size, node);    /* buffer exhausted: fall back */
 83
 84         vmemmap_buf = ptr + size;
 85
 86         return ptr;
 87 }


/*
 * __earlyonly_bootmem_alloc() - pre-slab allocation path; on a
 * nobootmem kernel this ends up in memblock_alloc (see mm/nobootmem.c).
 */
 38 static void * __init_refok __earlyonly_bootmem_alloc(int node,
 39                                 unsigned long size,
 40                                 unsigned long align,
 41                                 unsigned long goal)
 42 {
 43         return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal);
 44 }

如果你对这个内存分配十分感兴趣,在mm/nobootmem.c中可以找到函数定义,最终会调用到memblock_alloc。


sparse_init完成后,每个物理页面均有了相应的struct page描述符(vmemmap区域的后备内存和页表也随之建立)。


3. zone_sizes_init

arch/arm64/mm/init.c

/*
 * zone_sizes_init() - compute per-zone spanned sizes and hole sizes from
 * the memblock regions, then hand them to free_area_init_node() for the
 * single node (id 0).  min/max are the RAM range in page frame numbers.
 */
 72 static void __init zone_sizes_init(unsigned long min, unsigned long max)
 73 {
 74         struct memblock_region *reg;
 75         unsigned long zone_size[MAX_NR_ZONES], zhole_size[MAX_NR_ZONES];
 76         unsigned long max_dma32 = min;
 77
 78         memset(zone_size, 0, sizeof(zone_size));
 79
 80 #ifdef CONFIG_ZONE_DMA32
 81         /* 4GB maximum for 32-bit only capable devices */
 82         max_dma32 = max(min, min(max, MAX_DMA32_PFN));
 83         zone_size[ZONE_DMA32] = max_dma32 - min;
 84 #endif
 85         zone_size[ZONE_NORMAL] = max - max_dma32;
 86
 87         /* start with everything counted as hole, then subtract real RAM */
 88         memcpy(zhole_size, zone_size, sizeof(zhole_size));
 89
 90         for_each_memblock(memory, reg) {
 91                 unsigned long start = memblock_region_memory_base_pfn(reg);
 92                 unsigned long end = memblock_region_memory_end_pfn(reg);
 93
 94                 if (start >= max)
 95                         continue;
 96 #ifdef CONFIG_ZONE_DMA32
 97                 if (start < max_dma32) {
 98                         unsigned long dma_end = min(end, max_dma32);
 99                         zhole_size[ZONE_DMA32] -= dma_end - start;
100                 }
101 #endif
102                 if (end > max_dma32) {
103                         unsigned long normal_end = min(end, max);
104                         unsigned long normal_start = max(start, max_dma32);
105                         zhole_size[ZONE_NORMAL] -= normal_end - normal_start;
106                 }
107         }
108
109         free_area_init_node(0, zone_size, min, zhole_size);
110 }
111
110

73-107行,主要是设置zone_size, zhole_size两个变量。


mm/page_alloc.c

/*
 * free_area_init_node() - initialise a node's pg_data_t: record id and
 * start pfn, total up spanned/present pages, allocate the flat mem_map
 * (no-op under sparsemem), then initialise all of the node's zones.
 */
4887 void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4888                 unsigned long node_start_pfn, unsigned long *zholes_size)
4889 {
4890         pg_data_t *pgdat = NODE_DATA(nid);
4891
4892         /* pg_data_t should be reset to zero when it's allocated */
4893         WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
4894
4895         pgdat->node_id = nid;
4896         pgdat->node_start_pfn = node_start_pfn;
4897         init_zone_allows_reclaim(nid);
4898         calculate_node_totalpages(pgdat, zones_size, zholes_size);
4899
4900         alloc_node_mem_map(pgdat);
4901 #ifdef CONFIG_FLAT_NODE_MEM_MAP
4902         printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
4903                 nid, (unsigned long)pgdat,
4904                 (unsigned long)pgdat->node_mem_map);
4905 #endif
4906
4907         free_area_init_core(pgdat, zones_size, zholes_size);
4908 }


4890-4906行对Node的pgdat的部分变量进行设置。4907行的free_area_init_core完成主要工作。


/*
 * free_area_init_core() - set up pgdat bookkeeping and initialise every
 * zone of the node: sizes, accounting (spanned/present/managed pages),
 * locks, per-cpu pagesets, free lists and the zone's memmap.
 */
4754 static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4755                 unsigned long *zones_size, unsigned long *zholes_size)
4756 {
4757         enum zone_type j;
4758         int nid = pgdat->node_id;
4759         unsigned long zone_start_pfn = pgdat->node_start_pfn;
4760         int ret;
4761
4762         pgdat_resize_init(pgdat);
4763 #ifdef CONFIG_NUMA_BALANCING
4764         spin_lock_init(&pgdat->numabalancing_migrate_lock);
4765         pgdat->numabalancing_migrate_nr_pages = 0;
4766         pgdat->numabalancing_migrate_next_window = jiffies;
4767 #endif
4768         init_waitqueue_head(&pgdat->kswapd_wait);
4769         init_waitqueue_head(&pgdat->pfmemalloc_wait);
4770         pgdat_page_cgroup_init(pgdat);
4771
4772         for (j = 0; j < MAX_NR_ZONES; j++) {
4773                 struct zone *zone = pgdat->node_zones + j;
4774                 unsigned long size, realsize, freesize, memmap_pages;
4775
4776                 /* size = spanned pages; realsize = spanned minus holes */
4777                 size = zone_spanned_pages_in_node(nid, j, zones_size);
4778                 realsize = freesize = size - zone_absent_pages_in_node(nid, j,
4779                                                                 zholes_size);
4780
4781                 /*
4782                  * Adjust freesize so that it accounts for how much memory
4783                  * is used by this zone for memmap. This affects the watermark
4784                  * and per-cpu initialisations
4785                  */
4786                 memmap_pages = calc_memmap_size(size, realsize);
4787                 if (freesize >= memmap_pages) {
4788                         freesize -= memmap_pages;
4789                         if (memmap_pages)
4790                                 printk(KERN_DEBUG
4791                                        "  %s zone: %lu pages used for memmap\n",
4792                                        zone_names[j], memmap_pages);
4793                 } else
4794                         printk(KERN_WARNING
4795                                 "  %s zone: %lu pages exceeds freesize %lu\n",
4796                                 zone_names[j], memmap_pages, freesize);
4797
4798                 /* Account for reserved pages */
4799                 if (j == 0 && freesize > dma_reserve) {
4800                         freesize -= dma_reserve;
4801                         printk(KERN_DEBUG "  %s zone: %lu pages reserved\n",
4802                                         zone_names[0], dma_reserve);
4803                 }

4804
4805                 if (!is_highmem_idx(j))
4806                         nr_kernel_pages += freesize;
4807                 /* Charge for highmem memmap if there are enough kernel pages */
4808                 else if (nr_kernel_pages > memmap_pages * 2)
4809                         nr_kernel_pages -= memmap_pages;
4810                 nr_all_pages += freesize;
4811
4812                 zone->spanned_pages = size;
4813                 zone->present_pages = realsize;
4814                 /*
4815                  * Set an approximate value for lowmem here, it will be adjusted
4816                  * when the bootmem allocator frees pages into the buddy system.
4817                  * And all highmem pages will be managed by the buddy system.
4818                  */
4819                 zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
4820 #ifdef CONFIG_NUMA
4821                 zone->node = nid;
4822                 zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
4823                                                 / 100;
4824                 zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
4825 #endif
4826                 zone->name = zone_names[j];
4827                 spin_lock_init(&zone->lock);
4828                 spin_lock_init(&zone->lru_lock);
4829                 zone_seqlock_init(zone);
4830                 zone->zone_pgdat = pgdat;
4831
4832                 zone_pcp_init(zone);
4833                 lruvec_init(&zone->lruvec);
4834                 if (!size)
4835                         continue;    /* zone spans no pages: skip the heavy init */
4836
4837                 set_pageblock_order();
4838                 setup_usemap(pgdat, zone, zone_start_pfn, size);
4839                 ret = init_currently_empty_zone(zone, zone_start_pfn,
4840                                                 size, MEMMAP_EARLY);
4841                 BUG_ON(ret);
4842                 memmap_init(size, nid, j, zone_start_pfn);
4843                 zone_start_pfn += size;    /* zones are laid out back to back */
4844         }
4845 }

至此Node的pgdat和Zone设置完毕。

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值