在Linux 内存管理中,在系统启动时,函数start_kernel()调用mm_init()对内存相关的模块初始化。这里我们关注mem_init()函数实现,这个是跟体系架构相关的,不同体系架构实现并不相同,但大致处理类似,即释放内存到伙伴系统,对一些内存方面的全局变量设置。我们下面看不同体系下实现:
ARM:
/*
 * mem_init() - ARM (32-bit) late memory initialisation.
 *
 * Records max_mapnr, releases boot-time memory to the page allocator,
 * prints the virtual kernel memory layout and sanity-checks a few
 * address-space boundaries.
 */
void __init mem_init(void)
{
#ifdef CONFIG_HAVE_TCM
	/* Both values are filled in elsewhere, when TCM is detected. */
	extern u32 dtcm_end;
	extern u32 itcm_end;
#endif

	/* max_mapnr is the mem_map index that corresponds to max_pfn. */
	set_max_mapnr(pfn_to_page(max_pfn) - mem_map);

	/* Put all unused low memory onto the free lists. */
	free_unused_memmap();
	free_all_bootmem();

#ifdef CONFIG_SA1111
	/* The DMA memory is designated as such by now, so it can be freed. */
	free_reserved_area(__va(PHYS_OFFSET), swapper_pg_dir, -1, NULL);
#endif

	free_highpages();

	mem_init_print_info(NULL);

/* Each helper expands to the three arguments "start, end, size" of one row. */
#define MLK(b, t) b, t, ((t) - (b)) >> 10
#define MLM(b, t) b, t, ((t) - (b)) >> 20
#define MLK_ROUNDUP(b, t) b, t, DIV_ROUND_UP(((t) - (b)), SZ_1K)

	pr_notice("Virtual kernel memory layout:\n"
		  " vector : 0x%08lx - 0x%08lx (%4ld kB)\n"
#ifdef CONFIG_HAVE_TCM
		  " DTCM : 0x%08lx - 0x%08lx (%4ld kB)\n"
		  " ITCM : 0x%08lx - 0x%08lx (%4ld kB)\n"
#endif
		  " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
		  " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n"
		  " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n"
#ifdef CONFIG_HIGHMEM
		  " pkmap : 0x%08lx - 0x%08lx (%4ld MB)\n"
#endif
#ifdef CONFIG_MODULES
		  " modules : 0x%08lx - 0x%08lx (%4ld MB)\n"
#endif
		  " .text : 0x%p" " - 0x%p" " (%4td kB)\n"
		  " .init : 0x%p" " - 0x%p" " (%4td kB)\n"
		  " .data : 0x%p" " - 0x%p" " (%4td kB)\n"
		  " .bss : 0x%p" " - 0x%p" " (%4td kB)\n",

		  MLK(UL(CONFIG_VECTORS_BASE),
		      UL(CONFIG_VECTORS_BASE) + (PAGE_SIZE)),
#ifdef CONFIG_HAVE_TCM
		  MLK(DTCM_OFFSET, (unsigned long) dtcm_end),
		  MLK(ITCM_OFFSET, (unsigned long) itcm_end),
#endif
		  MLK(FIXADDR_START, FIXADDR_END),
		  MLM(VMALLOC_START, VMALLOC_END),
		  MLM(PAGE_OFFSET, (unsigned long)high_memory),
#ifdef CONFIG_HIGHMEM
		  MLM(PKMAP_BASE, (PKMAP_BASE) + (LAST_PKMAP) * (PAGE_SIZE)),
#endif
#ifdef CONFIG_MODULES
		  MLM(MODULES_VADDR, MODULES_END),
#endif

		  MLK_ROUNDUP(_text, _etext),
		  MLK_ROUNDUP(__init_begin, __init_end),
		  MLK_ROUNDUP(_sdata, _edata),
		  MLK_ROUNDUP(__bss_start, __bss_stop));

#undef MLK
#undef MLM
#undef MLK_ROUNDUP

	/*
	 * The boundaries are checked twice: some fundamental
	 * inconsistencies can already be caught at build time, the
	 * rest at run time.
	 */
#ifdef CONFIG_MMU
	BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR);
	BUG_ON(TASK_SIZE > MODULES_VADDR);
#endif

#ifdef CONFIG_HIGHMEM
	BUILD_BUG_ON(PKMAP_BASE + LAST_PKMAP * PAGE_SIZE > PAGE_OFFSET);
	BUG_ON(PKMAP_BASE + LAST_PKMAP * PAGE_SIZE > PAGE_OFFSET);
#endif

	if (PAGE_SIZE >= 16384 && get_num_physpages() <= 128) {
		extern int sysctl_overcommit_memory;

		/*
		 * A machine this small gets nowhere without overcommit,
		 * so enable it by default.
		 */
		sysctl_overcommit_memory = OVERCOMMIT_ALWAYS;
	}
}
ARM64:
/*
 * mem_init() - ARM64 late memory initialisation.
 *
 * Optionally sets up swiotlb, records max_mapnr, releases boot-time
 * memory to the page allocator, prints the virtual kernel memory layout
 * and performs a few build-time sanity checks.
 */
void __init mem_init(void)
{
	/*
	 * Set up swiotlb when it is forced, or when some page frames lie
	 * above the DMA physical address limit.
	 */
	if (swiotlb_force || max_pfn > (arm64_dma_phys_limit >> PAGE_SHIFT))
		swiotlb_init(1);

	/* max_mapnr is the mem_map index that corresponds to max_pfn. */
	set_max_mapnr(pfn_to_page(max_pfn) - mem_map);

#ifndef CONFIG_SPARSEMEM_VMEMMAP
	free_unused_memmap();
#endif
	/* this will put all unused low memory onto the freelists */
	free_all_bootmem();

	mem_init_print_info(NULL);

/* Each helper expands to the three arguments "start, end, size" of one row. */
#define MLK(b, t) b, t, ((t) - (b)) >> 10
#define MLM(b, t) b, t, ((t) - (b)) >> 20
#define MLG(b, t) b, t, ((t) - (b)) >> 30
#define MLK_ROUNDUP(b, t) b, t, DIV_ROUND_UP(((t) - (b)), SZ_1K)

	pr_notice("Virtual kernel memory layout:\n");
#ifdef CONFIG_KASAN
	pr_notice(" kasan : 0x%16lx - 0x%16lx (%6ld GB)\n",
		  MLG(KASAN_SHADOW_START, KASAN_SHADOW_END));
#endif
	pr_notice(" modules : 0x%16lx - 0x%16lx (%6ld MB)\n",
		  MLM(MODULES_VADDR, MODULES_END));
	pr_notice(" vmalloc : 0x%16lx - 0x%16lx (%6ld GB)\n",
		  MLG(VMALLOC_START, VMALLOC_END));
	pr_notice(" .text : 0x%p" " - 0x%p" " (%6ld KB)\n",
		  MLK_ROUNDUP(_text, _etext));
	pr_notice(" .rodata : 0x%p" " - 0x%p" " (%6ld KB)\n",
		  MLK_ROUNDUP(__start_rodata, __init_begin));
	pr_notice(" .init : 0x%p" " - 0x%p" " (%6ld KB)\n",
		  MLK_ROUNDUP(__init_begin, __init_end));
	pr_notice(" .data : 0x%p" " - 0x%p" " (%6ld KB)\n",
		  MLK_ROUNDUP(_sdata, _edata));
	pr_notice(" .bss : 0x%p" " - 0x%p" " (%6ld KB)\n",
		  MLK_ROUNDUP(__bss_start, __bss_stop));
	pr_notice(" fixed : 0x%16lx - 0x%16lx (%6ld KB)\n",
		  MLK(FIXADDR_START, FIXADDR_TOP));
	pr_notice(" PCI I/O : 0x%16lx - 0x%16lx (%6ld MB)\n",
		  MLM(PCI_IO_START, PCI_IO_END));
#ifdef CONFIG_SPARSEMEM_VMEMMAP
	pr_notice(" vmemmap : 0x%16lx - 0x%16lx (%6ld GB maximum)\n",
		  MLG(VMEMMAP_START, VMEMMAP_START + VMEMMAP_SIZE));
	pr_notice(" 0x%16lx - 0x%16lx (%6ld MB actual)\n",
		  MLM((unsigned long)phys_to_page(memblock_start_of_DRAM()),
		      (unsigned long)virt_to_page(high_memory)));
#endif
	pr_notice(" memory : 0x%16lx - 0x%16lx (%6ld MB)\n",
		  MLM(__phys_to_virt(memblock_start_of_DRAM()),
		      (unsigned long)high_memory));

	/* Drop every helper defined above, MLG included, so none leaks out. */
#undef MLK
#undef MLM
#undef MLG
#undef MLK_ROUNDUP

#ifdef CONFIG_COMPAT
	BUILD_BUG_ON(TASK_SIZE_32 > TASK_SIZE_64);
#endif

	/* struct page must not outgrow the size fixed by STRUCT_PAGE_MAX_SHIFT. */
	BUILD_BUG_ON(sizeof(struct page) > (1 << STRUCT_PAGE_MAX_SHIFT));

	if (PAGE_SIZE >= 16384 && get_num_physpages() <= 128) {
		extern int sysctl_overcommit_memory;

		/* Very small machine: enable overcommit by default. */
		sysctl_overcommit_memory = OVERCOMMIT_ALWAYS;
	}
}
x86_32:
/*
 * mem_init() - x86 (32-bit) late memory initialisation.
 *
 * Releases highmem and low memory to the page allocator, prints the
 * virtual kernel memory layout and sanity-checks the ordering of the
 * fixmap, pkmap, vmalloc and lowmem regions.
 */
void __init mem_init(void)
{
	pci_iommu_alloc();

#ifdef CONFIG_FLATMEM
	BUG_ON(!mem_map);
#endif
	/*
	 * Under CONFIG_DEBUG_PAGEALLOC the highmem pages must be
	 * initialised before free_all_bootmem() runs.  Memblock takes
	 * free low memory for temporary data (see find_range_array())
	 * and may pick pages that were already handed to the buddy
	 * allocator, which that option marks as not accessible in the
	 * page tables.  Without the option the order does not matter.
	 */
	set_highmem_pages_init();

	/* Put all low memory onto the free lists. */
	free_all_bootmem();

	after_bootmem = 1;

	mem_init_print_info(NULL);
	printk(KERN_INFO "virtual kernel memory layout:\n"
		" fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
#ifdef CONFIG_HIGHMEM
		" pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
#endif
		" vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n"
		" lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n"
		" .init : 0x%08lx - 0x%08lx (%4ld kB)\n"
		" .data : 0x%08lx - 0x%08lx (%4ld kB)\n"
		" .text : 0x%08lx - 0x%08lx (%4ld kB)\n",
		/* fixmap */
		FIXADDR_START, FIXADDR_TOP,
		(FIXADDR_TOP - FIXADDR_START) >> 10,

#ifdef CONFIG_HIGHMEM
		/* pkmap */
		PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
		(LAST_PKMAP*PAGE_SIZE) >> 10,
#endif

		/* vmalloc */
		VMALLOC_START, VMALLOC_END,
		(VMALLOC_END - VMALLOC_START) >> 20,

		/* lowmem */
		(unsigned long)__va(0), (unsigned long)high_memory,
		((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,

		/* .init */
		(unsigned long)&__init_begin, (unsigned long)&__init_end,
		((unsigned long)&__init_end -
		 (unsigned long)&__init_begin) >> 10,

		/* .data (printed as the range _etext .. _edata) */
		(unsigned long)&_etext, (unsigned long)&_edata,
		((unsigned long)&_edata - (unsigned long)&_etext) >> 10,

		/* .text */
		(unsigned long)&_text, (unsigned long)&_etext,
		((unsigned long)&_etext - (unsigned long)&_text) >> 10);

	/*
	 * The boundaries are checked twice: some fundamental
	 * inconsistencies can already be caught at build time.  For the
	 * build-time pass, __FIXADDR_TOP and high_memory are temporarily
	 * redefined as constants.
	 */
#define __FIXADDR_TOP (-PAGE_SIZE)
#ifdef CONFIG_HIGHMEM
	BUILD_BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
	BUILD_BUG_ON(VMALLOC_END > PKMAP_BASE);
#endif
#define high_memory (-128UL << 20)
	BUILD_BUG_ON(VMALLOC_START >= VMALLOC_END);
#undef high_memory
#undef __FIXADDR_TOP

	/* Run-time pass over the same boundaries, using the real values. */
#ifdef CONFIG_HIGHMEM
	BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
	BUG_ON(VMALLOC_END > PKMAP_BASE);
#endif
	BUG_ON(VMALLOC_START >= VMALLOC_END);
	BUG_ON((unsigned long)high_memory > VMALLOC_START);

	if (boot_cpu_data.wp_works_ok < 0)
		test_wp_bit();
}
X86_64:
/*
 * mem_init() - x86-64 late memory initialisation.
 *
 * Releases boot-time memory to the page allocator, registers the
 * vsyscall page for /proc/kcore and prints the memory summary.
 */
void __init mem_init(void)
{
	pci_iommu_alloc();

	/* empty_zero_page needs no clearing here: clear_bss() already did it. */

	register_page_bootmem_info();

	/* Put all memory onto the free lists. */
	free_all_bootmem();
	after_bootmem = 1;

	/* Make the vsyscall page visible through /proc/kcore. */
	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR,
		   PAGE_SIZE, KCORE_OTHER);

	mem_init_print_info(NULL);
}
MIPS:
/*
 * mem_init() - MIPS late memory initialisation.
 *
 * Sets max_mapnr and high_memory, releases boot-time memory to the page
 * allocator, sets up the zero pages, frees highmem and prints the memory
 * summary.  On 64-bit kernels it may also register KSEG0 for /proc/kcore.
 */
void __init mem_init(void)
{
#ifdef CONFIG_HIGHMEM
#ifdef CONFIG_DISCONTIGMEM
#error "CONFIG_HIGHMEM and CONFIG_DISCONTIGMEM dont work together yet"
#endif
	/* Use the highmem end when there is one, otherwise the lowmem end. */
	if (highend_pfn)
		max_mapnr = highend_pfn;
	else
		max_mapnr = max_low_pfn;
#else
	max_mapnr = max_low_pfn;
#endif
	/* high_memory is the virtual address just past the last lowmem page. */
	high_memory = (void *) __va(max_low_pfn << PAGE_SHIFT);

	maar_init();
	free_all_bootmem();
	setup_zero_pages();	/* Set up the zeroed pages. */
	mem_init_free_highmem();
	mem_init_print_info(NULL);

#ifdef CONFIG_64BIT
	if ((unsigned long) &_text > (unsigned long) CKSEG0)
		/*
		 * The "- 4" is a hack that spares user tools from having
		 * to handle the overflow.
		 */
		kclist_add(&kcore_kseg0, (void *) CKSEG0,
				0x80000000 - 4, KCORE_TEXT);
#endif
}
上面这些函数在不同体系架构下的实现各不相同,甚至同一架构在不同位宽(32位/64位)下的实现也不同,但主要处理流程大致类似。我们这里以ARM64为例,说明这些函数主要做了哪些处理:
首先我们需要明白mem_map的作用,它是描述所有物理内存的struct page结构数组的基指针。比如说,对于4GB的内存来说,如果一个页定义为4KB,即2^12字节,那么可想而知,这个mem_map数组总共有2^20个元素(2^32 / 2^12 = 2^20)。注意我们这里都以flat型内存模型,即平坦型内存模型为主进行描述。
而这些页都有一个具体的页帧号与之对应。页帧号一般用pfn来表示,那么由于每个页都有一个页帧号,那最小的页帧号和最大的页帧号为多少呢?需要特别注意的是,页帧号也是与mem_map数组的index相对应。我们一般认为pfn_min为0,而最大pfn_max为mem_map数组下标的最大值,这个最大值也就是max_pfn,这个值跟内核的max_mapnr相对应。
函数set_max_mapnr()就是用于计算max_mapnr。我们可能会想,这个max_pfn是什么时候设置的呢?这个是在setup_arch的paging_init()中调用bootmem_init()来进行的。在成功设置max_mapnr后,我们要把启动过程时所有的空闲内存释放到伙伴系统,这里需要注意三点:
一. bootmem内存管理或者nobootmem管理
二. memblock内存管理
三. 伙伴系统
显然,启动时还不存在伙伴系统。在Linux内核启动的早期,BSP相关的代码需要把内核能使用的内存块大小告知内核:要么通过bootloader传递参数给出DDR大小,要么通过命令行形式给出DDR大小,或者通过FDT等形式对DTS分析得出DDR大小。不管采用什么样的方法,内核都需要了解这些信息。我们这里以DTS形式给出内存大小,这些内存块会以
memblock_add形式添加到memblock内存管理块中。这些添加到内核中的内存块被标记为memory类型,另外一种类型为reserve类型。BSP可以通过不同方式添加到内核,
但是在我们内核使用内存之前,必须先添加一块,否则我们使用的内存哪里来呢,使用的内存被标记为reserve。这样,通过这种简单的管理,memblock把所有的内存块维护起来,之后内核慢慢的从这些内存块中获取内存。我们一般称memblock是逻辑内存块管理。
对于bootmem来说,它是物理内存管理。我们这里不详细介绍,后面会有篇章分析。
函数free_unused_memmap()和free_all_bootmem()都与释放启动阶段的内存有关:前者释放的是mem_map数组中与物理内存空洞相对应、实际不会被用到的那部分struct page所占用的内存,后者则把bootmem(或nobootmem方式下的memblock)中所有空闲内存释放到伙伴系统。
函数mem_init_print_info()打印内存总体使用情况(Memory: 这一行),紧随其后的pr_notice()则把内核虚拟地址空间布局以及内核映像各个段的地址打印出来。我们这里看下这些信息:
[ 0.000000] Memory: 832MB 2080MB = 2912MB total [ 0.000000] <0>I{0}[0:swapper]Memory: 2852320k/2852320k available, 129568k reserved [ 0.000000] <0>I{0}[0:swapper]Virtual kernel memory layout: [ 0.000000] <0> vmalloc : 0xffffff8000000000 - 0xffffffbbffff0000 (245759 MB) [ 0.000000] <0> vmemmap : 0xffffffbc001dc000 - 0xffffffbc029c8000 ( 39 MB) [ 0.000000] <0> modules : 0xffffffbffc000000 - 0xffffffc000000000 ( 64 MB) [ 0.000000] <0> memory : 0xffffffc000000000 - 0xffffffc0b6800000 ( 2920 MB) [ 0.000000] <0> .init : 0xffffffc000d29000 - 0xffffffc000f881c0 ( 2429 kB) [ 0.000000] <0> .text : 0xffffffc000080000 - 0xffffffc000d28da4 ( 12964 kB) [ 0.000000] <0> .data : 0xffffffc000f89000 - 0xffffffc001088e78 ( 1024 kB)
上面的832MB和2080MB是说明有两个memblock,第一个memblock大小为832MB,第二个为2080MB,所以,总共内存大小为2912MB。
对于Memory:下面几个数字来说, 2852320k是当前系统空闲的内存,这说明总共有713080个空闲页。需要注意2852320k/2852320k 前面这个是当前系统空闲页数,它是个动态变化的值。它是由函数nr_free_pages()得到,可以看到,其采用global_page_state(NR_FREE_PAGES)方式获取的空闲内存,这个值会动态变化,每次申请内存时都会减少。而后面这个值是恒定的,它是从页的page_count(page)得到,因为对于内核来说,如果BSP申请内存一定,这个值就应该恒定,它是每个memblock中除了reserve之后的内存,所以,在BSP开发期间,可以通过这个值来了解内核可用内存是否减少。
从上面给出的log可以看到,这两个值相等,这说明在memblock完成reserve之后,到打印这条信息为止并没有再动态申请内存,否则前面这个值应该减小。
最后这个129568k 是reserve的内存,那么什么是reserve的内存呢?其实这个reserve内存是相对free内存来说的。因为其无法再分配供其他程序使用,这部分内存一般是内核一些模块申请,如vfs_caches_init_early()分配的目录项和索引节点hash,这些模块需要连续的物理内存,在系统启动时可以方便获取,这样我们在系统早期分配后,对其标记为reserve,再比如内核代码段,数据段等,这些都被标记。这样看来,上面129568KB的reserve内存,即32392个页被标记reserve。
空闲free页 + reserve页 = 总页数 = 713080 + 32392 = 745472页。
总内存2912MB = 745472个页(按4KB页计算,2912 × 256 = 745472)。
对于下面的内核镜像内存布局则很容易理解了:
因为我们是64位系统,从上面的布局可以看出,这里的虚拟地址空间采用的是39位(4KB页)而不是48位:内核空间起始于0xffffff8000000000,即2^64 - 2^39。
上面把物理内存完全映射到memory : 0xffffffc000000000 - 0xffffffc0b6800000 ( 2920 MB) 这个里面。 这里把0x0000000000000000 - 0x0000007fffffffff 映射到用户地址空间,
0xffffff8000000000 - 0xffffffffffffffff 映射到内核空间,其中物理内存的线性映射区从0xffffffc000000000开始。
可以看到,内核空间已经足够大,所以不需要高端内存,内核空间已经足够囊括3GB的物理内存了。整个物理内存映射到0xffffffc000000000 - 0xffffffc0b6800000 范围,这里之所以有8MB的洞,是保护非连续区内存管理使用的。
可以看到,内核代码段,init段,数据段大小。对于vmemmap和module段很容易理解其大小。这些都跟内核定义的一些宏或者常数相关。