Linux内存初始化
说明
Kernel版本:4.14.111
ARM处理器,Cortex-A7,四核
本文从内核汇编代码执行完毕,跳转到start_kernel,开始介绍内存初始化。跳转到start_kernel前,已知,内核已创建了kernel、dtb的线性映射,即内核可以访问自己代码段等区域,并能访问dtb所在内存区域的虚拟地址。
物理内存大小
一般情况下,内存大小设置有两种方法,通过设备树设置,或者通过bootargs设置。
1)通过dtb设置
memory@0 {
device_type = "memory";
reg = <0x40000000 0x10000000>;
};
上述配置意思为:物理内存地址从0x40000000开始,大小为0x10000000(256MB)。
代码调用关系为
start_kernel->setup_arch->setup_machine_fdt->early_init_dt_scan_nodes-> of_scan_flat_dt(early_init_dt_scan_memory, NULL)->early_init_dt_scan_memory
2)通过bootargs设置
setup.c->early_mem
/*
 * Handler for the "mem" early boot parameter.
 *
 * p points at the parameter value. The size is parsed with memparse();
 * if the size is followed by '@', a start address is parsed as well,
 * otherwise the start address is PHYS_OFFSET. The resulting range is
 * handed to arm_add_memory(). Always returns 0.
 */
static int __init early_mem(char *p)
{
/* Set on the first call so that the existing DRAM range is removed only once. */
static int usermem __initdata = 0;
u64 size;
u64 start;
char *endp;
/*
* If the user specifies memory size, we
* blow away any automatically generated
* size.
*/
if (usermem == 0) {
usermem = 1;
memblock_remove(memblock_start_of_DRAM(),
memblock_end_of_DRAM() - memblock_start_of_DRAM());
}
/* Default start address; overridden below when "@start" is present. */
start = PHYS_OFFSET;
size = memparse(p, &endp);
if (*endp == '@')
start = memparse(endp + 1, NULL);
arm_add_memory(start, size);
return 0;
}
/* Register early_mem() as the handler for the "mem" parameter. */
early_param("mem", early_mem);
设置格式为:mem=1024M(也可以写成mem=size@start的形式,同时指定内存的起始物理地址;不指定时起始地址为PHYS_OFFSET)
,代码中会去bootargs(内核command line)中提取mem=关键字,获取系统内存大小。
early_fixmap_init
简单来说,Fixed map指的是虚拟地址中的一段区域,在该区域中所有的线性地址是在编译阶段就确定好的。可以从内核启动打印,看出fixmap所在虚拟地址空间。如:
vector : 0xffff0000 - 0xffff1000 ( 4 kB)
fixmap : 0xffc00000 - 0xfff00000 (3072 kB)
vmalloc : 0xf0800000 - 0xff800000 ( 240 MB)
lowmem : 0xc0000000 - 0xf0000000 ( 768 MB)
pkmap : 0xbfe00000 - 0xc0000000 ( 2 MB)
modules : 0xbf000000 - 0xbfe00000 ( 14 MB)
.text : 0xc0008000 - 0xc0c00000 (12256 kB)
.init : 0xc1000000 - 0xc1200000 (2048 kB)
.data : 0xc1200000 - 0xc1261b40 ( 391 kB)
.bss : 0xc126a8d8 - 0xc12b6890 ( 304 kB)
early_fixmap_init
代码实现在arch/arm/mm/mmu.c
中
start_kernel->setup_arch->early_fixmap_init
/*
 * Early fixmap setup: looks up the PMD entry for FIXADDR_TOP and points it
 * at the static bm_pte table, then installs pte_offset_early_fixmap as the
 * current pte_offset_fixmap callback. No PTE entries are filled in here.
 */
void __init early_fixmap_init(void)
{
pmd_t *pmd;
/*
 * Build-time check: the end of the early ioremap region and FIXADDR_TOP
 * must have the same value after shifting by PMD_SHIFT.
 */
BUILD_BUG_ON((__fix_to_virt(__end_of_early_ioremap_region) >> PMD_SHIFT)
!= FIXADDR_TOP >> PMD_SHIFT);
pmd = fixmap_pmd(FIXADDR_TOP); (1)
pmd_populate_kernel(&init_mm, pmd, bm_pte); (2)
pte_offset_fixmap = pte_offset_early_fixmap;
}
/*
 * Populate the PMD entry pmdp so that it refers to the PTE table ptep.
 * ptep is a kernel virtual address; it is converted with __pa() before
 * being passed on together with _PAGE_KERNEL_TABLE. mm is not used in
 * this body.
 */
static inline void
pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp, pte_t *ptep)
{
/*
* The pmd must be loaded with the physical address of the PTE table
*/
__pmd_populate(pmdp, __pa(ptep), _PAGE_KERNEL_TABLE);
}
/*
 * Write the PMD entry (or entry pair) at pmdp.
 *
 * pte is the physical address of the PTE table and prot the bits OR-ed
 * into the entry value. The entry is flushed with flush_pmd_entry() after
 * being written.
 */
static inline void __pmd_populate(pmd_t *pmdp, phys_addr_t pte,
pmdval_t prot)
{
/* The entry value is offset by PTE_HWTABLE_OFF from the start of the table. */
pmdval_t pmdval = (pte + PTE_HWTABLE_OFF) | prot;
pmdp[0] = __pmd(pmdval);
#ifndef CONFIG_ARM_LPAE
/*
 * Without LPAE a second entry is written, 256 PTEs further on.
 * NOTE(review): presumably the second 256-entry hardware table - confirm.
 */
pmdp[1] = __pmd(pmdval + 256 * sizeof(pte_t));
#endif
flush_pmd_entry(pmdp);
}
1)获取FIXADDR_TOP这个虚拟地址对应的pmd页表项(对于arm32的二级映射来说,pmd与pgd折叠在一起,即一级页表项)。
2)将bm_pte的物理地址写到pmd页表目录表中。bm_pte是全局数组,是fixmap的pte页表。定义如下:
#define PTRS_PER_PTE 512
#define PTE_HWTABLE_PTRS (PTRS_PER_PTE)
/*
 * Static PTE table that early_fixmap_init() hooks into the fixmap PMD entry.
 * It holds PTRS_PER_PTE + PTE_HWTABLE_PTRS entries, is aligned to
 * PTE_HWTABLE_OFF + PTE_HWTABLE_SIZE, and lives in __initdata.
 */
static pte_t bm_pte[PTRS_PER_PTE + PTE_HWTABLE_PTRS]
__aligned(PTE_HWTABLE_OFF + PTE_HWTABLE_SIZE) __initdata;
early_fixmap_init
只是建立了一个映射的框架,并没有填充pte页表,即没有创建真正虚拟地址和物理地址的联系。
需要使用者在使用时再去填充具体的pte页表。比如下文的early_ioremap_init()
。
32位arm处理器对应内核的fixmap地址区间定义如下:
/*
 * Indices of the fixmap slots. The permanent entries come first, followed
 * by the kmap range and the text-poke slots. The early-ioremap (BTMAP)
 * range is defined at the end and starts at the same index as the kmap
 * range.
 */
enum fixed_addresses {
FIX_EARLYCON_MEM_BASE,
__end_of_permanent_fixed_addresses,
/* kmap range: KM_TYPE_NR * NR_CPUS entries. */
FIX_KMAP_BEGIN = __end_of_permanent_fixed_addresses,
FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_TYPE_NR * NR_CPUS) - 1,
/* Support writing RO kernel text via kprobes, jump labels, etc. */
FIX_TEXT_POKE0,
FIX_TEXT_POKE1,
__end_of_fixmap_region,
/*
* Share the kmap() region with early_ioremap(): this is guaranteed
* not to clash since early_ioremap() is only available before
* paging_init(), and kmap() only after.
*/
#define NR_FIX_BTMAPS 32
#define FIX_BTMAPS_SLOTS 7
#define TOTAL_FIX_BTMAPS (NR_FIX_BTMAPS * FIX_BTMAPS_SLOTS)
/* BTMAP range: TOTAL_FIX_BTMAPS entries, END being the lower index. */
FIX_BTMAP_END = __end_of_permanent_fixed_addresses,
FIX_BTMAP_BEGIN = FIX_BTMAP_END + TOTAL_FIX_BTMAPS - 1,
__end_of_early_ioremap_region
};
early_ioremap_init
在设备初始化早期,如果需要操作io内存,则需要依赖early ioremap模块初始化。
代码如下:
start_kernel->setup_arch->early_ioremap_init
/* Entry point for early ioremap initialisation; simply calls early_ioremap_setup(). */
void __init early_ioremap_init(void)
{
early_ioremap_setup();
}
/*
 * Prepare the early ioremap slots: check that no prev_map[] entry is
 * already set, then record the virtual address of each of the
 * FIX_BTMAPS_SLOTS slots in slot_virt[].
 */
void __init early_ioremap_setup(void)
{
int i;
/* Warn and stop checking at the first prev_map[] entry that is already set. */
for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
if (WARN_ON(prev_map[i]))
break;
/* Slot i starts NR_FIX_BTMAPS * i fixmap indices below FIX_BTMAP_BEGIN. */
for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i);
}
/* Number of fixmap indices per early-ioremap slot (the per-slot stride used for slot_virt[]). */
#define NR_FIX_BTMAPS 32
/* Number of early-ioremap slots. */
#define FIX_BTMAPS_SLOTS 7
/* Total number of fixmap indices taken by all slots. */
#define TOTAL_FIX_BTMAPS (NR_FIX_BTMAPS * FIX_BTMAPS_SLOTS)
early_ioremap_setup实际操作就是在fixmap区域内,划分出7个slot、每个slot占32个页(共7*32个页)的虚拟地址区域,并把每个slot的起始虚拟地址保存到了slot_virt[]数组中,当需要io操作时,最终会调用到__early_ioremap函数,该函数中会去填充对应的pte页表,从而完成物理地址和虚拟地址的映射。
memblock
memblock内存管理机制用于在Linux启动后管理内存,一直到free_initmem()为止。
在buddy系统初始化之前,内存由memblock管理,需要注意的是,memblock管理的内存为物理地址,非虚拟地址。
不知道有没有人和我一样疑惑:在shell命令行下,通过cat /proc/meminfo
或free
命令查看时,为什么显示的总内存(MemTotal)和我们的物理内存大小不一致。
这部分内核无法管理的内存,就是由memblock机制,预留的。
memblock数据结构
struct memblock是memblock的核心数据结构,下面分为几种类型的memblock,每种类型memblock包含若干regions。
/*
 * Definition of memblock flags.
 * These values are stored in the flags field of a memblock region.
 */
enum {
MEMBLOCK_NONE = 0x0, /* No special request */
MEMBLOCK_HOTPLUG = 0x1, /* hotpluggable region */
MEMBLOCK_MIRROR = 0x2, /* mirrored region */
MEMBLOCK_NOMAP = 0x4, /* don't add to kernel direct mapping */
};
/* One contiguous physical memory range tracked by memblock. */
struct memblock_region {
phys_addr_t base; // start physical address of the region
phys_addr_t size; // size of the region
unsigned long flags; // region flags, taken from the MEMBLOCK_* enum
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
int nid;
#endif
};
/* A collection of regions of one kind (for example "memory" or "reserved"). */
struct memblock_type {
unsigned long cnt; /* number of regions */ // number of regions currently in this collection
unsigned long max; /* size of the allocated array */ // capacity of the regions array, not a region size
phys_addr_t total_size; /* size of all regions */ // sum of the sizes of all regions
struct memblock_region *regions; // points to the region array
char *name;
};
/* Top-level memblock state: allocation direction, allocation limit and the per-type region collections. */
struct memblock {
bool bottom_up; /* is bottom up direction? */ // allocation direction; true means from low addresses towards high addresses
phys_addr_t current_limit; // limit applied to memblock allocations. NOTE(review): appears to be an upper address bound, not a block size - confirm
struct memblock_type memory; // regions of available memory
struct memblock_type reserved; // regions of reserved memory
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
struct memblock_type physmem;
#endif
};
内核中定义了全局memblock数据结构,定义了初始值,这个全局变量在memblock生命周期内,会被频繁调用。
mm/memblock.c
/*
 * The global memblock instance and its initial values.
 *
 * Each region collection starts out pointing at a static init array, with
 * cnt = 1 for an empty dummy entry. Allocation is top-down
 * (bottom_up = false) and the limit is MEMBLOCK_ALLOC_ANYWHERE.
 */
struct memblock memblock __initdata_memblock = {
.memory.regions = memblock_memory_init_regions,
.memory.cnt = 1, /* empty dummy entry */
.memory.max = INIT_MEMBLOCK_REGIONS,
.memory.name = "memory",
.reserved.regions = memblock_reserved_init_regions,
.reserved.cnt = 1, /* empty dummy entry */
.reserved.max = INIT_MEMBLOCK_REGIONS,
.reserved.name = "reserved",
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
.physmem.regions = memblock_physmem_init_regions,
.physmem.cnt = 1, /* empty dummy entry */
.physmem.max = INIT_PHYSMEM_REGIONS,
.physmem.name = "physmem",
#endif
.bottom_up = false,
.current_limit = MEMBLOCK_ALLOC_ANYWHERE,
};
memblock API介绍
phys_addr_t memblock_find_in_range_node(phys_addr_t size, phys_addr_t align,
phys_addr_t start, phys_addr_t end,
int nid, ulong flags);
phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end,
phys_addr_t size, phys_addr_t align);
void memblock_allow_resize(void);
int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid);
int memblock_add(phys_addr_t base, phys_addr_t size);
int memblock_remove(phys_addr_t base, phys_addr_t size);
int memblock_free(phys_addr_t base, phys_addr_t size);
int memblock_reserve(phys_addr_t base, phys_addr_t size);
void memblock_trim_memory(phys_addr_t align);
bool memblock_overlaps_region(struct memblock_type *type,
phys_addr_t base, phys_addr_t size);
int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size);
int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size);
int memblock_mark_mirror(phys_addr_t base, phys_addr_t size);
int memblock_mark_nomap(phys_addr_t base, phys_addr_t size);
ulong choose_memblock_flags(void);
其中对不同类型memblock的分配释放主要有如下:其中memblock_add()和memblock_remove()是针对可用memblock(memory类型)操作;memblock_reserve()和memblock_free()是针对reserved类型memblock操作。
/*
 * Add the range [base, base + size) to memblock.memory for node nid.
 * Returns the result of memblock_add_range(); the final argument 0 is
 * presumably the region flags (no special flags) - confirm.
 */
int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size,
int nid)
{
return memblock_add_range(&memblock.memory, base, size, nid, 0);
}
/*
 * Add the range [base, base + size) to memblock.memory, passing
 * MAX_NUMNODES as the node id. Returns the result of memblock_add_range().
 */
int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
{
return memblock_add_range(&memblock.memory, base, size, MAX_NUMNODES, 0);
}
/*
 * Remove the range [base, base + size) from memblock.memory.
 * Returns the result of memblock_remove_range().
 */
int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
{
return memblock_remove_range(&memblock.memory, base, size);
}
/*
 * Remove the range [base, base + size) from memblock.reserved, after
 * reporting it to kmemleak via kmemleak_free_part_phys().
 * Returns the result of memblock_remove_range().
 */
int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
{
kmemleak_free_part_phys(base, size);
return memblock_remove_range(&memblock.reserved, base, size);
}
/*
 * Add the range [base, base + size) to memblock.reserved, passing
 * MAX_NUMNODES as the node id. Returns the result of memblock_add_range().
 */
int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
{
return memblock_add_range(&memblock.reserved, base, size, MAX_NUMNODES, 0);
}
memblock调试
在内核启动bootargs,可以加入"memblock=debug",会打开memblock的dbg打印,通过打印可以看出memblock的预留、分配等操作。
在内核编译时使能了内核debug功能后,还可以通过以下操作查看memblock信息。
cat /sys/kernel/debug/memblock/memory
cat /sys/kernel/debug/memblock/reserved
arm_memblock_init
arm_memblock_init是memblock的初始化函数,代码如下:
start_kernel->setup_arch->arm_memblock_init
/*
 * Reserve the memory ranges that must not be handed out by memblock:
 * the kernel image, the initrd, ranges reserved by the ARM mm code,
 * platform-specific ranges, the device tree blob and its reserved-memory
 * entries, and the DMA contiguous area. mdesc supplies the optional
 * platform reserve() hook.
 */
void __init arm_memblock_init(const struct machine_desc *mdesc)
{
/* Register the kernel text, kernel data and initrd with memblock. */
memblock_reserve(__pa(KERNEL_START), KERNEL_END - KERNEL_START); (1)
arm_initrd_init(); (2)
arm_mm_memblock_reserve(); (3)
/* reserve any platform specific memblock areas */
/* The hook is optional and is only called when it is set. */
if (mdesc->reserve)
mdesc->reserve();
early_init_fdt_reserve_self(); (4)
early_init_fdt_scan_reserved_mem(); (5)
/* reserve memory for DMA contiguous allocations */
dma_contiguous_reserve(arm_dma_limit); (6)
/* NOTE(review): presumably forbids further arm_memblock_steal() calls from here on - confirm. */
arm_memblock_steal_permitted = false;
memblock_dump_all(); (7)
}
1)将内核镜像所占内存(KERNEL_START到KERNEL_END,包含代码段、数据段、bss等)设置为reserved类型memblock,其中的init段会在free_initmem()中返还给内核
2)将内核initrd段设置为reserved类型memblock
3)将内核一级页表区域设置为reserved类型memblock
4)将dtb本身区域设置为reserved类型memblock
5)将dtb中reserved-memory区域设置为reserved类型memblock,其中CMA区域会返还给内核
6)预留cma连续内存区域
7)打印memblock(全局变量)信息,前提是设置了bootargs中"memblock=debug"
memblock debug打印:
memblock_reserve: [0x0000000040200000-0x00000000412b688f] arm_memblock_init+0x34/0x1d8 //内核代码段
memblock_reserve: [0x0000000049dee000-0x0000000049ffffff] arm_memblock_init+0x160/0x1d8 //initrd
memblock_reserve: [0x0000000040003000-0x0000000040007fff] arm_memblock_init+0x17c/0x1d8 //内核一级页表
memblock_reserve: [0x0000000049de3000-0x0000000049deb07f] early_init_fdt_reserve_self+0x3c/0x44 //设备树预留自身
memblock_reserve: [0x0000000049de3000-0x0000000049deafff] early_init_fdt_scan_reserved_mem+0x58/0x78 //设备树扫描预留区域,这里是uboot设置的预留dtb,和设备树预留自身冲突
memblock_reserve: [0x0000000049dee000-0x0000000049fff82a] early_init_fdt_scan_reserved_mem+0x58/0x78 //设备树扫描预留区域,这里是uboot设置预留的ramdisk.gz区域内存。
memblock_reserve: [0x000000007c000000-0x000000007fffffff] memblock_alloc_range_nid+0x70/0x88 //cma连续内存,设置的为64M,从物理内存最后开始预留(设备物理内存0x40000000-0x7fffffff)
Reserved memory: created CMA memory pool at 0x000000007c000000, size 64 MiB
OF: reserved mem: initialized node linux,cma, compatible id shared-dma-pool
MEMBLOCK configuration:
memory size = 0x0000000040000000 reserved size = 0x00000000052d5910
memory.cnt = 0x1 //可用内存块,数量1
memory[0x0] [0x0000000040000000-0x000000007fffffff], 0x0000000040000000 bytes flags: 0x0 //打印可用内存
reserved.cnt = 0x5 //预留内存块,数量5,以下依次打印预留段的起止物理地址
reserved[0x0] [0x0000000040003000-0x0000000040007fff], 0x0000000000005000 bytes flags: 0x0
reserved[0x1] [0x0000000040200000-0x00000000412b688f], 0x00000000010b6890 bytes flags: 0x0
reserved[0x2] [0x0000000049de3000-0x0000000049deb07f], 0x0000000000008080 bytes flags: 0x0
reserved[0x3] [0x0000000049dee000-0x0000000049ffffff], 0x0000000000212000 bytes flags: 0x0
reserved[0x4] [0x000000007c000000-0x000000007fffffff], 0x0000000004000000 bytes flags: 0x0
在buddy系统初始化前,还会有很多用到memblock的地方,内核也还会预留一些其他的内存段。开启memblock debug,内核启动打印都可以看到。
小结
内核启动后,start_kernel调用关系为:
本文中介绍了early_fixmap_init()、early_ioremap_init()、arm_memblock_init()。setup_arch中还调用到了paging_init,该接口涉及到内核low mem页表初始化、zone初始化,后续的文章中再介绍。
相关文章
Linux内存初始化(1)——memblock初始化
Linux内存初始化(2)——paging_init初始化
Linux内存初始化(3)——pglist_data/zone初始化
Linux内存初始化(4)——伙伴系统(buddy)