在Linux内核启动过程中,内核根据系统配置来设置虚拟地址空间的布局,如PAGE_OFFSET的起始地址,PHYS_OFFSET等。对于宏PHYS_OFFSET来说,其描述的是物理内存的起始地址,一般由硬件给出。如下面一些设置:
ARM:
arch/arm/include/asm/memory.h
/* Platform physical base, taken from the CONFIG_PHYS_OFFSET config value. */
#define PLAT_PHYS_OFFSET UL(CONFIG_PHYS_OFFSET)
/*
 * Branch 1: __virt_to_phys already exists as a macro at this point, so only
 * the offset constants and virt_to_pfn() are supplied here.
 */
#if defined(__virt_to_phys)
#define PHYS_OFFSET PLAT_PHYS_OFFSET
#define PHYS_PFN_OFFSET ((unsigned long)(PHYS_OFFSET >> PAGE_SHIFT))
#define virt_to_pfn(kaddr) (__pa(kaddr) >> PAGE_SHIFT)
/*
 * Branch 2: CONFIG_ARM_PATCH_PHYS_VIRT. PHYS_OFFSET is no longer a constant;
 * it is computed from the variable __pv_phys_pfn_offset.
 */
#elif defined(CONFIG_ARM_PATCH_PHYS_VIRT)
/*
 * Constants used to force the right instruction encodings and shifts
 * so that all we need to do is modify the 8-bit constant field.
 */
#define __PV_BITS_31_24 0x81000000
#define __PV_BITS_7_0 0x81
/* Page frame number of the start of physical memory (defined elsewhere). */
extern unsigned long __pv_phys_pfn_offset;
extern u64 __pv_offset;
/*
 * NOTE(review): presumably rewrites the instructions recorded in .pv_table;
 * the implementation is not visible here - confirm.
 */
extern void fixup_pv_table(const void *, unsigned long);
extern const void *__pv_table_begin, *__pv_table_end;
/* Physical base address = starting PFN shifted left by PAGE_SHIFT. */
#define PHYS_OFFSET ((phys_addr_t)__pv_phys_pfn_offset << PAGE_SHIFT)
#define PHYS_PFN_OFFSET (__pv_phys_pfn_offset)
/*
 * PFN of a kernel virtual address: page index relative to PAGE_OFFSET plus
 * the PFN of the start of physical memory.
 */
#define virt_to_pfn(kaddr) \
((((unsigned long)(kaddr) - PAGE_OFFSET) >> PAGE_SHIFT) + \
PHYS_PFN_OFFSET)
/*
 * __pv_stub(from, to, instr, type) - emit one "instr to, from, #type"
 * instruction and record its address (local label 1) as a 32-bit entry in
 * the .pv_table section.
 *
 * from:  register input operand
 * to:    register output operand
 * instr: mnemonic given as a string literal ("add" or "sub" in this file)
 * type:  immediate operand (constraint "I"), e.g. __PV_BITS_31_24
 */
#define __pv_stub(from,to,instr,type) \
__asm__("@ __pv_stub\n" \
"1: " instr " %0, %1, %2\n" \
" .pushsection .pv_table,\"a\"\n" \
" .long 1b\n" \
" .popsection\n" \
: "=r" (to) \
: "r" (from), "I" (type))
/*
 * __pv_stub_mov_hi(t) - emit a "mov" of the immediate __PV_BITS_7_0 into
 * %R0 of t and record the instruction address in .pv_table.
 *
 * %R0 is the GCC ARM operand modifier selecting the most-significant
 * register of a 64-bit operand, so this writes the upper word of t.
 */
#define __pv_stub_mov_hi(t) \
__asm__ volatile("@ __pv_stub_mov\n" \
"1: mov %R0, %1\n" \
" .pushsection .pv_table,\"a\"\n" \
" .long 1b\n" \
" .popsection\n" \
: "=r" (t) \
: "I" (__PV_BITS_7_0))
/*
 * __pv_add_carry_stub(x, y) - 64-bit add with carry propagation.
 *
 * "adds" stores x + #__PV_BITS_31_24 into %Q0 (the least-significant
 * register of y) and sets the flags; "adc" then adds the carry into %R0
 * (the most-significant register of y). y is read and written ("+r"), so
 * its upper word must already hold a value. Only the "adds" instruction
 * (label 1) is recorded in .pv_table. The "cc" clobber tells the compiler
 * that the condition flags are modified.
 */
#define __pv_add_carry_stub(x, y) \
__asm__ volatile("@ __pv_add_carry_stub\n" \
"1: adds %Q0, %1, %2\n" \
" adc %R0, %R0, #0\n" \
" .pushsection .pv_table,\"a\"\n" \
" .long 1b\n" \
" .popsection\n" \
: "+r" (y) \
: "r" (x), "I" (__PV_BITS_31_24) \
: "cc")
/*
 * __virt_to_phys() - virtual-to-physical conversion built from the patchable
 * instruction stubs (CONFIG_ARM_PATCH_PHYS_VIRT branch).
 *
 * x: virtual address
 * Returns the value produced by the stub instruction(s) as a phys_addr_t.
 */
static inline phys_addr_t __virt_to_phys(unsigned long x)
{
phys_addr_t t;
/* 32-bit phys_addr_t: a single recorded "add" is sufficient. */
if (sizeof(phys_addr_t) == 4) {
__pv_stub(x, t, "add", __PV_BITS_31_24);
} else {
/* 64-bit phys_addr_t: set the upper word first, then add with carry. */
__pv_stub_mov_hi(t);
__pv_add_carry_stub(x, t);
}
return t;
}
/*
 * __phys_to_virt() - physical-to-virtual conversion built from a single
 * recorded "sub" stub (CONFIG_ARM_PATCH_PHYS_VIRT branch).
 *
 * x: physical address; only its low 32 bits are used
 * Returns the resulting virtual address as an unsigned long.
 */
static inline unsigned long __phys_to_virt(phys_addr_t x)
{
unsigned long t;
/*
 * 'unsigned long' cast discard upper word when
 * phys_addr_t is 64 bit, and makes sure that inline
 * assembler expression receives 32 bit argument
 * in place where 'r' 32 bit operand is expected.
 */
__pv_stub((unsigned long) x, t, "sub", __PV_BITS_31_24);
return t;
}
#else
/*
 * Fallback branch: PHYS_OFFSET is the compile-time PLAT_PHYS_OFFSET and
 * PHYS_PFN_OFFSET is the same value expressed as a page frame number.
 */
#define PHYS_OFFSET PLAT_PHYS_OFFSET
#define PHYS_PFN_OFFSET ((unsigned long)(PHYS_OFFSET >> PAGE_SHIFT))
/*
 * __virt_to_phys() - fallback conversion by constant offset.
 *
 * x: virtual address
 * Returns x - PAGE_OFFSET + PHYS_OFFSET; x is widened to phys_addr_t before
 * the arithmetic is performed.
 */
static inline phys_addr_t __virt_to_phys(unsigned long x)
{
return (phys_addr_t)x - PAGE_OFFSET + PHYS_OFFSET;
}
/*
 * __phys_to_virt() - fallback conversion by constant offset.
 *
 * x: physical address
 * Returns x - PHYS_OFFSET + PAGE_OFFSET, converted to unsigned long by the
 * return type.
 */
static inline unsigned long __phys_to_virt(phys_addr_t x)
{
return x - PHYS_OFFSET + PAGE_OFFSET;
}
/*
 * PFN of a kernel virtual address: page index relative to PAGE_OFFSET plus
 * PHYS_PFN_OFFSET.
 */
#define virt_to_pfn(kaddr) \
((((unsigned long)(kaddr) - PAGE_OFFSET) >> PAGE_SHIFT) + \
PHYS_PFN_OFFSET)
#endif
可以认为PHYS_OFFSET是由CONFIG_PHYS_OFFSET来设置的。对于上面不同的定义,其影响物理地址到虚拟地址的转化,但并无本质的区别。
另外一个地址PAGE_OFFSET也是非常重要的,其也在arch/arm/include/asm/memory.h中给出,如下:
/*
 * UL(x) forwards to _AC(x, UL).
 * NOTE(review): _AC is defined elsewhere; presumably it attaches the UL
 * suffix to a constant - confirm.
 */
#define UL(x) _AC(x, UL)
/* PAGE_OFFSET - the virtual address of the start of the kernel image */
#define PAGE_OFFSET UL(CONFIG_PAGE_OFFSET)
注意PAGE_OFFSET 是内核虚拟地址开始的地方。通常来说,PAGE_OFFSET 会被直接映射到PHYS_OFFSET处。
我们本篇主要是描述虚拟地址空间的布局。可以看到,PAGE_OFFSET之上是内核虚拟地址,而PAGE_OFFSET之下是进程的用户空间地址。用户空间较为重要的一个宏是TASK_SIZE,如下:
/* TASK_SIZE: CONFIG_PAGE_OFFSET minus 16MB. */
#define TASK_SIZE (UL(CONFIG_PAGE_OFFSET) - UL(SZ_16M))
/*
 * TASK_UNMAPPED_BASE: one third of TASK_SIZE, aligned to 16MB.
 * NOTE(review): ALIGN is defined elsewhere; assuming it rounds up, a
 * CONFIG_PAGE_OFFSET of 0xC0000000 gives 0x3FAAAAAA -> 0x40000000.
 */
#define TASK_UNMAPPED_BASE ALIGN(TASK_SIZE / 3, SZ_16M)
也就是说,用户空间到内核空间有个SZ_16M大小的洞。
需要注意的是TASK_UNMAPPED_BASE,其给出了mmap类函数对虚拟地址空间映射时使用的最小虚拟地址。如果CONFIG_PAGE_OFFSET为0xC0000000(3G),那么TASK_SIZE为0xBF000000。
TASK_SIZE / 3 = 0xBF000000 / 3 = 0x3FAAAAAA
再按SZ_16M(0x01000000)向上对齐,所以TASK_UNMAPPED_BASE为0x40000000(1G)。
所以,使用mmap类函数,虚拟地址最小为0x40000000,最大为TASK_SIZE。
通常PAGE_OFFSET被映射到物理地址起始处,即PHYS_OFFSET给出的地址。这样,直接映射区以PHYS_OFFSET为开始地址;至于结束地址,不同平台的定义并不相同,但有一点是相同的:有一块连续的物理内存被直接映射到虚拟地址空间,也就是我们说的低端内存直接映射。至于这块低端内存Linux内核是如何计算的,我们下文会详细论述。与低端内存相对应的是高端内存,high_memory是低端内存结束处对应的虚拟地址,其上即为高端内存;从PHYS_OFFSET开始到高端内存为止,我们用LOW_BOUNCE_HIGH做边界。max_low_pfn给出的是低端内存的最大页帧号,而min_low_pfn则是最小的页帧号;由于低端内存从PHYS_OFFSET开始,而PHYS_OFFSET一般为0,所以min_low_pfn一般为0。如果我们把低端内存限制在512M,则max_low_pfn为512M/4K = 0x20000 = 131072。内核还有一个max_pfn,其对应整个内存的最大页帧号。例如(页大小为4K,物理内存从0开始):
mem= 1G, max_pfn= 2^18 =262144
mem= 2G, max_pfn=2^19 =524288
mem= 3G, max_pfn=0xC0000 = 786432
mem= 4G, max_pfn=2^20 =1048576
mem= 8G, max_pfn=2^21 =2097152
mem=16G, max_pfn=2^22= 4194304
低端内存区和高端内存区的大小由平台来配置。high_memory之上对应vmalloc区域,Linux中主要配置如下:
arch/arm/include/asm/pgtable.h
/* Offset of 8MB used when computing VMALLOC_START; also the alignment unit. */
#define VMALLOC_OFFSET (8*1024*1024)
/*
 * VMALLOC_START: high_memory plus VMALLOC_OFFSET, rounded down to a multiple
 * of VMALLOC_OFFSET (8MB).
 */
#define VMALLOC_START (((unsigned long)high_memory + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1))
#define VMALLOC_END 0xff000000UL // 4G - 16M == 4080M
可以看到,对于VMALLOC区域,其从高端内存high_memory附近开始,这里有8M的空洞。然后到VMALLOC_END处,可以看到,VMALLOC_END到0xFFFFFFFF还有16M的空间,这些一般被特殊的设备使用。
一般来说,VMALLOC区大小可以由内核的命令行参数来指定,如下函数分析命令行参数,设置VMALLOC区大小:
arch/arm/mm/mmu.c
/*
 * Default value of vmalloc_min: VMALLOC_END minus 240MB minus
 * VMALLOC_OFFSET. With VMALLOC_END = 0xff000000 and VMALLOC_OFFSET = 8MB
 * this is 0xef800000.
 */
static void * vmalloc_min =
(void *)(VMALLOC_END - (240 << 20) - VMALLOC_OFFSET);
/*
 * vmalloc=size forces the vmalloc area to be exactly 'size'
 * bytes. This can be used to increase (or decrease) the vmalloc
 * area - the default is 240m.
 *
 * early_vmalloc() - handler registered for the "vmalloc" early parameter.
 *
 * arg: the parameter value string, converted to a size with memparse()
 *      (defined elsewhere).
 *
 * The requested size is clamped to the range
 * [SZ_16M, VMALLOC_END - (PAGE_OFFSET + SZ_32M)]; a warning is printed when
 * clamping occurs. vmalloc_min is then set to VMALLOC_END minus the
 * resulting size. Always returns 0.
 */
static int __init early_vmalloc(char *arg)
{
unsigned long vmalloc_reserve = memparse(arg, NULL);
/* Enforce the 16MB lower bound. */
if (vmalloc_reserve < SZ_16M) {
vmalloc_reserve = SZ_16M;
pr_warn("vmalloc area too small, limiting to %luMB\n",
vmalloc_reserve >> 20);
}
/* Enforce the upper bound, which keeps 32MB above PAGE_OFFSET. */
if (vmalloc_reserve > VMALLOC_END - (PAGE_OFFSET + SZ_32M)) {
vmalloc_reserve = VMALLOC_END - (PAGE_OFFSET + SZ_32M);
pr_warn("vmalloc area is too big, limiting to %luMB\n",
vmalloc_reserve >> 20);
}
/* Unlike the default initializer, VMALLOC_OFFSET is not subtracted here. */
vmalloc_min = (void *)(VMALLOC_END - vmalloc_reserve);
return 0;
}
early_param("vmalloc", early_vmalloc);
这段代码接受的VMALLOC大小在16M和976M之间(PAGE_OFFSET为0xC0000000时,上限为0xFF000000 - 0xC2000000 = 0x3D000000 = 976M)。
如果命令行提供vmalloc大小,假若为240M,则vmalloc_min为0xF0000000。
如果命令行没有提供,内核采用默认配置(240M再加上VMALLOC_OFFSET的8M),则vmalloc_min为0xEF800000。
在了解上面内存布局主要参数定义后,我们再来详细地看看内核启动过程中的主要处理函数。函数sanity_check_meminfo()对主要的内存区进行检查,其实现如下:
/*
 * sanity_check_meminfo() - derive arm_lowmem_limit, high_memory and the
 * memblock allocation limit from the registered memory regions.
 *
 * Iterates over every region of memblock's "memory" type. A region whose
 * base is at or above vmalloc_limit marks the start of highmem. When
 * CONFIG_HIGHMEM is disabled or cache_is_vipt_aliasing() is true, highmem
 * regions are removed from memblock and a region extending past
 * vmalloc_limit is truncated to it. Takes no arguments and returns nothing;
 * results are stored in arm_lowmem_limit and high_memory and passed to
 * memblock_set_current_limit().
 */
void __init sanity_check_meminfo(void)
{
phys_addr_t memblock_limit = 0;
int highmem = 0;
/*
 * Physical address corresponding to vmalloc_min. __pa() is applied to the
 * last byte below vmalloc_min and 1 is added back afterwards.
 */
phys_addr_t vmalloc_limit = __pa(vmalloc_min - 1) + 1;
struct memblock_region *reg;
bool should_use_highmem = false;
// Walk all memory regions to learn the memory layout
for_each_memblock(memory, reg) {
phys_addr_t block_start = reg->base;
phys_addr_t block_end = reg->base + reg->size;
phys_addr_t size_limit = reg->size;
/*
 * highmem is never cleared once set, so it also applies to every
 * region visited afterwards. Otherwise size_limit becomes the distance
 * from the region base up to vmalloc_limit.
 */
if (reg->base >= vmalloc_limit)
highmem = 1;
else
size_limit = vmalloc_limit - reg->base;
/* Highmem cannot be used: drop or truncate whatever lies above the limit. */
if (!IS_ENABLED(CONFIG_HIGHMEM) || cache_is_vipt_aliasing()) {
if (highmem) {
pr_notice("Ignoring RAM at %pa-%pa (!CONFIG_HIGHMEM)\n",
&block_start, &block_end);
memblock_remove(reg->base, reg->size);
should_use_highmem = true;
continue;
}
/* Region straddles vmalloc_limit: remove only the part above it. */
if (reg->size > size_limit) {
phys_addr_t overlap_size = reg->size - size_limit;
pr_notice("Truncating RAM at %pa-%pa to -%pa",
&block_start, &block_end, &vmalloc_limit);
memblock_remove(vmalloc_limit, overlap_size);
block_end = vmalloc_limit;
should_use_highmem = true;
}
}
if (!highmem) {
/*
 * Raise arm_lowmem_limit to the end of this block, capped at
 * vmalloc_limit when the region extends beyond it.
 */
if (block_end > arm_lowmem_limit) {
if (reg->size > size_limit)
arm_lowmem_limit = vmalloc_limit;
else
arm_lowmem_limit = block_end;
}
/*
 * Find the first non-pmd-aligned page, and point
 * memblock_limit at it. This relies on rounding the
 * limit down to be pmd-aligned, which happens at the
 * end of this function.
 *
 * With this algorithm, the start or end of almost any
 * bank can be non-pmd-aligned. The only exception is
 * that the start of the bank 0 must be section-
 * aligned, since otherwise memory would need to be
 * allocated when mapping the start of bank 0, which
 * occurs before any free memory is mapped.
 */
if (!memblock_limit) {
if (!IS_ALIGNED(block_start, PMD_SIZE))
memblock_limit = block_start;
else if (!IS_ALIGNED(block_end, PMD_SIZE))
memblock_limit = arm_lowmem_limit;
}
}
}
if (should_use_highmem)
pr_notice("Consider using a HIGHMEM enabled kernel.\n");
/*
 * high_memory is the virtual address matching arm_lowmem_limit; __va() is
 * applied to the last lowmem byte and 1 is added back afterwards.
 */
high_memory = __va(arm_lowmem_limit - 1) + 1;
/*
 * Round the memblock limit down to a pmd size. This
 * helps to ensure that we will allocate memory from the
 * last full pmd, which should be mapped.
 */
if (memblock_limit)
memblock_limit = round_down(memblock_limit, PMD_SIZE);
/* No limit found (or it rounded down to 0): fall back to the lowmem limit. */
if (!memblock_limit)
memblock_limit = arm_lowmem_limit;
memblock_set_current_limit(memblock_limit);
}
在了解此函数之前,我们熟悉下面几个函数
int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size,
int nid)
在系统启动过程中,内核会通过上面函数添加有效内存。内核有几处添加处理:
一. early_init_dt_add_memory_arch() --> memblock_add()
二. arm_add_memory() --> memblock_add()
三. 体系决定增加,直接调用函数memblock_add()
函数sanity_check_meminfo()作用很简单,对内存大小分析后设置high_memory和arm_lowmem_limit。