linux内存管理——内存初始化3—paging_init
文章目录
1 介绍
从Linux内存管理之物理内存初始化中,可知在paging_init调用之前,存放Kernel Image和DTB的两段物理内存区域可以访问了(相应的页表已经建立好)。尽管物理内存已经通过memblock_add添加进系统,但是这部分的物理内存到虚拟内存的映射还没有建立,可以通过memblock_alloc分配一段物理内存,但是还不能访问,一切还需要等待paging_init的执行。最终页表建立好后,可以通过虚拟地址去访问最终的物理地址了。
1.1 arm32页表映射
由于ARM32和Linux内核维护的页表项有所不同,所以维护了两套PTE。
PGD存放在swapper_pd_dir中,一个PGD目录项其实包含了两份ARM32 PGD。
所以再分配PTE的时候,共分配了1024个PTE,512个给Linux OS维护用;512个给ARM32 MMU用,对应两个PGD的页表数目。
由于Linux OS和ARM32的PTE紧邻,所以两者的转换也方便进行。
1.1.1 arm32页表映射过程
如果采用页表映射的方式,段映射表就变成一级映射表(Linux中称为PGD),其页表项提供的不再是物理地址,而是二级页表的基地址。32位虚拟地址的高12位(bit[31:20])作为访问一级页表的索引值,找到相应的表项,每个表项指向一个二级页表。以虚拟地址的次8位(bit[19:12])作为访问二级页表的索引值,得到相应的页表项,从这个页表项中找到20位的物理页面地址。最后将这20位物理页面地址和虚拟地址的低12位拼凑在一起,得到最终的32位物理地址。这个过程在ARM32架构中由MMU硬件完成,软件不需要接入。
1.1.2 ARMv7-AR中关于Short Descriptor映射概览图
关于4K页表的映射过程在ARMv7-AR用户架构手册有关介绍。
一个地址映射的概览图,32位虚拟地址从TTBR1中找到First-level table地址,然后取虚拟地址VA[31:20]作为序号找到Second-level table地址。
取虚拟地址VA[19:12]作为序号找到Page地址。
1.2 linux页表映射相关数据
1.2.1 mm_struct
1.2.1.1 什么是mm_struct
内存描述符也用一个结构体表示,这个结构体的名字叫做mm_struct(内存描述符),linux就是通过mm_struct这个结构体来实现内存管理。 一个进程的虚拟地址空间主要由两个数据结构来描述,一个是最高层次的mm_struct,一个是较高层次的vm_ares_struct。最高层次的mm_struct结构描述了一个进程的整个虚拟地址空间。较高层次的结构vm_area_struct描述了虚拟地址空间的一个区间(简称虚拟区)。每个进程只有一个mm_struct结构,在每个进程的task_struct结构体中,有一个指向该进程的结构。可以说,mm_struct结构是对整个用户空间的描述。
1.2.1.2 mm_struct的定义
mm_struct定义在include/linux/mm_types.h
struct mm_struct {
struct {
struct vm_area_struct *mmap; /* 指向VMAs系统列表*/ /* list of VMAs */
struct rb_root mm_rb; /*指向red_black树*/
u64 vmacache_seqnum; /* per-thread vmacache */
#ifdef CONFIG_MMU
unsigned long (*get_unmapped_area) (struct file *filp,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags);
#endif
unsigned long mmap_base; /* base of mmap area */
unsigned long mmap_legacy_base; /* base of mmap area in bottom-up allocations */
#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
/* Base adresses for compatible mmap() */
unsigned long mmap_compat_base;
unsigned long mmap_compat_legacy_base;
#endif
unsigned long task_size; /* size of task vm space */
unsigned long highest_vm_end; /* highest vma end address */
pgd_t * pgd; /*指向进程的页目录*/
#ifdef CONFIG_MEMBARRIER
/**
* @membarrier_state: Flags controlling membarrier behavior.
*
* This field is close to @pgd to hopefully fit in the same
* cache-line, which needs to be touched by switch_mm().
*/
atomic_t membarrier_state;
#endif
/**
* @mm_users: The number of users including userspace.
*
* Use mmget()/mmget_not_zero()/mmput() to modify. When this
* drops to 0 (i.e. when the task exits and there are no other
* temporary reference holders), we also release a reference on
* @mm_count (which may then free the &struct mm_struct if
* @mm_count also drops to 0).
*/
atomic_t mm_users; /* 用户空间中的有多少用户*/
/**
* @mm_count: The number of references to &struct mm_struct
* (@mm_users count as 1).
*
* Use mmgrab()/mmdrop() to modify. When this drops to 0, the
* &struct mm_struct is freed.
*/
atomic_t mm_count; /* 对"struct mm_struct"有多少引用*/
#ifdef CONFIG_MMU
atomic_long_t pgtables_bytes; /* PTE page table pages */
#endif
int map_count; /*虚拟区间个数*/ /* number of VMAs */
spinlock_t page_table_lock; /* 保护任务页表和mm->rss*/
/* Protects page tables and some
* counters
*/
struct rw_semaphore mmap_sem;
struct list_head mmlist; /* List of maybe swapped mm's. These
* are globally strung together off
* init_mm.mmlist, and are protected
* by mmlist_lock
*/
unsigned long hiwater_rss; /*进程所拥有的最大页框数*//* High-watermark of RSS usage */
unsigned long hiwater_vm; /* 进程现行区最大页数*//* High-water virtual memory usage */
/*
total_vm 进程地址空间的大小(页数)
locked_vm 锁住而不能换出的页的个数
*/
unsigned long total_vm; /* Total pages mapped */
unsigned long locked_vm; /* Pages that have PG_mlocked set */
atomic64_t pinned_vm; /* Refcount permanently increased */
unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */
unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */
unsigned long stack_vm; /* VM_STACK */
unsigned long def_flags; //线性区默认的访问标志
spinlock_t arg_lock; /* protect the below fields */
/*
start_code 代码段起始地址,
end_code 代码段结束地址,
start_data 数据段起始地址,
start_end 数据段结束地址
*/
unsigned long start_code, end_code, start_data, end_data;
/*
start_brk 和brk记录有关堆的信息,
start_brk是用户虚拟地址空间初始化时堆的结束地址,
brk 是当前堆的结束地址
start_stack 是栈的起始地址
*/
unsigned long start_brk, brk, start_stack;
/*
arg_start参数段的起始地址,
arg_end 参数段的结束地址,
env_start 环境段的起始地址,
env_end 环境段的结束地址
*/
unsigned long arg_start, arg_end, env_start, env_end;
/* 开始执行ELF程序时会使用到saved_auxv参数 */
unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
/*
* Special counters, in some configurations protected by the
* page_table_lock, in other configurations by being atomic.
*/
struct mm_rss_stat rss_stat;
struct linux_binfmt *binfmt;
/* Architecture-specific MM context */
mm_context_t context;
unsigned long flags; /* Must use atomic bitops to access */
struct core_state *core_state; /* coredumping support */
#ifdef CONFIG_AIO
spinlock_t ioctx_lock;
struct kioctx_table __rcu *ioctx_table;
#endif
#ifdef CONFIG_MEMCG
/*
* "owner" points to a task that is regarded as the canonical
* user/owner of this mm. All of the following must be true in
* order for it to be changed:
*
* current == mm->owner
* current->mm != mm
* new_owner->mm == mm
* new_owner->alloc_lock is held
*/
struct task_struct __rcu *owner;
#endif
struct user_namespace *user_ns;
/* store ref to file /proc/<pid>/exe symlink points to */
struct file __rcu *exe_file;
#ifdef CONFIG_MMU_NOTIFIER
struct mmu_notifier_mm *mmu_notifier_mm;
#endif
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
pgtable_t pmd_huge_pte; /* protected by page_table_lock */
#endif
#ifdef CONFIG_NUMA_BALANCING
/*
* numa_next_scan is the next time that the PTEs will be marked
* pte_numa. NUMA hinting faults will gather statistics and
* migrate pages to new nodes if necessary.
*/
unsigned long numa_next_scan;
/* Restart point for scanning and setting pte_numa */
unsigned long numa_scan_offset;
/* numa_scan_seq prevents two threads setting pte_numa */
int numa_scan_seq;
#endif
/*
* An operation with batched TLB flushing is going on. Anything
* that can move process memory needs to flush the TLB when
* moving a PROT_NONE or PROT_NUMA mapped page.
*/
atomic_t tlb_flush_pending;
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
/* See flush_tlb_batched_pending() */
bool tlb_flush_batched;
#endif
struct uprobes_state uprobes_state;
#ifdef CONFIG_HUGETLB_PAGE
atomic_long_t hugetlb_usage;
#endif
struct work_struct async_put_work;
} __randomize_layout;
/*
* The mm_cpumask needs to be at the end of mm_struct, because it
* is dynamically sized based on nr_cpu_ids.
*/
unsigned long cpu_bitmap[];
};
1.2.1.3 init_mm初始化
init_mm是在mm/init-mm.c中初始化
/*
* For dynamically allocated mm_structs, there is a dynamically sized cpumask
* at the end of the structure, the size of which depends on the maximum CPU
* number the system can see. That way we allocate only as much memory for
* mm_cpumask() as needed for the hundreds, or thousands of processes that
* a system typically runs.
*
* Since there is only one init_mm in the entire system, keep it simple
* and size this cpu_bitmask to NR_CPUS.
*/
struct mm_struct init_mm = {
.mm_rb = RB_ROOT,
.pgd = swapper_pg_dir,
.mm_users = ATOMIC_INIT(2),
.mm_count = ATOMIC_INIT(1),
.mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
.page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
.arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
.user_ns = &init_user_ns,
.cpu_bitmap = CPU_BITS_NONE,
INIT_MM_CONTEXT(init_mm)
};
1.2.3 map_desc
arch\arm\include\asm\mach\map.h:
struct map_desc {
unsigned long virtual;------虚拟地址起始地址
unsigned long pfn;----------物理地址开始页帧号
unsigned long length;-------内存空间大小
unsigned int type;----------mem_types中的序号
};
map_desc中的type指向类型为struct mem_type的mem_types数组。
1.2.4 mem_type
mem_type的定义:arch/arm/mm/mm.h
struct mem_type {
pteval_t prot_pte; //二级页表(PTE)的属性
pteval_t prot_pte_s2; //定义CONFIG_ARM_LPAE才有效
pmdval_t prot_l1; //二级映射中的一级页表属性
pmdval_t prot_sect; //一级页表属性,只一级映射
unsigned int domain; //映射的页框所属的domain
};
mem_types的初始化:arch/arm/mm/mmu.c
static struct mem_type mem_types[] __ro_after_init = {
[MT_DEVICE] = { /* Strongly ordered / ARMv6 shared device */
.prot_pte = PROT_PTE_DEVICE | L_PTE_MT_DEV_SHARED |
L_PTE_SHARED,
.prot_pte_s2 = s2_policy(PROT_PTE_S2_DEVICE) |
s2_policy(L_PTE_S2_MT_DEV_SHARED) |
L_PTE_SHARED,
.prot_l1 = PMD_TYPE_TABLE,
.prot_sect = PROT_SECT_DEVICE | PMD_SECT_S,
.domain = DOMAIN_IO,
},
[MT_DEVICE_NONSHARED] = { /* ARMv6 non-shared device */
.prot_pte = PROT_PTE_DEVICE | L_PTE_MT_DEV_NONSHARED,
.prot_l1 = PMD_TYPE_TABLE,
.prot_sect = PROT_SECT_DEVICE,
.domain = DOMAIN_IO,
},
[MT_DEVICE_CACHED] = { /* ioremap_cached */
.prot_pte = PROT_PTE_DEVICE | L_PTE_MT_DEV_CACHED,
.prot_l1 = PMD_TYPE_TABLE,
.prot_sect = PROT_SECT_DEVICE | PMD_SECT_WB,
.domain = DOMAIN_IO,
},
[MT_DEVICE_WC] = { /* ioremap_wc */
.prot_pte = PROT_PTE_DEVICE | L_PTE_MT_DEV_WC,
.prot_l1 = PMD_TYPE_TABLE,
.prot_sect = PROT_SECT_DEVICE,
.domain = DOMAIN_IO,
},
[MT_UNCACHED] = {
.prot_pte = PROT_PTE_DEVICE,
.prot_l1 = PMD_TYPE_TABLE,
.prot_sect = PMD_TYPE_SECT | PMD_SECT_XN,
.domain = DOMAIN_IO,
},
[MT_CACHECLEAN] = {
.prot_sect = PMD_TYPE_SECT | PMD_SECT_XN,
.domain = DOMAIN_KERNEL,
},
#ifndef CONFIG_ARM_LPAE
[MT_MINICLEAN] = {
.prot_sect = PMD_TYPE_SECT | PMD_SECT_XN | PMD_SECT_MINICACHE,
.domain = DOMAIN_KERNEL,
},
#endif
[MT_LOW_VECTORS] = {
.prot_pte = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
L_PTE_RDONLY,
.prot_l1 = PMD_TYPE_TABLE,
.domain = DOMAIN_VECTORS,
},
[MT_HIGH_VECTORS] = {
.prot_pte = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
L_PTE_USER | L_PTE_RDONLY,
.prot_l1 = PMD_TYPE_TABLE,
.domain = DOMAIN_VECTORS,
},
[MT_MEMORY_RWX] = {
.prot_pte = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY,
.prot_l1 = PMD_TYPE_TABLE,
.prot_sect = PMD_TYPE_SECT | PMD_SECT_AP_WRITE,
.domain = DOMAIN_KERNEL,
},
[MT_MEMORY_RW] = {
.prot_pte = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
L_PTE_XN,
.prot_l1 = PMD_TYPE_TABLE,
.prot_sect = PMD_TYPE_SECT | PMD_SECT_AP_WRITE,
.domain = DOMAIN_KERNEL,
},
[MT_ROM] = {
.prot_sect = PMD_TYPE_SECT,
.domain = DOMAIN_KERNEL,
},
[MT_MEMORY_RWX_NONCACHED] = {
.prot_pte = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
L_PTE_MT_BUFFERABLE,
.prot_l1 = PMD_TYPE_TABLE,
.prot_sect = PMD_TYPE_SECT | PMD_SECT_AP_WRITE,
.domain = DOMAIN_KERNEL,
},
[MT_MEMORY_RW_DTCM] = {
.prot_pte = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
L_PTE_XN,
.prot_l1 = PMD_TYPE_TABLE,
.prot_sect = PMD_TYPE_SECT | PMD_SECT_XN,
.domain = DOMAIN_KERNEL,
},
[MT_MEMORY_RWX_ITCM] = {
.prot_pte = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY,
.prot_l1 = PMD_TYPE_TABLE,
.domain = DOMAIN_KERNEL,
},
[MT_MEMORY_RW_SO] = {
.prot_pte = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
L_PTE_MT_UNCACHED | L_PTE_XN,
.prot_l1 = PMD_TYPE_TABLE,
.prot_sect = PMD_TYPE_SECT | PMD_SECT_AP_WRITE | PMD_SECT_S |
PMD_SECT_UNCACHED | PMD_SECT_XN,
.domain = DOMAIN_KERNEL,
},
[MT_MEMORY_DMA_READY] = {
.prot_pte = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
L_PTE_XN,
.prot_l1 = PMD_TYPE_TABLE,
.domain = DOMAIN_KERNEL,
},
};
1.3 arm32二级页表PTE的定义
下面是Second-level descriptor详细说明:
由于Linux对于PTE的定义和ARM硬件不一致,下面的L_开头的定义都是针对Linux的,L_MT开头的是bit[5:2]表示的内存类型。
//arch/arm/include/asm/pgtable-2level.h
/*
* "Linux" PTE definitions.
*
* We keep two sets of PTEs - the hardware and the linux version.
* This allows greater flexibility in the way we map the Linux bits
* onto the hardware tables, and allows us to have YOUNG and DIRTY
* bits.
*
* The PTE table pointer refers to the hardware entries; the "Linux"
* entries are stored 1024 bytes below.
*/
#define L_PTE_VALID (_AT(pteval_t, 1) << 0) /* Valid */
#define L_PTE_PRESENT (_AT(pteval_t, 1) << 0)
#define L_PTE_YOUNG (_AT(pteval_t, 1) << 1)
#define L_PTE_DIRTY (_AT(pteval_t, 1) << 6)
#define L_PTE_RDONLY (_AT(pteval_t, 1) << 7)
#define L_PTE_USER (_AT(pteval_t, 1) << 8)
#define L_PTE_XN (_AT(pteval_t, 1) << 9)
#define L_PTE_SHARED (_AT(pteval_t, 1) << 10) /* shared(v6), coherent(xsc3) */
#define L_PTE_NONE (_AT(pteval_t, 1) << 11)
/*
* These are the memory types, defined to be compatible with
* pre-ARMv6 CPUs cacheable and bufferable bits: n/a,n/a,C,B
* ARMv6+ without TEX remapping, they are a table index.
* ARMv6+ with TEX remapping, they correspond to n/a,TEX(0),C,B
*
* MT type Pre-ARMv6 ARMv6+ type / cacheable status
* UNCACHED Uncached Strongly ordered
* BUFFERABLE Bufferable Normal memory / non-cacheable
* WRITETHROUGH Writethrough Normal memory / write through
* WRITEBACK Writeback Normal memory / write back, read alloc
* MINICACHE Minicache N/A
* WRITEALLOC Writeback Normal memory / write back, write alloc
* DEV_SHARED Uncached Device memory (shared)
* DEV_NONSHARED Uncached Device memory (non-shared)
* DEV_WC Bufferable Normal memory / non-cacheable
* DEV_CACHED Writeback Normal memory / write back, read alloc
* VECTORS Variable Normal memory / variable
*
* All normal memory mappings have the following properties:
* - reads can be repeated with no side effects
* - repeated reads return the last value written
* - reads can fetch additional locations without side effects
* - writes can be repeated (in certain cases) with no side effects
* - writes can be merged before accessing the target
* - unaligned accesses can be supported
*
* All device mappings have the following properties:
* - no access speculation
* - no repetition (eg, on return from an exception)
* - number, order and size of accesses are maintained
* - unaligned accesses are "unpredictable"
*/
#define L_PTE_MT_UNCACHED (_AT(pteval_t, 0x00) << 2) /* 0000 */
#define L_PTE_MT_BUFFERABLE (_AT(pteval_t, 0x01) << 2) /* 0001 */
#define L_PTE_MT_WRITETHROUGH (_AT(pteval_t, 0x02) << 2) /* 0010 */
#define L_PTE_MT_WRITEBACK (_AT(pteval_t, 0x03) << 2) /* 0011 */
#define L_PTE_MT_MINICACHE (_AT(pteval_t, 0x06) << 2) /* 0110 (sa1100, xscale) */
#define L_PTE_MT_WRITEALLOC (_AT(pteval_t, 0x07) << 2) /* 0111 */
#define L_PTE_MT_DEV_SHARED (_AT(pteval_t, 0x04) << 2) /* 0100 */
#define L_PTE_MT_DEV_NONSHARED (_AT(pteval_t, 0x0c) << 2) /* 1100 */
#define L_PTE_MT_DEV_WC (_AT(pteval_t, 0x09) << 2) /* 1001 */
#define L_PTE_MT_DEV_CACHED (_AT(pteval_t, 0x0b) << 2) /* 1011 */
#define L_PTE_MT_VECTORS (_AT(pteval_t, 0x0f) << 2) /* 1111 */
#define L_PTE_MT_MASK (_AT(pteval_t, 0x0f) << 2)
2 paging_init分析
源码:arch/arm/mm/mmu.c
/*
* paging_init() sets up the page tables, initialises the zone memory
* maps, and sets up the zero page, bad page and bad page tables.
*/
void __init paging_init(const struct machine_desc *mdesc)
{
void *zero_page;
prepare_page_table();//清楚pmd页表项
map_lowmem();//配置页表
memblock_set_current_limit(arm_lowmem_limit);
dma_contiguous_remap();//cma内存映射
early_fixmap_shutdown();
devicemaps_init(mdesc);//device map
kmap_init();
tcm_init();
top_pmd = pmd_off_k(0xffff0000);
/* allocate the zero page. */
zero_page = early_alloc(PAGE_SIZE);
bootmem_init();
empty_zero_page = virt_to_page(zero_page);
__flush_dcache_page(NULL, empty_zero_page);
/* Compute the virt/idmap offset, mostly for the sake of KVM */
kimage_voffset = (unsigned long)&kimage_voffset - virt_to_idmap(&kimage_voffset);
}
2.1 prepare_page_table分析
源码:arch/arm/mm/mmu.c
static inline void prepare_page_table(void)
{
unsigned long addr;
phys_addr_t end;
/*
* Clear out all the mappings below the kernel image.
*/
/*
* 以步长2M,清除0x00~0xC000_0000的pmd页表项
*/
for (addr = 0; addr < MODULES_VADDR; addr += PMD_SIZE)
pmd_clear(pmd_off_k(addr));
#ifdef CONFIG_XIP_KERNEL
/* The XIP kernel is mapped in the module area -- skip over it */
addr = ((unsigned long)_exiprom + PMD_SIZE - 1) & PMD_MASK;
#endif
/*
* 以步长2M,清除addr~0xC000_0000的pmd页表项
*/
for ( ; addr < PAGE_OFFSET; addr += PMD_SIZE)
pmd_clear(pmd_off_k(addr));
/*
* Find the end of the first block of lowmem.
*/
end = memblock.memory.regions[0].base + memblock.memory.regions[0].size;
if (end >= arm_lowmem_limit)
end = arm_lowmem_limit;
/*
* Clear out all the kernel space mappings, except for the first
* memory bank, up to the vmalloc region.
*/
/*
* #define VMALLOC_START (((unsigned long)high_memory + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1))
* 详见:adjust_lowmem_bounds()函数
*/
for (addr = __phys_to_virt(end);
addr < VMALLOC_START; addr += PMD_SIZE)
pmd_clear(pmd_off_k(addr));
}
2.2 map_lowmem分析
源码:arch/arm/mm/mmu.c
static void __init map_lowmem(void)
{
struct memblock_region *reg;
phys_addr_t kernel_x_start = round_down(__pa(KERNEL_START), SECTION_SIZE);
phys_addr_t kernel_x_end = round_up(__pa(__init_end), SECTION_SIZE);
/* Map all the lowmem memory banks. */
/* 遍历memblock.memory.regions下的所有物理内存 */
for_each_memblock(memory, reg) {
phys_addr_t start = reg->base;
phys_addr_t end = start + reg->size;
struct map_desc map;
if (memblock_is_nomap(reg))
continue;
if (end > arm_lowmem_limit)
end = arm_lowmem_limit;
if (start >= end)
break;
if (end < kernel_x_start) {
//小于内核代码段的空间, kernel_x_start = 0x1000_8000
//内存类型为:MT_MEMORY_RWX
map.pfn = __phys_to_pfn(start);
map.virtual = __phys_to_virt(start);
map.length = end - start;
map.type = MT_MEMORY_RWX;
create_mapping(&map);
} else if (start >= kernel_x_end) {
//大于内核代码段的空间,内存类型为:MT_MEMORY_RW
map.pfn = __phys_to_pfn(start);
map.virtual = __phys_to_virt(start);
map.length = end - start;
map.type = MT_MEMORY_RW;
create_mapping(&map);
} else {
//其他空间,分三段创建
/* This better cover the entire kernel */
if (start < kernel_x_start) {
map.pfn = __phys_to_pfn(start);
map.virtual = __phys_to_virt(start);
map.length = kernel_x_start - start;
map.type = MT_MEMORY_RW;
create_mapping(&map);
}
map.pfn = __phys_to_pfn(kernel_x_start);
map.virtual = __phys_to_virt(kernel_x_start);
map.length = kernel_x_end - kernel_x_start;
map.type = MT_MEMORY_RWX;
create_mapping(&map);
if (kernel_x_end < end) {
map.pfn = __phys_to_pfn(kernel_x_end);
map.virtual = __phys_to_virt(kernel_x_end);
map.length = end - kernel_x_end;
map.type = MT_MEMORY_RW;
create_mapping(&map);
}
}
}
}
2.3 create_mapping分析
create_mapping() --> __create_mapping() --> alloc_init_pud() --> alloc_init_pmd() --> alloc_init_pte() --> set_pte_ext() -->
cpu_v7_set_pte_ext()
源码:arch/arm/mm/mmu.c
/*
* Create the page directory entries and any necessary
* page tables for the mapping specified by `md'. We
* are able to cope here with varying sizes and address
* offsets, and we take full advantage of sections and
* supersections.
*/
static void __init create_mapping(struct map_desc *md)
{
//md->virtual = 0xffff0000,又md->virtual < 0xc000_000,则出错
if (md->virtual != vectors_base() && md->virtual < TASK_SIZE) {
pr_warn("BUG: not creating mapping for 0x%08llx at 0x%08lx in user region\n",
(long long)__pfn_to_phys((u64)md->pfn), md->virtual);
return;
}
//判断type或空间是否合理
if ((md->type == MT_DEVICE || md->type == MT_ROM) &&
md->virtual >= PAGE_OFFSET && md->virtual < FIXADDR_START &&
(md->virtual < VMALLOC_START || md->virtual >= VMALLOC_END)) {
pr_warn("BUG: mapping for 0x%08llx at 0x%08lx out of vmalloc space\n",
(long long)__pfn_to_phys((u64)md->pfn), md->virtual);
}
__create_mapping(&init_mm, md, early_alloc, false);
}
__create_mapping分析:
源码:arch/arm/mm/mmu.c
static void __init __create_mapping(struct mm_struct *mm, struct map_desc *md,
void *(*alloc)(unsigned long sz),
bool ng)
{
unsigned long addr, length, end;
phys_addr_t phys;
const struct mem_type *type;
pgd_t *pgd;
//根据map_desc,找到对应的struct mem_type,获取页表属性配置
type = &mem_types[md->type];
#ifndef CONFIG_ARM_LPAE
/*
* Catch 36-bit addresses
*/
if (md->pfn >= 0x100000) {
create_36bit_mapping(mm, md, type, ng);
return;
}
#endif
addr = md->virtual & PAGE_MASK; //虚拟地址页对齐
phys = __pfn_to_phys(md->pfn); //页地址转换为物理地址
length = PAGE_ALIGN(md->length + (md->virtual & ~PAGE_MASK));
if (type->prot_l1 == 0 && ((addr | phys | length) & ~SECTION_MASK)) {
pr_warn("BUG: map for 0x%08llx at 0x%08lx can not be mapped using pages, ignoring.\n",
(long long)__pfn_to_phys(md->pfn), addr);
return;
}
//根据addr找到对应虚拟地址对应的pgd地址
//源码:arch/arm/include/asm/pgtable.h
//#define pgd_index(addr) ((addr) >> PGDIR_SHIFT)
//#define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr))
//即:init_mm.pgd + (addr >> 21)
pgd = pgd_offset(mm, addr);
end = addr + length;
do {
unsigned long next = pgd_addr_end(addr, end);
//初始化下一级页表
alloc_init_pud(pgd, addr, next, phys, type, alloc, ng);
phys += next - addr;
addr = next;
} while (pgd++, addr != end);//遍历区间地址,步长是PGDIR_SIZE,即2MB大小的空间
}
alloc_init_pud分析:
源码:arch/arm/mm/mmu.c
static void __init alloc_init_pud(pgd_t *pgd, unsigned long addr,
unsigned long end, phys_addr_t phys,
const struct mem_type *type,
void *(*alloc)(unsigned long sz), bool ng)
{
//arm32无pud,则跳过pud,pud = pgd
//详见:include/asm-generic/pgtable-nop4d-hack.h
pud_t *pud = pud_offset(pgd, addr);
unsigned long next;
do {
next = pud_addr_end(addr, end);
alloc_init_pmd(pud, addr, next, phys, type, alloc, ng);
phys += next - addr;
} while (pud++, addr = next, addr != end);
}
alloc_init_pmd分析:
源码:arch/arm/mm/mmu.c
static void __init alloc_init_pmd(pud_t *pud, unsigned long addr,
unsigned long end, phys_addr_t phys,
const struct mem_type *type,
void *(*alloc)(unsigned long sz), bool ng)
{
//arm32没有使能CONFIG_ARM_LPAE,则跳过pmd,pmd = pud = pgd
//详见:arch/arm/include/asm/pgtable-2level.h
pmd_t *pmd = pmd_offset(pud, addr);
unsigned long next;
do {
/*
* With LPAE, we must loop over to map
* all the pmds for the given range.
*/
next = pmd_addr_end(addr, end);
/*
* Try a section mapping - addr, next and phys must all be
* aligned to a section boundary.
*/
if (type->prot_sect &&
((addr | next | phys) & ~SECTION_MASK) == 0) {
__map_init_section(pmd, addr, next, phys, type, ng);
} else {
alloc_init_pte(pmd, addr, next,
__phys_to_pfn(phys), type, alloc, ng);
}
phys += next - addr;
} while (pmd++, addr = next, addr != end);
}
2.4 alloc_init_pte分析
源码:arch/arm/mm/mmu.c
static void __init alloc_init_pte(pmd_t *pmd, unsigned long addr,
unsigned long end, unsigned long pfn,
const struct mem_type *type,
void *(*alloc)(unsigned long sz),
bool ng)
{
//使用prot_l1作为参数,创建PGD页表目录,返回addr对应的pte地址
pte_t *pte = arm_pte_alloc(pmd, addr, type->prot_l1, alloc);
do {
//调用体系结构相关汇编,配置PTE
set_pte_ext(pte, pfn_pte(pfn, __pgprot(type->prot_pte)),
ng ? PTE_EXT_NG : 0);
pfn++;
} while (pte++, addr += PAGE_SIZE, addr != end);//遍历[addr, end)区间内存,以PAGE_SIZE为步长
}
arm_pte_alloc
源码:arch/arm/mm/mmu.c
static pte_t * __init arm_pte_alloc(pmd_t *pmd, unsigned long addr,
unsigned long prot,
void *(*alloc)(unsigned long sz))
{
//如果PGD的内容为空,即PTE还没有创建,择取建立页面
if (pmd_none(*pmd)) {
//#define PTRS_PER_PTE 512
//#define PTRS_PER_PMD 1
//#define PTRS_PER_PGD 2048
//#define PTE_HWTABLE_PTRS (PTRS_PER_PTE)
//#define PTE_HWTABLE_OFF (PTE_HWTABLE_PTRS * sizeof(pte_t))
//#define PTE_HWTABLE_SIZE (PTRS_PER_PTE * sizeof(u32))
pte_t *pte = alloc(PTE_HWTABLE_OFF + PTE_HWTABLE_SIZE);//size=4096B
__pmd_populate(pmd, __pa(pte), prot);
}
BUG_ON(pmd_bad(*pmd));
return pte_offset_kernel(pmd, addr);//返回当前addr对应的PTE地址
}
alloc函数原型——early_alloc
源码:arch/arm/mm/mmu.c
static void __init *early_alloc(unsigned long sz)
{
void *ptr = memblock_alloc(sz, sz);//使用memblock分配内存
if (!ptr)
panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
__func__, sz, sz);
return ptr;
}
__pmd_populate分析:
Linux内核PGD/PTE映射关系
源码:arch/arm/include/asm/pgalloc.h
static inline void __pmd_populate(pmd_t *pmdp, phys_addr_t pte,
pmdval_t prot)
{
pmdval_t pmdval = (pte + PTE_HWTABLE_OFF) | prot; //生成拼命地跑[0]的内容
pmdp[0] = __pmd(pmdval);
#ifndef CONFIG_ARM_LPAE
pmdp[1] = __pmd(pmdval + 256 * sizeof(pte_t)); //生成紧邻的pmdp[1]的内容
#endif
flush_pmd_entry(pmdp);//将pmdp两个刷入到RAM中
}
Linux的PGD页表目录和ARM32不同,总数和ARM32是一样的。
2.5 pgd、pud、pmd
pgd_offset
源码:arch/arm/include/asm/pgtable.h
/* to find an entry in a page-table-directory */
#define pgd_index(addr) ((addr) >> PGDIR_SHIFT)
#define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr))
pgd_addr_end
源码:include/asm-generic/pgtable.h
#define pgd_addr_end(addr, end) \
({ unsigned long __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK; \
(__boundary - 1 < (end) - 1)? __boundary: (end); \
})
pud_offset
源码:include/asm-generic/pgtable-nop4d-hack.h
static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
{
return (pud_t *)pgd;
}
pud_addr_end
源码:include/asm-generic/pgtable-nop4d-hack.h
#define pud_addr_end(addr, end) (end)
pmd_offset
源码:arch/arm/include/asm/pgtable-2level.h
static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr)
{
return (pmd_t *)pud;
}
pmd_addr_end
源码:arch/arm/include/asm/pgtable-2level.h
/* we don't need complex calculations here as the pmd is folded into the pgd */
#define pmd_addr_end(addr,end) (end)
3 PTE页表项配置
cpu_v7_set_pte_ext分析
/*
* cpu_v7_set_pte_ext(ptep, pte)
*
* Set a level 2 translation table entry.
*
* - ptep - pointer to level 2 translation table entry
* (hardware version is stored at +2048 bytes) --------- 放入r0
* - pte - PTE value to store ------------------------------------ 放入r1
* - ext - value for extended PTE bits ---------------------------- 放入r2
*/
ENTRY(cpu_v7_set_pte_ext)
#ifdef CONFIG_MMU
str r1, [r0] @ linux version ------- 将r1的值存入r0地址的内存中
bic r3, r1, #0x000003f0 --------------- 清除r1的bit[9:4],存入r3
bic r3, r3, #PTE_TYPE_MASK --------- PTE_TYPE_MASK为0x03,即清除低2位
orr r3, r3, r2 --------------------------- r3与r2或,存入r3
orr r3, r3, #PTE_EXT_AP0 | 2 ---------- 这里将bit1和bit4置位,所以是Small page
tst r1, #1 << 4 ------------------------- 判断r1的bit4是否为0
orrne r3, r3, #PTE_EXT_TEX(1) ------ 设置TEX为1
eor r1, r1, #L_PTE_DIRTY
tst r1, #L_PTE_RDONLY | L_PTE_DIRTY
orrne r3, r3, #PTE_EXT_APX -------- 设置AP[2]
tst r1, #L_PTE_USER
orrne r3, r3, #PTE_EXT_AP1 -------- 设置AP[1]
tst r1, #L_PTE_XN
orrne r3, r3, #PTE_EXT_XN --------- 设置XN位
tst r1, #L_PTE_YOUNG
tstne r1, #L_PTE_VALID
eorne r1, r1, #L_PTE_NONE
tstne r1, #L_PTE_NONE
moveq r3, #0
ARM( str r3, [r0, #2048]! ) -------------- 并没有写入r0,而是写入r0+2048Bytes的偏移
THUMB( add r0, r0, #2048 )
THUMB( str r3, [r0] )
ALT_SMP(W(nop))
ALT_UP (mcr p15, 0, r0, c7, c10, 1) @ flush_pte
#endif
bx lr
ENDPROC(cpu_v7_set_pte_ext)
4 dma_contiguous_remap分析
本函数主要是对cma的内存区域做二次映射,并将其加入到start_vm中。
源码:arch/arm/mm/dma-mapping.h
void __init dma_contiguous_remap(void)
{
int i;
for (i = 0; i < dma_mmu_remap_num; i++) {//根据dma_remap的数量重新映射
phys_addr_t start = dma_mmu_remap[i].base;
phys_addr_t end = start + dma_mmu_remap[i].size;
struct map_desc map;
unsigned long addr;
//内存的有效区间判断
if (end > arm_lowmem_limit)
end = arm_lowmem_limit;
if (start >= end)
continue;
map.pfn = __phys_to_pfn(start); //获取页物理地址
map.virtual = __phys_to_virt(start); //获取虚拟地址
map.length = end - start;
map.type = MT_MEMORY_DMA_READY;
/*
* Clear previous low-memory mapping to ensure that the
* TLB does not see any conflicting entries, then flush
* the TLB of the old entries before creating new mappings.
*
* This ensures that any speculatively loaded TLB entries
* (even though they may be rare) can not cause any problems,
* and ensures that this code is architecturally compliant.
*/
for (addr = __phys_to_virt(start); addr < __phys_to_virt(end);
addr += PMD_SIZE)
pmd_clear(pmd_off_k(addr));
//刷新cache
flush_tlb_kernel_range(__phys_to_virt(start),
__phys_to_virt(end));
iotable_init(&map, 1); //重新映射该段内存区域
}
}
iotable_init分析:
源码:arch/arm/mm/mmu.c
/*
* Create the architecture specific mappings
*/
void __init iotable_init(struct map_desc *io_desc, int nr)
{
struct map_desc *md;
struct vm_struct *vm;
struct static_vm *svm;
if (!nr)
return;
svm = memblock_alloc(sizeof(*svm) * nr, __alignof__(*svm));
if (!svm)
panic("%s: Failed to allocate %zu bytes align=0x%zx\n",
__func__, sizeof(*svm) * nr, __alignof__(*svm));
for (md = io_desc; nr; md++, nr--) {
create_mapping(md);重新映射该段内存
vm = &svm->vm;
vm->addr = (void *)(md->virtual & PAGE_MASK);
vm->size = PAGE_ALIGN(md->length + (md->virtual & ~PAGE_MASK));
vm->phys_addr = __pfn_to_phys(md->pfn);
vm->flags = VM_IOREMAP | VM_ARM_STATIC_MAPPING;
vm->flags |= VM_ARM_MTYPE(md->type);
vm->caller = iotable_init;
add_static_vm_early(svm++);//将该段内存加入到静态虚拟内存中
}
}
5 early_fixmap_shutdown分析
本函数主要是完成fixaddr的二次映射。
源码:arch/arm/mm/mmu.c
static void __init early_fixmap_shutdown(void)
{
int i;
unsigned long va = fix_to_virt(__end_of_permanent_fixed_addresses - 1);//va = 0xffc00000UL
pte_offset_fixmap = pte_offset_late_fixmap;
pmd_clear(fixmap_pmd(va));
local_flush_tlb_kernel_page(va);
for (i = 0; i < __end_of_permanent_fixed_addresses; i++) {
pte_t *pte;
struct map_desc map;
map.virtual = fix_to_virt(i);
pte = pte_offset_early_fixmap(pmd_off_k(map.virtual), map.virtual);
/* Only i/o device mappings are supported ATM */
if (pte_none(*pte) ||
(pte_val(*pte) & L_PTE_MT_MASK) != L_PTE_MT_DEV_SHARED)
continue;
map.pfn = pte_pfn(*pte);
map.type = MT_DEVICE;
map.length = PAGE_SIZE;
create_mapping(&map);//以步长=PAGE_SIZE,对fixaddr进行映射
}
}
6 devicemaps_init分析
本函数主要功能为vector迁移及映射,vmalloc映射,io映射
/*
* Set up the device mappings. Since we clear out the page tables for all
* mappings above VMALLOC_START, except early fixmap, we might remove debug
* device mappings. This means earlycon can be used to debug this function
* Any other function or debugging method which may touch any device _will_
* crash the kernel.
*/
static void __init devicemaps_init(const struct machine_desc *mdesc)
{
struct map_desc map;
unsigned long addr;
void *vectors;
/*
* Allocate the vector page early.
*/
vectors = early_alloc(PAGE_SIZE * 2); //从memblock申请2*4K,用作中断向量表
early_trap_init(vectors); //中断向量表的拷贝
/*
* Clear page table except top pmd used by early fixmaps
*/
//清除VMALLOC_START~(FIXADDR_TOP - 4K)内存域的映射
for (addr = VMALLOC_START; addr < (FIXADDR_TOP & PMD_MASK); addr += PMD_SIZE)
pmd_clear(pmd_off_k(addr));
/*
* Map the kernel if it is XIP.
* It is always first in the modulearea.
*/
#ifdef CONFIG_XIP_KERNEL
map.pfn = __phys_to_pfn(CONFIG_XIP_PHYS_ADDR & SECTION_MASK);
map.virtual = MODULES_VADDR;
map.length = ((unsigned long)_exiprom - map.virtual + ~SECTION_MASK) & SECTION_MASK;
map.type = MT_ROM;
create_mapping(&map);
#endif
/*
* Map the cache flushing regions.
*/
#ifdef FLUSH_BASE
map.pfn = __phys_to_pfn(FLUSH_BASE_PHYS);
map.virtual = FLUSH_BASE;
map.length = SZ_1M;
map.type = MT_CACHECLEAN;
create_mapping(&map);
#endif
#ifdef FLUSH_BASE_MINICACHE
map.pfn = __phys_to_pfn(FLUSH_BASE_PHYS + SZ_1M);
map.virtual = FLUSH_BASE_MINICACHE;
map.length = SZ_1M;
map.type = MT_MINICLEAN;
create_mapping(&map);
#endif
/*
* Create a mapping for the machine vectors at the high-vectors
* location (0xffff0000). If we aren't using high-vectors, also
* create a mapping at the low-vectors virtual address.
*/
//将vectors固定映射到高端内存的0xffff0000,大小为4K
map.pfn = __phys_to_pfn(virt_to_phys(vectors));
map.virtual = 0xffff0000;
map.length = PAGE_SIZE;
#ifdef CONFIG_KUSER_HELPERS
map.type = MT_HIGH_VECTORS;
#else
map.type = MT_LOW_VECTORS;
#endif
create_mapping(&map);
if (!vectors_high()) {
map.virtual = 0;
map.length = PAGE_SIZE * 2;
map.type = MT_LOW_VECTORS;
create_mapping(&map);
}
/* Now create a kernel read-only mapping */
//将vectors的剩余4K固定映射到高端内存的0xffff1000
map.pfn += 1;
map.virtual = 0xffff0000 + PAGE_SIZE;
map.length = PAGE_SIZE;
map.type = MT_LOW_VECTORS;
create_mapping(&map);
/*
* Ask the machine support to map in the statically mapped devices.
*/
//调用machine的map_io,具体看芯片厂家的代码
if (mdesc->map_io)
mdesc->map_io();
else
debug_ll_io_init();
fill_pmd_gaps();
/* Reserve fixed i/o space in VMALLOC region */
pci_reserve_io();
/*
* Finally flush the caches and tlb to ensure that we're in a
* consistent state wrt the writebuffer. This also ensures that
* any write-allocated cache lines in the vector page are written
* back. After this point, we can start to touch devices again.
*/
//更新本地cpu mmu中的tlb
local_flush_tlb_all();
//刷新cache
flush_cache_all();
/* Enable asynchronous aborts */
//使能abort中断
early_abt_enable();
}
early_trap_init分析:
void __init early_trap_init(void *vectors_base)
{
#ifndef CONFIG_CPU_V7M
unsigned long vectors = (unsigned long)vectors_base;
extern char __stubs_start[], __stubs_end[];
extern char __vectors_start[], __vectors_end[];
unsigned i;
vectors_page = vectors_base;
/*
* Poison the vectors page with an undefined instruction. This
* instruction is chosen to be undefined for both ARM and Thumb
* ISAs. The Thumb version is an undefined instruction with a
* branch back to the undefined instruction.
*/
for (i = 0; i < PAGE_SIZE / sizeof(u32); i++)
((u32 *)vectors_base)[i] = 0xe7fddef1;
/*
* Copy the vectors, stubs and kuser helpers (in entry-armv.S)
* into the vector page, mapped at 0xffff0000, and ensure these
* are visible to the instruction stream.
*/
memcpy((void *)vectors, __vectors_start, __vectors_end - __vectors_start);
memcpy((void *)vectors + 0x1000, __stubs_start, __stubs_end - __stubs_start);
kuser_init(vectors_base);
flush_icache_range(vectors, vectors + PAGE_SIZE * 2);
#else /* ifndef CONFIG_CPU_V7M */
/*
* on V7-M there is no need to copy the vector table to a dedicated
* memory area. The address is configurable and so a table in the kernel
* image can be used.
*/
#endif
}
7 kmap_init分析
源码:arch/arm/mm/mmu.c
static void __init kmap_init(void)
{
#ifdef CONFIG_HIGHMEM
pkmap_page_table = early_pte_alloc(pmd_off_k(PKMAP_BASE),
PKMAP_BASE, _PAGE_KERNEL_TABLE);
#endif
//FIXADDR pte页表内存申请
early_pte_alloc(pmd_off_k(FIXADDR_START), FIXADDR_START,
_PAGE_KERNEL_TABLE);
}
8 bootmem_init分析
源码:arch/arm/mm/init.c
void __init bootmem_init(void)
{
memblock_allow_resize();
find_limits(&min_low_pfn, &max_low_pfn, &max_pfn);
early_memtest((phys_addr_t)min_low_pfn << PAGE_SHIFT,
(phys_addr_t)max_low_pfn << PAGE_SHIFT);
/*
* Sparsemem tries to allocate bootmem in memory_present(),
* so must be done after the fixed reservations
*/
memblocks_present();
/*
* sparse_init() needs the bootmem allocator up and running.
*/
sparse_init();
/*
* Now free the memory - free_area_init_node needs
* the sparse mem_map arrays initialized by sparse_init()
* for memmap_init_zone(), otherwise all PFNs are invalid.
*/
zone_sizes_init(min_low_pfn, max_low_pfn, max_pfn);
}