Linux 3.10 paging_init: page table initialization in detail

32-bit ARM Linux uses two forms of page table mapping: section mapping and page mapping. In early boot all mappings are static, i.e. for a given block of physical memory the virtual address it maps to is specified in advance. Right after U-Boot jumps into Linux, section mapping is used first; the source is in arch/arm/kernel/head.S:

// the page directory's physical start address is PG_DIR_SIZE below the kernel text
.macro	pgtbl, rd, phys
	add	\rd, \phys, #TEXT_OFFSET - PG_DIR_SIZE
.endm

__create_page_tables:
	pgtbl	r4, r8			@ page table address (physical base where the page table is placed)

	/*
	 * Clear the swapper page table
	 */
	mov	r0, r4
	mov	r3, #0
	add	r6, r0, #PG_DIR_SIZE		@ clear the page directory, located PG_DIR_SIZE below the kernel image
1:	str	r3, [r0], #4
	str	r3, [r0], #4
	str	r3, [r0], #4
	str	r3, [r0], #4
	teq	r0, r6
	bne	1b



	ldr	r7, [r10, #PROCINFO_MM_MMUFLAGS] @ mm_mmuflags

	/*
	 * Create identity mapping to cater for __enable_mmu.
	 * This identity mapping will be removed by paging_init().
	 */
	adr	r0, __turn_mmu_on_loc
	ldmia	r0, {r3, r5, r6}		@ identity-map the code that turns the MMU on
	sub	r0, r0, r3			@ virt->phys offset
	add	r5, r5, r0			@ phys __turn_mmu_on
	add	r6, r6, r0			@ phys __turn_mmu_on_end
	mov	r5, r5, lsr #SECTION_SHIFT
	mov	r6, r6, lsr #SECTION_SHIFT

1:	orr	r3, r7, r5, lsl #SECTION_SHIFT	@ flags + kernel base
	str	r3, [r4, r5, lsl #PMD_ORDER]	@ identity mapping
	cmp	r5, r6                       
	addlo	r5, r5, #1			@ next section
	blo	1b

	/*
	 * Map our RAM from the start to the end of the kernel .bss section.
	 */
	add	r0, r4, #PAGE_OFFSET >> (SECTION_SHIFT - PMD_ORDER)
	ldr	r6, =(_end - 1)
	orr	r3, r8, r7			@ section-map the kernel image
	add	r6, r4, r6, lsr #(SECTION_SHIFT - PMD_ORDER)
1:	str	r3, [r0], #1 << PMD_ORDER
	add	r3, r3, #1 << SECTION_SHIFT
	cmp	r0, r6
	bls	1b



	/*
	 * Then map boot params address in r2 if specified.
	 * We map 2 sections in case the ATAGs/DTB crosses a section boundary.
	 */
	mov	r0, r2, lsr #SECTION_SHIFT
	movs	r0, r0, lsl #SECTION_SHIFT	@ section-map the DTB
	subne	r3, r0, r8
	addne	r3, r3, #PAGE_OFFSET
	addne	r3, r4, r3, lsr #(SECTION_SHIFT - PMD_ORDER)
	orrne	r6, r7, r0
	strne	r6, [r3], #1 << PMD_ORDER
	addne	r6, r6, #1 << SECTION_SHIFT
	strne	r6, [r3]


	mov	pc, lr
ENDPROC(__create_page_tables)
	.ltorg
	.align
__turn_mmu_on_loc:
	.long	.
	.long	__turn_mmu_on
	.long	__turn_mmu_on_end

The code above is fairly clear. To turn on the MMU, the page tables must be built in advance. In this early phase Linux uses section mapping, dividing the 4GB address space into 4096 sections of 1MB each, and maps three regions:

1. The code that turns the MMU on gets special treatment: it is identity-mapped (virtual address equal to physical address), so execution continues correctly across the moment the MMU is enabled.

2. The kernel image is section-mapped. Because the page directory and the start of the kernel text sit within the same 1MB sections, the page directory itself gets mapped as well.

3. The DTB is section-mapped.
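
Conceptually, each of these three mappings just fills 4-byte first-level descriptors, one per 1MB section. A minimal C sketch of what one entry write amounts to (map_section, pgd and mmuflags are illustrative names of mine, not the kernel's):

#include <stdint.h>

#define SECTION_SHIFT	20				/* 1MB sections */
#define SECTION_MASK	(~((1u << SECTION_SHIFT) - 1))

/* fill one first-level descriptor: section base | type/attribute flags;
 * pgd points at the 16KB directory (4096 x 4-byte entries) */
static void map_section(uint32_t *pgd, uint32_t va, uint32_t pa, uint32_t mmuflags)
{
	pgd[va >> SECTION_SHIFT] = (pa & SECTION_MASK) | mmuflags;
}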

 

After the system is up, the memblock reserved regions show how this memory was used:

 reserved[0x0]           [0x00000030104000-0x00000030107fff], 0x4000 bytes

 reserved[0x1]           [0x00000030108400-0x000000306487df], 0x5403e0 bytes

 reserved[0x2]           [0x00000033ffb000-0x00000033ffcfff], 0x2000 bytes

Region 0x0 is the page directory: 4096 four-byte entries, exactly 16KB. Region 0x1 is the kernel image, and region 0x2 is the physical memory holding the DTB. The mmu flags field in the CPU's proc_info containing the PMD_TYPE_SECT bit also confirms that section mapping is in use.

Next, let's see how start_kernel initializes the page tables a second time. The main code is the paging_init function called from setup_arch:

void __init paging_init(struct machine_desc *mdesc)
{
	void *zero_page;

	memblock_set_current_limit(arm_lowmem_limit);

	build_mem_type_table();  // builds the mem_types table for this ARM
	                         // architecture; each memory type gets its own
	                         // mapping style and attribute bits
	prepare_page_table();    // clears a range of page directory entries
	map_lowmem();            // section-maps the RAM once more
	dma_contiguous_remap();  // DMA-related mappings, not covered here
	devicemaps_init(mdesc);  // maps the vector page and the io space
	kmap_init();
	tcm_init();

	top_pmd = pmd_off_k(0xffff0000);

	/* allocate the zero page. */
	zero_page = early_alloc(PAGE_SIZE);

	bootmem_init();

	empty_zero_page = virt_to_page(zero_page);
	__flush_dcache_page(NULL, empty_zero_page);
}

First, prepare_page_table:

static inline void prepare_page_table(void)
{
	unsigned long addr;
	phys_addr_t end;

	/*
	 * Clear out all the mappings below the kernel image.
	 */
	for (addr = 0; addr < MODULES_VADDR; addr += PMD_SIZE)
		pmd_clear(pmd_off_k(addr));

#ifdef CONFIG_XIP_KERNEL
	/* The XIP kernel is mapped in the module area -- skip over it */
	addr = ((unsigned long)_etext + PMD_SIZE - 1) & PMD_MASK;
#endif
	for ( ; addr < PAGE_OFFSET; addr += PMD_SIZE)
		pmd_clear(pmd_off_k(addr));

	/*
	 * Find the end of the first block of lowmem.
	 */
	end = memblock.memory.regions[0].base + memblock.memory.regions[0].size;
	if (end >= arm_lowmem_limit)
		end = arm_lowmem_limit;

	/*
	 * Clear out all the kernel space mappings, except for the first
	 * memory bank, up to the vmalloc region.
	 */
	for (addr = __phys_to_virt(end);
	     addr < VMALLOC_START; addr += PMD_SIZE)
		pmd_clear(pmd_off_k(addr));
}

This function clears a range of first-level entries. Apart from the block used earlier for the identity mapping (physical addresses starting at 0x30000000), these entries should never have been used at all.

Then map_lowmem is called:

static void __init map_lowmem(void)
{
	struct memblock_region *reg;

	/* Map all the lowmem memory banks. */
	for_each_memblock(memory, reg) {
		phys_addr_t start = reg->base;        // physical start address
		phys_addr_t end = start + reg->size;  // physical end address
		struct map_desc map;

		if (end > arm_lowmem_limit)
			end = arm_lowmem_limit;
		if (start >= end)
			break;

		map.pfn = __phys_to_pfn(start);
		map.virtual = __phys_to_virt(start);
		map.length = end - start;
		map.type = MT_MEMORY;

		create_mapping(&map);
	}
}

This function section-maps all the memory regions; here the physical range is 0x30000000 to 0x34000000. The kernel range was already section-mapped in the assembly code, so refilling those entries with the same values has no effect (if different values were written, the kernel would immediately run off the rails!).

At this stage physical memory is managed by memblock; every physical allocation goes through it.
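
The map.virtual above comes from the fixed linear offset between lowmem physical and virtual addresses. A simplified sketch, with this board's values plugged in (the real kernel derives these from configuration):

/* linear (logical) mapping used for lowmem on 32-bit ARM */
#define PAGE_OFFSET	0xc0000000UL	/* kernel virtual base */
#define PHYS_OFFSET	0x30000000UL	/* RAM physical base on this board */

#define __phys_to_virt(x)	((x) - PHYS_OFFSET + PAGE_OFFSET)
#define __virt_to_phys(x)	((x) - PAGE_OFFSET + PHYS_OFFSET)
/* e.g. __phys_to_virt(0x30000000) == 0xc0000000 */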

The core is the create_mapping function:

static void __init create_mapping(struct map_desc *md)
{
	unsigned long addr, length, end;
	phys_addr_t phys;
	const struct mem_type *type;
	pgd_t *pgd;

	if (md->virtual != vectors_base() && md->virtual < TASK_SIZE) {
		printk(KERN_WARNING "BUG: not creating mapping for 0x%08llx"
		       " at 0x%08lx in user region\n",
		       (long long)__pfn_to_phys((u64)md->pfn), md->virtual);
		return;
	}

	if ((md->type == MT_DEVICE || md->type == MT_ROM) &&
	    md->virtual >= PAGE_OFFSET &&
	    (md->virtual < VMALLOC_START || md->virtual >= VMALLOC_END)) {
		printk(KERN_WARNING "BUG: mapping for 0x%08llx"
		       " at 0x%08lx out of vmalloc space\n",
		       (long long)__pfn_to_phys((u64)md->pfn), md->virtual);
	}

	type = &mem_types[md->type];

#ifndef CONFIG_ARM_LPAE
	/*
	 * Catch 36-bit addresses
	 */
	if (md->pfn >= 0x100000) {
		create_36bit_mapping(md, type);
		return;
	}
#endif

	addr = md->virtual & PAGE_MASK;
	phys = __pfn_to_phys(md->pfn);
	length = PAGE_ALIGN(md->length + (md->virtual & ~PAGE_MASK));

	if (type->prot_l1 == 0 && ((addr | phys | length) & ~SECTION_MASK)) {
		printk(KERN_WARNING "BUG: map for 0x%08llx at 0x%08lx can not "
		       "be mapped using pages, ignoring.\n",
		       (long long)__pfn_to_phys(md->pfn), addr);
		return;
	}

	pgd = pgd_offset_k(addr);	/* page directory entry for this virtual address */
	end = addr + length;
	do {
		unsigned long next = pgd_addr_end(addr, end);
		alloc_init_pud(pgd, addr, next, phys, type);

		phys += next - addr;
		addr = next;
	} while (pgd++, addr != end);
}

pgd_offset_k yields the virtual address of the page directory entry for a given virtual address; it is defined as:

#define pgd_offset_k(addr)	pgd_offset(&init_mm, addr)

The directory root is init_mm's pgd field, which points to swapper_pg_dir: its virtual base is 0xc0104000 and its physical address is 0x30104000, the same page directory that head.S filled in during the first mapping.
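
For reference, the macros it builds on are the standard ones (a sketch quoted from memory):

#define pgd_index(addr)		((addr) >> PGDIR_SHIFT)
#define pgd_offset(mm, addr)	((mm)->pgd + pgd_index(addr))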

Linux's page table layout differs from what the MMU hardware expects. To the hardware the first-level index is 12 bits (4096 entries, each mapping 1MB), but Linux treats the directory as 2048 entries covering 2MB each, hence:

#define PGDIR_SHIFT        21

i.e. an 11-bit directory index, with each Linux directory entry being 8 bytes wide. The details of this arrangement are covered in the posts below; too long to expand on here:

https://www.cnblogs.com/arnoldlu/p/8087022.html

https://blog.csdn.net/zhoutaopower/article/details/88940727

alloc_init_pud converts the pgd into a pud. 32-bit Linux only uses two levels of mapping, so here pud and pmd are both folded onto the pgd; we can go straight into alloc_init_pmd:

static void __init alloc_init_pmd(pud_t *pud, unsigned long addr,
				      unsigned long end, phys_addr_t phys,
				      const struct mem_type *type)
{
	pmd_t *pmd = pmd_offset(pud, addr);
	unsigned long next;

	do {
		/*
		 * With LPAE, we must loop over to map
		 * all the pmds for the given range.
		 */
		next = pmd_addr_end(addr, end);
		/*
		 * Try a section mapping - addr, next and phys must all be
		 * aligned to a section boundary.
		 */
		if (type->prot_sect &&
				((addr | next | phys) & ~SECTION_MASK) == 0) {
			__map_init_section(pmd, addr, next, phys, type);
		} else {
			alloc_init_pte(pmd, addr, next,
						__phys_to_pfn(phys), type);
		}

		phys += next - addr;

	} while (pmd++, addr = next, addr != end);
}

All the physical-memory mappings go through __map_init_section, while alloc_init_pte implements second-level (page) mappings; map_lowmem takes the __map_init_section path:

static void __init __map_init_section(pmd_t *pmd, unsigned long addr,
			unsigned long end, phys_addr_t phys,
			const struct mem_type *type)
{
	pmd_t *p = pmd;

#ifndef CONFIG_ARM_LPAE
	/*
	 * In classic MMU format, puds and pmds are folded in to
	 * the pgds. pmd_offset gives the PGD entry. PGDs refer to a
	 * group of L1 entries making up one logical pointer to
	 * an L2 table (2MB), where as PMDs refer to the individual
	 * L1 entries (1MB). Hence increment to get the correct
	 * offset for odd 1MB sections.
	 * (See arch/arm/include/asm/pgtable-2level.h)
	 */
	if (addr & SECTION_SIZE)
		pmd++;
#endif
	do {
		*pmd = __pmd(phys | type->prot_sect);
		phys += SECTION_SIZE;
	} while (pmd++, addr += SECTION_SIZE, addr != end);

	flush_pmd_entry(p);
}

Note that by this point the pgd has been turned into a pmd; see the definitions:

typedef pmdval_t pmd_t;
typedef pmdval_t pgd_t[2];

So pgd++ advances the address by 8 bytes, while pmd++ advances it by 4. The pgd and next handed down here differ by 0x200000, exactly 2MB: an 8-byte pgd entry has 2MB granularity, and once converted to a pmd pointer the granularity becomes 1MB, which now lines up exactly with the hardware's directory entries. The loop fills each directory entry with the section's physical address plus the attribute bits, SECTION_SIZE being 1MB. This also shows that the kernel's lowmem mapping is a logical mapping (the virtual address is just the physical address plus a fixed offset), and that, like the page tables built before start_kernel, it uses section mapping.
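
A quick worked example: for va = 0xc0000000 the Linux index is 0xc0000000 >> 21 = 1536, and a pgd_t is 8 bytes, so the entry sits at swapper_pg_dir + 1536 * 8 = 0xc0104000 + 0x3000 = 0xc0107000. The hardware sees the same location: 0xc0000000 >> 20 = 3072 four-byte entries is the same 0x3000 byte offset.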

flush_pmd_entry presumably flushes the entry out to memory (cache maintenance); we won't dig into it. That completes the section mapping of all of memory. Next, devicemaps_init:

static void __init devicemaps_init(struct machine_desc *mdesc)
{
	struct map_desc map;
	unsigned long addr;
	void *vectors;

	/*
	 * Allocate the vector page early.
	 */
	vectors = early_alloc(PAGE_SIZE);
	/* early_alloc allocates from memblock and adds the block to the
	 * reserved regions, marking it as used. It returns a virtual
	 * address: since all of memory is already section-mapped, the
	 * virtual address is just the physical one plus the linear offset. */

	early_trap_init(vectors);	/* initialize the exception vector page */

	for (addr = VMALLOC_START; addr; addr += PMD_SIZE)
		pmd_clear(pmd_off_k(addr));

	/*
	 * Map the kernel if it is XIP.
	 * It is always first in the modulearea.
	 */
#ifdef CONFIG_XIP_KERNEL
	map.pfn = __phys_to_pfn(CONFIG_XIP_PHYS_ADDR & SECTION_MASK);
	map.virtual = MODULES_VADDR;
	map.length = ((unsigned long)_etext - map.virtual + ~SECTION_MASK) & SECTION_MASK;
	map.type = MT_ROM;
	create_mapping(&map);
#endif

	/*
	 * Map the cache flushing regions.
	 */
#ifdef FLUSH_BASE
	map.pfn = __phys_to_pfn(FLUSH_BASE_PHYS);
	map.virtual = FLUSH_BASE;
	map.length = SZ_1M;
	map.type = MT_CACHECLEAN;
	create_mapping(&map);
#endif
#ifdef FLUSH_BASE_MINICACHE
	map.pfn = __phys_to_pfn(FLUSH_BASE_PHYS + SZ_1M);
	map.virtual = FLUSH_BASE_MINICACHE;
	map.length = SZ_1M;
	map.type = MT_MINICLEAN;
	create_mapping(&map);
#endif

	/*
	 * Create a mapping for the machine vectors at the high-vectors
	 * location (0xffff0000).  If we aren't using high-vectors, also
	 * create a mapping at the low-vectors virtual address.
	 */
	map.pfn = __phys_to_pfn(virt_to_phys(vectors));
	map.virtual = 0xffff0000;
	map.length = PAGE_SIZE;
	map.type = MT_HIGH_VECTORS;
	create_mapping(&map);	/* map the exception vector table */

	if (!vectors_high()) {
		map.virtual = 0;
		map.type = MT_LOW_VECTORS;
		create_mapping(&map);
	}

	/*
	 * Ask the machine support to map in the statically mapped devices.
	 */
	if (mdesc->map_io)
		mdesc->map_io();
	fill_pmd_gaps();

	/* Reserve fixed i/o space in VMALLOC region */
	pci_reserve_io();

	/*
	 * Finally flush the caches and tlb to ensure that we're in a
	 * consistent state wrt the writebuffer.  This also ensures that
	 * any write-allocated cache lines in the vector page are written
	 * back.  After this point, we can start to touch devices again.
	 */
	local_flush_tlb_all();
	flush_cache_all();
}

First comes the initialization of the exception vector page:

void __init early_trap_init(void *vectors_base)
{
	unsigned long vectors = (unsigned long)vectors_base;
	extern char __stubs_start[], __stubs_end[];
	extern char __vectors_start[], __vectors_end[];
	extern char __kuser_helper_start[], __kuser_helper_end[];
	int kuser_sz = __kuser_helper_end - __kuser_helper_start;

	vectors_page = vectors_base;

	/*
	 * Copy the vectors, stubs and kuser helpers (in entry-armv.S)
	 * into the vector page, mapped at 0xffff0000, and ensure these
	 * are visible to the instruction stream.
	 */
	/* the exception vector table */
	memcpy((void *)vectors, __vectors_start, __vectors_end - __vectors_start);
	/* the exception handler stubs */
	memcpy((void *)vectors + 0x200, __stubs_start, __stubs_end - __stubs_start);
	memcpy((void *)vectors + 0x1000 - kuser_sz, __kuser_helper_start, kuser_sz);

	/*
	 * Do processor specific fixups for the kuser helpers
	 */
	kuser_get_tls_init(vectors);

	/*
	 * Copy signal return handlers into the vector page, and
	 * set sigreturn to be a pointer to these.
	 */
	/* copy the signal return handlers */
	memcpy((void *)(vectors + KERN_SIGRETURN_CODE - CONFIG_VECTORS_BASE),
	       sigreturn_codes, sizeof(sigreturn_codes));
        
	flush_icache_range(vectors, vectors + PAGE_SIZE);
	/* adjust the access permissions of the vector page */
	modify_domain(DOMAIN_USER, DOMAIN_CLIENT);
}

The function above fills the freshly allocated page with the vector table and the exception handling code.

Linux defines the interrupt vector table between __vectors_start and __vectors_end:

.globl	__vectors_start
__vectors_start:
 ARM(	swi	SYS_ERROR0	)
 THUMB(	svc	#0		)
 THUMB(	nop			)
	W(b)	vector_und + stubs_offset
	W(ldr)	pc, .LCvswi + stubs_offset
	W(b)	vector_pabt + stubs_offset
	W(b)	vector_dabt + stubs_offset
	W(b)	vector_addrexcptn + stubs_offset
	W(b)	vector_irq + stubs_offset
	W(b)	vector_fiq + stubs_offset

	.globl	__vectors_end
__vectors_end:

Then the vector table is mapped:

map.pfn = __phys_to_pfn(virt_to_phys(vectors));
map.virtual = 0xffff0000;

map.length = PAGE_SIZE;
map.type = MT_HIGH_VECTORS;
create_mapping(&map);

As you can see, the page tables can hold multiple mappings: one physical address can be mapped to several virtual addresses. When vectors was allocated earlier we got a logical address, already covered by the section mapping; here the same physical page is given another virtual address through a real page table mapping. The vector table's virtual address is 0xffff0000. (Linux can also program the V bit of coprocessor register c1 to select whether exception vectors are fetched from address 0 or from 0xffff0000.) Let's follow how the page mapping is built: create_mapping walks the pgd for the high address 0xffff0000 and finally calls alloc_init_pte:

static void __init alloc_init_pte(pmd_t *pmd, unsigned long addr,
				  unsigned long end, unsigned long pfn,
				  const struct mem_type *type)
{
	pte_t *pte = early_pte_alloc(pmd, addr, type->prot_l1);
	/* allocate the second-level table (filling the directory entry)
	 * and return the pte address for addr */
	do {
		set_pte_ext(pte, pfn_pte(pfn, __pgprot(type->prot_pte)), 0); /* fill the pte */
		pfn++;
	} while (pte++, addr += PAGE_SIZE, addr != end);
}

This happens in two steps: first the directory entry is filled, then the page table entries:

static pte_t * __init early_pte_alloc(pmd_t *pmd, unsigned long addr, unsigned long prot)
{
	if (pmd_none(*pmd)) {
		pte_t *pte = early_alloc(PTE_HWTABLE_OFF + PTE_HWTABLE_SIZE);
		__pmd_populate(pmd, __pa(pte), prot);
	}
	BUG_ON(pmd_bad(*pmd));
	return pte_offset_kernel(pmd, addr);
}

PTE_HWTABLE_OFF + PTE_HWTABLE_SIZE is (512 + 512) * 4 bytes, i.e. a 4KB allocation. As noted earlier, a pgd entry is 8 bytes, there are 2048 of them, and each covers 512 ptes; here two blocks of 512 entries are allocated at once. The first half of the page is for Linux's own use, presumably to record status bits, and the half 2048 bytes in is what the MMU hardware walks. Having obtained the table's logical address, we must convert it to a physical address and write it into the pgd, which is done by __pmd_populate:

static inline void __pmd_populate(pmd_t *pmdp, phys_addr_t pte,
				  pmdval_t prot)
{
	pmdval_t pmdval = (pte + PTE_HWTABLE_OFF) | prot;
	pmdp[0] = __pmd(pmdval);
#ifndef CONFIG_ARM_LPAE
	pmdp[1] = __pmd(pmdval + 256 * sizeof(pte_t));
#endif
	flush_pmd_entry(pmdp);
}

Notice the pte physical address passed in is first offset by 512*4 = 2048 bytes, so pmdp[0] is filled with the hardware half of the 4KB page. This matches the above: only the hardware half gets written into the page directory for the MMU's use. The next directory entry, pmdp[1], is filled with a further offset of 256 entries, so the whole 2MB covered by this directory entry is populated. The net effect looks the same as plain 1MB directory entries; the difference is that each page table allocation grabs 4KB at once and all of it gets used, with nothing wasted.
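
A worked example with a made-up table address: suppose early_alloc returned a pte page at physical 0x33f00000 (hypothetical). Then pmdp[0] = (0x33f00000 + 0x800) | prot and pmdp[1] = (0x33f00000 + 0xc00) | prot, i.e. the two hardware tables at offsets 2048 and 3072 within the page, each holding 256 entries and covering 1MB.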

Next, pte_offset_kernel returns the logical address of the pte. Note that the value written into pmd[0] above points 512 entries into the page; pmd_page_vaddr masks it back to the page boundary, so the pointer pte_offset_kernel returns lands in the Linux half of the page. The assembly that writes the pte, shown further down, confirms this:

#define pte_offset_kernel(pmd,addr)	(pmd_page_vaddr(*(pmd)) + pte_index(addr))
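
pte_index just extracts the page-level index; its generic definition is roughly:

#define PAGE_SHIFT	12
#define PTRS_PER_PTE	512
#define pte_index(addr)	(((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))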

The following call then writes the physical address into the page table entry:

set_pte_ext(pte, pfn_pte(pfn, __pgprot(type->prot_pte)), 0);

set_pte_ext is really a macro whose implementation differs per processor; here is the arm920 (arm9) version:

ENTRY(cpu_arm920_set_pte_ext)
#ifdef CONFIG_MMU
	armv3_set_pte_ext
	mov	r0, r0
	mcr	p15, 0, r0, c7, c10, 1		@ clean D entry
	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
#endif
	mov	pc, lr


.macro	armv3_set_pte_ext wc_disable=1
	str	r1, [r0], #2048			@ linux version: store the Linux pte, then advance r0 by 2048 to reach the hardware pte

	eor	r3, r1, #L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY

	bic	r2, r1, #PTE_SMALL_AP_MASK	@ keep C, B bits
	bic	r2, r2, #PTE_TYPE_MASK
	orr	r2, r2, #PTE_TYPE_SMALL

	tst	r3, #L_PTE_USER			@ user?
	orrne	r2, r2, #PTE_SMALL_AP_URO_SRW

	tst	r3, #L_PTE_RDONLY | L_PTE_DIRTY	@ write and dirty?
	orreq	r2, r2, #PTE_SMALL_AP_UNO_SRW

	tst	r3, #L_PTE_PRESENT | L_PTE_YOUNG	@ present and young?
	movne	r2, #0

	.if	\wc_disable
#ifdef CONFIG_CPU_DCACHE_WRITETHROUGH
	tst	r2, #PTE_CACHEABLE
	bicne	r2, r2, #PTE_BUFFERABLE
#endif
	.endif
	str	r2, [r0]		@ hardware version: store the physical address plus hardware bits into the pte the MMU walks
	.endm

From the above you can see the page address is written twice, and the copy 2048 bytes further on is the table the MMU actually uses. Only when L_PTE_PRESENT (together with L_PTE_YOUNG) is set does the hardware entry keep a valid small-page type, so the MMU treats the mapping as valid.

The Linux two-level mapping can be pictured like this:
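
(The image from the original post is missing; below is a text sketch reconstructed from the description here and the layout comment in arch/arm/include/asm/pgtable-2level.h.)

  pgd (2048 entries, 8 bytes           one 4KB pte page
  each, covering 2MB apiece)
  +---------------+                  +-------------------+ +0
  | pmd[0]  ------+---------.        | Linux pt 0 (256)  |
  +---------------+         |        +-------------------+ +1024
  | pmd[1]  ------+------.  |        | Linux pt 1 (256)  |
  +---------------+      |  |        +-------------------+ +2048
  |      ...      |      |  '------> | h/w pt 0   (256)  |
                         |           +-------------------+ +3072
                         '---------> | h/w pt 1   (256)  |
                                     +-------------------+ +4096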

Each time the kernel maps a virtual address it first locates a pgd entry, which is 8 bytes wide with an 11-bit index. An 11-bit directory is meaningless to the MMU, so when the entry is written it is split into pmd[0] and pmd[1]; that restores the 12-bit, 1MB-per-entry view the hardware expects. Page tables are then allocated 4KB at a time: the first 2KB holds the Linux shadow entries that keep extra status bits for the 2KB of hardware tables sitting 512*4 bytes further on, and those 512 hardware entries are split in two to fill pmd[0] and pmd[1]. So every allocation fills two hardware directory entries, covering a 2MB range.

Back in devicemaps_init, the next call is:

if (mdesc->map_io)
        mdesc->map_io();

If the machine in use defines map_io, it gets called:

static void __init smdk2440_map_io(void)
{
	s3c24xx_init_io(smdk2440_iodesc, ARRAY_SIZE(smdk2440_iodesc));
	s3c24xx_init_clocks(12000000);
	s3c24xx_init_uarts(smdk2440_uartcfgs, ARRAY_SIZE(smdk2440_uartcfgs));
	samsung_set_timer_source(SAMSUNG_PWM3, SAMSUNG_PWM4);
}

Look at s3c24xx_init_io first:

void __init s3c24xx_init_io(struct map_desc *mach_desc, int size)
{
	arm_pm_idle = s3c24xx_default_idle;

	/* initialise the io descriptors we need for initialisation */
	iotable_init(mach_desc, size);	/* create page mappings for the io registers */
	iotable_init(s3c_iodesc, ARRAY_SIZE(s3c_iodesc));

	if (cpu_architecture() >= CPU_ARCH_ARMv5) {
		samsung_cpu_id = s3c24xx_read_idcode_v5();
	} else {
		samsung_cpu_id = s3c24xx_read_idcode_v4();
	}
	s3c24xx_init_cpu();

	s3c_init_cpu(samsung_cpu_id, cpu_ids, ARRAY_SIZE(cpu_ids));
}

iotable_init is called first to create the page mappings for the io registers:

void __init iotable_init(struct map_desc *io_desc, int nr)
{
	struct map_desc *md;
	struct vm_struct *vm;
	struct static_vm *svm;

	if (!nr)
		return;

	svm = early_alloc_aligned(sizeof(*svm) * nr, __alignof__(*svm));

	for (md = io_desc; nr; md++, nr--) {
		create_mapping(md);

		vm = &svm->vm;
		vm->addr = (void *)(md->virtual & PAGE_MASK);
		vm->size = PAGE_ALIGN(md->length + (md->virtual & ~PAGE_MASK));
		vm->phys_addr = __pfn_to_phys(md->pfn);
		vm->flags = VM_IOREMAP | VM_ARM_STATIC_MAPPING;
		vm->flags |= VM_ARM_MTYPE(md->type);
		vm->caller = iotable_init;
		add_static_vm_early(svm++);
	}
}
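
Each entry handed to iotable_init is a map_desc, just like in map_lowmem. A hypothetical entry in the style of the S3C24xx io tables (the addresses here are illustrative only, not copied from the real tables):

/* hypothetical static io mapping; the real tables live in
 * arch/arm/mach-s3c24xx/ and arch/arm/plat-samsung/ */
static struct map_desc example_iodesc[] __initdata = {
	{
		.virtual	= 0xf6000000,			/* fixed kernel va (illustrative) */
		.pfn		= __phys_to_pfn(0x48000000),	/* peripheral base (illustrative) */
		.length		= SZ_1M,
		.type		= MT_DEVICE,
	},
};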

iotable_init likewise calls create_mapping for each io region passed in, and registers each mapping with add_static_vm_early on the vmlist and static_vmlist lists. These two global lists appear to manage virtual memory, marking the virtual ranges occupied by io mappings so that later allocations do not hand out those regions again (my guess; something to come back to). Once the static mappings are done, the processor can perform io reads and writes through these virtual addresses. Then s3c_init_cpu(samsung_cpu_id, cpu_ids, ARRAY_SIZE(cpu_ids)) is called; cpu_ids is defined in arch/arm/mach-s3c24xx/common.c:

static struct cpu_table cpu_ids[] __initdata = {
	{
		.idcode		= 0x32410000,
		.idmask		= 0xffffffff,
		.map_io		= s3c2410_map_io,
		.init_clocks	= s3c2410_init_clocks,
		.init_uarts	= s3c2410_init_uarts,
		.init		= s3c2410_init,
		.name		= name_s3c2410
	},
	{
		.idcode		= 0x32410002,
		.idmask		= 0xffffffff,
		.map_io		= s3c2410_map_io,
		.init_clocks	= s3c2410_init_clocks,
		.init_uarts	= s3c2410_init_uarts,
		.init		= s3c2410a_init,
		.name		= name_s3c2410a
	},
	......

Many CPU variants are defined.

void __init s3c_init_cpu(unsigned long idcode,
			 struct cpu_table *cputab, unsigned int cputab_size)
{
	cpu = s3c_lookup_cpu(idcode, cputab, cputab_size);

	if (cpu == NULL) {
		printk(KERN_ERR "Unknown CPU type 0x%08lx\n", idcode);
		panic("Unknown S3C24XX CPU");
	}

	printk("CPU %s (id 0x%08lx)\n", cpu->name, idcode);

	if (cpu->map_io == NULL || cpu->init == NULL) {
		printk(KERN_ERR "CPU %s support not enabled\n", cpu->name);
		panic("Unsupported Samsung CPU");
	}

	cpu->map_io();
}

The matching CPU is looked up in the table by id, and its map_io is then called:

void __init s3c244x_map_io(void)
{
	/* register our io-tables */

	iotable_init(s3c244x_iodesc, ARRAY_SIZE(s3c244x_iodesc));

	/* rename any peripherals used differing from the s3c2410 */

	s3c_device_sdi.name  = "s3c2440-sdi";
	s3c_device_i2c0.name  = "s3c2440-i2c";
	s3c_nand_setname("s3c2440-nand");
	s3c_device_ts.name = "s3c2440-ts";
	s3c_device_usbgadget.name = "s3c2440-usbgadget";
}

As you can see, this again just registers more io mapping tables. Then the clocks are initialized, with the following call chain:

smdk2440_map_io

           ------------>s3c24xx_init_clocks

               ---------------->s3c244x_init_clocks

void __init s3c244x_init_clocks(int xtal)
{
	/* initialise the clocks here, to allow other things like the
	 * console to use them, and to add new ones after the initialisation
	 */

	s3c24xx_register_baseclocks(xtal);
	/* sets up the base clock structures; the key one is clk_xtal, which
	 * is given the crystal frequency. The others do not appear to be
	 * filled in here. Each is added to the kernel clock list via
	 * clkdev_add. */

	s3c244x_setup_clocks();
	/* computes each clock rate from the crystal frequency and the divider
	 * fields read from the registers (register access now goes through
	 * the statically mapped io virtual addresses) and fills in the
	 * clock structures */

	s3c2410_baseclk_add();
	/* sets up the clock structures of the individual peripheral modules
	 * and adds them to the kernel clock list via clkdev_add */
}

smdk2440_map_io

           --------> s3c24xx_init_uarts

                 --------->s3c24xx_init_uartdevs  (uses the configured uart parameters to fill in each uart in the array below)

struct platform_device *s3c24xx_uart_src[4] = {
    &s3c24xx_uart_device0,
    &s3c24xx_uart_device1,
    &s3c24xx_uart_device2,
    &s3c24xx_uart_device3,
};

With a DTS-based machine these platform_device parameter structures could presumably be passed in through the device tree; here the legacy approach is kept. That concludes the machine-specific map_io.

Finally there is the bootmem_init function:

void __init bootmem_init(void)
{
	unsigned long min, max_low, max_high;

	max_low = max_high = 0;

	find_limits(&min, &max_low, &max_high);
	arm_bootmem_init(min, max_low);

	/*
	 * Sparsemem tries to allocate bootmem in memory_present(),
	 * so must be done after the fixed reservations
	 */
	arm_memory_present();

	/*
	 * sparse_init() needs the bootmem allocator up and running.
	 */
	sparse_init();

	/*
	 * Now free the memory - free_area_init_node needs
	 * the sparse mem_map arrays initialized by sparse_init()
	 * for memmap_init_zone(), otherwise all PFNs are invalid.
	 */
	arm_bootmem_free(min, max_low, max_high);

	/*
	 * This doesn't seem to be used by the Linux memory manager any
	 * more, but is used by ll_rw_block.  If we can get rid of it, we
	 * also get rid of some of the stuff above as well.
	 *
	 * Note: max_low_pfn and max_pfn reflect the number of _pages_ in
	 * the system, not the maximum PFN.
	 */
	max_low_pfn = max_low - PHYS_PFN_OFFSET;
	max_pfn = max_high - PHYS_PFN_OFFSET;
}

According to various references, before the system is fully up Linux manages physical memory with either the memblock or the bootmem mechanism (older kernels used bootmem). We have already seen allocations going through memblock, yet bootmem_init also initializes the bootmem bitmap as well as the struct pages used by the buddy system. Why memblock and bootmem coexist here I don't fully understand. arm_bootmem_init initializes the bootmem bitmap:

static void __init arm_bootmem_init(unsigned long start_pfn,
	unsigned long end_pfn)
{
	struct memblock_region *reg;
	unsigned int boot_pages;
	phys_addr_t bitmap;
	pg_data_t *pgdat;

	/*
	 * Allocate the bootmem bitmap page.  This must be in a region
	 * of memory which has already been mapped.
	 */
	boot_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
	bitmap = memblock_alloc_base(boot_pages << PAGE_SHIFT, L1_CACHE_BYTES,
				__pfn_to_phys(end_pfn));
	pr_err("boot_pages=%x   bitmap=%x \n",boot_pages,bitmap);
	/*
	 * Initialise the bootmem allocator, handing the
	 * memory banks over to bootmem.
	 */
	node_set_online(0);
	pgdat = NODE_DATA(0);
	init_bootmem_node(pgdat, __phys_to_pfn(bitmap), start_pfn, end_pfn);

	/* Free the lowmem regions from memblock into bootmem. */
	for_each_memblock(memory, reg) {
		unsigned long start = memblock_region_memory_base_pfn(reg);
		unsigned long end = memblock_region_memory_end_pfn(reg);

		if (end >= end_pfn)
			end = end_pfn;
		if (start >= end)
			break;
		pr_err("start=%lx   end=%lx \n",start,end);
		free_bootmem(__pfn_to_phys(start), (end - start) << PAGE_SHIFT);
	}

	/* Reserve the lowmem memblock reserved regions in bootmem. */
	for_each_memblock(reserved, reg) {
		unsigned long start = memblock_region_reserved_base_pfn(reg);
		unsigned long end = memblock_region_reserved_end_pfn(reg);

		if (end >= end_pfn)
			end = end_pfn;
		if (start >= end)
			break;

		reserve_bootmem(__pfn_to_phys(start),
			        (end - start) << PAGE_SHIFT, BOOTMEM_DEFAULT);
	}
}

First the bitmap is allocated; each bit in it represents one page, and the bitmap ends up in pgdat->bdata->node_bootmem_map. Then the memory and reserved memblock regions are walked, and pages already in use are marked in node_bootmem_map. Once node_bootmem_map is initialized, later allocations such as alloc_bootmem_node_nopanic are served from this bitmap: memory management shifts from memblock to the node_bootmem_map bitmap. Next arm_bootmem_free initializes the buddy system's pages and zones: a struct page is allocated for every page frame, the array is stored in pgdat->node_mem_map and mem_map, and each page is initialized.
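
A sanity check on the bitmap size for this board's 64MB of RAM (0x30000000 to 0x34000000): 64MB is 16384 pages, so the bitmap needs 16384 / 8 = 2048 bytes, which bootmem_bootmap_pages rounds up to a single 4KB page.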

arm_bootmem_free

    -------->free_area_init_node

free_area_init_node computes how many struct pages are needed, calls alloc_node_mem_map to allocate space for all of them, and then initializes the zones and pages:

static void __paginginit free_area_init_core(struct pglist_data *pgdat,
		unsigned long *zones_size, unsigned long *zholes_size)
{
	enum zone_type j;
	int nid = pgdat->node_id;
	unsigned long zone_start_pfn = pgdat->node_start_pfn;
	int ret;

	pgdat_resize_init(pgdat);
#ifdef CONFIG_NUMA_BALANCING
	spin_lock_init(&pgdat->numabalancing_migrate_lock);
	pgdat->numabalancing_migrate_nr_pages = 0;
	pgdat->numabalancing_migrate_next_window = jiffies;
#endif
	init_waitqueue_head(&pgdat->kswapd_wait);
	init_waitqueue_head(&pgdat->pfmemalloc_wait);
	pgdat_page_cgroup_init(pgdat);

	for (j = 0; j < MAX_NR_ZONES; j++) {
		struct zone *zone = pgdat->node_zones + j;
		unsigned long size, realsize, freesize, memmap_pages;

		size = zone_spanned_pages_in_node(nid, j, zones_size);
		realsize = freesize = size - zone_absent_pages_in_node(nid, j,
								zholes_size);

		/*
		 * Adjust freesize so that it accounts for how much memory
		 * is used by this zone for memmap. This affects the watermark
		 * and per-cpu initialisations
		 */
		memmap_pages = calc_memmap_size(size, realsize);
		if (freesize >= memmap_pages) {
			freesize -= memmap_pages;
			if (memmap_pages)
				printk(KERN_DEBUG
				       "  %s zone: %lu pages used for memmap\n",
				       zone_names[j], memmap_pages);
		} else
			printk(KERN_WARNING
				"  %s zone: %lu pages exceeds freesize %lu\n",
				zone_names[j], memmap_pages, freesize);

		/* Account for reserved pages */
		if (j == 0 && freesize > dma_reserve) {
			freesize -= dma_reserve;
			printk(KERN_DEBUG "  %s zone: %lu pages reserved\n",
					zone_names[0], dma_reserve);
		}

		if (!is_highmem_idx(j))
			nr_kernel_pages += freesize;
		/* Charge for highmem memmap if there are enough kernel pages */
		else if (nr_kernel_pages > memmap_pages * 2)
			nr_kernel_pages -= memmap_pages;
		nr_all_pages += freesize;

		zone->spanned_pages = size;
		zone->present_pages = realsize;
		/*
		 * Set an approximate value for lowmem here, it will be adjusted
		 * when the bootmem allocator frees pages into the buddy system.
		 * And all highmem pages will be managed by the buddy system.
		 */
		zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
#ifdef CONFIG_NUMA
		zone->node = nid;
		zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
						/ 100;
		zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
#endif
		zone->name = zone_names[j];
		spin_lock_init(&zone->lock);
		spin_lock_init(&zone->lru_lock);
		zone_seqlock_init(zone);
		zone->zone_pgdat = pgdat;

		zone_pcp_init(zone);
		lruvec_init(&zone->lruvec);
		if (!size)
			continue;

		set_pageblock_order();
		setup_usemap(pgdat, zone, zone_start_pfn, size);
		ret = init_currently_empty_zone(zone, zone_start_pfn,
						size, MEMMAP_EARLY);
		BUG_ON(ret);
		memmap_init(size, nid, j, zone_start_pfn);
		zone_start_pfn += size;
	}
}

memmap_init

    --------->memmap_init_zone

void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
		unsigned long start_pfn, enum memmap_context context)
{
	struct page *page;
	unsigned long end_pfn = start_pfn + size;
	unsigned long pfn;
	struct zone *z;

	if (highest_memmap_pfn < end_pfn - 1)
		highest_memmap_pfn = end_pfn - 1;

	z = &NODE_DATA(nid)->node_zones[zone];
	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
		/*
		 * There can be holes in boot-time mem_map[]s
		 * handed to this function.  They do not
		 * exist on hotplugged memory.
		 */
		if (context == MEMMAP_EARLY) {
			if (!early_pfn_valid(pfn))
				continue;
			if (!early_pfn_in_nid(pfn, nid))
				continue;
		}
		page = pfn_to_page(pfn);
		set_page_links(page, zone, nid, pfn);
		mminit_verify_page_links(page, zone, nid, pfn);
		init_page_count(page);
		page_mapcount_reset(page);
		page_nid_reset_last(page);
		SetPageReserved(page);
		/*
		 * Mark the block movable so that blocks are reserved for
		 * movable at startup. This will force kernel allocations
		 * to reserve their blocks rather than leaking throughout
		 * the address space during boot when many long-lived
		 * kernel allocations are made. Later some blocks near
		 * the start are marked MIGRATE_RESERVE by
		 * setup_zone_migrate_reserve()
		 *
		 * bitmap is created for zone's valid pfn range. but memmap
		 * can be created for invalid pages (for alignment)
		 * check here not to call set_pageblock_migratetype() against
		 * pfn out of zone.
		 */
		if ((z->zone_start_pfn <= pfn)
		    && (pfn < zone_end_pfn(z))
		    && !(pfn & (pageblock_nr_pages - 1)))
			set_pageblock_migratetype(page, MIGRATE_MOVABLE);

		INIT_LIST_HEAD(&page->lru);
#ifdef WANT_PAGE_VIRTUAL
		/* The shift won't overflow because ZONE_NORMAL is below 4G. */
		if (!is_highmem_idx(zone))
			set_page_address(page, __va(pfn << PAGE_SHIFT));
#endif
	}
}

For each page frame the corresponding struct page is taken from mem_map and initialized. This gives the buddy system its initial state; how the buddy allocator actually operates will be analyzed later. (The original post had a diagram of the resulting memory layout here.)
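
Under the flatmem model, pfn_to_page is plain array indexing into mem_map; roughly (a sketch of the generic definitions):

/* flatmem: one global struct page array indexed by pfn */
#define ARCH_PFN_OFFSET		(PHYS_OFFSET >> PAGE_SHIFT)
#define pfn_to_page(pfn)	(mem_map + ((pfn) - ARCH_PFN_OFFSET))
#define page_to_pfn(page)	((unsigned long)((page) - mem_map) + ARCH_PFN_OFFSET)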

A few good related posts:

https://www.xuebuyuan.com/1636055.html
https://blog.csdn.net/gatieme/article/details/52403924
 