Linux Memory Initialization (2): paging_init

Notes

Kernel version: 4.14.111
ARM processor, Cortex-A7, quad-core (arm32)

  In the previous article we saw that, before jumping to start_kernel, the kernel had already created linear mappings for the kernel image and the dtb, so it can access its own code and data sections as well as the dtb region through virtual addresses.
  After that, with the memory management system still uninitialized, physical memory has been registered via memblock_add, but the mappings from that physical memory to virtual addresses do not yet exist: memblock_alloc can hand out a region, but the kernel cannot actually access it.
  Everything waits for paging_init; once it has built the page tables, the kernel can reach the underlying physical memory through virtual addresses.

paging_init

start_kernel->setup_arch->paging_init

/*
 * paging_init() sets up the page tables, initialises the zone memory
 * maps, and sets up the zero page, bad page and bad page tables.
 */
void __init paging_init(const struct machine_desc *mdesc)
{
	void *zero_page;

	prepare_page_table();									(1)
	map_lowmem();											(2)
	memblock_set_current_limit(arm_lowmem_limit);			(3)
	dma_contiguous_remap();									(4)
	early_fixmap_shutdown();								(5)
	devicemaps_init(mdesc);
	kmap_init();
	tcm_init();

	top_pmd = pmd_off_k(0xffff0000);

	/* allocate the zero page. */
	zero_page = early_alloc(PAGE_SIZE);						(6)

	bootmem_init();											(7)

	empty_zero_page = virt_to_page(zero_page);
	__flush_dcache_page(NULL, empty_zero_page);

	/* Compute the virt/idmap offset, mostly for the sake of KVM */
	kimage_voffset = (unsigned long)&kimage_voffset - virt_to_idmap(&kimage_voffset);
}

1) Clear parts of the section page tables
2) Create the lowmem mappings
3) Cap memblock allocations at the end of lowmem (arm_lowmem_limit)
4) Create page table mappings for the contiguous memory region reserved for DMA
5) fixmap-related; from a look at the implementation, it essentially re-establishes the fixmap mappings at page granularity
6) Allocate the zero page
7) bootmem_init: zone initialization and related setup
  Each step is analyzed in more detail below:

prepare_page_table

start_kernel->setup_arch->paging_init->prepare_page_table

static inline void prepare_page_table(void)
{
	unsigned long addr;
	phys_addr_t end;

	/*
	 * Clear out all the mappings below the kernel image.
	 */
	for (addr = 0; addr < MODULES_VADDR; addr += PMD_SIZE)
		pmd_clear(pmd_off_k(addr));

#ifdef CONFIG_XIP_KERNEL
	/* The XIP kernel is mapped in the module area -- skip over it */
	addr = ((unsigned long)_exiprom + PMD_SIZE - 1) & PMD_MASK;
#endif
	for ( ; addr < PAGE_OFFSET; addr += PMD_SIZE)
		pmd_clear(pmd_off_k(addr));

	/*
	 * Find the end of the first block of lowmem.
	 */
	end = memblock.memory.regions[0].base + memblock.memory.regions[0].size;
	if (end >= arm_lowmem_limit)
		end = arm_lowmem_limit;

	/*
	 * Clear out all the kernel space mappings, except for the first
	 * memory bank, up to the vmalloc region.
	 */
	for (addr = __phys_to_virt(end);
	     addr < VMALLOC_START; addr += PMD_SIZE)
		pmd_clear(pmd_off_k(addr));
}

The code above clears the page table entries for three virtual address ranges.
1) 0 ~ MODULES_VADDR
From the kernel boot log:

modules : 0xbf000000 - 0xbfe00000   (  14 MB)

2) MODULES_VADDR ~ PAGE_OFFSET

0xbfe00000~0xc0000000

3) end of lowmem (__phys_to_virt(end), which here equals __phys_to_virt(arm_lowmem_limit)) ~ VMALLOC_START

vmalloc : 0xf0800000 - 0xff800000   ( 240 MB)
lowmem  : 0xc0000000 - 0xf0000000   ( 768 MB)
So the range cleared here is 0xf0000000 ~ 0xf0800000.
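
As a rough sanity check, here is a small user-space sketch (not kernel code; it assumes PMD_SIZE = 2MB on this non-LPAE ARM32 configuration, and the addresses from the boot log above) counting how many entries each pmd_clear loop walks:

#include <stdio.h>

#define PMD_SIZE      0x200000UL   /* 2MB per pmd_clear() step */
#define MODULES_VADDR 0xbf000000UL
#define PAGE_OFFSET   0xc0000000UL
#define LOWMEM_END    0xf0000000UL /* __phys_to_virt(arm_lowmem_limit) */
#define VMALLOC_START 0xf0800000UL

int main(void)
{
	/* 0 .. MODULES_VADDR: 1528 entries */
	printf("%lu\n", (MODULES_VADDR - 0) / PMD_SIZE);
	/* MODULES_VADDR .. PAGE_OFFSET: 8 entries */
	printf("%lu\n", (PAGE_OFFSET - MODULES_VADDR) / PMD_SIZE);
	/* lowmem end .. VMALLOC_START: 4 entries */
	printf("%lu\n", (VMALLOC_START - LOWMEM_END) / PMD_SIZE);
	return 0;
}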

map_lowmem

start_kernel->setup_arch->paging_init->map_lowmem

static void __init map_lowmem(void)
{
	struct memblock_region *reg;
	phys_addr_t kernel_x_start = round_down(__pa(KERNEL_START), SECTION_SIZE);	(1)
	phys_addr_t kernel_x_end = round_up(__pa(__init_end), SECTION_SIZE);		(2)

	/* Map all the lowmem memory banks. */
	for_each_memblock(memory, reg) {											(3)
		phys_addr_t start = reg->base;
		phys_addr_t end = start + reg->size;
		struct map_desc map;

		if (memblock_is_nomap(reg))
			continue;

		if (end > arm_lowmem_limit)
			end = arm_lowmem_limit;
		if (start >= end)
			break;

		if (end < kernel_x_start) {
			map.pfn = __phys_to_pfn(start);
			map.virtual = __phys_to_virt(start);
			map.length = end - start;
			map.type = MT_MEMORY_RWX;

			create_mapping(&map);
		} else if (start >= kernel_x_end) {
			map.pfn = __phys_to_pfn(start);
			map.virtual = __phys_to_virt(start);
			map.length = end - start;
			map.type = MT_MEMORY_RW;

			create_mapping(&map);
		} else {
			/* This better cover the entire kernel */
			if (start < kernel_x_start) {
				map.pfn = __phys_to_pfn(start);
				map.virtual = __phys_to_virt(start);
				map.length = kernel_x_start - start;
				map.type = MT_MEMORY_RW;

				create_mapping(&map);
			}

			map.pfn = __phys_to_pfn(kernel_x_start);
			map.virtual = __phys_to_virt(kernel_x_start);
			map.length = kernel_x_end - kernel_x_start;
			map.type = MT_MEMORY_RWX;

			create_mapping(&map);

			if (kernel_x_end < end) {
				map.pfn = __phys_to_pfn(kernel_x_end);
				map.virtual = __phys_to_virt(kernel_x_end);
				map.length = end - kernel_x_end;
				map.type = MT_MEMORY_RW;

				create_mapping(&map);
			}
		}
	}
}

1) Round the physical start of the kernel image down to a 1MB (section) boundary.
2) Round the physical end of the kernel's init section (__init_end) up to a 1MB boundary; kernel_x_start and kernel_x_end are therefore section-aligned (a rounding illustration follows the log below).
3) Iterate over the memory regions recorded in the global memblock and create a mapping for each one. My device has a single region; the boot log shows memory.cnt = 0x1.

MEMBLOCK configuration:
 memory size = 0x0000000040000000 reserved size = 0x00000000052d5910
 memory.cnt  = 0x1
 memory[0x0]     [0x0000000040000000-0x000000007fffffff], 0x0000000040000000 bytes flags: 0x0
 reserved.cnt  = 0x5
 reserved[0x0]   [0x0000000040003000-0x0000000040007fff], 0x0000000000005000 bytes flags: 0x0
 reserved[0x1]   [0x0000000040200000-0x00000000412b688f], 0x00000000010b6890 bytes flags: 0x0
 reserved[0x2]   [0x0000000049de3000-0x0000000049deb07f], 0x0000000000008080 bytes flags: 0x0
 reserved[0x3]   [0x0000000049dee000-0x0000000049ffffff], 0x0000000000212000 bytes flags: 0x0
 reserved[0x4]   [0x000000007c000000-0x000000007fffffff], 0x0000000004000000 bytes flags: 0x0
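
To illustrate the rounding in (1) and (2), a minimal sketch using values consistent with the log above (the kernel image sits in the reserved region starting at 0x40200000; the exact __init_end value here is hypothetical, and these round_down/round_up idioms match the kernel's macros for power-of-two sizes):

#include <stdio.h>

#define SECTION_SIZE     0x100000UL /* 1MB */
#define round_down(x, a) ((x) & ~((a) - 1))
#define round_up(x, a)   (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	/* hypothetical __pa(KERNEL_START) and __pa(__init_end) */
	printf("0x%lx\n", round_down(0x40208000UL, SECTION_SIZE)); /* 0x40200000 */
	printf("0x%lx\n", round_up(0x412b6890UL, SECTION_SIZE));   /* 0x41300000 */
	return 0;
}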

  MT_MEMORY_RWX and MT_MEMORY_RW differ in the XN (eXecute Never) bit of the ARM page table entry: when XN is set, the region cannot be executed.
  The device's physical memory is 0x40000000-0x7fffffff, so the mappings created here are:

0x40000000~kernel_x_start, 		MT_MEMORY_RW
kernel_x_start~kernel_x_end, 	MT_MEMORY_RWX
kernel_x_end~arm_lowmem_limit, 	MT_MEMORY_RW

Note:

		if (end > arm_lowmem_limit)
			end = arm_lowmem_limit;

  Because the installed memory exceeds arm_lowmem_limit (768MB here), only the range up to the end of lowmem is mapped, as the name `map_lowmem` also suggests.
  The mapping created by create_mapping is the direct mapping of physical memory, also called the linear mapping: a fixed offset is added to the physical address to produce the virtual address the kernel uses.
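
Concretely, the linear mapping is just a fixed offset. A minimal sketch, assuming PAGE_OFFSET = 0xc0000000 and PHYS_OFFSET = 0x40000000 (consistent with the lowmem range and the memblock log above):

#define PAGE_OFFSET 0xc0000000UL
#define PHYS_OFFSET 0x40000000UL

/* same shape as the kernel's helpers on ARM without phys/virt patching */
#define __phys_to_virt(x) ((x) - PHYS_OFFSET + PAGE_OFFSET)
#define __virt_to_phys(x) ((x) - PAGE_OFFSET + PHYS_OFFSET)

/* e.g. __phys_to_virt(0x40000000) == 0xc0000000 (start of lowmem)
 *      __phys_to_virt(0x70000000) == 0xf0000000 (arm_lowmem_limit) */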

create_mapping

Concepts

1) With a single-level section mapping, a 32-bit processor can address at most 4GB, so 4096 table entries are needed; at 4 bytes per entry that is 16KB in total. The kernel's boot-time section page table normally lives at 0xc0004000~0xc0008000.
When the CPU accesses memory, the high 12 bits (bits 31-20) of the 32-bit virtual address index the section table to select an entry; the entry supplies a 12-bit physical section base, which is concatenated with the low 20 bits of the virtual address to form the 32-bit physical address.
2) With page table mapping, the section table becomes the first-level table, and each entry supplies the base address of a second-level page table rather than a physical section base. The high 12 bits of the address index the first-level table to find the entry pointing at a second-level table; the next 8 bits (bits 19-12) index that 256-entry second-level table, whose entry yields a 20-bit physical page frame number; concatenating it with the low 12 bits of the virtual address gives the final 32-bit physical address. The MMU performs this whole walk in hardware.
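
To make the bit fields above concrete, a minimal user-space sketch (the example address is arbitrary) that slices a 32-bit virtual address the way the MMU does:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t va = 0xc0123456;

	/* Section mapping: bits 31-20 index the 4096-entry table,
	 * bits 19-0 are the offset inside the 1MB section. */
	printf("section idx=0x%x off=0x%x\n", va >> 20, va & 0xfffff);

	/* Page mapping: bits 31-20 index the first level, bits 19-12
	 * the 256-entry second level, bits 11-0 the 4KB page offset. */
	printf("l1=0x%x l2=0x%x off=0x%x\n",
	       va >> 20, (va >> 12) & 0xff, va & 0xfff);
	return 0;
}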

Key data structures

struct map_desc {
	unsigned long virtual;      // starting virtual address
	unsigned long pfn;          // page frame number of the starting physical address
	unsigned long length;       // size of the region
	unsigned int type;          // memory type (RW, RWX, ...)
};

struct mem_type {
	pteval_t prot_pte;
	pteval_t prot_pte_s2;
	pmdval_t prot_l1;
	pmdval_t prot_sect;
	unsigned int domain;
};

/* the domains are defined as follows */
#ifndef CONFIG_IO_36
#define DOMAIN_KERNEL	0
#define DOMAIN_USER	1
#define DOMAIN_IO	2
#else
#define DOMAIN_KERNEL	2
#define DOMAIN_USER	1
#define DOMAIN_IO	0
#endif
#define DOMAIN_VECTORS	3

DOMAIN_KERNEL belongs to kernel (system) space, DOMAIN_IO covers I/O (device) mappings, and DOMAIN_USER is user space.

The prot_pte member holds the control and flag bits for (second-level) page table entries;
the prot_l1 member holds the control and flag bits for first-level entries.

A global mem_types[] array describes all the defined memory types. On ARM it is:

/* ARM mem_types definition */
static struct mem_type mem_types[] __ro_after_init = {
	[MT_DEVICE] = {		  /* Strongly ordered / ARMv6 shared device */
		.prot_pte	= PROT_PTE_DEVICE | L_PTE_MT_DEV_SHARED |
			  L_PTE_SHARED,
		.prot_pte_s2	= s2_policy(PROT_PTE_S2_DEVICE) |
			  s2_policy(L_PTE_S2_MT_DEV_SHARED) |
			  L_PTE_SHARED,
		.prot_l1	= PMD_TYPE_TABLE,
		.prot_sect	= PROT_SECT_DEVICE | PMD_SECT_S,
		.domain		= DOMAIN_IO,
	},
	[MT_DEVICE_NONSHARED] = { /* ARMv6 non-shared device */
		.prot_pte	= PROT_PTE_DEVICE | L_PTE_MT_DEV_NONSHARED,
		.prot_l1	= PMD_TYPE_TABLE,
		.prot_sect	= PROT_SECT_DEVICE,
		.domain		= DOMAIN_IO,
	},
	[MT_DEVICE_CACHED] = {	  /* ioremap_cached */
		.prot_pte	= PROT_PTE_DEVICE | L_PTE_MT_DEV_CACHED,
		.prot_l1	= PMD_TYPE_TABLE,
		.prot_sect	= PROT_SECT_DEVICE | PMD_SECT_WB,
		.domain		= DOMAIN_IO,
	},
	[MT_DEVICE_WC] = {	/* ioremap_wc */
		.prot_pte	= PROT_PTE_DEVICE | L_PTE_MT_DEV_WC,
		.prot_l1	= PMD_TYPE_TABLE,
		.prot_sect	= PROT_SECT_DEVICE,
		.domain		= DOMAIN_IO,
	},
	[MT_UNCACHED] = {
		.prot_pte	= PROT_PTE_DEVICE,
		.prot_l1	= PMD_TYPE_TABLE,
		.prot_sect	= PMD_TYPE_SECT | PMD_SECT_XN,
		.domain		= DOMAIN_IO,
	},
	[MT_CACHECLEAN] = {
		.prot_sect = PMD_TYPE_SECT | PMD_SECT_XN,
		.domain    = DOMAIN_KERNEL,
	},
#ifndef CONFIG_ARM_LPAE
	[MT_MINICLEAN] = {
		.prot_sect = PMD_TYPE_SECT | PMD_SECT_XN | PMD_SECT_MINICACHE,
		.domain    = DOMAIN_KERNEL,
	},
#endif
	[MT_LOW_VECTORS] = {
		.prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
			L_PTE_RDONLY,
		.prot_l1   = PMD_TYPE_TABLE,
		.domain    = DOMAIN_VECTORS,
	},
	[MT_HIGH_VECTORS] = {
		.prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
			L_PTE_USER | L_PTE_RDONLY,
		.prot_l1   = PMD_TYPE_TABLE,
		.domain    = DOMAIN_VECTORS,
	},
	[MT_MEMORY_RWX] = {
		.prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY,
		.prot_l1   = PMD_TYPE_TABLE,
		.prot_sect = PMD_TYPE_SECT | PMD_SECT_AP_WRITE,
		.domain    = DOMAIN_KERNEL,
	},
	[MT_MEMORY_RW] = {
		.prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
			     L_PTE_XN,
		.prot_l1   = PMD_TYPE_TABLE,
		.prot_sect = PMD_TYPE_SECT | PMD_SECT_AP_WRITE,
		.domain    = DOMAIN_KERNEL,
	},
	[MT_ROM] = {
		.prot_sect = PMD_TYPE_SECT,
		.domain    = DOMAIN_KERNEL,
	},
	[MT_MEMORY_RWX_NONCACHED] = {
		.prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
			L_PTE_MT_BUFFERABLE,
		.prot_l1   = PMD_TYPE_TABLE,
		.prot_sect = PMD_TYPE_SECT | PMD_SECT_AP_WRITE,
		.domain    = DOMAIN_KERNEL,
	},
	[MT_MEMORY_RW_DTCM] = {
		.prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
			L_PTE_XN,
		.prot_l1   = PMD_TYPE_TABLE,
		.prot_sect = PMD_TYPE_SECT | PMD_SECT_XN,
		.domain    = DOMAIN_KERNEL,
	},
	[MT_MEMORY_RWX_ITCM] = {
		.prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY,
		.prot_l1   = PMD_TYPE_TABLE,
		.domain    = DOMAIN_KERNEL,
	},
	[MT_MEMORY_RW_SO] = {
		.prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
			L_PTE_MT_UNCACHED | L_PTE_XN,
		.prot_l1   = PMD_TYPE_TABLE,
		.prot_sect = PMD_TYPE_SECT | PMD_SECT_AP_WRITE | PMD_SECT_S |
			PMD_SECT_UNCACHED | PMD_SECT_XN,
		.domain    = DOMAIN_KERNEL,
	},
	[MT_MEMORY_DMA_READY] = {
		.prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
			L_PTE_XN,
		.prot_l1   = PMD_TYPE_TABLE,
		.domain    = DOMAIN_KERNEL,
	},
};

create_mapping implementation

create_mapping->__create_mapping

static void __init __create_mapping(struct mm_struct *mm, struct map_desc *md,
				    void *(*alloc)(unsigned long sz),
				    bool ng)
{
	unsigned long addr, length, end;
	phys_addr_t phys;
	const struct mem_type *type;
	pgd_t *pgd;

	type = &mem_types[md->type];									(1)

#ifndef CONFIG_ARM_LPAE
	/*
	 * Catch 36-bit addresses
	 */
	if (md->pfn >= 0x100000) {
		create_36bit_mapping(mm, md, type, ng);
		return;
	}
#endif

	addr = md->virtual & PAGE_MASK;
	phys = __pfn_to_phys(md->pfn);
	length = PAGE_ALIGN(md->length + (md->virtual & ~PAGE_MASK));

	if (type->prot_l1 == 0 && ((addr | phys | length) & ~SECTION_MASK)) {
		pr_warn("BUG: map for 0x%08llx at 0x%08lx can not be mapped using pages, ignoring.\n",
			(long long)__pfn_to_phys(md->pfn), addr);
		return;
	}

	pgd = pgd_offset(mm, addr);										(2)
	end = addr + length;
	do {
		unsigned long next = pgd_addr_end(addr, end);

		alloc_init_pud(pgd, addr, next, phys, type, alloc, ng);		(3)

		phys += next - addr;
		addr = next;
	} while (pgd++, addr != end);
}

1) First obtain the mem_type structure describing the region's attributes via md->type.
2) pgd_offset returns the page global directory (PGD) entry covering the address; the kernel's first-level page table lives at swapper_pg_dir and is reached through the init_mm structure.
3) alloc_init_pud: ARM32 has only two page table levels, so the calls simply pass through:
create_mapping->alloc_init_pud->alloc_init_pmd

static void __init alloc_init_pmd(pud_t *pud, unsigned long addr,
				      unsigned long end, phys_addr_t phys,
				      const struct mem_type *type,
				      void *(*alloc)(unsigned long sz), bool ng)
{
	pmd_t *pmd = pmd_offset(pud, addr);
	unsigned long next;

	do {
		/*
		 * With LPAE, we must loop over to map
		 * all the pmds for the given range.
		 */
		next = pmd_addr_end(addr, end);

		/*
		 * Try a section mapping - addr, next and phys must all be
		 * aligned to a section boundary.
		 */
		if (type->prot_sect &&
				((addr | next | phys) & ~SECTION_MASK) == 0) {
			__map_init_section(pmd, addr, next, phys, type, ng);		(1)
		} else {
			alloc_init_pte(pmd, addr, next,								(2)
				       __phys_to_pfn(phys), type, alloc, ng);
		}

		phys += next - addr;

	} while (pmd++, addr = next, addr != end);
}

1) Try a section mapping: if the current address, the next boundary, and the physical address are all section-aligned, the range is mapped with sections (see the sketch below this list).
2) Otherwise, fall back to a second-level PTE table. Memory for the PTE table is obtained through the alloc callback passed in, which here is static void __init *early_alloc(unsigned long sz).
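
A quick stand-alone illustration of the alignment test in (1), assuming SECTION_SIZE = 1MB (so ~SECTION_MASK == 0xfffff):

#include <stdint.h>

#define SECTION_SIZE 0x100000UL
#define SECTION_MASK (~(SECTION_SIZE - 1))

/* Returns nonzero when addr, next and phys are all 1MB-aligned,
 * i.e. when a section mapping can be used. */
static int can_use_section(uint32_t addr, uint32_t next, uint32_t phys)
{
	return ((addr | next | phys) & ~SECTION_MASK) == 0;
}

/* can_use_section(0xc0000000, 0xc0200000, 0x40000000) -> 1 (sections)
 * can_use_section(0xc0000000, 0xc0180000, 0x40000000) -> 0 (falls back
 * to alloc_init_pte) */
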
Now let's look at the alloc_init_pte implementation:

static void __init alloc_init_pte(pmd_t *pmd, unsigned long addr,
				  unsigned long end, unsigned long pfn,
				  const struct mem_type *type,
				  void *(*alloc)(unsigned long sz),
				  bool ng)
{
	pte_t *pte = arm_pte_alloc(pmd, addr, type->prot_l1, alloc);
	do {
		set_pte_ext(pte, pfn_pte(pfn, __pgprot(type->prot_pte)),
			    ng ? PTE_EXT_NG : 0);
		pfn++;
	} while (pte++, addr += PAGE_SIZE, addr != end);
}

static pte_t * __init arm_pte_alloc(pmd_t *pmd, unsigned long addr,
				unsigned long prot,
				void *(*alloc)(unsigned long sz))
{
	if (pmd_none(*pmd)) {
		pte_t *pte = alloc(PTE_HWTABLE_OFF + PTE_HWTABLE_SIZE);
		__pmd_populate(pmd, __pa(pte), prot);
	}
	BUG_ON(pmd_bad(*pmd));
	return pte_offset_kernel(pmd, addr);
}

  pmd_none checks the contents of the PMD entry corresponding to this address; if it is zero, no PTE table has been set up yet and a new one must be created. (PTE_HWTABLE_OFF + PTE_HWTABLE_SIZE) bytes are allocated, i.e. 512 + 512 page table entries: 4KB, exactly one page-aligned physical page.

  The hardware MMU's first-level table indexes from bit 20 of the address, giving 4096 sections; Linux treats the first level as starting at bit 21, giving 2048 entries.
  So in ARM Linux one PGD entry maps 2MB, which with 4KB pages needs 512 PTE entries.
  In the real hardware table, one first-level entry maps 1MB, which with 4KB pages needs 256 entries.

  Each allocation therefore covers two adjacent hardware first-level entries, and the two adjacent second-level tables live in the same page: the first 512 PTE slots are used by the kernel (the Linux view), the last 512 by the hardware (two hardware tables addressing 2MB, i.e. two hardware first-level entries).
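
For reference, the definitions behind this layout, from arch/arm/include/asm/pgtable-2level.h (non-LPAE), and the resulting structure of the single page that arm_pte_alloc requests:

#define PTRS_PER_PTE		512
#define PTE_HWTABLE_PTRS	(PTRS_PER_PTE)
#define PTE_HWTABLE_OFF		(PTE_HWTABLE_PTRS * sizeof(pte_t))  /* 2048 */
#define PTE_HWTABLE_SIZE	(PTRS_PER_PTE * sizeof(u32))        /* 2048 */

/*  offset 0                           offset 2048               4096
 *  +---------------------------------+----------------------------+
 *  | 512 Linux PTEs (software view,  | 512 h/w PTEs = two 256-    |
 *  | walked by pte_offset_kernel)    | entry hardware tables,     |
 *  |                                 | covering 2 x 1MB           |
 *  +---------------------------------+----------------------------+
 */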

  The base address of this PTE table is then written into the PMD entry via __pmd_populate.

static inline void __pmd_populate(pmd_t *pmdp, phys_addr_t pte,
				  pmdval_t prot)
{
	pmdval_t pmdval = (pte + PTE_HWTABLE_OFF) | prot;	(1)
	pmdp[0] = __pmd(pmdval);							(2)
#ifndef CONFIG_ARM_LPAE
	pmdp[1] = __pmd(pmdval + 256 * sizeof(pte_t));		(3)
#endif
	flush_pmd_entry(pmdp);
}

1) The pte argument is the base address of the just-allocated PTE page (1024 entries). The hardware tables occupy the last 512 entries, so the address written into the hardware PMD is offset by 512*4 bytes (PTE_HWTABLE_OFF); it is OR-ed with the prot flags and stored in the first-level PMD entry.
2) As noted above, the hardware first level indexes from bit 20 (4096 sections) while Linux's view starts at bit 21 (2048 entries), so each PTE page provides the tables for two hardware PMD slots, and both must be filled. This line sets the first one.
3) This line sets the second one, 256 entries further on.
  Back in alloc_init_pte,
  set_pte_ext finally fills in the page table entries, writing the 20-bit physical page frame plus the flag bits into both the Linux and the hardware page tables.
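
A worked example with a hypothetical PTE page at physical address 0x4f000000 and prot = PMD_TYPE_TABLE (0x1; domain bits ignored for simplicity):

/* pmdval  = (0x4f000000 + 2048) | 0x1         = 0x4f000801
 * pmdp[0] = 0x4f000801                        -> h/w table for the 1st MB
 * pmdp[1] = 0x4f000801 + 256 * sizeof(pte_t)  = 0x4f000c01
 *                                             -> h/w table for the 2nd MB */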

Important notes

  Note: lowmem is mapped almost entirely with section mappings.

Related articles

Linux Memory Initialization (1): memblock
Linux Memory Initialization (2): paging_init
Linux Memory Initialization (3): pglist_data/zone
Linux Memory Initialization (4): the buddy system
