Linux启动并建立一套完整的页表机制要经过以下几个步骤:
1.临时内核页表的初始化(setup_32.s)
2.启动分页机制(head_32.s)
3.建立低端内存和高端内存固定映射区的页表( init_memory_mapping())
4.建立高端内存永久映射区的页表并获取固定映射区的临时映射区页表(paging_init())
下面主要介绍3和4
一、低端内存页表的建立
在setup_arch()中内核通过调用init_memory_mapping()来建立低端内存页表
void __init setup_arch(char **cmdline_p)
{
...
...
/* max_pfn_mapped is updated here */
max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
max_pfn_mapped = max_low_pfn_mapped;
...
...
}
内核将低端内存的起始地址(0),和低端内存的结束地址(max_low_pfn<<PAGE_SHIFT)传递给init_memory_mapping(),下面来看Init_memory_mapping()的具体实现,简单起见,只分析32位系统的情况
/*
* Setup the direct mapping of the physical memory at PAGE_OFFSET.
* This runs before bootmem is initialized and gets pages directly from
* the physical memory. To access them they are temporarily mapped.
*/
unsigned long __init_refok init_memory_mapping(unsigned long start,
unsigned long end)
{
unsigned long page_size_mask = 0;
unsigned long start_pfn, end_pfn;
unsigned long ret = 0;
unsigned long pos;
struct map_range mr[NR_RANGE_MR];
int nr_range, i;
int use_pse, use_gbpages;
printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
/*
* For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
* This will simplify cpa(), which otherwise needs to support splitting
* large pages into small in interrupt context, etc.
*/
use_pse = use_gbpages = 0;
#else
use_pse = cpu_has_pse;
use_gbpages = direct_gbpages;
#endif
set_nx();
if (nx_enabled)
printk(KERN_INFO "NX (Execute Disable) protection: active\n");
/* Enable PSE if available */
if (cpu_has_pse)
set_in_cr4(X86_CR4_PSE);
/* Enable PGE if available */
if (cpu_has_pge) {
set_in_cr4(X86_CR4_PGE);
__supported_pte_mask |= _PAGE_GLOBAL;
}
/*这里确定是否使用大型页*/
if (use_gbpages)
page_size_mask |= 1 << PG_LEVEL_1G;
if (use_pse)
page_size_mask |= 1 << PG_LEVEL_2M;
memset(mr, 0, sizeof(mr));
nr_range = 0;
/* head if not big page alignment ? */
start_pfn = start >> PAGE_SHIFT;/*得到起始页框的编号*/
pos = start_pfn << PAGE_SHIFT; /*得到起始物理地址,取按PAGE_SHIFT位对齐后的结果*/
#ifdef CONFIG_X86_32
/*
* Don't use a large page for the first 2/4MB of memory
* because there are often fixed size MTRRs in there
* and overlapping MTRRs into large pages can cause
* slowdowns.
*/
/*设定结束页框与起始页框的距离为2M/4K = 512K个页框*/
if (pos == 0)
end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT);
else
end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
<< (PMD_SHIFT - PAGE_SHIFT);
#else /* CONFIG_X86_64 */
end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
<< (PMD_SHIFT - PAGE_SHIFT);
#endif
if (end_pfn > (end >> PAGE_SHIFT))/*设定的结束页框要大于指定的结束地址对应的页框,
则将结束页框下调*/
end_pfn = end >> PAGE_SHIFT;
if (start_pfn < end_pfn) { /*如果起始页框小于结束页框,则将该段保存,
这个段使用的都是小型页,即4K大小*/
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
pos = end_pfn << PAGE_SHIFT; /*将起始地址移到结束地址处*/
}
/* big page (2M) range */
start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)/*将起始地址右移2M*/
<< (PMD_SHIFT - PAGE_SHIFT);
#ifdef CONFIG_X86_32
end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);/*end_pfn设为指定的end对应的页框,
取按PMD_SHIFT位对齐后的结果*/
#else /* CONFIG_X86_64 */
end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
<< (PUD_SHIFT - PAGE_SHIFT);
if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
#endif
if (start_pfn < end_pfn) {/*如果start_pfn小于end_pfn则保存该内存段*/
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
page_size_mask & (1<<PG_LEVEL_2M));/*这里确保如果2M的PG_LEVEL被置位的话,该段按2M的大型页分页*/
pos = end_pfn << PAGE_SHIFT;/*同样将起始位置调整至该段的结束页框*/
}
#ifdef CONFIG_X86_64
/* big page (1G) range */
start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
<< (PUD_SHIFT - PAGE_SHIFT);
end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
if (start_pfn < end_pfn) {
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
page_size_mask &
((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
pos = end_pfn << PAGE_SHIFT;
}
/* tail is not big page (1G) alignment */
start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
<< (PMD_SHIFT - PAGE_SHIFT);
end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
if (start_pfn < end_pfn) {
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
page_size_mask & (1<<PG_LEVEL_2M));
pos = end_pfn << PAGE_SHIFT;
}
#endif
/* tail is not big page (2M) alignment */
/*将非对齐的末端进行保存为一个段*/
start_pfn = pos>>PAGE_SHIFT;
end_pfn = end>>PAGE_SHIFT;
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
/* try to merge same page size and continuous */
for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
unsigned long old_start;
if (mr[i].end != mr[i+1].start || /*如果前内存段的地址和后内存段的地址相同,并且页面大小相同*/
mr[i].page_size_mask != mr[i+1].page_size_mask)
continue;
/* move it */
old_start = mr[i].start;
memmove(&mr[i], &mr[i+1],
(nr_range - 1 - i) * sizeof(struct map_range));
mr[i--].start = old_start;
nr_range--;
}
for (i = 0; i < nr_range; i++)
printk(KERN_DEBUG " %010lx - %010lx page %s\n",
mr[i].start, mr[i].end,
(mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
(mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
/*
* Find space for the kernel direct mapping tables.
*
* Later we should allocate these tables in the local node of the
* memory mapped. Unfortunately this is done currently before the
* nodes are discovered.
*/
/*如果还未建立bootmem allocator,则直接通过e820获取的信息来得到一个连续的空闲内存段保留内核页表*/
if (!after_bootmem)
find_early_table_space(end, use_pse, use_gbpages);
#ifdef CONFIG_X86_32
for (i = 0; i < nr_range; i++)/*遍历每个段,建立页表完成低端内存虚拟地址到物理地址的映射*/
kernel_physical_mapping_init(mr[i].start, mr[i].end,
mr[i].page_size_mask);
ret = end;
#else /* CONFIG_X86_64 */
for (i = 0; i < nr_range; i++)
ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
mr[i].page_size_mask);
#endif
#ifdef CONFIG_X86_32
early_ioremap_page_table_range_init();/*为内核的固定映射区分配页表*/
load_cr3(swapper_pg_dir);
#endif
#ifdef CONFIG_X86_64
if (!after_bootmem && !start) {
pud_t *pud;
pmd_t *pmd;
mmu_cr4_features = read_cr4();
/*
* _brk_end cannot change anymore, but it and _end may be
* located on different 2M pages. cleanup_highmap(), however,
* can only consider _end when it runs, so destroy any
* mappings beyond _brk_end here.
*/
pud = pud_offset(pgd_offset_k(_brk_end), _brk_end);
pmd = pmd_offset(pud, _brk_end - 1);
while (++pmd <= pmd_offset(pud, (unsigned long)_end - 1))
pmd_clear(pmd);
}
#endif
__flush_tlb_all();
if (!after_bootmem && e820_table_end > e820_table_start)
reserve_early(e820_table_start << PAGE_SHIFT,
e820_table_end << PAGE_SHIFT, "PGTABLE");
if (!after_bootmem)
early_memtest(start, end);
return ret >> PAGE_SHIFT;
}
将init_memory_mapping()中的几个比较关键的部分提取出来进行分析
用来保存内存段信息的struct map_range结构体的定义如下
struct map_range {
unsigned long start;
unsigned long end;
unsigned page_size_mask;
};
其包含了一个段的起始地址,结束地址,以及该段是按多大的页面进行分页(4K,2M,1G)
而save_mr()函数的工作就是初始化这三项
static int __meminit save_mr(struct map_range *mr, int nr_range,
unsigned long start_pfn, unsigned long end_pfn,
unsigned long page_size_mask)
{
if (start_pfn < end_pfn) {
if (nr_range >= NR_RANGE_MR)
panic("run out of range for init_memory_mapping\n");
mr[nr_range].start = start_pfn<<PAGE_SHIFT;
mr[nr_range].end = end_pfn<<PAGE_SHIFT;
mr[nr_range].page_size_mask = page_size_mask;
nr_range++;
}
return nr_range;
}
完成分段工作后,内核对每个段调用关键函数kernel_physical_mapping_init()来完成虚拟地址到物理地址的映射,在看这个函数之前先介绍一些关于LINUX分页的基本概念和一些关键的宏
在2.6.11后,Linux采用四级分页模型,这四级页目录分别为
- 页全局目录(Page Global Directory)
- 页上级目录(Page Upper Directory)
- 页中间目录(Page Middle Directory)
- 页表(Page Table)
对于没有启动PAE的32位系统,Linux虽然也采用四级分页模型,但本质上只用到了两级分页,Linux通过将"页上级目录"位域和“页中间目录”位域全为0来达到使用两级分页的目的,但为了保证程序能32位和64系统上都能运行,内核保留了页上级目录和页中间目录在指针序列中的位置,它们的页目录数都被内核置为1,并把这2个页目录项映射到适合的全局目录项。
PAGE_SHIFT,PMD_SHIFT,PUD_SHIFT,PGDIR_SHIFT
对应相应的页目录所能映射的区域大小的位数,如PAGE_SHIFT为12,即页面大小为4k,PMD_SHIFT为线性地址的offset和table字段的总位数,未启用PAE的32位系统下,为22.
PTRS_PER_PTE, PTRS_PER_PMD, PTRS_PER_PUD, PTRS_PER_PGD
对应相应页目录中的表项数。32位系统下,当PAE被禁止时,他们的值分别为1024,,1,1和1024,也就是说只使用两级分页。
pgd_index(addr), pud_index, pmd_index(addr), pte_index(addr)
取addr在该目录中的索引。
pte_index(addr)的实现:
static inline unsigned long pte_index(unsigned long address)
{
return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
}
pud_offset(pgd,addr), pmd_offset(pud,addr), pte_offset(pmd,addr)
以pmd_offset为例,线性地址addr对应的pmd索引在在pud指定的pmd表的偏移地址。在两级或三级分页系统中,pmd_offset和pud_offset都返回页全局目录的地址
/*
* This maps the physical memory to kernel virtual address space, a total
* of max_low_pfn pages, by creating page tables starting from address
* PAGE_OFFSET:
*/
unsigned long __init
kernel_physical_mapping_init(unsigned long start,
unsigned long end,
unsigned long page_size_mask)
{
int use_pse = page_size_mask == (1<<PG_LEVEL_2M);
unsigned long start_pfn, end_pfn;
pgd_t *pgd_base = swapper_pg_dir;
int pgd_idx, pmd_idx, pte_ofs;
unsigned long pfn;
pgd_t *pgd;
pmd_t *pmd;
pte_t *pte;
unsigned pages_2m, pages_4k;
int mapping_iter;
start_pfn = start >> PAGE_SHIFT;
end_pfn = end >> PAGE_SHIFT;
/*
* First iteration will setup identity mapping using large/small pages
* based on use_pse, with other attributes same as set by
* the early code in head_32.S
*
* Second iteration will setup the appropriate attributes (NX, GLOBAL..)
* as desired for the kernel identity mapping.
*
* This two pass mechanism conforms to the TLB app note which says:
*
* "Software should not write to a paging-structure entry in a way
* that would change, for any linear address, both the page size
* and either the page frame or attributes."
*/
mapping_iter = 1;
if (!cpu_has_pse)
use_pse = 0;
repeat:
pages_2m = pages_4k = 0;
pfn = start_pfn; /*pfn保存起始页框号*/
pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);/*低端内存的虚拟起始地址对应的pgd的偏移*/
pgd = pgd_base + pgd_idx;/*得到起始页框对应的pgd*/
/*由pgd开始遍历*/
for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
pmd = one_md_table_init(pgd);/*得到一个pmd表*/
if (pfn >= end_pfn)
continue;
#ifdef CONFIG_X86_PAE
pmd_idx = pmd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
pmd += pmd_idx;
#else
pmd_idx = 0;
#endif
/*遍历pmd表,对于未激活PAE的32位系统,PTRS_PER_PMD为1,激活PAE则为512*/
for (; pmd_idx < PTRS_PER_PMD && pfn < end_pfn;
pmd++, pmd_idx++) {
unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
/*
* Map with big pages if possible, otherwise
* create normal page tables:
*/
if (use_pse) {
unsigned int addr2;
pgprot_t prot = PAGE_KERNEL_LARGE;
/*
* first pass will use the same initial
* identity mapping attribute + _PAGE_PSE.
*/
pgprot_t init_prot =
__pgprot(PTE_IDENT_ATTR |
_PAGE_PSE);
addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
PAGE_OFFSET + PAGE_SIZE-1;
if (is_kernel_text(addr) ||
is_kernel_text(addr2))
prot = PAGE_KERNEL_LARGE_EXEC;
pages_2m++;
if (mapping_iter == 1)
set_pmd(pmd, pfn_pmd(pfn, init_prot));
else
set_pmd(pmd, pfn_pmd(pfn, prot));
pfn += PTRS_PER_PTE;
continue;
}
pte = one_page_table_init(pmd);/*创建一个page table*/
/*得到pfn在page table中的偏移并定位到具体的pte*/
pte_ofs = pte_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
pte += pte_ofs;
/*由pte开始遍历page table*/
for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
pgprot_t prot = PAGE_KERNEL;
/*
* first pass will use the same initial
* identity mapping attribute.
*/
pgprot_t init_prot = __pgprot(PTE_IDENT_ATTR);
if (is_kernel_text(addr)) /*如果处于内核代码段,权限设为可执行*/
prot = PAGE_KERNEL_EXEC;
pages_4k++;
/*!!设置pte,与pfn关联*/
if (mapping_iter == 1)/*第一次执行将权限位设为init_prot*/
set_pte(pte, pfn_pte(pfn, init_prot));
else /*第二次执行将权限位置为prot*/
set_pte(pte, pfn_pte(pfn, prot));
}
}
}
if (mapping_iter == 1) {
/*
* update direct mapping page count only in the first
* iteration.
*/
update_page_count(PG_LEVEL_2M, pages_2m);
update_page_count(PG_LEVEL_4K, pages_4k);
/*
* local global flush tlb, which will flush the previous
* mappings present in both small and large page TLB's.
*/
__flush_tlb_all();
/*
* Second iteration will set the actual desired PTE attributes.
*/
mapping_iter = 2;
goto repeat;
}
return 0;
}
至此,低端内存的物理地址和虚拟地址之间的映射关系已全部建立起来!
二、高端内存固定映射区的建立
在init_memory_mapping()中完成了低端内存映射后,其将调用early_ioremap_page_table_range_init()来建立高端内存的固定映射区页表。与低端内存的页表初始化不同的是,固定映射区的页表只是被分配,相应的PTE项并未初始化,这个工作交由后面的各个固定映射区部分的相关代码调用set_fixmap()来将相关的固定映射区页表与物理内存关联。
每个固定映射区索引都以枚举类型的形式定义在enum fixed_addresses中
enum fixed_addresses {
#ifdef CONFIG_X86_32
FIX_HOLE,
FIX_VDSO,
#else
VSYSCALL_LAST_PAGE,
VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
+ ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
VSYSCALL_HPET,
#endif
FIX_DBGP_BASE,
FIX_EARLYCON_MEM_BASE,
#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
FIX_OHCI1394_BASE,
#endif
#ifdef CONFIG_X86_LOCAL_APIC
FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
#endif
#ifdef CONFIG_X86_IO_APIC
FIX_IO_APIC_BASE_0,
FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
#endif
#ifdef CONFIG_X86_VISWS_APIC
FIX_CO_CPU, /* Cobalt timer */
FIX_CO_APIC, /* Cobalt APIC Redirection Table */
FIX_LI_PCIA, /* Lithium PCI Bridge A */
FIX_LI_PCIB, /* Lithium PCI Bridge B */
#endif
#ifdef CONFIG_X86_F00F_BUG
FIX_F00F_IDT, /* Virtual mapping for IDT */
#endif
#ifdef CONFIG_X86_CYCLONE_TIMER
FIX_CYCLONE_TIMER, /*cyclone timer register*/
#endif
#ifdef CONFIG_X86_32
FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
#ifdef CONFIG_PCI_MMCONFIG
FIX_PCIE_MCFG,
#endif
#endif
#ifdef CONFIG_PARAVIRT
FIX_PARAVIRT_BOOTMAP,
#endif
FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */
FIX_TEXT_POKE0, /* first page is last, because allocation is backward */
__end_of_permanent_fixed_addresses,
/*
* 256 temporary boot-time mappings, used by early_ioremap(),
* before ioremap() is functional.
*
* We round it up to the next 256 pages boundary so that we
* can have a single pgd entry and a single pte table:
*/
#define NR_FIX_BTMAPS 64
#define FIX_BTMAPS_SLOTS 4
FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 -
(__end_of_permanent_fixed_addresses & 255),
FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1,
#ifdef CONFIG_X86_32
FIX_WP_TEST,
#endif
#ifdef CONFIG_INTEL_TXT
FIX_TBOOT_BASE,
#endif
__end_of_fixed_addresses
};
一个索引对应一个4KB的页框,固定映射区的结束地址为FIXADDR_TOP,即0xfffff000(4G-4K),固定映射区是反向生长的,也就是说第一个索引对应的地址离FIXADDR_TOP最近。宏__fix_to_virt(idx)通过索引来计算相应的固定映射区域的线性地址
#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
void __init early_ioremap_page_table_range_init(void)
{
pgd_t *pgd_base = swapper_pg_dir;
unsigned long vaddr, end;
/*
* Fixed mappings, only the page table structure has to be
* created - mappings will be set by set_fixmap():
*/
vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
page_table_range_init(vaddr, end, pgd_base);
early_ioremap_reset();
}
/*
* This function initializes a certain range of kernel virtual memory
* with new bootmem page tables, everywhere page tables are missing in
* the given range.
*
* NOTE: The pagetables are allocated contiguous on the physical space
* so we can cache the place of the first one and move around without
* checking the pgd every time.
*/
static void __init
page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
{
int pgd_idx, pmd_idx;
unsigned long vaddr;
pgd_t *pgd;
pmd_t *pmd;
pte_t *pte = NULL;
vaddr = start;
pgd_idx = pgd_index(vaddr);/*得到vaddr对应的pgd索引*/
pmd_idx = pmd_index(vaddr);/*得到vaddr对应的pmd索引*/
pgd = pgd_base + pgd_idx; /*得到pgd项*/
for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
pmd = one_md_table_init(pgd); /*得到pmd起始项*/
pmd = pmd + pmd_index(vaddr); /*得到偏移后的pmd*/
for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
pmd++, pmd_idx++) {
/*建立pte表项并检查是vaddr是否对应内核临时映射区,若是则重新申请一个页表来保存pte表*/
pte = page_table_kmap_check(one_page_table_init(pmd),
pmd, vaddr, pte);
vaddr += PMD_SIZE;
}
pmd_idx = 0;
}
}
static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
unsigned long vaddr, pte_t *lastpte)
{
#ifdef CONFIG_HIGHMEM
/*
* Something (early fixmap) may already have put a pte
* page here, which causes the page table allocation
* to become nonlinear. Attempt to fix it, and if it
* is still nonlinear then we have to bug.
*/
/*得到内核固定映射区的临时映射区的起始和结束虚拟页框号*/
int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT;
int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT;
/*如果:1.kmap_begin和kmap_end没有重叠
2.vaddr处于kmap区间
3.相应的pte没处在低端内存页表保存区间*/
if (pmd_idx_kmap_begin != pmd_idx_kmap_end
&& (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
&& (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
&& ((__pa(pte) >> PAGE_SHIFT) < e820_table_start
|| (__pa(pte) >> PAGE_SHIFT) >= e820_table_end)) {
pte_t *newpte;
int i;
BUG_ON(after_bootmem);
newpte = alloc_low_page(); /*新申请一个页作为pte表以保证连续性*/
for (i = 0; i < PTRS_PER_PTE; i++) /*将pte表的内容拷贝到newpte表*/
set_pte(newpte + i, pte[i]);
paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT);
set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE));/*pmd与newpte表进行关联*/
BUG_ON(newpte != pte_offset_kernel(pmd, 0));
__flush_tlb_all();
paravirt_release_pte(__pa(pte) >> PAGE_SHIFT);
pte = newpte;
}
BUG_ON(vaddr < fix_to_virt(FIX_KMAP_BEGIN - 1)
&& vaddr > fix_to_virt(FIX_KMAP_END)
&& lastpte && lastpte + PTRS_PER_PTE != pte);
#endif
return pte;
}
至此高端内存的固定映射区的页表分配完成!
三、高端内存永久映射区页表的建立和临时映射区页表的获取
paging_init()负责完成剩下的页表建立工作
void __init paging_init(void)
{
pagetable_init();
__flush_tlb_all();
kmap_init();
/*
* NOTE: at this point the bootmem allocator is fully available.
*/
sparse_init();
zone_sizes_init();
}
static void __init pagetable_init(void)
{
pgd_t *pgd_base = swapper_pg_dir;
permanent_kmaps_init(pgd_base);
}
static void __init permanent_kmaps_init(pgd_t *pgd_base)
{
unsigned long vaddr;
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
vaddr = PKMAP_BASE;
/*对永久内存区进行页表分配*/
page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
pgd = swapper_pg_dir + pgd_index(vaddr);
pud = pud_offset(pgd, vaddr);
pmd = pmd_offset(pud, vaddr);
pte = pte_offset_kernel(pmd, vaddr);
pkmap_page_table = pte;/*保存永久内存区的起始页表项*/
}
permanent_kmaps_init()完成了永久内存区的页表分配
static void __init kmap_init(void)
{
unsigned long kmap_vstart;
/*
* Cache the first kmap pte:
*/
kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);/*得到固定映射区的临时映射区的虚拟地址*/
kmap_pte = kmap_get_fixmap_pte(kmap_vstart);/*得到固定映射区中的临时映射区的起始页表项*/
kmap_prot = PAGE_KERNEL;
}
kmap_init()获取了之前已经分配了的临时映射区的页表,把起始页表项保存在kmap_pte中。