I have been working with the Phytium E2000 development board a lot recently and ran into a few boot problems, so I traced the Linux kernel boot flow. We cannot read the Kylin (麒麟) kernel sources, but we can read Phytium's open-source kernel directly; click here to jump to the gitee repository. We are using the 5.10 branch.
Assembly background needed for this article:
- .quad defines an 8-byte value; .long defines a 4-byte value.
- SYM_CODE_START defines a function (an assembly entry point); once defined, it can be reached with bl or b.
If some of the instructions are unfamiliar, see the next article: assembly instructions needed to read the Linux kernel boot flow.
The Phytium E2000 board can boot through either U-Boot or UEFI. Whichever loader loads the Linux kernel, execution starts at _head in arch/arm64/kernel/head.S. Let's start the analysis:
1.HEAD
__HEAD
_head:
/*
* DO NOT MODIFY. Image header expected by Linux boot-loaders.
*/
#ifdef CONFIG_EFI
/*
* This add instruction has no meaningful effect except that
* its opcode forms the magic "MZ" signature required by UEFI.
*/
add x13, x18, #0x16
b primary_entry
#else
b primary_entry // branch to kernel start, magic
.long 0 // reserved
#endif
.quad 0 // Image load offset from start of RAM, little-endian
le64sym _kernel_size_le // Effective size of kernel image, little-endian
le64sym _kernel_flags_le // Informative flags, little-endian
.quad 0 // reserved
.quad 0 // reserved
.quad 0 // reserved
.ascii ARM64_IMAGE_MAGIC // Magic number
#ifdef CONFIG_EFI
.long pe_header - _head // Offset to the PE header.
pe_header:
__EFI_PE_HEADER
#else
.long 0 // reserved
#endif
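The 64-byte header laid out above follows the documented arm64 boot protocol. A C-style view of it (field names follow Documentation/arm64/booting.rst; this struct is only illustrative, the kernel emits the fields with the directives above):
struct arm64_image_header {
	u32 code0;        /* executable code; its opcode forms the "MZ" magic when CONFIG_EFI */
	u32 code1;        /* executable code: b primary_entry                                 */
	u64 text_offset;  /* image load offset from start of RAM, little-endian               */
	u64 image_size;   /* effective size of the kernel image, little-endian                */
	u64 flags;        /* informative flags, little-endian                                  */
	u64 res2;         /* reserved */
	u64 res3;         /* reserved */
	u64 res4;         /* reserved */
	u32 magic;        /* ARM64_IMAGE_MAGIC, "ARM\x64"                                      */
	u32 res5;         /* reserved; offset to the PE header when CONFIG_EFI                 */
};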
Inside _head only one thing happens: a branch to primary_entry:
SYM_CODE_START(primary_entry)
bl preserve_boot_args // save the arguments passed by the bootloader into boot_args
bl el2_setup // check whether we are at EL1 or EL2:
//if EL1, it is simple: just configure sctlr_el1;
//if EL2, it is more involved: configure sctlr_el2, the memory system, hcr and the gic
adrp x23, __PHYS_OFFSET
and x23, x23, MIN_KIMG_ALIGN - 1 // KASLR offset, defaults to 0
bl set_cpu_boot_mode_flag // record the EL this CPU booted in into __boot_cpu_mode
bl __create_page_tables // create the early page tables
/*
* The following calls CPU setup code, see arch/arm64/mm/proc.S for
* details.
* On return, the CPU will be ready for the MMU to be turned on and
* the TCR will have been set.
*/
bl __cpu_setup // initialise the processor so the MMU can be turned on
b __primary_switch // set TTBR0/TTBR1, enable the MMU, relocate the kernel image, jump to __primary_switched
SYM_CODE_END(primary_entry)
primary_entry performs the following steps:
- call preserve_boot_args to save the arguments passed by the bootloader into boot_args
- call el2_setup to check whether we are at EL1 or EL2; at EL1 it is simple, only sctlr_el1 needs configuring; at EL2 it is more involved, sctlr_el2, the memory system, hcr and the gic all need configuring
- call set_cpu_boot_mode_flag to record the exception level this CPU booted in into __boot_cpu_mode
- call __create_page_tables to create the early page tables
- call __cpu_setup to initialise the processor so the MMU can be turned on
- call __primary_switch to set TTBR0 and TTBR1, enable the MMU, relocate the kernel image, and jump to __primary_switched (a C-level sketch of the whole sequence follows)
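The same sequence in rough C-like pseudocode (only a sketch; x21/x23 stand for the long-lived registers used across head.S, and the helper names mirror the assembly labels):
void primary_entry(void)
{
	preserve_boot_args();      /* x21 <- FDT address that arrived in x0            */
	el2_setup();               /* returns BOOT_CPU_MODE_EL1 or _EL2 in w0          */
	/* x23 <- __PHYS_OFFSET & (MIN_KIMG_ALIGN - 1): KASLR/misalignment offset      */
	set_cpu_boot_mode_flag();  /* record w0 in __boot_cpu_mode                     */
	__create_page_tables();    /* build idmap_pg_dir and init_pg_dir               */
	__cpu_setup();             /* returns the SCTLR_EL1 value to use in x0         */
	__primary_switch();        /* enable the MMU, then jump to __primary_switched  */
}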
1.preserve_boot_args
SYM_CODE_START_LOCAL(preserve_boot_args)
mov x21, x0 // x21 = FDT: stash the FDT address in x21, freeing x0 for use as a scratch register
adr_l x0, boot_args // x0 = address of the boot_args variable
stp x21, x1, [x0] // save x0 (via x21) and x1 into boot_args[0] and boot_args[1]
stp x2, x3, [x0, #16] // save x2 and x3 into boot_args[2] and boot_args[3]
dmb sy // needed before dc ivac with
// MMU off
mov x1, #0x20 // 4 x 8 bytes
b __inval_dcache_area // invalidate the data cache over [boot_args, boot_args + 0x20)
SYM_CODE_END(preserve_boot_args)
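What preserve_boot_args leaves behind is simply the C array below (defined in arch/arm64/kernel/setup.c); the comments describe what the arm64 boot protocol expects in each register:
u64 __cacheline_aligned boot_args[4];
/* boot_args[0] = x0: physical address of the device tree blob (FDT)
 * boot_args[1..3] = x1..x3: must be 0 per the boot protocol;
 * setup_arch() later warns if they are not. */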
1.1 __inval_dcache_area
//__inval_dcache_area(kaddr, size)
SYM_FUNC_START_PI(__inval_dcache_area)
/* FALLTHROUGH */
/*
* __dma_inv_area(start, size)
* - start - virtual start address of region
* - size - size in question
*/
add x1, x1, x0 // x1 = kaddr + size (end of the region)
dcache_line_size x2, x3
sub x3, x2, #1
tst x1, x3 // end cache line aligned?
bic x1, x1, x3
b.eq 1f
dc civac, x1 // clean & invalidate D / U line
1: tst x0, x3 // start cache line aligned?
bic x0, x0, x3
b.eq 2f
dc civac, x0 // clean & invalidate D / U line
b 3f
2: dc ivac, x0 // invalidate D / U line
3: add x0, x0, x2
cmp x0, x1
b.lo 2b
dsb sy
ret
SYM_FUNC_END_PI(__inval_dcache_area)
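The same logic in C, as a sketch (dc_civac/dc_ivac/dsb_sy and dcache_line_size are hypothetical stand-ins for the corresponding instructions): lines that only partially overlap the range are cleaned and invalidated so neighbouring data is not lost, while lines fully inside it are simply invalidated.
void inval_dcache_area(unsigned long kaddr, unsigned long size)
{
	unsigned long line  = dcache_line_size();      /* cache line size, from CTR_EL0 */
	unsigned long start = kaddr & ~(line - 1);
	unsigned long end   = (kaddr + size) & ~(line - 1);

	if ((kaddr + size) & (line - 1))
		dc_civac(end);           /* end line partially covered: clean + invalidate   */
	if (kaddr & (line - 1)) {
		dc_civac(start);         /* start line partially covered: clean + invalidate */
		start += line;
	}
	for (; start < end; start += line)
		dc_ivac(start);          /* fully covered lines: invalidate only             */
	dsb_sy();
}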
2.el2_setup
/*
* If we're fortunate enough to boot at EL2, ensure that the world is
* sane before dropping to EL1.
*
* Returns either BOOT_CPU_MODE_EL1 or BOOT_CPU_MODE_EL2 in w0 if
* booted in EL1 or EL2 respectively.
*/
SYM_FUNC_START(el2_setup)
msr SPsel, #1 // write 1 to SPsel: use SP_ELx rather than SP_EL0
mrs x0, CurrentEL // read the current exception level
cmp x0, #CurrentEL_EL2 // are we at EL2?
b.eq 1f // if so, branch to 1f
mov_q x0, (SCTLR_EL1_RES1 | ENDIAN_SET_EL1)
msr sctlr_el1, x0 // configure the EL1 system control register
mov w0, #BOOT_CPU_MODE_EL1 // return value goes in w0
isb // synchronisation barrier
ret // return
//from here on we know the current level is EL2
1: mov_q x0, (SCTLR_EL2_RES1 | ENDIAN_SET_EL2)
msr sctlr_el2, x0 // configure the EL2 system control register
#ifdef CONFIG_ARM64_VHE
/*
* Check for VHE being present. For the rest of the EL2 setup,
* x2 being non-zero indicates that we do have VHE, and that the
* kernel is intended to run at EL2.
*/
mrs x2, id_aa64mmfr1_el1 // read the memory model feature register
ubfx x2, x2, #ID_AA64MMFR1_VHE_SHIFT, #4 // extract the VHE (Virtualization Host Extensions) field
#else
mov x2, xzr
#endif
/* Hyp configuration. */ // hypervisor configuration register (hcr_el2)
mov_q x0, HCR_HOST_NVHE_FLAGS // default (non-VHE) host flags
cbz x2, set_hcr // x2 == 0 (no VHE, classic split EL2/EL1 host): keep the non-VHE flags
mov_q x0, HCR_HOST_VHE_FLAGS // VHE host flags: route exceptions to EL2, enable E2H
set_hcr:
msr hcr_el2, x0 // write the chosen flags into hcr_el2
isb // synchronisation barrier
/*
* Allow Non-secure EL1 and EL0 to access physical timer and counter.
* This is not necessary for VHE, since the host kernel runs in EL2,
* and EL0 accesses are configured in the later stage of boot process.
* Note that when HCR_EL2.E2H == 1, CNTHCTL_EL2 has the same bit layout
* as CNTKCTL_EL1, and CNTKCTL_EL1 accessing instructions are redefined
* to access CNTHCTL_EL2. This allows the kernel designed to run at EL1
* to transparently mess with the EL0 bits via CNTKCTL_EL1 access in
* EL2.
*/
cbnz x2, 1f // x2 != 0 (VHE present): skip the timer setup below
mrs x0, cnthctl_el2 // read the EL2 timer control register
orr x0, x0, #3 // Enable EL1 physical timers
msr cnthctl_el2, x0
1:
msr cntvoff_el2, xzr // no offset between the physical and virtual counters
#ifdef CONFIG_ARM_GIC_V3
/* GICv3 system register access */
mrs x0, id_aa64pfr0_el1 // read the processor feature register
ubfx x0, x0, #ID_AA64PFR0_GIC_SHIFT, #4
cbz x0, 3f // no GICv3/v4 system register interface: skip to 3f
//here the GIC system register interface (GICv3/v4) is present
mrs_s x0, SYS_ICC_SRE_EL2 // read the interrupt controller system register enable register
orr x0, x0, #ICC_SRE_EL2_SRE // Set ICC_SRE_EL2.SRE==1
orr x0, x0, #ICC_SRE_EL2_ENABLE // Set ICC_SRE_EL2.Enable==1
msr_s SYS_ICC_SRE_EL2, x0
isb // Make sure SRE is now set
mrs_s x0, SYS_ICC_SRE_EL2 // Read SRE back,
tbz x0, #0, 3f // and check that it sticks
msr_s SYS_ICH_HCR_EL2, xzr // Reset ICC_HCR_EL2 to defaults
3:
#endif
/* Populate ID registers. */
//populate the virtualisation ID registers
mrs x0, midr_el1
mrs x1, mpidr_el1
msr vpidr_el2, x0 // virtualisation processor ID register
msr vmpidr_el2, x1 // virtualisation multiprocessor ID register
#ifdef CONFIG_COMPAT
msr hstr_el2, xzr // Disable CP15 traps to EL2
#endif
/* EL2 debug */
mrs x1, id_aa64dfr0_el1 // read the AArch64 debug feature register
sbfx x0, x1, #ID_AA64DFR0_PMUVER_SHIFT, #4
cmp x0, #1
b.lt 4f // Skip if no PMU present
mrs x0, pmcr_el0 // read the performance monitors control register
ubfx x0, x0, #11, #5 // extract PMCR_EL0.N, the number of event counters
4:
csel x3, xzr, x0, lt // all PMU counters from EL1
/* Statistical profiling */
ubfx x0, x1, #ID_AA64DFR0_PMSVER_SHIFT, #4
cbz x0, 7f // Skip if SPE not present
cbnz x2, 6f // VHE?
mrs_s x4, SYS_PMBIDR_EL1 // If SPE available at EL2,
and x4, x4, #(1 << SYS_PMBIDR_EL1_P_SHIFT)
cbnz x4, 5f // then permit sampling of physical
mov x4, #(1 << SYS_PMSCR_EL2_PCT_SHIFT | \
1 << SYS_PMSCR_EL2_PA_SHIFT)
msr_s SYS_PMSCR_EL2, x4 // addresses and physical counter
5:
mov x1, #(MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT)
orr x3, x3, x1 // If we don't have VHE, then
b 7f // use EL1&0 translation.
6: // For VHE, use EL2 translation
orr x3, x3, #MDCR_EL2_TPMS // and disable access from EL1
7:
msr mdcr_el2, x3 // Configure debug traps
/* LORegions */
mrs x1, id_aa64mmfr1_el1 // AArch64 memory model feature register
ubfx x0, x1, #ID_AA64MMFR1_LOR_SHIFT, 4
cbz x0, 1f
msr_s SYS_LORC_EL1, xzr
1:
/* Stage-2 translation */
msr vttbr_el2, xzr // clear the virtualisation (stage-2) translation table base register
cbz x2, install_el2_stub
mov w0, #BOOT_CPU_MODE_EL2 // This CPU booted in EL2
isb
ret
SYM_INNER_LABEL(install_el2_stub, SYM_L_LOCAL)
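The listing above stops at install_el2_stub; in the 5.10 source the non-VHE path goes on to install the EL2 stub vectors and eret down to EL1. The overall decision can be summarised in rough C-like pseudocode (msr()/mrs()/vhe_present() are hypothetical helpers, and most of the EL2 detail is elided):
int el2_setup(void)
{
	msr(SPsel, 1);                             /* use SP_ELx, not SP_EL0          */
	if (mrs(CurrentEL) != CurrentEL_EL2) {     /* booted in EL1: little to do     */
		msr(sctlr_el1, SCTLR_EL1_RES1 | ENDIAN_SET_EL1);
		return BOOT_CPU_MODE_EL1;
	}
	msr(sctlr_el2, SCTLR_EL2_RES1 | ENDIAN_SET_EL2);
	/* ... hcr_el2, timers, GIC, PMU/SPE traps, vpidr/vmpidr, vttbr ...            */
	if (vhe_present())                         /* VHE: the kernel stays at EL2     */
		return BOOT_CPU_MODE_EL2;
	install_el2_stub();                        /* non-VHE: install stub vectors, eret to EL1 */
	return BOOT_CPU_MODE_EL2;                  /* still reports that it booted in EL2        */
}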
3. set_cpu_boot_mode_flag
SYM_FUNC_START_LOCAL(set_cpu_boot_mode_flag)
adr_l x1, __boot_cpu_mode // x1 = address of __boot_cpu_mode
cmp w0, #BOOT_CPU_MODE_EL2 // did this CPU boot in EL2?
b.ne 1f // no: keep x1 pointing at __boot_cpu_mode[0]
add x1, x1, #4 // yes, booted in EL2: use __boot_cpu_mode[1]
// (CPUs that booted in EL1 write __boot_cpu_mode[0])
1: str w0, [x1] // record w0 in the chosen __boot_cpu_mode slot
dmb sy
dc ivac, x1 // Invalidate potentially stale cache line
ret
SYM_FUNC_END(set_cpu_boot_mode_flag)
set_cpu_boot_mode_flag records the boot mode held in w0 into __boot_cpu_mode according to the exception level this CPU booted in: EL1 boots write __boot_cpu_mode[0], EL2 boots write __boot_cpu_mode[1].
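A sketch of how the recorded values are consumed later (modelled on is_hyp_mode_available() in arch/arm64/include/asm/virt.h): only if every CPU wrote BOOT_CPU_MODE_EL2 into slot 1 and nobody touched slot 0 does the kernel treat EL2 as usable.
extern u32 __boot_cpu_mode[2];   /* initialised to { BOOT_CPU_MODE_EL2, BOOT_CPU_MODE_EL1 } */

static bool hyp_mode_available(void)
{
	/* EL2 boots write slot 1, EL1 boots write slot 0, so both slots read
	 * BOOT_CPU_MODE_EL2 only if every CPU entered the kernel in EL2. */
	return __boot_cpu_mode[0] == BOOT_CPU_MODE_EL2 &&
	       __boot_cpu_mode[1] == BOOT_CPU_MODE_EL2;
}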
4. __create_page_tables
SYM_FUNC_START_LOCAL(__create_page_tables)
mov x28, lr // save the return address (lr is the link register) in x28
/*
* Invalidate the init page tables to avoid potential dirty cache lines
* being evicted. Other page tables are allocated in rodata as part of
* the kernel image, and thus are clean to the PoC per the boot
* protocol.
*/
adrp x0, init_pg_dir // start address of the kernel's init page tables
adrp x1, init_pg_end // end address of the kernel's init page tables
sub x1, x1, x0
bl __inval_dcache_area // invalidate the D-cache over this range
/*
* Clear the init page tables.
*/
//zero the memory from init_pg_dir to init_pg_end,
//i.e. clear the init kernel page tables
adrp x0, init_pg_dir
adrp x1, init_pg_end
sub x1, x1, x0
1: stp xzr, xzr, [x0], #16 // write zeros to the memory at x0, then advance x0 by 16
stp xzr, xzr, [x0], #16
stp xzr, xzr, [x0], #16
stp xzr, xzr, [x0], #16
subs x1, x1, #64
b.ne 1b
mov x7, SWAPPER_MM_MMUFLAGS
/*
* Create the identity mapping.
*/
//create the identity mapping, i.e. virtual address == physical address
adrp x0, idmap_pg_dir // start of the identity map's page global directory
adrp x3, __idmap_text_start // start of the identity-mapped code section
#ifdef CONFIG_ARM64_VA_BITS_52 //not supported here, can be skipped
mrs_s x6, SYS_ID_AA64MMFR2_EL1
and x6, x6, #(0xf << ID_AA64MMFR2_LVA_SHIFT)
mov x5, #52
cbnz x6, 1f
#endif
mov x5, #VA_BITS_MIN // number of virtual address bits
1:
adr_l x6, vabits_actual // x6 = address of vabits_actual
str x5, [x6] // store the VA bits value into vabits_actual
dmb sy
dc ivac, x6 // invalidate the cache line holding vabits_actual
/*
* VA_BITS may be too small to allow for an ID mapping to be created
* that covers system RAM if that is located sufficiently high in the
* physical address space. So for the ID map, use an extended virtual
* range in that case, and configure an additional translation level
* if needed.
*
* Calculate the maximum allowed value for TCR_EL1.T0SZ so that the
* entire ID map region can be mapped. As T0SZ == (64 - #bits used),
* this number conveniently equals the number of leading zeroes in
* the physical address of __idmap_text_end.
*/
//T0SZ sets the size of the TTBR0 (identity map) virtual range; check whether the
//default range is large enough to reach the physical address of the idmap code
adrp x5, __idmap_text_end // page-aligned physical address of __idmap_text_end
clz x5, x5 // count the leading zeros of x5
cmp x5, TCR_T0SZ(VA_BITS_MIN) // default T0SZ small enough?
b.ge 1f // .. then skip VA range extension
adr_l x6, idmap_t0sz // x6 = address of idmap_t0sz
str x5, [x6] // store the new T0SZ value into idmap_t0sz
dmb sy
dc ivac, x6 // Invalidate potentially stale cache line
#if (VA_BITS < 48)
#define EXTRA_SHIFT (PGDIR_SHIFT + PAGE_SHIFT - 3)
#define EXTRA_PTRS (1 << (PHYS_MASK_SHIFT - EXTRA_SHIFT))
/*
* If VA_BITS < 48, we have to configure an additional table level.
* First, we have to verify our assumption that the current value of
* VA_BITS was chosen such that all translation levels are fully
* utilised, and that lowering T0SZ will always result in an additional
* translation level to be configured.
*/
#if VA_BITS != EXTRA_SHIFT
#error "Mismatch between VA_BITS and page size/number of translation levels"
#endif
mov x4, EXTRA_PTRS
create_table_entry x0, x3, EXTRA_SHIFT, x4, x5, x6 // set up the extra translation level
#else
/*
* If VA_BITS == 48, we don't have to configure an additional
* translation level, but the top-level table has more entries.
*/
mov x4, #1 << (PHYS_MASK_SHIFT - PGDIR_SHIFT)
str_l x4, idmap_ptrs_per_pgd, x5
#endif
1:
ldr_l x4, idmap_ptrs_per_pgd // load the value of idmap_ptrs_per_pgd into x4
mov x5, x3 // __pa(__idmap_text_start)
adr_l x6, __idmap_text_end // __pa(__idmap_text_end)
//map the given virtual address range
map_memory x0, x1, x3, x6, x7, x3, x4, x10, x11, x12, x13, x14 // populate the identity-map page tables
/*
* Map the kernel image (starting with PHYS_OFFSET).
*/
//map the kernel image
adrp x0, init_pg_dir // base of the init page tables
mov_q x5, KIMAGE_VADDR // virtual address of the kernel image
add x5, x5, x23 // add KASLR displacement
mov x4, PTRS_PER_PGD // number of PGD entries
adrp x6, _end // physical end of the kernel image
adrp x3, _text // physical start of the kernel image
sub x6, x6, x3 // size of the kernel image
add x6, x6, x5 // virtual end of the kernel image
map_memory x0, x1, x5, x6, x7, x3, x4, x10, x11, x12, x13, x14 // populate the kernel image mapping
/*
* Since the page tables have been populated with non-cacheable
* accesses (MMU disabled), invalidate those tables again to
* remove any speculatively loaded cache lines.
*/
dmb sy
adrp x0, idmap_pg_dir
adrp x1, idmap_pg_end
sub x1, x1, x0
bl __inval_dcache_area // invalidate the D-cache
adrp x0, init_pg_dir
adrp x1, init_pg_end
sub x1, x1, x0
bl __inval_dcache_area // invalidate the D-cache
ret x28 // return
SYM_FUNC_END(__create_page_tables)
__create_page_tables performs the following steps:
- mov x28, lr saves the return address
- invalidate the D-cache over the init page tables
- zero the memory from init_pg_dir to init_pg_end with an stp loop
- create the identity mapping, so that virtual and physical addresses are equal
- create the kernel image mapping
- invalidate the D-cache over both page tables again
Note:
The identity map in idmap_pg_dir covers the physical range __idmap_text_start to __idmap_text_end, i.e. the identity-mapped code section. The coarse kernel map in init_pg_dir (whose address is later written to ttbr1_el1) covers _text to _end, i.e. the whole kernel image. These early page tables are superseded after paging_init(), which switches to swapper_pg_dir.
4.1 map_memory
Let's look at how map_memory creates and fills in the page tables:
.macro map_memory, tbl, rtbl, vstart, vend, flags, phys, pgds, istart, iend, tmp, count, sv
sub \vend, \vend, #1 // make vend inclusive (end address minus one)
add \rtbl, \tbl, #PAGE_SIZE // the next-level table lives in the page right after the top-level table
mov \sv, \rtbl
mov \count, #0
//compute_indices computes the page-table indices of vstart and vend at this level; their difference is kept in count
compute_indices \vstart, \vend, #PGDIR_SHIFT, \pgds, \istart, \iend, \count
//populate_entries writes either table descriptors pointing at the next level or the last-level mappings
populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp
mov \tbl, \sv
mov \sv, \rtbl
#if SWAPPER_PGTABLE_LEVELS > 3
compute_indices \vstart, \vend, #PUD_SHIFT, #PTRS_PER_PUD, \istart, \iend, \count
populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp
mov \tbl, \sv
mov \sv, \rtbl
#endif
#if SWAPPER_PGTABLE_LEVELS > 2
compute_indices \vstart, \vend, #SWAPPER_TABLE_SHIFT, #PTRS_PER_PMD, \istart, \iend, \count
populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp
mov \tbl, \sv
#endif
compute_indices \vstart, \vend, #SWAPPER_BLOCK_SHIFT, #PTRS_PER_PTE, \istart, \iend, \count
bic \count, \phys, #SWAPPER_BLOCK_SIZE - 1
populate_entries \tbl, \count, \istart, \iend, \flags, #SWAPPER_BLOCK_SIZE, \tmp
.endm
The two key macros here are:
- compute_indices: computes the page-table indices of vstart and vend at the given level; their difference is kept in count (see the sketch below);
- populate_entries: writes either table descriptors pointing at the next level or the last-level mappings.
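The index arithmetic is easier to read in C. A sketch of compute_indices (a simplification of the assembly macro above; count carries across levels so that a range spanning several upper-level entries opens enough next-level tables):
static void compute_indices(u64 vstart, u64 vend, unsigned int shift, u64 ptrs,
			    u64 *istart, u64 *iend, u64 *count)
{
	*istart = (vstart >> shift) & (ptrs - 1);  /* index of the first entry at this level   */
	*iend   = ((vend >> shift) & (ptrs - 1))   /* index of the last entry ...              */
		  + *count * ptrs;                 /* ... shifted by the tables opened so far  */
	*count  = *iend - *istart;                 /* extra next-level tables this range needs */
}
populate_entries then walks entries istart..iend and writes either a table descriptor (PMD_TYPE_TABLE) pointing at the next page taken from rtbl or, at the last level, a block entry carrying the SWAPPER_MM_MMUFLAGS attributes.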
5. __cpu_setup
SYM_FUNC_START(__cpu_setup)
tlbi vmalle1 // invalidate the local TLB
dsb nsh
mov x1, #3 << 20 //x1=0x300000
msr cpacr_el1, x1 // enable FP/ASIMD instructions at EL1 and EL0
mov x1, #1 << 12 // Reset mdscr_el1 and disable
msr mdscr_el1, x1 // EL0 access to the AArch64 DCC registers is trapped
isb // Unmask debug exceptions now,
enable_dbg // since this is per-cpu
reset_pmuserenr_el0 x1 // Disable PMU access from EL0
reset_amuserenr_el0 x1 // Disable AMU access from EL0
/*
* Memory region attributes
*/
mov_q x5, MAIR_EL1_SET // memory attribute encodings (nGnRnE etc.)
#ifdef CONFIG_ARM64_MTE // if Memory Tagging Extension support is enabled
/*
* Update MAIR_EL1, GCR_EL1 and TFSR*_EL1 if MTE is supported
* (ID_AA64PFR1_EL1[11:8] > 1).
*/
mrs x10, ID_AA64PFR1_EL1
ubfx x10, x10, #ID_AA64PFR1_MTE_SHIFT, #4
cmp x10, #ID_AA64PFR1_MTE
b.lt 1f
/* Normal Tagged memory type at the corresponding MAIR index */
mov x10, #MAIR_ATTR_NORMAL_TAGGED
bfi x5, x10, #(8 * MT_NORMAL_TAGGED), #8
/* initialize GCR_EL1: all non-zero tags excluded by default */
mov x10, #(SYS_GCR_EL1_RRND | SYS_GCR_EL1_EXCL_MASK)
msr_s SYS_GCR_EL1, x10
/*
* If GCR_EL1.RRND=1 is implemented the same way as RRND=0, then
* RGSR_EL1.SEED must be non-zero for IRG to produce
* pseudorandom numbers. As RGSR_EL1 is UNKNOWN out of reset, we
* must initialize it.
*/
mrs x10, CNTVCT_EL0
ands x10, x10, #SYS_RGSR_EL1_SEED_MASK
csinc x10, x10, xzr, ne
lsl x10, x10, #SYS_RGSR_EL1_SEED_SHIFT
msr_s SYS_RGSR_EL1, x10
/* clear any pending tag check faults in TFSR*_EL1 */
msr_s SYS_TFSR_EL1, xzr
msr_s SYS_TFSRE0_EL1, xzr
1:
#endif
msr mair_el1, x5 // program attributes into the 8 fields of MAIR_EL1
/*
* Set/prepare TCR and TTBR. We use 512GB (39-bit) address range for
* both user and kernel.
*/
//prepare TCR_EL1
mov_q x10, TCR_TxSZ(VA_BITS) | TCR_CACHE_FLAGS | TCR_SMP_FLAGS | \
TCR_TG_FLAGS | TCR_KASLR_FLAGS | TCR_ASID16 | \
TCR_TBI0 | TCR_A1 | TCR_KASAN_FLAGS
tcr_clear_errata_bits x10, x9, x5 // clear TCR bits that trigger errata on this CPU
#ifdef CONFIG_ARM64_VA_BITS_52
ldr_l x9, vabits_actual
sub x9, xzr, x9
add x9, x9, #64
tcr_set_t1sz x10, x9
#else
ldr_l x9, idmap_t0sz // load idmap_t0sz
#endif
tcr_set_t0sz x10, x9 // update T0SZ so we will be able to load the ID map
/*
* Set the IPS bits in TCR_EL1.
*/
tcr_compute_pa_size x10, #TCR_IPS_SHIFT, x5, x6 // set TCR.IPS to the highest supported value
#ifdef CONFIG_ARM64_HW_AFDBM // if hardware update of the Access and Dirty page flags is supported
/*
* Enable hardware update of the Access Flags bit.
* Hardware dirty bit management is enabled later,
* via capabilities.
*/
mrs x9, ID_AA64MMFR1_EL1
and x9, x9, #0xf
cbz x9, 1f // skip if the CPU does not support hardware Access flag updates
orr x10, x10, #TCR_HA // enable hardware Access flag updates
1:
#endif /* CONFIG_ARM64_HW_AFDBM */
msr tcr_el1, x10 // write tcr_el1
/*
* Prepare SCTLR
*/
mov_q x0, SCTLR_EL1_SET
ret // return to head.S
SYM_FUNC_END(__cpu_setup)
__cpu_setup performs the following steps:
- tlbi vmalle1 invalidates the local TLB
- enable FP/ASIMD instructions at EL1 and EL0
- trap EL0 accesses to the AArch64 DCC registers
- disable PMU and AMU access from EL0
- program the memory attribute fields of MAIR_EL1 (DEVICE_nGnRnE, DEVICE_nGnRE, DEVICE_GRE, NORMAL_NC, NORMAL, NORMAL_WT, ...)
- clear TCR bits that trigger errata on this CPU
- update T0SZ so the ID map can be loaded (see the arithmetic check below)
- enable hardware Access flag updates
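The "512GB (39-bit)" comment in the source and the TCR_TxSZ(VA_BITS) macro are just the T0SZ/T1SZ = 64 - VA_BITS relationship. A small stand-alone program to sanity-check the arithmetic (illustrative only, not kernel code):
#include <stdio.h>

int main(void)
{
	for (int va_bits = 39; va_bits <= 48; va_bits += 9) {
		unsigned txsz = 64 - va_bits;                      /* value programmed into TCR_EL1.TxSZ */
		unsigned long long gib = (1ULL << va_bits) >> 30;  /* span covered by one TTBR           */
		printf("VA_BITS=%d -> TxSZ=%u, %llu GiB per TTBR\n", va_bits, txsz, gib);
	}
	return 0;   /* prints: VA_BITS=39 -> TxSZ=25, 512 GiB; VA_BITS=48 -> TxSZ=16, 262144 GiB */
}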
6. __primary_switch
SYM_FUNC_START_LOCAL(__primary_switch)
#ifdef CONFIG_RANDOMIZE_BASE
mov x19, x0 // preserve the new SCTLR_EL1 value
mrs x20, sctlr_el1 // preserve the old SCTLR_EL1 value
#endif
adrp x1, init_pg_dir // get the base address of init_pg_dir
bl __enable_mmu // turn on the MMU
#ifdef CONFIG_RELOCATABLE
#ifdef CONFIG_RELR
mov x24, #0 // no RELR displacement yet
#endif
bl __relocate_kernel
#ifdef CONFIG_RANDOMIZE_BASE // not enabled in our config, can be skipped
ldr x8, =__primary_switched // x8 = address of __primary_switched
adrp x0, __PHYS_OFFSET // x0 = physical base address of the kernel image
blr x8 // call __primary_switched; on return, execution continues at the next instruction
/*
* If we return here, we have a KASLR displacement in x23 which we need
* to take into account by discarding the current kernel mapping and
* creating a new one.
*/
pre_disable_mmu_workaround
msr sctlr_el1, x20 // disable the MMU
isb
bl __create_page_tables // recreate kernel mapping
tlbi vmalle1 // Remove any stale TLB entries
dsb nsh
isb
msr sctlr_el1, x19 // re-enable the MMU
isb
ic iallu // flush instructions fetched
dsb nsh // via old mapping
isb
bl __relocate_kernel
#endif
#endif
ldr x8, =__primary_switched // x8 = address of __primary_switched
adrp x0, __PHYS_OFFSET // x0 = physical base address of the kernel image
br x8 // jump to __primary_switched and never return
SYM_FUNC_END(__primary_switch)
__primary_switch performs the following steps:
- get the base address of init_pg_dir
- call __enable_mmu to turn on the MMU
- jump to __primary_switched and never return
6.1 __enable_mmu
SYM_FUNC_START(__enable_mmu)
mrs x2, ID_AA64MMFR0_EL1 // read the memory model feature register
ubfx x2, x2, #ID_AA64MMFR0_TGRAN_SHIFT, 4 // extract the 4-bit TGRAN field
cmp x2, #ID_AA64MMFR0_TGRAN_SUPPORTED_MIN // if the configured granule (4K here) is
b.lt __no_granule_support // not supported, spin forever
cmp x2, #ID_AA64MMFR0_TGRAN_SUPPORTED_MAX // same check against the upper bound:
b.gt __no_granule_support // unsupported value, spin forever
//only the 4KB granule supports 52-bit input/output addresses
update_early_cpu_boot_status 0, x2, x3 // update the early boot status for this CPU
adrp x2, idmap_pg_dir // x2 = physical address of the identity-map page global directory
phys_to_ttbr x1, x1
phys_to_ttbr x2, x2
msr ttbr0_el1, x2 // install the identity map (idmap_pg_dir) in ttbr0_el1
offset_ttbr1 x1, x3
msr ttbr1_el1, x1 // install the kernel image map (init_pg_dir) in ttbr1_el1
isb
msr sctlr_el1, x0 // write sctlr_el1: this is the moment the MMU is switched on
isb
/*
* Invalidate the local I-cache so that any instructions fetched
* speculatively from the PoC are discarded, since they may have
* been dynamically patched at the PoU.
*/
ic iallu // invalidate the I-cache
dsb nsh // memory barrier
isb
ret
SYM_FUNC_END(__enable_mmu)
__enable_mmu performs the following steps:
- read the memory model feature register and check that the CPU supports the page size the kernel was configured with (4K in our case)
- update the early CPU boot status
- set the ttbr0_el1 and ttbr1_el1 registers (see the sketch below)
- invalidate the I-cache and issue memory barriers
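At this point the two early page tables are live at the same time, one per TTBR. A conceptual sketch of which table serves which address once the MMU is on (the hardware selects by the top virtual address bits; the helper below is only illustrative):
static const char *which_ttbr(unsigned long long va)
{
	/* Bit 55 of the virtual address selects the table: clear -> TTBR0,
	 * set -> TTBR1 (addresses outside the TxSZ-sized windows fault). */
	return (va >> 55) & 1 ? "TTBR1_EL1 -> init_pg_dir (kernel image at KIMAGE_VADDR)"
			      : "TTBR0_EL1 -> idmap_pg_dir (identity map, VA == PA)";
}
This is also why the identity map exists at all: the instructions around "msr sctlr_el1, x0" are fetched from low (physical) addresses, so they keep working the instant translation turns on.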
6.2 __primary_switched
SYM_FUNC_START_LOCAL(__primary_switched)
adrp x4, init_thread_union // x4 = address of init_thread_union, the init task's kernel stack area
add sp, x4, #THREAD_SIZE // sp = init_thread_union + THREAD_SIZE (top of the init stack)
adr_l x5, init_task // x5 = address of init_task
msr sp_el0, x5 // the arm64 kernel keeps the current task pointer in sp_el0
#ifdef CONFIG_ARM64_PTR_AUTH
__ptrauth_keys_init_cpu x5, x6, x7, x8
#endif
adr_l x8, vectors // x8 = address of vectors
msr vbar_el1, x8 // install the exception vector table
isb
stp xzr, x30, [sp, #-16]! // push xzr and the return address held in x30 onto the stack
mov x29, sp // x29 (frame pointer) = sp
#ifdef CONFIG_SHADOW_CALL_STACK
adr_l scs_sp, init_shadow_call_stack // Set shadow call stack
#endif
str_l x21, __fdt_pointer, x5 // save the FDT address into the __fdt_pointer variable
ldr_l x4, kimage_vaddr // Save the offset between
sub x4, x4, x0 // the kernel virtual and
str_l x4, kimage_voffset, x5 // physical addresses in kimage_voffset
// Clear BSS
adr_l x0, __bss_start
mov x1, xzr
adr_l x2, __bss_stop
sub x2, x2, x0
bl __pi_memset // zero the .bss section
dsb ishst // Make zero page visible to PTW
#ifdef CONFIG_KASAN
bl kasan_early_init
#endif
#ifdef CONFIG_RANDOMIZE_BASE
tst x23, ~(MIN_KIMG_ALIGN - 1) // already running randomized?
b.ne 0f
mov x0, x21 // pass FDT address in x0
bl kaslr_early_init // parse FDT for KASLR options
cbz x0, 0f // KASLR disabled? just proceed
orr x23, x23, x0 // record KASLR offset
ldp x29, x30, [sp], #16 // we must enable KASLR, return
ret // to __primary_switch()
0:
#endif
add sp, sp, #16 // pop the frame record pushed earlier (sp += 16)
mov x29, #0
mov x30, #0
b start_kernel // jump to start_kernel
SYM_FUNC_END(__primary_switched)
__primary_switched performs the following steps:
- set up the init task's stack and task pointer
- install the exception vector table
- save the FDT address into the __fdt_pointer variable
- save the offset between the kernel image's virtual and physical addresses in kimage_voffset (used as shown below)
- zero the .bss section
- jump to start_kernel
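The kimage_voffset value saved above is how the kernel later converts kernel-image virtual addresses back to physical ones; a sketch of what the conversion boils down to (see the __pa_symbol()/memory.h machinery in the same tree for the real macros):
extern u64 kimage_voffset;   /* = kimage_vaddr (KIMAGE_VADDR + KASLR offset) - physical load address */

static inline u64 kimg_to_phys(u64 kernel_image_va)
{
	return kernel_image_va - kimage_voffset;   /* roughly what __pa_symbol() evaluates to */
}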
That concludes the boot path in head.S.