虚拟地址转换为物理地址
// start.S
/*
 * Address translation helpers for code that runs before paging is enabled.
 *
 * PHYS_LOAD_ADDRESS  MEMBASE + KERNEL_LOAD_OFFSET
 * PHYS_ADDR_DELTA    (KERNEL_BASE + KERNEL_LOAD_OFFSET) - PHYS_LOAD_ADDRESS,
 *                    the constant distance subtracted by PHYS()
 * PHYS(x)            x - PHYS_ADDR_DELTA, i.e. the physical address that
 *                    corresponds to the link-time symbol address x
 *
 * NOTE(review): MEMBASE, KERNEL_BASE and KERNEL_LOAD_OFFSET are defined
 * outside this excerpt; their values are not visible here.
 */
#define PHYS_LOAD_ADDRESS (MEMBASE + KERNEL_LOAD_OFFSET)
#define PHYS_ADDR_DELTA (KERNEL_BASE + KERNEL_LOAD_OFFSET - PHYS_LOAD_ADDRESS)
#define PHYS(x) ((x) - PHYS_ADDR_DELTA)
PHYS(x) 将x转换为物理地址
内核启动
Multiboot头部结构
// start.S
/*
 * Boot entry point and Multiboot header.
 *
 * Everything here is assembled as 32-bit code into the ".text.boot" section.
 * _start immediately jumps over the header data to real_start, so the header
 * words are never executed.
 */
.section ".text.boot"
.code32
.global _start
_start:
jmp real_start
.align 8
/* flags for multiboot header */
#define MULTIBOOT_HEADER_FLAGS (MULTIBOOT_PAGE_ALIGN | MULTIBOOT_MEMORY_INFO | MULTIBOOT_AOUT_KLUDGE)
//MULTIBOOT_PAGE_ALIGN 0x00000001 MULTIBOOT_MEMORY_INFO 0x00000002 MULTIBOOT_AOUT_KLUDGE 0x00010000
.type multiboot_header,STT_OBJECT
multiboot_header:
/* magic */
.int MULTIBOOT_HEADER_MAGIC
/* flags */
.int MULTIBOOT_HEADER_FLAGS
/* checksum: negated sum, so magic + flags + checksum wraps to 0 */
.int -(MULTIBOOT_HEADER_MAGIC + MULTIBOOT_HEADER_FLAGS)
/* The five address fields below are all physical addresses (via PHYS()). */
/* header_addr: where this header itself is located */
.int PHYS(multiboot_header)
/* load_addr: start of the image, taken from _start */
.int PHYS(_start)
/* load_end_addr: taken from __data_end */
.int PHYS(__data_end)
/* bss_end_addr: taken from __bss_end */
.int PHYS(__bss_end)
/* entry_addr: execution begins at real_start */
.int PHYS(real_start)
刚启动时使用32位指令集。MULTIBOOT_HEADER_FLAGS用于指定操作系统镜像要求启动加载程序提供的功能,此处设置了:启动模块按4K字节边界对齐、multiboot_info中需要包含mem_*字段,以及使Multiboot头部偏移12-28处的地址字段有效。
图中代码在multiboot.h
图中代码在start.S
Multiboot header地址含义可以参考Multiboot技术文档3.1.3小节,Multiboot_info可参考3.3小节
启动时的寄存器状态
Multiboot协议规定,EAX = 0x2BADB002(魔数) 表明操作系统是被符合Multiboot的加载程序进行加载的,此外Multiboot协议规定,EBX必须包含Multiboot_info的32位物理地址。有关机器启动时的状态可参考文档3.2小节。
real_start
// start.S
/*
 * real_start: 32-bit entry reached from _start (paging still disabled, so
 * every memory reference goes through PHYS()).
 *
 * In:    %eax = boot loader magic
 *        %ebx = value stored into _multiboot_info when the magic matches
 * Out:   does not return; continues at .Lfarjump with CS = CODE_SELECTOR
 * Clobb: %ax, %esp, %ds/%es/%fs/%gs/%ss, GDTR
 */
real_start:
    /* only keep %ebx if we were started by a Multiboot-compliant loader */
    cmpl $MULTIBOOT_BOOTLOADER_MAGIC, %eax
    jne 0f
    movl %ebx, PHYS(_multiboot_info)
0:
    /* load our new gdt by physical pointer */
    lgdt PHYS(_gdtr_phys)

    /* load our data selectors; each segment register is loaded exactly once */
    movw $DATA_SELECTOR, %ax
    movw %ax, %ds
    movw %ax, %es
    movw %ax, %fs
    movw %ax, %gs
    movw %ax, %ss

    /* load initial stack pointer: 4096 bytes above the base of _kstack */
    movl $PHYS(_kstack + 4096), %esp

    /* far jump to load the CS from our GDT: lret pops the offset, then CS */
    pushl $CODE_SELECTOR
    pushl $PHYS(.Lfarjump)
    lret
在real_start开始部分,先检查EAX中的值是否等于Multiboot魔数,等于则将EBX的值加载到multiboot_info的物理地址,否则直接跳转到标号0处执行。
将全局描述符表gdt的物理地址加载到GDTR中,然后将段寄存器的值设置为DATA_SELECTOR = 0x10 = 0001 0000
段选择子
Requestor Privilege-Level (RPL)表示请求访问该段时所使用的特权级别(请求特权级),它不一定等于处理器当前运行的特权级别(CPL)
Table Indicator (TI)表示选择哪个描述符表,TI=0使用GDT,TI=1使用LDT
Selector Index Field(SI)表示索引
因此DATA_SELECTOR = 0x10 = 0001 0000
表示RPL=0,即最高权限;使用GDT,Index为2,即使用GDT中索引为2的段描述符(索引0为空描述符,因此它是第二个可用的段描述符)
图中代码在gdt.S
段寄存器设置好后,用一个4K的数组作为栈,数组末尾作为栈顶,然后将CODE_SELECTOR和.Lfarjump的物理地址压栈,再跳转到.Lfarjump处运行。
CODE_SELECTOR = 0x08 = 0000 1000
即选择GDT的第一个段描述符
初始化BSS段
//start.S
.Lfarjump:
/*
 * bss_setup: zero the bss section, one 32-bit word at a time.
 *
 * Uses the physical addresses of __bss_start / __bss_end because paging is
 * not enabled yet. The byte length is assumed to be a multiple of 4.
 *
 * Clobb: %edi (ends at the first word past the cleared range), %ecx, flags
 */
bss_setup:
    movl $PHYS(__bss_start), %edi /* starting address of the bss */
    movl $PHYS(__bss_end), %ecx /* find the length of the bss in bytes */
    subl %edi, %ecx
    shrl $2, %ecx /* convert to 32 bit words, since the bss is aligned anyway */
    /*
     * Guard the empty case: `loop` decrements before testing, so entering
     * the loop with %ecx == 0 would wrap and clear 2^32 words.
     * ZF here comes from the shrl result above.
     */
    jz .Lbss_done
2:
    movl $0, (%edi)
    addl $4, %edi
    loop 2b
.Lbss_done:
初始化BSS段,其中__bss_start、__bss_end在kernel.ld文件中定义,在链接阶段分配地址
页表转换设置
CR4、CR3、EFER寄存器设置
//start.S
/*
 * paging_setup: prepare the CPU for long mode before paging is switched on.
 * Three steps, in this order: set CR4 bit 5 (PAE), load CR3 with the
 * physical address of kernel_pml4, then set EFER_LME in the EFER MSR.
 * Clobbers %eax, %ecx and whatever rdmsr returns alongside %eax.
 */
paging_setup:
/* Preparing 64 bit paging. We will use 2MB pages covering 1GB
* for initial bootstrap, this page table will be 1 to 1.
*/
/* PAE bit must be enabled for 64 bit paging*/
mov %cr4, %eax
btsl $(5), %eax
mov %eax, %cr4
/* load the physical pointer to the top level page table */
movl $PHYS(kernel_pml4), %eax
mov %eax, %cr3
/* Set the long mode enable bit: read-modify-write of the MSR selected by %ecx */
movl $MSR_EFER ,%ecx
rdmsr
orl $EFER_LME,%eax
wrmsr
将CR4位5置1,启用PAE(Physical-Address Extensions),并将PML4的地址存储在CR3中,然后设置MSR_EFER寄存器,启用长模式。
//start.S
/* MSR number loaded into %ecx by paging_setup before rdmsr/wrmsr */
#define MSR_EFER 0xc0000080
/* bit 8 of the EFER value; OR-ed in by paging_setup to enable long mode */
#define EFER_LME 0x00000100
MSR_EFER = 0xc0000080
看似是一个宏定义,其实是EFER寄存器的地址(在AMD手册3.1.7中给出)
页表映射
//mmu.c
/*
 * Top level kernel page tables, initialized in start.S.
 * Each of the first four arrays has NO_OF_PT_ENTRIES entries and is aligned
 * to PAGE_SIZE.
 */
map_addr_t kernel_pml4[NO_OF_PT_ENTRIES] __ALIGNED(PAGE_SIZE);
map_addr_t kernel_pdp[NO_OF_PT_ENTRIES] __ALIGNED(PAGE_SIZE); /* temporary */
/* filled by start.S with 2MB large-page entries covering the first 1GB */
map_addr_t kernel_pte[NO_OF_PT_ENTRIES] __ALIGNED(PAGE_SIZE);
/* top level pdp needed to map the -512GB..0 space */
map_addr_t kernel_pdp_high[NO_OF_PT_ENTRIES] __ALIGNED(PAGE_SIZE);
/*
 * A big pile of page tables needed to map 64GB of memory into kernel space
 * using 2MB pages: (64GB / 2MB) = 32768 entries, one per 2MB page.
 * NOTE(review): unlike the tables above, this declaration carries no
 * __ALIGNED(PAGE_SIZE); start.S uses its address as a page-table base, so
 * confirm the alignment is guaranteed elsewhere.
 */
map_addr_t kernel_linear_map_pdp[(64ULL*GB) / (2*MB)];
kernel_pml4、kernel_pdp、kernel_pte都是一个4K大小的数组,kernel_linear_map_pdp是一个4*64K大小的数组,在这里的作用是作为64个4K的kernel_pte
//start.S
/*
 * Wire up the bootstrap page tables (paging is still off, so all table
 * addresses are physical). Only the low 32 bits of each 8-byte entry are
 * written in this section.
 * NOTE(review): this relies on the upper halves already being zero,
 * presumably from the bss clear -- confirm the tables live in .bss.
 */
/* Setting the First PML4E with a PDP table reference at index 0 */
movl $PHYS(kernel_pdp), %eax
orl $X86_KERNEL_PD_FLAGS, %eax
movl %eax, PHYS(kernel_pml4)
/* Setting the First PDPTE with a Page table reference at index 0 */
movl $PHYS(kernel_pte), %eax
orl $X86_KERNEL_PD_FLAGS, %eax
movl %eax, PHYS(kernel_pdp)
/* point the pml4e at the second high PDP (for -2GB mapping) at index 511 */
movl $PHYS(kernel_pdp_high), %eax
orl $X86_KERNEL_PD_FLAGS, %eax
movl %eax, PHYS(kernel_pml4 + 8*511)
/* point the second pdp at the same low level page table (entry index 510) */
movl $PHYS(kernel_pte), %eax
orl $X86_KERNEL_PD_FLAGS, %eax
movl %eax, PHYS(kernel_pdp_high + 8*510)
/* map the first 1GB in this table: 512 entries of 2MB each */
movl $PHYS(kernel_pte), %esi
movl $0x200, %ecx /* 512 entries */
xor %eax, %eax /* start off at address 0 */
0:
/* entry = (index << 21) | large-page flags; %eax = index, %esi = entry pointer */
mov %eax, %ebx
shll $21, %ebx
orl $X86_KERNEL_PD_LP_FLAGS, %ebx
movl %ebx, (%esi)
addl $8,%esi
inc %eax
loop 0b /* dec ecx and loop while > 0 */
使用的是2M的页表,实际上kernel_pte换成kernel_pde会更好,但只是个名字,并不影响实际运行。映射的结果如图:
/* set up a linear map of the first 64GB at 0xffffff8000000000 */
/* 32768 entries, matching the (64GB / 2MB) size of kernel_linear_map_pdp */
movl $PHYS(kernel_linear_map_pdp), %esi
movl $32768, %ecx
xor %eax, %eax
/* loop across these page tables, incrementing the address by 2MB */
/* %eax = 2MB page index; the 64-bit entry is (index << 21) | flags, split into two dwords */
0:
mov %eax, %ebx
shll $21, %ebx
orl $X86_KERNEL_PD_LP_FLAGS, %ebx # lower word of the entry
movl %ebx, (%esi)
mov %eax, %ebx
/* index >> 11 equals bits 32 and up of (index << 21) */
shrl $11, %ebx # upper word of the entry
movl %ebx, 4(%esi)
addl $8,%esi
inc %eax
loop 0b
/* point the high pdp at our linear mapping page tables */
/* 64 entries starting at index 0, each referencing the next 4096-byte slice of the table pile */
movl $PHYS(kernel_pdp_high), %esi
movl $64, %ecx
movl $PHYS(kernel_linear_map_pdp), %eax
orl $X86_KERNEL_PD_FLAGS, %eax
0:
movl %eax, (%esi)
add $8, %esi
addl $4096, %eax
loop 0b
/* Enabling Paging and from this point we are in 32 bit compatibility mode */
/* bit 31 of CR0 is the paging enable bit */
mov %cr0, %eax
btsl $(31), %eax
mov %eax, %cr0
1G内存映射完后,继续建立64G内存的映射关系,如下图所示,最后将CR0的31位置1,启动页表。
初始化IDT
/* Use a far jump to get into 64bit mode */
/* lret pops the offset first and then the selector, so CS becomes CODE_64_SELECTOR */
pushl $CODE_64_SELECTOR
pushl $PHYS(farjump64)
lret
.align 8
.code64
farjump64:
/* branch to our high address */
/* execution arrived here through the physical address; highaddr is the link-time address */
mov $highaddr, %rax
jmp *%rax
highaddr:
/* load the high kernel stack */
/* same _kstack + 4096 top as before, this time without PHYS() */
mov $(_kstack + 4096), %rsp
/* reload the gdtr */
lgdt _gdtr
/* set up the idt */
call setup_idt
/* call the main module */
call lk_main
/* reached only if lk_main returns: idle forever */
0: /* just sit around waiting for interrupts */
hlt /* interrupts will unhalt the processor */
pause
jmp 0b /* so jump back to halt to conserve power */
启动分页机制后,需要重新加载堆栈地址以及GDTR的虚拟地址,然后调用setup_idt初始化IDT。setup_idt在exceptions.S文件中。_isr是中断服务程序的起始地址,一共有256个中断向量,所以循环256次。下图中,两个nop的作用是为了保证每个中断服务程序的长度相等。
使用objdump查看反汇编结果可以看到,两个nop占用2个字节,pushq $0也占用2个字节,因为每个中断服务程序前后都使用.align 16进行了16字节对齐,所以得到的中断服务程序长度都是16B。
/* number of ISR stubs / IDT entries generated below */
#define NUM_INT 0x100
#define NUM_EXC 0x14
.text
/*
 * _isr: table of NUM_INT interrupt entry stubs, generated with .rept.
 * Every stub leaves two quadwords on the stack (error code slot, then the
 * interrupt number) and jumps to interrupt_common.
 * For vectors 8, 10-14 and 17 the stub pushes no error code of its own and
 * uses two nops in place of the `pushq $0`.
 */
_isr:
.set i, 0
.rept NUM_INT
100: /* unnamed label for start of isr stub */
.if i == 8 || (i >= 10 && i <= 14) || i == 17
.align 16
nop /* error code pushed by exception */
nop /* 2 nops are the same length as push byte */
pushq $i /* interrupt number */
jmp interrupt_common
.align 16
.else
.align 16
pushq $0 /* fill in error code in iframe */
pushq $i /* interrupt number */
jmp interrupt_common
.align 16
.endif
.set i, i + 1
.endr
/*
 * figure out the length of a single isr stub, measured from the last `100:`
 * label; the .align 16 directives pad every stub to a 16-byte boundary, so
 * this is a multiple of 16
 */
.set isr_stub_len, . - 100b
/* annoying, but force AS to use the same (longer) encoding of jmp for all of the stubs */
.fill 256
/*
 * interrupt_common: shared tail of every ISR stub.
 *
 * On entry the top of the stack holds the interrupt number pushed by the
 * stub, then the error code slot, then the frame consumed by iretq.
 * The 15 general purpose registers are pushed on top of that, and the
 * resulting stack pointer is handed to x86_exception_handler in %rdi as the
 * iframe. From the lowest address, the iframe reads: rdi, rsi, rbp, rbx,
 * rdx, rcx, rax, r8..r15, interrupt number, error code.
 * The registers are then restored in reverse order, the two extra quadwords
 * are dropped, and iretq returns to the interrupted code.
 */
interrupt_common:
/* clear the direction bit */
cld
/* save general purpose registers */
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %r11
pushq %r10
pushq %r9
pushq %r8
pushq %rax
pushq %rcx
pushq %rdx
pushq %rbx
pushq %rbp
pushq %rsi
pushq %rdi
/* pass the iframe using rdi */
movq %rsp, %rdi
call x86_exception_handler
/* restore general purpose registers (exact reverse of the push order) */
popq %rdi
popq %rsi
popq %rbp
popq %rbx
popq %rdx
popq %rcx
popq %rax
popq %r8
popq %r9
popq %r10
popq %r11
popq %r12
popq %r13
popq %r14
popq %r15
/* drop vector number and error code*/
addq $16, %rsp
iretq
在将通用寄存器压栈前,硬件已经将SS、RSP、RFLAGS、CS、RIP压栈;将通用寄存器压栈后,调用x86_exception_handler,中断处理程序执行完后,按照相反的顺序弹出通用寄存器。最后将rsp加16,是因为在jmp interrupt_common之前,错误码(由硬件或stub压入)和中断向量号i已经被压栈,为了让rsp指向返回地址RIP,需要将rsp加16。
/*
 * setup_idt: fill in the handler offset of every IDT entry and load the IDTR.
 *
 * Entry n of _idt receives the address _isr + n * isr_stub_len, split across
 * the three offset fields of a 16-byte gate descriptor. The other descriptor
 * fields are not touched here.
 *
 * In:    none
 * Out:   none (IDTR loaded from _idtr)
 * Clobb: %rax, %rcx, %rsi, %rdi, flags -- volatile registers only, so the
 *        callee-saved %rbx is left untouched
 */
FUNCTION(setup_idt)
    /* setup isr stub descriptors in the idt */
    mov $_isr, %rsi             /* rsi = address of the current ISR stub */
    mov $_idt, %rdi             /* rdi = address of the current IDT entry */
    movl $NUM_INT, %ecx         /* ecx = entries left to fill */
.Lloop:
    mov %rsi, %rax              /* rax = scratch copy of the stub address */
    movw %ax, (%rdi)            /* offset [0:15] in IDT(n).low */
    shr $16, %rax
    movw %ax, 6(%rdi)           /* offset [16:31] in IDT(n).high */
    shr $16, %rax
    movl %eax, 8(%rdi)          /* offset [32:63] */
    add $isr_stub_len, %rsi     /* index the next ISR stub */
    add $16, %rdi               /* index the next IDT entry */
    loop .Lloop
    lidt _idtr
    ret
在setup_idt中,循环256次,每次循环,将中断服务程序的地址填入IDT的门描述符中,最后使用lidt将IDT表的地址载入IDTR中,下图为长模式下门描述符格式。
.align 8
/*
 * _idtr: operand for lidt -- a 16-bit limit followed by the 64-bit base
 * address of _idt.
 * NOTE(review): _idt_end is defined outside this excerpt.
 */
DATA(_idtr)
.short _idt_end - _idt - 1 /* IDT limit */
.quad _idt
/* NOTE(review): 8 bytes of fill after the descriptor; purpose not evident from this excerpt */
.fill 8
.align 8
/* interrupt descriptor table (IDT) */
/*
 * NUM_INT gate descriptors of 16 bytes each. The three offset fields are
 * assembled as zero here and written at run time by setup_idt.
 */
DATA(_idt)
.set i, 0
.rept NUM_INT
.short 0 /* low 16 bits of ISR offset (_isr#i & 0FFFFh) */
.short CODE_64_SELECTOR /* selector */
.byte 0
.byte 0x8e /* present, ring 0, 64-bit interrupt gate */
.short 0 /* high 16 bits of ISR offset (_isr#i / 65536) */
.short 0 /* ISR offset [32:47] */
.short 0 /* ISR offset [48:63] */
.short 0 /* 32bits Reserved (low half) */
.short 0 /* 32bits Reserved (high half) */
.set i, i + 1
.endr
_idtr中存储了_idt的长度以及_idt的起始地址,在_idt中,循环256次,对IDT表进行初始化。