i386 Linux内核进入保护模式引导流程

       在系统引导过程中,Bootloader将内核镜像加载到内存后,并将控制权转交给内核       ,通过长转移指令跳转到入口startup_32。

       实际上进入startup_32入口前,CPU已经处于了保护模式下的段式寻址方式。而CS寄存器已经设置成了

       Linux准备进入保护模式主要做了下面的几项初始化的工作:

  1. 设置ds,es,fs,gs等段寄存器(在进入startup_32入口前,CPU已经处于了保护模式下的段式寻址方式,cs寄存器在进入startup_32前就已经设置成了__KERNEL_CS)
  2. 初始化页目录项(再编译时已经初始化好了)
  3. 初始化页表
  4. 初始化cr3寄存器,使CR3寄存器指向初始化好的页面目录项首地址。
  5. 设置CR0寄存器,开启页面映射,自此CPU进入保护模式并开启页面映射。 

我们结合代码来看一下(整段代码)

/*
 *  linux/arch/i386/head.S -- the 32-bit startup code.
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  Enhanced CPU detection and feature setting code by Mike Jagdis
 *  and Martin Mares, November 1997.
 */

.text
#include <linux/config.h>
#include <linux/threads.h>
#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/desc.h>

#define OLD_CL_MAGIC_ADDR	0x90020
#define OLD_CL_MAGIC		0xA33F
#define OLD_CL_BASE_ADDR	0x90000
#define OLD_CL_OFFSET		0x90022
#define NEW_CL_POINTER		0x228	/* Relative to real mode data */

/*
 * References to members of the boot_cpu_data structure.
 */

#define CPU_PARAMS	SYMBOL_NAME(boot_cpu_data)
#define X86		CPU_PARAMS+0
#define X86_VENDOR	CPU_PARAMS+1
#define X86_MODEL	CPU_PARAMS+2
#define X86_MASK	CPU_PARAMS+3
#define X86_HARD_MATH	CPU_PARAMS+6
#define X86_CPUID	CPU_PARAMS+8
#define X86_CAPABILITY	CPU_PARAMS+12
#define X86_VENDOR_ID	CPU_PARAMS+16

/*
 * swapper_pg_dir is the main page directory, address 0x00101000
 *
 * On entry, %esi points to the real-mode code as a 32-bit pointer.
 */
ENTRY(stext)
ENTRY(_stext)
startup_32:                     /*Linux内核入口*/
/*
 * Set segments to known values
 */
	cld
	movl $(__KERNEL_DS),%eax
	movl %eax,%ds		/*将这几个段寄存器设置为__KERNEL_DS */
	movl %eax,%es
	movl %eax,%fs
	movl %eax,%gs
#ifdef CONFIG_SMP                /*SMP相关的代码可以先不用看*/
	orw %bx,%bx
	jz 1f

/*
 *	New page tables may be in 4Mbyte page mode and may
 *	be using the global pages. 
 *
 *	NOTE! If we are on a 486 we may have no cr4 at all!
 *	So we do not try to touch it unless we really have
 *	some bits in it to set.  This won't work if the BSP
 *	implements cr4 but this AP does not -- very unlikely
 *	but be warned!  The same applies to the pse feature
 *	if not equally supported. --macro
 *
 *	NOTE! We have to correct for the fact that we're
 *	not yet offset PAGE_OFFSET..
 */
#define cr4_bits mmu_cr4_features-__PAGE_OFFSET
	cmpl $0,cr4_bits
	je 3f
	movl %cr4,%eax		# Turn on paging options (PSE,PAE,..)
	orl cr4_bits,%eax
	movl %eax,%cr4
	jmp 3f
1:
#endif
/*
 * Initialize page tables            /*初始化页表*/
 */
	movl $pg0-__PAGE_OFFSET,%edi /* initialize page tables */
	movl $007,%eax		/* "007" doesn't mean with right to kill, but
				   PRESENT+RW+USER */
2:	stosl
	add $0x1000,%eax
	cmp $empty_zero_page-__PAGE_OFFSET,%edi
	jne 2b

/*
 * Enable paging
 */
3:
	movl $swapper_pg_dir-__PAGE_OFFSET,%eax
	movl %eax,%cr3		/* set the page table pointer.. */    /* 设置cr3寄存器 */
	movl %cr0,%eax 
	orl $0x80000000,%eax                                          /* 设置cr0寄存器 */    
	movl %eax,%cr0		/* ..and set paging (PG) bit */
	jmp 1f			/* flush the prefetch-queue */
1:
	movl $1f,%eax
	jmp *%eax		/* make sure eip is relocated */
1:
	/* Set up the stack pointer */
	lss stack_start,%esp

#ifdef CONFIG_SMP
	orw  %bx,%bx
	jz  1f				/* Initial CPU cleans BSS */
	pushl $0
	popfl
	jmp checkCPUtype
1:
#endif CONFIG_SMP

/*
 * Clear BSS first so that there are no surprises...
 * No need to cld as DF is already clear from cld above...
 */
	xorl %eax,%eax
	movl $ SYMBOL_NAME(__bss_start),%edi
	movl $ SYMBOL_NAME(_end),%ecx
	subl %edi,%ecx
	rep
	stosb

/*
 * start system 32-bit setup. We need to re-do some of the things done
 * in 16-bit mode for the "real" operations.
 */
	call setup_idt
/*
 * Initialize eflags.  Some BIOS's leave bits like NT set.  This would
 * confuse the debugger if this code is traced.
 * XXX - best to initialize before switching to protected mode.
 */
	pushl $0
	popfl
/*
 * Copy bootup parameters out of the way. First 2kB of
 * _empty_zero_page is for boot parameters, second 2kB
 * is for the command line.
 *
 * Note: %esi still has the pointer to the real-mode data.
 */
	movl $ SYMBOL_NAME(empty_zero_page),%edi
	movl $512,%ecx
	cld
	rep
	movsl
	xorl %eax,%eax
	movl $512,%ecx
	rep
	stosl
	movl SYMBOL_NAME(empty_zero_page)+NEW_CL_POINTER,%esi
	andl %esi,%esi
	jnz 2f			# New command line protocol
	cmpw $(OLD_CL_MAGIC),OLD_CL_MAGIC_ADDR
	jne 1f
	movzwl OLD_CL_OFFSET,%esi
	addl $(OLD_CL_BASE_ADDR),%esi
2:
	movl $ SYMBOL_NAME(empty_zero_page)+2048,%edi
	movl $512,%ecx
	rep
	movsl
1:
#ifdef CONFIG_SMP
checkCPUtype:
#endif

	movl $-1,X86_CPUID		#  -1 for no CPUID initially

/* check if it is 486 or 386. */
/*
 * XXX - this does a lot of unnecessary setup.  Alignment checks don't
 * apply at our cpl of 0 and the stack ought to be aligned already, and
 * we don't need to preserve eflags.
 */

	movl $3,X86		# at least 386
	pushfl			# push EFLAGS
	popl %eax		# get EFLAGS
	movl %eax,%ecx		# save original EFLAGS
	xorl $0x40000,%eax	# flip AC bit in EFLAGS
	pushl %eax		# copy to EFLAGS
	popfl			# set EFLAGS
	pushfl			# get new EFLAGS
	popl %eax		# put it in eax
	xorl %ecx,%eax		# change in flags
	andl $0x40000,%eax	# check if AC bit changed
	je is386

	movl $4,X86		# at least 486
	movl %ecx,%eax
	xorl $0x200000,%eax	# check ID flag
	pushl %eax
	popfl			# if we are on a straight 486DX, SX, or
	pushfl			# 487SX we can't change it
	popl %eax
	xorl %ecx,%eax
	pushl %ecx		# restore original EFLAGS
	popfl
	andl $0x200000,%eax
	je is486

	/* get vendor info */
	xorl %eax,%eax			# call CPUID with 0 -> return vendor ID
	cpuid
	movl %eax,X86_CPUID		# save CPUID level
	movl %ebx,X86_VENDOR_ID		# lo 4 chars
	movl %edx,X86_VENDOR_ID+4	# next 4 chars
	movl %ecx,X86_VENDOR_ID+8	# last 4 chars

	orl %eax,%eax			# do we have processor info as well?
	je is486

	movl $1,%eax		# Use the CPUID instruction to get CPU type
	cpuid
	movb %al,%cl		# save reg for future use
	andb $0x0f,%ah		# mask processor family
	movb %ah,X86
	andb $0xf0,%al		# mask model
	shrb $4,%al
	movb %al,X86_MODEL
	andb $0x0f,%cl		# mask mask revision
	movb %cl,X86_MASK
	movl %edx,X86_CAPABILITY

is486:
	movl %cr0,%eax		# 486 or better
	andl $0x80000011,%eax	# Save PG,PE,ET
	orl $0x50022,%eax	# set AM, WP, NE and MP
	jmp 2f

is386:	pushl %ecx		# restore original EFLAGS
	popfl
	movl %cr0,%eax		# 386
	andl $0x80000011,%eax	# Save PG,PE,ET
	orl $2,%eax		# set MP
2:	movl %eax,%cr0
	call check_x87
#ifdef CONFIG_SMP
	incb ready
#endif
	lgdt gdt_descr
	lidt idt_descr
	ljmp $(__KERNEL_CS),$1f
1:	movl $(__KERNEL_DS),%eax	# reload all the segment registers
	movl %eax,%ds		# after changing gdt.
	movl %eax,%es
	movl %eax,%fs
	movl %eax,%gs
#ifdef CONFIG_SMP
	movl $(__KERNEL_DS), %eax
	movl %eax,%ss		# Reload the stack pointer (segment only)
#else
	lss stack_start,%esp	# Load processor stack
#endif
	xorl %eax,%eax
	lldt %ax
	cld			# gcc2 wants the direction flag cleared at all times
#ifdef CONFIG_SMP
	movb ready, %cl	
	cmpb $1,%cl
	je 1f			# the first CPU calls start_kernel
				# all other CPUs call initialize_secondary
	call SYMBOL_NAME(initialize_secondary)
	jmp L6
1:
#endif
	call SYMBOL_NAME(start_kernel)
L6:
	jmp L6			# main should never return here, but
				# just in case, we know what happens.

#ifdef CONFIG_SMP
ready:	.byte 0
#endif

/*
 * We depend on ET to be correct. This checks for 287/387.
 */
check_x87:
	movb $0,X86_HARD_MATH
	clts
	fninit
	fstsw %ax
	cmpb $0,%al
	je 1f
	movl %cr0,%eax		/* no coprocessor: have to set bits */
	xorl $4,%eax		/* set EM */
	movl %eax,%cr0
	ret
	ALIGN
1:	movb $1,X86_HARD_MATH
	.byte 0xDB,0xE4		/* fsetpm for 287, ignored by 387 */
	ret

/*
 *  setup_idt
 *
 *  sets up a idt with 256 entries pointing to
 *  ignore_int, interrupt gates. It doesn't actually load
 *  idt - that can be done only after paging has been enabled
 *  and the kernel moved to PAGE_OFFSET. Interrupts
 *  are enabled elsewhere, when we can be relatively
 *  sure everything is ok.
 */
setup_idt:
	lea ignore_int,%edx
	movl $(__KERNEL_CS << 16),%eax
	movw %dx,%ax		/* selector = 0x0010 = cs */
	movw $0x8E00,%dx	/* interrupt gate - dpl=0, present */

	lea SYMBOL_NAME(idt_table),%edi
	mov $256,%ecx
rp_sidt:
	movl %eax,(%edi)
	movl %edx,4(%edi)
	addl $8,%edi
	dec %ecx
	jne rp_sidt
	ret

ENTRY(stack_start)
	.long SYMBOL_NAME(init_task_union)+8192
	.long __KERNEL_DS

/* This is the default interrupt "handler" :-) */
int_msg:
	.asciz "Unknown interrupt\n"
	ALIGN
ignore_int:
	cld
	pushl %eax
	pushl %ecx
	pushl %edx
	pushl %es
	pushl %ds
	movl $(__KERNEL_DS),%eax
	movl %eax,%ds
	movl %eax,%es
	pushl $int_msg
	call SYMBOL_NAME(printk)
	popl %eax
	popl %ds
	popl %es
	popl %edx
	popl %ecx
	popl %eax
	iret

/*
 * The interrupt descriptor table has room for 256 idt's,
 * the global descriptor table is dependent on the number
 * of tasks we can have..
 */
#define IDT_ENTRIES	256
#define GDT_ENTRIES	(__TSS(NR_CPUS))


.globl SYMBOL_NAME(idt)
.globl SYMBOL_NAME(gdt)

	ALIGN
	.word 0
idt_descr:
	.word IDT_ENTRIES*8-1		# idt contains 256 entries
SYMBOL_NAME(idt):
	.long SYMBOL_NAME(idt_table)

	.word 0
gdt_descr:
	.word GDT_ENTRIES*8-1
SYMBOL_NAME(gdt):
	.long SYMBOL_NAME(gdt_table)

/*
 * This is initialized to create an identity-mapping at 0-8M (for bootup
 * purposes) and another mapping of the 0-8M area at virtual address
 * PAGE_OFFSET.
 */
.org 0x1000
ENTRY(swapper_pg_dir)                                    /* 页面目录初始化 */
	.long 0x00102007
	.long 0x00103007
	.fill BOOT_USER_PGD_PTRS-2,4,0
	/* default: 766 entries */
	.long 0x00102007
	.long 0x00103007
	/* default: 254 entries */
	.fill BOOT_KERNEL_PGD_PTRS-2,4,0

/*
 * The page tables are initialized to only 8MB here - the final page
 * tables are set up later depending on memory size.
 */
.org 0x2000
ENTRY(pg0)

.org 0x3000
ENTRY(pg1)

/*
 * empty_zero_page must immediately follow the page tables ! (The
 * initialization loop counts until empty_zero_page)
 */

.org 0x4000
ENTRY(empty_zero_page)

.org 0x5000
ENTRY(empty_bad_page)

.org 0x6000
ENTRY(empty_bad_pte_table)

#if CONFIG_X86_PAE

 .org 0x7000
 ENTRY(empty_bad_pmd_table)

 .org 0x8000

#else

 .org 0x7000

#endif

/*
 * This starts the data section. Note that the above is all
 * in the text section because it has alignment requirements
 * that we cannot fulfill any other way.
 */
.data

ALIGN
/*
 * This contains typically 140 quadwords, depending on NR_CPUS.
 *
 * NOTE! Make sure the gdt descriptor in head.S matches this if you
 * change anything.
 */
ENTRY(gdt_table)                        /* gdt 表初始化 */
	.quad 0x0000000000000000	/* NULL descriptor */
	.quad 0x0000000000000000	/* not used */
	.quad 0x00cf9a000000ffff	/* 0x10 kernel 4GB code at 0x00000000 */
	.quad 0x00cf92000000ffff	/* 0x18 kernel 4GB data at 0x00000000 */
	.quad 0x00cffa000000ffff	/* 0x23 user   4GB code at 0x00000000 */
	.quad 0x00cff2000000ffff	/* 0x2b user   4GB data at 0x00000000 */
	.quad 0x0000000000000000	/* not used */
	.quad 0x0000000000000000	/* not used */
	/*
	 * The APM segments have byte granularity and their bases
	 * and limits are set at run time.
	 */
	.quad 0x0040920000000000	/* 0x40 APM set up for bad BIOS's */
	.quad 0x00409a0000000000	/* 0x48 APM CS    code */
	.quad 0x00009a0000000000	/* 0x50 APM CS 16 code (16 bit) */
	.quad 0x0040920000000000	/* 0x58 APM DS    data */
	.fill NR_CPUS*4,8,0		/* space for TSS's and LDT's */
		
/*
 * This is to aid debugging, the various locking macros will be putting
 * code fragments here.  When an oops occurs we'd rather know that it's
 * inside the .text.lock section rather than as some offset from whatever
 * function happens to be last in the .text segment.
 */
.section .text.lock
ENTRY(stext_lock)

这里先解释一下,内核镜像在编译时已经在所有符号地址上增加了一个偏移量0xC000000,并且内核镜像是加载到物理内存0x10 0000即1MB,也就是说startup_32实际上被加载到物理内存0x100000地址上。所以startup_32的虚拟地址为0xC0100000处,但是CPU在进入startup_32入口时,还是位于保护模式下的段式寻址方式,所以实际上此时的IP地址为0x100000 而不是0xC0100000,所以从代码中可以看到有很多类似的**地址-__PAGE_OFFSET,__PAGE_OFFSET为0xC0000000,这样做就是为了在段式寻址中可以找到正确的物理地址。如下图所示:

      

设置页表也只是初始化了2张页表,物理页面的基地址分别为0x0, 0x1000, 0x2000,依次类推,也就是物理页面中的0, 1, 2。 一个页面表大小为4K, 2张页表能存2K个表项(4K / 4 * 2),所以两张页表可以映射8M 的存储空间,这就是Linux内核对内存大小的最低限度要求。代码如下:

/*
 * Initialize page tables
 */
	movl $pg0-__PAGE_OFFSET,%edi /* initialize page tables */
	movl $007,%eax		/* "007" doesn't mean with right to kill, but
				   PRESENT+RW+USER */
2:	stosl
	add $0x1000,%eax
	cmp $empty_zero_page-__PAGE_OFFSET,%edi
	jne 2b

       下面我们再来看页面目录的初始化,页面目录的初始化,不是在内核运行时由代码初始化的,而是编译时决定的,在内核镜像加载时初始化好的。 实际上该段代码初始化了1024个目录项,其中BOOT_USER_PGD_PTRS为768代表用户空间的目录表项

,BOOT_KERNEL_PGD_PTRS为256,代表系统空间的目录表项,也就是说,共有4GB的虚拟内存空间,其中前0-3G代表用户空间,后3G-4G代表系统空间。如下图所示:

上面内容的代码:

.org 0x1000
ENTRY(swapper_pg_dir)
	.long 0x00102007
	.long 0x00103007
	.fill BOOT_USER_PGD_PTRS-2,4,0
	/* default: 766 entries */
	.long 0x00102007
	.long 0x00103007
	/* default: 254 entries */
	.fill BOOT_KERNEL_PGD_PTRS-2,4,0

这里还有一点需要注意,一共初始化2个系统空间目录项和2个用户空间目录项,但是他们具有相同的映射,都指向了pg0(0x2000)这块物理地址。这是为什么呢。我们知道现在的IP寄存器里面存的地址是按照保护模式下段式映射存的,所以一旦开启了页面映射后,IP寄存器还是指向低区,这样就会导致指令不能继续执行,所以现在将低区地址的映射映射到和高区相同的物理地址就解决了这个问题。

设置完相应表项后,就设置相应寄存器,如cr3,指向页面目录的首地址,并设置Cr0寄存器,开启保护模式页式映射。

3:
	movl $swapper_pg_dir-__PAGE_OFFSET,%eax
	movl %eax,%cr3		/* set the page table pointer.. */
	movl %cr0,%eax
	orl $0x80000000,%eax
	movl %eax,%cr0		/* ..and set paging (PG) bit */
	jmp 1f			/* flush the prefetch-queue */

 自此,Linux访问地址就不再需要使用 某地址-__PAGE_OFFSET, CPU也进入保护模式下的页式映射。

(内核源码为2.4.0, 参考《Linux内核源码情景分析》

 

附:进入startup_32的相关代码(不知道该代码是不是bootloader的,如果有知道的同学,希望可以回复评论,谢谢O(∩_∩)O):

/*
 *
 *	Trampoline.S	Derived from Setup.S by Linus Torvalds
 *
 *	4 Jan 1997 Michael Chastain: changed to gnu as.
 *
 *	Entry: CS:IP point to the start of our code, we are 
 *	in real mode with no stack, but the rest of the 
 *	trampoline page to make our stack and everything else
 *	is a mystery.
 *
 *	In fact we don't actually need a stack so we don't
 *	set one up.
 *
 *	We jump into the boot/compressed/head.S code. So you'd
 *	better be running a compressed kernel image or you
 *	won't get very far.
 *
 *	On entry to trampoline_data, the processor is in real mode
 *	with 16-bit addressing and 16-bit data.  CS has some value
 *	and IP is zero.  Thus, data addresses need to be absolute
 *	(no relocation) and are taken with regard to r_base.
 *
 *	If you work on this file, check the object module with objdump
 *	--full-contents --reloc to make sure there are no relocation
 *	entries except for the gdt one..
 */

#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/page.h>

.data

.code16

ENTRY(trampoline_data)
r_base = .

	mov	%cs, %ax	# Code and data in the same place
	mov	%ax, %ds

	mov	$1, %bx		# Flag an SMP trampoline
	cli			# We should be safe anyway

	movl	$0xA5A5A5A5, trampoline_data - r_base
				# write marker for master knows we're running

	lidt	idt_48 - r_base	# load idt with 0, 0			
	lgdt	gdt_48 - r_base	# load gdt with whatever is appropriate #设置gdt寄存器

	xor	%ax, %ax
	inc	%ax		# protected mode (PE) bit
	lmsw	%ax		# into protected mode
	jmp	flush_instr
flush_instr:
	ljmpl	$__KERNEL_CS, $0x00100000	#跳入startup_32入口地址
			# jump to startup_32 in arch/i386/kernel/head.S

idt_48:						#idt表项(没有)
	.word	0			# idt limit = 0
	.word	0, 0			# idt base = 0L

gdt_48:						#gdt表项
	.word	0x0800			# gdt limit = 2048, 256 GDT entries
	.long	gdt_table-__PAGE_OFFSET	# gdt base = gdt (first SMP CPU)

.globl SYMBOL_NAME(trampoline_end)
SYMBOL_NAME_LABEL(trampoline_end)

 

 

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值