--------------------------
1. handle_page_fault函数处理:
- /*
- * Top-level page fault handling.
- * This is in assembler because if do_page_fault tells us that
- * it is a bad kernel page fault, we want to save the non-volatile
- * registers before calling bad_page_fault.
- */
- .globl handle_page_fault
- handle_page_fault:
- stw r4,_DAR(r1)
- addi r3,r1,STACK_FRAME_OVERHEAD
- bl do_page_fault //准备好调用函数的参数,r3=寄存器列表地址,r4=异常发生时地址=DEAR,r5:错误代码=ESR寄存器内容
- cmpwi r3,0
- beq+ ret_from_except //如果返回值为0,表示正常处理了该异常,正常返回
- SAVE_NVGPRS(r1)
- lwz r0,_TRAP(r1)
- clrrwi r0,r0,1
- stw r0,_TRAP(r1)
- mr r5,r3
- addi r3,r1,STACK_FRAME_OVERHEAD
- lwz r4,_DAR(r1)
- bl bad_page_fault //异常未被正常处理,调用此函数处理内核不能处理的异常,如果该函数也未在异常列表中找到该异常,则系统打印错误信息后死机,否则函数返回,继续异常返回操作
- b ret_from_except_full //与ret_from_except处理基本一致,除了保存几个寄存器
先解释一下COW(copy on write):即在子进程创建时,子进程的不创建地址空间,而是将父子进程的PTE表项设为只读,当对地址空间进行写入时,引发dtlb miss异常,在异常处理程序中判断地址空间的数据页面是否为只读,如果是,则表明是向只读空间写入,是错误行为。否则认为是COW行为,则会重新分配一个生理页面。这样做是为了提高效率。COW的路径为:
do_fork->copy_process->copy_mm->dup_mm->dup_mmap->copy_page_range->copy_pud_range->
copy_pmd_range->copy_pte_range->copy_one_pte.
- /*
- * For 600- and 800-family processors, the error_code parameter is DSISR
- * for a data fault, SRR1 for an instruction fault. For 400-family processors
- * the error_code parameter is ESR for a data fault, 0 for an instruction
- * fault.
- * For 64-bit processors, the error_code parameter is
- * - DSISR for a non-SLB data access fault,
- * - SRR1 & 0x08000000 for a non-SLB instruction access fault
- * - 0 any SLB fault.
- *
- * The return value is 0 if the fault was handled, or the signal
- * number if this is a kernel fault that can't be handled here.
- */
- int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
- unsigned long error_code)
- {
- struct vm_area_struct * vma;
- struct mm_struct *mm = current->mm;
- siginfo_t info;
- int code = SEGV_MAPERR;
- int is_write = 0, ret;
- int trap = TRAP(regs);
- int is_exec = trap == 0x400;
-
- #if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
- /*
- * Fortunately the bit assignments in SRR1 for an instruction
- * fault and DSISR for a data fault are mostly the same for the
- * bits we are interested in. But there are some bits which
- * indicate errors in DSISR but can validly be set in SRR1.
- */
- if (trap == 0x400)
- error_code &= 0x48200000;
- else
- is_write = error_code & DSISR_ISSTORE;
- #else
- is_write = error_code & ESR_DST;
- #endif /* CONFIG_4xx || CONFIG_BOOKE */
-
- if (notify_page_fault(regs)) //给kprobe用
- return 0;
-
- if (unlikely(debugger_fault_handler(regs))) //调试用
- return 0;
- /* SLB=segment lookaside buffer , e500没有这类寄存器 */
- /* On a kernel SLB miss we can only check for a valid exception entry */
- if (!user_mode(regs) && (address >= TASK_SIZE)) //如果在内核模式并且address也是内核地址,则出错。
- return SIGSEGV;
- #if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE) || \
- defined(CONFIG_PPC_BOOK3S_64))
- if (error_code & DSISR_DABRMATCH) {
- /* DABR match */
- do_dabr(regs, address, error_code);
- return 0;
- }
- #endif
-
- if (in_atomic() || mm == NULL) { //mm==null表明是内核进程
- if (!user_mode(regs)) //如果在内核态直接返回信号量
- return SIGSEGV;
- /* in_atomic() in user mode is really bad,
- as is current->mm == NULL. */
- printk(KERN_EMERG "Page fault in user mode with "
- "in_atomic() = %d mm = %p\n", in_atomic(), mm);
- printk(KERN_EMERG "NIP = %lx MSR = %lx\n",
- regs->nip, regs->msr);
- die("Weird page fault", regs, SIGSEGV); //如果在用户态,则挂起内核
- }
-
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
-
- /* When running in the kernel we expect faults to occur only to
- * addresses in user space. All other faults represent errors in the
- * kernel and should generate an OOPS. Unfortunately, in the case of an
- * erroneous fault occurring in a code path which already holds mmap_sem
- * we will deadlock attempting to validate the fault against the
- * address space. Luckily the kernel only validly references user
- * space from well defined areas of code, which are listed in the
- * exceptions table.
- *
- * As the vast majority of faults will be valid we will only perform
- * the source reference check when there is a possibility of a deadlock.
- * Attempt to lock the address space, if we cannot we then validate the
- * source. If this is invalid we can skip the address space check,
- * thus avoiding the deadlock.
- */
- if (!down_read_trylock(&mm->mmap_sem)) {
- if (!user_mode(regs) && !search_exception_tables(regs->nip))
- goto bad_area_nosemaphore;
-
- down_read(&mm->mmap_sem);
- }
-
- vma = find_vma(mm, address);
- if (!vma) //如果不能找到内在空间段,则到bad_area
- goto bad_area;
- if (vma->vm_start <= address)
- goto good_area; // 合法地址,继续
- if (!(vma->vm_flags & VM_GROWSDOWN))
- goto bad_area; //检查是否可以向前增长地址
-
- /*
- * N.B. The POWER/Open ABI allows programs to access up to
- * 288 bytes below the stack pointer.
- * The kernel signal delivery code writes up to about 1.5kB
- * below the stack pointer (r1) before decrementing it.
- * The exec code can write slightly over 640kB to the stack
- * before setting the user r1. Thus we allow the stack to
- * expand to 1MB without further checks.
- */
- if (address + 0x100000 < vma->vm_end) { //判断当前虚拟地址后是否有1M空间,以便扩展当前进程的栈段空间
- /* get user regs even if this fault is in kernel mode */
- struct pt_regs *uregs = current->thread.regs;
- if (uregs == NULL)
- goto bad_area;
-
- /*
- * A user-mode access to an address a long way below
- * the stack pointer is only valid if the instruction
- * is one which would update the stack pointer to the
- * address accessed if the instruction completed,
- * i.e. either stwu rs,n(r1) or stwux rs,r1,rb
- * (or the byte, halfword, float or double forms).
- *
- * If we don't check this then any write to the area
- * between the last mapped region and the stack will
- * expand the stack rather than segfaulting.
- */
- if (address + 2048 < uregs->gpr[1]
- && (!user_mode(regs) || !store_updates_sp(regs)))
- goto bad_area;
- }
- if (expand_stack(vma, address)) //扩展进程的栈段
- goto bad_area;
- good_area:
- code = SEGV_ACCERR;
- #if defined(CONFIG_6xx)
- if (error_code & 0x95700000)
- /* an error such as lwarx to I/O controller space,
- address matching DABR, eciwx, etc. */
- goto bad_area;
- #endif /* CONFIG_6xx */
- #if defined(CONFIG_8xx)
- /* 8xx sometimes need to load a invalid/non-present TLBs.
- * These must be invalidated separately as linux mm don't.
- */
- if (error_code & 0x40000000) /* no translation? */
- _tlbil_va(address, 0, 0, 0);
-
- /* The MPC8xx seems to always set 0x80000000, which is
- * "undefined". Of those that can be set, this is the only
- * one which seems bad.
- */
- if (error_code & 0x10000000)
- /* Guarded storage error. */
- goto bad_area;
- #endif /* CONFIG_8xx */
-
- if (is_exec) { //当异常是INSTRUCTION_STORAGE_EXCEPTION时,即指令存贮异常时执行的权限检查
- #ifdef CONFIG_PPC_STD_MMU
- /* Protection fault on exec go straight to failure on
- * Hash based MMUs as they either don't support per-page
- * execute permission, or if they do, it's handled already
- * at the hash level. This test would probably have to
- * be removed if we change the way this works to make hash
- * processors use the same I/D cache coherency mechanism
- * as embedded.
- */
- if (error_code & DSISR_PROTFAULT)
- goto bad_area;
- #endif /* CONFIG_PPC_STD_MMU */
-
- /*
- * Allow execution from readable areas if the MMU does not
- * provide separate controls over reading and executing.
- *
- * Note: That code used to not be enabled for 4xx/BookE.
- * It is now as I/D cache coherency for these is done at
- * set_pte_at() time and I see no reason why the test
- * below wouldn't be valid on those processors. This -may-
- * break programs compiled with a really old ABI though.
- */
- if (!(vma->vm_flags & VM_EXEC) &&
- (cpu_has_feature(CPU_FTR_NOEXECUTE) ||
- !(vma->vm_flags & (VM_READ | VM_WRITE))))
- goto bad_area;
- /* a write */
- } else if (is_write) { //写权限检查
- if (!(vma->vm_flags & VM_WRITE))
- goto bad_area;
- /* a read */
- } else { //读权限检查
- /* protection fault */
- if (error_code & 0x08000000)
- goto bad_area;
- if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
- goto bad_area;
- }
-
- /*
- * If for any reason at all we couldn't handle the fault,
- * make sure we exit gracefully rather than endlessly redo
- * the fault.
- */ //分配PTE表,并建立映射
- ret = handle_mm_fault(mm, vma, address, is_write ? FAULT_FLAG_WRITE : 0);
- if (unlikely(ret & VM_FAULT_ERROR)) {
- if (ret & VM_FAULT_OOM)
- goto out_of_memory;
- else if (ret & VM_FAULT_SIGBUS)
- goto do_sigbus; //错误原因:处理器总线不能正常访问address地址处的内存,通常未对齐的数据访问或者硬件错误会导致这类情况
- BUG();
- }
- if (ret & VM_FAULT_MAJOR) {
- current->maj_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
- regs, address);
- #ifdef CONFIG_PPC_SMLPAR
- if (firmware_has_feature(FW_FEATURE_CMO)) {
- preempt_disable();
- get_lppaca()->page_ins += (1 << PAGE_FACTOR);
- preempt_enable();
- }
- #endif
- } else {
- current->min_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
- regs, address);
- }
- up_read(&mm->mmap_sem);
- return 0;
-
- bad_area:
- up_read(&mm->mmap_sem);
-
- bad_area_nosemaphore:
- /* User mode accesses cause a SIGSEGV */
- if (user_mode(regs)) {
- _exception(SIGSEGV, regs, code, address); //用户模式下,系统挂起
- return 0;
- }
-
- if (is_exec && (error_code & DSISR_PROTFAULT))
- printk_ratelimited(KERN_CRIT "kernel tried to execute NX-protected"
- " page (%lx) - exploit attempt? (uid: %d)\n",
- address, current_uid());
-
- return SIGSEGV; //SIGSEGV原因:意味着访问了无效地址,没有物理地址与该地址对应
-
- /*
- * We ran out of memory, or some other thing happened to us that made
- * us unable to handle the page fault gracefully.
- */
- out_of_memory:
- up_read(&mm->mmap_sem);
- if (!user_mode(regs))
- return SIGKILL;
- pagefault_out_of_memory();
- return 0;
-
- do_sigbus:
- up_read(&mm->mmap_sem);
- if (user_mode(regs)) {
- info.si_signo = SIGBUS;
- info.si_errno = 0;
- info.si_code = BUS_ADRERR;
- info.si_addr = (void __user *)address;
- force_sig_info(SIGBUS, &info, current);
- return 0;
- }
- return SIGBUS;
- }
- .globl ret_from_except_full
- ret_from_except_full:
- REST_NVGPRS(r1)
- /* fall through */
-
- .globl ret_from_except
- ret_from_except:
- /* Hard-disable interrupts so that current_thread_info()->flags
- * can't change between when we test it and when we return
- * from the interrupt. */
- /* Note: We don't bother telling lockdep about it */
- LOAD_MSR_KERNEL(r10,MSR_KERNEL) //MSR_KERNEL->r10
- SYNC /* Some chip revs have problems here... */
- MTMSRD(r10) /* disable interrupts */
-
- lwz r3,_MSR(r1) /* Returning to user mode? */ //载入堆栈中的MSR寄存器,用来判断是否在内核态
- andi. r0,r3,MSR_PR //通过判断MSR_PR位来判断是否在内核态
- beq resume_kernel //内核态跳到resume_kernel处
-
- user_exc_return: /* r10 contains MSR_KERNEL here */ -------用户态恢复中断
- /* Check current_thread_info()->flags */
- rlwinm r9,r1,0,0,(31-THREAD_SHIFT)
- lwz r9,TI_FLAGS(r9)
- andi. r0,r9,_TIF_USER_WORK_MASK //判断当前进程是否需要重新调度及是否有未处理的信号事件,有则调用do_work去处理
- bne do_work
- do_work: /* r10 contains MSR_KERNEL here */
- andi. r0,r9,_TIF_NEED_RESCHED //判断是需要重新调度还是需要处理信号
- beq do_user_signal //如果需要处理信号,则到do_user_signal处
- do_resched: /* r10 contains MSR_KERNEL here */
/* Note: We don't need to inform lockdep that we are enabling
* interrupts here. As far as it knows, they are already enabled
*/
ori r10,r10,MSR_EE
SYNC
MTMSRD(r10) /* hard-enable interrupts */
bl schedule
recheck:
/* Note: And we don't tell it we are disabling them again
* neither. Those disable/enable cycles used to peek at
* TI_FLAGS aren't advertised.
*/
LOAD_MSR_KERNEL(r10,MSR_KERNEL)
SYNC
MTMSRD(r10) /* disable interrupts */
rlwinm r9,r1,0,0,(31-THREAD_SHIFT)
lwz r9,TI_FLAGS(r9)
andi. r0,r9,_TIF_NEED_RESCHED
bne- do_resched
andi. r0,r9,_TIF_USER_WORK_MASK
beq restore_user //一直处理到不需要重新调度为止,然后调用resotre_user恢复现场 -
- ----------------处理用户信号
do_user_signal: /* r10 contains MSR_KERNEL here */
ori r10,r10,MSR_EE
SYNC
MTMSRD(r10) /* hard-enable interrupts */
/* save r13-r31 in the exception frame, if not already done */
lwz r3,_TRAP(r1)
andi. r0,r3,1
beq 2f
SAVE_NVGPRS(r1)
rlwinm r3,r3,0,0,30
stw r3,_TRAP(r1)
2: addi r3,r1,STACK_FRAME_OVERHEAD
mr r4,r9
bl do_signal //信号处理,处理完之后再次检查是否需要调度
REST_NVGPRS(r1)
b recheck
-
- restore_user:
- #if defined(CONFIG_4xx) || defined(CONFIG_BOOKE)
- /* Check whether this process has its own DBCR0 value. The internal
- debug mode bit tells us that dbcr0 should be loaded. */
- lwz r0,THREAD+THREAD_DBCR0(r2)
- andis. r10,r0,DBCR0_IDM@h
- bnel- load_dbcr0
- #endif
- #ifdef CONFIG_PREEMPT ------------------定义抢占时的处理
b restore
/* N.B. the only way to get here is from the beq following ret_from_except. */
resume_kernel:
/* check current_thread_info->preempt_count */
rlwinm r9,r1,0,0,(31-THREAD_SHIFT) //获得thread_info指针,并放入r9
lwz r0,TI_PREEMPT(r9) //判断thread_info->preempt_count是否为0,不为0表示不可抢占
cmpwi 0,r0,0 /* if non-zero, just restore regs and return */
bne restore //不可抢占,直接调用restore恢复中断现场
lwz r0,TI_FLAGS(r9)
andi. r0,r0,_TIF_NEED_RESCHED //判断thread_info->flags,如果_TIF_NEED_RESCHED为1表示需要重新调度,即不恢复原来被中断的任务,而是选择一个新的任务继续运行。
beq+ restore //不需要调度,直接恢复中断现场
andi. r0,r3,MSR_EE /* interrupts off? */
beq restore /* don't schedule if so */ //因为ret_from_except会被异常和外部中断调用,当被异常调用时,EE为0,即不使能外部中断,此时需要立即返回,不进行任务调度。
#ifdef CONFIG_TRACE_IRQFLAGS
/* Lockdep thinks irqs are enabled, we need to call
* preempt_schedule_irq with IRQs off, so we inform lockdep
* now that we -did- turn them off already
*/
bl trace_hardirqs_off
#endif
1: bl preempt_schedule_irq
rlwinm r9,r1,0,0,(31-THREAD_SHIFT)
lwz r3,TI_FLAGS(r9)
andi. r0,r3,_TIF_NEED_RESCHED
bne- 1b //判断是否需要再次调用调度函数,因为当preempt_schedule_irq执行完后,可能有新的任务需要调度
#ifdef CONFIG_TRACE_IRQFLAGS
/* And now, to properly rebalance the above, we tell lockdep they
* are being turned back on, which will happen when we return
*/
bl trace_hardirqs_on
#endif
#else ------------------未定义抢占时的处理处理
resume_kernel:
#endif /* CONFIG_PREEMPT */ -
/* interrupts are hard-disabled at this point */
restore: ----------------------真正的中断恢复处理
#ifdef CONFIG_44x
BEGIN_MMU_FTR_SECTION
b 1f
END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_47x)
lis r4,icache_44x_need_flush@ha
lwz r5,icache_44x_need_flush@l(r4)
cmplwi cr0,r5,0
beq+ 1f
li r6,0
iccci r0,r0
stw r6,icache_44x_need_flush@l(r4)
1:
#endif /* CONFIG_44x */
lwz r9,_MSR(r1)
#ifdef CONFIG_TRACE_IRQFLAGS
/* Lockdep doesn't know about the fact that IRQs are temporarily turned
* off in this assembly code while peeking at TI_FLAGS() and such. However
* we need to inform it if the exception turned interrupts off, and we
* are about to trun them back on.
*
* The problem here sadly is that we don't know whether the exceptions was
* one that turned interrupts off or not. So we always tell lockdep about
* turning them on here when we go back to wherever we came from with EE
* on, even if that may meen some redudant calls being tracked. Maybe later
* we could encode what the exception did somewhere or test the exception
* type in the pt_regs but that sounds overkill
*/
andi. r10,r9,MSR_EE
beq 1f
/*
* Since the ftrace irqsoff latency trace checks CALLER_ADDR1,
* which is the stack frame here, we need to force a stack frame
* in case we came from user space.
*/
stwu r1,-32(r1)
mflr r0
stw r0,4(r1)
stwu r1,-32(r1)
bl trace_hardirqs_on
lwz r1,0(r1)
lwz r1,0(r1)
lwz r9,_MSR(r1)
1:
#endif /* CONFIG_TRACE_IRQFLAGS */
lwz r0,GPR0(r1)
lwz r2,GPR2(r1)
REST_4GPRS(3, r1)
REST_2GPRS(7, r1)
lwz r10,_XER(r1)
lwz r11,_CTR(r1)
mtspr SPRN_XER,r10
mtctr r11
PPC405_ERR77(0,r1)
BEGIN_FTR_SECTION
lwarx r11,0,r1
END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX)
stwcx. r0,0,r1 /* to clear the reservation */
#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
andi. r10,r9,MSR_RI /* check if this exception occurred */
beql nonrecoverable /* at a bad place (MSR:RI = 0) */
lwz r10,_CCR(r1)
lwz r11,_LINK(r1)
mtcrf 0xFF,r10
mtlr r11
/*
* Once we put values in SRR0 and SRR1, we are in a state
* where exceptions are not recoverable, since taking an
* exception will trash SRR0 and SRR1. Therefore we clear the
* MSR:RI bit to indicate this. If we do take an exception,
* we can't return to the point of the exception but we
* can restart the exception exit path at the label
* exc_exit_restart below. -- paulus
*/
LOAD_MSR_KERNEL(r10,MSR_KERNEL & ~MSR_RI)
SYNC
MTMSRD(r10) /* clear the RI bit */
.globl exc_exit_restart
exc_exit_restart:
lwz r12,_NIP(r1)
FIX_SRR1(r9,r10)
mtspr SPRN_SRR0,r12
mtspr SPRN_SRR1,r9
REST_4GPRS(9, r1)
lwz r1,GPR1(r1)
.globl exc_exit_restart_end
exc_exit_restart_end:
SYNC
RFI
#else /* !(CONFIG_4xx || CONFIG_BOOKE) */
/*
* This is a bit different on 4xx/Book-E because it doesn't have
* the RI bit in the MSR.
* The TLB miss handler checks if we have interrupted
* the exception exit path and restarts it if so
* (well maybe one day it will... :).
*/
lwz r11,_LINK(r1)
mtlr r11
lwz r10,_CCR(r1)
mtcrf 0xff,r10
REST_2GPRS(9, r1)
.globl exc_exit_restart
exc_exit_restart:
lwz r11,_NIP(r1)
lwz r12,_MSR(r1)
exc_exit_start:
mtspr SPRN_SRR0,r11
mtspr SPRN_SRR1,r12
REST_2GPRS(11, r1)
lwz r1,GPR1(r1)
.globl exc_exit_restart_end
exc_exit_restart_end:
PPC405_ERR77_SYNC
rfi
b . /* prevent prefetch past rfi */