E500 TLB miss 及 DSI处理分析(2)

最新推荐文章于 2023-09-11 21:32:02 发布

weixin_34378969

最新推荐文章于 2023-09-11 21:32:02 发布

阅读量258

点赞数

文章标签： python 操作系统

原文链接：https://my.oschina.net/mavericsoung/blog/133100

版权

2019独角兽企业重金招聘Python工程师标准>>>

based on kernel 3.0.18
--------------------------
1. handle_page_fault函数处理：

/*
* Top-level page fault handling.
* This is in assembler because if do_page_fault tells us that
* it is a bad kernel page fault, we want to save the non-volatile
* registers before calling bad_page_fault.
*/
.globl handle_page_fault
handle_page_fault:
stw r4,_DAR(r1) // store the faulting address (r4) in the exception frame's _DAR slot
addi r3,r1,STACK_FRAME_OVERHEAD // r3 = first argument of do_page_fault (regs)
bl do_page_fault // call do_page_fault(regs, address, error_code): r3 = regs; r4 (address) and r5 (error code) are assumed to have been set up by the exception entry code, e.g. from DEAR/ESR on Book-E -- TODO confirm against the caller
cmpwi r3,0
beq+ ret_from_except // return value 0: the fault was handled, take the normal exception return path
SAVE_NVGPRS(r1) // not handled: save the non-volatile registers before reporting (see header comment)
lwz r0,_TRAP(r1)
clrrwi r0,r0,1 // clear the low bit of the saved trap value; do_user_signal treats that bit as "non-volatile GPRs not saved yet"
stw r0,_TRAP(r1)
mr r5,r3 // third argument = value returned by do_page_fault (a signal number)
addi r3,r1,STACK_FRAME_OVERHEAD // first argument = regs
lwz r4,_DAR(r1) // second argument = faulting address saved above
bl bad_page_fault // the fault could not be handled: report it via bad_page_fault(regs, address, sig). Presumably this looks for an exception-table fixup and oopses if none exists -- not visible in this listing; if it returns, fall into the exception return below
b ret_from_except_full // same as ret_from_except, but first restores the non-volatile GPRs (REST_NVGPRS)

2. do_page_fault函数处理流程：
先解释一下COW（copy on write）：创建子进程时，并不为子进程复制整个地址空间，而是让父子进程共享原有的物理页面，并将父子进程对应的PTE表项都设为只读。之后当任何一方对该地址空间进行写入时，会引发存储访问异常（DTLB miss/DSI）。在异常处理程序中判断该地址所在的内存区域（VMA）是否允许写入：如果区域本身不可写，则表明是向只读空间写入，是错误行为；否则认为是COW行为，会重新分配一个物理页面并复制原页面的内容。这样做是为了提高创建进程的效率。COW的路径为：
do_fork->copy_process->copy_mm->dup_mm->dup_mmap->copy_page_range->copy_pud_range->
copy_pmd_range->copy_pte_range->copy_one_pte.

/*
* Handle an MMU page fault (data or instruction) for the current task.
*
* regs:       saved register frame of the faulting context
* address:    the faulting address
* error_code: fault status bits, as described below
*
* For 600- and 800-family processors, the error_code parameter is DSISR
* for a data fault, SRR1 for an instruction fault. For 400-family processors
* the error_code parameter is ESR for a data fault, 0 for an instruction
* fault.
* For 64-bit processors, the error_code parameter is
* - DSISR for a non-SLB data access fault,
* - SRR1 & 0x08000000 for a non-SLB instruction access fault
* - 0 any SLB fault.
*
* The return value is 0 if the fault was handled, or the signal
* number if this is a kernel fault that can't be handled here.
*/
int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
unsigned long error_code)
{
struct vm_area_struct * vma;
struct mm_struct *mm = current->mm;
siginfo_t info;
int code = SEGV_MAPERR; /* default si_code; switched to SEGV_ACCERR once a vma is found */
int is_write = 0, ret;
int trap = TRAP(regs);
int is_exec = trap == 0x400; /* trap 0x400 is treated as an instruction fault */
#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
/*
* Fortunately the bit assignments in SRR1 for an instruction
* fault and DSISR for a data fault are mostly the same for the
* bits we are interested in. But there are some bits which
* indicate errors in DSISR but can validly be set in SRR1.
*/
if (trap == 0x400)
error_code &= 0x48200000;
else
is_write = error_code & DSISR_ISSTORE;
#else
is_write = error_code & ESR_DST; /* 4xx/Book-E: the store bit comes from ESR */
#endif /* CONFIG_4xx || CONFIG_BOOKE */
if (notify_page_fault(regs)) /* presumably the kprobes hook; non-zero means the fault was consumed */
return 0;
if (unlikely(debugger_fault_handler(regs))) /* debugger hook; non-zero means the fault was consumed */
return 0;
/* SLB = segment lookaside buffer; per the article, e500 has no such registers */
/* On a kernel SLB miss we can only check for a valid exception entry */
if (!user_mode(regs) && (address >= TASK_SIZE)) /* kernel mode and the address is not below TASK_SIZE: report an error */
return SIGSEGV;
#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE) || \
defined(CONFIG_PPC_BOOK3S_64))
if (error_code & DSISR_DABRMATCH) {
/* DABR match */
do_dabr(regs, address, error_code);
return 0;
}
#endif
if (in_atomic() || mm == NULL) { /* mm == NULL: presumably a kernel thread (no user address space) */
if (!user_mode(regs)) /* kernel mode: just return the signal number to the caller */
return SIGSEGV;
/* in_atomic() in user mode is really bad,
as is current->mm == NULL. */
printk(KERN_EMERG "Page fault in user mode with "
"in_atomic() = %d mm = %p\n", in_atomic(), mm);
printk(KERN_EMERG "NIP = %lx MSR = %lx\n",
regs->nip, regs->msr);
die("Weird page fault", regs, SIGSEGV); /* user mode in this state is treated as fatal: call die() */
}
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
/* When running in the kernel we expect faults to occur only to
* addresses in user space. All other faults represent errors in the
* kernel and should generate an OOPS. Unfortunately, in the case of an
* erroneous fault occurring in a code path which already holds mmap_sem
* we will deadlock attempting to validate the fault against the
* address space. Luckily the kernel only validly references user
* space from well defined areas of code, which are listed in the
* exceptions table.
*
* As the vast majority of faults will be valid we will only perform
* the source reference check when there is a possibility of a deadlock.
* Attempt to lock the address space, if we cannot we then validate the
* source. If this is invalid we can skip the address space check,
* thus avoiding the deadlock.
*/
if (!down_read_trylock(&mm->mmap_sem)) {
if (!user_mode(regs) && !search_exception_tables(regs->nip))
goto bad_area_nosemaphore;
down_read(&mm->mmap_sem);
}
vma = find_vma(mm, address);
if (!vma) /* no memory region found for this address */
goto bad_area;
if (vma->vm_start <= address)
goto good_area; /* the region starts at or below the address: treat it as a valid address and continue */
if (!(vma->vm_flags & VM_GROWSDOWN))
goto bad_area; /* the address is below the region and the region cannot grow downwards */
/*
* N.B. The POWER/Open ABI allows programs to access up to
* 288 bytes below the stack pointer.
* The kernel signal delivery code writes up to about 1.5kB
* below the stack pointer (r1) before decrementing it.
* The exec code can write slightly over 640kB to the stack
* before setting the user r1. Thus we allow the stack to
* expand to 1MB without further checks.
*/
if (address + 0x100000 < vma->vm_end) { /* the address is more than 1MB below the end of the region: apply the extra checks below */
/* get user regs even if this fault is in kernel mode */
struct pt_regs *uregs = current->thread.regs;
if (uregs == NULL)
goto bad_area;
/*
* A user-mode access to an address a long way below
* the stack pointer is only valid if the instruction
* is one which would update the stack pointer to the
* address accessed if the instruction completed,
* i.e. either stwu rs,n(r1) or stwux rs,r1,rb
* (or the byte, halfword, float or double forms).
*
* If we don't check this then any write to the area
* between the last mapped region and the stack will
* expand the stack rather than segfaulting.
*/
if (address + 2048 < uregs->gpr[1]
&& (!user_mode(regs) || !store_updates_sp(regs)))
goto bad_area;
}
if (expand_stack(vma, address)) /* grow the stack region to cover the address; non-zero means failure */
goto bad_area;
good_area:
code = SEGV_ACCERR; /* from here on a failure is a permission error, not a missing mapping */
#if defined(CONFIG_6xx)
if (error_code & 0x95700000)
/* an error such as lwarx to I/O controller space,
address matching DABR, eciwx, etc. */
goto bad_area;
#endif /* CONFIG_6xx */
#if defined(CONFIG_8xx)
/* 8xx sometimes need to load a invalid/non-present TLBs.
* These must be invalidated separately as linux mm don't.
*/
if (error_code & 0x40000000) /* no translation? */
_tlbil_va(address, 0, 0, 0);
/* The MPC8xx seems to always set 0x80000000, which is
* "undefined". Of those that can be set, this is the only
* one which seems bad.
*/
if (error_code & 0x10000000)
/* Guarded storage error. */
goto bad_area;
#endif /* CONFIG_8xx */
if (is_exec) { /* instruction fault (trap 0x400): check execute permission */
#ifdef CONFIG_PPC_STD_MMU
/* Protection fault on exec go straight to failure on
* Hash based MMUs as they either don't support per-page
* execute permission, or if they do, it's handled already
* at the hash level. This test would probably have to
* be removed if we change the way this works to make hash
* processors use the same I/D cache coherency mechanism
* as embedded.
*/
if (error_code & DSISR_PROTFAULT)
goto bad_area;
#endif /* CONFIG_PPC_STD_MMU */
/*
* Allow execution from readable areas if the MMU does not
* provide separate controls over reading and executing.
*
* Note: That code used to not be enabled for 4xx/BookE.
* It is now as I/D cache coherency for these is done at
* set_pte_at() time and I see no reason why the test
* below wouldn't be valid on those processors. This -may-
* break programs compiled with a really old ABI though.
*/
if (!(vma->vm_flags & VM_EXEC) &&
(cpu_has_feature(CPU_FTR_NOEXECUTE) ||
!(vma->vm_flags & (VM_READ | VM_WRITE))))
goto bad_area;
/* a write */
} else if (is_write) { /* store fault: the region must be writable */
if (!(vma->vm_flags & VM_WRITE))
goto bad_area;
/* a read */
} else { /* load fault: check read permission */
/* protection fault */
if (error_code & 0x08000000)
goto bad_area;
if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
goto bad_area;
}
/*
* If for any reason at all we couldn't handle the fault,
* make sure we exit gracefully rather than endlessly redo
* the fault.
*/
/* Let the generic mm code resolve the fault; presumably this is where the
* page table entry is allocated and the mapping is established. The write
* flag is passed only for store faults.
*/
ret = handle_mm_fault(mm, vma, address, is_write ? FAULT_FLAG_WRITE : 0);
if (unlikely(ret & VM_FAULT_ERROR)) {
if (ret & VM_FAULT_OOM)
goto out_of_memory;
else if (ret & VM_FAULT_SIGBUS)
goto do_sigbus; /* the access could not be completed; reported as SIGBUS/BUS_ADRERR below */
BUG();
}
if (ret & VM_FAULT_MAJOR) {
current->maj_flt++;
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
regs, address);
#ifdef CONFIG_PPC_SMLPAR
if (firmware_has_feature(FW_FEATURE_CMO)) {
preempt_disable();
get_lppaca()->page_ins += (1 << PAGE_FACTOR);
preempt_enable();
}
#endif
} else {
current->min_flt++;
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
regs, address);
}
up_read(&mm->mmap_sem);
return 0;
bad_area:
up_read(&mm->mmap_sem);
bad_area_nosemaphore:
/* User mode accesses cause a SIGSEGV */
if (user_mode(regs)) {
_exception(SIGSEGV, regs, code, address); /* user mode: raise SIGSEGV for the task (si_code = code); the fault then counts as handled */
return 0;
}
if (is_exec && (error_code & DSISR_PROTFAULT))
printk_ratelimited(KERN_CRIT "kernel tried to execute NX-protected"
" page (%lx) - exploit attempt? (uid: %d)\n",
address, current_uid());
return SIGSEGV; /* kernel-mode fault on an invalid address: return the signal number so the caller can report it */
/*
* We ran out of memory, or some other thing happened to us that made
* us unable to handle the page fault gracefully.
*/
out_of_memory:
up_read(&mm->mmap_sem);
if (!user_mode(regs))
return SIGKILL;
pagefault_out_of_memory();
return 0;
do_sigbus:
up_read(&mm->mmap_sem);
if (user_mode(regs)) {
/* user mode: deliver SIGBUS with the faulting address to the current task */
info.si_signo = SIGBUS;
info.si_errno = 0;
info.si_code = BUS_ADRERR;
info.si_addr = (void __user *)address;
force_sig_info(SIGBUS, &info, current);
return 0;
}
return SIGBUS; /* kernel mode: return the signal number to the caller */
}

3. ret_from_except函数处理：

/*
 * Common exception return path.
 * ret_from_except_full first restores the non-volatile GPRs and then
 * falls through into ret_from_except.
 */
.globl ret_from_except_full
ret_from_except_full:
REST_NVGPRS(r1)
/* fall through */
.globl ret_from_except
ret_from_except:
/* Hard-disable interrupts so that current_thread_info()->flags
* can't change between when we test it and when we return
* from the interrupt. */
/* Note: We don't bother telling lockdep about it */
LOAD_MSR_KERNEL(r10,MSR_KERNEL) /* r10 = MSR_KERNEL */
SYNC /* Some chip revs have problems here... */
MTMSRD(r10) /* disable interrupts */
lwz r3,_MSR(r1) /* Returning to user mode? r3 = MSR saved in the exception frame */
andi. r0,r3,MSR_PR /* MSR_PR clear in the saved MSR means the exception came from kernel mode */
beq resume_kernel /* kernel mode: continue at resume_kernel */
/* Returning to user mode. */
user_exc_return: /* r10 contains MSR_KERNEL here */
/* Check current_thread_info()->flags */
rlwinm r9,r1,0,0,(31-THREAD_SHIFT) /* r9 = r1 with the low THREAD_SHIFT bits cleared */
lwz r9,TI_FLAGS(r9)
andi. r0,r9,_TIF_USER_WORK_MASK /* any pending user work (do_work below tests for reschedule vs. signal)? */
bne do_work /* yes: handle it in do_work */
/*
 * Pending user work before returning to user mode: either reschedule
 * or handle signals, then re-check the flags.
 *
 * NOTE(review): in this listing do_work directly follows "bne do_work",
 * so the not-taken path would fall into it. The listing appears to have
 * been rearranged for explanation -- verify the placement against the
 * kernel's entry_32.S.
 */
do_work: /* r10 contains MSR_KERNEL here */
    andi. r0,r9,_TIF_NEED_RESCHED /* reschedule requested, or something else (signal work)? */
    beq do_user_signal /* no reschedule needed: go handle signals */
do_resched:         /* r10 contains MSR_KERNEL here */
    /* Note: We don't need to inform lockdep that we are enabling
     * interrupts here. As far as it knows, they are already enabled
     */
    ori r10,r10,MSR_EE
    SYNC
    MTMSRD(r10)     /* hard-enable interrupts */
    bl schedule
recheck:
    /* Note: And we don't tell it we are disabling them again
     * neither. Those disable/enable cycles used to peek at
     * TI_FLAGS aren't advertised.
     */
    LOAD_MSR_KERNEL(r10,MSR_KERNEL)
    SYNC
    MTMSRD(r10)     /* disable interrupts */
    rlwinm r9,r1,0,0,(31-THREAD_SHIFT)
    lwz r9,TI_FLAGS(r9)
    andi.   r0,r9,_TIF_NEED_RESCHED
    bne-    do_resched /* keep rescheduling until the flag is clear */
    andi.   r0,r9,_TIF_USER_WORK_MASK
    beq restore_user /* no work left: restore the user context; otherwise fall into do_user_signal */
/* Handle pending user signals. */
do_user_signal:         /* r10 contains MSR_KERNEL here */
    ori r10,r10,MSR_EE
    SYNC
    MTMSRD(r10)     /* hard-enable interrupts */
    /* save r13-r31 in the exception frame, if not already done */
    lwz r3,_TRAP(r1)
    andi.   r0,r3,1 /* low bit of _TRAP set: the non-volatile GPRs have not been saved yet */
    beq 2f
    SAVE_NVGPRS(r1)
    rlwinm r3,r3,0,0,30 /* clear that low bit now that they are saved */
    stw r3,_TRAP(r1)
2:  addi    r3,r1,STACK_FRAME_OVERHEAD
    mr r4,r9 /* second argument: the thread flags read above */
    bl do_signal /* handle the signal(s); afterwards re-check whether more work is pending */
    REST_NVGPRS(r1)
    b   recheck
/*
 * restore_user: last step before returning to user mode.
 * resume_kernel: return to kernel mode; with CONFIG_PREEMPT it may call
 * preempt_schedule_irq first, otherwise it is just a label in front of
 * restore.
 */
restore_user:
#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE)
/* Check whether this process has its own DBCR0 value. The internal
debug mode bit tells us that dbcr0 should be loaded. */
lwz r0,THREAD+THREAD_DBCR0(r2)
andis. r10,r0,DBCR0_IDM@h
bnel- load_dbcr0
#endif
#ifdef CONFIG_PREEMPT /* kernel preemption enabled */
    b   restore

/* N.B. the only way to get here is from the beq following ret_from_except. */
resume_kernel:
    /* check current_thread_info->preempt_count */
    rlwinm r9,r1,0,0,(31-THREAD_SHIFT) /* r9 = thread_info pointer, derived from the stack pointer */
    lwz r0,TI_PREEMPT(r9) /* r0 = preempt_count; non-zero means preemption is not allowed */
    cmpwi   0,r0,0      /* if non-zero, just restore regs and return */
    bne restore /* not preemptible: go straight to restore */
    lwz r0,TI_FLAGS(r9)
    andi.   r0,r0,_TIF_NEED_RESCHED /* is a reschedule pending, i.e. should another task run instead of the interrupted one? */
    beq+    restore /* no reschedule needed: restore the interrupted context */
    andi.   r0,r3,MSR_EE    /* interrupts off? (r3 still holds the saved MSR loaded in ret_from_except) */
    beq restore     /* don't schedule if so: the interrupted kernel code was running with external interrupts disabled */
#ifdef CONFIG_TRACE_IRQFLAGS
    /* Lockdep thinks irqs are enabled, we need to call
     * preempt_schedule_irq with IRQs off, so we inform lockdep
     * now that we -did- turn them off already
     */
    bl trace_hardirqs_off
#endif
1: bl preempt_schedule_irq
    rlwinm r9,r1,0,0,(31-THREAD_SHIFT)
    lwz r3,TI_FLAGS(r9)
    andi.   r0,r3,_TIF_NEED_RESCHED
    bne-    1b /* the flag may be set again after preempt_schedule_irq returns: loop until it is clear */
#ifdef CONFIG_TRACE_IRQFLAGS
    /* And now, to properly rebalance the above, we tell lockdep they
     * are being turned back on, which will happen when we return
     */
    bl trace_hardirqs_on
#endif
#else /* !CONFIG_PREEMPT: no kernel preemption, resume_kernel is just restore */
resume_kernel:
#endif /* CONFIG_PREEMPT */
    /* interrupts are hard-disabled at this point */
restore: /* the actual restore of the interrupted context and return from the exception */
#ifdef CONFIG_44x
BEGIN_MMU_FTR_SECTION
    b   1f
END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_47x)
    lis r4,icache_44x_need_flush@ha
    lwz r5,icache_44x_need_flush@l(r4)
    cmplwi cr0,r5,0
    beq+    1f
    li r6,0
    iccci   r0,r0
    stw r6,icache_44x_need_flush@l(r4) /* clear the need-flush flag after the iccci */
1:
#endif /* CONFIG_44x */

    lwz r9,_MSR(r1) /* r9 = MSR saved in the exception frame */
#ifdef CONFIG_TRACE_IRQFLAGS
    /* Lockdep doesn't know about the fact that IRQs are temporarily turned
     * off in this assembly code while peeking at TI_FLAGS() and such. However
     * we need to inform it if the exception turned interrupts off, and we
     * are about to turn them back on.
     *
     * The problem here sadly is that we don't know whether the exceptions was
     * one that turned interrupts off or not. So we always tell lockdep about
     * turning them on here when we go back to wherever we came from with EE
     * on, even if that may mean some redundant calls being tracked. Maybe later
     * we could encode what the exception did somewhere or test the exception
     * type in the pt_regs but that sounds overkill
     */
    andi.   r10,r9,MSR_EE
    beq 1f
    /*
     * Since the ftrace irqsoff latency trace checks CALLER_ADDR1,
     * which is the stack frame here, we need to force a stack frame
     * in case we came from user space.
     */
    stwu    r1,-32(r1)
    mflr    r0
    stw r0,4(r1)
    stwu    r1,-32(r1)
    bl trace_hardirqs_on
    lwz r1,0(r1)
    lwz r1,0(r1)
    lwz r9,_MSR(r1) /* reload the saved MSR into r9 after the call */
1:
#endif /* CONFIG_TRACE_IRQFLAGS */

    lwz r0,GPR0(r1)
    lwz r2,GPR2(r1)
    REST_4GPRS(3, r1)
    REST_2GPRS(7, r1)

    lwz r10,_XER(r1)
    lwz r11,_CTR(r1)
    mtspr   SPRN_XER,r10
    mtctr   r11

    PPC405_ERR77(0,r1)
BEGIN_FTR_SECTION
    lwarx   r11,0,r1
END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX)
    stwcx. r0,0,r1         /* to clear the reservation */

#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
    andi.   r10,r9,MSR_RI       /* check if this exception occurred */
    beql    nonrecoverable      /* at a bad place (MSR:RI = 0) */

    lwz r10,_CCR(r1)
    lwz r11,_LINK(r1)
    mtcrf   0xFF,r10
    mtlr    r11

    /*
     * Once we put values in SRR0 and SRR1, we are in a state
     * where exceptions are not recoverable, since taking an
     * exception will trash SRR0 and SRR1. Therefore we clear the
     * MSR:RI bit to indicate this. If we do take an exception,
     * we can't return to the point of the exception but we
     * can restart the exception exit path at the label
     * exc_exit_restart below. -- paulus
     */
    LOAD_MSR_KERNEL(r10,MSR_KERNEL & ~MSR_RI)
    SYNC
    MTMSRD(r10)     /* clear the RI bit */
    .globl exc_exit_restart
exc_exit_restart:
    lwz r12,_NIP(r1)
    FIX_SRR1(r9,r10)
    mtspr   SPRN_SRR0,r12 /* SRR0 = saved NIP */
    mtspr   SPRN_SRR1,r9 /* SRR1 = saved MSR */
    REST_4GPRS(9, r1)
    lwz r1,GPR1(r1)
    .globl exc_exit_restart_end
exc_exit_restart_end:
    SYNC
    RFI

#else /* !(CONFIG_4xx || CONFIG_BOOKE) */
    /*
     * This is a bit different on 4xx/Book-E because it doesn't have
     * the RI bit in the MSR.
     * The TLB miss handler checks if we have interrupted
     * the exception exit path and restarts it if so
     * (well maybe one day it will... :).
     */
    lwz r11,_LINK(r1)
    mtlr    r11
    lwz r10,_CCR(r1)
    mtcrf   0xff,r10
    REST_2GPRS(9, r1)
    .globl exc_exit_restart
exc_exit_restart:
    lwz r11,_NIP(r1)
    lwz r12,_MSR(r1)
exc_exit_start:
    mtspr   SPRN_SRR0,r11 /* SRR0 = saved NIP */
    mtspr   SPRN_SRR1,r12 /* SRR1 = saved MSR */
    REST_2GPRS(11, r1)
    lwz r1,GPR1(r1)
    .globl exc_exit_restart_end
exc_exit_restart_end:
    PPC405_ERR77_SYNC
    rfi
    b   .           /* prevent prefetch past rfi */