1 什么是缺页异常
缺页异常是指CPU访问的虚拟地址时, MMU没有办法找到对应的物理地址映射关系,或者与该物理页的访问权不一致而发生的异常。
CPU通过地址总线可以访问连接在地址总线上的所有外设,包括物理内存、IO设备等等,但从CPU发出的访问地址并非是这些外设在地址总线上的物理地址,而是一个虚拟地址,由MMU将虚拟地址转换成物理地址再从地址总线上发出,MMU上的这种虚拟地址和物理地址的转换关系是需要创建的,并且还需要设置这个物理页的访问权限。
下面总结下缺页异常的几种情况:
1、当MMU中确实没有创建虚拟页物理页映射关系,并且在该虚拟地址之后再没有当前进程的线性区vma的时候,可以肯定这是一个编码错误,这将杀掉该进程;
2、当MMU中确实没有创建虚拟页物理页映射关系,并且在该虚拟地址之后存在当前进程的线性区vma的时候,这很可能是缺页异常,并且可能是栈溢出导致的缺页异常;
3、当使用malloc/mmap()等希望访问物理空间的库函数/系统调用后,由于linux并未真正给新创建的vma映射物理页,此时若先进行写操作,将如上面的2的情况产生缺页异常,若先进行读操作虽也会产生缺页异常,将被映射给默认的零页(zero_pfn),等再进行写操作时,仍会产生缺页异常,这次必须分配物理页了,进入写时复制的流程;
4、当使用fork()等系统调用创建子进程时,子进程不论有无自己的vma,“它的”vma都有对于物理页的映射,但它们共同映射的这些物理页属性为只读,即linux并未给子进程真正分配物理页,当父子进程任何一方要写相应物理页时,导致缺页异常的写时复制;
目前来看,应该就是这四种情况,还是比较清晰的,可发现一个重要规律就是,linux是直到实在不行的时候才会分配物理页,把握这个原则理解的会好一些,下面详细的看缺页处理。
2 异常处理流程
2.1 do_page_fault()
arm的缺页处理函数在arch/arm/mm/fault.c
static int __kprobes
do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
{
struct task_struct *tsk;
struct mm_struct *mm;
int sig, code;
vm_fault_t fault;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
if (notify_page_fault(regs, fsr))
return 0;
/* 获取到缺页异常的进程描述符和其内存描述符 */
tsk = current;
mm = tsk->mm;
/* Enable interrupts if they were enabled in the parent context. */
if (interrupts_enabled(regs))
local_irq_enable();
/*
* If we're in an interrupt or have no user
* context, we must not take the fault..
*/
if (faulthandler_disabled() || !mm) // 判断当前是否在interrupt上下文或者禁止抢占或者内核进程中,是就跳转到on_context
goto no_context;
if (user_mode(regs))
flags |= FAULT_FLAG_USER;
if ((fsr & FSR_WRITE) && !(fsr & FSR_CM))
flags |= FAULT_FLAG_WRITE;
/*
* As per x86, we may deadlock here. However, since the kernel only
* validly references user space from well defined areas of the code,
* we can bug out early if this is from code which shouldn't.
*/
if (!down_read_trylock(&mm->mmap_sem)) {
if (!user_mode(regs) && !search_exception_tables(regs->ARM_pc)) // 发生在内核空间,且没有在exception tables查询到该地址,跳转到no_context
goto no_context;
retry:
down_read(&mm->mmap_sem); // 发生在用户空间则睡眠等待锁持有者释放锁
} else {
/*
* The above down_read_trylock() might have succeeded in
* which case, we'll have missed the might_sleep() from
* down_read()
*/
might_sleep();
#ifdef CONFIG_DEBUG_VM
if (!user_mode(regs) &&
!search_exception_tables(regs->ARM_pc))
goto no_context;
#endif
}
fault = __do_page_fault(mm, addr, fsr, flags, tsk);
/* If we need to retry but a fatal signal is pending, handle the
* signal first. We do not need to release the mmap_sem because
* it would already be released in __lock_page_or_retry in
* mm/filemap.c. */
if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) {
if (!user_mode(regs))
goto no_context;
return 0;
}
/*
* Major/minor page fault accounting is only done on the
* initial attempt. If we go through a retry, it is extremely
* likely that the page will be found in page cache at that point.
*/
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
if (!(fault & VM_FAULT_ERROR) && flags & FAULT_FLAG_ALLOW_RETRY) {
if (fault & VM_FAULT_MAJOR) {
tsk->maj_flt++;
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
regs, addr);
} else {
tsk->min_flt++;
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
regs, addr);
}
if (fault & VM_FAULT_RETRY) {
/* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
* of starvation. */
flags &= ~FAULT_FLAG_ALLOW_RETRY;
flags |= FAULT_FLAG_TRIED;
goto retry;
}
}
up_read(&mm->mmap_sem);
/*
* Handle the "normal" case first - VM_FAULT_MAJOR
*/
/*
* 如果返回值fault不是这里面的值,那么应该会是VM_FAULT_MAJOR或VM_FAULT_MINOR,说明问题解决了,返回。
* 一般正常情况下,__do_page_fault的返回值fault会是0(VM_FAULT_MINOR)或者其他一些值,都不是下面之后会看到的这些值
*/
if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS))))
return 0;
/*
* If we are in kernel mode at this point, we
* have no context to handle this fault with.
*/
if (!user_mode(regs))
goto no_context;
if (fault & VM_FAULT_OOM) {
/*
* We ran out of memory, call the OOM killer, and return to
* userspace (which will retry the fault, or kill us if we
* got oom-killed)
*/
pagefault_out_of_memory(); // 内存耗尽
return 0;
}
if (fault & VM_FAULT_SIGBUS) {
/*
* We had some memory, but were unable to
* successfully fix up this page fault.
*/
sig = SIGBUS;
code = BUS_ADRERR;
} else {
/*
* Something tried to access memory that
* isn't in our memory map..
*/
sig = SIGSEGV;
code = fault == VM_FAULT_BADACCESS ?
SEGV_ACCERR : SEGV_MAPERR;
}
__do_user_fault(tsk, addr, fsr, sig, code, regs); // 用户模式下错误处理,通过给用户进程发信号:SIGBUS/SIGSEGV
return 0;
no_context:
__do_kernel_fault(mm, addr, fsr, regs); // 错误发生在内核模式,如果内核无法处理,此处产生oops错误
return 0;
}
处理流程解析:
- 如果是发生在内核空间,不论是在临界区(中断/推后执行/临界区)还是内核进程本身(内核的mm为NULL),执行__do_kernel_fault();
- __do_page_fault()尽可能地去解决缺页问题;
- 无法解决的错误:(1)发生在内核,执行__do_kernel_fault();(2)用户空间内存耗尽,会通过OOM杀死进程;(3)内存足够但仍然无法处理,发生信号杀死进程。
2.2 __do_page_fault()
__do_page_fault()将会做进一步的分析,并通过handle_mm_fault()完成上述4中情况的缺页处理。
static vm_fault_t __kprobes
__do_page_fault(struct mm_struct *mm, unsigned long addr, unsigned int fsr,
unsigned int flags, struct task_struct *tsk)
{
struct vm_area_struct *vma;
vm_fault_t fault;
vma = find_vma(mm, addr); // 搜索出现异常的地址前向最近的的vma,addr < vm_end
fault = VM_FAULT_BADMAP;
if (unlikely(!vma))
goto out;
if (unlikely(vma->vm_start > addr)) // 找到的vma如果不包含addr,可能是向下增长的栈溢出
goto check_stack;
/*
* Ok, we have a good vm_area for this
* memory access, so we can handle it.
*/
good_area:
/*
* 权限错误直接返回,比如缺页报错(由参数fsr标识)报的是不可写/不可执行的错误,但addr所属vma线性区本身就不可写/不可执行,
* 那么就直接返回,因为问题根本不是缺页,而是vma有问题
*/
if (access_error(fsr, vma)) {
fault = VM_FAULT_BADACCESS;
goto out;
}
return handle_mm_fault(vma, addr & PAGE_MASK, flags);
check_stack:
/* Don't allow expansion below FIRST_USER_ADDRESS */
/*
* addr后面的vma的vm_flags含有VM_GROWSDOWN标志,说明这个vma是属于栈,所以addr是在栈中,有可能是栈空间不够时再进栈导致的访问错误。
* 查看栈是否还能扩展,如果不能扩展(expand_stack返回非0)则确认确实是栈溢出导致,即addr确实是栈中地址,不是非法地址
*/
if (vma->vm_flags & VM_GROWSDOWN &&
addr >= FIRST_USER_ADDRESS && !expand_stack(vma, addr))
goto good_area;
out:
return fault;
}
处理流程解析:
- 查看缺页异常的这个虚拟地址addr,找它后面最近的vma,如果没有找到,说明访问的地址是真的错误了,因为它根本不在所分配的任何一个vma线性区;这是一种严重错误,将返回错误码(fault)VM_FAULT_BADMAP,内核会杀掉这个进程;
- 如果addr后面有vma,但addr并未落在这个vma的区间内,这存在一种可能,即addr可能是栈的一个地址,它后面的vma是栈的vma。栈溢出时,即访问addr时,该addr并未落在vma中所以更无二级页表映射,导致缺页异常。通过查看addr后面的vma是否是向下增长并且栈是否可以扩展,以此界定addr是不是栈地址,如果是则进入缺页异常处理流程,否则同样返回错误码(fault)VM_FAULT_BADMAP,内核会杀掉这个进程;
- 权限错误立即返回,比如缺页报错(fsr)报的是不可写,但vma本身就不可写,那么就直接返回。因为问题根本不是缺页,而是vma就已经有问题;返回错误码(fault) VM_FAULT_BADACCESS,这也是一种严重错误,内核会杀掉这个进程;
- 最后是对确实缺页异常的情况进行处理,调用函数handle_mm_fault(),正常情况下将返回VM_FAULT_MAJOR或VM_FAULT_MINOR,返回错误码fault,并加一task的maj_flt或min_flt成员。
2.3. handle_mm_fault()
进一步调用__handle_mm_fault()完成处理工作。
/*
* By the time we get here, we already hold the mm semaphore
*
* The mmap_sem may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
*/
vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
unsigned int flags)
{
vm_fault_t ret;
__set_current_state(TASK_RUNNING);
count_vm_event(PGFAULT);
count_memcg_event_mm(vma->vm_mm, PGFAULT);
/* do counter updates before entering really critical section. */
check_sync_rss_stat(current);
if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
flags & FAULT_FLAG_INSTRUCTION,
flags & FAULT_FLAG_REMOTE))
return VM_FAULT_SIGSEGV;
/*
* Enable the memcg OOM handling for faults triggered in user
* space. Kernel faults are handled more gracefully.
*/
if (flags & FAULT_FLAG_USER)
mem_cgroup_enter_user_fault();
if (unlikely(is_vm_hugetlb_page(vma)))
ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
else
ret = __handle_mm_fault(vma, address, flags);
if (flags & FAULT_FLAG_USER) {
mem_cgroup_exit_user_fault();
/*
* The task may have entered a memcg OOM situation but
* if the allocation error was handled gracefully (no
* VM_FAULT_OOM), there is no need to kill anything.
* Just clean up the OOM state peacefully.
*/
if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
mem_cgroup_oom_synchronize(false);
}
return ret;
}
2.4. __handle_mm_fault()
先确定缺页的各级页表是否存在,并初始化vmf用于记录和缺页相关的信息。
/*
* By the time we get here, we already hold the mm semaphore
*
* The mmap_sem may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
*/
static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
{
struct vm_fault vmf = {
.vma = vma,