缺页中断处理的核心函数是do_page_fault(),该函数的实现和具体的体系结构相关。
[arch/arm/mm/fault.c]
/*
 * do_page_fault() - ARM entry point for resolving a page fault.
 * @addr: faulting virtual address
 * @fsr:  fault status value; FSR_WRITE set means the access was a write
 * @regs: register state at the time of the fault
 *
 * Tries to resolve the fault through __do_page_fault() while holding
 * mm->mmap_sem for read.  If that fails, a user-mode fault ends in the OOM
 * path or in a SIGBUS/SIGSEGV sent by __do_user_fault(); a kernel-mode
 * fault is passed to __do_kernel_fault() (exception fixup, else Oops).
 *
 * Every path returns 0.
 */
static int __kprobes
do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	int fault, sig, code;
	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;

	if (notify_page_fault(regs, fsr))
		return 0;

	tsk = current;
	mm  = tsk->mm;

	/* Enable interrupts if they were enabled in the parent context. */
	if (interrupts_enabled(regs))
		local_irq_enable();

	/*
	 * If we're in an interrupt or have no user
	 * context, we must not take the fault..
	 *
	 * A NULL tsk->mm means the task has no user address space (per the
	 * original notes: a kernel thread).  Both cases are handed to
	 * __do_kernel_fault(), which first tries an exception fixup and only
	 * Oopses when there is none -- this is not an unconditional panic.
	 */
	if (in_atomic() || !mm)
		goto no_context;

	/* Record who faulted and how, for the generic fault handler. */
	if (user_mode(regs))
		flags |= FAULT_FLAG_USER;
	if (fsr & FSR_WRITE)
		flags |= FAULT_FLAG_WRITE;

	/*
	 * As per x86, we may deadlock here.  However, since the kernel only
	 * validly references user space from well defined areas of the code,
	 * we can bug out early if this is from code which shouldn't.
	 *
	 * down_read_trylock() returning 0 means mmap_sem is contended.  A
	 * user-mode fault may simply sleep in down_read().  A kernel-mode
	 * fault may only do so when search_exception_tables() finds an entry
	 * for the faulting PC (presumably a registered fixup, i.e. a
	 * sanctioned user-space access -- confirm against the helper);
	 * otherwise go to no_context.
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if (!user_mode(regs) && !search_exception_tables(regs->ARM_pc))
			goto no_context;
retry:
		down_read(&mm->mmap_sem);
	} else {
		/*
		 * The above down_read_trylock() might have succeeded in
		 * which case, we'll have missed the might_sleep() from
		 * down_read()
		 */
		might_sleep();
#ifdef CONFIG_DEBUG_VM
		if (!user_mode(regs) &&
		    !search_exception_tables(regs->ARM_pc))
			goto no_context;
#endif
	}

	/*
	 * Look up the VMA, check access rights and call handle_mm_fault().
	 * The result is a VM_FAULT_* bit mask.
	 */
	fault = __do_page_fault(mm, addr, fsr, flags, tsk);

	/*
	 * If we need to retry but a fatal signal is pending, handle the
	 * signal first. We do not need to release the mmap_sem because
	 * it would already be released in __lock_page_or_retry in
	 * mm/filemap.c.
	 *
	 * For a user-mode fault, returning lets the signal be handled on
	 * the way back to user space.  For a kernel-mode fault, returning 0
	 * would re-execute the faulting instruction, which faults again
	 * while the signal stays pending; route it through no_context so
	 * __do_kernel_fault() can apply the exception fixup instead.
	 */
	if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) {
		if (!user_mode(regs))
			goto no_context;
		return 0;
	}

	/*
	 * Major/minor page fault accounting is only done on the
	 * initial attempt. If we go through a retry, it is extremely
	 * likely that the page will be found in page cache at that point.
	 */
	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
	if (!(fault & VM_FAULT_ERROR) && flags & FAULT_FLAG_ALLOW_RETRY) {
		if (fault & VM_FAULT_MAJOR) {
			tsk->maj_flt++;
			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
					regs, addr);
		} else {
			tsk->min_flt++;
			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
					regs, addr);
		}
		if (fault & VM_FAULT_RETRY) {
			/* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
			 * of starvation. */
			flags &= ~FAULT_FLAG_ALLOW_RETRY;
			flags |= FAULT_FLAG_TRIED;
			goto retry;
		}
	}

	up_read(&mm->mmap_sem);

	/*
	 * Handle the "normal" case first - VM_FAULT_MAJOR / VM_FAULT_MINOR
	 *
	 * No error bit and no BADMAP/BADACCESS: the fault has been resolved.
	 */
	if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS))))
		return 0;

	/*
	 * If we are in kernel mode at this point, we
	 * have no context to handle this fault with.
	 */
	if (!user_mode(regs))
		goto no_context;

	if (fault & VM_FAULT_OOM) {
		/*
		 * We ran out of memory, call the OOM killer, and return to
		 * userspace (which will retry the fault, or kill us if we
		 * got oom-killed)
		 */
		pagefault_out_of_memory();
		return 0;
	}

	if (fault & VM_FAULT_SIGBUS) {
		/*
		 * We had some memory, but were unable to
		 * successfully fix up this page fault.
		 */
		sig = SIGBUS;
		code = BUS_ADRERR;
	} else {
		/*
		 * Something tried to access memory that
		 * isn't in our memory map..
		 */
		sig = SIGSEGV;
		code = fault == VM_FAULT_BADACCESS ?
			SEGV_ACCERR : SEGV_MAPERR;
	}

	/* Nothing more the kernel can do: signal the user task. */
	__do_user_fault(tsk, addr, fsr, sig, code, regs);
	return 0;

no_context:
	/* Kernel-mode (or context-less) fault: fixup if possible, else Oops. */
	__do_kernel_fault(mm, addr, fsr, regs);
	return 0;
}
__do_page_fault()函数:
[arch/arm/mm/fault.c]
[do_page_fault()->__do_page_fault()]
/*
 * __do_page_fault() - validate the faulting address and dispatch the fault.
 * @mm:    address space to search
 * @addr:  faulting virtual address
 * @fsr:   fault status value, checked against the VMA by access_error()
 * @flags: FAULT_FLAG_* bits forwarded to handle_mm_fault()
 * @tsk:   not used by this function
 *
 * Returns VM_FAULT_BADMAP when no usable VMA covers @addr,
 * VM_FAULT_BADACCESS when access_error() rejects the access, and otherwise
 * whatever handle_mm_fault() returns.
 */
static int __kprobes
__do_page_fault(struct mm_struct *mm, unsigned long addr, unsigned int fsr,
		unsigned int flags, struct task_struct *tsk)
{
	struct vm_area_struct *vma = find_vma(mm, addr);

	/* find_vma() found nothing for this address. */
	if (unlikely(!vma))
		return VM_FAULT_BADMAP;

	if (unlikely(vma->vm_start > addr)) {
		/*
		 * The VMA starts above @addr.  That is only acceptable when
		 * the VMA may grow downwards, @addr is not below
		 * FIRST_USER_ADDRESS, and expand_stack() succeeds (returns 0).
		 */
		if (!(vma->vm_flags & VM_GROWSDOWN) ||
		    addr < FIRST_USER_ADDRESS ||
		    expand_stack(vma, addr))
			return VM_FAULT_BADMAP;
	}

	/*
	 * Ok, we have a good vm_area for this memory access; make sure the
	 * kind of access that faulted is permitted on it.
	 */
	if (access_error(fsr, vma))
		return VM_FAULT_BADACCESS;

	return handle_mm_fault(mm, vma, addr & PAGE_MASK, flags);
}
回到do_page_fault()函数
VM_FAULT返回值类型(handle_mm_fault()的返回值)
[include/linux/mm.h]
/*
* Different kinds of faults, as returned by handle_mm_fault().
* Used to decide whether a process gets delivered SIGBUS or
* just gets major/minor fault counters bumped up.
*/
#define VM_FAULT_MINOR 0 /* For backwards compat. Remove me quickly. */
#define VM_FAULT_OOM 0x0001
#define VM_FAULT_SIGBUS 0x0002
#define VM_FAULT_MAJOR 0x0004
#define VM_FAULT_WRITE 0x0008 /* Special case for get_user_pages */
#define VM_FAULT_HWPOISON 0x0010 /* Hit poisoned small page */
#define VM_FAULT_HWPOISON_LARGE 0x0020 /* Hit poisoned large page. Index encoded in upper bits */
#define VM_FAULT_SIGSEGV 0x0040
#define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */
#define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */
#define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */
#define VM_FAULT_FALLBACK 0x0800 /* huge page fault failed, fall back to small */
#define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */
#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | \
VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE | \
VM_FAULT_FALLBACK)
__do_user_fault()函数:向用户进程发送信号(SIGSEGV/SIGBUS)
[do_page_fault()->__do_user_fault()]
/*
* Something tried to access memory that isn't in our memory map..
* User mode accesses just cause a SIGSEGV
*/
static void
__do_user_fault(struct task_struct *tsk, unsigned long addr,
unsigned int fsr, unsigned int sig, int code,
struct pt_regs *regs)
{
struct siginfo si;
#ifdef CONFIG_DEBUG_USER
if (((user_debug & UDBG_SEGV) && (sig == SIGSEGV)) ||
((user_debug & UDBG_BUS) && (sig == SIGBUS))) {
printk(KERN_DEBUG "%s: unhandled page fault (%d) at 0x%08lx, code 0x%03x\n",
tsk->comm, sig, addr, fsr);
show_pte(tsk->mm, addr);
show_regs(regs);
}
#endif
tsk->thread.address = addr;
tsk->thread.error_code = fsr;
tsk->thread.trap_no = 14;
si.si_signo = sig;
si.si_errno = 0;
si.si_code = code;
si.si_addr = (void __user *)addr;
force_sig_info(sig, &si, tsk);
}
回到do_page_fault函数
__do_kernel_fault()函数
/*
 * __do_kernel_fault() - handle a fault that cannot be resolved for the kernel.
 * @mm:   address space passed to show_pte() for the page-table dump
 * @addr: faulting virtual address
 * @fsr:  fault status value, passed on to die()
 * @regs: register state at the time of the fault
 *
 * Oops.  The kernel tried to access some page that wasn't present.
 * If fixup_exception() handles the fault, return quietly; otherwise print
 * the diagnostics, die() and exit the current task with SIGKILL.
 */
static void
__do_kernel_fault(struct mm_struct *mm, unsigned long addr, unsigned int fsr,
		  struct pt_regs *regs)
{
	const char *what;

	/* Are we prepared to handle this kernel fault? */
	if (fixup_exception(regs))
		return;

	/* Addresses inside the first page are reported as NULL dereferences. */
	if (addr < PAGE_SIZE)
		what = "NULL pointer dereference";
	else
		what = "paging request";

	/* No handler, we'll have to terminate things with extreme prejudice. */
	bust_spinlocks(1);
	pr_alert("Unable to handle kernel %s at virtual address %08lx\n",
		 what, addr);
	show_pte(mm, addr);
	die("Oops", regs, fsr);
	bust_spinlocks(0);
	do_exit(SIGKILL);
}
回到do_page_fault函数
__handle_mm_fault()函数:
handle_mm_fault()函数的核心函数是__handle_mm_fault(),它的实现在mm/memory.c
[do_page_fault()->__do_page_fault()->handle_mm_fault()->__handle_mm_fault()]
/*
 * __handle_mm_fault() - walk (and allocate as needed) the page-table levels
 * for @address, deal with huge-page cases at PMD level, then pass the PTE
 * to handle_pte_fault().
 *
 * Returns VM_FAULT_OOM when a page-table level cannot be allocated, 0 on
 * the "just retry later" paths, and otherwise the result of the helper
 * that handled the fault.
 *
 * By the time we get here, we already hold the mm semaphore
 *
 * The mmap_sem may have been released depending on flags and our
 * return value. See filemap_fault() and __lock_page_or_retry().
 */
static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
/* hugetlb VMAs are handled entirely by hugetlb_fault(). */
if (unlikely(is_vm_hugetlb_page(vma)))
return hugetlb_fault(mm, vma, address, flags);
/* PGD entry for @address in this mm's page table. */
pgd = pgd_offset(mm, address);
/* Get the PUD entry; a NULL result is reported as VM_FAULT_OOM. */
pud = pud_alloc(mm, pgd, address);
if (!pud)
return VM_FAULT_OOM;
/* Same for the PMD entry. */
pmd = pmd_alloc(mm, pud, address);
if (!pmd)
return VM_FAULT_OOM;
/*
 * Empty PMD and transparent huge pages enabled for this VMA: try to
 * install an anonymous huge page (only when the VMA has no vm_ops).
 * VM_FAULT_FALLBACK means "continue with the small-page path below".
 */
if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
int ret = VM_FAULT_FALLBACK;
if (!vma->vm_ops)
ret = do_huge_pmd_anonymous_page(mm, vma, address,
pmd, flags);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
/* Work on a snapshot of the PMD; barrier() follows the read. */
pmd_t orig_pmd = *pmd;
int ret;
barrier();
if (pmd_trans_huge(orig_pmd)) {
unsigned int dirty = flags & FAULT_FLAG_WRITE;
/*
 * If the pmd is splitting, return and retry the
 * fault. Alternative: wait until the split
 * is done, and goto retry.
 */
if (pmd_trans_splitting(orig_pmd))
return 0;
if (pmd_protnone(orig_pmd))
return do_huge_pmd_numa_page(mm, vma, address,
orig_pmd, pmd);
/* Write fault on a non-writable huge PMD. */
if (dirty && !pmd_write(orig_pmd)) {
ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
orig_pmd);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
huge_pmd_set_accessed(mm, vma, address, pmd,
orig_pmd, dirty);
return 0;
}
}
}
/*
 * Use __pte_alloc instead of pte_alloc_map, because we can't
 * run pte_offset_map on the pmd, if an huge pmd could
 * materialize from under us from a different thread.
 *
 * If the PMD entry is still empty, allocate a PTE table for it; a
 * non-zero return from __pte_alloc() is reported as VM_FAULT_OOM.
 */
if (unlikely(pmd_none(*pmd)) &&
unlikely(__pte_alloc(mm, vma, pmd, address)))
return VM_FAULT_OOM;
/* if an huge pmd materialized from under us just retry later */
if (unlikely(pmd_trans_huge(*pmd)))
return 0;
/*
 * A regular pmd is established and it can't morph into a huge pmd
 * from under us anymore at this point because we hold the mmap_sem
 * read mode and khugepaged takes it in write mode. So now it's
 * safe to run pte_offset_map().
 */
/* Locate the PTE entry for @address. */
pte = pte_offset_map(pmd, address);
/* The PTE-level decision is made in handle_pte_fault(). */
return handle_pte_fault(mm, vma, address, pte, pmd, flags);
}
/*回到handle_mm_fault()函数*/
handle_pte_fault()函数:
[do_page_fault()->__do_page_fault()->handle_mm_fault()->__handle_mm_fault()->handle_pte_fault()]
/*
 * handle_pte_fault() - decide, from the PTE contents, which handler
 * resolves the fault: do_fault(), do_anonymous_page(), do_swap_page(),
 * do_numa_page(), do_wp_page(), or an in-place update of the PTE's
 * access/dirty state.
 *
 * These routines also need to handle stuff like marking pages dirty
 * and/or accessed for architectures that don't do it in hardware (most
 * RISC architectures). The early dirtying is also good on the i386.
 *
 * There is also a hook called "update_mmu_cache()" that architectures
 * with external mmu caches can use to update those (ie the Sparc or
 * PowerPC hashed page tables that act as extended TLBs).
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with pte unmapped and unlocked.
 *
 * The mmap_sem may have been released depending on flags and our
 * return value. See filemap_fault() and __lock_page_or_retry().
 */
static int handle_pte_fault(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
pte_t *pte, pmd_t *pmd, unsigned int flags)
{
pte_t entry;
spinlock_t *ptl;
/*
 * some architectures can have larger ptes than wordsize,
 * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and CONFIG_32BIT=y,
 * so READ_ONCE or ACCESS_ONCE cannot guarantee atomic accesses.
 * The code below just needs a consistent view for the ifs and
 * we later double check anyway with the ptl lock held. So here
 * a barrier will do.
 */
/* Take one snapshot of the PTE and base all checks below on it. */
entry = *pte;
barrier();
/*
 * PTE not present: no page is mapped here yet, i.e. a genuine
 * missing-page fault.
 */
if (!pte_present(entry)) {
/*
 * (1) PTE is completely empty (pte_none()):
 *     - the VMA has vm_ops with a ->fault callback (per the original
 *       notes, the usual file-mapping case): do_fault();
 *     - otherwise, including a VMA whose vm_ops has no ->fault:
 *       do_anonymous_page().
 */
if (pte_none(entry)) {
if (vma->vm_ops) {
if (likely(vma->vm_ops->fault))
return do_fault(mm, vma, address, pte,
pmd, flags, entry);
}
return do_anonymous_page(mm, vma, address,
pte, pmd, flags);
}
/*
 * (2) PTE is non-empty but not present: per the original notes the
 * page has been swapped out; handled by do_swap_page().
 */
return do_swap_page(mm, vma, address,
pte, pmd, flags, entry);
}
if (pte_protnone(entry))
return do_numa_page(mm, vma, address, entry, pte, pmd);
/*
 * From here on the PTE is present.  Take the PTE lock and re-check
 * that the entry still matches the snapshot; if it changed meanwhile,
 * just unlock and return 0.
 */
ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
if (unlikely(!pte_same(*pte, entry)))
goto unlock;
/*
 * Write fault on a PTE that is not writable: hand off to do_wp_page()
 * and return its result.  Per the original notes this is the
 * copy-on-write case, e.g. memory shared between parent and child.
 * Otherwise a write fault marks the entry dirty.
 */
if (flags & FAULT_FLAG_WRITE) {
if (!pte_write(entry))/* write access to a non-writable PTE: do_wp_page() takes over */
return do_wp_page(mm, vma, address,
pte, pmd, ptl, entry);
entry = pte_mkdirty(entry);
}
/*
 * Mark the entry young.  NOTE(review): the original notes say this is
 * _PAGE_ACCESSED on x86 and L_PTE_YOUNG in the Linux-view PTE on ARM --
 * not visible here, confirm against the arch headers.
 */
entry = pte_mkyoung(entry);
/*
 * Store the updated entry.  A non-zero return from
 * ptep_set_access_flags() leads to update_mmu_cache().
 */
if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
update_mmu_cache(vma, address, pte);
} else {
/*
 * This is needed only for protection faults but the arch code
 * is not yet telling us if this is a protection fault or not.
 * This still avoids useless tlb flushes for .text page faults
 * with threads.
 */
if (flags & FAULT_FLAG_WRITE)
flush_tlb_fix_spurious_fault(vma, address);
}
unlock:
pte_unmap_unlock(pte, ptl);
return 0;
}