产生缺页异常中断的几种情况
- 当内存管理单元(MMU)中确实没有创建虚拟页到物理页的映射关系,并且在该虚拟地址之后再没有当前进程的线性区(vma)的时候,可以肯定这是一个编码错误,这将杀掉该进程
- 当MMU中确实没有创建虚拟页物理页映射关系,并且在该虚拟地址之后存在当前进程的线性区vma的时候,这很可能是缺页中断,并且可能是栈溢出导致的缺页中断;
- 当使用malloc/mmap等希望访问物理空间的库函数/系统调用后,由于linux并未真正给新创建的vma映射物理页,此时若先进行写操作,将和2产生缺页中断的情况一样;若先进行读操作虽然也会产生缺页异常,将被映射给默认的零页,等再进行写操作时,仍会产生缺页中断,这次必须分配1物理页了,进入写时复制的流程;
- 当使用fork等系统调用创建子进程时,子进程不论有无自己的vma,它的vma都有对于物理页的映射,但它们共同映射的这些物理页属性为只读,即linux并未给子进程真正分配物理页,当父子进程任何一方要写相应物理页时,导致缺页中断的写时复制
kernel代码流程
arch/x86/mm/fault.c
dotraplinkage void notrace
do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
//the faulting address is stored in the CR2 register (fixed by x86 hardware), so it must be read from there
unsigned long address = read_cr2(); /* Get the faulting address */
enum ctx_state prev_state;
prev_state = exception_enter();//enter the exception-handling context
if (trace_pagefault_enabled())
trace_page_fault_entries(address, regs, error_code);
__do_page_fault(regs, error_code, address);//the main page-fault handling function
exception_exit(prev_state);//leave the exception-handling context
}
NOKPROBE_SYMBOL(do_page_fault);
__do_page_fault函数:
static noinline void
__do_page_fault(struct pt_regs *regs, unsigned long error_code,
unsigned long address)
{
struct vm_area_struct *vma;//one contiguous region of the process virtual address space
struct task_struct *tsk;//process descriptor
struct mm_struct *mm;//memory descriptor of the process
vm_fault_t fault, major = 0;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
u32 pkey;
tsk = current;//descriptor of the task currently running on this CPU
mm = tsk->mm;//then fetch that task's memory descriptor
prefetchw(&mm->mmap_sem);//prefetch the mmap_sem read/write semaphore cache line
//MMIO regions should never fault here: they are normally ioremap'ed into the vmalloc area and accessed there, so nothing is allocated at this point
if (unlikely(kmmio_fault(regs, address)))
return;
/*
* We fault-in kernel-space virtual memory on-demand. The
* 'reference' page table is init_mm.pgd.
*
* NOTE! We MUST NOT take any locks for this case. We may
* be in an interrupt or a critical region, and should
* only copy the information from the master page table,
* nothing more.
*
* This verifies that the fault happens in kernel space
* (error_code & 4) == 0, and that the fault was not a
* protection error (error_code & 9) == 0.
*/
if (unlikely(fault_in_kernel_space(address))) {//the faulting address lies in kernel space
if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {//and it is neither a reserved-bit, user-mode nor protection fault
if (vmalloc_fault(address) >= 0)//for a vmalloc-area fault no page is allocated; the entry is synced from the kernel master page table (init_mm) into the process page table
return;
}
/* Can handle a stale RO->RW TLB: */
//check whether this is a spurious fault caused by a stale TLB entry (TLB flushes are delayed because eager flushing is costly)
if (spurious_fault(error_code, address))
return;
/* kprobes don't want to hook the spurious faults: */
//check whether the spurious fault was triggered by kprobes
if (kprobes_fault(regs))
return;
/*
* Don't take the mm semaphore here. If we fixup a prefetch
* fault we could otherwise deadlock:
*/
//kernel-mode fault on a kernel address: the vmalloc case was already handled above, so this is some other kernel error; report it and return
bad_area_nosemaphore(regs, error_code, address, NULL);
return;
}
/* kprobes don't want to hook the spurious faults: */
//now handling a user-space fault; again check for a kprobes-induced spurious fault
if (unlikely(kprobes_fault(regs)))
return;
if (unlikely(error_code & X86_PF_RSVD))
pgtable_bad(regs, error_code, address);
if (unlikely(smap_violation(error_code, regs))) {//SMAP violation: a supervisor-mode access to a user page was blocked
bad_area_nosemaphore(regs, error_code, address, NULL);
return;
}
/*
* If we're in an interrupt, have no user context or are running
* in a region with pagefaults disabled then we must not take the fault
*/
//if we are in an interrupt, have no user context, or page faults are disabled, we cannot handle the fault here; report it and return
if (unlikely(faulthandler_disabled() || !mm)) {
bad_area_nosemaphore(regs, error_code, address, NULL);
return;
}
/*
* It's safe to allow irq's after cr2 has been saved and the
* vmalloc fault has been handled.
*
* User-mode registers count as a user access even for any
* potential system fault or CPU buglet:
*/
//re-enable interrupts where it is safe, shortening the interrupts-off window caused by the fault
if (user_mode(regs)) {
local_irq_enable();
error_code |= X86_PF_USER;
flags |= FAULT_FLAG_USER;
} else {
if (regs->flags & X86_EFLAGS_IF)
local_irq_enable();
}
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
if (error_code & X86_PF_WRITE)
flags |= FAULT_FLAG_WRITE;
if (error_code & X86_PF_INSTR)
flags |= FAULT_FLAG_INSTRUCTION;
/*
* When running in the kernel we expect faults to occur only to
* addresses in user space. All other faults represent errors in
* the kernel and should generate an OOPS. Unfortunately, in the
* case of an erroneous fault occurring in a code path which already
* holds mmap_sem we will deadlock attempting to validate the fault
* against the address space. Luckily the kernel only validly
* references user space from well defined areas of code, which are
* listed in the exceptions table.
*
* As the vast majority of faults will be valid we will only perform
* the source reference check when there is a possibility of a
* deadlock. Attempt to lock the address space, if we cannot we then
* validate the source. If this is invalid we can skip the address
* space check, thus avoiding the deadlock:
*/
//avoid deadlocking on mmap_sem when some other kernel bug already holds it
if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
if (!(error_code & X86_PF_USER) &&
!search_exception_tables(regs->ip)) {
bad_area_nosemaphore(regs, error_code, address, NULL);
return;
}
retry:
down_read(&mm->mmap_sem);
} else {
/*
* The above down_read_trylock() might have succeeded in
* which case we'll have missed the might_sleep() from
* down_read():
*/
might_sleep();
}
//look up the vma covering the faulting address in the current process address space
vma = find_vma(mm, address);
if (unlikely(!vma)) {//no vma found: report the bad access (the lock is released inside) and return
bad_area(regs, error_code, address);
return;
}
if (likely(vma->vm_start <= address))//address lies inside the vma: a normal demand-paging fault, go allocate memory
goto good_area;
if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {//address below a vma that cannot grow down: the process touched an invalid address; report and return
bad_area(regs, error_code, address);
return;
}
if (error_code & X86_PF_USER) {
/*
* Accessing the stack below %sp is always a bug.
* The large cushion allows instructions like enter
* and pusha to work. ("enter $65535, $31" pushes
* 32 pointers and then decrements %sp by 65535.)
*/
//the faulting address must be reasonably close to the stack pointer
if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
bad_area(regs, error_code, address);
return;
}
}
//grow the stack vma: stack addresses are allocated dynamically rather than being fixed
if (unlikely(expand_stack(vma, address))) {//expansion failed; report and return
bad_area(regs, error_code, address);
return;
}
/*
* Ok, we have a good vm_area for this memory access, so
* we can handle it..
*/
good_area://normal page-fault handling: demand paging, allocate physical memory
if (unlikely(access_error(error_code, vma))) {
bad_area_access_error(regs, error_code, address, vma);
return;
}
/*
* If for any reason at all we couldn't handle the fault,
* make sure we exit gracefully rather than endlessly redo
* the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if
* we get VM_FAULT_RETRY back, the mmap_sem has been unlocked.
*
* Note that handle_userfault() may also release and reacquire mmap_sem
* (and not return with VM_FAULT_RETRY), when returning to userland to
* repeat the page fault later with a VM_FAULT_NOPAGE retval
* (potentially after handling any pending signal during the return to
* userland). The return to userland is identified whenever
* FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags.
* Thus we have to be careful about not touching vma after handling the
* fault, so we read the pkey beforehand.
*/
pkey = vma_pkey(vma);//we must not touch the vma after handling the fault, so read the pkey beforehand
//enter the real, architecture-independent fault handler for user-space faults
fault = handle_mm_fault(vma, address, flags);
//below: post-fault checks and bookkeeping
major |= fault & VM_FAULT_MAJOR;
/*
* If we need to retry the mmap_sem has already been released,
* and if there is a fatal signal pending there is no guarantee
* that we made any progress. Handle this case first.
*/
if (unlikely(fault & VM_FAULT_RETRY)) {
/* Retry at most once */
if (flags & FAULT_FLAG_ALLOW_RETRY) {
flags &= ~FAULT_FLAG_ALLOW_RETRY;
flags |= FAULT_FLAG_TRIED;
if (!fatal_signal_pending(tsk))
goto retry;
}
/* User mode? Just return to handle the fatal exception */
if (flags & FAULT_FLAG_USER)
return;
/* Not returning to user mode? Handle exceptions or die: */
no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
return;
}
up_read(&mm->mmap_sem);
if (unlikely(fault & VM_FAULT_ERROR)) {
mm_fault_error(regs, error_code, address, &pkey, fault);
return;
}
/*
* Major/minor page fault accounting. If any of the events
* returned VM_FAULT_MAJOR, we account it as a major fault.
*/
if (major) {
tsk->maj_flt++;
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
} else {
tsk->min_flt++;
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
}
check_v8086_mode(regs, address, tsk);//virtual-8086 mode checks, kept for compatibility with old CPUs
}
重点讲解handle_mm_fault函数,函数位于mm/memory.c文件中
vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
unsigned int flags)
{
vm_fault_t ret;
__set_current_state(TASK_RUNNING);//mark the task as running
count_vm_event(PGFAULT);//bump the global vmstat page-fault counter
count_memcg_event_mm(vma->vm_mm, PGFAULT);//bump the cgroup page-fault counter
/* do counter updates before entering really critical section. */
check_sync_rss_stat(current);
//check whether this kind of access to the vma is architecturally permitted
if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
flags & FAULT_FLAG_INSTRUCTION,
flags & FAULT_FLAG_REMOTE))
return VM_FAULT_SIGSEGV;
/*
* Enable the memcg OOM handling for faults triggered in user
* space. Kernel faults are handled more gracefully.
*/
if (flags & FAULT_FLAG_USER)//enable memcg OOM handling for user-space faults
mem_cgroup_enter_user_fault();
if (unlikely(is_vm_hugetlb_page(vma)))
ret = hugetlb_fault(vma->vm_mm, vma, address, flags);//huge-page fault handling
else
ret = __handle_mm_fault(vma, address, flags);//normal page-fault handling
if (flags & FAULT_FLAG_USER) {//memcg OOM handling was enabled above; remember to disable it again
mem_cgroup_exit_user_fault();
/*
* The task may have entered a memcg OOM situation but
* if the allocation error was handled gracefully (no
* VM_FAULT_OOM), there is no need to kill anything.
* Just clean up the OOM state peacefully.
*/
//the task may have entered memcg OOM, but if the allocation error was handled gracefully (no VM_FAULT_OOM) nothing needs to be killed; just clean up the OOM state
if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
mem_cgroup_oom_synchronize(false);
}
return ret;
}
由于巨型页现在还没看,先说说普通的缺页处理函数__handle_mm_fault:
static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
{
struct vm_fault vmf = {
.vma = vma,
.address = address & PAGE_MASK,
.flags = flags,
.pgoff = linear_page_index(vma, address),
.gfp_mask = __get_fault_gfp_mask(vma),
};
unsigned int dirty = flags & FAULT_FLAG_WRITE;
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
p4d_t *p4d;
vm_fault_t ret;
pgd = pgd_offset(mm, address);//locate the page-global-directory (PGD) entry for the address
p4d = p4d_alloc(mm, pgd, address);//locate the P4D entry under that PGD entry, creating it if absent
if (!p4d)
return VM_FAULT_OOM;
vmf.pud = pud_alloc(mm, p4d, address);//locate the PUD entry under the P4D, creating it if absent
if (!vmf.pud)
return VM_FAULT_OOM;
if (pud_none(*vmf.pud) && transparent_hugepage_enabled(vma)) {
ret = create_huge_pud(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
pud_t orig_pud = *vmf.pud;
barrier();
if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
/* NUMA case for anonymous PUDs would go here */
if (dirty && !pud_write(orig_pud)) {
ret = wp_huge_pud(&vmf, orig_pud);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
huge_pud_set_accessed(&vmf, orig_pud);
return 0;
}
}
}
vmf.pmd = pmd_alloc(mm, vmf.pud, address);//locate the PMD entry under the PUD, creating it if absent
if (!vmf.pmd)
return VM_FAULT_OOM;
if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
ret = create_huge_pmd(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
pmd_t orig_pmd = *vmf.pmd;
barrier();
if (unlikely(is_swap_pmd(orig_pmd))) {
VM_BUG_ON(thp_migration_supported() &&
!is_pmd_migration_entry(orig_pmd));
if (is_pmd_migration_entry(orig_pmd))
pmd_migration_entry_wait(mm, vmf.pmd);
return 0;
}
if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
return do_huge_pmd_numa_page(&vmf, orig_pmd);
if (dirty && !pmd_write(orig_pmd)) {
ret = wp_huge_pmd(&vmf, orig_pmd);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
huge_pmd_set_accessed(&vmf, orig_pmd);
return 0;
}
}
}
return handle_pte_fault(&vmf);//descend into the PTE-level fault handler
}
函数handle_pte_fault处理页表流程如下:
static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
{
pte_t entry;
if (unlikely(pmd_none(*vmf->pmd))) {//is the page-middle-directory (PMD) entry empty?
/*
* Leave __pte_alloc() until later: because vm_ops->fault may
* want to allocate huge page, and if we expose page table
* for an instant, it will be difficult to retract from
* concurrent faults and from rmap lookups.
*/
vmf->pte = NULL;//an empty PMD entry means the PTE table cannot exist yet; mark it NULL
} else {
/* See comment in pte_alloc_one_map() */
//check whether the PMD entry is unstable (huge-page/devmap transition in progress)
if (pmd_devmap_trans_unstable(vmf->pmd))
return 0;
/*
* A regular pmd is established and it can't morph into a huge
* pmd from under us anymore at this point because we hold the
* mmap_sem read mode and khugepaged takes it in write mode.
* So now it's safe to run pte_offset_map().
*/
//the PMD entry is valid: map the PTE table and record the PTE location
vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
vmf->orig_pte = *vmf->pte;
/*
* some architectures can have larger ptes than wordsize,
* e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and
* CONFIG_32BIT=y, so READ_ONCE cannot guarantee atomic
* accesses. The code below just needs a consistent view
* for the ifs and we later double check anyway with the
* ptl lock held. So here a barrier will do.
*/
barrier();//compiler barrier, see the comment above
if (pte_none(vmf->orig_pte)) {//is the PTE empty?
pte_unmap(vmf->pte);//PTE empty: unmap the PTE table
vmf->pte = NULL;//and mark the PTE pointer NULL
}
}
if (!vmf->pte) {//no PTE: this is a not-present fault
if (vma_is_anonymous(vmf->vma))
return do_anonymous_page(vmf);//anonymous private mapping: handle via do_anonymous_page
else
return do_fault(vmf);//file-backed or shared mapping: handle via do_fault
}
//below: the PTE exists
//page not present in memory: it was swapped out, bring it back via do_swap_page
if (!pte_present(vmf->orig_pte))
return do_swap_page(vmf);
//below: the page is present in memory
//NUMA hinting fault on an accessible vma: the page may live on another node, do_numa_page may migrate it back
if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
return do_numa_page(vmf);
//both PTE and page exist, so the fault was triggered by an access-permission mismatch
//get the page-table lock address (either a fine-grained per-page-table lock or the coarse per-mm lock, depending on configuration)
vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
spin_lock(vmf->ptl);//lock the page table
entry = vmf->orig_pte;
if (unlikely(!pte_same(*vmf->pte, entry)))//re-read the PTE; bail out if it changed under us
goto unlock;
if (vmf->flags & FAULT_FLAG_WRITE) {//the fault was triggered by a write access
if (!pte_write(entry))
return do_wp_page(vmf);//PTE not writable: perform copy-on-write via do_wp_page
entry = pte_mkdirty(entry);//PTE writable: set the dirty bit, the page data is being modified
}
entry = pte_mkyoung(entry);//set the accessed bit (hot page) so the page is less likely to be evicted
//check whether the PTE content actually changed
if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
vmf->flags & FAULT_FLAG_WRITE)) {
update_mmu_cache(vmf->vma, vmf->address, vmf->pte);//PTE changed: update the MMU page-table cache
} else {
/*
* This is needed only for protection faults but the arch code
* is not yet telling us if this is a protection fault or not.
* This still avoids useless tlb flushes for .text page faults
* with threads.
*/
//for write-triggered faults, flush_tlb_fix_spurious_fault avoids useless TLB flushes
if (vmf->flags & FAULT_FLAG_WRITE)
flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
}
unlock:
pte_unmap_unlock(vmf->pte, vmf->ptl);//unlock the page table
return 0;
}
handle_pte_fault主要采用vm_fault数据结构来管理很多参数,它主要通过vma首先判断addr所对应的pte是否为空,主要区分两种情形进行处理:
a.PTE为空:需要进一步区分是匿名映射还是文件映射发生的page fault,分别进行不同处理:
(1)do_anonymous_page
对于匿名映射则通过do_anonymous_page分别区分vma是只读和可写两种情况,分配物理页面,并设置pte到硬件页表;
(2)do_fault
对于文件映射或者是共享映射则通过do_fault区分不同情况,总体是将文件内容读取到物理页面,并为此物理页面建立与缺页地址的映射关系,具体还要分情况讨论,看完代码会说。
b.PTE不为空:需要进一步区分几种情况,如页面是否已经换出、是否是访问权限不匹配等分别作不同处理
(1)do_numa_page
由于各个节点的物理页不平衡,并且判断vma结构体是可以操作的,说明物理页在其他节点中,需要把物理页从其他节点中移回来到目前的节点内
(2)do_swap_page
由于匿名物理页面被回收了,所以进程再次访问一块虚拟地址时,就会产生缺页中断,最终进入到 do_swap_page,在这个函数中会重新分配新的页面,然后再从swap分区读回这块虚拟地址对应的数据
(3)do_wp_page
写时复制一般发生在父子进程之间,fork时copy_mm时会将父进程和子进程的页表项都设置为只读,无论是父进程或子进程执行写入都会触发page fault,通过此函数执行写时复制,分配新的page,拷贝旧page到新page, 并修改相应的页表项为读写。参数vmf->vma保存了线性区原有的读写权限
现在先看看a.(1) do_anonymous_page 函数:
static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct mem_cgroup *memcg;
struct page *page;
vm_fault_t ret = 0;
pte_t entry;
/* File mapping without ->vm_ops ? */
//shared mappings are not handled here; fail with SIGBUS
if (vma->vm_flags & VM_SHARED)
return VM_FAULT_SIGBUS;
/*
* Use pte_alloc() instead of pte_alloc_map(). We can't run
* pte_offset_map() on pmds where a huge pmd might be created
* from a different thread.
*
* pte_alloc_map() is safe to use under down_write(mmap_sem) or when
* parallel threads are excluded by other means.
*
* Here we only have down_read(mmap_sem).
*/
//if the PMD entry is empty the PTE table does not exist; allocate one
if (pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))
return VM_FAULT_OOM;
/* See the comment in pte_alloc_one_map() */
//bail out if the PMD entry is unstable (huge-page transition in progress)
if (unlikely(pmd_trans_unstable(vmf->pmd)))
return 0;
/* Use the zero-page for reads */
//for a read fault, if the process is allowed to use the zero page
if (!(vmf->flags & FAULT_FLAG_WRITE) &&
!mm_forbids_zeropage(vma->vm_mm)) {
//map the virtual page to the shared zero page
entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
vma->vm_page_prot));
//lock the page table and locate the PTE through the PMD
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
//if the PTE is no longer empty, another CPU raced with us and there is nothing to do
if (!pte_none(*vmf->pte))
goto unlock;
//check that the address space of the memory descriptor is stable
ret = check_stable_address_space(vma->vm_mm);
if (ret)
goto unlock;
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {//a userfaultfd is registered for missing pages
pte_unmap_unlock(vmf->pte, vmf->ptl);//unlock
return handle_userfault(vmf, VM_UFFD_MISSING);//hand the fault over to user space
}
goto setpte;//the zero-page PTE is ready; skip the physical allocation and just install it
}
//below: the write-fault path
/* Allocate our own private page. */
//make sure the vma has an anon_vma for reverse mapping; on failure return OOM
if (unlikely(anon_vma_prepare(vma)))
goto oom;
//allocate a movable, zero-initialized physical page, preferring highmem; on failure return OOM
page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
if (!page)
goto oom;
if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, &memcg,
false))
goto oom_free_page;
/*
* The memory barrier inside __SetPageUptodate makes sure that
* preceeding stores to the page contents become visible before
* the set_pte_at() write.
*/
__SetPageUptodate(page);//memory barrier: page contents become visible before the PTE is written
entry = mk_pte(page, vma->vm_page_prot);//build the PTE from the page frame number and access permissions
if (vma->vm_flags & VM_WRITE)//if the vma is writable
entry = pte_mkwrite(pte_mkdirty(entry));//make the PTE writable and dirty (the page data will be modified)
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);//lock the page table and locate the PTE for the new page
if (!pte_none(*vmf->pte))//another CPU already populated the PTE: release our page and return
goto release;
ret = check_stable_address_space(vma->vm_mm);
if (ret)
goto release;
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {//a userfaultfd is registered for missing pages
pte_unmap_unlock(vmf->pte, vmf->ptl);//release the page-table lock
mem_cgroup_cancel_charge(page, memcg, false);
put_page(page);//drop the reference on the page we allocated (decrements page_count)
return handle_userfault(vmf, VM_UFFD_MISSING);//hand the fault over to user space
}
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);//bump the mm anonymous-page counter
page_add_new_anon_rmap(page, vma, vmf->address, false);//add the anonymous page to the reverse-mapping (rmap) system
mem_cgroup_commit_charge(page, memcg, false, false);
//add the page to the LRU list so the reclaim algorithm can pick suitable physical pages to evict
lru_cache_add_active_or_unevictable(page, vma);
setpte:
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);//install the PTE value into the hardware page table
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, vmf->address, vmf->pte);//update the MMU page-table cache for the new PTE
unlock:
pte_unmap_unlock(vmf->pte, vmf->ptl);//release the page-table lock
return ret;
release:
mem_cgroup_cancel_charge(page, memcg, false);
put_page(page);//drop the reference on the page we allocated (decrements page_count)
goto unlock;
oom_free_page:
put_page(page);//drop the reference so the freshly allocated page is freed
oom:
return VM_FAULT_OOM;
}
先看看a.(2) do_fault 函数:
static vm_fault_t do_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct mm_struct *vm_mm = vma->vm_mm;
vm_fault_t ret;
/*
* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND
*/
if (!vma->vm_ops->fault) {//the vma provides no fault handler (every path below produces an error status)
/*
* If we find a migration pmd entry or a none pmd entry, which
* should never happen, return SIGBUS
*/
if (unlikely(!pmd_present(*vmf->pmd)))
ret = VM_FAULT_SIGBUS;//PMD entry not present: return an error
else {//PMD entry present: lock the page table and point pte at the entry
vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm,
vmf->pmd,
vmf->address,
&vmf->ptl);
/*
* Make sure this is not a temporary clearing of pte
* by holding ptl and checking again. A R/M/W update
* of pte involves: take ptl, clearing the pte so that
* we don't have concurrent modification by hardware
* followed by an update.
*/
if (unlikely(pte_none(*vmf->pte)))
ret = VM_FAULT_SIGBUS;//PTE empty: return an error
else
ret = VM_FAULT_NOPAGE;//PTE present: report that no new page is needed
pte_unmap_unlock(vmf->pte, vmf->ptl);//release the page-table lock
}
} else if (!(vmf->flags & FAULT_FLAG_WRITE))
ret = do_read_fault(vmf);//fault triggered by reading the file mapping
else if (!(vma->vm_flags & VM_SHARED))
ret = do_cow_fault(vmf);//fault triggered by writing a private file mapping: copy-on-write
else
ret = do_shared_fault(vmf);//fault triggered by writing a shared file mapping
/* preallocated pagetable is unused: free it */
//if the preallocated page table was not used, free it and clear prealloc_pte
if (vmf->prealloc_pte) {
pte_free(vm_mm, vmf->prealloc_pte);
vmf->prealloc_pte = NULL;
}
return ret;
}
a.(2) do_fault 函数有一下几种情况:
- do_read_fault:缺页由读文件导致,do_read_fault将文件内容读取到vmf->page页面,并为此物理页面建立与缺页地址的映射关系
- do_cow_fault:缺页由写私有文件导致,do_cow_fault分配vmf->cow_page页面,将文件内容读取到vmf->page页面,再把vmf->page拷贝到vmf->cow_page,并为vmf->cow_page页面分配pte,建立缺页地址与vmf->cow_page页面的映射关系
- do_shared_fault:缺页由写共享文件导致,do_shared_fault将文件内容读取到vmf->page页面,并为此物理页面建立与缺页地址的映射关系
现在看看a.(2) .1 do_read_fault函数怎么读文件:
static vm_fault_t do_read_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
vm_fault_t ret = 0;
/*
* Let's call ->map_pages() first and use ->fault() as fallback
* if page by the offset is not ready to be mapped (cold cache or
* something).
*/
//if the vma has a map_pages method and fault_around_bytes allows a window larger than one page:
//map_pages is the read-ahead operation discussed earlier; fault_around_bytes controls the window size (typically 64KB)
if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
//do_fault_around maps several pages around the fault to reduce the number of future page faults
ret = do_fault_around(vmf);
if (ret)
return ret;
}
ret = __do_fault(vmf);//fault-around not applicable: read one page of file content into vmf->page
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
ret |= finish_fault(vmf);//take the physical page for the fault and map it at the faulting address
unlock_page(vmf->page);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
put_page(vmf->page);//drop the reference taken on vmf->page (decrements page_count)
return ret;
}
再看看a.(2) .2 do_cow_fault函数怎么写时复制:
static vm_fault_t do_cow_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
vm_fault_t ret;
//make sure the vma has an anon_vma for reverse mapping; on failure return OOM
if (unlikely(anon_vma_prepare(vma)))
return VM_FAULT_OOM;
//preallocate a physical page to serve as the copy-on-write target
vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
if (!vmf->cow_page)
return VM_FAULT_OOM;
if (mem_cgroup_try_charge_delay(vmf->cow_page, vma->vm_mm, GFP_KERNEL,
&vmf->memcg, false)) {
put_page(vmf->cow_page);
return VM_FAULT_OOM;
}
ret = __do_fault(vmf);//read one page of file content into vmf->page
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
goto uncharge_out;
if (ret & VM_FAULT_DONE_COW)
return ret;
//copy the data from the page-cache page into the private copy
copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
__SetPageUptodate(vmf->cow_page);//memory barrier: page contents become visible before the PTE is written
//take the physical page for the fault and map it at the faulting address
ret |= finish_fault(vmf);
unlock_page(vmf->page);//unlock the page-cache page
put_page(vmf->page);//drop the reference on the page-cache page (decrements page_count)
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
goto uncharge_out;
return ret;
uncharge_out:
mem_cgroup_cancel_charge(vmf->cow_page, vmf->memcg, false);
put_page(vmf->cow_page);//drop the reference on cow_page (decrements page_count)
return ret;
}
再看看a.(2) .3 do_shared_fault函数:
static vm_fault_t do_shared_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
vm_fault_t ret, tmp;
ret = __do_fault(vmf);//read one page of file content into vmf->page
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
/*
* Check if the backing address space wants to know that the page is
* about to become writable
*/
if (vma->vm_ops->page_mkwrite) {//the vm_ops collection provides a page_mkwrite method
unlock_page(vmf->page);
tmp = do_page_mkwrite(vmf);//invoke page_mkwrite to notify the backing store the page is about to become writable
if (unlikely(!tmp ||
(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
put_page(vmf->page);
return tmp;
}
}
//take the physical page for the fault and map it at the faulting address
ret |= finish_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
VM_FAULT_RETRY))) {
unlock_page(vmf->page);//unlock the page
put_page(vmf->page);
return ret;
}
//mark the page dirty (its data was modified) and balance/write back some dirty pages
fault_dirty_shared_page(vma, vmf->page);
return ret;
}
a.(2) .1.do_read_fault函数、a.(2) .2.do_cow_fault函数、a.(2) .3.do_shared_fault函数主要用到了do_fault_around、__do_fault、finish_fault这几个函数。关于do_fault_around和__do_fault的源代码就不贴出来了,可以自己搜索一下,就在同文件中,do_fault_around主要是调用了vmf->vma->vm_ops->map_pages操作函数,__do_fault主要是调用了vma->vm_ops->fault操作函数。finish_fault这个映射过程还是要讲讲的
vm_fault_t finish_fault(struct vm_fault *vmf)
{
struct page *page;
vm_fault_t ret = 0;
/* Did we COW the page? */
//did we take the copy-on-write path? then map cow_page rather than the page-cache page
if ((vmf->flags & FAULT_FLAG_WRITE) &&
!(vmf->vma->vm_flags & VM_SHARED))
page = vmf->cow_page;
else
page = vmf->page;
/*
* check even for read faults because we might have lost our CoWed
* page
*/
if (!(vmf->vma->vm_flags & VM_SHARED))
ret = check_stable_address_space(vmf->vma->vm_mm);//for private mappings, check that the address space is stable
if (!ret)
ret = alloc_set_pte(vmf, vmf->memcg, page);//build and install the PTE mapping for the page
if (vmf->pte)
pte_unmap_unlock(vmf->pte, vmf->ptl);//release the page-table lock if it was taken
return ret;
}
其中分配内存的函数就是alloc_set_pte,我们来看看:
vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
struct page *page)
{
struct vm_area_struct *vma = vmf->vma;
bool write = vmf->flags & FAULT_FLAG_WRITE;
pte_t entry;
vm_fault_t ret;
if (pmd_none(*vmf->pmd) && PageTransCompound(page) &&
IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
/* THP on COW? */
VM_BUG_ON_PAGE(memcg, page);
ret = do_set_pmd(vmf, page);
if (ret != VM_FAULT_FALLBACK)
return ret;
}
//if the PTE table does not exist yet, allocate it and map the PTE for the faulting address
if (!vmf->pte) {
ret = pte_alloc_one_map(vmf);
if (ret)
return ret;
}
/* Re-check under ptl */
//re-check the PTE under the lock; if it is no longer empty another CPU raced with us, so back off
if (unlikely(!pte_none(*vmf->pte)))
return VM_FAULT_NOPAGE;
flush_icache_page(vma, page);//flush the icache for the page; architecture-specific, a no-op on many CPUs
entry = mk_pte(page, vma->vm_page_prot);//build the PTE from the page frame number and the vma access permissions
if (write)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
/* copy-on-write page */
if (write && !(vma->vm_flags & VM_SHARED)) {//writable private page: the copy-on-write case
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);//bump the mm anonymous-page counter
page_add_new_anon_rmap(page, vma, vmf->address, false);//add the anonymous page to the reverse-mapping (rmap) system
mem_cgroup_commit_charge(page, memcg, false, false);
//add the page to the LRU list so the reclaim algorithm can pick suitable physical pages to evict
lru_cache_add_active_or_unevictable(page, vma);
} else {
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));//bump the mm file-mapped page counter
page_add_file_rmap(page, false);
}
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);//install the PTE
/* no need to invalidate: a not-present page won't be cached */
update_mmu_cache(vma, vmf->address, vmf->pte);//PTE changed: update the MMU page-table cache
return 0;
}
简单的缺页异常处理流程图
详细的缺页异常处理流程表:
do_page_fault//函数来进行缺页处理
read_cr2()//缺页异常的地址默认存放于CR2寄存器中
exception_enter();//进入异常处理的状态
__do_page_fault//重要的缺页异常处理函数
prefetchw(&mm->mmap_sem);//提前获取读写信号量
if (unlikely(kmmio_fault(regs, address)))//mmio不应该发生缺页,返回
if (unlikely(fault_in_kernel_space(address))) {//缺页地址位于内核空间,
if (unlikely(kprobes_fault(regs)))//用户态的kprobes引起异常,返回
if (unlikely(smap_violation(error_code, regs))) {//smap的保护特性,访问内核态虚拟地址,返回
if (unlikely(faulthandler_disabled() || !mm)) {//处于中断状态,返回
find_vma//查找发生异常的地址对应的vma
if (unlikely(expand_stack(vma, address))) {//vma在堆栈区附近,则扩展堆栈区
vma_pkey(vma);//预先读取vma pkey
handle_mm_fault//然后进入真正的分配处理,这个是所有处理器共用的部分,专门处理用户空间的缺页异常
__set_current_state(TASK_RUNNING);//设置进程执行状态为运行
if (unlikely(is_vm_hugetlb_page(vma)))
ret = hugetlb_fault(vma->vm_mm, vma, address, flags);//巨页的缺页处理
else
ret = __handle_mm_fault(vma, address, flags);//普通页的缺页处理
pgd = pgd_offset(mm, address);//查找页全局目录索引
p4d = p4d_alloc(mm, pgd, address);//查找页四级目录索引,如果不存在,则创建
vmf.pud = pud_alloc(mm, p4d, address);//查找页上层目录索引,如果不存在,则创建
vmf.pmd = pmd_alloc(mm, vmf.pud, address);//查找页中间目录索引,如果不存在,则创建
handle_pte_fault(&vmf);//进入处理直接页表函数
if (!vmf->pte) {//如果页直接目录为空
if (vma_is_anonymous(vmf->vma))
return do_anonymous_page(vmf);//如果是私有匿名映射,使用do_anonymous_page处理缺页异常
pte_alloc//分配直接页表
if (!(vmf->flags & FAULT_FLAG_WRITE)//如果vma是只读的
pte_mkspecial//把虚拟页映射到一个专用的零页上
pte_offset_map_lock//通过页中间页表查找页直接页表项
//下面是可写的
alloc_zeroed_user_highpage_movable//从高端内存分配可移动的物理页
lru_cache_add_active_or_unevictable//把物理页添加到LRU
set_pte_at//将pte页表项值entry设置到硬件page_table页表项
update_mmu_cache;//更新页表项的TLB cache
else
return do_fault(vmf);//如果是文件映射或者是共享映射,使用do_fault处理缺页异常
if (!vma->vm_ops->fault) {//如果虚拟内存区域没有提供页错误异常操作方法
return
else if (!(vmf->flags & FAULT_FLAG_WRITE))
do_read_fault(vmf);//如果缺页异常是读文件触发的
if (vma->vm_ops->map_pages)
do_fault_around(vmf);//预读几个页的文件内容读取到vmf->page
__do_fault(vmf);//将一页文件内容读取到vmf->page
finish_fault(vmf);//获取物理页面,建立映射
else if (!(vma->vm_flags & VM_SHARED))
do_cow_fault(vmf);//如果缺页异常是写私有文件触发的,执行写时复制
alloc_page_vma//预先分配一个物理页,后面写时复制会用到
__do_fault(vmf);//将一页文件内容读取到vmf->page
copy_user_highpage//把文件的页缓存中的物理页的数据复制到副本物理页
__SetPageUptodate(vmf->cow_page);//指令屏障,
finish_fault(vmf);//获取物理页面,建立映射
else
do_shared_fault(vmf);//如果缺页异常是写共享文件触发的
__do_fault(vmf);//将一页文件内容读取到vmf->page
finish_fault(vmf);//获取物理页面,建立映射
fault_dirty_shared_page//设置页的脏标志位,平衡并回写一部分脏页
}
//下面是页直接目录存在的情况
if (!pte_present(vmf->orig_pte))
return do_swap_page(vmf);//页不在物理内存中,说明页被换出交换分区,要把页交换回来
if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
return do_numa_page(vmf);//物理页不平衡,说明物理页再其他节点中
if (vmf->flags & FAULT_FLAG_WRITE) {//如果缺页异常是由写操作触发的
return do_wp_page(vmf);//如果页表没有写权限,则调用do_wp_page执行写时复制
check_v8086_mode
exception_exit(prev_state);//退出异常处理的状态