前面简单的分析了内核处理用户空间缺页异常的流程,进入到了handle_mm_fault()函数,该函数为触发缺页异常的地址address分配各级的页目录,也就是说现在已经拥有了一个和address配对的pte了,但是这个pte如何去映射物理页框,内核又得根据pte的状态进行分类和判断,而这个过程又会牵扯出一些其他的概念……这也是初读linux内核源码的最大障碍吧,在一些复杂的处理中,一个点往往可以延伸出一个面,容易让人迷失方向……因此后面打算分几次将这个函数分析完,自己也没有完全理解透,所以不到位的地方欢迎大家指出,一起交流~
- static inline int handle_pte_fault(struct mm_struct *mm,
- struct vm_area_struct *vma, unsigned long address,
- pte_t *pte, pmd_t *pmd, unsigned int flags)
- { /*
-    * Excerpt: dispatch a fault on a single PTE. Only the "page is not
-    * present" half is shown; the remainder of the function is elided
-    * (the "..." lines below). In the part shown, the return value is
-    * whatever the selected do_*() helper returns.
-    */
- pte_t entry;
- spinlock_t *ptl;
- entry = *pte;
- if (!pte_present(entry)) {//the page is not resident in main memory
- if (pte_none(entry)) {//the PTE is empty: the process has never accessed this page
- /* If both vm_ops and vm_ops->fault are set, this is a file-backed mapping */
- if (vma->vm_ops) {
- if (likely(vma->vm_ops->fault))
- return do_linear_fault(mm, vma, address,
- pte, pmd, flags, entry);
- }
- /* Otherwise hand out an anonymous page */
- return do_anonymous_page(mm, vma, address,
- pte, pmd, flags);
- }
- /* The PTE belongs to a nonlinear file mapping whose page is no longer resident */
- if (pte_file(entry))
- return do_nonlinear_fault(mm, vma, address,
- pte, pmd, flags, entry);
- /* Not present, but the PTE still carries information: the page was swapped out by the kernel and must be swapped back in */
- return do_swap_page(mm, vma, address,
- pte, pmd, flags, entry);
- }
- ...
- ...
- }
首先要确定的一点就是pte对应的页是否驻留在主存中,因为pte有可能之前映射了页,但是该页被换出了。上面的代码给出了pte对应的页没有驻留在主存中的情况。如果pte对应的页没有驻留在主存中,且没有映射任何页,即pte_present()返回0,pte_none()返回非0(真),则要判断要分配一个匿名页还是一个映射页。在Linux虚拟内存中,如果页对应的vma映射的是文件,则称为映射页,如果不是映射的文件,则称为匿名页。两者最大的区别体现在页和vma的组织上,因为在页框回收处理时要通过页来逆向搜索映射了该页的vma。对于匿名页的逆映射,vma都是通过vma结构体中的anon_vma_node(链表节点)和anon_vma(链表头)组织起来,再把该链表头的信息保存在页描述符中;而映射页和vma的组织是通过vma中的优先树节点和页描述符中的mapping->i_mmap优先树树根进行组织的,具体可以参看ULK3。
来看基于文件的映射的处理:
- static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pte_t *page_table, pmd_t *pmd,
- unsigned int flags, pte_t orig_pte)
- { /*
-    * Fault on an empty PTE of a mapping that has a vm_ops->fault handler:
-    * compute the page offset for address and delegate to __do_fault().
-    * Returns whatever __do_fault() returns.
-    */
- /* pgoff = page index of address inside the vma, plus the vma's own vm_pgoff */
- pgoff_t pgoff = (((address & PAGE_MASK)
- - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
- pte_unmap(page_table);//if page_table was set up through a temporary kernel mapping, release that mapping
- return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
- }
关键函数__do_fault():
- static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmd,
- pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
- { /*
-    * Common worker for do_linear_fault() and do_nonlinear_fault().
-    * Steps visible in this function:
-    *  1. call vma->vm_ops->fault() for offset pgoff and make sure the
-    *     returned page (vmf.page) is locked;
-    *  2. on a write fault, either copy it into a freshly allocated page
-    *     (vma without VM_SHARED) or call vm_ops->page_mkwrite() if the
-    *     vma provides one (VM_SHARED);
-    *  3. take the PTE lock and, if the PTE still equals orig_pte, install
-    *     the new entry; otherwise undo the charge/allocation.
-    * Returns the VM_FAULT_* bits accumulated in ret.
-    */
- pte_t *page_table;
- spinlock_t *ptl;
- struct page *page;
- pte_t entry;
- int anon = 0;
- int charged = 0;
- struct page *dirty_page = NULL;
- struct vm_fault vmf;
- int ret;
- int page_mkwrite = 0;
- vmf.virtual_address = (void __user *)(address & PAGE_MASK);
- vmf.pgoff = pgoff;
- vmf.flags = flags;
- vmf.page = NULL;
- ret = vma->vm_ops->fault(vma, &vmf);//call the mapping's fault handler so that the required file data is read into the mapped page
- if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
- return ret;
- if (unlikely(PageHWPoison(vmf.page))) {
- if (ret & VM_FAULT_LOCKED)
- unlock_page(vmf.page);
- return VM_FAULT_HWPOISON;
- }
- /*
- * For consistency in subsequent calls, make the faulted page always
- * locked.
- */
- if (unlikely(!(ret & VM_FAULT_LOCKED)))
- lock_page(vmf.page);
- else
- VM_BUG_ON(!PageLocked(vmf.page));
- /*
- * Should we do an early C-O-W break?
- */
- page = vmf.page;
- if (flags & FAULT_FLAG_WRITE) {//write access
- if (!(vma->vm_flags & VM_SHARED)) {//private mapping: create a copy of the page (copy-on-write)
- anon = 1;// mark the page being installed as anonymous
- if (unlikely(anon_vma_prepare(vma))) {//set up an anon_vma instance for the vma
- ret = VM_FAULT_OOM;
- goto out;
- }
- page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,//allocate a page
- vma, address);
- if (!page) {
- ret = VM_FAULT_OOM;
- goto out;
- }
- if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
- ret = VM_FAULT_OOM;
- page_cache_release(page);
- goto out;
- }
- charged = 1;
- /*
- * Don't let another task, with possibly unlocked vma,
- * keep the mlocked page.
- */
- if (vma->vm_flags & VM_LOCKED)
- clear_page_mlock(vmf.page);
- /* Make the private copy: copy the data from vmf.page into the newly allocated page */
- copy_user_highpage(page, vmf.page, address, vma);
- __SetPageUptodate(page);
- } else {
- /*
- * If the page will be shareable, see if the backing
- * address space wants to know that the page is about
- * to become writable
- */
- if (vma->vm_ops->page_mkwrite) {
- int tmp;
- unlock_page(page);
- vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
- tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
- if (unlikely(tmp &
- (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
- ret = tmp;
- goto unwritable_page;
- }
- if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
- lock_page(page);
- if (!page->mapping) {
- ret = 0; /* retry the fault */
- unlock_page(page);
- goto unwritable_page;
- }
- } else
- VM_BUG_ON(!PageLocked(page));
- page_mkwrite = 1;
- }
- }
- }
- page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
- /*
- * This silly early PAGE_DIRTY setting removes a race
- * due to the bad i386 page protection. But it's valid
- * for other architectures too.
- *
- * Note that if FAULT_FLAG_WRITE is set, we either now have
- * an exclusive copy of the page, or this is a shared mapping,
- * so we can make it writable and dirty to avoid having to
- * handle that later.
- */
- /* Only go through if we didn't race with anybody else... */
- if (likely(pte_same(*page_table, orig_pte))) {//no race: the PTE still holds the same content as before
- flush_icache_page(vma, page);
- entry = mk_pte(page, vma->vm_page_prot);//build a PTE value that points at the physical page
- /* On a write fault, mark the entry dirty and pass it through maybe_mkwrite() */
- if (flags & FAULT_FLAG_WRITE)
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- /* If the page produced above is anonymous, hook it into the reverse mapping */
- if (anon) {
- inc_mm_counter(mm, anon_rss);
- page_add_new_anon_rmap(page, vma, address);//set up the reverse mapping between the new anonymous page and the vma
- } else {
- inc_mm_counter(mm, file_rss);
- page_add_file_rmap(page);//set up the ordinary (file) mapping between the page and the vma
- if (flags & FAULT_FLAG_WRITE) {
- dirty_page = page;
- get_page(dirty_page);
- }
- }
- set_pte_at(mm, address, page_table, entry);//store entry in the slot page_table points to
- /* no need to invalidate: a not-present page won't be cached */
- update_mmu_cache(vma, address, entry);
- } else {
- if (charged)
- mem_cgroup_uncharge_page(page);
- if (anon)
- page_cache_release(page);
- else
- anon = 1; /* no anon but release faulted_page */
- }
- pte_unmap_unlock(page_table, ptl);
- out:
- if (dirty_page) {
- struct address_space *mapping = page->mapping;
- if (set_page_dirty(dirty_page))
- page_mkwrite = 1;
- unlock_page(dirty_page);
- put_page(dirty_page);
- if (page_mkwrite && mapping) {
- /*
- * Some device drivers do not set page.mapping but still
- * dirty their pages
- */
- balance_dirty_pages_ratelimited(mapping);
- }
- /* file_update_time outside page_lock */
- if (vma->vm_file)
- file_update_time(vma->vm_file);
- } else {
- unlock_page(vmf.page);
- if (anon)
- page_cache_release(vmf.page);
- }
- return ret;
- unwritable_page:
- page_cache_release(page);
- return ret;
- }
首先要做的就是调用vma->vm_ops中定义好的fault()函数,将所需的数据从文件读入到映射页中,该函数还会将vma插入到映射页的mapping->i_mmap优先树中。
文件一般以共享的方式进行映射,接下来就要判断触发异常的操作是否包含写操作,如果是写操作并且该vma不是以共享的方式映射该页,则要进行写时复制,也就是创建一个新的页来供该vma读写,此时会申请一个匿名页,并将数据拷贝到该匿名页中。
接下来就要计算出page对应的pte值是多少,并将page_table指向的pte以该值进行填充,这样就完成了页表项到物理页的映射
再来看分配匿名页的处理
- static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pte_t *page_table, pmd_t *pmd,
- unsigned int flags)
- { /*
-    * Fill an empty PTE of a vma that has no fault handler. A read fault
-    * maps the zero page (my_zero_pfn); a write fault allocates a zeroed
-    * private page. In both cases the PTE is re-checked under the PTE lock
-    * and left alone if it is no longer empty.
-    * Returns 0, VM_FAULT_SIGBUS or VM_FAULT_OOM.
-    */
- struct page *page;
- spinlock_t *ptl;
- pte_t entry;
- pte_unmap(page_table);
- /* Check if we need to add a guard page to the stack */
- if (check_stack_guard_page(vma, address) < 0)
- return VM_FAULT_SIGBUS;
- /* Use the zero-page for reads */
- /* On a read, point entry at an existing zero-filled page: this is the process's first access to the page,
- so its contents do not matter, and the allocation of a new page is deferred even further */
- if (!(flags & FAULT_FLAG_WRITE)) {
- entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
- vma->vm_page_prot));
- page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
- if (!pte_none(*page_table))
- goto unlock;
- goto setpte;
- }
- /* On a write, a new page has to be allocated */
- /* Allocate our own private page. */
- if (unlikely(anon_vma_prepare(vma)))//set up an anon_vma instance for the vma
- goto oom;
- /* Allocate a zero-filled page */
- page = alloc_zeroed_user_highpage_movable(vma, address);
- if (!page)
- goto oom;
- __SetPageUptodate(page);
- if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
- goto oom_free_page;
- /* Build the PTE value for the page */
- entry = mk_pte(page, vma->vm_page_prot);
- /* If the vma is writable (VM_WRITE), make the entry writable and dirty */
- if (vma->vm_flags & VM_WRITE)
- entry = pte_mkwrite(pte_mkdirty(entry));
- page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
- if (!pte_none(*page_table))
- goto release;
- inc_mm_counter(mm, anon_rss);
- page_add_new_anon_rmap(page, vma, address);//set up the reverse mapping between the vma and the anonymous page
- setpte:
- set_pte_at(mm, address, page_table, entry);//store entry in the PTE that page_table points to
- /* No need to invalidate - it was non-present before */
- update_mmu_cache(vma, address, entry);//update the MMU cache
- unlock:
- pte_unmap_unlock(page_table, ptl);
- return 0;
- release:
- mem_cgroup_uncharge_page(page);
- page_cache_release(page);
- goto unlock;
- oom_free_page:
- page_cache_release(page);
- oom:
- return VM_FAULT_OOM;
- }
匿名页分配的工作和__do_fault()中分配匿名页差不多,只不过前面多了一个读写的判断,如果是读的话,不会分配匿名页,而是让pte指向一个被0填充的页,这样就进一步推迟了页的分配。也许你会觉得奇怪,既然要读数据怎么可以分配一个事先准备好的全0的页,其实仔细想想就会明白,缺页异常处理进行到这里,一定是第一次访问相应的内存时才会触发,匿名页对应的一般都是堆,栈这些区域,对这些区域的访问一定先是写而不是读,所以对于这种操作本身就不正常,分配一个被0填充的页使用户进程读出来的都是0也许会更安全一些。
如果不是这两种情况的话,也就是说pte_none()返回的是0,那就说明pte之前映射过页,只是该页已被换出
如果该页之前是用来进行非线性文件映射的话,其处理的主体函数就是上面介绍过的__do_fault()
- static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pte_t *page_table, pmd_t *pmd,
- unsigned int flags, pte_t orig_pte)
- { /*
-    * Fault on a not-present PTE for which pte_file() is true: decode the
-    * file page offset from orig_pte and delegate to __do_fault() with
-    * FAULT_FLAG_NONLINEAR set. Returns 0 when pte_unmap_same() fails,
-    * VM_FAULT_SIGBUS when the vma lacks VM_NONLINEAR, otherwise the
-    * result of __do_fault().
-    */
- pgoff_t pgoff;
- flags |= FAULT_FLAG_NONLINEAR;
- if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
- return 0;
- if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {//the vma must carry the nonlinear-mapping attribute
- /*
- * Page table corrupted: show pte and kill process.
- */
- print_bad_pte(vma, address, orig_pte, NULL);
- return VM_FAULT_SIGBUS;
- }
- pgoff = pte_to_pgoff(orig_pte);//recover the mapped file offset from the PTE
- return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
- }
pte_to_pgoff()这个函数是和pgoff_to_pte()相对的一组操作。在非线性文件映射的页被换出时,其映射文件的偏移会以PAGE_SIZE为单位进行编码,存储到其pte中,所以当要重新换入该页时,要进行相应的解码计算出pgoff,再由__do_fault()进行处理!
对于页没有驻留在主存的情况中的最后一种处理方式,do_swap_page(),留在下次再做分析!