用户空间缺页异常handle_pte_fault()分析--(下)--写时复制

最新推荐文章于 2024-04-30 15:38:57 发布

wh8_2011

最新推荐文章于 2024-04-30 15:38:57 发布

阅读量993

点赞数

分类专栏： linux-fork linux 内存管理文章标签： linux sched fork mm

linux 内存管理同时被 2 个专栏收录

42 篇文章 6 订阅

订阅专栏

linux-fork

9 篇文章 0 订阅

订阅专栏

在handle_pte_fault()中，如果触发异常的页存在于主存中，那么该异常往往是由写了一个只读页触发的，此时需要进行COW(写时复制)操作。例如，当一个父进程通过fork()创建了一个子进程时，子进程将会共享父进程的页框。之后，无论是父进程还是子进程要对相应的内存进行写操作，都要进行COW，也就是为自己重新分配一个页框，并把之前的数据复制到新页框中去，再写。

[cpp]view plaincopy 
   
 /*
  * handle_pte_fault - resolve a fault on a single pte (excerpt).
  *
  * @mm:      address space the fault occurred in
  * @vma:     vma covering @address
  * @address: faulting virtual address
  * @pte:     pointer to the page-table entry for @address
  * @pmd:     pmd that contains @pte; used to find the pte lock
  * @flags:   FAULT_FLAG_* bits; FAULT_FLAG_WRITE marks a write access
  *
  * Only the "page is present in memory" half of the function is shown; the
  * part elided with "..." below is not reproduced in this excerpt.
  *
  * In the visible part, a write to a non-writable pte is handed to
  * do_wp_page() with the pte lock still held, and do_wp_page()'s result is
  * returned. Every other case only refreshes the dirty/young bits of the
  * entry and returns 0.
  */
 static inline int handle_pte_fault(struct mm_struct *mm,
         struct vm_area_struct *vma, unsigned long address,
         pte_t *pte, pmd_t *pmd, unsigned int flags)
 {
     pte_t entry;
     spinlock_t *ptl;

     /* Snapshot of the entry, taken before the pte lock is acquired. */
     entry = *pte;

     ...
     ...
     ...
     /******** case: the page is present in main memory ********/

     ptl = pte_lockptr(mm, pmd);
     spin_lock(ptl);
     /* The entry changed between the snapshot and taking the lock: give up. */
     if (unlikely(!pte_same(*pte, entry)))
         goto unlock;
     if (flags & FAULT_FLAG_WRITE) { // the fault was raised by a write access
         if (!pte_write(entry)) // ...and the pte does not permit writing
             return do_wp_page(mm, vma, address, // write-protect fault: reuse or copy the page
                     pte, pmd, ptl, entry);
         entry = pte_mkdirty(entry);
     }
     entry = pte_mkyoung(entry);
     if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
         update_mmu_cache(vma, address, entry);
     } else {
         /*
          * This is needed only for protection faults but the arch code
          * is not yet telling us if this is a protection fault or not.
          * This still avoids useless tlb flushes for .text page faults
          * with threads.
          */
         if (flags & FAULT_FLAG_WRITE)
             flush_tlb_page(vma, address);
     }
 unlock:
     pte_unmap_unlock(pte, ptl);
     return 0;
 }

可以看到，handle_pte_fault()函数处理页存在于主存中的情况的关键操作都集中在do_wp_page()函数上。该函数是用来处理COW的，不过在COW之前先要做一些检查，比如说，如果对应的页只有一个进程使用，那么便可以直接修改页的权限为可读可写，而不进行COW。总之，不到不得已的情况下是不会进行COW的。

[cpp]view plaincopy 
   
 /*
  * do_wp_page - handle a write access to a present but write-protected pte.
  *
  * @mm:         address space the fault occurred in
  * @vma:        vma covering @address
  * @address:    faulting virtual address
  * @page_table: mapped pte for @address
  * @pmd:        pmd containing the pte; used to re-map and re-lock it
  * @ptl:        pte lock; the function unlocks it on every path, so it is
  *              expected to be held on entry
  * @orig_pte:   value of the pte as seen by the caller; re-checked every
  *              time the lock has been dropped and retaken
  *
  * The function either makes the existing page writable in place (the
  * "reuse" path) or allocates a new page, copies the old contents into it
  * and points the pte at the copy (the "gotten" path).
  *
  * Returns a VM_FAULT_* mask:
  *   - VM_FAULT_WRITE is set when the pte was made writable or replaced;
  *   - VM_FAULT_OOM when a new page could not be prepared;
  *   - the ->page_mkwrite() result when that callback reports an error or
  *     VM_FAULT_NOPAGE;
  *   - 0 when the pte changed underneath us and the fault should simply be
  *     retried.
  */
 static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
         unsigned long address, pte_t *page_table, pmd_t *pmd,
         spinlock_t *ptl, pte_t orig_pte)
 {
     struct page *old_page, *new_page;
     pte_t entry;
     int reuse = 0, ret = 0;
     int page_mkwrite = 0;
     struct page *dirty_page = NULL;

     old_page = vm_normal_page(vma, address, orig_pte); // page currently mapped by orig_pte
     if (!old_page) { // no struct page is available for this pte
         /*
          * VM_MIXEDMAP !pfn_valid() case
          *
          * We should not cow pages in a shared writeable mapping.
          * Just mark the pages writable as we can't do any dirty
          * accounting on raw pfn maps.
          */
          /* Shared and writable mapping: jump to reuse and keep using the frame behind orig_pte. */
         if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
                      (VM_WRITE|VM_SHARED))
             goto reuse;
         /* Otherwise jump to gotten and allocate a new page. */
         goto gotten;
     }

     /*
      * Take out anonymous pages first, anonymous shared vmas are
      * not dirty accountable.
      */
      /* If old_page is anonymous (and not a KSM page) and reuse_swap_page()
         reports it reusable (reuse != 0), it is written in place. */
     if (PageAnon(old_page) && !PageKsm(old_page)) {
         /* The page lock is contended: pin the page, drop the pte lock, wait for
            the page lock, then retake the pte lock and check that the pte was
            not changed in the meantime. */
         if (!trylock_page(old_page)) {
             page_cache_get(old_page);
             pte_unmap_unlock(page_table, ptl);
             lock_page(old_page);
             page_table = pte_offset_map_lock(mm, pmd, address,
                              &ptl);
             if (!pte_same(*page_table, orig_pte)) {
                 unlock_page(old_page);
                 page_cache_release(old_page);
                 goto unlock;
             }
             page_cache_release(old_page);
         }
         /* The pte is unchanged and the page is locked; let reuse_swap_page()
            decide. NOTE(review): the article's author describes this as a
            check that no other mapping uses the page (via _mapcount) --
            confirm against the definition of reuse_swap_page(). */
         reuse = reuse_swap_page(old_page);
         unlock_page(old_page);
     } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
                     (VM_WRITE|VM_SHARED))) { // the vma is both shared and writable
         /*
          * Only catch write-faults on shared writable pages,
          * read-only shared pages can get COWed by
          * get_user_pages(.write=1, .force=1).
          */
         if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
             struct vm_fault vmf;
             int tmp;

             vmf.virtual_address = (void __user *)(address &
                                 PAGE_MASK);
             vmf.pgoff = old_page->index;
             vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
             vmf.page = old_page;

             /*
              * Notify the address space that the page is about to
              * become writable so that it can prohibit this or wait
              * for the page to get into an appropriate state.
              *
              * We do this without the lock held, so that it can
              * sleep if it needs to.
              */
             page_cache_get(old_page); // extra reference protects old_page while the pte lock is dropped
             pte_unmap_unlock(page_table, ptl);

             /* Tell the mapping's owner that the page is about to become writable. */
             tmp = vma->vm_ops->page_mkwrite(vma, &vmf);

             /* The callback failed or refused: bail out through unwritable_page. */
             if (unlikely(tmp &
                     (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
                 ret = tmp;
                 goto unwritable_page;
             }
             /* The callback did not return with the page locked: lock it here. */
             if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
                 lock_page(old_page);
                 if (!old_page->mapping) {
                     ret = 0; /* retry the fault */
                     unlock_page(old_page);
                     goto unwritable_page;
                 }
             } else
                 VM_BUG_ON(!PageLocked(old_page));

             /*
              * Since we dropped the lock we need to revalidate
              * the PTE as someone else may have changed it.  If
              * they did, we just return, as we can count on the
              * MMU to tell us if they didn't also make it writable.
              */
              /* The callback accepted the request (the pte itself is not yet
                 updated); retake the pte lock and compare against orig_pte. */
             page_table = pte_offset_map_lock(mm, pmd, address,
                              &ptl);
             if (!pte_same(*page_table, orig_pte)) {
                 unlock_page(old_page);
                 page_cache_release(old_page);
                 goto unlock;
             }

             /* From here on old_page is locked and carries the extra reference
                taken above; both are dropped in the dirty_page handling. */
             page_mkwrite = 1;
         }
         dirty_page = old_page;
         get_page(dirty_page);
         reuse = 1;
     }

     if (reuse) { // reuse path: no copy, the write goes straight to the existing page
 reuse:
         flush_cache_page(vma, address, pte_pfn(orig_pte));
         entry = pte_mkyoung(orig_pte); // mark the entry young (accessed)
         entry = maybe_mkwrite(pte_mkdirty(entry), vma); // mark it dirty, and writable if maybe_mkwrite() allows it for this vma
         if (ptep_set_access_flags(vma, address, page_table, entry,1))
             update_mmu_cache(vma, address, entry);
         ret |= VM_FAULT_WRITE;
         goto unlock;
     }

     /*
      * Ok, we need to copy. Oh, well..
      */
      /* No alternative is left: perform the actual copy-on-write. The reference
         taken here keeps old_page alive while the pte lock is dropped. */
     page_cache_get(old_page);
 gotten:
     pte_unmap_unlock(page_table, ptl);

     if (unlikely(anon_vma_prepare(vma)))
         goto oom;

     if (is_zero_pfn(pte_pfn(orig_pte))) {
         new_page = alloc_zeroed_user_highpage_movable(vma, address); // the old mapping was the zero page: a zeroed page suffices, no copy
         if (!new_page)
             goto oom;
     } else {
         new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); // fresh page, filled by the copy below
         if (!new_page)
             goto oom;
         cow_user_page(new_page, old_page, address, vma); // copy the contents of old_page into new_page
     }
     __SetPageUptodate(new_page);

     /*
      * Don't let another task, with possibly unlocked vma,
      * keep the mlocked page.
      */
     if ((vma->vm_flags & VM_LOCKED) && old_page) {
         lock_page(old_page);    /* for LRU manipulation */
         clear_page_mlock(old_page);
         unlock_page(old_page);
     }

     if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
         goto oom_free_new;

     /*
      * Re-check the pte - we dropped the lock
      */
     page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
     if (likely(pte_same(*page_table, orig_pte))) {
         /* The new page is added as an anonymous mapping below, so the rss
            counters move from file to anon, or anon simply grows when there
            was no old struct page. */
         if (old_page) {
             if (!PageAnon(old_page)) {
                 dec_mm_counter(mm, file_rss);
                 inc_mm_counter(mm, anon_rss);
             }
         } else
             inc_mm_counter(mm, anon_rss);
         flush_cache_page(vma, address, pte_pfn(orig_pte));
         entry = mk_pte(new_page, vma->vm_page_prot); // build a pte that points at new_page
         entry = maybe_mkwrite(pte_mkdirty(entry), vma); // mark it dirty, and writable if maybe_mkwrite() allows it for this vma
         /*
          * Clear the pte entry and flush it first, before updating the
          * pte with the new entry. This will avoid a race condition
          * seen in the presence of one thread doing SMC and another
          * thread doing COW.
          */
         ptep_clear_flush(vma, address, page_table);
         page_add_new_anon_rmap(new_page, vma, address);
         /*
          * We call the notify macro here because, when using secondary
          * mmu page tables (such as kvm shadow page tables), we want the
          * new page to be mapped directly into the secondary page table.
          */
         set_pte_at_notify(mm, address, page_table, entry);
         update_mmu_cache(vma, address, entry);
         if (old_page) {
             /*
              * Only after switching the pte to the new page may
              * we remove the mapcount here. Otherwise another
              * process may come and find the rmap count decremented
              * before the pte is switched to the new page, and
              * "reuse" the old page writing into it while our pte
              * here still points into it and can be read by other
              * threads.
              *
              * The critical issue is to order this
              * page_remove_rmap with the ptep_clear_flush above.
              * Those stores are ordered by (if nothing else,)
              * the barrier present in the atomic_add_negative
              * in page_remove_rmap.
              *
              * Then the TLB flush in ptep_clear_flush ensures that
              * no process can access the old page before the
              * decremented mapcount is visible. And the old page
              * cannot be reused until after the decremented
              * mapcount is visible. So transitively, TLBs to
              * old page will be flushed before it can be reused.
              */
             page_remove_rmap(old_page);
         }

         /* Free the old page.. */
         /* new_page is now installed and must keep its reference. Aliasing it
            to old_page makes the two releases below both hit old_page;
            presumably one is for the mapping just removed and one for the
            reference taken before "gotten" -- confirm. */
         new_page = old_page;
         ret |= VM_FAULT_WRITE;
     } else
         mem_cgroup_uncharge_page(new_page); /* pte changed meanwhile: undo the charge, new_page is released below */

     if (new_page)
         page_cache_release(new_page);
     if (old_page)
         page_cache_release(old_page);
 unlock:
     pte_unmap_unlock(page_table, ptl);
     if (dirty_page) {
         /*
          * Yes, Virginia, this is actually required to prevent a race
          * with clear_page_dirty_for_io() from clearing the page dirty
          * bit after it clear all dirty ptes, but before a racing
          * do_wp_page installs a dirty pte.
          *
          * do_no_page is protected similarly.
          */
         if (!page_mkwrite) {
             wait_on_page_locked(dirty_page);
             set_page_dirty_balance(dirty_page, page_mkwrite);
         }
         /* Drops the reference taken by get_page(dirty_page) above. */
         put_page(dirty_page);
         if (page_mkwrite) {
             struct address_space *mapping = dirty_page->mapping;

             set_page_dirty(dirty_page);
             /* Release the page lock and the reference held since the
                ->page_mkwrite() call. */
             unlock_page(dirty_page);
             page_cache_release(dirty_page);
             if (mapping)    {
                 /*
                  * Some device drivers do not set page.mapping
                  * but still dirty their pages
                  */
                 balance_dirty_pages_ratelimited(mapping);
             }
         }

         /* file_update_time outside page_lock */
         if (vma->vm_file)
             file_update_time(vma->vm_file);
     }
     return ret;
 oom_free_new:
     page_cache_release(new_page);
 oom:
     /* The pte lock was already dropped at "gotten"; only page references
        (and, in the page_mkwrite case, the page lock) remain to be undone. */
     if (old_page) {
         if (page_mkwrite) {
             unlock_page(old_page);
             page_cache_release(old_page);
         }
         page_cache_release(old_page);
     }
     return VM_FAULT_OOM;

 unwritable_page:
     /* Reached with the pte lock already dropped; release the reference taken
        before calling ->page_mkwrite(). */
     page_cache_release(old_page);
     return ret;
 }