linux进程地址空间(2) 缺页异常详解(3)写时复制COW详解

接上一篇

现在分析写时复制COW。对于写时复制,首先把握一点:只有写操作时才有可能触发写时复制。所以首先总要判断异常flag是否含有标志FAULT_FLAG_WRITE;然后判断二级页表条目值是否含有L_PTE_WRITE标志,该标志意味着这个物理页是否可写。如果不可写,则说明应该进入写时复制流程,调用处理函数do_wp_page。

可见,COW的应用场合就是访问映射的页不可写,它包括两种情况:第一种是fork导致;第二种是如malloc后第一次对它进行读操作,获取到的是zero_pfn零页,当再次写时需要写时复制。共同特点是虚拟地址的二级页表映射内容在内存中,但是对应的页不可写。在函数do_wp_page中对于这两种情况的处理基本相似。

另外一个应该知道的是,如果该页只有一个进程在用,那么就直接修改这个页为可写就行了,不需要搞COW。总之,不到不得已的情况下是不会进行COW的,这也是内核对于COW使用的原则,就是尽量不使用。

函数do_wp_page源码如下

/*
 * do_wp_page - handle a write fault on a page that is mapped present
 * but read-only, i.e. the copy-on-write (COW) path.
 *
 * Two situations reach here:
 *   1) the PTE maps the shared zero page (zero_pfn, e.g. first read
 *      after malloc) -- vm_normal_page() returns NULL for it;
 *   2) the page is shared, typically between parent and child after
 *      fork().
 * The function first tries hard to AVOID copying: if the page has a
 * single user, or the mapping is shared+writable, it just makes the
 * PTE writable (label "reuse").  Only otherwise does it allocate a new
 * page, zero/copy it, and switch the PTE over (label "gotten").
 *
 * Entered with the pte lock 'ptl' held and 'page_table' mapped; both
 * are released before returning.  Returns VM_FAULT_* bits.
 */
static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                   unsigned long address, pte_t *page_table, pmd_t *pmd,
                   spinlock_t *ptl, pte_t orig_pte)
{
         struct page *old_page, *new_page;
         pte_t entry;
         int reuse = 0, ret = 0;
         int page_mkwrite = 0;
         struct page *dirty_page = NULL;
    /* Get the descriptor of the faulting non-writable page.  For COW
     * case 1 (readable zero_pfn page) this returns NULL and we take
     * the if-branch below; for case 2 (page shared after fork) it
     * returns the page descriptor normally. */
         old_page = vm_normal_page(vma, address, orig_pte);
         if (!old_page) {
                   /*
                    * VM_MIXEDMAP !pfn_valid() case
                    *
                    * We should not cow pages in a shared writeable mapping.
                    * Just mark the pages writable as we can't do any dirty
                    * accounting on raw pfn maps.
                    */
                   /* If this vma is both writable and shared, jump to
                    * label "reuse" -- no COW; otherwise jump to label
                    * "gotten" to allocate a fresh page. */
                   if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
                                          (VM_WRITE|VM_SHARED))
                            goto reuse;
                   goto gotten;
         }
         /*
          * Take out anonymous pages first, anonymous shared vmas are
          * not dirty accountable.
          */
/* The if/else branches below all try to avoid COW: each one attempts
 * to prove the page can be written in place (reach label "reuse"). */

         /* If old_page is anonymous (per page->mapping) and this task
          * is its only user (reuse_swap_page() inspects _mapcount),
          * skip COW entirely and let this task keep the page. */
         if (PageAnon(old_page) && !PageKsm(old_page)) {
        /* Rule out another task holding the page lock (page->flags);
         * if contended, drop the pte lock, sleep on the page lock and
         * re-validate the PTE afterwards. */
                   if (!trylock_page(old_page)) {
                            page_cache_get(old_page);
                            pte_unmap_unlock(page_table, ptl);
                            lock_page(old_page);
                            page_table = pte_offset_map_lock(mm, pmd, address,
                                                                  &ptl);
                            if (!pte_same(*page_table, orig_pte)) {
                                     unlock_page(old_page);
                                     page_cache_release(old_page);
                                     goto unlock;
                            }
                            page_cache_release(old_page);
                   }
        /* Reusable iff the page's _mapcount shows a single user. */
                   reuse = reuse_swap_page(old_page);
                   unlock_page(old_page);
         }
    /* vma is shared and writable: see whether COW can still be
     * avoided for this file-backed page. */
    else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
                                               (VM_WRITE|VM_SHARED))) {
                   /*
                    * Only catch write-faults on shared writable pages,
                    * read-only shared pages can get COWed by
                    * get_user_pages(.write=1, .force=1).
                    */
                   if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
                            struct vm_fault vmf;
                            int tmp;
                            vmf.virtual_address = (void __user *)(address &
                                                                           PAGE_MASK);
                            vmf.pgoff = old_page->index;
                            vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
                            vmf.page = old_page;
                            /*
                             * Notify the address space that the page is about to
                             * become writable so that it can prohibit this or wait
                             * for the page to get into an appropriate state.
                             *
                             * We do this without the lock held, so that it can
                             * sleep if it needs to.
                             */
                            page_cache_get(old_page);
                            pte_unmap_unlock(page_table, ptl);
                            tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
                            if (unlikely(tmp &
                                               (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
                                     ret = tmp;
                                     goto unwritable_page;
                            }
                            if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
                                     lock_page(old_page);
                                     if (!old_page->mapping) {
                                               ret = 0; /* retry the fault */
                                               unlock_page(old_page);
                                               goto unwritable_page;
                                     }
                            } else
                                     VM_BUG_ON(!PageLocked(old_page));
                            /*
                             * Since we dropped the lock we need to revalidate
                             * the PTE as someone else may have changed it.  If
                             * they did, we just return, as we can count on the
                             * MMU to tell us if they didn't also make it writable.
                             */
                            page_table = pte_offset_map_lock(mm, pmd, address,
                                                                  &ptl);
                            if (!pte_same(*page_table, orig_pte)) {
                                     unlock_page(old_page);
                                     page_cache_release(old_page);
                                     goto unlock;
                            }
                            page_mkwrite = 1;
                   }
                   dirty_page = old_page;
                   get_page(dirty_page);
                   reuse = 1;
         }
    /* reuse: no COW -- keep writing old_page in place. */
         if (reuse) {
reuse:
                   flush_cache_page(vma, address, pte_pfn(orig_pte));
                   entry = pte_mkyoung(orig_pte);
        /* Rewrite the PTE attributes: mark the entry dirty, and
         * writable if the vma allows it. */
                   entry = maybe_mkwrite(pte_mkdirty(entry), vma);
                   if (ptep_set_access_flags(vma, address, page_table, entry,1))
                            update_mmu_cache(vma, address, entry);
                   ret |= VM_FAULT_WRITE;
                   goto unlock;
         }
         /*
          * Ok, we need to copy. Oh, well..
          */
/* The real COW starts here. */
    /* First take a reference on the old page (get_page() bumps
     * page->_count). */
         page_cache_get(old_page);
gotten:
         pte_unmap_unlock(page_table, ptl);
         if (unlikely(anon_vma_prepare(vma)))
                   goto oom;
    /* COW case 1 (zero_pfn): allocate a new page that is already
     * zeroed. */
         if (is_zero_pfn(pte_pfn(orig_pte))) {
                   new_page = alloc_zeroed_user_highpage_movable(vma, address);
                   if (!new_page)
                            goto oom;
         }
    /* COW case 2 (fork): allocate a page and copy old_page's contents
     * (one full page) into new_page. */
    else {
                   new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
                   if (!new_page)
                            goto oom;
                   cow_user_page(new_page, old_page, address, vma);
         }
         __SetPageUptodate(new_page);
         /*
          * Don't let another task, with possibly unlocked vma,
          * keep the mlocked page.
          */
         /* COW case 2: if the vma is mlocked, clear the old page's
          * mlock state so the stale page is not kept pinned. */
         if ((vma->vm_flags & VM_LOCKED) && old_page) {
                   lock_page(old_page);      /* for LRU manipulation */
                   clear_page_mlock(old_page);
                   unlock_page(old_page);
         }
    /* No-op when memory cgroups are compiled out (per the original
     * author's note) -- NOTE(review): config-dependent, confirm. */
         if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
                   goto oom_free_new;
         /*
          * Re-check the pte - we dropped the lock
          */
         /* Re-fetch the PTE pointer for the faulting address now that
          * the lock was dropped, and re-validate against orig_pte. */
         page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
         if (likely(pte_same(*page_table, orig_pte))) {
                   if (old_page) {
                            if (!PageAnon(old_page)) {
                                     dec_mm_counter(mm, file_rss);
                                     inc_mm_counter(mm, anon_rss);
                            }
                   } else
                            inc_mm_counter(mm, anon_rss);
                   flush_cache_page(vma, address, pte_pfn(orig_pte));
        /* Build the new page's PTE, marked dirty (and writable if the
         * vma allows). */
                   entry = mk_pte(new_page, vma->vm_page_prot);
                   entry = maybe_mkwrite(pte_mkdirty(entry), vma);
                   /*
                    * Clear the pte entry and flush it first, before updating the
                    * pte with the new entry. This will avoid a race condition
                    * seen in the presence of one thread doing SMC and another
                    * thread doing COW.
                    */
                   ptep_clear_flush(vma, address, page_table);
                   page_add_new_anon_rmap(new_page, vma, address);
                   /*
                    * We call the notify macro here because, when using secondary
                    * mmu page tables (such as kvm shadow page tables), we want the
                    * new page to be mapped directly into the secondary page table.
                    */
                   set_pte_at_notify(mm, address, page_table, entry);
                   update_mmu_cache(vma, address, entry);
                   if (old_page) {
                            /*
                             * Only after switching the pte to the new page may
                             * we remove the mapcount here. Otherwise another
                             * process may come and find the rmap count decremented
                             * before the pte is switched to the new page, and
                             * "reuse" the old page writing into it while our pte
                             * here still points into it and can be read by other
                             * threads.
                             *
                             * The critical issue is to order this
                             * page_remove_rmap with the ptp_clear_flush above.
                             * Those stores are ordered by (if nothing else,)
                             * the barrier present in the atomic_add_negative
                             * in page_remove_rmap.
                             *
                             * Then the TLB flush in ptep_clear_flush ensures that
                             * no process can access the old page before the
                             * decremented mapcount is visible. And the old page
                             * cannot be reused until after the decremented
                             * mapcount is visible. So transitively, TLBs to
                             * old page will be flushed before it can be reused.
                             */
                            page_remove_rmap(old_page);
                   }
                   /* Free the old page.. */
                   /* Pointer swap: the page_cache_release(new_page)
                    * below will now drop the reference taken on
                    * old_page instead. */
                   new_page = old_page;
                   ret |= VM_FAULT_WRITE;
         }
    else
                   mem_cgroup_uncharge_page(new_page);
         if (new_page)
                   page_cache_release(new_page);
         if (old_page)
                   page_cache_release(old_page);
unlock:
         pte_unmap_unlock(page_table, ptl);
         if (dirty_page) {
                   /*
                    * Yes, Virginia, this is actually required to prevent a race
                    * with clear_page_dirty_for_io() from clearing the page dirty
                    * bit after it clear all dirty ptes, but before a racing
                    * do_wp_page installs a dirty pte.
                    *
                    * do_no_page is protected similarly.
                    */
                   if (!page_mkwrite) {
                            wait_on_page_locked(dirty_page);
                            set_page_dirty_balance(dirty_page, page_mkwrite);
                   }
                   put_page(dirty_page);
                   if (page_mkwrite) {
                            struct address_space *mapping = dirty_page->mapping;
                            set_page_dirty(dirty_page);
                            unlock_page(dirty_page);
                            page_cache_release(dirty_page);
                            if (mapping)      {
                                     /*
                                      * Some device drivers do not set page.mapping
                                      * but still dirty their pages
                                      */
                                     balance_dirty_pages_ratelimited(mapping);
                            }
                   }
                   /* file_update_time outside page_lock */
                   if (vma->vm_file)
                            file_update_time(vma->vm_file);
         }
         return ret;
oom_free_new:
         page_cache_release(new_page);
oom:
         if (old_page) {
                   if (page_mkwrite) {
                            unlock_page(old_page);
                            page_cache_release(old_page);
                   }
                   page_cache_release(old_page);
         }
         return VM_FAULT_OOM;
unwritable_page:
         page_cache_release(old_page);
         return ret;
}

一级一级返回,最终返回到函数__do_page_fault,会根据返回值fault累计task的相应异常类型次数(maj_flt/min_flt),并最终把fault返回给函数do_page_fault,释放信号量mmap_sem。正常情况下就返回0,缺页异常处理完毕。

  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值