linux进程地址空间(2) 缺页异常详解(3)写时复制COW详解

最新推荐文章于 2025-06-06 14:37:28 发布

FSak47

最新推荐文章于 2025-06-06 14:37:28 发布

阅读量3.2k

点赞数 1

CC 4.0 BY-SA版权

分类专栏：水滴石穿　文章标签：写时复制COW、缺页异常处理、用户进程缺页异常、进程地址空间、malloc、mmap

本文链接：https://blog.csdn.net/u010246947/article/details/10441739

水滴石穿专栏收录该内容

26 篇文章

订阅专栏

接上一篇

现在分析写时复制COW。对于写时复制，首先要把握一点：只有写操作才有可能触发写时复制，所以总是先判断异常flag是否含有标志FAULT_FLAG_WRITE，然后判断二级页表条目值是否含有L_PTE_WRITE标志，该标志表示这个物理页是否可写；如果不可写，则说明应该进入写时复制流程，调用处理函数do_wp_page；

可见，COW的应用场合就是所访问的已映射页不可写，它包括两种情况：第一种是如malloc后第一次对它进行读操作，获取到的是zero_pfn零页，当之后再写时需要写时复制；第二种是fork导致的父子进程共享页。两者的共同特点是虚拟地址的二级页表映射内容在内存中，但是对应的页不可写。在函数do_wp_page中，对这两种情况的处理基本相似；

另外应该知道的是，如果该页只有一个进程在用，那么直接把这个页改为可写就行了，不需要进行COW。总之，不到不得已的情况下是不会进行COW的，这也是内核使用COW的原则：尽量不使用；

函数do_wp_page源码如下：

/*
 * do_wp_page - handle a write fault on a pte that is present but not
 * writable, either by making the existing page writable ("reuse") or by
 * giving the faulting mm a private copy of it (copy-on-write).
 *
 * @mm:         address space in which the fault happened
 * @vma:        vma covering @address
 * @address:    faulting virtual address
 * @page_table: pte for @address; every return path below unmaps it and
 *              drops @ptl via pte_unmap_unlock(), so the caller presumably
 *              passes it mapped and locked (confirm against the caller)
 * @pmd:        pmd entry, used to re-look-up the pte after the lock was dropped
 * @ptl:        page-table lock protecting @page_table
 * @orig_pte:   pte value sampled by the caller; compared with pte_same()
 *              after every window in which the lock was released
 *
 * Returns a VM_FAULT_* mask: VM_FAULT_WRITE is OR-ed in when the pte was
 * made writable or replaced, VM_FAULT_OOM when anon_vma preparation, page
 * allocation or memcg charging fails, the ->page_mkwrite() result when that
 * callback reports VM_FAULT_ERROR/VM_FAULT_NOPAGE, and 0 when the pte
 * changed underneath us or the fault simply has to be retried.
 */
static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		spinlock_t *ptl, pte_t orig_pte)
{
	struct page *old_page, *new_page;
	pte_t entry;
	int reuse = 0, ret = 0;
	int page_mkwrite = 0;
	struct page *dirty_page = NULL;

	/*
	 * Look up the struct page behind the write-protected pte.
	 * Per the article: for the read-only zero page (zero_pfn) this
	 * yields NULL and we take the branch below; for a page shared
	 * between parent and child after fork() the page descriptor is
	 * returned normally.
	 */
	old_page = vm_normal_page(vma, address, orig_pte);
	if (!old_page) {
		/*
		 * VM_MIXEDMAP !pfn_valid() case
		 *
		 * We should not cow pages in a shared writeable mapping.
		 * Just mark the pages writable as we can't do any dirty
		 * accounting on raw pfn maps.
		 */
		/* Shared and writable vma: reuse in place, otherwise copy. */
		if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
				     (VM_WRITE|VM_SHARED))
			goto reuse;
		goto gotten;
	}

	/*
	 * Take out anonymous pages first, anonymous shared vmas are
	 * not dirty accountable.
	 */
	/*
	 * Both branches below try to avoid a copy by setting "reuse".
	 * Anonymous, non-KSM page: if reuse_swap_page() reports that we
	 * are the only user, keep the page and just make it writable.
	 */
	if (PageAnon(old_page) && !PageKsm(old_page)) {
		/*
		 * The page lock is needed for reuse_swap_page(). If it is
		 * contended, pin the page, drop the pte lock so that we may
		 * sleep in lock_page(), then revalidate the pte.
		 */
		if (!trylock_page(old_page)) {
			page_cache_get(old_page);
			pte_unmap_unlock(page_table, ptl);
			lock_page(old_page);
			page_table = pte_offset_map_lock(mm, pmd, address,
							 &ptl);
			if (!pte_same(*page_table, orig_pte)) {
				unlock_page(old_page);
				page_cache_release(old_page);
				goto unlock;
			}
			page_cache_release(old_page);
		}
		/*
		 * Per the article this boils down to checking whether the
		 * page's _mapcount shows a single mapping - TODO confirm
		 * against reuse_swap_page() itself.
		 */
		reuse = reuse_swap_page(old_page);
		unlock_page(old_page);
	}
	/* Shared writable file mapping: see whether a copy can be avoided. */
	else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
					(VM_WRITE|VM_SHARED))) {
		/*
		 * Only catch write-faults on shared writable pages,
		 * read-only shared pages can get COWed by
		 * get_user_pages(.write=1, .force=1).
		 */
		if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
			struct vm_fault vmf;
			int tmp;

			vmf.virtual_address = (void __user *)(address &
								PAGE_MASK);
			vmf.pgoff = old_page->index;
			vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
			vmf.page = old_page;

			/*
			 * Notify the address space that the page is about to
			 * become writable so that it can prohibit this or wait
			 * for the page to get into an appropriate state.
			 *
			 * We do this without the lock held, so that it can
			 * sleep if it needs to.
			 */
			page_cache_get(old_page);
			pte_unmap_unlock(page_table, ptl);

			tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
			if (unlikely(tmp &
					(VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
				ret = tmp;
				goto unwritable_page;
			}
			if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
				lock_page(old_page);
				if (!old_page->mapping) {
					ret = 0; /* retry the fault */
					unlock_page(old_page);
					goto unwritable_page;
				}
			} else
				VM_BUG_ON(!PageLocked(old_page));

			/*
			 * Since we dropped the lock we need to revalidate
			 * the PTE as someone else may have changed it. If
			 * they did, we just return, as we can count on the
			 * MMU to tell us if they didn't also make it writable.
			 */
			page_table = pte_offset_map_lock(mm, pmd, address,
							 &ptl);
			if (!pte_same(*page_table, orig_pte)) {
				unlock_page(old_page);
				page_cache_release(old_page);
				goto unlock;
			}

			/* From here on old_page is locked and pinned. */
			page_mkwrite = 1;
		}
		dirty_page = old_page;
		get_page(dirty_page);
		reuse = 1;
	}

	/* reuse: no copy, operate directly on old_page's pte. */
	if (reuse) {
reuse:
		flush_cache_page(vma, address, pte_pfn(orig_pte));
		entry = pte_mkyoung(orig_pte);
		/* Mark the pte dirty and, if the vma allows it, writable. */
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		if (ptep_set_access_flags(vma, address, page_table, entry,1))
			update_mmu_cache(vma, address, entry);
		ret |= VM_FAULT_WRITE;
		goto unlock;
	}

	/*
	 * Ok, we need to copy. Oh, well..
	 */
	/* Pin the old page; the matching release is near the end. */
	page_cache_get(old_page);
gotten:
	pte_unmap_unlock(page_table, ptl);

	if (unlikely(anon_vma_prepare(vma)))
		goto oom;

	/* Zero-page case: a freshly zeroed page is all that is needed. */
	if (is_zero_pfn(pte_pfn(orig_pte))) {
		new_page = alloc_zeroed_user_highpage_movable(vma, address);
		if (!new_page)
			goto oom;
	}
	/*
	 * Shared-page case (e.g. after fork): allocate a page and copy the
	 * contents of old_page into it.
	 */
	else {
		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
		if (!new_page)
			goto oom;
		cow_user_page(new_page, old_page, address, vma);
	}
	__SetPageUptodate(new_page);

	/*
	 * Don't let another task, with possibly unlocked vma,
	 * keep the mlocked page.
	 */
	if ((vma->vm_flags & VM_LOCKED) && old_page) {
		lock_page(old_page);	/* for LRU manipulation */
		clear_page_mlock(old_page);
		unlock_page(old_page);
	}

	/*
	 * NOTE(review): the article calls this an empty function; that
	 * presumably holds only for the author's kernel configuration
	 * (memory cgroup controller disabled) - confirm.
	 */
	if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
		goto oom_free_new;

	/*
	 * Re-check the pte - we dropped the lock
	 */
	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (likely(pte_same(*page_table, orig_pte))) {
		if (old_page) {
			if (!PageAnon(old_page)) {
				dec_mm_counter(mm, file_rss);
				inc_mm_counter(mm, anon_rss);
			}
		} else
			inc_mm_counter(mm, anon_rss);
		flush_cache_page(vma, address, pte_pfn(orig_pte));
		/* Build the pte for the new page: dirty and maybe writable. */
		entry = mk_pte(new_page, vma->vm_page_prot);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		/*
		 * Clear the pte entry and flush it first, before updating the
		 * pte with the new entry. This will avoid a race condition
		 * seen in the presence of one thread doing SMC and another
		 * thread doing COW.
		 */
		ptep_clear_flush(vma, address, page_table);
		page_add_new_anon_rmap(new_page, vma, address);
		/*
		 * We call the notify macro here because, when using secondary
		 * mmu page tables (such as kvm shadow page tables), we want the
		 * new page to be mapped directly into the secondary page table.
		 */
		set_pte_at_notify(mm, address, page_table, entry);
		update_mmu_cache(vma, address, entry);
		if (old_page) {
			/*
			 * Only after switching the pte to the new page may
			 * we remove the mapcount here. Otherwise another
			 * process may come and find the rmap count decremented
			 * before the pte is switched to the new page, and
			 * "reuse" the old page writing into it while our pte
			 * here still points into it and can be read by other
			 * threads.
			 *
			 * The critical issue is to order this
			 * page_remove_rmap with the ptep_clear_flush above.
			 * Those stores are ordered by (if nothing else,)
			 * the barrier present in the atomic_add_negative
			 * in page_remove_rmap.
			 *
			 * Then the TLB flush in ptep_clear_flush ensures that
			 * no process can access the old page before the
			 * decremented mapcount is visible. And the old page
			 * cannot be reused until after the decremented
			 * mapcount is visible. So transitively, TLBs to
			 * old page will be flushed before it can be reused.
			 */
			page_remove_rmap(old_page);
		}

		/*
		 * Free the old page: new_page now aliases old_page so that
		 * the two releases below drop both references we hold on it
		 * while the freshly installed page keeps its own.
		 */
		new_page = old_page;
		ret |= VM_FAULT_WRITE;
	}
	else
		/* Lost the race: undo the charge, the copy is freed below. */
		mem_cgroup_uncharge_page(new_page);

	if (new_page)
		page_cache_release(new_page);
	if (old_page)
		page_cache_release(old_page);
unlock:
	pte_unmap_unlock(page_table, ptl);
	if (dirty_page) {
		/*
		 * Yes, Virginia, this is actually required to prevent a race
		 * with clear_page_dirty_for_io() from clearing the page dirty
		 * bit after it clear all dirty ptes, but before a racing
		 * do_wp_page installs a dirty pte.
		 *
		 * do_no_page is protected similarly.
		 */
		if (!page_mkwrite) {
			wait_on_page_locked(dirty_page);
			set_page_dirty_balance(dirty_page, page_mkwrite);
		}
		put_page(dirty_page);
		if (page_mkwrite) {
			struct address_space *mapping = dirty_page->mapping;

			set_page_dirty(dirty_page);
			unlock_page(dirty_page);
			page_cache_release(dirty_page);
			if (mapping) {
				/*
				 * Some device drivers do not set page.mapping
				 * but still dirty their pages
				 */
				balance_dirty_pages_ratelimited(mapping);
			}
		}

		/* file_update_time outside page_lock */
		if (vma->vm_file)
			file_update_time(vma->vm_file);
	}
	return ret;
oom_free_new:
	page_cache_release(new_page);
oom:
	if (old_page) {
		if (page_mkwrite) {
			/* Drop the lock and pin taken around ->page_mkwrite(). */
			unlock_page(old_page);
			page_cache_release(old_page);
		}
		page_cache_release(old_page);
	}
	return VM_FAULT_OOM;

unwritable_page:
	page_cache_release(old_page);
	return ret;
}

一级一级返回，最终返回到函数__do_page_fault，会根据返回值fault累计task的相应异常类型次数(maj_flt或min_flt)，并最终把fault返回给函数do_page_fault，释放信号量mmap_sem，正常情况下就返回0，缺页异常处理完毕。