接上一篇
现在分析写时复制COW,对于写时复制,首先把握一点就是只有写操作时才有可能触发写时复制,所以首先总要判断异常flag是否含有标志FAULT_FLAG_WRITE,然后判断二级页表条目值是否含有L_PTE_WRITE标志,这是意味着这个物理页是否可写,如果不可写则说明应该进入写时复制流程,调用处理函数do_wp_page;
可见,COW的应用场合就是访问映射的页不可写,它包括两种情况,第一种是fork导致,第二种是如malloc后第一次对他进行读操作,获取到的是zero_pfn零页,当再次写时需要写时复制,共同特点都是虚拟地址的二级页表映射内容在内存中,但是对应的页不可写,在函数do_wp_page中对于这两种情况的处理基本相似的;
另外一个应该知道的是,如果该页只有一个进程在用,那么就直接修改这个页可写就行了,不要搞COW,总之,不到不得以的情况下是不会进行COW的,这也是内核对于COW使用的原则,就是尽量不使用;
函数do_wp_page源码如下:
static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
spinlock_t *ptl, pte_t orig_pte)
{
struct page *old_page, *new_page;
pte_t entry;
int reuse = 0, ret = 0;
int page_mkwrite = 0;
struct page *dirty_page = NULL;
/*返回不可写的页的页描述符,如果是COW的第一种情况即zero_pfn可读页,返回NULL,将进入下面的if流程;第二种情况即(父子进程)共享页将正常返回其页描述符*/
old_page = vm_normal_page(vma, address, orig_pte);
if (!old_page) {
/*
* VM_MIXEDMAP !pfn_valid() case
*
* We should not cow pages in a shared writeable mapping.
* Just mark the pages writable as we can't do any dirty
* accounting on raw pfn maps.
*/
/*如果这个vma是可写且共享的,跳到标号reuse,这就不会COW
否则跳到标号gotten*/
if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))
goto reuse;
goto gotten;
}
/*
* Take out anonymous pages first, anonymous shared vmas are
* not dirty accountable.
*/
/*下面的if和else流程,都是为了尽可能不进行COW,它们试图进入标号reuse*/
/*如果该页old_page是匿名页(由页描述符的mapping),
并且只有一个进程使用该页(reuse_swap_page,由页描述符的_mapcount值是否为0),那么不要搞什么COW了,这个进程就是可以使用该页*/
if (PageAnon(old_page) && !PageKsm(old_page)) {
/*排除其他进程在使用该页的情况,由页描述符的flag*/
if (!trylock_page(old_page)) {
page_cache_get(old_page);
pte_unmap_unlock(page_table, ptl);
lock_page(old_page);
page_table = pte_offset_map_lock(mm, pmd, address,
&ptl);
if (!pte_same(*page_table, orig_pte)) {
unlock_page(old_page);
page_cache_release(old_page);
goto unlock;
}
page_cache_release(old_page);
}
/*判断该页描述符的_mapcount值是否为0*/
reuse = reuse_swap_page(old_page);
unlock_page(old_page);
}
/*如果vma是共享且可写,看看这种情况下有没有机会不COW*/
else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))) {
/*
* Only catch write-faults on shared writable pages,
* read-only shared pages can get COWed by
* get_user_pages(.write=1, .force=1).
*/
if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
struct vm_fault vmf;
int tmp;
vmf.virtual_address = (void __user *)(address &
PAGE_MASK);
vmf.pgoff = old_page->index;
vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
vmf.page = old_page;
/*
* Notify the address space that the page is about to
* become writable so that it can prohibit this or wait
* for the page to get into an appropriate state.
*
* We do this without the lock held, so that it can
* sleep if it needs to.
*/
page_cache_get(old_page);
pte_unmap_unlock(page_table, ptl);
tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
if (unlikely(tmp &
(VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
ret = tmp;
goto unwritable_page;
}
if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
lock_page(old_page);
if (!old_page->mapping) {
ret = 0; /* retry the fault */
unlock_page(old_page);
goto unwritable_page;
}
} else
VM_BUG_ON(!PageLocked(old_page));
/*
* Since we dropped the lock we need to revalidate
* the PTE as someone else may have changed it. If
* they did, we just return, as we can count on the
* MMU to tell us if they didn't also make it writable.
*/
page_table = pte_offset_map_lock(mm, pmd, address,
&ptl);
if (!pte_same(*page_table, orig_pte)) {
unlock_page(old_page);
page_cache_release(old_page);
goto unlock;
}
page_mkwrite = 1;
}
dirty_page = old_page;
get_page(dirty_page);
reuse = 1;
}
/*reuse: 不进行COW,直接操作该页old_page*/
if (reuse) {
reuse:
flush_cache_page(vma, address, pte_pfn(orig_pte));
entry = pte_mkyoung(orig_pte);
/*写该页的二级页表属性,加入可写且脏*/
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
if (ptep_set_access_flags(vma, address, page_table, entry,1))
update_mmu_cache(vma, address, entry);
ret |= VM_FAULT_WRITE;
goto unlock;
}
/*
* Ok, we need to copy. Oh, well..
*/
/*真正的COW即将开始*/
/*首先增加之前的页的被映射次数(get_page(), page->_count)*/
page_cache_get(old_page);
gotten:
pte_unmap_unlock(page_table, ptl);
if (unlikely(anon_vma_prepare(vma)))
goto oom;
/*COW的第一种情况(zero_pfn),将分配新页并清零该页*/
if (is_zero_pfn(pte_pfn(orig_pte))) {
new_page = alloc_zeroed_user_highpage_movable(vma, address);
if (!new_page)
goto oom;
}
/*COW的第二种情况(fork),申请一个页,并把old_page页的内容拷贝到新页new_page(4K字节的内容)*/
else {
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
if (!new_page)
goto oom;
cow_user_page(new_page, old_page, address, vma);
}
__SetPageUptodate(new_page);
/*
* Don't let another task, with possibly unlocked vma,
* keep the mlocked page.
*/
/*COW第二种情况下,如果vma还是锁定的,那还需要解锁*/
if ((vma->vm_flags & VM_LOCKED) && old_page) {
lock_page(old_page); /* for LRU manipulation */
clear_page_mlock(old_page);
unlock_page(old_page);
}
/*空函数*/
if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
goto oom_free_new;
/*
* Re-check the pte - we dropped the lock
*/
/*再获取下访问异常的地址addr对应的二级页表条目地址page_table*/
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (likely(pte_same(*page_table, orig_pte))) {
if (old_page) {
if (!PageAnon(old_page)) {
dec_mm_counter(mm, file_rss);
inc_mm_counter(mm, anon_rss);
}
} else
inc_mm_counter(mm, anon_rss);
flush_cache_page(vma, address, pte_pfn(orig_pte));
/*写新页的二级页表条目内容为脏*/
entry = mk_pte(new_page, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
/*
* Clear the pte entry and flush it first, before updating the
* pte with the new entry. This will avoid a race condition
* seen in the presence of one thread doing SMC and another
* thread doing COW.
*/
ptep_clear_flush(vma, address, page_table);
page_add_new_anon_rmap(new_page, vma, address);
/*
* We call the notify macro here because, when using secondary
* mmu page tables (such as kvm shadow page tables), we want the
* new page to be mapped directly into the secondary page table.
*/
set_pte_at_notify(mm, address, page_table, entry);
update_mmu_cache(vma, address, entry);
if (old_page) {
/*
* Only after switching the pte to the new page may
* we remove the mapcount here. Otherwise another
* process may come and find the rmap count decremented
* before the pte is switched to the new page, and
* "reuse" the old page writing into it while our pte
* here still points into it and can be read by other
* threads.
*
* The critical issue is to order this
* page_remove_rmap with the ptp_clear_flush above.
* Those stores are ordered by (if nothing else,)
* the barrier present in the atomic_add_negative
* in page_remove_rmap.
*
* Then the TLB flush in ptep_clear_flush ensures that
* no process can access the old page before the
* decremented mapcount is visible. And the old page
* cannot be reused until after the decremented
* mapcount is visible. So transitively, TLBs to
* old page will be flushed before it can be reused.
*/
page_remove_rmap(old_page);
}
/* Free the old page.. */
new_page = old_page;
ret |= VM_FAULT_WRITE;
}
else
mem_cgroup_uncharge_page(new_page);
if (new_page)
page_cache_release(new_page);
if (old_page)
page_cache_release(old_page);
unlock:
pte_unmap_unlock(page_table, ptl);
if (dirty_page) {
/*
* Yes, Virginia, this is actually required to prevent a race
* with clear_page_dirty_for_io() from clearing the page dirty
* bit after it clear all dirty ptes, but before a racing
* do_wp_page installs a dirty pte.
*
* do_no_page is protected similarly.
*/
if (!page_mkwrite) {
wait_on_page_locked(dirty_page);
set_page_dirty_balance(dirty_page, page_mkwrite);
}
put_page(dirty_page);
if (page_mkwrite) {
struct address_space *mapping = dirty_page->mapping;
set_page_dirty(dirty_page);
unlock_page(dirty_page);
page_cache_release(dirty_page);
if (mapping) {
/*
* Some device drivers do not set page.mapping
* but still dirty their pages
*/
balance_dirty_pages_ratelimited(mapping);
}
}
/* file_update_time outside page_lock */
if (vma->vm_file)
file_update_time(vma->vm_file);
}
return ret;
oom_free_new:
page_cache_release(new_page);
oom:
if (old_page) {
if (page_mkwrite) {
unlock_page(old_page);
page_cache_release(old_page);
}
page_cache_release(old_page);
}
return VM_FAULT_OOM;
unwritable_page:
page_cache_release(old_page);
return ret;
}
一级一级返回,最终返回到函数__do_page_fault,会根据返回值fault累计task的相应异常类型次数(maj_flt或min_flt),并最终把fault返回给函数do_page_fault,释放信号量mmap_sem,正常情况下就返回0,缺页异常处理完毕。