COW in the Child Process
If a COW fault occurs in the child's VMA, the newly allocated page uses the anon_vma created for the child's VMA; that is, page->mapping points to the anon_vma of the child's VMA. This COW case is handled in do_wp_page().
When the child writes to an anonymous page it shares with its parent, a COW fault occurs in the child's VMA:
->page fault
  ->handle_pte_fault()
    ->do_wp_page()
      ->allocate a new anonymous page
      ->__page_set_anon_rmap() sets page->mapping using the child's anon_vma
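The function that actually writes page->mapping is __page_set_anon_rmap(). A lightly abridged excerpt (from the same kernel generation as the listings below; the extra comment is added here and details vary across versions) shows how the anon_vma pointer, tagged with PAGE_MAPPING_ANON, lands in page->mapping:

[mm/rmap.c]
static void __page_set_anon_rmap(struct page *page,
    struct vm_area_struct *vma, unsigned long address, int exclusive)
{
    struct anon_vma *anon_vma = vma->anon_vma;

    BUG_ON(!anon_vma);

    if (PageAnon(page))
        return;

    /*
     * If the page isn't exclusively mapped into this vma,
     * we must use the _oldest_ possible anon_vma for the
     * page mapping!
     */
    if (!exclusive)
        anon_vma = anon_vma->root;

    /* Tag the pointer so PageAnon() can recognize the page later. */
    anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
    page->mapping = (struct address_space *) anon_vma;
    page->index = linear_page_index(vma, address);
}

In the COW path above, do_wp_page() registers the new page against the faulting (child) VMA, so vma->anon_vma here is the child's anon_vma, exactly as described.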
Applications of RMAP
The kernel often needs to find, starting from a struct page, all the VMAs that map that page. Early Linux kernels did this by scanning the VMAs of every process, which was very time-consuming. The concept of reverse mapping took shape during the Linux 2.5 development cycle and, after years of optimization, evolved into the current implementation.
Typical uses of reverse mapping are:
- When the kswapd kernel thread reclaims a page, it must unmap every user PTE that maps the anonymous page.
- During page migration, every user PTE that maps the anonymous page must likewise be unmapped.
try_to_unmap() implementation: the core reverse-mapping function is try_to_unmap(); other kernel subsystems call it to remove all mappings of a page.
[mm/rmap.c]
/**
 * try_to_unmap - try to remove all page table mappings to a page
 * @page: the page to get unmapped
 * @flags: action and flags
 *
 * Tries to remove all the page table entries which are mapping this
 * page, used in the pageout path. Caller must hold the page lock.
 * Return values are:
 *
 * SWAP_SUCCESS - we succeeded in removing all mappings
 * SWAP_AGAIN   - we missed a mapping, try again later
 * SWAP_FAIL    - the page is unswappable
 * SWAP_MLOCK   - page is mlocked.
 */
int try_to_unmap(struct page *page, enum ttu_flags flags)
{
    int ret;
    struct rmap_walk_control rwc = {
        .rmap_one = try_to_unmap_one,
        .arg = (void *)flags,
        .done = page_not_mapped,
        .anon_lock = page_lock_anon_vma_read,
    };

    VM_BUG_ON_PAGE(!PageHuge(page) && PageTransHuge(page), page);

    /*
     * During exec, a temporary VMA is setup and later moved.
     * The VMA is moved under the anon_vma lock but not the
     * page tables leading to a race where migration cannot
     * find the migration ptes. Rather than increasing the
     * locking requirements of exec(), migration skips
     * temporary VMAs until after exec() completes.
     */
    if ((flags & TTU_MIGRATION) && !PageKsm(page) && PageAnon(page))
        rwc.invalid_vma = invalid_migration_vma;

    ret = rmap_walk(page, &rwc);

    if (ret != SWAP_MLOCK && !page_mapped(page))
        ret = SWAP_SUCCESS;
    return ret;
}
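As a usage sketch, this is roughly how the pageout path consumes those return codes (abridged from shrink_page_list() in mm/vmscan.c of the same era; the goto targets are labels inside that function and are omitted here):

[mm/vmscan.c]
    if (page_mapped(page) && mapping) {
        switch (try_to_unmap(page, ttu_flags)) {
        case SWAP_FAIL:
            goto activate_locked;   /* put the page back on the active list */
        case SWAP_AGAIN:
            goto keep_locked;       /* leave it; retry on a later scan */
        case SWAP_MLOCK:
            goto cull_mlocked;      /* mlocked: move toward the unevictable LRU */
        case SWAP_SUCCESS:
            ; /* fully unmapped: try to free the page below */
        }
    }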
Three kinds of pages in the kernel need the unmap operation: KSM pages, anonymous pages, and file-mapped pages. The rmap_walk_control data structure is therefore defined to manage unmap operations in a unified way.
struct rmap_walk_control {
    void *arg;
    /* rmap_one: unmap the page's PTE in one specific VMA */
    int (*rmap_one)(struct page *page, struct vm_area_struct *vma,
                    unsigned long addr, void *arg);
    /* done: tells whether the page has been fully unmapped */
    int (*done)(struct page *page);
    /* anon_lock: how to take the lock protecting the anon_vma */
    struct anon_vma *(*anon_lock)(struct page *page);
    /* invalid_vma: skip VMAs that should not be walked */
    bool (*invalid_vma)(struct vm_area_struct *vma, void *arg);
};
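For comparison with the unmap case, the page-migration path fills in its own rmap_walk_control. The following is remove_migration_ptes() from mm/migrate.c of the same era (comment added), which reuses rmap_walk() to turn migration entries back into real PTEs:

[mm/migrate.c]
static void remove_migration_ptes(struct page *old, struct page *new)
{
    struct rmap_walk_control rwc = {
        .rmap_one = remove_migration_pte,   /* fix up one VMA's PTE */
        .arg = old,
    };

    rmap_walk(new, &rwc);
}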
rmap_walk() implementation:
[try_to_unmap()->rmap_walk()->rmap_walk_anon()]
int rmap_walk(struct page *page, struct rmap_walk_control *rwc)
{
    if (unlikely(PageKsm(page)))
        return rmap_walk_ksm(page, rwc);
    else if (PageAnon(page))
        return rmap_walk_anon(page, rwc);
    else
        return rmap_walk_file(page, rwc);
}
static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
{
    struct anon_vma *anon_vma;
    pgoff_t pgoff;
    struct anon_vma_chain *avc;
    int ret = SWAP_AGAIN;

    /* Fetch the anon_vma that page->mapping points to and take its
     * read lock; see rmap_walk_anon_lock() below. */
    anon_vma = rmap_walk_anon_lock(page, rwc);
    if (!anon_vma)
        return ret;

    pgoff = page_to_pgoff(page);
    /* Walk the AVCs in the anon_vma->rb_root interval tree; each AVC
     * yields a VMA, on which rmap_one() unmaps the user PTE. */
    anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
        struct vm_area_struct *vma = avc->vma;
        unsigned long address = vma_address(page, vma);

        if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
            continue;

        /* See the try_to_unmap_one() implementation below. */
        ret = rwc->rmap_one(page, vma, address, rwc->arg);
        if (ret != SWAP_AGAIN)
            break;
        if (rwc->done && rwc->done(page))
            break;
    }
    anon_vma_unlock_read(anon_vma);
    return ret;
}
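vma_address() converts the page's linear offset into a user virtual address inside the given VMA. A condensed sketch of the underlying helper (__vma_address() in mm/rmap.c; error checking omitted, details vary by version):

[mm/rmap.c]
static inline unsigned long
__vma_address(struct page *page, struct vm_area_struct *vma)
{
    pgoff_t pgoff = page_to_pgoff(page);

    /* page offsets are in units of pages; shift back to bytes. */
    return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
}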
rmap_walk_anon_lock() implementation:
[try_to_unmap()->rmap_walk()->rmap_walk_anon()->rmap_walk_anon_lock()]
static struct anon_vma *rmap_walk_anon_lock(struct page *page,
                    struct rmap_walk_control *rwc)
{
    struct anon_vma *anon_vma;

    if (rwc->anon_lock)
        return rwc->anon_lock(page);

    /*
     * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
     * because that depends on page_mapped(); but not all its usages
     * are holding mmap_sem. Users without mmap_sem are required to
     * take a reference count to prevent the anon_vma disappearing
     */
    anon_vma = page_anon_vma(page);
    if (!anon_vma)
        return NULL;

    anon_vma_lock_read(anon_vma);
    return anon_vma;
}
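page_anon_vma() undoes the tagging performed by __page_set_anon_rmap(): for an anonymous page, page->mapping holds the anon_vma pointer with the PAGE_MAPPING_ANON bit set. A condensed sketch of the decoding (the exact helper differs slightly between kernel versions):

/* Sketch only: recover the anon_vma from the tagged mapping pointer. */
static inline struct anon_vma *page_anon_vma(struct page *page)
{
    unsigned long mapping = (unsigned long)page->mapping;

    if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
        return NULL;    /* file-backed or KSM page */
    return (struct anon_vma *)(mapping - PAGE_MAPPING_ANON);
}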
Back in rmap_walk_anon(), the rwc->rmap_one callback for the unmap path is try_to_unmap_one().
try_to_unmap_one() implementation:
[try_to_unmap()->rmap_walk()->rmap_walk_anon()->try_to_unmap_one()]
/*
 * @arg: enum ttu_flags will be passed to this argument
 */
static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
            unsigned long address, void *arg)
{
    struct mm_struct *mm = vma->vm_mm;
    pte_t *pte;
    pte_t pteval;
    spinlock_t *ptl;
    int ret = SWAP_AGAIN;
    enum ttu_flags flags = (enum ttu_flags)arg;

    pte = page_check_address(page, mm, address, &ptl, 0);
    if (!pte)
        goto out;

    /*
     * If the page is mlock()d, we cannot swap it out.
     * If it's recently referenced (perhaps page_referenced
     * skipped over this mm) then we should reactivate it.
     */
    if (!(flags & TTU_IGNORE_MLOCK)) {
        if (vma->vm_flags & VM_LOCKED)
            goto out_mlock;

        if (flags & TTU_MUNLOCK)
            goto out_unmap;
    }
    if (!(flags & TTU_IGNORE_ACCESS)) {
        if (ptep_clear_flush_young_notify(vma, address, pte)) {
            ret = SWAP_FAIL;
            goto out_unmap;
        }
    }

    /* Nuke the page table entry. */
    flush_cache_page(vma, address, page_to_pfn(page));
    pteval = ptep_clear_flush(vma, address, pte);

    /* Move the dirty bit to the physical page now the pte is gone. */
    if (pte_dirty(pteval))
        set_page_dirty(page);

    /* Update high watermark before we lower rss */
    update_hiwater_rss(mm);

    if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
        if (!PageHuge(page)) {
            if (PageAnon(page))
                dec_mm_counter(mm, MM_ANONPAGES);
            else
                dec_mm_counter(mm, MM_FILEPAGES);
        }
        set_pte_at(mm, address, pte,
               swp_entry_to_pte(make_hwpoison_entry(page)));
    } else if (pte_unused(pteval)) {
        /*
         * The guest indicated that the page content is of no
         * interest anymore. Simply discard the pte, vmscan
         * will take care of the rest.
         */
        if (PageAnon(page))
            dec_mm_counter(mm, MM_ANONPAGES);
        else
            dec_mm_counter(mm, MM_FILEPAGES);
    } else if (PageAnon(page)) {
        swp_entry_t entry = { .val = page_private(page) };
        pte_t swp_pte;

        if (PageSwapCache(page)) {
            /*
             * Store the swap location in the pte.
             * See handle_pte_fault() ...
             */
            if (swap_duplicate(entry) < 0) {
                set_pte_at(mm, address, pte, pteval);
                ret = SWAP_FAIL;
                goto out_unmap;
            }
            if (list_empty(&mm->mmlist)) {
                spin_lock(&mmlist_lock);
                if (list_empty(&mm->mmlist))
                    list_add(&mm->mmlist, &init_mm.mmlist);
                spin_unlock(&mmlist_lock);
            }
            dec_mm_counter(mm, MM_ANONPAGES);
            inc_mm_counter(mm, MM_SWAPENTS);
        } else if (IS_ENABLED(CONFIG_MIGRATION)) {
            /*
             * Store the pfn of the page in a special migration
             * pte. do_swap_page() will wait until the migration
             * pte is removed and then restart fault handling.
             */
            BUG_ON(!(flags & TTU_MIGRATION));
            entry = make_migration_entry(page, pte_write(pteval));
        }
        swp_pte = swp_entry_to_pte(entry);
        if (pte_soft_dirty(pteval))
            swp_pte = pte_swp_mksoft_dirty(swp_pte);
        set_pte_at(mm, address, pte, swp_pte);
    } else if (IS_ENABLED(CONFIG_MIGRATION) &&
           (flags & TTU_MIGRATION)) {
        /* Establish migration entry for a file page */
        swp_entry_t entry;
        entry = make_migration_entry(page, pte_write(pteval));
        set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
    } else
        dec_mm_counter(mm, MM_FILEPAGES);

    page_remove_rmap(page);
    page_cache_release(page);

out_unmap:
    pte_unmap_unlock(pte, ptl);
    if (ret != SWAP_FAIL && !(flags & TTU_MUNLOCK))
        mmu_notifier_invalidate_page(mm, address);
out:
    return ret;

out_mlock:
    pte_unmap_unlock(pte, ptl);

    /*
     * We need mmap_sem locking, Otherwise VM_LOCKED check makes
     * unstable result and race. Plus, We can't wait here because
     * we now hold anon_vma->rwsem or mapping->i_mmap_rwsem.
     * if trylock failed, the page remain in evictable lru and later
     * vmscan could retry to move the page to unevictable lru if the
     * page is actually mlocked.
     */
    if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
        if (vma->vm_flags & VM_LOCKED) {
            mlock_vma_page(page);
            ret = SWAP_MLOCK;
        }
        up_read(&vma->vm_mm->mmap_sem);
    }
    return ret;
}
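In the migration branches above, make_migration_entry() packs the page frame number and write permission into a swap-entry format that do_swap_page() later recognizes and waits on, as defined in include/linux/swapops.h when CONFIG_MIGRATION is enabled:

[include/linux/swapops.h]
static inline swp_entry_t make_migration_entry(struct page *page, int write)
{
    BUG_ON(!PageLocked(page));
    /* Encode the pfn plus a SWP_MIGRATION_READ/WRITE type tag. */
    return swp_entry(write ? SWP_MIGRATION_WRITE : SWP_MIGRATION_READ,
            page_to_pfn(page));
}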