13.3 COW in the Child Process and Applications of RMAP

COW in the child process

If the child process's VMA undergoes COW (copy-on-write), the anon_vma created for the child's VMA is used; that is, the page->mapping pointer points to the anon_vma belonging to the child's VMA. This COW case is handled in the do_wp_page() function.

An anonymous page is shared by the parent and child, and the child's VMA undergoes COW:
->page fault
    ->handle_pte_fault
        ->do_wp_page
            ->allocate a new anonymous page
                ->__page_set_anon_rmap sets page->mapping using the child's anon_vma
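The last step of this chain is where ownership is recorded. As a reference sketch of __page_set_anon_rmap() in mm/rmap.c of the kernel era quoted in this chapter, the key point is that page->mapping stores the anon_vma pointer with the low PAGE_MAPPING_ANON bit set, which is what marks the page as anonymous:

static void __page_set_anon_rmap(struct page *page,
    struct vm_area_struct *vma, unsigned long address, int exclusive)
{
    struct anon_vma *anon_vma = vma->anon_vma;

    BUG_ON(!anon_vma);

    if (PageAnon(page))
        return;

    /*
     * If the page isn't exclusively mapped into this vma,
     * we must use the _oldest_ possible anon_vma for the
     * page mapping!
     */
    if (!exclusive)
        anon_vma = anon_vma->root;

    /* Tag the pointer with PAGE_MAPPING_ANON so PageAnon() becomes true */
    anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
    page->mapping = (struct address_space *) anon_vma;
    page->index = linear_page_index(vma, address);
}

In the COW path, do_wp_page() installs the new page via page_add_new_anon_rmap(), which passes exclusive = 1, so the child's own anon_vma, not the root of the anon_vma hierarchy, ends up in page->mapping.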

Applications of RMAP

    The kernel frequently needs to find, starting from a struct page, every VMA that maps that page. Early Linux kernels did this by scanning the VMAs of every process, which is prohibitively slow. The concept of reverse mapping (RMAP) took shape during Linux 2.5 development and, after years of optimization, evolved into the current implementation.

    Typical applications of reverse mapping are as follows:

  • When the kswapd kernel thread reclaims an anonymous page, it must unmap every user PTE that maps that page.

  • During page migration, every user PTE mapping the anonymous page must likewise be unmapped.

   

Implementation of try_to_unmap(): the core function of reverse mapping is try_to_unmap(), which other kernel subsystems call to remove all mappings to a page.

[mm/rmap.c]

/**
 * try_to_unmap - try to remove all page table mappings to a page
 * @page: the page to get unmapped
 * @flags: action and flags
 *
 * Tries to remove all the page table entries which are mapping this
 * page, used in the pageout path.  Caller must hold the page lock.
 * Return values are:
 *
 * SWAP_SUCCESS - we succeeded in removing all mappings
 * SWAP_AGAIN   - we missed a mapping, try again later
 * SWAP_FAIL    - the page is unswappable
 * SWAP_MLOCK   - page is mlocked.
 */
int try_to_unmap(struct page *page, enum ttu_flags flags)
{
    int ret;
    struct rmap_walk_control rwc = {
        .rmap_one = try_to_unmap_one,
        .arg = (void *)flags,
        .done = page_not_mapped,
        .anon_lock = page_lock_anon_vma_read,
    };

    VM_BUG_ON_PAGE(!PageHuge(page) && PageTransHuge(page), page);

    /*
     * During exec, a temporary VMA is setup and later moved.
     * The VMA is moved under the anon_vma lock but not the
     * page tables leading to a race where migration cannot
     * find the migration ptes. Rather than increasing the
     * locking requirements of exec(), migration skips
     * temporary VMAs until after exec() completes.
     */
    if ((flags & TTU_MIGRATION) && !PageKsm(page) && PageAnon(page))
        rwc.invalid_vma = invalid_migration_vma;

    ret = rmap_walk(page, &rwc);

    if (ret != SWAP_MLOCK && !page_mapped(page))
        ret = SWAP_SUCCESS;
    return ret;
}
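To see how these return codes are consumed, here is a hedged sketch of the call site in the page-reclaim path (shrink_page_list() in mm/vmscan.c of the same kernel era; surrounding logic omitted):

    if (page_mapped(page) && mapping) {
        switch (try_to_unmap(page, ttu_flags)) {
        case SWAP_FAIL:
            goto activate_locked;   /* put the page back on the active LRU */
        case SWAP_AGAIN:
            goto keep_locked;       /* retry on a later reclaim pass */
        case SWAP_MLOCK:
            goto cull_mlocked;      /* move to the unevictable LRU */
        case SWAP_SUCCESS:
            ; /* all PTEs gone; fall through and try to free the page */
        }
    }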

The kernel has three kinds of pages that need the unmap operation: KSM pages, anonymous pages, and file-mapped pages. The rmap_walk_control structure is therefore defined so that the unmap operation can be managed uniformly across all three.

struct rmap_walk_control {
    void *arg;
    /* rmap_one: unmap the PTE that maps this page in one specific VMA */
    int (*rmap_one)(struct page *page, struct vm_area_struct *vma,
                    unsigned long addr, void *arg);
    /* done: tells whether the page has been successfully unmapped */
    int (*done)(struct page *page);
    /* anon_lock: hook implementing the anon_vma locking scheme */
    struct anon_vma *(*anon_lock)(struct page *page);
    /* invalid_vma: skip VMAs that are invalid for this walk */
    bool (*invalid_vma)(struct vm_area_struct *vma, void *arg);
};
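try_to_unmap() is only one user of this structure. For comparison, page_referenced() in the same file drives the identical walk with a counting callback instead of an unmapping one; roughly (same kernel era, where pra is the accumulator it passes through arg):

    struct page_referenced_arg pra = {
        .mapcount = page_mapcount(page),
        .memcg = memcg,
    };
    struct rmap_walk_control rwc = {
        .rmap_one = page_referenced_one, /* counts, rather than clears, PTEs */
        .arg = (void *)&pra,
        .anon_lock = page_lock_anon_vma_read,
    };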

Implementation of rmap_walk():

[try_to_unmap()->rmap_walk()->rmap_walk_anon()]

int rmap_walk(struct page *page, struct rmap_walk_control *rwc)
{
    if (unlikely(PageKsm(page)))
        return rmap_walk_ksm(page, rwc);
    else if (PageAnon(page))
        return rmap_walk_anon(page, rwc);
    else
        return rmap_walk_file(page, rwc);
}
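This dispatch relies on type bits encoded in the low bits of page->mapping. A minimal sketch of the tests (cf. include/linux/mm.h of this era): an anonymous page sets bit 0 (PAGE_MAPPING_ANON) in the stored anon_vma pointer, and a KSM page sets bit 1 (PAGE_MAPPING_KSM) in addition:

#define PAGE_MAPPING_ANON   1
#define PAGE_MAPPING_KSM    2
#define PAGE_MAPPING_FLAGS  (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM)

static inline int PageAnon(struct page *page)
{
    return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
}

/* A KSM page has both type bits set in page->mapping */
static inline int PageKsm(struct page *page)
{
    return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) ==
                (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
}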

static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
{
    struct anon_vma *anon_vma;
    pgoff_t pgoff;
    struct anon_vma_chain *avc;
    int ret = SWAP_AGAIN;
    
    /* Fetch the anon_vma that page->mapping points to and take its read
       lock; the implementation is examined below */
    anon_vma = rmap_walk_anon_lock(page, rwc);
    if (!anon_vma)
        return ret;

    pgoff = page_to_pgoff(page);
    /* Walk the AVCs in the anon_vma->rb_root interval tree; each AVC yields
       a VMA, on which rmap_one() is called to unmap the user PTE */
    anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
        struct vm_area_struct *vma = avc->vma;
        unsigned long address = vma_address(page, vma);

        if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
            continue;
        /* See the implementation of try_to_unmap_one() below */
        ret = rwc->rmap_one(page, vma, address, rwc->arg);
        if (ret != SWAP_AGAIN)
            break;
        if (rwc->done && rwc->done(page))
            break;
    }
    anon_vma_unlock_read(anon_vma);
    return ret;
}
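The vma_address() helper used above translates the page's offset pgoff into a virtual address inside the VMA. Its essence in this kernel era (mm/rmap.c):

static inline unsigned long
__vma_address(struct page *page, struct vm_area_struct *vma)
{
    pgoff_t pgoff = page_to_pgoff(page);
    /* linear mapping: start of the VMA plus the page's offset within it */
    return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
}

inline unsigned long
vma_address(struct page *page, struct vm_area_struct *vma)
{
    unsigned long address = __vma_address(page, vma);

    /* page should be within @vma mapping range */
    VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);

    return address;
}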

Implementation of rmap_walk_anon_lock():

[try_to_unmap()->rmap_walk()->rmap_walk_anon()->rmap_walk_anon_lock()]

static struct anon_vma *rmap_walk_anon_lock(struct page *page,
                    struct rmap_walk_control *rwc)
{
    struct anon_vma *anon_vma;

    if (rwc->anon_lock)
        return rwc->anon_lock(page);

    /*
     * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
     * because that depends on page_mapped(); but not all its usages
     * are holding mmap_sem. Users without mmap_sem are required to
     * take a reference count to prevent the anon_vma disappearing
     */
    anon_vma = page_anon_vma(page);
    if (!anon_vma)
        return NULL;

    anon_vma_lock_read(anon_vma);
    return anon_vma;
}
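The fallback path above relies on page_anon_vma(), which simply masks the type bits off page->mapping to recover the anon_vma pointer; a sketch (cf. include/linux/mm.h of this era):

static inline void *page_rmapping(struct page *page)
{
    /* clear the PAGE_MAPPING_* type bits to recover the raw pointer */
    return (void *)((unsigned long)page->mapping & ~PAGE_MAPPING_FLAGS);
}

static inline struct anon_vma *page_anon_vma(struct page *page)
{
    if (((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) !=
                        PAGE_MAPPING_ANON)
        return NULL;    /* file or KSM page: no plain anon_vma here */
    return page_rmapping(page);
}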
With the anon_vma read lock held, control returns to rmap_walk_anon().

Implementation of try_to_unmap_one():

[try_to_unmap()->rmap_walk()->rmap_walk_anon()->try_to_unmap_one()]

/*
 * @arg: enum ttu_flags will be passed to this argument
 */
static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
             unsigned long address, void *arg)
{
    struct mm_struct *mm = vma->vm_mm;
    pte_t *pte;
    pte_t pteval;
    spinlock_t *ptl;
    int ret = SWAP_AGAIN;
    enum ttu_flags flags = (enum ttu_flags)arg;

    pte = page_check_address(page, mm, address, &ptl, 0);
    if (!pte)
        goto out;

    /*
     * If the page is mlock()d, we cannot swap it out.
     * If it's recently referenced (perhaps page_referenced
     * skipped over this mm) then we should reactivate it.
     */
    if (!(flags & TTU_IGNORE_MLOCK)) {
        if (vma->vm_flags & VM_LOCKED)
            goto out_mlock;

        if (flags & TTU_MUNLOCK)
            goto out_unmap;
    }
    if (!(flags & TTU_IGNORE_ACCESS)) {
        if (ptep_clear_flush_young_notify(vma, address, pte)) {
            ret = SWAP_FAIL;
            goto out_unmap;
        }
    }

    /* Nuke the page table entry. */
    flush_cache_page(vma, address, page_to_pfn(page));
    pteval = ptep_clear_flush(vma, address, pte);

    /* Move the dirty bit to the physical page now the pte is gone. */
    if (pte_dirty(pteval))
        set_page_dirty(page);

    /* Update high watermark before we lower rss */
    update_hiwater_rss(mm);

    if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
        if (!PageHuge(page)) {
            if (PageAnon(page))
                dec_mm_counter(mm, MM_ANONPAGES);
            else
                dec_mm_counter(mm, MM_FILEPAGES);
        }
        set_pte_at(mm, address, pte,
               swp_entry_to_pte(make_hwpoison_entry(page)));
    } else if (pte_unused(pteval)) {
        /*
         * The guest indicated that the page content is of no
         * interest anymore. Simply discard the pte, vmscan
         * will take care of the rest.
         */
        if (PageAnon(page))
            dec_mm_counter(mm, MM_ANONPAGES);
        else
            dec_mm_counter(mm, MM_FILEPAGES);
    } else if (PageAnon(page)) {
        swp_entry_t entry = { .val = page_private(page) };
        pte_t swp_pte;

        if (PageSwapCache(page)) {
            /*
             * Store the swap location in the pte.
             * See handle_pte_fault() ...
             */
            if (swap_duplicate(entry) < 0) {
                set_pte_at(mm, address, pte, pteval);
                ret = SWAP_FAIL;
                goto out_unmap;
            }
            if (list_empty(&mm->mmlist)) {
                spin_lock(&mmlist_lock);
                if (list_empty(&mm->mmlist))
                    list_add(&mm->mmlist, &init_mm.mmlist);
                spin_unlock(&mmlist_lock);
            }
            dec_mm_counter(mm, MM_ANONPAGES);
            inc_mm_counter(mm, MM_SWAPENTS);
        } else if (IS_ENABLED(CONFIG_MIGRATION)) {
            /*
             * Store the pfn of the page in a special migration
             * pte. do_swap_page() will wait until the migration
             * pte is removed and then restart fault handling.
             */
            BUG_ON(!(flags & TTU_MIGRATION));
            entry = make_migration_entry(page, pte_write(pteval));
        }
        swp_pte = swp_entry_to_pte(entry);
        if (pte_soft_dirty(pteval))
            swp_pte = pte_swp_mksoft_dirty(swp_pte);
        set_pte_at(mm, address, pte, swp_pte);
    } else if (IS_ENABLED(CONFIG_MIGRATION) &&
           (flags & TTU_MIGRATION)) {
        /* Establish migration entry for a file page */
        swp_entry_t entry;
        entry = make_migration_entry(page, pte_write(pteval));
        set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
    } else
        dec_mm_counter(mm, MM_FILEPAGES);

    page_remove_rmap(page);
    page_cache_release(page);

out_unmap:
    pte_unmap_unlock(pte, ptl);
    if (ret != SWAP_FAIL && !(flags & TTU_MUNLOCK))
        mmu_notifier_invalidate_page(mm, address);
out:
    return ret;

out_mlock:
    pte_unmap_unlock(pte, ptl);


    /*
     * We need mmap_sem locking, Otherwise VM_LOCKED check makes
     * unstable result and race. Plus, We can't wait here because
     * we now hold anon_vma->rwsem or mapping->i_mmap_rwsem.
     * if trylock failed, the page remain in evictable lru and later
     * vmscan could retry to move the page to unevictable lru if the
     * page is actually mlocked.
     */
    if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
        if (vma->vm_flags & VM_LOCKED) {
            mlock_vma_page(page);
            ret = SWAP_MLOCK;
        }
        up_read(&vma->vm_mm->mmap_sem);
    }
    return ret;
}
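Both migration branches above convert the page into a special swap entry. For reference, make_migration_entry() (include/linux/swapops.h, same era) encodes the page frame number together with a read/write type; do_swap_page() later recognizes such an entry and waits until migration completes before restarting the fault:

static inline swp_entry_t make_migration_entry(struct page *page, int write)
{
    BUG_ON(!PageLocked(page));
    /* type distinguishes writable from read-only mappings;
       offset stores the page frame number */
    return swp_entry(write ? SWP_MIGRATION_WRITE : SWP_MIGRATION_READ,
            page_to_pfn(page));
}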
