11.3 文件映射缺页中断

下面来看页面不在内存中且页表内容为空(!pte_present(entry) && pte_none(entry))的另一种情况,即 VMA 定义了 fault 处理函数(vma->vm_ops->fault 不为空)的情况。

[handle_pte_fault()->do_fault()]

/*
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults).
 * The mmap_sem may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 */
/*do_fault()函数处理VMA中的vm_ops操作函数集里定义了fault函数指针的情况,具体分为3种情况*/
/*
 * do_fault() - dispatch a fault on a VMA that defines vm_ops->fault.
 * Reached when the PTE is not present and empty; splits into three
 * cases depending on the fault flags and the VMA's sharing mode.
 */
static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        unsigned long address, pte_t *page_table, pmd_t *pmd,
        unsigned int flags, pte_t orig_pte)
{
    /* Linear page offset of the faulting address within the backing file. */
    pgoff_t pgoff = (((address & PAGE_MASK)
            - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

    /*
     * Drop the caller's temporary kernel mapping of the page-table
     * page; each helper below re-maps and locks the PTE itself.
     */
    pte_unmap(page_table);
    /* (1) Not a write fault: read-only fault, see do_read_fault(). */
    if (!(flags & FAULT_FLAG_WRITE))
        return do_read_fault(mm, vma, address, pmd, pgoff, flags,
                orig_pte);
    /*
     * (2) Write fault on a private mapping (VM_SHARED not set):
     * copy-on-write, see do_cow_fault().
     */
    if (!(vma->vm_flags & VM_SHARED))
        return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
                orig_pte);
    /* (3) Remaining case: write fault on a shared mapping. */
    return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
}

(1)读异常

do_read_fault()函数:处理只读异常(mm/memory.c)

[handle_pte_fault()->do_fault()->do_read_fault()]

/*
 * do_read_fault() - handle a read-only fault on a file-backed VMA.
 * Optionally pre-maps nearby page-cache pages via ->map_pages(),
 * then reads the needed page through __do_fault() and installs it
 * into the page table with a read-only PTE.
 */
static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        unsigned long address, pmd_t *pmd,
        pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
{
    struct page *fault_page;
    spinlock_t *ptl;
    pte_t *pte;
    int ret = 0;

    /*
     * Let's call ->map_pages() first and use ->fault() as fallback
     * if page by the offset is not ready to be mapped (cold cache or
     * something).
     */
    /*
     * If the VMA provides map_pages(), pre-map as many pages around
     * the faulting address as possible.  Establishing the mappings to
     * already-present page-cache pages up front reduces the number of
     * future page faults.  Note: this only maps EXISTING page cache;
     * new page cache is created later, in __do_fault().
     * fault_around_bytes is a global in mm/memory.c, default 65536
     * bytes (16 pages):
     *   static unsigned long fault_around_bytes __read_mostly =
     *       rounddown_pow_of_two(65536);
     */
    if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
        pte = pte_offset_map_lock(mm, pmd, address, &ptl);
        /*
         * Map surrounding pages; see do_fault_around() below.  This
         * only fills PTEs for pages already in the page cache — the
         * faulting page itself is brought in by __do_fault().
         */
        do_fault_around(vma, address, pte, pgoff, flags);
        /* PTE changed under us (likely handled by fault-around): done. */
        if (!pte_same(*pte, orig_pte))
            goto unlock_out;
        pte_unmap_unlock(pte, ptl);
    }
    /*
     * Actually allocate/read the page cache for the faulting address;
     * see __do_fault() below.
     */
    ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
    if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
        return ret;

    pte = pte_offset_map_lock(mm, pmd, address, &ptl);
    /*
     * Re-read the PTE for the faulting address and compare against the
     * previously seen value.  If it changed, someone modified the PTE
     * meanwhile, so the page obtained via __do_fault() is no longer
     * needed — release it.
     */
    if (unlikely(!pte_same(*pte, orig_pte))) {
        pte_unmap_unlock(pte, ptl);
        unlock_page(fault_page);
        page_cache_release(fault_page);
        return ret;
    }
    /* Build a new PTE entry from fault_page and install it (read-only). */
    do_set_pte(vma, address, fault_page, pte, false, false);
    unlock_page(fault_page);
unlock_out:
    pte_unmap_unlock(pte, ptl);
    return ret;
}
回到do_fault()函数

do_fault_around()函数:

以当前缺页异常地址 addr 为中心,start_addr 是按 16 个 page 大小对齐的起始地址。然后从 start_addr 开始检查相应的 pte 是否为空:从第一个为空的 pte 开始,到 max_pgoff 为止,使用 VMA 的操作函数 map_pages() 建立 PTE 映射,除非所需要的 page cache 还没有准备好或 page cache 被锁住了。该函数预测异常地址周围的 page cache 可能马上会被读取,所以把已经存在的 page cache 提前建立好映射,有利于减少缺页中断的次数;但注意它并不会新建 page cache。

/*
 * do_fault_around() - pre-map already-present page-cache pages around
 * the faulting address via vma->vm_ops->map_pages().  Only fills empty
 * PTEs for pages already in the page cache; it does not create new
 * page cache or trigger I/O.
 */
static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
        pte_t *pte, pgoff_t pgoff, unsigned int flags)
{
    unsigned long start_addr, nr_pages, mask;
    pgoff_t max_pgoff;
    struct vm_fault vmf;
    int off;

    /* Window size in pages (fault_around_bytes default 64KB = 16 pages). */
    nr_pages = ACCESS_ONCE(fault_around_bytes) >> PAGE_SHIFT;
    mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;

    /* Align the window start down, but never before the VMA start. */
    start_addr = max(address & mask, vma->vm_start);
    off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
    /* Rewind pte/pgoff so they correspond to start_addr. */
    pte -= off;
    pgoff -= off;

    /*
     *  max_pgoff is either end of page table or end of vma
     *  or fault_around_pages() from pgoff, depending what is nearest.
     */
    max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
        PTRS_PER_PTE - 1;
    max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1,
            pgoff + nr_pages - 1);

    /* Check if it makes any sense to call ->map_pages */
    /* Skip leading non-empty PTEs; bail out if nothing is left to map. */
    while (!pte_none(*pte)) {
        if (++pgoff > max_pgoff)
            return;
        start_addr += PAGE_SIZE;
        if (start_addr >= vma->vm_end)
            return;
        pte++;
    }

    /* Hand the [start_addr, max_pgoff] window to the VMA's map_pages(). */
    vmf.virtual_address = (void __user *) start_addr;
    vmf.pte = pte;
    vmf.pgoff = pgoff;
    vmf.max_pgoff = max_pgoff;
    vmf.flags = flags;
    vma->vm_ops->map_pages(vma, &vmf);
}
回到do_read_fault()函数

__do_fault()函数: 异常地址分配page cache

/*
 * The mmap_sem must have been held on entry, and may have been
 * released depending on flags and vma->vm_ops->fault() return value.
 * See filemap_fault() and __lock_page_retry().
 */
/*
 * The mmap_sem must have been held on entry, and may have been
 * released depending on flags and vma->vm_ops->fault() return value.
 * See filemap_fault() and __lock_page_retry().
 */
/*
 * __do_fault() - call the VMA's ->fault() handler to get the page for
 * the faulting address (reading file content / creating page cache as
 * the handler sees fit) and return it locked in *page.
 */
static int __do_fault(struct vm_area_struct *vma, unsigned long address,
            pgoff_t pgoff, unsigned int flags,
            struct page *cow_page, struct page **page)
{
    struct vm_fault vmf;
    int ret;

    vmf.virtual_address = (void __user *)(address & PAGE_MASK);
    vmf.pgoff = pgoff;
    vmf.flags = flags;
    vmf.page = NULL;
    vmf.cow_page = cow_page;
    /* Invoke the fault handler (e.g. filemap_fault) to populate vmf.page. */
    ret = vma->vm_ops->fault(vma, &vmf);
    if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
        return ret;
    if (!vmf.page)
        goto out;

    /* Hardware-poisoned page: release it and report the error. */
    if (unlikely(PageHWPoison(vmf.page))) {
        if (ret & VM_FAULT_LOCKED)
            unlock_page(vmf.page);
        page_cache_release(vmf.page);
        return VM_FAULT_HWPOISON;
    }
    /*
     * If the handler did not return VM_FAULT_LOCKED, lock the page
     * (PG_locked) ourselves; otherwise, with CONFIG_DEBUG_VM enabled,
     * verify the page really is locked.
     */
    if (unlikely(!(ret & VM_FAULT_LOCKED)))
        lock_page(vmf.page);
    else
        VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page);

 out:
    *page = vmf.page;
    return ret;
}
返回do_read_fault()函数

(2)写时复制异常

do_cow_fault()函数: 处理私有文件映射的VMA中发生了写时复制。

[handle_pte_fault()->do_fault()->do_cow_fault()]

/*
 * do_cow_fault() - handle a write fault on a private file mapping
 * (copy-on-write): allocate a new anonymous page, copy the file
 * content into it, and map the copy writable into the page table.
 */
static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        unsigned long address, pmd_t *pmd,
        pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
{
    struct page *fault_page, *new_page;
    struct mem_cgroup *memcg;
    spinlock_t *ptl;
    pte_t *pte;
    int ret;
    
    /* Make sure the VMA has an anon_vma for reverse mapping (RMAP). */
    if (unlikely(anon_vma_prepare(vma)))
        return VM_FAULT_OOM;
    
    /*
     * Allocate new_page with the GFP_HIGHUSER_MOVABLE mask
     * (GFP_HIGHUSER | __GFP_MOVABLE), i.e. preferring highmem.
     */
    new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
    if (!new_page)
        return VM_FAULT_OOM;

    /* Charge the new page to the memory cgroup; fail with OOM if over limit. */
    if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) {
        page_cache_release(new_page);
        return VM_FAULT_OOM;
    }
    /* Read the file content into fault_page via vma->vm_ops->fault(). */
    ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page);
    if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
        goto uncharge_out;

    /* Copy the content of fault_page into the freshly allocated new_page. */
    if (fault_page)
        copy_user_highpage(new_page, fault_page, address, vma);
    __SetPageUptodate(new_page);

    pte = pte_offset_map_lock(mm, pmd, address, &ptl);
    /*
     * Re-fetch the PTE for the faulting address; if it no longer
     * matches orig_pte, someone modified it meanwhile — release
     * new_page and fault_page and bail out.
     */
    if (unlikely(!pte_same(*pte, orig_pte))) {
        pte_unmap_unlock(pte, ptl);
        if (fault_page) {
            unlock_page(fault_page);
            page_cache_release(fault_page);
        } else {
            /*
             * The fault handler has no page to lock, so it holds
             * i_mmap_lock for read to protect against truncate.
             */
            i_mmap_unlock_read(vma->vm_file->f_mapping);
        }
        goto uncharge_out;
    }
    /* Build a new PTE entry from new_page (writable, anon) and install it. */
    do_set_pte(vma, address, new_page, pte, true, true);
    mem_cgroup_commit_charge(new_page, memcg, false);
    /* Put new_page on the active (or unevictable) LRU list. */
    lru_cache_add_active_or_unevictable(new_page, vma);
    pte_unmap_unlock(pte, ptl);
    if (fault_page) {
        unlock_page(fault_page);
        /* Drop the reference on fault_page; the process now uses new_page. */
        page_cache_release(fault_page);
    } else {
        /*
         * The fault handler has no page to lock, so it holds
         * i_mmap_lock for read to protect against truncate.
         */
        i_mmap_unlock_read(vma->vm_file->f_mapping);
    }
    return ret;
uncharge_out:
    mem_cgroup_cancel_charge(new_page, memcg);
    page_cache_release(new_page);
    return ret;
}

(3) 写缺页异常

do_shared_fault()函数: 处理在一个可写的共享映射中发生缺页中断的情况。

[handle_pte_fault()->do_fault()->do_shared_fault()]

/*
 * do_shared_fault() - handle a write fault on a writable shared file
 * mapping: read the page, notify the filesystem it is about to become
 * writable, map it writable, mark it dirty and balance dirty pages.
 */
static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        unsigned long address, pmd_t *pmd,
        pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
{
    struct page *fault_page;
    struct address_space *mapping;
    spinlock_t *ptl;
    pte_t *pte;
    int dirtied = 0;
    int ret, tmp;

    /* First read the file content into fault_page via __do_fault(). */
    ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
    if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
        return ret;

    /*
     * Check if the backing address space wants to know that the page is
     * about to become writable
     */
    /*
     * If the VMA defines page_mkwrite(), call it to notify the backing
     * store that the page is about to become writable; the process may
     * have to wait for writeback of this page to complete.
     */
    if (vma->vm_ops->page_mkwrite) {
        unlock_page(fault_page);
        tmp = do_page_mkwrite(vma, fault_page, address);
        if (unlikely(!tmp ||
                (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
            page_cache_release(fault_page);
            return tmp;
        }
    }

    pte = pte_offset_map_lock(mm, pmd, address, &ptl);
    /*
     * Check whether the PTE for the faulting address still matches the
     * previously seen orig_pte; if not, release the page and bail out.
     */
    if (unlikely(!pte_same(*pte, orig_pte))) {
        pte_unmap_unlock(pte, ptl);
        unlock_page(fault_page);
        page_cache_release(fault_page);
        return ret;
    }
    /*
     * Build a new PTE entry from fault_page and install it; note the
     * PTE is created with the write bit set here.
     */
    do_set_pte(vma, address, fault_page, pte, true, false);
    pte_unmap_unlock(pte, ptl);
    
    /* Mark the page dirty. */
    if (set_page_dirty(fault_page))
        dirtied = 1;
    /*
     * Take a local copy of the address_space - page.mapping may be zeroed
     * by truncate after unlock_page().   The address_space itself remains
     * pinned by vma->vm_file's reference.  We rely on unlock_page()'s
     * release semantics to prevent the compiler from undoing this copying.
     */
    mapping = fault_page->mapping;
    unlock_page(fault_page);
    if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
        /*
         * Some device drivers do not set page.mapping but still
         * dirty their pages
         */
        /* Throttle the dirtier and write back some dirty pages if needed. */
        balance_dirty_pages_ratelimited(mapping);
    }

    if (!vma->vm_ops->page_mkwrite)
        file_update_time(vma->vm_file);

    return ret;
}

私有映射

共享映射

匿名映射

私有匿名映射->通常用于内存分配(do_anonymous_page)

共享匿名映射->通常用于进程间共享内存

文件映射

私有文件映射->通常用于加载动态库

共享文件映射->通常用于内存映射I/O,进程间通信

do_shared_fault

  • 0
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

byd yes

你的鼓励是我最大的动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值