9.1 The VM_LOCKED case

  When the VM_LOCKED flag is specified, physical pages must be allocated for this VMA of the process address space and the mappings established immediately. The mm_populate() function (include/linux/mm.h) internally calls __mm_populate() (mm/mlock.c).
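
Before walking the kernel path, here is a minimal user-space sketch of what drives it: mmap() with MAP_LOCKED creates a VM_LOCKED VMA, and the mmap() path then calls mm_populate() on the new range (MAP_POPULATE has the same pre-faulting effect without locking). This example is illustrative, not taken from the original text:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
    size_t len = 4 * 4096;
    /* MAP_LOCKED sets VM_LOCKED on the new VMA, so the mmap() path
     * calls mm_populate() to fault in and pin all pages up front */
    void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED, -1, 0);
    if (p == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    memset(p, 0, len);  /* touching the range causes no minor faults now */
    munmap(p, len);
    return 0;
}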

[brk system call->mm_populate()->__mm_populate()]

static inline void mm_populate(unsigned long addr, unsigned long len)
{
    /* Ignore errors */
    (void) __mm_populate(addr, len, 1);
}

/*
 * __mm_populate - populate and/or mlock pages within a range of address space.
 *
 * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
 * flags. VMAs must be already marked with the desired vm_flags, and
 * mmap_sem must not be held.
 */
/*Parameters:
@start: start address of the VMA
@len: length of the VMA
@ignore_errors: when allocating a page fails, ignore the error and continue with the next VMA
*/
int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
{
    struct mm_struct *mm = current->mm;
    unsigned long end, nstart, nend;
    struct vm_area_struct *vma = NULL;
    int locked = 0;
    long ret = 0;

    VM_BUG_ON(start & ~PAGE_MASK);
    VM_BUG_ON(len != PAGE_ALIGN(len));
    end = start + len;

    for (nstart = start; nstart < end; nstart = nend) {
        /*
         * We want to fault in pages for [nstart; end) address range.
         * Find first corresponding VMA.
         */
        if (!locked) {
            locked = 1;
            down_read(&mm->mmap_sem);
            /*Starting from nstart, look up the VMA with find_vma() first; if no VMA is found, exit the loop*/
            vma = find_vma(mm, nstart);
        } else if (nstart >= vma->vm_end)
            vma = vma->vm_next;
        if (!vma || vma->vm_start >= end)
            break;
        /*
         * Set [nstart; nend) to intersection of desired address
         * range with the first VMA. Also, skip undesirable VMA types.
         */
        nend = min(end, vma->vm_end);
        if (vma->vm_flags & (VM_IO | VM_PFNMAP))
            continue;
        if (nstart < vma->vm_start)
            nstart = vma->vm_start;
        /*
         * Now fault in a range of pages. __mlock_vma_pages_range()
         * double checks the vma flags, so that it won't mlock pages
         * if the vma was already munlocked.
         */
        /*__mlock_vma_pages_range() allocates physical memory for the VMA; we examine this function below*/
        ret = __mlock_vma_pages_range(vma, nstart, nend, &locked);
        if (ret < 0) {
            if (ignore_errors) {
                ret = 0;
                continue;   /* continue at next VMA */
            }
            ret = __mlock_posix_error_return(ret);
            break;
        }
        nend = nstart + ret * PAGE_SIZE;
        ret = 0;
    }
    if (locked)
        up_read(&mm->mmap_sem);
    return ret; /* 0 or negative error code */
}
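
__mm_populate() also backs the mlock() system call, where (per the function's own doc comment) the kernel passes ignore_errors = 0 so failures are reported to the caller. A minimal user-space sketch, again illustrative rather than from the original text:

#include <stdlib.h>
#include <sys/mman.h>

int main(void)
{
    size_t len = 2 * 4096;
    char *buf = malloc(len);

    if (!buf)
        return 1;
    /* mlock() marks the covered VMAs VM_LOCKED, then calls
     * __mm_populate(start, len, 0) to fault the pages in */
    if (mlock(buf, len) != 0)
        return 1;
    munlock(buf, len);
    free(buf);
    return 0;
}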

The implementation of __mlock_vma_pages_range(): allocate physical memory for the VMA.

[brk system call->mm_populate()->__mm_populate()->__mlock_vma_pages_range()]

/**
 * __mlock_vma_pages_range() -  mlock a range of pages in the vma.
 * @vma:   target vma
 * @start: start address
 * @end:   end address
 * @nonblocking:
 *
 * This takes care of making the pages present too.
 *
 * return 0 on success, negative error code on error.
 *
 * vma->vm_mm->mmap_sem must be held.
 *
 * If @nonblocking is NULL, it may be held for read or write and will
 * be unperturbed.
 *
 * If @nonblocking is non-NULL, it must held for read only and may be
 * released.  If it's released, *@nonblocking will be set to 0.
 */
long __mlock_vma_pages_range(struct vm_area_struct *vma,
        unsigned long start, unsigned long end, int *nonblocking)
{
    struct mm_struct *mm = vma->vm_mm;
    unsigned long nr_pages = (end - start) / PAGE_SIZE;
    int gup_flags;
    /*Some sanity checks: the start and end addresses must be page aligned. The
    VM_BUG_ON_VMA and VM_BUG_ON_MM macros only take effect when CONFIG_DEBUG_VM
    is enabled; the memory-management code often uses these macros for debugging.*/
    VM_BUG_ON(start & ~PAGE_MASK);
    VM_BUG_ON(end   & ~PAGE_MASK);
    VM_BUG_ON_VMA(start < vma->vm_start, vma);
    VM_BUG_ON_VMA(end   > vma->vm_end, vma);
    VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);

    gup_flags = FOLL_TOUCH | FOLL_MLOCK; /*set the GUP flags; their meanings are listed below*/
    /*
     * We want to touch writable mappings with a write fault in order
     * to break COW, except for shared mappings because these don't COW
     * and we would not want to dirty them for nothing.
     */
    /*If the VMA's vm_flags has the writable attribute (VM_WRITE) and the mapping is not shared, FOLL_WRITE must be set here so that the write fault breaks COW.*/
    if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
        gup_flags |= FOLL_WRITE;

    /*
     * We want mlock to succeed for regions that have any permissions
     * other than PROT_NONE.
     */
    /*If vm_flags has any of VM_READ, VM_WRITE or VM_EXEC (i.e. any permission other than PROT_NONE), set FOLL_FORCE.*/
    if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
        gup_flags |= FOLL_FORCE;

    /*
     * We made sure addr is within a VMA, so the following will
     * not result in a stack expansion that recurses back here.
     */
    /*Finally call __get_user_pages() to allocate physical memory for the process address space and establish the mappings; we examine this function below*/
    return __get_user_pages(current, mm, start, nr_pages, gup_flags,
                NULL, NULL, nonblocking);
}
Back in __mm_populate(). For reference, the FOLL_* flags used above are defined as follows:
#define FOLL_WRITE  0x01    /* check pte is writable */
#define FOLL_TOUCH  0x02    /* mark page accessed */
#define FOLL_GET    0x04    /* do get_page on page (increments _count) */
#define FOLL_DUMP   0x08    /* give error on hole if it would be zero */
#define FOLL_FORCE  0x10    /* get_user_pages read/write w/o permission */
#define FOLL_NOWAIT 0x20    /* if a disk transfer is needed, start the IO
                 * and return without waiting upon it */
#define FOLL_MLOCK  0x40    /* mark page as mlocked */
#define FOLL_SPLIT  0x80    /* don't return transhuge pages, split them */
#define FOLL_HWPOISON   0x100   /* check page is hwpoisoned */
#define FOLL_NUMA   0x200   /* force NUMA hinting page fault */
#define FOLL_MIGRATION  0x400   /* wait for page to replace migration entry */
#define FOLL_TRIED  0x800   /* a retry, previous pass started an IO */
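
As a worked illustration (a sketch, not kernel source): for a private, readable and writable VM_LOCKED VMA, __mlock_vma_pages_range() above composes the GUP flags like this:

unsigned int gup_flags = FOLL_TOUCH | FOLL_MLOCK;   /* 0x02 | 0x40 = 0x42 */

/* VM_WRITE set and VM_SHARED clear: fault writable to break COW */
gup_flags |= FOLL_WRITE;                            /* |= 0x01 -> 0x43 */

/* at least one of VM_READ/VM_WRITE/VM_EXEC set (not PROT_NONE) */
gup_flags |= FOLL_FORCE;                            /* |= 0x10 -> 0x53 */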

The implementation of __get_user_pages(): this is the interface function that allocates the physical memory.

[mm/gup.c]

[brk system call->mm_populate()->__mm_populate()->__mlock_vma_pages_range()->__get_user_pages()]

/**
 * __get_user_pages() - pin user pages in memory
 * @tsk:    task_struct of target task
 * @mm:     mm_struct of target mm
 * @start:  starting user address
 * @nr_pages:   number of pages from start to pin
 * @gup_flags:  flags modifying pin behaviour
 * @pages:  array that receives pointers to the pages pinned.
 *      Should be at least nr_pages long. Or NULL, if caller
 *      only intends to ensure the pages are faulted in.
 * @vmas:   array of pointers to vmas corresponding to each page.
 *      Or NULL if the caller does not require them.
 * @nonblocking: whether waiting for disk IO or mmap_sem contention
 *
 * Returns number of pages pinned. This may be fewer than the number
 * requested. If nr_pages is 0 or negative, returns 0. If no pages
 * were pinned, returns -errno. Each page returned must be released
 * with a put_page() call when it is finished with. vmas will only
 * remain valid while mmap_sem is held.
 *
 * Must be called with mmap_sem held.  It may be released.  See below.
 *
 * __get_user_pages walks a process's page tables and takes a reference to
 * each struct page that each user address corresponds to at a given
 * instant. That is, it takes the page that would be accessed if a user
 * thread accesses the given user virtual address at that instant.
 *
 * This does not guarantee that the page exists in the user mappings when
 * __get_user_pages returns, and there may even be a completely different
 * page there in some cases (eg. if mmapped pagecache has been invalidated
 * and subsequently re faulted). However it does guarantee that the page
 * won't be freed completely. And mostly callers simply care that the page
 * contains data that was valid *at some point in time*. Typically, an IO
 * or similar operation cannot guarantee anything stronger anyway because
 * locks can't be held over the syscall boundary.
 *
 * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
 * the page is written to, set_page_dirty (or set_page_dirty_lock, as
 * appropriate) must be called after the page is finished with, and
 * before put_page is called.
 *
 * If @nonblocking != NULL, __get_user_pages will not wait for disk IO
 * or mmap_sem contention, and if waiting is needed to pin all pages,
 * *@nonblocking will be set to 0.  Further, if @gup_flags does not
 * include FOLL_NOWAIT, the mmap_sem will be released via up_read() in
 * this case.
 *
 * A caller using such a combination of @nonblocking and @gup_flags
 * must therefore hold the mmap_sem for reading only, and recognize
 * when it's been released.  Otherwise, it must be held for either
 * reading or writing and will not be released.
 *
 * In most cases, get_user_pages or get_user_pages_fast should be used
 * instead of __get_user_pages. __get_user_pages should be used only if
 * you need some special @gup_flags.
 */
/*Parameters:
@tsk: the process's struct task_struct
@mm: the process's memory descriptor, struct mm_struct
@start: start address within the process address space (VMA)
@nr_pages: how many pages to allocate
@gup_flags: allocation (GUP) flags
@pages: double pointer receiving the struct page pointers of the physical pages
@vmas: the VMAs of the process address space
@nonblocking: whether to wait for I/O operations
*/
//called as: __get_user_pages(current, mm, start, nr_pages, gup_flags, NULL, NULL, nonblocking);
long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
        unsigned long start, unsigned long nr_pages,
        unsigned int gup_flags, struct page **pages,
        struct vm_area_struct **vmas, int *nonblocking)
{
    long i = 0;
    unsigned int page_mask;
    struct vm_area_struct *vma = NULL;

    if (!nr_pages)
        return 0;

    VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));

    /*
     * If FOLL_FORCE is set then do not force a full fault as the hinting
     * fault information is unrelated to the reference behaviour of a task
     * using the address space
     */
    if (!(gup_flags & FOLL_FORCE))
        gup_flags |= FOLL_NUMA;

    do {
        struct page *page;
        unsigned int foll_flags = gup_flags;
        unsigned int page_increm;

        /* first iteration or cross vma bound */
        if (!vma || start >= vma->vm_end) {
            /*find_extend_vma() looks up the VMA: it calls find_vma(), and if
            vma->vm_start is above the lookup address start it tries to extend the
            VMA so that vm_start reaches down to start. If find_extend_vma() finds
            no suitable VMA and start happens to fall inside the gate_vma, the gate
            page is used; this case is rare.*/
            vma = find_extend_vma(mm, start);
            if (!vma && in_gate_area(mm, start)) {
                int ret;
                ret = get_gate_page(mm, start & PAGE_MASK,
                        gup_flags, &vma,
                        pages ? &pages[i] : NULL);
                if (ret)
                    return i ? : ret;
                page_mask = 0;
                goto next_page;
            }

            if (!vma || check_vma_flags(vma, gup_flags))
                return i ? : -EFAULT;
            if (is_vm_hugetlb_page(vma)) {
                i = follow_hugetlb_page(mm, vma, pages, vmas,
                        &start, &nr_pages, i,
                        gup_flags);
                continue;
            }
        }
retry:
        /*
         * If we have a pending SIGKILL, don't keep faulting pages and
         * potentially allocating memory.
         */
        /*If the current process has received a SIGKILL signal, there is no need to keep allocating memory; return with an error right away.*/
        if (unlikely(fatal_signal_pending(current)))
            return i ? i : -ERESTARTSYS;
        /*cond_resched() checks whether the current process needs to be rescheduled; kernel code usually adds cond_resched() inside while() loops like this to improve system latency*/
        cond_resched();
    
        /*Call follow_page_mask() to check whether the virtual page in the VMA has
        already been allocated physical memory. follow_page_mask() is the concrete
        implementation of the core memory-management API follow_page(), which is
        widely used in KSM and page migration; its implementation is examined below.
        It returns the struct page of a page that already has a normal mapping in
        the user address space. If no struct page is returned, call faultin_page(),
        which goes on to call handle_mm_fault() to trigger a page fault.
        handle_mm_fault() is the core page-fault handler and will be covered later*/
        page = follow_page_mask(vma, start, foll_flags, &page_mask);
        if (!page) {
            int ret;
            ret = faultin_page(tsk, vma, start, &foll_flags,
                    nonblocking);
            switch (ret) {
            case 0:
                goto retry;
            case -EFAULT:
            case -ENOMEM:
            case -EHWPOISON:
                return i ? i : ret;
            case -EBUSY:
                return i;
            case -ENOENT:
                goto next_page;
            }
            BUG();
        }
        if (IS_ERR(page))
            return i ? i : PTR_ERR(page);
        if (pages) {
            /*After the pages are allocated, the pages pointer array points at them;
            finally flush_anon_page() and flush_dcache_page() flush the cache lines
            corresponding to these pages*/
            pages[i] = page;
            flush_anon_page(vma, page, start);
            flush_dcache_page(page);
            page_mask = 0;
        }
next_page:
        /*prepare for the next iteration*/
        if (vmas) {
            vmas[i] = vma;
            page_mask = 0;
        }
        page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
        if (page_increm > nr_pages)
            page_increm = nr_pages;
        i += page_increm;
        start += page_increm * PAGE_SIZE;
        nr_pages -= page_increm;
    } while (nr_pages);
    return i;
}
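
As the comment block above says, most callers should use get_user_pages() rather than __get_user_pages(). Below is a minimal sketch of a kernel-side caller, assuming the eight-argument get_user_pages() signature of the kernel version quoted here; pin_one_user_page() is a hypothetical helper, not kernel code:

static int pin_one_user_page(unsigned long uaddr, struct page **pagep)
{
    int ret;

    down_read(&current->mm->mmap_sem);
    /* write = 1, force = 0: fault the page in writable, honouring the
     * VMA protections; on success the page's _count is elevated */
    ret = get_user_pages(current, current->mm, uaddr & PAGE_MASK,
                         1, 1, 0, pagep, NULL);
    up_read(&current->mm->mmap_sem);

    if (ret < 1)
        return ret < 0 ? ret : -EFAULT;
    /* the caller must set_page_dirty_lock() after writing to the page,
     * then put_page() to drop the reference taken here */
    return 0;
}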

The follow_page() function:

What it does: walk from the pgd down to the struct page.

1.  From the virtual address vaddr, find the pte by walking the page tables.

2.  From the pte, get the page frame number pfn, then find the corresponding struct page in mem_map[].

    2.1 From the vma, get the mm it belongs to.

    2.2 mm->pgd is the base of the process page table.

    2.3 From mm->pgd and address, get the pgd entry for address (using the pgd_offset(mm, addr) macro).

    2.4 From the pgd, get the pte; in the 2-level ARM page tables there is no pud or pmd (they fold back onto the pgd), and the code obtains the pte via pte_offset_map_lock().

Note: the PFN is the page frame number, i.e. the index into the mem_map[] array (the array of struct page descriptors for all physical pages), through which the struct page can be found.

https://blog.csdn.net/wowricky/article/details/81055208
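
A self-contained sketch of that walk (my own illustration, with huge pages, locking and error reporting omitted; on 2-level page tables the pud/pmd levels fold back onto the pgd):

static struct page *vaddr_to_page(struct mm_struct *mm, unsigned long addr)
{
    pgd_t *pgd = pgd_offset(mm, addr);   /* mm->pgd + pgd_index(addr) */
    pud_t *pud;
    pmd_t *pmd;
    pte_t *pte;
    struct page *page = NULL;

    if (pgd_none(*pgd) || pgd_bad(*pgd))
        return NULL;
    pud = pud_offset(pgd, addr);
    if (pud_none(*pud) || pud_bad(*pud))
        return NULL;
    pmd = pmd_offset(pud, addr);
    if (pmd_none(*pmd) || pmd_bad(*pmd))
        return NULL;
    pte = pte_offset_map(pmd, addr);
    if (pte_present(*pte))
        page = pfn_to_page(pte_pfn(*pte));  /* pfn indexes mem_map[] */
    pte_unmap(pte);
    return page;
}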

[include/linux/mm.h]

static inline struct page *follow_page(struct vm_area_struct *vma,
        unsigned long address, unsigned int foll_flags)
{
    unsigned int unused_page_mask;
    return follow_page_mask(vma, address, foll_flags, &unused_page_mask);
}

The implementation of follow_page_mask(): look up the page descriptor for a user-space virtual address.

[mm/gup.c]

[follow_page()->follow_page_mask()]


/**
 * follow_page_mask - look up a page descriptor from a user-virtual address
 * @vma: vm_area_struct mapping @address
 * @address: virtual address to look up
 * @flags: flags modifying lookup behaviour
 * @page_mask: on output, *page_mask is set according to the size of the page
 *
 * @flags can have FOLL_ flags set, defined in <linux/mm.h>
 *
 * Returns the mapped (struct page *), %NULL if no mapping exists, or
 * an error pointer if there is a mapping to something not represented
 * by a page descriptor (see also vm_normal_page()).
 */
/*This function has many branches handling huge pages; the huge-page related code is ignored here for now*/
struct page *follow_page_mask(struct vm_area_struct *vma,
                  unsigned long address, unsigned int flags,
                  unsigned int *page_mask)
{
    pgd_t *pgd;
    pud_t *pud;
    pmd_t *pmd;
    spinlock_t *ptl;
    struct page *page;
    struct mm_struct *mm = vma->vm_mm;

    *page_mask = 0;

    page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
    if (!IS_ERR(page)) {
        BUG_ON(flags & FOLL_GET);
        return page;
    }

    /*The pgd_offset() helper uses mm and the address addr to find the PGD entry in
    the current process's page table; the pgd member of the process's struct
    mm_struct (mm->pgd) points to the base of the process page table. If the PGD
    entry is empty or invalid, return an error. The PUD and PMD levels are then
    checked; with 2-level page tables the PUD and PMD fold back onto the PGD.
    Finally follow_page_pte() is called to examine the PTE page table*/
    pgd = pgd_offset(mm, address);
    if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
        return no_page_table(vma, flags);

    pud = pud_offset(pgd, address);
    if (pud_none(*pud))
        return no_page_table(vma, flags);
    if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
        page = follow_huge_pud(mm, address, pud, flags);
        if (page)
            return page;
        return no_page_table(vma, flags);
    }
    if (unlikely(pud_bad(*pud)))
        return no_page_table(vma, flags);

    pmd = pmd_offset(pud, address);
    if (pmd_none(*pmd))
        return no_page_table(vma, flags);
    if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
        page = follow_huge_pmd(mm, address, pmd, flags);
        if (page)
            return page;
        return no_page_table(vma, flags);
    }
    if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
        return no_page_table(vma, flags);
    if (pmd_trans_huge(*pmd)) {
        if (flags & FOLL_SPLIT) {
            split_huge_page_pmd(vma, address, pmd);
            return follow_page_pte(vma, address, pmd, flags);
        }
        ptl = pmd_lock(mm, pmd);
        if (likely(pmd_trans_huge(*pmd))) {
            if (unlikely(pmd_trans_splitting(*pmd))) {
                spin_unlock(ptl);
                wait_split_huge_page(vma->anon_vma, pmd);
            } else {
                page = follow_trans_huge_pmd(vma, address,
                                 pmd, flags);
                spin_unlock(ptl);
                *page_mask = HPAGE_PMD_NR - 1;
                return page;
            }
        } else
            spin_unlock(ptl);
    }
    /*see the follow_page_pte() function below*/
    return follow_page_pte(vma, address, pmd, flags);
}

Back in __get_user_pages().

The implementation of follow_page_pte():

[follow_page()->follow_page_mask()->follow_page_pte()]

static struct page *follow_page_pte(struct vm_area_struct *vma,
        unsigned long address, pmd_t *pmd, unsigned int flags)
{
    struct mm_struct *mm = vma->vm_mm;
    struct page *page;
    spinlock_t *ptl;
    pte_t *ptep, pte;

retry:
    /*check that the pmd entry is valid*/
    if (unlikely(pmd_bad(*pmd)))
        return no_page_table(vma, flags);

    /*The pte_offset_map_lock() macro gets the pte entry from the PMD and the
    address, and also takes a spinlock; before returning, pte_unmap_unlock() must be
    called to release the spinlock. The macro's implementation is shown further below*/
    ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
    pte = *ptep;
    /*pte_present() checks whether the L_PTE_PRESENT bit is set in the pte entry; this bit indicates that the page is present in memory.*/
    if (!pte_present(pte)) {
        //this handles the case where the page is not present in memory
        swp_entry_t entry;
        /*
         * KSM's break_ksm() relies upon recognizing a ksm page
         * even while it is being migrated, so for that case we
         * need migration_entry_wait().
         */
        /*If the GUP flags do not include FOLL_MIGRATION, i.e. the caller does not want to wait on a page that is being migrated, bail out*/
        if (likely(!(flags & FOLL_MIGRATION)))
            goto no_page;
        /*if the pte is empty (none), bail out*/
        if (pte_none(pte))
            goto no_page;
        entry = pte_to_swp_entry(pte);
        /*If the pte is a migration entry, i.e. a swap-style entry for a page that
        is in the middle of being migrated, call migration_entry_wait() to wait for
        the migration to finish and then retry.*/
        if (!is_migration_entry(entry))
            goto no_page;
        pte_unmap_unlock(ptep, ptl);
        migration_entry_wait(mm, pmd, address);
        goto retry;
    }
    if ((flags & FOLL_NUMA) && pte_protnone(pte))
        goto no_page;
    /*If the GUP flags request write access (FOLL_WRITE) but the pte entry is read-only, return NULL.*/
    if ((flags & FOLL_WRITE) && !pte_write(pte)) {
        pte_unmap_unlock(ptep, ptl);
        return NULL;
    }

    /*vm_normal_page() returns the struct page of a normal-mapping page based on
    the pte; we examine its implementation below*/
    page = vm_normal_page(vma, address, pte);
    if (unlikely(!page)) {
        if ((flags & FOLL_DUMP) ||
            !is_zero_pfn(pte_pfn(pte)))
            goto bad_page;
        page = pte_page(pte);
    }

    /*If FOLL_GET is set in flags, get_page_foll() increments the page's _count.*/
    if (flags & FOLL_GET)
        get_page_foll(page);

    /*When FOLL_TOUCH is set in flags, the page must be marked accessed:
    mark_page_accessed() marks the page active; it is a core helper of page reclaim*/
    if (flags & FOLL_TOUCH) {
        if ((flags & FOLL_WRITE) &&
            !pte_dirty(pte) && !PageDirty(page))
            set_page_dirty(page);
        /*
         * pte_mkyoung() would be more correct here, but atomic care
         * is needed to avoid losing the dirty bit: it is easier to use
         * mark_page_accessed().
         */
        mark_page_accessed(page);
    }
    /*the caller wants to lock the page in memory*/
    if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
        /*
         * The preliminary mapping check is mainly to avoid the
         * pointless overhead of lock_page on the ZERO_PAGE
         * which might bounce very badly if there is contention.
         *
         * If the page is already locked, we don't need to
         * handle it now - vmscan will handle it later if and
         * when it attempts to reclaim the page.
         */
       /*lock the page so that it is not swapped out to backing storage*/
        if (page->mapping && trylock_page(page)) {
            lru_add_drain();  /* push cached pages to LRU */
            /*
             * Because we lock page here, and migration is
             * blocked by the pte's page reference, and we
             * know the page is still mapped, we don't even
             * need to check for file-cache page truncation.
             */
            mlock_vma_page(page);
            unlock_page(page);
        }
    }
    pte_unmap_unlock(ptep, ptl);
    return page;
bad_page:
    pte_unmap_unlock(ptep, ptl);
    return ERR_PTR(-EFAULT);

no_page:
    pte_unmap_unlock(ptep, ptl);
    if (!pte_none(pte))
        return NULL;
    return no_page_table(vma, flags);
}
Back in follow_page_mask().

The pte_offset_map_lock() macro:

It obtains the pte entry from the PMD and the address addr, and also takes a spinlock; on return the caller must call pte_unmap_unlock() to release the spinlock.

[include/linux/mm.h]

[follow_page()->follow_page_mask()->follow_page_pte()->pte_offset_map_lock()]

#define pte_offset_map_lock(mm, pmd, address, ptlp) \
({                          \
    spinlock_t *__ptl = pte_lockptr(mm, pmd);   \
    pte_t *__pte = pte_offset_map(pmd, address);    \
    *(ptlp) = __ptl;                \
    spin_lock(__ptl);               \
    __pte;                      \
})
#define pte_offset_map(pmd,addr)    (__pte_map(pmd) + pte_index(addr))

#define pte_index(addr)     (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
#define PTRS_PER_PTE        512
#define PAGE_SHIFT  12

#define __pte_map(pmd)      (pte_t *)kmap_atomic(pmd_page(*(pmd)))
#define pmd_page(pmd)       pfn_to_page(__phys_to_pfn(pmd_val(pmd) & PHYS_MASK))
#define pmd_val(x)      (x)
#define __phys_to_pfn(paddr)    ((unsigned long)((paddr) >> PAGE_SHIFT))
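
A worked example with a hypothetical address, using the constants above (PAGE_SHIFT = 12, PTRS_PER_PTE = 512):

unsigned long addr = 0x40352000UL;              /* hypothetical user address */
unsigned long idx  = (addr >> 12) & (512 - 1);  /* pte_index(addr) */
/* addr >> 12 = 0x40352; 0x40352 & 0x1ff = 0x152 = 338,
 * so this address uses slot 338 of its PTE table */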

The implementation of vm_normal_page(): return the struct page of a normal-mapping page based on the pte.

[mm/memory.c]

[follow_page()->follow_page_mask()->follow_page_pte()->vm_normal_page()]

vm_normal_page() is an interesting function: it returns the struct page of a normal-mapping page. Some specially mapped pages do not get a struct page returned; such pages are not meant to take part in memory-management activities like page reclaim, page migration and KSM. The HAVE_PTE_SPECIAL feature uses a spare bit in the PTE entry for this; the ARM32 3-level page tables and the ARM64 code make use of it, while the ARM32 2-level page tables do not implement it. On ARM64 the PTE_SPECIAL bit is defined; note that it uses a bit the hardware leaves free:

[arch/arm64/include/asm/pgtable.h]

/*
 * Software defined PTE bits definition.
 */
#define PTE_VALID       (_AT(pteval_t, 1) << 0)
#define PTE_DIRTY       (_AT(pteval_t, 1) << 55)
#define PTE_SPECIAL     (_AT(pteval_t, 1) << 56)
#define PTE_WRITE       (_AT(pteval_t, 1) << 57)
#define PTE_PROT_NONE       (_AT(pteval_t, 1) << 58) /* only when !PTE_VALID */

The kernel usually sets the software-defined PTE_SPECIAL bit with the pte_mkspecial() macro, mainly for the following uses:

1. the kernel's zero page;
2. the many drivers that use remap_pfn_range() to map kernel pages into user space; the VMAs these user programs use are typically marked (VM_IO|VM_PFNMAP|VM_DONTEXPAND|VM_DONTDUMP) — a sketch follows the function below;
3. vm_insert_page()/vm_insert_pfn(), which map kernel pages into user space.

vm_normal_page() divides pages into two camps: normal pages and special pages.

1. Normal pages are pages with a normal mapping, for example anonymous pages, page cache and shared memory pages.
2. Special pages are abnormally mapped pages that should not take part in memory-management reclaim or merging, for example mappings with:
   2.1 VM_IO: memory mapped for I/O devices;
   2.2 VM_PFNMAP: pure PFN mappings;
   2.3 VM_MIXEDMAP: mixed mappings.
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
                pte_t pte)
{
    unsigned long pfn = pte_pfn(pte);

    /*special mappings, see the explanation above. If the PTE_SPECIAL bit of the pte is not set, jump to check_pfn and continue the checks.*/
    if (HAVE_PTE_SPECIAL) {
        if (likely(!pte_special(pte)))
            goto check_pfn;
        if (vma->vm_ops && vma->vm_ops->find_special_page)
            return vma->vm_ops->find_special_page(vma, addr);
         /*if vm_flags has either of the two flags below, this is a special mapping: return NULL*/
        if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
            return NULL;
        if (!is_zero_pfn(pfn))
            print_bad_pte(vma, addr, pte, NULL);
        return NULL;
    }

    /* !HAVE_PTE_SPECIAL case follows: */
    /*when HAVE_PTE_SPECIAL is not defined, check the (VM_PFNMAP|VM_MIXEDMAP) cases here*/
    if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
        if (vma->vm_flags & VM_MIXEDMAP) {
            if (!pfn_valid(pfn))
                return NULL;
            goto out;
        } else {
            unsigned long off;
            off = (addr - vma->vm_start) >> PAGE_SHIFT;
            if (pfn == vma->vm_pgoff + off) /*a linear PFN mapping: the special mapping case*/
                return NULL;
            /*the virtual address maps linearly to the pfn; but if the mapping is a COW mapping (copy-on-write), the page is still treated as a normal page*/
            if (!is_cow_mapping(vma->vm_flags))
                return NULL;
        }
    }

    if (is_zero_pfn(pfn))
        return NULL;
check_pfn:
    if (unlikely(pfn > highest_memmap_pfn)) {
        print_bad_pte(vma, addr, pte, NULL);
        return NULL;
    }

    /*
     * NOTE! We still have PageReserved() pages in the page tables.
     * eg. VDSO mappings can cause them to exist.
     */
out:
    /*finally return the struct page instance via pfn_to_page()*/
    return pfn_to_page(pfn);
}
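
As promised above, a minimal sketch of the remap_pfn_range() case; mydrv_mmap() and MYDRV_PHYS_BASE are hypothetical, the point being that the resulting VMA carries VM_IO | VM_PFNMAP, so vm_normal_page() treats its PTEs as special and returns NULL for them:

static int mydrv_mmap(struct file *filp, struct vm_area_struct *vma)
{
    unsigned long size = vma->vm_end - vma->vm_start;

    /* MYDRV_PHYS_BASE is an assumed device physical base address;
     * remap_pfn_range() marks the VMA VM_IO | VM_PFNMAP | VM_DONTEXPAND |
     * VM_DONTDUMP, so these pages never get a struct page from
     * vm_normal_page() */
    return remap_pfn_range(vma, vma->vm_start,
                           MYDRV_PHYS_BASE >> PAGE_SHIFT,
                           size, vma->vm_page_prot);
}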
Back in follow_page_pte().
