When the VM_LOCKED flag is specified, physical pages must be allocated for this VMA of the process address space and the mappings established immediately. The mm_populate() function (include/linux/mm.h) internally calls __mm_populate().
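As a minimal user-space illustration (not from the original text): mlockall(MCL_FUTURE) sets VM_LOCKED in mm->def_flags, so a later heap extension through brk()/sbrk() takes exactly the path analysed below; mmap() with MAP_POPULATE or MAP_LOCKED reaches the same code.
#include <sys/mman.h>
#include <unistd.h>
#include <string.h>

int main(void)
{
    /* future mappings (including heap growth via brk) get VM_LOCKED */
    if (mlockall(MCL_CURRENT | MCL_FUTURE))
        return 1;
    /* grow the heap; the kernel populates and locks the new pages right away */
    char *p = sbrk(16 * 4096);
    if (p == (char *)-1)
        return 1;
    memset(p, 0, 16 * 4096);   /* no page faults here: the pages are already present */
    return 0;
}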
brk system call -> mm_populate() -> __mm_populate()
static inline void mm_populate(unsigned long addr, unsigned long len)
{
/* Ignore errors */
(void) __mm_populate(addr, len, 1);
}
/*
* __mm_populate - populate and/or mlock pages within a range of address space.
*
* This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
* flags. VMAs must be already marked with the desired vm_flags, and
* mmap_sem must not be held.
*/
/* Parameters:
@start: start address of the VMA range
@len: length of the range
@ignore_errors: when faulting in pages fails, skip to the next VMA and keep going
*/
int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
{
struct mm_struct *mm = current->mm;
unsigned long end, nstart, nend;
struct vm_area_struct *vma = NULL;
int locked = 0;
long ret = 0;
VM_BUG_ON(start & ~PAGE_MASK);
VM_BUG_ON(len != PAGE_ALIGN(len));
end = start + len;
for (nstart = start; nstart < end; nstart = nend) {
/*
* We want to fault in pages for [nstart; end) address range.
* Find first corresponding VMA.
*/
if (!locked) {
locked = 1;
down_read(&mm->mmap_sem);
/* Look up the VMA covering nstart with find_vma(); if no VMA is found, the loop exits below. */
vma = find_vma(mm, nstart);
} else if (nstart >= vma->vm_end)
vma = vma->vm_next;
if (!vma || vma->vm_start >= end)
break;
/*
* Set [nstart; nend) to intersection of desired address
* range with the first VMA. Also, skip undesirable VMA types.
*/
nend = min(end, vma->vm_end);
if (vma->vm_flags & (VM_IO | VM_PFNMAP))
continue;
if (nstart < vma->vm_start)
nstart = vma->vm_start;
/*
* Now fault in a range of pages. __mlock_vma_pages_range()
* double checks the vma flags, so that it won't mlock pages
* if the vma was already munlocked.
*/
/* __mlock_vma_pages_range() faults in physical pages for this VMA range; the function is examined below. */
ret = __mlock_vma_pages_range(vma, nstart, nend, &locked);
if (ret < 0) {
if (ignore_errors) {
ret = 0;
continue; /* continue at next VMA */
}
ret = __mlock_posix_error_return(ret);
break;
}
nend = nstart + ret * PAGE_SIZE;
ret = 0;
}
if (locked)
up_read(&mm->mmap_sem);
return ret; /* 0 or negative error code */
}
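For context, an abridged sketch of the brk() call site (paraphrased from memory against the ~v4.0 sources; details vary between kernel versions). Note that mm_populate() is called only after mmap_sem has been dropped, which is why __mm_populate() takes it for reading itself.
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
    unsigned long newbrk, oldbrk;
    struct mm_struct *mm = current->mm;
    bool populate;

    down_write(&mm->mmap_sem);
    /* ... rlimit and overlap checks omitted ... */
    newbrk = PAGE_ALIGN(brk);
    oldbrk = PAGE_ALIGN(mm->brk);
    /* ... do_brk() extends the heap VMA and mm->brk is updated ... */
    populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
    up_write(&mm->mmap_sem);
    if (populate)
        mm_populate(oldbrk, newbrk - oldbrk);
    return brk;
}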
Implementation of __mlock_vma_pages_range(): faults in physical memory for a VMA range
[brk system call -> mm_populate() -> __mm_populate() -> __mlock_vma_pages_range()]
/**
* __mlock_vma_pages_range() - mlock a range of pages in the vma.
* @vma: target vma
* @start: start address
* @end: end address
* @nonblocking:
*
* This takes care of making the pages present too.
*
* return 0 on success, negative error code on error.
*
* vma->vm_mm->mmap_sem must be held.
*
* If @nonblocking is NULL, it may be held for read or write and will
* be unperturbed.
*
* If @nonblocking is non-NULL, it must held for read only and may be
* released. If it's released, *@nonblocking will be set to 0.
*/
long __mlock_vma_pages_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end, int *nonblocking)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long nr_pages = (end - start) / PAGE_SIZE;
int gup_flags;
/* Sanity checks: start and end must be page aligned. The VM_BUG_ON_VMA and VM_BUG_ON_MM macros only take
effect when CONFIG_DEBUG_VM is enabled; the memory-management code uses these macros heavily for debugging. */
VM_BUG_ON(start & ~PAGE_MASK);
VM_BUG_ON(end & ~PAGE_MASK);
VM_BUG_ON_VMA(start < vma->vm_start, vma);
VM_BUG_ON_VMA(end > vma->vm_end, vma);
VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
gup_flags = FOLL_TOUCH | FOLL_MLOCK;/* set the GUP flags; their meanings are listed below */
/*
* We want to touch writable mappings with a write fault in order
* to break COW, except for shared mappings because these don't COW
* and we would not want to dirty them for nothing.
*/
/* If the VMA is writable (VM_WRITE) but not shared, FOLL_WRITE must be set here so the write fault breaks COW. */
if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
gup_flags |= FOLL_WRITE;
/*
* We want mlock to succeed for regions that have any permissions
* other than PROT_NONE.
*/
/* If vm_flags has any of VM_READ, VM_WRITE or VM_EXEC, i.e. any permission other than PROT_NONE, set FOLL_FORCE. */
if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
gup_flags |= FOLL_FORCE;
/*
* We made sure addr is within a VMA, so the following will
* not result in a stack expansion that recurses back here.
*/
/* Finally, __get_user_pages() is called to allocate physical memory for this range of the process address space and establish the mappings; its implementation is examined below. */
return __get_user_pages(current, mm, start, nr_pages, gup_flags,
NULL, NULL, nonblocking);
}
/* back to __mm_populate() */
#define FOLL_WRITE 0x01 /* check pte is writable */
#define FOLL_TOUCH 0x02 /* mark page accessed */
#define FOLL_GET 0x04 /* do get_page on page, i.e. take a reference and raise its _count */
#define FOLL_DUMP 0x08 /* give error on hole if it would be zero */
#define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */
#define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO
* and return without waiting upon it */
#define FOLL_MLOCK 0x40 /* mark page as mlocked */
#define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */
#define FOLL_HWPOISON 0x100 /* check page is hwpoisoned (hardware poisoned) */
#define FOLL_NUMA 0x200 /* force NUMA hinting page fault */
#define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */
#define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */
Implementation of __get_user_pages(): the interface function that faults in and pins physical pages
[mm/gup.c]
[brk system call -> mm_populate() -> __mm_populate() -> __mlock_vma_pages_range() -> __get_user_pages()]
/**
* __get_user_pages() - pin user pages in memory
* @tsk: task_struct of target task
* @mm: mm_struct of target mm
* @start: starting user address
* @nr_pages: number of pages from start to pin
* @gup_flags: flags modifying pin behaviour
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long. Or NULL, if caller
* only intends to ensure the pages are faulted in.
* @vmas: array of pointers to vmas corresponding to each page.
* Or NULL if the caller does not require them.
* @nonblocking: whether waiting for disk IO or mmap_sem contention
*
* Returns number of pages pinned. This may be fewer than the number
* requested. If nr_pages is 0 or negative, returns 0. If no pages
* were pinned, returns -errno. Each page returned must be released
* with a put_page() call when it is finished with. vmas will only
* remain valid while mmap_sem is held.
*
* Must be called with mmap_sem held. It may be released. See below.
*
* __get_user_pages walks a process's page tables and takes a reference to
* each struct page that each user address corresponds to at a given
* instant. That is, it takes the page that would be accessed if a user
* thread accesses the given user virtual address at that instant.
*
* This does not guarantee that the page exists in the user mappings when
* __get_user_pages returns, and there may even be a completely different
* page there in some cases (eg. if mmapped pagecache has been invalidated
* and subsequently re faulted). However it does guarantee that the page
* won't be freed completely. And mostly callers simply care that the page
* contains data that was valid *at some point in time*. Typically, an IO
* or similar operation cannot guarantee anything stronger anyway because
* locks can't be held over the syscall boundary.
*
* If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
* the page is written to, set_page_dirty (or set_page_dirty_lock, as
* appropriate) must be called after the page is finished with, and
* before put_page is called.
*
* If @nonblocking != NULL, __get_user_pages will not wait for disk IO
* or mmap_sem contention, and if waiting is needed to pin all pages,
* *@nonblocking will be set to 0. Further, if @gup_flags does not
* include FOLL_NOWAIT, the mmap_sem will be released via up_read() in
* this case.
*
* A caller using such a combination of @nonblocking and @gup_flags
* must therefore hold the mmap_sem for reading only, and recognize
* when it's been released. Otherwise, it must be held for either
* reading or writing and will not be released.
*
* In most cases, get_user_pages or get_user_pages_fast should be used
* instead of __get_user_pages. __get_user_pages should be used only if
* you need some special @gup_flags.
*/
/* Parameters:
@tsk: the task_struct of the target process
@mm: the mm_struct describing the target address space
@start: starting address in the process address space (inside the VMA)
@nr_pages: how many pages to fault in / pin
@gup_flags: the GUP flags (allocation mask)
@pages: array that receives the struct page pointers of the pinned physical pages (may be NULL)
@vmas: array that receives the corresponding VMA pointers (may be NULL)
@nonblocking: whether to wait for disk I/O or mmap_sem contention
*/
//__get_user_pages(current, mm, start, nr_pages, gup_flags,NULL, NULL, nonblocking);
long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *nonblocking)
{
long i = 0;
unsigned int page_mask;
struct vm_area_struct *vma = NULL;
if (!nr_pages)
return 0;
VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
/*
* If FOLL_FORCE is set then do not force a full fault as the hinting
* fault information is unrelated to the reference behaviour of a task
* using the address space
*/
if (!(gup_flags & FOLL_FORCE))
gup_flags |= FOLL_NUMA;
do {
struct page *page;
unsigned int foll_flags = gup_flags;
unsigned int page_increm;
/* first iteration or cross vma bound */
if (!vma || start >= vma->vm_end) {
/* find_extend_vma() looks up the VMA: it calls find_vma(), and if vma->vm_start lies above the lookup address start
it tries to grow the VMA (a growable stack) so that vm_start covers start. If find_extend_vma() finds no suitable
VMA and start happens to fall inside the gate_vma, the gate page is used; that case is rare. */
vma = find_extend_vma(mm, start);
if (!vma && in_gate_area(mm, start)) {
int ret;
ret = get_gate_page(mm, start & PAGE_MASK,
gup_flags, &vma,
pages ? &pages[i] : NULL);
if (ret)
return i ? : ret;
page_mask = 0;
goto next_page;
}
if (!vma || check_vma_flags(vma, gup_flags))
return i ? : -EFAULT;
if (is_vm_hugetlb_page(vma)) {
i = follow_hugetlb_page(mm, vma, pages, vmas,
&start, &nr_pages, i,
gup_flags);
continue;
}
}
retry:
/*
* If we have a pending SIGKILL, don't keep faulting pages and
* potentially allocating memory.
*/
/* If the current process has a pending SIGKILL, there is no point in continuing to fault in pages and potentially allocating memory; return an error right away. */
if (unlikely(fatal_signal_pending(current)))
return i ? i : -ERESTARTSYS;
/* cond_resched() checks whether the current process should be rescheduled; kernel code commonly adds cond_resched() inside loops like this to keep scheduling latency low. */
cond_resched();
/* follow_page_mask() checks whether the virtual page in this VMA already has physical memory mapped. It is the
implementation behind the core memory-management API follow_page(), which is widely used in page migration and KSM.
It returns the struct page of a user-address page that already has a normal mapping. If no page is returned,
faultin_page() is called, which in turn calls handle_mm_fault() to trigger a page fault. handle_mm_fault() is the
core of the page-fault handling path and is covered later. */
page = follow_page_mask(vma, start, foll_flags, &page_mask);
if (!page) {
int ret;
ret = faultin_page(tsk, vma, start, &foll_flags,
nonblocking);
switch (ret) {
case 0:
goto retry;
case -EFAULT:
case -ENOMEM:
case -EHWPOISON:
return i ? i : ret;
case -EBUSY:
return i;
case -ENOENT:
goto next_page;
}
BUG();
}
if (IS_ERR(page))
return i ? i : PTR_ERR(page);
if (pages) {
/* Once a page has been obtained, the pages array is made to point at it, and flush_anon_page() and
flush_dcache_page() are called to flush the caches for that page. */
pages[i] = page;
flush_anon_page(vma, page, start);
flush_dcache_page(page);
page_mask = 0;
}
next_page:
/* prepare for the next loop iteration */
if (vmas) {
vmas[i] = vma;
page_mask = 0;
}
page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
if (page_increm > nr_pages)
page_increm = nr_pages;
i += page_increm;
start += page_increm * PAGE_SIZE;
nr_pages -= page_increm;
} while (nr_pages);
return i;
}
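As the kernel-doc above points out, most callers should use get_user_pages() or get_user_pages_fast() rather than __get_user_pages(). Below is a hedged sketch of a hypothetical driver helper pinning a user buffer; the 8-argument get_user_pages() signature matches the kernel generation quoted in this walkthrough (around v4.0) and changed in later kernels.
#include <linux/mm.h>
#include <linux/sched.h>

/* Pin nr_pages of a user buffer for writing; returns the number pinned or -errno. */
static long pin_user_buffer(unsigned long uaddr, unsigned long nr_pages, struct page **pages)
{
    long npinned;

    down_read(&current->mm->mmap_sem);        /* get_user_pages() requires mmap_sem */
    npinned = get_user_pages(current, current->mm, uaddr & PAGE_MASK, nr_pages,
                             1 /* write */, 0 /* force */, pages, NULL);
    up_read(&current->mm->mmap_sem);
    return npinned;                           /* may be fewer than nr_pages */
}

/* Release the references taken by FOLL_GET; mark the pages dirty if we wrote to them. */
static void unpin_user_buffer(struct page **pages, long npinned, bool dirty)
{
    long i;

    for (i = 0; i < npinned; i++) {
        if (dirty)
            set_page_dirty_lock(pages[i]);
        put_page(pages[i]);
    }
}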
The follow_page() function:
Purpose: walk from the pgd all the way down to the struct page.
1. Starting from the virtual address vaddr, walk the page tables to find the pte.
2. From the pte take the page frame number (pfn), then locate the corresponding struct page in mem_map[].
2.1 From the vma, get the mm it belongs to.
2.2 mm->pgd is the base address of the process page table.
2.3 From mm->pgd and address, get the pgd entry for address (the pgd_offset(mm, addr) macro).
2.4 From the pgd walk down to the pte; on ARM with 2-level page tables the pud and pmd are folded away, and the code obtains the pte through pte_offset_map_lock(). (A condensed walk is sketched right below.)
Note: the PFN is the page frame number, i.e. the index into mem_map[], the array of struct page descriptors for all physical pages; from this array the struct page can be found.
https://blog.csdn.net/wowricky/article/details/81055208
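A condensed sketch of steps 1-2 above (a hypothetical helper, not from the kernel tree; it ignores huge pages, locking and non-present entries, all of which follow_page_mask() below handles properly). On FLATMEM configurations pfn_to_page(pfn) is essentially &mem_map[pfn - ARCH_PFN_OFFSET].
static struct page *walk_to_page(struct mm_struct *mm, unsigned long addr)
{
    pgd_t *pgd = pgd_offset(mm, addr);     /* mm->pgd + pgd_index(addr) */
    pud_t *pud;
    pmd_t *pmd;
    pte_t *ptep, pte;

    if (pgd_none(*pgd) || pgd_bad(*pgd))
        return NULL;
    pud = pud_offset(pgd, addr);           /* folded onto the pgd on 2-level ARM */
    if (pud_none(*pud) || pud_bad(*pud))
        return NULL;
    pmd = pmd_offset(pud, addr);
    if (pmd_none(*pmd) || pmd_bad(*pmd))
        return NULL;
    ptep = pte_offset_map(pmd, addr);      /* may kmap_atomic() the pte page on 32-bit */
    pte = *ptep;
    pte_unmap(ptep);
    if (!pte_present(pte))
        return NULL;
    return pfn_to_page(pte_pfn(pte));      /* pfn -> struct page via mem_map[] */
}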
[include/linux/mm.h]
static inline struct page *follow_page(struct vm_area_struct *vma,
unsigned long address, unsigned int foll_flags)
{
unsigned int unused_page_mask;
return follow_page_mask(vma, address, foll_flags, &unused_page_mask);
}
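A usage sketch (a hypothetical caller, modelled on how KSM uses follow_page(); mmap_sem must be held by the caller): with FOLL_GET the returned page carries an extra reference that must eventually be dropped with put_page().
static struct page *grab_mapped_page(struct vm_area_struct *vma, unsigned long addr)
{
    struct page *page;

    page = follow_page(vma, addr, FOLL_GET);   /* NULL/ERR_PTR if unmapped or special */
    if (IS_ERR_OR_NULL(page))
        return NULL;
    /* the page cannot be freed while we hold the reference */
    return page;                               /* caller must put_page() when done */
}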
Implementation of follow_page_mask():
looks up a page descriptor for a user-space virtual address
[mm/gup.c]
[follow_page()->follow_page_mask()]
/**
* follow_page_mask - look up a page descriptor from a user-virtual address
* @vma: vm_area_struct mapping @address
* @address: virtual address to look up
* @flags: flags modifying lookup behaviour
* @page_mask: on output, *page_mask is set according to the size of the page
*
* @flags can have FOLL_ flags set, defined in <linux/mm.h>
*
* Returns the mapped (struct page *), %NULL if no mapping exists, or
* an error pointer if there is a mapping to something not represented
* by a page descriptor (see also vm_normal_page()).
*/
/* This function contains a lot of huge-page handling; the huge-page related code is ignored for now. */
struct page *follow_page_mask(struct vm_area_struct *vma,
unsigned long address, unsigned int flags,
unsigned int *page_mask)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
spinlock_t *ptl;
struct page *page;
struct mm_struct *mm = vma->vm_mm;
*page_mask = 0;
page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
if (!IS_ERR(page)) {
BUG_ON(flags & FOLL_GET);
return page;
}
/* The pgd_offset() helper finds the PGD entry of the current process page table from mm and the address addr;
the pgd member of the process's struct mm_struct (mm->pgd) points to the base of the process page table. If the
PGD entry is empty or invalid, return an error. The PUD and PMD are then checked; with a 2-level page table the
PUD and PMD fold back onto the PGD. Finally follow_page_pte() is called to examine the PTE. */
pgd = pgd_offset(mm, address);
if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
return no_page_table(vma, flags);
pud = pud_offset(pgd, address);
if (pud_none(*pud))
return no_page_table(vma, flags);
if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
page = follow_huge_pud(mm, address, pud, flags);
if (page)
return page;
return no_page_table(vma, flags);
}
if (unlikely(pud_bad(*pud)))
return no_page_table(vma, flags);
pmd = pmd_offset(pud, address);
if (pmd_none(*pmd))
return no_page_table(vma, flags);
if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
page = follow_huge_pmd(mm, address, pmd, flags);
if (page)
return page;
return no_page_table(vma, flags);
}
if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
return no_page_table(vma, flags);
if (pmd_trans_huge(*pmd)) {
if (flags & FOLL_SPLIT) {
split_huge_page_pmd(vma, address, pmd);
return follow_page_pte(vma, address, pmd, flags);
}
ptl = pmd_lock(mm, pmd);
if (likely(pmd_trans_huge(*pmd))) {
if (unlikely(pmd_trans_splitting(*pmd))) {
spin_unlock(ptl);
wait_split_huge_page(vma->anon_vma, pmd);
} else {
page = follow_trans_huge_pmd(vma, address,
pmd, flags);
spin_unlock(ptl);
*page_mask = HPAGE_PMD_NR - 1;
return page;
}
} else
spin_unlock(ptl);
}
/* see follow_page_pte() below */
return follow_page_pte(vma, address, pmd, flags);
}
Back in __get_user_pages().
Implementation of follow_page_pte():
[follow_page()->follow_page_mask()->follow_page_pte()]
static struct page *follow_page_pte(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd, unsigned int flags)
{
struct mm_struct *mm = vma->vm_mm;
struct page *page;
spinlock_t *ptl;
pte_t *ptep, pte;
retry:
/* check that the pmd is valid */
if (unlikely(pmd_bad(*pmd)))
return no_page_table(vma, flags);
/* The pte_offset_map_lock() macro obtains the pte entry from the PMD and the address, and also takes a spinlock;
pte_unmap_unlock() must be called to release it before returning. The macro's implementation is shown further below. */
ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
pte = *ptep;
/* pte_present() checks whether the L_PTE_PRESENT bit is set in the pte; this bit indicates that the page is resident in memory. */
if (!pte_present(pte)) {
//the page is not present in memory; handle that case here
swp_entry_t entry;
/*
* KSM's break_ksm() relies upon recognizing a ksm page
* even while it is being migrated, so for that case we
* need migration_entry_wait().
*/
/* If the flags do not include FOLL_MIGRATION, i.e. we are not prepared to wait for a page that is being migrated, bail out. */
if (likely(!(flags & FOLL_MIGRATION)))
goto no_page;
/* if the pte is empty (pte_none), bail out */
if (pte_none(pte))
goto no_page;
entry = pte_to_swp_entry(pte);
/* If the pte is a migration swap entry (the page is currently being migrated), call migration_entry_wait()
to wait for the migration to finish and then retry. */
if (!is_migration_entry(entry))
goto no_page;
pte_unmap_unlock(ptep, ptl);
migration_entry_wait(mm, pmd, address);
goto retry;
}
if ((flags & FOLL_NUMA) && pte_protnone(pte))
goto no_page;
/* If the flags request write access (FOLL_WRITE) but the pte is read-only, return NULL as well. */
if ((flags & FOLL_WRITE) && !pte_write(pte)) {
pte_unmap_unlock(ptep, ptl);
return NULL;
}
/* vm_normal_page() returns the struct page of a normal-mapping page based on the pte;
its implementation is examined below. */
page = vm_normal_page(vma, address, pte);
if (unlikely(!page)) {
if ((flags & FOLL_DUMP) ||
!is_zero_pfn(pte_pfn(pte)))
goto bad_page;
page = pte_page(pte);
}
/* If FOLL_GET is set in flags, get_page_foll() raises the page's _count reference count. */
if (flags & FOLL_GET)
get_page_foll(page);
/* When FOLL_TOUCH is set, the page must be marked as accessed: mark_page_accessed() marks the page active;
it is a core helper of the page-reclaim code. */
if (flags & FOLL_TOUCH) {
if ((flags & FOLL_WRITE) &&
!pte_dirty(pte) && !PageDirty(page))
set_page_dirty(page);
/*
* pte_mkyoung() would be more correct here, but atomic care
* is needed to avoid losing the dirty bit: it is easier to use
* mark_page_accessed().
*/
mark_page_accessed(page);
}
/* the caller wants the page locked in memory (mlock) */
if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
/*
* The preliminary mapping check is mainly to avoid the
* pointless overhead of lock_page on the ZERO_PAGE
* which might bounce very badly if there is contention.
*
* If the page is already locked, we don't need to
* handle it now - vmscan will handle it later if and
* when it attempts to reclaim the page.
*/
/* lock the page so it will not be swapped out to backing storage */
if (page->mapping && trylock_page(page)) {
lru_add_drain(); /* push cached pages to LRU */
/*
* Because we lock page here, and migration is
* blocked by the pte's page reference, and we
* know the page is still mapped, we don't even
* need to check for file-cache page truncation.
*/
mlock_vma_page(page);
unlock_page(page);
}
}
pte_unmap_unlock(ptep, ptl);
return page;
bad_page:
pte_unmap_unlock(ptep, ptl);
return ERR_PTR(-EFAULT);
no_page:
pte_unmap_unlock(ptep, ptl);
if (!pte_none(pte))
return NULL;
return no_page_table(vma, flags);
}
Back to follow_page_mask().
Implementation of the pte_offset_map_lock() macro:
it obtains the pte entry from the PMD and the address addr and also takes a spinlock; the caller must release the spinlock with pte_unmap_unlock() before returning.
[include/linux/mm.h]
[follow_page()->follow_page_mask()->follow_page_pte()->pte_offset_map_lock()]
#define pte_offset_map_lock(mm, pmd, address, ptlp) \
({ \
spinlock_t *__ptl = pte_lockptr(mm, pmd); \
pte_t *__pte = pte_offset_map(pmd, address); \
*(ptlp) = __ptl; \
spin_lock(__ptl); \
__pte; \
})
#define pte_offset_map(pmd,addr) (__pte_map(pmd) + pte_index(addr))
#define pte_index(addr) (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
#define PTRS_PER_PTE 512
#define PAGE_SHIFT 12
#define __pte_map(pmd) (pte_t *)kmap_atomic(pmd_page(*(pmd)))
#define pmd_page(pmd) pfn_to_page(__phys_to_pfn(pmd_val(pmd) & PHYS_MASK))
#define pmd_val(x) (x)
#define __phys_to_pfn(paddr) ((unsigned long)((paddr) >> PAGE_SHIFT))
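A minimal pairing sketch (a hypothetical helper, not from the kernel tree): every pte_offset_map_lock() must be matched by pte_unmap_unlock(), which both unmaps the pte page (kunmap_atomic() on 32-bit highmem configurations) and drops the page-table spinlock.
static int pte_is_present(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
{
    spinlock_t *ptl;
    pte_t *ptep, pte;

    ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);   /* map the pte page + take ptl */
    pte = *ptep;                                       /* read the entry under the lock */
    pte_unmap_unlock(ptep, ptl);                       /* unmap + unlock, always paired */
    return pte_present(pte);
}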
Implementation of vm_normal_page(): returns the struct page of a normal-mapping page based on the pte
[mm/memory.c]
[follow_page()->follow_page_mask()->follow_page_pte()->vm_normal_page()]
/* This is an interesting function: it returns the struct page of a normal-mapping page. For certain special
mappings no struct page is returned, because those pages are not supposed to take part in memory-management
activities such as page reclaim, page migration or KSM. The HAVE_PTE_SPECIAL configuration uses a spare bit in
the PTE entry for this; the ARM32 3-level page table and the ARM64 code support the feature, whereas the ARM32
2-level page table does not. On ARM64 the PTE_SPECIAL bit is defined using a bit left free by the hardware: */
[arch/arm64/include/asm/pgtable.h]
/*
 * Software defined PTE bits definition.
 */
#define PTE_VALID (_AT(pteval_t, 1) << 0)
#define PTE_DIRTY (_AT(pteval_t, 1) << 55)
#define PTE_SPECIAL (_AT(pteval_t, 1) << 56)
#define PTE_WRITE (_AT(pteval_t, 1) << 57)
#define PTE_PROT_NONE (_AT(pteval_t, 1) << 58) /* only when !PTE_VALID */
The kernel normally sets this software-defined PTE_SPECIAL bit with the pte_mkspecial() macro, mainly for the following uses:
1. The kernel zero page.
2. The many drivers that use remap_pfn_range() to map kernel pages into user space; the VMAs of such mappings usually carry the (VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP) attributes.
3. vm_insert_page()/vm_insert_pfn(), which also map kernel pages into user space.
vm_normal_page() thus divides pages into two camps: normal pages and special pages.
1. Normal pages are pages behind ordinary mappings, e.g. anonymous pages, page cache and shared-memory pages.
2. Special pages are pages behind special mappings that should not take part in reclaim or migration/merging, for example mappings with:
2.1 VM_IO: memory mapped for an I/O device.
2.2 VM_PFNMAP: a pure PFN mapping with no struct page behind it.
2.3 VM_MIXEDMAP: a mixed mapping, where some pages have a struct page and others do not.
(A hypothetical remap_pfn_range() sketch of case 2 follows right below.)
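An illustration of case 2 above (a hypothetical driver mmap method, not from the kernel tree; MYDRV_PHYS_BASE is an assumed device address): remap_pfn_range() gives the VMA the VM_IO | VM_PFNMAP attributes (and marks the ptes special where HAVE_PTE_SPECIAL is available), so vm_normal_page() below treats these pages as special mappings and returns NULL for them.
static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
{
    unsigned long size = vma->vm_end - vma->vm_start;
    unsigned long pfn = MYDRV_PHYS_BASE >> PAGE_SHIFT;   /* hypothetical device base */

    vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
    /* installs the VM_IO | VM_PFNMAP special mapping */
    return remap_pfn_range(vma, vma->vm_start, pfn, size, vma->vm_page_prot);
}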
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
pte_t pte)
{
unsigned long pfn = pte_pfn(pte);
/* Special mappings, as explained above. If the PTE_SPECIAL bit of the pte is not set, jump to check_pfn and continue checking. */
if (HAVE_PTE_SPECIAL) {
if (likely(!pte_special(pte)))
goto check_pfn;
if (vma->vm_ops && vma->vm_ops->find_special_page)
return vma->vm_ops->find_special_page(vma, addr);
/* If vm_flags has either of the two flags below set, this is a special mapping; return NULL. */
if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
return NULL;
if (!is_zero_pfn(pfn))
print_bad_pte(vma, addr, pte, NULL);
return NULL;
}
/* !HAVE_PTE_SPECIAL case follows: */
/* If HAVE_PTE_SPECIAL is not defined, check the (VM_PFNMAP|VM_MIXEDMAP) cases. */
if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
if (vma->vm_flags & VM_MIXEDMAP) {
if (!pfn_valid(pfn))
return NULL;
goto out;
} else {
unsigned long off;
off = (addr - vma->vm_start) >> PAGE_SHIFT;
if (pfn == vma->vm_pgoff + off)/* a linear PFN mapping: this is the special-mapping case */
return NULL;
/* The virtual address maps linearly onto the pfn; if the mapping is a COW (copy-on-write) mapping, the page is still a normal mapping. */
if (!is_cow_mapping(vma->vm_flags))
return NULL;
}
}
if (is_zero_pfn(pfn))
return NULL;
check_pfn:
if (unlikely(pfn > highest_memmap_pfn)) {
print_bad_pte(vma, addr, pte, NULL);
return NULL;
}
/*
* NOTE! We still have PageReserved() pages in the page tables.
* eg. VDSO mappings can cause them to exist.
*/
out:
/* finally, return the struct page instance via pfn_to_page() */
return pfn_to_page(pfn);
}
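For reference, a sketch of the is_cow_mapping() check used above (reproduced from memory, so treat it as an approximation): a mapping counts as COW when it may be written privately but is not shared.
static inline bool is_cow_mapping(vm_flags_t flags)
{
    return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}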
Back to follow_page_pte().