下面来看页面不在内存中且页表内容为空(!pte_present(entry)&&pte_none(entry))的另一种情况。即VMA定义了fault方式函数vma->vm_ops->fault。
[handle_pte_fault()->do_fault()]
/*
* We enter with non-exclusive mmap_sem (to exclude vma changes,
* but allow concurrent faults).
* The mmap_sem may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
*/
/*do_fault()函数处理VMA中的vm_ops操作函数集里定义了fault函数指针的情况,具体分为3种情况*/
static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
unsigned int flags, pte_t orig_pte)
{
pgoff_t pgoff = (((address & PAGE_MASK)
- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
pte_unmap(page_table);/*不明白是什么意思*/
/*(1) flags不为FAULT_FLAG_WRITE,即只读异常,见下面do_read_fault()代码*/
if (!(flags & FAULT_FLAG_WRITE))
return do_read_fault(mm, vma, address, pmd, pgoff, flags,
orig_pte);
/*(2) VMA的vm_flags没有定义VM_SHARED,即这是一个私有映射且发生了写时复制COW,见下面do_cow_fault()代码*/
if (!(vma->vm_flags & VM_SHARED))
return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
orig_pte);
/*(3) 其余情况是在共享映射中发生了写缺页异常,见do_shared_fault()代码*/
return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
}
(1)读异常
do_read_fault()函数:处理只读异常(mm/memory.c)
[handle_pte_fault()->do_fault()->do_read_fault()]
static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
{
struct page *fault_page;
spinlock_t *ptl;
pte_t *pte;
int ret = 0;
/*
* Let's call ->map_pages() first and use ->fault() as fallback
* if page by the offset is not ready to be mapped (cold cache or
* something).
*/
/*VMA定义了map_pages()方法,可以围绕在缺页异常地址周围提前映射尽可能多的页面。
提前建立进程地址空间和page cache的映射关系有利于减少发生缺页中断的次数,从而提高
效率。注意,这里只是和现存的page cache提前建立映射关系,而不会去创建page cache,
创建新的page cache是在__do_fault()函数中。fault_around_bytes是一个全局变量
定义在mm/memory.c文件中,默认是65536字节,即16个页面大小。
static unsigned long fault_around_bytes __read_mostly =
rounddown_pow_of_two(65536);
*/
if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
/*下面查看do_fault_around实现,我的理解:分配虚拟地址的PTE,未映射物理表项,__do_fault函数建立
虚拟地址与物理页表的映射关系*/
do_fault_around(vma, address, pte, pgoff, flags);
if (!pte_same(*pte, orig_pte))
goto unlock_out;
pte_unmap_unlock(pte, ptl);
}
/*真正为异常地址分配page cache是在do_read_fault()函数中的__do_fault()函数,下面查看其实现*/
ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
/*重新读取当前缺页异常地址addr对应pte的值与以前读取出来的值是否一致,如果不一致,
说明这期间有人修改了pte,那么刚才通过__do_fault()函数分配的页面就没用了*/
if (unlikely(!pte_same(*pte, orig_pte))) {
pte_unmap_unlock(pte, ptl);
unlock_page(fault_page);
page_cache_release(fault_page);
return ret;
}
/*利用刚才分配的页面新生成一个PTE entry设置到硬件页表中*/
do_set_pte(vma, address, fault_page, pte, false, false);
unlock_page(fault_page);
unlock_out:
pte_unmap_unlock(pte, ptl);
return ret;
}
回到do_fault()函数
do_fault_around()函数:
以当前缺页异常地址addr为中心,start_addr是以16个page大小对齐的起始地址,然后从start_addr开始去检查相应的pte是否空。若为空,则从这个pte开始到max_pgoff为止使用VMA的操作函数map_pages()来映射PTE,除非所需要的page cache还没有准备好或page cache 被锁住了。该函数预测异常地址周围的page cache可能会被马上读取,所以把已经有的page cache提前建立好映射,有利于减少发生缺页中断的次数,但注意并不会去新建page cache。流程如下图:
static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
pte_t *pte, pgoff_t pgoff, unsigned int flags)
{
unsigned long start_addr, nr_pages, mask;
pgoff_t max_pgoff;
struct vm_fault vmf;
int off;
nr_pages = ACCESS_ONCE(fault_around_bytes) >> PAGE_SHIFT;
mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
start_addr = max(address & mask, vma->vm_start);
off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
pte -= off;
pgoff -= off;
/*
* max_pgoff is either end of page table or end of vma
* or fault_around_pages() from pgoff, depending what is nearest.
*/
max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
PTRS_PER_PTE - 1;
max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1,
pgoff + nr_pages - 1);
/* Check if it makes any sense to call ->map_pages */
while (!pte_none(*pte)) {
if (++pgoff > max_pgoff)
return;
start_addr += PAGE_SIZE;
if (start_addr >= vma->vm_end)
return;
pte++;
}
vmf.virtual_address = (void __user *) start_addr;
vmf.pte = pte;
vmf.pgoff = pgoff;
vmf.max_pgoff = max_pgoff;
vmf.flags = flags;
vma->vm_ops->map_pages(vma, &vmf);
}
回到do_read_fault()函数
__do_fault()函数: 异常地址分配page cache
/*
* The mmap_sem must have been held on entry, and may have been
* released depending on flags and vma->vm_ops->fault() return value.
* See filemap_fault() and __lock_page_retry().
*/
static int __do_fault(struct vm_area_struct *vma, unsigned long address,
pgoff_t pgoff, unsigned int flags,
struct page *cow_page, struct page **page)
{
struct vm_fault vmf;
int ret;
vmf.virtual_address = (void __user *)(address & PAGE_MASK);
vmf.pgoff = pgoff;
vmf.flags = flags;
vmf.page = NULL;
vmf.cow_page = cow_page;
/*建立一个page cache,*/
ret = vma->vm_ops->fault(vma, &vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
if (!vmf.page)
goto out;
if (unlikely(PageHWPoison(vmf.page))) {
if (ret & VM_FAULT_LOCKED)
unlock_page(vmf.page);
page_cache_release(vmf.page);
return VM_FAULT_HWPOISON;
}
/*如果返回值ret不包含VM_FAULT_LOCKED,那么调用lock_page函数为page加锁PG_locked,
否则在打开了CONFIG_DEBUG_VM的情况下,会去检查这个page是否已经locked了*/
if (unlikely(!(ret & VM_FAULT_LOCKED)))
lock_page(vmf.page);
else
VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page);
out:
*page = vmf.page;
return ret;
}
返回do_read_fault()函数
(2)写时复制异常
do_cow_fault()函数: 处理私有文件映射的VMA中发生了写时复制。
[handle_pte_fault()->do_fault()->do_cow_fault()]
static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
{
struct page *fault_page, *new_page;
struct mem_cgroup *memcg;
spinlock_t *ptl;
pte_t *pte;
int ret;
/*检查该VMA是否初始化了RMAP反向映射*/
if (unlikely(anon_vma_prepare(vma)))
return VM_FAULT_OOM;
/*为GFP_HIGHUSER|__GFP_MOVABLE的新页面new_page分配一个分配掩码,也就是优先使用
高端内存highmem*/
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
if (!new_page)
return VM_FAULT_OOM;
if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) {
page_cache_release(new_page);
return VM_FAULT_OOM;
}
/*通过vma->vm_ops->fault()函数读取文件内容到fault_page页里面。*/
ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
goto uncharge_out;
/*把fault_page页面的内容复制到刚才新分配的页面new_page中*/
if (fault_page)
copy_user_highpage(new_page, fault_page, address, vma);
__SetPageUptodate(new_page);
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
/*重新获取该异常地址对应的pte,如果当前pte的内容和之前的org_pte内容不一样,说明有人修改了
pte,那么释放new_page和fault_page并返回*/
if (unlikely(!pte_same(*pte, orig_pte))) {
pte_unmap_unlock(pte, ptl);
if (fault_page) {
unlock_page(fault_page);
page_cache_release(fault_page);
} else {
/*
* The fault handler has no page to lock, so it holds
* i_mmap_lock for read to protect against truncate.
*/
i_mmap_unlock_read(vma->vm_file->f_mapping);
}
goto uncharge_out;
}
/*利用new_page新生成一个PTE entry并设置到硬件页表项pte中*/
do_set_pte(vma, address, new_page, pte, true, true);
mem_cgroup_commit_charge(new_page, memcg, false);
/*把new_page加入到活跃的LRU链表中*/
lru_cache_add_active_or_unevictable(new_page, vma);
pte_unmap_unlock(pte, ptl);
if (fault_page) {
unlock_page(fault_page);
/*释放fault_page*/
page_cache_release(fault_page);
} else {
/*
* The fault handler has no page to lock, so it holds
* i_mmap_lock for read to protect against truncate.
*/
i_mmap_unlock_read(vma->vm_file->f_mapping);
}
return ret;
uncharge_out:
mem_cgroup_cancel_charge(new_page, memcg);
page_cache_release(new_page);
return ret;
}
(3) 写缺页异常
do_shared_fault()函数: 处理在一个可写的共享映射中发生缺页中断的情况。
[handle_pte_fault()->do_fault()->do_shared_fault()]
static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
{
struct page *fault_page;
struct address_space *mapping;
spinlock_t *ptl;
pte_t *pte;
int dirtied = 0;
int ret, tmp;
/*首先通过__do_fault()函数读取文件内容到fault_page页面中*/
ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
/*
* Check if the backing address space wants to know that the page is
* about to become writable
*/
/*如果VMA的操作函数中定义了page_mkwrite()方法,那么调用page_mkwrite()来通知进程地址空间,
page将变成可写的,一个页面变成可写的,那么进程有可能需要等待这个page的内容回写成功(writeback)*/
if (vma->vm_ops->page_mkwrite) {
unlock_page(fault_page);
tmp = do_page_mkwrite(vma, fault_page, address);
if (unlikely(!tmp ||
(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
page_cache_release(fault_page);
return tmp;
}
}
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
/*判断该异常地址对应的硬件页表项pte的内容是否与之前的pte一致*/
if (unlikely(!pte_same(*pte, orig_pte))) {
pte_unmap_unlock(pte, ptl);
unlock_page(fault_page);
page_cache_release(fault_page);
return ret;
}
/*利用fault_page新生成一个PTE entry并设置到硬件页表项pte中,注意这里设置PTE为可写属性*/
do_set_pte(vma, address, fault_page, pte, true, false);
pte_unmap_unlock(pte, ptl);
/*设置page为脏页面*/
if (set_page_dirty(fault_page))
dirtied = 1;
/*
* Take a local copy of the address_space - page.mapping may be zeroed
* by truncate after unlock_page(). The address_space itself remains
* pinned by vma->vm_file's reference. We rely on unlock_page()'s
* release semantics to prevent the compiler from undoing this copying.
*/
mapping = fault_page->mapping;
unlock_page(fault_page);
if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
/*
* Some device drivers do not set page.mapping but still
* dirty their pages
*/
/*通过此函数来平衡并回写一部分脏页面*/
balance_dirty_pages_ratelimited(mapping);
}
if (!vma->vm_ops->page_mkwrite)
file_update_time(vma->vm_file);
return ret;
}
私有映射 | 共享映射 | |
匿名映射 | 私有匿名映射->通常用于内存分配(do_anonymous_page) | 共享匿名映射->通常用于进程间共享内存 |
文件映射 | 私有文件映射->通常用于加载动态库 | 共享文件映射->通常用于内存映射I/O,进程间通信 do_shared_fault |