VII)Understanding Swap in
接口函数: swapin_readahead,handle_mm_fault,make_pages_present.
我们从handle_mm_fault谈起,在分析fault.c的时候提到过一个mm fault产生的
几种情形,首先是进入到do_page_fault的时候
/*
* 有三种情况下执行流到达此函数:
* 1. pgd, pmd, pte 有一个为空, 即未建立映射或已经撤销.(normal page
cache)
* 2. 页面不在内存(not present)被内核交换到了磁盘(swap space).
* 3. 权限不正确. (内核已经授权或者根本不容许那样访问)
*
* 说明:
* 情况1)涉及的页面或者没有建立映射或者属于normal address map(like mmap)
* 一般要到page cache中试图寻找.
* try_to_swap_out处理normal address map的时候,把相应的pte置成0.
*
* 情况2.涉及页面属于swapper_space管理. try_to_swap_out 不会把pte 置0,
而是将pte置为相应的swp_entry_t并将present位置0.
*
* 再转交下一级函数处理时, 所有非法操作都在这个函数中处理掉了.所谓非法
* 是用户企图进行一次内核(vma)不容许的操作,如vma无写属性,而进程进行了写操
* 作.
*/
asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
.........
/*
 * If the instruction stream reaches this point, the faulting address
 * lies inside a valid vma and the access conforms to the permissions
 * the OS granted the user, so hand the fault to the generic MM layer.
 */
switch (handle_mm_fault(mm, vma, address, write)) {
case 1:
tsk->min_flt++; /* minor fault: resolved without disk I/O */
break;
case 2:
tsk->maj_flt++; /* major fault: the page had to be read from disk */
break;
case 0:
goto do_sigbus; /* no page could be supplied -- raise SIGBUS */
default:
goto out_of_memory; /* -1: allocation failed, take the OOM path */
}
........
}
上面的分析给出了handle_mm_fault(mm, vma, address, write)执行时所面临的
条件"说明异常点在一个完好的vma中,并且符合OS 赋予用户的权限":vma是经过了
扩展(expand stack)或修改,用户的这次操作应该得到相应的服务.
/*
 * handle_mm_fault - repair the kernel side of the mapping chain for a
 * faulting address, then hand the user-level pte to handle_pte_fault.
 *
 * Returns -1 when a page-table page cannot be allocated (OOM);
 * otherwise propagates handle_pte_fault's return value.
 */
int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
	unsigned long address, int write_access)
{
	pgd_t *pgd = pgd_offset(mm, address);	/* pgd slot for this address */
	pmd_t *pmd = pmd_alloc(pgd, address);	/* find or allocate the pmd page */

	if (!pmd)
		return -1;

	pte_t *pte = pte_alloc(pmd, address);	/* find or allocate the page table */
	if (!pte)
		return -1;

	return handle_pte_fault(mm, vma, address, write_access, pte);
}
可见在发生页面异常的时候,容许进程被调度,这和中断有所不同.这是合理的:可
以看作进程请求内核处理mm fault,就像一个系统调用.
/*
 * handle_pte_fault - repair a faulting user-level pte.
 * Called with page_table_lock not held; returns the codes consumed by
 * the switch in do_page_fault: 1 = minor fault, 2 = major fault,
 * 0 = SIGBUS, -1 = out of memory.
 */
static inline int handle_pte_fault(struct mm_struct *mm,
struct vm_area_struct * vma, unsigned long address,
int write_access, pte_t * pte)
{
pte_t entry;
/*
 * We need the page table lock to synchronize with kswapd
 * and the SMP-safe atomic PTE updates.
 */
spin_lock(&mm->page_table_lock);
entry = *pte;
/* Case: the mapping chain is broken (page not present). */
if (!pte_present(entry)) {
/*
 * If it truly wasn't present, we know that kswapd
 * and the PTE updates will not touch it later. So
 * drop the lock.
 */
spin_unlock(&mm->page_table_lock);
if (pte_none(entry))
return do_no_page(mm, vma, /* pte empty: never mapped, or unmapped by try_to_swap_out; */
address, /* such pages usually belong to a normal address map */
write_access,
pte /* see filemap_nopage for how page-cache pages come back in */
);
return do_swap_page(mm, vma, /* not present but pte non-empty: managed by swapper_space */
address, pte,
pte_to_swp_entry(entry),
write_access
);
}
/* Case: the kernel permits the access but the CPU page protection does
 * not (COW). Read faults on present ptes, and genuinely illegal
 * accesses, were already filtered out in do_page_fault. */
if (write_access) { /* write-protect fault, yet the OS allows the user to write */
if (!pte_write(entry))
return do_wp_page( mm, /* second step of the COW handling: */
vma, /* Copy on Write */
address, /* step one installed a write-protected page */
pte, entry /* while the vma grants the user write permission */
);
entry = pte_mkdirty(entry);
}
/* NOTE(review): write_access seems to always be 1 here -- a read on a
 * not-present pte already returned from the !pte_present() branch
 * above, and a read on a present, readable pte never enters
 * handle_mm_fault at all (fix me). */
entry = pte_mkyoung(entry);
establish_pte(vma, address, pte, entry);
spin_unlock(&mm->page_table_lock);
return 1;
}
handle_mm_fault 已经修复了映射链上的系统页面,handle_pte_fault主要是修复
映射链上的用户页面:
1)do_no_page:还未建立映射,或者已经被断开还存在于lru cache(page cache)
或者已经回收到了node-zone-buddy.
2)do_swap_page:从lru恢复,如果已经被回收到node-zone-buddy就从磁盘调入.
3)do_wp_page: 处理COW的copy操作,用户现在要写此页面,copy一份给他.
1)do_no_page:
分配一个匿名页面,或者用vm指定的操作寻找对应页面.在设置pte
的时候考虑COW:对于写操作,表示内核容许写(这里不会遭遇COW,cow是另一个处理
函数,handle_pte_fault已经区分的很清楚了),直接将pte置为可写.如果是read操
作,考虑mmap创建的vma(not anonymous page),并且页面已经是共用页面,则我们
不能直接给这个进程写权限,而是要取消写权限.这是COW处理中的一环:建立一个写
保护的页面.参照分析filemap.c时对函数filemap_nopage的分析,那里讲的很详细
-->filemap_nopage 对这个read操作的进程直接返回一个共享页面,如果这是第一
个要求访问此页面的进程do_no_page不会取消这个进程的写操作权限.
/*
 * do_no_page - handle a fault on a completely empty pte: the page was
 * never mapped, or try_to_swap_out cleared the pte of a normal
 * (non-swap) mapping. Allocates an anonymous page, or asks the vma's
 * nopage operation for the page. Returns 2 (major fault), 0 (SIGBUS)
 * or -1 (OOM).
 */
static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma,
unsigned long address, int write_access, pte_t *page_table)
{
struct page * new_page;
pte_t entry;
if (!vma->vm_ops || !vma->vm_ops->nopage)
return do_anonymous_page(mm, vma, page_table, write_access, address);
/*
 * The third argument is "no_share", which tells the low-level code
 * to copy, not share the page even if sharing is possible. It's
 * essentially an early COW detection.
 * For a vma created by mmap this operation is filemap_nopage.
 */
new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, (vma->vm_flags & VM_SHARED)?0:write_access);
if (new_page == NULL) /* no page was available -- SIGBUS */
return 0;
if (new_page == NOPAGE_OOM)
return -1;
++mm->rss;
/* The two fault-in paths above (do_anonymous_page and
 * vma->vm_ops->nopage) also contain the second step of COW: on a
 * write fault do_anonymous_page returns a newly allocated page, and
 * vma->vm_ops->nopage copies rather than shares the page.
 * Essence of COW: copy on write, share on read.
 * "Copy on write" = do_anonymous_page / vma->vm_ops->nopage as just
 * analysed, plus do_wp_page.
 * "Share on read" = do_anonymous_page returning the common zero page
 * on a read, and vma->vm_ops->nopage sharing the page when it can. */
/*
 * This silly early PAGE_DIRTY setting removes a race
 * due to the bad i386 page protection. But it's valid
 * for other architectures too.
 *
 * Note that if write_access is true, we either now have
 * an exclusive copy of the page, or this is a shared mapping,
 * so we can make it writable and dirty to avoid having to
 * handle that later.
 */
flush_page_to_ram(new_page);
flush_icache_page(vma, new_page);
entry = mk_pte(new_page, vma->vm_page_prot);
if (write_access) {
entry = pte_mkwrite(pte_mkdirty(entry));
} else if (page_count(new_page) > 1 &&
!(vma->vm_flags & VM_SHARED))
entry = pte_wrprotect(entry); /* read fault on an already-shared page: keep it write-protected for COW */
set_pte(page_table, entry);
/* no need to invalidate: a not-present page shouldn't be cached */
update_mmu_cache(vma, address, entry);
return 2; /* Major fault */
}
2)do_swap_page
对于开始换出的页面,可以从lru恢复,如果已经被回收到node-zone-buddy就从
磁盘调入.
/*
 * do_swap_page - bring back a page whose pte holds a swp_entry_t.
 * The page may still sit in the swap cache (and thus on the LRU);
 * otherwise it is read back from the swap area on disk. Returns 1
 * (minor fault) or -1 on failure.
 */
static int do_swap_page(struct mm_struct * mm,
struct vm_area_struct * vma, unsigned long address,
pte_t * page_table, swp_entry_t entry, int write_access)
{
struct page *page = lookup_swap_cache(entry); /* look in the swap cache first */
/* If found there, the page is also on the LRU and may have been about
 * to be written out. */
pte_t pte;
if (!page) { /* not in the swap cache: swap it in from disk */
lock_kernel();
swapin_readahead(entry); /* read ahead a batch of nearby entries first */
page = read_swap_cache(entry); /* like an ordinary file read: read the page
in, add it to the swap cache, return it */
unlock_kernel();
if (!page)
return -1;
flush_page_to_ram(page);
flush_icache_page(vma, page);
}
mm->rss++;
/* We have the page; now restore the mapping. */
pte = mk_pte(page, vma->vm_page_prot);
/*
 * Freeze the "shared"ness of the page, ie page_count + swap_count.
 * Must lock page before transferring our swap count to already
 * obtained page count.
 */
lock_page(page); /* is_page_shared requires the page locked; see the filemap.c analysis */
swap_free(entry);
if (write_access && !is_page_shared(page))
pte = pte_mkwrite(pte_mkdirty(pte));
UnlockPage(page);
set_pte(page_table, pte);
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, address, pte);
return 1; /* Minor fault */
}
其中预读和读取到swap cache(read_swap_cache)留做以后详解,这里仅仅简单
分析一下预读.
/*
* Primitive swap readahead code. We simply read an aligned block of
* (1 << page_cluster) entries in the swap area. This method is chosen
* because it doesn't cost us any seek time. We also make sure to queue
* the 'original' request together with the readahead ones...
*/
void swapin_readahead(swp_entry_t entry)
{
int i, num;
struct page *new_page;
unsigned long offset;
/*
 * Get the number of handles we should do readahead io to. Also,
 * grab temporary references on them, releasing them as io completes.
 */
/* Read ahead a contiguous run (possibly the whole cluster) starting
 * from the beginning of the cluster that contains `entry`. */
num = valid_swaphandles(entry, &offset);
for (i = 0; i < num; offset++, i++) {
/* Don't block on I/O for read-ahead */
/* If the number of pages already under async I/O exceeds the
 * readahead budget, abort this readahead and release the
 * remaining swap references taken by valid_swaphandles. */
if (atomic_read(&nr_async_pages) >= pager_daemon.swap_cluster
* (1 << page_cluster)) {
while (i++ < num)
swap_free(SWP_ENTRY(SWP_TYPE(entry), offset++));
break;
}
/* Ok, do the async read-ahead now */
/* read_swap_cache_async tries to lock the page and may sleep. */
new_page = read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry), offset), 0);
if (new_page != NULL)
page_cache_release(new_page);
swap_free(SWP_ENTRY(SWP_TYPE(entry), offset));
}
return;
}
全局变量page_cluster定义一个cluster包含多少个页面.当系统中处于异步io
状态的页面大于容许预读的页面总数的时候,不再进行预读.
3)do_wp_page
就是Copy 一个页面.只不过尽力避免真的去copy,相关逻辑比较难懂的也就是为何调
用is_page_shared要lock页面,这个我们已经分析过了.
int make_pages_present(unsigned long addr, unsigned long end)模拟从addr
开始到end结束的这段空间发生page fault的情况,从而将对应页面统统调入内存.
全文完.