ksm 是kernel samepage merge的简称,它会定期进行页面扫描,并把识别到的内存页副本进行合并,进而释放这些页面以供其他程序使用。通常合并后的页会被置为只读,当有其他程序要更改这个页面时,就会触发cow机制,系统会重新为这个程序单独生成一份副本。
ksm机制
ksm维护两棵树,一棵stable树,一棵unstable树。其中stable树上维护的是已经merged过的页,由于这些页都是只读的所以基本上比较稳定。
另外一棵是unstable树,这棵树维护的页是在一段时间内没有改变过的页,但不能保证过了这段时间会不会发生变化
具体逻辑
当分配内存时,用户可以通过madvise系统调用,将要扫描的区域加到ksm扫描的列表里面。接着,内核会起一个ksmd的系统进程,定期进行扫描。看一下具体代码
/*
 * ksm_do_scan - scan one batch of pages for merging candidates.
 * @scan_npages: upper bound on the number of pages to scan in this batch.
 *
 * Stops early when the scan list is exhausted (scan_get_next_rmap_item()
 * returns NULL) or when the current task is being frozen.
 */
static void ksm_do_scan(unsigned int scan_npages)
{
	struct page *uninitialized_var(page);
	struct rmap_item *rmap_item;
	unsigned int remaining;

	for (remaining = scan_npages;
	     remaining && likely(!freezing(current)); remaining--) {
		cond_resched();		/* be nice to everyone else */
		/* Fetch the next page on the scan list (takes a page ref). */
		rmap_item = scan_get_next_rmap_item(&page);
		if (!rmap_item)
			break;		/* nothing left to scan */
		cmp_and_merge_page(page, rmap_item);
		put_page(page);		/* drop the ref follow_page() took */
	}
}
在分析scan_get_next_rmap_item之前,我们先来看一下struct rmap_item这个数据结构,
/*
 * struct rmap_item - reverse-mapping item for one scanned virtual address.
 *
 * Records the mm and virtual address a candidate page was found at, plus
 * the item's position in either the unstable tree (rb_node + oldchecksum)
 * or a stable_node's hlist, depending on its current state.
 */
struct rmap_item {
struct rmap_item *rmap_list; /* next item on the mm_slot's singly-linked scan list */
union {
struct anon_vma *anon_vma; /* when stable */
#ifdef CONFIG_NUMA
int nid; /* when node of unstable tree */
#endif
};
struct mm_struct *mm; /* the mm this virtual address belongs to */
unsigned long address; /* + low bits used for flags below */
unsigned int oldchecksum; /* when unstable */
union {
struct rb_node node; /* when node of unstable tree */
struct { /* when listed from stable tree */
struct stable_node *head; /* stable_node this item hangs off */
struct hlist_node hlist; /* link in that stable_node's rmap hlist */
};
};
};
这个结构描述的是一个虚拟地址的反向映射项:通过它可以找到该地址所属的mm,以及该页当前在unstable树(rb_node + 校验和)或stable树(stable_node链表)中的位置。接着,继续分析scan_get_next_rmap_item
/*
 * scan_get_next_rmap_item - advance the cursor to the next anonymous page
 * to examine, returning its rmap_item and (via @page, with a reference
 * held) the page itself; returns NULL when the whole list has been scanned.
 */
static struct rmap_item *scan_get_next_rmap_item(struct page **page)
{
struct mm_struct *mm;
struct mm_slot *slot;
struct vm_area_struct *vma;
struct rmap_item *rmap_item;
int nid;
/* Nothing registered for scanning: done. */
if (list_empty(&ksm_mm_head.mm_list))
return NULL;
slot = ksm_scan.mm_slot;
if (slot == &ksm_mm_head) {
/*
 * A number of pages can hang around indefinitely on per-cpu
 * pagevecs, raised page count preventing write_protect_page
 * from merging them. Though it doesn't really matter much,
 * it is puzzling to see some stuck in pages_volatile until
 * other activity jostles them out, and they also prevented
 * LTP's KSM test from succeeding deterministically; so drain
 * them here (here rather than on entry to ksm_do_scan(),
 * so we don't IPI too often when pages_to_scan is set low).
 */
lru_add_drain_all();
/*
 * Whereas stale stable_nodes on the stable_tree itself
 * get pruned in the regular course of stable_tree_search(),
 * those moved out to the migrate_nodes list can accumulate:
 * so prune them once before each full scan.
 */
if (!ksm_merge_across_nodes) {
struct stable_node *stable_node, *next;
struct page *page;
list_for_each_entry_safe(stable_node, next,
&migrate_nodes, list) {
page = get_ksm_page(stable_node, false);
if (page)
put_page(page);
cond_resched();
}
}
/* Reset every per-node unstable tree for the new full scan. */
for (nid = 0; nid < ksm_nr_node_ids; nid++)
root_unstable_tree[nid] = RB_ROOT;
spin_lock(&ksm_mmlist_lock);
/* Advance to the first mm_slot on the scan list. */
slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
ksm_scan.mm_slot = slot;
spin_unlock(&ksm_mmlist_lock);
/*
 * Although we tested list_empty() above, a racing __ksm_exit
 * of the last mm on the list may have removed it since then.
 */
/* Double check to avoid that race. */
if (slot == &ksm_mm_head)
return NULL;
next_mm:
ksm_scan.address = 0;
ksm_scan.rmap_list = &slot->rmap_list;
}
mm = slot->mm;
down_read(&mm->mmap_sem);
if (ksm_test_exit(mm))
vma = NULL;
else
/* Find the vma covering the current scan address. */
vma = find_vma(mm, ksm_scan.address);
for (; vma; vma = vma->vm_next) {
/* Only VM_MERGEABLE vmas take part in merging. */
if (!(vma->vm_flags & VM_MERGEABLE))
continue;
if (ksm_scan.address < vma->vm_start)
ksm_scan.address = vma->vm_start;
/* Only anonymous vmas are scanned: skip the others entirely. */
if (!vma->anon_vma)
ksm_scan.address = vma->vm_end;
while (ksm_scan.address < vma->vm_end) {
if (ksm_test_exit(mm))
break;
/* Look up the page mapped at this virtual address (takes a ref). */
*page = follow_page(vma, ksm_scan.address, FOLL_GET);
if (IS_ERR_OR_NULL(*page)) {
ksm_scan.address += PAGE_SIZE;
cond_resched();
continue;
}
/* Only anonymous pages are merge candidates. */
if (PageAnon(*page)) {
flush_anon_page(vma, *page, ksm_scan.address);
flush_dcache_page(*page);
/* Allocate (or reuse) the rmap_item for this address. */
rmap_item = get_next_rmap_item(slot,
ksm_scan.rmap_list, ksm_scan.address);
if (rmap_item) {
ksm_scan.rmap_list =
&rmap_item->rmap_list;
ksm_scan.address += PAGE_SIZE;
} else
put_page(*page);
up_read(&mm->mmap_sem);
return rmap_item;
}
put_page(*page);
ksm_scan.address += PAGE_SIZE;
cond_resched();
}
}
/* This mm fully scanned (or exiting): prepare to move to the next one. */
if (ksm_test_exit(mm)) {
ksm_scan.address = 0;
ksm_scan.rmap_list = &slot->rmap_list;
}
/* Free any rmap_items left over beyond the point we scanned to. */
remove_trailing_rmap_items(slot, ksm_scan.rmap_list);
spin_lock(&ksm_mmlist_lock);
ksm_scan.mm_slot = list_entry(slot->mm_list.next,
struct mm_slot, mm_list);
if (ksm_scan.address == 0) {
/*
 * We've completed a full scan of all vmas, holding mmap_sem
 * throughout, and found no VM_MERGEABLE: so do the same as
 * __ksm_exit does to remove this mm from all our lists now.
 * This applies either when cleaning up after __ksm_exit
 * (but beware: we can reach here even before __ksm_exit),
 * or when all VM_MERGEABLE areas have been unmapped (and
 * mmap_sem then protects against race with MADV_MERGEABLE).
 */
hash_del(&slot->link);
list_del(&slot->mm_list);
spin_unlock(&ksm_mmlist_lock);
free_mm_slot(slot);
clear_bit(MMF_VM_MERGEABLE, &mm->flags);
up_read(&mm->mmap_sem);
mmdrop(mm);
} else {
up_read(&mm->mmap_sem);
/*
 * up_read(&mm->mmap_sem) first because after
 * spin_unlock(&ksm_mmlist_lock) run, the "mm" may
 * already have been freed under us by __ksm_exit()
 * because the "mm_slot" is still hashed and
 * ksm_scan.mm_slot doesn't point to it anymore.
 */
spin_unlock(&ksm_mmlist_lock);
}
/* Repeat until we've completed scanning the whole list */
slot = ksm_scan.mm_slot;
if (slot != &ksm_mm_head)
goto next_mm;
/* A full pass over every mm is complete: bump the scan sequence number. */
ksm_scan.seqnr++;
return NULL;
}
/*
 * cmp_and_merge_page - first see if @page can be merged into the stable
 * tree; if not, compare its checksum to the previous one and, if unchanged,
 * try to insert it into the unstable tree, or merge it with an identical
 * page already there and move both into the stable tree.
 */
static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
{
struct rmap_item *tree_rmap_item;
struct page *tree_page = NULL;
struct stable_node *stable_node;
struct page *kpage;
unsigned int checksum;
int err;
stable_node = page_stable_node(page);
if (stable_node) {
/*
 * KSM page whose stable_node is still in a per-node stable tree
 * but whose page now lives on a different NUMA node: take it out
 * of that tree and park it on the migrate_nodes list.
 */
if (stable_node->head != &migrate_nodes &&
get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) {
rb_erase(&stable_node->node,
root_stable_tree + NUMA(stable_node->nid));
stable_node->head = &migrate_nodes;
list_add(&stable_node->list, stable_node->head);
}
/*
 * rmap_item is already attached to this stable_node (e.g. a
 * mapping forked from an existing KSM page): nothing to do.
 */
if (stable_node->head != &migrate_nodes &&
rmap_item->head == stable_node)
return;
}
/* We first start with searching the page inside the stable tree */
kpage = stable_tree_search(page);
/* Page already is the stable-tree page itself: just drop the extra ref. */
if (kpage == page && rmap_item->head == stable_node) {
put_page(kpage);
return;
}
/* Detach rmap_item from any tree before re-inserting it below. */
remove_rmap_item_from_tree(rmap_item);
if (kpage) {
/* An identical page exists in the stable tree: try to merge into it. */
err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
if (!err) {
/*
 * The page was successfully merged:
 * add its rmap_item to the stable tree.
 */
lock_page(kpage);
stable_tree_append(rmap_item, page_stable_node(kpage));
unlock_page(kpage);
}
put_page(kpage);
return;
}
/*
 * If the hash value of the page has changed from the last time
 * we calculated it, this page is changing frequently: therefore we
 * don't want to insert it in the unstable tree, and we don't want
 * to waste our time searching for something identical to it there.
 */
checksum = calc_checksum(page);
if (rmap_item->oldchecksum != checksum) {
rmap_item->oldchecksum = checksum;
return;
}
/*
 * Same checksum as an empty page. We attempt to merge it with the
 * appropriate zero page if the user enabled this via sysfs.
 */
if (ksm_use_zero_pages && (checksum == zero_checksum)) {
struct vm_area_struct *vma;
/*
 * NOTE(review): find_mergeable_vma() can return NULL here, and
 * try_to_merge_one_page() is called without a NULL check —
 * verify against the upstream version of this code.
 */
vma = find_mergeable_vma(rmap_item->mm, rmap_item->address);
err = try_to_merge_one_page(vma, page,
ZERO_PAGE(rmap_item->address));
/*
 * In case of failure, the page was not really empty, so we
 * need to continue. Otherwise we're done.
 */
if (!err)
return;
}
/* Content looks stable: search/insert in the unstable tree. */
tree_rmap_item =
unstable_tree_search_insert(rmap_item, page, &tree_page);
if (tree_rmap_item) {
/* An identical page was found in the unstable tree: merge the two. */
kpage = try_to_merge_two_pages(rmap_item, page,
tree_rmap_item, tree_page);
put_page(tree_page);
if (kpage) {
/*
 * The pages were successfully merged: insert new
 * node in the stable tree and add both rmap_items.
 */
lock_page(kpage);
stable_node = stable_tree_insert(kpage);
if (stable_node) {
/* Hang both rmap_items off the new stable_node. */
stable_tree_append(tree_rmap_item, stable_node);
stable_tree_append(rmap_item, stable_node);
}
unlock_page(kpage);
/*
 * If we fail to insert the page into the stable tree,
 * we will have 2 virtual addresses that are pointing
 * to a ksm page left outside the stable tree,
 * in which case we need to break_cow on both.
 */
if (!stable_node) {
/* COW-break hands each writer back a private copy. */
break_cow(tree_rmap_item);
break_cow(rmap_item);
}
}
}
}
看一下try_to_merge_with_ksm_page逻辑
/*
 * try_to_merge_with_ksm_page - merge @page (found at rmap_item's address)
 * into the existing ksm page @kpage; no new page is allocated.
 * Returns 0 on success, -EFAULT otherwise.
 */
static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
struct page *page, struct page *kpage)
{
/* The mm this candidate page was found in. */
struct mm_struct *mm = rmap_item->mm;
struct vm_area_struct *vma;
int err = -EFAULT;
down_read(&mm->mmap_sem);
/* Re-find the mergeable vma covering the recorded virtual address. */
vma = find_mergeable_vma(mm, rmap_item->address);
if (!vma)
goto out;
err = try_to_merge_one_page(vma, page, kpage);
if (err)
goto out;
/* Unstable nid is in union with stable anon_vma: remove first */
remove_rmap_item_from_tree(rmap_item);
/* Must get reference to anon_vma while still holding mmap_sem */
rmap_item->anon_vma = vma->anon_vma;
get_anon_vma(vma->anon_vma);
out:
up_read(&mm->mmap_sem);
return err;
}
看一下try_to_merge_one_page这个函数的逻辑
/*
 * try_to_merge_one_page - take two pages and merge them into one.
 * @vma: the vma that holds the pte pointing to @page
 * @page: the PageAnon page that we want to replace with @kpage
 * @kpage: the PageKsm page to map instead of @page, or NULL the first
 *         time, when @page itself is to be upgraded into the ksm page.
 *
 * Returns 0 if the pages were merged, -EFAULT otherwise.
 *
 * Fix: the transcribed version was missing the '}' closing the
 * "if (!kpage)" block before "else if (pages_identical(...))", which is
 * a C syntax error ("err = 0; else ..."); restored the upstream
 * "} else if" structure.
 */
static int try_to_merge_one_page(struct vm_area_struct *vma,
		struct page *page, struct page *kpage)
{
	pte_t orig_pte = __pte(0);
	int err = -EFAULT;

	/* Same page: it was forked from an existing ksm page. */
	if (page == kpage)	/* ksm page forked */
		return 0;

	/* Only anonymous pages can be merged. */
	if (!PageAnon(page))
		goto out;

	/*
	 * We need the page lock to read a stable PageSwapCache in
	 * write_protect_page(). We use trylock_page() instead of
	 * lock_page() because we don't want to wait here - we
	 * prefer to continue scanning and merging different pages,
	 * then come back to this page when it is unlocked.
	 */
	if (!trylock_page(page))
		goto out;

	/* Transparent huge pages must be split before merging base pages. */
	if (PageTransCompound(page)) {
		err = split_huge_page(page);
		if (err)
			goto out_unlock;
	}

	/*
	 * If this anonymous page is mapped only here, its pte may need
	 * to be write-protected. If it's mapped elsewhere, all of its
	 * ptes are necessarily already write-protected. But in either
	 * case, we need to lock and check page_count is not raised.
	 */
	if (write_protect_page(vma, page, &orig_pte) == 0) {
		if (!kpage) {
			/*
			 * While we hold page lock, upgrade page from
			 * PageAnon+anon_vma to PageKsm+NULL stable_node:
			 * stable_tree_insert() will update stable_node.
			 */
			set_page_stable_node(page, NULL);
			mark_page_accessed(page);
			/*
			 * Page reclaim just frees a clean page with no dirty
			 * ptes: make sure that the ksm page would be swapped.
			 */
			if (!PageDirty(page))
				SetPageDirty(page);
			err = 0;
		} else if (pages_identical(page, kpage))
			/* Contents match: point the pte at kpage instead. */
			err = replace_page(vma, page, kpage, orig_pte);
	}

	/*
	 * For VM_LOCKED vmas, transfer the mlock from the old page to
	 * kpage, otherwise the replaced page could never be reclaimed.
	 */
	if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
		munlock_vma_page(page);
		if (!PageMlocked(kpage)) {
			unlock_page(page);
			lock_page(kpage);
			mlock_vma_page(kpage);
			page = kpage;	/* for final unlock */
		}
	}

out_unlock:
	unlock_page(page);
out:
	return err;
}
ksm的使用
1.开关
echo 1 > /sys/kernel/mm/ksm/run
2.控制
/sys/kernel/mm/ksm/sleep_millisecs :定期扫描的间隔
pages_to_scan:一次扫描的页数
3.监控
pages_shared:稳定树上的ksm page数
pages_unshared:反复检查过但暂时无法合并的唯一页数(候选页)
pages_sharing:表示有多少页被共享,举个例子:比如10个页合并到一个页面上,则sharing 为9,而shared为1.那么节省了8个页