一:概述
虚拟机页表建立的核心数据结构:kvm_memory_slot、kvm_mmu_page
KVM把虚拟机中一段一段的内存叫做slot,由kvm_memory_slot表示,里面包含该slot起始的虚拟机物理页编号base_gfn,该slot总共的page数量npages,以及host端用户态地址userspace_addr。虚拟机到物理机的地址转换最关键的地方就在这里:要找到某个地址对应的物理页,首先获取gfn,根据偏移算出host端的虚拟地址,再找到对应的VMA,再填充page,然后将映射关系写到kvm_mmu_page页表信息中,再设置pte。此后的寻址由EPT硬件完成。
kvm_mmu_page中包含页表的层级role.level ,spt页表。所有的页表连接在link链表中。
二:数据结构和关系详解
/*
 * A "memory slot": one contiguous range of guest physical memory together
 * with its backing host-userspace mapping.  The gfn -> hva translation is
 * userspace_addr + (gfn - base_gfn) * PAGE_SIZE (see __gfn_to_pfn_memslot
 * below).
 */
struct kvm_memory_slot {
gfn_t base_gfn; /* first guest frame number covered by this slot */
unsigned long npages;/* total number of pages in the slot */
unsigned long *dirty_bitmap; /* per-page dirty bits — presumably for dirty logging; not shown here */
struct kvm_arch_memory_slot arch; /* arch-specific data, e.g. reverse-map (rmap) info */
unsigned long userspace_addr; /* start HVA in the host (QEMU) address space */
u32 flags;
short id;
};
/*
 * Tracking structure for one shadow/EPT page-table page: KVM allocates one
 * kvm_mmu_page per page-table page ("spt") it builds for the guest.
 * Fix: the original excerpt was missing the ';' after the closing brace.
 */
struct kvm_mmu_page {
struct list_head link; /* links all page-table structures together */
struct hlist_node hash_link; /* hash-bucket linkage for lookup by gfn */
struct list_head lpage_disallowed_link;
gfn_t gfn; /* guest frame number (guest physical page) this page maps */
union kvm_mmu_page_role role; /* identifying bits: level, access, mode, ... */
u64 *spt; /* the page-table page itself (array of sptes) */
gfn_t *gfns;
bool unsync;
bool lpage_disallowed;
int root_count;
unsigned int unsync_children;
struct kvm_rmap_head parent_ptes; /* reverse map of parent sptes pointing at this page */
unsigned long mmu_valid_gen;
unsigned long unsync_child_bitmap[8];
atomic_t write_flooding_count;
};
//数据样例:
struct kvm_mmu_page {
link = {
next = 0xffff8ed0ff5231f8,
prev = 0xffff8ed11a768000
},
hash_link = {
next = 0x0,
pprev = 0xffffb4690b256d80
},
lpage_disallowed_link = {
next = 0xffff8ed11a768020,
prev = 0xffff8ed0ff523218
},
gfn = 4344832,
role = {
word = 1921,
{
level = 1, //一级页表,里面是具体的页
cr4_pae = 0,
quadrant = 0,
direct = 1,// direct map
access = 7,
invalid = 0,
nxe = 0,
cr0_wp = 0,
smep_andnot_wp = 0,
smap_andnot_wp = 0,
ad_disabled = 0,
guest_mode = 0,
smm = 0
}
},
spt = 0xffff8ed0f9537000,//页表页
gfns = 0x0,
unsync = false,
lpage_disallowed = true,
root_count = 0,
unsync_children = 0,
parent_ptes = {
val = 18446619627041077552
},
mmu_valid_gen = 48,
unsync_child_bitmap = {0, 0, 0, 0, 0, 0, 0, 0},
write_flooding_count = {
counter = 0
}
}
展示一个四级页表的读取过程,方便理解:
4级页表:
crash> rd 0xffff8ed12e4b3000 10
ffff8ed12e4b3000: 000000102e4b2107 0000000000000000 .!K………….
3级页表:
rd -p 000000102e4b2000 0x12
102e4b2000: 0000000ff5936107 0000000000000000 .a…………..
102e4b2010: 0000000ff593a107 0000000ff5939107 …………….
102e4b2020: 0000000000000000 000000102e68f107 ……….h…..
102e4b2030: 0000000000000000 0000000000000000 …………….
102e4b2040: 0000000000000000 0000000000000000 …………….
102e4b2050: 0000000000000000 0000000000000000 …………….
102e4b2060: 0000000000000000 0000000000000000 …………….
102e4b2070: 000000102e8a2107 000000102e4b1107 .!……..K…..
102e4b2080: 000000102e4b0107 0000000000000000 ..K………….
2级页表:
rd -p 0000000ff5936000 0x200
ff5936000: 0000001015d20107 0000000000000000 …………….
ff5936a80: 0000000f13200ff3 0000000000000000 .. ………….
ff5936c60: 0000000000000000 0000000f12c00ff3 …………….
ff5936c70: 0000000f12e00ff3 0000000f12800ff3 …………….
ff5936c80: 0000000f12a00ff3 0000000f12400ff3 ……….@…..
ff5936c90: 0000000f12600ff3 0000000f12000ff3 ...............
ff5936ca0: 0000000f12200ff3 0000000f11c00ff3 .. .............
ff5936cb0: 0000000f11e00ff3 0000000f11800ff3 ................
ff5936cc0: 0000000f11a00ff3 0000000f11400ff3 ..........@.....
ff5936cd0: 0000000f11600ff3 0000000f11000ff3 ..………….
ff5936ce0: 0000000f11200ff3 0000000f10c00ff3 .. ………….
ff5936cf0: 0000000f10e00ff3 0000000f10800ff3 …………….
ff5936d00: 0000001011407107 000000102fcfc107 .q@……../….
ff5936d10: 0000000fa0000ff3 000000101151f107 ……….Q…..
ff5936d20: 0000000f9f800ff3 000000102fd1b107 ………../….
ff5936d30: 0000001031350107 000000103063f107 ..51……c0….
ff5936d40: 000000102d607107 0000001032918107 .q`-…….2….
ff5936d50: 00000010117d2107 00000010289aa107 .!}……..(….
ff5936d60: 0000000f9d000ff3 0000000f9ce00ff3 …………….
ff5936d70: 000000102fe05107 000000102dab6107 .Q./…..a.-….
ff5936d80: 0000000ff0a1a107 000000101a4ec107 ……….N…..
1级页表:
rd -p 0000001015d20000 0x200
1015d20400: 0000000fa2080f77 0000000fa2081f77 w…….w…….
1015d20410: 0000000fa2082f77 0000000fa2083f77 w/……w?……
1015d20480: 0000000fa2090f77 0000000fa2091f77 w…….w…….
1015d20490: 0000000fa2092f77 0000000fa2093f77 w/……w?……
1015d204a0: 0000000fa2094f77 0000000fa2095f77 wO……w_……
1015d204b0: 0000000fa2096f77 0000000fa2097f77 wo……w…….
1015d204c0: 0000000fa2098f77 0000000000000000 w……………
读取第一个page的内容:
fa2080f77: 34362e343632383a 362e34362d33352e :8264.64.53-64.6
fa2080f87: 2f34362c32342e34 373932383a2d3335 4.42,64/53-:8297
fa2080f97: 362d33352e343631 3137392b30322e34 164.53-64.20+971
fa2080fa7: 34362d33352e3436 362e34362d33352e 64.53-64.53-64.6
fa2080fb7: 2b30322d33352e34 383a2e34362e3436 4.53-20+64.64.:8
fa2080fc7: 382e343630363732 353b3d2e34363036 276064.86064.=;5
fa2080fd7: 34362e34362d3335 362c32332e34362e 53-64.64.64.32,6
fa2080fe7: 2e3436343a3c2e34 35372c32342b3133 4.<:464.31+42,75
fa2080ff7: 3c2e33352e34362f 36303638353a3c3a /64.53.<:<:58606
fa2081007: 2a2f312c32342e34 31332d33352e3436 4.42,1/64.53-31
fa2081017: 332e34363036382c 2e34362f35372c31 ,86064.31,75/64.
fa2081027: 3638343a3c2e3436 392f35372e343630 64.<:486064.75/9
fa2081037: 2d33352e34363137 32342e34362e3436 7164.53-64.64.42
fa2081047: 333036382e34362c 2e34362e34362b31 ,64.86031+64.64.
fa2081057: 34362f34362c3133 352c32342e34362e 31,64/64.64.42,5
fa2081067: 32383a2e34362d33 32332e3436313739 3-64.:8297164.32
fa2081077: 342e34362f35372c 2d33352d32342c32 ,75/64.42,42-53-
fa2081087: 383a2e33352d3234 382e34362d333532 42-53.:8253-64.8
fa2081097: 2e34362e34363036 34362e34362e3436 6064.64.64.64.64
fa20810a7: 32373d3f2b31332e 32383a2e34362a30 .31+?=72064.:82
三:代码详解
调用的关系如下所示:
vmx_handle_exit //vcpu_run->vcpu_enter_guest,Vcpu运行退出后调用
handle_exception //is_page_fault(intr_info)
kvm_handle_page_fault
kvm_mmu_page_fault //处理MMIO和普通内存fault,vcpu->arch.mmu.page_fault
tdp_page_fault //kvm处理内存异常核心函数
try_async_pf
__direct_map //完成映射
/*
 * Core KVM handler for EPT/TDP memory faults (abridged excerpt: local
 * declarations and several error paths are omitted).  Resolves the faulting
 * guest physical address to a host pfn, then installs the mapping.
 */
static int tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,bool prefault)
{
r = mmu_topup_memory_caches(vcpu);/* make sure the MMU-related caches have enough memory */
level = mapping_level(vcpu, gfn, &force_pt_level);
/* the fast path only handles faults related to write and tracking attributes */
if (fast_page_fault(vcpu, gpa, level, error_code))
return RET_PF_RETRY;
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
/* resolve the host page for this fault (possibly asynchronously) —
 * core function, detailed below */
if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
return RET_PF_RETRY;
if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
return r;
/* once the page exists, build the kvm_mmu_page page tables; subsequent
 * addressing is done by EPT in hardware — detailed below */
r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
prefault, lpage_disallowed);
}
关键的两个函数是try_async_pf
和__direct_map
,一个用来产生page(产生页的意思是内核不会第一时间给虚拟机地址分配物理地址,真正要用时才会分配),一个用来生成页表(完成虚拟机物理地址到物理机物理地址的映射)。
1:try_async_pf
/*
 * Resolve gfn -> host pfn for a fault (excerpt: the article splices the
 * callee __gfn_to_pfn_memslot in right after this, so the async tail and
 * the closing brace of this function are not shown here).
 */
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write,
bool *writable)
{
struct kvm_memory_slot *slot;
bool async;
/* Visibility check: walk the memslots and test, via base_gfn and npages,
 * whether gfn falls inside some slot's range (flags are checked too).
 */
if (is_guest_mode(vcpu) && !kvm_is_visible_gfn(vcpu->kvm, gfn)) {
*pfn = KVM_PFN_NOSLOT;
return false;
}
/* gfn -> memslot: each memslot has base_gfn and npages; find the slot
 * whose range contains this gfn */
slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
async = false;
*pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
if (!async)
return false; /* *pfn has correct page already */
/*
 * gfn -> pfn within a known memslot: compute the host virtual address,
 * then fault the backing page in.
 */
kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
bool atomic, bool *async, bool write_fault,
bool *writable)
{
/* gfn -> userspace address, computed as
 * slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE
 */
unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
/* hva -> pfn: walk (or populate) the host page tables for this address */
return hva_to_pfn(addr, atomic, async, write_fault,
writable);
}
hva_to_pfn是host端虚拟地址到物理地址的转换,有两种方式,hva_to_pfn_fast、hva_to_pfn_slow。
hva_to_pfn_fast //快速的hva到pfn转换,有页返回true
__get_user_pages_fast //往下走是根据页表一步步往下读,如果有某级页表为空
gup_pgd_range //则直接返回,过程不会睡眠。突出快。有页表项为空或者没有页也会
gup_p4d_range //进入到slow的路径。
gup_pud_range
gup_pmd_range
gup_pte_range
hva_to_pfn_slow
//slow的流程可能会睡眠。fast不会
hva_to_pfn_slow
get_user_pages_unlocked
__get_user_pages_locked
__get_user_pages
faultin_page //没有page会通过page_fault流程进行处理。
follow_page_mask //有page会一步步读取页表获取页
follow_p4d_mask
follow_pud_mask
follow_pmd_mask
follow_page_pte
我们对关键的函数进行查看:
/*
 * Pin user pages for a range of host virtual addresses (heavily abridged
 * excerpt: loop bookkeeping, flag handling and the function's tail are
 * omitted; braces here are intentionally unbalanced).
 */
static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *nonblocking)
{
do {
struct page *page;
unsigned int foll_flags = gup_flags;
unsigned int page_increm;
/* first iteration or cross vma bound */
if (!vma || start >= vma->vm_end) {
vma = find_extend_vma(mm, start);
}
retry:
/* walk the page tables level by level and access the page */
page = follow_page_mask(vma, start, foll_flags, &page_mask);
if (!page) {
int ret; /* no page found: request one through the page-fault path */
ret = faultin_page(tsk, vma, start, &foll_flags,nonblocking);
if (pages) {
pages[i] = page;
flush_anon_page(vma, page, start);
flush_dcache_page(page);
page_mask = 0;
}
page_fault是kernel响应用户态内存访问的关键,但不是本文重点,就不赘述。假设现在我们已经有了page,还剩下最后一步:将虚拟机里面的物理地址映射到该page,下次访问由硬件完成寻址。最后一步映射由__direct_map完成。
__direct_map
/*
 * Install the gpa -> pfn mapping in KVM's page tables (excerpt).  Walks
 * down from root_hpa, allocating intermediate kvm_mmu_page levels as
 * needed, then writes the leaf spte.
 * NOTE(review): upstream breaks out of this walk once it.level reaches the
 * target level before setting the leaf spte; that check is omitted in this
 * excerpt — confirm against the kernel source.
 */
static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
int map_writable, int level, kvm_pfn_t pfn,
bool prefault, bool lpage_disallowed)
{
struct kvm_shadow_walk_iterator it;
struct kvm_mmu_page *sp;
int ret;
gfn_t gfn = gpa >> PAGE_SHIFT;
gfn_t base_gfn = gfn;
/* walk from root_hpa and fetch the relevant sp at each level;
 * the macro is expanded in detail below */
for_each_shadow_entry(vcpu, gpa, it) {
if (!is_shadow_present_pte(*it.sptep)) {
/* pte not present: look up the sp — reuse it if it exists,
 * otherwise create it */
sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
it.level - 1, true, ACC_ALL);
/* link the sp into this pte */
link_shadow_page(vcpu, it.sptep, sp);
if (lpage_disallowed)
account_huge_nx_page(vcpu->kvm, sp);
}
}
/* connect the leaf sptep to the pfn; the trailing 12 bits are flags */
ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL,
write, level, base_gfn, pfn, prefault,
map_writable);
direct_pte_prefetch(vcpu, it.sptep);
++vcpu->stat.pf_fixed;
return ret;
}
/* Iterate over the shadow page-table entries covering _addr, from the
 * root level down to the leaf.  The init/okay/next pseudo-code below is
 * the author's summary of the three helpers, not compilable C. */
#define for_each_shadow_entry(_vcpu, _addr, _walker) \
for (shadow_walk_init(&(_walker), _vcpu, _addr); \
shadow_walk_okay(&(_walker)); \
shadow_walk_next(&(_walker)))
/* init: start the walk at the mmu root (root_hpa, shadow_root_level) */
#init:
iterator->addr = addr;
iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
iterator->level = vcpu->arch.mmu.shadow_root_level;
/* okay: compute the entry index and sptep for the current level */
#okay:
iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
/* next: stop after the last-level spte, otherwise descend through it */
#next:
is_last_spte(spte, iterator->level)
iterator->level = 0;
return;
iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
--iterator->level;
/*
 * Write the leaf spte (excerpt: declarations of ret/flush/was_rmapped and
 * some bookkeeping are omitted).  Drops any conflicting old spte, installs
 * the new one, and updates the reverse map.
 */
static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
bool speculative, bool host_writable)
{
/* if an spte is already present, drop it first */
if (is_shadow_present_pte(*sptep)) {
/*
 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
 * the parent of the now unreachable PTE.
 */
if (level > PT_PAGE_TABLE_LEVEL &&
!is_large_pte(*sptep)) {
struct kvm_mmu_page *child;
u64 pte = *sptep;
child = page_header(pte & PT64_BASE_ADDR_MASK);
drop_parent_pte(child, sptep);
flush = true;
} else if (pfn != spte_to_pfn(*sptep)) {
pgprintk("hfn old %llx new %llx\n",
spte_to_pfn(*sptep), pfn);
drop_spte(vcpu->kvm, sptep);
flush = true;
} else
was_rmapped = 1;
}
/* build the spte from pfn << PAGE_SHIFT plus attribute bits, then store
 * it through sptep, completing the page-table setup */
set_spte_ret = set_spte(vcpu, sptep, pte_access, level, gfn, pfn,
speculative, true, host_writable);
if (set_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
if (write_fault)
ret = RET_PF_EMULATE;
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush)
kvm_flush_remote_tlbs(vcpu->kvm);
if (unlikely(is_mmio_spte(*sptep)))
ret = RET_PF_EMULATE;
if (!was_rmapped && is_large_pte(*sptep))
++vcpu->kvm->stat.lpages;
/* fill in the arch-specific reverse map (gfn -> sptes) */
if (is_shadow_present_pte(*sptep)) {
if (!was_rmapped) {
rmap_count = rmap_add(vcpu, sptep, gfn);
if (rmap_count > RMAP_RECYCLE_THRESHOLD)
rmap_recycle(vcpu, sptep, gfn);
}
}
return ret;
}