3.3.1 EPT initialization
(1) kvm_arch_init ==> kvm_mmu_module_init
a) Create the pte_list_desc_cache slab cache, which backs struct pte_list_desc:
struct pte_list_desc {
u64 *sptes[PTE_LIST_EXT];
struct pte_list_desc *more;
};
b) Create the mmu_page_header_cache slab cache, which backs struct kvm_mmu_page.
c) register_shrinker(&mmu_shrinker); registers the hook invoked when the host reclaims memory.
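For reference, the cache setup in kvm_mmu_module_init of kernels from this era looks roughly as follows (a sketch; percpu counters and some error paths are abbreviated):
static int kvm_mmu_module_init(void)
{
	/* slab cache backing struct pte_list_desc allocations */
	pte_list_desc_cache = kmem_cache_create("pte_list_desc",
				sizeof(struct pte_list_desc), 0, 0, NULL);
	if (!pte_list_desc_cache)
		goto nomem;
	/* slab cache backing struct kvm_mmu_page allocations */
	mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
				sizeof(struct kvm_mmu_page), 0, 0, NULL);
	if (!mmu_page_header_cache)
		goto nomem;
	/* let the host shrinker reclaim mmu pages under memory pressure */
	register_shrinker(&mmu_shrinker);
	return 0;
nomem:
	mmu_destroy_caches();
	return -ENOMEM;
}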
(2) EPT identity map
In real mode the guest runs behind an identity mapping, which is set up first.
Userspace (QEMU): kvm_init ==> kvm_arch_init
identity_base = 0xfeffc000;
ret = kvm_vm_ioctl(s, KVM_SET_IDENTITY_MAP_ADDR, &identity_base);
This sets the physical address of the EPT identity map; the address is a GPA. On the kernel side:
static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
                                              u64 ident_addr)
{
kvm->arch.ept_identity_map_addr = ident_addr;
return 0;
}
Note: the identity map is only used for real-mode virtualization.
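A minimal user-space sketch of issuing this ioctl directly (hypothetical standalone example, not QEMU code; error handling omitted):
#include <fcntl.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int set_identity_base(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int vm  = ioctl(kvm, KVM_CREATE_VM, 0);
	uint64_t identity_base = 0xfeffc000;

	/* must be set before any vcpu is created */
	return ioctl(vm, KVM_SET_IDENTITY_MAP_ADDR, &identity_base);
}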
(3) vmx_create_vcpu
if (!kvm->arch.ept_identity_map_addr)
kvm->arch.ept_identity_map_addr =
VMX_EPT_IDENTITY_PAGETABLE_ADDR;
alloc_identity_pagetable(kvm);
init_rmode_identity_map(kvm);
int alloc_identity_pagetable(struct kvm *kvm) (vmx.c)
//the memslot IDENTITY_PAGETABLE_PRIVATE_MEMSLOT is reserved specifically for the EPT identity map
kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
kvm_userspace_mem.flags = 0;
kvm_userspace_mem.guest_phys_addr = kvm->arch.ept_identity_map_addr;
kvm_userspace_mem.memory_size = PAGE_SIZE; //one page in size
r = __kvm_set_memory_region(kvm, &kvm_userspace_mem);
//look up the host struct page for the GPA's page frame number
page = gfn_to_page(kvm, kvm->arch.ept_identity_map_addr >> PAGE_SHIFT);
kvm->arch.ept_identity_pagetable = page;
init_rmode_identity_map //prepares the identity map for real mode
//zero the HVA page backing ept_identity_map_addr
r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
Build the identity mapping for real mode: 1024 page-directory entries, each a 4 MB PSE mapping (see the short illustration after the loop):
for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER|
_PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
r = kvm_write_guest_page(kvm, identity_map_pfn,
&tmp, i * sizeof(tmp), sizeof(tmp));
if (r < 0)
goto out;
}
kvm->arch.ept_identity_pagetable_done = true;
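As a quick illustration of the values the loop writes, a user-space sketch (the flag constants are the standard x86 PDE bits):
#include <stdio.h>
#include <stdint.h>

/* _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_PSE */
#define IDENT_FLAGS (0x1 | 0x2 | 0x4 | 0x20 | 0x40 | 0x80)

int main(void)
{
	/* each of the 1024 PDEs is a 4 MB PSE entry mapping GPA i<<22 to itself */
	for (uint32_t i = 0; i < 3; i++)
		printf("PDE[%u] = %#010x  (covers GPA %#010x..%#010x)\n",
		       i, (i << 22) + IDENT_FLAGS,
		       i << 22, ((i + 1) << 22) - 1);
	return 0;
}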
(4) Loading the EPT tables
vcpu_enter_guest(struct kvm_vcpu *vcpu) ==> kvm_mmu_reload ==> kvm_mmu_load
int kvm_mmu_load(struct kvm_vcpu *vcpu)
//top up the per-vcpu object caches:
//vcpu->arch.mmu_pte_list_desc_cache and vcpu->arch.mmu_page_header_cache
r = mmu_topup_memory_caches(vcpu);
r = mmu_alloc_roots(vcpu); // calls mmu_alloc_direct_roots to obtain arch.mmu.root_hpa
kvm_mmu_sync_roots(vcpu);
/* set_cr3() should ensure TLB has been flushed */
vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
Builds the management structure for the top-level EPT page table according to the vcpu's current paging mode.
if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
spin_lock(&vcpu->kvm->mmu_lock);
make_mmu_pages_available(vcpu);
sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL,
1, ACC_ALL, NULL); //this sp backs the PML4 table
++sp->root_count;
spin_unlock(&vcpu->kvm->mmu_lock);
vcpu->arch.mmu.root_hpa = __pa(sp->spt);
} else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) {
for (i = 0; i < 4; ++i) {
hpa_t root = vcpu->arch.mmu.pae_root[i];
ASSERT(!VALID_PAGE(root));
spin_lock(&vcpu->kvm->mmu_lock);
make_mmu_pages_available(vcpu);
sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
i << 30,
PT32_ROOT_LEVEL, 1, ACC_ALL,
NULL); //with 32-bit PAE, address bits [31:30] select one of only 4 entries, so all 4 are allocated up front here to reduce later VM exits
root = __pa(sp->spt);
++sp->root_count;
spin_unlock(&vcpu->kvm->mmu_lock);
vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
}
vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); //for 32-bit PAE, root_hpa is the address of the pae_root table
}
(5) gfn_to_page
This function translates a GPA page frame number into the corresponding host struct page.
pfn = gfn_to_pfn(kvm, gfn); // call __gfn_to_pfn
return kvm_pfn_to_page(pfn); //call pfn_to_page
gfn_to_pfn ==> __gfn_to_pfn ==>
a. __gfn_to_hva_many
static inline unsigned long
__gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
{
return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
}
b. hva_to_pfn ==> hva_to_pfn_slow ==> get_user_page_nowait
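The slot arithmetic above is a simple offset computation; a minimal user-space sketch with a hypothetical memslot (base_gfn and userspace_addr values are made up):
#include <stdio.h>

int main(void)
{
	unsigned long userspace_addr = 0x7f0000000000UL; /* hypothetical slot HVA base */
	unsigned long base_gfn       = 0x100;            /* hypothetical slot base gfn */
	unsigned long gfn            = 0x123;

	unsigned long hva = userspace_addr + (gfn - base_gfn) * 4096;
	printf("gfn %#lx -> hva %#lx\n", gfn, hva); /* prints 0x7f0000023000 */
	return 0;
}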
3.3.2 EPT table management
(1) kvm_mmu_page
Since the HVA corresponding to a GPA already exists, the HPA can be obtained directly from the GPA; gfn_to_page implements exactly that. The memory holding the EPT page tables themselves, however, is not yet allocated, so KVM has to manage both that page-table memory and the mappings it contains. For this purpose KVM introduces the kvm_mmu_page structure.
struct kvm_mmu_page {
struct list_head link;
//KVM keeps every mmu page on a hash table so the kvm_mmu_page instance for a gfn can be found quickly
struct hlist_node hash_link;
gfn_t gfn;
union kvm_mmu_page_role role;
u64 *spt; //host virtual address (HVA) of the page-table page that this kvm_mmu_page describes
/* hold the gfn of each spte inside spt */
gfn_t *gfns;
bool unsync;
int root_count; /*Currently serving as active root */
unsigned int unsync_children;
unsigned long parent_ptes; //reverse map of the parent page-table entries that point to this page-table page
unsigned long mmu_valid_gen; //generation number of this page; compared against kvm->arch.mmu_valid_gen, a smaller value means the page is invalid
DECLARE_BITMAP(unsync_child_bitmap, 512);
#ifdef CONFIG_X86_32
int clear_spte_count;
#endif
int write_flooding_count;
};
union kvm_mmu_page_role {
unsigned word;
struct {
unsigned glevels:4;
unsigned level:4; //level of this page-table page
unsigned quadrant:2;
unsigned pad_for_nice_hex_output:6;
unsigned direct:1;
unsigned access:3; //access permissions
unsigned invalid:1; //set when this page has been invalidated
unsigned cr4_pge:1; //records cr4.pae; 0 in direct mode
unsigned nxe:1; //records efer.nxe
};
};
The allocation function:
static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
                                               u64 *parent_pte, int direct)
{
struct kvm_mmu_page *sp;
sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
//spt is the page frame itself, allocated from mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache)
sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
if (!direct) // gfn_t *gfns is only needed in the shadow-paging case
sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
//add this structure to the vcpu->kvm->arch.active_mmu_pages list
list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
sp->parent_ptes = 0;
//page-table pages form a hierarchy, so link this one to its parent
mmu_page_add_parent_pte(vcpu, sp, parent_pte);
kvm_mod_used_mmu_pages(vcpu->kvm, +1);
return sp;
}
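The set_page_private() call above is what later allows the kvm_mmu_page to be recovered from an spt address (see rmap_add below); the page_header() helper in mmu.c of this era looks roughly like:
static struct kvm_mmu_page *page_header(hpa_t shadow_page)
{
	struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);

	/* reads back the pointer stored by set_page_private() in kvm_mmu_alloc_page */
	return (struct kvm_mmu_page *)page_private(page);
}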
mmu_page_add_parent_pte ==> pte_list_add(vcpu, parent_pte, &sp->parent_ptes)
When parent_pte is NULL, mmu_page_add_parent_pte returns immediately.
static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
                        unsigned long *pte_list)
{
struct pte_list_desc *desc;
int i, count = 0;
if (!*pte_list) { //first time: the sp is freshly initialized, so simply store sp->parent_ptes = parent_pte
*pte_list = (unsigned long)spte;
} else if (!(*pte_list & 1)) { //second time: promote the single pointer to a pte_list_desc
desc = mmu_alloc_pte_list_desc(vcpu);
desc->sptes[0] = (u64 *)*pte_list;
desc->sptes[1] = spte;
*pte_list = (unsigned long)desc | 1;
++count;
} else { //later additions: a desc already exists, so no new allocation is needed unless it is full
desc = (struct pte_list_desc *)(*pte_list & ~1ul);
while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {
desc = desc->more;
count += PTE_LIST_EXT;
}
//when the sptes array is full, chain another desc via ->more
if (desc->sptes[PTE_LIST_EXT-1]) {
desc->more = mmu_alloc_pte_list_desc(vcpu);
desc = desc->more;
}
for (i = 0; desc->sptes[i]; ++i)
++count;
desc->sptes[i] = spte;
}
return count;
}
Since an spte address is always a multiple of 8, its lowest bit is guaranteed to be 0, and this is exploited as follows:
· pte_list is represented as an unsigned long;
· if pte_list is 0, nothing has been recorded yet, so the spte pointer is stored in it directly; this is the 0->1 case above;
· if pte_list is non-zero and its lowest bit is 0, the rmap currently holds exactly one spte; pte_list is then changed to the address of a struct pte_list_desc with the lowest bit set to 1, marking that the value is no longer a plain spte address but a pointer to a struct pte_list_desc; this is the 1->many case above;
· if pte_list is non-zero and its lowest bit is 1, masking off that bit yields the struct pte_list_desc, from which more sptes can be reached; this is the many->many case above.
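A self-contained user-space sketch of this low-bit tagging (the names and list size are illustrative, not the kernel's):
#include <stdint.h>
#include <stdio.h>

#define PTE_LIST_EXT 3

struct pte_list_desc {
	uint64_t *sptes[PTE_LIST_EXT];
	struct pte_list_desc *more;
};

static int is_desc(unsigned long pte_list)
{
	return pte_list & 1;	/* bit 0 set: value points to a pte_list_desc */
}

static struct pte_list_desc *to_desc(unsigned long pte_list)
{
	return (struct pte_list_desc *)(pte_list & ~1ul);
}

int main(void)
{
	static uint64_t spte0, spte1;
	static struct pte_list_desc d;
	unsigned long pte_list;

	pte_list = (unsigned long)&spte0;	/* 0 -> 1: store the spte pointer directly */
	printf("single spte? %d\n", !is_desc(pte_list));

	d.sptes[0] = (uint64_t *)pte_list;	/* 1 -> many: move the old entry into a desc */
	d.sptes[1] = &spte1;
	pte_list = (unsigned long)&d | 1;	/* tag bit 0 to mark "this is a desc" */
	printf("desc? %d, first spte %p\n", is_desc(pte_list),
	       (void *)to_desc(pte_list)->sptes[0]);
	return 0;
}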
struct pte_list_desc is defined as follows (arch/x86/kvm/mmu.c):
struct pte_list_desc {
u64 *sptes[PTE_LIST_EXT];
struct pte_list_desc *more;
};
It is a node of a singly linked list; each node holds up to three spte addresses plus a pointer to the next node. This reverse map is used in cases such as page reclaim or swap-out: if the host needs to swap a guest physical page out to disk, it has to locate every spte mapping that page's GPA and mark it not-present. The reverse map is walked with the following function:
static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
{
struct pte_list_desc *desc;
int i;
if (!*pte_list)
return;
if (!(*pte_list & 1))
return fn((u64 *)*pte_list);
desc = (struct pte_list_desc *)(*pte_list & ~1ul);
while (desc) {
for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
fn(desc->sptes[i]);
desc = desc->more;
}
}
The next subsection analyses kvm_mmu_get_page, the caller of kvm_mmu_alloc_page.
(2) kvm_mmu_get_page flow
Function signature:
static struct kvm_mmu_page *kvm_mmu_get_page(
struct kvm_vcpu *vcpu,
gfn_t gfn,
gva_t gaddr,
unsigned level, //level of the page-table page; possible values 3, 2, 1
int direct, //1 for EPT
unsigned access, //access permissions of this page-table page
u64 *parent_pte) //address of the entry in the parent page-table page that points to this page-table page
a) First, role is initialized; under EPT, vcpu->arch.mmu.base_role starts out as 0.
b) Then for_each_gfn_sp is used to look for an already existing kvm_mmu_page; the macro walks the hash bucket selected by gfn, following the hash_link field (the hash function is sketched after the macro):
#define for_each_gfn_sp(_kvm, _sp, _gfn) \
hlist_for_each_entry(_sp, \
&(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
if ((_sp)->gfn != (_gfn)) {} else
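In kernels of this vintage, kvm_page_table_hashfn is simply the low bits of the gfn, roughly (a sketch; KVM_MMU_HASH_SHIFT is the bucket-count shift):
static unsigned kvm_page_table_hashfn(gfn_t gfn)
{
	/* pick one of the 1 << KVM_MMU_HASH_SHIFT hash buckets */
	return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
}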
c) If a matching sp is found, mmu_page_add_parent_pte is called:
for_each_gfn_sp(vcpu->kvm, sp, gfn) {
...
if (is_obsolete_sp(vcpu->kvm, sp))
continue;
//set up the reverse map for the parent pte
mmu_page_add_parent_pte(vcpu, sp, parent_pte);
if (sp->unsync_children) { //unsync_children counts how many sptes in this page-table page are unsync
kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
kvm_mmu_mark_parents_unsync(sp); //mark sp's parents as unsync
} else if (sp->unsync)
kvm_mmu_mark_parents_unsync(sp);
return sp;
}
d) If sp has unsync children, KVM_REQ_MMU_SYNC is requested and handled on the next VM entry, and the page-table pages that point to sp are marked unsync.
static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
{
pte_list_walk(&sp->parent_ptes, mark_unsync);
}
e) If no page-table page exists yet for this gfn, kvm_mmu_alloc_page is called:
sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct);
sp->gfn = gfn;
sp->role = role;
//insert into the kvm->arch.mmu_page_hash table via hash_link
hlist_add_head(&sp->hash_link,
&vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
init_shadow_page_table(sp); //sp->spt[i] = 0 for every entry
f) When sp->mmu_valid_gen != vcpu->kvm->arch.mmu_valid_gen, the sp is obsolete (invalid):
static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
}
kvm_mmu_invalidate_zap_all_pages increments kvm->arch.mmu_valid_gen, which invalidates every sp at once.
(3) kvm_mmu_free_page
When a page becomes invalid it is eventually freed through kvm_mmu_free_page. We do not analyse that code here, only when the reclamation happens:
kvm_mmu_commit_zap_page ==> kvm_mmu_free_page
list_for_each_entry_safe(sp, nsp, invalid_list, link) {
WARN_ON(!sp->role.invalid || sp->root_count);
kvm_mmu_free_page(sp);
}
Pages are freed from invalid_list.
prepare_zap_oldest_mmu_page moves sps onto the invalid list.
It is called in three cases:
(1) when host memory needs to be reclaimed: mmu_shrink_scan
(2) when a guest OS memory region changes
(3) when the EPT roots are loaded: mmu_alloc_direct_roots ==> make_mmu_pages_available
3.3.3 EPT handling on VM entry
(1) KVM_REQ_MMU_RELOAD
kvm_mmu_invalidate_zap_all_pages ==> kvm_reload_remote_mmus ==> make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
vcpu_enter_guest ==>
if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
kvm_mmu_unload(vcpu); //call mmu_free_roots
kvm_mmu_unload frees the root pages and, via kvm_mmu_prepare_zap_page, puts the affected sps onto invalid_list.
(2) KVM_REQ_MMU_SYNC
kvm_mmu_get_page
if (sp->unsync_children) {
kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
kvm_mmu_mark_parents_unsync(sp);
}
set_spte ==> mmu_need_write_protect ==> kvm_unsync_pages ==> __kvm_unsync_page ==> kvm_mmu_mark_parents_unsync
vcpu_enter_guest==>
if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
kvm_mmu_sync_roots(vcpu); //call mmu_sync_roots
kvm_mmu_sync_roots ==> mmu_sync_roots ==> mmu_sync_children
while (mmu_unsync_walk(parent, &pages)) { //collect the unsync pages below parent
bool protected = false;
for_each_sp(pages, sp, parents, i)
protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
if (protected)
kvm_flush_remote_tlbs(vcpu->kvm);
for_each_sp(pages, sp, parents, i) {
kvm_sync_page(vcpu, sp, &invalid_list);
mmu_pages_clear_parents(&parents);
}
kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
cond_resched_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_pages_init(parent, &parents, &pages);
}
static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
                           struct list_head *invalid_list, bool clear_unsync)
{
if (sp->role.cr4_pae != !!is_pae(vcpu)) {
kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
return 1;
}
if (clear_unsync)
kvm_unlink_unsync_page(vcpu->kvm, sp); //clears sp->unsync (sp->unsync = 0)
//sync the page; in the EPT case sync_page = nonpaging_sync_page, which simply returns 1
if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); //the page is then put onto invalid_list
return 1;
}
kvm_mmu_flush_tlb(vcpu);
return 0;
}
(3) KVM_REQ_TLB_FLUSH
kvm_mmu_flush_tlb ==> kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
vcpu_enter_guest==>
if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
kvm_x86_ops->tlb_flush(vcpu);
static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
{
vpid_sync_context(to_vmx(vcpu));
if (enable_ept) {
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
}
}
static inline void __invvpid(int ext, u16 vpid, gva_t gva)
{
struct {
u64 vpid : 16;
u64 rsvd : 48;
u64 gva;
} operand = { vpid, 0, gva };
asm volatile (__ex(ASM_VMX_INVVPID)
/* CF==1 or ZF==1--> rc = -1 */
"; ja 1f ; ud2 ;1:"
: :"a"(&operand), "c"(ext) : "cc","memory");
}
static inline void __invept(int ext, u64 eptp, gpa_t gpa)
{
struct {
u64 eptp, gpa;
} operand = {eptp, gpa};
asm volatile (__ex(ASM_VMX_INVEPT)
/* CF==1 or ZF==1 --> rc = -1 */
"; ja 1f ; ud2 ; 1:\n"
: : "a" (&operand), "c" (ext) :"cc", "memory");
}
3.3.4 EPT handling on VM exit
(1) The CR3 register
static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
unsigned long guest_cr3;
u64 eptp;
guest_cr3 = cr3;
if (enable_ept) {
eptp = construct_eptp(cr3);
vmcs_write64(EPT_POINTER, eptp); //write the EPT pointer
if (is_paging(vcpu) || is_guest_mode(vcpu)) //guest paging enabled, or nested
guest_cr3 = kvm_read_cr3(vcpu);
else
guest_cr3 = vcpu->kvm->arch.ept_identity_map_addr; //real mode
ept_load_pdptrs(vcpu);
}
vmx_flush_tlb(vcpu);
vmcs_writel(GUEST_CR3, guest_cr3);
}
arch.mmu.root_hpa is allocated in mmu_alloc_direct_roots.
kvm_mmu_load ==> vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
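For reference, construct_eptp(), used both here and in vmx_flush_tlb() above, looks roughly like this in vmx.c of the same era (a sketch; the constants encode the VMX EPTP memory-type and page-walk-length fields):
static u64 construct_eptp(unsigned long root_hpa)
{
	u64 eptp;

	/* memory type and guest-address-width fields of the EPTP */
	eptp = VMX_EPT_DEFAULT_MT |
	       VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;
	if (enable_ept_ad_bits)
		eptp |= VMX_EPT_AD_ENABLE_BIT;
	/* physical address of the top-level EPT table (arch.mmu.root_hpa) */
	eptp |= (root_hpa & PAGE_MASK);

	return eptp;
}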
(2) handle_ept_violation
a. EXIT_QUALIFICATION bits:
bit7=0, bit8=1 is an illegal combination.
bit7=1 means the GPA came from translating a guest linear address; bit8=0 then means the EPT violation occurred while accessing a guest paging structure.
bit7=1, bit8=1 means the EPT violation occurred on the final GPA itself (the gpa->hpa access).
bit7=0, bit8=0 means the GPA did not come from a guest linear address (for example, a PDPTE load caused by a mov to cr3 instruction).
b. /* It is a write fault? */
error_code = exit_qualification & (1U << 1);
/* It is a fetch fault? */
error_code |= (exit_qualification & (1U << 2)) << 2;
/* ept page table is present? */
error_code |= (exit_qualification >> 3) & 0x1;
vcpu->arch.exit_qualification = exit_qualification;
return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
kvm_mmu_page_fault ==> vcpu->arch.mmu.page_fault ==> tdp_page_fault
{
...
if (unlikely(error_code & PFERR_RSVD_MASK)) {
r = handle_mmio_page_fault(vcpu, gpa, error_code, true);
if (likely(r != RET_MMIO_PF_INVALID))
return r;
}
r = mmu_topup_memory_caches(vcpu);
if (r)
return r;
force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
if (likely(!force_pt_level)) {
level = mapping_level(vcpu, gfn);
gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
} else
level = PT_PAGE_TABLE_LEVEL;
if (fast_page_fault(vcpu, gpa, level, error_code))
return 0;
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write,&map_writable))
return 0;
if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
return r;
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
goto out_unlock;
make_mmu_pages_available(vcpu);
if (likely(!force_pt_level))
transparent_hugepage_adjust(vcpu, &gfn, &pfn,&level);
r = __direct_map(vcpu, gpa, write, map_writable,
level, gfn, pfn,prefault);
spin_unlock(&vcpu->kvm->mmu_lock);
...
}
mapping_level determines the page-table level at which a given gfn should be mapped:
static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
{
int host_level, level, max_level;
host_level = host_mapping_level(vcpu->kvm, large_gfn);
if (host_level == PT_PAGE_TABLE_LEVEL)
return host_level;
max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
break;
return level - 1;
}
try_async_pf obtains the host pfn (hpa) corresponding to the gpa.
static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
int map_writable, int level, gfn_t gfn, pfn_t pfn,
bool prefault)
{
.......
//walk the EPT/shadow page-table entries for this guest page frame number
for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
if (iterator.level == level) { //reached the target level: fill in the final pte
mmu_set_spte(vcpu, iterator.sptep, ACC_ALL,
write,&emulate, level, gfn, pfn,
prefault,map_writable);
direct_pte_prefetch(vcpu, iterator.sptep);
++vcpu->stat.pf_fixed;
break;
}
drop_large_spte(vcpu, iterator.sptep);
if (!is_shadow_present_pte(*iterator.sptep)) { //the intermediate entry is not present yet, so a lower-level page table must be created
u64 base_addr = iterator.addr;
base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
pseudo_gfn = base_addr >> PAGE_SHIFT;
sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
iterator.level - 1,
1, ACC_ALL, iterator.sptep); //allocate a new page-table page
link_shadow_page(iterator.sptep, sp, true);
}
}
return emulate;
}
__direct_map walks the tables based on the gpa passed in: starting from the level-4 page-table page, it fills in the relevant entry level by level; the walk itself is implemented by the macro for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator). The two cases inside the loop are:
In the first case, the current page-table page (iterator.level) is at the final level (level), so the leaf entry is written directly through mmu_set_spte (discussed in more detail below).
In the second case, the current page-table page A is at an intermediate level (level-4, level-3, level-2) and the entry has not been initialized yet (!is_shadow_present_pte(*iterator.sptep)); kvm_mmu_get_page is then called to find or allocate the next-level page-table page B, which link_shadow_page links into the corresponding entry of A.
For the higher-level page-table pages, calling link_shadow_page is enough: it writes the entry value together with the permission bits. For the last-level entry, besides writing the entry we also need one more step, rmap_add: arch/x86/kvm/mmu.c
static void mmu_set_spte(...)
{
...
if (set_spte(vcpu, sptep, pte_access, level,gfn, pfn, speculative,
true, host_writable)) {
...
}
...
if (is_shadow_present_pte(*sptep)) {
if (!was_rmapped) {
rmap_count = rmap_add(vcpu, sptep, gfn);
...
}
}
...
}
static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
{
...
sp = page_header(__pa(spte));
kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
return pte_list_add(vcpu, spte, rmapp);
}
(3) Walking the EPT tables
#define for_each_shadow_entry(_vcpu, _addr, _walker) \
for (shadow_walk_init(&(_walker), _vcpu, _addr); \
shadow_walk_okay(&(_walker)); \
shadow_walk_next(&(_walker)))
struct kvm_shadow_walk_iterator {
u64 addr;
hpa_t shadow_addr;
u64 *sptep;
int level;
unsigned index;
};
At the start of the walk (shadow_walk_init):
iterator->addr = addr;
iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
iterator->level = vcpu->arch.mmu.shadow_root_level;
The current entry is located (shadow_walk_okay):
iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
Stepping down to the next level (__shadow_walk_next):
iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
--iterator->level;
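A small user-space illustration of the index SHADOW_PT_INDEX computes at each level (9 index bits per level above the 12-bit page offset; the sample address is arbitrary):
#include <stdio.h>
#include <stdint.h>

static unsigned shadow_index(uint64_t gpa, int level)
{
	/* mirrors SHADOW_PT_INDEX: 9 bits per level, starting above the page offset */
	return (gpa >> (12 + (level - 1) * 9)) & 0x1ff;
}

int main(void)
{
	uint64_t gpa = 0x12345678;
	for (int level = 4; level >= 1; level--)
		printf("level %d index = %u\n", level, shadow_index(gpa, level));
	return 0;
}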
3.3.5 Sync page
(1) mmu_sync_roots
Starting from root_hpa, mmu_sync_children is called.
mmu_unsync_walk(struct kvm_mmu_page *sp, struct kvm_mmu_pages *pvec)
a) mmu_unsync_walk collects all unsync child sps and adds them to pvec
b) for_each_sp(pages, sp, parents, i) //for each page in pvec (excluding the pages used as page directories)
protected |= rmap_write_protect(vcpu->kvm, sp->gfn); //write-protect the sptes mapping this gfn; protected records whether anything changed
c) if (protected)
kvm_flush_remote_tlbs(vcpu->kvm);
d) for_each_sp(pages, sp, parents, i) {
kvm_sync_page(vcpu, sp, &invalid_list); //sync the page
mmu_pages_clear_parents(&parents); //walk up the parents, decrementing unsync_children
}
e) kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); //free the pages on invalid_list
(2) kvm_sync_page ==> __kvm_sync_page, with clear_unsync set to true
static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
                           struct list_head *invalid_list, bool clear_unsync)
a. if (clear_unsync)
kvm_unlink_unsync_page(vcpu->kvm, sp); //sp->unsync = 0;
b. if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
(3) kvm_mmu_prepare_zap_page
a) ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
via mmu_unsync_walk and for_each_sp, every unsync child sp gets:
kvm_mmu_prepare_zap_page; mmu_pages_clear_parents
b) kvm_mmu_page_unlink_children
for every page that sp points to: mmu_page_zap_pte ==> drop_parent_pte
c) kvm_mmu_unlink_parents:
drop_parent_pte is applied to every parent entry that points to sp
if (sp->unsync)
kvm_unlink_unsync_page(kvm, sp); //sp->unsync = 0
d) If the root has been freed (root_count is 0), the page is moved onto invalid_list;
otherwise it stays on kvm->arch.active_mmu_pages (marked invalid).
(4) When unsync is set
__direct_map ==> mmu_set_spte ==> set_spte ==> mmu_need_write_protect ==> kvm_unsync_pages ==> __kvm_unsync_page ==> kvm_mmu_mark_parents_unsync