3.3 EPT Memory Virtualization


3.3.1 EPT Initialization

(1) kvm_arch_init ==> kvm_mmu_module_init

a)  Create the pte_list_desc_cache cache, which backs allocations of:

struct pte_list_desc {
    u64 *sptes[PTE_LIST_EXT];
    struct pte_list_desc *more;
};

b)  Create the mmu_page_header_cache cache, which is used to allocate struct kvm_mmu_page

c)  register_shrinker(&mmu_shrinker); registers the hook invoked when the host reclaims memory

 

(2) EPT identity map

In real mode an identity mapping is set up first.

User space: kvm_init ==> kvm_arch_init

       identity_base = 0xfeffc000;
        ret = kvm_vm_ioctl(s, KVM_SET_IDENTITY_MAP_ADDR, &identity_base);

This sets the physical address of the EPT identity map; the address is a gpa. On the kernel side:

static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
                        u64 ident_addr)
{
    kvm->arch.ept_identity_map_addr = ident_addr;
    return 0;
}

Note: the identity map is only used for real-mode virtualization.

 

(3) vmx_create_vcpu

       if (!kvm->arch.ept_identity_map_addr)
           kvm->arch.ept_identity_map_addr =
              VMX_EPT_IDENTITY_PAGETABLE_ADDR;

        alloc_identity_pagetable(kvm);
        init_rmode_identity_map(kvm);

 

int alloc_identity_pagetable(struct kvm *kvm)   (vmx.c)

    // memslot IDENTITY_PAGETABLE_PRIVATE_MEMSLOT is reserved specifically for the EPT identity map
    kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
    kvm_userspace_mem.flags = 0;
    kvm_userspace_mem.guest_phys_addr = kvm->arch.ept_identity_map_addr;
    kvm_userspace_mem.memory_size = PAGE_SIZE; // one page in size
    r = __kvm_set_memory_region(kvm, &kvm_userspace_mem);

    // get the struct page of the host physical page that backs this GPA frame number
    page = gfn_to_page(kvm, kvm->arch.ept_identity_map_addr >> PAGE_SHIFT);
    kvm->arch.ept_identity_pagetable = page;

 

init_rmode_identity_map // sets up the identity map for real mode

    // zero the hva corresponding to ept_identity_map_addr
    r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
    // build the identity mapping for real mode: 1024 PSE (4 MB) entries covering the low 4 GB
    for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
       tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
           _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
       r = kvm_write_guest_page(kvm, identity_map_pfn,
              &tmp, i * sizeof(tmp), sizeof(tmp));
       if (r < 0)
           goto out;
    }
    kvm->arch.ept_identity_pagetable_done = true;
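The loop above can be checked with a tiny stand-alone user-space program (a sketch, not kernel code; the _PAGE_* values are the standard x86 flag bits): each entry i is a 4 MB PSE page-directory entry whose frame address is i << 22, so every guest physical address in that range maps to itself.

#include <stdio.h>
#include <stdint.h>

/* Standard x86 page-table flag values (sketch for illustration only). */
#define _PAGE_PRESENT  0x001
#define _PAGE_RW       0x002
#define _PAGE_USER     0x004
#define _PAGE_ACCESSED 0x020
#define _PAGE_DIRTY    0x040
#define _PAGE_PSE      0x080   /* 4 MB page */

int main(void)
{
    /* Reproduce the first few identity-map PDEs written by init_rmode_identity_map. */
    for (uint32_t i = 0; i < 4; i++) {
        uint32_t pde = (i << 22) | (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
                                    _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
        printf("PDE[%u] = 0x%08x maps GPA [0x%08x, 0x%08x)\n",
               i, pde, i << 22, (i + 1) << 22);
    }
    return 0;
}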

 

 (4) Loading the EPT tables

vcpu_enter_guest(struct kvm_vcpu *vcpu) ==> kvm_mmu_reload ==> kvm_mmu_load

int kvm_mmu_load(struct kvm_vcpu *vcpu)

    // top up the per-vcpu allocation caches:
    // vcpu->arch.mmu_pte_list_desc_cache and vcpu->arch.mmu_page_header_cache
    r = mmu_topup_memory_caches(vcpu);
    r = mmu_alloc_roots(vcpu); // calls mmu_alloc_direct_roots to obtain arch.mmu.root_hpa
    kvm_mmu_sync_roots(vcpu);
    /* set_cr3() should ensure TLB has been flushed */
    vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa);

 

static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)

Builds the management structure for the top-level EPT page table according to the vcpu's paging mode.

    if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
       spin_lock(&vcpu->kvm->mmu_lock);
       make_mmu_pages_available(vcpu);
       sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL,
                    1, ACC_ALL, NULL); // this page is the PML4 table
       ++sp->root_count;
       spin_unlock(&vcpu->kvm->mmu_lock);
       vcpu->arch.mmu.root_hpa = __pa(sp->spt);
    } else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) {
       for (i = 0; i < 4; ++i) {
           hpa_t root = vcpu->arch.mmu.pae_root[i];

           ASSERT(!VALID_PAGE(root));
           spin_lock(&vcpu->kvm->mmu_lock);
           make_mmu_pages_available(vcpu);
           sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
                        i << 30,
                        PT32_ROOT_LEVEL, 1, ACC_ALL,
                        NULL); // bits [31:30] of a 32-bit address select only 4 entries, so all four are allocated here, which reduces later vm-exits
           root = __pa(sp->spt);
           ++sp->root_count;
           spin_unlock(&vcpu->kvm->mmu_lock);
           vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
       }
       vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); // for 32-bit PAE this is the address of the pae_root table
}
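To illustrate the comment about bits [31:30]: under PAE each of the four pae_root entries covers 1 GB of the 32-bit guest address space, so pre-allocating all four avoids later exits on those roots. A minimal user-space sketch of the selection:

#include <stdio.h>
#include <stdint.h>

/* Which of the four PAE page-directory-pointer entries covers a 32-bit address. */
static unsigned pae_root_index(uint32_t addr)
{
    return addr >> 30;   /* bits [31:30]; each root covers 1 GB */
}

int main(void)
{
    uint32_t samples[] = { 0x00001000u, 0x40000000u, 0xbfffffffu, 0xfee00000u };
    for (unsigned i = 0; i < 4; i++)
        printf("addr 0x%08x -> pae_root[%u]\n", samples[i], pae_root_index(samples[i]));
    return 0;
}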

 

(5) gfn_to_page

This function converts a gpa frame number into the struct page of the corresponding host physical page.

    pfn = gfn_to_pfn(kvm, gfn); // calls __gfn_to_pfn
    return kvm_pfn_to_page(pfn); // calls pfn_to_page

gfn_to_pfn ==> __gfn_to_pfn ==>

a. __gfn_to_hva_many

static inline unsigned long
__gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
{
    return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
}

 b. hva_to_pfn ==> hva_to_pfn_slow ==> get_user_page_nowait
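The arithmetic in step (a) can be sketched in user space with a cut-down memslot (the struct here is hypothetical; only the fields used by __gfn_to_hva_memslot are kept):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE 4096ULL

/* Cut-down stand-in for struct kvm_memory_slot (illustration only). */
struct memslot {
    uint64_t base_gfn;        /* first guest frame number covered by the slot */
    uint64_t npages;          /* number of guest pages in the slot */
    uint64_t userspace_addr;  /* HVA at which user space mapped the slot */
};

static uint64_t gfn_to_hva(const struct memslot *slot, uint64_t gfn)
{
    return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
}

int main(void)
{
    struct memslot slot = { .base_gfn = 0x100, .npages = 0x800,
                            .userspace_addr = 0x7f0000000000ULL };
    uint64_t gfn = 0x123;   /* hypothetical guest frame inside the slot */
    printf("gfn 0x%llx -> hva 0x%llx\n",
           (unsigned long long)gfn,
           (unsigned long long)gfn_to_hva(&slot, gfn));
    return 0;
}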

 

3.3.2 EPT Table Management

(1) kvm_mmu_page

 Because the hva corresponding to a gpa already exists, the hpa can be obtained directly from the gpa; gfn_to_page implements exactly that. However, the memory for the EPT page tables that map the page has not been allocated yet, so KVM has to manage both the page-table memory and the page mappings. For this purpose KVM introduces the kvm_mmu_page structure.

 

struct kvm_mmu_page {
    struct list_head link;
    // KVM keeps all mmu pages on a hash list so that the kvm_mmu_page instance for a gfn can be found quickly
    struct hlist_node hash_link;
    gfn_t gfn;
    union kvm_mmu_page_role role;
    u64 *spt; // host virtual address (hva) of the page-table page described by this kvm_mmu_page
    /* hold the gfn of each spte inside spt */
    gfn_t *gfns;
    bool unsync;
    int root_count;          /* Currently serving as active root */
    unsigned int unsync_children;
    unsigned long parent_ptes;  // reverse map of the upper-level page-table entries that point to this page-table page
    unsigned long mmu_valid_gen; // generation number of this page; compared against kvm->arch.mmu_valid_gen, and a smaller value means the page is invalid

    DECLARE_BITMAP(unsync_child_bitmap, 512);

#ifdef CONFIG_X86_32
    int clear_spte_count;
#endif
    int write_flooding_count;
};

 

union kvm_mmu_page_role {
    unsigned word;
    struct {
       unsigned glevels:4;
       unsigned level:4;                       // level of this page-table page
       unsigned quadrant:2;
       unsigned pad_for_nice_hex_output:6;
       unsigned direct:1;
       unsigned access:3;                      // access permissions
       unsigned invalid:1;                     // whether the page has been invalidated
       unsigned cr4_pge:1;                     // records the value of cr4.pae; 0 in direct mode
       unsigned nxe:1;                         // records the value of efer.nxe
    };
};

The allocation function is as follows:

static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
                         u64 *parent_pte, int direct)
{
    struct kvm_mmu_page *sp;

    sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
    // spt is the page frame itself, allocated with mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache)
    sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
    if (!direct) // gfn_t *gfns is only used in the shadow-page-table case
       sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
    set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

    // add this structure to the vcpu->kvm->arch.active_mmu_pages list
    list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
    sp->parent_ptes = 0;
    // page-table pages form a hierarchy, so link this one to its parent
    mmu_page_add_parent_pte(vcpu, sp, parent_pte);
    kvm_mod_used_mmu_pages(vcpu->kvm, +1);
    return sp;
}

 

mmu_page_add_parent_pte ==> pte_list_add(vcpu, parent_pte, &sp->parent_ptes)

When parent_pte is NULL, mmu_page_add_parent_pte returns immediately.

 

static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
           unsigned long *pte_list)
{
    struct pte_list_desc *desc;
    int i, count = 0;

    if (!*pte_list) { // first insertion: sp->parent_ptes simply stores parent_pte
       *pte_list = (unsigned long)spte;
    } else if (!(*pte_list & 1)) { // second insertion
       desc = mmu_alloc_pte_list_desc(vcpu);
       desc->sptes[0] = (u64 *)*pte_list;
       desc->sptes[1] = spte;
       *pte_list = (unsigned long)desc | 1;
       ++count;
    } else { // later insertions: the desc already exists, so no further allocation is needed
       desc = (struct pte_list_desc *)(*pte_list & ~1ul);
       while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {
           desc = desc->more;
           count += PTE_LIST_EXT;
       }
       // when there are too many sptes they are managed through the 'more' chain
       if (desc->sptes[PTE_LIST_EXT-1]) {
           desc->more = mmu_alloc_pte_list_desc(vcpu);
           desc = desc->more;
       }
       for (i = 0; desc->sptes[i]; ++i)
           ++count;
       desc->sptes[i] = spte;
    }
    return count;
}

Since an spte address is always a multiple of 8, its lowest bit is guaranteed to be 0, and this property is exploited (see the sketch after this list):

·        An unsigned long * is used to represent the pte_list;

·        If the pte_list is empty, nothing has been recorded yet, so the spte address is stored into it directly; this is the 0 -> 1 case above;

·        If the pte_list is not empty but its lowest bit is 0, the rmap already holds exactly one value; the pte_list is then changed to the address of a struct pte_list_desc and its lowest bit is set to 1, indicating that the value is no longer a plain spte address but points to a struct pte_list_desc; this is the 1 -> many case above;

·        If the pte_list is not empty and its lowest bit is 1, the struct pte_list_desc is reached through that address to obtain the additional sptes; this is the many -> many case above.
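A minimal user-space sketch of this low-bit tagging (assuming PTE_LIST_EXT == 3 as in this kernel version; the 'more' overflow chain is omitted, so it only shows the 0 -> 1 -> many transitions):

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define PTE_LIST_EXT 3

struct pte_list_desc {
    uint64_t *sptes[PTE_LIST_EXT];
    struct pte_list_desc *more;
};

/* Add one spte pointer to the rmap head, mimicking pte_list_add (no 'more' chain). */
static void pte_list_add_sketch(unsigned long *pte_list, uint64_t *spte)
{
    struct pte_list_desc *desc;
    int i;

    if (!*pte_list) {                          /* 0 -> 1: store the pointer directly */
        *pte_list = (unsigned long)spte;
    } else if (!(*pte_list & 1)) {             /* 1 -> many: switch to a desc, tag bit 0 */
        desc = calloc(1, sizeof(*desc));
        desc->sptes[0] = (uint64_t *)*pte_list;
        desc->sptes[1] = spte;
        *pte_list = (unsigned long)desc | 1;
    } else {                                   /* many -> many: append inside the desc */
        desc = (struct pte_list_desc *)(*pte_list & ~1ul);
        for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; i++)
            ;
        if (i < PTE_LIST_EXT)
            desc->sptes[i] = spte;             /* overflow to 'more' omitted here */
    }
}

int main(void)
{
    static uint64_t sptes[3];                  /* stand-ins for real spte slots */
    unsigned long rmap = 0;

    for (int i = 0; i < 3; i++) {
        pte_list_add_sketch(&rmap, &sptes[i]);
        printf("after %d add(s): rmap=%#lx (tag bit=%lu)\n", i + 1, rmap, rmap & 1);
    }
    return 0;
}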

The struct pte_list_desc structure is defined as follows:

arch/x86/kvm/mmu.c

struct pte_list_desc {
    u64 *sptes[PTE_LIST_EXT];
    struct pte_list_desc *more;
};

It is a node of a singly linked list: each node stores the addresses of 3 sptes plus a pointer to the next node. This reverse map is used in cases such as page reclaim or swap-out: if the host needs to swap a guest physical page out to disk, it has to find the sptes that map that page's gpa and mark them not-present. The traversal function for the reverse map is as follows:

static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
{
    struct pte_list_desc *desc;
    int i;

    if (!*pte_list)
       return;
    if (!(*pte_list & 1))
       return fn((u64 *)*pte_list);
    desc = (struct pte_list_desc *)(*pte_list & ~1ul);
    while (desc) {
       for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
           fn(desc->sptes[i]);
       desc = desc->more;
    }
}

 

The next subsection analyzes kvm_mmu_get_page, the caller of kvm_mmu_alloc_page.

 

(2) kvm_mmu_get_page flow analysis

  Function signature:

static struct kvm_mmu_page *kvm_mmu_get_page(
           struct kvm_vcpu *vcpu,
           gfn_t gfn,
           gva_t gaddr,
           unsigned level,   // level of the page-table page; here it can be 3, 2 or 1
           int direct,       // 1 for EPT
           unsigned access,  // access permissions of this page-table page
           u64 *parent_pte)  // address of the entry in the upper-level page-table page that points to this page

 

a)  role is initialized first; under EPT, vcpu->arch.mmu.base_role starts out as 0;

b)  for_each_gfn_sp is then called to look up a previously used kvm_mmu_page; the macro walks the hash_link bucket selected by the gfn:

#define for_each_gfn_sp(_kvm, _sp, _gfn)             \
hlist_for_each_entry(_sp,                 \
&(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
if ((_sp)->gfn != (_gfn)) {} else

c)  If one is found, mmu_page_add_parent_pte is called:

for_each_gfn_sp(vcpu->kvm, sp, gfn) {
...
if (is_obsolete_sp(vcpu->kvm, sp))
continue;
// set up the reverse map for parent_pte
mmu_page_add_parent_pte(vcpu, sp, parent_pte);
if (sp->unsync_children) { // number of sptes in this page-table page that are in the unsync state
kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
kvm_mmu_mark_parents_unsync(sp); // mark sp's parents as unsync
} else if (sp->unsync)
kvm_mmu_mark_parents_unsync(sp);
return sp;
}

d)  If sp has unsync children, KVM_REQ_MMU_SYNC is set and handled at the next vm-entry, and the page-table pages that point to sp are marked unsync.

static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
{
pte_list_walk(&sp->parent_ptes, mark_unsync);
}

e)  If no page-table page exists yet for this gfn, kvm_mmu_alloc_page is called:

sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct);
sp->gfn = gfn;
sp->role = role;
// add it to the kvm->arch.mmu_page_hash table via hash_link
hlist_add_head(&sp->hash_link,
&vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
init_shadow_page_table(sp); // sp->spt[i] = 0

 

f)  When sp->mmu_valid_gen != vcpu->kvm->arch.mmu_valid_gen, the sp is stale:

static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
  return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
}

kvm_mmu_invalidate_zap_all_pages increments kvm->arch.mmu_valid_gen, thereby invalidating all sps at once, as the sketch below illustrates.
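The mmu_valid_gen mechanism is just a generation counter: each page records the counter value when it is created, and bumping the counter makes every existing page obsolete at once without walking them. A tiny stand-alone sketch of the pattern (not kernel code):

#include <stdio.h>
#include <stdbool.h>

struct mmu_state { unsigned long valid_gen; };
struct mmu_page  { unsigned long gen; };

static void alloc_page(struct mmu_state *m, struct mmu_page *p)  { p->gen = m->valid_gen; }
static bool is_obsolete(struct mmu_state *m, struct mmu_page *p) { return p->gen != m->valid_gen; }
static void invalidate_all(struct mmu_state *m)                  { m->valid_gen++; } /* like kvm_mmu_invalidate_zap_all_pages */

int main(void)
{
    struct mmu_state m = { .valid_gen = 0 };
    struct mmu_page p;

    alloc_page(&m, &p);
    printf("obsolete before invalidation: %d\n", is_obsolete(&m, &p));
    invalidate_all(&m);
    printf("obsolete after invalidation:  %d\n", is_obsolete(&m, &p));
    return 0;
}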

 

(3) kvm_mmu_free_page

  When a page becomes invalid it is eventually reclaimed via kvm_mmu_free_page. The code itself is not analyzed here; we only look at when reclamation happens.

kvm_mmu_commit_zap_page ==> kvm_mmu_free_page

    list_for_each_entry_safe(sp, nsp, invalid_list, link) {

       WARN_ON(!sp->role.invalid || sp->root_count);

       kvm_mmu_free_page(sp);

    }

Pages are reclaimed from invalid_list.

prepare_zap_oldest_mmu_page puts sps onto the invalid list.

This path is reached in three cases:

(1) when the host needs to reclaim memory: mmu_shrink_scan

(2) when a guest OS memory region changes

(3) when the EPT root is loaded: mmu_alloc_direct_roots ==> make_mmu_pages_available

 

3.3.3 EPT VM-entry Handling

(1) KVM_REQ_MMU_RELOAD

kvm_mmu_invalidate_zap_all_pages ==> kvm_reload_remote_mmus ==> make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);

vcpu_enter_guest ==>

if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
           kvm_mmu_unload(vcpu); // calls mmu_free_roots

kvm_mmu_unload calls prepare_zap_oldest_mmu_page, putting all sps onto invalid_list.

 

(2) KVM_REQ_MMU_SYNC

kvm_mmu_get_page

    if (sp->unsync_children) {

           kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);

           kvm_mmu_mark_parents_unsync(sp);

       }

 

set_spte ==> mmu_need_write_protect ==> kvm_unsync_pages ==> __kvm_unsync_page ==> kvm_mmu_mark_parents_unsync

 

vcpu_enter_guest==>

       if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))

           kvm_mmu_sync_roots(vcpu); //call mmu_sync_roots

 

kvm_mmu_sync_roots ==> mmu_sync_roots ==> mmu_sync_children

    while (mmu_unsync_walk(parent, &pages)) { // gather the unsync descendant pages into 'pages'

       bool protected = false;

 

       for_each_sp(pages, sp, parents, i)

           protected |= rmap_write_protect(vcpu->kvm, sp->gfn);

 

       if (protected)

           kvm_flush_remote_tlbs(vcpu->kvm);

 

       for_each_sp(pages, sp, parents, i) {

           kvm_sync_page(vcpu, sp, &invalid_list); // sync the page

           mmu_pages_clear_parents(&parents);

       }

       kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);

       cond_resched_lock(&vcpu->kvm->mmu_lock);

       kvm_mmu_pages_init(parent, &parents, &pages);

    }

 

static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
              struct list_head *invalid_list, bool clear_unsync)
{
    if (sp->role.cr4_pae != !!is_pae(vcpu)) {
       kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
       return 1;
    }

    if (clear_unsync)
       kvm_unlink_unsync_page(vcpu->kvm, sp); // clears sp->unsync (sp->unsync = 0)
    // sync the page; in the EPT case sync_page = nonpaging_sync_page, which simply returns 1
    if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
       kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); // the page is then put onto invalid_list
       return 1;
    }

    kvm_mmu_flush_tlb(vcpu);
    return 0;
}

 

(3) KVM_REQ_TLB_FLUSH

kvm_mmu_flush_tlb ==> kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);

 

vcpu_enter_guest==>

    if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))

           kvm_x86_ops->tlb_flush(vcpu);

 

static void vmx_flush_tlb(struct kvm_vcpu *vcpu)

{

    vpid_sync_context(to_vmx(vcpu));

    if (enable_ept) {

       if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))

           return;

       ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));

    }

}

static inline void __invvpid(int ext, u16 vpid, gva_t gva)

{

    struct {

    u64 vpid : 16;

    u64 rsvd : 48;

    u64 gva;

    } operand = { vpid, 0, gva };

 

    asm volatile (__ex(ASM_VMX_INVVPID)

         /* CF==1 or ZF==1--> rc = -1 */

         "; ja 1f ; ud2 ;1:"

         : :"a"(&operand), "c"(ext) : "cc","memory");

}

static inline void __invept(int ext, u64 eptp, gpa_t gpa)

{

    struct {

       u64 eptp, gpa;

    } operand = {eptp, gpa};

 

    asm volatile (__ex(ASM_VMX_INVEPT)

           /* CF==1 or ZF==1 --> rc = -1 */

           "; ja 1f ; ud2 ; 1:\n"

           : : "a" (&operand), "c" (ext) :"cc", "memory");

}

 

3.3.4 EPT VM-exit Handling

(1) The CR3 register

static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
    unsigned long guest_cr3;
    u64 eptp;

    guest_cr3 = cr3;
    if (enable_ept) {
       eptp = construct_eptp(cr3);
       vmcs_write64(EPT_POINTER, eptp); // set the EPT pointer
       if (is_paging(vcpu) || is_guest_mode(vcpu)) // paging enabled, or nested kvm
           guest_cr3 = kvm_read_cr3(vcpu);
       else
           guest_cr3 = vcpu->kvm->arch.ept_identity_map_addr; // real mode
       ept_load_pdptrs(vcpu);
    }

    vmx_flush_tlb(vcpu);
    vmcs_writel(GUEST_CR3, guest_cr3);
}
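construct_eptp packs the root HPA together with EPTP control bits. A hedged sketch of that layout based on the Intel SDM description of the EPTP (field positions taken from the SDM, not copied from the kernel source): bits 2:0 give the memory type used for the EPT paging structures (6 = write-back), bits 5:3 the page-walk length minus 1, and bit 6 optionally enables accessed/dirty flags.

#include <stdio.h>
#include <stdint.h>

/* EPTP field encoding per the Intel SDM (illustration only). */
#define EPTP_MEMTYPE_WB   6ULL          /* bits 2:0 - EPT structures use write-back */
#define EPTP_WALK_LEN_4   (3ULL << 3)   /* bits 5:3 - page-walk length minus 1 (4 levels) */
#define EPTP_AD_ENABLE    (1ULL << 6)   /* bit 6   - enable EPT accessed/dirty flags */

static uint64_t make_eptp(uint64_t root_hpa, int ad_bits)
{
    uint64_t eptp = EPTP_MEMTYPE_WB | EPTP_WALK_LEN_4;
    if (ad_bits)
        eptp |= EPTP_AD_ENABLE;
    return eptp | (root_hpa & ~0xfffULL);   /* physical address of the EPT PML4 table */
}

int main(void)
{
    uint64_t root_hpa = 0x123456000ULL;     /* hypothetical arch.mmu.root_hpa */
    printf("EPTP = 0x%llx\n", (unsigned long long)make_eptp(root_hpa, 1));
    return 0;
}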

 

arch.mmu.root_hpa is allocated in mmu_alloc_direct_roots:

kvm_mmu_load ==> vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa);

 

(2) handle_ept_violation

  a. Bits 7 and 8 of EXIT_QUALIFICATION are decoded as follows (see the decoder sketch below):

    bit7=0, bit8=1: illegal combination.

    bit7=1, bit8=0: the EPT violation occurred while accessing a guest paging structure.

    bit7=1, bit8=1: the gpa comes from the translation of a guest linear address, i.e. the violation occurred on the gpa->hpa translation of the access itself.

    bit7=0, bit8=0: the gpa was not produced from a guest linear address (for example a PDPTE load caused by a mov to cr3 instruction).
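A small stand-alone decoder for the combinations above (bit meanings per the Intel SDM; this is an illustration, not kernel code):

#include <stdio.h>
#include <stdint.h>

/* Classify an EPT-violation EXIT_QUALIFICATION by bits 7 and 8. */
static const char *ept_violation_kind(uint64_t qual)
{
    int lin_valid = (qual >> 7) & 1;   /* bit 7: guest linear-address field is valid */
    int final_gpa = (qual >> 8) & 1;   /* bit 8: access was to the translated GPA    */

    if (!lin_valid && final_gpa)
        return "illegal combination";
    if (lin_valid && !final_gpa)
        return "violation while accessing a guest paging structure";
    if (lin_valid && final_gpa)
        return "violation on the GPA translated from the guest linear address";
    return "GPA not derived from a linear address (e.g. PDPTE load on mov to cr3)";
}

int main(void)
{
    uint64_t samples[] = { 0x181, 0x081, 0x001 };   /* hypothetical qualification values */
    for (unsigned i = 0; i < 3; i++)
        printf("qual=0x%03llx -> %s\n",
               (unsigned long long)samples[i], ept_violation_kind(samples[i]));
    return 0;
}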

 

b. /* Is it a write fault? */
    error_code = exit_qualification & (1U << 1);
    /* Is it a fetch fault? */
    error_code |= (exit_qualification & (1U << 2)) << 2;
    /* Is the EPT page table present? */
    error_code |= (exit_qualification >> 3) & 0x1;
    vcpu->arch.exit_qualification = exit_qualification;
    return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);

 

kvm_mmu_page_fault ==> vcpu->arch.mmu.page_fault ==> tdp_page_fault

 

{

    ......

    if (unlikely(error_code & PFERR_RSVD_MASK)) {

       r = handle_mmio_page_fault(vcpu, gpa, error_code, true);

 

       if (likely(r != RET_MMIO_PF_INVALID))

           return r;

    }

 

    r = mmu_topup_memory_caches(vcpu);

    if (r)

       return r;

 

    force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);

    if (likely(!force_pt_level)) {

       level = mapping_level(vcpu, gfn);

       gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);

    } else

       level = PT_PAGE_TABLE_LEVEL;

 

    if (fast_page_fault(vcpu, gpa, level, error_code))

       return 0;

 

    mmu_seq = vcpu->kvm->mmu_notifier_seq;

    smp_rmb();

 

    if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write,&map_writable))

       return 0;

 

    if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))

       return r;

 

    spin_lock(&vcpu->kvm->mmu_lock);

    if (mmu_notifier_retry(vcpu->kvm, mmu_seq))

       goto out_unlock;

    make_mmu_pages_available(vcpu);

    if (likely(!force_pt_level))

       transparent_hugepage_adjust(vcpu, &gfn, &pfn,&level);

    r = __direct_map(vcpu, gpa, write, map_writable,

            level, gfn, pfn,prefault);

    spin_unlock(&vcpu->kvm->mmu_lock);

    ......

}

mapping_level determines the page-table level at which the gfn can be mapped:

static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)

{

    int host_level, level, max_level;

 

    host_level = host_mapping_level(vcpu->kvm, large_gfn);

 

    if (host_level == PT_PAGE_TABLE_LEVEL)

       return host_level;

 

    max_level = min(kvm_x86_ops->get_lpage_level(), host_level);

 

    for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)

       if (has_wrprotected_page(vcpu->kvm, large_gfn, level))

           break;

 

    return level - 1;

}
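mapping_level returns the largest level at which the gfn can be mapped (level 1 = 4 KB, level 2 = 2 MB, level 3 = 1 GB). The caller then aligns gfn down to that page size, as tdp_page_fault does above with KVM_PAGES_PER_HPAGE. A sketch of that alignment, assuming the usual 9 index bits per level:

#include <stdio.h>
#include <stdint.h>

/* Guest pages covered by one mapping at a given level: 1 = 4 KB, 2 = 2 MB, 3 = 1 GB. */
static uint64_t pages_per_hpage(int level)
{
    return 1ULL << ((level - 1) * 9);   /* 9 index bits per paging level */
}

int main(void)
{
    uint64_t gfn = 0x12345;             /* hypothetical faulting gfn */

    for (int level = 1; level <= 3; level++) {
        uint64_t aligned = gfn & ~(pages_per_hpage(level) - 1);
        printf("level %d: gfn 0x%llx aligns to 0x%llx (%llu gfns per mapping)\n",
               level, (unsigned long long)gfn, (unsigned long long)aligned,
               (unsigned long long)pages_per_hpage(level));
    }
    return 0;
}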

 

try_async_pf obtains the hpa (pfn) corresponding to the gpa.

 

static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
           int map_writable, int level, gfn_t gfn, pfn_t pfn,
           bool prefault)
{
   .......
    // walk the shadow (EPT) page-table entries that translate this guest frame number
    for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
       if (iterator.level == level) { // reached the target level: fill in the final page-table entry
           mmu_set_spte(vcpu, iterator.sptep, ACC_ALL,
                   write, &emulate, level, gfn, pfn,
                   prefault, map_writable);
           direct_pte_prefetch(vcpu, iterator.sptep);
           ++vcpu->stat.pf_fixed;
           break;
       }

       drop_large_spte(vcpu, iterator.sptep);

       if (!is_shadow_present_pte(*iterator.sptep)) { // the intermediate entry is empty, so a page-table page must be created
           u64 base_addr = iterator.addr;

           base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
           pseudo_gfn = base_addr >> PAGE_SHIFT;
           sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
                       iterator.level - 1,
                        1, ACC_ALL, iterator.sptep); // allocate a new page-table page

           link_shadow_page(iterator.sptep, sp, true);
       }
    }
    return emulate;
}

__direct_map works from the gpa passed in: starting at the level-4 page-table page it fills in the corresponding entry level by level, all driven by the for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) macro. The two cases in the loop are:

In the first case, if the current page-table level (iterator.level) is the final level (level), the entry is written directly by calling mmu_set_spte (discussed in detail later).

In the second case, if the current page-table page A is not at the final level but at an intermediate one (level-4, level-3, level-2) and its entry has not been initialized (!is_shadow_present_pte(*iterator.sptep)), kvm_mmu_get_page is called to find or create a page-table page B, which is then linked into the corresponding entry of page A via link_shadow_page.

For the higher-level page-table pages it is enough to call link_shadow_page and write the entry value and permission bits; but for the last-level entry, besides writing the entry value, one more step is needed, rmap_add (arch/x86/kvm/mmu.c):

static void mmu_set_spte(...)

{

  ...

  if (set_spte(vcpu, sptep, pte_access, level,gfn, pfn, speculative,

        true, host_writable)) {

    ...

  }

  ...

  if (is_shadow_present_pte(*sptep)) {

    if (!was_rmapped) {

      rmap_count = rmap_add(vcpu, sptep, gfn);

      ...

    }

  }

 

  ...

}

 

static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)

{

  ...

  sp = page_header(__pa(spte));

  kvm_mmu_page_set_gfn(sp, spte - sp->spt,gfn);

  rmapp = gfn_to_rmap(vcpu->kvm, gfn,sp->role.level);

  return pte_list_add(vcpu, spte, rmapp);

}

 

(3) EPT traversal

#define for_each_shadow_entry(_vcpu, _addr, _walker)   \
    for (shadow_walk_init(&(_walker), _vcpu, _addr); \
         shadow_walk_okay(&(_walker));        \
        shadow_walk_next(&(_walker)))

 

struct kvm_shadow_walk_iterator {

    u64 addr;

    hpa_t shadow_addr;

    u64 *sptep;

    int level;

    unsigned index;

};

 

At the start of the walk (shadow_walk_init):

    iterator->addr = addr;
    iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
    iterator->level = vcpu->arch.mmu.shadow_root_level;

Fetching the current entry (shadow_walk_okay):

    iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
    iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;

Advancing to the next entry (__shadow_walk_next):

    iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
    --iterator->level;
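The per-level index that shadow_walk_okay extracts with SHADOW_PT_INDEX is just 9 bits of the address selected by the level. A stand-alone sketch of that arithmetic for a 4-level table (the 12-bit page shift and 9 index bits per level are the standard x86-64 values, assumed here rather than copied from the kernel headers):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12
#define LEVEL_BITS 9                              /* 512 entries per page-table page */

/* Index into the page-table page at 'level' for guest-physical address 'addr'. */
static unsigned pt_index(uint64_t addr, int level)
{
    return (addr >> (PAGE_SHIFT + (level - 1) * LEVEL_BITS)) & ((1u << LEVEL_BITS) - 1);
}

int main(void)
{
    uint64_t gpa = 0x0000123456789000ULL;         /* hypothetical guest-physical address */

    for (int level = 4; level >= 1; level--)      /* same order as for_each_shadow_entry */
        printf("level %d: index %u\n", level, pt_index(gpa, level));
    return 0;
}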

 

3.3.5 Sync page

(1) mmu_sync_roots

  Starting from root_hpa, mmu_sync_children is called.

  mmu_unsync_walk(struct kvm_mmu_page *sp, struct kvm_mmu_pages *pvec)

a)  mmu_unsync_walk collects all unsync child sps and adds them to pvec

b)  for_each_sp(pages, sp, parents, i) // for each page in pvec (not including pages used as page directories)

protected |= rmap_write_protect(vcpu->kvm, sp->gfn); // determine whether the page or one of its page dirs is write-protected

c)  if (protected)

kvm_flush_remote_tlbs(vcpu->kvm);

d)  for_each_sp(pages, sp, parents, i) {

kvm_sync_page(vcpu, sp, &invalid_list); // sync the page

mmu_pages_clear_parents(&parents); // walk upward, decrementing unsync_children counts

}

e)  kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); // free the pages on invalid_list

 

(2) kvm_sync_page ==> __kvm_sync_page with clear_unsync set to true

static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
              struct list_head *invalid_list, bool clear_unsync)

 

a. if (clear_unsync)

       kvm_unlink_unsync_page(vcpu->kvm, sp); //sp->unsync = 0;

b. if (vcpu->arch.mmu.sync_page(vcpu, sp)) {

       kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);

 

(3) kvm_mmu_prepare_zap_page

a)  ret = mmu_zap_unsync_children(kvm, sp, invalid_list);

Via mmu_unsync_walk and for_each_sp, the following is done for every child sp:

kvm_mmu_prepare_zap_page; mmu_pages_clear_parents

b)  kvm_mmu_page_unlink_children

For every page that sp points to: mmu_page_zap_pte ==> drop_parent_pte

c)  kvm_mmu_unlink_parents:

if (sp->unsync)

kvm_unlink_unsync_page(kvm, sp); // sp->unsync = 0

d)  If the root has been freed, the page is moved onto invalid_list;

otherwise it is moved onto kvm->arch.active_mmu_pages.

drop_parent_pte is applied to every page-table entry that points to sp.

 

(4) When unsync is set

__direct_map ==> mmu_set_spte ==> set_spte ==> mmu_need_write_protect ==> kvm_unsync_pages ==> __kvm_unsync_page ==> kvm_mmu_mark_parents_unsync

 
