KVM建立虚拟机页表过程

一:概述

虚拟机页表建立的核心数据结构 :kvm_memory_slot,kvm_mmu_page

KVM把虚拟机中一段一段的内存叫做slot,由kvm_memory_slot表示,里面包含虚拟机物理页编号base_gfn,该slot总共的page数量npages,host端用户态地址userspace_addr,虚拟机到物理机的地址转换最关键的地方就是这里。查找某个地址对应的物理页时,首先获取gfn,根据偏移找到host的虚拟地址,再找到VMA,再填充page,然后将映射关系写到kvm_mmu_page页表信息中,再设置pte。再次寻址由EPT完成。

kvm_mmu_page中包含页表的层级role.level ,spt页表。所有的页表连接在link链表中。

二:数据结构和关系详解

struct kvm_memory_slot {
        gfn_t base_gfn; // first guest frame number (gfn) covered by this slot
        unsigned long npages; // total number of pages in the slot
        unsigned long *dirty_bitmap;
        struct kvm_arch_memory_slot arch; // arch-specific data; holds the reverse-map (rmap) info
        unsigned long userspace_addr; // host userspace virtual address where the slot starts (the address QEMU uses)
        u32 flags;
        short id;
};
/*
 * One shadow/TDP page-table page tracked by KVM's MMU.
 * Fix: the quoted definition was missing the terminating ';' after the
 * closing brace, which is a C syntax error.
 */
struct kvm_mmu_page {
    struct list_head link; // links every page-table page into one per-VM list
    struct hlist_node hash_link;
    struct list_head lpage_disallowed_link;
    gfn_t gfn; // guest frame number (guest physical address >> PAGE_SHIFT) this page maps
    union kvm_mmu_page_role role; // attributes, including the table level (role.level)
    u64 *spt; // the page-table page itself (array of sptes)
    gfn_t *gfns;
    bool unsync;
    bool lpage_disallowed;
    int root_count;
    unsigned int unsync_children;
    struct kvm_rmap_head parent_ptes;
    unsigned long mmu_valid_gen;
    unsigned long unsync_child_bitmap[8];
    atomic_t write_flooding_count;
};

//数据样例:
struct kvm_mmu_page {
  link = {
    next = 0xffff8ed0ff5231f8,
    prev = 0xffff8ed11a768000
  },
  hash_link = {
    next = 0x0,
    pprev = 0xffffb4690b256d80
  },
  lpage_disallowed_link = {
    next = 0xffff8ed11a768020,
    prev = 0xffff8ed0ff523218
  },
  gfn = 4344832,
  role = {
    word = 1921,
    {
      level = 1, //一级页表,里面是具体的页
      cr4_pae = 0,
      quadrant = 0,
      direct = 1,// direct map
      access = 7,
      invalid = 0,
      nxe = 0,
      cr0_wp = 0,
      smep_andnot_wp = 0,
      smap_andnot_wp = 0,
      ad_disabled = 0,
      guest_mode = 0,
      smm = 0
    }
  },
  spt = 0xffff8ed0f9537000,//页表页
  gfns = 0x0,
  unsync = false,
  lpage_disallowed = true,
  root_count = 0,
  unsync_children = 0,
  parent_ptes = {
    val = 18446619627041077552
  },
  mmu_valid_gen = 48,
  unsync_child_bitmap = {0, 0, 0, 0, 0, 0, 0, 0},
  write_flooding_count = {
    counter = 0
  }
}

展示一个四级页表的读取过程,方便理解:

4级页表:
crash> rd 0xffff8ed12e4b3000 10
ffff8ed12e4b3000: 000000102e4b2107 0000000000000000 .!K………….
3级页表:
rd -p 000000102e4b2000 0x12
 102e4b2000: 0000000ff5936107 0000000000000000 .a…………..
 102e4b2010: 0000000ff593a107 0000000ff5939107 …………….
 102e4b2020: 0000000000000000 000000102e68f107 ……….h…..
 102e4b2030: 0000000000000000 0000000000000000 …………….
 102e4b2040: 0000000000000000 0000000000000000 …………….
 102e4b2050: 0000000000000000 0000000000000000 …………….
 102e4b2060: 0000000000000000 0000000000000000 …………….
 102e4b2070: 000000102e8a2107 000000102e4b1107 .!……..K…..
 102e4b2080: 000000102e4b0107 0000000000000000 ..K………….
2级页表:
 rd -p 0000000ff5936000 0x200
 ff5936000: 0000001015d20107 0000000000000000 …………….
 ff5936a80: 0000000f13200ff3 0000000000000000 .. ………….
 ff5936c60: 0000000000000000 0000000f12c00ff3 …………….
 ff5936c70: 0000000f12e00ff3 0000000f12800ff3 …………….
 ff5936c80: 0000000f12a00ff3 0000000f12400ff3 ……….@…..
 ff5936c90: 0000000f12600ff3 0000000f12000ff3 ...............
 ff5936ca0: 0000000f12200ff3 0000000f11c00ff3 .. .............
 ff5936cb0: 0000000f11e00ff3 0000000f11800ff3 ................
 ff5936cc0: 0000000f11a00ff3 0000000f11400ff3 ..........@.....
 ff5936cd0: 0000000f11600ff3 0000000f11000ff3 ..………….
 ff5936ce0: 0000000f11200ff3 0000000f10c00ff3 .. ………….
 ff5936cf0: 0000000f10e00ff3 0000000f10800ff3 …………….
 ff5936d00: 0000001011407107 000000102fcfc107 .q@……../….
 ff5936d10: 0000000fa0000ff3 000000101151f107 ……….Q…..
 ff5936d20: 0000000f9f800ff3 000000102fd1b107 ………../….
 ff5936d30: 0000001031350107 000000103063f107 ..51……c0….
 ff5936d40: 000000102d607107 0000001032918107 .q`-…….2….
 ff5936d50: 00000010117d2107 00000010289aa107 .!}……..(….
 ff5936d60: 0000000f9d000ff3 0000000f9ce00ff3 …………….
 ff5936d70: 000000102fe05107 000000102dab6107 .Q./…..a.-….
 ff5936d80: 0000000ff0a1a107 000000101a4ec107 ……….N…..
1级页表:
 rd -p 0000001015d20000 0x200
 1015d20400: 0000000fa2080f77 0000000fa2081f77 w…….w…….
 1015d20410: 0000000fa2082f77 0000000fa2083f77 w/……w?……
 1015d20480: 0000000fa2090f77 0000000fa2091f77 w…….w…….
 1015d20490: 0000000fa2092f77 0000000fa2093f77 w/……w?……
 1015d204a0: 0000000fa2094f77 0000000fa2095f77 wO……w_……
 1015d204b0: 0000000fa2096f77 0000000fa2097f77 wo……w…….
 1015d204c0: 0000000fa2098f77 0000000000000000 w……………
读取第一个page的内容:
 fa2080f77: 34362e343632383a 362e34362d33352e :8264.64.53-64.6
 fa2080f87: 2f34362c32342e34 373932383a2d3335 4.42,64/53-:8297
 fa2080f97: 362d33352e343631 3137392b30322e34 164.53-64.20+971
 fa2080fa7: 34362d33352e3436 362e34362d33352e 64.53-64.53-64.6
 fa2080fb7: 2b30322d33352e34 383a2e34362e3436 4.53-20+64.64.:8
 fa2080fc7: 382e343630363732 353b3d2e34363036 276064.86064.=;5
 fa2080fd7: 34362e34362d3335 362c32332e34362e 53-64.64.64.32,6
 fa2080fe7: 2e3436343a3c2e34 35372c32342b3133 4.<:464.31+42,75
 fa2080ff7: 3c2e33352e34362f 36303638353a3c3a /64.53.<:<:58606
 fa2081007: 2a2f312c32342e34 31332d33352e3436 4.42,1/64.53-31
 fa2081017: 332e34363036382c 2e34362f35372c31 ,86064.31,75/64.
 fa2081027: 3638343a3c2e3436 392f35372e343630 64.<:486064.75/9
 fa2081037: 2d33352e34363137 32342e34362e3436 7164.53-64.64.42
 fa2081047: 333036382e34362c 2e34362e34362b31 ,64.86031+64.64.
 fa2081057: 34362f34362c3133 352c32342e34362e 31,64/64.64.42,5
 fa2081067: 32383a2e34362d33 32332e3436313739 3-64.:8297164.32
 fa2081077: 342e34362f35372c 2d33352d32342c32 ,75/64.42,42-53-
 fa2081087: 383a2e33352d3234 382e34362d333532 42-53.:8253-64.8
 fa2081097: 2e34362e34363036 34362e34362e3436 6064.64.64.64.64
 fa20810a7: 32373d3f2b31332e 32383a2e34362a30 .31+?=72064.:82

三:代码详解

调用的关系如下所示:

vmx_handle_exit  //vcpu_run->vcpu_enter_guest,Vcpu运行退出后调用
   handle_exception //is_page_fault(intr_info)
     kvm_handle_page_fault 
       kvm_mmu_page_fault //处理MMIO和普通内存fault,vcpu->arch.mmu.page_fault
         tdp_page_fault //kvm处理内存异常核心函数  
            try_async_pf  
                __direct_map //完成映射
// NOTE(review): abridged excerpt of the kernel's tdp_page_fault() (arch/x86/kvm/mmu.c);
// local variable declarations and most error handling are omitted in this quote.
static int tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,bool prefault)
{
r = mmu_topup_memory_caches(vcpu);// make sure the MMU-related memory caches are topped up
level = mapping_level(vcpu, gfn, &force_pt_level);
// the fast path only handles faults related to write and tracking attributes
if (fast_page_fault(vcpu, gpa, level, error_code))
    return RET_PF_RETRY;

mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
// resolve the page fault (possibly asynchronously); core function, detailed below
if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
    return RET_PF_RETRY;

if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
    return r;
// once the host page exists, build the kvm_mmu_page page tables; subsequent
// accesses are translated by EPT in hardware (detailed below)
r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
         prefault, lpage_disallowed);
         
}

关键的两个函数是try_async_pf 和 __direct_map,一个用来产生page(产生页的意思是内核不会第一时间给虚拟机地址分配物理地址,真正要用时才会分配),一个用来生成页表(完成虚拟机物理地址到物理机物理地址的映射)。

1:try_async_pf

// NOTE(review): abridged excerpt — the tail and closing brace of try_async_pf()
// are omitted in this quote before __gfn_to_pfn_memslot() begins.
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
 gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write,
 bool *writable)
{
 struct kvm_memory_slot *slot;
 bool async;
/* Visibility is decided from the slot flags: gfn is checked against each
 * memslot's [base_gfn, base_gfn + npages) interval.
 */
if (is_guest_mode(vcpu) && !kvm_is_visible_gfn(vcpu->kvm, gfn)) {
    *pfn = KVM_PFN_NOSLOT;
    return false;
}
// gfn -> memslot: each memslot has base_gfn and npages; find the slot whose range holds gfn
slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
async = false;
// memslot + gfn -> host pfn (may request asynchronous handling via &async)
*pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
if (!async)
    return false; /* *pfn has correct page already */

kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
 bool atomic, bool *async, bool write_fault,
 bool *writable)
{
   /* formula converting a gfn into a host userspace address:
    * slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
    */
     unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
     // hva -> pfn: walk or fault in the host page
     return hva_to_pfn(addr, atomic, async, write_fault,
              writable);
}

hva_to_pfn是host端虚拟地址到物理地址的转换,有两种方式,hva_to_pfn_fast、hva_to_pfn_slow。

hva_to_pfn_fast       //快速的hva到pfn切换,有页返回true
   __get_user_pages_fast   //往下走是根据页表一步步往下读,如果有某级页表为空
     gup_pgd_range        //则直接返回,过程不会睡眠。突出快。有页表空或者没有也会
       gup_p4d_range      //进入到slow的路径。
         gup_pud_range 
           gup_pmd_range 
              gup_pte_range

hva_to_pfn_slow

//slow的流程可能会睡眠。fast不会
hva_to_pfn_slow 
  get_user_pages_unlocked
    __get_user_pages_locked
      __get_user_pages
        faultin_page  //没有page会通过page_fault流程进行处理。
        follow_page_mask //有page会一步步读取页表获取页
            follow_p4d_mask
               follow_pud_mask
                 follow_pmd_mask
                   follow_page_pte

我们对关键的函数进行查看:

// NOTE(review): abridged excerpt of __get_user_pages() (mm/gup.c); the loop
// condition, several locals and branches are omitted in this quote.
static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 unsigned long start, unsigned long nr_pages,
 unsigned int gup_flags, struct page **pages,
 struct vm_area_struct **vmas, int *nonblocking)
{
do {
    struct page *page;
    unsigned int foll_flags = gup_flags;
    unsigned int page_increm;

    /* first iteration or cross vma bound */
    if (!vma || start >= vma->vm_end) {
        vma = find_extend_vma(mm, start);
        }
retry:
    // walk the page tables level by level to reach the page
    page = follow_page_mask(vma, start, foll_flags, &page_mask);
 if (!page) {
   int ret; // no page found: allocate one through the page-fault path
   ret = faultin_page(tsk, vma, start, &foll_flags,nonblocking);
 if (pages) {
     pages[i] = page;
     flush_anon_page(vma, page, start);
     flush_dcache_page(page);
     page_mask = 0;
 }

page_fault是kernel响应用户态内存访问的关键,但不是本文重点,就不赘述。假设现在我们已经有了page,还剩下最后一步,将虚拟机里面的物理地址映射到该page,下次访问由硬件完成寻址,最后一步映射由__direct_map完成。

__direct_map

// NOTE(review): abridged excerpt of __direct_map() (arch/x86/kvm/mmu.c); in the
// full kernel source the mmu_set_spte() call happens inside the walk once the
// target level is reached.
static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
 int map_writable, int level, kvm_pfn_t pfn,
 bool prefault, bool lpage_disallowed)
{
 struct kvm_shadow_walk_iterator it;
 struct kvm_mmu_page *sp;
 int ret;
 gfn_t gfn = gpa >> PAGE_SHIFT;
 gfn_t base_gfn = gfn;
 
// walk the page tables starting from root_hpa (macro expansion shown below)
for_each_shadow_entry(vcpu, gpa, it) {
    if (!is_shadow_present_pte(*it.sptep)) {
        // pte not present: look up the shadow page (sp); reuse it if it
        // exists, otherwise create it
        sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
                      it.level - 1, true, ACC_ALL);
        // hook the sp into its parent pte
        link_shadow_page(vcpu, it.sptep, sp);
        if (lpage_disallowed)
            account_huge_nx_page(vcpu->kvm, sp);
    }
}
// connect the leaf sptep to the pfn; the low 12 bits carry the flag bits
ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL,
           write, level, base_gfn, pfn, prefault,
           map_writable);
direct_pte_prefetch(vcpu, it.sptep);
++vcpu->stat.pf_fixed;
return ret;
}

// Walk every shadow page-table entry on the translation path of _addr, root to leaf.
#define for_each_shadow_entry(_vcpu, _addr, _walker)            \
        for (shadow_walk_init(&(_walker), _vcpu, _addr);         \
             shadow_walk_okay(&(_walker));                        \
             shadow_walk_next(&(_walker)))           
// NOTE(review): the "#init/#okay/#next" sections below are pseudocode summaries
// of shadow_walk_init()/shadow_walk_okay()/shadow_walk_next(); not compilable C.
#init:
    iterator->addr = addr;
    iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
    iterator->level = vcpu->arch.mmu.shadow_root_level;
#okay:
    iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
    iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
#next:
    is_last_spte(spte, iterator->level)
        iterator->level = 0;
        return;
    iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
    --iterator->level;
// NOTE(review): abridged excerpt of mmu_set_spte() (arch/x86/kvm/mmu.c); locals
// such as flush/was_rmapped/ret/set_spte_ret are declared in the full source.
static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
 int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
 bool speculative, bool host_writable)
{
 // if an spte is already present, drop it first
if (is_shadow_present_pte(*sptep)) {
    /*
     * If we overwrite a PTE page pointer with a 2MB PMD, unlink
     * the parent of the now unreachable PTE.
     */
    if (level > PT_PAGE_TABLE_LEVEL &&
        !is_large_pte(*sptep)) {
        struct kvm_mmu_page *child;
        u64 pte = *sptep;

        child = page_header(pte & PT64_BASE_ADDR_MASK);
        drop_parent_pte(child, sptep);
        flush = true;
    } else if (pfn != spte_to_pfn(*sptep)) {
        pgprintk("hfn old %llx new %llx\n",
             spte_to_pfn(*sptep), pfn);
        drop_spte(vcpu->kvm, sptep);
        flush = true;
    } else
        was_rmapped = 1;
}
// build the spte from pfn << PAGE_SHIFT plus flag bits and store it into
// *sptep, completing the page-table entry
set_spte_ret = set_spte(vcpu, sptep, pte_access, level, gfn, pfn,
            speculative, true, host_writable);
if (set_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
    if (write_fault)
        ret = RET_PF_EMULATE;
    kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush)
    kvm_flush_remote_tlbs(vcpu->kvm);

if (unlikely(is_mmio_spte(*sptep)))
    ret = RET_PF_EMULATE;

if (!was_rmapped && is_large_pte(*sptep))
    ++vcpu->kvm->stat.lpages;
// fill in the arch reverse-map (rmap) for this gfn
if (is_shadow_present_pte(*sptep)) {
    if (!was_rmapped) {
        rmap_count = rmap_add(vcpu, sptep, gfn);
        if (rmap_count > RMAP_RECYCLE_THRESHOLD)
            rmap_recycle(vcpu, sptep, gfn);
    }
}

return ret;
}

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值