分析KVM,对我这种菜鸟确实难度太大。下面先简单地从虚拟机的创建和运行所调用的函数入手分析……
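首先声明一个 kvm_context_t 变量用以描述用户态虚拟机上下文信息,然后调用 kvm_init() 函数初始化虚拟机上下文信息;函数 kvm_create() 创建虚拟机实例,该函数通过 ioctl 系统调用创建虚拟机相关的内核数据结构,并把虚拟机文件描述符返回给用户态的 kvm_context_t 数据结构。需要注意的是,上面说的 kvm_init() 是用户态库中的函数;下面贴出的则是内核 KVM 模块里同名的 kvm_init()(参数为 opaque、vcpu_size、module),两者不是同一个函数: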
/*
 * kvm_init - set up the generic KVM layer, as shown in this listing.
 * @opaque:    passed through unchanged to kvm_arch_init().
 * @vcpu_size: object size used for the "kvm_vcpu" kmem cache.
 * @module:    stored as .owner of kvm_chardev_ops, kvm_vm_fops and
 *             kvm_vcpu_fops.
 *
 * Returns 0 on success. On failure, returns the error of the step that
 * failed after undoing the earlier steps through the out_* labels at the
 * bottom, which run in the reverse order of the setup steps.
 */
<span style="font-size:18px;">2587 int kvm_init(void *opaque, unsigned int vcpu_size,
2588 struct module *module)
2589 {
2590 int r;
2591 int cpu;
2592
2593 r = kvm_arch_init(opaque);
2594 if (r)
2595 goto out_fail;
2596
/* bad_page is a zero-filled page; its frame number is cached in bad_pfn. */
2597 bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2598
2599 if (bad_page == NULL) {
2600 r = -ENOMEM;
2601 goto out;
2602 }
2603
2604 bad_pfn = page_to_pfn(bad_page);
2605
2606 if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
2607 r = -ENOMEM;
2608 goto out_free_0;
2609 }
2610
2611 r = kvm_arch_hardware_setup();
2612 if (r < 0)
2613 goto out_free_0a;
2614
/*
 * Run kvm_arch_check_processor_compat on every online CPU. &r is handed
 * to the callback (presumably so it can store its verdict there); the
 * loop aborts as soon as r is negative.
 */
2615 for_each_online_cpu(cpu) {
2616 smp_call_function_single(cpu,
2617 kvm_arch_check_processor_compat,
2618 &r, 1);
2619 if (r < 0)
2620 goto out_free_1;
2621 }
2622
/* Call hardware_enable on each CPU, then hook the CPU and reboot notifiers. */
2623 on_each_cpu(hardware_enable, NULL, 1);
2624 r = register_cpu_notifier(&kvm_cpu_notifier);
2625 if (r)
2626 goto out_free_2;
2627 register_reboot_notifier(&kvm_reboot_notifier);
2628
2629 r = sysdev_class_register(&kvm_sysdev_class);
2630 if (r)
2631 goto out_free_3;
2632
2633 r = sysdev_register(&kvm_sysdev);
2634 if (r)
2635 goto out_free_4;
2636
2637 /* A kmem cache lets us meet the alignment requirements of fx_save. */
2638 kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
2639 __alignof__(struct kvm_vcpu),
2640 0, NULL);
2641 if (!kvm_vcpu_cache) {
2642 r = -ENOMEM;
2643 goto out_free_5;
2644 }
2645
/* Record the calling module as owner of the three file_operations tables. */
2646 kvm_chardev_ops.owner = module;
2647 kvm_vm_fops.owner = module;
2648 kvm_vcpu_fops.owner = module;
2649
/* Register the misc device kvm_dev (presumably what shows up as /dev/kvm). */
2650 r = misc_register(&kvm_dev);
2651 if (r) {
2652 printk(KERN_ERR "kvm: misc device register failed\n");
2653 goto out_free;
2654 }
2655
/* Scheduler callbacks later attached to each vcpu's preempt notifier. */
2656 kvm_preempt_ops.sched_in = kvm_sched_in;
2657 kvm_preempt_ops.sched_out = kvm_sched_out;
2658
2659 kvm_init_debug();
2660
2661 return 0;
2662
/*
 * Error unwinding: each label undoes one setup step and falls through to
 * the labels for the steps that preceded it.
 */
2663 out_free:
2664 kmem_cache_destroy(kvm_vcpu_cache);
2665 out_free_5:
2666 sysdev_unregister(&kvm_sysdev);
2667 out_free_4:
2668 sysdev_class_unregister(&kvm_sysdev_class);
2669 out_free_3:
2670 unregister_reboot_notifier(&kvm_reboot_notifier);
2671 unregister_cpu_notifier(&kvm_cpu_notifier);
2672 out_free_2:
2673 on_each_cpu(hardware_disable, NULL, 1);
2674 out_free_1:
2675 kvm_arch_hardware_unsetup();
2676 out_free_0a:
2677 free_cpumask_var(cpus_hardware_enabled);
2678 out_free_0:
2679 __free_page(bad_page);
2680 out:
2681 kvm_arch_exit();
2682 out_fail:
2683 return r;
2684 }</span>
下面稍微详细分析下面流程:
首先,用户态的 Qemu 代码调用 kvm_init 函数,kvm_init 通过 qemu_open("/dev/kvm") 检查内核驱动插入情况,通过 kvm_ioctl(s, KVM_GET_API_VERSION, 0) 获取 API 接口版本,最后调用 kvm_ioctl(s, KVM_CREATE_VM, 0) 创建 KVM 虚拟机,获取虚拟机句柄。
简单点说,就是在用户态调用了 kvm_init(),然后用户态的 Qemu 调用 kvm_ioctl(s, KVM_CREATE_VM, 0) 来获取 KVM 虚拟机接口。那我们还必须知道调用了这个函数之后会发生什么,也就是 KVM 是如何由这个调用展开、进而创建虚拟机的。
内核对应的入口代码在此:
/*
 * Create a VM and hand it back to the caller as a file descriptor.
 *
 * Returns the new fd on success. Returns PTR_ERR() of the kvm_create_vm()
 * result if that call failed, or the negative value from
 * anon_inode_getfd() if no fd could be created.
 */
<span style="font-size:18px;">static int kvm_dev_ioctl_create_vm(void)
2271 {
2272 int fd;
2273 struct kvm *kvm;
2274
2275 kvm = kvm_create_vm();
2276 if (IS_ERR(kvm))
2277 return PTR_ERR(kvm);
/* Wrap the VM in an anonymous inode named "kvm-vm" served by kvm_vm_fops. */
2278 fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, 0);
/*
 * No fd was created: release the VM with kvm_put_kvm() (presumably this
 * drops the initial users_count reference - confirm).
 */
2279 if (fd < 0)
2280 kvm_put_kvm(kvm);
2281
2282 return fd;
2283 }</span>
从上面可以看出,这里是通过 kvm_create_vm 来进一步调用的。找到 kvm_create_vm:
/*
 * Build a struct kvm for the calling process.
 *
 * Takes the structure from kvm_arch_create_vm(), then, depending on the
 * build configuration, attaches a zero-filled coalesced-MMIO ring page and
 * registers an MMU notifier on current->mm. Afterwards it initialises the
 * locks, the PIO/MMIO buses and users_count, and links the VM into the
 * global vm_list under kvm_lock.
 *
 * Returns the kvm pointer, or an ERR_PTR() value on failure.
 */
<span style="font-size:18px;">945 static struct kvm *kvm_create_vm(void)
946 {
947 struct kvm *kvm = kvm_arch_create_vm();
948 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
949 struct page *page;
950 #endif
951
/* Arch-level creation failed: pass its ERR_PTR straight back. */
952 if (IS_ERR(kvm))
953 goto out;
954 #ifdef CONFIG_HAVE_KVM_IRQCHIP
955 INIT_LIST_HEAD(&kvm->irq_routing);
956 INIT_HLIST_HEAD(&kvm->mask_notifier_list);
957 #endif
958
959 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
/* One zero-filled page backs the coalesced MMIO ring. */
960 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
961 if (!page) {
962 kfree(kvm);
963 return ERR_PTR(-ENOMEM);
964 }
965 kvm->coalesced_mmio_ring =
966 (struct kvm_coalesced_mmio_ring *)page_address(page);
967 #endif
968
969 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
970 {
971 int err;
972 kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
/* Register the notifier against the creating task's mm. */
973 err = mmu_notifier_register(&kvm->mmu_notifier, current->mm);
974 if (err) {
/* Undo the ring page (if configured) and the kvm structure itself. */
975 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
976 put_page(page);
977 #endif
978 kfree(kvm);
979 return ERR_PTR(err);
980 }
981 }
982 #endif
983
/* Remember the creator's mm and increment its mm_count. */
984 kvm->mm = current->mm;
985 atomic_inc(&kvm->mm->mm_count);
986 spin_lock_init(&kvm->mmu_lock);
987 spin_lock_init(&kvm->requests_lock);
988 kvm_io_bus_init(&kvm->pio_bus);
989 mutex_init(&kvm->lock);
990 kvm_io_bus_init(&kvm->mmio_bus);
991 init_rwsem(&kvm->slots_lock);
/* users_count starts at 1. */
992 atomic_set(&kvm->users_count, 1);
/* Add the VM to the global vm_list while holding kvm_lock. */
993 spin_lock(&kvm_lock);
994 list_add(&kvm->vm_list, &vm_list);
995 spin_unlock(&kvm_lock);
996 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
997 kvm_coalesced_mmio_init(kvm);
998 #endif
999 out:
1000 return kvm;
1001 }</span>
这里的 kvm_arch_create_vm() 用来创建并初始化与体系结构相关的 struct kvm 结构体。
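总结一下这个函数:从上面贴出的代码看,kvm_create_vm 主要做的是 struct kvm 的初始化(各种锁、pio/mmio 总线、引用计数 users_count、mmu notifier 等),并把它挂到全局的 vm_list 上;而启用硬件虚拟化特性(hardware_enable)在这一版代码里是在前面的 kvm_init 中通过 on_each_cpu() 完成的,并不在 kvm_create_vm 里。随后由 kvm_dev_ioctl_create_vm 通过 anon_inode_getfd() 把相应的句柄(文件描述符)返回给用户态。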
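创建完内核虚拟机数据结构后,再创建内核 pit 以及 mmio 等基本外设模拟设备,然后调用 kvm_create_vcpu() 函数来创建虚拟处理器。用户态的 kvm_create_vcpu() 通过 ioctl 进入内核后,对应的处理函数是 kvm_vm_ioctl_create_vcpu()。
下面看一下内核侧的 kvm_vm_ioctl_create_vcpu() 函数,以及它调用的 kvm_arch_vcpu_setup():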
/*
 * Create vcpu number n for the given VM and return a file descriptor for it.
 * @kvm: VM that receives the vcpu; the vcpu is stored in kvm->vcpus[n].
 * @n:   vcpu index, checked with valid_vcpu().
 *
 * Returns the value of create_vcpu_fd() on success. Returns -EINVAL if
 * valid_vcpu(n) fails, -EEXIST if slot n is already occupied, or the error
 * from the arch create/setup step.
 */
<span style="font-size:18px;">1726 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
1727 {
1728 int r;
1729 struct kvm_vcpu *vcpu;
1730
1731 if (!valid_vcpu(n))
1732 return -EINVAL;
1733
1734 vcpu = kvm_arch_vcpu_create(kvm, n);
1735 if (IS_ERR(vcpu))
1736 return PTR_ERR(vcpu);
1737
1738 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
1739
/*
 * NOTE(review): this early return skips kvm_arch_vcpu_destroy();
 * presumably kvm_arch_vcpu_setup() releases the vcpu itself when it
 * fails - confirm for every architecture.
 */
1740 r = kvm_arch_vcpu_setup(vcpu);
1741 if (r)
1742 return r;
1743
/* kvm->lock covers the check-and-store of the vcpus[n] slot. */
1744 mutex_lock(&kvm->lock);
1745 if (kvm->vcpus[n]) {
1746 r = -EEXIST;
1747 goto vcpu_destroy;
1748 }
1749 kvm->vcpus[n] = vcpu;
1750 mutex_unlock(&kvm->lock);
1751
1752 /* Now it's all set up, let userspace reach it */
/*
 * NOTE(review): if create_vcpu_fd() fails, the unlink path below shows no
 * counterpart to this kvm_get_kvm() - confirm the reference is not leaked.
 */
1753 kvm_get_kvm(kvm);
1754 r = create_vcpu_fd(vcpu);
1755 if (r < 0)
1756 goto unlink;
1757 return r;
1758
/* unlink: retake the lock and clear the slot that was filled above. */
1759 unlink:
1760 mutex_lock(&kvm->lock);
1761 kvm->vcpus[n] = NULL;
/* vcpu_destroy: reached with kvm->lock held on both paths; unlock, then destroy. */
1762 vcpu_destroy:
1763 mutex_unlock(&kvm->lock);
1764 kvm_arch_vcpu_destroy(vcpu);
1765 return r;
1766 }
1767 </span>
<span style="font-size:18px;">4365
/*
 * Arch-side vcpu setup: reset the vcpu and set up its MMU while it is loaded.
 *
 * Returns 0 unless the reset/MMU step produced a negative result; in that
 * case the vcpu is released through kvm_x86_ops->vcpu_free() and the
 * negative result is returned.
 */
4366 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
4367 {
4368 int r;
4369
4370 /* We do fxsave: this must be aligned. */
/* The low four address bits must be clear, i.e. 16-byte alignment. */
4371 BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
4372
4373 vcpu->arch.mtrr_state.have_fixed = 1;
/* Reset and MMU setup run between vcpu_load() and vcpu_put(). */
4374 vcpu_load(vcpu);
4375 r = kvm_arch_vcpu_reset(vcpu);
/* kvm_mmu_setup() only runs when the reset returned 0. */
4376 if (r == 0)
4377 r = kvm_mmu_setup(vcpu);
4378 vcpu_put(vcpu);
/* NOTE(review): only r < 0 counts as failure; a positive r is reported as 0. */
4379 if (r < 0)
4380 goto free_vcpu;
4381
4382 return 0;
4383 free_vcpu:
4384 kvm_x86_ops->vcpu_free(vcpu);
4385 return r;
4386 }</span>
继续流程分析:
该函数通过 ioctl() 系统调用,向 vm_fd 文件描述符所指向的虚拟文件发出创建虚拟处理器的请求,内核创建完成后把虚拟处理器的文件描述符返回给用户态程序,供以后调度使用;
好,CPU的初始化和创建暂时完成:下面是内存,即影子页表的初始化:
创建完虚拟处理器后,由用户态的QEMU程序申请客户机用户空间,用以加载和运行客户机代码;为了使得客户虚拟机正确执行,必须要在内核中为客户机建立正确的内存映射关系,即影子页表信息。因此,申请客户机内存地址空间后,调用函数kvm_create_phys_mem()创建客户机内存映射关系,该函数主要通过ioctl系统调用向vm_fd指向的虚拟文件调用设置内核数据结构中客户机内存域相关信息,主要建立影子页表信息;当创建好虚拟处理器和影子页表后,即可读取客户机到指定分配的空间中,然后调度虚拟处理器运行。
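kvm_create_phys_mem() 是用户态库中的函数,它通过对 vm_fd 的 ioctl(KVM_SET_USER_MEMORY_REGION)进入内核设置客户机内存域。(注意:此处贴出的代码与前面的 kvm_create_vm() 完全相同,并不是 kvm_create_phys_mem() 的实现,应为误贴,仅保留原样供对照。)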
/*
 * NOTE(review): this listing is identical to the kvm_create_vm() listing
 * shown earlier in the article; it is not kvm_create_phys_mem().
 *
 * Build a struct kvm for the calling process: take the structure from
 * kvm_arch_create_vm(), optionally attach a zero-filled coalesced-MMIO
 * ring page and an MMU notifier (both configuration dependent), initialise
 * locks, I/O buses and users_count, and link the VM into vm_list.
 *
 * Returns the kvm pointer, or an ERR_PTR() value on failure.
 */
945 static struct kvm *kvm_create_vm(void)
946 {
947 struct kvm *kvm = kvm_arch_create_vm();
948 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
949 struct page *page;
950 #endif
951
/* Arch-level creation failed: pass its ERR_PTR straight back. */
952 if (IS_ERR(kvm))
953 goto out;
954 #ifdef CONFIG_HAVE_KVM_IRQCHIP
955 INIT_LIST_HEAD(&kvm->irq_routing);
956 INIT_HLIST_HEAD(&kvm->mask_notifier_list);
957 #endif
958
959 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
/* One zero-filled page backs the coalesced MMIO ring. */
960 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
961 if (!page) {
962 kfree(kvm);
963 return ERR_PTR(-ENOMEM);
964 }
965 kvm->coalesced_mmio_ring =
966 (struct kvm_coalesced_mmio_ring *)page_address(page);
967 #endif
968
969 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
970 {
971 int err;
972 kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
/* Register the notifier against the creating task's mm. */
973 err = mmu_notifier_register(&kvm->mmu_notifier, current->mm);
974 if (err) {
/* Undo the ring page (if configured) and the kvm structure itself. */
975 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
976 put_page(page);
977 #endif
978 kfree(kvm);
979 return ERR_PTR(err);
980 }
981 }
982 #endif
983
/* Remember the creator's mm and increment its mm_count. */
984 kvm->mm = current->mm;
985 atomic_inc(&kvm->mm->mm_count);
986 spin_lock_init(&kvm->mmu_lock);
987 spin_lock_init(&kvm->requests_lock);
988 kvm_io_bus_init(&kvm->pio_bus);
989 mutex_init(&kvm->lock);
990 kvm_io_bus_init(&kvm->mmio_bus);
991 init_rwsem(&kvm->slots_lock);
/* users_count starts at 1. */
992 atomic_set(&kvm->users_count, 1);
/* Add the VM to the global vm_list while holding kvm_lock. */
993 spin_lock(&kvm_lock);
994 list_add(&kvm->vm_list, &vm_list);
995 spin_unlock(&kvm_lock);
996 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
997 kvm_coalesced_mmio_init(kvm);
998 #endif
999 out:
1000 return kvm;
1001 }
1002
内存创建之后,即可以运行虚拟机了。
用户态调度虚拟机的函数为 kvm_run();它通过 KVM_RUN ioctl 进入内核,x86 上对应的内核处理函数是 kvm_arch_vcpu_ioctl_run(),代码如下:
/*
 * Run a vcpu until __vcpu_run() returns or an earlier step bails out.
 * @vcpu:    vcpu to run; held between vcpu_load() and vcpu_put() for the
 *           whole call.
 * @kvm_run: run structure; cr8, mmio.data, exit_reason and hypercall.ret
 *           are read from it here.
 *
 * Returns the result of __vcpu_run(). Returns -EAGAIN if the vcpu is still
 * in KVM_MP_STATE_UNINITIALIZED after blocking, a non-zero complete_pio()
 * result, or 0 when the re-run MMIO emulation again asks for
 * EMULATE_DO_MMIO.
 */
3466 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3467 {
3468 int r;
3469 sigset_t sigsaved;
3470
3471 vcpu_load(vcpu);
3472
/* Install the vcpu's signal mask; the saved mask is restored at "out". */
3473 if (vcpu->sigset_active)
3474 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
3475
/* Uninitialized vcpu: block, clear KVM_REQ_UNHALT, and return -EAGAIN. */
3476 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
3477 kvm_vcpu_block(vcpu);
3478 clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
3479 r = -EAGAIN;
3480 goto out;
3481 }
3482
3483 /* re-sync apic's tpr */
3484 if (!irqchip_in_kernel(vcpu->kvm))
3485 kvm_set_cr8(vcpu, kvm_run->cr8);
3486
/* A port I/O operation is still pending (cur_count != 0): finish it first. */
3487 if (vcpu->arch.pio.cur_count) {
3488 r = complete_pio(vcpu);
3489 if (r)
3490 goto out;
3491 }
3492 #if CONFIG_HAS_IOMEM
/*
 * A pending MMIO access: copy 8 bytes from kvm_run->mmio.data, then re-run
 * the emulator with EMULTYPE_NO_DECODE while slots_lock is held for reading.
 */
3493 if (vcpu->mmio_needed) {
3494 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
3495 vcpu->mmio_read_completed = 1;
3496 vcpu->mmio_needed = 0;
3497
3498 down_read(&vcpu->kvm->slots_lock);
3499 r = emulate_instruction(vcpu, kvm_run,
3500 vcpu->arch.mmio_fault_cr2, 0,
3501 EMULTYPE_NO_DECODE);
3502 up_read(&vcpu->kvm->slots_lock);
3503 if (r == EMULATE_DO_MMIO) {
3504 /*
3505 * Read-modify-write. Back to userspace.
3506 */
3507 r = 0;
3508 goto out;
3509 }
3510 }
3511 #endif
/* exit_reason is KVM_EXIT_HYPERCALL: write hypercall.ret into guest RAX. */
3512 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
3513 kvm_register_write(vcpu, VCPU_REGS_RAX,
3514 kvm_run->hypercall.ret);
3515
3516 r = __vcpu_run(vcpu, kvm_run);
3517
/* Common exit: restore the saved signal mask and release the vcpu. */
3518 out:
3519 if (vcpu->sigset_active)
3520 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
3521
3522 vcpu_put(vcpu);
3523 return r;
3524 }
用户态的 kvm_run() 对虚拟处理器的文件描述符发起 ioctl 系统调用,由内核中的上述处理函数调度虚拟处理器执行:该系统调用先通过 vcpu_load() 将虚拟处理器 vcpu 的信息加载到物理处理器中,再通过 vm_entry 进入客户机执行。
后面就是陷入和捕获以及上下文切换了,后面分析。。。
在客户机正常运行期间 kvm_run() 函数不返回,只有发生以下两种情况时,函数才返回:1,发生了 I/O 事件,如客户机发出读写 I/O 的指令;2,产生了客户机和内核 KVM 都无法处理的异常。I/O 事件处理完毕后,通过重新调用 kvm_run() 函数继续调度客户机的执行。
大致流程就是如此,还得继续细细分析。