CPU虚拟化的整个流程是怎样的?
qemu通过ioctl与kvm通信。
kvm与qemu的配合流程如下(配合流程图与qemu运行流程伪代码在本摘录中省略):
vcpu创建流程
vcpu整体创建流程如下:
kvm_vm_ioctl()-->kvm_ioctl_create_vcpu()-->kvm_arch_vcpu_create(kvm, id)
--> kvm_x86_ops->vcpu_create(kvm, id) == vmx_create_vcpu() (intel)
== svm_create_vcpu() (amd)
qemu想要创建vcpu时,就会通过ioctl向KVM传递KVM_CREATE_VCPU
因此,vcpu的创建入口应该看kvm中引用了KVM_CREATE_VCPU的语句,可以找到,该语句在kvm_vm_ioctl()函数中。
kvm_vm_ioctl()
/*
 * Top-level ioctl dispatcher for a VM file descriptor.
 * (Excerpt: only the KVM_CREATE_VCPU case is shown here; the rest of
 * the switch and the function body are omitted in this document.)
 */
static long kvm_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
struct kvm *kvm = filp->private_data;
void __user *argp = (void __user *)arg;
int r;
/* Reject ioctls issued from an mm other than the one that owns this VM. */
if (kvm->mm != current->mm)
return -EIO;
switch (ioctl) {
case KVM_CREATE_VCPU:
/* arg carries the requested vcpu id straight through. */
r = kvm_vm_ioctl_create_vcpu(kvm, arg);
break;
当qemu通过ioctl向kvm传递KVM_CREATE_VCPU后,程序的控制权就转到kvm中的kvm_vm_ioctl()函数,并调用kvm_vm_ioctl_create_vcpu(kvm, arg);
kvm_ioctl_create_vcpu()
/*
 * Creates some virtual cpus. Good luck creating more than one.
 *
 * Handler for KVM_CREATE_VCPU: builds an arch-specific vcpu, publishes
 * it in kvm->vcpus[], and hands userspace a new vcpu file descriptor.
 * Returns the fd on success or a negative errno on failure.
 */
static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
{
int r;
struct kvm_vcpu *vcpu, *v;
/* Requested id must be within the supported range. */
if (id >= KVM_MAX_VCPU_ID)
return -EINVAL;
vcpu = kvm_arch_vcpu_create(kvm, id);//dispatches to the arch-specific create routine for this architecture
if (IS_ERR(vcpu))
return PTR_ERR(vcpu);
preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
r = kvm_arch_vcpu_setup(vcpu);
if (r)
goto vcpu_destroy;
r = kvm_create_vcpu_debugfs(vcpu);
if (r)
goto vcpu_destroy;
/* Everything below touches VM-wide vcpu bookkeeping: hold kvm->lock. */
mutex_lock(&kvm->lock);
if (!kvm_vcpu_compatible(vcpu)) {
r = -EINVAL;
goto unlock_vcpu_destroy;
}
/* Enforce the per-VM vcpu count limit. */
if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) {
r = -EINVAL;
goto unlock_vcpu_destroy;
}
/* The id must not already be taken by an existing vcpu. */
kvm_for_each_vcpu(r, v, kvm)
if (v->vcpu_id == id) {
r = -EEXIST;
goto unlock_vcpu_destroy;
}
BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]);
/* Now it's all set up, let userspace reach it */
/* Take a VM reference before exposing the vcpu through an fd. */
kvm_get_kvm(kvm);
r = create_vcpu_fd(vcpu);
if (r < 0) {
kvm_put_kvm(kvm);
goto unlock_vcpu_destroy;
}
kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu;
/*
 * Pairs with smp_rmb() in kvm_get_vcpu. Write kvm->vcpus
 * before kvm->online_vcpu's incremented value.
 */
smp_wmb();
atomic_inc(&kvm->online_vcpus);
mutex_unlock(&kvm->lock);
kvm_arch_vcpu_postcreate(vcpu);
/* r still holds the fd returned by create_vcpu_fd(). */
return r;
/* Error unwinding: release the lock first, then tear down the vcpu. */
unlock_vcpu_destroy:
mutex_unlock(&kvm->lock);
debugfs_remove_recursive(vcpu->debugfs_dentry);
vcpu_destroy:
kvm_arch_vcpu_destroy(vcpu);
return r;
}
其中最重要的是vcpu = kvm_arch_vcpu_create(kvm, id);
kvm_arch_vcpu_create(kvm, id)
正如上面代码中的注释所说,该函数的作用是根据不同的arch(体系结构)选择相应的create函数创建vcpu。
/*
 * x86 entry point for vcpu creation: warns (once) when an SMP guest is
 * created on a host with an unstable TSC, then delegates allocation to
 * the vendor-specific implementation behind kvm_x86_ops->vcpu_create
 * (vmx_create_vcpu on Intel, svm_create_vcpu on AMD).
 */
struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
unsigned int id)
{
/* Only relevant once a second vcpu appears, i.e. an SMP guest. */
if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
printk_once(KERN_WARNING
"kvm: SMP vm created on host with unstable TSC; "
"guest TSC will not be reliable\n");

return kvm_x86_ops->vcpu_create(kvm, id);
}
host kernel的kvm包含两部分代码:virt/kvm/和arch/x86/kvm。
不同的硬件虚拟化对应不同的内核模块(module):Intel硬件虚拟化对应kvm-intel.ko,包含vmx.c和pmu_intel.c;AMD硬件虚拟化对应kvm-amd.ko,包含svm.c和pmu_amd.c。
根据体系结构的不同,vcpu_create()也有两种定义,分别是svm_create_vcpu()或者vmx_create_vcpu()
以下主要介绍vmx_create_vcpu()。
vmx_create_vcpu()
/*
 * Intel (VMX) implementation of kvm_x86_ops->vcpu_create.
 *
 * Allocates a struct vcpu_vmx (which embeds the generic kvm_vcpu) and
 * everything it needs: a VPID, the optional PML page, the guest MSR
 * save area and the VMCS; then loads the vcpu once on this CPU to
 * program the VMCS via vmx_vcpu_setup().  On any failure, the goto
 * labels at the bottom unwind exactly what was set up before the
 * failing step, and an ERR_PTR is returned.
 */
static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
{
int err;
struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
int cpu;
if (!vmx)
return ERR_PTR(-ENOMEM);
/* Allocate this vcpu's Virtual-Processor ID. */
vmx->vpid = allocate_vpid();
/* Generic (arch-independent) vcpu initialization. */
err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
if (err)
goto free_vcpu;
/* Default errno for the allocation failures below. */
err = -ENOMEM;
/*
 * If PML is turned on, failure on enabling PML just results in failure
 * of creating the vcpu, therefore we can simplify PML logic (by
 * avoiding dealing with cases, such as enabling PML partially on vcpus
 * for the guest, etc.
 *
 * Page-Modification Logging (PML) background: introduced with Intel
 * VT in 2015, PML lets the VMM use EPT to monitor which guest
 * physical pages a VM modifies while it runs.  Before PML, the VMM
 * had to mark EPT entries not-present or read-only to observe guest
 * page writes, which triggered many costly EPT violations.  PML
 * builds on CPU support for the accessed/dirty bits in EPT: when PML
 * is enabled, each write that sets an EPT dirty bit also appends an
 * in-memory record of the written guest-physical address, and a VM
 * exit is raised when the log fills, letting the VMM harvest the set
 * of modified pages.
 */
if (enable_pml) {
vmx->pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (!vmx->pml_pg)
goto uninit_vcpu;
}
/* MSRs (Model Specific Registers) are x86 registers used to control
 * CPU operation, feature enables, debugging, execution tracing, and
 * performance monitoring.  One page holds the guest's save area.
 */
vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
/* Compile-time check: the MSR table must fit in the single page above. */
BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0])
> PAGE_SIZE);
if (!vmx->guest_msrs)
goto free_msrs;
/* Allocate and initialize the VMCS (vmcs01: the non-nested VMCS). */
vmx->loaded_vmcs = &vmx->vmcs01;
vmx->loaded_vmcs->vmcs = alloc_vmcs();
vmx->loaded_vmcs->shadow_vmcs = NULL;
if (!vmx->loaded_vmcs->vmcs)
goto free_msrs;
/* Temporarily enter VMX operation on this CPU if not held exclusively,
 * so the VMCS can be initialized, then leave it again. */
if (!vmm_exclusive)
kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id())));
loaded_vmcs_init(vmx->loaded_vmcs);
if (!vmm_exclusive)
kvm_cpu_vmxoff();
/* Pin to the current CPU, load the vcpu there, and program the VMCS
 * fields via vmx_vcpu_setup(); then unload and unpin. */
cpu = get_cpu();
vmx_vcpu_load(&vmx->vcpu, cpu);
vmx->vcpu.cpu = cpu;
err = vmx_vcpu_setup(vmx);
vmx_vcpu_put(&vmx->vcpu);
put_cpu();
if (err)
goto free_vmcs;
if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
err = alloc_apic_access_page(kvm);
if (err)
goto free_vmcs;
}
/* With EPT enabled, make sure the VM has an identity-map address set
 * and build the real-mode identity-mapped page table. */
if (enable_ept) {
if (!kvm->arch.ept_identity_map_addr)
kvm->arch.ept_identity_map_addr =
VMX_EPT_IDENTITY_PAGETABLE_ADDR;
err = init_rmode_identity_map(kvm);
if (err)
goto free_vmcs;
}
/* Nested virtualization: expose VMX capability MSRs and allocate a
 * second VPID for the nested guest. */
if (nested) {
nested_vmx_setup_ctls_msrs(vmx);
vmx->nested.vpid02 = allocate_vpid();
}
vmx->nested.posted_intr_nv = -1;
vmx->nested.current_vmptr = -1ull;
vmx->nested.current_vmcs12 = NULL;
vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED;
return &vmx->vcpu;
/* Error unwinding: each label frees what was acquired before the
 * failure point, in reverse order of acquisition. */
free_vmcs:
free_vpid(vmx->nested.vpid02);
free_loaded_vmcs(vmx->loaded_vmcs);
free_msrs:
kfree(vmx->guest_msrs);
free_pml:
vmx_destroy_pml_buffer(vmx);
uninit_vcpu:
kvm_vcpu_uninit(&vmx->vcpu);
free_vcpu:
free_vpid(vmx->vpid);
kmem_cache_free(kvm_vcpu_cache, vmx);
return ERR_PTR(err);
}
创建VCPU实际上是创建VCPU描述符。分配相应大小的内存,并进一步初始化。
- 分配VCPU标识
vmx->vpid = allocate_vpid();
- 初始化虚拟寄存器组:主要指初始化VMCS相关域。
- 初始化VCPU状态信息
- 初始化额外部件:将未被VMCS包含的虚拟寄存器初始化为物理CPU上电后的值,并配置LAPIC等部件。
- 初始化其他信息:根据VMM的实现初始化VCPU的私有数据。
svm_create_vcpu()或者vmx_create_vcpu()两个函数都调用了kvm_vcpu_init(),只是参数不同。