1、基本原理
KVM虚拟机通过字符设备/dev/kvm的ioctl接口创建和运行,相关原理见之前的文章说明。
虚拟机的运行通过/dev/kvm设备ioctl VCPU接口的KVM_RUN指令实现,在VM和VCPU创建好并完成初始化后,就可以调度该虚拟机运行了,通常,一个VCPU对应于一个线程,虚拟机运行的本质为调度该虚拟机相关的VCPU所在线程运行。虚拟机(VCPU)的运行主要任务是要进行上下文切换,上下文主要包括相关寄存器、APIC状态、TLB等,通常上下文切换的过程如下:
1、 保存当前的上下文。
2、 使用kvm_vcpu结构体中的上下文信息,加载到物理CPU中。
3、 执行kvm_x86_ops中的run_vcpu函数,调用硬件相关的指令(如VMLAUNCH),进入虚拟机运行环境中。
虚拟机运行于qemu-kvm的进程上下文中,从硬件的角度看,虚拟机的运行过程,实质为相关指令的执行过程,虚拟机编译后的也�%Bquot;
"mov %%" _ASM_SI ", %c[rsi](%0) \n\t"
"mov %%" _ASM_DI ", %c[rdi](%0) \n\t"
"mov %%" _ASM_BP ", %c[rbp](%0) \n\t"
#ifdef CONFIG_X86_64
"mov %%r8, %c[r8](%0) \n\t"
"mov %%r9, %c[r9](%0) \n\t"
"mov %%r10, %c[r10](%0) \n\t"
"mov %%r11, %c[r11](%0) \n\t"
"mov %%r12, %c[r12](%0) \n\t"
"mov %%r13, %c[r13](%0) \n\t"
"mov %%r14, %c[r14](%0) \n\t"
"mov %%r15, %c[r15](%0) \n\t"
#endif
"mov %%cr2, %%" _ASM_AX " \n\t"
"mov %%" _ASM_AX ", %c[cr2](%0) \n\t"
"pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t"
"setbe %c[fail](%0) \n\t"
".pushsection .rodata \n\t"
".global vmx_return \n\t"
"vmx_return: " _ASM_PTR " 2b \n\t"
".popsection"
:
:
"c"(vmx),
"d"((unsigned long)HOST_RSP),
[launched]"i"(offsetof(struct vcpu_vmx, __launched)),
[fail]"i"(offsetof(struct vcpu_vmx, fail)),
/*[host_rsp]是tag,可以在前面以%[host_rsp]方式引用*/
[host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
[rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
[rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
[rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
[rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
[rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
[rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
[rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
#ifdef CONFIG_X86_64
[r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
[r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
[r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
[r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
[r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
[r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
[r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
[r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
#endif
[cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
[wordsize]"i"(sizeof(ulong))
:
"cc",
"memory"/*clobber list,cc表示寄存器,memory表示内存*/
#ifdef CONFIG_X86_64
,
"rax",
"rbx",
"rdi",
"rsi"
,
"r8",
"r9",
"r10",
"r11",
"r12",
"r13",
"r14",
"r15"
#else
,
"eax",
"ebx",
"edi",
"esi"
#endif
);
// 运行到这里,说明已经发生了VM-exit,返回到了root模式
/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
if
(debugctlmsr)
update_debugctlmsr(debugctlmsr);
#ifndef CONFIG_X86_64
/*
* The sysexit path does not restore ds/es, so we must set them to
* a reasonable value ourselves.
*
* We can't defer this to vmx_load_host_state() since that function
* may be executed in interrupt context, which saves and restore segments
* around it, nullifying its effect.
*/
/*重新加载ds/es段寄存器,因为VM-exit不会自动加载他们*/
loadsegment(ds, __USER_DS);
loadsegment(es, __USER_DS);
#endif
vcpu->arch.regs_avail =
~((1 << VCPU_REGS_RIP)
|
(1 << VCPU_REGS_RSP)
|
(1 << VCPU_EXREG_RFLAGS)
|
(1 << VCPU_EXREG_CPL)
|
(1 << VCPU_EXREG_PDPTR)
|
(1 << VCPU_EXREG_SEGMENTS)
|
(1 << VCPU_EXREG_CR3));
vcpu->arch.regs_dirty = 0;
// 从硬件VMCS中读取中断向量表信息
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
vmx->loaded_vmcs->launched = 1;
// 从硬件VMCS中读取VM-exit原因信息,这些信息是VM-exit过程中由硬件自动写入的
vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX);
/*处理MCE异常和NMI中断*/
vmx_complete_atomic_exit(vmx);
vmx_recover_nmi_blocking(vmx);
vmx_complete_interrupts(vmx);
}
KVM虚拟机通过字符设备/dev/kvm的ioctl接口创建和运行,相关原理见之前的文章说明。
虚拟机的运行通过/dev/kvm设备ioctl VCPU接口的KVM_RUN指令实现,在VM和VCPU创建好并完成初始化后,就可以调度该虚拟机运行了,通常,一个VCPU对应于一个线程,虚拟机运行的本质为调度该虚拟机相关的VCPU所在线程运行。虚拟机(VCPU)的运行主要任务是要进行上下文切换,上下文主要包括相关寄存器、APIC状态、TLB等,通常上下文切换的过程如下:
1、 保存当前的上下文。
2、 使用kvm_vcpu结构体中的上下文信息,加载到物理CPU中。
3、 执行kvm_x86_ops中的run_vcpu函数,调用硬件相关的指令(如VMLAUNCH),进入虚拟机运行环境中。
虚拟机运行于qemu-kvm的进程上下文中,从硬件的角度看,虚拟机的运行过程,实质为相关指令的执行过程,虚拟机编译后的也�%Bquot;
"mov %%" _ASM_SI ", %c[rsi](%0) \n\t"
"mov %%" _ASM_DI ", %c[rdi](%0) \n\t"
"mov %%" _ASM_BP ", %c[rbp](%0) \n\t"
#ifdef CONFIG_X86_64
"mov %%r8, %c[r8](%0) \n\t"
"mov %%r9, %c[r9](%0) \n\t"
"mov %%r10, %c[r10](%0) \n\t"
"mov %%r11, %c[r11](%0) \n\t"
"mov %%r12, %c[r12](%0) \n\t"
"mov %%r13, %c[r13](%0) \n\t"
"mov %%r14, %c[r14](%0) \n\t"
"mov %%r15, %c[r15](%0) \n\t"
#endif
"mov %%cr2, %%" _ASM_AX " \n\t"
"mov %%" _ASM_AX ", %c[cr2](%0) \n\t"
"pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t"
"setbe %c[fail](%0) \n\t"
".pushsection .rodata \n\t"
".global vmx_return \n\t"
"vmx_return: " _ASM_PTR " 2b \n\t"
".popsection"
:
:
"c"(vmx),
"d"((unsigned long)HOST_RSP),
[launched]"i"(offsetof(struct vcpu_vmx, __launched)),
[fail]"i"(offsetof(struct vcpu_vmx, fail)),
/*[host_rsp]是tag,可以在前面以%[host_rsp]方式引用*/
[host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
[rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
[rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
[rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
[rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
[rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
[rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
[rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
#ifdef CONFIG_X86_64
[r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
[r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
[r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
[r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
[r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
[r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
[r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
[r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
#endif
[cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
[wordsize]"i"(sizeof(ulong))
:
"cc",
"memory"/*clobber list,cc表示寄存器,memory表示内存*/
#ifdef CONFIG_X86_64
,
"rax",
"rbx",
"rdi",
"rsi"
,
"r8",
"r9",
"r10",
"r11",
"r12",
"r13",
"r14",
"r15"
#else
,
"eax",
"ebx",
"edi",
"esi"
#endif
);
// 运行到这里,说明已经发生了VM-exit,返回到了root模式
/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
if
(debugctlmsr)
update_debugctlmsr(debugctlmsr);
#ifndef CONFIG_X86_64
/*
* The sysexit path does not restore ds/es, so we must set them to
* a reasonable value ourselves.
*
* We can't defer this to vmx_load_host_state() since that function
* may be executed in interrupt context, which saves and restore segments
* around it, nullifying its effect.
*/
/*重新加载ds/es段寄存器,因为VM-exit不会自动加载他们*/
loadsegment(ds, __USER_DS);
loadsegment(es, __USER_DS);
#endif
vcpu->arch.regs_avail =
~((1 << VCPU_REGS_RIP)
|
(1 << VCPU_REGS_RSP)
|
(1 << VCPU_EXREG_RFLAGS)
|
(1 << VCPU_EXREG_CPL)
|
(1 << VCPU_EXREG_PDPTR)
|
(1 << VCPU_EXREG_SEGMENTS)
|
(1 << VCPU_EXREG_CR3));
vcpu->arch.regs_dirty = 0;
// 从硬件VMCS中读取中断向量表信息
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
vmx->loaded_vmcs->launched = 1;
// 从硬件VMCS中读取VM-exit原因信息,这些信息是VM-exit过程中由硬件自动写入的
vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX);
/*处理MCE异常和NMI中断*/
vmx_complete_atomic_exit(vmx);
vmx_recover_nmi_blocking(vmx);
vmx_complete_interrupts(vmx);
}