2.5.1 CPU Exception 处理
(1) handle_exception
a) 处理machinecheck
if (is_machine_check(intr_info))
return handle_machine_check(vcpu);
b) 对于nmi类别在vmx_vcpu_run处理
vmx_complete_interrupts ==>__vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info, VM_EXIT_INSTRUCTION_LEN,IDT_VECTORING_ERROR_CODE);
c) #UD 异常
if (is_invalid_opcode(intr_info)) {
er = emulate_instruction(vcpu,EMULTYPE_TRAP_UD);
if (er != EMULATE_DONE)
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
d) #PF 处理
e) #DB 与 #BP 处理
f) 其它类
kvm_run->exit_reason =KVM_EXIT_EXCEPTION;
kvm_run->ex.exception = ex_no;
kvm_run->ex.error_code = error_code;
(2) kvm_multiple_exception
void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
kvm_multiple_exception(vcpu, nr, true, error_code, true);
}
void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
kvm_multiple_exception(vcpu, nr, true, error_code, false);
}
该函数的目的是设置
struct kvm_queued_exception {
bool pending;
bool has_error_code;
bool reinject;
u8 nr;
u32 error_code;
} exception;
在vcpu_enter_guest ==> inject_pending_event ==> kvm_x86_ops->queue_exception(vcpu,vcpu->arch.exception.nr, vcpu->arch.exception.has_error_code, vcpu->arch.exception.error_code,vcpu->arch.exception.reinject); ==> vmx_queue_exception
(3) vmx_queue_exception
if (has_error_code) {
vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
intr_info |= INTR_INFO_DELIVER_CODE_MASK;
}
if (vmx->rmode.vm86_active) {
int inc_eip = 0;
if (kvm_exception_is_soft(nr))
inc_eip = vcpu->arch.event_exit_inst_len;
if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) !=EMULATE_DONE)
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
return;
}
if (kvm_exception_is_soft(nr)) {
vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
vmx->vcpu.arch.event_exit_inst_len);
intr_info |= INTR_TYPE_SOFT_EXCEPTION;
} else
intr_info |= INTR_TYPE_HARD_EXCEPTION;
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
设置VM-Entry时的 VM_ENTRY_INTR_INFO_FIELD VM_ENTRY_EXCEPTION_ERROR_CODE
(4) __vmx_complete_interrupts
A. kvm_clear_exception_queue:
vcpu->arch.exception.pending = false; // 清kvm_multiple_exception中设置的标志位
B.
case INTR_TYPE_NMI_INTR:
vcpu->arch.nmi_injected = true;
vmx_set_nmi_mask(vcpu, false);
break;
case INTR_TYPE_SOFT_EXCEPTION:
vcpu->arch.event_exit_inst_len =vmcs_read32(instr_len_field);
/* fall through */
case INTR_TYPE_HARD_EXCEPTION:
if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK){
u32 err = vmcs_read32(error_code_field);
kvm_requeue_exception_e(vcpu, vector, err);
} else
kvm_requeue_exception(vcpu, vector);
break;
case INTR_TYPE_SOFT_INTR:
vcpu->arch.event_exit_inst_len =vmcs_read32(instr_len_field);
/* fall through */
case INTR_TYPE_EXT_INTR:
kvm_queue_interrupt(vcpu, vector, type ==INTR_TYPE_SOFT_INTR);
break;
(5) emulate_instruction ==> x86_emulate_instruction
a. init_emulate_ctxt(vcpu);
kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
ctxt->eflags = kvm_get_rflags(vcpu);
ctxt->eip = kvm_rip_read(vcpu);
ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
(ctxt->eflags& X86_EFLAGS_VM) ?X86EMUL_MODE_VM86 :
(cs_l &&is_long_mode(vcpu)) ? X86EMUL_MODE_PROT64:
cs_db ? X86EMUL_MODE_PROT32 :
X86EMUL_MODE_PROT16;
ctxt->guest_mode = is_guest_mode(vcpu);
init_decode_cache(ctxt);
vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
b. r =x86_decode_insn(ctxt, insn, insn_len);
ctxt->execute = opcode.u.execute
static struct {
void (*func) (atom_exec_context *, int *, int);
int arg;
} opcode_table[ATOM_OP_CNT]
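opcode_table 包含了指令与模拟执行函数的对应关系。(注意:上面这段带 atom_exec_context / ATOM_OP_CNT 的定义出自 radeon 驱动的 atom.c,并非 KVM 的代码;KVM 指令模拟器使用的是 arch/x86/kvm/emulate.c 中的 struct opcode opcode_table[256],其 u.execute 成员即 ctxt->execute 的来源。)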
d. rc = fastop(ctxt, fop);
rc= ctxt->execute(ctxt);
需要emulate_instruction的Case
a) IO case
b) handle_invalid_guest_state
c) eptmisconfig
d) handle_invd
e) exception #UD
f) vm86 exception
2.5.2 实模式虚拟化
(1) 初始化
由于cpu复位后将工作在实模式,所以KVM必须实现实模式的虚拟化
VMX模式要求处理器必须工作在分页保护模式下,因此VMM不可能允许guest os将处理器切换到实模式或非分页保护模式,而只能在分页保护模式下为guest os模拟出类似实模式或非分页保护模式的环境。可采用一个vm86任务来模拟客户机所需的实模式环境。
用户态:
kvm_init_vcpu==》 env->kvm_vcpu_dirty = 1;
x86_cpu_reset==》cpu_x86_update_cr0(env, 0x60000010);
这里会初始化cr0
设置cr0到内核态kvm_cpu_exec==> kvm_arch_put_registers(env,KVM_PUT_RUNTIME_STATE);
内核态:kvm_arch_vcpu_ioctl_set_sregs ==> kvm_x86_ops->set_cr0(vcpu,sregs->cr0) = vmx_set_cr0
vmx_vcpu_reset==>
vmx->vcpu.arch.cr0= X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
vmx_set_cr0(&vmx->vcpu,kvm_read_cr0(vcpu)); /* enter rmode */
vmx_set_cr0==> enter_rmode(vcpu);
a) 保存段寄存器到软件变量vmx->rmode.segs
b) vmx->rmode.vm86_active = 1;
c) vmcs_writel(GUEST_TR_BASE, vcpu->kvm->arch.tss_addr);
vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
d) flags = vmcs_readl(GUEST_RFLAGS);
vmx->rmode.save_rflags = flags;
flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
vmcs_writel(GUEST_RFLAGS, flags);
vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);//开启vm86
e) update_exception_bitmap
f) fix_rmode_seg 设置实模式段
var.dpl = 0x3; //最低访问权限为ring3
if (seg == VCPU_SREG_CS)
var.type = 0x3; //读/写、已访问的数据段类型(VM-entry 检查要求 virtual-8086 模式下各段的 type 均为 3)
g) kvm_mmu_reset_context
kvm_mmu_unload(vcpu);
init_kvm_mmu(vcpu);
(2) 执行时需要特殊处理的地方
a) vmx_set_rflags 要设置 rflags |= X86_EFLAGS_IOPL |X86_EFLAGS_VM;
但在get_rflags时不能让用户感知到是vm86: rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
b) vmx_queue_exception 实模式中断注入方式不同
if (vmx->rmode.vm86_active) {
int inc_eip = 0;
if (kvm_exception_is_soft(nr))
inc_eip = vcpu->arch.event_exit_inst_len;
if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE) //模拟int执行
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
return;
}
c) kvm_inject_realmode_interrupt ==》 __emulate_int_real
d) vmx_set_cr4
hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ?
KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
e) vmx_get_segment, vmx_set_segment
f) vmx_inject_irq vmx_inject_nmi
g) handle_exception ==> handle_rmode_exception
2.5.3 模式切换
(1) 实模式 到 保护模式
产生vm-exit 进入 handle_set_cr0
==》 kvm_set_cr0 ==》 kvm_x86_ops->set_cr0(vcpu,cr0) == vmx_set_cr0
vmx_set_cr0 ==> enter_pmode
a) 保存段寄存器到软件变量vmx->rmode.segs
b) vmx->rmode.vm86_active = 0;
c) flags = vmcs_readl(GUEST_RFLAGS);
flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
vmcs_writel(GUEST_RFLAGS, flags);
vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
(vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
d) update_exception_bitmap
e) fix_pmode_seg ==> vmx_set_segment
(2) 分页开启
vmx_set_cr0==> ept_update_paging_mode_cr0
if (!is_paging(vcpu)) {
/* From nonpaging to paging */
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
~(CPU_BASED_CR3_LOAD_EXITING |
CPU_BASED_CR3_STORE_EXITING));
vcpu->arch.cr0 = cr0;
vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
}
采用EPT,不需要监控cr3
下面先看一下辅助函数:
static inline int is_paging(struct kvm_vcpu *vcpu)
{
return likely(kvm_read_cr0_bits(vcpu,X86_CR0_PG));
}
一开始is_paging返回false,在上面vcpu->arch.cr0 = cr0;执行后is_paging将返回true.
vmx_set_cr4:
unsigned long hw_cr4 = cr4 |(to_vmx(vcpu)->rmode.vm86_active ?
KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
......
vmcs_writel(CR4_READ_SHADOW,cr4);
vmcs_writel(GUEST_CR4,hw_cr4);
根据vmcs的设置,Guest 读CR0或CR4 返回shadow的值; 写CR0或CR4时如果与shadow值不等,产生vm-exit, 所以这里需要同时更新GUEST_CR4 和CR4_READ_SHADOW.
当发生模式切换时CR0 的设置值一定与shadow值不同,因而产生vm-exit.
(3) 开启long-mode
long mode可以从实模式,也可以从保护模式进入, long-mode又分为64bit mode和compatibility mode. EFER.LME=1时开启long-mode,但long-mode必须在开启paging后才真正生效,这时EFER.LMA == 1.
vmx_set_cr0
if(!is_paging(vcpu) && (cr0 & X86_CR0_PG))
enter_lmode(vcpu); //开启CR0_PG时进入long-mode
enter_lmode
a. 更新TR段属性到64bit
vmcs_write32(GUEST_TR_AR_BYTES, (guest_tr_ar & ~AR_TYPE_MASK)
|AR_TYPE_BUSY_64_TSS);
b. vmx_set_efer(vcpu,vcpu->arch.efer | EFER_LMA); EFER.LMA设为1
vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
该函数设置VM_ENTRY_CONTROLS bit9 to 1 (控制进入IA32-E模式)