这里只关注x86架构。
按照内核文档,x86架构下,KVM Hypercall是一条3字节的指令:vmcall指令(Intel)或者vmmcall指令(AMD)。
通过寄存器rbx、rcx、rdx、rsi,最多传输四个参数。然后hypercall的调用号存放于rax,并且调用返回值也存放于rax中,不涉及其他寄存器。
原理分析
VM调用接口
arch/x86/include/asm/kvm_para.h
中定义了hypercall接口:
/*
 * Instruction text used by the guest-side hypercall wrappers.
 * ALTERNATIVE() is given "vmcall" and "vmmcall" together with
 * X86_FEATURE_VMMCALL, i.e. the choice between the two instructions is
 * keyed on that CPU feature bit.
 */
#define KVM_HYPERCALL \
ALTERNATIVE("vmcall", "vmmcall", X86_FEATURE_VMMCALL)
/*
 * Issue a hypercall that takes no arguments.
 *
 * @nr: hypercall number, passed through the "a" register constraint.
 *
 * Returns the value left in the "a" register after the instruction.
 */
static inline long kvm_hypercall0(unsigned int nr)
{
long ret;
/* "memory" clobber: the compiler must not cache memory across the call. */
asm volatile(KVM_HYPERCALL
: "=a"(ret)
: "a"(nr)
: "memory");
return ret;
}
/*
 * Issue a hypercall with one argument.
 *
 * @nr: hypercall number, passed through the "a" register constraint.
 * @p1: first argument, passed through the "b" register constraint.
 *
 * Returns the value left in the "a" register after the instruction.
 */
static inline long kvm_hypercall1(unsigned int nr, unsigned long p1)
{
long ret;
asm volatile(KVM_HYPERCALL
: "=a"(ret)
: "a"(nr), "b"(p1)
: "memory");
return ret;
}
/*
 * Issue a hypercall with two arguments.
 *
 * @nr: hypercall number, passed through the "a" register constraint.
 * @p1: first argument, passed through the "b" register constraint.
 * @p2: second argument, passed through the "c" register constraint.
 *
 * Returns the value left in the "a" register after the instruction.
 */
static inline long kvm_hypercall2(unsigned int nr, unsigned long p1,
unsigned long p2)
{
long ret;
asm volatile(KVM_HYPERCALL
: "=a"(ret)
: "a"(nr), "b"(p1), "c"(p2)
: "memory");
return ret;
}
/*
 * Issue a hypercall with three arguments.
 *
 * @nr: hypercall number, passed through the "a" register constraint.
 * @p1: first argument, passed through the "b" register constraint.
 * @p2: second argument, passed through the "c" register constraint.
 * @p3: third argument, passed through the "d" register constraint.
 *
 * Returns the value left in the "a" register after the instruction.
 */
static inline long kvm_hypercall3(unsigned int nr, unsigned long p1,
unsigned long p2, unsigned long p3)
{
long ret;
asm volatile(KVM_HYPERCALL
: "=a"(ret)
: "a"(nr), "b"(p1), "c"(p2), "d"(p3)
: "memory");
return ret;
}
/*
 * Issue a hypercall with four arguments (the largest variant shown here).
 *
 * @nr: hypercall number, passed through the "a" register constraint.
 * @p1: first argument, passed through the "b" register constraint.
 * @p2: second argument, passed through the "c" register constraint.
 * @p3: third argument, passed through the "d" register constraint.
 * @p4: fourth argument, passed through the "S" register constraint.
 *
 * Returns the value left in the "a" register after the instruction.
 */
static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
unsigned long p2, unsigned long p3,
unsigned long p4)
{
long ret;
asm volatile(KVM_HYPERCALL
: "=a"(ret)
: "a"(nr), "b"(p1), "c"(p2), "d"(p3), "S"(p4)
: "memory");
return ret;
}
/*
 * Three-argument hypercall that emits "vmmcall" directly instead of going
 * through the KVM_HYPERCALL macro.
 *
 * @nr: hypercall number, passed through the "a" register constraint.
 * @p1: first argument, passed through the "b" register constraint.
 * @p2: second argument, passed through the "c" register constraint.
 * @p3: third argument, passed through the "d" register constraint.
 *
 * Returns the value left in the "a" register after the instruction.
 * NOTE(review): the name suggests this is meant for SEV guests - confirm
 * against the callers.
 */
static inline long kvm_sev_hypercall3(unsigned int nr, unsigned long p1,
unsigned long p2, unsigned long p3)
{
long ret;
asm volatile("vmmcall"
: "=a"(ret)
: "a"(nr), "b"(p1), "c"(p2), "d"(p3)
: "memory");
return ret;
}
这些接口的主要区别在于所传递参数的个数不同。
KVM_HYPERCALL是一条3字节的指令:x86架构下默认为vmcall指令,在设置了X86_FEATURE_VMMCALL特性的CPU(AMD)上则通过ALTERNATIVE替换为vmmcall指令。该指令会导致VM exit到VMM。
以其中kvm_hypercall4()函数为例:
"=a"(ret)
:表示返回值存放在rax寄存器中;
"a"(nr), "b"(p1), "c"(p2), "d"(p3), "S"(p4)
:表示调用号nr存放于rax寄存器,p1参数存放于rbx,p2存放于rcx,p3存放于rdx,p4存放于rsi。
VMM处理接口
arch/x86/kvm/vmx/vmx.c
中定义了vm_exit的handle处理:
/*
* The exit handlers return 1 if the exit was handled fully and guest execution
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
* to be done to userspace and return 0.
*/
static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
...
[EXIT_REASON_VMCALL] = handle_vmcall,
...
}
/*
 * Exit handler registered for EXIT_REASON_VMCALL.
 * Forwards the vcpu to kvm_emulate_hypercall() and returns its result
 * unchanged.
 */
static int handle_vmcall(struct kvm_vcpu *vcpu)
{
return kvm_emulate_hypercall(vcpu);
}
具体处理hypercall的函数kvm_emulate_hypercall()则在arch/x86/kvm/x86.c
中定义:
/*
 * Handle a hypercall issued by the guest running on @vcpu.
 *
 * Reads the call number through kvm_rax_read() and four arguments through
 * the rbx/rcx/rdx/rsi accessors, dispatches on the call number, writes the
 * result back with kvm_rax_write() and bumps the per-vcpu hypercall counter.
 * When kvm_hv_hypercall_enabled() is true the call is handed to
 * kvm_hv_hypercall() instead and none of the above happens here.
 *
 * Returns the value of kvm_skip_emulated_instruction() (or of
 * kvm_hv_hypercall() on the delegated path).
 */
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
{
unsigned long nr, a0, a1, a2, a3, ret;
int op_64_bit;
/* Delegate completely when kvm_hv_hypercall_enabled() says so. */
if (kvm_hv_hypercall_enabled(vcpu->kvm))
return kvm_hv_hypercall(vcpu);
nr = kvm_rax_read(vcpu); /* fetch the call number and arguments from the registers */
a0 = kvm_rbx_read(vcpu);
a1 = kvm_rcx_read(vcpu);
a2 = kvm_rdx_read(vcpu);
a3 = kvm_rsi_read(vcpu);
trace_kvm_hypercall(nr, a0, a1, a2, a3);
op_64_bit = is_64_bit_mode(vcpu);
/* Outside 64-bit mode only the low 32 bits of each value are kept. */
if (!op_64_bit) {
nr &= 0xFFFFFFFF;
a0 &= 0xFFFFFFFF;
a1 &= 0xFFFFFFFF;
a2 &= 0xFFFFFFFF;
a3 &= 0xFFFFFFFF;
}
/* A non-zero CPL is rejected with -KVM_EPERM before any dispatch. */
if (kvm_x86_ops.get_cpl(vcpu) != 0) {
ret = -KVM_EPERM;
goto out;
}
/*
 * Default result: cases that bail out early with a bare "break" (failed
 * guest_pv_has() check) therefore report -KVM_ENOSYS.
 */
ret = -KVM_ENOSYS;
switch (nr) { /* dispatch on the call number nr */
case KVM_HC_VAPIC_POLL_IRQ:
ret = 0;
break;
case KVM_HC_KICK_CPU:
if (!guest_pv_has(vcpu, KVM_FEATURE_PV_UNHALT))
break;
kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
kvm_sched_yield(vcpu->kvm, a1);
ret = 0;
break;
#ifdef CONFIG_X86_64
case KVM_HC_CLOCK_PAIRING:
ret = kvm_pv_clock_pairing(vcpu, a0, a1);
break;
#endif
case KVM_HC_SEND_IPI:
if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SEND_IPI))
break;
ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
break;
case KVM_HC_SCHED_YIELD:
if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD))
break;
kvm_sched_yield(vcpu->kvm, a0);
ret = 0;
break;
default:
ret = -KVM_ENOSYS;
break;
}
out:
/* Outside 64-bit mode the result is truncated to 32 bits as well. */
if (!op_64_bit)
ret = (u32)ret;
kvm_rax_write(vcpu, ret); /* write the result back to the rax register */
++vcpu->stat.hypercalls;
return kvm_skip_emulated_instruction(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
其中,调用号nr有以下几类:
-
KVM_HC_VAPIC_POLL_IRQ
触发VM客户机退出,以便主机host再重新进入时检查挂起的中断。
-
KVM_HC_KICK_CPU
将vcpu从HLT状态唤醒。
使用举例:
客户机中某个vcpu正在等待某个资源(比如spinlock),一旦忙等超过时间阈值,就可以执行HLT指令。执行HLT指令后,VMM会让该vcpu进入睡眠并继续等待。随后该VM客户机的另一个vcpu可以通过KVM_HC_KICK_CPU hypercall来唤醒指定APIC ID(a1参数)的vcpu,附加参数a0(flags)保留供以后使用。
-
KVM_HC_CLOCK_PAIRING
同步VMM与VM的时钟。
a0:VM中的客户机物理地址,主机会把
struct kvm_clock_offset
结构体拷贝到该地址。
a1:clock_type,目前只支持 KVM_CLOCK_PAIRING_WALLCLOCK (0)(对应于主机host的 CLOCK_REALTIME 时钟)。
-
KVM_HC_SEND_IPI
发送核间中断至多个vCPUs。返回成功传送IPI的vCPU数量。
hypercall允许客户机发送多播IPI,64位下最多128个目的地址,32位下最多64个目的地址。
-
KVM_HC_SCHED_YIELD
当IPI的目标vCPU中有被抢占(preempted)的vCPU时,用于让出(yield)当前CPU。
当正在发送多播IPI目标时,如果目标中有vCPU被抢占了,则yield让出。
VM调用hypercall实例
在 Linux-5.10.59 的内核代码中,搜索kvm_hypercall*的函数调用地方:
[root@localhost linux-5.10.59]# grep --include="*.c" -nr kvm_hypercall* .
./arch/arm64/kvm/pvtime.c:35:long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu)
./arch/arm64/kvm/hypercalls.c:67: val = kvm_hypercall_pv_features(vcpu);
./arch/x86/kvm/x86.c:8156: trace_kvm_hypercall(nr, a0, a1, a2, a3);
./arch/x86/kernel/kvm.c:538: ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
./arch/x86/kernel/kvm.c:549: ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
./arch/x86/kernel/kvm.c:594: kvm_hypercall1(KVM_HC_SCHED_YIELD, per_cpu(x86_cpu_to_apicid, cpu));
./arch/x86/kernel/kvm.c:872: kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid);
./tools/testing/selftests/kvm/lib/x86_64/processor.c:1249:uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2,
./tools/testing/selftests/kvm/x86_64/kvm_pv_test.c:126: r = kvm_hypercall(hc->nr, 0, 0, 0, 0);
./drivers/ptp/ptp_kvm.c:58: ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING,
./drivers/ptp/ptp_kvm.c:119: ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING,
./drivers/ptp/ptp_kvm.c:180: ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, clock_pair_gpa,
以其中一处为例,arch/x86/kernel/kvm.c 中唤醒一个vcpu:
/*
 * Wake up a halted vcpu: look up the APIC ID recorded for @cpu and issue
 * the KVM_HC_KICK_CPU hypercall for it. The flags argument is always 0.
 */
static void kvm_kick_cpu(int cpu)
{
	unsigned long kick_flags = 0;
	int target_apicid = per_cpu(x86_cpu_to_apicid, cpu);

	kvm_hypercall2(KVM_HC_KICK_CPU, kick_flags, target_apicid);
}