内核态代码位于Linux内核代码的 virt 和arch/x86/kvm 两个目录下;本节将分析内核态代码的架构与模块划分以及内核态对用户空间提供的接口。
1.2.1 对外字符设备
Intel 的虚拟化模块初始化入口如下:
vmx_init (arch/x86/kvm/vmx.c)
该函数为msr_bitmap 和 io_bitmap分配内存, 并调用
kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
__alignof__(struct vcpu_vmx),THIS_MODULE);
static struct kvm_x86_ops vmx_x86_ops = {
......
};
该结构为virt 层 调用arch/x86/kvm的接口
int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
struct module *module)
该函数流程如下:
a. kvm_arch_init 注册 x86 arch 函数操作结构
b. kvm_irqfd_init; //初始化irqfd_cleanup_wq workqueue; KVM_IRQFD ioctl时会用到
c. kvm_arch_hardware_setup ==》 kvm_x86_ops->hardware_setup
d. 对每个 Cpu 执行kvm_x86_ops->check_processor_compatibility(rtn);
e. 注册register_cpu_notifier(&kvm_cpu_notifier);
f. 注册register_reboot_notifier(&kvm_reboot_notifier);
g. 注册char 设备 misc_register(&kvm_dev);//kvm_dev是kvm对应用层的访问接口
h. register_syscore_ops(&kvm_syscore_ops);注册电源管理回调
i. kvm_preempt_ops 初始化, 该结构在创建vcpu时使用(kvm_vm_ioctl_create_vcpu)
static struct syscore_ops kvm_syscore_ops = { //当VMM host收到电源管理事件时被调用
.suspend = kvm_suspend,==》 kvm_x86_ops->hardware_disable
.resume = kvm_resume,==》 kvm_x86_ops->hardware_enable
};
当vcpu所在进程被调度运行(sched_in)或被调度出去(sched_out)时,会触发下面的回调。
kvm_preempt_ops.sched_in= kvm_sched_in;
kvm_preempt_ops.sched_out= kvm_sched_out;
static struct miscdevice kvm_dev = {
KVM_MINOR,
"kvm",
&kvm_chardev_ops,
};
static struct file_operations kvm_chardev_ops = {
.unlocked_ioctl =kvm_dev_ioctl,
.compat_ioctl = kvm_dev_ioctl,
.llseek = noop_llseek,
};
应用层qemu通过kvm接口调用内核,其中KVM_CREATE_VM用于创建虚拟机。
static long kvm_dev_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
long r = -EINVAL;
switch (ioctl) {
case KVM_GET_API_VERSION:
r = KVM_API_VERSION;
break;
case KVM_CREATE_VM:
r = kvm_dev_ioctl_create_vm(arg); //建立vm对应用层的char设备
break;
case KVM_CHECK_EXTENSION:
r = kvm_dev_ioctl_check_extension_generic(arg);
break;
case KVM_GET_VCPU_MMAP_SIZE:
r = -EINVAL;
if (arg)
goto out;
r = PAGE_SIZE; /* struct kvm_run */
#ifdef CONFIG_X86
r += PAGE_SIZE; /* pio data page */
#endif
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
r += PAGE_SIZE; /* coalesced mmio ring page */
#endif
break;
......
default:
return kvm_arch_dev_ioctl(filp, ioctl, arg);
}
out:
return r;
}
kvm_dev_ioctl_create_vm 创建VM虚拟机,流程如下:
a. kvm_create_vm (虚拟机结构 为 struct kvm)
==> kvm_arch_init_vm(arch/x86/kvm/x86.c)
==> kvm_init_mmu_notifier注册 vmm内存通知结构kvm_mmu_notifier_ops
static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
.invalidate_page = kvm_mmu_notifier_invalidate_page,
.invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
.invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
.clear_flush_young = kvm_mmu_notifier_clear_flush_young,
.test_young = kvm_mmu_notifier_test_young,
.change_pte = kvm_mmu_notifier_change_pte,
.release = kvm_mmu_notifier_release,
};
b. hardware_enable_all ==》 kvm_x86_ops->hardware_enable
c. kvm_eventfd_init
d. 注册虚拟机字符设备 (kvm_vm_fops)
1.2.2 VM 字符设备
static struct file_operations kvm_vm_fops = {
.release = kvm_vm_release,
.unlocked_ioctl =kvm_vm_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = kvm_vm_compat_ioctl,
#endif
.llseek = noop_llseek,
};
Ioctl Cmd | Implement function | Src file |
KVM_CREATE_VCPU | kvm_vm_ioctl_create_vcpu | Kvm_main.c |
KVM_SET_USER_MEMORY_REGION | kvm_vm_ioctl_set_memory_region | Kvm_main.c |
KVM_GET_DIRTY_LOG | kvm_vm_ioctl_get_dirty_log | Arch/x86/kvm/x86.c |
KVM_REGISTER_COALESCED_MMIO | kvm_vm_ioctl_register_coalesced_mmio | Coalesced_mmio.c |
KVM_UNREGISTER_COALESCED_MMIO | kvm_vm_ioctl_unregister_coalesced_mmio | Coalesced_mmio.c |
KVM_IRQFD | kvm_irqfd | Eventfd.c |
KVM_IOEVENTFD | kvm_ioeventfd | Eventfd.c |
KVM_SIGNAL_MSI | kvm_send_userspace_msi | Irqchip.c |
KVM_SET_GSI_ROUTING | kvm_set_irq_routing | Irqchip.c |
KVM_CREATE_DEVICE | kvm_ioctl_create_device | Kvm_main.c |
default | kvm_arch_vm_ioctl kvm_vm_ioctl_assigned_device | Arch/x86/kvm/x86.c Assigned-dev.c |
下面是vcpu创建的代码分析:
kvm_vm_ioctl_create_vcpu
a. kvm_arch_vcpu_create ==> kvm_x86_ops -> vcpu_create
b. preempt_notifier_init(&vcpu->preempt_notifier,&kvm_preempt_ops);
c. kvm_arch_vcpu_setup ==> kvm_x86_ops->vcpu_load
d. create_vcpu_fd ==> anon_inode_getfd("kvm-vcpu",&kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC); //创建vcpu 的字符设备
static struct file_operations kvm_vcpu_fops = {
.release = kvm_vcpu_release,
.unlocked_ioctl =kvm_vcpu_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = kvm_vcpu_compat_ioctl,
#endif
.mmap = kvm_vcpu_mmap,
.llseek = noop_llseek,
};
kvm_vcpu_mmap: vma->vm_ops = &kvm_vcpu_vm_ops;
static const struct vm_operations_struct kvm_vcpu_vm_ops = {
.fault =kvm_vcpu_fault,
};
kvm_vcpu_fault: 映射vcpu->run 的地址,这样qemu可以直接访问该内存。
kvm_x86_ops -> vcpu_create = vmx_create_vcpu ==> kvm_vcpu_init 会分配 vcpu->run的空间
kvm_vcpu_ioctl
Ioctl Cmd | Implement function |
KVM_RUN | kvm_arch_vcpu_ioctl_run 进入vm-entry |
KVM_GET_REGS | kvm_arch_vcpu_ioctl_get_regs==> kvm_register_read(kvm_cache_reg.h) |
KVM_SET_REGS | kvm_arch_vcpu_ioctl_set_regs ==> kvm_register_write |
KVM_GET_SREGS | kvm_arch_vcpu_ioctl_get_sregs |
KVM_SET_SREGS | kvm_arch_vcpu_ioctl_set_sregs |
KVM_GET_MP_STATE | kvm_arch_vcpu_ioctl_get_mpstate |
KVM_SET_MP_STATE | kvm_arch_vcpu_ioctl_set_mpstate |
KVM_TRANSLATE | kvm_arch_vcpu_ioctl_translate |
KVM_SET_SIGNAL_MASK | kvm_vcpu_ioctl_set_sigmask |
KVM_GET_FPU | kvm_arch_vcpu_ioctl_get_fpu |
KVM_SET_FPU | kvm_arch_vcpu_ioctl_set_fpu |
kvm_arch_vcpu_ioctl |
|
1.2.3 KVM 内核空间模块
(1) arch/x86/kvm 下的模块
File | Description | Interface func | Function note | |
Cpuid.c | 处理cpuid指令相关代码 | kvm_emulate_cpuid | handle_cpuid 响应cpuid造成的vm-exit | |
kvm_cpuid | 响应kvm_x86_ops--》ioctl get_cpuid | |||
Emulate.c | 模拟指令的执行 | x86_emulate_insn | 当异常造成vm_exits时x86_emulate_instruction | |
I8254.c | 管理虚拟8254 控制器 | kvm_create_pit | ioctl KVM_CREATE_PIT2 kvm_create_pit | |
I8259.c | 管理虚拟8259控制器 | kvm_create_pic | KVM_CREATE_IRQCHIP: kvm_create_pic | |
Irq.c | 虚拟中断管理 | kvm_cpu_get_interrupt kvm_inject_pending_timer_irqs …… |
| |
Lapic.c |
|
| Irq.c依赖该文件 | |
Mmu.c | 虚拟机内存管理 | kvm_mmu_page_fault kvm_mmu_invlpg
| 页与缓存管理 | |
Pmu.c | 虚拟机性能监控管理 |
|
| |
Vmx.c | X86虚拟机管理主函数 | 各种异常处理函数入口 kvm_vmx_exit_handlers
x86 arch的入口文件 |
| |
X86.c | Virt 模块到x86 arch的中间层 |
|
|
(2) virt下的模块
File | Description | Detail | |
Kvm_main.c | 内核态的主模块 | Kvm的初始化,对应用层接口的实现。Os回调的接口实现 | |
Ioapic.c Irqchip.c Irq_comm.c | 中断模块 | Kvm虚拟机中断控制器的实现入口 | |
coalesced_mmio.c eventfd.c | io虚拟化 |
| |
Pci_assign.c Vfio.c | 直接io虚拟化 | Pci_assign与vfio是两种不同的直接io实现方式 | |
Iommu.c | Iommu入口 | 用于直接io,调用drivers/iommu模块完成相应功能,第7章会分析该部分 | |
Async_pf.c | 异步任务 | 用于cpu内存与中断的异步任务执行 |
(3) guest_os下的模块
Kvm的内核模块除了在vm host运行外,在采用半虚拟化的情况下,还需要在guest os上有对应的驱动(假设guest os 也运行Linux)。下面是其代码路径与说明:
File | Description | Detail |
Drivers/virtio/*.c | Virtio模块 | 第6章分析 |
Arch/x86/ (arch\x86\kernel\ kvmclock.c pvclock.c) | 时间虚拟化 | 4.3节分析 |