KVM 是 Linux 內核的一個模組,它會以裝置 /dev/kvm
向外界提供它的功能。QEMU 透過 ioctl
去讀寫該裝置,請求 KVM 完成特定任務。KVM 主要的工作有兩個: 第一,它負責檢視客戶機 VM Exit 的原因並做相對應的處理; 第二,它負責透過 VM Entry 啟動客戶機。客戶機若因為操作 IO 而觸發 VM Exit,KVM 會轉交 QEMU 完成 IO。整個 KVM 流程基本如下 [14]:
-
開啟
/dev/kvm
取得 fd。 -
透過
ioctl
操作 /dev/kvm
取得 VM fd。 -
再透過
ioctl
操作 VM fd,針對每一個 VCPU 取得個別的 fd。
-
struct KVMState
和 struct KVMSlot
分別是其重要資料結構
typedef struct KVMSlot { target_phys_addr_t start_addr; ram_addr_t memory_size; void *ram; int slot; int flags; } KVMSlot; struct KVMState { KVMSlot slots[32]; int fd; int vmfd; int coalesced_mmio; struct kvm_coalesced_mmio_ring *coalesced_mmio_ring; bool coalesced_flush_in_progress; int broken_set_mem_region; int migration_log; int vcpu_events; int robust_singlestep; int debugregs; int pit_state2; int xsave, xcrs; int many_ioeventfds; /* The man page (and posix) say ioctl numbers are signed int, but * they're not. Linux, glibc and *BSD all treat ioctl numbers as * unsigned, and treating them as signed here can break things */ unsigned irqchip_inject_ioctl; };
-
CPU_COMMON
(cpu-defs.h
) 裡有欄位給 KVM 使用。#define CPU_COMMON struct KVMState *kvm_state; \ struct kvm_run *kvm_run; \ int kvm_fd; \ int kvm_vcpu_dirty;
-
main
(vl.c
) 會呼叫 configure_accelerator
檢查使用者是否選用 KVM。int main(int argc, char **argv, char **envp) { ... 略 ... /* init the memory */ if (ram_size == 0) { ram_size = DEFAULT_RAM_SIZE * 1024 * 1024; } configure_accelerator(); qemu_init_cpu_loop(); if (qemu_init_main_loop()) { fprintf(stderr, "qemu_init_main_loop failed\n"); exit(1); } ... 略 ... }
-
kvm_init
(kvm-all.c
)。
int kvm_init(void) { KVMState *s; // slot 是用來記錄客戶機物理位址與 QEMU 虛擬位址的映射。 for (i = 0; i < ARRAY_SIZE(s->slots); i++) { s->slots[i].slot = i; } // 開啟 /dev/kvm 取得 fd。 s->fd = qemu_open("/dev/kvm", O_RDWR); // 透過 ioctl 操作 /dev/kvm 取得 VM fd。 s->vmfd = kvm_ioctl(s, KVM_CREATE_VM, 0); kvm_state = s; // kvm_state 為一全域變數。 memory_listener_register(&kvm_memory_listener, NULL);
-
-
當 KVM 開啟時,VCPU handler 為
qemu_kvm_cpu_thread_fn
,qemu_kvm_start_vcpu
會喚起一個執行緒執行 qemu_kvm_cpu_thread_fn
。若是原本 TCG 的模式,則改由 qemu_tcg_init_vcpu
喚起 qemu_tcg_cpu_thread_fn
。
static void qemu_kvm_start_vcpu(CPUArchState *env) { env->thread = g_malloc0(sizeof(QemuThread)); env->halt_cond = g_malloc0(sizeof(QemuCond)); qemu_cond_init(env->halt_cond); qemu_thread_create(env->thread, qemu_kvm_cpu_thread_fn, env, QEMU_THREAD_JOINABLE); while (env->created == 0) { qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex); } } void qemu_init_vcpu(void *_env) { CPUArchState *env = _env; env->nr_cores = smp_cores; env->nr_threads = smp_threads; env->stopped = 1; if (kvm_enabled()) { qemu_kvm_start_vcpu(env); } else if (tcg_enabled()) { qemu_tcg_init_vcpu(env); } else { qemu_dummy_start_vcpu(env); } }
-
qemu_kvm_cpu_thread_fn
呼叫 kvm_cpu_exec
此一主要執行迴圈。static void *qemu_kvm_cpu_thread_fn(void *arg) { ... 略 ... r = kvm_init_vcpu(env); if (r < 0) { fprintf(stderr, "kvm_init_vcpu failed: %s\n", strerror(-r)); exit(1); } qemu_kvm_init_cpu_signals(env); /* signal CPU creation */ env->created = 1; qemu_cond_signal(&qemu_cpu_cond); while (1) { if (cpu_can_run(env)) { r = kvm_cpu_exec(env); if (r == EXCP_DEBUG) { cpu_handle_guest_debug(env); } } qemu_kvm_wait_io_event(env); } return NULL; }
-
kvm_init_vcpu
(kvm-all.c
)。
int kvm_init_vcpu(CPUArchState *env) { KVMState *s = kvm_state; long mmap_size; int ret; ret = kvm_vm_ioctl(s, KVM_CREATE_VCPU, env->cpu_index); env->kvm_fd = ret; // VCPU fd 而非 KVM fd。http://lists.gnu.org/archive/html/qemu-devel/2012-06/msg02302.html env->kvm_state = s; env->kvm_vcpu_dirty = 1; // QEMU 的 kvm_run 被 mmap 到 VCPU fd。這非常重要,當後續 KVM 將客戶機的 IO 交給 QEMU 執行, // QEMU 就是透過 kvm_run 讀取 IO 相關細節。 env->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, env->kvm_fd, 0); ret = kvm_arch_init_vcpu(env); if (ret == 0) { qemu_register_reset(kvm_reset_vcpu, env); kvm_arch_reset_vcpu(env); } err: return ret; }
-
-
主要執行迴圈為
kvm_cpu_exec
(kvm-all.c
)。int kvm_cpu_exec(CPUArchState *env) { struct kvm_run *run = env->kvm_run; do { ... 略 ... run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0); // 檢視 VMExit 的原因,並做相應的處理。若 VMExit 可由 KVM (內核) 處理,由 KVM 處理。 // 其餘諸如 IO 則交給 QEMU。 switch (run->exit_reason) { // IO 交由 QEMU (用戶態) 處理。 case KVM_EXIT_IO: DPRINTF("handle_io\n"); kvm_handle_io(run->io.port, (uint8_t *)run + run->io.data_offset, run->io.direction, run->io.size, run->io.count); ret = 0; break; case KVM_EXIT_MMIO: DPRINTF("handle_mmio\n"); cpu_physical_memory_rw(run->mmio.phys_addr, run->mmio.data, run->mmio.len, run->mmio.is_write); ret = 0; break; ... 略 ... // 其餘交由平台特定的 handler 處理。 default: DPRINTF("kvm_arch_handle_exit\n"); ret = kvm_arch_handle_exit(env, run); break; } } while (ret == 0); }
-
不同平台定義不同的
kvm_arch_handle_exit
。以 x86 為例,kvm_arch_handle_exit
(target-i386/kvm.c
)。int kvm_arch_handle_exit(CPUX86State *env, struct kvm_run *run){ }
-
KVM 和 QEMU 之間會同步一些資料結構,例如:
struct kvm_run
。/* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */ struct kvm_run { /* in */ __u8 request_interrupt_window; __u8 padding1[7]; /* out */ __u32 exit_reason; __u8 ready_for_interrupt_injection; __u8 if_flag; __u8 padding2[2]; /* in (pre_kvm_run), out (post_kvm_run) */ __u64 cr8; __u64 apic_base; ... 略 ... }; struct kvm_vcpu { ... 略 ... struct kvm_run *run; ... 略 ... }; static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, unsigned short port, void *val, unsigned int count, bool in) { trace_kvm_pio(!in, port, size, count); vcpu->arch.pio.port = port; vcpu->arch.pio.in = in; vcpu->arch.pio.count = count; vcpu->arch.pio.size = size; if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { vcpu->arch.pio.count = 0; return 1; } // 回到 QEMU 之後,QEMU 會檢視以下欄位。 vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; vcpu->run->io.size = size; vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; vcpu->run->io.count = count; vcpu->run->io.port = port; return 0; }
-
上述流程可以參考 http://mac-on-linux.svn.sourceforge.net/viewvc/mac-on-linux/trunk/src/cpu/kvm/misc.c?revision=166&view=markup 中的
molcpu_mainloop
。
-