8.2.1 虚拟机状态的维护
本节通过CPU、内存和硬盘来分析虚拟机的状态维护
(1) CPU
cpu_exec_init ==>
vmstate_register(NULL, cpu_index,&vmstate_cpu_common, env);
register_savevm(NULL, "cpu", cpu_index, CPU_SAVE_VERSION,
cpu_save, cpu_load, env); // 用于保存和恢复CPU的状态
static const VMStateDescription vmstate_cpu_common = {
.name = "cpu_common",
.version_id = 1,
.minimum_version_id = 1,
.minimum_version_id_old = 1,
.post_load = cpu_common_post_load,
.fields = (VMStateField []) {
VMSTATE_UINT32(halted, CPUArchState),
VMSTATE_UINT32(interrupt_request,CPUArchState),
VMSTATE_END_OF_LIST()
}
};
void cpu_save(QEMUFile *f,void *opaque) {
vmstate_save_state(f, &vmstate_cpu,opaque);
}
int cpu_load(QEMUFile *f,void *opaque, int version_id) {
return vmstate_load_state(f,&vmstate_cpu, opaque, version_id);
}
vmstate_cpu 也是VMStateDescription ; 它里面会保存cpu的寄存器。
(2)内存
pc_memory_init==> vmstate_register_ram_global(ram) ==>
vmstate_register_ram(mr, NULL);
void vmstate_register_ram(MemoryRegion *mr, DeviceState *dev)
{
qemu_ram_set_idstr(memory_region_get_ram_addr(mr) &TARGET_PAGE_MASK,
memory_region_name(mr),dev);
}
找到要保存内存在ram_list中的位置,ram_list在qemu_ram_alloc_from_ptr中分配。
vl.c ==> register_savevm_live(NULL, "ram", 0,4, &savevm_ram_handlers, NULL);
SaveVMHandlers savevm_ram_handlers = {
.save_live_setup = ram_save_setup,
.save_live_iterate = ram_save_iterate,
.save_live_complete = ram_save_complete,
.load_state = ram_load,
.cancel = ram_migration_cancel,
};
ram_save_iterate ==> ram_save_block ==>save_block_hdr
将虚拟机内存持久化到snapshot文件中.
(3) 硬盘
block 层
blk_mig_init==>register_savevm_live(NULL, "block", 0, 1, &savevm_block_handlers,
&block_mig_state);
SaveVMHandlers savevm_block_handlers = {
.set_params = block_set_params,
.save_live_setup = block_save_setup,
.save_live_iterate = block_save_iterate,
.save_live_complete = block_save_complete,
.load_state = block_load,
.cancel = block_migration_cancel,
.is_active = block_is_active,
};
ide controller:
vmstate_register(&d->dev.qdev,0, &vmstate_ide_pci, d);
const VMStateDescription vmstate_ide_pci = {
.name = "ide",
.version_id = 3,
.minimum_version_id = 0,
.minimum_version_id_old = 0,
.post_load = ide_pci_post_load,
.fields = (VMStateField []) {
VMSTATE_PCI_DEVICE(dev, PCIIDEState),
VMSTATE_STRUCT_ARRAY(bmdma,PCIIDEState, 2, 0,
vmstate_bmdma,BMDMAState),
VMSTATE_IDE_BUS_ARRAY(bus, PCIIDEState,2),
VMSTATE_IDE_DRIVES(bus[0].ifs, PCIIDEState),
VMSTATE_IDE_DRIVES(bus[1].ifs,PCIIDEState),
VMSTATE_END_OF_LIST()
}
};
pci_piix_init_ports ==> qemu_add_vm_change_state_handler(d->bus[i].dma->ops->restart_cb,
&d->bmdma[i].dma);
restart_cb = bmdma_restart_cb
static void bmdma_restart_cb(void *opaque, int running, RunState state)
{
........
if (!running) // 虚拟机尚未处于running状态,直接返回
return;
if (!bm->bh) {
bm->bh =qemu_bh_new(bmdma_restart_bh, &bm->dma);
qemu_bh_schedule(bm->bh);
}
}
注册了vm_change的notify
8.2.2 虚拟机VM state数据结构
struct VMStateDescription {
const char *name;
int unmigratable;
int version_id;
int minimum_version_id;
int minimum_version_id_old;
LoadStateHandler *load_state_old;
int (*pre_load)(void *opaque);
int (*post_load)(void *opaque, int version_id);
void (*pre_save)(void *opaque);
VMStateField *fields;
const VMStateSubsection *subsections;
};
其中VMStateField 用于保存 opaque变量中的某些字段。
例子1:VMStateField结构体指定了在opaque中的偏移和长度。对于CPU而言,opaque为env;
VMSTATE_UINT32(halted, CPUArchState) 的含义为:env的类型为CPUArchState,保存其成员halted,即env->halted。
#define VMSTATE_UINT32_V(_f,_s, _v) \
VMSTATE_SINGLE(_f, _s, _v, vmstate_info_uint32,uint32_t)
const VMStateInfo vmstate_info_uint32 = {
.name = "uint32",
.get = get_uint32,
.put = put_uint32,
};
例子2:VMSTATE_STRUCT_ARRAY(bmdma, PCIIDEState, 2, 0,
vmstate_bmdma,BMDMAState),
PCIIDEState结构体成员数组BMDMAState bmdma[2];
int register_savevm(DeviceState *dev,
const char *idstr, intinstance_id,
int version_id,
SaveStateHandler*save_state,
LoadStateHandler *load_state,
void *opaque)
{
SaveVMHandlers *ops =g_malloc0(sizeof(SaveVMHandlers));
ops->save_state = save_state;
ops->load_state =load_state;
return register_savevm_live(dev, idstr,instance_id, version_id,
ops, opaque);
}
register_savevm_live ==> {
SaveStateEntry *se;
se = g_malloc0(sizeof(SaveStateEntry));
se->version_id = version_id;
se->section_id = global_section_id++;
se->ops = ops;
se->opaque = opaque;
se->vmsd = NULL;
se->no_migrate = 0;
。。。。。。
pstrcat(se->idstr, sizeof(se->idstr),idstr);
if (instance_id == -1)
se->instance_id =calculate_new_instance_id(se->idstr);
else
se->instance_id = instance_id;
QTAILQ_INSERT_TAIL(&savevm_handlers, se, entry);
}
savevm_handlers记录了所有的save 单元
vmstate_register ==> vmstate_register_with_alias_id{
se = g_malloc0(sizeof(SaveStateEntry));
se->version_id = vmsd->version_id;
se->section_id = global_section_id++;
se->opaque = opaque;
se->vmsd = vmsd;
se->alias_id = alias_id;
se->no_migrate = vmsd->unmigratable;
..............
}
与上一个的区别在于没有给ops赋值
8.2.3 虚拟机Save流程
(1) save 主流程
vm save/load的主流程在savevm.c; 本节重点分析虚拟机save的流程,其入口函数为:
savevm.c: void do_savevm(Monitor *mon, const QDict *qdict)
a. 得到能做snapshot的block device bs
b. 停止虚拟机
saved_vm_running = runstate_is_running();
vm_stop(RUN_STATE_SAVE_VM) ==> do_vm_stop
static void do_vm_stop(RunState state) {
if (runstate_is_running()) {
cpu_disable_ticks();
pause_all_vcpus(); //停止vcpu的运行,停止运行vcpu的线程
runstate_set(state);
vm_state_notify(0, state); //vm state nofity 的回调会被调用,如ide的
bdrv_drain_all();
bdrv_flush_all();
monitor_protocol_event(QEVENT_STOP, NULL);
}
}
c. 得到虚拟机停止时的时间
sn->vm_clock_nsec =qemu_get_clock_ns(vm_clock);
d. 如果old snapshot文件存在,删除之
e. 保存虚拟机状态
QEMUFile * f = qemu_fopen_bdrv(bs, 1);
QEMUFile 结构提供了2MB的缓存,当缓存满时,会调用
QEMUFilePutBufferFunc *put_buffer; =block_put_buffer
QEMUFileGetBufferFunc *get_buffer; =block_get_buffer
将数据写/读到后备block device。
ret = qemu_savevm_state(f);
vm_state_size = qemu_ftell(f);
qemu_fclose(f);
f. 创建snapshot
(2) vm state save
qemu_savevm_state {
//默认se->no_migrate为0,如果有1的情况, savevm将不能继续
qemu_savevm_state_blocked(NULL));
ret = qemu_savevm_state_begin(f, &params);
do {
ret = qemu_savevm_state_iterate(f);
} while (ret == 0);
ret = qemu_savevm_state_complete(f);
}
qemu_savevm_state_begin:
a. 生成vm state magic info
qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
qemu_put_be32(f, QEMU_VM_FILE_VERSION);
b. 对每个SaveStateEntry se
如果
if (!se->ops ||!se->ops->save_live_setup) {
continue;
if(!se->ops->is_active(se->opaque))
continue;
否则就保存 (内存 & block都会执行该分支):
qemu_put_byte(f,QEMU_VM_SECTION_START);
qemu_put_be32(f, se->section_id);
len = strlen(se->idstr);
qemu_put_byte(f, len);
qemu_put_buffer(f, (uint8_t*)se->idstr, len);
qemu_put_be32(f, se->instance_id);
qemu_put_be32(f, se->version_id);
ret = se->ops->save_live_setup(f,se->opaque);
qemu_savevm_state_iterate:
对每个SaveStateEntry se
if (!se->ops ||!se->ops->save_live_iterate)
continue;
if (se->ops &&se->ops->is_active)
if(!se->ops->is_active(se->opaque)) {
continue;
}
qemu_put_byte(f, QEMU_VM_SECTION_PART);
qemu_put_be32(f, se->section_id);
ret =se->ops->save_live_iterate(f, se->opaque);
qemu_savevm_state_complete:
a. 对每个SaveStateEntry se
if (!se->ops ||!se->ops->save_live_iterate)
continue;
if (se->ops && se->ops->is_active)
if(!se->ops->is_active(se->opaque)) {
continue;
}
qemu_put_byte(f, QEMU_VM_SECTION_END);
qemu_put_be32(f, se->section_id);
ret =se->ops->save_live_complete(f, se->opaque);
b. 对每个SaveStateEntry se
if ((!se->ops ||!se->ops->save_state) && !se->vmsd) {
continue;
qemu_put_byte(f, QEMU_VM_SECTION_FULL);
qemu_put_be32(f, se->section_id);
len = strlen(se->idstr);
qemu_put_byte(f, len);
qemu_put_buffer(f, (uint8_t*)se->idstr, len);
qemu_put_be32(f, se->instance_id);
qemu_put_be32(f, se->version_id);
vmstate_save(f, se); //call vmstate_save_state(f,se->vmsd,se->opaque);
vmstate_save_state根据vmsd的VMStateField数组信息,保存其描述的在se->opaque对应内存位置的值,如果对应位置是结构体,则递归调用:
if (field->flags & VMS_STRUCT){
vmstate_save_state(f,field->vmsd, addr);
} else {
field->info->put(f,addr, size);
}
(3) vm load流程简介
qemu_loadvm_state(savevm.c),其流程与savevm相反,具体如下:
(1) 获取要恢复的设备信息:
根据保存的 instance_id =qemu_get_be32(f);
version_id = qemu_get_be32(f);
和savevm_handler, 得到要恢复设备的对象信息
se = find_se(idstr, instance_id);
并将其加入到
le = g_malloc0(sizeof(*le));
le->se = se;
le->section_id = section_id;
le->version_id = version_id;
QLIST_INSERT_HEAD(&loadvm_handlers, le, entry);
(2) 对每个单元做数据恢复
vmstate_load(f, le->se,le->version_id);
(3) 恢复vcpu的寄存器
cpu_synchronize_all_post_init ==> kvm_arch_put_registers
8.2.4 动态迁移
动态迁移的主要目标是在客户机没有感觉的情况下,将客户机迁移到另一个物理机器上,从而保证服务正常使用。由于在迁移过程中会出现从源主机上迁移操作开始、到目的主机上客户机服务恢复之间的不可用时间——此时源主机上的客户机已经暂停服务,目的主机上的客户机还未恢复服务——因此设计目标是尽可能地缩短该时间。动态迁移要满足如下条件才能进行:
1) 源宿主机和目的宿主机之间尽量用网络共享的存储系统来保存客户机磁盘镜像。尽管KVM动态迁移也支持连同磁盘镜像一起复制,但共享存储(如NFS)在源宿主机和目的宿主机上的挂载位置必须完全一致
2) 为了提高动态迁移的成功率,尽量在同类型CPU的主机上进行动态迁移。 3) 64位的客户机只能在64位宿主机之间迁移,而32位客户机可以在32位宿主机和64位宿主机之间迁移。
4) 动态迁移的源宿主机和目的宿主机对NX(一种安全特性)位的设置必须相同,要么同为关闭状态,要么同为打开状态。在Intel平台上的Linux系统中,用“cat /proc/cpuinfo | grep nx”命令可以查看是否有NX的支持
5) 在进行动态迁移时,被迁移客户机的名称是唯一的,在目的宿主机上不能有与源宿主机被迁移客户机同名的客户机存在
使用步骤如下:
a. 源和目的宿主机上挂载nfs文件系统
b. 增加选项-incoming tcp:0(允许来自任何主机的连接):xxxx(端口号) 在目的宿主机上启动虚拟机
c.源虚拟机的monitor输入migrate tcp:目的机ip:xxxx
下面简单分析动态迁移的代码流程:
(1) 源端:
qmp_migrate(migration.c) ==>
migrate_init(&params);
if (strstart(uri, "tcp:",&p))
ret = tcp_start_outgoing_migration(s,p, errp);
}else if (strstart(uri, "exec:",&p))
ret = exec_start_outgoing_migration(s,p);
}else if (strstart(uri, "unix:",&p))
ret = unix_start_outgoing_migration(s,p);
}else if (strstart(uri, "fd:",&p))
ret = fd_start_outgoing_migration(s,p);
tcp_start_outgoing_migration==>inet_nonblocking_connect(host_port, tcp_wait_for_connect,s,
errp);
tcp_wait_for_connect ==>migrate_fd_connect
void migrate_fd_connect(MigrationState *s)
{
int ret;
s->state = MIG_STATE_ACTIVE;
s->file = qemu_fopen_ops_buffered(s, s->bandwidth_limit,
migrate_fd_put_buffer,
migrate_fd_put_ready,
migrate_fd_wait_for_unfreeze,
migrate_fd_close);
ret = qemu_savevm_state_begin(s->file,&s->params); //保存支持动态迁移的设备信息
migrate_fd_put_ready(s);
}
static void migrate_fd_put_ready(void *opaque)
{
ret = qemu_savevm_state_iterate(s->file);
if (ret < 0) {
migrate_fd_error(s);
} else if (ret == 1) {
int old_vm_running =runstate_is_running();
qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
if(qemu_savevm_state_complete(s->file) < 0) {
migrate_fd_error(s);
} else {
migrate_fd_completed(s);
}
s->total_time =qemu_get_clock_ms(rt_clock) - s->total_time;
if (s->state != MIG_STATE_COMPLETED){
if (old_vm_running) {
vm_start();
}
}
}
}
(2) 目的端:
main(vl.c):
if (incoming) {
Error *errp = NULL;
int ret =qemu_start_incoming_migration(incoming, &errp);
}
qemu_start_incoming_migration==> tcp_start_incoming_migration ==>
qemu_set_fd_handler2(s, NULL,tcp_accept_incoming_migration, NULL,
(void *)(intptr_t)s);
tcp_accept_incoming_migration==> process_incoming_migration
void process_incoming_migration(QEMUFile *f)
{
if (qemu_loadvm_state(f) < 0) {
fprintf(stderr, "load of migration failed\n");
exit(0);
}
qemu_announce_self();
bdrv_clear_incoming_migration_all();
bdrv_invalidate_cache_all();
if (autostart) {
vm_start();
} else {
runstate_set(RUN_STATE_PRELAUNCH);
}
}