3.2.1 Qemu内存管理结构
(1) KVM内存管理初始化
main(vl.c)==>configure_accelerator==>kvm_init(kvm_all.c)==> memory_listener_register(&kvm_memory_listener,NULL);
Qemu中可以注册多个listener, 用memory_listeners链表来维护
/*
 * Register a MemoryListener, keeping the global memory_listeners list
 * sorted by ascending priority.  'filter' restricts the listener to a
 * single address space (NULL means "listen to all address spaces").
 */
void memory_listener_register(MemoryListener *listener, MemoryRegion *filter)
{
    MemoryListener *other = NULL;

    listener->address_space_filter = filter;
    if (QTAILQ_EMPTY(&memory_listeners)
        || listener->priority >= QTAILQ_LAST(&memory_listeners,
                                             memory_listeners)->priority) {
        /* Highest priority seen so far: append at the tail. */
        QTAILQ_INSERT_TAIL(&memory_listeners, listener, link);
    } else {
        /* Insert before the first listener of strictly higher priority. */
        QTAILQ_FOREACH(other, &memory_listeners, link) {
            if (listener->priority < other->priority) {
                break;
            }
        }
        QTAILQ_INSERT_BEFORE(other, listener, link);
    }

    /* Replay the current topology of both address spaces to the new listener. */
    listener_add_address_space(listener, &address_space_memory);
    listener_add_address_space(listener, &address_space_io);
}
static MemoryListenerkvm_memory_listener = {
.begin = kvm_begin,
.commit = kvm_commit,
.region_add = kvm_region_add,
.region_del = kvm_region_del,
.region_nop = kvm_region_nop,
.log_start = kvm_log_start,
.log_stop = kvm_log_stop,
.log_sync = kvm_log_sync,
.log_global_start = kvm_log_global_start,
.log_global_stop = kvm_log_global_stop,
.eventfd_add = kvm_eventfd_add,
.eventfd_del = kvm_eventfd_del,
.priority = 10,
};
kvm_region_add==>kvm_set_phys_mem(section, true);
kvm_region_del==>kvm_set_phys_mem(section, false);
log_global_xxx用于动态迁移,本章暂不讨论。
kvm_eventfd_add,kvm_eventfd_del用于eventfd的管理
(2) System Memory初始化
在Qemu初始化时会 main(vl.c)==>cpu_exec_init_all(exec.c)==>memory_map_init(exec.c)
/*
 * Create the two root memory regions -- "system" (RAM/MMIO) and "io"
 * (port space) -- install them as the roots of the corresponding
 * address spaces, and register the core and I/O listeners on them.
 */
static void memory_map_init(void)
{
    system_memory = g_malloc(sizeof(*system_memory));
    memory_region_init(system_memory, "system", INT64_MAX);
    set_system_memory_map(system_memory);

    system_io = g_malloc(sizeof(*system_io));
    memory_region_init(system_io, "io", 65536);   /* 64K of port I/O space */
    set_system_io_map(system_io);

    memory_listener_register(&core_memory_listener, system_memory);
    memory_listener_register(&io_memory_listener, system_io);
}
qemu中系统内存用system_memory来管理, io内存用system_io来管理, io内存管理将在第5章分析。static MemoryRegion *system_memory; MemoryRegion可以有子区域。而memory_listener负责处理内存区域的添加和移除。
set_system_memory_map(system_memory);用system_memory来初始化address_space_memory.
/* Install 'mr' as the root of the global system address space
 * (address_space_memory).  Wrapped in a transaction so listeners see a
 * single atomic topology change. */
void set_system_memory_map(MemoryRegion*mr)
{
memory_region_transaction_begin();
/* Swapping the root invalidates the flat view; commit rebuilds it. */
address_space_memory.root = mr;
memory_region_transaction_commit();
}
AddressSpace的定义如下:
/* An address space: a tree of MemoryRegions rooted at 'root', plus the
 * flattened (non-overlapping) view currently published to listeners. */
struct AddressSpace {
MemoryRegion *root;                 /* root of the region tree */
FlatView current_map;               /* last generated flat view */
int ioeventfd_nb;                   /* number of entries in ioeventfds[] */
MemoryRegionIoeventfd *ioeventfds;  /* ioeventfds active in this space */
};
(3) Memory Listener 管理
voidmemory_region_transaction_begin(void)
{
qemu_flush_coalesced_mmio_buffer();
++memory_region_transaction_depth;
}
qemu_flush_coalesced_mmio_buffer==>kvm_flush_coalesced_mmio_buffer(kvm_all.c)
voidmemory_region_transaction_commit(void)
{
--memory_region_transaction_depth;
if (!memory_region_transaction_depth) {
MEMORY_LISTENER_CALL_GLOBAL(begin,Forward);
if (address_space_memory.root) {
address_space_update_topology(&address_space_memory);
}
if (address_space_io.root) {
address_space_update_topology(&address_space_io);
}
MEMORY_LISTENER_CALL_GLOBAL(commit,Forward);
}
}
/* Rebuild the flat view of 'as' from its region tree and notify
 * listeners of the differences against the old view. */
static void address_space_update_topology(AddressSpace*as)
{
FlatView old_view = as->current_map;
/* Flatten the (possibly overlapping) region tree into disjoint ranges. */
FlatView new_view =generate_memory_topology(as->root);
/* Two diff passes: removals first (adding=false), then additions. */
address_space_update_topology_pass(as,old_view, new_view, false);
address_space_update_topology_pass(as, old_view,new_view, true);
as->current_map = new_view;
flatview_destroy(&old_view);
address_space_update_ioeventfds(as);
}
address_space_update_topology_pass==》 MEMORY_LISTENER_UPDATE_REGION
/*
 * Build a transient MemoryRegionSection describing flat range 'fr'
 * inside address space 'as' and invoke 'callback' (region_add,
 * region_del, log_start, ...) on each matching listener, walking the
 * listener list in direction 'dir' (Forward/Reverse).
 */
#define MEMORY_LISTENER_UPDATE_REGION(fr, as, dir, callback)             \
    MEMORY_LISTENER_CALL(callback, dir, (&(MemoryRegionSection) {       \
        .mr = (fr)->mr,                                                  \
        .address_space = (as)->root,                                     \
        .offset_within_region = (fr)->offset_in_region,                  \
        .size = int128_get64((fr)->addr.size),                           \
        .offset_within_address_space = int128_get64((fr)->addr.start),   \
        .readonly = (fr)->readonly,                                      \
    }))
MEMORY_LISTENER_CALL会从前到后或从后到前遍历memory_listeners,并调用相应方法如region_add, region_del等。调用region_add的示例如下:
MEMORY_LISTENER_UPDATE_REGION(frnew,as, Forward, region_add);
3.2.2 PC内存管理流程分析
(1)RAM初始化
pc_init1(hw\pc_piix.c)==》pc_memory_init(hw\pc.c)。内存被分为两段: 0 ~ 0xE000_0000, 以及 0xE000_0000 以上。
{ 。。。。。。
/* One big RAM block backs all of guest memory (below- and above-4G). */
MemoryRegion * ram = g_malloc(sizeof(*ram));
//分配整个内存区域
/* Allocate the whole RAM region (host memory via qemu_ram_alloc). */
memory_region_init_ram(ram,"pc.ram",
below_4g_mem_size +above_4g_mem_size);
vmstate_register_ram_global(ram);
*ram_memory = ram;
/* Alias the low part of pc.ram and map it at GPA 0. */
ram_below_4g =g_malloc(sizeof(*ram_below_4g));
memory_region_init_alias(ram_below_4g,"ram-below-4g", ram,
0, below_4g_mem_size);
memory_region_add_subregion(system_memory,0, ram_below_4g);
if (above_4g_mem_size > 0) {
/* Alias the remainder and map it above the 4G boundary. */
ram_above_4g =g_malloc(sizeof(*ram_above_4g));
memory_region_init_alias(ram_above_4g,"ram-above-4g", ram,
below_4g_mem_size, above_4g_mem_size);
memory_region_add_subregion(system_memory, 0x100000000ULL,
ram_above_4g);
}
。。。。。。
}
/*
 * Initialize 'mr' as a RAM-backed region: allocate 'size' bytes of host
 * memory (the HVA) and record its ram_addr_t handle in the region.
 */
void memory_region_init_ram(MemoryRegion *mr,
                            const char *name,
                            uint64_t size)
{
    memory_region_init(mr, name, size);
    mr->ram = true;
    mr->terminates = true;   /* leaf region: actually backs guest memory */
    mr->destructor = memory_region_destructor_ram;
    mr->ram_addr = qemu_ram_alloc(size, mr);   /* allocates the HVA */
}
mr->ram_addr =qemu_ram_alloc(size, mr); 分配HVA
qemu_ram_alloc==》qemu_ram_alloc_from_ptr
a) 向ram_list 加入一个RAMBlock 结构;同时扩大ram_list.phys_dirty用于记录脏页
b) ==》kvm_vmalloc==》qemu_vmalloc
qemu_vmalloc调用操作系统虚拟内存分配接口函数。
/* Initialize 'mr' as an alias: a window of 'size' bytes into region
 * 'orig', starting at 'offset' within it.  Aliases own no memory of
 * their own. */
void memory_region_init_alias(MemoryRegion*mr,
const char *name,
MemoryRegion*orig,
target_phys_addr_t offset,
uint64_t size)
{
memory_region_init(mr, name, size);
mr->alias = orig;           /* the region being aliased */
mr->alias_offset = offset;  /* start of the window within 'orig' */
}
memory_region_init_alias(ram_below_4g,"ram-below-4g", ram,0, below_4g_mem_size);
ram_below_4g->alias = ram;ram_below_4g->offset= 0;
memory_region_add_subregion(system_memory,0, ram_below_4g);
//将ram_below_4g加入到system_memory的subregion中去
memory_region_add_subregion==>memory_region_add_subregion_common
/*
 * Attach 'subregion' to container 'mr' at guest-physical 'offset'.
 * NOTE(review): this excerpt was garbled in the original notes (the
 * may_overlap test had fallen outside its loop and the braces did not
 * match); reconstructed to follow QEMU's memory.c -- verify against the
 * exact source tree these notes describe.
 */
static void memory_region_add_subregion_common(MemoryRegion *mr,
                                               target_phys_addr_t offset,
                                               MemoryRegion *subregion)
{
    MemoryRegion *other;

    memory_region_transaction_begin();

    assert(!subregion->parent);   /* a region may have only one parent */
    subregion->parent = mr;
    subregion->addr = offset;

    /* Scan siblings for unintended overlaps (warning code elided in notes). */
    QTAILQ_FOREACH(other, &mr->subregions, subregions_link) {
        if (subregion->may_overlap || other->may_overlap) {
            continue;   /* overlap explicitly allowed for this pair */
        }
        /* ...... overlap-detection / warning code elided ...... */
    }

    /* Keep the subregion list sorted by descending priority. */
    QTAILQ_FOREACH(other, &mr->subregions, subregions_link) {
        if (subregion->priority >= other->priority) {
            QTAILQ_INSERT_BEFORE(other, subregion, subregions_link);
            goto done;
        }
    }
    QTAILQ_INSERT_TAIL(&mr->subregions, subregion, subregions_link);
done:
    memory_region_transaction_commit();
}
由于此时core_memory_listener, kvm_memory_listener都已注册, memory_region_transaction_commit()将触发它们的region_add被调用。
(2) rom区域
pc_init1:
/* PCI memory space is a separate (huge) container; ROMs map into it. */
pci_memory = g_new(MemoryRegion, 1);
memory_region_init(pci_memory,"pci", INT64_MAX);
rom_memory = pci_memory;
pc_memory_init==> pc_system_firmware_init==》old_pc_system_rom_init
bios rom区域的建立:
/* Allocate the BIOS image as read-only RAM. */
memory_region_init_ram(bios,"pc.bios", bios_size);
vmstate_register_ram_global(bios);
memory_region_set_readonly(bios, true);
/* Alias the tail of the BIOS into the legacy ISA hole below 1MB;
 * priority 1 lets it shadow whatever else maps there. */
isa_bios = g_malloc(sizeof(*isa_bios));
memory_region_init_alias(isa_bios,"isa-bios", bios,
bios_size -isa_bios_size, isa_bios_size);
memory_region_add_subregion_overlap(rom_memory,
0x100000 - isa_bios_size,
isa_bios,
1);
memory_region_set_readonly(isa_bios, true);
/* map all the bios at the top of memory */
memory_region_add_subregion(rom_memory,
(uint32_t)(-bios_size),
bios);
(3) Ram RW VM-Exit处理
kvm_cpu_exec==》case KVM_EXIT_MMIO ==>cpu_physical_memory_rw 下面的示例为ram区域的写:
/* RAM write path inside cpu_physical_memory_rw: */
ram_addr_t addr1;
/* Translate the section-relative guest address into a ram_addr_t. */
addr1 =memory_region_get_ram_addr(section->mr)
+memory_region_section_addr(section, addr);
/* RAM case */
ptr = qemu_get_ram_ptr(addr1);   /* ram_addr_t -> host pointer (HVA) */
memcpy(ptr, buf, l);             /* perform the guest's write */
invalidate_and_set_dirty(addr1,l);
qemu_put_ram_ptr(ptr);
/*
 * After a guest write hits RAM: if the page was clean, discard any
 * translated code derived from it, then mark it dirty with every flag
 * except CODE_DIRTY_FLAG.
 */
static void invalidate_and_set_dirty(target_phys_addr_t addr,
                                     target_phys_addr_t length)
{
    if (!cpu_physical_memory_is_dirty(addr)) {
        /* invalidate code */
        tb_invalidate_phys_page_range(addr, addr + length, 0);
        /* set dirty bit */
        cpu_physical_memory_set_dirty_flags(addr, (0xff & ~CODE_DIRTY_FLAG));
    }
}
将页标记为脏
/*
 * OR 'dirty_flags' into the page's dirty byte in ram_list.phys_dirty.
 * The migration dirty-page counter is bumped only on the clean->dirty
 * transition for MIGRATION_DIRTY_FLAG.  Returns the updated byte.
 */
static inline int cpu_physical_memory_set_dirty_flags(ram_addr_t addr,
                                                      int dirty_flags)
{
    if ((dirty_flags & MIGRATION_DIRTY_FLAG) &&
        !cpu_physical_memory_get_dirty(addr, TARGET_PAGE_SIZE,
                                       MIGRATION_DIRTY_FLAG)) {
        ram_list.dirty_pages++;
    }
    return ram_list.phys_dirty[addr >> TARGET_PAGE_BITS] |= dirty_flags;
}
(4) 其他
memory_region_transaction_commit==》address_space_update_topology==》address_space_update_topology_pass ==》
if (adding) {
/* Range unchanged: let listeners know it survived the update. */
MEMORY_LISTENER_UPDATE_REGION(frnew, as,Forward, region_nop);
/* Dirty logging was switched off for this range. */
if (frold->dirty_log_mask&& !frnew->dirty_log_mask) {
MEMORY_LISTENER_UPDATE_REGION(frnew, as, Reverse, log_stop);
/* Dirty logging was switched on for this range. */
} else if (frnew->dirty_log_mask&& !frold->dirty_log_mask) {
MEMORY_LISTENER_UPDATE_REGION(frnew, as, Forward, log_start);
}
}
当添加/移除或更新内存时会调用memory_region_transaction_commit, 此时如果更新前后区域相同, 则对原区域调用log_stop, 新区域调用log_start。log目前用于vga虚拟驱动。
3.2.3 Qemu到KVM内存管理接口分析
kvm_set_phys_mem用于设置内存, 该函数流程如下:
(1) start_addr = section->offset_within_address_space
ram_addr_t size = section->size;
根据物理起始地址和长度,在kvm_state中搜索已建立的KVMSlot *mem区域
/* QEMU-side record of one kernel memory slot (GPA -> HVA mapping). */
typedef struct KVMSlot
{
target_phys_addr_t start_addr;  /* guest physical start address */
ram_addr_t memory_size;         /* length in bytes; 0 = slot unused */
void *ram;                      /* host virtual address (HVA) backing */
int slot;                       /* kernel slot id */
int flags;                      /* e.g. KVM_MEM_LOG_DIRTY_PAGES */
} KVMSlot;
(2) 如果没找到,则退出循环并建立一个slot; add 为false时直接退出
mem = kvm_alloc_slot(s);
mem->memory_size = size;
mem->start_addr = start_addr;
mem->ram = ram;
mem->flags = kvm_mem_flags(s,log_dirty);
然后调用 err = kvm_set_user_memory_region(s, mem); 通知内核态建立内存区域
/*
 * Push one slot down to the kernel via the KVM_SET_USER_MEMORY_REGION
 * ioctl (establishes the GPA -> HVA mapping; a memory_size of 0 deletes
 * the slot).  Returns the ioctl result.
 */
static int kvm_set_user_memory_region(KVMState *s, KVMSlot *slot)
{
    struct kvm_userspace_memory_region mem;

    mem.slot = slot->slot;
    mem.guest_phys_addr = slot->start_addr;
    mem.memory_size = slot->memory_size;
    mem.userspace_addr = (unsigned long)slot->ram;   /* HVA */
    mem.flags = slot->flags;
    if (s->migration_log) {
        /* Migration in progress: force dirty logging on every slot. */
        mem.flags |= KVM_MEM_LOG_DIRTY_PAGES;
    }
    return kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
}
(3) 如果找到,且区域完全重合则调用 并且add==true
kvm_slot_dirty_pages_log_change(mem,log_dirty);并返回 其中
log_dirty =memory_region_is_logging(mr); //return mr->dirty_log_mask;
if (mem->flags &KVM_MEM_LOG_DIRTY_PAGES) {
kvm_physical_sync_dirty_bitmap(section);
}
当kvm_log_global_start时KVM_MEM_LOG_DIRTY_PAGES flag会被设置
(4) 如果找到,但不完全重合
a. 取消slot区域
old = *mem;
/* unregister the overlapping slot */
mem->memory_size = 0;
err = kvm_set_user_memory_region(s,mem);
b.将新建两个区域
slot->StartAddr to mr->startaddr
mr->start_addr to (slot->startadd + slot->memory_size)
3.2.4 KVM内存虚拟化框架
(1) memslots
kvm_vm_ioctl ==> kvm_vm_ioctl_set_memory_region ==> __kvm_set_memory_region
内核态也维护了一个slots, struct kvm->memslots,其定义如下:
/* Kernel-side container of all memory slots for one VM. */
struct kvm_memslots {
u64 generation;   /* bumped on every update; used to detect stale lookups */
struct kvm_memory_slot memslots[KVM_MEM_SLOTS_NUM];
/* The mapping table from slot id to the index in memslots[]. */
short id_to_index[KVM_MEM_SLOTS_NUM];
};
/* One kernel memory slot: maps a range of guest physical frames onto
 * host userspace virtual memory. */
struct kvm_memory_slot {
gfn_t base_gfn; /* first guest physical frame number of the slot */
unsigned long npages; /* slot length in pages */
unsigned long *dirty_bitmap; /* one bit per page when dirty logging is on */
struct kvm_arch_memory_slot arch;
unsigned long userspace_addr; /* HVA backing the slot (set from QEMU's
                               * slot->ram; the original note calling this
                               * a "guest virtual address" was incorrect) */
u32 flags;
short id; /* matches the user-space slot id (see id_to_index) */
};
内核态slot与用户空间的slot id是一一对应管理的
slot =id_to_memslot(kvm->memslots, mem->slot); //根据用户态slot号得到内核slot结构
__kvm_set_memory_region流程如下:
a. 根据用户态slot号得到内核slot结构
b.根据slot中的值和要设置的值,决定要操作的类别:
/* Kind of slot update requested by __kvm_set_memory_region, derived by
 * comparing the existing slot with the requested one. */
enum kvm_mr_change {
KVM_MR_CREATE,     /* new slot */
KVM_MR_DELETE,     /* remove existing slot (size 0) */
KVM_MR_MOVE,       /* same slot id, different GPA range */
KVM_MR_FLAGS_ONLY, /* only flags (e.g. dirty logging) changed */
};
c. 根据b中的动作进行操作
i . KVM_MR_CREATE: kvm_arch_create_memslot
X86 arch层的memslot: 该结构按不同页大小(含大页)分级记录gpa的映射信息
/* x86 per-slot MMU metadata, kept separately for each supported page
 * size (4K/2M/1G). */
struct kvm_arch_memory_slot {
unsigned long *rmap[KVM_NR_PAGE_SIZES]; /* reverse map: gfn -> sptes */
struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; /* per-large-page
        info (the notes describe it as tracking page write counts) */
};
ii KVM_MR_DELETE OR KVM_MR_MOVE:
1. 将原slot标记为无效
slot->flags|= KVM_MEMSLOT_INVALID;
old_memslots = install_new_memslots(kvm, slots, NULL);
kvm_iommu_unmap_pages(kvm, &old);
kvm_arch_flush_shadow_memslot 刷新影子页表3.4节分析
2. 安装新slot,对于delete而言会将新slot清零memset(&new.arch, 0, sizeof(new.arch));
iii r = kvm_arch_prepare_memory_region(kvm,&new, mem, change);
通过vm_mmap调用为hva分配空间
iv 删除要取消映射的区域
install_new_memslots(kvm, slots,&new);
kvm_arch_commit_memory_region(kvm,mem, &old, change); //vm_unmap
kvm_free_physmem_slot(kvm,&old, &new);
kfree(old_memslots);
v KVM_MR_CREATE OR KVM_MR_MOVE:
kvm_iommu_map_pages(kvm, &new);// 在第7章分析