6.3.1 virtio_balloon Guest OS 端
(1) balloon 原理
balloon可以让Guest os在运行时动态调整它所占用的宿主机内存资源. 其工作原理如下:
1. VMM(即KVM)发送请求到客户机操作系统让其归还一定数量的内存给hypervisor。
2.GUEST OS中的virtio_balloon驱动接收到hypervisor的请求。
3. virtio_balloon驱动使客户机的内存气球膨胀,气球中的内存就不能被客户机访问。如果此时客户机中内存剩余量不多(如某应用程序绑定/申请了大量的内存),不能让内存气球膨胀到足以满足VMM的请求,那么virtio_balloon驱动也会尽可能多地提供内存使气球膨胀,尽量去满足VMM请求的内存数量(即使不一定能完全满足)。
4. GUEST OS归还气球中的内存给VMM。
5. VMM可以将从气球中得来的内存分配到任何需要的地方。
6. 如果从气球中得来的内存没有处于使用中,VMM也可以将内存返还给Guest OS,这个过程为:a. VMM发请求到客户机的virtio_balloon驱动;b. 这个请求让GUEST OS压缩内存气球;c. 气球中的内存被释放出来,重新让客户机可以访问和使用。
(2) Guest OS 初始化
下面先看看guest os 端的实现(drivers/virtio/virtio_balloon.c)
/*
 * virtio driver descriptor for the balloon device: it names the feature
 * table, the device-id table and the callbacks this driver provides.
 */
static struct virtio_driver virtio_balloon_driver = {
	.feature_table = features,
	.feature_table_size = ARRAY_SIZE(features),
	.driver.name = KBUILD_MODNAME,
	.driver.owner = THIS_MODULE,
	.id_table = id_table,
	.probe = virtballoon_probe,
	.remove = virtballoon_remove,
	/* Configuration-change callback. */
	.config_changed = virtballoon_changed,
#ifdef CONFIG_PM_SLEEP
	/* These two hooks are only compiled in when CONFIG_PM_SLEEP is set. */
	.freeze = virtballoon_freeze,
	.restore = virtballoon_restore,
#endif
};
virtballoon_probe
a. balloon_dev_info *vb_devinfo = balloon_devinfo_alloc(vb); 建立管理数据结构
b. 建立一个address_space
vb_mapping = balloon_mapping_alloc(vb_devinfo,
(balloon_compaction_check()) ?
&virtio_balloon_aops: NULL);
c. init_vqs建立virtio_queue最多三个
vb->inflate_vq = vqs[0]; //用于膨胀, callback为balloon_ack
vb->deflate_vq = vqs[1]; //用于收缩, callback为balloon_ack
vb->stats_vq = vqs[2]; //用于状态管理, callback为stats_request
d. 启动内核线程vb->thread= kthread_run(balloon, vb, "vballoon");
(3) balloon 的工作
/*
 * Main loop of the balloon kernel thread.
 *
 * Sleeps on vb->config_change until the balloon size differs from the
 * target, a stats update is pending, the thread is asked to stop, or the
 * task is freezing. It then services the stats request (if any) and grows
 * or shrinks the balloon by the outstanding page delta.
 *
 * @_vballoon: the struct virtio_balloon this thread serves.
 *
 * Returns 0 once kthread_should_stop() reports true.
 */
static int balloon(void *_vballoon)
{
	struct virtio_balloon *vb = _vballoon;

	set_freezable();
	while (!kthread_should_stop()) {
		s64 delta;

		try_to_freeze();
		/* The wake-up condition also records the current page delta. */
		wait_event_interruptible(vb->config_change,
					 (delta = towards_target(vb)) != 0
					 || vb->need_stats_update
					 || kthread_should_stop()
					 || freezing(current));

		if (vb->need_stats_update) {
			stats_handle_request(vb);
		}

		if (delta > 0) {
			/* Target is above the current size: inflate. */
			fill_balloon(vb, delta);
		} else if (delta < 0) {
			/* Target is below the current size: deflate. */
			leak_balloon(vb, -delta);
		}

		update_balloon_size(vb);
		cond_resched();
	}
	return 0;
}
当diff = towards_target(vb)且diff != 0时,线程将被唤醒,根据diff的正负来调用
fill_balloon或leak_balloon.
/*
 * Returns the number of pages the balloon still has to change by: the
 * target read from the device configuration minus the current
 * vb->num_pages. balloon() inflates on a positive result and deflates on
 * a negative one.
 */
static inline s64 towards_target(struct virtio_balloon *vb)
{
/* (the declarations of v and target are elided in this excerpt) */
......
/* Read the num_pages field of struct virtio_balloon_config into v. */
virtio_cread(vb->vdev,struct virtio_balloon_config, num_pages, &v);
/* The value is stored as little-endian 32-bit; convert to CPU byte order. */
target =le32_to_cpu(v);
return target -vb->num_pages;
}
virtio_cread读取的是配置信息,VMM能够更改该信息;所以当VMM更改后,该线程会被唤醒。
/*
 * Inflate the balloon by up to @num pages.
 *
 * Pages are taken from the guest one at a time, their frame numbers are
 * collected in vb->pfns, and the resulting batch is reported to the host
 * on the inflate virtqueue. All of this happens under vb->balloon_lock.
 *
 * @vb:  balloon device state.
 * @num: requested number of pages; clamped to the capacity of vb->pfns.
 */
static void fill_balloon(struct virtio_balloon *vb, size_t num)
{
	struct balloon_dev_info *info = vb->vb_dev_info;

	/* One call handles at most one vb->pfns array worth of entries. */
	num = min(num, ARRAY_SIZE(vb->pfns));

	mutex_lock(&vb->balloon_lock);

	vb->num_pfns = 0;
	while (vb->num_pfns < num) {
		/* Obtained through alloc_page: this takes memory from the guest. */
		struct page *page = balloon_page_enqueue(info);

		if (!page) {
			msleep(200);
			break;
		}

		/* Store the guest physical frame numbers of this page in vb. */
		set_page_pfns(vb->pfns + vb->num_pfns, page);
		vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE;
		adjust_managed_page_count(page, -1);

		vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE;
	}

	if (vb->num_pfns != 0) {
		/* Tell the VMM which pages the guest OS may no longer access. */
		tell_host(vb, vb->inflate_vq);
	}

	mutex_unlock(&vb->balloon_lock);
}
static void tell_host(struct virtio_balloon *vb, structvirtqueue *vq)
{
。。。。。。
sg_init_one(&sg,vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns);
virtqueue_add_outbuf(vq,&sg, 1, vb, GFP_KERNEL);
virtqueue_kick(vq);
wait_event(vb->acked,virtqueue_get_buf(vq, &len)); //等待完成
}
host vmm处理完后balloon_ack回调被执行
balloon_ack ==》 wake_up(&vb->acked);
tell_host函数就从wait_event返回.
leak_balloon与fill_balloon相反
/*
 * Deflate the balloon by up to num pages (the reverse of fill_balloon):
 * dequeue pages from the balloon, report their frame numbers to the host
 * on the deflate virtqueue, then call release_pages_by_pfn() on them.
 */
static void leak_balloon(struct virtio_balloon *vb, size_t num)
{
/* (local declarations such as page and vb_dev_info are elided in this excerpt) */
。。。。。。。
/* One call handles at most one vb->pfns array worth of entries. */
num = min(num, ARRAY_SIZE(vb->pfns));
mutex_lock(&vb->balloon_lock);
for (vb->num_pfns =0; vb->num_pfns < num;
vb->num_pfns +=VIRTIO_BALLOON_PAGES_PER_PAGE) {
page =balloon_page_dequeue(vb_dev_info);
/* Stop early when no page could be dequeued. */
if (!page)
break;
set_page_pfns(vb->pfns+ vb->num_pfns, page);
vb->num_pages -=VIRTIO_BALLOON_PAGES_PER_PAGE;
}
/* Only notify the host when at least one frame number was collected. */
if (vb->num_pfns !=0)
tell_host(vb,vb->deflate_vq);
mutex_unlock(&vb->balloon_lock);
/* The pages are released only after the host has been told, outside the lock. */
release_pages_by_pfn(vb->pfns,vb->num_pfns);
}
6.3.2 virtio_balloon VM Host
qemu: virtio-pci.c
/*
 * Type description for the "virtio-balloon-pci" device: a child of
 * TYPE_PCI_DEVICE whose instances are VirtIOPCIProxy-sized and whose
 * class is set up by virtio_balloon_class_init.
 */
static TypeInfo virtio_balloon_info = {
    .name          = "virtio-balloon-pci",
    .parent        = TYPE_PCI_DEVICE,
    .class_init    = virtio_balloon_class_init,
    .instance_size = sizeof(VirtIOPCIProxy),
};
virtio_balloon_init_pci ==> virtio_balloon_init(virtio-balloon.c)
{
s->vdev.get_config= virtio_balloon_get_config;
s->vdev.set_config= virtio_balloon_set_config;
s->vdev.get_features = virtio_balloon_get_features;
ret =qemu_add_balloon_handler(virtio_balloon_to_target,
virtio_balloon_stat, s);
s->ivq =virtio_add_queue(&s->vdev, 128, virtio_balloon_handle_output);
s->dvq =virtio_add_queue(&s->vdev, 128, virtio_balloon_handle_output);
s->svq =virtio_add_queue(&s->vdev, 128, virtio_balloon_receive_stats);
}
(1) virtio_balloon_set_config
该函数会引起balloon的change virtio_balloon_set_config==> qemu_balloon_changed
当变化发生后,内核态线程被唤醒,做相应处理,并提交queue.
这样virtio_balloon_handle_output会被调用
/*
 * Virtqueue handler registered for both the inflate and the deflate queue.
 *
 * For every buffer the guest submitted, walk its list of 4-byte page frame
 * numbers, translate each one to a host pointer and pass it to
 * balloon_page(); then return the buffer to the guest and notify it.
 *
 * @vdev: the virtio device the queue belongs to.
 * @vq:   the queue that was kicked; compared against s->dvq to tell
 *        deflate requests from inflate requests.
 */
static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq)
{
    /* ... declarations of s, elem and section are elided in this excerpt ... */

    while (virtqueue_pop(vq, &elem)) {
        size_t offset = 0;
        uint32_t pfn;

        /* Consume the buffer 4 bytes (one frame number) at a time. */
        while (iov_to_buf(elem.out_sg, elem.out_num, offset, &pfn, 4) == 4) {
            ram_addr_t pa;
            ram_addr_t addr;

            /* Frame number -> guest physical address. */
            pa = (ram_addr_t)ldl_p(&pfn) << VIRTIO_BALLOON_PFN_SHIFT;
            offset += 4;

            section = memory_region_find(get_system_memory(), pa, 1);
            /* Skip addresses that do not resolve to a RAM region. */
            if (!section.size || !memory_region_is_ram(section.mr))
                continue;

            addr = section.offset_within_region;
            balloon_page(memory_region_get_ram_ptr(section.mr) + addr,
                         !!(vq == s->dvq));
        }

        virtqueue_push(vq, &elem, offset);
        virtio_notify(vdev, vq);
    }
}
balloon_page用来处理这些被deflate或inflate的页
/*
 * Give the host kernel madvise-style advice about one ballooned page.
 *
 * @addr:    host virtual address of the page.
 * @deflate: non-zero when the page is being handed back to the guest
 *           (advice WILLNEED); zero when it is being ballooned out
 *           (advice DONTNEED).
 *
 * Only does anything on Linux hosts, and is skipped when KVM is enabled
 * but kvm_has_sync_mmu() reports false.
 */
static void balloon_page(void *addr, int deflate)
{
#if defined(__linux__)
    int advice;

    if (kvm_enabled() && !kvm_has_sync_mmu()) {
        return;
    }

    if (deflate) {
        advice = QEMU_MADV_WILLNEED;
    } else {
        advice = QEMU_MADV_DONTNEED;
    }
    qemu_madvise(addr, TARGET_PAGE_SIZE, advice);
#endif
}
/* QEMU's advice names are defined directly as the host's MADV_* constants. */
#define QEMU_MADV_WILLNEED MADV_WILLNEED
#define QEMU_MADV_DONTNEED MADV_DONTNEED
qemu_madvise ==> madvise
madvise() 函数建议内核,在从 addr 指定的地址开始,长度等于 len 参数值的范围内,该区域的用户虚拟内存应遵循特定的使用模式。内核使用这些信息优化与指定范围关联的资源的处理和维护过程。
MADV_WILLNEED
Specifies that the application expects to access the specified range in the near future.
MADV_DONTNEED
Specifies that the application expects that it will not access the specified range in the near future.