7.2.1 qemu pci-assign模块
虚拟机上的设备是由qemu创建出来的,对于直接io也是如此。 区别在于直接io时,qemu直接调用vm host上的硬件设备完成相应功能;而不需要更多的软件处理。
static const TypeInfoassign_info = { (pci-assign.c)
.name = "kvm-pci-assign",
.parent = TYPE_PCI_DEVICE,
.instance_size = sizeof(AssignedDevice),
.class_init = assign_class_init,
};
(1) 初始化
static int assigned_initfn(struct PCIDevice *pci_dev)
{
AssignedDevice *dev =DO_UPCAST(AssignedDevice, dev, pci_dev);
//对config空间的虚拟寄存做初始化, 将寄存器的值存在软件变量dev 的emulate_config_read 和emulate_config_write中
assigned_dev_emulate_config_read(dev, 0,PCI_CONFIG_SPACE_SIZE);
assigned_dev_direct_config_read(dev, PCI_STATUS,2);
。。。。。。。。。。
//和真实的pci设备关联, 由于启动时会输入pci bus,device,func号,所以依据这些信息能得到pci device对应在vm host上的设备文件
get_real_device(dev, dev->host.domain,dev->host.bus,
dev->host.slot,dev->host.function)
assigned_device_pci_cap_init(pci_dev)为pci_dev添加capability
//增加misx的mmio处理回调assigned_dev_msix_mmio_ops
assigned_dev_register_msix_mmio(dev);
//为pci device的memory空间建立mmap
assigned_dev_register_regions(dev->real_device.regions,
dev->real_device.region_number, dev)};
r = assign_device(dev); //调用kvm的KVM_ASSIGN_PCI_DEVICE,
r = assign_intx(dev);//调用kvm 的KVM_ASSIGN_DEV_IRQ,管理中断
....
}
下面分析其中的关键函数:
get_real_device ==》
a. snprintf(dir, sizeof(dir),
"/sys/bus/pci/devices/%04x:%02x:%02x.%x/",r_seg, r_bus, r_dev, r_func);
dev->config_fd = open(name, O_RDWR); //打开真实设备的config,并读出内容
read(dev->config_fd,pci_dev->dev.config, pci_config_size(&pci_dev->dev));
但对bar地址做特殊处理
memset(&pci_dev->dev.config[PCI_BASE_ADDRESS_0], 0, 24);
memset(&pci_dev->dev.config[PCI_ROM_ADDRESS],0, 4);
b. 记录mmio信息到PCIRegion *rp;结构
snprintf(name, sizeof(name),"%sresource", dir);
f = fopen(name, "r");
对每个bar做:
fscanf(f, "%" SCNi64 "%" SCNi64 " %" SCNi64 "\n",&start, &end,&flags) ;
rp = dev->regions + r; rp->valid = 0;
rp->resource_fd = -1;
size = end - start + 1;
snprintf(name, sizeof(name),"%sresource%d", dir, r);
fd = open(name, O_RDWR);
rp->resource_fd = fd;
rp->type = flags; rp->valid = 1; rp->base_addr = start; rp->size = size;
pci_dev->v_addrs[r].region = rp;
assigned_dev_register_regions==》
a. pci_dev->v_addrs[i].u.r_virtbase = mmap(NULL,cur_region->size,
PROT_WRITE |PROT_READ, MAP_SHARED,
cur_region->resource_fd, (off_t)0);
b.分为mmio和pio的case 分开处理(下面仅分析mmio)关联mmio gpa到真实设备的hva:
若mmio size < 0x1000(没有到一个内存page大小)
则 memory_region_init_io(&pci_dev->v_addrs[i].real_iomem,
&slow_bar_ops,&pci_dev->v_addrs[i],
"assigned-dev-slow-bar", cur_region->size);
否则用:void *virtbase = pci_dev->v_addrs[i].u.r_virtbase;
memory_region_init_ram_ptr(&pci_dev->v_addrs[i].real_iomem,
name, cur_region->size,virtbase);
//当EPT建立好后,guest os访问gpa时就直接访问真实设备了,不会有vm-exit发生
c. assigned_dev_iomem_setup(&pci_dev->dev, i, cur_region->size);
pci_register_bar((PCIDevice *)pci_dev, i, t,
&pci_dev->v_addrs[i].container);
assign_device ==> kvm_device_pci_assign ==>
kvm_vm_ioctl(s, KVM_ASSIGN_PCI_DEVICE, &dev_data); (注意assign时同时设置了host 与guest)
assign_intx ==>
a. intx_route = pci_device_route_intx_to_irq(&dev->dev,dev->intpin); ==》
pci_device_route_intx_to_irq(call piix3_route_intx_pin_to_irq)得到当前dev的irq信息
b. deassign当前irq ==》kvm_vm_ioctl(s, KVM_DEASSIGN_DEV_IRQ,&assigned_irq);
c.重新assign当前设置kvm_device_intx_assign(kvm_state,dev->dev_id, intx_host_msi,
intx_route.irq);==》
/*
 * Assign an IRQ to a KVM-assigned device via the VM ioctl interface.
 *
 * dev_id    - id of the assigned device (set up by KVM_ASSIGN_PCI_DEVICE)
 * irq_type  - KVM_DEV_IRQ_* flags selecting host/guest INTx/MSI/MSI-X
 * guest_irq - interrupt number as seen by the guest OS
 *
 * Returns the ioctl result (0 on success, negative errno on failure).
 */
static int kvm_assign_irq_internal(KVMState *s, uint32_t dev_id,
                                   uint32_t irq_type, uint32_t guest_irq)
{
    struct kvm_assigned_irq assigned_irq = {
        .assigned_dev_id = dev_id,
        .guest_irq = guest_irq,
        .flags = irq_type,
    };

    /* Prefer the newer KVM_ASSIGN_DEV_IRQ ioctl when the kernel advertises
     * KVM_CAP_ASSIGN_DEV_IRQ; fall back to the legacy KVM_ASSIGN_IRQ. */
    if (kvm_check_extension(s, KVM_CAP_ASSIGN_DEV_IRQ)) {
        return kvm_vm_ioctl(s, KVM_ASSIGN_DEV_IRQ, &assigned_irq);
    } else {
        return kvm_vm_ioctl(s, KVM_ASSIGN_IRQ, &assigned_irq);
    }
}
kvm_vm_ioctl(s, KVM_ASSIGN_DEV_IRQ,&assigned_irq);
(2) Reset
reset_assigned_device:
a. 对于msix设备调用assigned_dev_update_msix(pci_dev);
b. 真实设备reset
snprintf(reset_file, sizeof(reset_file), "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/reset",
adev->host.domain,adev->host.bus, adev->host.slot, adev->host.function);
fd = open(reset_file, O_WRONLY);
ret = write(fd, reset, strlen(reset));
c. assigned_dev_pci_write_config(pci_dev,PCI_COMMAND, 0, 1);
assigned_dev_update_msi==>分msix和intx的case
msix case: 1. virq =kvm_irqchip_add_msi_route(kvm_state, msg);
2. kvm_device_msi_assign(kvm_state,assigned_dev->dev_id, virq);
最终调用kvm_vm_ioctl(s, KVM_ASSIGN_DEV_IRQ,&assigned_irq);
intx case: assign_intx(assigned_dev);
int kvm_irqchip_add_msi_route(KVMState *s, MSIMessage msg)
{
......
virq = kvm_irqchip_get_virq(s); //软件分配一个空闲irq号
kroute.gsi = virq;
kroute.type = KVM_IRQ_ROUTING_MSI;
kroute.flags = 0;
kroute.u.msi.address_lo =(uint32_t)msg.address;
kroute.u.msi.address_hi = msg.address>> 32;
kroute.u.msi.data = msg.data;
// 调用kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING,s->irq_routes);
kvm_add_routing_entry(s, &kroute);
return virq;
7.2.2 kvm pci assign
源码位于virt/kvm/assigned-dev.c:
kvm_vm_ioctl_assigned_device==> case KVM_ASSIGN_PCI_DEVICE ==>
kvm_vm_ioctl_assign_device==>
a) kvm_find_assigned_dev查看dev是否assigned,若已assigned,直接返回
b) pci_get_domain_bus_and_slot根据设备地址得到该设备的pci_device
c) probe_sysfs_permissions打开设备sysfs的访问权限,这样qemu能访问
d) pcidevice的相关初始化
pci_enable_device(dev);
pci_request_regions(dev,"kvm_assigned_device");
pci_reset_function(dev);
pci_save_state(dev);
match->pci_saved_state =pci_store_saved_state(dev);
e) 加入设备到assignedlist list_add(&match->list, &kvm->arch.assigned_dev_head);
f) 若vm的iommu domain未建立则kvm_iommu_map_guest(kvm);
g) r =kvm_assign_device(kvm, match); 将设备关联到iommu
kvm_iommu_map_guest ==》
kvm->arch.iommu_domain= iommu_domain_alloc(&pci_bus_type);
kvm_iommu_map_memslots(kvm);
kvm_assign_device(virt/kvm/iommu.c)==>
a. r = iommu_attach_device(domain,&pdev->dev); //调用iommu关联设备
kvm_iommu_map_memslots ==》
slots =kvm_memslots(kvm);
kvm_for_each_memslot(memslot, slots) {
r = kvm_iommu_map_pages(kvm, memslot);
if (r)
break;
}
kvm_iommu_map_pages ==》
对slot中的每个gfn
a. iommu_iova_to_phys(domain,gfn_to_gpa(gfn)) 检查是否已建立iommu映射
b. iommu_map(domain, gfn_to_gpa(gfn),pfn_to_hpa(pfn), page_size, flags);建立映射
同时在qemu新增加memory映射时该函数也会被调用:
__kvm_set_memory_region==》
if((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
r = kvm_iommu_map_pages(kvm, &new);
return r;
}
7.2.3 kvm interrupt assign
(1) 中断assign
kvm_vm_ioctl_assigned_device==》 case KVM_ASSIGN_DEV_IRQ==>
kvm_vm_ioctl_assign_irq(kvm, &assigned_irq); ==>
if (host_irq_type)
r = assign_host_irq(kvm, match, host_irq_type);
if (guest_irq_type)
r = assign_guest_irq(kvm, match, assigned_irq,guest_irq_type);
assign_host_irq ==> 分为了intx, msi,和msix三种case,我们以下仅分析msix的case:
==> assigned_device_enable_host_msix==>
a. pci_enable_msix_exact(dev->dev, dev->host_msix_entries,dev->entries_nr);
b. request_threaded_irq(dev->host_msix_entries[i].vector,
kvm_assigned_dev_msix,
kvm_assigned_dev_thread_msix,
0,dev->irq_name, dev); //注册了中断处理函数
assign_guest_irq ==》分为了intx, msi,和msix三种case,我们以下仅分析msix的case:
a .id =kvm_request_irq_source_id(kvm); ==》
b. assigned_device_enable_guest_msix
assigned_device_enable_guest_msix(structkvm *kvm,
struct kvm_assigned_dev_kernel *dev,
struct kvm_assigned_irq *irq)
{
dev->guest_irq = irq->guest_irq;//guest_irq为guest os 的中断号
dev->ack_notifier.gsi = -1;
return 0;
}
(2) MSIX中断管理
对于misx的guest irq号由assigned_dev_update_msi ==》kvm_irqchip_add_msi_route分配
对应内核态为:
kvm_vm_ioctl ==》case KVM_SET_GSI_ROUTING ==> kvm_set_irq_routing (virt\irqchip.c) ==》
setup_routing_entry ==》 kvm_set_routing_entry (irq_comm.c) ==>
case KVM_IRQ_ROUTING_MSI
e->set = kvm_set_msi;//中断注入回调函数
e->msi.address_lo = ue->u.msi.address_lo;
e->msi.address_hi = ue->u.msi.address_hi;
e->msi.data = ue->u.msi.data;
int kvm_set_msi(structkvm_kernel_irq_routing_entry *e,
struct kvm *kvm, int irq_source_id, int level, boolline_status)
{
struct kvm_lapic_irq irq;
kvm_set_msi_irq(e, &irq);
return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL);
}
下面来看看host端中断处理:
kvm_assigned_dev_raise_guest_irq(structkvm_assigned_dev_kernel *assigned_dev,
int vector)
{
if (unlikely(assigned_dev->irq_requested_type &
KVM_DEV_IRQ_GUEST_INTX)) {
spin_lock(&assigned_dev->intx_mask_lock);
if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX))
kvm_set_irq(assigned_dev->kvm,
assigned_dev->irq_source_id, vector, 1,
false);
spin_unlock(&assigned_dev->intx_mask_lock);
} else
kvm_set_irq(assigned_dev->kvm,assigned_dev->irq_source_id,
vector, 1,false);
}
所以kvm_set_irq ==> kvm_set_msi
static irqreturn_tkvm_assigned_dev_msix(int irq, void *dev_id)
{
struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
int index = find_index_from_host_irq(assigned_dev, irq);
u32 vector;
int ret = 0;
if (index >= 0) {
vector = assigned_dev->guest_msix_entries[index].vector;
ret = kvm_set_irq_inatomic(assigned_dev->kvm,
assigned_dev->irq_source_id,
vector,1);
}
return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD :IRQ_HANDLED;
}
kvm_set_irq_inatomic==》kvm_set_msi_inatomic ==》 kvm_irq_delivery_to_apic_fast
当真实中断发生时,向guest assigned device注入中断
(3) intx中断管理
assign_host_irq ==》 request_threaded_irq(dev->host_irq,irq_handler,
kvm_assigned_dev_thread_intx, flags,
dev->irq_name, dev);
assign_guest_irq ==> caseintx
/*
 * Record the guest-side INTx configuration for an assigned device.
 * Unlike the MSI-X variant (which sets ack_notifier.gsi to -1), INTx
 * registers the guest irq as the ack-notifier GSI as well.
 */
static int assigned_device_enable_guest_intx(struct kvm *kvm,
                                             struct kvm_assigned_dev_kernel *dev,
                                             struct kvm_assigned_irq *irq)
{
    dev->guest_irq = irq->guest_irq;        /* interrupt number in the guest OS */
    dev->ack_notifier.gsi = irq->guest_irq; /* INTx: ack notifier on the same GSI */
    return 0;
}
kvm_assigned_dev_thread_intx==》kvm_assigned_dev_raise_guest_irq ==> kvm_set_irq
由此可知,kvm的中断虚拟化流程如下:
(1) 同时注册真实设备的中断处理函数
(2) 当中断发生时,根据真实设备中断号对应虚拟设备号,注入中断
如果系统采用了irq remap机制,则host的中断不会产生,直接在guest os上产生中断。
下一节将讨论iommu.
除pci-assign外,另一种直接io方法,为vfio. 它与pci-assign的区别在于,vfio更多的虚拟化实现放在了qemu用户空间中实现。 但其底层仍然会使用iommu;本文就不详细分析vfio了。 其源代码位于:
Qemu: hw\vfio_pci.c
Host driver:drivers/pci/vfio/