7.2 Qemu/KVM 直接IO框架


7.2.1 qemu pci-assign模块

虚拟机上的设备是由qemu创建出来的,对于直接io也是如此。 区别在于直接io时,qemu直接调用vm host上的硬件设备完成相应功能;而不需要更多的软件处理。

static const TypeInfoassign_info = { (pci-assign.c)

    .name               = "kvm-pci-assign",

    .parent             = TYPE_PCI_DEVICE,

    .instance_size      = sizeof(AssignedDevice),

    .class_init         = assign_class_init,

};

(1) 初始化

static intassigned_initfn(struct PCIDevice *pci_dev)

{

    AssignedDevice *dev =DO_UPCAST(AssignedDevice, dev, pci_dev);

   //对config空间的虚拟寄存器做初始化, 将寄存器的值存在软件变量dev 的emulate_config_read 和emulate_config_write中

    assigned_dev_emulate_config_read(dev, 0,PCI_CONFIG_SPACE_SIZE);

    assigned_dev_direct_config_read(dev, PCI_STATUS,2);

    。。。。。。。。。。

  //和真实的pci设备关联, 由于启动时会输入pci bus,device,func号,所以依据这些信息能得到pci device对应在vm host上的设备文件

  get_real_device(dev, dev->host.domain,dev->host.bus,

                        dev->host.slot,dev->host.function)

 

assigned_device_pci_cap_init(pci_dev)为pci_dev添加capability

//增加msix的mmio处理回调assigned_dev_msix_mmio_ops

assigned_dev_register_msix_mmio(dev);

//为pci device的memory空间建立mmap

assigned_dev_register_regions(dev->real_device.regions,

                         dev->real_device.region_number, dev)};

r = assign_device(dev); //调用kvm的KVM_ASSIGN_PCI_DEVICE,

 r = assign_intx(dev);//调用kvm 的KVM_ASSIGN_DEV_IRQ,管理中断

....

}

下面分析其中的关键函数:

get_real_device ==》

a. snprintf(dir, sizeof(dir),

"/sys/bus/pci/devices/%04x:%02x:%02x.%x/",r_seg, r_bus, r_dev, r_func);

   dev->config_fd = open(name, O_RDWR); //打开真实设备的config,并读出内容

    read(dev->config_fd,pci_dev->dev.config, pci_config_size(&pci_dev->dev));

   但对bar地址做特殊处理

   memset(&pci_dev->dev.config[PCI_BASE_ADDRESS_0], 0, 24);

    memset(&pci_dev->dev.config[PCI_ROM_ADDRESS],0, 4);

b. 记录mmio信息到PCIRegion *rp;结构

    snprintf(name, sizeof(name),"%sresource", dir);

    f = fopen(name, "r");

    对每个bar做:

        fscanf(f, "%" SCNi64 "%" SCNi64 " %" SCNi64 "\n",&start, &end,&flags) ;

        rp = dev->regions + r;  rp->valid = 0;

        rp->resource_fd = -1;

        size = end - start + 1;

       snprintf(name, sizeof(name),"%sresource%d", dir, r);

        fd = open(name, O_RDWR);

        rp->resource_fd = fd;

        rp->type = flags; rp->valid = 1;  rp->base_addr = start; rp->size = size;

        pci_dev->v_addrs[r].region = rp;

 

assigned_dev_register_regions==》

a.  pci_dev->v_addrs[i].u.r_virtbase = mmap(NULL,cur_region->size,

                               PROT_WRITE |PROT_READ, MAP_SHARED,

                              cur_region->resource_fd, (off_t)0);

b.分为mmio和pio的case 分开处理(下面仅分析mmio)关联mmio gpa到真实设备的hva:

  若mmio size < 0x1000(没有到一个内存page大小)

     则   memory_region_init_io(&pci_dev->v_addrs[i].real_iomem,

                                      &slow_bar_ops,&pci_dev->v_addrs[i],

                                      "assigned-dev-slow-bar", cur_region->size);

  否则用:void *virtbase = pci_dev->v_addrs[i].u.r_virtbase;

     memory_region_init_ram_ptr(&pci_dev->v_addrs[i].real_iomem,

                               name, cur_region->size,virtbase);

    //当EPT建立好后,guest os访问gpa时就直接访问真实设备了,不会有vm-exit发生

c.           assigned_dev_iomem_setup(&pci_dev->dev, i, cur_region->size);

            pci_register_bar((PCIDevice *)pci_dev, i, t,

                            &pci_dev->v_addrs[i].container);

 

assign_device ==>  kvm_device_pci_assign ==>

 kvm_vm_ioctl(s, KVM_ASSIGN_PCI_DEVICE, &dev_data); (注意assign时同时设置了host 与guest)

 

assign_intx ==>

 a.   intx_route = pci_device_route_intx_to_irq(&dev->dev,dev->intpin); ==》

   pci_device_route_intx_to_irq(call piix3_route_intx_pin_to_irq)得到当前dev的irq信息

 b. deassign当前irq  ==》kvm_vm_ioctl(s, KVM_DEASSIGN_DEV_IRQ,&assigned_irq);

 c.重新assign当前设置kvm_device_intx_assign(kvm_state,dev->dev_id, intx_host_msi,

                               intx_route.irq);==》

/*
 * Attach an interrupt to an assigned device through the KVM device
 * assignment API.
 *
 * @dev_id:    KVM-internal id of the assigned device
 * @irq_type:  KVM_DEV_IRQ_* flags selecting host/guest INTx/MSI/MSI-X
 * @guest_irq: interrupt number as seen by the guest
 *
 * Returns the ioctl result: 0 on success, negative errno on failure.
 */
static int kvm_assign_irq_internal(KVMState *s, uint32_t dev_id,
                                   uint32_t irq_type, uint32_t guest_irq)
{
    struct kvm_assigned_irq assigned_irq = {
        .assigned_dev_id = dev_id,
        .guest_irq = guest_irq,
        .flags = irq_type,
    };

    /* Newer kernels advertise KVM_CAP_ASSIGN_DEV_IRQ; fall back to the
     * legacy KVM_ASSIGN_IRQ ioctl on older ones. */
    if (kvm_check_extension(s, KVM_CAP_ASSIGN_DEV_IRQ)) {
        return kvm_vm_ioctl(s, KVM_ASSIGN_DEV_IRQ, &assigned_irq);
    } else {
        return kvm_vm_ioctl(s, KVM_ASSIGN_IRQ, &assigned_irq);
    }
}

 

kvm_vm_ioctl(s, KVM_ASSIGN_DEV_IRQ,&assigned_irq);

 

(2) Reset

reset_assigned_device:

a. 对于msix设备调用assigned_dev_update_msix(pci_dev);

b. 真实设备reset

    snprintf(reset_file, sizeof(reset_file), "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/reset",

             adev->host.domain,adev->host.bus, adev->host.slot, adev->host.function);

 

    fd = open(reset_file, O_WRONLY);

    ret = write(fd, reset, strlen(reset));

c. assigned_dev_pci_write_config(pci_dev,PCI_COMMAND, 0, 1);

 

assigned_dev_update_msix ==> 分msix和intx的case

msix case: 1. virq =kvm_irqchip_add_msi_route(kvm_state, msg);

                2. kvm_device_msi_assign(kvm_state,assigned_dev->dev_id, virq);

        最终调用kvm_vm_ioctl(s, KVM_ASSIGN_DEV_IRQ,&assigned_irq);

intx case:   assign_intx(assigned_dev);

 

intkvm_irqchip_add_msi_route(KVMState *s, MSIMessage msg)

{

    ......

    virq = kvm_irqchip_get_virq(s); //软件分配一个空闲irq号

 

    kroute.gsi = virq;

    kroute.type = KVM_IRQ_ROUTING_MSI;

    kroute.flags = 0;

    kroute.u.msi.address_lo =(uint32_t)msg.address;

    kroute.u.msi.address_hi = msg.address>> 32;

    kroute.u.msi.data = msg.data;

   //  调用kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING,s->irq_routes);

    kvm_add_routing_entry(s, &kroute);

    return virq;

 

7.2.2 kvm pci assign

源码位于virt/kvm/assigned-dev.c:

kvm_vm_ioctl_assigned_device==> case KVM_ASSIGN_PCI_DEVICE ==>

kvm_vm_ioctl_assign_device==>

a)  kvm_find_assigned_dev查看dev是否assigned,若已assigned,直接返回

b)  pci_get_domain_bus_and_slot根据设备地址得到该设备的pci_device

c)  probe_sysfs_permissions打开设备sysfs的访问权限,这样qemu能访问

d)  pcidevice的相关初始化

pci_enable_device(dev);

pci_request_regions(dev,"kvm_assigned_device");

pci_reset_function(dev);

pci_save_state(dev);

match->pci_saved_state =pci_store_saved_state(dev);

e)  加入设备到assignedlist list_add(&match->list, &kvm->arch.assigned_dev_head);

f)  若vm的iommu domain未建立则kvm_iommu_map_guest(kvm);

g)  r =kvm_assign_device(kvm, match); 将设备关联到iommu

 

kvm_iommu_map_guest ==》

     kvm->arch.iommu_domain= iommu_domain_alloc(&pci_bus_type);

     kvm_iommu_map_memslots(kvm);

 

kvm_assign_device (virt/kvm/iommu.c) ==>

    a. r = iommu_attach_device(domain,&pdev->dev); //调用iommu关联设备

 

kvm_iommu_map_memslots ==》

        slots =kvm_memslots(kvm);

    kvm_for_each_memslot(memslot, slots) {

       r = kvm_iommu_map_pages(kvm, memslot);

       if (r)

           break;

    }

 

kvm_iommu_map_pages ==》

 对slot中的每个gfn

   a. iommu_iova_to_phys(domain,gfn_to_gpa(gfn)) 检查是否已建立iommu映射

   b. iommu_map(domain, gfn_to_gpa(gfn),pfn_to_hpa(pfn), page_size, flags);建立映射

 

同时在qemu新增加memory映射时该函数也会被调用:

__kvm_set_memory_region==》

        if((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {

       r = kvm_iommu_map_pages(kvm, &new);

       return r;

    }

 

7.2.3 kvm interrupt assign

(1) 中断assign

kvm_vm_ioctl_assigned_device==》 case KVM_ASSIGN_DEV_IRQ==>

    kvm_vm_ioctl_assign_irq(kvm, &assigned_irq); ==>

 

    if (host_irq_type)

       r = assign_host_irq(kvm, match, host_irq_type);

    if (guest_irq_type)

       r = assign_guest_irq(kvm, match, assigned_irq,guest_irq_type);

 

assign_host_irq ==> 分为了intx, msi,和msix三种case,我们以后仅分析msix的case:

==> assigned_device_enable_host_msix==>

    a. pci_enable_msix_exact(dev->dev, dev->host_msix_entries,dev->entries_nr);

    b. request_threaded_irq(dev->host_msix_entries[i].vector,

                   kvm_assigned_dev_msix,

                   kvm_assigned_dev_thread_msix,

                   0,dev->irq_name, dev); //注册了中断处理函数

 

assign_guest_irq ==》分为了intx, msi,和msix三种case,我们以后仅分析msix的case:

a .id =kvm_request_irq_source_id(kvm); ==》

b. assigned_device_enable_guest_msix

 

assigned_device_enable_guest_msix(structkvm *kvm,

           struct kvm_assigned_dev_kernel *dev,

           struct kvm_assigned_irq *irq)

{

    dev->guest_irq = irq->guest_irq;//guest_irq为guest os 的中断号

    dev->ack_notifier.gsi = -1;

    return 0;

}

 

 

(2) MSIX中断管理

对于msix的guest irq号由assigned_dev_update_msix ==》kvm_irqchip_add_msi_route分配

对应内核态为:

kvm_vm_ioctl ==》case KVM_SET_GSI_ROUTING  ==> kvm_set_irq_routing (virt\irqchip.c) ==》

setup_routing_entry ==》 kvm_set_routing_entry (irq_comm.c) ==>

case KVM_IRQ_ROUTING_MSI

       e->set = kvm_set_msi;//中断注入回调函数

       e->msi.address_lo = ue->u.msi.address_lo;

       e->msi.address_hi = ue->u.msi.address_hi;

       e->msi.data = ue->u.msi.data;

 

int kvm_set_msi(structkvm_kernel_irq_routing_entry *e,

       struct kvm *kvm, int irq_source_id, int level, boolline_status)

{

    struct kvm_lapic_irq irq;

    kvm_set_msi_irq(e, &irq);

    return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL);

}

 

下面来看看host端中断处理:

kvm_assigned_dev_raise_guest_irq(structkvm_assigned_dev_kernel *assigned_dev,

               int vector)

{

    if (unlikely(assigned_dev->irq_requested_type &

           KVM_DEV_IRQ_GUEST_INTX)) {

       spin_lock(&assigned_dev->intx_mask_lock);

       if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX))

           kvm_set_irq(assigned_dev->kvm,

                 assigned_dev->irq_source_id, vector, 1,

                  false);

       spin_unlock(&assigned_dev->intx_mask_lock);

    } else

       kvm_set_irq(assigned_dev->kvm,assigned_dev->irq_source_id,

               vector, 1,false);

}

所以kvm_set_irq ==> kvm_set_msi(

 

static irqreturn_tkvm_assigned_dev_msix(int irq, void *dev_id)

{

    struct kvm_assigned_dev_kernel *assigned_dev = dev_id;

    int index = find_index_from_host_irq(assigned_dev, irq);

    u32 vector;

    int ret = 0;

 

    if (index >= 0) {

       vector = assigned_dev->guest_msix_entries[index].vector;

       ret = kvm_set_irq_inatomic(assigned_dev->kvm,

                     assigned_dev->irq_source_id,

                     vector,1);

    }

 

    return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD :IRQ_HANDLED;

}

 

kvm_set_irq_inatomic==》kvm_set_msi_inatomic ==》 kvm_irq_delivery_to_apic_fast

当真实中断发生时,向guest的assigned device注入中断

 

(3) intx中断管理

assign_host_irq ==》 request_threaded_irq(dev->host_irq,irq_handler,

               kvm_assigned_dev_thread_intx, flags,

               dev->irq_name, dev);

assign_guest_irq ==> caseintx

/*
 * Record the guest INTx interrupt number for an assigned device and
 * register the same gsi for EOI acknowledgement: level-triggered INTx
 * must be re-enabled on the host when the guest acks the interrupt.
 */
static int assigned_device_enable_guest_intx(struct kvm *kvm,
              struct kvm_assigned_dev_kernel *dev,
              struct kvm_assigned_irq *irq)
{
    dev->guest_irq = irq->guest_irq;        /* interrupt number as seen by the guest */
    dev->ack_notifier.gsi = irq->guest_irq; /* level-triggered: track guest EOI */
    return 0;
}

 

kvm_assigned_dev_thread_intx==》kvm_assigned_dev_raise_guest_irq ==> kvm_set_irq

 

由此可知,kvm的中断虚拟化流程如下:

(1) 同时注册真实设备的中断处理函数

(2) 当中断发生时,根据真实设备的中断号找到对应的虚拟设备中断号,注入中断

如果系统采用了irq remap机制,则host的中断不会产生,直接在guest os上产生中断。

下一节将讨论iommu.

 

除pci-assign外,另一种直接io方法为vfio。它与pci-assign的区别在于,vfio将更多的虚拟化逻辑放在了qemu用户空间中实现。但其底层仍然会使用iommu;本文就不详细分析vfio了。其源代码位于:

Qemu:  hw\vfio_pci.c

Host driver:drivers/pci/vfio/

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值