本节通过分析piix磁盘 控制器来看看虚拟化驱动的骨架与实现要点。相关源码路径如下:
hw\ide\piix.c; hw\ide\core.c;
5.2.1虚拟驱动初始化
pc_init1 ==> 创建设备pci_piix3_ide_init(pci_bus, hd, piix3_devfn + 1);
/* Create the PIIX3 IDE PCI function on @bus at slot/function @devfn and
 * attach the drives in @hd_table to its IDE buses.
 * Returns the newly created PCIDevice. */
PCIDevice *pci_piix3_ide_init(PCIBus*bus, DriveInfo **hd_table, int devfn)
{
PCIDevice *dev;
/* Instantiate the "piix3-ide" device; per the notes below, its class
 * init hook is pci_piix_ide_initfn. */
dev = pci_create_simple(bus, devfn,"piix3-ide");
/* Bind each DriveInfo to an ide-hd/ide-cd device on the new IDE buses. */
pci_ide_create_devs(dev, hd_table);
return dev;
}
a type注册: type_init(piix_ide_register_types)
b piix3_ide_class_init{
DeviceClass *dc = DEVICE_CLASS(klass);
PCIDeviceClass *k =PCI_DEVICE_CLASS(klass);
k->no_hotplug = 1;
k->init = pci_piix_ide_initfn;
k->exit = pci_piix_ide_exitfn;
k->vendor_id = PCI_VENDOR_ID_INTEL;
k->device_id =PCI_DEVICE_ID_INTEL_82371SB_1;
k->class_id = PCI_CLASS_STORAGE_IDE;
dc->no_user = 1;
init : pci_piix_ide_initfn
qemu_register_reset(piix3_reset, d); //添加一个系统reset回调
bmdma_setup_bar(d);
pci_register_bar(&d->dev, 4, PCI_BASE_ADDRESS_SPACE_IO,&d->bmdma_bar);
vmstate_register(&d->dev.qdev, 0,&vmstate_ide_pci, d);
pci_piix_init_ports(d);
pci_piix_init_ports
for (i = 0; i < 2; i++) {
ide_bus_new(&d->bus[i],&d->dev.qdev, i);
ide_init_ioport(&d->bus[i],NULL, port_info[i].iobase,
port_info[i].iobase2);
ide_init2(&d->bus[i],isa_get_irq(NULL, port_info[i].isairq));
bmdma_init(&d->bus[i],&d->bmdma[i], d);
d->bmdma[i].bus = &d->bus[i];
qemu_add_vm_change_state_handler(d->bus[i].dma->ops->restart_cb,
&d->bmdma[i].dma);
}//为每个port创建一个ide bus
/* Initialise @idebus in place as a TYPE_IDE_BUS child bus of @dev and
 * record its bus number.  Called once per PIIX IDE port. */
void ide_bus_new(IDEBus*idebus, DeviceState *dev, int bus_id)
{
qbus_create_inplace(&idebus->qbus,TYPE_IDE_BUS, dev, NULL);
idebus->bus_id = bus_id;
}
//每个port最多对应两个disk
/* Second-stage initialisation of an IDE bus: set up both drive slots
 * (an IDE bus carries at most two units, master and slave), record the
 * IRQ line and install the no-op DMA ops as the default.
 * bmdma_init() later replaces bus->dma with the real BMDMA ops. */
void ide_init2(IDEBus *bus, qemu_irq irq)
{
    int i;  /* the excerpt used `i` without declaring it */

    for (i = 0; i < 2; i++) {
        ide_init1(bus, i);        /* allocate per-drive buffers and timer */
        ide_reset(&bus->ifs[i]);  /* bring the drive to power-on state */
    }
    bus->irq = irq;
    bus->dma = &ide_dma_nop;
}
/* Register the I/O port ranges for an IDE bus.  The command block at
 * @iobase is always mapped; the control block at @iobase2 is mapped
 * only when non-zero.  (The stray note "注册io port" after the closing
 * brace in the original excerpt meant "registers the I/O ports".) */
void ide_init_ioport(IDEBus *bus, ISADevice *dev, int iobase, int iobase2)
{
    isa_register_portio_list(dev, iobase, ide_portio_list, bus, "ide");
    if (iobase2) {
        isa_register_portio_list(dev, iobase2, ide_portio2_list, bus, "ide");
    }
}
/* Interpose BMDMA between an IDE bus and its host controller: install
 * the bmdma_ops DMA callbacks on the bus and splice a BMDMA-owned IRQ
 * in front of the original bus IRQ, so DMA status can be updated in
 * bmdma_irq() before the real line is raised. */
void bmdma_init(IDEBus *bus, BMDMAState *bm, PCIIDEState *d)
{
    qemu_irq *bmdma_line;

    /* Already wired up?  Nothing to do. */
    if (bus->dma == &bm->dma) {
        return;
    }

    bm->dma.ops = &bmdma_ops;
    bus->dma = &bm->dma;
    /* Remember the upstream IRQ so bmdma_irq() can forward to it... */
    bm->irq = bus->irq;
    /* ...then make the bus raise the interposed BMDMA IRQ instead. */
    bmdma_line = qemu_allocate_irqs(bmdma_irq, bm, 1);
    bus->irq = bmdma_line[0];
    bm->pci_dev = d;
}
static const struct IDEDMAOpsbmdma_ops = {
.start_dma = bmdma_start_dma,
.start_transfer = bmdma_start_transfer,
.prepare_buf = bmdma_prepare_buf,
.rw_buf = bmdma_rw_buf,
.set_unit = bmdma_set_unit,
.add_status = bmdma_add_status,
.set_inactive = bmdma_set_inactive,
.restart_cb = bmdma_restart_cb,
.reset = bmdma_reset,
};
ide_init1 ==》 为数据传输准备内部s->io_buffer, 大小为 IDE_DMA_BUF_SECTORS 个扇区 × 512 字节 + 4 字节(原文写作"258KB"有误,实际大小应按 IDE_DMA_BUF_SECTORS 的取值计算)
s->io_buffer_total_len =IDE_DMA_BUF_SECTORS*512 + 4;
s->io_buffer = qemu_memalign(2048,s->io_buffer_total_len);
memset(s->io_buffer, 0,s->io_buffer_total_len);
s->smart_selftest_data =qemu_blockalign(s->bs, 512);
memset(s->smart_selftest_data, 0, 512);
s->sector_write_timer =qemu_new_timer_ns(vm_clock, //为准备一个timer
ide_sector_write_timer_cb, s);
本节将分析中断处理框架与dma数据传输框架;对于普通寄存器的虚拟化实现本节不再讨论。
5.2.2数据传输发送模拟
piix的传输从下发命令开始: ide_ioport_write ==> case 7:
ide_exec_cmd(bus, val); ==>
case WIN_READDMA:
case WIN_READDMA_ONCE:
ide_cmd_lba48_transform(s,lba48);
ide_sector_start_dma(s, IDE_DMA_READ);
/* Kick off a DMA transfer for drive @s: mark the drive busy, reset the
 * internal buffer cursor, start block-layer accounting for read/write
 * commands and hand control to the bus DMA implementation with
 * ide_dma_cb as the completion callback.  (Fixed the fused
 * "voidide_sector_start_dma" from the extraction.) */
static void ide_sector_start_dma(IDEState *s, enum ide_dma_cmd dma_cmd)
{
    s->status = READY_STAT | SEEK_STAT | DRQ_STAT | BUSY_STAT;
    s->io_buffer_index = 0;
    s->io_buffer_size = 0;
    s->dma_cmd = dma_cmd;

    switch (dma_cmd) {
    case IDE_DMA_READ:
        bdrv_acct_start(s->bs, &s->acct, s->nsector * BDRV_SECTOR_SIZE,
                        BDRV_ACCT_READ);
        break;
    case IDE_DMA_WRITE:
        bdrv_acct_start(s->bs, &s->acct, s->nsector * BDRV_SECTOR_SIZE,
                        BDRV_ACCT_WRITE);
        break;
    default:
        break;  /* other commands (e.g. TRIM) are not accounted */
    }

    /* For PIIX this resolves to bmdma_start_dma(). */
    s->bus->dma->ops->start_dma(s->bus->dma, s, ide_dma_cb);
}
(1 )数据 Prepare 阶段
bmdma_prepare_buf 该函数主要根据guest os的数据地址建立sglist
其中pci_dma_read(&bm->pci_dev->dev,bm->cur_addr, &prd, 8); 用于
读取guest os 数据
pci_dma_read ==》 pci_dma_rw==》dma_memory_rw==》 dma_memory_rw_relaxed
对于普通case ==> cpu_physical_memory_rw
对于ram的情况该函数根据gpa取得hva,并读出数据
ptr =qemu_get_ram_ptr(section->mr->ram_addr
+memory_region_section_addr(section,
addr));
memcpy(buf, ptr, l);
qemu_put_ram_ptr(ptr);
(2) 数据传输阶段
调用块设备的数据读取函数bdrv_acct_start;同时启动dma operation
s->bus->dma->ops->start_dma(s->bus->dma,s, ide_dma_cb)
/* BMDMA implementation of start_dma: latch the transfer parameters
 * from the drive state and, if the guest has already started the DMA
 * engine (BM_STATUS_DMAING set), invoke the completion callback
 * immediately to begin the first chunk.  @dma_cb is ide_dma_cb, the
 * completion callback. */
static void bmdma_start_dma(IDEDMA *dma, IDEState *s,
                            BlockDriverCompletionFunc *dma_cb)
{
    /* ... (local declarations elided in the original excerpt) ... */
    bm->unit = s->unit;
    bm->dma_cb = dma_cb;
    bm->cur_prd_last = 0;
    bm->cur_prd_addr = 0;
    bm->cur_prd_len = 0;
    bm->sector_num = ide_get_sector(s);
    bm->nsector = s->nsector;

    if (bm->status & BM_STATUS_DMAING) {
        bm->dma_cb(bmdma_active_if(bm), 0);
    }
}
ide_dma_cb ==》
s->bus->dma->aiocb =dma_bdrv_read(s->bs, &s->sg, sector_num,
ide_dma_cb,s);
/* Completion callback for BMDMA transfers, which also launches the
 * next chunk.  Invoked by the block layer when one dma_bdrv_read/
 * dma_bdrv_write finishes, and directly by bmdma_start_dma() to begin
 * the first chunk.  @ret < 0 indicates an I/O error. */
void ide_dma_cb(void *opaque,int ret)
{
......
if (ret < 0) {
/* On error, ask the rw-error policy whether to retry/stop/report. */
int op = BM_STATUS_DMA_RETRY;
if (s->dma_cmd == IDE_DMA_READ)
op |= BM_STATUS_RETRY_READ;
else if (s->dma_cmd == IDE_DMA_TRIM)
op |= BM_STATUS_RETRY_TRIM;
if (ide_handle_rw_error(s, -ret, op)) {
return;
}
}
/* Bytes transferred so far, in 512-byte sectors. */
n = s->io_buffer_size >> 9;
sector_num = ide_get_sector(s);
if (n > 0) {
dma_buf_commit(s);
sector_num += n;
ide_set_sector(s, sector_num);
s->nsector -= n;
}
/* If the whole transfer is complete, raise the completion interrupt. */
if (s->nsector == 0) {
s->status = READY_STAT | SEEK_STAT;
ide_set_irq(s->bus);
goto eot;
}
/* launch next transfer */
n = s->nsector;
s->io_buffer_index = 0;
s->io_buffer_size = n * 512;
/* prepare_buf (bmdma_prepare_buf) builds the sglist from the guest's
 * PRD table; a return of 0 means no descriptors are left. */
if(s->bus->dma->ops->prepare_buf(s->bus->dma,ide_cmd_is_read(s)) == 0) {
goto eot;
}
switch (s->dma_cmd) {
case IDE_DMA_READ:
s->bus->dma->aiocb =dma_bdrv_read(s->bs, &s->sg, sector_num,
ide_dma_cb, s);
break;
case IDE_DMA_WRITE:
s->bus->dma->aiocb =dma_bdrv_write(s->bs, &s->sg, sector_num,
ide_dma_cb, s);
break;
case IDE_DMA_TRIM:
s->bus->dma->aiocb =dma_bdrv_io(s->bs, &s->sg, sector_num,
ide_issue_trim, ide_dma_cb, s,
DMA_DIRECTION_TO_DEVICE);
break;
}
return;
eot:
/* End of transfer: close accounting (reads/writes only) and detach. */
if (s->dma_cmd == IDE_DMA_READ ||s->dma_cmd == IDE_DMA_WRITE) {
bdrv_acct_done(s->bs,&s->acct);
}
ide_set_inactive(s);
}
调用dma_bdrv_read读取指定扇区,并注册完成回调ide_dma_cb.
关于dma_bdrv_read的实现我们在5.4分析;但传输完成后会将数据拷回到guest dma地址
dma_bdrv_read ==》dma_bdrv_cb ==》
传输时根据sglist调用 dma_memory_map
完成时调用: dma_complete
dma_memory_map将guest gpa转换成host hva, 让qemu(工作在vmm上)能操作data.
其实现思想是,找到gpa对应的memory area, 然后重新分配一个vmm bounce buffer,并用cpu_physical_memory_read从原gpa对应区域取出数据拷贝到新分配的bounce buffer中; 传输完成后dma_complete ==> dma_bdrv_unmap ==> dma_memory_unmap执行反向的流程
ide_dma_cb 当数据传输完成时会goto eot;
if (s->dma_cmd == IDE_DMA_READ ||s->dma_cmd == IDE_DMA_WRITE) {
bdrv_acct_done(s->bs,&s->acct);
}
ide_set_inactive(s);
/* Tear down BMDMA state when a transfer finishes: clear the DMAING
 * status bit and drop the callback/unit bindings.  Called from
 * ide_set_inactive() at end of transfer. */
static int bmdma_set_inactive(IDEDMA *dma)
{
    /* ... (local declarations elided in the original excerpt) ... */
    bm->status &= ~BM_STATUS_DMAING;
    bm->dma_cb = NULL;
    bm->unit = -1;
    return 0;
}
(3) 传输完成中断
ide_set_irq(s->bus);
这里的irq由pci_piix_init_ports ==》bmdma_init(&d->bus[i],&d->bmdma[i], d);指定
/* (Quoted again for the interrupt discussion.)  bmdma_init() splices
 * the BMDMA IRQ in front of the original bus IRQ so that bmdma_irq()
 * runs first and can latch BM_STATUS_INT. */
void bmdma_init(IDEBus *bus, BMDMAState *bm, PCIIDEState *d)
{
    /* ... (guard and declarations elided in the original excerpt) ... */
    bm->dma.ops = &bmdma_ops;
    bus->dma = &bm->dma;
    bm->irq = bus->irq;  /* record the upstream (ISA) IRQ line */
    irq = qemu_allocate_irqs(bmdma_irq, bm, 1);
    bus->irq = *irq;     /* the bus now raises bmdma_irq first */
    bm->pci_dev = d;
}
static void bmdma_irq(void*opaque, int n, int level)
{
.......
bm->status |= BM_STATUS_INT;
/* trigger the real irq */
qemu_set_irq(bm->irq, level);
}
bm->irq = bus->irq; bus由
pci_piix_init_ports ==》 ide_init2(&d->bus[i], isa_get_irq(NULL,port_info[i].isairq));
指定为isa bus. 所以最终会调用到isa_bus 的 set_irq. isa_bus的irq 从何而来,5.3节会分析。
5.2.3 ide bus 与ide-drive关联
pc machine 的定义示例
static QEMUMachinepc_machine_v0_10 = {
.name = "pc-0.10",
.desc = "Standard PC, qemu 0.10",
.init = pc_init_pci_no_kvmclock,
.max_cpus = 255,
.default_machine_opts =KVM_MACHINE_OPTIONS,
.compat_props = (GlobalProperty[]) {
PC_COMPAT_0_11,
{
.driver = "virtio-blk-pci",
.property = "class",
.value = stringify(PCI_CLASS_STORAGE_OTHER),
},
...... {
.driver = "ide-drive",
.property = "ver",
.value = "0.10",
},{
.driver = "scsi-disk",
.property = "ver",
.value = "0.10",
},
{ /* end of list */ }
},
.hw_version = "0.10",
};
vl.c main
A. default_drive(default_cdrom, snapshot,machine->use_scsi,
IF_DEFAULT, 2, CDROM_OPTS);//设定default drive, 不是scsi就选ide
==>drive_add //用于添加disk drive
drive重要的属性是file. 5.3节会描述drive与block_drv的绑定drive_init会实现该绑定
B. if (machine->compat_props) {
qdev_prop_register_global_list(machine->compat_props);
}
将上面的属性添加到global中去了
C.该步骤在machine_init之后,这时ide_bus_new已经创建了ide_bus.
在ide_bus_new ==> qbus_create_inplace(&idebus->qbus,TYPE_IDE_BUS, dev, NULL);
/* TYPE_IDE_BUS registration (from hw/ide/qdev.c): an IDEBus is a plain
 * qbus subtype, created in place by ide_bus_new(). */
static const TypeInfo ide_bus_info = {
    .name = TYPE_IDE_BUS,
    .parent = TYPE_BUS,
    .instance_size = sizeof(IDEBus),
    .class_init = ide_bus_class_init,
};
pc_init1 ==>
ide_drive_get(hd,MAX_IDE_BUS);
pci_piix3_ide_init(pci_bus,hd, piix3_devfn + 1);
pci_piix3_ide_init( {
dev = pci_create_simple(bus, devfn,"piix3-ide");
pci_ide_create_devs(dev, hd_table); //创建ide设备
}
pci_ide_create_devs ==> ide_create_drive==>
qdev_create(&bus->qbus,drive->media_cd ? "ide-cd" : "ide-hd");
ide device创建:
/* Class init for the concrete "ide-hd" type: install the hard-disk
 * init hook and the per-drive property list (geometry, drive link).
 * (Fixed the fused "voidide_hd_class_init" from extraction.) */
static void ide_hd_class_init(ObjectClass *klass, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(klass);
    IDEDeviceClass *k = IDE_DEVICE_CLASS(klass);

    k->init = ide_hd_initfn;
    dc->fw_name = "drive";
    dc->desc = "virtual IDE disk";
    dc->props = ide_hd_properties;
}
/* Concrete "ide-hd" type; inherits from the abstract TYPE_IDE_DEVICE. */
static TypeInfo ide_hd_info = {
    .name = "ide-hd",
    .parent = TYPE_IDE_DEVICE,
    .instance_size = sizeof(IDEDrive),
    .class_init = ide_hd_class_init,
};
static TypeInfoide_device_type_info = {
.name = TYPE_IDE_DEVICE,
.parent = TYPE_DEVICE,
.instance_size = sizeof(IDEDevice),
.abstract = true,
.class_size = sizeof(IDEDeviceClass),
.class_init = ide_device_class_init,
};
parent表明了继承关系, ide_hd_info 继承自ide_device_type_info, ide_device_type_info继承自device_type_info
static TypeInfodevice_type_info = {
.name = TYPE_DEVICE,
.parent = TYPE_OBJECT,
.instance_size = sizeof(DeviceState),
.instance_init = device_initfn,
.instance_finalize = device_finalize,
.class_base_init = device_class_base_init,
.abstract = true,
.class_size = sizeof(DeviceClass),
};
device_initfn会遍历global属性表,当匹配时qdev_prop_set_globals将全局setting设到对应的dev中
qdev_prop_set_globals ==> qdev_prop_parse==>object_property_parse==>object_property_set
static Propertyide_hd_properties[] = {
DEFINE_IDE_DEV_PROPERTIES(),
DEFINE_BLOCK_CHS_PROPERTIES(IDEDrive, dev.conf), //dev conf属性
DEFINE_PROP_BIOS_CHS_TRANS("bios-chs-trans",
IDEDrive, dev.chs_trans,BIOS_ATA_TRANSLATION_AUTO),
DEFINE_PROP_END_OF_LIST(),
};
DEFINE_PROP_DRIVE("drive",_state, _conf.bs),
#define DEFINE_PROP_DRIVE(_n,_s, _f) \
DEFINE_PROP(_n, _s, _f, qdev_prop_drive,BlockDriverState *)
/* Property type backing DEFINE_PROP_DRIVE: get/set/release hooks that
 * bind a named -drive (a BlockDriverState) into the device's config. */
PropertyInfo qdev_prop_drive = {
    .name = "drive",
    .get = get_drive,
    .set = set_drive,
    .release = release_drive,
};
实现了底层的property set用于设置conf;因此object_property_set会用global设置dev.conf
ide_hd_initfn ==> ide_dev_initfn ==>
a. blkconf_serial 通过conf获得ide对应的brv_dev
b. ide_init_drive
c. add_boot_device_path //添加到启动设备列表
ide_device_type_info 建立了ide-drive与 ide bus间的关联
/* Class init for the abstract IDE device type: every IDE device plugs
 * into a TYPE_IDE_BUS and is initialised through ide_qdev_init().
 * (Fixed the fused "voidide_device_class_init" from extraction.) */
static void ide_device_class_init(ObjectClass *klass, void *data)
{
    DeviceClass *k = DEVICE_CLASS(klass);

    k->init = ide_qdev_init;
    k->bus_type = TYPE_IDE_BUS;
    k->props = ide_props;
}
/* Init hook shared by all IDE devices: attach the device to its parent
 * IDE bus as master (unit 0) or slave (unit 1), then run the concrete
 * subclass init (e.g. ide_hd_initfn for "ide-hd").
 * NOTE(review): this excerpt elides the opening brace, parts of the
 * switch arms, the `err:` label and the closing brace — it is not
 * complete C as quoted. */
static intide_qdev_init(DeviceState *qdev)
IDEDevice *dev = IDE_DEVICE(qdev);
IDEDeviceClass *dc =IDE_DEVICE_GET_CLASS(dev);
/* parent_bus was set by qdev_create(&bus->qbus, ...). */
IDEBus *bus = DO_UPCAST(IDEBus, qbus,qdev->parent_bus);
switch (dev->unit) {
case 0:
......
bus->master = dev;
break;
case 1:
.......
bus->slave = dev;
break;
default:
error_report("Invalid IDE unit%d", dev->unit);
goto err;
}
return dc->init(dev);
(5) boot 项的选定
在pc_memory_init ==》bochs_bios_init==》fw_cfg_init
==》 a.qdev_create(NULL, "fw_cfg");
b. s->machine_ready.notify = fw_cfg_machine_ready;
static voidfw_cfg_machine_ready(struct Notifier *n, void *data)
{
uint32_t len;
FWCfgState *s = container_of(n, FWCfgState,machine_ready);
char *bootindex =get_boot_devices_list(&len);
fw_cfg_add_file(s, "bootorder",(uint8_t*)bootindex, len);
}
fw_cfg_add_file ==>get_boot_devices_list (vl.c)取得bootdevice list ==> qdev_get_fw_dev_path ==> idebus_get_fw_dev_path
小结:
a) 全局配置设定了drive的类别与对应的host上的文件;由此将一个drive绑定到block device.
b) machine定义了drive的global conf如ide-drive; 并用代码创建ide drive
c) qdev_create创建对象时由device_type 根据globalconf动态设置dev.conf到ide-drive对象
d) 通过ide-bus绑定ide drive到piix-ide host