在网上看到很多人说Qemu
的PCI-Assign
透传不支持IOMMU
,而VFIO
透传却可以(还被当做一种优势进行推荐)。而VFIO
跟SRIOV
并非有必然联系,那就是说VFIO
和PCI-Assign
进本都是靠软件实现的了?既然都是软件实现的,为啥PCI-Assign
不可以,而VFIO
可以呢?这不科学啊!从来也没人说清楚这件事,到源码里看一下吧!(最后发现他们说的都是错的,网上无根据的文章不要乱信)
PCI-Assign透传的基本使用步骤
随便在网上找一个PCI-Assign
透传的使用方法:
How to use 'pci pass-through' to run Linux in Qemu accessing real Ath9k adapter
===============================================================================
# Boot kernel with 'intel_iommu=on'
# Unbind driver from the device and bind 'pci-stub' to it
$ echo "168c 0030" > /sys/bus/pci/drivers/pci-stub/new_id
$ echo 0000:0b:00.0 > /sys/bus/pci/devices/0000:0b:00.0/driver/unbind
$ echo 0000:0b:00.0 > /sys/bus/pci/drivers/pci-stub/bind
# Then just run
$ sudo qemu-system-i386 -m 1024 \
-device pci-assign,host=0b:00.0,rombar=0 \
-enable-kvm \
-kernel $KERNEL \
-hda $DISK \
-boot c \
-append "root=/dev/sda rw"
# In case of
qemu-system-i386: -device pci-assign,host=0b:00.0: Failed to assign device "(null)" : Operation not permitted
qemu-system-i386: -device pci-assign,host=0b:00.0: Device initialization failed.
qemu-system-i386: -device pci-assign,host=0b:00.0: Device 'kvm-pci-assign' could not be initialized
$ dmesg | tail
[ 112.129138] kvm_iommu_map_guest: No interrupt remapping support, disallowing device assignment. Re-enble with "allow_unsafe_assigned_interrupts=1" module option.
# run
$ echo 1 > /sys/module/kvm/parameters/allow_unsafe_assigned_interrupts
总结一下基本步骤:
- 在内核启动参数中配置
intel_iommu=on
,开启IOMMU
(从这里就可以看出,PCI-Assign
不支持IOMMU
的话,要配置IOMMU
干什么呢?); - 向
pci-stub
驱动的new_id
文件写入要绑定设备的VendorID
和DeviceID
; - 把
PCI
设备与原有驱动解绑; - 把
PCI
设备绑定到pci-stub
驱动上。 - 在
Qemu
上以PCI
设备的PCI
地址为形参启动虚拟机。
疑问:我们知道用户态程序使用一个设备,必须要使用它的用户态接口,通常就是一个设备文件,而这里指定一个地址是什么意思?Qemu
是如何使用这个PCI
设备的?
PCI设备与pci-stub驱动绑定
接下来我们就看看这个pci-stub
驱动到底是怎么绑定到PCI
设备上的。
驱动内核模块加载时绑定
- 这是内核中
pci-stub
驱动的源码,如果在模块加载时指定了ids
参数,将会在加载时为这个pci-stub
驱动动态增加匹配的VendorID
和DeviceID
。我们知道,Linux
内核的Device
、Driver
和Bus
驱动模型中,PCI
总线是通过VendorID
和DeviceID
进行匹配的,因此这时就可对应的PCI
设备进行了匹配绑定。同时,这个驱动啥都没干,没有提供任何上层的应用接口,比如说字符设备或者块设备等(drivers/pci/pci-stub.c):
// SPDX-License-Identifier: GPL-2.0
/*
* Simple stub driver to reserve a PCI device
*
* Copyright (C) 2008 Red Hat, Inc.
* Author:
* Chris Wright
*
* Usage is simple, allocate a new id to the stub driver and bind the
* device to it. For example:
*
* # echo "8086 10f5" > /sys/bus/pci/drivers/pci-stub/new_id
* # echo -n 0000:00:19.0 > /sys/bus/pci/drivers/e1000e/unbind
* # echo -n 0000:00:19.0 > /sys/bus/pci/drivers/pci-stub/bind
* # ls -l /sys/bus/pci/devices/0000:00:19.0/driver
* .../0000:00:19.0/driver -> ../../../bus/pci/drivers/pci-stub
*/
#include <linux/module.h>
#include <linux/pci.h>
static char ids[1024] __initdata;
module_param_string(ids, ids, sizeof(ids), 0);
MODULE_PARM_DESC(ids, "Initial PCI IDs to add to the stub driver, format is "
"\"vendor:device[:subvendor[:subdevice[:class[:class_mask]]]]\""
" and multiple comma separated entries can be specified");
static int pci_stub_probe(struct pci_dev *dev, const struct pci_device_id *id)
{
pci_info(dev, "claimed by stub\n");
return 0;
}
static struct pci_driver stub_driver = {
.name = "pci-stub",
.id_table = NULL, /* only dynamic id's */
.probe = pci_stub_probe,
};
static int __init pci_stub_init(void)
{
char *p, *id;
int rc;
rc = pci_register_driver(&stub_driver);
if (rc)
return rc;
/* no ids passed actually */
if (ids[0] == '\0')
return 0;
/* add ids specified in the module parameter */
p = ids;
while ((id = strsep(&p, ","))) {
unsigned int vendor, device, subvendor = PCI_ANY_ID,
subdevice = PCI_ANY_ID, class = 0, class_mask = 0;
int fields;
if (!strlen(id))
continue;
fields = sscanf(id, "%x:%x:%x:%x:%x:%x",
&vendor, &device, &subvendor, &subdevice,
&class, &class_mask);
if (fields < 2) {
printk(KERN_WARNING
"pci-stub: invalid id string \"%s\"\n", id);
continue;
}
printk(KERN_INFO
"pci-stub: add %04X:%04X sub=%04X:%04X cls=%08X/%08X\n",
vendor, device, subvendor, subdevice, class, class_mask);
rc = pci_add_dynid(&stub_driver, vendor, device,
subvendor, subdevice, class, class_mask, 0);
if (rc)
printk(KERN_WARNING
"pci-stub: failed to add dynamic id (%d)\n", rc);
}
return 0;
}
static void __exit pci_stub_exit(void)
{
pci_unregister_driver(&stub_driver);
}
module_init(pci_stub_init);
module_exit(pci_stub_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Chris Wright <chrisw@sous-sol.org>");
- 我们再看看
pci_add_dynid
这个函数的实现,他申请并初始化了一个pci_dynid
结构,增加到Driver
的一个链表中,最后调用了driver_attach
函数触发PCI
Bus
的Device
与Driver
匹配(drivers/pci/pci-driver.c):
/**
* pci_add_dynid - add a new PCI device ID to this driver and re-probe devices
* @drv: target pci driver
* @vendor: PCI vendor ID
* @device: PCI device ID
* @subvendor: PCI subvendor ID
* @subdevice: PCI subdevice ID
* @class: PCI class
* @class_mask: PCI class mask
* @driver_data: private driver data
*
* Adds a new dynamic pci device ID to this driver and causes the
* driver to probe for all devices again. @drv must have been
* registered prior to calling this function.
*
* CONTEXT:
* Does GFP_KERNEL allocation.
*
* RETURNS:
* 0 on success, -errno on failure.
*/
int pci_add_dynid(struct pci_driver *drv,
unsigned int vendor, unsigned int device,
unsigned int subvendor, unsigned int subdevice,
unsigned int class, unsigned int class_mask,
unsigned long driver_data)
{
struct pci_dynid *dynid;
dynid = kzalloc(sizeof(*dynid), GFP_KERNEL);
if (!dynid)
return -ENOMEM;
dynid->id.vendor = vendor;
dynid->id.device = device;
dynid->id.subvendor = subvendor;
dynid->id.subdevice = subdevice;
dynid->id.class = class;
dynid->id.class_mask = class_mask;
dynid->id.driver_data = driver_data;
spin_lock(&drv->dynids.lock);
list_add_tail(&dynid->node, &drv->dynids.list);
spin_unlock(&drv->dynids.lock);
return driver_attach(&drv->driver);
}
EXPORT_SYMBOL_GPL(pci_add_dynid);
driver_attach
遍历PCI
Bus
上的所有Device
,使用VendorID
和DeviceID
与Driver
进行匹配(匹配方法由Bus
驱动提供)和绑定(drivers/base/dd.c):
static int __driver_attach(struct device *dev, void *data)
{
struct device_driver *drv = data;
int ret;
/*
* Lock device and try to bind to it. We drop the error
* here and always return 0, because we need to keep trying
* to bind to devices and some drivers will return an error
* simply if it didn't support the device.
*
* driver_probe_device() will spit a warning if there
* is an error.
*/
ret = driver_match_device(drv, dev);
if (ret == 0) {
/* no match */
return 0;
} else if (ret == -EPROBE_DEFER) {
dev_dbg(dev, "Device match requests probe deferral\n");
driver_deferred_probe_add(dev);
} else if (ret < 0) {
dev_dbg(dev, "Bus failed to match device: %d", ret);
return ret;
} /* ret > 0 means positive match */
if (dev->parent) /* Needed for USB */
device_lock(dev->parent);
device_lock(dev);
if (!dev->driver)
driver_probe_device(drv, dev);
device_unlock(dev);
if (dev->parent)
device_unlock(dev->parent);
return 0;
}
/**
* driver_attach - try to bind driver to devices.
* @drv: driver.
*
* Walk the list of devices that the bus has on it and try to
* match the driver with each one. If driver_probe_device()
* returns 0 and the @dev->driver is set, we've found a
* compatible pair.
*/
int driver_attach(struct device_driver *drv)
{
return bus_for_each_dev(drv->bus, NULL, drv, __driver_attach);
}
EXPORT_SYMBOL_GPL(driver_attach);
通过设备的sysfs文件绑定
PCI
Bus
驱动在注册时分别注册了三个(Bus
、Device
、Driver
)的SysFS
属性组,而Driver
的SysFS
属性组包含new_id
这个SysFS
属性文件。在对这个文件的进行设置的函数new_id_store
中,写入数据中解析出VendorID
和DeviceID
,最后调用pci_add_dynid
进行与设备的绑定(drivers/pci/pci-driver.c):
/**
* store_new_id - sysfs frontend to pci_add_dynid()
* @driver: target device driver
* @buf: buffer for scanning device ID data
* @count: input size
*
* Allow PCI IDs to be added to an existing driver via sysfs.
*/
static ssize_t new_id_store(struct device_driver *driver, const char *buf,
size_t count)
{
struct pci_driver *pdrv = to_pci_driver(driver);
const struct pci_device_id *ids = pdrv->id_table;
__u32 vendor, device, subvendor = PCI_ANY_ID,
subdevice = PCI_ANY_ID, class = 0, class_mask = 0;
unsigned long driver_data = 0;
int fields = 0;
int retval = 0;
fields = sscanf(buf, "%x %x %x %x %x %x %lx",
&vendor, &device, &subvendor, &subdevice,
&class, &class_mask, &driver_data);
if (fields < 2)
return -EINVAL;
if (fields != 7) {
struct pci_dev *pdev = kzalloc(sizeof(*pdev), GFP_KERNEL);
if (!pdev)
return -ENOMEM;
pdev->vendor = vendor;
pdev->device = device;
pdev->subsystem_vendor = subvendor;
pdev->subsystem_device = subdevice;
pdev->class = class;
if (pci_match_id(pdrv->id_table, pdev))
retval = -EEXIST;
kfree(pdev);
if (retval)
return retval;
}
/* Only accept driver_data values that match an existing id_table
entry */
if (ids) {
retval = -EINVAL;
while (ids->vendor || ids->subvendor || ids->class_mask) {
if (driver_data == ids->driver_data) {
retval = 0;
break;
}
ids++;
}
if (retval) /* No match */
return retval;
}
retval = pci_add_dynid(pdrv, vendor, device, subvendor, subdevice,
class, class_mask, driver_data);
if (retval)
return retval;
return count;
}
static DRIVER_ATTR_WO(new_id);
static struct attribute *pci_drv_attrs[] = {
&driver_attr_new_id.attr,
&driver_attr_remove_id.attr,
NULL,
};
ATTRIBUTE_GROUPS(pci_drv);
struct bus_type pci_bus_type = {
.name = "pci",
.match = pci_bus_match,
.uevent = pci_uevent,
.probe = pci_device_probe,
.remove = pci_device_remove,
.shutdown = pci_device_shutdown,
.dev_groups = pci_dev_groups,
.bus_groups = pci_bus_groups,
.drv_groups = pci_drv_groups,
.pm = PCI_PM_OPS_PTR,
.num_vf = pci_bus_num_vf,
.force_dma = true,
};
EXPORT_SYMBOL(pci_bus_type);
static int __init pci_driver_init(void)
{
return bus_register(&pci_bus_type);
}
postcore_initcall(pci_driver_init);
Qemu中的pci-assign设备
- 快速查一下
Qemu
源码中引用pci-assign
的位置,我们要看的源码应该就在hw/i386/kvm/pci-assign.c
这个位置(至于为什么?