1. Overview
DPDK maps hardware resources (MMIO, I/O ports, interrupts) into user space through the kernel's UIO mechanism. The underlying principle is simple; everything builds on basic kernel mechanisms.
2. Implementation
2.1 The UIO driver
2.1.1 API
- uio_register_device(parent, info)
where info is the structure below, which the caller must provide:
struct uio_info {
struct uio_device *uio_dev;
const char *name;
const char *version;
struct uio_mem mem[MAX_UIO_MAPS];
struct uio_port port[MAX_UIO_PORT_REGIONS];
long irq;
unsigned long irq_flags;
void *priv;
irqreturn_t (*handler)(int irq, struct uio_info *dev_info);
int (*mmap)(struct uio_info *info, struct vm_area_struct *vma);
int (*open)(struct uio_info *info, struct inode *inode);
int (*release)(struct uio_info *info, struct inode *inode);
int (*irqcontrol)(struct uio_info *info, s32 irq_on);
};
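To make the API concrete, here is a minimal sketch of a kernel module that registers one UIO device. Everything in it (the uio_demo name, the platform driver, the hard-coded physical window) is illustrative, not taken from the sources analyzed here:
#include <linux/module.h>
#include <linux/platform_device.h>
#include <linux/uio_driver.h>

/* Hypothetical MMIO window; a real driver takes this from its bus resources. */
#define DEMO_PHYS_ADDR 0xfeb00000UL
#define DEMO_SIZE      0x1000UL

static struct uio_info demo_info = {
	.name    = "uio_demo",
	.version = "0.1",
	.irq     = UIO_IRQ_NONE,	/* no interrupt in this sketch */
};

static int demo_probe(struct platform_device *pdev)
{
	demo_info.mem[0].name    = "demo_regs";
	demo_info.mem[0].addr    = DEMO_PHYS_ADDR;
	demo_info.mem[0].size    = DEMO_SIZE;
	demo_info.mem[0].memtype = UIO_MEM_PHYS;

	/* creates /dev/uioX plus the sysfs maps/ attributes discussed below */
	return uio_register_device(&pdev->dev, &demo_info);
}

static int demo_remove(struct platform_device *pdev)
{
	uio_unregister_device(&demo_info);
	return 0;
}

static struct platform_driver demo_driver = {
	.probe  = demo_probe,
	.remove = demo_remove,
	.driver = { .name = "uio_demo" },
};
module_platform_driver(demo_driver);
MODULE_LICENSE("GPL");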
2.1.2 UIO initialization
[drivers/uio/uio.c]
uio_init->uio_major_init
The module init function uio_init just registers a character device driver; it is fairly conventional. Note the file operations of the uio cdev:
static const struct file_operations uio_fops = {
.owner = THIS_MODULE,
.open = uio_open,
.release = uio_release,
.read = uio_read,
.write = uio_write,
.mmap = uio_mmap,
.poll = uio_poll,
.fasync = uio_fasync,
.llseek = noop_llseek,
};
Finally, note the UIO major number and cdev:
uio_major = MAJOR(uio_dev);
uio_cdev = cdev;
2.1.3 UIO registration
#define uio_register_device(parent, info) \
__uio_register_device(THIS_MODULE, parent, info)
int __uio_register_device(struct module *owner,
struct device *parent,
struct uio_info *info)
{
struct uio_device *idev;
int ret = 0;
idev = devm_kzalloc(parent, sizeof(*idev), GFP_KERNEL);
idev->owner = owner;
idev->info = info;
init_waitqueue_head(&idev->wait);
atomic_set(&idev->event, 0);
ret = uio_get_minor(idev);
if (ret)
return ret;
idev->dev = device_create(&uio_class, parent,
MKDEV(uio_major, idev->minor), idev,
"uio%d", idev->minor);
ret = uio_dev_add_attributes(idev);
if (ret)
goto err_uio_dev_add_attributes;
info->uio_dev = idev;
if (info->irq && (info->irq != UIO_IRQ_CUSTOM)) {
ret = request_irq(info->irq, uio_interrupt,
info->irq_flags, info->name, idev);
if (ret)
goto err_request_irq;
}
return 0;
}
Irrelevant parts of the code have been removed. As you can see, registering a UIO device means building a uio_device from the uio_info and plugging it into the driver model. Let's look at uio_device:
struct uio_device {
struct module *owner;
struct device *dev;
int minor;
atomic_t event;
struct fasync_struct *async_queue;
wait_queue_head_t wait;
struct uio_info *info;
struct kobject *map_dir;
struct kobject *portio_dir;
};
- uio_get_minor allocates a new minor number
- The most important remaining step is uio_dev_add_attributes
uio_dev_add_attributes stores the resources. First, look at UIO's resource abstraction:
struct uio_mem {
const char *name;
phys_addr_t addr;
unsigned long offs;
resource_size_t size;
int memtype;
void __iomem *internal_addr;
struct uio_map *map;
};
Let's walk through uio_dev_add_attributes piece by piece, starting with the uio_mem initialization:
for (mi = 0; mi < MAX_UIO_MAPS; mi++) {
mem = &idev->info->mem[mi];
if (mem->size == 0)
break;
if (!map_found) {
map_found = 1;
idev->map_dir = kobject_create_and_add("maps",
&idev->dev->kobj);
if (!idev->map_dir) {
ret = -ENOMEM;
goto err_map;
}
}
map = kzalloc(sizeof(*map), GFP_KERNEL);
if (!map) {
ret = -ENOMEM;
goto err_map;
}
kobject_init(&map->kobj, &map_attr_type);
map->mem = mem;
mem->map = map;
ret = kobject_add(&map->kobj, idev->map_dir, "map%d", mi);
if (ret)
goto err_map_kobj;
ret = kobject_uevent(&map->kobj, KOBJ_ADD);
if (ret)
goto err_map_kobj;
}
- Based on the mem entries passed in, a maps directory is created under the parent device's sysfs directory, with one maps/mapX entry per mapping (readable from userspace; see the sketch after the portio loop below)
for (pi = 0; pi < MAX_UIO_PORT_REGIONS; pi++) {
port = &idev->info->port[pi];
if (port->size == 0)
break;
if (!portio_found) {
portio_found = 1;
idev->portio_dir = kobject_create_and_add("portio",
&idev->dev->kobj);
if (!idev->portio_dir) {
ret = -ENOMEM;
goto err_portio;
}
}
portio = kzalloc(sizeof(*portio), GFP_KERNEL);
if (!portio) {
ret = -ENOMEM;
goto err_portio;
}
kobject_init(&portio->kobj, &portio_attr_type);
portio->port = port;
port->portio = portio;
ret = kobject_add(&portio->kobj, idev->portio_dir,
"port%d", pi);
if (ret)
goto err_portio_kobj;
ret = kobject_uevent(&portio->kobj, KOBJ_ADD);
if (ret)
goto err_portio_kobj;
}
- Same as above, except the directory is named portio, representing port I/O mappings
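From userspace, these kobjects surface as ordinary sysfs files such as /sys/class/uio/uioX/maps/mapN/{name,addr,size,offset}. A minimal sketch of reading a mapping's attributes back; the uio0/map0 path is illustrative:
#include <stdio.h>
#include <stdlib.h>

/* read one attribute of /sys/class/uio/uio0/maps/map0; uio0/map0 are examples */
static unsigned long read_map_attr(const char *attr)
{
	char path[128], buf[64];
	FILE *f;

	snprintf(path, sizeof(path),
		"/sys/class/uio/uio0/maps/map0/%s", attr);
	f = fopen(path, "r");
	if (f == NULL || fgets(buf, sizeof(buf), f) == NULL) {
		perror(path);
		exit(1);
	}
	fclose(f);
	return strtoul(buf, NULL, 0);	/* the kernel prints these as 0x... */
}

int main(void)
{
	printf("addr=0x%lx size=0x%lx\n",
		read_map_attr("addr"), read_map_attr("size"));
	return 0;
}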
2.2 The igb_uio driver
igb_uio is a PCI driver; this section looks at how DPDK uses the Linux UIO interface.
2.2.1 Initialization
This is all standard: it mainly obtains the PCI device's MMIO and I/O port resources and registers them, so the code is quoted directly:
static int __init igbuio_pci_init_module(void)
{
igbuio_config_intr_mode(intr_mode);
return pci_register_driver(&igbuio_pci_driver);
}
intr_mode is a module parameter with the values msix, msi, and legacy; MSI-X is the default interrupt mode.
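A sketch of what that parameter declaration looks like, reconstructed rather than quoted from the igb_uio source; igbuio_config_intr_mode then parses this string at module load:
/* reconstructed sketch of the module-parameter declaration */
static char *intr_mode;
module_param(intr_mode, charp, S_IRUGO);
MODULE_PARM_DESC(intr_mode,
"igb_uio interrupt mode (default=msix):\n"
"    msix       Use MSIX interrupt\n"
"    msi        Use MSI interrupt\n"
"    legacy     Use Legacy interrupt\n");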
static struct pci_driver igbuio_pci_driver = {
.name = "igb_uio",
.id_table = NULL,
.probe = igbuio_pci_probe,
.remove = igbuio_pci_remove,
};
When the bus is scanned and a device matches, the driver's probe is executed, i.e. igbuio_pci_probe:
[kernel/linux/igb_uio/igb_uio.c]
static int
igbuio_pci_probe(struct pci_dev *dev, const struct pci_device_id *id)
{
udev = kzalloc(sizeof(struct rte_uio_pci_dev), GFP_KERNEL);
pci_enable_device(dev);
pci_set_master(dev);
igbuio_setup_bars(dev, &udev->info);
pci_set_dma_mask(dev, DMA_BIT_MASK(64));
/* fill uio infos */
udev->info.name = "igb_uio";
udev->info.version = "0.1";
udev->info.irqcontrol = igbuio_pci_irqcontrol;
udev->info.open = igbuio_pci_open;
udev->info.release = igbuio_pci_release;
udev->info.priv = udev;
udev->pdev = dev;
atomic_set(&udev->refcnt, 0);
err = sysfs_create_group(&dev->dev.kobj, &dev_attr_grp);
if (err != 0)
goto fail_release_iomem;
/* register uio driver */
err = uio_register_device(&dev->dev, &udev->info);
if (err != 0)
goto fail_remove_group;
pci_set_drvdata(dev, udev);
}
- igbuio_pci_probe performs the basic PCI driver operations, such as enabling the device and DMA; its main job is mapping the device's config space, memory space, and MSI-X interrupts
The parameters analyzed in the previous section live in the device's private data:
struct rte_uio_pci_dev {
struct uio_info info;
struct pci_dev *pdev;
enum rte_intr_mode mode;
atomic_t refcnt;
};
igbuio_setup_bars reads the BAR addresses and fills the corresponding info->mem entries according to the type of space each BAR points to (memory or I/O):
static int igbuio_pci_setup_iomem(struct pci_dev *dev, struct uio_info *info,
int n, int pci_bar, const char *name)
{
unsigned long addr, len;
void *internal_addr;
addr = pci_resource_start(dev, pci_bar);
len = pci_resource_len(dev, pci_bar);
if (addr == 0 || len == 0)
return -1;
if (wc_activate == 0) {
internal_addr = ioremap(addr, len);
if (internal_addr == NULL)
return -1;
} else {
internal_addr = NULL;
}
info->mem[n].name = name;
info->mem[n].addr = addr;
info->mem[n].internal_addr = internal_addr;
info->mem[n].size = len;
info->mem[n].memtype = UIO_MEM_PHYS;
return 0;
}
- addr is the starting physical address (in the PCI domain); internal_addr is the ioremap'ed virtual address, which lies in the vmalloc range but allocates no memory. The I/O port path is similar and is not quoted in full; a sketch follows.
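For completeness, a sketch of the analogous I/O port path, reconstructed on the assumption that it mirrors the iomem variant; it fills a uio_port entry instead of a uio_mem one:
/* reconstructed sketch, assuming it mirrors igbuio_pci_setup_iomem */
static int
igbuio_pci_setup_ioport(struct pci_dev *dev, struct uio_info *info,
		int n, int pci_bar, const char *name)
{
	unsigned long addr, len;

	addr = pci_resource_start(dev, pci_bar);
	len = pci_resource_len(dev, pci_bar);
	if (addr == 0 || len == 0)
		return -EINVAL;

	info->port[n].name = name;
	info->port[n].start = addr;
	info->port[n].size = len;
	info->port[n].porttype = UIO_PORT_X86;

	return 0;
}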
2.3 Userspace usage
By now UIO has recorded all of igb_uio's resources (MMIO and I/O ports) and exposed the corresponding character device file to the user. Next we analyze how userspace maps the device resources.
During the userspace PCI driver probe, if the driver set the RTE_PCI_DRV_NEED_MAPPING flag, address mapping is performed by this function:
int rte_pci_map_device(struct rte_pci_device *dev)
{
int ret = -1;
/* try mapping the NIC resources using VFIO if it exists */
switch (dev->kdrv) {
case RTE_KDRV_VFIO:
#ifdef VFIO_PRESENT
if (pci_vfio_is_enabled())
ret = pci_vfio_map_resource(dev);
#endif
break;
case RTE_KDRV_IGB_UIO:
case RTE_KDRV_UIO_GENERIC:
if (rte_eal_using_phys_addrs()) {
/* map resources for devices that use uio */
ret = pci_uio_map_resource(dev);
}
break;
default:
RTE_LOG(DEBUG, EAL,
" Not managed by a supported kernel driver, skipped\n");
ret = 1;
break;
}
return ret;
}
2.3.1 pci_uio_alloc_resource
We focus on the UIO path above, pci_uio_map_resource. The function has two main parts; first, pci_uio_alloc_resource.
[drivers/bus/pci/pci_common_uio.c]
int pci_uio_alloc_resource(struct rte_pci_device *dev, struct mapped_pci_resource **uio_res)
{
char dirname[PATH_MAX];
char cfgname[PATH_MAX];
char devname[PATH_MAX]; /* contains the /dev/uioX */
int uio_num;
struct rte_pci_addr *loc;
loc = &dev->addr;
/* find uio resource */
uio_num = pci_get_uio_dev(dev, dirname, sizeof(dirname), 1);
if (uio_num < 0) {
RTE_LOG(WARNING, EAL, " "PCI_PRI_FMT" not managed by UIO driver, "
"skipping\n", loc->domain, loc->bus, loc->devid, loc->function);
return 1;
}
snprintf(devname, sizeof(devname), "/dev/uio%u", uio_num);
/* save fd if in primary process */
dev->intr_handle.fd = open(devname, O_RDWR);
if (dev->intr_handle.fd < 0) {
RTE_LOG(ERR, EAL, "Cannot open %s: %s\n",
devname, strerror(errno));
goto error;
}
snprintf(cfgname, sizeof(cfgname),
"/sys/class/uio/uio%u/device/config", uio_num);
dev->intr_handle.uio_cfg_fd = open(cfgname, O_RDWR);
if (dev->intr_handle.uio_cfg_fd < 0) {
RTE_LOG(ERR, EAL, "Cannot open %s: %s\n",
cfgname, strerror(errno));
goto error;
}
if (dev->kdrv == RTE_KDRV_IGB_UIO)
dev->intr_handle.type = RTE_INTR_HANDLE_UIO;
else {
dev->intr_handle.type = RTE_INTR_HANDLE_UIO_INTX;
/* set bus master that is not done by uio_pci_generic */
if (pci_uio_set_bus_master(dev->intr_handle.uio_cfg_fd)) {
RTE_LOG(ERR, EAL, "Cannot set up bus mastering!\n");
goto error;
}
}
/* allocate the mapping details for secondary processes*/
*uio_res = rte_zmalloc("UIO_RES", sizeof(**uio_res), 0);
if (*uio_res == NULL) {
RTE_LOG(ERR, EAL,
"%s(): cannot store uio mmap details\n", __func__);
goto error;
}
snprintf((*uio_res)->path, sizeof((*uio_res)->path), "%s", devname);
memcpy(&(*uio_res)->pci_addr, &dev->addr, sizeof((*uio_res)->pci_addr));
return 0;
error:
pci_uio_free_resource(dev, *uio_res);
return -1;
}
- First find the corresponding character device under /sys/bus/pci/devices/DBDF/uio (/sys/bus/pci/devices/DBDF is a symlink to the actual device) to determine the actual uioX; /dev/uioX can then be opened and assigned to dev->intr_handle.fd (see the lookup sketch after this list)
- Also open /sys/class/uio/uio%u/device/config and assign it to dev->intr_handle.uio_cfg_fd
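pci_get_uio_dev essentially scans that uio subdirectory for an entry named uioN. A minimal sketch of the lookup, with the DBDF string as a placeholder:
#include <dirent.h>
#include <stdio.h>

/* find the uio number of a PCI device; "0000:03:00.0" is a typical dbdf */
static int find_uio_num(const char *dbdf)
{
	char dirname[256];
	struct dirent *e;
	DIR *dir;
	int uio_num = -1;

	snprintf(dirname, sizeof(dirname),
		"/sys/bus/pci/devices/%s/uio", dbdf);
	dir = opendir(dirname);
	if (dir == NULL)
		return -1;	/* device not bound to a UIO driver */

	while ((e = readdir(dir)) != NULL) {
		/* entries look like "uio0", "uio1", ... */
		if (sscanf(e->d_name, "uio%d", &uio_num) == 1)
			break;
	}
	closedir(dir);
	return uio_num;
}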
Opening /dev/uioX invokes uio_open, the UIO character driver's open method.
The function does two things. First, it allocates the file's private data:
struct uio_listener {
struct uio_device *dev;
s32 event_count;
};
If idev->info->open is registered it is called; for igb_uio this is igbuio_pci_open->igbuio_pci_enable_interrupts.
Here we only analyze the default MSI-X mode. The igb_uio interrupt is only allocated at this point: info->irq starts out as UIO_IRQ_NONE, so the request_irq during registration was never executed.
case RTE_INTR_MODE_MSIX:
/* Only 1 msi-x vector needed */
#ifndef HAVE_ALLOC_IRQ_VECTORS
msix_entry.entry = 0;
if (pci_enable_msix(udev->pdev, &msix_entry, 1) == 0) {
dev_dbg(&udev->pdev->dev, "using MSI-X");
udev->info.irq_flags = IRQF_NO_THREAD;
udev->info.irq = msix_entry.vector;
udev->mode = RTE_INTR_MODE_MSIX;
break;
}
#else
if (pci_alloc_irq_vectors(udev->pdev, 1, 1, PCI_IRQ_MSIX) == 1) {
dev_dbg(&udev->pdev->dev, "using MSI-X");
udev->info.irq_flags = IRQF_NO_THREAD;
udev->info.irq = pci_irq_vector(udev->pdev, 0);
udev->mode = RTE_INTR_MODE_MSIX;
break;
}
#endif
if (udev->info.irq != UIO_IRQ_NONE) /* the interrupt is registered here */
err = request_irq(udev->info.irq, igbuio_pci_irqhandler,
udev->info.irq_flags, udev->info.name,
udev);
dev_info(&udev->pdev->dev, "uio device registered with irq %ld\n",
udev->info.irq);
- pci_enable_msix (or pci_alloc_irq_vectors) obtains the irq, and request_irq registers the MSI-X interrupt with the kernel. Note that for igb_uio the request_irq inside __uio_register_device was never executed when udev was registered, and only one interrupt vector is used here.
The interrupt handler:
static irqreturn_t igbuio_pci_irqhandler(int irq, void *dev_id)
{
struct rte_uio_pci_dev *udev = (struct rte_uio_pci_dev *)dev_id;
struct uio_info *info = &udev->info;
/* Legacy mode need to mask in hardware */
if (udev->mode == RTE_INTR_MODE_LEGACY &&
!pci_check_and_mask_intx(udev->pdev))
return IRQ_NONE;
uio_event_notify(info);
/* Message signal mode, no share IRQ and automasked */
return IRQ_HANDLED;
}
It mainly calls uio_event_notify:
void uio_event_notify(struct uio_info *info)
{
struct uio_device *idev = info->uio_dev;
atomic_inc(&idev->event);
wake_up_interruptible(&idev->wait);
kill_fasync(&idev->async_queue, SIGIO, POLL_IN);
}
So when the interrupt fires, the userspace epoll is woken and the registered interrupt callback can run.
As seen in the DPDK initialization analysis, interrupt setup registers per-driver callbacks, and the interrupt control thread scans the fds handed over at registration. That fd is exactly the dev->intr_handle.fd above; the interrupt init function rte_eal_intr_init adds it to the epoll set. Since the file is the character device /dev/uioX, let's look at its poll method, uio_poll:
static unsigned int uio_poll(struct file *filep, poll_table *wait)
{
struct uio_listener *listener = filep->private_data;
struct uio_device *idev = listener->dev;
if (!idev->info->irq)
return -EIO;
poll_wait(filep, &idev->wait, wait);
if (listener->event_count != atomic_read(&idev->event))
return POLLIN | POLLRDNORM;
return 0;
}
One more piece: interrupt enabling. Userspace enables the interrupt by writing a 1 (as an s32) to the character device, which lands in the driver's uio_write:
static ssize_t uio_write(struct file *filep, const char __user *buf,
size_t count, loff_t *ppos)
{
struct uio_listener *listener = filep->private_data;
struct uio_device *idev = listener->dev;
ssize_t retval;
s32 irq_on;
if (!idev->info->irq)
return -EIO;
if (count != sizeof(s32))
return -EINVAL;
if (!idev->info->irqcontrol)
return -ENOSYS;
if (copy_from_user(&irq_on, buf, count))
return -EFAULT;
retval = idev->info->irqcontrol(idev->info, irq_on);
return retval ? retval : sizeof(s32);
}
It simply calls idev->info->irqcontrol, which for the igb_uio driver is igbuio_pci_irqcontrol:
static int
igbuio_pci_irqcontrol(struct uio_info *info, s32 irq_state)
{
struct rte_uio_pci_dev *udev = info->priv;
struct pci_dev *pdev = udev->pdev;
#ifdef HAVE_PCI_MSI_MASK_IRQ
struct irq_data *irq = irq_get_irq_data(udev->info.irq);
#endif
pci_cfg_access_lock(pdev);
if (udev->mode == RTE_INTR_MODE_MSIX || udev->mode == RTE_INTR_MODE_MSI) {
#ifdef HAVE_PCI_MSI_MASK_IRQ
if (irq_state == 1)
pci_msi_unmask_irq(irq);
else
pci_msi_mask_irq(irq);
#else
igbuio_mask_irq(pdev, udev->mode, irq_state);
#endif
}
if (udev->mode == RTE_INTR_MODE_LEGACY)
pci_intx(pdev, !!irq_state);
pci_cfg_access_unlock(pdev);
return 0;
}
The core of it is pci_msi_unmask_irq.
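Putting the kernel pieces together, a typical userspace consumer looks like the following sketch: write a 1 to unmask, block in poll, read the event counter, repeat. DPDK's interrupt thread does the epoll equivalent of this loop; /dev/uio0 is a placeholder:
#include <fcntl.h>
#include <poll.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* /dev/uio0 stands in for the device found earlier */
	int fd = open("/dev/uio0", O_RDWR);
	int32_t enable = 1, event_count;
	struct pollfd pfd = { .fd = fd, .events = POLLIN };

	if (fd < 0)
		return 1;

	for (;;) {
		/* uio_write -> info->irqcontrol: unmask the interrupt */
		write(fd, &enable, sizeof(enable));
		/* uio_poll: sleep until uio_event_notify wakes us up */
		poll(&pfd, 1, -1);
		/* uio_read returns the total event count */
		read(fd, &event_count, sizeof(event_count));
		printf("interrupt, %d events so far\n", event_count);
	}
	return 0;
}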
Back to the original function: after opening and assigning the two fds, it fills in the following structure:
struct mapped_pci_resource {
TAILQ_ENTRY(mapped_pci_resource) next;
struct rte_pci_addr pci_addr;
char path[PATH_MAX]; // /dev/uioX
int nb_maps;
struct pci_map maps[PCI_MAX_RESOURCE];
struct pci_msix_table msix_table;
};
After this, the real mapping work can begin.
2.3.2 pci_uio_map_resource_by_index
int pci_uio_map_resource_by_index(struct rte_pci_device *dev, int res_idx,
struct mapped_pci_resource *uio_res, int map_idx)
{
int fd = -1;
char devname[PATH_MAX];
void *mapaddr;
struct rte_pci_addr *loc;
struct pci_map *maps;
int wc_activate = 0;
if (dev->driver != NULL)
wc_activate = dev->driver->drv_flags & RTE_PCI_DRV_WC_ACTIVATE;
loc = &dev->addr;
maps = uio_res->maps;
/* allocate memory to keep path */
maps[map_idx].path = rte_malloc(NULL, sizeof(devname), 0);
if (maps[map_idx].path == NULL) {
RTE_LOG(ERR, EAL, "Cannot allocate memory for path: %s\n",
strerror(errno));
return -1;
}
/*
* open resource file, to mmap it
*/
if (!wc_activate || fd < 0) {
snprintf(devname, sizeof(devname),
"%s/" PCI_PRI_FMT "/resource%d",
rte_pci_get_sysfs_path(),
loc->domain, loc->bus, loc->devid,
loc->function, res_idx);
/* then try to map resource file */
fd = open(devname, O_RDWR);
}
/* try mapping somewhere close to the end of hugepages */
if (pci_map_addr == NULL)
pci_map_addr = pci_find_max_end_va();
mapaddr = pci_map_resource(pci_map_addr, fd, 0,
(size_t)dev->mem_resource[res_idx].len, 0);
close(fd);
if (mapaddr == MAP_FAILED)
goto error;
pci_map_addr = RTE_PTR_ADD(mapaddr,
(size_t)dev->mem_resource[res_idx].len);
maps[map_idx].phaddr = dev->mem_resource[res_idx].phys_addr;
maps[map_idx].size = dev->mem_resource[res_idx].len;
maps[map_idx].addr = mapaddr;
maps[map_idx].offset = 0;
strcpy(maps[map_idx].path, devname);
dev->mem_resource[res_idx].addr = mapaddr;
return 0;
error:
rte_free(maps[map_idx].path);
return -1;
}
- Resources are still mapped through /sys/bus/pci/devices/DBDF/resourceN
- An address near the end of the hugepage area is chosen, and the resource is mmap'ed there for the length of the mem region via pci_map_resource (sketched below)
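pci_map_resource itself is a thin wrapper around mmap. A simplified sketch of its core, with error logging omitted:
#include <sys/types.h>
#include <sys/mman.h>

/* simplified sketch of DPDK's pci_map_resource; error logging omitted */
static void *
pci_map_resource(void *requested_addr, int fd, off_t offset, size_t size,
		int additional_flags)
{
	void *mapaddr;

	/* MAP_SHARED so that stores reach the device registers */
	mapaddr = mmap(requested_addr, size, PROT_READ | PROT_WRITE,
			MAP_SHARED | additional_flags, fd, offset);
	return mapaddr;	/* MAP_FAILED on error */
}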
For the resources a PCI device exposes through sysfs, see Documentation/filesystems/sysfs-pci.txt:
resource PCI resource host addresses (ascii, ro)
resource0..N PCI resource N, if present (binary, mmap, rw[1])
resource0_wc..N_wc PCI WC map resource N, if prefetchable (binary, mmap)
This part is implemented in:
[drivers/pci/pci-sysfs.c]
static int pci_create_attr(struct pci_dev *pdev, int num, int write_combine)
{
/* allocate attribute structure, piggyback attribute name */
int name_len = write_combine ? 13 : 10;
struct bin_attribute *res_attr;
char *res_attr_name;
int retval;
res_attr = kzalloc(sizeof(*res_attr) + name_len, GFP_ATOMIC);
if (!res_attr)
return -ENOMEM;
res_attr_name = (char *)(res_attr + 1);
sysfs_bin_attr_init(res_attr);
if (write_combine) {
pdev->res_attr_wc[num] = res_attr;
sprintf(res_attr_name, "resource%d_wc", num);
res_attr->mmap = pci_mmap_resource_wc;
} else {
pdev->res_attr[num] = res_attr;
sprintf(res_attr_name, "resource%d", num);
if (pci_resource_flags(pdev, num) & IORESOURCE_IO) {
res_attr->read = pci_read_resource_io;
res_attr->write = pci_write_resource_io;
if (arch_can_pci_mmap_io())
res_attr->mmap = pci_mmap_resource_uc;
} else {
res_attr->mmap = pci_mmap_resource_uc;
}
}
res_attr->attr.name = res_attr_name;
res_attr->attr.mode = S_IRUSR | S_IWUSR;
res_attr->size = pci_resource_len(pdev, num);
res_attr->private = (void *)(unsigned long)num;
retval = sysfs_create_bin_file(&pdev->dev.kobj, res_attr);
if (retval)
kfree(res_attr);
return retval;
}
Note that mapping PCI MMIO space differs from mapping ordinary memory: no physical memory is allocated, only page tables are built. When this range of virtual addresses is accessed, the MMU translates it to physical addresses that are claimed by the PCI host bridge (rather than the memory controller), and the host bridge translates the physical address into a bus address to complete the access.
When the user calls mmap, sysfs's pci_mmap_resource_uc is invoked, which lands in pci_mmap_resource_range:
int pci_mmap_resource_range(struct pci_dev *pdev, int bar,
struct vm_area_struct *vma,
enum pci_mmap_state mmap_state, int write_combine)
{
unsigned long size;
int ret;
size = ((pci_resource_len(pdev, bar) - 1) >> PAGE_SHIFT) + 1;
if (vma->vm_pgoff + vma_pages(vma) > size)
return -EINVAL;
if (write_combine)
vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
else
vma->vm_page_prot = pgprot_device(vma->vm_page_prot);
if (mmap_state == pci_mmap_io) {
ret = pci_iobar_pfn(pdev, bar, vma);
if (ret)
return ret;
} else
vma->vm_pgoff += (pci_resource_start(pdev, bar) >> PAGE_SHIFT);
vma->vm_ops = &pci_phys_vm_ops;
return io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
vma->vm_end - vma->vm_start,
vma->vm_page_prot);
}
The implementation is conventional: given the vma, io_remap_pfn_range builds the page tables.
OK, at this point the mapping of the BAR space into user space is established. What gets filled in is:
struct pci_map {
void *addr;
char *path;
uint64_t offset;
uint64_t size;
uint64_t phaddr;
};
maps[map_idx].phaddr = dev->mem_resource[res_idx].phys_addr;
maps[map_idx].size = dev->mem_resource[res_idx].len;
maps[map_idx].addr = mapaddr;
maps[map_idx].offset = 0;
strcpy(maps[map_idx].path, devname);
dev->mem_resource[res_idx].addr = mapaddr;
As you can see, once complete, the MMIO mapping is stored in dev->mem_resource, so the driver can use it directly; a register-access sketch follows.
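With dev->mem_resource[i].addr in hand, the poll-mode driver touches device registers through plain volatile loads and stores. A sketch; the register offset is hypothetical and would come from the NIC datasheet:
#include <stdint.h>

/* hypothetical register offset; real offsets come from the NIC datasheet */
#define DEMO_REG_CTRL 0x0000

static inline uint32_t
bar_read32(volatile void *bar, uint32_t off)
{
	return *(volatile uint32_t *)((volatile uint8_t *)bar + off);
}

static inline void
bar_write32(volatile void *bar, uint32_t off, uint32_t val)
{
	*(volatile uint32_t *)((volatile uint8_t *)bar + off) = val;
}

/* usage, with the mapping established above:
 *   void *bar0 = dev->mem_resource[0].addr;
 *   uint32_t ctrl = bar_read32(bar0, DEMO_REG_CTRL);
 */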