I. Basic Data Structures
An earlier article covered how the virtio sources are laid out in DPDK. At the bottom of that stack sits the device abstraction in virtio_pci.h and virtio_pci.c, which handle detection of the virtio PCI device and implement the corresponding driver plumbing. Let's start with the basic data structures and macro definitions:
/* VirtIO PCI vendor/device ID. */
#define VIRTIO_PCI_VENDORID 0x1AF4
#define VIRTIO_PCI_LEGACY_DEVICEID_NET 0x1000
#define VIRTIO_PCI_MODERN_DEVICEID_NET 0x1041
/* VirtIO ABI version, this must match exactly. */
#define VIRTIO_PCI_ABI_VERSION 0
/*
* VirtIO Header, located in BAR 0.
*/
#define VIRTIO_PCI_HOST_FEATURES 0 /* host's supported features (32bit, RO)*/
#define VIRTIO_PCI_GUEST_FEATURES 4 /* guest's supported features (32, RW) */
#define VIRTIO_PCI_QUEUE_PFN 8 /* physical address of VQ (32, RW) */
#define VIRTIO_PCI_QUEUE_NUM 12 /* number of ring entries (16, RO) */
#define VIRTIO_PCI_QUEUE_SEL 14 /* current VQ selection (16, RW) */
#define VIRTIO_PCI_QUEUE_NOTIFY 16 /* notify host regarding VQ (16, RW) */
#define VIRTIO_PCI_STATUS 18 /* device status register (8, RW) */
#define VIRTIO_PCI_ISR 19 /* interrupt status register, reading
* also clears the register (8, RO) */
/* Only if MSIX is enabled: */
#define VIRTIO_MSI_CONFIG_VECTOR 20 /* configuration change vector (16, RW) */
#define VIRTIO_MSI_QUEUE_VECTOR 22 /* vector for selected VQ notifications
(16, RW) */
/* The bit of the ISR which indicates a device has an interrupt. */
#define VIRTIO_PCI_ISR_INTR 0x1
/* The bit of the ISR which indicates a device configuration change. */
#define VIRTIO_PCI_ISR_CONFIG 0x2
/* Vector value used to disable MSI for queue. */
#define VIRTIO_MSI_NO_VECTOR 0xFFFF
/* VirtIO device IDs. */
#define VIRTIO_ID_NETWORK 0x01
#define VIRTIO_ID_BLOCK 0x02
#define VIRTIO_ID_CONSOLE 0x03
#define VIRTIO_ID_ENTROPY 0x04
#define VIRTIO_ID_BALLOON 0x05
#define VIRTIO_ID_IOMEMORY 0x06
#define VIRTIO_ID_9P 0x09
/* Status byte for guest to report progress. */
#define VIRTIO_CONFIG_STATUS_RESET 0x00
#define VIRTIO_CONFIG_STATUS_ACK 0x01
#define VIRTIO_CONFIG_STATUS_DRIVER 0x02
#define VIRTIO_CONFIG_STATUS_DRIVER_OK 0x04
#define VIRTIO_CONFIG_STATUS_FEATURES_OK 0x08
#define VIRTIO_CONFIG_STATUS_FAILED 0x80
/*
* Each virtqueue indirect descriptor list must be physically contiguous.
* To allow us to malloc(9) each list individually, limit the number
* supported to what will fit in one page. With 4KB pages, this is a limit
* of 256 descriptors. If there is ever a need for more, we can switch to
* contigmalloc(9) for the larger allocations, similar to what
* bus_dmamem_alloc(9) does.
*
* Note the sizeof(struct vring_desc) is 16 bytes.
*/
#define VIRTIO_MAX_INDIRECT ((int) (PAGE_SIZE / 16))
/* The feature bitmap for virtio net */
#define VIRTIO_NET_F_CSUM 0 /* Host handles pkts w/ partial csum */
#define VIRTIO_NET_F_GUEST_CSUM 1 /* Guest handles pkts w/ partial csum */
#define VIRTIO_NET_F_MTU 3 /* Initial MTU advice. */
#define VIRTIO_NET_F_MAC 5 /* Host has given MAC address. */
#define VIRTIO_NET_F_GUEST_TSO4 7 /* Guest can handle TSOv4 in. */
#define VIRTIO_NET_F_GUEST_TSO6 8 /* Guest can handle TSOv6 in. */
#define VIRTIO_NET_F_GUEST_ECN 9 /* Guest can handle TSO[6] w/ ECN in. */
#define VIRTIO_NET_F_GUEST_UFO 10 /* Guest can handle UFO in. */
#define VIRTIO_NET_F_HOST_TSO4 11 /* Host can handle TSOv4 in. */
#define VIRTIO_NET_F_HOST_TSO6 12 /* Host can handle TSOv6 in. */
#define VIRTIO_NET_F_HOST_ECN 13 /* Host can handle TSO[6] w/ ECN in. */
#define VIRTIO_NET_F_HOST_UFO 14 /* Host can handle UFO in. */
#define VIRTIO_NET_F_MRG_RXBUF 15 /* Host can merge receive buffers. */
#define VIRTIO_NET_F_STATUS 16 /* virtio_net_config.status available */
#define VIRTIO_NET_F_CTRL_VQ 17 /* Control channel available */
#define VIRTIO_NET_F_CTRL_RX 18 /* Control channel RX mode support */
#define VIRTIO_NET_F_CTRL_VLAN 19 /* Control channel VLAN filtering */
#define VIRTIO_NET_F_CTRL_RX_EXTRA 20 /* Extra RX mode control support */
#define VIRTIO_NET_F_GUEST_ANNOUNCE 21 /* Guest can announce device on the
* network */
#define VIRTIO_NET_F_MQ 22 /* Device supports Receive Flow
* Steering */
#define VIRTIO_NET_F_CTRL_MAC_ADDR 23 /* Set MAC address */
/* Do we get callbacks when the ring is completely used, even if we've
* suppressed them? */
#define VIRTIO_F_NOTIFY_ON_EMPTY 24
/* Can the device handle any descriptor layout? */
#define VIRTIO_F_ANY_LAYOUT 27
/* We support indirect buffer descriptors */
#define VIRTIO_RING_F_INDIRECT_DESC 28
#define VIRTIO_F_VERSION_1 32
#define VIRTIO_F_IOMMU_PLATFORM 33
#define VIRTIO_F_RING_PACKED 34
/*
* Some VirtIO feature bits (currently bits 28 through 31) are
* reserved for the transport being used (eg. virtio_ring), the
* rest are per-device feature bits.
*/
#define VIRTIO_TRANSPORT_F_START 28
#define VIRTIO_TRANSPORT_F_END 34
/*
* Inorder feature indicates that all buffers are used by the device
* in the same order in which they have been made available.
*/
#define VIRTIO_F_IN_ORDER 35
/*
* This feature indicates that memory accesses by the driver and the device
* are ordered in a way described by the platform.
*/
#define VIRTIO_F_ORDER_PLATFORM 36
/* The Guest publishes the used index for which it expects an interrupt
* at the end of the avail ring. Host should ignore the avail->flags field. */
/* The Host publishes the avail index for which it expects a kick
* at the end of the used ring. Guest should ignore the used->flags field. */
#define VIRTIO_RING_F_EVENT_IDX 29
#define VIRTIO_NET_S_LINK_UP 1 /* Link is up */
#define VIRTIO_NET_S_ANNOUNCE 2 /* Announcement is needed */
/*
* Maximum number of virtqueues per device.
*/
#define VIRTIO_MAX_VIRTQUEUE_PAIRS 8
#define VIRTIO_MAX_VIRTQUEUES (VIRTIO_MAX_VIRTQUEUE_PAIRS * 2 + 1)
/* Common configuration */
#define VIRTIO_PCI_CAP_COMMON_CFG 1
/* Notifications */
#define VIRTIO_PCI_CAP_NOTIFY_CFG 2
/* ISR Status */
#define VIRTIO_PCI_CAP_ISR_CFG 3
/* Device specific configuration */
#define VIRTIO_PCI_CAP_DEVICE_CFG 4
/* PCI configuration access */
#define VIRTIO_PCI_CAP_PCI_CFG 5
The English comments in the macros above already make the grouping clear: device control registers, device-type IDs, ABI/version control, virtio feature bits, and device status/configuration values. A short sketch of how the status values are used during bring-up follows; the related data structures come after it.
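As a small usage sketch (not the literal DPDK init path): during device bring-up the driver writes successive status bits through vtpci_set_status(), whose declaration appears later in this article. Assuming that helper, the handshake looks roughly like this:
/* Sketch only: illustrates the order of the status bits, not the real init code. */
static void example_virtio_status_handshake(struct virtio_hw *hw)
{
	/* start from a clean slate */
	vtpci_set_status(hw, VIRTIO_CONFIG_STATUS_RESET);
	/* guest has noticed the device */
	vtpci_set_status(hw, VIRTIO_CONFIG_STATUS_ACK);
	/* guest knows how to drive it */
	vtpci_set_status(hw, VIRTIO_CONFIG_STATUS_DRIVER);
	/* ... feature negotiation happens here ... */
	vtpci_set_status(hw, VIRTIO_CONFIG_STATUS_FEATURES_OK);
	/* driver is fully set up, the device may now be used */
	vtpci_set_status(hw, VIRTIO_CONFIG_STATUS_DRIVER_OK);
	/* on a fatal error the driver would write VIRTIO_CONFIG_STATUS_FAILED instead */
}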
/* This is the PCI capability header: */
struct virtio_pci_cap {
uint8_t cap_vndr; /* Generic PCI field: PCI_CAP_ID_VNDR */
uint8_t cap_next; /* Generic PCI field: next ptr. */
uint8_t cap_len; /* Generic PCI field: capability length */
uint8_t cfg_type; /* Identifies the structure. */
uint8_t bar; /* Where to find it. */
uint8_t padding[3]; /* Pad to full dword. */
uint32_t offset; /* Offset within bar. */
uint32_t length; /* Length of the structure, in bytes. */
};
struct virtio_pci_notify_cap {
struct virtio_pci_cap cap;
uint32_t notify_off_multiplier; /* Multiplier for queue_notify_off. */
};
/* Fields in VIRTIO_PCI_CAP_COMMON_CFG: */
struct virtio_pci_common_cfg {
/* About the whole device. */
uint32_t device_feature_select; /* read-write */
uint32_t device_feature; /* read-only */
uint32_t guest_feature_select; /* read-write */
uint32_t guest_feature; /* read-write */
uint16_t msix_config; /* read-write */
uint16_t num_queues; /* read-only */
uint8_t device_status; /* read-write */
uint8_t config_generation; /* read-only */
/* About a specific virtqueue. */
uint16_t queue_select; /* read-write */
uint16_t queue_size; /* read-write, power of 2. */
uint16_t queue_msix_vector; /* read-write */
uint16_t queue_enable; /* read-write */
uint16_t queue_notify_off; /* read-only */
uint32_t queue_desc_lo; /* read-write */
uint32_t queue_desc_hi; /* read-write */
uint32_t queue_avail_lo; /* read-write */
uint32_t queue_avail_hi; /* read-write */
uint32_t queue_used_lo; /* read-write */
uint32_t queue_used_hi; /* read-write */
};
struct virtio_hw;
struct virtio_pci_ops {
void (*read_dev_cfg)(struct virtio_hw * hw, size_t offset,
void * dst, int len);
void (*write_dev_cfg)(struct virtio_hw * hw, size_t offset,
const void * src, int len);
uint8_t (*get_status)(struct virtio_hw * hw);
void (*set_status)(struct virtio_hw * hw, uint8_t status);
uint64_t (*get_features)(struct virtio_hw * hw);
void (*set_features)(struct virtio_hw * hw, uint64_t features);
uint8_t (*get_isr)(struct virtio_hw * hw);
uint16_t (*set_config_irq)(struct virtio_hw * hw, uint16_t vec);
uint16_t (*set_queue_irq)(struct virtio_hw * hw, struct virtqueue * vq,
uint16_t vec);
uint16_t (*get_queue_num)(struct virtio_hw * hw, uint16_t queue_id);
int (*setup_queue)(struct virtio_hw * hw, struct virtqueue * vq);
void (*del_queue)(struct virtio_hw * hw, struct virtqueue * vq);
void (*notify_queue)(struct virtio_hw * hw, struct virtqueue * vq);
};
struct virtio_net_config;
// abstraction of the hardware device
struct virtio_hw {
struct virtnet_ctl * cvq;
uint64_t req_guest_features;
uint64_t guest_features;
uint32_t max_queue_pairs;
bool started;
uint16_t max_mtu;
uint16_t vtnet_hdr_size;
uint8_t vlan_strip;
uint8_t use_msix;
uint8_t modern;
uint8_t use_simple_rx;
uint8_t use_inorder_rx;
uint8_t use_inorder_tx;
uint8_t weak_barriers;
bool has_tx_offload;
bool has_rx_offload;
uint16_t port_id;
uint8_t mac_addr[RTE_ETHER_ADDR_LEN];
uint32_t notify_off_multiplier;
uint8_t * isr;
uint16_t * notify_base;
struct virtio_pci_common_cfg * common_cfg;
struct virtio_net_config * dev_cfg;
void * virtio_user_dev;
/*
* App management thread and virtio interrupt handler thread
* both can change device state, this lock is meant to avoid
* such a contention.
*/
rte_spinlock_t state_lock;
struct rte_mbuf ** inject_pkts;
bool opened;
struct virtqueue ** vqs;
};
/*
* While virtio_hw is stored in shared memory, this structure stores
* some infos that may vary in the multiple process model locally.
* For example, the vtpci_ops pointer.
*/
struct virtio_hw_internal {
const struct virtio_pci_ops * vtpci_ops;
struct rte_pci_ioport io;
};
#define VTPCI_OPS(hw) (virtio_hw_internal[(hw)->port_id].vtpci_ops)
#define VTPCI_IO(hw) (&virtio_hw_internal[(hw)->port_id].io)
extern struct virtio_hw_internal virtio_hw_internal[RTE_MAX_ETHPORTS];
/*
* This structure is just a reference to read
* net device specific config space; it just a chodu structure
*
*/
struct virtio_net_config {
/* The config defining mac address (if VIRTIO_NET_F_MAC) */
uint8_t mac[RTE_ETHER_ADDR_LEN];
/* See VIRTIO_NET_F_STATUS and VIRTIO_NET_S_* above */
uint16_t status;
uint16_t max_virtqueue_pairs;
uint16_t mtu;
} __attribute__((packed));
/*
* How many bits to shift physical queue address written to QUEUE_PFN.
* 12 is historical, and due to x86 page size.
*/
#define VIRTIO_PCI_QUEUE_ADDR_SHIFT 12
/* The alignment to use between consumer and producer parts of vring. */
#define VIRTIO_PCI_VRING_ALIGN 4096
enum virtio_msix_status {
VIRTIO_MSIX_NONE = 0,
VIRTIO_MSIX_DISABLED = 1,
VIRTIO_MSIX_ENABLED = 2
};
Anyone with driver development experience will recognize the pattern: an abstracted PCI device needs a structure describing its capabilities, a structure for its configuration, and a structure of function pointers wrapping the concrete operations on the mapped hardware.
Reading the Linux kernel sources shows the same thing: this kind of device code is almost always built from a descriptor structure, a configuration structure and an operations (function-pointer) structure, with a few auxiliary data structures alongside. The code above confirms the pattern once again.
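To make that function-pointer indirection concrete: the vtpci_* wrappers declared further down in virtio_pci.h simply dispatch through the per-port ops table (legacy_ops or modern_ops) selected at init time. A sketch modeled on the real wrappers, not copied verbatim:
/* Sketch: dispatch through the per-port virtio_pci_ops table. */
uint8_t
vtpci_get_status(struct virtio_hw *hw)
{
	return VTPCI_OPS(hw)->get_status(hw);
}

void
vtpci_set_status(struct virtio_hw *hw, uint8_t status)
{
	/* status bits accumulate, except when resetting the device */
	if (status != VIRTIO_CONFIG_STATUS_RESET)
		status |= VTPCI_OPS(hw)->get_status(hw);
	VTPCI_OPS(hw)->set_status(hw, status);
}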
II. Main Functions
The main job of this layer is to lift the hardware one level up into an abstraction. As computer hardware evolved, various buses appeared and coexisted, and PCI has become the dominant one; in other words, most modern devices can be attached to the PCI bus. By abstracting PCI bus support, DPDK can handle storage, network and other devices in a uniform way, which makes device management simpler and new devices easier to add. Since DPDK drives devices directly from user space, it cannot step outside this framework either, and that is exactly what this layer is for.
A PCI device has its own configuration space, so DPDK needs matching definitions for it (macros and structures). Each PCI device exposes up to six BARs (base address registers), whose addresses and offsets are fixed in hardware when the device leaves the factory. The PCI layer maintains a list of PCI devices, and the structures shown earlier carry exactly what that list needs: the device address, the IDs, the physical addresses behind the BARs, interrupt information and the associated driver; it is against these IDs that drivers are matched.
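For virtio-net specifically, the IDs from section I are what the PMD advertises for that matching. The virtio PMD defines an ID table very much like the sketch below (named pci_id_virtio_map in virtio_ethdev.c); treat the listing as illustrative rather than a verbatim copy:
/* Sketch of the PCI ID match table built from the macros in section I. */
static const struct rte_pci_id pci_id_virtio_map[] = {
	{ RTE_PCI_DEVICE(VIRTIO_PCI_VENDORID, VIRTIO_PCI_LEGACY_DEVICEID_NET) },
	{ RTE_PCI_DEVICE(VIRTIO_PCI_VENDORID, VIRTIO_PCI_MODERN_DEVICEID_NET) },
	{ .vendor_id = 0 }, /* sentinel: terminates the table */
};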
For more detail on how the matching works, consult the PCI specification and related references.
III. Basic Flow
As mentioned above, this layer is responsible for discovering, registering and mapping PCI devices. Let's walk through the relevant source code:
1. Discovery
//drivers/bus/pci/pci_common.c
struct rte_pci_bus rte_pci_bus = {
.bus = {
.scan = rte_pci_scan,
.probe = pci_probe,
.find_device = pci_find_device,
.plug = pci_plug,
.unplug = pci_unplug,
.parse = pci_parse,
.dma_map = pci_dma_map,
.dma_unmap = pci_dma_unmap,
.get_iommu_class = rte_pci_get_iommu_class,
.dev_iterate = rte_pci_dev_iterate,
.hot_unplug_handler = pci_hot_unplug_handler,
.sigbus_handler = pci_sigbus_handler,
},
.device_list = TAILQ_HEAD_INITIALIZER(rte_pci_bus.device_list),
.driver_list = TAILQ_HEAD_INITIALIZER(rte_pci_bus.driver_list),
};
RTE_REGISTER_BUS(pci, rte_pci_bus.bus);
//pci.c
/*
* Scan the content of the PCI bus, and the devices in the devices
* list
*/
int
rte_pci_scan(void)
{
struct dirent * e;
DIR * dir;
char dirname[PATH_MAX];
struct rte_pci_addr addr;
/* for debug purposes, PCI can be disabled */
if (!rte_eal_has_pci())
return 0;
#ifdef VFIO_PRESENT
if (!pci_vfio_is_enabled())
RTE_LOG(DEBUG, EAL, "VFIO PCI modules not loaded\n");
#endif
dir = opendir(rte_pci_get_sysfs_path());
if (dir == NULL) {
RTE_LOG(ERR, EAL, "%s(): opendir failed: %s\n",
__func__, strerror(errno));
return -1;
}
while ((e = readdir(dir)) != NULL) {
if (e->d_name[0] == '.')
continue;
if (parse_pci_addr_format(e->d_name, sizeof(e->d_name), &addr) != 0)
continue;
snprintf(dirname, sizeof(dirname), "%s/%s",
rte_pci_get_sysfs_path(), e->d_name);
if (pci_scan_one(dirname, &addr) < 0)
goto error;
}
closedir(dir);
return 0;
error:
closedir(dir);
return -1;
}
In Linux everything is a file, so the code above first opens the sysfs directory (/sys/bus/pci/devices); if it exists, the loop walks its entries, each of which is named after a PCI address such as 0000:03:00.0 (see the parsing sketch below). pci_scan_one() then reads the individual files inside each device directory; its listing follows the sketch.
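A minimal, hypothetical sketch of turning such a directory name into an rte_pci_addr (the real code uses parse_pci_addr_format(), called in the loop above; struct rte_pci_addr comes from rte_pci.h, and <stdio.h>/<stdint.h> are needed for sscanf and the integer types):
/* Hypothetical helper: parse "DDDD:BB:DD.F" (e.g. "0000:03:00.0"). */
static int
example_parse_pci_addr(const char *name, struct rte_pci_addr *addr)
{
	unsigned int domain, bus, devid, function;

	if (sscanf(name, "%x:%x:%x.%x", &domain, &bus, &devid, &function) != 4)
		return -1;
	addr->domain = domain;
	addr->bus = (uint8_t)bus;
	addr->devid = (uint8_t)devid;
	addr->function = (uint8_t)function;
	return 0;
}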
/* Scan one pci sysfs entry, and fill the devices list from it. */
static int
pci_scan_one(const char * dirname, const struct rte_pci_addr * addr)
{
char filename[PATH_MAX];
unsigned long tmp;
struct rte_pci_device * dev;
char driver[PATH_MAX];
int ret;
dev = malloc(sizeof(*dev));
if (dev == NULL)
return -1;
memset(dev, 0, sizeof(*dev));
dev->device.bus = &rte_pci_bus.bus;
dev->addr = * addr;
/* get vendor id */
snprintf(filename, sizeof(filename), "%s/vendor", dirname);
if (eal_parse_sysfs_value(filename, &tmp) < 0) {
free(dev);
return -1;
}
dev->id.vendor_id = (uint16_t)tmp;
/* get device id */
snprintf(filename, sizeof(filename), "%s/device", dirname);
if (eal_parse_sysfs_value(filename, &tmp) < 0) {
free(dev);
return -1;
}
dev->id.device_id = (uint16_t)tmp;
/* get subsystem_vendor id */
snprintf(filename, sizeof(filename), "%s/subsystem_vendor",
dirname);
if (eal_parse_sysfs_value(filename, &tmp) < 0) {
free(dev);
return -1;
}
dev->id.subsystem_vendor_id = (uint16_t)tmp;
/* get subsystem_device id */
snprintf(filename, sizeof(filename), "%s/subsystem_device",
dirname);
if (eal_parse_sysfs_value(filename, &tmp) < 0) {
free(dev);
return -1;
}
dev->id.subsystem_device_id = (uint16_t)tmp;
/* get class_id */
snprintf(filename, sizeof(filename), "%s/class",
dirname);
if (eal_parse_sysfs_value(filename, &tmp) < 0) {
free(dev);
return -1;
}
/* the least 24 bits are valid: class, subclass, program interface */
dev->id.class_id = (uint32_t)tmp & RTE_CLASS_ANY_ID;
/* get max_vfs */
dev->max_vfs = 0;
snprintf(filename, sizeof(filename), "%s/max_vfs", dirname);
if (!access(filename, F_OK) &&
eal_parse_sysfs_value(filename, &tmp) == 0)
dev->max_vfs = (uint16_t)tmp;
else {
/* for non igb_uio driver, need kernel version >= 3.8 */
snprintf(filename, sizeof(filename),
"%s/sriov_numvfs", dirname);
if (!access(filename, F_OK) &&
eal_parse_sysfs_value(filename, &tmp) == 0)
dev->max_vfs = (uint16_t)tmp;
}
/* get numa node, default to 0 if not present */
snprintf(filename, sizeof(filename), "%s/numa_node",
dirname);
if (access(filename, F_OK) != -1) {
if (eal_parse_sysfs_value(filename, &tmp) == 0)
dev->device.numa_node = tmp;
else
dev->device.numa_node = -1;
} else {
dev->device.numa_node = 0;
}
pci_name_set(dev);
/* parse resources */
snprintf(filename, sizeof(filename), "%s/resource", dirname);
if (pci_parse_sysfs_resource(filename, dev) < 0) {
RTE_LOG(ERR, EAL, "%s(): cannot parse resource\n", __func__);
free(dev);
return -1;
}
/* parse driver */
snprintf(filename, sizeof(filename), "%s/driver", dirname);
ret = pci_get_kernel_driver_by_path(filename, driver, sizeof(driver));
if (ret < 0) {
RTE_LOG(ERR, EAL, "Fail to get kernel driver\n");
free(dev);
return -1;
}
if (!ret) {
if (!strcmp(driver, "vfio-pci"))
dev->kdrv = RTE_KDRV_VFIO;
else if (!strcmp(driver, "igb_uio"))
dev->kdrv = RTE_KDRV_IGB_UIO;
else if (!strcmp(driver, "uio_pci_generic"))
dev->kdrv = RTE_KDRV_UIO_GENERIC;
else
dev->kdrv = RTE_KDRV_UNKNOWN;
} else
dev->kdrv = RTE_KDRV_NONE;
/* device is valid, add in list (sorted) */
if (TAILQ_EMPTY(&rte_pci_bus.device_list)) {
rte_pci_add_device(dev);
} else {
struct rte_pci_device * dev2;
int ret;
TAILQ_FOREACH(dev2, &rte_pci_bus.device_list, next) {
ret = rte_pci_addr_cmp(&dev->addr, &dev2->addr);
if (ret > 0)
continue;
if (ret < 0) {
rte_pci_insert_device(dev2, dev);
} else { /* already registered */
if (!rte_dev_is_probed(&dev2->device)) {
dev2->kdrv = dev->kdrv;
dev2->max_vfs = dev->max_vfs;
pci_name_set(dev2);
memmove(dev2->mem_resource,
dev->mem_resource,
sizeof(dev->mem_resource));
} else {
/**
* If device is plugged and driver is
* probed already, (This happens when
* we call rte_dev_probe which will
* scan all device on the bus) we don't
* need to do anything here unless...
**/
if (dev2->kdrv != dev->kdrv ||
dev2->max_vfs != dev->max_vfs)
/*
* This should not happens.
* But it is still possible if
* we unbind a device from
* vfio or uio before hotplug
* remove and rebind it with
* a different configure.
* So we just print out the
* error as an alarm.
*/
RTE_LOG(ERR, EAL, "Unexpected device scan at %s!\n",
filename);
else if (dev2->device.devargs !=
dev->device.devargs) {
rte_devargs_remove(dev2->device.devargs);
pci_name_set(dev2);
}
}
free(dev);
}
return 0;
}
rte_pci_add_device(dev);
}
return 0;
}
pci_parse_sysfs_resource() above reads, for each BAR of the PCI device, the physical address of the region the BAR maps; the device is then inserted into the list in order of its PCI address.
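For reference, each line of the sysfs resource file describes one BAR: its start address, end address and flags, all in hexadecimal. A hedged sketch of parsing a single such line (the real parsing lives in drivers/bus/pci/linux/pci.c; <stdio.h> and <inttypes.h> are assumed for sscanf and SCNx64):
/* Sketch: a resource line looks like
 *   0x00000000fea00000 0x00000000fea03fff 0x0000000000040200
 * i.e. BAR start, BAR end, flags. */
static int
example_parse_resource_line(const char *line, uint64_t *phys_addr,
		uint64_t *end_addr, uint64_t *flags)
{
	if (sscanf(line, "%" SCNx64 " %" SCNx64 " %" SCNx64,
			phys_addr, end_addr, flags) != 3)
		return -1;
	return 0;
}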
2. Probing
/*
* If vendor/device ID match, call the probe() function of all
* registered driver for the given device. Return < 0 if initialization
* failed, return 1 if no driver is found for this device.
*/
static int
pci_probe_all_drivers(struct rte_pci_device *dev)
{
struct rte_pci_driver *dr = NULL;
int rc = 0;
if (dev == NULL)
return -EINVAL;
FOREACH_DRIVER_ON_PCIBUS(dr) {
rc = rte_pci_probe_one_driver(dr, dev);
if (rc < 0)
/* negative value is an error */
return rc;
if (rc > 0)
/* positive value means driver doesn't support it */
continue;
return 0;
}
return 1;
}
/*
* Scan the content of the PCI bus, and call the probe() function for
* all registered drivers that have a matching entry in its id_table
* for discovered devices.
*/
static int
pci_probe(void)
{
struct rte_pci_device *dev = NULL;
size_t probed = 0, failed = 0;
struct rte_devargs *devargs;
int probe_all = 0;
int ret = 0;
if (rte_pci_bus.bus.conf.scan_mode != RTE_BUS_SCAN_WHITELIST)
probe_all = 1;
FOREACH_DEVICE_ON_PCIBUS(dev) {
probed++;
devargs = dev->device.devargs;
/* probe all or only whitelisted devices */
if (probe_all)
ret = pci_probe_all_drivers(dev);
else if (devargs != NULL &&
devargs->policy == RTE_DEV_WHITELISTED)
ret = pci_probe_all_drivers(dev);
if (ret < 0) {
if (ret != -EEXIST) {
RTE_LOG(ERR, EAL, "Requested device "
PCI_PRI_FMT " cannot be used\n",
dev->addr.domain, dev->addr.bus,
dev->addr.devid, dev->addr.function);
rte_errno = errno;
failed++;
}
ret = 0;
}
}
return (probed && probed == failed) ? -1 : 0;
}
This code should look familiar to anyone with kernel development experience or who has read the kernel sources: scanning builds the device list, and probing walks that list, trying every registered driver against every device.
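What rte_pci_probe_one_driver() checks before invoking a driver's probe callback is whether any entry of the driver's id_table matches the scanned device's IDs. A simplified sketch of that matching (not the literal rte_pci_match() source):
/* Sketch: return 1 if the driver's id_table claims this device, 0 otherwise. */
static int
example_pci_match(const struct rte_pci_driver *dr,
		const struct rte_pci_device *dev)
{
	const struct rte_pci_id *id;

	for (id = dr->id_table; id->vendor_id != 0; id++) {
		if (id->vendor_id != RTE_PCI_ANY_ID &&
				id->vendor_id != dev->id.vendor_id)
			continue;
		if (id->device_id != RTE_PCI_ANY_ID &&
				id->device_id != dev->id.device_id)
			continue;
		if (id->subsystem_vendor_id != RTE_PCI_ANY_ID &&
				id->subsystem_vendor_id != dev->id.subsystem_vendor_id)
			continue;
		if (id->subsystem_device_id != RTE_PCI_ANY_ID &&
				id->subsystem_device_id != dev->id.subsystem_device_id)
			continue;
		return 1;
	}
	return 0;
}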
3. Registration
The registration code looks like this:
//lib/librte_eal/common/include/rte_dev.h
/**
* A structure describing a device driver.
*/
struct rte_driver {
TAILQ_ENTRY(rte_driver) next; /**< Next in list. */
const char *name; /**< Driver name. */
const char *alias; /**< Driver alias. */
};
//drivers/bus/pci/pci_common.c, rte_pci_bus.h
/* register a driver */
void
rte_pci_register(struct rte_pci_driver *driver)
{
TAILQ_INSERT_TAIL(&rte_pci_bus.driver_list, driver, next);
driver->bus = &rte_pci_bus;
}
#define RTE_PMD_REGISTER_PCI(nm, pci_drv) \
RTE_INIT(pciinitfn_ ##nm) \
{\
(pci_drv).driver.name = RTE_STR(nm);\
rte_pci_register(&pci_drv); \
} \
RTE_PMD_EXPORT_NAME(nm, __COUNTER__)
/**
* Structure describing the PCI bus
*/
struct rte_pci_bus {
struct rte_bus bus; /**< Inherit the generic class */
struct rte_pci_device_list device_list; /**< List of PCI devices */
struct rte_pci_driver_list driver_list; /**< List of PCI drivers */
};
struct rte_pci_bus rte_pci_bus = {
.bus = {
.scan = rte_pci_scan,
.probe = pci_probe,
.find_device = pci_find_device,
.plug = pci_plug,
.unplug = pci_unplug,
.parse = pci_parse,
.dma_map = pci_dma_map,
.dma_unmap = pci_dma_unmap,
.get_iommu_class = rte_pci_get_iommu_class,
.dev_iterate = rte_pci_dev_iterate,
.hot_unplug_handler = pci_hot_unplug_handler,
.sigbus_handler = pci_sigbus_handler,
},
.device_list = TAILQ_HEAD_INITIALIZER(rte_pci_bus.device_list),
.driver_list = TAILQ_HEAD_INITIALIZER(rte_pci_bus.driver_list),
};
These eventually resolve to concrete driver entries on the list, such as the following NIC driver:
//drivers/net/e1000/igb_ethdev.c
static struct rte_pci_driver rte_igb_pmd = {
.id_table = pci_id_igb_map,
.drv_flags = RTE_PCI_DRV_NEED_MAPPING | RTE_PCI_DRV_INTR_LSC,
.probe = eth_igb_pci_probe,
.remove = eth_igb_pci_remove,
};
The registration above is triggered by the macro RTE_PMD_REGISTER_PCI(net_e1000_igb, rte_igb_pmd) (drivers/net/e1000/igb_ethdev.c).
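The virtio PMD registers itself the same way, via RTE_PMD_REGISTER_PCI(net_virtio, rte_virtio_pmd). Conceptually the macro expands to a constructor that runs before main() and appends the driver to rte_pci_bus.driver_list; a rough sketch of that expansion (not the exact preprocessor output):
/* Rough expansion sketch of RTE_PMD_REGISTER_PCI(net_virtio, rte_virtio_pmd). */
static void __attribute__((constructor, used))
pciinitfn_net_virtio(void)
{
	rte_virtio_pmd.driver.name = "net_virtio";
	rte_pci_register(&rte_virtio_pmd); /* appended to rte_pci_bus.driver_list */
}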
4. Address Mapping
As mentioned above, once the sysfs files have been read, the physical regions still have to be mapped into the process, which happens in the code below:
//pci_common_uio.c
/* Map pci device */
int
rte_pci_map_device(struct rte_pci_device *dev)
{
int ret = -1;
/* try mapping the NIC resources */
switch (dev->kdrv) {
case RTE_KDRV_NIC_UIO:
/* map resources for devices that use uio */
ret = pci_uio_map_resource(dev);
break;
default:
RTE_LOG(DEBUG, EAL,
" Not managed by a supported kernel driver, skipped\n");
ret = 1;
break;
}
return ret;
}
/* map the PCI resource of a PCI device in virtual memory */
int
pci_uio_map_resource(struct rte_pci_device *dev)
{
int i, map_idx = 0, ret;
uint64_t phaddr;
struct mapped_pci_resource * uio_res = NULL;
struct mapped_pci_res_list * uio_res_list =
RTE_TAILQ_CAST(rte_uio_tailq.head, mapped_pci_res_list);
dev->intr_handle.fd = -1;
dev->intr_handle.uio_cfg_fd = -1;
/* secondary processes - use already recorded details */
if (rte_eal_process_type() != RTE_PROC_PRIMARY)
return pci_uio_map_secondary(dev);
/* allocate uio resource */
ret = pci_uio_alloc_resource(dev, &uio_res);
if (ret)
return ret;
/* Map all BARs */
for (i = 0; i != PCI_MAX_RESOURCE; i++) {
/* skip empty BAR */
phaddr = dev->mem_resource[i].phys_addr;
if (phaddr == 0)
continue;
ret = pci_uio_map_resource_by_index(dev, i,
uio_res, map_idx);
if (ret)
goto error;
map_idx++;
}
uio_res->nb_maps = map_idx;
TAILQ_INSERT_TAIL(uio_res_list, uio_res, next);
return 0;
error:
for (i = 0; i < map_idx; i++) {
pci_unmap_resource(uio_res->maps[i].addr,
(size_t)uio_res->maps[i].size);
rte_free(uio_res->maps[i].path);
}
pci_uio_free_resource(dev, uio_res);
return -1;
}
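Once pci_uio_map_resource() has run, the virtual address of each mapped BAR is available in dev->mem_resource[i].addr; in the virtio modern case this is what common_cfg, isr, notify_base and dev_cfg end up pointing into. A hedged sketch of reading a device register through such a mapping (an example helper, not part of DPDK):
/* Sketch: read a 32-bit register at 'offset' inside an already mapped BAR.
 * 'volatile' keeps the compiler from caching device register reads. */
static uint32_t
example_read_bar_reg(const struct rte_pci_device *dev,
		unsigned int bar_idx, size_t offset)
{
	volatile uint32_t *reg;

	reg = (volatile uint32_t *)
		((uint8_t *)dev->mem_resource[bar_idx].addr + offset);
	return *reg;
}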
5. Related Functions in virtio
Now let's look at how the virtio code discussed earlier implements the related functions.
First the declarations:
// virtio_pci.h
/*
* Function declaration from virtio_pci.c
*/
int vtpci_init(struct rte_pci_device *dev, struct virtio_hw *hw);
void vtpci_reset(struct virtio_hw *);
void vtpci_reinit_complete(struct virtio_hw *);
uint8_t vtpci_get_status(struct virtio_hw *);
void vtpci_set_status(struct virtio_hw *, uint8_t);
uint64_t vtpci_negotiate_features(struct virtio_hw *, uint64_t);
void vtpci_write_dev_config(struct virtio_hw *, size_t, const void *, int);
void vtpci_read_dev_config(struct virtio_hw *, size_t, void *, int);
uint8_t vtpci_isr(struct virtio_hw *);
enum virtio_msix_status vtpci_msix_detect(struct rte_pci_device *dev);
Then the call site and implementation:
/*
* This function is based on probe() function in virtio_pci.c
* It returns 0 on success.
*/
int
eth_virtio_dev_init(struct rte_eth_dev *eth_dev)
{
struct virtio_hw *hw = eth_dev->data->dev_private;
int ret;
if (sizeof(struct virtio_net_hdr_mrg_rxbuf) > RTE_PKTMBUF_HEADROOM) {
PMD_INIT_LOG(ERR,
"Not sufficient headroom required = %d, avail = %d",
(int)sizeof(struct virtio_net_hdr_mrg_rxbuf),
RTE_PKTMBUF_HEADROOM);
return -1;
}
eth_dev->dev_ops = &virtio_eth_dev_ops;
if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
if (!hw->virtio_user_dev) {
ret = virtio_remap_pci(RTE_ETH_DEV_TO_PCI(eth_dev), hw);
if (ret)
return ret;
}
virtio_set_vtpci_ops(hw);
set_rxtx_funcs(eth_dev);
return 0;
}
/*
* Pass the information to the rte_eth_dev_close() that it should also
* release the private port resources.
*/
eth_dev->data->dev_flags |= RTE_ETH_DEV_CLOSE_REMOVE;
/* Allocate memory for storing MAC addresses */
eth_dev->data->mac_addrs = rte_zmalloc("virtio",
VIRTIO_MAX_MAC_ADDRS * RTE_ETHER_ADDR_LEN, 0);
if (eth_dev->data->mac_addrs == NULL) {
PMD_INIT_LOG(ERR,
"Failed to allocate %d bytes needed to store MAC addresses",
VIRTIO_MAX_MAC_ADDRS * RTE_ETHER_ADDR_LEN);
return -ENOMEM;
}
hw->port_id = eth_dev->data->port_id;
/* For virtio_user case the hw->virtio_user_dev is populated by
* virtio_user_eth_dev_alloc() before eth_virtio_dev_init() is called.
*/
if (!hw->virtio_user_dev) {
ret = vtpci_init(RTE_ETH_DEV_TO_PCI(eth_dev), hw);
if (ret)
goto err_vtpci_init;
}
rte_spinlock_init(&hw->state_lock);
/* reset device and negotiate default features */
ret = virtio_init_device(eth_dev, VIRTIO_PMD_DEFAULT_GUEST_FEATURES);
if (ret < 0)
goto err_virtio_init;
hw->opened = true;
return 0;
err_virtio_init:
if (!hw->virtio_user_dev) {
rte_pci_unmap_device(RTE_ETH_DEV_TO_PCI(eth_dev));
if (!hw->modern)
rte_pci_ioport_unmap(VTPCI_IO(hw));
}
err_vtpci_init:
rte_free(eth_dev->data->mac_addrs);
eth_dev->data->mac_addrs = NULL;
return ret;
}
The function above calls vtpci_init():
/*
* Return -1:
* if there is error mapping with VFIO/UIO.
* if port map error when driver type is KDRV_NONE.
* if whitelisted but driver type is KDRV_UNKNOWN.
* Return 1 if kernel driver is managing the device.
* Return 0 on success.
*/
int
vtpci_init(struct rte_pci_device *dev, struct virtio_hw *hw)
{
/*
* Try if we can succeed reading virtio pci caps, which exists
* only on modern pci device. If failed, we fallback to legacy
* virtio handling.
*/
if (virtio_read_caps(dev, hw) == 0) {
PMD_INIT_LOG(INFO, "modern virtio pci detected.");
virtio_hw_internal[hw->port_id].vtpci_ops = &modern_ops;
hw->modern = 1;
return 0;
}
PMD_INIT_LOG(INFO, "trying with legacy virtio pci.");
if (rte_pci_ioport_map(dev, 0, VTPCI_IO(hw)) < 0) {
rte_pci_unmap_device(dev);
if (dev->kdrv == RTE_KDRV_UNKNOWN &&
(!dev->device.devargs ||
dev->device.devargs->bus !=
rte_bus_find_by_name("pci"))) {
PMD_INIT_LOG(INFO,
"skip kernel managed virtio device.");
return 1;
}
return -1;
}
virtio_hw_internal[hw->port_id].vtpci_ops = &legacy_ops;
hw->modern = 0;
return 0;
}
The key point here is the distinction between the legacy and modern layouts: legacy corresponds to virtio 0.95, modern to virtio 1.0 and later.
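The practical difference shows up in the two ops tables: legacy_ops talks to the BAR 0 I/O registers defined at the top of this article, while modern_ops goes through the memory-mapped virtio_pci_common_cfg capability. A simplified sketch of the two get_features paths (modeled on, but not copied from, the real legacy/modern implementations):
/* Legacy (virtio 0.95): one 32-bit feature register in I/O space. */
static uint64_t
example_legacy_get_features(struct virtio_hw *hw)
{
	uint32_t features;

	rte_pci_ioport_read(VTPCI_IO(hw), &features, 4, VIRTIO_PCI_HOST_FEATURES);
	return features;
}

/* Modern (virtio 1.0+): 64 feature bits, read 32 at a time through
 * the memory-mapped common configuration structure. */
static uint64_t
example_modern_get_features(struct virtio_hw *hw)
{
	uint32_t lo, hi;

	rte_write32(0, &hw->common_cfg->device_feature_select);
	lo = rte_read32(&hw->common_cfg->device_feature);
	rte_write32(1, &hw->common_cfg->device_feature_select);
	hi = rte_read32(&hw->common_cfg->device_feature);
	return ((uint64_t)hi << 32) | lo;
}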
The remaining functions are fairly simple; they can be read directly against the source with the pieces above in mind.
IV. Summary
No wonder people say that in front of the source code there are no secrets; it really is so. This is especially true at the lower layers, where the code is essentially an implementation of an algorithm, a protocol or a written specification: once the relevant documents and techniques are understood, reading the code against them feels completely transparent. One reason domestic programmers often feel uncomfortable reading open-source code and foreign frameworks is that the standards and protocols were defined by others; without knowing those standards, reading the code alone turns into a confusing exercise in reverse-engineering the intent, a lot of effort for little reward.
That, I suppose, is one reflection of moving from the abstract back to the concrete.