DPDK Series Part 11: Virtualization, virtio Source Code Analysis of the Underlying Device Layer

I. Basic Data Structures

The layout of the virtio source code in DPDK was introduced earlier. The lowest-level device abstraction lives in virtio_pci.h and virtio_pci.c, which are mainly responsible for detecting the PCI device and implementing the corresponding device driver. Let's start with the basic data structures and macro definitions:

/* VirtIO PCI vendor/device ID. */
#define VIRTIO_PCI_VENDORID     0x1AF4
#define VIRTIO_PCI_LEGACY_DEVICEID_NET 0x1000
#define VIRTIO_PCI_MODERN_DEVICEID_NET 0x1041

/* VirtIO ABI version, this must match exactly. */
#define VIRTIO_PCI_ABI_VERSION 0

/*
 * VirtIO Header, located in BAR 0.
 */
#define VIRTIO_PCI_HOST_FEATURES  0  /* host's supported features (32bit, RO)*/
#define VIRTIO_PCI_GUEST_FEATURES 4  /* guest's supported features (32, RW) */
#define VIRTIO_PCI_QUEUE_PFN      8  /* physical address of VQ (32, RW) */
#define VIRTIO_PCI_QUEUE_NUM      12 /* number of ring entries (16, RO) */
#define VIRTIO_PCI_QUEUE_SEL      14 /* current VQ selection (16, RW) */
#define VIRTIO_PCI_QUEUE_NOTIFY   16 /* notify host regarding VQ (16, RW) */
#define VIRTIO_PCI_STATUS         18 /* device status register (8, RW) */
#define VIRTIO_PCI_ISR		  19 /* interrupt status register, reading
				      * also clears the register (8, RO) */
/* Only if MSIX is enabled: */
#define VIRTIO_MSI_CONFIG_VECTOR  20 /* configuration change vector (16, RW) */
#define VIRTIO_MSI_QUEUE_VECTOR	  22 /* vector for selected VQ notifications
				      (16, RW) */

/* The bit of the ISR which indicates a device has an interrupt. */
#define VIRTIO_PCI_ISR_INTR   0x1
/* The bit of the ISR which indicates a device configuration change. */
#define VIRTIO_PCI_ISR_CONFIG 0x2
/* Vector value used to disable MSI for queue. */
#define VIRTIO_MSI_NO_VECTOR 0xFFFF

/* VirtIO device IDs. */
#define VIRTIO_ID_NETWORK  0x01
#define VIRTIO_ID_BLOCK    0x02
#define VIRTIO_ID_CONSOLE  0x03
#define VIRTIO_ID_ENTROPY  0x04
#define VIRTIO_ID_BALLOON  0x05
#define VIRTIO_ID_IOMEMORY 0x06
#define VIRTIO_ID_9P       0x09

/* Status byte for guest to report progress. */
#define VIRTIO_CONFIG_STATUS_RESET     0x00
#define VIRTIO_CONFIG_STATUS_ACK       0x01
#define VIRTIO_CONFIG_STATUS_DRIVER    0x02
#define VIRTIO_CONFIG_STATUS_DRIVER_OK 0x04
#define VIRTIO_CONFIG_STATUS_FEATURES_OK 0x08
#define VIRTIO_CONFIG_STATUS_FAILED    0x80

/*
 * Each virtqueue indirect descriptor list must be physically contiguous.
 * To allow us to malloc(9) each list individually, limit the number
 * supported to what will fit in one page. With 4KB pages, this is a limit
 * of 256 descriptors. If there is ever a need for more, we can switch to
 * contigmalloc(9) for the larger allocations, similar to what
 * bus_dmamem_alloc(9) does.
 *
 * Note the sizeof(struct vring_desc) is 16 bytes.
 */
#define VIRTIO_MAX_INDIRECT ((int) (PAGE_SIZE / 16))

/* The feature bitmap for virtio net */
#define VIRTIO_NET_F_CSUM	0	/* Host handles pkts w/ partial csum */
#define VIRTIO_NET_F_GUEST_CSUM	1	/* Guest handles pkts w/ partial csum */
#define VIRTIO_NET_F_MTU	3	/* Initial MTU advice. */
#define VIRTIO_NET_F_MAC	5	/* Host has given MAC address. */
#define VIRTIO_NET_F_GUEST_TSO4	7	/* Guest can handle TSOv4 in. */
#define VIRTIO_NET_F_GUEST_TSO6	8	/* Guest can handle TSOv6 in. */
#define VIRTIO_NET_F_GUEST_ECN	9	/* Guest can handle TSO[6] w/ ECN in. */
#define VIRTIO_NET_F_GUEST_UFO	10	/* Guest can handle UFO in. */
#define VIRTIO_NET_F_HOST_TSO4	11	/* Host can handle TSOv4 in. */
#define VIRTIO_NET_F_HOST_TSO6	12	/* Host can handle TSOv6 in. */
#define VIRTIO_NET_F_HOST_ECN	13	/* Host can handle TSO[6] w/ ECN in. */
#define VIRTIO_NET_F_HOST_UFO	14	/* Host can handle UFO in. */
#define VIRTIO_NET_F_MRG_RXBUF	15	/* Host can merge receive buffers. */
#define VIRTIO_NET_F_STATUS	16	/* virtio_net_config.status available */
#define VIRTIO_NET_F_CTRL_VQ	17	/* Control channel available */
#define VIRTIO_NET_F_CTRL_RX	18	/* Control channel RX mode support */
#define VIRTIO_NET_F_CTRL_VLAN	19	/* Control channel VLAN filtering */
#define VIRTIO_NET_F_CTRL_RX_EXTRA 20	/* Extra RX mode control support */
#define VIRTIO_NET_F_GUEST_ANNOUNCE 21	/* Guest can announce device on the
					 * network */
#define VIRTIO_NET_F_MQ		22	/* Device supports Receive Flow
					 * Steering */
#define VIRTIO_NET_F_CTRL_MAC_ADDR 23	/* Set MAC address */

/* Do we get callbacks when the ring is completely used, even if we've
 * suppressed them? */
#define VIRTIO_F_NOTIFY_ON_EMPTY	24

/* Can the device handle any descriptor layout? */
#define VIRTIO_F_ANY_LAYOUT		27

/* We support indirect buffer descriptors */
#define VIRTIO_RING_F_INDIRECT_DESC	28

#define VIRTIO_F_VERSION_1		32
#define VIRTIO_F_IOMMU_PLATFORM	33
#define VIRTIO_F_RING_PACKED		34

/*
 * Some VirtIO feature bits (currently bits 28 through 31) are
 * reserved for the transport being used (eg. virtio_ring), the
 * rest are per-device feature bits.
 */
#define VIRTIO_TRANSPORT_F_START 28
#define VIRTIO_TRANSPORT_F_END   34

/*
 * Inorder feature indicates that all buffers are used by the device
 * in the same order in which they have been made available.
 */
#define VIRTIO_F_IN_ORDER 35

/*
 * This feature indicates that memory accesses by the driver and the device
 * are ordered in a way described by the platform.
 */
#define VIRTIO_F_ORDER_PLATFORM 36

/* The Guest publishes the used index for which it expects an interrupt
 * at the end of the avail ring. Host should ignore the avail->flags field. */
/* The Host publishes the avail index for which it expects a kick
 * at the end of the used ring. Guest should ignore the used->flags field. */
#define VIRTIO_RING_F_EVENT_IDX		29

#define VIRTIO_NET_S_LINK_UP	1	/* Link is up */
#define VIRTIO_NET_S_ANNOUNCE	2	/* Announcement is needed */

/*
 * Maximum number of virtqueues per device.
 */
#define VIRTIO_MAX_VIRTQUEUE_PAIRS 8
#define VIRTIO_MAX_VIRTQUEUES (VIRTIO_MAX_VIRTQUEUE_PAIRS * 2 + 1)

/* Common configuration */
#define VIRTIO_PCI_CAP_COMMON_CFG	1
/* Notifications */
#define VIRTIO_PCI_CAP_NOTIFY_CFG	2
/* ISR Status */
#define VIRTIO_PCI_CAP_ISR_CFG		3
/* Device specific configuration */
#define VIRTIO_PCI_CAP_DEVICE_CFG	4
/* PCI configuration access */
#define VIRTIO_PCI_CAP_PCI_CFG		5

As the English comments above make clear, these macros mainly fall into a few groups: device control, device type, version control, the virtio device standard, and status/configuration. Next, look at the related data structures:

/* This is the PCI capability header: */
struct virtio_pci_cap {
	uint8_t cap_vndr;		/* Generic PCI field: PCI_CAP_ID_VNDR */
	uint8_t cap_next;		/* Generic PCI field: next ptr. */
	uint8_t cap_len;		/* Generic PCI field: capability length */
	uint8_t cfg_type;		/* Identifies the structure. */
	uint8_t bar;			/* Where to find it. */
	uint8_t padding[3];		/* Pad to full dword. */
	uint32_t offset;		/* Offset within bar. */
	uint32_t length;		/* Length of the structure, in bytes. */
};

struct virtio_pci_notify_cap {
	struct virtio_pci_cap cap;
	uint32_t notify_off_multiplier;	/* Multiplier for queue_notify_off. */
};

/* Fields in VIRTIO_PCI_CAP_COMMON_CFG: */
struct virtio_pci_common_cfg {
	/* About the whole device. */
	uint32_t device_feature_select;	/* read-write */
	uint32_t device_feature;	/* read-only */
	uint32_t guest_feature_select;	/* read-write */
	uint32_t guest_feature;		/* read-write */
	uint16_t msix_config;		/* read-write */
	uint16_t num_queues;		/* read-only */
	uint8_t device_status;		/* read-write */
	uint8_t config_generation;	/* read-only */

	/* About a specific virtqueue. */
	uint16_t queue_select;		/* read-write */
	uint16_t queue_size;		/* read-write, power of 2. */
	uint16_t queue_msix_vector;	/* read-write */
	uint16_t queue_enable;		/* read-write */
	uint16_t queue_notify_off;	/* read-only */
	uint32_t queue_desc_lo;		/* read-write */
	uint32_t queue_desc_hi;		/* read-write */
	uint32_t queue_avail_lo;	/* read-write */
	uint32_t queue_avail_hi;	/* read-write */
	uint32_t queue_used_lo;		/* read-write */
	uint32_t queue_used_hi;		/* read-write */
};

struct virtio_hw;

struct virtio_pci_ops {
	void (*read_dev_cfg)(struct virtio_hw * hw, size_t offset,
			     void * dst, int len);
	void (*write_dev_cfg)(struct virtio_hw * hw, size_t offset,
			      const void * src, int len);

	uint8_t (*get_status)(struct virtio_hw * hw);
	void    (*set_status)(struct virtio_hw * hw, uint8_t status);

	uint64_t (*get_features)(struct virtio_hw * hw);
	void     (*set_features)(struct virtio_hw * hw, uint64_t features);

	uint8_t (*get_isr)(struct virtio_hw * hw);

	uint16_t (*set_config_irq)(struct virtio_hw * hw, uint16_t vec);

	uint16_t (*set_queue_irq)(struct virtio_hw * hw, struct virtqueue * vq,
			uint16_t vec);

	uint16_t (*get_queue_num)(struct virtio_hw * hw, uint16_t queue_id);
	int (*setup_queue)(struct virtio_hw * hw, struct virtqueue * vq);
	void (*del_queue)(struct virtio_hw * hw, struct virtqueue * vq);
	void (*notify_queue)(struct virtio_hw * hw, struct virtqueue * vq);
};

struct virtio_net_config;
// abstraction of the hardware device
struct virtio_hw {
	struct virtnet_ctl * cvq;
	uint64_t    req_guest_features;
	uint64_t    guest_features;
	uint32_t    max_queue_pairs;
	bool        started;
	uint16_t	max_mtu;
	uint16_t    vtnet_hdr_size;
	uint8_t	    vlan_strip;
	uint8_t	    use_msix;
	uint8_t     modern;
	uint8_t     use_simple_rx;
	uint8_t     use_inorder_rx;
	uint8_t     use_inorder_tx;
	uint8_t     weak_barriers;
	bool        has_tx_offload;
	bool        has_rx_offload;
	uint16_t    port_id;
	uint8_t     mac_addr[RTE_ETHER_ADDR_LEN];
	uint32_t    notify_off_multiplier;
	uint8_t     * isr;
	uint16_t    * notify_base;
	struct virtio_pci_common_cfg * common_cfg;
	struct virtio_net_config * dev_cfg;
	void	    * virtio_user_dev;
	/*
	 * App management thread and virtio interrupt handler thread
	 * both can change device state, this lock is meant to avoid
	 * such a contention.
	 */
	rte_spinlock_t state_lock;
	struct rte_mbuf ** inject_pkts;
	bool        opened;

	struct virtqueue ** vqs;
};


/*
 * While virtio_hw is stored in shared memory, this structure stores
 * some infos that may vary in the multiple process model locally.
 * For example, the vtpci_ops pointer.
 */
struct virtio_hw_internal {
	const struct virtio_pci_ops * vtpci_ops;
	struct rte_pci_ioport io;
};

#define VTPCI_OPS(hw)	(virtio_hw_internal[(hw)->port_id].vtpci_ops)
#define VTPCI_IO(hw)	(&virtio_hw_internal[(hw)->port_id].io)

extern struct virtio_hw_internal virtio_hw_internal[RTE_MAX_ETHPORTS];


/*
 * This structure is just a reference to read
 * net device specific config space; it just a chodu structure
 *
 */
struct virtio_net_config {
	/* The config defining mac address (if VIRTIO_NET_F_MAC) */
	uint8_t    mac[RTE_ETHER_ADDR_LEN];
	/* See VIRTIO_NET_F_STATUS and VIRTIO_NET_S_* above */
	uint16_t   status;
	uint16_t   max_virtqueue_pairs;
	uint16_t   mtu;
} __attribute__((packed));

/*
 * How many bits to shift physical queue address written to QUEUE_PFN.
 * 12 is historical, and due to x86 page size.
 */
#define VIRTIO_PCI_QUEUE_ADDR_SHIFT 12

/* The alignment to use between consumer and producer parts of vring. */
#define VIRTIO_PCI_VRING_ALIGN 4096

enum virtio_msix_status {
	VIRTIO_MSIX_NONE = 0,
	VIRTIO_MSIX_DISABLED = 1,
	VIRTIO_MSIX_ENABLED = 2
};

Anyone with driver development experience will recognize the pattern: an abstracted PCI device needs a structure describing the device's capabilities, a structure holding its configuration, and a structure of function pointers that wraps the mapping of the hardware and the concrete operations on it.
Reading the Linux kernel source shows the same pattern: this kind of abstraction almost always consists of a descriptive structure, a configuration structure, and an operations structure of function pointers, sometimes supplemented by a few auxiliary data structures. The code above confirms this once again.
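To make the role of the operations table concrete, here is a minimal sketch, modeled on vtpci_set_status() in virtio_pci.c (the helper name example_set_status is hypothetical), of how a call is dispatched through VTPCI_OPS(hw) so that the same code path works for both the legacy and the modern device:

/* Minimal sketch: dispatch a status update through the per-port ops table.
 * VTPCI_OPS(hw) resolves to either legacy_ops or modern_ops, depending on
 * what vtpci_init() detected for this port. */
static void
example_set_status(struct virtio_hw *hw, uint8_t status)
{
	if (status != VIRTIO_CONFIG_STATUS_RESET)
		status |= VTPCI_OPS(hw)->get_status(hw); /* keep bits already set */

	VTPCI_OPS(hw)->set_status(hw, status);
}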

II. Main Functionality

The main job of this layer is to abstract the hardware one level up. As computer hardware evolved, many buses appeared and coexisted, and PCI has become the dominant one; in other words, most modern devices can be attached to the PCI bus. By supporting devices through an abstract PCI bus, DPDK can handle storage, network, and other device types in a uniform way, which makes device management easier and makes it simpler to add new devices. Since DPDK goes directly from the device to the application layer, it cannot step outside this framework either; that is the main role of this layer.
A PCI device has its own configuration space, so DPDK's definitions must include the corresponding configuration macros and data structures. Each PCI device exposes up to six BAR registers, whose addresses and offsets are fixed in hardware when the device leaves the factory. The PCI layer maintains a linked list of PCI devices, which is why the data structures above carry the device address, the IDs, the physical addresses of the BARs, the interrupt information, and the associated driver.
For more detail on how this matching works, refer to the PCI specification and related literature.
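As a small illustration of how the legacy register layout defined above is used, here is a sketch of reading the device-status register (it assumes BAR 0 has already been mapped into VTPCI_IO(hw) by rte_pci_ioport_map(); the helper name is hypothetical):

/* Sketch: read the legacy device-status byte through the I/O port mapping
 * of BAR 0, at the VIRTIO_PCI_STATUS offset defined above. */
static uint8_t
example_read_legacy_status(struct virtio_hw *hw)
{
	uint8_t status;

	rte_pci_ioport_read(VTPCI_IO(hw), &status, 1, VIRTIO_PCI_STATUS);
	return status;
}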

III. Basic Flow

As mentioned above, this layer mainly handles the discovery, registration, and mapping of PCI devices. Let's look at the relevant source code:
1. Discovery

//drivers/bus/pci/pci_common.c
struct rte_pci_bus rte_pci_bus = {
	.bus = {
		.scan = rte_pci_scan,
		.probe = pci_probe,
		.find_device = pci_find_device,
		.plug = pci_plug,
		.unplug = pci_unplug,
		.parse = pci_parse,
		.dma_map = pci_dma_map,
		.dma_unmap = pci_dma_unmap,
		.get_iommu_class = rte_pci_get_iommu_class,
		.dev_iterate = rte_pci_dev_iterate,
		.hot_unplug_handler = pci_hot_unplug_handler,
		.sigbus_handler = pci_sigbus_handler,
	},
	.device_list = TAILQ_HEAD_INITIALIZER(rte_pci_bus.device_list),
	.driver_list = TAILQ_HEAD_INITIALIZER(rte_pci_bus.driver_list),
};

RTE_REGISTER_BUS(pci, rte_pci_bus.bus);

//drivers/bus/pci/linux/pci.c
/*
 * Scan the content of the PCI bus, and the devices in the devices
 * list
 */
int
rte_pci_scan(void)
{
	struct dirent * e;
	DIR * dir;
	char dirname[PATH_MAX];
	struct rte_pci_addr addr;

	/* for debug purposes, PCI can be disabled */
	if (!rte_eal_has_pci())
		return 0;

#ifdef VFIO_PRESENT
	if (!pci_vfio_is_enabled())
		RTE_LOG(DEBUG, EAL, "VFIO PCI modules not loaded\n");
#endif

	dir = opendir(rte_pci_get_sysfs_path());
	if (dir == NULL) {
		RTE_LOG(ERR, EAL, "%s(): opendir failed: %s\n",
			__func__, strerror(errno));
		return -1;
	}

	while ((e = readdir(dir)) != NULL) {
		if (e->d_name[0] == '.')
			continue;

		if (parse_pci_addr_format(e->d_name, sizeof(e->d_name), &addr) != 0)
			continue;

		snprintf(dirname, sizeof(dirname), "%s/%s",
				rte_pci_get_sysfs_path(), e->d_name);

		if (pci_scan_one(dirname, &addr) < 0)
			goto error;
	}
	closedir(dir);
	return 0;

error:
	closedir(dir);
	return -1;
}

In Linux everything is a file, so the code above first checks whether there are PCI devices under the given sysfs path (/sys/bus/pci/devices) and, if so, iterates over the entries. pci_scan_one then reads the files inside each PCI device directory:

/* Scan one pci sysfs entry, and fill the devices list from it. */
static int
pci_scan_one(const char * dirname, const struct rte_pci_addr * addr)
{
	char filename[PATH_MAX];
	unsigned long tmp;
	struct rte_pci_device * dev;
	char driver[PATH_MAX];
	int ret;

	dev = malloc(sizeof(*dev));
	if (dev == NULL)
		return -1;

	memset(dev, 0, sizeof(*dev));
	dev->device.bus = &rte_pci_bus.bus;
	dev->addr = * addr;

	/* get vendor id */
	snprintf(filename, sizeof(filename), "%s/vendor", dirname);
	if (eal_parse_sysfs_value(filename, &tmp) < 0) {
		free(dev);
		return -1;
	}
	dev->id.vendor_id = (uint16_t)tmp;

	/* get device id */
	snprintf(filename, sizeof(filename), "%s/device", dirname);
	if (eal_parse_sysfs_value(filename, &tmp) < 0) {
		free(dev);
		return -1;
	}
	dev->id.device_id = (uint16_t)tmp;

	/* get subsystem_vendor id */
	snprintf(filename, sizeof(filename), "%s/subsystem_vendor",
		 dirname);
	if (eal_parse_sysfs_value(filename, &tmp) < 0) {
		free(dev);
		return -1;
	}
	dev->id.subsystem_vendor_id = (uint16_t)tmp;

	/* get subsystem_device id */
	snprintf(filename, sizeof(filename), "%s/subsystem_device",
		 dirname);
	if (eal_parse_sysfs_value(filename, &tmp) < 0) {
		free(dev);
		return -1;
	}
	dev->id.subsystem_device_id = (uint16_t)tmp;

	/* get class_id */
	snprintf(filename, sizeof(filename), "%s/class",
		 dirname);
	if (eal_parse_sysfs_value(filename, &tmp) < 0) {
		free(dev);
		return -1;
	}
	/* the least 24 bits are valid: class, subclass, program interface */
	dev->id.class_id = (uint32_t)tmp & RTE_CLASS_ANY_ID;

	/* get max_vfs */
	dev->max_vfs = 0;
	snprintf(filename, sizeof(filename), "%s/max_vfs", dirname);
	if (!access(filename, F_OK) &&
	    eal_parse_sysfs_value(filename, &tmp) == 0)
		dev->max_vfs = (uint16_t)tmp;
	else {
		/* for non igb_uio driver, need kernel version >= 3.8 */
		snprintf(filename, sizeof(filename),
			 "%s/sriov_numvfs", dirname);
		if (!access(filename, F_OK) &&
		    eal_parse_sysfs_value(filename, &tmp) == 0)
			dev->max_vfs = (uint16_t)tmp;
	}

	/* get numa node, default to 0 if not present */
	snprintf(filename, sizeof(filename), "%s/numa_node",
		 dirname);

	if (access(filename, F_OK) != -1) {
		if (eal_parse_sysfs_value(filename, &tmp) == 0)
			dev->device.numa_node = tmp;
		else
			dev->device.numa_node = -1;
	} else {
		dev->device.numa_node = 0;
	}

	pci_name_set(dev);

	/* parse resources */
	snprintf(filename, sizeof(filename), "%s/resource", dirname);
	if (pci_parse_sysfs_resource(filename, dev) < 0) {
		RTE_LOG(ERR, EAL, "%s(): cannot parse resource\n", __func__);
		free(dev);
		return -1;
	}

	/* parse driver */
	snprintf(filename, sizeof(filename), "%s/driver", dirname);
	ret = pci_get_kernel_driver_by_path(filename, driver, sizeof(driver));
	if (ret < 0) {
		RTE_LOG(ERR, EAL, "Fail to get kernel driver\n");
		free(dev);
		return -1;
	}

	if (!ret) {
		if (!strcmp(driver, "vfio-pci"))
			dev->kdrv = RTE_KDRV_VFIO;
		else if (!strcmp(driver, "igb_uio"))
			dev->kdrv = RTE_KDRV_IGB_UIO;
		else if (!strcmp(driver, "uio_pci_generic"))
			dev->kdrv = RTE_KDRV_UIO_GENERIC;
		else
			dev->kdrv = RTE_KDRV_UNKNOWN;
	} else
		dev->kdrv = RTE_KDRV_NONE;

	/* device is valid, add in list (sorted) */
	if (TAILQ_EMPTY(&rte_pci_bus.device_list)) {
		rte_pci_add_device(dev);
	} else {
		struct rte_pci_device * dev2;
		int ret;

		TAILQ_FOREACH(dev2, &rte_pci_bus.device_list, next) {
			ret = rte_pci_addr_cmp(&dev->addr, &dev2->addr);
			if (ret > 0)
				continue;

			if (ret < 0) {
				rte_pci_insert_device(dev2, dev);
			} else { /* already registered */
				if (!rte_dev_is_probed(&dev2->device)) {
					dev2->kdrv = dev->kdrv;
					dev2->max_vfs = dev->max_vfs;
					pci_name_set(dev2);
					memmove(dev2->mem_resource,
						dev->mem_resource,
						sizeof(dev->mem_resource));
				} else {
					/**
					 * If device is plugged and driver is
					 * probed already, (This happens when
					 * we call rte_dev_probe which will
					 * scan all device on the bus) we don't
					 * need to do anything here unless...
					 **/
					if (dev2->kdrv != dev->kdrv ||
						dev2->max_vfs != dev->max_vfs)
						/*
						 * This should not happens.
						 * But it is still possible if
						 * we unbind a device from
						 * vfio or uio before hotplug
						 * remove and rebind it with
						 * a different configure.
						 * So we just print out the
						 * error as an alarm.
						 */
						RTE_LOG(ERR, EAL, "Unexpected device scan at %s!\n",
							filename);
					else if (dev2->device.devargs !=
						 dev->device.devargs) {
						rte_devargs_remove(dev2->device.devargs);
						pci_name_set(dev2);
					}
				}
				free(dev);
			}
			return 0;
		}

		rte_pci_add_device(dev);
	}

	return 0;
}

pci_parse_sysfs_resource above reads the physical addresses that the device's BAR registers are mapped to, and the device is then inserted into the device list sorted by address.
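For reference, each line of the sysfs resource file describes one BAR as three hexadecimal fields: start address, end address, and flags. The following simplified sketch (the helper name and the standalone IORESOURCE_MEM definition are illustrative; DPDK defines the same value internally) shows the essence of what pci_parse_sysfs_resource() extracts from one such line:

#include <inttypes.h>
#include <stdio.h>

#define IORESOURCE_MEM 0x00000200UL	/* memory BAR flag, same value DPDK uses */

/* Parse one line of /sys/bus/pci/devices/<addr>/resource:
 * "<start> <end> <flags>" in hex. Fill the physical address and length
 * for memory BARs, and skip I/O-port or empty entries. */
static int
example_parse_resource_line(const char *line, uint64_t *phys, uint64_t *len)
{
	uint64_t start, end, flags;

	if (sscanf(line, "%" SCNx64 " %" SCNx64 " %" SCNx64,
		   &start, &end, &flags) != 3)
		return -1;
	if (!(flags & IORESOURCE_MEM))
		return -1;
	*phys = start;
	*len = end - start + 1;
	return 0;
}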
2. Probe

/*
 * If vendor/device ID match, call the probe() function of all
 * registered driver for the given device. Return < 0 if initialization
 * failed, return 1 if no driver is found for this device.
 */
static int
pci_probe_all_drivers(struct rte_pci_device *dev)
{
	struct rte_pci_driver *dr = NULL;
	int rc = 0;

	if (dev == NULL)
		return -EINVAL;

	FOREACH_DRIVER_ON_PCIBUS(dr) {
		rc = rte_pci_probe_one_driver(dr, dev);
		if (rc < 0)
			/* negative value is an error */
			return rc;
		if (rc > 0)
			/* positive value means driver doesn't support it */
			continue;
		return 0;
	}
	return 1;
}

/*
 * Scan the content of the PCI bus, and call the probe() function for
 * all registered drivers that have a matching entry in its id_table
 * for discovered devices.
 */
static int
pci_probe(void)
{
	struct rte_pci_device *dev = NULL;
	size_t probed = 0, failed = 0;
	struct rte_devargs *devargs;
	int probe_all = 0;
	int ret = 0;

	if (rte_pci_bus.bus.conf.scan_mode != RTE_BUS_SCAN_WHITELIST)
		probe_all = 1;

	FOREACH_DEVICE_ON_PCIBUS(dev) {
		probed++;

		devargs = dev->device.devargs;
		/* probe all or only whitelisted devices */
		if (probe_all)
			ret = pci_probe_all_drivers(dev);
		else if (devargs != NULL &&
			devargs->policy == RTE_DEV_WHITELISTED)
			ret = pci_probe_all_drivers(dev);
		if (ret < 0) {
			if (ret != -EEXIST) {
				RTE_LOG(ERR, EAL, "Requested device "
					PCI_PRI_FMT " cannot be used\n",
					dev->addr.domain, dev->addr.bus,
					dev->addr.devid, dev->addr.function);
				rte_errno = errno;
				failed++;
			}
			ret = 0;
		}
	}

	return (probed && probed == failed) ? -1 : 0;
}

This code should look familiar to anyone with kernel development experience or who has read the kernel source: every discovered device is offered to every registered driver until one of them accepts it.
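For clarity, the matching that rte_pci_probe_one_driver() performs against each driver's id_table can be sketched roughly as follows (a simplified version of the real rte_pci_match() helper; the function name here is illustrative):

/* Simplified sketch of id_table matching: a driver accepts a device when a
 * table entry matches its vendor/device IDs, with RTE_PCI_ANY_ID acting as
 * a wildcard for the subsystem IDs. */
static int
example_pci_match(const struct rte_pci_driver *dr,
		  const struct rte_pci_device *dev)
{
	const struct rte_pci_id *id;

	for (id = dr->id_table; id->vendor_id != 0; id++) {
		if (id->vendor_id != dev->id.vendor_id ||
		    id->device_id != dev->id.device_id)
			continue;
		if (id->subsystem_vendor_id != RTE_PCI_ANY_ID &&
		    id->subsystem_vendor_id != dev->id.subsystem_vendor_id)
			continue;
		if (id->subsystem_device_id != RTE_PCI_ANY_ID &&
		    id->subsystem_device_id != dev->id.subsystem_device_id)
			continue;
		return 1;	/* this driver can drive the device */
	}
	return 0;
}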

3. Registration
The registration-related code is as follows:

//lib/librte_eal/common/include/rte_dev.h
/**
 * A structure describing a device driver.
 */
struct rte_driver {
	TAILQ_ENTRY(rte_driver) next;  /**< Next in list. */
	const char *name;                   /**< Driver name. */
	const char *alias;              /**< Driver alias. */
};

//drivers/bus/pci/pci_common.c, rte_pci_bus.h
/* register a driver */
void
rte_pci_register(struct rte_pci_driver *driver)
{
	TAILQ_INSERT_TAIL(&rte_pci_bus.driver_list, driver, next);
	driver->bus = &rte_pci_bus;
}
#define RTE_PMD_REGISTER_PCI(nm, pci_drv) \
RTE_INIT(pciinitfn_ ##nm) \
{\
	(pci_drv).driver.name = RTE_STR(nm);\
	rte_pci_register(&pci_drv); \
} \
RTE_PMD_EXPORT_NAME(nm, __COUNTER__)
/**
 * Structure describing the PCI bus
 */
struct rte_pci_bus {
	struct rte_bus bus;               /**< Inherit the generic class */
	struct rte_pci_device_list device_list;  /**< List of PCI devices */
	struct rte_pci_driver_list driver_list;  /**< List of PCI drivers */
};

These registrations ultimately turn into concrete driver entries on the list, such as the following NIC driver:

//drivers/net/e1000/igb_ethdev.c
static struct rte_pci_driver rte_igb_pmd = {
	.id_table = pci_id_igb_map,
	.drv_flags = RTE_PCI_DRV_NEED_MAPPING | RTE_PCI_DRV_INTR_LSC,
	.probe = eth_igb_pci_probe,
	.remove = eth_igb_pci_remove,
};

This registration is performed by the macro RTE_PMD_REGISTER_PCI(net_e1000_igb, rte_igb_pmd) (drivers/net/e1000/igb_ethdev.c).
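The virtio PMD registers itself the same way, via RTE_PMD_REGISTER_PCI(net_virtio, rte_virtio_pmd) in drivers/net/virtio/virtio_ethdev.c. Roughly speaking, the macro expands into a constructor that runs before main() and appends the driver to rte_pci_bus.driver_list, something like the following simplified sketch:

/* Approximate expansion of RTE_PMD_REGISTER_PCI(net_virtio, rte_virtio_pmd):
 * a constructor that names the driver and hooks it onto the PCI bus
 * driver list before main() runs. */
static void __attribute__((constructor, used))
pciinitfn_net_virtio(void)
{
	rte_virtio_pmd.driver.name = "net_virtio";
	rte_pci_register(&rte_virtio_pmd);
}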

4. Address Mapping
As mentioned above, after the sysfs files have been read, the physical resources have to be mapped, as in the following code:

//pci_common_uio.c
/* Map pci device */
int
rte_pci_map_device(struct rte_pci_device *dev)
{
	int ret = -1;

	/* try mapping the NIC resources */
	switch (dev->kdrv) {
	case RTE_KDRV_NIC_UIO:
		/* map resources for devices that use uio */
		ret = pci_uio_map_resource(dev);
		break;
	default:
		RTE_LOG(DEBUG, EAL,
			"  Not managed by a supported kernel driver, skipped\n");
		ret = 1;
		break;
	}

	return ret;
}
/* map the PCI resource of a PCI device in virtual memory */
int
pci_uio_map_resource(struct rte_pci_device *dev)
{
	int i, map_idx = 0, ret;
	uint64_t phaddr;
	struct mapped_pci_resource * uio_res = NULL;
	struct mapped_pci_res_list * uio_res_list =
		RTE_TAILQ_CAST(rte_uio_tailq.head, mapped_pci_res_list);

	dev->intr_handle.fd = -1;
	dev->intr_handle.uio_cfg_fd = -1;

	/* secondary processes - use already recorded details */
	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
		return pci_uio_map_secondary(dev);

	/* allocate uio resource */
	ret = pci_uio_alloc_resource(dev, &uio_res);
	if (ret)
		return ret;

	/* Map all BARs */
	for (i = 0; i != PCI_MAX_RESOURCE; i++) {
		/* skip empty BAR */
		phaddr = dev->mem_resource[i].phys_addr;
		if (phaddr == 0)
			continue;

		ret = pci_uio_map_resource_by_index(dev, i,
				uio_res, map_idx);
		if (ret)
			goto error;

		map_idx++;
	}

	uio_res->nb_maps = map_idx;

	TAILQ_INSERT_TAIL(uio_res_list, uio_res, next);

	return 0;
error:
	for (i = 0; i < map_idx; i++) {
		pci_unmap_resource(uio_res->maps[i].addr,
				(size_t)uio_res->maps[i].size);
		rte_free(uio_res->maps[i].path);
	}
	pci_uio_free_resource(dev, uio_res);
	return -1;
}
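At the bottom of that call chain, pci_uio_map_resource_by_index() ends up mmap()-ing the BAR so that it becomes directly accessible from user space. Here is a minimal sketch of that final step (the file path and the helper name are illustrative; the real code also handles the uio device node and the per-map offset bookkeeping):

#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

/* Sketch: map BAR N of a device into this process by mmap()-ing the
 * corresponding sysfs "resourceN" file read/write and shared. */
static void *
example_map_bar(const char *res_path, size_t bar_len)
{
	void *va;
	int fd = open(res_path, O_RDWR);	/* e.g. ".../0000:00:05.0/resource0" */

	if (fd < 0)
		return NULL;
	va = mmap(NULL, bar_len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	close(fd);
	return va == MAP_FAILED ? NULL : va;
}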

5. Related Functions in virtio
Now let's look at how the virtio-side functions mentioned earlier are implemented.
First, the original declarations:

//virtio_pci.h
/*
 * Function declaration from virtio_pci.c
 */
int vtpci_init(struct rte_pci_device *dev, struct virtio_hw *hw);
void vtpci_reset(struct virtio_hw *);

void vtpci_reinit_complete(struct virtio_hw *);

uint8_t vtpci_get_status(struct virtio_hw *);
void vtpci_set_status(struct virtio_hw *, uint8_t);

uint64_t vtpci_negotiate_features(struct virtio_hw *, uint64_t);

void vtpci_write_dev_config(struct virtio_hw *, size_t, const void *, int);

void vtpci_read_dev_config(struct virtio_hw *, size_t, void *, int);

uint8_t vtpci_isr(struct virtio_hw *);

enum virtio_msix_status vtpci_msix_detect(struct rte_pci_device *dev);

Then the call site and the implementation:

/*
 * This function is based on probe() function in virtio_pci.c
 * It returns 0 on success.
 */
int
eth_virtio_dev_init(struct rte_eth_dev *eth_dev)
{
	struct virtio_hw *hw = eth_dev->data->dev_private;
	int ret;

	if (sizeof(struct virtio_net_hdr_mrg_rxbuf) > RTE_PKTMBUF_HEADROOM) {
		PMD_INIT_LOG(ERR,
			"Not sufficient headroom required = %d, avail = %d",
			(int)sizeof(struct virtio_net_hdr_mrg_rxbuf),
			RTE_PKTMBUF_HEADROOM);

		return -1;
	}

	eth_dev->dev_ops = &virtio_eth_dev_ops;

	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
		if (!hw->virtio_user_dev) {
			ret = virtio_remap_pci(RTE_ETH_DEV_TO_PCI(eth_dev), hw);
			if (ret)
				return ret;
		}

		virtio_set_vtpci_ops(hw);
		set_rxtx_funcs(eth_dev);

		return 0;
	}

	/*
	 * Pass the information to the rte_eth_dev_close() that it should also
	 * release the private port resources.
	 */
	eth_dev->data->dev_flags |= RTE_ETH_DEV_CLOSE_REMOVE;

	/* Allocate memory for storing MAC addresses */
	eth_dev->data->mac_addrs = rte_zmalloc("virtio",
				VIRTIO_MAX_MAC_ADDRS * RTE_ETHER_ADDR_LEN, 0);
	if (eth_dev->data->mac_addrs == NULL) {
		PMD_INIT_LOG(ERR,
			"Failed to allocate %d bytes needed to store MAC addresses",
			VIRTIO_MAX_MAC_ADDRS * RTE_ETHER_ADDR_LEN);
		return -ENOMEM;
	}

	hw->port_id = eth_dev->data->port_id;
	/* For virtio_user case the hw->virtio_user_dev is populated by
	 * virtio_user_eth_dev_alloc() before eth_virtio_dev_init() is called.
	 */
	if (!hw->virtio_user_dev) {
		ret = vtpci_init(RTE_ETH_DEV_TO_PCI(eth_dev), hw);
		if (ret)
			goto err_vtpci_init;
	}

	rte_spinlock_init(&hw->state_lock);

	/* reset device and negotiate default features */
	ret = virtio_init_device(eth_dev, VIRTIO_PMD_DEFAULT_GUEST_FEATURES);
	if (ret < 0)
		goto err_virtio_init;

	hw->opened = true;

	return 0;

err_virtio_init:
	if (!hw->virtio_user_dev) {
		rte_pci_unmap_device(RTE_ETH_DEV_TO_PCI(eth_dev));
		if (!hw->modern)
			rte_pci_ioport_unmap(VTPCI_IO(hw));
	}
err_vtpci_init:
	rte_free(eth_dev->data->mac_addrs);
	eth_dev->data->mac_addrs = NULL;
	return ret;
}

The function above calls vtpci_init:

/*
 * Return -1:
 *   if there is error mapping with VFIO/UIO.
 *   if port map error when driver type is KDRV_NONE.
 *   if whitelisted but driver type is KDRV_UNKNOWN.
 * Return 1 if kernel driver is managing the device.
 * Return 0 on success.
 */
int
vtpci_init(struct rte_pci_device *dev, struct virtio_hw *hw)
{
	/*
	 * Try if we can succeed reading virtio pci caps, which exists
	 * only on modern pci device. If failed, we fallback to legacy
	 * virtio handling.
	 */
	if (virtio_read_caps(dev, hw) == 0) {
		PMD_INIT_LOG(INFO, "modern virtio pci detected.");
		virtio_hw_internal[hw->port_id].vtpci_ops = &modern_ops;
		hw->modern = 1;
		return 0;
	}

	PMD_INIT_LOG(INFO, "trying with legacy virtio pci.");
	if (rte_pci_ioport_map(dev, 0, VTPCI_IO(hw)) < 0) {
		rte_pci_unmap_device(dev);
		if (dev->kdrv == RTE_KDRV_UNKNOWN &&
		    (!dev->device.devargs ||
		     dev->device.devargs->bus !=
		     rte_bus_find_by_name("pci"))) {
			PMD_INIT_LOG(INFO,
				"skip kernel managed virtio device.");
			return 1;
		}
		return -1;
	}

	virtio_hw_internal[hw->port_id].vtpci_ops = &legacy_ops;
	hw->modern   = 0;

	return 0;
}

The main distinction here is between the legacy and the modern device: the former corresponds to virtio 0.95, the latter to virtio 1.0 and later.
The remaining functions are fairly simple; reading them alongside the source code is enough.
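To see what the two ops tables actually differ in, here is a side-by-side sketch modeled on legacy_set_status() and modern_set_status() in virtio_pci.c (the example_ function names are illustrative):

/* Legacy device (virtio 0.95): registers live in BAR 0 and are accessed
 * through I/O ports at the fixed offsets defined in virtio_pci.h. */
static void
example_legacy_set_status(struct virtio_hw *hw, uint8_t status)
{
	rte_pci_ioport_write(VTPCI_IO(hw), &status, 1, VIRTIO_PCI_STATUS);
}

/* Modern device (virtio 1.0+): registers are located through PCI
 * capabilities and accessed as memory-mapped fields of common_cfg. */
static void
example_modern_set_status(struct virtio_hw *hw, uint8_t status)
{
	rte_write8(status, &hw->common_cfg->device_status);
}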

IV. Summary

No wonder people say there are no secrets in front of the source code; it really is true. Especially at the lower layers, the code is essentially an implementation of an algorithm, a protocol, or a concrete specification. Once the relevant documents and techniques are understood, reading the corresponding code alongside them gives a very clear picture. Programmers in China often feel a certain discomfort reading open-source code and foreign frameworks, and one important reason is that the standards and protocols were defined by others. Without understanding those standards and protocols, reading the code alone turns into a confusing exercise in reverse inference: a lot of effort for little reward.
This is, in a sense, how abstraction is reflected back into reality.
