rte_eal_remote_launch()
在过去的几篇文章中,我们重点分析了DPDK初始化过程rte_eal_init()的主要流程,了解了其内存分配方式,以及primary和secondary进程之间如何实现数据共享。Hello world例子中,在DPDK初始化完成之后,调用rte_eal_remote_launch()让指定lcore上的worker线程执行一个函数。接下来简单分析一下该过程是如何实现的。
在init()过程的第30步中简单介绍了每个worker lcore上都会启动一个线程,入口函数是eal_thread_loop(),该函数会循环地从main_to_worker管道中接收消息,然后执行lcore_config[lcore_id].f(),最后返回结果。rte_eal_remote_launch()的实现即是基于此,分析如下:
/**
 * Hand the job f(arg) to the lcore identified by worker_id.
 *
 * The request is signalled by writing one byte on the main-to-worker pipe
 * and is treated as launched once one byte has been read back from the
 * worker-to-main pipe.
 *
 * @param f          Entry point stored in lcore_config[worker_id].f.
 * @param arg        Argument stored in lcore_config[worker_id].arg.
 * @param worker_id  Index of the target lcore in lcore_config[].
 * @return 0 once the acknowledgement byte has been read; -EBUSY when the
 *         lcore is not in the WAIT state. Pipe failures end in rte_panic().
 */
int rte_eal_remote_launch(int (*f)(void *), void *arg, unsigned int worker_id) {
    /* Write end of main->worker pipe, read end of worker->main pipe. */
    const int to_worker = lcore_config[worker_id].pipe_main2worker[1];
    const int from_worker = lcore_config[worker_id].pipe_worker2main[0];
    char token = 0;
    int status = -EBUSY;
    int nbytes;

    /* Only an lcore in the WAIT state accepts a new job. */
    if (lcore_config[worker_id].state == WAIT) {
        /* Store the job before signalling, so it is in place when the
         * worker is woken up. */
        lcore_config[worker_id].f = f;
        lcore_config[worker_id].arg = arg;

        /* Send the wake-up byte; retry on a 0-byte write and on EINTR. */
        do {
            nbytes = write(to_worker, &token, 1);
        } while (nbytes == 0 || (nbytes < 0 && errno == EINTR));
        if (nbytes < 0)
            rte_panic("cannot write on configuration pipe\n");

        /* Wait for the acknowledgement byte; retry on EINTR. */
        do {
            nbytes = read(from_worker, &token, 1);
        } while (nbytes < 0 && errno == EINTR);
        if (nbytes <= 0)
            rte_panic("cannot read on configuration pipe\n");

        /* The ack arrived: the request was accepted. This says nothing
         * about f() having finished. */
        status = 0;
    }

    /* Trace hook, invoked on both the success and the -EBUSY path. */
    rte_eal_trace_thread_remote_launch(f, arg, worker_id, status);
    return status;
}
在Hello world最后,调用rte_eal_mp_wait_lcore()对每个lcore调用rte_eal_wait_lcore(),该func主要是检查lcore_config[lcore_id].state是否是FINISH状态,如果是FINISH状态则改为WAIT状态,表示该lcore上可以接受下一次执行任务了。
example/ethtool
至此DPDK的Hello world基本上分析完毕。接下来我们以DPDK项目中example/ethtool这个例子为例,分析一下如何使用DPDK对网卡进行操作。
/**
 * Entry point of the ethtool sample application.
 *
 * Initialises the EAL, counts the available Ethernet ports (capped at
 * MAX_PORTS), sets the ports up, launches worker_main on another lcore,
 * runs ethapp_main() on the calling lcore and finally waits for the
 * worker lcores.
 *
 * @param argc  Argument count, forwarded to rte_eal_init().
 * @param argv  Argument vector, forwarded to rte_eal_init().
 * @return 0 on normal termination, -1 if rte_eal_wait_lcore() reports an
 *         error. Fatal start-up problems terminate through rte_exit().
 */
int main(int argc, char **argv) {
    int cnt_args_parsed;
    uint32_t id_core;
    uint32_t cnt_ports;

    cnt_args_parsed = rte_eal_init(argc, argv);
    if (cnt_args_parsed < 0)
        rte_exit(EXIT_FAILURE, "rte_eal_init(): Failed\n");

    cnt_ports = rte_eth_dev_count_avail();
    /* cnt_ports is unsigned, so print it with an unsigned conversion. */
    printf("Number of NICs: %u\n", (unsigned int)cnt_ports);
    if (cnt_ports == 0)
        rte_exit(EXIT_FAILURE, "No available NIC ports!\n");
    if (cnt_ports > MAX_PORTS) {
        /* "only <used> of <found>": the cap comes first, the total second. */
        printf("Info: Using only %u of %u ports\n",
               (unsigned int)MAX_PORTS, (unsigned int)cnt_ports);
        cnt_ports = MAX_PORTS;
    }

    setup_ports(&app_cfg, cnt_ports);
    app_cfg.exit_now = 0;
    app_cfg.cnt_ports = cnt_ports;

    if (rte_lcore_count() < 2)
        rte_exit(EXIT_FAILURE, "No available worker core!\n");

    /* Assume there is an available worker.. */
    id_core = rte_lcore_id();
    /* NOTE(review): the (1, 1) arguments presumably mean "skip the main
     * lcore" and "wrap around" - confirm against the EAL API. */
    id_core = rte_get_next_lcore(id_core, 1, 1);
    rte_eal_remote_launch(worker_main, NULL, id_core);

    ethapp_main();

    /* Set the exit flag once ethapp_main() returns, then wait for every
     * worker lcore. */
    app_cfg.exit_now = 1;
    RTE_LCORE_FOREACH_WORKER(id_core) {
        if (rte_eal_wait_lcore(id_core) < 0)
            return -1;
    }
    return 0;
}
rte_eth_dev_count_avail()
初始化完成之后,调用rte_eth_dev_count_avail()获取当前有多少个可用的网卡设备,其实现如下:
uint16_t rte_eth_dev_count_avail(void) {
uint16_t p;
uint16_t count = 0;
RTE_ETH_FOREACH_DEV(p)
count++;
return count;
}
/* Iterate over the ports whose owner id is RTE_ETH_DEV_NO_OWNER. */
#define RTE_ETH_FOREACH_DEV(pid) \
    RTE_ETH_FOREACH_DEV_OWNED_BY(pid, RTE_ETH_DEV_NO_OWNER)

/*
 * Expands to a for-loop header: start at the first port found from id 0
 * for owner oid, continue while the id is below RTE_MAX_ETHPORTS, and
 * advance to the next matching port after the current one.
 */
#define RTE_ETH_FOREACH_DEV_OWNED_BY(pid, oid) \
    for (pid = rte_eth_find_next_owned_by(0, oid); \
         (unsigned int)pid < (unsigned int)RTE_MAX_ETHPORTS; \
         pid = rte_eth_find_next_owned_by(pid + 1, oid))
/**
 * Find the next in-use port, starting at port_id, whose owner id matches.
 *
 * @param port_id   First port id to examine.
 * @param owner_id  Owner id the port's data->owner.id must equal.
 * @return The matching port id, or the value rte_eth_find_next() returns
 *         once the search has run past the last port.
 */
uint64_t rte_eth_find_next_owned_by(uint16_t port_id, const uint64_t owner_id) {
    for (port_id = rte_eth_find_next(port_id);
         port_id < RTE_MAX_ETHPORTS;
         port_id = rte_eth_find_next(port_id + 1)) {
        /* Stop at the first port owned by owner_id. */
        if (rte_eth_devices[port_id].data->owner.id == owner_id)
            break;
    }
    return port_id;
}
/**
 * Find the first port at or after port_id whose state is not
 * RTE_ETH_DEV_UNUSED.
 *
 * @param port_id  First port id to examine.
 * @return The port id found, or RTE_MAX_ETHPORTS when there is none.
 */
uint16_t rte_eth_find_next(uint16_t port_id) {
    for (; port_id < RTE_MAX_ETHPORTS; port_id++) {
        if (rte_eth_devices[port_id].state != RTE_ETH_DEV_UNUSED)
            return port_id;
    }
    /* Ran off the end of the table (or started beyond it). */
    return RTE_MAX_ETHPORTS;
}
从以上代码可看出,统计可用的dev数量是通过遍历rte_eth_devices数组,计算其中无owner且状态不是unused的项的数量来实现的。rte_eth_devices数组是一个比较重要的数组,该数组的数据填充是由各驱动完成的。接下来我们以pci驱动的ixgbe设备为例,分析在DPDK中如何注册总线和在总线上注册驱动,以及网卡设备的探测工作。
rte_bus_scan()
在前面分析rte_eal_init()过程的第17步中,我们提到过初始化过程会调用rte_bus_scan(),scan过程是依次遍历rte_bus_list这个模块全局列表中的每一个总线元素,然后依次调用总线的scan方法。每一种总线是通过注册的方式添加到rte_bus_list这个列表中的,调用的方法是rte_bus_register()
/*
 * Register a bus: verify the required fields, then append the bus to the
 * global rte_bus_list. Lines shown as "……" are parts elided from this
 * excerpt.
 */
void rte_bus_register(struct rte_bus *bus) {
/* The bus pointer itself and a non-empty name are required. */
RTE_VERIFY(bus);
RTE_VERIFY(bus->name && strlen(bus->name));
……
/* The scan, probe and find_device callbacks must be set. */
RTE_VERIFY(bus->scan);
RTE_VERIFY(bus->probe);
RTE_VERIFY(bus->find_device);
……
/* A bus that provides plug must also provide unplug. */
RTE_VERIFY(!bus->plug || bus->unplug);
/* All checks passed: add the bus at the tail of the list. */
TAILQ_INSERT_TAIL(&rte_bus_list, bus, next);
……
}
其中RTE_VERIFY这个宏定义检查给定的条件是否成立(这里主要用来检查指针是不是空的),条件不成立则抛出panic。最后将总线插入到列表中。该func是封装成下面的宏定义提供给总线驱动使用的:
/*
 * Define an initialisation function named businitfn_<bus_nm> that stores
 * the stringified name in the bus object and passes the object to
 * rte_bus_register().
 */
#define RTE_REGISTER_BUS(bus_nm, bus_obj) \
    RTE_INIT_PRIO(businitfn_##bus_nm, BUS) \
    { \
        (bus_obj).name = RTE_STR(bus_nm); \
        rte_bus_register(&bus_obj); \
    }

/*
 * Emit the header of a static void function carrying the constructor
 * attribute (with priority RTE_PRIO(level)) and the used attribute.
 * The caller supplies the function body.
 */
#define RTE_INIT_PRIO(fn, level) \
    static void __attribute__((constructor(RTE_PRIO(level)), used)) fn(void)
当PCI总线驱动注册时,使用方法如下:
/**
 * PCI bus object: the generic rte_bus plus the PCI-specific device and
 * driver lists.
 */
struct rte_pci_bus {
struct rte_bus bus; /**< Generic bus part, embedded as the first member */
struct rte_pci_device_list device_list; /**< List of PCI devices */
struct rte_pci_driver_list driver_list; /**< List of PCI drivers */
};
/*
 * The PCI bus instance. The .bus member holds the callbacks that the PCI
 * bus supplies for the generic bus interface; both lists start as empty
 * tail queues.
 */
struct rte_pci_bus rte_pci_bus = {
.bus = {
/* Callbacks that rte_bus_register() requires to be set. */
.scan = rte_pci_scan,
.probe = pci_probe,
.find_device = pci_find_device,
/* plug and unplug are provided as a pair. */
.plug = pci_plug,
.unplug = pci_unplug,
.parse = pci_parse,
.dma_map = pci_dma_map,
.dma_unmap = pci_dma_unmap,
.get_iommu_class = rte_pci_get_iommu_class,
.dev_iterate = rte_pci_dev_iterate,
.hot_unplug_handler = pci_hot_unplug_handler,
.sigbus_handler = pci_sigbus_handler,
},
.device_list = TAILQ_HEAD_INITIALIZER(rte_pci_bus.device_list),
.driver_list = TAILQ_HEAD_INITIALIZER(rte_pci_bus.driver_list),
};
/* Register the embedded generic bus under the name "pci". */
RTE_REGISTER_BUS(pci, rte_pci_bus.bus);
从结构上可以看出,PCI总线定义了自己的结构类型rte_pci_bus,该结构将rte_bus进行了封装,并额外增加了两个列表,PCI设备列表和设备驱动列表。在定义PCI总线的地方,调用了宏RTE_REGISTER_BUS,这个宏在展开之后,变成了如下内容:
/*
 * Expansion of RTE_REGISTER_BUS(pci, rte_pci_bus.bus): a function with the
 * constructor attribute that sets the bus name from RTE_STR(pci) and
 * registers the bus.
 */
static void __attribute__((constructor(RTE_PRIORITY_BUS), used)) businitfn_pci(void) {
(rte_pci_bus.bus).name = RTE_STR(pci);
rte_bus_register(&rte_pci_bus.bus);
}
这里宏定义变成了一个带有GCC constructor属性的函数(即所谓的"构造函数"),带有该属性的函数会在执行main函数之前就得以执行,故在DPDK进程正式开始执行前,总线就会被注册到总线列表当中了。
PCI的scan方法被调用时,会执行rte_pci_scan()
/*
 * Scan the PCI sysfs directory and hand every accepted entry to
 * pci_scan_one(). Lines shown as "……" are parts elided from this excerpt.
 *
 * Returns 0 when the whole directory was processed, -1 if the directory
 * cannot be opened or pci_scan_one() fails for an entry.
 */
int rte_pci_scan(void) {
struct dirent *e;
DIR *dir;
char dirname[PATH_MAX];
struct rte_pci_addr addr;
……
/* Open the sysfs PCI device directory (/sys/bus/pci/devices). */
dir = opendir(rte_pci_get_sysfs_path());
if (dir == NULL) {
RTE_LOG(ERR, EAL, "%s(): opendir failed: %s\n",
__func__, strerror(errno));
return -1;
}
……
while ((e = readdir(dir)) != NULL) {
/* Skip entries whose name starts with '.', such as "." and "..". */
if (e->d_name[0] == '.')
continue;
/* Skip names that do not parse as a PCI address. */
if (parse_pci_addr_format(e->d_name, sizeof(e->d_name), &addr) != 0)
continue;
/* Skip devices that rte_pci_ignore_device() reports as ignored. */
if (rte_pci_ignore_device(&addr))
continue;
/* Build "<sysfs path>/<entry name>" and process that device. */
snprintf(dirname, sizeof(dirname), "%s/%s",
rte_pci_get_sysfs_path(), e->d_name);
if (pci_scan_one(dirname, &addr) < 0)
goto error;
}
closedir(dir);
return 0;
/* Failure path: close the directory before reporting the error. */
error:
closedir(dir);
return -1;
}
该func中,会遍历系统的/sys/bus/pci/devices目录,获取每一个设备的addr信息。设备的addr信息实际上就是设备的PCI地址(即domain:bus:devid.function形式的BDF地址),分为以下4个部分:
/**
 * Address of a PCI device, made of four parts: domain, bus, device and
 * function.
 */
struct rte_pci_addr {
uint32_t domain; /**< Device domain */
uint8_t bus; /**< Device bus */
uint8_t devid; /**< Device ID */
uint8_t function; /**< Device function. */
};
获取设备的addr之后,如果这个设备不是需要无视的设备,那么调用pci_scan_one()来处理这个设备。
未完待续……