目录
一、概述
分析一下dpdk源码,基于dpdk19.02
二、初始化流程
2.1 main之前
main执行之前,会执行constructor,DPDK实现都在如下函数中,有四个优先级
[lib/librte_eal/common/include/rte_common.h]
/* Numeric constructor priorities, one per tag accepted by RTE_PRIO(). */
#define RTE_PRIORITY_LOG    101
#define RTE_PRIORITY_BUS    110
#define RTE_PRIORITY_CLASS  120
#define RTE_PRIORITY_LAST   65535

/* Turn a priority tag (LOG, BUS, CLASS, LAST) into RTE_PRIORITY_<tag>. */
#define RTE_PRIO(prio) RTE_PRIORITY_ ## prio

/*
 * Start the definition of `func` as a static constructor function with the
 * priority selected by `prio`; the function body follows the macro call.
 */
#define RTE_INIT_PRIO(func, prio) \
    static void \
    __attribute__((constructor(RTE_PRIO(prio)), used)) \
    func(void)

/* Same as RTE_INIT_PRIO() with the LAST priority. */
#define RTE_INIT(func) RTE_INIT_PRIO(func, LAST)
调用RTE_INIT_PRIO主要集中在如下几处:
[lib/librte_eal/common/eal_common_log.c]
RTE_INIT_PRIO(rte_log_init, LOG)
/*
 * Define a BUS-priority constructor named businitfn_<nm> that sets
 * (bus).name to RTE_STR(nm) and then passes &bus to rte_bus_register().
 */
#define RTE_REGISTER_BUS(nm, bus) \
    RTE_INIT_PRIO(businitfn_ ##nm, BUS) \
    { \
        (bus).name = RTE_STR(nm); \
        rte_bus_register(&bus); \
    }
/*
 * Define a CLASS-priority constructor named classinitfn_<nm> that sets
 * (cls).name to RTE_STR(nm) and then passes &cls to rte_class_register().
 */
#define RTE_REGISTER_CLASS(nm, cls) \
    RTE_INIT_PRIO(classinitfn_ ##nm, CLASS) \
    { \
        (cls).name = RTE_STR(nm); \
        rte_class_register(&cls); \
    }
和kernel的驱动模型类似,DPDK抽象了bus,class这些结构,
/* Generic bus object: list linkage, name, configuration and bus callbacks. */
struct rte_bus {
    /** Next bus object in linked list */
    TAILQ_ENTRY(rte_bus) next;
    /** Name of the bus */
    const char *name;
    /** Scan for devices attached to bus */
    rte_bus_scan_t scan;
    /** Probe devices on bus */
    rte_bus_probe_t probe;
    /** Find a device on the bus */
    rte_bus_find_device_t find_device;
    /** Probe single device for drivers */
    rte_bus_plug_t plug;
    /** Remove single device from driver */
    rte_bus_unplug_t unplug;
    /** Parse a device name */
    rte_bus_parse_t parse;
    /** Bus configuration */
    struct rte_bus_conf conf;
    /** Get iommu class */
    rte_bus_get_iommu_class_t get_iommu_class;
    /** Device iterator. */
    rte_dev_iterate_t dev_iterate;
    /** handle hot-unplug failure on the bus */
    rte_bus_hot_unplug_handler_t hot_unplug_handler;
    /** handle sigbus error on the bus */
    rte_bus_sigbus_handler_t sigbus_handler;
};
- bus主要抽象了一组bus接口,是其他bus类型的基类。
bus的注册也很简单,就是将bus结构挂到全局的rte_bus_list上,在我阅读的这个版本中,主要有以下几种bus类型:
Dpaa_bus.c (drivers\bus\dpaa):RTE_REGISTER_BUS(FSL_DPAA_BUS_NAME, rte_dpaa_bus.bus);
Fslmc_bus.c (drivers\bus\fslmc):RTE_REGISTER_BUS(FSLMC_BUS_NAME, rte_fslmc_bus.bus);
Ifpga_bus.c (drivers\bus\ifpga):RTE_REGISTER_BUS(IFPGA_BUS_NAME, rte_ifpga_bus);
Pci_common.c (drivers\bus\pci):RTE_REGISTER_BUS(pci, rte_pci_bus.bus);
Vdev.c (drivers\bus\vdev):RTE_REGISTER_BUS(vdev, rte_vdev_bus);
Vmbus_common.c (drivers\bus\vmbus):RTE_REGISTER_BUS(vmbus, rte_vmbus_bus.bus);
本文关注其中的两种bus:pci bus和vdev bus;RTE_REGISTER_BUS的第一个参数即为bus name
[drivers/bus/pci/pci_common.c]
/*
 * The PCI bus instance: an embedded generic bus (.bus) wired to the PCI
 * callbacks, plus initially empty device and driver lists.
 */
struct rte_pci_bus rte_pci_bus = {
    .bus = {
        .scan               = rte_pci_scan,
        .probe              = rte_pci_probe,
        .find_device        = pci_find_device,
        .plug               = pci_plug,
        .unplug             = pci_unplug,
        .parse              = pci_parse,
        .get_iommu_class    = rte_pci_get_iommu_class,
        .dev_iterate        = rte_pci_dev_iterate,
        .hot_unplug_handler = pci_hot_unplug_handler,
        .sigbus_handler     = pci_sigbus_handler,
    },
    .device_list = TAILQ_HEAD_INITIALIZER(rte_pci_bus.device_list),
    .driver_list = TAILQ_HEAD_INITIALIZER(rte_pci_bus.driver_list),
};
[drivers/bus/vdev/vdev.c]
/* The vdev bus instance; callbacks not listed here are left zero-initialized. */
static struct rte_bus rte_vdev_bus = {
    .scan        = vdev_scan,
    .probe       = vdev_probe,
    .find_device = rte_vdev_find_device,
    .plug        = vdev_plug,
    .unplug      = vdev_unplug,
    .parse       = vdev_parse,
    .dev_iterate = rte_vdev_dev_iterate,
};

/* Register the bus under the name "vdev" from a BUS-priority constructor. */
RTE_REGISTER_BUS(vdev, rte_vdev_bus);
rte_pci_bus继承rte_bus,主要还是应该借鉴了内核的实现——一种bus下挂多个设备,设备由基于特定总线的驱动程序来驱动,构成了三者的对应关系,关于bus的接口调用会在初始化的时候展开。
RTE_INIT对应最低优先级的注册,RTE_INIT对应的注册很多,主要是驱动的注册,也有对其进行的基本封装,如pci
/*
 * Define a LAST-priority constructor named pciinitfn_<nm> that sets
 * (pci_drv).driver.name to RTE_STR(nm) and passes &pci_drv to
 * rte_pci_register().
 *
 * NOTE(review): the excerpt ended with a dangling line continuation, which
 * would splice the following text into the macro; it is dropped here. The
 * upstream macro presumably continues with an export-name helper
 * (RTE_PMD_EXPORT_NAME) - confirm against rte_bus_pci.h.
 */
#define RTE_PMD_REGISTER_PCI(nm, pci_drv) \
    RTE_INIT(pciinitfn_ ##nm) \
    { \
        (pci_drv).driver.name = RTE_STR(nm); \
        rte_pci_register(&pci_drv); \
    }
基于pci总线的驱动有很多,列一下intel相关的网卡:
I40e_ethdev.c (drivers\net\i40e):RTE_PMD_REGISTER_PCI(net_i40e, rte_i40e_pmd);
I40e_ethdev_vf.c (drivers\net\i40e):RTE_PMD_REGISTER_PCI(net_i40e_vf, rte_i40evf_pmd);
Igb_ethdev.c (drivers\net\e1000):RTE_PMD_REGISTER_PCI(net_e1000_igb, rte_igb_pmd);
Igb_ethdev.c (drivers\net\e1000):RTE_PMD_REGISTER_PCI(net_e1000_igb_vf, rte_igbvf_pmd);
Ixgbe_ethdev.c (drivers\net\ixgbe):RTE_PMD_REGISTER_PCI(net_ixgbe, rte_ixgbe_pmd);
Ixgbe_ethdev.c (drivers\net\ixgbe):RTE_PMD_REGISTER_PCI(net_ixgbe_vf, rte_ixgbevf_pmd);
2.2 rte_eal_init
2.3 rte_eal_cpu_init
这个函数的目的就是要确定运行环境中node(socket), lcore, core的数量和他们之间的对应关系。
确定lcore_id和socket_id (node)的对应关系,注释写的很明白,通过判定下面的路径是否可访问
/sys/devices/system/node/nodeX/cpuX
确定cpu的在位情况,通过下面的信息是否可访问确定(每个lcore对应的core_id):
/sys/devices/system/cpu/cpuX/topology/core_id
最终这些信息都会在DPDK启动时输出出来
/*
 * Per-lcore detection result. Both core_id and socket_id are read from
 * lcore_config[] (the excerpt had "core_config", which names no object;
 * socket_id is a member of struct lcore_config).
 */
RTE_LOG(DEBUG, EAL, "Detected lcore %u as " "core %u on socket %u\n",
        lcore_id, lcore_config[lcore_id].core_id,
        lcore_config[lcore_id].socket_id);
/* Totals taken from the global configuration. */
RTE_LOG(INFO, EAL, "Detected %u lcore(s)\n", config->lcore_count);
RTE_LOG(INFO, EAL, "Detected %u NUMA nodes\n", config->numa_node_count);
这些信息存储在全局配置(rte_config)中,对应下面带注释的字段:
struct rte_config {
uint32_t master_lcore;
uint32_t lcore_count; //逻辑核的数量
uint32_t numa_node_count; //node的数量
uint32_t numa_nodes[RTE_MAX_NUMA_NODES]; //node对应的node id
uint32_t service_lcore_count;
enum rte_lcore_role_t lcore_role[RTE_MAX_LCORE]; //每个逻辑核的在位情况enum rte_proc_type_t process_type;
enum rte_iova_mode iova_mode;
struct rte_mem_config *mem_config;
} __attribute__((__packed__));
每个lcore自身的信息则存储在内部配置lcore_config中:
/* Per-lcore state: detection result, thread, pipes, pending job, topology. */
struct lcore_config {
    /** true if lcore was detected */
    unsigned detected;
    /** pthread identifier */
    pthread_t thread_id;
    /** communication pipe with master */
    int pipe_master2slave[2];
    /** communication pipe with master */
    int pipe_slave2master[2];
    /** function to call */
    lcore_function_t * volatile f;
    /** argument of function */
    void * volatile arg;
    /** return value of function */
    volatile int ret;
    /** lcore state */
    volatile enum rte_lcore_state_t state;
    /** physical socket id for this lcore */
    unsigned socket_id;
    /** core number on socket for this lcore */
    unsigned core_id;
    /** relative index, starting from 0 */
    int core_index;
    /** cpu set which the lcore affinity to */
    rte_cpuset_t cpuset;
    /** role of core eg: OFF, RTE, SERVICE */
    uint8_t core_role;
};
2.4 eal_parse_args
eal_parse_args和eal_log_level_parse流程一致,都是解析DPDK基本的参数,具体过程没什么可说的,把参数选项列一下。
- --log-level在eal_log_level_parse解析
2.4.1 eal_parse_common_option
- -b 黑名单
- -w 白名单,不能与-b同时指定 (eg. -w0000:02:01.0,-w0000:02:05.0)
上述两个参数都会将对应的类型及传递的参数加入devopt_list,具体参见eal_option_device_add。这里仅列一下类型:
/* Kind of device option recorded for a -w / -b / --vdev argument. */
enum rte_devtype {
    RTE_DEVTYPE_WHITELISTED_PCI,    /* first enumerator, value 0 */
    RTE_DEVTYPE_BLACKLISTED_PCI,
    RTE_DEVTYPE_VIRTUAL,
};
/* One queued device option: list linkage, device type and the raw argument. */
struct device_option {
    TAILQ_ENTRY(device_option) next;    /* tail-queue linkage */
    enum rte_devtype type;              /* which option produced this entry */
    char arg[];                         /* flexible array holding the argument text */
};
- -c coremask, 以掩码的形式指定使用哪个lcore,会综合lcore的在位情况,取交集——只将用户指定使用的cpu置为对应的状态
- -l corelist
- --lcores,用法在eal_parse_lcores的注释中找到
使用下面的定义描述core指定方式:
/* Which command-line form was used to select lcores. */
#define LCORE_OPT_LST 1 /* -l */
#define LCORE_OPT_MSK 2 /* -c */
#define LCORE_OPT_MAP 3 /* --lcores */
- -s 指定service core,和-c用法一致
- -S指定service corelist,和-l用法一致
在lcore_config的core_role中以下面的定义标记
/* Role assigned to a logical core. */
enum rte_lcore_role_t {
    ROLE_RTE,       /* value 0 */
    ROLE_OFF,       /* value 1 */
    ROLE_SERVICE,   /* value 2 */
};
- -m memory 指定memory大小
- -n channels 强制channels
- -r ranks 强制ranks
- -d 强制加载外部driver,其实就是使用动态库做插件机制,加入solib_list管理,后续分析eal_plugins_init会说明。
/* Entry describing one shared-object driver. */
struct shared_driver {
    TAILQ_ENTRY(shared_driver) next;    /* tail-queue linkage */
    char name[PATH_MAX];                /* name buffer, PATH_MAX bytes */
    void *lib_handle;                   /* opaque handle for the library */
};
- --huge-unlink
- --no-huge
- --legacy-mem
- (--huge-unlink 对应internal_config中的hugepage_unlink字段)
- --no-pci
- --no-hpet
- --vmware-tsc-map
- --no-shconf 不使用shared conf
- --in-memory 开启后默认打开--huge-unlink和--no-shconf
- --proc-type
proc-type主要包括如下类型:
/**
 * Process type in a linuxapp multi-process setup.
 */
enum rte_proc_type_t {
    /* allow auto-detection of primary/secondary */
    RTE_PROC_AUTO = -1,
    /* set to zero, so primary is the default */
    RTE_PROC_PRIMARY = 0,
    RTE_PROC_SECONDARY,
    RTE_PROC_INVALID
};
- --master-lcore ,指定master lcore,默认0
- --vdev 和-b/-w 用法类似,只不过 指定类型是RTE_DEVTYPE_VIRTUAL
- --syslog
- --single-file-segments
- --iova-mode 指定iommu的工作模式,pa/va
/**
 * IOVA mapping mode.
 *
 * This is the iommu programming mode of a device: depending on the mode,
 * a device (for example an IOMMU backed DMA device) generates physical or
 * virtual addresses.
 */
enum rte_iova_mode {
    /* Don't care mode */
    RTE_IOVA_DC = 0,
    /* DMA using physical address */
    RTE_IOVA_PA = (1 << 0),
    /* DMA using virtual address */
    RTE_IOVA_VA = (1 << 1)
};
- --vfio-intr
{ "legacy", RTE_INTR_MODE_LEGACY },
{ "msi", RTE_INTR_MODE_MSI },
{ "msix", RTE_INTR_MODE_MSIX },
- --huge-dir 指定hugepage的mount目录
- --file-prefix hugefile的前缀
- --socket-mem
- --socket-limit
- --base-virtaddr 指定mmap起始地址,16M对齐
- --create-uio-dev
- --mbuf-pool-ops-name
- --match-allocations
2.4.2 eal_create_runtime_dir
函数比较简单,就是为dpdk创建统一的runtime目录,用于存放运行时的各种文件,其默认路径如下,对应全局变量runtime_dir
/var/run/dpdk/rte
2.4.3 eal_adjust_config
主要是调整internal_config,包括
- 如果没有指定-c/-l/--lcores这些指定运行于哪个lcore的参数(core_parsed), 自动检测亲和性关系
- 如果没有指定--master-lcore(master_lcore_parsed),自动选择,一般是第一个可用lcore
基本上,参数解析都存储在internal_config这个结构:
/*
 * Internal EAL configuration; each member notes the command-line option
 * that fills it.
 *
 * force_sockets and match_allocations had been swallowed by the preceding
 * line comments in the excerpt; both are restored as their own
 * declarations. The hugepage_unlink option name is corrected to
 * --huge-unlink.
 */
struct internal_config {
    volatile size_t memory;                 /* -m memory */
    volatile unsigned force_nchannel;       /* -n channels */
    volatile unsigned force_nrank;          /* -r ranks */
    volatile unsigned no_hugetlbfs;         /* --no-huge */
    unsigned hugepage_unlink;               /* --huge-unlink */
    volatile unsigned no_pci;               /* --no-pci */
    volatile unsigned no_hpet;              /* --no-hpet */
    volatile unsigned vmware_tsc_map;       /* --vmware-tsc-map */
    volatile unsigned no_shconf;            /* --no-shconf */
    volatile unsigned in_memory;            /* --in-memory */
    volatile unsigned create_uio_dev;       /* --create-uio-dev */
    volatile enum rte_proc_type_t process_type; /* --proc-type */
    volatile unsigned force_sockets;        /* --socket-mem */
    volatile uint64_t socket_mem[RTE_MAX_NUMA_NODES];   /* --socket-mem */
    volatile unsigned force_socket_limits;  /* --socket-limit */
    volatile uint64_t socket_limit[RTE_MAX_NUMA_NODES]; /* --socket-limit */
    uintptr_t base_virtaddr;                /* --base-virtaddr */
    /* --legacy-mem; turned on by default when --no-huge is used */
    volatile unsigned legacy_mem;
    volatile unsigned match_allocations;    /* --match-allocations */
    volatile unsigned single_file_segments; /* --single-file-segments */
    volatile int syslog_facility;           /* --syslog */
    volatile enum rte_intr_mode vfio_intr_mode; /* --vfio-intr */
    char *hugefile_prefix;                  /* --file-prefix */
    char *hugepage_dir;                     /* --huge-dir */
    char *user_mbuf_pool_ops_name;          /* --mbuf-pool-ops-name */
    unsigned num_hugepage_sizes;
    struct hugepage_info hugepage_info[MAX_HUGEPAGE_SIZES];
    enum rte_iova_mode iova_mode;           /* --iova-mode */
    volatile unsigned int init_complete;
};
参数解析过后,rte_config赋值情况:
struct rte_config {
uint32_t master_lcore; // --master-lcore
uint32_t lcore_count; //逻辑核的数量
uint32_t numa_node_count; //node的数量
uint32_t numa_nodes[RTE_MAX_NUMA_NODES]; //node对应的node id
uint32_t service_lcore_count;
enum rte_lcore_role_t lcore_role[RTE_MAX_LCORE]; //每个逻辑核的在位情况enum rte_proc_type_t process_type;
enum rte_iova_mode iova_mode;
struct rte_mem_config *mem_config;
} __attribute__((__packed__));
2.5 eal_plugins_init
插件机制,就是动态库,插件的路径可以由-d参数指定,也可以由RTE_EAL_PMD_PATH(通过CONFIG_RTE_EAL_PMD_PATH)指定,函数会加载-d指定的so或者将指定路径下满足条件的插件加入管理(eal_plugin_add),并通过dlopen加载
2.6 eal_option_device_parse
还记得在参数解析的时候,有三个参数(-b/-w/--vdev),他们分别将设备的特性注册到了devopt_list上,在本节的函数调用时,bus的scan和probe还没有进行,先将用户对device的一些限制或者要求解析出来,这样后面做bus相关处理的时候就可以应用了。
函数的实现很简单,就是将之前devopt_list的项都摘下来,构造新参数rte_devargs并挂到新的全局devargs_list上
/**
 * A device given by the user, together with its arguments.
 *
 * A user device is a physical or a virtual device handed to the DPDK
 * application at startup through command line arguments. The structure
 * keeps the device configuration: its PCI identifier for a PCI device, or
 * the driver name for a virtual device.
 */
struct rte_devargs {
    TAILQ_ENTRY(rte_devargs) next;      /* tail-queue linkage */
    enum rte_devtype type;
    enum rte_dev_policy policy;
    char name[RTE_DEV_NAME_MAX_LEN];
    RTE_STD_C11
    union {
        /** Arguments string as given by user or "" for no argument. */
        char *args;
        const char *drv_str;
    };
    /** bus handle. */
    struct rte_bus *bus;
    /** class handle. */
    struct rte_class *cls;
    /** bus-related part of device string. */
    const char *bus_str;
    /** class-related part of device string. */
    const char *cls_str;
    /** Device string storage. */
    const char *data;
};
rte_devargs才是描述一个device构造参数的真正结构,之前用户传递的参数对应args变量,这里我们重点关注:
__rte_experimental int rte_devargs_parse(struct rte_devargs *da, const char *dev)
{
...
/* Retrieve eventual bus info */
do {
devname = dev;
bus = rte_bus_find(bus, bus_name_cmp, dev);
if (bus == NULL)
break;
devname = dev + strlen(bus->name) + 1;
if (rte_bus_find_by_device_name(devname) == bus)
break;
} while (1);
/* Store device name */
i = 0;
while (devname[i] != '\0' && devname[i] != ',') {
da->name[i] = devname[i];
i++;
if (i == maxlen) {
RTE_LOG(WARNING, EAL, "Parsing \"%s\": device name should be shorter than %zu\n",
dev, maxlen);
da->name[i - 1] = '\0';
return -EINVAL;
}
}
da->name[i] = '\0';
if (bus == NULL) {
bus = rte_bus_find_by_device_name(da->name);
if (bus == NULL) {
RTE_LOG(ERR, EAL, "failed to parse device \"%s\"\n",
da->name);
return -EFAULT;
}
}
da->bus = bus;
/* Parse eventual device arguments */
if (devname[i] == ',')
da->args = strdup(&devname[i + 1]);
else
da->args = strdup("");
if (da->args == NULL) {
RTE_LOG(ERR, EAL, "not enough memory to parse arguments\n");
return -ENOMEM;
}
return 0;
- rte_bus_find匹配dev和bus->name,如果是参数-w/-b dev是DBDF/BDF的格式,bus->name是总线名称,这时候不会匹配
- 如果不匹配(没有找到bus)就使用DBDF/BDF填充rte_devargs中的name
- 这时候由rte_bus_find_by_device_name寻找匹配的bus,深入这个函数可以发现它匹配的是DBDF/BDF这种格式,这时候一定会匹配pci bus,因为只有pci bus是这种格式
- 最后,对rte_devargs中的bus字段进行赋值。
解析过后,就是对其他rte_devargs字段进行赋值,只贴下代码,很简单,主要就是根据指定类型确定scan_mode和policy:
/* Record the device type and pick up the bus resolved during parsing. */
devargs->type = devtype;
bus = devargs->bus;

/* A blacklisted PCI device gets the blacklist policy. */
if (devargs->type == RTE_DEVTYPE_BLACKLISTED_PCI) {
    devargs->policy = RTE_DEV_BLACKLISTED;
}

/* Only a bus whose scan mode is still undefined gets one from the policy. */
if (bus->conf.scan_mode == RTE_BUS_SCAN_UNDEFINED) {
    if (devargs->policy == RTE_DEV_WHITELISTED) {
        bus->conf.scan_mode = RTE_BUS_SCAN_WHITELIST;
    } else if (devargs->policy == RTE_DEV_BLACKLISTED) {
        bus->conf.scan_mode = RTE_BUS_SCAN_BLACKLIST;
    }
}

/* Append to the global devargs list. */
TAILQ_INSERT_TAIL(&devargs_list, devargs, next);
2.7 rte_config_init
在这个阶段,默认情况下执行PRIMARY的流程
2.7.1 rte_eal_config_create
将/var/run/dpdk/rte/config文件(mem_cfg_fd)以共享方式(MAP_SHARED)映射到内存
/*
 * Map sizeof(*rte_config.mem_config) bytes of mem_cfg_fd, readable and
 * writable, as a shared mapping; the current value of rte_mem_cfg_addr is
 * passed as the address argument and then overwritten with the result.
 */
rte_mem_cfg_addr = mmap(rte_mem_cfg_addr, sizeof(*rte_config.mem_config),
PROT_READ | PROT_WRITE, MAP_SHARED, mem_cfg_fd, 0);
这块内存对应全局rte_config的mem_config,映射完成后,将基本的内容进行赋值:
/* Seed the mapped region with early_mem_config, then publish it. */
memcpy(rte_mem_cfg_addr, &early_mem_config, sizeof(early_mem_config));
rte_config.mem_config = rte_mem_cfg_addr;
/* store address of the config in the config itself so that secondary
* processes could later map the config into this exact location */
rte_config.mem_config->mem_cfg_addr = (uintptr_t) rte_mem_cfg_addr;
/* start with a DMA mask width of 0 */
rte_config.mem_config->dma_maskbits = 0;
2.7.2 eal_update_mem_config
将配置的参数进行赋值
/*
 * Copy the legacy_mem and single_file_segments settings from
 * internal_config into the mem_config reached through
 * rte_eal_get_configuration().
 */
static void
eal_update_mem_config(void)
{
    struct rte_mem_config *shared_cfg;

    shared_cfg = rte_eal_get_configuration()->mem_config;

    shared_cfg->legacy_mem = internal_config.legacy_mem;
    shared_cfg->single_file_segments = internal_config.single_file_segments;
}
最终看下赋值情况:
/**
 * Memory configuration for the RTE, referenced from the rte_config
 * structure. It is kept separate because, for multi-process support, the
 * memory details should be shared across instances.
 */
struct rte_mem_config {
    /** Magic number - Sanity check. */
    volatile uint32_t magic;
    /** Number of channels (0 if unknown). */
    uint32_t nchannel;
    /** Number of ranks (0 if unknown). */
    uint32_t nrank;
    /** only used by memzone LIB for thread-safe. */
    rte_rwlock_t mlock;
    /** used for tailq operation for thread safe. */
    rte_rwlock_t qlock;
    /** only used by mempool LIB for thread-safe. */
    rte_rwlock_t mplock;
    rte_rwlock_t memory_hotplug_lock;
    /** Memzone descriptors. */
    struct rte_fbarray memzones;
    struct rte_memseg_list memsegs[RTE_MAX_MEMSEG_LISTS];
    /** Tailqs for objects */
    struct rte_tailq_head tailq_head[RTE_MAX_TAILQ];
    struct malloc_heap malloc_heaps[RTE_MAX_HEAPS];
    int next_socket_id;
    uint64_t mem_cfg_addr;
    uint32_t legacy_mem;
    uint32_t single_file_segments;
    uint8_t dma_maskbits;
} __attribute__((__packed__));