一、DPDK的并发
前面提到了网络爆发式的发展,提出了C10K到c100K甚至更多并发的要求,其实本质上还是对数据接收和处理的速度,即从软件、硬件实现整体的最大性能的平衡。这时候在软件上多线程、多进程以至于并行编程就拿到台面上,毕竟这是解决接收和处理数据的一个最有力的方法和手段。
在前面也提到过,DPDK的优势在于缩短了数据流的路径,但另外一个重要的方面就是多核(CPU)的利用以及在指令处理方面使用了SIMD。正如前面所分析,合理的利用核心的数量和并处理好缓存的命中,就可以大幅的提高处理的效率,再加上类似于批处理的数量优势,自然就会有大幅度的效能的提升。
说得更通俗一些,就是要挖掘多核和多CPU以及硬件的资源,但这些资源的挖掘尽量要从软件上来实现。这就是DPDK并发的意义。
二、多线程模型及源码分析
DPDK中的线程创建是基于Posix库创建的,从这方面来看,就没有脱离开上层应用编程的范围。说这个的意思就是说,这块本质上是没有难度的。看一下相关的线程处理代码:
/* Launch threads, called at application init(). */
int
rte_eal_init(int argc, char **argv)
{
int i, fctret, ret;
pthread_t thread_id;
static rte_atomic32_t run_once = RTE_ATOMIC32_INIT(0);
const char *p;
static char logid[PATH_MAX];
char cpuset[RTE_CPU_AFFINITY_STR_LEN];
char thread_name[RTE_MAX_THREAD_NAME_LEN];
bool phys_addrs;
/* checks if the machine is adequate * /
if (!rte_cpu_is_supported()) {
rte_eal_init_alert("unsupported cpu type.");
rte_errno = ENOTSUP;
return -1;
}
if (!rte_atomic32_test_and_set(&run_once)) {
rte_eal_init_alert("already called initialization.");
rte_errno = EALREADY;
return -1;
}
p = strrchr(argv[0], '/');
strlcpy(logid, p ? p + 1 : argv[0], sizeof(logid));
thread_id = pthread_self();
eal_reset_internal_config(&internal_config);
/* set log level as early as possible * /
eal_log_level_parse(argc, argv);
if (rte_eal_cpu_init() < 0) {
rte_eal_init_alert("Cannot detect lcores.");
rte_errno = ENOTSUP;
return -1;
}
fctret = eal_parse_args(argc, argv);
if (fctret < 0) {
rte_eal_init_alert("Invalid 'command line' arguments.");
rte_errno = EINVAL;
rte_atomic32_clear(&run_once);
return -1;
}
if (eal_plugins_init() < 0) {
rte_eal_init_alert("Cannot init plugins");
rte_errno = EINVAL;
rte_atomic32_clear(&run_once);
return -1;
}
if (eal_option_device_parse()) {
rte_errno = ENODEV;
rte_atomic32_clear(&run_once);
return -1;
}
if (rte_config_init() < 0) {
rte_eal_init_alert("Cannot init config");
return -1;
}
if (rte_eal_intr_init() < 0) {
rte_eal_init_alert("Cannot init interrupt-handling thread");
return -1;
}
if (rte_eal_alarm_init() < 0) {
rte_eal_init_alert("Cannot init alarm");
/* rte_eal_alarm_init sets rte_errno on failure. */
return -1;
}
/* Put mp channel init before bus scan so that we can init the vdev
* bus through mp channel in the secondary process before the bus scan.
* /
if (rte_mp_channel_init() < 0 && rte_errno != ENOTSUP) {
rte_eal_init_alert("failed to init mp channel");
if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
rte_errno = EFAULT;
return -1;
}
}
/* register multi-process action callbacks for hotplug * /
if (eal_mp_dev_hotplug_init() < 0) {
rte_eal_init_alert("failed to register mp callback for hotplug");
return -1;
}
if (rte_bus_scan()) {
rte_eal_init_alert("Cannot scan the buses for devices");
rte_errno = ENODEV;
rte_atomic32_clear(&run_once);
return -1;
}
phys_addrs = rte_eal_using_phys_addrs() != 0;
/* if no EAL option "--iova-mode=<pa|va>", use bus IOVA scheme */
if (internal_config.iova_mode == RTE_IOVA_DC) {
/* autodetect the IOVA mapping mode */
enum rte_iova_mode iova_mode = rte_bus_get_iommu_class();
if (iova_mode == RTE_IOVA_DC) {
RTE_LOG(DEBUG, EAL, "Buses did not request a specific IOVA mode.\n");
if (!phys_addrs) {
/* if we have no access to physical addresses,
* pick IOVA as VA mode.
* /
iova_mode = RTE_IOVA_VA;
RTE_LOG(DEBUG, EAL, "Physical addresses are unavailable, selecting IOVA as VA mode.\n");
#if defined(RTE_LIBRTE_KNI) && LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0)
} else if (rte_eal_check_module("rte_kni") == 1) {
iova_mode = RTE_IOVA_PA;
RTE_LOG(DEBUG, EAL, "KNI is loaded, selecting IOVA as PA mode for better KNI performance.\n");
#endif
} else if (is_iommu_enabled()) {
/* we have an IOMMU, pick IOVA as VA mode */
iova_mode = RTE_IOVA_VA;
RTE_LOG(DEBUG, EAL, "IOMMU is available, selecting IOVA as VA mode.\n");
} else {
/* physical addresses available, and no IOMMU
* found, so pick IOVA as PA.
*/
iova_mode = RTE_IOVA_PA;
RTE_LOG(DEBUG, EAL, "IOMMU is not available, selecting IOVA as PA mode.\n");
}
}
#if defined(RTE_LIBRTE_KNI) && LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0)
/* Workaround for KNI which requires physical address to work
* in kernels < 4.10
* /
if (iova_mode == RTE_IOVA_VA &&
rte_eal_check_module("rte_kni") == 1) {
if (phys_addrs) {
iova_mode = RTE_IOVA_PA;
RTE_LOG(WARNING, EAL, "Forcing IOVA as 'PA' because KNI module is loaded\n");
} else {
RTE_LOG(DEBUG, EAL, "KNI can not work since physical addresses are unavailable\n");
}
}
#endif
rte_eal_get_configuration()->iova_mode = iova_mode;
} else {
rte_eal_get_configuration()->iova_mode =
internal_config.iova_mode;
}
if (rte_eal_iova_mode() == RTE_IOVA_PA && !phys_addrs) {
rte_eal_init_alert("Cannot use IOVA as 'PA' since physical addresses are not available");
rte_errno = EINVAL;
return -1;
}
RTE_LOG(INFO, EAL, "Selected IOVA mode '%s'\n",
rte_eal_iova_mode() == RTE_IOVA_PA ? "PA" : "VA");
if (internal_config.no_hugetlbfs == 0) {
/* rte_config isn't initialized yet * /
ret = internal_config.process_type == RTE_PROC_PRIMARY ?
eal_hugepage_info_init() :
eal_hugepage_info_read();
if (ret < 0) {
rte_eal_init_alert("Cannot get hugepage information.");
rte_errno = EACCES;
rte_atomic32_clear(&run_once);
return -1;
}
}
if (internal_config.memory == 0 && internal_config.force_sockets == 0) {
if (internal_config.no_hugetlbfs)
internal_config.memory = MEMSIZE_IF_NO_HUGE_PAGE;
}
if (internal_config.vmware_tsc_map == 1) {
#ifdef RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT
rte_cycles_vmware_tsc_map = 1;
RTE_LOG (DEBUG, EAL, "Using VMWARE TSC MAP, "
"you must have monitor_control.pseudo_perfctr = TRUE\n");
#else
RTE_LOG (WARNING, EAL, "Ignoring --vmware-tsc-map because "
"RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT is not set\n");
#endif
}
if (rte_eal_log_init(logid, internal_config.syslog_facility) < 0) {
rte_eal_init_alert("Cannot init logging.");
rte_errno = ENOMEM;
rte_atomic32_clear(&run_once);
return -1;
}
#ifdef VFIO_PRESENT
if (rte_eal_vfio_setup() < 0) {
rte_eal_init_alert("Cannot init VFIO");
rte_errno = EAGAIN;
rte_atomic32_clear(&run_once);
return -1;
}
#endif
/* in secondary processes, memory init may allocate additional fbarrays
* not present in primary processes, so to avoid any potential issues,
* initialize memzones first.
*/
if (rte_eal_memzone_init() < 0) {
rte_eal_init_alert("Cannot init memzone");
rte_errno = ENODEV;
return -1;
}
if (rte_eal_memory_init() < 0) {
rte_eal_init_alert("Cannot init memory");
rte_errno = ENOMEM;
return -1;
}
/* the directories are locked during eal_hugepage_info_init */
eal_hugedirs_unlock();
if (rte_eal_malloc_heap_init() < 0) {
rte_eal_init_alert("Cannot init malloc heap");
rte_errno = ENODEV;
return -1;
}
if (rte_eal_tailqs_init() < 0) {
rte_eal_init_alert("Cannot init tail queues for objects");
rte_errno = EFAULT;
return -1;
}
if (rte_eal_timer_init() < 0) {
rte_eal_init_alert("Cannot init HPET or TSC timers");
rte_errno = ENOTSUP;
return -1;
}
eal_check_mem_on_local_socket();
eal_thread_init_master(rte_config.master_lcore);
ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset));
RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%zx;cpuset=[%s%s])\n",
rte_config.master_lcore, (uintptr_t)thread_id, cpuset,
ret == 0 ? "" : "...");
RTE_LCORE_FOREACH_SLAVE(i) {
/*
* create communication pipes between master thread
* and children
* /
if (pipe(lcore_config[i].pipe_master2slave) < 0)
rte_panic("Cannot create pipe\n");
if (pipe(lcore_config[i].pipe_slave2master) < 0)
rte_panic("Cannot create pipe\n");
lcore_config[i].state = WAIT;
/* create a thread for each lcore * /
ret = pthread_create(&lcore_config[i].thread_id, NULL,
eal_thread_loop, NULL);
if (ret != 0)
rte_panic("Cannot create thread\n");
/* Set thread_name for aid in debugging. * /
snprintf(thread_name, sizeof(thread_name),
"lcore-slave-%d", i);
ret = rte_thread_setname(lcore_config[i].thread_id,
thread_name);
if (ret != 0)
RTE_LOG(DEBUG, EAL,
"Cannot set name for lcore thread\n");
}
/*
* Launch a dummy function on all slave lcores, so that master lcore
* knows they are all ready when this function returns.
*/
rte_eal_mp_remote_launch(sync_func, NULL, SKIP_MASTER);
rte_eal_mp_wait_lcore();
/* initialize services so vdevs register service during bus_probe. */
ret = rte_service_init();
if (ret) {
rte_eal_init_alert("rte_service_init() failed");
rte_errno = ENOEXEC;
return -1;
}
/* Probe all the buses and devices/drivers on them */
if (rte_bus_probe()) {
rte_eal_init_alert("Cannot probe devices");
rte_errno = ENOTSUP;
return -1;
}
#ifdef VFIO_PRESENT
/* Register mp action after probe() so that we got enough info */
if (rte_vfio_is_enabled("vfio") && vfio_mp_sync_setup() < 0)
return -1;
#endif
/* initialize default service/lcore mappings and start running. Ignore
* -ENOTSUP, as it indicates no service coremask passed to EAL.
*/
ret = rte_service_start_with_defaults();
if (ret < 0 && ret != -ENOTSUP) {
rte_errno = ENOEXEC;
return -1;
}
/*
* Clean up unused files in runtime directory. We do this at the end of
* init and not at the beginning because we want to clean stuff up
* whether we are primary or secondary process, but we cannot remove
* primary process' files because secondary should be able to run even
* if primary process is dead.
*
* In no_shconf mode, no runtime directory is created in the first
* place, so no cleanup needed.
* /
if (!internal_config.no_shconf && eal_clean_runtime_dir() < 0) {
rte_eal_init_alert("Cannot clear runtime directory");
return -1;
}
eal_mcfg_complete();
/* Call each registered callback, if enabled * /
rte_option_init();
return fctret;
}
这个函数在前面的文章中的例程中用过。看看开头的几个变量和rte_eal_cpu_init函数,不用看内部的实现,都应该猜到是什么了。在rte_eal_cpu_init处理好CPU,开始调用eal_parse_args来处理参数进行Master核的确定和CPU的管理(确定哪核可用)。
到这里,基本就应该明白要怎么使用线程了,再加上开头的pthread_t thread_id;应该立刻明白了吧。并行计算中处理的第一个要务就是对CPU的分配和管理,包括一些CPU控制在内。
接着往下看就是一系列的初始化如前面文章提到的,配置、内存、内存池、队列、定时器等等。最后到主核心线程和各个子线程核心的启动:
eal_thread_init_master(rte_config.master_lcore);
ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset));
RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%zx;cpuset=[%s%s])\n",
rte_config.master_lcore, (uintptr_t)thread_id, cpuset,
ret == 0 ? "" : "...");
RTE_LCORE_FOREACH_SLAVE(i) {
/*
* create communication pipes between master thread
* and children
*/
if (pipe(lcore_config[i].pipe_master2slave) < 0)
rte_panic("Cannot create pipe\n");
if (pipe(lcore_config[i].pipe_slave2master) < 0)
rte_panic("Cannot create pipe\n");
lcore_config[i].state = WAIT;
/* create a thread for each lcore */
ret = pthread_create(&lcore_config[i].thread_id, NULL,
eal_thread_loop, NULL);
if (ret != 0)
rte_panic("Cannot create thread\n");
/* Set thread_name for aid in debugging. * /
snprintf(thread_name, sizeof(thread_name),
"lcore-slave-%d", i);
ret = rte_thread_setname(lcore_config[i].thread_id,
thread_name);
if (ret != 0)
RTE_LOG(DEBUG, EAL,
"Cannot set name for lcore thread\n");
}
eal_thread_init_master和eal_thread_loop它们都会调用:
/* set affinity for current EAL thread */
static int
eal_thread_set_affinity(void)
{
unsigned lcore_id = rte_lcore_id();
/* acquire system unique id */
rte_gettid();
/* update EAL thread core affinity * /
return rte_thread_set_affinity(&lcore_config[lcore_id].cpuset);
}
来进行CPU的绑定。然后不同模块为了使用线程,就需要进行注册,即创建后紧随的调用rte_eal_mp_remote_launch:
/*
* Launch a dummy function on all slave lcores, so that master lcore
* knows they are all ready when this function returns.
*/
rte_eal_mp_remote_launch(sync_func, NULL, SKIP_MASTER);
rte_eal_mp_wait_lcore();
初始的是一个空的系统函数。有兴趣可以搜一搜这个函数的使用,你会发现真得好多。它的定义:
/*
* Check that every SLAVE lcores are in WAIT state, then call
* rte_eal_remote_launch() for all of them. If call_master is true
* (set to CALL_MASTER), also call the function on the master lcore.
*/
int
rte_eal_mp_remote_launch(int (*f)(void *), void *arg,
enum rte_rmt_call_master_t call_master)
{
int lcore_id;
int master = rte_get_master_lcore();
/* check state of lcores */
RTE_LCORE_FOREACH_SLAVE(lcore_id) {
if (lcore_config[lcore_id].state != WAIT)
return -EBUSY;
}
/* send messages to cores * /
RTE_LCORE_FOREACH_SLAVE(lcore_id) {
rte_eal_remote_launch(f, arg, lcore_id);
}
if (call_master == CALL_MASTER) {
lcore_config[master].ret = f(arg);
lcore_config[master].state = FINISHED;
}
return 0;
}
/*
* Send a message to a slave lcore identified by slave_id to call a
* function f with argument arg. Once the execution is done, the
* remote lcore switch in FINISHED state.
*/
int
rte_eal_remote_launch(int (*f)(void *), void *arg, unsigned slave_id)
{
int n;
char c = 0;
int m2s = lcore_config[slave_id].pipe_master2slave[1];
int s2m = lcore_config[slave_id].pipe_slave2master[0];
if (lcore_config[slave_id].state != WAIT)
return -EBUSY;
lcore_config[slave_id].f = f;
lcore_config[slave_id].arg = arg;
/* send message * /
n = 0;
while (n == 0 || (n < 0 && errno == EINTR))
n = write(m2s, &c, 1);
if (n < 0)
rte_panic("cannot write on configuration pipe\n");
/* wait ack * /
do {
n = read(s2m, &c, 1);
} while (n < 0 && errno == EINTR);
if (n <= 0)
rte_panic("cannot read on configuration pipe\n");
return 0;
}
这明显是注册函数用的,函数指针放到了第一个参数上。尽管创建线程的手法一致,但尽量还是使用DPDK创建的线程,它能更好的发挥DPDK的性能,原因在后面分析代码中就慢慢明白了。
从上面的代码和分析是不是总结中线程模型和常用的开发的模型有所不同之处,如果有做类似并行计算如OpenMP开发的会发现有相似之处,其实就是把线程的自由调用也就是时间片的任意轮转,固定到了指定的核心上。线程间通过PIPE进行通信。
通过上面找分析,可以总结出DPDK的并发模型从提高指令的并行度和多核并发出发,通过内存和NUMA的管理提高水平扩展的能力。利用CPU的亲和力,使用主从线程的分配管理,利用SIMD,提高整体的吞吐量。
而从实际情况来看,由于Linux内核是不区分线程和进程动作的,在内核看来二者都是Fork的结果。上下文的切换的消耗对于线程和进程没有不可忽视的区别。反而是线程和进程对于处理不同内存的共享时,导致的性能开销会有较大的差异。在多核心的情况下,由于线程伪共享的问题,多进程反而较多线程更有优势。在排除不同进程线程间通信这种复杂情况下,多进程性能还是要高于多线程,并且更健壮。
在DPDK中利用巨页机制可以降低多线程在频繁交互下对多进程的优势。所以,在实际的应用过程中,还是要根据实际情况来决定应用的具体情况。
三、多进程
同样在DPDK中对多进程也是支持的,需要在启动任务时,手动指定相关的主从进程:
./xxx -c 0xf -n 2 -- -q 2 -p 0xf -proc-type primary
./xxx -c 0xf -n 2 -- -q 2 -p 0xf -proc-type secondary
应用的一些参数说明:
基本的格式:
./xxx [options] -- -p PORTMASK [-q NQ -T t]
option:DPDK EAL的默认参数,其形式为 -c COREMASK -n NUM
COREMASK:十六进制位掩码表示分配的逻辑内核数量
NUM:十进制整数表示每个逻辑内核的内存通道数量
-p PORTMASK
PORTMASK:十六进制位掩码即分配的端口数量。0x3是指 后两位为1,即起点两个端口
-q NQ
NQ:分配到每个逻辑内核的收发队列数量
-T t
t: 打印统计数据上屏的时间间隔,默认为10秒
DPDK中,多进程的情况下,主从进程有点类似于线程共享进程的资源一样,主从进程共享大页内存以及一些相关的队列。说到共享,如果有Android框架分析经验的肯定会首先想起mmap。对,DPDK也是,所以说,多看代码,多看优秀的框架,会明白很多事儿。在这个基础上,才可能产生创新。
四、例程
前面看一个简单的多线程的,下面看自带的一个多进程的简装版:
#define RTE_LOGTYPE_APP RTE_LOGTYPE_USER1
static const char *_MSG_POOL = "MSG_POOL";
static const char *_SEC_2_PRI = "SEC_2_PRI";
static const char *_PRI_2_SEC = "PRI_2_SEC";
struct rte_ring *send_ring, *recv_ring;
struct rte_mempool *message_pool;
volatile int quit = 0;
static int lcore_recv(__attribute__((unused)) void *arg)
{
unsigned lcore_id = rte_lcore_id();
printf("Starting core %u\n", lcore_id);
while (!quit){
void * msg;
if (rte_ring_dequeue(recv_ring, &msg) < 0){
usleep(5);
continue;
}
printf("core %u: Received '%s'\n", lcore_id, (char * )msg);
rte_mempool_put(message_pool, msg);
}
return 0;
}
int main(int argc, char **argv)
{
const unsigned flags = 0;
const unsigned ring_size = 64;
const unsigned pool_size = 1024;
const unsigned pool_cache = 32;
const unsigned priv_data_sz = 0;
int ret;
unsigned lcore_id;
ret = rte_eal_init(argc, argv);
if (ret < 0)
rte_exit(EXIT_FAILURE, "Cannot init EAL\n");
if (rte_eal_process_type() == RTE_PROC_PRIMARY){
send_ring = rte_ring_create( _ PRI_2_SEC, ring_size, rte_socket_id(), flags);
recv_ring = rte_ring_create(_ SEC_2_PRI, ring_size, rte_socket_id(), flags);
message_pool = rte_mempool_create(_ MSG_POOL, pool_size,
STR_TOKEN_SIZE, pool_cache, priv_data_sz,
NULL, NULL, NULL, NULL,
rte_socket_id(), flags);
} else {
recv_ring = rte_ring_lookup(_ PRI_2_SEC);
send_ring = rte_ring_lookup(_SEC_2_PRI);
message_pool = rte_mempool_lookup(_ MSG_POOL);
}
if (send_ring == NULL)
rte_exit(EXIT_FAILURE, "Problem getting sending ring\n");
if (recv_ring == NULL)
rte_exit(EXIT_FAILURE, "Problem getting receiving ring\n");
if (message_pool == NULL)
rte_exit(EXIT_FAILURE, "Problem getting message pool\n");
RTE_LOG(INFO, APP, "Finished Process Init.\n");
/* call lcore_recv() on every slave lcore * /
RTE_LCORE_FOREACH_SLAVE(lcore_id) {
rte_eal_remote_launch(lcore_recv, NULL, lcore_id);
}
/* call cmd prompt on master lcore * /
struct cmdline * cl = cmdline_stdin_new(simple_mp_ctx, "\nsimple_mp > ");
if (cl == NULL)
rte_exit(EXIT_FAILURE, "Cannot create cmdline instance\n");
cmdline_interact(cl);
cmdline_stdin_exit(cl);
rte_eal_mp_wait_lcore();
/* clean up the EAL * /
rte_eal_cleanup();
return 0;
}
重点看一下,前面对进程的判断和创建过程与线程有何不同,会更好的理解DPDK的并行模型。
五、总结
DPDK通过NUMA整合CPU的管理,通过对CPU核心的具体分析来确定主从线程,配置亲和性保证缓存的命中率真并尽量减小上下文的场景开销。从这些分析来看,其实个人开发一个好的后台服务,方法不就学到了。这时回头再看前面文章写服务程序时对OS的配置进行一系列的配置,是不是也是这个原理。正所谓举一反三,明白了通向高性能的方向,余下的就是设计和优化代码了。
俗话说得好:师父领进门,修行在个人。