DPDK技术详解:工作原理与环境搭建实践指南

一、DPDK环境搭建

工具准备:VMware、ubuntu16.04。

(1)VMware添加两个网卡。桥接网卡作为 DPDK 运行的网卡,NAT 网卡作为 ssh 连接的网卡。
vmware_network
(2)修改网卡配置信息。找到虚拟机安装目录下的.vmx文件,内容如下:

.encoding = "GBK"
config.version = "8"
virtualHW.version = "16"
mks.enable3d = "TRUE"
pciBridge0.present = "TRUE"
pciBridge4.present = "TRUE"
pciBridge4.virtualDev = "pcieRootPort"
pciBridge4.functions = "8"
pciBridge5.present = "TRUE"
pciBridge5.virtualDev = "pcieRootPort"
pciBridge5.functions = "8"
pciBridge6.present = "TRUE"
pciBridge6.virtualDev = "pcieRootPort"
pciBridge6.functions = "8"
pciBridge7.present = "TRUE"
pciBridge7.virtualDev = "pcieRootPort"
pciBridge7.functions = "8"
vmci0.present = "TRUE"
hpet0.present = "TRUE"
nvram = "Ubuntu64bit1604.nvram"
virtualHW.productCompatibility = "hosted"
powerType.powerOff = "soft"
powerType.powerOn = "soft"
powerType.suspend = "soft"
powerType.reset = "soft"
displayName = "Ubuntu64bit1604"
usb.vbluetooth.startConnected = "TRUE"
guestOS = "ubuntu-64"
tools.syncTime = "FALSE"
sound.autoDetect = "TRUE"
sound.fileName = "-1"
sound.present = "TRUE"
numvcpus = "4"
cpuid.coresPerSocket = "2"
vcpu.hotadd = "TRUE"
memsize = "8192"
mem.hotadd = "TRUE"
scsi0.virtualDev = "lsilogic"
scsi0.present = "TRUE"
sata0.present = "TRUE"
scsi0:0.fileName = "Ubuntu64bit1604.vmdk"
scsi0:0.present = "TRUE"
sata0:1.deviceType = "cdrom-image"
sata0:1.fileName = "G:\迅雷下载\ubuntu-16.04.7-desktop-amd64.iso"
sata0:1.present = "TRUE"
usb.present = "TRUE"
ehci.present = "TRUE"
svga.graphicsMemoryKB = "786432"
ethernet0.addressType = "generated"
ethernet0.virtualDev = "e1000"
serial0.fileType = "thinprint"
serial0.fileName = "thinprint"
ethernet0.present = "TRUE"
serial0.present = "TRUE"
extendedConfigFile = "Ubuntu64bit1604.vmxf"
floppy0.present = "FALSE"
uuid.bios = "56 4d fd 84 0c 40 c1 bb-af 47 c0 00 9f b9 71 74"
uuid.location = "56 4d fd 84 0c 40 c1 bb-af 47 c0 00 9f b9 71 74"
scsi0:0.redo = ""
pciBridge0.pciSlotNumber = "17"
pciBridge4.pciSlotNumber = "21"
pciBridge5.pciSlotNumber = "22"
pciBridge6.pciSlotNumber = "23"
pciBridge7.pciSlotNumber = "24"
scsi0.pciSlotNumber = "16"
usb.pciSlotNumber = "32"
ethernet0.pciSlotNumber = "33"
sound.pciSlotNumber = "34"
ehci.pciSlotNumber = "35"
vmci0.pciSlotNumber = "36"
sata0.pciSlotNumber = "37"
svga.vramSize = "268435456"
vmotion.checkpointFBSize = "4194304"
vmotion.checkpointSVGAPrimarySize = "268435456"
ethernet0.generatedAddress = "00:0C:29:B9:71:74"
ethernet0.generatedAddressOffset = "0"
vmci0.id = "-1615236748"
monitor.phys_bits_used = "43"
cleanShutdown = "TRUE"
softPowerOff = "FALSE"
usb:1.speed = "2"
usb:1.present = "TRUE"
usb:1.deviceType = "hub"
usb:1.port = "1"
usb:1.parent = "-1"
svga.guestBackedPrimaryAware = "TRUE"
tools.remindInstall = "FALSE"
gui.lastPoweredViewMode = "fullscreen"
toolsInstallManager.updateCounter = "2"
toolsInstallManager.lastInstallError = "0"
sata0:1.startConnected = "TRUE"
isolation.tools.hgfs.disable = "FALSE"
sharedFolder0.present = "TRUE"
sharedFolder0.enabled = "TRUE"
sharedFolder0.readAccess = "TRUE"
sharedFolder0.writeAccess = "TRUE"
sharedFolder0.hostPath = "G:\share"
sharedFolder0.guestName = "share"
sharedFolder0.expiration = "never"
sharedFolder.maxNum = "1"
usb:0.present = "TRUE"
usb:0.deviceType = "hid"
usb:0.port = "0"
usb:0.parent = "-1"
ethernet1.connectionType = "nat"
ethernet1.addressType = "generated"
ethernet1.virtualDev = "e1000"
ethernet1.present = "TRUE"

将 ethernet0.virtualDev 由 e1000 修改为 vmxnet3,因为 vmware 的 vmxnet3 支持多队列网卡,同时添加一行 ethernet0.wakeOnPcktRcv = "TRUE":

ethernet0.virtualDev = "vmxnet3"
ethernet0.wakeOnPcktRcv = "TRUE"

重启虚拟机,查看网卡,成功被配置为vmxnet3:

ethtool -i eth0

ethtool

(3)查看是否支持多队列网卡:

cat /proc/interrupts
...
  55:          0          0          0          0   PCI-MSI 407552-edge      PCIe PME, pciehp
  56:       1181          0          0      12784   PCI-MSI 1572864-edge      eth0-rxtx-0
  57:          0        193          0         22   PCI-MSI 1572865-edge      eth0-rxtx-1
  58:         41         36          0          0   PCI-MSI 1572866-edge      eth0-rxtx-2
  59:          0          5         13          0   PCI-MSI 1572867-edge      eth0-rxtx-3
  60:          0          0          0          0   PCI-MSI 1572868-edge      eth0-event-4
  61:          0        163        151          4   PCI-MSI 1130496-edge      ahci[0000:02:05.0]
...

这里虚拟机有多少个CPU就有多少个队列。

(4)修改 ubuntu 系统的启动参数。

sudo vim /etc/default/grub

主要是在GRUB_CMDLINE_LINUX添加如下内容:

default_hugepagesz=2M hugepagesz=2M hugepages=1024

如果是物理主机则添加的是如下内容:

# 物理机:
default_hugepagesz=1G hugepagesz=1G hugepages=20 isolcpus=0-7

示例:

# If you change this file, run 'update-grub' afterwards to update
# /boot/grub/grub.cfg.
# For full documentation of the options in this file, see:
#   info -f grub -n 'Simple configuration'

GRUB_DEFAULT=0
GRUB_HIDDEN_TIMEOUT=0
GRUB_HIDDEN_TIMEOUT_QUIET=true
GRUB_TIMEOUT=10
GRUB_DISTRIBUTOR=`lsb_release -i -s 2> /dev/null || echo Debian`
GRUB_CMDLINE_LINUX_DEFAULT="quiet splash"
# Kernel command line used for every boot entry:
#   net.ifnames=0 biosdevname=0  keep the classic ethN interface names
#   default_hugepagesz=2M        default huge page size (note the trailing
#                                "z"; "default_hugepages=" is not a kernel
#                                parameter)
#   hugepagesz=2M hugepages=1024 reserve 1024 huge pages of 2 MiB at boot
GRUB_CMDLINE_LINUX="net.ifnames=0 biosdevname=0 default_hugepagesz=2M hugepagesz=2M hugepages=1024"

# Uncomment to enable BadRAM filtering, modify to suit your needs
# This works with Linux (no patch required) and with any kernel that obtains
# the memory map information from GRUB (GNU Mach, kernel of FreeBSD ...)
#GRUB_BADRAM="0x01234567,0xfefefefe,0x89abcdef,0xefefefef"

# Uncomment to disable graphical terminal (grub-pc only)
#GRUB_TERMINAL=console

# The resolution used on graphical terminal
# note that you can use only modes which your graphic card supports via VBE
# you can see them in real GRUB with the command `vbeinfo'
#GRUB_GFXMODE=640x480

# Uncomment if you don't want GRUB to pass "root=UUID=xxx" parameter to Linux
#GRUB_DISABLE_LINUX_UUID=true

# Uncomment to disable generation of recovery mode menu entries
#GRUB_DISABLE_RECOVERY="true"

# Uncomment to get a beep at grub start
#GRUB_INIT_TUNE="480 440 1"

修改完之后需要重启。

1.1、编译DPDK

(1)下载 dpdk,随便一个版本,不同版本之间子系统接口会有差异,这里下载的是19.08.2版本:

wget https://fast.dpdk.org/rel/dpdk-19.08.2.tar.xz

(2)解压源码:

tar -xvf dpdk-19.08.2.tar.xz 
cd dpdk-19.08.2
ls
app  buildtools  config  devtools  doc  drivers  examples  GNUmakefile  kernel  lib  license  
MAINTAINERS  Makefile  meson.build  meson_options.txt  mk  README  usertools  VERSION

(3)安装依赖库:

sudo apt-get install numactl
sudo apt-get install libnuma-dev
sudo apt-get install net-tools

(4)通过 usertools/dpdk-setup.sh 编译,64 位系统选择 39(x86_64-native-linux-gcc),编译完会多出 x86_64-native-linux-gcc 的文件夹。

------------------------------------------------------------------------------
 RTE_SDK exported as /home/fly/workspace/dpdk-stable-19.08.2
------------------------------------------------------------------------------
----------------------------------------------------------
 Step 1: Select the DPDK environment to build
----------------------------------------------------------
[1] arm64-armada-linuxapp-gcc
[2] arm64-armada-linux-gcc
[3] arm64-armv8a-linuxapp-clang
[4] arm64-armv8a-linuxapp-gcc
[5] arm64-armv8a-linux-clang
[6] arm64-armv8a-linux-gcc
[7] arm64-bluefield-linuxapp-gcc
[8] arm64-bluefield-linux-gcc
[9] arm64-dpaa2-linuxapp-gcc
[10] arm64-dpaa2-linux-gcc
[11] arm64-dpaa-linuxapp-gcc
[12] arm64-dpaa-linux-gcc
[13] arm64-octeontx2-linuxapp-gcc
[14] arm64-octeontx2-linux-gcc
[15] arm64-stingray-linuxapp-gcc
[16] arm64-stingray-linux-gcc
[17] arm64-thunderx2-linuxapp-gcc
[18] arm64-thunderx2-linux-gcc
[19] arm64-thunderx-linuxapp-gcc
[20] arm64-thunderx-linux-gcc
[21] arm64-xgene1-linuxapp-gcc
[22] arm64-xgene1-linux-gcc
[23] arm-armv7a-linuxapp-gcc
[24] arm-armv7a-linux-gcc
[25] i686-native-linuxapp-gcc
[26] i686-native-linuxapp-icc
[27] i686-native-linux-gcc
[28] i686-native-linux-icc
[29] ppc_64-power8-linuxapp-gcc
[30] ppc_64-power8-linux-gcc
[31] x86_64-native-bsdapp-clang
[32] x86_64-native-bsdapp-gcc
[33] x86_64-native-freebsd-clang
[34] x86_64-native-freebsd-gcc
[35] x86_64-native-linuxapp-clang
[36] x86_64-native-linuxapp-gcc
[37] x86_64-native-linuxapp-icc
[38] x86_64-native-linux-clang
[39] x86_64-native-linux-gcc
[40] x86_64-native-linux-icc
[41] x86_x32-native-linuxapp-gcc
[42] x86_x32-native-linux-gcc

----------------------------------------------------------
 Step 2: Setup linux environment
----------------------------------------------------------
[43] Insert IGB UIO module
[44] Insert VFIO module
[45] Insert KNI module
[46] Setup hugepage mappings for non-NUMA systems
[47] Setup hugepage mappings for NUMA systems
[48] Display current Ethernet/Baseband/Crypto device settings
[49] Bind Ethernet/Baseband/Crypto device to IGB UIO module
[50] Bind Ethernet/Baseband/Crypto device to VFIO module
[51] Setup VFIO permissions

----------------------------------------------------------
 Step 3: Run test application for linux environment
----------------------------------------------------------
[52] Run test application ($RTE_TARGET/app/test)
[53] Run testpmd application in interactive mode ($RTE_TARGET/app/testpmd)

----------------------------------------------------------
 Step 4: Other tools
----------------------------------------------------------
[54] List hugepage info from /proc/meminfo

----------------------------------------------------------
 Step 5: Uninstall and system cleanup
----------------------------------------------------------
[55] Unbind devices from IGB UIO or VFIO driver
[56] Remove IGB UIO module
[57] Remove VFIO module
[58] Remove KNI module
[59] Remove hugepage mappings

[60] Exit Script

(5)导出dpdk环境变量。

cd dpdk路径
# 如 dpdk/dpdk-stable-19.08.2/
# 切换root权限
sudo su 
export RTE_SDK=dpdk路径
export RTE_TARGET=x86_64-native-linux-gcc

(6)配置dpdk。

./usertools/dpdk-setup.sh

依次执行:
43(加载DPDK UIO 模块,即插入driver)
44(加载VFIO模块,也是一种driver)
45(加载KNI模块,将一些数据写回内核)
46(设置巨页,可以不需要频繁页交换,512)
47(设置巨页,可512)
49(执行之前需要eth0 down掉,执行sudo ifconfig eth0 down,使绑定dpdk)pci地址=对应eth0的(如0000:03:00.0)
60(退出)

二、DPDK工作原理

网络上所有的数据传输都要经过网卡,网卡将模拟信号转换为数字信号,也就是将物理层信号转换为数据链路层信号。

A 网卡 驱动(NIC) 协议栈 应用程序 B 网线 信号转换 sk_buffer recv/read par [主机接收数据] send/write 数据包 par [主机发送数据] 网线 A 网卡 驱动(NIC) 协议栈 应用程序 B

这个过程会进行两次拷贝,第一次是从网卡拷贝到内核空间(由网卡驱动完成),用于组织sk_buff;第二次是从内核空间拷贝到用户空间,应用程序处理数据。这些拷贝需要CPU的参与,会占用CPU资源。

DPDK的作用:

  1. 把网卡的数据映射到内存。
  2. 而且DPDK提供巨页机制,将原来的4K每页提升到2M、1G的巨页。
  3. 控制多队列网卡,提高性能;在启动线程时做了CPU亲缘性。
  4. DPDK提供一种KNI(Kernel Network Interface)方式,将不关注的数据写回内核让内核处理。
DMA
KNI
网卡
内存
DPDK处理
网络协议栈
VFS (posix api)
应用程序

DPDK能用来做以下应用:

  1. 路由器。
  2. 网络协议栈的基石部分。
  3. 防火墙。
  4. VPN。

DPDK的优势:

  1. 有专门的公司维护,有保障。
  2. 问题能够快速解决,迭代出新的版本。
  3. 文档齐全。

三、DPDK实践之处理UDP数据

dpdk初始化失败的情况:内存不够(比如主机只有4G内存)、没有设置hugepage、没有bind pci。

mbuf类似内核的sk_buff。

rte_eal_init()
rte_pktmbuf_pool_create()
rte_eth_dev_configure()
  1. 参数初始化。
  2. 创建内存池。
  3. 配置网口。
  4. 设置网口队列的使用,dpdk接收数据队列、dpdk发送数据队列。
  5. 开始启动。
  6. 混杂模式:disable–只接收目的MAC地址是本机网卡的帧(以及广播帧);enable–接收到达网卡的所有帧,不论目的MAC地址是否为本机。
  7. 开始接收数据。

示例代码(dpdk_udp.c):

#include <stdint.h>
#include <stdio.h>

#include <arpa/inet.h>
#include <netinet/in.h>

#include <rte_byteorder.h>
#include <rte_eal.h>
#include <rte_ethdev.h>
#include <rte_ether.h>
#include <rte_ip.h>
#include <rte_mbuf.h>
#include <rte_udp.h>

#define MBUF_NUM	1024
#define MBUF_SIZE	32

int gDpdkPortId=0;

int main(int argc,char **argv)
{
	// 
	if(rte_eal_init(argc,argv)<0)
	{
		rte_exit(EXIT_FAILURE,"Init Error\n");
	}
	// per_lcore_socket_id
	struct rte_mempool *mbuf_pool=rte_pktmbuf_pool_create("mbufpool",MBUF_NUM,0,0,RTE_MBUF_DEFAULT_BUF_SIZE,rte_socket_id());

	if(!mbuf_pool)
	{
		rte_exit(EXIT_FAILURE,"mbuf_pool Error\n");
	}
	
	uint16_t nb_rx_queues = 1;
	uint16_t nb_tx_queues = 0;
	const struct rte_eth_conf port_conf_default = {
		.rxmode = {.max_rx_pkt_len = RTE_ETHER_MAX_LEN }
	};
	rte_eth_dev_configure(gDpdkPortId,nb_rx_queues,nb_tx_queues,&port_conf_default);

	rte_eth_rx_queue_setup(gDpdkPortId,0,128,rte_eth_dev_socket_id(gDpdkPortId),NULL,mbuf_pool);

	//rte_eth_tx_queue_setup(...)

	rte_eth_dev_start(gDpdkPortId);

	rte_eth_promiscuous_enable(gDpdkPortId);

	while(1)
	{
		struct rte_mbuf *mbuf[MBUF_SIZE];
		unsigned num_recvd = rte_eth_rx_burst(gDpdkPortId, 0, mbuf, MBUF_SIZE);
		if (num_recvd > MBUF_SIZE) {
			rte_exit(EXIT_FAILURE, "Error receiving from eth\n");
		}
		//printf("recv pkt\n");
		unsigned i = 0;
		for (i = 0;i < num_recvd;i ++) {
			// 将mbufs数据包中的以太网头提取出来
			struct rte_ether_hdr *ehdr = rte_pktmbuf_mtod(mbuf[i], struct rte_ether_hdr*);
			//将一个16位的值从CPU顺序转换为大端序
			if (ehdr->ether_type != rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4)) {
				continue;
			}
			// 将mbufs数据包中的ipv4头提取出来
			struct rte_ipv4_hdr *iphdr =  rte_pktmbuf_mtod_offset(mbuf[i], struct rte_ipv4_hdr *, 
				sizeof(struct rte_ether_hdr));
			
			if (iphdr->next_proto_id == IPPROTO_UDP) {

				struct rte_udp_hdr *udphdr = (struct rte_udp_hdr *)(iphdr + 1);
				uint16_t length = ntohs(udphdr->dgram_len);
				*((char*)udphdr + length -1) = '\0';
				printf("udp:%s\n",(char *)(udphdr+1));
			}
		}
	}

	return 0;
}

可以通过 gcc 命令编译:

gcc -o dpdk_udp dpdk_udp.c -I /usr/local/include/dpdk/ -ldpdk -lpthread -lnuma -ldl

也可以写如下的makefile文件来编译(推荐)。
Makefile:

# Build file for the dpdk_udp sample application.
#
# Two build paths are supported:
#   1. pkg-config: used when `pkg-config --exists libdpdk` succeeds. Produces
#      build/$(APP)-shared (default) or build/$(APP)-static and points the
#      build/$(APP) symlink at whichever was built last.
#   2. Legacy DPDK make system: used otherwise. Requires RTE_SDK to point at
#      the DPDK source tree; RTE_TARGET is auto-detected when unset.
#
# Recipe lines must start with a hard TAB character.

# binary name
APP = dpdk_udp

# all source are stored in SRCS-y
SRCS-y := dpdk_udp.c

# Build using pkg-config variables if possible
ifeq ($(shell pkg-config --exists libdpdk && echo 0),0)

all: shared
.PHONY: all shared static
shared: build/$(APP)-shared
	ln -sf $(APP)-shared build/$(APP)
static: build/$(APP)-static
	ln -sf $(APP)-static build/$(APP)

PKGCONF=pkg-config --define-prefix

# The .pc file is a prerequisite so that a DPDK upgrade triggers a rebuild.
PC_FILE := $(shell $(PKGCONF) --path libdpdk)
CFLAGS += -O3 $(shell $(PKGCONF) --cflags libdpdk)
LDFLAGS_SHARED = $(shell $(PKGCONF) --libs libdpdk)
LDFLAGS_STATIC = -Wl,-Bstatic $(shell $(PKGCONF) --static --libs libdpdk)

# "| build" is order-only: the directory must exist, but its timestamp must
# not force a relink.
build/$(APP)-shared: $(SRCS-y) Makefile $(PC_FILE) | build
	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED)

build/$(APP)-static: $(SRCS-y) Makefile $(PC_FILE) | build
	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_STATIC)

build:
	@mkdir -p $@

.PHONY: clean
clean:
	rm -f build/$(APP) build/$(APP)-static build/$(APP)-shared
	test -d build && rmdir -p build || true

else

ifeq ($(RTE_SDK),)
$(error "Please define RTE_SDK environment variable")
endif

# Default target, detect a build directory, by looking for a path with a .config
RTE_TARGET ?= $(notdir $(abspath $(dir $(firstword $(wildcard $(RTE_SDK)/*/.config)))))

include $(RTE_SDK)/mk/rte.vars.mk

CFLAGS += -O3
CFLAGS += $(WERROR_FLAGS)

# rte.extapp.mk supplies the rules that build $(APP) from $(SRCS-y).
include $(RTE_SDK)/mk/rte.extapp.mk

endif

总结

  1. 多队列网卡只负责数据包的接收,不能做到逻辑控制。
  2. dpdk不能保证udp的可靠,它只负责将数据转移到内存。
  3. dpdk学习路线:核心是写代码,在dpdk环境搭建完成之后,实践写一个协议栈代码(比如eth、ip、arp、icmp、tcp、udp);然后尝试实现一些posix api(比如epoll的实现);最后再做一些应用(比如vpp、ovs、dpvs、pktgen等)。
  • 3
    点赞
  • 27
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

Lion Long

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值