现象
DPDK场景下网卡批量发包失败(计划发送N个,实际只能发送M个)。
用GDB跟进去的话会发现是网卡发送队列的DD标志位未置位。
原因
DPDK 程序通过 ixgbe 或者 i40e pmd 驱动可以直接操作网卡的寄存器和 ring buffer。当上层业务存在问题,发送一些“不合规”的数据包时,这些有问题的数据包会直接进入网卡,引发硬件问题。
在内核态时,这些有问题的数据包会被内核驱动检查过滤掉。
但是DPDK程序的检查是自己控制的,无法做到精准检查。
具体原因
引发发送丢包/失败的数据包包括以下几类:
包长小于14Byte
包长大于9674Byte
TCP报文分段数目大于8
TSO MSS小于256Byte
TSO MSS大于9674Byte
报文的ol_flags (offload flags)不正确
Mbuf链里mbuf个数和第一个mbuf的 nb_segs值不一样
Mbuf链里任意一个mbuf的data_len=0
Mbuf链长度大于8
其他目前还未知的原因
上面的几个原因在 i40e_prep_pkts 中基本都可以检查出来。
发包hang住时查看原因
以82599 ixgbe网卡为例,当发现无法发送包时,则此时通过gdb查看原因。
1>查看是不是DD位不对。
2>查看队列中是否存在异常包。
以dpvs为例:
基本信息:
lan:
ip: 10.23.11.66/27 ; mac: 24:A5:2C:06:A5:B7
gw: 10.23.11.65; mac: 80:e4:55:40:eb:55
local-ip: 10.23.11.67-85
wan:
ip: 198.19.193.76/31; mac:24:A5:2C:06:A5:B8
gw: 198.19.193.77; mac: 80:e4:55:41:a5:a9;
初步分析:
可以收包,但是无法发包。
gdb 查看,如下所示:
简单查看单个mbuf信息
(gdb) p rte_eth_devices[0]
$2 = {
rx_pkt_burst = 0x49a090 <ixgbe_recv_pkts_bulk_alloc>,
tx_pkt_burst = 0x49b6e0 <ixgbe_xmit_pkts>,
tx_pkt_prepare = 0x498dc0 <ixgbe_prep_pkts>,
data = 0x17ffb1580,
process_private = 0x0,
dev_ops = 0xe63f60 <ixgbe_eth_dev_ops>,
device = 0xaa549c0,
intr_handle = 0xaa54aa0,
link_intr_cbs = {
tqh_first = 0x0,
tqh_last = 0x94955c0 <rte_eth_devices+64>
},
post_rx_burst_cbs = {0x0 <repeats 1024 times>},
pre_tx_burst_cbs = {0x0 <repeats 1024 times>},
state = RTE_ETH_DEV_ATTACHED,
security_ctx = 0x0
}
(gdb) p *rte_eth_devices[0].data
$8 = {
name = "0000:3b:00.0", '\000' <repeats 51 times>,
rx_queues = 0x140002140,
tx_queues = 0x140002040,
nb_rx_queues = 16,
nb_tx_queues = 17,
sriov = {
active = 0 '\000',
nb_q_per_pool = 0 '\000',
def_vmdq_idx = 0,
def_pool_q_idx = 0
},
dev_private = 0x17ffa9dc0,
dev_link = {
link_speed = 10000,
link_duplex = 1,
link_autoneg = 1,
link_status = 1
},
dev_conf = {
link_speeds = 0,
rxmode = {
mq_mode = ETH_MQ_RX_RSS,
max_rx_pkt_len = 1518,
split_hdr_size = 0,
offloads = 3
},
txmode = {
mq_mode = ETH_MQ_TX_NONE,
offloads = 14,
pvid = 0,
hw_vlan_reject_tagged = 0 '\000',
hw_vlan_reject_untagged = 0 '\000',
hw_vlan_insert_pvid = 0 '\000'
},
lpbk_mode = 0,
rx_adv_conf = {
rss_conf = {
rss_key = 0x0,
rss_key_len = 0 '\000',
rss_hf = 232756
},
vmdq_dcb_conf = {
nb_queue_pools = 0,
enable_default_pool = 0 '\000',
default_pool = 0 '\000',
nb_pool_maps = 0 '\000',
pool_map = {{
vlan_id = 0,
pools = 0
} <repeats 64 times>},
dcb_tc = "\000\000\000\000\000\000\000"
},
dcb_rx_conf = {
nb_tcs = 0,
dcb_tc = "\000\000\000\000\000\000\000"
},
vmdq_rx_conf = {
nb_queue_pools = 0,
enable_default_pool = 0 '\000',
default_pool = 0 '\000',
enable_loop_back = 0 '\000',
nb_pool_maps = 0 '\000',
rx_mode = 0,
pool_map = {{
vlan_id = 0,
pools = 0
} <repeats 64 times>}
}
},
tx_adv_conf = {
vmdq_dcb_tx_conf = {
nb_queue_pools = 0,
dcb_tc = "\000\000\000\000\000\000\000"
},
dcb_tx_conf = {
nb_tcs = 0,
dcb_tc = "\000\000\000\000\000\000\000"
},
vmdq_tx_conf = {
nb_queue_pools = 0
}
},
dcb_capability_en = 0,
fdir_conf = {
mode = RTE_FDIR_MODE_PERFECT,
pballoc = RTE_FDIR_PBALLOC_64K,
status = RTE_FDIR_REPORT_STATUS,
drop_queue = 127 '\177',
mask = {
vlan_tci_mask = 0,
ipv4_mask = {
src_ip = 0,
dst_ip = 4294967295,
tos = 0 '\000',
ttl = 0 '\000',
proto = 0 '\000'
},
ipv6_mask = {
src_ip = {0, 0, 0, 0},
dst_ip = {4294967295, 4294967295, 4294967295, 4294967295},
tc = 0 '\000',
proto = 0 '\000',
hop_limits = 0 '\000'
},
src_port_mask = 0,
dst_port_mask = 3840,
mac_addr_byte_mask = 0 '\000',
tunnel_id_mask = 0,
tunnel_type_mask = 0 '\000'
},
flex_conf = {
nb_payloads = 0,
nb_flexmasks = 0,
flex_set = {{
type = RTE_ETH_PAYLOAD_UNKNOWN,
src_offset = {0 <repeats 16 times>}
}, {
type = RTE_ETH_PAYLOAD_UNKNOWN,
src_offset = {0 <repeats 16 times>}
}, {
type = RTE_ETH_PAYLOAD_UNKNOWN,
src_offset = {0 <repeats 16 times>}
}, {
type = RTE_ETH_PAYLOAD_UNKNOWN,
src_offset = {0 <repeats 16 times>}
}, {
type = RTE_ETH_PAYLOAD_UNKNOWN,
src_offset = {0 <repeats 16 times>}
}, {
type = RTE_ETH_PAYLOAD_UNKNOWN,
src_offset = {0 <repeats 16 times>}
}, {
type = RTE_ETH_PAYLOAD_UNKNOWN,
src_offset = {0 <repeats 16 times>}
}, {
type = RTE_ETH_PAYLOAD_UNKNOWN,
src_offset = {0 <repeats 16 times>}
}},
flex_mask = {{
flow_type = 0,
mask = '\000' <repeats 15 times>
} <repeats 23 times>}
}
},
intr_conf = {
lsc = 0,
rxq = 0,
rmv = 0
}
},
mtu = 1500,
min_rx_buf_size = 2176,
rx_mbuf_alloc_failed = 0,
mac_addrs = 0x17ffa9a80,
mac_pool_sel = {0 <repeats 128 times>},
hash_mac_addrs = 0x17ffa3a40,
port_id = 0,
promiscuous = 0 '\000',
scattered_rx = 0 '\000',
all_multicast = 0 '\000',
dev_started = 1 '\001',
lro = 0 '\000',
rx_queue_state = '\001' <repeats 16 times>, '\000' <repeats 1007 times>,
tx_queue_state = '\001' <repeats 17 times>, '\000' <repeats 1006 times>,
dev_flags = 2,
kdrv = RTE_KDRV_IGB_UIO,
numa_node = 0,
vlan_filter_conf = {
ids = {0 <repeats 64 times>}
},
owner = {
id = 0,
name = '\000' <repeats 63 times>
},
representor_id = 0
}
port0的17个发包队列地址:
(gdb) x /17xg 0x140002040
0x140002040: 0x0000000480000080 0x0000000280001600
0x140002050: 0x0000000280001500 0x0000000280001400
0x140002060: 0x0000000280001300 0x0000000280001200
0x140002070: 0x0000000280001100 0x0000000280001000
0x140002080: 0x0000000280000f00 0x0000000280000e00
0x140002090: 0x0000000280000d00 0x0000000280000c00
0x1400020a0: 0x0000000280000b00 0x0000000280000a00
0x1400020b0: 0x0000000280000900 0x0000000280000800
0x1400020c0: 0x0000000280000700
0号发送队列:
(gdb) x /16xg 0x0000000480000080
0x480000080: 0x00000004c1f67880 0x0000000481f67880
0x480000090: 0x00000004c1f63840 0x0000004300006018
0x4800000a0: 0x0020002003df0400 0x001f000003df001f
0x4800000b0: 0x000000000000001f 0x0000000000000020
0x4800000c0: 0x000000000000000e 0x0000000000000001
0x4800000d0: 0x00f0000000000000 0x0000000000000a0e
0x4800000e0: 0x0000000000000000 0x000000000000ffff
0x4800000f0: 0x0000000000000000 0x00d0000000000000
(gdb) x /64uh 0x0000000480000080
0x480000080: 30848 49654 4 0 30848 33270 4 0
0x480000090: 14400 49654 4 0 24600 0 67 0
0x4800000a0: 1024 991 32 32 31 991 0 31
0x4800000b0: 31 0 0 0 32 0 0 0
0x4800000c0: 14 0 0 0 1 0 0 0
0x4800000d0: 0 0 0 240 2574 0 0 0
0x4800000e0: 0 0 0 0 65535 0 0 0
0x4800000f0: 0 0 0 0 0 0 0 208
所以:
tx_ring = 0x00000004c1f67880
sw_ring = 0x00000004c1f63840
nb_tx_desc = 1024
tx_tail = 991
tx_free_thresh = 32
tx_rs_thresh = 32
nb_tx_used = 31
last_desc_cleaned = 991
nb_tx_free = 0
tx_next_dd = 31
tx_next_rs = 31 (由上面 x /64uh 输出中紧随 tx_next_dd 之后的半字得到)
queue_id = 0
reg_idx = 0
port_id = 0
nb_tx_free = 0 进入到 ixgbe_xmit_cleanup
desc_to_clean_to = last_desc_cleaned + tx_rs_thresh = 991 + 32 = 1023
sw_ring 是 struct ixgbe_tx_entry * 类型,
sizeof(struct ixgbe_tx_entry) = 16
&sw_ring[desc_to_clean_to] = 1023 * 16 + 0x00000004c1f63840 = 20434024496
(gdb) x /8uh 20434024496
0x4c1f67830: 64000 11522 2 0 0 1023 0 0
desc_to_clean_to = last_id = 1023
tx_ring 是 volatile union ixgbe_adv_tx_desc * 类型,
sizeof(volatile union ixgbe_adv_tx_desc)= 16
&txr[desc_to_clean_to] = 0x00000004c1f67880 + 1023 * 16 = 20434040944
(gdb) x /4uw 20434040944
0x4c1f6b870: 3976395528 7 724566082 1082128
wb.status = 1082128,
为偶数,不含有IXGBE_TXD_STAT_DD(0x01) 标记;
&sw_ring[desc_to_clean_to] = 1023 * 16 + 0x00000004c1f63840 = 20434024496
(gdb) x /2xg 20434024496
0x4c1f67830: 0x000000022d02fa00 0x0000000003ff0000
mbuf地址 0x000000022d02fa00
(gdb) x /8uh 20434024496
0x4c1f67830: 64000 11522 2 0 0 1023 0 0
(gdb) p *(struct rte_mbuf *) 0x000000022d02fa00
$1 = {
cacheline0 = 0x22d02fa00,
buf_addr = 0x22d02fa88,
{
buf_iova = 34041166472,
buf_physaddr = 34041166472
},
rearm_data = 0x22d02fa10,
data_off = 128,
{
refcnt_atomic = {
cnt = 1
},
refcnt = 1
},
nb_segs = 1,
port = 0,
ol_flags = 58546795155816834,
(bit1「PKT_RX_RSS_HASH」,7「PKT_RX_IP_CKSUM_GOOD」,8「PKT_RX_L4_CKSUM_GOOD」,
52「PKT_TX_TCP_CKSUM」,54「PKT_TX_IP_CKSUM」,55「PKT_TX_IPV4」),
rx_descriptor_fields1 = 0x22d02fa20,
{
packet_type = 2048,
{
l2_type = 0,
l3_type = 0,
l4_type = 8,
tun_type = 0,
{
inner_esp_next_proto = 0 '\000',
{
inner_l2_type = 0 '\000',
inner_l3_type = 0 '\000'
}
},
inner_l4_type = 0
}
},
pkt_len = 66,
data_len = 66,
vlan_tci = 0,
{
hash = {
rss = 3954987456,
fdir = {
{
{
hash = 20928,
id = 60348
},
lo = 3954987456
},
hi = 0
},
sched = {
lo = 3954987456,
hi = 0
},
usr = 3954987456
},
{
tx_metadata = 3954987456,
reserved = 0
}
},
vlan_tci_outer = 0,
buf_len = 2176,
timestamp = 0,
cacheline1 = 0x22d02fa40,
{
userdata = 0x0,
udata64 = 0
},
pool = 0x1764ec900,
next = 0x0,
{
tx_offload = 2099726,
{
l2_len = 14,
l3_len = 20,
l4_len = 32 ,
tso_segsz = 0,
outer_l3_len = 0,
outer_l2_len = 0
}
},
priv_size = 8,
timesync = 0,
seqn = 0,
shinfo = 0x0
}
buf_addr = 0x22d02fa88
data_off = 128
pkt_len = 66,
data_len = 66,
buf_len = 2176,
l2_len = 14,
l3_len = 20,
l4_len = 32 (tcp option??),
ol_flags = 58546795155816834,
(bit1「PKT_RX_RSS_HASH」,7「PKT_RX_IP_CKSUM_GOOD」,8「PKT_RX_L4_CKSUM_GOOD」,
52「PKT_TX_TCP_CKSUM」,54「PKT_TX_IP_CKSUM」,55「PKT_TX_IPV4」),
0x22d02fa88 + 128 = 9345104648
(gdb) p *(struct ether_hdr *) 9345104648
$2 = {
d_addr = {
addr_bytes = "\200\344U@\353U"
},
s_addr = {
addr_bytes = "$\245,\006\245\267"
},
ether_type = 8
}
(gdb) x /16xb 9345104648
0x22d02fb08: 0x80 0xe4 0x55 0x40 0xeb 0x55 0x24 0xa5
0x22d02fb10: 0x2c 0x06 0xa5 0xb7 0x08 0x00 0x45 0x04
0x22d02fa88 + 128 + 14 = 9345104662
(gdb) p *(struct ipv4_hdr *) 9345104662
$3 = {
version_ihl = 69 'E',
type_of_service = 4 '\004',
total_length = 13312(主机序:52),
packet_id = 12910,
fragment_offset = 64,
time_to_live = 51 '3',
next_proto_id = 6 '\006'(TCP),
hdr_checksum = 0,
src_addr = 1292572426(主机序:0A170B4D: 10.23.11.77),
dst_addr = 658181130(主机序:0A0C3B27: 10.12.59.39)
}
(gdb) x /16xb 9345104662
0x22d02fb16: 0x45 0x04 0x00 0x34 0x6e 0x32 0x40 0x00
0x22d02fb1e: 0x33 0x06 0x00 0x00 0x0a 0x17 0x0b 0x4d
0x22d02fa88 + 128 + 14 + 20 = 9345104682
(gdb) x /32xb 9345104682
0x22d02fb2a: 0xd3 0x50 0x01 0xbb 0xb7 0x1f 0xcb 0xb7
0x22d02fb32: 0xdc 0x02 0xf8 0xda 0x80 0x10 0xff 0xff
0x22d02fb3a: 0x5a 0xbd 0x00 0x00 0x01 0x01 0x05 0x0a
0x22d02fb42: 0xdc 0x02 0xf7 0xb8 0xdc 0x02 0xf8 0xda
option部分:0x01 0x01 0x05 0x0a 0xdc 0x02 0xf7 0xb8 0xdc 0x02 0xf8 0xda
0x01 是 NOP
0x05 是 sack,size=0x0a=10,
left_edge=0xdc02f7b8(网络序)=(主机:b8f702dc, 3103195868)
right_edge=0xdc02f8da(网络序)=(主机:daf802dc: 3673686748)
(gdb) p *(struct tcp_hdr *) 9345104682
$4 = {
src_port = 20691,
dst_port = 47873(主机序:443),
sent_seq = 3083542455(主机序:B71FCBB7, 3072314295),
recv_ack = 3673686748(主机序:DC02F8DA, 3691182298),
data_off = 128 '\200'(高四位1000,即8,为header_len=8, 8*4=32),
tcp_flags = 16 '\020'(ack标记),
rx_win = 65535,
cksum = 48474,
tcp_urp = 0
}
注:上述包看着并没有异常,只是给了一种查看包的方法,以及查看哪个地方的包。一般是查看 desc_to_clean_to 位置、tx_tail 位置前后的包。
但是通过上述方法,得到的mbuf,看着都没有问题。
查看mempool中元素
那么笨的方法是,把mbuf所在mempool中的mbuf都通过gdb给打印出来(或者添加过滤条件进行打印),看看是否存在问题。
1》 mbuf mempool 的信息:
(gdb) p *(struct rte_mempool *) 0x1764ec900
$2 = {
name = "mbuf_pool_0", '\000' <repeats 20 times>,
{
pool_data = 0x175cec740,
pool_id = 6271452992
},
pool_config = 0x0,
mz = 0x100000318,
flags = 16,
socket_id = 0,
size = 1048575,
cache_size = 256,
elt_size = 2312,
header_size = 64,
trailer_size = 120,
private_data_size = 64,
ops_index = 0,
local_cache = 0x1764ec9c0,
populated_size = 1048575,
elt_list = {
stqh_first = 0x1c0000028,
stqh_last = 0x25bffed28
},
nb_mem_chunks = 2,
mem_list = {
stqh_first = 0x17f986980,
stqh_last = 0x17f986900
}
}
2》打印第一个mbuf
(gdb) p *(struct rte_mempool_objhdr *) 0x1c0000028
$18 = {
next = {
stqe_next = 0x1c00009e8
},
mp = 0x1764ec900,
{
iova = 32212254784,
physaddr = 32212254784
}
}
第一个mbuf的信息:
(gdb) p * (struct rte_mbuf *) (0x1c0000028 + sizeof(struct rte_mempool_objhdr))
3》打印所有的mbuf 信息到文件中。
(gdb) set logging file /home/dpvs_gdb_mbuf_info.out
(gdb) set logging on
(gdb) info functions # 比如要用info functions输出所有函数,结果往往有一大坨,所以可以将之输出到文件。
(gdb) set logging off
set $mempool_objhdr_aaa = (struct rte_mempool_objhdr *) 0x1c0000028
while $mempool_objhdr_aaa
p *(struct rte_mbuf *) (((char *)$mempool_objhdr_aaa) + sizeof(struct rte_mempool_objhdr))
set $mempool_objhdr_aaa = (struct rte_mempool_objhdr *) (((struct rte_mempool_objhdr *)$mempool_objhdr_aaa)->next.stqe_next)
end
(gdb) set logging off
Done logging to /home/dpvs_gdb_mbuf_info.out.
(gdb) set logging file /home/dpvs_gdb_port0_mbuf_info
(gdb) set logging on
Copying output to /home/dpvs_gdb_port0_mbuf_info.
Copying debug output to /home/dpvs_gdb_port0_mbuf_info.
(gdb) set $mempool_objhdr_aaa = (struct rte_mempool_objhdr *) 0x1c0000028
(gdb) while $mempool_objhdr_aaa
>set $mbuf_aaa = (struct rte_mbuf *) (((char *)$mempool_objhdr_aaa) + sizeof(struct rte_mempool_objhdr))
>if (((struct rte_mbuf *)$mbuf_aaa)->nb_segs != 0 && ((struct rte_mbuf *)$mbuf_aaa)->port == 0)
>p *(struct rte_mbuf *) $mbuf_aaa
>end
>set $mempool_objhdr_aaa = (struct rte_mempool_objhdr *) (((struct rte_mempool_objhdr *)$mempool_objhdr_aaa)->next.stqe_next)
>end
注:mempool中元素的遍历参见:test/test/test_mempool.c 中rte_mempool_obj_iter的调用。
得到 /home/dpvs_gdb_mbuf_info.out 文件后,再基于mbuf中的 nb_segs,ol_flags,pkt_len,data_len 等来查询异常。
gdb 中循环 + 条件的一些语法:
set $i=32707
set $j=0
while ($i)
if (fcluster->hash_table[$i].addr == 0x380aa8c0)
set $j++
end
set $i--
end
p $j
注:这样查看mbuf 的 mempool 中所有元素,依然会不好查找。一个是数量太多。第二个是无法分清某个元素是否在 ring buffer 中。
查看每个tx ring buffer 中元素对应的 mbuf
# cat /tmp/print_ring_buffer_mbuf.gdb
# Walk every TX queue of port 0 and print the rte_mbuf attached to each
# sw_ring slot, so bad mbufs stuck in the ring can be spotted.
# NOTE(review): 0x140002040 (the tx_queues array) and the queue count 17
# are taken from `p *rte_eth_devices[0].data` for this particular run --
# adjust both for any other deployment.
define p_ring_buffer_mbufs
set $tx_queues_array_aaa = (void **)0x140002040
set $tx_queue_index_aaa = 0
while ($tx_queue_index_aaa < 17)
set $tx_queue_ptr_aaa = (void*)($tx_queues_array_aaa[$tx_queue_index_aaa])
p "index of tx queue"
p $tx_queue_index_aaa
p $tx_queue_ptr_aaa
# raw dumps of the queue struct, for manual cross-checking of the offsets
x /64uh $tx_queue_ptr_aaa
x /16xg $tx_queue_ptr_aaa
# struct ixgbe_tx_queue layout (as 64-bit words): [0] = tx_ring, [2] = sw_ring
set $tx_ring_ptr_aaaa = (void*)(((void**)$tx_queue_ptr_aaa)[0])
set $sw_ring_ptr_aaaa = (void*)(((void**)$tx_queue_ptr_aaa)[2])
p $tx_ring_ptr_aaaa
p $sw_ring_ptr_aaaa
# nb_tx_desc lives at byte offset 32 of struct ixgbe_tx_queue
# (gdb void* arithmetic is byte-wise)
set $nb_tx_desc_aaaa = *(uint16_t*)($tx_queue_ptr_aaa + 32)
set $tx_ring_buffer_index_aaaa = 0
while ($tx_ring_buffer_index_aaaa < $nb_tx_desc_aaaa)
# sizeof(struct ixgbe_tx_entry) == 16; the mbuf pointer is its first field
set $sw_ring_ptr_for_obj_index_aaa = $sw_ring_ptr_aaaa + ($tx_ring_buffer_index_aaaa * 16)
set $mbuf_ptr_aaaa = (void*)(((void**)$sw_ring_ptr_for_obj_index_aaa)[0])
p "addrress of mbuf"
p $mbuf_ptr_aaaa
p "index of tx_ring_buffer"
p $tx_ring_buffer_index_aaaa
# slots whose mbuf has already been freed hold NULL -- skip those
if ($mbuf_ptr_aaaa)
p *(struct rte_mbuf *)($mbuf_ptr_aaaa)
end
set $tx_ring_buffer_index_aaaa = $tx_ring_buffer_index_aaaa + 1
end
set $tx_queue_index_aaa = $tx_queue_index_aaa + 1
end
end
document p_ring_buffer_mbufs
to list the ring buffer by travel
end
注:
0x140002040 为 *rte_eth_devices[0].data 中的 tx_queues,即某个接口(port_id=0)的多个发送队列的数组的地址。
gdb 中执行如下:
(gdb) set logging file /home/dpvs_gdb_port_tx_queues_mbuf_info
(gdb) set logging on
(gdb) set print pretty on
(gdb) source /tmp/print_ring_buffer_mbuf.gdb
(gdb) p_ring_buffer_mbufs
(gdb) set logging off
原因总结
最可能的原因是 发送包时候的mbuf 的 某些成员设置的不对。
比如: ol_flags(offload flags) 的标记位和 l2_len/l3_len不匹配。
1> 对于ipv4包而言,如果通过网卡进行计算ip头的checksum,此时则会有 PKT_TX_IP_CKSUM 标记位,那么此时的 l2_len 需要是 mac头长度或者mac+vlan头长度。如果 l2_len设置不正确,那么通过l2_len 偏移得到 ip头的位置,然后再计算 ip checksum可能是错的。
2>对于ipv6而言,ipv6头不存在 checksum,对于ipv6包而言,应该没有PKT_TX_IP_CKSUM标记。但是计算ipv6 tcp/udp的checksum时,需要 l2_len,l3_len的正确。比如,一种错误的场景,ipv6包的 l2_len 设置为0,但是存在 PKT_TX_TCP_CKSUM/ PKT_TX_UDP_CKSUM 标记。
注:对于DPDK程序(比如DPDK负载均衡DPVS)而言,如果是程序自身构造产生的包,而不是转发的包,则需要尤其注意 mbuf中的相关字段(比如:ol_flags, l2_len, l3_len, pkt_len, data_len 等)的正确性。
DPVS中可能程序自己产生的包的场景有:
- TCP VS 开启了 synproxy, dpvs构造syn给后端rs发送包;
- TCP session 超时时,构造rst发送给client和rs;
- 在DPVS上 ttl 为0时,产生的 icmp ttl差错报;
- 报文大小大于mtu的分片报文
- 其他自己从mbufpool中取包构造包发出去的场景。
另外,对于Fnat46, Fnat64这种ipv6和ipv4之间的转换,由于ipv6和ipv4头的差异,mbuf 的 ol_flags 设置也会不同,这一点同样需要注意。
解决
在调用rte_eth_tx*()方法之前,调用ixgbe或i40e pmd 驱动提供的[ixgbe | i40e]_prep_pkts()方法,可排除大部分导致该问题的报文。
注:
同时不必过度担心此类检查会影响性能,经过实际测试影响很小。
根源上,还是需要业务层面找到产生“异常”报文的代码。
查看 dpdk 18.11 中的 ixgbe_prep_pkts 感觉实际上检查的并不多,
dpdk 20.11 中的 ixgbe_prep_pkts 检查相对多一些,会增加报文大小的检查。
建议,搞明白 不同pmd的 发包队列中的 DD状态位的设置,哪些异常情况导致DD位不能够被正确设置。
则在发包前自己实现一套检查的机制。
/* ol_flags bits that the ixgbe TX path can offload in hardware
 * (quoted from the DPDK ixgbe PMD source). */
#define IXGBE_TX_OFFLOAD_MASK ( \
PKT_TX_OUTER_IPV6 | \
PKT_TX_OUTER_IPV4 | \
PKT_TX_IPV6 | \
PKT_TX_IPV4 | \
PKT_TX_VLAN_PKT | \
PKT_TX_IP_CKSUM | \
PKT_TX_L4_MASK | \
PKT_TX_TCP_SEG | \
PKT_TX_MACSEC | \
PKT_TX_OUTER_IP_CKSUM | \
PKT_TX_SEC_OFFLOAD | \
IXGBE_TX_IEEE1588_TMST)
/* Complement within the full TX-offload flag set: any packet carrying one
 * of these bits is rejected by ixgbe_prep_pkts() with ENOTSUP. */
#define IXGBE_TX_OFFLOAD_NOTSUP_MASK \
(PKT_TX_OFFLOAD_MASK ^ IXGBE_TX_OFFLOAD_MASK)
/*
 * ixgbe_prep_pkts - the ixgbe PMD's tx_pkt_prepare callback (quoted from
 * DPDK source; it appears as tx_pkt_prepare = ixgbe_prep_pkts in the
 * rte_eth_devices[0] dump above and is invoked via rte_eth_tx_prepare()
 * before rte_eth_tx_burst()).
 *
 * Validates each mbuf in tx_pkts[0..nb_pkts) against hardware limits and
 * prepares pseudo-header checksums for offloaded L4 checksums.
 *
 * Returns nb_pkts if all packets pass; on the first bad packet it sets
 * rte_errno and returns that packet's index i (packets before i remain
 * valid to send).
 */
uint16_t
ixgbe_prep_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
{
int i, ret;
uint64_t ol_flags;
struct rte_mbuf *m;
struct ixgbe_tx_queue *txq = (struct ixgbe_tx_queue *)tx_queue;
for (i = 0; i < nb_pkts; i++) {
m = tx_pkts[i];
ol_flags = m->ol_flags;
/**
* Check if packet meets requirements for number of segments
*
* NOTE: for ixgbe it's always (40 - WTHRESH) for both TSO and
* non-TSO
*/
if (m->nb_segs > IXGBE_TX_MAX_SEG - txq->wthresh) {
rte_errno = EINVAL;
return i;
}
/* reject any offload flag the ixgbe hardware cannot handle */
if (ol_flags & IXGBE_TX_OFFLOAD_NOTSUP_MASK) {
rte_errno = ENOTSUP;
return i;
}
/* check the size of packet */
if (m->pkt_len < IXGBE_TX_MIN_PKT_LEN) {
rte_errno = EINVAL;
return i;
}
/* extra ol_flags/tx_offload consistency checks, debug builds only */
#ifdef RTE_LIBRTE_ETHDEV_DEBUG
ret = rte_validate_tx_offload(m);
if (ret != 0) {
rte_errno = -ret;
return i;
}
#endif
/* fill pseudo-header checksums so the NIC can finish L4 checksums */
ret = rte_net_intel_cksum_prepare(m);
if (ret != 0) {
rte_errno = -ret;
return i;
}
}
return i;
}
/*
 * Convenience wrapper (quoted from DPDK): prepare pseudo-header checksums
 * for an Intel NIC using the offload flags already stored in the mbuf.
 * Returns 0 on success or a negative errno from the flags variant.
 */
static inline int
rte_net_intel_cksum_prepare(struct rte_mbuf *m)
{
return rte_net_intel_cksum_flags_prepare(m, m->ol_flags);
}
/*
 * rte_net_intel_cksum_flags_prepare (quoted from DPDK): for packets using
 * checksum offload, zero the IPv4 header checksum (when PKT_TX_IP_CKSUM is
 * set) and seed the L4 checksum field with the pseudo-header checksum so
 * the Intel NIC can complete it in hardware.
 *
 * Returns 0 on success, or -ENOTSUP when the declared L2+L3+L4 headers do
 * not fit inside the first segment -- i.e. exactly the inconsistent
 * l2_len/l3_len/l4_len mbufs this document is hunting for.
 */
static inline int
rte_net_intel_cksum_flags_prepare(struct rte_mbuf *m, uint64_t ol_flags)
{
/* Initialise ipv4_hdr to avoid false positive compiler warnings. */
struct rte_ipv4_hdr *ipv4_hdr = NULL;
struct rte_ipv6_hdr *ipv6_hdr;
struct rte_tcp_hdr *tcp_hdr;
struct rte_udp_hdr *udp_hdr;
uint64_t inner_l3_offset = m->l2_len;
/*
 * Does packet set any of available offloads?
 * Mainly it is required to avoid fragmented headers check if
 * no offloads are requested.
 */
if (!(ol_flags & (PKT_TX_IP_CKSUM | PKT_TX_L4_MASK | PKT_TX_TCP_SEG)))
return 0;
/* for tunnel offloads the inner L3 header sits after the outer headers */
if (ol_flags & (PKT_TX_OUTER_IPV4 | PKT_TX_OUTER_IPV6))
inner_l3_offset += m->outer_l2_len + m->outer_l3_len;
/*
 * Check if headers are fragmented.
 * The check could be less strict depending on which offloads are
 * requested and headers to be used, but let's keep it simple.
 */
if (unlikely(rte_pktmbuf_data_len(m) <
inner_l3_offset + m->l3_len + m->l4_len))
return -ENOTSUP;
if (ol_flags & PKT_TX_IPV4) {
ipv4_hdr = rte_pktmbuf_mtod_offset(m, struct rte_ipv4_hdr *,
inner_l3_offset);
/* hardware recomputes it; field must be zeroed first */
if (ol_flags & PKT_TX_IP_CKSUM)
ipv4_hdr->hdr_checksum = 0;
}
if ((ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM) {
if (ol_flags & PKT_TX_IPV4) {
udp_hdr = (struct rte_udp_hdr *)((char *)ipv4_hdr +
m->l3_len);
udp_hdr->dgram_cksum = rte_ipv4_phdr_cksum(ipv4_hdr,
ol_flags);
} else {
ipv6_hdr = rte_pktmbuf_mtod_offset(m,
struct rte_ipv6_hdr *, inner_l3_offset);
/* non-TSO udp */
udp_hdr = rte_pktmbuf_mtod_offset(m,
struct rte_udp_hdr *,
inner_l3_offset + m->l3_len);
udp_hdr->dgram_cksum = rte_ipv6_phdr_cksum(ipv6_hdr,
ol_flags);
}
} else if ((ol_flags & PKT_TX_L4_MASK) == PKT_TX_TCP_CKSUM ||
(ol_flags & PKT_TX_TCP_SEG)) {
if (ol_flags & PKT_TX_IPV4) {
/* non-TSO tcp or TSO */
tcp_hdr = (struct rte_tcp_hdr *)((char *)ipv4_hdr +
m->l3_len);
tcp_hdr->cksum = rte_ipv4_phdr_cksum(ipv4_hdr,
ol_flags);
} else {
ipv6_hdr = rte_pktmbuf_mtod_offset(m,
struct rte_ipv6_hdr *, inner_l3_offset);
/* non-TSO tcp or TSO */
tcp_hdr = rte_pktmbuf_mtod_offset(m,
struct rte_tcp_hdr *,
inner_l3_offset + m->l3_len);
tcp_hdr->cksum = rte_ipv6_phdr_cksum(ipv6_hdr,
ol_flags);
}
}
return 0;
}
分析:
感觉 ixgbe的 ixgbe_prep_pkt 写的并不是很完善。比如,对于ipv6 报文的一些信息没有太多的检查。
参考
https://decodezp.github.io/2019/10/17/quickwords34-dpdk-tx-hang/