DPDK疑难杂症之发包Tx失败

现象

DPDK场景下网卡批量发包失败(计划发送N个,实际只能发送M个)。
用GDB跟进去的话会发现是网卡发送队列的DD标志位未置位。

原因

DPDK 程序通过 ixgbe或者i40e pmd 驱动可以直接操作网卡的寄存器和 ring Buffer,当上层业务存在问题,发送一些“不合规”的数据包时,这些有问题的数据包会直接进入网卡,引发硬件问题。

在内核态时,这些有问题的数据包会被内核驱动检查过滤掉。
但是DPDK程序的检查是自己控制的,无法做到精准检查。

具体原因

引发发送丢包/失败的数据包包括以下几类:

包长小于14Byte
包长大于9674Byte
TCP报文分段数目大于8
TSO MSS小于256Byte
TSO MSS大于9674Byte
报文的ol_flags (offload flags)不正确
Mbuf链里mbuf个数和第一个mbuf的 nb_segs值不一样
Mbuf链里任意一个mbuf的data_len=0
Mbuf链长度大于8
其他目前还未知的原因

上面的几个原因在 i40e_prep_pkts 基本可以进行检查。

发包hang住时查看原因

以82599 ixgbe网卡为例,当发现无法发送包时,则此时通过gdb查看原因。
1>查看是不是DD位不对。
2>查看队列中是否存在异常包。

以dpvs为例:

基本信息:
lan: 
  ip: 10.23.11.66/27 ; mac: 24:A5:2C:06:A5:B7
  gw: 10.23.11.65; mac: 80:e4:55:40:eb:55
  local-ip:  10.23.11.67-85
  
wan:
	ip: 198.19.193.76/31; mac:24:A5:2C:06:A5:B8
    gw: 198.19.193.77;  mac: 80:e4:55:41:a5:a9;

初步分析:
可以收包,但是无法发包。
在这里插入图片描述
gdb 查看,如下所示:

简单查看单个mbuf信息

(gdb)  p rte_eth_devices[0]
$2 = {
  rx_pkt_burst = 0x49a090 <ixgbe_recv_pkts_bulk_alloc>,
  tx_pkt_burst = 0x49b6e0 <ixgbe_xmit_pkts>,
  tx_pkt_prepare = 0x498dc0 <ixgbe_prep_pkts>,
  data = 0x17ffb1580,
  process_private = 0x0,
  dev_ops = 0xe63f60 <ixgbe_eth_dev_ops>,
  device = 0xaa549c0,
  intr_handle = 0xaa54aa0,
  link_intr_cbs = {
    tqh_first = 0x0,
    tqh_last = 0x94955c0 <rte_eth_devices+64>
  },
  post_rx_burst_cbs = {0x0 <repeats 1024 times>},
  pre_tx_burst_cbs = {0x0 <repeats 1024 times>},
  state = RTE_ETH_DEV_ATTACHED,
  security_ctx = 0x0
}

(gdb) p *rte_eth_devices[0].data
$8 = {
  name = "0000:3b:00.0", '\000' <repeats 51 times>,
  rx_queues = 0x140002140,
  tx_queues = 0x140002040,
  nb_rx_queues = 16,
  nb_tx_queues = 17,
  sriov = {
    active = 0 '\000',
    nb_q_per_pool = 0 '\000',
    def_vmdq_idx = 0,
    def_pool_q_idx = 0
  },
  dev_private = 0x17ffa9dc0,
  dev_link = {
    link_speed = 10000,
    link_duplex = 1,
    link_autoneg = 1,
    link_status = 1
  },
  dev_conf = {
    link_speeds = 0,
    rxmode = {
      mq_mode = ETH_MQ_RX_RSS,
      max_rx_pkt_len = 1518,
      split_hdr_size = 0,
      offloads = 3
    },
    txmode = {
      mq_mode = ETH_MQ_TX_NONE,
      offloads = 14,
      pvid = 0,
      hw_vlan_reject_tagged = 0 '\000',
      hw_vlan_reject_untagged = 0 '\000',
      hw_vlan_insert_pvid = 0 '\000'
    },
    lpbk_mode = 0,
    rx_adv_conf = {
      rss_conf = {
        rss_key = 0x0,
        rss_key_len = 0 '\000',
        rss_hf = 232756
      },
      vmdq_dcb_conf = {
        nb_queue_pools = 0,
        enable_default_pool = 0 '\000',
        default_pool = 0 '\000',
        nb_pool_maps = 0 '\000',
        pool_map = {{
            vlan_id = 0,
            pools = 0
          } <repeats 64 times>},
        dcb_tc = "\000\000\000\000\000\000\000"
      },
      dcb_rx_conf = {
        nb_tcs = 0,
        dcb_tc = "\000\000\000\000\000\000\000"
      },
      vmdq_rx_conf = {
        nb_queue_pools = 0,
        enable_default_pool = 0 '\000',
        default_pool = 0 '\000',
        enable_loop_back = 0 '\000',
        nb_pool_maps = 0 '\000',
        rx_mode = 0,
        pool_map = {{
            vlan_id = 0,
            pools = 0
          } <repeats 64 times>}
      }
    },
    tx_adv_conf = {
      vmdq_dcb_tx_conf = {
        nb_queue_pools = 0,
        dcb_tc = "\000\000\000\000\000\000\000"
      },
      dcb_tx_conf = {
        nb_tcs = 0,
        dcb_tc = "\000\000\000\000\000\000\000"
      },
      vmdq_tx_conf = {
        nb_queue_pools = 0
      }
    },
    dcb_capability_en = 0,
    fdir_conf = {
      mode = RTE_FDIR_MODE_PERFECT,
      pballoc = RTE_FDIR_PBALLOC_64K,
      status = RTE_FDIR_REPORT_STATUS,
      drop_queue = 127 '\177',
      mask = {
        vlan_tci_mask = 0,
        ipv4_mask = {
          src_ip = 0,
          dst_ip = 4294967295,
          tos = 0 '\000',
          ttl = 0 '\000',
          proto = 0 '\000'
        },
        ipv6_mask = {
          src_ip = {0, 0, 0, 0},
          dst_ip = {4294967295, 4294967295, 4294967295, 4294967295},
          tc = 0 '\000',
          proto = 0 '\000',
          hop_limits = 0 '\000'
        },
        src_port_mask = 0,
        dst_port_mask = 3840,
        mac_addr_byte_mask = 0 '\000',
        tunnel_id_mask = 0,
        tunnel_type_mask = 0 '\000'
      },
      flex_conf = {
        nb_payloads = 0,
        nb_flexmasks = 0,
        flex_set = {{
            type = RTE_ETH_PAYLOAD_UNKNOWN,
            src_offset = {0 <repeats 16 times>}
          }, {
            type = RTE_ETH_PAYLOAD_UNKNOWN,
            src_offset = {0 <repeats 16 times>}
          }, {
            type = RTE_ETH_PAYLOAD_UNKNOWN,
            src_offset = {0 <repeats 16 times>}
          }, {
            type = RTE_ETH_PAYLOAD_UNKNOWN,
            src_offset = {0 <repeats 16 times>}
          }, {
            type = RTE_ETH_PAYLOAD_UNKNOWN,
            src_offset = {0 <repeats 16 times>}
          }, {
            type = RTE_ETH_PAYLOAD_UNKNOWN,
            src_offset = {0 <repeats 16 times>}
          }, {
            type = RTE_ETH_PAYLOAD_UNKNOWN,
            src_offset = {0 <repeats 16 times>}
          }, {
            type = RTE_ETH_PAYLOAD_UNKNOWN,
            src_offset = {0 <repeats 16 times>}
          }},
        flex_mask = {{
            flow_type = 0,
            mask = '\000' <repeats 15 times>
          } <repeats 23 times>}
      }
    },
    intr_conf = {
      lsc = 0,
      rxq = 0,
      rmv = 0
    }
  },
  mtu = 1500,
  min_rx_buf_size = 2176,
  rx_mbuf_alloc_failed = 0,
  mac_addrs = 0x17ffa9a80,
  mac_pool_sel = {0 <repeats 128 times>},
  hash_mac_addrs = 0x17ffa3a40,
  port_id = 0,
  promiscuous = 0 '\000',
  scattered_rx = 0 '\000',
  all_multicast = 0 '\000',
  dev_started = 1 '\001',
  lro = 0 '\000',
  rx_queue_state = '\001' <repeats 16 times>, '\000' <repeats 1007 times>,
  tx_queue_state = '\001' <repeats 17 times>, '\000' <repeats 1006 times>,
  dev_flags = 2,
  kdrv = RTE_KDRV_IGB_UIO,
  numa_node = 0,
  vlan_filter_conf = {
    ids = {0 <repeats 64 times>}
  },
  owner = {
    id = 0,
    name = '\000' <repeats 63 times>
  },
  representor_id = 0
}

port0的17个发包队列地址:
(gdb) x /17xg 0x140002040
0x140002040:	0x0000000480000080	0x0000000280001600
0x140002050:	0x0000000280001500	0x0000000280001400
0x140002060:	0x0000000280001300	0x0000000280001200
0x140002070:	0x0000000280001100	0x0000000280001000
0x140002080:	0x0000000280000f00	0x0000000280000e00
0x140002090:	0x0000000280000d00	0x0000000280000c00
0x1400020a0:	0x0000000280000b00	0x0000000280000a00
0x1400020b0:	0x0000000280000900	0x0000000280000800
0x1400020c0:	0x0000000280000700

0号发送队列:
(gdb) x /16xg 0x0000000480000080
0x480000080:	0x00000004c1f67880	0x0000000481f67880
0x480000090:	0x00000004c1f63840	0x0000004300006018
0x4800000a0:	0x0020002003df0400	0x001f000003df001f
0x4800000b0:	0x000000000000001f	0x0000000000000020
0x4800000c0:	0x000000000000000e	0x0000000000000001
0x4800000d0:	0x00f0000000000000	0x0000000000000a0e
0x4800000e0:	0x0000000000000000	0x000000000000ffff
0x4800000f0:	0x0000000000000000	0x00d0000000000000

(gdb) x /64uh  0x0000000480000080
0x480000080:	30848	49654	4	0	30848	33270	4	0
0x480000090:	14400	49654	4	0	24600	0	67	0
0x4800000a0:	1024	991	32	32	31	991	0	31
0x4800000b0:	31	0	0	0	32	0	0	0
0x4800000c0:	14	0	0	0	1	0	0	0
0x4800000d0:	0	0	0	240	2574	0	0	0
0x4800000e0:	0	0	0	0	65535	0	0	0
0x4800000f0:	0	0	0	0	0	0	0	208

所以:
tx_ring = 0x00000004c1f67880
sw_ring = 0x00000004c1f63840

nb_tx_desc = 1024
tx_tail = 991
tx_free_thresh = 32
tx_rs_thresh = 32
nb_tx_used = 31
last_desc_cleaned = 991
nb_tx_free = 0
tx_next_dd = 31
tx_next_rs = 

queue_id = 0
reg_idx = 0
port_id = 0

nb_tx_free = 0 进入到 ixgbe_xmit_cleanup
desc_to_clean_to = last_desc_cleaned + tx_rs_thresh = 991 + 32 = 1023
sw_ring 是 struct ixgbe_tx_entry * 类型,
sizeof(struct ixgbe_tx_entry) = 16
&sw_ring[desc_to_clean_to] = 1023 * 16 + 0x00000004c1f63840 = 20434024496
(gdb) x /8uh 20434024496
0x4c1f67830:	64000	11522	2	0	0	1023	0	0
desc_to_clean_to = last_id = 1023

tx_ring 是 volatile union ixgbe_adv_tx_desc * 类型,
sizeof(volatile union ixgbe_adv_tx_desc) = 16
&txr[desc_to_clean_to] = 0x00000004c1f67880 + 1023 * 16 = 20434040944
(gdb) x /4uw 20434040944
0x4c1f6b870:	3976395528	7	724566082	1082128
wb.status = 1082128,
为偶数,不含有IXGBE_TXD_STAT_DD(0x01) 标记;

&sw_ring[desc_to_clean_to] = 1023 * 16 + 0x00000004c1f63840 = 20434024496
(gdb)  x /2xg  20434024496
0x4c1f67830:	0x000000022d02fa00	0x0000000003ff0000
mbuf地址 0x000000022d02fa00
(gdb) x /8uh 20434024496
0x4c1f67830:	64000	11522	2	0	0	1023	0	0
(gdb) p *(struct rte_mbuf *) 0x000000022d02fa00
$1 = {
  cacheline0 = 0x22d02fa00,
  buf_addr = 0x22d02fa88,
  {
    buf_iova = 34041166472,
    buf_physaddr = 34041166472
  },
  rearm_data = 0x22d02fa10,
  data_off = 128,
  {
    refcnt_atomic = {
      cnt = 1
    },
    refcnt = 1
  },
  nb_segs = 1,
  port = 0,
  ol_flags = 58546795155816834,
  (bit1「PKT_RX_RSS_HASH」,7「PKT_RX_IP_CKSUM_GOOD」,8「PKT_RX_L4_CKSUM_GOOD」,
  52「PKT_TX_TCP_CKSUM」,54「PKT_TX_IP_CKSUM」,55「PKT_TX_IPV4」),
  
  rx_descriptor_fields1 = 0x22d02fa20,
  {
    packet_type = 2048,
    {
      l2_type = 0,
      l3_type = 0,
      l4_type = 8,
      tun_type = 0,
      {
        inner_esp_next_proto = 0 '\000',
        {
          inner_l2_type = 0 '\000',
          inner_l3_type = 0 '\000'
        }
      },
      inner_l4_type = 0
    }
  },
  pkt_len = 66,
  data_len = 66,
  vlan_tci = 0,
  {
    hash = {
      rss = 3954987456,
      fdir = {
        {
          {
            hash = 20928,
            id = 60348
          },
          lo = 3954987456
        },
        hi = 0
      },
      sched = {
        lo = 3954987456,
        hi = 0
      },
      usr = 3954987456
    },
    {
      tx_metadata = 3954987456,
      reserved = 0
    }
  },
  vlan_tci_outer = 0,
  buf_len = 2176,
  timestamp = 0,
  cacheline1 = 0x22d02fa40,
  {
    userdata = 0x0,
    udata64 = 0
  },
  pool = 0x1764ec900,
  next = 0x0,
  {
    tx_offload = 2099726,
    {
      l2_len = 14,
      l3_len = 20,
      l4_len = 32 ,
      tso_segsz = 0,
      outer_l3_len = 0,
      outer_l2_len = 0
    }
  },
  priv_size = 8,
  timesync = 0,
  seqn = 0,
  shinfo = 0x0
}

buf_addr = 0x22d02fa88
data_off = 128
pkt_len = 66,
data_len = 66,
buf_len = 2176,
l2_len = 14,
l3_len = 20,
l4_len = 32 (tcp option??),
ol_flags = 58546795155816834,
(bit1「PKT_RX_RSS_HASH」,7「PKT_RX_IP_CKSUM_GOOD」,8「PKT_RX_L4_CKSUM_GOOD」,
52「PKT_TX_TCP_CKSUM」,54「PKT_TX_IP_CKSUM」,55「PKT_TX_IPV4」),

0x22d02fa88 + 128 = 9345104648
(gdb) p *(struct ether_hdr *) 9345104648
$2 = {
  d_addr = {
    addr_bytes = "\200\344U@\353U"
  },
  s_addr = {
    addr_bytes = "$\245,\006\245\267"
  },
  ether_type = 8
}

(gdb) x /16xb 9345104648
0x22d02fb08:	0x80	0xe4	0x55	0x40	0xeb	0x55	0x24	0xa5
0x22d02fb10:	0x2c	0x06	0xa5	0xb7	0x08	0x00	0x45	0x04

0x22d02fa88 + 128 + 14 = 9345104662
(gdb) p *(struct ipv4_hdr *) 9345104662
$3 = {
  version_ihl = 69 'E',
  type_of_service = 4 '\004',
  total_length = 13312(主机序:52),
  packet_id = 12910,
  fragment_offset = 64,
  time_to_live = 51 '3',
  next_proto_id = 6 '\006'(TCP),
  hdr_checksum = 0,
  src_addr = 1292572426(主机序:0A170B4D: 10.23.11.77,
  dst_addr = 658181130(主机序:0A0C3B27: 10.12.59.39}
(gdb) x /16xb 9345104662
0x22d02fb16:	0x45	0x04	0x00	0x34	0x6e	0x32	0x40	0x00
0x22d02fb1e:	0x33	0x06	0x00	0x00	0x0a	0x17	0x0b	0x4d

0x22d02fa88 + 128 + 14 + 20 = 9345104682
(gdb) x /32xb 9345104682
0x22d02fb2a:	0xd3	0x50	0x01	0xbb	0xb7	0x1f	0xcb	0xb7
0x22d02fb32:	0xdc	0x02	0xf8	0xda	0x80	0x10	0xff	0xff
0x22d02fb3a:	0x5a	0xbd	0x00	0x00	0x01	0x01	0x05	0x0a
0x22d02fb42:	0xdc	0x02	0xf7	0xb8	0xdc	0x02	0xf8	0xda
option部分:0x01	0x01	0x05	0x0a 0xdc	0x02	0xf7	0xb8	0xdc	0x02	0xf8	0xda
0x01 是 NOP
0x05 是 sack,size=0x0a=10, 
left_edge=0xdc02f7b8(网络序)=(主机:b8f702dc, 3103195868)
right_edge=0xdc02f8da(网络序)=(主机:daf802dc: 3673686748)

(gdb) p *(struct tcp_hdr *) 9345104682
$4 = {
  src_port = 20691,
  dst_port = 47873(主机序:443),
  sent_seq = 3083542455(主机序:B71FCBB7, 3072314295,
  recv_ack = 3673686748(主机序:DC02F8DA, 3691182298,
  data_off = 128 '\200'(高四位1000,8,为header_len=8, 8*4=32),
  tcp_flags = 16 '\020'(ack标记),
  rx_win = 65535,
  cksum = 48474,
  tcp_urp = 0
}

注:上述包看着并没有异常,只是给出了一种查看包的方法,以及查看哪个位置的包。一般是查看 desc_to_clean_to 位置、tx_tail 位置前后的包。
但是通过上述方法得到的mbuf,看着都没有问题。

查看mempool中元素

那么笨的方法是,把mbuf所在mempool中的mbuf都通过gdb给打印出来(或者添加过滤条件进行打印),看看是否存在问题。

1》 mbuf mempool 的信息:
(gdb) p *(struct rte_mempool *) 0x1764ec900
$2 = {
  name = "mbuf_pool_0", '\000' <repeats 20 times>,
  {
    pool_data = 0x175cec740,
    pool_id = 6271452992
  },
  pool_config = 0x0,
  mz = 0x100000318,
  flags = 16,
  socket_id = 0,
  size = 1048575,
  cache_size = 256,
  elt_size = 2312,
  header_size = 64,
  trailer_size = 120,
  private_data_size = 64,
  ops_index = 0,
  local_cache = 0x1764ec9c0,
  populated_size = 1048575,
  elt_list = {
    stqh_first = 0x1c0000028,
    stqh_last = 0x25bffed28
  },
  nb_mem_chunks = 2,
  mem_list = {
    stqh_first = 0x17f986980,
    stqh_last = 0x17f986900
  }
}

2》打印第一个mbuf
(gdb) p *(struct rte_mempool_objhdr *) 0x1c0000028
$18 = {
  next = {
    stqe_next = 0x1c00009e8
  },
  mp = 0x1764ec900,
  {
    iova = 32212254784,
    physaddr = 32212254784
  }
}
第一个mbuf的信息:
(gdb) p * (struct rte_mbuf *) (0x1c0000028 + sizeof(struct rte_mempool_objhdr))

3》打印所有的mbuf 信息到文件中。

(gdb) set logging file /home/dpvs_gdb_mbuf_info.out
(gdb) set logging on
(gdb) info functions # 比如要用info functions输出所有函数,结果往往有一大坨,所以可以将之输出到文件。
(gdb) set logging off

set $mempool_objhdr_aaa = (struct rte_mempool_objhdr *) 0x1c0000028
while $mempool_objhdr_aaa
p *(struct rte_mbuf *) (((char *)$mempool_objhdr_aaa) + sizeof(struct rte_mempool_objhdr))
set $mempool_objhdr_aaa = (struct rte_mempool_objhdr *) (((struct rte_mempool_objhdr *)$mempool_objhdr_aaa)->next.stqe_next)
end

(gdb) set logging off
Done logging to /home/dpvs_gdb.out.
(gdb) set logging file /home/dpvs_gdb_port0_mbuf_info
(gdb) set logging on
Copying output to /home/dpvs_gdb_port0_mbuf_info.
Copying debug output to /home/dpvs_gdb_port0_mbuf_info.
(gdb) set $mempool_objhdr_aaa = (struct rte_mempool_objhdr *) 0x1c0000028
(gdb) while $mempool_objhdr_aaa
 >set $mbuf_aaa = (struct rte_mbuf *) (((char *)$mempool_objhdr_aaa) + sizeof(struct rte_mempool_objhdr))
 >if (((struct rte_mbuf *)$mbuf_aaa)->nb_segs != 0 && ((struct rte_mbuf *)$mbuf_aaa)->port == 0)
  >p *(struct rte_mbuf *) $mbuf_aaa
  >end
 >set $mempool_objhdr_aaa = (struct rte_mempool_objhdr *) (((struct rte_mempool_objhdr *)$mempool_objhdr_aaa)->next.stqe_next)
 >end

注:mempool中元素的遍历参见:test/test/test_mempool.c 中rte_mempool_obj_iter的调用。

得到 /home/dpvs_gdb_mbuf_info.out 文件后,再基于mbuf中的 nb_segs,ol_flags,pkt_len,data_len 等来查询异常。

gdb 中循环 + 条件的一些语法:
set $i=32707
set $j=0
while ($i)
  if (fcluster->hash_table[$i].addr == 0x380aa8c0)
     set $j++
  end
  set $i--
end
p $j

注:这样查看mbuf 的 mempool 中所有元素,依然会不好查找。一个是数量太多。第二个是无法分清某个元素是否在 ring buffer 中。

查看每个tx ring buffer 中元素对应的 mbuf

# cat /tmp/print_ring_buffer_mbuf.gdb
# GDB user-defined command: walk every Tx queue of port 0 and dump the
# rte_mbuf referenced by each sw_ring entry.
# Hard-coded values must match the target process / DPDK build:
#   0x140002040 - rte_eth_devices[0].data->tx_queues (array of queue pointers)
#   17          - nb_tx_queues of this port
#   offsets 0 / 2 / 32 and stride 16 - layout of struct ixgbe_tx_queue and
#   struct ixgbe_tx_entry for this build; verify against your DPDK version.
define p_ring_buffer_mbufs

set $tx_queues_array_aaa = (void **)0x140002040
set $tx_queue_index_aaa = 0
# outer loop: one iteration per Tx queue
while ($tx_queue_index_aaa < 17)
    set $tx_queue_ptr_aaa = (void*)($tx_queues_array_aaa[$tx_queue_index_aaa])
    p "index of tx queue"
    p $tx_queue_index_aaa
    p $tx_queue_ptr_aaa
    # raw dump of the queue struct, useful for manual cross-checking of offsets
    x /64uh $tx_queue_ptr_aaa
    x /16xg $tx_queue_ptr_aaa

    # pointer slot 0 = tx_ring (HW descriptors), slot 2 = sw_ring (mbuf table)
    set $tx_ring_ptr_aaaa = (void*)(((void**)$tx_queue_ptr_aaa)[0])
    set $sw_ring_ptr_aaaa = (void*)(((void**)$tx_queue_ptr_aaa)[2])
    p $tx_ring_ptr_aaaa
    p $sw_ring_ptr_aaaa

    # nb_tx_desc lives at byte offset 32 in struct ixgbe_tx_queue
    set $nb_tx_desc_aaaa = *(uint16_t*)($tx_queue_ptr_aaa + 32)
    set $tx_ring_buffer_index_aaaa = 0
    # inner loop: one iteration per descriptor slot in this queue
    while ($tx_ring_buffer_index_aaaa < $nb_tx_desc_aaaa)
        # sizeof(struct ixgbe_tx_entry) == 16; first member is the mbuf pointer
        set $sw_ring_ptr_for_obj_index_aaa = $sw_ring_ptr_aaaa + ($tx_ring_buffer_index_aaaa * 16)
        set $mbuf_ptr_aaaa = (void*)(((void**)$sw_ring_ptr_for_obj_index_aaa)[0])
        p "addrress of mbuf"
        p $mbuf_ptr_aaaa
        p "index of tx_ring_buffer"
        p $tx_ring_buffer_index_aaaa
        # a NULL pointer means the slot was cleaned or never used
        if ($mbuf_ptr_aaaa)
            p *(struct rte_mbuf *)($mbuf_ptr_aaaa)
        end
        set $tx_ring_buffer_index_aaaa = $tx_ring_buffer_index_aaaa + 1
    end

set $tx_queue_index_aaa = $tx_queue_index_aaa + 1
end

end

document p_ring_buffer_mbufs
  to list the ring buffer by travel
end

注:
0x140002040 是 *rte_eth_devices[0].data 中的 tx_queues,即某个接口(port_id=0)的多个发送队列的数组的地址。
gdb 中执行如下:
(gdb) set logging file /home/dpvs_gdb_port_tx_queues_mbuf_info
(gdb) set logging on
(gdb) set print pretty on
(gdb) source /tmp/print_ring_buffer_mbuf.gdb
(gdb) p_ring_buffer_mbufs
(gdb) set logging off

原因总结

最可能的原因是 发送包时候的mbuf 的 某些成员设置的不对。
比如: ol_flags(offload flags) 的标记位和 l2_len/l3_len不匹配。

1> 对于ipv4包而言,如果通过网卡进行计算ip头的checksum,此时则会有 PKT_TX_IP_CKSUM 标记为,那么此时的 l2_len 需要是 mac头长度或者mac+vlan头长度。如果 l2_len设置不正确,那么通过l2_len 偏移得到 ip头的位置,然后再计算 ip checksum可能是错的。

2>对于ipv6而言,ipv6头不存在 checksum,对于ipv6包而言,应该没有PKT_TX_IP_CKSUM标记。但是计算ipv6 tcp/udp的checksum时,需要 l2_len,l3_len的正确。比如,一种错误的场景,ipv6包的 l2_len 设置为0,但是存在 PKT_TX_TCP_CKSUM/ PKT_TX_UDP_CKSUM 标记。

注:对于DPDK程序(比如DPDK负载均衡DPVS)而言,如果是程序自身产生的包,而不是转发的包,则需要尤其注意 mbuf中的相关字段(比如:ol_flags, l2_len, l3_len, pkt_len, data_len 等)的正确性。

DPVS中可能程序自己产生的包的场景有:

  • TCP VS 开启了 synproxy, dpvs构造syn给后端rs发送包;
  • TCP session 超时时,构造rst发送给client和rs;
  • 在DPVS上 ttl 为0时,产生的 icmp ttl差错报;
  • 报文大小大于mtu的分片报文
  • 其他自己从mbufpool中取包构造包发出去的场景。

另外,对于Fnat46, Fnat64这种ipv6和ipv4的转换,由于ipv6和ipv4头的差异导致是否存在mbuf的ol_flag 的差异也需要注意。

解决

在调用rte_eth_tx*()方法之前,调用ixgbe或i40e pmd 驱动提供的[ixgbe | i40e]_prep_pkts()方法,可排除大部分导致该问题的报文。

注:
同时不必过度担心此类检查会影响性能,经过实际测试影响很小。
根源上,还是需要业务层面找到产生“异常”报文的代码。

查看 dpdk 18.11 中的 ixgbe_prep_pkts 感觉实际上检查的并不多,
dpdk 20.11 中的 ixgbe_prep_pkts 检查相对多一些,会增加报文大小的检查。

建议,搞明白 不同pmd的 发包队列中的 DD状态位的设置,哪些异常情况导致DD位不能够被正确设置。
则在发包前自己实现一套检查的机制。
/*
 * Set of mbuf Tx offload flags that the ixgbe PMD supports (quoted from
 * DPDK).  IXGBE_TX_IEEE1588_TMST is defined elsewhere in the driver and
 * expands to PKT_TX_IEEE1588_TMST or 0 depending on the build config.
 * NOTE(review): comments cannot go inside the macro body — a // comment
 * would swallow the line-continuation backslash.
 */
#define IXGBE_TX_OFFLOAD_MASK (			 \
		PKT_TX_OUTER_IPV6 |		 \
		PKT_TX_OUTER_IPV4 |		 \
		PKT_TX_IPV6 |			 \
		PKT_TX_IPV4 |			 \
		PKT_TX_VLAN_PKT |		 \
		PKT_TX_IP_CKSUM |		 \
		PKT_TX_L4_MASK |		 \
		PKT_TX_TCP_SEG |		 \
		PKT_TX_MACSEC |			 \
		PKT_TX_OUTER_IP_CKSUM |		 \
		PKT_TX_SEC_OFFLOAD |	 \
		IXGBE_TX_IEEE1588_TMST)

/*
 * Offload bits an application may set but ixgbe does NOT support:
 * XOR against the full PKT_TX_OFFLOAD_MASK leaves exactly the unsupported
 * bits.  Any packet carrying one of them is rejected by ixgbe_prep_pkts()
 * with rte_errno = ENOTSUP.
 */
#define IXGBE_TX_OFFLOAD_NOTSUP_MASK \
		(PKT_TX_OFFLOAD_MASK ^ IXGBE_TX_OFFLOAD_MASK)

/**
 * Tx "prepare" callback of the ixgbe PMD (quoted from DPDK).
 *
 * Validates every mbuf of a burst BEFORE it reaches ixgbe_xmit_pkts(), so
 * malformed packets are rejected in software instead of wedging the HW
 * queue (the DD-bit-never-set hang this article debugs):
 *  - segment count must fit what the hardware can chain,
 *  - only ol_flags offload bits the NIC supports may be set,
 *  - packet must not be shorter than the hardware minimum,
 *  - (debug builds only) generic tx_offload field validation,
 *  - pseudo-header checksum is pre-seeded for HW L4 checksum offload.
 *
 * Returns nb_pkts when all packets pass; otherwise sets rte_errno and
 * returns the index of the first failing packet, so the caller knows how
 * many leading packets are safe to transmit.
 */
uint16_t
ixgbe_prep_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
{
	int i, ret;
	uint64_t ol_flags;
	struct rte_mbuf *m;
	struct ixgbe_tx_queue *txq = (struct ixgbe_tx_queue *)tx_queue;

	for (i = 0; i < nb_pkts; i++) {
		m = tx_pkts[i];
		ol_flags = m->ol_flags;

		/**
		 * Check if packet meets requirements for number of segments
		 *
		 * NOTE: for ixgbe it's always (40 - WTHRESH) for both TSO and
		 *       non-TSO
		 */

		if (m->nb_segs > IXGBE_TX_MAX_SEG - txq->wthresh) {
			rte_errno = EINVAL;
			return i;
		}

		/* reject offload flags the hardware cannot handle */
		if (ol_flags & IXGBE_TX_OFFLOAD_NOTSUP_MASK) {
			rte_errno = ENOTSUP;
			return i;
		}

		/* check the size of packet */
		if (m->pkt_len < IXGBE_TX_MIN_PKT_LEN) {
			rte_errno = EINVAL;
			return i;
		}

#ifdef RTE_LIBRTE_ETHDEV_DEBUG
		/* extra consistency checks, compiled into debug builds only */
		ret = rte_validate_tx_offload(m);
		if (ret != 0) {
			rte_errno = -ret;
			return i;
		}
#endif
		/* seed pseudo-header checksums required by HW cksum offload */
		ret = rte_net_intel_cksum_prepare(m);
		if (ret != 0) {
			rte_errno = -ret;
			return i;
		}
	}

	return i;
}

/*
 * Convenience wrapper (quoted from DPDK): prepare checksums using the
 * offload flags already stored in the mbuf itself.
 */
static inline int
rte_net_intel_cksum_prepare(struct rte_mbuf *m)
{
	return rte_net_intel_cksum_flags_prepare(m, m->ol_flags);
}

/*
 * Prepare an mbuf for Intel-style HW checksum offload (quoted from DPDK).
 *
 * For HW L3/L4 checksum offload the driver must (a) zero the IPv4 header
 * checksum field and (b) seed the L4 checksum field with the pseudo-header
 * checksum; the NIC computes the final values on transmit.  All header
 * locations are derived from m->l2_len / l3_len / outer_l*_len, which is
 * exactly why wrong length fields (the failure mode this article analyzes)
 * corrupt the offload.  Returns 0 on success, -ENOTSUP when the headers do
 * not fit contiguously in the first segment.
 */
static inline int
rte_net_intel_cksum_flags_prepare(struct rte_mbuf *m, uint64_t ol_flags)
{
	/* Initialise ipv4_hdr to avoid false positive compiler warnings. */
	struct rte_ipv4_hdr *ipv4_hdr = NULL;
	struct rte_ipv6_hdr *ipv6_hdr;
	struct rte_tcp_hdr *tcp_hdr;
	struct rte_udp_hdr *udp_hdr;
	uint64_t inner_l3_offset = m->l2_len;

	/*
	 * Does packet set any of available offloads?
	 * Mainly it is required to avoid fragmented headers check if
	 * no offloads are requested.
	 */
	if (!(ol_flags & (PKT_TX_IP_CKSUM | PKT_TX_L4_MASK | PKT_TX_TCP_SEG)))
		return 0;

	/* tunnelled packet: inner L3 starts after the outer headers too */
	if (ol_flags & (PKT_TX_OUTER_IPV4 | PKT_TX_OUTER_IPV6))
		inner_l3_offset += m->outer_l2_len + m->outer_l3_len;

	/*
	 * Check if headers are fragmented.
	 * The check could be less strict depending on which offloads are
	 * requested and headers to be used, but let's keep it simple.
	 */
	if (unlikely(rte_pktmbuf_data_len(m) <
		     inner_l3_offset + m->l3_len + m->l4_len))
		return -ENOTSUP;

	if (ol_flags & PKT_TX_IPV4) {
		ipv4_hdr = rte_pktmbuf_mtod_offset(m, struct rte_ipv4_hdr *,
				inner_l3_offset);

		/* HW recomputes the IP checksum; it must start from zero */
		if (ol_flags & PKT_TX_IP_CKSUM)
			ipv4_hdr->hdr_checksum = 0;
	}

	if ((ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM) {
		if (ol_flags & PKT_TX_IPV4) {
			udp_hdr = (struct rte_udp_hdr *)((char *)ipv4_hdr +
					m->l3_len);
			/* seed with IPv4 pseudo-header checksum */
			udp_hdr->dgram_cksum = rte_ipv4_phdr_cksum(ipv4_hdr,
					ol_flags);
		} else {
			ipv6_hdr = rte_pktmbuf_mtod_offset(m,
				struct rte_ipv6_hdr *, inner_l3_offset);
			/* non-TSO udp */
			udp_hdr = rte_pktmbuf_mtod_offset(m,
					struct rte_udp_hdr *,
					inner_l3_offset + m->l3_len);
			/* seed with IPv6 pseudo-header checksum */
			udp_hdr->dgram_cksum = rte_ipv6_phdr_cksum(ipv6_hdr,
					ol_flags);
		}
	} else if ((ol_flags & PKT_TX_L4_MASK) == PKT_TX_TCP_CKSUM ||
			(ol_flags & PKT_TX_TCP_SEG)) {
		if (ol_flags & PKT_TX_IPV4) {
			/* non-TSO tcp or TSO */
			tcp_hdr = (struct rte_tcp_hdr *)((char *)ipv4_hdr +
					m->l3_len);
			tcp_hdr->cksum = rte_ipv4_phdr_cksum(ipv4_hdr,
					ol_flags);
		} else {
			ipv6_hdr = rte_pktmbuf_mtod_offset(m,
				struct rte_ipv6_hdr *, inner_l3_offset);
			/* non-TSO tcp or TSO */
			tcp_hdr = rte_pktmbuf_mtod_offset(m,
					struct rte_tcp_hdr *,
					inner_l3_offset + m->l3_len);
			tcp_hdr->cksum = rte_ipv6_phdr_cksum(ipv6_hdr,
					ol_flags);
		}
	}

	return 0;
}


分析:
感觉 ixgbe的 ixgbe_prep_pkt 写的并不是很完善。比如,对于ipv6 报文的一些信息没有太多的检查。

参考

https://decodezp.github.io/2019/10/17/quickwords34-dpdk-tx-hang/
  • 0
    点赞
  • 10
    收藏
    觉得还不错? 一键收藏
  • 2
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值