struct rte_mbuf
struct rte_mbuf的源码定义如下。由于成员越来越多,该结构体在定义时有意占用两个cache line的大小,
通过MARKER cacheline0和MARKER cacheline1来标识这两个cache line:cacheline0到cacheline1之间的成员都位于cache line 0,原则上基础性、频繁访问的数据会放在cache line 0里面;而cacheline1之后的成员都位于cache line 1。
/**
 * The generic packet buffer descriptor.
 *
 * The members are split by two marker members: everything declared between
 * `cacheline0` and `cacheline1` is meant to live in the first cache line
 * (buffer addresses, the RX rearm data and the RX descriptor fields), and
 * everything declared after `cacheline1` lives in the second one, which is
 * described below as used only in the slow path or on TX.
 *
 * NOTE(review): upstream DPDK presumably also attaches a cache-alignment
 * attribute to the closing brace of this struct - confirm against the
 * release being documented.
 */
struct rte_mbuf {
	MARKER cacheline0;

	void *buf_addr;           /**< Virtual address of segment buffer. */
	/**
	 * Physical address of segment buffer.
	 * Force alignment to 8-bytes, so as to ensure we have the exact
	 * same mbuf cacheline0 layout for 32-bit and 64-bit. This makes
	 * working on vector drivers easier.
	 */
	RTE_STD_C11
	union {
		rte_iova_t buf_iova;
		rte_iova_t buf_physaddr; /**< deprecated */
	} __rte_aligned(sizeof(rte_iova_t));

	/* next 8 bytes are initialised on RX descriptor rearm */
	MARKER64 rearm_data;
	uint16_t data_off;

	/**
	 * Reference counter. Its size should at least equal to the size
	 * of port field (16 bits), to support zero-copy broadcast.
	 * It should only be accessed using the following functions:
	 * rte_mbuf_refcnt_update(), rte_mbuf_refcnt_read(), and
	 * rte_mbuf_refcnt_set(). The functionality of these functions (atomic,
	 * or non-atomic) is controlled by the CONFIG_RTE_MBUF_REFCNT_ATOMIC
	 * config option.
	 */
	RTE_STD_C11
	union {
		rte_atomic16_t refcnt_atomic; /**< Atomically accessed refcnt */
		uint16_t refcnt;              /**< Non-atomically accessed refcnt */
	};
	uint16_t nb_segs;         /**< Number of segments. */

	/** Input port (16 bits to support more than 256 virtual ports).
	 * The event eth Tx adapter uses this field to specify the output port.
	 */
	uint16_t port;

	uint64_t ol_flags;        /**< Offload features. */

	/* remaining bytes are set on RX when pulling packet from descriptor */
	MARKER rx_descriptor_fields1;

	/*
	 * The packet type, which is the combination of outer/inner L2, L3, L4
	 * and tunnel types. The packet_type is about data really present in the
	 * mbuf. Example: if vlan stripping is enabled, a received vlan packet
	 * would have RTE_PTYPE_L2_ETHER and not RTE_PTYPE_L2_VLAN because the
	 * vlan is stripped from the data.
	 */
	RTE_STD_C11
	union {
		uint32_t packet_type; /**< L2/L3/L4 and tunnel information. */
		struct {
			uint32_t l2_type:4; /**< (Outer) L2 type. */
			uint32_t l3_type:4; /**< (Outer) L3 type. */
			uint32_t l4_type:4; /**< (Outer) L4 type. */
			uint32_t tun_type:4; /**< Tunnel type. */
			RTE_STD_C11
			union {
				uint8_t inner_esp_next_proto;
				/**< ESP next protocol type, valid if
				 * RTE_PTYPE_TUNNEL_ESP tunnel type is set
				 * on both Tx and Rx.
				 */
				__extension__
				struct {
					uint8_t inner_l2_type:4;
					/**< Inner L2 type. */
					uint8_t inner_l3_type:4;
					/**< Inner L3 type. */
				};
			};
			uint32_t inner_l4_type:4; /**< Inner L4 type. */
		};
	};

	uint32_t pkt_len;         /**< Total pkt len: sum of all segments. */
	uint16_t data_len;        /**< Amount of data in segment buffer. */
	/** VLAN TCI (CPU order), valid if PKT_RX_VLAN is set. */
	uint16_t vlan_tci;

	/*
	 * Eight bytes shared between the named `hash` union and an anonymous
	 * { tx_metadata, reserved } pair.
	 */
	RTE_STD_C11
	union {
		union {
			uint32_t rss;     /**< RSS hash result if RSS enabled */
			struct {
				union {
					struct {
						uint16_t hash;
						uint16_t id;
					};
					uint32_t lo;
					/**< Second 4 flexible bytes */
				};
				uint32_t hi;
				/**< First 4 flexible bytes or FD ID, dependent
				 * on PKT_RX_FDIR_* flag in ol_flags.
				 */
			} fdir;           /**< Filter identifier if FDIR enabled */
			struct {
				uint32_t lo;
				uint32_t hi;
				/**< The event eth Tx adapter uses this field
				 * to store Tx queue id.
				 * @see rte_event_eth_tx_adapter_txq_set()
				 */
			} sched;          /**< Hierarchical scheduler */
			/** User defined tags. See rte_distributor_process() */
			uint32_t usr;
		} hash;                   /**< hash information */
		struct {
			/**
			 * Application specific metadata value
			 * for egress flow rule match.
			 * Valid if PKT_TX_METADATA is set.
			 * Located here to allow conjunct use
			 * with hash.sched.hi.
			 */
			uint32_t tx_metadata;
			uint32_t reserved;
		};
	};

	/** Outer VLAN TCI (CPU order), valid if PKT_RX_QINQ is set. */
	uint16_t vlan_tci_outer;

	uint16_t buf_len;         /**< Length of segment buffer. */

	/** Valid if PKT_RX_TIMESTAMP is set. The unit and time reference
	 * are not normalized but are always the same for a given port.
	 */
	uint64_t timestamp;

	/* second cache line - fields only used in slow path or on TX */
	MARKER cacheline1 __rte_cache_min_aligned;

	RTE_STD_C11
	union {
		void *userdata;   /**< Can be used for external metadata */
		uint64_t udata64; /**< Allow 8-byte userdata on 32-bit */
	};

	struct rte_mempool *pool; /**< Pool from which mbuf was allocated. */
	struct rte_mbuf *next;    /**< Next segment of scattered packet. */

	/* fields to support TX offloads */
	RTE_STD_C11
	union {
		uint64_t tx_offload;       /**< combined for easy fetch */
		__extension__
		struct {
			uint64_t l2_len:7;
			/**< L2 (MAC) Header Length for non-tunneling pkt.
			 * Outer_L4_len + ... + Inner_L2_len for tunneling pkt.
			 */
			uint64_t l3_len:9; /**< L3 (IP) Header Length. */
			uint64_t l4_len:8; /**< L4 (TCP/UDP) Header Length. */
			uint64_t tso_segsz:16; /**< TCP TSO segment size */

			/* fields for TX offloading of tunnels */
			uint64_t outer_l3_len:9; /**< Outer L3 (IP) Hdr Length. */
			uint64_t outer_l2_len:7; /**< Outer L2 (MAC) Hdr Length. */

			/* uint64_t unused:8; */
		};
	};

	/** Size of the application private data. In case of an indirect
	 * mbuf, it stores the direct mbuf private data size. */
	uint16_t priv_size;

	/** Timesync flags for use with IEEE1588. */
	uint16_t timesync;

	/** Sequence number. See also rte_reorder_insert(). */
	uint32_t seqn;

	/** Shared data for external buffer attached to mbuf. See
	 * rte_pktmbuf_attach_extbuf().
	 */
	struct rte_mbuf_ext_shared_info *shinfo;

};
buf_addr
当前mbuf数据缓冲区(segment buffer)的虚拟地址。标准情况下,buf_addr指向的内存从mbuf头部开始,偏移一个mbuf结构体头再加上私有数据区的大小,如下所示:
m->buf_addr = (char *)m + sizeof(struct rte_mbuf) + priv_size;
初始化这个变量是在我们创建mbuf的mempool的时候完成的
rte_pktmbuf_pool_create
rte_mempool_obj_iter(mp, rte_pktmbuf_init, NULL);
rte_pktmbuf_init
m->buf_addr = (char *)m + mbuf_size;
相关api:
rte_pktmbuf_mtod 返回mbuf数据buf的开始位置,已经做了data_off偏移。
buf的物理地址
/*
 * Excerpt from struct rte_mbuf: the physical (rte_iova_t) address of the
 * segment buffer. Both members name the same storage; buf_physaddr is the
 * deprecated spelling. The union is aligned to sizeof(rte_iova_t).
 */
union {
rte_iova_t buf_iova;
rte_iova_t buf_physaddr; /**< deprecated */
} __rte_aligned(sizeof(rte_iova_t));
mbuf对应的物理地址,一般mbuf物理地址在初始化mempool的时候就设置了,在mbuf对应obj的head里面存放,如下结构体的objhdr里面的iova/physaddr
/**
 * Per-object header kept by the mempool for each of its objects.
 *
 * It links the object into a list, records the owning mempool and stores
 * the object's IO address.
 */
struct rte_mempool_objhdr {
	/** Link to the next object header in the list. */
	STAILQ_ENTRY(rte_mempool_objhdr) next;

	/** Mempool that owns this object. */
	struct rte_mempool *mp;

	RTE_STD_C11
	union {
		/** IO address of the object. */
		rte_iova_t iova;
		/** Deprecated name: physical address of the object. */
		phys_addr_t physaddr;
	};

#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
	/** Debug cookie; present only when RTE_LIBRTE_MEMPOOL_DEBUG is defined. */
	uint64_t cookie;
#endif
};
这个转化关系如下:
m->buf_iova = rte_mempool_virt2iova(m) + sizeof(struct rte_mbuf) + priv_size;
data_off
这个变量是标识mbuf的data room开始地址到报文起始位置的偏移,默认是设置为RTE_PKTMBUF_HEADROOM(128),
我们在创建一个mbuf的mem pool的时候,会指定data room的大小,如下所示的data_room_size参数,
/**
 * Create a packet mbuf pool.
 *
 * Thin wrapper: every argument is forwarded unchanged to
 * rte_pktmbuf_pool_create_by_ops(), with NULL passed as the trailing
 * argument.
 *
 * @param name            Name handed to the underlying create call.
 * @param n               Forwarded as-is (presumably the number of mbufs
 *                        in the pool - confirm against the DPDK API docs).
 * @param cache_size      Forwarded as-is (presumably the per-lcore cache
 *                        size - confirm).
 * @param priv_size       Size of the application private data per mbuf.
 * @param data_room_size  Size of the data room of each mbuf.
 * @param socket_id       Forwarded as-is (presumably the NUMA socket to
 *                        allocate from - confirm).
 * @return Whatever rte_pktmbuf_pool_create_by_ops() returns.
 */
struct rte_mempool *
rte_pktmbuf_pool_create(const char *name,
	unsigned int n,
	unsigned int cache_size,
	uint16_t priv_size,
	uint16_t data_room_size,
	int socket_id)
{
	struct rte_mempool *mp;

	mp = rte_pktmbuf_pool_create_by_ops(name, n, cache_size, priv_size,
			data_room_size, socket_id, NULL);
	return mp;
}
data_room_size表示每一个mbuf数据缓冲区(data room)的大小,其中包含headroom,所以一般会设置为不小于一个MTU加上128B的头部预留空间。
dpdk提供一个默认宏定义:
/* Default headroom, in bytes, used as the initial data_off of an mbuf. */
#define RTE_PKTMBUF_HEADROOM        128

/* Default data room, in bytes. */
#define RTE_MBUF_DEFAULT_DATAROOM   2048

/* Default buffer size: the default data room plus the headroom. */
#define RTE_MBUF_DEFAULT_BUF_SIZE \
	(RTE_MBUF_DEFAULT_DATAROOM + RTE_PKTMBUF_HEADROOM)
所以当我们从mbuf pool alloc一块mbuf过来的时候,都会reset一下mbuf的变量,里面就包含了重置data_off,具体如下:
/**
 * Reset the data offset of an mbuf to its default headroom.
 *
 * data_off becomes RTE_PKTMBUF_HEADROOM, clamped to buf_len so that the
 * offset never points past the end of a buffer shorter than the headroom.
 *
 * @param m  The mbuf whose data_off is rewritten.
 */
static inline void rte_pktmbuf_reset_headroom(struct rte_mbuf *m)
{
	const uint16_t headroom = (uint16_t)RTE_PKTMBUF_HEADROOM;
	const uint16_t room = (uint16_t)m->buf_len;

	/* Take the smaller of the two values. */
	m->data_off = (uint16_t)((room < headroom) ? room : headroom);
}
/**
 * Reset the per-packet fields of an mbuf to their default values.
 *
 * After the call the mbuf is a single unchained segment with no data,
 * data_off is back at the default headroom, and the port, offload flags,
 * packet type, TX offload word and both VLAN TCIs are cleared.
 *
 * @param m  The mbuf to reset.
 */
static inline void rte_pktmbuf_reset(struct rte_mbuf *m)
{
	/* Segment chain: one segment, nothing linked behind it. */
	m->next = NULL;
	m->nb_segs = 1;

	/* Lengths and data offset: empty packet starting at the headroom. */
	m->pkt_len = 0;
	m->data_len = 0;
	rte_pktmbuf_reset_headroom(m);

	/* Per-packet metadata. */
	m->port = MBUF_INVALID_PORT;
	m->ol_flags = 0;
	m->packet_type = 0;
	m->tx_offload = 0;
	m->vlan_tci = 0;
	m->vlan_tci_outer = 0;

	/* Validate the result once every field has been rewritten. */
	__rte_mbuf_sanity_check(m, 1);
}
mbuf引用计数 refcnt
用来表示mbuf被引用的次数。在mbuf被释放回mempool的时候(rte_mbuf_raw_free),会通过RTE_ASSERT检查引用计数必须为1,否则断言失败。
/**
 * Put a raw mbuf back into the mempool it came from.
 *
 * The RTE_ASSERT() lines spell out the preconditions: the mbuf must be a
 * direct one, its reference counter must read exactly 1, and it must be a
 * single unchained segment (next == NULL and nb_segs == 1).
 * NOTE(review): whether RTE_ASSERT is compiled in for release builds is not
 * visible here - confirm before relying on these checks at run time.
 *
 * @param m  The mbuf to release; it is handed to rte_mempool_put() on
 *           m->pool.
 */
static __rte_always_inline void
rte_mbuf_raw_free(struct rte_mbuf *m)
{
/* Precondition checks; their order decides which one is reported first. */
RTE_ASSERT(RTE_MBUF_DIRECT(m));
RTE_ASSERT(rte_mbuf_refcnt_read(m) == 1);
RTE_ASSERT(m->next == NULL);
RTE_ASSERT(m->nb_segs == 1);
__rte_mbuf_sanity_check(m, 0);
/* Return the object to the pool recorded in the mbuf itself. */
rte_mempool_put(m->pool, m);
}
具体相关的api:
/* Read the reference counter of mbuf m. */
static inline uint16_t rte_mbuf_refcnt_read(const struct rte_mbuf *m)
/* Set the reference counter of mbuf m to new_value. */
static inline void rte_mbuf_refcnt_set(struct rte_mbuf *m, uint16_t new_value)
/* Adjust the reference counter of mbuf m by the signed delta `value`;
 * presumably returns the updated counter - confirm against the DPDK API docs. */
static inline uint16_t rte_mbuf_refcnt_update(struct rte_mbuf *m, int16_t value)
mbuf分段存储
涉及到的参数有:
uint16_t nb_segs 表示当前的mbuf报文有多少个分段
struct rte_mbuf *next 表示下一个分段的地址,单向链表连接,如下图所示
端口号
uint16_t port u16的端口号,表示输入输出端口号,无效值是UINT16_MAX
卸载特性标识 ol_flags
dpdk用一个u64来定义这个flag,使用的时候是按bit使用的,要么就是某个比特表示特定的意思,要么就是几个比特的组合表示特定的意思, 具体定义如下:
/*
 * Bit definitions for the 64-bit ol_flags field of struct rte_mbuf.
 * Most flags are a single bit; a few are multi-bit fields that come with a
 * *_MASK macro and must be compared after masking.
 */

/* ---- RX flags, bits 0..6 ---- */
#define PKT_RX_VLAN                   (1ULL << 0)  /* vlan_tci is valid. */
#define PKT_RX_RSS_HASH               (1ULL << 1)  /**< RX packet carries an RSS hash result. */
#define PKT_RX_FDIR                   (1ULL << 2)  /**< RX packet with an FDIR match indication. */
#define PKT_RX_L4_CKSUM_BAD           (1ULL << 3)
#define PKT_RX_IP_CKSUM_BAD           (1ULL << 4)
#define PKT_RX_EIP_CKSUM_BAD          (1ULL << 5)  /**< External IP header checksum error. */
#define PKT_RX_VLAN_STRIPPED          (1ULL << 6)

/* ---- RX IP checksum status: two-bit field made of bits 4 and 7 ---- */
#define PKT_RX_IP_CKSUM_MASK          ((1ULL << 4) | (1ULL << 7))
#define PKT_RX_IP_CKSUM_BAD           (1ULL << 4)  /* identical repeat of the definition above */
#define PKT_RX_IP_CKSUM_GOOD          (1ULL << 7)
#define PKT_RX_IP_CKSUM_NONE          ((1ULL << 4) | (1ULL << 7))

/* ---- RX L4 checksum status: two-bit field made of bits 3 and 8 ---- */
#define PKT_RX_L4_CKSUM_MASK          ((1ULL << 3) | (1ULL << 8))
#define PKT_RX_L4_CKSUM_BAD           (1ULL << 3)  /* identical repeat of the definition above */
#define PKT_RX_L4_CKSUM_GOOD          (1ULL << 8)
#define PKT_RX_L4_CKSUM_NONE          ((1ULL << 3) | (1ULL << 8))

/* ---- RX flags, bits 9..20 (bits 11 and 12 are not defined here) ---- */
#define PKT_RX_IEEE1588_PTP           (1ULL << 9)  /**< RX IEEE1588 L2 Ethernet PT Packet. */
#define PKT_RX_IEEE1588_TMST          (1ULL << 10) /**< RX IEEE1588 L2/L4 timestamped packet. */
#define PKT_RX_FDIR_ID                (1ULL << 13) /**< FD id reported if FDIR match. */
#define PKT_RX_FDIR_FLX               (1ULL << 14) /**< Flexible bytes reported if FDIR match. */
#define PKT_RX_QINQ_STRIPPED          (1ULL << 15)
#define PKT_RX_LRO                    (1ULL << 16)
#define PKT_RX_TIMESTAMP              (1ULL << 17) /* timestamp is valid. */
#define PKT_RX_SEC_OFFLOAD            (1ULL << 18)
#define PKT_RX_SEC_OFFLOAD_FAILED     (1ULL << 19)
#define PKT_RX_QINQ                   (1ULL << 20) /* vlan_tci_outer is valid. */

/* ---- RX outer L4 checksum status: two-bit field, bits 21..22 ---- */
#define PKT_RX_OUTER_L4_CKSUM_MASK    ((1ULL << 21) | (1ULL << 22))
#define PKT_RX_OUTER_L4_CKSUM_UNKNOWN 0
#define PKT_RX_OUTER_L4_CKSUM_BAD     (1ULL << 21)
#define PKT_RX_OUTER_L4_CKSUM_GOOD    (1ULL << 22)
#define PKT_RX_OUTER_L4_CKSUM_INVALID ((1ULL << 21) | (1ULL << 22))

/* ---- TX flags, bits 40..44 ---- */
#define PKT_TX_METADATA               (1ULL << 40) /* tx_metadata is valid. */
#define PKT_TX_OUTER_UDP_CKSUM        (1ULL << 41)
#define PKT_TX_UDP_SEG                (1ULL << 42)
#define PKT_TX_SEC_OFFLOAD            (1ULL << 43)
#define PKT_TX_MACSEC                 (1ULL << 44)

/* ---- TX tunnel type: four-bit field, bits 45..48 ---- */
#define PKT_TX_TUNNEL_VXLAN           (0x1ULL << 45)
#define PKT_TX_TUNNEL_GRE             (0x2ULL << 45)
#define PKT_TX_TUNNEL_IPIP            (0x3ULL << 45)
#define PKT_TX_TUNNEL_GENEVE          (0x4ULL << 45)
#define PKT_TX_TUNNEL_MPLSINUDP       (0x5ULL << 45)
#define PKT_TX_TUNNEL_VXLAN_GPE       (0x6ULL << 45)
#define PKT_TX_TUNNEL_IP              (0xDULL << 45)
#define PKT_TX_TUNNEL_UDP             (0xEULL << 45)
#define PKT_TX_TUNNEL_MASK            (0xFULL << 45)

/* ---- TX flags, bits 49..51 ---- */
#define PKT_TX_QINQ                   (1ULL << 49) /**< TX packet with double VLAN inserted. */
#define PKT_TX_TCP_SEG                (1ULL << 50)
#define PKT_TX_IEEE1588_TMST          (1ULL << 51) /**< TX IEEE1588 packet to timestamp. */

/* ---- TX L4 checksum request: two-bit field, bits 52..53 ---- */
#define PKT_TX_L4_NO_CKSUM            (0ULL << 52) /**< Disable L4 cksum of TX pkt. */
#define PKT_TX_TCP_CKSUM              (1ULL << 52) /**< TCP cksum of TX pkt, computed by NIC. */
#define PKT_TX_SCTP_CKSUM             (2ULL << 52) /**< SCTP cksum of TX pkt, computed by NIC. */
#define PKT_TX_UDP_CKSUM              (3ULL << 52) /**< UDP cksum of TX pkt, computed by NIC. */
#define PKT_TX_L4_MASK                (3ULL << 52) /**< Mask for L4 cksum offload request. */

/* ---- TX flags, bits 54..60 ---- */
#define PKT_TX_IP_CKSUM               (1ULL << 54)
#define PKT_TX_IPV4                   (1ULL << 55)
#define PKT_TX_IPV6                   (1ULL << 56)
#define PKT_TX_VLAN                   (1ULL << 57)
#define PKT_TX_OUTER_IP_CKSUM         (1ULL << 58)
#define PKT_TX_OUTER_IPV4             (1ULL << 59)
#define PKT_TX_OUTER_IPV6             (1ULL << 60)

/* ---- Buffer attachment flags, bits 61..62 ---- */
#define EXT_ATTACHED_MBUF             (1ULL << 61)
#define IND_ATTACHED_MBUF             (1ULL << 62) /**< Indirect attached mbuf */
报文类型packet_type
主要用来表示报文的L2/L3/L4 and tunnel information
具体定义在 dpdk/lib/librte_mbuf/rte_mbuf_ptype.h里面
报文长度信息
具体涉及到的变量有:
uint32_t pkt_len 表示总的报文大小的长度,包含所有seg分段报文的报文长度
uint16_t data_len 表示当前mbuf的报文数据长度
uint16_t buf_len 表示当前mbuf整个segment buffer的长度,即headroom + data_len + 剩余的tailroom,而不只是headroom加data_len
这里把buf_len一起列出来对比,是因为这三个都是mbuf里面表示长度的变量,容易混淆,放在一起比较更容易理解。
vlan信息
主要涉及的变量有:
uint16_t vlan_tci 表示vlan报文的tci,如果ol_flags设置了PKT_RX_VLAN,这个变量才有意义
uint16_t vlan_tci_outer 表示qinq报文的外层vlan信息,如果ol_flags设置了PKT_RX_QINQ,这个变量才有意义
报文的hash信息
用一个u64大小的union来表示,主要包含
- 硬件nic RSS后的结果
- fdir 过滤标识符信息
- sched 分层调度
/*
 * Excerpt from struct rte_mbuf: eight bytes shared between the named `hash`
 * union and an anonymous { tx_metadata, reserved } pair.
 *
 * `hash` itself overlays four views of the same storage:
 *   - rss   : 32-bit RSS hash result,
 *   - fdir  : flow-director data, two 32-bit words (lo / hi), where lo can
 *             also be read as the 16-bit pair { hash, id },
 *   - sched : two 32-bit words (lo / hi) for the hierarchical scheduler,
 *   - usr   : 32-bit user-defined tag.
 */
union {
union {
uint32_t rss; /**< RSS hash result if RSS enabled */
struct {
union {
struct {
uint16_t hash;
uint16_t id;
};
uint32_t lo;
/**< Second 4 flexible bytes */
};
uint32_t hi;
/**< First 4 flexible bytes or FD ID, dependent
* on PKT_RX_FDIR_* flag in ol_flags.
*/
} fdir; /**< Filter identifier if FDIR enabled */
struct {
uint32_t lo;
uint32_t hi;
/**< The event eth Tx adapter uses this field
* to store Tx queue id.
* @see rte_event_eth_tx_adapter_txq_set()
*/
} sched; /**< Hierarchical scheduler */
/** User defined tags. See rte_distributor_process() */
uint32_t usr;
} hash; /**< hash information */
struct {
/**
* Application specific metadata value
* for egress flow rule match.
* Valid if PKT_TX_METADATA is set.
* Located here to allow conjunct use
* with hash.sched.hi.
*/
uint32_t tx_metadata;
uint32_t reserved;
};
};
报文的时间戳timestamp
只有ol_flags设置了PKT_RX_TIMESTAMP,这个变量才有意义,此时驱动会将报文的接收时间戳填充进来。
pool
表示mbuf从这个pool申请来的,释放mbuf的时候用到
/**
 * Put a raw mbuf back into the mempool it came from.
 *
 * The mempool is taken from m->pool, which is why the mbuf records the pool
 * it was allocated from. The RTE_ASSERT() lines spell out the preconditions:
 * direct mbuf, reference counter equal to 1, single unchained segment.
 *
 * @param m  The mbuf to release.
 */
static __rte_always_inline void
rte_mbuf_raw_free(struct rte_mbuf *m)
{
RTE_ASSERT(RTE_MBUF_DIRECT(m));
RTE_ASSERT(rte_mbuf_refcnt_read(m) == 1);
RTE_ASSERT(m->next == NULL);
RTE_ASSERT(m->nb_segs == 1);
__rte_mbuf_sanity_check(m, 0);
/* m->pool identifies the mempool that receives the object. */
rte_mempool_put(m->pool, m);
}
TX offloads
使用一个u64的变量来表示tx offload的信息。一般使用tx offload时需要设置报文各层头部的长度信息,
如l2_len、l3_len、l4_len等等,具体需要设置哪些字段取决于nic硬件支持的offload类型。
/*
 * Excerpt from struct rte_mbuf: TX offload parameters.
 *
 * tx_offload gives access to all of them as one 64-bit word; the anonymous
 * struct splits the same storage into bit-fields. The declared widths add
 * up to 7 + 9 + 8 + 16 + 9 + 7 = 56 bits, leaving 8 bits unused (see the
 * commented-out `unused` member).
 */
union {
uint64_t tx_offload; /**< combined for easy fetch */
__extension__
struct {
uint64_t l2_len:7;
/**< L2 (MAC) Header Length for non-tunneling pkt.
* Outer_L4_len + ... + Inner_L2_len for tunneling pkt.
*/
uint64_t l3_len:9; /**< L3 (IP) Header Length. */
uint64_t l4_len:8; /**< L4 (TCP/UDP) Header Length. */
uint64_t tso_segsz:16; /**< TCP TSO segment size */
/* fields for TX offloading of tunnels */
uint64_t outer_l3_len:9; /**< Outer L3 (IP) Hdr Length. */
uint64_t outer_l2_len:7; /**< Outer L2 (MAC) Hdr Length. */
/* uint64_t unused:8; */
};
};
其他信息
uint16_t priv_size表示mbuf里面私有数据空间大小
uint16_t timesync表示IEEE1588标准的时间同步标志
uint32_t seqn mbuf的序列号,是dpdk的重排序库(reorder,参见rte_reorder_insert())用到的标识,它会根据这里的序列号对报文重新排序。