一、checksum:
tcp checksum包括三部分:
1)、伪头部校验和;
伪头部包括: 源ip 、宿ip、 协议号、tcp 长度,主要用于校验是正确的目的机器接收到数据包
2)、tcp头部校验和;
3)、数据部分校验和;
当硬件有checksum offload能力时,可以通过ethtool -K <dev> tx on/off设置是否将checksum offload到硬件处理;
如果将checksum offload到硬件,那协议栈只需要计算伪头部的校验和,然后将其存放在tcp->check里;并同时
将csum_start、csum_offset告诉硬件,csum_start表示硬件需要计算checksum的起始位置(tcp头部起始位置),
csum_offset表示硬件计算完checksum后将值存放的位置。
二、offload流程
1、传输层
tcp_sendmsg
if (sk->sk_route_caps & NETIF_F_CSUM_MASK)(判断网卡是否有checksum offload功能,如果有,则将ip_summed置为CHECKSUM_PARTIAL)
skb->ip_summed = CHECKSUM_PARTIAL;
skb_copy_to_page_nocache
skb_do_copy_data_nocache (根据ip_summed值填充skb->csum;如果网卡没有offload能力,则这里先计算数据部分的checksum值,然后保存在skb->csum里;如果硬件有offload能力,仅仅只是将数据从用户态拷贝过来,不计算校验和)
tcp_push_one
tcp_write_xmit
tcp_transmit_skb
icsk->icsk_af_ops->send_check
tcp_v4_send_check
__tcp_v4_send_check
/*
 * __tcp_v4_send_check - fill in the TCP checksum field of an outgoing skb.
 *
 * With hardware checksum offload (CHECKSUM_PARTIAL) only the pseudo-header
 * sum is stored in th->check; csum_start/csum_offset tell the NIC where to
 * start summing (the TCP header) and where to write the result.  Without
 * offload, the full checksum (pseudo-header + TCP header + payload) is
 * computed in software, reusing the payload sum cached in skb->csum.
 */
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		/* Offloading: th->check holds only the pseudo-header
		 * checksum; the TCP header and payload are summed by
		 * the hardware. */
		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
		/* Offset (from skb->head) where hardware starts summing:
		 * the start of the TCP header. */
		skb->csum_start = skb_transport_header(skb) - skb->head;
		/* Offset (from csum_start) where hardware stores the
		 * result: the TCP 'check' field. */
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		/*
		 * No hardware offload, compute the checksum here:
		 * 1) skb->csum already holds the payload checksum computed
		 *    in skb_do_copy_data_nocache();
		 * 2) csum_partial(th, th->doff << 2, skb->csum) folds in
		 *    the TCP header;
		 * 3) tcp_v4_check() finally adds the pseudo-header sum.
		 */
		th->check = tcp_v4_check(skb->len, saddr, daddr,
					 csum_partial(th,
						      th->doff << 2,
						      skb->csum));
	}
}
2、ip层
__ip_local_out
__ip_local_out_sk
ip_send_check(计算ip头的checksum,保存在iph->check)
3、virtio驱动层
start_xmit(virtio_net)
xmit_skb
/*
 * xmit_skb (excerpt): propagate the skb's checksum-offload state into the
 * virtio-net header so the device / vhost backend can finish the checksum.
 */
static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
...
if (skb->ip_summed == CHECKSUM_PARTIAL) {
// Flag in the virtio-net header that the device must compute the checksum.
hdr->hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
// Store csum_start / csum_offset in the virtio_net_hdr (virtio endianness).
hdr->hdr.csum_start = cpu_to_virtio16(vi->vdev,
skb_checksum_start_offset(skb));
hdr->hdr.csum_offset = cpu_to_virtio16(vi->vdev, skb->csum_offset);
} else {
// No checksum offload requested for this skb.
hdr->hdr.flags = 0;
hdr->hdr.csum_offset = hdr->hdr.csum_start = 0;
}
}
4、dpdk层vhost_user口收包
virtio_dev_tx_split
copy_desc_to_mbuf
vhost_dequeue_offload
/*
 * vhost_dequeue_offload - translate virtio-net header offload hints into
 * mbuf offload flags when dequeuing a packet from the guest.
 *
 * If the guest set VIRTIO_NET_HDR_F_NEEDS_CSUM, csum_start/csum_offset
 * identify the L4 protocol and the matching PKT_TX_*_CKSUM flag is set.
 * If GSO was requested, the TSO/UFO flags, segment size and L4 header
 * length are filled in as well.
 *
 * Fixes vs. previous revision: removed a stray extra closing brace that
 * unbalanced the function, and turned the unconditional per-packet
 * ERR-level log (with a %llu/uint64_t format mismatch) into a DEBUG log
 * with a correctly matched argument.
 */
static __rte_always_inline void
vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
{
	uint16_t l4_proto = 0;
	void *l4_hdr = NULL;
	struct tcp_hdr *tcp_hdr = NULL;

	/* Nothing to do when the guest asked for neither csum nor GSO. */
	if (hdr->flags == 0 && hdr->gso_type == VIRTIO_NET_HDR_GSO_NONE)
		return;

	parse_ethernet(m, &l4_proto, &l4_hdr);

	/* Did the virtio driver request checksum offload? */
	if (hdr->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
		/*
		 * csum_start == l2_len + l3_len means the checksum region
		 * starts at the L4 header; csum_offset then selects TCP,
		 * UDP or SCTP checksum offload.
		 */
		if (hdr->csum_start == (m->l2_len + m->l3_len)) {
			switch (hdr->csum_offset) {
			case (offsetof(struct tcp_hdr, cksum)):
				if (l4_proto == IPPROTO_TCP)
					m->ol_flags |= PKT_TX_TCP_CKSUM;
				break;
			case (offsetof(struct udp_hdr, dgram_cksum)):
				if (l4_proto == IPPROTO_UDP)
					m->ol_flags |= PKT_TX_UDP_CKSUM;
				break;
			case (offsetof(struct sctp_hdr, cksum)):
				if (l4_proto == IPPROTO_SCTP)
					m->ol_flags |= PKT_TX_SCTP_CKSUM;
				break;
			default:
				break;
			}
		}
	}

	if (l4_hdr && hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
		switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
		case VIRTIO_NET_HDR_GSO_TCPV6:
			tcp_hdr = l4_hdr;
			m->ol_flags |= PKT_TX_TCP_SEG;
			m->tso_segsz = hdr->gso_size;
			/* TCP header length in bytes: data_off upper nibble
			 * counts 32-bit words. */
			m->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
			break;
		case VIRTIO_NET_HDR_GSO_TCPV4:
			tcp_hdr = l4_hdr;
			m->ol_flags |= PKT_TX_TCP_SEG;
			m->tso_segsz = hdr->gso_size;
			m->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
			/* With TSO the hardware re-segments the packet and
			 * recomputes the IP checksum; this flag also tells
			 * the PMD this is IPv4 rather than IPv6. */
			m->ol_flags |= PKT_TX_IP_CKSUM;
			break;
		case VIRTIO_NET_HDR_GSO_UDP:
			m->ol_flags |= PKT_TX_UDP_SEG;
			m->tso_segsz = hdr->gso_size;
			m->l4_len = sizeof(struct udp_hdr);
			break;
		default:
			RTE_LOG(WARNING, VHOST_DATA,
				"unsupported gso type %u.\n", hdr->gso_type);
			break;
		}
	}

	RTE_LOG(DEBUG, VHOST_DATA, "vhost_dequeue_offload ol_flags %llu.\n",
		(unsigned long long)m->ol_flags);
}
5、ovs将数据包转给dpdk口
/*
 * netdev_dpdk_eth_tx_burst - hand a batch of mbufs to the DPDK PMD.
 *
 * Returns the number of packets that could NOT be transmitted (those are
 * freed here).  Fixes vs. previous revision: port_reconfigure() was called
 * with an undeclared identifier 'port' (the computed dp_port was clearly
 * intended), the 'retsult' typo, and a "zzzz" debug-marker log message.
 */
static inline int
netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
                         struct rte_mbuf **pkts, int cnt)
{
    uint32_t nb_tx = 0;

    /* Let the PMD fix up the packets first; in particular this computes
     * the pseudo-header checksum needed when TSO offload is enabled. */
    cnt = rte_eth_tx_prepare(dev->port_id, qid, pkts, cnt);

    while (nb_tx != cnt) {
        uint32_t ret;

        ret = rte_eth_tx_burst(dev->port_id, qid, pkts + nb_tx, cnt - nb_tx);
        if (!ret) {
            break;
        }
        nb_tx += ret;
    }

    if (OVS_UNLIKELY(nb_tx != cnt)) {
        /* Free buffers, which we couldn't transmit, one at a time (each
         * packet could come from a different mempool) */
        int i;

        for (i = nb_tx; i < cnt; i++) {
            rte_pktmbuf_free(pkts[i]);
        }

        /* NOTE(review): reconfiguring the port from the TX path on every
         * shortfall is heavy-handed — confirm this is intended.  Also
         * verify CONTAINER_OF(&dev->up, ...) really recovers the
         * dp_netdev_port: it assumes dev->up is the netdev embedded in
         * that struct rather than referenced by pointer. */
        struct dp_netdev_port *dp_port =
            CONTAINER_OF(&dev->up, struct dp_netdev_port, netdev);
        dp_port->need_reconfigure = true;
        int result = port_reconfigure(dp_port);
        VLOG_ERR_RL(&rl, "port reconfigure after tx shortfall ret: %d", result);
    }

    return cnt - nb_tx;
}
6、dpdk,ixgbe驱动将数据转给硬件
ixgbe_xmit_pkts
ixgbe_set_xmit_ctx
/*
 * ixgbe_set_xmit_ctx - program an advanced TX context descriptor with the
 * offload parameters (VLAN, L3/L4 checksum, TSO, outer-IP, IPsec) that
 * apply to subsequent data descriptors, and cache them in txq->ctx_cache
 * so identical contexts can be reused without re-uploading.
 */
static inline void ixgbe_set_xmit_ctx(struct ixgbe_tx_queue *txq,
volatile struct ixgbe_adv_tx_context_desc *ctx_txd,
uint64_t ol_flags, union ixgbe_tx_offload tx_offload,
__rte_unused uint64_t *mdata)
{
uint32_t type_tucmd_mlhl;
uint32_t mss_l4len_idx = 0;
uint32_t ctx_idx;
uint32_t vlan_macip_lens;
union ixgbe_tx_offload tx_offload_mask;
uint32_t seqnum_seed = 0;
ctx_idx = txq->ctx_curr;
tx_offload_mask.data[0] = 0;
tx_offload_mask.data[1] = 0;
type_tucmd_mlhl = 0;
/* Specify which HW CTX to upload. */
mss_l4len_idx |= (ctx_idx << IXGBE_ADVTXD_IDX_SHIFT);
if (ol_flags & PKT_TX_VLAN_PKT) {
tx_offload_mask.vlan_tci |= ~0;
}
/* check if TCP segmentation required for this packet */
/* Does the packet need hardware TSO? */
if (ol_flags & PKT_TX_TCP_SEG) {
/* implies IP cksum in IPv4 */
/* With hardware TSO the NIC recomputes the TCP and IP checksums for
 * every segment it emits, so the only choice made here is whether
 * the IP checksum is IPv4 or IPv6. */
if (ol_flags & PKT_TX_IP_CKSUM)
type_tucmd_mlhl = IXGBE_ADVTXD_TUCMD_IPV4 |
IXGBE_ADVTXD_TUCMD_L4T_TCP |
IXGBE_ADVTXD_DTYP_CTXT | IXGBE_ADVTXD_DCMD_DEXT;
else
type_tucmd_mlhl = IXGBE_ADVTXD_TUCMD_IPV6 |
IXGBE_ADVTXD_TUCMD_L4T_TCP |
IXGBE_ADVTXD_DTYP_CTXT | IXGBE_ADVTXD_DCMD_DEXT;
/* Tell the hardware the l2/l3/l4 header lengths and the segment size. */
tx_offload_mask.l2_len |= ~0;
tx_offload_mask.l3_len |= ~0;
tx_offload_mask.l4_len |= ~0;
tx_offload_mask.tso_segsz |= ~0;
mss_l4len_idx |= tx_offload.tso_segsz << IXGBE_ADVTXD_MSS_SHIFT;
mss_l4len_idx |= tx_offload.l4_len << IXGBE_ADVTXD_L4LEN_SHIFT;
} else { /* no TSO, check if hardware checksum is needed */
/* Without TSO the hardware only computes the L4 (header + payload)
 * checksum; the IP checksum is redone only when explicitly
 * requested, so check PKT_TX_IP_CKSUM here. */
if (ol_flags & PKT_TX_IP_CKSUM) {
type_tucmd_mlhl = IXGBE_ADVTXD_TUCMD_IPV4;
tx_offload_mask.l2_len |= ~0;
tx_offload_mask.l3_len |= ~0;
}
switch (ol_flags & PKT_TX_L4_MASK) {
case PKT_TX_UDP_CKSUM:
type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_UDP |
IXGBE_ADVTXD_DTYP_CTXT | IXGBE_ADVTXD_DCMD_DEXT;
mss_l4len_idx |= sizeof(struct udp_hdr) << IXGBE_ADVTXD_L4LEN_SHIFT;
tx_offload_mask.l2_len |= ~0;
tx_offload_mask.l3_len |= ~0;
break;
/* Plain (non-TSO) checksum offload: the hardware recomputes only
 * the TCP checksum. */
case PKT_TX_TCP_CKSUM:
type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_TCP |
IXGBE_ADVTXD_DTYP_CTXT | IXGBE_ADVTXD_DCMD_DEXT;
mss_l4len_idx |= sizeof(struct tcp_hdr) << IXGBE_ADVTXD_L4LEN_SHIFT;
tx_offload_mask.l2_len |= ~0;
tx_offload_mask.l3_len |= ~0;
break;
case PKT_TX_SCTP_CKSUM:
type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_SCTP |
IXGBE_ADVTXD_DTYP_CTXT | IXGBE_ADVTXD_DCMD_DEXT;
mss_l4len_idx |= sizeof(struct sctp_hdr) << IXGBE_ADVTXD_L4LEN_SHIFT;
tx_offload_mask.l2_len |= ~0;
tx_offload_mask.l3_len |= ~0;
break;
default:
type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_RSV |
IXGBE_ADVTXD_DTYP_CTXT | IXGBE_ADVTXD_DCMD_DEXT;
break;
}
}
/* Tunnel packet: record outer L2/L3 lengths for outer-IP checksum. */
if (ol_flags & PKT_TX_OUTER_IP_CKSUM) {
tx_offload_mask.outer_l2_len |= ~0;
tx_offload_mask.outer_l3_len |= ~0;
tx_offload_mask.l2_len |= ~0;
seqnum_seed |= tx_offload.outer_l3_len
<< IXGBE_ADVTXD_OUTER_IPLEN;
seqnum_seed |= tx_offload.l2_len
<< IXGBE_ADVTXD_TUNNEL_LEN;
}
#ifdef RTE_LIBRTE_SECURITY
if (ol_flags & PKT_TX_SEC_OFFLOAD) {
union ixgbe_crypto_tx_desc_md *md =
(union ixgbe_crypto_tx_desc_md *)mdata;
seqnum_seed |=
(IXGBE_ADVTXD_IPSEC_SA_INDEX_MASK & md->sa_idx);
type_tucmd_mlhl |= md->enc ?
(IXGBE_ADVTXD_TUCMD_IPSEC_TYPE_ESP |
IXGBE_ADVTXD_TUCMD_IPSEC_ENCRYPT_EN) : 0;
type_tucmd_mlhl |=
(md->pad_len & IXGBE_ADVTXD_IPSEC_ESP_LEN_MASK);
tx_offload_mask.sa_idx |= ~0;
tx_offload_mask.sec_pad_len |= ~0;
}
#endif
/* Cache the context so identical follow-up packets can skip the upload. */
txq->ctx_cache[ctx_idx].flags = ol_flags;
txq->ctx_cache[ctx_idx].tx_offload.data[0] =
tx_offload_mask.data[0] & tx_offload.data[0];
txq->ctx_cache[ctx_idx].tx_offload.data[1] =
tx_offload_mask.data[1] & tx_offload.data[1];
txq->ctx_cache[ctx_idx].tx_offload_mask = tx_offload_mask;
/* Write the descriptor fields (little-endian as required by the NIC). */
ctx_txd->type_tucmd_mlhl = rte_cpu_to_le_32(type_tucmd_mlhl);
vlan_macip_lens = tx_offload.l3_len;
if (ol_flags & PKT_TX_OUTER_IP_CKSUM)
vlan_macip_lens |= (tx_offload.outer_l2_len <<
IXGBE_ADVTXD_MACLEN_SHIFT);
else
vlan_macip_lens |= (tx_offload.l2_len <<
IXGBE_ADVTXD_MACLEN_SHIFT);
vlan_macip_lens |= ((uint32_t)tx_offload.vlan_tci << IXGBE_ADVTXD_VLAN_SHIFT);
ctx_txd->vlan_macip_lens = rte_cpu_to_le_32(vlan_macip_lens);
ctx_txd->mss_l4len_idx = rte_cpu_to_le_32(mss_l4len_idx);
ctx_txd->seqnum_seed = seqnum_seed;
}