hinic3 驱动发送流程
本文基于 hinic3 源码,系统化梳理 hinic3 网卡驱动的报文发送流程,涵盖主流程源码分析、关键步骤流程图,并结合 Linux 驱动架构还原真实的报文发送场景。
一、hinic3 TX 发送流程概述
hinic3 网卡驱动(Linux 下)负责将上层协议栈的报文(skb)发送到硬件,并最终通过网线发出。整个过程分为两大阶段:
- 主发送流程(TX Path):协议栈调用驱动的发送接口,驱动完成报文映射、WQE 填写、doorbell 敲门,通知硬件。
- 发送完成中断流程(TX ACK Path):硬件 DMA 完报文后中断驱动,驱动回收资源,通知协议栈可继续发送。
核心入口函数:
hinic3_xmit_frame
二、TX 主流程源码解读(发送报文到网卡)
1. 流程总览图
2. 核心源码流程详细拆解
Step 1. 入口函数注册
Linux 下,hinic3 驱动通过 net_device_ops 注册发送入口:
.ndo_start_xmit = hinic3_xmit_frame
Step 2. 入口函数 hinic3_xmit_frame
/*
 * Transmit entry point of the driver.
 *
 * Drops the skb when the carrier is down or when its queue mapping lies
 * outside the configured number of queue pairs; otherwise forwards it to
 * hinic3_send_one_skb() on the mapped TX queue. A dropped skb is freed here
 * and reported to the caller as NETDEV_TX_OK.
 */
netdev_tx_t hinic3_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
{
	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
	u16 q_id = skb_get_queue_mapping(skb);
	struct hinic3_txq *drop_txq;

	/* Link is down: free the packet and count the drop on the device. */
	if (unlikely(!netif_carrier_ok(netdev))) {
		dev_kfree_skb_any(skb);
		HINIC3_NIC_STATS_INC(nic_dev, tx_carrier_off_drop);
		return NETDEV_TX_OK;
	}

	/* Valid queue id: the actual send is done by hinic3_send_one_skb(). */
	if (q_id < nic_dev->q_params.num_qps)
		return hinic3_send_one_skb(skb, netdev, &nic_dev->txqs[q_id]);

	/* Out-of-range queue id: the drop is accounted against queue 0. */
	drop_txq = &nic_dev->txqs[0];
	HINIC3_NIC_STATS_INC(nic_dev, tx_invalid_qid);
	dev_kfree_skb_any(skb);

	u64_stats_update_begin(&drop_txq->txq_stats.syncp);
	drop_txq->txq_stats.dropped++;
	u64_stats_update_end(&drop_txq->txq_stats.syncp);

	return NETDEV_TX_OK;
}
Step 3. 报文发送主流程 hinic3_send_one_skb
主要步骤:
- 报文最小长度填充:小于 MIN_SKB_LEN 则 pad。
- 分片合法性检查:只统计长度非 0 的有效分片;长度为 0 的分片只允许出现在末尾,若其后又出现长度非 0 的分片则丢包。
- 判断 SQ WQE 资源是否足够:不够则 stop queue,并返回 NETDEV_TX_BUSY。
- 无状态卸载信息处理(TSO/VLAN/CSUM):填充 task、queue_info 字段。
- DMA 映射与 Buffer Description 填写。
- WQE Control Block 填写。
- Doorbell 通知硬件。
- 更新报文统计信息并返回 NETDEV_TX_OK。
伪代码流程如下:
// Simplified walkthrough of the main TX path for a single skb.
// NOTE: this is a sketch, not the complete function: the local variable
// declarations and the bodies of the error/drop branches are elided.
// Returns NETDEV_TX_BUSY when hinic3_maybe_stop_tx() returns non-zero,
// otherwise NETDEV_TX_OK after the doorbell has been written.
netdev_tx_t hinic3_send_one_skb(struct sk_buff *skb,
struct net_device *netdev,
struct hinic3_txq *txq)
{
// 1. Pad the skb when it is shorter than MIN_SKB_LEN
if (unlikely(skb->len < MIN_SKB_LEN)) {
if (skb_pad(skb, (int)(MIN_SKB_LEN - skb->len))) {
// (elided) count the error, then drop the packet
}
skb->len = MIN_SKB_LEN;
}
// 2. Count the non-empty fragments and check their ordering:
//    zero-length fragments are skipped, and a non-empty fragment that
//    follows a zero-length one is treated as an error
valid_nr_frags = 0;
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
if (!skb_frag_size(&skb_shinfo(skb)->frags[i])) {
find_zero_sge_len = true;
continue;
} else if (find_zero_sge_len) {
// (elided) count the error, then drop the packet
}
valid_nr_frags++;
}
// one SGE per valid fragment plus one more (tx_map_skb maps skb->data
// as the first entry)
num_sge = valid_nr_frags + 1;
// 3. Check that the SQ has enough WQE resources
// NOTE(review): the extra +1 presumably reserves room for a task
// section — confirm against hinic3_set_wqe_combo
wqebb_cnt = num_sge + 1;
if (unlikely(hinic3_maybe_stop_tx(txq, wqebb_cnt))) {
// not enough resources: the TX queue is stopped
return NETDEV_TX_BUSY;
}
// 4. VLAN handling: an skb without a VLAN tag gets the out-of-band
//    default VID when that capability is enabled and the VID is non-zero
if ((!skb_vlan_tag_present(skb)) &&
(nic_dev->nic_cap.outband_vlan_cfg_en == 1) &&
nic_dev->outband_cfg.outband_default_vid != 0) {
__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
(u16)nic_dev->outband_cfg.outband_default_vid);
}
// 5. Work out the offload requirements and fill task / queue_info
offload = hinic3_tx_offload(skb, &task, &queue_info, txq);
// 6. Set up the WQE combo structure
owner = hinic3_set_wqe_combo(txq, &wqe_combo, offload, num_sge, &pi);
// 7. DMA-map the skb and fill the buffer descriptors
err = tx_map_skb(nic_dev, skb, valid_nr_frags, txq, tx_info, &wqe_combo);
if (err) {
// (elided) roll back, then drop the packet
}
// 8. Collect packet statistics and fill the WQE control section
get_pkt_stats(tx_info, skb);
hinic3_prepare_sq_ctrl(&wqe_combo, queue_info, num_sge, owner);
// 9. Ring the doorbell to notify the hardware
hinic3_write_db(txq->sq, txq->cos, SQ_CFLAG_DP, hinic3_get_sq_local_pi(txq->sq));
return NETDEV_TX_OK;
}
三、【新增】关键函数源码讲解与流程图
1. hinic3_set_wqe_combo —— 生成WQE结构体
作用:
根据报文卸载类型与SGE个数,选择Compact/Extended WQE格式,分配并获取WQE各部分指针。
核心源码与注释:
/*
 * Obtain the WQEBBs that make up one send WQE and record the location of
 * each part (control block, task section, buffer descriptors) in @wqe_combo.
 *
 * @txq:       TX queue whose SQ supplies the WQEBBs
 * @wqe_combo: filled with the part pointers and the chosen WQE/task types
 * @offload:   non-zero when the packet needs a task section
 * @num_sge:   number of scatter-gather entries of the packet
 * @curr_pi:   filled in by hinic3_get_sq_one_wqebb() for the control WQEBB
 *             (presumably its producer index — confirm)
 *
 * Returns the result of hinic3_get_and_update_sq_owner() for the WQEBBs
 * consumed by this WQE.
 */
static u16 hinic3_set_wqe_combo(struct hinic3_txq *txq,
				struct hinic3_sq_wqe_combo *wqe_combo,
				u32 offload, u16 num_sge, u16 *curr_pi)
{
	void *sec2_addr = NULL;
	u16 first_cnt, scratch_pi;

	/* The control block always occupies the first WQEBB. */
	wqe_combo->ctrl_bd0 = hinic3_get_sq_one_wqebb(txq->sq, curr_pi);

	/* A single SGE without offload fits the compact format: one WQEBB. */
	if (num_sge == 1 && !offload) {
		wqe_combo->wqe_type = SQ_WQE_COMPACT_TYPE;
		return hinic3_get_and_update_sq_owner(txq->sq, *curr_pi, 1);
	}

	/* Everything else uses the extended format. */
	wqe_combo->wqe_type = SQ_WQE_EXTENDED_TYPE;

	/* Only packets with offload take a WQEBB for the task section. */
	if (!offload) {
		wqe_combo->task_type = SQ_WQE_TASKSECT_46BITS;
	} else {
		wqe_combo->task = hinic3_get_sq_one_wqebb(txq->sq, &scratch_pi);
		wqe_combo->task_type = SQ_WQE_TASKSECT_16BYTES;
	}

	/*
	 * The first SGE is carried by the control block, so only
	 * (num_sge - 1) further WQEBBs are requested for buffer descriptors.
	 */
	if (num_sge > 1) {
		wqe_combo->bds_head =
			hinic3_get_sq_multi_wqebbs(txq->sq, num_sge - 1,
						   &scratch_pi, &sec2_addr,
						   &first_cnt);
		wqe_combo->bds_sec2 = sec2_addr;
		wqe_combo->first_bds_num = first_cnt;
	}

	/* Account for num_sge WQEBBs plus one more when a task section exists. */
	return hinic3_get_and_update_sq_owner(txq->sq, *curr_pi,
					      num_sge + (u16)!!offload);
}
流程图:
2. hinic3_tx_offload —— 解析并设置报文卸载信息
作用:
分析skb报文是否需要TSO、CSUM、VLAN等硬件卸载,将信息填入task和queue_info。
核心源码与注释:
/* Modulus applied to the queue id passed to hinic3_set_vlan_tx_offload(). */
#define VLAN_INSERT_MODE_MAX 5

/*
 * Determine which offloads (TSO, checksum, VLAN) an skb needs and fill
 * @task and @queue_info accordingly.
 *
 * Returns a bit mask of TX_OFFLOAD_* flags, or TX_OFFLOAD_INVALID when
 * hinic3_tso() reports an error or the payload offset stored in
 * @queue_info exceeds MAX_PAYLOAD_OFFSET.
 */
static u32 hinic3_tx_offload(struct sk_buff *skb, struct hinic3_sq_task *task,
			     u32 *queue_info, struct hinic3_txq *txq)
{
	u32 offload = 0;
	int rc;

	/* Start from a clean task section. */
	task->pkt_info0 = 0;
	task->ip_identify = 0;
	task->pkt_info2 = 0;
	task->vlan_offload = 0;

	/* TSO is tried first; a negative result invalidates the packet. */
	rc = hinic3_tso(task, queue_info, skb);
	if (rc < 0)
		return TX_OFFLOAD_INVALID;

	if (rc > 0) {
		offload |= TX_OFFLOAD_TSO;
	} else if (hinic3_tx_csum(txq, task, skb)) {
		/* Checksum offload is only considered when TSO is not used. */
		offload |= TX_OFFLOAD_CSUM;
	}

	/* VLAN tag present on the skb: request hardware insertion. */
	if (unlikely(skb_vlan_tag_present(skb))) {
		hinic3_set_vlan_tx_offload(task, skb_vlan_tag_get(skb),
					   txq->q_id % VLAN_INSERT_MODE_MAX);
		offload |= TX_OFFLOAD_VLAN;
	}

	/* Reject packets whose payload offset is out of range. */
	if (unlikely(SQ_CTRL_QUEUE_INFO_GET(*queue_info, PLDOFF) >
		     MAX_PAYLOAD_OFFSET))
		return TX_OFFLOAD_INVALID;

	return offload;
}
流程图:
3. tx_map_skb —— 报文DMA映射与Buffer描述符填写
作用:
将skb各分片映射成物理DMA地址,并写入WQE的Buffer Description区,供硬件DMA拉取数据。
核心源码与注释:
/*
 * DMA-map the linear data and the first @valid_nr_frags fragments of @skb
 * and write the resulting addresses/lengths into the WQE.
 *
 * Slot 0 of tx_info->dma_info describes the linear data and is written into
 * the control block; fragment k is stored in slot k + 1 and written into the
 * buffer descriptors.
 *
 * Returns 0 on success. On a mapping failure every mapping made so far is
 * undone and -EFAULT is returned.
 */
static int tx_map_skb(struct hinic3_nic_dev *nic_dev, struct sk_buff *skb,
		      u16 valid_nr_frags, struct hinic3_txq *txq,
		      struct hinic3_tx_info *tx_info,
		      struct hinic3_sq_wqe_combo *wqe_combo)
{
	struct hinic3_sq_wqe_desc *ctrl = wqe_combo->ctrl_bd0;
	struct hinic3_sq_bufdesc *bd = wqe_combo->bds_head;
	struct hinic3_dma_info *dma_info = tx_info->dma_info;
	struct pci_dev *pdev = nic_dev->pdev;
	u32 frag_idx, slot;

	/* Linear part of the skb goes into slot 0. */
	dma_info[0].dma = dma_map_single(&pdev->dev, skb->data,
					 skb_headlen(skb), DMA_TO_DEVICE);
	if (dma_mapping_error(&pdev->dev, dma_info[0].dma)) {
		TXQ_STATS_INC(txq, map_frag_err);
		return -EFAULT;
	}
	dma_info[0].len = skb_headlen(skb);

	/* Address and length of the linear part live in the control block. */
	ctrl->hi_addr = hinic3_hw_be32(upper_32_bits(dma_info[0].dma));
	ctrl->lo_addr = hinic3_hw_be32(lower_32_bits(dma_info[0].dma));
	ctrl->ctrl_len = dma_info[0].len;

	for (frag_idx = 0; frag_idx < valid_nr_frags; frag_idx++) {
		skb_frag_t *frag = &(skb_shinfo(skb)->frags[frag_idx]);

		/* After first_bds_num descriptors, continue in the second part. */
		if (unlikely(frag_idx == wqe_combo->first_bds_num))
			bd = wqe_combo->bds_sec2;

		slot = frag_idx + 1;
		dma_info[slot].dma = skb_frag_dma_map(&pdev->dev, frag, 0,
						      skb_frag_size(frag),
						      DMA_TO_DEVICE);
		if (dma_mapping_error(&pdev->dev, dma_info[slot].dma)) {
			TXQ_STATS_INC(txq, map_frag_err);
			goto unmap_all;
		}
		dma_info[slot].len = skb_frag_size(frag);

		hinic3_set_buf_desc(bd, dma_info[slot].dma,
				    dma_info[slot].len);
		bd++;
	}

	return 0;

unmap_all:
	/*
	 * Fragments 0 .. frag_idx - 1 were mapped successfully; they occupy
	 * slots 1 .. frag_idx. Release them, then the linear part.
	 */
	for (slot = 1; slot <= frag_idx; slot++)
		dma_unmap_page(&pdev->dev, dma_info[slot].dma,
			       dma_info[slot].len, DMA_TO_DEVICE);

	dma_unmap_single(&pdev->dev, dma_info[0].dma, dma_info[0].len,
			 DMA_TO_DEVICE);
	return -EFAULT;
}
流程图:
4. hinic3_prepare_sq_ctrl —— WQE控制块填充
作用:
根据WQE类型、SGE数量、卸载功能等,设置WQE控制块相关字段,确保硬件正确解析。
核心源码与注释:
/*
 * Finish the control block of a send WQE.
 *
 * ORs the format, type and owner bits into ctrl_len (which already holds the
 * length of the first buffer) and converts it with hinic3_hw_be32(). For the
 * extended format it additionally stores the descriptor count, the task
 * section length and a fixed-up queue_info word.
 *
 * @wqe_combo:  WQE being prepared; its wqe_type selects the layout
 * @queue_info: queue_info value produced during offload parsing
 * @nr_descs:   number of buffer descriptors (extended format only)
 * @owner:      owner value to place in the control block
 */
static void hinic3_prepare_sq_ctrl(struct hinic3_sq_wqe_combo *wqe_combo,
				   u32 queue_info, int nr_descs, u16 owner)
{
	struct hinic3_sq_wqe_desc *desc = wqe_combo->ctrl_bd0;

	if (wqe_combo->wqe_type != SQ_WQE_COMPACT_TYPE) {
		/* Extended WQE: more control fields plus queue_info. */
		desc->ctrl_len |=
			SQ_CTRL_SET(nr_descs, BUFDESC_NUM) |
			SQ_CTRL_SET(wqe_combo->task_type, TASKSECT_LEN) |
			SQ_CTRL_SET(SQ_NORMAL_WQE, DATA_FORMAT) |
			SQ_CTRL_SET(wqe_combo->wqe_type, EXTENDED) |
			SQ_CTRL_SET(owner, OWNER);
		desc->ctrl_len = hinic3_hw_be32(desc->ctrl_len);

		desc->queue_info = queue_info;
		desc->queue_info |= SQ_CTRL_QUEUE_INFO_SET(1U, UC);

		/*
		 * MSS fix-up: an unset MSS becomes TX_MSS_DEFAULT, a value
		 * below TX_MSS_MIN is raised to TX_MSS_MIN.
		 */
		if (SQ_CTRL_QUEUE_INFO_GET(desc->queue_info, MSS) == 0) {
			desc->queue_info |=
				SQ_CTRL_QUEUE_INFO_SET(TX_MSS_DEFAULT, MSS);
		} else if (SQ_CTRL_QUEUE_INFO_GET(desc->queue_info, MSS) <
			   TX_MSS_MIN) {
			desc->queue_info =
				SQ_CTRL_QUEUE_INFO_CLEAR(desc->queue_info, MSS);
			desc->queue_info |=
				SQ_CTRL_QUEUE_INFO_SET(TX_MSS_MIN, MSS);
		}

		desc->queue_info = hinic3_hw_be32(desc->queue_info);
	} else {
		/* Compact WQE: only format, type and owner; no queue_info. */
		desc->ctrl_len |=
			SQ_CTRL_SET(SQ_NORMAL_WQE, DATA_FORMAT) |
			SQ_CTRL_SET(wqe_combo->wqe_type, EXTENDED) |
			SQ_CTRL_SET(owner, OWNER);
		desc->ctrl_len = hinic3_hw_be32(desc->ctrl_len);
		desc->queue_info = 0;
	}
}
流程图:
四、TX 中断流程源码解读(报文发送完成)
1. 流程总览图
2. 核心源码流程详细拆解
Step 1. 入口函数 hinic3_tx_poll
// TX completion handler for one queue (abridged: the `...` lines stand for
// code omitted from this excerpt, including the local declarations).
// Walks the completed WQEs between the software and hardware consumer
// indexes, releases their resources, wakes the queue when it was stopped,
// and returns the number of WQEs reclaimed (at most `budget`).
int hinic3_tx_poll(struct hinic3_txq *txq, int budget)
{
struct hinic3_nic_dev *nic_dev = netdev_priv(txq->netdev);
...
// 1. Read the hardware CI and the software CI
hw_ci = hinic3_get_sq_hw_ci(txq->sq);
// NOTE(review): read barrier placed right after the hardware CI read,
// presumably so later reads are not ordered before it — confirm
dma_rmb();
sw_ci = hinic3_get_sq_local_ci(txq->sq);
// 2. Reclaim every packet whose WQEBBs have all been completed
do {
tx_info = &txq->tx_info[sw_ci];
// stop when nothing is pending, or when fewer WQEBBs have completed
// than this packet occupies
if (hw_ci == sw_ci ||
((hw_ci - sw_ci) & txq->q_mask) < tx_info->wqebb_cnt)
break;
sw_ci = (sw_ci + tx_info->wqebb_cnt) & (u16)txq->q_mask;
prefetch(&txq->tx_info[sw_ci]);
wqebb_cnt += tx_info->wqebb_cnt;
tx_bytes += tx_info->num_bytes;
nr_pkts += tx_info->num_pkts;
pkts++;
// 3. Release the DMA mappings and the skb
tx_free_skb(nic_dev, tx_info);
} while (likely(pkts < budget));
// 4. Advance the software CI by the number of WQEBBs reclaimed
hinic3_update_sq_local_ci(txq->sq, wqebb_cnt);
// 5. Wake the queue if it is stopped, the SQ has at least one free
//    WQEBB and the interface is up
if (unlikely(__netif_subqueue_stopped(nic_dev->netdev, q_id) &&
hinic3_get_sq_free_wqebbs(txq->sq) >= 1 &&
test_bit(HINIC3_INTF_UP, &nic_dev->flags))) {
...
netif_wake_subqueue(nic_dev->netdev, q_id);
wake++;
...
}
// 6. Update the queue statistics
u64_stats_update_begin(&txq->txq_stats.syncp);
txq->txq_stats.bytes += tx_bytes;
txq->txq_stats.packets += nr_pkts;
txq->txq_stats.wake += wake;
u64_stats_update_end(&txq->txq_stats.syncp);
return pkts;
}
五、流程关键点源码对照表
步骤 | 关键函数/宏 | 源码位置 | 说明 |
---|---|---|---|
设备可用性检查 | netif_carrier_ok | hinic3_xmit_frame | 设备Link Down直接丢包 |
队列合法性检查 | skb_get_queue_mapping | hinic3_xmit_frame | 队列ID越界丢包 |
WQE资源判断 | hinic3_maybe_stop_tx | hinic3_send_one_skb | WQE资源不足stop队列 |
卸载信息处理 | hinic3_tx_offload | hinic3_send_one_skb | TSO/VLAN/CSUM等校验填充 |
DMA映射 | tx_map_skb | hinic3_send_one_skb | 映射DMA和Buffer描述符 |
WQE Combo准备 | hinic3_set_wqe_combo | hinic3_send_one_skb | 选择WQE组织方式,分配资源 |
WQE Ctrl填充 | hinic3_prepare_sq_ctrl | hinic3_send_one_skb | 填充WQE控制块 |
Doorbell敲门 | hinic3_write_db | hinic3_send_one_skb | 通知硬件处理 |
发送完成回收 | tx_free_skb | hinic3_tx_poll | 解DMA映射并释放skb |
队列唤醒 | netif_wake_subqueue | hinic3_tx_poll | 资源充足唤醒协议栈队列 |
统计更新 | u64_stats_update_begin/end | 多处 | 完善统计信息 |
如有收获,欢迎点赞、收藏、转发!