ixgbe网卡驱动（三）

最新推荐文章于 2025-03-01 20:08:00 发布

hz5034

最新推荐文章于 2025-03-01 20:08:00 发布

阅读量4.1k

点赞数 2

分类专栏：网卡驱动

本文链接：https://blog.csdn.net/hz5034/article/details/79794478

版权

网卡驱动专栏收录该内容

6 篇文章

订阅专栏

本文详细解析了Intel ixgbe网卡驱动的工作原理，包括硬中断处理流程、软中断调度机制以及发送和接收队列的数据处理过程。重点介绍了硬中断处理函数ixgbe_msix_clean_many如何屏蔽本中断向量并通过napi_schedule触发软中断，以及软中断中ixgbe_clean_rxtx_many如何清理发送和接收队列，从而实现高效的数据包处理。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

ixgbe_ring结构图

这里写图片描述

中断上半部与下半部

在硬中断函数中抛出软中断：
ixgbe_msix_clean_many()/ixgbe_intr() -> napi_schedule() -> __napi_schedule() -> __raise_softirq_irqoff()

在软中断函数中调用poll函数：
net_rx_action() -> ixgbe_clean_rxtx_many()/ixgbe_poll() -> ixgbe_clean_tx_irq() + ixgbe_clean_rx_irq()

ixgbe_msix_clean_many()

/*
 * MSI-X interrupt handler for a queue vector that may serve several rings.
 *
 * @irq:  interrupt number (not used in the body)
 * @data: opaque handler argument, used as a struct ixgbe_q_vector pointer
 *
 * Zeroes the total_bytes/total_packets counters of every Tx and Rx ring
 * mapped to the vector, masks this vector's queue interrupt and schedules
 * the vector's NAPI context.  Always returns IRQ_HANDLED.
 */
static irqreturn_t ixgbe_msix_clean_many(int irq, void *data)
{
	struct ixgbe_q_vector *q_vector = data;
	struct ixgbe_adapter *adapter = q_vector->adapter;
	struct ixgbe_ring *cur;
	int bit;
	int n;

	/* No ring of either direction is attached: nothing to schedule. */
	if (!(q_vector->txr_count || q_vector->rxr_count))
		return IRQ_HANDLED;

	/* Walk the set bits of txr_idx, one per Tx ring of this vector. */
	n = 0;
	bit = find_first_bit(q_vector->txr_idx, adapter->num_tx_queues);
	while (n < q_vector->txr_count) {
		cur = &adapter->tx_ring[bit];
		/* restart the per-ring byte/packet accounting */
		cur->total_bytes = 0;
		cur->total_packets = 0;
		bit = find_next_bit(q_vector->txr_idx, adapter->num_tx_queues,
		                    bit + 1);
		n++;
	}

	/* Same walk over rxr_idx for the Rx rings of this vector. */
	n = 0;
	bit = find_first_bit(q_vector->rxr_idx, adapter->num_rx_queues);
	while (n < q_vector->rxr_count) {
		cur = &adapter->rx_ring[bit];
		/* restart the per-ring byte/packet accounting */
		cur->total_bytes = 0;
		cur->total_packets = 0;
		bit = find_next_bit(q_vector->rxr_idx, adapter->num_rx_queues,
		                    bit + 1);
		n++;
	}

	/* disable interrupts on this vector only */
	ixgbe_irq_disable_queues(adapter, ((u64)1 << q_vector->v_idx));
	/* hand the remaining work over to NAPI */
	napi_schedule(&q_vector->napi);

	return IRQ_HANDLED;
}

napi_schedule()

/*
 * Schedule NAPI context @n for polling.
 *
 * __napi_schedule() is only called when napi_schedule_prep() returns
 * non-zero; otherwise this is a no-op.
 */
static inline void napi_schedule(struct napi_struct *n)
{
	if (!napi_schedule_prep(n))
		return;

	__napi_schedule(n);
}

/*
 * Try to claim NAPI context @n for scheduling.
 *
 * Returns 0 when a disable is pending or when NAPI_STATE_SCHED was already
 * set; returns 1 when this call is the one that set NAPI_STATE_SCHED.
 */
static inline int napi_schedule_prep(struct napi_struct *n)
{
	/* A pending disable wins: do not touch the SCHED bit at all. */
	if (napi_disable_pending(n))
		return 0;

	/* Atomically set SCHED; succeed only if it was clear before. */
	return !test_and_set_bit(NAPI_STATE_SCHED, &n->state);
}

/* Report whether NAPI_STATE_DISABLE is set in @n's state word. */
static inline int napi_disable_pending(struct napi_struct *n)
{
	const int disable_requested = test_bit(NAPI_STATE_DISABLE, &n->state);

	return disable_requested;
}

/*
 * Queue NAPI context @n on the current CPU and raise NET_RX_SOFTIRQ.
 *
 * The list insertion and the softirq raise both happen with local
 * interrupts disabled; the previous interrupt state is restored on exit.
 */
void __napi_schedule(struct napi_struct *n)
{
	struct list_head *cpu_poll_list;
	unsigned long irq_state;

	local_irq_save(irq_state);

	/* Append to the tail of this CPU's softnet_data.poll_list. */
	cpu_poll_list = &__get_cpu_var(softnet_data).poll_list;
	list_add_tail(&n->poll_list, cpu_poll_list);

	/* Mark the receive softirq as pending. */
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);

	local_irq_restore(irq_state);
}

net_rx_action()

/*
 * Softirq action that polls the NAPI contexts queued on the current CPU.
 *
 * Repeatedly takes the head of this CPU's softnet_data.poll_list and calls
 * its ->poll() callback with the context's weight, until the list is empty,
 * the overall budget is used up, or the time limit has passed.  On the
 * budget/time exit the time_squeeze counter is bumped and NET_RX_SOFTIRQ is
 * raised again.
 *
 * @h: softirq action descriptor (not used in the body)
 */
static void net_rx_action(struct softirq_action *h)
{
	struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
	unsigned long time_limit = jiffies + 2; // deadline is 2 jiffies from now (jiffies, not milliseconds)
	int budget = netdev_budget; // overall work budget for this run; netdev_budget is defined elsewhere
	void *have;

	local_irq_disable();

	while (!list_empty(list)) { // walk the current CPU's softnet_data poll_list
		struct napi_struct *n;
		int work, weight;

		/* If the softirq window is exhausted then punt.
		 * Allow this to run for 2 jiffies, which will allow
		 * an average latency of 1.5/HZ.
		 */
		// stop when the budget is used up or the time limit has passed
		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
			goto softnet_break;

		local_irq_enable();

		/* Even though interrupts have been re-enabled, this
		 * access is safe because interrupts can only add new
		 * entries to the tail of this list, and only ->poll()
		 * calls can remove this head entry from the list.
		 */
		n = list_entry(list->next, struct napi_struct, poll_list); // napi_struct at the head of the list

		have = netpoll_poll_lock(n);

		weight = n->weight; // upper bound on the work one ->poll() call may report

		/* This NAPI_STATE_SCHED test is for avoiding a race
		 * with netpoll's poll_napi().  Only the entity which
		 * obtains the lock and sees NAPI_STATE_SCHED set will
		 * actually make the ->poll() call.  Therefore we avoid
		 * accidentally calling ->poll() when NAPI is not scheduled.
		 */
		work = 0;
		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
			work = n->poll(n, weight); // invoke the driver's poll callback
			trace_napi_poll(n);
		}

		WARN_ON_ONCE(work > weight);

		budget -= work;

		local_irq_disable();

		/* Drivers must not modify the NAPI state if they
		 * consume the entire weight.  In such cases this code
		 * still "owns" the NAPI instance and therefore can
		 * move the instance around on the list at-will.
		 */
		if (unlikely(work == weight)) {
			if (unlikely(napi_disable_pending(n))) {
				local_irq_enable();
				napi_complete(n);
				local_irq_disable();
			} else
				list_move_tail(&n->poll_list, list); // rotate this napi_struct to the tail of the list
		}

		netpoll_poll_unlock(have);
	}
out:
	local_irq_enable();

#ifdef CONFIG_NET_DMA
	/*
	 * There may not be any more sk_buffs coming right now, so push
	 * any pending DMA copies to hardware
	 */
	dma_issue_pending_all();
#endif

	return;

softnet_break:
	__get_cpu_var(netdev_rx_stat).time_squeeze++; // count this early exit (budget or time limit hit), not the number of softirqs
	__raise_softirq_irqoff(NET_RX_SOFTIRQ); // raise NET_RX_SOFTIRQ again; presumably so the remaining entries are polled in a later run
	goto out;
}

ixgbe_clean_rxtx_many()

static int ixgbe_clean_rxtx_many(struct napi_struct *napi, int budget)
{
	struct ixgbe_q_vector *q_vector =
	                       container_of(napi, struct ixgbe_q_vector, napi);
	struct ixgbe_adapter *adapter = q_vector->adapter;
	struct ixgbe_ring *ring = NULL;
	int work_done = 0, i;
	long r_idx;
	bool tx_clean_complete = true;

	r_idx = find_first_bit(q_vector->txr_idx, adapter->num_tx_queues);
	for (i = 0; i < q_vector->txr_count; i++) {
	    // 遍历中断对应的发送队列（一般一个中断只对应一个发送队列）
		ring = &(adapter->tx_ring[r_idx]);
#ifdef CONFIG_IXGBE_DCA
		if (adapter->flags & IXGBE_FLAG_DCA_ENABLED)
			ixgbe_update_tx_dca(adapter, ring);
#endif
        // 处理发送队列
		tx_clean_complete &= ixgbe_clean_tx_irq(q_vector, ring);
		r_idx = find_next_bit(q_vector->txr_idx, adapter->num_tx_queues,
		                      r_idx + 1);
	}

	/* attempt to distribute budget to each queue fairly, but don't allow
	 * the budget to go below 1 because we'll exit polling */
	budget /= (q_vector->rxr_count ?: 1); // x ? : y = x ? x : y
	budget = max(budget, 1);
	r_idx = find_first_bit(q_vector->rxr_idx, adapter->num_rx_queues);
	for (i = 0; i < q_vector->rxr_count; i++) {
	    // 遍历中断对应的接收队列（一般一个中断只对应一个接收队列）
		ring = &(adapter->rx_ring[r_idx]);
#ifdef CONFIG_IXGBE_DCA
		if (adapter->flags & IXGBE_FLAG_DCA_ENABLED)
			ixgbe_update_rx_dca(adapter, ring);
#endif
        // 处理接收队列
		ixgbe_clean_rx_irq(q_vector, ring, &work_done, budget);
		r_idx = find_next_bit(q_vector->rxr_idx, adapter->num_rx_queues,
		                      r_idx + 1);
	}

	r_idx = find_first_bit(q_vector->rxr_idx, adapter->num_rx_queues);
	ring = &(adapter->rx_ring[r_idx]);
	/* If all Rx work done, exit the polling mode */
	if (work_done < budget) {
		napi_complete(napi);
		if (adapter->rx_itr_setting & 1)
			ixgbe_set_itr_msix(q_vector);
		if (!test_bit(__IXGBE_DOWN, &adapter->state))
			ixgbe_irq_enable_queues(adapter,
			                        ((u64)1 << q_vector->v_idx));
		return 0;
	}

	return work_done;
}

ixgbe_clean_tx_irq()

static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector *q_vector,
                               struct ixgbe_ring *tx_ring)
{
   struct ixgbe_adapter *adapter = q_vector->adapter;
   struct net_device *netdev = adapter->netdev;
   union ixgbe_adv_tx_desc *tx_desc, *eop_desc;
   struct ixgbe_tx_buffer *tx_buffer_info;
   unsigned int i, eop, count = 0;
   unsigned int total_bytes = 0, total_packets = 0;
 
   i = tx_ring->next_to_clean; // 从next_to_clean开始
   eop = tx_ring->tx_buffer_info[i].next_to_watch;
   eop_desc = IXGBE_TX_DESC_ADV(*tx_ring, eop);
 
   while ((eop_desc->wb.status & cpu_to_le32(IXGBE_TXD_STAT_DD)) &&
          (count < tx_ring->work_limit)) {
      bool cleaned = false;
      for ( ; !cleaned; count++) {
         struct sk_buff *skb;
         tx_desc = IXGBE_TX_DESC_ADV(*tx_ring, i); // 得到第i个ixgbe_adv_tx_desc
         tx_buffer_info = &tx_ring->tx_buffer_info[i]; // 得到第i个ixgbe_rx_buffer
         cleaned = (i == eop);
         skb = tx_buffer_info->skb; // 得到ixgbe_rx_buffer的skb
 
         if (cleaned && skb) {
            unsigned int segs, bytecount;
            unsigned int hlen = skb_headlen(skb);
 
            /* gso_segs is currently only valid for tcp */
            segs = skb_shinfo(skb)->gso_segs ?: 1;
#ifdef IXGBE_FCOE
            /* adjust for FCoE Sequence Offload */
            if ((adapter->flags & IXGBE_FLAG_FCOE_ENABLED)
                && (skb->protocol == htons(ETH_P_FCOE)) &&
                skb_is_gso(skb)) {
               hlen = skb_transport_offset(skb) +
                  sizeof(struct fc_frame_header) +
                  sizeof(struct fcoe_crc_eof);
               segs = DIV_ROUND_UP(skb->len - hlen,
                  skb_shinfo(skb)->gso_size);
            }
#endif /* IXGBE_FCOE */
            /* multiply data chunks by size of headers */
            bytecount = ((segs - 1) * hlen) + skb->len;
            total_packets += segs;
            total_bytes += bytecount;
         }

         // 取消流式DMA映射，释放skb和数据缓存区
         ixgbe_unmap_and_free_tx_resource(adapter,
                                          tx_buffer_info);
 
         tx_desc->wb.status = 0; // 设置status_error为0
 
         i++; // 更新i
         if (i == tx_ring->count)
            i = 0;
      }
 
      eop = tx_ring->tx_buffer_info[i].next_to_watch;
      eop_desc = IXGBE_TX_DESC_ADV(*tx_ring, eop);
   }
 
   tx_ring->next_to_clean = i; // 更新next_to_clean
 
#define TX_WAKE_THRESHOLD (DESC_NEEDED * 2)
   if (unlikely(count && netif_carrier_ok(netdev) &&
                (IXGBE_DESC_UNUSED(tx_ring) >= TX_WAKE_THRESHOLD))) {
      /* Make sure that anybody stopping the queue after this
       * sees the new next_to_clean.
       */
      smp_mb();
      if (__netif_subqueue_stopped(netdev, tx_ring->queue_index) &&
          !test_bit(__IXGBE_DOWN, &adapter->state)) {
         netif_wake_subqueue(netdev, tx_ring->queue_index);
         ++adapter->restart_queue;
      }
   }
 
   if (adapter->detect_tx_hung) {
      if (ixgbe_check_tx_hang(adapter, tx_ring, i)) {
         /* schedule immediate reset if we believe we hung */
         DPRINTK(PROBE, INFO,
                 "tx hang %d detected, resetting adapter\n",
                 adapter->tx_timeout_count + 1);
         ixgbe_tx_timeout(adapter->netdev);
      }
   }
 
   /* re-arm the interrupt */
   if (count >= tx_ring->work_limit)
      ixgbe_irq_rearm_queues(adapter, ((u64)1 << q_vector->v_idx));
 
   tx_ring->total_bytes += total_bytes;
   tx_ring->total_packets += total_packets;
   tx_ring->stats.packets += total_packets;
   tx_ring->stats.bytes += total_bytes;
   adapter->net_stats.tx_bytes += total_bytes;
   adapter->net_stats.tx_packets += total_packets;
   return (count < tx_ring->work_limit);
}

ixgbe_clean_rx_irq()

ixgbe_clean_rx_irq() -> eth_type_trans() + ixgbe_receive_skb()
ixgbe_receive_skb() -> napi_gro_receive() -> napi_skb_finish() -> netif_receive_skb()

/*
 * Process received descriptors of one Rx ring.
 *
 * @q_vector:   queue vector that owns @rx_ring
 * @rx_ring:    Rx ring to clean
 * @work_done:  in/out counter of descriptors processed; incremented once per
 *              descriptor handled here
 * @work_to_do: processing stops once *work_done reaches this value
 *
 * Starting at next_to_clean, handles every descriptor whose DD status bit is
 * set: unmaps its buffers, chains non-EOP descriptors to the next buffer,
 * drops frames flagged with a frame error, and passes completed packets to
 * ixgbe_receive_skb().  Rx buffers are replenished in batches during the
 * loop and once more afterwards.
 *
 * Returns true if at least one descriptor was processed.
 */
static bool ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
                               struct ixgbe_ring *rx_ring,
                               int *work_done, int work_to_do)
{
   struct ixgbe_adapter *adapter = q_vector->adapter;
   struct pci_dev *pdev = adapter->pdev;
   union ixgbe_adv_rx_desc *rx_desc, *next_rxd;
   struct ixgbe_rx_buffer *rx_buffer_info, *next_buffer;
   struct sk_buff *skb;
   unsigned int i, rsc_count = 0;
   u32 len, staterr;
   u16 hdr_info;
   bool cleaned = false;
   int cleaned_count = 0;
   unsigned int total_rx_bytes = 0, total_rx_packets = 0;
#ifdef IXGBE_FCOE
   int ddp_bytes = 0;
#endif /* IXGBE_FCOE */
 
   i = rx_ring->next_to_clean;
   rx_desc = IXGBE_RX_DESC_ADV(*rx_ring, i); // Rx descriptor at index i
   staterr = le32_to_cpu(rx_desc->wb.upper.status_error); // its write-back status/error word
   rx_buffer_info = &rx_ring->rx_buffer_info[i]; // bookkeeping entry at index i
 
   while (staterr & IXGBE_RXD_STAT_DD) { // loop while the DD (Descriptor Done) bit is set
      u32 upper_len = 0;
      // stop once the caller's work limit has been reached
      if (*work_done >= work_to_do)
         break;
      (*work_done)++;
 
      if (rx_ring->flags & IXGBE_RING_RX_PS_ENABLED) {
         // PS mode: len is the header-buffer length (capped at
         // IXGBE_RX_HDR_SIZE), upper_len is the descriptor's length field
         hdr_info = le16_to_cpu(ixgbe_get_hdr_info(rx_desc));
         len = (hdr_info & IXGBE_RXDADV_HDRBUFLEN_MASK) >>
                IXGBE_RXDADV_HDRBUFLEN_SHIFT;
         if (hdr_info & IXGBE_RXDADV_SPH)
            adapter->rx_hdr_split++;
         if (len > IXGBE_RX_HDR_SIZE)
            len = IXGBE_RX_HDR_SIZE;
         upper_len = le16_to_cpu(rx_desc->wb.upper.length);
      } else {
         len = le16_to_cpu(rx_desc->wb.upper.length);
      }
 
      cleaned = true;
      skb = rx_buffer_info->skb; // take the skb attached to this buffer entry
      prefetch(skb->data - NET_IP_ALIGN);
      rx_buffer_info->skb = NULL; // detach the skb from the buffer entry
 
      if (rx_buffer_info->dma) {
         pci_unmap_single(pdev, rx_buffer_info->dma, // tear down the streaming DMA mapping
                          rx_ring->rx_buf_len,
                          PCI_DMA_FROMDEVICE);
         rx_buffer_info->dma = 0;
         skb_put(skb, len);
      }
 
      if (upper_len) {
         // attach the page buffer to the skb as a fragment of upper_len bytes
         pci_unmap_page(pdev, rx_buffer_info->page_dma,
                        PAGE_SIZE / 2, PCI_DMA_FROMDEVICE);
         rx_buffer_info->page_dma = 0;
         skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags,
                            rx_buffer_info->page,
                            rx_buffer_info->page_offset,
                            upper_len);
 
         // keep the page in the buffer entry (taking an extra reference)
         // only when the buffer length is at most half a page and
         // page_count() is 1; otherwise forget it
         if ((rx_ring->rx_buf_len > (PAGE_SIZE / 2)) ||
             (page_count(rx_buffer_info->page) != 1))
            rx_buffer_info->page = NULL;
         else
            get_page(rx_buffer_info->page);
 
         skb->len += upper_len;
         skb->data_len += upper_len;
         skb->truesize += upper_len;
      }
 
      i++; // advance the index, wrapping at the end of the ring
      if (i == rx_ring->count)
         i = 0;
 
      next_rxd = IXGBE_RX_DESC_ADV(*rx_ring, i);
      prefetch(next_rxd);
      cleaned_count++; // one more descriptor processed since the last buffer refill
 
      if (adapter->flags2 & IXGBE_FLAG2_RSC_CAPABLE)
         rsc_count = ixgbe_get_rsc_count(rx_desc);
 
      if (rsc_count) {
         // with a non-zero RSC count the next buffer index comes from the
         // NEXTP field of staterr rather than from i
         u32 nextp = (staterr & IXGBE_RXDADV_NEXTP_MASK) >>
                 IXGBE_RXDADV_NEXTP_SHIFT;
         next_buffer = &rx_ring->rx_buffer_info[nextp];
         rx_ring->rsc_count += (rsc_count - 1);
      } else {
         next_buffer = &rx_ring->rx_buffer_info[i];
      }
 
      if (staterr & IXGBE_RXD_STAT_EOP) {
         // last descriptor of the packet
         if (skb->prev)
            skb = ixgbe_transform_rsc_queue(skb);
         rx_ring->stats.packets++;
         rx_ring->stats.bytes += skb->len;
      } else {
         // packet continues in a later descriptor
         if (rx_ring->flags & IXGBE_RING_RX_PS_ENABLED) {
            // swap: the current entry takes over the next entry's skb/dma,
            // the next entry carries the partially filled skb forward
            rx_buffer_info->skb = next_buffer->skb;
            rx_buffer_info->dma = next_buffer->dma;
            next_buffer->skb = skb;
            next_buffer->dma = 0;
         } else {
            // link this skb to the next buffer's skb via next/prev
            skb->next = next_buffer->skb;
            skb->next->prev = skb;
         }
         adapter->non_eop_descs++;
         goto next_desc;
      }
 
      if (staterr & IXGBE_RXDADV_ERR_FRAME_ERR_MASK) {
         // frame error reported by hardware: free the skb and move on
         dev_kfree_skb_irq(skb);
         goto next_desc;
      }
 
      ixgbe_rx_checksum(adapter, rx_desc, skb);
 
      /* probably a little skewed due to removing CRC */
      total_rx_bytes += skb->len;
      total_rx_packets++;
 
      skb->protocol = eth_type_trans(skb, adapter->netdev); // store the protocol returned by eth_type_trans()
#ifdef IXGBE_FCOE
      /* if ddp, not passing to ULD unless for FCP_RSP or error */
      if (adapter->flags & IXGBE_FLAG_FCOE_ENABLED) {
         ddp_bytes = ixgbe_fcoe_ddp(adapter, rx_desc, skb);
         if (!ddp_bytes)
            goto next_desc;
      }
#endif /* IXGBE_FCOE */
      /* Hand the skb to ixgbe_receive_skb().  Per the accompanying article
         the chain continues napi_gro_receive() -> napi_skb_finish() ->
         netif_receive_skb(), which delivers the skb to the protocol stack;
         that chain is not visible in this function. */
      ixgbe_receive_skb(q_vector, skb, staterr, rx_ring, rx_desc);
 
next_desc:
      rx_desc->wb.upper.status_error = 0; // clear the status/error word of the handled descriptor
 
      /* return some buffers to hardware, one at a time is too slow */
      // refill once IXGBE_RX_BUFFER_WRITE descriptors have been processed
      // (the article states this constant is 16; it is defined elsewhere)
      if (cleaned_count >= IXGBE_RX_BUFFER_WRITE) {
         ixgbe_alloc_rx_buffers(adapter, rx_ring, cleaned_count);
         cleaned_count = 0;
      }
 
      /* use prefetched values */
      rx_desc = next_rxd; // move on to the prefetched descriptor
      rx_buffer_info = &rx_ring->rx_buffer_info[i]; // and to its buffer entry
 
      staterr = le32_to_cpu(rx_desc->wb.upper.status_error); // reload the status for the loop condition
   }
 
   rx_ring->next_to_clean = i; // remember where to resume next time
   cleaned_count = IXGBE_DESC_UNUSED(rx_ring); // unused descriptor count as computed by the macro (defined elsewhere)
 
   if (cleaned_count)
      ixgbe_alloc_rx_buffers(adapter, rx_ring, cleaned_count); // replenish cleaned_count Rx buffers
 
#ifdef IXGBE_FCOE
   /* include DDPed FCoE data */
   if (ddp_bytes > 0) {
      unsigned int mss;
 
      mss = adapter->netdev->mtu - sizeof(struct fcoe_hdr) -
         sizeof(struct fc_frame_header) -
         sizeof(struct fcoe_crc_eof);
      if (mss > 512)
         mss &= ~511;
      total_rx_bytes += ddp_bytes;
      total_rx_packets += DIV_ROUND_UP(ddp_bytes, mss);
   }
#endif /* IXGBE_FCOE */
 
   // fold this run's totals into the ring and adapter counters
   rx_ring->total_packets += total_rx_packets;
   rx_ring->total_bytes += total_rx_bytes;
   adapter->net_stats.rx_bytes += total_rx_bytes;
   adapter->net_stats.rx_packets += total_rx_packets;
 
   return cleaned;
}