ixgbe_ring结构图
中断上下部
在硬中断函数中抛出软中断:
ixgbe_msix_clean_many()/ixgbe_intr() -> napi_schedule() -> __napi_schedule() -> __raise_softirq_irqoff()
在软中断函数中调用poll函数:
net_rx_action() -> ixgbe_clean_rxtx_many()/ixgbe_poll() -> ixgbe_clean_tx_irq() + ixgbe_clean_rx_irq()
ixgbe_msix_clean_many()
/*
 * MSI-X interrupt handler for a vector that may serve several Tx and/or
 * Rx rings.
 *
 * irq:  interrupt number (not used in the body)
 * data: the struct ixgbe_q_vector associated with this vector
 *
 * Zeroes total_bytes/total_packets on every ring mapped to the vector,
 * masks this vector's interrupt and schedules the vector's NAPI context.
 * Always returns IRQ_HANDLED.
 */
static irqreturn_t ixgbe_msix_clean_many(int irq, void *data)
{
	struct ixgbe_q_vector *q_vector = data;
	struct ixgbe_adapter *adapter = q_vector->adapter;
	struct ixgbe_ring *ring;
	int qbit;
	int n;

	/* Nothing to do when the vector has neither Tx nor Rx rings. */
	if (q_vector->txr_count == 0 && q_vector->rxr_count == 0)
		return IRQ_HANDLED;

	/* Walk the Tx rings whose bits are set in txr_idx. */
	n = 0;
	qbit = find_first_bit(q_vector->txr_idx, adapter->num_tx_queues);
	while (n < q_vector->txr_count) {
		ring = &adapter->tx_ring[qbit];
		ring->total_packets = 0;
		ring->total_bytes = 0;
		qbit = find_next_bit(q_vector->txr_idx,
				     adapter->num_tx_queues, qbit + 1);
		n++;
	}

	/* Same walk for the Rx rings whose bits are set in rxr_idx. */
	n = 0;
	qbit = find_first_bit(q_vector->rxr_idx, adapter->num_rx_queues);
	while (n < q_vector->rxr_count) {
		ring = &adapter->rx_ring[qbit];
		ring->total_packets = 0;
		ring->total_bytes = 0;
		qbit = find_next_bit(q_vector->rxr_idx,
				     adapter->num_rx_queues, qbit + 1);
		n++;
	}

	/* Mask only this vector's interrupt, then schedule its NAPI context. */
	ixgbe_irq_disable_queues(adapter, ((u64)1 << q_vector->v_idx));
	napi_schedule(&q_vector->napi);

	return IRQ_HANDLED;
}
napi_schedule()
/*
 * Schedule NAPI context n, but only when napi_schedule_prep() reports
 * that it may be scheduled.
 */
static inline void napi_schedule(struct napi_struct *n)
{
	if (!napi_schedule_prep(n))
		return;

	__napi_schedule(n);
}
/*
 * Try to claim n for scheduling.
 *
 * Returns 0 when a disable is pending (NAPI_STATE_SCHED is left
 * untouched in that case). Otherwise atomically sets NAPI_STATE_SCHED
 * and returns 1 if the bit was previously clear, 0 if it was already set.
 */
static inline int napi_schedule_prep(struct napi_struct *n)
{
	if (napi_disable_pending(n))
		return 0;

	return !test_and_set_bit(NAPI_STATE_SCHED, &n->state);
}
/* Report whether NAPI_STATE_DISABLE is currently set in n->state. */
static inline int napi_disable_pending(struct napi_struct *n)
{
	int disable_pending = test_bit(NAPI_STATE_DISABLE, &n->state);

	return disable_pending;
}
/*
 * Append n to the current CPU's softnet_data poll list (softnet_data is
 * accessed as a per-CPU variable) and raise NET_RX_SOFTIRQ. Both steps
 * run with local interrupts saved and disabled.
 */
void __napi_schedule(struct napi_struct *n)
{
	struct list_head *cpu_poll_list;
	unsigned long irq_state;

	local_irq_save(irq_state);

	/* Queue n at the tail of this CPU's poll list. */
	cpu_poll_list = &__get_cpu_var(softnet_data).poll_list;
	list_add_tail(&n->poll_list, cpu_poll_list);

	/* Raise the receive softirq. */
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);

	local_irq_restore(irq_state);
}
net_rx_action()
/*
 * Poll loop over the napi_struct instances queued on the current CPU's
 * softnet_data poll list. Each scheduled instance's ->poll() is called
 * with that instance's weight. The loop stops early once the budget is
 * used up or the time limit has passed, in which case NET_RX_SOFTIRQ is
 * raised again. The parameter h is not used in the body.
 */
static void net_rx_action(struct softirq_action *h)
{
struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
unsigned long time_limit = jiffies + 2; // deadline is 2 jiffies from now (its wall-clock length depends on HZ; it is not a fixed 2 ms)
int budget = netdev_budget; // total work allowed in this run; NOTE(review): the original note gives the value as 300, netdev_budget is not shown here - confirm
void *have;
local_irq_disable();
while (!list_empty(list)) { // keep going while this CPU's poll list has entries
struct napi_struct *n;
int work, weight;
/* If softirq window is exhausted then punt.
* Allow this to run for 2 jiffies since which will allow
* an average latency of 1.5/HZ.
*/
// stop when the budget is used up or the time limit has passed
if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
goto softnet_break;
local_irq_enable();
/* Even though interrupts have been re-enabled, this
* access is safe because interrupts can only add new
* entries to the tail of this list, and only ->poll()
* calls can remove this head entry from the list.
*/
n = list_entry(list->next, struct napi_struct, poll_list); // napi_struct at the head of the list
have = netpoll_poll_lock(n);
weight = n->weight; // upper bound on the work of one ->poll() call
/* This NAPI_STATE_SCHED test is for avoiding a race
* with netpoll's poll_napi(). Only the entity which
* obtains the lock and sees NAPI_STATE_SCHED set will
* actually make the ->poll() call. Therefore we avoid
* accidentally calling ->poll() when NAPI is not scheduled.
*/
work = 0;
if (test_bit(NAPI_STATE_SCHED, &n->state)) {
work = n->poll(n, weight); // invoke the poll callback
trace_napi_poll(n);
}
WARN_ON_ONCE(work > weight);
budget -= work;
local_irq_disable();
/* Drivers must not modify the NAPI state if they
* consume the entire weight. In such cases this code
* still "owns" the NAPI instance and therefore can
* move the instance around on the list at-will.
*/
if (unlikely(work == weight)) {
if (unlikely(napi_disable_pending(n))) {
local_irq_enable();
napi_complete(n);
local_irq_disable();
} else
list_move_tail(&n->poll_list, list); // rotate this napi_struct to the tail of the list
}
netpoll_poll_unlock(have);
}
out:
local_irq_enable();
#ifdef CONFIG_NET_DMA
/*
* There may not be any more sk_buffs coming right now, so push
* any pending DMA copies to hardware
*/
dma_issue_pending_all();
#endif
return;
softnet_break:
__get_cpu_var(netdev_rx_stat).time_squeeze++; // count the runs that were cut short by the budget or the time limit
__raise_softirq_irqoff(NET_RX_SOFTIRQ); // raise NET_RX_SOFTIRQ again because the list was not drained
goto out;
}
ixgbe_clean_rxtx_many()
/*
 * NAPI poll callback for a vector that serves several Tx and/or Rx rings.
 *
 * napi:   NAPI context embedded in the vector's struct ixgbe_q_vector
 * budget: Rx work allowed for this poll; it is divided evenly across the
 *         vector's Rx rings and never drops below 1
 *
 * Cleans every Tx ring mapped to the vector, then every mapped Rx ring.
 * work_done is a single counter shared by all Rx rings and is compared
 * against the per-ring share of the budget. When less work than that
 * share was done, polling is completed, ITR is optionally updated and
 * the vector's interrupt is re-enabled unless the adapter is going down.
 *
 * Returns 0 when polling was completed, otherwise the Rx work done.
 */
static int ixgbe_clean_rxtx_many(struct napi_struct *napi, int budget)
{
	struct ixgbe_q_vector *q_vector =
		container_of(napi, struct ixgbe_q_vector, napi);
	struct ixgbe_adapter *adapter = q_vector->adapter;
	struct ixgbe_ring *ring = NULL;
	int work_done = 0, i;
	long r_idx;
	bool tx_clean_complete = true;

	/* Walk the Tx rings whose bits are set in txr_idx. */
	r_idx = find_first_bit(q_vector->txr_idx, adapter->num_tx_queues);
	for (i = 0; i < q_vector->txr_count; i++) {
		ring = &(adapter->tx_ring[r_idx]);
#ifdef CONFIG_IXGBE_DCA
		if (adapter->flags & IXGBE_FLAG_DCA_ENABLED)
			ixgbe_update_tx_dca(adapter, ring);
#endif
		/*
		 * The result is accumulated here but not consulted later in
		 * this function.
		 */
		tx_clean_complete &= ixgbe_clean_tx_irq(q_vector, ring);
		r_idx = find_next_bit(q_vector->txr_idx, adapter->num_tx_queues,
				      r_idx + 1);
	}

	/* attempt to distribute budget to each queue fairly, but don't allow
	 * the budget to go below 1 because we'll exit polling */
	budget /= (q_vector->rxr_count ?: 1);	/* GNU "x ?: y" == "x ? x : y" */
	budget = max(budget, 1);

	/* Walk the Rx rings whose bits are set in rxr_idx. */
	r_idx = find_first_bit(q_vector->rxr_idx, adapter->num_rx_queues);
	for (i = 0; i < q_vector->rxr_count; i++) {
		ring = &(adapter->rx_ring[r_idx]);
#ifdef CONFIG_IXGBE_DCA
		if (adapter->flags & IXGBE_FLAG_DCA_ENABLED)
			ixgbe_update_rx_dca(adapter, ring);
#endif
		ixgbe_clean_rx_irq(q_vector, ring, &work_done, budget);
		r_idx = find_next_bit(q_vector->rxr_idx, adapter->num_rx_queues,
				      r_idx + 1);
	}

	/*
	 * The original recomputed r_idx and ring from the first Rx bit at
	 * this point, but neither value was read afterwards, so that dead
	 * store has been dropped.
	 */

	/* If all Rx work done, exit the polling mode */
	if (work_done < budget) {
		napi_complete(napi);
		/* NOTE(review): bit 0 of rx_itr_setting presumably selects
		 * dynamic ITR - confirm against the adapter definition. */
		if (adapter->rx_itr_setting & 1)
			ixgbe_set_itr_msix(q_vector);
		if (!test_bit(__IXGBE_DOWN, &adapter->state))
			ixgbe_irq_enable_queues(adapter,
						((u64)1 << q_vector->v_idx));
		return 0;
	}

	return work_done;
}
ixgbe_clean_tx_irq()
/*
 * Reclaim Tx descriptors that have the DD status bit set, starting at
 * tx_ring->next_to_clean and stopping once tx_ring->work_limit
 * descriptors have been handled.
 *
 * q_vector: vector that owns the ring; supplies the adapter and v_idx
 * tx_ring:  Tx ring to clean
 *
 * Side effects visible here: advances next_to_clean, may wake a stopped
 * subqueue, may call ixgbe_tx_timeout() when a hang is detected, re-arms
 * the vector's interrupt when the work limit was reached, and adds to
 * the ring and adapter byte/packet counters.
 *
 * Returns true when fewer than work_limit descriptors were cleaned.
 */
static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector *q_vector,
struct ixgbe_ring *tx_ring)
{
struct ixgbe_adapter *adapter = q_vector->adapter;
struct net_device *netdev = adapter->netdev;
union ixgbe_adv_tx_desc *tx_desc, *eop_desc;
struct ixgbe_tx_buffer *tx_buffer_info;
unsigned int i, eop, count = 0;
unsigned int total_bytes = 0, total_packets = 0;
i = tx_ring->next_to_clean; // start at next_to_clean
eop = tx_ring->tx_buffer_info[i].next_to_watch;
eop_desc = IXGBE_TX_DESC_ADV(*tx_ring, eop);
// outer loop: continue while the descriptor at index eop has DD set and the work limit is not reached
while ((eop_desc->wb.status & cpu_to_le32(IXGBE_TXD_STAT_DD)) &&
(count < tx_ring->work_limit)) {
bool cleaned = false;
// inner loop: walk slots from i up to and including eop
for ( ; !cleaned; count++) {
struct sk_buff *skb;
tx_desc = IXGBE_TX_DESC_ADV(*tx_ring, i); // descriptor i (union ixgbe_adv_tx_desc)
tx_buffer_info = &tx_ring->tx_buffer_info[i]; // matching ixgbe_tx_buffer for slot i
cleaned = (i == eop);
skb = tx_buffer_info->skb; // skb attached to this ixgbe_tx_buffer
if (cleaned && skb) {
unsigned int segs, bytecount;
unsigned int hlen = skb_headlen(skb);
/* gso_segs is currently only valid for tcp */
segs = skb_shinfo(skb)->gso_segs ?: 1;
#ifdef IXGBE_FCOE
/* adjust for FCoE Sequence Offload */
if ((adapter->flags & IXGBE_FLAG_FCOE_ENABLED)
&& (skb->protocol == htons(ETH_P_FCOE)) &&
skb_is_gso(skb)) {
hlen = skb_transport_offset(skb) +
sizeof(struct fc_frame_header) +
sizeof(struct fcoe_crc_eof);
segs = DIV_ROUND_UP(skb->len - hlen,
skb_shinfo(skb)->gso_size);
}
#endif /* IXGBE_FCOE */
/* multiply data chunks by size of headers */
bytecount = ((segs - 1) * hlen) + skb->len;
total_packets += segs;
total_bytes += bytecount;
}
// NOTE(review): helper not shown here; presumably unmaps the DMA mapping and frees the skb - confirm
ixgbe_unmap_and_free_tx_resource(adapter,
tx_buffer_info);
tx_desc->wb.status = 0; // clear the write-back status field
i++; // advance, wrapping to 0 at the end of the ring
if (i == tx_ring->count)
i = 0;
}
eop = tx_ring->tx_buffer_info[i].next_to_watch;
eop_desc = IXGBE_TX_DESC_ADV(*tx_ring, eop);
}
tx_ring->next_to_clean = i; // store the new next_to_clean
#define TX_WAKE_THRESHOLD (DESC_NEEDED * 2)
if (unlikely(count && netif_carrier_ok(netdev) &&
(IXGBE_DESC_UNUSED(tx_ring) >= TX_WAKE_THRESHOLD))) {
/* Make sure that anybody stopping the queue after this
* sees the new next_to_clean.
*/
smp_mb();
if (__netif_subqueue_stopped(netdev, tx_ring->queue_index) &&
!test_bit(__IXGBE_DOWN, &adapter->state)) {
netif_wake_subqueue(netdev, tx_ring->queue_index);
++adapter->restart_queue;
}
}
if (adapter->detect_tx_hung) {
if (ixgbe_check_tx_hang(adapter, tx_ring, i)) {
/* schedule immediate reset if we believe we hung */
DPRINTK(PROBE, INFO,
"tx hang %d detected, resetting adapter\n",
adapter->tx_timeout_count + 1);
ixgbe_tx_timeout(adapter->netdev);
}
}
/* re-arm the interrupt */
if (count >= tx_ring->work_limit)
ixgbe_irq_rearm_queues(adapter, ((u64)1 << q_vector->v_idx));
tx_ring->total_bytes += total_bytes;
tx_ring->total_packets += total_packets;
tx_ring->stats.packets += total_packets;
tx_ring->stats.bytes += total_bytes;
adapter->net_stats.tx_bytes += total_bytes;
adapter->net_stats.tx_packets += total_packets;
return (count < tx_ring->work_limit);
}
ixgbe_clean_rx_irq()
ixgbe_clean_rx_irq() -> eth_type_trans() + ixgbe_receive_skb()
ixgbe_receive_skb() -> napi_gro_receive() -> napi_skb_finish() -> netif_receive_skb()
/*
 * Process Rx descriptors that have the DD status bit set, starting at
 * rx_ring->next_to_clean. Completed skbs are passed to
 * ixgbe_receive_skb() and the ring is refilled through
 * ixgbe_alloc_rx_buffers().
 *
 * q_vector:   vector that owns the ring
 * rx_ring:    Rx ring to clean
 * work_done:  in/out counter, incremented once per descriptor processed
 * work_to_do: processing stops once *work_done reaches this value
 *
 * Returns true if at least one descriptor was processed in this call.
 */
static bool ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
struct ixgbe_ring *rx_ring,
int *work_done, int work_to_do)
{
struct ixgbe_adapter *adapter = q_vector->adapter;
struct pci_dev *pdev = adapter->pdev;
union ixgbe_adv_rx_desc *rx_desc, *next_rxd;
struct ixgbe_rx_buffer *rx_buffer_info, *next_buffer;
struct sk_buff *skb;
unsigned int i, rsc_count = 0;
u32 len, staterr;
u16 hdr_info;
bool cleaned = false;
int cleaned_count = 0;
unsigned int total_rx_bytes = 0, total_rx_packets = 0;
#ifdef IXGBE_FCOE
int ddp_bytes = 0;
#endif /* IXGBE_FCOE */
i = rx_ring->next_to_clean;
rx_desc = IXGBE_RX_DESC_ADV(*rx_ring, i); // descriptor i (union ixgbe_adv_rx_desc)
staterr = le32_to_cpu(rx_desc->wb.upper.status_error); // its status_error word in CPU byte order
rx_buffer_info = &rx_ring->rx_buffer_info[i]; // matching ixgbe_rx_buffer for slot i
while (staterr & IXGBE_RXD_STAT_DD) { // loop while the DD (Descriptor Done) bit is set
u32 upper_len = 0;
if (*work_done >= work_to_do)
break;
(*work_done)++;
if (rx_ring->flags & IXGBE_RING_RX_PS_ENABLED) {
hdr_info = le16_to_cpu(ixgbe_get_hdr_info(rx_desc));
len = (hdr_info & IXGBE_RXDADV_HDRBUFLEN_MASK) >>
IXGBE_RXDADV_HDRBUFLEN_SHIFT;
if (hdr_info & IXGBE_RXDADV_SPH)
adapter->rx_hdr_split++;
if (len > IXGBE_RX_HDR_SIZE)
len = IXGBE_RX_HDR_SIZE;
upper_len = le16_to_cpu(rx_desc->wb.upper.length);
} else {
len = le16_to_cpu(rx_desc->wb.upper.length);
}
cleaned = true;
skb = rx_buffer_info->skb; // skb attached to this ixgbe_rx_buffer
prefetch(skb->data - NET_IP_ALIGN);
rx_buffer_info->skb = NULL; // detach the skb from the ring slot
if (rx_buffer_info->dma) {
pci_unmap_single(pdev, rx_buffer_info->dma, // release the DMA mapping of the data buffer
rx_ring->rx_buf_len,
PCI_DMA_FROMDEVICE);
rx_buffer_info->dma = 0;
skb_put(skb, len);
}
if (upper_len) {
pci_unmap_page(pdev, rx_buffer_info->page_dma,
PAGE_SIZE / 2, PCI_DMA_FROMDEVICE);
rx_buffer_info->page_dma = 0;
skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags,
rx_buffer_info->page,
rx_buffer_info->page_offset,
upper_len);
if ((rx_ring->rx_buf_len > (PAGE_SIZE / 2)) ||
(page_count(rx_buffer_info->page) != 1))
rx_buffer_info->page = NULL;
else
get_page(rx_buffer_info->page);
skb->len += upper_len;
skb->data_len += upper_len;
skb->truesize += upper_len;
}
i++; // advance, wrapping to 0 at the end of the ring
if (i == rx_ring->count)
i = 0;
next_rxd = IXGBE_RX_DESC_ADV(*rx_ring, i);
prefetch(next_rxd);
cleaned_count++; // descriptors consumed since the last refill
if (adapter->flags2 & IXGBE_FLAG2_RSC_CAPABLE)
rsc_count = ixgbe_get_rsc_count(rx_desc);
if (rsc_count) {
u32 nextp = (staterr & IXGBE_RXDADV_NEXTP_MASK) >>
IXGBE_RXDADV_NEXTP_SHIFT;
next_buffer = &rx_ring->rx_buffer_info[nextp];
rx_ring->rsc_count += (rsc_count - 1);
} else {
next_buffer = &rx_ring->rx_buffer_info[i];
}
if (staterr & IXGBE_RXD_STAT_EOP) {
if (skb->prev)
skb = ixgbe_transform_rsc_queue(skb);
rx_ring->stats.packets++;
rx_ring->stats.bytes += skb->len;
} else {
if (rx_ring->flags & IXGBE_RING_RX_PS_ENABLED) {
rx_buffer_info->skb = next_buffer->skb;
rx_buffer_info->dma = next_buffer->dma;
next_buffer->skb = skb;
next_buffer->dma = 0;
} else {
skb->next = next_buffer->skb;
skb->next->prev = skb;
}
adapter->non_eop_descs++;
goto next_desc;
}
if (staterr & IXGBE_RXDADV_ERR_FRAME_ERR_MASK) {
dev_kfree_skb_irq(skb);
goto next_desc;
}
ixgbe_rx_checksum(adapter, rx_desc, skb);
/* probably a little skewed due to removing CRC */
total_rx_bytes += skb->len;
total_rx_packets++;
skb->protocol = eth_type_trans(skb, adapter->netdev); // set skb->protocol from eth_type_trans()
#ifdef IXGBE_FCOE
/* if ddp, not passing to ULD unless for FCP_RSP or error */
if (adapter->flags & IXGBE_FLAG_FCOE_ENABLED) {
ddp_bytes = ixgbe_fcoe_ddp(adapter, rx_desc, skb);
if (!ddp_bytes)
goto next_desc;
}
#endif /* IXGBE_FCOE */
/* Call chain per the notes in this file:
ixgbe_receive_skb() -> napi_gro_receive() -> napi_skb_finish() -> netif_receive_skb(),
i.e. netif_receive_skb() hands the skb up to the protocol stack */
ixgbe_receive_skb(q_vector, skb, staterr, rx_ring, rx_desc);
next_desc:
rx_desc->wb.upper.status_error = 0; // reset status_error to 0
/* return some buffers to hardware, one at a time is too slow */
// once cleaned_count reaches IXGBE_RX_BUFFER_WRITE, refill that many buffers
// NOTE(review): the original note gives the threshold as 16; the constant is not shown here - confirm
if (cleaned_count >= IXGBE_RX_BUFFER_WRITE) {
ixgbe_alloc_rx_buffers(adapter, rx_ring, cleaned_count);
cleaned_count = 0;
}
/* use prefetched values */
rx_desc = next_rxd; // move on to the next descriptor
rx_buffer_info = &rx_ring->rx_buffer_info[i]; // and its ixgbe_rx_buffer
staterr = le32_to_cpu(rx_desc->wb.upper.status_error); // reload staterr for the loop test
}
rx_ring->next_to_clean = i; // store the new next_to_clean
cleaned_count = IXGBE_DESC_UNUSED(rx_ring); // unused descriptor count as computed by IXGBE_DESC_UNUSED (macro not shown here)
if (cleaned_count)
ixgbe_alloc_rx_buffers(adapter, rx_ring, cleaned_count); // refill cleaned_count buffers
#ifdef IXGBE_FCOE
/* include DDPed FCoE data */
if (ddp_bytes > 0) {
unsigned int mss;
mss = adapter->netdev->mtu - sizeof(struct fcoe_hdr) -
sizeof(struct fc_frame_header) -
sizeof(struct fcoe_crc_eof);
if (mss > 512)
mss &= ~511;
total_rx_bytes += ddp_bytes;
total_rx_packets += DIV_ROUND_UP(ddp_bytes, mss);
}
#endif /* IXGBE_FCOE */
rx_ring->total_packets += total_rx_packets;
rx_ring->total_bytes += total_rx_bytes;
adapter->net_stats.rx_bytes += total_rx_bytes;
adapter->net_stats.rx_packets += total_rx_packets;
return cleaned;
}
参考资料
https://www.intel.com/content/www/us/en/embedded/products/networking/82599-10-gbe-controller-datasheet.html
https://www.cnblogs.com/mylinuxer/p/4272382.html
http://blog.csdn.net/maijian/article/details/9040021
http://www.pagefault.info/?p=403
http://blog.csdn.net/youfuchen/article/details/39062553