Linux网络接受
本文基于2.6.15的内核版本来简单学习一下有关Linux内核如何进行网络数据的处理流程。
网卡驱动注册
以ixgb驱动为例,在网卡注册的时候会进入如下流程。
static struct pci_driver ixgb_driver = {
.name = ixgb_driver_name,
.id_table = ixgb_pci_tbl,
.probe = ixgb_probe,
.remove = __devexit_p(ixgb_remove),
};
...
/**
* ixgb_init_module - Driver Registration Routine
*
* ixgb_init_module is the first routine called when the driver is
* loaded. All it does is register with the PCI subsystem.
**/
static int __init
ixgb_init_module(void)
{
printk(KERN_INFO "%s - version %s\n",
ixgb_driver_string, ixgb_driver_version);
printk(KERN_INFO "%s\n", ixgb_copyright);
return pci_module_init(&ixgb_driver);
}
module_init(ixgb_init_module);
PCI注册的流程会将调用probe方法,即ixgb_probe。
/**
* ixgb_probe - Device Initialization Routine
* @pdev: PCI device information struct
* @ent: entry in ixgb_pci_tbl
*
* Returns 0 on success, negative on failure
*
* ixgb_probe initializes an adapter identified by a pci_dev structure.
* The OS initialization, configuring of the adapter private structure,
* and a hardware reset occur.
**/
static int __devinit
ixgb_probe(struct pci_dev *pdev,
const struct pci_device_id *ent)
{
struct net_device *netdev = NULL;
struct ixgb_adapter *adapter;
static int cards_found = 0;
unsigned long mmio_start;
int mmio_len;
int pci_using_dac;
int i;
int err;
if((err = pci_enable_device(pdev))) // 是否可以注册该设备
return err;
if(!(err = pci_set_dma_mask(pdev, DMA_64BIT_MASK))) { //设置DMA的掩码
pci_using_dac = 1;
} else {
if((err = pci_set_dma_mask(pdev, DMA_32BIT_MASK))) {
IXGB_ERR("No usable DMA configuration, aborting\n");
return err;
}
pci_using_dac = 0;
}
if((err = pci_request_regions(pdev, ixgb_driver_name)))
return err;
pci_set_master(pdev);
netdev = alloc_etherdev(sizeof(struct ixgb_adapter)); // 申请设备结构体
if(!netdev) {
err = -ENOMEM;
goto err_alloc_etherdev;
}
SET_MODULE_OWNER(netdev);
SET_NETDEV_DEV(netdev, &pdev->dev);
pci_set_drvdata(pdev, netdev); // 设置DMA对应的管理的内存空间
adapter = netdev_priv(netdev);
adapter->netdev = netdev;
adapter->pdev = pdev;
adapter->hw.back = adapter;
mmio_start = pci_resource_start(pdev, BAR_0);
mmio_len = pci_resource_len(pdev, BAR_0);
adapter->hw.hw_addr = ioremap(mmio_start, mmio_len);
if(!adapter->hw.hw_addr) {
err = -EIO;
goto err_ioremap;
}
for(i = BAR_1; i <= BAR_5; i++) {
if(pci_resource_len(pdev, i) == 0)
continue;
if(pci_resource_flags(pdev, i) & IORESOURCE_IO) {
adapter->hw.io_base = pci_resource_start(pdev, i);
break;
}
}
netdev->open = &ixgb_open; // 设置ethtool对应的操作的方法,即设备的打开的方法
netdev->stop = &ixgb_close; // 设备的关闭的方法
netdev->hard_start_xmit = &ixgb_xmit_frame;
netdev->get_stats = &ixgb_get_stats;
netdev->set_multicast_list = &ixgb_set_multi;
netdev->set_mac_address = &ixgb_set_mac;
netdev->change_mtu = &ixgb_change_mtu; // 设置mtu长度
ixgb_set_ethtool_ops(netdev);
netdev->tx_timeout = &ixgb_tx_timeout; // 设置超时时间
netdev->watchdog_timeo = HZ;
#ifdef CONFIG_IXGB_NAPI
netdev->poll = &ixgb_clean; // 设置设备的poll方法 一般用在NAPI模式中
netdev->weight = 64;
#endif
netdev->vlan_rx_register = ixgb_vlan_rx_register; // 设置vlan的方法
netdev->vlan_rx_add_vid = ixgb_vlan_rx_add_vid;
netdev->vlan_rx_kill_vid = ixgb_vlan_rx_kill_vid;
#ifdef CONFIG_NET_POLL_CONTROLLER
netdev->poll_controller = ixgb_netpoll; //是否设置了netpoll的方式
#endif
netdev->mem_start = mmio_start;
netdev->mem_end = mmio_start + mmio_len;
netdev->base_addr = adapter->hw.io_base;
adapter->bd_number = cards_found;
adapter->link_speed = 0;
adapter->link_duplex = 0;
/* setup the private structure */
if((err = ixgb_sw_init(adapter)))
goto err_sw_init;
netdev->features = NETIF_F_SG |
NETIF_F_HW_CSUM |
NETIF_F_HW_VLAN_TX |
NETIF_F_HW_VLAN_RX |
NETIF_F_HW_VLAN_FILTER;
#ifdef NETIF_F_TSO
netdev->features |= NETIF_F_TSO;
#endif
if(pci_using_dac)
netdev->features |= NETIF_F_HIGHDMA;
/* make sure the EEPROM is good */
if(!ixgb_validate_eeprom_checksum(&adapter->hw)) {
printk(KERN_ERR "The EEPROM Checksum Is Not Valid\n");
err = -EIO;
goto err_eeprom;
}
ixgb_get_ee_mac_addr(&adapter->hw, netdev->dev_addr);
memcpy(netdev->perm_addr, netdev->dev_addr, netdev->addr_len);
if(!is_valid_ether_addr(netdev->perm_addr)) {
err = -EIO;
goto err_eeprom;
}
adapter->part_num = ixgb_get_ee_pba_number(&adapter->hw);
init_timer(&adapter->watchdog_timer); // 设置定时器
adapter->watchdog_timer.function = &ixgb_watchdog;
adapter->watchdog_timer.data = (unsigned long)adapter;
INIT_WORK(&adapter->tx_timeout_task,
(void (*)(void *))ixgb_tx_timeout_task, netdev);
if((err = register_netdev(netdev))) // 注册网络设备
goto err_register;
/* we're going to reset, so assume we have no link for now */
netif_carrier_off(netdev); // 重置网络设备的参数
netif_stop_queue(netdev);
printk(KERN_INFO "%s: Intel(R) PRO/10GbE Network Connection\n",
netdev->name);
ixgb_check_options(adapter);
/* reset the hardware with the new settings */
ixgb_reset(adapter);
cards_found++;
return 0;
err_register:
err_sw_init:
err_eeprom:
iounmap(adapter->hw.hw_addr);
err_ioremap:
free_netdev(netdev);
err_alloc_etherdev:
pci_release_regions(pdev);
return err;
}
注册的方式主要就是通过配置一些网络的参数。
打开网卡接受数据
如果我们使用ethtool工具是该网卡打开,此时就会执行如下流程。
static int
ixgb_open(struct net_device *netdev)
{
struct ixgb_adapter *adapter = netdev_priv(netdev);
int err;
/* allocate transmit descriptors */
if((err = ixgb_setup_tx_resources(adapter)))
goto err_setup_tx;
/* allocate receive descriptors */
if((err = ixgb_setup_rx_resources(adapter)))
goto err_setup_rx;
if((err = ixgb_up(adapter))) // 注册中断函数,响应网卡数据
goto err_up;
return 0;
err_up:
ixgb_free_rx_resources(adapter);
err_setup_rx:
ixgb_free_tx_resources(adapter);
err_setup_tx:
ixgb_reset(adapter);
return err;
}
此时就会调用ixgb_open的函数,进行发送和接受的资源设置,并通过ixgb_up注册网卡的中断回调函数。
int
ixgb_up(struct ixgb_adapter *adapter)
{
struct net_device *netdev = adapter->netdev;
int err;
int max_frame = netdev->mtu + ENET_HEADER_SIZE + ENET_FCS_LENGTH; // 获取帧大小
struct ixgb_hw *hw = &adapter->hw;
/* hardware has been reset, we need to reload some things */
ixgb_set_multi(netdev);
ixgb_restore_vlan(adapter);
ixgb_configure_tx(adapter); // 配置发送
ixgb_setup_rctl(adapter);
ixgb_configure_rx(adapter); // 配置接受
ixgb_alloc_rx_buffers(adapter); // 设置接受ring缓冲区
#ifdef CONFIG_PCI_MSI
{
boolean_t pcix = (IXGB_READ_REG(&adapter->hw, STATUS) &
IXGB_STATUS_PCIX_MODE) ? TRUE : FALSE;
adapter->have_msi = TRUE;
if (!pcix)
adapter->have_msi = FALSE;
else if((err = pci_enable_msi(adapter->pdev))) {
printk (KERN_ERR
"Unable to allocate MSI interrupt Error: %d\n", err);
adapter->have_msi = FALSE;
/* proceed to try to request regular interrupt */
}
}
#endif
if((err = request_irq(adapter->pdev->irq, &ixgb_intr,
SA_SHIRQ | SA_SAMPLE_RANDOM,
netdev->name, netdev))) // 注册网卡的中断处理函数,并注册网卡的中断号
return err;
/* disable interrupts and get the hardware into a known state */
IXGB_WRITE_REG(&adapter->hw, IMC, 0xffffffff);
if((hw->max_frame_size != max_frame) ||
(hw->max_frame_size !=
(IXGB_READ_REG(hw, MFS) >> IXGB_MFS_SHIFT))) {
hw->max_frame_size = max_frame;
IXGB_WRITE_REG(hw, MFS, hw->max_frame_size << IXGB_MFS_SHIFT);
if(hw->max_frame_size >
IXGB_MAX_ENET_FRAME_SIZE_WITHOUT_FCS + ENET_FCS_LENGTH) {
uint32_t ctrl0 = IXGB_READ_REG(hw, CTRL0);
if(!(ctrl0 & IXGB_CTRL0_JFE)) {
ctrl0 |= IXGB_CTRL0_JFE;
IXGB_WRITE_REG(hw, CTRL0, ctrl0);
}
}
}
mod_timer(&adapter->watchdog_timer, jiffies);
ixgb_irq_enable(adapter); // 开启中断处理
#ifdef CONFIG_IXGB_NAPI
netif_poll_enable(netdev);
#endif
return 0;
}
主要是设置了接受的数据缓冲区,设置了中断的处理函数ixgb_intr,当数据来了之后就会调用ixgb_intr函数进行处理。
static irqreturn_t
ixgb_intr(int irq, void *data, struct pt_regs *regs)
{
struct net_device *netdev = data;
struct ixgb_adapter *adapter = netdev_priv(netdev);
struct ixgb_hw *hw = &adapter->hw;
uint32_t icr = IXGB_READ_REG(hw, ICR);
#ifndef CONFIG_IXGB_NAPI
unsigned int i;
#endif
if(unlikely(!icr))
return IRQ_NONE; /* Not our interrupt */
if(unlikely(icr & (IXGB_INT_RXSEQ | IXGB_INT_LSC))) {
mod_timer(&adapter->watchdog_timer, jiffies);
}
#ifdef CONFIG_IXGB_NAPI
if(netif_rx_schedule_prep(netdev)) { // 是否是NAPI模式运行
/* Disable interrupts and register for poll. The flush
of the posted write is intentionally left out.
*/
atomic_inc(&adapter->irq_sem);
IXGB_WRITE_REG(&adapter->hw, IMC, ~0);
__netif_rx_schedule(netdev);
}
#else
/* yes, that is actually a & and it is meant to make sure that
* every pass through this for loop checks both receive and
* transmit queues for completed descriptors, intended to
* avoid starvation issues and assist tx/rx fairness. */
for(i = 0; i < IXGB_MAX_INTR; i++) // 通过ixgb_clean_rx_irq进行数据处理
if(!ixgb_clean_rx_irq(adapter) &
!ixgb_clean_tx_irq(adapter))
break;
#endif
return IRQ_HANDLED;
}
此时就看配置的是网卡的哪种数据处理类型,一般有如下两种网卡数据处理流程。
NAPI模式
通过softirq内核线程来进行网卡数据的进行poll进行,从而避免过多的中断来进行处理提升效率。
static inline void __netif_rx_schedule(struct net_device *dev)
{
unsigned long flags;
local_irq_save(flags);
dev_hold(dev);
list_add_tail(&dev->poll_list, &__get_cpu_var(softnet_data).poll_list); // 加入当前设备的队列从而使在软中断中内容持续接受网卡的数据
if (dev->quota < 0)
dev->quota += dev->weight;
else
dev->quota = dev->weight;
__raise_softirq_irqoff(NET_RX_SOFTIRQ); // 发出软中断,发送数据接受软中断
local_irq_restore(flags);
}
此时就会调用在NET_RX_SOFTIRQ注册的信号方法。
open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);
此时调用的就是网络处理。
static void net_rx_action(struct softirq_action *h)
{
struct softnet_data *queue = &__get_cpu_var(softnet_data);
unsigned long start_time = jiffies;
int budget = netdev_budget;
void *have;
local_irq_disable();
while (!list_empty(&queue->poll_list)) { // 检查链表是否为空
struct net_device *dev;
if (budget <= 0 || jiffies - start_time > 1)
goto softnet_break;
local_irq_enable(); //使能中断
dev = list_entry(queue->poll_list.next,
struct net_device, poll_list); // 获取设备
have = netpoll_poll_lock(dev);
if (dev->quota <= 0 || dev->poll(dev, &budget)) { // 获取链表上面的数据
netpoll_poll_unlock(have);
local_irq_disable();
list_del(&dev->poll_list);
list_add_tail(&dev->poll_list, &queue->poll_list);
if (dev->quota < 0)
dev->quota += dev->weight;
else
dev->quota = dev->weight;
} else {
netpoll_poll_unlock(have);
dev_put(dev); // 将获取数据放回
local_irq_disable();
}
}
out:
local_irq_enable(); //使能中断
return;
softnet_break:
__get_cpu_var(netdev_rx_stat).time_squeeze++;
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
goto out;
}
通过调用dev的poll函数来进行数据的处理。ixgb对应的poll函数为ixgb_clean。
#ifdef CONFIG_IXGB_NAPI
/**
* ixgb_clean - NAPI Rx polling callback
* @adapter: board private structure
**/
static int
ixgb_clean(struct net_device *netdev, int *budget)
{
struct ixgb_adapter *adapter = netdev_priv(netdev);
int work_to_do = min(*budget, netdev->quota);
int tx_cleaned;
int work_done = 0;
tx_cleaned = ixgb_clean_tx_irq(adapter);
ixgb_clean_rx_irq(adapter, &work_done, work_to_do); // 处理单个网络接受数据
*budget -= work_done;
netdev->quota -= work_done; // 减去王超的数量
/* if no Tx and not enough Rx work done, exit the polling mode */
if((!tx_cleaned && (work_done == 0)) || !netif_running(netdev)) {
netif_rx_complete(netdev); // 如果没了就离开该模式
ixgb_irq_enable(adapter);
return 0;
}
return 1;
}
#endif
其中ixgb_clean_rx_irq函数,就是我们不经过NAPI模式处理的流程。
数据直接传递
数据直接传递就是直接通过ixgb_clean_rx_irq函数来处理网络包。
static boolean_t
#ifdef CONFIG_IXGB_NAPI
ixgb_clean_rx_irq(struct ixgb_adapter *adapter, int *work_done, int work_to_do)
#else
ixgb_clean_rx_irq(struct ixgb_adapter *adapter)
#endif
{
struct ixgb_desc_ring *rx_ring = &adapter->rx_ring;
struct net_device *netdev = adapter->netdev;
struct pci_dev *pdev = adapter->pdev;
struct ixgb_rx_desc *rx_desc, *next_rxd;
struct ixgb_buffer *buffer_info, *next_buffer, *next2_buffer;
uint32_t length;
unsigned int i, j;
boolean_t cleaned = FALSE;
i = rx_ring->next_to_clean;
rx_desc = IXGB_RX_DESC(*rx_ring, i);
buffer_info = &rx_ring->buffer_info[i];
while(rx_desc->status & IXGB_RX_DESC_STATUS_DD) {
struct sk_buff *skb, *next_skb; // skb数据包
u8 status;
#ifdef CONFIG_IXGB_NAPI
if(*work_done >= work_to_do)
break;
(*work_done)++;
#endif
status = rx_desc->status;
skb = buffer_info->skb; //获取网卡处理好的skb数据
prefetch(skb->data);
if(++i == rx_ring->count) i = 0;
next_rxd = IXGB_RX_DESC(*rx_ring, i);
prefetch(next_rxd);
if((j = i + 1) == rx_ring->count) j = 0;
next2_buffer = &rx_ring->buffer_info[j]; // 获取ring的数据
prefetch(next2_buffer);
next_buffer = &rx_ring->buffer_info[i];
next_skb = next_buffer->skb;
prefetch(next_skb);
cleaned = TRUE;
pci_unmap_single(pdev,
buffer_info->dma,
buffer_info->length,
PCI_DMA_FROMDEVICE);
length = le16_to_cpu(rx_desc->length);
if(unlikely(!(status & IXGB_RX_DESC_STATUS_EOP))) {
/* All receives must fit into a single buffer */
IXGB_DBG("Receive packet consumed multiple buffers "
"length<%x>\n", length);
dev_kfree_skb_irq(skb);
goto rxdesc_done;
}
if (unlikely(rx_desc->errors
& (IXGB_RX_DESC_ERRORS_CE | IXGB_RX_DESC_ERRORS_SE
| IXGB_RX_DESC_ERRORS_P |
IXGB_RX_DESC_ERRORS_RXE))) {
dev_kfree_skb_irq(skb);
goto rxdesc_done;
}
/* Good Receive */
skb_put(skb, length); //放入skb中保存
/* Receive Checksum Offload */
ixgb_rx_checksum(adapter, rx_desc, skb); //检查checksum
skb->protocol = eth_type_trans(skb, netdev); //获取协议
#ifdef CONFIG_IXGB_NAPI
if(adapter->vlgrp && (status & IXGB_RX_DESC_STATUS_VP)) {
vlan_hwaccel_receive_skb(skb, adapter->vlgrp,
le16_to_cpu(rx_desc->special) &
IXGB_RX_DESC_SPECIAL_VLAN_MASK);
} else {
netif_receive_skb(skb); // 如果是NAPI模式 直接调用netif_receive_skb处理
}
#else /* CONFIG_IXGB_NAPI */
if(adapter->vlgrp && (status & IXGB_RX_DESC_STATUS_VP)) {
vlan_hwaccel_rx(skb, adapter->vlgrp,
le16_to_cpu(rx_desc->special) &
IXGB_RX_DESC_SPECIAL_VLAN_MASK);
} else {
netif_rx(skb); // 走单个软中段的处理流程
}
#endif /* CONFIG_IXGB_NAPI */
netdev->last_rx = jiffies;
rxdesc_done:
/* clean up descriptor, might be written over by hw */
rx_desc->status = 0;
buffer_info->skb = NULL;
/* use prefetched values */
rx_desc = next_rxd;
buffer_info = next_buffer;
}
rx_ring->next_to_clean = i;
ixgb_alloc_rx_buffers(adapter);
return cleaned;
}
在非NAPI模式下,会调用netif_rx进入到软中段的网络数据处理中,只不过该步就没有调用poll模式处理。
int netif_rx(struct sk_buff *skb)
{
struct softnet_data *queue;
unsigned long flags;
/* if netpoll wants it, pretend we never saw it */
if (netpoll_rx(skb))
return NET_RX_DROP;
if (!skb->tstamp.off_sec)
net_timestamp(skb);
/*
* The code is rearranged so that the path is the most
* short when CPU is congested, but is still operating.
*/
local_irq_save(flags);
queue = &__get_cpu_var(softnet_data);
__get_cpu_var(netdev_rx_stat).total++;
if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
if (queue->input_pkt_queue.qlen) {
enqueue:
dev_hold(skb->dev);
__skb_queue_tail(&queue->input_pkt_queue, skb);
local_irq_restore(flags);
return NET_RX_SUCCESS;
}
netif_rx_schedule(&queue->backlog_dev); // 调用队列进行接受数据的处理
goto enqueue;
}
__get_cpu_var(netdev_rx_stat).dropped++;
local_irq_restore(flags);
kfree_skb(skb);
return NET_RX_DROP;
}
基本的两种模式下面的网络数据处理就基本分析完成。
总结
本文简单根据Linux-2.6系列的源码,基于网卡ixgb进行了一个网卡数据的简单的接受流程分析,网卡通过ethtool完成对应的接口,并注册对应的驱动,在通过ethtool工具使网卡开始运行的时候,就注册中断处理函数,并通过是否是NAPI处理流程来进行不同的处理,使用了NAPI会使用poll函数在softirq内核线程中,进行网卡数据的处理,效率上会比非NAPI模式要高,在非NAPI模式下面,最后也是通过通过softirq内核线程将获取的网络包进行处理,从而完成网卡数据的接受与处理。由于本人才疏学浅,如有错误请批评指正。