linux网卡驱动注册与接受数据处理

Linux网络接受

本文基于2.6.15的内核版本来简单学习一下有关Linux内核如何进行网络数据的处理流程。

网卡驱动注册

以ixgb驱动为例,在网卡注册的时候会进入如下流程。

static struct pci_driver ixgb_driver = {
	.name     = ixgb_driver_name,
	.id_table = ixgb_pci_tbl,
	.probe    = ixgb_probe,
	.remove   = __devexit_p(ixgb_remove),
};

...
/**
 * ixgb_init_module - Driver Registration Routine
 *
 * ixgb_init_module is the first routine called when the driver is
 * loaded. All it does is register with the PCI subsystem.
 **/

static int __init
ixgb_init_module(void)
{
	printk(KERN_INFO "%s - version %s\n",
	       ixgb_driver_string, ixgb_driver_version);

	printk(KERN_INFO "%s\n", ixgb_copyright);

	return pci_module_init(&ixgb_driver);
}

module_init(ixgb_init_module);

PCI注册的流程会将调用probe方法,即ixgb_probe。

/**
 * ixgb_probe - Device Initialization Routine
 * @pdev: PCI device information struct
 * @ent: entry in ixgb_pci_tbl
 *
 * Returns 0 on success, negative on failure
 *
 * ixgb_probe initializes an adapter identified by a pci_dev structure.
 * The OS initialization, configuring of the adapter private structure,
 * and a hardware reset occur.
 **/

static int __devinit
ixgb_probe(struct pci_dev *pdev,
		const struct pci_device_id *ent)
{
	struct net_device *netdev = NULL;
	struct ixgb_adapter *adapter;
	static int cards_found = 0;
	unsigned long mmio_start;
	int mmio_len;
	int pci_using_dac;
	int i;
	int err;

	if((err = pci_enable_device(pdev)))  // 是否可以注册该设备
		return err;

	if(!(err = pci_set_dma_mask(pdev, DMA_64BIT_MASK))) { //设置DMA的掩码
		pci_using_dac = 1;
	} else {
		if((err = pci_set_dma_mask(pdev, DMA_32BIT_MASK))) {
			IXGB_ERR("No usable DMA configuration, aborting\n");
			return err;
		}
		pci_using_dac = 0;
	}

	if((err = pci_request_regions(pdev, ixgb_driver_name)))
		return err;

	pci_set_master(pdev);

	netdev = alloc_etherdev(sizeof(struct ixgb_adapter)); // 申请设备结构体
	if(!netdev) {
		err = -ENOMEM;
		goto err_alloc_etherdev;
	}

	SET_MODULE_OWNER(netdev);
	SET_NETDEV_DEV(netdev, &pdev->dev);

	pci_set_drvdata(pdev, netdev);    // 设置DMA对应的管理的内存空间
	adapter = netdev_priv(netdev);
	adapter->netdev = netdev;
	adapter->pdev = pdev;
	adapter->hw.back = adapter;

	mmio_start = pci_resource_start(pdev, BAR_0);
	mmio_len = pci_resource_len(pdev, BAR_0);

	adapter->hw.hw_addr = ioremap(mmio_start, mmio_len);
	if(!adapter->hw.hw_addr) {
		err = -EIO;
		goto err_ioremap;
	}

	for(i = BAR_1; i <= BAR_5; i++) {
		if(pci_resource_len(pdev, i) == 0)
			continue;
		if(pci_resource_flags(pdev, i) & IORESOURCE_IO) {
			adapter->hw.io_base = pci_resource_start(pdev, i);
			break;
		}
	}

	netdev->open = &ixgb_open;  // 设置ethtool对应的操作的方法,即设备的打开的方法
	netdev->stop = &ixgb_close;   // 设备的关闭的方法
	netdev->hard_start_xmit = &ixgb_xmit_frame;
	netdev->get_stats = &ixgb_get_stats;
	netdev->set_multicast_list = &ixgb_set_multi;
	netdev->set_mac_address = &ixgb_set_mac;
	netdev->change_mtu = &ixgb_change_mtu;  // 设置mtu长度
	ixgb_set_ethtool_ops(netdev);
	netdev->tx_timeout = &ixgb_tx_timeout;  // 设置超时时间
	netdev->watchdog_timeo = HZ;
#ifdef CONFIG_IXGB_NAPI
	netdev->poll = &ixgb_clean;    // 设置设备的poll方法 一般用在NAPI模式中
	netdev->weight = 64;
#endif
	netdev->vlan_rx_register = ixgb_vlan_rx_register;  // 设置vlan的方法
	netdev->vlan_rx_add_vid = ixgb_vlan_rx_add_vid;
	netdev->vlan_rx_kill_vid = ixgb_vlan_rx_kill_vid;
#ifdef CONFIG_NET_POLL_CONTROLLER
	netdev->poll_controller = ixgb_netpoll;  //是否设置了netpoll的方式
#endif

	netdev->mem_start = mmio_start;
	netdev->mem_end = mmio_start + mmio_len;
	netdev->base_addr = adapter->hw.io_base;

	adapter->bd_number = cards_found;
	adapter->link_speed = 0;
	adapter->link_duplex = 0;

	/* setup the private structure */

	if((err = ixgb_sw_init(adapter)))
		goto err_sw_init;

	netdev->features = NETIF_F_SG |
			   NETIF_F_HW_CSUM |
			   NETIF_F_HW_VLAN_TX |
			   NETIF_F_HW_VLAN_RX |
			   NETIF_F_HW_VLAN_FILTER;
#ifdef NETIF_F_TSO
	netdev->features |= NETIF_F_TSO;
#endif

	if(pci_using_dac)
		netdev->features |= NETIF_F_HIGHDMA;

	/* make sure the EEPROM is good */

	if(!ixgb_validate_eeprom_checksum(&adapter->hw)) {
		printk(KERN_ERR "The EEPROM Checksum Is Not Valid\n");
		err = -EIO;
		goto err_eeprom;
	}

	ixgb_get_ee_mac_addr(&adapter->hw, netdev->dev_addr);
	memcpy(netdev->perm_addr, netdev->dev_addr, netdev->addr_len);

	if(!is_valid_ether_addr(netdev->perm_addr)) {
		err = -EIO;
		goto err_eeprom;
	}

	adapter->part_num = ixgb_get_ee_pba_number(&adapter->hw);

	init_timer(&adapter->watchdog_timer);   // 设置定时器
	adapter->watchdog_timer.function = &ixgb_watchdog;
	adapter->watchdog_timer.data = (unsigned long)adapter;

	INIT_WORK(&adapter->tx_timeout_task,
		  (void (*)(void *))ixgb_tx_timeout_task, netdev);

	if((err = register_netdev(netdev)))  // 注册网络设备
		goto err_register;

	/* we're going to reset, so assume we have no link for now */

	netif_carrier_off(netdev);     // 重置网络设备的参数
	netif_stop_queue(netdev);

	printk(KERN_INFO "%s: Intel(R) PRO/10GbE Network Connection\n",
		   netdev->name);
	ixgb_check_options(adapter);
	/* reset the hardware with the new settings */

	ixgb_reset(adapter);

	cards_found++;
	return 0;

err_register:
err_sw_init:
err_eeprom:
	iounmap(adapter->hw.hw_addr);
err_ioremap:
	free_netdev(netdev);
err_alloc_etherdev:
	pci_release_regions(pdev);
	return err;
}

注册的方式主要就是通过配置一些网络的参数。

打开网卡接受数据

如果我们使用ethtool工具是该网卡打开,此时就会执行如下流程。

static int
ixgb_open(struct net_device *netdev)
{
	struct ixgb_adapter *adapter = netdev_priv(netdev);
	int err;

	/* allocate transmit descriptors */

	if((err = ixgb_setup_tx_resources(adapter)))
		goto err_setup_tx;

	/* allocate receive descriptors */

	if((err = ixgb_setup_rx_resources(adapter)))
		goto err_setup_rx;

	if((err = ixgb_up(adapter)))   // 注册中断函数,响应网卡数据
		goto err_up;

	return 0;

err_up:
	ixgb_free_rx_resources(adapter);
err_setup_rx:
	ixgb_free_tx_resources(adapter);
err_setup_tx:
	ixgb_reset(adapter);

	return err;
}

此时就会调用ixgb_open的函数,进行发送和接受的资源设置,并通过ixgb_up注册网卡的中断回调函数。

int
ixgb_up(struct ixgb_adapter *adapter)
{
	struct net_device *netdev = adapter->netdev;
	int err;
	int max_frame = netdev->mtu + ENET_HEADER_SIZE + ENET_FCS_LENGTH;  // 获取帧大小
	struct ixgb_hw *hw = &adapter->hw;

	/* hardware has been reset, we need to reload some things */

	ixgb_set_multi(netdev);

	ixgb_restore_vlan(adapter);

	ixgb_configure_tx(adapter);  // 配置发送
	ixgb_setup_rctl(adapter);
	ixgb_configure_rx(adapter);  // 配置接受
	ixgb_alloc_rx_buffers(adapter);  // 设置接受ring缓冲区

#ifdef CONFIG_PCI_MSI
	{
	boolean_t pcix = (IXGB_READ_REG(&adapter->hw, STATUS) & 
						  IXGB_STATUS_PCIX_MODE) ? TRUE : FALSE;
	adapter->have_msi = TRUE;

	if (!pcix)
	   adapter->have_msi = FALSE;
	else if((err = pci_enable_msi(adapter->pdev))) {
		printk (KERN_ERR
		 "Unable to allocate MSI interrupt Error: %d\n", err);
		adapter->have_msi = FALSE;
		/* proceed to try to request regular interrupt */
	}
	}

#endif
	if((err = request_irq(adapter->pdev->irq, &ixgb_intr,
				  SA_SHIRQ | SA_SAMPLE_RANDOM,
				  netdev->name, netdev)))  // 注册网卡的中断处理函数,并注册网卡的中断号
		return err;

	/* disable interrupts and get the hardware into a known state */
	IXGB_WRITE_REG(&adapter->hw, IMC, 0xffffffff);

	if((hw->max_frame_size != max_frame) ||
		(hw->max_frame_size !=
		(IXGB_READ_REG(hw, MFS) >> IXGB_MFS_SHIFT))) {

		hw->max_frame_size = max_frame;

		IXGB_WRITE_REG(hw, MFS, hw->max_frame_size << IXGB_MFS_SHIFT);

		if(hw->max_frame_size >
		   IXGB_MAX_ENET_FRAME_SIZE_WITHOUT_FCS + ENET_FCS_LENGTH) {
			uint32_t ctrl0 = IXGB_READ_REG(hw, CTRL0);

			if(!(ctrl0 & IXGB_CTRL0_JFE)) {
				ctrl0 |= IXGB_CTRL0_JFE;
				IXGB_WRITE_REG(hw, CTRL0, ctrl0);
			}
		}
	}

	mod_timer(&adapter->watchdog_timer, jiffies);
	ixgb_irq_enable(adapter);   // 开启中断处理

#ifdef CONFIG_IXGB_NAPI
	netif_poll_enable(netdev);
#endif
	return 0;
}

主要是设置了接受的数据缓冲区,设置了中断的处理函数ixgb_intr,当数据来了之后就会调用ixgb_intr函数进行处理。

static irqreturn_t
ixgb_intr(int irq, void *data, struct pt_regs *regs)
{
	struct net_device *netdev = data;
	struct ixgb_adapter *adapter = netdev_priv(netdev);
	struct ixgb_hw *hw = &adapter->hw;
	uint32_t icr = IXGB_READ_REG(hw, ICR);
#ifndef CONFIG_IXGB_NAPI
	unsigned int i;
#endif

	if(unlikely(!icr))
		return IRQ_NONE;  /* Not our interrupt */

	if(unlikely(icr & (IXGB_INT_RXSEQ | IXGB_INT_LSC))) {
		mod_timer(&adapter->watchdog_timer, jiffies);
	}

#ifdef CONFIG_IXGB_NAPI
	if(netif_rx_schedule_prep(netdev)) {  // 是否是NAPI模式运行

		/* Disable interrupts and register for poll. The flush 
		  of the posted write is intentionally left out.
		*/

		atomic_inc(&adapter->irq_sem);
		IXGB_WRITE_REG(&adapter->hw, IMC, ~0);
		__netif_rx_schedule(netdev);
	}
#else
	/* yes, that is actually a & and it is meant to make sure that
	 * every pass through this for loop checks both receive and
	 * transmit queues for completed descriptors, intended to
	 * avoid starvation issues and assist tx/rx fairness. */
	for(i = 0; i < IXGB_MAX_INTR; i++)  // 通过ixgb_clean_rx_irq进行数据处理
		if(!ixgb_clean_rx_irq(adapter) &
		   !ixgb_clean_tx_irq(adapter))
			break;
#endif 
	return IRQ_HANDLED;
}

此时就看配置的是网卡的哪种数据处理类型,一般有如下两种网卡数据处理流程。

NAPI模式

通过softirq内核线程来进行网卡数据的进行poll进行,从而避免过多的中断来进行处理提升效率。

static inline void __netif_rx_schedule(struct net_device *dev)
{
	unsigned long flags;

	local_irq_save(flags);
	dev_hold(dev);
	list_add_tail(&dev->poll_list, &__get_cpu_var(softnet_data).poll_list); // 加入当前设备的队列从而使在软中断中内容持续接受网卡的数据
	if (dev->quota < 0)
		dev->quota += dev->weight;
	else
		dev->quota = dev->weight;
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);  // 发出软中断,发送数据接受软中断
	local_irq_restore(flags);
}

此时就会调用在NET_RX_SOFTIRQ注册的信号方法。

open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);

此时调用的就是网络处理。

static void net_rx_action(struct softirq_action *h)
{
	struct softnet_data *queue = &__get_cpu_var(softnet_data);
	unsigned long start_time = jiffies;
	int budget = netdev_budget;
	void *have;

	local_irq_disable();

	while (!list_empty(&queue->poll_list)) {  // 检查链表是否为空
		struct net_device *dev;

		if (budget <= 0 || jiffies - start_time > 1)
			goto softnet_break;

		local_irq_enable();  //使能中断

		dev = list_entry(queue->poll_list.next,
				 struct net_device, poll_list);  // 获取设备
		have = netpoll_poll_lock(dev);    

		if (dev->quota <= 0 || dev->poll(dev, &budget)) {  // 获取链表上面的数据
			netpoll_poll_unlock(have);
			local_irq_disable();
			list_del(&dev->poll_list);  
			list_add_tail(&dev->poll_list, &queue->poll_list);
			if (dev->quota < 0)
				dev->quota += dev->weight;
			else
				dev->quota = dev->weight;
		} else {
			netpoll_poll_unlock(have);
			dev_put(dev);  // 将获取数据放回
			local_irq_disable();
		}
	}
out:
	local_irq_enable();  //使能中断
	return;

softnet_break:
	__get_cpu_var(netdev_rx_stat).time_squeeze++;
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	goto out;
}

通过调用dev的poll函数来进行数据的处理。ixgb对应的poll函数为ixgb_clean。

#ifdef CONFIG_IXGB_NAPI
/**
 * ixgb_clean - NAPI Rx polling callback
 * @adapter: board private structure
 **/

static int
ixgb_clean(struct net_device *netdev, int *budget)
{
	struct ixgb_adapter *adapter = netdev_priv(netdev);
	int work_to_do = min(*budget, netdev->quota);
	int tx_cleaned;
	int work_done = 0;

	tx_cleaned = ixgb_clean_tx_irq(adapter);
	ixgb_clean_rx_irq(adapter, &work_done, work_to_do); // 处理单个网络接受数据

	*budget -= work_done;
	netdev->quota -= work_done;  // 减去王超的数量

	/* if no Tx and not enough Rx work done, exit the polling mode */
	if((!tx_cleaned && (work_done == 0)) || !netif_running(netdev)) {
		netif_rx_complete(netdev);  // 如果没了就离开该模式
		ixgb_irq_enable(adapter);
		return 0;
	}

	return 1;
}
#endif

其中ixgb_clean_rx_irq函数,就是我们不经过NAPI模式处理的流程。

数据直接传递

数据直接传递就是直接通过ixgb_clean_rx_irq函数来处理网络包。

static boolean_t
#ifdef CONFIG_IXGB_NAPI
ixgb_clean_rx_irq(struct ixgb_adapter *adapter, int *work_done, int work_to_do)
#else
ixgb_clean_rx_irq(struct ixgb_adapter *adapter)
#endif
{
	struct ixgb_desc_ring *rx_ring = &adapter->rx_ring;
	struct net_device *netdev = adapter->netdev;
	struct pci_dev *pdev = adapter->pdev;
	struct ixgb_rx_desc *rx_desc, *next_rxd;
	struct ixgb_buffer *buffer_info, *next_buffer, *next2_buffer;
	uint32_t length;
	unsigned int i, j;
	boolean_t cleaned = FALSE;

	i = rx_ring->next_to_clean;
	rx_desc = IXGB_RX_DESC(*rx_ring, i);
	buffer_info = &rx_ring->buffer_info[i];

	while(rx_desc->status & IXGB_RX_DESC_STATUS_DD) {
		struct sk_buff *skb, *next_skb;  // skb数据包
		u8 status;

#ifdef CONFIG_IXGB_NAPI
		if(*work_done >= work_to_do)
			break;

		(*work_done)++;
#endif
		status = rx_desc->status;
		skb = buffer_info->skb;  //获取网卡处理好的skb数据

		prefetch(skb->data);

		if(++i == rx_ring->count) i = 0;
		next_rxd = IXGB_RX_DESC(*rx_ring, i);
		prefetch(next_rxd);

		if((j = i + 1) == rx_ring->count) j = 0;
		next2_buffer = &rx_ring->buffer_info[j];  // 获取ring的数据
		prefetch(next2_buffer);

		next_buffer = &rx_ring->buffer_info[i];
		next_skb = next_buffer->skb;
		prefetch(next_skb);

		cleaned = TRUE;

		pci_unmap_single(pdev,
				 buffer_info->dma,
				 buffer_info->length,
				 PCI_DMA_FROMDEVICE);

		length = le16_to_cpu(rx_desc->length);

		if(unlikely(!(status & IXGB_RX_DESC_STATUS_EOP))) {

			/* All receives must fit into a single buffer */

			IXGB_DBG("Receive packet consumed multiple buffers "
					 "length<%x>\n", length);

			dev_kfree_skb_irq(skb);
			goto rxdesc_done;
		}

		if (unlikely(rx_desc->errors
			     & (IXGB_RX_DESC_ERRORS_CE | IXGB_RX_DESC_ERRORS_SE
				| IXGB_RX_DESC_ERRORS_P |
				IXGB_RX_DESC_ERRORS_RXE))) {

			dev_kfree_skb_irq(skb);
			goto rxdesc_done;
		}

		/* Good Receive */
		skb_put(skb, length);   //放入skb中保存

		/* Receive Checksum Offload */
		ixgb_rx_checksum(adapter, rx_desc, skb); //检查checksum

		skb->protocol = eth_type_trans(skb, netdev);  //获取协议
#ifdef CONFIG_IXGB_NAPI
		if(adapter->vlgrp && (status & IXGB_RX_DESC_STATUS_VP)) {
			vlan_hwaccel_receive_skb(skb, adapter->vlgrp,
				le16_to_cpu(rx_desc->special) &
					IXGB_RX_DESC_SPECIAL_VLAN_MASK);
		} else {
			netif_receive_skb(skb);  // 如果是NAPI模式 直接调用netif_receive_skb处理
		}
#else /* CONFIG_IXGB_NAPI */
		if(adapter->vlgrp && (status & IXGB_RX_DESC_STATUS_VP)) {
			vlan_hwaccel_rx(skb, adapter->vlgrp,
				le16_to_cpu(rx_desc->special) &
					IXGB_RX_DESC_SPECIAL_VLAN_MASK);
		} else {
			netif_rx(skb);   // 走单个软中段的处理流程
		}
#endif /* CONFIG_IXGB_NAPI */
		netdev->last_rx = jiffies;

rxdesc_done:
		/* clean up descriptor, might be written over by hw */
		rx_desc->status = 0;
		buffer_info->skb = NULL;

		/* use prefetched values */
		rx_desc = next_rxd;
		buffer_info = next_buffer;
	}

	rx_ring->next_to_clean = i;   

	ixgb_alloc_rx_buffers(adapter);

	return cleaned;
}

在非NAPI模式下,会调用netif_rx进入到软中段的网络数据处理中,只不过该步就没有调用poll模式处理。

int netif_rx(struct sk_buff *skb)
{
	struct softnet_data *queue;
	unsigned long flags;

	/* if netpoll wants it, pretend we never saw it */
	if (netpoll_rx(skb))
		return NET_RX_DROP;

	if (!skb->tstamp.off_sec)
		net_timestamp(skb);

	/*
	 * The code is rearranged so that the path is the most
	 * short when CPU is congested, but is still operating.
	 */
	local_irq_save(flags);
	queue = &__get_cpu_var(softnet_data);

	__get_cpu_var(netdev_rx_stat).total++;
	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
		if (queue->input_pkt_queue.qlen) {
enqueue:
			dev_hold(skb->dev);
			__skb_queue_tail(&queue->input_pkt_queue, skb);
			local_irq_restore(flags);
			return NET_RX_SUCCESS;
		}

		netif_rx_schedule(&queue->backlog_dev);  // 调用队列进行接受数据的处理
		goto enqueue;
	}

	__get_cpu_var(netdev_rx_stat).dropped++;
	local_irq_restore(flags);

	kfree_skb(skb);
	return NET_RX_DROP;
}

基本的两种模式下面的网络数据处理就基本分析完成。

总结

本文简单根据Linux-2.6系列的源码,基于网卡ixgb进行了一个网卡数据的简单的接受流程分析,网卡通过ethtool完成对应的接口,并注册对应的驱动,在通过ethtool工具使网卡开始运行的时候,就注册中断处理函数,并通过是否是NAPI处理流程来进行不同的处理,使用了NAPI会使用poll函数在softirq内核线程中,进行网卡数据的处理,效率上会比非NAPI模式要高,在非NAPI模式下面,最后也是通过通过softirq内核线程将获取的网络包进行处理,从而完成网卡数据的接受与处理。由于本人才疏学浅,如有错误请批评指正。

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值