Linux kernel 帧的接收
本文以e1000驱动为例,基于3.10.0-514.10.2版本内核。
驱动注册 - e1000_init_module
[root@10-254-0-111 ~]# modprobe e1000 #插入e1000模块
或者
[root@10-254-0-111 ~]# insmod /path/to/e1000.ko #insmod后面指定e1000.ko文件路径
上面的操作对应的实现代码如下:
/*** drivers/net/ethernet/intel/e1000/e1000_main.c ***/
// PCI driver instance for the e1000 driver; this is the object handed to
// pci_register_driver() in e1000_init_module().
static struct pci_driver e1000_driver = {
.name = e1000_driver_name, // driver name (per the original note, "e1000" by default)
.id_table = e1000_pci_tbl, // table of NIC devices this driver supports
.probe = e1000_probe, // device init routine: called by the PCI subsystem when a supported device is detected on the bus
.remove = e1000_remove, // called when the device is removed (hot-unplug or driver unload)
#ifdef CONFIG_PM // power management
/* Power Management Hooks */
.suspend = e1000_suspend, // called when the system goes to sleep
.resume = e1000_resume, // called when the system is woken up
#endif
.shutdown = e1000_shutdown, // called when the system shuts down
.err_handler = &e1000_err_handler // error handler callbacks
};
/**
 * e1000_init_module - driver registration routine
 *
 * First routine called when the driver is loaded. It prints the driver
 * banner, registers e1000_driver with the PCI subsystem and, when the
 * copybreak module parameter differs from COPYBREAK_DEFAULT, logs the
 * effective copybreak setting.
 *
 * Returns the result of pci_register_driver().
 **/
static int __init e1000_init_module(void)
{
	int rc;

	pr_info("%s - version %s\n", e1000_driver_string, e1000_driver_version);
	pr_info("%s\n", e1000_copyright);

	/* Hand the driver, in pci_driver form, to the PCI subsystem. */
	rc = pci_register_driver(&e1000_driver);

	/* Nothing to report when copybreak was left at its default. */
	if (copybreak == COPYBREAK_DEFAULT)
		return rc;

	if (copybreak == 0)
		pr_info("copybreak disabled\n");
	else
		pr_info("copybreak enabled for "
			"packets <= %u bytes\n", copybreak);

	/* The registration result is returned regardless of the logging. */
	return rc;
}
module_init(e1000_init_module);
pci_register_driver() 注册的是驱动程序,是把驱动程序安装到内核,准确的说是安装到内核的PCI子系统中。此时还没有设备出现,但是内核已经具备管理e1000设备的能力。
设备发现和初始化 - e1000_probe
刚刚我们通过e1000_init_module()把e1000驱动程序注册到PCI子系统,这样当有e1000设备插入到PCI总线的时候,PCI子系统就可以发现该设备,并调用之前注册的函数e1000_probe()对设备进行初始化:
/*** drivers/net/ethernet/intel/e1000/e1000_main.c ***/
/*
 * Network device operations for e1000 interfaces.
 * e1000_probe() installs this table as netdev->netdev_ops.
 */
static const struct net_device_ops e1000_netdev_ops = {
.ndo_open = e1000_open, // open (bring up) the device
.ndo_stop = e1000_close, // close (bring down) the device
.ndo_start_xmit = e1000_xmit_frame,
.ndo_get_stats = e1000_get_stats,
.ndo_set_rx_mode = e1000_set_rx_mode,
.ndo_set_mac_address = e1000_set_mac,
.ndo_tx_timeout = e1000_tx_timeout,
.ndo_change_mtu = e1000_change_mtu,
.ndo_do_ioctl = e1000_ioctl,
.ndo_validate_addr = eth_validate_addr,
.ndo_vlan_rx_add_vid = e1000_vlan_rx_add_vid,
.ndo_vlan_rx_kill_vid = e1000_vlan_rx_kill_vid,
#ifdef CONFIG_NET_POLL_CONTROLLER
.ndo_poll_controller = e1000_netpoll,
#endif
.ndo_fix_features = e1000_fix_features,
.ndo_set_features = e1000_set_features,
};
/**
* e1000_probe - device initialization routine
* @pdev: PCI device information struct
* @ent: entry in e1000_pci_tbl
*
* Returns 0 on success, negative on failure
*
* e1000_probe initializes an adapter identified by a pci_dev structure.
* The OS initialization, configuring of the adapter private structure,
* and a hardware reset occur.
*
* NOTE: this listing is an excerpt; every "..." line stands for code that
* the article left out (including the error-unwind labels).
**/
static int e1000_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
{
struct net_device *netdev;
struct e1000_adapter *adapter; // driver-private data of the device
struct e1000_hw *hw;
...
err = -ENOMEM;
/* Allocate the net_device, with room for a struct e1000_adapter as private data */
netdev = alloc_etherdev(sizeof(struct e1000_adapter));
if (!netdev)
goto err_alloc_etherdev;
...
/* Fill in the device's private data */
adapter = netdev_priv(netdev);
adapter->netdev = netdev;
adapter->pdev = pdev;
adapter->msg_enable = netif_msg_init(debug, DEFAULT_MSG_ENABLE);
adapter->bars = bars;
adapter->need_ioport = need_ioport;
...
/* Install the device operation tables */
netdev->netdev_ops = &e1000_netdev_ops;
e1000_set_ethtool_ops(netdev);
/**
* Initialize this device's NAPI context, used for bottom-half receive
* processing, with e1000_clean as its poll callback.
* Per the original author's note the call also registers the napi in
* the global napi_hash table - not visible in this excerpt, TODO confirm.
* NOTE(review): 64 is presumably the per-poll weight - confirm.
**/
netif_napi_add(netdev, &adapter->napi, e1000_clean, 64);
/* Set a provisional device name from the PCI device name */
strncpy(netdev->name, pci_name(pdev), sizeof(netdev->name) - 1);
...
/* Initialize deferred work items, e.g. the watchdog task */
INIT_DELAYED_WORK(&adapter->watchdog_task, e1000_watchdog);
INIT_DELAYED_WORK(&adapter->fifo_stall_task,
e1000_82547_tx_fifo_stall_task);
INIT_DELAYED_WORK(&adapter->phy_info_task, e1000_update_phy_info_task);
INIT_WORK(&adapter->reset_task, e1000_reset_task);
...
/* reset the hardware with the new settings */
e1000_reset(adapter);
/* Replace the provisional name with the "eth%d" template before registering
* (presumably expanded to a concrete ethN name during registration - confirm) */
strcpy(netdev->name, "eth%d");
/* Register the net_device with the network stack */
err = register_netdev(netdev);
if (err)
goto err_register;
...
}
开启设备 - e1000_open
当使用用户空间工具(如ifconfig、iproute2的ip命令)开启网络设备时,内核网络子系统(而不是PCI子系统)会通过该设备net_device_ops中的ndo_open回调调用函数e1000_open():
/*** drivers/net/ethernet/intel/e1000/e1000_main.c ***/
/**
 * e1000_open - bring a network interface up
 * @netdev: network interface device structure
 *
 * Returns E1000_SUCCESS on success, -EBUSY while a test is in progress,
 * or the error reported by the failing setup step.
 *
 * Entry point used when the system makes the interface active (IFF_UP).
 * Transmit and receive resources are allocated, the adapter is
 * configured, the interrupt is requested, NAPI and the Tx queue are
 * enabled, and a link status change interrupt is fired so that the
 * watchdog gets started.
 **/
static int e1000_open(struct net_device *netdev)
{
	struct e1000_adapter *adapter = netdev_priv(netdev);
	struct e1000_hw *hw = &adapter->hw;
	int rc;

	/* An open request is rejected while a test is running. */
	if (test_bit(__E1000_TESTING, &adapter->flags))
		return -EBUSY;

	netif_carrier_off(netdev);

	/* Transmit descriptors first ... */
	rc = e1000_setup_all_tx_resources(adapter);
	if (rc)
		goto fail_tx;

	/* ... then receive descriptors. */
	rc = e1000_setup_all_rx_resources(adapter);
	if (rc)
		goto fail_rx;

	e1000_power_up_phy(adapter);

	adapter->mng_vlan_id = E1000_MNG_VLAN_NONE;
	if (hw->mng_cookie.status & E1000_MNG_DHCP_COOKIE_STATUS_VLAN_SUPPORT)
		e1000_update_mng_vlan(adapter);

	/* The adapter has to be able to handle an interrupt before one is
	 * requested: with DEBUG_SHIRQ set the kernel fires an interrupt as
	 * soon as the IRQ is requested, so the clean_rx handler must
	 * already be set up at that point.
	 */
	e1000_configure(adapter);

	/* Request the IRQ (per the article, the handler is e1000_intr). */
	rc = e1000_request_irq(adapter);
	if (rc)
		goto fail_irq;

	/* The remainder mirrors e1000_up(). */
	clear_bit(__E1000_DOWN, &adapter->flags);

	napi_enable(&adapter->napi);

	e1000_irq_enable(adapter);

	netif_start_queue(netdev);

	/* Fire a link status change interrupt to start the watchdog. */
	ew32(ICS, E1000_ICS_LSC);

	return E1000_SUCCESS;

	/* Error unwinding: release resources in reverse order of setup. */
fail_irq:
	e1000_power_down_phy(adapter);
	e1000_free_all_rx_resources(adapter);
fail_rx:
	e1000_free_all_tx_resources(adapter);
fail_tx:
	e1000_reset(adapter);

	return rc;
}
设备中断 - e1000_intr
当网卡接收到数据就会发出中断请求(IRQ),对应的中断处理函数就是e1000_intr(),该函数运行在中断上下文中,不可休眠。
/*** drivers/net/ethernet/intel/e1000/e1000_main.c ***/
/**
* e1000_intr - Interrupt Handler
* @irq: interrupt number
* @data: pointer to a network interface device structure
*
* Returns IRQ_NONE when the interrupt cause register reads zero (the
* interrupt was not raised by this device) and IRQ_HANDLED otherwise.
* On a link-related cause the watchdog work is scheduled; in the normal
* case all device interrupts are masked and NAPI polling is scheduled.
**/
static irqreturn_t e1000_intr(int irq, void *data)
{
struct net_device *netdev = data;
struct e1000_adapter *adapter = netdev_priv(netdev);
struct e1000_hw *hw = &adapter->hw;
u32 icr = er32(ICR);
if (unlikely((!icr)))
return IRQ_NONE; /* Not our interrupt */
/* we might have caused the interrupt, but the above
* read cleared it, and just in case the driver is
* down there is nothing to do so return handled
*/
if (unlikely(test_bit(__E1000_DOWN, &adapter->flags)))
return IRQ_HANDLED;
/* RXSEQ / LSC causes: ask for the link status to be re-read */
if (unlikely(icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC))) {
hw->get_link_status = 1;
/* guard against interrupt when we're going down */
if (!test_bit(__E1000_DOWN, &adapter->flags)){
// kick the watchdog work (delay argument of 1)
schedule_delayed_work(&adapter->watchdog_task, 1);
}
}
/* disable interrupts, without the synchronize_irq bit */
ew32(IMC, ~0);
E1000_WRITE_FLUSH();
if (likely(napi_schedule_prep(&adapter->napi))) {
/* start a fresh byte/packet accounting window for this poll cycle */
adapter->total_tx_bytes = 0;
adapter->total_tx_packets = 0;
adapter->total_rx_bytes = 0;
adapter->total_rx_packets = 0;
/**
* Schedule reception (per the original author's description):
* 1. add the napi to this CPU's softnet_data.poll_list
* 2. raise the NET_RX_SOFTIRQ softirq so the data gets received
**/
__napi_schedule(&adapter->napi);
} else {
/* this really should not happen! if it does it is basically a
* bug, but not a hard error, so enable ints and continue
*/
if (!test_bit(__E1000_DOWN, &adapter->flags))
e1000_irq_enable(adapter);
}
return IRQ_HANDLED;
}
下半部处理 - 软中断
设备初始化完毕并开启后,就进入就绪状态。当有数据到达网卡时就触发中断(硬件中断),内核执行对应的中断处理程序(中断处理程序是上半部,要简短、迅速)。中断处理程序通过__napi_schedule()触发NET_RX_SOFTIRQ软中断,待中断处理程序返回后,内核执行该软中断进行下半部处理。NET_RX_SOFTIRQ对应的软中断处理函数是net_rx_action(),注册的地方是在系统初始化时:
/*** net/core/dev.c ***/
/*
 * Network device subsystem initialization (excerpt; each "..." line marks
 * code elided by the article). The part shown registers the handlers for
 * the network transmit and receive softirqs.
 */
static int __init net_dev_init(void)
{
int i, rc = -ENOMEM;
BUG_ON(!dev_boot_phase);
...
open_softirq(NET_TX_SOFTIRQ, net_tx_action);
open_softirq(NET_RX_SOFTIRQ, net_rx_action); // register the softirq handler for network receive
hotcpu_notifier(dev_cpu_callback, 0);
dst_subsys_init();
rc = 0;
...
}
net_rx_action()函数
/*** net/core/dev.c ***/
/*
 * net_rx_action - handler for the NET_RX_SOFTIRQ softirq.
 *
 * Polls the NAPI instances queued on the current CPU's poll_list until
 * the list is drained, the packet budget is used up, or the time limit
 * expires; whatever is left is put back on the poll_list and the softirq
 * is raised again.
 */
static void net_rx_action(struct softirq_action *h)
{
// per-CPU softnet_data of the current CPU
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
// processing deadline: 2 jiffies from now
unsigned long time_limit = jiffies + 2;
// budget for this run, taken from netdev_budget
int budget = netdev_budget;
LIST_HEAD(list);
LIST_HEAD(repoll);
local_irq_disable();
// move everything from poll_list onto the local list, leaving poll_list empty
list_splice_init(&sd->poll_list, &list);
local_irq_enable();
// process the napi instances on the local list one by one
for (;;) {
struct napi_struct *n;
if (list_empty(&list)) {
/* nothing left locally: return right away unless RPS IPIs are
* pending or some napi asked to be polled again */
if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
return;
break;
}
n = list_first_entry(&list, struct napi_struct, poll_list);
/**
* Poll the device through this napi. Per the original author's note,
* the poll function is supplied by the device driver and attached via
* netif_napi_add() at device initialization; it reads data from the
* hardware and passes the skbs up to the network stack with
* napi_gro_receive() for further processing.
* The value returned by napi_poll() is charged against the budget.
**/
budget -= napi_poll(n, &repoll);
/* If softirq window is exhausted then punt.
* Allow this to run for 2 jiffies since which will allow
* an average latency of 1.5/HZ.
*/
if (unlikely(budget <= 0 ||
time_after_eq(jiffies, time_limit))) {
sd->time_squeeze++;
break;
}
}
__kfree_skb_flush();
local_irq_disable();
// Put unfinished work back on this CPU's poll_list for the next softirq run:
// entries queued on poll_list in the meantime go to the tail of the local
// list, followed by the repoll entries, and the result becomes poll_list.
list_splice_tail_init(&sd->poll_list, &list);
list_splice_tail(&repoll, &list);
list_splice(&list, &sd->poll_list);
// poll_list is not empty (time limit or budget was exceeded, or new work
// arrived): raise the softirq again so it gets processed.
if (!list_empty(&sd->poll_list))
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
net_rps_action_and_irq_enable(sd);
}
看门狗
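看门狗是用来检测设备状态(链路状态)的。e1000_open()在最后通过ew32(ICS, E1000_ICS_LSC)触发一次链路状态变更中断,e1000_intr()在处理该中断时调度watchdog_task,看门狗由此启动;此后看门狗每次执行结束时都会把自己重新调度到2 * HZ(约2秒)之后再次运行。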
/**
* e1000_watchdog - work function
* @work: work struct contained inside adapter struct
**/
static void e1000_watchdog(struct work_struct *work)
{
struct e1000_adapter *adapter = container_of(work,
struct e1000_adapter,
watchdog_task.work);
struct e1000_hw *hw = &adapter->hw;
struct net_device *netdev = adapter->netdev;
struct e1000_tx_ring *txdr = adapter->tx_ring;
u32 link, tctl;
link = e1000_has_link(adapter);
//链路已激活并有载波->链路状态正常,去更新统计数据
if ((netif_carrier_ok(netdev)) && link)
goto link_up;
if (link) {
if (!netif_carrier_ok(netdev)) {
//链路已激活但是没有载波,检测到设备启动
u32 ctrl;
bool txb2b = true;
/* update snapshot of PHY registers on LSC */
e1000_get_speed_and_duplex(hw,
&adapter->link_speed,
&adapter->link_duplex);
ctrl = er32(CTRL);
pr_info("%s NIC Link is Up %d Mbps %s, "
"Flow Control: %s\n",
netdev->name,
adapter->link_speed,
adapter->link_duplex == FULL_DUPLEX ?
"Full Duplex" : "Half Duplex",
((ctrl & E1000_CTRL_TFCE) && (ctrl &
E1000_CTRL_RFCE)) ? "RX/TX" : ((ctrl &
E1000_CTRL_RFCE) ? "RX" : ((ctrl &
E1000_CTRL_TFCE) ? "TX" : "None")));
/* adjust timeout factor according to speed/duplex */
adapter->tx_timeout_factor = 1;
switch (adapter->link_speed) {
case SPEED_10:
txb2b = false;
adapter->tx_timeout_factor = 16;
break;
case SPEED_100:
txb2b = false;
/* maybe add some timeout factor ? */
break;
}
/* enable transmits in the hardware */
tctl = er32(TCTL);
tctl |= E1000_TCTL_EN;
ew32(TCTL, tctl);
netif_carrier_on(netdev);
if (!test_bit(__E1000_DOWN, &adapter->flags))
schedule_delayed_work(&adapter->phy_info_task,
2 * HZ);
adapter->smartspeed = 0;
}
} else {
if (netif_carrier_ok(netdev)) {
//链路未激活但有载波,检测到设备关闭
adapter->link_speed = 0;
adapter->link_duplex = 0;
pr_info("%s NIC Link is Down\n",
netdev->name);
netif_carrier_off(netdev);
if (!test_bit(__E1000_DOWN, &adapter->flags))
schedule_delayed_work(&adapter->phy_info_task,
2 * HZ);
}
e1000_smartspeed(adapter);
}
link_up:
e1000_update_stats(adapter);
hw->tx_packet_delta = adapter->stats.tpt - adapter->tpt_old;
adapter->tpt_old = adapter->stats.tpt;
hw->collision_delta = adapter->stats.colc - adapter->colc_old;
adapter->colc_old = adapter->stats.colc;
adapter->gorcl = adapter->stats.gorcl - adapter->gorcl_old;
adapter->gorcl_old = adapter->stats.gorcl;
adapter->gotcl = adapter->stats.gotcl - adapter->gotcl_old;
adapter->gotcl_old = adapter->stats.gotcl;
e1000_update_adaptive(hw);
if (!netif_carrier_ok(netdev)) {
if (E1000_DESC_UNUSED(txdr) + 1 < txdr->count) {
/* We've lost link, so the controller stops DMA,
* but we've got queued Tx work that's never going
* to get done, so reset controller to flush Tx.
* (Do the reset outside of interrupt context).
*/
adapter->tx_timeout_count++;
schedule_work(&adapter->reset_task);
/* exit immediately since reset is imminent */
return;
}
}
/* Simple mode for Interrupt Throttle Rate (ITR) */
if (hw->mac_type >= e1000_82540 && adapter->itr_setting == 4) {
/* Symmetric Tx/Rx gets a reduced ITR=2000;
* Total asymmetrical Tx or Rx gets ITR=8000;
* everyone else is between 2000-8000.
*/
u32 goc = (adapter->gotcl + adapter->gorcl) / 10000;
u32 dif = (adapter->gotcl > adapter->gorcl ?
adapter->gotcl - adapter->gorcl :
adapter->gorcl - adapter->gotcl) / 10000;
u32 itr = goc > 0 ? (dif * 6000 / goc + 2000) : 8000;
ew32(ITR, 1000000000 / (itr * 256));
}
/* Cause software interrupt to ensure rx ring is cleaned */
ew32(ICS, E1000_ICS_RXDMT0);
/* Force detection of hung controller every watchdog period */
adapter->detect_tx_hung = true;
/* Reschedule the task 2HZ后再次调用看门狗检测设备状态*/
if (!test_bit(__E1000_DOWN, &adapter->flags)){
schedule_delayed_work(&adapter->watchdog_task, 2 * HZ);
}
}
欢迎交流学习!