Linux kernel 帧的接收

Linux kernel 帧的接收

本文以e1000驱动为例,基于3.10.0-514.10.2版本内核。


驱动注册 - e1000_init_module

[root@10-254-0-111 ~]# modprobe e1000  #插入e1000模块
    或者
[root@10-254-0-111 ~]# insmod /path/to/e1000.ko   #insmod后面指定e1000.ko文件路径

上面的操作对应的实现代码如下:

/*** drivers/net/ethernet/intel/e1000/e1000_main.c ***/

 // PCI driver instance that represents the e1000 driver
 static struct pci_driver e1000_driver = {
        .name     = e1000_driver_name, // driver name (per the original note it defaults to "e1000")
        .id_table = e1000_pci_tbl,     // table of NIC devices supported by this driver
        .probe    = e1000_probe,       // device init hook: called by the PCI subsystem when a supported device shows up on the bus
        .remove   = e1000_remove,      // called when the device goes away (hot-unplug or driver removal)
#ifdef CONFIG_PM // power management
        /* Power Management Hooks */
        .suspend  = e1000_suspend,     // called when the system suspends
        .resume   = e1000_resume,      // called when the system resumes
#endif
        .shutdown = e1000_shutdown,    // called when the system shuts down
        .err_handler = &e1000_err_handler // error handlers
};

/**
 * e1000_init_module - Driver Registration Routine
 *
 * Entry point run when the driver module is loaded. It logs the driver
 * banner and copyright line, registers e1000_driver with the PCI
 * subsystem, and reports the copybreak setting when it differs from
 * COPYBREAK_DEFAULT.
 *
 * Returns whatever pci_register_driver() returned.
 **/
static int __init e1000_init_module(void)
{
    int status;

    pr_info("%s - version %s\n", e1000_driver_string, e1000_driver_version);
    pr_info("%s\n", e1000_copyright);

    /* Hand the e1000 driver to the PCI subsystem as a pci_driver. */
    status = pci_register_driver(&e1000_driver);

    /* Nothing to report when copybreak was left at its default. */
    if (copybreak == COPYBREAK_DEFAULT)
        return status;

    if (copybreak != 0)
        pr_info("copybreak enabled for "
               "packets <= %u bytes\n", copybreak);
    else
        pr_info("copybreak disabled\n");

    return status;
}

module_init(e1000_init_module);

pci_register_driver() 注册的是驱动程序,是把驱动程序安装到内核,准确地说是安装到内核的PCI子系统中。此时还没有设备出现,但是内核已经具备管理e1000设备的能力。

设备发现和初始化 - e1000_probe

刚刚我们通过e1000_init_module()把e1000驱动程序注册到PCI子系统,这样当有e1000设备插入到PCI总线的时候,PCI子系统就可以发现该设备,并调用之前注册的函数e1000_probe()对设备进行初始化

/*** drivers/net/ethernet/intel/e1000/e1000_main.c ***/

/* Net device operations table installed on the e1000 net_device */
static const struct net_device_ops e1000_netdev_ops = {
        .ndo_open               = e1000_open,   // bring the device up
        .ndo_stop               = e1000_close,   // bring the device down
        .ndo_start_xmit         = e1000_xmit_frame,
        .ndo_get_stats          = e1000_get_stats,
        .ndo_set_rx_mode        = e1000_set_rx_mode,
        .ndo_set_mac_address    = e1000_set_mac,
        .ndo_tx_timeout         = e1000_tx_timeout,
        .ndo_change_mtu         = e1000_change_mtu,
        .ndo_do_ioctl           = e1000_ioctl,
        .ndo_validate_addr      = eth_validate_addr,
        .ndo_vlan_rx_add_vid    = e1000_vlan_rx_add_vid,
        .ndo_vlan_rx_kill_vid   = e1000_vlan_rx_kill_vid,
#ifdef CONFIG_NET_POLL_CONTROLLER
        .ndo_poll_controller    = e1000_netpoll,
#endif
        .ndo_fix_features       = e1000_fix_features,
        .ndo_set_features       = e1000_set_features,
};

/**
 * e1000_probe - Device Initialization Routine
 * @pdev: PCI device information struct
 * @ent: entry in e1000_pci_tbl
 *
 * Returns 0 on success, negative on failure
 *
 * e1000_probe initializes an adapter identified by a pci_dev structure.
 * The OS initialization, configuring of the adapter private structure,
 * and a hardware reset occur.
 *
 * NOTE: this listing is an excerpt; the "..." lines mark code that was
 * elided (including the declarations of err, bars and need_ioport and
 * the error-unwind labels).
 **/
static int e1000_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
{
    struct net_device *netdev;
    struct e1000_adapter *adapter;   // driver-private data of the device
    struct e1000_hw *hw;

    ...

    err = -ENOMEM;
    /* Allocate the net_device, with room for struct e1000_adapter as private data */
    netdev = alloc_etherdev(sizeof(struct e1000_adapter));
    if (!netdev)
        goto err_alloc_etherdev;

    ...

    /* Fill in the device's private data */
    adapter = netdev_priv(netdev);
    adapter->netdev = netdev;
    adapter->pdev = pdev;
    adapter->msg_enable = netif_msg_init(debug, DEFAULT_MSG_ENABLE);
    adapter->bars = bars;
    adapter->need_ioport = need_ioport;

    ...

    /* Install the net device operations and the ethtool operations */
    netdev->netdev_ops = &e1000_netdev_ops;
    e1000_set_ethtool_ops(netdev);

    /**
     * Set up the device's NAPI context, used for bottom-half receive:
     * e1000_clean is registered as the poll callback with a weight of 64.
     * (Per the original note this also adds the napi to the global
     * napi_hash table - verify against the kernel version in use.)
     **/
    netif_napi_add(netdev, &adapter->napi, e1000_clean, 64);

    /* Give the device a provisional name taken from its PCI name;
     * it is overwritten with the "eth%d" template further down.
     */
    strncpy(netdev->name, pci_name(pdev), sizeof(netdev->name) - 1);

    ...

    /* Initialize the deferred work items, e.g. the watchdog task */
    INIT_DELAYED_WORK(&adapter->watchdog_task, e1000_watchdog);
    INIT_DELAYED_WORK(&adapter->fifo_stall_task,
              e1000_82547_tx_fifo_stall_task);
    INIT_DELAYED_WORK(&adapter->phy_info_task, e1000_update_phy_info_task);
    INIT_WORK(&adapter->reset_task, e1000_reset_task);

    ...

    /* reset the hardware with the new settings */
    e1000_reset(adapter);

    strcpy(netdev->name, "eth%d");

    /* Register the net_device with the networking core */
    err = register_netdev(netdev);
    if (err)
        goto err_register;

    ...
}

开启设备 - e1000_open

当使用用户空间工具(如ifconfig、iproute)开启网络设备时,内核网络子系统就会通过该设备net_device_ops中的ndo_open回调调用函数e1000_open()

/*** drivers/net/ethernet/intel/e1000/e1000_main.c ***/

/**
 * e1000_open - Called when a network interface is made active
 * @netdev: network interface device structure
 *
 * Returns 0 on success, negative value on failure
 *
 * The open entry point is called when a network interface is made
 * active by the system (IFF_UP).  At this point all resources needed
 * for transmit and receive operations are allocated, the interrupt
 * handler is registered with the OS, the watchdog task is started,
 * and the stack is notified that the interface is ready.
 **/
static int e1000_open(struct net_device *netdev)
{
    struct e1000_adapter *adapter = netdev_priv(netdev);
    struct e1000_hw *hw = &adapter->hw;
    int err;

    /* disallow open during test */
    if (test_bit(__E1000_TESTING, &adapter->flags))
        return -EBUSY;

    /* start with carrier off; the watchdog turns it on once link is seen */
    netif_carrier_off(netdev);

    /* allocate transmit descriptors */
    err = e1000_setup_all_tx_resources(adapter);
    if (err)
        goto err_setup_tx;

    /* allocate receive descriptors */
    err = e1000_setup_all_rx_resources(adapter);
    if (err)
        goto err_setup_rx;

    e1000_power_up_phy(adapter);

    adapter->mng_vlan_id = E1000_MNG_VLAN_NONE;
    if ((hw->mng_cookie.status &
              E1000_MNG_DHCP_COOKIE_STATUS_VLAN_SUPPORT)) {
        e1000_update_mng_vlan(adapter);
    }

    /* before we allocate an interrupt, we must be ready to handle it.
     * Setting DEBUG_SHIRQ in the kernel makes it fire an interrupt
     * as soon as we call pci_request_irq, so we have to setup our
     * clean_rx handler before we do so.
     */
    e1000_configure(adapter);

    // request the IRQ; per the original note the handler is e1000_intr
    err = e1000_request_irq(adapter);
    if (err)
        goto err_req_irq;

    /* From here on the code is the same as e1000_up() */
    clear_bit(__E1000_DOWN, &adapter->flags);

    napi_enable(&adapter->napi);

    e1000_irq_enable(adapter);

    netif_start_queue(netdev);

    /**
     * fire a link status change interrupt to start the watchdog
    **/
    ew32(ICS, E1000_ICS_LSC);

    return E1000_SUCCESS;

    /* error unwind: release resources in the reverse order of acquisition */
err_req_irq:
    e1000_power_down_phy(adapter);
    e1000_free_all_rx_resources(adapter);
err_setup_rx:
    e1000_free_all_tx_resources(adapter);
err_setup_tx:
    e1000_reset(adapter);

    return err;
}

设备中断 - e1000_intr

当网卡接收到数据就会发出中断请求(IRQ),对应的中断处理函数就是e1000_intr(),该函数运行在中断上下文中,不可休眠。

/*** drivers/net/ethernet/intel/e1000/e1000_main.c ***/

/**
 * e1000_intr - Interrupt Handler
 * @irq: interrupt number
 * @data: pointer to a network interface device structure
 *
 * Returns IRQ_NONE when the interrupt cause register reads zero,
 * IRQ_HANDLED otherwise. Link events schedule the watchdog task;
 * device interrupts are then masked and the NAPI context is scheduled.
 **/
static irqreturn_t e1000_intr(int irq, void *data)
{
    struct net_device *netdev = data;
    struct e1000_adapter *adapter = netdev_priv(netdev);
    struct e1000_hw *hw = &adapter->hw;
    u32 icr = er32(ICR);

    if (unlikely((!icr)))
        return IRQ_NONE;  /* Not our interrupt */

    /* we might have caused the interrupt, but the above
     * read cleared it, and just in case the driver is
     * down there is nothing to do so return handled
     */
    if (unlikely(test_bit(__E1000_DOWN, &adapter->flags)))
        return IRQ_HANDLED;

    if (unlikely(icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC))) {
        hw->get_link_status = 1;
        /* guard against interrupt when we're going down */
        if (!test_bit(__E1000_DOWN, &adapter->flags)){
            // kick off the watchdog task (delay of 1 jiffy)
            schedule_delayed_work(&adapter->watchdog_task, 1);
        }
    }

    /* disable interrupts, without the synchronize_irq bit */
    ew32(IMC, ~0);
    E1000_WRITE_FLUSH();

    if (likely(napi_schedule_prep(&adapter->napi))) {
        /* reset the per-poll traffic counters before scheduling NAPI */
        adapter->total_tx_bytes = 0;
        adapter->total_tx_packets = 0;
        adapter->total_rx_bytes = 0;
        adapter->total_rx_packets = 0;

        /**
         * Schedule receive processing:
         * 1. add the napi to this CPU's softnet_data.poll_list
         * 2. raise the NET_RX_SOFTIRQ softirq so the data gets received
        **/
        __napi_schedule(&adapter->napi);
    } else {
        /* this really should not happen! if it does it is basically a
         * bug, but not a hard error, so enable ints and continue
         */
        if (!test_bit(__E1000_DOWN, &adapter->flags))
            e1000_irq_enable(adapter);
    }

    return IRQ_HANDLED;
}

下半部处理 - 软中断

设备初始化完毕并开启后,就进入就绪状态。当有数据到达网卡时会触发硬件中断,内核执行对应的中断处理程序(中断处理程序是上半部,要简短、迅速)。中断处理程序通过__napi_schedule()触发NET_RX_SOFTIRQ软中断,待中断处理程序返回后内核再执行该软中断,进行下半部处理。NET_RX_SOFTIRQ对应的软中断处理函数是net_rx_action(),它是在系统初始化时注册的:

/*** net/core/dev.c ***/

/*
 * net_dev_init - excerpt of the networking core's boot-time init.
 * Shown here for the softirq registration: net_tx_action handles
 * NET_TX_SOFTIRQ and net_rx_action handles NET_RX_SOFTIRQ.
 * The "..." lines mark code elided from this listing.
 */
static int __init net_dev_init(void)
{
        int i, rc = -ENOMEM;

        BUG_ON(!dev_boot_phase);

        ...

        open_softirq(NET_TX_SOFTIRQ, net_tx_action);
        open_softirq(NET_RX_SOFTIRQ, net_rx_action); // register the softirq handler for packet reception

        hotcpu_notifier(dev_cpu_callback, 0);
        dst_subsys_init();
        rc = 0;
        ...
}

net_rx_action()函数

/*** net/core/dev.c ***/

/*
 * net_rx_action - handler for the NET_RX_SOFTIRQ softirq.
 * @h: softirq action (not used in the code shown)
 *
 * Takes the NAPI instances queued on the current CPU's poll_list and
 * polls them until the list is empty, the packet budget is used up, or
 * the time limit is reached. Anything left over is put back on
 * poll_list and the softirq is raised again.
 */
static void net_rx_action(struct softirq_action *h)
{
        // per-CPU softnet_data of the current CPU
        struct softnet_data *sd = this_cpu_ptr(&softnet_data);

        // time limit for this run: 2 jiffies from now
        unsigned long time_limit = jiffies + 2;
        // maximum number of packets to process in this run
        int budget = netdev_budget;

        LIST_HEAD(list);
        LIST_HEAD(repoll);

        local_irq_disable();

        // move everything on poll_list onto the local list and leave
        // poll_list empty (the queued napis are now held in list)
        list_splice_init(&sd->poll_list, &list);
        local_irq_enable();

        // process the napis on list one by one
        for (;;) {
                struct napi_struct *n;

                if (list_empty(&list)) {
                        if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
                                return;
                        break;
                }

                n = list_first_entry(&list, struct napi_struct, poll_list);
                /**
                 * Poll the device through its napi poll callback. The
                 * callback is supplied by the driver and registered at
                 * device init time via netif_napi_add(). Per the original
                 * note, it reads frames from the hardware and passes the
                 * skbs up the stack through napi_gro_receive().
                **/
                budget -= napi_poll(n, &repoll);

                /* If softirq window is exhausted then punt.
                 * Allow this to run for 2 jiffies since which will allow
                 * an average latency of 1.5/HZ.
                 */
                if (unlikely(budget <= 0 ||
                             time_after_eq(jiffies, time_limit))) {
                        sd->time_squeeze++;
                        break;
                }
        }

        __kfree_skb_flush();
        local_irq_disable();

        // put the unfinished napis back on this CPU's poll_list so they
        // are handled the next time the softirq runs
        list_splice_tail_init(&sd->poll_list, &list);
        list_splice_tail(&repoll, &list);
        list_splice(&list, &sd->poll_list);

        // poll_list is not empty (time limit or packet budget was hit,
        // or new work arrived): raise the softirq again
        if (!list_empty(&sd->poll_list))
                __raise_softirq_irqoff(NET_RX_SOFTIRQ);

        net_rps_action_and_irq_enable(sd);
}

看门狗

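看门狗是用来检测设备链路状态的。e1000_open()末尾通过ew32(ICS, E1000_ICS_LSC)触发一次链路状态变更中断,e1000_intr()在处理该中断时调度watchdog_task,看门狗由此启动;此后只要设备未关闭,它每隔2*HZ重新调度自己一次。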

/**
 * e1000_watchdog - work function
 * @work: work struct contained inside adapter struct
 *
 * Periodic link monitor: detects link up/down transitions and updates
 * the carrier state accordingly, refreshes statistics, schedules a
 * reset when Tx work is stuck without link, and re-arms itself every
 * 2 * HZ while the adapter is not marked down.
 **/
static void e1000_watchdog(struct work_struct *work)
{
    struct e1000_adapter *adapter = container_of(work,
                             struct e1000_adapter,
                             watchdog_task.work);
    struct e1000_hw *hw = &adapter->hw;
    struct net_device *netdev = adapter->netdev;
    struct e1000_tx_ring *txdr = adapter->tx_ring;
    u32 link, tctl;

    link = e1000_has_link(adapter);
    // carrier already on and link present: state unchanged, just update statistics
    if ((netif_carrier_ok(netdev)) && link)
        goto link_up;

    if (link) {
        if (!netif_carrier_ok(netdev)) {
            // link present but carrier still off: the link has just come up
            u32 ctrl;
            bool txb2b = true;
            /* update snapshot of PHY registers on LSC */
            e1000_get_speed_and_duplex(hw,
                           &adapter->link_speed,
                           &adapter->link_duplex);

            ctrl = er32(CTRL);
            pr_info("%s NIC Link is Up %d Mbps %s, "
                "Flow Control: %s\n",
                netdev->name,
                adapter->link_speed,
                adapter->link_duplex == FULL_DUPLEX ?
                "Full Duplex" : "Half Duplex",
                ((ctrl & E1000_CTRL_TFCE) && (ctrl &
                E1000_CTRL_RFCE)) ? "RX/TX" : ((ctrl &
                E1000_CTRL_RFCE) ? "RX" : ((ctrl &
                E1000_CTRL_TFCE) ? "TX" : "None")));

            /* adjust timeout factor according to speed/duplex */
            adapter->tx_timeout_factor = 1;
            switch (adapter->link_speed) {
            case SPEED_10:
                txb2b = false;
                adapter->tx_timeout_factor = 16;
                break;
            case SPEED_100:
                txb2b = false;
                /* maybe add some timeout factor ? */
                break;
            }

            /* enable transmits in the hardware */
            tctl = er32(TCTL);
            tctl |= E1000_TCTL_EN;
            ew32(TCTL, tctl);

            netif_carrier_on(netdev);
            if (!test_bit(__E1000_DOWN, &adapter->flags))
                schedule_delayed_work(&adapter->phy_info_task,
                              2 * HZ);
            adapter->smartspeed = 0;
        }
    } else {
        if (netif_carrier_ok(netdev)) {
            // no link but carrier still on: the link has just gone down
            adapter->link_speed = 0;
            adapter->link_duplex = 0;
            pr_info("%s NIC Link is Down\n",
                netdev->name);
            netif_carrier_off(netdev);

            if (!test_bit(__E1000_DOWN, &adapter->flags))
                schedule_delayed_work(&adapter->phy_info_task,
                              2 * HZ);
        }

        e1000_smartspeed(adapter);
    }

link_up:
    e1000_update_stats(adapter);

    /* compute per-period deltas against the values saved on the previous run */
    hw->tx_packet_delta = adapter->stats.tpt - adapter->tpt_old;
    adapter->tpt_old = adapter->stats.tpt;
    hw->collision_delta = adapter->stats.colc - adapter->colc_old;
    adapter->colc_old = adapter->stats.colc;

    adapter->gorcl = adapter->stats.gorcl - adapter->gorcl_old;
    adapter->gorcl_old = adapter->stats.gorcl;
    adapter->gotcl = adapter->stats.gotcl - adapter->gotcl_old;
    adapter->gotcl_old = adapter->stats.gotcl;

    e1000_update_adaptive(hw);

    if (!netif_carrier_ok(netdev)) {
        if (E1000_DESC_UNUSED(txdr) + 1 < txdr->count) {
            /* We've lost link, so the controller stops DMA,
             * but we've got queued Tx work that's never going
             * to get done, so reset controller to flush Tx.
             * (Do the reset outside of interrupt context).
             */
            adapter->tx_timeout_count++;
            schedule_work(&adapter->reset_task);
            /* exit immediately since reset is imminent */
            return;
        }
    }

    /* Simple mode for Interrupt Throttle Rate (ITR) */
    if (hw->mac_type >= e1000_82540 && adapter->itr_setting == 4) {
        /* Symmetric Tx/Rx gets a reduced ITR=2000;
         * Total asymmetrical Tx or Rx gets ITR=8000;
         * everyone else is between 2000-8000.
         */
        u32 goc = (adapter->gotcl + adapter->gorcl) / 10000;
        u32 dif = (adapter->gotcl > adapter->gorcl ?
                adapter->gotcl - adapter->gorcl :
                adapter->gorcl - adapter->gotcl) / 10000;
        u32 itr = goc > 0 ? (dif * 6000 / goc + 2000) : 8000;

        ew32(ITR, 1000000000 / (itr * 256));
    }

    /* Cause software interrupt to ensure rx ring is cleaned */
    ew32(ICS, E1000_ICS_RXDMT0);

    /* Force detection of hung controller every watchdog period */
    adapter->detect_tx_hung = true;

    /* Reschedule the task: run the watchdog again after 2 * HZ jiffies */
    if (!test_bit(__E1000_DOWN, &adapter->flags)){
        schedule_delayed_work(&adapter->watchdog_task, 2 * HZ);
    }
}

欢迎交流学习!

评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值