网络收包流程-软中断中process_backlog和poll方式处理流程（二）

最新推荐文章于 2023-05-09 20:00:34 发布

菜鸟别浪

最新推荐文章于 2023-05-09 20:00:34 发布

阅读量2.4k

点赞数

分类专栏：网络 linux tcp/ip 文章标签： process_backlog NAPI napi_poll

本文链接：https://blog.csdn.net/hzj_001/article/details/100708621

版权

linux 同时被 3 个专栏收录

83 篇文章 20 订阅

订阅专栏

网络

25 篇文章 7 订阅

订阅专栏

tcp/ip

12 篇文章 9 订阅

订阅专栏

在硬中断中触发了软中断后，最终会调用软中断处理函数 net_rx_action，注意：硬中断流程触发软中断后退出中断上下文，但是并不会立刻进入软中断，具体的实现需要了解软中断处理流程。
1.软中断处理函数net_rx_action

具体实现详解：

static void net_rx_action(struct softirq_action *h)
{
    struct softnet_data *sd = this_cpu_ptr(&softnet_data);//获取当前cpu的sd变量
    unsigned long time_limit = jiffies + 2;
    int budget = netdev_budget;这个值就是 net.core.netdev_max_backlog，通过sysctl来修改，表示一次软中断处理skb的数目，系统默认定义为300
    LIST_HEAD(list);
    LIST_HEAD(repoll);

    local_irq_disable();//禁止中断（中断响应的时候会把特定于设备的poll_list放入到sd中）会把获取sd的poll_list链表 
    list_splice_init(&sd->poll_list, &list);//将sd->poll_list接到list的开头
    local_irq_enable();//打开中断，正常处理poll_list

    for (;;) {
        struct napi_struct *n;

        if (list_empty(&list)) {
            if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))//检查POLL队列(poll_list)上是否有设备在准备等待轮询
                return;
            break;
        }

        n = list_first_entry(&list, struct napi_struct, poll_list);//轮询sd->poll_list上的所有设备
        budget -= napi_poll(n, &repoll);//调用poll函数从网卡驱动中读取一定数量的skb

        /* If softirq window is exhausted then punt.
         * Allow this to run for 2 jiffies since which will allow
         * an average latency of 1.5/HZ.
         */
        if (unlikely(budget <= 0 ||
                 time_after_eq(jiffies, time_limit))) {//如果读取的数量超过300或者时间超过一个jiffies，则终止中断处理
            sd->time_squeeze++;
            break;
        }
    }

    __kfree_skb_flush();
    local_irq_disable();//同上

    list_splice_tail_init(&sd->poll_list, &list);
    list_splice_tail(&repoll, &list);
    list_splice(&list, &sd->poll_list);//将未处理完的list设备链表接到sd->poll_list开头
    if (!list_empty(&sd->poll_list))  //如果poll list中不为空，表示还有skb没有读取完成，则继续读取，触发下一次软中断
        __raise_softirq_irqoff(NET_RX_SOFTIRQ);

    net_rps_action_and_irq_enable(sd);//本地中断开启，根据条件发送IPI给其他CPU
}

2.napi_poll

/*
 * Poll one NAPI instance.
 *
 * Removes @n from the list it is on, calls its ->poll() callback with
 * n->weight as the quota (only if NAPI_STATE_SCHED is set), and, when the
 * whole weight was consumed, appends @n to @repoll so it is polled again
 * later. Returns the number of packets ->poll() reported as processed.
 */
static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
    void *have;
    int work, weight;

    list_del_init(&n->poll_list);//unlink n from the list being processed

    have = netpoll_poll_lock(n);

    weight = n->weight;//per-call quota: the most packets ->poll() may process this time (presumably set by the driver; not visible here)

    /* This NAPI_STATE_SCHED test is for avoiding a race
     * with netpoll's poll_napi().  Only the entity which
     * obtains the lock and sees NAPI_STATE_SCHED set will
     * actually make the ->poll() call.  Therefore we avoid
     * accidentally calling ->poll() when NAPI is not scheduled.
     */
    work = 0;
    if (test_bit(NAPI_STATE_SCHED, &n->state)) {//only poll while the instance is marked as scheduled
        work = n->poll(n, weight);//run the poll callback: process_backlog for non-NAPI devices, the driver's own poll function for NAPI devices
        trace_napi_poll(n);
    }

    WARN_ON_ONCE(work > weight);

    if (likely(work < weight))//less than the quota was processed: nothing more to read, done
        goto out_unlock;

    //work == weight: the quota was fully used, so there may be more packets pending
    /* Drivers must not modify the NAPI state if they
     * consume the entire weight.  In such cases this code
     * still "owns" the NAPI instance and therefore can
     * move the instance around on the list at-will.
     */
    if (unlikely(napi_disable_pending(n))) {//a disable is pending: complete the instance instead of re-queuing it
        napi_complete(n);
        goto out_unlock;
    }

    if (n->gro_list) {//skbs are waiting to be merged by GRO: flush the stale ones
        /* flush too old packets
         * If HZ < 1000, flush all packets.
         */
        napi_gro_flush(n, HZ >= 1000);
    }

    /* Some drivers may have called napi_schedule
     * prior to exhausting their budget.
     */
    if (unlikely(!list_empty(&n->poll_list))) {
        pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
                 n->dev ? n->dev->name : "backlog");
        goto out_unlock;
    }

    list_add_tail(&n->poll_list, repoll);//not finished: queue n at the tail of repoll so it is polled again after the other devices have had their turn

out_unlock:
    netpoll_poll_unlock(have);

    return work;
}

3.对于非NAPI方式的网卡收包，最终会调用process_backlog来处理网络分组。

process_backlog主要完成二项工作：
1) __skb_dequeue从等待队列移除一个套接字缓冲区，该缓冲区管理着一个接收到的分组。
2) 调用netif_receive_skb函数分析分组类型，以便根据分组类型将分组传递给网络层的接收函数（即传递到网络系统的更高一层）。为此，该函数遍历所有可能负责当前分组类型的所有网络层函数,一一调用deliver_skb函数（ deliver_skb函数使用一个特定于分组类型的处理程序func，承担对分组的更高层（例如互联网络层）的处理）。
具体实现详解：

/*
 * Poll callback of the per-CPU backlog NAPI instance (used for non-NAPI
 * devices). Delivers up to @quota skbs from sd->process_queue to
 * __netif_receive_skb(), refilling process_queue from sd->input_pkt_queue
 * whenever it runs empty. Returns the number of skbs processed.
 */
static int process_backlog(struct napi_struct *napi, int quota)
{
    int work = 0;
    struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);//softnet_data that embeds this backlog napi

    /* Check if we have pending ipi, its better to send them now,
     * not waiting net_rx_action() end.
     */
    if (sd_has_rps_ipi_waiting(sd)) {//RPS IPIs are pending: send them to the other CPUs now
        local_irq_disable();
        net_rps_action_and_irq_enable(sd);
    }

    napi->weight = weight_p;//refresh the per-poll packet limit from weight_p (NOTE(review): mainline default is 64, not 6 as the original note claimed - confirm for your kernel)
    local_irq_disable();//IRQs off while the queues are manipulated
    while (work < quota) {//keep going while quota remains
        struct sk_buff *skb;
        unsigned int qlen;

        while ((skb = __skb_dequeue(&sd->process_queue))) {//hand skbs from process_queue to the upper layers until it is empty or the quota is used up
            rcu_read_lock();
            local_irq_enable();//IRQs on while the packet is processed
            __netif_receive_skb(skb);//deliver the packet to the protocol stack
            rcu_read_unlock();
            local_irq_disable();
            input_queue_head_incr(sd);//advance the input queue head by one
            if (++work >= quota) {//quota reached: stop
                local_irq_enable();
                return work;//number of packets processed
            }
        }
        //process_queue is empty: refill it from input_pkt_queue
        rps_lock(sd);
        qlen = skb_queue_len(&sd->input_pkt_queue);//number of skbs waiting in input_pkt_queue
        if (qlen)  //input_pkt_queue is not empty
            skb_queue_splice_tail_init(&sd->input_pkt_queue,
                           &sd->process_queue); //move all of them to process_queue and keep processing

        if (qlen < quota - work) {//remaining quota exceeds what was just moved over: mark the backlog napi complete and shrink quota so the loop ends after those skbs
            /*
             * Inline a custom version of __napi_complete().
             * only current cpu owns and manipulates this napi,
             * and NAPI_STATE_SCHED is the only possible flag set on backlog.
             * we can use a plain write instead of clear_bit(),
             * and we dont need an smp_mb() memory barrier.
             */
            napi->state = 0;

            quota = work + qlen;
        }
        rps_unlock(sd);
    }
    local_irq_enable();

    return work;
}

4.NAPI的poll函数处理，以gro_cell_poll为例

/*
 * NAPI poll callback of a gro_cell: feeds up to @budget skbs queued on
 * cell->napi_skbs into GRO and returns how many were processed.
 * If the queue runs empty before the budget is used up, polling is
 * completed via napi_complete_done().
 */
/* called under BH context */
static inline int gro_cell_poll(struct napi_struct *napi, int budget)
{
    struct gro_cell *cell = container_of(napi, struct gro_cell, napi);
    struct sk_buff *skb;
    int work_done = 0;

    while (work_done < budget) {//keep receiving while budget remains
        skb = __skb_dequeue(&cell->napi_skbs);//take one packet off the queue
        if (!skb)
            break;//queue is empty: done
        napi_gro_receive(napi, skb);//hand the packet to GRO
        work_done++;//count the processed packet
    }

    if (work_done < budget)//queue drained before the budget was used up: complete this poll
        napi_complete_done(napi, work_done);
    return work_done;//number of packets received
}

总结：
poll函数最多允许处理budget个分组。该函数返回实际上处理的分组的数目。它的处理存在以下两种情况：
1）如果处理分组的数目小于预算，说明没有更多的分组，Rx缓冲区已空（否则肯定还有剩余分组需要处理，返回值就不可能小于预算）。此时驱动通过 netif_rx_complete（旧接口名；上文代码中对应的是 napi_complete / napi_complete_done）将该情况通知内核，内核将从轮询表移除该设备。接下来，驱动程序必须通过特定于硬件的适当方法来重新启用IRQ。
2）已经完全用掉了预算，但仍然有更多的分组需要处理。设备仍然留在轮询表上，不启用中断。