深入理解Linux网络技术内幕第10章帧的接收_budget exhausted after napi rescheduled-CSDN博客

本文链接：https://blog.csdn.net/weixin_44793395/article/details/106593127

帧的接收

通知内核已接收NAPI和netif_rx
NET_RX_SOFTIRQ处理
process_backlog
netif_receive_skb函数

通知内核已接收NAPI和netif_rx

Linux驱动程序通知内核接收报文有两种方式：

netif_rx函数
NAPI机制

NAPI

NAPI混合中断和轮询，不适用纯粹的中断事件驱动模型。

net_rx_action和NAPI

net_rx_action会查看列表中处于轮询状态的设备，然后为每个设备调用相关联的poll函数，处理入口队列中的帧。该列表中的设备按照循环的方式被查阅，每次poll方法启用时存在能处理的帧数目的最大值。也就是说net_rx_action会持续为入口队列中有数据的设备调用设备驱动程序提供的poll方法，直到入口队列为空，到那时驱动程序重新开启设备的中断功能。
net_rx_action会限制执行时间，当处理完一定数量的报文或者执行时间超过限制后，会自行重新调度准备执行。


static __latent_entropy void net_rx_action(struct softirq_action *h)
{
	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
	unsigned long time_limit = jiffies +
		usecs_to_jiffies(netdev_budget_usecs);
	int budget = netdev_budget;
	LIST_HEAD(list);
	LIST_HEAD(repoll);

	local_irq_disable();
	list_splice_init(&sd->poll_list, &list);
	local_irq_enable();

	for (;;) {
		struct napi_struct *n;

		if (list_empty(&list)) {
			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
				goto out;
			break;
		}

		n = list_first_entry(&list, struct napi_struct, poll_list);
		budget -= napi_poll(n, &repoll);

		/* If softirq window is exhausted then punt.
		 * Allow this to run for 2 jiffies since which will allow
		 * an average latency of 1.5/HZ.
		 */
		if (unlikely(budget <= 0 ||
			     time_after_eq(jiffies, time_limit))) {
			sd->time_squeeze++;
			break;
		}
	}

	local_irq_disable();

	list_splice_tail_init(&sd->poll_list, &list);
	list_splice_tail(&repoll, &list);
	list_splice(&list, &sd->poll_list);
	if (!list_empty(&sd->poll_list))
		__raise_softirq_irqoff(NET_RX_SOFTIRQ);

	net_rps_action_and_irq_enable(sd);
out:
	__kfree_skb_flush();
}

从设备驱动程序角度看，NAPI和非NAPI有两点差异：

NAPI驱动程序必须提供一个poll方法。
NAPI收到报文中断时调用__napi_schedule函数，非NAPI调用netif_rx函数

NAPI分析

首先查看中断函数，e100_intr是选取的某个驱动程序的中断函数。
可以看到在调用__napi_schedule(&nic->napi);函数之前，代码首先使用函数e100_disable_irq(nic)把中断禁用。

static irqreturn_t e100_intr(int irq, void *dev_id)
{
	struct net_device *netdev = dev_id;
	struct nic *nic = netdev_priv(netdev);
	u8 stat_ack = ioread8(&nic->csr->scb.stat_ack);

	netif_printk(nic, intr, KERN_DEBUG, nic->netdev,
		     "stat_ack = 0x%02X\n", stat_ack);

	if (stat_ack == stat_ack_not_ours ||	/* Not our interrupt */
	   stat_ack == stat_ack_not_present)	/* Hardware is ejected */
		return IRQ_NONE;

	/* Ack interrupt(s) */
	iowrite8(stat_ack, &nic->csr->scb.stat_ack);

	/* We hit Receive No Resource (RNR); restart RU after cleaning */
	if (stat_ack & stat_ack_rnr)
		nic->ru_running = RU_SUSPENDED;

	if (likely(napi_schedule_prep(&nic->napi))) {
		e100_disable_irq(nic);
		__napi_schedule(&nic->napi);
	}

	return IRQ_HANDLED;
}

__napi_schedule函数首先是将设备的poll_list加入到softnet_data的poll_list中，然后NET_RX_SOFTIRQ软中断调度准备执行。

/* Called with irq disabled */
static inline void ____napi_schedule(struct softnet_data *sd,
				     struct napi_struct *napi)
{
	list_add_tail(&napi->poll_list, &sd->poll_list);
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
/**
 * __napi_schedule - schedule for receive
 * @n: entry to schedule
 *
 * The entry's receive function will be scheduled to run.
 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
 */
void __napi_schedule(struct napi_struct *n)
{
	unsigned long flags;

	local_irq_save(flags);
	____napi_schedule(this_cpu_ptr(&softnet_data), n);
	local_irq_restore(flags);
}

netif_rx函数

当有输入报文等待处理时，设备驱动程序可以调用netif_rx函数，此函数将sk_buff放入到input_pkt_queue中。
netif_rx函数通常在中断环境下被驱动程序调用当然也有例外比如环路设备，因此netif_rx函数会关闭本地中断函数退出时重新开启中断。

netif_rx函数调用netif_rx_internal函数，在netif_rx_internal中首先判断是否要为接收报文保存时间戳，因为保存时间戳需要消耗性能。

static int netif_rx_internal(struct sk_buff *skb)
{
	int ret;

	net_timestamp_check(netdev_tstamp_prequeue, skb);
	trace_netif_rx(skb);
	unsigned int qtail;
	ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
	put_cpu();
	
	return ret;
}

/**
 *	netif_rx	-	post buffer to the network code
 *	@skb: buffer to post
 *
 *	This function receives a packet from a device driver and queues it for
 *	the upper (protocol) levels to process.  It always succeeds. The buffer
 *	may be dropped during processing for congestion control or by the
 *	protocol layers.
 *
 *	return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped)
 *
 */

int netif_rx(struct sk_buff *skb)
{
	trace_netif_rx_entry(skb);

	return netif_rx_internal(skb);
}

enqueue_to_backlog函数负责把skb放入队列，队列最大长度是netdev_max_backlog。
如果input_pkt_queue此前是空队列，则需要____napi_schedule函数为软中断调度，如果不为空说明已经调度过了没有必要再调用一次。

/*
 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
 * queue (may be a remote CPU queue).
 */
static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
			      unsigned int *qtail)
{
	struct softnet_data *sd;
	unsigned long flags;
	unsigned int qlen;

	sd = &per_cpu(softnet_data, cpu);

	local_irq_save(flags);

	rps_lock(sd);
	if (!netif_running(skb->dev))
		goto drop;
	qlen = skb_queue_len(&sd->input_pkt_queue);
	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
		if (qlen) {
enqueue:
			__skb_queue_tail(&sd->input_pkt_queue, skb);
			input_queue_tail_incr_save(sd, qtail);
			rps_unlock(sd);
			local_irq_restore(flags);
			return NET_RX_SUCCESS;
		}

		/* Schedule NAPI for backlog device
		 * We can use non atomic operation since we own the queue lock
		 */
		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
			if (!rps_ipi_queued(sd))
				____napi_schedule(sd, &sd->backlog);
		}
		goto enqueue;
	}

drop:
	sd->dropped++;
	rps_unlock(sd);

	local_irq_restore(flags);

	atomic_long_inc(&skb->dev->rx_dropped);
	kfree_skb(skb);
	return NET_RX_DROP;
}

NET_RX_SOFTIRQ处理

net_rx_action函数负责处理NET_RX_SOFTIRQ中断，其执行的触发是当驱动程序通知内核有输入报文的存在的时候。报文可以在两个地方等待处理：

非NAPI设备softnet_data的输入队列
设备内存，NAPI驱动程序用poll方法直接从设备中取出报文。

这个函数遍历poll_list列表，这些设备的入口队列中都有报文，为每个设备启用关联的poll函数。
这个函数会控制自己的运行时间和处理报文的数量，避免过分占用CPU运行时间。
每个设备都有自己的运行时间配额，napi_poll函数返回刚刚设备使用了多少配额。
在napi_poll函数中，如果驱动程序的poll函数因为配额不足而返回就将该设备再次放到poll_list的末尾。
如果net_rx_action因用完配额返回，则需要调用__raise_softirq_irqoff函数，安排下一次的运行。


static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
	void *have;
	int work, weight;

	list_del_init(&n->poll_list);

	have = netpoll_poll_lock(n);

	weight = n->weight;

	/* This NAPI_STATE_SCHED test is for avoiding a race
	 * with netpoll's poll_napi().  Only the entity which
	 * obtains the lock and sees NAPI_STATE_SCHED set will
	 * actually make the ->poll() call.  Therefore we avoid
	 * accidentally calling ->poll() when NAPI is not scheduled.
	 */
	work = 0;
	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
		work = n->poll(n, weight);
		trace_napi_poll(n, work, weight);
	}

	WARN_ON_ONCE(work > weight);

	if (likely(work < weight))
		goto out_unlock;

	/* Drivers must not modify the NAPI state if they
	 * consume the entire weight.  In such cases this code
	 * still "owns" the NAPI instance and therefore can
	 * move the instance around on the list at-will.
	 */
	if (unlikely(napi_disable_pending(n))) {
		napi_complete(n);
		goto out_unlock;
	}

	if (n->gro_bitmask) {
		/* flush too old packets
		 * If HZ < 1000, flush all packets.
		 */
		napi_gro_flush(n, HZ >= 1000);
	}

	/* Some drivers may have called napi_schedule
	 * prior to exhausting their budget.
	 */
	if (unlikely(!list_empty(&n->poll_list))) {
		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
			     n->dev ? n->dev->name : "backlog");
		goto out_unlock;
	}

	list_add_tail(&n->poll_list, repoll);

out_unlock:
	netpoll_poll_unlock(have);

	return work;
}

static __latent_entropy void net_rx_action(struct softirq_action *h)
{
	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
	unsigned long time_limit = jiffies +
		usecs_to_jiffies(netdev_budget_usecs);
	int budget = netdev_budget;
	LIST_HEAD(list);
	LIST_HEAD(repoll);

	local_irq_disable();
	list_splice_init(&sd->poll_list, &list);
	local_irq_enable();

	for (;;) {
		struct napi_struct *n;

		if (list_empty(&list)) {
			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
				goto out;
			break;
		}

		n = list_first_entry(&list, struct napi_struct, poll_list);
		budget -= napi_poll(n, &repoll);

		/* If softirq window is exhausted then punt.
		 * Allow this to run for 2 jiffies since which will allow
		 * an average latency of 1.5/HZ.
		 */
		if (unlikely(budget <= 0 ||
			     time_after_eq(jiffies, time_limit))) {
			sd->time_squeeze++;
			break;
		}
	}

	local_irq_disable();

	list_splice_tail_init(&sd->poll_list, &list);
	list_splice_tail(&repoll, &list);
	list_splice(&list, &sd->poll_list);
	if (!list_empty(&sd->poll_list))
		__raise_softirq_irqoff(NET_RX_SOFTIRQ);

	net_rps_action_and_irq_enable(sd);
out:
	__kfree_skb_flush();
}

process_backlog

前文提到netif_rx函数，在该函数中会调用enqueue_to_backlog函数将sk_buff放到input_pkt_queue中。
当input_pkt_queue为空就调用____napi_schedule函数将backlog挂到softnet_data的poll_list队列下。
net_rx_action函数在轮询的时候调用poll函数指针。而backlog的poll指针在net_dev_init函数中被初始化成了process_backlog函数。

static inline void ____napi_schedule(struct softnet_data *sd,
				     struct napi_struct *napi)
{
	list_add_tail(&napi->poll_list, &sd->poll_list);
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}

process_backlog函数将input_pkt_queue上的报文移动到process_queue上处理，为每个报文调用__netif_receive_skb函数进行后续的处理。
该函数知道用完配额或者将队列中的所有报文都处理完毕才会退出。

static int process_backlog(struct napi_struct *napi, int quota)
{
	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
	bool again = true;
	int work = 0;

	/* Check if we have pending ipi, its better to send them now,
	 * not waiting net_rx_action() end.
	 */
	if (sd_has_rps_ipi_waiting(sd)) {
		local_irq_disable();
		net_rps_action_and_irq_enable(sd);
	}

	napi->weight = dev_rx_weight;
	while (again) {
		struct sk_buff *skb;

		while ((skb = __skb_dequeue(&sd->process_queue))) {
			rcu_read_lock();
			__netif_receive_skb(skb);
			rcu_read_unlock();
			input_queue_head_incr(sd);
			if (++work >= quota)
				return work;

		}

		local_irq_disable();
		rps_lock(sd);
		if (skb_queue_empty(&sd->input_pkt_queue)) {
			/*
			 * Inline a custom version of __napi_complete().
			 * only current cpu owns and manipulates this napi,
			 * and NAPI_STATE_SCHED is the only possible flag set
			 * on backlog.
			 * We can use a plain write instead of clear_bit(),
			 * and we dont need an smp_mb() memory barrier.
			 */
			napi->state = 0;
			again = false;
		} else {
			skb_queue_splice_tail_init(&sd->input_pkt_queue,
						   &sd->process_queue);
		}
		rps_unlock(sd);
		local_irq_enable();
	}

	return work;
}

__netif_receive_skb函数负责处理报文，所有的poll函数都用到这个函数。

netif_receive_skb函数

netif_receive_skb函数被调用时，L3协议标识已经从L2报文头中提取出来存入skb->protocol字段中。
netif_receive_skb函数由三个主要任务

把报文的副本传递给每个协议分流器。
把报文的副本传递给L3协议处理函数。
负责处理L2一些功能比如桥接。


/**
 *	netif_receive_skb - process receive buffer from network
 *	@skb: buffer to process
 *
 *	netif_receive_skb() is the main receive data processing function.
 *	It always succeeds. The buffer may be dropped during processing
 *	for congestion control or by the protocol layers.
 *
 *	This function may only be called from softirq context and interrupts
 *	should be enabled.
 *
 *	Return values (usually ignored):
 *	NET_RX_SUCCESS: no congestion
 *	NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
	trace_netif_receive_skb_entry(skb);

	return netif_receive_skb_internal(skb);
}

netif_receive_skb_internal函数调用__netif_receive_skb_one_core函数。在__netif_receive_skb_one_core函数根据protocol将skb传递给和内核已经注册的匹配的协议号的处理模块。

static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
{
	struct net_device *orig_dev = skb->dev;
	struct packet_type *pt_prev = NULL;
	int ret;

	ret = __netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
	if (pt_prev)
		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
	return ret;
}