帧的接收
通知内核已接收NAPI和netif_rx
Linux驱动程序通知内核接收报文有两种方式:
- netif_rx函数
- NAPI机制
NAPI
NAPI混合中断和轮询,不适用纯粹的中断事件驱动模型。
net_rx_action和NAPI
net_rx_action会查看列表中处于轮询状态的设备,然后为每个设备调用相关联的poll函数,处理入口队列中的帧。该列表中的设备按照循环的方式被查阅,每次poll方法启用时存在能处理的帧数目的最大值。也就是说net_rx_action会持续为入口队列中有数据的设备调用设备驱动程序提供的poll方法,直到入口队列为空,到那时驱动程序重新开启设备的中断功能。
net_rx_action会限制执行时间,当处理完一定数量的报文或者执行时间超过限制后,会自行重新调度准备执行。
static __latent_entropy void net_rx_action(struct softirq_action *h)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
unsigned long time_limit = jiffies +
usecs_to_jiffies(netdev_budget_usecs);
int budget = netdev_budget;
LIST_HEAD(list);
LIST_HEAD(repoll);
local_irq_disable();
list_splice_init(&sd->poll_list, &list);
local_irq_enable();
for (;;) {
struct napi_struct *n;
if (list_empty(&list)) {
if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
goto out;
break;
}
n = list_first_entry(&list, struct napi_struct, poll_list);
budget -= napi_poll(n, &repoll);
/* If softirq window is exhausted then punt.
* Allow this to run for 2 jiffies since which will allow
* an average latency of 1.5/HZ.
*/
if (unlikely(budget <= 0 ||
time_after_eq(jiffies, time_limit))) {
sd->time_squeeze++;
break;
}
}
local_irq_disable();
list_splice_tail_init(&sd->poll_list, &list);
list_splice_tail(&repoll, &list);
list_splice(&list, &sd->poll_list);
if (!list_empty(&sd->poll_list))
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
net_rps_action_and_irq_enable(sd);
out:
__kfree_skb_flush();
}
从设备驱动程序角度看,NAPI和非NAPI有两点差异:
- NAPI驱动程序必须提供一个poll方法。
- NAPI收到报文中断时调用__napi_schedule函数,非NAPI调用netif_rx函数
NAPI分析
首先查看中断函数,e100_intr是选取的某个驱动程序的中断函数。
可以看到在调用__napi_schedule(&nic->napi);函数之前,代码首先使用函数e100_disable_irq(nic)把中断禁用。
static irqreturn_t e100_intr(int irq, void *dev_id)
{
struct net_device *netdev = dev_id;
struct nic *nic = netdev_priv(netdev);
u8 stat_ack = ioread8(&nic->csr->scb.stat_ack);
netif_printk(nic, intr, KERN_DEBUG, nic->netdev,
"stat_ack = 0x%02X\n", stat_ack);
if (stat_ack == stat_ack_not_ours || /* Not our interrupt */
stat_ack == stat_ack_not_present) /* Hardware is ejected */
return IRQ_NONE;
/* Ack interrupt(s) */
iowrite8(stat_ack, &nic->csr->scb.stat_ack);
/* We hit Receive No Resource (RNR); restart RU after cleaning */
if (stat_ack & stat_ack_rnr)
nic->ru_running = RU_SUSPENDED;
if (likely(napi_schedule_prep(&nic->napi))) {
e100_disable_irq(nic);
__napi_schedule(&nic->napi);
}
return IRQ_HANDLED;
}
__napi_schedule函数首先是将设备的poll_list加入到softnet_data的poll_list中,然后NET_RX_SOFTIRQ软中断调度准备执行。
/* Called with irq disabled */
static inline void ____napi_schedule(struct softnet_data *sd,
struct napi_struct *napi)
{
list_add_tail(&napi->poll_list, &sd->poll_list);
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
/**
* __napi_schedule - schedule for receive
* @n: entry to schedule
*
* The entry's receive function will be scheduled to run.
* Consider using __napi_schedule_irqoff() if hard irqs are masked.
*/
void __napi_schedule(struct napi_struct *n)
{
unsigned long flags;
local_irq_save(flags);
____napi_schedule(this_cpu_ptr(&softnet_data), n);
local_irq_restore(flags);
}
netif_rx函数
当有输入报文等待处理时,设备驱动程序可以调用netif_rx函数,此函数将sk_buff放入到input_pkt_queue中。
netif_rx函数通常在中断环境下被驱动程序调用当然也有例外比如环路设备,因此netif_rx函数会关闭本地中断函数退出时重新开启中断。
netif_rx函数调用netif_rx_internal函数,在netif_rx_internal中首先判断是否要为接收报文保存时间戳,因为保存时间戳需要消耗性能。
static int netif_rx_internal(struct sk_buff *skb)
{
int ret;
net_timestamp_check(netdev_tstamp_prequeue, skb);
trace_netif_rx(skb);
unsigned int qtail;
ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
put_cpu();
return ret;
}
/**
* netif_rx - post buffer to the network code
* @skb: buffer to post
*
* This function receives a packet from a device driver and queues it for
* the upper (protocol) levels to process. It always succeeds. The buffer
* may be dropped during processing for congestion control or by the
* protocol layers.
*
* return values:
* NET_RX_SUCCESS (no congestion)
* NET_RX_DROP (packet was dropped)
*
*/
int netif_rx(struct sk_buff *skb)
{
trace_netif_rx_entry(skb);
return netif_rx_internal(skb);
}
enqueue_to_backlog函数负责把skb放入队列,队列最大长度是netdev_max_backlog。
如果input_pkt_queue此前是空队列,则需要____napi_schedule函数为软中断调度,如果不为空说明已经调度过了没有必要再调用一次。
/*
* enqueue_to_backlog is called to queue an skb to a per CPU backlog
* queue (may be a remote CPU queue).
*/
static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
unsigned int *qtail)
{
struct softnet_data *sd;
unsigned long flags;
unsigned int qlen;
sd = &per_cpu(softnet_data, cpu);
local_irq_save(flags);
rps_lock(sd);
if (!netif_running(skb->dev))
goto drop;
qlen = skb_queue_len(&sd->input_pkt_queue);
if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
if (qlen) {
enqueue:
__skb_queue_tail(&sd->input_pkt_queue, skb);
input_queue_tail_incr_save(sd, qtail);
rps_unlock(sd);
local_irq_restore(flags);
return NET_RX_SUCCESS;
}
/* Schedule NAPI for backlog device
* We can use non atomic operation since we own the queue lock
*/
if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
if (!rps_ipi_queued(sd))
____napi_schedule(sd, &sd->backlog);
}
goto enqueue;
}
drop:
sd->dropped++;
rps_unlock(sd);
local_irq_restore(flags);
atomic_long_inc(&skb->dev->rx_dropped);
kfree_skb(skb);
return NET_RX_DROP;
}
NET_RX_SOFTIRQ处理
net_rx_action函数负责处理NET_RX_SOFTIRQ中断,其执行的触发是当驱动程序通知内核有输入报文的存在的时候。报文可以在两个地方等待处理:
- 非NAPI设备softnet_data的输入队列
- 设备内存,NAPI驱动程序用poll方法直接从设备中取出报文。
这个函数遍历poll_list列表,这些设备的入口队列中都有 报文,为每个设备启用关联的poll函数。
这个函数会控制自己的运行时间和处理报文的数量,避免过分占用CPU运行时间。
每个设备都有自己的运行时间配额,napi_poll函数返回刚刚设备使用了多少配额。
在napi_poll函数中,如果驱动程序的poll函数因为配额不足而返回就将该设备再次放到poll_list的末尾。
如果net_rx_action因用完配额返回,则需要调用__raise_softirq_irqoff函数,安排下一次的运行。
static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
void *have;
int work, weight;
list_del_init(&n->poll_list);
have = netpoll_poll_lock(n);
weight = n->weight;
/* This NAPI_STATE_SCHED test is for avoiding a race
* with netpoll's poll_napi(). Only the entity which
* obtains the lock and sees NAPI_STATE_SCHED set will
* actually make the ->poll() call. Therefore we avoid
* accidentally calling ->poll() when NAPI is not scheduled.
*/
work = 0;
if (test_bit(NAPI_STATE_SCHED, &n->state)) {
work = n->poll(n, weight);
trace_napi_poll(n, work, weight);
}
WARN_ON_ONCE(work > weight);
if (likely(work < weight))
goto out_unlock;
/* Drivers must not modify the NAPI state if they
* consume the entire weight. In such cases this code
* still "owns" the NAPI instance and therefore can
* move the instance around on the list at-will.
*/
if (unlikely(napi_disable_pending(n))) {
napi_complete(n);
goto out_unlock;
}
if (n->gro_bitmask) {
/* flush too old packets
* If HZ < 1000, flush all packets.
*/
napi_gro_flush(n, HZ >= 1000);
}
/* Some drivers may have called napi_schedule
* prior to exhausting their budget.
*/
if (unlikely(!list_empty(&n->poll_list))) {
pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
n->dev ? n->dev->name : "backlog");
goto out_unlock;
}
list_add_tail(&n->poll_list, repoll);
out_unlock:
netpoll_poll_unlock(have);
return work;
}
static __latent_entropy void net_rx_action(struct softirq_action *h)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
unsigned long time_limit = jiffies +
usecs_to_jiffies(netdev_budget_usecs);
int budget = netdev_budget;
LIST_HEAD(list);
LIST_HEAD(repoll);
local_irq_disable();
list_splice_init(&sd->poll_list, &list);
local_irq_enable();
for (;;) {
struct napi_struct *n;
if (list_empty(&list)) {
if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
goto out;
break;
}
n = list_first_entry(&list, struct napi_struct, poll_list);
budget -= napi_poll(n, &repoll);
/* If softirq window is exhausted then punt.
* Allow this to run for 2 jiffies since which will allow
* an average latency of 1.5/HZ.
*/
if (unlikely(budget <= 0 ||
time_after_eq(jiffies, time_limit))) {
sd->time_squeeze++;
break;
}
}
local_irq_disable();
list_splice_tail_init(&sd->poll_list, &list);
list_splice_tail(&repoll, &list);
list_splice(&list, &sd->poll_list);
if (!list_empty(&sd->poll_list))
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
net_rps_action_and_irq_enable(sd);
out:
__kfree_skb_flush();
}
process_backlog
前文提到netif_rx函数,在该函数中会调用enqueue_to_backlog函数将sk_buff放到input_pkt_queue中。
当input_pkt_queue为空就调用____napi_schedule函数将backlog挂到softnet_data的poll_list队列下。
net_rx_action函数在轮询的时候调用poll函数指针。而backlog的poll指针在net_dev_init函数中被初始化成了process_backlog函数。
static inline void ____napi_schedule(struct softnet_data *sd,
struct napi_struct *napi)
{
list_add_tail(&napi->poll_list, &sd->poll_list);
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
process_backlog函数将input_pkt_queue上的报文移动到process_queue上处理,为每个报文调用__netif_receive_skb函数进行后续的处理。
该函数知道用完配额或者将队列中的所有报文都处理完毕才会退出。
static int process_backlog(struct napi_struct *napi, int quota)
{
struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
bool again = true;
int work = 0;
/* Check if we have pending ipi, its better to send them now,
* not waiting net_rx_action() end.
*/
if (sd_has_rps_ipi_waiting(sd)) {
local_irq_disable();
net_rps_action_and_irq_enable(sd);
}
napi->weight = dev_rx_weight;
while (again) {
struct sk_buff *skb;
while ((skb = __skb_dequeue(&sd->process_queue))) {
rcu_read_lock();
__netif_receive_skb(skb);
rcu_read_unlock();
input_queue_head_incr(sd);
if (++work >= quota)
return work;
}
local_irq_disable();
rps_lock(sd);
if (skb_queue_empty(&sd->input_pkt_queue)) {
/*
* Inline a custom version of __napi_complete().
* only current cpu owns and manipulates this napi,
* and NAPI_STATE_SCHED is the only possible flag set
* on backlog.
* We can use a plain write instead of clear_bit(),
* and we dont need an smp_mb() memory barrier.
*/
napi->state = 0;
again = false;
} else {
skb_queue_splice_tail_init(&sd->input_pkt_queue,
&sd->process_queue);
}
rps_unlock(sd);
local_irq_enable();
}
return work;
}
__netif_receive_skb函数负责处理报文,所有的poll函数都用到这个函数。
netif_receive_skb函数
netif_receive_skb函数被调用时,L3协议标识已经从L2报文头中提取出来存入skb->protocol字段中。
netif_receive_skb函数由三个主要任务
- 把报文的副本传递给每个协议分流器。
- 把报文的副本传递给L3协议处理函数。
- 负责处理L2一些功能比如桥接。
/**
* netif_receive_skb - process receive buffer from network
* @skb: buffer to process
*
* netif_receive_skb() is the main receive data processing function.
* It always succeeds. The buffer may be dropped during processing
* for congestion control or by the protocol layers.
*
* This function may only be called from softirq context and interrupts
* should be enabled.
*
* Return values (usually ignored):
* NET_RX_SUCCESS: no congestion
* NET_RX_DROP: packet was dropped
*/
int netif_receive_skb(struct sk_buff *skb)
{
trace_netif_receive_skb_entry(skb);
return netif_receive_skb_internal(skb);
}
netif_receive_skb_internal函数调用__netif_receive_skb_one_core函数。在__netif_receive_skb_one_core函数根据protocol将skb传递给和内核已经注册的匹配的协议号的处理模块。
static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
{
struct net_device *orig_dev = skb->dev;
struct packet_type *pt_prev = NULL;
int ret;
ret = __netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
if (pt_prev)
ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
return ret;
}