在Linux内核中,当网卡驱动接收到数据时,会调用netif_rx_ni函数把数据传递到IP层,
该函数主要把数据包链接到input_pkt_queue队列,并触发一次软中断
/*
 * netif_rx_ni - hand a received buffer to netif_rx() and run any softirq
 * that is pending afterwards.
 * @skb: buffer to post
 *
 * Preemption is disabled around the whole sequence.
 * Returns whatever netif_rx() returned.
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int ret;

	preempt_disable();

	ret = netif_rx(skb);
	if (local_softirq_pending()) {
		/* A softirq is pending: process it right here. */
		do_softirq();
	}

	preempt_enable();

	return ret;
}
1. netif_rx 函数
/*
 * netif_rx - queue a received buffer on the backlog of the current CPU.
 * @skb: buffer to post
 *
 * Returns the result of enqueue_to_backlog().
 */
int netif_rx(struct sk_buff *skb)
{
	int ret;

	/* ... (earlier part of the function is elided in this excerpt) ... */
	{
		unsigned int qtail;

		/* Call enqueue_to_backlog() directly for the current CPU. */
		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
		put_cpu();
	}

	return ret;
}
/*
 * enqueue_to_backlog - append a buffer to the input_pkt_queue of a CPU.
 * @skb:   buffer to queue
 * @cpu:   CPU whose softnet_data is used
 * @qtail: passed through to input_queue_tail_incr_save()
 *
 * The queue is only touched with local interrupts disabled and the rps lock
 * held.  Returns NET_RX_SUCCESS when the buffer was queued, or NET_RX_DROP
 * when the queue already holds more than netdev_max_backlog buffers; in that
 * case the buffer is freed and the drop counters are bumped.
 */
static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
			      unsigned int *qtail)
{
	/* Per-CPU receive state of the target CPU. */
	struct softnet_data *sd = &per_cpu(softnet_data, cpu);
	unsigned long flags;
	unsigned int qlen;

	local_irq_save(flags);
	rps_lock(sd);

	qlen = skb_queue_len(&sd->input_pkt_queue);

	/* Over the backlog limit: count the drop and free the buffer. */
	if (qlen > netdev_max_backlog) {
		sd->dropped++;
		rps_unlock(sd);
		local_irq_restore(flags);

		atomic_long_inc(&skb->dev->rx_dropped);
		kfree_skb(skb);
		return NET_RX_DROP;
	}

	if (!qlen) {
		/*
		 * The queue was empty, so the backlog NAPI may not be
		 * scheduled yet.  A non-atomic bit operation is enough here
		 * because we own the queue lock.  Per the original note,
		 * scheduling raises a softirq and links the backlog onto
		 * poll_list.
		 */
		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
			if (!rps_ipi_queued(sd))
				____napi_schedule(sd, &sd->backlog);
		}
	}

	/* Link the buffer at the tail of input_pkt_queue. */
	__skb_queue_tail(&sd->input_pkt_queue, skb);
	input_queue_tail_incr_save(sd, qtail);

	rps_unlock(sd);
	local_irq_restore(flags);

	return NET_RX_SUCCESS;
}
2. net_rx_action 函数
static void net_rx_action(struct softirq_action *h)
{
struct softnet_data *sd = &__get_cpu_var(softnet_data);
unsigned long time_limit = jiffies + 2;
int budget = netdev_budget;/*此网络设备允许最大传输数,目前为300 */
void *have;
local_irq_disable();
while (!list_empty(&sd->poll_list)) {
struct napi_struct *n;
int work, weight;
if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
goto softnet_break; /*如果budget用完,或者已经超过2 jiffies 则退出 */
local_irq_enable();
n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
have = netpoll_poll_lock(n);
weight = n->weight;
work = 0;
if (test_bit(NAPI_STATE_SCHED, &n->state)) {
work = n->poll(n, weight);//调用poll函数,如果驱动没有实现,则调用默认的process_backlog函数
trace_napi_poll(n);
}
WARN_ON_ONCE(work > weight);
budget -= work;
local_irq_disable();
if (unlikely(work == weight)) { /*如果网络设备的budget用完,则完成一次接收 */
if (unlikely(napi_disable_pending(n))) {
local_irq_enable();
napi_complete(n);
local_irq_disable();
} else {
if (n->gro_list) {
/* flush too old packets
* If HZ < 1000, flush all packets.
*/
local_irq_enable();
napi_gro_flush(n, HZ >= 1000);
local_irq_disable();
}
list_move_tail(&n->poll_list, &sd->poll_list);
}
}
netpoll_poll_unlock(have);
}
out:
net_rps_action_and_irq_enable(sd);
}
/*
 * process_backlog - poll callback that drains the per-CPU backlog queues.
 * @napi:  the backlog napi_struct embedded in a softnet_data
 * @quota: maximum number of packets to process in this call
 *
 * Packets are taken from process_queue and passed to __netif_receive_skb()
 * with interrupts enabled; whenever process_queue runs dry, everything
 * waiting on input_pkt_queue is spliced over under the rps lock.
 * Returns the number of packets processed.
 */
static int process_backlog(struct napi_struct *napi, int quota)
{
int work = 0;
struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
/* Reset the weight of this napi to weight_p. */
napi->weight = weight_p;
local_irq_disable();
while (work < quota) {
struct sk_buff *skb;
unsigned int qlen;
/* Per the original note, process_queue is empty on the first pass. */
while ((skb = __skb_dequeue(&sd->process_queue))) {
/* Interrupts are enabled only while the packet is being delivered. */
local_irq_enable();
/* Hand the packet on towards the IP layer. */
__netif_receive_skb(skb);
local_irq_disable();
input_queue_head_incr(sd);
/* Quota reached: stop here, leaving interrupts enabled on return. */
if (++work >= quota) {
local_irq_enable();
return work;
}
}
rps_lock(sd);
qlen = skb_queue_len(&sd->input_pkt_queue);
if (qlen)/* Move everything from input_pkt_queue onto the tail of process_queue. */
skb_queue_splice_tail_init(&sd->input_pkt_queue,
&sd->process_queue);
if (qlen < quota - work) {
/*
 * Fewer packets are waiting than the remaining quota allows: take this
 * napi off the poll list, clear its state, and shrink the quota so the
 * outer loop ends once the packets just spliced have been processed.
 */
list_del(&napi->poll_list);
napi->state = 0;
quota = work + qlen;
}
rps_unlock(sd);
}
local_irq_enable();
return work;
}
3. __netif_receive_skb函数
/*
 * __netif_receive_skb - wrapper that calls __netif_receive_skb_core().
 * @skb: buffer to process
 *
 * When sk_memalloc_socks() is true and the buffer is marked pfmemalloc, the
 * core routine runs with PF_MEMALLOC set on the current task and the flag is
 * restored afterwards.  Otherwise the core routine is called directly.
 * Returns the value produced by __netif_receive_skb_core().
 */
static int __netif_receive_skb(struct sk_buff *skb)
{
	unsigned long saved_flags;
	int ret;

	/* Common case: call the core routine directly. */
	if (!sk_memalloc_socks() || !skb_pfmemalloc(skb))
		return __netif_receive_skb_core(skb, false);

	saved_flags = current->flags;
	current->flags |= PF_MEMALLOC;
	ret = __netif_receive_skb_core(skb, true);
	tsk_restore_flags(current, saved_flags, PF_MEMALLOC);

	return ret;
}
static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
{
struct packet_type *ptype, *pt_prev;
rx_handler_func_t *rx_handler;
struct net_device *orig_dev;
struct net_device *null_or_dev;
bool deliver_exact = false;
int ret = NET_RX_DROP;
__be16 type;
orig_dev = skb->dev;
/*重置network报头和mac head长度 */
skb_reset_network_header(skb);
if (!skb_transport_header_was_set(skb))
skb_reset_transport_header(skb);
skb_reset_mac_len(skb);
pt_prev = NULL;
rcu_read_lock();
another_round:
skb->skb_iif = skb->dev->ifindex;
__this_cpu_inc(softnet_data.processed);
/* deliver only exact match when indicated */
null_or_dev = deliver_exact ? skb->dev : NULL;
type = skb->protocol;
/*先把skb传递给ptype_all的协议层,如果用tcpdump抓包时,会注册处理函数到ptype_all */
list_for_each_entry_rcu(ptype, &ptype_all, list) {
if (!ptype->dev || ptype->dev == skb->dev) {
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
}
/*ptype_base为基本协议处理,
协议层通过dev_add_pack注册pack处理函数
如IP的ip_rcv和arp_rcv */
list_for_each_entry_rcu(ptype,
&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
if (ptype->type == type &&
(ptype->dev == null_or_dev || ptype->dev == skb->dev ||
ptype->dev == orig_dev)) {
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
}
if (pt_prev) {
if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
goto drop;
else
ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
} else {
drop:
atomic_long_inc(&skb->dev->rx_dropped);
kfree_skb(skb);
/* Jamal, now you will not able to escape explaining
* me how you were going to use this. :-)
*/
ret = NET_RX_DROP;
}
unlock:
rcu_read_unlock();
out:
return ret;