数据包接收系列 — 下半部实现（软中断）

最新推荐文章于 2022-07-19 19:38:24 发布

fengzhishang_meteor

最新推荐文章于 2022-07-19 19:38:24 发布

阅读量634

点赞数

本文主要内容：下半部的实现，分析数据包从上半部结束后到L3的处理过程。

内核版本：2.6.37

Author：zhangskd @ csdn blog

下半部的实现

接收数据包的下半部处理流程为：

net_rx_action // 软中断

|--> process_backlog() // 默认poll

|--> __netif_receive_skb() // L2处理函数

|--> ip_rcv() // L3入口

net_rx_action

软中断(NET_RX_SOFTIRQ)的处理函数net_rx_action()主要做了：

遍历sd->poll_list，对于每个处于轮询状态的设备，调用它的poll()函数来处理数据包。

如果设备用完了本次的weight配额(work == weight)：若NAPI正在被禁止，则调用napi_complete()把设备从sd->poll_list上删除；否则把设备移动到sd->poll_list的队尾。

每次软中断最多允许处理netdev_budget(300)个数据包，最长运行时间为2个jiffies(HZ=1000时约为2ms)。

每个设备一次最多允许处理weight_p(64)个数据包(非NAPI)。

如果在这次软中断中没处理完，则再次设置NET_RX_SOFTIRQ标志触发软中断。

[java]view plaincopy 
   
 /*
  * net_rx_action - handler for the NET_RX_SOFTIRQ softirq.
  * @h: softirq action descriptor (not used in this function)
  *
  * Repeatedly takes the NAPI instance at the head of this CPU's
  * sd->poll_list and calls its ->poll() method. The loop stops when the
  * list is empty, when the packet budget is used up, or when jiffies has
  * passed the 2-jiffy time limit. In the last two cases the softirq is
  * raised again so the remaining work is handled in a later run.
  */
 static void net_rx_action(struct softirq_action *h)  
 {  
     struct softnet_data *sd = &__get_cpu_var(softnet_data); /* softnet_data instance of the current CPU */  
     unsigned long time_limit = jiffies + 2; /* deadline for one softirq run: 2 jiffies from now */  
     int budget = netdev_budget; /* max number of skbs handled in one softirq run (300 by default) */  
     void *have;  
   
     local_irq_disable(); /* disable local interrupts */  
   
     /* While there are devices in polling state */  
     while(! list_empty(&sd->poll_list)) {  
   
         struct napi_struct *n;  
         int work, weight;  
         
         /* If softirq window is exhausted then punt. 
          * Allow this to run for 2 jiffies since which will allow an average 
          * latency of 1.5/HZ. 
          * I.e. bail out if too many packets were handled or too much time 
          * was spent. 
          */  
         if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))  
             goto softnet_break;  
   
         local_irq_enable(); /* enable local interrupts */  
   
         /* Even though interrupts have been re-enabled, this access is safe because 
          * interrupts can only add new entries to the tail of this list, and only ->poll() 
          * calls can remove this head entry from the list. 
          */  
   
         /* Fetch the first napi_struct instance on the list */  
         n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);  
   
         have = netpoll_poll_lock(n);  
         weight = n->weight; /* upper bound on packets this device may poll in one call */  
   
         /* This NAPI_STATE_SCHED test is for avoiding a race with netpoll's 
          * poll_napi(). Only the entity which obtains the lock and sees NAPI_STATE_SCHED 
          * set will actually make the ->poll() call. Therefore we avoid accidentally calling ->poll() 
          * when NAPI is not scheduled. 
          */  
         work = 0;  
   
         if (test_bit(NAPI_STATE_SCHED, &n->state)) {  
             /* Call the napi_struct's poll method; it returns the number of packets processed */  
             work = n->poll(n, weight); /* process_backlog() for drivers without NAPI support */  
   
             trace_napi_poll(n);  
         }  
         WARN_ON_ONCE(work > weight);  
   
         budget -= work; /* charge the packets just processed against the total budget */  
   
         local_irq_disable(); /* disable local interrupts */  
   
         /* Only when the poll consumed its entire weight is the instance 
          * completed or requeued here. 
          */  
         if (unlikely(work == weight)) {  
             /* If a NAPI disable is pending, complete this napi_struct (removes it from poll_list) */  
             if (unlikely(napi_disable_pending(n))) {  
                 local_irq_enable();  
                 napi_complete(n);  
                 local_irq_disable();  
   
             } else  
                 /* Otherwise move this napi_struct to the tail of poll_list */  
                 list_move_tail(&n->poll_list, &sd->poll_list);  
         }  
         netpoll_poll_unlock(have);  
     }  
   
 out:  
     net_rps_action_and_irq_enable(sd); /* re-enables local interrupts */  
   
 #ifdef CONFIG_NET_DMA  
     ...  
 #endif  
   
     return;  
   
 softnet_break:  
     sd->time_squeeze++; /* the time limit or the packet budget was exhausted */  
     __raise_softirq_irqoff(NET_RX_SOFTIRQ); /* work remains, so raise the softirq again */  
     goto out;  
 }  

当调用napi_struct的poll()来处理数据包时，本地中断是开启的，这意味着新的数据包可以继续添加到

输入队列中。

process_backlog

如果网卡驱动不支持NAPI，则默认的napi_struct->poll()函数为process_backlog()。

process_backlog()的主要工作：

1. 处理sd->process_queue中的数据包

分别取出每个skb，从队列中删除。

开本地中断，调用__netif_receive_skb()把skb从L2传递到L3，然后关本地中断。

这说明在处理skb时，是允许网卡中断把数据包添加到接收队列(sd->input_pkt_queue)中的。

2. 如果处理完sd->process_queue中的数据包了，quota还没用完

把接收队列添加到sd->process_queue处理队列的尾部后，初始化接收队列。

接下来会继续处理sd->process_queue中的数据包。

3. 如果本次能处理完sd->process_queue和sd->input_pkt_queue中的所有数据包

把napi_struct从sd->poll_list队列中删除掉，清除NAPI_STATE_SCHED标志。

[java]view plaincopy 
   
 /*
  * process_backlog - poll() method of the per-CPU backlog NAPI instance.
  * @napi:  the backlog napi_struct embedded in struct softnet_data
  * @quota: maximum number of packets to process in this call
  *
  * Feeds every skb on sd->process_queue to __netif_receive_skb(), and
  * refills process_queue from sd->input_pkt_queue whenever it runs empty.
  * If everything that is queued fits in the remaining quota, the napi
  * instance is taken off the poll list and its state is cleared.
  *
  * Returns the number of packets processed.
  */
 static int process_backlog(struct napi_struct *napi, int quota)  
 {  
     int work = 0;  
     struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);  
   
 #ifdef CONFIG_RPS  
     ...  
 #endif  
   
     napi->weight = weight_p; /* max packets per poll, 64 by default */  
     local_irq_disable(); /* disable local interrupts */  
       
     while(work < quota) { /* while quota remains */  
         struct sk_buff *skb;  
         unsigned int qlen;  
   
         /* Take the first skb off sd->process_queue, removing it from the queue. 
          * sd->process_queue holds the packets that are about to be processed. 
          */  
         while((skb = __skb_dequeue(&sd->process_queue))) {  
             local_irq_enable(); /* enable local interrupts while the skb is handled */  
   
             __netif_receive_skb(skb); /* layer-2 processing, then hand over to the network layer */  
   
             local_irq_disable();  
             input_queue_head_incr(sd);  
   
             if (++work >= quota) { /* quota reached: stop and report the work done */  
                 local_irq_enable();  
                 return work;  
             }  
         }  
   
         rps_lock(sd);  
         qlen = skb_queue_len(&sd->input_pkt_queue); /* length of the input queue */  
         /* Append the input queue to the tail of sd->process_queue, then reinitialise the input queue */  
         if (qlen)  
             skb_queue_splice_tail_init(&sd->input_pkt_queue, &sd->process_queue);  
   
         /* If the packets just moved over can all be handled within this call */  
         if (qlen < quota - work) {  
             /* Remove the napi_struct from sd->poll_list, since everything is about to be processed */  
             list_del(&napi->poll_list);  
   
             napi->state = 0; /* plain write of 0: clears NAPI_STATE_SCHED along with every other state bit */  
             quota = work + qlen; /* shrink quota so the loop exits after the qlen packets now on process_queue */  
         }  
         rps_unlock(sd);  
     }      
    
     local_irq_enable();  
     return work;  
 }  

从sk_buff_head队列中取出第一个skb，并把它从队列中删除。

[java]view plaincopy 
   
 /**
  * __skb_dequeue - detach and return the first buffer of a queue
  * @list: queue to take the buffer from
  *
  * Peeks at the head of @list and, when one is present, unlinks it from
  * the queue before handing it back to the caller.
  *
  * Returns the former head buffer, or %NULL when @list is empty.
  */
 static inline struct sk_buff *__skb_dequeue(struct sk_buff_head *list)
 {
     struct sk_buff *first = skb_peek(list);

     /* Empty queue: nothing to unlink. */
     if (!first)
         return NULL;

     __skb_unlink(first, list);
     return first;
 }

把list添加到head的队尾，然后把list重新初始化。

[java]view plaincopy 
   
 /**
  * skb_queue_splice_tail_init - append one skb queue to another and reset the source
  * @list: queue whose buffers are moved; reinitialised afterwards
  * @head: queue that receives the buffers at its tail
  *
  * Both arguments are queues. When @list is empty nothing is touched;
  * otherwise its buffers are linked in after the current last element of
  * @head, @head's length is increased by @list's length, and @list is
  * reinitialised.
  */
 static inline void skb_queue_splice_tail_init(struct sk_buff_head *list, struct sk_buff_head *head)
 {
     /* Nothing to move. */
     if (skb_queue_empty(list))
         return;

     __skb_queue_splice(list, head->prev, (struct sk_buff *)head);
     head->qlen += list->qlen;
     __skb_queue_head_init(list);
 }

__netif_receive_skb

__netif_receive_skb()的主要工作为：

处理NETPOLL、网卡绑定、入口流量控制、桥接、VLAN。

遍历嗅探器(ETH_P_ALL)链表ptype_all。对于每个注册的sniffer，调用它的处理函数

packet_type->func()，例如tcpdump。

赋值skb->network_header，根据skb->protocol从三层协议哈希表ptype_base中找到对应的

三层协议。如果三层协议是ETH_P_IP，相应的packet_type为ip_packet_type，协议处理函数为ip_rcv()。

[java]view plaincopy 
   
 static int __netif_receive_skb(struct sk_buff *skb)  
 {  
     struct packet_type *ptype, *pt_prev;  
     rx_handler_func_t *rx_handler;  
     struct net_device *orig_dev;  
     struct net_device *master;  
     struct net_device *null_or_orig;  
     struct net_device *orig_or_bond;  
     int ret = NET_RX_DROP;  
     __be16 type;  
   
     if (! netdev_tstamp_prequeue)  
         net_timestamp_check(skb); /* 记录接收时间到skb->tstamp */  
     trace_netif_receive_skb(skb);  
   
     /* If we've gotten here through NAPI, check netpoll */  
     if (netpoll_receive_skb(skb))  
         return NET_RX_DROP;  
    
     if (! skb->skb_iif)  
         skb->skb_iif = skb->dev->ifinex; /* 记录设备编号 */  
   
     /* 处理网卡绑定(bonding) */  
     null_or_orig = NULL;  
     orig_dev = skb->dev;  
     master = ACCESS_ONCE(orig_dev->master);  
   
     if (skb->deliver_no_wcard)  
         null_or_orig = orig_dev;  
     else if (master) {  
         if (skb_bond_should_drop(skb, master)) {  
             skb->deliver_no_wcard = 1;  
             null_or_orig = orig_dev; /* deliver only exact match */  
         } else  
             skb->dev = master;  
     }  
   
     __this_cpu_inc(softnet_data.processed); /* 增加本cpu处理过的数据包个数 */  
     skb_reset_network_header(skb); /* 赋值skb->network_header */  
     skb_reset_network_header(skb); /* 赋值skb->transport_header */  
     skb->mac_len = skb->network_header - skb->mac_header; /* MAC头的长度，一般为14 */  
     pt_prev = NULL;  
   
     rcu_read_lock();  
   
 /* 入口流量控制 */  
 #ifdef CONFIG_NET_CLS_ACT   
     if (skb->tc_verd & TC_NCLS) {  
         skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);  
         goto ncls;  
     }  
 #endif  
   
     /* 遍历嗅探器(ETH_P_ALL)链表ptype_all。对于每个注册的sniffer， 
      * 调用它的处理函数packet_type->func()，例如tcpdump。 
      */  
     list_for_each_entry_rcu(ptype, &ptype_all, list) {  
         if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||  
             ptype->dev == orig_dev) {  
             if (pt_prev)  
                 ret = deliver_skb(skb, pt_prev, orig_dev); /* 嗅探器的处理函数 */  
                 pt_prev = ptype;  
         }  
     }  
   
 #ifdef CONFIG_NET_CLS_ACT  
     skb = handle_ing(skb, &pt_prev, &ret, orig_dev);  
     if (! skb)  
         goto out;  
 ncls:  
 #endif  
   
     /* Handle special case of bridge or macvlan，接收的特殊过程 */  
     rx_handler = rcu_dereference(skb->dev->rx_handler);  
     if (rx_handler) {  
         if (pt_prev) {  
             ret = deliver_skb(skb, pt_prev, orig_dev);  
             pt_prev = NULL;  
         }  
         skb = rx_handler(skb);  
         if (! skb)  
             goto out;  
     }  
   
     /* VLAN虚拟局域网 */  
     if (vlan_tx_tag_present(skb)) {  
         if (pt_prev) {  
             ret = deliver_skb(skb, pt_prev, orig_dev);  
             pt_prev = NULL;  
         }  
   
         if (vlan_hwaccel_do_receive(&skb)) {  
             ret = __netif_receive_skb(skb);  
             goto out;  
         } else if (unlikely(! skb))  
             goto out;  
     }  
   
     /* Make sure frames received on VLAN interfaces stacked on bonding 
      * interfaces still make their way to any base bonding device that may 
      * have registered for a specific ptype. The handler may have to adjust 
      * skb->dev and orig_dev. 
      */  
     orig_or_bond = orig_dev;  
     if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&  
         (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {  
         orig_or_bond = vlan_dev_real_dev(skb->dev);  
     }  
   
     type = skb->protocol; /* 三层协议类型 */  
   
     list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {  
         if (ptype->type == type && (ptype->dev == null_or_orig || ptype->dev == skb->dev  
             || ptype->dev == orig_dev || ptype->dev == orig_or_bond)) {  
   
             /* 如果三层协议是ETH_P_IP，相应的packet_type为ip_packet_type， 
              * 协议处理函数为ip_rcv()。 
              */  
             if (pt_prev)  
                 ret = deliver_skb(skb, pt_prev, orig_dev);  
             pt_prev = ptype;  
         }  
     }  
   
     if (pt_prev) {  
         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);  
     } else { /* 说明没找到对应的三层协议 */  
         atomic_long_inc(&skb->dev->rx_dropped);  
         kfree_skb(skb);  
         ret = NET_RX_DROP;  
     }  
   
 out:  
     rcu_read_unlock();  
     return ret;  
 }  
    

L3协议处理函数

[java]view plaincopy 
   
 /* Number of buckets in the protocol hash table */
 #define PTYPE_HASH_SIZE (16)  
 /* Mask applied to ntohs(type) to select a ptype_base bucket */
 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)  
 static DEFINE_SPINLOCK(ptype_lock);  
   
 static struct list_head ptype_base[PTYPE_HASH_SIZE]; /* protocol hash table */  
 static struct list_head ptype_all; /* list of sniffers (ETH_P_ALL) */  

packet_type用于描述一个协议：

[java]view plaincopy 
   
 struct packet_type {  
     __be16 type; /* This is really htons(ether_type). 协议代码 */  
     struct net_device *dev; /* NULL is wildcarded here */  
   
     /* 协议处理函数，如ip_rcv() */  
     int (*func) (struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *);  
     ...  
     struct list_head list;  
 }  

IP协议：

[java]view plaincopy 
   
 /* IP protocol layer initialiser */  
 static struct packet_type ip_packet_type = {  
     .type = cpu_to_be16(ETH_P_IP), /* protocol code, converted with cpu_to_be16() */  
     .func = ip_rcv, /* handler called for packets of this type */  
     ...  
 };  
 #define ETH_P_IP 0x0800 /* Internet Protocol packet */  

fengzhishang_meteor

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
数据包接收系列 — 下半部实现（软中断）

本文主要内容：下半部的实现，分析数据包从上半部结束后到L3的处理过程。内核版本：2.6.37Author：zhangskd @ csdn blog 下半部的实现接收数据包的下半部处理流程为：net_rx_action // 软中断 |--> process_backlog() // 默认poll
复制链接

扫一扫