Linux内核IP层的报文处理流程--从网卡接收的报文处理流程

最新推荐文章于 2022-07-22 19:16:00 发布

wzb56

最新推荐文章于 2022-07-22 19:16:00 发布

阅读量3.9k

点赞数

分类专栏： Linux内核-网络子系统

Linux内核-网络子系统专栏收录该内容

6 篇文章 1 订阅

订阅专栏

本文主要讲解了Linux内核IP层的整体架构，以及对从网卡接收的报文的处理流程。

使用的内核的版本是2.6.32.27

为了方便理解，本文采用整体流程图加伪代码的方式

对Linux内核中IP层的整体实现架构和对网卡报文的处理流程进行了讲解，希望可以对大家有所帮助。

阅读本文章假设大家对C语言有了一定的了解

IP层的整体实现架构

IP层接收底层数据报文的处理流程

[cpp]view plaincopy 
   
 /*
  * IPv4 protocol entry point, reached through the ETH_P_IP packet type from
  * the NET_RX_SOFTIRQ soft interrupt.
  *
  * Sanity-checks the IP header (version, header length, checksum, total
  * length), trims the buffer to the length announced in the header and then
  * passes the packet to the NF_INET_PRE_ROUTING hooks, which continue in
  * ip_rcv_finish().
  *
  * NOTE(review): abridged listing -- the locals `iph` and `len` and the
  * `drop` / `inhdr_error` labels are used below but are not shown here.
  */  
 int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)  
 {  
     /*
      * Filter out packets addressed to other hosts (these show up when the
      * NIC is in promiscuous mode).
      */  
     if (skb->pkt_type == PACKET_OTHERHOST)  
         goto drop;  
   
     iph = ip_hdr(skb);  
   
     /* Header length must be at least the minimum IP header (5) and the version must be 4. */  
     if (iph->ihl < 5 || iph->version != 4)  
         goto inhdr_error;  
   
     /* Check that the announced header length (ihl * 4 bytes) is genuine, not forged. */  
     if (!pskb_may_pull(skb, iph->ihl*4))  
         goto inhdr_error;  
   
     /* NOTE(review): header pointer is re-read here, presumably because the pull above may move the data -- confirm. */
     iph = ip_hdr(skb);  
     /* Verify the header checksum. */  
     if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))  
         goto inhdr_error;  
   
     len = ntohs(iph->tot_len);  
     if (skb->len < len) {  
         /* Buffer is shorter than the total length the header claims: truncated packet. */
         IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS);  
         goto drop;  
     } else if (len < (iph->ihl*4))  
         goto inhdr_error;  
   
     /*
      * The length kept in the socket buffer (skb->len) may not match the
      * real datagram size; pskb_trim_rcsum() adjusts the packet to `len`.
      */  
     if (pskb_trim_rcsum(skb, len)) {  
         IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);  
         goto drop;  
     }  
   
     /*
      * Run the hooks registered at NF_INET_PRE_ROUTING; once they have
      * finished, ip_rcv_finish() is called.  The hooks themselves are
      * covered in the later firewall discussion.
      */  
     return NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, dev, NULL, ip_rcv_finish);  
 }  
   
   
 /*
  * Example use: NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, dev, NULL, ip_rcv_finish)
  *
  * NF_HOOK forwards to NF_HOOK_THRESH with a threshold of INT_MIN.  The
  * braced body that follows sketches the expansion: call nf_hook_thresh()
  * and, only when it returns 1, call okfn(skb); __ret is left as the final
  * expression.
  *
  * NOTE(review): `thresh` is used in the body but is not a parameter of
  * NF_HOOK -- the body appears to be that of NF_HOOK_THRESH shown inline.
  */  
 #define NF_HOOK(pf, hook, skb, indev, outdev, okfn) \  
     NF_HOOK_THRESH(pf, hook, skb, indev, outdev, okfn, INT_MIN)  
 {  
     int __ret;                                     \  
     if ((__ret=nf_hook_thresh(pf, hook, (skb), indev, outdev, okfn, thresh, 1)) == 1)\  
         __ret = (okfn)(skb);                               \  
     __ret;  
 }         
   
 /*
  * Run the netfilter hooks registered for (pf, hook) on skb.
  *
  * @pf:     protocol family of the hook table
  * @hook:   hook point to run
  * @skb:    packet being processed
  * @indev:  input device, may be NULL
  * @outdev: output device, may be NULL
  * @okfn:   continuation forwarded to nf_hook_slow()
  * @thresh: priority threshold forwarded to nf_hook_slow()
  * @cond:   when zero the hooks are skipped entirely
  *
  * Returns 1 when the caller should invoke okfn itself (the NF_HOOK
  * wrappers test for exactly this value); otherwise returns whatever
  * nf_hook_slow() produced.
  */
 static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook,
                  struct sk_buff *skb,
                  struct net_device *indev,
                  struct net_device *outdev,
                  int (*okfn)(struct sk_buff *), int thresh,
                  int cond)
 {
     /*
      * Conditional callers such as NF_HOOK_COND pass cond == 0 to bypass
      * the hooks; returning 1 makes the wrapper macro call okfn directly.
      */
     if (!cond)
         return 1;

     /* Call the registered firewall hooks one by one. */
     return nf_hook_slow(pf, hook, skb, indev, outdev, okfn, thresh);
 }
   
   
 /* 
  * 接收完数据包后的后续处理函数 
  */  
 static int ip_rcv_finish(struct sk_buff *skb)  
 {  
     const struct iphdr *iph = ip_hdr(skb);  
     struct rtable *rt;  
   
      /* 
       * 激活ip_route_input，确定报文的路由，如果ip_route_input无法从FIB中找到路由 
       * 则丢弃数据报文，ip_route_input将在IP路由中的专题中进行讲解 
       */  
     if (skb_dst(skb) == NULL) {  
         int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, skb->dev);  
         if (unlikely(err)) {  
             goto drop;  
         }  
     }  
   
     /*检查IP报头里面是否含有选项，如果含有建立ip_options*/  
     if (iph->ihl > 5 && ip_rcv_options(skb))  
         goto drop;  
   
     /*根据dst_entry的结果，使用skb_dst(skb)->input(skb)进行IP的路由选择 
      *传递给本地计算机的单播或多播，进入 ip_local_deliver(); 
      *单播转发的报文进入ip_forward() 
      *多播转发进入ip_mr_input() 
      */  
     return dst_input(skb);  
     {  
         skb_dst(skb)->input(skb)  
     }  
   
 drop:  
     kfree_skb(skb);  
     return NET_RX_DROP;  
 }  
   
 /*
  * Registers the destination dispatch handlers on an input route: packets
  * using this route are received through ip_forward and sent through
  * ip_output.
  *
  * NOTE(review): abridged listing -- `rth` is set up in the elided code and
  * the return statement is not shown.
  */  
 static int __mkroute_input(struct sk_buff *skb,  
                struct fib_result *res,  
                struct in_device *in_dev,  
                __be32 daddr, __be32 saddr, u32 tos,  
                struct rtable **result)  
 {  
     // ... (elided)  
     rth->u.dst.input = ip_forward;  
     rth->u.dst.output = ip_output;  
     // ... (elided)  
 }  
   
   
 /*
  * Registers the dispatch handlers on an output route, chosen from the
  * route flags:
  *   - RTCF_LOCAL: input handler is ip_local_deliver
  *   - local broadcast/multicast on a non-loopback device: output handler
  *     is ip_mc_output
  *   - multicast route with multicast forwarding enabled
  *     (CONFIG_IP_MROUTE): input is ip_mr_input, output is ip_mc_output
  *
  * NOTE(review): abridged listing -- `rth` and `in_dev` are used but not
  * declared here, and the return statement is not shown.
  */
 static int __mkroute_output(struct rtable **result,  
                 struct fib_result *res,  
                 const struct flowi *fl,  
                 const struct flowi *oldflp,  
                 struct net_device *dev_out,  
                 unsigned flags)  
 {  
     // ... (elided)  
     if (flags & RTCF_LOCAL) {  
         /* Destination is this host: deliver locally. */
         rth->u.dst.input = ip_local_deliver;  
         rth->rt_spec_dst = fl->fl4_dst;  
     }  
     if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {  
         rth->rt_spec_dst = fl->fl4_src;  
         if (flags & RTCF_LOCAL &&  
             !(dev_out->flags & IFF_LOOPBACK)) {  
             rth->u.dst.output = ip_mc_output;  
             RT_CACHE_STAT_INC(out_slow_mc);  
         }  
 #ifdef CONFIG_IP_MROUTE  
         if (res->type == RTN_MULTICAST) {  
             /* Multicast forwarding is on and the group is not a local multicast address. */
             if (IN_DEV_MFORWARD(in_dev) &&  
                 !ipv4_is_local_multicast(oldflp->fl4_dst)) {  
                 rth->u.dst.input = ip_mr_input;  
                 rth->u.dst.output = ip_mc_output;  
             }  
         }  
 #endif  
   
     }     
     // ... (elided)  
 }  

如果IP报文需要转发，那么分析流程如下

[cpp]view plaincopy 
   
 //-----------------------------------------------------------------------------------------------------------------------------------------------------------------------  
 /*
  * Unicast forwarding: performs every forwarding-related action for a packet.
  *
  * Returns the result of the NF_INET_FORWARD hook chain (which continues in
  * ip_forward_finish), or NET_RX_DROP after freeing skb on any drop path.
  *
  * NOTE(review): abridged listing -- `rt` and `iph` are used but not
  * declared here, and no `goto sr_failed` is visible for the sr_failed label.
  */  
 int ip_forward(struct sk_buff *skb)  
 {  
     /* Discard packets that are not PACKET_HOST. */  
     if (skb->pkt_type != PACKET_HOST)  
         goto drop;  
   
     /* A TTL of 1 or less cannot be forwarded: drop the packet and report ICMP_TIME_EXCEEDED. */  
     if (ip_hdr(skb)->ttl <= 1)  
         goto too_many_hops;  
   
     /*
      * If skb->len exceeds the MTU and Don't-Fragment is set, drop the
      * packet and report ICMP_FRAG_NEEDED with the MTU value.
      */  
     if (unlikely(skb->len > dst_mtu(&rt->u.dst) &&  (ip_hdr(skb)->frag_off & htons(IP_DF)))) {  
         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,  htonl(dst_mtu(&rt->u.dst)));  
         goto drop;  
     }  
   
     /*
      * Make sure there is room for the output device's MAC header plus
      * dst.header_len.  skb_cow() is called to obtain a sufficiently large
      * skb holding a copy of the original data.
      */  
     if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len))  
         goto drop;  
     iph = ip_hdr(skb);  
   
     /* Decrement the TTL by one. */  
     ip_decrease_ttl(iph);  
   
     /*
      * Run the hooks registered at NF_INET_FORWARD; once the firewall hooks
      * have all run, processing continues in ip_forward_finish.
      */  
     return NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev, rt->u.dst.dev,  ip_forward_finish);  
   
 sr_failed:  
     /* 
      *  Strict routing permits no gatewaying 
      */  
      icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0);  
      goto drop;  
   
 too_many_hops:  
     /* Tell the sender its packet died... */  
     IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_INHDRERRORS);  
     icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);  
 drop:  
     kfree_skb(skb);  
     return NET_RX_DROP;  
 }  
   
 /*
  * Last step of forwarding: processes any IP options and passes the packet
  * to the output stage.  Apart from that the function does little unless
  * FASTROUTE is enabled.
  *
  * Returns the result of dst_output(skb).
  */
 static int ip_forward_finish(struct sk_buff *skb)
 {
     struct ip_options * opt = &(IPCB(skb)->opt);

     /* Options present: let ip_forward_options() process them. */
     if (unlikely(opt->optlen))
         ip_forward_options(skb);

     /*
      * Enter the output stage.  dst_output(skb) amounts to
      * skb_dst(skb)->output(skb).
      */
     return dst_output(skb);
 }
   
   
 /*
  * Registers the destination dispatch handlers on an input route: packets
  * using this route are received through ip_forward and sent through
  * ip_output.
  *
  * NOTE(review): abridged listing -- `rth` is set up in the elided code and
  * the return statement is not shown.
  */  
 static int __mkroute_input(struct sk_buff *skb,  
                struct fib_result *res,  
                struct in_device *in_dev,  
                __be32 daddr, __be32 saddr, u32 tos,  
                struct rtable **result)  
 {  
     // ... (elided)  
     rth->u.dst.input = ip_forward;  
     rth->u.dst.output = ip_output;  
     // ... (elided)  
 }  
   
   
 int ip_output(struct sk_buff *skb)  
 {  
     struct net_device *dev = skb_dst(skb)->dev;  
   
     /*将skb->dev指向输出设备的dev*/  
     skb->dev = dev;  
     /*设置2层包类型为ETH_P_IP*/  
     skb->protocol = htons(ETH_P_IP);  
   
     /*使用防火墙中的NF_IP_POST_ROUTING中注册的钩子函数进行处理， 
       *处理完成之后进入ip_finish_output处理*/  
     return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, dev,  
                 ip_finish_output,  
                 !(IPCB(skb)->flags & IPSKB_REROUTED));  
 }  
   
 /*
  * Decides whether the packet has to be fragmented before transmission.
  *
  * Packets that fit the route MTU, and GSO packets regardless of size, go
  * straight to ip_finish_output2(); anything else is split by ip_fragment(),
  * which is given ip_finish_output2 as the routine to send each piece.
  */
 static int ip_finish_output(struct sk_buff *skb)
 {
     /* Fast path: no fragmentation required. */
     if (skb->len <= ip_skb_dst_mtu(skb) || skb_is_gso(skb))
         return ip_finish_output2(skb);

     return ip_fragment(skb, ip_finish_output2);
 }
   
 /*
  * Neighbour operations for ARP (generic variant), abridged to the fields
  * discussed in this listing: .output is neigh_resolve_output and
  * .hh_output is dev_queue_xmit.
  */
 static const struct neigh_ops arp_generic_ops = {  
     .family =       AF_INET,  
     .output =       neigh_resolve_output,  
     .hh_output =        dev_queue_xmit,  
 };  
   
 /*
  * Neighbour operations for ARP (header-cache variant), abridged to the
  * fields discussed in this listing: .output is neigh_resolve_output and
  * .hh_output is dev_queue_xmit.
  */
 static const struct neigh_ops arp_hh_ops = {  
     .family =       AF_INET,  
     .output =       neigh_resolve_output,  
     .hh_output =        dev_queue_xmit,  
 };  
   
   
 /*
  * Picks the transmit routine stored in a hardware-header cache entry
  * according to the neighbour's state.
  *
  * NOTE(review): abridged listing -- `hh` is assigned in the elided code.
  */
 static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst,   __be16 protocol)  
 {  
     struct hh_cache *hh;  
   
     // ... (elided)  
       
     /* Neighbour is in a connected state: use the ops' hh_output routine. */
     if (n->nud_state & NUD_CONNECTED)  
         hh->hh_output = n->ops->hh_output; /* i.e. dev_queue_xmit */  
     else  
         hh->hh_output = n->ops->output;  
   
     // ... (elided)  
 }  
   
 /*
  * Points the neighbour's output routine at its ops' .output handler.
  *
  * NOTE(review): abridged listing -- the leading part of the body is elided.
  */
 static void neigh_suspect(struct neighbour *neigh)  
 {  
     // ... (elided)  
     neigh->output = neigh->ops->output; /* i.e. neigh_resolve_output */  
   
 }  
   
   
 static inline int ip_finish_output2(struct sk_buff *skb)  
 {     
     /*如果2层头数据空间不够，则重新分配足够长度的SKB，并将数据复制到新的SKB后释放原来SKB*/  
     if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {  
         struct sk_buff *skb2;  
           
         skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));  
   
         if (skb->sk)  
             skb_set_owner_w(skb2, skb->sk);  
         kfree_skb(skb);  
         skb = skb2;  
     }  
   
     /*如果路由出口项中已经含有2层包头缓存的引用(dst->hh)，进入neigh_hh_output*/  
     if (dst->hh)  
         return neigh_hh_output(dst->hh, skb);  
     /*如果没有dst->hh，有dst->neighbour，则启动地址解析协议，也就是neigh_resolve_output*/  
     else if (dst->neighbour)  
         return dst->neighbour->output(skb);  
   
 }  
   
 /*
  * Transmits a packet using a cached layer-2 header: the header is copied
  * in front of the packet data and the entry's hh_output routine is called.
  *
  * NOTE(review): abridged listing -- `hh_len` and `hh_alen` are used but
  * not declared here.
  */
 static inline int neigh_hh_output(struct hh_cache *hh, struct sk_buff *skb)  
 {  
     /* Copy the cached layer-2 header straight into the space before skb->data. */  
     memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);  
     skb_push(skb, hh_len);  
       
     /* Call hh->hh_output(skb), i.e. dev_queue_xmit, to send via the hardware. */  
     return hh->hh_output(skb);  
 }  

如果IP报文需要上送本地CPU，处理流程如下

[cpp]view plaincopy 
   
 //-----------------------------------------------------------------------------------------------------------------------------------------------------------------------  
 /*包的本地投递*/  
 int ip_local_deliver(struct sk_buff *skb)  
 {  
     /*收集并组装IP分片，如果还没有收集完成，那么就等待IP分片组装完成*/  
     if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {  
         if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))  
             return 0;  
     }  
   
     /*进入NF_IP_LOCAL_IN的过滤器处理，处理完成后进入ip_local_deliver_finish*/  
     return NF_HOOK(PF_INET, NF_INET_LOCAL_IN, skb, skb->dev, NULL, ip_local_deliver_finish);  
 }  
   
   
 /*
  * Protocol dispatch once IP-layer processing is complete: looks up the
  * handler registered for the packet's protocol and calls it.  Always
  * returns 0.
  *
  * NOTE(review): abridged listing -- `raw`, `protocol`, `hash`, `ipprot` and
  * `ret` are used but not declared here, and the initial value of
  * `protocol` is not shown.
  */  
 static int ip_local_deliver_finish(struct sk_buff *skb)  
 {  
     resubmit:  
         /*
          * Offer the packet to RAW-IP processing; `raw` records the result.
          * (The original author marked this step as uncertain.)
          */  
         raw = raw_local_deliver(skb, protocol);  
   
         /*
          * Mask the protocol number with MAX_INET_PROTOS - 1 to get the
          * slot this protocol hashes to in the protocol table.
          */  
         hash = protocol & (MAX_INET_PROTOS - 1);  
         /* inet_protos[] is where the IP layer keeps the registered transport protocols. */  
         ipprot = rcu_dereference(inet_protos[hash]);  
           
         /* A protocol is registered: call its handler. */  
         if (ipprot != NULL) {             
             ret = ipprot->handler(skb);  
             /* A negative return names another protocol (-ret) to retry with. */
             if (ret < 0) {  
                 protocol = -ret;  
                 goto resubmit;  
             }             
         }  
         /* No handler registered for this protocol. */   
         else {  
             /*
              * Unless RAW-IP processing took the packet (raw != 0), report
              * ICMP_DEST_UNREACH / ICMP_PROT_UNREACH to the sender.  The
              * packet is freed either way.
              */  
             if (!raw)  
              {  
                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0);  
             }   
   
             kfree_skb(skb);  
         }  
   
     return 0;  
 }  
   
 /*
  * Transport-protocol entry for TCP: .handler is the routine
  * ip_local_deliver_finish() calls through ipprot->handler(skb).
  * Abridged to the handler field only.
  */
 static const struct net_protocol tcp_protocol = {  
     .handler =  tcp_v4_rcv, /*TCP*/  
 };  
   
 /*
  * Transport-protocol entry for UDP: .handler is the routine
  * ip_local_deliver_finish() calls through ipprot->handler(skb).
  * Abridged to the handler field only.
  */
 static const struct net_protocol udp_protocol = {  
     .handler =  udp_rcv, /*UDP*/  
 };  
   
 /*
  * Protocol entry for ICMP: .handler is the routine
  * ip_local_deliver_finish() calls through ipprot->handler(skb).
  * Abridged to the handler field only.
  */
 static const struct net_protocol icmp_protocol = {  
     .handler =  icmp_rcv, /*ICMP*/  
 };  
   
 /*
  * Protocol entry for IGMP: .handler is the routine
  * ip_local_deliver_finish() calls through ipprot->handler(skb).
  * Abridged to the handler field only.
  */
 static const struct net_protocol igmp_protocol = {  
     .handler =  igmp_rcv, /*IGMP*/  
 };