IP函数分析_APUE

IP函数分析_APUE
亦无 @ 2005-03-10 18:29

$$$$$$$$$$$$$$$$$IP初始化函数分析$$$$$$$$$$$$$$$$$$$$

void __init ip_init(void)
   当系统启动的时候,会调用inet_init(void)来初始化网络子系统,这个函数又调用ip_init来初始化IP协议处理模块(注册处理函数,分配路由处理表等等)。

实现过程:
* 注册IP协议处理函数。将ip_packet_type放入到ptype_base(Linux的网络协议处理链表)中,
*其中IP层的接收函数为ip_rcv。
* 初始化路由表(routing table),Linux的路由处理比较复杂,需另撰文分析。
* 初始化对等IP(peer ip)信息表,需另撰文分析。
       /*
        *      IP protocol layer initialiser
        *
        *      Packet-type descriptor that ip_init() hands to dev_add_pack().
        *      The initialiser is positional, so the entries below must stay
        *      in the field order of struct packet_type.
        */
       static struct packet_type ip_packet_type =
       {
               __constant_htons(ETH_P_IP), /* ETH_P_IP converted with htons at compile time */
               NULL,   /* All devices */
               ip_rcv, /* handler function for packets of this type */
               (void*)1, /* NOTE(review): non-NULL marker value; its meaning is defined by struct packet_type, not visible here - confirm */
               NULL, /* NOTE(review): presumably the list link, set up on registration - confirm */
       };
       /*
        *      IP registers the packet type and then calls the subprotocol initialisers
        *
        *      Sequence: ip_packet_type is registered through dev_add_pack(),
        *      then ip_rt_init() and inet_initpeers() are run.  When
        *      CONFIG_IP_MULTICAST is defined, an "igmp" entry backed by
        *      ip_mc_procinfo is additionally created with proc_net_create().
        */
       void __init ip_init(void)
       {
               dev_add_pack(&ip_packet_type);
               ip_rt_init();
               inet_initpeers();
       #ifdef CONFIG_IP_MULTICAST
               proc_net_create("igmp", 0, ip_mc_procinfo);
       #endif
       }


$$$$$$$$$$$$$$$$$$IP发送函数分析$$$$$$$$$$$$$$$$$$$$$$$

作者:硅谷农民<mailto:ggnm@kerneldiary.net>

int ip_output(struct sk_buff *skb)
   系统调用ip_output来发送一个网络包:将skb放入发送队列中,最终由网卡驱动程
序中的发送函数
发送出去。
实现过程:
* 如果设置了IP_ROUTE_NAT,则调用ip_do_nat处理,否则,调用ip_finish_output完
成发送。
       int ip_output(struct sk_buff *skb)
       {
       #ifdef CONFIG_IP_ROUTE_NAT
        struct rtable *rt = (struct rtable*)skb->dst;
        if (rt->rt_flags&RTCF_NAT)
         ip_do_nat(skb);
       #endif
        return ip_finish_output(skb);
       }
----------------------------------------------------------------------------
---------
int ip_finish_output(struct sk_buff *skb)
实现过程:
* 调用netfilter的挂钩函数,然后调用ip_finish_output2进一步处理。
       __inline__ int ip_finish_output(struct sk_buff *skb)
       {
        struct net_device *dev = skb->dst->dev;
        skb->dev = dev;
        skb->protocol = __constant_htons(ETH_P_IP);
        return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
                ip_finish_output2);
       }
----------------------------------------------------------------------------
---------
static inline int ip_finish_output2(struct sk_buff *skb)

实现过程:
* 在调用ip_output之前,系统已经找好路由放在skb->dst中,如果以前这个dst_entry已经发送过网络包,双方的硬件地址头部已经放入缓存hh_cache中,则将缓存的以太网头部(以太网头部本身为14字节,缓存区按16字节对齐存放,所以拷贝16字节)从hh->hh_data拷贝到skb中,然后调用hh->hh_output(大多数情况下是dev_queue_xmit)将这个skb放到发送队列中。
 (hh_cache在arp模块中初始化,需另外文章解释)

* 如果没有hh_cache,则直接调用neighbour的output函数(大多数情况下是neigh_resolve_output)处理,这个函数在必要的时候调用neigh_event_send取得对方的mac地址,最终调用neigh->ops->queue_xmit(大多数情况下是dev_queue_xmit)将这个skb放到发送队列中。

       static inline int ip_finish_output2(struct sk_buff *skb)
       {
               struct dst_entry *dst = skb->dst;
               struct hh_cache *hh = dst->hh;
               if (hh) {
                          read_lock_bh(&hh->hh_lock);
                          memcpy(skb->data - 16, hh->hh_data, 16);
                          read_unlock_bh(&hh->hh_lock);
                          skb_push(skb, hh->hh_len);
                          return hh->hh_output(skb);
                } else if (dst->neighbour)
                                   return dst->neighbour->output(skb);
                                   if (net_ratelimit())
                                              printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!n");
                                   kfree_skb(skb);
                                   return -EINVAL;
        }

注意:
  UDP的send_msg会调用ip_build_xmit处理fragmentation,最终会调用ip_output发送。TCP会调用ip_queue_xmit处理fragmentation,最终也会调用ip_output发送。这些函数都在ip_output.c中,要另外写文章阐释。


$$$$$$$$$$$$$$$$IP接收函数分析$$$$$$$$$$$$$$$$

int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type
*pt)
   当系统接收到网络包的时候,如果是IP包,则调用ip_rcv来处理。ip_rcv主要的功能是碎片重组,
根据路由将它们送给上一层协议栈(例如tcp,udp)处理,或者调用ip_forward将IP包转送给另外一个网卡设备。
实现过程:
* 调用skb_share_check检查skb是否被其他使用者共享,如果是的话,则克隆(clone)一个同样的skb。

* 检查IP包的基本属性:
   1. IP头的长度是以4字节为单位计算的,标准的IP头部是
    20字节,所以iph->ihl 必须大于或等于5。
   2. IP包的版本为IPv4。
   3. 调用ip_fast_csum来检查校验。
   4. 检查IP包的总长度ntohs(iph->tot_len)和skb缓冲区的长度是不是匹配;IP包的
长度至少大于
    IP头(IP header)的长度(iph->ihl<<2)。
* 如果skb的长度大于IP包的长度的话,说明可能网卡自动附加的一些空字符在缓冲区
的后面,则调用
  __pskb_trim(skb, len)将多余的字符去掉。
* 当上面的步骤完成以后,检查需不需要调用netfilter的挂钩函数(hook function),
完成以后,调用
  ip_rcv_finish继续进行IP协议的处理。
       int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packe
t_type *pt)
       {
            ... ...
               if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
                       goto out;
               if (!pskb_may_pull(skb, sizeof(struct iphdr)))
                       goto inhdr_error;
               iph = skb->nh.iph;
               if (iph->ihl < 5 || iph->version != 4)
                       goto inhdr_error;
               if (!pskb_may_pull(skb, iph->ihl*4))
                       goto inhdr_error;
               if (ip_fast_csum((u8 *)iph, iph->ihl) != 0)
                       goto inhdr_error;
               {
                       __u32 len = ntohs(iph->tot_len);
                       if (skb->len < len || len < (iph->ihl<<2))
                               goto inhdr_error;
                       if (skb->len > len) {
                               __pskb_trim(skb, len);
                               if (skb->ip_summed == CHECKSUM_HW)
                                       skb->ip_summed = CHECKSUM_NONE;
                       }
               }
               return NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, dev, NULL,
                              ip_rcv_finish);
       }
----------------------------------------------------------------------------
---------
static inline int ip_rcv_finish(struct sk_buff *skb)
实现过程:
* 调用ip_route_input根据IP源地址,目的地址计算出这个IP包的路由,存放到skb->
dst中,
  这个路由决定了IP包如何在网络上传输(例如根据子网转发到不同的网卡设备等等
)。
       static inline int ip_rcv_finish(struct sk_buff *skb)
       {
               struct net_device *dev = skb->dev;
               struct iphdr *iph = skb->nh.iph;
               if (skb->dst == NULL) {
                       if (ip_route_input(skb, iph->daddr, iph->saddr, iph-
>tos, dev))
                               goto drop;
               }
* 如果iph->ihl > 5 (IP头部长度大于20字节),表示这个IP包带有其他选项(option),则调用
  ip_options_compile填好各个选项IPCB(skb)->opt,这里只处理了“源路由”选项(source route),
  具体细节这里先略过。
               if (iph->ihl > 5) {
                       struct ip_options *opt;
                       /* It looks as overkill, because not all
                          IP options require packet mangling.
                          But it is the easiest for now, especially taking
                          into account that combination of IP options
                          and running sniffer is extremely rare condition.
                                                             --ANK (980813)

                       */
                       if (skb_cow(skb, skb_headroom(skb)))
                               goto drop;
                       iph = skb->nh.iph;
                       skb->ip_summed = 0;
                       if (ip_options_compile(NULL, skb))
                               goto inhdr_error;
                       opt = &(IPCB(skb)->opt);
                       if (opt->srr) {
                               struct in_device *in_dev = in_dev_get(dev);
                               if (in_dev) {
                                   if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
                                       if (IN_DEV_LOG_MARTIANS(in_dev) && n
et_ratelimit())
                                                   printk(KERN_INFO "source
route option
                                                                %u.%u.%u.%u
-> %u.%u.%u.%un",
                                                     NIPQUAD(iph->saddr), N
IPQUAD(iph->daddr));
                                               in_dev_put(in_dev);
                                               goto drop;
                                       }
                                       in_dev_put(in_dev);
                               }
                               if (ip_options_rcv_srr(skb))
                                       goto drop;
                       }
               }
* 最后,调用目的路由的处理函数,如果这个IP包的目的地址不是本机器的地址,则调用ip_forward处理,
  否则调用ip_local_deliver继续处理,为将IP包传送到上一层协议作准备。
               /* Normal exit: hand the packet to the input handler stored in its route. */
               return skb->dst->input(skb);
       /* Header errors are counted, then fall through to the common drop path. */
       inhdr_error:
               IP_INC_STATS_BH(IpInHdrErrors);
       /* Drop path: free the packet and report NET_RX_DROP. */
       drop:
               kfree_skb(skb);
               return NET_RX_DROP;
       }
----------------------------------------------------------------------------
---------
int ip_local_deliver(struct sk_buff *skb)
实现过程:
* 如果网络包是IP碎片的话,则调用ip_defrag进行碎片重组。
* 检查调用netfilter的挂钩函数,完成以后调用ip_local_deliver_finish进行 
  最后的IP层协议的处理。
       int ip_local_deliver(struct sk_buff *skb)
       {
               /*
                *      Reassemble IP fragments.
                */
               if (skb->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
                       skb = ip_defrag(skb);
                       if (!skb)
                               return 0;
               }
               return NF_HOOK(PF_INET, NF_IP_LOCAL_IN, skb, skb->dev, NULL,

                              ip_local_deliver_finish);
       }
----------------------------------------------------------------------------
---------
static inline int ip_local_deliver_finish(struct sk_buff *skb)

实现过程:
* 根据IP头部的协议(skb->nh.iph->protocol)找到相应的协议定义pprot。
* 如果有raw socket的话,调用raw_v4_input先处理。
* 在通常情况下,一个哈希值只有一个协议相对应(inet_protos[hash]),则调用ipprot->handler(skb),从而将这个网络包送到上一层协议(例如icmp, udp, tcp 等等)处理。
* 否则的话调用ip_run_ipprot,找到相应的协议,再调用ipprot->handler(skb)处理。
       /*
        * Final local-delivery step: strip the IP header and pass the packet
        * to raw sockets and/or the registered protocol handler.
        *
        * Returns the handler's result on the fast path, 0 otherwise.
        */
       static inline int ip_local_deliver_finish(struct sk_buff *skb)
       {
               /* Header length in bytes (ihl counts 4-byte words). */
               int ihl = skb->nh.iph->ihl*4;
               /* Pull out additional 8 bytes to save some space in protocols. */
               if (!pskb_may_pull(skb, ihl+8))
                       goto out;
               __skb_pull(skb, ihl);
               /* Point into the IP datagram, just past the header. */
               skb->h.raw = skb->data;
               {
                       /* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */
                       int protocol = skb->nh.iph->protocol;
                       /* The same hash indexes both raw_v4_htable and inet_protos. */
                       int hash = protocol & (MAX_INET_PROTOS - 1);
                       struct sock *raw_sk = raw_v4_htable[hash];
                       struct inet_protocol *ipprot;
                       int flag;
                       /* If there maybe a raw socket we must check - if not we
                        * don't care less
                        */
                       if(raw_sk != NULL)
                               raw_sk = raw_v4_input(skb, skb->nh.iph, hash);
                       ipprot = (struct inet_protocol *) inet_protos[hash];

                       /* flag keeps the result of ip_run_ipprot(); it stays 0 if that is never called. */
                       flag = 0;
                       if(ipprot != NULL) {
                               /* Single matching handler and no raw socket left: call it directly. */
                               if(raw_sk == NULL &&
                                  ipprot->next == NULL &&
                                  ipprot->protocol == protocol) {
                                       int ret;
                                       /* Fast path... */
                                       ret = ipprot->handler(skb);
                                       return ret;
                               } else {
                                       /* Otherwise ip_run_ipprot() handles the chain; the last argument says whether a raw socket remains. */
                                       flag = ip_run_ipprot(skb, skb->nh.iph, ipprot,(raw_sk != NULL));
                               }
                       }
                       /* All protocols checked.
                        * If this packet was a broadcast, we may *not* reply to it,
                        * since that causes (proven, grin) ARP storms and a leakage
                        * of memory (i.e. all ICMP reply messages get queued up for
                        * transmission...)
                        */
                       if(raw_sk != NULL) {    /* Shift to last raw user */

                               raw_rcv(raw_sk, skb);
                               sock_put(raw_sk);
                       } else if (!flag) {             /* Free and report errors */
                               icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0);
       /* "out" sits inside this branch, so the early pskb_may_pull() failure only frees the skb. */
       out:
                               kfree_skb(skb);
                       }
               }
               return 0;
       }


$$$$$$$$$$$$$$$$$$$$$$IP转发函数分析$$$$$$$$$$$$$$$$$$$$$$

int ip_forward(struct sk_buff *skb)
   当系统的路由处理完一个网络包的时候,如果决定需要经由另外的网卡设备把这个包转发到另外的子网,则调用ip_forward。

实现过程:
* 在调用ip_forward之前,系统已经调用ip_route_input_slow找到路由,放在skb->dst中,这里将它取出来。
* 检查IP头部的TTL(time to live)值,如果小于或等于1的话,则将这个包扔掉,并且发回icmp错误信息,表示这个IP包在网络上传输时经过太多的节点。
* 如果设置了路由限制选项,需要判断两个路由是不是匹配。
   /* Fetch the IP header and the route already attached to the packet (skb->dst viewed as a struct rtable). */
   iph = skb->nh.iph;
   rt = (struct rtable*)skb->dst;
   /* A TTL of 1 or less cannot be forwarded. */
   if (iph->ttl <= 1)
       goto too_many_hops;
   /* With the strict-route option set, the route's destination must equal its gateway. */
   if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
       goto sr_failed;
* 找出要转发的网卡设备和MTU。
* 如果找到更好的路由的话,则发回一个ICMP重定向网络包,告诉发送者更好的路由。

/* Outgoing device and path MTU are read from the dst entry inside the route. */
dev2 = rt->u.dst.dev;
mtu = rt->u.dst.pmtu;
/* We now generate an ICMP HOST REDIRECT giving the route we calculated.
*/
/* The redirect is sent only when the route has RTCF_DOREDIRECT set and opt->srr is not set. */
if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr)  ip_rt_send_redirect(skb);
* 如果skb的headroom不够(一般要有16字节来存放ethernet头部),或者还有另外的模块在处理这
*个skb,则调用skb_cow来创建一个新的足够长的skb,并且拷贝原来的所有数据。
* 将TTL减1。
* 如果skb中数据的长度大于MTU,并IP包被标为路由器“不能分块”(Don't Fragment),则发回一个
*ICMP信息包,告诉发送方需要自己将IP包分为更小块再发送。
   /* We are about to mangle packet. Copy it! */
   /* skb_cow() is asked for dev2->hard_header_len bytes; a NULL result ends forwarding with NET_RX_DROP. */
   if ((skb = skb_cow(skb, dev2->hard_header_len)) == NULL)
       return NET_RX_DROP;
   /* skb may have been replaced above, so the header and option pointers are re-read. */
   iph = skb->nh.iph;
   opt = &(IPCB(skb)->opt);
   /* Decrease ttl after skb cow done */
   ip_decrease_ttl(iph);
   /* We now may allocate a new buffer, and copy the datagram into it.
    * If the indicated interface is up and running, kick it.
    */
   /* A packet larger than the MTU with IP_DF set takes the frag_needed path. */
   if (skb->len > mtu && (ntohs(iph->frag_off) & IP_DF))
       goto frag_needed;
* 调用netfilter框架中的挂钩函数(hook),一般是NAT和防火墙模块注册的处理流程。处理完以后,如果这个网络包需要继续处理,则调用ip_forward_finish。
   /* Run the NF_IP_FORWARD hook (input device skb->dev, output device dev2) with ip_forward_finish as the continuation. */
   return NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, dev2,
       ip_forward_finish);
----------------------------------------------------------------------------
---
static inline int ip_forward_finish(struct sk_buff *skb)
   当系统从ip_forward转入到netfilter的注册函数以后,再转回来调用ip_forward_finish完成转发的功能。

实现过程:
* 如果IP包的头部没有选项(opt->optlen == 0)的话,直接调用ip_send将这个包转发出去。
* 否则的话,先调用ip_forward_options处理选项,再调用ip_send将这个包转发出去。
   if (opt->optlen == 0) {
       return (ip_send(skb));
   }
   ip_forward_options(skb);
   return (ip_send(skb));

$$$$$$$$$$$$$$$$$$$$IP碎片重组函数分析$$$$$$$$$$$$$$$$$$$$$

struct sk_buff *ip_defrag(struct sk_buff *skb)
   Linux的ip_fragment.c包含了一系列用来将IP包分块重组的函数。当系统接收到IP碎片的时候,将它们
分队列保存起来。一直等到所有的碎片都收到,则将它们重新装配,再送给上一层协议栈(例如tcp,udp)处理。

实现过程:
* 如果defragment使用了太多的内存,则调用ip_evictor释放那些太老的fragment队列。
* 根据IP Header调用ip_find找到或者创建一个IP Fragment的队列(qp)。
* 调用ip_frag_queue将这个网络包skb插入到队列qp中。
* 如果第一个和最后一个fragment都已到达,并且队列fragment长度总和等于IP包的长度,则调用
  ip_frag_reasm将这个IP包重新装配起来。注意对fragment队列操作的时候,要加上spin lock,防止多个CPU同时插入或删除fragment。
       struct sk_buff *ip_defrag(struct sk_buff *skb)
       {
        struct iphdr *iph = skb->nh.iph;
        struct ipq *qp;
        struct net_device *dev;
        /* Start by cleaning up the memory. */
        if (atomic_read(&ip_frag_mem) > sysctl_ipfrag_high_thresh)
         ip_evictor();
        dev = skb->dev;
        /* Lookup (or create) queue header */
        if ((qp = ip_find(iph)) != NULL) {
         struct sk_buff *ret = NULL;
         spin_lock(&qp->lock);
         ip_frag_queue(qp, skb);
         if (qp->last_in == (FIRST_IN|LAST_IN) &&
             qp->meat == qp->len)
          ret = ip_frag_reasm(qp, dev);
         spin_unlock(&qp->lock);
         ipq_put(qp);
         return ret;
        }
        kfree_skb(skb);
        return NULL;
       }
----------------------------------------------------------------------------
---------
static inline struct ipq *ip_find(struct iphdr *iph)
实现过程:
* 所有的fragment队列实际上是放在一个哈希表(ipq_hash)中,所以第一步先根据IP包的id,源IP地址,目的IP地址和协议,调用ipqhashfn算出哈希值hash。
* 遍历相同hash值的队列,找到以后增加它的reference counter,返回这个队列。
* 如果找不到,则调用ip_frag_create创建一个fragment队列,返回。
       static inline struct ipq *ip_find(struct iphdr *iph)
       {
        __u16 id = iph->id;
        __u32 saddr = iph->saddr;
        __u32 daddr = iph->daddr;
        __u8 protocol = iph->protocol;
        unsigned int hash = ipqhashfn(id, saddr, daddr, protocol);
        struct ipq *qp;
        read_lock(&ipfrag_lock);
        for(qp = ipq_hash[hash]; qp; qp = qp->next) {
         if(qp->id == id  &&
            qp->saddr == saddr &&
            qp->daddr == daddr &&
            qp->protocol == protocol) {
          atomic_inc(&qp->refcnt);
          read_unlock(&ipfrag_lock);
          return qp;
         }
        }
        read_unlock(&ipfrag_lock);
        return ip_frag_create(hash, iph);
       }
----------------------------------------------------------------------------
---------
static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
实现过程:
* 算出这个fragment的起始和结束的偏移量(offset, end),如果是最后的一个fragment(IP_MF为零),则设置LAST_IN标志。
* 如果不是最后一个fragment,修正队列(qp)的长度qp->len = end。
       /*
        * Add fragment skb to reassembly queue qp.
        *
        * This first part works out the byte range [offset, end) that the
        * fragment covers, checks it against the queue state, and strips the
        * IP header from the skb.  Every failure jumps to the err label.
        */
       static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
       {
        struct sk_buff *prev, *next;
        int flags, offset;
        int ihl, end;
        /* Nothing is accepted once the queue has the COMPLETE flag. */
        if (qp->last_in & COMPLETE)
         goto err;
         /* frag_off holds flag bits and the offset; split them apart. */
         offset = ntohs(skb->nh.iph->frag_off);
        flags = offset & ~IP_OFFSET;
        offset &= IP_OFFSET;
        offset <<= 3;  /* offset is in 8-byte chunks */
         ihl = skb->nh.iph->ihl * 4;
        /* Determine the position of this fragment. */
         end = offset + skb->len - ihl;
        /* Is this the final fragment? */
        if ((flags & IP_MF) == 0) {
         /* If we already have some bits beyond end
          * or have different end, the segment is corrupted.
          */
         if (end < qp->len ||
             ((qp->last_in & LAST_IN) && end != qp->len))
          goto err;
         /* The final fragment fixes the total length of the datagram. */
         qp->last_in |= LAST_IN;
         qp->len = end;
        } else {
         /* Non-final fragment: end is rounded down to a multiple of 8, and the checksum state is reset unless it is CHECKSUM_UNNECESSARY. */
         if (end&7) {
          end &= ~7;
          if (skb->ip_summed != CHECKSUM_UNNECESSARY)
           skb->ip_summed = CHECKSUM_NONE;
         }
         if (end > qp->len) {
          /* Some bits beyond end -> corruption. */
          if (qp->last_in & LAST_IN)
           goto err;
          qp->len = end;
         }
        }
        /* A fragment that carries no data is rejected. */
        if (end == offset)
         goto err;
        /* Remove the IP header, then cut the skb to exactly end-offset bytes. */
        if (pskb_pull(skb, ihl) == NULL)
         goto err;
        if (pskb_trim(skb, end-offset))
         goto err;
* 遍历qp队列中的所有fragment,通过比较offset,找出要插入的位置的前一个和后一个fragment(prev和next)。
* 如果prev和要插入的fragment有重叠,调用pskb_pull(skb, i)缩小skb,对齐offset。
        /* Walk the queue to the first fragment whose offset is >= ours; prev is the one before it (NULL at the head). */
        prev = NULL;
        for(next = qp->fragments; next != NULL; next = next->next) {
         if (FRAG_CB(next)->offset >= offset)
          break; /* bingo! */
         prev = next;
        }
        if (prev) {
         /* i = number of bytes by which prev runs past our start. */
         int i = (FRAG_CB(prev)->offset + prev->len) - offset;
         if (i > 0) {
          /* Skip the overlapping front; fail if nothing would remain. */
          offset += i;
          if (end <= offset)
           goto err;
          if (!pskb_pull(skb, i))
           goto err;
          if (skb->ip_summed != CHECKSUM_UNNECESSARY)
           skb->ip_summed = CHECKSUM_NONE;
         }
        }
* 从next开始,检查每一个fragment,如果有部分重叠(i字节)的话,调用pskb_pull(next, i)修正当前的fragment,如果是全部重叠的话,则调用frag_kfree_skb释放当前的fragment。
* 如果不是最后一个fragment,修正队列(qp)的长度qp->len = end。
        /* Resolve overlap with the fragments that start before our end. */
        while (next && FRAG_CB(next)->offset < end) {
         int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */
         if (i < next->len) {
          /* Partial overlap: drop the first i bytes of next, adjust its offset and the queued byte count, then stop. */
          if (!pskb_pull(next, i))
           goto err;
          FRAG_CB(next)->offset += i;
          qp->meat -= i;
          if (next->ip_summed != CHECKSUM_UNNECESSARY)
           next->ip_summed = CHECKSUM_NONE;
          break;
         } else {
          /* next is fully covered: unlink it, subtract its bytes, free it, and continue with its successor. */
          struct sk_buff *free_it = next;
          next = next->next;
          if (prev)
           prev->next = next;
          else
           qp->fragments = next;
          qp->meat -= free_it->len;
          frag_kfree_skb(free_it);
         }
        }
        /* Record the (possibly advanced) start offset in the skb's control block. */
        FRAG_CB(skb)->offset = offset;
* 将skb插入队列qp中,修正qp的修改时间,qp的长度,记录这个队列已使用的内存(ip_frag_mem),
  如果offset等于零,标志第一个fragment已来到(qp->last_in |= FIRST_IN)。
* 如果有任何错误,记住调用kfree_skb释放内存。
        /* Insert this fragment in the chain of fragments. */
        skb->next = next;
        if (prev)
         prev->next = skb;
        else
         qp->fragments = skb;
         /* Despite its indentation, this "if" is not part of the else above: the input interface index is stored whenever skb->dev is set. */
         if (skb->dev)
          qp->iif = skb->dev->ifindex;
        skb->dev = NULL;
        /* Update queue bookkeeping: timestamp, queued byte count, global fragment memory counter. */
        qp->stamp = skb->stamp;
        qp->meat += skb->len;
        atomic_add(skb->truesize, &ip_frag_mem);
        /* A fragment starting at offset 0 is the first fragment. */
        if (offset == 0)
         qp->last_in |= FIRST_IN;
        return;
       /* Error exit: the fragment is freed and not queued. */
       err:
        kfree_skb(skb);
       }

Trackback地址: http://www.yculblog.com/trackback/4/608944

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值