IP传输2

内核完成的任务:

  • 查询下一个跳点,ip层必须知道外出设备以及用作下一个跳点的下一个路由器。路径是通过ip_route_output_flow发现的。
  • 初始化ip报头,几个字段会在此阶段填入。
  • 处理选项,软件必须尊重需要把一个地址或时间戳加进报头里的那些选项
  • 分段,如果ip封包太大,无法在外出设备上传输,就必须分段
  • 校验和,对报头的其他工作都做完后,还必须计算校验和。
  • 由netfilter检查
  • 更新统计数据,取决于传输结果(成功或失败)以及一些诸如分段的行动而定

ip_queue函数:

   1:  int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
   2:  {
   3:      struct sock *sk = skb->sk;
   4:      struct inet_sock *inet = inet_sk(sk);
   5:      struct ip_options *opt = inet->opt;
   6:      struct rtable *rt;
   7:      struct iphdr *iph;
   8:   
   9:      /* Skip all of this if the packet is already routed,
  10:       * f.e. by something like SCTP.
  11:       */
  12:      rt = skb_rtable(skb);
  13:      if (rt != NULL)
  14:          goto packet_routed;
  15:   
  16:      /* Make sure we can route this packet. */
  17:      rt = (struct rtable *)__sk_dst_check(sk, 0);
  18:      if (rt == NULL) {
  19:          __be32 daddr;
  20:   
  21:          /* Use correct destination address if we have options. */
  22:          daddr = inet->daddr;
  23:          if(opt && opt->srr)
  24:              daddr = opt->faddr;
  25:   
  26:          {
  27:              struct flowi fl = { .oif = sk->sk_bound_dev_if,
  28:                          .mark = sk->sk_mark,
  29:                          .nl_u = { .ip4_u =
  30:                                { .daddr = daddr,
  31:                              .saddr = inet->saddr,
  32:                              .tos = RT_CONN_FLAGS(sk) } },
  33:                          .proto = sk->sk_protocol,
  34:                          .flags = inet_sk_flowi_flags(sk),
  35:                          .uli_u = { .ports =
  36:                                 { .sport = inet->sport,
  37:                               .dport = inet->dport } } };
  38:   
  39:              /* If this fails, retransmit mechanism of transport layer will
  40:               * keep trying until route appears or the connection times
  41:               * itself out.
  42:               */
  43:              security_sk_classify_flow(sk, &fl);
  44:              if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0))
  45:                  goto no_route;
  46:          }
  47:          sk_setup_caps(sk, &rt->u.dst);
  48:      }
  49:      skb_dst_set(skb, dst_clone(&rt->u.dst));
  50:   
  51:  packet_routed:
  52:      if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
  53:          goto no_route;
  54:   
  55:      /* OK, we know where to send it, allocate and build IP header. */
  56:      skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
  57:      skb_reset_network_header(skb);
  58:      iph = ip_hdr(skb);
  59:      *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
  60:      if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
  61:          iph->frag_off = htons(IP_DF);
  62:      else
  63:          iph->frag_off = 0;
  64:      iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
  65:      iph->protocol = sk->sk_protocol;
  66:      iph->saddr    = rt->rt_src;
  67:      iph->daddr    = rt->rt_dst;
  68:      /* Transport layer set skb->h.foo itself. */
  69:   
  70:      if (opt && opt->optlen) {
  71:          iph->ihl += opt->optlen >> 2;
  72:          ip_options_build(skb, opt, inet->daddr, rt, 0);
  73:      }
  74:   
  75:      ip_select_ident_more(iph, &rt->u.dst, sk,
  76:                   (skb_shinfo(skb)->gso_segs ?: 1) - 1);
  77:   
  78:      skb->priority = sk->sk_priority;
  79:      skb->mark = sk->sk_mark;
  80:   
  81:      return ip_local_out(skb);
  82:   
  83:  no_route:
  84:      IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
  85:      kfree_skb(skb);
  86:      return -EHOSTUNREACH;
  87:  }

其参数的意义:

skb,要传输的封包的缓冲区描述符。此数据结构中有填入IP报头以及传输封包所需的所有参数。ip_queue_xmit用于处理本地产生的封包;转发封包没有相关的套接字。和skb相关的套接字包含一个名为opt的指针。此结构中包含ip报头中的选项,而其存储格式使得ip层的函数更易于存取。此结构是放在socket结构中的,因此此结构对每个要通过该套接字传输的封包而言都相同;替每个封包重建此信息就太浪费了。

ipfragok,主要由SCTP使用的标志,用来指出是否允许分段。

ip_append_data函数:

   1:  int ip_append_data(struct sock *sk,
   2:             int getfrag(void *from, char *to, int offset, int len,
   3:                     int odd, struct sk_buff *skb),
   4:             void *from, int length, int transhdrlen,
   5:             struct ipcm_cookie *ipc, struct rtable **rtp,
   6:             unsigned int flags)
   7:  {
   8:      struct inet_sock *inet = inet_sk(sk);
   9:      struct sk_buff *skb;
  10:   
  11:      struct ip_options *opt = NULL;
  12:      int hh_len;
  13:      int exthdrlen;
  14:      int mtu;
  15:      int copy;
  16:      int err;
  17:      int offset = 0;
  18:      unsigned int maxfraglen, fragheaderlen;
  19:      int csummode = CHECKSUM_NONE;
  20:      struct rtable *rt;
  21:   
  22:      if (flags&MSG_PROBE)
  23:          return 0;
  24:   
  25:      if (skb_queue_empty(&sk->sk_write_queue)) {
  26:          /*
  27:           * setup for corking.
  28:           */
  29:          opt = ipc->opt;
  30:          if (opt) {
  31:              if (inet->cork.opt == NULL) {
  32:                  inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
  33:                  if (unlikely(inet->cork.opt == NULL))
  34:                      return -ENOBUFS;
  35:              }
  36:              memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
  37:              inet->cork.flags |= IPCORK_OPT;
  38:              inet->cork.addr = ipc->addr;
  39:          }
  40:          rt = *rtp;
  41:          if (unlikely(!rt))
  42:              return -EFAULT;
  43:          /*
  44:           * We steal reference to this route, caller should not release it
  45:           */
  46:          *rtp = NULL;
  47:          inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
  48:                          rt->u.dst.dev->mtu :
  49:                          dst_mtu(rt->u.dst.path);
  50:          inet->cork.dst = &rt->u.dst;
  51:          inet->cork.length = 0;
  52:          sk->sk_sndmsg_page = NULL;
  53:          sk->sk_sndmsg_off = 0;
  54:          if ((exthdrlen = rt->u.dst.header_len) != 0) {
  55:              length += exthdrlen;
  56:              transhdrlen += exthdrlen;
  57:          }
  58:      } else {
  59:          rt = (struct rtable *)inet->cork.dst;
  60:          if (inet->cork.flags & IPCORK_OPT)
  61:              opt = inet->cork.opt;
  62:   
  63:          transhdrlen = 0;
  64:          exthdrlen = 0;
  65:          mtu = inet->cork.fragsize;
  66:      }
  67:      hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
  68:   
  69:      fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
  70:      maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
  71:   
  72:      if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
  73:          ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
  74:          return -EMSGSIZE;
  75:      }
  76:   
  77:      /*
  78:       * transhdrlen > 0 means that this is the first fragment and we wish
  79:       * it won't be fragmented in the future.
  80:       */
  81:      if (transhdrlen &&
  82:          length + fragheaderlen <= mtu &&
  83:          rt->u.dst.dev->features & NETIF_F_V4_CSUM &&
  84:          !exthdrlen)
  85:          csummode = CHECKSUM_PARTIAL;
  86:   
  87:      skb = skb_peek_tail(&sk->sk_write_queue);
  88:   
  89:      inet->cork.length += length;
  90:      if (((length > mtu) || (skb && skb_is_gso(skb))) &&
  91:          (sk->sk_protocol == IPPROTO_UDP) &&
  92:          (rt->u.dst.dev->features & NETIF_F_UFO)) {
  93:          err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
  94:                       fragheaderlen, transhdrlen, mtu,
  95:                       flags);
  96:          if (err)
  97:              goto error;
  98:          return 0;
  99:      }
 100:   
 101:      /* So, what's going on in the loop below?
 102:       *
 103:       * We use calculated fragment length to generate chained skb,
 104:       * each of segments is IP fragment ready for sending to network after
 105:       * adding appropriate IP header.
 106:       */
 107:   
 108:      if (!skb)
 109:          goto alloc_new_skb;
 110:   
 111:      while (length > 0) {
 112:          /* Check if the remaining data fits into current packet. */
 113:          copy = mtu - skb->len;
 114:          if (copy < length)
 115:              copy = maxfraglen - skb->len;
 116:          if (copy <= 0) {
 117:              char *data;
 118:              unsigned int datalen;
 119:              unsigned int fraglen;
 120:              unsigned int fraggap;
 121:              unsigned int alloclen;
 122:              struct sk_buff *skb_prev;
 123:  alloc_new_skb:
 124:              skb_prev = skb;
 125:              if (skb_prev)
 126:                  fraggap = skb_prev->len - maxfraglen;
 127:              else
 128:                  fraggap = 0;
 129:   
 130:              /*
 131:               * If remaining data exceeds the mtu,
 132:               * we know we need more fragment(s).
 133:               */
 134:              datalen = length + fraggap;
 135:              if (datalen > mtu - fragheaderlen)
 136:                  datalen = maxfraglen - fragheaderlen;
 137:              fraglen = datalen + fragheaderlen;
 138:   
 139:              if ((flags & MSG_MORE) &&
 140:                  !(rt->u.dst.dev->features&NETIF_F_SG))
 141:                  alloclen = mtu;
 142:              else
 143:                  alloclen = datalen + fragheaderlen;
 144:   
 145:              /* The last fragment gets additional space at tail.
 146:               * Note, with MSG_MORE we overallocate on fragments,
 147:               * because we have no idea what fragment will be
 148:               * the last.
 149:               */
 150:              if (datalen == length + fraggap)
 151:                  alloclen += rt->u.dst.trailer_len;
 152:   
 153:              if (transhdrlen) {
 154:                  skb = sock_alloc_send_skb(sk,
 155:                          alloclen + hh_len + 15,
 156:                          (flags & MSG_DONTWAIT), &err);
 157:              } else {
 158:                  skb = NULL;
 159:                  if (atomic_read(&sk->sk_wmem_alloc) <=
 160:                      2 * sk->sk_sndbuf)
 161:                      skb = sock_wmalloc(sk,
 162:                                 alloclen + hh_len + 15, 1,
 163:                                 sk->sk_allocation);
 164:                  if (unlikely(skb == NULL))
 165:                      err = -ENOBUFS;
 166:                  else
 167:                      /* only the initial fragment is
 168:                         time stamped */
 169:                      ipc->shtx.flags = 0;
 170:              }
 171:              if (skb == NULL)
 172:                  goto error;
 173:   
 174:              /*
 175:               *    Fill in the control structures
 176:               */
 177:              skb->ip_summed = csummode;
 178:              skb->csum = 0;
 179:              skb_reserve(skb, hh_len);
 180:              *skb_tx(skb) = ipc->shtx;
 181:   
 182:              /*
 183:               *    Find where to start putting bytes.
 184:               */
 185:              data = skb_put(skb, fraglen);
 186:              skb_set_network_header(skb, exthdrlen);
 187:              skb->transport_header = (skb->network_header +
 188:                           fragheaderlen);
 189:              data += fragheaderlen;
 190:   
 191:              if (fraggap) {
 192:                  skb->csum = skb_copy_and_csum_bits(
 193:                      skb_prev, maxfraglen,
 194:                      data + transhdrlen, fraggap, 0);
 195:                  skb_prev->csum = csum_sub(skb_prev->csum,
 196:                                skb->csum);
 197:                  data += fraggap;
 198:                  pskb_trim_unique(skb_prev, maxfraglen);
 199:              }
 200:   
 201:              copy = datalen - transhdrlen - fraggap;
 202:              if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
 203:                  err = -EFAULT;
 204:                  kfree_skb(skb);
 205:                  goto error;
 206:              }
 207:   
 208:              offset += copy;
 209:              length -= datalen - fraggap;
 210:              transhdrlen = 0;
 211:              exthdrlen = 0;
 212:              csummode = CHECKSUM_NONE;
 213:   
 214:              /*
 215:               * Put the packet on the pending queue.
 216:               */
 217:              __skb_queue_tail(&sk->sk_write_queue, skb);
 218:              continue;
 219:          }
 220:   
 221:          if (copy > length)
 222:              copy = length;
 223:   
 224:          if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
 225:              unsigned int off;
 226:   
 227:              off = skb->len;
 228:              if (getfrag(from, skb_put(skb, copy),
 229:                      offset, copy, off, skb) < 0) {
 230:                  __skb_trim(skb, off);
 231:                  err = -EFAULT;
 232:                  goto error;
 233:              }
 234:          } else {
 235:              int i = skb_shinfo(skb)->nr_frags;
 236:              skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
 237:              struct page *page = sk->sk_sndmsg_page;
 238:              int off = sk->sk_sndmsg_off;
 239:              unsigned int left;
 240:   
 241:              if (page && (left = PAGE_SIZE - off) > 0) {
 242:                  if (copy >= left)
 243:                      copy = left;
 244:                  if (page != frag->page) {
 245:                      if (i == MAX_SKB_FRAGS) {
 246:                          err = -EMSGSIZE;
 247:                          goto error;
 248:                      }
 249:                      get_page(page);
 250:                      skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
 251:                      frag = &skb_shinfo(skb)->frags[i];
 252:                  }
 253:              } else if (i < MAX_SKB_FRAGS) {
 254:                  if (copy > PAGE_SIZE)
 255:                      copy = PAGE_SIZE;
 256:                  page = alloc_pages(sk->sk_allocation, 0);
 257:                  if (page == NULL)  {
 258:                      err = -ENOMEM;
 259:                      goto error;
 260:                  }
 261:                  sk->sk_sndmsg_page = page;
 262:                  sk->sk_sndmsg_off = 0;
 263:   
 264:                  skb_fill_page_desc(skb, i, page, 0, 0);
 265:                  frag = &skb_shinfo(skb)->frags[i];
 266:              } else {
 267:                  err = -EMSGSIZE;
 268:                  goto error;
 269:              }
 270:              if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
 271:                  err = -EFAULT;
 272:                  goto error;
 273:              }
 274:              sk->sk_sndmsg_off += copy;
 275:              frag->size += copy;
 276:              skb->len += copy;
 277:              skb->data_len += copy;
 278:              skb->truesize += copy;
 279:              atomic_add(copy, &sk->sk_wmem_alloc);
 280:          }
 281:          offset += copy;
 282:          length -= copy;
 283:      }
 284:   
 285:      return 0;
 286:   
 287:  error:
 288:      inet->cork.length -= length;
 289:      IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
 290:      return err;
 291:  }

这是由那些想把传输的数据暂存于缓冲区的l4协议所使用的函数。此函数并不传输数据,而是将数据放在大小合适的一些缓冲区中,让后续的函数可以借此构成一些片段(必要的话)并进行传输。因此,此函数并不建立或操作任何IP报头。要把由ip_append_data缓冲的数据刷新并传输,L4层必须显式地调用ip_push_pending_frames(也会处理ip报头)才行。

如果l4层想要快速的响应时间,每次调用ip_append_data之后,可能会再调用一次ip_push_pending_frames。但是,这两个函数的设计目的是尽可能把多一点的数据暂存于缓冲区内,然后一次传输,以获取效率。如果像上面那样每次都立即刷新,就与这一目的背道而驰了。

ip_append_data的主要任务是:

  • 把来自于l4层的输入数据组成一些缓冲区,而这些缓冲区的尺寸又使其易于处理IP分段工作(必要时)。此外,把那些数据片段放进那些缓冲区时,还要安排成l3和l2层稍后能轻易新增较底层协议头
  • 优化内存分配,把来自于上层的信息以及出口设备的能力考虑进来。特别地是:
    • 如果上层指出马上就有更多的其他传输请求(通过MSG_MORE标志),分配大一点的缓冲区才合理。
    • 如果出口设备支持分散/聚集I/O(NETIF_F_SG),片段可以安排内存的处理得以优化
  • 处理l4校验和。skb->ip_summed是根据出口设备能力和其他因素进行初始化的

ip_append_data的输入参数的意义:

sk,此封包传输背后的套接字。此数据结构包含一些参数,稍后必须用于填写ip报头

from,指向l4层正试着传输的数据(有效载荷)的指针。其不是内核指针,就是用户空间指针。getfrag函数的工作就是正确处理该指针

getfrag,用于把接受自l4层的有效载荷拷贝到即将建立的一些数据片段中

length,要传输的数据量(包括l4报头和l4有效载荷)

transhdrlen,传输报头的尺寸

ipc,正确转发封包所必须的信息

rt,与此封包相关的路由表缓存项目。不同于ip_queue_xmit会自己获取此信息,ip_append_data依赖调用者通过ip_route_output_flow来收集该项信息。

flags,此变量可包含任何一个MSG_XXX标志。此函数会用到其中三个标志:

  • MSG_MORE,此标志是由应用程序使用,来告知l4层马上就有更多其他传输。
  • MSG_DONTWAIT,当此标志设定时,对ip_append_data的调用一定不能受到阻塞。ip_append_data可能必须为套接字sk分配一个缓冲区(利用sock_alloc_send_skb)。当sock_alloc_send_skb用掉其限额时,要么阻塞等待,期望在定时器到期前能有些空间可用,要么直接失败。此标志可以用于在这两种行为之间做选择
  • MSG_PROBE,此标志设定时,用户其实不想传输任何东西,而只是在探测路径。如果此标志已设定,ip_append_data只会立即传回一个代表成功的返回代码

处理分段:

分片重组子系统初始化:

   1:  void __init ipfrag_init(void)
   2:  {
   3:      ip4_frags_ctl_register();
   4:      register_pernet_subsys(&ip4_frags_ops);
   5:      ip4_frags.hashfn = ip4_hashfn;
   6:      ip4_frags.constructor = ip4_frag_init;
   7:      ip4_frags.destructor = ip4_frag_free;
   8:      ip4_frags.skb_free = NULL;
   9:      ip4_frags.qsize = sizeof(struct ipq);
  10:      ip4_frags.match = ip4_frag_match;
  11:      ip4_frags.frag_expire = ip_expire;
  12:      ip4_frags.secret_interval = 10 * 60 * HZ;
  13:      inet_frags_init(&ip4_frags);
  14:  }
   1:  void inet_frags_init(struct inet_frags *f)
   2:  {
   3:      int i;
   4:   
   5:      for (i = 0; i < INETFRAGS_HASHSZ; i++)
   6:          INIT_HLIST_HEAD(&f->hash[i]);
   7:   
   8:      rwlock_init(&f->lock);
   9:   
  10:      f->rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^
  11:                     (jiffies ^ (jiffies >> 6)));
  12:   
  13:      setup_timer(&f->secret_timer, inet_frag_secret_rebuild,
  14:              (unsigned long)f);
  15:      f->secret_timer.expires = jiffies + f->secret_interval;
  16:      add_timer(&f->secret_timer);
  17:  }

IP分段,ip_fragment的输入数据可以是:

  • 完整的已转发封包
  • 原来的主机或路由器已分段的已转发封包
  • 由本地函数(已启动分段流程,但尚未把作为封包传输所需的报头加进去)所建的缓冲区

特别的,ip_fragment必须能够处理下列两种情况:

必须切成小块的大数据块,要切割大缓冲区,需要分配新缓冲区,并在大缓冲区和小缓冲区间做内存拷贝。当然,这会影响性能

不需要再分段的数据片段链表或数组,如果分配的缓冲区有空间可以新增较低层的L3和L2报头,则ip_fragment处理这些缓冲区时就不需要做内存拷贝。IP层所做的就是为每个片段加一个ip报头,并处理校验和。

ip分段的主要任务

  1. 把l3有效载荷分割成一些较小段的数据,使它与传送此封包的路径所用的MTU(PMTU)相匹配。如果IP有效载荷的尺寸并非刚刚好是片段尺寸的倍数,则最后一个片段会小于其他片段。此外,因为IP报头的“片段偏移量”字段是以8字节为单位,因此,该值会以8字节边界对齐。除最后一个片段外,每个片段都是这个大小。
  2. 为每个片段的ip报头做初始化
  3. 计算ip校验和
  4. 向netfilter请求完成传输的权限
  5. 更新内核和snmp必要的统计数据
   1:  int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
   2:  {
   3:      struct iphdr *iph;
   4:      int raw = 0;
   5:      int ptr;
   6:      struct net_device *dev;
   7:      struct sk_buff *skb2;
   8:      unsigned int mtu, hlen, left, len, ll_rs, pad;
   9:      int offset;
  10:      __be16 not_last_frag;
  11:      struct rtable *rt = skb_rtable(skb);
  12:      int err = 0;
  13:   
  14:      dev = rt->u.dst.dev;
  15:   
  16:      /*
  17:       *    Point into the IP datagram header.
  18:       */
  19:   
  20:      iph = ip_hdr(skb);
  21:   
  22:      if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
  23:          IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
  24:          icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
  25:                htonl(ip_skb_dst_mtu(skb)));
  26:          kfree_skb(skb);
  27:          return -EMSGSIZE;
  28:      }
  29:   
  30:      /*
  31:       *    Setup starting values.
  32:       */
  33:   
  34:      hlen = iph->ihl * 4;
  35:      mtu = dst_mtu(&rt->u.dst) - hlen;    /* Size of data space */
  36:      IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
  37:   
  38:      /* When frag_list is given, use it. First, check its validity:
  39:       * some transformers could create wrong frag_list or break existing
  40:       * one, it is not prohibited. In this case fall back to copying.
  41:       *
  42:       * LATER: this step can be merged to real generation of fragments,
  43:       * we can switch to copy when see the first bad fragment.
  44:       */
  45:      if (skb_has_frags(skb)) {
  46:          struct sk_buff *frag, *frag2;
  47:          int first_len = skb_pagelen(skb);
  48:   
  49:          if (first_len - hlen > mtu ||
  50:              ((first_len - hlen) & 7) ||
  51:              (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
  52:              skb_cloned(skb))
  53:              goto slow_path;
  54:   
  55:          skb_walk_frags(skb, frag) {
  56:              /* Correct geometry. */
  57:              if (frag->len > mtu ||
  58:                  ((frag->len & 7) && frag->next) ||
  59:                  skb_headroom(frag) < hlen)
  60:                  goto slow_path_clean;
  61:   
  62:              /* Partially cloned skb? */
  63:              if (skb_shared(frag))
  64:                  goto slow_path_clean;
  65:   
  66:              BUG_ON(frag->sk);
  67:              if (skb->sk) {
  68:                  frag->sk = skb->sk;
  69:                  frag->destructor = sock_wfree;
  70:              }
  71:              skb->truesize -= frag->truesize;
  72:          }
  73:   
  74:          /* Everything is OK. Generate! */
  75:   
  76:          err = 0;
  77:          offset = 0;
  78:          frag = skb_shinfo(skb)->frag_list;
  79:          skb_frag_list_init(skb);
  80:          skb->data_len = first_len - skb_headlen(skb);
  81:          skb->len = first_len;
  82:          iph->tot_len = htons(first_len);
  83:          iph->frag_off = htons(IP_MF);
  84:          ip_send_check(iph);
  85:   
  86:          for (;;) {
  87:              /* Prepare header of the next frame,
  88:               * before previous one went down. */
  89:              if (frag) {
  90:                  frag->ip_summed = CHECKSUM_NONE;
  91:                  skb_reset_transport_header(frag);
  92:                  __skb_push(frag, hlen);
  93:                  skb_reset_network_header(frag);
  94:                  memcpy(skb_network_header(frag), iph, hlen);
  95:                  iph = ip_hdr(frag);
  96:                  iph->tot_len = htons(frag->len);
  97:                  ip_copy_metadata(frag, skb);
  98:                  if (offset == 0)
  99:                      ip_options_fragment(frag);
 100:                  offset += skb->len - hlen;
 101:                  iph->frag_off = htons(offset>>3);
 102:                  if (frag->next != NULL)
 103:                      iph->frag_off |= htons(IP_MF);
 104:                  /* Ready, complete checksum */
 105:                  ip_send_check(iph);
 106:              }
 107:   
 108:              err = output(skb);
 109:   
 110:              if (!err)
 111:                  IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
 112:              if (err || !frag)
 113:                  break;
 114:   
 115:              skb = frag;
 116:              frag = skb->next;
 117:              skb->next = NULL;
 118:          }
 119:   
 120:          if (err == 0) {
 121:              IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
 122:              return 0;
 123:          }
 124:   
 125:          while (frag) {
 126:              skb = frag->next;
 127:              kfree_skb(frag);
 128:              frag = skb;
 129:          }
 130:          IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
 131:          return err;
 132:   
 133:  slow_path_clean:
 134:          skb_walk_frags(skb, frag2) {
 135:              if (frag2 == frag)
 136:                  break;
 137:              frag2->sk = NULL;
 138:              frag2->destructor = NULL;
 139:              skb->truesize += frag2->truesize;
 140:          }
 141:      }
 142:   
 143:  slow_path:
 144:      left = skb->len - hlen;        /* Space per frame */
 145:      ptr = raw + hlen;        /* Where to start from */
 146:   
 147:      /* for bridged IP traffic encapsulated inside f.e. a vlan header,
 148:       * we need to make room for the encapsulating header
 149:       */
 150:      pad = nf_bridge_pad(skb);
 151:      ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, pad);
 152:      mtu -= pad;
 153:   
 154:      /*
 155:       *    Fragment the datagram.
 156:       */
 157:   
 158:      offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
 159:      not_last_frag = iph->frag_off & htons(IP_MF);
 160:   
 161:      /*
 162:       *    Keep copying data until we run out.
 163:       */
 164:   
 165:      while (left > 0) {
 166:          len = left;
 167:          /* IF: it doesn't fit, use 'mtu' - the data space left */
 168:          if (len > mtu)
 169:              len = mtu;
 170:          /* IF: we are not sending upto and including the packet end
 171:             then align the next start on an eight byte boundary */
 172:          if (len < left)    {
 173:              len &= ~7;
 174:          }
 175:          /*
 176:           *    Allocate buffer.
 177:           */
 178:   
 179:          if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
 180:              NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
 181:              err = -ENOMEM;
 182:              goto fail;
 183:          }
 184:   
 185:          /*
 186:           *    Set up data on packet
 187:           */
 188:   
 189:          ip_copy_metadata(skb2, skb);
 190:          skb_reserve(skb2, ll_rs);
 191:          skb_put(skb2, len + hlen);
 192:          skb_reset_network_header(skb2);
 193:          skb2->transport_header = skb2->network_header + hlen;
 194:   
 195:          /*
 196:           *    Charge the memory for the fragment to any owner
 197:           *    it might possess
 198:           */
 199:   
 200:          if (skb->sk)
 201:              skb_set_owner_w(skb2, skb->sk);
 202:   
 203:          /*
 204:           *    Copy the packet header into the new buffer.
 205:           */
 206:   
 207:          skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
 208:   
 209:          /*
 210:           *    Copy a block of the IP datagram.
 211:           */
 212:          if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
 213:              BUG();
 214:          left -= len;
 215:   
 216:          /*
 217:           *    Fill in the new header fields.
 218:           */
 219:          iph = ip_hdr(skb2);
 220:          iph->frag_off = htons((offset >> 3));
 221:   
 222:          /* ANK: dirty, but effective trick. Upgrade options only if
 223:           * the segment to be fragmented was THE FIRST (otherwise,
 224:           * options are already fixed) and make it ONCE
 225:           * on the initial skb, so that all the following fragments
 226:           * will inherit fixed options.
 227:           */
 228:          if (offset == 0)
 229:              ip_options_fragment(skb);
 230:   
 231:          /*
 232:           *    Added AC : If we are fragmenting a fragment that's not the
 233:           *           last fragment then keep MF on each bit
 234:           */
 235:          if (left > 0 || not_last_frag)
 236:              iph->frag_off |= htons(IP_MF);
 237:          ptr += len;
 238:          offset += len;
 239:   
 240:          /*
 241:           *    Put this fragment into the sending queue.
 242:           */
 243:          iph->tot_len = htons(len + hlen);
 244:   
 245:          ip_send_check(iph);
 246:   
 247:          err = output(skb2);
 248:          if (err)
 249:              goto fail;
 250:   
 251:          IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
 252:      }
 253:      kfree_skb(skb);
 254:      IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
 255:      return err;
 256:   
 257:  fail:
 258:      kfree_skb(skb);
 259:      IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
 260:      return err;
 261:  }

重组:

重组的制约条件

  • 片段必须存储在内核内存中,直到它完全被网络子系统处理完,但是内存很昂贵,因此,一定要有种方式去限制内存的使用
  • 存储大量信息最有效的结构就是hash表,然而,hash表也会失衡,特别是如果恶意攻击者摸清hash算法后,精打细算试着拉高hash表中特定元素的权重使得处理速度慢下来。
  • 网络联机时常使用不可靠的媒介,所以,片段可能会遗失。如果一个封包里的不同片段以不同路径传输时,这一点尤其可能发生。因此,IP层必须为每个封包持有一个定时器,并在某个点上放弃,并把所有接收到的片段抛弃掉。此外,也必须采用校验,把检测到损毁的机会拉到最大
  • 如果源主机在一段时间后没收到某些数据的确认通知信息,而且传输协议实现了流控制,该主机就会重传数据。因此,对单个ip封包而言,目的地可能会接收到好几个重复片段。让这一问题更复杂的是,第二个ip封包所走的路径可能和第一个不同,因此,其分段也不同,所以片段间的边界也可能不吻合

于是,这些需求就造成接下来要说明的实现细节。片段存储在一个会定期改变(在hash函数的输入中加入一个随机元素)的hash表内。每个封包都会关联一个定时器,如果定时器到期了,该封包就会被删除。每个片段都会检查是否损毁,以及是否和先前接收的片段重叠。

重组涉及的函数:

ip_evictor:逐一删除不完整的ipq结构,从最旧的着手,直到片段所用的内存降到sysctl_ipfrag_low_thresh阈值下。为了让ip_evictor正确运作,一个lru链表必须不断更新。其做法就是把新ipq结构加到一个全局链表尾端,然后每次有新片段加至一个ipq结构时,就把该结构移到队列末尾。于是,最没希望完成的封包就会排在队列前端

ip_find:找出和正在被处理的片段相关的封包(片段链表)。查询是根据IP报头的4个字段:ID,源IP地址、目的IP地址以及l4协议。查询关键字实际上也包含一个本地参数:user。这一个参数用于指出重组的原因

ip_frag_queue:把指定的片段插入和同一个ip封包相关的片段链表中。

ip_frag_reasm:一旦所有片段都被接收之后,就从这些片段构建原有的ip封包

   1:  int ip_defrag(struct sk_buff *skb, u32 user)
   2:  {
   3:      struct ipq *qp;
   4:      struct net *net;
   5:   
   6:      net = skb->dev ? dev_net(skb->dev) : dev_net(skb_dst(skb)->dev);
   7:      IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);
   8:   
   9:      /* Start by cleaning up the memory. */
  10:      if (atomic_read(&net->ipv4.frags.mem) > net->ipv4.frags.high_thresh)
  11:          ip_evictor(net);
  12:   
  13:      /* Lookup (or create) queue header */
  14:      if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) {
  15:          int ret;
  16:   
  17:          spin_lock(&qp->q.lock);
  18:   
  19:          ret = ip_frag_queue(qp, skb);
  20:   
  21:          spin_unlock(&qp->q.lock);
  22:          ipq_put(qp);
  23:          return ret;
  24:      }
  25:   
  26:      IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
  27:      kfree_skb(skb);
  28:      return -ENOMEM;
  29:  }
 
ip_frag_queue函数:

使用链表处理片段,可以对所用的内存进行优化,但是会使得片段的处理稍微复杂一点。ip_frag_queue所做的主要任务如下:

  • 弄清输入片段处于原有封包何处(根据其偏移量和长度)
  • 弄清是否为封包的最后片段,如果是的话,就从中取出ip封包长度
  • 把该片段插入到链表中,而该链表内的片段都关联同一个ip封包,此外还要处理可能的重叠问题。
  • 更新ipq结构中由垃圾收集任务所用的那些字段
  • 必要时,让在硬件中计算的l4校验和失效
   1:  /* Add new segment to existing queue. */
   2:  static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
   3:  {
   4:      struct sk_buff *prev, *next;
   5:      struct net_device *dev;
   6:      int flags, offset;
   7:      int ihl, end;
   8:      int err = -ENOENT;
   9:   
  10:      if (qp->q.last_in & INET_FRAG_COMPLETE)
  11:          goto err;
  12:   
  13:      if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
  14:          unlikely(ip_frag_too_far(qp)) &&
  15:          unlikely(err = ip_frag_reinit(qp))) {
  16:          ipq_kill(qp);
  17:          goto err;
  18:      }
  19:   
  20:      offset = ntohs(ip_hdr(skb)->frag_off);
  21:      flags = offset & ~IP_OFFSET;
  22:      offset &= IP_OFFSET;
  23:      offset <<= 3;        /* offset is in 8-byte chunks */
  24:      ihl = ip_hdrlen(skb);
  25:   
  26:      /* Determine the position of this fragment. */
  27:      end = offset + skb->len - ihl;
  28:      err = -EINVAL;
  29:   
  30:      /* Is this the final fragment? */
  31:      if ((flags & IP_MF) == 0) {
  32:          /* If we already have some bits beyond end
  33:           * or have different end, the segment is corrrupted.
  34:           */
  35:          if (end < qp->q.len ||
  36:              ((qp->q.last_in & INET_FRAG_LAST_IN) && end != qp->q.len))
  37:              goto err;
  38:          qp->q.last_in |= INET_FRAG_LAST_IN;
  39:          qp->q.len = end;
  40:      } else {
  41:          if (end&7) {
  42:              end &= ~7;
  43:              if (skb->ip_summed != CHECKSUM_UNNECESSARY)
  44:                  skb->ip_summed = CHECKSUM_NONE;
  45:          }
  46:          if (end > qp->q.len) {
  47:              /* Some bits beyond end -> corruption. */
  48:              if (qp->q.last_in & INET_FRAG_LAST_IN)
  49:                  goto err;
  50:              qp->q.len = end;
  51:          }
  52:      }
  53:      if (end == offset)
  54:          goto err;
  55:   
  56:      err = -ENOMEM;
  57:      if (pskb_pull(skb, ihl) == NULL)
  58:          goto err;
  59:   
  60:      err = pskb_trim_rcsum(skb, end - offset);
  61:      if (err)
  62:          goto err;
  63:   
  64:      /* Find out which fragments are in front and at the back of us
  65:       * in the chain of fragments so far.  We must know where to put
  66:       * this fragment, right?
  67:       */
  68:      prev = NULL;
  69:      for (next = qp->q.fragments; next != NULL; next = next->next) {
  70:          if (FRAG_CB(next)->offset >= offset)
  71:              break;    /* bingo! */
  72:          prev = next;
  73:      }
  74:   
  75:      /* We found where to put this one.  Check for overlap with
  76:       * preceding fragment, and, if needed, align things so that
  77:       * any overlaps are eliminated.
  78:       */
  79:      if (prev) {
  80:          int i = (FRAG_CB(prev)->offset + prev->len) - offset;
  81:   
  82:          if (i > 0) {
  83:              offset += i;
  84:              err = -EINVAL;
  85:              if (end <= offset)
  86:                  goto err;
  87:              err = -ENOMEM;
  88:              if (!pskb_pull(skb, i))
  89:                  goto err;
  90:              if (skb->ip_summed != CHECKSUM_UNNECESSARY)
  91:                  skb->ip_summed = CHECKSUM_NONE;
  92:          }
  93:      }
  94:   
  95:      err = -ENOMEM;
  96:   
  97:      while (next && FRAG_CB(next)->offset < end) {
  98:          int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */
  99:   
 100:          if (i < next->len) {
 101:              /* Eat head of the next overlapped fragment
 102:               * and leave the loop. The next ones cannot overlap.
 103:               */
 104:              if (!pskb_pull(next, i))
 105:                  goto err;
 106:              FRAG_CB(next)->offset += i;
 107:              qp->q.meat -= i;
 108:              if (next->ip_summed != CHECKSUM_UNNECESSARY)
 109:                  next->ip_summed = CHECKSUM_NONE;
 110:              break;
 111:          } else {
 112:              struct sk_buff *free_it = next;
 113:   
 114:              /* Old fragment is completely overridden with
 115:               * new one drop it.
 116:               */
 117:              next = next->next;
 118:   
 119:              if (prev)
 120:                  prev->next = next;
 121:              else
 122:                  qp->q.fragments = next;
 123:   
 124:              qp->q.meat -= free_it->len;
 125:              frag_kfree_skb(qp->q.net, free_it, NULL);
 126:          }
 127:      }
 128:   
 129:      FRAG_CB(skb)->offset = offset;
 130:   
 131:      /* Insert this fragment in the chain of fragments. */
 132:      skb->next = next;
 133:      if (prev)
 134:          prev->next = skb;
 135:      else
 136:          qp->q.fragments = skb;
 137:   
 138:      dev = skb->dev;
 139:      if (dev) {
 140:          qp->iif = dev->ifindex;
 141:          skb->dev = NULL;
 142:      }
 143:      qp->q.stamp = skb->tstamp;
 144:      qp->q.meat += skb->len;
 145:      atomic_add(skb->truesize, &qp->q.net->mem);
 146:      if (offset == 0)
 147:          qp->q.last_in |= INET_FRAG_FIRST_IN;
 148:   
 149:      if (qp->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
 150:          qp->q.meat == qp->q.len)
 151:          return ip_frag_reasm(qp, prev, dev);
 152:   
 153:      write_lock(&ip4_frags.lock);
 154:      list_move_tail(&qp->q.lru_list, &qp->q.net->lru_list);
 155:      write_unlock(&ip4_frags.lock);
 156:      return -EINPROGRESS;
 157:   
 158:  err:
 159:      kfree_skb(skb);
 160:      return err;
 161:  }

转载于:https://my.oschina.net/longscu/blog/60633

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值