内核完成的任务:
- 查询下一个跳点,ip层必须知道外出设备以及用作下一个跳点的下一个路由器。路径是通过ip_route_output_flow发现的。
- 初始化ip报头,几个字段会在此阶段填入。
- 处理选项,软件必须尊重需要把一个地址或时间戳加进报头里的那些选项
- 分段,如果ip封包太大,无法在外出设备上传输,就必须分段
- 校验和,对报头的其他工作都做完后,还必须计算校验和。
- 由netfilter检查
- 更新统计数据,取决于传输结果(成功或失败)以及一些诸如分段的行动而定
ip_queue_xmit函数:
/*
 * ip_queue_xmit - route a locally generated skb, build its IPv4 header and
 * hand it to ip_local_out().
 *
 * @skb:      buffer to transmit; the owning socket is read from skb->sk.
 * @ipfragok: when non-zero, IP_DF is never set, even if ip_dont_fragment()
 *            asks for it.
 *
 * Route selection: a route already attached to the skb is used as is;
 * otherwise the socket's cached route (__sk_dst_check) is tried, and only
 * when that is missing is ip_route_output_flow() called with a flow key
 * built from the socket. With a source-route option (opt->srr) the lookup
 * uses opt->faddr instead of inet->daddr.
 *
 * Returns the result of ip_local_out(). When no usable route exists (lookup
 * failure, or a strict-route option whose route has rt_dst != rt_gateway)
 * the skb is freed, IPSTATS_MIB_OUTNOROUTES is incremented and
 * -EHOSTUNREACH is returned.
 */
1: int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
2: {
3: struct sock *sk = skb->sk;
4: struct inet_sock *inet = inet_sk(sk);
5: struct ip_options *opt = inet->opt;
6: struct rtable *rt;
7: struct iphdr *iph;
8:
9: /* Skip all of this if the packet is already routed,
10: * f.e. by something like SCTP.
11: */
12: rt = skb_rtable(skb);
13: if (rt != NULL)
14: goto packet_routed;
15:
16: /* Make sure we can route this packet. */
17: rt = (struct rtable *)__sk_dst_check(sk, 0);
18: if (rt == NULL) {
19: __be32 daddr;
20:
21: /* Use correct destination address if we have options. */
22: daddr = inet->daddr;
23: if(opt && opt->srr)
24: daddr = opt->faddr;
25:
26: {
27: struct flowi fl = { .oif = sk->sk_bound_dev_if,
28: .mark = sk->sk_mark,
29: .nl_u = { .ip4_u =
30: { .daddr = daddr,
31: .saddr = inet->saddr,
32: .tos = RT_CONN_FLAGS(sk) } },
33: .proto = sk->sk_protocol,
34: .flags = inet_sk_flowi_flags(sk),
35: .uli_u = { .ports =
36: { .sport = inet->sport,
37: .dport = inet->dport } } };
38:
39: /* If this fails, retransmit mechanism of transport layer will
40: * keep trying until route appears or the connection times
41: * itself out.
42: */
43: security_sk_classify_flow(sk, &fl);
44: if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0))
45: goto no_route;
46: }
47: sk_setup_caps(sk, &rt->u.dst);
48: }
49: skb_dst_set(skb, dst_clone(&rt->u.dst));
50:
51: packet_routed:
52: if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
53: goto no_route;
54:
55: /* OK, we know where to send it, allocate and build IP header. */
56: skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
57: skb_reset_network_header(skb);
58: iph = ip_hdr(skb);
59: *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); /* first 16 header bits in one store: version 4, ihl 5 (grown below when options exist), TOS */
60: if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
61: iph->frag_off = htons(IP_DF);
62: else
63: iph->frag_off = 0;
64: iph->ttl = ip_select_ttl(inet, &rt->u.dst);
65: iph->protocol = sk->sk_protocol;
66: iph->saddr = rt->rt_src;
67: iph->daddr = rt->rt_dst;
68: /* Transport layer set skb->h.foo itself. */
69:
70: if (opt && opt->optlen) {
71: iph->ihl += opt->optlen >> 2; /* optlen is in bytes, ihl in 32-bit words */
72: ip_options_build(skb, opt, inet->daddr, rt, 0);
73: }
74:
75: ip_select_ident_more(iph, &rt->u.dst, sk,
76: (skb_shinfo(skb)->gso_segs ?: 1) - 1);
77:
78: skb->priority = sk->sk_priority;
79: skb->mark = sk->sk_mark;
80:
81: return ip_local_out(skb);
82:
83: no_route:
84: IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
85: kfree_skb(skb);
86: return -EHOSTUNREACH;
87: }
其参数的意义:
skb,要传输的封包的缓冲区描述符。此数据结构中有填入IP报头以及传输封包所需的所有参数。ip_queue_xmit用于处理本地产生的封包;转发封包没有相关的套接字。和skb相关的套接字包含一个名为opt的指针。此结构中包含ip报头中的选项,而其存储格式使得ip层的函数更易于存取。此结构是放在socket结构中的,因此此结构对每个要通过该套接字传输的封包而言都相同;替每个封包重建此信息就太浪费了。
ipfragok,主要由sctp使用的标志,用来指出是否允许分段。
ip_append_data函数:
/*
 * ip_append_data - queue payload on sk->sk_write_queue in skbs sized so that
 * each one can later go out as a single IP fragment. Nothing is transmitted
 * here and no IP header is written; room for it (fragheaderlen) is only
 * reserved in front of the data.
 *
 * @sk:          socket the data belongs to.
 * @getfrag:     callback that copies payload from @from into the skb; a
 *               negative return is reported as -EFAULT.
 * @from:        opaque source pointer handed to @getfrag.
 * @length:      number of bytes to append.
 * @transhdrlen: transport header size; treated as 0 when the write queue is
 *               not empty and cleared after the first skb has been built.
 * @ipc:         supplies the IP options (ipc->opt), ipc->addr and the tx
 *               flags (ipc->shtx).
 * @rtp:         route to use; consumed (*rtp set to NULL) when the write
 *               queue is empty, ignored otherwise (the corked route is
 *               reused).
 * @flags:       MSG_PROBE returns 0 immediately; MSG_MORE and MSG_DONTWAIT
 *               influence how new skbs are allocated.
 *
 * Returns 0 on success or a negative errno. The early exits (-ENOBUFS for
 * the option copy, -EFAULT for a NULL route, -EMSGSIZE for an oversized
 * datagram) return directly; failures after that go through the error
 * label, which subtracts the unsent length from inet->cork.length and
 * increments IPSTATS_MIB_OUTDISCARDS.
 */
1: int ip_append_data(struct sock *sk,
2: int getfrag(void *from, char *to, int offset, int len,
3: int odd, struct sk_buff *skb),
4: void *from, int length, int transhdrlen,
5: struct ipcm_cookie *ipc, struct rtable **rtp,
6: unsigned int flags)
7: {
8: struct inet_sock *inet = inet_sk(sk);
9: struct sk_buff *skb;
10:
11: struct ip_options *opt = NULL;
12: int hh_len;
13: int exthdrlen;
14: int mtu;
15: int copy;
16: int err;
17: int offset = 0;
18: unsigned int maxfraglen, fragheaderlen;
19: int csummode = CHECKSUM_NONE;
20: struct rtable *rt;
21:
22: if (flags&MSG_PROBE)
23: return 0;
24:
25: if (skb_queue_empty(&sk->sk_write_queue)) {
26: /*
27: * setup for corking.
28: */
29: opt = ipc->opt;
30: if (opt) {
31: if (inet->cork.opt == NULL) {
32: inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
33: if (unlikely(inet->cork.opt == NULL))
34: return -ENOBUFS;
35: }
36: memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
37: inet->cork.flags |= IPCORK_OPT;
38: inet->cork.addr = ipc->addr;
39: }
40: rt = *rtp;
41: if (unlikely(!rt))
42: return -EFAULT;
43: /*
44: * We steal reference to this route, caller should not release it
45: */
46: *rtp = NULL;
47: inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
48: rt->u.dst.dev->mtu :
49: dst_mtu(rt->u.dst.path);
50: inet->cork.dst = &rt->u.dst;
51: inet->cork.length = 0;
52: sk->sk_sndmsg_page = NULL;
53: sk->sk_sndmsg_off = 0;
54: if ((exthdrlen = rt->u.dst.header_len) != 0) {
55: length += exthdrlen;
56: transhdrlen += exthdrlen;
57: }
58: } else {
59: rt = (struct rtable *)inet->cork.dst;
60: if (inet->cork.flags & IPCORK_OPT)
61: opt = inet->cork.opt;
62:
63: transhdrlen = 0;
64: exthdrlen = 0;
65: mtu = inet->cork.fragsize;
66: }
67: hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
68:
69: fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
70: maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; /* header plus the largest payload that is a multiple of 8 bytes */
71:
72: if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
73: ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
74: return -EMSGSIZE;
75: }
76:
77: /*
78: * transhdrlen > 0 means that this is the first fragment and we wish
79: * it won't be fragmented in the future.
80: */
81: if (transhdrlen &&
82: length + fragheaderlen <= mtu &&
83: rt->u.dst.dev->features & NETIF_F_V4_CSUM &&
84: !exthdrlen)
85: csummode = CHECKSUM_PARTIAL;
86:
87: skb = skb_peek_tail(&sk->sk_write_queue);
88:
89: inet->cork.length += length;
90: if (((length > mtu) || (skb && skb_is_gso(skb))) &&
91: (sk->sk_protocol == IPPROTO_UDP) &&
92: (rt->u.dst.dev->features & NETIF_F_UFO)) {
93: err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
94: fragheaderlen, transhdrlen, mtu,
95: flags);
96: if (err)
97: goto error;
98: return 0;
99: }
100:
101: /* So, what's going on in the loop below?
102: *
103: * We use calculated fragment length to generate chained skb,
104: * each of segments is IP fragment ready for sending to network after
105: * adding appropriate IP header.
106: */
107:
108: if (!skb)
109: goto alloc_new_skb;
110:
111: while (length > 0) {
112: /* Check if the remaining data fits into current packet. */
113: copy = mtu - skb->len;
114: if (copy < length)
115: copy = maxfraglen - skb->len;
116: if (copy <= 0) {
117: char *data;
118: unsigned int datalen;
119: unsigned int fraglen;
120: unsigned int fraggap;
121: unsigned int alloclen;
122: struct sk_buff *skb_prev;
123: alloc_new_skb:
124: skb_prev = skb;
125: if (skb_prev)
126: fraggap = skb_prev->len - maxfraglen; /* bytes of the previous skb beyond maxfraglen; moved into the new skb below */
127: else
128: fraggap = 0;
129:
130: /*
131: * If remaining data exceeds the mtu,
132: * we know we need more fragment(s).
133: */
134: datalen = length + fraggap;
135: if (datalen > mtu - fragheaderlen)
136: datalen = maxfraglen - fragheaderlen;
137: fraglen = datalen + fragheaderlen;
138:
139: if ((flags & MSG_MORE) &&
140: !(rt->u.dst.dev->features&NETIF_F_SG))
141: alloclen = mtu;
142: else
143: alloclen = datalen + fragheaderlen;
144:
145: /* The last fragment gets additional space at tail.
146: * Note, with MSG_MORE we overallocate on fragments,
147: * because we have no idea what fragment will be
148: * the last.
149: */
150: if (datalen == length + fraggap)
151: alloclen += rt->u.dst.trailer_len;
152:
153: if (transhdrlen) {
154: skb = sock_alloc_send_skb(sk,
155: alloclen + hh_len + 15,
156: (flags & MSG_DONTWAIT), &err);
157: } else {
158: skb = NULL;
159: if (atomic_read(&sk->sk_wmem_alloc) <=
160: 2 * sk->sk_sndbuf)
161: skb = sock_wmalloc(sk,
162: alloclen + hh_len + 15, 1,
163: sk->sk_allocation);
164: if (unlikely(skb == NULL))
165: err = -ENOBUFS;
166: else
167: /* only the initial fragment is
168: time stamped */
169: ipc->shtx.flags = 0;
170: }
171: if (skb == NULL)
172: goto error;
173:
174: /*
175: * Fill in the control structures
176: */
177: skb->ip_summed = csummode;
178: skb->csum = 0;
179: skb_reserve(skb, hh_len);
180: *skb_tx(skb) = ipc->shtx;
181:
182: /*
183: * Find where to start putting bytes.
184: */
185: data = skb_put(skb, fraglen);
186: skb_set_network_header(skb, exthdrlen);
187: skb->transport_header = (skb->network_header +
188: fragheaderlen);
189: data += fragheaderlen;
190:
191: if (fraggap) {
192: skb->csum = skb_copy_and_csum_bits(
193: skb_prev, maxfraglen,
194: data + transhdrlen, fraggap, 0);
195: skb_prev->csum = csum_sub(skb_prev->csum,
196: skb->csum);
197: data += fraggap;
198: pskb_trim_unique(skb_prev, maxfraglen);
199: }
200:
201: copy = datalen - transhdrlen - fraggap;
202: if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
203: err = -EFAULT;
204: kfree_skb(skb);
205: goto error;
206: }
207:
208: offset += copy;
209: length -= datalen - fraggap;
210: transhdrlen = 0;
211: exthdrlen = 0;
212: csummode = CHECKSUM_NONE;
213:
214: /*
215: * Put the packet on the pending queue.
216: */
217: __skb_queue_tail(&sk->sk_write_queue, skb);
218: continue;
219: }
220:
221: if (copy > length)
222: copy = length;
223:
224: if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
225: unsigned int off;
226:
227: off = skb->len;
228: if (getfrag(from, skb_put(skb, copy),
229: offset, copy, off, skb) < 0) {
230: __skb_trim(skb, off);
231: err = -EFAULT;
232: goto error;
233: }
234: } else {
235: int i = skb_shinfo(skb)->nr_frags;
236: skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1]; /* NOTE(review): when i == 0 this points before frags[0]; it is compared at line 244 whenever a cached page with free space exists -- confirm against upstream */
237: struct page *page = sk->sk_sndmsg_page;
238: int off = sk->sk_sndmsg_off;
239: unsigned int left;
240:
241: if (page && (left = PAGE_SIZE - off) > 0) {
242: if (copy >= left)
243: copy = left;
244: if (page != frag->page) {
245: if (i == MAX_SKB_FRAGS) {
246: err = -EMSGSIZE;
247: goto error;
248: }
249: get_page(page);
250: skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
251: frag = &skb_shinfo(skb)->frags[i];
252: }
253: } else if (i < MAX_SKB_FRAGS) {
254: if (copy > PAGE_SIZE)
255: copy = PAGE_SIZE;
256: page = alloc_pages(sk->sk_allocation, 0);
257: if (page == NULL) {
258: err = -ENOMEM;
259: goto error;
260: }
261: sk->sk_sndmsg_page = page;
262: sk->sk_sndmsg_off = 0;
263:
264: skb_fill_page_desc(skb, i, page, 0, 0);
265: frag = &skb_shinfo(skb)->frags[i];
266: } else {
267: err = -EMSGSIZE;
268: goto error;
269: }
270: if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
271: err = -EFAULT;
272: goto error;
273: }
274: sk->sk_sndmsg_off += copy;
275: frag->size += copy;
276: skb->len += copy;
277: skb->data_len += copy;
278: skb->truesize += copy;
279: atomic_add(copy, &sk->sk_wmem_alloc);
280: }
281: offset += copy;
282: length -= copy;
283: }
284:
285: return 0;
286:
287: error:
288: inet->cork.length -= length;
289: IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
290: return err;
291: }
这是由那些想把传输的数据暂存于缓冲区的l4协议所使用的函数。此函数并不传输数据,而是将数据放在大小合适的一些缓冲区中,让后续的函数可以借此构成一些片段(必要的话)并进行传输。因此,此函数并不建立或操作任何IP报头。要把由ip_append_data缓冲的数据刷新并传输,L4层必须显式地调用ip_push_pending_frames(也会处理ip报头)才行。
如果l4层想要快速的响应时间,可以在每次调用ip_append_data之后立即再调用ip_push_pending_frames。但是,这两个函数的设计目的是尽可能多地把数据暂存于缓冲区内,然后一次传输,以获取效率;如果每次追加数据后都立即刷新,就与这一目的背道而驰了。
ip_append_data的主要任务是:
- 把来自于l4层的输入数据组成一些缓冲区,而这些缓冲区的尺寸又使其易于处理IP分段工作(必要时)。此外,把那些数据片段放进那些缓冲区时,还要安排成l3和l2层稍后能轻易新增较底层协议头
- 优化内存分配,把来自于上层的信息以及出口设备的能力考虑进来。特别是:
- 如果上层指出马上就有更多的其他传输请求(通过MSG_MORE标志),分配大一点的缓冲区才合理。
- 如果出口设备支持分散/聚集I/O(NETIF_F_SG),可以合理安排片段的存放方式,使内存处理得以优化
- 处理l4校验和。skb->ip_summed是根据出口设备能力和其他因素进行初始化的
ip_append_data的输入参数的意义:
sk,此封包传输背后的套接字。此数据结构包含一些参数,稍后必须用于填写ip报头
from,指向l4层正试着传输的数据(有效载荷)的指针。其不是内核指针,就是用户空间指针。getfrag函数的工作就是正确处理该指针
getfrag,用于把接收自l4层的有效载荷拷贝到即将建立的一些数据片段中
length,要传输的数据量(包括l4报头和l4有效载荷)
transhdrlen,传输报头的尺寸
ipc,正确转发封包所必须的信息
rt(函数原型中的参数名为rtp),与此封包相关的路由表缓存项目。ip_queue_xmit会自己获取此信息,而ip_append_data则依赖调用者通过ip_route_output_flow来收集该项信息。
flags,此变量可包含任何一个MSG_XXX标志。此函数会用到其中三个标志:
- MSG_MORE,此标志是由应用程序使用,来告知l4层马上就有更多其他传输。
- MSG_DONTWAIT,当此标志设定时,对ip_append_data的调用一定不能被阻塞。ip_append_data可能必须为套接字sk分配一个缓冲区(利用sock_alloc_send_skb)。当套接字的限额已用尽时,sock_alloc_send_skb要么阻塞(寄望于在定时器到期之前能有些空间可用),要么直接失败。此标志用于在这两个选项之间做选择
- MSG_PROBE,此标志设定时,用户其实不想传输任何东西,而只是在探测路径。如果此标志已设定,ip_append_data只会立即传回一个代表成功的返回代码
处理分段:
分片重组子系统初始化:
/*
 * ipfrag_init - boot-time setup of the IPv4 fragment reassembly state.
 *
 * Calls ip4_frags_ctl_register(), registers ip4_frags_ops as a per-net
 * subsystem, fills in the ip4_frags callbacks (hash, constructor,
 * destructor, match, expiry), the per-queue object size and the secret
 * interval, then hands the structure to inet_frags_init().
 */
1: void __init ipfrag_init(void)
2: {
3: ip4_frags_ctl_register();
4: register_pernet_subsys(&ip4_frags_ops);
5: ip4_frags.hashfn = ip4_hashfn;
6: ip4_frags.constructor = ip4_frag_init;
7: ip4_frags.destructor = ip4_frag_free;
8: ip4_frags.skb_free = NULL;
9: ip4_frags.qsize = sizeof(struct ipq);
10: ip4_frags.match = ip4_frag_match;
11: ip4_frags.frag_expire = ip_expire;
12: ip4_frags.secret_interval = 10 * 60 * HZ; /* 10 minutes expressed in jiffies */
13: inet_frags_init(&ip4_frags);
14: }
/*
 * inet_frags_init - initialise the protocol-independent part of a fragment
 * table.
 *
 * @f: table to initialise; f->secret_interval must already be set because
 *     it is used to arm the timer.
 *
 * Empties every hash bucket, initialises the rwlock, seeds f->rnd from
 * num_physpages and jiffies, and starts secret_timer so that
 * inet_frag_secret_rebuild() runs with @f as its argument after
 * f->secret_interval jiffies.
 */
1: void inet_frags_init(struct inet_frags *f)
2: {
3: int i;
4:
5: for (i = 0; i < INETFRAGS_HASHSZ; i++)
6: INIT_HLIST_HEAD(&f->hash[i]);
7:
8: rwlock_init(&f->lock);
9:
10: f->rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^
11: (jiffies ^ (jiffies >> 6)));
12:
13: setup_timer(&f->secret_timer, inet_frag_secret_rebuild,
14: (unsigned long)f);
15: f->secret_timer.expires = jiffies + f->secret_interval;
16: add_timer(&f->secret_timer);
17: }
IP分段,ip_fragment的输入数据可以是:
- 完整的已转发封包
- 原来的主机或路由器已分段的已转发封包
- 由本地函数(已启动分段流程,但尚未把作为封包传输所需的报头加进去)所建的缓冲区
特别的,ip_fragment必须能够处理下列两种情况:
必须切成小块的大数据块,要切割大缓冲区,需要分配新缓冲区,并在大缓冲区和小缓冲区间做内存拷贝。当然,这会影响性能
不需要再分段的数据片段链表或数组,如果分配的缓冲区有空间可以新增较低层的L3和L2报头,则ip_fragment处理这些缓冲区时就不需要做内存拷贝。IP层所做的就是为每个片段加一个ip报头,并处理校验和。
ip分段的主要任务:
- 把l3有效载荷分割成一些较小段的数据,使它与传送此封包的路径所用的MTU(PMTU)相匹配。如果IP有效载荷的尺寸并非刚刚好是片段尺寸的倍数,则最后一个片段会小于其他片段。此外,因为IP报头的“片段偏移量”字段是以8字节为单位,因此,片段的数据尺寸会按8字节边界对齐。除最后一个片段外,每个片段都是这个大小。
- 为每个片段的ip报头做初始化
- 计算ip校验和
- 向netfilter请求完成传输的权限
- 更新内核和snmp必要的统计数据
/*
 * ip_fragment - split @skb into IP fragments that fit the route MTU and
 * pass each one to @output.
 *
 * @skb:    packet to fragment; its route is taken from skb_rtable(skb).
 * @output: called once per fragment; a non-zero return stops the process.
 *
 * If the header has IP_DF set and skb->local_df is clear, nothing is sent:
 * IPSTATS_MIB_FRAGFAILS is incremented, an ICMP_DEST_UNREACH /
 * ICMP_FRAG_NEEDED message carrying the MTU is sent, the skb is freed and
 * -EMSGSIZE is returned.
 *
 * Fast path: when the skb already carries a frag_list whose pieces fit
 * (each no larger than the data space, 8-byte aligned except the last,
 * enough headroom for the header, not cloned or shared, and the packet is
 * not itself a fragment), the IP header is copied in front of each piece
 * in place. Any piece that fails the checks sends the whole packet to the
 * slow path after the ownership changes made so far are undone.
 *
 * Slow path: a new skb is allocated per fragment and the header plus a
 * slice of the payload is copied into it; the original skb is freed at the
 * end whether or not all fragments were sent.
 *
 * Returns 0 on success, otherwise the error from @output or -ENOMEM.
 */
1: int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
2: {
3: struct iphdr *iph;
4: int raw = 0;
5: int ptr;
6: struct net_device *dev;
7: struct sk_buff *skb2;
8: unsigned int mtu, hlen, left, len, ll_rs, pad;
9: int offset;
10: __be16 not_last_frag;
11: struct rtable *rt = skb_rtable(skb);
12: int err = 0;
13:
14: dev = rt->u.dst.dev;
15:
16: /*
17: * Point into the IP datagram header.
18: */
19:
20: iph = ip_hdr(skb);
21:
22: if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
23: IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
24: icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
25: htonl(ip_skb_dst_mtu(skb)));
26: kfree_skb(skb);
27: return -EMSGSIZE;
28: }
29:
30: /*
31: * Setup starting values.
32: */
33:
34: hlen = iph->ihl * 4; /* ihl counts 32-bit words */
35: mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */
36: IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
37:
38: /* When frag_list is given, use it. First, check its validity:
39: * some transformers could create wrong frag_list or break existing
40: * one, it is not prohibited. In this case fall back to copying.
41: *
42: * LATER: this step can be merged to real generation of fragments,
43: * we can switch to copy when see the first bad fragment.
44: */
45: if (skb_has_frags(skb)) {
46: struct sk_buff *frag, *frag2;
47: int first_len = skb_pagelen(skb);
48:
49: if (first_len - hlen > mtu ||
50: ((first_len - hlen) & 7) ||
51: (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
52: skb_cloned(skb))
53: goto slow_path;
54:
55: skb_walk_frags(skb, frag) {
56: /* Correct geometry. */
57: if (frag->len > mtu ||
58: ((frag->len & 7) && frag->next) ||
59: skb_headroom(frag) < hlen)
60: goto slow_path_clean;
61:
62: /* Partially cloned skb? */
63: if (skb_shared(frag))
64: goto slow_path_clean;
65:
66: BUG_ON(frag->sk);
67: if (skb->sk) {
68: frag->sk = skb->sk;
69: frag->destructor = sock_wfree;
70: }
71: skb->truesize -= frag->truesize;
72: }
73:
74: /* Everything is OK. Generate! */
75:
76: err = 0;
77: offset = 0;
78: frag = skb_shinfo(skb)->frag_list;
79: skb_frag_list_init(skb);
80: skb->data_len = first_len - skb_headlen(skb);
81: skb->len = first_len;
82: iph->tot_len = htons(first_len);
83: iph->frag_off = htons(IP_MF);
84: ip_send_check(iph);
85:
86: for (;;) {
87: /* Prepare header of the next frame,
88: * before previous one went down. */
89: if (frag) {
90: frag->ip_summed = CHECKSUM_NONE;
91: skb_reset_transport_header(frag);
92: __skb_push(frag, hlen);
93: skb_reset_network_header(frag);
94: memcpy(skb_network_header(frag), iph, hlen);
95: iph = ip_hdr(frag);
96: iph->tot_len = htons(frag->len);
97: ip_copy_metadata(frag, skb);
98: if (offset == 0)
99: ip_options_fragment(frag);
100: offset += skb->len - hlen;
101: iph->frag_off = htons(offset>>3); /* header field counts 8-byte units */
102: if (frag->next != NULL)
103: iph->frag_off |= htons(IP_MF);
104: /* Ready, complete checksum */
105: ip_send_check(iph);
106: }
107:
108: err = output(skb);
109:
110: if (!err)
111: IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
112: if (err || !frag)
113: break;
114:
115: skb = frag;
116: frag = skb->next;
117: skb->next = NULL;
118: }
119:
120: if (err == 0) {
121: IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
122: return 0;
123: }
124:
125: while (frag) {
126: skb = frag->next;
127: kfree_skb(frag);
128: frag = skb;
129: }
130: IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
131: return err;
132:
133: slow_path_clean:
134: skb_walk_frags(skb, frag2) {
135: if (frag2 == frag)
136: break;
137: frag2->sk = NULL;
138: frag2->destructor = NULL;
139: skb->truesize += frag2->truesize;
140: }
141: }
142:
143: slow_path:
144: left = skb->len - hlen; /* Space per frame */
145: ptr = raw + hlen; /* Where to start from */
146:
147: /* for bridged IP traffic encapsulated inside f.e. a vlan header,
148: * we need to make room for the encapsulating header
149: */
150: pad = nf_bridge_pad(skb);
151: ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, pad);
152: mtu -= pad;
153:
154: /*
155: * Fragment the datagram.
156: */
157:
158: offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
159: not_last_frag = iph->frag_off & htons(IP_MF);
160:
161: /*
162: * Keep copying data until we run out.
163: */
164:
165: while (left > 0) {
166: len = left;
167: /* IF: it doesn't fit, use 'mtu' - the data space left */
168: if (len > mtu)
169: len = mtu;
170: /* IF: we are not sending upto and including the packet end
171: then align the next start on an eight byte boundary */
172: if (len < left) {
173: len &= ~7;
174: }
175: /*
176: * Allocate buffer.
177: */
178:
179: if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
180: NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
181: err = -ENOMEM;
182: goto fail;
183: }
184:
185: /*
186: * Set up data on packet
187: */
188:
189: ip_copy_metadata(skb2, skb);
190: skb_reserve(skb2, ll_rs);
191: skb_put(skb2, len + hlen);
192: skb_reset_network_header(skb2);
193: skb2->transport_header = skb2->network_header + hlen;
194:
195: /*
196: * Charge the memory for the fragment to any owner
197: * it might possess
198: */
199:
200: if (skb->sk)
201: skb_set_owner_w(skb2, skb->sk);
202:
203: /*
204: * Copy the packet header into the new buffer.
205: */
206:
207: skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
208:
209: /*
210: * Copy a block of the IP datagram.
211: */
212: if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
213: BUG();
214: left -= len;
215:
216: /*
217: * Fill in the new header fields.
218: */
219: iph = ip_hdr(skb2);
220: iph->frag_off = htons((offset >> 3));
221:
222: /* ANK: dirty, but effective trick. Upgrade options only if
223: * the segment to be fragmented was THE FIRST (otherwise,
224: * options are already fixed) and make it ONCE
225: * on the initial skb, so that all the following fragments
226: * will inherit fixed options.
227: */
228: if (offset == 0)
229: ip_options_fragment(skb);
230:
231: /*
232: * Added AC : If we are fragmenting a fragment that's not the
233: * last fragment then keep MF on each bit
234: */
235: if (left > 0 || not_last_frag)
236: iph->frag_off |= htons(IP_MF);
237: ptr += len;
238: offset += len;
239:
240: /*
241: * Put this fragment into the sending queue.
242: */
243: iph->tot_len = htons(len + hlen);
244:
245: ip_send_check(iph);
246:
247: err = output(skb2);
248: if (err)
249: goto fail;
250:
251: IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
252: }
253: kfree_skb(skb);
254: IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
255: return err;
256:
257: fail:
258: kfree_skb(skb);
259: IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
260: return err;
261: }
重组:
重组的制约条件
- 片段必须存储在内核内存中,直到它完全被网络子系统处理完,但是内存很昂贵,因此,一定要有种方式去限制内存的使用
- 存储大量信息最有效的结构就是hash表,然而,hash表也会失衡,特别是如果恶意攻击者摸清hash算法后,精打细算试着拉高hash表中特定元素的权重使得处理速度慢下来。
- 网络联机时常使用不可靠的媒介,所以,片段可能会遗失。如果一个封包里的不同片段以不同路径传输时,这一点尤其可能发生。因此,IP层必须为每个封包持有一个定时器,并在某个点上放弃,并把所有接收到的片段抛弃掉。此外,也必须采用校验,把检测到损毁的机会拉到最大
- 如果源主机在一段时间后没收到某些数据的确认通知信息,而且传输协议实现了流控制,该主机就会重传数据。因此,对单个ip封包而言,目的地可能会接收到好几个重复片段。让这一问题更复杂的是,第二个ip封包所走的路径可能和第一个不同,因此,其分段也不同,所以片段间的边界也可能不吻合
于是,这些需求就造成接下来要说明的实现细节。片段存储在一个会定期改变(在hash函数的输入参数中多一个随机元素)的hash表内。每个封包都会关联一个定时器,如果定时器到期了,该封包就会被删除。每个片段都会检查是否损毁,以及是否和先前接收的片段重叠。
重组涉及的函数:
ip_evictor:逐一删除不完整的ipq结构,从最旧的着手,直到片段所用的内存降到sysctl_ipfrag_low_thresh阈值下。为了让ip_evictor正确运作,一个lru链表必须不断更新。其做法就是把新ipq结构加到一个全局链表尾端,然后每次有新片段加至一个ipq结构时,就把该ipq结构移到链表末尾。于是,最没希望完成的封包就会位于链表前端
ip_find:找出和正在被处理的片段相关的封包(片段链表)。查询是根据IP报头的4个字段:ID,源IP地址、目的IP地址以及l4协议。查询关键字实际上也包含一个本地参数:user。这一个参数用于指出重组的原因
ip_frag_queue:把指定的片段插入和同一个ip封包相关的片段链表中。
ip_frag_reasm:一旦所有片段都被接收之后,就从这些片段构建原有的ip封包
/*
 * ip_defrag - feed one received fragment into the reassembly machinery.
 *
 * @skb:  the fragment.
 * @user: passed to ip_find() as part of the queue lookup.
 *
 * Increments IPSTATS_MIB_REASMREQDS, runs ip_evictor() first when the
 * fragment memory in use exceeds high_thresh, then looks up (or creates)
 * the queue for this packet and calls ip_frag_queue() with qp->q.lock held.
 *
 * Returns whatever ip_frag_queue() returns. If ip_find() yields no queue,
 * IPSTATS_MIB_REASMFAILS is incremented, the skb is freed and -ENOMEM is
 * returned.
 */
1: int ip_defrag(struct sk_buff *skb, u32 user)
2: {
3: struct ipq *qp;
4: struct net *net;
5:
6: net = skb->dev ? dev_net(skb->dev) : dev_net(skb_dst(skb)->dev);
7: IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);
8:
9: /* Start by cleaning up the memory. */
10: if (atomic_read(&net->ipv4.frags.mem) > net->ipv4.frags.high_thresh)
11: ip_evictor(net);
12:
13: /* Lookup (or create) queue header */
14: if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) {
15: int ret;
16:
17: spin_lock(&qp->q.lock);
18:
19: ret = ip_frag_queue(qp, skb);
20:
21: spin_unlock(&qp->q.lock);
22: ipq_put(qp);
23: return ret;
24: }
25:
26: IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
27: kfree_skb(skb);
28: return -ENOMEM;
29: }
使用链表处理片段,可以对所用的内存进行优化,但是会使得片段的处理稍微复杂一点。ip_frag_queue所做的主要任务如下:
- 弄清输入片段处于原有封包何处(根据其偏移量和长度)
- 弄清是否为封包的最后片段,如果是的话,就从中取出ip封包长度
- 把该片段插入到链表中,而该链表内的片段都关联同一个ip封包,此外还要处理可能的重叠问题。
- 更新ipq结构中由垃圾收集任务所用的那些字段
- 必要时,让在硬件中计算的l4校验和失效
/*
 * ip_frag_queue - insert one fragment into @qp's list, which is kept sorted
 * by offset, trimming or dropping data that overlaps fragments already
 * queued.
 *
 * @qp:  reassembly queue for the packet; ip_defrag() calls this with
 *       qp->q.lock held.
 * @skb: the fragment; it is freed here on every error path.
 *
 * Returns:
 *   - the result of ip_frag_reasm() once both the first and the last
 *     fragment have been seen and the queued byte count (meat) equals the
 *     packet length (len);
 *   - -EINPROGRESS when the fragment was queued but the packet is still
 *     incomplete (the queue is then moved to the tail of the LRU list);
 *   - -ENOENT when the queue is already marked complete, -EINVAL for an
 *     inconsistent or empty fragment, -ENOMEM when pulling data fails, or
 *     the error from ip_frag_reinit() / pskb_trim_rcsum().
 */
1: /* Add new segment to existing queue. */
2: static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
3: {
4: struct sk_buff *prev, *next;
5: struct net_device *dev;
6: int flags, offset;
7: int ihl, end;
8: int err = -ENOENT;
9:
10: if (qp->q.last_in & INET_FRAG_COMPLETE)
11: goto err;
12:
13: if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
14: unlikely(ip_frag_too_far(qp)) &&
15: unlikely(err = ip_frag_reinit(qp))) {
16: ipq_kill(qp);
17: goto err;
18: }
19:
20: offset = ntohs(ip_hdr(skb)->frag_off);
21: flags = offset & ~IP_OFFSET;
22: offset &= IP_OFFSET;
23: offset <<= 3; /* offset is in 8-byte chunks */
24: ihl = ip_hdrlen(skb);
25:
26: /* Determine the position of this fragment. */
27: end = offset + skb->len - ihl;
28: err = -EINVAL;
29:
30: /* Is this the final fragment? */
31: if ((flags & IP_MF) == 0) {
32: /* If we already have some bits beyond end
33: * or have different end, the segment is corrrupted.
34: */
35: if (end < qp->q.len ||
36: ((qp->q.last_in & INET_FRAG_LAST_IN) && end != qp->q.len))
37: goto err;
38: qp->q.last_in |= INET_FRAG_LAST_IN;
39: qp->q.len = end;
40: } else {
41: if (end&7) {
42: end &= ~7;
43: if (skb->ip_summed != CHECKSUM_UNNECESSARY)
44: skb->ip_summed = CHECKSUM_NONE;
45: }
46: if (end > qp->q.len) {
47: /* Some bits beyond end -> corruption. */
48: if (qp->q.last_in & INET_FRAG_LAST_IN)
49: goto err;
50: qp->q.len = end;
51: }
52: }
53: if (end == offset)
54: goto err;
55:
56: err = -ENOMEM;
57: if (pskb_pull(skb, ihl) == NULL)
58: goto err;
59:
60: err = pskb_trim_rcsum(skb, end - offset);
61: if (err)
62: goto err;
63:
64: /* Find out which fragments are in front and at the back of us
65: * in the chain of fragments so far. We must know where to put
66: * this fragment, right?
67: */
68: prev = NULL;
69: for (next = qp->q.fragments; next != NULL; next = next->next) {
70: if (FRAG_CB(next)->offset >= offset)
71: break; /* bingo! */
72: prev = next;
73: }
74:
75: /* We found where to put this one. Check for overlap with
76: * preceding fragment, and, if needed, align things so that
77: * any overlaps are eliminated.
78: */
79: if (prev) {
80: int i = (FRAG_CB(prev)->offset + prev->len) - offset;
81:
82: if (i > 0) {
83: offset += i;
84: err = -EINVAL;
85: if (end <= offset)
86: goto err;
87: err = -ENOMEM;
88: if (!pskb_pull(skb, i))
89: goto err;
90: if (skb->ip_summed != CHECKSUM_UNNECESSARY)
91: skb->ip_summed = CHECKSUM_NONE;
92: }
93: }
94:
95: err = -ENOMEM;
96:
97: while (next && FRAG_CB(next)->offset < end) {
98: int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */
99:
100: if (i < next->len) {
101: /* Eat head of the next overlapped fragment
102: * and leave the loop. The next ones cannot overlap.
103: */
104: if (!pskb_pull(next, i))
105: goto err;
106: FRAG_CB(next)->offset += i;
107: qp->q.meat -= i;
108: if (next->ip_summed != CHECKSUM_UNNECESSARY)
109: next->ip_summed = CHECKSUM_NONE;
110: break;
111: } else {
112: struct sk_buff *free_it = next;
113:
114: /* Old fragment is completely overridden with
115: * new one drop it.
116: */
117: next = next->next;
118:
119: if (prev)
120: prev->next = next;
121: else
122: qp->q.fragments = next;
123:
124: qp->q.meat -= free_it->len;
125: frag_kfree_skb(qp->q.net, free_it, NULL);
126: }
127: }
128:
129: FRAG_CB(skb)->offset = offset;
130:
131: /* Insert this fragment in the chain of fragments. */
132: skb->next = next;
133: if (prev)
134: prev->next = skb;
135: else
136: qp->q.fragments = skb;
137:
138: dev = skb->dev;
139: if (dev) {
140: qp->iif = dev->ifindex;
141: skb->dev = NULL;
142: }
143: qp->q.stamp = skb->tstamp;
144: qp->q.meat += skb->len;
145: atomic_add(skb->truesize, &qp->q.net->mem);
146: if (offset == 0)
147: qp->q.last_in |= INET_FRAG_FIRST_IN;
148:
149: if (qp->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
150: qp->q.meat == qp->q.len)
151: return ip_frag_reasm(qp, prev, dev);
152:
153: write_lock(&ip4_frags.lock);
154: list_move_tail(&qp->q.lru_list, &qp->q.net->lru_list); /* mark this queue as most recently used */
155: write_unlock(&ip4_frags.lock);
156: return -EINPROGRESS;
157:
158: err:
159: kfree_skb(skb);
160: return err;
161: }