数据包的分片与重组发生在IP层, 当IP数据包的长度超过PMTU,并且允许IP分片时,就会进行分片操作,分片后的数据包有独立的IP报头,并且独立路由,在接收端的IP层进行重组. 目前有两种分片处理方式,快速分片和慢速分片.
- 分片工作
- IP层在处理分片时,把TCP/UDP的负载分割成MTU大小的片段,并且为每个片段设置IP报头,更新IP报头offset和校验和,如果是慢速分片,还需要进行数据的拷贝(处理frag_list和frag数组)
- 分片方式(同时满足以下条件时走快速分片,否则走慢速分片)
1 存在frag_list 链表,且链表上每个分片的大小都不超过MTU(其实这是TCP/UDP层进行了预分片处理)
2 除最后一个分片外,其他分片都要8字节对齐
3 此skb没有被克隆
- 源码分析
int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
/*判读是否可以进行快速分片
如果存在frag_list,则进入快速分片流程 */
if (skb_has_frag_list(skb)) {
struct sk_buff *frag, *frag2;
int first_len = skb_pagelen(skb);/*计算skb数据大小包括线性数据和SG数据 */
/*长度大于MTU,没有8自己对齐,已经被分片,或者已经被克隆,都会进入慢速分片流程 */
if (first_len - hlen > mtu ||
((first_len - hlen) & 7) ||
ip_is_fragment(iph) ||
skb_cloned(skb))
goto slow_path;
skb_walk_frags(skb, frag) {
/* 继续检查每个分片是否满足快速分片条件 */
if (frag->len > mtu ||
((frag->len & 7) && frag->next) ||
skb_headroom(frag) < hlen)
goto slow_path_clean;
/* Partially cloned skb? */
if (skb_shared(frag))
goto slow_path_clean;
BUG_ON(frag->sk);
if (skb->sk) {
frag->sk = skb->sk;
frag->destructor = sock_wfree;
}
/*从skb中减去分片大小 */
skb->truesize -= frag->truesize;
}
/*处理第一个skb,重新设置IP报头 */
err = 0;
offset = 0;
frag = skb_shinfo(skb)->frag_list;
skb_frag_list_init(skb);
skb->data_len = first_len - skb_headlen(skb);
skb->len = first_len;
iph->tot_len = htons(first_len);
iph->frag_off = htons(IP_MF);
ip_send_check(iph);
/*处理frag_list分片 */
for (;;) {
/* Prepare header of the next frame,
* before previous one went down. */
if (frag) {
/*重新构建分片IP报头 */
frag->ip_summed = CHECKSUM_NONE;
skb_reset_transport_header(frag);
__skb_push(frag, hlen);
skb_reset_network_header(frag);
memcpy(skb_network_header(frag), iph, hlen);
iph = ip_hdr(frag);
iph->tot_len = htons(frag->len);
ip_copy_metadata(frag, skb);
if (offset == 0)
ip_options_fragment(frag);
offset += skb->len - hlen;
iph->frag_off = htons(offset>>3);
if (frag->next != NULL)
iph->frag_off |= htons(IP_MF);
/* Ready, complete checksum */
ip_send_check(iph);
}
/*发送IP分片 */
err = output(skb);
skb = frag;
frag = skb->next;
skb->next = NULL;
}
/*慢速分片 */
slow_path:
iph = ip_hdr(skb);
/*计算数据包总长度 */
left = skb->len - hlen; /* Space per frame */
ptr = hlen; /* Where to start from */
ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));
offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
not_last_frag = iph->frag_off & htons(IP_MF);
while (left > 0) {
len = left;
/* IF: it doesn't fit, use 'mtu' - the data space left */
if (len > mtu)
len = mtu;
/* IF: we are not sending up to and including the packet end
then align the next start on an eight byte boundary */
if (len < left) {
len &= ~7;
}
/*重新分配一个新的分片 */
if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
err = -ENOMEM;
goto fail;
}
/*重新构建skb和IP报头 */
ip_copy_metadata(skb2, skb);
skb_reserve(skb2, ll_rs);
skb_put(skb2, len + hlen);
skb_reset_network_header(skb2);
skb2->transport_header = skb2->network_header + hlen
if (skb->sk)
skb_set_owner_w(skb2, skb->sk);
/*复制skb线性数据包 */
skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
/*复制skb 的frag_list和SG数据 */
if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
BUG();
left -= len;
iph = ip_hdr(skb2);
/*设置分片偏移量,用于重组 */
iph->frag_off = htons((offset >> 3));
if (offset == 0)
ip_options_fragment(skb);
if (left > 0 || not_last_frag)
iph->frag_off |= htons(IP_MF);/*设置MF标志,用于重组 */
ptr += len;
offset += len;
iph->tot_len = htons(len + hlen);
ip_send_check(iph);
/*最后输出新的IP数据包 */
err = output(skb2);
if (err)
goto fail;
IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
}
consume_skb(skb);
IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
return err;
fail:
kfree_skb(skb);
IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
return err;
}