位置:net/ipv4/tcp_offload.c
在网卡驱动中,当支持NAPI时,在中断中接收到数据后,最后通过调用napi_gro_receive,将数据交给上层协议,而在此过程中,会在每层协议中调用不同的回调函数,下面来看TCP层的情况。首先要声明一个结构:
/*
 * Offload callbacks for TCP over IPv4.  Registered with the inet
 * offload table (see tcpv4_offload_init()) and invoked by the GRO/GSO
 * core, e.g. from napi_gro_receive() on the receive path:
 *   - gso_send_check / gso_segment: software segmentation on transmit
 *   - gro_receive / gro_complete:   receive-side coalescing
 */
static const struct net_offload tcpv4_offload = {
	.callbacks = {
		.gso_send_check = tcp_v4_gso_send_check,
		.gso_segment = tcp_gso_segment,
		.gro_receive = tcp4_gro_receive,
		.gro_complete = tcp4_gro_complete,
	},
};
这里定义了一系列的回调函数,这个结构要注册到内核中去:
/*
 * Register the TCP/IPv4 offload callbacks in the inet offload table
 * under IPPROTO_TCP at boot time; propagates the return value of
 * inet_add_offload().
 */
int __init tcpv4_offload_init(void)
{
	return inet_add_offload(&tcpv4_offload, IPPROTO_TCP);
}
下面来看接收函数:
/*
 * GRO receive callback for TCP over IPv4.
 *
 * Verifies the TCP checksum (including the IPv4 pseudo-header) before
 * handing the packet to the protocol-independent tcp_gro_receive().
 * A packet that fails verification is marked for flush and is not
 * considered for coalescing.
 */
static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{
	/* Use the IP hdr immediately preceding for this transport */
	const struct iphdr *iph = skb_gro_network_header(skb);
	__wsum wsum;

	/* Don't bother verifying checksum if we're going to flush anyway. */
	if (NAPI_GRO_CB(skb)->flush)
		goto skip_csum;

	wsum = NAPI_GRO_CB(skb)->csum;

	switch (skb->ip_summed) {
	case CHECKSUM_NONE:
		/* No checksum from hardware: compute it over the GRO region. */
		wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb),
				    0);

		/* fall through */

	case CHECKSUM_COMPLETE:
		/* Fold in the pseudo-header; a result of 0 means valid. */
		if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
				  wsum)) {
			skb->ip_summed = CHECKSUM_UNNECESSARY;
			break;
		}

		/* Bad checksum: force a flush, do not attempt to merge. */
		NAPI_GRO_CB(skb)->flush = 1;
		return NULL;
	}

skip_csum:
	return tcp_gro_receive(head, skb);
}
/*
 * Protocol-independent GRO receive for TCP.
 *
 * Walks the per-NAPI GRO hold list looking for a held packet of the
 * same flow (identical source/dest port pair); if one is found and the
 * segments are compatible, this skb is coalesced into it via
 * skb_gro_receive().  Returns a pointer into the list when the caller
 * must flush that entry up the stack, NULL otherwise.
 */
struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{
	struct sk_buff **pp = NULL;
	struct sk_buff *p;
	struct tcphdr *th;
	struct tcphdr *th2;
	unsigned int len;
	unsigned int thlen;
	__be32 flags;
	unsigned int mss = 1;
	unsigned int hlen;
	unsigned int off;
	int flush = 1;
	int i;

	/*
	 * At this point NAPI_GRO_CB(skb)->data_offset already points at
	 * the TCP header.
	 */
	off = skb_gro_offset(skb);
	hlen = off + sizeof(*th);
	th = skb_gro_header_fast(skb, off);
	if (skb_gro_header_hard(skb, hlen)) {
		/* Header not in the linear area: take the slow path. */
		th = skb_gro_header_slow(skb, hlen, off);
		if (unlikely(!th))
			goto out;
	}

	thlen = th->doff * 4;
	if (thlen < sizeof(*th))
		goto out;

	/* Re-check with the full header length (options included). */
	hlen = off + thlen;
	if (skb_gro_header_hard(skb, hlen)) {
		th = skb_gro_header_slow(skb, hlen, off);
		if (unlikely(!th))
			goto out;
	}

	/*
	 * We now have the TCP header.  Advance
	 * NAPI_GRO_CB(skb)->data_offset past it so it points at the
	 * payload (the data following the TCP header), then compute the
	 * payload length.
	 */
	skb_gro_pull(skb, thlen);

	len = skb_gro_len(skb);
	flags = tcp_flag_word(th);

	/* Walk the NAPI hold list looking for a packet of the same flow. */
	for (; (p = *head); head = &p->next) {
		if (!NAPI_GRO_CB(p)->same_flow)
			continue;

		th2 = tcp_hdr(p);

		/*
		 * Compare source and dest ports as one 32-bit word; a
		 * mismatch clears same_flow for that held packet.
		 */
		if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
			NAPI_GRO_CB(p)->same_flow = 0;
			continue;
		}

		goto found;
	}

	goto out_check_final;

found:
	/*
	 * A held packet of the same flow was found.  flush is set
	 * whenever coalescing would be unsafe: a pending flush or IP-ID
	 * mismatch on the held packet; TCP_FLAG_CWR (congestion window
	 * reduced); any flag other than CWR/FIN/PSH differing between
	 * the two headers; differing ack_seq; differing TCP options;
	 * a payload larger than the MSS; or a sequence-number gap.
	 */
	/* Include the IP ID check below from the inner most IP hdr */
	flush = NAPI_GRO_CB(p)->flush | NAPI_GRO_CB(p)->flush_id;
	flush |= (__force int)(flags & TCP_FLAG_CWR);
	flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
		  ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH));
	flush |= (__force int)(th->ack_seq ^ th2->ack_seq);
	/* Compare the option bytes word by word. */
	for (i = sizeof(*th); i < thlen; i += 4)
		flush |= *(u32 *)((u8 *)th + i) ^
			 *(u32 *)((u8 *)th2 + i);

	mss = tcp_skb_mss(p);

	flush |= (len - 1) >= mss;
	flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);

	/* If flush is set no merge is attempted; otherwise coalesce. */
	if (flush || skb_gro_receive(head, skb)) {
		mss = 1;
		goto out_check_final;
	}

	/* Merge succeeded: propagate FIN/PSH onto the held packet. */
	p = *head;
	th2 = tcp_hdr(p);
	tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH);

/*
 * NOTE(article): the source of skb_gro_receive() (net/core/skbuff.c)
 * is inlined below for explanation only; it is not part of
 * net/ipv4/tcp_offload.c.
 *
 * offset is the position of the skb's payload, headlen the length of
 * the skb's linear buffer (fragments excluded), len the payload length.
 */
int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{
	struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb);
	unsigned int offset = skb_gro_offset(skb);
	unsigned int headlen = skb_headlen(skb);
	struct sk_buff *nskb, *lp, *p = *head;
	unsigned int len = skb_gro_len(skb);
	unsigned int delta_truesize;
	unsigned int headroom;

	/* A coalesced packet may not exceed 64K. */
	if (unlikely(p->len + len >= 65536))
		return -E2BIG;

	lp = NAPI_GRO_CB(p)->last;
	pinfo = skb_shinfo(lp);

	if (headlen <= offset) {
		skb_frag_t *frag;
		skb_frag_t *frag2;
		int i = skbinfo->nr_frags;
		int nr_frags = pinfo->nr_frags + i;

		/*
		 * Scatter-gather path: nr_frags is the combined fragment
		 * count of the two skbs.  skb_shared_info holds an
		 * skb_frag_t array of MAX_SKB_FRAGS entries; if we would
		 * overflow it, fall back to the frag_list merge below.
		 */
		if (nr_frags > MAX_SKB_FRAGS)
			goto merge;

		/* Copy skbinfo's fragment descriptors after pinfo's. */
		offset -= headlen;
		pinfo->nr_frags = nr_frags;
		skbinfo->nr_frags = 0;

		frag = pinfo->frags + nr_frags;
		frag2 = skbinfo->frags + i;
		do {
			*--frag = *--frag2;
		} while (--i);

		/*
		 * Adjust the first copied frag for the header bytes already
		 * consumed, then mark this skb as free-able.
		 */
		frag->page_offset += offset;
		skb_frag_size_sub(frag, offset);

		/* all fragments truesize : remove (head size + sk_buff) */
		delta_truesize = skb->truesize -
				 SKB_TRUESIZE(skb_end_offset(skb));

		skb->truesize -= skb->data_len;
		skb->len -= skb->data_len;
		skb->data_len = 0;

		NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE;
		goto done;
	} else if (skb->head_frag) {
		/*
		 * The skb's head itself lives in a page fragment; page is
		 * the memory page at the start of the skb's buffer.
		 */
		int nr_frags = pinfo->nr_frags;
		skb_frag_t *frag = pinfo->frags + nr_frags;
		struct page *page = virt_to_head_page(skb->head);
		unsigned int first_size = headlen - offset;
		unsigned int first_offset;

		if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS)
			goto merge;

		/*
		 * Turn the head into one extra fragment, then memcpy
		 * skbinfo's fragment descriptors in after it.
		 */
		first_offset = skb->data -
			       (unsigned char *)page_address(page) +
			       offset;

		pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags;

		frag->page.p = page;
		frag->page_offset = first_offset;
		skb_frag_size_set(frag, first_size);

		memcpy(frag + 1, skbinfo->frags,
		       sizeof(*frag) * skbinfo->nr_frags);
		/* We dont need to clear skbinfo->nr_frags here */

		delta_truesize = skb->truesize -
				 SKB_DATA_ALIGN(sizeof(struct sk_buff));
		NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD;
		goto done;
	}

	/* Neither fast path applied: allocate a new sk_buff. */
	if (pinfo->frag_list)
		goto merge;
	if (skb_gro_len(p) != pinfo->gso_size)
		return -E2BIG;

	headroom = skb_headroom(p);
	nskb = alloc_skb(headroom + skb_gro_offset(p), GFP_ATOMIC);
	if (unlikely(!nskb))
		return -ENOMEM;

	/* Copy p's header contents into the new sk_buff. */
	__copy_skb_header(nskb, p);
	nskb->mac_len = p->mac_len;

	skb_reserve(nskb, headroom);
	__skb_put(nskb, skb_gro_offset(p));

	skb_set_mac_header(nskb, skb_mac_header(p) - p->data);
	skb_set_network_header(nskb, skb_network_offset(p));
	skb_set_transport_header(nskb, skb_transport_offset(p));

	__skb_pull(p, skb_gro_offset(p));
	memcpy(skb_mac_header(nskb), skb_mac_header(p),
	       p->data - skb_mac_header(p));

	/* Adjust the new sk_buff's accounting fields and splice it in. */
	skb_shinfo(nskb)->frag_list = p;
	skb_shinfo(nskb)->gso_size = pinfo->gso_size;
	pinfo->gso_size = 0;
	skb_header_release(p);
	NAPI_GRO_CB(nskb)->last = p;

	nskb->data_len += p->len;
	nskb->truesize += p->truesize;
	nskb->len += p->len;

	*head = nskb;
	nskb->next = p->next;
	p->next = NULL;

	p = nskb;

merge:
	delta_truesize = skb->truesize;
	if (offset > headlen) {
		/* Part of the payload is still in the linear area: eat it. */
		unsigned int eat = offset - headlen;

		skbinfo->frags[0].page_offset += eat;
		skb_frag_size_sub(&skbinfo->frags[0], eat);
		skb->data_len -= eat;
		skb->len -= eat;
		offset = headlen;
	}

	__skb_pull(skb, offset);

	/* Append skb at the tail of p's frag_list chain. */
	if (NAPI_GRO_CB(p)->last == p)
		skb_shinfo(p)->frag_list = skb;
	else
		NAPI_GRO_CB(p)->last->next = skb;
	NAPI_GRO_CB(p)->last = skb;
	skb_header_release(skb);
	lp = p;

done:
	NAPI_GRO_CB(p)->count++;
	p->data_len += len;
	p->truesize += delta_truesize;
	p->len += len;
	if (lp != p) {
		lp->data_len += len;
		lp->truesize += delta_truesize;
		lp->len += len;
	}
	NAPI_GRO_CB(skb)->same_flow = 1;
	return 0;
}
EXPORT_SYMBOL_GPL(skb_gro_receive);

out_check_final:
	/*
	 * Final flush decision for this skb: a short (sub-MSS) segment or
	 * any of URG/PSH/RST/SYN/FIN forces immediate delivery.
	 */
	flush = len < mss;
	flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH |
					TCP_FLAG_RST | TCP_FLAG_SYN |
					TCP_FLAG_FIN));

	/* Hand back the list entry the caller must flush, if any. */
	if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
		pp = head;

out:
	NAPI_GRO_CB(skb)->flush |= (flush != 0);

	return pp;
}