TCP

位置:net/ipv4/tcp_offload.c

在网卡驱动中,当支持NAPI时,在中断中接收到数据后,最后通过调用napi_gro_receive,将数据交给上层协议,而在此过程中,会在每层协议中调用不同的回调函数,下面来看TCP层的情况。首先要申明一个结构:

static const struct net_offload tcpv4_offload = {
	.callbacks = {
		.gso_send_check	=	tcp_v4_gso_send_check,
		.gso_segment	=	tcp_gso_segment,
		.gro_receive	=	tcp4_gro_receive,
		.gro_complete	=	tcp4_gro_complete,
	},
};
这里定义了一系列的回调函数,这个结构要注册到内核中去:

int __init tcpv4_offload_init(void)
{
	return inet_add_offload(&tcpv4_offload, IPPROTO_TCP);
}
下面来看接收函数:

static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{
	/* Use the IP hdr immediately proceeding for this transport */
	const struct iphdr *iph = skb_gro_network_header(skb);
	__wsum wsum;

	/* Don't bother verifying checksum if we're going to flush anyway. */
	if (NAPI_GRO_CB(skb)->flush)
		goto skip_csum;

	wsum = NAPI_GRO_CB(skb)->csum;

	switch (skb->ip_summed) {
	case CHECKSUM_NONE:
		wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb),
				    0);

		/* fall through */

	case CHECKSUM_COMPLETE:
		if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
				  wsum)) {
			skb->ip_summed = CHECKSUM_UNNECESSARY;
			break;
		}

		NAPI_GRO_CB(skb)->flush = 1;
		return NULL;
	}

skip_csum:
	return tcp_gro_receive(head, skb);
}

struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{
	struct sk_buff **pp = NULL;
	struct sk_buff *p;
	struct tcphdr *th;
	struct tcphdr *th2;
	unsigned int len;
	unsigned int thlen;
	__be32 flags;
	unsigned int mss = 1;
	unsigned int hlen;
	unsigned int off;
	int flush = 1;
	int i;

这时,NAPI_GRO_CB(skb)->data_offset己经是TCP头部的位置了。

        off = skb_gro_offset(skb);
	hlen = off + sizeof(*th);
	th = skb_gro_header_fast(skb, off);
	if (skb_gro_header_hard(skb, hlen)) {
		th = skb_gro_header_slow(skb, hlen, off);
		if (unlikely(!th))
			goto out;
	}

	thlen = th->doff * 4;
	if (thlen < sizeof(*th))
		goto out;

	hlen = off + thlen;
	if (skb_gro_header_hard(skb, hlen)) {
		th = skb_gro_header_slow(skb, hlen, off);
		if (unlikely(!th))
			goto out;
	}
先得到TCP头部,下面是将NAPI_GRO_CB(skb)->data_offset设为指向实际的数据(跟在TCP头部之后的数据)然后得到实际数据的长度

        skb_gro_pull(skb, thlen);

	len = skb_gro_len(skb);
	flags = tcp_flag_word(th);
下面是一个循环,对napi的列表进行遍历:

        for (; (p = *head); head = &p->next) {
 		if (!NAPI_GRO_CB(p)->same_flow)
 			continue;

 		th2 = tcp_hdr(p);

 		if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
			NAPI_GRO_CB(p)->same_flow = 0;
 			continue;
 		}

 		goto found;
 	}

        goto out_check_final;
同样是设置same_flow的值,还是判断源是否一致。

found:
	/* Include the IP ID check below from the inner most IP hdr */
	flush = NAPI_GRO_CB(p)->flush | NAPI_GRO_CB(p)->flush_id;
	flush |= (__force int)(flags & TCP_FLAG_CWR);
	flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
		  ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH));
	flush |= (__force int)(th->ack_seq ^ th2->ack_seq);
	for (i = sizeof(*th); i < thlen; i += 4)
		flush |= *(u32 *)((u8 *)th + i) ^
			 *(u32 *)((u8 *)th2 + i);

	mss = tcp_skb_mss(p);

	flush |= (len - 1) >= mss;
	flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);

	if (flush || skb_gro_receive(head, skb)) {
		mss = 1;
		goto out_check_final;
	}

	p = *head;
	th2 = tcp_hdr(p);
	tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH);
如果在上面找到了在同一个流中的sk_buff,设置flush。要设置flush的情况包括:TCP_FLAG_CWR(拥塞窗口减少);flag不相同或者相同并且不是TCP_FLAG_CWR,不是TCP_FLAG_FIN(结束会话),不是TCP_FLAG_PSH(数据包立即发送);ack_seq不同;TCP头部不相同等。

如果设置了flush,则不用合并,否则要进行合并:

int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{
	struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb);
	unsigned int offset = skb_gro_offset(skb);
	unsigned int headlen = skb_headlen(skb);
	struct sk_buff *nskb, *lp, *p = *head;
	unsigned int len = skb_gro_len(skb);
	unsigned int delta_truesize;
	unsigned int headroom;

	if (unlikely(p->len + len >= 65536))
		return -E2BIG;

	lp = NAPI_GRO_CB(p)->last;
	pinfo = skb_shinfo(lp);
offset为skb实际数据的位置,headlen为skb主buffer的长度(不包含分片的长度),len为实际数据的长度。
	if (headlen <= offset) {
		skb_frag_t *frag;
		skb_frag_t *frag2;
		int i = skbinfo->nr_frags;
		int nr_frags = pinfo->nr_frags + i;

		if (nr_frags > MAX_SKB_FRAGS)
			goto merge;

这里是对Scatter-Gather I/O的处理,先得到两个skb_buff的总的分片数。MAX_SKB_FRAGS为16。在skb_shared_info中有一个skb_frag_t数组,大小为MAX_SKB_FRAGS,如果数量超过数组大小,去进行合并。

                offset -= headlen;
		pinfo->nr_frags = nr_frags;
 		skbinfo->nr_frags = 0;

 		frag = pinfo->frags + nr_frags;
 		frag2 = skbinfo->frags + i;
 		do {
 			*--frag = *--frag2;
 		} while (--i);
把skbinfo中的分片信息拷贝到pinfo的后面。
		frag->page_offset += offset;
		skb_frag_size_sub(frag, offset);

		/* all fragments truesize : remove (head size + sk_buff) */
		delta_truesize = skb->truesize -
				 SKB_TRUESIZE(skb_end_offset(skb));

		skb->truesize -= skb->data_len;
		skb->len -= skb->data_len;
		skb->data_len = 0;

		NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE;
		goto done;
调转frag和skb的值。将此skb标记为可以删除。
	} else if (skb->head_frag) {
		int nr_frags = pinfo->nr_frags;
		skb_frag_t *frag = pinfo->frags + nr_frags;
		struct page *page = virt_to_head_page(skb->head);
		unsigned int first_size = headlen - offset;
		unsigned int first_offset;

		if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS)
			goto merge;
如果分片数量不为0。page为skb的buffer的开始处的内存页。
		first_offset = skb->data -
			       (unsigned char *)page_address(page) +
			       offset;

		pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags;

		frag->page.p	  = page;
		frag->page_offset = first_offset;
		skb_frag_size_set(frag, first_size);

		memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags);
		/* We dont need to clear skbinfo->nr_frags here */

		delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
		NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD;
		goto done;
	}
这里是直接将skbinfo中的分片信息通过memcpy拷贝到pinfo的后面。
	if (pinfo->frag_list)
		goto merge;
	if (skb_gro_len(p) != pinfo->gso_size)
		return -E2BIG;

	headroom = skb_headroom(p);
	nskb = alloc_skb(headroom + skb_gro_offset(p), GFP_ATOMIC);
	if (unlikely(!nskb))
		return -ENOMEM;
分配一个新的sk_buff。
	__copy_skb_header(nskb, p);
	nskb->mac_len = p->mac_len;

	skb_reserve(nskb, headroom);
	__skb_put(nskb, skb_gro_offset(p));

	skb_set_mac_header(nskb, skb_mac_header(p) - p->data);
	skb_set_network_header(nskb, skb_network_offset(p));
	skb_set_transport_header(nskb, skb_transport_offset(p));

	__skb_pull(p, skb_gro_offset(p));
	memcpy(skb_mac_header(nskb), skb_mac_header(p),
	       p->data - skb_mac_header(p));
将skb中的内容拷贝到新的sk_buff中。
	skb_shinfo(nskb)->frag_list = p;
	skb_shinfo(nskb)->gso_size = pinfo->gso_size;
	pinfo->gso_size = 0;
	skb_header_release(p);
	NAPI_GRO_CB(nskb)->last = p;

	nskb->data_len += p->len;
	nskb->truesize += p->truesize;
	nskb->len += p->len;

	*head = nskb;
	nskb->next = p->next;
	p->next = NULL;

	p = nskb;
调整新的sk_buff中的各值。
merge:
	delta_truesize = skb->truesize;
	if (offset > headlen) {
		unsigned int eat = offset - headlen;

		skbinfo->frags[0].page_offset += eat;
		skb_frag_size_sub(&skbinfo->frags[0], eat);
		skb->data_len -= eat;
		skb->len -= eat;
		offset = headlen;
	}


	__skb_pull(skb, offset);

	if (NAPI_GRO_CB(p)->last == p)
		skb_shinfo(p)->frag_list = skb;
	else
		NAPI_GRO_CB(p)->last->next = skb;
	NAPI_GRO_CB(p)->last = skb;
	skb_header_release(skb);
	lp = p;


done:
	NAPI_GRO_CB(p)->count++;
	p->data_len += len;
	p->truesize += delta_truesize;
	p->len += len;
	if (lp != p) {
		lp->data_len += len;
		lp->truesize += delta_truesize;
		lp->len += len;
	}
	NAPI_GRO_CB(skb)->same_flow = 1;
	return 0;
}
EXPORT_SYMBOL_GPL(skb_gro_receive);




out_check_final:
	flush = len < mss;
	flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH |
					TCP_FLAG_RST | TCP_FLAG_SYN |
					TCP_FLAG_FIN));

	if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
		pp = head;

out:
	NAPI_GRO_CB(skb)->flush |= (flush != 0);

	return pp;
}


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值