Network Packet Reception: GRO Processing


When the driver hands received packets up for GRO processing, it calls napi_gro_receive(); only after GRO has run does the packet move up to the protocol layers (e.g. TCP/IP) for further processing. The GRO path works as follows:
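
For context, here is a minimal sketch of the call site: a NAPI driver feeds received skbs into GRO from its poll routine. Every name prefixed with my_ is a hypothetical placeholder; napi_gro_receive(), eth_type_trans() and napi_complete() are the actual kernel APIs:

static int my_poll(struct napi_struct *napi, int budget)
{
	struct my_ring *ring = container_of(napi, struct my_ring, napi);
	int work_done = 0;

	while (work_done < budget) {
		/* my_fetch_rx_skb() stands in for the driver's descriptor
		 * ring handling and skb construction */
		struct sk_buff *skb = my_fetch_rx_skb(ring);

		if (!skb)
			break;

		skb->protocol = eth_type_trans(skb, ring->netdev);
		/* hand the skb to GRO rather than netif_receive_skb();
		 * GRO may merge it into a held flow or pass it up */
		napi_gro_receive(napi, skb);
		work_done++;
	}

	if (work_done < budget)
		napi_complete(napi);	/* done; RX interrupts get re-enabled */

	return work_done;
}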

1. First, check whether the interface has GRO enabled (NETIF_F_GRO) and whether netpoll is active on it, and whether the packet is already aggregated (a GSO skb, or an skb carrying a frag list, e.g. assembled by hardware); GRO continues only for plain, unaggregated packets:

	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
		goto normal;

	if (skb_is_gso(skb) || skb_has_frag_list(skb))
		goto normal;

 

2. Then walk the packets held on the NAPI gro_list awaiting merge and compare each against the newly received one: the flow hash, the receiving net_device (Linux supports multiple virtual interfaces stacked on one physical interface), the VLAN tag, and the MAC header. Entries that belong to the same flow as the new packet get same_flow set:

static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
{
	struct sk_buff *p;
	unsigned int maclen = skb->dev->hard_header_len;
	u32 hash = skb_get_hash_raw(skb);

	for (p = napi->gro_list; p; p = p->next) {
		unsigned long diffs;

		NAPI_GRO_CB(p)->flush = 0;

		if (hash != skb_get_hash_raw(p)) {
			NAPI_GRO_CB(p)->same_flow = 0;
			continue;
		}

		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
		diffs |= p->vlan_tci ^ skb->vlan_tci;
		if (maclen == ETH_HLEN)
			diffs |= compare_ether_header(skb_mac_header(p),
						      skb_gro_mac_header(skb));
		else if (!diffs)
			diffs = memcmp(skb_mac_header(p),
				       skb_gro_mac_header(skb),
				       maclen);
		NAPI_GRO_CB(p)->same_flow = !diffs;
	}
}
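
gro_list_prepare() uses a branch-light comparison idiom: every field difference is OR-ed into a single diffs word, and the two flows match only if that word is still zero at the end. A standalone userspace illustration of the same idiom (not kernel code):

#include <stdio.h>
#include <string.h>

struct flow_key {
	unsigned long dev;		/* opaque device identity */
	unsigned short vlan_tci;	/* VLAN tag control information */
	unsigned char mac[12];		/* destination + source MAC */
};

static int same_flow(const struct flow_key *a, const struct flow_key *b)
{
	unsigned long diffs;

	/* accumulate all differences; any mismatch leaves a bit set */
	diffs  = a->dev ^ b->dev;
	diffs |= a->vlan_tci ^ b->vlan_tci;
	diffs |= memcmp(a->mac, b->mac, sizeof(a->mac));

	return diffs == 0;
}

int main(void)
{
	struct flow_key a = { 0x1000, 100, "\x01\x02\x03\x04\x05\x06" };
	struct flow_key b = a;

	printf("%d\n", same_flow(&a, &b));	/* 1: identical */
	b.vlan_tci = 200;
	printf("%d\n", same_flow(&a, &b));	/* 0: VLAN differs */
	return 0;
}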


3. Call back into the upper protocol, via its registered gro_receive handler, so it can process the newly received packet and update the flow flags on the packets held on the NAPI list:

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, head, list) {
		if (ptype->type != type || !ptype->callbacks.gro_receive)
			continue;

		skb_set_network_header(skb, skb_gro_offset(skb));
		skb_reset_mac_len(skb);
		NAPI_GRO_CB(skb)->same_flow = 0;
		NAPI_GRO_CB(skb)->flush = 0;
		NAPI_GRO_CB(skb)->free = 0;
		NAPI_GRO_CB(skb)->udp_mark = 0;

		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
		break;
	}
	rcu_read_unlock();
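
The callbacks iterated here are registered per EtherType through dev_add_offload(). For reference, IPv4 wires up its GRO hooks roughly like this (abridged from net/ipv4/af_inet.c of this kernel generation; the exact callback fields vary across versions):

static struct packet_offload ip_packet_offload __read_mostly = {
	.type = cpu_to_be16(ETH_P_IP),
	.callbacks = {
		.gso_segment	= inet_gso_segment,
		.gro_receive	= inet_gro_receive,
		.gro_complete	= inet_gro_complete,
	},
};

static int __init ipv4_offload_init(void)
{
	/* ... TCP/UDP offload registration elided ... */
	dev_add_offload(&ip_packet_offload);
	return 0;
}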


4. If no upper-layer protocol claims the packet (the offload handler list was exhausted without a match), or the handler decided the packet cannot be merged and flagged a flush, deliver it straight to the network layer:

	if (NAPI_GRO_CB(skb)->flush)
		goto normal;
	if (&ptype->list == head)
		goto normal;


5. If the packet was successfully merged into an skb already on the list (same_flow is set), nothing more needs to happen; return GRO_MERGED, or GRO_MERGED_FREE if the merged skb's memory can be released:

	same_flow = NAPI_GRO_CB(skb)->same_flow;
	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
	if (same_flow)
		goto ok;


6. If the upper protocol returned a pointer into the skb list, the flow it points at is flushed. The reason: the gro_receive callback returns pp when the matched flow can no longer grow (for instance the aggregate reached its size limit, or a flow-terminating or out-of-order segment arrived), so that skb is unlinked from gro_list and completed up the stack via napi_gro_complete():

	if (pp) {
		struct sk_buff *nskb = *pp;

		*pp = nskb->next;
		nskb->next = NULL;
		napi_gro_complete(nskb);
		napi->gro_count--;
	}


7. If the packet may be merged with later arrivals, it is parked on the NAPI gro_list for now; if too many packets are already held there (MAX_GRO_SKBS, 8 in this kernel), the oldest flow, found at the tail of the list, is first flushed up to the network layer:

	if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
		struct sk_buff *nskb = napi->gro_list;

		/* locate the end of the list to select the 'oldest' flow */
		while (nskb->next) {
			pp = &nskb->next;
			nskb = *pp;
		}
		*pp = NULL;
		nskb->next = NULL;
		napi_gro_complete(nskb);
	} else {
		napi->gro_count++;
	}
	NAPI_GRO_CB(skb)->count = 1;
	NAPI_GRO_CB(skb)->age = jiffies;
	NAPI_GRO_CB(skb)->last = skb;
	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
	skb->next = napi->gro_list;
	napi->gro_list = skb;
	ret = GRO_HELD;


8. Finally, before returning, check whether the headers GRO inspected lie outside the skb's linear area, and adjust if so. The reason: for skbs whose payload lives entirely in page fragments, GRO parses headers directly in fragment 0 (the frag0 shortcut); if skb_headlen() is smaller than skb_gro_offset(), the parsed header bytes must be copied into the linear area and trimmed from the first fragment so that upper layers see a conventional skb layout:

pull:
	if (skb_headlen(skb) < skb_gro_offset(skb)) {
		int grow = skb_gro_offset(skb) - skb_headlen(skb);

		BUG_ON(skb->end - skb->tail < grow);

		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);

		skb->tail += grow;
		skb->data_len -= grow;

		skb_shinfo(skb)->frags[0].page_offset += grow;
		skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);

		if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
			skb_frag_unref(skb, 0);
			memmove(skb_shinfo(skb)->frags,
				skb_shinfo(skb)->frags + 1,
				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
		}
	}


9. If the packet is to continue up the stack, netif_receive_skb_internal() is called. It first checks whether the received frame is a PTP frame and, if so, has the PHY driver attach a receive timestamp; the classify() helper here matches PTP frames with a small BPF-style filter program:

bool skb_defer_rx_timestamp(struct sk_buff *skb)
{
	struct phy_device *phydev;
	unsigned int type;

	if (skb_headroom(skb) < ETH_HLEN)
		return false;
	__skb_push(skb, ETH_HLEN);

	type = classify(skb);

	__skb_pull(skb, ETH_HLEN);

	switch (type) {
	case PTP_CLASS_V1_IPV4:
	case PTP_CLASS_V1_IPV6:
	case PTP_CLASS_V2_IPV4:
	case PTP_CLASS_V2_IPV6:
	case PTP_CLASS_V2_L2:
	case PTP_CLASS_V2_VLAN:
		phydev = skb->dev->phydev;
		if (likely(phydev->drv->rxtstamp))
			return phydev->drv->rxtstamp(phydev, skb, type);
		break;
	default:
		break;
	}

	return false;
}


10. On multiprocessor systems with RPS (Receive Packet Steering) enabled, packets are distributed across CPUs to even out the load and relieve any single CPU. Concretely, the target CPU for the packet is computed first (from its flow hash), then the packet is queued on that CPU's sd->input_pkt_queue and the per-CPU NAPI instance named backlog is scheduled, so the packet is processed on the next softirq round; if too many packets are already waiting on that CPU, the newly received packet is dropped:

static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
			      unsigned int *qtail)
{
	struct softnet_data *sd;
	unsigned long flags;
	unsigned int qlen;

	sd = &per_cpu(softnet_data, cpu);

	local_irq_save(flags);

	rps_lock(sd);
	qlen = skb_queue_len(&sd->input_pkt_queue);
	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
		if (skb_queue_len(&sd->input_pkt_queue)) {
enqueue:
			__skb_queue_tail(&sd->input_pkt_queue, skb);
			input_queue_tail_incr_save(sd, qtail);
			rps_unlock(sd);
			local_irq_restore(flags);
			return NET_RX_SUCCESS;
		}

		/* Schedule NAPI for backlog device
		 * We can use non atomic operation since we own the queue lock
		 */
		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
			if (!rps_ipi_queued(sd))
				____napi_schedule(sd, &sd->backlog);
		}
		goto enqueue;
	}

	sd->dropped++;
	rps_unlock(sd);

	local_irq_restore(flags);

	atomic_long_inc(&skb->dev->rx_dropped);
	kfree_skb(skb);
	return NET_RX_DROP;
}
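
To tie steps 9 through 11 together: the dispatch logic in netif_receive_skb_internal() of this kernel generation looks roughly as follows (paraphrased from net/core/dev.c; check your tree for the exact form):

static int netif_receive_skb_internal(struct sk_buff *skb)
{
	int ret;

	net_timestamp_check(netdev_tstamp_prequeue, skb);

	/* step 9: a PTP frame may be taken over by the PHY driver */
	if (skb_defer_rx_timestamp(skb))
		return NET_RX_SUCCESS;

	rcu_read_lock();
#ifdef CONFIG_RPS
	if (static_key_false(&rps_needed)) {
		struct rps_dev_flow voidflow, *rflow = &voidflow;
		int cpu = get_rps_cpu(skb->dev, skb, &rflow);

		if (cpu >= 0) {
			/* step 10: steer to the chosen CPU's backlog */
			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
			rcu_read_unlock();
			return ret;
		}
	}
#endif
	/* step 11: no RPS, process on the local CPU right away */
	ret = __netif_receive_skb(skb);
	rcu_read_unlock();
	return ret;
}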


11. On a uniprocessor system, or when RPS is disabled, __netif_receive_skb() is called directly to push the packet toward the network layer; packets queued through the backlog NAPI in step 10 also end up in __netif_receive_skb(), via the backlog poll function process_backlog():

static int process_backlog(struct napi_struct *napi, int quota)
{
	int work = 0;
	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);

#ifdef CONFIG_RPS
	/* Check if we have pending ipi, its better to send them now,
	 * not waiting net_rx_action() end.
	 */
	if (sd->rps_ipi_list) {
		local_irq_disable();
		net_rps_action_and_irq_enable(sd);
	}
#endif
	napi->weight = weight_p;
	local_irq_disable();
	while (work < quota) {
		struct sk_buff *skb;
		unsigned int qlen;

		while ((skb = __skb_dequeue(&sd->process_queue))) {
			local_irq_enable();
			__netif_receive_skb(skb);
			local_irq_disable();
			input_queue_head_incr(sd);
			if (++work >= quota) {
				local_irq_enable();
				return work;
			}
		}

		rps_lock(sd);
		qlen = skb_queue_len(&sd->input_pkt_queue);
		if (qlen)
			skb_queue_splice_tail_init(&sd->input_pkt_queue,
						   &sd->process_queue);

		if (qlen < quota - work) {
			/*
			 * Inline a custom version of __napi_complete().
			 * only current cpu owns and manipulates this napi,
			 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
			 * we can use a plain write instead of clear_bit(),
			 * and we dont need an smp_mb() memory barrier.
			 */
			list_del(&napi->poll_list);
			napi->state = 0;

			quota = work + qlen;
		}
		rps_unlock(sd);
	}
	local_irq_enable();

	return work;
}


12. Now the final preparation for handing the packet to the network layer begins. First, give the packet a software timestamp unless one has already been set (e.g. by the hardware):

#define net_timestamp_check(COND, SKB)			\
	if (static_key_false(&netstamp_needed)) {		\
		if ((COND) && !(SKB)->tstamp.tv64)	\
			__net_timestamp(SKB);		\
	}	


13. Check whether netpoll should get a look at the packet; note the condition: netpoll_rx() is consulted only for devices that actually have NAPI contexts registered (a non-empty napi_list), i.e. for packets that arrived through the NAPI path:

static inline int netpoll_receive_skb(struct sk_buff *skb)
{
	if (!list_empty(&skb->dev->napi_list))
		return netpoll_rx(skb);
	return 0;
}

 

14. If the netpoll path is taken, __netpoll_rx() carries on. The packet falls back to the normal receive path if the device is not Ethernet, the frame is destined for another host, or it is neither an ARP/IPv6 neighbour-solicitation packet nor a UDP-over-IP packet. VLAN-encapsulated frames first have the outer VLAN tag stripped. ARP and neighbour-solicitation packets are queued on the npinfo->neigh_tx list for deferred handling; for UDP packets, each registered netpoll client is matched against the packet's source and destination IP addresses and destination port, and on a match its rx_skb_hook receive hook is invoked:

int __netpoll_rx(struct sk_buff *skb, struct netpoll_info *npinfo)
{
	int proto, len, ulen, data_len;
	int hits = 0, offset;
	const struct iphdr *iph;
	struct udphdr *uh;
	struct netpoll *np, *tmp;
	uint16_t source;

	if (list_empty(&npinfo->rx_np))
		goto out;

	if (skb->dev->type != ARPHRD_ETHER)
		goto out;

	/* check if netpoll clients need ARP */
	if (skb->protocol == htons(ETH_P_ARP) && atomic_read(&trapped)) {
		skb_queue_tail(&npinfo->neigh_tx, skb);
		return 1;
	} else if (pkt_is_ns(skb) && atomic_read(&trapped)) {
		skb_queue_tail(&npinfo->neigh_tx, skb);
		return 1;
	}

	if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
		skb = skb_vlan_untag(skb);
		if (unlikely(!skb))
			goto out;
	}

	proto = ntohs(eth_hdr(skb)->h_proto);
	if (proto != ETH_P_IP && proto != ETH_P_IPV6)
		goto out;
	if (skb->pkt_type == PACKET_OTHERHOST)
		goto out;
	if (skb_shared(skb))
		goto out;

	if (proto == ETH_P_IP) {
		if (!pskb_may_pull(skb, sizeof(struct iphdr)))
			goto out;
		iph = (struct iphdr *)skb->data;
		if (iph->ihl < 5 || iph->version != 4)
			goto out;
		if (!pskb_may_pull(skb, iph->ihl*4))
			goto out;
		iph = (struct iphdr *)skb->data;
		if (ip_fast_csum((u8 *)iph, iph->ihl) != 0)
			goto out;

		len = ntohs(iph->tot_len);
		if (skb->len < len || len < iph->ihl*4)
			goto out;

		/*
		 * Our transport medium may have padded the buffer out.
		 * Now We trim to the true length of the frame.
		 */
		if (pskb_trim_rcsum(skb, len))
			goto out;

		iph = (struct iphdr *)skb->data;
		if (iph->protocol != IPPROTO_UDP)
			goto out;

		len -= iph->ihl*4;
		uh = (struct udphdr *)(((char *)iph) + iph->ihl*4);
		offset = (unsigned char *)(uh + 1) - skb->data;
		ulen = ntohs(uh->len);
		data_len = skb->len - offset;
		source = ntohs(uh->source);

		if (ulen != len)
			goto out;
		if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr))
			goto out;
		list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) {
			if (np->local_ip.ip && np->local_ip.ip != iph->daddr)
				continue;
			if (np->remote_ip.ip && np->remote_ip.ip != iph->saddr)
				continue;
			if (np->local_port && np->local_port != ntohs(uh->dest))
				continue;

			np->rx_skb_hook(np, source, skb, offset, data_len);
			hits++;
		}
	} else {
#if IS_ENABLED(CONFIG_IPV6)
		const struct ipv6hdr *ip6h;

		if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
			goto out;
		ip6h = (struct ipv6hdr *)skb->data;
		if (ip6h->version != 6)
			goto out;
		len = ntohs(ip6h->payload_len);
		if (!len)
			goto out;
		if (len + sizeof(struct ipv6hdr) > skb->len)
			goto out;
		if (pskb_trim_rcsum(skb, len + sizeof(struct ipv6hdr)))
			goto out;
		ip6h = ipv6_hdr(skb);
		if (!pskb_may_pull(skb, sizeof(struct udphdr)))
			goto out;
		uh = udp_hdr(skb);
		offset = (unsigned char *)(uh + 1) - skb->data;
		ulen = ntohs(uh->len);
		data_len = skb->len - offset;
		source = ntohs(uh->source);
		if (ulen != skb->len)
			goto out;
		if (udp6_csum_init(skb, uh, IPPROTO_UDP))
			goto out;
		list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) {
			if (!ipv6_addr_equal(&np->local_ip.in6, &ip6h->daddr))
				continue;
			if (!ipv6_addr_equal(&np->remote_ip.in6, &ip6h->saddr))
				continue;
			if (np->local_port && np->local_port != ntohs(uh->dest))
				continue;

			np->rx_skb_hook(np, source, skb, offset, data_len);
			hits++;
		}
#endif
	}

	if (!hits)
		goto out;

	kfree_skb(skb);
	return 1;

out:
	if (atomic_read(&trapped)) {
		kfree_skb(skb);
		return 1;
	}

	return 0;
}
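
For context, a client that wants __netpoll_rx() to hand it matching UDP packets registers a struct netpoll carrying an rx_skb_hook via netpoll_setup(). A minimal sketch, assuming this kernel generation's netpoll API; the interface name and port are placeholders:

static void my_rx_skb_hook(struct netpoll *np, int source,
			   struct sk_buff *skb, int offset, int len)
{
	/* invoked from __netpoll_rx() above; offset/len delimit the UDP
	 * payload inside the skb, source is the sender's UDP port */
	pr_info("netpoll rx: %d payload bytes from port %d\n", len, source);
}

static struct netpoll my_np = {
	.name		= "my_netpoll",
	.dev_name	= "eth0",	/* placeholder interface */
	.local_port	= 6666,		/* placeholder port to match */
	.rx_skb_hook	= my_rx_skb_hook,
};

static int __init my_netpoll_init(void)
{
	/* netpoll_setup() attaches my_np to the device's npinfo->rx_np,
	 * the list walked by __netpoll_rx() above */
	return netpoll_setup(&my_np);
}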


15. If the netpoll path is not taken, or netpoll declines the packet, it is processed along the normal path:

static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
{
	struct packet_type *ptype, *pt_prev;
	rx_handler_func_t *rx_handler;
	struct net_device *orig_dev;
	struct net_device *null_or_dev;
	bool deliver_exact = false;
	int ret = NET_RX_DROP;
	__be16 type;

	net_timestamp_check(!netdev_tstamp_prequeue, skb);

	trace_netif_receive_skb(skb);

	/* if we've gotten here through NAPI, check netpoll */
	if (netpoll_receive_skb(skb))
		goto out;

	orig_dev = skb->dev;

	skb_reset_network_header(skb);
	if (!skb_transport_header_was_set(skb))
		skb_reset_transport_header(skb);
	skb_reset_mac_len(skb);

	pt_prev = NULL;

	rcu_read_lock();

another_round:
	skb->skb_iif = skb->dev->ifindex;

	__this_cpu_inc(softnet_data.processed);

        /* Strip the outer VLAN header into skb metadata and re-derive
         * skb->protocol and the header pointers */
	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
		skb = skb_vlan_untag(skb);
		if (unlikely(!skb))
			goto unlock;
	}

#ifdef CONFIG_NET_CLS_ACT
	if (skb->tc_verd & TC_NCLS) {
		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
		goto ncls;
	}
#endif

	if (pfmemalloc)
		goto skip_taps;

        /* Deliver a copy to every tap registered for all protocols
         * (ptype_all), e.g. AF_PACKET sockets used by tcpdump/libpcap */
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (!ptype->dev || ptype->dev == skb->dev) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

skip_taps:
#ifdef CONFIG_NET_CLS_ACT
        /* Ingress traffic-control (qdisc) processing */
	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto unlock;
ncls:
#endif

	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
		goto drop;

        /* The skb still carries a VLAN tag here: vlan_do_receive() looks up
         * the matching VLAN device, retargets the skb at it, and we jump back
         * to another_round to repeat the processing on that device */
	if (vlan_tx_tag_present(skb)) {
             /* vlan_do_receive() may change or re-target the skb, so flush
              * the pending tap delivery first, while the skb is unchanged */
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		if (vlan_do_receive(&skb))
			goto another_round;
		else if (unlikely(!skb))
			goto unlock;
	}

        /* rx_handler is registered by stacking drivers: the bridge
         * (br_handle_frame), as well as bonding, macvlan and team */
	rx_handler = rcu_dereference(skb->dev->rx_handler);
	if (rx_handler) {
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		switch (rx_handler(&skb)) {
		case RX_HANDLER_CONSUMED:
			ret = NET_RX_SUCCESS;
			goto unlock;
		case RX_HANDLER_ANOTHER:
			goto another_round;
		case RX_HANDLER_EXACT:
			deliver_exact = true;
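			/* fall through */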
		case RX_HANDLER_PASS:
			break;
		default:
			BUG();
		}
	}

        /* A VLAN tag still present here means vlan_do_receive() found no
         * VLAN device for it: the frame is not for this host, so mark it
         * PACKET_OTHERHOST and clear the tag */
	if (unlikely(vlan_tx_tag_present(skb))) {
		if (vlan_tx_tag_get_id(skb))
			skb->pkt_type = PACKET_OTHERHOST;
		/* Note: we might in the future use prio bits
		 * and set skb->priority like in vlan_do_receive()
		 * For the time being, just ignore Priority Code Point
		 */
		skb->vlan_tci = 0;
	}

	/* deliver only exact match when indicated */
	null_or_dev = deliver_exact ? skb->dev : NULL;

        /* This is where the vast majority of packets arrive: deliver to the
         * network-layer handlers registered in ptype_base for skb->protocol */
	type = skb->protocol;
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		if (ptype->type == type &&
		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
		     ptype->dev == orig_dev)) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

	if (pt_prev) {
                /* For the final delivery, page fragments that still reference
                 * userspace memory (zero-copy senders such as vhost-net) must
                 * first be copied into kernel pages */
		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
			goto drop;
		else
			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
	} else {
drop:
                /* No registered protocol wanted the packet, so all we can
                 * do is count and drop it */
		atomic_long_inc(&skb->dev->rx_dropped);
		kfree_skb(skb);
		/* Jamal, now you will not able to escape explaining
		 * me how you were going to use this. :-)
		 */
		ret = NET_RX_DROP;
	}

unlock:
	rcu_read_unlock();
out:
	return ret;
}
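
The ptype_base handlers that finally receive the packet here are registered with dev_add_pack(); IPv4, for example, hooks up ip_rcv() roughly like this (abridged from net/ipv4/af_inet.c):

static struct packet_type ip_packet_type __read_mostly = {
	.type = cpu_to_be16(ETH_P_IP),
	.func = ip_rcv,
};

static int __init inet_init(void)
{
	/* ... protocol and socket registration elided ... */
	dev_add_pack(&ip_packet_type);
	return 0;
}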


 
