Analysis of the Linux network RPS/RFS receive flow

Receive path

netif_rx() -> netif_rx_internal() -> get_rps_cpu()

Function-by-function analysis

netif_rx()

< net/core/dev.c >

int netif_rx(struct sk_buff *skb)
{
	/* true when called from process context (neither hardirq nor softirq active) */
	bool need_bh_off = !(hardirq_count() | softirq_count());
	int ret;

	if (need_bh_off)
		local_bh_disable();
	trace_netif_rx_entry(skb);                           /* tracepoint: packet receive entry */
	ret = netif_rx_internal(skb);                        /* the actual packet handling */
	trace_netif_rx_exit(ret);                            /* tracepoint: packet receive exit */
	if (need_bh_off)
		local_bh_enable();
	return ret;
}

netif_rx_internal()

< net/core/dev.c >

static int netif_rx_internal(struct sk_buff *skb)
{
	int ret;

	net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);

	trace_netif_rx(skb);

#ifdef CONFIG_RPS                                        /* only when RPS support is compiled in */
	if (static_branch_unlikely(&rps_needed)) {
		struct rps_dev_flow voidflow, *rflow = &voidflow;
		int cpu;

		rcu_read_lock();

		cpu = get_rps_cpu(skb->dev, skb, &rflow);    /* pick the target CPU via RPS/RFS */
		if (cpu < 0)
			cpu = smp_processor_id();            /* no RPS/RFS decision: stay on the current CPU */

		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);

		rcu_read_unlock();
	} else
#endif
	{
		unsigned int qtail;

		ret = enqueue_to_backlog(skb, smp_processor_id(), &qtail);
	}
	return ret;
}
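netif_rx_internal() gates the whole RPS path behind static_branch_unlikely(&rps_needed), a static key: while no rps_map is configured anywhere, the branch is patched out and costs essentially nothing. store_rps_map() below flips it with static_branch_inc()/static_branch_dec() as maps are installed and removed. A minimal kernel-style sketch of the same pattern (illustrative only; rps_needed itself is defined in net/core/dev.c):

#include <linux/jump_label.h>

static DEFINE_STATIC_KEY_FALSE(demo_needed);	/* starts disabled */

static void demo_rx_path(void)
{
	if (static_branch_unlikely(&demo_needed)) {
		/* feature path: patched in only after an enable */
	}
	/* common fast path */
}

/* Reference-counted enable/disable, exactly how store_rps_map() handles
 * rps_needed when a map is installed or removed.
 */
static void demo_enable(void)  { static_branch_inc(&demo_needed); }
static void demo_disable(void) { static_branch_dec(&demo_needed); }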

Key data structures

< include/linux/netdevice.h >

struct rps_map {                              /* rps_map is used by RPS */
	unsigned int len;                         /* number of entries in cpus[], a flexible (variable-length) array */
	struct rcu_head rcu;
	u16 cpus[];                               /* CPUs allowed to process packets, parsed from the mask written to /sys/.../rps_cpus */
};

// A rps_dev_flow instance records the CPU that last processed packets of this flow, together with the tail index (last_qtail) of that CPU's softnet_data input_pkt_queue at enqueue time
struct rps_dev_flow {                        /* rps_dev_flow is used by RFS */
	u16 cpu;                                 /* CPU that last processed packets of this flow */
	u16 filter;
	unsigned int last_qtail;
};

struct rps_dev_flow_table {                 /* per-RX-queue flow table, used by RFS */
	unsigned int mask;
	struct rcu_head rcu;
	struct rps_dev_flow flows[];
};

/* This structure contains an instance of an RX queue. */
struct netdev_rx_queue {
	struct xdp_rxq_info		xdp_rxq;
#ifdef CONFIG_RPS
	struct rps_map __rcu		*rps_map;
	struct rps_dev_flow_table __rcu	*rps_flow_table;
#endif
	struct kobject			kobj;
	struct net_device		*dev;
	netdevice_tracker		dev_tracker;

#ifdef CONFIG_XDP_SOCKETS
	struct xsk_buff_pool            *pool;
#endif
} ____cacheline_aligned_in_smp;
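rps_map is populated when an administrator writes a hexadecimal CPU bitmask to /sys/class/net/<dev>/queues/rx-<n>/rps_cpus; the write is handled by store_rps_map() below. A minimal user-space sketch (the interface name eth0 and queue 0 are illustrative):

/* Enable RPS on eth0 rx queue 0 for CPUs 0-3 (mask 0xf).
 * Needs CAP_NET_ADMIN, mirroring the capable() check in store_rps_map().
 */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/class/net/eth0/queues/rx-0/rps_cpus";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	fputs("f\n", f);	/* hex bitmask: CPUs 0,1,2,3 */
	fclose(f);
	return 0;
}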

Sysfs store handlers

store_rps_map()

< net/core/net-sysfs.c >

static ssize_t store_rps_map(struct netdev_rx_queue *queue,
			     const char *buf, size_t len)
{
	struct rps_map *old_map, *map;
	cpumask_var_t mask;
	int err, cpu, i;
	static DEFINE_MUTEX(rps_map_mutex);

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits);
	if (err) {
		free_cpumask_var(mask);
		return err;
	}

	if (!cpumask_empty(mask)) {
		cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_DOMAIN));
		cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_WQ));
		if (cpumask_empty(mask)) {
			free_cpumask_var(mask);
			return -EINVAL;
		}
	}

	map = kzalloc(max_t(unsigned int,
			    RPS_MAP_SIZE(cpumask_weight(mask)), L1_CACHE_BYTES),
		      GFP_KERNEL);
	if (!map) {
		free_cpumask_var(mask);
		return -ENOMEM;
	}

	i = 0;
	for_each_cpu_and(cpu, mask, cpu_online_mask)
		map->cpus[i++] = cpu;

	if (i) {
		map->len = i;
	} else {
		kfree(map);
		map = NULL;
	}

	mutex_lock(&rps_map_mutex);
	old_map = rcu_dereference_protected(queue->rps_map,
					    mutex_is_locked(&rps_map_mutex));
	rcu_assign_pointer(queue->rps_map, map);

	if (map)
		static_branch_inc(&rps_needed);
	if (old_map)
		static_branch_dec(&rps_needed);

	mutex_unlock(&rps_map_mutex);

	if (old_map)
		kfree_rcu(old_map, rcu);

	free_cpumask_var(mask);
	return len;
}
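The allocation size above comes from RPS_MAP_SIZE(n), the usual flexible-array-member sizing: the struct header plus n u16 slots, floored at L1_CACHE_BYTES to avoid false sharing. A user-space replica of the sizing (the rcu_head is omitted for brevity):

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct rps_map_demo {
	unsigned int len;
	uint16_t cpus[];	/* flexible array member */
};

int main(void)
{
	unsigned int n = 4;	/* e.g. cpumask_weight(mask) == 4 */
	struct rps_map_demo *map = calloc(1, sizeof(*map) + n * sizeof(uint16_t));

	if (!map)
		return 1;
	for (unsigned int i = 0; i < n; i++)
		map->cpus[i] = (uint16_t)i;	/* as the for_each_cpu_and() loop does */
	map->len = n;

	printf("allocated %zu bytes for %u cpus\n",
	       sizeof(*map) + n * sizeof(uint16_t), map->len);
	free(map);
	return 0;
}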

store_rps_dev_flow_table_cnt()

< net/core/net-sysfs.c >

static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
					    const char *buf, size_t len)
{
	unsigned long mask, count;
	struct rps_dev_flow_table *table, *old_table;
	static DEFINE_SPINLOCK(rps_dev_flow_lock);
	int rc;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	rc = kstrtoul(buf, 0, &count);
	if (rc < 0)
		return rc;

	if (count) {
		mask = count - 1;
		/* mask = roundup_pow_of_two(count) - 1;
		 * without overflows...
		 */
		while ((mask | (mask >> 1)) != mask)
			mask |= (mask >> 1);
		/* On 64 bit arches, must check mask fits in table->mask (u32),
		 * and on 32bit arches, must check
		 * RPS_DEV_FLOW_TABLE_SIZE(mask + 1) doesn't overflow.
		 */
#if BITS_PER_LONG > 32
		if (mask > (unsigned long)(u32)mask)
			return -EINVAL;
#else
		if (mask > (ULONG_MAX - RPS_DEV_FLOW_TABLE_SIZE(1))
				/ sizeof(struct rps_dev_flow)) {
			/* Enforce a limit to prevent overflow */
			return -EINVAL;
		}
#endif
		table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(mask + 1));
		if (!table)
			return -ENOMEM;

		table->mask = mask;
		for (count = 0; count <= mask; count++)
			table->flows[count].cpu = RPS_NO_CPU;                                  /* initialize every slot to "no CPU recorded" */
	} else {
		table = NULL;
	}

	spin_lock(&rps_dev_flow_lock);
	old_table = rcu_dereference_protected(queue->rps_flow_table,
					      lockdep_is_held(&rps_dev_flow_lock));
	rcu_assign_pointer(queue->rps_flow_table, table);
	spin_unlock(&rps_dev_flow_lock);

	if (old_table)
		call_rcu(&old_table->rcu, rps_dev_flow_table_release);

	return len;
}
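The bit-smearing loop above is an overflow-safe way to compute roundup_pow_of_two(count) - 1: it ORs every set bit into all lower positions until the mask has the form 2^k - 1, without ever materializing 2^k (which could overflow for large counts). A small stand-alone demonstration:

#include <stdio.h>

static unsigned long flow_mask(unsigned long count)
{
	unsigned long mask = count - 1;

	while ((mask | (mask >> 1)) != mask)
		mask |= (mask >> 1);
	return mask;
}

int main(void)
{
	unsigned long samples[] = { 1, 2, 5, 8, 100, 4096 };

	for (unsigned int i = 0; i < 6; i++)
		printf("count=%5lu -> mask=%5lu (table size %lu)\n",
		       samples[i], flow_mask(samples[i]),
		       flow_mask(samples[i]) + 1);
	return 0;
}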

RFS

User-facing knob: /proc/sys/net/core/rps_sock_flow_entries

Kernel data structure

struct rps_sock_flow_table

< include/linux/netdevice.h >

struct rps_sock_flow_table {
	u32	mask;

	u32	ents[] ____cacheline_aligned_in_smp;        /* each entry records the CPU where the owning application last ran */
};
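Entries are indexed with hash & table->mask, which is why the table size must be a power of two: the AND then behaves exactly like hash % size, but without a division. A one-line illustration:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t size = 4096;		/* must be a power of two */
	uint32_t mask = size - 1;
	uint32_t hash = 0xdeadbeef;	/* e.g. a skb_get_hash() result */

	printf("hash %% size = %u, hash & mask = %u\n",
	       hash % size, hash & mask);	/* both print 3823 */
	return 0;
}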

Kernel-side handling of the proc write

Where the handler is registered

< net/core/sysctl_net_core.c >

static struct ctl_table net_core_table[] = {
#ifdef CONFIG_RPS
	{
		.procname	= "rps_sock_flow_entries",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= rps_sock_flow_sysctl
	},
#endif
};
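rps_sock_flow_sysctl() allocates the global table and derives rps_cpu_mask as roundup_pow_of_two(nr_cpu_ids) - 1, so the low bits of each ents[] entry are just wide enough to hold any CPU id, leaving the upper bits for a partial flow-hash identity check. A small demonstration of the derivation:

#include <stdio.h>
#include <stdint.h>

static uint32_t roundup_pow_of_two32(uint32_t x)
{
	uint32_t p = 1;

	while (p < x)
		p <<= 1;
	return p;
}

int main(void)
{
	for (uint32_t nr_cpu_ids = 1; nr_cpu_ids <= 64; nr_cpu_ids *= 2) {
		uint32_t rps_cpu_mask = roundup_pow_of_two32(nr_cpu_ids) - 1;

		printf("nr_cpu_ids=%2u -> rps_cpu_mask=0x%02x\n",
		       nr_cpu_ids, rps_cpu_mask);
	}
	return 0;
}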

Updating the global flow table

Call paths that update the flow table

accept() -> inet_accept() -> sock_rps_record_flow() -> sock_rps_record_flow_hash() -> rps_record_sock_flow()

tcp_splice_read() -> sock_rps_record_flow() -> sock_rps_record_flow_hash() -> rps_record_sock_flow()

tcp_zerocopy_receive() -> sock_rps_record_flow() -> sock_rps_record_flow_hash() -> rps_record_sock_flow()

tun_chr_write_iter() -> tun_get_user() -> tun_flow_update() -> sock_rps_record_flow_hash() -> rps_record_sock_flow()

sock_rps_record_flow()

< include/net/sock.h >

static inline void sock_rps_record_flow(const struct sock *sk)
{
#ifdef CONFIG_RPS
	if (static_branch_unlikely(&rfs_needed)) {
		/* Reading sk->sk_rxhash might incur an expensive cache line
		 * miss.
		 *
		 * TCP_ESTABLISHED does cover almost all states where RFS
		 * might be useful, and is cheaper [1] than testing :
		 *	IPv4: inet_sk(sk)->inet_daddr
		 * 	IPv6: ipv6_addr_any(&sk->sk_v6_daddr)
		 * OR	an additional socket flag
		 * [1] : sk_state and sk_prot are in the same cache line.
		 */
		if (sk->sk_state == TCP_ESTABLISHED)
			sock_rps_record_flow_hash(sk->sk_rxhash);
	}
#endif
}
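inet_recvmsg() likewise calls sock_rps_record_flow() on every receive, so the CPU running the application's read loop keeps refreshing this flow's entry; pinning the consumer thread therefore makes the steering stable. A user-space sketch (address and port are illustrative):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>

int main(void)
{
	/* Pin the consumer to CPU 2 so RFS steers this flow's RX processing there. */
	cpu_set_t set;
	CPU_ZERO(&set);
	CPU_SET(2, &set);
	sched_setaffinity(0, sizeof(set), &set);

	int fd = socket(AF_INET, SOCK_STREAM, 0);
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_port   = htons(12345),              /* illustrative */
	};
	inet_pton(AF_INET, "192.0.2.1", &addr.sin_addr); /* illustrative */
	if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		return 1;

	char buf[4096];
	/* Each recv() passes through inet_recvmsg() -> sock_rps_record_flow(),
	 * re-recording {flow hash -> CPU 2} in rps_sock_flow_table.
	 */
	while (recv(fd, buf, sizeof(buf), 0) > 0)
		;
	close(fd);
	return 0;
}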

sock_rps_record_flow_hash()

< include/net/sock.h >

static inline void sock_rps_record_flow_hash(__u32 hash)
{
#ifdef CONFIG_RPS
	struct rps_sock_flow_table *sock_flow_table;

	rcu_read_lock();
	sock_flow_table = rcu_dereference(rps_sock_flow_table);
	rps_record_sock_flow(sock_flow_table, hash);
	rcu_read_unlock();
#endif
}

rps_record_sock_flow()

< include/linux/netdevice.h >

static inline void rps_record_sock_flow(struct rps_sock_flow_table *table,
					u32 hash)
{
	if (table && hash) {
		unsigned int index = hash & table->mask;
		u32 val = hash & ~rps_cpu_mask;

		/* We only give a hint, preemption can change CPU under us */
		val |= raw_smp_processor_id();

		if (table->ents[index] != val)
			table->ents[index] = val;
	}
}
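Each entry thus packs a partial flow hash in its upper bits and the CPU in its low rps_cpu_mask bits; get_rps_cpu() later validates a lookup with (ident ^ hash) & ~rps_cpu_mask before trusting the CPU. The table is a lossy hint: two flows hashing to the same slot simply overwrite each other, and the identity check catches the mismatch on the read side. A stand-alone walk-through of the encoding (values are illustrative):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t rps_cpu_mask = 0x7;		/* 8 possible CPUs */
	uint32_t table_mask   = 4095;		/* 4096-entry table */
	uint32_t hash = 0x9e3779b9;		/* flow hash */
	uint32_t cpu  = 5;			/* CPU running the application */

	/* Writer side, as in rps_record_sock_flow() */
	uint32_t index = hash & table_mask;
	uint32_t val   = (hash & ~rps_cpu_mask) | cpu;

	/* Reader side, as in get_rps_cpu(): check the upper bits still match
	 * the packet's hash before trusting the recorded CPU.
	 */
	uint32_t ident = val;			/* sock_flow_table->ents[index] */
	if (((ident ^ hash) & ~rps_cpu_mask) == 0)
		printf("slot %u matches, steer to CPU %u\n", index, ident & rps_cpu_mask);
	else
		printf("stale or colliding entry, fall back to plain RPS\n");
	return 0;
}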

What raw_smp_processor_id() does

raw_smp_processor_id() returns the logical ID of the CPU currently executing the code. On an SMP (symmetric multiprocessing) system every CPU has a unique logical ID, and kernel code uses it for CPU-local work such as indexing per-CPU data or steering tasks to a particular CPU.

Unlike smp_processor_id(), raw_smp_processor_id() performs no preemption-safety check: it simply reads the current CPU number, so the value may already be stale by the time it is used if the task is preempted and migrated to another CPU. Code that calls it must tolerate that, which rps_record_sock_flow() explicitly does: the recorded CPU is only a best-effort hint, as its in-code comment says.
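A kernel-style sketch contrasting the two variants (illustrative fragment, not taken from the RPS code):

#include <linux/smp.h>
#include <linux/preempt.h>

static void demo_cpu_ids(void)
{
	int cpu;

	cpu = get_cpu();	/* disables preemption; the id stays valid */
	/* ... per-CPU work that must not migrate ... */
	put_cpu();		/* re-enables preemption */

	/* Hint only: may be stale the moment it is read, and the caller
	 * must be fine with that, exactly the rps_record_sock_flow() case.
	 */
	cpu = raw_smp_processor_id();
	(void)cpu;
}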

The two flow tables

The per-device flow table, rps_dev_flow_table, records for each flow the CPU that last processed the flow's packets in kernel (softirq) context.

The global socket flow table, rps_sock_flow_table, records the CPU on which the flow's packets are expected to be consumed, i.e. where the owning application last ran. For example, if RSS delivers a flow to CPU 0 while the application reads the socket on CPU 2, the socket table names CPU 2 as the desired target, and get_rps_cpu() below switches the flow there only when doing so cannot reorder packets.

get_rps_cpu()

< net/core/dev.c >

static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
		       struct rps_dev_flow **rflowp)
{
	const struct rps_sock_flow_table *sock_flow_table;
	struct netdev_rx_queue *rxqueue = dev->_rx;
	struct rps_dev_flow_table *flow_table;
	struct rps_map *map;
	int cpu = -1;
	u32 tcpu;
	u32 hash;

	if (skb_rx_queue_recorded(skb)) {
		u16 index = skb_get_rx_queue(skb);                                       /* RX queue the NIC recorded for this skb */

		if (unlikely(index >= dev->real_num_rx_queues)) {
			WARN_ONCE(dev->real_num_rx_queues > 1,
				  "%s received packet on queue %u, but number "
				  "of RX queues is %u\n",
				  dev->name, index, dev->real_num_rx_queues);
			goto done;
		}
		rxqueue += index;
	}

	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */

	flow_table = rcu_dereference(rxqueue->rps_flow_table);
	map = rcu_dereference(rxqueue->rps_map);
	if (!flow_table && !map)                                  /* bail out if both the RFS flow table and the RPS map are empty */
		goto done;

	skb_reset_network_header(skb);

	/* Get the flow hash for this skb. With RSS the hash computed by the
	 * NIC hardware can be used directly; otherwise it is computed in
	 * software from the packet headers.
	 */
	hash = skb_get_hash(skb);
	if (!hash)
		goto done;

	sock_flow_table = rcu_dereference(rps_sock_flow_table);
	if (flow_table && sock_flow_table) {
		struct rps_dev_flow *rflow;
		u32 next_cpu;
		u32 ident;

		/* First check into global flow table if there is a match */
		ident = sock_flow_table->ents[hash & sock_flow_table->mask];                            /* first try to match the global rps_sock_flow_table */
		if ((ident ^ hash) & ~rps_cpu_mask)
			goto try_rps;

		next_cpu = ident & rps_cpu_mask;

		/* OK, now we know there is a match,
		 * we can look at the local (per receive queue) flow table
		 */
		rflow = &flow_table->flows[hash & flow_table->mask];                                     /* then look up this flow in the per-queue rps_flow_table */
		tcpu = rflow->cpu;

		/*
		 * If the desired CPU (where last recvmsg was done) is
		 * different from current CPU (one in the rx-queue flow
		 * table entry), switch if one of the following holds:
		 *   - Current CPU is unset (>= nr_cpu_ids).
		 *   - Current CPU is offline.
		 *   - The current CPU's queue tail has advanced beyond the
		 *     last packet that was enqueued using this table entry.
		 *     This guarantees that all previous packets for the flow
		 *     have been dequeued, thus preserving in order delivery.
		 */
		if (unlikely(tcpu != next_cpu) &&                                                   /* recorded CPU differs from the desired CPU */
		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||                                /* recorded CPU is unset or offline */
		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
		      rflow->last_qtail)) >= 0)) {                                             /* or its backlog has drained past our last enqueue */
			tcpu = next_cpu;
			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
		}
		/* prefer the CPU chosen via the RFS flow tables */
		if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
			*rflowp = rflow;
			cpu = tcpu;
			goto done;
		}
	}

try_rps:
	/* Reached when RFS could not choose a CPU: the tables are missing, the
	 * global table did not match, or the recorded CPU is invalid.
	 * Fall back to plain RPS. */
	if (map) {
		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
		if (cpu_online(tcpu)) {
			cpu = tcpu;
			goto done;
		}
	}

done:
	return cpu;
}
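The out-of-order guard deserves a closer look: the flow may move to next_cpu only once the old CPU's input_queue_head has advanced past the tail position recorded at this flow's last enqueue, which proves all of its previously queued packets have been dequeued. The cast to int keeps the comparison correct across unsigned counter wraparound. A stand-alone check of the arithmetic:

#include <stdio.h>
#include <stdint.h>

static int flow_drained(uint32_t input_queue_head, uint32_t last_qtail)
{
	/* Same idiom as in get_rps_cpu(): a signed difference handles wraparound. */
	return (int32_t)(input_queue_head - last_qtail) >= 0;
}

int main(void)
{
	printf("%d\n", flow_drained(1000, 900));	/* 1: drained, safe to move */
	printf("%d\n", flow_drained(900, 1000));	/* 0: packets still queued, keep old CPU */
	printf("%d\n", flow_drained(5, 0xfffffff0u));	/* 1: head wrapped past 2^32 but is logically ahead */
	return 0;
}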
