接收流程
netif_rx() -> netif_rx_internal() -> get_rps_cpu()
具体函数分析
netif_rx()
< net/core/dev.c >
/*
 * netif_rx - entry point for handing a received packet to the backlog path.
 * @skb: buffer to enqueue
 *
 * Wraps netif_rx_internal() with the entry/exit tracepoints. When neither
 * hardirq_count() nor softirq_count() is non-zero, bottom halves are
 * disabled around the call and re-enabled afterwards.
 *
 * Returns the value produced by netif_rx_internal().
 */
int netif_rx(struct sk_buff *skb)
{
	const bool must_disable_bh = (hardirq_count() | softirq_count()) == 0;
	int status;

	if (must_disable_bh) {
		local_bh_disable();
	}

	trace_netif_rx_entry(skb);		/* trace: reception starts */
	status = netif_rx_internal(skb);	/* actual packet handling */
	trace_netif_rx_exit(status);		/* trace: reception finished */

	if (must_disable_bh) {
		local_bh_enable();
	}

	return status;
}
netif_rx_internal()
< net/core/dev.c >
/*
 * netif_rx_internal - choose a target CPU and queue @skb on its backlog.
 * @skb: packet to enqueue
 *
 * When CONFIG_RPS is compiled in and the rps_needed static branch is
 * enabled, the target CPU comes from get_rps_cpu(); a negative result falls
 * back to the current CPU. Otherwise the packet is always queued on the
 * current CPU.
 *
 * Returns the result of enqueue_to_backlog().
 */
static int netif_rx_internal(struct sk_buff *skb)
{
int ret;
net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
trace_netif_rx(skb);
#ifdef CONFIG_RPS /* RPS support compiled in */
if (static_branch_unlikely(&rps_needed)) {
/* rflow starts out pointing at a dummy entry; get_rps_cpu() replaces it
 * only when its RFS path selects the CPU.
 */
struct rps_dev_flow voidflow, *rflow = &voidflow;
int cpu;
/* get_rps_cpu() uses rcu_dereference() on the per-queue tables */
rcu_read_lock();
cpu = get_rps_cpu(skb->dev, skb, &rflow);
/* No steering decision was made: stay on the current CPU */
if (cpu < 0)
cpu = smp_processor_id();
ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
rcu_read_unlock();
} else
#endif
{
/* Non-RPS path: the qtail output of enqueue_to_backlog() is discarded */
unsigned int qtail;
ret = enqueue_to_backlog(skb, smp_processor_id(), &qtail);
}
return ret;
}
关键结构体
< include/linux/netdevice.h >
/*
 * Per-RX-queue RPS map: the list of CPUs allowed to process packets of the
 * queue. In this file it is built by store_rps_map() and consulted by
 * get_rps_cpu().
 */
struct rps_map { /* RPS related */
unsigned int len; /* number of valid entries in cpus[], which is a variable-length (flexible) array */
struct rcu_head rcu;
u16 cpus[]; /* CPUs that handle packets for this queue, as configured through /sys/.../rps_cpus */
};
// An rps_dev_flow instance mainly holds two members: the CPU that last
// processed packets of this flow, and the tail index of the input_pkt_queue
// in that CPU's private softnet_data object.
struct rps_dev_flow { /* RFS related */
u16 cpu; /* CPU that last processed packets of this flow */
u16 filter; /* NOTE(review): not used by any code shown in this file */
unsigned int last_qtail; /* filled through the qtail argument of enqueue_to_backlog(); compared with input_queue_head in get_rps_cpu() */
};
/*
 * Per-RX-queue flow table used by RFS. store_rps_dev_flow_table_cnt()
 * allocates it with mask + 1 entries; get_rps_cpu() indexes it with
 * hash & mask.
 */
struct rps_dev_flow_table { /* RFS related */
unsigned int mask; /* entry count minus one; the entry count is a power of two */
struct rcu_head rcu; /* used by call_rcu() when an old table is released */
struct rps_dev_flow flows[]; /* flexible array of mask + 1 flow entries */
};
/* This structure contains an instance of an RX queue. */
struct netdev_rx_queue {
struct xdp_rxq_info xdp_rxq;
#ifdef CONFIG_RPS
/* Both pointers are RCU-protected: writers swap them with
 * rcu_assign_pointer(), get_rps_cpu() reads them with rcu_dereference().
 */
struct rps_map __rcu *rps_map; /* RPS CPU list, set by store_rps_map() */
struct rps_dev_flow_table __rcu *rps_flow_table; /* RFS per-queue flow table, set by store_rps_dev_flow_table_cnt() */
#endif
struct kobject kobj;
struct net_device *dev;
netdevice_tracker dev_tracker;
#ifdef CONFIG_XDP_SOCKETS
struct xsk_buff_pool *pool;
#endif
} ____cacheline_aligned_in_smp;
解析函数
store_rps_map()
< net/core/net-sysfs.c >
/*
 * store_rps_map - store handler that installs a new RPS CPU map on @queue.
 * @queue: RX queue whose rps_map pointer is replaced
 * @buf:   user-supplied CPU bitmap text, parsed with bitmap_parse()
 * @len:   length of @buf
 *
 * Returns @len on success, -EPERM without CAP_NET_ADMIN, -ENOMEM on an
 * allocation failure, -EINVAL when a non-empty mask becomes empty after the
 * housekeeping filtering, or the error reported by bitmap_parse().
 */
static ssize_t store_rps_map(struct netdev_rx_queue *queue,
const char *buf, size_t len)
{
struct rps_map *old_map, *map;
cpumask_var_t mask;
int err, cpu, i;
/* Function-local mutex serializing all updates of queue->rps_map */
static DEFINE_MUTEX(rps_map_mutex);
if (!capable(CAP_NET_ADMIN))
return -EPERM;
if (!alloc_cpumask_var(&mask, GFP_KERNEL))
return -ENOMEM;
err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits);
if (err) {
free_cpumask_var(mask);
return err;
}
/* An empty mask is accepted as-is; a non-empty one is restricted to the
 * HK_TYPE_DOMAIN and HK_TYPE_WQ housekeeping CPUs and rejected if nothing
 * remains.
 */
if (!cpumask_empty(mask)) {
cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_DOMAIN));
cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_WQ));
if (cpumask_empty(mask)) {
free_cpumask_var(mask);
return -EINVAL;
}
}
/* Allocate room for one entry per CPU in the mask, but never less than
 * L1_CACHE_BYTES.
 */
map = kzalloc(max_t(unsigned int,
RPS_MAP_SIZE(cpumask_weight(mask)), L1_CACHE_BYTES),
GFP_KERNEL);
if (!map) {
free_cpumask_var(mask);
return -ENOMEM;
}
/* Only CPUs that are both requested and currently online are recorded */
i = 0;
for_each_cpu_and(cpu, mask, cpu_online_mask)
map->cpus[i++] = cpu;
if (i) {
map->len = i;
} else {
/* No usable CPU: install a NULL map */
kfree(map);
map = NULL;
}
mutex_lock(&rps_map_mutex);
old_map = rcu_dereference_protected(queue->rps_map,
mutex_is_locked(&rps_map_mutex));
rcu_assign_pointer(queue->rps_map, map);
/* rps_needed counts installed maps: +1 for the new one, -1 for the old one */
if (map)
static_branch_inc(&rps_needed);
if (old_map)
static_branch_dec(&rps_needed);
mutex_unlock(&rps_map_mutex);
/* Free the old map via kfree_rcu() rather than immediately, since it was
 * published through an RCU pointer.
 */
if (old_map)
kfree_rcu(old_map, rcu);
free_cpumask_var(mask);
return len;
}
store_rps_dev_flow_table_cnt()
< net/core/net-sysfs.c >
/*
 * store_rps_dev_flow_table_cnt - store handler that resizes the per-queue
 * RFS flow table.
 * @queue: RX queue whose rps_flow_table pointer is replaced
 * @buf:   user-supplied entry count, parsed with kstrtoul() (base auto-detect)
 * @len:   length of @buf
 *
 * A non-zero count is rounded up to a power of two and a new table of that
 * size is installed; a count of zero installs a NULL table.
 *
 * Returns @len on success, -EPERM without CAP_NET_ADMIN, the kstrtoul()
 * error, -EINVAL when the rounded size fails the range checks, or -ENOMEM.
 */
static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
const char *buf, size_t len)
{
unsigned long mask, count;
struct rps_dev_flow_table *table, *old_table;
/* Function-local spinlock serializing swaps of queue->rps_flow_table */
static DEFINE_SPINLOCK(rps_dev_flow_lock);
int rc;
if (!capable(CAP_NET_ADMIN))
return -EPERM;
rc = kstrtoul(buf, 0, &count);
if (rc < 0)
return rc;
if (count) {
mask = count - 1;
/* mask = roundup_pow_of_two(count) - 1;
* without overflows...
*/
/* Smear the highest set bit downwards until every lower bit is set */
while ((mask | (mask >> 1)) != mask)
mask |= (mask >> 1);
/* On 64 bit arches, must check mask fits in table->mask (u32),
* and on 32bit arches, must check
* RPS_DEV_FLOW_TABLE_SIZE(mask + 1) doesn't overflow.
*/
#if BITS_PER_LONG > 32
if (mask > (unsigned long)(u32)mask)
return -EINVAL;
#else
if (mask > (ULONG_MAX - RPS_DEV_FLOW_TABLE_SIZE(1))
/ sizeof(struct rps_dev_flow)) {
/* Enforce a limit to prevent overflow */
return -EINVAL;
}
#endif
table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(mask + 1));
if (!table)
return -ENOMEM;
table->mask = mask;
/* count is reused as the loop index over all mask + 1 entries */
for (count = 0; count <= mask; count++)
table->flows[count].cpu = RPS_NO_CPU; /* initialize every entry to the "no CPU" default */
} else {
table = NULL;
}
spin_lock(&rps_dev_flow_lock);
old_table = rcu_dereference_protected(queue->rps_flow_table,
lockdep_is_held(&rps_dev_flow_lock));
rcu_assign_pointer(queue->rps_flow_table, table);
spin_unlock(&rps_dev_flow_lock);
/* Release the old table through call_rcu() instead of freeing it at once */
if (old_table)
call_rcu(&old_table->rcu, rps_dev_flow_table_release);
return len;
}
RFS
用户设置的位置/proc/sys/net/core/rps_sock_flow_entries
内核数据结构
struct rps_sock_flow_table
< include/linux/netdevice.h >
/*
 * Global socket flow table used by RFS. rps_record_sock_flow() writes it and
 * get_rps_cpu() reads it; both index it with hash & mask.
 */
struct rps_sock_flow_table {
u32 mask; /* index mask applied to the flow hash */
u32 ents[] ____cacheline_aligned_in_smp; /* each entry holds the upper hash bits (hash & ~rps_cpu_mask) OR-ed with the CPU that most recently ran the application for that flow */
};
proc设置时内核态的处理
处理函数注册位置
< net/core/sysctl_net_core.c >
/*
 * Excerpt of the net.core sysctl table: only the RFS-related entry is shown.
 * Accesses to /proc/sys/net/core/rps_sock_flow_entries are handled by
 * rps_sock_flow_sysctl(). The other entries of the real table are omitted.
 */
static struct ctl_table net_core_table[] = {
#ifdef CONFIG_RPS
	{
		.procname	= "rps_sock_flow_entries",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= rps_sock_flow_sysctl
	},
#endif
	/* ... remaining entries omitted in this excerpt ... */
};
全局流表更新
rps_record_sock_flow()
< include/linux/netdevice.h >
/*
 * rps_record_sock_flow - note in the global socket flow table that the flow
 * identified by @hash is being consumed on the current CPU.
 * @table: global socket flow table; nothing is done when it is NULL
 * @hash:  flow hash; nothing is done when it is zero
 *
 * The stored value keeps the hash bits outside rps_cpu_mask and places the
 * current CPU number in the remaining bits. The slot is written only when
 * its content would actually change.
 */
static inline void rps_record_sock_flow(struct rps_sock_flow_table *table,
					u32 hash)
{
	unsigned int slot;
	u32 entry;

	if (!table || !hash)
		return;

	slot = hash & table->mask;

	/* We only give a hint, preemption can change CPU under us */
	entry = (hash & ~rps_cpu_mask) | raw_smp_processor_id(); /* current CPU */

	if (table->ents[slot] != entry)
		table->ents[slot] = entry;
}
流表更新流程
accept() -> inet_accept() -> sock_rps_record_flow() -> sock_rps_record_flow_hash() -> rps_record_sock_flow()
tcp_splice_read() -> sock_rps_record_flow() -> sock_rps_record_flow_hash() -> rps_record_sock_flow()
tcp_zerocopy_receive() -> sock_rps_record_flow() -> sock_rps_record_flow_hash() -> rps_record_sock_flow()
tun_chr_write_iter() -> tun_get_user() -> tun_flow_update() -> sock_rps_record_flow_hash() -> rps_record_sock_flow()
/*
 * sock_rps_record_flow - record the receive hash of @sk in the RFS socket
 * flow table.
 * @sk: socket whose sk_rxhash is recorded
 *
 * Does nothing unless CONFIG_RPS is compiled in, the rfs_needed static
 * branch is enabled and the socket is in TCP_ESTABLISHED state.
 */
static inline void sock_rps_record_flow(const struct sock *sk)
{
#ifdef CONFIG_RPS
	if (!static_branch_unlikely(&rfs_needed))
		return;

	/*
	 * Touching sk->sk_rxhash may cost an expensive cache line miss, so
	 * filter on the socket state first.
	 *
	 * TCP_ESTABLISHED covers nearly every state in which RFS can help,
	 * and checking it is cheaper [1] than testing:
	 *	IPv4: inet_sk(sk)->inet_daddr
	 *	IPv6: ipv6_addr_any(&sk->sk_v6_daddr)
	 * or an extra socket flag.
	 * [1] : sk_state and sk_prot share a cache line.
	 */
	if (sk->sk_state != TCP_ESTABLISHED)
		return;

	sock_rps_record_flow_hash(sk->sk_rxhash);
#endif
}
sock_rps_record_flow_hash()
< include/net/sock.h >
/*
 * sock_rps_record_flow_hash - record @hash in the global RFS socket flow
 * table.
 * @hash: flow hash to record
 *
 * Looks up the RCU-protected rps_sock_flow_table pointer inside an RCU
 * read-side section and passes it to rps_record_sock_flow(). Compiles to an
 * empty function without CONFIG_RPS.
 */
static inline void sock_rps_record_flow_hash(__u32 hash)
{
#ifdef CONFIG_RPS
	struct rps_sock_flow_table *flows;

	rcu_read_lock();

	flows = rcu_dereference(rps_sock_flow_table);
	rps_record_sock_flow(flows, hash);

	rcu_read_unlock();
#endif
}
rps_record_sock_flow()
< include/linux/netdevice.h >
/*
 * rps_record_sock_flow - update the global socket flow table entry for
 * @hash with the CPU this code is currently running on.
 * @table: global socket flow table; a NULL table is ignored
 * @hash:  flow hash; a zero hash is ignored
 *
 * The entry combines the hash bits not covered by rps_cpu_mask with the
 * current CPU number, and is stored only if it differs from the slot's
 * present value.
 */
static inline void rps_record_sock_flow(struct rps_sock_flow_table *table,
					u32 hash)
{
	unsigned int slot;
	u32 entry;

	if (!table || !hash)
		return;

	slot = hash & table->mask;

	/* We only give a hint, preemption can change CPU under us */
	entry = (hash & ~rps_cpu_mask) | raw_smp_processor_id();

	if (table->ents[slot] != entry)
		table->ents[slot] = entry;
}
raw_smp_processor_id 函数作用
raw_smp_processor_id() 是 Linux 内核中用于获取"当前正在执行代码的 CPU 的逻辑编号"的接口。注意它返回的是内核分配的逻辑 CPU 编号(取值范围 0 ~ nr_cpu_ids-1),而不是硬件层面的物理 ID(例如 x86 上的 APIC ID)。
在 SMP(对称多处理)系统中,有多个 CPU 可以同时执行代码,内核为每个 CPU 分配一个唯一的逻辑编号。内核代码通过这个编号可以进行与 CPU 相关的操作,例如访问 per-CPU 数据、把任务或报文调度到特定的 CPU 等。
与 smp_processor_id() 相比,raw_smp_processor_id() 不做任何抢占状态检查,也不进行任何锁定或同步操作(开启 CONFIG_DEBUG_PREEMPT 时,smp_processor_id() 在可抢占上下文中被调用会产生告警,而 raw_ 版本不会)。因此在可抢占上下文中,返回值在读取之后可能因为任务被迁移到其他 CPU 而失效,使用时应谨慎考虑并发和同步问题。rps_record_sock_flow() 中只是把它当作一个提示(hint)写入流表,所以可以接受这种不精确。
两个流表分析
设备流表 rps_dev_flow_table,记录的是上次在内核态处理该流中报文的cpu。
全局的socket流表 rps_sock_flow_table,记录报文期望被处理的目标cpu。
get_rps_cpu()
< net/core/dev.c >
/*
 * get_rps_cpu - pick the CPU that should process @skb.
 * @dev:    receiving device; dev->_rx is the base of its RX queue array
 * @skb:    packet being steered
 * @rflowp: out parameter; written only when the RFS path selects the CPU,
 *          in which case it receives the matching per-queue flow entry
 *
 * Returns the chosen CPU number, or -1 when no steering decision was made
 * (out-of-range queue index, neither table configured for the queue, zero
 * hash, or no online target found). netif_rx_internal() calls this under
 * rcu_read_lock().
 */
static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
struct rps_dev_flow **rflowp)
{
const struct rps_sock_flow_table *sock_flow_table;
struct netdev_rx_queue *rxqueue = dev->_rx;
struct rps_dev_flow_table *flow_table;
struct rps_map *map;
int cpu = -1;
u32 tcpu;
u32 hash;
if (skb_rx_queue_recorded(skb)) {
u16 index = skb_get_rx_queue(skb); /* RX queue index recorded in the skb */
if (unlikely(index >= dev->real_num_rx_queues)) {
WARN_ONCE(dev->real_num_rx_queues > 1,
"%s received packet on queue %u, but number "
"of RX queues is %u\n",
dev->name, index, dev->real_num_rx_queues);
goto done;
}
rxqueue += index;
}
/* Avoid computing hash if RFS/RPS is not active for this rxqueue */
flow_table = rcu_dereference(rxqueue->rps_flow_table);
map = rcu_dereference(rxqueue->rps_map);
if (!flow_table && !map) /* neither the RFS flow table nor the RPS map is set for this queue: nothing to do */
goto done;
skb_reset_network_header(skb);
/* Obtain the flow hash of the skb. Per the original note, a hash computed
 * by the NIC (RSS) can be used directly, otherwise it is computed in
 * software from the packet contents (skb_get_hash() is not shown here).
 */
hash = skb_get_hash(skb);
if (!hash)
goto done;
sock_flow_table = rcu_dereference(rps_sock_flow_table);
if (flow_table && sock_flow_table) {
struct rps_dev_flow *rflow;
u32 next_cpu;
u32 ident;
/* First check into global flow table if there is a match */
ident = sock_flow_table->ents[hash & sock_flow_table->mask]; /* consult the global rps_sock_flow_table first */
/* The bits outside rps_cpu_mask must equal those of the hash, otherwise
 * the slot belongs to a different flow.
 */
if ((ident ^ hash) & ~rps_cpu_mask)
goto try_rps;
next_cpu = ident & rps_cpu_mask;
/* OK, now we know there is a match,
* we can look at the local (per receive queue) flow table
*/
rflow = &flow_table->flows[hash & flow_table->mask]; /* matching entry in this queue's rps_dev_flow_table */
tcpu = rflow->cpu;
/*
* If the desired CPU (where last recvmsg was done) is
* different from current CPU (one in the rx-queue flow
* table entry), switch if one of the following holds:
* - Current CPU is unset (>= nr_cpu_ids).
* - Current CPU is offline.
* - The current CPU's queue tail has advanced beyond the
* last packet that was enqueued using this table entry.
* This guarantees that all previous packets for the flow
* have been dequeued, thus preserving in order delivery.
*/
if (unlikely(tcpu != next_cpu) && /* current CPU and desired CPU differ */
(tcpu >= nr_cpu_ids || !cpu_online(tcpu) || /* current CPU is unset or offline */
((int)(per_cpu(softnet_data, tcpu).input_queue_head -
rflow->last_qtail)) >= 0)) {
tcpu = next_cpu;
rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
}
/* Prefer the RFS result when tcpu is a valid, online CPU */
if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
*rflowp = rflow;
cpu = tcpu;
goto done;
}
}
try_rps:
/* Reached when the global table entry did not match this hash, or when the
 * RFS block above was skipped or did not yield a valid online CPU: fall
 * back to the RPS map.
 */
if (map) {
tcpu = map->cpus[reciprocal_scale(hash, map->len)];
if (cpu_online(tcpu)) {
cpu = tcpu;
goto done;
}
}
done:
return cpu;
}