TCP/IP 协议栈 Linux 内核源码分析（六）：路由子系统分析（一）路由缓存

最新推荐文章于 2024-08-23 10:10:25 发布

yyyyyyyuande

最新推荐文章于 2024-08-23 10:10:25 发布

阅读量1.7k

点赞数 1

分类专栏： Linux 内核文章标签：路由缓存

本文链接：https://blog.csdn.net/fuyuande/article/details/90545266

版权

Linux 内核同时被 2 个专栏收录

32 篇文章

订阅专栏

Linux 网络子系统

8 篇文章

订阅专栏

本文围绕Linux内核3.4.39版本展开，介绍了路由缓存机制。Linux通过路由缓存减少路由表查询，以降低CPU消耗，路由缓存由hash表组织，初始化在ip_rt_init函数中。还说明了接收和发送报文时查找缓存的位置，以及缓存初始化流程和查询调用方式。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

内核版本：3.4.39

收到报文或者发送报文的时候都需要查找路由表，而频繁的路由表查找是需要耗费一部分CPU的。为此，Linux提供了路由缓存来减少路由表的查询。路由缓存由hash表组织而成，其初始化放在路由初始化函数ip_rt_init中。当路由缓存没有命中的时候会去查找路由表，查找成功则会把结果添加到路由缓存里。

有两个地方需要查找缓存，一个是ip_rcv()接收报文的时候，另一个是发送报文的时候。

缓存的初始化流程：

看下ip_rt_init函数

/*
 * ip_rt_init - boot-time initialisation of the IPv4 routing code,
 * including the route cache.
 *
 * In order, this function:
 *   - allocates the per-CPU ip_rt_acct area (CONFIG_IP_ROUTE_CLASSID only),
 *   - creates the "ip_dst_cache" slab cache used for struct rtable,
 *   - initialises the dst entry counters of both dst_ops instances,
 *   - allocates and zeroes the route cache hash table (rt_hash_table),
 *   - derives the GC threshold and the maximum cache size from the table size,
 *   - calls the devinet and FIB initialisers,
 *   - schedules the periodic expiry worker,
 *   - registers the proc files, xfrm (CONFIG_XFRM only), the RTM_GETROUTE
 *     netlink handler and the per-netns subsystems.
 *
 * Returns rc, which is never modified after being set to 0: the checked
 * allocation failures call panic(), and a proc registration failure is
 * only logged.
 */
int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
	// Route classifier accounting: room for 256 struct ip_rt_acct per CPU.
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	// Slab cache for route cache entries (one object per struct rtable).
	// The result is not checked here; SLAB_PANIC is passed instead.
	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	// The blackhole dst_ops shares the same slab cache.
	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	// Initialise the dst entry counter of ipv4_dst_ops (a per-CPU
	// variable according to the original note); failure is fatal.
	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	// Same for the blackhole dst_ops.
	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	// Allocate the route cache hash table. The scale argument is 15 when
	// totalram_pages >= 128 * 1024 and 17 otherwise; the last argument is
	// 0 when rhash_entries is set and 512 * 1024 otherwise. rt_hash_log
	// and rt_hash_mask are filled in by the allocator.
	// NOTE(review): exact meaning of each positional argument should be
	// confirmed against alloc_large_system_hash() for this kernel version.
	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(totalram_pages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					rhash_entries ? 0 : 512 * 1024);
	// Zero every bucket; the table holds rt_hash_mask + 1 buckets.
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));

	// Set up the locks used with the hash table
	// (presumably the per-bucket chain locks - confirm in rt_hash_lock_init).
	rt_hash_lock_init();

	// GC threshold = number of buckets; hard cap on cache entries = 16x that.
	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	// Initialise the inet device layer.
	devinet_init();

	// Initialise the FIB; per the original note this registers the
	// notifier chains and creates the alias cache.
	ip_fib_init();

	// Register the periodic expiry/GC work: rt_worker_func runs from a
	// deferrable delayed work item, first scheduled after a random delay
	// in [ip_rt_gc_interval, 2 * ip_rt_gc_interval).
	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
	expires_ljiffies = jiffies;
	schedule_delayed_work(&expires_work,
		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

	// A proc registration failure is logged but does not abort init.
	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	// Register the RTM_GETROUTE rtnetlink handler for PF_INET.
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	return rc;
}

函数中rt_hash_table就是路由缓存hash表。

看完了初始化看下查询是如何调用的。

首先是输入函数的查询：

主要的查找函数是ip_route_input_common:

/*
 * ip_route_input_common - find the route for a received packet and attach
 * it to the skb, trying the route cache before the routing table.
 *
 * @skb:   received packet; on a cache hit its dst is set to the cached entry
 * @daddr: destination address of the packet
 * @saddr: source address of the packet
 * @tos:   type-of-service value
 * @dev:   device the packet arrived on (its ifindex is the input interface)
 * @noref: selects the *_noref variants of dst_use()/skb_dst_set() on a hit
 *
 * The whole lookup runs inside an RCU read-side critical section, which is
 * released on every return path.
 *
 * Returns 0 on a cache hit; the result of ip_route_input_mc() for a
 * multicast destination that is accepted; -EINVAL for a multicast
 * destination that is not; otherwise the result of ip_route_input_slow().
 */
int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			   u8 tos, struct net_device *dev, bool noref)
{
	struct rtable * rth;
	unsigned	hash;
	int iif = dev->ifindex;
	struct net *net;
	int res;

	net = dev_net(dev);

	rcu_read_lock();

	// Route caching disabled for this namespace: go straight to the
	// multicast check / routing table lookup.
	if (!rt_caching(net))
		goto skip_cache;

	// NOTE: tos is masked only on the cache path; the jump above skips it.
	tos &= IPTOS_RT_MASK;

	// The input hash is built from dst, src, input ifindex and
	// rt_genid(net) (a random component per the original note).
	hash = rt_hash(daddr, saddr, iif, rt_genid(net));

	// Walk the hash chain selected by the packet's hash value.
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->dst.rt_next)) {
		// Compare addresses, input device and tos in one expression
		// (XOR each pair, OR the results: zero means all four match),
		// then mark, network namespace and expiry.
		if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
		     ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
		     (rth->rt_route_iif ^ iif) |
		     (rth->rt_key_tos ^ tos)) == 0 &&
		    rth->rt_mark == skb->mark &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			ipv4_validate_peer(rth);

			// Per the original note, noref is true for most packets
			// arriving from external devices.
			if (noref) {				
				// Update this entry's use counter and timestamp.
				dst_use_noref(&rth->dst, jiffies);
				// Attach the entry to the skb using the noref variant.
				
				skb_dst_set_noref(skb, &rth->dst);
			} else {
				// Per the original note, mostly packets the host
				// sends to itself take this branch.
				dst_use(&rth->dst, jiffies);
				skb_dst_set(skb, &rth->dst);
			}
			// Count the cache hit.
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			return 0;
		}
			// Count every chain entry examined that did not match.
			RT_CACHE_STAT_INC(in_hlist_search);
	}

skip_cache:
	/* Multicast recognition logic is moved from route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As result the host on multicasting
	   network acquires a lot of useless route cache entries, sort of
	   SDR messages from all the world. Now we try to get rid of them.
	   Really, provided software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   comparing with route cache reject entries.
	   Note, that multicast routers are not affected, because
	   route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		if (in_dev) {
			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
						  ip_hdr(skb)->protocol);
			// Accept when ip_check_mc_rcu() reports a match, or (with
			// CONFIG_IP_MROUTE) when the group is not link-local and
			// multicast forwarding is enabled on the device.
			if (our
#ifdef CONFIG_IP_MROUTE
				||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			   ) {
				// NOTE: this inner res shadows the function-level res.
				int res = ip_route_input_mc(skb, daddr, saddr,
							    tos, dev, our);
				rcu_read_unlock();
				return res;
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}

	// Cache miss (or caching disabled): look up the routing table.
	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL(ip_route_input_common);

输出报文查找函数是ip_route_output_flow()，这是个包裹函数，核心是调用__ip_route_output_key，

该函数如下：

/*
 * __ip_route_output_key - resolve the route for an outgoing flow: search the
 * route cache first and fall back to the routing table on a miss.
 *
 * @net:  network namespace the lookup is performed in
 * @flp4: flow key; on a cache hit, a zero saddr/daddr is filled in from the
 *        cached entry (rt_src / rt_dst)
 *
 * Returns the matching cached rtable on a hit, otherwise whatever
 * ip_route_output_slow() returns. The cache is skipped entirely when
 * rt_caching(net) is false.
 */
struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
{
	struct rtable *rth;
	unsigned int hash;

	if (!rt_caching(net))
		goto slow_output;

	// The output hash uses dst, src, output ifindex and rt_genid(net).
	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));

	rcu_read_lock_bh();
	// Walk the hash chain and return the first entry that matches.
	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
		rth = rcu_dereference_bh(rth->dst.rt_next)) {
		// Match on addresses, output-route flag, oif, mark, uid,
		// tos (only the IPTOS_RT_MASK | RTO_ONLINK bits), namespace,
		// and require that the entry has not expired.
		if (rth->rt_key_dst == flp4->daddr &&
		    rth->rt_key_src == flp4->saddr &&
		    rt_is_output_route(rth) &&
		    rth->rt_oif == flp4->flowi4_oif &&
		    rth->rt_mark == flp4->flowi4_mark &&
		    rth->rt_uid == flp4->flowi4_uid &&
		    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			ipv4_validate_peer(rth);
			// NOTE(review): dst_use() presumably takes the reference
			// that keeps rth valid after the unlock below - confirm.
			dst_use(&rth->dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			// Fill in addresses the caller left unspecified.
			if (!flp4->saddr)
				flp4->saddr = rth->rt_src;
			if (!flp4->daddr)
				flp4->daddr = rth->rt_dst;

			// Cache hit: return the cached entry.
			return rth;
		}
		// Count every chain entry examined that did not match.
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

slow_output:
	// Cache miss (or caching disabled): look up the routing table.
	return ip_route_output_slow(net, flp4);
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);

参考目录：

1. 《Linux Kernel Networking - Implementation and Theory》

2. 《深入理解Linux网络技术内幕》