内核版本:3.4.39
收到报文或者发送报文的时候都需要查找路由表,而频繁的路由表查找操作是需要耗费一部分CPU的。为此,Linux提供了路由缓存来减少路由表的查询:路由缓存由hash表组织而成,其初始化放在路由初始化函数ip_rt_init中;当路由缓存没有命中的时候会去查找路由表,查找成功则会把结果添加到路由缓存里。
有两个地方需要查找缓存,一个是ip_rcv()接收报文的时候,另一个是发送报文的时候。
缓存的初始化流程:
看下ip_rt_init函数
// Route cache initialization.
//
// ip_rt_init - one-time (__init) setup of the IPv4 routing cache.
//
// In order, this function:
//   1. allocates the per-CPU ip_rt_acct array (only with CONFIG_IP_ROUTE_CLASSID),
//   2. creates the "ip_dst_cache" slab cache used for struct rtable objects,
//   3. initializes the entry counters of ipv4_dst_ops / ipv4_dst_blackhole_ops,
//   4. allocates and zeroes the route cache hash table (rt_hash_table),
//   5. derives gc_thresh and ip_rt_max_size from the number of hash buckets,
//   6. calls devinet_init() and ip_fib_init(),
//   7. schedules the deferrable delayed work item running rt_worker_func,
//   8. creates the proc files, initializes xfrm (if configured), registers the
//      RTM_GETROUTE rtnetlink handler and the pernet subsystems.
//
// Returns rc, which is never modified after its initialization, so the
// function always returns 0. Allocation failures are handled by panic().
int __init ip_rt_init(void)
{
int rc = 0;
#ifdef CONFIG_IP_ROUTE_CLASSID
// Route-based classifier accounting: 256 struct ip_rt_acct slots per CPU.
ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
if (!ip_rt_acct)
panic("IP: failed to allocate ip_rt_acct\n");
#endif
// Slab cache from which route cache entries (struct rtable) are allocated.
// No NULL check follows; SLAB_PANIC is passed, which presumably makes the
// allocator panic on failure instead of returning NULL.
ipv4_dst_ops.kmem_cachep =
kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
// The blackhole dst ops share the very same slab cache.
ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
// Initialize the entries counter of ipv4_dst_ops (described by the original
// author as a per-CPU variable); failure is fatal.
if (dst_entries_init(&ipv4_dst_ops) < 0)
panic("IP: failed to allocate ipv4_dst_ops counter\n");
// Same counter initialization for the blackhole dst ops.
if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
// Allocate the route cache hash table. rhash_entries is the requested entry
// count; the scale argument is 15 when totalram_pages >= 128 * 1024 and 17
// otherwise; the resulting log2 size and mask are written to rt_hash_log and
// rt_hash_mask; the last argument is 512 * 1024 only when rhash_entries is 0
// (presumably an upper limit on the auto-sized table - confirm against
// alloc_large_system_hash()).
rt_hash_table = (struct rt_hash_bucket *)
alloc_large_system_hash("IP route cache",
sizeof(struct rt_hash_bucket),
rhash_entries,
(totalram_pages >= 128 * 1024) ?
15 : 17,
0,
&rt_hash_log,
&rt_hash_mask,
rhash_entries ? 0 : 512 * 1024);
// Zero all rt_hash_mask + 1 buckets so every chain starts out empty.
memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
// Initialize the locks used for the hash buckets (see rt_hash_lock_init()).
rt_hash_lock_init();
// Size the garbage-collection threshold and the maximum cache size from the
// bucket count: gc_thresh = number of buckets, max size = 16 * buckets.
ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
ip_rt_max_size = (rt_hash_mask + 1) * 16;
// Initialize the devinet layer (see devinet_init()).
devinet_init();
// Per the original author: registers notifier chains and creates the alias
// cache (see ip_fib_init()).
ip_fib_init();
// Set up the periodic worker: rt_worker_func runs as deferrable delayed work.
// The first run is delayed by a random value in
// [ip_rt_gc_interval, 2 * ip_rt_gc_interval).
INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
expires_ljiffies = jiffies;
schedule_delayed_work(&expires_work,
net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
// Failure to create the proc files is only logged; initialization continues.
if (ip_rt_proc_init())
pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
// xfrm4_init() receives the maximum route cache size computed above.
xfrm_init();
xfrm4_init(ip_rt_max_size);
#endif
// Register the rtnetlink handler for RTM_GETROUTE requests on PF_INET.
rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
#ifdef CONFIG_SYSCTL
register_pernet_subsys(&sysctl_route_ops);
#endif
register_pernet_subsys(&rt_genid_ops);
return rc;
}
函数中rt_hash_table就是路由缓存hash表。
看完了初始化看下查询是如何调用的。
首先是输入函数的查询:
主要的查找函数是ip_route_input_common:
// ip_route_input_common - find the route for a received packet.
//
// Lookup order: (1) the route cache, unless rt_caching(net) is false;
// (2) for multicast destinations, ip_route_input_mc() when the group is ours
// (or multicast forwarding applies under CONFIG_IP_MROUTE); (3) the routing
// table via ip_route_input_slow().
//
// @skb:   packet being routed; on a cache hit its dst is set to the entry.
// @daddr: destination address used as lookup key.
// @saddr: source address used as lookup key.
// @tos:   type of service; masked with IPTOS_RT_MASK on the cached path.
// @dev:   device the packet arrived on (its ifindex is the iif key).
// @noref: selects dst_use_noref()/skb_dst_set_noref() instead of
//         dst_use()/skb_dst_set() when attaching a cached entry.
//
// Returns 0 on a cache hit, -EINVAL for a multicast destination that is not
// handled here, otherwise the result of ip_route_input_mc() or
// ip_route_input_slow(). The whole lookup runs between rcu_read_lock() and
// rcu_read_unlock(); every return path drops the RCU read lock first.
//
// NOTE(review): tos is masked only after the rt_caching() check, so when the
// cache is skipped the unmasked tos reaches the code after skip_cache -
// confirm this is intended.
// NOTE(review): the inner "int res" in the multicast branch shadows the
// function-level "res"; harmless here but would trip -Wshadow.
int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
u8 tos, struct net_device *dev, bool noref)
{
struct rtable * rth;
unsigned hash;
int iif = dev->ifindex;
struct net *net;
int res;
net = dev_net(dev);
rcu_read_lock();
if (!rt_caching(net))
goto skip_cache;
tos &= IPTOS_RT_MASK;
// The input hash is computed from daddr, saddr, the input ifindex and
// rt_genid(net) (described by the original author as a random component).
hash = rt_hash(daddr, saddr, iif, rt_genid(net));
// Walk the hash chain selected by the packet's hash value.
for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
rth = rcu_dereference(rth->dst.rt_next)) {
// Match on addresses, input interface and tos (the XOR/OR expression is 0
// only if all four key fields are equal), then on mark and network
// namespace; expired entries are ignored.
if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
(rth->rt_route_iif ^ iif) |
(rth->rt_key_tos ^ tos)) == 0 &&
rth->rt_mark == skb->mark &&
net_eq(dev_net(rth->dst.dev), net) &&
!rt_is_expired(rth)) {
ipv4_validate_peer(rth);
// noref distinction: per the original author, packets coming from external
// devices generally arrive with noref == true.
if (noref) {
// Update the entry's use counter and timestamp (per the original author's
// note), then attach it to the skb through the noref variant.
dst_use_noref(&rth->dst, jiffies);
skb_dst_set_noref(skb, &rth->dst);
} else {
// Per the original author, mostly packets the host sends to itself end up
// on this path.
dst_use(&rth->dst, jiffies);
skb_dst_set(skb, &rth->dst);
}
// Count the cache hit.
RT_CACHE_STAT_INC(in_hit);
rcu_read_unlock();
return 0;
}
// Count each chain entry that was examined without matching.
RT_CACHE_STAT_INC(in_hlist_search);
}
skip_cache:
/* Multicast recognition logic is moved from route cache to here.
The problem was that too many Ethernet cards have broken/missing
hardware multicast filters :-( As result the host on multicasting
network acquires a lot of useless route cache entries, sort of
SDR messages from all the world. Now we try to get rid of them.
Really, provided software IP multicast filter is organized
reasonably (at least, hashed), it does not result in a slowdown
comparing with route cache reject entries.
Note, that multicast routers are not affected, because
route cache entry is created eventually.
*/
if (ipv4_is_multicast(daddr)) {
struct in_device *in_dev = __in_dev_get_rcu(dev);
if (in_dev) {
// "our" is the result of ip_check_mc_rcu() for this group/source/protocol
// on the incoming device.
int our = ip_check_mc_rcu(in_dev, daddr, saddr,
ip_hdr(skb)->protocol);
if (our
#ifdef CONFIG_IP_MROUTE
||
(!ipv4_is_local_multicast(daddr) &&
IN_DEV_MFORWARD(in_dev))
#endif
) {
int res = ip_route_input_mc(skb, daddr, saddr,
tos, dev, our);
rcu_read_unlock();
return res;
}
}
// Multicast destination that is neither ours nor to be forwarded.
rcu_read_unlock();
return -EINVAL;
}
// Cache miss (or cache disabled): look up the routing table.
res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
rcu_read_unlock();
return res;
}
EXPORT_SYMBOL(ip_route_input_common);
输出报文的查找函数是ip_route_output_flow(),它只是一个包装函数,核心是调用__ip_route_output_key(),
该函数如下:
// __ip_route_output_key - find the route for an outgoing flow.
//
// Looks in the route cache first (unless rt_caching(net) is false) and falls
// back to the routing table via ip_route_output_slow() on a miss.
//
// @net:  network namespace the lookup is performed in.
// @flp4: flow key (daddr, saddr, oif, mark, uid, tos). On a cache hit, a zero
//        saddr/daddr in the key is filled in from the matching entry.
//
// Returns the matching cached struct rtable on a hit (after dst_use() has
// been called on it), otherwise whatever ip_route_output_slow() returns.
struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
{
struct rtable *rth;
unsigned int hash;
if (!rt_caching(net))
goto slow_output;
// The output hash is computed from daddr, saddr, the output ifindex and
// rt_genid(net).
hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
rcu_read_lock_bh();
// Walk the hash chain and return the first matching entry.
for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
rth = rcu_dereference_bh(rth->dst.rt_next)) {
// An entry matches when it is an output route whose key addresses, oif,
// mark and uid equal the flow's, whose tos agrees on the
// IPTOS_RT_MASK | RTO_ONLINK bits, which belongs to this namespace and
// has not expired.
if (rth->rt_key_dst == flp4->daddr &&
rth->rt_key_src == flp4->saddr &&
rt_is_output_route(rth) &&
rth->rt_oif == flp4->flowi4_oif &&
rth->rt_mark == flp4->flowi4_mark &&
rth->rt_uid == flp4->flowi4_uid &&
!((rth->rt_key_tos ^ flp4->flowi4_tos) &
(IPTOS_RT_MASK | RTO_ONLINK)) &&
net_eq(dev_net(rth->dst.dev), net) &&
!rt_is_expired(rth)) {
ipv4_validate_peer(rth);
// dst_use() is called before the RCU-bh read lock is dropped; rth is still
// read afterwards (presumably safe because dst_use() holds a reference -
// confirm against dst_use()).
dst_use(&rth->dst, jiffies);
RT_CACHE_STAT_INC(out_hit);
rcu_read_unlock_bh();
// Fill in addresses the caller left as zero from the cached entry.
if (!flp4->saddr)
flp4->saddr = rth->rt_src;
if (!flp4->daddr)
flp4->daddr = rth->rt_dst;
// Cache hit: return the cached entry.
return rth;
}
// Count each chain entry that was examined without matching.
RT_CACHE_STAT_INC(out_hlist_search);
}
rcu_read_unlock_bh();
slow_output:
// Cache miss (or cache disabled): look up the routing table.
return ip_route_output_slow(net, flp4);
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);
参考目录:
1. 《Linux Kernel Networking - Implementation and Theory》
2. 《深入理解Linux网络技术内幕》