基于linux2.4.0分析,讲解路由原理,我们看比较简单的方式,未定义CONFIG_IP_MULTIPLE_TABLES(策略路由)的。
在“linux tcp 的 socket、bind、listen、accept原理分析”博客中,我们留下了inet_addr_type这个与路由相关的知识点。如果想弄懂这个函数,我们要再从inet_init函数看起,由此理清路由函数表的初始化过程,并从中找到本地路由表函数的内容。
inet_init内部调用了ip_init,通过它进一步调用ip_rt_init,由此完成路由函数表的初始化操作。
inet_init=>ip_init
/*
 * Top-level IPv4 initialisation, reached from inet_init().
 * Registers the IP packet handler, then brings up the routing
 * subsystem and the inet peer cache.
 */
void __init ip_init(void)
{
/* Hook IP into the link-layer demultiplexer so IP frames are delivered. */
dev_add_pack(&ip_packet_type);
/* Initialise FIB tables, the routing cache and related timers. */
ip_rt_init();
/* Set up the long-living IP peer information cache. */
inet_initpeers();
#ifdef CONFIG_IP_MULTICAST
/* /proc/net/igmp entry for multicast group state. */
proc_net_create("igmp", 0, ip_mc_procinfo);
#endif
}
inet_init=>ip_init=>ip_rt_init
/*
 * Initialise the IPv4 routing subsystem: the route-cache slab, the
 * routing cache hash table, device-level config (devinet_init), the
 * FIB tables (ip_fib_init), the flush/expire timers and /proc entries.
 */
void __init ip_rt_init(void)
{
int i, order, goal;
#ifdef CONFIG_NET_CLS_ROUTE
/* Find the smallest page order that fits the per-CPU accounting array. */
for (order=0;
(PAGE_SIZE<<order) < 256*sizeof(ip_rt_acct)*NR_CPUS; order++)
/* NOTHING */;
ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
if (!ip_rt_acct)
panic("IP: failed to allocate ip_rt_acct\n");
memset(ip_rt_acct, 0, PAGE_SIZE<<order);
#endif
/* Slab cache for route cache entries; object size is sizeof(struct rtable). */
ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
sizeof(struct rtable),
0, SLAB_HWCACHE_ALIGN,
NULL, NULL);
if (!ipv4_dst_ops.kmem_cachep)
panic("IP: failed to allocate ip_dst_cache\n");
/* Scale the hash table size with the amount of physical memory. */
goal = num_physpages >> (26 - PAGE_SHIFT);
for (order = 0; (1UL << order) < goal; order++)
/* NOTHING */;
do {
rt_hash_mask = (1UL << order) * PAGE_SIZE /
sizeof(struct rt_hash_bucket);
/* Round the bucket count down to a power of two. */
while (rt_hash_mask & (rt_hash_mask-1))
rt_hash_mask--;
rt_hash_table = (struct rt_hash_bucket *)
__get_free_pages(GFP_ATOMIC, order);/* allocate the route hash buckets; retry with a smaller order on failure */
} while (rt_hash_table == NULL && --order > 0);
if (!rt_hash_table)
panic("Failed to allocate IP route cache hash table\n");
printk("IP: routing cache hash table of %u buckets, %ldKbytes\n",
rt_hash_mask,
(long) (rt_hash_mask*sizeof(struct rt_hash_bucket))/1024);
for (rt_hash_log=0; (1<<rt_hash_log) != rt_hash_mask; rt_hash_log++)
/* NOTHING */;
rt_hash_mask--;
/* Initialise every hash bucket: empty chain, unlocked rwlock. */
for (i = 0; i <= rt_hash_mask; i++) {
rt_hash_table[i].lock = RW_LOCK_UNLOCKED;
rt_hash_table[i].chain = NULL;
}
ipv4_dst_ops.gc_thresh = (rt_hash_mask+1);/* garbage-collection threshold */
ip_rt_max_size = (rt_hash_mask+1)*16;/* upper bound on cached route entries */
devinet_init();
ip_fib_init();
rt_flush_timer.function = rt_run_flush;
rt_periodic_timer.function = rt_check_expire;
/* All the timers, started at system startup tend
to synchronize. Perturb it a bit.
*/
rt_periodic_timer.expires = jiffies + net_random()%ip_rt_gc_interval
+ ip_rt_gc_interval;
add_timer(&rt_periodic_timer);
proc_net_create ("rt_cache", 0, rt_cache_get_info);
#ifdef CONFIG_NET_CLS_ROUTE
create_proc_read_entry("net/rt_acct", 0, 0, ip_rt_acct_read, NULL);
#endif
}
相关数据结构:
/* One bucket of the routing cache hash table (rt_hash_table). */
struct rt_hash_bucket {
struct rtable *chain;/* head of the chain of cached rtable entries */
rwlock_t lock; /* protects the chain */
} __attribute__((__aligned__(8)));
struct sk_buff;
/*
 * Protocol-independent destination cache entry. It is embedded as the
 * first member of struct rtable (see below), so a dst_entry pointer can
 * refer to the containing route cache entry.
 */
struct dst_entry
{
struct dst_entry *next;
atomic_t __refcnt; /* client references */
int __use;
struct net_device *dev;
int obsolete;
int flags;
#define DST_HOST 1
unsigned long lastuse;
unsigned long expires;
unsigned mxlock;
unsigned pmtu;
unsigned window;
unsigned rtt;
unsigned rttvar;
unsigned ssthresh;
unsigned cwnd;
unsigned advmss;
unsigned reordering;
unsigned long rate_last; /* rate limiting for ICMP */
unsigned long rate_tokens;
int error;
struct neighbour *neighbour;
struct hh_cache *hh;
int (*input)(struct sk_buff*);/* input handler function pointer */
int (*output)(struct sk_buff*);/* output handler function pointer */
#ifdef CONFIG_NET_CLS_ROUTE
__u32 tclassid;
#endif
struct dst_ops *ops;
char info[0];
};
/*
 * IPv4 route cache entry. The union overlays the embedded dst_entry
 * with rt_next, so the first pointer doubles as the hash-chain link
 * when entries are chained in a rt_hash_bucket.
 */
struct rtable
{
union
{
struct dst_entry dst;
struct rtable *rt_next;
} u;
unsigned rt_flags;
unsigned rt_type;
__u32 rt_dst; /* Path destination */
__u32 rt_src; /* Path source */
int rt_iif;
/* Info on neighbour */
__u32 rt_gateway;
/* Cache lookup keys */
struct rt_key key;
/* Miscellaneous cached information */
__u32 rt_spec_dst; /* RFC1122 specific destination */
struct inet_peer *peer; /* long-living peer info */
#ifdef CONFIG_IP_ROUTE_NAT
__u32 rt_src_map;
__u32 rt_dst_map;
#endif
};
devinet_init函数中注册的是用于管理路由地址,关于通知链相关原理,后续会写对应博客解释。
inet_init=>ip_init=>ip_rt_init=>devinet_init
/* Notifier block for network-device events: the handler is
 * inetdev_event; the remaining fields (next, priority) are zeroed. */
struct notifier_block ip_netdev_notifier={
inetdev_event,
NULL,
0
};
/*
 * Initialise IPv4 device-address management: the SIOCGIFCONF handler,
 * the netdevice notifier, and (if configured) rtnetlink and sysctl hooks.
 */
void __init devinet_init(void)
{
register_gifconf(PF_INET, inet_gifconf);
register_netdevice_notifier(&ip_netdev_notifier);/* register on the netdevice notifier chain */
#ifdef CONFIG_RTNETLINK
rtnetlink_links[PF_INET] = inet_rtnetlink_table;
#endif
#ifdef CONFIG_SYSCTL
devinet_sysctl.sysctl_header =
register_sysctl_table(devinet_sysctl.devinet_root_dir, 0);
devinet_sysctl_register(NULL, &ipv4_devconf_dflt);
#endif
}
inet_init=>ip_init=>ip_rt_init=>ip_fib_init
struct fib_table *local_table;/* the local routing table (RT_TABLE_LOCAL) */
struct fib_table *main_table;/* the main routing table (RT_TABLE_MAIN) */
/*
 * Initialise the Forwarding Information Base: create the local and
 * main tables (or the policy-routing rules when CONFIG_IP_MULTIPLE_TABLES
 * is set) and register the device/address notifiers that keep the FIB
 * in sync with interface state.
 */
void __init ip_fib_init(void)
{
#ifdef CONFIG_PROC_FS
proc_net_create("route",0,fib_get_procinfo);
#endif /* CONFIG_PROC_FS */
#ifndef CONFIG_IP_MULTIPLE_TABLES
local_table = fib_hash_init(RT_TABLE_LOCAL);/* create the local routing table */
main_table = fib_hash_init(RT_TABLE_MAIN);/* create the main routing table */
#else
fib_rules_init();
#endif
/* Notifier registration: the handlers add/modify/delete routes
 * (via fib_add_ifaddr and friends) on device and address events. */
register_netdevice_notifier(&fib_netdev_notifier);
register_inetaddr_notifier(&fib_inetaddr_notifier);
}
inet_init=>ip_init=>ip_rt_init=>ip_fib_init=>fib_hash_init
/*
 * One FIB table: an id plus a set of operation hooks that
 * fib_hash_init() points at the fn_hash_* implementations.
 */
struct fib_table
{
unsigned char tb_id;
unsigned tb_stamp;
int (*tb_lookup)(struct fib_table *tb, const struct rt_key *key, struct fib_result *res);
int (*tb_insert)(struct fib_table *table, struct rtmsg *r,
struct kern_rta *rta, struct nlmsghdr *n,
struct netlink_skb_parms *req);
int (*tb_delete)(struct fib_table *table, struct rtmsg *r,
struct kern_rta *rta, struct nlmsghdr *n,
struct netlink_skb_parms *req);
int (*tb_dump)(struct fib_table *table, struct sk_buff *skb,
struct netlink_callback *cb);
int (*tb_flush)(struct fib_table *table);
int (*tb_get_info)(struct fib_table *table, char *buf,
int first, int count);
void (*tb_select_default)(struct fib_table *table,
const struct rt_key *key, struct fib_result *res);
unsigned char tb_data[0];/* zero-length array: a struct fn_hash is allocated together with the fib_table and lives here */
};
/* Per-table zone index, stored in fib_table.tb_data. */
struct fn_hash
{
struct fn_zone *fn_zones[33];/* one zone per prefix length 0..32 of an IPv4 address, hence 33 slots */
struct fn_zone *fn_zone_list;/* list of the non-empty zones, walked by fn_hash_lookup */
};
struct fn_zone/* routing zone: all routes sharing one prefix length */
{
struct fn_zone *fz_next; /* next non-empty zone on the list */
struct fib_node **fz_hash; /* hash bucket array of fib_node chains */
int fz_nent; /* number of route nodes in this zone */
int fz_divisor; /* number of hash buckets */
u32 fz_hashmask; /* mask applied to the hash value */
#define FZ_HASHMASK(fz) ((fz)->fz_hashmask)
int fz_order; /* prefix (netmask) length in bits */
u32 fz_mask;/* the subnet mask itself */
#define FZ_MASK(fz) ((fz)->fz_mask)
};
/* One route node on a zone's hash chain. */
struct fib_node
{
struct fib_node *fn_next; /* next node on the same hash chain */
struct fib_info *fn_info; /* shared route information (next hops etc.) */
#define FIB_INFO(f) ((f)->fn_info)
fn_key_t fn_key; /* lookup key (destination masked by the zone mask — see fz_key usage in fn_hash_lookup) */
u8 fn_tos;
u8 fn_type; /* route type, RTN_* */
u8 fn_scope;
u8 fn_state; /* FN_S_* flags such as FN_S_ACCESSED / FN_S_ZOMBIE */
};
/* One next hop of a route; fib_info carries one (or, with multipath,
 * several) of these. */
struct fib_nh
{
struct net_device *nh_dev;/* network device used for this hop */
unsigned nh_flags;/* hop flags (e.g. RTNH_F_DEAD) */
unsigned char nh_scope;/* scope of this hop, used when resolving the next hop */
#ifdef CONFIG_IP_ROUTE_MULTIPATH
int nh_weight;/* configured multipath weight */
int nh_power;/* remaining multipath credit */
#endif
#ifdef CONFIG_NET_CLS_ROUTE
__u32 nh_tclassid;
#endif
int nh_oif;/* output interface index */
u32 nh_gw;/* gateway IP address */
};
/*
 * Allocate and initialise one hash-based FIB table with the given id
 * (e.g. RT_TABLE_LOCAL or RT_TABLE_MAIN). The fn_hash zone index is
 * allocated in the same kmalloc block, right after the fib_table,
 * via the zero-length tb_data array. Returns NULL on allocation failure.
 */
#ifdef CONFIG_IP_MULTIPLE_TABLES
struct fib_table * fib_hash_init(int id)
#else
struct fib_table * __init fib_hash_init(int id)
#endif
{
struct fib_table *tb;
/* Lazily create the slab cache shared by all fib_node allocations. */
if (fn_hash_kmem == NULL)
fn_hash_kmem = kmem_cache_create("ip_fib_hash",
sizeof(struct fib_node),
0, SLAB_HWCACHE_ALIGN,
NULL, NULL);
tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash), GFP_KERNEL);
if (tb == NULL)
return NULL;
/* Wire up the table's operation hooks to the fn_hash implementation. */
tb->tb_id = id;
tb->tb_lookup = fn_hash_lookup;
tb->tb_insert = fn_hash_insert;
tb->tb_delete = fn_hash_delete;
tb->tb_flush = fn_hash_flush;
tb->tb_select_default = fn_hash_select_default;
#ifdef CONFIG_RTNETLINK
tb->tb_dump = fn_hash_dump;
#endif
#ifdef CONFIG_PROC_FS
tb->tb_get_info = fn_hash_get_info;
#endif
memset(tb->tb_data, 0, sizeof(struct fn_hash));/* zero the embedded fn_hash structure */
return tb;
}
到了这里我们可以解答之前的一个问题在博客 “linux tcp 的 socket、bind、listen、accept原理分析”中 inet_addr_type函数中的local_table->tb_lookup是什么函数,答案为fn_hash_lookup。我们看下相关代码:
sys_socketcall=>sys_bind=>inet_bind=>inet_addr_type
/*
 * Classify an IPv4 address by looking it up in the local FIB table
 * (excerpt — '......' marks lines elided from the original source).
 * Returns the matched route type (e.g. RTN_LOCAL) from res.type, or
 * RTN_UNICAST when the local table holds no matching entry.
 */
unsigned inet_addr_type(u32 addr)
{
......
if (local_table) {
ret = RTN_UNICAST;
/* tb_lookup is fn_hash_lookup, installed by fib_hash_init(). */
if (local_table->tb_lookup(local_table, &key, &res) == 0) {
ret = res.type;
fib_res_put(&res);
}
}
return ret;
}
sys_socketcall=>sys_bind=>inet_bind=>inet_addr_type=> fn_hash_lookup
/* Look up the FIB entry matching the rt_key *key in table tb; on success
 * the result is stored into *res. Two nested loops: the outer one walks
 * the zone list (one zone per prefix length), the inner one walks the
 * hash chain of fib_node entries inside the zone. Once a node on the
 * same subnet is found, fib_semantic_match() validates the route
 * semantics (type, next hops). Returns 0 on success, 1 when nothing
 * matches, or a negative error from fib_semantic_match(). */
static int
fn_hash_lookup(struct fib_table *tb, const struct rt_key *key, struct fib_result *res)
{
int err;
struct fn_zone *fz;
struct fn_hash *t = (struct fn_hash*)tb->tb_data;/* tb_data holds the struct fn_hash — the head of this FIB table */
/* Walk the list of non-empty zones (struct fn_zone), longest prefix first. */
read_lock(&fib_hash_lock);
for (fz = t->fn_zone_list; fz; fz = fz->fz_next) {
struct fib_node *f;
fn_key_t k = fz_key(key->dst, fz);
/* Walk the fib_node chain for this key's hash bucket. */
for (f = fz_chain(k, fz); f; f = f->fn_next) {
if (!fn_key_eq(k, f->fn_key)) {
if (fn_key_leq(k, f->fn_key))
break;
else
continue;
}
#ifdef CONFIG_IP_ROUTE_TOS
if (f->fn_tos && f->fn_tos != key->tos)
continue;
#endif
f->fn_state |= FN_S_ACCESSED;/* matching FIB node found: mark it accessed */
if (f->fn_state&FN_S_ZOMBIE)
continue;
if (f->fn_scope < key->scope)
continue;
err = fib_semantic_match(f->fn_type, FIB_INFO(f), key, res);/* check route type / next hops */
if (err == 0) {
res->type = f->fn_type;
res->scope = f->fn_scope;
res->prefixlen = fz->fz_order;
goto out;
}
if (err < 0)
goto out;
}
}
err = 1;
out:
read_unlock(&fib_hash_lock);
return err;
}
sys_socketcall=>sys_bind=>inet_bind=>inet_addr_type=> fn_hash_lookup=>fib_semantic_match
/*
 * Validate a candidate route of the given type: look up the per-type
 * error code in fib_props[], reject dead routes, and for the forwarding
 * types pick a usable next hop (one matching key->oif when an output
 * interface was specified). On success records fi in res, takes a
 * reference on it and returns 0; returns 1 when no usable hop exists,
 * or the (negative) per-type error / -EINVAL for unknown types.
 */
int
fib_semantic_match(int type, struct fib_info *fi, const struct rt_key *key, struct fib_result *res)
{
int err = fib_props[type].error;
if (err == 0) {
/* Route info being deleted — treat as no match. */
if (fi->fib_flags&RTNH_F_DEAD)
return 1;
res->fi = fi;
switch (type) {
#ifdef CONFIG_IP_ROUTE_NAT
case RTN_NAT:
FIB_RES_RESET(*res);
atomic_inc(&fi->fib_clntref);
return 0;
#endif
case RTN_UNICAST:
case RTN_LOCAL:
case RTN_BROADCAST:
case RTN_ANYCAST:
case RTN_MULTICAST:
/* Scan the next hops; stop at the first live hop that
 * matches the requested output interface (if any). */
for_nexthops(fi) {
if (nh->nh_flags&RTNH_F_DEAD)
continue;
if (!key->oif || key->oif == nh->nh_oif)
break;
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
if (nhsel < fi->fib_nhs) {
res->nh_sel = nhsel;
atomic_inc(&fi->fib_clntref);
return 0;
}
#else
/* Without multipath there is a single hop: nhsel < 1
 * means the loop broke out on a usable hop. */
if (nhsel < 1) {
atomic_inc(&fi->fib_clntref);
return 0;
}
#endif
endfor_nexthops(fi);
res->fi = NULL;
return 1;
default:
res->fi = NULL;
printk(KERN_DEBUG "impossible 102\n");
return -EINVAL;
}
}
return err;
}
代码中的fib_props定义:
/* Per-route-type properties, indexed by the RTN_* route type: the
 * error code returned for that type (0 = route is usable) and its
 * default scope. NOTE(review): the array is declared with RTA_MAX+1
 * entries although it is indexed by RTN_* values — this matches the
 * quoted 2.4 source. */
static struct
{
int error;
u8 scope;
} fib_props[RTA_MAX+1] = {
{ 0, RT_SCOPE_NOWHERE}, /* RTN_UNSPEC */
{ 0, RT_SCOPE_UNIVERSE}, /* RTN_UNICAST */
{ 0, RT_SCOPE_HOST}, /* RTN_LOCAL */
{ 0, RT_SCOPE_LINK}, /* RTN_BROADCAST */
{ 0, RT_SCOPE_LINK}, /* RTN_ANYCAST */
{ 0, RT_SCOPE_UNIVERSE}, /* RTN_MULTICAST */
{ -EINVAL, RT_SCOPE_UNIVERSE}, /* RTN_BLACKHOLE */
{ -EHOSTUNREACH, RT_SCOPE_UNIVERSE},/* RTN_UNREACHABLE */
{ -EACCES, RT_SCOPE_UNIVERSE}, /* RTN_PROHIBIT */
{ -EAGAIN, RT_SCOPE_UNIVERSE}, /* RTN_THROW */
#ifdef CONFIG_IP_ROUTE_NAT
{ 0, RT_SCOPE_HOST}, /* RTN_NAT */
#else
{ -EINVAL, RT_SCOPE_NOWHERE}, /* RTN_NAT */
#endif
{ -EINVAL, RT_SCOPE_NOWHERE} /* RTN_XRESOLVE */
};
以路由节点的fn_type,即路由类型为下标取出数组中的错误码err,如果错误码是0,则表示支持该路由,检查输入参数路由信息是否正处于被删除状态,这要根据fib_flags标志中的RTNH_F_DEAD来决定。先将fi路由信息记入到res中,再根据type路由类型来执行switch语句。
类型 | 意义 |
RTN_UNICAST | 单播路由类型 |
RTN_LOCAL | 回环的路由(本地转发)类型 |
RTN_BROADCAST | 广播路由类型 |
RTN_ANYCAST | 任意路由类型 |
RTN_MULTICAST | 组播(多播)路由类型 |
这些路由类型都调用了for_nexthops宏,看其定义:
/* Iterate over the next hop(s) of a fib_info. This is the
 * non-multipath variant: the loop bound is 1, so the body runs at
 * most once with nh pointing at fi->fib_nh and nhsel counting hops. */
#define for_nexthops(fi) { int nhsel=0; const struct fib_nh * nh = (fi)->fib_nh; \
for (nhsel=0; nhsel < 1; nhsel++)
这个宏定义是从fib_info结构中获取fib_nh路由跳转结构指针,一个fib_nh结构代表一次路由跳转的内容。每一次跳转都要用这个结构来表示。
for_nexthops(fi) {
if (nh->nh_flags&RTNH_F_DEAD)
continue;
if (!key->oif || key->oif == nh->nh_oif)
break;
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
if (nhsel < fi->fib_nhs) {
res->nh_sel = nhsel;
atomic_inc(&fi->fib_clntref);
return 0;
}
#else
if (nhsel < 1) {
atomic_inc(&fi->fib_clntref);
return 0;
}
#endif
这段代码循环检查路由中的跳转结构是否处于移除状态,即nh_flags是否设置了RTNH_F_DEAD标志。如果路由键值没有定义oif,即没有指定发送设备,或者指定的发送设备和目的跳转结构中的nh_oif相同(即同一个设备),就找到了目的跳转结构,此时跳出循环。
跳出循环后还要检查一下跳转的次数(宏定义的else分支)如果小于1,增加路由信息的引用计数,然后返回0。回到fn_hash_lookup函数中将执行下面的代码:
err = fib_semantic_match(f->fn_type, FIB_INFO(f), key, res);
if (err == 0) {
res->type = f->fn_type;
res->scope = f->fn_scope;
res->prefixlen = fz->fz_order;
goto out;
}
将信息记入到res中,前面在fib_semantic_match已经记录了路由信息。然后一层一层返回,最终将res->type作为返回值传给inet_bind函数。
我们接着探索下代码中出现的路由节点、路由信息以及跳转结构的初始化过程。
路由的设置及相关结构的初始化
路由的相关结构是在外力和通知链设置路由时初始化的。外力指的是用户使用工具或者路由命令设置。下面先分析net-tools通过ioctl系统调用设置路由的过程。
路由设置路线A
ioctl=>sys_ioctl=>sock_ioctl=>inet_ioctl=>ip_rt_ioctl
/*
 * Handle the SIOCADDRT/SIOCDELRT route ioctls issued by tools such as
 * net-tools' `route`: copy the legacy struct rtentry from user space,
 * convert it into a netlink-style request (fib_convert_rtentry) and
 * dispatch to the target table's tb_insert/tb_delete hook. Requires
 * CAP_NET_ADMIN; returns 0 or a negative errno.
 */
int ip_rt_ioctl(unsigned int cmd, void *arg)
{
int err;
struct kern_rta rta;
struct rtentry r;
struct {
struct nlmsghdr nlh;
struct rtmsg rtm;
} req;
switch (cmd) {
case SIOCADDRT: /* Add a route */
case SIOCDELRT: /* Delete a route */
if (!capable(CAP_NET_ADMIN))
return -EPERM;
if (copy_from_user(&r, arg, sizeof(struct rtentry)))
return -EFAULT;
rtnl_lock();
/* Translate the old-style rtentry into rtmsg + kern_rta. */
err = fib_convert_rtentry(cmd, &req.nlh, &req.rtm, &rta, &r);
if (err == 0) {
if (cmd == SIOCDELRT) {
/* Delete: the table must already exist. */
struct fib_table *tb = fib_get_table(req.rtm.rtm_table);
err = -ESRCH;
if (tb)
err = tb->tb_delete(tb, &req.rtm, &rta, &req.nlh, NULL);
} else {
/* Add: create the table on demand. */
struct fib_table *tb = fib_new_table(req.rtm.rtm_table);
err = -ENOBUFS;
if (tb)
err = tb->tb_insert(tb, &req.rtm, &rta, &req.nlh, NULL);
}
if (rta.rta_mx)
kfree(rta.rta_mx);
}
rtnl_unlock();
return err;
}
return -EINVAL;
}
代码中根据命令参数cmd分别调用指定路由函数表的tb_insert和tb_delete两个钩子函数。在初始化路由函数表的fib_hash_init函数中曾经对这两个钩子函数进行了设置。
tb->tb_insert = fn_hash_insert;
tb->tb_delete = fn_hash_delete;
增加路由必须进入fn_hash_insert函数,删除路由则必须进入fn_hash_delete函数,这是net-tools的设置路线,我们在后面阅读这两个函数。
路由设置路线B
第二条路由设置线路是指通知链的方式。前面曾经在ip_fib_init函数中介绍过内核注册了两个通知节点。
register_netdevice_notifier(&fib_netdev_notifier);
register_inetaddr_notifier(&fib_inetaddr_notifier);
这两个通知节点分别被插入到netdev_chain、inetaddr_chain;当网络设备安装或者初始化时,以及ip地址改变时都会触发通知链的处理函数。只不过这次操作的地址通知节点的处理函数是fib_inetaddr_event
/* Notifier block hung on inetaddr_chain: fib_inetaddr_event is invoked
 * when an IP address is added or removed; next/priority are zeroed. */
struct notifier_block fib_inetaddr_notifier = {
fib_inetaddr_event,
NULL,
0
};
从notifier_call字段的内容来看,内核在IP地址发生变换时会执行fib_inetaddr_event通知函数。
这两个通知链inetaddr_chain和netdev_chain分别调用了fib_inetaddr_event和fib_netdev_event函数,这两个函数都调用了fib_add_ifaddr来添加地址到路由中,只不过fib_inetaddr_event函数中增加了对fib_del_ifaddr函数的调用,从而使之可以删除路由地址。
无论是fib_add_ifaddr函数还是fib_del_ifaddr函数,它们都是调用fib_magic函数来实现路由地址的操作。
/* Prepare and feed intra-kernel routing request.
   Really, it should be netlink message, but :-( netlink
   can be not configured, so that we feed it directly
   to fib engine. It is legal, because all events occur
   only when netlink is already locked.

   Builds a synthetic netlink-style request for the address ifa and
   dispatches it to tb_insert (RTM_NEWROUTE) or tb_delete of the
   appropriate table: RT_TABLE_MAIN for RTN_UNICAST routes, otherwise
   RT_TABLE_LOCAL.
 */
static void fib_magic(int cmd, int type, u32 dst, int dst_len, struct in_ifaddr *ifa)
{
struct fib_table * tb;
struct {
struct nlmsghdr nlh;
struct rtmsg rtm;
} req;
struct kern_rta rta;
memset(&req.rtm, 0, sizeof(req.rtm));
memset(&rta, 0, sizeof(rta));
/* Unicast routes go to the main table, everything else to local. */
if (type == RTN_UNICAST)
tb = fib_new_table(RT_TABLE_MAIN);
else
tb = fib_new_table(RT_TABLE_LOCAL);
if (tb == NULL)
return;
/* Fill in the synthetic netlink header and route message. */
req.nlh.nlmsg_len = sizeof(req);
req.nlh.nlmsg_type = cmd;
req.nlh.nlmsg_flags = NLM_F_REQUEST|NLM_F_CREATE|NLM_F_APPEND;
req.nlh.nlmsg_pid = 0;
req.nlh.nlmsg_seq = 0;
req.rtm.rtm_dst_len = dst_len;
req.rtm.rtm_table = tb->tb_id;
req.rtm.rtm_protocol = RTPROT_KERNEL;
req.rtm.rtm_scope = (type != RTN_LOCAL ? RT_SCOPE_LINK : RT_SCOPE_HOST);
req.rtm.rtm_type = type;
rta.rta_dst = &dst;
rta.rta_prefsrc = &ifa->ifa_local;
rta.rta_oif = &ifa->ifa_dev->dev->ifindex;
/* Dispatch: fn_hash_insert / fn_hash_delete via the table hooks. */
if (cmd == RTM_NEWROUTE)
tb->tb_insert(tb, &req.rtm, &rta, &req.nlh, NULL);
else
tb->tb_delete(tb, &req.rtm, &rta, &req.nlh, NULL);
}
函数最后根据命令参数cmd的值来调用tb->tb_insert或者tb->tb_delete。通过fib_hash_init初始化函数可以知道增加路由调用fn_hash_insert,删除路由调用fn_hash_delete。这两个函数是内核配置路由的入口。
路由表和路由缓存的Linux内核实现
我们从内核中如何维护路由表和路由缓存的流程开始说明。
下图说明了Linux 内核中路由表如何更新、如何访问以及不同的路径,另外还会解释路由表和路由缓存之间的关系。
路由缓存实现概述
路由缓存是查找路由的最快的方法。FIB也提供了查找路由的方法,但查询时间比查询缓存长,如果对每个报文都运行一次FIB查询会影响路由性能;而路由缓存减少了路由信息的查找时间。
路由缓存是一个包含多个缓存表项的散列表。路由缓存散列表是一个rt_hash_bucket结构的数组。
/* The routing cache hash table, allocated and sized in ip_rt_init(). */
static struct rt_hash_bucket *rt_hash_table;
每个rt_hash_bucket结构包含链元素和读写锁。链元素指向由缓存表项(rtable结构)构成的链表。
未完待续。。。。。。