浅析Linux Kernel 哈希路由表实现(一)

最新推荐文章于 2023-12-02 13:52:44 发布

BlueSy2008

最新推荐文章于 2023-12-02 13:52:44 发布

阅读量1k

点赞数

分类专栏： Linux

Linux 专栏收录该内容

47 篇文章 0 订阅

订阅专栏

1. 路由表

目前Linux内核中支持两种路由表，一种是Hash路由表，另一种是Trie路由表，Trie算法查找效率很高，但它也因极其消毫内存资源而闻名，因此一般用在大型机上，否则在路由项过多时很可能造成内存的耗尽。在一般的机器上最好还是使用Hash路由表，之前有争论说Linux使用Hash路由表相比于二叉树效率太低，不适合商用，其实只要设计良好的哈希算法，尽量减少哈希碰撞，Hash路由表的查找效率也是很高的，在最好的情况下算法复杂算可以达到O(1)，当然，最差也不过是O(n)，我们有理由相信Linux中存在各种优秀的哈希算法，这些都是值得我们学习的。

Linux内核中IPv4路由表采用了分层的结构，第一层是是fib_table，表示路由表，最多可以支持256张路由表，用户可以根据策略路由的需求来创建自己的路由表，系统默认的两张路由表为RT_TABLE_LOCAL和RT_TABLE_MAIN。下面是系统保留的路由表标识符：

enum rt_class_t {
	RT_TABLE_UNSPEC=0,
/* User defined values */
	RT_TABLE_COMPAT=252,
	RT_TABLE_DEFAULT=253,
	RT_TABLE_MAIN=254,
	RT_TABLE_LOCAL=255,
	RT_TABLE_MAX=0xFFFFFFFF
};

下面是路由表的定义：

struct fib_table {
	struct hlist_node tb_hlist;
	/* 路由表的标识，即为上面提到的类型 */
	u32		tb_id;
	/* 该路由中默认路由条目在哈希路由表中的索引，
		在fib_table_select_default()函数中计算得出 */
	int		tb_default;
	/* Linux保留内存空间一惯的做法，这块空间留给了struct fn_hash */
	unsigned char	tb_data[0];
};

1.1 根据路由表ID来查找路由表。

struct fib_table *fib_get_table(struct net *net, u32 id)
{
	struct fib_table *tb;
	struct hlist_node *node;
	struct hlist_head *head;
	unsigned int h;
 
	if (id == 0)
		id = RT_TABLE_MAIN;
	h = id & (FIB_TABLE_HASHSZ - 1);
 
	rcu_read_lock();
	head = &net->ipv4.fib_table_hash[h];
	hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
		if (tb->tb_id == id) {
			rcu_read_unlock();
			return tb;
		}
	}
	rcu_read_unlock();
	return NULL;
}

当路由表标识为0时默认返回主路由表，这里的哈希算法也很简单，当支持多径路由时FIB_TABLE_HASHSZ定义为256，否则FIB_TABLE_HASHSZ定义为2。也就是说在不支持多径路由的情况下，fib_table_hash中只有两个元素。

1.2 路由表的创建。

 
struct fib_table *fib_new_table(struct net *net, u32 id)
{
	struct fib_table *tb;
	unsigned int h;
 
	if (id == 0)
		id = RT_TABLE_MAIN;
	tb = fib_get_table(net, id);
	if (tb)
		return tb;
 
	tb = fib_hash_table(id);
	if (!tb)
		return NULL;
	h = id & (FIB_TABLE_HASHSZ - 1);
	hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]);
	return tb;
}

在这个函数里面首先在已有的哈希表中根据给出的路由表标识查找已存在的路由表，若找到对应的路由表则直接返回，否则调用fib_hash_table()函数创建一个新的路由表，然后将其添加到fib_table_hash中去。接下来看一下fib_hash_table()这个函数：

struct fib_table *fib_hash_table(u32 id)
{
	struct fib_table *tb;
 
	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash),
		     GFP_KERNEL);
	if (tb == NULL)
		return NULL;
 
	tb->tb_id = id;
	tb->tb_default = -1;
 
	memset(tb->tb_data, 0, sizeof(struct fn_hash));
	return tb;
}

这个函数很简单，分配了一块内存空间，大小为sizeof(struct fib_table) + sizeof(struct fn_hash)，于是fib_table的最后一个字段tb_data便指向了这个struct fn_hash。接下来该说路由表的第二层了。

2. 路由域struct fn_zone

第二层是fn_zone，表示路由域，Linux根据路由掩码的长度将所有的路由分为32个域。我们先来看下位于fib_table末尾的struct fn_hash的定义：

struct fn_hash {
	struct fn_zone		*fn_zones[33];
	struct fn_zone __rcu	*fn_zone_list;
};

如上路由表分为32个路由域，这32个跌幅域又连成了链表，其中fn_zone_list这个字段指向链表的头。

 
struct fn_zone {
	struct fn_zone __rcu	*fz_next;	/* Next not empty zone	*/
	struct hlist_head __rcu	*fz_hash;	/* Hash table pointer	*/
	seqlock_t		fz_lock;
	u32			fz_hashmask;	/* (fz_divisor - 1)	*/
 
	u8			fz_order;	/* Zone order (0..32)	*/
	u8			fz_revorder;	/* 32 - fz_order	*/
	__be32			fz_mask;	/* inet_make_mask(order) */
#define FZ_MASK(fz)		((fz)->fz_mask)
 
	struct hlist_head	fz_embedded_hash[EMBEDDED_HASH_SIZE];
 
	int			fz_nent;	/* Number of entries	*/
	int			fz_divisor;	/* Hash size (mask+1)	*/
};

2.1 fn_zone的创建

 
static struct fn_zone *
fn_new_zone(struct fn_hash *table, int z)
{
	int i;
	struct fn_zone *fz = kzalloc(sizeof(struct fn_zone), GFP_KERNEL);
	if (!fz)
		return NULL;
 
	seqlock_init(&fz->fz_lock);
	fz->fz_divisor = z ? EMBEDDED_HASH_SIZE : 1;
	fz->fz_hashmask = fz->fz_divisor - 1;
	RCU_INIT_POINTER(fz->fz_hash, fz->fz_embedded_hash);
	fz->fz_order = z;
	fz->fz_revorder = 32 - z;
	fz->fz_mask = inet_make_mask(z);
 
	/* Find the first not empty zone with more specific mask */
	for (i = z + 1; i <= 32; i++)
		if (table->fn_zones[i])
			break;
	if (i > 32) {
		/* No more specific masks, we are the first. */
		rcu_assign_pointer(fz->fz_next,
				   rtnl_dereference(table->fn_zone_list));
		rcu_assign_pointer(table->fn_zone_list, fz);
	} else {
		rcu_assign_pointer(fz->fz_next,
				   rtnl_dereference(table->fn_zones[i]->fz_next));
		rcu_assign_pointer(table->fn_zones[i]->fz_next, fz);
	}
	table->fn_zones[z] = fz;
	fib_hash_genid++;
	return fz;
}

首先在内存中给struct fn_zone分配内存空间，然后按规则对一些基本的变量进行初始化，之后就是安排各个fn_zone在fn_zone_list的位置了，fn_zone_list中的节点都是按照fz_order从大到小排列的，所以首先一个for循环找出比当前fn_zone的fz_order大的最小fn_zone，如果存在，就将当前节点插到该节点之后;如果不存在，则表示当前节点是目前zn_order最大的节点，则它会成为链表的头。

3. 路由节点fib_node

第三层为路由节点fib_node，在同一个路由域中的路由项有相同的掩码长度，但它们的网络号可以不同，则根据不同路由的网络号将同一个域划分为不同的路由节点，如到地址10.10.65.0和10.10.64.0的路由便是属于同一个路由域里面两个不同路由节点的两条路由。在路由节点中又有根据路由的服务类型，路由类型，路由寻址范围以前路由状态的不同将同一个路由节点中的路由划分成不同的路由

struct fib_node {
	/* 哈希链表指针，针向同一个Hash槽中的相临节点 */
	struct hlist_node	fn_hash;
	/* 属于该节点的alias链表的头 */
	struct list_head	fn_alias;
	/* 该节点对应的网络key */
	__be32			fn_key;
	/* 预先分配了空间的别名，在fib_fast_alloc()中使用 */
	struct fib_alias        fn_embedded_alias;
};

3.1 fib_alias结构

struct fib_alias {
	/* 指向链表中的相邻节点 */
	struct list_head	fa_list;
	/* fa_info是最终的路由信息 */
	struct fib_info		*fa_info;
	/* 服务类型，对于一般服务取值为0 */
	u8			fa_tos;
	/* 路由类型 */
	u8			fa_type;
	/* 路由范围 */
	u8			fa_scope;
	/* 路由状态 */
	u8			fa_state;
	struct rcu_head		rcu;
};

路由类型和路由范围的对应关系如下结构：

 
static const struct
{
	int	error;
	u8	scope;
} fib_props[RTN_MAX + 1] = {
	[RTN_UNSPEC] = {
		.error	= 0,
		.scope	= RT_SCOPE_NOWHERE,
	},
	[RTN_UNICAST] = {
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_LOCAL] = {
		.error	= 0,
		.scope	= RT_SCOPE_HOST,
	},
	[RTN_BROADCAST] = {
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
	},
	[RTN_ANYCAST] = {
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
	},
	[RTN_MULTICAST] = {
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_BLACKHOLE] = {
		.error	= -EINVAL,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_UNREACHABLE] = {
		.error	= -EHOSTUNREACH,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_PROHIBIT] = {
		.error	= -EACCES,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_THROW] = {
		.error	= -EAGAIN,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_NAT] = {
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
	},
	[RTN_XRESOLVE] = {
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
	},
};

3.2 fib_info结构

fib_alias中的fib_info为最终的路由信息，来看一下它的定义:

 
struct fib_info {
	struct hlist_node	fib_hash;
	struct hlist_node	fib_lhash;
	struct net		*fib_net;
	int			fib_treeref;
	atomic_t		fib_clntref;
	int			fib_dead;
	unsigned		fib_flags;
	int			fib_protocol;
	__be32			fib_prefsrc;
	u32			fib_priority;
	u32			fib_metrics[RTAX_MAX];
#define fib_mtu fib_metrics[RTAX_MTU-1]
#define fib_window fib_metrics[RTAX_WINDOW-1]
#define fib_rtt fib_metrics[RTAX_RTT-1]
#define fib_advmss fib_metrics[RTAX_ADVMSS-1]
	/* 路由的跃点数，当支持多径路由时，fib_nhs为跃点数目，
	当不支持多径路由时，到达目前地址都只有一跳，则该值为1 */
	int			fib_nhs;
	struct rcu_head		rcu;
	/* 下一跳信息 */
	struct fib_nh		fib_nh[0];
#define fib_dev		fib_nh[0].nh_dev
};

3.3 fib_info的创建

struct fib_config中包含了创建一条路由条目所需要的信息。本文中不考虑路由多径的情况，即假设宏CONFIG_IP_ROUTE_MULTIPATH未定义。

struct fib_info *fib_create_info(struct fib_config *cfg)
{
	int err;
	struct fib_info *fi = NULL;
	struct fib_info *ofi;
	int nhs = 1;
	struct net *net = cfg->fc_nlinfo.nl_net;
 
	/* 检测该请求创建的路由信息范围与类型是否对应 */
	if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
		goto err_inval;
 
	/* 路由信息除依附于上述所提到的几种数据结构外，同时系统会维护
		两个全局的哈希链表，一个用于保存路由信息，另一个用于保存本地地址，
		fib_info_cnt全局变量表示路由表项的数目，fib_hash_size表示哈希表
		的大小，当路由表项数目大于路由哈希表大小时，为了防止哈希碰撞导致的
		查找效率降低，需要扩大哈希表大小为原来的两倍 */
	err = -ENOBUFS;
	if (fib_info_cnt >= fib_hash_size) {
		unsigned int new_size = fib_hash_size << 1;
		struct hlist_head *new_info_hash;
		struct hlist_head *new_laddrhash;
		unsigned int bytes;
 
		if (!new_size)
			new_size = 1;
		/* 按新的哈希表尺寸为哈希表分配内存空间 */
		bytes = new_size * sizeof(struct hlist_head *);
		new_info_hash = fib_hash_alloc(bytes);
		new_laddrhash = fib_hash_alloc(bytes);
		if (!new_info_hash || !new_laddrhash) {
			fib_hash_free(new_info_hash, bytes);
			fib_hash_free(new_laddrhash, bytes);
		} else
			/* 如果内存空间分配成功，则将旧的列表中的内容移动到新的链表中，
			 	并释放旧列表的内存空间 */
			fib_hash_move(new_info_hash, new_laddrhash, new_size);
 
		/* 列表大小溢出，则出错返回 */
		if (!fib_hash_size)
			goto failure;
	}
 
	/* 为路由表项分配内存空间，大小为struct fib_info的大小加上nhs个下一跳信息
	struct fib_nh的大小，不支持多径路由的路由时nhs始终为1 */
	fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
	if (fi == NULL)
		goto failure;
	fib_info_cnt++;
 
	fi->fib_net = hold_net(net);
	fi->fib_protocol = cfg->fc_protocol;
	fi->fib_flags = cfg->fc_flags;
	fi->fib_priority = cfg->fc_priority;
	fi->fib_prefsrc = cfg->fc_prefsrc;
 
	fi->fib_nhs = nhs;
	/* 遍历fib_nh信息，此处仅执行一次，设置fib_nh的nh_parent */
	change_nexthops(fi) {
		nexthop_nh->nh_parent = fi;
	} endfor_nexthops(fi)
 
	/* 如果给出了路由属性信信，则通过遍历路由属性信息来确定fib_metrics的值 */
	if (cfg->fc_mx) {
		struct nlattr *nla;
		int remaining;
 
		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
			int type = nla_type(nla);
 
			if (type) {
				if (type > RTAX_MAX)
					goto err_inval;
				fi->fib_metrics[type - 1] = nla_get_u32(nla);
			}
		}
	}
 
	struct fib_nh *nh = fi->fib_nh;
 
	nh->nh_oif = cfg->fc_oif;
	nh->nh_gw = cfg->fc_gw;
	nh->nh_flags = cfg->fc_flags;
 
	if (fib_props[cfg->fc_type].error) {
		if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
			goto err_inval;
		goto link_it;
	}
 
	if (cfg->fc_scope > RT_SCOPE_HOST)
		goto err_inval;
 
	if (cfg->fc_scope == RT_SCOPE_HOST) {
		struct fib_nh *nh = fi->fib_nh;
		/* 当前添加的是本地路由信息，只可能有一跳，即便是开启了
			多径路由，下一跳数目不为1则报错，同时本地路由也不需要
			指定网关，如果指定则报错 */
 
		if (nhs != 1 || nh->nh_gw)
			goto err_inval;
		nh->nh_scope = RT_SCOPE_NOWHERE;
		nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif);
		err = -ENODEV;
		if (nh->nh_dev == NULL)
			goto failure;
	} else {
		/* 如果添加的不是本地路由信息，则检查下一跳信息 */
		change_nexthops(fi) {
			err = fib_check_nh(cfg, fi, nexthop_nh);
			if (err != 0)
				goto failure;
		} endfor_nexthops(fi)
	}
 
	if (fi->fib_prefsrc) {
		if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
		    fi->fib_prefsrc != cfg->fc_dst)
			if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL)
				goto err_inval;
	}
 
link_it:
	/* 查找路由条目，返回与当前路由条目精确匹配的条目，
		若存在，则释放当前创建的新条目，增加已找到的路由条目
		的引用计数，并返回已找到的旧路由条目 */
	ofi = fib_find_info(fi);
	if (ofi) {
		fi->fib_dead = 1;
		free_fib_info(fi);
		ofi->fib_treeref++;
		return ofi;
	}
 
	/* 当前路由表中未找到已存在的符合要求的路由条目, 则增加
		新建路由条目的引用计数 */
	fi->fib_treeref++;
	atomic_inc(&fi->fib_clntref);
	spin_lock_bh(&fib_info_lock);
	/* 将新建的路由插入到全局路由列表中，其中fib_info_hashfh
		为散列函数 */
	hlist_add_head(&fi->fib_hash,
		       &fib_info_hash[fib_info_hashfn(fi)]);
 
	/* 如果指定了源地址，则将源地址插入到全局本地地址列表中 */
	if (fi->fib_prefsrc) {
		struct hlist_head *head;
		head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
		hlist_add_head(&fi->fib_lhash, head);
	}
	/* 将下一跳信息写入全局列表中，由上述知本迭代只进行一次，
		散列函数为fib_devindex_hashfn() */
	change_nexthops(fi) {
		struct hlist_head *head;
		unsigned int hash;
 
		if (!nexthop_nh->nh_dev)
			continue;
		hash = fib_devindex_hashfn(nexthop_nh->nh_dev->ifindex);
		head = &fib_info_devhash[hash];
		hlist_add_head(&nexthop_nh->nh_hash, head);
	} endfor_nexthops(fi)
	spin_unlock_bh(&fib_info_lock);
	return fi;
 
err_inval:
	err = -EINVAL;
 
failure:
	if (fi) {
		fi->fib_dead = 1;
		free_fib_info(fi);
	}
 
	return ERR_PTR(err);
}

3.4 向路由表中插入路由信息。

 
int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
{
	struct fn_hash *table = (struct fn_hash *) tb->tb_data;
	struct fib_node *new_f = NULL;
	struct fib_node *f;
	struct fib_alias *fa, *new_fa;
	struct fn_zone *fz;
	struct fib_info *fi;
	u8 tos = cfg->fc_tos;
	__be32 key;
	int err;
 
	if (cfg->fc_dst_len > 32)
		return -EINVAL;
 
	/* 根据目的地址长度找出对应的路由域 */
	fz = table->fn_zones[cfg->fc_dst_len];
	/* 如果路由域不存在，则调用fn_new_zone()函数创建一个新的路由域 */
	if (!fz && !(fz = fn_new_zone(table, cfg->fc_dst_len)))
		return -ENOBUFS;
 
	key = 0;
 
	/* 如果指定了目的地址，如果目的地址主机位不为0，则出错返回 */
	if (cfg->fc_dst) {
		if (cfg->fc_dst & ~FZ_MASK(fz))
			return -EINVAL;
		key = fz_key(cfg->fc_dst, fz);
	}
 
	/* 创建一个新的fib_info对象 */
	fi = fib_create_info(cfg);
	if (IS_ERR(fi))
		return PTR_ERR(fi);
 
	/* 如果当前路由域中路由节点的数目大于散列表大小的两倍，
		并且相关数据都合法的情况下，需要重构散列表以减小
		哈希碰撞 */
	if (fz->fz_nent > (fz->fz_divisor<<1) &&
	    fz->fz_divisor < FZ_MAX_DIVISOR &&
	    (cfg->fc_dst_len == 32 ||
	     (1 << cfg->fc_dst_len) > fz->fz_divisor))
		fn_rehash_zone(fz);
 
	/* 通过网络号key找出对应的路由节点fn_node */
	f = fib_find_node(fz, key);
 
	if (!f)
		fa = NULL;
	else
		fa = fib_find_alias(&f->fn_alias, tos, fi->fib_priority);
 
	if (fa && fa->fa_tos == tos &&
	    fa->fa_info->fib_priority == fi->fib_priority) {
		struct fib_alias *fa_first, *fa_match;
 
		err = -EEXIST;
		/* 如果具有与新建路由项相同属性的fib_alias存在，并且添加路由项标志中
			设置了NLM_F_EXCL(排它选项)，则返回路由已存在 */
		if (cfg->fc_nlflags & NLM_F_EXCL)
			goto out;
 
		/* We have 2 goals:
		 * 1. Find exact match for type, scope, fib_info to avoid
		 * duplicate routes
		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
		 */
		fa_match = NULL;
		fa_first = fa;
		fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
		list_for_each_entry_continue(fa, &f->fn_alias, fa_list) {
			if (fa->fa_tos != tos)
				break;
			if (fa->fa_info->fib_priority != fi->fib_priority)
				break;
			if (fa->fa_type == cfg->fc_type &&
			    fa->fa_scope == cfg->fc_scope &&
			    fa->fa_info == fi) {
				fa_match = fa;
				break;
			}
		}
 
		if (cfg->fc_nlflags & NLM_F_REPLACE) {
			u8 state;
 
			/* 如果存在一条精确匹配的路由项fib_alias，并且在设置了NLM_F_REPLACE
				标志的情况下，不做处理直接返回 */
			fa = fa_first;
			if (fa_match) {
				if (fa == fa_match)
					err = 0;
				goto out;
			}
 
			/* 并没有精确匹配的路由项fib_alias，即便有匹配的fib_alias，也是
				仅tos和priority两个选项匹配，因此需要新建一个路由别名fib_alias */
			err = -ENOBUFS;
			new_fa = fib_fast_alloc(f);
			if (new_fa == NULL)
				goto out;
 
			new_fa->fa_tos = fa->fa_tos;
			new_fa->fa_info = fi;
			new_fa->fa_type = cfg->fc_type;
			new_fa->fa_scope = cfg->fc_scope;
			state = fa->fa_state;
			new_fa->fa_state = state & ~FA_S_ACCESSED;
			fib_hash_genid++;
			/* 因为设置了NLM_F_REPLACE选项，所以用新fib_alias对象替换掉
				列表中旧的fib_alias对象，并释放旧对象的内存 */
			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
 
			fn_free_alias(fa, f);
			if (state & FA_S_ACCESSED)
				rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
			/* 这里留做以后再讨论 */
			rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len,
				  tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
			return 0;
		}
 
		/* Error if we find a perfect match which
		 * uses the same scope, type, and nexthop
		 * information.
		 */
		if (fa_match)
			goto out;
 
		if (!(cfg->fc_nlflags & NLM_F_APPEND))
			fa = fa_first;
	}
 
	/* 对应的fib_alias对象并不存在，如果没有设置NLM_F_CREATE则返回出错 */
	err = -ENOENT;
	if (!(cfg->fc_nlflags & NLM_F_CREATE))
		goto out;
 
	err = -ENOBUFS;
 
	/* 新建一个路由节点项fib_node并初始化 */
	if (!f) {
		new_f = kmem_cache_zalloc(fn_hash_kmem, GFP_KERNEL);
		if (new_f == NULL)
			goto out;
 
		INIT_HLIST_NODE(&new_f->fn_hash);
		INIT_LIST_HEAD(&new_f->fn_alias);
		new_f->fn_key = key;
		f = new_f;
	}
 
	/* 新建一个路由别名项fib_alias并初始化 */
	new_fa = fib_fast_alloc(f);
	if (new_fa == NULL)
		goto out;
 
	new_fa->fa_info = fi;
	new_fa->fa_tos = tos;
	new_fa->fa_type = cfg->fc_type;
	new_fa->fa_scope = cfg->fc_scope;
	new_fa->fa_state = 0;
 
	/* 将路由信息项，路由别名项，路由节点项按规则组织起来
		后刷新路由表 */
	if (new_f)
		fib_insert_node(fz, new_f);
	list_add_tail_rcu(&new_fa->fa_list,
		 (fa ? &fa->fa_list : &f->fn_alias));
	fib_hash_genid++;
 
	if (new_f)
		fz->fz_nent++;
	rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
 
	rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len, tb->tb_id,
		  &cfg->fc_nlinfo, 0);
	return 0;
 
out:
	if (new_f)
		kmem_cache_free(fn_hash_kmem, new_f);
	fib_release_info(fi);
	return err;
}

3.5 查询路由信息

这个这个函数先在RT_TABLE_LOCAL表中查询路由信息，成功则返回，如果失败则再从RT_TABLE_MAIN表中查询路由信息，如果失败则返回网络不可达，成功则返回0。

static inline int fib_lookup(struct net *net, const struct flowi *flp,
			     struct fib_result *res)
{
	struct fib_table *table;
 
	table = fib_get_table(net, RT_TABLE_LOCAL);
	if (!fib_table_lookup(table, flp, res, FIB_LOOKUP_NOREF))
		return 0;
 
	table = fib_get_table(net, RT_TABLE_MAIN);
	if (!fib_table_lookup(table, flp, res, FIB_LOOKUP_NOREF))
		return 0;
	return -ENETUNREACH;
}

接下来再看fib_table_lookup()函数的定义：

int fib_table_lookup(struct fib_table *tb,
		     const struct flowi *flp, struct fib_result *res,
		     int fib_flags)
{
	int err;
	struct fn_zone *fz;
	struct fn_hash *t = (struct fn_hash *)tb->tb_data;
 
	rcu_read_lock();
	for (fz = rcu_dereference(t->fn_zone_list);
	     fz != NULL;
	     fz = rcu_dereference(fz->fz_next)) {
		struct hlist_head *head;
		struct hlist_node *node;
		struct fib_node *f;
		__be32 k;
		unsigned int seq;
 
		do {
			seq = read_seqbegin(&fz->fz_lock);
			k = fz_key(flp->fl4_dst, fz);
 
			head = rcu_dereference(fz->fz_hash) + fn_hash(k, fz);
			hlist_for_each_entry_rcu(f, node, head, fn_hash) {
				if (f->fn_key != k)
					continue;
 
				err = fib_semantic_match(&f->fn_alias,
						 flp, res,
						 fz->fz_order, fib_flags);
				if (err <= 0)
					goto out;
			}
		} while (read_seqretry(&fz->fz_lock, seq));
	}
	err = 1;
out:
	rcu_read_unlock();
	return err;
}

该函数从fn_zone_list开始遍历fn_zone链表，上面创建fn_zone的过程中我们提到过，fn_zone在fn_zone_list中是按掩码长度从大到小排列的，即该搜索先匹配掩码最大的zone，若不匹配则转而去匹配下一个掩码稍小的路由域，其中fn_key是匹配的关键，如果key不匹配则该路由项肯定不匹配，key匹配之后还要再调用fib_semantic_match()函数再做进一步的检查，并在该函数中利用查询到的路由信息给查询结果对象fib_result初始化。当没有一个掩码长度不为0的zone中有路由项与其匹配时，函数最后一次循环会检测掩码长度为0的zone，该域便为默认路由域，在这里fn_key与k的值均为0，它们始终匹配，也就是说如果检查不到匹配的路由项，则交由默认路由来处理（不知道这样表达合不合理，大体就是这个意思），接下来再来检验证TOS，SCOPE等信息。接下来看fib_semantic_match()函数：

 
int fib_semantic_match(struct list_head *head, const struct flowi *flp,
		       struct fib_result *res, int prefixlen, int fib_flags)
{
	struct fib_alias *fa;
	int nh_sel = 0;
 
	list_for_each_entry_rcu(fa, head, fa_list) {
		int err;
 
		if (fa->fa_tos &&
		    fa->fa_tos != flp->fl4_tos)
			continue;
 
		if (fa->fa_scope < flp->fl4_scope)
			continue;
 
		fib_alias_accessed(fa);
 
		err = fib_props[fa->fa_type].error;
		if (err == 0) {
			struct fib_info *fi = fa->fa_info;
 
			if (fi->fib_flags & RTNH_F_DEAD)
				continue;
 
			switch (fa->fa_type) {
			case RTN_UNICAST:
			case RTN_LOCAL:
			case RTN_BROADCAST:
			case RTN_ANYCAST:
			case RTN_MULTICAST:
				for_nexthops(fi) {
					if (nh->nh_flags & RTNH_F_DEAD)
						continue;
					if (!flp->oif || flp->oif == nh->nh_oif)
						break;
				}
				goto out_fill_res;
				endfor_nexthops(fi);
				continue;
 
			default:
				pr_warning("fib_semantic_match bad type %#x\n",
					   fa->fa_type);
				return -EINVAL;
			}
		}
		return err;
	}
	return 1;
 
out_fill_res:
	res->prefixlen = prefixlen;
	res->nh_sel = nh_sel;
	res->type = fa->fa_type;
	res->scope = fa->fa_scope;
	res->fi = fa->fa_info;
	if (!(fib_flags & FIB_LOOKUP_NOREF))
		atomic_inc(&res->fi->fib_clntref);
	return 0;
}

该函数返回1时表示没有匹配的路由项，返回0时表示匹配成功，返回负值时表示管理失败，即fa->fa_type类型有问题，可查询fib_props数组来确定对应的fa_type是否有错误，如果类型为RTN_BLACKHOLE,RTN_UNREACHABLE,RTN_PROHIBIT,RTN_THROW,RTN_NAT,RTN_XRESOLVE时表示存在对应的错误。

函数先检验TOS，如果搜索健值中指定了TOS，则必须对TOS进行严格匹配，否则当前路由项即不符合要求。如果未指定TOS，则可以匹配任意TOS。

接下来检验scope，路由项的范围必须比请求的范围更”窄“才能符合要求，如果请求查找的scope为RTN_UNIVERSE，则RTN_LINK即不匹配;类似的，如果请求查找的是RTN_HOST，则RTN_LINK，RTN_UNIVERSE都是可以匹配的。

scope检查完成之后即开始检查路由类型，即刚才提到的fa->fa_type，不再说了。如果类型没有问题则接下来检查下一跳信息，首先得确保下一跳信息不是待删除的，其次如果指定了出口设备，则出口设备要和下一跳信息中的出口设备匹配。如果到这一步全都匹配，则函数跳到out_fill_res，填充fib_result，之后返回0，表示成功。

转自：http://basiccoder.com/intro-linux-kernel-hash-rt-1.html