IP地址为32位,当报文在网络上传输时,要在路由表中进行查找。IP地址的对应位如果能与路由表中保存的进行匹配,这些报文就会发送到路由指向的地址。如下面是一个路由表:
Entry
--------
0 0000
1 0001
2 00101
3 010
4 0110
5 0111
6 100
7 101000
8 101001
9 10101
10 10110
11 10111
12 110
13 11101000
14 11101001
目标地址和路由表中的各项比较,如果有多个都能匹配,选择最长的那个,如果没有一个能匹配,使用默认路由。
在内核中,这个路由表由trie来表示:
左孩子表示0,右孩子表示1。这样比如010就会到达节点3的位置。
从图中看,有很多空的节点,并且树的高度太大,为提高效率,可使用路径压缩,这样就变成如下的形状:
每一个空节点都被移除了,在每个节点新增一个变量skip,保存被移除的空节点的个数。这样11101001从根节点开始,先到右边一个节点,再到右边一个节点,再到右边一个节点,这时要跳过4位,最后一位是1,到右边的节点,就到达14的位置。
再进行优化,对trie的层级进行压缩,就成为LC-tries,最后变成下面的形状:
对于一个节点,如果它的左右两个子节点都存在,就用这两个子节点代替这个节点;如果该节点的两个子节点都是叶子节点,则停止替换。这个替换可以在子树中重复进行。
linux下的实现:
构成树的节点的数据结构:
[ net/ipv4/fib_trie.c ]
/*
 * Generic node header. struct leaf and struct tnode both begin with these
 * same two fields, so a pointer to either one can be handled as a
 * struct rt_trie_node.
 */
struct rt_trie_node {
unsigned long parent; // parent link, kept as an integer word; accessed via node_parent()/node_set_parent()
t_key key; // address key of this node
};
/* Leaf node (a route destination): one key plus the route info attached to it. */
struct leaf {
unsigned long parent; // parent link (same leading layout as struct rt_trie_node)
t_key key; // address key
struct hlist_head list; // head of the list of struct leaf_info entries for this key
struct rcu_head rcu; // NOTE(review): presumably used for RCU-deferred freeing — confirm in free_leaf()
};
/* Route information for one prefix length, linked into a leaf. */
struct leaf_info {
struct hlist_node hlist; // linkage into leaf->list
int plen; // prefix length of the destination address
u32 mask_plen; // mask of the significant address bits, derived from plen (computation not shown here)
struct list_head falh; // list of routes (fib_alias entries)
struct rcu_head rcu; // NOTE(review): presumably used for RCU-deferred freeing — confirm in free_leaf_info()
};
/*
 * Internal node. The child slot for a key is selected with
 * tkey_extract_bits(key, pos, bits), i.e. by the 'bits' key bits that
 * start at bit position 'pos'.
 */
struct tnode {
unsigned long parent; // parent link (same leading layout as struct rt_trie_node)
t_key key; // address key
unsigned char pos; /* 2log(KEYLENGTH) bits needed; position of the first key bit this node indexes on */
unsigned char bits; /* 2log(KEYLENGTH) bits needed; number of key bits this node indexes on */
unsigned int full_children; /* KEYLENGTH bits needed; NOTE(review): presumably a count of "full" children — maintained outside this excerpt */
unsigned int empty_children; /* KEYLENGTH bits needed; NOTE(review): presumably a count of empty child slots — maintained outside this excerpt */
union {
struct rcu_head rcu;
struct tnode *tnode_free; // NOTE(review): presumably links nodes queued by tnode_free_safe() until tnode_free_flush() — confirm
};
struct rt_trie_node __rcu *child[0]; // child slots; each is an internal node, a leaf, or NULL
};
插入节点,如何生成树:
[ net/ipv4/fib_trie.c ]
/* only used from updater-side
 *
 * Insert the prefix key/plen into the trie and return the (initially
 * empty) route list of the leaf_info created for it.
 *
 * t    - the trie (routing table) to insert into
 * key  - destination address
 * plen - prefix length of the destination address
 *
 * Returns a pointer to the new leaf_info's falh list head, or NULL when
 * allocating the leaf, the leaf_info or the new tnode fails.
 */
static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
{
int pos, newpos;
struct tnode *tp = NULL, *tn = NULL;
struct rt_trie_node *n;
struct leaf *l;
int missbit;
struct list_head *fa_head = NULL;
struct leaf_info *li;
t_key cindex;
pos = 0; // bit position compared so far; start at bit 0
n = rtnl_dereference(t->trie); // start the walk at the root
/* If we point to NULL, stop. Either the tree is empty and we should
 * just put a new leaf in it, or we have reached an empty child slot,
 * and we should just put our new leaf in that.
 * If we point to a T_TNODE, check if it matches our key. Note that
 * a T_TNODE might be skipping any number of bits - its 'pos' need
 * not be the parent's 'pos'+'bits'!
 *
 * If it does match the current key, get pos/bits from it, extract
 * the index from our key, push the T_TNODE and walk the tree.
 *
 * If it doesn't, we have to replace it with a new T_TNODE.
 *
 * If we point to a T_LEAF, it might or might not have the same key
 * as we do. If it does, just change the value, update the T_LEAF's
 * value, and return it.
 * If it doesn't, we need to replace it with a T_TNODE.
 */
while (n != NULL && NODE_TYPE(n) == T_TNODE) {
tn = (struct tnode *) n;
check_tnode(tn); // sanity check on tn (helper defined elsewhere; presumably validates pos/bits against the key length)
if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { // the (tn->pos - pos) bits starting at pos (the bits tn skips) are equal in both keys
tp = tn; // tp trails the walk as the parent of n
pos = tn->pos + tn->bits; // continue comparing after the bits tn indexes on; skipped bits are passed over here
/* The tn->bits bits of key starting at tn->pos form the index
 * into tn's child array.
 */
n = tnode_get_child(tn,
tkey_extract_bits(key,
tn->pos,
tn->bits));
BUG_ON(n && node_parent(n) != tn);
} else
break;
}
/*
 * n ----> NULL, LEAF or TNODE
 *
 * tp is n's (parent) ----> NULL or TNODE
 */
BUG_ON(tp && IS_LEAF(tp));
/* Case 1: n is a leaf. Compare prefixes */
if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) { // n is a leaf holding exactly this key
l = (struct leaf *) n;
li = leaf_info_new(plen); // allocate a leaf_info for this prefix length
if (!li)
return NULL;
fa_head = &li->falh; // this route list is still empty
insert_leaf_info(&l->list, li); // attach the leaf_info to the existing leaf
goto done; // no structural change, so no rebalancing
}
l = leaf_new(); // no leaf with this key yet: allocate one
if (!l)
return NULL;
l->key = key; // set the new leaf's key
li = leaf_info_new(plen); // allocate a leaf_info for this prefix length
if (!li) {
free_leaf(l);
return NULL;
}
fa_head = &li->falh; // this route list is still empty
insert_leaf_info(&l->list, li); // attach the leaf_info to the new leaf's list
if (t->trie && n == NULL) { // non-empty trie and the walk ended on an empty slot, so tp is non-NULL here
/* Case 2: n is NULL, and will just insert a new leaf */
node_set_parent((struct rt_trie_node *)l, tp); // the new leaf's parent is tp
cindex = tkey_extract_bits(key, tp->pos, tp->bits); // index of the slot in tp's child array selected by key
put_child(tp, cindex, (struct rt_trie_node *)l); // store the leaf in that slot
} else {
/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
/*
 * Add a new tnode here
 * first tnode need some special handling
 */
if (n) {
pos = tp ? tp->pos+tp->bits : 0; // start comparing right after the parent's index bits (or at bit 0 without a parent)
/* The keys differ here, while all bits before pos were found
 * equal during the walk.
 */
newpos = tkey_mismatch(key, pos, n->key); // position of the first differing bit (helper defined elsewhere)
tn = tnode_new(n->key, newpos, 1); // new 1-bit tnode, i.e. two child slots
} else {
newpos = 0;
tn = tnode_new(key, newpos, 1); /* First tnode, with two child slots */
}
if (!tn) {
free_leaf_info(li);
free_leaf(l);
return NULL;
}
node_set_parent((struct rt_trie_node *)tn, tp); // the new tnode's parent is tp
/* The new tnode goes under tp, and both l and n become its
 * children.
 */
missbit = tkey_extract_bits(key, newpos, 1); // value (0 or 1) of key's bit at newpos
put_child(tn, missbit, (struct rt_trie_node *)l); // the new leaf goes in the slot matching its bit
put_child(tn, 1-missbit, n); // the old node (possibly NULL) goes in the other slot
if (tp) {
cindex = tkey_extract_bits(key, tp->pos, tp->bits); // index of the slot in tp's child array
put_child(tp, cindex, (struct rt_trie_node *)tn); // put tn into tp's child array in that slot
} else {
rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); // no parent: tn becomes the root
tp = tn;
}
}
if (tp && tp->pos + tp->bits > 32)
pr_warn("fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
tp, tp->pos, tp->bits, key, plen);
/* Rebalance the trie */
trie_rebalance(t, tp); // rebalance starting from the tnode that received the insertion
done:
return fa_head;
}
对树进行平衡
[ net/ipv4/fib_trie.c ]
/* Rebalance the trie, walking from tn up to the root.
 *
 * t  - the trie being updated
 * tn - tnode to start from; fib_insert_node() passes the parent of the
 *      newly inserted node
 *
 * Each tnode on the path is passed through resize() and re-stored in
 * its parent's slot; the topmost result is published as t->trie.
 *
 * NOTE(review): tn->key is read before the loop tests tn != NULL, so the
 * caller must pass a non-NULL tn.
 */
static void trie_rebalance(struct trie *t, struct tnode *tn)
{
int wasfull;
t_key cindex, key;
struct tnode *tp;
key = tn->key; // key of the starting node; reused to index every ancestor
while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) { // loop while tn has a parent
cindex = tkey_extract_bits(key, tp->pos, tp->bits); // slot index in tp selected by key (tp->bits bits starting at tp->pos)
wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); // tnode_full() result for the child currently in that slot, taken before resizing (helper defined elsewhere)
tn = (struct tnode *)resize(t, tn);
tnode_put_child_reorg(tp, cindex,
(struct rt_trie_node *)tn, wasfull);
tp = node_parent((struct rt_trie_node *) tn);
if (!tp)
rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
tnode_free_flush();
if (!tp)
break;
tn = tp; // move one level up
}
/* Handle last (top) tnode */
if (IS_TNODE(tn))
tn = (struct tnode *)resize(t, tn);
rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
tnode_free_flush();
}
扩张子树
[ net/ipv4/fib_trie.c ]
/* Inflate the subtree rooted at tn.
 *
 * Builds a replacement tnode that indexes on one more bit than tn
 * (oldtnode->bits + 1) and redistributes tn's children into it.
 *
 * t  - the trie, passed through to resize()
 * tn - the tnode to inflate
 *
 * Returns the new tnode on success, or ERR_PTR(-ENOMEM) if an
 * allocation fails.
 */
static struct tnode *inflate(struct trie *t, struct tnode *tn)
{
struct tnode *oldtnode = tn; // the node being replaced
int olen = tnode_child_length(tn); // number of child slots in the old node
int i;
pr_debug("In inflate\n");
tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1); // replacement node with one more index bit (twice the child slots)
if (!tn)
return ERR_PTR(-ENOMEM);
/*
 * Preallocate and store tnodes before the actual work so we
 * don't get into an inconsistent state if memory allocation
 * fails. In case of failure the inflate of tnode is abandoned:
 * this function returns ERR_PTR(-ENOMEM) (see the nomem label)
 * and oldtnode has not been modified at that point.
 */
for (i = 0; i < olen; i++) {
struct tnode *inode;
inode = (struct tnode *) tnode_get_child(oldtnode, i); // i-th child of the old node
/* Preallocate only for a child that:
 *  - is not NULL,
 *  - is an internal node,
 *  - skips no bits (its pos directly follows the old node's
 *    index bits),
 *  - has more than two child slots (bits > 1).
 */
if (inode &&
IS_TNODE(inode) &&
inode->pos == oldtnode->pos + oldtnode->bits &&
inode->bits > 1) {
struct tnode *left, *right;
t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
/* left: inode's key with the bit at inode->pos cleared,
 * position moved one bit further down, one index bit
 * fewer (half the child slots).
 */
left = tnode_new(inode->key&(~m), inode->pos + 1,
inode->bits - 1);
if (!left)
goto nomem;
/* right: inode's key with the bit at inode->pos set,
 * position moved one bit further down, one index bit
 * fewer (half the child slots).
 */
right = tnode_new(inode->key|m, inode->pos + 1,
inode->bits - 1);
if (!right) {
tnode_free(left);
goto nomem;
}
// Park both halves in the new node until the second pass fills them.
put_child(tn, 2*i, (struct rt_trie_node *) left);
put_child(tn, 2*i+1, (struct rt_trie_node *) right);
}
}
for (i = 0; i < olen; i++) {
struct tnode *inode;
struct rt_trie_node *node = tnode_get_child(oldtnode, i); // i-th child of the old node
struct tnode *left, *right;
int size, j;
/* An empty child */
if (node == NULL)
continue;
/* A leaf or an internal node with skipped bits */
if (IS_LEAF(node) || ((struct tnode *) node)->pos >
tn->pos + tn->bits - 1) {
/* Re-index the child in the new node using one more key
 * bit than before (oldtnode->bits + 1 bits starting at
 * oldtnode->pos).
 */
put_child(tn,
tkey_extract_bits(node->key, oldtnode->pos, oldtnode->bits + 1),
node);
continue;
}
/* An internal node with two children */
inode = (struct tnode *) node;
if (inode->bits == 1) { // exactly two child slots: move both straight into the new node
put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));
tnode_free_safe(inode); // inode is no longer referenced by the new node; release it
continue;
}
/* An internal node with more than two children */
/* We will replace this node 'inode' with two new
 * ones, 'left' and 'right', each with half of the
 * original children. The two new nodes will have
 * a position one bit further down the key and this
 * means that the "significant" part of their keys
 * (see the discussion near the top of this file)
 * will differ by one bit, which will be "0" in
 * left's key and "1" in right's key. Since we are
 * moving the key position by one step, the bit that
 * we are moving away from - the bit at position
 * (inode->pos) - is the one that will differ between
 * left and right. So... we synthesize that bit in the
 * two new keys.
 * The mask 'm' below will be a single "one" bit at
 * the position (inode->pos)
 */
/* Use the old key, but set the new significant
 * bit to zero.
 */
left = (struct tnode *) tnode_get_child(tn, 2*i); // fetch the preallocated left half
put_child(tn, 2*i, NULL); // clear its slot while it is being filled
BUG_ON(!left);
right = (struct tnode *) tnode_get_child(tn, 2*i+1); // fetch the preallocated right half
put_child(tn, 2*i+1, NULL); // clear its slot while it is being filled
BUG_ON(!right);
size = tnode_child_length(left); // number of child slots in left
/* Split inode's children: the first 'size' go to left, the
 * next 'size' go to right.
 */
for (j = 0; j < size; j++) {
put_child(left, j, rtnl_dereference(inode->child[j]));
put_child(right, j, rtnl_dereference(inode->child[j + size]));
}
put_child(tn, 2*i, resize(t, left)); // resize left and store the result in the even slot
put_child(tn, 2*i+1, resize(t, right)); // resize right and store the result in the odd slot
tnode_free_safe(inode);
}
tnode_free_safe(oldtnode);
return tn;
nomem:
tnode_clean_free(tn); // NOTE(review): presumably frees tn together with the preallocated children parked in it — confirm
return ERR_PTR(-ENOMEM);
}