用户层应用ip命令删除IPv6路由:
# ip -6 -d route
unicast 3001::/64 dev ens33 proto kernel scope global metric 256 pref medium
unicast 3ffe::/64 via 3001::10 dev ens33 proto boot scope global metric 1024 pref medium
#
# ip -6 route del 3ffe::0/64 via 3001::10
内核函数inet6_rtm_delroute处理以上IP命令的路由删除。
/*
 * inet6_rtm_delroute - handle a netlink request to delete an IPv6 route.
 * @skb:    request buffer; its socket supplies the network namespace
 * @nlh:    netlink message header of the request
 * @extack: extended ack used to report a human-readable error
 *
 * Parses the request into a fib6_config and dispatches to the multipath
 * or the single-route delete path.
 *
 * Returns the (negative) error from rtm_to_fib6_config(), -EINVAL when a
 * nexthop id is given that nexthop_find_by_id() cannot find, otherwise
 * whatever the selected delete helper returns.
 */
static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
	if (err < 0)
		return err;

	/* A nexthop id was supplied, so it must refer to an existing nexthop. */
	if (cfg.fc_nh_id &&
	    !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id)) {
		NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
		return -EINVAL;
	}

	if (cfg.fc_mp)
		return ip6_route_multipath_del(&cfg, extack);
	else {
		/* Non-multipath request: ask for all sibling nexthops to go. */
		cfg.fc_delete_all_nh = 1;
		return ip6_route_del(&cfg, extack);
	}
}
函数rtm_to_fib6_config负责解析IP命令参数,例如,以下解析nexthop属性ID,下一跳网关等参数。
/*
 * rtm_to_fib6_config - fill a fib6_config from netlink route attributes.
 *
 * Excerpt only: the declaration/parsing of the attribute table 'tb' and the
 * 'errout' label are not part of the lines shown here.
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
struct fib6_config *cfg, struct netlink_ext_ack *extack)
{
/* RTA_NH_ID may not be combined with an inline nexthop specification. */
if (tb[RTA_NH_ID]) {
if (tb[RTA_GATEWAY] || tb[RTA_OIF] ||
tb[RTA_MULTIPATH] || tb[RTA_ENCAP]) {
NL_SET_ERR_MSG(extack,
"Nexthop specification and nexthop id are mutually exclusive");
goto errout;
}
cfg->fc_nh_id = nla_get_u32(tb[RTA_NH_ID]);
}
/* Record the gateway address and flag the config as having a gateway. */
if (tb[RTA_GATEWAY]) {
cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
cfg->fc_flags |= RTF_GATEWAY;
}
/* RTA_VIA is rejected outright for IPv6. */
if (tb[RTA_VIA]) {
NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
goto errout;
}
内核中由函数ip6_route_del进行处理。其中函数fib6_locate根据目的地址信息和源地址信息(支持子树的情况下)获得路由表中对应的路由节点,如果没有找到,返回失败,错误码ESRCH。
/*
 * ip6_route_del - find and delete the route described by @cfg.
 * @cfg:    parsed delete request (table id, dst/src prefix, flags, ...)
 * @extack: extended ack used to report a human-readable error
 *
 * 'err' starts out as -ESRCH; it is returned as-is when the FIB table
 * does not exist.
 */
static int ip6_route_del(struct fib6_config *cfg, struct netlink_ext_ack *extack)
{
struct fib6_table *table;
struct fib6_info *rt;
struct fib6_node *fn;
int err = -ESRCH;
table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
if (!table) {
NL_SET_ERR_MSG(extack, "FIB table does not exist");
return err;
}
/* The node lookup and the route walk that follows run under RCU. */
rcu_read_lock();
/* Look up the node by dst/src prefix; last argument is true unless RTF_CACHE is set. */
fn = fib6_locate(&table->tb6_root,
&cfg->fc_dst, cfg->fc_dst_len,
&cfg->fc_src, cfg->fc_src_len,
!(cfg->fc_flags & RTF_CACHE));
之后,遍历此节点的叶子节点,如果其下一跳配置了nexthop属性,并且要删除的路由也是配置了nexthop属性(cfg->fc_nh_id不为空),但是两者所指向的下一跳不相同,即不是要找的路由项,遍历下一个叶子节点。
如果配置了路由metric和protocol,分别进行两者的比较。
/* Walk every route attached to the located node (continuation of ip6_route_del). */
if (fn) {
for_each_fib6_node_rt_rcu(fn) {
struct fib6_nh *nh;
/* Both sides use a nexthop object but the ids differ: not our route. */
if (rt->nh && cfg->fc_nh_id &&
rt->nh->id != cfg->fc_nh_id)
continue;
/* The "..." below marks code elided from this excerpt. */
...
/* Metric and protocol only filter when the request specified them (non-zero). */
if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
continue;
if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
continue;
对于nexthop属性路由,由函数__ip6_del_rt进行删除,结束处理。否则如果遍历路由不具有nexthop属性,但是要查找的是具有nexthop属性的路由(fc_nh_id不为空),遍历下一个路由项。
/* Route backed by a nexthop object: delete exactly this entry. */
if (rt->nh) {
	/* Could not take a reference on rt; try the next candidate. */
	if (!fib6_info_hold_safe(rt))
		continue;
	/*
	 * Leave the RCU read-side section opened by rcu_read_lock() earlier
	 * in ip6_route_del() before returning, exactly as the
	 * non-nexthop-object return path of this loop does.
	 */
	rcu_read_unlock();

	return __ip6_del_rt(rt, &cfg->fc_nlinfo);
}
/* Request names a nexthop id but this route has no nexthop object. */
if (cfg->fc_nh_id)
	continue;
对于内置路由项(非nexthop属性路由),比较两者出接口是否相同。如果配置了网关,比较两者网关是否相同。如果匹配,在配置了网关的情况下,删除此路由,由函数__ip6_del_rt处理,与以上nexthop属性删除处理一致。
但是,如果未指定网关,将由函数__ip6_del_rt_siblings处理。
/* Route with a built-in nexthop (continuation of the ip6_route_del loop). */
nh = rt->fib6_nh;
/* An output interface was requested: the nexthop device must exist and match. */
if (cfg->fc_ifindex &&
(!nh->fib_nh_dev ||
nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
continue;
/* A gateway was requested: the nexthop gateway address must be equal. */
if (cfg->fc_flags & RTF_GATEWAY &&
!ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
continue;
/* Skip the entry if a reference on it cannot be taken. */
if (!fib6_info_hold_safe(rt))
continue;
/* Match found: leave the RCU read-side section before deleting. */
rcu_read_unlock();
/* if gateway was specified only delete the one hop */
if (cfg->fc_flags & RTF_GATEWAY)
return __ip6_del_rt(rt, &cfg->fc_nlinfo);
/* No gateway given: the sibling-aware delete helper takes over. */
return __ip6_del_rt_siblings(rt, cfg);
删除指定路由信息
核心处理由fib6_del函数实现。
/*
 * __ip6_del_rt - delete one route entry under its table lock.
 * @rt:   route to delete; fib6_info_release(rt) is called on every path
 * @info: netlink info, also supplies the network namespace
 *
 * Returns -ENOENT when @rt is the namespace's fib6_null_entry, otherwise
 * the result of fib6_del().
 */
static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
{
	struct net *net = info->nl_net;
	int err = -ENOENT;

	/* The null entry is never deleted; only real entries reach fib6_del(). */
	if (rt != net->ipv6.fib6_null_entry) {
		struct fib6_table *table = rt->fib6_table;

		spin_lock_bh(&table->tb6_lock);
		err = fib6_del(rt, info);
		spin_unlock_bh(&table->tb6_lock);
	}

	fib6_info_release(rt);
	return err;
}
遍历路由信息rt所属的路由节点的叶子链表,找到其自身所在位置,由函数fib6_del_route处理。
/*
 * fib6_del - unlink a route from the fib6 node it is attached to.
 * @rt:   route to remove
 * @info: netlink info, also supplies the network namespace
 *
 * The lockdep annotations below expect rt->fib6_table->tb6_lock to be held.
 *
 * Returns 0 when @rt was found on its node's leaf list and handed to
 * fib6_del_route(); -ENOENT when @rt is the null entry, is not attached to
 * a node, or is not found on the node's leaf list.
 */
int fib6_del(struct fib6_info *rt, struct nl_info *info)
{
	struct net *net = info->nl_net;
	struct fib6_info __rcu **rtp;
	struct fib6_info __rcu **rtp_next;
	struct fib6_table *table;
	struct fib6_node *fn;

	if (rt == net->ipv6.fib6_null_entry)
		return -ENOENT;

	table = rt->fib6_table;
	fn = rcu_dereference_protected(rt->fib6_node,
				       lockdep_is_held(&table->tb6_lock));
	if (!fn)
		return -ENOENT;

	WARN_ON(!(fn->fn_flags & RTN_RTINFO));

	/* Walk the leaf entries looking for ourself */
	for (rtp = &fn->leaf; *rtp; rtp = rtp_next) {
		struct fib6_info *cur = rcu_dereference_protected(*rtp,
					lockdep_is_held(&table->tb6_lock));

		if (rt == cur) {
			if (fib6_requires_src(cur))
				fib6_routes_require_src_dec(info->nl_net);
			/* rtp is the link pointing at rt, so it can be unlinked in place. */
			fib6_del_route(table, fn, rtp, info);
			return 0;
		}
		rtp_next = &cur->fib6_next;
	}
	return -ENOENT;
}
如果要删除的路由信息是路由节点的第一个叶子,并且其没有兄弟(siblings)节点,即其不属于任何多径路由,那么:如果其下一个节点存在,使用下一个节点作为替换节点(replace_rt);否则,标记需要发送删除通知(notify_del)。
/*
 * fib6_del_route - unlink the route that *rtp points to from node @fn.
 * @table: table owning the node; lockdep annotations expect tb6_lock held
 * @fn:    node whose leaf list contains the route
 * @rtp:   address of the link that currently points at the route
 * @info:  netlink info, also supplies the network namespace
 */
static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
struct fib6_info __rcu **rtp, struct nl_info *info)
{
struct fib6_info *leaf, *replace_rt = NULL;
struct fib6_walker *w;
struct fib6_info *rt = rcu_dereference_protected(*rtp, lockdep_is_held(&table->tb6_lock));
struct net *net = info->nl_net;
bool notify_del = false;
RT6_TRACE("fib6_del_route\n");
/* If the deleted route is the first in the node and it is not part of
* a multipath route, then we need to replace it with the next route
* in the node, if exists.
*/
leaf = rcu_dereference_protected(fn->leaf,
lockdep_is_held(&table->tb6_lock));
if (leaf == rt && !rt->fib6_nsiblings) {
/* A following route exists: remember it as the replacement. */
if (rcu_access_pointer(rt->fib6_next))
replace_rt = rcu_dereference_protected(rt->fib6_next,
lockdep_is_held(&table->tb6_lock));
else
/* Nothing follows: remember that a delete notification is needed. */
notify_del = true;
}
将待删除路由信息从链表上删除,后一个节点占据其位置。函数rt6_flush_exceptions清除所有缓存的rt相关exception表项。
/* Unlink it: the link that pointed at rt now points at rt's successor. */
*rtp = rt->fib6_next;
/* rt is no longer attached to any node. */
rt->fib6_node = NULL;
/* Update the per-namespace route statistics. */
net->ipv6.rt6_stats->fib_rt_entries--;
net->ipv6.rt6_stats->fib_discarded_routes++;
/* Flush all cached dst in exception table */
rt6_flush_exceptions(rt);
/* Reset round-robin state, if necessary */
if (rcu_access_pointer(fn->rr_ptr) == rt)
fn->rr_ptr = NULL;
如果此路由信息是多径路由的一部分,将其从siblings链表中删除。如果该多径路由是其所在路由节点中的第一条路由(即rt的metric与首个叶子的metric相同,并且首个叶子满足ECMP条件),发送删除通知(notify_del)。
/* Remove this entry from other siblings */
if (rt->fib6_nsiblings) {
struct fib6_info *sibling, *next_sibling;
/* The route is deleted from a multipath route. If this
* multipath route is the first route in the node, then we need
* to emit a delete notification. Otherwise, we need to skip
* the notification.
*/
if (rt->fib6_metric == leaf->fib6_metric &&
rt6_qualify_for_ecmp(leaf))
notify_del = true;
/* Every remaining sibling now has one sibling fewer. */
list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings)
sibling->fib6_nsiblings--;
/* Detach rt from the sibling list and clear its own sibling count. */
rt->fib6_nsiblings = 0;
list_del_init(&rt->fib6_siblings);
/* NOTE(review): next_sibling is whatever the iteration above left in it — confirm intent. */
rt6_multipath_rebalance(next_sibling);
}
更新命名空间中的fib6_walkers链表,如果walker的叶子等于要删除的路由信息,用下一个叶子节点进行替换,如果walker叶子为空,更新其状态为FWS_U。
/* Adjust walkers: no walker may keep pointing at the removed route. */
read_lock(&net->ipv6.fib6_walker_lock);
FOR_WALKERS(net, w) {
/* Only walkers in state FWS_C whose current leaf is rt are affected. */
if (w->state == FWS_C && w->leaf == rt) {
RT6_TRACE("walker %p adjusted by delroute\n", w);
/* Move the walker on to rt's successor. */
w->leaf = rcu_dereference_protected(rt->fib6_next,
lockdep_is_held(&table->tb6_lock));
/* No successor: switch the walker to state FWS_U. */
if (!w->leaf)
w->state = FWS_U;
}
}
read_unlock(&net->ipv6.fib6_walker_lock);
如果路由信息为节点中的最后一个叶子,对于非根节点,清除其RTN_RTINFO标志,表明其不包含有效的路由信息,接下来执行路由树的修复。
/* If it was last route, call fib6_repair_tree() to:
* 1. For root node, put back null_entry as how the table was created.
* 2. For other nodes, expunge its radix tree node.
*/
if (!rcu_access_pointer(fn->leaf)) {
/* Non-root node: it no longer carries route info, so drop it from the node count. */
if (!(fn->fn_flags & RTN_TL_ROOT)) {
fn->fn_flags &= ~RTN_RTINFO;
net->ipv6.rt6_stats->fib_route_nodes--;
}
/* fn is replaced by the node returned from the repair. */
fn = fib6_repair_tree(net, table, fn);
}
删除路由信息相关的路由缓存,根据以上的设置,决定是否发送内核以及用户层通知消息,最后,函数fib6_info_release释放路由信息结构rt。
/* Drop cached references to rt (tail of fib6_del_route). */
fib6_purge_rt(rt, fn, net);
/* In-kernel notifiers: a delete takes precedence over a replace. */
if (!info->skip_notify_kernel) {
if (notify_del)
call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL);
else if (replace_rt)
call_fib6_entry_notifiers_replace(net, replace_rt);
}
/* Netlink RTM_DELROUTE notification, unless the caller asked to skip it. */
if (!info->skip_notify)
inet6_rt_notify(RTM_DELROUTE, rt, info, 0);
/* Release a reference on rt now that it is unlinked. */
fib6_info_release(rt);
}
删除路由exception缓存
上节在删除路由信息函数fib6_del_route中,调用函数rt6_flush_exceptions执行删除exception缓存操作。对于配置了nexthop属性的路由,如果nexthop为下一跳组,由函数nexthop_for_each_fib6_nh执行遍历。
对于内置的路由(非nexthop属性),以及以上的nexthop属性路由,最终都是由函数fib6_nh_flush_exceptions进行删除处理。
/*
 * rt6_nh_flush_exceptions - per-nexthop callback used when flushing exceptions.
 * @nh:  nexthop whose exception entries are flushed
 * @arg: the struct fib6_info the exceptions must originate from
 *
 * Always returns 0.
 */
static int rt6_nh_flush_exceptions(struct fib6_nh *nh, void *arg)
{
	fib6_nh_flush_exceptions(nh, (struct fib6_info *)arg);

	return 0;
}
/*
 * rt6_flush_exceptions - flush the exception entries created from @f6i.
 * @f6i: route whose exceptions are flushed
 *
 * A route with a built-in nexthop flushes that single nexthop; a route
 * using a nexthop object visits each fib6_nh of the object.
 */
void rt6_flush_exceptions(struct fib6_info *f6i)
{
	if (!f6i->nh) {
		fib6_nh_flush_exceptions(f6i->fib6_nh, f6i);
		return;
	}

	nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_flush_exceptions, f6i);
}
找到下一跳节点对应的哈希桶数组(1024大小),遍历数组中每一个哈希桶,以及每一个桶所对应的哈希链表,如果from为空,删除所有的exception,否则,只删除由from生成的exception项。
/* Exception hash table size: 1 << 10 = 1024 buckets. */
#define FIB6_EXCEPTION_BUCKET_SIZE_SHIFT 10
#define FIB6_EXCEPTION_BUCKET_SIZE (1 << FIB6_EXCEPTION_BUCKET_SIZE_SHIFT)
/* NOTE(review): presumably the maximum chain depth per bucket; not used in the code shown here. */
#define FIB6_MAX_DEPTH 5
/*
 * fib6_nh_flush_exceptions - remove exception entries hanging off a nexthop.
 * @nh:   nexthop whose exception buckets are scanned
 * @from: when NULL every exception is removed; otherwise only exceptions
 *        whose rt6i->from equals @from are removed
 *
 * The whole scan runs with rt6_exception_lock held (BH disabled).
 */
static void fib6_nh_flush_exceptions(struct fib6_nh *nh, struct fib6_info *from)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);

	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
	if (!bucket)
		goto out;

	/* Prevent rt6_insert_exception() to recreate the bucket list */
	if (!from)
		fib6_nh_excptn_bucket_set_flushed(nh, &rt6_exception_lock);

	/* Visit every bucket of the table and every entry chained to it. */
	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) {
			if (!from ||
			    rcu_access_pointer(rt6_ex->rt6i->from) == from)
				rt6_remove_exception(bucket, rt6_ex);
		}
		/* A full flush must leave the bucket empty. */
		WARN_ON_ONCE(!from && bucket->depth);
		bucket++;
	}
out:
	spin_unlock_bh(&rt6_exception_lock);
}
如下删除exception函数,释放其中的路由信息,以及exception路由缓存自身,并递减哈希桶深度。
/*
 * rt6_remove_exception - remove one exception entry from its bucket.
 * @bucket: bucket the entry is chained to; its depth is decremented
 * @rt6_ex: entry to remove
 *
 * Does nothing when either argument is NULL.
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket, struct rt6_exception *rt6_ex)
{
struct fib6_info *from;
struct net *net;
if (!bucket || !rt6_ex) return;
/* Account for the removal in the namespace of the cached route's device. */
net = dev_net(rt6_ex->rt6i->dst.dev);
net->ipv6.rt6_stats->fib_rt_cache--;
/* purge completely the exception to allow releasing the held resources:
* some [sk] cache may keep the dst around for unlimited time
*/
/* Atomically clear rt6i->from and release the fib6_info it pointed to. */
from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL);
fib6_info_release(from);
dst_dev_put(&rt6_ex->rt6i->dst);
/* Unchain the entry, release the dst, and free the entry via kfree_rcu(). */
hlist_del_rcu(&rt6_ex->hlist);
dst_release(&rt6_ex->rt6i->dst);
kfree_rcu(rt6_ex, rcu);
/* The bucket is expected to be non-empty before its depth is decremented. */
WARN_ON_ONCE(!bucket->depth);
bucket->depth--;
}
清除路由缓存
这里实际上是清除路由缓存中对此路由信息rt的引用(fib6_drop_pcpu_from),之后,如果路由信息下一跳具有nexthop属性,将其从下一跳链表中移除。
/*
 * fib6_purge_rt - drop references that still point at a route being deleted.
 * @rt:  route being deleted
 * @fn:  node from which the upward scan (later in this function) starts
 * @net: network namespace
 */
static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, struct net *net)
{
struct fib6_table *table = rt->fib6_table;
/* Detach the per-cpu cached routes that were created from rt. */
fib6_drop_pcpu_from(rt, table);
/* Route uses a nexthop object and is still linked on nh_list: unlink it. */
if (rt->nh && !list_empty(&rt->nh_list))
list_del_init(&rt->nh_list);
最后,如果此时路由信息的引用计数不为1,表明其还在被一些dummy地址使用。如下从当前节点开始沿父节点向上遍历路由树,对于不包括路由信息的路由节点,如果其首个叶子等于当前要释放的路由信息,使用其左子树节点或者右子树节点替换首个叶子节点(fib6_find_prefix),释放路由信息。
/* More than one reference remains: look for dummy holders (continuation of fib6_purge_rt). */
if (refcount_read(&rt->fib6_ref) != 1) {
/* This route is used as dummy address holder in some split
* nodes. It is not leaked, but it still holds other resources,
* which must be released in time. So, scan ascendant nodes
* and replace dummy references to this route with references
* to still alive ones.
*/
while (fn) {
struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
lockdep_is_held(&table->tb6_lock));
struct fib6_info *new_leaf;
/* Node carries no route info of its own but its leaf is rt: swap the leaf. */
if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) {
new_leaf = fib6_find_prefix(net, table, fn);
/* Take a reference on the new leaf before publishing it, then drop rt's. */
fib6_info_hold(new_leaf);
rcu_assign_pointer(fn->leaf, new_leaf);
fib6_info_release(rt);
}
/* Continue with the parent node. */
fn = rcu_dereference_protected(fn->parent, lockdep_is_held(&table->tb6_lock));
首先停止f6i相应路由缓存的创建(fib6_destroying),对于具有nexthop属性的路由,或者内置路由,最终都由函数__fib6_drop_pcpu_from删除路由信息引用。
static void fib6_drop_pcpu_from(struct fib6_info *f6i, const struct fib6_table *table)
{
/* Make sure rt6_make_pcpu_route() wont add other percpu routes
* while we are cleaning them here.
*/
f6i->fib6_destroying = 1;
mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */
if (f6i->nh) {
struct fib6_nh_pcpu_arg arg = {
.from = f6i,
.table = table
};
nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from, &arg);
} else {
struct fib6_nh *fib6_nh;
fib6_nh = f6i->fib6_nh;
__fib6_drop_pcpu_from(fib6_nh, f6i, table);
}
}
遍历所有的处理器,找到下一跳结构fib6_nh对应的每处理器路由缓存,如果其依据路由信息match生成,将其from字段设置为NULL,释放其引用的路由信息。
/*
 * __fib6_drop_pcpu_from - clear the 'from' back-pointer of per-cpu routes.
 * @fib6_nh: nexthop whose per-cpu route slots are scanned
 * @match:   only cached routes whose 'from' equals this entry are touched
 * @table:   not used by the code in this function
 *
 * Does nothing when the nexthop has no per-cpu route array.
 */
static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh,
				  const struct fib6_info *match,
				  const struct fib6_table *table)
{
	int cpu;

	if (!fib6_nh->rt6i_pcpu)
		return;

	/* release the reference to this fib entry from
	 * all of its cached pcpu routes
	 */
	for_each_possible_cpu(cpu) {
		struct rt6_info **ppcpu_rt;
		struct rt6_info *pcpu_rt;

		ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu);
		pcpu_rt = *ppcpu_rt;

		/* only dropping the 'from' reference if the cached route
		 * is using 'match'. The cached pcpu_rt->from only changes
		 * from a fib6_info to NULL (ip6_dst_destroy); it can never
		 * change from one fib6_info reference to another
		 */
		if (pcpu_rt && rcu_access_pointer(pcpu_rt->from) == match) {
			struct fib6_info *from;

			/* Atomically take the pointer out, then release it. */
			from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL);
			fib6_info_release(from);
		}
	}
}
删除路由信息及其siblings
如上函数inet6_rtm_delroute,其中将fc_delete_all_nh固定设置为1,如果rt是多径路由的一部分,以下首先删除其所有的siblings节点。对于多径路由的所有的节点内核仅发送一个通知消息。
/*
 * __ip6_del_rt_siblings - delete a route and, if requested, all its siblings.
 * @rt:  route to delete (first sibling, per the comment later in this function)
 * @cfg: delete request; fc_delete_all_nh selects sibling removal
 *
 * 'err' starts out as -ENOENT, which is the result when @rt is the
 * namespace's fib6_null_entry.
 */
static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
{
struct nl_info *info = &cfg->fc_nlinfo;
struct net *net = info->nl_net;
struct sk_buff *skb = NULL;
struct fib6_table *table;
int err = -ENOENT;
if (rt == net->ipv6.fib6_null_entry)
goto out_put;
table = rt->fib6_table;
spin_lock_bh(&table->tb6_lock);
if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
struct fib6_info *sibling, *next_sibling;
struct fib6_node *fn;
/* prefer to send a single notification with all hops */
skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
if (skb) {
u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
/* Filling failed: drop the skb, leaving per-route notification enabled. */
if (rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, RTM_DELROUTE, info->portid, seq, 0) < 0) {
kfree_skb(skb);
skb = NULL;
} else
/* The combined message is ready: suppress per-route netlink notifications. */
info->skip_notify = 1;
}
如果路由信息rt为路由节点的第一个叶子,则检查其所属多径路由的最后一个sibling节点之后是否还有其它路由:如果有,发送替换通知;否则,发送删除通知。
之后,遍历路由信息rt的所有siblings节点,执行删除操作。
/* 'rt' points to the first sibling route. If it is not the
* leaf, then we do not need to send a notification. Otherwise,
* we need to check if the last sibling has a next route or not
* and emit a replace or delete notification, respectively.
*/
/* In-kernel notification is done here, so the per-route fib6_del() calls skip it. */
info->skip_notify_kernel = 1;
fn = rcu_dereference_protected(rt->fib6_node,
lockdep_is_held(&table->tb6_lock));
if (rcu_access_pointer(fn->leaf) == rt) {
struct fib6_info *last_sibling, *replace_rt;
last_sibling = list_last_entry(&rt->fib6_siblings,
struct fib6_info, fib6_siblings);
/* The route following the last sibling, if any, becomes the replacement. */
replace_rt = rcu_dereference_protected(
last_sibling->fib6_next, lockdep_is_held(&table->tb6_lock));
if (replace_rt)
call_fib6_entry_notifiers_replace(net, replace_rt);
else
call_fib6_multipath_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, rt->fib6_nsiblings, NULL);
}
/* Delete every sibling; stop at the first failure. */
list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) {
err = fib6_del(sibling, info);
if (err)
goto out_unlock;
函数最后,删除路由信息rt,并将其释放(fib6_info_release)。
/* Finally delete rt itself (tail of __ip6_del_rt_siblings). */
err = fib6_del(rt, info);
out_unlock:
spin_unlock_bh(&table->tb6_lock);
out_put:
/* Release a reference on rt; reached on every path through this tail. */
fib6_info_release(rt);
/* Send the single combined netlink notification, if one was built. */
if (skb) {
rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, info->nlh, gfp_any());
}
return err;
内核版本 5.10