IPv6路由删除

本文深入剖析了Linux内核中IPv6路由删除的过程,从用户层的ip命令到内核函数的执行,包括inet6_rtm_delroute、fib6_locate、ip6_route_del等多个关键步骤,详细解释了路由信息的查找、比较和删除,以及异常和缓存的清理机制。
摘要由CSDN通过智能技术生成

用户层应用ip命令删除IPv6路由:

# ip -6 -d route 
unicast 3001::/64 dev ens33 proto kernel scope global metric 256 pref medium
unicast 3ffe::/64 via 3001::10 dev ens33 proto boot scope global metric 1024 pref medium
#
# ip -6 route del 3ffe::0/64 via 3001::10

内核函数inet6_rtm_delroute处理以上IP命令的路由删除。

/*
 * Handle a route-delete request (e.g. "ip -6 route del ...").
 *
 * @skb:    request buffer; skb->sk identifies the requesting socket/netns
 * @nlh:    netlink message header of the request
 * @extack: extended ack used to report a human-readable error
 *
 * Returns a negative errno from rtm_to_fib6_config(), -EINVAL when a
 * nexthop id was given but no such nexthop exists, otherwise the result of
 * ip6_route_multipath_del() or ip6_route_del().
 */
static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
                  struct netlink_ext_ack *extack)
{
    struct fib6_config cfg;
    int err;

    /* Parse the netlink attributes into cfg. */
    err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
    if (err < 0)
        return err;

    /* A nexthop id was supplied: it must refer to an existing nexthop. */
    if (cfg.fc_nh_id && !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id)) {
        NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
        return -EINVAL;
    }

    if (cfg.fc_mp)
        return ip6_route_multipath_del(&cfg, extack);
    else {
        /* Single-path request: ask for all sibling nexthops to be removed. */
        cfg.fc_delete_all_nh = 1;
        return ip6_route_del(&cfg, extack);
    }
}

函数rtm_to_fib6_config负责解析IP命令参数,例如,以下解析nexthop属性ID,下一跳网关等参数。

static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
                  struct fib6_config *cfg, struct netlink_ext_ack *extack)
{

    if (tb[RTA_NH_ID]) {
        if (tb[RTA_GATEWAY]   || tb[RTA_OIF] ||
            tb[RTA_MULTIPATH] || tb[RTA_ENCAP]) {
            NL_SET_ERR_MSG(extack,
                       "Nexthop specification and nexthop id are mutually exclusive");
            goto errout;
        }
        cfg->fc_nh_id = nla_get_u32(tb[RTA_NH_ID]);
    }

    if (tb[RTA_GATEWAY]) {
        cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
        cfg->fc_flags |= RTF_GATEWAY;
    }
    if (tb[RTA_VIA]) {
        NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
        goto errout;
    }

内核中由函数ip6_route_del进行处理。其中函数fib6_locate根据目的地址信息和源地址信息(支持子树的情况下)获得路由表中对应的路由节点,如果没有找到,返回失败,错误码ESRCH。

static int ip6_route_del(struct fib6_config *cfg, struct netlink_ext_ack *extack)
{
    struct fib6_table *table;
    struct fib6_info *rt;
    struct fib6_node *fn;
    int err = -ESRCH;

    table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
    if (!table) {
        NL_SET_ERR_MSG(extack, "FIB table does not exist");
        return err;
    }

    rcu_read_lock();

    fn = fib6_locate(&table->tb6_root,
             &cfg->fc_dst, cfg->fc_dst_len,
             &cfg->fc_src, cfg->fc_src_len,
             !(cfg->fc_flags & RTF_CACHE));

之后,遍历此节点的叶子节点,如果其下一跳配置了nexthop属性,并且要删除的路由也是配置了nexthop属性(cfg->fc_nh_id不为空),但是两者所指向的下一跳不相同,即不是要找的路由项,遍历下一个叶子节点。

如果配置了路由metric和protocol,分别进行两者的比较。

    if (fn) {
        for_each_fib6_node_rt_rcu(fn) {
            struct fib6_nh *nh;

            if (rt->nh && cfg->fc_nh_id &&
                rt->nh->id != cfg->fc_nh_id)
                continue;
            ...

            if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
                continue;
            if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
                continue;

对于nexthop属性路由,由函数__ip6_del_rt进行删除,结束处理。否则如果遍历路由不具有nexthop属性,但是要查找的是具有nexthop属性的路由(fc_nh_id不为空),遍历下一个路由项。

            if (rt->nh) {
                if (!fib6_info_hold_safe(rt))
                    continue;
                /* Leave the RCU read-side section entered before
                 * fib6_locate(); we now hold our own reference on rt
                 * and are about to return.
                 */
                rcu_read_unlock();

                return __ip6_del_rt(rt, &cfg->fc_nlinfo);
            }
            /* Request names a nexthop id but this route has no nexthop object. */
            if (cfg->fc_nh_id)
                continue;

对于内置路由项(非nexthop属性路由),比较两者出接口是否相同。如果配置了网关,比较两者网关是否相同。如果匹配,在配置了网关的情况下,删除此路由,由函数__ip6_del_rt处理,与以上nexthop属性删除处理一致。

但是,如果未指定网关,将由函数__ip6_del_rt_siblings处理。

            nh = rt->fib6_nh;
            if (cfg->fc_ifindex &&
                (!nh->fib_nh_dev ||
                 nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
                continue;
            if (cfg->fc_flags & RTF_GATEWAY &&
                !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
                continue;
            if (!fib6_info_hold_safe(rt))
                continue;
            rcu_read_unlock();

            /* if gateway was specified only delete the one hop */
            if (cfg->fc_flags & RTF_GATEWAY)
                return __ip6_del_rt(rt, &cfg->fc_nlinfo);

            return __ip6_del_rt_siblings(rt, cfg);

删除指定路由信息

核心处理由fib6_del函数实现。

/*
 * Delete a single route from its FIB table and release the reference on it.
 *
 * @rt:   route to delete. fib6_info_release() is called on it on every
 *        path, including the error path (the callers shown in this article
 *        take a reference with fib6_info_hold_safe() before calling).
 * @info: netlink info; info->nl_net selects the network namespace.
 *
 * Returns -ENOENT if rt is the namespace's fib6_null_entry, otherwise the
 * return value of fib6_del().
 */
static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
{
    struct net *net = info->nl_net;
    struct fib6_table *table;
    int err;

    /* The null entry is never deleted. */
    if (rt == net->ipv6.fib6_null_entry) {
        err = -ENOENT;
        goto out;
    }

    /* fib6_del() runs under the table lock with bottom halves disabled. */
    table = rt->fib6_table;
    spin_lock_bh(&table->tb6_lock);
    err = fib6_del(rt, info);
    spin_unlock_bh(&table->tb6_lock);

out:
    fib6_info_release(rt);
    return err;
}

遍历路由信息rt所属的路由节点的叶子链表,找到其自身所在位置,由函数fib6_del_route处理。

/*
 * Unlink @rt from the leaf list of the FIB node it belongs to.
 *
 * @rt:   route to remove
 * @info: netlink info; info->nl_net selects the network namespace
 *
 * The rcu_dereference_protected() calls assert that table->tb6_lock is
 * held, so the caller is expected to hold it.
 *
 * Returns 0 when the route was found and handed to fib6_del_route(),
 * -ENOENT when rt is the null entry, has no node, or is not on the
 * node's leaf list.
 */
int fib6_del(struct fib6_info *rt, struct nl_info *info)
{
    struct net *net = info->nl_net;
    struct fib6_info __rcu **rtp;
    struct fib6_info __rcu **rtp_next;
    struct fib6_table *table;
    struct fib6_node *fn;

    if (rt == net->ipv6.fib6_null_entry)
        return -ENOENT;

    table = rt->fib6_table;
    fn = rcu_dereference_protected(rt->fib6_node, lockdep_is_held(&table->tb6_lock));
    if (!fn)
        return -ENOENT;

    WARN_ON(!(fn->fn_flags & RTN_RTINFO));

    /* Walk the leaf entries looking for ourself */

    for (rtp = &fn->leaf; *rtp; rtp = rtp_next) {
        struct fib6_info *cur = rcu_dereference_protected(*rtp,
                    lockdep_is_held(&table->tb6_lock));
        if (rt == cur) {
            if (fib6_requires_src(cur))
                fib6_routes_require_src_dec(info->nl_net);
            /* rtp is the link that points at rt; fib6_del_route()
             * rewrites it to unlink the entry.
             */
            fib6_del_route(table, fn, rtp, info);
            return 0;
        }
        rtp_next = &cur->fib6_next;
    }
    return -ENOENT;
}

如果要删除的路由信息是路由节点的第一个叶子,并且其没有兄弟(siblings)节点,即其不属于任何多径路由,如果其下一个节点存在,使用下一个节点作为替换节点,否则,进行删除。

static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
               struct fib6_info __rcu **rtp, struct nl_info *info)
{
    struct fib6_info *leaf, *replace_rt = NULL;
    struct fib6_walker *w;
    struct fib6_info *rt = rcu_dereference_protected(*rtp, lockdep_is_held(&table->tb6_lock));
    struct net *net = info->nl_net;
    bool notify_del = false;

    RT6_TRACE("fib6_del_route\n");

    /* If the deleted route is the first in the node and it is not part of
     * a multipath route, then we need to replace it with the next route
     * in the node, if exists.
     */
    leaf = rcu_dereference_protected(fn->leaf,
                     lockdep_is_held(&table->tb6_lock));
    if (leaf == rt && !rt->fib6_nsiblings) {
        if (rcu_access_pointer(rt->fib6_next))
            replace_rt = rcu_dereference_protected(rt->fib6_next,
                        lockdep_is_held(&table->tb6_lock));
        else
            notify_del = true;
    }

将待删除路由信息从链表上删除,后一个节点占据其位置。函数rt6_flush_exceptions清除所有缓存的rt相关exception表项。

    /* Unlink it */
    *rtp = rt->fib6_next;
    rt->fib6_node = NULL;
    net->ipv6.rt6_stats->fib_rt_entries--;
    net->ipv6.rt6_stats->fib_discarded_routes++;

    /* Flush all cached dst in exception table */
    rt6_flush_exceptions(rt);

    /* Reset round-robin state, if necessary */
    if (rcu_access_pointer(fn->rr_ptr) == rt)
        fn->rr_ptr = NULL;

如果此路由信息是多径路由的一部分,将其从siblings链表中删除。如果该多径路由是其所在路由节点中的第一条路由(即其metric与节点首个叶子的metric相同,且首个叶子满足ECMP条件),发送删除通知(notify_del),否则不发送通知。

    /* Remove this entry from other siblings */
    if (rt->fib6_nsiblings) {
        struct fib6_info *sibling, *next_sibling;

        /* The route is deleted from a multipath route. If this
         * multipath route is the first route in the node, then we need
         * to emit a delete notification. Otherwise, we need to skip
         * the notification.
         */
        if (rt->fib6_metric == leaf->fib6_metric &&
            rt6_qualify_for_ecmp(leaf))
            notify_del = true;
        list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings)
            sibling->fib6_nsiblings--;
        rt->fib6_nsiblings = 0;
        list_del_init(&rt->fib6_siblings);
        rt6_multipath_rebalance(next_sibling);
    }

更新命名空间中的fib6_walkers链表,如果walker的叶子等于要删除的路由信息,用下一个叶子节点进行替换,如果walker叶子为空,更新其状态为FWS_U。

    /* Adjust walkers */
    read_lock(&net->ipv6.fib6_walker_lock);
    FOR_WALKERS(net, w) {
        if (w->state == FWS_C && w->leaf == rt) {
            RT6_TRACE("walker %p adjusted by delroute\n", w);
            w->leaf = rcu_dereference_protected(rt->fib6_next,
                        lockdep_is_held(&table->tb6_lock));
            if (!w->leaf)
                w->state = FWS_U;
        }
    }
    read_unlock(&net->ipv6.fib6_walker_lock);

如果路由信息为节点中的最后一个叶子,对于非根节点,清除其RTN_RTINFO标志,表明其不包含有效的路由信息,接下来执行路由树的修复。

    /* If it was last route, call fib6_repair_tree() to:
     * 1. For root node, put back null_entry as how the table was created.
     * 2. For other nodes, expunge its radix tree node.
     */
    if (!rcu_access_pointer(fn->leaf)) {
        if (!(fn->fn_flags & RTN_TL_ROOT)) {
            fn->fn_flags &= ~RTN_RTINFO;
            net->ipv6.rt6_stats->fib_route_nodes--;
        }
        fn = fib6_repair_tree(net, table, fn);
    }

删除路由信息相关的路由缓存,根据以上的设置,决定是否发送内核以及用户层通知消息,最后,函数fib6_info_release释放路由信息结构rt。

    fib6_purge_rt(rt, fn, net);

    if (!info->skip_notify_kernel) {
        if (notify_del)
            call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL);
        else if (replace_rt)
            call_fib6_entry_notifiers_replace(net, replace_rt);
    }
    if (!info->skip_notify)
        inet6_rt_notify(RTM_DELROUTE, rt, info, 0);

    fib6_info_release(rt);
}

删除路由exception缓存

上节在删除路由信息函数fib6_del_route中,调用函数rt6_flush_exceptions执行删除exception缓存操作。对于配置了nexthop属性的路由,如果nexthop为下一跳组,由函数nexthop_for_each_fib6_nh执行遍历。

对于内置的路由(非nexthop属性),以及以上的nexthop属性路由,最终都是由函数fib6_nh_flush_exceptions进行删除处理。

/*
 * Callback passed to nexthop_for_each_fib6_nh(): flush the exceptions of
 * @nh that belong to the fib6_info carried in @arg. Always returns 0.
 */
static int rt6_nh_flush_exceptions(struct fib6_nh *nh, void *arg)
{
    fib6_nh_flush_exceptions(nh, (struct fib6_info *)arg);

    return 0;
}
/*
 * Flush the cached exception entries created from route @f6i.
 *
 * A route without a nexthop object flushes its own fib6_nh directly; a
 * route with one visits each fib6_nh via nexthop_for_each_fib6_nh().
 */
void rt6_flush_exceptions(struct fib6_info *f6i)
{
    if (!f6i->nh) {
        fib6_nh_flush_exceptions(f6i->fib6_nh, f6i);
        return;
    }

    nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_flush_exceptions, f6i);
}

找到下一跳节点对应的哈希桶数组(1024大小),遍历数组中每一个哈希桶,以及每一个桶所对应的哈希链表,如果from为空,删除所有的exception,否则,只删除由from生成的exception项。

/* log2 of the number of exception buckets */
#define FIB6_EXCEPTION_BUCKET_SIZE_SHIFT 10
/* Number of exception buckets (1 << 10 = 1024); loop bound in fib6_nh_flush_exceptions() */
#define FIB6_EXCEPTION_BUCKET_SIZE (1 << FIB6_EXCEPTION_BUCKET_SIZE_SHIFT)
/* NOTE(review): not referenced by the code shown here; presumably the maximum chain depth of one bucket — confirm */
#define FIB6_MAX_DEPTH 5

/*
 * Remove cached exception entries hanging off nexthop @nh.
 *
 * @nh:   nexthop whose exception bucket array is scanned
 * @from: when NULL, every exception is removed and the bucket list is
 *        marked flushed; otherwise only exceptions whose rt6i->from
 *        equals @from are removed.
 *
 * The whole scan runs under rt6_exception_lock.
 */
static void fib6_nh_flush_exceptions(struct fib6_nh *nh, struct fib6_info *from)
{
    struct rt6_exception_bucket *bucket;
    struct rt6_exception *rt6_ex;
    struct hlist_node *tmp;
    int i;

    spin_lock_bh(&rt6_exception_lock);

    bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
    if (!bucket)
        goto out;

    /* Prevent rt6_insert_exception() to recreate the bucket list */
    if (!from)
        fib6_nh_excptn_bucket_set_flushed(nh, &rt6_exception_lock);

    /* Visit each of the FIB6_EXCEPTION_BUCKET_SIZE buckets and its chain. */
    for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
        hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) {
            if (!from || rcu_access_pointer(rt6_ex->rt6i->from) == from)
                rt6_remove_exception(bucket, rt6_ex);
        }
        /* A full flush must leave every bucket empty. */
        WARN_ON_ONCE(!from && bucket->depth);
        bucket++;
    }
out:
    spin_unlock_bh(&rt6_exception_lock);
}

如下为删除exception的函数:释放其引用的路由信息(from),以及exception路由缓存自身,并递减哈希桶的深度(depth)。

/*
 * Remove one exception entry from its hash bucket and drop what it holds.
 *
 * @bucket: bucket containing the entry; its depth counter is decremented
 * @rt6_ex: exception entry to remove
 *
 * Does nothing when either argument is NULL.
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket, struct rt6_exception *rt6_ex)
{
    struct fib6_info *from;
    struct net *net;

    if (!bucket || !rt6_ex) return;

    /* Account for one fewer cached route in this namespace. */
    net = dev_net(rt6_ex->rt6i->dst.dev);
    net->ipv6.rt6_stats->fib_rt_cache--;

    /* purge completely the exception to allow releasing the held resources:
     * some [sk] cache may keep the dst around for unlimited time
     */
    /* Clear rt6i->from with xchg() and release the fib6_info it pointed to. */
    from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL);
    fib6_info_release(from);
    dst_dev_put(&rt6_ex->rt6i->dst);

    /* Unlink from the bucket chain, drop the dst reference, then free the
     * entry itself through kfree_rcu().
     */
    hlist_del_rcu(&rt6_ex->hlist);
    dst_release(&rt6_ex->rt6i->dst);
    kfree_rcu(rt6_ex, rcu);
    /* depth should be non-zero here since an entry was just removed. */
    WARN_ON_ONCE(!bucket->depth);
    bucket->depth--;
}

清除路由缓存

这里实际上是清除路由缓存中对此路由信息rt的引用(fib6_drop_pcpu_from),之后,如果路由信息下一跳具有nexthop属性,将其从下一跳链表中移除。

static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, struct net *net)
{
    struct fib6_table *table = rt->fib6_table;

    fib6_drop_pcpu_from(rt, table);

    if (rt->nh && !list_empty(&rt->nh_list))
        list_del_init(&rt->nh_list);

最后,如果此时路由信息的引用计数不为1,表明其还在被一些分裂(split)节点用作dummy地址占位。如下从当前节点开始沿父节点向上遍历,对于不包括路由信息的路由节点(未设置RTN_RTINFO标志),如果其首个叶子等于当前要释放的路由信息,则使用函数fib6_find_prefix从其子树中找到的仍然有效的路由信息替换该叶子,并释放对当前路由信息的引用。

    if (refcount_read(&rt->fib6_ref) != 1) {
        /* This route is used as dummy address holder in some split
         * nodes. It is not leaked, but it still holds other resources,
         * which must be released in time. So, scan ascendant nodes
         * and replace dummy references to this route with references
         * to still alive ones.
         */
        while (fn) {
            struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
                        lockdep_is_held(&table->tb6_lock));
            struct fib6_info *new_leaf;
            if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) {
                new_leaf = fib6_find_prefix(net, table, fn);
                fib6_info_hold(new_leaf);

                rcu_assign_pointer(fn->leaf, new_leaf);
                fib6_info_release(rt);
            }
            fn = rcu_dereference_protected(fn->parent, lockdep_is_held(&table->tb6_lock));

首先停止f6i相应路由缓存的创建(fib6_destroying),对于具有nexthop属性的路由,或者内置路由,最终都由函数__fib6_drop_pcpu_from删除路由信息引用。

/*
 * Drop the references that per-cpu cached routes hold on @f6i.
 *
 * @f6i:   route being destroyed; its fib6_destroying flag is set first
 * @table: table the route belongs to, forwarded to the per-nexthop helper
 *
 * For a route with a nexthop object, every fib6_nh is visited through
 * nexthop_for_each_fib6_nh(); otherwise the route's own fib6_nh is handled
 * directly by __fib6_drop_pcpu_from().
 */
static void fib6_drop_pcpu_from(struct fib6_info *f6i, const struct fib6_table *table)
{
    /* Make sure rt6_make_pcpu_route() wont add other percpu routes
     * while we are cleaning them here.
     */
    f6i->fib6_destroying = 1;
    mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */

    if (f6i->nh) {
        /* Context handed to fib6_nh_drop_pcpu_from() for each fib6_nh. */
        struct fib6_nh_pcpu_arg arg = {
            .from = f6i,
            .table = table
        };

        nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from, &arg);
    } else {
        struct fib6_nh *fib6_nh;

        fib6_nh = f6i->fib6_nh;
        __fib6_drop_pcpu_from(fib6_nh, f6i, table);
    }
}

遍历所有的处理器,找到下一跳结构fib6_nh对应的每处理器路由缓存,如果其依据路由信息match生成,将其from字段设置为NULL,释放其引用的路由信息。

/*
 * Detach per-cpu cached routes of @fib6_nh from the route @match.
 *
 * @fib6_nh: nexthop whose per-cpu route cache (rt6i_pcpu) is scanned
 * @match:   route whose references are to be dropped
 * @table:   table of the route (not used by the code shown here)
 *
 * For every possible CPU, if the cached route's 'from' pointer equals
 * @match it is cleared and the fib6_info reference it held is released.
 */
static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh,
                  const struct fib6_info *match,
                  const struct fib6_table *table)
{
    int cpu;

    /* Nothing to do when no per-cpu cache was allocated. */
    if (!fib6_nh->rt6i_pcpu)
        return;

    /* release the reference to this fib entry from
     * all of its cached pcpu routes
     */
    for_each_possible_cpu(cpu) {
        struct rt6_info **ppcpu_rt;
        struct rt6_info *pcpu_rt;

        ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu);
        pcpu_rt = *ppcpu_rt;

        /* only dropping the 'from' reference if the cached route
         * is using 'match'. The cached pcpu_rt->from only changes
         * from a fib6_info to NULL (ip6_dst_destroy); it can never
         * change from one fib6_info reference to another
         */
        if (pcpu_rt && rcu_access_pointer(pcpu_rt->from) == match) {
            struct fib6_info *from;

            from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL);
            fib6_info_release(from);
        }
    }
}

删除路由信息及其siblings

如上函数inet6_rtm_delroute,其中将fc_delete_all_nh固定设置为1,如果rt是多径路由的一部分,以下首先删除其所有的siblings节点。对于多径路由的所有的节点内核仅发送一个通知消息。

static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
{
    struct nl_info *info = &cfg->fc_nlinfo;
    struct net *net = info->nl_net;
    struct sk_buff *skb = NULL;
    struct fib6_table *table;
    int err = -ENOENT;

    if (rt == net->ipv6.fib6_null_entry)
        goto out_put;
    table = rt->fib6_table;
    spin_lock_bh(&table->tb6_lock);

    if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
        struct fib6_info *sibling, *next_sibling;
        struct fib6_node *fn;

        /* prefer to send a single notification with all hops */
        skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
        if (skb) {
            u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

            if (rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, RTM_DELROUTE, info->portid, seq, 0) < 0) {
                kfree_skb(skb);
                skb = NULL;
            } else
                info->skip_notify = 1;
        }

如果路由信息rt为路由节点的第一个叶子,则检查其所属多径路由的最后一个sibling节点之后是否还有其它路由:如果有,以该路由作为替换路由发送替换通知;否则,发送多径路由的删除通知。如果rt不是路由节点的第一个叶子,则无需发送通知。

之后,遍历路由信息rt的所有siblings节点,执行删除操作。

        /* 'rt' points to the first sibling route. If it is not the
         * leaf, then we do not need to send a notification. Otherwise,
         * we need to check if the last sibling has a next route or not
         * and emit a replace or delete notification, respectively.
         */
        info->skip_notify_kernel = 1;
        fn = rcu_dereference_protected(rt->fib6_node,
                        lockdep_is_held(&table->tb6_lock));
        if (rcu_access_pointer(fn->leaf) == rt) {
            struct fib6_info *last_sibling, *replace_rt;

            last_sibling = list_last_entry(&rt->fib6_siblings,
                               struct fib6_info, fib6_siblings);
            replace_rt = rcu_dereference_protected(
                        last_sibling->fib6_next, lockdep_is_held(&table->tb6_lock));
            if (replace_rt)
                call_fib6_entry_notifiers_replace(net, replace_rt);
            else
                call_fib6_multipath_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, rt->fib6_nsiblings, NULL);
        }
        list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) {
            err = fib6_del(sibling, info);
            if (err)
                goto out_unlock;

函数最后,删除路由信息rt,并将其释放(fib6_info_release)。

    err = fib6_del(rt, info);
out_unlock:
    spin_unlock_bh(&table->tb6_lock);
out_put:
    fib6_info_release(rt);

    if (skb) {
        rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, info->nlh, gfp_any());
    }
    return err;

内核版本 5.10

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值