邻居表项的回收控制

最新推荐文章于 2024-09-21 23:02:02 发布

redwingz

最新推荐文章于 2024-09-21 23:02:02 发布

阅读量3.1k

点赞数 2

分类专栏：邻居子系统 文章标签：neigh

本文链接：https://blog.csdn.net/sinat_20184565/article/details/109636549

版权

邻居子系统专栏收录该内容

11 篇文章

订阅专栏

内核中存在3个阈值控制邻居表项的回收：

gc_thresh1 表示最小可保留的表项数量，如果表项数量小于此值GC（Garbage collector）不进行回收操作，默认为128；
gc_thresh2 当表项数量超过此值时，GC将会回收超过5秒没有更新的表项，默认为512；
gc_thresh3 最大可允许的非永久表项数量。如果系统拥有庞大的接口数量，或者直连了大量的设备，应增大此值。默认值为1024。

另外，gc_interval按照arp(7)手册的说明，表示邻居表项垃圾回收器尝试运行的时间间隔，默认值为30秒钟。不过从下文列出的代码来看，周期回收work的调度使用的是reachable_time，并没有用到此值。

对于IPv4，可通过以下PROC文件查看和修改以上4个值：

$ cat /proc/sys/net/ipv4/neigh/default/gc_interval 
30
$ cat /proc/sys/net/ipv4/neigh/default/gc_thresh1 
128
$ cat /proc/sys/net/ipv4/neigh/default/gc_thresh2
512
$ cat /proc/sys/net/ipv4/neigh/default/gc_thresh3
1024

在arp的全局邻居表变量arp_tbl中，初始化了这四个值。

struct neigh_table arp_tbl = {
    .family     = AF_INET,
    .key_len    = 4,
    .protocol   = cpu_to_be16(ETH_P_IP),
    .hash       = arp_hash,
    .key_eq     = arp_key_eq,
    .constructor    = arp_constructor,
    .proxy_redo = parp_redo,
    .id     = "arp_cache",
    .parms      = {
        ...
    },
    .gc_interval    = 30 * HZ,
    .gc_thresh1 = 128,
    .gc_thresh2 = 512,
    .gc_thresh3 = 1024,

gc_thresh1阈值

在邻居表初始化函数neigh_table_init中，初始化一个延迟work定期进行回收处理，处理函数为neigh_periodic_work。

void neigh_table_init(int index, struct neigh_table *tbl)
{
    ...
    INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work);
    queue_delayed_work(system_power_efficient_wq, &tbl->gc_work,
            tbl->parms.reachable_time);
			
	tbl->last_flush = now;

首先，如果邻居表中的表项数量entries小于gc_thresh1阈值，不进行回收处理，结束执行。

static void neigh_periodic_work(struct work_struct *work)
{
    struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work);
    struct neighbour *n;
    struct neighbour __rcu **np;
    struct neigh_hash_table *nht;

    NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs);

    write_lock_bh(&tbl->lock);
    nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock));

    ...

    if (atomic_read(&tbl->entries) < tbl->gc_thresh1)
        goto out;

否则，遍历邻居表的hash桶，即每个桶中的表项链表，如果表项的状态标志位设置了NUD_PERMANENT或者NUD_IN_TIMER位，或者此表项是由外部模块添加的，不执行回收操作。

对于状态位为NUD_PERMANENT的表项，可能是接口自身的IP地址与MAC的表项，或者用户通过ip neigh命令所添加。对于状态位NUD_IN_TIMER，表明此表项还在表项自身的定时器处理控制中，暂不需回收处理。对于表项标志位NTF_EXT_LEARNED，表明此表项为外部的VXLAN或者Switchdev等模块所添加，由这些模块自行删除。

    for (i = 0 ; i < (1 << nht->hash_shift); i++) {
        np = &nht->hash_buckets[i];

        while ((n = rcu_dereference_protected(*np,
                lockdep_is_held(&tbl->lock))) != NULL) {
            unsigned int state;

            write_lock(&n->lock);

            state = n->nud_state;
            if ((state & (NUD_PERMANENT | NUD_IN_TIMER)) ||
                (n->flags & NTF_EXT_LEARNED)) {
                write_unlock(&n->lock);
                goto next_elt;
            }

当以上条件不满足时，先行更新一下表项的使用时间（如果used早于confirmed，则将used更新为confirmed）。之后，如果表项的引用计数为1，并且满足以下两个条件之一：状态等于NUD_FAILED（注意，这里表明仅此一个状态位），或者表项已经超过gc_staletime定义的时长没有使用过了，则进行回收处理。

            if (time_before(n->used, n->confirmed))
                n->used = n->confirmed;

            if (refcount_read(&n->refcnt) == 1 &&
                (state == NUD_FAILED ||
                 time_after(jiffies, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) {
                *np = n->next;
                neigh_mark_dead(n);
                write_unlock(&n->lock);
                neigh_cleanup_and_release(n);
                continue;
            }
            write_unlock(&n->lock);

next_elt:
            np = &n->next;
        }
        /* It's fine to release lock here, even if hash table grows while we are preempted.
         */
        write_unlock_bh(&tbl->lock);
        cond_resched();
        write_lock_bh(&tbl->lock);
        nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock));
    }

gc_thresh2/gc_thresh3阈值

如下邻居表强制回收函数neigh_forced_gc，强制清理邻居表的gc_list链表中的表项，最大清理数量为gc_entries中超出gc_thresh2定义的数量的部分，被回收的表项需要满足两个条件：1）引用计数为1；2）状态等于NUD_FAILED或者已超过5秒没有更新了。last_flush记录此次强制回收的时间戳。

static int neigh_forced_gc(struct neigh_table *tbl)
{
    int max_clean = atomic_read(&tbl->gc_entries) - tbl->gc_thresh2;
    unsigned long tref = jiffies - 5 * HZ;
    struct neighbour *n, *tmp;

    NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs);

    write_lock_bh(&tbl->lock);

    list_for_each_entry_safe(n, tmp, &tbl->gc_list, gc_list) {
        if (refcount_read(&n->refcnt) == 1) {
            bool remove = false;

            write_lock(&n->lock);
            if ((n->nud_state == NUD_FAILED) ||
                time_after(tref, n->updated))
                remove = true;
            write_unlock(&n->lock);

            if (remove && neigh_remove_one(n, tbl))
                shrunk++;
            if (shrunk >= max_clean)
                break;
        }
    }

    tbl->last_flush = jiffies;

以上强制回收函数在neigh_alloc中调用，当gc_entries数量大于等于gc_thresh3阈值时，进行强制回收；或者gc_entries大于等于gc_thresh2，并且距离上一次强制回收超过了5秒钟的时长，也进行强制回收。

如果强制回收函数neigh_forced_gc未能成功回收任何表项，并且gc_entries大于等于gc_thresh3阈值，打印警告信息。

static struct neighbour *neigh_alloc(struct neigh_table *tbl,
                     struct net_device *dev, bool exempt_from_gc)
{
    struct neighbour *n = NULL;
    unsigned long now = jiffies;

    if (exempt_from_gc)
        goto do_alloc;

    entries = atomic_inc_return(&tbl->gc_entries) - 1;
    if (entries >= tbl->gc_thresh3 ||
        (entries >= tbl->gc_thresh2 &&
         time_after(now, tbl->last_flush + 5 * HZ))) {
        if (!neigh_forced_gc(tbl) &&
            entries >= tbl->gc_thresh3) {
            net_info_ratelimited("%s: neighbor table overflow!\n", tbl->id);
            NEIGH_CACHE_STAT_INC(tbl, table_fulls);
            goto out_entries;
        }
    }

如下可见，此函数中将初始化邻居表项自身的gc_list，并且在函数开头处增加了gc_entries计数。

do_alloc:
    n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC);
    if (!n)
        goto out_entries;
    ...
    n->dead       = 1;
    INIT_LIST_HEAD(&n->gc_list);

    atomic_inc(&tbl->entries);
out:
    return n;

out_entries:
    if (!exempt_from_gc)
        atomic_dec(&tbl->gc_entries);

以下函数neigh_add处理应用层面的表项添加，如果用户设置了状态位NUD_PERMANENT，或者标志位NTF_EXT_LEARNED，在邻居表项分配函数中将跳过以上描述的回收检查。

static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
             struct netlink_ext_ack *extack)
{
    ...
    neigh = neigh_lookup(tbl, dst, dev);
    if (neigh == NULL) {
        bool exempt_from_gc;

        if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
            err = -ENOENT;
            goto out;
        }

        exempt_from_gc = ndm->ndm_state & NUD_PERMANENT ||
                 ndm->ndm_flags & NTF_EXT_LEARNED;
        neigh = ___neigh_create(tbl, dst, dev, exempt_from_gc, true);

在内核自身使用的表项创建函数中，如果exempt_from_gc为零，将表项链接到邻居表的gc_list中，在内核函数__neigh_create中，将exempt_from_gc固定为false，所以，内核创建的表项初始时都在gc_list链表上。

static struct neighbour *___neigh_create(struct neigh_table *tbl,
                     const void *pkey,
                     struct net_device *dev,
                     bool exempt_from_gc, bool want_ref)
{
    struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev, exempt_from_gc);

    n->dead = 0;
    if (!exempt_from_gc)
        list_add_tail(&n->gc_list, &n->tbl->gc_list);

gc_list更新

在邻居表项更新函数__neigh_update中，如果表项的状态位NUD_PERMANENT发生变化，或者外部学习属性发生变化（ext_learn_change），就需要更新邻居表的gc_list。

static int __neigh_update(struct neighbour *neigh, const u8 *lladdr,
              u8 new, u32 flags, u32 nlmsg_pid,
              struct netlink_ext_ack *extack)
{
    ...
    if (((new ^ old) & NUD_PERMANENT) || ext_learn_change)
        neigh_update_gc_list(neigh);
	 
    if (notify)
        neigh_update_notify(neigh, nlmsg_pid);

如下neigh_update_gc_list函数，如果表项设置了状态位NUD_PERMANENT，或者设置了外部标志位NTF_EXT_LEARNED，将其由gc_list中移除，表明不能进行回收。否则，将其添加到gc_list链表的末尾，由于回收操作由链表头部开始，更新过的表项最后进行回收处理。

static void neigh_update_gc_list(struct neighbour *n)
{
    bool on_gc_list, exempt_from_gc;

    write_lock_bh(&n->tbl->lock);
    write_lock(&n->lock);

    /* remove from the gc list if new state is permanent or if neighbor
     * is externally learned; otherwise entry should be on the gc list
     */
    exempt_from_gc = n->nud_state & NUD_PERMANENT ||
             n->flags & NTF_EXT_LEARNED;
    on_gc_list = !list_empty(&n->gc_list);

    if (exempt_from_gc && on_gc_list) {
        list_del_init(&n->gc_list);
        atomic_dec(&n->tbl->gc_entries);
    } else if (!exempt_from_gc && !on_gc_list) {
        /* add entries to the tail; cleaning removes from the front */
        list_add_tail(&n->gc_list, &n->tbl->gc_list);
        atomic_inc(&n->tbl->gc_entries);
    }

函数neigh_mark_dead负责将表项由链表gc_list中移除。当执行清理操作时，将使用到此函数；另外，在应用层使用ip命令删除指定表项时，也使用到此函数。

/*
 * Flag a neighbour entry as dead and, if it is currently linked on a
 * gc list, unlink it and decrement the owning table's gc_entries counter.
 */
static void neigh_mark_dead(struct neighbour *n)
{
    n->dead = 1;

    /* Nothing more to do when the entry is not linked on a gc list. */
    if (list_empty(&n->gc_list))
        return;

    /* Unlink first, then keep the table's gc_entries count in step. */
    list_del_init(&n->gc_list);
    atomic_dec(&n->tbl->gc_entries);
}

内核版本 5.0