内核中存在3个阈值控制邻居表项的回收:
- gc_thresh1 表示最小可保留的表项数量,如果表项数量小于此值GC(Garbage collector)不进行回收操作,默认为128;
- gc_thresh2 当表项数量超过此值时,GC将会回收超过5秒未更新的表项,默认为512;
- gc_thresh3 最大可允许的非永久表项数量。如果系统拥有庞大的接口数量,或者直连了大量的设备,应增大此值。默认值为1024。
另外,gc_interval按照arp(7)手册的说明,是GC尝试运行的时间间隔,默认值为30秒钟;不过在下文摘录的代码中,周期回收work的首次调度使用的是parms.reachable_time,并未见到gc_interval被直接使用。
对于IPv4,可通过以下PROC文件查看和修改以上4个值:
$ cat /proc/sys/net/ipv4/neigh/default/gc_interval
30
$ cat /proc/sys/net/ipv4/neigh/default/gc_thresh1
128
$ cat /proc/sys/net/ipv4/neigh/default/gc_thresh2
512
$ cat /proc/sys/net/ipv4/neigh/default/gc_thresh3
1024
在arp的全局邻居表变量arp_tbl中,初始化了这四个值。
struct neigh_table arp_tbl = {
.family = AF_INET,
.key_len = 4,
.protocol = cpu_to_be16(ETH_P_IP),
.hash = arp_hash,
.key_eq = arp_key_eq,
.constructor = arp_constructor,
.proxy_redo = parp_redo,
.id = "arp_cache",
.parms = {
...
},
.gc_interval = 30 * HZ,
.gc_thresh1 = 128,
.gc_thresh2 = 512,
.gc_thresh3 = 1024,
gc_thresh1阈值
在邻居表初始化函数neigh_table_init中,初始化一个延迟work定期进行回收处理,处理函数为neigh_periodic_work。
void neigh_table_init(int index, struct neigh_table *tbl)
{
...
INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work);
queue_delayed_work(system_power_efficient_wq, &tbl->gc_work,
tbl->parms.reachable_time);
tbl->last_flush = now;
首先,如果邻居表中的表项数量entries小于gc_thresh1阈值,不进行回收处理,结束执行。
static void neigh_periodic_work(struct work_struct *work)
{
struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work);
struct neighbour *n;
struct neighbour __rcu **np;
struct neigh_hash_table *nht;
NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs);
write_lock_bh(&tbl->lock);
nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock));
...
if (atomic_read(&tbl->entries) < tbl->gc_thresh1)
goto out;
否则,遍历邻居表的hash桶,即每个桶中的表项链表,如果表项的状态标志位设置了NUD_PERMANENT或者NUD_IN_TIMER位,或者此表项是由外部模块添加的,不执行回收操作。
对于状态位为NUD_PERMANENT的表项,可以是接口自身的IP地址与MAC的表项,或者是用户通过ip neigh命令所添加的表项。对于状态位NUD_IN_TIMER,表明此表项还在表项自身的定时器处理控制中,暂不需回收处理。对于表项标志位NTF_EXT_LEARNED,表明此表项为外部的VXLAN或者Switchdev等模块所添加,由这些模块自行删除。
for (i = 0 ; i < (1 << nht->hash_shift); i++) {
np = &nht->hash_buckets[i];
while ((n = rcu_dereference_protected(*np,
lockdep_is_held(&tbl->lock))) != NULL) {
unsigned int state;
write_lock(&n->lock);
state = n->nud_state;
if ((state & (NUD_PERMANENT | NUD_IN_TIMER)) ||
(n->flags & NTF_EXT_LEARNED)) {
write_unlock(&n->lock);
goto next_elt;
}
当以上条件不满足时,先行更新一下表项的使用时间(如果used早于confirmed,则将used更新为confirmed)。如果表项的引用计数为1,并且满足以下两个条件之一,则进行回收处理:1)状态等于NUD_FAILED(注意,这里表明仅此一个状态位);2)表项已经超过gc_staletime定义的时长没有使用过了。
if (time_before(n->used, n->confirmed))
n->used = n->confirmed;
if (refcount_read(&n->refcnt) == 1 &&
(state == NUD_FAILED ||
time_after(jiffies, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) {
*np = n->next;
neigh_mark_dead(n);
write_unlock(&n->lock);
neigh_cleanup_and_release(n);
continue;
}
write_unlock(&n->lock);
next_elt:
np = &n->next;
}
/* It's fine to release lock here, even if hash table grows while we are preempted.
*/
write_unlock_bh(&tbl->lock);
cond_resched();
write_lock_bh(&tbl->lock);
nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock));
}
gc_thresh2/gc_thresh3阈值
如下邻居表强制回收函数neigh_forced_gc,强制清理邻居表的gc_list链表中的表项,最大清理数量为gc_entries中超出gc_thresh2定义的数量的部分,被回收的表项需要满足两个条件:1)引用计数为1;2)状态等于NUD_FAILED或者已超过5秒没有更新了。last_flush记录此次强制回收的时间戳。
static int neigh_forced_gc(struct neigh_table *tbl)
{
int max_clean = atomic_read(&tbl->gc_entries) - tbl->gc_thresh2;
unsigned long tref = jiffies - 5 * HZ;
struct neighbour *n, *tmp;
NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs);
write_lock_bh(&tbl->lock);
list_for_each_entry_safe(n, tmp, &tbl->gc_list, gc_list) {
if (refcount_read(&n->refcnt) == 1) {
bool remove = false;
write_lock(&n->lock);
if ((n->nud_state == NUD_FAILED) ||
time_after(tref, n->updated))
remove = true;
write_unlock(&n->lock);
if (remove && neigh_remove_one(n, tbl))
shrunk++;
if (shrunk >= max_clean)
break;
}
}
tbl->last_flush = jiffies;
以上强制回收函数在neigh_alloc中调用,当gc_entries数量大于等于gc_thresh3阈值时,进行强制回收;或者gc_entries大于等于gc_thresh2,并且距离上一次强制回收超过了5秒钟的时长,也进行强制回收。
如果强制回收函数neigh_forced_gc未能成功回收任何表项,并且gc_entries大于等于gc_thresh3阈值,打印警告信息。
static struct neighbour *neigh_alloc(struct neigh_table *tbl,
struct net_device *dev, bool exempt_from_gc)
{
struct neighbour *n = NULL;
unsigned long now = jiffies;
if (exempt_from_gc)
goto do_alloc;
entries = atomic_inc_return(&tbl->gc_entries) - 1;
if (entries >= tbl->gc_thresh3 ||
(entries >= tbl->gc_thresh2 &&
time_after(now, tbl->last_flush + 5 * HZ))) {
if (!neigh_forced_gc(tbl) &&
entries >= tbl->gc_thresh3) {
net_info_ratelimited("%s: neighbor table overflow!\n", tbl->id);
NEIGH_CACHE_STAT_INC(tbl, table_fulls);
goto out_entries;
}
}
如下可见,此函数中将初始化邻居表项自身的gc_list,并且在函数开头处增加了gc_entries计数。
do_alloc:
n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC);
if (!n)
goto out_entries;
...
n->dead = 1;
INIT_LIST_HEAD(&n->gc_list);
atomic_inc(&tbl->entries);
out:
return n;
out_entries:
if (!exempt_from_gc)
atomic_dec(&tbl->gc_entries);
以下函数neigh_add处理应用层面的表项添加,如果用户设置了状态位NUD_PERMANENT,或者标志位NTF_EXT_LEARNED,在邻居表项分配函数中将跳过以上描述的回收检查。
static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
...
neigh = neigh_lookup(tbl, dst, dev);
if (neigh == NULL) {
bool exempt_from_gc;
if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
err = -ENOENT;
goto out;
}
exempt_from_gc = ndm->ndm_state & NUD_PERMANENT ||
ndm->ndm_flags & NTF_EXT_LEARNED;
neigh = ___neigh_create(tbl, dst, dev, exempt_from_gc, true);
在内核自身使用的表项创建函数中,如果exempt_from_gc为零,将表项链接到邻居表的gc_list中,在内核函数__neigh_create中,将exempt_from_gc固定为false,所以,内核创建的表项初始时都在gc_list链表上。
static struct neighbour *___neigh_create(struct neigh_table *tbl,
const void *pkey,
struct net_device *dev,
bool exempt_from_gc, bool want_ref)
{
struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev, exempt_from_gc);
n->dead = 0;
if (!exempt_from_gc)
list_add_tail(&n->gc_list, &n->tbl->gc_list);
gc_list更新
在邻居表项更新函数__neigh_update中,如果表项的状态位NUD_PERMANENT发生变化,或者外部学习属性(NTF_EXT_LEARNED)发生变化,就需要更新邻居表的gc_list。
static int __neigh_update(struct neighbour *neigh, const u8 *lladdr,
u8 new, u32 flags, u32 nlmsg_pid,
struct netlink_ext_ack *extack)
{
...
if (((new ^ old) & NUD_PERMANENT) || ext_learn_change)
neigh_update_gc_list(neigh);
if (notify)
neigh_update_notify(neigh, nlmsg_pid);
如下neigh_update_gc_list函数,如果表项设置了状态位NUD_PERMANENT,或者设置了外部标志位NTF_EXT_LEARNED,将其由gc_list中移除,表明不能进行回收。否则,将其添加到gc_list链表的末尾,由于回收操作由链表头部开始,更新过的表项最后进行回收处理。
static void neigh_update_gc_list(struct neighbour *n)
{
bool on_gc_list, exempt_from_gc;
write_lock_bh(&n->tbl->lock);
write_lock(&n->lock);
/* remove from the gc list if new state is permanent or if neighbor
* is externally learned; otherwise entry should be on the gc list
*/
exempt_from_gc = n->nud_state & NUD_PERMANENT ||
n->flags & NTF_EXT_LEARNED;
on_gc_list = !list_empty(&n->gc_list);
if (exempt_from_gc && on_gc_list) {
list_del_init(&n->gc_list);
atomic_dec(&n->tbl->gc_entries);
} else if (!exempt_from_gc && !on_gc_list) {
/* add entries to the tail; cleaning removes from the front */
list_add_tail(&n->gc_list, &n->tbl->gc_list);
atomic_inc(&n->tbl->gc_entries);
}
函数neigh_mark_dead负责将表项由链表gc_list中移除。当执行清理操作时,将使用到此函数;另外,在应用层使用ip命令删除指定表项时,也使用到此函数。
/*
 * neigh_mark_dead - flag a neighbour entry as dead and drop it from GC tracking.
 * @n: the neighbour entry being torn down
 *
 * Sets n->dead and, if the entry's gc_list node is currently linked, unlinks
 * it and decrements the owning table's gc_entries counter, so the counter
 * stays in step with the entries that are linked for garbage collection.
 *
 * NOTE(review): no locking is taken here; the visible caller in
 * neigh_periodic_work() invokes it with tbl->lock and n->lock held -- confirm
 * the same holds for other callers.
 */
static void neigh_mark_dead(struct neighbour *n)
{
n->dead = 1;
/* Only entries still linked through gc_list are counted in gc_entries. */
if (!list_empty(&n->gc_list)) {
list_del_init(&n->gc_list);
atomic_dec(&n->tbl->gc_entries);
}
}
内核版本 5.0