注:本文分析基于3.10.0-693.el7内核版本,即CentOS 7.4
1、关于drop_caches
通常在内存不足时,我们习惯通过echo 3 > /proc/sys/vm/drop_caches 的方式手动清理系统缓存,
[root@localhost ~]# free -m
total used free shared buff/cache available
Mem: 7822 3436 2068 40 2317 3997
Swap: 0 0 0
[root@localhost ~]# echo 3 > /proc/sys/vm/drop_caches
[root@localhost ~]# free -m
total used free shared buff/cache available
Mem: 7822 3433 4036 40 352 4037
Swap: 0 0 0
对于数字3的具体含义,可以查阅内核文档(Documentation/sysctl/vm.txt)中的说明:
To free pagecache:
echo 1 > /proc/sys/vm/drop_caches
To free reclaimable slab objects (includes dentries and inodes):
echo 2 > /proc/sys/vm/drop_caches
To free slab objects and pagecache:
echo 3 > /proc/sys/vm/drop_caches
2、释放pagecache
在之前我们知道当内存低于某个阈值时,会触发脏页回写,提交回写work到对应BDI设备上,由BDI writeback进程回写脏页释放内存。这和drop_caches中的echo 1有相似之处,都是为了释放pagecache占用的内存,因此两者的路径有相通的地方。(注:按照内核文档的说明,drop_caches是非破坏性操作,不会释放脏对象;如果希望释放更多缓存,应先执行sync把脏页写回磁盘。)
/*
 * sysctl handler behind /proc/sys/vm/drop_caches.
 *
 * proc_dointvec_minmax() is called first to process the request; a non-zero
 * result from it is returned unchanged. Nothing further happens on a read.
 * On a write, sysctl_drop_caches is interpreted as a bit mask:
 *   bit 0 (value 1) - run drop_pagecache_sb() on every superblock
 *   bit 1 (value 2) - run drop_slab()
 *   bit 2 (value 4) - suppress the informational log line on later writes
 *
 * Returns 0 on success, otherwise the error from proc_dointvec_minmax().
 */
int drop_caches_sysctl_handler(ctl_table *table, int write,
	void __user *buffer, size_t *length, loff_t *ppos)
{
	/* Sticky flag: once non-zero, the pr_info() below is never printed again. */
	static int stfu;
	int err = proc_dointvec_minmax(table, write, buffer, length, ppos);

	if (err)
		return err;

	/* Reads only report the current value; no cache is dropped. */
	if (!write)
		return 0;

	/* echo 1 (or 3): drop the page cache, superblock by superblock. */
	if (sysctl_drop_caches & 1) {
		iterate_supers(drop_pagecache_sb, NULL);
		count_vm_event(DROP_PAGECACHE);
	}

	/* echo 2 (or 3): shrink the slab caches. */
	if (sysctl_drop_caches & 2) {
		drop_slab();
		count_vm_event(DROP_SLAB);
	}

	/* Log who asked for the drop, unless logging was silenced earlier. */
	if (!stfu) {
		pr_info("%s (%d): drop_caches: %d\n",
			current->comm, task_pid_nr(current),
			sysctl_drop_caches);
	}

	/*
	 * Writing a value with bit 2 set silences the message for all
	 * subsequent writes; it has no effect on the reclaim itself.
	 */
	stfu |= sysctl_drop_caches & 4;

	return 0;
}
可见,echo 1时,会调用drop_pagecache_sb去释放pagecache,我们继续往下查,
drop_pagecache_sb ->
iput ->
iput_final->
write_inode_now -> #提交writeback_control,立即回写
writeback_single_inode ->
__writeback_single_inode ->
do_writepages #调用对应文件系统的writepage写回磁盘
在BDI回写里,一开始提交的是wb_writeback_work,等到实际要执行回写操作时,都会转换为writeback_control,再去执行回写。
因此,echo 1的操作就是,遍历每个超级块,调用drop_pagecache_sb,drop_pagecache_sb中会遍历该超级块所有的inode,对其关联的pagecache进行回写。与BDI不同的是,该操作是立马执行,不需要等待周期执行或者inode过期。
3、释放slab cache
而对于echo 2的情况,就比较复杂一点,
/*
 * Shrink the slab caches until a pass stops making meaningful progress.
 *
 * shrink_slab() is called repeatedly with nr_pages_scanned == lru_pages ==
 * 1000. The loop ends as soon as a single pass reports 10 or fewer objects;
 * it says nothing about how many reclaimable objects remain afterwards.
 */
static void drop_slab(void)
{
	struct shrink_control shrink = {
		.gfp_mask = GFP_KERNEL,
	};
	int freed;

	for (;;) {
		freed = shrink_slab(&shrink, 1000, 1000);
		/* A pass that freed at most 10 objects is treated as "done". */
		if (freed <= 10)
			break;
	}
}
// Run every shrinker on shrinker_list and reclaim cached objects from each.
//
// shrink:           shrink control passed through to each shrinker callback
// nr_pages_scanned: scaled into the per-shrinker scan budget (see delta below)
// lru_pages:        divisor (plus one) for that budget
//
// Returns the accumulated drop in object counts reported by the shrinkers
// across all batches processed in this call.
//
// NOTE(review): this is an excerpt; the "..." line below stands for code that
// is not shown here (presumably including the acquisition of shrinker_rwsem
// and the jump to the "out" label) - confirm against the full kernel source.
unsigned long shrink_slab(struct shrink_control *shrink,
unsigned long nr_pages_scanned,
unsigned long lru_pages)
{
struct shrinker *shrinker;
unsigned long ret = 0;
...
// Walk every shrinker registered on shrinker_list.
list_for_each_entry(shrinker, &shrinker_list, list) {
unsigned long long delta;
long total_scan;
long max_pass;
int shrink_ret = 0;
long nr;
long new_nr;
// Batch size: the shrinker's own value if non-zero, otherwise SHRINK_BATCH.
// (The original note states the default is 128 and the superblock shrinker
// uses 1024; those values are not visible in this excerpt.)
long batch_size = shrinker->batch ? shrinker->batch
: SHRINK_BATCH;
// Query call (nr_to_scan == 0): the result is used as the number of objects
// this shrinker could reclaim. Skip the shrinker if it reports none.
max_pass = do_shrinker_shrink(shrinker, shrink, 0);
if (max_pass <= 0)
continue;
// Take over the scan budget left from an earlier call and reset it to 0.
nr = atomic_long_xchg(&shrinker->nr_in_batch, 0);
total_scan = nr;
// Compute this call's additional scan budget:
//   delta = (4 * nr_pages_scanned / seeks) * max_pass / (lru_pages + 1)
// With drop_slab()'s arguments (1000, 1000) this is roughly (4 / seeks) *
// max_pass, i.e. about twice max_pass when seeks is 2.
// NOTE(review): the original note says kswapd and other callers stay at or
// below 1x max_pass; that cannot be verified from this excerpt.
delta = (4 * nr_pages_scanned) / shrinker->seeks;
delta *= max_pass;
do_div(delta, lru_pages + 1);
total_scan += delta;
// A negative budget is reported as an error and replaced by max_pass.
if (total_scan < 0) {
printk(KERN_ERR "shrink_slab: %pF negative objects to "
"delete nr=%ld\n",
shrinker->shrink, total_scan);
total_scan = max_pass;
}
// When this call's own share (delta) is below a quarter of max_pass, limit
// the total budget (which also contains the carried-over nr) to half of
// max_pass.
if (delta < max_pass / 4)
total_scan = min(total_scan, max_pass / 2);
// Hard upper bound of twice max_pass, which also bounds the loop below.
if (total_scan > max_pass * 2)
total_scan = max_pass * 2;
trace_mm_shrink_slab_start(shrinker, shrink, nr,
nr_pages_scanned, lru_pages,
max_pass, delta, total_scan);
// Reclaim one batch at a time while at least a full batch of budget remains.
while (total_scan >= batch_size) {
int nr_before;
// Object count before this batch (query call).
nr_before = do_shrinker_shrink(shrinker, shrink, 0);
// Scan batch_size objects; the return value is used as the count afterwards.
shrink_ret = do_shrinker_shrink(shrinker, shrink,
batch_size);
// A return of -1 ends the loop for this shrinker.
if (shrink_ret == -1)
break;
// Credit the drop in the reported count to the running total.
if (shrink_ret < nr_before)
ret += nr_before - shrink_ret;
count_vm_events(SLABS_SCANNED, batch_size);
// Consume one batch of budget and give other tasks a chance to run.
total_scan -= batch_size;
cond_resched();
}
// Budget that is left over is added back to nr_in_batch so that a later call
// can use it; otherwise just read the current value for the tracepoint.
if (total_scan > 0)
new_nr = atomic_long_add_return(total_scan,
&shrinker->nr_in_batch);
else
new_nr = atomic_long_read(&shrinker->nr_in_batch);
trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
}
// Release shrinker_rwsem; the matching acquisition is not shown in this excerpt.
up_read(&shrinker_rwsem);
out:
cond_resched();
return ret;
}
空闲slab缓存的计算和回收都是在do_shrinker_shrink中完成的。它实际上调用的是一个函数指针(shrinker->shrink),不同slab管理区有自己定义的shrink函数:第三个入参nr_to_scan为0时,只计算可回收的slab缓存数量;不为0时,表示本次要扫描和回收的缓存数量。
/*
 * Invoke a shrinker's callback with the requested scan count.
 *
 * nr_to_scan is stored in sc->nr_to_scan before shrinker->shrink is called.
 * The callback's result is returned as is, except that any value below -1
 * is reported as INT_MAX.
 */
static inline int do_shrinker_shrink(struct shrinker *shrinker,
				     struct shrink_control *sc,
				     unsigned long nr_to_scan)
{
	int result;

	sc->nr_to_scan = nr_to_scan;
	result = shrinker->shrink(shrinker, sc);

	/* -1 is passed through to the caller; anything lower becomes INT_MAX. */
	return (result < -1) ? INT_MAX : result;
}
总的来说,drop_slab就是调用每个slab管理区定义的shrink函数,先计算出可回收的slab缓存数量,然后确定扫描数量,最后调用shrink函数执行缓存扫描和回收。