Note: this analysis is based on kernel 3.10.0-693.el7, i.e. CentOS 7.4.
1. Zone memory watermarks
Each NUMA node in the system is divided into zones, and every zone's memory has a set of watermarks. When free memory crosses one of these thresholds, a corresponding action is triggered: when free memory falls below the low watermark, kswapd is woken to reclaim memory in the background; when it falls below the min watermark, allocations enter direct reclaim; kswapd keeps reclaiming until free memory climbs back above the high watermark. The min, low, and high watermarks of every zone can be inspected through /proc/zoneinfo:
[root@centos7 ~]# cat /proc/zoneinfo | grep -E "Node|min|low|high "
Node 0, zone DMA
min 92
low 115
high 138
Node 0, zone DMA32
min 10839
low 13548
high 16258
Node 1, zone DMA32
min 5706
low 7132
high 8559
Node 1, zone Normal
min 5890
low 7362
high 8835
All of these watermarks are derived from the /proc/sys/vm/min_free_kbytes parameter.
2. Zone reserved memory (lowmem_reserve)
When allocating memory, the kernel may fall back across zones. For example, if we ask for an order-6 block from the Normal zone, but Normal is under pressure and cannot supply a block that large, the allocator falls back down to the DMA32 zone. Once or twice this is harmless, but if every allocation did it, DMA32 would eventually be drained, and applications that later need memory from that zone, especially those that can only use a specific zone, would fail to allocate. Cross-zone fallback therefore has to be throttled: before letting an allocation fall back into a lower zone, the allocator checks that zone's lowmem_reserve entry for the requesting zone. These reserve values are controlled by /proc/sys/vm/lowmem_reserve_ratio, and the sketch below shows how they enter the allocation decision.
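To make the check concrete, here is a minimal userspace sketch (hypothetical struct and function names, modeled on the core condition of __zone_watermark_ok() in mm/page_alloc.c) of how a watermark and the lowmem_reserve entry for the requesting zone together gate a fallback allocation:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical userspace model of a zone; not kernel code. */
struct zone_model {
    long free_pages;          /* NR_FREE_PAGES */
    long watermark_min;       /* watermark[WMARK_MIN], in pages */
    long lowmem_reserve[3];   /* indexed by requesting zone: DMA/DMA32/NORMAL */
};

/*
 * Core condition from __zone_watermark_ok(): an allocation classified as
 * coming from zone classzone_idx may take pages from zone z only if the
 * zone would still keep at least watermark + lowmem_reserve[classzone_idx]
 * pages free afterwards.
 */
static bool can_allocate(const struct zone_model *z, int order, int classzone_idx)
{
    return z->free_pages - (1L << order) >=
           z->watermark_min + z->lowmem_reserve[classzone_idx];
}

int main(void)
{
    /* A DMA32 zone holding 990 pages in reserve against NORMAL fallbacks. */
    struct zone_model dma32 = { 7000, 5703, { 0, 0, 990 } };

    /* A NORMAL-class request (classzone_idx == 2) must clear 5703 + 990. */
    printf("order-0 NORMAL fallback allowed: %d\n", can_allocate(&dma32, 0, 2));
    return 0;
}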
3. How the kernel initializes the zone watermarks
During initialization the kernel calls init_per_zone_wmark_min to set up the min, low, and high watermarks of every zone, and at the same time to set each zone's lowmem_reserve values.
/*
* For small machines we want it small (128k min). For large machines
* we want it large (64MB max). But it is not linear, because network
* bandwidth does not increase linearly with machine size. We use
*
* min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
* min_free_kbytes = sqrt(lowmem_kbytes * 16)
*/
int __meminit init_per_zone_wmark_min(void)
{
    unsigned long lowmem_kbytes;
    int new_min_free_kbytes;

    //nr_free_buffer_pages(): sum over all zones of managed pages above the
    //high watermark; converted to KB here
    lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
    //compute new_min_free_kbytes according to the formula above
    new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);

    if (new_min_free_kbytes > user_min_free_kbytes) {
        min_free_kbytes = new_min_free_kbytes;
        //at least 128KB
        if (min_free_kbytes < 128)
            min_free_kbytes = 128;
        //at most 64MB; this clamp only applies at initialization, values
        //outside this range can still be set via the proc interface
        if (min_free_kbytes > 65536)
            min_free_kbytes = 65536;
    } else {
        pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
            new_min_free_kbytes, user_min_free_kbytes);
    }
    //set each zone's min/low/high watermarks
    setup_per_zone_wmarks();
    refresh_zone_stat_thresholds();
    //set the memory each zone keeps in reserve against higher zones
    setup_per_zone_lowmem_reserve();
    setup_per_zone_inactive_ratio();
    return 0;
}
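As a quick sanity check of the formula: on a machine with roughly 16 GB of lowmem, lowmem_kbytes is about 16777216, so min_free_kbytes = sqrt(16777216 * 16) = 16384 KB, i.e. 16 MB, comfortably inside the clamp.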
So the value computed at initialization is clamped between 128 KB and 64 MB, but a value written through the proc interface is not subject to this limit:
int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
    void __user *buffer, size_t *length, loff_t *ppos)
{
    int rc;

    rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
    if (rc)
        return rc;

    if (write) {
        user_min_free_kbytes = min_free_kbytes;
        //recompute the min/low/high watermarks directly from the user's value
        setup_per_zone_wmarks();
    }
    return 0;
}
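In other words, a write takes effect immediately: the handler records the user's value and recomputes every zone's watermarks on the spot, so sysctl -w vm.min_free_kbytes=90112 is equivalent to the echo used in section 4 below.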
Next comes setup_per_zone_wmarks, which computes each zone's min, low, and high watermarks. Since multiple zones have to be covered, min_free_kbytes is distributed among them in proportion to their size.
void setup_per_zone_wmarks(void)
{
    mutex_lock(&zonelists_mutex);
    __setup_per_zone_wmarks();
    mutex_unlock(&zonelists_mutex);
}
static void __setup_per_zone_wmarks(void)
{
    unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
    unsigned long lowmem_pages = 0;
    struct zone *zone;
    unsigned long flags;

    //total up the managed pages of all non-ZONE_HIGHMEM zones
    for_each_zone(zone) {
        if (!is_highmem(zone))
            lowmem_pages += zone->managed_pages;
    }

    //set the min/low/high watermarks for each zone
    for_each_zone(zone) {
        u64 tmp;

        spin_lock_irqsave(&zone->lock, flags);
        //the next two statements give this zone a share of pages_min
        //proportional to its share of total lowmem, so the min watermarks
        //of all zones add up exactly to min_free_kbytes
        tmp = (u64)pages_min * zone->managed_pages;
        do_div(tmp, lowmem_pages);
        //64-bit machines have no highmem zone, so this branch is skipped
        if (is_highmem(zone)) {
            ...
        } else {
            //set the min watermark
            zone->watermark[WMARK_MIN] = tmp;
        }
        //low watermark = 5/4 of the min watermark
        zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
        //high watermark = 3/2 of the min watermark
        zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
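        //reset the NR_ALLOC_BATCH counter to (high - low); it paces the
        //fair per-zone allocation policy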
        __mod_zone_page_state(zone, NR_ALLOC_BATCH,
                      high_wmark_pages(zone) -
                      low_wmark_pages(zone) -
                      zone_page_state(zone, NR_ALLOC_BATCH));
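        //recompute the zone's MIGRATE_RESERVE pageblocks, whose count is
        //derived from the min watermark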
        setup_zone_migrate_reserve(zone);
        spin_unlock_irqrestore(&zone->lock, flags);
    }

    //refresh totalreserve_pages
    calculate_totalreserve_pages();
}
So, in summary (with 4 KB pages, min_free_kbytes/4 converts kilobytes to pages; a small userspace model of this split follows the list):
- watermark[WMARK_MIN] = min_free_kbytes/4 * zone.managed_pages/lowmem_pages
- watermark[WMARK_LOW] = 5/4 * watermark[WMARK_MIN]
- watermark[WMARK_HIGH] = 3/2 * watermark[WMARK_MIN]
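As a cross-check, here is a minimal userspace model (hypothetical names, integer arithmetic as in the kernel) that splits min_free_kbytes across zones and derives the three watermarks; the managed-pages figures and min_free_kbytes=90112 come from the system used in section 4:

#include <stdio.h>

//managed pages per zone, from the /proc/zoneinfo output in section 4
static const char *names[] = { "N0/DMA", "N0/DMA32", "N1/DMA32", "N1/Normal" };
static const unsigned long managed[] = { 3977, 467178, 245716, 253642 };

int main(void)
{
    unsigned long min_free_kbytes = 90112;
    unsigned long pages_min = min_free_kbytes >> 2;   /* KB -> 4KB pages */
    unsigned long lowmem_pages = 0;
    int i;

    for (i = 0; i < 4; i++)
        lowmem_pages += managed[i];

    for (i = 0; i < 4; i++) {
        //proportional share, as in __setup_per_zone_wmarks()
        unsigned long min = (unsigned long long)pages_min * managed[i] / lowmem_pages;
        printf("%-10s min=%lu low=%lu high=%lu\n",
               names[i], min, min + (min >> 2), min + (min >> 1));
    }
    return 0;
}

The printed min values (92, 10844, 5703, 5887) match the ones we verify on the live system in section 4.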
After the watermarks are set, totalreserve_pages is updated. This value estimates the memory the system must keep available to run normally, and it is consulted during overcommit handling to decide whether a given allocation should be allowed.
However, at this point the zones' lowmem_reserve values have not been set yet, so we will defer that analysis and first look at how lowmem_reserve is computed; after all, totalreserve_pages is updated again once lowmem_reserve is in place.
So let's step into setup_per_zone_lowmem_reserve and see how the lowmem_reserve values are computed:
/*
* setup_per_zone_lowmem_reserve - called whenever
* sysctl_lower_zone_reserve_ratio changes. Ensures that each zone
* has a correct pages reserved value, so an adequate number of
* pages are left in the zone after a successful __alloc_pages().
*/
static void setup_per_zone_lowmem_reserve(void)
{
    struct pglist_data *pgdat;
    enum zone_type j, idx;

    //iterate over every node
    for_each_online_pgdat(pgdat) {
        //iterate over every zone; assume the system has the zone types
        //ZONE_DMA, ZONE_DMA32 and ZONE_NORMAL
        for (j = 0; j < MAX_NR_ZONES; j++) {
            struct zone *zone = pgdat->node_zones + j;
            unsigned long managed_pages = zone->managed_pages;

            //j=0: zone[DMA].lowmem_reserve[DMA] = 0
            //j=1: zone[DMA32].lowmem_reserve[DMA32] = 0
            //j=2: zone[NORMAL].lowmem_reserve[NORMAL] = 0
            //i.e. a zone reserves nothing against allocations targeting itself
            zone->lowmem_reserve[j] = 0;

            idx = j;
            while (idx) {
                struct zone *lower_zone;

                idx--;
                if (sysctl_lowmem_reserve_ratio[idx] < 1)
                    sysctl_lowmem_reserve_ratio[idx] = 1;
                lower_zone = pgdat->node_zones + idx;
                //j=0: the loop is not entered
                //j=1, idx=0: zone[DMA].lowmem_reserve[DMA32] = zone[DMA32].pages/sysctl_lowmem_reserve_ratio[DMA]
                //j=2, idx=1: zone[DMA32].lowmem_reserve[NORMAL] = zone[NORMAL].pages/sysctl_lowmem_reserve_ratio[DMA32]
                //      idx=0: zone[DMA].lowmem_reserve[NORMAL] = zone[NORMAL+DMA32].pages/sysctl_lowmem_reserve_ratio[DMA]
                lower_zone->lowmem_reserve[j] = managed_pages /
                    sysctl_lowmem_reserve_ratio[idx];
                managed_pages += lower_zone->managed_pages;
            }
        }
    }

    //refresh totalreserve_pages once more
    calculate_totalreserve_pages();
}
Since fallback allocation only moves downward, never upward (a request that needs DMA memory can never be satisfied from DMA32 or Normal, and a request that needs DMA32 memory can never be satisfied from Normal), entries such as zone[DMA32].lowmem_reserve[DMA], zone[NORMAL].lowmem_reserve[DMA] and zone[NORMAL].lowmem_reserve[DMA32] stay unused (zero). Working through the loop above gives the following values:
zone[DMA].lowmem_reserve[DMA] = 0
zone[DMA].lowmem_reserve[DMA32] = zone[DMA32].pages/sysctl_lowmem_reserve_ratio[DMA]
zone[DMA].lowmem_reserve[NORMAL] = zone[NORMAL+DMA32].pages/sysctl_lowmem_reserve_ratio[DMA]
zone[DMA32].lowmem_reserve[DMA32] = 0
zone[DMA32].lowmem_reserve[NORMAL] = zone[NORMAL].pages/sysctl_lowmem_reserve_ratio[DMA32]
zone[NORMAL].lowmem_reserve[NORMAL] = 0
In short, lowmem_reserve exists to keep cross-zone fallback from draining the lower zones. The small model below replays the computation.
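The following userspace sketch (hypothetical names; zone sizes taken from node 1 of the machine in section 4, where the DMA zone is empty) replays the nested loop above to produce the protection arrays reported by /proc/zoneinfo:

#include <stdio.h>

#define NR_ZONES 3   /* DMA, DMA32, NORMAL */

int main(void)
{
    //managed pages of node 1 from section 4 (node 1 has no DMA memory)
    unsigned long managed_zone[NR_ZONES] = { 0, 245716, 253642 };
    //values of /proc/sys/vm/lowmem_reserve_ratio on that system
    unsigned long ratio[NR_ZONES] = { 256, 256, 32 };
    unsigned long reserve[NR_ZONES][NR_ZONES] = { { 0 } };
    const char *names[NR_ZONES] = { "DMA", "DMA32", "Normal" };
    int j, idx, k;

    //replay of the nested loop in setup_per_zone_lowmem_reserve()
    for (j = 0; j < NR_ZONES; j++) {
        unsigned long managed_pages = managed_zone[j];

        reserve[j][j] = 0;   /* no reserve against the zone itself */
        for (idx = j - 1; idx >= 0; idx--) {
            reserve[idx][j] = managed_pages / ratio[idx];
            managed_pages += managed_zone[idx];
        }
    }

    for (k = 0; k < NR_ZONES; k++)
        printf("%-6s protection: (%lu, %lu, %lu)\n", names[k],
               reserve[k][0], reserve[k][1], reserve[k][2]);
    return 0;
}

The DMA32 row it prints, (0, 0, 990), reproduces the protection array verified in section 4; the DMA row is not reported by /proc/zoneinfo because node 1 has no DMA pages.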
Once lowmem_reserve has been set, totalreserve_pages is updated again:
/*
* calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio
* or min_free_kbytes changes.
*/
static void calculate_totalreserve_pages(void)
{
    struct pglist_data *pgdat;
    unsigned long reserve_pages = 0;
    enum zone_type i, j;

    //iterate over every node
    for_each_online_pgdat(pgdat) {
        //iterate over every zone
        for (i = 0; i < MAX_NR_ZONES; i++) {
            struct zone *zone = pgdat->node_zones + i;
            unsigned long max = 0;

            /* Find valid and maximum lowmem_reserve in the zone */
            //take the largest amount this zone reserves for any higher zone
            for (j = i; j < MAX_NR_ZONES; j++) {
                if (zone->lowmem_reserve[j] > max)
                    max = zone->lowmem_reserve[j];
            }

            /* we treat the high watermark as reserved pages. */
            //the zone's high watermark plus its largest reserve counts
            //toward the memory the system must keep in hand
            max += high_wmark_pages(zone);
            if (max > zone->managed_pages)
                max = zone->managed_pages;
            reserve_pages += max;
            /*
             * Lowmem reserves are not available to
             * GFP_HIGHUSER page cache allocations and
             * kswapd tries to balance zones to their high
             * watermark. As a result, neither should be
             * regarded as dirtyable memory, to prevent a
             * situation where reclaim has to clean pages
             * in order to balance the zones.
             */
            zone->dirty_balance_reserve = max;
        }
    }
    dirty_balance_reserve = reserve_pages;
    //totalreserve_pages is consulted during overcommit accounting;
    //it represents the minimum the system needs to keep running
    totalreserve_pages = reserve_pages;
}
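To see where totalreserve_pages is finally consumed, here is a hedged userspace model (hypothetical names) of the heuristic overcommit check that __vm_enough_memory() performs under OVERCOMMIT_GUESS: the estimated free-plus-reclaimable memory is reduced by totalreserve_pages before being compared against the request.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical model of the OVERCOMMIT_GUESS check in __vm_enough_memory():
 * the pages the system must keep for itself (totalreserve_pages) are carved
 * out of the free estimate before the request is compared against it. */
static bool vm_enough_memory_guess(unsigned long free_estimate,
                                   unsigned long totalreserve_pages,
                                   unsigned long request_pages)
{
    if (free_estimate <= totalreserve_pages)
        return false;
    return free_estimate - totalreserve_pages > request_pages;
}

int main(void)
{
    //e.g. 100000 pages estimated free/reclaimable, 30000 pages reserved
    printf("%d\n", vm_enough_memory_guess(100000, 30000, 50000)); /* 1: allowed */
    printf("%d\n", vm_enough_memory_guess(100000, 30000, 80000)); /* 0: refused */
    return 0;
}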
4. Verification on a live system
The code analysis gave us the min, low, and high watermarks and each zone's lowmem_reserve; let's now check whether the values on a real system match the analysis.
First, the min/low/high watermarks versus min_free_kbytes, taking the min values as an example:
[root@centos7 ~]# echo 90112 > /proc/sys/vm/min_free_kbytes
[root@centos7 ~]# cat /proc/sys/vm/min_free_kbytes
90112
[root@centos7 ~]# cat /proc/zoneinfo | grep -E "Node|managed|min"
Node 0, zone DMA
min 92
managed 3977
Node 0, zone DMA32
min 10844
managed 467178
Node 1, zone DMA32
min 5703
managed 245716
Node 1, zone Normal
min 5887
managed 253642
From this we get:
Node[0].DMA.min    = 90112/4 * 3977/(3977+467178+245716+253642)   = 92
Node[0].DMA32.min  = 90112/4 * 467178/(3977+467178+245716+253642) = 10844
Node[1].DMA32.min  = 90112/4 * 245716/(3977+467178+245716+253642) = 5703
Node[1].Normal.min = 90112/4 * 253642/(3977+467178+245716+253642) = 5887
This matches our analysis.
Next, lowmem_reserve:
[root@centos7 ~]# cat /proc/zoneinfo | grep -E "Node|managed|protection"
Node 0, zone DMA
managed 3977
protection: (0, 1824, 1824, 1824)
Node 0, zone DMA32
managed 467178
protection: (0, 0, 0, 0)
Node 1, zone DMA32
managed 245716
protection: (0, 0, 990, 990)
Node 1, zone Normal
managed 253642
protection: (0, 0, 0, 0)
[root@centos7 ~]# cat /proc/sys/vm/lowmem_reserve_ratio
256 256 32
Note that this computation is done per node. Since there is no MOVABLE zone memory on this system, we skip that zone's parameters. We then have:
Node[0].zone[DMA].lowmem_reserve[DMA] = 0
Node[0].zone[DMA].lowmem_reserve[DMA32] = Node[0].zone[DMA32].pages/sysctl_lowmem_reserve_ratio[DMA] = 467178/256 = 1824
Node[0].zone[DMA].lowmem_reserve[NORMAL] = Node[0].zone[NORMAL+DMA32].pages/sysctl_lowmem_reserve_ratio[DMA] = (0+467178)/256 = 1824 (node 0 has no Normal zone pages)
Node[0].zone[DMA32].lowmem_reserve[DMA] = 0
Node[0].zone[DMA32].lowmem_reserve[DMA32] = 0
Node[0].zone[DMA32].lowmem_reserve[NORMAL] = Node[0].zone[NORMAL].pages/sysctl_lowmem_reserve_ratio[DMA32] = 0/256 = 0
Node[1].zone[DMA32].lowmem_reserve[DMA] = 0
Node[1].zone[DMA32].lowmem_reserve[DMA32] = 0
Node[1].zone[DMA32].lowmem_reserve[NORMAL] = Node[1].zone[NORMAL].pages/sysctl_lowmem_reserve_ratio[DMA32] = 253642/256 = 990
Again, the values match our analysis.