shrink_zone()函数用于扫描zone中所有可回收的页面。shrink_zone()函数函数中有大量的memcg相关函数,为了方便理解代码,我们假设系统没有打开CONFIG_MEMCG配置。
[kswapd()->balance_pgdat()->kswapd_shrink_zone()->shrink_zone()]
/*参数说明:
@zone:表示即将要扫描的zone
@sc: 表示扫描的控制参数
@is_classzone: 表示当前zone是否为balance_pgdat()刚开始计算的第一个处于非平衡状态的zone
*/
static bool shrink_zone(struct zone *zone, struct scan_control *sc,
bool is_classzone)
{
struct reclaim_state *reclaim_state = current->reclaim_state;
unsigned long nr_reclaimed, nr_scanned;
bool reclaimable = false;
/*下面是嵌套的while循环。大循环中,判断条件为should_continue_reclaim()函数,
通过这一轮的回收页面的数量和扫描页面的数量来判断是否需要继续扫描,下面我们查看
should_continue_reclaim函数*/
do {
struct mem_cgroup *root = sc->target_mem_cgroup;
struct mem_cgroup_reclaim_cookie reclaim = {
.zone = zone,
.priority = sc->priority,
};
unsigned long zone_lru_pages = 0;
struct mem_cgroup *memcg;
nr_reclaimed = sc->nr_reclaimed;
nr_scanned = sc->nr_scanned;
memcg = mem_cgroup_iter(root, NULL, &reclaim);
/*下面只有一个循环*/
do {
unsigned long lru_pages;
unsigned long scanned;
struct lruvec *lruvec;
int swappiness;
if (mem_cgroup_low(root, memcg)) {
if (!sc->may_thrash)
continue;
mem_cgroup_events(memcg, MEMCG_LOW, 1);
}
/*获取zone中LRU链表的数据结构,zone的数据结构中有成员lruvec。
struct lruvec数据结构包含了LRU链表,且zone数据结构有一个成员
指向struct lruvec数据结构。*/
lruvec = mem_cgroup_zone_lruvec(zone, memcg);
/*获取系统中的vm_swappiness参数,用于表示swap的活跃程度,这个
值从0到100,0表示匿名页面,不会往swap分区写入;100表示积极地向
swap分区写入匿名页面,通常默认值为60.*/
swappiness = mem_cgroup_swappiness(memcg);
scanned = sc->nr_scanned;
/*扫描LRU链表的核心函数, 下面我们查看此函数的实现*/
shrink_lruvec(lruvec, swappiness, sc, &lru_pages);
zone_lru_pages += lru_pages;
/*shrink_slab函数是调用内存管理系统中的shrinker接口,很多子系统会
注册shrinker接口来回收内存,例如Android系统的lower memory killer*/
if (memcg && is_classzone)
shrink_slab(sc->gfp_mask, zone_to_nid(zone),
memcg, sc->nr_scanned - scanned,
lru_pages);
/*
* Direct reclaim and kswapd have to scan all memory
* cgroups to fulfill the overall scan target for the
* zone.
*
* Limit reclaim, on the other hand, only cares about
* nr_to_reclaim pages to be reclaimed and it will
* retry with decreasing priority if one round over the
* whole hierarchy is not sufficient.
*/
if (!global_reclaim(sc) &&
sc->nr_reclaimed >= sc->nr_to_reclaim) {
mem_cgroup_iter_break(root, memcg);
break;
}
} while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
/*
* Shrink the slab caches in the same proportion that
* the eligible LRU pages were scanned.
*/
if (global_reclaim(sc) && is_classzone)
shrink_slab(sc->gfp_mask, zone_to_nid(zone), NULL,
sc->nr_scanned - nr_scanned,
zone_lru_pages);
if (reclaim_state) {
sc->nr_reclaimed += reclaim_state->reclaimed_slab;
reclaim_state->reclaimed_slab = 0;
}
vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
sc->nr_scanned - nr_scanned,
sc->nr_reclaimed - nr_reclaimed);
if (sc->nr_reclaimed - nr_reclaimed)
reclaimable = true;
} while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
sc->nr_scanned - nr_scanned, sc));
return reclaimable;
}
should_continue_reclaim()函数:判断是否需要继续扫描
[shrink_zone()->should_continue_reclaim()]
/*
* Reclaim/compaction is used for high-order allocation requests. It reclaims
* order-0 pages before compacting the zone. should_continue_reclaim() returns
* true if more pages should be reclaimed such that when the page allocator
* calls try_to_compact_zone() that it will have enough free pages to succeed.
* It will give up earlier than that if there is difficulty reclaiming pages.
*/
/*此函数的判断逻辑是如果已经回收的页面数量sc->nr_reclaimed小于(2 << sc->order)个页面,
且不活跃页面总数大于(2 << sc->order), 那么需要继续回收页面。
compaction_suitable()函数也会判断当前zone的水位,如果水位超过WMARK_LOW,那么会停止扫
描页面。compaction_suitable()函数会在内存规则中详细介绍*/
static inline bool should_continue_reclaim(struct zone *zone,
unsigned long nr_reclaimed,
unsigned long nr_scanned,
struct scan_control *sc)
{
unsigned long pages_for_compaction;
unsigned long inactive_lru_pages;
/* If not in reclaim/compaction mode, stop */
if (!in_reclaim_compaction(sc))
return false;
/* Consider stopping depending on scan and reclaim activity */
if (sc->gfp_mask & __GFP_REPEAT) {
/*
* For __GFP_REPEAT allocations, stop reclaiming if the
* full LRU list has been scanned and we are still failing
* to reclaim pages. This full LRU scan is potentially
* expensive but a __GFP_REPEAT caller really wants to succeed
*/
if (!nr_reclaimed && !nr_scanned)
return false;
} else {
/*
* For non-__GFP_REPEAT allocations which can presumably
* fail without consequence, stop if we failed to reclaim
* any pages from the last SWAP_CLUSTER_MAX number of
* pages that were scanned. This will return to the
* caller faster at the risk reclaim/compaction and
* the resulting allocation attempt fails
*/
if (!nr_reclaimed)
return false;
}
/*
* If we have not reclaimed enough pages for compaction and the
* inactive lists are large enough, continue reclaiming
*/
pages_for_compaction = (2UL << sc->order);
inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE);
if (get_nr_swap_pages() > 0)
inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON);
if (sc->nr_reclaimed < pages_for_compaction &&
inactive_lru_pages > pages_for_compaction)
return true;
/* If compaction would go ahead or the allocation would succeed, stop */
switch (compaction_suitable(zone, sc->order, 0, 0)) {
case COMPACT_PARTIAL:
case COMPACT_CONTINUE:
return false;
default:
return true;
}
}
回到shrink_zone函数
shrink_lruvec()函数:
get_scan_count()函数会根据swappiness参数和sc->priority优先级去计算4个LRU链表中应该扫描的页面页数。结果存放在nr[]数组中,扫描规则总结如下:
-
如果系统没有swap交换分区或SWAP空间,则不用扫描匿名页面。
-
如果zone_free+zone_lru_file 小于等于 watermark[WMARK_HIGH],那么只扫描匿名页面。
-
如果LRU_INACTIVE_FILE大于LRU_ACTIVE_FILE,那么只扫描文件映射页面。
扫描页面公式如下:
-
扫描一种页面:
scan = LRU上总页面 >> sc->priority
-
同时扫描两种页面:(计算扫描匿名页面和文件映射页面的比例,然后根据比例计算分别扫描的页面数量)
scan = LRU上总页面 >> sc->priority
ap = (swappiness*(recent_scanned[0] + 1)) / (recent_rotated[0] + 1)
fp = ((200-swappiness)*(recent_scanned[1]+1))/(recent_rotated[1]+1)
scan_anon = (scan * ap)/ (ap+fp+1)
scan_file = (scan * fp)/ (ap+fp+1)
(1) recent_scanned: 指最近扫描页面的数量,在扫描活跃链表和不活跃链表时,会统计到recent_scanned变量中,详见shrink_active_list()函数和shrink_inactive_list()函数.
(2) recent_rotated
-
在扫描不活跃链表时,统计那些被踢回活跃链表的页面数量到recent_rotated变量中,详见shrink_inactive_list()->putback_inactive_pages()函数。
-
在扫描活跃页面时,访问引用的页面数量也被加到recent_rotated变量。
-
总之,该变量反映了真实的活跃页面的数量。
下面查看struct zone_reclaim_stat来描述这个数据统计。
struct zone_reclaim_stat数据结构:
struct zone_reclaim_stat {
/*
* The pageout code in vmscan.c keeps track of how many of the
* mem/swap backed and file backed pages are referenced.
* The higher the rotated/scanned ratio, the more valuable
* that cache is.
*
* The anon LRU stats live in [0], file LRU stats in [1]
*/
/*匿名页面存放在数组[0],文件缓存存放在数组[1]中,recent_rotated/recent_scanned的比值越大,
说明这些被缓存起来的页面越有价值,他们更应该留下来。以匿名页面为例,recent_rotated值越小,
说明LRU链表中匿名页面价值越小,那么更应该多扫面一些匿名页面,尽量把没有缓存价值的页面换出去。
根据计算公式,匿名页面的recent_rotated值越小,ap的值越大,那么最后scan_anon需要扫描的匿名
页面数量也越多,也可以理解为扫描的总量一定的情况下,匿名页面占用比重更大。*/
unsigned long recent_rotated[2];
unsigned long recent_scanned[2];
};
/*
* This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
*/
static void shrink_lruvec(struct lruvec *lruvec, int swappiness,
struct scan_control *sc, unsigned long *lru_pages)
{
unsigned long nr[NR_LRU_LISTS];
unsigned long targets[NR_LRU_LISTS];
unsigned long nr_to_scan;
enum lru_list lru;
unsigned long nr_reclaimed = 0;
unsigned long nr_to_reclaim = sc->nr_to_reclaim;
struct blk_plug plug;
bool scan_adjusted;
get_scan_count(lruvec, swappiness, sc, nr, lru_pages);
/* Record the original scan target for proportional adjustments later */
memcpy(targets, nr, sizeof(nr));
/*
* Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
* event that can occur when there is little memory pressure e.g.
* multiple streaming readers/writers. Hence, we do not abort scanning
* when the requested number of pages are reclaimed when scanning at
* DEF_PRIORITY on the assumption that the fact we are direct
* reclaiming implies that kswapd is not keeping up and it is best to
* do a batch of work at once. For memcg reclaim one check is made to
* abort proportional reclaim if either the file or anon lru has already
* dropped to zero at the first pass.
*/
scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
sc->priority == DEF_PRIORITY);
blk_start_plug(&plug);
/*这里为什么漏掉活跃的匿名页面(LRU_ACTIVE_ANON)呢?因为活跃的匿名页面不能直接被回收,
根据局部原理,它由可能很快又被访问了,匿名页面需要长时间的老化且加入不活跃匿名页面LRU
链表后才能被回收。*/
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
nr[LRU_INACTIVE_FILE]) {
unsigned long nr_anon, nr_file, percentage;
unsigned long nr_scanned;
/*依次扫描可回收的4种LRU链表,shrink_list()函数会具体处
理各种LRU链表的情况。*/
for_each_evictable_lru(lru) {
if (nr[lru]) {
nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
nr[lru] -= nr_to_scan;
/*下面查看shrink_list实现*/
nr_reclaimed += shrink_list(lru, nr_to_scan,
lruvec, sc);
}
}
/*如果已经回收的页面数量(nr_reclaimed)没有达到预期值(nr_to_reclaim),
那么将继续扫描*/
if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
continue;
/*
* For kswapd and memcg, reclaim at least the number of pages
* requested. Ensure that the anon and file LRUs are scanned
* proportionally what was requested by get_scan_count(). We
* stop reclaiming one LRU and reduce the amount scanning
* proportional to the original scan target.
*/
nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
/*
* It's just vindictive to attack the larger once the smaller
* has gone to zero. And given the way we stop scanning the
* smaller below, this makes sure that we only make one nudge
* towards proportionality once we've got nr_to_reclaim.
*/
/*如果已经扫描完毕,则退出循环*/
if (!nr_file || !nr_anon)
break;
if (nr_file > nr_anon) {
unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
targets[LRU_ACTIVE_ANON] + 1;
lru = LRU_BASE;
percentage = nr_anon * 100 / scan_target;
} else {
unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
targets[LRU_ACTIVE_FILE] + 1;
lru = LRU_FILE;
percentage = nr_file * 100 / scan_target;
}
/* Stop scanning the smaller of the LRU */
nr[lru] = 0;
nr[lru + LRU_ACTIVE] = 0;
/*
* Recalculate the other LRU scan count based on its original
* scan target and the percentage scanning already complete
*/
lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
nr_scanned = targets[lru] - nr[lru];
nr[lru] = targets[lru] * (100 - percentage) / 100;
nr[lru] -= min(nr[lru], nr_scanned);
lru += LRU_ACTIVE;
nr_scanned = targets[lru] - nr[lru];
nr[lru] = targets[lru] * (100 - percentage) / 100;
nr[lru] -= min(nr[lru], nr_scanned);
scan_adjusted = true;
}
blk_finish_plug(&plug);
sc->nr_reclaimed += nr_reclaimed;
/*
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
*/
if (inactive_anon_is_low(lruvec))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
throttle_vm_writeout(sc->gfp_mask);
}
shrink_list()函数实现:
static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
struct lruvec *lruvec, struct scan_control *sc)
{
/*处理活跃的LRU链表,包括匿名页面和文件映射页面,如果不活跃页面少于活跃页面,那么
需要调用shrink_active_list()函数来看有哪些活跃页面可以迁移到不活跃页面链表中。
下面查看inactive_list_is_low()函数实现*/
if (is_active_lru(lru)) {
if (inactive_list_is_low(lruvec, lru))/*确定是否需要扫描活跃LRU链表*/
shrink_active_list(nr_to_scan, lruvec, sc, lru);
return 0;
}
return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
}
回到shrink_lruvec()函数
inactive_list_is_low()函数: 确定缓存比例是否是理想情况,
1. 匿名页面,活跃和不活跃比例为3:1
2. 文件页面,活跃大于不活跃
static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
{
/*区分匿名页面和文件缓存两种情况,我们暂时看匿名页面情况,那就是
查看inactive_anon_is_low()函数实现*/
if (is_file_lru(lru))
return inactive_file_is_low(lruvec);
else
return inactive_anon_is_low(lruvec);
}
回到shrink_list()函数
inactive_anon_is_low()函数实现:
static int inactive_anon_is_low(struct lruvec *lruvec)
{
/*
* If we don't have swap space, anonymous page deactivation
* is pointless.
*/
if (!total_swap_pages)
return 0;
if (!mem_cgroup_disabled())
return mem_cgroup_inactive_anon_is_low(lruvec);
return inactive_anon_is_low_global(lruvec_zone(lruvec));
}
static int inactive_anon_is_low_global(struct zone *zone)
{
unsigned long active, inactive;
active = zone_page_state(zone, NR_ACTIVE_ANON);
inactive = zone_page_state(zone, NR_INACTIVE_ANON);
/*为什么活跃LRU链表页面的数量少于不活跃LRU时,不去扫描活跃LRU呢?
系统常常会有只使用一次的文件访问(user-once streaming IO)的情况,
不活跃LRU链表增长速度变快,不活跃LRU页面数量大于活跃页面数量,这时
不会去扫描活跃LRU。
判断文件映射链表相对简单,直接比较活跃和不活跃链表页面的数据即可。对于匿名页面,
zone数据结构中有一个inactive_ratio成员,inactive_ratio的计算在mm/page_alloc.c
文件中的calculate_zone_inactive_ratio()函数里,对于zone的内存空间小于1GB的情况,
通常inactive_radio为1,1GB-10GB的inactive-ratio为3。inactive_radio为3,表示在
LRU中活跃匿名页面和不活跃匿名页面的比值为3:1,也就是说在理想状态下有25%的页面保存
在不活跃链表中。匿名页面的不活跃链表有些奇怪,一方面我们需要它越短越好,这样页面
回收机制可以少做点事情,但是另一方面,如果匿名页面的不活跃链表比较长,在这个链表的
页面会有比较长的时间有机会被再次访问到。*/
if (inactive * zone->inactive_ratio < active)
return 1;
return 0;
}
回到inactive_list_is_low()函数