linux内核中有一个非常重要的内核线程kswapd,负责在内存不足时回收页面,kswapd内核线程初始化时会为系统中每个NUMA内存节点创建一个名为“kswapd%d”的内核线程。kswapd线程的初始化实现如下:
/*
* This kswapd start function will be called by init and node-hot-add.
* On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added.
*/
int kswapd_run(int nid)
{
pg_data_t *pgdat = NODE_DATA(nid);
int ret = 0;
if (pgdat->kswapd)
return 0;
pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
if (IS_ERR(pgdat->kswapd)) {
/* failure at boot is fatal */
BUG_ON(system_state == SYSTEM_BOOTING);
pr_err("Failed to start kswapd on node %d\n", nid);
ret = PTR_ERR(pgdat->kswapd);
pgdat->kswapd = NULL;
}
return ret;
}
在NUMA中,kswapd线程初始化时传递的参数为pg_data_t结构体,该结构体描述的是每个内存node节点的物理内存布局,node节点的相关知识在【Linux内核学习笔记一】内存管理-节点(node)中已经详细介绍。
在pg_data_t结构体中与kswapd线程相关的参数包括:kswapd_wait,kswapd_max_order和classzone_idx。kswapd_wait是一个等待队列,该队列在free_area_init_core()函数中初始化,页面分配路径上的唤醒函数wakeup_kswapd()把kswapd_max_order和classzone_idx作为参数传递给kswapd内核线程,在分配路径上,如果低水位的情况下无法成功分配内存,那么会通过wakeup_kswapd()函数唤醒kswapd内核线程进行页面回收,以便释放一些内存。wakeup_kswapd()的实现如下:
/*
* A zone is low on free memory, so wake its kswapd task to service it.
*/
void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
{
pg_data_t *pgdat;
if (!populated_zone(zone))
return;
if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL))
return;
pgdat = zone->zone_pgdat;
if (pgdat->kswapd_max_order < order) {
pgdat->kswapd_max_order = order;
pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
}
if (!waitqueue_active(&pgdat->kswapd_wait))
return;
if (zone_balanced(zone, order, 0, 0))
return;
trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
wake_up_interruptible(&pgdat->kswapd_wait);
}
wakeup_kswapd()函数中kswapd_max_order不能小鱼alloc_page()分配内存的order,classzone_idx是在__alloc_pages_nodemask()函数中计算第一个最合适分配内存的zone序号,这两个参数会传递到kswap内核线程中。kswap内核线程的代码如下:
/*
* The background pageout daemon, started as a kernel thread
* from the init process.
*
* This basically trickles out pages so that we have _some_
* free memory available even if there is no other activity
* that frees anything up. This is needed for things like routing
* etc, where we otherwise might have all activity going on in
* asynchronous contexts that cannot page things out.
*
* If there are applications that are active memory-allocators
* (most normal use), this basically shouldn't matter.
*/
static int kswapd(void *p)
{
unsigned long order, new_order;
unsigned balanced_order;
int classzone_idx, new_classzone_idx;
int balanced_classzone_idx;
pg_data_t *pgdat = (pg_data_t*)p;
struct task_struct *tsk = current;
struct reclaim_state reclaim_state = {
.reclaimed_slab = 0,
};
const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
lockdep_set_current_reclaim_state(GFP_KERNEL);
if (!cpumask_empty(cpumask))
set_cpus_allowed_ptr(tsk, cpumask);
current->reclaim_state = &reclaim_state;
/*
* Tell the memory management that we're a "memory allocator",
* and that if we need more memory we should get access to it
* regardless (see "__alloc_pages()"). "kswapd" should
* never get caught in the normal page freeing logic.
*
* (Kswapd normally doesn't need memory anyway, but sometimes
* you need a small amount of memory in order to be able to
* page out something else, and this flag essentially protects
* us from recursively trying to free more memory as we're
* trying to free the first piece of memory in the first place).
*/
tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
set_freezable();
order = new_order = 0;
balanced_order = 0;
classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
balanced_classzone_idx = classzone_idx;
for ( ; ; ) {
bool ret;
/*
* If the last balance_pgdat was unsuccessful it's unlikely a
* new request of a similar or harder type will succeed soon
* so consider going to sleep on the basis we reclaimed at
*/
if (balanced_classzone_idx >= new_classzone_idx &&
balanced_order == new_order) {
new_order = pgdat->kswapd_max_order;
new_classzone_idx = pgdat->classzone_idx;
pgdat->kswapd_max_order = 0;
pgdat->classzone_idx = pgdat->nr_zones - 1;
}
if (order < new_order || classzone_idx > new_classzone_idx) {
/*
* Don't sleep if someone wants a larger 'order'
* allocation or has tigher zone constraints
*/
order = new_order;
classzone_idx = new_classzone_idx;
} else {
kswapd_try_to_sleep(pgdat, balanced_order,
balanced_classzone_idx);
order = pgdat->kswapd_max_order;
classzone_idx = pgdat->classzone_idx;
new_order = order;
new_classzone_idx = classzone_idx;
pgdat->kswapd_max_order = 0;
pgdat->classzone_idx = pgdat->nr_zones - 1;
}
ret = try_to_freeze();
if (kthread_should_stop())
break;
/*
* We can speed up thawing tasks if we don't call balance_pgdat
* after returning from the refrigerator
*/
if (!ret) {
trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
balanced_classzone_idx = classzone_idx;
balanced_order = balance_pgdat(pgdat, order,
&balanced_classzone_idx);
}
}
tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
current->reclaim_state = NULL;
lockdep_clear_current_reclaim_state();
return 0;
}
系统启动时,会在kswapd_try_to_sleep中睡眠,并让出CPU控制权。当系统内存紧张时,这时内存分配函数会调用wakeup_kswapd()来唤醒kswapd内核线程,此时kswapd内核线程在kswapd_try_to_sleep函数中被唤醒,然后调用balance_pgdat()函数来回收页面,balance_pgdat()函数的实现如下:
/*
* For kswapd, balance_pgdat() will work across all this node's zones until
* they are all at high_wmark_pages(zone).
*
* Returns the final order kswapd was reclaiming at
*
* There is special handling here for zones which are full of pinned pages.
* This can happen if the pages are all mlocked, or if they are all used by
* device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb.
* What we do is to detect the case where all pages in the zone have been
* scanned twice and there has been zero successful reclaim. Mark the zone as
* dead and from now on, only perform a short scan. Basically we're polling
* the zone for when the problem goes away.
*
* kswapd scans the zones in the highmem->normal->dma direction. It skips
* zones which have free_pages > high_wmark_pages(zone), but once a zone is
* found to have free_pages <= high_wmark_pages(zone), we scan that zone and the
* lower zones regardless of the number of free pages in the lower zones. This
* interoperates with the page allocator fallback scheme to ensure that aging
* of pages is balanced across the zones.
*/
static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
int *classzone_idx)
{
int i;
int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
.order = order,
.priority = DEF_PRIORITY,
.may_writepage = !laptop_mode,
.may_unmap = 1,
.may_swap = 1,
};
count_vm_event(PAGEOUTRUN);
do {
unsigned long nr_attempted = 0;
bool raise_priority = true;
bool pgdat_needs_compaction = (order > 0);
sc.nr_reclaimed = 0;
/*
* Scan in the highmem->dma direction for the highest
* zone which needs scanning
*/
for (i = pgdat->nr_zones - 1; i >= 0; i--) {
struct zone *zone = pgdat->node_zones + i;
if (!populated_zone(zone))
continue;
if (sc.priority != DEF_PRIORITY &&
!zone_reclaimable(zone))
continue;
/*
* Do some background aging of the anon list, to give
* pages a chance to be referenced before reclaiming.
*/
age_active_anon(zone, &sc);
/*
* If the number of buffer_heads in the machine
* exceeds the maximum allowed level and this node
* has a highmem zone, force kswapd to reclaim from
* it to relieve lowmem pressure.
*/
if (buffer_heads_over_limit && is_highmem_idx(i)) {
end_zone = i;
break;
}
if (!zone_balanced(zone, order, 0, 0)) {
end_zone = i;
break;
} else {
/*
* If balanced, clear the dirty and congested
* flags
*/
clear_bit(ZONE_CONGESTED, &zone->flags);
clear_bit(ZONE_DIRTY, &zone->flags);
}
}
if (i < 0)
goto out;
for (i = 0; i <= end_zone; i++) {
struct zone *zone = pgdat->node_zones + i;
if (!populated_zone(zone))
continue;
/*
* If any zone is currently balanced then kswapd will
* not call compaction as it is expected that the
* necessary pages are already available.
*/
if (pgdat_needs_compaction &&
zone_watermark_ok(zone, order,
low_wmark_pages(zone),
*classzone_idx, 0))
pgdat_needs_compaction = false;
}
/*
* If we're getting trouble reclaiming, start doing writepage
* even in laptop mode.
*/
if (sc.priority < DEF_PRIORITY - 2)
sc.may_writepage = 1;
/*
* Now scan the zone in the dma->highmem direction, stopping
* at the last zone which needs scanning.
*
* We do this because the page allocator works in the opposite
* direction. This prevents the page allocator from allocating
* pages behind kswapd's direction of progress, which would
* cause too much scanning of the lower zones.
*/
for (i = 0; i <= end_zone; i++) {
struct zone *zone = pgdat->node_zones + i;
if (!populated_zone(zone))
continue;
if (sc.priority != DEF_PRIORITY &&
!zone_reclaimable(zone))
continue;
sc.nr_scanned = 0;
nr_soft_scanned = 0;
/*
* Call soft limit reclaim before calling shrink_zone.
*/
nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
order, sc.gfp_mask,
&nr_soft_scanned);
sc.nr_reclaimed += nr_soft_reclaimed;
/*
* There should be no need to raise the scanning
* priority if enough pages are already being scanned
* that that high watermark would be met at 100%
* efficiency.
*/
if (kswapd_shrink_zone(zone, end_zone,
&sc, &nr_attempted))
raise_priority = false;
}
/*
* If the low watermark is met there is no need for processes
* to be throttled on pfmemalloc_wait as they should not be
* able to safely make forward progress. Wake them
*/
if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
pfmemalloc_watermark_ok(pgdat))
wake_up_all(&pgdat->pfmemalloc_wait);
/*
* Fragmentation may mean that the system cannot be rebalanced
* for high-order allocations in all zones. If twice the
* allocation size has been reclaimed and the zones are still
* not balanced then recheck the watermarks at order-0 to
* prevent kswapd reclaiming excessively. Assume that a
* process requested a high-order can direct reclaim/compact.
*/
if (order && sc.nr_reclaimed >= 2UL << order)
order = sc.order = 0;
/* Check if kswapd should be suspending */
if (try_to_freeze() || kthread_should_stop())
break;
/*
* Compact if necessary and kswapd is reclaiming at least the
* high watermark number of pages as requsted
*/
if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted)
compact_pgdat(pgdat, order);
/*
* Raise priority if scanning rate is too low or there was no
* progress in reclaiming pages
*/
if (raise_priority || !sc.nr_reclaimed)
sc.priority--;
} while (sc.priority >= 1 &&
!pgdat_balanced(pgdat, order, *classzone_idx));
out:
/*
* Return the order we were reclaiming at so prepare_kswapd_sleep()
* makes a decision on the order we were last reclaiming at. However,
* if another caller entered the allocator slow path while kswapd
* was awake, order will remain at the higher level
*/
*classzone_idx = end_zone;
return order;
}
balance_pgdat()函数的大致框架是这样的,先从高端内存向低端内存的方向开始扫描每一个zone,直到找到一个内存不平衡的zone,即end_zone。然后再从低端内存往end_zone的方向开始扫描这些zone,进行页面回收,每次回收了一个zone的内存后,加大扫描粒度继续扫描进行页面回收,直到该内存节点node的内存达到平衡后,就结束页面回收。
函数kswapd_shrink_zone用于回收zone中的页面,kswapd_shrink_zone的实现如下:
/*
* kswapd shrinks the zone by the number of pages required to reach
* the high watermark.
*
* Returns true if kswapd scanned at least the requested number of pages to
* reclaim or if the lack of progress was due to pages under writeback.
* This is used to determine if the scanning priority needs to be raised.
*/
static bool kswapd_shrink_zone(struct zone *zone,
int classzone_idx,
struct scan_control *sc,
unsigned long *nr_attempted)
{
int testorder = sc->order;
unsigned long balance_gap;
bool lowmem_pressure;
/* Reclaim above the high watermark. */
sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));
/*
* Kswapd reclaims only single pages with compaction enabled. Trying
* too hard to reclaim until contiguous free pages have become
* available can hurt performance by evicting too much useful data
* from memory. Do not reclaim more than needed for compaction.
*/
if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
compaction_suitable(zone, sc->order, 0, classzone_idx)
!= COMPACT_SKIPPED)
testorder = 0;
/*
* We put equal pressure on every zone, unless one zone has way too
* many pages free already. The "too many pages" is defined as the
* high wmark plus a "gap" where the gap is either the low
* watermark or 1% of the zone, whichever is smaller.
*/
balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
/*
* If there is no low memory pressure or the zone is balanced then no
* reclaim is necessary
*/
lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
if (!lowmem_pressure && zone_balanced(zone, testorder,
balance_gap, classzone_idx))
return true;
shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);
/* Account for the number of pages attempted to reclaim */
*nr_attempted += sc->nr_to_reclaim;
clear_bit(ZONE_WRITEBACK, &zone->flags);
/*
* If a zone reaches its high watermark, consider it to be no longer
* congested. It's possible there are dirty pages backed by congested
* BDIs but as pressure is relieved, speculatively avoid congestion
* waits.
*/
if (zone_reclaimable(zone) &&
zone_balanced(zone, testorder, 0, classzone_idx)) {
clear_bit(ZONE_CONGESTED, &zone->flags);
clear_bit(ZONE_DIRTY, &zone->flags);
}
return sc->nr_scanned >= sc->nr_to_reclaim;
}
函数kswapd_shrink_zone中,取SWAP_CLUSTER_MAX和high_wmark_pages(zone)的最大值为要回收的页面数量,SWAP_CLUSTER_MAX为32,balance_gap为低水位值和1%的zone页面数中的较小值,在判断zone中页面是否处于平衡时,需要判断zone的当前水位是否高于WMARK_HIGH + balance_gap。如果zone中的页面处于平衡状态,就不需要进行回收,否则继续调用shrink_zone进行页面回收,该函数是kswapd内核线程的核心函数,shrink_zone的实现如下:
static bool shrink_zone(struct zone *zone, struct scan_control *sc,
bool is_classzone)
{
struct reclaim_state *reclaim_state = current->reclaim_state;
unsigned long nr_reclaimed, nr_scanned;
bool reclaimable = false;
do {
struct mem_cgroup *root = sc->target_mem_cgroup;
struct mem_cgroup_reclaim_cookie reclaim = {
.zone = zone,
.priority = sc->priority,
};
unsigned long zone_lru_pages = 0;
struct mem_cgroup *memcg;
nr_reclaimed = sc->nr_reclaimed;
nr_scanned = sc->nr_scanned;
memcg = mem_cgroup_iter(root, NULL, &reclaim);
do {
unsigned long lru_pages;
unsigned long scanned;
struct lruvec *lruvec;
int swappiness;
if (mem_cgroup_low(root, memcg)) {
if (!sc->may_thrash)
continue;
mem_cgroup_events(memcg, MEMCG_LOW, 1);
}
lruvec = mem_cgroup_zone_lruvec(zone, memcg);
swappiness = mem_cgroup_swappiness(memcg);
scanned = sc->nr_scanned;
shrink_lruvec(lruvec, swappiness, sc, &lru_pages);
zone_lru_pages += lru_pages;
if (memcg && is_classzone)
shrink_slab(sc->gfp_mask, zone_to_nid(zone),
memcg, sc->nr_scanned - scanned,
lru_pages);
/*
* Direct reclaim and kswapd have to scan all memory
* cgroups to fulfill the overall scan target for the
* zone.
*
* Limit reclaim, on the other hand, only cares about
* nr_to_reclaim pages to be reclaimed and it will
* retry with decreasing priority if one round over the
* whole hierarchy is not sufficient.
*/
if (!global_reclaim(sc) &&
sc->nr_reclaimed >= sc->nr_to_reclaim) {
mem_cgroup_iter_break(root, memcg);
break;
}
} while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
/*
* Shrink the slab caches in the same proportion that
* the eligible LRU pages were scanned.
*/
if (global_reclaim(sc) && is_classzone)
shrink_slab(sc->gfp_mask, zone_to_nid(zone), NULL,
sc->nr_scanned - nr_scanned,
zone_lru_pages);
if (reclaim_state) {
sc->nr_reclaimed += reclaim_state->reclaimed_slab;
reclaim_state->reclaimed_slab = 0;
}
vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
sc->nr_scanned - nr_scanned,
sc->nr_reclaimed - nr_reclaimed);
if (sc->nr_reclaimed - nr_reclaimed)
reclaimable = true;
} while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
sc->nr_scanned - nr_scanned, sc));
return reclaimable;
}