伙伴系统以页为单位来管理内存,内存碎片也是基于页面的,即由大量离散且不连续的页面导致的。从内存角度来看,内存碎片不是好事情,有些情况下物理设备需要大段的连续的物理内存,如果内核无法满足,则会发生内核panic。内存碎片化好比军训中带队行走时间长了,队列乱了,需要重新规整一下,因此本章称为内存规整,一些文献称为内存紧凑,它是为了解决内存碎片化而出现的一个功能。
内核中去碎片化的基本原理是按照页的可移动性将页面分组。迁移内核本身使用的物理内存的实现难度和复杂度都很大,因此目前的内核是不迁移内核本身使用的物理页面。对于应用程序进程使用的页面,实际上通过用户页表的映射来访问。用户页表可以移动和修改映射关系,不会影响用户进程,因此内存规整是基于页面迁移实现的。
内存规整实现:
内存规整的一个重要的应用场景是在分配大块内存时(order > 1),在WMARK_LOW低水位情况下分配失败,唤醒kswapd内核线程后依然无法分配出内存,这时调用__alloc_pages_direct_compact()来压缩内存尝试分配出所需要的内存。下面沿着alloc_pages()->...->__alloc_pages_direct_compact()这条内核路径来看内存规整是如何工作的。
[mm/page_alloc.c]
[alloc_pages()->alloc_pages_node()->__alloc_pages()->__alloc_pages_nodemask()->__alloc_pages_slowpath()->__alloc_pages_direct_compact()]
/* Try memory compaction for high-order allocations before reclaim */
/*
 * @mode is the migrate_mode passed down from __alloc_pages_slowpath();
 * for the first direct-compaction attempt it is typically MIGRATE_ASYNC.
 * Returns the first struct page of the allocated block on success, or
 * NULL if compaction did not make the allocation possible.
 */
static struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
int alloc_flags, const struct alloc_context *ac,
enum migrate_mode mode, int *contended_compaction,
bool *deferred_compaction)
{
unsigned long compact_result;
struct page *page;
/*
 * Compaction only helps high-order allocations; an order-0 request
 * needs a single page and never triggers compaction.
 */
if (!order)
return NULL;
current->flags |= PF_MEMALLOC;
/*
 * PF_MEMALLOC is set around try_to_compact_pages(); the flag is
 * consulted during page migration. NOTE(review): reportedly this
 * avoids deadlocking on the page lock (PG_locked) during migration —
 * confirm against the migration code.
 */
compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
mode, contended_compaction);
current->flags &= ~PF_MEMALLOC;
switch (compact_result) {
case COMPACT_DEFERRED:
*deferred_compaction = true;
/* fall-through */
case COMPACT_SKIPPED:
return NULL;
default:
break;
}
/*
 * At least in one zone compaction wasn't deferred or skipped, so let's
 * count a compaction stall
 */
count_vm_event(COMPACTSTALL);
/*
 * Compaction has run; retry the allocation with
 * get_page_from_freelist(). On success it returns the first page
 * of the freshly allocated block.
 */
page = get_page_from_freelist(gfp_mask, order,
alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
if (page) {
struct zone *zone = page_zone(page);
zone->compact_blockskip_flush = false;
compaction_defer_reset(zone, order, true);
count_vm_event(COMPACTSUCCESS);
return page;
}
/*
 * It's bad if compaction run occurs and fails. The most likely reason
 * is that pages exist, but not enough to satisfy watermarks.
 */
count_vm_event(COMPACTFAIL);
cond_resched();
return NULL;
}
try_to_compact_pages()函数实现:
[__alloc_pages_direct_compact()->try_to_compact_pages()]
/**
* try_to_compact_pages - Direct compact to satisfy a high-order allocation
* @gfp_mask: The GFP mask of the current allocation
* @order: The order of the current allocation
* @alloc_flags: The allocation flags of the current allocation
* @ac: The context of current allocation
* @mode: The migration mode for async, sync light, or sync migration
* @contended: Return value that determines if compaction was aborted due to
* need_resched() or lock contention
*
* This is the main entry point for direct page compaction.
*/
unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
int alloc_flags, const struct alloc_context *ac,
enum migrate_mode mode, int *contended)
{
int may_enter_fs = gfp_mask & __GFP_FS;
int may_perform_io = gfp_mask & __GFP_IO;
struct zoneref *z;
struct zone *zone;
int rc = COMPACT_DEFERRED;
int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */
*contended = COMPACT_CONTENDED_NONE;
/* Check if the GFP flags allow compaction */
if (!order || !may_enter_fs || !may_perform_io)
return COMPACT_SKIPPED;
trace_mm_compaction_try_to_compact_pages(order, gfp_mask, mode);
/* Compact each zone in the list */
/*
 * for_each_zone_zonelist_nodemask() walks exactly the zones the
 * allocation context allows (its zonelist, highest usable zone index
 * and nodemask).
 */
for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
ac->nodemask) {
int status;
int zone_contended;
if (compaction_deferred(zone, order))
continue;
/* compact_zone_order() runs compaction on this specific zone. */
status = compact_zone_order(zone, order, gfp_mask, mode,
&zone_contended, alloc_flags,
ac->classzone_idx);
rc = max(status, rc);
/*
 * It takes at least one zone that wasn't lock contended
 * to clear all_zones_contended.
 */
all_zones_contended &= zone_contended;
/* If a normal allocation would succeed, stop compacting */
/*
 * zone_watermark_ok() checks whether this zone is now above the
 * WMARK_LOW watermark for the requested order; if so, stop
 * scanning further zones.
 */
if (zone_watermark_ok(zone, order, low_wmark_pages(zone),
ac->classzone_idx, alloc_flags)) {
/*
 * We think the allocation will succeed in this zone,
 * but it is not certain, hence the false. The caller
 * will repeat this with true if allocation indeed
 * succeeds in this zone.
 */
compaction_defer_reset(zone, order, false);
/*
 * It is possible that async compaction aborted due to
 * need_resched() and the watermarks were ok thanks to
 * somebody else freeing memory. The allocation can
 * however still fail so we better signal the
 * need_resched() contention anyway (this will not
 * prevent the allocation attempt).
 */
if (zone_contended == COMPACT_CONTENDED_SCHED)
*contended = COMPACT_CONTENDED_SCHED;
goto break_loop;
}
if (mode != MIGRATE_ASYNC && status == COMPACT_COMPLETE) {
/*
 * We think that allocation won't succeed in this zone
 * so we defer compaction there. If it ends up
 * succeeding after all, it will be reset.
 */
defer_compaction(zone, order);
}
/*
 * We might have stopped compacting due to need_resched() in
 * async compaction, or due to a fatal signal detected. In that
 * case do not try further zones and signal need_resched()
 * contention.
 */
if ((zone_contended == COMPACT_CONTENDED_SCHED)
|| fatal_signal_pending(current)) {
*contended = COMPACT_CONTENDED_SCHED;
goto break_loop;
}
continue;
break_loop:
/*
 * We might not have tried all the zones, so be conservative
 * and assume they are not all lock contended.
 */
all_zones_contended = 0;
break;
}
/*
 * If at least one zone wasn't deferred or skipped, we report if all
 * zones that were tried were lock contended.
 */
if (rc > COMPACT_SKIPPED && all_zones_contended)
*contended = COMPACT_CONTENDED_LOCK;
return rc;
}
回到__alloc_pages_direct_compact函数
compact_zone_order()函数实现:
[__alloc_pages_direct_compact()->try_to_compact_pages()->compact_zone_order()]
和kswapd的代码一样,这里定义了控制相关的数据结构struct compact_control cc来传递参数。cc.migratepages是将要迁移页面的链表,cc.freepages表示要迁移的目的链表。
/*
 * compact_zone_order - run compaction on a single zone for one order.
 *
 * A struct compact_control is built on the stack to carry the
 * parameters into compact_zone(): cc.migratepages collects the pages
 * to be migrated and cc.freepages collects the isolated free pages
 * used as migration destinations.
 */
static unsigned long compact_zone_order(struct zone *zone, int order,
gfp_t gfp_mask, enum migrate_mode mode, int *contended,
int alloc_flags, int classzone_idx)
{
unsigned long ret;
struct compact_control cc = {
.nr_freepages = 0,
.nr_migratepages = 0,
.order = order,
.gfp_mask = gfp_mask,
.zone = zone,
.mode = mode,
.alloc_flags = alloc_flags,
.classzone_idx = classzone_idx,
};
INIT_LIST_HEAD(&cc.freepages);
INIT_LIST_HEAD(&cc.migratepages);
/* compact_zone() does the actual work; see its implementation. */
ret = compact_zone(zone, &cc);
/* Both lists must be drained by the time compaction finishes. */
VM_BUG_ON(!list_empty(&cc.freepages));
VM_BUG_ON(!list_empty(&cc.migratepages));
*contended = cc.contended;
return ret;
}
回到try_to_compact_pages()函数
compact_zone()函数实现:
[alloc_pages()->alloc_pages_node()->__alloc_pages()->__alloc_pages_nodemask()->
__alloc_pages_slowpath()->__alloc_pages_direct_compact()->try_to_compact_pages()->compact_zone_order()->compact_zone()]
static int compact_zone(struct zone *zone, struct compact_control *cc)
{
int ret;
unsigned long start_pfn = zone->zone_start_pfn;
unsigned long end_pfn = zone_end_pfn(zone);
const int migratetype = gfpflags_to_migratetype(cc->gfp_mask);
const bool sync = cc->mode != MIGRATE_ASYNC;
unsigned long last_migrated_pfn = 0;
/*
 * compaction_suitable() decides from the current watermarks whether
 * this zone needs (or can usefully run) compaction at all.
 */
ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
cc->classzone_idx);
switch (ret) {
case COMPACT_PARTIAL:
case COMPACT_SKIPPED:
/* Compaction is likely to fail */
return ret;
case COMPACT_CONTINUE:
/* Fall through to compaction */
;
}
/*
 * Clear pageblock skip if there were failures recently and compaction
 * is about to be retried after being deferred. kswapd does not do
 * this reset as it'll reset the cached information when going to sleep.
 */
if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
__reset_isolation_suitable(zone);
/*
 * Setup to move all movable pages to the end of the zone. Used cached
 * information on where the scanners should start but check that it
 * is initialised by ensuring the values are within zone boundaries.
 */
/*
 * Initialise cc->migrate_pfn and cc->free_pfn. Conceptually,
 * cc->migrate_pfn starts at the first pfn of the zone
 * (zone->zone_start_pfn) and scans forward looking for pages that
 * can be migrated, while cc->free_pfn starts at the last pfn of the
 * zone and scans backwards looking for free pages to use as
 * migration destinations.
 */
cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
cc->free_pfn = zone->compact_cached_free_pfn;
if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
zone->compact_cached_free_pfn = cc->free_pfn;
}
if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
cc->migrate_pfn = start_pfn;
zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
}
trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
cc->free_pfn, end_pfn, sync);
migrate_prep_local();
/*
 * Main loop: scan from the start of the zone for suitable migration
 * candidates and move them towards the free pages isolated at the end
 * of the zone. compact_finished() decides when compaction can stop
 * (scanners met, or the zone rose above the WMARK_LOW watermark).
 */
while ((ret = compact_finished(zone, cc, migratetype)) ==
COMPACT_CONTINUE) {
int err;
unsigned long isolate_start_pfn = cc->migrate_pfn;
/*
 * isolate_migratepages() scans the zone for migratable pages
 * and collects them on the cc->migratepages list.
 */
switch (isolate_migratepages(zone, cc)) {
case ISOLATE_ABORT:
ret = COMPACT_PARTIAL;
putback_movable_pages(&cc->migratepages);
cc->nr_migratepages = 0;
goto out;
case ISOLATE_NONE:
/*
 * We haven't isolated and migrated anything, but
 * there might still be unflushed migrations from
 * previous cc->order aligned block.
 */
goto check_drain;
case ISOLATE_SUCCESS:
;
}
/*
 * Core migration step: take pages off cc->migratepages and try
 * to migrate each one (allocating targets via compaction_alloc).
 */
err = migrate_pages(&cc->migratepages, compaction_alloc,
compaction_free, (unsigned long)cc, cc->mode,
MR_COMPACTION);
trace_mm_compaction_migratepages(cc->nr_migratepages, err,
&cc->migratepages);
/* All pages were either migrated or will be released */
cc->nr_migratepages = 0;
/*
 * On migration failure, pages that were not migrated are put
 * back onto the appropriate LRU lists.
 */
if (err) {
putback_movable_pages(&cc->migratepages);
/*
 * migrate_pages() may return -ENOMEM when scanners meet
 * and we want compact_finished() to detect it
 */
if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) {
ret = COMPACT_PARTIAL;
goto out;
}
}
/*
 * Record where we could have freed pages by migration and not
 * yet flushed them to buddy allocator. We use the pfn that
 * isolate_migratepages() started from in this loop iteration
 * - this is the lowest page that could have been isolated and
 * then freed by migration.
 */
if (!last_migrated_pfn)
last_migrated_pfn = isolate_start_pfn;
check_drain:
/*
 * Has the migration scanner moved away from the previous
 * cc->order aligned block where we migrated from? If yes,
 * flush the pages that were freed, so that they can merge and
 * compact_finished() can detect immediately if allocation
 * would succeed.
 */
if (cc->order > 0 && last_migrated_pfn) {
int cpu;
unsigned long current_block_start =
cc->migrate_pfn & ~((1UL << cc->order) - 1);
if (last_migrated_pfn < current_block_start) {
cpu = get_cpu();
lru_add_drain_cpu(cpu);
drain_local_pages(zone);
put_cpu();
/* No more flushing until we migrate again */
last_migrated_pfn = 0;
}
}
}
out:
/*
 * Release free pages and update where the free scanner should restart,
 * so we don't leave any returned pages behind in the next attempt.
 */
if (cc->nr_freepages > 0) {
unsigned long free_pfn = release_freepages(&cc->freepages);
cc->nr_freepages = 0;
VM_BUG_ON(free_pfn == 0);
/* The cached pfn is always the first in a pageblock */
free_pfn &= ~(pageblock_nr_pages-1);
/*
 * Only go back, not forward. The cached pfn might have been
 * already reset to zone end in compact_finished()
 */
if (free_pfn > zone->compact_cached_free_pfn)
zone->compact_cached_free_pfn = free_pfn;
}
trace_mm_compaction_end(start_pfn, cc->migrate_pfn,
cc->free_pfn, end_pfn, sync, ret);
return ret;
}
回到compact_zone_order()函数
compaction_suitable()函数实现:判断当前水位是否需要内存规整
/*
 * compaction_suitable - public wrapper around __compaction_suitable().
 * Emits a tracepoint and maps the internal COMPACT_NOT_SUITABLE_ZONE
 * result to COMPACT_SKIPPED before returning it to callers.
 */
unsigned long compaction_suitable(struct zone *zone, int order,
int alloc_flags, int classzone_idx)
{
unsigned long ret;
ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx);
trace_mm_compaction_suitable(zone, order, ret);
if (ret == COMPACT_NOT_SUITABLE_ZONE)
ret = COMPACT_SKIPPED;
return ret;
}
/*
 * compaction_suitable: Is this suitable to run compaction on this zone now?
 * Returns
 * COMPACT_SKIPPED - If there are too few free pages for compaction
 * COMPACT_PARTIAL - If the allocation would succeed without compaction
 * COMPACT_CONTINUE - If compaction should run now
 */
static unsigned long __compaction_suitable(struct zone *zone, int order,
int alloc_flags, int classzone_idx)
{
int fragindex;
unsigned long watermark;
/*
 * order == -1 is expected when compacting via
 * /proc/sys/vm/compact_memory
 */
if (order == -1)
return COMPACT_CONTINUE;
/* All checks below are made against the WMARK_LOW watermark. */
watermark = low_wmark_pages(zone);
/*
 * If watermarks for high-order allocation are already met, there
 * should be no need for compaction at all.
 */
/*
 * (1) If the zone is already above WMARK_LOW for the requested order,
 * return COMPACT_PARTIAL: no compaction is needed.
 */
if (zone_watermark_ok(zone, order, watermark, classzone_idx,
alloc_flags))
return COMPACT_PARTIAL;
/*
 * Watermarks for order-0 must be met for compaction. Note the 2UL.
 * This is because during migration, copies of pages need to be
 * allocated and for a short time, the footprint is higher
 */
/*
 * (2) Then check, at order 0, that the zone is above
 * WMARK_LOW + (2 << order). If not, the zone has too few free pages
 * to host the temporary copies migration needs; return
 * COMPACT_SKIPPED so this zone is skipped.
 */
watermark += (2UL << order);
if (!zone_watermark_ok(zone, 0, watermark, classzone_idx, alloc_flags))
return COMPACT_SKIPPED;
/* (3) Otherwise the zone is a candidate for compaction. */
/*
 * fragmentation index determines if allocation failures are due to
 * low memory or external fragmentation
 *
 * index of -1000 would imply allocations might succeed depending on
 * watermarks, but we already failed the high-order watermark check
 * index towards 0 implies failure is due to lack of memory
 * index towards 1000 implies failure is due to fragmentation
 *
 * Only compact if a failure would be due to fragmentation.
 */
fragindex = fragmentation_index(zone, order);
if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
return COMPACT_NOT_SUITABLE_ZONE;
return COMPACT_CONTINUE;
}
回到compact_zone()函数
compact_finished()函数实现:
/*
 * compact_finished - wrapper around __compact_finished(). Emits a
 * tracepoint and maps the internal COMPACT_NO_SUITABLE_PAGE result to
 * COMPACT_CONTINUE (keep compacting) for the caller.
 */
static int compact_finished(struct zone *zone, struct compact_control *cc,
const int migratetype)
{
int ret;
ret = __compact_finished(zone, cc, migratetype);
trace_mm_compaction_finished(zone, cc->order, ret);
if (ret == COMPACT_NO_SUITABLE_PAGE)
ret = COMPACT_CONTINUE;
return ret;
}
static int __compact_finished(struct zone *zone, struct compact_control *cc,
const int migratetype)
{
unsigned int order;
unsigned long watermark;
if (cc->contended || fatal_signal_pending(current))
return COMPACT_PARTIAL;
/* Compaction run completes if the migrate and free scanner meet */
/*
 * There are two termination conditions:
 * (1) cc->migrate_pfn and cc->free_pfn meet — they run towards each
 *     other from the two ends of the zone.
 * (2) The zone is above the WMARK_LOW watermark for cc->order. In
 *     that case, check the buddy free lists
 *     (zone->free_area[order].free_list[...]) for this order and all
 *     higher orders: success means either a free block of the needed
 *     migratetype exists at order >= cc->order, or any free block
 *     exists at order >= pageblock_order.
 */
if (cc->free_pfn <= cc->migrate_pfn) {
/* Let the next compaction start anew. */
zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
zone->compact_cached_free_pfn = zone_end_pfn(zone);
/*
 * Mark that the PG_migrate_skip information should be cleared
 * by kswapd when it goes to sleep. kswapd does not set the
 * flag itself as the decision to be clear should be directly
 * based on an allocation request.
 */
if (!current_is_kswapd())
zone->compact_blockskip_flush = true;
return COMPACT_COMPLETE;
}
/*
 * order == -1 is expected when compacting via
 * /proc/sys/vm/compact_memory
 */
if (cc->order == -1)
return COMPACT_CONTINUE;
/* Compaction run is not finished if the watermark is not met */
watermark = low_wmark_pages(zone);
if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx,
cc->alloc_flags))
return COMPACT_CONTINUE;
/* Direct compactor: Is a suitable page free? */
for (order = cc->order; order < MAX_ORDER; order++) {
struct free_area *area = &zone->free_area[order];
/* Job done if page is free of the right migratetype */
if (!list_empty(&area->free_list[migratetype]))
return COMPACT_PARTIAL;
/* Job done if allocation would set block type */
if (order >= pageblock_order && area->nr_free)
return COMPACT_PARTIAL;
}
return COMPACT_NO_SUITABLE_PAGE;
}
回到compact_zone()函数
isolate_migratepages()函数实现:
用于扫描和查找合适迁移的页面,从zone的头部开始找起,查找的步长以pageblock_nr_pages为单位。linux内核以pageblock为单位来管理页的迁移属性。页的迁移属性包括MIGRATE_UNMOVABLE、MIGRATE_RECLAIMABLE、MIGRATE_MOVABLE、MIGRATE_PCPTYPES和MIGRATE_CMA等,内核有两个函数来管理迁移类型,分别是get_pageblock_migratetype()和set_pageblock_migratetype()。内核在初始化时,所有的页面最初都标记为MIGRATE_MOVABLE,见memmap_init_zone()函数(mm/page_alloc.c)。pageblock_nr_pages通常是1024个页面(1UL << MAX_ORDER-1)。
[alloc_pages()->alloc_pages_node()->__alloc_pages()->__alloc_pages_nodemask()->__alloc_pages_slowpath()->
__alloc_pages_direct_compact()->try_to_compact_pages()->compact_zone_order()->compact_zone()->isolate_migratepages()]
/*
 * Isolate all pages that can be migrated from the first suitable block,
 * starting at the block pointed to by the migrate scanner pfn within
 * compact_control.
 */
static isolate_migrate_t isolate_migratepages(struct zone *zone,
struct compact_control *cc)
{
unsigned long low_pfn, end_pfn;
struct page *page;
/*
 * Pick the isolation mode; for asynchronous compaction this is
 * ISOLATE_ASYNC_MIGRATE, otherwise 0.
 */
const isolate_mode_t isolate_mode =
(cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0);
/*
 * Start at where we last stopped, or beginning of the zone as
 * initialized by compact_zone()
 */
low_pfn = cc->migrate_pfn;
/* Only scan within a pageblock boundary */
end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);
/*
 * Iterate over whole pageblocks until we find the first suitable.
 * Do not cross the free scanner.
 */
/*
 * Scan forward from cc->migrate_pfn towards the end of the zone in
 * steps of pageblock_nr_pages, never crossing cc->free_pfn.
 */
for (; end_pfn <= cc->free_pfn;
low_pfn = end_pfn, end_pfn += pageblock_nr_pages) {
/*
 * This can potentially iterate a massively long zone with
 * many pageblocks unsuitable, so periodically check if we
 * need to schedule, or even abort async compaction.
 */
if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
&& compact_should_abort(cc))
break;
page = pageblock_pfn_to_page(low_pfn, end_pfn, zone);
if (!page)
continue;
/* If isolation recently failed, do not retry */
if (!isolation_suitable(cc, page))
continue;
/*
 * For async compaction, also only scan in MOVABLE blocks.
 * Async compaction is optimistic to see if the minimum amount
 * of work satisfies the allocation.
 */
/*
 * Only MIGRATE_MOVABLE and MIGRATE_CMA pageblocks contain
 * pages that are cheap to migrate. cc->mode comes down from
 * __alloc_pages_slowpath(); it is typically MIGRATE_ASYNC.
 */
if (cc->mode == MIGRATE_ASYNC &&
!migrate_async_suitable(get_pageblock_migratetype(page)))
continue;
/* Perform the isolation */
/*
 * Scan the pageblock and isolate the pages in it that are
 * suitable for migration; see isolate_migratepages_block().
 */
low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn,
isolate_mode);
if (!low_pfn || cc->contended) {
acct_isolated(zone, cc);
return ISOLATE_ABORT;
}
/*
 * Either we isolated something and proceed with migration. Or
 * we failed and compact_zone should decide if we should
 * continue or not.
 */
break;
}
acct_isolated(zone, cc);
/*
 * Record where migration scanner will be restarted. If we end up in
 * the same pageblock as the free scanner, make the scanners fully
 * meet so that compact_finished() terminates compaction.
 */
cc->migrate_pfn = (end_pfn <= cc->free_pfn) ? low_pfn : cc->free_pfn;
return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
}
回到compact_zone()函数
isolate_migratepages_block()函数实现:
[alloc_pages()->alloc_pages_node()->__alloc_pages()->__alloc_pages_nodemask()->__alloc_pages_slowpath()->
__alloc_pages_direct_compact()->try_to_compact_pages()->compact_zone_order()->compact_zone()->
isolate_migratepages()->isolate_migratepages_block()]
/**
 * isolate_migratepages_block() - isolate all migrate-able pages within
 * a single pageblock
 * @cc: Compaction control structure.
 * @low_pfn: The first PFN to isolate
 * @end_pfn: The one-past-the-last PFN to isolate, within same pageblock
 * @isolate_mode: Isolation mode to be used.
 *
 * Isolate all pages that can be migrated from the range specified by
 * [low_pfn, end_pfn). The range is expected to be within same pageblock.
 * Returns zero if there is a fatal signal pending, otherwise PFN of the
 * first page that was not scanned (which may be both less, equal to or more
 * than end_pfn).
 *
 * The pages are isolated on cc->migratepages list (not required to be empty),
 * and cc->nr_migratepages is updated accordingly. The cc->migrate_pfn field
 * is neither read nor updated.
 */
static unsigned long
isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
unsigned long end_pfn, isolate_mode_t isolate_mode)
{
struct zone *zone = cc->zone;
unsigned long nr_scanned = 0, nr_isolated = 0;
struct list_head *migratelist = &cc->migratepages;
struct lruvec *lruvec;
unsigned long flags = 0;
bool locked = false;
struct page *page = NULL, *valid_page = NULL;
unsigned long start_pfn = low_pfn;
/*
 * Ensure that there are not too many pages isolated from the LRU
 * list by either parallel reclaimers or compaction. If there are,
 * delay for some time until fewer pages are isolated
 */
/*
 * If too_many_isolated() reports that too many pages are already
 * detached from the LRU lists, wait via congestion_wait() (HZ/10,
 * i.e. 100ms) and retry — unless this is asynchronous compaction
 * (MIGRATE_ASYNC), which simply aborts.
 */
while (unlikely(too_many_isolated(zone))) {
/* async migration should just abort */
if (cc->mode == MIGRATE_ASYNC)
return 0;
congestion_wait(BLK_RW_ASYNC, HZ/10);
if (fatal_signal_pending(current))
return 0;
}
if (compact_should_abort(cc))
return 0;
/* Time to isolate some pages for migration */
/* Scan the pageblock pfn-by-pfn looking for migratable pages. */
for (; low_pfn < end_pfn; low_pfn++) {
/*
 * Periodically drop the lock (if held) regardless of its
 * contention, to give chance to IRQs. Abort async compaction
 * if contended.
 */
if (!(low_pfn % SWAP_CLUSTER_MAX)
&& compact_unlock_should_abort(&zone->lru_lock, flags,
&locked, cc))
break;
if (!pfn_valid_within(low_pfn))
continue;
nr_scanned++;
page = pfn_to_page(low_pfn);
if (!valid_page)
valid_page = page;
/*
 * Skip if free. We read page order here without zone lock
 * which is generally unsafe, but the race window is small and
 * the worst thing that can happen is that we skip some
 * potential isolation targets.
 */
/*
 * A page still in the buddy allocator cannot be migrated.
 * page_order_unsafe() reads its order so the loop can jump
 * over the whole free block at once.
 */
if (PageBuddy(page)) {
unsigned long freepage_order = page_order_unsafe(page);
/*
 * Without lock, we cannot be sure that what we got is
 * a valid page order. Consider only values in the
 * valid order range to prevent low_pfn overflow.
 */
if (freepage_order > 0 && freepage_order < MAX_ORDER)
low_pfn += (1UL << freepage_order) - 1;
continue;
}
/*
 * Check may be lockless but that's ok as we recheck later.
 * It's possible to migrate LRU pages and balloon pages
 * Skip any other type of page
 */
/*
 * Only pages on the LRU lists and balloon pages are migration
 * candidates; every other page type is skipped.
 */
if (!PageLRU(page)) {
if (unlikely(balloon_page_movable(page))) {
if (balloon_page_isolate(page)) {
/* Successfully isolated */
goto isolate_success;
}
}
continue;
}
/*
 * PageLRU is set. lru_lock normally excludes isolation
 * splitting and collapsing (collapsing has already happened
 * if PageLRU is set) but the lock is not necessarily taken
 * here and it is wasteful to take it just to check transhuge.
 * Check TransHuge without lock and skip the whole pageblock if
 * it's either a transhuge or hugetlbfs page, as calling
 * compound_order() without preventing THP from splitting the
 * page underneath us may return surprising results.
 */
if (PageTransHuge(page)) {
if (!locked)
low_pfn = ALIGN(low_pfn + 1,
pageblock_nr_pages) - 1;
else
low_pfn += (1 << compound_order(page)) - 1;
continue;
}
/*
 * Migration will fail if an anonymous page is pinned in memory,
 * so avoid taking lru_lock and isolating it unnecessarily in an
 * admittedly racy check.
 */
/*
 * Buddy pages and non-LRU pages have already been filtered
 * out; what remains are reasonable candidates, minus some
 * special cases. page_mapping() == NULL suggests an anonymous
 * page. For an anonymous page, normally
 * page_count(page) == page_mapcount(page), i.e.
 * page->_count == page->_mapcount + 1; if the counts differ,
 * someone else holds an extra reference to the page, so it is
 * pinned and not suitable for migration.
 */
if (!page_mapping(page) &&
page_count(page) > page_mapcount(page))
continue;
/* If we already hold the lock, we can skip some rechecking */
/*
 * Take zone->lru_lock and re-check the LRU/THP state under
 * the lock.
 */
if (!locked) {
locked = compact_trylock_irqsave(&zone->lru_lock,
&flags, cc);
if (!locked)
break;
/* Recheck PageLRU and PageTransHuge under lock */
if (!PageLRU(page))
continue;
if (PageTransHuge(page)) {
low_pfn += (1 << compound_order(page)) - 1;
continue;
}
}
lruvec = mem_cgroup_page_lruvec(page, zone);
/* Try isolate the page */
/*
 * __isolate_lru_page() isolates the page according to
 * isolate_mode (here typically ISOLATE_ASYNC_MIGRATE). Pages
 * under writeback are rejected, and dirty pages whose mapping
 * lacks a mapping->a_ops->migratepage() method are rejected
 * too. On success it takes a reference (page->_count + 1) and
 * clears the PG_lru flag.
 */
if (__isolate_lru_page(page, isolate_mode) != 0)
continue;
VM_BUG_ON_PAGE(PageTransCompound(page), page);
/* Successfully isolated */
/* Remove the page from its LRU list. */
del_page_from_lru_list(page, lruvec, page_lru(page));
/* The page is a valid migration candidate; queue it on cc->migratepages. */
isolate_success:
list_add(&page->lru, migratelist);
cc->nr_migratepages++;
nr_isolated++;
/* Avoid isolating too much */
if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
++low_pfn;
break;
}
}
/*
 * Summary — pages suitable for compaction migration:
 * (1) must be on an LRU list (pages still in the buddy system are not);
 * (2) pages under writeback (PG_writeback) are not suitable;
 * (3) PG_unevictable pages are not suitable;
 * (4) dirty pages whose mapping has no a_ops->migratepage() method
 *     are not suitable.
 */
/*
 * The PageBuddy() check could have potentially brought us outside
 * the range to be scanned.
 */
if (unlikely(low_pfn > end_pfn))
low_pfn = end_pfn;
if (locked)
spin_unlock_irqrestore(&zone->lru_lock, flags);
/*
 * Update the pageblock-skip information and cached scanner pfn,
 * if the whole pageblock was scanned without isolating any page.
 */
if (low_pfn == end_pfn)
update_pageblock_skip(cc, valid_page, nr_isolated, true);
trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn,
nr_scanned, nr_isolated);
count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned);
if (nr_isolated)
count_compact_events(COMPACTISOLATED, nr_isolated);
return low_pfn;
}
回到compact_zone()函数
migrate_pages()函数实现:迁移页的核心函数,从cc->migratepages链表中摘取页,然后尝试去迁移页。compaction_alloc()从zone的末尾开始查找空闲页面,然后并把空闲页面添加到cc->freepages链表中。
migrate_pages()函数在页迁移一节中已经介绍,其中get_new_page()函数指针指向compaction_alloc()函数,put_new_page()函数指针指向compaction_free()函数,迁移模式为MIGRATE_ASYNC,reason为MR_COMPACTION.
/*
 * This is a migrate-callback that "allocates" freepages by taking pages
 * from the isolated freelists in the block we are migrating to.
 */
/*
 * compaction_alloc() supplies migration target pages. When the
 * cc->freepages list is empty it calls isolate_freepages() — the free
 * scanner, which works from the end of the zone towards its start and
 * mirrors isolate_migratepages() — to refill the list, then returns
 * one free page.
 */
static struct page *compaction_alloc(struct page *migratepage,
unsigned long data,
int **result)
{
struct compact_control *cc = (struct compact_control *)data;
struct page *freepage;
/*
 * Isolate free pages if necessary, and if we are not aborting due to
 * contention.
 */
if (list_empty(&cc->freepages)) {
if (!cc->contended)
isolate_freepages(cc);
if (list_empty(&cc->freepages))
return NULL;
}
freepage = list_entry(cc->freepages.next, struct page, lru);
list_del(&freepage->lru);
cc->nr_freepages--;
return freepage;
}