linux系统经过长时间运行后,很难有太多free pages可供分配,
这个时候分配内存只能走慢速分配方式,里面涉及到内存回收和紧缩;
内存回收跟紧缩算法比较复杂,涉及内容比较多,将在后续逐个介绍;
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, struct zone *preferred_zone,
int classzone_idx, int migratetype)
{
const gfp_t wait = gfp_mask & __GFP_WAIT; //wait high fs
struct page *page = NULL;
int alloc_flags;
unsigned long pages_reclaimed = 0;
unsigned long did_some_progress;
enum migrate_mode migration_mode = MIGRATE_ASYNC;
bool deferred_compaction = false;
int contended_compaction = COMPACT_CONTENDED_NONE;
/*
* In the slowpath, we sanity check order to avoid ever trying to
* reclaim >= MAX_ORDER areas which will never succeed. Callers may
* be using allocators in order of preference for an area that is
* too large.
*/
if (order >= MAX_ORDER) {
WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
return NULL;
}
/*
* GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
* __GFP_NOWARN set) should not cause reclaim since the subsystem
* (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
* using a larger set of nodes after it has established that the
* allowed per node queues are empty and that nodes are
* over allocated.
*/
if (IS_ENABLED(CONFIG_NUMA) &&
(gfp_mask & GFP_THISNODE) == GFP_THISNODE)
goto nopage;
restart:
//如果未禁止kswapd,则唤醒节点的kswapd线程
if (!(gfp_mask & __GFP_NO_KSWAPD))
wake_all_kswapds(order, zonelist, high_zoneidx,
preferred_zone, nodemask);
/*
* OK, we're below the kswapd watermark and have kicked background
* reclaim. Now things get more complex, so set up alloc_flags according
* to how we want to proceed.
*/
//gfp 转成alloc_flags
alloc_flags = gfp_to_alloc_flags(gfp_mask);
/*
* Find the true preferred zone if the allocation is unconstrained by
* cpusets.
*/
if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) {
struct zoneref *preferred_zoneref;
preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
NULL, &preferred_zone);
classzone_idx = zonelist_zone_idx(preferred_zoneref);
}
rebalance:
/* This is the last chance, in general, before the goto nopage. */
//再次尝试从空闲链表中获取内存
page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
preferred_zone, classzone_idx, migratetype);
if (page)
goto got_pg;
/* Allocate without watermarks if the context allows */
//需要忽略watermark时,表示系统紧急需要内存,要比较高优先级处理
if (alloc_flags & ALLOC_NO_WATERMARKS) {
/*
* Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
* the allocation is high priority and these type of
* allocations are system rather than user orientated
*/
zonelist = node_zonelist(numa_node_id(), gfp_mask);
page = __alloc_pages_high_priority(gfp_mask, order,
zonelist, high_zoneidx, nodemask,
preferred_zone, classzone_idx, migratetype);
if (page) {
goto got_pg;
}
}
/* Atomic allocations - we can't balance anything */
if (!wait) {
/*
* All existing users of the deprecated __GFP_NOFAIL are
* blockable, so warn of any new users that actually allow this
* type of allocation to fail.
*/
WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL);
goto nopage;
}
/* Avoid recursion of direct reclaim */
//调用者本身是内存回收线程的话,直接返回,避免死锁
if (current->flags & PF_MEMALLOC)
goto nopage;
/* Avoid allocations with no watermarks from looping endlessly */
if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
goto nopage;
/*
* Try direct compaction. The first pass is asynchronous. Subsequent
* attempts after direct reclaim are synchronous
*/
//接下来尝试通过内存紧缩,减少内存碎片,看能否分配到连续的page,
//先尝试异步压缩,接着尝试同步压缩。
//这部分比较复杂,后面单独进行分析。
page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
high_zoneidx, nodemask, alloc_flags,
preferred_zone,
classzone_idx, migratetype,
migration_mode, &contended_compaction,
&deferred_compaction);
if (page)
goto got_pg;
/* Checks for THP-specific high-order allocations */
//透明大页的分配
if ((gfp_mask & GFP_TRANSHUGE) == GFP_TRANSHUGE) {
/*
* If compaction is deferred for high-order allocations, it is
* because sync compaction recently failed. If this is the case
* and the caller requested a THP allocation, we do not want
* to heavily disrupt the system, so we fail the allocation
* instead of entering direct reclaim.
*/
//运行延迟紧缩的话,直接返回nopage
if (deferred_compaction)
goto nopage;
/*
* In all zones where compaction was attempted (and not
* deferred or skipped), lock contention has been detected.
* For THP allocation we do not want to disrupt the others
* so we fallback to base pages instead.
*/
if (contended_compaction == COMPACT_CONTENDED_LOCK)
goto nopage;
/*
* If compaction was aborted due to need_resched(), we do not
* want to further increase allocation latency, unless it is
* khugepaged trying to collapse.
*/
if (contended_compaction == COMPACT_CONTENDED_SCHED
&& !(current->flags & PF_KTHREAD))
goto nopage;
}
/*
* It can become very expensive to allocate transparent hugepages at
* fault, so use asynchronous memory compaction for THP unless it is
* khugepaged trying to collapse.
*/
//内核线程或者非透明大页的情况,设置为轻同步紧缩模式
if ((gfp_mask & GFP_TRANSHUGE) != GFP_TRANSHUGE ||
(current->flags & PF_KTHREAD))
migration_mode = MIGRATE_SYNC_LIGHT;
/* Try direct reclaim and then allocating */
//接下来尝试回收内存后,在分配空闲页面
page = __alloc_pages_direct_reclaim(gfp_mask, order,
zonelist, high_zoneidx,
nodemask,
alloc_flags, preferred_zone,
classzone_idx, migratetype,
&did_some_progress);
if (page)
goto got_pg;
/*
* If we failed to make any progress reclaiming, then we are
* running out of options and have to consider going OOM
*/
//还是分配不到内存的话,就得尝试通过oom回收内存
if (!did_some_progress) {
if (oom_gfp_allowed(gfp_mask)) {
if (oom_killer_disabled)
goto nopage;
/* Coredumps can quickly deplete all memory reserves */
if ((current->flags & PF_DUMPCORE) &&
!(gfp_mask & __GFP_NOFAIL))
goto nopage;
page = __alloc_pages_may_oom(gfp_mask, order,
zonelist, high_zoneidx,
nodemask, preferred_zone,
classzone_idx, migratetype);
if (page)
goto got_pg;
if (!(gfp_mask & __GFP_NOFAIL)) {
/*
* The oom killer is not called for high-order
* allocations that may fail, so if no progress
* is being made, there are no other options and
* retrying is unlikely to help.
*/
if (order > PAGE_ALLOC_COSTLY_ORDER)
goto nopage;
/*
* The oom killer is not called for lowmem
* allocations to prevent needlessly killing
* innocent tasks.
*/
if (high_zoneidx < ZONE_NORMAL)
goto nopage;
}
goto restart;
}
}
/* Check if we should retry the allocation */
pages_reclaimed += did_some_progress;
if (should_alloc_retry(gfp_mask, order, did_some_progress,
pages_reclaimed)) {
/* Wait for some write requests to complete then retry */
wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
goto rebalance;
} else {
/*
* High-order allocations do not necessarily loop after
* direct reclaim and reclaim/compaction depends on compaction
* being called after reclaim so call directly if necessary
*/
//再次尝试通过内存紧缩分配内存
page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
high_zoneidx, nodemask, alloc_flags,
preferred_zone,
classzone_idx, migratetype,
migration_mode, &contended_compaction,
&deferred_compaction);
if (page)
goto got_pg;
}
nopage:
warn_alloc_failed(gfp_mask, order, NULL);
return page;
got_pg:
if (kmemcheck_enabled)
kmemcheck_pagealloc_alloc(page, order, gfp_mask);
if (page)
set_page_owner(page, order, gfp_mask);
return page;
}
//这个函数中在while中一直尝试从空闲链表获取内存,直到成功为止
static inline struct page *
__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, struct zone *preferred_zone,
int classzone_idx, int migratetype)
{
struct page *page;
do {
page = get_page_from_freelist(gfp_mask, nodemask, order,
zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
preferred_zone, classzone_idx, migratetype);
if (!page && gfp_mask & __GFP_NOFAIL)
wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
} while (!page && (gfp_mask & __GFP_NOFAIL));
return page;
}