上一篇记录了伙伴系统的组织结构,这一篇记录一下基于伙伴系统的具体分配策略。
一、分配接口
alloc_pages(gfp_t gfp_mask, unsigned int order)
分配2^order个连续的page,返回内存的起始page
alloc_page(mask)
就是alloc_pages(mask, 0)
get_zeroed_page(mask)分配一页,并将页面初始化成0.
__get_free_pages(mask, order) 返回虚拟地址而非page
__get_free_page(mask)
__get_dma_pages(gfp_mask, order) 用来获得适用于dma的页。
二、释放接口
free_page(addr)
free_pages(addr, order)
__free_pages(struct page *, order)
__free_page(struct page *)
三、分配掩码
上述分配函数所包含的入参mask(下称修饰符)可以告诉内核自己需要什么样的内存,以及分配策略信息。
内存域修饰符,决定从哪个zone去分配内存:
___GFP_DMA ___GFP_HIGHMEM ___GFP_DMA32
内核提供了一个函数专门计算给定mask兼容的最高内存域(zone)
/*
 * Map the zone-selection bits of @flags to a zone type.
 *
 * The zone bits (flags & GFP_ZONEMASK) are used as an index into
 * GFP_ZONE_TABLE, which is read as a packed array of entries that are
 * GFP_ZONES_SHIFT bits wide. GFP_ZONE_BAD is a bitmap indexed the same
 * way; a set bit triggers VM_BUG_ON for that combination of zone bits.
 *
 * Returns the table entry selected by the zone bits of @flags.
 */
static inline enum zone_type gfp_zone(gfp_t flags)
{
	int zone_bits = (__force int) (flags & GFP_ZONEMASK);
	/* Bit offset of this combination's entry inside the packed table. */
	int table_shift = zone_bits * GFP_ZONES_SHIFT;
	/* Mask covering exactly one GFP_ZONES_SHIFT-wide entry. */
	int entry_mask = (1 << GFP_ZONES_SHIFT) - 1;
	enum zone_type zone = (GFP_ZONE_TABLE >> table_shift) & entry_mask;

	VM_BUG_ON((GFP_ZONE_BAD >> zone_bits) & 1);
	return zone;
}
这个函数挺难理解的,以后再看看吧。
其他修饰符:
/*
 * Raw values of the non-zone GFP modifier bits listed in this article.
 * Every flag occupies its own single bit (0x10 .. 0x800000), so flags can
 * be combined with bitwise OR without colliding.
 */
#define ___GFP_RECLAIMABLE 0x10u
#define ___GFP_HIGH 0x20u
#define ___GFP_IO 0x40u
#define ___GFP_FS 0x80u
#define ___GFP_ZERO 0x100u
#define ___GFP_ATOMIC 0x200u
#define ___GFP_DIRECT_RECLAIM 0x400u
#define ___GFP_KSWAPD_RECLAIM 0x800u
#define ___GFP_WRITE 0x1000u
#define ___GFP_NOWARN 0x2000u
#define ___GFP_RETRY_MAYFAIL 0x4000u
#define ___GFP_NOFAIL 0x8000u
#define ___GFP_NORETRY 0x10000u
#define ___GFP_MEMALLOC 0x20000u
#define ___GFP_COMP 0x40000u
#define ___GFP_NOMEMALLOC 0x80000u
#define ___GFP_HARDWALL 0x100000u
#define ___GFP_THISNODE 0x200000u
#define ___GFP_ACCOUNT 0x400000u
/*
 * Without CONFIG_LOCKDEP the flag is defined as 0, so OR-ing it into a
 * mask has no effect.
 */
#ifdef CONFIG_LOCKDEP
#define ___GFP_NOLOCKDEP 0x800000u
#else
#define ___GFP_NOLOCKDEP 0
#endif
一般我们不会直接使用这些修饰符,而是使用由它们组合而成的掩码,比如内核中最常见的GFP_KERNEL
/* GFP_KERNEL is the bitwise OR of __GFP_RECLAIM, __GFP_IO and __GFP_FS. */
#define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS)
从字面上我们可以看出,GFP_KERNEL允许在分配过程中触发内存回收(__GFP_RECLAIM,因此调用者可能会睡眠),也允许为了回收内存而发起磁盘IO(__GFP_IO)以及调用文件系统相关的操作(__GFP_FS)。
四、核心分配函数
__alloc_pages是伙伴系统的核心分配函数
/*
 * This is the 'heart' of the zoned buddy allocator.
 *
 * Excerpt only: lines shown as "..." are elided from the original function,
 * so the setup of alloc_gfp/ac and the work done after the "out" label are
 * not visible here.
 *
 * Visible flow: one attempt through get_page_from_freelist() with
 * alloc_flags initialised to ALLOC_WMARK_LOW; if that yields no page, the
 * request is handed to __alloc_pages_slowpath().
 *
 * Returns the page pointer left in 'page' after the (elided) code at "out".
 */
struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
nodemask_t *nodemask)
{
struct page *page;
/* Flags for the first attempt start out as ALLOC_WMARK_LOW. */
unsigned int alloc_flags = ALLOC_WMARK_LOW;
gfp_t alloc_gfp; /* The gfp_t that was actually used for allocation */
struct alloc_context ac = { };
...
/* First allocation attempt */
page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac);
if (likely(page))
goto out;
...
/* First attempt failed: note the slow path is called without alloc_flags. */
page = __alloc_pages_slowpath(alloc_gfp, order, &ac);
out:
...
return page;
}
get_page_from_freelist负责第一轮的内存分配,如果分配成功就return,如果不成功就再走slow path-> __alloc_pages_slowpath
/*
 * get_page_from_freelist goes through the zonelist trying to allocate
 * a page.
 *
 * @gfp_mask:    GFP mask, forwarded to the watermark check, node_reclaim(),
 *               rmqueue() and prep_new_page().
 * @order:       order of the requested block.
 * @alloc_flags: ALLOC_* flags; ALLOC_NOFRAGMENT may be cleared here before
 *               the zonelist is rescanned.
 * @ac:          allocation context supplying the preferred zoneref, the
 *               highest usable zone index, the nodemask and the migratetype.
 *
 * Returns the prepared page on success, or NULL when no zone in the
 * zonelist could satisfy the request.
 *
 * Excerpt only: the line shown as "..." inside the loop is elided from the
 * original function.
 */
static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
						const struct alloc_context *ac)
{
	struct zoneref *z;
	struct zone *zone;
	struct pglist_data *last_pgdat_dirty_limit = NULL;
	bool no_fallback;

retry:
	/*
	 * Scan zonelist, looking for a zone with enough free.
	 * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
	 */
	/* Remember whether this pass still tries to avoid fragmentation. */
	no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
	z = ac->preferred_zoneref;
	/* Walk the zonelist one zone at a time, starting at the preferred one. */
	for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx,
					ac->nodemask) {
		struct page *page;
		unsigned long mark;
		...
		if (no_fallback && nr_online_nodes > 1 &&
		    zone != ac->preferred_zoneref->zone) {
			int local_nid;

			/*
			 * If moving to a remote node, retry but allow
			 * fragmenting fallbacks. Locality is more important
			 * than fragmentation avoidance.
			 */
			local_nid = zone_to_nid(ac->preferred_zoneref->zone);
			if (zone_to_nid(zone) != local_nid) {
				alloc_flags &= ~ALLOC_NOFRAGMENT;
				goto retry;
			}
		}

		mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
		/* Check whether this zone's watermark permits the allocation. */
		if (!zone_watermark_fast(zone, order, mark,
					 ac->highest_zoneidx, alloc_flags,
					 gfp_mask)) {
			int ret;

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
			/*
			 * Watermark failed for this zone, but see if we can
			 * grow this zone if it contains deferred pages.
			 */
			if (static_branch_unlikely(&deferred_pages)) {
				if (_deferred_grow_zone(zone, order))
					goto try_this_zone;
			}
#endif
			/* Checked here to keep the fast path fast */
			BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
			if (alloc_flags & ALLOC_NO_WATERMARKS)
				goto try_this_zone;

			if (node_reclaim_mode == 0 ||
			    !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
				continue;

			/* Watermark too low: try reclaiming memory on this node. */
			ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
			switch (ret) {
			case NODE_RECLAIM_NOSCAN:
				/* did not scan */
				continue;
			case NODE_RECLAIM_FULL:
				/* scanned but unreclaimable */
				continue;
			default:
				/* did we reclaim enough */
				if (zone_watermark_ok(zone, order, mark,
					ac->highest_zoneidx, alloc_flags))
					goto try_this_zone;

				/*
				 * Still below the watermark after reclaim:
				 * skip this zone instead of falling through
				 * to the allocation below.
				 */
				continue;
			}
		}

try_this_zone:
		/* A usable zone was found; perform the actual allocation. */
		page = rmqueue(ac->preferred_zoneref->zone, zone, order,
				gfp_mask, alloc_flags, ac->migratetype);
		if (page) {
			prep_new_page(page, order, gfp_mask, alloc_flags);

			/*
			 * If this is a high-order atomic allocation then check
			 * if the pageblock should be reserved for the future
			 */
			if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
				reserve_highatomic_pageblock(page, zone, order);

			return page;
		} else {
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
			/* Try again if zone has deferred pages */
			if (static_branch_unlikely(&deferred_pages)) {
				if (_deferred_grow_zone(zone, order))
					goto try_this_zone;
			}
#endif
		}
	}

	/*
	 * It's possible on a UMA machine to get through all zones that are
	 * fragmented. If avoiding fragmentation, reset and try again.
	 */
	if (no_fallback) {
		alloc_flags &= ~ALLOC_NOFRAGMENT;
		goto retry;
	}

	return NULL;
}
下面看看rmqueue是怎么工作的
/*
 * Allocate a page from the given zone. Use pcplists for order-0 allocations.
 *
 * In the code below the pcplist path is gated on pcp_allowed_order(order);
 * everything else is taken from the zone's free lists under zone->lock.
 *
 * @preferred_zone: zone passed on to rmqueue_pcplist() and zone_statistics().
 * @zone:           zone to allocate from.
 * @order:          order of the requested block.
 * @gfp_flags:      GFP mask of the request.
 * @alloc_flags:    ALLOC_* flags (ALLOC_CMA and ALLOC_HARDER are tested here).
 * @migratetype:    migratetype of the free list to allocate from.
 *
 * Returns the allocated page, or NULL on failure.
 */
static inline
struct page *rmqueue(struct zone *preferred_zone,
struct zone *zone, unsigned int order,
gfp_t gfp_flags, unsigned int alloc_flags,
int migratetype)
{
unsigned long flags;
struct page *page;
// When the order qualifies (e.g. a small request), allocate from the per-cpu list.
if (likely(pcp_allowed_order(order))) {
/*
 * MIGRATE_MOVABLE pcplist could have the pages on CMA area and
 * we need to skip it when CMA area isn't allowed.
 */
if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA ||
migratetype != MIGRATE_MOVABLE) {
page = rmqueue_pcplist(preferred_zone, zone, order,
gfp_flags, migratetype, alloc_flags);
/* The pcplist result is returned as-is; zone->lock is never taken on this path. */
goto out;
}
}
/*
 * We most definitely don't want callers attempting to
 * allocate greater than order-1 page units with __GFP_NOFAIL.
 */
WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
spin_lock_irqsave(&zone->lock, flags);
/* Repeat while a page was obtained but check_new_pages() returns non-zero for it. */
do {
page = NULL;
/*
 * order-0 request can reach here when the pcplist is skipped
 * due to non-CMA allocation context. HIGHATOMIC area is
 * reserved for high-order atomic allocation, so order-0
 * request should skip it.
 */
if (order > 0 && alloc_flags & ALLOC_HARDER) {
page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
if (page)
trace_mm_page_alloc_zone_locked(page, order, migratetype);
}
/* No page from the HIGHATOMIC attempt (or it was skipped): use __rmqueue(). */
if (!page)
page = __rmqueue(zone, order, migratetype, alloc_flags);
} while (page && check_new_pages(page, order));
if (!page)
goto failed;
/* Still under zone->lock: account for the 1 << order pages just removed. */
__mod_zone_freepage_state(zone, -(1 << order),
get_pcppage_migratetype(page));
spin_unlock_irqrestore(&zone->lock, flags);
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
zone_statistics(preferred_zone, zone, 1);
out:
/* Separate test+clear to avoid unnecessary atomics */
if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
wakeup_kswapd(zone, 0, 0, zone_idx(zone));
}
VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
return page;
failed:
/* Nothing could be allocated: drop the lock taken above and report failure. */
spin_unlock_irqrestore(&zone->lock, flags);
return NULL;
}
总体上就是分成低阶和高阶两条路,低阶一般在pcp上分配,这是zone上的冷热页管理结构,高阶就用一般的分配函数__rmqueue_smallest或__rmqueue。
再来看看 __rmqueue_smallest
/*
 * Go through the free lists for the given migratetype and remove
 * the smallest available page from the freelists.
 *
 * Starting at @order and moving up to (but not including) MAX_ORDER, the
 * first free_area whose @migratetype list yields a page wins. That page is
 * unlinked from its free list, handed to expand() together with the
 * requested and the found order, and tagged with @migratetype.
 *
 * Returns the page, or NULL if no order in that range had one.
 */
static __always_inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
						int migratetype)
{
	unsigned int cur = order;

	while (cur < MAX_ORDER) {
		/* zone->free_area is indexed by order; each entry is then looked up by migratetype. */
		struct free_area *fa = &zone->free_area[cur];
		struct page *blk = get_page_from_free_area(fa, migratetype);

		if (blk) {
			/* Take the block off the free list it was found on. */
			del_page_from_free_list(blk, zone, cur);
			/*
			 * The block found may be larger than requested;
			 * expand() is given both orders to deal with the
			 * surplus.
			 */
			expand(zone, blk, order, cur, migratetype);
			set_pcppage_migratetype(blk, migratetype);
			return blk;
		}

		++cur;
	}

	return NULL;
}
这个函数就是实实在在从zone的free_list数据结构指向的内存区分配内存了,其实就是在链表上找到合适的内存块然后从链表中删除要分配出去的那个块。最终分配函数返回一个代表要分配内存块的起始page结构。
这样get_page_from_freelist就结束了,回到__alloc_pages,如果第一轮分配失败那就要靠第二轮
__alloc_pages_slowpath,这个函数非常长,这里就不贴代码了。总的来说就是内核不断地加大强度去分配内存,包括唤醒kswapd,腾出更多的空间,这可能会比较费时,所以函数名里带有slowpath。所以在系统内存不足的时候很可能影响到内存分配的效率,从而影响到系统性能。
内核内存分配是非常重要也是非常复杂的部分,由于水平所限这里仅仅做了一些很粗略的分析。虽然代码贴了很多但是多数细节还不是特别明白,留待日后补足。