不知不觉,居然totally花了5个小时整理完了页面分配的快速路径相关的内容,看了一下目录,一共才17页。。。。。我居然花了那么久总结。不知道是学习效率低,还是能力不行。不过根据实际情况来看一般就是两者皆有T_T
回顾一下之前立的flag,用20天复习的学习路径,感觉从下周开始有点悬了。。。。毕竟休假结束了,不可能再投这么多时间用来自学了。
而且从上午2小时到下午3小时,中间的“上下文切换”耗费的时间也不少,要进入这种入定学习的状态还挺难的。(因为学的内容本身比较难,代码级别晦涩难懂)
页面分配-快速路径概念梳理
快速路径分配函数预览
快速路径分配核心函数: __alloc_pages_nodemask()源码分析
/*
 * This is the 'heart' of the zoned buddy allocator.
 */
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
							nodemask_t *nodemask)
{
	struct page *page;
	/*
	 * alloc_flags describes the behaviour/attributes of the allocation.
	 * Initialised to ALLOC_WMARK_LOW: by default the allocation is
	 * permitted as long as the zone stays above the low watermark.
	 */
	unsigned int alloc_flags = ALLOC_WMARK_LOW;
	gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
	/*
	 * struct alloc_context gathers the parameters used throughout the
	 * buddy allocator.  prepare_alloc_pages() computes and stores the
	 * relevant information here (high_zoneidx, migratetype, zonelist...).
	 */
	struct alloc_context ac = { };

	/*
	 * MAX_ORDER = 11, so the largest request is 2^10 contiguous pages
	 * (4MB with 4KB pages); anything bigger fails immediately.
	 */
	if (unlikely(order >= MAX_ORDER)) {
		WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
		return NULL;
	}

	gfp_mask &= gfp_allowed_mask;
	alloc_mask = gfp_mask;
	if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
		return NULL;

	/* finalise_ac() determines the preferred zone to start from. */
	finalise_ac(gfp_mask, &ac);

	/*
	 * Anti-fragmentation optimisation: may add the ALLOC_NOFRAGMENT
	 * flag, preferring ZONE_DMA32 as a suitable zone so that other
	 * zones are not fragmented.
	 */
	alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask);

	/*
	 * Fast path: try to take pages straight from the buddy free lists.
	 * On success this returns the struct page of the first page of the
	 * allocated block.
	 */
	page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
	if (likely(page))
		goto out;

	alloc_mask = current_gfp_context(gfp_mask);
	ac.spread_dirty_pages = false;

	if (unlikely(ac.nodemask != nodemask))
		ac.nodemask = nodemask;

	/* The fast path failed: fall back to the slow path. */
	page = __alloc_pages_slowpath(alloc_mask, order, &ac);

out:
	if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
	    unlikely(__memcg_kmem_charge(page, gfp_mask, order) != 0)) {
		__free_pages(page, order);
		page = NULL;
	}

	trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);

	return page;
}
EXPORT_SYMBOL(__alloc_pages_nodemask);
快速路径分配函数: prepare_alloc_pages()源码分析
/*
 * Gather the allocation parameters into the alloc_context (declared in
 * <mm/internal.h>) and perform early checks before the allocation proper.
 * Returns false when the allocation must fail immediately.
 */
static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
		int preferred_nid, nodemask_t *nodemask,
		struct alloc_context *ac, gfp_t *alloc_mask,
		unsigned int *alloc_flags)
{
	/* gfp_zone() derives the highest usable zone index from the GFP mask. */
	ac->high_zoneidx = gfp_zone(gfp_mask);
	/*
	 * node_zonelist() returns the zonelist of the preferred node
	 * preferred_nid.  A node normally has two zonelists: the local
	 * ZONELIST_FALLBACK one and the remote ZONELIST_NOFALLBACK one.
	 */
	ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
	ac->nodemask = nodemask;
	/* Derive the page migrate type from the GFP mask. */
	ac->migratetype = gfpflags_to_migratetype(gfp_mask);

	if (cpusets_enabled()) {
		*alloc_mask |= __GFP_HARDWALL;
		if (!ac->nodemask)
			ac->nodemask = &cpuset_current_mems_allowed;
		else
			*alloc_flags |= ALLOC_CPUSET;
	}

	fs_reclaim_acquire(gfp_mask);
	fs_reclaim_release(gfp_mask);

	might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);

	if (should_fail_alloc_page(gfp_mask, order)) /* fault-injection hook */
		return false;

	if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
		*alloc_flags |= ALLOC_CMA;

	return true;
}
快速路径分配函数: finalise_ac()源码分析
/* Determine the preferred zone and whether dirty-page spreading applies. */
static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac)
{
	/* Dirty zone balancing only done in the fast path */
	ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);

	/*
	 * The preferred zone is used for statistics but crucially it is
	 * also used as the starting point for the zonelist iterator. It
	 * may get reset for allocations that ignore memory policies.
	 */
	ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
					ac->high_zoneidx, ac->nodemask);
}
快速路径分配函数: get_page_from_freelist()源码分析
/*
 * get_page_from_freelist goes through the zonelist trying to allocate a
 * page: scan the eligible zones, check watermarks, then take pages from
 * the buddy free lists via rmqueue().
 *
 * NOTE(fix): the transcription had dropped the function's return type;
 * restored to "static struct page *" as declared in mm/page_alloc.c.
 */
static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
						const struct alloc_context *ac)
{
	struct zoneref *z;
	struct zone *zone;
	struct pglist_data *last_pgdat_dirty_limit = NULL;
	bool no_fallback;

retry:
	/* ALLOC_NOFRAGMENT: try not to fragment remote zones. */
	no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
	/*
	 * preferred_zoneref is the recommended starting zone, computed by
	 * first_zones_zonelist() in finalise_ac().
	 */
	z = ac->preferred_zoneref;
	/* Walk every eligible zone, starting from the preferred one. */
	for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
								ac->nodemask) {
		struct page *page;
		unsigned long mark;

		if (cpusets_enabled() &&
			(alloc_flags & ALLOC_CPUSET) &&
			!__cpuset_zone_allowed(zone, gfp_mask))
				continue;

		if (ac->spread_dirty_pages) {
			if (last_pgdat_dirty_limit == zone->zone_pgdat)
				continue;

			if (!node_dirty_ok(zone->zone_pgdat)) {
				last_pgdat_dirty_limit = zone->zone_pgdat;
				continue;
			}
		}

		/*
		 * NUMA: when this zone lives on a remote node, locality
		 * matters more than fragmentation avoidance, so drop
		 * ALLOC_NOFRAGMENT and restart the scan.
		 */
		if (no_fallback && nr_online_nodes > 1 &&
		    zone != ac->preferred_zoneref->zone) {
			int local_nid;

			local_nid = zone_to_nid(ac->preferred_zoneref->zone);
			if (zone_to_nid(zone) != local_nid) {
				alloc_flags &= ~ALLOC_NOFRAGMENT;
				goto retry;
			}
		}

		/*
		 * wmark_pages() yields the page count for the requested
		 * watermark (WMARK_MIN / WMARK_LOW / WMARK_HIGH).  Since
		 * Linux 5.0 a temporary "boost" can raise the watermark to
		 * fight external fragmentation.
		 */
		mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
		/*
		 * zone_watermark_fast() returns true when the zone is above
		 * the watermark or can still satisfy this order.
		 */
		if (!zone_watermark_fast(zone, order, mark,
				       ac_classzone_idx(ac), alloc_flags)) {
			int ret;

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
			/*
			 * Watermark failed for this zone, but see if we can
			 * grow this zone if it contains deferred pages.
			 */
			if (static_branch_unlikely(&deferred_pages)) {
				if (_deferred_grow_zone(zone, order))
					goto try_this_zone;
			}
#endif
			/* Checked here to keep the fast path fast */
			BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
			if (alloc_flags & ALLOC_NO_WATERMARKS)
				goto try_this_zone;

			/*
			 * With node_reclaim_mode == 0, just move on to the
			 * next zone or node instead of reclaiming here.
			 */
			if (node_reclaim_mode == 0 ||
			    !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
				continue;

			/* Try to reclaim some memory from this node. */
			ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
			switch (ret) {
			case NODE_RECLAIM_NOSCAN:
				/* did not scan */
				continue;
			case NODE_RECLAIM_FULL:
				/* scanned but unreclaimable */
				continue;
			default:
				/* did we reclaim enough */
				if (zone_watermark_ok(zone, order, mark,
						ac_classzone_idx(ac), alloc_flags))
					goto try_this_zone;

				continue;
			}
		}

		/*
		 * Allocate from this zone: rmqueue() is the core buddy
		 * allocator entry point.
		 */
try_this_zone:
		page = rmqueue(ac->preferred_zoneref->zone, zone, order,
				gfp_mask, alloc_flags, ac->migratetype);
		if (page) {
			prep_new_page(page, order, gfp_mask, alloc_flags);

			/*
			 * If this is a high-order atomic allocation then check
			 * if the pageblock should be reserved for the future
			 */
			if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
				reserve_highatomic_pageblock(page, zone, order);

			return page;
		} else {
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
			/* Try again if zone has deferred pages */
			if (static_branch_unlikely(&deferred_pages)) {
				if (_deferred_grow_zone(zone, order))
					goto try_this_zone;
			}
#endif
		}
	}

	/*
	 * It's possible on a UMA machine to get through all zones that are
	 * fragmented. If avoiding fragmentation, reset and try again.
	 */
	if (no_fallback) {
		alloc_flags &= ~ALLOC_NOFRAGMENT;
		goto retry;
	}

	return NULL;
}
get_page_from_freelist注意要点:
-
遍历zonelist中的zone时,扫描zone的方向是从高端zone到低端zone
-
大部分情况优先从首选的zone(preferred_zone)开始扫描。首选zone是通过gfp_mask换算的
-
alloc_context是一个非常重要的参数,它定义了从哪个zone开始扫描和分配内存的迁移类型等信息,必须在调用get_page_from_freelist()函数时初始化
/*
 * Context shared across the page allocator: where to start scanning
 * (zonelist / preferred_zoneref), which nodes and zones are eligible
 * (nodemask / high_zoneidx) and the migrate type of the request.
 * Must be initialised before calling get_page_from_freelist().
 */
struct alloc_context {
	struct zonelist *zonelist;		/* zonelist to iterate over */
	nodemask_t *nodemask;			/* allowed nodes; NULL means all */
	struct zoneref *preferred_zoneref;	/* first zone to try */
	int migratetype;			/* requested migrate type */
	enum zone_type high_zoneidx;		/* highest usable zone index */
	bool spread_dirty_pages;		/* honour per-node dirty limits */
};
-
zone_watermark_ok()函数负责判断zone的水位情况及是否满足分配连续大内存块的需求
快速路径分配函数: zone_watermark_fast()源码分析
/*
 * Fast watermark check.
 * @z:             zone being tested against the allocation request
 * @order:         allocation order
 * @mark:          watermark level to test against
 * @classzone_idx: index of the preferred zone
 * @alloc_flags:   allocator-internal flags
 *
 * Returns true when the zone is above the given watermark or otherwise
 * satisfies the order-sized request.
 */
static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
		unsigned long mark, int classzone_idx, unsigned int alloc_flags)
{
	/*
	 * zone_page_state() reads the free-page count from the zone's
	 * vm_stat[] array of per-zone page statistics.
	 */
	long free_pages = zone_page_state(z, NR_FREE_PAGES);
	long cma_pages = 0;

#ifdef CONFIG_CMA
	/* If allocation can't use CMA areas don't use free CMA pages */
	if (!(alloc_flags & ALLOC_CMA))
		cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES);
#endif

	/* Fast path for single-page (order-0) requests. */
	if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx])
		return true;

	/* Fall back to the full check. */
	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
					free_pages);
}
快速路径分配函数: rmqueue()源码分析
/*
 * Allocate 2^order pages from @zone.
 * @preferred_zone: first-choice zone (used for statistics)
 * @zone:           zone currently being scanned
 * @order:          allocation order
 * @gfp_flags:      caller-supplied GFP mask
 * @alloc_flags:    allocator-internal flags
 * @migratetype:    migrate type of the request
 */
static inline
struct page *rmqueue(struct zone *preferred_zone,
			struct zone *zone, unsigned int order,
			gfp_t gfp_flags, unsigned int alloc_flags,
			int migratetype)
{
	unsigned long flags;
	struct page *page;

	if (likely(order == 0)) {
		/*
		 * Single pages come from the per-CPU page (PCP) lists
		 * (per_cpu_pages) via rmqueue_pcplist().
		 */
		page = rmqueue_pcplist(preferred_zone, zone, gfp_flags,
					migratetype, alloc_flags);
		goto out;
	}

	WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
	/*
	 * zone->lock protects the zone's buddy free lists; the PCP path
	 * above does not need this spinlock.
	 */
	spin_lock_irqsave(&zone->lock, flags);

	/* Keep calling __rmqueue() until a valid block is obtained. */
	do {
		page = NULL;
		if (alloc_flags & ALLOC_HARDER) {
			/*
			 * __rmqueue_smallest() splits a higher-order block
			 * when needed (e.g. an order-5 block becomes two
			 * order-4 halves: one is handed out, the other goes
			 * back on the order-4 free list).  If it fails, the
			 * __rmqueue() call below may fall back to stealing
			 * from another migrate type's free lists via
			 * __rmqueue_fallback().
			 */
			page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
			if (page)
				trace_mm_page_alloc_zone_locked(page, order, migratetype);
		}
		if (!page)
			page = __rmqueue(zone, order, migratetype, alloc_flags);
	/* check_new_pages() rejects blocks that fail sanity checks. */
	} while (page && check_new_pages(page, order));
	spin_unlock(&zone->lock);
	if (!page)
		goto failed;
	__mod_zone_freepage_state(zone, -(1 << order),
				  get_pcppage_migratetype(page));

	__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
	zone_statistics(preferred_zone, zone);
	local_irq_restore(flags);

out:
	/* Separate test+clear to avoid unnecessary atomics */
	/*
	 * If ZONE_BOOSTED_WATERMARK is set, clear it and wake the kswapd
	 * kernel thread to reclaim memory (watermark boosting).
	 */
	if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
		clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
		wakeup_kswapd(zone, 0, 0, zone_idx(zone));
	}

	/* VM_BUG_ON_PAGE is only active with CONFIG_DEBUG_VM enabled. */
	VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
	return page;	/* struct page of the first allocated page */

failed:
	local_irq_restore(flags);
	return NULL;
}
快速路径分配函数: __rmqueue_smallest()源码分析
/*
 * Take a block from the smallest free list that can satisfy @order,
 * starting at @order and climbing towards MAX_ORDER.
 */
static __always_inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
						int migratetype)
{
	unsigned int current_order;
	struct free_area *area;
	struct page *page;

	/* Find a page of the appropriate size in the preferred list */
	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
		area = &(zone->free_area[current_order]);
		page = get_page_from_free_area(area, migratetype);
		if (!page)
			continue;
		del_page_from_free_area(page, area);
		/* expand() returns the unused halves to the free lists. */
		expand(zone, page, order, current_order, area, migratetype);
		set_pcppage_migratetype(page, migratetype);
		return page;
	}

	return NULL;
}
快速路径分配函数: expand()源码分析
通过expand函数,将n阶的空闲页切分成两个n-1阶的页面,用于分配
/*
 * Split a block of order @high down to order @low, putting each
 * split-off upper half back on the matching free list.
 * @high is the order the block was taken from (current_order).
 */
static inline void expand(struct zone *zone, struct page *page,
	int low, int high, struct free_area *area,
	int migratetype)
{
	unsigned long size = 1 << high;

	while (high > low) {
		area--;
		high--;
		size >>= 1;
		VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);

		/*
		 * Mark as guard pages (or page), that will allow to
		 * merge back to allocator when buddy will be freed.
		 * Corresponding page table entries will not be touched,
		 * pages will stay not present in virtual address space
		 */
		if (set_page_guard(zone, &page[size], high, migratetype))
			continue;

		add_to_free_area(&page[size], area, migratetype);
		set_page_order(&page[size], high);
	}
}
释放页面
释放页面的核心函数是free_page(), 但最终还是会调用__free_pages()函数
__free_pages()
/*
 * Drop a reference on @page; when it was the last reference, hand the
 * 2^order pages back to the allocator via free_the_page().
 */
void __free_pages(struct page *page, unsigned int order)
{
	if (!put_page_testzero(page))
		return;	/* other references remain; nothing to free yet */
	free_the_page(page, order);
}
free_the_page()
/*
 * Route the free: order-0 pages go to the per-CPU (PCP) lists, larger
 * blocks go straight back to the buddy system.
 */
static inline void free_the_page(struct page *page, unsigned int order)
{
	if (order)
		__free_pages_ok(page, order);	/* multi-page buddy free */
	else
		free_unref_page(page);		/* single page via pcp */
}
单个页面释放free_unref_page()
/* Free a single (order-0) page to the per-CPU page (PCP) lists. */
void free_unref_page(struct page *page)
{
	unsigned long flags;
	/* page_to_pfn() converts the struct page to its page frame number. */
	unsigned long pfn = page_to_pfn(page);

	/* Sanity checks before the page may be freed. */
	if (!free_unref_page_prepare(page, pfn))
		return;

	/*
	 * Disable local interrupts so an interrupt cannot allocate pages
	 * and corrupt this CPU's PCP lists in the middle of the free.
	 */
	local_irq_save(flags);
	/* Put the single page on the PCP list. */
	free_unref_page_commit(page, pfn);
	local_irq_restore(flags);
}
多个页面释放:__free_pages_ok
/* Free a block of 2^order pages straight back to the buddy system. */
static void __free_pages_ok(struct page *page, unsigned int order)
{
	unsigned long flags;
	int migratetype;
	unsigned long pfn = page_to_pfn(page);

	if (!free_pages_prepare(page, order, true))
		return;

	migratetype = get_pfnblock_migratetype(page, pfn);
	local_irq_save(flags);
	__count_vm_events(PGFREE, 1 << order);
	/* Hand the block back to the buddy allocator. */
	free_one_page(page_zone(page), page, pfn, order, migratetype);
	local_irq_restore(flags);
}
多个页面释放:__free_pages_ok ->free_one_page->__free_one_page
/* Lock the zone and release one block into its buddy free lists. */
static void free_one_page(struct zone *zone,
				struct page *page, unsigned long pfn,
				unsigned int order,
				int migratetype)
{
	/* zone->lock protects the zone's buddy free lists. */
	spin_lock(&zone->lock);
	if (unlikely(has_isolate_pageblock(zone) ||
		is_migrate_isolate(migratetype))) {
		migratetype = get_pfnblock_migratetype(page, pfn);
	}
	/*
	 * __free_one_page() releases the block into the buddy system and
	 * also merges it with free buddies into larger blocks.
	 */
	__free_one_page(page, pfn, zone, order, migratetype);
	spin_unlock(&zone->lock);
}
/*
 * Free one block into the buddy allocator, merging it with its free
 * buddy into progressively larger blocks for as long as possible.
 */
static inline void __free_one_page(struct page *page,
		unsigned long pfn,
		struct zone *zone, unsigned int order,
		int migratetype)
{
	unsigned long combined_pfn;
	unsigned long uninitialized_var(buddy_pfn);
	struct page *buddy;
	unsigned int max_order;
	struct capture_control *capc = task_capc(zone);

	max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);

	VM_BUG_ON(!zone_is_initialized(zone));
	VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);

	VM_BUG_ON(migratetype == -1);
	if (likely(!is_migrate_isolate(migratetype)))
		__mod_zone_freepage_state(zone, 1 << order, migratetype);

	VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
	VM_BUG_ON_PAGE(bad_range(zone, page), page);

continue_merging:
	while (order < max_order - 1) {
		if (compaction_capture(capc, page, order, migratetype)) {
			__mod_zone_freepage_state(zone, -(1 << order),
								migratetype);
			return;
		}
		/* Locate the same-order buddy of this block. */
		buddy_pfn = __find_buddy_pfn(pfn, order);
		buddy = page + (buddy_pfn - pfn);

		if (!pfn_valid_within(buddy_pfn))
			goto done_merging;
		if (!page_is_buddy(page, buddy, order))
			goto done_merging;
		/*
		 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
		 * merge with it and move up one order.
		 */
		if (page_is_guard(buddy))
			clear_page_guard(zone, buddy, order, migratetype);
		else
			del_page_from_free_area(buddy, &zone->free_area[order]);
		combined_pfn = buddy_pfn & pfn;
		page = page + (combined_pfn - pfn);
		pfn = combined_pfn;
		order++;
	}
	if (max_order < MAX_ORDER) {
		/* If we are here, it means order is >= pageblock_order.
		 * We want to prevent merge between freepages on isolate
		 * pageblock and normal pageblock. Without this, pageblock
		 * isolation could cause incorrect freepage or CMA accounting.
		 *
		 * We don't want to hit this code for the more frequent
		 * low-order merging.
		 */
		if (unlikely(has_isolate_pageblock(zone))) {
			int buddy_mt;

			buddy_pfn = __find_buddy_pfn(pfn, order);
			buddy = page + (buddy_pfn - pfn);
			buddy_mt = get_pageblock_migratetype(buddy);

			if (migratetype != buddy_mt
					&& (is_migrate_isolate(migratetype) ||
						is_migrate_isolate(buddy_mt)))
				goto done_merging;
		}
		max_order++;
		goto continue_merging;
	}

done_merging:
	set_page_order(page, order);

	/*
	 * If this is not the largest possible page, check if the buddy
	 * of the next-highest order is free. If it is, it's possible
	 * that pages are being freed that will coalesce soon. In case,
	 * that is happening, add the free page to the tail of the list
	 * so it's less likely to be used soon and more likely to be merged
	 * as a higher order page
	 */
	if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)
			&& !is_shuffle_order(order)) {
		struct page *higher_page, *higher_buddy;
		combined_pfn = buddy_pfn & pfn;
		higher_page = page + (combined_pfn - pfn);
		buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
		higher_buddy = higher_page + (buddy_pfn - combined_pfn);
		if (pfn_valid_within(buddy_pfn) &&
		    page_is_buddy(higher_page, higher_buddy, order + 1)) {
			add_to_free_area_tail(page, &zone->free_area[order],
					      migratetype);
			return;
		}
	}

	if (is_shuffle_order(order))
		add_to_free_area_random(page, &zone->free_area[order],
				migratetype);
	else
		add_to_free_area(page, &zone->free_area[order], migratetype);
}
释放内存页面的核心功能就是把空闲页面添加到伙伴系统中适当的空闲链表中。在释放内存时,会检查相邻的内存块是否空闲,如果空闲,就将其合并成一个大的内存块,放置到高一阶的空闲链表中。如果还能继续合并邻近的内存块,就会继续合并,转移到更高阶的空闲链表中,这个过程会不断重复执行,直到所有可能合并的内存块都合并完成。
名词解释:外碎片化(external fragmentation)
外碎片化指系统有足够的空闲内存,但是没有办法分配出想要的内存块。这是因为有很多空闲内存分散在众多的页块中,导致没法分配出一个连续和完整的大内存块。
Linux内核在分配物理页面时,如果发现没有办法分配出想要的物理内存,特别是大内存块,那么它会从其他迁移类型中挪用内存(通过__rmqueue_fallback()函数),此时就可以认为系统存在外碎片化倾向。
Linux5.0引入了临时增加水位功能,当探测到有外碎片化倾向时,就临时提高低水位,提前触发kswapd内核线程回收内存,然后触发kcompactd内核线程做内存规整。提高大块物理内存分配的需求满足的概率。