[内核内存] 伙伴系统4---alloc_pages(内存块分配)

最新推荐文章于 2024-08-24 17:07:34 发布

早起的虫儿有鹰吃

最新推荐文章于 2024-08-24 17:07:34 发布

阅读量3k

点赞数 3

分类专栏： linux内存文章标签：伙伴系统 linux内核内存

本文链接：https://blog.csdn.net/u010923083/article/details/115873669

版权

linux内存专栏收录该内容

40 篇文章 77 订阅

订阅专栏

文章目录

alloc_pages源码流程分析

alloc_pages源码流程分析

alloc_pages开始了真正的页面申请,函数作用是:用于申请一块2^order的连续物理内存块

alloc_pages函数实现流程

在这里插入图片描述

代码细节分析

参数准备

//include/linux/gfp.h
/*
 * alloc_pages - expands to alloc_pages_node() with numa_node_id() as the
 * node argument. numa_node_id() presumably returns the id of the node the
 * caller is currently running on (its definition is not shown here).
 */
#define alloc_pages(gfp_mask, order) \
		alloc_pages_node(numa_node_id(), gfp_mask, order)
		
/*
 * alloc_pages_node - resolve the target node and forward the request to
 * __alloc_pages_node().
 * @nid:      requested node id; NUMA_NO_NODE is replaced by numa_mem_id()
 * @gfp_mask: allocation flags, passed through unchanged
 * @order:    allocation order, passed through unchanged
 *
 * Returns whatever __alloc_pages_node() returns.
 */
static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
						unsigned int order)
{
	/*
	 * Prefer the node id supplied by the caller. When the caller has no
	 * preference (NUMA_NO_NODE), fall back to numa_mem_id(), which
	 * presumably names the online node closest to the current CPU
	 * (its definition is not visible here - TODO confirm).
	 */
	if (nid == NUMA_NO_NODE)
		nid = numa_mem_id();

	return __alloc_pages_node(nid, gfp_mask, order);
}

/*
 * __alloc_pages_node - sanity-check @nid, look up the zonelist for that node
 * and forward the request to __alloc_pages().
 * @nid:      node id; checked to lie in [0, MAX_NUMNODES)
 * @gfp_mask: allocation flags; also used to select the zonelist
 * @order:    allocation order, passed through unchanged
 *
 * Returns whatever __alloc_pages() returns.
 */
static inline struct page *
__alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
{
	VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
	VM_WARN_ON(!node_online(nid));
	/*
	 * node_zonelist(nid, gfp_mask) yields the zonelist used for this
	 * node; per the original note it holds the node's zones ordered by
	 * allocation priority (definition not visible here).
	 */
	return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
}

/*
 * __alloc_pages - forward to __alloc_pages_nodemask() with a NULL nodemask.
 * @gfp_mask: allocation flags, passed through unchanged
 * @order:    allocation order, passed through unchanged
 * @zonelist: zonelist to allocate from, passed through unchanged
 *
 * Returns whatever __alloc_pages_nodemask() returns.
 */
static inline struct page *
__alloc_pages(gfp_t gfp_mask, unsigned int order,
		struct zonelist *zonelist)
{
	return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL);
}

核心函数__alloc_pages_nodemask

__alloc_pages_nodemask是伙伴系统的心脏，处理实质的内存分配工作。

（1）先进行参数初始化:alloc_mask, alloc_flags和struct alloc_context ac，用于决定内存块的分配条件。
（2） get_page_from_freelist:内核内存环境良好，直接进行快速分配，若成功返回获取free内存块
（3）__alloc_pages_slowpath:当前内存环境恶劣时，进入慢分配流程，若成功返回free内存块
（4）获取空闲内存块后对内存块和系统环境做检查，满足预定要求则返回申请的内存给内核使用

//mm/page_alloc.c
/*
 * __alloc_pages_nodemask - set up the allocation context, try the fast path
 * (get_page_from_freelist()) and fall back to __alloc_pages_slowpath().
 * @gfp_mask: allocation flags supplied by the caller
 * @order:    allocation order
 * @zonelist: zonelist to search
 * @nodemask: optional nodemask restricting the search; may be NULL
 *
 * Returns the allocated page, or NULL when the request is failed early, no
 * page could be obtained, or the memcg kmem charge fails.
 */
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
			struct zonelist *zonelist, nodemask_t *nodemask)
{
	struct page *page;
	/* The first attempt starts with the ALLOC_WMARK_LOW allocation flag. */
	unsigned int alloc_flags = ALLOC_WMARK_LOW;
	gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
	/* Bundle of parameters handed down to the helper functions. */
	struct alloc_context ac = {
		/*
		 * Zone index derived from gfp_mask; passed as the high_zoneidx
		 * limit when zones are looked up in the zonelist below.
		 */
		.high_zoneidx = gfp_zone(gfp_mask),
		/* Zonelist that will be searched. */
		.zonelist = zonelist,
		.nodemask = nodemask,
		/*
		 * Migrate type derived from gfp_mask.
		 * NOTE(review): the original note says bits 3-4 of the gfp
		 * flags encode it - confirm against gfpflags_to_migratetype().
		 */
		.migratetype = gfpflags_to_migratetype(gfp_mask),
	};

	/*
	 * With cpusets enabled, add __GFP_HARDWALL / ALLOC_CPUSET and, when
	 * the caller gave no nodemask, use cpuset_current_mems_allowed for
	 * the first attempt (restored before the slow path, see below).
	 */
	if (cpusets_enabled()) {
		alloc_mask |= __GFP_HARDWALL;
		alloc_flags |= ALLOC_CPUSET;
		if (!ac.nodemask)
			ac.nodemask = &cpuset_current_mems_allowed;
	}

	gfp_mask &= gfp_allowed_mask;
	/*
	 * NOTE(review): per the original note this only does something when
	 * both CONFIG_TRACE_IRQFLAGS and CONFIG_PROVE_LOCKING are defined and
	 * is an empty function otherwise - confirm.
	 */
	lockdep_trace_alloc(gfp_mask);
	/*
	 * If gfp_mask carries __GFP_DIRECT_RECLAIM the allocation may sleep,
	 * so might_sleep_if() is invoked for that case.
	 */
	might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
	/*
	 * NOTE(review): per the original note should_fail_alloc_page() always
	 * returns false when CONFIG_FAIL_PAGE_ALLOC is not set - confirm.
	 */
	if (should_fail_alloc_page(gfp_mask, order))
		return NULL;

	/*
	 * Check the zones suitable for the gfp_mask contain at least one
	 * valid zone. It's possible to have an empty zonelist as a result
	 * of __GFP_THISNODE and a memoryless node
	 */
	/* Bail out when the first zoneref of the zonelist has no zone. */
	if (unlikely(!zonelist->_zonerefs->zone))
		return NULL;

	if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE)
		alloc_flags |= ALLOC_CMA;

	/* Dirty zone balancing only done in the fast path */
	ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE);

	/*
	 * The preferred zone is used for statistics but crucially it is
	 * also used as the starting point for the zonelist iterator. It
	 * may get reset for allocations that ignore memory policies.
	 */
	/*
	 * Pick the first zoneref in the zonelist that fits ac.high_zoneidx
	 * and ac.nodemask.
	 */
	ac.preferred_zoneref = first_zones_zonelist(ac.zonelist,
					ac.high_zoneidx, ac.nodemask);
	/* No zone was found: skip the fast path and go to the slow path. */
	if (!ac.preferred_zoneref->zone) {
		page = NULL;
		/*
		 * This might be due to race with cpuset_current_mems_allowed
		 * update, so make sure we retry with original nodemask in the
		 * slow path.
		 */
		goto no_zone;
	}

	/* First allocation attempt (fast path) */
	page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
	if (likely(page))
		goto out;

no_zone:
	/*
	 * Runtime PM, block IO and its error handling path can deadlock
	 * because I/O on the device might not complete.
	 */
	alloc_mask = memalloc_noio_flags(gfp_mask);
	ac.spread_dirty_pages = false;

	/*
	 * Restore the original nodemask if it was potentially replaced with
	 * &cpuset_current_mems_allowed to optimize the fast-path attempt.
	 */
	if (unlikely(ac.nodemask != nodemask))
		ac.nodemask = nodemask;

	page = __alloc_pages_slowpath(alloc_mask, order, &ac);

out:
	/* Post-allocation checks on the returned page. */
	/*
	 * memcg kmem accounting (control-group related): when it is enabled
	 * and __GFP_ACCOUNT is set, a failed memcg_kmem_charge() frees the
	 * page again and the allocation is reported as failed.
	 */
	if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
	    unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) {
		__free_pages(page, order);
		page = NULL;
	}

	if (kmemcheck_enabled && page)
		kmemcheck_pagealloc_alloc(page, order, gfp_mask);

	trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);

	return page;
}
EXPORT_SYMBOL(__alloc_pages_nodemask);

__alloc_pages_nodemask()分配内存页面的关键函数是：get_page_from_freelist()和__alloc_pages_slowpath()，其中所谓的快速分配就是直接从现有的内存中去分配，如果不成功，再去尝试慢速分配。慢速分配会进行内存压缩，回收，然后再去尝试分配内存。

快速内存分配（get_page_from_freelist)

get_page_from_freelist()最先用于尝试页面分配，它通过标志集和分配阶（order）来判断是否能进行请求的内存分配操作，如果分配失败的情况下，则会进一步调用__alloc_pages_slowpath()。下面先展示该函数代码流程图，然后再具体分析代码实现细节

get_page_from_freelist流程图

在这里插入图片描述
图片参考LoyenWang博主的内存分配器

get_page_from_freelist代码实现细节

//mm/page_alloc.c
/*
 * get_page_from_freelist - walk the zonelist starting at the preferred
 * zoneref and try to take a block of the requested order from the first
 * zone that passes the cpuset, dirty-limit and watermark checks.
 * @gfp_mask:    allocation flags
 * @order:       allocation order
 * @alloc_flags: internal ALLOC_* flags (watermark selector, ALLOC_CPUSET,
 *               ALLOC_NO_WATERMARKS, ALLOC_HARDER, ...)
 * @ac:          allocation context (zonelist, nodemask, high_zoneidx, ...)
 *
 * Returns the prepared page on success, or NULL when no zone yielded one.
 */
static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
						const struct alloc_context *ac)
{
	struct zoneref *z = ac->preferred_zoneref;
	struct zone *zone;
	struct pglist_data *last_pgdat_dirty_limit = NULL;
	/*
	 * Iterate over the zones of ac->zonelist, starting at z, bounded by
	 * ac->high_zoneidx and filtered by ac->nodemask, looking for a zone
	 * that can satisfy the request.
	 */
	for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
								ac->nodemask) {
		struct page *page;
		unsigned long mark;
		/*
		 * When cpusets are enabled and ALLOC_CPUSET is set, skip any
		 * zone that __cpuset_zone_allowed() rejects for this gfp_mask.
		 * Presumably the helper checks whether the current task may
		 * allocate from the node the zone belongs to - TODO confirm.
		 */
		if (cpusets_enabled() &&
			(alloc_flags & ALLOC_CPUSET) &&
			!__cpuset_zone_allowed(zone, gfp_mask))
				continue;

		/*
		 * ac->spread_dirty_pages non-zero: per the original note the
		 * allocation is for writing and may add dirty pages, so nodes
		 * over their dirty limit are avoided.
		 */
		if (ac->spread_dirty_pages) {

			/* This node was already found to fail the dirty check: skip. */
			if (last_pgdat_dirty_limit == zone->zone_pgdat)
				continue;
			/* Check the dirty state of the zone's node; remember a failing node. */
			if (!node_dirty_ok(zone->zone_pgdat)) {
				last_pgdat_dirty_limit = zone->zone_pgdat;
				continue;
			}
		}
		/*
		 * Next, check the zone against the watermark for this order.
		 * If the check fails (and none of the escapes below apply) the
		 * loop moves on to the next zone in the list; only a zone that
		 * passes reaches the try_this_zone label where the allocation
		 * is actually attempted.
		 */
		/* Watermark selected by the ALLOC_WMARK_MASK bits of alloc_flags. */
		mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
		/* Does the zone pass the watermark check for this order? */
		if (!zone_watermark_fast(zone, order, mark,
				       ac_classzone_idx(ac), alloc_flags)) {
			int ret;

			/* Checked here to keep the fast path fast */
			BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
			/* ALLOC_NO_WATERMARKS: ignore the failed check and try this zone anyway. */
			if (alloc_flags & ALLOC_NO_WATERMARKS)
				goto try_this_zone;

			/*
			 * The zone is below the watermark, so reclaim would be
			 * needed. Skip the zone instead of reclaiming when:
			 * 1. node reclaim is disabled (node_reclaim_mode == 0);
			 * 2. zone_allows_reclaim() rejects this zone relative to
			 *    the preferred zone (presumably a node-distance check
			 *    against RECLAIM_DISTANCE - TODO confirm).
			 */
			if (node_reclaim_mode == 0 ||
			    !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
				continue;
			/* Try to reclaim memory on the zone's node. */
			ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
			switch (ret) {
			case NODE_RECLAIM_NOSCAN:
				/* did not scan */
				continue;
			case NODE_RECLAIM_FULL:
				/* scanned but unreclaimable */
				continue;
			default:				
				/* Any other result: re-check the watermark before using the zone. */
				if (zone_watermark_ok(zone, order, mark,
						ac_classzone_idx(ac), alloc_flags))
					goto try_this_zone;

				continue;
			}
		}
/*
 * Reached when the zone passed the watermark check, when reclaim brought it
 * back above the watermark, or when ALLOC_NO_WATERMARKS bypasses the check.
 */
try_this_zone:
		/* Take the block from this zone. */
		page = buffered_rmqueue(ac->preferred_zoneref->zone, zone, order,
				gfp_mask, alloc_flags, ac->migratetype);
		if (page) {
			/*
			 * Prepare the new page for use (per the original note:
			 * clears related flags or sets up a compound page).
			 */
			prep_new_page(page, order, gfp_mask, alloc_flags);

			/*
			 * If this is a high-order atomic allocation then check
			 * if the pageblock should be reserved for the future
			 */
			/*
			 * order != 0 together with ALLOC_HARDER is treated as a
			 * high-order atomic allocation, and the page is offered
			 * to reserve_highatomic_pageblock().
			 * NOTE(review): the original note says the reserved
			 * highatomic pageblocks may not exceed 1/100 of the
			 * zone's pages - confirm in that function.
			 */
			if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
				reserve_highatomic_pageblock(page, zone, order);

			return page;
		}
	}

	return NULL;
}

慢速内存分配(__alloc_pages_slowpath())

__alloc_pages_slowpath()是用于慢速页面分配，允许等待,内存压缩和内存回收等.需要注意的是慢速分配仍然要调用到get_page_from_freelist函数来进行内存的获取，慢速分配的大致过程概况如下：

降低水印ALLOC_WMARK_MIN，如果设置了__GFP_KSWAPD_RECLAIM就唤醒交换线程
调用get_page_from_freelist尝试重新分配
如果分配的页阶大于PAGE_ALLOC_COSTLY_ORDER（即3），且允许直接回收、本次分配不属于无水线限制的分配，则尝试内存规整操作，通过内存迁移合并出较大的内存块，然后尝试内存分配
如果设置了__GFP_KSWAPD_RECLAIM再次唤醒交换线程，确保交换线程不会意外睡去
直接进行内存回收之后尝试分配
如果内存回收没有分配到所需内存，就再次进行内存规整之后尝试分配内存
如果当前内存获取仍然失败就尝试杀死一些进程后再尝试分配内存
（内存获取还未成功）检查分配标志是否存在一些潜在可调的空间，然后再次调用get_page_from_freelist尝试分配
最后若获取到需要的内存空间返回空闲内存，若系统尽了最大努力仍然无法提供需要的空闲内存则返回NULL

__alloc_pages_slowpath函数流程图

在这里插入图片描述
图片参考LoyenWang博主的内存分配器

__alloc_pages_slowpath代码实现细节

//mm/page_alloc.c
/*
 * __alloc_pages_slowpath - slow allocation path, used after the first
 * get_page_from_freelist() attempt failed. It retries the freelist with
 * rebuilt flags and then escalates through direct compaction, direct
 * reclaim and __alloc_pages_may_oom(), looping while progress is possible.
 * @gfp_mask: allocation flags
 * @order:    allocation order
 * @ac:       allocation context; ac->preferred_zoneref is recomputed here
 *
 * NOTE: this is an excerpt - the local declarations and some statements of
 * the function are elided ("....") and are not visible here.
 *
 * Returns the value of the local "page" at got_pg (the allocated page on
 * success).
 */
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
						struct alloc_context *ac)
{
....
....
....
retry_cpuset:
	compaction_retries = 0;
	no_progress_loops = 0;
	compact_priority = DEF_COMPACT_PRIORITY;
	cpuset_mems_cookie = read_mems_allowed_begin();

	/*
	 * Recompute the preferred zoneref from the current zonelist and
	 * nodemask (per the original note: the values may have been changed
	 * during the fast path). Give up if no zone qualifies.
	 */
	ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
					ac->high_zoneidx, ac->nodemask);
	if (!ac->preferred_zoneref->zone)
		goto nopage;


	/*
	 * Rebuild the allocation flags from gfp_mask. Per the original note
	 * this lowers the watermark to ALLOC_WMARK_MIN, i.e. moves from a
	 * conservative to a more aggressive allocation - TODO confirm in
	 * gfp_to_alloc_flags().
	 */
	alloc_flags = gfp_to_alloc_flags(gfp_mask);
	/* With __GFP_KSWAPD_RECLAIM set, wake the kswapd threads. */
	if (gfp_mask & __GFP_KSWAPD_RECLAIM)
		wake_all_kswapds(order, ac);


	/* Retry the freelist with the adjusted flags. */
	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
	if (page)
		goto got_pg;


	/*
	 * The failure above may be caused by fragmentation, so try direct
	 * compaction (page migration) and then allocate again. This is only
	 * done when all of the following hold:
	 * (1) direct reclaim is allowed (can_direct_reclaim; per the original
	 *     note this reflects gfp_mask & __GFP_DIRECT_RECLAIM);
	 * (2) the order is above PAGE_ALLOC_COSTLY_ORDER (3 per the original
	 *     note);
	 * (3) the request is not one that may ignore watermarks, i.e.
	 *     gfp_pfmemalloc_allowed(gfp_mask) returns false.
	 */
	if (can_direct_reclaim && order > PAGE_ALLOC_COSTLY_ORDER &&
		!gfp_pfmemalloc_allowed(gfp_mask)) {
		page = __alloc_pages_direct_compact(gfp_mask, order,
						alloc_flags, ac,
						INIT_COMPACT_PRIORITY,
						&compact_result);
		if (page)
			goto got_pg;

		/*
		 * Checks for costly allocations with __GFP_NORETRY, which
		 * includes THP page fault allocations
		 */
		if (gfp_mask & __GFP_NORETRY) {

			if (compact_result == COMPACT_DEFERRED)
				goto nopage;

			compact_priority = INIT_COMPACT_PRIORITY;
		}
	}

retry:
	/* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
	if (gfp_mask & __GFP_KSWAPD_RECLAIM)
		wake_all_kswapds(order, ac);

	/* If gfp_mask permits it, allocate without watermark restrictions. */
	if (gfp_pfmemalloc_allowed(gfp_mask))
		alloc_flags = ALLOC_NO_WATERMARKS;

	/*
	 * Recompute the preferred zoneref when the allocation is not bound
	 * by ALLOC_CPUSET or may ignore watermarks.
	 */
	if (!(alloc_flags & ALLOC_CPUSET) || (alloc_flags & ALLOC_NO_WATERMARKS)) {
		ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
					ac->high_zoneidx, ac->nodemask);
	}

	/* Attempt with potentially adjusted zonelist and alloc_flags */
	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
	if (page)
		goto got_pg;

	/* Caller is not willing to reclaim, we can't balance anything */
	if (!can_direct_reclaim) {
		WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL);
		goto nopage;
	}
	/* Avoid recursion of direct reclaim */
	/*
	 * The current task already has PF_MEMALLOC set, so direct reclaim is
	 * not entered from here. Only a __GFP_NOFAIL request keeps retrying
	 * (after a one-time warning and cond_resched()); others fail.
	 */
	if (current->flags & PF_MEMALLOC) {
		if (WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
			cond_resched();
			goto retry;
		}
		goto nopage;
	}

	/* Avoid allocations with no watermarks from looping endlessly */
	/* A TIF_MEMDIE task fails here unless the request is __GFP_NOFAIL. */
	if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
		goto nopage;


	/* Try direct reclaim and then allocating */
	page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
							&did_some_progress);
	if (page)
		goto got_pg;

	/* Try direct compaction and then allocating */
	page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
					compact_priority, &compact_result);
	if (page)
		goto got_pg;

	/* Do not loop if specifically requested */
	if (gfp_mask & __GFP_NORETRY)
		goto nopage;

	/*
	 * Do not keep looping for costly (order > PAGE_ALLOC_COSTLY_ORDER)
	 * requests unless gfp_mask has __GFP_REPEAT set.
	 */
	if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
		goto nopage;

	/*
	 * Warn that this allocation has been stalling for too long, then
	 * push the next warning out by another 10 * HZ.
	 */
	if (time_after(jiffies, alloc_start + stall_timeout)) {
		warn_alloc(gfp_mask,
			"page allocation stalls for %ums, order:%u",
			jiffies_to_msecs(jiffies-alloc_start), order);
		stall_timeout += 10 * HZ;
	}
	/* Decide whether another round of reclaim is worthwhile. */
	if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
				 did_some_progress > 0, &no_progress_loops))
		goto retry;

	/* Decide whether another round of compaction is worthwhile. */
	if (did_some_progress > 0 &&
			should_compact_retry(ac, order, alloc_flags,
				compact_result, &compact_priority,
				&compaction_retries))
		goto retry;

	/*
	 * Before resorting to the OOM path, check whether the cpuset changed
	 * since the cookie was read; if so start over from retry_cpuset.
	 */
	if (read_mems_allowed_retry(cpuset_mems_cookie))
		goto retry_cpuset;

	/*
	 * Everything above failed to produce a page: call
	 * __alloc_pages_may_oom(). Per the original note this starts the OOM
	 * killer to kill some processes and tries to allocate from the memory
	 * that frees up.
	 */
	page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
	if (page)
		goto got_pg;

	/* Retry as long as the OOM killer is making progress */
	/*
	 * The OOM path returned no page but reported progress: reset
	 * no_progress_loops and go through the retry sequence again.
	 */
	if (did_some_progress) {
		no_progress_loops = 0;
		goto retry;
	}

nopage:
	/*
	 * If the cpuset was updated concurrently (a race was detected via the
	 * cookie), restart the whole sequence from retry_cpuset.
	 */
	if (read_mems_allowed_retry(cpuset_mems_cookie))
		goto retry_cpuset;

	warn_alloc(gfp_mask,
			"page allocation failure: order:%u", order);
got_pg:
	return page;
}