linux kernel page分配实现二

最新推荐文章于 2023-10-31 15:15:14 发布

jerry_ms

最新推荐文章于 2023-10-31 15:15:14 发布

阅读量499

点赞数

本文链接：https://blog.csdn.net/u014089131/article/details/53231638

版权

linux系统经过长时间运行后，很难有太多free pages可供分配，

这个时候分配内存只能走慢速分配方式，里面涉及到内存回收和紧缩；

内存回收跟紧缩算法比较复杂，涉及内容比较多，将在后续逐个介绍；

static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
	struct zonelist *zonelist, enum zone_type high_zoneidx,
	nodemask_t *nodemask, struct zone *preferred_zone,
	int classzone_idx, int migratetype)
{
	const gfp_t wait = gfp_mask & __GFP_WAIT; //wait high fs
	struct page *page = NULL;
	int alloc_flags;
	unsigned long pages_reclaimed = 0;
	unsigned long did_some_progress;
	enum migrate_mode migration_mode = MIGRATE_ASYNC;
	bool deferred_compaction = false;
	int contended_compaction = COMPACT_CONTENDED_NONE;


	/*
	 * In the slowpath, we sanity check order to avoid ever trying to
	 * reclaim >= MAX_ORDER areas which will never succeed. Callers may
	 * be using allocators in order of preference for an area that is
	 * too large.
	 */
	if (order >= MAX_ORDER) {
		WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
		return NULL;
	}

	/*
	 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
	 * __GFP_NOWARN set) should not cause reclaim since the subsystem
	 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
	 * using a larger set of nodes after it has established that the
	 * allowed per node queues are empty and that nodes are
	 * over allocated.
	 */
	if (IS_ENABLED(CONFIG_NUMA) &&
	    (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
		goto nopage;

restart:
	//如果未禁止kswapd，则唤醒节点的kswapd线程
	if (!(gfp_mask & __GFP_NO_KSWAPD))
		wake_all_kswapds(order, zonelist, high_zoneidx,
				preferred_zone, nodemask);

	/*
	 * OK, we're below the kswapd watermark and have kicked background
	 * reclaim. Now things get more complex, so set up alloc_flags according
	 * to how we want to proceed.
	 */
	//gfp 转成alloc_flags
	alloc_flags = gfp_to_alloc_flags(gfp_mask);

	/*
	 * Find the true preferred zone if the allocation is unconstrained by
	 * cpusets.
	 */
	if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) {
		struct zoneref *preferred_zoneref;
		preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
				NULL, &preferred_zone);
		classzone_idx = zonelist_zone_idx(preferred_zoneref);
	}

rebalance:
	/* This is the last chance, in general, before the goto nopage. */
	//再次尝试从空闲链表中获取内存
	page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
			high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
			preferred_zone, classzone_idx, migratetype);
	if (page)
		goto got_pg;

	/* Allocate without watermarks if the context allows */
	//需要忽略watermark时，表示系统紧急需要内存，要比较高优先级处理
	if (alloc_flags & ALLOC_NO_WATERMARKS) {
		/*
		 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
		 * the allocation is high priority and these type of
		 * allocations are system rather than user orientated
		 */
		zonelist = node_zonelist(numa_node_id(), gfp_mask);

		page = __alloc_pages_high_priority(gfp_mask, order,
				zonelist, high_zoneidx, nodemask,
				preferred_zone, classzone_idx, migratetype);
		if (page) {
			goto got_pg;
		}
	}

	/* Atomic allocations - we can't balance anything */
	if (!wait) {
		/*
		 * All existing users of the deprecated __GFP_NOFAIL are
		 * blockable, so warn of any new users that actually allow this
		 * type of allocation to fail.
		 */
		WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL);
		goto nopage;
	}

	/* Avoid recursion of direct reclaim */
	//调用者本身是内存回收线程的话，直接返回，避免死锁
	if (current->flags & PF_MEMALLOC)
		goto nopage;

	/* Avoid allocations with no watermarks from looping endlessly */
	if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
		goto nopage;

	/*
	 * Try direct compaction. The first pass is asynchronous. Subsequent
	 * attempts after direct reclaim are synchronous
	 */
	//接下来尝试通过内存紧缩，减少内存碎片，看能否分配到连续的page，
	//先尝试异步压缩，接着尝试同步压缩。
	//这部分比较复杂，后面单独进行分析。
	page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
					high_zoneidx, nodemask, alloc_flags,
					preferred_zone,
					classzone_idx, migratetype,
					migration_mode, &contended_compaction,
					&deferred_compaction);
	if (page)
		goto got_pg;

	/* Checks for THP-specific high-order allocations */
	//透明大页的分配
	if ((gfp_mask & GFP_TRANSHUGE) == GFP_TRANSHUGE) {
		/*
		 * If compaction is deferred for high-order allocations, it is
		 * because sync compaction recently failed. If this is the case
		 * and the caller requested a THP allocation, we do not want
		 * to heavily disrupt the system, so we fail the allocation
		 * instead of entering direct reclaim.
		 */
		//运行延迟紧缩的话，直接返回nopage
		if (deferred_compaction)
			goto nopage;

		/*
		 * In all zones where compaction was attempted (and not
		 * deferred or skipped), lock contention has been detected.
		 * For THP allocation we do not want to disrupt the others
		 * so we fallback to base pages instead.
		 */
		if (contended_compaction == COMPACT_CONTENDED_LOCK)
			goto nopage;

		/*
		 * If compaction was aborted due to need_resched(), we do not
		 * want to further increase allocation latency, unless it is
		 * khugepaged trying to collapse.
		 */
		if (contended_compaction == COMPACT_CONTENDED_SCHED
			&& !(current->flags & PF_KTHREAD))
			goto nopage;
	}

	/*
	 * It can become very expensive to allocate transparent hugepages at
	 * fault, so use asynchronous memory compaction for THP unless it is
	 * khugepaged trying to collapse.
	 */
	//内核线程或者非透明大页的情况，设置为轻同步紧缩模式
	if ((gfp_mask & GFP_TRANSHUGE) != GFP_TRANSHUGE ||
						(current->flags & PF_KTHREAD))
		migration_mode = MIGRATE_SYNC_LIGHT;

	/* Try direct reclaim and then allocating */
	//接下来尝试回收内存后，在分配空闲页面
	page = __alloc_pages_direct_reclaim(gfp_mask, order,
					zonelist, high_zoneidx,
					nodemask,
					alloc_flags, preferred_zone,
					classzone_idx, migratetype,
					&did_some_progress);
	if (page)
		goto got_pg;

	/*
	 * If we failed to make any progress reclaiming, then we are
	 * running out of options and have to consider going OOM
	 */
	 //还是分配不到内存的话，就得尝试通过oom回收内存
	if (!did_some_progress) {
		if (oom_gfp_allowed(gfp_mask)) {
			if (oom_killer_disabled)
				goto nopage;
			/* Coredumps can quickly deplete all memory reserves */
			if ((current->flags & PF_DUMPCORE) &&
			    !(gfp_mask & __GFP_NOFAIL))
				goto nopage;
			page = __alloc_pages_may_oom(gfp_mask, order,
					zonelist, high_zoneidx,
					nodemask, preferred_zone,
					classzone_idx, migratetype);
			if (page)
				goto got_pg;

			if (!(gfp_mask & __GFP_NOFAIL)) {
				/*
				 * The oom killer is not called for high-order
				 * allocations that may fail, so if no progress
				 * is being made, there are no other options and
				 * retrying is unlikely to help.
				 */
				if (order > PAGE_ALLOC_COSTLY_ORDER)
					goto nopage;
				/*
				 * The oom killer is not called for lowmem
				 * allocations to prevent needlessly killing
				 * innocent tasks.
				 */
				if (high_zoneidx < ZONE_NORMAL)
					goto nopage;
			}

			goto restart;
		}
	}

	/* Check if we should retry the allocation */
	pages_reclaimed += did_some_progress;
	if (should_alloc_retry(gfp_mask, order, did_some_progress,
						pages_reclaimed)) {
		/* Wait for some write requests to complete then retry */
		wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
		goto rebalance;
	} else {
		/*
		 * High-order allocations do not necessarily loop after
		 * direct reclaim and reclaim/compaction depends on compaction
		 * being called after reclaim so call directly if necessary
		 */
		//再次尝试通过内存紧缩分配内存
		page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
					high_zoneidx, nodemask, alloc_flags,
					preferred_zone,
					classzone_idx, migratetype,
					migration_mode, &contended_compaction,
					&deferred_compaction);
		if (page)
			goto got_pg;
	}

nopage:
	warn_alloc_failed(gfp_mask, order, NULL);
	return page;
got_pg:
	if (kmemcheck_enabled)
		kmemcheck_pagealloc_alloc(page, order, gfp_mask);

	if (page)
		set_page_owner(page, order, gfp_mask);
	return page;
}

//这个函数中在while中一直尝试从空闲链表获取内存，直到成功为止
static inline struct page *
__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
	struct zonelist *zonelist, enum zone_type high_zoneidx,
	nodemask_t *nodemask, struct zone *preferred_zone,
	int classzone_idx, int migratetype)
{
	struct page *page;

	do {
		page = get_page_from_freelist(gfp_mask, nodemask, order,
			zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
			preferred_zone, classzone_idx, migratetype);

		if (!page && gfp_mask & __GFP_NOFAIL)
			wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
	} while (!page && (gfp_mask & __GFP_NOFAIL));

	return page;
}