Linux pageset

Don't_Touch_Me

已于 2024-02-21 23:17:56 修改

阅读量445

点赞数 7

分类专栏： Kernel 文章标签： linux pcp pageset

于 2024-02-21 22:39:25 首次发布

本文链接：https://blog.csdn.net/assiduous_me/article/details/136133994

版权

Kernel 专栏收录该内容

8 篇文章 0 订阅

订阅专栏

1. 引言

在用户进程发生缺页异常时，Linux内核需要分配所需物理页面以及建立也表映射，来维持进程的正常内存使用需求。而对于分配物理页面仅依赖于buddy系统，对于小order页面的分配效率较低。因此Linux通过在每个cpu维护一个page链表（percpu page list简称pageset），用来满足小order页面分配请求，提高页面分配效率。
下面我们重点来看一下，pageset的原理是什么，以及在Linux内核中是怎样实现和使用的。

2. pageset定义

struct zone {
......
	struct pglist_data	*zone_pgdat;
	struct per_cpu_pages	__percpu *per_cpu_pageset;
	struct per_cpu_zonestat	__percpu *per_cpu_zonestats;
	/*
	 * the high and batch values are copied to individual pagesets for
	 * faster access
	 */
	int pageset_high;
	int pageset_batch;
......
};

pageset的定义是放在zone里，每个zone里有一个per_cpu_pageset成员，用于这个zone内小order页面的快速分配。

3. pageset的初始化流程

调用流程

start_kernel(void)
---> setup_per_cpu_pageset();

从内核启动流程开始，通过调用setup_per_cpu_pageset()函数完成per_cpu_pageset初始化动作；

/*
 * Allocate per cpu pagesets and initialize them.
 * Before this call only boot pagesets were available.
 */
void __init setup_per_cpu_pageset(void)
{
	struct pglist_data *pgdat;
	struct zone *zone;
	int __maybe_unused cpu;

	for_each_populated_zone(zone) // 遍历可用的zone，设置zone_pageset
		setup_zone_pageset(zone);

#ifdef CONFIG_NUMA
	/*
	 * Unpopulated zones continue using the boot pagesets.
	 * The numa stats for these pagesets need to be reset.
	 * Otherwise, they will end up skewing the stats of
	 * the nodes these zones are associated with.
	 */
	for_each_possible_cpu(cpu) {
		struct per_cpu_zonestat *pzstats = &per_cpu(boot_zonestats, cpu);
		memset(pzstats->vm_numa_event, 0,
		       sizeof(pzstats->vm_numa_event));
	}
#endif

	for_each_online_pgdat(pgdat)
		pgdat->per_cpu_nodestats =
			alloc_percpu(struct per_cpu_nodestat);
}

void __meminit setup_zone_pageset(struct zone *zone)
{
	int cpu;

	/* Size may be 0 on !SMP && !NUMA */
	if (sizeof(struct per_cpu_zonestat) > 0)
		zone->per_cpu_zonestats = alloc_percpu(struct per_cpu_zonestat);

	zone->per_cpu_pageset = alloc_percpu(struct per_cpu_pages); // 为当前zone的per_cpu_pageset分配percpu内存
	for_each_possible_cpu(cpu) { // 遍历所有cpu
		struct per_cpu_pages *pcp;
		struct per_cpu_zonestat *pzstats;

		pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
		pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
		per_cpu_pages_init(pcp, pzstats); // 初始化per_cpu_pages
	}

	zone_set_pageset_high_and_batch(zone, 0);
}

/*
 * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed
 * costly to service.  That is between allocation orders which should
 * coalesce naturally under reasonable reclaim pressure and those which
 * will not.
 */
#define PAGE_ALLOC_COSTLY_ORDER 3 // 这个是指pageset支持分配的最大order，[0-3]
enum migratetype {
	MIGRATE_UNMOVABLE,
	MIGRATE_MOVABLE,
	MIGRATE_RECLAIMABLE,
	MIGRATE_PCPTYPES,	/* the number of types on the pcp lists */
	MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES,
	......
	MIGRATE_TYPES
};
/*
 * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER. One additional list
 * for THP which will usually be GFP_MOVABLE. Even if it is another type,
 * it should not contribute to serious fragmentation causing THP allocation
 * failures.
 */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define NR_PCP_THP 1
#else
#define NR_PCP_THP 0
#endif
#define NR_LOWORDER_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1))
#define NR_PCP_LISTS (NR_LOWORDER_PCP_LISTS + NR_PCP_THP)

static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats)
{
	int pindex;

	memset(pcp, 0, sizeof(*pcp));
	memset(pzstats, 0, sizeof(*pzstats));

	spin_lock_init(&pcp->lock);
	for (pindex = 0; pindex < NR_PCP_LISTS; pindex++) // 初始化pcp中不同迁移类型，不同order用来存放页面的链表
		INIT_LIST_HEAD(&pcp->lists[pindex]);

	/*
	 * Set batch and high values safe for a boot pageset. A true percpu
	 * pageset's initialization will update them subsequently. Here we don't
	 * need to be as careful as pageset_update() as nobody can access the
	 * pageset yet.
	 */
	pcp->high = BOOT_PAGESET_HIGH;
	pcp->batch = BOOT_PAGESET_BATCH;
	pcp->free_factor = 0;
}

4. pageset的页面分配（用来分配order为[0-3]的页面）

调用流程

alloc_pages()
---> alloc_pages_node()
-------> __alloc_pages_node()
----------> __alloc_pages()
-------------> get_page_from_freelist()
-----------------> rmqueue()

/*
 * Allocate a page from the given zone.
 * Use pcplists for THP or "cheap" high-order allocations.
 */

/*
 * Do not instrument rmqueue() with KMSAN. This function may call
 * __msan_poison_alloca() through a call to set_pfnblock_flags_mask().
 * If __msan_poison_alloca() attempts to allocate pages for the stack depot, it
 * may call rmqueue() again, which will result in a deadlock.
 */
__no_sanitize_memory
static inline
struct page *rmqueue(struct zone *preferred_zone,
			struct zone *zone, unsigned int order,
			gfp_t gfp_flags, unsigned int alloc_flags,
			int migratetype)
{
	struct page *page;

	/*
	 * We most definitely don't want callers attempting to
	 * allocate greater than order-1 page units with __GFP_NOFAIL.
	 */
	WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));

	if (likely(pcp_allowed_order(order))) { // 检查要分配的页面order是否是pcp允许的order
		/*
		 * MIGRATE_MOVABLE pcplist could have the pages on CMA area and
		 * we need to skip it when CMA area isn't allowed.
		 */
		if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA ||
				migratetype != MIGRATE_MOVABLE) { // 进行一些参数检查，如果满足条件，则从pageset中分配page
			page = rmqueue_pcplist(preferred_zone, zone, order, // 从pageset中分配页面
					migratetype, alloc_flags);
			if (likely(page))
				goto out;
		}
	}

	page = rmqueue_buddy(preferred_zone, zone, order, alloc_flags,
							migratetype);

out:
	/* Separate test+clear to avoid unnecessary atomics */
	if (unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) {
		clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
		wakeup_kswapd(zone, 0, 0, zone_idx(zone));
	}

	VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
	return page;
}

/*
 * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed
 * costly to service.  That is between allocation orders which should
 * coalesce naturally under reasonable reclaim pressure and those which
 * will not.
 */
#define PAGE_ALLOC_COSTLY_ORDER 3

static inline bool pcp_allowed_order(unsigned int order) // 检查该order页面是否允许从pageset中分配
{
	if (order <= PAGE_ALLOC_COSTLY_ORDER) // 主要就是判断order是否小于PAGE_ALLOC_COSTLY_ORDER，可以从前面的定义入手，发现order只要在[0-3]范围内就允许从pageset中分配
		return true;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (order == pageblock_order)
		return true;
#endif
	return false;
}

接下来我们看一下rmqueue_pcplist()是如何从pageset中分配页面的

/* Lock and remove page from the per-cpu list */
static struct page *rmqueue_pcplist(struct zone *preferred_zone,
			struct zone *zone, unsigned int order,
			int migratetype, unsigned int alloc_flags)
{
	struct per_cpu_pages *pcp;
	struct list_head *list;
	struct page *page;
	unsigned long flags;
	unsigned long __maybe_unused UP_flags;

	/*
	 * spin_trylock may fail due to a parallel drain. In the future, the
	 * trylock will also protect against IRQ reentrancy.
	 */
	pcp_trylock_prepare(UP_flags);
	pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags); // 获取当前cpu上的per_cpu_pages对象
	if (!pcp) {
		pcp_trylock_finish(UP_flags);
		return NULL;
	}

	/*
	 * On allocation, reduce the number of pages that are batch freed.
	 * See nr_pcp_free() where free_factor is increased for subsequent
	 * frees.
	 */
	pcp->free_factor >>= 1;
	list = &pcp->lists[order_to_pindex(migratetype, order)]; // 根据迁移类型和order大小找寻要从哪个页面链表中摘取页面
	page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list); // 摘取页面
	pcp_spin_unlock_irqrestore(pcp, flags);
	pcp_trylock_finish(UP_flags);
	if (page) { // 如果分配页面成功，做一些统计
		__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
		zone_statistics(preferred_zone, zone, 1);
	}
	return page; // 返回从pageset中分配到的页面
}

static inline unsigned int order_to_pindex(int migratetype, int order) // 根据迁移类型和要分配的order计算要从哪条页面链表中摘取页面，这个计算index的逻辑和一开始pageset初始化时一致（看不明白，可以往前翻找一下）
{
	int base = order;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (order > PAGE_ALLOC_COSTLY_ORDER) {
		VM_BUG_ON(order != pageblock_order);
		return NR_LOWORDER_PCP_LISTS;
	}
#else
	VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
#endif

	return (MIGRATE_PCPTYPES * base) + migratetype;
}

接下来看看__rmqueue_pcplist()函数内部是如何实现的：

/* Remove page from the per-cpu list, caller must protect the list */
static inline
struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
			int migratetype,
			unsigned int alloc_flags,
			struct per_cpu_pages *pcp,
			struct list_head *list)
{
	struct page *page;

	do {
		if (list_empty(list)) { // 如果当前list中没有页面，则需要从buddy系统中请求页面
			int batch = READ_ONCE(pcp->batch);
			int alloced;

			/*
			 * Scale batch relative to order if batch implies
			 * free pages can be stored on the PCP. Batch can
			 * be 1 for small zones or for boot pagesets which
			 * should never store free pages as the pages may
			 * belong to arbitrary zones.
			 */
			if (batch > 1)
				batch = max(batch >> order, 2);
			alloced = rmqueue_bulk(zone, order, // 从buddy中批量申请batch个order大小、migratetype类型的页面
					batch, list,
					migratetype, alloc_flags);

			pcp->count += alloced << order;
			if (unlikely(list_empty(list))) // 如果从buddy系统中申请不到页面，则返回NULL
				return NULL;
		}

		page = list_first_entry(list, struct page, pcp_list); // 从list中获取页面
		list_del(&page->pcp_list); // 删除页面
		pcp->count -= 1 << order; // pcp页面个数更新
	} while (check_new_pcp(page, order));

	return page;
}

/*
 * Obtain a specified number of elements from the buddy allocator, all under
 * a single hold of the lock, for efficiency.  Add them to the supplied list.
 * Returns the number of new pages which were placed at *list.
 */
static int rmqueue_bulk(struct zone *zone, unsigned int order,
			unsigned long count, struct list_head *list,
			int migratetype, unsigned int alloc_flags)
{
	int i, allocated = 0;

	/* Caller must hold IRQ-safe pcp->lock so IRQs are disabled. */
	spin_lock(&zone->lock);
	for (i = 0; i < count; ++i) { // 重复count次
		struct page *page = __rmqueue(zone, order, migratetype, // 每次从zone的buddy系统中申请一个对应order和migratetype的页面
								alloc_flags);
		if (unlikely(page == NULL)) // 从buddy系统中申请不到内存，则退出，否则继续申请
			break;

		if (unlikely(check_pcp_refill(page, order)))
			continue;

		/*
		 * Split buddy pages returned by expand() are received here in
		 * physical page order. The page is added to the tail of
		 * caller's list. From the callers perspective, the linked list
		 * is ordered by page number under some conditions. This is
		 * useful for IO devices that can forward direction from the
		 * head, thus also in the physical page order. This is useful
		 * for IO devices that can merge IO requests if the physical
		 * pages are ordered properly.
		 */
		list_add_tail(&page->pcp_list, list); // 将申请到的页面，挂载到pageset中的页面链表中
		allocated++; // 已分配的个数加一
		if (is_migrate_cma(get_pcppage_migratetype(page)))
			__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
					      -(1 << order));
	}

	/*
	 * i pages were removed from the buddy list even if some leak due
	 * to check_pcp_refill failing so adjust NR_FREE_PAGES based
	 * on i. Do not confuse with 'allocated' which is the number of
	 * pages added to the pcp list.
	 */
	__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
	spin_unlock(&zone->lock);
	return allocated; // 返回已分配页面个数
}

5. pageset的页面释放

调用流程

free_pages()
---> __free_pages()
------> free_the_page()

static inline void free_the_page(struct page *page, unsigned int order)
{
	if (pcp_allowed_order(order))		/* Via pcp? */ // 检查该order页面是否是从pageset中分配的
		free_unref_page(page, order); // 如果是的话，则释放到pageset中
	else
		__free_pages_ok(page, order, FPI_NONE);
}

/*
 * Free a pcp page
 */
void free_unref_page(struct page *page, unsigned int order)
{
	unsigned long flags;
	unsigned long __maybe_unused UP_flags;
	struct per_cpu_pages *pcp;
	struct zone *zone;
	unsigned long pfn = page_to_pfn(page);
	int migratetype;

	if (!free_unref_page_prepare(page, pfn, order))
		return;

	/*
	 * We only track unmovable, reclaimable and movable on pcp lists.
	 * Place ISOLATE pages on the isolated list because they are being
	 * offlined but treat HIGHATOMIC as movable pages so we can get those
	 * areas back if necessary. Otherwise, we may have to free
	 * excessively into the page allocator
	 */
	migratetype = get_pcppage_migratetype(page);
	if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
		if (unlikely(is_migrate_isolate(migratetype))) {
			free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE);
			return;
		}
		migratetype = MIGRATE_MOVABLE;
	}

	zone = page_zone(page);
	pcp_trylock_prepare(UP_flags);
	pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags); // 获取当前cpu的pageset对象
	if (pcp) {
		free_unref_page_commit(zone, pcp, page, migratetype, order); // 调用该函数将页面释放到pageset中
		pcp_spin_unlock_irqrestore(pcp, flags);
	} else {
		free_one_page(zone, page, pfn, order, migratetype, FPI_NONE);
	}
	pcp_trylock_finish(UP_flags);
}

static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
				   struct page *page, int migratetype,
				   unsigned int order)
{
	int high;
	int pindex;
	bool free_high;

	__count_vm_events(PGFREE, 1 << order);
	pindex = order_to_pindex(migratetype, order); // 计算该migratetype和order应该对应pageset哪条页面链表
	list_add(&page->pcp_list, &pcp->lists[pindex]); // 将该页面重新挂载到该链表中，用于后续分配
	pcp->count += 1 << order; // 更新pageset页面个数

	/*
	 * As high-order pages other than THP's stored on PCP can contribute
	 * to fragmentation, limit the number stored when PCP is heavily
	 * freeing without allocation. The remainder after bulk freeing
	 * stops will be drained from vmstat refresh context.
	 */
	free_high = (pcp->free_factor && order && order <= PAGE_ALLOC_COSTLY_ORDER);

	high = nr_pcp_high(pcp, zone, free_high);
	if (pcp->count >= high) { // 计算当前pageset保存的页面数量是否超过high值
		int batch = READ_ONCE(pcp->batch); // 如果超过，则需要将batch个页面返还给buddy系统

		free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch, free_high), pcp, pindex); // 将多余页面返还给buddy系统
	}
}

/*
 * Frees a number of pages from the PCP lists
 * Assumes all pages on list are in same zone.
 * count is the number of pages to free.
 */
static void free_pcppages_bulk(struct zone *zone, int count,
					struct per_cpu_pages *pcp,
					int pindex)
{
	int min_pindex = 0;
	int max_pindex = NR_PCP_LISTS - 1;
	unsigned int order;
	bool isolated_pageblocks;
	struct page *page;

	/*
	 * Ensure proper count is passed which otherwise would stuck in the
	 * below while (list_empty(list)) loop.
	 */
	count = min(pcp->count, count);

	/* Ensure requested pindex is drained first. */
	pindex = pindex - 1;

	/* Caller must hold IRQ-safe pcp->lock so IRQs are disabled. */
	spin_lock(&zone->lock);
	isolated_pageblocks = has_isolate_pageblock(zone);

	while (count > 0) { // 不断地将页面返还给buddy系统
		struct list_head *list;
		int nr_pages;

		/* Remove pages from lists in a round-robin fashion. */
		do {
			if (++pindex > max_pindex)
				pindex = min_pindex;
			list = &pcp->lists[pindex]; // 获取到页面所在链表
			if (!list_empty(list)) // 如果链表不为空，则跳出循环
				break;

			if (pindex == max_pindex)
				max_pindex--;
			if (pindex == min_pindex)
				min_pindex++;
		} while (1);

		order = pindex_to_order(pindex);
		nr_pages = 1 << order;
		do {
			int mt;

			page = list_last_entry(list, struct page, pcp_list); // 获取当前list中最后一个页面
			mt = get_pcppage_migratetype(page); // 获取页面的迁移类型

			/* must delete to avoid corrupting pcp list */
			list_del(&page->pcp_list); // 将页面从list中删除
			count -= nr_pages; // 减少要释放到页面数量
			pcp->count -= nr_pages; // 更新pageset页面个数

			if (bulkfree_pcp_prepare(page))
				continue;

			/* MIGRATE_ISOLATE page should not go to pcplists */
			VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
			/* Pageblock could have been isolated meanwhile */
			if (unlikely(isolated_pageblocks))
				mt = get_pageblock_migratetype(page);

			__free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE); // 释放页面
			trace_mm_page_pcpu_drain(page, order, mt);
		} while (count > 0 && !list_empty(list));
	}

	spin_unlock(&zone->lock);
}

至此Linux pageset初始化和使用流程介绍完毕，感谢各位读者浏览！

Don't_Touch_Me

关注

7
点赞
踩
8

收藏

觉得还不错? 一键收藏
2
评论
Linux pageset

在用户进程发生缺页异常时，Linux内核需要分配所需物理页面以及建立也表映射，来维持进程的正常内存使用需求。而对于分配物理页面仅依赖于buddy系统，对于小order页面的分配效率较低。因此Linux通过在每个cpu维护一个page链表（percpu page list简称pageset），用来满足小order页面分配请求，提高页面分配效率。
复制链接

扫一扫