1. 引言
在用户进程发生缺页异常时,Linux内核需要分配所需的物理页面并建立页表映射,以满足进程正常的内存使用需求。如果物理页面的分配仅依赖buddy系统,小order页面的分配效率较低。因此Linux在每个cpu上维护一个page链表(per-cpu page list,简称pageset),用来满足小order页面的分配请求,提高页面分配效率。
下面我们重点来看一下,pageset的原理是什么,以及在Linux内核中是怎样实现和使用的。
2. pageset定义
/*
 * Excerpt of struct zone: only the members related to the per-cpu pagesets
 * are shown; the "......" lines stand for members omitted here.
 */
struct zone {
......
struct pglist_data *zone_pgdat;
/* Per-cpu page lists of this zone: one struct per_cpu_pages per CPU. */
struct per_cpu_pages __percpu *per_cpu_pageset;
/* Per-cpu statistics of this zone: one struct per_cpu_zonestat per CPU. */
struct per_cpu_zonestat __percpu *per_cpu_zonestats;
/*
 * the high and batch values are copied to individual pagesets for
 * faster access
 */
int pageset_high;
int pageset_batch;
......
};
pageset的定义放在zone里,每个zone都有一个per_cpu_pageset成员,用于该zone内小order页面的快速分配。
3. pageset的初始化流程
调用流程
start_kernel(void)
---> setup_per_cpu_pageset();
从内核启动流程开始,通过调用setup_per_cpu_pageset()函数完成per_cpu_pageset的初始化动作。
/*
 * Allocate per cpu pagesets and initialize them.
 * Before this call only boot pagesets were available.
 *
 * Steps: set up the pageset of every populated zone, reset the NUMA event
 * counters of the boot zonestats (CONFIG_NUMA only), then allocate the
 * per-cpu node statistics of every online node.
 */
void __init setup_per_cpu_pageset(void)
{
struct pglist_data *pgdat;
struct zone *zone;
int __maybe_unused cpu;
for_each_populated_zone(zone) // walk every populated zone and set up its pageset
setup_zone_pageset(zone);
#ifdef CONFIG_NUMA
/*
 * Unpopulated zones continue using the boot pagesets.
 * The numa stats for these pagesets need to be reset.
 * Otherwise, they will end up skewing the stats of
 * the nodes these zones are associated with.
 */
for_each_possible_cpu(cpu) {
struct per_cpu_zonestat *pzstats = &per_cpu(boot_zonestats, cpu);
memset(pzstats->vm_numa_event, 0,
sizeof(pzstats->vm_numa_event));
}
#endif
/* Allocate the per-cpu node statistics for every online node. */
for_each_online_pgdat(pgdat)
pgdat->per_cpu_nodestats =
alloc_percpu(struct per_cpu_nodestat);
}
/*
 * Allocate and initialize the per-cpu pageset (and per-cpu zone statistics)
 * of @zone for every possible CPU, then set the zone's high and batch values.
 */
void __meminit setup_zone_pageset(struct zone *zone)
{
int cpu;
/* Size may be 0 on !SMP && !NUMA */
if (sizeof(struct per_cpu_zonestat) > 0)
zone->per_cpu_zonestats = alloc_percpu(struct per_cpu_zonestat);
zone->per_cpu_pageset = alloc_percpu(struct per_cpu_pages); // allocate per-cpu memory for this zone's per_cpu_pageset
for_each_possible_cpu(cpu) { // walk every possible CPU
struct per_cpu_pages *pcp;
struct per_cpu_zonestat *pzstats;
pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
per_cpu_pages_init(pcp, pzstats); // initialize this CPU's per_cpu_pages and zonestat
}
/*
 * NOTE(review): the helper is not shown here; presumably it replaces the
 * boot-time high/batch values set by per_cpu_pages_init() - confirm.
 */
zone_set_pageset_high_and_batch(zone, 0);
}
/*
 * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed
 * costly to service. That is between allocation orders which should
 * coalesce naturally under reasonable reclaim pressure and those which
 * will not.
 *
 * It is also the highest order served from the pageset lists (orders 0-3);
 * the only larger order accepted there is the THP order, see
 * pcp_allowed_order().
 */
#define PAGE_ALLOC_COSTLY_ORDER 3
/*
 * Page migrate types (excerpt; the "......" line stands for entries omitted
 * here). The entries listed before MIGRATE_PCPTYPES are the types that have
 * their own pcp lists, so MIGRATE_PCPTYPES doubles as their count.
 */
enum migratetype {
MIGRATE_UNMOVABLE,
MIGRATE_MOVABLE,
MIGRATE_RECLAIMABLE,
MIGRATE_PCPTYPES, /* the number of types on the pcp lists */
MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES,
......
MIGRATE_TYPES
};
/*
 * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER. One additional list
 * for THP which will usually be GFP_MOVABLE. Even if it is another type,
 * it should not contribute to serious fragmentation causing THP allocation
 * failures.
 */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define NR_PCP_THP 1
#else
#define NR_PCP_THP 0
#endif
/* One list per (pcp migratetype, order) pair for orders 0..PAGE_ALLOC_COSTLY_ORDER. */
#define NR_LOWORDER_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1))
/* Total number of pcp lists: the low-order lists plus the optional THP list. */
#define NR_PCP_LISTS (NR_LOWORDER_PCP_LISTS + NR_PCP_THP)
/*
 * Bring one CPU's pageset and its zone statistics into a clean initial state.
 *
 * @pcp:     per-cpu page lists to initialize
 * @pzstats: per-cpu zone statistics to clear
 *
 * Both structures are zeroed, the pcp lock is initialized and every page
 * list (one per migratetype/order combination, NR_PCP_LISTS in total) is set
 * up as an empty list head.
 */
static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats)
{
	int idx = NR_PCP_LISTS;

	memset(pzstats, 0, sizeof(*pzstats));
	memset(pcp, 0, sizeof(*pcp));

	spin_lock_init(&pcp->lock);

	/* The list heads are independent, so the walk direction is irrelevant. */
	while (idx-- > 0)
		INIT_LIST_HEAD(&pcp->lists[idx]);

	/*
	 * Start with the batch and high values that are safe for a boot
	 * pageset; initialization of a real per-cpu pageset overwrites them
	 * later. Nobody else can reach this pageset yet, so plain stores are
	 * enough here (no need for the care taken by pageset_update()).
	 */
	pcp->batch = BOOT_PAGESET_BATCH;
	pcp->high = BOOT_PAGESET_HIGH;
	pcp->free_factor = 0;
}
4. pageset的页面分配(用来分配order为[0-3]的页面;开启CONFIG_TRANSPARENT_HUGEPAGE时还包括order等于pageblock_order的页面)
调用流程
alloc_pages()
---> alloc_pages_node()
-------> __alloc_pages_node()
----------> __alloc_pages()
-------------> get_page_from_freelist()
-----------------> rmqueue()
/*
 * Allocate a page from the given zone.
 * Use pcplists for THP or "cheap" high-order allocations.
 *
 * The per-cpu lists are tried first when the order is accepted by
 * pcp_allowed_order(); otherwise, or when they yield no page, the page is
 * taken via rmqueue_buddy(). Returns the page, or NULL on failure.
 */
/*
 * Do not instrument rmqueue() with KMSAN. This function may call
 * __msan_poison_alloca() through a call to set_pfnblock_flags_mask().
 * If __msan_poison_alloca() attempts to allocate pages for the stack depot, it
 * may call rmqueue() again, which will result in a deadlock.
 */
__no_sanitize_memory
static inline
struct page *rmqueue(struct zone *preferred_zone,
struct zone *zone, unsigned int order,
gfp_t gfp_flags, unsigned int alloc_flags,
int migratetype)
{
struct page *page;
/*
 * We most definitely don't want callers attempting to
 * allocate greater than order-1 page units with __GFP_NOFAIL.
 */
WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
if (likely(pcp_allowed_order(order))) { // is this order allowed to be served from the pcp lists?
/*
 * MIGRATE_MOVABLE pcplist could have the pages on CMA area and
 * we need to skip it when CMA area isn't allowed.
 */
if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA ||
migratetype != MIGRATE_MOVABLE) { // the pcp list may only be used when the CMA restriction above does not apply
page = rmqueue_pcplist(preferred_zone, zone, order, // take the page from the pageset
migratetype, alloc_flags);
if (likely(page))
goto out;
}
}
/* Fallback: the pcp path was not usable or returned nothing. */
page = rmqueue_buddy(preferred_zone, zone, order, alloc_flags,
migratetype);
out:
/* Separate test+clear to avoid unnecessary atomics */
if (unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) {
clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
wakeup_kswapd(zone, 0, 0, zone_idx(zone));
}
VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
return page;
}
/*
 * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed
 * costly to service. That is between allocation orders which should
 * coalesce naturally under reasonable reclaim pressure and those which
 * will not.
 */
#define PAGE_ALLOC_COSTLY_ORDER 3

/*
 * Tell whether pages of the given order may go through the pageset lists.
 *
 * Orders 0..PAGE_ALLOC_COSTLY_ORDER (inclusive) are always allowed. With
 * CONFIG_TRANSPARENT_HUGEPAGE, an order equal to pageblock_order is allowed
 * as well. Every other order is rejected.
 */
static inline bool pcp_allowed_order(unsigned int order)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (order == pageblock_order)
		return true;
#endif
	return order <= PAGE_ALLOC_COSTLY_ORDER;
}
接下来我们看一下rmqueue_pcplist()是如何从pageset中分配页面的:
/*
 * Lock and remove page from the per-cpu list.
 *
 * Returns the page taken from the pcp list selected by @migratetype and
 * @order, or NULL when the pcp lock could not be taken or no page could be
 * obtained.
 */
static struct page *rmqueue_pcplist(struct zone *preferred_zone,
struct zone *zone, unsigned int order,
int migratetype, unsigned int alloc_flags)
{
struct per_cpu_pages *pcp;
struct list_head *list;
struct page *page;
unsigned long flags;
unsigned long __maybe_unused UP_flags;
/*
 * spin_trylock may fail due to a parallel drain. In the future, the
 * trylock will also protect against IRQ reentrancy.
 */
pcp_trylock_prepare(UP_flags);
pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags); // try to lock and fetch the per_cpu_pages of the current CPU
if (!pcp) {
pcp_trylock_finish(UP_flags);
return NULL;
}
/*
 * On allocation, reduce the number of pages that are batch freed.
 * See nr_pcp_free() where free_factor is increased for subsequent
 * frees.
 */
pcp->free_factor >>= 1;
list = &pcp->lists[order_to_pindex(migratetype, order)]; // pick the page list matching this migratetype and order
page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list); // detach a page from that list
pcp_spin_unlock_irqrestore(pcp, flags);
pcp_trylock_finish(UP_flags);
if (page) { // on success, update the allocation statistics
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
zone_statistics(preferred_zone, zone, 1);
}
return page; // page obtained from the pageset, or NULL
}
/*
 * Map a (migratetype, order) pair to the index of its pageset page list.
 *
 * The low-order lists are laid out order by order, MIGRATE_PCPTYPES entries
 * per order, which matches the NR_LOWORDER_PCP_LISTS sizing used when the
 * pageset lists are initialized. With CONFIG_TRANSPARENT_HUGEPAGE, an order
 * above PAGE_ALLOC_COSTLY_ORDER must be pageblock_order and maps to the single
 * extra list placed right after the low-order ones.
 */
static inline unsigned int order_to_pindex(int migratetype, int order)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (order > PAGE_ALLOC_COSTLY_ORDER) {
		VM_BUG_ON(order != pageblock_order);
		return NR_LOWORDER_PCP_LISTS;
	}
#else
	VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
#endif

	return migratetype + (order * MIGRATE_PCPTYPES);
}
接下来看看__rmqueue_pcplist()函数内部是如何实现的:
/*
 * Remove page from the per-cpu list, caller must protect the list.
 *
 * When @list is empty it is first refilled from the buddy allocator through
 * rmqueue_bulk(). Returns the detached page, or NULL when the refill left
 * the list empty. The loop repeats for as long as check_new_pcp() returns
 * non-zero for the detached page (NOTE(review): presumably that means the
 * page failed a sanity check - helper not shown, confirm).
 */
static inline
struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
int migratetype,
unsigned int alloc_flags,
struct per_cpu_pages *pcp,
struct list_head *list)
{
struct page *page;
do {
if (list_empty(list)) { // no page on this list: request pages from the buddy system
int batch = READ_ONCE(pcp->batch);
int alloced;
/*
 * Scale batch relative to order if batch implies
 * free pages can be stored on the PCP. Batch can
 * be 1 for small zones or for boot pagesets which
 * should never store free pages as the pages may
 * belong to arbitrary zones.
 */
if (batch > 1)
batch = max(batch >> order, 2);
alloced = rmqueue_bulk(zone, order, // bulk-request up to batch blocks of this order and migratetype from buddy
batch, list,
migratetype, alloc_flags);
pcp->count += alloced << order;
if (unlikely(list_empty(list))) // nothing could be obtained from the buddy system
return NULL;
}
page = list_first_entry(list, struct page, pcp_list); // take the first page on the list
list_del(&page->pcp_list); // unlink it from the list
pcp->count -= 1 << order; // account for the pages leaving the pcp
} while (check_new_pcp(page, order));
return page;
}
/*
 * Obtain a specified number of elements from the buddy allocator, all under
 * a single hold of the lock, for efficiency. Add them to the supplied list.
 * Returns the number of new pages which were placed at *list.
 */
static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
int migratetype, unsigned int alloc_flags)
{
int i, allocated = 0;
/* Caller must hold IRQ-safe pcp->lock so IRQs are disabled. */
spin_lock(&zone->lock);
for (i = 0; i < count; ++i) { // at most count iterations
struct page *page = __rmqueue(zone, order, migratetype, // request one block of this order and migratetype from the zone's buddy system
alloc_flags);
if (unlikely(page == NULL)) // the buddy system has nothing left to give: stop early
break;
/* A page rejected by check_pcp_refill() is not added to the list. */
if (unlikely(check_pcp_refill(page, order)))
continue;
/*
 * Split buddy pages returned by expand() are received here in
 * physical page order. The page is added to the tail of
 * caller's list. From the callers perspective, the linked list
 * is ordered by page number under some conditions. This is
 * useful for IO devices that can forward direction from the
 * head, thus also in the physical page order. This is useful
 * for IO devices that can merge IO requests if the physical
 * pages are ordered properly.
 */
list_add_tail(&page->pcp_list, list); // append the page to the caller's pageset list
allocated++; // one more block placed on the list
if (is_migrate_cma(get_pcppage_migratetype(page)))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
-(1 << order));
}
/*
 * i pages were removed from the buddy list even if some leak due
 * to check_pcp_refill failing so adjust NR_FREE_PAGES based
 * on i. Do not confuse with 'allocated' which is the number of
 * pages added to the pcp list.
 */
__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
spin_unlock(&zone->lock);
return allocated; // number of blocks added to the list
}
5. pageset的页面释放
调用流程
free_pages()
---> __free_pages()
------> free_the_page()
/*
 * Release a page block of the given order.
 *
 * Orders accepted by pcp_allowed_order() are handed to free_unref_page(),
 * i.e. they go through the pageset; every other order is released through
 * __free_pages_ok().
 */
static inline void free_the_page(struct page *page, unsigned int order)
{
	if (!pcp_allowed_order(order)) {
		__free_pages_ok(page, order, FPI_NONE);
		return;
	}

	/* Via pcp */
	free_unref_page(page, order);
}
/*
 * Free a pcp page
 *
 * The page is put on the current CPU's pageset when the pcp lock can be
 * taken; otherwise it is released through free_one_page(). Isolated pages
 * never go to the pageset.
 */
void free_unref_page(struct page *page, unsigned int order)
{
unsigned long flags;
unsigned long __maybe_unused UP_flags;
struct per_cpu_pages *pcp;
struct zone *zone;
unsigned long pfn = page_to_pfn(page);
int migratetype;
/* Nothing to do when free_unref_page_prepare() returns false. */
if (!free_unref_page_prepare(page, pfn, order))
return;
/*
 * We only track unmovable, reclaimable and movable on pcp lists.
 * Place ISOLATE pages on the isolated list because they are being
 * offlined but treat HIGHATOMIC as movable pages so we can get those
 * areas back if necessary. Otherwise, we may have to free
 * excessively into the page allocator
 */
migratetype = get_pcppage_migratetype(page);
if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
if (unlikely(is_migrate_isolate(migratetype))) {
free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE);
return;
}
migratetype = MIGRATE_MOVABLE;
}
zone = page_zone(page);
pcp_trylock_prepare(UP_flags);
pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags); // try to lock and fetch the pageset of the current CPU
if (pcp) {
free_unref_page_commit(zone, pcp, page, migratetype, order); // put the page on the pageset
pcp_spin_unlock_irqrestore(pcp, flags);
} else {
/* The pcp lock was not available: bypass the pageset. */
free_one_page(zone, page, pfn, order, migratetype, FPI_NONE);
}
pcp_trylock_finish(UP_flags);
}
/*
 * Put @page on the pcp list selected by @migratetype and @order, and hand
 * pages back through free_pcppages_bulk() once the pageset holds at least
 * the value returned by nr_pcp_high().
 */
static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
struct page *page, int migratetype,
unsigned int order)
{
int high;
int pindex;
bool free_high;
__count_vm_events(PGFREE, 1 << order);
pindex = order_to_pindex(migratetype, order); // index of the pageset list for this migratetype and order
list_add(&page->pcp_list, &pcp->lists[pindex]); // put the page at the head of that list for later allocations
pcp->count += 1 << order; // account for the pages entering the pageset
/*
 * As high-order pages other than THP's stored on PCP can contribute
 * to fragmentation, limit the number stored when PCP is heavily
 * freeing without allocation. The remainder after bulk freeing
 * stops will be drained from vmstat refresh context.
 */
free_high = (pcp->free_factor && order && order <= PAGE_ALLOC_COSTLY_ORDER);
high = nr_pcp_high(pcp, zone, free_high);
if (pcp->count >= high) { // the pageset has reached its high mark
int batch = READ_ONCE(pcp->batch); // batch feeds nr_pcp_free(), which decides how many pages to give back
free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch, free_high), pcp, pindex); // give the surplus pages back
}
}
/*
 * Frees a number of pages from the PCP lists
 * Assumes all pages on list are in same zone.
 * count is the number of pages to free.
 *
 * The lists are visited round-robin, starting with @pindex.
 */
static void free_pcppages_bulk(struct zone *zone, int count,
struct per_cpu_pages *pcp,
int pindex)
{
int min_pindex = 0;
int max_pindex = NR_PCP_LISTS - 1;
unsigned int order;
bool isolated_pageblocks;
struct page *page;
/*
 * Ensure proper count is passed which otherwise would stuck in the
 * below while (list_empty(list)) loop.
 */
count = min(pcp->count, count);
/*
 * Ensure requested pindex is drained first: step back by one so that the
 * pre-increment in the loop below lands on the requested list.
 */
pindex = pindex - 1;
/* Caller must hold IRQ-safe pcp->lock so IRQs are disabled. */
spin_lock(&zone->lock);
isolated_pageblocks = has_isolate_pageblock(zone);
while (count > 0) { // keep handing pages back until count is used up
struct list_head *list;
int nr_pages;
/* Remove pages from lists in a round-robin fashion. */
do {
if (++pindex > max_pindex)
pindex = min_pindex;
list = &pcp->lists[pindex]; // candidate list for this round
if (!list_empty(list)) // found a list that holds pages: stop searching
break;
/* Shrink the search window when a boundary list is empty. */
if (pindex == max_pindex)
max_pindex--;
if (pindex == min_pindex)
min_pindex++;
} while (1);
order = pindex_to_order(pindex);
nr_pages = 1 << order;
do {
int mt;
page = list_last_entry(list, struct page, pcp_list); // take the last page on the current list
mt = get_pcppage_migratetype(page); // migratetype recorded for the page
/* must delete to avoid corrupting pcp list */
list_del(&page->pcp_list); // unlink the page from the list
count -= nr_pages; // fewer pages left to free
pcp->count -= nr_pages; // account for the pages leaving the pageset
/* A page rejected by bulkfree_pcp_prepare() is not passed to __free_one_page(). */
if (bulkfree_pcp_prepare(page))
continue;
/* MIGRATE_ISOLATE page should not go to pcplists */
VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
/* Pageblock could have been isolated meanwhile */
if (unlikely(isolated_pageblocks))
mt = get_pageblock_migratetype(page);
__free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE); // release the page
trace_mm_page_pcpu_drain(page, order, mt);
} while (count > 0 && !list_empty(list));
}
spin_unlock(&zone->lock);
}
至此Linux pageset初始化和使用流程介绍完毕,感谢各位读者浏览!