The previous post covered zone_list initialization; with that done we finally reach the real memory manager, the buddy system.
The overall call flow is:
mm_init
--->mem_init
---|--->free_all_bootmem ---1
---|---|--->free_low_memory_core_early ---2
---|---|---|--->__free_memory_core(start, end) ---3 /* called once per free range, i.e. memblock.memory minus memblock.reserved */
---|---|---|---|--->__free_pages_memory(start_pfn, end_pfn) ---4 /* arguments are page frame numbers */
---|---|---|---|---|--->__free_pages_bootmem(pfn_to_page(start), start, order) ---5 /* first argument is the struct page for the pfn; the order was computed in __free_pages_memory */
---|---|---|---|---|---|--->__free_pages_boot_core(page, order) ---6
---|---|---|---|---|---|---|--->__free_pages(page, order) ---7
---|---|---|---|---|---|---|---|--->free_the_page(page, order) ---8
---|---|---|---|---|---|---|---|---|--->free_unref_page(page) ---9 /* order == 0 */
---|---|---|---|---|---|---|---|---|--->__free_pages_ok(page, order) ---10 /* order != 0 */
Steps 1~10 above are the rough path by which physical memory is handed from memblock back to the buddy system during boot.
Converting between page frames and pages:
The pfn <-> page conversion differs between memory models; here we only look at CONFIG_SPARSEMEM.
/*
* Note: section's mem_map is encoded to reflect its start_pfn.
* section[i].section_mem_map == mem_map's address - start_pfn;
*/
#define __page_to_pfn(pg) \
({ const struct page *__pg = (pg); \
int __sec = page_to_section(__pg); \
(unsigned long)(__pg - __section_mem_map_addr(__nr_to_section(__sec))); \
})
#define __pfn_to_page(pfn) \
({ unsigned long __pfn = (pfn); \
struct mem_section *__sec = __pfn_to_section(__pfn); \
__section_mem_map_addr(__sec) + __pfn; \
})
static inline struct mem_section *__pfn_to_section(unsigned long pfn)
{
return __nr_to_section(pfn_to_section_nr(pfn));
}
static inline unsigned long pfn_to_section_nr(unsigned long pfn)
{
return pfn >> PFN_SECTION_SHIFT; /* a section covers 2^PFN_SECTION_SHIFT pages, 2^14 on this configuration */
}
static inline struct mem_section *__nr_to_section(unsigned long nr)
{
#ifdef CONFIG_SPARSEMEM_EXTREME
if (!mem_section)
return NULL;
#endif
if (!mem_section[SECTION_NR_TO_ROOT(nr)])
return NULL;
return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK];
}
#ifdef CONFIG_SPARSEMEM_EXTREME
#define SECTIONS_PER_ROOT (PAGE_SIZE / sizeof (struct mem_section)) /*256*/
#else
#define SECTIONS_PER_ROOT 1
#endif
static inline struct page *__section_mem_map_addr(struct mem_section *section)
{
unsigned long map = section->section_mem_map;
map &= SECTION_MAP_MASK;
return (struct page *)map;
}
The bridge between the two conversions is the mem_section. In the SPARSEMEM model mem_section is, by default, a statically sized two-dimensional array;
the bigger that static array, the more memory it costs, and since a sparse layout contains holes, sizing it according to the physical memory actually present wastes less.
That is what CONFIG_SPARSEMEM_EXTREME does: with it enabled, mem_section becomes a pointer to dynamically allocated root arrays (a double pointer).
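For reference, the two flavours of the declaration look roughly like this (paraphrased from mm/sparse.c of a kernel around this version; take it as a sketch, not the exact source):
#ifdef CONFIG_SPARSEMEM_EXTREME
struct mem_section **mem_section; /* root array and the roots are allocated at runtime in sparse_init() */
#else
struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
	____cacheline_internodealigned_in_smp; /* fully static two-dimensional array */
#endif
With EXTREME, only the roots that actually contain present sections need backing memory, which is where the savings on sparse layouts come from. Back to the freeing path, free_low_memory_core_early() walks the memblock regions: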
static unsigned long __init free_low_memory_core_early(void)
{
unsigned long count = 0;
phys_addr_t start, end;
u64 i;
memblock_clear_hotplug(0, -1);
for_each_reserved_mem_region(i, &start, &end)
reserve_bootmem_region(start, end);
/*
* We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id
* because in some case like Node0 doesn't have RAM installed
* low ram will be on Node1
*/
for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end,
NULL)
count += __free_memory_core(start, end);
return count;
}
static unsigned long __init __free_memory_core(phys_addr_t start,
phys_addr_t end)
{
unsigned long start_pfn = PFN_UP(start);
unsigned long end_pfn = min_t(unsigned long,
PFN_DOWN(end), max_low_pfn);
if (start_pfn >= end_pfn)
return 0;
__free_pages_memory(start_pfn, end_pfn);
return end_pfn - start_pfn;
}
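As a side note on the PFN_UP/PFN_DOWN rounding above, a tiny user-space sketch (PAGE_SHIFT assumed to be 12, the addresses are made up) shows that partial pages at either end of a free range simply never reach the buddy system:
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE (1UL << PAGE_SHIFT)
#define PFN_UP(x) (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT) /* round the start up */
#define PFN_DOWN(x) ((x) >> PAGE_SHIFT) /* round the end down */

int main(void)
{
	unsigned long long start = 0x40000800ULL; /* hypothetical free range */
	unsigned long long end = 0x40100400ULL;

	printf("start_pfn=%llu end_pfn=%llu pages freed=%llu\n",
	       PFN_UP(start), PFN_DOWN(end), PFN_DOWN(end) - PFN_UP(start));
	return 0;
}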
How the order for each chunk is chosen:
static void __init __free_pages_memory(unsigned long start, unsigned long end)
{
int order;
while (start < end)
{
order = min(MAX_ORDER - 1UL, __ffs(start)); /* __ffs gives the lowest set bit of the pfn, i.e. the largest order the pfn is naturally aligned to */
while (start + (1UL << order) > end)
order--;
__free_pages_bootmem(pfn_to_page(start), start, order);
start += (1UL << order);
}
}
static __always_inline unsigned long __ffs(unsigned long word)
{
int num = 0;
#if BITS_PER_LONG == 64
if ((word & 0xffffffff) == 0) {
num += 32;
word >>= 32;
}
#endif
if ((word & 0xffff) == 0) {
num += 16;
word >>= 16;
}
if ((word & 0xff) == 0) {
num += 8;
word >>= 8;
}
if ((word & 0xf) == 0) {
num += 4;
word >>= 4;
}
if ((word & 0x3) == 0) {
num += 2;
word >>= 2;
}
if ((word & 0x1) == 0)
num += 1;
return num;
}
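To see how the loop carves a range into power-of-two blocks, here is a stand-alone user-space sketch; the pfn range 5..21 and MAX_ORDER = 11 are made-up example values, and __builtin_ctzl stands in for __ffs:
#include <stdio.h>

#define MAX_ORDER 11UL

int main(void)
{
	unsigned long start = 5, end = 21; /* hypothetical pfn range */
	unsigned long order;

	while (start < end) {
		/* largest order start is naturally aligned to, capped at MAX_ORDER - 1 */
		order = MAX_ORDER - 1;
		if (start && (unsigned long)__builtin_ctzl(start) < order)
			order = __builtin_ctzl(start);
		/* shrink until the block still fits before end */
		while (start + (1UL << order) > end)
			order--;
		printf("free pfn %lu, order %lu (%lu pages)\n", start, order, 1UL << order);
		start += 1UL << order;
	}
	return 0;
}
This prints blocks at pfn 5 (order 0), 6 (order 1), 8 (order 3), 16 (order 2) and 20 (order 0): both the alignment of start and the distance to end cap the order.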
void __init __free_pages_bootmem(struct page *page, unsigned long pfn,
unsigned int order)
{
if (early_page_uninitialised(pfn))
return;
return __free_pages_boot_core(page, order);
}
static void __init __free_pages_boot_core(struct page *page, unsigned int order)
{
unsigned int nr_pages = 1 << order;
struct page *p = page;
unsigned int loop;
prefetchw(p);
for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
prefetchw(p + 1);
__ClearPageReserved(p);
set_page_count(p, 0);
}
__ClearPageReserved(p);
set_page_count(p, 0);
page_zone(page)->managed_pages += nr_pages; /* managed_pages is the number of pages the buddy system manages in this zone */
set_page_refcounted(page); /* set page->_refcount to 1 */
__free_pages(page, order);
}
static inline void free_the_page(struct page *page, unsigned int order)
{
if (order == 0) /* Via pcp? */ /* during this phase order-0 pages are parked on the per-CPU pcp lists */
free_unref_page(page);
else
__free_pages_ok(page, order);
}
free_the_page() is both the entry point through which the buddy system takes over a page and the routine that actually performs a free.
Order-0 pages are routed to the pcp:
/*
* Free a 0-order page
*/
void free_unref_page(struct page *page)
{
unsigned long flags;
unsigned long pfn = page_to_pfn(page); /* __free_pages_memory got the page via pfn_to_page(start); here we go back from the page to its pfn */
if (!free_unref_page_prepare(page, pfn))
return;
local_irq_save(flags);
free_unref_page_commit(page, pfn);
local_irq_restore(flags);
}
static bool free_unref_page_prepare(struct page *page, unsigned long pfn)
{
int migratetype;
if (!free_pcp_prepare(page))
return false;
migratetype = get_pfnblock_migratetype(page, pfn);
set_pcppage_migratetype(page, migratetype);
return true;
}
static bool free_pcp_prepare(struct page *page)
{
return free_pages_prepare(page, 0, false);
}
static __always_inline bool free_pages_prepare(struct page *page,
unsigned int order, bool check_free)
{
int bad = 0;
VM_BUG_ON_PAGE(PageTail(page), page);
trace_mm_page_free(page, order);
/*
* Check tail pages before head page information is cleared to
* avoid checking PageCompound for order-0 pages.
*/
if (unlikely(order)) {
bool compound = PageCompound(page);
int i;
VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
if (compound)
ClearPageDoubleMap(page);
for (i = 1; i < (1 << order); i++) {
if (compound)
bad += free_tail_pages_check(page, page + i);
if (unlikely(free_pages_check(page + i))) {
bad++;
continue;
}
(page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
}
}
if (PageMappingFlags(page))
page->mapping = NULL;
if (memcg_kmem_enabled() && PageKmemcg(page))
memcg_kmem_uncharge(page, order);
if (check_free)
bad += free_pages_check(page);
if (bad)
return false;
page_cpupid_reset_last(page); /* no-op in this configuration */
page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
reset_page_owner(page, order); /* no-op in this configuration */
if (!PageHighMem(page)) {
debug_check_no_locks_freed(page_address(page),
PAGE_SIZE << order);
debug_check_no_obj_freed(page_address(page),
PAGE_SIZE << order);
}
arch_free_page(page, order); /* no-op in this configuration */
kernel_poison_pages(page, 1 << order, 0); /* no-op in this configuration */
kernel_map_pages(page, 1 << order, 0); /* no-op in this configuration */
kasan_free_nondeferred_pages(page, order); /* no-op in this configuration */
return true;
}
/*
* Flags checked when a page is prepped for return by the page allocator.
* Pages being prepped should not have these flags set. If they are set,
* there has been a kernel bug or struct page corruption.
*
* __PG_HWPOISON is exceptional because it needs to be kept beyond page's
* alloc-free cycle to prevent from reusing the page.
*/
#define PAGE_FLAGS_CHECK_AT_PREP \
(((1UL << NR_PAGEFLAGS) - 1) & ~__PG_HWPOISON)
#define __PG_HWPOISON 0
static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
{
return __get_pfnblock_flags_mask(page, pfn, PB_migrate_end, MIGRATETYPE_MASK);
}
static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page,
unsigned long pfn,
unsigned long end_bitidx,
unsigned long mask)
{
unsigned long *bitmap;
unsigned long bitidx, word_bitidx;
unsigned long word;
bitmap = get_pageblock_bitmap(page, pfn); /* the section's pageblock_flags bitmap, from which the migrate type is read */
bitidx = pfn_to_bitidx(page, pfn);
word_bitidx = bitidx / BITS_PER_LONG;
bitidx &= (BITS_PER_LONG-1);
word = bitmap[word_bitidx];
bitidx += end_bitidx;
return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
}
static inline unsigned long *get_pageblock_bitmap(struct page *page,
unsigned long pfn)
{
return __pfn_to_section(pfn)->pageblock_flags;
}
static inline struct mem_section *__pfn_to_section(unsigned long pfn)
{
return __nr_to_section(pfn_to_section_nr(pfn));
}
static inline unsigned long pfn_to_section_nr(unsigned long pfn)
{
return pfn >> PFN_SECTION_SHIFT;
}
static inline struct mem_section *__nr_to_section(unsigned long nr)
{
#ifdef CONFIG_SPARSEMEM_EXTREME
if (!mem_section)
return NULL;
#endif
if (!mem_section[SECTION_NR_TO_ROOT(nr)])
return NULL;
return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK];
}
#define SECTION_NR_TO_ROOT(sec) ((sec) / SECTIONS_PER_ROOT)
#define SECTIONS_PER_ROOT (PAGE_SIZE / sizeof (struct mem_section))
static inline int pfn_to_bitidx(struct page *page, unsigned long pfn)
{
pfn &= (PAGES_PER_SECTION-1);
return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
}
#define pageblock_order (MAX_ORDER-1)
/* Bit indices that affect a whole block of pages */
enum pageblock_bits {
PB_migrate,
PB_migrate_end = PB_migrate + 3 - 1,
/* 3 bits required for migrate types */
PB_migrate_skip,/* If set the block is skipped by compaction */
/*
* Assume the bits will always align on a word. If this assumption
* changes then get/set pageblock needs updating.
*/
NR_PAGEBLOCK_BITS
};
#define NR_MIGRATETYPE_BITS (PB_migrate_end - PB_migrate + 1)
#define MIGRATETYPE_MASK ((1UL << NR_MIGRATETYPE_BITS) - 1)
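To make the bit layout concrete, here is a small user-space sketch of how a pageblock's 3-bit migrate type is packed into, and read back out of, a pageblock_flags-style bitmap. The shift convention mirrors __get_pfnblock_flags_mask() above; the pageblock numbers and type values are made up:
#include <stdio.h>

#define BITS_PER_LONG 64
#define NR_PAGEBLOCK_BITS 4
#define PB_migrate_end 2 /* 3 bits for the migrate type */
#define MIGRATETYPE_MASK 0x7UL

static unsigned long bitmap[4]; /* stand-in for section->pageblock_flags */

static void set_mt(unsigned long block, unsigned long mt)
{
	unsigned long bitidx = block * NR_PAGEBLOCK_BITS;
	unsigned long word_bitidx = bitidx / BITS_PER_LONG;
	unsigned long shift;

	bitidx &= BITS_PER_LONG - 1;
	shift = BITS_PER_LONG - (bitidx + PB_migrate_end) - 1;
	bitmap[word_bitidx] &= ~(MIGRATETYPE_MASK << shift);
	bitmap[word_bitidx] |= (mt & MIGRATETYPE_MASK) << shift;
}

static unsigned long get_mt(unsigned long block)
{
	unsigned long bitidx = block * NR_PAGEBLOCK_BITS;
	unsigned long word_bitidx = bitidx / BITS_PER_LONG;

	bitidx &= BITS_PER_LONG - 1;
	bitidx += PB_migrate_end;
	return (bitmap[word_bitidx] >> (BITS_PER_LONG - bitidx - 1)) & MIGRATETYPE_MASK;
}

int main(void)
{
	set_mt(0, 1);  /* say, MIGRATE_MOVABLE for pageblock 0 */
	set_mt(17, 2); /* say, MIGRATE_RECLAIMABLE for pageblock 17 */
	printf("block 0 -> %lu, block 17 -> %lu\n", get_mt(0), get_mt(17));
	return 0;
}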
static inline void set_pcppage_migratetype(struct page *page, int migratetype)
{
page->index = migratetype;
}
static void free_unref_page_commit(struct page *page, unsigned long pfn)
{
struct zone *zone = page_zone(page); /* the zone is derived from the page's flags */
struct per_cpu_pages *pcp; /* the pcp finally enters the picture */
int migratetype;
migratetype = get_pcppage_migratetype(page); /* stored in page->index */
__count_vm_event(PGFREE);
/*
* We only track unmovable, reclaimable and movable on pcp lists.
* Free ISOLATE pages back to the allocator because they are being
* offlined but treat HIGHATOMIC as movable pages so we can get those
* areas back if necessary. Otherwise, we may have to free
* excessively into the page allocator
*/
if (migratetype >= MIGRATE_PCPTYPES) /* types >= MIGRATE_PCPTYPES (3) are not kept on the pcp; except for ISOLATE they are remapped to MOVABLE */
{
if (unlikely(is_migrate_isolate(migratetype)))
{
free_one_page(zone, page, pfn, 0, migratetype);
return;
}
migratetype = MIGRATE_MOVABLE;
}
pcp = &this_cpu_ptr(zone->pageset)->pcp;
list_add(&page->lru, &pcp->lists[migratetype]); /* add the page at the head of the matching pcp list */
pcp->count++; /* one more page held by the pcp */
if (pcp->count >= pcp->high) { /* once the pcp holds high pages, flush batch of them back to the buddy system */
unsigned long batch = READ_ONCE(pcp->batch);
free_pcppages_bulk(zone, batch, pcp);
}
}
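Roughly, the high/batch interplay looks like the following user-space sketch (the high and batch values are made up; the real ones are computed per zone from its size):
#include <stdio.h>

int main(void)
{
	const int high = 186, batch = 31; /* assumed values */
	int count = 0;   /* pcp->count */
	int flushed = 0; /* pages handed back to the buddy system */
	int i;

	for (i = 0; i < 1000; i++) { /* free 1000 order-0 pages */
		count++;             /* list_add() + pcp->count++ */
		if (count >= high) {
			count -= batch; /* free_pcppages_bulk(zone, batch, pcp) */
			flushed += batch;
		}
	}
	printf("left on pcp: %d, flushed to buddy: %d\n", count, flushed);
	return 0;
}
Most frees therefore stay on the cheap per-CPU list, and zone->lock is only taken in batches, which is exactly what free_pcppages_bulk() below does.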
/*Frees a number of pages from the PCP lists*/
static void free_pcppages_bulk(struct zone *zone, int count,
struct per_cpu_pages *pcp)
{
int migratetype = 0;
int batch_free = 0;
int prefetch_nr = 0;
bool isolated_pageblocks;
struct page *page, *tmp;
LIST_HEAD(head);
/*
* Ensure proper count is passed which otherwise would stuck in the
* below while (list_empty(list)) loop.
*/
count = min(pcp->count, count);
while (count) {
struct list_head *list;
/*
* Remove pages from lists in a round-robin fashion. A
* batch_free count is maintained that is incremented when an
* empty list is encountered. This is so more pages are freed
* off fuller lists instead of spinning excessively around empty
* lists
*/
do {
batch_free++;
if (++migratetype == MIGRATE_PCPTYPES)
migratetype = 0;
list = &pcp->lists[migratetype];
} while (list_empty(list));
/* This is the only non-empty list. Free them all. */
if (batch_free == MIGRATE_PCPTYPES)
batch_free = count;
do {
page = list_last_entry(list, struct page, lru);
/* must delete to avoid corrupting pcp list */
list_del(&page->lru); /* unlink the page from the pcp list */
pcp->count--; /* one page fewer on the pcp */
if (bulkfree_pcp_prepare(page))
continue;
list_add_tail(&page->lru, &head); /* collect the page at the tail of the local head list */
/*
* We are going to put the page back to the global
* pool, prefetch its buddy to speed up later access
* under zone->lock. It is believed the overhead of
* an additional test and calculating buddy_pfn here
* can be offset by reduced memory latency later. To
* avoid excessive prefetching due to large count, only
* prefetch buddy for the first pcp->batch nr of pages.
*/
if (prefetch_nr++ < pcp->batch)
prefetch_buddy(page);
} while (--count && --batch_free && !list_empty(list));
}
spin_lock(&zone->lock);
isolated_pageblocks = has_isolate_pageblock(zone);
/*
* Use safe version since after __free_one_page(),
* page->lru.next will not point to original list.
*/
list_for_each_entry_safe(page, tmp, &head, lru) {
int mt = get_pcppage_migratetype(page);
/* MIGRATE_ISOLATE page should not go to pcplists */
VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
/* Pageblock could have been isolated meanwhile */
if (unlikely(isolated_pageblocks))
mt = get_pageblock_migratetype(page);
__free_one_page(page, page_to_pfn(page), zone, 0, mt);
trace_mm_page_pcpu_drain(page, 0, mt);
}
spin_unlock(&zone->lock);
}
static inline void prefetch_buddy(struct page *page)
{
unsigned long pfn = page_to_pfn(page);
unsigned long buddy_pfn = __find_buddy_pfn(pfn, 0);
struct page *buddy = page + (buddy_pfn - pfn);
prefetch(buddy);
}
static inline unsigned long
__find_buddy_pfn(unsigned long page_pfn, unsigned int order)
{
return page_pfn ^ (1 << order);
}
static inline void prefetch(const void *ptr)
{
asm volatile("prfm pldl1keep, %a0\n" : : "p" (ptr));
}
static inline bool has_isolate_pageblock(struct zone *zone)
{
return zone->nr_isolate_pageblock;
}
static bool bulkfree_pcp_prepare(struct page *page)
{
return free_pages_check(page);
}
static inline int free_pages_check(struct page *page)
{
if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
return 0;
/* Something has gone sideways, find it */
free_pages_check_bad(page);
return 1;
}
/*page, page_to_pfn(page), zone, 0, MOVABLE*/
static inline void __free_one_page(struct page *page,
unsigned long pfn,
struct zone *zone, unsigned int order,
int migratetype)
{
unsigned long combined_pfn;
unsigned long uninitialized_var(buddy_pfn);
struct page *buddy;
unsigned int max_order;
max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1); /* pageblock_order = 10, so max_order = 11 */
VM_BUG_ON(!zone_is_initialized(zone));
VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
VM_BUG_ON(migratetype == -1);
if (likely(!is_migrate_isolate(migratetype)))
__mod_zone_freepage_state(zone, 1 << order, migratetype);
VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
VM_BUG_ON_PAGE(bad_range(zone, page), page);
continue_merging:
/* 0 < 10 on the first pass */
while (order < max_order - 1) {
buddy_pfn = __find_buddy_pfn(pfn, order); /* buddy pfn from the fixed formula pfn ^ (1 << order) */
buddy = page + (buddy_pfn - pfn); /* struct page of that buddy */
if (!pfn_valid_within(buddy_pfn)) /* buddy pfn is not valid: stop merging here */
goto done_merging;
if (!page_is_buddy(page, buddy, order)) /* buddy is not a free block of this order: stop merging here */
goto done_merging;
/*
* Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
* merge with it and move up one order.
*/
if (page_is_guard(buddy)) {
clear_page_guard(zone, buddy, order, migratetype);
} else {
list_del(&buddy->lru);
zone->free_area[order].nr_free--;
rmv_page_order(buddy);
}
combined_pfn = buddy_pfn & pfn;
page = page + (combined_pfn - pfn);
pfn = combined_pfn;
order++;
}
if (max_order < MAX_ORDER) {
/* If we are here, it means order is >= pageblock_order.
* We want to prevent merge between freepages on isolate
* pageblock and normal pageblock. Without this, pageblock
* isolation could cause incorrect freepage or CMA accounting.
*
* We don't want to hit this code for the more frequent
* low-order merging.
*/
if (unlikely(has_isolate_pageblock(zone))) {
int buddy_mt;
buddy_pfn = __find_buddy_pfn(pfn, order);
buddy = page + (buddy_pfn - pfn);
buddy_mt = get_pageblock_migratetype(buddy);
if (migratetype != buddy_mt
&& (is_migrate_isolate(migratetype) ||
is_migrate_isolate(buddy_mt)))
goto done_merging;
}
max_order++;
goto continue_merging;
}
done_merging:
set_page_order(page, order);
/*
* If this is not the largest possible page, check if the buddy
* of the next-highest order is free. If it is, it's possible
* that pages are being freed that will coalesce soon. In case,
* that is happening, add the free page to the tail of the list
* so it's less likely to be used soon and more likely to be merged
* as a higher order page
*/
if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) {
struct page *higher_page, *higher_buddy;
combined_pfn = buddy_pfn & pfn;
higher_page = page + (combined_pfn - pfn);
buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
higher_buddy = higher_page + (buddy_pfn - combined_pfn);
if (pfn_valid_within(buddy_pfn) &&
page_is_buddy(higher_page, higher_buddy, order + 1)) {
list_add_tail(&page->lru,
&zone->free_area[order].free_list[migratetype]);
goto out;
}
}
list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
out:
zone->free_area[order].nr_free++;
}
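The merging loop leans on two small pieces of pfn arithmetic: the buddy of a block at a given order is pfn ^ (1 << order), and once the two merge, the combined block starts at buddy_pfn & pfn. A tiny user-space sketch (pfn 8 is an arbitrary example value):
#include <stdio.h>

static unsigned long find_buddy_pfn(unsigned long pfn, unsigned int order)
{
	return pfn ^ (1UL << order); /* same formula as __find_buddy_pfn() */
}

int main(void)
{
	unsigned long pfn = 8;
	unsigned int order;

	for (order = 0; order < 4; order++) {
		unsigned long buddy = find_buddy_pfn(pfn, order);

		printf("order %u: pfn %lu, buddy %lu, merged block starts at %lu\n",
		       order, pfn, buddy, buddy & pfn);
		pfn = buddy & pfn; /* pretend the merge succeeded and go up one order */
	}
	return 0;
}
At order 0 the buddy of pfn 8 is 9, at order 1 it is 10, at order 2 it is 12, and at order 3 the merged 16-page block would start at pfn 0.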
/*
* This function checks whether a page is free && is the buddy
* we can coalesce a page and its buddy if
* (a) the buddy is not in a hole (check before calling!) &&
* (b) the buddy is in the buddy system &&
* (c) a page and its buddy have the same order &&
* (d) a page and its buddy are in the same zone.
*
* For recording whether a page is in the buddy system, we set PageBuddy.
* Setting, clearing, and testing PageBuddy is serialized by zone->lock.
*
* For recording page's order, we use page_private(page).
*/
static inline int page_is_buddy(struct page *page, struct page *buddy,
unsigned int order)
{
/* page_is_guard() always returns false in this configuration (see its stub further down) */
if (page_is_guard(buddy) && page_order(buddy) == order) {
if (page_zone_id(page) != page_zone_id(buddy))
return 0;
VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
return 1;
}
if (PageBuddy(buddy) && page_order(buddy) == order) {
/*
* zone check is done late to avoid uselessly
* calculating zone/node ids for pages that could
* never merge.
*/
if (page_zone_id(page) != page_zone_id(buddy))
return 0;
VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
return 1;
}
return 0;
}
/*
* PageBuddy() indicates that the page is free and in the buddy system
* (see mm/page_alloc.c).
*/
PAGE_TYPE_OPS(Buddy, buddy)
/*
* For pages that are never mapped to userspace (and aren't PageSlab),
* page_type may be used. Because it is initialised to -1, we invert the
* sense of the bit, so __SetPageFoo *clears* the bit used for PageFoo, and
* __ClearPageFoo *sets* the bit used for PageFoo. We reserve a few high and
* low bits so that an underflow or overflow of page_mapcount() won't be
* mistaken for a page type value.
*/
#define PAGE_TYPE_BASE 0xf0000000
/* Reserve 0x0000007f to catch underflows of page_mapcount */
#define PG_buddy 0x00000080
#define PG_balloon 0x00000100
#define PG_kmemcg 0x00000200
#define PG_table 0x00000400
#define PageType(page, flag) \
((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE)
#define PAGE_TYPE_OPS(uname, lname) \
static __always_inline int Page##uname(struct page *page) \
{ \
return PageType(page, PG_##lname); \
} \
static __always_inline void __SetPage##uname(struct page *page) \
{ \
VM_BUG_ON_PAGE(!PageType(page, 0), page); \
page->page_type &= ~PG_##lname; \
} \
static __always_inline void __ClearPage##uname(struct page *page) \
{ \
VM_BUG_ON_PAGE(!Page##uname(page), page); \
page->page_type |= PG_##lname; \
}
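The inverted-bit convention described in the comment above is easy to trip over, so here is a tiny user-space demonstration of it (plain C, not kernel code):
#include <stdio.h>

#define PAGE_TYPE_BASE 0xf0000000u
#define PG_buddy 0x00000080u

/* same test as the PageType() macro above */
static int page_type_is_buddy(unsigned int page_type)
{
	return (page_type & (PAGE_TYPE_BASE | PG_buddy)) == PAGE_TYPE_BASE;
}

int main(void)
{
	unsigned int page_type = -1; /* page->page_type starts out as -1 */

	printf("fresh page: buddy? %d\n", page_type_is_buddy(page_type));
	page_type &= ~PG_buddy; /* __SetPageBuddy() clears the bit */
	printf("after __SetPageBuddy: buddy? %d\n", page_type_is_buddy(page_type));
	page_type |= PG_buddy;  /* __ClearPageBuddy() sets it again */
	printf("after __ClearPageBuddy: buddy? %d\n", page_type_is_buddy(page_type));
	return 0;
}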
static inline bool page_is_guard(struct page *page) { return false; }
static inline unsigned int page_order(struct page *page)
{
/* PageBuddy() must be checked by the caller */
return page_private(page);
}
#define page_private(page) ((page)->private)
#define pfn_valid_within(pfn) pfn_valid(pfn)
int pfn_valid(unsigned long pfn)
{
phys_addr_t addr = pfn << PAGE_SHIFT;
if ((addr >> PAGE_SHIFT) != pfn)
return 0;
#ifdef CONFIG_SPARSEMEM
if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
return 0;
if (!valid_section(__nr_to_section(pfn_to_section_nr(pfn))))
return 0;
#endif
return memblock_is_map_memory(addr);
}
/*zone, 1<<0, MOVABLE*/
static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages,
int migratetype)
{
__mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages);
if (is_migrate_cma(migratetype))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages);
}
/*
* For use when we know that interrupts are disabled,
* or when we know that preemption is disabled and that
* particular counter cannot be updated from interrupt context.
*/
void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
long delta)
{
struct per_cpu_pageset __percpu *pcp = zone->pageset;
s8 __percpu *p = pcp->vm_stat_diff + item;
long x;
long t;
x = delta + __this_cpu_read(*p);
t = __this_cpu_read(pcp->stat_threshold);
if (unlikely(x > t || x < -t)) {
zone_page_state_add(x, zone, item);
x = 0;
}
__this_cpu_write(*p, x);
}
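So __mod_zone_page_state() batches small updates in a per-CPU s8 diff and only folds them into the zone-wide counter once they exceed stat_threshold. A user-space sketch of that batching (the threshold value 32 is made up; the real one is derived from the zone size and CPU count):
#include <stdio.h>

static long global_nr_free;             /* stands in for the zone-wide NR_FREE_PAGES counter */
static signed char vm_stat_diff;        /* per-CPU diff, an s8 in the kernel */
static const long stat_threshold = 32;  /* assumed value */

static void mod_zone_page_state(long delta)
{
	long x = delta + vm_stat_diff;

	if (x > stat_threshold || x < -stat_threshold) {
		global_nr_free += x; /* zone_page_state_add() */
		x = 0;
	}
	vm_stat_diff = x;
}

int main(void)
{
	int i;

	for (i = 0; i < 40; i++)
		mod_zone_page_state(1); /* e.g. freeing 40 single pages */
	printf("global=%ld per-cpu diff=%d\n", global_nr_free, vm_stat_diff);
	return 0;
}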
struct per_cpu_pages {
int count; /* number of pages in the list */
int high; /* high watermark, emptying needed */
int batch; /* chunk size for buddy add/remove */
/* Lists of pages, one per migrate type stored on the pcp-lists */
struct list_head lists[MIGRATE_PCPTYPES];
};
struct per_cpu_pageset {
struct per_cpu_pages pcp;
#ifdef CONFIG_NUMA
s8 expire;
u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS];
#endif
#ifdef CONFIG_SMP
s8 stat_threshold;
s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
#endif
};
struct per_cpu_nodestat {
s8 stat_threshold;
s8 vm_node_stat_diff[NR_VM_NODE_STAT_ITEMS];
};
#define this_cpu_ptr(ptr) raw_cpu_ptr(ptr)
#define raw_cpu_ptr(ptr) \
({ \
__verify_pcpu_ptr(ptr); \
arch_raw_cpu_ptr(ptr); \
})
#define arch_raw_cpu_ptr(ptr) SHIFT_PERCPU_PTR(ptr, __my_cpu_offset)
#define SHIFT_PERCPU_PTR(__p, __offset) \
RELOC_HIDE((typeof(*(__p)) __kernel __force *)(__p), (__offset))
static inline unsigned long __my_cpu_offset(void)
{
unsigned long off;
/*
* We want to allow caching the value, so avoid using volatile and
* instead use a fake stack read to hazard against barrier().
*/
asm(ALTERNATIVE("mrs %0, tpidr_el1",
"mrs %0, tpidr_el2",
ARM64_HAS_VIRT_HOST_EXTN)
: "=r" (off) :
"Q" (*(const unsigned long *)current_stack_pointer));
return off;
}
/*
* __verify_pcpu_ptr() verifies @ptr is a percpu pointer without evaluating
* @ptr and is invoked once before a percpu area is accessed by all
* accessors and operations. This is performed in the generic part of
* percpu and arch overrides don't need to worry about it; however, if an
* arch wants to implement an arch-specific percpu accessor or operation,
* it may use __verify_pcpu_ptr() to verify the parameters.
*
* + 0 is required in order to convert the pointer type from a
* potential array type to a pointer to a single item of the array.
*/
#define __verify_pcpu_ptr(ptr) \
do { \
const void __percpu *__vpp_verify = (typeof((ptr) + 0))NULL; \
(void)__vpp_verify; \
} while (0)
enum migratetype {
MIGRATE_UNMOVABLE,
MIGRATE_MOVABLE,
MIGRATE_RECLAIMABLE,
MIGRATE_PCPTYPES, /* the number of types on the pcp lists */
MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES,
#ifdef CONFIG_CMA
/*
* MIGRATE_CMA migration type is designed to mimic the way
* ZONE_MOVABLE works. Only movable pages can be allocated
* from MIGRATE_CMA pageblocks and page allocator never
* implicitly change migration type of MIGRATE_CMA pageblock.
*
* The way to use it is to change migratetype of a range of
* pageblocks to MIGRATE_CMA which can be done by
* __free_pageblock_cma() function. What is important though
* is that a range of pageblocks must be aligned to
* MAX_ORDER_NR_PAGES should biggest page be bigger then
* a single pageblock.
*/
MIGRATE_CMA,
#endif
#ifdef CONFIG_MEMORY_ISOLATION
MIGRATE_ISOLATE, /* can't allocate from here */
#endif
MIGRATE_TYPES
};
static inline struct zone *page_zone(const struct page *page)
{
return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
}
static inline enum zone_type page_zonenum(const struct page *page)
{
return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
}
#define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0))
/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | ... | FLAGS | */
#define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH)
#define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH)
#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH)
#define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH)
/*
* Define the bit shifts to access each section. For non-existent
* sections we define the shift as 0; that plus a 0 mask ensures
* the compiler will optimise away reference to them.
*/
#define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0))
#define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0))
#define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0))
#define LAST_CPUPID_PGSHIFT (LAST_CPUPID_PGOFF * (LAST_CPUPID_WIDTH != 0))
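With this layout, page_zonenum() is just a shift and a mask on page->flags. A user-space sketch with made-up widths (SECTIONS_WIDTH = 0, NODES_WIDTH = 2, ZONES_WIDTH = 2 here; the real values depend on the kernel configuration):
#include <stdio.h>

#define BITS_PER_LONG 64
#define SECTIONS_WIDTH 0
#define NODES_WIDTH 2
#define ZONES_WIDTH 2
#define ZONES_MASK ((1UL << ZONES_WIDTH) - 1)

#define SECTIONS_PGOFF (BITS_PER_LONG - SECTIONS_WIDTH)
#define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH)
#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH)
#define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0))

int main(void)
{
	unsigned long flags = 0;
	unsigned long zone = 2; /* pretend this is the index of the page's zone */

	flags |= zone << ZONES_PGSHIFT; /* roughly what set_page_zone() does at init */
	printf("ZONES_PGSHIFT=%d, zone read back=%lu\n",
	       ZONES_PGSHIFT, (flags >> ZONES_PGSHIFT) & ZONES_MASK);
	return 0;
}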
Regular handling of pages whose order is not 0; assume the current order is 10:
static void __free_pages_ok(struct page *page, unsigned int order)
{
unsigned long flags;
int migratetype;
unsigned long pfn = page_to_pfn(page);
if (!free_pages_prepare(page, order, true))
return;
migratetype = get_pfnblock_migratetype(page, pfn);
local_irq_save(flags);
__count_vm_events(PGFREE, 1 << order);
free_one_page(page_zone(page), page, pfn, order, migratetype);
local_irq_restore(flags);
}
NOTE: whatever the order, the work ultimately lands in __free_one_page(): __free_pages_ok() reaches it through free_one_page(), while order-0 pages sit on the pcp until free_pcppages_bulk() drains them into it.
static void free_one_page(struct zone *zone,
struct page *page, unsigned long pfn,
unsigned int order,
int migratetype)
{
spin_lock(&zone->lock);
if (unlikely(has_isolate_pageblock(zone) ||
is_migrate_isolate(migratetype))) {
migratetype = get_pfnblock_migratetype(page, pfn);
}
__free_one_page(page, pfn, zone, order, migratetype);
spin_unlock(&zone->lock);
}
/*eg: pfn=0, order=10, zone=ZONE_DMA32,ZONE_MOVABLE*/
static inline void __free_one_page(struct page *page,
unsigned long pfn,
struct zone *zone, unsigned int order,
int migratetype)
{
unsigned long combined_pfn;
unsigned long uninitialized_var(buddy_pfn);
struct page *buddy;
unsigned int max_order;
max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1); /* MAX_ORDER = 11 and pageblock_order + 1 = 11, so max_order = 11 */
VM_BUG_ON(!zone_is_initialized(zone));
VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
VM_BUG_ON(migratetype == -1);
if (likely(!is_migrate_isolate(migratetype)))
__mod_zone_freepage_state(zone, 1 << order, migratetype);
VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
VM_BUG_ON_PAGE(bad_range(zone, page), page);
continue_merging:
while (order < max_order - 1) {
buddy_pfn = __find_buddy_pfn(pfn, order); /* buddy pfn from the fixed formula: pfn ^ (1 << order) */
buddy = page + (buddy_pfn - pfn); /* struct page of that buddy */
if (!pfn_valid_within(buddy_pfn)) /* buddy_pfn does not fall in a valid section, so it cannot be merged */
goto done_merging;
if (!page_is_buddy(page, buddy, order)) /* buddy is not a free block of the same order, stop merging */
goto done_merging;
/*
* Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
* merge with it and move up one order.
*/
if (page_is_guard(buddy)) {
clear_page_guard(zone, buddy, order, migratetype);
} else {
list_del(&buddy->lru);
zone->free_area[order].nr_free--;
rmv_page_order(buddy);
}
combined_pfn = buddy_pfn & pfn;
page = page + (combined_pfn - pfn);
pfn = combined_pfn;
order++;
}
if (max_order < MAX_ORDER) {
/* If we are here, it means order is >= pageblock_order.
* We want to prevent merge between freepages on isolate
* pageblock and normal pageblock. Without this, pageblock
* isolation could cause incorrect freepage or CMA accounting.
*
* We don't want to hit this code for the more frequent
* low-order merging.
*/
if (unlikely(has_isolate_pageblock(zone))) {
int buddy_mt;
buddy_pfn = __find_buddy_pfn(pfn, order);
buddy = page + (buddy_pfn - pfn);
buddy_mt = get_pageblock_migratetype(buddy);
if (migratetype != buddy_mt
&& (is_migrate_isolate(migratetype) ||
is_migrate_isolate(buddy_mt)))
goto done_merging;
}
max_order++;
goto continue_merging;
}
done_merging:
set_page_order(page, order);
/*
* If this is not the largest possible page, check if the buddy
* of the next-highest order is free. If it is, it's possible
* that pages are being freed that will coalesce soon. In case,
* that is happening, add the free page to the tail of the list
* so it's less likely to be used soon and more likely to be merged
* as a higher order page
*/
if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) {
struct page *higher_page, *higher_buddy;
combined_pfn = buddy_pfn & pfn;
higher_page = page + (combined_pfn - pfn);
buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
higher_buddy = higher_page + (buddy_pfn - combined_pfn);
if (pfn_valid_within(buddy_pfn) &&
page_is_buddy(higher_page, higher_buddy, order + 1)) {
list_add_tail(&page->lru,
&zone->free_area[order].free_list[migratetype]);
goto out;
}
}
/* add the 1024 contiguous pages as one order-10 block to free_area[10].free_list[migratetype] */
list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
out:
zone->free_area[order].nr_free++; /* nr_free counts free blocks of this order, so increment it by one */
/* at this point the block is in the buddy system! */
}
static inline void set_page_order(struct page *page, unsigned int order)
{
set_page_private(page, order);
__SetPageBuddy(page); /*page->page_type &= ~PG_buddy;*/
}
#define set_page_private(page, v) ((page)->private = (v))
/* PAGE_TYPE_OPS(Buddy, buddy), PageType() and the PG_buddy bit were already listed above under page_is_buddy(); __SetPageBuddy() clears PG_buddy in page->page_type, which marks the page as free and in the buddy system. */
Summary:
1. First memblock.memory and memblock.reserved are initialized.
2. Then the mem_sections are set up; section->section_mem_map is the key field.
3. Then come the zones and the zone_list, plus the orders, migrate types and page flags.
4. Finally the free_area lists are populated: order-0 pages are first parked on the pcp and handed back to the buddy system once the pcp exceeds its high watermark, while pages with order != 0 go to the buddy system directly. In both cases the page's buddy is looked up and, if it is a valid free buddy, the two are merged: the buddy is removed via list_del(&buddy->lru), the order is incremented with page as the new start, and the result is added to free_area[order].free_list[type].
5. A free page's order is recorded in its private field.
6. pfn <-> page conversion goes through the mem_section as the intermediate step.
These are just notes taken while studying the buddy system; please point out any mistakes, thanks.