buddy system - Linux Memory Management (6)

1 Overview

  The buddy memory allocation algorithm was devised by Knowlton and later described in more depth by Knuth. Compared with other allocators it has proven to be extremely fast.

  It is an allocation scheme that combines a power-of-two (order) allocator with free-buffer coalescing, and its basic idea is very simple. Memory is divided into large blocks of pages, each a power-of-two number of pages in size. If a block of the desired size is not available, a larger block is split into two halves, which become each other's buddy. One half is used for the allocation and the other remains free. Blocks keep being halved until a block of the required size is obtained. When a block is eventually freed, its buddy is examined, and the two are merged if the buddy is free as well.

  The state of the buddy system can be inspected with cat /proc/buddyinfo.
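
  To make the split rule concrete, the following stand-alone sketch (illustration only, not kernel code; the helper name is made up) computes the order used to serve a request and shows how a larger block is halved down to that order:

#include <stdio.h>

/* smallest order whose block (2^order pages) can hold 'pages' pages */
static unsigned int pages_to_order(unsigned long pages)
{
	unsigned int order = 0;

	while ((1UL << order) < pages)
		order++;
	return order;
}

int main(void)
{
	unsigned int order = pages_to_order(5);
	unsigned int o;

	/* a 5-page request is rounded up to an order-3 (8-page) block */
	printf("5 pages -> order %u (%lu pages)\n", order, 1UL << order);

	/* serving an order-0 request from an order-3 block: each split
	 * leaves one buddy of the next lower order on the free lists */
	for (o = 3; o > 0; o--)
		printf("split order %u -> keep one free order-%u buddy\n",
		       o, o - 1);
	return 0;
}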

2 Data structures

  Each memory zone has its own buddy allocator data structure:

struct zone
{
     /* free areas of different sizes */
    struct free_area        free_area[MAX_ORDER];
};
  Each free_area entry manages the free blocks of one particular order. struct free_area is defined in include/linux/mmzone.h#L95:

struct free_area {
	struct list_head	free_list[MIGRATE_TYPES];
	unsigned long		nr_free;
};

  • free_list is an array of lists, one per migrate type, each linking the first pages of the free blocks of that type and order.
  • nr_free is the number of free blocks of this order in the zone (see the sketch after this list).
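
  As a minimal sketch of how these fields are used (a hypothetical helper, not a real kernel function; zone locking is ignored and a kernel context with linux/mmzone.h is assumed), the free pages held by a zone's buddy lists follow directly from the per-order nr_free counters:

/* hypothetical helper: total free pages currently on a zone's buddy lists;
 * each free block of a given order covers 2^order pages */
static unsigned long zone_buddy_free_pages(struct zone *zone)
{
	unsigned long total = 0;
	unsigned int order;

	for (order = 0; order < MAX_ORDER; order++)
		total += zone->free_area[order].nr_free << order;

	return total;
}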

2.1 MAX_ORDER

  The element at index 0 of the free_area array has order 0: its free_list lists link blocks consisting of a single page (2^0 = 1). The element at index 1 manages blocks of two pages (2^1 = 2), the element at index 2 manages blocks of four pages, and so on, up to blocks of 2^(MAX_ORDER-1) pages.


  MAX_ORDER defaults to 11 and is defined in include/linux/mmzone.h#L24; the kernel configuration option CONFIG_FORCE_MAX_ZONEORDER can be used to change it.

/* Free memory management - zoned buddy allocator.  */
#ifndef CONFIG_FORCE_MAX_ZONEORDER
#define MAX_ORDER 11
#else
#define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER
#endif
#define MAX_ORDER_NR_PAGES (1 << (MAX_ORDER - 1))
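
  As a quick check of these definitions: with the default MAX_ORDER of 11, the largest block has order MAX_ORDER - 1 = 10, so MAX_ORDER_NR_PAGES = 1 << 10 = 1024 pages, which with 4 KiB pages corresponds to 4 MiB of physically contiguous memory.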

2.2 zonelist

  The buddy system works per memory zone. When a zone cannot satisfy an allocation, the kernel tries the other zones in the priority order given by the node's fallback list of zones, the zonelist.


2.3 MIGRATE_TYPES

  Defined in include/linux/mmzone.h#L63:

enum migratetype {
	MIGRATE_UNMOVABLE,      /* has a fixed position in memory and cannot be moved elsewhere;
	                         * most core kernel allocations fall into this category */
	MIGRATE_MOVABLE,        /* can be moved at will; pages belonging to user-space applications
	                         * fall into this category. They are mapped through page tables: if
	                         * they are copied to a new location the page table entries are
	                         * updated accordingly and the application notices nothing */
	MIGRATE_RECLAIMABLE,    /* cannot be moved directly, but can be discarded because their
	                         * contents can be regenerated from some backing source, e.g. data
	                         * mapped from files. The kswapd daemon frees such pages periodically,
	                         * depending on how frequently they are accessed. Page reclaim is a
	                         * complex topic of its own: the kernel reclaims when reclaimable
	                         * pages occupy too much memory, and reclaim can also be triggered
	                         * on memory shortage (i.e. allocation failure) */
	MIGRATE_PCPTYPES,	/* the number of types on the pcp lists, i.e. the number of
	                         * migrate-type lists in per_cpu_pageset, the per-CPU page-frame cache */
	MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES,  /* in rare cases the kernel needs a high-order block
	                                         * and cannot sleep. If an allocation from the lists of
	                                         * a specific mobility fails, such emergency requests
	                                         * can be served from MIGRATE_HIGHATOMIC */
#ifdef CONFIG_CMA
	/*
	 * MIGRATE_CMA migration type is designed to mimic the way
	 * ZONE_MOVABLE works.  Only movable pages can be allocated
	 * from MIGRATE_CMA pageblocks and page allocator never
	 * implicitly change migration type of MIGRATE_CMA pageblock.
	 *
	 * The way to use it is to change migratetype of a range of
	 * pageblocks to MIGRATE_CMA which can be done by
	 * __free_pageblock_cma() function.  What is important though
	 * is that a range of pageblocks must be aligned to
	 * MAX_ORDER_NR_PAGES should biggest page be bigger then
	 * a single pageblock.
	 *
	 * The Contiguous Memory Allocator (CMA) avoids permanently reducing
	 * available memory by a large static reservation: while a driver does
	 * not need the area its pages are handed out to other users, and when
	 * the driver does need it the pages are reclaimed or migrated away.
	 */
	MIGRATE_CMA,
#endif
#ifdef CONFIG_MEMORY_ISOLATION
	MIGRATE_ISOLATE,	/* can't allocate from here;
	                         * a special virtual type used while moving physical pages, e.g.
	                         * across NUMA nodes. On large systems it can be beneficial to move
	                         * pages close to the CPUs that use them most frequently. */
#endif
	MIGRATE_TYPES           /* only the number of migrate types, not a real list */
};

  struct free_area keeps one list per migrate type. Requests for unmovable, movable and reclaimable pages are served first from the list of the matching migrate type. The benefit is less fragmentation of physical memory: if allocations did not distinguish unmovable, movable and reclaimable pages, unmovable pages would end up scattered randomly over time and the system would find it very hard to produce large blocks of contiguous free pages.

2.4 Walking the lists

  The macro for_each_migratetype_order iterates over every migrate type of every order; it is defined in include/linux/mmzone.h#L82:

#define for_each_migratetype_order(order, type) \
	for (order = 0; order < MAX_ORDER; order++) \
		for (type = 0; type < MIGRATE_TYPES; type++)
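
  A hypothetical debug helper (not present in the kernel source; zone locking is omitted and a kernel context with linux/mmzone.h, linux/list.h and linux/printk.h is assumed) shows how the macro is typically used to walk every (order, migratetype) pair of a zone, roughly the information /proc/pagetypeinfo reports per list:

static void dump_zone_free_areas(struct zone *zone)
{
	unsigned int order, type;

	for_each_migratetype_order(order, type) {
		struct list_head *lh;
		unsigned long blocks = 0;

		/* count the free blocks queued on this per-type list */
		list_for_each(lh, &zone->free_area[order].free_list[type])
			blocks++;

		pr_info("order %u, migratetype %u: %lu free blocks\n",
			order, type, blocks);
	}
}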

2.5 Determining the migrate type

  When requesting memory, the caller passes GFP flags that indicate the desired migrate type. They are defined in include/linux/gfp.h#L21:

#define ___GFP_MOVABLE		0x08u
#define ___GFP_RECLAIMABLE	0x10u
  When neither of these two flags is set, the request is for unmovable memory.
  The kernel also provides the helper gfpflags_to_migratetype to derive the migrate type from the GFP flags; it is defined in include/linux/gfp.h#L276:

/* Convert GFP flags to their corresponding migrate type */
#define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
#define GFP_MOVABLE_SHIFT 3

static inline int gfpflags_to_migratetype(const gfp_t gfp_flags)
{
	VM_WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);
	BUILD_BUG_ON((1UL << GFP_MOVABLE_SHIFT) != ___GFP_MOVABLE);
	BUILD_BUG_ON((___GFP_MOVABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_MOVABLE);

	if (unlikely(page_group_by_mobility_disabled))
		return MIGRATE_UNMOVABLE;

	/* Group based on mobility */
	return (gfp_flags & GFP_MOVABLE_MASK) >> GFP_MOVABLE_SHIFT;
}
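
  The mapping can be illustrated with a few common GFP masks (a hypothetical check, assuming page_group_by_mobility_disabled is 0 and a kernel context with linux/gfp.h available):

static void show_gfp_migratetypes(void)
{
	/* no mobility bits set -> unmovable kernel memory */
	WARN_ON(gfpflags_to_migratetype(GFP_KERNEL) != MIGRATE_UNMOVABLE);

	/* __GFP_MOVABLE set -> movable user memory */
	WARN_ON(gfpflags_to_migratetype(GFP_HIGHUSER_MOVABLE) != MIGRATE_MOVABLE);

	/* __GFP_RECLAIMABLE set, e.g. reclaimable slab caches */
	WARN_ON(gfpflags_to_migratetype(GFP_KERNEL | __GFP_RECLAIMABLE) !=
		MIGRATE_RECLAIMABLE);
}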

3 pageblock

  A mem_section or a zone manages a range of contiguous page frames. The kernel further groups these contiguous pages into pageblocks. When huge pages are not configured, a pageblock is the largest block the buddy system can hand out in a single allocation (see pageblock_nr_pages below). Each pageblock carries its own migrate type.


3.1 pageblock_nr_pages

  pageblock_nr_pages is 1 << pageblock_order; when huge pages are not configured, pageblock_order defaults to MAX_ORDER - 1. Both are defined in include/linux/pageblock-flags.h#L63:
#ifdef CONFIG_HUGETLB_PAGE

#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE

/* Huge page sizes are variable */
extern unsigned int pageblock_order;

#else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */

/* Huge pages are a constant size */
#define pageblock_order		HUGETLB_PAGE_ORDER

#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */

#else /* CONFIG_HUGETLB_PAGE */

/* If huge pages are not used, group by MAX_ORDER_NR_PAGES */
#define pageblock_order		(MAX_ORDER-1)

#endif /* CONFIG_HUGETLB_PAGE */

#define pageblock_nr_pages	(1UL << pageblock_order)
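
  For example, on x86-64 with 4 KiB pages and CONFIG_HUGETLB_PAGE enabled, HUGETLB_PAGE_ORDER is 9, so pageblock_nr_pages is 1 << 9 = 512 pages (2 MiB, the huge page size); without huge page support it would be 1 << (MAX_ORDER - 1) = 1024 pages (4 MiB).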

3.2 pageblock_flags

  As shown above, the largest contiguous block the buddy system normally manages is pageblock_nr_pages page frames; such a maximal block is a pageblock, and the smaller-order blocks handled by the buddy system are buddies split off from pageblocks. Each pageblock has a set of flag bits, pageblock_flags, describing its properties.

  pageblock_flags is defined in two places. Without SPARSEMEM it lives in struct zone, defined in include/linux/mmzone.h#L380:

struct zone {
#ifndef CONFIG_SPARSEMEM
	/*
	 * Flags for a pageblock_nr_pages block. See pageblock-flags.h.
	 * In SPARSEMEM, this map is stored in struct mem_section
	 */
	unsigned long		*pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
};
  With SPARSEMEM it lives in struct mem_section, defined in include/linux/mmzone.h#L1100:

struct mem_section {
	/* See declaration of similar field in struct zone */
	unsigned long *pageblock_flags;
};

3.3 pageblock_bits

  pageblock_flags reserves a fixed number of bits, described by enum pageblock_bits, for every pageblock to record its migrate type (and more). pageblock_bits is defined in include/linux/pageblock-flags.h#L29:

/* Bit indices that affect a whole block of pages */
enum pageblock_bits {
	PB_migrate,
	PB_migrate_end = PB_migrate + 3 - 1,
			/* 3 bits required for migrate types */
	PB_migrate_skip,/* If set the block is skipped by compaction */

	/*
	 * Assume the bits will always align on a word. If this assumption
	 * changes then get/set pageblock needs updating.
	 */
	NR_PAGEBLOCK_BITS
};
  • The current kernel needs 3 bits to encode a pageblock's migrate type, plus one bit used by compaction to skip the block (see the sketch below).
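
  As a sketch of what this implies for the bitmap size (a hypothetical helper; the kernel does the equivalent in its usemap sizing code, and a kernel context with linux/pageblock-flags.h and linux/kernel.h is assumed), every pageblock_nr_pages pages consume NR_PAGEBLOCK_BITS bits of pageblock_flags:

/* hypothetical helper: number of pageblock_flags bits needed to cover a
 * zone (or section) of 'nr_pages' page frames */
static unsigned long pageblock_bitmap_bits(unsigned long nr_pages)
{
	unsigned long blocks = DIV_ROUND_UP(nr_pages, pageblock_nr_pages);

	return blocks * NR_PAGEBLOCK_BITS;
}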

3.4 get_pageblock_bitmap

  get_pageblock_bitmap returns a pointer to the pageblock_flags bitmap covering a given page; it is defined in mm/page_alloc.c#L368:
/* Return a pointer to the bitmap storing bits affecting a block of pages */
static inline unsigned long *get_pageblock_bitmap(struct page *page,
							unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
	return __pfn_to_section(pfn)->pageblock_flags;
#else
	return page_zone(page)->pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
}
  • With SPARSEMEM, the page frame number is used to find the pageblock_flags pointer in the mem_section.
  • Without SPARSEMEM, the pageblock_flags pointer is taken from the page's zone.

3.5 set_pageblock_migratetype

  set_pageblock_migratetype sets, in pageblock_flags, the migrate type of the pageblock containing a given page; it is defined in mm/page_alloc.c#L468:

void set_pageblock_migratetype(struct page *page, int migratetype)
{
	if (unlikely(page_group_by_mobility_disabled &&
		     migratetype < MIGRATE_PCPTYPES))
		migratetype = MIGRATE_UNMOVABLE;

	set_pageblock_flags_group(page, (unsigned long)migratetype,
					PB_migrate, PB_migrate_end);
}
  set_pageblock_flags_group is defined in include/linux/pageblock-flags.h#L84:
#define set_pageblock_flags_group(page, flags, start_bitidx, end_bitidx) \
	set_pfnblock_flags_mask(page, flags, page_to_pfn(page),		\
			end_bitidx,					\
			(1 << (end_bitidx - start_bitidx + 1)) - 1)
  set_pfnblock_flags_mask is defined in mm/page_alloc.c#L437:
/**
 * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
 * @page: The page within the block of interest
 * @flags: The flags to set
 * @pfn: The target page frame number
 * @end_bitidx: The last bit of interest
 * @mask: mask of bits that the caller is interested in
 */
void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
					unsigned long pfn,
					unsigned long end_bitidx,
					unsigned long mask)
{
	unsigned long *bitmap;
	unsigned long bitidx, word_bitidx;
	unsigned long old_word, word;

	BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);

	bitmap = get_pageblock_bitmap(page, pfn);
	bitidx = pfn_to_bitidx(page, pfn);
	word_bitidx = bitidx / BITS_PER_LONG;
	bitidx &= (BITS_PER_LONG-1);

	VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);

	bitidx += end_bitidx;
	mask <<= (BITS_PER_LONG - bitidx - 1);
	flags <<= (BITS_PER_LONG - bitidx - 1);

	word = READ_ONCE(bitmap[word_bitidx]);
	for (;;) {
		old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
		if (word == old_word)
			break;
		word = old_word;
	}
}
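
  A worked example of the shifting above, with values assumed for illustration (64-bit word, so BITS_PER_LONG = 64):

/*
 * Suppose the pageblock's 4 bits start at bitidx = 8 within their bitmap
 * word and MIGRATE_MOVABLE (1) is to be set:
 *
 *   end_bitidx = PB_migrate_end = 2, mask = (1 << 3) - 1 = 0b111
 *   bitidx += end_bitidx               -> 10
 *   shift = BITS_PER_LONG - bitidx - 1 -> 64 - 10 - 1 = 53
 *   mask <<= 53;  flags = 1UL << 53
 *
 * The cmpxchg() loop then replaces only bits 53..55 of the word, so the
 * 4-bit groups of the neighbouring pageblocks stored in the same long are
 * left untouched even when several CPUs update the bitmap concurrently.
 */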

3.6 change_pageblock_range

  change_pageblock_range changes the migrate type of a range of pageblocks; it is defined in mm/page_alloc.c#L1923:

static void change_pageblock_range(struct page *pageblock_page,
					int start_order, int migratetype)
{
	int nr_pageblocks = 1 << (start_order - pageblock_order);

	while (nr_pageblocks--) {
		set_pageblock_migratetype(pageblock_page, migratetype);
		pageblock_page += pageblock_nr_pages;
	}
}

3.7 get_pageblock_migratetype

  get_pageblock_migratetype returns the migrate type of the pageblock containing a given page; it is defined in include/linux/mmzone.h#L91:
#define get_pageblock_migratetype(page)					\
	get_pfnblock_flags_mask(page, page_to_pfn(page),		\
			PB_migrate_end, MIGRATETYPE_MASK)
  get_pfnblock_flags_mask simply calls the internal helper __get_pfnblock_flags_mask, defined in mm/page_alloc.c#L398:
/**
 * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
 * @page: The page within the block of interest
 * @pfn: The target page frame number
 * @end_bitidx: The last bit of interest to retrieve
 * @mask: mask of bits that the caller is interested in
 *
 * Return: pageblock_bits flags
 */
static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page,
					unsigned long pfn,
					unsigned long end_bitidx,
					unsigned long mask)
{
	unsigned long *bitmap;
	unsigned long bitidx, word_bitidx;
	unsigned long word;

	bitmap = get_pageblock_bitmap(page, pfn);
	bitidx = pfn_to_bitidx(page, pfn);
	word_bitidx = bitidx / BITS_PER_LONG;
	bitidx &= (BITS_PER_LONG-1);

	word = bitmap[word_bitidx];
	bitidx += end_bitidx;
	return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
}

3.8 move_freepages_block

  Given a page, move_freepages_block moves the free pages of the pageblock containing it (the range is aligned to the pageblock size) onto the free_list of a new migrate type; the order of the blocks is unchanged. It is defined in mm/page_alloc.c#L1901:

/* The return value is the number of page frames moved.
 * num_movable reports how many already-allocated page frames are movable;
 * it is used to decide more precisely whether to change the pageblock's
 * migrate type.
 */
int move_freepages_block(struct zone *zone, struct page *page,
				int migratetype, int *num_movable)
{
	unsigned long start_pfn, end_pfn;
	struct page *start_page, *end_page;

	/* align the start and end page frames to pageblock_nr_pages boundaries */
	start_pfn = page_to_pfn(page);
	start_pfn = start_pfn & ~(pageblock_nr_pages-1);
	start_page = pfn_to_page(start_pfn);
	end_page = start_page + pageblock_nr_pages - 1;
	end_pfn = start_pfn + pageblock_nr_pages - 1;

	/* Do not cross zone boundaries */
	/* check that the start and end frames lie within the zone; if the start does not, use page itself as the start */
	if (!zone_spans_pfn(zone, start_pfn))
		start_page = page;
	/* if the end frame is not within the zone, return 0 */
	if (!zone_spans_pfn(zone, end_pfn))
		return 0;

	/* move the free pages of this pageblock from the old migrate type to the new free_list; the order stays the same and pages in use are skipped */
	return move_freepages(zone, start_page, end_page, migratetype,
								num_movable);
}

3.8.1 move_freepages

  move_freepages is defined in mm/page_alloc.c#L1846. It moves the free page frames of a range (which need not be pageblock aligned) from their current migrate type to a new one and returns the total number of free page frames moved. Through the num_movable pointer it also reports how many of the already-allocated page frames in the range are movable; this is only a count, no actual isolation or migration is performed, because that would be expensive. See the kernel patch "mm, page_alloc: count movable pages when stealing from pageblock" for details.

/*
 * Move the free pages in a range to the free lists of the requested type.
 * Note that start_page and end_pages are not aligned on a pageblock
 * boundary. If alignment is required, use move_freepages_block()
 * Move this range of page frames onto the buddy lists of the new migratetype; pages in use are skipped.
 */
static int move_freepages(struct zone *zone,
			  struct page *start_page, struct page *end_page,
			  int migratetype, int *num_movable)
{
	struct page *page;
	unsigned int order;
	int pages_moved = 0;

#ifndef CONFIG_HOLES_IN_ZONE
	/*
	 * page_zone is not safe to call in this context when
	 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
	 * anyway as we check zone boundaries in move_freepages_block().
	 * Remove at a later date when no bug reports exist related to
	 * grouping pages by mobility
	 */
	VM_BUG_ON(page_zone(start_page) != page_zone(end_page));
#endif

	if (num_movable)
		*num_movable = 0;

	/* walk this range of page frames */
	for (page = start_page; page <= end_page;) {
		/* skip page frames whose pfn is not valid */
		if (!pfn_valid_within(page_to_pfn(page))) {
			page++;
			continue;
		}

		/* Make sure we are not inadvertently changing nodes */
		VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);

		/* if the page is not in the buddy system, move on to the next page; PageBuddy checks page->_mapcount against PAGE_BUDDY_MAPCOUNT_VALUE (-128) */
		if (!PageBuddy(page)) {
			/*
			 * We assume that pages that could be isolated for
			 * migration are movable. But we don't actually try
			 * isolating, as that would be expensive.
			 * For allocated LRU or movable pages only num_movable is bumped;
			 * no actual isolating is performed here. page++ then skips the page.
			 */
			if (num_movable &&
					(PageLRU(page) || __PageMovable(page)))
				(*num_movable)++;

			page++;
			continue;
		}

		/* get this block's order, which is stored in page->private */
		order = page_order(page);
		/* take the block off its old list and put it on the free list of the same order under the new migratetype */
		list_move(&page->lru,
			  &zone->free_area[order].free_list[migratetype]);
		/* skip the 2^order page frames of this block */
		page += 1 << order;
		/* account for the page frames just moved */
		pages_moved += 1 << order;
	}
	/* return the total number of page frames moved */
	return pages_moved;
}

4 fallback

  As we saw earlier, when one zone cannot satisfy an allocation request, the kernel goes to other zones according to the node's fallback zonelist.

  Within a single zone, when the requested migrate type cannot satisfy the allocation, the kernel also provides a migrate-type fallback mechanism that steals a chunk of space from the free lists of the other migrate types in the same zone.

4.1 The fallback table fallbacks

  fallbacks is defined in mm/page_alloc.c#L1818:

/*
 * This array describes the order lists are fallen back to when
 * the free lists for the desirable migrate type are depleted
 * (that is, for each migrate type, the order in which the other free
 * lists are tried once its own lists are exhausted)
 */
static int fallbacks[MIGRATE_TYPES][4] = {
	//  fallback list when allocating unmovable pages fails
	[MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_TYPES },
	//  fallback list when allocating reclaimable pages fails
	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_TYPES },
	//  fallback list when allocating movable pages fails
	[MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
#ifdef CONFIG_CMA
	[MIGRATE_CMA]         = { MIGRATE_TYPES }, /* Never used */
#endif
#ifdef CONFIG_MEMORY_ISOLATION
	[MIGRATE_ISOLATE]     = { MIGRATE_TYPES }, /* Never used */
#endif
};

4.2 can_steal_fallback

  can_steal_fallback decides whether the current request is allowed to steal from a fallback migrate type. It is defined in mm/page_alloc.c#L1946:

/*
 * When we are falling back to another migratetype during allocation, try to
 * steal extra free pages from the same pageblocks to satisfy further
 * allocations, instead of polluting multiple pageblocks.
 *
 * If we are stealing a relatively large buddy page, it is likely there will
 * be more free pages in the pageblock, so try to steal them all. For
 * reclaimable and unmovable allocations, we steal regardless of page size,
 * as fragmentation caused by those allocations polluting movable pageblocks
 * is worse than movable allocations stealing from unmovable and reclaimable
 * pageblocks.
 */
static bool can_steal_fallback(unsigned int order, int start_mt)
{
	/*
	 * Leaving this order check is intended, although there is
	 * relaxed order check in next check. The reason is that
	 * we can actually steal whole pageblock if this condition met,
	 * but, below check doesn't guarantee it and that is just heuristic
	 * so could be changed anytime.
	 * The request covers a whole pageblock (or more), so the whole pageblock may be stolen.
	 */
	if (order >= pageblock_order)
		return true;

	/*
	 * Stealing is allowed if any of the following holds:
	 * - the request covers at least half a pageblock
	 * - the requested type is reclaimable or unmovable
	 * - grouping pages by mobility is disabled
	 */
	if (order >= pageblock_order / 2 ||
		start_mt == MIGRATE_RECLAIMABLE ||
		start_mt == MIGRATE_UNMOVABLE ||
		page_group_by_mobility_disabled)
		return true;

	return false;
}

4.3 find_suitable_fallback

  find_suitable_fallback looks for a usable fallback migrate type; it is defined in mm/page_alloc.c#L2051. The only_stealable parameter restricts the result to fallback types from which stealing is actually allowed.

/*
 * Check whether there is a suitable fallback freepage with requested order.
 * If only_stealable is true, this function returns fallback_mt only if
 * we can steal other freepages all together. This would help to reduce
 * fragmentation due to mixed migratetype pages in one pageblock.
 * When only_stealable is true, can_steal_fallback() must additionally return true for a fallback type to be reported.
 */
int find_suitable_fallback(struct free_area *area, unsigned int order,
			int migratetype, bool only_stealable, bool *can_steal)
{
	int i;
	int fallback_mt;

	if (area->nr_free == 0)
		return -1;

	*can_steal = false;

	/* walk the fallbacks row for the requested migrate type */
	for (i = 0;; i++) {
		fallback_mt = fallbacks[migratetype][i];
		if (fallback_mt == MIGRATE_TYPES)
			break;

		/* this fallback type has no free blocks of this order, try the next one */
		if (list_empty(&area->free_list[fallback_mt]))
			continue;

		/* check whether the steal conditions are met */
		if (can_steal_fallback(order, migratetype))
			*can_steal = true;

		/* if only_stealable is false, return the first fallback type that has free blocks */
		if (!only_stealable)
			return fallback_mt;

		/* if only_stealable is true, return the fallback type only when stealing is allowed */
		if (*can_steal)
			return fallback_mt;
	}

	/* no suitable fallback migrate type was found, return -1 */
	return -1;
}
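
  The following simplified sketch (loosely modelled on __rmqueue_fallback() in mm/page_alloc.c; error handling and several details are omitted, so treat it as pseudocode living inside mm/page_alloc.c rather than the real implementation) shows how find_suitable_fallback and steal_suitable_fallback fit together:

static bool try_fallback_steal(struct zone *zone, int order,
			       int start_migratetype)
{
	int current_order;

	/* prefer stealing a large block so fewer pageblocks get mixed */
	for (current_order = MAX_ORDER - 1; current_order >= order;
	     current_order--) {
		struct free_area *area = &zone->free_area[current_order];
		bool can_steal;
		int fallback_mt;

		fallback_mt = find_suitable_fallback(area, current_order,
						     start_migratetype,
						     false, &can_steal);
		if (fallback_mt == -1)
			continue;

		/* take a block from the donor list and re-type it; the
		 * caller then retries the allocation from the lists of
		 * start_migratetype */
		steal_suitable_fallback(zone,
			list_first_entry(&area->free_list[fallback_mt],
					 struct page, lru),
			start_migratetype, can_steal);
		return true;
	}
	return false;
}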

4.4 steal_suitable_fallback

  steal_suitable_fallback performs the actual stealing. If the order is large enough, the whole pageblock is stolen. Otherwise the free pages of the pageblock are first moved to the requested migrate type and the already-allocated pages are checked for a compatible migrate type. If at least half of the pageblock is free or allocated with a compatible type, the pageblock's migrate type is changed to the requested one, so that compatible pages freed later land on the correct free list. It is defined in mm/page_alloc.c#L1975:

/*
 * This function implements actual steal behaviour. If order is large enough,
 * we can steal whole pageblock. If not, we first move freepages in this
 * pageblock to our migratetype and determine how many already-allocated pages
 * are there in the pageblock with a compatible migratetype. If at least half
 * of pages are free or compatible, we can change migratetype of the pageblock
 * itself, so pages freed in the future will be put on the correct free list.
 */
static void steal_suitable_fallback(struct zone *zone, struct page *page,
					int start_type, bool whole_block)
{
	unsigned int current_order = page_order(page);
	struct free_area *area;
	int free_pages, movable_pages, alike_pages;
	int old_block_type;

	/* the pageblock's current migrate type */
	old_block_type = get_pageblock_migratetype(page);

	/*
	 * This can happen due to races and we want to prevent broken
	 * highatomic accounting.
	 */
	if (is_migrate_highatomic(old_block_type))
		goto single_page;

	/* Take ownership for orders >= pageblock_order */
	/* When the requested order is >= pageblock_order, change the migrate
	 * type of the whole pageblock range directly, then jump to
	 * single_page to put the block's first page on
	 * area->free_list[start_type].
	 */
	if (current_order >= pageblock_order) {
		change_pageblock_range(page, current_order, start_type);
		goto single_page;
	}

	/* We are not allowed to try stealing from the whole block */
	/* if whole_block is false we may not re-type the pageblock; jump to single_page and just move this block to area->free_list[start_type] */
	if (!whole_block)
		goto single_page;

	/* Move the pageblock's free pages to the free_list of start_type.
	 * free_pages is the number of page frames actually moved;
	 * movable_pages counts already-allocated pages that could be
	 * migrated later.
	 */
	free_pages = move_freepages_block(zone, page, start_type,
						&movable_pages);
	/*
	 * Determine how many pages are compatible with our allocation.
	 * For movable allocation, it's the number of movable pages which
	 * we just obtained. For other types it's a bit more tricky.
	 * Roughly count how many allocated page frames are compatible with the requested type.
	 */
	if (start_type == MIGRATE_MOVABLE) {
		/* when the requested type is movable, the alike count is simply movable_pages */
		alike_pages = movable_pages;
	} else {
		/*
		 * If we are falling back a RECLAIMABLE or UNMOVABLE allocation
		 * to MOVABLE pageblock, consider all non-movable pages as
		 * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or
		 * vice versa, be conservative since we can't distinguish the
		 * exact migratetype of non-movable pages.
		 * If the requested type is not movable and the old pageblock was
		 * movable, count all non-movable allocated pages as compatible (alike).
		 */
		if (old_block_type == MIGRATE_MOVABLE)
			alike_pages = pageblock_nr_pages
						- (free_pages + movable_pages);
		else
			alike_pages = 0;
	}

	/* moving whole block can fail due to zone boundary conditions */
	if (!free_pages)
		goto single_page;

	/*
	 * If a sufficient number of pages in the block are either free or of
	 * comparable migratability as our allocation, claim the whole block.
	 * When free pages plus alike pages cover at least half a pageblock, re-type the pageblock.
	 */
	if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
			page_group_by_mobility_disabled)
		set_pageblock_migratetype(page, start_type);

	return;

single_page:
	/* move just this block's first page to the new free_list; the order stays the same */
	area = &zone->free_area[current_order];
	list_move(&page->lru, &area->free_list[start_type]);
}

4.5 Inspecting page migrate type information

  The distribution of free blocks per migrate type and order can be read from /proc/pagetypeinfo.


5 Initialising the buddy system

  Before the buddy system is initialised, all node and zone descriptors have already been set up.

5.1 Pageblock migrate type initialisation

  All page frames are initially marked as movable. This is done in memmap_init_zone, defined in mm/page_alloc.c#L5335. Its call path is start_kernel -> setup_arch -> x86_init.paging.pagetable_init() -> paging_init -> zone_sizes_init -> free_area_init_nodes -> free_area_init_node -> free_area_init_core -> memmap_init -> memmap_init_zone.

void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
		unsigned long start_pfn, enum memmap_context context)
{
	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
not_early:
		/*
		 * Mark the block movable so that blocks are reserved for
		 * movable at startup. This will force kernel allocations
		 * to reserve their blocks rather than leaking throughout
		 * the address space during boot when many long-lived
		 * kernel allocations are made.
		 *
		 * bitmap is created for zone's valid pfn range. but memmap
		 * can be created for invalid pages (for alignment)
		 * check here not to call set_pageblock_migratetype() against
		 * pfn out of zone.
		 * The first page of every pageblock gets its migrate type set to movable.
		 */
		if (!(pfn & (pageblock_nr_pages - 1))) {
			struct page *page = pfn_to_page(pfn);

			__init_single_page(page, pfn, zone, nid);
			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
		} else {
			__init_single_pfn(pfn, zone, nid);
		}
	}
}

5.2 fallback initialisation

  The migrate-type fallback mechanism is not always worthwhile. If none of the migrate-type lists holds a reasonably large contiguous block, grouping pages by mobility brings no benefit, so the kernel disables the feature when too little memory is available. The check is made in build_all_zonelists, the function that initialises the zonelists: if not enough memory is available, the global variable page_group_by_mobility_disabled is set to 1, otherwise to 0. It is defined in mm/page_alloc.c#L5230 and is called from start_kernel during boot and again on memory hot-add.

void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)  
{  
    vm_total_pages = nr_free_pagecache_pages();  
    /* 
     * Disable grouping by mobility if the number of pages in the 
     * system is too low to allow the mechanism to work. It would be 
     * more accurate, but expensive to check per-zone. This check is 
     * made on memory-hotadd so a system can start with mobility 
     * disabled and enable it later 
     * When the amount of memory in the system is below this threshold, grouping by mobility (and with it migrate-type fallback) is disabled.
     */  
    if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))  
        page_group_by_mobility_disabled = 1;  
    else  
        page_group_by_mobility_disabled = 0; 
}
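
  As a rough worked example (assuming 4 KiB pages, pageblock_nr_pages = 512, and MIGRATE_TYPES = 6, i.e. CONFIG_CMA and CONFIG_MEMORY_ISOLATION both enabled), the threshold is 512 * 6 = 3072 pages, about 12 MiB, so mobility grouping only stays disabled on systems with very little memory.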

5.3 zone_init_free_lists

  The per-zone buddy lists are first emptied in zone_init_free_lists, defined in mm/page_alloc.c#L5342. Its call path is start_kernel -> setup_arch -> x86_init.paging.pagetable_init() -> paging_init -> zone_sizes_init -> free_area_init_nodes -> free_area_init_node -> free_area_init_core -> init_currently_empty_zone -> zone_init_free_lists.

static void __meminit zone_init_free_lists(struct zone *zone)
{
	unsigned int order, t;
	for_each_migratetype_order(order, t) {
		INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
		zone->free_area[order].nr_free = 0;
	}
}

5.4 mem_init

  The buddy system is populated mainly in mem_init. On x86_32 high memory and normal memory are handled separately, via set_highmem_pages_init and free_all_bootmem respectively; x86_64 has no high memory and calls free_all_bootmem directly. After the buddy system has been initialised, the global variable after_bootmem is set to 1. mem_init is called via start_kernel -> mm_init -> mem_init.


5.5 Freeing boot memory

  Besides free_all_bootmem, the kernel provides several other interfaces for releasing boot memory to the buddy system.


6 APIs

  (Figure: block diagram of how the buddy system's public allocation interfaces map onto the internal implementation; not reproduced here.)