内存管理---伙伴系统---

内核使用伙伴系统来解决内存分配引起的外部碎片问题。

一、数据结构描述

struct zone {
<span style="white-space:pre">	</span>/**********/
	struct free_area	free_area[MAX_ORDER]; 每一种元素对应一种块的大小。
<span style="white-space:pre">	</span>/*********/
};

/*
 * Free-block bookkeeping for one block size of the buddy allocator.
 */
struct free_area {
	/* One list of free blocks per migrate type. */
	struct list_head	free_list[MIGRATE_TYPES];
	/* Number of free blocks of this size (a single counter for all lists). */
	unsigned long		nr_free;
};
free_area共有MAX_ORDER个元素,其中第order个元素记录了大小为2^order页的空闲块,这些空闲块在free_list中以双向链表的形式组织起来;对于同等大小的空闲块,其迁移类型不同,将组织在不同的free_list中,链表中的节点对应着struct page中的lru域;nr_free指定了大小为2^order页的空闲块的个数。

/*
 * Excerpt of struct page: only the list linkage is shown.
 */
struct page {
	/* ... other members omitted ... */
	struct list_head lru;		/* Pageout list, eg. active_list
					 * protected by zone->lru_lock !
					 */
	/* ... other members omitted ... */
};

free_area中的free_list链表数组以迁移类型为下标,各迁移类型的定义如下:

/*
 * Page migrate types. MIGRATE_TYPES is used as the size of
 * free_area.free_list[], so each type value selects one free list.
 */
#define MIGRATE_UNMOVABLE     0
#define MIGRATE_RECLAIMABLE   1
#define MIGRATE_MOVABLE       2
#define MIGRATE_PCPTYPES      3 /* the number of types on the pcp lists */
/* Deliberately shares the value 3 with MIGRATE_PCPTYPES (a count, not a type). */
#define MIGRATE_RESERVE       3
#define MIGRATE_ISOLATE       4 /* can't allocate from here */
#define MIGRATE_TYPES         5 /* total number of migrate types */

MIGRATE_PCPTYPES表示per_cpu_pageset(即用来描述每CPU页框高速缓存的数据结构)的链表中所包含的迁移类型数目

MIGRATE_RESERVE:当前三种迁移类型的链表中都没有可满足分配的内存块时,就可以从MIGRATE_RESERVE分配

MIGRATE_ISOLATE用于跨越NUMA节点移动物理内存页;在大型系统上,它有益于将物理内存页移动到靠近最频繁使用该页的CPU的位置

MIGRATE_TYPES表示迁移类型的数目

当一个指定的迁移类型所对应的链表中没有空闲块时,将会按以下定义的顺序到其他迁移类型的链表中寻找

/*
 * Fallback order between migrate types: when the free lists of the
 * requested type (the row index) are empty, the types listed in that row
 * are tried from left to right.
 * No row is given for MIGRATE_ISOLATE, so that row is zero-initialized.
 */
static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = {
	[MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_RESERVE },
	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_RESERVE },
	[MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
	[MIGRATE_RESERVE]     = { MIGRATE_RESERVE,     MIGRATE_RESERVE,   MIGRATE_RESERVE }, /* Never used */
};



伙伴系统的初始化:

伙伴系统是在初始化内存管理区(zone)的时候一并初始化的,具体实现在下面的函数中:

start_kernel()->setup_arch()->paging_init()->zone_sizes_init()->free_area_init_nodes()->free_area_init_node()->free_area_init_core()->init_currently_empty_zone()->zone_init_free_lists();

/*
 * Put every free list of @zone into the empty state: for each block order
 * and each migrate type the list head is initialized and the free-block
 * counter of that order is reset to zero.
 */
static void __meminit zone_init_free_lists(struct zone *zone)
{
	int order;

	for (order = 0; order < MAX_ORDER; order++) {
		struct free_area *area = &zone->free_area[order];
		int mt;

		for (mt = 0; mt < MIGRATE_TYPES; mt++) {
			INIT_LIST_HEAD(&area->free_list[mt]);
			area->nr_free = 0;
		}
	}
}
/*
 * Iterate over every (order, migrate type) pair: the outer loop walks
 * @order from 0 to MAX_ORDER - 1, the inner loop walks @type from 0 to
 * MIGRATE_TYPES - 1. The statement following the macro becomes the body
 * of the inner loop.
 */
#define for_each_migratetype_order(order, type) \
	for (order = 0; order < MAX_ORDER; order++) \
		for (type = 0; type < MIGRATE_TYPES; type++)


伙伴系统中数据初始化

bootmem分配器中的数据回收到伙伴系统中

start_kernel()->mm_init()->mem_init()

/*
 * Set up kernel memory allocators
 *
 * Calls the individual allocator initializers in a fixed order.
 */
static void __init mm_init(void)
{
	/*
	 * page_cgroup requires contiguous pages,
	 * bigger than MAX_ORDER unless SPARSEMEM.
	 */
	page_cgroup_init_flatmem();
	mem_init();		/* hands the bootmem pages to the buddy allocator */
	kmem_cache_init();	/* slab cache initialization */
	percpu_init_late();	/* per-CPU variables (NOTE(review): details not shown here) */
	pgtable_cache_init();
	vmalloc_init();		/* used by non-contiguous memory management */
}


init_32.c中

/*
 * Final stage of memory setup (quoted from init_32.c): initializes the
 * highmem pages, releases the bootmem pages to the free lists, counts the
 * reserved low-memory pages, prints the memory summary and the virtual
 * kernel memory layout, and sanity-checks the layout constants.
 */
void __init mem_init(void)
{
	int codesize, reservedpages, datasize, initsize;
	int tmp;

	pci_iommu_alloc();	/* PCI related */

#ifdef CONFIG_FLATMEM
	BUG_ON(!mem_map);
#endif
	/*
	 * With CONFIG_DEBUG_PAGEALLOC initialization of highmem pages has to
	 * be done before free_all_bootmem(). Memblock use free low memory for
	 * temporary data (see find_range_array()) and for this purpose can use
	 * pages that was already passed to the buddy allocator, hence marked as
	 * not accessible in the page tables when compiled with
	 * CONFIG_DEBUG_PAGEALLOC. Otherwise order of initialization is not
	 * important here.
	 */
	set_highmem_pages_init();

	/*
	 * this will put all low memory onto the freelists: the bootmem pages,
	 * including the pages occupied by the bootmem bitmap, are released to
	 * the buddy system; the return value is the number of pages released.
	 */
	totalram_pages += free_all_bootmem();

	reservedpages = 0;
	for (tmp = 0; tmp < max_low_pfn; tmp++)
		/*
		 * Only count reserved RAM pages:
		 */
		if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
			reservedpages++;

	/* Sizes of the kernel code, data and init sections */
	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;

	printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, "
			"%dk reserved, %dk data, %dk init, %ldk highmem)\n",
		nr_free_pages() << (PAGE_SHIFT-10),
		num_physpages << (PAGE_SHIFT-10),
		codesize >> 10,
		reservedpages << (PAGE_SHIFT-10),
		datasize >> 10,
		initsize >> 10,
		totalhigh_pages << (PAGE_SHIFT-10));

	printk(KERN_INFO "virtual kernel memory layout:\n"
		"    fixmap  : 0x%08lx - 0x%08lx   (%4ld kB)\n"
#ifdef CONFIG_HIGHMEM
		"    pkmap   : 0x%08lx - 0x%08lx   (%4ld kB)\n"
#endif
		"    vmalloc : 0x%08lx - 0x%08lx   (%4ld MB)\n"
		"    lowmem  : 0x%08lx - 0x%08lx   (%4ld MB)\n"
		"      .init : 0x%08lx - 0x%08lx   (%4ld kB)\n"
		"      .data : 0x%08lx - 0x%08lx   (%4ld kB)\n"
		"      .text : 0x%08lx - 0x%08lx   (%4ld kB)\n",
		FIXADDR_START, FIXADDR_TOP,
		(FIXADDR_TOP - FIXADDR_START) >> 10,

#ifdef CONFIG_HIGHMEM
		PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
		(LAST_PKMAP*PAGE_SIZE) >> 10,
#endif

		VMALLOC_START, VMALLOC_END,
		(VMALLOC_END - VMALLOC_START) >> 20,

		(unsigned long)__va(0), (unsigned long)high_memory,
		((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,

		(unsigned long)&__init_begin, (unsigned long)&__init_end,
		((unsigned long)&__init_end -
		 (unsigned long)&__init_begin) >> 10,

		(unsigned long)&_etext, (unsigned long)&_edata,
		((unsigned long)&_edata - (unsigned long)&_etext) >> 10,

		(unsigned long)&_text, (unsigned long)&_etext,
		((unsigned long)&_etext - (unsigned long)&_text) >> 10);

	/*
	 * Check boundaries twice: Some fundamental inconsistencies can
	 * be detected at build time already.
	 */
#define __FIXADDR_TOP (-PAGE_SIZE)
#ifdef CONFIG_HIGHMEM
	BUILD_BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE	> FIXADDR_START);
	BUILD_BUG_ON(VMALLOC_END			> PKMAP_BASE);
#endif
#define high_memory (-128UL << 20)
	BUILD_BUG_ON(VMALLOC_START			>= VMALLOC_END);
#undef high_memory
#undef __FIXADDR_TOP

	/* Same boundary checks again, this time with the run-time values. */
#ifdef CONFIG_HIGHMEM
	BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE	> FIXADDR_START);
	BUG_ON(VMALLOC_END				> PKMAP_BASE);
#endif
	BUG_ON(VMALLOC_START				>= VMALLOC_END);
	BUG_ON((unsigned long)high_memory		> VMALLOC_START);

	if (boot_cpu_data.wp_works_ok < 0)
		test_wp_bit();
}
其中会调用的几个函数:

/**
 * free_all_bootmem - hand the free bootmem pages to the buddy allocator
 *
 * Walks every bootmem descriptor on bdata_list and releases its pages
 * through free_all_bootmem_core().
 *
 * Returns the total number of pages released across all descriptors.
 */
unsigned long __init free_all_bootmem(void)
{
	bootmem_data_t *node;
	unsigned long released = 0;

	list_for_each_entry(node, &bdata_list, list) {
		released += free_all_bootmem_core(node);
	}

	return released;
}

/*
 * Release the free pages tracked by one bootmem descriptor, followed by the
 * pages that hold its bitmap, to the buddy allocator.
 *
 * @bdata: bootmem descriptor to drain
 *
 * Returns the number of pages released, or 0 if the descriptor has no bitmap.
 */
static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
{
	struct page *page;
	unsigned long start, end, pages, count = 0;

	if (!bdata->node_bootmem_map)
		return 0;

	/* First page frame of the node and the (exclusive) end of the range */
	start = bdata->node_min_pfn;
	end = bdata->node_low_pfn;

	bdebug("nid=%td start=%lx end=%lx\n",
		bdata - bootmem_node_data, start, end);

	/* Walk the whole page frame range covered by this bootmem descriptor */
	while (start < end) {
		unsigned long *map, idx, vec;

		map = bdata->node_bootmem_map;
		/* Offset of the current page frame from the start of the node */
		idx = start - bdata->node_min_pfn;
		/* Inverted bitmap word: a set bit now marks a free page */
		vec = ~map[idx / BITS_PER_LONG];
		/*
		 * If we have a properly aligned and fully unreserved
		 * BITS_PER_LONG block of pages in front of us, free
		 * it in one go.
		 */
		if (IS_ALIGNED(start, BITS_PER_LONG) && vec == ~0UL) {
			int order = ilog2(BITS_PER_LONG);

			/* Release the whole block to the buddy system */
			__free_pages_bootmem(pfn_to_page(start), order);
			count += BITS_PER_LONG;
			start += BITS_PER_LONG;
		} else {
			unsigned long off = 0;

			/* Drop the bits that belong to pages below start */
			vec >>= start & (BITS_PER_LONG - 1);
			while (vec) {
				/* Bit set means free (vec was inverted above) */
				if (vec & 1) {
					page = pfn_to_page(start + off);
					__free_pages_bootmem(page, 0);
					count++;	/* one more page released */
				}
				vec >>= 1;	/* move on to the next page */
				off++;
			}
			start = ALIGN(start + 1, BITS_PER_LONG);
		}
	}

	/*
	 * Convert the bitmap's virtual address to its struct page and release
	 * the pages occupied by the bitmap itself.
	 */
	page = virt_to_page(bdata->node_bootmem_map);
	pages = bdata->node_low_pfn - bdata->node_min_pfn;
	/* Number of pages used by the bitmap that covers that many pages */
	pages = bootmem_bootmap_pages(pages);
	count += pages;
	while (pages--)		/* release one page per iteration */
		__free_pages_bootmem(page++, 0);

	bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);

	return count;
}


/*
 * Release a block of 2^order pages starting at @page to the page allocator.
 *
 * Every page in the block has its reserved flag cleared and its reference
 * count set to 0; the block is then freed through __free_pages().
 *
 * @page:  first page of the block
 * @order: log2 of the number of pages in the block
 */
void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
{
	unsigned int nr_pages = 1 << order;
	unsigned int loop;

	/* NOTE(review): presumably warms the cache line before it is written */
	prefetchw(page);
	for (loop = 0; loop < nr_pages; loop++) {
		struct page *p = &page[loop];

		/* Prefetch the next struct page while handling this one */
		if (loop + 1 < nr_pages)
			prefetchw(p + 1);
		__ClearPageReserved(p);
		set_page_count(p, 0);	/* reference count of the page becomes 0 */
	}

	set_page_refcounted(page);	/* reference count of the first page becomes 1 */
	__free_pages(page, order);	/* free the block */
}

/*
 * Add the page frames of every highmem zone to the buddy system and account
 * them in totalram_pages.
 */
void __init set_highmem_pages_init(void)
{
	struct zone *zone;
	int nid;

	for_each_zone(zone) {
		unsigned long zone_start_pfn, zone_end_pfn;

		/* Only highmem zones are handled here */
		if (!is_highmem(zone))
			continue;

		/* First page frame of the zone and the end of its span */
		zone_start_pfn = zone->zone_start_pfn;
		zone_end_pfn = zone_start_pfn + zone->spanned_pages;

		nid = zone_to_nid(zone);
		printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n",
				zone->name, nid, zone_start_pfn, zone_end_pfn);

		/* Add the highmem page frames to the buddy system */
		add_highpages_with_active_regions(nid, zone_start_pfn,
				 zone_end_pfn);
	}
	totalram_pages += totalhigh_pages;
}

/*
 * For every range yielded by for_each_free_mem_range() for node @nid, clamp
 * the range to [@start_pfn, @end_pfn] and pass each valid page frame inside
 * it to add_one_highpage_init().
 */
void __init add_highpages_with_active_regions(int nid,
			 unsigned long start_pfn, unsigned long end_pfn)
{
	phys_addr_t range_start, range_end;
	u64 iter;

	for_each_free_mem_range(iter, nid, &range_start, &range_end, NULL) {
		unsigned long first = clamp_t(unsigned long,
					      PFN_UP(range_start),
					      start_pfn, end_pfn);
		unsigned long limit = clamp_t(unsigned long,
					      PFN_DOWN(range_end),
					      start_pfn, end_pfn);
		unsigned long cur;

		for (cur = first; cur < limit; cur++) {
			if (!pfn_valid(cur))
				continue;
			add_one_highpage_init(pfn_to_page(cur));
		}
	}
}






评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值