linux内存管理----伙伴系统初始化

内核:X86 openeuler4.19
内存大小3G,采用的是sparse vmemmap内存模型,相关config如下

CONFIG_ARCH_SPARSEMEM_ENABLE=y
CONFIG_ARCH_SPARSEMEM_DEFAULT=y
CONFIG_SPARSEMEM_MANUAL=y
CONFIG_SPARSEMEM=y
CONFIG_SPARSEMEM_EXTREME=y
CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y
CONFIG_SPARSEMEM_VMEMMAP=y

memblock

伙伴系统初始化之前的内存分配是通过memblock进行的,在启动参数添加memblock=debug后,可以看到这个阶段使用memblock分配器的内存使用信息。
在vmware虚拟机的环境下,物理内存大小通过boot_params传递给内核,然后加入到memblock的memory类型里,早期通过memblock预留的内存则加入到reserve类型

paging_init

sparse_memory_present_with_active_regions

对加入到memblock.memory里的所有内存区域调用memory_present


/* Record a memory area against a node. */
/* Record a memory area against a node. */
void __init memory_present(int nid, unsigned long start, unsigned long end)
{
	unsigned long pfn;

#ifdef CONFIG_SPARSEMEM_EXTREME
	if (unlikely(!mem_section)) {
		unsigned long size, align;

		size = sizeof(struct mem_section*) * NR_SECTION_ROOTS; /* 2048 pointers = 16 KB */
		align = 1 << (INTERNODE_CACHE_SHIFT);
		mem_section = memblock_virt_alloc(size, align); /* Allocate the 16 KB root
		 * array from memblock. Under the sparse memory model every 128 MB of
		 * physical memory is one section, described by a struct mem_section.
		 * The mem_section structs are grouped SECTIONS_PER_ROOT (256) to a
		 * "root"; the global mem_section array stores the address of each
		 * root. One root therefore covers 256 * 128 MB = 32 GB of physical
		 * address space, so with only 3 GB of RAM in this experiment a
		 * single root suffices: inspecting mem_section with the crash tool
		 * shows every entry except mem_section[0] is 0x0. Each
		 * mem_section's section_mem_map member points at a struct page
		 * array, so every struct page is reachable via the global
		 * mem_section variable. */
	}
#endif

	start &= PAGE_SECTION_MASK;
	mminit_validate_memmodel_limits(&start, &end);
	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) { /* one mem_section per 2^15 pages (128 MB) */
		unsigned long section = pfn_to_section_nr(pfn);
		struct mem_section *ms;

		sparse_index_init(section, nid); /* allocate one root on demand: 256 mem_sections = 4 KB */
		set_section_nid(section, nid);

		ms = __nr_to_section(section); /* index by section number into the root array; with
						* 3 GB spanning phys 0-0xbfffffff, entries
						* mem_section[0][0] .. mem_section[0][23] get filled
						* (3 GB / 128 MB = 24 sections) */
		if (!ms->section_mem_map) { /* section_mem_map is sufficiently aligned that its
					     * low bits are free to serve as flag bits */
			ms->section_mem_map = sparse_encode_early_nid(nid) |
							SECTION_IS_ONLINE;
			section_mark_present(ms);
		}
	}
}

#define NR_SECTION_ROOTS	DIV_ROUND_UP(NR_MEM_SECTIONS, SECTIONS_PER_ROOT) /* 2048 here */
#define NR_MEM_SECTIONS		(1UL << SECTIONS_SHIFT)
#define SECTIONS_SHIFT	(MAX_PHYSMEM_BITS - SECTION_SIZE_BITS) /* 46 - 27 = 19 */
# define SECTION_SIZE_BITS	27 /* matt - 128 is convenient right now */
# define MAX_PHYSADDR_BITS	(pgtable_l5_enabled() ? 52 : 44)
# define MAX_PHYSMEM_BITS	(pgtable_l5_enabled() ? 52 : 46) /* 46 without 5-level paging */
#define SECTIONS_PER_ROOT       (PAGE_SIZE / sizeof (struct mem_section)) /* 256 in this setup */

用crash工具查看mem_section全局数组(共NR_SECTION_ROOTS=2048项,下标到2048即越界),可见只有mem_section[0]指向有效的组
在这里插入图片描述

sparse_init

按内存节点为所有的mem_section结构体分配page数组,并为其section_mem_map成员赋值,本例中仅有一个内存节点(笔者接触到的手机、个人pc等设备都只有一个内存结点)


/*
 * Allocate the accumulated non-linear sections, allocate a mem_map
 * for each and record the physical to section mapping.
 */
/*
 * Allocate the accumulated non-linear sections, allocate a mem_map
 * for each and record the physical to section mapping.
 */
void __init sparse_init(void)
{
	unsigned long pnum_begin = first_present_section_nr();
	int nid_begin = sparse_early_nid(__nr_to_section(pnum_begin));
	unsigned long pnum_end, map_count = 1;

	/* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
	set_pageblock_order();

	/* Walk the present sections, batching consecutive sections that
	 * belong to the same node so each node is initialized in one call. */
	for_each_present_section_nr(pnum_begin + 1, pnum_end) {
		int nid = sparse_early_nid(__nr_to_section(pnum_end));

		if (nid == nid_begin) {
			map_count++;
			continue;
		}
		/* Init node with sections in range [pnum_begin, pnum_end) */
		sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
		nid_begin = nid;
		pnum_begin = pnum_end;
		map_count = 1;
	}
	/* cover the last node */
	sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count); /* node 0 here; map_count is the
								      * number of present sections on
								      * the node (24 with 3 GB of RAM) */
	vmemmap_populate_print_last();
}

sparse_init_nid


/*
 * Initialize sparse on a specific node. The node spans [pnum_begin, pnum_end)
 * And number of present sections in this node is map_count.
 */
/*
 * Initialize sparse on a specific node. The node spans [pnum_begin, pnum_end)
 * And number of present sections in this node is map_count.
 */
static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
				   unsigned long pnum_end,
				   unsigned long map_count)
{
	unsigned long pnum, usemap_longs, *usemap;
	struct page *map;
	/* Each mem_section carries a "usemap" bitmap tracking the pageblock
	 * state of its pages; compute its size in longs (4 in this setup). */
	usemap_longs = BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS); 
	/* Allocate the usemaps for all of the node's sections in one shot
	 * (24 sections * 4 longs * 8 bytes here). */
	usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid),
							  usemap_size() *
							  map_count);

	if (!usemap) {
		pr_err("%s: node[%d] usemap allocation failed", __func__, nid);
		goto failed;
	}
	/* Pre-allocate sizeof(struct page) * PAGES_PER_SECTION * map_count
	 * bytes, PAGE_SIZE (4 KB) aligned, to back the struct page arrays:
	 * 64 * 32K * 24 = 50331648 bytes (48 MB) in this setup. */
	sparse_buffer_init(map_count * section_map_size(), nid);
	for_each_present_section_nr(pnum_begin, pnum) { 
		if (pnum >= pnum_end)
			break;
		map = sparse_mem_map_populate(pnum, nid, NULL);
		if (!map) {
			pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.",
			       __func__, nid);
			pnum_begin = pnum;
			goto failed;
		}
		check_usemap_section_nr(nid, usemap);
		sparse_init_one_section(__nr_to_section(pnum), pnum, map, usemap); 
		/* Fill in this mem_section's section_mem_map. The stored value
		 * is map with the section's pfn offset subtracted, which is why
		 * every mem_section ends up holding the same section_mem_map
		 * value. Its low bits read 7, i.e. all of the following flags
		 * are set:
		 * #define	SECTION_MARKED_PRESENT	(1UL<<0)
		 * #define  SECTION_HAS_MEM_MAP	(1UL<<1)
		 * #define  SECTION_IS_ONLINE	(1UL<<2)
		 */
		usemap += usemap_longs;
	}
	sparse_buffer_fini(); /* return any unused part of the pre-allocated buffer */
	return;
failed:
	/* We failed to allocate, mark all the following pnums as not present */
	for_each_present_section_nr(pnum_begin, pnum) {
		struct mem_section *ms;

		if (pnum >= pnum_end)
			break;
		ms = __nr_to_section(pnum);
		ms->section_mem_map = 0;
	}
}

sparse_mem_map_populate

/*
 * Build the struct page array (mem_map) backing one section and return its
 * base; returns NULL if the page tables for it cannot be populated.
 */
struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid,
		struct vmem_altmap *altmap)
{
	unsigned long start;
	unsigned long end;
	struct page *map;

	map = pfn_to_page(pnum * PAGES_PER_SECTION); /* Convert pfn to page pointer.
	 * With SPARSEMEM_VMEMMAP the struct page array lives at the virtual
	 * address vmemmap_base, making pfn <-> page conversion pure arithmetic
	 * and avoiding the extra memory accesses plain SPARSEMEM needs (find
	 * the mem_section root, then the section, then the page array).
	 * vmemmap_base starts as ffffea0000000000 and is later randomized in
	 * setup_arch-->kernel_randomize_memory. */
	start = (unsigned long)map;
	end = (unsigned long)(map + PAGES_PER_SECTION); /* one section = 128 MB = 32K pages */

	if (vmemmap_populate(start, end, nid, altmap)) /* back [start, end) with memory,
							* allocating any missing pud/p4d
							* page-table levels on the way */
		return NULL;

	return map;
}

vmemmap_populate-->vmemmap_populate_hugepages

/*
 * Map the vmemmap range [start, end) for node `node` using 2 MB huge pages
 * where possible, falling back to 4 KB base pages otherwise.
 */
static int __meminit vmemmap_populate_hugepages(unsigned long start,
		unsigned long end, int node, struct vmem_altmap *altmap)
{
	unsigned long addr;
	unsigned long next;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	
	for (addr = start; addr < end; addr = next) {
		next = pmd_addr_end(addr, end);
		pgd = vmemmap_pgd_populate(addr, node); /* walk from init_mm's pgd as the page-table base */
		if (!pgd)
			return -ENOMEM;

		p4d = vmemmap_p4d_populate(pgd, addr, node); /* allocate 4 KB for the p4d table if missing */
		if (!p4d)
			return -ENOMEM;

		pud = vmemmap_pud_populate(p4d, addr, node); /* allocate 4 KB for the pud table if missing;
							      * each pud holds 512 pmd pointers */
		if (!pud)
			return -ENOMEM;

		pmd = pmd_offset(pud, addr);
		if (pmd_none(*pmd)) {
			void *p;

			if (altmap)
				p = altmap_alloc_block_buf(PMD_SIZE, altmap);
			else
				p = vmemmap_alloc_block_buf(PMD_SIZE, node); /* Grab 2 MB to back one
					 * pmd-sized chunk of vmemmap, i.e. the 32K struct pages of
					 * 128 MB of physical memory. This "allocation" just advances
					 * the sparsemap_buf pointer, because sparse_buffer_init
					 * already reserved space for the whole node's page arrays. */
			if (p) {
				pte_t entry;

				entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
						PAGE_KERNEL_LARGE);
				set_pmd(pmd, __pmd(pte_val(entry)));

				/* check to see if we have contiguous blocks */
				if (p_end != p || node_start != node) {
					if (p_start)
						pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n",
						       addr_start, addr_end-1, p_start, p_end-1, node_start);
					addr_start = addr;
					node_start = node;
					p_start = p;
				}

				addr_end = addr + PMD_SIZE;
				p_end = p + PMD_SIZE;
				continue;
			} else if (altmap)
				return -ENOMEM; /* no fallback */
		} else if (pmd_large(*pmd)) {
			vmemmap_verify((pte_t *)pmd, node, addr, next);
			continue;
		}
		/* huge-page mapping unavailable: fall back to 4 KB base pages */
		if (vmemmap_populate_basepages(addr, next, node))
			return -ENOMEM;
	}
	return 0;
}

zone_sizes_init

max_pfn和max_low_pfn都为786432

/*
 * Compute the upper pfn bound of each zone and hand the array to
 * free_area_init_nodes(), which sizes and initializes all nodes/zones.
 */
void __init zone_sizes_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));

#ifdef CONFIG_ZONE_DMA
	max_zone_pfns[ZONE_DMA]		= min(MAX_DMA_PFN, max_low_pfn); /* first 16 MB */
#endif
#ifdef CONFIG_ZONE_DMA32
	max_zone_pfns[ZONE_DMA32]	= min(MAX_DMA32_PFN, max_low_pfn); /* up to 3 GB here */
#endif
	max_zone_pfns[ZONE_NORMAL]	= max_low_pfn;
#ifdef CONFIG_HIGHMEM
	max_zone_pfns[ZONE_HIGHMEM]	= max_pfn;
#endif

	free_area_init_nodes(max_zone_pfns);
}

free_area_init_nodes会计算得到各个zone的大小并输出,此例中会得到以下输出
Zone ranges:
DMA [mem 0x0000000000001000-0x0000000000ffffff]
DMA32 [mem 0x0000000001000000-0x00000000bfffffff]
Normal empty
Device empty

free_area_init_node


/*
 * Initialize one node's pg_data_t: record its pfn range, count its pages,
 * allocate its mem_map (flatmem only) and set up its zones.
 */
void __init free_area_init_node(int nid, unsigned long *zones_size,
				   unsigned long node_start_pfn,
				   unsigned long *zholes_size)
{
	pg_data_t *pgdat = NODE_DATA(nid);
	unsigned long start_pfn = 0;
	unsigned long end_pfn = 0;

	/* pg_data_t should be reset to zero when it's allocated */
	WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx);

	pgdat->node_id = nid;
	pgdat->node_start_pfn = node_start_pfn;
	pgdat->per_cpu_nodestats = NULL;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
	get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); /* first and last pfn of this node */
	pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
		(u64)start_pfn << PAGE_SHIFT,
		end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
/* prints: Initmem setup node 0 [mem 0x0000000000001000-0x00000000bfffffff] */
#else
	start_pfn = node_start_pfn;
#endif
	/* compute page counts for the node and each of its zones */
	calculate_node_totalpages(pgdat, start_pfn, end_pfn,
				  zones_size, zholes_size);

	alloc_node_mem_map(pgdat); /* only allocates under the flat memory model */
	pgdat_set_deferred_range(pgdat);

	free_area_init_core(pgdat);
}

free_area_init_core


/*
 * Set up the zone data structures:
 *   - mark all pages reserved
 *   - mark all memory queues empty
 *   - clear the memory bitmaps
 *
 * NOTE: pgdat should get zeroed by caller.
 * NOTE: this function is only called during early init.
 */
/*
 * Set up the zone data structures:
 *   - mark all pages reserved
 *   - mark all memory queues empty
 *   - clear the memory bitmaps
 *
 * NOTE: pgdat should get zeroed by caller.
 * NOTE: this function is only called during early init.
 */
static void __init free_area_init_core(struct pglist_data *pgdat)
{
	enum zone_type j;
	int nid = pgdat->node_id;

	pgdat_init_internals(pgdat); /* initialize the node's LRU lists and related members */
	pgdat->per_cpu_nodestats = &boot_nodestats;

	for (j = 0; j < MAX_NR_ZONES; j++) {
		struct zone *zone = pgdat->node_zones + j;
		unsigned long size, freesize, memmap_pages;
		unsigned long zone_start_pfn = zone->zone_start_pfn;

		size = zone->spanned_pages;
		freesize = zone->present_pages;

		/*
		 * Adjust freesize so that it accounts for how much memory
		 * is used by this zone for memmap. This affects the watermark
		 * and per-cpu initialisations
		 */
		memmap_pages = calc_memmap_size(size, freesize); /* pages consumed by the
								  * struct page array itself */
		if (!is_highmem_idx(j)) {
			if (freesize >= memmap_pages) {
				freesize -= memmap_pages;
				if (memmap_pages)
					printk(KERN_DEBUG
					       "  %s zone: %lu pages used for memmap\n",
					       zone_names[j], memmap_pages);
			} else
				pr_warn("  %s zone: %lu pages exceeds freesize %lu\n",
					zone_names[j], memmap_pages, freesize);
		}

		/* Account for reserved pages */
		if (j == 0 && freesize > dma_reserve) {
			freesize -= dma_reserve;
			printk(KERN_DEBUG "  %s zone: %lu pages reserved\n",
					zone_names[0], dma_reserve);
		}

		if (!is_highmem_idx(j))
			nr_kernel_pages += freesize;
		/* Charge for highmem memmap if there are enough kernel pages */
		else if (nr_kernel_pages > memmap_pages * 2)
			nr_kernel_pages -= memmap_pages;
		nr_all_pages += freesize;

		/*
		 * Set an approximate value for lowmem here, it will be adjusted
		 * when the bootmem allocator frees pages into the buddy system.
		 * And all highmem pages will be managed by the buddy system.
		 */
		zone_init_internals(zone, j, nid, freesize);

		if (!size)
			continue;

		set_pageblock_order(); /* no-op in this configuration */
		setup_usemap(pgdat, zone, zone_start_pfn, size); /* no-op in this configuration */
		init_currently_empty_zone(zone, zone_start_pfn, size); /* initialize the zone's free_area lists */
		memmap_init(size, nid, j, zone_start_pfn);
	}
}

memmap_init-->memmap_init_zone-->__init_single_page(page, pfn, zone, nid)

/*
 * Initialize one struct page during early boot: zero it, encode its zone
 * and node into page->flags, and reset its reference/map counts.
 */
static void __meminit __init_single_page(struct page *page, unsigned long pfn,
				unsigned long zone, int nid)
{
	mm_zero_struct_page(page);
	set_page_links(page, zone, nid, pfn); /* set page->flags to record its zone and node */
	init_page_count(page); /* set page->_refcount to 1 */
	page_mapcount_reset(page); /* set page->_mapcount to -1 (fixed: this note was
				    * bare text after the semicolon, not a comment) */
	page_cpupid_reset_last(page);

	INIT_LIST_HEAD(&page->lru);
#ifdef WANT_PAGE_VIRTUAL
	/* The shift won't overflow because ZONE_NORMAL is below 4G. */
	if (!is_highmem_idx(zone))
		set_page_address(page, __va(pfn << PAGE_SHIFT));
#endif
}

build_all_zonelists

将zone按从高到低的顺序加入内存结点pglist的zonelist中

mm_init-->mem_init



/*
 * Final hand-off from the boot allocator: release all remaining boot memory
 * to the buddy system and report the resulting memory layout.
 */
void __init mem_init(void)
{
	pci_iommu_alloc();

	/* clear_bss() already clear the empty_zero_page */

	/* this will put all memory onto the freelists */
	free_all_bootmem();
	after_bootmem = 1;
	x86_init.hyper.init_after_bootmem();

	/*
	 * Must be done after boot memory is put on freelist, because here we
	 * might set fields in deferred struct pages that have not yet been
	 * initialized, and free_all_bootmem() initializes all the reserved
	 * deferred pages for us.
	 */
	register_page_bootmem_info();

	/* Register memory areas for /proc/kcore */
	if (get_gate_vma(&init_mm))
		kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, PAGE_SIZE, KCORE_USER);

	mem_init_print_info(NULL); /* prints a summary like:
	 * Memory: 210408K/3145204K available (12300K kernel code, 2546K rwd.... */
}


free_all_bootmem


/*
 * Release every free memblock region to the buddy allocator and return the
 * number of pages freed.
 */
unsigned long __init free_all_bootmem(void)
{
	unsigned long pages;

	reset_all_zones_managed_pages(); /* set every zone's managed_pages to 0 */

	pages = free_low_memory_core_early(); /* drop each page's _refcount to 0 and
					       * free it into the buddy system */
	totalram_pages += pages;

	return pages;
}

  • 21
    点赞
  • 25
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值