内存管理--检测内存

Linux kernel被bootloader加载到内存后,CPU首先执行arch/x86/boot/header.S中的start_of_setup等代码,然后跳转到arch/x86/boot/main.c中的main();main()在早期初始化过程中调用detect_memory()函数探测内存:

/*
 * Probe the BIOS for memory information through all three INT 0x15
 * interfaces (E820, E801 and 88h).
 *
 * Every probe is always attempted, even when an earlier one already
 * succeeded.
 *
 * Returns 0 if at least one probe succeeded, -1 if all of them failed.
 */
int detect_memory(void)
{
	int any_ok = 0;

	/* E820 signals success with a positive entry count. */
	if (detect_memory_e820() > 0)
		any_ok = 1;

	/* E801 and 88h signal success with 0. */
	if (detect_memory_e801() == 0)
		any_ok = 1;

	if (detect_memory_88() == 0)
		any_ok = 1;

	return any_ok ? 0 : -1;
}

Linux内核通过detect_memory_xxx系列函数来获取内存相关信息;这几个函数都是通过触发int 0x15中断来获取信息的,调用前分别把AX寄存器设置为0xe820、0xe801,或者把AH寄存器设置为0x88。


对于e820();

struct e820entry {
	__u64 addr;	/* start of memory segment */该内存段的起始地址
	__u64 size;	/* size of memory segment */该内存段段的大小
	__u32 type;	/* type of memory segment */该内存段的类型
} __attribute__((packed));

struct e820map {
<span style="white-space:pre">		</span>__u32 nr_map;
<span style="white-space:pre">		</span>struct e820entry map[E820_X_MAX];
};
type:该内存段的类型,可分为Usable (normal) RAM、Reserved - unusable、ACPI reclaimable memory、ACPI NVS memory、Area containing bad memory。要获取所有内存段的信息,detect_memory_e820()通过一个do-while循环不断触发int 0x15中断,每次获取一个内存段的信息,并将这些信息依次保存到boot_params.e820_map这个struct e820entry类型的数组中。

/*
 * Query the BIOS memory map through INT 0x15, AX=0xE820 and store the
 * reported entries in boot_params.e820_map.
 *
 * Returns the number of entries stored, which is also written to
 * boot_params.e820_entries.  The count is reset to 0 when the BIOS
 * stops returning the SMAP signature in the middle of the scan.
 */
static int detect_memory_e820(void)
{
	int count = 0;
	struct biosregs ireg, oreg;
	struct e820entry *desc = boot_params.e820_map;
	static struct e820entry buf; /* static so it is zeroed */

	initregs(&ireg);
	ireg.ax  = 0xe820;
	ireg.cx  = sizeof buf;
	ireg.edx = SMAP;
	ireg.di  = (size_t)&buf;

	/*
	 * Note: at least one BIOS is known which assumes that the
	 * buffer pointed to by one e820 call is the same one as
	 * the previous call, and only changes modified fields.  Therefore,
	 * we use a temporary buffer and copy the results entry by entry.
	 *
	 * This routine deliberately does not try to account for
	 * ACPI 3+ extended attributes.  This is because there are
	 * BIOSes in the field which report zero for the valid bit for
	 * all ranges, and we don't currently make any use of the
	 * other attribute bits.  Revisit this if we see the extended
	 * attribute bits deployed in a meaningful way in the future.
	 */

	do {
		/*
		 * Inputs for each call (set up in ireg):
		 *   ax  = 0xe820
		 *   cx  = sizeof buf
		 *   edx = SMAP
		 *   di  = address of the temporary buffer buf
		 *   ebx = continuation value returned by the previous call
		 *
		 * Outputs examined (from oreg):
		 *   eflags (carry flag), eax (expected to be SMAP) and
		 *   ebx (continuation value for the next call).
		 */
		intcall(0x15, &ireg, &oreg); /* raise interrupt 0x15 */
		ireg.ebx = oreg.ebx; /* for next iteration... */

		/* BIOSes which terminate the chain with CF = 1 as opposed
		   to %ebx = 0 don't always report the SMAP signature on
		   the final, failing, probe. */
		if (oreg.eflags & X86_EFLAGS_CF)
			break;

		/* Some BIOSes stop returning SMAP in the middle of
		   the search loop.  We don't know exactly how the BIOS
		   screwed up the map at that point, we might have a
		   partial map, the full map, or complete garbage, so
		   just return failure. */
		if (oreg.eax != SMAP) {
			count = 0;
			break;
		}

		*desc++ = buf; /* save the entry that was just reported */
		count++; /* one more memory segment recorded */
	} while (ireg.ebx && count < ARRAY_SIZE(boot_params.e820_map));

	/* Publish the number of entries alongside the map. */
	return boot_params.e820_entries = count;
}


  

/*
 * Query the memory size through INT 0x15, AX=0xE801 and record it in
 * boot_params.alt_mem_k.
 *
 * Returns 0 on success, -1 if the BIOS sets the carry flag or reports
 * a value above 15*1024 in the first register.
 */
static int detect_memory_e801(void)
{
	struct biosregs ireg, oreg;

	initregs(&ireg);
	ireg.ax = 0xe801;
	intcall(0x15, &ireg, &oreg);

	if ((oreg.eflags & X86_EFLAGS_CF) != 0)
		return -1;

	/* Do we really need to do this? */
	if (oreg.cx != 0 || oreg.dx != 0) {
		/* Prefer the cx/dx pair whenever either of them is non-zero. */
		oreg.ax = oreg.cx;
		oreg.bx = oreg.dx;
	}

	if (oreg.ax > 15*1024)
		return -1;	/* Bogus! */

	if (oreg.ax != 15*1024) {
		/*
		 * This ignores memory above 16MB if we have a memory
		 * hole there.  If someone actually finds a machine
		 * with a memory hole at 16MB and no support for
		 * 0E820h they should probably generate a fake e820
		 * map.
		 */
		boot_params.alt_mem_k = oreg.ax;
	} else {
		/* bx is scaled by 64 (<< 6) before being added to ax. */
		boot_params.alt_mem_k = (oreg.bx << 6) + oreg.ax;
	}

	return 0;
}

/*
 * Query the memory size through INT 0x15, AH=0x88 and store the value
 * returned in AX in boot_params.screen_info.ext_mem_k.
 *
 * The value is stored regardless of the outcome.  Returns 0 when the
 * carry flag is clear and -1 when it is set.
 */
static int detect_memory_88(void)
{
	struct biosregs ireg, oreg;

	initregs(&ireg);
	ireg.ah = 0x88;
	intcall(0x15, &ireg, &oreg);

	boot_params.screen_info.ext_mem_k = oreg.ax;

	/* Carry set means the call failed. */
	return (oreg.eflags & X86_EFLAGS_CF) ? -1 : 0;
}

对于32位的系统,通过调用链arch/x86/boot/main.c:main()--->arch/x86/boot/pm.c:go_to_protected_mode()--->arch/x86/boot/pmjump.S:protected_mode_jump()--->arch/x86/boot/compressed/head_32.S:startup_32()--->arch/x86/kernel/head_32.S:startup_32()--->arch/x86/kernel/head32.c:i386_start_kernel()--->init/main.c:start_kernel(),到达众所周知的Linux内核启动函数start_kernel()。这里会调用setup_arch()完成与体系结构相关的一系列初始化工作,其中就包括各种内存的初始化工作,如内存图的建立、管理区的初始化等等。对x86体系结构,setup_arch()函数在arch/x86/kernel/setup.c中,如下:

/*
 * Abridged excerpt of the x86 setup_arch(): only the memory-related
 * steps are shown, elided parts are marked with "......".
 */
void __init setup_arch(char **cmdline_p)
{
	/* ...... */

	x86_init.oem.arch_setup();

	setup_memory_map(); /* build the memory map */

	e820_reserve_setup_data();

	/* ...... */

	/*
	 * partially used pages are not usable - thus
	 * we are rounding upwards:
	 */
	max_pfn = e820_end_of_ram_pfn(); /* highest usable page frame number */

	/* ...... */

#ifdef CONFIG_X86_32
	/* max_low_pfn is updated here */
	find_low_pfn_range(); /* find the highest page frame number of low memory */
#else
	num_physpages = max_pfn;

	/* ...... */
#endif	/* NOTE(review): closing #endif was lost in the elided text; restored here */

	/* max_pfn_mapped is updated here */
	/* set up the memory mapping */
	max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
	max_pfn_mapped = max_low_pfn_mapped;

	/* ...... */

	initmem_init(0, max_pfn); /* start the memory allocator */

	/* ...... */

	x86_init.paging.pagetable_setup_start(swapper_pg_dir);
	paging_init(); /* build the complete page tables */
	x86_init.paging.pagetable_setup_done(swapper_pg_dir);

	/* ...... */
}

 

在 start_kernel---->setup_arch()--------------->setup_memory_map

/*
 * Build the memory map through the platform's memory_setup hook, keep a
 * copy of the resulting e820 table in e820_saved, and print the map.
 */
void __init setup_memory_map(void)
{
	/* The hook returns a label naming where the map came from. */
	char *origin = x86_init.resources.memory_setup();

	/* Snapshot the table as it stands right after setup. */
	memcpy(&e820_saved, &e820, sizeof(struct e820map));

	printk(KERN_INFO "e820: BIOS-provided physical RAM map:\n");
	e820_print_map(origin);
}
x86_init.c中定义了x86下的memory_setup函数:

/*
 * Default platform setup hooks for standard PC hardware.  Each group
 * below fills one sub-structure of x86_init with its default callbacks.
 */
struct x86_init_ops x86_init __initdata = {

	/* Resource hooks; memory_setup is the one setup_memory_map() calls. */
	.resources = {
		.probe_roms = probe_roms,
		.reserve_resources = reserve_standard_io_resources,
		.memory_setup = default_machine_specific_memory_setup,
	},

	/* MP table parsing hooks. */
	.mpparse = {
		.mpc_record = x86_init_uint_noop,
		.setup_ioapic_ids = x86_init_noop,
		.mpc_apic_id = default_mpc_apic_id,
		.smp_read_mpc_oem = default_smp_read_mpc_oem,
		.mpc_oem_bus_info = default_mpc_oem_bus_info,
		.find_smp_config = default_find_smp_config,
		.get_smp_config = default_get_smp_config,
	},

	/* Interrupt setup hooks. */
	.irqs = {
		.pre_vector_init = init_ISA_irqs,
		.intr_init = native_init_IRQ,
		.trap_init = x86_init_noop,
	},

	/* OEM hooks; arch_setup is invoked from setup_arch(). */
	.oem = {
		.arch_setup = x86_init_noop,
		.banner = default_banner,
	},

	.mapping = {
		.pagetable_reserve = native_pagetable_reserve,
	},

	/* Page table setup hooks, called around paging_init() in setup_arch(). */
	.paging = {
		.pagetable_setup_start = native_pagetable_setup_start,
		.pagetable_setup_done = native_pagetable_setup_done,
	},

	/* Timer and clock hooks. */
	.timers = {
		.setup_percpu_clockev = setup_boot_APIC_clock,
		.tsc_pre_init = x86_init_noop,
		.timer_init = hpet_time_init,
		.wallclock_init = x86_init_noop,
	},

	.iommu = {
		.iommu_init = iommu_init_noop,
	},

	/* PCI hooks. */
	.pci = {
		.init = x86_default_pci_init,
		.init_irq = x86_default_pci_init_irq,
		.fixup_irqs = x86_default_pci_fixup_irqs,
	},
};

可知会回调:default_machine_specific_memory_setup();

/*
 * Default memory_setup hook: try to take over the BIOS-supplied E820
 * map; if that fails, fake a two-region map (0 -> LOWMEMSIZE() and
 * HIGH_MEMORY -> the larger of the E801 / 88h sizes).
 *
 * Returns a label naming the source of the map: "BIOS-e820",
 * "BIOS-e801" or "BIOS-88".
 */
char *__init default_machine_specific_memory_setup(void)
{
	u32 nr_entries = boot_params.e820_entries;
	u64 mem_size;
	char *who;

	/*
	 * Clean up the BIOS map in place (the original note says this
	 * removes overlapping segments) and store the new entry count.
	 */
	sanitize_e820_map(boot_params.e820_map,
			ARRAY_SIZE(boot_params.e820_map),
			&nr_entries);
	boot_params.e820_entries = nr_entries;

	/* Copy the layout from boot_params.e820_map into the e820 table. */
	if (append_e820_map(boot_params.e820_map, boot_params.e820_entries)
	    >= 0)
		return "BIOS-e820";

	/* compare results from other methods and take the greater */
	if (boot_params.alt_mem_k >= boot_params.screen_info.ext_mem_k) {
		mem_size = boot_params.alt_mem_k;
		who = "BIOS-e801";
	} else {
		mem_size = boot_params.screen_info.ext_mem_k;
		who = "BIOS-88";
	}

	/* Start over with an empty table and add the two fake regions. */
	e820.nr_map = 0;
	e820_add_region(0, LOWMEMSIZE(), E820_RAM);
	e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM);

	/* In case someone cares... */
	return who;
}

1.消除内存段的重叠部分

2.将内存布局信息从boot_params.e820_map拷贝到e820中

append_e820_map(boot_params.e820_map, boot_params.e820_entries)将会调用以下函数:

/*
 * Append nr_map entries, starting at biosmap, to the e820 table via
 * e820_add_region().
 *
 * Returns 0 on success, or -1 as soon as an entry's start + size
 * overflows 64 bits (entries added before that point stay added).
 */
static int __init __append_e820_map(struct e820entry *biosmap, int nr_map)
{
	while (nr_map) {
		u64 start = biosmap->addr;
		u64 size = biosmap->size;
		u64 end = start + size;
		u32 type = biosmap->type;

		/* Overflow in 64 bits? Ignore the memory map. */
		if (start > end)
			return -1;

		/* One region is added per loop iteration, nr_map in total. */
		e820_add_region(start, size, type);

		biosmap++;
		nr_map--;
	}
	return 0;
}
  
/*
 * Add a region described by start, size and type to the global e820
 * table by forwarding to __e820_add_region().
 */
void __init e820_add_region(u64 start, u64 size, int type)
{
	__e820_add_region(&e820, start, size, type);
}
/* Global memory map table that e820_add_region() adds regions to. */
struct e820map e820;

至此,物理内存布局信息已经从BIOS中读出,并存放到全局变量e820中。

内存图建立之后:

setup_arch------------->e820_end_of_ram_pfn;

/*
 * Excerpt from setup_arch():
 * partially used pages are not usable - thus
 * we are rounding upwards:
 */
max_pfn = e820_end_of_ram_pfn();

/*
 * Find the highest end page frame number among the e820 entries of the
 * given type, capped at limit_pfn.
 *
 * Entries starting at or beyond limit_pfn are skipped.  As soon as an
 * entry extends past limit_pfn the scan stops with limit_pfn as the
 * result.  The result is finally clamped to MAX_ARCH_PFN and logged.
 *
 * Returns the resulting page frame number (0 if no entry qualified).
 */
static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
{
	unsigned long arch_cap = MAX_ARCH_PFN;
	unsigned long highest = 0;
	int i;

	/* Walk every entry of the memory map. */
	for (i = 0; i < e820.nr_map; i++) {
		struct e820entry *entry = &e820.map[i];
		unsigned long first_pfn;
		unsigned long stop_pfn;

		if (entry->type != type)
			continue;

		first_pfn = entry->addr >> PAGE_SHIFT;
		stop_pfn = (entry->addr + entry->size) >> PAGE_SHIFT;

		/* Entirely at or above the limit: ignore this entry. */
		if (first_pfn >= limit_pfn)
			continue;

		/* Crosses the limit: the limit itself is the answer. */
		if (stop_pfn > limit_pfn) {
			highest = limit_pfn;
			break;
		}

		/* Ends higher than anything seen so far: remember it. */
		if (highest < stop_pfn)
			highest = stop_pfn;
	}

	/* Never report more than the architecture maximum. */
	if (highest > arch_cap)
		highest = arch_cap;

	printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n",
			 highest, arch_cap);
	return highest;
}

/*
 * Return the highest page frame number of E820_RAM memory, limited to
 * MAX_ARCH_PFN.
 */
unsigned long __init e820_end_of_ram_pfn(void)
{
	return e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
}


/* Span from PAGE_OFFSET up to VMALLOC_END, minus the vmalloc reserve. */
#define MAXMEM    (VMALLOC_END - PAGE_OFFSET - __VMALLOC_RESERVE)  

其中__VMALLOC_RESERVE为128M,上图说明了第4GB的内存划分。

可知:MAXMEM是一个略小于896M的值(896M-8K-4M-4M),它既是低端内存的上限,也是高端内存的起始地址。


setup_arch()-->find_low_pfn_range().该函数用来划分低端内存和高端内存的界限,确定高端内存的起始地址

/* Excerpt from setup_arch(): max_low_pfn gets updated here */
find_low_pfn_range();

/*
 * Determine low and high memory ranges.
 *
 * If max_pfn exceeds MAXMEM_PFN the highmem setup path is taken,
 * otherwise all of memory is handled by the lowmem path.
 */
void __init find_low_pfn_range(void)
{
	/* it could update max_pfn */

	if (max_pfn > MAXMEM_PFN)
		highmem_pfn_init();
	else
		lowmem_pfn_init();
}
/*
 * We have more RAM than fits into lowmem - we try to put it into
 * highmem, also taking the highmem=x boot parameter into account.
 *
 * Sets max_low_pfn to MAXMEM_PFN and reconciles highmem_pages with
 * max_pfn; depending on the kernel configuration max_pfn may be
 * trimmed further.
 */
void __init highmem_pfn_init(void)
{
	/* The lowmem/highmem boundary sits at MAXMEM_PFN. */
	max_low_pfn = MAXMEM_PFN;

	/* -1 is treated as "no highmem size given": use all pages above lowmem. */
	if (highmem_pages == -1)
		highmem_pages = max_pfn - MAXMEM_PFN;

	if (highmem_pages + MAXMEM_PFN < max_pfn) {
		/* Less highmem requested than present: lower max_pfn to match. */
		max_pfn = MAXMEM_PFN + highmem_pages;
	} else if (highmem_pages + MAXMEM_PFN > max_pfn) {
		/* More highmem requested than present: warn and use none. */
		printk(KERN_WARNING MSG_HIGHMEM_TOO_SMALL,
			pages_to_mb(max_pfn - MAXMEM_PFN),
			pages_to_mb(highmem_pages));
		highmem_pages = 0;
	}

#ifdef CONFIG_HIGHMEM
#ifndef CONFIG_HIGHMEM64G
	/* Without HIGHMEM64G, max_pfn is capped at MAX_NONPAE_PFN. */
	if (max_pfn > MAX_NONPAE_PFN) {
		max_pfn = MAX_NONPAE_PFN;
		printk(KERN_WARNING MSG_HIGHMEM_TRIMMED);
	}
#endif /* !CONFIG_HIGHMEM64G */
#else /* !CONFIG_HIGHMEM */
	/* Maximum memory usable is what is directly addressable */
	printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20);
	if (max_pfn > MAX_NONPAE_PFN)
		printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
	else
		printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
	max_pfn = MAXMEM_PFN;
#endif /* CONFIG_HIGHMEM */
}
当实际内存小于等于896M(即max_pfn <= MAXMEM_PFN)时,调用lowmem_pfn_init():
/*
 * All of RAM fits into lowmem: start with max_low_pfn = max_pfn and,
 * on a CONFIG_HIGHMEM kernel, carve a requested highmem size out of the
 * top of it.
 */
void __init lowmem_pfn_init(void)
{
	/* max_low_pfn is 0, we already have early_res support */
	max_low_pfn = max_pfn;

	/* -1 is treated as "no highmem size given": default to none. */
	if (highmem_pages == -1)
		highmem_pages = 0;

#ifndef CONFIG_HIGHMEM
	if (highmem_pages)
		printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
#else
	/* A highmem size covering all of memory (or more) cannot be honoured. */
	if (highmem_pages >= max_pfn) {
		printk(KERN_ERR MSG_HIGHMEM_TOO_BIG,
			pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
		highmem_pages = 0;
	}

	if (!highmem_pages)
		return;

	/* Lowmem must keep at least 64MB after highmem is taken out. */
	if (max_low_pfn - highmem_pages < 64*1024*1024/PAGE_SIZE) {
		printk(KERN_ERR MSG_LOWMEM_TOO_SMALL,
			pages_to_mb(highmem_pages));
		highmem_pages = 0;
	}

	/* Move the lowmem/highmem boundary down by the highmem size. */
	max_low_pfn -= highmem_pages;
#endif
}

当实际的物理内存大于896M,由highmem_pfn_init()进行分配
/*
 * Split memory into lowmem and highmem when max_pfn exceeds MAXMEM_PFN:
 * max_low_pfn becomes MAXMEM_PFN, and highmem_pages and max_pfn are
 * reconciled with each other and with the kernel configuration.
 */
void __init highmem_pfn_init(void)
{
	/* Boundary between low and high memory. */
	max_low_pfn = MAXMEM_PFN;

	/* No highmem page count set (-1): default to everything above lowmem. */
	if (highmem_pages == -1)
		highmem_pages = max_pfn - MAXMEM_PFN;

	/* Requested highmem ends below max_pfn: lower max_pfn to that sum. */
	if (max_pfn > highmem_pages + MAXMEM_PFN)
		max_pfn = MAXMEM_PFN + highmem_pages;

	/* Requested highmem exceeds what is present: warn and use none. */
	if (max_pfn < highmem_pages + MAXMEM_PFN) {
		printk(KERN_WARNING MSG_HIGHMEM_TOO_SMALL,
			pages_to_mb(max_pfn - MAXMEM_PFN),
			pages_to_mb(highmem_pages));
		highmem_pages = 0;
	}

#if !defined(CONFIG_HIGHMEM)
	/* Maximum memory usable is what is directly addressable */
	printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20);
	if (max_pfn > MAX_NONPAE_PFN)
		printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
	else
		printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
	max_pfn = MAXMEM_PFN;
#elif !defined(CONFIG_HIGHMEM64G)
	/* HIGHMEM without HIGHMEM64G: max_pfn is capped at MAX_NONPAE_PFN. */
	if (max_pfn > MAX_NONPAE_PFN) {
		max_pfn = MAX_NONPAE_PFN;
		printk(KERN_WARNING MSG_HIGHMEM_TRIMMED);
	}
#endif
}




评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值