Linux memory management initialization, broadly speaking, proceeds in three stages:
- memblock/bootmem allocator initialization
- buddy system initialization
- slab/slub allocator initialization
Why three stages? In the earliest phase of boot, neither the buddy system nor the slab allocator is initialized yet, but the kernel already needs some way to allocate physical memory. Linux originally provided the bootmem allocator for this purpose; memblock later replaced bootmem, though the two expose essentially the same API.
How does Linux allocate one or more pages? For allocations of 2^order pages, Linux provides the buddy allocator. Its core idea: the kernel divides memory into zones, and each zone maintains an array of free lists, free_area[MAX_ORDER], holding blocks of 1, 2, 4, 8, …, 2^(MAX_ORDER-1) pages on the corresponding lists. On allocation, the kernel first tries the list for the requested order; if that list is empty, it takes a block from the next higher order, splits it into two halves, which are each other's "buddies", hands one half to the caller, and links the other into the appropriate free list, recursing as needed.
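To make the order arithmetic concrete, here is a minimal user-space sketch (illustrative, not kernel code; it assumes PAGE_SHIFT = 12 as on x86) of how a request size maps to a buddy order:
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* Round a byte count up to whole pages, then up to the next
 * power-of-two page count; the exponent is the buddy "order". */
static unsigned int order_for_bytes(unsigned long bytes)
{
	unsigned long pages = (bytes + PAGE_SIZE - 1) >> PAGE_SHIFT;
	unsigned int order = 0;

	while ((1UL << order) < pages)
		order++;
	return order;
}

int main(void)
{
	/* 3 pages round up to 4 = 2^2, so order 2; if the order-2 list
	 * were empty, the allocator would split an order-3 block into
	 * two order-2 buddies and hand one out. */
	printf("order for 3 pages: %u\n", order_for_bytes(3 * PAGE_SIZE));
	return 0;
}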
The buddy system can only hand out 2^order pages. To allocate arbitrary byte counts without wasting memory, Linux provides the slab and slub allocators. The slab allocator carves pages into pools of fixed-size objects: when an object is needed it is taken from the pool, and when it is no longer needed its memory is returned to the pool.
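From the consumer's point of view the slab API looks like the sketch below (struct foo and the cache name are made up for illustration; kmem_cache_create()/kmem_cache_alloc()/kmem_cache_free() are the real interfaces):
struct foo {
	int a;
	long b;
};

static struct kmem_cache *foo_cache;	/* hypothetical cache */

static int __init foo_init(void)
{
	/* a pool of sizeof(struct foo)-sized objects, default alignment,
	 * no flags, no constructor */
	foo_cache = kmem_cache_create("foo_cache", sizeof(struct foo),
				      0, 0, NULL);
	return foo_cache ? 0 : -ENOMEM;
}

static void foo_use(void)
{
	struct foo *obj = kmem_cache_alloc(foo_cache, GFP_KERNEL);	/* take an object from the pool */

	if (obj)
		kmem_cache_free(foo_cache, obj);	/* return it to the pool */
}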
Below we walk step by step from probing the physical memory size through initializing each allocator.
Physical memory detection
On x86, physical memory is detected via BIOS interrupt INT 15h, which returns the base address, length, type, and other attributes of each usable memory range.
The detection entry point is detect_memory() in arch/x86/boot/memory.c:
int detect_memory(void)
{
	int err = -1;

	if (detect_memory_e820() > 0)
		err = 0;

	if (!detect_memory_e801())
		err = 0;

	if (!detect_memory_88())
		err = 0;

	return err;
}
The real work happens in detect_memory_e820(), which loops over INT 15h calls and stores each returned memory segment in boot_params.e820.map:
static int detect_memory_e820(void)
{
	int count = 0;
	struct biosregs ireg, oreg;
	struct e820entry *desc = boot_params.e820_map;
	static struct e820entry buf; /* static so it is zeroed */

	initregs(&ireg);
	ireg.ax  = 0xe820;
	ireg.cx  = sizeof buf;
	ireg.edx = SMAP;
	ireg.di  = (size_t)&buf;

	/*
	 * Note: at least one BIOS is known which assumes that the
	 * buffer pointed to by one e820 call is the same one as
	 * the previous call, and only changes modified fields.  Therefore,
	 * we use a temporary buffer and copy the results entry by entry.
	 *
	 * This routine deliberately does not try to account for
	 * ACPI 3+ extended attributes.  This is because there are
	 * BIOSes in the field which report zero for the valid bit for
	 * all ranges, and we don't currently make any use of the
	 * other attribute bits.  Revisit this if we see the extended
	 * attribute bits deployed in a meaningful way in the future.
	 */

	do {
		intcall(0x15, &ireg, &oreg);
		ireg.ebx = oreg.ebx; /* for next iteration... */

		/* BIOSes which terminate the chain with CF = 1 as opposed
		   to %ebx = 0 don't always report the SMAP signature on
		   the final, failing, probe. */
		if (oreg.eflags & X86_EFLAGS_CF)
			break;

		/* Some BIOSes stop returning SMAP in the middle of
		   the search loop.  We don't know exactly how the BIOS
		   screwed up the map at that point, we might have a
		   partial map, the full map, or complete garbage, so
		   just return failure. */
		if (oreg.eax != SMAP) {
			count = 0;
			break;
		}

		*desc++ = buf;
		count++;
	} while (ireg.ebx && count < ARRAY_SIZE(boot_params.e820_map));

	return boot_params.e820_entries = count;
}
A few data structures used here:
#define E820MAX	128	/* number of entries in E820MAP */

struct e820entry {
	__u64 addr;	/* start of memory segment */
	__u64 size;	/* size of memory segment */
	__u32 type;	/* type of memory segment */
} __attribute__((packed));

struct boot_params {
	…
	struct e820entry e820_map[E820MAX];	/* 0x2d0 */
	…
};
From here on we assume the usable memory ranges have been obtained and stored in the boot_params.e820_map[] array. Exactly how the address ranges are retrieved from the BIOS is discussed in a separate article.
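For orientation, here is a sketch (illustrative user-space style, not actual boot code) of walking the table that detect_memory_e820() filled in:
int i;

for (i = 0; i < boot_params.e820_entries; i++) {
	struct e820entry *e = &boot_params.e820_map[i];

	/* each entry is one BIOS-reported range: [addr, addr + size) */
	printf("mem %#llx-%#llx type %u\n",
	       (unsigned long long)e->addr,
	       (unsigned long long)(e->addr + e->size - 1), e->type);
}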
The call stack from detect_memory through the subsequent memblock initialization looks like this:
[arch/x86/boot/header.S: call main]->main()->detect_memory()->go_to_protected_mode()->[arch/x86/boot/pmjump.S:protected_mode_jump:code32_start]->[arch/x86/boot/header.S:code32_start: 0x100000]->…->
[arch/x86/kernel/head_64.S:startup_64:x86_64_start_kernel()]->x86_64_start_kernel()->x86_64_start_reservations()->start_kernel()->setup_arch()
The full hand-off from assembly to C, including the x86 transition from real mode to protected mode, will be covered separately.
How the bootloader loads the kernel into memory will also be covered separately.
Back to the main topic: the key memory-management-related functions.
In start_kernel()->setup_arch():
void __init setup_arch(char **cmdline_p)
{
	…
	setup_memory_map();
	…
	max_pfn = e820_end_of_ram_pfn();	/* find the highest page frame number */
	…
	find_low_pfn_range();	/* set the boundary between low and high memory */
}
Before looking at setup_memory_map() itself, note the relevant initialization of x86_init:
struct x86_init_ops x86_init __initdata = {
	.resources = {
		.probe_roms		= probe_roms,
		.reserve_resources	= reserve_standard_io_resources,
		.memory_setup		= default_machine_specific_memory_setup,
	},
	…
};
void __init setup_memory_map(void)
{
	char *who;

	who = x86_init.resources.memory_setup();	/* in effect calls default_machine_specific_memory_setup() */
	memcpy(&e820_saved, &e820, sizeof(struct e820map));	/* keep a copy of the e820 map */
	printk(KERN_INFO "e820: BIOS-provided physical RAM map:\n");
	e820_print_map(who);
}
char *__init default_machine_specific_memory_setup(void)
{
	char *who = "BIOS-e820";
	u32 new_nr;
	/*
	 * Try to copy the BIOS-supplied E820-map.
	 *
	 * Otherwise fake a memory map; one section from 0k->640k,
	 * the next section from 1mb->appropriate_mem_k
	 */
	new_nr = boot_params.e820_entries;
	/* remove overlaps between memory ranges */
	sanitize_e820_map(boot_params.e820_map,
			  ARRAY_SIZE(boot_params.e820_map),
			  &new_nr);
	boot_params.e820_entries = new_nr;
	/* copy the memory layout from boot_params.e820_map into the global e820 */
	if (append_e820_map(boot_params.e820_map, boot_params.e820_entries)
	    < 0) {
		u64 mem_size;

		/* compare results from other methods and take the greater */
		if (boot_params.alt_mem_k
		    < boot_params.screen_info.ext_mem_k) {
			mem_size = boot_params.screen_info.ext_mem_k;
			who = "BIOS-88";
		} else {
			mem_size = boot_params.alt_mem_k;
			who = "BIOS-e801";
		}

		e820.nr_map = 0;
		e820_add_region(0, LOWMEMSIZE(), E820_RAM);
		e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
	}

	/* In case someone cares... */
	return who;
}
As can be seen, this function does two things: it removes overlaps between memory segments, and it copies the layout from boot_params.e820_map into e820. Each range ultimately lands in e820's map array via e820_add_region(), whose worker is __e820_add_region():
static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
				     int type)
{
	int x = e820x->nr_map;

	if (x >= ARRAY_SIZE(e820x->map)) {
		printk(KERN_ERR "e820: too many entries; ignoring [mem %#010llx-%#010llx]\n",
		       (unsigned long long) start,
		       (unsigned long long) (start + size - 1));
		return;
	}

	e820x->map[x].addr = start;
	e820x->map[x].size = size;
	e820x->map[x].type = type;
	e820x->nr_map++;
}
Next, e820_end_of_ram_pfn(), which walks the page frames of every RAM segment to find the highest page frame number:
unsigned long __init e820_end_of_ram_pfn(void)
{
	return e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
}
It delegates to e820_end_pfn(), which finds the highest physical page frame number below limit_pfn:
static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
{
	int i;
	unsigned long last_pfn = 0;
	unsigned long max_arch_pfn = MAX_ARCH_PFN;

	for (i = 0; i < e820.nr_map; i++) {
		struct e820entry *ei = &e820.map[i];
		unsigned long start_pfn;
		unsigned long end_pfn;

		if (ei->type != type)
			continue;

		start_pfn = ei->addr >> PAGE_SHIFT;
		end_pfn = (ei->addr + ei->size) >> PAGE_SHIFT;

		if (start_pfn >= limit_pfn)
			continue;
		if (end_pfn > limit_pfn) {
			last_pfn = limit_pfn;
			break;
		}

		if (end_pfn > last_pfn)
			last_pfn = end_pfn;
	}

	if (last_pfn > max_arch_pfn)
		last_pfn = max_arch_pfn;

	printk(KERN_INFO "e820: last_pfn = %#lx max_arch_pfn = %#lx\n",
	       last_pfn, max_arch_pfn);
	return last_pfn;
}
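For example, an E820_RAM entry covering [0x100000, 0x40000000) yields start_pfn = 0x100000 >> 12 = 0x100 and end_pfn = 0x40000000 >> 12 = 0x40000; if no RAM entry lies higher, last_pfn, and hence max_pfn, ends up 0x40000, i.e. 1 GiB.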
Next, back in setup_arch():
setup_arch()
{
	…
	max_pfn = e820_end_of_ram_pfn();
	…
#ifdef CONFIG_X86_32
	/* max_low_pfn get updated here */
	find_low_pfn_range();
#else
	check_x2apic();

	/* How many end-of-memory variables you have, grandma! */
	/* need this before calling reserve_initrd */
	if (max_pfn > (1UL<<(32 - PAGE_SHIFT)))
		max_low_pfn = e820_end_of_low_ram_pfn();	/* top of low memory: 4 GiB when RAM exceeds 4 GiB, otherwise max_pfn */
	else
		max_low_pfn = max_pfn;

	high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;	/* start of high memory */
#endif
}
Thus, on 64-bit systems the boundary between low and high memory is 4 GiB; when physical memory is smaller than 4 GiB the two coincide, and both end at max_pfn.
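Concretely, with PAGE_SHIFT = 12 the test value 1UL << (32 - PAGE_SHIFT) is 0x100000 page frames, and pfn 0x100000 corresponds to 0x100000 * 4 KiB = 0x100000000 = 4 GiB, so the check max_pfn > (1UL << (32 - PAGE_SHIFT)) asks exactly: is there RAM above 4 GiB?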
memblock initialization
With physical memory detection complete, the next step is initializing the memblock allocator.
The main calls are:
void setup_arch()
{
	…
	memblock_set_current_limit(ISA_END_ADDRESS);	/* set memblock.current_limit to 1 MiB */
	memblock_x86_fill();	/* the main memblock initialization */
	…
	init_mem_mapping();
	…
	memblock_set_current_limit(get_max_mapped());
	…
	initmem_init();
}
memblock debugging: add memblock=debug to the kernel command line (e.g. via grub) and memblock initialization details will be printed to dmesg.
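For example, on a grub2-based distribution (file path assumed; adjust for your setup), append the option in /etc/default/grub and regenerate the grub config:
GRUB_CMDLINE_LINUX_DEFAULT="quiet memblock=debug"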
memblock_x86_fill() walks the memory layout saved in e820 and adds each usable range to memblock via memblock_add():
void __init memblock_x86_fill(void)
{
	int i;
	u64 end;

	/*
	 * EFI may have more than 128 entries
	 * We are safe to enable resizing, because memblock_x86_fill()
	 * is rather late for x86
	 */
	memblock_allow_resize();

	for (i = 0; i < e820.nr_map; i++) {
		struct e820entry *ei = &e820.map[i];

		end = ei->addr + ei->size;
		if (end != (resource_size_t)end)
			continue;

		if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
			continue;

		memblock_add(ei->addr, ei->size);
	}

	/* throw away partial pages */
	memblock_trim_memory(PAGE_SIZE);

	memblock_dump_all();
}
Now to the heart of the matter: memblock itself, implemented mainly in mm/memblock.c.
The kernel keeps a single global instance, struct memblock memblock, that holds all memblock state:
struct memblock {
	bool bottom_up;			/* is bottom up direction? */
	phys_addr_t current_limit;	/* allocation limit */
	struct memblock_type memory;
	struct memblock_type reserved;
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
	struct memblock_type physmem;
#endif
};
bottom_up: whether memblock allocates bottom-up or top-down; defaults to false, i.e. top-down.
current_limit: the upper bound for allocations.
memory: describes the available memory: the number of regions, their total size, and each individual region.
reserved: describes the memory already handed out: region count, total size, and each region; any range recorded here can no longer be allocated by memblock.
In short, memory covers all regions regardless of allocation state, while reserved covers the regions memblock has already allocated.
struct memblock_type {
	unsigned long cnt;	/* number of regions */
	unsigned long max;	/* size of the allocated array */
	phys_addr_t total_size;	/* size of all regions */
	struct memblock_region *regions;
};
cnt: number of regions
max: capacity of the regions array
total_size: total size of all regions
regions: array recording each region's information
struct memblock_region {
	phys_addr_t base;
	phys_addr_t size;
	unsigned long flags;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
	int nid;
#endif
};
base: region base address
size: region size
flags: region flags
Static initialization of the memblock instance:
static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS] __initdata_memblock;
#endif

struct memblock memblock __initdata_memblock = {
	.memory.regions		= memblock_memory_init_regions,
	.memory.cnt		= 1,	/* empty dummy entry */
	.memory.max		= INIT_MEMBLOCK_REGIONS,

	.reserved.regions	= memblock_reserved_init_regions,
	.reserved.cnt		= 1,	/* empty dummy entry */
	.reserved.max		= INIT_MEMBLOCK_REGIONS,

#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
	.physmem.regions	= memblock_physmem_init_regions,
	.physmem.cnt		= 1,	/* empty dummy entry */
	.physmem.max		= INIT_PHYSMEM_REGIONS,
#endif

	.bottom_up		= false,
	.current_limit		= MEMBLOCK_ALLOC_ANYWHERE,
};
.memory.regions and .reserved.regions initially point at fixed static arrays, each holding at most INIT_MEMBLOCK_REGIONS (128) regions;
.current_limit is initialized to MEMBLOCK_ALLOC_ANYWHERE, the highest possible physical address.
memblock_add() records a usable memory range in memblock:
int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
{
	return memblock_add_range(&memblock.memory, base, size,
				  MAX_NUMNODES, 0);
}
/*
 * memblock_add_range - add a new region to a memblock type
 * @type: memblock type to add the new region to
 * @base: base address of the new region
 * @size: size of the new region
 * @nid: node id of the new region
 * @flags: flags of the new region
 */
int __init_memblock memblock_add_range(struct memblock_type *type,
				       phys_addr_t base, phys_addr_t size,
				       int nid, unsigned long flags)
{
	bool insert = false;
	phys_addr_t obase = base;
	phys_addr_t end = base + memblock_cap_size(base, &size);	/* cap size so base + size cannot overflow */
	int i, nr_new;

	if (!size)
		return 0;

	/* special case for empty array */
	if (type->regions[0].size == 0) {	/* the very first add takes this path */
		WARN_ON(type->cnt != 1 || type->total_size);
		type->regions[0].base = base;
		type->regions[0].size = size;
		type->regions[0].flags = flags;
		memblock_set_region_node(&type->regions[0], nid);
		type->total_size = size;
		return 0;
	}
repeat:
	/*
	 * The following is executed twice.  Once with %false @insert and
	 * then with %true.  The first counts the number of regions needed
	 * to accommodate the new area.  The second actually inserts them.
	 */
	base = obase;
	nr_new = 0;

	for (i = 0; i < type->cnt; i++) {
		struct memblock_region *rgn = &type->regions[i];
		phys_addr_t rbase = rgn->base;
		phys_addr_t rend = rbase + rgn->size;

		if (rbase >= end)
			break;
		if (rend <= base)
			continue;
		/*
		 * @rgn overlaps.  If it separates the lower part of new
		 * area, insert that portion.
		 */
		if (rbase > base) {
			nr_new++;
			if (insert)
				memblock_insert_region(type, i++, base,
						       rbase - base, nid,
						       flags);
		}
		/* area below @rend is dealt with, forget about it */
		base = min(rend, end);
	}

	/* insert the remaining portion */
	if (base < end) {
		nr_new++;
		if (insert)
			memblock_insert_region(type, i, base, end - base,
					       nid, flags);
	}

	/*
	 * If this was the first round, resize array and repeat for actual
	 * insertions; otherwise, merge and return.
	 */
	if (!insert) {	/* first round: grow the array if needed, then repeat */
		while (type->cnt + nr_new > type->max)
			if (memblock_double_array(type, obase, size) < 0)
				return -ENOMEM;
		insert = true;
		goto repeat;
	} else {
		memblock_merge_regions(type);	/* second round: merge adjacent regions */
		return 0;
	}
}
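A worked example with made-up addresses: suppose memory.regions already holds a single region [0x100000..0x300000), and we then call:
memblock_add(0x200000, 0x200000);	/* i.e. add the range [0x200000..0x400000) */
In the first pass (insert == false) the loop consumes the part of the new range already covered by the existing region, leaving one uncovered piece [0x300000..0x400000), so nr_new = 1. Since cnt + nr_new fits within max, the second pass inserts that piece, and memblock_merge_regions() then fuses the two adjacent regions into a single [0x100000..0x400000).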
At this point, every usable range recorded in e820 has been added to memblock.
One note for what follows: 64-bit kernels map all physical memory linearly, so a direct-map kernel virtual address is simply the physical address plus the constant offset PAGE_OFFSET; this is what phys_to_virt() computes.
The memblock allocation API
void * __init memblock_virt_alloc_try_nid_nopanic(
				phys_addr_t size, phys_addr_t align,
				phys_addr_t min_addr, phys_addr_t max_addr,
				int nid)
{
	memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n",
		     __func__, (u64)size, (u64)align, nid, (u64)min_addr,
		     (u64)max_addr, (void *)_RET_IP_);
	return memblock_virt_alloc_internal(size, align, min_addr,
					    max_addr, nid);
}
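A sketch of a typical call site (the variable name and exact arguments are illustrative; MEMBLOCK_ALLOC_ACCESSIBLE and NUMA_NO_NODE are the usual "no constraint" values for max_addr and nid):
void *table = memblock_virt_alloc_try_nid_nopanic(
		PAGE_SIZE,			/* size: one page */
		PAGE_SIZE,			/* align: page aligned */
		0,				/* min_addr: no lower bound */
		MEMBLOCK_ALLOC_ACCESSIBLE,	/* max_addr: honour current_limit */
		NUMA_NO_NODE);			/* nid: no node preference */

if (!table)
	pr_warn("early table allocation failed\n");	/* the nopanic variant returns NULL instead of panicking */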
Let's now step into memblock_virt_alloc_try_nid_nopanic() and see how the kernel allocates memory from memblock this early in boot.
First, the parameters:
size: number of bytes to allocate
align: required alignment, in bytes
min_addr: lower bound; the allocation may not start below this address
max_addr: upper bound; the allocation may not extend above this address
nid: NUMA node the memory should come from
The flow: first check whether the slab allocator is already up (in which case allocation is delegated to it); otherwise allocate from memblock, trying the requested node first and falling back to other nodes if that fails. The search within a node is done by memblock_find_in_range_node().
static void * __init memblock_virt_alloc_internal(
				phys_addr_t size, phys_addr_t align,
				phys_addr_t min_addr, phys_addr_t max_addr,
				int nid)
{
	phys_addr_t alloc;
	void *ptr;

	if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
		nid = NUMA_NO_NODE;

	/*
	 * Detect any accidental use of these APIs after slab is ready, as at
	 * this moment memblock may be deinitialized already and its
	 * internal data may be destroyed (after execution of free_all_bootmem)
	 */
	if (WARN_ON_ONCE(slab_is_available()))	/* once slab is up, allocate from slab instead */
		return kzalloc_node(size, GFP_NOWAIT, nid);

	if (!align)
		align = SMP_CACHE_BYTES;	/* default to L1 cache line alignment */

	if (max_addr > memblock.current_limit)
		max_addr = memblock.current_limit;	/* never allocate above the configured current_limit */

again:
	alloc = memblock_find_in_range_node(size, align, min_addr, max_addr,
					    nid);	/* find an unallocated range in memblock */
	if (alloc)	/* found a usable address; otherwise try other nodes */
		goto done;

	if (nid != NUMA_NO_NODE) {	/* fall back to any node */
		alloc = memblock_find_in_range_node(size, align, min_addr,
						    max_addr, NUMA_NO_NODE);
		if (alloc)
			goto done;
	}

	if (min_addr) {	/* last resort: drop the lower bound and retry */
		min_addr = 0;
		goto again;
	} else {
		goto error;
	}

done:
	memblock_reserve(alloc, size);	/* record the allocation in the reserved regions */
	ptr = phys_to_virt(alloc);	/* physical address to direct-map virtual address */
	memset(ptr, 0, size);		/* hand back zeroed memory */

	/*
	 * The min_count is set to 0 so that bootmem allocated blocks
	 * are never reported as leaks. This is because many of these blocks
	 * are only referred via the physical address which is not
	 * looked up by kmemleak.
	 */
	kmemleak_alloc(ptr, size, 0, 0);

	return ptr;

error:
	return NULL;
}
So how is a free range found within a node?
phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
					phys_addr_t align, phys_addr_t start,
					phys_addr_t end, int nid)
{
	phys_addr_t kernel_end, ret;

	/* pump up @end */
	if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
		end = memblock.current_limit;

	/* avoid allocating the first page */
	start = max_t(phys_addr_t, start, PAGE_SIZE);
	end = max(start, end);
	kernel_end = __pa_symbol(_end);	/* _end marks the end of the kernel image */

	/*
	 * try bottom-up allocation only when bottom-up mode
	 * is set and @end is above the kernel image.
	 */
	if (memblock_bottom_up() && end > kernel_end) {	/* x86 allocates top-down, so this branch is skipped */
		phys_addr_t bottom_up_start;

		/* make sure we will allocate above the kernel */
		bottom_up_start = max(start, kernel_end);

		/* ok, try bottom-up allocation first */
		ret = __memblock_find_range_bottom_up(bottom_up_start, end,
						      size, align, nid);
		if (ret)
			return ret;

		/*
		 * we always limit bottom-up allocation above the kernel,
		 * but top-down allocation doesn't have the limit, so
		 * retrying top-down allocation may succeed when bottom-up
		 * allocation failed.
		 *
		 * bottom-up allocation is expected to fail very rarely,
		 * so we use WARN_ONCE() here to see the stack trace if
		 * fail happens.
		 */
		WARN_ONCE(1, "memblock: bottom-up allocation failed, "
			     "memory hotunplug may be affected\n");
	}

	return __memblock_find_range_top_down(start, end, size, align, nid);
}
x86 uses top-down allocation, which proceeds as follows:
static phys_addr_t __init_memblock
__memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
			       phys_addr_t size, phys_addr_t align, int nid)
{
	phys_addr_t this_start, this_end, cand;
	u64 i;

	/*
	 * This is the core of the memblock allocator: "allocating" means
	 * finding, among the recorded memory regions, a range that is not
	 * covered by any memblock.reserved region.  Each candidate free
	 * range is returned in [this_start, this_end).
	 */
	for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) {
		this_start = clamp(this_start, start, end);	/* clip the free range to [start, end) */
		this_end = clamp(this_end, start, end);

		if (this_end < size)
			continue;

		cand = round_down(this_end - size, align);
		if (cand >= this_start)	/* the aligned candidate must still lie inside the free range */
			return cand;
	}

	return 0;
}
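Worked example with made-up numbers: for a free range [this_start..this_end) = [0x1000..0x9000), size = 0x2800 and align = 0x1000, cand = round_down(0x9000 - 0x2800, 0x1000) = round_down(0x6800, 0x1000) = 0x6000; since 0x6000 >= 0x1000 the search returns 0x6000, i.e. the block is carved from the top of the highest suitable free range.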
Here is the crux: finding a free region is done by the for_each_free_mem_range_reverse macro, which walks every region in memblock.memory and yields the pieces not covered by memblock.reserved (which, remember, records the ranges already handed out):
#define for_each_free_mem_range_reverse(i, nid, p_start, p_end, p_nid)	\
	for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved,	\
			       nid, p_start, p_end, p_nid)
for_each_mem_range_rev is defined as follows; p_start, p_end, and p_nid receive each found range's start address, end address, and node id:
#define for_each_mem_range_rev(i, type_a, type_b, nid,			\
			       p_start, p_end, p_nid)			\
	for (i = (u64)ULLONG_MAX,					\
		     __next_mem_range_rev(&i, nid, type_a, type_b,	\
					  p_start, p_end, p_nid);	\
	     i != (u64)ULLONG_MAX;					\
	     __next_mem_range_rev(&i, nid, type_a, type_b,		\
				  p_start, p_end, p_nid))
The core iterator over memblock regions:
void __init_memblock __next_mem_range_rev(u64 *idx, int nid,
					  struct memblock_type *type_a,
					  struct memblock_type *type_b,
					  phys_addr_t *out_start,
					  phys_addr_t *out_end, int *out_nid)
{
	int idx_a = *idx & 0xffffffff;	/* idx_a and idx_b live in the low and high 32 bits of *idx */
	int idx_b = *idx >> 32;

	if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
		nid = NUMA_NO_NODE;

	if (*idx == (u64)ULLONG_MAX) {	/* first iteration: the loop seeds *idx with ULLONG_MAX */
		idx_a = type_a->cnt - 1;	/* start from the last regions of type_a and type_b */
		idx_b = type_b->cnt;
	}

	for (; idx_a >= 0; idx_a--) {	/* walk type_a's regions, looking for space not covered by type_b */
		struct memblock_region *m = &type_a->regions[idx_a];

		phys_addr_t m_start = m->base;
		phys_addr_t m_end = m->base + m->size;
		int m_nid = memblock_get_region_node(m);

		/* only memory regions are associated with nodes, check it */
		if (nid != NUMA_NO_NODE && nid != m_nid)
			continue;

		/* skip hotpluggable memory regions if needed */
		if (movable_node_is_enabled() && memblock_is_hotpluggable(m))
			continue;

		if (!type_b) {	/* no type_b: return the type_a region as-is */
			if (out_start)
				*out_start = m_start;
			if (out_end)
				*out_end = m_end;
			if (out_nid)
				*out_nid = m_nid;
			idx_a++;
			*idx = (u32)idx_a | (u64)idx_b << 32;
			return;
		}

		/* scan areas before each reservation */
		for (; idx_b >= 0; idx_b--) {
			struct memblock_region *r;
			phys_addr_t r_start;
			phys_addr_t r_end;

			r = &type_b->regions[idx_b];
			/* [r_start, r_end) is the gap below this reservation;
			   its overlap with the type_a region is free to hand out */
			r_start = idx_b ? r[-1].base + r[-1].size : 0;
			r_end = idx_b < type_b->cnt ?
				r->base : ULLONG_MAX;
			/*
			 * if idx_b advanced past idx_a,
			 * break out to advance idx_a
			 */
			if (r_end <= m_start)
				break;
			/* if the two regions intersect, we're done */
			if (m_end > r_start) {
				if (out_start)
					*out_start = max(m_start, r_start);
				if (out_end)
					*out_end = min(m_end, r_end);
				if (out_nid)
					*out_nid = m_nid;
				if (m_start >= r_start)
					idx_a--;
				else
					idx_b--;
				*idx = (u32)idx_a | (u64)idx_b << 32;
				return;
			}
		}
	}

	/* signal end of iteration */
	*idx = ULLONG_MAX;
}
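A worked example with a made-up layout: memory = { [0x0..0x10000) } and reserved = { [0x4000..0x6000) }. Iterating in reverse, the first call yields [0x6000..0x10000) (the gap above the reservation), the second yields [0x0..0x4000) (the gap below it), and the third sets *idx back to ULLONG_MAX, ending the iteration.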
Once for_each_free_mem_range_reverse has produced a free memblock range [p_start, p_end) on node nid, __memblock_find_range_top_down() returns an address within it; that address propagates back through memblock_virt_alloc_internal() to memblock_virt_alloc_try_nid_nopanic() as the result of the allocation.