Linux memory management initialization, broadly speaking, proceeds in three stages:
- memblock/bootmem allocator initialization
- buddy system initialization
- slab/slub allocator initialization
Why three stages? In the earliest phase of boot, neither the buddy system nor the slab allocator is initialized yet, but the kernel already needs some way to allocate physical memory. Linux originally provided the bootmem allocator for this purpose; memblock later replaced bootmem, though the two expose essentially the same API.
How does Linux allocate one or more pages? For allocations of 2^order pages, Linux provides the buddy allocator. Its core idea: the kernel divides memory into zones, and each zone maintains an array of free lists, free_area[MAX_ORDER], holding blocks of 1, 2, 4, 8, …, 2^(MAX_ORDER-1) pages on the corresponding lists. On allocation, the kernel first tries the list for the requested order; if that list is empty, it takes a block from the next higher order, splits it into two halves, which are each other's "buddies", hands one half to the caller, and links the other into the appropriate free list, recursing as needed.
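To make the order arithmetic concrete, here is a minimal user-space sketch (illustrative, not kernel code; it assumes PAGE_SHIFT = 12 as on x86) of how a request size maps to a buddy order:
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* Round a byte count up to whole pages, then up to the next
 * power-of-two page count; the exponent is the buddy "order". */
static unsigned int order_for_bytes(unsigned long bytes)
{
	unsigned long pages = (bytes + PAGE_SIZE - 1) >> PAGE_SHIFT;
	unsigned int order = 0;

	while ((1UL << order) < pages)
		order++;
	return order;
}

int main(void)
{
	/* 3 pages round up to 4 = 2^2, so order 2; if the order-2 list
	 * were empty, the allocator would split an order-3 block into
	 * two order-2 buddies and hand one out. */
	printf("order for 3 pages: %u\n", order_for_bytes(3 * PAGE_SIZE));
	return 0;
}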
The buddy system can only hand out 2^order pages. To allocate arbitrary byte counts without wasting memory, Linux provides the slab and slub allocators. The slab allocator carves pages into pools of fixed-size objects: when an object is needed it is taken from the pool, and when it is no longer needed its memory is returned to the pool.
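From the consumer's point of view the slab API looks like the sketch below (struct foo and the cache name are made up for illustration; kmem_cache_create()/kmem_cache_alloc()/kmem_cache_free() are the real interfaces):
struct foo {
	int a;
	long b;
};

static struct kmem_cache *foo_cache;	/* hypothetical cache */

static int __init foo_init(void)
{
	/* a pool of sizeof(struct foo)-sized objects, default alignment,
	 * no flags, no constructor */
	foo_cache = kmem_cache_create("foo_cache", sizeof(struct foo),
				      0, 0, NULL);
	return foo_cache ? 0 : -ENOMEM;
}

static void foo_use(void)
{
	struct foo *obj = kmem_cache_alloc(foo_cache, GFP_KERNEL);	/* take an object from the pool */

	if (obj)
		kmem_cache_free(foo_cache, obj);	/* return it to the pool */
}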
Below we walk step by step from probing the physical memory size through initializing each allocator.
Physical memory detection
On x86, physical memory is detected via BIOS interrupt INT 15h, which returns the base address, length, type, and other attributes of each usable memory range.
The detection entry point is detect_memory() in arch/x86/boot/memory.c:
int detect_memory(void)
{
	int err = -1;

	if (detect_memory_e820() > 0)
		err = 0;

	if (!detect_memory_e801())
		err = 0;

	if (!detect_memory_88())
		err = 0;

	return err;
}
The real work happens in detect_memory_e820(), which loops over INT 15h calls and stores each returned memory segment in boot_params.e820.map:
static int detect_memory_e820(void)
{
	int count = 0;
	struct biosregs ireg, oreg;
	struct e820entry *desc = boot_params.e820_map;
	static struct e820entry buf; /* static so it is zeroed */

	initregs(&ireg);
	ireg.ax  = 0xe820;
	ireg.cx  = sizeof buf;
	ireg.edx = SMAP;
	ireg.di  = (size_t)&buf;

	/*
	 * Note: at least one BIOS is known which assumes that the
	 * buffer pointed to by one e820 call is the same one as
	 * the previous call, and only changes modified fields.  Therefore,
	 * we use a temporary buffer and copy the results entry by entry.
	 *
	 * This routine deliberately does not try to account for
	 * ACPI 3+ extended attributes.  This is because there are
	 * BIOSes in the field which report zero for the valid bit for
	 * all ranges, and we don't currently make any use of the
	 * other attribute bits.  Revisit this if we see the extended
	 * attribute bits deployed in a meaningful way in the future.
	 */

	do {
		intcall(0x15, &ireg, &oreg);
		ireg.ebx = oreg.ebx; /* for next iteration... */

		/* BIOSes which terminate the chain with CF = 1 as opposed
		   to %ebx = 0 don't always report the SMAP signature on
		   the final, failing, probe. */
		if (oreg.eflags & X86_EFLAGS_CF)
			break;

		/* Some BIOSes stop returning SMAP in the middle of
		   the search loop.  We don't know exactly how the BIOS
		   screwed up the map at that point, we might have a
		   partial map, the full map, or complete garbage, so
		   just return failure. */
		if (oreg.eax != SMAP) {
			count = 0;
			break;
		}

		*desc++ = buf;
		count++;
	} while (ireg.ebx && count < ARRAY_SIZE(boot_params.e820_map));

	return boot_params.e820_entries = count;
}
A few data structures used here:
#define E820MAX	128	/* number of entries in E820MAP */

struct e820entry {
	__u64 addr;	/* start of memory segment */
	__u64 size;	/* size of memory segment */
	__u32 type;	/* type of memory segment */
} __attribute__((packed));

struct boot_params {
	…
	struct e820entry e820_map[E820MAX];	/* 0x2d0 */
	…
};
From here on we assume the usable memory ranges have been obtained and stored in the boot_params.e820_map[] array. Exactly how the address ranges are retrieved from the BIOS is discussed in a separate article.
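For orientation, here is a sketch (illustrative user-space style, not actual boot code) of walking the table that detect_memory_e820() filled in:
int i;

for (i = 0; i < boot_params.e820_entries; i++) {
	struct e820entry *e = &boot_params.e820_map[i];

	/* each entry is one BIOS-reported range: [addr, addr + size) */
	printf("mem %#llx-%#llx type %u\n",
	       (unsigned long long)e->addr,
	       (unsigned long long)(e->addr + e->size - 1), e->type);
}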
The call stack from detect_memory through the subsequent memblock initialization looks like this:
[arch/x86/boot/header.S: call main]->main()->detect_memory()->go_to_protected_mode()->[arch/x86/boot/pmjump.S:protected_mode_jump:code32_start]->[arch/x86/boot/header.S:code32_start: 0x100000]->…->
[arch/x86/kernel/head_64.S:startup_64:x86_64_start_kernel()]->x86_64_start_kernel()->x86_64_start_reservations()->start_kernel()->setup_arch()
The full hand-off from assembly to C, including the x86 transition from real mode to protected mode, will be covered separately.
How the bootloader loads the kernel into memory will also be covered separately.
Back to the main topic: the key memory-management-related functions.
In start_kernel()->setup_arch():
void __init setup_arch(char **cmdline_p)
{
	…
	setup_memory_map();
	…
	max_pfn = e820_end_of_ram_pfn();	/* find the highest page frame number */
	…
	find_low_pfn_range();	/* set the boundary between low and high memory */
}
Before looking at setup_memory_map() itself, note the relevant initialization of x86_init:
struct x86_init_ops x86_init __initdata = {
	.resources = {
		.probe_roms		= probe_roms,
		.reserve_resources	= reserve_standard_io_resources,
		.memory_setup		= default_machine_specific_memory_setup,
	},
	…
};
void __init setup_memory_map(void)
{
	char *who;

	who = x86_init.resources.memory_setup();	/* in effect calls default_machine_specific_memory_setup() */
	memcpy(&e820_saved, &e820, sizeof(struct e820map));	/* keep a copy of the e820 map */
	printk(KERN_INFO "e820: BIOS-provided physical RAM map:\n");
	e820_print_map(who);
}
char *__init default_machine_specific_memory_setup(void)
{
	char *who = "BIOS-e820";
	u32 new_nr;
	/*
	 * Try to copy the BIOS-supplied E820-map.
	 *
	 * Otherwise fake a memory map; one section from 0k->640k,
	 * the next section from 1mb->appropriate_mem_k
	 */
	new_nr = boot_params.e820_entries;
	/* remove overlaps between memory ranges */
	sanitize_e820_map(boot_params.e820_map,
			  ARRAY_SIZE(boot_params.e820_map),
			  &new_nr);
	boot_params.e820_entries = new_nr;
	/* copy the memory layout from boot_params.e820_map into the global e820 */
	if (append_e820_map(boot_params.e820_map, boot_params.e820_entries)
	    < 0) {
		u64 mem_size;

		/* compare results from other methods and take the greater */
		if (boot_params.alt_mem_k
		    < boot_params.screen_info.ext_mem_k) {
			mem_size = boot_params.screen_info.ext_mem_k;
			who = "BIOS-88";
		} else {
			mem_size = boot_params.alt_mem_k;
			who = "BIOS-e801";
		}

		e820.nr_map = 0;
		e820_add_region(0, LOWMEMSIZE(), E820_RAM);
		e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
	}

	/* In case someone cares... */
	return who;
}
As can be seen, this function does two things: it removes overlaps between memory segments, and it copies the layout from boot_params.e820_map into e820. Each range ultimately lands in e820's map array via e820_add_region(), whose worker is __e820_add_region():
static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
				     int type)
{
	int x = e820x->nr_map;

	if (x >= ARRAY_SIZE(e820x->map)) {
		printk(KERN_ERR "e820: too many entries; ignoring [mem %#010llx-%#010llx]\n",
		       (unsigned long long) start,
		       (unsigned long long) (start + size - 1));
		return;
	}

	e820x->map[x].addr = start;
	e820x->map[x].size = size;
	e820x->map[x].type = type;
	e820x->nr_map++;
}
Next, e820_end_of_ram_pfn(), which walks the page frames of every RAM segment to find the highest page frame number:
unsigned long __init e820_end_of_ram_pfn(void)
{
	return e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
}
It delegates to e820_end_pfn(), which finds the highest physical page frame number below limit_pfn:
static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
{
	int i;
	unsigned long last_pfn = 0;
	unsigned long max_arch_pfn = MAX_ARCH_PFN;

	for (i = 0; i < e820.nr_map; i++) {
		struct e820entry *ei = &e820.map[i];
		unsigned long start_pfn;
		unsigned long end_pfn;

		if (ei->type != type)
			continue;

		start_pfn = ei->addr >> PAGE_SHIFT;
		end_pfn = (ei->addr + ei->size) >> PAGE_SHIFT;

		if (start_pfn >= limit_pfn)
			continue;
		if (end_pfn > limit_pfn) {
			last_pfn = limit_pfn;
			break;
		}

		if (end_pfn > last_pfn)
			last_pfn = end_pfn;
	}

	if (last_pfn > max_arch_pfn)
		last_pfn = max_arch_pfn;

	printk(KERN_INFO "e820: last_pfn = %#lx max_arch_pfn = %#lx\n",
	       last_pfn, max_arch_pfn);
	return last_pfn;
}
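For example, an E820_RAM entry covering [0x100000, 0x40000000) yields start_pfn = 0x100000 >> 12 = 0x100 and end_pfn = 0x40000000 >> 12 = 0x40000; if no RAM entry lies higher, last_pfn, and hence max_pfn, ends up 0x40000, i.e. 1 GiB.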
Next, back in setup_arch():
setup_arch()
{
	…
	max_pfn = e820_end_of_ram_pfn();
	…
#ifdef CONFIG_X86_32
	/* max_low_pfn get updated here */
	find_low_pfn_range();
#else
	check_x2apic();

	/* How many end-of-memory variables you have, grandma! */
	/* need this before calling reserve_initrd */
	if (max_pfn > (1UL<<(32 - PAGE_SHIFT)))
		max_low_pfn = e820_end_of_low_ram_pfn();	/* top of low memory: 4 GiB when RAM exceeds 4 GiB, otherwise max_pfn */
	else
		max_low_pfn = max_pfn;

	high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;	/* start of high memory */
#endif
}
Thus, on 64-bit systems the boundary between low and high memory is 4 GiB; when physical memory is smaller than 4 GiB the two coincide, and both end at max_pfn.
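Concretely, with PAGE_SHIFT = 12 the test value 1UL << (32 - PAGE_SHIFT) is 0x100000 page frames, and pfn 0x100000 corresponds to 0x100000 * 4 KiB = 0x100000000 = 4 GiB, so the check max_pfn > (1UL << (32 - PAGE_SHIFT)) asks exactly: is there RAM above 4 GiB?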
memblock initialization
With physical memory detection complete, the next step is initializing the memblock allocator.
The main calls are:
void setup_arch()
{
	…
	memblock_set_current_limit(ISA_END_ADDRESS);	/* set memblock.current_limit to 1 MiB */
	memblock_x86_fill();	/* the main memblock initialization */
	…
	init_mem_mapping();
	…
	memblock_set_current_limit(get_max_mapped());
	…
	initmem_init();
}
memblock debugging: add memblock=debug to the kernel command line (e.g. via grub) and memblock initialization details will be printed to dmesg.
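For example, on a grub2-based distribution (file path assumed; adjust for your setup), append the option in /etc/default/grub and regenerate the grub config:
GRUB_CMDLINE_LINUX_DEFAULT="quiet memblock=debug"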
memblock_x86_fill() walks the memory layout saved in e820 and adds each usable range to memblock via memblock_add():
void __init memblock_x86_fill(void)
{
	int i;
	u64 end;

	/*
	 * EFI may have more than 128 entries
	 * We are safe to enable resizing, because memblock_x86_fill()
	 * is rather late for x86
	 */
	memblock_allow_resize();

	for (i = 0; i < e820.nr_map; i++) {
		struct e820entry *ei = &e820.map[i];

		end = ei->addr + ei->size;
		if (end != (resource_size_t)end)
			continue;

		if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
			continue;

		memblock_add(ei->addr, ei->size);
	}

	/* throw away partial pages */
	memblock_trim_memory(PAGE_SIZE);

	memblock_dump_all();
}
Now to the heart of the matter: memblock itself, implemented mainly in mm/memblock.c.
The kernel keeps a single global instance, struct memblock memblock, that holds all memblock state:
struct memblock {
	bool bottom_up;			/* is bottom up direction? */
	phys_addr_t current_limit;	/* allocation limit */
	struct memblock_type memory;
	struct memblock_type reserved;
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
	struct memblock_type physmem;
#endif
};
bottom_up: whether memblock allocates bottom-up or top-down; defaults to false, i.e. top-down.
current_limit: the upper bound for allocations.
memory: describes the available memory: the number of regions, their total size, and each individual region.
reserved: describes the memory already handed out: region count, total size, and each region; any range recorded here can no longer be allocated by memblock.
In short, memory covers all regions regardless of allocation state, while reserved covers the regions memblock has already allocated.
struct memblock_type {
	unsigned long cnt;	/* number of regions */
	unsigned long max;	/* size of the allocated array */
	phys_addr_t total_size;	/* size of all regions */
	struct memblock_region *regions;
};
cnt: number of regions
max: capacity of the regions array
total_size: total size of all regions
regions: array recording each region's information
struct memblock_region {
	phys_addr_t base;
	phys_addr_t size;
	unsigned long flags;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
	int nid;
#endif
};
base: region base address
size: region size
flags: region flags
Static initialization of the memblock instance:
static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS] __initdata_memblock;
#endif

struct memblock memblock __initdata_memblock = {
	.memory.regions		= memblock_memory_init_regions,
	.memory.cnt		= 1,	/* empty dummy entry */
	.memory.max		= INIT_MEMBLOCK_REGIONS,

	.reserved.regions	= memblock_reserved_init_regions,
	.reserved.cnt		= 1,	/* empty dummy entry */
	.reserved.max		= INIT_MEMBLOCK_REGIONS,

#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
	.physmem.regions	= memblock_physmem_init_regions,
	.physmem.cnt		= 1,	/* empty dummy entry */
	.physmem.max		= INIT_PHYSMEM_REGIONS,
#endif

	.bottom_up		= false,
	.current_limit		= MEMBLOCK_ALLOC_ANYWHERE,
};
.memory.regions and .reserved.regions initially point at fixed static arrays, each holding at most INIT_MEMBLOCK_REGIONS (128) regions;
.current_limit is initialized to MEMBLOCK_ALLOC_ANYWHERE, the highest possible physical address.
memblock_add() records a usable memory range in memblock:
int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
{
	return memblock_add_range(&memblock.memory, base, size,
				  MAX_NUMNODES, 0);
}
/*
 * memblock_add_range - add a new region to a memblock type
 * @type: memblock type to add the new region to
 * @base: base address of the new region
 * @size: size of the new region
 * @nid: node id of the new region
 * @flags: flags of the new region
 */
int __init_memblock memblock_add_range(struct memblock_type *type,
				       phys_addr_t base, phys_addr_t size,
				       int nid, unsigned long flags)
{
	bool insert = false;
	phys_addr_t obase = base;
	phys_addr_t end = base + memblock_cap_size(base, &size);	/* cap size so base + size cannot overflow */
	int i, nr_new;

	if (!size)
		return 0;

	/* special case for empty array */
	if (type->regions[0].size == 0) {	/* the very first add takes this path */
		WARN_ON(type->cnt != 1 || type->total_size);
		type->regions[0].base = base;
		type->regions[0].size = size;
		type->regions[0].flags = flags;
		memblock_set_region_node(&type->regions[0], nid);
		type->total_size = size;
		return 0;
	}
repeat:
	/*
	 * The following is executed twice.  Once with %false @insert and
	 * then with %true.  The first counts the number of regions needed
	 * to accommodate the new area.  The second actually inserts them.
	 */
	base = obase;
	nr_new = 0;

	for (i = 0; i < type->cnt; i++) {
		struct memblock_region *rgn = &type->regions[i];
		phys_addr_t rbase = rgn->base;
		phys_addr_t rend = rbase + rgn->size;

		if (rbase >= end)
			break;
		if (rend <= base)
			continue;
		/*
		 * @rgn overlaps.  If it separates the lower part of new
		 * area, insert that portion.
		 */
		if (rbase > base) {
			nr_new++;
			if (insert)
				memblock_insert_region(type, i++, base,
						       rbase - base, nid,
						       flags);
		}
		/* area below @rend is dealt with, forget about it */
		base = min(rend, end);
	}

	/* insert the remaining portion */
	if (base < end) {
		nr_new++;
		if (insert)
			memblock_insert_region(type, i, base, end - base,
					       nid, flags);
	}

	/*
	 * If this was the first round, resize array and repeat for actual
	 * insertions; otherwise, merge and return.
	 */
	if (!insert) {	/* first round: grow the array if needed, then repeat */
		while (type->cnt + nr_new > type->max)
			if (memblock_double_array(type, obase, size) < 0)
				return -ENOMEM;
		insert = true;
		goto repeat;
	} else {
		memblock_merge_regions(type);	/* second round: merge adjacent regions */
		return 0;
	}
}
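A worked example with made-up addresses: suppose memory.regions already holds a single region [0x100000..0x300000), and we then call:
memblock_add(0x200000, 0x200000);	/* i.e. add the range [0x200000..0x400000) */
In the first pass (insert == false) the loop consumes the part of the new range already covered by the existing region, leaving one uncovered piece [0x300000..0x400000), so nr_new = 1. Since cnt + nr_new fits within max, the second pass inserts that piece, and memblock_merge_regions() then fuses the two adjacent regions into a single [0x100000..0x400000).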
At this point, every usable range recorded in e820 has been added to memblock.
One note for what follows: 64-bit kernels map all physical memory linearly, so a direct-map kernel virtual address is simply the physical address plus the constant offset PAGE_OFFSET; this is what phys_to_virt() computes.
The memblock allocation API
void * __init memblock_virt_alloc_try_nid_nopanic(
				phys_addr_t size, phys_addr_t align,
				phys_addr_t min_addr, phys_addr_t max_addr,
				int nid)
{
	memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n",
		     __func__, (u64)size, (u64)align, nid, (u64)min_addr,
		     (u64)max_addr, (void *)_RET_IP_);
	return memblock_virt_alloc_internal(size, align, min_addr,
					    max_addr, nid);
}
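A sketch of a typical call site (the variable name and exact arguments are illustrative; MEMBLOCK_ALLOC_ACCESSIBLE and NUMA_NO_NODE are the usual "no constraint" values for max_addr and nid):
void *table = memblock_virt_alloc_try_nid_nopanic(
		PAGE_SIZE,			/* size: one page */
		PAGE_SIZE,			/* align: page aligned */
		0,				/* min_addr: no lower bound */
		MEMBLOCK_ALLOC_ACCESSIBLE,	/* max_addr: honour current_limit */
		NUMA_NO_NODE);			/* nid: no node preference */

if (!table)
	pr_warn("early table allocation failed\n");	/* the nopanic variant returns NULL instead of panicking */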
Let's now step into memblock_virt_alloc_try_nid_nopanic() and see how the kernel allocates memory from memblock this early in boot.
First, the parameters:
size: number of bytes to allocate
align: required alignment, in bytes
min_addr: lower bound; the allocation may not start below this address
max_addr: upper bound; the allocation may not extend above this address
nid: NUMA node the memory should come from
The flow: first check whether the slab allocator is already up (in which case allocation is delegated to it); otherwise allocate from memblock, trying the requested node first and falling back to other nodes if that fails. The search within a node is done by memblock_find_in_range_node().
static void * __init memblock_virt_alloc_internal(
				phys_addr_t size, phys_addr_t align,
				phys_addr_t min_addr, phys_addr_t max_addr,
				int nid)
{
	phys_addr_t alloc;
	void *ptr;

	if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
		nid = NUMA_NO_NODE;

	/*
	 * Detect any accidental use of these APIs after slab is ready, as at
	 * this moment memblock may be deinitialized already and its
	 * internal data may be destroyed (after execution of free_all_bootmem)
	 */
	if (WARN_ON_ONCE(slab_is_available()))	/* once slab is up, allocate from slab instead */
		return kzalloc_node(size, GFP_NOWAIT, nid);

	if (!align)
		align = SMP_CACHE_BYTES;	/* default to L1 cache line alignment */

	if (max_addr > memblock.current_limit)
		max_addr = memblock.current_limit;	/* never allocate above the configured current_limit */

again:
	alloc = memblock_find_in_range_node(size, align, min_addr, max_addr,
					    nid);	/* find an unallocated range in memblock */
	if (alloc)	/* found a usable address; otherwise try other nodes */
		goto done;

	if (nid != NUMA_NO_NODE) {	/* fall back to any node */
		alloc = memblock_find_in_range_node(size, align, min_addr,
						    max_addr, NUMA_NO_NODE);
		if (alloc)
			goto done;
	}

	if (min_addr) {	/* last resort: drop the lower bound and retry */
		min_addr = 0;
		goto again;
	} else {
		goto error;
	}

done:
	memblock_reserve(alloc, size);	/* record the allocation in the reserved regions */
	ptr = phys_to_virt(alloc);	/* physical address to direct-map virtual address */
	memset(ptr, 0, size);		/* hand back zeroed memory */

	/*
	 * The min_count is set to 0 so that bootmem allocated blocks
	 * are never reported as leaks. This is because many of these blocks
	 * are only referred via the physical address which is not
	 * looked up by kmemleak.
	 */
	kmemleak_alloc(ptr, size, 0, 0);

	return ptr;

error:
	return NULL;
}
So how is a free range found within a node?
phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
					phys_addr_t align, phys_addr_t start,
					phys_addr_t end, int nid)
{
	phys_addr_t kernel_end, ret;

	/* pump up @end */
	if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
		end = memblock.current_limit;

	/* avoid allocating the first page */
	start = max_t(phys_addr_t, start, PAGE_SIZE);
	end = max(start, end);
	kernel_end = __pa_symbol(_end);	/* _end marks the end of the kernel image */

	/*
	 * try bottom-up allocation only when bottom-up mode
	 * is set and @end is above the kernel image.
	 */
	if (memblock_bottom_up() && end > kernel_end) {	/* x86 allocates top-down, so this branch is skipped */
		phys_addr_t bottom_up_start;

		/* make sure we will allocate above the kernel */
		bottom_up_start = max(start, kernel_end);

		/* ok, try bottom-up allocation first */
		ret = __memblock_find_range_bottom_up(bottom_up_start, end,
						      size, align, nid);
		if (ret)
			return ret;

		/*
		 * we always limit bottom-up allocation above the kernel,
		 * but top-down allocation doesn't have the limit, so
		 * retrying top-down allocation may succeed when bottom-up
		 * allocation failed.
		 *
		 * bottom-up allocation is expected to fail very rarely,
		 * so we use WARN_ONCE() here to see the stack trace if
		 * fail happens.
		 */
		WARN_ONCE(1, "memblock: bottom-up allocation failed, "
			     "memory hotunplug may be affected\n");
	}

	return __memblock_find_range_top_down(start, end, size, align, nid);
}
x86 uses top-down allocation, which proceeds as follows:
static phys_addr_t __init_memblock
__memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
			       phys_addr_t size, phys_addr_t align, int nid)
{
	phys_addr_t this_start, this_end, cand;
	u64 i;

	/*
	 * This is the core of the memblock allocator: "allocating" means
	 * finding, among the recorded memory regions, a range that is not
	 * covered by any memblock.reserved region.  Each candidate free
	 * range is returned in [this_start, this_end).
	 */
	for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) {
		this_start = clamp(this_start, start, end);	/* clip the free range to [start, end) */
		this_end = clamp(this_end, start, end);

		if (this_end < size)
			continue;

		cand = round_down(this_end - size, align);
		if (cand >= this_start)	/* the aligned candidate must still lie inside the free range */
			return cand;
	}

	return 0;
}
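Worked example with made-up numbers: for a free range [this_start..this_end) = [0x1000..0x9000), size = 0x2800 and align = 0x1000, cand = round_down(0x9000 - 0x2800, 0x1000) = round_down(0x6800, 0x1000) = 0x6000; since 0x6000 >= 0x1000 the search returns 0x6000, i.e. the block is carved from the top of the highest suitable free range.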
Here is the crux: finding a free region is done by the for_each_free_mem_range_reverse macro, which walks every region in memblock.memory and yields the pieces not covered by memblock.reserved (which, remember, records the ranges already handed out):
#define for_each_free_mem_range_reverse(i, nid, p_start, p_end, p_nid)	\
	for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved,	\
			       nid, p_start, p_end, p_nid)
for_each_mem_range_rev is defined as follows; p_start, p_end, and p_nid receive each found range's start address, end address, and node id:
#define for_each_mem_range_rev(i, type_a, type_b, nid,			\
			       p_start, p_end, p_nid)			\
	for (i = (u64)ULLONG_MAX,					\
		     __next_mem_range_rev(&i, nid, type_a, type_b,	\
					  p_start, p_end, p_nid);	\
	     i != (u64)ULLONG_MAX;					\
	     __next_mem_range_rev(&i, nid, type_a, type_b,		\
				  p_start, p_end, p_nid))
The core iterator over memblock regions:
void __init_memblock __next_mem_range_rev(u64 *idx, int nid,
					  struct memblock_type *type_a,
					  struct memblock_type *type_b,
					  phys_addr_t *out_start,
					  phys_addr_t *out_end, int *out_nid)
{
	int idx_a = *idx & 0xffffffff;	/* idx_a and idx_b live in the low and high 32 bits of *idx */
	int idx_b = *idx >> 32;

	if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
		nid = NUMA_NO_NODE;

	if (*idx == (u64)ULLONG_MAX) {	/* first iteration: the loop seeds *idx with ULLONG_MAX */
		idx_a = type_a->cnt - 1;	/* start from the last regions of type_a and type_b */
		idx_b = type_b->cnt;
	}

	for (; idx_a >= 0; idx_a--) {	/* walk type_a's regions, looking for space not covered by type_b */
		struct memblock_region *m = &type_a->regions[idx_a];

		phys_addr_t m_start = m->base;
		phys_addr_t m_end = m->base + m->size;
		int m_nid = memblock_get_region_node(m);

		/* only memory regions are associated with nodes, check it */
		if (nid != NUMA_NO_NODE && nid != m_nid)
			continue;

		/* skip hotpluggable memory regions if needed */
		if (movable_node_is_enabled() && memblock_is_hotpluggable(m))
			continue;

		if (!type_b) {	/* no type_b: return the type_a region as-is */
			if (out_start)
				*out_start = m_start;
			if (out_end)
				*out_end = m_end;
			if (out_nid)
				*out_nid = m_nid;
			idx_a++;
			*idx = (u32)idx_a | (u64)idx_b << 32;
			return;
		}

		/* scan areas before each reservation */
		for (; idx_b >= 0; idx_b--) {
			struct memblock_region *r;
			phys_addr_t r_start;
			phys_addr_t r_end;

			r = &type_b->regions[idx_b];
			/* [r_start, r_end) is the gap below this reservation;
			   its overlap with the type_a region is free to hand out */
			r_start = idx_b ? r[-1].base + r[-1].size : 0;
			r_end = idx_b < type_b->cnt ?
				r->base : ULLONG_MAX;
			/*
			 * if idx_b advanced past idx_a,
			 * break out to advance idx_a
			 */
			if (r_end <= m_start)
				break;
			/* if the two regions intersect, we're done */
			if (m_end > r_start) {
				if (out_start)
					*out_start = max(m_start, r_start);
				if (out_end)
					*out_end = min(m_end, r_end);
				if (out_nid)
					*out_nid = m_nid;
				if (m_start >= r_start)
					idx_a--;
				else
					idx_b--;
				*idx = (u32)idx_a | (u64)idx_b << 32;
				return;
			}
		}
	}

	/* signal end of iteration */
	*idx = ULLONG_MAX;
}
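A worked example with a made-up layout: memory = { [0x0..0x10000) } and reserved = { [0x4000..0x6000) }. Iterating in reverse, the first call yields [0x6000..0x10000) (the gap above the reservation), the second yields [0x0..0x4000) (the gap below it), and the third sets *idx back to ULLONG_MAX, ending the iteration.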
Once for_each_free_mem_range_reverse has produced a free memblock range [p_start, p_end) on node nid, __memblock_find_range_top_down() returns an address within it; that address propagates back through memblock_virt_alloc_internal() to memblock_virt_alloc_try_nid_nopanic() as the result of the allocation.