2023.12.3.-9
Memblock is a method of managing memory regions during the early boot period when the usual kernel memory allocators are not up and running.
即在系统初始化的阶段使用。memblock将系统的内存看成是几种不同的连续内存集合:
- memory 描述kernel可以获取的物理内存。
- reserved 描述已经分配了的内存区域。
- physmem 描述在启动阶段所有的可以获取的物理内存,忽略约束和热插拔,只在部分架构上可用。
每种类型的memory通过 struct memblock_type描述,该数据结构里面管理了一个regions数组,里面描述了这种类型的memory的各个region。三种memblock数组的大小是编译时静态指定的。regions数组的size分别被设置成为INIT_MEMBLOCK_MEMORY_REGIONS,INIT_MEMBLOCK_RESERVED_REGIONS和INIT_PHYSMEM_REGIONS,前两个值默认情况下为128,physmem的默认值为4。
/*
* Without CONFIG_NUMA there is a single pglist_data instance,
* contig_page_data, which is defined and exported here.
*/
#ifndef CONFIG_NUMA
struct pglist_data __refdata contig_page_data;
EXPORT_SYMBOL(contig_page_data);
#endif
/*
* Page-frame-number limits. This excerpt only defines them (as globals they
* start out zero); the code that assigns them is not shown here.
*/
unsigned long max_low_pfn;
unsigned long min_low_pfn;
unsigned long max_pfn;
unsigned long long max_possible_pfn;
/*
* Compile-time sized backing arrays for the "memory" and "reserved" region
* tables; the memblock initializer points its .regions members at them.
*/
static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_MEMORY_REGIONS] __initdata_memblock;
static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_RESERVED_REGIONS] __initdata_memblock;
/*
* The physmem backing array exists only with CONFIG_HAVE_MEMBLOCK_PHYS_MAP.
* Unlike the two arrays above it carries no __initdata_memblock annotation.
*/
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS];
#endif
// mm/memblock.c
为了管理这些数组,又定义了一个memblock的管理入口数据结构 struct memblock。从下面的初始化代码可以看到,其主要数据成员是memory和reserved这两种类型的memblock_type,以及指定分配时的搜索顺序(bottom_up,从上到下还是从下到上)和分配地址上限(current_limit)等。
/*
* The global memblock instance. Both region tables start out backed by the
* static init arrays, with .max set to the array capacity and .cnt = 1 for a
* single empty placeholder entry. bottom_up starts as false and the
* allocation limit starts at MEMBLOCK_ALLOC_ANYWHERE.
*/
struct memblock memblock __initdata_memblock = {
.memory.regions = memblock_memory_init_regions,
.memory.cnt = 1, /* empty dummy entry */
.memory.max = INIT_MEMBLOCK_MEMORY_REGIONS,
.memory.name = "memory",
.reserved.regions = memblock_reserved_init_regions,
.reserved.cnt = 1, /* empty dummy entry */
.reserved.max = INIT_MEMBLOCK_RESERVED_REGIONS,
.reserved.name = "reserved",
.bottom_up = false,
.current_limit = MEMBLOCK_ALLOC_ANYWHERE,
};
memblock_type的具体定义为:
/**
* struct memblock_type - collection of memory regions of certain type
* @cnt: number of regions
* @max: size of the allocated array
* @total_size: size of all regions
* @regions: array of regions
* @name: the memory type symbolic name
*
* One instance describes one class of regions; the memblock initializer
* sets up two of them, named "memory" and "reserved".
*/
struct memblock_type {
unsigned long cnt; /* entries of @regions currently counted */
unsigned long max; /* capacity of the @regions array */
phys_addr_t total_size; /* combined size of all regions */
struct memblock_region *regions; /* backing array, not embedded in the struct */
char *name; /* symbolic name, e.g. "memory" */
};
region通过 struct memblock_region表示,定义了范围(base和size)、属性(flags)以及node id(仅在配置了CONFIG_NUMA时存在)。
/**
* struct memblock_region - represents a memory region
* @base: base address of the region
* @size: size of the region
* @flags: memory region attributes
* @nid: NUMA node id
*
* A region is the pair (@base, @size) plus its attribute flags. The @nid
* member is only present when CONFIG_NUMA is enabled.
*/
struct memblock_region {
phys_addr_t base; /* start of the region */
phys_addr_t size; /* length of the region */
enum memblock_flags flags; /* attribute flags of the region */
#ifdef CONFIG_NUMA
int nid; /* compiled in only with CONFIG_NUMA */
#endif
};
// include/linux/memblock.h
在架构初始化阶段,应该通过memblock_add()或者memblock_add_node()函数,告诉memblock分配器物理内存的布局。memblock分配器被设置好后,可以通过:
- memblock_phys_alloc* 系列函数申请内存,返回的是物理地址。
- memblock_alloc* 系列函数申请内存,返回的是虚拟地址。
这两个系列的函数最终会调用memblock_alloc_internal或者memblock_alloc_range_nid(memblock_alloc_internal内部也是先调用memblock_alloc_range_nid得到物理地址,再通过phys_to_virt转换为虚拟地址)。
/**
* memblock_alloc_internal - allocate boot memory block
* @size: size of memory block to be allocated in bytes
* @align: alignment of the region and block's size
* @min_addr: the lower bound of the memory region to allocate (phys address)
* @max_addr: the upper bound of the memory region to allocate (phys address)
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
* @exact_nid: control the allocation fall back to other nodes
*
* Allocates memory block using memblock_alloc_range_nid() and
* converts the returned physical address to virtual.
*
* The @min_addr limit is dropped if it can not be satisfied and the allocation
* will fall back to memory below @min_addr. Other constraints, such
* as node and mirrored memory will be handled again in
* memblock_alloc_range_nid().
*
* If the slab allocator is already available, a one-time warning is raised
* and the request is passed to kzalloc_node() instead of memblock.
*
* Return:
* Virtual address of allocated memory block on success, NULL on failure.
*/
static void * __init memblock_alloc_internal(
phys_addr_t size, phys_addr_t align,
phys_addr_t min_addr, phys_addr_t max_addr,
int nid, bool exact_nid)
{
phys_addr_t alloc;
/*
* Detect any accidental use of these APIs after slab is ready, as at
* this moment memblock may be deinitialized already and its
* internal data may be destroyed (after execution of memblock_free_all)
*/
if (WARN_ON_ONCE(slab_is_available()))
return kzalloc_node(size, GFP_NOWAIT, nid);
/* Never search above the current memblock allocation limit. */
if (max_addr > memblock.current_limit)
max_addr = memblock.current_limit;
alloc = memblock_alloc_range_nid(size, align, min_addr, max_addr, nid,
exact_nid);
/* retry allocation without lower limit */
if (!alloc && min_addr)
alloc = memblock_alloc_range_nid(size, align, 0, max_addr, nid,
exact_nid);
/* A physical address of 0 is the failure value of the call above. */
if (!alloc)
return NULL;
/* Callers of this helper get a virtual address, not the physical one. */
return phys_to_virt(alloc);
}
该函数进一步调用:
- memblock_alloc_range_nid()
  - memblock_find_in_range_node()
    - __memblock_find_range_bottom_up()
    - __memblock_find_range_top_down()
  - memblock_reserve()
memblock_alloc_range_nid()函数从memory类型的regions数组中找到一块合适大小的region,然后检查这个region是否会与reserved类型中的regions重叠,如果不重叠就满足要求。
/**
* memblock_alloc_range_nid - allocate boot memory block
* @size: size of memory block to be allocated in bytes
* @align: alignment of the region and block's size
* @start: the lower bound of the memory region to allocate (phys address)
* @end: the upper bound of the memory region to allocate (phys address)
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
* @exact_nid: control the allocation fall back to other nodes
*
* The allocation is performed from memory region limited by
* memblock.current_limit if @end == %MEMBLOCK_ALLOC_ACCESSIBLE.
*
* If the specified node can not hold the requested memory and @exact_nid
* is false, the allocation falls back to any node in the system.
*
* For systems with memory mirroring, the allocation is attempted first
* from the regions with mirroring enabled and then retried from any
* memory region.
*
* In addition, function using kmemleak_alloc_phys for allocated boot
* memory block, it is never reported as leaks.
*
* Return:
* Physical address of allocated memory block on success, %0 on failure.
*/
phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
phys_addr_t align, phys_addr_t start,
phys_addr_t end, int nid,
bool exact_nid)
{
/* Initial attribute filter; it may include MEMBLOCK_MIRROR (see the retry below). */
enum memblock_flags flags = choose_memblock_flags();
phys_addr_t found;
/* Callers still passing MAX_NUMNODES are warned once and treated as NUMA_NO_NODE. */
if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
nid = NUMA_NO_NODE;
if (!align) {
/* Can't use WARNs this early in boot on powerpc */
dump_stack();
align = SMP_CACHE_BYTES;
}
again:
/*
* First attempt: search with the requested nid. Success needs both a
* non-zero address and a memblock_reserve() call that returns 0.
*/
found = memblock_find_in_range_node(size, align, start, end, nid,
flags);
if (found && !memblock_reserve(found, size))
goto done;
/*
* Second attempt: any node, but only when a specific node was requested
* and the caller allows falling back.
*/
if (nid != NUMA_NO_NODE && !exact_nid) {
found = memblock_find_in_range_node(size, align, start,
end, NUMA_NO_NODE,
flags);
if (found && !memblock_reserve(found, size))
goto done;
}
/*
* Last resort: drop the mirror requirement and rerun both attempts.
* The flag is cleared first, so this retry happens at most once.
*/
if (flags & MEMBLOCK_MIRROR) {
flags &= ~MEMBLOCK_MIRROR;
pr_warn_ratelimited("Could not allocate %pap bytes of mirrored memory\n",
&size);
goto again;
}
return 0; /* every attempt failed */
done:
/*
* Skip kmemleak for those places like kasan_init() and
* early_pgtable_alloc() due to high volume.
*/
if (end != MEMBLOCK_ALLOC_NOLEAKTRACE)
/*
* Memblock allocated blocks are never reported as
* leaks. This is because many of these blocks are
* only referred via the physical address which is
* not looked up by kmemleak.
*/
kmemleak_alloc_phys(found, size, 0);
return found;
}
从memory类型的regions中查找满足要求的region分为两种方式,bottom_up和top_down.
/**
* memblock_find_in_range_node - find free area in given range and node
* @size: size of free area to find
* @align: alignment of free area to find
* @start: start of candidate range
* @end: end of candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or
* %MEMBLOCK_ALLOC_ACCESSIBLE
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
* @flags: pick from blocks based on memory attributes
*
* Find @size free area aligned to @align in the specified range and node.
* The search window is normalised first (sentinel @end values replaced,
* @start raised to at least PAGE_SIZE) and the actual search is delegated
* to the bottom-up or top-down helper depending on memblock_bottom_up().
*
* Return:
* Found address on success, 0 on failure.
*/
static phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
phys_addr_t align, phys_addr_t start,
phys_addr_t end, int nid,
enum memblock_flags flags)
{
/* pump up @end */
/* Both sentinel values stand for "up to memblock.current_limit". */
if (end == MEMBLOCK_ALLOC_ACCESSIBLE ||
end == MEMBLOCK_ALLOC_NOLEAKTRACE)
end = memblock.current_limit;
/* avoid allocating the first page */
start = max_t(phys_addr_t, start, PAGE_SIZE);
/* Keep end >= start so the window handed to the helpers is never inverted. */
end = max(start, end);
if (memblock_bottom_up())
return __memblock_find_range_bottom_up(start, end, size, align,
nid, flags);
else
return __memblock_find_range_top_down(start, end, size, align,
nid, flags);
}
以bottom_up方式进行说明,可以看见寻找合适的region的过程,就是遍历memblock中的空闲区间(属于memory且不属于reserved的部分)。这些空闲区间通过宏for_each_free_mem_range逐个获得。
/**
* __memblock_find_range_bottom_up - find free area utility in bottom-up
* @start: start of candidate range
* @end: end of candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or
* %MEMBLOCK_ALLOC_ACCESSIBLE
* @size: size of free area to find
* @align: alignment of free area to find
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
* @flags: pick from blocks based on memory attributes
*
* Utility called from memblock_find_in_range_node(), find free area bottom-up.
* The first free range that can hold @size bytes at an @align-aligned
* address inside [@start, @end] wins; nothing is reserved here.
*
* Return:
* Found address on success, 0 on failure.
*/
static phys_addr_t __init_memblock
__memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
phys_addr_t size, phys_addr_t align, int nid,
enum memblock_flags flags)
{
phys_addr_t this_start, this_end, cand;
u64 i;
for_each_free_mem_range(i, nid, flags, &this_start, &this_end, NULL) {
/* Trim the free range to the caller's [start, end] window. */
this_start = clamp(this_start, start, end);
this_end = clamp(this_end, start, end);
/* Lowest aligned address at or above the trimmed start. */
cand = round_up(this_start, align);
/* cand < this_end is tested first, guarding the subtraction that follows. */
if (cand < this_end && this_end - cand >= size)
return cand;
}
/* No free range was large enough. */
return 0;
}
for_each_free_mem_range宏的实现为:
/**
* for_each_free_mem_range - iterate through free memblock areas
* @i: u64 used as loop variable
* @nid: node selector, %NUMA_NO_NODE for all nodes
* @flags: pick from blocks based on memory attributes
* @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
* @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
* @p_nid: ptr to int for nid of the range, can be %NULL
*
* Walks over free (memory && !reserved) areas of memblock. Available as
* soon as memblock is initialized.
*
* Implemented as __for_each_mem_range() with &memblock.memory as the first
* region type and &memblock.reserved as the second one.
*/
#define for_each_free_mem_range(i, nid, flags, p_start, p_end, p_nid) \
__for_each_mem_range(i, &memblock.memory, &memblock.reserved, \
nid, flags, p_start, p_end, p_nid)
//include/linux/memblock.h
最后,当系统boot到合适的阶段时,会调用mem_init()将memblock管理的region全部释放给buddy分配器。管理memblock的数据结构也将会被释放掉。
memblock总结
- memblock是物理分配器还是虚拟分配器?
答:memblock管理的核心逻辑是基于物理内存的,从这个角度来看是物理分配器。但是分配的返回地址可以是物理地址也可以是虚拟地址。虚拟地址是通过phys_to_virt函数由物理地址转换得到的。因此可以看出memblock工作在直接映射区上,即通过__va()宏得到虚拟地址。
- memblock的生命周期是怎样的?
答:memblock存在于Linux启动初期,这个阶段buddy还未工作。memblock的数据结构在编译时就静态确定了,然后在基于架构的初始化阶段通过memblock_add()进一步获得真实的物理内存布局信息。最后,在启动buddy之后,将所有管理的region释放给buddy管理。
- memblock是如何管理内存区域的?
答:基于数组进行管理,建立了memblock_memory_init_regions,memblock_reserved_init_regions,memblock_physmem_init_regions数组,描述三种不同类型的内存区域。分配和释放都通过在数组中遍历进行。