解析设备树中的内存信息过程分析
有个疑问,引导内存分配器怎么知道内存的大小和物理地址范围?
回答:ARM64 架构使用扁平设备树(Flattened Device Tree,FDT)描述板卡的硬件信息,硬件信息写在设备树源文件(DTS)中,并被编译成设备树二进制文件(DTB)。设备启动时,引导程序把设备树二进制文件从存储设备读到内存中,引导内核的时候把设备树二进制文件的起始地址传给内核,内核解析设备树二进制文件后得到硬件信息。举个例子:
/*
 * Example memory node. device_type = "memory" is the property value that
 * early_init_dt_scan_memory() matches on.
 * NOTE(review): assuming the parent node uses #address-cells = <2> and
 * #size-cells = <2> (not shown in this snippet), reg decodes to
 * base 0x0_80000000 and size 0x2_00000000 (8 GiB) - confirm.
 */
memory@00{
device_type = "memory";
reg = <0x0 0x80000000 0x2 0x00000000>;
};
而预留内存在设备树中有两种写法:
第一种:
/* Form 1: a /memreserve/ entry given as <address> <length>, here 0x10000 bytes starting at 0x80000000. */
/memreserve/ 0x80000000 0x10000;
第二种:
/*
 * Form 2: a reserved-memory node with one child per reserved region.
 * With #address-cells = <2> and #size-cells = <2> as declared here, the
 * child's reg decodes to base 0x0_b0100000 and size 0x0_19900000.
 */
reserved-memory {
#address-cells = <0x2>;
#size-cells = <0x2>;
ranges;
reserved0: reserved@0 {
/* NOTE(review): no-map presumably tells the OS not to map this region - confirm against the reserved-memory binding. */
no-map;
reg = <0x0 0xb0100000 0x0 0x19900000>;
};
};
Linux启动从init/main.c文件的start_kernel函数开始,然后调用setup_arch函数(位于arch/arm64/kernel/setup.c文件中)检测处理器类型,初始化处理器和内存,其中的arm64_memblock_init函数(位于arch/arm64/mm/init.c文件中)就是arm64架构的memblock初始化流程。在arm64_memblock_init之前调用的setup_machine_fdt函数,就是它解析设备树的内存信息的。
/*
 * arm64 architecture-specific setup, run once during early boot.
 *
 * Fills in init_mm's code/data/brk bounds from the _text, _etext, _edata
 * and _end symbols, points *cmdline_p at boot_command_line, scans the
 * flattened device tree, and then performs the remaining early
 * initialisation calls in the order written below.
 *
 * @cmdline_p: output; receives the address of boot_command_line.
 */
void __init setup_arch(char **cmdline_p)
{
init_mm.start_code = (unsigned long) _text;
init_mm.end_code = (unsigned long) _etext;
init_mm.end_data = (unsigned long) _edata;
init_mm.brk = (unsigned long) _end;
*cmdline_p = boot_command_line;
early_fixmap_init();
early_ioremap_init();
setup_machine_fdt(__fdt_pointer);// map and scan the device tree blob; memory nodes are parsed in here
parse_early_param();
/*
* Unmask asynchronous aborts and fiq after bringing up possible
* earlycon. (Report possible System Errors once we can report this
* occurred).
*/
local_daif_restore(DAIF_PROCCTX_NOIRQ);
/*
* TTBR0 is only used for the identity mapping at this stage. Make it
* point to zero page to avoid speculatively fetching new entries.
*/
cpu_uninstall_idmap();
xen_early_init();
efi_init();
arm64_memblock_init();// initialise the arm64 boot-time memory allocator (memblock)
paging_init();
acpi_table_upgrade();
/* Parse the ACPI tables for possible boot-time configuration */
acpi_boot_table_init();
/* The device tree is only unflattened when ACPI is disabled. */
if (acpi_disabled)
unflatten_device_tree();
bootmem_init();
kasan_init();
request_standard_resources();
early_ioremap_reset();
/* PSCI is initialised from the device tree or from ACPI, matching the choice above. */
if (acpi_disabled)
psci_dt_init();
else
psci_acpi_init();
cpu_read_bootcpu_ops();
smp_init_cpus();
smp_build_mpidr_hash();
#ifdef CONFIG_ARM64_SW_TTBR0_PAN
/*
* Make sure init_thread_info.ttbr0 always generates translation
* faults in case uaccess_enable() is inadvertently called by the init
* thread.
*/
init_task.thread_info.ttbr0 = __pa_symbol(empty_zero_page);
#endif
#ifdef CONFIG_VT
#if defined(CONFIG_VGA_CONSOLE)
conswitchp = &vga_con;
#elif defined(CONFIG_DUMMY_CONSOLE)
conswitchp = &dummy_con;
#endif
#endif
/* Complain if any of boot_args[1..3] is non-zero; the message below names them x1-x3. */
if (boot_args[1] || boot_args[2] || boot_args[3]) {
pr_err("WARNING: x1-x3 nonzero in violation of boot protocol:\n"
"\tx1: %016llx\n\tx2: %016llx\n\tx3: %016llx\n"
"This indicates a broken bootloader or old kernel\n",
boot_args[1], boot_args[2], boot_args[3]);
}
}
我们看看setup_machine_fdt是如何解析fdt的:
/*
 * Map the device tree blob found at physical address @dt_phys and run the
 * early device tree scan on it.
 *
 * If the blob cannot be mapped or the early scan rejects it, a critical
 * message is printed and the CPU spins forever. Otherwise the machine name,
 * when one is available, is logged and installed as the dump_stack arch
 * description.
 */
static void __init setup_machine_fdt(phys_addr_t dt_phys)
{
	const char *machine;
	/* Virtual address at which the blob at dt_phys is accessible. */
	void *fdt = fixmap_remap_fdt(dt_phys);

	if (!fdt || !early_init_dt_scan(fdt)) {
		pr_crit("\n"
			"Error: invalid device tree blob at physical address %pa (virtual address 0x%p)\n"
			"The dtb must be 8-byte aligned and must not exceed 2 MB in size\n"
			"\nPlease check your bootloader.",
			&dt_phys, fdt);

		/* No usable device tree: stop here. */
		for (;;)
			cpu_relax();
	}

	machine = of_flat_dt_get_machine_name();
	if (machine) {
		pr_info("Machine model: %s\n", machine);
		dump_stack_set_arch_desc("%s (DT)", machine);
	}
}
setup_machine_fdt首先通过fixmap_remap_fdt函数根据设备树的物理地址获取到对应的虚拟地址,然后调用early_init_dt_scan进行早期的设备树扫描,最后通过of_flat_dt_get_machine_name获取根节点的name(即机器名称)并打印。只有early_init_dt_scan需要深入研究一下:
/*
 * Verify the device tree blob at @params and, if it is valid, scan its
 * nodes for early boot information.
 *
 * Returns false when verification fails (no nodes are scanned in that
 * case), true otherwise.
 */
bool __init early_init_dt_scan(void *params)
{
	if (!early_init_dt_verify(params))
		return false;

	early_init_dt_scan_nodes();
	return true;
}
/*
 * Validate the device tree blob at @params and record it as the boot blob.
 *
 * Returns false if @params is NULL or fdt_check_header() reports a
 * problem. On success, initial_boot_params is pointed at the blob,
 * of_fdt_crc32 is set to the big-endian CRC32 of the whole blob
 * (fdt_totalsize() bytes), and true is returned.
 */
bool __init early_init_dt_verify(void *params)
{
	if (!params || fdt_check_header(params))
		return false;

	/* Publish the flat device tree pointer for later scans. */
	initial_boot_params = params;
	of_fdt_crc32 = crc32_be(~0, params, fdt_totalsize(params));

	return true;
}
early_init_dt_scan主要是调用early_init_dt_verify函数校验设备树二进制文件,然后调用early_init_dt_scan_nodes,进行设备树二进制解析:
/*
 * Run the three early scanning passes over the flat device tree, in order:
 * /chosen, then the root node's cell sizes, then the memory nodes.
 * The return value of each pass is ignored.
 */
void __init early_init_dt_scan_nodes(void)
{
	const struct {
		int (*scan)(unsigned long node, const char *uname,
			    int depth, void *data);
		void *data;
	} passes[] = {
		/* /chosen: the command line is written into boot_command_line */
		{ early_init_dt_scan_chosen, boot_command_line },
		/* root node: initialise {size,address}-cells info */
		{ early_init_dt_scan_root, NULL },
		/* memory nodes: regions go to early_init_dt_add_memory_arch() */
		{ early_init_dt_scan_memory, NULL },
	};
	unsigned int i;

	for (i = 0; i < sizeof(passes) / sizeof(passes[0]); i++)
		of_scan_flat_dt(passes[i].scan, passes[i].data);
}
early_init_dt_scan_nodes三次调用of_scan_flat_dt函数,我们看看of_scan_flat_dt到底干了啥:
/*
 * Walk the nodes of the flat device tree and call @it on each one.
 *
 * @it:   callback invoked with the node offset, the node name, the node
 *        depth and @data; a non-zero return value stops the walk.
 * @data: opaque pointer handed to @it unchanged.
 *
 * Returns 0 if there is no blob (initial_boot_params is NULL), otherwise
 * the value returned by the last callback invocation (0 if the callback
 * was never called).
 */
int __init of_scan_flat_dt(int (*it)(unsigned long node,
				     const char *uname, int depth,
				     void *data),
			   void *data)
{
	const void *fdt = initial_boot_params;
	int depth = -1;
	int ret = 0;
	int node;

	if (!fdt)
		return 0;

	node = fdt_next_node(fdt, -1, &depth);
	while (node >= 0 && depth >= 0 && !ret) {
		const char *uname = fdt_get_name(fdt, node, NULL);

		/* A name that starts with '/' is reduced to its last component. */
		if (*uname == '/')
			uname = kbasename(uname);

		ret = it(node, uname, depth, data);
		node = fdt_next_node(fdt, node, &depth);
	}

	return ret;
}
of_scan_flat_dt扫描FDT,获取dtb的基地址,遍历dtb的所有节点,获取给定节点的名字,调用传入的回调函数。
early_init_dt_scan_chosen、early_init_dt_scan_root、early_init_dt_scan_memory就是三个回调函数:
/*
 * of_scan_flat_dt() callback for the /chosen node.
 *
 * Nodes that are not at depth 1, or are not named "chosen" / "chosen@0",
 * are skipped (return 0), as is every node when @data is NULL. For the
 * chosen node, the initrd location is checked and the "bootargs" property
 * is copied into the buffer at @data, bounded by COMMAND_LINE_SIZE; the
 * CONFIG_CMDLINE* options may then extend, override or default it.
 *
 * Returns 1 once the chosen node has been handled, which ends the scan.
 */
int __init early_init_dt_scan_chosen(unsigned long node, const char *uname,
				     int depth, void *data)
{
	char *cmdline = data;
	const char *bootargs;
	int len;

	pr_debug("search \"chosen\", depth: %d, uname: %s\n", depth, uname);

	if (depth != 1 || !data)
		return 0;
	if (strcmp(uname, "chosen") != 0 && strcmp(uname, "chosen@0") != 0)
		return 0;

	early_init_dt_check_for_initrd(node);

	/* Copy bootargs into the caller's command line buffer. */
	bootargs = of_get_flat_dt_prop(node, "bootargs", &len);
	if (bootargs && len > 0)
		strlcpy(cmdline, bootargs, min((int)len, COMMAND_LINE_SIZE));

	/*
	 * CONFIG_CMDLINE is the fallback when nothing else supplied a
	 * command line; with CONFIG_CMDLINE_FORCE it replaces whatever was
	 * found above, and with CONFIG_CMDLINE_EXTEND it is appended.
	 */
#ifdef CONFIG_CMDLINE
#if defined(CONFIG_CMDLINE_EXTEND)
	strlcat(cmdline, " ", COMMAND_LINE_SIZE);
	strlcat(cmdline, CONFIG_CMDLINE, COMMAND_LINE_SIZE);
#elif defined(CONFIG_CMDLINE_FORCE)
	strlcpy(cmdline, CONFIG_CMDLINE, COMMAND_LINE_SIZE);
#else
	/* Boot loader passed no arguments: use the built-in command line. */
	if (!cmdline[0])
		strlcpy(cmdline, CONFIG_CMDLINE, COMMAND_LINE_SIZE);
#endif
#endif /* CONFIG_CMDLINE */

	pr_debug("Command line is: %s\n", cmdline);

	/* Non-zero stops the scan. */
	return 1;
}
early_init_dt_scan_chosen主要是确定了是chosen节点,解析initrd位置和bootargs。我们看下一个回调函数early_init_dt_scan_root:
/*
 * of_scan_flat_dt() callback for the root node (depth 0).
 *
 * Sets dt_root_size_cells and dt_root_addr_cells to their defaults, then
 * overrides each one with the root node's "#size-cells" /
 * "#address-cells" property when that property exists.
 *
 * Returns 0 for non-root nodes and 1 for the root node, which ends the
 * scan.
 */
int __init early_init_dt_scan_root(unsigned long node, const char *uname,
				   int depth, void *data)
{
	const __be32 *size_cells;
	const __be32 *addr_cells;

	if (depth)
		return 0;

	/* Start from the defaults in case the properties are absent. */
	dt_root_size_cells = OF_ROOT_NODE_SIZE_CELLS_DEFAULT;
	dt_root_addr_cells = OF_ROOT_NODE_ADDR_CELLS_DEFAULT;

	size_cells = of_get_flat_dt_prop(node, "#size-cells", NULL);
	if (size_cells != NULL)
		dt_root_size_cells = be32_to_cpup(size_cells);
	pr_debug("dt_root_size_cells = %x\n", dt_root_size_cells);

	addr_cells = of_get_flat_dt_prop(node, "#address-cells", NULL);
	if (addr_cells != NULL)
		dt_root_addr_cells = be32_to_cpup(addr_cells);
	pr_debug("dt_root_addr_cells = %x\n", dt_root_addr_cells);

	/* Non-zero stops the scan. */
	return 1;
}
early_init_dt_scan_root也很简单,就是获取根节点的#size-cells和#address-cells信息,并且赋值给dt_root_size_cells和dt_root_addr_cells而已。最后看看early_init_dt_scan_memory:
/*
 * of_scan_flat_dt() callback that registers the RAM described by "memory"
 * nodes.
 *
 * A node is processed only if its "device_type" property equals "memory".
 * The region list is taken from "linux,usable-memory" when present,
 * otherwise from "reg". Each (base, size) entry is decoded using
 * dt_root_addr_cells / dt_root_size_cells and passed to
 * early_init_dt_add_memory_arch(); if the node has a "hotpluggable"
 * property, each region is additionally marked through
 * early_init_dt_mark_hotplug_memory_arch().
 *
 * Always returns 0, so the scan carries on to the remaining nodes.
 */
int __init early_init_dt_scan_memory(unsigned long node, const char *uname,
				     int depth, void *data)
{
	const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
	const __be32 *reg, *endp;
	int l;
	bool hotpluggable;

	/* We are scanning "memory" nodes only */
	if (type == NULL || strcmp(type, "memory") != 0)
		return 0;

	/* Prefer "linux,usable-memory"; fall back to "reg". */
	reg = of_get_flat_dt_prop(node, "linux,usable-memory", &l);
	if (reg == NULL)
		reg = of_get_flat_dt_prop(node, "reg", &l);
	if (reg == NULL)
		return 0;

	/* l is the property length in bytes; endp is one past the last cell. */
	endp = reg + (l / sizeof(__be32));
	hotpluggable = of_get_flat_dt_prop(node, "hotpluggable", NULL);

	pr_debug("memory scan node %s, reg size %d,\n", uname, l);

	/* Consume one (address, size) tuple per iteration. */
	while ((endp - reg) >= (dt_root_addr_cells + dt_root_size_cells)) {
		u64 base, size;

		/*
		 * dt_mem_next_cell() is handed &reg so that the cursor moves
		 * past the cells just read.
		 */
		base = dt_mem_next_cell(dt_root_addr_cells, &reg);
		size = dt_mem_next_cell(dt_root_size_cells, &reg);

		if (size == 0)
			continue;
		pr_debug(" - %llx , %llx\n", (unsigned long long)base,
			 (unsigned long long)size);

		/* Hand the region to the boot-time allocator. */
		early_init_dt_add_memory_arch(base, size);

		if (!hotpluggable)
			continue;

		if (early_init_dt_mark_hotplug_memory_arch(base, size))
			pr_warn("failed to mark hotplug range 0x%llx - 0x%llx\n",
				base, base + size);
	}

	return 0;
}
到这里,我们的设备树的内存节点信息已经解析完毕,并且添加到memblock中了。我们再继续看看early_init_dt_add_memory_arch是怎么添加到memblock中的:
/*
 * Default (weak) hook that adds one device tree memory region to memblock.
 *
 * The region [base, base + size) is first trimmed to whole pages, then
 * clipped to the window [MIN_MEMBLOCK_ADDR, MAX_MEMBLOCK_ADDR]. Regions
 * that are too small to survive page alignment, or that lie entirely
 * outside the window, are dropped with a warning; partially outside
 * regions are truncated with a warning. What remains is passed to
 * memblock_add().
 *
 * @base: start address of the region.
 * @size: length of the region in bytes.
 */
void __init __weak early_init_dt_add_memory_arch(u64 base, u64 size)
{
	const u64 phys_offset = MIN_MEMBLOCK_ADDR;

	/* Round an unaligned start up to the next page boundary. */
	if (!PAGE_ALIGNED(base)) {
		if (size < PAGE_SIZE - (base & ~PAGE_MASK)) {
			pr_warn("Ignoring memory block 0x%llx - 0x%llx\n",
				base, base + size);
			return;
		}
		size -= PAGE_SIZE - (base & ~PAGE_MASK);
		base = PAGE_ALIGN(base);
	}
	/* Drop any trailing partial page. */
	size &= PAGE_MASK;

	/* Entirely above the addressable window. */
	if (base > MAX_MEMBLOCK_ADDR) {
		pr_warn("Ignoring memory block 0x%llx - 0x%llx\n",
			base, base + size);
		return;
	}

	/* Tail sticks out above the window: cut it off. */
	if (base + size - 1 > MAX_MEMBLOCK_ADDR) {
		pr_warn("Ignoring memory range 0x%llx - 0x%llx\n",
			((u64)MAX_MEMBLOCK_ADDR) + 1, base + size);
		size = MAX_MEMBLOCK_ADDR - base + 1;
	}

	/* Entirely below the window. */
	if (base + size < phys_offset) {
		pr_warn("Ignoring memory block 0x%llx - 0x%llx\n",
			base, base + size);
		return;
	}

	/* Head lies below the window: cut it off. */
	if (base < phys_offset) {
		pr_warn("Ignoring memory range 0x%llx - 0x%llx\n",
			base, phys_offset);
		size -= phys_offset - base;
		base = phys_offset;
	}

	memblock_add(base, size);
}
很简单,early_init_dt_add_memory_arch主要是对base和size进行各种校验,最后调用memblock_add加入到memblock中。memblock_add之前已经讲过了,这里就不再赘述了。不过我对early_init_dt_mark_hotplug_memory_arch是怎么设置hotplug标志位有点感兴趣,可以一起看看:
/*
 * Default (weak) hook for marking a device tree memory region as
 * hotpluggable. Forwards @base and @size unchanged to
 * memblock_mark_hotplug() and returns its result.
 */
int __init __weak early_init_dt_mark_hotplug_memory_arch(u64 base, u64 size)
{
return memblock_mark_hotplug(base, size);
}
/*
 * Set the MEMBLOCK_HOTPLUG flag on the range described by @base and @size.
 * Calls memblock_setclr_flag() with set = 1 and returns its result.
 */
int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
{
return memblock_setclr_flag(base, size, 1, MEMBLOCK_HOTPLUG);
}
/*
 * Set or clear @flag on the memblock.memory regions covering the range
 * described by @base and @size.
 *
 * @set: non-zero to set @flag, zero to clear it.
 *
 * The range is first isolated with memblock_isolate_range(), which yields
 * the region indices [start_rgn, end_rgn); the flag is then applied to
 * each of those regions and the region array is merged again.
 *
 * Returns 0 on success, or the non-zero result of
 * memblock_isolate_range() if isolation fails.
 */
static int __init_memblock memblock_setclr_flag(phys_addr_t base,
				phys_addr_t size, int set, int flag)
{
	struct memblock_type *type = &memblock.memory;
	int start_rgn, end_rgn;
	int idx;
	int err;

	err = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn);
	if (err)
		return err;

	for (idx = start_rgn; idx < end_rgn; idx++) {
		struct memblock_region *rgn = &type->regions[idx];

		if (set)
			memblock_set_region_flags(rgn, flag);
		else
			memblock_clear_region_flags(rgn, flag);
	}

	/* Merge the regions again after the flag update. */
	memblock_merge_regions(type);
	return 0;
}
/*
 * OR @flags into the flags field of region @r; bits already set in
 * r->flags are left as they are.
 */
static inline void memblock_set_region_flags(struct memblock_region *r,
enum memblock_flags flags)
{
r->flags |= flags;
}
原来early_init_dt_mark_hotplug_memory_arch是通过memblock_isolate_range分离出对应的区域下标start_rgn和end_rgn,然后对下标范围[start_rgn, end_rgn)内的每个region调用memblock_set_region_flags设置标志位,最后调用memblock_merge_regions合并相邻的兼容区域。