1. linux kernel启动时,会首先调用init/main.c 中的 asmlinkage void __init start_kernel(void); 定义如下:
/*
 * Kernel entry point from init/main.c (excerpt: the tail of the function is
 * elided below). Performs the early setup calls in order and hands off to
 * setup_arch() for the architecture-specific part.
 */
asmlinkage void __init start_kernel(void)
{
char * command_line;
extern struct kernel_param __start___param[], __stop___param[];
smp_setup_processor_id();
/*
* Need to run as early as possible, to initialize the
* lockdep hash:
*/
unwind_init();
lockdep_init();
debug_objects_early_init();
cgroup_init_early();
core_imv_update();
local_irq_disable();
early_boot_irqs_off();
early_init_irq_lock_class();
/*
* Interrupts are still disabled. Do necessary setups, then
* enable them
*/
lock_kernel();
tick_init();
boot_cpu_init();
page_address_init();
printk(KERN_NOTICE);
printk(linux_banner);
// The call of interest in these notes is setup_arch(), which lands in the code under arch/.
// command_line is an output argument: it holds no value yet and is filled in through the pointer.
setup_arch(&command_line);
mm_init_owner(&init_mm, &init_task);
setup_command_line(command_line);
unwind_setup();
setup_per_cpu_areas();
setup_nr_cpu_ids();
smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
// The remainder of the function is omitted from this excerpt.
......
}
2. 因为这里是mips系统,所以会调用到 arch/mips/kernel/setup.c 中的 setup_arch(char **cmdline_p)
定义如下:
/*
 * MIPS architecture setup, entered from start_kernel().
 *
 * @cmdline_p: where the caller wants the command-line pointer stored; it is
 *             only forwarded to arch_mem_init().
 *
 * The calls run in a fixed order and must not be rearranged.
 */
void __init setup_arch(char **cmdline_p)
{
	cpu_probe();

	/* Firmware hand-off; this is the step that collects the memory layout. */
	prom_init();

#if defined(CONFIG_EARLY_PRINTK)
	setup_early_printk();
#endif

	cpu_report();
	check_bugs_early();

	/* With VT support, prefer the VGA console and fall back to the dummy one. */
#if defined(CONFIG_VT) && defined(CONFIG_VGA_CONSOLE)
	conswitchp = &vga_con;
#elif defined(CONFIG_VT) && defined(CONFIG_DUMMY_CONSOLE)
	conswitchp = &dummy_con;
#endif

	arch_mem_init(cmdline_p);
	resource_init();
	plat_smp_setup();
}
3. 这里的prom_init(), 会调用到 arch/mips/bcm5621x/prom.c 中去,因为这CPU是bcm5621x。
/*
 * Platform firmware (PROM) initialisation for the bcm5621x board.
 * Runs the four setup steps below in order; the CFE step comes first and
 * the memory-map step last.
 */
void __init prom_init(void)
{
prom_init_cfe();
prom_init_console();
prom_init_cmdline();
prom_meminit();
}
很简单,就四个函数,但这四个函数可不简单
3.1 prom_init_cfe(void) --- 这是个初始化cfe的函数,作用是从启动参数(fw_arg0/fw_arg2/fw_arg3)中取得 CFE 的 handle 和入口地址(entry point),并交给 cfe_init() 保存;之后内核才能通过 CFE 查询内存分块等信息。
CFE是一个类似uboot的bootloader, 可以进行参数设置。
/*
 * Work out the CFE firmware handle and entry point from the boot arguments
 * and register them with cfe_init(). Also installs the CFE restart/halt
 * handlers.
 *
 * Three hand-off layouts are recognised:
 *   - fw_arg0 negative: no loader was used and the arguments are what CFE
 *     passes directly (handle, 0, EPT and EPTSEAL);
 *   - fw_arg3 negative: an old loader that only passes the handle, so the
 *     "known" entry point and the seal are assumed;
 *   - otherwise: fw_arg3 points at a vector bundling handle/ept/eptseal.
 *
 * If the seal does not equal CFE_EPTSEAL the function spins forever.
 */
void __init prom_init_cfe(void)
{
	uint32_t cfe_ept, cfe_handle;
	unsigned int cfe_eptseal;
	int argc = fw_arg0;
	/* fw_arg2 and fw_arg3 carry the values handed over by CFE. */
	char **envp = (char **) fw_arg2;
	int *prom_vec = (int *) fw_arg3;

	_machine_restart = cfe_linux_restart;
	_machine_halt = cfe_linux_halt;

	/*
	 * Check if a loader was used; if NOT, the 4 arguments are
	 * what CFE gives us (handle, 0, EPT and EPTSEAL)
	 */
	printk(KERN_INFO "xxha:-------- %s\n", __func__);
	if (argc < 0) {
		/*
		 * Per the original notes this is the branch taken at run
		 * time on this board: the values come straight from the
		 * boot arguments.
		 */
		printk(KERN_INFO "xxha:-------- 1111111\n");
		cfe_handle = (uint32_t)(long)argc;
		cfe_ept = (long)envp;
		cfe_eptseal = (uint32_t)(unsigned long)prom_vec;
	} else {
		if ((int32_t)(long)prom_vec < 0) {
			printk(KERN_INFO "xxha:--------222222\n");
			/*
			 * Old loader; all it gives us is the handle,
			 * so use the "known" entrypoint and assume
			 * the seal.
			 */
			cfe_handle = (uint32_t)(long)prom_vec;
			cfe_ept = (uint32_t)((int32_t)0x9fc00500);
			cfe_eptseal = CFE_EPTSEAL;
		} else {
			printk(KERN_INFO "xxha:--------33333333333\n");
			/*
			 * Newer loaders bundle the handle/ept/eptseal
			 * Note: prom_vec is in the loader's useg
			 * which is still alive in the TLB.
			 */
			cfe_handle = (uint32_t)((int32_t *)prom_vec)[0];
			cfe_ept = (uint32_t)((int32_t *)prom_vec)[2];
			cfe_eptseal = (unsigned int)((uint32_t *)prom_vec)[3];
		}
	}

	if (cfe_eptseal != CFE_EPTSEAL) {
		/* too early for panic to do any good */
		printk("CFE's entrypoint seal doesn't match. Spinning.");
		while (1) ;
	}

	/*
	 * Both values are addresses/handles, not C strings: print them as
	 * numbers. Formatting them with %s would make printk walk arbitrary
	 * memory looking for a terminating NUL.
	 */
	printk(KERN_INFO "xxha:----------cfe_ept = 0x%08x\n",
	       (unsigned int)cfe_ept);
	printk(KERN_INFO "xxha:----------cfe_handle = 0x%08x\n",
	       (unsigned int)cfe_handle);

	cfe_init(cfe_handle, cfe_ept);
}
3.1.1 int cfe_init(u64 handle, u64 ept) --- 这是个cfe初始化函数:
/*
 * The dispatch entry point is declared with "intptr_t" arguments so that,
 * whatever code model we are built with, each pointer is passed in a single
 * register. Combining -mlong64 with -mips1 or -mips2, for example, would
 * otherwise pass the handle and the IOCB pointer in two registers each,
 * while CFE expects one.
 *
 * Both objects have static storage duration, so they start out as
 * zero/null until cfe_init() is called.
 */
static int (*cfe_dispfunc)(intptr_t handle, intptr_t xiocb);
static u64 cfe_handle;

/*
 * Remember the firmware handle and dispatch entry point.
 *
 * @handle: CFE handle to pass back on every dispatch.
 * @ept:    CFE entry point, converted with NATIVE_FROM_XPTR().
 *
 * Always returns 0.
 */
int cfe_init(u64 handle, u64 ept)
{
	cfe_handle = handle;
	cfe_dispfunc = NATIVE_FROM_XPTR(ept);

	return 0;
}
3.1.2 int cfe_iocb_dispatch(struct cfe_xiocb * xiocb)
这个函数负责真正调用 CFE 固件:它通过 cfe_init() 保存下来的入口函数指针 cfe_dispfunc,把 cfe_handle 和请求块 xiocb 传给 CFE;如果 cfe_dispfunc 还没有设置(即没有调用过 cfe_init()),则直接返回 -1:
/*
 * Pass an I/O control block to the firmware through the registered
 * dispatch entry point.
 *
 * Returns -1 when no entry point has been registered (cfe_dispfunc is
 * null); otherwise returns whatever the dispatch function returns.
 */
int cfe_iocb_dispatch(struct cfe_xiocb * xiocb)
{
	intptr_t handle_arg = (intptr_t) cfe_handle;
	intptr_t iocb_arg = (intptr_t) xiocb;

	if (cfe_dispfunc)
		return cfe_dispfunc(handle_arg, iocb_arg);

	return -1;
}
3.2 prom_meminit(void) --- 初始化内存:
/*
 * Build the boot memory map from the memory blocks enumerated by CFE.
 *
 * Every block of type CFE_MI_AVAILABLE is registered as BOOT_MEM_RAM with
 * add_memory_region(). When an initrd is present, the parts of a block in
 * front of and behind the initrd are registered separately, and the initrd
 * range itself is registered as BOOT_MEM_RESERVED at the end. Each
 * available block is also recorded in board_mem_region_addrs[] and
 * board_mem_region_sizes[].
 *
 * Panics if the initrd lies beyond MAX_RAM_SIZE; spins forever if CFE
 * reports BCM56218_MAX_MEM_REGIONS or more available blocks.
 */
static __init void prom_meminit(void)
{
	u64 addr, size, type; /* regardless of 64BIT_PHYS_ADDR */
	int mem_flags = 0;
	unsigned int idx;
	int rd_flag;
	unsigned int board_mem_region_count = 0;
#ifdef CONFIG_BLK_DEV_INITRD
	unsigned long initrd_pstart;
	unsigned long initrd_pend;

	printk(KERN_INFO "xxha: ---------- %s\n", __func__);
	/*
	 * Per the original notes, initrd_start/initrd_end come from an
	 * "initrd=3300000@81000000" style command-line option and are both 0
	 * when no initrd was specified.
	 */
	initrd_pstart = CPHYSADDR(initrd_start);
	initrd_pend = CPHYSADDR(initrd_end);
	printk(KERN_INFO "xxha: ---------- initrd_pstart = 0x%lx\n", initrd_pstart);
	printk(KERN_INFO "xxha: ---------- initrd_pend = 0x%lx\n", initrd_pend);
	if (initrd_start &&
	    ((initrd_pstart > MAX_RAM_SIZE)
	     || (initrd_pend > MAX_RAM_SIZE))) {
		panic("initrd out of addressable memory");
	}
#endif /* INITRD */

	/* Ask CFE for its memory blocks one by one until it reports no more. */
	for (idx = 0; cfe_enummem(idx, mem_flags, &addr, &size, &type) != CFE_ERR_NOMORE;
	     idx++) {
		rd_flag = 0;
		if (type == CFE_MI_AVAILABLE) {
			/*
			 * See if this block contains (any portion of) the
			 * ramdisk
			 */
#ifdef CONFIG_BLK_DEV_INITRD
			printk(KERN_INFO "xxha: ---------- initrd_start = 0x%lx\n", initrd_start);
			if (initrd_start) {
				if ((initrd_pstart > addr) &&
				    (initrd_pstart < (addr + size))) {
					printk(KERN_INFO "xxha: ---------- addr = 0x%llx\n", addr);
					printk(KERN_INFO "xxha: ---------- initrd_pstart - addr = 0x%llx\n", (initrd_pstart - addr));
					/* RAM in front of the initrd. */
					add_memory_region(addr,
							  initrd_pstart - addr,
							  BOOT_MEM_RAM);
					rd_flag = 1;
				}
				if ((initrd_pend > addr) &&
				    (initrd_pend < (addr + size))) {
					printk(KERN_INFO "xxha: ---------- initrd_pend = 0x%lx\n", initrd_pend);
					printk(KERN_INFO "xxha: ----------(addr + size) - initrd_pend = 0x%llx\n", (addr + size) - initrd_pend);
					/* RAM behind the initrd. */
					add_memory_region(initrd_pend,
							  (addr + size) - initrd_pend,
							  BOOT_MEM_RAM);
					rd_flag = 1;
				}
			}
#endif
			if (!rd_flag) {
				if (addr > MAX_RAM_SIZE)
					continue;
				/*
				 * Clamp the block so that it ends at
				 * MAX_RAM_SIZE. The length has to be measured
				 * from addr: subtracting (addr + size), which
				 * is larger than MAX_RAM_SIZE on this path,
				 * would wrap around to a huge value.
				 */
				if (addr+size > MAX_RAM_SIZE)
					size = MAX_RAM_SIZE - addr + 1;
				/*
				 * memcpy/__copy_user prefetch, which
				 * will cause a bus error for
				 * KSEG/KUSEG addrs not backed by RAM.
				 * Hence, reserve some padding for the
				 * prefetch distance.
				 */
				if (size > 512)
					size -= 512;
				printk(KERN_INFO "xxha: ---------- addr = 0x%llx\n", addr);
				printk(KERN_INFO "xxha: ---------- size = 0x%llx\n", size);
				add_memory_region(addr, size, BOOT_MEM_RAM);
			}
			printk(KERN_INFO "xxha: ---------- board_mem_region_count = %u\n", board_mem_region_count);
			board_mem_region_addrs[board_mem_region_count] = addr;
			board_mem_region_sizes[board_mem_region_count] = size;
			board_mem_region_count++;
			if (board_mem_region_count ==
			    BCM56218_MAX_MEM_REGIONS) {
				/*
				 * Too many regions. Need to configure more
				 */
				while(1);
			}
		}
	}

#ifdef CONFIG_BLK_DEV_INITRD
	/* Finally mark the initrd range itself as reserved. */
	if (initrd_start) {
		printk(KERN_INFO "xxha: ---------- initrd_pstart = 0x%lx\n", initrd_pstart);
		printk(KERN_INFO "xxha: ---------- initrd_pend - initrd_pstart = 0x%lx\n", initrd_pend - initrd_pstart);
		add_memory_region(initrd_pstart, initrd_pend - initrd_pstart,
				  BOOT_MEM_RESERVED);
	}
#endif
}
这里的CONFIG_BLK_DEV_INITRD 就是menuconfig --> General Setup --> [*] Initial RAM filesystem and RAM disk (initramfs/initrd) support 选项。
3.2.1 int cfe_enummem(int idx, int flags, u64 *start, u64 *length, u64 *type)
这个函数定义在arch/mips/fw/cfe/cfe_api.c 中,用来获取cfe中memory 分块信息。
/*
 * Query CFE for entry number idx of its memory map.
 *
 * @idx:    index of the entry to fetch.
 * @flags:  copied into the request's xiocb_flags field.
 * @start:  receives the block's start address.
 * @length: receives the block's size.
 * @type:   receives the block's type.
 *
 * Returns 0 after filling the three outputs. If the request comes back with
 * a negative status, that status is returned and the outputs are not
 * written.
 */
int
cfe_enummem(int idx, int flags, u64 *start, u64 *length, u64 *type)
{
	struct cfe_xiocb xiocb;
	struct xiocb_meminfo *mi = &xiocb.plist.xiocb_meminfo;

	printk(KERN_INFO "xxha: ---------- %s\n", __func__);

	/* Build the request: function code, flags and the wanted index. */
	xiocb.xiocb_fcode = CFE_CMD_FW_MEMENUM;
	xiocb.xiocb_status = 0;
	xiocb.xiocb_handle = 0;
	xiocb.xiocb_flags = flags;
	xiocb.xiocb_psize = sizeof(struct xiocb_meminfo);
	mi->mi_idx = idx;

	/* Let the firmware fill in the meminfo part of the request. */
	cfe_iocb_dispatch(&xiocb);

	/* A negative status means the query did not succeed. */
	if (xiocb.xiocb_status < 0) {
		printk(KERN_INFO "xxha: ---------- 2 %s\n", __func__);
		return xiocb.xiocb_status;
	}

	/* A block is described by its start address, size and type. */
	*start = mi->mi_addr;
	*length = mi->mi_size;
	*type = mi->mi_type;

	printk(KERN_INFO "xxha: ---------- start = 0x%llx\n", mi->mi_addr);
	printk(KERN_INFO "xxha: ---------- length = 0x%llx\n", mi->mi_size);
	printk(KERN_INFO "xxha: ---------- type = 0x%llx\n", mi->mi_type);

	return 0;
}
3.3 void __init add_memory_region(phys_t start, phys_t size, long type)
添加 memory 分区,参数是起始地址,分区大小和分区类型:
/*
 * Append a region to boot_mem_map.
 *
 * @start: first address of the region.
 * @size:  length of the region.
 * @type:  region type, stored as-is in the entry.
 *
 * A region that directly follows the last recorded entry and has the same
 * type is merged into that entry instead of taking a new slot. Regions
 * whose end would wrap around, and regions that do not fit because the map
 * already holds BOOT_MEM_MAP_MAX entries, are dropped with a log message.
 */
void __init add_memory_region(phys_t start, phys_t size, long type)
{
	int x = boot_mem_map.nr_map;

	/* Sanity check */
	if (start + size < start) {
		pr_warning("Trying to add an invalid memory region, skipped\n");
		return;
	}

	/*
	 * Try to merge with previous entry if any. This is far less than
	 * perfect but is sufficient for most real world cases.
	 *
	 * The previous entry is only looked up when one exists, so that no
	 * pointer to "map[-1]" is ever formed for an empty map.
	 */
	if (x) {
		struct boot_mem_map_entry *prev = &boot_mem_map.map[x - 1];

		if (prev->addr + prev->size == start && prev->type == type) {
			prev->size += size;
			return;
		}
	}

	if (x == BOOT_MEM_MAP_MAX) {
		pr_err("Ooops! Too many entries in the memory map!\n");
		return;
	}

	/* Record the new entry; boot_mem_map is what the kernel uses later. */
	boot_mem_map.map[x].addr = start;
	boot_mem_map.map[x].size = size;
	boot_mem_map.map[x].type = type;
	boot_mem_map.nr_map++;
}
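3.3.1 boot_mem_map 结构体定义:这里结构体的标签名(struct boot_mem_map)和变量名(boot_mem_map)相同,但不会出错——C 语言中 struct/union/enum 的标签与普通标识符(变量名、函数名)处于不同的命名空间,使用类型时必须写成 struct boot_mem_map,因此两者不会冲突。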
/* Definition of the global boot memory map object. */
struct boot_mem_map boot_mem_map;
/*
* A memory map that's built upon what was determined
* or specified on the command line.
*/
struct boot_mem_map {
/* number of entries of map[] currently in use */
int nr_map;
struct boot_mem_map_entry {
phys_t addr; /* start of memory segment */
phys_t size; /* size of memory segment */
long type; /* type of memory segment */
} map[BOOT_MEM_MAP_MAX];
};
/* Declaration of the same object for other translation units. */
extern struct boot_mem_map boot_mem_map;
4. arch_mem_init(char **cmdline_p)
这个函数又回到了 arch/mips/kernel/setup.c 中. 定义如下:
/*
 * Architecture memory initialisation, called from setup_arch().
 *
 * @cmdline_p: receives a pointer to the kernel's copy of the command line.
 *
 * Copies the firmware command line (arcs_cmdline) into command_line and
 * boot_command_line, parses early parameters, runs the platform memory
 * setup, prints the resulting RAM map and then initialises bootmem, the
 * sparse memory model and paging, in that order.
 */
static void __init arch_mem_init(char **cmdline_p)
{
	extern void plat_mem_setup(void);

	strlcpy(command_line, arcs_cmdline, sizeof(command_line));
	strlcpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
	*cmdline_p = command_line;

	parse_early_param();

	/* Platform-specific memory setup. */
	plat_mem_setup();

	/* One of these two headings shows up in the kernel log. */
	if (usermem)
		pr_info("User-defined physical RAM map:\n");
	else
		pr_info("Determined physical RAM map:\n");
	print_memory_map();

	/* Boot-time allocator setup. */
	bootmem_init();

#ifdef CONFIG_KEXEC
	pr_info("Crashkernel info:\n");
	/*
	 * Cast explicitly so that the arguments match %llu whatever the
	 * width of the resource start/end fields is in this configuration.
	 */
	pr_info("\tstart=%llu end=%llu\n",
		(unsigned long long)crashk_res.start,
		(unsigned long long)crashk_res.end);
	if (crashk_res.start != crashk_res.end)
		reserve_bootmem(crashk_res.start,
				crashk_res.end - crashk_res.start + 1, 0);
#endif

	sparse_init();
	paging_init();
}
4.1 plat_mem_setup() --- 平台相关的mem初始化
/*
 * Platform-specific part of the memory setup for the bcm956218 board:
 * attaches to the SB and installs the board's restart, halt and power-off
 * handlers.
 */
void __init plat_mem_setup(void)
{
/* Get global SB handle */
sbh = sb_kattach();
/* Install the board-specific restart / halt / power-off handlers */
_machine_restart = bcm956218_machine_restart;
_machine_halt = bcm956218_machine_halt;
pm_power_off = bcm956218_machine_halt;
}
4.1.1 sb_kattach() ---这个函数好怪异。。
/*
 * Generic kernel variant of sb_attach().
 *
 * The first successful call maps the register window at SB_ENUM_BASE and
 * initialises the file-level ksi state through sb_doattach(); later calls
 * see ksi.curmap already set and simply return the same state.
 *
 * Returns &ksi on success, or NULL when sb_doattach() reports failure.
 */
void*
sb_kattach(void)
{
	if (ksi.curmap == NULL) {
		uint32 *regs;
		char *unused;
		int varsz;

		regs = (uint32 *)REG_MAP(SB_ENUM_BASE, SB_CORE_SIZE);
		if (sb_doattach(&ksi, BCM56218_DEVICE_ID, NULL, (void*)regs,
				SB_BUS, NULL, &unused, &varsz) == NULL) {
			/*
			 * sb_doattach() stores regs in ksi.curmap before it
			 * can fail. Clear it again so that a failed attach is
			 * not mistaken for a completed one on the next call.
			 */
			ksi.curmap = NULL;
			return NULL;
		}
	}

	return &ksi;
}
REG_MAP 就是 physical -> virtual memory map 函数:
/*
 * map/unmap physical to virtual I/O
 *
 * REG_MAP(pa, size): map "size" bytes at physical address "pa" with
 *                    ioremap_nocache() and yield the resulting pointer.
 * REG_UNMAP(va):     release a mapping with iounmap().
 */
#define REG_MAP(pa, size) ioremap_nocache((unsigned long)(pa), (unsigned long)(size))
#define REG_UNMAP(va) iounmap((void *)(va))
一些常量的定义:
/*
 * Physical address layout used when CONFIG_BCM5621X is set.
 * (The matching #endif lies outside this excerpt.)
 */
#ifdef CONFIG_BCM5621X
#define SB_SDRAM_BASE 0x00000000 /* Physical SDRAM */
#define SB_SDRAM_SWAPPED 0x10000000 /* Byteswapped Physical SDRAM */
#define SB_ENUM_BASE 0x18000000 /* Enumeration space base */
#define SB_ENUM_LIM 0x18010000 /* Enumeration space limit */
#define SB_CORE_SIZE 0x1000 /* each core gets 4Kbytes for registers */
/* Number of SB_CORE_SIZE register windows that fit in the enumeration space */
#define SB_MAXCORES ((SB_ENUM_LIM - SB_ENUM_BASE)/SB_CORE_SIZE)
4.1.2 static void*
sb_doattach(sb_info_t *si, uint devid, void *osh, void *regs, uint bustype,
void *sdh, char **vars, int *varsz)
/*
 * Initialise the caller-supplied sb_info_t and attach to the chip.
 *
 * si      - state structure to fill in; it is zeroed first.
 * devid   - not referenced in this function.
 * osh     - stored in si->osh.
 * regs    - already-mapped register window; stored as the current mapping.
 * bustype - stored in si->bus.
 * sdh     - stored in si->sdh.
 * vars, varsz - not referenced in this function.
 *
 * Returns si (as void *) on success, or NULL if the cores cannot be mapped
 * or no gpio control core index was found.
 */
static void*
sb_doattach(sb_info_t *si, uint devid, void *osh, void *regs, uint bustype,
void *sdh, char **vars, int *varsz)
{
uint origidx;
chipcregs_t *cc;
ASSERT(GOODREGS(regs));
bzero((uchar*)si, sizeof (sb_info_t));
/* no gpio core known yet; checked again before returning */
si->gpioidx = BADIDX;
si->osh = osh;
si->curmap = regs;
si->sdh = sdh;
si->bus = bustype;
/* initialize current core index value */
si->curidx = _sb_coreidx((void*)si);
/* keep and reuse the initial register mapping */
origidx = si->curidx;
if (si->bus == SB_BUS)
si->regs[origidx] = regs;
/* is core-0 a chipcommon core? */
/* temporarily pretend there is a single core while core 0 is selected */
si->numcores = 1;
cc = (chipcregs_t*) sb_setcoreidx((void*)si, 0);
/* the chip id is fixed here; only the revision is read from the chipid register */
si->chip = BCM56218_DEVICE_ID;
si->chiprev = (R_REG(&cc->chipid) & CID_REV_MASK) >> CID_REV_SHIFT;
si->numcores = sb_chip2numcores(si->chip);
/* return to original core */
sb_setcoreidx((void*)si, origidx);
/* sanity checks */
ASSERT(si->chip);
/*
* Check if cores can be mapped statically. If not, do a scan.
*/
if (sb_map_cores(si)) {
SB_ERROR(("sb_attach: unable to map cores !!\n"));
goto bad;
}
/* gpio control core is required */
if (!GOODIDX(si->gpioidx)) {
SB_ERROR(("sb_attach: gpio control core not found\n"));
goto bad;
}
/* get boardtype and boardrev */
si->boardvendor = VENDOR_BROADCOM;
si->boardtype = 0;
return ((void*)si);
bad:
/*
 * NOTE(review): si is released here although it was supplied by the caller.
 * sb_kattach() passes the address of the named object ksi, so this looks
 * wrong for that caller - confirm what MFREE does before relying on it.
 */
MFREE(si, sizeof (sb_info_t));
return (NULL);
}
4.2 bootmem_init() --- boot 相关的mem 初始化, 函数很简单
/*
 * Boot-time memory initialisation as used in this tree: only the initrd
 * handling is done here. The value returned by init_initrd() is not used.
 */
static void __init bootmem_init(void)
{
init_initrd();
finalize_initrd();
}
4.2.1 init_initrd() ---- 主要是检查并规范化 initrd_start 和 initrd_end 两个变量(检查不通过时把两者都清零)
/*
 * Validate and normalise initrd_start / initrd_end.
 *
 * Returns the next free pfn after the initrd. When no usable initrd is
 * found, both variables are cleared and 0 is returned.
 */
static unsigned long __init init_initrd(void)
{
	unsigned long phys_end;

	/*
	 * Board specific code or the command line parser should already
	 * have set up initrd_start and initrd_end. If so, sanity-check them
	 * below; if not, optionally probe for an embedded initrd.
	 */
	if (!(initrd_start && initrd_end > initrd_start)) {
#ifdef CONFIG_PROBE_INITRD_HEADER
		u32 *initrd_header;

		/*
		 * See if initrd has been added to the kernel image by
		 * arch/mips/boot/addinitrd.c. In that case an 8 byte header
		 * is prepended to initrd: the first word is a magic number
		 * and the second one is the size of initrd. Initrd start
		 * must be page aligned in any case.
		 */
		initrd_header = __va(PAGE_ALIGN(__pa_symbol(&_end) + 8)) - 8;
		if (initrd_header[0] != 0x494E5244)
			goto disable;
		initrd_start = (unsigned long)&initrd_header[2];
		initrd_end = initrd_start + initrd_header[1];
#else
		goto disable;
#endif
	}

	if (initrd_start & ~PAGE_MASK) {
		pr_err("initrd start must be page aligned\n");
		goto disable;
	}

	if (initrd_start < PAGE_OFFSET) {
		pr_err("initrd start < PAGE_OFFSET\n");
		goto disable;
	}

	/*
	 * Sanitize initrd addresses. For example firmware
	 * can't guess if they need to pass them through
	 * 64-bits values if the kernel has been built in pure
	 * 32-bit. We need also to switch from KSEG0 to XKPHYS
	 * addresses now, so the code can now safely use __pa().
	 */
	initrd_start = (unsigned long)__va(__pa(initrd_start));
	phys_end = __pa(initrd_end);
	initrd_end = (unsigned long)__va(phys_end);

	ROOT_DEV = Root_RAM0;
	return PFN_UP(phys_end);

disable:
	initrd_start = 0;
	initrd_end = 0;
	return 0;
}
4.2.2 finalize_initrd(void) --- 最终确认 initrd_start 和 initrd_end 两个变量的值:initrd 有效时为它保留 bootmem,否则把两者清零
/*
 * Final initrd check: reserve its memory in bootmem when it is usable,
 * otherwise log why and clear initrd_start / initrd_end.
 */
static void __init finalize_initrd(void)
{
	unsigned long size = initrd_end - initrd_start;

	if (size == 0) {
		printk(KERN_INFO "Initrd not found or empty");
	} else if (__pa(initrd_end) > PFN_PHYS(max_low_pfn)) {
		printk(KERN_ERR "Initrd extends beyond end of memory");
	} else {
		/* Usable initrd: keep the allocator away from it. */
		reserve_bootmem(__pa(initrd_start), size, BOOTMEM_DEFAULT);
		initrd_below_start_ok = 1;

		pr_info("Initial ramdisk at: 0x%lx (%lu bytes)\n",
			initrd_start, size);
		return;
	}

	/* Complete the message started above and forget about the initrd. */
	printk(KERN_CONT " - disabling initrd\n");
	initrd_start = 0;
	initrd_end = 0;
}
4.3 sparse_init()
/*
 * Allocate the accumulated non-linear sections, allocate a mem_map
 * for each and record the physical to section mapping.
 */
void __init sparse_init(void)
{
	unsigned long **usemap_map;
	unsigned long pnum;
	int size;

	/*
	 * The mem_map uses big pages (2M on x86 64 bit) while a usemap is
	 * far smaller than a page (e.g. 24 bytes). Allocating "2M, 24 bytes,
	 * 2M, ..." alternately would push every following 2M block to the
	 * next 2M boundary and leave many holes on big systems, so all
	 * usemaps are allocated first and the 2M blocks afterwards.
	 *
	 * powerpc needs sparse_init_one_section() to be called right after
	 * each sparse_early_mem_map_alloc(), hence the temporary usemap_map
	 * table allocated here.
	 */
	size = sizeof(unsigned long *) * NR_MEM_SECTIONS;
	usemap_map = alloc_bootmem(size);
	if (!usemap_map)
		panic("can not allocate usemap_map\n");

	/* Pass 1: one usemap per present section. */
	for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
		if (present_section_nr(pnum))
			usemap_map[pnum] = sparse_early_usemap_alloc(pnum);
	}

	/* Pass 2: one mem_map per present section that got a usemap. */
	for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
		struct page *section_map;

		if (!present_section_nr(pnum) || !usemap_map[pnum])
			continue;

		section_map = sparse_early_mem_map_alloc(pnum);
		if (section_map)
			sparse_init_one_section(__nr_to_section(pnum), pnum,
						section_map, usemap_map[pnum]);
	}

	vmemmap_populate_print_last();

	/* The temporary table is no longer needed. */
	free_bootmem(__pa(usemap_map), size);
}
4.4 paging_init(); --- page zone 相关的初始化。
/*
 * Set up the page tables and the per-zone pfn limits, then hand the limits
 * to free_area_init_nodes().
 *
 * ZONE_NORMAL ends at max_low_pfn. With CONFIG_HIGHMEM, ZONE_HIGHMEM ends
 * at highend_pfn unless the CPU has dcache aliases, in which case highmem
 * is ignored (with a warning) and the zone is capped at max_low_pfn.
 */
void __init paging_init(void)
{
	/*
	 * Zero-filled so that the slots of zones not configured below hold
	 * a defined value when the array is passed on.
	 */
	unsigned long max_zone_pfns[MAX_NR_ZONES] = { 0 };

	pagetable_init();

#ifdef CONFIG_HIGHMEM
	kmap_init();
#endif
	kmap_coherent_init();

#ifdef CONFIG_ZONE_DMA
	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
#endif
#ifdef CONFIG_ZONE_DMA32
	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
#endif
	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;

#ifdef CONFIG_HIGHMEM
	max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;

	if (cpu_has_dc_aliases && max_low_pfn != highend_pfn) {
		printk(KERN_WARNING "This processor doesn't support highmem."
		       " %ldk highmem ignored\n",
		       (highend_pfn - max_low_pfn) << (PAGE_SHIFT - 10));
		max_zone_pfns[ZONE_HIGHMEM] = max_low_pfn;
	}
#endif

	free_area_init_nodes(max_zone_pfns);
}