全面解析Linux 内核 3.10.x - start_kernel()

From: 全面解析Linux 内核 3.10.x - 本文章完全基于MIPS架构

赶得早,不如赶得巧 - 古语

start_kernel

前几年穿越题材很火爆,如《神话》、《步步惊心》等,可是这些剧都有一些共性,那就是总是穿越到了过去! 为什么穿越一定要到过去呢?在我看来无非就是觉得回到过去题材丰富,因为在中国科幻题材很难火爆,题材有限!换言之就是,中国人总是喜欢活在过去,我祖上怎样怎样..我去年买了个表啊!再看看国外,人家穿越基本都是现在穿到未来,未来穿到现在!就这么任性的穿来穿去,很少有穿越到过去的!美国人都在拯救世界,中国人希望穿越到过去征服世界!总是那么喜欢意淫!言归正传.. 
为什么要管start_kernel叫隧道之门呢,这个函数是承前启后将前辈BIOS or Boot的棒接过来后,进行了完美的变身,因为这才是被称之为操作系统的第一个函数!在这个函数后,所有操作系统的特性进行完全展开!这个世纪最伟大的操作系统之一Linux 就这么的映入眼帘!让我们开启操作系统的灵魂kernel的大门吧! 
另外我建议你去买一本Linux 内核修炼之道 - 任老师的书,特别棒!

一、start_kernel 函数

start_kernel

/*
 * start_kernel() - architecture-independent kernel entry point.
 *
 * Runs the boot-time initialisation calls below, one after another, and
 * finishes by calling rest_init().  Interrupts are switched off with
 * local_irq_disable() near the top and switched back on with
 * local_irq_enable() after call_function_init(); everything in between
 * runs with interrupts disabled.  Several comments below state ordering
 * requirements between calls, so the sequence must not be rearranged.
 */
asmlinkage void __init start_kernel(void)
{
    char *command_line;
    extern const struct kernel_param __start___param[], __stop___param[];

    /*
     * Need to run as early as possible, to initialize the
     * lockdep hash:
     */
    lockdep_init();
    smp_setup_processor_id();
    debug_objects_early_init();

    /*
     * Set up the initial canary ASAP:
     */
    boot_init_stack_canary();

    cgroup_init_early();
    /* Disable interrupts on the current CPU. */
    local_irq_disable();
    early_boot_irqs_disabled = true;

    /*
     * Interrupts are still disabled. Do necessary setups, then
     * enable them
     */
    /* Boot CPU initialisation. */
    boot_cpu_init();
    /* Initialise the page-address hash table (list heads and locks). */
    page_address_init();
    /* Print the kernel banner. */
    pr_notice("%s", linux_banner);
    /* Architecture-specific setup; hands back the command line pointer. */
    setup_arch(&command_line);
    mm_init_owner(&init_mm, &init_task);
    mm_init_cpumask(&init_mm);
    setup_command_line(command_line);
    /* Per-CPU setup. */
    setup_nr_cpu_ids();
    setup_per_cpu_areas();
    smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
    /* Build the memory zone lists. */
    build_all_zonelists(NULL, NULL);
    /* Register the page allocator's CPU hot-plug notifier. */
    page_alloc_init();
    /* Print the boot command line. */
    pr_notice("Kernel command line: %s\n", boot_command_line);
    parse_early_param();
    parse_args("Booting kernel", static_command_line, __start___param,
       __stop___param - __start___param,
       -1, -1, &unknown_bootoption);
    jump_label_init();

    /*
     * These use large bootmem allocations and must precede
     * kmem_cache_init()
     */
    setup_log_buf(0);
    /* PID hash table initialisation (per the article: PID -> task lookup). */
    pidhash_init();
    /* Early virtual-filesystem cache initialisation. */
    vfs_caches_init_early();
    sort_main_extable();
    /*
     * Per the article: sets up the system-reserved exception vectors;
     * init_IRQ() further down takes care of the remaining vectors.
     */
    trap_init();
    /* Memory-management initialisation. */
    mm_init();

    /*
     * Set up the scheduler prior starting any interrupts (such as the
     * timer interrupt). Full topology setup happens at smp_init()
     * time - but meanwhile we still have a functioning scheduler.
     */
    /* Scheduler initialisation. */
    sched_init();
    /*
     * Disable preemption - early bootup scheduling is extremely
     * fragile until we cpu_idle() for the first time.
     */
    /* Disable kernel preemption. */
    preempt_disable();
    if (WARN(!irqs_disabled(), "Interrupts were enabled *very* early, fixing it\n"))
        local_irq_disable();
    idr_init_cache();
    perf_event_init();
    /* RCU initialisation. */
    rcu_init();
    tick_nohz_init();
    radix_tree_init();
    /* init some links before init_ISA_irqs() */
    early_irq_init();
    /* Interrupt vector initialisation. */
    init_IRQ();
    /* Per the article: registers the clockevents framework. */
    tick_init();
    /* Timer initialisation. */
    init_timers();
    /* High-resolution timer initialisation. */
    hrtimers_init();
    /*
     * Softirq initialisation (per the article: tasklet and hi softirqs).
     * The comment that used to sit here was never closed, which commented
     * out softirq_init() and timekeeping_init(); both calls are restored.
     */
    softirq_init();
    timekeeping_init();
    /* Per the article: initialises the system clock source. */
    time_init();
    /* Profiling initialisation. */
    profile_init();

    call_function_init();
    WARN(!irqs_disabled(), "Interrupts were enabled early\n");
    early_boot_irqs_disabled = false;
    local_irq_enable();
    /* Late slab-allocator initialisation. */
    kmem_cache_init_late();

    /*
     * HACK ALERT! This is early. We're enabling the console before
     * we've done PCI setups etc, and console_init() must be aware of
     * this. But we do want output early, in case something goes wrong.
     */
    console_init();
    if (panic_later)
        panic(panic_later, panic_param);
    /* Print lock-dependency information. */
    lockdep_info();

    /*
     * Need to run this when irqs are enabled, because it wants
     * to self-test [hard/soft]-irqs on/off lock inversion bugs
     * too:
     */
    locking_selftest();

#ifdef CONFIG_BLK_DEV_INITRD
    /* Drop the initrd if it starts below the lowest usable page frame. */
    if (initrd_start && !initrd_below_start_ok &&
        page_to_pfn(virt_to_page((void *)initrd_start)) < min_low_pfn) {
        pr_crit("initrd overwritten (0x%08lx < 0x%08lx) - disabling it.\n",
            page_to_pfn(virt_to_page((void *)initrd_start)),
            min_low_pfn);
        initrd_start = 0;
    }
#endif
    page_cgroup_init();
    debug_objects_mem_init();
    kmemleak_init();
    setup_per_cpu_pageset();
    numa_policy_init();
    if (late_time_init)
        late_time_init();
    sched_clock_init();
    /*
     * Per the article: measures how many iterations of a very short loop
     * the CPU executes per second; the processed result is the BogoMIPS
     * value.
     */
    calibrate_delay();
    pidmap_init();
    anon_vma_init();
#ifdef CONFIG_X86
    if (efi_enabled(EFI_RUNTIME_SERVICES))
        efi_enter_virtual_mode();
#endif
#ifdef CONFIG_X86_ESPFIX64
    /* Should be run before the first non-init thread is created */
    init_espfix_bsp();
#endif
    thread_info_cache_init();
    cred_init();
    /* Per the article: derives the allowed number of processes from RAM size. */
    fork_init(totalram_pages);
    proc_caches_init();
    buffer_init();
    key_init();
    /* Security module initialisation. */
    security_init();
    dbg_late_init();
    /* Virtual-filesystem caches. */
    vfs_caches_init(totalram_pages);
    signals_init();
    /* rootfs populating might need page-writeback */
    page_writeback_init();
    /* Initialise the proc filesystem. */
#ifdef CONFIG_PROC_FS
    proc_root_init();
#endif
    cgroup_init();
    cpuset_init();
    taskstats_init_early();
    delayacct_init();
    /* Per the article: kernel/CPU bug checks. */
    check_bugs();

    acpi_early_init(); /* before LAPIC and SMP init */
    sfi_init_late();

    if (efi_enabled(EFI_RUNTIME_SERVICES)) {
        efi_late_init();
        efi_free_boot_services();
    }

    ftrace_init();

    /* Do the rest non-__init'ed, we're now alive */
    rest_init();
}

二、start_kernel 函数 - 我所关心的内容

三、start_kernel 函数 - 其它细节

1、如何激活第一个cpu?

/*
 * boot_cpu_init() - record the CPU we are booting on in the CPU state masks.
 *
 * Marks the boot CPU "present", "online" etc. for both the SMP and UP case.
 * The four setters are called in the order online, active, present,
 * possible.
 */
static void __init boot_cpu_init(void)
{
    /* Number of the CPU that is executing the boot path. */
    const int this_cpu = smp_processor_id();

    set_cpu_online(this_cpu, true);    /* flag it online */
    set_cpu_active(this_cpu, true);    /* flag it active */
    set_cpu_present(this_cpu, true);   /* flag it present */
    set_cpu_possible(this_cpu, true);  /* flag it possible */
}

2、页表地址初始化都做了什么?

/*
 * page_address_init() - prepare every entry of page_address_htable.
 *
 * For each entry the list head (lh) is initialised and the per-entry
 * spinlock (lock) is set up.
 */
void __init page_address_init(void)
{
    unsigned int i = 0;

    while (i < ARRAY_SIZE(page_address_htable)) {
        INIT_LIST_HEAD(&page_address_htable[i].lh);
        spin_lock_init(&page_address_htable[i].lock);
        i++;
    }
}

上述代码page_address_htable为存储页面映射后的对应信息! 
具体请参考内存管理- 内核中的页 
3、command_line 传参流程 
setup_command_line(command_line); 函数,此函数将boot启动传到内核的参数或者自己的设置的参数保存到一个全局变量以及指定的内存单元中! 
4、每CPU环境初始化 
nr_cpu_ids是一个特殊的值,在单CPU 情况下是 1;而 SMP 情况下,又是一个全局变量,被find_last_bit函数配置。 
接着到setup_per_cpu_areas函数,此函数设置了SMP 的每CPU存储区,为系统中的每个cpu的per_cpu 变量申请空间。 
然后到smp_prepare_boot_cpu函数:

/*
 * smp_prepare_boot_cpu() - preload SMP state for the boot CPU.
 *
 * CPU 0 is marked possible and online, and its bit is set in
 * cpu_callin_map.
 */
void smp_prepare_boot_cpu(void)
{
    /* This code always treats CPU number 0 as the boot CPU. */
    const int boot_cpu = 0;

    set_cpu_possible(boot_cpu, true);
    set_cpu_online(boot_cpu, true);
    cpu_set(boot_cpu, cpu_callin_map);
}

关于每cpu变量以及多cpu唤醒等内容也会单独列出来进行仔细分析。

5、内存管理区列表初始化 - build_all_zonelists

/*
 * build_all_zonelists() - (re)build the zonelists and refresh page totals.
 *
 * Always called with zonelists_mutex held, except while
 * system_state == SYSTEM_BOOTING.
 *
 * @pgdat: handed to __build_all_zonelists() via stop_machine() on the
 *         non-boot path; not used on the boot path.
 * @zone:  when non-NULL and CONFIG_MEMORY_HOTPLUG is set, its pageset is
 *         set up before the rebuild on the non-boot path.
 *
 * Afterwards vm_total_pages and page_group_by_mobility_disabled are
 * recomputed and a summary line is printed.
 */
void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
{
    set_zonelist_order();

    if (system_state != SYSTEM_BOOTING) {
        /*
         * All CPUs have to be stopped so that nobody is using a
         * zonelist while it is being rebuilt.
         */
#ifdef CONFIG_MEMORY_HOTPLUG
        if (zone)
            setup_zone_pageset(zone);
#endif
        stop_machine(__build_all_zonelists, pgdat, NULL);
        /* cpuset refresh routine should be here */
    } else {
        /* Boot path: build directly. */
        __build_all_zonelists(NULL);
        mminit_verify_zonelist();
        cpuset_init_current_mems_allowed();
    }

    vm_total_pages = nr_free_pagecache_pages();

    /*
     * Grouping pages by mobility is switched off when the system has too
     * few pages for the mechanism to work.  A per-zone check would be more
     * accurate but also more expensive.  Because this runs again on memory
     * hot-add, a system may boot with grouping disabled and enable it
     * later.
     */
    page_group_by_mobility_disabled =
        (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) ? 1 : 0;

    printk("Built %i zonelists in %s order, mobility grouping %s.  "
        "Total pages: %ld\n",
            nr_online_nodes,
            zonelist_order_name[current_zonelist_order],
            page_group_by_mobility_disabled ? "off" : "on",
            vm_total_pages);
#ifdef CONFIG_NUMA
    printk("Policy zone: %s\n", zone_names[policy_zone]);
#endif
}

dmesg显示Built 1 zonelists in Zone order, mobility grouping on. Total pages: 63663 
上述代码有两个知识点: 
a.内核内存区域管理大系统 
b.NUMA - 非统一内存访问 
此处具体内容放到内存管理那一章再去深入研究。 
我们只要知道显示的页总数为63663. 
Ps. 通过页总数可计算当前物理内存大小,页总数 * 页大小 = 内存总大小 
63663 * 65536(64K) / 1024 / 1024 ≈ 3979M, 3979 / 1024 ≈ 3.9G; 
6、CPU热插拔支持 - page_alloc_init

/*
 * page_alloc_init() - hook the page allocator into CPU hot-plug events.
 *
 * Registers page_alloc_cpu_notify through hotcpu_notifier().
 * NOTE(review): the second argument (0) is presumably the notifier
 * priority - confirm against the hotcpu_notifier() definition.
 */
void __init page_alloc_init(void)
{
    hotcpu_notifier(page_alloc_cpu_notify, 0);
}

这么时髦的技术我就暂时不去研究了,附几篇高大上的文章!


By: Keven - 点滴积累

阅读更多

没有更多推荐了,返回首页