寇亚飞 + 原创作品转载请注明出处 + 《Linux内核分析》MOOC课程http://mooc.study.163.com/course/USTC-1000029000
实验内容
跟踪分析Linux内核的启动过程
- 使用gdb跟踪调试内核从start_kernel到init进程启动
实验步骤及结果
参见实验报告
实验分析
本次实验是使用gdb调试跟踪内核启动代码。通过前几次课了解到,计算机是通过执行指令来完成一系列操作,这其中很重要的一点就是eip寄存器,它给cpu指明了下一条要执行的指令是谁。因此,x86计算机启动的第一个动作就是从硬盘固定位置读取BIOS程序位置到eip中,然后BIOS例行程序检测完硬件并完成相应的初始化之后就会寻找可引导介质,找到后把引导程序加载到指定内存区域后,就把控制权交给了引导程序。这里一般是把硬盘的第一个扇区MBR和活动分区的引导程序加载到内存(即加载BootLoader),加载完整后把控制权交给BootLoader。引导程序BootLoader开始负责操作系统初始化,然后起动操作系统。启动操作系统时一般会指定kernel、initrd和root所在的分区和目录,比如root (hd0,0),kernel (hd0,0)/bzImage root=/dev/ram init=/bin/ash,initrd (hd0,0)/myinitrd4M.img
内核启动过程包括start_kernel之前和之后,之前全部是做初始化的汇编指令,之后开始C代码的操作系统初始化,最后执行第一个用户态进程init。本次实验就是调试从start_kernel开始的代码。
在start_kernel中
set_task_stack_end_magic(&init_task);
完成了创建0号进程的任务,即idle进程。它是系统创建的第一个进程。
在start_kernel中,完成了许多初始化的工作。如trap_init()完成了中断初始化,mm_init()完成了内存管理初始化,sched_init()完成了调度初始化等等。start_kernel代码如下:
asmlinkage __visible void __init start_kernel(void)
{
char *command_line;
char *after_dashes;
set_task_stack_end_magic(&init_task);
smp_setup_processor_id();
debug_objects_early_init();
/*
* Set up the the initial canary ASAP:
*/
boot_init_stack_canary();
cgroup_init_early();
local_irq_disable();
early_boot_irqs_disabled = true;
/*
* Interrupts are still disabled. Do necessary setups, then
* enable them
*/
boot_cpu_init();
page_address_init();
pr_notice("%s", linux_banner);
setup_arch(&command_line);
mm_init_cpumask(&init_mm);
setup_command_line(command_line);
setup_nr_cpu_ids();
setup_per_cpu_areas();
boot_cpu_state_init();
smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
build_all_zonelists(NULL, NULL);
page_alloc_init();
pr_notice("Kernel command line: %s\n", boot_command_line);
parse_early_param();
after_dashes = parse_args("Booting kernel",
static_command_line, __start___param,
__stop___param - __start___param,
-1, -1, NULL, &unknown_bootoption);
if (!IS_ERR_OR_NULL(after_dashes))
parse_args("Setting init args", after_dashes, NULL, 0, -1, -1,
NULL, set_init_arg);
jump_label_init();
/*
* These use large bootmem allocations and must precede
* kmem_cache_init()
*/
setup_log_buf(0);
pidhash_init();
vfs_caches_init_early();
sort_main_extable();
trap_init();
mm_init();
/*
* Set up the scheduler prior starting any interrupts (such as the
* timer interrupt). Full topology setup happens at smp_init()
* time - but meanwhile we still have a functioning scheduler.
*/
sched_init();
/*
* Disable preemption - early bootup scheduling is extremely
* fragile until we cpu_idle() for the first time.
*/
preempt_disable();
if (WARN(!irqs_disabled(),
"Interrupts were enabled *very* early, fixing it\n"))
local_irq_disable();
idr_init_cache();
/*
* Allow workqueue creation and work item queueing/cancelling
* early. Work item execution depends on kthreads and starts after
* workqueue_init().
*/
workqueue_init_early();
rcu_init();
/* trace_printk() and trace points may be used after this */
trace_init();
context_tracking_init();
radix_tree_init();
/* init some links before init_ISA_irqs() */
early_irq_init();
init_IRQ();
tick_init();
rcu_init_nohz();
init_timers();
hrtimers_init();
softirq_init();
timekeeping_init();
time_init();
sched_clock_postinit();
printk_safe_init();
perf_event_init();
profile_init();
call_function_init();
WARN(!irqs_disabled(), "Interrupts were enabled early\n");
early_boot_irqs_disabled = false;
local_irq_enable();
kmem_cache_init_late();
/*
* HACK ALERT! This is early. We're enabling the console before
* we've done PCI setups etc, and console_init() must be aware of
* this. But we do want output early, in case something goes wrong.
*/
console_init();
if (panic_later)
panic("Too many boot %s vars at `%s'", panic_later,
panic_param);
lockdep_info();
/*
* Need to run this when irqs are enabled, because it wants
* to self-test [hard/soft]-irqs on/off lock inversion bugs
* too:
*/
locking_selftest();
#ifdef CONFIG_BLK_DEV_INITRD
if (initrd_start && !initrd_below_start_ok &&
page_to_pfn(virt_to_page((void *)initrd_start)) < min_low_pfn) {
pr_crit("initrd overwritten (0x%08lx < 0x%08lx) - disabling it.\n",
page_to_pfn(virt_to_page((void *)initrd_start)),
min_low_pfn);
initrd_start = 0;
}
#endif
page_ext_init();
debug_objects_mem_init();
kmemleak_init();
setup_per_cpu_pageset();
numa_policy_init();
if (late_time_init)
late_time_init();
calibrate_delay();
pidmap_init();
anon_vma_init();
acpi_early_init();
#ifdef CONFIG_X86
if (efi_enabled(EFI_RUNTIME_SERVICES))
efi_enter_virtual_mode();
#endif
#ifdef CONFIG_X86_ESPFIX64
/* Should be run before the first non-init thread is created */
init_espfix_bsp();
#endif
thread_stack_cache_init();
cred_init();
fork_init();
proc_caches_init();
buffer_init();
key_init();
security_init();
dbg_late_init();
vfs_caches_init();
pagecache_init();
signals_init();
proc_root_init();
nsfs_init();
cpuset_init();
cgroup_init();
taskstats_init_early();
delayacct_init();
check_bugs();
acpi_subsystem_init();
arch_post_acpi_subsys_init();
sfi_init_late();
if (efi_enabled(EFI_RUNTIME_SERVICES)) {
efi_free_boot_services();
}
ftrace_init();
/* Do the rest non-__init'ed, we're now alive */
rest_init();
}
在实验中,将断点设置在start_kernel的最后一行,即rest_init(),它完成了剩余部分的初始化。
在rest_init()中
kernel_thread(kernel_init, NULL, CLONE_FS)
创建了1号进程,它是所有用户态进程的祖先,也就是init进程。
然后通过
pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
创建的kthreadd2号进程,它是所有内核态线程的祖先。
static noinline void __ref rest_init(void)
{
int pid;
rcu_scheduler_starting();
/*
* We need to spawn init first so that it obtains pid 1, however
* the init task will end up wanting to create kthreads, which, if
* we schedule it before we create kthreadd, will OOPS.
*/
kernel_thread(kernel_init, NULL, CLONE_FS);
numa_default_policy();
pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
rcu_read_lock();
kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns);
rcu_read_unlock();
complete(&kthreadd_done);
/*
* The boot idle thread must execute schedule()
* at least once to get things moving:
*/
init_idle_bootup_task(current);
schedule_preempt_disabled();
/* Call into cpu_idle with preempt disabled */
cpu_startup_entry(CPUHP_ONLINE);
}
在rest_init的最后
void cpu_startup_entry(enum cpuhp_state state)
{
/*
* This #ifdef needs to die, but it's too late in the cycle to
* make this generic (arm and sh have never invoked the canary
* init for the non boot cpus!). Will be fixed in 3.11
*/
#ifdef CONFIG_X86
/*
* If we're the non-boot CPU, nothing set the stack canary up
* for us. The boot CPU already has it initialized but no harm
* in doing it again. This is a good place for updating it, as
* we wont ever return from this function (so the invalid
* canaries already on the stack wont ever trigger).
*/
boot_init_stack_canary();
#endif
arch_cpu_idle_prepare();
cpu_idle_loop();
}
cpu_startup_entry()调用了cpu_idle_loop(),在cpu_idle_loop()中,是一个while(1)循环,它是在系统没有别的进程执行时就调用idle进程即0号进程。
更具体的关于0号,1号,2号进程的分析参考这个人的博客
总结
计算机在启动后,执行到start_kernel后,创建了第一个进程即0号进程,然后完成了一系列的内核初始化工作;然后0号进程通过kernel_thread创建了init进程,即1号进程;通过kernel_thread创建 kthreadd进程即2号进程。idle进程真的没意义吗?这么看来它是无比的有意义。