Linux内核的启动从入口函数start_kernel开始,在init/main.c中,start_kernel就相当于内核的main函数。该函数中调用了很多的xxx_init函数。
创建0号进程
set_task_stack_end_magic(&init_task);
init_task是系统创建第一个进程,也称为0号进程,这也是唯一一个没有使用fork或者kernel_thread产生的进程,是进程列表的第一个,它的定义如下:
struct task_struct init_task{
.state = 0,
.stack = init_stack,
.usage = ATOMIC_INIT(2),
.flags = PF_KTHREAD,
.prio = MAX_PRIO - 20,
.static_prio = MAX_PRIO - 20,
.normal_prio = MAX_PRIO - 20,
.policy = SCHED_NORMAL,
.cpus_allowed = CPU_MASK_ALL,
.nr_cpus_allowed= NR_CPUS,
.mm = NULL,
.active_mm = &init_mm,
.restart_block = {
.fn = do_no_restart_syscall,
},
.se = {
.group_node = LIST_HEAD_INIT(init_task.se.group_node),
},
.rt = {
.run_list = LIST_HEAD_INIT(init_task.rt.run_list),
.time_slice = RR_TIMESLICE,
},
.tasks = LIST_HEAD_INIT(init_task.tasks),
.ptraced = LIST_HEAD_INIT(init_task.ptraced),
.ptrace_entry = LIST_HEAD_INIT(init_task.ptrace_entry),
.real_parent = &init_task,
.parent = &init_task,
.children = LIST_HEAD_INIT(init_task.children),
.sibling = LIST_HEAD_INIT(init_task.sibling),
.group_leader = &init_task,
RCU_POINTER_INITIALIZER(real_cred, &init_cred),
RCU_POINTER_INITIALIZER(cred, &init_cred),
.comm = INIT_TASK_COMM,
.thread = INIT_THREAD,
.fs = &init_fs,
.files = &init_files,
.signal = &init_signals,
.sighand = &init_sighand,
.nsproxy = &init_nsproxy,
.pending = {
.list = LIST_HEAD_INIT(init_task.pending.list),
.signal = {{0}}
},
.blocked = {{0}},
.alloc_lock = __SPIN_LOCK_UNLOCKED(init_task.alloc_lock),
.journal_info = NULL,
INIT_CPU_TIMERS(init_task)
.pi_lock = __RAW_SPIN_LOCK_UNLOCKED(init_task.pi_lock),
.timer_slack_ns = 50000, /* 50 usec default slack */
.thread_pid = &init_struct_pid,
.thread_group = LIST_HEAD_INIT(init_task.thread_group),
.thread_node = LIST_HEAD_INIT(init_signals.thread_head),
#ifdef CONFIG_PREEMPT_RCU
.rcu_read_lock_nesting = 0,
.rcu_read_unlock_special.s = 0,
.rcu_node_entry = LIST_HEAD_INIT(init_task.rcu_node_entry),
.rcu_blocked_node = NULL,
INIT_PREV_CPUTIME(init_task)
}
中断门
函数trap_init里面设置了很多的中断门,用于处理各种中断,其中的 SYSG(IA32_SYSCALL_VECTOR, entry_INT80_32),就是系统调用的中断门,用户态进程调用系统调用也是通过发送中断的方式进行的。
函数调用关系如下
trap_init
-- idt_setup_traps();
- idt_setup_from_table(idt_table, def_idts, ARRAY_SIZE(def_idts), true);
其中def_idts的详细定义如下,这里面就定义了各种中断门。
static const __initconst struct idt_data def_idts[] = {
INTG(X86_TRAP_DE, divide_error),
INTG(X86_TRAP_NMI, nmi),
INTG(X86_TRAP_BR, bounds),
INTG(X86_TRAP_UD, invalid_op),
INTG(X86_TRAP_NM, device_not_available),
INTG(X86_TRAP_OLD_MF, coprocessor_segment_overrun),
INTG(X86_TRAP_TS, invalid_TSS),
INTG(X86_TRAP_NP, segment_not_present),
INTG(X86_TRAP_SS, stack_segment),
INTG(X86_TRAP_GP, general_protection),
INTG(X86_TRAP_SPURIOUS, spurious_interrupt_bug),
INTG(X86_TRAP_MF, coprocessor_error),
INTG(X86_TRAP_AC, alignment_check),
INTG(X86_TRAP_XF, simd_coprocessor_error),
#ifdef CONFIG_X86_32
TSKG(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS),
#else
INTG(X86_TRAP_DF, double_fault),
#endif
INTG(X86_TRAP_DB, debug),
#ifdef CONFIG_X86_MCE
INTG(X86_TRAP_MC, &machine_check),
#endif
SYSG(X86_TRAP_OF, overflow),
#if defined(CONFIG_IA32_EMULATION)
SYSG(IA32_SYSCALL_VECTOR, entry_INT80_compat),
#elif defined(CONFIG_X86_32)
SYSG(IA32_SYSCALL_VECTOR, entry_INT80_32),
#endif
};
mm_init函数用于初始化内存管理模块。
sched_init函数用于初始化内核调度模块。主要代码如下:
for_each_possible_cpu(i){
struct rq *rq;
rq = cpu_rq(i);
raw_spin_lock_init(&rq->lock);
rq->nr_running = 0;
rq->calc_load_active = 0;
rq->calc_load_update = jiffies + LOAD_FREQ;
init_cfs_rq(&rq->cfs);
init_rt_rq(&rq->rt);
init_dl_rq(&rq->dl);
hrtick_rq_init(rq);
atomic_set(&rq->nr_iowait, 0);
}
上面的代码中首先取到每个CPU上的进程运行队列,初始化每一个进程的自旋锁,然后又一次初始化rq的字队列cfs_rq、rt_rq和dl_rq等。
文件系统初始化
vfs_caches_init();用来初始化基于内存的文件系统rootfs,该函数会一次调用mnt_init()->init_rootfs().
int __init init_rootfs(void)
{
int err = register_filesystem(&rootfs_fs_type);
if (err)
return err;
if (IS_ENABLED(CONFIG_TMPFS) && !saved_root_name[0] &&
(!root_fs_names || strstr(root_fs_names, "tmpfs"))) {
err = shmem_init();
is_tmpfs = true;
} else {
err = init_ramfs_fs();
}
if (err)
unregister_filesystem(&rootfs_fs_type);
return err;
}
上面的register_filesystem在linux的虚拟文件系统里面注册了一种文件系统类型为rootfs_fs_type。
最后start_kernel调用了rest_init用来做其他方面的初始化工作。
创建1号进程
rest_init的第一个工作就是调用kernel_thread创建第二个进程也就是1号进程。
/*
* We need to spawn init first so that it obtains pid 1, however
* the init task will end up wanting to create kthreads, which, if
* we schedule it before we create kthreadd, will OOPS.
*/
pid = kernel_thread(kernel_init, NULL, CLONE_FS);
1号进程是系统运行的第一个用户进程,1号进程创建的时候还在内核态,那么又是怎么切换到用户态的呢?
Kernel_thread的第一个参数是kernel_init,进程创建后就会运行这个kernel_init函数。函数具体实现如下
static int __ref kernel_init(void *unused)
{
int ret;
kernel_init_freeable();
/* need to finish all async __init code before freeing the memory */
async_synchronize_full();
ftrace_free_init_mem();
jump_label_invalidate_initmem();
free_initmem();
mark_readonly();
/*
* Kernel mappings are now finalized - update the userspace page-table
* to finalize PTI.
*/
pti_finalize();
system_state = SYSTEM_RUNNING;
numa_default_policy();
rcu_end_inkernel_boot();
if (ramdisk_execute_command) {
ret = run_init_process(ramdisk_execute_command);
if (!ret)
return 0;
pr_err("Failed to execute %s (error %d)\n",
ramdisk_execute_command, ret);
}
/*
* We try each of these until one succeeds.
*
* The Bourne shell can be used instead of init if we are
* trying to recover a really broken machine.
*/
if (execute_command) {
ret = run_init_process(execute_command);
if (!ret)
return 0;
panic("Requested init %s failed (error %d).",
execute_command, ret);
}
if (!try_to_run_init_process("/sbin/init") ||
!try_to_run_init_process("/etc/init") ||
!try_to_run_init_process("/bin/init") ||
!try_to_run_init_process("/bin/sh"))
return 0;
panic("No working init found. Try passing init= option to kernel. "
"See Linux Documentation/admin-guide/init.rst for guidance.");
}
其中在kernel_init_freeable函数有如下的调用代码
if (!ramdisk_execute_command)
ramdisk_execute_command = "/init";
这里将ramdisk_execute_comman设置为"/init"
然后kernel_init后面还有下面的代码
if (ramdisk_execute_command) {
ret = run_init_process(ramdisk_execute_command);
if (!ret)
return 0;
pr_err("Failed to execute %s (error %d)\n",
ramdisk_execute_command, ret);
}
if (execute_command) {
ret = run_init_process(execute_command);
if (!ret)
return 0;
panic("Requested init %s failed (error %d).",
execute_command, ret);
}
if (!try_to_run_init_process("/sbin/init") ||
!try_to_run_init_process("/etc/init") ||
!try_to_run_init_process("/bin/init") ||
!try_to_run_init_process("/bin/sh"))
return 0;
此时ramdisk_execute_command不为空程序将执行run_init_process函数,在该函数内部调用do_execve了函数,这里就是系统调用execve函数的内核实现,他的作用就是运行一个文件,因此程序运行到这里就会尝试运行ramdisk的init进程或者是普通文件系统上的/sbin/init、"/etc/init"、"/bin/init"、"/bin/sh"。
创建2号进程
rest_init的第二个工作就是调用kernel_thread创建第三个进程 即2号进程。
pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
这里有一次使用了kernel_thread函数创建进程,这里创建的kthreadd负责所有内核态的线程调度和管理,是内核态所有线程的祖先。Kthreadd的主要代码如下
For(;;){
set_current_state(TASK_INTERRUPTIBLE);
if (list_empty(&kthread_create_list))
schedule();
__set_current_state(TASK_RUNNING);
spin_lock(&kthread_create_lock);
while (!list_empty(&kthread_create_list)) {
struct kthread_create_info *create;
create = list_entry(kthread_create_list.next,
struct kthread_create_info, list);
list_del_init(&create->list);
spin_unlock(&kthread_create_lock);
create_kthread(create);
spin_lock(&kthread_create_lock);
}
spin_unlock(&kthread_create_lock);
}
该函数的核心就是一个for循环和一个while循环,for循环中首先将线程状态设置为可中断的睡眠状态,然后判断线程链表是否为空,如果为空则执行一次线程调度让出CPU。如果变成链表不为空则进入while循环,最终会进入到create_kthread函数
create_kthread
- pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
- _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
(unsigned long)arg, NULL, NULL, 0);
最终会调用到_do_fork函数完成线程创建。