在Linux初始化过程中,start_kernel最后会调用rest_init完成最后的初始化工作。Linux下有3个特殊的进程:idle进程(PID = 0)、init进程(PID = 1)和kthreadd(PID = 2)。rest_init函数的作用是创建init和kthreadd这两个进程,然后idle进程继续往下执行,最终在一个while循环中作为空闲进程运行:如果没有其他进程需要调度,当前cpu就进行轮询或者休眠。
/*
 * rest_init() - last step of boot, called at the end of start_kernel().
 *
 * Spawns the init thread (kernel_init) and then kthreadd, records the
 * kthreadd task in kthreadd_task, signals kthreadd_done, and finally
 * enters cpu_startup_entry() so the calling context becomes the idle task.
 */
static noinline void __init_refok rest_init(void)
{
int pid;
rcu_scheduler_starting();
/*
* We need to spawn init first so that it obtains pid 1, however
* the init task will end up wanting to create kthreads, which, if
* we schedule it before we create kthreadd, will OOPS.
*/
kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND);// creates the init process (PID 1)
numa_default_policy();
pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);// creates the kthreadd kernel thread, which manages kernel threads
/* Look up the kthreadd task by pid inside an RCU read-side section. */
rcu_read_lock();
kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns);
rcu_read_unlock();
complete(&kthreadd_done);// completion: tells the kernel_init thread it may continue initializing
/*
* The boot idle thread must execute schedule()
* at least once to get things moving:
*/
init_idle_bootup_task(current);// sets the idle task's scheduling class to idle_sched_class
schedule_preempt_disabled();// enables preemption and schedules; once scheduled back in, preemption is disabled again and execution continues
/* Call into cpu_idle with preempt disabled */
cpu_startup_entry(CPUHP_ONLINE);// final code path executed by the idle task
}
cpu_startup_entry完成idle进程的最后工作:
/*
 * cpu_startup_entry() - entry point of the idle task.
 *
 * On x86 it (re)initializes the stack canary, then marks the current
 * task as polling, runs the architecture's idle preparation hook and
 * enters cpu_idle_loop().
 *
 * @state: CPU hotplug state passed by the caller; not used in this body.
 */
void cpu_startup_entry(enum cpuhp_state state)
{
/*
* This #ifdef needs to die, but it's too late in the cycle to
* make this generic (arm and sh have never invoked the canary
* init for the non boot cpus!). Will be fixed in 3.11
*/
#ifdef CONFIG_X86
/*
* If we're the non-boot CPU, nothing set the stack canary up
* for us. The boot CPU already has it initialized but no harm
* in doing it again. This is a good place for updating it, as
* we wont ever return from this function (so the invalid
* canaries already on the stack wont ever trigger).
*/
boot_init_stack_canary();
#endif
current_set_polling();// marks the idle thread's state as polling
arch_cpu_idle_prepare();
/* NOTE(review): informational trace message; looks like a local debugging aid - confirm it is wanted. */
printk(KERN_INFO " cpu_startup_entry.\n");
cpu_idle_loop();
}
核心函数为cpu_idle_loop,最终操作系统在没有其他任务调度的时候,idle线程就在cpu_idle_loop这个循环中执行。
/*
 * cpu_idle_loop() - main loop of the idle task; never returns.
 *
 * Outer loop: enter tick-nohz idle, stay in the inner loop while no
 * reschedule is needed, then leave tick-nohz idle and call
 * schedule_preempt_disabled().
 *
 * Inner loop: each pass either polls (cpu_idle_poll()) or calls the
 * architecture idle routine (arch_cpu_idle()), depending on
 * cpu_idle_force_poll and tick_check_broadcast_expired().
 */
static void cpu_idle_loop(void)
{
while (1) {
tick_nohz_idle_enter();
while (!need_resched()) {// while no reschedule is needed, the idle task keeps running in this inner loop
check_pgt_cache();
rmb();
/* An offline CPU is handed to the architecture's "dead" handler. */
if (cpu_is_offline(smp_processor_id()))
arch_cpu_idle_dead();
local_irq_disable();
arch_cpu_idle_enter();
/*
* In poll mode we reenable interrupts and spin.
*
* Also if we detected in the wakeup from idle
* path that the tick broadcast device expired
* for us, we don't want to go deep idle as we
* know that the IPI is going to arrive right
* away
*/
if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
cpu_idle_poll(); // poll when cpu_idle_force_poll is set or the tick broadcast has expired
} else {
current_clr_polling();
/* Re-check after clearing the polling state before actually idling. */
if (!need_resched()) {
stop_critical_timings();
rcu_idle_enter();
arch_cpu_idle();
/* arch_cpu_idle() is expected to return with interrupts enabled. */
WARN_ON_ONCE(irqs_disabled());
rcu_idle_exit();
start_critical_timings();
} else {
/* Not idling after all: undo the local_irq_disable() above. */
local_irq_enable();
}
current_set_polling();
}
arch_cpu_idle_exit();
}
tick_nohz_idle_exit();
schedule_preempt_disabled();// a reschedule is needed, so call into the scheduler
}
}
cpu_idle_loop分为两种情况:
(1)如果idle进程设置了TIF_NEED_RESCHED标志,则调用schedule_preempt_disabled进行调度
(2)如果当前进程不需要被调度,即调度idle进程运行以后,还是没有其他进程需要调度,当前cpu则进行poll或者休眠。又可以分为两种情况,即当前体系架构是实现了poll还是休眠,对于这两种情况,关键代码是这块:
/* Excerpt from cpu_idle_loop(): choose between polling and the arch idle routine. */
if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
/* Polling branch: forced polling, or the tick broadcast has expired. */
cpu_idle_poll();
} else {
current_clr_polling();
/* Re-check after clearing the polling state before actually idling. */
if (!need_resched()) {
stop_critical_timings();
rcu_idle_enter();
arch_cpu_idle();
/* arch_cpu_idle() is expected to return with interrupts enabled. */
WARN_ON_ONCE(irqs_disabled());
rcu_idle_exit();
start_critical_timings();
} else {
/* Not idling after all: just re-enable interrupts. */
local_irq_enable();
}
current_set_polling();
}
(2.1)第一种情况:当设置了cpu_idle_force_poll或者broadcast超时时,调用cpu_idle_poll。当相关体系结构没有实现arch_cpu_idle时,使用的是如下的弱定义(__weak)默认实现:
/*
 * Weak default arch_cpu_idle(), used when the architecture does not
 * provide its own. It sets cpu_idle_force_poll and re-enables interrupts;
 * with the flag set, the idle loop takes its polling branch from then on.
 */
void __weak arch_cpu_idle(void)
{
cpu_idle_force_poll = 1;
local_irq_enable();
}
也就是说,即使第一次走else分支,这个默认的arch_cpu_idle也会把cpu_idle_force_poll设置为1,下一次while循环就会走if分支,所以在这种体系架构下面,idle进程总是会执行cpu_idle_poll:
/*
 * cpu_idle_poll() - idle by busy-waiting.
 *
 * Enters RCU idle, emits the idle trace event, enables interrupts and
 * spins on cpu_relax() until need_resched() becomes true; then emits the
 * exit trace event and leaves RCU idle.
 *
 * Return: always 1.
 */
static inline int cpu_idle_poll(void)
{
rcu_idle_enter();
trace_cpu_idle_rcuidle(0, smp_processor_id());
local_irq_enable();
/* Spin with interrupts enabled until a reschedule is requested. */
while (!need_resched())
cpu_relax();
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
rcu_idle_exit();
return 1;
}
/*
 * cpu_relax(): on ARMv6, or when CONFIG_ARM_ERRATA_754327 is defined,
 * it expands to smp_mb(); otherwise it expands to barrier().
 */
#if __LINUX_ARM_ARCH__ == 6 || defined(CONFIG_ARM_ERRATA_754327)
#define cpu_relax() smp_mb()
#else
#define cpu_relax() barrier()
#endif
可以看到,在中断到来以后,如果没有设置idle进程需要调度(need_resched()为假),cpu_idle_poll会一直在while循环里调用cpu_relax;按上面的宏定义,cpu_relax在ARMv6或配置了CONFIG_ARM_ERRATA_754327时展开为内存屏障smp_mb(),否则展开为编译器屏障barrier()。
(2.2)如果对应的体系架构下面重新实现了arch_cpu_idle:
/*
 * Architecture override of arch_cpu_idle(): try cpuidle first and fall
 * back to default_idle() when cpuidle_idle_call() returns non-zero.
 */
void arch_cpu_idle(void)
{
	if (cpuidle_idle_call() != 0) {
		default_idle();
	}
}
/*
 * default_idle() - fallback idle routine.
 *
 * Calls the arm_pm_idle hook when one is installed, otherwise
 * cpu_do_idle(); interrupts are re-enabled before returning.
 */
static void default_idle(void)
{
	if (!arm_pm_idle) {
		/* No hook installed (our platform does not set one). */
		cpu_do_idle();
	} else {
		arm_pm_idle();
	}

	local_irq_enable();
}
核心函数是cpu_do_idle,我们的体系架构下是
@ cpu_arm920_do_idle: the cpu_do_idle implementation used on this platform.
@ Issues the CP15 "wait for interrupt" operation, then returns to the caller.
ENTRY(cpu_arm920_do_idle)
mcr p15, 0, r0, c7, c0, 4 @ Wait for interrupt
mov pc, lr @ return to the caller
该函数的作用是通过cp15协处理器使cpu进入低功耗状态,停止运行并等待中断出现,at91rm9200由此进入省电模式。