本文参考了
Linux kernel启动的过程概览
init/main.c:start_kernel()
|
\|/
init/main.c:rest_init
{
……
kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND)
……
cpu_idle()
}
|
\|/
init/main.c:kernel_init//从上面代码可以看出,kernel_init是一个内核线程
|
\|/
init/main.c:init_post //会在最后尝试执行init程序(如/sbin/init)
{
……
823 /*
824 * We try each of these until one succeeds.
825 *
826 * The Bourne shell can be used instead of init if we are
827 * trying to recover a really broken machine.
828 */
829 if (execute_command) {
830 run_init_process(execute_command);
831 printk(KERN_WARNING "Failed to execute %s. Attempting "
832 "defaults...\n", execute_command);
833 }
834 run_init_process("/sbin/init");
835 run_init_process("/etc/init");
836 run_init_process("/bin/init");
837 run_init_process("/bin/sh");
838
839 panic("No init found. Try passing init= option to kernel.");
……
}
我们再来看看内核启动多核的详细过程。
init/main.c:start_kernel()
|
\|/
init/main.c:rest_init
{
……
kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND)
……
}
|
\|/
kernel_init
|
\|/
/* called by boot processor to activate the rest */
init/main.c: smp_init()
{
……
for_each_present_cpu(cpu) {
if (num_online_cpus() >= setup_max_cpus)
break;
if ( !cpu_online(cpu))
cpu_up(cpu);
}
/* Any cleanup work */
printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus());
smp_cpus_done(setup_max_cpus);
……
}
--------------------------------------------------------------
cpu_up最终会通过smp_ops.cpu_up这个回调函数调用到native_cpu_up。
注册地方是在:arch/x86/kernel/smp.c
struct smp_ops smp_ops = {
……
.cpu_up = native_cpu_up,
……
}
--------------------------------------------------------------
|
\|/
arch/x86/kernel/smpboot.c:native_cpu_up(unsigned int cpu)
|
\|/
arch/x86/kernel/smpboot.c: do_boot_cpu(int apicid, int cpu)
|
\|/
wakeup_secondary_cpu_via_init(apicid, start_ip)
在启动多核的过程中有两个bitmap很重要,一个是cpu_callin_mask,另一个是cpu_callout_mask。
cpu_callin_mask代表某个cpu是否已经启动,它的某个bit被与之对应的cpu在启动后置位,标记已经启动。
cpu_callout_mask在do_boot_cpu中发送完IPI之后被置位,用来通知对应的AP可以继续初始化;从下面的代码可以看到,只有在启动失败(boot_error非零)时它才会被重新清零。
我们下面来详细看看do_boot_cpu(int apicid, int cpu)与wakeup_secondary_cpu_via_init(apicid, start_ip)
C代码
/*
* NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
* (ie clustered apic addressing mode), this is a LOGICAL apic ID.
* Returns zero if CPU booted OK, else error code from
* ->wakeup_secondary_cpu.
*/
staticint__cpuinit do_boot_cpu(intapicid,intcpu)
{
unsigned longboot_error = 0;
unsigned longstart_ip;
inttimeout;
structcreate_idle c_idle = {
.cpu = cpu,
.done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
};
INIT_WORK_ON_STACK(&c_idle.work, do_fork_idle);
alternatives_smp_switch(1);
c_idle.idle = get_idle_for_cpu(cpu);
/*
* We can't use kernel_thread since we must avoid to
* reschedule the child.
*/
if(c_idle.idle) {
c_idle.idle->thread.sp = (unsignedlong) (((structpt_regs *)
(THREAD_SIZE + task_stack_page(c_idle.idle))) - 1);
init_idle(c_idle.idle, cpu);
gotodo_rest;
}
if(!keventd_up() || current_is_keventd())
c_idle.work.func(&c_idle.work);
else{
schedule_work(&c_idle.work);
wait_for_completion(&c_idle.done);
}
if(IS_ERR(c_idle.idle)) {
printk("failed fork for CPU %d\n", cpu);
destroy_work_on_stack(&c_idle.work);
returnPTR_ERR(c_idle.idle);
}
set_idle_for_cpu(cpu, c_idle.idle);
do_rest:
per_cpu(current_task, cpu) = c_idle.idle;
#ifdef CONFIG_X86_32
/* Stack for startup_32 can be just as for start_secondary onwards */
irq_ctx_init(cpu);
#else
clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
initial_gs = per_cpu_offset(cpu);
per_cpu(kernel_stack, cpu) =
(unsigned long)task_stack_page(c_idle.idle) -
KERNEL_STACK_OFFSET + THREAD_SIZE;
#endif
early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
initial_code = (unsigned long)start_secondary;
stack_start.sp = (void*) c_idle.idle->thread.sp;
/* start_ip had better be page-aligned! */
start_ip = setup_trampoline();
/* So we see what's up */
announce_cpu(cpu, apicid);
/*
* This grunge runs the startup process for
* the targeted processor.
*/
atomic_set(&init_deasserted, 0);
if(get_uv_system_type() != UV_NON_UNIQUE_APIC) {
pr_debug("Setting warm reset code and vector.\n");
smpboot_setup_warm_reset_vector(start_ip);
/*
* Be paranoid about clearing APIC errors.
*/
if(APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
}
}
/*
* Kick the secondary CPU. Use the method in the APIC driver
* if it's defined - or use an INIT boot APIC message otherwise:
*/
if(apic->wakeup_secondary_cpu)
boot_error = apic->wakeup_secondary_cpu(apicid, start_ip);
else
boot_error = wakeup_secondary_cpu_via_init(apicid, start_ip);
if(!boot_error) {
/*
* allow APs to start initializing.
*/
pr_debug("Before Callout %d.\n", cpu);
cpumask_set_cpu(cpu, cpu_callout_mask);
pr_debug("After Callout %d.\n", cpu);
/*
* Wait 5s total for a response
*/
for(timeout = 0; timeout
if(cpumask_test_cpu(cpu, cpu_callin_mask))
break;/* It has booted */
udelay(100);
}
if(cpumask_test_cpu(cpu, cpu_callin_mask))
pr_debug("CPU%d: has booted.\n", cpu);
else{
boot_error = 1;
if(*((volatileunsignedchar*)trampoline_base)
== 0xA5)
/* trampoline started but...? */
pr_err("CPU%d: Stuck ??\n", cpu);
else
/* trampoline code not run */
pr_err("CPU%d: Not responding.\n", cpu);
if(apic->inquire_remote_apic)
apic->inquire_remote_apic(apicid);
}
}
if(boot_error) {
/* Try to put things back the way they were before ... */
numa_remove_cpu(cpu); /* was set by numa_add_cpu */
/* was set by do_boot_cpu() */
cpumask_clear_cpu(cpu, cpu_callout_mask);
/* was set by cpu_init() */
cpumask_clear_cpu(cpu, cpu_initialized_mask);
set_cpu_present(cpu, false);
per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID;
}
/* mark "stuck" area as not stuck */
*((volatileunsignedlong*)trampoline_base) = 0;
if(get_uv_system_type() != UV_NON_UNIQUE_APIC) {
/*
* Cleanup possible dangling ends...
*/
smpboot_restore_warm_reset_vector();
}
destroy_work_on_stack(&c_idle.work);
returnboot_error;
}
/*
* do_boot_cpu - bring up one secondary CPU from the boot processor.
* @apicid: APIC ID of the CPU to start.
* @cpu: logical CPU number of the CPU to start.
*
* NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
* (ie clustered apic addressing mode), this is a LOGICAL apic ID.
*
* Prepares the idle task, stack and entry point (start_secondary) for the
* target CPU, copies the trampoline, kicks the CPU and then polls
* cpu_callin_mask until the CPU reports in or the timeout expires.
*
* Returns zero if CPU booted OK, else error code from
* ->wakeup_secondary_cpu. It also returns PTR_ERR() of the idle task when
* forking it failed, and 1 when the CPU never set its cpu_callin_mask bit.
*/
static int __cpuinit do_boot_cpu(int apicid, int cpu)
{
unsigned long boot_error = 0;
unsigned long start_ip;
int timeout;
struct create_idle c_idle = {
.cpu= cpu,
.done= COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
};
INIT_WORK_ON_STACK(&c_idle.work, do_fork_idle);
alternatives_smp_switch(1);
c_idle.idle = get_idle_for_cpu(cpu);
/*
* We can't use kernel_thread since we must avoid to
* reschedule the child.
*/
/* An idle task already exists for this CPU: reset it and skip the fork. */
if (c_idle.idle) {
c_idle.idle->thread.sp = (unsigned long) (((struct pt_regs *)
(THREAD_SIZE + task_stack_page(c_idle.idle))) - 1);
init_idle(c_idle.idle, cpu);
goto do_rest;
}
/* Run the fork work directly, or queue it and wait for completion. */
if (!keventd_up() || current_is_keventd())
c_idle.work.func(&c_idle.work);
else {
schedule_work(&c_idle.work);
wait_for_completion(&c_idle.done);
}
if (IS_ERR(c_idle.idle)) {
printk("failed fork for CPU %d\n", cpu);
destroy_work_on_stack(&c_idle.work);
return PTR_ERR(c_idle.idle);
}
set_idle_for_cpu(cpu, c_idle.idle);
do_rest:
per_cpu(current_task, cpu) = c_idle.idle;
#ifdef CONFIG_X86_32
/* Stack for startup_32 can be just as for start_secondary onwards */
irq_ctx_init(cpu);
#else
clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
initial_gs = per_cpu_offset(cpu);
per_cpu(kernel_stack, cpu) =
(unsigned long)task_stack_page(c_idle.idle) -
KERNEL_STACK_OFFSET + THREAD_SIZE;
#endif
/*
* Record the GDT, entry point and stack for the target CPU
* (presumably read by the early startup code - confirm there).
*/
early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
initial_code = (unsigned long)start_secondary;
stack_start.sp = (void *) c_idle.idle->thread.sp;
/* start_ip had better be page-aligned! */
start_ip = setup_trampoline();
/* So we see what's up */
announce_cpu(cpu, apicid);
/*
* This grunge runs the startup process for
* the targeted processor.
*/
atomic_set(&init_deasserted, 0);
if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
pr_debug("Setting warm reset code and vector.\n");
smpboot_setup_warm_reset_vector(start_ip);
/*
* Be paranoid about clearing APIC errors.
*/
if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
}
}
/*
* Kick the secondary CPU. Use the method in the APIC driver
* if it's defined - or use an INIT boot APIC message otherwise:
*/
if (apic->wakeup_secondary_cpu)
boot_error = apic->wakeup_secondary_cpu(apicid, start_ip);
else
boot_error = wakeup_secondary_cpu_via_init(apicid, start_ip);
if (!boot_error) {
/*
* allow APs to start initializing.
*/
pr_debug("Before Callout %d.\n", cpu);
cpumask_set_cpu(cpu, cpu_callout_mask);
pr_debug("After Callout %d.\n", cpu);
/*
* Wait 5s total for a response
* (50000 polls, 100 us apart).
*/
for (timeout = 0; timeout < 50000; timeout++) {
if (cpumask_test_cpu(cpu, cpu_callin_mask))
break;/* It has booted */
udelay(100);
}
if (cpumask_test_cpu(cpu, cpu_callin_mask))
pr_debug("CPU%d: has booted.\n", cpu);
else {
/* Timed out: report 1 as the error and say how far the CPU got. */
boot_error = 1;
if (*((volatile unsigned char *)trampoline_base)
== 0xA5)
/* trampoline started but...? */
pr_err("CPU%d: Stuck ??\n", cpu);
else
/* trampoline code not run */
pr_err("CPU%d: Not responding.\n", cpu);
if (apic->inquire_remote_apic)
apic->inquire_remote_apic(apicid);
}
}
if (boot_error) {
/* Try to put things back the way they were before ... */
numa_remove_cpu(cpu); /* was set by numa_add_cpu */
/* was set by do_boot_cpu() */
cpumask_clear_cpu(cpu, cpu_callout_mask);
/* was set by cpu_init() */
cpumask_clear_cpu(cpu, cpu_initialized_mask);
set_cpu_present(cpu, false);
per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID;
}
/* mark "stuck" area as not stuck */
*((volatile unsigned long *)trampoline_base) = 0;
if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
/*
* Cleanup possible dangling ends...
*/
smpboot_restore_warm_reset_vector();
}
destroy_work_on_stack(&c_idle.work);
return boot_error;
}
C代码
/*
 * Currently trivial. Write the real->protected mode
 * bootstrap into the page concerned. The caller
 * has made sure it's suitably aligned.
 *
 * Copies TRAMPOLINE_SIZE bytes of trampoline_data to trampoline_base and
 * returns the physical address of trampoline_base.
 */
unsigned long __trampinit setup_trampoline(void)
{
	memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE);
	return virt_to_phys(trampoline_base);
}
/*
 * Install the real->protected mode bootstrap code.
 *
 * Copies TRAMPOLINE_SIZE bytes of trampoline_data into the page at
 * trampoline_base (the caller has made sure it is suitably aligned)
 * and returns that page's physical address.
 */
unsigned long __trampinit setup_trampoline(void)
{
	unsigned long trampoline_phys;

	memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE);
	trampoline_phys = virt_to_phys(trampoline_base);

	return trampoline_phys;
}
可以从上面代码中看出,do_boot_cpu会为编号为apicid的AP设定好它将要使用的stack以及它将要执行的代码(trampoline的物理地址start_ip,也就是传给wakeup_secondary_cpu_via_init的参数start_eip),在完成这些后,通过发送IPI序列来启动AP,
并在IPI发送成功后将cpu_callout_mask中代表相应AP的位置位(只有在启动失败时才会将该位清零)。
C代码
staticint__cpuinit
wakeup_secondary_cpu_via_init(intphys_apicid, unsignedlongstart_eip)
{
unsigned longsend_status, accept_status = 0;
intmaxlvt, num_starts, j;
maxlvt = lapic_get_maxlvt();
/*
* Be paranoid about clearing APIC errors.
*/
if(APIC_INTEGRATED(apic_version[phys_apicid])) {
if(maxlvt > 3)/* Due to the Pentium erratum 3AP. */
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
}
pr_debug("Asserting INIT.\n");
/*
* Turn INIT on target chip
*/
/*
* Send IPI
*/
apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT,
phys_apicid);
pr_debug("Waiting for send to finish...\n");
send_status = safe_apic_wait_icr_idle();
mdelay(10);
pr_debug("Deasserting INIT.\n");
/* Target chip */
/* Send IPI */
apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid);
pr_debug("Waiting for send to finish...\n");
send_status = safe_apic_wait_icr_idle();
mb();
atomic_set(&init_deasserted, 1);
/*
* Should we send STARTUP IPIs ?
*
* Determine this based on the APIC version.
* If we don't have an integrated APIC, don't send the STARTUP IPIs.
*/
if(APIC_INTEGRATED(apic_version[phys_apicid]))
num_starts = 2;
else
num_starts = 0;
/*
* Paravirt / VMI wants a startup IPI hook here to set up the
* target processor state.
*/
startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,
(unsigned long)stack_start.sp);
/*
* Run STARTUP IPI loop.
*/
pr_debug("#startup loops: %d.\n", num_starts);
for(j = 1; j <= num_starts; j++) {
pr_debug("Sending STARTUP #%d.\n", j);
if(maxlvt > 3)/* Due to the Pentium erratum 3AP. */
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
pr_debug("After apic_write.\n");
/*
* STARTUP IPI
*/
/* Target chip */
/* Boot on the stack */
/* Kick the second */
apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12),
phys_apicid);
/*
* Give the other CPU some time to accept the IPI.
*/
udelay(300);
pr_debug("Startup point 1.\n");
pr_debug("Waiting for send to finish...\n");
send_status = safe_apic_wait_icr_idle();
/*
* Give the other CPU some time to accept the IPI.
*/
udelay(200);
if(maxlvt > 3)/* Due to the Pentium erratum 3AP. */
apic_write(APIC_ESR, 0);
accept_status = (apic_read(APIC_ESR) & 0xEF);
if(send_status || accept_status)
break;
}
pr_debug("After Startup.\n");
if(send_status)
printk(KERN_ERR "APIC never delivered???\n");
if(accept_status)
printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status);
return(send_status | accept_status);
}
/*
* wakeup_secondary_cpu_via_init - kick a secondary CPU with INIT/STARTUP IPIs.
* @phys_apicid: APIC ID the IPIs are addressed to.
* @start_eip: address of the startup code; bits above 12 are placed in the
* STARTUP IPI (start_eip >> 12).
*
* Asserts INIT, waits 10 ms, deasserts INIT, then sends up to two STARTUP
* IPIs when the target has an integrated APIC.
*
* Returns send_status | accept_status; zero means no delivery problem was
* observed.
*/
static int __cpuinit
wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
{
unsigned long send_status, accept_status = 0;
int maxlvt, num_starts, j;
maxlvt = lapic_get_maxlvt();
/*
* Be paranoid about clearing APIC errors.
*/
if (APIC_INTEGRATED(apic_version[phys_apicid])) {
/* Only the write is conditional; the read below always runs. */
if (maxlvt > 3)/* Due to the Pentium erratum 3AP. */
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
}
pr_debug("Asserting INIT.\n");
/*
* Turn INIT on target chip
*/
/*
* Send IPI
*/
apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT,
phys_apicid);
pr_debug("Waiting for send to finish...\n");
send_status = safe_apic_wait_icr_idle();
mdelay(10);
pr_debug("Deasserting INIT.\n");
/* Target chip */
/* Send IPI */
apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid);
pr_debug("Waiting for send to finish...\n");
/* This overwrites the status captured after the INIT assert above. */
send_status = safe_apic_wait_icr_idle();
mb();
atomic_set(&init_deasserted, 1);
/*
* Should we send STARTUP IPIs ?
*
* Determine this based on the APIC version.
* If we don't have an integrated APIC, don't send the STARTUP IPIs.
*/
if (APIC_INTEGRATED(apic_version[phys_apicid]))
num_starts = 2;
else
num_starts = 0;
/*
* Paravirt / VMI wants a startup IPI hook here to set up the
* target processor state.
*/
startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,
(unsigned long)stack_start.sp);
/*
* Run STARTUP IPI loop.
*/
pr_debug("#startup loops: %d.\n", num_starts);
for (j = 1; j <= num_starts; j++) {
pr_debug("Sending STARTUP #%d.\n", j);
if (maxlvt > 3)/* Due to the Pentium erratum 3AP. */
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
pr_debug("After apic_write.\n");
/*
* STARTUP IPI
*/
/* Target chip */
/* Boot on the stack */
/* Kick the second */
apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12),
phys_apicid);
/*
* Give the other CPU some time to accept the IPI.
*/
udelay(300);
pr_debug("Startup point 1.\n");
pr_debug("Waiting for send to finish...\n");
send_status = safe_apic_wait_icr_idle();
/*
* Give the other CPU some time to accept the IPI.
*/
udelay(200);
if (maxlvt > 3)/* Due to the Pentium erratum 3AP. */
apic_write(APIC_ESR, 0);
accept_status = (apic_read(APIC_ESR) & 0xEF);
/* Stop retrying as soon as either status reports a problem. */
if (send_status || accept_status)
break;
}
pr_debug("After Startup.\n");
if (send_status)
printk(KERN_ERR "APIC never delivered???\n");
if (accept_status)
printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status);
return (send_status | accept_status);
}
一段wakeup_secondary_cpu_via_init执行的log
C代码
656 CPU17: has booted.
657 WP output: cpu :18
658 ------native_cpu_up cpu:18, apicid:18----------
659 ------------in 3 do_boot_cpu------- #18
660 Asserting INIT.
661 Waiting for send to finish...
662 Deasserting INIT.
663 Waiting for send to finish...
664 #startup loops: 2.
665 Sending STARTUP #1.
666 After apic_write.
667 Startup point 1.
668 Waiting for send to finish...
669 Sending STARTUP #2.
670 After apic_write.
671 Startup point 1.
672 Waiting for send to finish...
673 in the cpu_init())
674 After Startup.
675 Before Callout 18.
676 After Callout 18.
677 cpu is: 12
678 in the enable_x2apic()
679 ------in x2apic_phys_get_apic_id-----
680 CPU#18 (phys ID: 18) waiting for CALLOUT
681 CALLIN, before setup_local_APIC().
682 ------3------
683 Stack at about ffff88021f953f44
684 ------in x2apic_phys_get_apic_id-----
685 CPU18: has booted.
656 CPU17: has booted.
657 WP output: cpu :18
658 ------native_cpu_up cpu:18, apicid:18----------
659 ------------in 3 do_boot_cpu------- #18
660 Asserting INIT.
661 Waiting for send to finish...
662 Deasserting INIT.
663 Waiting for send to finish...
664 #startup loops: 2.
665 Sending STARTUP #1.
666 After apic_write.
667 Startup point 1.
668 Waiting for send to finish...
669 Sending STARTUP #2.
670 After apic_write.
671 Startup point 1.
672 Waiting for send to finish...
673 in the cpu_init())
674 After Startup.
675 Before Callout 18.
676 After Callout 18.
677 cpu is: 12
678 in the enable_x2apic()
679 ------in x2apic_phys_get_apic_id-----
680 CPU#18 (phys ID: 18) waiting for CALLOUT
681 CALLIN, before setup_local_APIC().
682 ------3------
683 Stack at about ffff88021f953f44
684 ------in x2apic_phys_get_apic_id-----
685 CPU18: has booted.
wakeup_secondary_cpu_via_init是与硬件相关的代码,它的主要作用是通过发送INIT-INIT-Startup IPI序列来将AP从halted的状态唤醒并让它开始执行代码start_eip所指向的代码。
Startup IPI会有一个域来指定需要执行代码的地址:apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12), phys_apicid);
如果想彻底搞清楚这段代码,请去看Intel文档。
start_secondary是AP会执行的代码,这段代码通过smp_callin设置cpu_callin_mask中对应的位,来告诉BSP它已经启动。start_secondary最后会进入idle循环(cpu_idle)。
C代码
/*
* Activate a secondary processor.
*/
notrace staticvoid__cpuinit start_secondary(void*unused)
{
/*
* Don't put *anything* before cpu_init(), SMP booting is too
* fragile that we want to limit the things done here to the
* most necessary things.
*/
vmi_bringup();
cpu_init();
preempt_disable();
smp_callin();
/* otherwise gcc will move up smp_processor_id before the cpu_init */
barrier();
/*
* Check TSC synchronization with the BP:
*/
check_tsc_sync_target();
if(nmi_watchdog == NMI_IO_APIC) {
disable_8259A_irq(0);
enable_NMI_through_LVT0();
enable_8259A_irq(0);
}
#ifdef CONFIG_X86_32
while(low_mappings)
cpu_relax();
__flush_tlb_all();
#endif
/* This must be done before setting cpu_online_mask */
set_cpu_sibling_map(raw_smp_processor_id());
wmb();
/*
* We need to hold call_lock, so there is no inconsistency
* between the time smp_call_function() determines number of
* IPI recipients, and the time when the determination is made
* for which cpus receive the IPI. Holding this
* lock helps us to not include this cpu in a currently in progress
* smp_call_function().
*
* We need to hold vector_lock so there the set of online cpus
* does not change while we are assigning vectors to cpus. Holding
* this lock ensures we don't half assign or remove an irq from a cpu.
*/
ipi_call_lock();
lock_vector_lock();
__setup_vector_irq(smp_processor_id());
set_cpu_online(smp_processor_id(), true);
unlock_vector_lock();
ipi_call_unlock();
per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
/* enable local interrupts */
local_irq_enable();
x86_cpuinit.setup_percpu_clockev();
wmb();
cpu_idle();
}