Linux 多核启动过程

以这篇博文来纪念自己与“Linux kernel多核启动”相处的两个多月。
本文章以2.6.33.1的linux内核在x86_64平台上为例进行说明。
本文参考了:http://tldp.org/HOWTO/Linux-i386-Boot-Code-HOWTO/smpboot.html

Linux kernel启动的过程概览
init/main.c:start_kernel()
|
\|/
init/main.c:rest_init
{
……
kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND)
……
cpu_idle()
}
|
\|/
init/main.c:kernel_init//从上面代码可以看出,kernel_init是一个内核线程
|
\|/
init/main.c:init_post //会在最后调用启动脚本
{
……
823 /*
824 * We try each of these until one succeeds.
825 *
826 * The Bourne shell can be used instead of init if we are
827 * trying to recover a really broken machine.
828 */
829 if (execute_command) {
830 run_init_process(execute_command);
831 printk(KERN_WARNING "Failed to execute %s. Attempting "
832 "defaults...\n", execute_command);
833 }
834 run_init_process("/sbin/init");
835 run_init_process("/etc/init");
836 run_init_process("/bin/init");
837 run_init_process("/bin/sh");
838
839 panic("No init found. Try passing init= option to kernel.");
……
}


我们再来看看内核启动多核的详细过程。

init/main.c:start_kernel()
|
\|/
init/main.c:rest_init
{
……
kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND)
……
}
|
\|/
kernel_init
|
\|/
/* called by boot processor to activate the rest */
init/main.c: smp_init()
{
……
for_each_present_cpu(cpu) {
if (num_online_cpus() >= setup_max_cpus)
break;
if ( !cpu_online(cpu))
cpu_up(cpu);
}
/* Any cleanup work */
printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus());
smp_cpu_done(setup_max_cpus);
……
}
--------------------------------------------------------------
cpu_up = native_cpu_up是一个回调函数。
注册地方是在:arch/x86/kernel/smp.c

struct smp_ops smp_ops = {
……
.cpu_up = native_cpu_up,
……
}
--------------------------------------------------------------
|
\|/
arch/x86/kernel/smpboot.c:native_cpu_up(unsigned int cpu)
|
\|/
arch/x86/kernel/smpboot.c: do_boot_cpu(int apicid, int cpu)
|
\|/
wakeup_secondary_cpu_via_init(apicid, start_ip)


在启动多核的过程中有两个bitmap很重要,一个是cpu_callin_mask,另一个是cpu_callout_mask。
cpu_callin_mask代表某个cpu是否已经启动,它的某个bit被与之对应的cpu在启动后置位,标记已经启动。
cpu_callout_mask在do_boot_cpu中发出唤醒IPI之后被置位;从下面的代码可以看到,只有在对应cpu启动失败(boot_error非零)时,该位才会被重新清零。

我们下面来详细看看do_boot_cpu(int apicid, int cpu)与wakeup_secondary_cpu_via_init(apicid, start_ip)


/*
 * do_boot_cpu - prepare, kick and wait for one secondary CPU.
 *
 * @apicid: APIC ID handed to the wakeup routine.
 *          NOTE - on most systems this is a PHYSICAL apic ID, but on
 *          multiquad (ie clustered apic addressing mode), this is a
 *          LOGICAL apic ID.
 * @cpu:    CPU number used for the per-cpu variables and cpumasks
 *          touched below.
 *
 * Returns zero if CPU booted OK, else error code from
 * ->wakeup_secondary_cpu (or from wakeup_secondary_cpu_via_init when the
 * APIC driver provides no hook), 1 if the CPU never set its bit in
 * cpu_callin_mask within the timeout, or PTR_ERR() of the idle task when
 * creating it failed.
 */
static int __cpuinit do_boot_cpu(int apicid, int cpu)
{
unsigned long boot_error = 0;
unsigned long start_ip;
int timeout;
struct create_idle c_idle = {
.cpu = cpu,
.done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
};

/*
 * do_fork_idle is the work function used to create the idle task.
 * NOTE(review): presumably it stores the result in c_idle.idle and
 * completes c_idle.done - its body is not shown here.
 */
INIT_WORK_ON_STACK(&c_idle.work, do_fork_idle);

alternatives_smp_switch(1);

c_idle.idle = get_idle_for_cpu(cpu);

/*
 * We can't use kernel_thread since we must avoid to
 * reschedule the child.
 */
/*
 * An idle task is already recorded for this cpu: point its saved
 * stack pointer one struct pt_regs below the end of its stack area
 * (task_stack_page() + THREAD_SIZE), re-initialise it as the idle
 * task of this cpu and skip the fork.
 */
if (c_idle.idle) {
c_idle.idle->thread.sp = (unsigned long) (((struct pt_regs *)
(THREAD_SIZE + task_stack_page(c_idle.idle))) - 1);
init_idle(c_idle.idle, cpu);
goto do_rest;
}

/*
 * Call the work function directly when keventd is not up or when we
 * are keventd ourselves; otherwise queue the work and block until
 * c_idle.done is completed.
 */
if (!keventd_up() || current_is_keventd())
c_idle.work.func(&c_idle.work);
else {
schedule_work(&c_idle.work);
wait_for_completion(&c_idle.done);
}

if (IS_ERR(c_idle.idle)) {
printk("failed fork for CPU %d\n", cpu);
destroy_work_on_stack(&c_idle.work);
return PTR_ERR(c_idle.idle);
}

/* Record the new idle task so the next call for this cpu can reuse it. */
set_idle_for_cpu(cpu, c_idle.idle);
do_rest:
per_cpu(current_task, cpu) = c_idle.idle;
#ifdef CONFIG_X86_32
/* Stack for startup_32 can be just as for start_secondary onwards */
irq_ctx_init(cpu);
#else
clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
initial_gs = per_cpu_offset(cpu);
per_cpu(kernel_stack, cpu) =
(unsigned long)task_stack_page(c_idle.idle) -
KERNEL_STACK_OFFSET + THREAD_SIZE;
#endif
/*
 * Publish the GDT, the entry function (start_secondary) and the stack
 * pointer for the CPU being started.
 * NOTE(review): presumably these globals are read by the early startup
 * code reached from the trampoline - that code is not shown here.
 */
early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
initial_code = (unsigned long)start_secondary;
stack_start.sp = (void *) c_idle.idle->thread.sp;

/* start_ip had better be page-aligned! */
start_ip = setup_trampoline();

/* So we see what's up */
announce_cpu(cpu, apicid);

/*
 * This grunge runs the startup process for
 * the targeted processor.
 */

/* Set back to 1 by wakeup_secondary_cpu_via_init() after INIT deassert. */
atomic_set(&init_deasserted, 0);

if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {

pr_debug("Setting warm reset code and vector.\n");

smpboot_setup_warm_reset_vector(start_ip);
/*
 * Be paranoid about clearing APIC errors.
 */
if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
}
}

/*
 * Kick the secondary CPU. Use the method in the APIC driver
 * if it's defined - or use an INIT boot APIC message otherwise:
 */
if (apic->wakeup_secondary_cpu)
boot_error = apic->wakeup_secondary_cpu(apicid, start_ip);
else
boot_error = wakeup_secondary_cpu_via_init(apicid, start_ip);

if (!boot_error) {
/*
 * allow APs to start initializing.
 */
pr_debug("Before Callout %d.\n", cpu);
cpumask_set_cpu(cpu, cpu_callout_mask);
pr_debug("After Callout %d.\n", cpu);

/*
 * Wait 5s total for a response
 * (50000 polls of cpu_callin_mask, 100us apart).
 */
for (timeout = 0; timeout < 50000; timeout++) {
if (cpumask_test_cpu(cpu, cpu_callin_mask))
break; /* It has booted */
udelay(100);
}

if (cpumask_test_cpu(cpu, cpu_callin_mask))
pr_debug("CPU%d: has booted.\n", cpu);
else {
boot_error = 1;
/* The first trampoline byte reads 0xA5 once the trampoline has run. */
if (*((volatile unsigned char *)trampoline_base)
== 0xA5)
/* trampoline started but...? */
pr_err("CPU%d: Stuck ??\n", cpu);
else
/* trampoline code not run */
pr_err("CPU%d: Not responding.\n", cpu);
/* Not part of the else above: runs for both failure messages. */
if (apic->inquire_remote_apic)
apic->inquire_remote_apic(apicid);
}
}

if (boot_error) {
/* Try to put things back the way they were before ... */
numa_remove_cpu(cpu); /* was set by numa_add_cpu */

/* was set by do_boot_cpu() */
cpumask_clear_cpu(cpu, cpu_callout_mask);

/* was set by cpu_init() */
cpumask_clear_cpu(cpu, cpu_initialized_mask);

set_cpu_present(cpu, false);
per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID;
}

/* mark "stuck" area as not stuck */
*((volatile unsigned long *)trampoline_base) = 0;

if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
/*
 * Cleanup possible dangling ends...
 */
smpboot_restore_warm_reset_vector();
}

destroy_work_on_stack(&c_idle.work);
return boot_error;
}




/*
 * setup_trampoline - install the AP bootstrap code.
 *
 * Copies TRAMPOLINE_SIZE bytes of the real->protected mode bootstrap
 * (trampoline_data) into the buffer at trampoline_base. The caller
 * has made sure that buffer is suitably aligned.
 *
 * Returns the physical address of the buffer.
 */
unsigned long __trampinit setup_trampoline(void)
{
	void *dest = trampoline_base;

	memcpy(dest, trampoline_data, TRAMPOLINE_SIZE);

	return virt_to_phys(dest);
}


可以从上面代码中看出,do_boot_cpu会为编号为apicid的AP设定好它将要使用的stack以及它将要执行的代码(trampoline的入口地址为start_ip),在完成这些后,通过发送IPI序列来启动AP,
并会将cpu_callout_mask中代表相应AP的位置位(只有在启动失败时才会把该位清零)。



/*
 * wakeup_secondary_cpu_via_init - wake a CPU with INIT and STARTUP IPIs.
 *
 * @phys_apicid: destination passed to every apic_icr_write() below; also
 *               used to index apic_version[].
 * @start_eip:   address of the startup code; start_eip >> 12 is placed in
 *               the low bits of each STARTUP IPI.
 *
 * Sequence: INIT assert, 10ms delay, INIT deassert, then up to num_starts
 * STARTUP IPIs (2 with an integrated APIC, 0 otherwise).
 *
 * Returns send_status | accept_status, i.e. zero when neither the last
 * ICR-idle wait nor the last ESR read reported a problem.
 */
static int __cpuinit
wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
{
unsigned long send_status, accept_status = 0;
int maxlvt, num_starts, j;

maxlvt = lapic_get_maxlvt();

/*
 * Be paranoid about clearing APIC errors.
 * Only the ESR write is guarded by maxlvt; the ESR read is always done.
 */
if (APIC_INTEGRATED(apic_version[phys_apicid])) {
if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
}

pr_debug("Asserting INIT.\n");

/*
 * Turn INIT on target chip
 */
/*
 * Send IPI
 */
apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT,
phys_apicid);

pr_debug("Waiting for send to finish...\n");
send_status = safe_apic_wait_icr_idle();

mdelay(10);

pr_debug("Deasserting INIT.\n");

/* Target chip */
/* Send IPI */
apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid);

pr_debug("Waiting for send to finish...\n");
send_status = safe_apic_wait_icr_idle();

/* Flag the deassert as done; do_boot_cpu() reset this to 0 before the kick. */
mb();
atomic_set(&init_deasserted, 1);

/*
 * Should we send STARTUP IPIs ?
 *
 * Determine this based on the APIC version.
 * If we don't have an integrated APIC, don't send the STARTUP IPIs.
 */
if (APIC_INTEGRATED(apic_version[phys_apicid]))
num_starts = 2;
else
num_starts = 0;

/*
 * Paravirt / VMI wants a startup IPI hook here to set up the
 * target processor state.
 */
startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,
(unsigned long)stack_start.sp);

/*
 * Run STARTUP IPI loop.
 */
pr_debug("#startup loops: %d.\n", num_starts);

for (j = 1; j <= num_starts; j++) {
pr_debug("Sending STARTUP #%d.\n", j);
/* As above: the ESR read is not guarded by the maxlvt check. */
if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
pr_debug("After apic_write.\n");

/*
 * STARTUP IPI
 */

/* Target chip */
/* Boot on the stack */
/* Kick the second */
apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12),
phys_apicid);

/*
 * Give the other CPU some time to accept the IPI.
 */
udelay(300);

pr_debug("Startup point 1.\n");

pr_debug("Waiting for send to finish...\n");
send_status = safe_apic_wait_icr_idle();

/*
 * Give the other CPU some time to accept the IPI.
 */
udelay(200);
if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
apic_write(APIC_ESR, 0);
/* Keep every ESR bit except bit 4 (0x10), which the 0xEF mask drops. */
accept_status = (apic_read(APIC_ESR) & 0xEF);
/* Stop after the first STARTUP that reports a send or accept error. */
if (send_status || accept_status)
break;
}
pr_debug("After Startup.\n");

if (send_status)
printk(KERN_ERR "APIC never delivered???\n");
if (accept_status)
printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status);

return (send_status | accept_status);
}



一段wakeup_secondary_cpu_via_init执行的log

656 CPU17: has booted.
657 WP output: cpu :18
658 ------native_cpu_up cpu:18, apicid:18----------
659 ------------in 3 do_boot_cpu------- #18
660 Asserting INIT.
661 Waiting for send to finish...
662 Deasserting INIT.
663 Waiting for send to finish...
664 #startup loops: 2.
665 Sending STARTUP #1.
666 After apic_write.
667 Startup point 1.
668 Waiting for send to finish...
669 Sending STARTUP #2.
670 After apic_write.
671 Startup point 1.
672 Waiting for send to finish...
673 in the cpu_init())
674 After Startup.
675 Before Callout 18.
676 After Callout 18.
677 cpu is: 12
678 in the enable_x2apic()
679 ------in x2apic_phys_get_apic_id-----
680 CPU#18 (phys ID: 18) waiting for CALLOUT
681 CALLIN, before setup_local_APIC().
682 ------3------
683 Stack at about ffff88021f953f44
684 ------in x2apic_phys_get_apic_id-----
685 CPU18: has booted.

wakeup_secondary_cpu_via_init是与硬件相关的代码,它的主要作用是通过发送INIT-INIT-Startup IPI序列来将AP从halted的状态唤醒,并让它开始执行start_eip所指向的代码。
Startup IPI会有一个域来指定需要执行代码的地址:apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12), phys_apicid);
如果想彻底搞清楚这段代码,请去看Intel文档。


start_secondary是AP会执行的代码,这段代码通过smp_callin来设定cpu_callin_mask,从而告诉BSP它已经启动。start_secondary最后进入idle循环。

/*
 * Activate a secondary processor.
 *
 * Entry function for a secondary CPU (do_boot_cpu() stores its address
 * in initial_code). It initialises the CPU, calls smp_callin(), sets up
 * its interrupt vectors, marks the CPU online, enables local interrupts
 * and finally enters cpu_idle().
 *
 * @unused: not referenced by the function body.
 */
notrace static void __cpuinit start_secondary(void *unused)
{
/*
 * Don't put *anything* before cpu_init(), SMP booting is too
 * fragile that we want to limit the things done here to the
 * most necessary things.
 */
vmi_bringup();
cpu_init();
preempt_disable();
smp_callin();

/* otherwise gcc will move up smp_processor_id before the cpu_init */
barrier();
/*
 * Check TSC synchronization with the BP:
 */
check_tsc_sync_target();

/* With the IO-APIC NMI watchdog, enable NMI through LVT0 while 8259A irq 0 is disabled. */
if (nmi_watchdog == NMI_IO_APIC) {
disable_8259A_irq(0);
enable_NMI_through_LVT0();
enable_8259A_irq(0);
}

#ifdef CONFIG_X86_32
/* Spin until low_mappings reads as zero, then flush the whole TLB. */
while (low_mappings)
cpu_relax();
__flush_tlb_all();
#endif

/* This must be done before setting cpu_online_mask */
set_cpu_sibling_map(raw_smp_processor_id());
wmb();

/*
 * We need to hold call_lock, so there is no inconsistency
 * between the time smp_call_function() determines number of
 * IPI recipients, and the time when the determination is made
 * for which cpus receive the IPI. Holding this
 * lock helps us to not include this cpu in a currently in progress
 * smp_call_function().
 *
 * We need to hold vector_lock so there the set of online cpus
 * does not change while we are assigning vectors to cpus. Holding
 * this lock ensures we don't half assign or remove an irq from a cpu.
 */
ipi_call_lock();
lock_vector_lock();
__setup_vector_irq(smp_processor_id());
set_cpu_online(smp_processor_id(), true);
unlock_vector_lock();
ipi_call_unlock();
per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;

/* enable local interrupts */
local_irq_enable();

x86_cpuinit.setup_percpu_clockev();

wmb();
/* Last call in the function: hand this CPU over to the idle loop. */
cpu_idle();
}
  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值