linux内核启动多核,linux SMP多核启动分析

/*
 * startup_32: 32-bit entry point of the compressed-kernel boot stub.
 *
 * Loads KERNEL_DS into the data segment registers.  When built with
 * __SMP__ it then branches on BX: zero falls through to the boot-CPU
 * path at label 2, non-zero takes the trampoline path, which builds a
 * stack pointer from CX and jumps straight to 0x100000.  The boot CPU
 * loads its stack, checks A20, clears eflags and the BSS, calls
 * decompress_kernel and finally jumps to 0x100000 as well.
 */
startup_32:
	cld
	cli
	movl $(KERNEL_DS),%eax
	mov %ax,%ds
	mov %ax,%es
	mov %ax,%fs
	mov %ax,%gs
#ifdef __SMP__
	/* What state are we in: BX=1 for SMP, 0 for boot */
	orw %bx,%bx
	jz 2f			# Initial boot
	/*
	 * BX tells the boot CPU (BX=0) apart from a secondary CPU (BX=1);
	 * the two take different execution paths from here on.
	 */

/*
 * We are trampolining an SMP processor
 * (this is the path executed by the secondary CPUs).
 */
	mov %ax,%ss
	xorl %eax,%eax		# Back to 0
	mov %cx,%ax		# SP low 16 bits
	movl %eax,%esp
	/*
	 * Push an immediate zero: a bare "0" operand would push the
	 * dword stored at address 0 instead of the constant.
	 */
	pushl $0		# Clear NT
	popfl
	ljmp $(KERNEL_CS), $0x100000	# Into C and sanity

2:	/* boot-CPU path continues here */
#endif
	lss SYMBOL_NAME(stack_start),%esp
	xorl %eax,%eax
1:	incl %eax		# check that A20 really IS enabled
	movl %eax,0x000000	# loop forever if it is not
	cmpl %eax,0x100000
	je 1b
/*
 * Initialize eflags.  Some BIOS's leave bits like NT set.  This would
 * confuse the debugger if this code is traced.
 * XXX - best to initialize before switching to protected mode.
 */
	pushl $0
	popfl
/*
 * Clear BSS: store zero bytes from _edata up to _end.
 */
	xorl %eax,%eax
	movl $ SYMBOL_NAME(_edata),%edi
	movl $ SYMBOL_NAME(_end),%ecx
	subl %edi,%ecx
	cld
	rep
	stosb
/*
 * Do the decompression, and jump to the new kernel..
 */
	subl $16,%esp		# place for structure on the stack
	pushl %esp		# address of structure as first arg
	call SYMBOL_NAME(decompress_kernel)
	orl %eax,%eax
	/*
	 * Taken when decompress_kernel returns non-zero; label 3 is not
	 * part of this excerpt.
	 */
	jnz 3f
	/* NOTE(review): presumably BX=0 marks the boot CPU for the code at 0x100000 - confirm */
	xorl %ebx,%ebx
	ljmp $(KERNEL_CS), $0x100000

ljmp $(KERNEL_CS), $0x100000

这条指令跳转到0x100000处的内核入口,最终会进入start_kernel函数。

/*
 * Kernel C entry point.
 *
 * Runs the architecture, memory, IRQ, scheduler and subsystem
 * initialisation calls in order, enables interrupts, starts the init
 * kernel thread and then turns the calling context into the idle loop.
 * With __SMP__, every CPU after the first is diverted to
 * start_secondary() instead of repeating this initialisation.
 */
asmlinkage void start_kernel(void)
{
	char *command_line;

#ifdef __SMP__
	/*
	 * This little check will move.
	 *
	 * first_cpu is a function-level static, not an ordinary local:
	 * it starts out as 1, and the first CPU to get here clears it
	 * below, so any CPU entering later sees 0 and goes to
	 * start_secondary().
	 */
	static int first_cpu = 1;

	if (!first_cpu) {
		start_secondary();
	}
	first_cpu = 0;
#endif

	/*
	 * Interrupts are still disabled. Do necessary setups, then
	 * enable them
	 */
	setup_arch(&command_line, &memory_start, &memory_end);
	memory_start = paging_init(memory_start, memory_end);
	trap_init();
	init_IRQ();
	sched_init();
	time_init();
	parse_options(command_line);
#ifdef CONFIG_MODULES
	init_modules();
#endif
#ifdef CONFIG_PROFILE
	/* pick a default shift only when none has been set yet */
	if (!prof_shift)
#ifdef CONFIG_PROFILE_SHIFT
		prof_shift = CONFIG_PROFILE_SHIFT;
#else
		prof_shift = 2;
#endif
#endif
	if (prof_shift) {
		/* carve the profiling buffer out of the start of free memory */
		prof_buffer = (unsigned int *) memory_start;
		/* only text is profiled */
		prof_len = (unsigned long) &_etext - (unsigned long) &_stext;
		prof_len >>= prof_shift;
		memory_start += prof_len * sizeof(unsigned int);
	}

	/* each of these returns the new start of free memory */
	memory_start = console_init(memory_start, memory_end);
#ifdef CONFIG_PCI
	memory_start = pci_init(memory_start, memory_end);
#endif
	memory_start = kmalloc_init(memory_start, memory_end);
	sti();
	calibrate_delay();
	memory_start = inode_init(memory_start, memory_end);
	memory_start = file_table_init(memory_start, memory_end);
	memory_start = name_cache_init(memory_start, memory_end);
#ifdef CONFIG_BLK_DEV_INITRD
	/* the initrd image is dropped if the allocations above ran past its start */
	if (initrd_start && initrd_start < memory_start) {
		printk(KERN_CRIT "initrd overwritten (0x%08lx < 0x%08lx) - "
		       "disabling it.\n", initrd_start, memory_start);
		initrd_start = 0;
	}
#endif
	mem_init(memory_start, memory_end);
	buffer_init();
	sock_init();
#if defined(CONFIG_SYSVIPC) || defined(CONFIG_KERNELD)
	ipc_init();
#endif
	dquot_init();
	arch_syms_export();
	sti();
	check_bugs();

	printk(linux_banner);
#ifdef __SMP__
	smp_init();
#endif
	sysctl_init();

	/*
	 * We count on the initial thread going ok
	 * Like idlers init is an unlocked kernel thread, which will
	 * make syscalls (and thus be locked).
	 */
	kernel_thread(init, NULL, 0);

	/*
	 * task[0] is meant to be used as an "idle" task: it may not sleep, but
	 * it might do some general things like count free pages or it could be
	 * used to implement a reasonable LRU algorithm for the paging routines:
	 * anything that can be useful, but shouldn't take time from the real
	 * processes.
	 *
	 * Right now task[0] just does a infinite idle loop.
	 */
	cpu_idle(NULL);
}

/*
 * Entry path for a CPU that reaches start_kernel() after the first
 * one: set up traps and IRQs, check in through smp_callin(), then
 * enter the idle loop.
 */
asmlinkage void start_secondary(void)
{
	trap_init();
	/* per the article: initialises this CPU's own IRQ handling */
	init_IRQ();
	/* checks in and then waits for the boot CPU's start signal */
	smp_callin();
	/* from here on this CPU runs the idle loop */
	cpu_idle(NULL);
}

void smp_callin(void)

{

extern void

calibrate_delay(void);

int

cpuid=GET_APIC_ID(apic_read(APIC_ID));

unsigned long l;

/*

*Activate

our APIC

*/

SMP_PRINTK(("CALLIN

%d\n",smp_processor_id()));

l=apic_read(APIC_SPIV);

l|=(1<<8);/*

Enable */

apic_write(APIC_SPIV,l);

sti();

/*

*Get

our bogomips.

*/

calibrate_delay();

/*

*Save

our processor parameters

*/

smp_store_cpu_info(cpuid);

/*

*Allow

the master to continue.

*/

set_bit(cpuid, (unsigned

long *)&cpu_callin_map[0]);

/*

*Until

we are ready for SMP scheduling

*/

load_ldt(0);

/*printk("Testing

faulting...\n");

*(long *)0=1; OOPS... */

local_flush_tlb();

while(!smp_commenced);

//这个可以看成是自旋锁,等待主cpu发smp_commenced信号即开始信号。

if (cpu_number_map[cpuid]

== -1)

while(1);

local_flush_tlb();

SMP_PRINTK(("Commenced..\n"));

load_TR(cpu_number_map[cpuid]);

/*while(1);*/

}

/*
 * Idle loop: calls idle() over and over and never returns.
 * The argument is not used.
 */
int cpu_idle(void *unused)
{
	while (1) {
		idle();
	}
}

主cpu给各次cpu发开始信号是在init函数中调用smp_begin函数:

/*
 * Gives the secondary CPUs their start signal: marks the SMP threads
 * as ready and then calls smp_commence().
 */
static void smp_begin(void)
{
	smp_threads_ready=1;
	/*
	 * NOTE(review): the article says this signals the secondaries
	 * through IPIs; smp_callin() spins on smp_commenced, so this
	 * presumably sets that flag - confirm against smp_commence().
	 */
	smp_commence();
}

每个cpu有一个current指针。

刚开始的时候由主cpu赋值为init_task;

在主cpu调用 sched_init赋值。

/*
 * Scheduler initialisation.
 *
 * Points the current-task slots at init_task (only the calling CPU's
 * slot without __SMP__, all NR_CPUS slots with it) and registers the
 * timer, task-queue and immediate bottom-half handlers.
 */
void sched_init(void)
{
	/*
	 *	We have to do a little magic to get the first
	 *	process right in SMP mode.
	 */
	/* per the article this is 0, since only the boot CPU calls sched_init */
	int cpu = smp_processor_id();

#ifndef __SMP__
	current_set[cpu] = &init_task;
#else
	/* record that init_task is running on the calling CPU */
	init_task.processor = cpu;
	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		current_set[cpu] = &init_task;
	}
#endif
	init_bh(TIMER_BH, timer_bh);
	init_bh(TQUEUE_BH, tqueue_bh);
	init_bh(IMMEDIATE_BH, immediate_bh);
}

同时,current_set中各cpu对应的表项还会在smp_init中进一步设置。

static void smp_init(void)

{

int i, j;

smp_boot_cpus();

/*

*Create

the slave init tasks as sharing pid 0.

*

*This

should only happen if we have virtual CPU numbers

*higher

than 0.

*/

for (i=1;

i

{

struct task_struct

*n, *p;

j =

cpu_logical_map[i];

/*

*We use

kernel_thread for the idlers which are

*unlocked

tasks running in kernel space.

*/

kernel_thread(cpu_idle,

NULL, CLONE_PID);

//这个其实就是创建线程然后这个线程体现在task[i]上了,因为创建的时候的task_struct就是从task[i]取的。

/*

*Don't

assume linear processor numbering

*/

current_set[j]=task[i];

current_set[j]->processor=j;

cli();

n =

task[i]->next_run;

p =

task[i]->prev_run;

nr_running--;

n->prev_run = p;

p->next_run = n;

task[i]->next_run

= task[i]->prev_run = task[i];

sti();

}

}

上面执行完后,每个次cpu都有了一个自己的idle任务。

然后start_kernel再通过kernel_thread(init, NULL, 0)创建init任务。

/*
 * The scheduler, shared by all CPUs (per the article, each CPU may end
 * up here from its timer interrupt path).
 *
 * Refuses to run when intr_count is non-zero, runs pending bottom
 * halves and the tq_scheduler queue, updates the state of the current
 * task, then walks the run queue starting after init_task and switches
 * to the task with the highest goodness() value.  When nothing beats
 * the initial weight, this CPU's idle task is chosen.
 */
asmlinkage void schedule(void)
{
	int c;
	struct task_struct * p;
	struct task_struct * prev, * next;
	unsigned long timeout = 0;
	int this_cpu=smp_processor_id();	/* id of the CPU running this call */

/* check alarm, wake up any interruptible tasks that have got a signal */

	if (intr_count)
		goto scheduling_in_interrupt;

	if (bh_active & bh_mask) {
		intr_count = 1;
		do_bottom_half();
		intr_count = 0;
	}

	run_task_queue(&tq_scheduler);

	need_resched = 0;
	prev = current;
	cli();
	/* move an exhausted RR process to be last.. */
	if (!prev->counter && prev->policy == SCHED_RR) {
		prev->counter = prev->priority;
		move_last_runqueue(prev);
	}
	switch (prev->state) {
		case TASK_INTERRUPTIBLE:
			if (prev->signal & ~prev->blocked)
				goto makerunnable;
			timeout = prev->timeout;
			if (timeout && (timeout <= jiffies)) {
				prev->timeout = 0;
				timeout = 0;
		makerunnable:
				prev->state = TASK_RUNNING;
				break;
			}
			/* fallthrough: still sleeping, so leave the run queue */
		default:
			del_from_runqueue(prev);
			/* fallthrough */
		case TASK_RUNNING:
			/* a label needs a statement after it */
			break;
	}
	/* first entry on the run queue after init_task */
	p = init_task.next_run;
	sti();

#ifdef __SMP__
	/*
	 *	This is safe as we do not permit re-entry of schedule()
	 */
	prev->processor = NO_PROC_ID;
#define idle_task (task[cpu_number_map[this_cpu]])
#else
#define idle_task (&init_task)
#endif

/*
 * Note! there may appear new tasks on the run-queue during this, as
 * interrupts are enabled. However, they will be put on front of the
 * list, so our list starting at "p" is essentially fixed.
 */
/* this is the scheduler proper: */
	c = -1000;
	next = idle_task;
	/*
	 * p starts at init_task.next_run; getting back to init_task
	 * means every entry on the run queue has been examined.
	 */
	while (p != &init_task) {
		int weight = goodness(p, prev, this_cpu);
		if (weight > c)
			c = weight, next = p;
		p = p->next_run;
	}

	/* if all runnable processes have "counter == 0", re-calculate counters */
	if (!c) {
		for_each_task(p)
			p->counter = (p->counter >> 1) + p->priority;
	}
#ifdef __SMP__
	/*
	 *	Allocate process to CPU
	 */
	/* mark the chosen task as running on this CPU */
	next->processor = this_cpu;
	next->last_processor = this_cpu;
#endif
#ifdef __SMP_PROF__
	/* mark processor running an idle thread */
	if (0==next->pid)
		set_bit(this_cpu,&smp_idle_map);
	else
		clear_bit(this_cpu,&smp_idle_map);
#endif
	if (prev != next) {
		struct timer_list timer;

		kstat.context_swtch++;
		/*
		 * When prev has a timeout pending, arm a timer that calls
		 * process_timeout for it; the timer is removed again once
		 * execution continues after switch_to().
		 */
		if (timeout) {
			init_timer(&timer);
			timer.expires = timeout;
			timer.data = (unsigned long) prev;
			timer.function = process_timeout;
			add_timer(&timer);
		}
		get_mmu_context(next);
		switch_to(prev,next);
		if (timeout)
			del_timer(&timer);
	}
	return;

scheduling_in_interrupt:
	printk("Aiee: scheduling in interrupt %p\n",
		__builtin_return_address(0));
}

上面需要注意的是current变量,在单核中肯定就是一个变量,在多核中肯定是各个cpu有自己的current:

其定义如下:

/* The task currently running on the calling CPU: this CPU's entry in current_set. */
#define current (0+current_set[smp_processor_id()])

在smp中current是current_set数组中的一个元素,是指具体一个cpu的当前进程。

从上面可以看出一个cpu是从全局task找一个task来运行,每个cpu有一个idle_task,这个task的编号是固定的。

所有的task可以通过init_task来找到,因为创建新进程(内核线程)的时候,会将新建的挂到链表上。

而init_task是静态挂在这上面的。

附上task_struct:

struct task_struct {

/* these are hardcoded - don't touch */

volatile long state;/* -1 unrunnable, 0 runnable, >0 stopped

*/

long counter;

long priority;

unsigned long signal;

unsigned long blocked;/* bitmap of masked signals */

unsigned long flags;/* per process flags, defined below */

int errno;

long debugreg[8];/* Hardware debugging registers */

struct exec_domain

*exec_domain;

/* various fields */

struct linux_binfmt

*binfmt;

struct task_struct

*next_task, *prev_task;

struct task_struct

*next_run,*prev_run;

unsigned long

saved_kernel_stack;

unsigned long

kernel_stack_page;

int exit_code,

exit_signal;

/* ??? */

unsigned long

personality;

int dumpable:1;

int did_exec:1;

/* shouldn't this be

pid_t? */

int pid;

int pgrp;

int tty_old_pgrp;

int session;

/* boolean value for

session group leader */

int leader;

intgroups[NGROUPS];

/*

* pointers to (original) parent process,

youngest child, younger sibling,

* older sibling, respectively.(p->father can be replaced with

* p->p_pptr->pid)

*/

struct task_struct

*p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr;

struct wait_queue

*wait_chldexit;/* for wait4() */

unsigned short

uid,euid,suid,fsuid;

unsigned short gid,egid,sgid,fsgid;

unsigned long timeout,

policy, rt_priority;

unsigned long

it_real_value, it_prof_value, it_virt_value;

unsigned long

it_real_incr, it_prof_incr, it_virt_incr;

struct timer_list

real_timer;

long utime, stime,

cutime, cstime, start_time;

/* mm fault and swap info: this can arguably be seen as either

mm-specific or thread-specific */

unsigned long min_flt,

maj_flt, nswap, cmin_flt, cmaj_flt, cnswap;

int swappable:1;

unsigned long

swap_address;

unsigned long

old_maj_flt;/* old value of maj_flt */

unsigned long dec_flt;/* page fault count of the last time */

unsigned long swap_cnt;/* number of pages to swap on next pass */

/* limits */

struct rlimit

rlim[RLIM_NLIMITS];

unsigned short used_math;

char comm[16];

/* file system info */

int link_count;

struct tty_struct *tty;

/* NULL if no tty */

/* ipc stuff */

struct sem_undo *semundo;

struct sem_queue

*semsleeping;

/* ldt for this task - used by Wine.If NULL, default_ldt is used */

struct desc_struct *ldt;

/* tss for this task */

struct thread_struct tss;

/* filesystem information */

struct fs_struct *fs;

/* open file information */

struct files_struct

*files;

/* memory management info */

struct mm_struct *mm;

/* signal handlers */

struct signal_struct

*sig;

#ifdef __SMP__

int processor;

int last_processor;

int lock_depth;/* Lock depth. We can context switch in and

out of holding a syscall kernel lock... */

#endif

};

故这个p = init_task.next_run;

p可以获取到所有在就绪状态的task;

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值