Process "Parent"
--> 软中断int $0x80 [entry.S] <---> _set_gate(idt_table+0x80,15,3,system_call,__KERNEL_CS); [arch/i386/kernel/traps.c]
system_call:
pushl %eax /* __NR_fork */
SAVE_ALL
syscall_call:
call *sys_call_table(,%eax,4)
/* 于是,系统流程转向函数sys_fork()。(“arch/i386/kernel/process.c”)*/
asmlinkage int sys_fork(struct pt_regs regs) /* regs来自陷入内核时的SAVE_ALL
* 和硬件压入的%eip, %cs, %oldesp, %oldss */
{
return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
/* SIGCHLD告诉do_fork()函数应创建一子进程 */
}
--> do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL)
--> alloc_pid()
...
--> copy_process(SIGCHLD, regs.esp, &regs, 0, NULL, NULL)
...
--> dup_task_struct(current)
这个函数创建了一个空进程描述符和一个空内核栈,并把它们关联起来,最后返回这个描述符给'p'
struct task_struct *tsk; /* 指向新创建的进程描述符 */
struct thread_info *ti; /* 指向新的内核栈 */
tsk = alloc_task_struct(); /* 创建一个空的进程描述符 */
ti = alloc_thread_info(tsk); /* 创建一个空的内核栈 */
*tsk = *current; /* 复制父进程的描述符 */
tsk->thread_info = ti; /* 关联起来 */
setup_thread_stack(tsk, orig); /* 初始化子进程内核栈中的thread_info为父进程(current)的thread_info
* 的内容 ,然后将子进程的thread_info 和子进程描述符关联起来 */
...
--> copy_thread(0, clone_flags, regs.esp, /* unused */, p /* 被创建进程 */, regs)
--> childregs = task_pt_regs(p);
得到子进程内核栈中寄存器的起始地址,见下图
|_____thread_info____| LOW
|__________________|
|__________________|
|__________________|
|__________________|
childregs-->|__________________|
|__________________|-->空出32个字节(Ring0)
HIGH
*childregs = *regs; /* 即子进程内核栈中存放陷入内核之前所有寄存器的值 */
childregs->eax = 0; /* 子进程从fork返回0 */
childregs->esp = regs.esp; /* WHY HERE? */
p->thread.esp = (unsigned long) childregs;
p->thread.esp0 = (unsigned long) (childregs+1); /* 记录寄存器信息,内核栈顶esp和esp0 */
p->thread.eip = (unsigned long) ret_from_fork;
...
perfctr_copy_task(p, regs);
...
--> if (!(clone_flags & CLONE_STOPPED))
wake_up_new_task(p, clone_flags);
else
p->state = TASK_STOPPED;
--> void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
--> rq = task_rq_lock(p, &flags);
--> rq = task_rq(p); /* 得到p所在CPU的运行队列rq */
--> this_cpu = smp_processor_id();
cpu = task_cpu(p);
--> if (likely(cpu == this_cpu)) {
if (!(clone_flags & CLONE_VM)) {
/* If the child will run on the same CPU as the parent, and parent and child
 * do not share the same set of page tables (CLONE_VM flag cleared), it then
 * forces the child to run before the parent by inserting it into the
 * parent's runqueue right before the parent. */
if (unlikely(!current->array))
__activate_task(p, rq);
else {
p->prio = current->prio;
p->normal_prio = current->normal_prio;
/*
* Each task_struct descriptor includes a run_list field of type
* list_head. If the process priority is equal to k (a value ranging
* between 0 and 139), the run_list field links the process
* descriptor into the list of runnable processes having priority k.
*/
list_add_tail(&p->run_list, &current->run_list); /*Run child first*/
p->array = current->array;
p->array->nr_active++;
inc_nr_running(p, rq);
}
set_need_resched(); /* set the TIF_NEED_RESCHED flag of the 'current'
* As seen above, the child process is going to run
* immediately */
} else
/* Run child last */
__activate_task(p, rq);
...
--> 此时,当前CPU还在执行父进程,即current,current从fork()系统调用返回到entry.S中继续执行,返回值nr(即子进程
号)存放在%eax中
[entry.S]
movl %eax,EAX(%esp) # store the return value
syscall_exit:
cli # make sure we don't miss an interrupt
# setting need_resched or sigpending
# between sampling and the iret
TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
testw $_TIF_ALLWORK_MASK, %cx # current->work
jne syscall_exit_work
/* 正如内核中的注释,如果当前进程还有什么额外的工作需要完成,则跳转到syscall_exit_work */
syscall_exit_work:
testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
jz work_pending # jz: 如果thread_info的这些位都未被设置(结果为零),则跳到work_pending
# 由于在wake_up_new_task()函数中,通过set_need_resched()
# 设置了_TIF_NEED_RESCHED,因此在work_pending中会调用调度函数schedule
# 见下面分析
TRACE_IRQS_ON
sti # could let do_syscall_trace() call
# schedule() instead
movl %esp, %eax
movl $1, %edx
call do_syscall_trace
/* 上面的工作处理完后,跳转到resume_userspace. 由于系统调用属于软中断,也就是用户
 * 请求内核为自己做一些工作,因此最后还要回到用户层 */
jmp resume_userspace
work_pending:
testb $(1<<TIF_NEED_RESCHED), %cl
jz work_notifysig
work_resched:
call schedule
cli
jmp resume_userspace
最后:
ENTRY(resume_userspace)
cli # make sure we don't miss an interrupt
# setting need_resched or sigpending
# between sampling and the iret
movl TI_flags(%ebp), %ecx
andl $_TIF_WORK_MASK, %ecx # is there any work to be done on
# int/exception return?
jne work_pending
jmp restore_all
--> 由于父进程调用了schedule(),因此将切换到新进程next被执行,这个进程很可能就是父进程刚创建的那个子进程
asmlinkage void __sched schedule(void)
...
--> idx = sched_find_first_bit(array->bitmap); /* 这就是这个算法为何是O(1)的 */
queue = array->queue + idx; /* 最高优先级的进程 */
next = list_entry(queue->next, struct task_struct, run_list);
...
--> 如果是子进程,那么子进程将会执行ret_from_fork
这是因为在copy_thread()中, p->thread.eip = (unsigned long) ret_from_fork;
--> ENTRY(ret_from_fork)
pushl %eax # (%esp) = 0, %esp = %esp - 4
call schedule_tail # invokes the finish_task_switch() function
# to complete the process switch
popl %eax # %eax = 0
pushl $0x0202 # Reset kernel eflags
popfl
jmp syscall_exit # 见上面的分析
...
restore_all:
...
iret <--『这就是硬件性能计数器开始计数的点!!!!』
最后的最后,子进程执行iret,真正return from fork ...
iret--->依次从内核栈中弹出%eip, %cs, %eflags
如果从内核态返回到用户态(特权级改变) ------------------------> 则继续弹出%oldesp, %oldss
如果从内核态返回到内核态(特权级不变) ------------------------> 则iret工作到此结束
而由于前面分析过,*childregs = *regs; /* 即子进程内核栈中存放陷入内核之前所有寄存器的值 */
因此父进程和子进程从同一段代码开始执行,只是返回值不同。