在linux中进程切换的核心函数是context_switch()定义在kernel/sched/core.c#L2711, 如下所示
/*
* context_switch - switch to the new MM and the new thread's register state.
*/
static __always_inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next)
{
struct mm_struct *mm, *oldmm;
/* 完成进程切换的准备工作 */
prepare_task_switch(rq, prev, next);
mm = next->mm;
oldmm = prev->active_mm;
/*
* For paravirt, this is coupled with an exit in switch_to to
* combine the page table reload and the switch backend into
* one hypercall.
*/
arch_start_context_switch(prev);
/* 如果next是内核线程,则线程使用prev所使用的地址空间
* schedule( )函数把该线程设置为懒惰TLB模式
* 内核线程并不拥有自己的页表集(task_struct->mm = NULL)
* 它使用一个普通进程的页表集
* 不过,没有必要使一个用户态线性地址对应的TLB表项无效
* 因为内核线程不访问用户态地址空间。
*/
if (!mm) /* 内核线程无虚拟地址空间, mm = NULL*/
{
/* 内核线程的active_mm为上一个进程的mm
* 注意此时如果prev也是内核线程,
* 则oldmm为NULL, 即next->active_mm也为NULL */
next->active_mm = oldmm;
/* 增加mm的引用计数 */
atomic_inc(&oldmm->mm_count);
/* 通知底层体系结构不需要切换虚拟地址空间的用户部分
* 这种加速上下文切换的技术称为惰性TBL */
enter_lazy_tlb(oldmm, next);
}
else /* 不是内核线程, 则需要切切换虚拟地址空间 */
switch_mm(oldmm, mm, next);
/* 如果prev是内核线程或正在退出的进程
* 就重新设置prev->active_mm
* 然后把指向prev内存描述符的指针保存到运行队列的prev_mm字段中
*/
if (!prev->mm)
{
/* 将prev的active_mm赋值和为空 */
prev->active_mm = NULL;
/* 更新运行队列的prev_mm成员 */
rq->prev_mm = oldmm;
}
/*
* Since the runqueue lock will be released by the next
* task (which is an invalid locking op but in the case
* of the scheduler it's an obvious special-case), so we
* do an early lockdep release here:
*/
lockdep_unpin_lock(&rq->lock);
spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
/* Here we just switch the register state and the stack.
* 切换进程的执行环境, 包括堆栈和寄存器
* 同时返回上一个执行的程序
* 相当于prev = witch_to(prev, next) */
switch_to(prev, next, prev);
/* switch_to之后的代码只有在
* 当前进程再次被选择运行(恢复执行)时才会运行
* 而此时当前进程恢复执行时的上一个进程可能跟参数传入时的prev不同
* 甚至可能是系统中任意一个随机的进程
* 因此switch_to通过第三个参数将此进程返回
*/
/* 路障同步, 一般用编译器指令实现
* 确保了switch_to和finish_task_switch的执行顺序
* 不会因为任何可能的优化而改变 */
barrier();
/* 进程切换之后的处理工作 */
return finish_task_switch(prev);
}
可把context_switch()函数分成5个部分:
1.prepare_task_switch()
在进程切换之前, 首先执行调用每个体系结构都必须定义的prepare_task_switch挂钩, 这使得内核执行特定于体系结构的代码, 为切换做事先准备该函数定义在kernel/sched/core.c, line 2558, 如下所示:
/**
* prepare_task_switch - prepare to switch tasks
* @rq: the runqueue preparing to switch
* @prev: the current task that is being switched out
* @next: the task we are going to switch to.
*
* This is called with the rq lock held and interrupts off. It must
* be paired with a subsequent finish_task_switch after the context
* switch.
*
* prepare_task_switch sets up locking and calls architecture specific
* hooks.
*/
static inline void
prepare_task_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next)
{
sched_info_switch(rq, prev, next);
perf_event_task_sched_out(prev, next);
fire_sched_out_preempt_notifiers(prev, next);
prepare_lock_switch(rq, next);
prepare_arch_switch(next);
}
2.switch_mm()
切换全局页目录以安装一个新的地址空间
3.switch_to()
执行环境的切换是在switch_to()中完成的, switch_to完成最终的进程切换,它保存原进程的所有寄存器信息,恢复新进程的所有寄存器信息,并执行新的进程, 以x86_32位下的switch_to函数为例, 其定义在arch/x86/include/asm/switch_to.h, line 27
/*
* Saving eflags is important. It switches not only IOPL between tasks,
* it also protects other tasks from NT leaking through sysenter etc.
*/
#define switch_to(prev, next, last) \
do { \
/* \
* Context-switching clobbers all registers, so we clobber \
* them explicitly, via unused output variables. \
* (EAX and EBP is not listed because EBP is saved/restored \
* explicitly for wchan access and EAX is the return value of \
* __switch_to()) \
*/ \
unsigned long ebx, ecx, edx, esi, edi; \
\
asm volatile("pushfl\n\t" /* save flags 保存就的ebp、和flags寄存器到旧进程的内核栈中*/ \
"pushl %%ebp\n\t" /* save EBP */ \
"movl %%esp,%[prev_sp]\n\t" /* save ESP 将旧进程esp保存到thread_info结构中 */ \
"movl %[next_sp],%%esp\n\t" /* restore ESP 用新进程esp填写esp寄存器,此时内核栈已切换 */ \
"movl $1f,%[prev_ip]\n\t" /* save EIP 将该进程恢复执行时的下条地址保存到旧进程的thread中*/ \
"pushl %[next_ip]\n\t" /* restore EIP 将新进程的ip值压入到新进程的内核栈中 */ \
__switch_canary \
"jmp __switch_to\n" /* regparm call */ \
"1:\t" \
"popl %%ebp\n\t" /* restore EBP 该进程执行,恢复ebp寄存器*/ \
"popfl\n" /* restore flags 恢复flags寄存器*/ \
\
/* output parameters */ \
: [prev_sp] "=m" (prev->thread.sp), \
[prev_ip] "=m" (prev->thread.ip), \
"=a" (last), \
\
/* clobbered output registers: */ \
"=b" (ebx), "=c" (ecx), "=d" (edx), \
"=S" (esi), "=D" (edi) \
\
__switch_canary_oparam \
\
/* input parameters: */ \
: [next_sp] "m" (next->thread.sp), \
[next_ip] "m" (next->thread.ip), \
\
/* regparm parameters for __switch_to(): */ \
[prev] "a" (prev), \
[next] "d" (next) \
\
__switch_canary_iparam \
\
: /* reloaded segment registers */ \
"memory"); \
} while (0)
先对flags寄存器和ebp压入旧进程内核栈,并将确定旧进程恢复执行的下一跳地址,并将旧进程ip,esp保存到task_struct->thread_info中,这样旧进程保存完毕;然后用新进程的thread_info->esp恢复新进程的内核堆栈,用thread->info的ip恢复新进程地址执行。
4.barrier()
为了程序编译后指令的执行顺序不会因为编译器的优化而改变, 因此内核提供了路障同步barrier来保证程序的执行顺序
5.finish_task_switch()
完成一些清理工作, 使得能够正确的释放锁