In the Linux scheduler, schedule() both selects the next task to run and carries out the switch between tasks. Within schedule(), the switch itself is performed mainly by two functions:
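To see where the two calls sit, here is a simplified sketch of the tail of schedule() (2.6.32-era; tracing and error paths elided):

	if (likely(prev != next)) {
		sched_info_switch(prev, next);	/* 1. update scheduling statistics */

		rq->nr_switches++;
		rq->curr = next;

		context_switch(rq, prev, next);	/* 2. do the real switch; unlocks the rq */
	} else
		spin_unlock_irq(&rq->lock);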
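sched_info_switch(prev, next) updates the scheduling statistics of the task being switched out, the task being switched in, and their runqueue (rq). It does the actual work by calling __sched_info_switch(). For reference, the fields it maintains and the thin wrapper around __sched_info_switch() look roughly like this in kernels of this era (sketch; the schedstats config guards are elided):

struct sched_info {
	/* cumulative counters */
	unsigned long pcount;		/* # of times run on this CPU */
	unsigned long long run_delay;	/* time spent waiting on a runqueue */

	/* timestamps */
	unsigned long long last_arrival,	/* when we last ran on a CPU */
			   last_queued;		/* when we were last queued to run */
};

static inline void
sched_info_switch(struct task_struct *prev, struct task_struct *next)
{
	/* schedstats may be disabled; do nothing in that case */
	if (unlikely(sched_info_on()))
		__sched_info_switch(prev, next);
}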
/*
* Called when tasks are switched involuntarily due, typically, to expiring
* their time slice. (This may also be called when switching to or from
* the idle task.) We are only called when prev != next.
*/
static inline void
__sched_info_switch(struct task_struct *prev, struct task_struct *next)
{
	struct rq *rq = task_rq(prev);

	/*
	 * prev now departs the cpu. It's not interesting to record
	 * stats about how efficient we were at scheduling the idle
	 * process, however.
	 */
	if (prev != rq->idle)		/* the outgoing task is not the idle task */
		sched_info_depart(prev);	/* update prev and its rq */

	if (next != rq->idle)		/* the incoming task is not the idle task */
		sched_info_arrive(next);	/* update next and its rq */
}
/*
* Called when a process ceases being the active-running process, either
* voluntarily or involuntarily. Now we can calculate how long we ran.
* Also, if the process is still in the TASK_RUNNING state, call
* sched_info_queued() to mark that it has now again started waiting on
* the runqueue.
*/
static inline void sched_info_depart(struct task_struct *t)
{
	/* how long the task just ran on this rq */
	unsigned long long delta = task_rq(t)->clock -
					t->sched_info.last_arrival;

	/* add that CPU time to the rq's accumulated total */
	rq_sched_info_depart(task_rq(t), delta);

	/*
	 * If the departing task is still TASK_RUNNING, it is merely being
	 * preempted: stamp sched_info.last_queued with the rq clock, since
	 * the task now starts waiting on the runqueue again.
	 */
	if (t->state == TASK_RUNNING)
		sched_info_queued(t);
}
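The two helpers used above are small; roughly (sketches, with the sched_info_on() guard shown once):

/* stamp the moment the still-runnable task rejoined the runqueue */
static inline void sched_info_queued(struct task_struct *t)
{
	if (unlikely(sched_info_on()))
		if (!t->sched_info.last_queued)
			t->sched_info.last_queued = task_rq(t)->clock;
}

/* credit the CPU time the departing task consumed to its rq */
static inline void
rq_sched_info_depart(struct rq *rq, unsigned long long delta)
{
	if (rq)
		rq->rq_cpu_time += delta;
}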
/*
* Called when a task finally hits the cpu. We can now calculate how
* long it was waiting to run. We also note when it began so that we
* can keep stats on how long its timeslice is.
*/
static void sched_info_arrive(struct task_struct *t)
{
	unsigned long long now = task_rq(t)->clock, delta = 0;

	if (t->sched_info.last_queued)	/* the task was waiting on a runqueue */
		delta = now - t->sched_info.last_queued;	/* time spent waiting */
	sched_info_reset_dequeued(t);	/* about to run, so clear last_queued */
	t->sched_info.run_delay += delta;	/* total time spent waiting on runqueues */
	t->sched_info.last_arrival = now;	/* when the task last started running */
	t->sched_info.pcount++;		/* one more stint on a CPU */

	/* mirror the wait time into the rq's own rq_sched_info */
	rq_sched_info_arrive(task_rq(t), delta);
}
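These per-task counters are exactly what /proc/<pid>/schedstat exposes; its formatting routine in kernels of this era is essentially (sketch):

static int proc_pid_schedstat(struct task_struct *task, char *buffer)
{
	return sprintf(buffer, "%llu %llu %lu\n",
			(unsigned long long)task->se.sum_exec_runtime,
			(unsigned long long)task->sched_info.run_delay,
			task->sched_info.pcount);
}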
context_switch() then does the actual switching work: the memory context (MM) and the hardware register state.
/*
* context_switch - switch to the new MM and the new
* thread's register state.
*/
static inline void
context_switch(struct rq *rq, struct task_struct *prev,
	       struct task_struct *next)
{
	struct mm_struct *mm, *oldmm;

	prepare_task_switch(rq, prev, next);
	trace_sched_switch(rq, prev, next);
	mm = next->mm;
	oldmm = prev->active_mm;
	/*
	 * For paravirt, this is coupled with an exit in switch_to to
	 * combine the page table reload and the switch backend into
	 * one hypercall.
	 */
	arch_start_context_switch(prev);

	if (unlikely(!mm)) {	/* next has no mm of its own (kernel thread) */
		next->active_mm = oldmm;	/* borrow the outgoing task's active_mm */
		atomic_inc(&oldmm->mm_count);	/* one more reference to oldmm */
		/* mark this CPU's cpu_tlbstate as lazy */
		enter_lazy_tlb(oldmm, next);
	} else	/* mm is non-NULL, so switch address spaces */
		switch_mm(oldmm, mm, next);

	if (unlikely(!prev->mm)) {	/* prev had no mm of its own: as seen
					 * above, its active_mm was borrowed
					 * from an earlier task, so drop it */
		prev->active_mm = NULL;
		rq->prev_mm = oldmm;	/* finish_task_switch() will mmdrop() it */
	}
	/*
	 * Since the runqueue lock will be released by the next
	 * task (which is an invalid locking op but in the case
	 * of the scheduler it's an obvious special-case), so we
	 * do an early lockdep release here:
	 */
#ifndef __ARCH_WANT_UNLOCKED_CTXSW
	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
#endif

	/* Here we just switch the register state and the stack. */
	switch_to(prev, next, prev);

	barrier();
	/*
	 * this_rq must be evaluated again because prev may have moved
	 * CPUs since it called schedule(), thus the 'rq' on its stack
	 * frame will be invalid.
	 */
	finish_task_switch(this_rq(), prev);
}
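Note how the rq->prev_mm set above gets consumed: after the switch, finish_task_switch() drops the reference taken by atomic_inc(&oldmm->mm_count). A trimmed sketch (TASK_DEAD handling and preempt notifiers omitted):

static void finish_task_switch(struct rq *rq, struct task_struct *prev)
{
	struct mm_struct *mm = rq->prev_mm;

	rq->prev_mm = NULL;
	finish_arch_switch(prev);
	finish_lock_switch(rq, prev);	/* releases the rq lock taken in schedule() */

	if (mm)
		mmdrop(mm);	/* balances the mm_count reference from context_switch() */
}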
switch_mm(), called above, switches the page tables and the per-CPU TLB state:

static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
			     struct task_struct *tsk)
{
	unsigned cpu = smp_processor_id();

	if (likely(prev != next)) {
		/* stop flush ipis for the previous mm */
		/* clear this CPU's bit in the outgoing mm's CPU mask */
		cpumask_clear_cpu(cpu, mm_cpumask(prev));
#ifdef CONFIG_SMP
		/* update the per-CPU cpu_tlbstate */
		percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
		percpu_write(cpu_tlbstate.active_mm, next);
#endif
		/* set this CPU's bit in the incoming mm's CPU mask */
		cpumask_set_cpu(cpu, mm_cpumask(next));

		/* Re-load page tables */
		load_cr3(next->pgd);	/* point CR3 at the incoming mm's pgd */

		/*
		 * load the LDT, if the LDT is different:
		 */
		if (unlikely(prev->context.ldt != next->context.ldt))
			load_LDT_nolock(&next->context);
	}
#ifdef CONFIG_SMP
	else {	/* the two tasks share the same address space */
		percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
		BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next);

		if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) {
			/* We were in lazy tlb mode and leave_mm disabled
			 * tlb flush IPI delivery. We must reload CR3
			 * to make sure to use no freed page tables.
			 */
			load_cr3(next->pgd);
			load_LDT_nolock(&next->context);
		}
	}
#endif
}
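The lazy-TLB branch pairs with leave_mm(): when another CPU flushes this mm, a CPU sitting in TLBSTATE_LAZY detaches itself instead of servicing further flush IPIs. A sketch of leave_mm() from arch/x86/mm/tlb.c of the same era:

void leave_mm(int cpu)
{
	if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
		BUG();	/* only a lazy CPU may leave */
	cpumask_clear_cpu(cpu,
			  mm_cpumask(percpu_read(cpu_tlbstate.active_mm)));
	load_cr3(swapper_pg_dir);	/* fall back to the kernel page tables */
}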
The register-level switch proper is done by switch_to(). This macro saves the registers in inline assembly and then jumps to the C function __switch_to(); the stack and instruction-pointer switch happens in the assembly:
/*
* Saving eflags is important. It switches not only IOPL between tasks,
* it also protects other tasks from NT leaking through sysenter etc.
*/
#define switch_to(prev, next, last)					\
do {									\
	/*								\
	 * Context-switching clobbers all registers, so we clobber	\
	 * them explicitly, via unused output variables.		\
	 * (EAX and EBP is not listed because EBP is saved/restored	\
	 * explicitly for wchan access and EAX is the return value of	\
	 * __switch_to())						\
	 */								\
	unsigned long ebx, ecx, edx, esi, edi;				\
									\
	asm volatile("pushfl\n\t"		/* save    flags */	\
		     "pushl %%ebp\n\t"		/* save    EBP   */	\
		     "movl %%esp,%[prev_sp]\n\t" /* save    ESP   */	\
		     "movl %[next_sp],%%esp\n\t" /* restore ESP   */	\
		     "movl $1f,%[prev_ip]\n\t"	/* save    EIP   */	\
		     /* push next_ip, then "return" into it via the	\
		      * jmp below; once execution reaches label 1	\
		      * the switch is complete */			\
		     "pushl %[next_ip]\n\t"	/* restore EIP   */	\
		     __switch_canary					\
		     "jmp __switch_to\n"	/* regparm call  */	\
		     "1:\t"						\
		     /* first instruction executed as the new task */	\
		     "popl %%ebp\n\t"		/* restore EBP   */	\
		     "popfl\n"			/* restore flags */	\
									\
		     /* output parameters */				\
		     : [prev_sp] "=m" (prev->thread.sp),		\
		       [prev_ip] "=m" (prev->thread.ip),		\
		       "=a" (last),					\
									\
		       /* clobbered output registers: */		\
		       "=b" (ebx), "=c" (ecx), "=d" (edx),		\
		       "=S" (esi), "=D" (edi)				\
									\
		       __switch_canary_oparam				\
									\
		       /* input parameters: */				\
		     : [next_sp]  "m" (next->thread.sp),		\
		       [next_ip]  "m" (next->thread.ip),		\
									\
		       /* regparm parameters for __switch_to(): */	\
		       [prev]     "a" (prev),				\
		       [next]     "d" (next)				\
									\
		       __switch_canary_iparam				\
									\
		     : /* reloaded segment registers */			\
			"memory");					\
} while (0)
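For a freshly forked task this works because copy_thread() primes thread.sp and thread.ip before the task is ever scheduled, so the pushed [next_ip] sends it to ret_from_fork rather than to label 1. The relevant lines from arch/x86/kernel/process_32.c look roughly like this (sketch):

	p->thread.sp = (unsigned long) childregs;	/* top of the new kernel stack */
	p->thread.sp0 = (unsigned long) (childregs + 1);
	p->thread.ip = (unsigned long) ret_from_fork;	/* first EIP after switch_to() */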
/*
 * switch_to(x,y) should switch tasks from x to y.
*
* We fsave/fwait so that an exception goes off at the right time
* (as a call from the fsave or fwait in effect) rather than to
* the wrong process. Lazy FP saving no longer makes any sense
* with modern CPU's, and this simplifies a lot of things (SMP
* and UP become the same).
*
* NOTE! We used to use the x86 hardware context switching. The
* reason for not using it any more becomes apparent when you
* try to recover gracefully from saved state that is no longer
* valid (stale segment register values in particular). With the
* hardware task-switch, there is no way to fix up bad state in
* a reasonable manner.
*
* The fact that Intel documents the hardware task-switching to
* be slow is a fairly red herring - this code is not noticeably
* faster. However, there _is_ some room for improvement here,
* so the performance issues may eventually be a valid point.
* More important, however, is the fact that this allows us much
* more flexibility.
*
* The return value (in %ax) will be the "prev" task after
* the task-switch, and shows up in ret_from_fork in entry.S,
* for example.
*/
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread,
			     *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);	/* init_tss is a per-CPU variable */
	bool preload_fpu;

	/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */

	/*
	 * If the task has used fpu the last 5 timeslices, just do a full
	 * restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now
	 */
	preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;

	/* save prev's FPU state, if it used the FPU */
	__unlazy_fpu(prev_p);

	/* we're going to use this soon, after a few expensive things */
	if (preload_fpu)
		prefetch(next->xstate);

	/*
	 * Reload esp0.
	 */
	/*
	 * Load next_p->thread.sp0 into the esp0 field of this CPU's TSS;
	 * any user-to-kernel privilege transition raised by the sysenter
	 * instruction copies this address into the ESP register.
	 */
	load_sp0(tss, next);

	/*
	 * Save away %gs. No need to save %fs, as it was saved on the
	 * stack on entry. No need to save %es and %ds, as those are
	 * always kernel segments while inside the kernel. Doing this
	 * before setting the new TLS descriptors avoids the situation
	 * where we temporarily have non-reloadable segments in %fs
	 * and %gs. This could be an issue if the NMI handler ever
	 * used %fs or %gs (it does not today), or if the kernel is
	 * running inside of a hypervisor layer.
	 */
	lazy_save_gs(prev->gs);

	/*
	 * Load the per-thread Thread-Local Storage descriptor.
	 */
	/*
	 * Install next's thread-local storage (TLS) segments into this
	 * CPU's GDT; the three segment descriptors are kept in the
	 * tls_array of the thread structure.
	 */
	load_TLS(next, cpu);

	/*
	 * Restore IOPL if needed. In normal use, the flags restore
	 * in the switch assembly will handle this. But if the kernel
	 * is running virtualized at a non-zero CPL, the popf will
	 * not restore flags, so it must be done in a separate step.
	 */
	if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
		set_iopl_mask(next->iopl);

	/*
	 * Now maybe handle debug registers and/or IO bitmaps
	 */
	if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
		     task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
		__switch_to_xtra(prev_p, next_p, tss);

	/* If we're going to preload the fpu context, make sure clts
	   is run while we're batching the cpu state updates. */
	if (preload_fpu)
		clts();

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_end_context_switch(next_p);

	if (preload_fpu)
		__math_state_restore();	/* reload the FPU state now */

	/*
	 * Restore %gs if needed (which is common)
	 */
	if (prev->gs | next->gs)
		lazy_load_gs(next->gs);

	percpu_write(current_task, next_p);

	return prev_p;
}
static inline void __unlazy_fpu(struct task_struct *tsk)
{
	/*
	 * The TS_USEDFPU flag lives in the status field of thread_info.
	 * It records whether the task has used the FPU/MMX/XMM registers
	 * during its current stint on the CPU.
	 */
	if (task_thread_info(tsk)->status & TS_USEDFPU) {
		/*
		 * tsk executed FPU/MMX/SSE/SSE2 instructions this time
		 * around, so the kernel must save the corresponding
		 * hardware context.
		 */
		__save_init_fpu(tsk);
		stts();
	} else
		tsk->fpu_counter = 0;
}
static inline void __save_init_fpu(struct task_struct *tsk)
{
	/* use XSAVE if the CPU supports it, otherwise FXSAVE */
	if (task_thread_info(tsk)->status & TS_XSAVE)
		xsave(tsk);
	else
		fxsave(tsk);

	clear_fpu_state(tsk);
	task_thread_info(tsk)->status &= ~TS_USEDFPU;	/* clear the TS_USEDFPU flag */
}
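For completeness: the preload path in __switch_to() ends in __math_state_restore(), which reloads the saved context and re-marks the task as an FPU user. A sketch of it from arch/x86/kernel/traps.c of the same era:

void __math_state_restore(void)
{
	struct thread_info *thread = current_thread_info();
	struct task_struct *tsk = thread->task;

	/* paranoid restore: kill the task if its saved FPU image is bad */
	if (unlikely(restore_fpu_checking(tsk))) {
		stts();
		force_sig(SIGSEGV, tsk);
		return;
	}

	thread->status |= TS_USEDFPU;	/* so we save the state on the next switch */
	tsk->fpu_counter++;
}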