上下文切换的具体步骤
场景:进程A下CPU,进程B上CPU
1. 保存进程A的上下文环境(程序计数器,程序状态字,其他寄存器…)
2. 用新状态和其他相关信息更新进程A的PCB
3. 把进程A移至合适的队列(就绪,阻塞…)
4. 将进程B的状态设置为运行态
5. 从进程B的PCB中恢复上下文(程序计数器,程序状态字,其他寄存器…)
Ref: 陈奶奶2018.3.29的课件
上下文切换的代码实现
schedule 调度函数
// Ref: linux-2.6.0\kernel\sched.c
/*
* schedule() is the main scheduler function.
*
* Visible flow: account how long the outgoing task (prev) ran, take it
* off the runqueue if its state says it is no longer runnable, choose
* the next task from the runqueue's active priority array (or the idle
* task when nothing is runnable), charge prev's sleep_avg, and perform
* the context switch when next differs from prev. The whole sequence is
* retried from need_resched while TIF_NEED_RESCHED is set on the way out.
*/
asmlinkage void schedule(void)
{
task_t *prev, *next;
runqueue_t *rq;
prio_array_t *array;
struct list_head *queue;
unsigned long long now;
unsigned long run_time;
int idx;
/*
* Test if we are atomic. Since do_exit() needs to call into
* schedule() atomically, we ignore that path for now.
* Otherwise, whine if we are scheduling when we should not be.
*/
if (likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) {
if (unlikely(in_atomic())) {
printk(KERN_ERR "bad: scheduling while atomic!\n");
dump_stack();
}
}
need_resched:
/* Preemption stays disabled until preempt_enable_no_resched() below. */
preempt_disable();
prev = current;
rq = this_rq();
release_kernel_lock(prev);
now = sched_clock();
/* Time since prev's last timestamp, capped at NS_MAX_SLEEP_AVG. */
if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG))
run_time = now - prev->timestamp;
else
run_time = NS_MAX_SLEEP_AVG;
/*
* Tasks with interactive credits get charged less run_time
* at high sleep_avg to delay them losing their interactive
* status
*/
if (HIGH_CREDIT(prev))
/* GNU "a ?: b" form: divide by the bonus, or by 1 when the bonus is 0. */
run_time /= (CURRENT_BONUS(prev) ? : 1);
spin_lock_irq(&rq->lock);
/*
* if entering off of a kernel preemption go straight
* to picking the next task.
*/
if (unlikely(preempt_count() & PREEMPT_ACTIVE))
goto pick_next_task;
/* Decide from prev's state whether it stays on the runqueue. */
switch (prev->state) {
case TASK_INTERRUPTIBLE:
/* A pending signal makes prev runnable again instead of deactivating it. */
if (unlikely(signal_pending(prev))) {
prev->state = TASK_RUNNING;
break;
}
/* fall through: no signal pending, handle like any other non-running state */
default:
deactivate_task(prev, rq);
/* nvcsw is bumped on the deactivate path, nivcsw when prev was still TASK_RUNNING. */
prev->nvcsw++;
break;
case TASK_RUNNING:
prev->nivcsw++;
}
pick_next_task:
if (unlikely(!rq->nr_running)) {
#ifdef CONFIG_SMP
/* Nothing runnable here: call load_balance() and retry if that made nr_running non-zero. */
load_balance(rq, 1, cpu_to_node_mask(smp_processor_id()));
if (rq->nr_running)
goto pick_next_task;
#endif
/* Still nothing to run: select the runqueue's idle task. */
next = rq->idle;
rq->expired_timestamp = 0;
goto switch_tasks;
}
array = rq->active;
if (unlikely(!array->nr_active)) {
/*
* Switch the active and expired arrays.
*/
rq->active = rq->expired;
rq->expired = array;
array = rq->active;
rq->expired_timestamp = 0;
}
/* Index of the first set bit in the bitmap selects the queue; take that queue's first task. */
idx = sched_find_first_bit(array->bitmap);
queue = array->queue + idx;
next = list_entry(queue->next, task_t, run_list);
if (next->activated > 0) {
/*
* next was marked "activated": recompute its priority with the time
* since its timestamp added in, then requeue it in the same array.
* For activated == 1 that time is scaled by ON_RUNQUEUE_WEIGHT percent.
*/
unsigned long long delta = now - next->timestamp;
if (next->activated == 1)
delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
array = next->array;
dequeue_task(next, array);
recalc_task_prio(next, next->timestamp + delta);
enqueue_task(next, array);
}
next->activated = 0;
switch_tasks:
prefetch(next);
clear_tsk_need_resched(prev);
RCU_qsctr(task_cpu(prev))++;
/* Charge the time just run against prev's sleep_avg, clamping at zero. */
prev->sleep_avg -= run_time;
if ((long)prev->sleep_avg <= 0){
prev->sleep_avg = 0;
/* Only tasks that are neither HIGH_CREDIT nor LOW_CREDIT lose a credit here. */
if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev)))
prev->interactive_credit--;
}
prev->timestamp = now;
if (likely(prev != next)) {
// Only switch when prev and next are different tasks
next->timestamp = now;
rq->nr_switches++;
rq->curr = next;
prepare_arch_switch(rq, next);
// Context Switching!
// NOTE(review): rq->lock is not unlocked on this path in schedule() itself;
// presumably prepare_arch_switch()/finish_task_switch() handle it - confirm.
prev = context_switch(rq, prev, next);
barrier();
finish_task_switch(prev);
} else
/* No switch needed: release the runqueue lock taken by spin_lock_irq() above. */
spin_unlock_irq(&rq->lock);
reacquire_kernel_lock(current);
preempt_enable_no_resched();
/* Another reschedule was requested in the meantime: run the whole sequence again. */
if (test_thread_flag(TIF_NEED_RESCHED))
goto need_resched;
}
context_switch() 进程上下文切换
调用switch_mm(),把虚拟内存从一个进程映射切换到新进程中。主要包括加载页表, 刷出地址转换后备缓冲器(部分或者全部), 向内存管理单元(MMU)提供新的信息
调用switch_to(),从上一个进程的处理器状态切换到新进程的处理器状态。包括保存、恢复栈信息和寄存器信息
// Ref: linux-2.6.0\kernel\sched.c
/*
* context_switch - switch to the new MM and the new
* thread's register state.
*
* rq:   runqueue on which the switch happens; only rq->prev_mm is touched here.
* prev: task being switched out.
* next: task being switched in.
*
* Returns the value left in prev after switch_to(prev, next, prev).
*/
static inline task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next)
{
struct mm_struct *mm = next->mm;
struct mm_struct *oldmm = prev->active_mm;
if (unlikely(!mm)) {
// next has no address space of its own (mm == NULL): the kernel-thread case
// so it borrows the outgoing task's active_mm
next->active_mm = oldmm;
// take a reference on the borrowed mm
atomic_inc(&oldmm->mm_count);
enter_lazy_tlb(oldmm, next);
} else
// next is not a kernel thread, so the virtual address space must be switched
switch_mm(oldmm, mm, next);
if (unlikely(!prev->mm)) {
// prev was itself running on a borrowed mm: detach it and park the mm in
// rq->prev_mm (warning if that slot is already occupied).
// NOTE(review): presumably the reference is dropped after the switch,
// e.g. in finish_task_switch() - confirm.
prev->active_mm = NULL;
WARN_ON(rq->prev_mm);
rq->prev_mm = oldmm;
}
/* Here we just switch the register state and the stack. */
// switch the execution environment of the task: stack and registers
switch_to(prev, next, prev);
return prev;
}
switch_to() 函数
switch_mm() 和 switch_to() 是体系结构相关的代码,这里以x86_64为例来展示代码。
内核在switch_to中执行如下操作
1. 进程切换, 即rsp的切换, 从rsp可以找到进程的描述符
2. 硬件上下文切换,通过call指令调用__switch_to函数
// Ref: linux-2.6.0\include\asm-x86_64\system.h
/*
 * switch_to(prev, next, last) - x86_64 low-level switch between two tasks.
 *
 * prev: outgoing task, passed in RDI (constraint "D").
 * next: incoming task, passed in RSI (constraint "S").
 * last: receives the value of RAX when the asm finishes (constraint "=a").
 *
 * Sequence emitted by the asm:
 *   1. SAVE_CONTEXT (defined elsewhere).
 *   2. Store the current RSP into prev's thread.rsp.
 *   3. Load RSP from next's thread.rsp, which switches kernel stacks.
 *   4. call __switch_to.
 *   5. At thread_return: load the pcurrent pointer from the PDA via %gs,
 *      fetch its thread_info pointer, and test-and-clear bit TIF_FORK in
 *      thread_info.flags with btr. If the bit was set (carry), jump to
 *      ret_from_fork with RAX copied into RDI; otherwise RESTORE_CONTEXT.
 *
 * Annotations inside the macro body must be block comments followed by a
 * continuation backslash. A "//" comment line without a trailing backslash
 * ends the macro definition, and one with a trailing backslash would
 * swallow the following line, because lines are spliced before comments
 * are removed.
 */
#define switch_to(prev,next,last) \
asm volatile(SAVE_CONTEXT \
/* save the outgoing task's RSP into prev's thread.rsp */ \
"movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
/* load RSP from next's thread.rsp: this switches the kernel stack */ \
"movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
"call __switch_to\n\t" \
".globl thread_return\n" \
"thread_return:\n\t" \
"movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \
"movq %P[thread_info](%%rsi),%%r8\n\t" \
"btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \
"movq %%rax,%%rdi\n\t" \
"jc ret_from_fork\n\t" \
RESTORE_CONTEXT \
: "=a" (last) \
: [next] "S" (next), [prev] "D" (prev), \
[threadrsp] "i" (offsetof(struct task_struct, thread.rsp)), \
[ti_flags] "i" (offsetof(struct thread_info, flags)),\
[tif_fork] "i" (TIF_FORK), \
[thread_info] "i" (offsetof(struct task_struct, thread_info)), \
[pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \
: "memory", "cc" __EXTRA_CLOBBER)
/*
 * Declaration only; the definition is not part of this excerpt.
 * NOTE(review): presumably loads the given selector into the GS segment
 * register - confirm against the definition.
 */
extern void load_gs_index(unsigned);
// Ref: linux-2.6.0\include\asm-x86_64\thread_info.h
/*
 * Low-level per-thread data. switch_to() addresses the flags member by
 * offsetof(), so the layout of this struct is referenced from assembly.
 */
struct thread_info {
struct task_struct *task; /* main task structure */
struct exec_domain *exec_domain; /* execution domain */
__u32 flags; /* low level flags; switch_to() tests and clears bit TIF_FORK here */
__u32 status; /* thread synchronous flags */
__u32 cpu; /* current CPU */
int preempt_count; /* NOTE(review): presumably the counter behind preempt_count() - confirm */
mm_segment_t addr_limit; /* NOTE(review): meaning not visible in this excerpt */
struct restart_block restart_block; /* NOTE(review): meaning not visible in this excerpt */
};
// Ref: linux-2.6.0\include\asm-x86_64\system.h
/*
 * RESTORE_CONTEXT - asm string fragment used at the end of switch_to().
 *
 * Reloads fourteen registers from stack slots relative to RSP (one
 * __RESTORE per register, slot numbers 0..13), then releases those
 * 14 * 8 bytes with addq and pops the flags register with popfq.
 * NOTE(review): presumably the exact mirror of SAVE_CONTEXT, which is not
 * shown in this excerpt - confirm the slot numbering against it.
 */
#define RESTORE_CONTEXT \
__RESTORE(rbx, 12) __RESTORE(rdi, 1) \
__RESTORE(rdx, 6) __RESTORE(rcx, 7) \
__RESTORE(r12, 2) __RESTORE(r13, 3) \
__RESTORE(r14, 4) __RESTORE(r15, 5) \
__RESTORE(r10, 10) __RESTORE(r11, 11) \
__RESTORE(r8, 8) __RESTORE(r9, 9) \
__RESTORE(rbp, 13) __RESTORE(rsi, 0) \
"addq $14*8,%%rsp\n\t" \
"popfq\n\t"
/*
 * __RESTORE(reg, offset) - builds one asm line that loads register `reg`
 * from the stack slot at (14 - offset) * 8 bytes above RSP. Both arguments
 * are stringified, so `offset` must be a literal number.
 */
#define __RESTORE(reg,offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"