context_switch
The core function for a process context switch is `context_switch()`. This code lives in the architecture-independent part of the kernel, in `kernel/sched/core.c`.
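For orientation, `context_switch()` is called from `__schedule()` once the scheduler has picked the next runnable task. A simplified sketch of that call site, matching the three-argument variant quoted below (details vary between kernel versions):
```C
/* Simplified sketch of the call site in __schedule() (kernel/sched/core.c).
 * This matches the older, three-argument form of context_switch() quoted
 * below; newer kernels pass a struct rq_flags and return the runqueue. */
if (likely(prev != next)) {
        rq->nr_switches++;
        rq->curr = next;
        context_switch(rq, prev, next);   /* also releases the runqueue lock */
}
```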
```C
static inline void
context_switch(struct rq *rq, struct task_struct *prev,
               struct task_struct *next)
{
        struct mm_struct *mm, *oldmm;

        prepare_task_switch(rq, prev, next);

        mm = next->mm;
        oldmm = prev->active_mm;
        /*
         * For paravirt, this is coupled with an exit in switch_to to
         * combine the page table reload and the switch backend into
         * one hypercall.
         */
        arch_start_context_switch(prev);

        if (!mm) {
                next->active_mm = oldmm;
                atomic_inc(&oldmm->mm_count);
                enter_lazy_tlb(oldmm, next);
        } else
                switch_mm(oldmm, mm, next);

        if (!prev->mm) {
                prev->active_mm = NULL;
                rq->prev_mm = oldmm;
        }
        /*
         * Since the runqueue lock will be released by the next
         * task (which is an invalid locking op but in the case
         * of the scheduler it's an obvious special-case), so we
         * do an early lockdep release here:
         */
        spin_release(&rq->lock.dep_map, 1, _THIS_IP_);

        context_tracking_task_switch(prev, next);

        /* Here we just switch the register state and the stack. */
        switch_to(prev, next, prev);

        barrier();
        /*
         * this_rq must be evaluated again because prev may have moved
         * CPUs since it called schedule(), thus the 'rq' on its stack
         * frame will be invalid.
         */
        finish_task_switch(this_rq(), prev);
}
```
`context_switch()` takes three parameters. `rq` points to the current run queue; `struct rq` is a large structure that describes all tasks queued on this CPU. `prev` and `next` point to the descriptors of the outgoing and incoming tasks respectively.
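For reference, here is a heavily abridged sketch of `struct rq` as defined in `kernel/sched/sched.h`; only the members touched in this discussion are shown, and the real structure contains many more fields:
```C
/* Heavily abridged sketch of struct rq (kernel/sched/sched.h). */
struct rq {
        raw_spinlock_t          lock;        /* protects this per-CPU run queue    */
        unsigned int            nr_running;  /* number of runnable tasks queued    */
        struct task_struct      *curr;       /* task currently running on the CPU  */
        struct task_struct      *idle;       /* this CPU's idle task               */
        struct mm_struct        *prev_mm;    /* set in context_switch(), see below */
        /* ... per-class sub-queues (cfs, rt, dl), clocks, statistics ... */
};
```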
`prepare_task_switch()` sets up the architecture-specific preparation that must happen before the switch; most architectures do not need these hooks. It is used in tandem with `finish_task_switch()`, which runs after the switch completes.
`mm` stands for memory management. When `next->mm` is `NULL`, the next task is a kernel thread: it never touches user-space addresses, so the current TLB contents need not be invalidated. In that case the previous task's `active_mm` is borrowed, `atomic_inc()` raises its reference count, and `enter_lazy_tlb()` switches to lazy-TLB mode. When `mm` is non-`NULL`, the next task has its own user address space, and `switch_mm()` is called to switch the virtual address space.
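This branch relies on the two address-space pointers kept in `struct task_struct`, shown here in abridged form:
```C
/* Abridged view of the address-space pointers in struct task_struct
 * (include/linux/sched.h):
 *   mm        - the task's own user address space; NULL for kernel threads
 *   active_mm - the address space actually installed while the task runs;
 *               a kernel thread "borrows" it from whichever task ran before */
struct task_struct {
        /* ... */
        struct mm_struct        *mm;
        struct mm_struct        *active_mm;
        /* ... */
};
```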
Next, if the previous task is a kernel thread (or an exiting task), i.e. `prev->mm` is `NULL`, its `active_mm` field is cleared and the run queue's `prev_mm` is set to `oldmm`, the address space it had been borrowing, so that `finish_task_switch()` can drop that reference later.
`spin_release()` handles lockdep bookkeeping: the run queue lock will actually be released by the next task, which would normally be an invalid locking pattern, so the scheduler announces the release to lockdep early here.
`switch_to()` switches the CPU state from the previous task to the next one. On both `x86-64` and `arm64` it is defined as a macro, and each architecture provides its own implementation.
arm64 switch_to
Below is the implementation for the `arm64` architecture in kernel `5.4.34`. The macro is a thin `do-while` wrapper that calls the `__switch_to()` function:
```C
#define switch_to(prev, next, last)                     \
do {                                                    \
        ((last) = __switch_to((prev), (next)));         \
} while (0)
```
Inside `__switch_to()`, `fpsimd_thread_switch(next)` saves the current task's floating-point/SIMD state to memory and prepares the next task's, `tls_thread_switch(next)` switches the thread-local storage registers, and `hw_breakpoint_thread_switch(next)` and `contextidr_thread_switch(next)` handle hardware breakpoints and the context ID used by hardware tracing.
```C
__notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev,
                                struct task_struct *next)
{
        struct task_struct *last;

        fpsimd_thread_switch(next);
        tls_thread_switch(next);
        hw_breakpoint_thread_switch(next);
        contextidr_thread_switch(next);
        entry_task_switch(next);
        uao_thread_switch(next);
        ptrauth_thread_switch(next);
        ssbs_thread_switch(next);

        /*
         * Complete any pending TLB or cache maintenance on this CPU in case
         * the thread migrates to a different CPU.
         * This full barrier is also required by the membarrier system
         * call.
         */
        dsb(ish);

        /* the actual thread switch */
        last = cpu_switch_to(prev, next);

        return last;
}
```
`cpu_switch_to` is the final step of `__switch_to()`; it is a short assembly routine:
```
ENTRY(cpu_switch_to)
        mov     x10, #THREAD_CPU_CONTEXT
        add     x8, x0, x10
        mov     x9, sp
        stp     x19, x20, [x8], #16     // store callee-saved registers
        stp     x21, x22, [x8], #16
        stp     x23, x24, [x8], #16
        stp     x25, x26, [x8], #16
        stp     x27, x28, [x8], #16
        stp     x29, x9, [x8], #16
        str     lr, [x8]
        add     x8, x1, x10
        ldp     x19, x20, [x8], #16     // restore callee-saved registers
        ldp     x21, x22, [x8], #16
        ldp     x23, x24, [x8], #16
        ldp     x25, x26, [x8], #16
        ldp     x27, x28, [x8], #16
        ldp     x29, x9, [x8], #16
        ldr     lr, [x8]
        mov     sp, x9
        msr     sp_el0, x1
        ret
ENDPROC(cpu_switch_to)
NOKPROBE(cpu_switch_to)
```
`THREAD_CPU_CONTEXT` is defined in `arch/arm64/kernel/asm-offsets.c` as follows: it is the byte offset of the `thread.cpu_context` field within the process descriptor.
```C
int main(void)
{
        // ...
        DEFINE(THREAD_CPU_CONTEXT, offsetof(struct task_struct, thread.cpu_context));
        // ...
}
```
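The asm-offsets mechanism simply turns a structure field into a plain integer offset that assembly code can add to a base pointer. A minimal stand-alone illustration; the `demo_task` struct and `DEMO_CPU_CONTEXT` name are hypothetical, used only for this sketch:
```C
/* Stand-alone illustration of the asm-offsets idea: offsetof() yields an
 * integer that assembly can add to a struct's base address. */
#include <stddef.h>
#include <stdio.h>

struct demo_task {
        long state;
        struct { unsigned long x19, fp, sp, pc; } cpu_context;
};

int main(void)
{
        /* the generated header would contain something like:
         *   #define DEMO_CPU_CONTEXT <this value> */
        printf("DEMO_CPU_CONTEXT = %zu\n", offsetof(struct demo_task, cpu_context));
        return 0;
}
```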
`cpu_context` is defined as follows. It holds the key CPU registers saved as part of the task's context, including the all-important stack pointer `sp` and program counter `pc`.
```C
struct cpu_context {
        unsigned long x19;
        unsigned long x20;
        unsigned long x21;
        unsigned long x22;
        unsigned long x23;
        unsigned long x24;
        unsigned long x25;
        unsigned long x26;
        unsigned long x27;
        unsigned long x28;
        unsigned long fp;
        unsigned long sp;
        unsigned long pc;
};
```
`stp` stores a pair of registers, while `ldp` loads a pair. The first half of the routine stores the outgoing task's callee-saved registers into its `thread.cpu_context` (x0 holds `prev`, so x8 points into `prev->thread.cpu_context`), and the second half reloads the incoming task's previously saved register values from its `thread.cpu_context` into the CPU. Note that `x29` holds the frame pointer (the stack base) while `x9` temporarily carries the stack pointer.
With that, the `arm64` process switch is complete.
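To make the save/restore idea concrete, here is a user-space analogy rather than kernel code: `swapcontext()` saves the current register state and stack pointer into one `ucontext_t` and restores another, which is conceptually what `cpu_switch_to` does with `thread.cpu_context`.
```C
#include <stdio.h>
#include <ucontext.h>

static ucontext_t main_ctx, worker_ctx;
static char worker_stack[64 * 1024];

static void worker(void)
{
        printf("worker: running on its own stack\n");
        swapcontext(&worker_ctx, &main_ctx);     /* "switch out" back to main */
        printf("worker: resumed exactly where it left off\n");
}

int main(void)
{
        getcontext(&worker_ctx);
        worker_ctx.uc_stack.ss_sp = worker_stack;
        worker_ctx.uc_stack.ss_size = sizeof(worker_stack);
        worker_ctx.uc_link = &main_ctx;          /* return here when worker ends */
        makecontext(&worker_ctx, worker, 0);

        swapcontext(&main_ctx, &worker_ctx);     /* first "context switch" */
        printf("main: back in main\n");
        swapcontext(&main_ctx, &worker_ctx);     /* resume the worker */
        printf("main: done\n");
        return 0;
}
```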
x86-64 switch_to
The `x86-64` implementation of the process switch is quite similar, but its `switch_to` macro first goes through `__switch_to_asm`, the assembly stub that switches the CPU register state and stack, and only then enters the C function `__switch_to`. So `x86-64` and `arm64` both have a comparable piece of assembly; they merely execute it at a different point in the sequence.
```C
#define switch_to(prev, next, last)                     \
do {                                                    \
        prepare_switch_to(next);                        \
                                                        \
        ((last) = __switch_to_asm((prev), (next)));     \
} while (0)
```
```
ENTRY(__switch_to_asm)
        UNWIND_HINT_FUNC
        /*
         * Save callee-saved registers
         * This must match the order in inactive_task_frame
         */
        pushq   %rbp
        pushq   %rbx
        pushq   %r12
        pushq   %r13
        pushq   %r14
        pushq   %r15    // save the outgoing task's frame pointer and callee-saved registers

        /* switch stack */
        movq    %rsp, TASK_threadsp(%rdi)       // save the old task's stack pointer %rsp
        movq    TASK_threadsp(%rsi), %rsp       // point %rsp at the new task's stack

        /* restore callee-saved registers */
        popq    %r15
        popq    %r14
        popq    %r13
        popq    %r12
        popq    %rbx
        popq    %rbp

        jmp     __switch_to
END(__switch_to_asm)
```
The series of `pushq` instructions saves the outgoing task's CPU context, namely the frame pointer and the callee-saved registers, onto its kernel stack.
The two `movq` instructions save the old stack pointer and install the new one; `TASK_threadsp`, like `THREAD_CPU_CONTEXT` on arm64, is an offset generated by `asm-offsets.c` (the offset of `thread.sp` within `task_struct`).
The series of `popq` instructions then restores the incoming task's previously saved registers from its kernel stack.
Finally, this stub does not return like an ordinary function call. It jumps into `__switch_to` via `jmp __switch_to`, and it is the `ret` at the end of that function that restores the instruction pointer `%rip` for the incoming task.
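The push order above is dictated by `struct inactive_task_frame`, mentioned in the assembly comment. An approximate sketch of its x86-64 layout (field names may differ slightly between kernel versions):
```C
/* Approximate sketch of struct inactive_task_frame for x86-64
 * (arch/x86/include/asm/switch_to.h), lowest address first: it mirrors the
 * reverse of the push order in __switch_to_asm plus the return address
 * pushed by the call instruction. */
struct inactive_task_frame {
        unsigned long r15;
        unsigned long r14;
        unsigned long r13;
        unsigned long r12;
        unsigned long bx;
        unsigned long bp;       /* with ret_addr, forms a stack frame header */
        unsigned long ret_addr; /* where the task resumes after the switch   */
};
```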
```C
__visible __notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
        struct thread_struct *prev = &prev_p->thread;
        struct thread_struct *next = &next_p->thread;
        struct fpu *prev_fpu = &prev->fpu;
        struct fpu *next_fpu = &next->fpu;
        int cpu = smp_processor_id();

        WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
                     this_cpu_read(irq_count) != -1);

        if (!test_thread_flag(TIF_NEED_FPU_LOAD))
                switch_fpu_prepare(prev_fpu, cpu);

        /* We must save %fs and %gs before load_TLS() because
         * %fs and %gs may be cleared by load_TLS().
         *
         * (e.g. xen_load_tls())
         */
        save_fsgs(prev_p);

        /*
         * Load TLS before restoring any segments so that segment loads
         * reference the correct GDT entries.
         */
        load_TLS(next, cpu);

        /*
         * Leave lazy mode, flushing any hypercalls made here. This
         * must be done after loading TLS entries in the GDT but before
         * loading segments that might reference them.
         */
        arch_end_context_switch(next_p);

        /* Switch DS and ES.
         *
         * Reading them only returns the selectors, but writing them (if
         * nonzero) loads the full descriptor from the GDT or LDT. The
         * LDT for next is loaded in switch_mm, and the GDT is loaded
         * above.
         *
         * We therefore need to write new values to the segment
         * registers on every context switch unless both the new and old
         * values are zero.
         *
         * Note that we don't need to do anything for CS and SS, as
         * those are saved and restored as part of pt_regs.
         */
        savesegment(es, prev->es);
        if (unlikely(next->es | prev->es))
                loadsegment(es, next->es);

        savesegment(ds, prev->ds);
        if (unlikely(next->ds | prev->ds))
                loadsegment(ds, next->ds);

        x86_fsgsbase_load(prev, next);

        /*
         * Switch the PDA and FPU contexts.
         */
        this_cpu_write(current_task, next_p);
        this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));

        switch_fpu_finish(next_fpu);

        /* Reload sp0. */
        update_task_stack(next_p);

        switch_to_extra(prev_p, next_p);

#ifdef CONFIG_XEN_PV
        /*
         * On Xen PV, IOPL bits in pt_regs->flags have no effect, and
         * current_pt_regs()->flags may not match the current task's
         * intended IOPL. We need to switch it manually.
         */
        if (unlikely(static_cpu_has(X86_FEATURE_XENPV) &&
                     prev->iopl != next->iopl))
                xen_set_iopl_mask(next->iopl);
#endif

        if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
                /*
                 * AMD CPUs have a misfeature: SYSRET sets the SS selector but
                 * does not update the cached descriptor. As a result, if we
                 * do SYSRET while SS is NULL, we'll end up in user mode with
                 * SS apparently equal to __USER_DS but actually unusable.
                 *
                 * The straightforward workaround would be to fix it up just
                 * before SYSRET, but that would slow down the system call
                 * fast paths. Instead, we ensure that SS is never NULL in
                 * system call context. We do this by replacing NULL SS
                 * selectors at every context switch. SYSCALL sets up a valid
                 * SS, so the only way to get NULL is to re-enter the kernel
                 * from CPL 3 through an interrupt. Since that can't happen
                 * in the same task as a running syscall, we are guaranteed to
                 * context switch between every interrupt vector entry and a
                 * subsequent SYSRET.
                 *
                 * We read SS first because SS reads are much faster than
                 * writes. Out of caution, we force SS to __KERNEL_DS even if
                 * it previously had a different non-NULL value.
                 */
                unsigned short ss_sel;

                savesegment(ss, ss_sel);
                if (ss_sel != __KERNEL_DS)
                        loadsegment(ss, __KERNEL_DS);
        }

        /* Load the Intel cache allocation PQR MSR. */
        resctrl_sched_in();

        return prev_p;
}
```
In this function, pointers to the outgoing and incoming tasks' `thread_struct` and FPU state are set up first, followed by a series of operations on TLS (thread-local storage), segment registers, the per-CPU `current_task` pointer and other CPU state; conceptually this mirrors what the `arm64` code does.
With that, the `x86-64` process switch is complete.