通过前文的介绍,我们已经知道所有的Linux调度都是通过调用schedule函数实现进程调度切换的。本篇我们就主要讲解这个调度最核心的函数---schedule函数。
schedule函数流程分析
schedule函数代码如下:
从上面可以看出,schedule函数主要包含一个do while流程。在该流程中,做了如下工作:
- 关闭抢占,避免在切换的过程中被再次抢占,出现混乱。
- 调用调度核心流程函数__schedule
- 使能抢占
- 判断新切换后的进程是否需要重新调度。若需要,则重复步骤1-3,直至调度后的进程不再需要重新调度。
__schedule函数流程分析
__schedule函数作为主要的调度器函数,是我们需要重点分析的。__schedule函数的实现如下:
/*
 * __schedule() - the main scheduler function.
 *
 * Picks the highest-priority runnable task on this CPU's run queue and
 * context-switches to it. Called with preemption disabled (by schedule()
 * or a preemption entry point); @preempt tells whether this is a
 * preemption-triggered reschedule (true) or a voluntary one (false).
 *
 * NOTE(review): excerpt matches kernel/sched/core.c around v5.10 — confirm
 * against the exact kernel version being discussed.
 */
static void __sched notrace __schedule(bool preempt)
{
struct task_struct *prev, *next;
unsigned long *switch_count;
unsigned long prev_state;
struct rq_flags rf;
struct rq *rq;
int cpu;
cpu = smp_processor_id();
rq = cpu_rq(cpu); /* 1. Get this CPU's run queue (each CPU has its own) */
prev = rq->curr;
schedule_debug(prev, preempt);
if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
hrtick_clear(rq);
local_irq_disable();
rcu_note_context_switch(preempt);
/*
* Make sure that signal_pending_state()->signal_pending() below
* can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
* done by the caller to avoid the race with signal_wake_up():
*
* __set_current_state(@state) signal_wake_up()
* schedule() set_tsk_thread_flag(p, TIF_SIGPENDING)
* wake_up_state(p, state)
* LOCK rq->lock LOCK p->pi_lock
* smp_mb__after_spinlock() smp_mb__after_spinlock()
* if (signal_pending_state()) if (p->state & @state)
*
* Also, the membarrier system call requires a full memory barrier
* after coming from user-space, before storing to rq->curr.
*/
rq_lock(rq, &rf);
smp_mb__after_spinlock();
/* Promote REQ to ACT */
rq->clock_update_flags <<= 1;
update_rq_clock(rq);
/* Default: count this as an involuntary context switch */
switch_count = &prev->nivcsw;
/*
* We must load prev->state once (task_struct::state is volatile), such
* that:
*
* - we form a control dependency vs deactivate_task() below.
* - ptrace_{,un}freeze_traced() can change ->state underneath us.
*/
prev_state = prev->state;
if (!preempt && prev_state) {
/* Voluntary sleep path: prev set a non-RUNNING state before calling us */
if (signal_pending_state(prev_state, prev)) {
/* A pending signal aborts the sleep; keep prev runnable */
prev->state = TASK_RUNNING;
} else {
prev->sched_contributes_to_load =
(prev_state & TASK_UNINTERRUPTIBLE) &&
!(prev_state & TASK_NOLOAD) &&
!(prev->flags & PF_FROZEN);
if (prev->sched_contributes_to_load)
rq->nr_uninterruptible++;
/*
* __schedule() ttwu()
* prev_state = prev->state; if (p->on_rq && ...)
* if (prev_state) goto out;
* p->on_rq = 0; smp_acquire__after_ctrl_dep();
* p->state = TASK_WAKING
*
* Where __schedule() and ttwu() have matching control dependencies.
*
* After this, schedule() must not care about p->state any more.
*/
deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
if (prev->in_iowait) {
atomic_inc(&rq->nr_iowait);
delayacct_blkio_start();
}
}
/* Voluntary switch: count against nvcsw instead */
switch_count = &prev->nvcsw;
}
next = pick_next_task(rq, prev, &rf); /* 2. Pick the next task to run (chosen by the scheduling class/algorithm) */
clear_tsk_need_resched(prev); /* Clear the need-resched flag of the outgoing task */
clear_preempt_need_resched();
if (likely(prev != next)) { /* The picked task differs from the currently running one: switch */
rq->nr_switches++;
/*
* RCU users of rcu_dereference(rq->curr) may not see
* changes to task_struct made by pick_next_task().
*/
RCU_INIT_POINTER(rq->curr, next);
/*
* The membarrier system call requires each architecture
* to have a full memory barrier after updating
* rq->curr, before returning to user-space.
*
* Here are the schemes providing that barrier on the
* various architectures:
* - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC.
* switch_mm() rely on membarrier_arch_switch_mm() on PowerPC.
* - finish_lock_switch() for weakly-ordered
* architectures where spin_unlock is a full barrier,
* - switch_to() for arm64 (weakly-ordered, spin_unlock
* is a RELEASE barrier),
*/
++*switch_count;
migrate_disable_switch(rq, prev);
psi_sched_switch(prev, next, !task_on_rq_queued(prev));
trace_sched_switch(preempt, prev, next);
/* Also unlocks the rq: */
rq = context_switch(rq, prev, next, &rf); /* 3. Perform the actual context switch */
} else {
/* Same task picked again: just release the run-queue lock */
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
rq_unpin_lock(rq, &rf);
__balance_callbacks(rq);
raw_spin_unlock_irq(&rq->lock);
}
}
在上面的代码中,我已经把其中的关键部分做了注释。总结如下:
- 调用cpu_rq接口获取当前cpu的运行队列。(每个cpu都有一个运行队列)
- 调用pick_next_task接口获取下一个需要执行的任务。(具体哪个任务由调度算法决定)
- 调用context_switch接口完成进程的上下文切换。(包括内存空间切换和寄存器状态切换)
context_switch函数流程分析
context_switch函数完成了进程调度最核心的部分:进程运行内存空间切换和寄存器状态切换。该函数的实现如下:
/*
 * context_switch - switch to the new MM and the new thread's register state.
 *
 * Performs the two halves of a task switch: (1) the address-space switch
 * (switch_mm_irqs_off, or borrowing prev's active_mm for kernel threads)
 * and (2) the register/stack switch (switch_to). Returns the run queue,
 * re-fetched via finish_task_switch(), and drops the rq lock on the way.
 */
static __always_inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next, struct rq_flags *rf)
{
prepare_task_switch(rq, prev, next);
/*
* For paravirt, this is coupled with an exit in switch_to to
* combine the page table reload and the switch backend into
* one hypercall.
*/
arch_start_context_switch(prev);
/*
* kernel -> kernel lazy + transfer active
* user -> kernel lazy + mmgrab() active
*
* kernel -> user switch + mmdrop() active
* user -> user switch
*/
if (!next->mm) { // to kernel
enter_lazy_tlb(prev->active_mm, next);
next->active_mm = prev->active_mm; /* next is a kernel thread: borrow the previous task's user address space */
if (prev->mm) // from user
mmgrab(prev->active_mm);
else
prev->active_mm = NULL;
} else { // to user
membarrier_switch_mm(rq, prev->active_mm, next->mm);
/*
* sys_membarrier() requires an smp_mb() between setting
* rq->curr / membarrier_switch_mm() and returning to userspace.
*
* The below provides this either through switch_mm(), or in
* case 'prev->active_mm == next->mm' through
* finish_task_switch()'s mmdrop().
*/
switch_mm_irqs_off(prev->active_mm, next->mm, next); /* 4. Switch the task's address space */
if (!prev->mm) { // from kernel
/* will mmdrop() in finish_task_switch(). */
rq->prev_mm = prev->active_mm;
prev->active_mm = NULL;
}
}
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
prepare_lock_switch(rq, next, rf);
/* Here we just switch the register state and the stack. */
switch_to(prev, next, prev); /* 5. Switch register state and stack */
barrier();
return finish_task_switch(prev);
}
注意,进程结构体task_struct中有两个内存相关数据:mm和active_mm。
添加这两个类似的内存数据主要是用于区分内核线程和用户态进程。由于内核态线程只能运行在内核空间中(高1G空间),它不需要用户态的内存空间(0-3G空间),为了使内核线程调度和用户态进程调度流程一致,内核线程的用户态空间总是借用前一个用户态进程的用户空间。因为所有用户态进程共用一个内核空间,因此这种借用也是合理的。
- task_struct中mm内存用于表示该进程真实对应的用户态内存空间,因此由于内核线程没有用户态内存空间,其task_struct中的mm指针为空。
- active_mm主要用于表示进程切换时使用的内存空间。对于用户态进程来说,其active_mm等于自身mm;而内核线程由于没有用户态空间,它将借用调度的前一个用户态进程的用户态内存空间,即内核线程的active_mm等于调度的前一个用户态进程的mm。
在切换进程上下文函数context_switch中主要做了两件事:
- 调用switch_mm_irqs_off函数切换进程运行的内存空间。如在x86环境下,会将待切换进程页表的页全局目录pgd的基地址对应的物理地址保存到cr3寄存器中。
- 调用switch_to函数切换寄存器状态和栈。将老进程的寄存器值保存到其内核栈thread_info中的cpu_context中。
完成了上面两件事即实现了我们通常所说的进程上下文切换,实现了真正的进程切换。
switch_to函数流程分析
switch_to函数主要调用了__switch_to函数。
而__switch_to函数是一个汇编函数,在entry-armv.S文件中实现(针对arm架构)。该函数的实现如下:
/*
 * Register switch for ARMv3 and ARMv4 processors
 * r0 = previous task_struct, r1 = previous thread_info, r2 = next thread_info
 * previous and next are guaranteed not to be the same.
 */
ENTRY(__switch_to)
UNWIND(.fnstart )
UNWIND(.cantunwind )
@ ip = &prev_thread_info->cpu_context (TI_CPU_SAVE is the cpu_context offset)
add ip, r1, #TI_CPU_SAVE
ARM( stmia ip!, {r4 - sl, fp, sp, lr} ) @ Store most regs on stack
THUMB( stmia ip!, {r4 - sl, fp} ) @ Store most regs on stack
THUMB( str sp, [ip], #4 )
THUMB( str lr, [ip], #4 )
@ Load next task's TLS values from its thread_info
ldr r4, [r2, #TI_TP_VALUE]
ldr r5, [r2, #TI_TP_VALUE + 4]
#ifdef CONFIG_CPU_USE_DOMAINS
mrc p15, 0, r6, c3, c0, 0 @ Get domain register
str r6, [r1, #TI_CPU_DOMAIN] @ Save old domain register
ldr r6, [r2, #TI_CPU_DOMAIN]
#endif
switch_tls r1, r4, r5, r3, r7 @ Switch thread-local storage registers
#if defined(CONFIG_STACKPROTECTOR) && !defined(CONFIG_SMP)
ldr r7, [r2, #TI_TASK]
ldr r8, =__stack_chk_guard
.if (TSK_STACK_CANARY > IMM12_MASK)
add r7, r7, #TSK_STACK_CANARY & ~IMM12_MASK
.endif
ldr r7, [r7, #TSK_STACK_CANARY & IMM12_MASK]
#endif
#ifdef CONFIG_CPU_USE_DOMAINS
mcr p15, 0, r6, c3, c0, 0 @ Set domain register
#endif
mov r5, r0 @ Preserve prev task_struct across the notifier call
add r4, r2, #TI_CPU_SAVE @ r4 = &next_thread_info->cpu_context
ldr r0, =thread_notify_head
mov r1, #THREAD_NOTIFY_SWITCH
bl atomic_notifier_call_chain @ Notify THREAD_NOTIFY_SWITCH listeners
#if defined(CONFIG_STACKPROTECTOR) && !defined(CONFIG_SMP)
str r7, [r8]
#endif
THUMB( mov ip, r4 )
mov r0, r5 @ Return prev task_struct in r0
ARM( ldmia r4, {r4 - sl, fp, sp, pc} ) @ Load all regs saved previously
THUMB( ldmia ip!, {r4 - sl, fp} ) @ Load all regs saved previously
THUMB( ldr sp, [ip], #4 )
THUMB( ldr pc, [ip] )
UNWIND(.fnend )
ENDPROC(__switch_to)
上述汇编代码我挑选几个重要部分进行说明。
- 首先对几个函数入参进行说明:
- r0寄存器:保存指向前一个进程的task_struct指针
- r1寄存器:保存指向前一个进程的内核栈thread_info指针
- r2寄存器:保存指向下一个进程的内核栈thread_info指针
- 获取前一个进程内核栈thread_info中用于保存寄存器状态值的成员cpu_context的地址。
注意,上面的TI_CPU_SAVE即为thread_info结构体中用于保存寄存器值的cpu_context成员偏移值。该宏定义在asm-offsets.c文件中。从下面的图中也可以看到也定义了很多其他常见的宏。
- 将前一个进程的寄存器值保存到其内核栈thread_info的cpu_context成员中。
- 获取后一个进程内核栈thread_info中cpu_context成员地址
- 从下一个进程内核栈thread_info的cpu_context成员加载寄存器值到寄存器中。cpu_context中的这些寄存器值都是上次调度时保存的,现在恢复即可回到当初代码执行的现场。
至此完成了进程寄存器值和栈的切换。
schedule函数流程总结: