An In-Depth Look at Process Switching in Linux

This article walks through the process-switch path in the Linux kernel: schedule() is invoked (for example from the interrupt-handling path) to trigger scheduling, pick_next_task() selects the next process to run, and context_switch() carries out the context switch. The key switch_to() macro swaps register state and the stack, completing the hardware context switch between processes.

1. Process Switching

1.1 When Process Switching Happens

  • On the interrupt-handling path, schedule() can be called directly to perform a reschedule.
  • Kernel threads run only in kernel mode; they may call schedule() voluntarily to yield the CPU, and may also be rescheduled from the interrupt-handling path. (A minimal kernel-thread sketch follows this list.)
  • schedule() itself is a kernel function, not a system call: a user-mode process cannot invoke it directly and is only scheduled passively, e.g. on the return path of a system call or an interrupt such as the timer tick.
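As a concrete illustration of the second point, here is a minimal sketch (not from the original article) of a kernel module whose kernel thread yields the CPU voluntarily by calling schedule(); the module and names such as demo_worker are made up for illustration:

#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/err.h>

static struct task_struct *worker;

/* Kernel threads run only in kernel mode, so calling schedule()
 * directly is legal here; user-mode code never can. */
static int worker_fn(void *data)
{
	set_current_state(TASK_INTERRUPTIBLE);
	while (!kthread_should_stop()) {
		schedule();			/* voluntary context switch */
		/* ... handle one unit of work after being woken ... */
		set_current_state(TASK_INTERRUPTIBLE);
	}
	__set_current_state(TASK_RUNNING);
	return 0;
}

static int __init demo_init(void)
{
	worker = kthread_run(worker_fn, NULL, "demo_worker");
	return IS_ERR(worker) ? PTR_ERR(worker) : 0;
}

static void __exit demo_exit(void)
{
	kthread_stop(worker);	/* wakes the thread and waits for it to exit */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");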

1.2 Main Flow

  • schedule() calls next = pick_next_task(rq, prev, &rf), which applies the scheduling policy to select the next process to run.
  • Once the next process is chosen, schedule() calls context_switch(rq, prev, next, &rf) to perform the process context switch. The key step inside it is switch_to(prev, next, prev), which switches the stack and the register state. (A simplified sketch of this flow follows.)
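Before diving into the full listings, here is a greatly simplified sketch of the shape of __schedule(), the core of schedule(); locking, preemption handling and statistics are omitted, so treat this as orientation rather than the exact kernel code:

static void __sched notrace __schedule(unsigned int sched_mode)
{
	struct task_struct *prev, *next;
	struct rq_flags rf;
	struct rq *rq;

	rq = cpu_rq(smp_processor_id());
	prev = rq->curr;

	/* policy: ask the scheduling classes which task runs next */
	next = pick_next_task(rq, prev, &rf);

	if (likely(prev != next)) {
		/* mechanism: switch address space, stack and registers */
		rq = context_switch(rq, prev, next, &rf);
	} else {
		rq_unlock_irq(rq, &rf);
	}
}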

The pick_next_task() function lives in kernel/sched/core.c (the version shown below includes core-scheduling support):

static struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
	struct task_struct *next, *p, *max = NULL;
	const struct cpumask *smt_mask;
	bool fi_before = false;
	bool core_clock_updated = (rq == rq->core);
	unsigned long cookie;
	int i, cpu, occ = 0;
	struct rq *rq_i;
	bool need_sync;

	if (!sched_core_enabled(rq))
		return __pick_next_task(rq, prev, rf);

	cpu = cpu_of(rq);

	/* Stopper task is switching into idle, no need core-wide selection. */
	if (cpu_is_offline(cpu)) {
		/*
		 * Reset core_pick so that we don't enter the fastpath when
		 * coming online. core_pick would already be migrated to
		 * another cpu during offline.
		 */
		rq->core_pick = NULL;
		return __pick_next_task(rq, prev, rf);
	}

	/*
	 * If there were no {en,de}queues since we picked (IOW, the task
	 * pointers are all still valid), and we haven't scheduled the last
	 * pick yet, do so now.
	 *
	 * rq->core_pick can be NULL if no selection was made for a CPU because
	 * it was either offline or went offline during a sibling's core-wide
	 * selection. In this case, do a core-wide selection.
	 */
	if (rq->core->core_pick_seq == rq->core->core_task_seq &&
	    rq->core->core_pick_seq != rq->core_sched_seq &&
	    rq->core_pick) {
		WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq);

		next = rq->core_pick;
		if (next != prev) {
			put_prev_task(rq, prev);
			set_next_task(rq, next);
		}

		rq->core_pick = NULL;
		goto out;
	}

	put_prev_task_balance(rq, prev, rf);

	smt_mask = cpu_smt_mask(cpu);
	need_sync = !!rq->core->core_cookie;

	/* reset state */
	rq->core->core_cookie = 0UL;
	if (rq->core->core_forceidle_count) {
		if (!core_clock_updated) {
			update_rq_clock(rq->core);
			core_clock_updated = true;
		}
		sched_core_account_forceidle(rq);
		/* reset after accounting force idle */
		rq->core->core_forceidle_start = 0;
		rq->core->core_forceidle_count = 0;
		rq->core->core_forceidle_occupation = 0;
		need_sync = true;
		fi_before = true;
	}

	/*
	 * core->core_task_seq, core->core_pick_seq, rq->core_sched_seq
	 *
	 * @task_seq guards the task state ({en,de}queues)
	 * @pick_seq is the @task_seq we did a selection on
	 * @sched_seq is the @pick_seq we scheduled
	 *
	 * However, preemptions can cause multiple picks on the same task set.
	 * 'Fix' this by also increasing @task_seq for every pick.
	 */
	rq->core->core_task_seq++;

	/*
	 * Optimize for common case where this CPU has no cookies
	 * and there are no cookied tasks running on siblings.
	 */
	if (!need_sync) {
		next = pick_task(rq);
		if (!next->core_cookie) {
			rq->core_pick = NULL;
			/*
			 * For robustness, update the min_vruntime_fi for
			 * unconstrained picks as well.
			 */
			WARN_ON_ONCE(fi_before);
			task_vruntime_update(rq, next, false);
			goto out_set_next;
		}
	}

	/*
	 * For each thread: do the regular task pick and find the max prio task
	 * amongst them.
	 *
	 * Tie-break prio towards the current CPU
	 */
	for_each_cpu_wrap(i, smt_mask, cpu) {
		rq_i = cpu_rq(i);

		/*
		 * Current cpu always has its clock updated on entrance to
		 * pick_next_task(). If the current cpu is not the core,
		 * the core may also have been updated above.
		 */
		if (i != cpu && (rq_i != rq->core || !core_clock_updated))
			update_rq_clock(rq_i);

		p = rq_i->core_pick = pick_task(rq_i);
		if (!max || prio_less(max, p, fi_before))
			max = p;
	}

	cookie = rq->core->core_cookie = max->core_cookie;

	/*
	 * For each thread: try and find a runnable task that matches @max or
	 * force idle.
	 */
	for_each_cpu(i, smt_mask) {
		rq_i = cpu_rq(i);
		p = rq_i->core_pick;

		if (!cookie_equals(p, cookie)) {
			p = NULL;
			if (cookie)
				p = sched_core_find(rq_i, cookie);
			if (!p)
				p = idle_sched_class.pick_task(rq_i);
		}

		rq_i->core_pick = p;

		if (p == rq_i->idle) {
			if (rq_i->nr_running) {
				rq->core->core_forceidle_count++;
				if (!fi_before)
					rq->core->core_forceidle_seq++;
			}
		} else {
			occ++;
		}
	}

	if (schedstat_enabled() && rq->core->core_forceidle_count) {
		rq->core->core_forceidle_start = rq_clock(rq->core);
		rq->core->core_forceidle_occupation = occ;
	}

	rq->core->core_pick_seq = rq->core->core_task_seq;
	next = rq->core_pick;
	rq->core_sched_seq = rq->core->core_pick_seq;

	/* Something should have been selected for current CPU */
	WARN_ON_ONCE(!next);

	/*
	 * Reschedule siblings
	 *
	 * NOTE: L1TF -- at this point we're no longer running the old task and
	 * sending an IPI (below) ensures the sibling will no longer be running
	 * their task. This ensures there is no inter-sibling overlap between
	 * non-matching user state.
	 */
	for_each_cpu(i, smt_mask) {
		rq_i = cpu_rq(i);

		/*
		 * An online sibling might have gone offline before a task
		 * could be picked for it, or it might be offline but later
		 * happen to come online, but its too late and nothing was
		 * picked for it.  That's Ok - it will pick tasks for itself,
		 * so ignore it.
		 */
		if (!rq_i->core_pick)
			continue;

		/*
		 * Update for new !FI->FI transitions, or if continuing to be in !FI:
		 * fi_before     fi      update?
		 *  0            0       1
		 *  0            1       1
		 *  1            0       1
		 *  1            1       0
		 */
		if (!(fi_before && rq->core->core_forceidle_count))
			task_vruntime_update(rq_i, rq_i->core_pick, !!rq->core->core_forceidle_count);

		rq_i->core_pick->core_occupation = occ;

		if (i == cpu) {
			rq_i->core_pick = NULL;
			continue;
		}

		/* Did we break L1TF mitigation requirements? */
		WARN_ON_ONCE(!cookie_match(next, rq_i->core_pick));

		if (rq_i->curr == rq_i->core_pick) {
			rq_i->core_pick = NULL;
			continue;
		}

		resched_curr(rq_i);
	}

out_set_next:
	set_next_task(rq, next);
out:
	if (rq->core->core_forceidle_count && next == rq->idle)
		queue_core_balance(rq);

	return next;
}

Here rq is the per-CPU runqueue, i.e. the queue of runnable tasks on that CPU; pick_next_task() selects a suitable next process from it. (A much-simplified view of struct rq follows.)
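For orientation, a much-simplified view of struct rq, which is defined in kernel/sched/sched.h (the real structure has far more fields):

struct rq {
	raw_spinlock_t		__lock;
	unsigned int		nr_running;	/* runnable tasks on this CPU */
	struct cfs_rq		cfs;		/* per-class sub-runqueues */
	struct rt_rq		rt;
	struct dl_rq		dl;
	struct task_struct	*curr;		/* task currently running here */
	struct task_struct	*idle;		/* this CPU's idle task */
	u64			clock;		/* per-runqueue clock */
	/* ... */
};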

Once the next process has been chosen, context_switch(), located in kernel/sched/core.c, is called; its job is to perform the process context switch:

static __always_inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
	       struct task_struct *next, struct rq_flags *rf)
{
	prepare_task_switch(rq, prev, next);

	/*
	 * For paravirt, this is coupled with an exit in switch_to to
	 * combine the page table reload and the switch backend into
	 * one hypercall.
	 */
	arch_start_context_switch(prev);

	/*
	 * kernel -> kernel   lazy + transfer active
	 *   user -> kernel   lazy + mmgrab() active
	 *
	 * kernel ->   user   switch + mmdrop() active
	 *   user ->   user   switch
	 */
	if (!next->mm) {                                // to kernel
		enter_lazy_tlb(prev->active_mm, next);

		next->active_mm = prev->active_mm;
		if (prev->mm)                           // from user
			mmgrab(prev->active_mm);
		else
			prev->active_mm = NULL;
	} else {                                        // to user
		membarrier_switch_mm(rq, prev->active_mm, next->mm);
		/*
		 * sys_membarrier() requires an smp_mb() between setting
		 * rq->curr / membarrier_switch_mm() and returning to userspace.
		 *
		 * The below provides this either through switch_mm(), or in
		 * case 'prev->active_mm == next->mm' through
		 * finish_task_switch()'s mmdrop().
		 */
		switch_mm_irqs_off(prev->active_mm, next->mm, next);
		lru_gen_use_mm(next->mm);

		if (!prev->mm) {                        // from kernel
			/* will mmdrop() in finish_task_switch(). */
			rq->prev_mm = prev->active_mm;
			prev->active_mm = NULL;
		}
	}

	rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);

	prepare_lock_switch(rq, next, rf);

	/* Here we just switch the register state and the stack. */
	switch_to(prev, next, prev);
	barrier();
	
	return finish_task_switch(prev);
}

In this version, context_switch() takes four parameters: rq, prev, next and rf. rq is the runqueue discussed above, prev points to the descriptor of the process being switched out, and next points to the descriptor of the process being switched in.
Near the end of the function, switch_to(prev, next, prev) is invoked; the hardware context switch between processes is carried out by switch_to().

#define switch_to(prev, next, last)                             \
do {                                                            \
    /* Context switching clobbers all registers, so we clobber      \
     * them explicitly, via unused output variables.                \
     * (EAX and EBP are not listed because EBP is saved/restored    \
     * explicitly for the current task, and EAX is used as the      \
     * return value of __switch_to().)                              \
     */                                                             \
    unsigned long ebx, ecx, edx, esi, edi;                      \
                                                                \
    asm volatile("pushfl\n\t"               /* save    flags */ \
             "pushl %%ebp\n\t"              /* save    EBP   */ \
             "movl %%esp,%[prev_sp]\n\t"    /* save    ESP   */ \
             "movl %[next_sp],%%esp\n\t"    /* restore ESP   */ \
             "movl $1f,%[prev_ip]\n\t"      /* save    EIP   */ \
             "pushl %[next_ip]\n\t"         /* restore EIP   */ \
             __switch_canary                                    \
             __retpoline_fill_return_buffer                     \
             "jmp __switch_to\n"            /* regparm call  */ \
             "1:\t"                                             \
             "popl %%ebp\n\t"               /* restore EBP   */ \
             "popfl\n"                      /* restore flags */ \
                                                                \
             /* output parameters */                                \
             : [prev_sp] "=m" (prev->thread.sp),                \
               [prev_ip] "=m" (prev->thread.ip),                \
               "=a" (last),                                     \
                                                                \
               /* list all registers that may be clobbered */       \
               "=b" (ebx), "=c" (ecx), "=d" (edx),              \
               "=S" (esi), "=D" (edi)                           \
                                                                \
               __switch_canary_oparam                           \
                                                                \
             /* input parameters */                                 \
             : [next_sp]  "m" (next->thread.sp),                \
               [next_ip]  "m" (next->thread.ip),                \
                                                                \
               /* register arguments for __switch_to() */           \
               [prev]     "a" (prev),                           \
               [next]     "d" (next)                            \
                                                                \
               __switch_canary_iparam                           \
                                                                \
             : /* reloaded segment registers */                 \
            "memory");                                          \
} while (0)

switch_to takes three parameters: prev, next and last. prev and next point to the descriptors of the outgoing and incoming processes respectively, while last is an output parameter recording which process we switched here from. Why is last needed? Suppose process A switches to B, and much later some process C switches back to A: A resumes just after its own switch_to(), where its local variables still say prev == A and next == B, so without last it could never learn that it came from C. __switch_to() returns the outgoing task in EAX, and the macro stores that value into last.
switch_to proceeds as follows:
1. Load the old and new process descriptors into CPU registers:

movl prev, %eax
movl next, %edx

2. Save the contents of the eflags and ebp registers onto the old process's kernel-mode stack:

pushfl
pushl %ebp

3. Save the old process's stack pointer esp into prev->thread.sp:

movl %esp,484(%eax)

The operand 484(%eax) addresses the memory at the value held in register eax plus 484; 484 is the offset of the thread.sp field within task_struct in this particular build. The next step loads the new process's stack pointer into esp.

4. Load the new process's stack pointer, stored in next->thread.sp, into the esp register. From now on the kernel operates on the new process's kernel-mode stack, so this instruction is where the switch from the old process to the new one really begins. Because the address of the kernel-mode stack is tightly tied to the address of the process descriptor, changing the kernel stack amounts to changing the current process.

movl 484(%edx), %esp

5. Save the address of label 1 into prev->thread.ip.
Label 1 marks the instruction at which this process will resume: when process A is scheduled to run again, it continues from the instruction at label 1.

movl $1f, 480(%eax)

6. Load the new process's instruction flow by pushing next->thread.ip onto the (new) stack:

pushl 480(%edx)

This mirrors step 5 in the opposite direction: the address pushed onto the new stack here is where the new process will resume execution.

7. Jump to __switch_to(), which is a C function. Note that this is a jmp rather than a call, so no return address is pushed; when __switch_to() executes its ret, it pops the address pushed in step 6 and resumes the new process there:

jmp __switch_to

8. At this point process A has been replaced by process B, which begins executing its own instructions. For a process resuming at label 1, the first instructions pop the saved ebp and eflags values back off its stack:

popl %ebp
popfl

9. Copy the contents of the eax register into the last variable. When a process resumes at label 1, eax holds the return value of the __switch_to() invocation that switched to it, namely the descriptor of the process that was just switched out:

movl %eax, last

In other words, last records the process that was switched out, i.e. the one we came here from.
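The save-esp / load-esp / resume-at-saved-ip dance above can be imitated in userspace with the POSIX ucontext API. The sketch below (my analogy with made-up names, not kernel code) shows the same idea: save one execution context, then resume another on its own private stack:

#include <stdio.h>
#include <ucontext.h>

static ucontext_t ctx_a, ctx_b;		/* saved "hardware contexts" */
static char stack_b[64 * 1024];		/* private stack for context B */

static void task_b(void)
{
	puts("B: running on its own stack");
	swapcontext(&ctx_b, &ctx_a);	/* save B, resume A (like switch_to) */
}

int main(void)
{
	/* prepare B: its stack and entry point, like thread.sp/thread.ip */
	getcontext(&ctx_b);
	ctx_b.uc_stack.ss_sp = stack_b;
	ctx_b.uc_stack.ss_size = sizeof(stack_b);
	ctx_b.uc_link = &ctx_a;
	makecontext(&ctx_b, task_b, 0);

	puts("A: switching to B");
	swapcontext(&ctx_a, &ctx_b);	/* save A's sp/ip, load B's */
	puts("A: resumed where it left off");
	return 0;
}

Just like label 1 in switch_to, the point right after the swapcontext() call in main() is where A resumes when B switches back.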
