Reference: kernel-4.19/kernel/sched/core.c
/*
 * __schedule() is the main scheduler function.
 *
 * The main means of driving the scheduler and thus entering this function are:
 *
 * 1. Explicit blocking: mutex, semaphore, waitqueue, etc.
 *
 * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
 *    paths. For example, see arch/x86/entry_64.S.
 *
 *    To drive preemption between tasks, the scheduler sets the flag in timer
 *    interrupt handler scheduler_tick().
 *
 * 3. Wakeups don't really cause entry into schedule(). They add a
 *    task to the run-queue and that's it.
 *
 *    Now, if the new task added to the run-queue preempts the current
 *    task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
 *    called on the nearest possible occasion:
 *
 *     - If the kernel is preemptible (CONFIG_PREEMPT=y):
 *
 *       - in syscall or exception context, at the next outmost
 *         preempt_enable(). (this might be as soon as the wake_up()'s
 *         spin_unlock()!)
 *
 *       - in IRQ context, return from interrupt-handler to
 *         preemptible context
 *
 *     - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
 *       then at the next:
 *
 *        - cond_resched() call
 *        - explicit schedule() call
 *        - return from syscall or exception to user-space
 *        - return from interrupt-handler to user-space
 *
 * WARNING: must be called with preemption disabled!
 */
__schedule() is the core scheduler function; its job is to have the scheduler pick a suitable task and switch to it. There are three kinds of scheduling opportunities:
1. Blocking operations: mutexes, semaphores, wait queues, and so on.
2. The TIF_NEED_RESCHED flag is checked before an interrupt returns and when a system call returns to user space, to decide whether a reschedule is needed.
3. A task being woken up (wakeup) does not call schedule() right away; it is added to the ready queue (the cfs ready queue for CFS tasks) and, if it should preempt the current task, TIF_NEED_RESCHED is set. When the woken task actually gets scheduled splits into two cases (a toy sketch follows this list):
A: The kernel is preemptible
If the wakeup happens in syscall or exception-handling context, the next preempt_enable() checks whether a preemptive reschedule is needed.
If it happens in hardware interrupt (irq) context, the check is done just before the hardware interrupt returns, no matter whether the interrupt hit user space or kernel space.
B: The kernel is not preemptible
The current task calls cond_resched(), which checks whether a reschedule is needed.
An explicit schedule() call.
A system call or exception handler returns to user space.
Interrupt handling completes and returns to user space (the interrupt hit user space).
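A stand-alone toy sketch of case 3 (all types and names here are toy stand-ins, not kernel APIs): the wakeup only enqueues the task and raises a flag; the actual schedule() happens later, at one of the preemption points listed above.

#include <stdbool.h>
#include <stdio.h>

struct task { int prio; bool need_resched; };   /* toy task_struct */
struct runq { struct task *curr; };             /* toy per-CPU rq  */

/* Toy wake_up(): enqueue p (enqueue itself omitted) and, if p should
 * preempt the currently running task, only mark the current task for
 * reschedule, like setting TIF_NEED_RESCHED. */
static void wake_up_sketch(struct runq *rq, struct task *p)
{
    if (p->prio < rq->curr->prio)           /* lower value = higher prio */
        rq->curr->need_resched = true;      /* deferred reschedule request */
}

int main(void)
{
    struct task curr = { .prio = 120 }, woken = { .prio = 100 };
    struct runq rq = { .curr = &curr };

    wake_up_sketch(&rq, &woken);
    printf("need_resched=%d\n", curr.need_resched);  /* prints 1 */
    return 0;
}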
static void __sched notrace __schedule(bool preempt)
{
    struct task_struct *prev, *next;
    unsigned long *switch_count;
    struct rq_flags rf;
    struct rq *rq;
    int cpu;

    /* Get the current CPU and its run-queue rq (one rq per CPU),
     * then the task currently running on that rq. */
    cpu = smp_processor_id();
    rq = cpu_rq(cpu);
    prev = rq->curr;

    schedule_debug(prev);

    if (sched_feat(HRTICK))
        hrtick_clear(rq);

    local_irq_disable();
    rcu_note_context_switch(preempt);

    /*
     * Make sure that signal_pending_state()->signal_pending() below
     * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
     * done by the caller to avoid the race with signal_wake_up().
     *
     * The membarrier system call requires a full memory barrier
     * after coming from user-space, before storing to rq->curr.
     */
    rq_lock(rq, &rf);
    smp_mb__after_spinlock();

    /* Promote REQ to ACT */
    rq->clock_update_flags <<= 1;
    update_rq_clock(rq);

    switch_count = &prev->nivcsw;
    if (!preempt && prev->state) {
        if (unlikely(signal_pending_state(prev->state, prev))) {
            prev->state = TASK_RUNNING;
        } else {
            deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
            prev->on_rq = 0;

            if (prev->in_iowait) {
                atomic_inc(&rq->nr_iowait);
                delayacct_blkio_start();
            }

            /*
             * If a worker went to sleep, notify and ask workqueue
             * whether it wants to wake up a task to maintain
             * concurrency.
             */
            if (prev->flags & PF_WQ_WORKER) {
                struct task_struct *to_wakeup;

                to_wakeup = wq_worker_sleeping(prev);
                if (to_wakeup)
                    try_to_wake_up_local(to_wakeup, &rf);
            }
        }
        switch_count = &prev->nvcsw;    /* voluntary switch, e.g. the task called sleep */
    }

    next = pick_next_task(rq, prev, &rf);
    clear_tsk_need_resched(prev);
    clear_preempt_need_resched();

    if (likely(prev != next)) {
        rq->nr_switches++;
        rq->curr = next;
        /*
         * The membarrier system call requires each architecture
         * to have a full memory barrier after updating
         * rq->curr, before returning to user-space.
         *
         * Here are the schemes providing that barrier on the
         * various architectures:
         * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC.
         *   switch_mm() rely on membarrier_arch_switch_mm() on PowerPC.
         * - finish_lock_switch() for weakly-ordered
         *   architectures where spin_unlock is a full barrier,
         * - switch_to() for arm64 (weakly-ordered, spin_unlock
         *   is a RELEASE barrier),
         */
        ++*switch_count;

        trace_sched_switch(preempt, prev, next);

        /* Also unlocks the rq: */
        rq = context_switch(rq, prev, next, &rf);
    } else {
        rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
        rq_unlock_irq(rq, &rf);
    }

    balance_callback(rq);
}
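Boiled down, the control flow of the listing above (a summary sketch, not verbatim core.c code):

/*
 *  __schedule(preempt):
 *      local_irq_disable();  rq_lock(rq);
 *      if (!preempt && prev->state)          // voluntary sleep path
 *          maybe deactivate_task(prev);      // or cancel on pending signal
 *      next = pick_next_task(rq, prev, &rf);
 *      if (prev != next)
 *          context_switch(rq, prev, next, &rf);  // also unlocks the rq
 *      else
 *          rq_unlock_irq(rq, &rf);
 *      balance_callback(rq);
 */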
/*
 * Pick up the highest-prio task:
 */
static inline struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
    const struct sched_class *class;
    struct task_struct *p;

    /*
     * Optimization: we know that if all tasks are in the fair class we can
     * call that function directly, but only if the @prev task wasn't of a
     * higher scheduling class, because otherwise those loose the
     * opportunity to pull in more work from other CPUs.
     */
    if (likely((prev->sched_class == &idle_sched_class ||
                prev->sched_class == &fair_sched_class) &&
               rq->nr_running == rq->cfs.h_nr_running)) {

        p = fair_sched_class.pick_next_task(rq, prev, rf);
        if (unlikely(p == RETRY_TASK))
            goto again;

        /* Assumes fair_sched_class->next == idle_sched_class */
        if (unlikely(!p))
            p = idle_sched_class.pick_next_task(rq, prev, rf);

        return p;
    }

again:
    for_each_class(class) {
        p = class->pick_next_task(rq, prev, rf);
        if (p) {
            if (unlikely(p == RETRY_TASK))
                goto again;
            return p;
        }
    }

    /* The idle class should always have a runnable task: */
    BUG();
}
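for_each_class() walks the scheduling classes from highest to lowest priority; in 4.19 with CONFIG_SMP the order is stop, dl, rt, fair, idle. A stand-alone toy model of that walk (types and names are simplified stand-ins, not the kernel's):

#include <stddef.h>
#include <stdio.h>

struct task;                                /* opaque toy task */
struct sched_class_sketch {
    const char *name;
    struct task *(*pick)(void);             /* toy pick_next_task */
};

static struct task *pick_none(void) { return NULL; }
static struct task *pick_idle(void) { return (struct task *)1; /* dummy non-NULL */ }

int main(void)
{
    /* Highest priority first, mirroring the for_each_class() order. */
    struct sched_class_sketch classes[] = {
        { "stop", pick_none }, { "dl", pick_none }, { "rt", pick_none },
        { "fair", pick_none }, { "idle", pick_idle },
    };

    for (size_t i = 0; i < sizeof(classes) / sizeof(classes[0]); i++) {
        if (classes[i].pick()) {
            printf("picked from class: %s\n", classes[i].name);
            break;                          /* idle always yields a task */
        }
    }
    return 0;
}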
### if (!preempt && prev->state) {
If this is not a preemptive call and the current task is not in the TASK_RUNNING state, the task must be removed from the ready queue (see the toy model below).
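A self-contained toy model of this branch's decision; the signal case mirrors the signal_pending_state() test in the listing above (the TASK_* values here are illustrative constants, not the kernel's):

#include <stdbool.h>
#include <stdio.h>

#define TASK_RUNNING        0
#define TASK_INTERRUPTIBLE  1

/* Mirrors the branch in __schedule(): dequeue only on a voluntary
 * call with a non-running state and no pending signal. */
static bool should_dequeue(bool preempt, long state, bool signal_pending)
{
    if (preempt || state == TASK_RUNNING)
        return false;   /* task stays on the run-queue */
    if (signal_pending)
        return false;   /* state is reset to TASK_RUNNING instead */
    return true;        /* deactivate_task() path */
}

int main(void)
{
    printf("%d\n", should_dequeue(false, TASK_INTERRUPTIBLE, false)); /* 1 */
    printf("%d\n", should_dequeue(true,  TASK_INTERRUPTIBLE, false)); /* 0: preemption */
    printf("%d\n", should_dequeue(false, TASK_RUNNING,       false)); /* 0 */
    return 0;
}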
#ifdef CONFIG_PREEMPT
/*
 * this is the entry point to schedule() from in-kernel preemption
 * off of preempt_enable. Kernel preemptions off return from interrupt
 * occur there and call schedule directly.
 */
asmlinkage __visible void __sched notrace preempt_schedule(void)
{
    /*
     * If there is a non-zero preempt_count or interrupts are disabled,
     * we do not want to preempt the current task. Just return..
     */
    if (likely(!preemptible()))
        return;

    preempt_schedule_common();
}
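For context, preempt_schedule() is reached from preempt_enable(); the following is paraphrased from include/linux/preempt.h for CONFIG_PREEMPT=y, slightly simplified (__preempt_schedule() ends up calling preempt_schedule()):

#define preempt_enable()                            \
do {                                                \
    barrier();                                      \
    if (unlikely(preempt_count_dec_and_test()))     \
        __preempt_schedule();                       \
} while (0)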
static void __sched notrace preempt_schedule_common(void)
{
    do {
        /*
         * Because the function tracer can trace preempt_count_sub()
         * and it also uses preempt_enable/disable_notrace(), if
         * NEED_RESCHED is set, the preempt_enable_notrace() called
         * by the function tracer will call this function again and
         * cause infinite recursion.
         *
         * Preemption must be disabled here before the function
         * tracer can trace. Break up preempt_disable() into two
         * calls. One to disable preemption without fear of being
         * traced. The other to still record the preemption latency,
         * which can also be traced by the function tracer.
         */
        preempt_disable_notrace();
        preempt_latency_start(1);
        __schedule(true);
        preempt_latency_stop(1);
        preempt_enable_no_resched_notrace();

        /*
         * Check again in case we missed a preemption opportunity
         * between schedule and now.
         */
    } while (need_resched());
}
### if (!preempt && prev->state) {
Why must the preempt path skip the dequeue? Consider a task inside wait_event (listings below).

## line 257: if (condition)
Suppose an interrupt fires right here, and on interrupt return the task is preempted. Since preempt == true, `if (!preempt && prev->state)` does not remove it from the run-queue. If it did remove it, and condition had become true exactly once just before the interrupt returned (before preempt_schedule() ran) and never again, nobody would ever wake this preempted task and put it back on the run-queue.

## line 255: long __int = prepare_to_wait_event(&wq_head, &__wq_entry, state);
Suppose an interrupt fires just before the task is added to the wait queue, and the task is preempted on interrupt return. Again the branch does not dequeue it. If it did, the task would be on neither the run-queue nor the wait queue, and would never be scheduled again. The race is drawn as a timeline after the listings.
287 #define wait_event(wq_head, condition)                      \
288 do {                                                        \
289     might_sleep();                                          \
290     if (condition)                                          \
291         break;                                              \
292     __wait_event(wq_head, condition);                       \
293 } while (0)

271 #define __wait_event(wq_head, condition)                    \
272     (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \
273                         schedule())
247 #define ___wait_event(wq_head, condition, state, exclusive, ret, cmd) \
248 ({                                                          \
249     __label__ __out;                                        \
250     struct wait_queue_entry __wq_entry;                     \
251     long __ret = ret;   /* explicit shadow */               \
252                                                             \
253     init_wait_entry(&__wq_entry, exclusive ? WQ_FLAG_EXCLUSIVE : 0); \
254     for (;;) {                                              \
255         long __int = prepare_to_wait_event(&wq_head, &__wq_entry, state); \
256                                                             \
257         if (condition)                                      \
258             break;                                          \
259                                                             \
260         if (___wait_is_interruptible(state) && __int) {     \
261             __ret = __int;                                  \
262             goto __out;                                     \
263         }                                                   \
264                                                             \
265         cmd;                                                \
266     }                                                       \
267     finish_wait(&wq_head, &__wq_entry);                     \
268 __out:  __ret;                                              \
269 })
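The race described above, drawn as a timeline (a sketch; it assumes CONFIG_PREEMPT=y and uses the line numbers from the ___wait_event listing):

/*
 *   task A (in ___wait_event)                interrupt / waker
 *   ----------------------------             ----------------------------
 *   prepare_to_wait_event()
 *       state = TASK_UNINTERRUPTIBLE
 *   <IRQ fires before line 257>
 *                                            condition = true;
 *                                            wake_up(&wq_head);
 *                                                (A is not asleep yet)
 *   <on IRQ return: preempt_schedule()>
 *   __schedule(preempt = true)
 *       -> A is NOT dequeued, although
 *          A->state != TASK_RUNNING
 *   ... A runs again, sees condition, breaks out of the loop.
 *
 * Had __schedule(true) dequeued A here, the wake_up() above would have
 * already happened, and nothing would ever enqueue A again.
 */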
### rq = context_switch(rq, prev, next, &rf);
/*
 * context_switch - switch to the new MM and the new thread's register state.
 */
static __always_inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
               struct task_struct *next, struct rq_flags *rf)
{
    struct mm_struct *mm, *oldmm;

    prepare_task_switch(rq, prev, next);

    mm = next->mm;
    oldmm = prev->active_mm;
    /*
     * For paravirt, this is coupled with an exit in switch_to to
     * combine the page table reload and the switch backend into
     * one hypercall.
     */
    arch_start_context_switch(prev);

    /*
     * If mm is non-NULL, we pass through switch_mm(). If mm is
     * NULL, we will pass through mmdrop() in finish_task_switch().
     * Both of these contain the full memory barrier required by
     * membarrier after storing to rq->curr, before returning to
     * user-space.
     */
    if (!mm) {
        next->active_mm = oldmm;
        mmgrab(oldmm);
        enter_lazy_tlb(oldmm, next);
    } else
        switch_mm_irqs_off(oldmm, mm, next);

    if (!prev->mm) {
        prev->active_mm = NULL;
        rq->prev_mm = oldmm;
    }

    rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);

    prepare_lock_switch(rq, next, rf);

    /* Here we just switch the register state and the stack. */
    switch_to(prev, next, prev);
    barrier();

    return finish_task_switch(prev);
}
### If mm is NULL, next is a kernel thread, so it keeps using the outgoing task's active_mm; otherwise next is a normal process and switch_mm_irqs_off() switches the process address space. If the task being switched out has a NULL mm it is itself a kernel thread (for a normal thread, prev->active_mm == prev->mm); in that case prev->active_mm is set to NULL so the borrowed mm can be dropped as soon as possible, and rq->prev_mm = oldmm is recorded, which finish_task_switch() will use (a toy model follows).
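A stand-alone toy model of the active_mm borrowing above (types are simplified stand-ins; mmgrab() is modeled as a plain reference counter):

#include <stdio.h>

struct mm { int refcount; };                   /* toy mm_struct   */
struct task { struct mm *mm, *active_mm; };    /* toy task_struct */

static void context_switch_mm_sketch(struct task *prev, struct task *next,
                                     struct mm **rq_prev_mm)
{
    struct mm *oldmm = prev->active_mm;

    if (!next->mm) {                    /* kernel thread: borrow oldmm */
        next->active_mm = oldmm;
        oldmm->refcount++;              /* mmgrab(oldmm) analogue */
    } else {
        next->active_mm = next->mm;     /* switch_mm_irqs_off() path */
    }
    if (!prev->mm) {                    /* prev was a kernel thread */
        prev->active_mm = NULL;         /* drop the borrowed reference */
        *rq_prev_mm = oldmm;            /* finish_task_switch() releases it */
    }
}

int main(void)
{
    struct mm user_mm = { .refcount = 1 };
    struct task user = { .mm = &user_mm, .active_mm = &user_mm };
    struct task kthread = { .mm = NULL, .active_mm = NULL };
    struct mm *rq_prev_mm = NULL;

    context_switch_mm_sketch(&user, &kthread, &rq_prev_mm); /* user -> kthread */
    printf("kthread borrows user mm: %d\n", kthread.active_mm == &user_mm);
    return 0;
}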
## prepare_lock_switch(rq, next, rf);
### Around the switch, next->on_cpu is set to 1 (in 4.19 the store actually lives in prepare_task(), called from prepare_task_switch()), meaning next is about to run on this CPU.
finish_task_switch() clears prev->on_cpu back to 0; the handshake is sketched below.
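The resulting on_cpu ordering, as a sketch (in 4.19, try_to_wake_up() really does wait on p->on_cpu with smp_cond_load_acquire() before migrating a task):

/*
 *   CPU doing the switch                     remote try_to_wake_up()
 *   ------------------------                 ------------------------------
 *   prepare_task_switch():
 *       next->on_cpu = 1;                    smp_cond_load_acquire(
 *   switch_to(prev, next, prev);                 &prev->on_cpu, !VAL)
 *   finish_task_switch():                    -- spins until prev is fully
 *       prev->on_cpu = 0;  (release store)      off its old CPU
 */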
1. Setting the TIF_NEED_RESCHED preemption flag
Wakeup-driven preemption:
When a task is woken up, the WAKEUP_PREEMPTION scheduler feature (sched_feat) decides whether to go further and check if the woken task should preempt the current one, setting TIF_NEED_RESCHED if so.
A newly started task can set the flag.
The timer tick (scheduler_tick()) can set the flag.
...
2. Preemption points
Just before a system call, interrupt, or exception returns to user space, TIF_NEED_RESCHED is checked and, if set, schedule() preempts the current task.
Just before an interrupt or exception returns to kernel space (CONFIG_PREEMPT only), the same check preempts the current task.
3. preempt_count == 0 preemption
On x86 the per-CPU __preempt_count also encodes the PREEMPT_NEED_RESCHED bit, so a single test covers both "count reached zero" and "reschedule requested".
When a spinlock is released, the preempt_enable() inside spin_unlock() is an immediate preemption point (a toy model follows).
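A self-contained toy of why the preempt_enable() at spin_unlock() is an immediate preemption point (single-threaded plain C; preempt_count and the flag are toy globals, not the kernel's):

#include <stdbool.h>
#include <stdio.h>

static int  preempt_count;      /* toy __preempt_count    */
static bool need_resched_flag;  /* toy TIF_NEED_RESCHED   */

static void preempt_disable_sketch(void) { preempt_count++; }

static void preempt_enable_sketch(void)
{
    /* Like the real macro: only when the count hits zero AND a
     * reschedule was requested does preemption happen right here. */
    if (--preempt_count == 0 && need_resched_flag) {
        need_resched_flag = false;
        puts("preempt_schedule(): current task preempted here");
    }
}

int main(void)
{
    preempt_disable_sketch();    /* models spin_lock()               */
    need_resched_flag = true;    /* a wakeup arrives meanwhile       */
    puts("in critical section: preemption deferred");
    preempt_enable_sketch();     /* models spin_unlock(): fires now  */
    return 0;
}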
Reference: "Linux kernel preempt scheduling and the preempt_count preemption-protection 'lock'" (jiaocheng.bubufx.com)