Reference: kernel-4.19/kernel/sched/core.c
/*
 * __schedule() is the main scheduler function.
 *
 * The main means of driving the scheduler and thus entering this function are:
 *
 * 1. Explicit blocking: mutex, semaphore, waitqueue, etc.
 *
 * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
 *    paths. For example, see arch/x86/entry_64.S.
 *
 *    To drive preemption between tasks, the scheduler sets the flag in timer
 *    interrupt handler scheduler_tick().
 *
 * 3. Wakeups don't really cause entry into schedule(). They add a
 *    task to the run-queue and that's it.
 *
 *    Now, if the new task added to the run-queue preempts the current
 *    task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
 *    called on the nearest possible occasion:
 *
 *     - If the kernel is preemptible (CONFIG_PREEMPT=y):
 *
 *       - in syscall or exception context, at the next outmost
 *         preempt_enable(). (this might be as soon as the wake_up()'s
 *         spin_unlock()!)
 *
 *       - in IRQ context, return from interrupt-handler to
 *         preemptible context
 *
 *     - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
 *       then at the next:
 *
 *        - cond_resched() call
 *        - explicit schedule() call
 *        - return from syscall or exception to user-space
 *        - return from interrupt-handler to user-space
 *
 * WARNING: must be called with preemption disabled!
 */
__schedule() is the core scheduler function; its job is to have the scheduler pick a suitable task and switch to it. There are three kinds of scheduling opportunities:
1. Blocking operations: mutexes, semaphores, wait queues, and so on.
2. The TIF_NEED_RESCHED flag is checked before an interrupt returns and when a system call returns to user space, to decide whether a reschedule is needed.
3. A task being woken up (wakeup) does not call schedule() right away; it is added to the ready queue (the cfs ready queue for CFS tasks) and, if it should preempt the current task, TIF_NEED_RESCHED is set. When the woken task actually gets scheduled splits into two cases (a toy sketch follows this list):
A: The kernel is preemptible
If the wakeup happens in syscall or exception-handling context, the next preempt_enable() checks whether a preemptive reschedule is needed.
If it happens in hardware interrupt (irq) context, the check is done just before the hardware interrupt returns, no matter whether the interrupt hit user space or kernel space.
B: The kernel is not preemptible
The current task calls cond_resched(), which checks whether a reschedule is needed.
An explicit schedule() call.
A system call or exception handler returns to user space.
Interrupt handling completes and returns to user space (the interrupt hit user space).
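A stand-alone toy sketch of case 3 (all types and names here are toy stand-ins, not kernel APIs): the wakeup only enqueues the task and raises a flag; the actual schedule() happens later, at one of the preemption points listed above.

#include <stdbool.h>
#include <stdio.h>

struct task { int prio; bool need_resched; };   /* toy task_struct */
struct runq { struct task *curr; };             /* toy per-CPU rq  */

/* Toy wake_up(): enqueue p (enqueue itself omitted) and, if p should
 * preempt the currently running task, only mark the current task for
 * reschedule, like setting TIF_NEED_RESCHED. */
static void wake_up_sketch(struct runq *rq, struct task *p)
{
    if (p->prio < rq->curr->prio)           /* lower value = higher prio */
        rq->curr->need_resched = true;      /* deferred reschedule request */
}

int main(void)
{
    struct task curr = { .prio = 120 }, woken = { .prio = 100 };
    struct runq rq = { .curr = &curr };

    wake_up_sketch(&rq, &woken);
    printf("need_resched=%d\n", curr.need_resched);  /* prints 1 */
    return 0;
}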
static void __sched notrace __schedule(bool preempt)
{
    struct task_struct *prev, *next;
    unsigned long *switch_count;
    struct rq_flags rf;
    struct rq *rq;
    int cpu;

    /* Get the current CPU and its run-queue rq (one rq per CPU),
     * then the task currently running on that rq. */
    cpu = smp_processor_id();
    rq = cpu_rq(cpu);
    prev = rq->curr;

    schedule_debug(prev);

    if (sched_feat(HRTICK))
        hrtick_clear(rq);

    local_irq_disable();
    rcu_note_context_switch(preempt);

    /*
     * Make sure that signal_pending_state()->signal_pending() below
     * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
     * done by the caller to avoid the race with signal_wake_up().
     *
     * The membarrier system call requires a full memory barrier
     * after coming from user-space, before storing to rq->curr.
     */
    rq_lock(rq, &rf);
    smp_mb__after_spinlock();

    /* Promote REQ to ACT */
    rq->clock_update_flags <<= 1;
    update_rq_clock(rq);

    switch_count = &prev->nivcsw;
    if (!preempt && prev->state) {
        if (unlikely(signal_pending_state(prev->state, prev))) {
            prev->state = TASK_RUNNING;
        } else {
            deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
            prev->on_rq = 0;

            if (prev->in_iowait) {
                atomic_inc(&rq->nr_iowait);
                delayacct_blkio_start();
            }

            /*
             * If a worker went to sleep, notify and ask workqueue
             * whether it wants to wake up a task to maintain
             * concurrency.
             */
            if (prev->flags & PF_WQ_WORKER) {
                struct task_struct *to_wakeup;

                to_wakeup = wq_worker_sleeping(prev);
                if (to_wakeup)
                    try_to_wake_up_local(to_wakeup, &rf);
            }
        }
        switch_count = &prev->nvcsw;    /* voluntary switch, e.g. the task called sleep */
    }

    next = pick_next_task(rq, prev, &rf);
    clear_tsk_need_resched(prev);
    clear_preempt_need_resched();

    if (likely(prev != next)) {
        rq->nr_switches++;
        rq->curr = next;
        /*
         * The membarrier system call requires each architecture
         * to have a full memory barrier after updating
         * rq->curr, before returning to user-space.
         *
         * Here are the schemes providing that barrier on the
         * various architectures:
         * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC.
         *   switch_mm() rely on membarrier_arch_switch_mm() on PowerPC.
         * - finish_lock_switch() for weakly-ordered
         *   architectures where spin_unlock is a full barrier,
         * - switch_to() for arm64 (weakly-ordered, spin_unlock
         *   is a RELEASE barrier),
         */
        ++*switch_count;

        trace_sched_switch(preempt, prev, next);

        /* Also unlocks the rq: */
        rq = context_switch(rq, prev, next, &rf);
    } else {
        rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
        rq_unlock_irq(rq, &rf);
    }

    balance_callback(rq);
}
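Boiled down, the control flow of the listing above (a summary sketch, not verbatim core.c code):

/*
 *  __schedule(preempt):
 *      local_irq_disable();  rq_lock(rq);
 *      if (!preempt && prev->state)          // voluntary sleep path
 *          maybe deactivate_task(prev);      // or cancel on pending signal
 *      next = pick_next_task(rq, prev, &rf);
 *      if (prev != next)
 *          context_switch(rq, prev, next, &rf);  // also unlocks the rq
 *      else
 *          rq_unlock_irq(rq, &rf);
 *      balance_callback(rq);
 */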
/*
 * Pick up the highest-prio task:
 */
static inline struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
    const struct sched_class *class;
    struct task_struct *p;

    /*
     * Optimization: we know that if all tasks are in the fair class we can
     * call that function directly, but only if the @prev task wasn't of a
     * higher scheduling class, because otherwise those loose the
     * opportunity to pull in more work from other CPUs.
     */
    if (likely((prev->sched_class == &idle_sched_class ||
                prev->sched_class == &fair_sched_class) &&
               rq->nr_running == rq->cfs.h_nr_running)) {

        p = fair_sched_class.pick_next_task(rq, prev, rf);
        if (unlikely(p == RETRY_TASK))
            goto again;

        /* Assumes fair_sched_class->next == idle_sched_class */
        if (unlikely(!p))
            p = idle_sched_class.pick_next_task(rq, prev, rf);

        return p;
    }

again:
    for_each_class(class) {
        p = class->pick_next_task(rq, prev, rf);
        if (p) {
            if (unlikely(p == RETRY_TASK))
                goto again;
            return p;
        }
    }

    /* The idle class should always have a runnable task: */
    BUG();
}
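for_each_class() walks the scheduling classes from highest to lowest priority; in 4.19 with CONFIG_SMP the order is stop, dl, rt, fair, idle. A stand-alone toy model of that walk (types and names are simplified stand-ins, not the kernel's):

#include <stddef.h>
#include <stdio.h>

struct task;                                /* opaque toy task */
struct sched_class_sketch {
    const char *name;
    struct task *(*pick)(void);             /* toy pick_next_task */
};

static struct task *pick_none(void) { return NULL; }
static struct task *pick_idle(void) { return (struct task *)1; /* dummy non-NULL */ }

int main(void)
{
    /* Highest priority first, mirroring the for_each_class() order. */
    struct sched_class_sketch classes[] = {
        { "stop", pick_none }, { "dl", pick_none }, { "rt", pick_none },
        { "fair", pick_none }, { "idle", pick_idle },
    };

    for (size_t i = 0; i < sizeof(classes) / sizeof(classes[0]); i++) {
        if (classes[i].pick()) {
            printf("picked from class: %s\n", classes[i].name);
            break;                          /* idle always yields a task */
        }
    }
    return 0;
}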
### if (!preempt && prev->state) {
If this is not a preemptive call and the current task is not in the TASK_RUNNING state, the task must be removed from the ready queue (see the toy model below).
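A self-contained toy model of this branch's decision; the signal case mirrors the signal_pending_state() test in the listing above (the TASK_* values here are illustrative constants, not the kernel's):

#include <stdbool.h>
#include <stdio.h>

#define TASK_RUNNING        0
#define TASK_INTERRUPTIBLE  1

/* Mirrors the branch in __schedule(): dequeue only on a voluntary
 * call with a non-running state and no pending signal. */
static bool should_dequeue(bool preempt, long state, bool signal_pending)
{
    if (preempt || state == TASK_RUNNING)
        return false;   /* task stays on the run-queue */
    if (signal_pending)
        return false;   /* state is reset to TASK_RUNNING instead */
    return true;        /* deactivate_task() path */
}

int main(void)
{
    printf("%d\n", should_dequeue(false, TASK_INTERRUPTIBLE, false)); /* 1 */
    printf("%d\n", should_dequeue(true,  TASK_INTERRUPTIBLE, false)); /* 0: preemption */
    printf("%d\n", should_dequeue(false, TASK_RUNNING,       false)); /* 0 */
    return 0;
}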
#ifdef CONFIG_PREEMPT
/*
 * this is the entry point to schedule() from in-kernel preemption
 * off of preempt_enable. Kernel preemptions off return from interrupt
 * occur there and call schedule directly.
 */
asmlinkage __visible void __sched notrace preempt_schedule(void)
{
    /*
     * If there is a non-zero preempt_count or interrupts are disabled,
     * we do not want to preempt the current task. Just return..
     */
    if (likely(!preemptible()))
        return;

    preempt_schedule_common();
}
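For context, preempt_schedule() is reached from preempt_enable(); the following is paraphrased from include/linux/preempt.h for CONFIG_PREEMPT=y, slightly simplified (__preempt_schedule() ends up calling preempt_schedule()):

#define preempt_enable()                            \
do {                                                \
    barrier();                                      \
    if (unlikely(preempt_count_dec_and_test()))     \
        __preempt_schedule();                       \
} while (0)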
static void __sched notrace preempt_schedule_common(void)
{
    do {
        /*
         * Because the function tracer can trace preempt_count_sub()
         * and it also uses preempt_enable/disable_notrace(), if
         * NEED_RESCHED is set, the preempt_enable_notrace() called
         * by the function tracer will call this function again and
         * cause infinite recursion.
         *
         * Preemption must be disabled here before the function
         * tracer can trace. Break up preempt_disable() into two
         * calls. One to disable preemption without fear of being
         * traced. The other to still record the preemption latency,
         * which can also be traced by the function tracer.
         */
        preempt_disable_notrace();
        preempt_latency_start(1);
        __schedule(true);
        preempt_latency_stop(1);
        preempt_enable_no_resched_notrace();

        /*
         * Check again in case we missed a preemption opportunity
         * between schedule and now.
         */
    } while (need_resched());
}
### if (!preempt && prev->state) {
Why must the preempt path skip the dequeue? Consider a task inside wait_event (listings below).

## line 257: if (condition)
Suppose an interrupt fires right here, and on interrupt return the task is preempted. Since preempt == true, `if (!preempt && prev->state)` does not remove it from the run-queue. If it did remove it, and condition had become true exactly once just before the interrupt returned (before preempt_schedule() ran) and never again, nobody would ever wake this preempted task and put it back on the run-queue.

## line 255: long __int = prepare_to_wait_event(&wq_head, &__wq_entry, state);
Suppose an interrupt fires just before the task is added to the wait queue, and the task is preempted on interrupt return. Again the branch does not dequeue it. If it did, the task would be on neither the run-queue nor the wait queue, and would never be scheduled again. The race is drawn as a timeline after the listings.
287 #define wait_event(wq_head, condition)                      \
288 do {                                                        \
289     might_sleep();                                          \
290     if (condition)                                          \
291         break;                                              \
292     __wait_event(wq_head, condition);                       \
293 } while (0)

271 #define __wait_event(wq_head, condition)                    \
272     (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \
273                         schedule())
247 #define ___wait_event(wq_head, condition, state, exclusive, ret, cmd) \
248 ({                                                          \
249     __label__ __out;                                        \
250     struct wait_queue_entry __wq_entry;                     \
251     long __ret = ret;   /* explicit shadow */               \
252                                                             \
253     init_wait_entry(&__wq_entry, exclusive ? WQ_FLAG_EXCLUSIVE : 0); \
254     for (;;) {                                              \
255         long __int = prepare_to_wait_event(&wq_head, &__wq_entry, state); \
256                                                             \
257         if (condition)                                      \
258             break;                                          \
259                                                             \
260         if (___wait_is_interruptible(state) && __int) {     \
261             __ret = __int;                                  \
262             goto __out;                                     \
263         }                                                   \
264                                                             \
265         cmd;                                                \
266     }                                                       \
267     finish_wait(&wq_head, &__wq_entry);                     \
268 __out:  __ret;                                              \
269 })
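The race described above, drawn as a timeline (a sketch; it assumes CONFIG_PREEMPT=y and uses the line numbers from the ___wait_event listing):

/*
 *   task A (in ___wait_event)                interrupt / waker
 *   ----------------------------             ----------------------------
 *   prepare_to_wait_event()
 *       state = TASK_UNINTERRUPTIBLE
 *   <IRQ fires before line 257>
 *                                            condition = true;
 *                                            wake_up(&wq_head);
 *                                                (A is not asleep yet)
 *   <on IRQ return: preempt_schedule()>
 *   __schedule(preempt = true)
 *       -> A is NOT dequeued, although
 *          A->state != TASK_RUNNING
 *   ... A runs again, sees condition, breaks out of the loop.
 *
 * Had __schedule(true) dequeued A here, the wake_up() above would have
 * already happened, and nothing would ever enqueue A again.
 */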
### rq = context_switch(rq, prev, next, &rf);
/*
 * context_switch - switch to the new MM and the new thread's register state.
 */
static __always_inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
               struct task_struct *next, struct rq_flags *rf)
{
    struct mm_struct *mm, *oldmm;

    prepare_task_switch(rq, prev, next);

    mm = next->mm;
    oldmm = prev->active_mm;
    /*
     * For paravirt, this is coupled with an exit in switch_to to
     * combine the page table reload and the switch backend into
     * one hypercall.
     */
    arch_start_context_switch(prev);

    /*
     * If mm is non-NULL, we pass through switch_mm(). If mm is
     * NULL, we will pass through mmdrop() in finish_task_switch().
     * Both of these contain the full memory barrier required by
     * membarrier after storing to rq->curr, before returning to
     * user-space.
     */
    if (!mm) {
        next->active_mm = oldmm;
        mmgrab(oldmm);
        enter_lazy_tlb(oldmm, next);
    } else
        switch_mm_irqs_off(oldmm, mm, next);

    if (!prev->mm) {
        prev->active_mm = NULL;
        rq->prev_mm = oldmm;
    }

    rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);

    prepare_lock_switch(rq, next, rf);

    /* Here we just switch the register state and the stack. */
    switch_to(prev, next, prev);
    barrier();

    return finish_task_switch(prev);
}
### If mm is NULL, next is a kernel thread, so it keeps using the outgoing task's active_mm; otherwise next is a normal process and switch_mm_irqs_off() switches the process address space. If the task being switched out has a NULL mm it is itself a kernel thread (for a normal thread, prev->active_mm == prev->mm); in that case prev->active_mm is set to NULL so the borrowed mm can be dropped as soon as possible, and rq->prev_mm = oldmm is recorded, which finish_task_switch() will use (a toy model follows).
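A stand-alone toy model of the active_mm borrowing above (types are simplified stand-ins; mmgrab() is modeled as a plain reference counter):

#include <stdio.h>

struct mm { int refcount; };                   /* toy mm_struct   */
struct task { struct mm *mm, *active_mm; };    /* toy task_struct */

static void context_switch_mm_sketch(struct task *prev, struct task *next,
                                     struct mm **rq_prev_mm)
{
    struct mm *oldmm = prev->active_mm;

    if (!next->mm) {                    /* kernel thread: borrow oldmm */
        next->active_mm = oldmm;
        oldmm->refcount++;              /* mmgrab(oldmm) analogue */
    } else {
        next->active_mm = next->mm;     /* switch_mm_irqs_off() path */
    }
    if (!prev->mm) {                    /* prev was a kernel thread */
        prev->active_mm = NULL;         /* drop the borrowed reference */
        *rq_prev_mm = oldmm;            /* finish_task_switch() releases it */
    }
}

int main(void)
{
    struct mm user_mm = { .refcount = 1 };
    struct task user = { .mm = &user_mm, .active_mm = &user_mm };
    struct task kthread = { .mm = NULL, .active_mm = NULL };
    struct mm *rq_prev_mm = NULL;

    context_switch_mm_sketch(&user, &kthread, &rq_prev_mm); /* user -> kthread */
    printf("kthread borrows user mm: %d\n", kthread.active_mm == &user_mm);
    return 0;
}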
## prepare_lock_switch(rq, next, rf);
### Around the switch, next->on_cpu is set to 1 (in 4.19 the store actually lives in prepare_task(), called from prepare_task_switch()), meaning next is about to run on this CPU.
finish_task_switch() clears prev->on_cpu back to 0; the handshake is sketched below.
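The resulting on_cpu ordering, as a sketch (in 4.19, try_to_wake_up() really does wait on p->on_cpu with smp_cond_load_acquire() before migrating a task):

/*
 *   CPU doing the switch                     remote try_to_wake_up()
 *   ------------------------                 ------------------------------
 *   prepare_task_switch():
 *       next->on_cpu = 1;                    smp_cond_load_acquire(
 *   switch_to(prev, next, prev);                 &prev->on_cpu, !VAL)
 *   finish_task_switch():                    -- spins until prev is fully
 *       prev->on_cpu = 0;  (release store)      off its old CPU
 */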
1. Setting the TIF_NEED_RESCHED preemption flag
Wakeup-driven preemption:
When a task is woken up, the WAKEUP_PREEMPTION scheduler feature (sched_feat) decides whether to go further and check if the woken task should preempt the current one, setting TIF_NEED_RESCHED if so.
A newly started task can set the flag.
The timer tick (scheduler_tick()) can set the flag.
...
2. Preemption points
Just before a system call, interrupt, or exception returns to user space, TIF_NEED_RESCHED is checked and, if set, schedule() preempts the current task.
Just before an interrupt or exception returns to kernel space (CONFIG_PREEMPT only), the same check preempts the current task.
3. preempt_count == 0 preemption
On x86 the per-CPU __preempt_count also encodes the PREEMPT_NEED_RESCHED bit, so a single test covers both "count reached zero" and "reschedule requested".
When a spinlock is released, the preempt_enable() inside spin_unlock() is an immediate preemption point (a toy model follows).
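A self-contained toy of why the preempt_enable() at spin_unlock() is an immediate preemption point (single-threaded plain C; preempt_count and the flag are toy globals, not the kernel's):

#include <stdbool.h>
#include <stdio.h>

static int  preempt_count;      /* toy __preempt_count    */
static bool need_resched_flag;  /* toy TIF_NEED_RESCHED   */

static void preempt_disable_sketch(void) { preempt_count++; }

static void preempt_enable_sketch(void)
{
    /* Like the real macro: only when the count hits zero AND a
     * reschedule was requested does preemption happen right here. */
    if (--preempt_count == 0 && need_resched_flag) {
        need_resched_flag = false;
        puts("preempt_schedule(): current task preempted here");
    }
}

int main(void)
{
    preempt_disable_sketch();    /* models spin_lock()               */
    need_resched_flag = true;    /* a wakeup arrives meanwhile       */
    puts("in critical section: preemption deferred");
    preempt_enable_sketch();     /* models spin_unlock(): fires now  */
    return 0;
}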
Reference: "Linux kernel preempt scheduling and the preempt_count preemption-protection 'lock'" (jiaocheng.bubufx.com)