Linux preemptive scheduling

Reference:

内核抢占,让世界变得更美好 | Linux 内核 (Zhihu) ("Kernel preemption makes the world a better place")

kernel-4.19/kernel/sched/core.c 
4248  /*
4249   * __schedule() is the main scheduler function.
4250   *
4251   * The main means of driving the scheduler and thus entering this function are:
4252   *
4253   *   1. Explicit blocking: mutex, semaphore, waitqueue, etc.
4254   *
4255   *   2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
4256   *      paths. For example, see arch/x86/entry_64.S.
4257   *
4258   *      To drive preemption between tasks, the scheduler sets the flag in timer
4259   *      interrupt handler scheduler_tick().
4260   *
4261   *   3. Wakeups don't really cause entry into schedule(). They add a
4262   *      task to the run-queue and that's it.
4263   *
4264   *      Now, if the new task added to the run-queue preempts the current
4265   *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
4266   *      called on the nearest possible occasion:
4267   *
4268   *       - If the kernel is preemptible (CONFIG_PREEMPT=y):
4269   *
4270   *         - in syscall or exception context, at the next outmost
4271   *           preempt_enable(). (this might be as soon as the wake_up()'s
4272   *           spin_unlock()!)
4273   *
4274   *         - in IRQ context, return from interrupt-handler to
4275   *           preemptible context
4276   *
4277   *       - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
4278   *         then at the next:
4279   *
4280   *          - cond_resched() call
4281   *          - explicit schedule() call
4282   *          - return from syscall or exception to user-space
4283   *          - return from interrupt-handler to user-space
4284   *
4285   * WARNING: must be called with preemption disabled!
4286   */

__schedule() is the core scheduler function: it asks the scheduler to pick a suitable task and switches to it. It is entered in three main ways:

1. Explicit blocking: mutex, semaphore, waitqueue, etc.

2. The TIF_NEED_RESCHED flag is checked on the interrupt-return path and on return from a system call to user space, to decide whether a reschedule is needed.

3. A wakeup does not call schedule() directly; the woken task is simply added to the run queue (for a normal task, the CFS run queue) and TIF_NEED_RESCHED is set if the woken task should preempt the current one. When the switch actually happens splits into two cases:

A. The kernel is preemptible (CONFIG_PREEMPT=y):

       If the wakeup happens in system-call or exception context, the need to preempt is checked at the next outermost preempt_enable() (possibly as early as the spin_unlock() inside wake_up()).

       If it happens in hardware-interrupt (IRQ) context, the check is made on return from the interrupt handler, regardless of whether the interrupt landed in user space or kernel space.

B. The kernel is not preemptible (CONFIG_PREEMPT not set); the switch happens at the next (a minimal cond_resched() sketch follows this list):

        cond_resched() call by the current task

        explicit schedule() call

        return from a system call or exception to user space

        return from an interrupt handler to user space (the interrupt landed in user space)
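
For the non-preemptible case, here is a minimal, hedged sketch (my own illustration, not from the 4.19 tree) of a kernel thread loop that yields voluntarily via cond_resched(); do_unit_of_work() is a hypothetical placeholder:

    #include <linux/kthread.h>
    #include <linux/sched.h>

    static void do_unit_of_work(void);  /* hypothetical work item */

    static int example_thread_fn(void *data)
    {
        while (!kthread_should_stop()) {
            do_unit_of_work();
            cond_resched();  /* voluntary reschedule point (case B above) */
        }
        return 0;
    }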

4287  static void __sched notrace __schedule(bool preempt)
4288  {
4289      struct task_struct *prev, *next;
4290      unsigned long *switch_count;
4291      struct rq_flags rf;
4292      struct rq *rq;
4293      int cpu;

// Get the current CPU and its runqueue rq; each CPU has exactly one runqueue.

// Get the task currently running on that runqueue.
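
(For reference, cpu_rq() and friends are simple per-CPU lookups; quoted from memory of kernel/sched/sched.h in this tree, so details may differ slightly:)

    DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

    #define cpu_rq(cpu)     (&per_cpu(runqueues, (cpu)))
    #define this_rq()       this_cpu_ptr(&runqueues)
    #define task_rq(p)      cpu_rq(task_cpu(p))
    #define cpu_curr(cpu)   (cpu_rq(cpu)->curr)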

4298      cpu = smp_processor_id();
4299      rq = cpu_rq(cpu);
4300      prev = rq->curr;
4301  
4302      schedule_debug(prev);
 

4304      if (sched_feat(HRTICK))
4305          hrtick_clear(rq);
4306  
4307      local_irq_disable();
4308      rcu_note_context_switch(preempt);
4309  
4310      /*
4311       * Make sure that signal_pending_state()->signal_pending() below
4312       * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
4313       * done by the caller to avoid the race with signal_wake_up().
4314       *
4315       * The membarrier system call requires a full memory barrier
4316       * after coming from user-space, before storing to rq->curr.
4317       */
4318      rq_lock(rq, &rf);
4319      smp_mb__after_spinlock();
4320  
4321      /* Promote REQ to ACT */
4322      rq->clock_update_flags <<= 1;
4323      update_rq_clock(rq);

4325      switch_count = &prev->nivcsw;
4326      if (!preempt && prev->state) {
4327          if (unlikely(signal_pending_state(prev->state, prev))) {
4328              prev->state = TASK_RUNNING;
4329          } else {
4330              deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
4331              prev->on_rq = 0;
4332  
4333              if (prev->in_iowait) {
4334                  atomic_inc(&rq->nr_iowait);
4335                  delayacct_blkio_start();
4336              }
4337  
4338              /*
4339               * If a worker went to sleep, notify and ask workqueue
4340               * whether it wants to wake up a task to maintain
4341               * concurrency.
4342               */
4343              if (prev->flags & PF_WQ_WORKER) {
4344                  struct task_struct *to_wakeup;
4345  
4346                  to_wakeup = wq_worker_sleeping(prev);
4347                  if (to_wakeup)
4348                      try_to_wake_up_local(to_wakeup, &rf);
4349              }
4350          }
4351          switch_count = &prev->nvcsw;   ### voluntary switch, e.g. the task went to sleep
4352      }

4359      next = pick_next_task(rq, prev, &rf);
4360      clear_tsk_need_resched(prev);
4361      clear_preempt_need_resched();
4362  
4363      if (likely(prev != next)) {
            rq->nr_switches++;
4371          rq->curr = next;
4372          /*
4373           * The membarrier system call requires each architecture
4374           * to have a full memory barrier after updating
4375           * rq->curr, before returning to user-space.
4376           *
4377           * Here are the schemes providing that barrier on the
4378           * various architectures:
4379           * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC.
4380           *   switch_mm() rely on membarrier_arch_switch_mm() on PowerPC.
4381           * - finish_lock_switch() for weakly-ordered
4382           *   architectures where spin_unlock is a full barrier,
4383           * - switch_to() for arm64 (weakly-ordered, spin_unlock
4384           *   is a RELEASE barrier),
4385           */
4386          ++*switch_count;
4387  
4388          trace_sched_switch(preempt, prev, next);
4389  
4390          /* Also unlocks the rq: */
4391          rq = context_switch(rq, prev, next, &rf);
4392      } else {
4393          rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
4397          rq_unlock_irq(rq, &rf);
4398      }
4399  
4400      balance_callback(rq);
4401  }

4204  /*
4205   * Pick up the highest-prio task:
4206   */
4207  static inline struct task_struct *
4208  pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
4209  {
4210      const struct sched_class *class;
4211      struct task_struct *p;
4212  
4213      /*
4214       * Optimization: we know that if all tasks are in the fair class we can
4215       * call that function directly, but only if the @prev task wasn't of a
4216       * higher scheduling class, because otherwise those loose the
4217       * opportunity to pull in more work from other CPUs.
4218       */


4219      if (likely((prev->sched_class == &idle_sched_class ||
4220              prev->sched_class == &fair_sched_class) &&
4221             rq->nr_running == rq->cfs.h_nr_running)) {
4222  
4223          p = fair_sched_class.pick_next_task(rq, prev, rf);
4224          if (unlikely(p == RETRY_TASK))
4225              goto again;
4226  
4227          /* Assumes fair_sched_class->next == idle_sched_class */
4228          if (unlikely(!p))
4229              p = idle_sched_class.pick_next_task(rq, prev, rf);
4230  
4231          return p;
4232      }
4233  
4234  again:
4235      for_each_class(class) {
4236          p = class->pick_next_task(rq, prev, rf);
4237          if (p) {
4238              if (unlikely(p == RETRY_TASK))
4239                  goto again;
4240              return p;
4241          }
4242      }
4243  
4244      /* The idle class should always have a runnable task: */
4245      BUG();
4246  }
4247  
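for_each_class() walks the scheduling classes from highest to lowest priority (stop, dl, rt, fair, idle) through the ->next chain; quoted from memory of kernel/sched/sched.h, so details may differ slightly:

    #define sched_class_highest (&stop_sched_class)

    #define for_each_class(class) \
        for (class = sched_class_highest; class; class = class->next)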

###  if (!preempt && prev->state) { 

If this is not a preemption (preempt == false) and the current task's state is no longer TASK_RUNNING (prev->state != 0), the task is removed from the run queue via deactivate_task().
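
The two switch_count targets seen earlier (prev->nvcsw for voluntary switches, prev->nivcsw for involuntary ones) surface in user space as the voluntary_ctxt_switches / nonvoluntary_ctxt_switches lines of /proc/<pid>/status. A small userspace check (my own sketch):

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        char line[256];
        FILE *f = fopen("/proc/self/status", "r");

        if (!f)
            return 1;
        while (fgets(line, sizeof(line), f))
            if (strstr(line, "ctxt_switches"))  /* nvcsw / nivcsw counters */
                fputs(line, stdout);
        fclose(f);
        return 0;
    }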


4529  #ifdef CONFIG_PREEMPT
4530  /*
4531   * this is the entry point to schedule() from in-kernel preemption
4532   * off of preempt_enable. Kernel preemptions off return from interrupt
4533   * occur there and call schedule directly.
4534   */
4535  asmlinkage __visible void __sched notrace preempt_schedule(void)
4536  {
4537      /*
4538       * If there is a non-zero preempt_count or interrupts are disabled,
4539       * we do not want to preempt the current task. Just return..
4540       */
4541      if (likely(!preemptible()))
4542          return;
4543  
4544      preempt_schedule_common();
4545  }
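
preemptible() encodes exactly the two conditions the comment mentions; quoted from memory of include/linux/preempt.h:

    #define preemptible()   (preempt_count() == 0 && !irqs_disabled())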

4499  
4500  static void __sched notrace preempt_schedule_common(void)
4501  {
4502      do {
4503          /*
4504           * Because the function tracer can trace preempt_count_sub()
4505           * and it also uses preempt_enable/disable_notrace(), if
4506           * NEED_RESCHED is set, the preempt_enable_notrace() called
4507           * by the function tracer will call this function again and
4508           * cause infinite recursion.
4509           *
4510           * Preemption must be disabled here before the function
4511           * tracer can trace. Break up preempt_disable() into two
4512           * calls. One to disable preemption without fear of being
4513           * traced. The other to still record the preemption latency,
4514           * which can also be traced by the function tracer.
4515           */
4516          preempt_disable_notrace();
4517          preempt_latency_start(1);
4518          __schedule(true);
4519          preempt_latency_stop(1);
4520          preempt_enable_no_resched_notrace();
4521  
4522          /*
4523           * Check again in case we missed a preemption opportunity
4524           * between schedule and now.
4525           */
4526      } while (need_resched());
4527  }
4528  

###  if (!preempt && prev->state) { 

Why does the check test preempt at all? Consider a task sleeping through wait_event():

##257          if (condition)                           ## Suppose an interrupt arrives right at this point and the task is preempted on interrupt return. Because this is a preemption (preempt == true), if (!preempt && prev->state) does NOT remove the task from the run queue. If it did, and condition became true exactly once just before the preempt_schedule() on interrupt return and then never again, nobody would ever wake the preempted task back onto the run queue.

####255          long __int = prepare_to_wait_event(&wq_head, &__wq_entry, state);\

### Here, if an interrupt arrives just before the task is added to the wait queue and the task is preempted on interrupt return, if (!preempt && prev->state) again keeps it on the run queue. Otherwise the task would be on neither the run queue nor the wait queue, and would never be scheduled again.

287  #define wait_event(wq_head, condition)                        \
288  do {                                        \
289      might_sleep();                                \
290      if (condition)                                \
291          break;                                \
292      __wait_event(wq_head, condition);                    \
293  } while (0)
294  

271  #define __wait_event(wq_head, condition)                    \
272      (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0,    \
273                  schedule())

247  #define ___wait_event(wq_head, condition, state, exclusive, ret, cmd)        \
248  ({                                        \
249      __label__ __out;                            \
250      struct wait_queue_entry __wq_entry;                    \
251      long __ret = ret;    /* explicit shadow */                \
252                                          \
253      init_wait_entry(&__wq_entry, exclusive ? WQ_FLAG_EXCLUSIVE : 0);    \
254      for (;;) {                                \
255          long __int = prepare_to_wait_event(&wq_head, &__wq_entry, state);\
256                                          \
257          if (condition)                            \
258              break;                            \
259                                          \
260          if (___wait_is_interruptible(state) && __int) {            \
261              __ret = __int;                        \
262              goto __out;                        \
263          }                                \
264                                          \
265          cmd;                                \
266      }                                    \
267      finish_wait(&wq_head, &__wq_entry);                    \
268  __out:    __ret;                                    \
269  })
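
A minimal usage sketch of this API (the names my_wq and my_cond are illustrative, not from the source):

    #include <linux/wait.h>

    static DECLARE_WAIT_QUEUE_HEAD(my_wq);
    static int my_cond;

    /* consumer: sleeps in TASK_UNINTERRUPTIBLE until my_cond is set */
    static void consumer(void)
    {
        wait_event(my_wq, my_cond != 0);
    }

    /* producer: make the condition true, then wake the queue */
    static void producer(void)
    {
        my_cond = 1;
        wake_up(&my_wq);
    }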

#### rq = context_switch(rq, prev, next, &rf);


3614  /*
3615   * context_switch - switch to the new MM and the new thread's register state.
3616   */
3617  static __always_inline struct rq *
3618  context_switch(struct rq *rq, struct task_struct *prev,
3619             struct task_struct *next, struct rq_flags *rf)
3620  {
3621      struct mm_struct *mm, *oldmm;
3622  
3623      prepare_task_switch(rq, prev, next);
3624  
3625      mm = next->mm;
3626      oldmm = prev->active_mm;
3627      /*
3628       * For paravirt, this is coupled with an exit in switch_to to
3629       * combine the page table reload and the switch backend into
3630       * one hypercall.
3631       */
3632      arch_start_context_switch(prev);
3633  
3634      /*
3635       * If mm is non-NULL, we pass through switch_mm(). If mm is
3636       * NULL, we will pass through mmdrop() in finish_task_switch().
3637       * Both of these contain the full memory barrier required by
3638       * membarrier after storing to rq->curr, before returning to
3639       * user-space.
3640       */
3641      if (!mm) {
3642          next->active_mm = oldmm;
3643          mmgrab(oldmm);
3644          enter_lazy_tlb(oldmm, next);
3645      } else
3646          switch_mm_irqs_off(oldmm, mm, next);
3647  
3648      if (!prev->mm) {
3649          prev->active_mm = NULL;
3650          rq->prev_mm = oldmm;
3651      }
3652  
3653      rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
3654  
3655      prepare_lock_switch(rq, next, rf);
3656  
3657      /* Here we just switch the register state and the stack. */
3658      switch_to(prev, next, prev);
3659      barrier();
3660  
3661      return finish_task_switch(prev);
3662  }

### If next->mm is NULL, next is a kernel thread, so it borrows the outgoing task's active_mm (pinned by mmgrab()); otherwise next is a normal process and switch_mm_irqs_off() switches the address space. If the outgoing task's mm is NULL it was itself a kernel thread (a normal process has prev->active_mm == prev->mm); in that case prev->active_mm is cleared to NULL so the borrowed mm can be released as soon as possible, and rq->prev_mm = oldmm is recorded for finish_task_switch() to use.

## prepare_lock_switch(rq, next, rf);

### Around this point next->on_cpu is set to 1, meaning next is about to execute (in 4.19 this is actually done by prepare_task(), called from prepare_task_switch() at the top of context_switch(); prepare_lock_switch() itself handles runqueue-lock bookkeeping).

finish_task_switch() later clears prev's on_cpu back to 0; see the trimmed sketch below.
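
Both points are visible in finish_task_switch(); trimmed and quoted from memory of the same file, so details may differ slightly:

    static struct rq *finish_task_switch(struct task_struct *prev)
    {
        struct rq *rq = this_rq();
        struct mm_struct *mm = rq->prev_mm;
        ...
        rq->prev_mm = NULL;
        ...
        finish_task(prev);      /* smp_store_release(&prev->on_cpu, 0) on SMP */
        ...
        if (mm)
            mmdrop(mm);         /* drop the reference mmgrab() took above */
        ...
    }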

1. Setting the TIF_NEED_RESCHED flag

Wakeup-time preemption: when a task is woken, the WAKEUP_PREEMPTION scheduler feature (sched_feat) decides whether to go on and check if the woken task should preempt the current one, setting TIF_NEED_RESCHED if so.

New-task preemption: placing a newly created task can likewise set the flag.

Tick preemption: the periodic scheduler tick (scheduler_tick()) can set the flag once the current task has run long enough.

... (these paths converge on resched_curr(); see the trimmed sketch below)
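
resched_curr() is the common helper that sets the flag; trimmed and quoted from memory of kernel/sched/core.c, so details may differ slightly:

    void resched_curr(struct rq *rq)
    {
        struct task_struct *curr = rq->curr;
        int cpu;
        ...
        if (test_tsk_need_resched(curr))
            return;

        cpu = cpu_of(rq);
        if (cpu == smp_processor_id()) {
            set_tsk_need_resched(curr);
            set_preempt_need_resched();     /* fold into preempt_count (item 3 below) */
            return;
        }

        if (set_nr_and_not_polling(curr))
            smp_send_reschedule(cpu);       /* IPI so the remote CPU notices */
        ...
    }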

2. Preemption points

Just before a system call, interrupt, or exception returns to user space, TIF_NEED_RESCHED is checked and, if set, schedule() is called to preempt the current task.

Just before an interrupt or exception returns to kernel space (with CONFIG_PREEMPT=y), the same check is made and the current task is preempted, on the interrupt path via preempt_schedule_irq(); a trimmed sketch follows.
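
Trimmed and quoted from memory of kernel/sched/core.c, so details may differ slightly:

    asmlinkage __visible void __sched preempt_schedule_irq(void)
    {
        ...
        do {
            preempt_disable();
            local_irq_enable();
            __schedule(true);               /* preempt == true */
            local_irq_disable();
            sched_preempt_enable_no_resched();
        } while (need_resched());
        ...
    }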

3. Preemption when preempt_count drops to 0

On x86 the resched hint is folded into the per-CPU __preempt_count as the PREEMPT_NEED_RESCHED bit (stored inverted), so preempt_count_dec_and_test() observes both "count reached zero" and "resched needed" in a single operation.

This makes the preempt_enable() inside spin_unlock() an immediate preemption point: the moment the lock is released, a pending preemption can fire. A sketch of the macro follows.
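
With CONFIG_PREEMPT=y, preempt_enable() itself performs the check; quoted from memory of include/linux/preempt.h:

    #define preempt_enable() \
    do { \
        barrier(); \
        if (unlikely(preempt_count_dec_and_test())) \
            __preempt_schedule(); \
    } while (0)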

Reference: linux内核的preempt抢占调度,preempt_count抢占保护"锁" (Linux kernel preempt scheduling and the preempt_count preemption "lock") (jiaocheng.bubufx.com)
