调度的时机
/*
* __schedule() is the main scheduler function.
*
* The main means of driving the scheduler and thus entering this function are:
*
* 1. Explicit blocking: mutex, semaphore, waitqueue, etc.
*
* 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
* paths. For example, see arch/x86/entry_64.S.
*
* To drive preemption between tasks, the scheduler sets the flag in timer
* interrupt handler scheduler_tick().
*
* 3. Wakeups don't really cause entry into schedule(). They add a
* task to the run-queue and that's it.
*
* Now, if the new task added to the run-queue preempts the current
* task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
* called on the nearest possible occasion:
*
* - If the kernel is preemptible (CONFIG_PREEMPTION=y):
*
* - in syscall or exception context, at the next outmost
* preempt_enable(). (this might be as soon as the wake_up()'s
* spin_unlock()!)
*
* - in IRQ context, return from interrupt-handler to
* preemptible context
*
* - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
* then at the next:
*
* - cond_resched() call
* - explicit schedule() call
* - return from syscall or exception to user-space
* - return from interrupt-handler to user-space
*
* WARNING: must be called with preemption disabled!
*/
__schedule()是主调度函数
- 驱动调度器然后进入这个函数的主要方法包括
- 1、显式阻塞:互斥锁、信号量、等待队列等,当前线程不能继续执行
- 2、在中断返回和返回用户空间的路径上,检查TIF_NEED_RESCHED标志位,如果置位就重新调度;为了驱动任务之间的抢占,调度器会在时钟中断处理函数scheduler_tick()中设置该标志位
- 3、wakeup()并不会导致调度,只会把这个线程加入就绪队列中
- 4、切换线程的调度点:
- 如果内核是不可抢占的(CONFIG_PREEMPT=n)
- 调度点:
- 显式调用cond_resched()或者schedule()
- 从系统调用或异常返回到用户空间
- 中断处理函数返回用户空间
- 调度点:
- 如果内核是可抢占的(CONFIG_PREEMPT=y)
- 如果添加到就绪队列的这个新线程确实要抢占当前线程,那在wakeup函数中会设置TIF_NEED_RESCHED标志位(非抢占式的则只加到就绪队列中),然后在最近的抢占点调用schedule()
- 调度点:
- 所有"内核是不可抢占式的"调度点(schedule())
- 在系统调用和异常上下文中,在下一个最外层的preempt_enable()处(可能就在wake_up()中的spin_unlock()处)(更早)
- 在中断上下文中,从中断处理函数返回到可抢占上下文时
- 如果内核是不可抢占的(CONFIG_PREEMPT=n)
注意:所有调用方必须在之后重新检查need_resched()并相应地重新调度,以防在__schedule()中禁用抢占时,事件触发了重新调度的需要(例如中断唤醒任务)
类似这样:do { __schedule(); } while (need_resched());
开抢占提高实时性的原理:开启CONFIG_PREEMPT可以让调度点更多更密集,比如不开要等到下次系统调用,开了在wakeup()的spin_unlock()直接就调度了
疑问:
1、cond_resched()的用法?显式调用cond_resched()并不一定可以放弃处理器吧,因为可能此刻就是它自己最适合运行?还有另一种机制可以强制放弃处理器的函数是什么?
(1)cond_resched()是一种不可抢占式内核中的低延时处理,在一些比较耗时的处理中(已经被内核开发者识别出来),如文件系统和内存回收的一些路径会调用cond_resched,看一下cond_resched这个宏的实现
/* include/linux/sched.h */
/*
 * cond_resched(): calls ___might_sleep() with the call site's file and line,
 * then evaluates to the return value of _cond_resched().
 */
#define cond_resched() ({ \
___might_sleep(__FILE__, __LINE__, 0); \
_cond_resched(); \
})
/*
 * _cond_resched() has a real out-of-line implementation only when
 * CONFIG_PREEMPTION is not set; otherwise it is an inline stub.
 */
#ifndef CONFIG_PREEMPTION
extern int _cond_resched(void); /* definition shown below (kernel/sched/core.c) */
#else
static inline int _cond_resched(void) { return 0; } /* on a preemptible kernel cond_resched() does no rescheduling; this stub always returns 0 */
#endif
/* kernel/sched/core.c */
#ifndef CONFIG_PREEMPTION
/*
 * _cond_resched - voluntary rescheduling point, built only when
 * CONFIG_PREEMPTION is not set.
 *
 * Return: 1 if should_resched(0) was true and preempt_schedule_common()
 * was run; 0 otherwise (in which case only rcu_all_qs() is called).
 */
int __sched _cond_resched(void)
{
	/*
	 * should_resched(0): per the original note, true only when the
	 * preempt count is 0 and the current task has a reschedule pending.
	 */
	if (!should_resched(0)) {
		rcu_all_qs();
		return 0;
	}

	/* Reschedule requested and allowed: go through the scheduler. */
	preempt_schedule_common();
	return 1;
}
EXPORT_SYMBOL(_cond_resched);
#endif
/*
 * preempt_schedule_common - call __schedule(true), bracketed by
 * preempt_latency_start(1)/preempt_latency_stop(1), and repeat for as long
 * as need_resched() is still true afterwards.
 *
 * NOTE(review): the __schedule() header comment warns that it must be called
 * with preemption disabled, yet this excerpt shows no preempt-disable call
 * around it. Upstream presumably wraps the loop body in
 * preempt_disable_notrace()/preempt_enable_no_resched_notrace() - confirm
 * against the kernel tree these notes were taken from.
 */
static void __sched notrace preempt_schedule_common(void)
{
do {
preempt_latency_start(1);
__schedule(true);
preempt_latency_stop(1);
/*
* Check again in case we missed a preemption opportunity
* between schedule and now.
*/
} while (need_resched()); /* per the original note: tests TIF_NEED_RESCHED in current's thread_info flags */
}
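(2)分析源码可知,cond_resched()并不是无条件放弃处理器:只有当should_resched(0)为真(抢占计数器为0,且当前任务已被设置TIF_NEED_RESCHED标志)时,才会通过preempt_schedule_common()进入__schedule()让出处理器,等待下一次调度运行;否则只调用rcu_all_qs()并直接返回0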
- 如果当前任务的调度策略是FIFO,那么调度器会将它放到同一优先级队列的队尾,只有当前面的FIFO任务都运行完,才会轮到它。但如果没有优先级更高或同级任务,则自己会继续执行
- 如果调度策略是RR,那么调度器会将它放在同一优先级队列的队尾,只有当当前RR任务运行完一个时间片之后,才能轮到它再次运行。与上同理其可以马上再上处理器
- 如果调度器是CFS,那么根据这次已经运行的时间计算出虚拟运行时间,加到总的虚拟运行时间里,再根据虚拟运行时间插入红黑树,cfs选一个虚拟运行时间最短的上处理器运行。与上同理其可以马上再上处理器(不是很确定)
(3)没查到,我记得有(可能是yield()/sched_yield(),待确认)
2、非抢占式的内核也会去设置TIF_NEED_RESCHED么?
会,见问题1
参考资料:
- https://zhuanlan.zhihu.com/p/554764883
抢占点的实现(并非调度点)
1、时钟中断tick时
kernel/sched/core.c
scheduler_tick
->curr->sched_class->task_tick(rq, curr, 0)
->task_tick_fair
->entity_tick
->check_preempt_tick
/*
* Preempt the current task with a newly woken task if needed:
*
* Requests a reschedule (resched_curr()) of the runqueue owning cfs_rq when
* either
*  - curr has executed longer than sched_slice() grants it, measured from
*    prev_sum_exec_runtime, or
*  - curr has run for at least sysctl_sched_min_granularity and its vruntime
*    leads that of the entity returned by __pick_first_entity() by more than
*    that same slice.
*/
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
unsigned long ideal_runtime, delta_exec;
struct sched_entity *se;
s64 delta;
ideal_runtime = sched_slice(cfs_rq, curr);
delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
if (delta_exec > ideal_runtime) { /* curr's actual runtime exceeds its ideal runtime */
resched_curr(rq_of(cfs_rq)); /* request a reschedule */
/*
* The current task ran long enough, ensure it doesn't get
* re-elected due to buddy favours.
*/
clear_buddies(cfs_rq, curr); /* keep curr from being picked again through buddy favouritism */
return;
}
/*
* Ensure that a task that missed wakeup preemption by a
* narrow margin doesn't have to wait for a full slice.
* This also mitigates buddy induced latencies under load.
*/
if (delta_exec < sysctl_sched_min_granularity) /* curr ran for less than the minimum granularity (750000 ns = 0.75 ms per the original note) */
return;
se = __pick_first_entity(cfs_rq); /* per the original note: the entity with the smallest vruntime */
delta = curr->vruntime - se->vruntime;
if (delta < 0)
return;
if (delta > ideal_runtime) /* curr's vruntime leads se's by more than curr's ideal runtime */
resched_curr(rq_of(cfs_rq)); /* request a reschedule */
}
2、唤醒抢占——fork路径
kernel/fork.c
kernel_clone
->wake_up_new_task(p)
->check_preempt_curr(rq, p, WF_FORK)
3、唤醒抢占——正常路径
kernel/sched/core.c
wake_up_process
->try_to_wake_up
->ttwu_queue
->ttwu_do_activate
->ttwu_do_wakeup
->check_preempt_curr(rq, p, wake_flags)
/*
 * check_preempt_curr - decide whether task @p should preempt rq->curr.
 * @rq:    runqueue of the target CPU
 * @p:     the candidate task
 * @flags: wake flags, forwarded unchanged to the class hook
 *
 * When both tasks belong to the same scheduling class the decision is
 * delegated to that class's ->check_preempt_curr() hook (for the fair class
 * this is check_preempt_wakeup()). Otherwise the classes are walked in
 * for_each_class() order (presumably highest priority first - confirm):
 * meeting rq->curr's class first means no preemption, meeting @p's class
 * first triggers resched_curr().
 */
void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
	const struct sched_class *class;

	if (p->sched_class != rq->curr->sched_class) {
		for_each_class(class) {
			/* Reached the running task's class first: leave it alone. */
			if (class == rq->curr->sched_class)
				break;
			/* Reached @p's class first: ask for a reschedule. */
			if (class == p->sched_class) {
				resched_curr(rq);
				break;
			}
		}
	} else {
		rq->curr->sched_class->check_preempt_curr(rq, p, flags);
	}

	/*
	 * A queue event has occurred, and we're going to schedule. In
	 * this case, we can save a useless back to back clock update.
	 */
	if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
		rq_clock_skip_update(rq);
}
/*
 * Fair (CFS) scheduling class descriptor, abridged: only the hook relevant
 * to wakeup preemption is shown. Its ->check_preempt_curr() callback is
 * check_preempt_wakeup().
 */
const struct sched_class fair_sched_class = {
	.check_preempt_curr = check_preempt_wakeup,
	/* ... remaining callbacks omitted in this excerpt ... */
};
/*
* Preempt the current task with a newly woken task if needed:
*
* Fair-class ->check_preempt_curr() hook. @rq is the runqueue whose current
* task may be preempted, @p is the task being woken or enqueued, and
* @wake_flags carries the WF_* flags (only WF_FORK is tested here).
* Preemption is requested through resched_curr(rq); the function may also
* mark @p as next buddy and the preempted entity as last buddy.
*/
static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{
struct task_struct *curr = rq->curr;
struct sched_entity *se = &curr->se, *pse = &p->se;
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
/*
 * Meaning of scale (translated from the original note): scale is 1 when
 * nr_running >= sched_nr_latency, i.e. the CFS runqueue holds many tasks,
 * so a woken task might otherwise wait a long time before running.
 * A woken task is given a relatively small vruntime so that it runs soon,
 * but that vruntime is not necessarily the smallest. To keep the woken
 * task from waiting too long, set_next_buddy() records it as the cfs_rq's
 * next buddy so that the next pick favours it. Note that scale == 1 does
 * not by itself preempt the current task: it only sets the next buddy, and
 * whether that buddy is actually chosen is decided later in schedule(),
 * once the current task has used up its slice.
 */
int scale = cfs_rq->nr_running >= sched_nr_latency;
int next_buddy_marked = 0;
if (unlikely(se == pse))
return;
/*
* This is possible from callers such as attach_tasks(), in which we
* unconditionally check_preempt_curr() after an enqueue (which may have
* lead to a throttle). This both saves work and prevents false
* next-buddy nomination below.
*/
if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
return;
if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
set_next_buddy(pse);
next_buddy_marked = 1;
}
/*
* We can come here with TIF_NEED_RESCHED already set from new task
* wake up path.
*
* Note: this also catches the edge-case of curr being in a throttled
* group (e.g. via set_curr_task), since update_curr() (in the
* enqueue of curr) will have resulted in resched being set. This
* prevents us from potentially nominating it as a false LAST_BUDDY
* below.
*/
/* curr already has TIF_NEED_RESCHED set: nothing more to decide, return */
if (test_tsk_need_resched(curr))
return;
/* Idle tasks are by definition preempted by non-idle tasks. */
/* curr has the idle policy and the woken task does not: preempt unconditionally */
if (unlikely(task_has_idle_policy(curr)) &&
likely(!task_has_idle_policy(p)))
goto preempt;
/*
* Batch and idle tasks do not preempt non-idle tasks (their preemption
* is driven by the tick):
*/
if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
return;
find_matching_se(&se, &pse);
update_curr(cfs_rq_of(se));
BUG_ON(!pse);
/* both are normal tasks: let wakeup_preempt_entity() decide whether curr should be preempted */
if (wakeup_preempt_entity(se, pse) == 1) {
/*
* Bias pick_next to pick the sched entity that is
* triggering this preemption.
*/
if (!next_buddy_marked)
set_next_buddy(pse);
goto preempt;
}
return;
preempt:
/* the actual preemption request is made through resched_curr() */
resched_curr(rq);
/*
* Only set the backward buddy when the current task is still
* on the rq. This can happen when a wakeup gets interleaved
* with schedule on the ->pre_schedule() or idle_balance()
* point, either of which can drop the rq lock.
*
* Also, during early boot the idle thread is in the fair class,
* for obvious reasons its a bad idea to schedule back to it.
*/
if (unlikely(!se->on_rq || curr == rq->idle))
return;
/*
 * Finally, when LAST_BUDDY is enabled, scale is 1 and se is a task,
 * set_last_buddy() records the preempted entity as the cfs_rq's last buddy.
 * Per the original note, the last buddy is favoured by the scheduler:
 * having preempted the current task to compensate the woken one, the
 * preempted task is compensated in turn afterwards.
 */
if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
set_last_buddy(se);
}
/*
 * wakeup_preempt_entity - should the woken entity @se preempt @curr?
 *
 * Compares the two vruntimes:
 *   -1  curr->vruntime <= se->vruntime: curr is not ahead, no preemption
 *    0  curr is ahead, but by no more than wakeup_gran(se)
 *    1  curr is ahead by more than wakeup_gran(se): preemption is allowed
 */
static int
wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
{
	s64 lead = curr->vruntime - se->vruntime;
	s64 threshold;

	/* curr has the smaller (or equal) vruntime, so it keeps the CPU. */
	if (lead <= 0)
		return -1;

	/* Preempt only when curr's lead exceeds the wakeup granularity. */
	threshold = wakeup_gran(se);
	return lead > threshold ? 1 : 0;
}
/*
 * wakeup_gran - wakeup-preemption threshold for @se.
 *
 * Feeds sysctl_sched_wakeup_granularity (1000000 ns = 1 ms per the original
 * note) through calc_delta_fair() for @se and returns the result; per the
 * original note this is the granularity expressed in virtual runtime.
 */
static unsigned long wakeup_gran(struct sched_entity *se)
{
	return calc_delta_fair((unsigned long)sysctl_sched_wakeup_granularity, se);
}
总结:无论是创建新任务或者是唤醒任务的时候,都有可能新唤醒的任务抢占当前任务,判断条件如下:唤醒的任务的虚拟运行时间和当前任务的虚拟运行时间差值大于最小唤醒抢占粒度转换的虚拟运行时间(唤醒的任务的虚拟运行时间更小)
参考资料:https://blog.csdn.net/liuhangtiant/article/details/84455351