linux内核的进程调度—调度策略
(注:下文代码片段均出自linux内核版本4.1.15
)
文章目录
在linux内核中,默认实现了5种调度策略:
(1)stop调度策略
(2)deadline调度策略
(3)realtime调度策略
(4)CFS调度策略
(5)idle调度策略
以上5种调度策略通过.next
指针串联在一起。如下代码片段(/kernel/sched/sched.h):
#define sched_class_highest (&stop_sched_class)
#define for_each_class(class) \
for (class = sched_class_highest; class; class = class->next)
for_each_class
宏在check_preempt_curr()
函数pick_next_task()
函数中被使用,从而遍历由5个调度策略组成的调度器链表。
对于调度策略的选择,根据:不同的进程采用不同的调度策略
。
源码中,5种调度策略的具体实例都使用struct sched_class
来定义调度类(/kernel/sched/sched.h):
struct sched_class {
const struct sched_class *next;
void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
void (*yield_task) (struct rq *rq);
bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt);
void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
/*
* It is the responsibility of the pick_next_task() method that will
* return the next task to call put_prev_task() on the @prev task or
* something equivalent.
*
* May return RETRY_TASK when it finds a higher prio class has runnable
* tasks.
*/
struct task_struct * (*pick_next_task) (struct rq *rq,
struct task_struct *prev);
void (*put_prev_task) (struct rq *rq, struct task_struct *p);
#ifdef CONFIG_SMP
int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
void (*post_schedule) (struct rq *this_rq);
void (*task_waking) (struct task_struct *task);
void (*task_woken) (struct rq *this_rq, struct task_struct *task);
void (*set_cpus_allowed)(struct task_struct *p,
const struct cpumask *newmask);
void (*rq_online)(struct rq *rq);
void (*rq_offline)(struct rq *rq);
#endif
void (*set_curr_task) (struct rq *rq);
void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
void (*task_fork) (struct task_struct *p);
void (*task_dead) (struct task_struct *p);
/*
* The switched_from() call is allowed to drop rq->lock, therefore we
* cannot assume the switched_from/switched_to pair is serliazed by
* rq->lock. They are however serialized by p->pi_lock.
*/
void (*switched_from) (struct rq *this_rq, struct task_struct *task);
void (*switched_to) (struct rq *this_rq, struct task_struct *task);
void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
int oldprio);
unsigned int (*get_rr_interval) (struct rq *rq,
struct task_struct *task);
void (*update_curr) (struct rq *rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
void (*task_move_group) (struct task_struct *p, int on_rq);
#endif
};
【设置调度策略】
用户空间程序可以使用sched_setscheduler()
来设定用户进程的调度策略。(/include/uapi/linux/sched.h)
/*
* Scheduling policies
*/
#define SCHED_NORMAL 0
#define SCHED_FIFO 1
#define SCHED_RR 2
#define SCHED_BATCH 3
/* SCHED_ISO: reserved but not implemented yet */
#define SCHED_IDLE 5
#define SCHED_DEADLINE 6
SCHED_NORMAL和SCHED_BATCH使用CFS调度器
SCHED_FIFO和SCHED_RR使用realtime调度器
SCHED_IDILE使用idle调度器
SCHED_DEADLINE使用deadline调度器
一、调度策略
(1-1)stop调度策略
优先级最高的调度类,特点是:可以抢占其他所有进程,不能被其他进程所抢占。如下为stop调度器的具体实例stop_sched_class
(/kernel/sched/stop_task.c):
const struct sched_class stop_sched_class = {
.next = &dl_sched_class,
.enqueue_task = enqueue_task_stop,
.dequeue_task = dequeue_task_stop,
.yield_task = yield_task_stop,
.check_preempt_curr = check_preempt_curr_stop,
.pick_next_task = pick_next_task_stop,
.put_prev_task = put_prev_task_stop,
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_stop,
#endif
.set_curr_task = set_curr_task_stop,
.task_tick = task_tick_stop,
.get_rr_interval = get_rr_interval_stop,
.prio_changed = prio_changed_stop,
.switched_to = switched_to_stop,
.update_curr = update_curr_stop,
};
(1-2)deadline调度策略
使用红黑树,把进程按照绝对截止期限进行排序,选择最小期限进程调度运行。如下为deadline调度策略的具体实例dl_sched_class
(/kernel/sched/deadline.c):
const struct sched_class dl_sched_class = {
.next = &rt_sched_class,
.enqueue_task = enqueue_task_dl,
.dequeue_task = dequeue_task_dl,
.yield_task = yield_task_dl,
.check_preempt_curr = check_preempt_curr_dl,
.pick_next_task = pick_next_task_dl,
.put_prev_task = put_prev_task_dl,
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_dl,
.set_cpus_allowed = set_cpus_allowed_dl,
.rq_online = rq_online_dl,
.rq_offline = rq_offline_dl,
.post_schedule = post_schedule_dl,
.task_woken = task_woken_dl,
#endif
.set_curr_task = set_curr_task_dl,
.task_tick = task_tick_dl,
.task_fork = task_fork_dl,
.task_dead = task_dead_dl,
.prio_changed = prio_changed_dl,
.switched_from = switched_from_dl,
.switched_to = switched_to_dl,
.update_curr = update_curr_dl,
};
(1-3)realtime调度策略
为每个优先级维护一个队列。如下为realtime调度策略的具体实例(/kernel/sched/rt.c):
const struct sched_class rt_sched_class = {
.next = &fair_sched_class,
.enqueue_task = enqueue_task_rt,
.dequeue_task = dequeue_task_rt,
.yield_task = yield_task_rt,
.check_preempt_curr = check_preempt_curr_rt,
.pick_next_task = pick_next_task_rt,
.put_prev_task = put_prev_task_rt,
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_rt,
.set_cpus_allowed = set_cpus_allowed_rt,
.rq_online = rq_online_rt,
.rq_offline = rq_offline_rt,
.post_schedule = post_schedule_rt,
.task_woken = task_woken_rt,
.switched_from = switched_from_rt,
#endif
.set_curr_task = set_curr_task_rt,
.task_tick = task_tick_rt,
.get_rr_interval = get_rr_interval_rt,
.prio_changed = prio_changed_rt,
.switched_to = switched_to_rt,
.update_curr = update_curr_rt,
};
(1-4)CFS调度策略
采用完全公平调度算法,引入虚拟运行时间概念。如下fair_sched_class
的cfs具体实例(/kernel/sched/fair.c):
const struct sched_class fair_sched_class = {
.next = &idle_sched_class,
.enqueue_task = enqueue_task_fair,
.dequeue_task = dequeue_task_fair,
.yield_task = yield_task_fair,
.yield_to_task = yield_to_task_fair,
.check_preempt_curr = check_preempt_wakeup,
.pick_next_task = pick_next_task_fair,
.put_prev_task = put_prev_task_fair,
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_fair,
.migrate_task_rq = migrate_task_rq_fair,
.rq_online = rq_online_fair,
.rq_offline = rq_offline_fair,
.task_waking = task_waking_fair,
#endif
.set_curr_task = set_curr_task_fair,
.task_tick = task_tick_fair,
.task_fork = task_fork_fair,
.prio_changed = prio_changed_fair,
.switched_from = switched_from_fair,
.switched_to = switched_to_fair,
.get_rr_interval = get_rr_interval_fair,
.update_curr = update_curr_fair,
#ifdef CONFIG_FAIR_GROUP_SCHED
.task_move_group = task_move_group_fair,
#endif
};
(1-5)idle调度策略
每个CPU都会有一个idle线程,当没有其他进程可以调度时,调度运行idle线程。idle_sched_class
idle调度器的具体实例(/kernel/sched/idle_task.h):
const struct sched_class idle_sched_class = {
/* .next is NULL */
/* no enqueue/yield_task for idle tasks */
/* dequeue is not valid, we print a debug message there: */
.dequeue_task = dequeue_task_idle,
.check_preempt_curr = check_preempt_curr_idle,
.pick_next_task = pick_next_task_idle,
.put_prev_task = put_prev_task_idle,
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_idle,
#endif
.set_curr_task = set_curr_task_idle,
.task_tick = task_tick_idle,
.get_rr_interval = get_rr_interval_idle,
.prio_changed = prio_changed_idle,
.switched_to = switched_to_idle,
.update_curr = update_curr_idle,
};
注:在(/kernel/sched/core.c)init_idle()
函数中为指定cpu进行idle调度器的初始化
二、调度时刻
调度的本质是:选择下一个进程,然后进行线程切换。在执行调度之前需要设置调度标记TIF_NEED_RESCHED
,然后在调度的时候判断进程是否被设置TIF_NEED_RESCHED
,如果设置则调用schedule
来进行调度。
(2-1)设置调度标记
CPU上正在运行的进程由thread_info
(/arch/arm/include/asm/thread_info.h)描述,设置调度标记本质就是设置thread_info
结构体中的flags成员为TIF_NEED_RESCHED
。
在以下五个时刻会设置TIF_NEED_RESCHED
:
1、scheduler_tick
时钟中断
2、wake_up_process
唤醒进程的时刻
3、使用do_fork
创建新进程的时候
4、set_user_nice
修改进程nice值的时候
5、smp_send_reschedule
负载均衡的时候
三、执行调度
linux kernel会判当前今标记是否是TIF_NEED_RESCHED
,如果是,则会调用schedule()
函数执行调度,切换上下文。
那么在哪些情况下会执行schedule()
函数呢?
(1)阻塞操作:互斥量(Mutex)、信号量(Semaphore)、等待队列(waitqueue)等。
(2)在中断返回前和系统调用返回用户空间时,去检查TIF_NEED_RESCHED
标志位判断是否需要调度。
(3)将要被唤醒的进程不会马上调用schedule()
要求被调度,而是会被添加到CFS就绪队列中,并且设置TIF_NEED_RESCHED
标志位。那么唤醒进程什么时候被调度呢?这要根据内核是否具体可抢占功能(CONFIG_PREEMPT=y)分两种情况:
【如果内核不可抢占】,则:
- 当前进程调用
cond_resched()
时会检查是否要调度。 - 主动调度调用
schedule()
- 系统调用或者异常处理返回用户空间时
- 中断处理完成返回用户空间时
【如果内核可抢占】,则:
- 如果唤醒动作发生在系统调用或异常处理上下文中,在下一次调用
preempt_enable()
时会检查是否需要抢占调度。 - 如果唤醒动作发生在硬件中断处理上下文中,硬件中断处理返回前夕会检查是否要抢占当前进程。
硬件中断返回前夕
与硬件中断返回用户空间前前夕
是两个不同的概念。硬件中断返回前夕
是每次返回前夕都会检查是否进程需要被抢占调度,不管中断发生点是在内核空间,还是用户空间;硬件中断返回用户空间前夕
是只有中断发生点在用户空间才会检查。
本人由于知识与精力有限,如果分享的文章存在有不妥的地方,请多多批评!哈哈。
搜索/关注【嵌入式小生vx公众号,获取更多精彩内容>>>>