linux2.6.34版本的调度器主要是提供了一个调度框架,类似于主板的插槽,内核开发者可以开发自己的调度器适配到这个调度框架中,类似于插入插槽的插卡。现在内核中提供了三种调度算法:实时调度类、公平调度类及idle调度类。内核这种调度器的框架是非常灵活的,给开发自己的调度器降低了难度。今天我们主要讲的是CFS调度算法。
首先,内核有一个基本的调度类,定义如下:
点击(此处)折叠或打开
/*
 * struct sched_class - the table of hooks a scheduling class provides.
 *
 * Every member except 'next' is a function pointer.  A concrete class
 * supplies the hooks it implements through a designated initializer;
 * 'next' points at another sched_class.
 */
struct sched_class {
	const struct sched_class *next;

	/* Put a task on / take a task off the run queue. */
	void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup,
			      bool head);
	void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep);
	void (*yield_task) (struct rq *rq);

	void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);

	struct task_struct * (*pick_next_task) (struct rq *rq);
	void (*put_prev_task) (struct rq *rq, struct task_struct *p);

	/* The hooks below exist only when CONFIG_SMP is defined. */
#ifdef CONFIG_SMP
	int  (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);

	void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
	void (*post_schedule) (struct rq *this_rq);
	void (*task_waking) (struct rq *this_rq, struct task_struct *task);
	void (*task_woken) (struct rq *this_rq, struct task_struct *task);

	void (*set_cpus_allowed)(struct task_struct *p,
				 const struct cpumask *newmask);

	void (*rq_online)(struct rq *rq);
	void (*rq_offline)(struct rq *rq);
#endif

	void (*set_curr_task) (struct rq *rq);
	void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
	void (*task_fork) (struct task_struct *p);

	void (*switched_from) (struct rq *this_rq, struct task_struct *task,
			       int running);
	void (*switched_to) (struct rq *this_rq, struct task_struct *task,
			     int running);
	void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
			      int oldprio, int running);

	unsigned int (*get_rr_interval) (struct rq *rq,
					 struct task_struct *task);

	/* Exists only when CONFIG_FAIR_GROUP_SCHED is defined. */
#ifdef CONFIG_FAIR_GROUP_SCHED
	void (*moved_group) (struct task_struct *p, int on_rq);
#endif
};	/* the terminating ';' was missing from the excerpt */
与之对应的CFS调度器实现如下:
/*
 * All the scheduling class methods:
 *
 * The CFS instance of struct sched_class.  'next' links to
 * idle_sched_class.  Hooks that are not named here are left out of the
 * designated initializer and are therefore zero-initialized (NULL).
 */
static const struct sched_class fair_sched_class = {
	.next			= &idle_sched_class,
	.enqueue_task		= enqueue_task_fair,
	.dequeue_task		= dequeue_task_fair,
	.yield_task		= yield_task_fair,

	.check_preempt_curr	= check_preempt_wakeup,

	.pick_next_task		= pick_next_task_fair,
	.put_prev_task		= put_prev_task_fair,

	/* Hooks that only exist when CONFIG_SMP is defined. */
#ifdef CONFIG_SMP
	.select_task_rq		= select_task_rq_fair,

	.rq_online		= rq_online_fair,
	.rq_offline		= rq_offline_fair,

	.task_waking		= task_waking_fair,
#endif

	.set_curr_task		= set_curr_task_fair,
	.task_tick		= task_tick_fair,
	.task_fork		= task_fork_fair,

	.prio_changed		= prio_changed_fair,
	.switched_to		= switched_to_fair,

	.get_rr_interval	= get_rr_interval_fair,

	/* Hook that only exists when CONFIG_FAIR_GROUP_SCHED is defined. */
#ifdef CONFIG_FAIR_GROUP_SCHED
	.moved_group		= moved_group_fair,
#endif
};	/* the terminating ';' was missing from the excerpt */
可以看到,CFS调度器就是要为标准调度类中的函数提供对应的实现。
其次,我们介绍一下CFS调度算法的基本原理:CFS调度器与传统的调度器最大的区别,就是调度的依据不再是时间片轮转,而是依据进程在就绪队列中等待的时间长短来进行调度选择,即在就绪队列中等待时间越长的进程得到调度的机会就越大,反之,机会就越小。这段原理实际上涉及几个关键内容,需要逐一解释,否则,就有点空中楼阁的意味。
第一个关键内容就是就绪队列的描述和组织形式是什么样的?如何在就绪队列中排序,就绪队列又是如何不断的变动才能保障进程的公平调度的?
1)就绪队列的数据结构cfs_rq定义如下:
点击(此处)折叠或打开
/*
 * CFS-related fields in a runqueue.
 *
 * The runnable entities are kept in a red-black tree rooted at
 * 'tasks_timeline'; 'rb_leftmost' caches the leftmost node of that tree.
 */
struct cfs_rq {
	/* load.weight is what dequeue_task_fair() tests to stop its walk. */
	struct load_weight load;
	/* entity_tick() only checks for tick preemption when this is > 1. */
	unsigned long nr_running;

	u64 exec_clock;
	/* Base value subtracted from se->vruntime to form the rbtree key. */
	u64 min_vruntime;

	/* Root of the red-black tree and a cached pointer to its leftmost node. */
	struct rb_root tasks_timeline;
	struct rb_node *rb_leftmost;

	struct list_head tasks;
	struct list_head *balance_iterator;

	/*
	 * 'curr' points to currently running entity on this cfs_rq.
	 * It is set to NULL otherwise (i.e when none are currently running).
	 */
	struct sched_entity *curr, *next, *last;

	unsigned int nr_spread_over;

#ifdef CONFIG_FAIR_GROUP_SCHED
	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */

	/*
	 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
	 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
	 * (like users, containers etc.)
	 *
	 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
	 * list is used during load balance.
	 */
	struct list_head leaf_cfs_rq_list;
	struct task_group *tg;	/* group that "owns" this runqueue */

#ifdef CONFIG_SMP
	/*
	 * the part of load.weight contributed by tasks
	 */
	unsigned long task_weight;

	/*
	 *   h_load = weight * f(tg)
	 *
	 * Where f(tg) is the recursive weight fraction assigned to
	 * this group.
	 */
	unsigned long h_load;

	/*
	 * this cpu's part of tg->shares
	 */
	unsigned long shares;

	/*
	 * load.weight at the time we set shares
	 */
	unsigned long rq_weight;
#endif
#endif
};	/* the terminating ';' was missing from the excerpt */
2)其组织形式采用红黑树进行组织,cfs_rq中下面两个域就体现了红黑的组织:
struct rb_root tasks_timeline;
struct rb_node *rb_leftmost;
其中,tasks_timeline表示红黑树的根,rb_leftmost表示红黑树的最左节点,即指向就绪队列中最优先被调度的进程。搜索一下代码中使用tasks_timeline的地方,发现主要有如下几处:
初始化cfs就绪队列:
/*
 * Initialize a CFS run queue.
 *
 * Only the reset of the red-black tree root is shown in this excerpt; the
 * rest of the function body was elided by the article.
 */
static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
{
	/* RB_ROOT comes from the kernel rbtree header (not shown here). */
	cfs_rq->tasks_timeline = RB_ROOT;
	/* ...... (remaining initialization omitted from this excerpt) */
}
进程入队就绪队列:
点击(此处)折叠或打开
/*
 * Link @se into the red-black tree of @cfs_rq, ordered by entity_key().
 *
 * The descent goes left only when the new key is strictly smaller than
 * the key of the node being visited; equal keys go right.  If the descent
 * never turned right, @se is the new leftmost node and the cached
 * rb_leftmost pointer is updated to it.
 */
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct rb_node **slot = &cfs_rq->tasks_timeline.rb_node;
	struct rb_node *above = NULL;
	const s64 new_key = entity_key(cfs_rq, se);
	int is_leftmost = 1;

	/* Walk down until an empty child slot is reached. */
	while (*slot != NULL) {
		struct sched_entity *other;

		above = *slot;
		other = rb_entry(above, struct sched_entity, run_node);

		/* Collisions are not special-cased: equal keys stay together. */
		if (new_key >= entity_key(cfs_rq, other)) {
			slot = &above->rb_right;
			is_leftmost = 0;
		} else {
			slot = &above->rb_left;
		}
	}

	/* Keep the frequently used leftmost cache current. */
	if (is_leftmost)
		cfs_rq->rb_leftmost = &se->run_node;

	rb_link_node(&se->run_node, above, slot);
	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
}
进程出队就绪队列:
点击(此处)折叠或打开
/*
 * Unlink @se from the red-black tree of @cfs_rq.
 *
 * If @se is the cached leftmost node, the cache is first moved on to the
 * node rb_next() returns for it, then the node is erased from the tree.
 */
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct rb_node *victim = &se->run_node;

	if (cfs_rq->rb_leftmost == victim)
		cfs_rq->rb_leftmost = rb_next(victim);

	rb_erase(victim, &cfs_rq->tasks_timeline);
}
挑选最后一个进程:
点击(此处)折叠或打开
/*
 * Return the sched_entity that owns the node rb_last() reports for the
 * tree of @cfs_rq, or NULL when rb_last() returns no node.
 */
static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *tail = rb_last(&cfs_rq->tasks_timeline);

	return tail ? rb_entry(tail, struct sched_entity, run_node) : NULL;
}
从上面的代码片段,我们还会发现令整个红黑树发生变动的动作主要就是进程入队和进程出队。
3)在整个linux内核中,都有哪些时机可能会导致进程的入队和出队,从而导致红黑树发生变化呢?
activate_task、try_to_wake_up、wake_up_new_task、__sched_setscheduler、__migrate_task、pull_task、schedule、rt_mutex_setprio、set_user_nice、sched_move_task
上面是主要调用入队和出队的地方,总结起来应当是几个时机点:唤醒、调度、迁移(负载均衡)、睡眠及其他的一些静态设置优先级和调度的地方。
第二个关键内容就是时间更新的相关内容,详细的细节已经在前一篇时间相关内容中阐述了。主要就是在时钟中断中计算每个进程理想运行时间和总的运行时间,而在update_curr中增加虚拟时间。更新虚拟时间的主要地点如下:
点击(此处)折叠或打开
|->enqueue_entity
|
|->dequeue_entity
|
|->put_prev_entity
|
|->entity_tick
|
|->yield_task_fair
|
|->check_preempt_wakeup
|
|->task_fork_fair
|
|->moved_group_fair
第三个关键内容是进程的优先级和权重。
进程的优先级:在内核中是从0~139,其中0~99为实时优先级,剩下的为普通优先级。用户可以使用nice改变进程的静态优先级(范围从-20~19)。而进程的优先级又分为三种:静态优先级、普通优先级和动态优先级。三者关系为:1、对于非实时进程来说,三者的值相同,均为静态优先级。2、对于优先级提高的非实时进程来说,静态优先级和普通优先级相同,而动态优先级不同,为被临时提高的优先级。3、对于实时进程来说,静态优先级不变,普通优先级为MAX_RT_PRIO-1-rt_priority,动态优先级不变。
进程的权重:权重和优先级及时间有着密切关系。
权重与优先级的关系为:
点击(此处)折叠或打开
/*
 * Load weight for each nice level, indexed by (nice + 20): entry 0 is
 * nice -20, entry 20 is nice 0 (weight 1024) and entry 39 is nice 19.
 * Each row below holds five consecutive nice levels.
 */
static const int prio_to_weight[40] = {
	/* nice -20 .. -16 */
	88761, 71755, 56483, 46273, 36291,
	/* nice -15 .. -11 */
	29154, 23254, 18705, 14949, 11916,
	/* nice -10 ..  -6 */
	9548, 7620, 6100, 4904, 3906,
	/* nice  -5 ..  -1 */
	3121, 2501, 1991, 1586, 1277,
	/* nice   0 ..   4 */
	1024, 820, 655, 526, 423,
	/* nice   5 ..   9 */
	335, 272, 215, 172, 137,
	/* nice  10 ..  14 */
	110, 87, 70, 56, 45,
	/* nice  15 ..  19 */
	36, 29, 23, 18, 15,
};
可以看到,优先级越高,权重越大。
权重和时间的关系:
ideal_time = sum_runtime *se.weight/cfs_rq.weight
1) vruntime += delta * NICE_0_LOAD / se.weight;(if se.weight != NICE_0_LOAD)
2) vruntime += delta;(if se.weight == NICE_0_LOAD)
所以,从上面来看,优先级越高,权重越大,所得到的虚拟时间越少,最后越靠近就绪队列红黑树的左边,否则,就越靠右边。而就绪队列红黑树的排序是靠如下key来决定的:
点击(此处)折叠或打开
/*
 * Sort key of @se in the red-black tree of @cfs_rq: the entity's vruntime
 * taken relative to the queue's min_vruntime, as a signed 64-bit value.
 */
static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	const s64 offset = se->vruntime - cfs_rq->min_vruntime;

	return offset;
}
之所以用减法来实现,主要是考虑到vruntime是进程获得执行的虚拟时间的长度,也就是说在进程睡眠的时候,这个虚拟时间的长度是不会增长的,而min_vruntime是整个就绪队列的最小虚拟时间基准,是一直会增长的。当一个进程睡眠一段时间后,通过上面的算法计算出来的key就会变小,从而更靠近红黑树的左边,更容易得到调度。因为这个进程受到了不公平的待遇,需要平衡一下。而利用键值排序红黑树的地方在进程入队的时候,具体如下:
点击(此处)折叠或打开
/*
 * Link @se into the red-black tree of @cfs_rq, ordered by entity_key().
 *
 * The descent goes left only when the new key is strictly smaller than
 * the key of the node being visited; equal keys go right.  If the descent
 * never turned right, @se is the new leftmost node and the cached
 * rb_leftmost pointer is updated to it.
 */
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct rb_node **slot = &cfs_rq->tasks_timeline.rb_node;
	struct rb_node *above = NULL;
	const s64 new_key = entity_key(cfs_rq, se);
	int is_leftmost = 1;

	/* Walk down until an empty child slot is reached. */
	while (*slot != NULL) {
		struct sched_entity *other;

		above = *slot;
		other = rb_entry(above, struct sched_entity, run_node);

		/* Collisions are not special-cased: equal keys stay together. */
		if (new_key >= entity_key(cfs_rq, other)) {
			slot = &above->rb_right;
			is_leftmost = 0;
		} else {
			slot = &above->rb_left;
		}
	}

	/* Keep the frequently used leftmost cache current. */
	if (is_leftmost)
		cfs_rq->rb_leftmost = &se->run_node;

	rb_link_node(&se->run_node, above, slot);
	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
}
我们再简单解析一下CFS调度器的几个关键的函数:
进程入队操作:
点击(此处)折叠或打开
/*
 * enqueue_task hook of the fair class: queue the entity of task @p, then
 * continue with whatever for_each_sched_entity() yields next, stopping at
 * the first entity that is already on a run queue.
 *
 * @wakeup: non-zero adds ENQUEUE_WAKEUP to the flags of the first
 *          enqueue.  Per the article's note it is 1 when the task was
 *          just woken and 0 when it was already runnable.
 * @head:   not used by this function.
 */
static void
enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head)
{
	struct sched_entity *se = &p->se;
	int flags = wakeup ? ENQUEUE_WAKEUP : 0;

	if (p->state == TASK_WAKING)
		flags |= ENQUEUE_MIGRATE;

	/*
	 * for_each_sched_entity() is defined elsewhere; the article
	 * describes it as iterating over the entities of the group.
	 */
	for_each_sched_entity(se) {
		if (se->on_rq)
			break;

		/* The actual enqueue onto the cfs_rq this entity belongs to. */
		enqueue_entity(cfs_rq_of(se), se, flags);

		/* Every entity after the first is enqueued as a plain wakeup. */
		flags = ENQUEUE_WAKEUP;
	}

	hrtick_update(rq);
}
点击(此处)折叠或打开
/*
 * Put @se onto @cfs_rq.
 *
 * When the enqueue is not a plain wakeup (ENQUEUE_WAKEUP clear, or
 * ENQUEUE_MIGRATE set) the entity's vruntime is first re-based by adding
 * the queue's min_vruntime.  The current entity's statistics are then
 * updated, the enqueue is accounted, and the entity is linked into the
 * red-black tree unless it is the entity currently running on @cfs_rq.
 */
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
/*
* Update the normalized vruntime before updating min_vruntime
* through calling update_curr().
*/
if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATE))
se->vruntime += cfs_rq->min_vruntime;
/*
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
account_entity_enqueue(cfs_rq, se);
/*
* Wakeup path: per the article's note this is where the entity's
* virtual runtime is updated (place_entity() is defined elsewhere).
*/
if (flags & ENQUEUE_WAKEUP) {
place_entity(cfs_rq, se, 0);
enqueue_sleeper(cfs_rq, se);
}
update_stats_enqueue(cfs_rq, se);
check_spread(cfs_rq, se);
if (se != cfs_rq->curr)
/* Insert into the red-black tree; skipped when se is cfs_rq->curr. */
__enqueue_entity(cfs_rq, se);
}
进程出队操作:
点击(此处)折叠或打开
/*
 * dequeue_task hook of the fair class: dequeue the entity of task @p and
 * keep going with whatever for_each_sched_entity() yields next, until a
 * queue is reached that still carries load after the removal.
 *
 * @sleep: passed to dequeue_entity() for the first entity; every later
 *         entity is dequeued with sleep == 1.
 */
static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
{
	struct sched_entity *se = &p->se;

	/*
	 * for_each_sched_entity() is defined elsewhere; the article
	 * describes it as iterating over the scheduling group.
	 */
	for_each_sched_entity(se) {
		struct cfs_rq *queue = cfs_rq_of(se);

		/* The actual dequeue from the queue this entity belongs to. */
		dequeue_entity(queue, se, sleep);

		/* Don't dequeue parent if it has other entities besides us */
		if (queue->load.weight != 0)
			break;

		sleep = 1;
	}

	hrtick_update(rq);
}
点击(此处)折叠或打开
/*
 * Take @se off @cfs_rq.
 *
 * Updates the current entity's statistics, optionally records sleep/block
 * start timestamps (CONFIG_SCHEDSTATS builds only), unlinks the entity
 * from the red-black tree unless it is the entity currently running on
 * @cfs_rq, accounts the dequeue and refreshes min_vruntime.  When the
 * entity is not going to sleep (@sleep == 0) its vruntime is made relative
 * again by subtracting the queue's min_vruntime.
 */
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
{
/*
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
update_stats_dequeue(cfs_rq, se);
if (sleep) {
#ifdef CONFIG_SCHEDSTATS
/* For tasks, stamp the run-queue clock into sleep_start / block_start. */
if (entity_is_task(se)) {
struct task_struct *tsk = task_of(se);
if (tsk->state & TASK_INTERRUPTIBLE)
se->sleep_start = rq_of(cfs_rq)->clock;
if (tsk->state & TASK_UNINTERRUPTIBLE)
se->block_start = rq_of(cfs_rq)->clock;
}
#endif
}
clear_buddies(cfs_rq, se);
if (se != cfs_rq->curr)
/* Remove from the red-black tree; skipped when se is cfs_rq->curr. */
__dequeue_entity(cfs_rq, se);
account_entity_dequeue(cfs_rq, se);
update_min_vruntime(cfs_rq);
/*
* Normalize the entity after updating the min_vruntime because the
* update can refer to the ->curr item and we need to reflect this
* movement in our normalized position.
*/
if (!sleep)
se->vruntime -= cfs_rq->min_vruntime;
}
最后,我们说一下,每个进程的时间控制是在什么时机做的?实际上是在周期调度entity_tick中实现的,具体如下:
点击(此处)折叠或打开
/*
 * Per-tick work for the entity @curr on @cfs_rq.
 *
 * Always updates the current entity's run-time statistics.  With
 * CONFIG_SCHED_HRTICK, a queued tick forces a reschedule immediately, and
 * an active hrtick timer suppresses the regular check unless the
 * DOUBLE_TICK feature is enabled.  Otherwise check_preempt_tick() is
 * called when more than one entity is runnable or the WAKEUP_PREEMPT
 * feature is disabled.
 */
static void
entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
{
/*
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
#ifdef CONFIG_SCHED_HRTICK
/*
* queued ticks are scheduled to match the slice, so don't bother
* validating it and just reschedule.
*/
if (queued) {
resched_task(rq_of(cfs_rq)->curr);
return;
}
/*
* don't let the period tick interfere with the hrtick preemption
*/
if (!sched_feat(DOUBLE_TICK) &&
hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
return;
#endif
if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))
/* This is where the entity's run time is checked against its slice. */
check_preempt_tick(cfs_rq, curr);
}
点击(此处)折叠或打开
/*
 * Decide at tick time whether the running entity @curr must be preempted.
 *
 * A reschedule is requested when
 *   - @curr has run longer than the slice sched_slice() grants it, or
 *   - WAKEUP_PREEMPT is enabled, @curr has run for at least
 *     sysctl_sched_min_granularity, more than one entity is runnable, and
 *     @curr's vruntime is ahead of the entity __pick_next_entity() returns
 *     by more than that slice.
 */
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
	unsigned long ideal_runtime, delta_exec;

	/* The time the entity ideally should run for. */
	ideal_runtime = sched_slice(cfs_rq, curr);

	/* The time it actually ran: growth of sum_exec_runtime. */
	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;

	/* Ran past the ideal slice: set the resched flag. */
	if (delta_exec > ideal_runtime) {
		resched_task(rq_of(cfs_rq)->curr);
		/*
		 * The current task ran long enough, ensure it doesn't get
		 * re-elected due to buddy favours.
		 */
		clear_buddies(cfs_rq, curr);
		return;
	}

	/*
	 * Ensure that a task that missed wakeup preemption by a
	 * narrow margin doesn't have to wait for a full slice.
	 * This also mitigates buddy induced latencies under load.
	 */
	if (!sched_feat(WAKEUP_PREEMPT))
		return;

	/* Ran for less than the guaranteed minimum: leave it alone. */
	if (delta_exec < sysctl_sched_min_granularity)
		return;

	if (cfs_rq->nr_running > 1) {
		/* Per the article, this is the leftmost entity of the tree. */
		struct sched_entity *se = __pick_next_entity(cfs_rq);
		s64 delta = curr->vruntime - se->vruntime;

		/*
		 * delta is signed while ideal_runtime is unsigned long.  Where
		 * unsigned long is as wide as s64, a negative delta would be
		 * converted to a huge unsigned value by the comparison below
		 * and trigger a superfluous reschedule.  curr is not ahead of
		 * se in that case, so there is nothing to do.
		 */
		if (delta < 0)
			return;

		/* curr is ahead of se by more than its slice: preempt it too. */
		if (delta > ideal_runtime)
			resched_task(rq_of(cfs_rq)->curr);
	}
}