进程调度是确保进程能够有效运行的内核子系统
多任务
多任务操作系统是 并发地交互执行多个进程的操作系统;可分为 抢占式任务和非抢占式任务。
抢占式:
每个进程的可运行的时间是预设好的,叫作时间片;每当进程的时间片走完,进程会被强制挂起,这个动作叫作 抢占,这个是由操作系统的调度程序来决定的,这样可以避免其他进程独占系统资源;
非抢占式:
除非进程主动停止,否则会一直运行下去,而进程主动挂起的动作叫让步,优点是给进程足够的时间处理任务,缺点是时间无法预计,并且有可能因为长时间占用系统资源造成系统崩溃;
策略:(进程分为 I/O消耗型 和 处理器消耗型)
- I/O消耗型:此类进程大部分时间用来提交 I/O请求或者是等待 I/O请求,经常处于可运行状态, 但是由于经常处理大量的 I/O请求,会导致程序容易阻塞;(不需要太长的时间片)
- 处理器消耗型: 除非被抢占,否则大部分时间都被用在执行代码上,没有太多的I/O请求;(时间片越长越好)
也有既是I/O消耗型,也是处理器消耗型的进程。
进程优先级:
一般是优先级高的进程比低的先运行,若是相同的优先级,则按照轮转的方式进行调度。
若是优先级高且时间片长,则先运行;
Linux有两种优先级:
- nice值:值越大,优先级越低;
- 实时优先级:值越大,优先级越高;
时间片:
时间片表明程序在被抢占前持续运行的时间;
一个进程是否要立刻运行,取决于进程优先级和时间片;
Linux调度算法
- 调度器类: 以模块方式提供,允许不同类型可以有针对性的选择调度算法,每个调度器都有优先级。
<kernek/sched/core.c>
struct sched_class {
const struct sched_class *next;
#ifdef CONFIG_UCLAMP_TASK
int uclamp_enabled;
#endif
void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
void (*yield_task) (struct rq *rq);
bool (*yield_to_task)(struct rq *rq, struct task_struct *p, bool preempt);
void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
/*
* Both @prev and @rf are optional and may be NULL, in which case the
* caller must already have invoked put_prev_task(rq, prev, rf).
*
* Otherwise it is the responsibility of the pick_next_task() to call
* put_prev_task() on the @prev task or something equivalent, IFF it
* returns a next task.
*
* In that case (@rf != NULL) it may return RETRY_TASK when it finds a
* higher prio class has runnable tasks.
*/
struct task_struct * (*pick_next_task)(struct rq *rq,
struct task_struct *prev,
struct rq_flags *rf);
void (*put_prev_task)(struct rq *rq, struct task_struct *p);
void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first);
#ifdef CONFIG_SMP
int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
void (*migrate_task_rq)(struct task_struct *p, int new_cpu);
void (*task_woken)(struct rq *this_rq, struct task_struct *task);
void (*set_cpus_allowed)(struct task_struct *p,
const struct cpumask *newmask);
void (*rq_online)(struct rq *rq);
void (*rq_offline)(struct rq *rq);
#endif
void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
void (*task_fork)(struct task_struct *p);
void (*task_dead)(struct task_struct *p);
/*
* The switched_from() call is allowed to drop rq->lock, therefore we
* cannot assume the switched_from/switched_to pair is serliazed by
* rq->lock. They are however serialized by p->pi_lock.
*/
void (*switched_from)(struct rq *this_rq, struct task_struct *task);
void (*switched_to) (struct rq *this_rq, struct task_struct *task);
void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
int oldprio);
unsigned int (*get_rr_interval)(struct rq *rq,
struct task_struct *task);
void (*update_curr)(struct rq *rq);
#define TASK_SET_GROUP 0
#define TASK_MOVE_GROUP 1
#ifdef CONFIG_FAIR_GROUP_SCHED
void (*task_change_group)(struct task_struct *p, int type);
#endif
};
CFS:公平调度算法
在多任务情况下趋近于完美的多任务算法,但是并不是完美的公平,降低了调度带来的不公平性。
nice值和时间片的分配通过CFS算法来平衡,例如nice值分别为10和15的两个进程,时间片的
分配为15ms 和 5ms。而觉得这个分配的不是nice的绝对值,而是其相对值和比例。
在进程优先级相同的情况下,如果进程过多甚至趋近于无限,则每个进程获得的处理器使用比和时间比都将趋近于0,这样就毫无意义(考虑到还有切换进程带来的开销代价),所以CFS算法设置了时间片最小粒度,每个进程最少也会分配到1ms的时间片。
调度实现:
调度实现由四个部分组成:
- 时间记账
- 进程选择
- 调度器入口
- 睡眠和唤醒
其相关代码在<kernel/fair.c>中
1.时间记账
所有调度器都会对进程的时间片进行记账,每当系统时钟的节拍发生时,时间片都会减少一个周期,当时间片耗尽时,就会被其他进程抢占。
CFS使用调度器的实体结构
struct sched_entity {
/* For load-balancing: */
struct load_weight load;
unsigned long runnable_weight;
struct rb_node run_node;
struct list_head group_node;
unsigned int on_rq;
u64 exec_start;
u64 sum_exec_runtime;
u64 vruntime; //存放进程的虚拟允许时间:经过了所有可运行进程总数的标准化,和节拍不再相关,因为是ns为单位
u64 prev_sum_exec_runtime;
u64 nr_migrations;
struct sched_statistics statistics;
#ifdef CONFIG_FAIR_GROUP_SCHED
int depth;
struct sched_entity *parent;
/* rq on which this entity is (to be) queued: */
struct cfs_rq *cfs_rq;
/* rq "owned" by this entity/group: */
struct cfs_rq *my_q;
#endif
#ifdef CONFIG_SMP
/*
* Per entity load average tracking.
*
* Put into separate cache line so it does not
* collide with read-mostly values above.
*/
struct sched_avg avg;
#endif
};
该结构体嵌在task_struct 内;
记账功能:
static void update_curr(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
u64 now = rq_clock_task(rq_of(cfs_rq));
u64 delta_exec; //当前进程允许的时间,然后再传给__update_curr(),然后再根据当前进程运行总数对时间进行加权
if (unlikely(!curr))
return;
/*
获得从最后一次修改负载后当前任务所占用的运行时间*/
delta_exec = now - curr->exec_start;
if (unlikely((s64)delta_exec <= 0))
return;
curr->exec_start = now;
schedstat_set(curr->statistics.exec_max,
max(delta_exec, curr->statistics.exec_max));
curr->sum_exec_runtime += delta_exec;
schedstat_add(cfs_rq->exec_clock, delta_exec);
curr->vruntime += calc_delta_fair(delta_exec, curr);
update_min_vruntime(cfs_rq);
if (entity_is_task(curr)) {
struct task_struct *curtask = task_of(curr);
trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
cgroup_account_cputime(curtask, delta_exec);
account_group_exec_runtime(curtask, delta_exec);
}
account_cfs_rq_runtime(cfs_rq, delta_exec);
}
2.进程选择(太长,暂时不写)
当CFS选择下一个运行进程时,他会挑选一个具有最小vruntime的进程。(使用红黑树来组织进程队列,沿着根节点往左子树找,可以找到最小vruntime的进程)
static struct sched_entity *__pick_next_entity(struct sched_entity *se)
{
struct rb_node *next = rb_next(&se->run_node);
if (!next)
return NULL;
return rb_entry(next, struct sched_entity, run_node);
}
添加进程至CFS:
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
bool curr = cfs_rq->curr == se;
/*
* If we're the current task, we must renormalise before calling
* update_curr().
*/
if (renorm && curr)
se->vruntime += cfs_rq->min_vruntime;
// 更新当前任务的运行时统计数据
update_curr(cfs_rq);
/*
* Otherwise, renormalise after, such that we're placed at the current
* moment in time, instead of some random moment in the past. Being
* placed in the past could significantly boost this task to the
* fairness detriment of existing tasks.
*/
if (renorm && !curr)
se->vruntime += cfs_rq->min_vruntime;
/*
* When enqueuing a sched_entity, we must:
* - Update loads to have both entity and cfs_rq synced with now.
* - Add its load to cfs_rq->runnable_avg
* - For group_entity, update its weight to reflect the new share of
* its group cfs_rq
* - Add its new weight to cfs_rq->load.weight
*/
update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
update_cfs_group(se);
enqueue_runnable_load_avg(cfs_rq, se);
account_entity_enqueue(cfs_rq, se);
if (flags & ENQUEUE_WAKEUP)
place_entity(cfs_rq, se, 0);
check_schedstat_required();
update_stats_enqueue(cfs_rq, se, flags);
check_spread(cfs_rq, se);
if (!curr)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
/*
* When bandwidth control is enabled, cfs might have been removed
* because of a parent been throttled but cfs->nr_running > 1. Try to
* add it unconditionnally.
*/
if (cfs_rq->nr_running == 1 || cfs_bandwidth_used())
list_add_leaf_cfs_rq(cfs_rq);
if (cfs_rq->nr_running == 1)
check_enqueue_throttle(cfs_rq);
}
/*
* Enqueue an entity into the rb-tree:
*/
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;
struct rb_node *parent = NULL;
struct sched_entity *entry;
bool leftmost = true;
/*
* Find the right place in the rbtree:
*/
while (*link) {
parent = *link;
entry = rb_entry(parent, struct sched_entity, run_node);
/*
* We dont care about collisions. Nodes with
* the same key stay together.
*/
if (entity_before(se, entry)) {
link = &parent->rb_left;
} else {
link = &parent->rb_right;
leftmost = false;
}
}
rb_link_node(&se->run_node, parent, link);
rb_insert_color_cached(&se->run_node,
&cfs_rq->tasks_timeline, leftmost);
}
删除进程:
static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
/*
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
/*
* When dequeuing a sched_entity, we must:
* - Update loads to have both entity and cfs_rq synced with now.
* - Subtract its load from the cfs_rq->runnable_avg.
* - Subtract its previous weight from cfs_rq->load.weight.
* - For group entity, update its weight to reflect the new share
* of its group cfs_rq.
*/
update_load_avg(cfs_rq, se, UPDATE_TG);
dequeue_runnable_load_avg(cfs_rq, se);
update_stats_dequeue(cfs_rq, se, flags);
clear_buddies(cfs_rq, se);
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
se->on_rq = 0;
account_entity_dequeue(cfs_rq, se);
/*
* Normalize after update_curr(); which will also have moved
* min_vruntime if @se is the one holding it back. But before doing
* update_min_vruntime() again, which will discount @se's position and
* can move min_vruntime forward still more.
*/
if (!(flags & DEQUEUE_SLEEP))
se->vruntime -= cfs_rq->min_vruntime;
/* return excess runtime on last dequeue */
return_cfs_rq_runtime(cfs_rq);
update_cfs_group(se);
/*
* Now advance min_vruntime if @se was the entity holding it back,
* except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
* put back on, and if we advance min_vruntime, we'll be placed back
* further than we started -- ie. we'll be penalized.
*/
if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
update_min_vruntime(cfs_rq);
}
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
}
3.调度器入口
进程调度入口函数是schedule(),(找到最高优先级的调度列)
/*
* Pick up the highest-prio task:
*/
static inline struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
const struct sched_class *class;
struct task_struct *p;
/*
* Optimization: we know that if all tasks are in the fair class we can
* call that function directly, but only if the @prev task wasn't of a
* higher scheduling class, because otherwise those loose the
* opportunity to pull in more work from other CPUs.
*/
if (likely((prev->sched_class == &idle_sched_class ||
prev->sched_class == &fair_sched_class) &&
rq->nr_running == rq->cfs.h_nr_running)) {
p = fair_sched_class.pick_next_task(rq, prev, rf);
if (unlikely(p == RETRY_TASK))
goto restart;
/* Assumes fair_sched_class->next == idle_sched_class */
if (unlikely(!p))
p = idle_sched_class.pick_next_task(rq, prev, rf);
return p;
}
4.睡眠和唤醒
休眠的进程处于一个不可执行的状态
进程把自己标记成休眠状态,从可执行红黑树中移除,放入等待队列,然后调用schedule()选择和执行其他进程,唤醒的过程则相反。
休眠有两种状态:TASK_INTERRUPTABLE 和 TASK_UNINTERRUPTABLE,前者会忽略信号量,后者遇到信号量会被提前唤醒。
抢占和上下文切换
上下文切换:从一个可执行进程切换到另外一个可执行进程。函数为context_switch()
static __always_inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next, struct rq_flags *rf)
{
prepare_task_switch(rq, prev, next);
/*
* For paravirt, this is coupled with an exit in switch_to to
* combine the page table reload and the switch backend into
* one hypercall.
*/
arch_start_context_switch(prev);
/*
* kernel -> kernel lazy + transfer active
* user -> kernel lazy + mmgrab() active
*
* kernel -> user switch + mmdrop() active
* user -> user switch
*/
if (!next->mm) { // to kernel
enter_lazy_tlb(prev->active_mm, next);
next->active_mm = prev->active_mm;
if (prev->mm) // from user
mmgrab(prev->active_mm);
else
prev->active_mm = NULL;
} else { // to user
membarrier_switch_mm(rq, prev->active_mm, next->mm);
/*
* sys_membarrier() requires an smp_mb() between setting
* rq->curr / membarrier_switch_mm() and returning to userspace.
*
* The below provides this either through switch_mm(), or in
* case 'prev->active_mm == next->mm' through
* finish_task_switch()'s mmdrop().
*/
switch_mm_irqs_off(prev->active_mm, next->mm, next); //负责将虚拟内存从上一个进程映射到新的进程中
if (!prev->mm) { // from kernel
/* will mmdrop() in finish_task_switch(). */
rq->prev_mm = prev->active_mm;
prev->active_mm = NULL;
}
}
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
prepare_lock_switch(rq, next, rf);
/* Here we just switch the register state and the stack. */
switch_to(prev, next, prev); //将上一个进程的处理器状态切换到新进程的处理器状态
barrier();
return finish_task_switch(prev);
}