文章目录
先回顾下之前的定义,进程调度是指操作系统按照某种策略或者规则选择进程占用CPU进行运行的过程;
即:什么时候调度(调度时机)、怎么调度(上下文切换)、按照什么方式调度(调度策略)
1.调度策略
linux将进程主要划分为实时进程与普通进程,使用5种调度器;
一共五种调度器:STOP、RT、 DEADLINE、 CFS、 IDLE调度器;
调度策略:SCHED_FIFO
SCHED_RR
SCHED_IDLE
SCHED_NORMAL
SCHED_BATCH
其中STOP、IDLE调度器仅供内核使用,用户态常用的为CFS、RT调度器,其中RT常用于嵌入式系统。
Deadline调度器是linux3.14版本引入的,本文分析所使用的是linux3.0.20版本,因此还未引入,不过道理都是类似的,我们重点介绍CFS 与RT调度器
2.调度的数据结构
抽象体 | 结构体 |
---|---|
CPU | struct rq |
调度队列 | rq->cfs_rq rq->rt_rq |
进程 | struct task_struct |
调度实体 | task_struct->sched_entity task_struct->sched_rt_entity |
调度类 | task_struct->sched_class |
内核通过rq、task_struct、sched_class等结构体对调度器进行抽象:每个CPU抽象为一个rq结构体,每个进程抽象为一个task_struct,sched_class则代表一种调度类,以下为几者的关系:
【1】图示左上,对cpu的核进行的抽象,提取为struct rq 结构体。cpu可以认为是一个具有一定算力的贪吃蛇,可以反复不断地运行进程;cfs_rq、rt_rq这两个进程队列已经指向了进程实体,cpu只需要按照优先级反复执行即可。
【2】图示上中,数据结构,可以看见内核选取了红黑树作为CFS调度器的数据结构,选择了按优先级索引的链表数组(struct rt_prio_array,作用类似哈希表)作为rt调度器的数据结构。
【3】图示右上,进程抽象,其中包含着普通进程调度实体与实时进程调度实体,通过调度实体找到了进程task_struct就完成了进程的调度。
【4】图示下,调度器,对调度器的抽象,调度器是承载在进程task_struct上,这样就可以通过配置调度策略来选择对应的调度器。
可以看出,内核背后的抽象思维非常值得我们学习,可以在做实际项目中体会这种思维;
3. cpu抽象rq结构体
此结构体主要是对CPU进行抽象,主要为cpu上的运行队列,主要以单核调度进行说明,多核与组调度,在此并不作为重点进行说明;
/*
* This is the main, per-CPU runqueue data structure.
*
* Locking rule: those places that want to lock multiple runqueues
* (such as the load balancing or the thread migration code), lock
* acquire operations must be ordered by ascending &runqueue.
*/
struct rq {
/* runqueue lock: */
raw_spinlock_t lock;
/*
* nr_running and cpu_load should be in the same cacheline because
* remote CPUs use both these fields when doing load calculation.
*/
运行队列上调度实体的个数,是所有子调度器类中就绪实体之和
unsigned long nr_running;
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
unsigned long last_load_update_tick;
#ifdef CONFIG_NO_HZ
u64 nohz_stamp;
unsigned char nohz_balance_kick;
#endif
int skip_clock_update;
/* capture load from *all* tasks on this cpu: */
// 表示 rq 的权重,对于每个调度实体,都有一个权重值来表示进程的优先级,这里的 load 是整个队列上的总 load 值,反映了当前 runqueue 上进程的总体权重信息.
struct load_weight load;
struct load_weight load;
// 负载的统计次数
unsigned long nr_load_updates;
// 该运行队列上进程的切换次数
u64 nr_switches;
// cfs 调度器类的就绪队列
struct cfs_rq cfs;
// rt 调度器类的就绪队列
struct rt_rq rt;
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this cpu: */
struct list_head leaf_cfs_rq_list;
#endif
#ifdef CONFIG_RT_GROUP_SCHED
struct list_head leaf_rt_rq_list;
#endif
/*
* This is part of a global counter where only the total sum
* over all CPUs matters. A task can increase this counter on
* one CPU and if it got migrated afterwards it may decrease
* it on another CPU. Always updated under the runqueue lock:
*/
unsigned long nr_uninterruptible;
// 保存的进程指针,分别对应当前执行进程 curr,idle 进程(空闲时调用),stop进程(用于停止 CPU)
struct task_struct *curr, *idle, *stop;
unsigned long next_balance;
struct mm_struct *prev_mm;
u64 clock;
u64 clock_task;
atomic_t nr_iowait;
#ifdef CONFIG_SMP
struct root_domain *rd;
struct sched_domain *sd;
unsigned long cpu_power;
unsigned char idle_at_tick;
/* For active balancing */
int post_schedule;
int active_balance;
int push_cpu;
struct cpu_stop_work active_balance_work;
/* cpu of this runqueue: */
int cpu;
int online;
unsigned long avg_load_per_task;
u64 rt_avg;
u64 age_stamp;
u64 idle_stamp;
u64 avg_idle;
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
u64 prev_irq_time;
#endif
/* calc_load related fields */
unsigned long calc_load_update;
long calc_load_active;
#ifdef CONFIG_SCHED_HRTICK
#ifdef CONFIG_SMP
int hrtick_csd_pending;
struct call_single_data hrtick_csd;
#endif
struct hrtimer hrtick_timer;
#endif
#ifdef CONFIG_SCHEDSTATS
/* latency stats */
struct sched_info rq_sched_info;
unsigned long long rq_cpu_time;
/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
/* sys_sched_yield() stats */
unsigned int yld_count;
/* schedule() stats */
unsigned int sched_switch;
unsigned int sched_count;
unsigned int sched_goidle;
/* try_to_wake_up() stats */
unsigned int ttwu_count;
unsigned int ttwu_local;
#endif
#ifdef CONFIG_SMP
struct task_struct *wake_list;
#endif
};
4.调度队列cfs_rq rt_rq
每个CPU上管理这两个调度队列,实时调度队列,CFS调度队列;其目的就是用来管理调度实体的,将调度实体按照规则进行组织;
cfs_rq即普通进程运行队列,管理着普通任务。cfs使用红黑树进行管理,运行队列中保存了红黑树的根节点(tasks_timeline)以及最左边的节点(rb_leftmost),后者即下一个要调度的节点;
rt_rq即实时进程运行队列,管理着实时任务。rt使用按优先级索引的链表数组(rt_prio_array,类似哈希表)进行管理,运行队列通过active成员指向该结构,对待调度的节点进行管理。
/*
 * CFS-related fields in a runqueue.
 *
 * One instance is embedded in every struct rq (member 'cfs'). The
 * scheduling entities it manages are organised in a red-black tree.
 */
struct cfs_rq {
/* NOTE(review): presumably the summed weight of the entities queued here - confirm. */
struct load_weight load;
/* NOTE(review): presumably the number of entities queued on this cfs_rq - confirm. */
unsigned long nr_running;
u64 exec_clock;
u64 min_vruntime;
/* Extra copy that only exists on builds without CONFIG_64BIT. */
#ifndef CONFIG_64BIT
u64 min_vruntime_copy;
#endif
/* Root of the red-black tree holding the entities. */
struct rb_root tasks_timeline;
/* Leftmost node of the tree, i.e. the next entity to be scheduled. */
struct rb_node *rb_leftmost;
struct list_head tasks;
struct list_head *balance_iterator;
/*
* 'curr' points to currently running entity on this cfs_rq.
* It is set to NULL otherwise (i.e when none are currently running).
*/
struct sched_entity *curr, *next, *last, *skip;
#ifdef CONFIG_SCHED_DEBUG
unsigned int nr_spread_over;
#endif
/* Everything below exists only when fair group scheduling is configured. */
#ifdef CONFIG_FAIR_GROUP_SCHED
struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
/*
* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
* a hierarchy). Non-leaf lrqs hold other higher schedulable entities
* (like users, containers etc.)
*
* leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
* list is used during load balance.
*/
int on_list;
struct list_head leaf_cfs_rq_list;
struct task_group *tg; /* group that "owns" this runqueue */
#ifdef CONFIG_SMP
/*
* the part of load.weight contributed by tasks
*/
unsigned long task_weight;
/*
* h_load = weight * f(tg)
*
* Where f(tg) is the recursive weight fraction assigned to
* this group.
*/
unsigned long h_load;
/*
* Maintaining per-cpu shares distribution for group scheduling
*
* load_stamp is the last time we updated the load average
* load_last is the last time we updated the load average and saw load
* load_unacc_exec_time is currently unaccounted execution time
*/
u64 load_avg;
u64 load_period;
u64 load_stamp, load_last, load_unacc_exec_time;
unsigned long load_contribution;
#endif
#endif
};
/*
 * Real-Time classes' related field in a runqueue.
 *
 * One instance is embedded in every struct rq (member 'rt').
 */
struct rt_rq {
/*
 * Container of the queued RT entities. The definition of
 * struct rt_prio_array is not part of this excerpt.
 */
struct rt_prio_array active;
/* NOTE(review): presumably the number of RT entities queued here - confirm. */
unsigned long rt_nr_running;
/* Priority tracking is only compiled in for SMP or RT group scheduling. */
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
struct {
int curr; /* highest queued rt task prio */
#ifdef CONFIG_SMP
int next; /* next highest */
#endif
} highest_prio;
#endif
#ifdef CONFIG_SMP
unsigned long rt_nr_migratory;
unsigned long rt_nr_total;
int overloaded;
struct plist_head pushable_tasks;
#endif
/*
 * NOTE(review): rt_throttled / rt_time / rt_runtime look like the
 * bookkeeping for the RT runtime limit - confirm against the code
 * that updates them.
 */
int rt_throttled;
u64 rt_time;
u64 rt_runtime;
/* Nests inside the rq lock: */
raw_spinlock_t rt_runtime_lock;
/* Group-scheduling links, only with CONFIG_RT_GROUP_SCHED. */
#ifdef CONFIG_RT_GROUP_SCHED
unsigned long rt_nr_boosted;
struct rq *rq;
struct list_head leaf_rt_rq_list;
struct task_group *tg;
#endif
};
5.调度实体sched_entity sched_rt_entity
调度实体,每一种调度器的调度实体是不一样的,是每一种调度器关键的调度载体。
cfs调度器,使用sched_entity调度实体,其中包含虚拟的运行时间,总执行时间等信息。
rt调度器,使用sched_rt_entity调度实体,其中包括时间片等信息。
/*
 * Scheduling entity used by the CFS scheduler. Among other things it
 * carries the virtual runtime and the total execution time.
 */
struct sched_entity {
struct load_weight load; /* for load-balancing */
/* NOTE(review): presumably the node linked into cfs_rq->tasks_timeline - confirm. */
struct rb_node run_node;
struct list_head group_node;
/* NOTE(review): presumably non-zero while the entity is queued - confirm. */
unsigned int on_rq;
u64 exec_start;
/* Total execution time. */
u64 sum_exec_runtime;
/* Virtual runtime. */
u64 vruntime;
u64 prev_sum_exec_runtime;
u64 nr_migrations;
#ifdef CONFIG_SCHEDSTATS
struct sched_statistics statistics;
#endif
/* Hierarchy links, only with CONFIG_FAIR_GROUP_SCHED. */
#ifdef CONFIG_FAIR_GROUP_SCHED
struct sched_entity *parent;
/* rq on which this entity is (to be) queued: */
struct cfs_rq *cfs_rq;
/* rq "owned" by this entity/group: */
struct cfs_rq *my_q;
#endif
};
/*
 * Scheduling entity used by the RT scheduler. Among other things it
 * carries the time slice.
 */
struct sched_rt_entity {
/* NOTE(review): presumably the linkage into the rt_rq's queue - confirm. */
struct list_head run_list;
unsigned long timeout;
/* Time slice of this entity. */
unsigned int time_slice;
int nr_cpus_allowed;
struct sched_rt_entity *back;
/* Hierarchy links, only with CONFIG_RT_GROUP_SCHED. */
#ifdef CONFIG_RT_GROUP_SCHED
struct sched_rt_entity *parent;
/* rq on which this entity is (to be) queued: */
struct rt_rq *rt_rq;
/* rq "owned" by this entity/group: */
struct rt_rq *my_q;
#endif
};
6.调度类sched_class
每个 CPU 拥有各自的 runqueue,而 runqueue 中维护了各个调度器类的相关信息:包括 cfs_rq,rt_rq.
每个不同的调度器类按照优先级排列依次为: stop_sched_class->rt_sched_class->fair_sched_class->idle_sched_class,
当高优先级调度器中存在就绪任务时,就不会轮到低优先级调度器中的任务执行;
内核为实时进程设置的运行占比默认为0.95,即当实时进程一直占用 CPU 时,会强行给非实时任务留出 5% 的执行时间;该比例可以通过sysctl指令配置(kernel.sched_rt_runtime_us 与 kernel.sched_rt_period_us)。
/*
 * Operations table of one scheduler class. Every hook is a function
 * pointer; 'next' chains the classes together.
 */
struct sched_class {
	/* Next scheduler class in the chain. */
	const struct sched_class *next;

	/* Add task p to the scheduling queue of rq. */
	void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
	/* Remove task p from the scheduling queue of rq. */
	void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
	void (*yield_task) (struct rq *rq);
	bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt);

	void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);

	/* Return the next process or thread that is about to be scheduled. */
	struct task_struct * (*pick_next_task) (struct rq *rq);
	void (*put_prev_task) (struct rq *rq, struct task_struct *p);

	/* Hooks that only exist on SMP builds. */
#ifdef CONFIG_SMP
	int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);

	void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
	void (*post_schedule) (struct rq *this_rq);
	void (*task_waking) (struct task_struct *task);
	void (*task_woken) (struct rq *this_rq, struct task_struct *task);

	void (*set_cpus_allowed)(struct task_struct *p,
				 const struct cpumask *newmask);

	void (*rq_online)(struct rq *rq);
	void (*rq_offline)(struct rq *rq);
#endif

	void (*set_curr_task) (struct rq *rq);
	void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
	void (*task_fork) (struct task_struct *p);

	void (*switched_from) (struct rq *this_rq, struct task_struct *task);
	void (*switched_to) (struct rq *this_rq, struct task_struct *task);
	/* Priority-change hook; oldprio carries the task's previous priority. */
	void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
			      int oldprio);

	unsigned int (*get_rr_interval) (struct rq *rq,
					 struct task_struct *task);

#ifdef CONFIG_FAIR_GROUP_SCHED
	void (*task_move_group) (struct task_struct *p, int on_rq);
#endif
};