以3.10版本内核为例,linux为了提升多核调度的效率,每个cpu上都有一个runqueue结构,
这样就避免了多核争抢同一个runqueue造成的瓶颈。
在每个runqueue中,包含了多个调度类(sched_class)的runqueue,调度类下面的子类主要有cfs(完全公平调度器)、
RT(实时调度器),其他的子类使用不多。
几个调度器子类间的优先级顺序:deadline class -> rt class -> cfs class -> idle class。
对于cfs rq来讲,为了更好地体现公平性原则以及负载均衡需要,从高到低按照cpu物理架构依次划分成
调度域(sched_domain)、调度组(sched_group),调度组里面包含的是调度实体(sched_entity),
一个调度实体既可以是单个task_struct,也可以是一个task_group。
可以参考下网络上的组织关系图:
接下来我们逐一看下调度相关的数据结构:
/*
* This is the main, per-CPU runqueue data structure.
*
* Locking rule: those places that want to lock multiple runqueues
* (such as the load balancing or the thread migration code), lock
* acquire operations must be ordered by ascending &runqueue.
*/
/*每个CPU都有自己的 struct rq 结构,其用于描述在此CPU上所运行的所有进程,
其包括一个实时进程队列和一个根CFS运行队列,在调度时,
调度器首先会先去实时进程队列找是否有实时进程需要运行,
如果没有才会去CFS运行队列找是否有进程需要运行
*/
struct rq {
/* runqueue lock: */
raw_spinlock_t lock;
/*
 * nr_running and cpu_load should be in the same cacheline because
 * remote CPUs use both these fields when doing load calculation.
 */
// total number of runnable tasks on this CPU (cfs + rt, including the one running)
unsigned int nr_running;
/* historical CPU load: cpu_load[0] always tracks load.weight; when the CPU is
 * balanced, cpu_load[1] and cpu_load[2] should also equal load.weight */
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
unsigned long last_load_update_tick;
#ifdef CONFIG_NO_HZ_COMMON
u64 nohz_stamp;
unsigned long nohz_flags;
#endif
#ifdef CONFIG_NO_HZ_FULL
unsigned long last_sched_tick;
#endif
// nonzero when the next rq clock update should be skipped
int skip_clock_update;
/* capture load from *all* tasks on this cpu: */
// sum of the load of every runnable task on this CPU;
// must be updated whenever nr_running is updated
struct load_weight load;
unsigned long nr_load_updates;
// number of context switches performed on this rq
u64 nr_switches;
// per-scheduling-class sub-runqueues (cfs / rt / deadline)
struct cfs_rq cfs;
struct rt_rq rt;
struct dl_rq dl;
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this cpu: */
struct list_head leaf_cfs_rq_list;
#ifdef CONFIG_SMP
unsigned long h_load_throttle;
#endif /* CONFIG_SMP */
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
struct list_head leaf_rt_rq_list;
#endif
/*
 * This is part of a global counter where only the total sum
 * over all CPUs matters. A task can increase this counter on
 * one CPU and if it got migrated afterwards it may decrease
 * it on another CPU. Always updated under the runqueue lock:
 */
// tasks that were once on this queue and are now in TASK_UNINTERRUPTIBLE
unsigned long nr_uninterruptible;
struct task_struct *curr, *idle, *stop;
// time (in jiffies) of the next load-balance run
unsigned long next_balance;
struct mm_struct *prev_mm; // mm of the task switched out, kept across the context switch
// rq clock values
u64 clock;
u64 clock_task;
// number of tasks on this rq currently waiting for I/O
atomic_t nr_iowait;
#ifdef CONFIG_SMP
struct root_domain *rd;
struct sched_domain *sd;
unsigned long cpu_power;
unsigned char idle_balance;
/* For active balancing */
int post_schedule;
int active_balance;
int push_cpu;
struct task_struct *push_task;
struct cpu_stop_work active_balance_work;
/* cpu of this runqueue: */
int cpu;
int online;
// list of all cfs tasks on this rq
struct list_head cfs_tasks;
u64 rt_avg;
u64 age_stamp;
u64 idle_stamp;
u64 avg_idle;
int cstate, wakeup_latency, wakeup_energy;
#endif
#ifdef CONFIG_SCHED_HMP
struct sched_cluster *cluster;
struct cpumask freq_domain_cpumask;
struct hmp_sched_stats hmp_stats;
u64 window_start;
int prefer_idle;
u32 mostly_idle_load;
int mostly_idle_nr_run;
int mostly_idle_freq;
unsigned long hmp_flags;
u64 cur_irqload;
u64 avg_irqload;
u64 irqload_ts;
#ifdef CONFIG_SCHED_FREQ_INPUT
unsigned int old_busy_time;
int notifier_sent;
#endif
#endif
#ifdef CONFIG_SCHED_FREQ_INPUT
u64 curr_runnable_sum;
u64 prev_runnable_sum;
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
u64 prev_irq_time;
#endif
#ifdef CONFIG_PARAVIRT
u64 prev_steal_time;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
u64 prev_steal_time_rq;
#endif
/* calc_load related fields */
unsigned long calc_load_update;
long calc_load_active;
#ifdef CONFIG_SCHED_HRTICK
#ifdef CONFIG_SMP
int hrtick_csd_pending;
struct call_single_data hrtick_csd;
#endif
struct hrtimer hrtick_timer;
#endif
#ifdef CONFIG_SCHEDSTATS
/* latency stats */
struct sched_info rq_sched_info;
unsigned long long rq_cpu_time;
/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
/* sys_sched_yield() stats */
unsigned int yld_count;
/* schedule() stats */
unsigned int sched_count;
unsigned int sched_goidle;
/* try_to_wake_up() stats */
unsigned int ttwu_count;
unsigned int ttwu_local;
#endif
#ifdef CONFIG_SMP
struct llist_head wake_list;
#endif
struct sched_avg avg;
};
/* CFS-related fields in a runqueue */
struct cfs_rq {
// total load of all scheduling entities queued on this cfs_rq
struct load_weight load;
// nr_running: entities queued directly on this cfs_rq;
// h_nr_running: tasks counted through the whole group hierarchy below it
unsigned int nr_running, h_nr_running;
u64 exec_clock;
// smallest vruntime on this queue; monotonic baseline for new/woken entities
u64 min_vruntime;
#ifndef CONFIG_64BIT
u64 min_vruntime_copy;
#endif
// root of the rb tree that orders runnable entities by vruntime
struct rb_root tasks_timeline;
// cached leftmost rb-tree node: the entity with the smallest vruntime,
// i.e. the next entity to be scheduled
struct rb_node *rb_leftmost;
/*
 * 'curr' points to currently running entity on this cfs_rq.
 * It is set to NULL otherwise (i.e when none are currently running).
 */
// next: an entity that should run next even if that bends strict CFS order;
// checked at pick time and preferred when set
struct sched_entity *curr, *next, *last, *skip;
#ifdef CONFIG_SCHED_DEBUG
unsigned int nr_spread_over;
#endif
#ifdef CONFIG_SMP
/*
 * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
 * removed when useful for applications beyond shares distribution (e.g.
 * load-balance).
 */
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * CFS Load tracking
 * Under CFS, load is tracked on a per-entity basis and aggregated up.
 * This allows for the description of both thread and group usage (in
 * the FAIR_GROUP_SCHED case).
 */
u64 runnable_load_avg, blocked_load_avg;
atomic64_t decay_counter, removed_load;
u64 last_decay;
#endif /* CONFIG_FAIR_GROUP_SCHED */
/* These always depend on CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_FAIR_GROUP_SCHED
u32 tg_runnable_contrib;
u64 tg_load_contrib;
#endif /* CONFIG_FAIR_GROUP_SCHED */
/*
 * h_load = weight * f(tg)
 *
 * Where f(tg) is the recursive weight fraction assigned to
 * this group.
 */
unsigned long h_load;
#endif /* CONFIG_SMP */
#ifdef CONFIG_FAIR_GROUP_SCHED
// the cpu rq this cfs_rq is attached to
struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
/*
 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
 * (like users, containers etc.)
 *
 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
 * list is used during load balance.
 */
int on_list;
struct list_head leaf_cfs_rq_list;
// the task group this cfs_rq belongs to
struct task_group *tg; /* group that "owns" this runqueue */
#ifdef CONFIG_CFS_BANDWIDTH
#ifdef CONFIG_SCHED_HMP
struct hmp_sched_stats hmp_stats;
#endif
int runtime_enabled;
u64 runtime_expires;
s64 runtime_remaining;
u64 throttled_clock, throttled_clock_task;
u64 throttled_clock_task_time;
int throttled, throttle_count;
struct list_head throttled_list;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};
/* Real-Time classes' related field in a runqueue: */
struct rt_rq {
// priority array: one queue per rt priority, with a bitmap marking non-empty priorities
struct rt_prio_array active;
// number of runnable rt tasks on this rt_rq
unsigned int rt_nr_running;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
struct {
int curr; /* highest queued rt task prio */
#ifdef CONFIG_SMP
int next; /* next highest */
#endif
} highest_prio;
#endif
#ifdef CONFIG_SMP
unsigned long rt_nr_migratory;
unsigned long rt_nr_total;
int overloaded;
struct plist_head pushable_tasks;
#endif
int rt_throttled;
u64 rt_time;
u64 rt_runtime;
/* Nests inside the rq lock: */
raw_spinlock_t rt_runtime_lock;
#ifdef CONFIG_RT_GROUP_SCHED
unsigned long rt_nr_boosted;
// the rq this rt_rq is attached to
struct rq *rq;
struct list_head leaf_rt_rq_list;
// the task group this rt_rq belongs to
struct task_group *tg;
#endif
};
struct sched_domain {
/* These fields must be setup */
struct sched_domain *parent; /* top domain must be null terminated */
struct sched_domain *child; /* bottom domain must be null terminated */
struct sched_group *groups; /* the balancing groups of the domain */
unsigned long min_interval; /* Minimum balance interval ms */
unsigned long max_interval; /* Maximum balance interval ms */
unsigned int busy_factor; /* less balancing by factor if busy */// balance factor applied while the cpu is busy
unsigned int imbalance_pct; /* No balance until over watermark */// percentage threshold used to judge whether this domain is balanced
unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */
unsigned int busy_idx; // cpu_load index used for busy balancing
unsigned int idle_idx; // cpu_load index used for idle balancing
unsigned int newidle_idx; // cpu_load index used when a cpu about to go idle balances in order to avoid idling
unsigned int wake_idx;
unsigned int forkexec_idx;
unsigned int smt_gain;
int nohz_idle; /* NOHZ IDLE status */
int flags; /* See SD_* */
int level; // level of this domain in the sched-domain hierarchy
/* Runtime fields. */
unsigned long last_balance; /* init to jiffies. units in jiffies */// last time this domain ran load balancing
unsigned int balance_interval; /* initialise to 1. units in ms. */// interval between balance runs
unsigned int nr_balance_failed; /* initialise to 0 */ // number of failed balance attempts
u64 last_update;
#ifdef CONFIG_SCHEDSTATS
/* load_balance() stats */
unsigned int lb_count[CPU_MAX_IDLE_TYPES];
unsigned int lb_failed[CPU_MAX_IDLE_TYPES];
unsigned int lb_balanced[CPU_MAX_IDLE_TYPES];
unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES];
unsigned int lb_gained[CPU_MAX_IDLE_TYPES];
unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES];
unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES];
unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES];
/* Active load balancing */
unsigned int alb_count;
unsigned int alb_failed;
unsigned int alb_pushed;
/* SD_BALANCE_EXEC stats */
unsigned int sbe_count;
unsigned int sbe_balanced;
unsigned int sbe_pushed;
/* SD_BALANCE_FORK stats */
unsigned int sbf_count;
unsigned int sbf_balanced;
unsigned int sbf_pushed;
/* try_to_wake_up() stats */
unsigned int ttwu_wake_remote;
unsigned int ttwu_move_affine;
unsigned int ttwu_move_balance;
#endif
#ifdef CONFIG_SCHED_DEBUG
char *name;
#endif
union {
void *private; /* used during construction */
struct rcu_head rcu; /* used during destruction */
};
unsigned int span_weight;
/*
 * Span of all CPUs in this domain.
 *
 * NOTE: this field is variable length. (Allocated dynamically
 * by attaching extra space to the end of the structure,
 * depending on how many CPUs the kernel has booted up with)
 */
unsigned long span[0]; // bitmap of all cpus contained in this domain
};
struct sched_group {
struct sched_group *next; /* Must be a circular list */// circular list: traversal of the domain's groups is done when sg == the first sg again
atomic_t ref;
unsigned int group_weight;
struct sched_group_power *sgp;
/*
 * The CPUs this group covers.
 *
 * NOTE: this field is variable length. (Allocated dynamically
 * by attaching extra space to the end of the structure,
 * depending on how many CPUs the kernel has booted up with)
 */
unsigned long cpumask[0]; // which CPUs belong to this group
};
/*该结构包含了完整的信息,用于实现对单个任务或任务组的调度。它可用于实现组调度。调度实体可能与进程没有关联。
调度实体可以表示一个进程,也可以表示一个进程组task group
*/
struct sched_entity {
/* weight of this entity; the prio_to_weight[] array maps priority (nice) values to weights */
struct load_weight load; /* for load-balancing */
// node of this entity in the cfs_rq rb tree
struct rb_node run_node;
// list node linking the entity into its group's task list
struct list_head group_node;
// whether this entity is queued on a runqueue (the rb tree holds runnable entities)
unsigned int on_rq;
// timestamp when the entity started executing
u64 exec_start;
u64 sum_exec_runtime;// total (wall-clock) execution time
u64 vruntime;// virtual runtime, updated on every scheduler tick
u64 prev_sum_exec_runtime;
u64 nr_migrations;// number of times this entity migrated to another cpu
#ifdef CONFIG_SCHEDSTATS
// scheduling statistics for this entity
struct sched_statistics statistics;
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
/* parent entity pointer: for a task it points at the entity of the group whose
 * runqueue it sits on; for a group entity it points at the entity of the
 * enclosing (parent) group */
struct sched_entity *parent;
/* rq on which this entity is (to be) queued -- the cfs_rq this entity sits on */
struct cfs_rq *cfs_rq;
/* rq "owned" by this entity/group: */
/* the group's own cfs runqueue: NULL when this entity is a plain task,
 * non-NULL when it represents a task group */
struct cfs_rq *my_q;
#endif
/*
 * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
 * removed when useful for applications beyond shares distribution (e.g.
 * load-balance).
 */
#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
/* Per-entity load-tracking */
struct sched_avg avg;
#endif
};
/*
my_q:如果此调度实体代表的是一个进程组,那么此调度实体就包含
有一个自己的CFS运行队列,其CFS运行队列中存放的是此进程组中的进程,
这些进程就不会在其他CFS运行队列的红黑树中被包含(包括顶层红黑树也不会包含他们,他们只属于这个进程组的红黑树)。
*/
struct sched_rt_entity {
struct list_head run_list;
unsigned long timeout;
unsigned long watchdog_stamp;
unsigned int time_slice;
struct sched_rt_entity *back;
#ifdef CONFIG_RT_GROUP_SCHED
struct sched_rt_entity *parent;
/* rq on which this entity is (to be) queued: */// the rt_rq this entity sits on
struct rt_rq *rt_rq;
/* rq "owned" by this entity/group: */// the group's own rt_rq when this entity represents a group
struct rt_rq *my_q;
#endif
};
/*
linux是一个多用户系统,如果有两个进程分别属于两个用户,
而进程的优先级不同,会导致两个用户所占用的CPU时间不同,
这样显然是不公平的(如果优先级差距很大,低优先级进程所属用户使用
CPU的时间就很小),所以内核引入组调度
如果基于用户分组,即使进程优先级不同,这两个用户使用的CPU时间都为50%
在多核多CPU的情况下,同一进程组的进程有可能在不同CPU上同时运行,所以每个进程组都必须对每个CPU分配它的调度实体(struct sched_entity 和 struct sched_rt_entity)和运行队列(struct cfs_rq 和 struct rt_rq)。
*/
/* task group related information */
struct task_group {
/* lets a task find the task group it belongs to (cgroup linkage) */
struct cgroup_subsys_state css;
bool notify_on_migrate;
#ifdef CONFIG_SCHED_HMP
bool upmigrate_discouraged;
#endif
// see alloc_fair_sched_group()/alloc_rt_sched_group() for how the per-cpu arrays are built
/* the group owns one scheduling entity and one runqueue per CPU, because its
 * tasks may run on several CPUs at once (e.g. task A on CPU0, task B on CPU1) */
#ifdef CONFIG_FAIR_GROUP_SCHED
/* schedulable entities of this group on each cpu */
struct sched_entity **se;
/* runqueue "owned" by this group on each cpu */
struct cfs_rq **cfs_rq;
// weight ("priority") of the whole group
unsigned long shares;
atomic_t load_weight;
atomic64_t load_avg;
atomic_t runnable_avg;
#endif
#ifdef CONFIG_RT_GROUP_SCHED
// rt-scheduler side of the group
struct sched_rt_entity **rt_se;
struct rt_rq **rt_rq;
struct rt_bandwidth rt_bandwidth;
#endif
struct rcu_head rcu;
// list node (presumably on the global task-group list — verify against kernel source)
struct list_head list;
// parent task group
struct task_group *parent;
// sibling-groups list node
struct list_head siblings;
// child-groups list head
struct list_head children;
#ifdef CONFIG_SCHED_AUTOGROUP
struct autogroup *autogroup;
#endif
struct cfs_bandwidth cfs_bandwidth;
};
struct task_struct {
// 7 possible states: TASK_RUNNING, TASK_INTERRUPTIBLE, TASK_UNINTERRUPTIBLE, TASK_STOPPED, TASK_TRACED, EXIT_ZOMBIE and EXIT_DEAD
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
void *stack;
atomic_t usage;
unsigned int flags; /* per process flags, defined below */
unsigned int ptrace;
#ifdef CONFIG_SMP
struct llist_node wake_entry;
int on_cpu; // whether the task is currently running on a cpu
#endif
// whether the task is on a runqueue
int on_rq;
// prio: dynamic priority (100~139); static_prio: static priority, static_prio = 100 + nice + 20;
// normal_prio: normal priority
int prio, static_prio, normal_prio;
unsigned int rt_priority;// realtime priority, 1~99
const struct sched_class *sched_class; // scheduling class of this task
struct sched_entity se;// cfs scheduling entity
struct sched_rt_entity rt;
#ifdef CONFIG_SCHED_HMP
struct ravg ravg;
/*
 * 'init_load_pct' represents the initial task load assigned to children
 * of this task
 */
u32 init_load_pct;
u64 run_start;
u64 last_sleep_ts;// timestamp of the last sleep
struct related_thread_group *grp;
struct list_head grp_list;
#endif
#ifdef CONFIG_CGROUP_SCHED
// the task group this task belongs to
struct task_group *sched_task_group;
#endif
struct sched_dl_entity dl;
#ifdef CONFIG_PREEMPT_NOTIFIERS
/* list of struct preempt_notifier: */
struct hlist_head preempt_notifiers;
#endif
/*
 * fpu_counter contains the number of consecutive context switches
 * that the FPU is used. If this is over a threshold, the lazy fpu
 * saving becomes unlazy to save the trap. This is an unsigned char
 * so that after 256 times the counter wraps and the behavior turns
 * lazy again; this to deal with bursty apps that only use FPU for
 * a short time
 */
unsigned char fpu_counter;
#ifdef CONFIG_BLK_DEV_IO_TRACE
unsigned int btrace_seq;
#endif
unsigned int policy;
int nr_cpus_allowed;
cpumask_t cpus_allowed;
#ifdef CONFIG_PREEMPT_RCU
int rcu_read_lock_nesting;
char rcu_read_unlock_special;
struct list_head rcu_node_entry;
#endif /* #ifdef CONFIG_PREEMPT_RCU */
#ifdef CONFIG_TREE_PREEMPT_RCU
struct rcu_node *rcu_blocked_node;
#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
#ifdef CONFIG_RCU_BOOST
struct rt_mutex *rcu_boost_mutex;
#endif /* #ifdef CONFIG_RCU_BOOST */
/* ... remainder of task_struct elided in this excerpt ... */
.....
}