struct cfs_rq
在系统中至少有一个CFS运行队列,其就是根CFS运行队列,而其他的进程组和进程都包含在此运行队列中,不同的是进程组又有它自己的CFS运行队列,其运行队列中包含的是此进程组中的所有进程。当调度器从根CFS运行队列中选择了一个进程组进行调度时,进程组会从自己的CFS运行队列中选择一个调度实体进行调度(这个调度实体可能为进程,也可能又是一个子进程组),就这样一直深入,直到最后选出一个进程进行运行为止。
cfs_rq实际上是rq中与cfs相关的字段
/* CFS-related fields in a runqueue */
struct cfs_rq {
/*
该cfs_rq的load,它只计算它本层下面的se的weight之和,并不是这个se的load,也不是递归到叶子节点上的所有se weight之和(理解这点非常重要)*/
struct load_weight load;/*所有进程的累计负荷值*/
//h_nr_running只对于组才有效,包括底层所有cfs_rq的nr_running之和
unsigned int nr_running, h_nr_running;nr_running/*当前就绪队列的进程数*/
u64 exec_clock;//该cfs_rq总共占用的cpu时间(物理),只累计本层
/*
* 当前CFS队列上最小运行时间,单调递增
* 两种情况下更新该值:
* 1、更新当前运行任务的累计运行时间时
* 2、当任务从队列删除去,如任务睡眠或退出,这时候会查看剩下的任务的vruntime是否大于min_vruntime,如果是则更新该值。
*/
//用于调整se的vruntime,它是递增的,但不一定是该cfs_rq里所有se最小
u64 min_vruntime; //该cpu运行队列的vruntime推进值, 一般是红黑树中最小的vruntime值
#ifndef CONFIG_64BIT
u64 min_vruntime_copy;
#endif
struct rb_root tasks_timeline;/*红黑树的头结点*/
struct rb_node *rb_leftmost;/*红黑树的最左面节点*/
/*
* 'curr' points to currently running entity on this cfs_rq.
* It is set to NULL otherwise (i.e when none are currently running).
*/
// current是正在被调用的实体对象
//当前运行的se(对于组虽然它不会在cpu上运行,但是当它的下层有一个task在cpu上运行,那么它所在的cfs_rq就把它当做是该cfs_rq上当前正在运行的se)
struct sched_entity *curr, *next, *last, *skip;
/*
* 'curr' points to currently running entity on this cfs_rq.
* It is set to NULL otherwise (i.e when none are currently running).
* curr: 当前正在运行的sched_entity(对于组虽然它不会在cpu上运行,但是当它的下层有一个task在cpu上运行,那么它所在的cfs_rq就把它当做是该cfs_rq上当前正在运行的sched_entity)
* next: 表示有些进程急需运行,即使不遵从CFS调度也必须运行它,调度时会检查是否next需要调度,有就调度next
*
* skip: 略过进程(不会选择skip指定的进程调度)
*/
#ifdef CONFIG_SCHED_DEBUG
unsigned int nr_spread_over;
#endif
#ifdef CONFIG_SMP
/*
* CFS load tracking
*/
struct sched_avg avg;
u64 runnable_load_sum;
unsigned long runnable_load_avg;
#ifdef CONFIG_64BIT_ONLY_CPU
unsigned long runnable_load_avg_32bit;
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
unsigned long tg_load_avg_contrib;
unsigned long propagate_avg;
#endif
atomic_long_t removed_load_avg, removed_util_avg;
#ifndef CONFIG_64BIT
u64 load_last_update_time_copy;
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* h_load = weight * f(tg)
*
* Where f(tg) is the recursive weight fraction assigned to
* this group.
*/
unsigned long h_load;
u64 last_h_load_update;
struct sched_entity *h_load_next;
#endif /* CONFIG_FAIR_GROUP_SCHED */
#endif /* CONFIG_SMP */
#ifdef CONFIG_FAIR_GROUP_SCHED
/* 所属于的CPU rq */
struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
/*
* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
* a hierarchy). Non-leaf lrqs hold other higher schedulable entities
* (like users, containers etc.)
*
* leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
* list is used during load balance.
*/
int on_list;
struct list_head leaf_cfs_rq_list;
/*属于这个cfs_rq的进程组*/
struct task_group *tg; /* group that "owns" this runqueue */
#ifdef CONFIG_SCHED_WALT
u64 cumulative_runnable_avg;
#endif
#ifdef CONFIG_CFS_BANDWIDTH
int runtime_enabled;
u64 runtime_expires;
s64 runtime_remaining;
u64 throttled_clock, throttled_clock_task;
u64 throttled_clock_task_time;
int throttled, throttle_count, throttle_uptodate;
struct list_head throttled_list;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};
task_struct
每个task对应一个se,但是反过去不一定成立,因为有task_group的概念
struct task_struct
{
........
/* whether the task is on a runqueue */
int on_rq;
/* Task priorities:
 * prio:        dynamic priority, range 100..139; derived from the static
 *              priority plus any bonus / inheritance effects
 * static_prio: static priority, static_prio = 100 + nice + 20
 *              (nice is -20..19, so static_prio is 100..139)
 * normal_prio: regular priority without priority-inheritance effects;
 *              see normal_prio(), depends on the task's scheduling class
 */
int prio, static_prio, normal_prio;
/* real-time priority */
unsigned int rt_priority;
/* scheduling class: the table of scheduler callbacks for this task */
const struct sched_class *sched_class;
/* CFS scheduling entity (a node in the rbtree); the owning task can be
 * found back from this entity */
struct sched_entity se;
/* scheduling entity used by the real-time scheduler */
struct sched_rt_entity rt;
struct sched_dl_entity dl;
#ifdef CONFIG_CGROUP_SCHED
/* task group this task belongs to */
struct task_group *sched_task_group;
#endif
........
}
struct sched_domain
struct sched_domain {
/* These fields must be setup */
/* Domains nest: parent points to the enclosing (larger) domain. */
struct sched_domain *parent; /* top domain must be null terminated */
struct sched_domain *child; /* bottom domain must be null terminated */
/* the groups being balanced within this domain */
struct sched_group *groups; /* the balancing groups of the domain */
/* minimum interval between checks for whether balancing is due */
unsigned long min_interval; /* Minimum balance interval ms */
unsigned long max_interval; /* Maximum balance interval ms */
/* When the CPU is busy the balance interval is stretched by this factor. */
unsigned int busy_factor; /* less balancing by factor if busy */
/* watermark (percentage) used to decide whether the domain is balanced */
unsigned int imbalance_pct; /* No balance until over watermark */
unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */
unsigned int busy_idx;/* cpu_load index used for busy balancing */
unsigned int idle_idx;/* cpu_load index used for idle balancing */
/* cpu_load index used when a CPU about to go idle balances to avoid idling */
unsigned int newidle_idx;
unsigned int wake_idx;
unsigned int forkexec_idx;
unsigned int smt_gain;
/* set to 1 when this CPU enters nohz-idle mode */
int nohz_idle; /* NOHZ IDLE status */
int flags; /* See SD_* */
int level; /* level of this domain in the hierarchy */
/* Runtime fields. */
/* time of the last balance on this domain */
unsigned long last_balance; /* init to jiffies. units in jiffies */
/* interval between successive balance runs */
unsigned int balance_interval; /* initialise to 1. units in ms. */
/* number of failed balance attempts */
unsigned int nr_balance_failed; /* initialise to 0 */
/* idle_balance() stats */
/* max time spent doing a newidle load balance; decays by 1% per second */
u64 max_newidle_lb_cost;
/* next time at which max_newidle_lb_cost is decayed (HZ = 1s of jiffies) */
unsigned long next_decay_max_lb_cost;
#ifdef CONFIG_SCHEDSTATS
/* load_balance() stats */
unsigned int lb_count[CPU_MAX_IDLE_TYPES];
unsigned int lb_failed[CPU_MAX_IDLE_TYPES];
unsigned int lb_balanced[CPU_MAX_IDLE_TYPES];
unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES];
unsigned int lb_gained[CPU_MAX_IDLE_TYPES];
unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES];
unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES];
unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES];
/* Active load balancing */
unsigned int alb_count;
unsigned int alb_failed;
unsigned int alb_pushed;
/* SD_BALANCE_EXEC stats */
unsigned int sbe_count;
unsigned int sbe_balanced;
unsigned int sbe_pushed;
/* SD_BALANCE_FORK stats */
unsigned int sbf_count;
unsigned int sbf_balanced;
unsigned int sbf_pushed;
/* try_to_wake_up() stats */
unsigned int ttwu_wake_remote;
unsigned int ttwu_move_affine;
unsigned int ttwu_move_balance;
struct eas_stats eas_stats;
#endif
#ifdef CONFIG_SCHED_DEBUG
char *name;
#endif
union {
void *private; /* used during construction */
struct rcu_head rcu; /* used during destruction */
};
#ifdef CONFIG_INTEL_DWS
unsigned int total_groups; /* total group number */
unsigned int group_number; /* this CPU's group sequence */
unsigned int dws_tf; /* consolidating degree */
struct sched_group *first_group; /* ordered by CPU number */
#endif
unsigned int span_weight;
/*
 * Span of all CPUs in this domain.
 *
 * NOTE: this field is variable length. (Allocated dynamically
 * by attaching extra space to the end of the structure,
 * depending on how many CPUs the kernel has booted up with)
 */
unsigned long span[0];/* bitmap of all CPUs in this domain */
};
4、struct sd_lb_stats
/*
 * sd_lb_stats - Structure to store the statistics of a sched_domain
 * during load balancing.
 */
struct sd_lb_stats {
	struct sched_group *busiest;	/* Busiest group in this sd */
	struct sched_group *local;	/* Local group in this sd */
	/*
	 * Total nr_running over all groups in the sd; was missing here
	 * although init_sd_lb_stats() below initializes it.
	 */
	unsigned long total_running;
	unsigned long total_load;	/* Total load of all groups in sd */
	unsigned long total_capacity;	/* Total capacity of all groups in sd */
	/* average load across the groups of the domain */
	unsigned long avg_load;	/* Average load across all groups in sd */
	unsigned long total_util;	/* Total utilization of all groups in sd */
	struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
	struct sg_lb_stats local_stat;	/* Statistics of the local group */
};
sd做初始化:
/* Partially reset the per-domain balance statistics before a balance pass. */
static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
{
/*
 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
 * local_stat because update_sg_lb_stats() does a full clear/assignment.
 * We must however clear busiest_stat::avg_load because
 * update_sd_pick_busiest() reads this before assignment.
 */
*sds = (struct sd_lb_stats){
.busiest = NULL,
.local = NULL,
.total_running = 0UL,
.total_load = 0UL,
.total_capacity = 0UL,
.total_util = 0UL,
.busiest_stat = {
.avg_load = 0UL,
.sum_nr_running = 0,
.group_type = group_other,
},
};
}
4、struct sched_group
struct sched_group {
/* Circular list: iteration over the domain's groups is complete when
 * sg == sg->next wraps back to the start. */
struct sched_group *next; /* Must be a circular list */
atomic_t ref;
unsigned int group_weight;
struct sched_group_capacity *sgc;
const struct sched_group_energy *sge;
bool overutilized;
/*
 * The CPUs this group covers.
 *
 * NOTE: this field is variable length. (Allocated dynamically
 * by attaching extra space to the end of the structure,
 * depending on how many CPUs the kernel has booted up with)
 */
/* bitmap of the CPUs belonging to this group */
unsigned long cpumask[0];
};
5、sg_lb_stats
/*
* sg_lb_stats - stats of a sched_group required for load_balancing
*/
// 在函数update_sg_lb_stats里面更新
struct sg_lb_stats {
// sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
// 后续作为判断sg busy的主要依据
unsigned long avg_load; /*Avg load across the CPUs of the group */
// 每个cpu的load的sum
unsigned long group_load; /* Total load over the CPUs of the group */
//也是sum
unsigned long sum_weighted_load; /* Weighted load of group's tasks */
//sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
unsigned long load_per_task;//組中平均每個task的負載
group可容納的task數量,这个不分轻重吗?
//这个存在多个说法
// 单个cpu需要考虑rt的影响
unsigned long group_capacity; //sg所有cpu capacity的累加
// sum
unsigned long group_util; /* Total utilization of the group */
unsigned int sum_nr_running; /* Nr tasks running in the group */
unsigned int idle_cpus;//idle状态的cpu计数
// 这个是存疑且不确定的值
unsigned int group_weight;// online的cpu的个数
// 严重级别 group_overloaded > group_imbalanced > group_other,后面还多了一个
enum group_type group_type;
int group_no_capacity;// sgs的capacity已经不够用,赶不上util,所以此时group_overloaded了
int group_misfit_task; /* A cpu has a task too big for its capacity */
};
5、struct lb_env
enum fbq_type { regular, remote, all };
struct lb_env {
struct sched_domain *sd;//所在的sd
struct rq *src_rq;
int src_cpu;
int dst_cpu;//这里dst_cpu就是需要将task pull到的cpu,目标cpu
struct rq *dst_rq;
//由于一些cpu allows的设置,导致一些task不能被迁移到dst_cpu上,
所以在出现这种情况的时候,就需要从dst cpu所在的group上选择另外一个cpu
struct cpumask *dst_grpmask;
int new_dst_cpu;
enum cpu_idle_type idle;//当前cpu是否是idle
long imbalance;//需要迁移的负载,这个是数量还是load值?
unsigned int src_grp_nr_running;// 源cpu的task的数量,不一定是最busiest的cpu
/* The set of CPUs under consideration for load-balancing */
struct cpumask *cpus;
unsigned int flags;
unsigned int loop;
unsigned int loop_break;
unsigned int loop_max;//最大迁移的task的数量
enum fbq_type fbq_type;
enum group_type busiest_group_type;
//初始化链表,后续会将需要迁移的task暂时放在这个链表里面
struct list_head tasks;
};
6、struct root_domain
/*
 * We add the notion of a root-domain which will be used to define per-domain
 * variables. Each exclusive cpuset essentially defines an island domain by
 * fully partitioning the member cpus from any other cpuset. Whenever a new
 * exclusive cpuset is created, we also create and attach a new root-domain
 * object.
 *
 */
struct root_domain {
atomic_t refcount;
atomic_t rto_count;
struct rcu_head rcu;
cpumask_var_t span;
cpumask_var_t online;
/*
 * Indicate pullable load on at least one CPU, e.g:
 * - More than one runnable task
 * - Running task is misfit
 */
int overload;
/*
 * The bit corresponding to a CPU gets set here if such CPU has more
 * than one runnable -deadline task (as it is below for RT tasks).
 */
cpumask_var_t dlo_mask;
atomic_t dlo_count;
struct dl_bw dl_bw;
struct cpudl cpudl;
#ifdef HAVE_RT_PUSH_IPI
/*
 * For IPI pull requests, loop across the rto_mask.
 */
struct irq_work rto_push_work;
raw_spinlock_t rto_lock;
/* These are only updated and read within rto_lock */
int rto_loop;
int rto_cpu;
/* These atomics are updated outside of a lock */
atomic_t rto_loop_next;
atomic_t rto_loop_start;
#endif
/*
 * The "RT overload" flag: it gets set if a CPU has more than
 * one runnable RT task.
 */
cpumask_var_t rto_mask;
struct cpupri cpupri;
/* Maximum cpu capacity in the system. */
struct max_cpu_capacity max_cpu_capacity;
/* First cpu with maximum and minimum original capacity */
int max_cap_orig_cpu, min_cap_orig_cpu;
/* First cpu with middle original capacity */
int mid_cap_orig_cpu;
};
6、sched_group_capacity
struct sched_group_capacity {
atomic_t ref;
/*
 * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity
 * for a single CPU.
 */
unsigned long capacity;
/* Max per-cpu capacity in group */
unsigned long max_capacity;
unsigned long min_capacity; /* Min per-CPU capacity in group */
/* time of the next refresh via update_group_capacity() */
unsigned long next_update;
/* NOTE(review): 1 appears to mean imbalanced, 0 balanced (or balancing
 * impossible) -- confirm exact semantics against the kernel source */
int imbalance; /* XXX unrelated to capacity but shared group state */
/*
 * Number of busy cpus in this group.
 */
atomic_t nr_busy_cpus;/* decremented when a CPU goes idle */
unsigned long cpumask[0]; /* iteration mask */
};
7、struct ravg
/* ravg represents frequency scaled cpu-demand of tasks */
struct ravg {
/*
 * 'mark_start' marks the beginning of an event (task waking up, task
 * starting to execute, task being preempted) within a window
 *
 * 'sum' represents how runnable a task has been within current
 * window. It incorporates both running time and wait time and is
 * frequency scaled.
 *
 * 'sum_history' keeps track of history of 'sum' seen over previous
 * RAVG_HIST_SIZE windows. Windows where task was entirely sleeping are
 * ignored.
 *
 * 'demand' represents maximum sum seen over previous
 * sysctl_sched_ravg_hist_size windows. 'demand' could drive frequency
 * demand for tasks.
 *
 * 'curr_window' represents task's contribution to cpu busy time
 * statistics (rq->curr_runnable_sum) in current window
 *
 * 'prev_window' represents task's contribution to cpu busy time
 * statistics (rq->prev_runnable_sum) in previous window
 */
u64 mark_start;
/* see the block comment above for 'sum' and 'demand' */
u32 sum, demand;
u32 sum_history[RAVG_HIST_SIZE_MAX];
u32 curr_window, prev_window;
u16 active_windows;
};