【数据结构】【cfs_rq】【task_struct】【sched_domain】

struct cfs_rq

在系统中至少有一个CFS运行队列,即根CFS运行队列,其他的进程组和进程都包含在此运行队列中。不同的是,进程组又有它自己的CFS运行队列,其运行队列中包含的是此进程组中的所有进程。当调度器从根CFS运行队列中选择了一个进程组进行调度时,进程组会从自己的CFS运行队列中选择一个调度实体进行调度(这个调度实体可能是进程,也可能又是一个子进程组),就这样一直深入,直到最后选出一个进程运行为止。

cfs_rq实际上是rq中与cfs相关的字段

/* CFS-related fields in a runqueue */
struct cfs_rq {
/*
该cfs_rq的load,它只计算它本层下面的se的weight之和,并不是这个se的load,也不是递归到叶子节点上的所有se weight之和(理解这点非常重要)*/
	struct load_weight load;/*所有进程的累计负荷值*/
//h_nr_running只对于组才有效,包括底层所有cfs_rq的nr_running之和
	unsigned int nr_running, h_nr_running;nr_running/*当前就绪队列的进程数*/

	u64 exec_clock;//该cfs_rq总共占用的cpu时间(物理),只累计本层
/*
     * 当前CFS队列上最小运行时间,单调递增
     * 两种情况下更新该值: 
     * 1、更新当前运行任务的累计运行时间时
     * 2、当任务从队列删除去,如任务睡眠或退出,这时候会查看剩下的任务的vruntime是否大于min_vruntime,如果是则更新该值。
     */
//用于调整se的vruntime,它是递增的,但不一定是该cfs_rq里所有se最小
	u64 min_vruntime; //该cpu运行队列的vruntime推进值, 一般是红黑树中最小的vruntime值
#ifndef CONFIG_64BIT
	u64 min_vruntime_copy;
#endif

	struct rb_root tasks_timeline;/*红黑树的头结点*/
	struct rb_node *rb_leftmost;/*红黑树的最左面节点*/

	/*
	 * 'curr' points to currently running entity on this cfs_rq.
	 * It is set to NULL otherwise (i.e when none are currently running).
	 */
// current是正在被调用的实体对象
//当前运行的se(对于组虽然它不会在cpu上运行,但是当它的下层有一个task在cpu上运行,那么它所在的cfs_rq就把它当做是该cfs_rq上当前正在运行的se)
	struct sched_entity *curr, *next, *last, *skip;
/*
     * 'curr' points to currently running entity on this cfs_rq.
     * It is set to NULL otherwise (i.e when none are currently running).
     * curr: 当前正在运行的sched_entity(对于组虽然它不会在cpu上运行,但是当它的下层有一个task在cpu上运行,那么它所在的cfs_rq就把它当做是该cfs_rq上当前正在运行的sched_entity)
     * next: 表示有些进程急需运行,即使不遵从CFS调度也必须运行它,调度时会检查是否next需要调度,有就调度next
     *
     * skip: 略过进程(不会选择skip指定的进程调度)
     */

#ifdef	CONFIG_SCHED_DEBUG
	unsigned int nr_spread_over;
#endif

#ifdef CONFIG_SMP
	/*
	 * CFS load tracking
	 */
	struct sched_avg avg;
	u64 runnable_load_sum;
	unsigned long runnable_load_avg;
#ifdef CONFIG_64BIT_ONLY_CPU
	unsigned long runnable_load_avg_32bit;
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
	unsigned long tg_load_avg_contrib;
	unsigned long propagate_avg;
#endif
	atomic_long_t removed_load_avg, removed_util_avg;
#ifndef CONFIG_64BIT
	u64 load_last_update_time_copy;
#endif

#ifdef CONFIG_FAIR_GROUP_SCHED
	/*
	 *   h_load = weight * f(tg)
	 *
	 * Where f(tg) is the recursive weight fraction assigned to
	 * this group.
	 */
	unsigned long h_load;
	u64 last_h_load_update;
	struct sched_entity *h_load_next;
#endif /* CONFIG_FAIR_GROUP_SCHED */
#endif /* CONFIG_SMP */

#ifdef CONFIG_FAIR_GROUP_SCHED
    /* 所属于的CPU rq */
	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */

	/*
	 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
	 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
	 * (like users, containers etc.)
	 *
	 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
	 * list is used during load balance.
	 */
	int on_list;
	struct list_head leaf_cfs_rq_list;
/*属于这个cfs_rq的进程组*/ 
	struct task_group *tg;	/* group that "owns" this runqueue */

#ifdef CONFIG_SCHED_WALT
	u64 cumulative_runnable_avg;
#endif

#ifdef CONFIG_CFS_BANDWIDTH
	int runtime_enabled;
	u64 runtime_expires;
	s64 runtime_remaining;

	u64 throttled_clock, throttled_clock_task;
	u64 throttled_clock_task_time;
	int throttled, throttle_count, throttle_uptodate;
	struct list_head throttled_list;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};

task_struct

每个task对应一个se(调度实体),但是反过来不一定成立,因为存在task_group的概念,一个se也可能代表一个进程组。

 

struct task_struct
{
    ........
    /* whether the task is on a runqueue */
    int on_rq;

    /* Task priorities:
     * prio: dynamic priority, range 100~139, derived from the static
     *       priority plus a bonus
     * static_prio: static priority; static_prio = 100 + nice + 20
     *       (nice is -20~19, so static_prio ranges over 100~139)
     * normal_prio: normal priority, not affected by priority inheritance;
     *       see normal_prio() -- depends on the task's scheduling class
     */
    int prio, static_prio, normal_prio;
    /* real-time priority */
    unsigned int rt_priority;

    /* scheduling class: table of scheduler callback functions */
    const struct sched_class *sched_class;

    /* CFS scheduling entity (a node in the rbtree) */
    struct sched_entity se; // the owning task can be recovered from this entity
    /* scheduling entity used by the real-time scheduler */
    struct sched_rt_entity rt;
    struct sched_dl_entity dl;

#ifdef CONFIG_CGROUP_SCHED
    /* task group this task belongs to */
    struct task_group *sched_task_group;
#endif
    ........
}

struct sched_domain

struct sched_domain {
	/* These fields must be setup */
	/* Domains nest: parent points to the enclosing (higher-level) domain. */
	struct sched_domain *parent;	/* top domain must be null terminated */
	struct sched_domain *child;	/* bottom domain must be null terminated */
	/* circular list of the scheduling groups balanced within this domain */
	struct sched_group *groups;	/* the balancing groups of the domain */
	/* minimum interval between checks for whether a balance pass is due */
	unsigned long min_interval;	/* Minimum balance interval ms */
	unsigned long max_interval;	/* Maximum balance interval ms */
	/* when the CPU is busy, balance less often: the interval is scaled by this factor */
	unsigned int busy_factor;	/* less balancing by factor if busy */
	/* watermark (percentage) used to decide whether this domain is imbalanced */
	unsigned int imbalance_pct;	/* No balance until over watermark */

	unsigned int cache_nice_tries;	/* Leave cache hot tasks for # tries */

	unsigned int busy_idx;	/* cpu_load index used when balancing while busy */
	unsigned int idle_idx;	/* cpu_load index used when balancing while idle */
	/* cpu_load index used when a CPU about to go idle balances to avoid idling */
	unsigned int newidle_idx;
	unsigned int wake_idx;
	unsigned int forkexec_idx;
	unsigned int smt_gain;
	/* set to 1 while in nohz idle mode */
	int nohz_idle;			/* NOHZ IDLE status */
	int flags;			/* See SD_* */
	int level;          /* level of this domain in the hierarchy */

	/* Runtime fields. */
	/* time of the last balance pass on this domain */
	unsigned long last_balance;	/* init to jiffies. units in jiffies */
	/* interval between balance passes */
	unsigned int balance_interval;	/* initialise to 1. units in ms. */
	/* number of consecutive failed balance attempts */
	unsigned int nr_balance_failed; /* initialise to 0 */


	/* idle_balance() stats */
	/* max observed cost (time spent) of a newidle load balance; decays 1% per second */
	u64 max_newidle_lb_cost;
	/* next time (jiffies; HZ jiffies == 1 s) at which max_newidle_lb_cost is decayed */
	unsigned long next_decay_max_lb_cost;

#ifdef CONFIG_SCHEDSTATS
	/* load_balance() stats */
	unsigned int lb_count[CPU_MAX_IDLE_TYPES];
	unsigned int lb_failed[CPU_MAX_IDLE_TYPES];
	unsigned int lb_balanced[CPU_MAX_IDLE_TYPES];
	unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES];
	unsigned int lb_gained[CPU_MAX_IDLE_TYPES];
	unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES];
	unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES];
	unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES];

	/* Active load balancing */
	unsigned int alb_count;
	unsigned int alb_failed;
	unsigned int alb_pushed;

	/* SD_BALANCE_EXEC stats */
	unsigned int sbe_count;
	unsigned int sbe_balanced;
	unsigned int sbe_pushed;

	/* SD_BALANCE_FORK stats */
	unsigned int sbf_count;
	unsigned int sbf_balanced;
	unsigned int sbf_pushed;

	/* try_to_wake_up() stats */
	unsigned int ttwu_wake_remote;
	unsigned int ttwu_move_affine;
	unsigned int ttwu_move_balance;

	struct eas_stats eas_stats;
#endif
#ifdef CONFIG_SCHED_DEBUG
	char *name;
#endif
	union {
		void *private;		/* used during construction */
		struct rcu_head rcu;	/* used during destruction */
	};

#ifdef CONFIG_INTEL_DWS
	unsigned int total_groups;		/* total group number */
	unsigned int group_number;		/* this CPU's group sequence */
	unsigned int dws_tf;			/* consolidating degree */
	struct sched_group *first_group;	/* ordered by CPU number */
#endif

	unsigned int span_weight;
	/*
	 * Span of all CPUs in this domain.
	 *
	 * NOTE: this field is variable length. (Allocated dynamically
	 * by attaching extra space to the end of the structure,
	 * depending on how many CPUs the kernel has booted up with)
	 */
	unsigned long span[0];	/* bitmap of all CPUs in this domain */
};

4、struct sd_lb_stats

​
/*
 * sd_lb_stats - Structure to store the statistics of a sched_domain
 *		 during load balancing.
 */
struct sd_lb_stats {
	struct sched_group *busiest;	/* Busiest group in this sd */
	struct sched_group *local;	/* Local group in this sd */
	/* required by init_sd_lb_stats(), which zeroes .total_running */
	unsigned long total_running;	/* Total nr of running tasks in sd */
	unsigned long total_load;	/* Total load of all groups in sd */
	unsigned long total_capacity;	/* Total capacity of all groups in sd */
	/* average load across the groups of this domain */
	unsigned long avg_load;	/* Average load across all groups in sd */
	unsigned long total_util;

	struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
	struct sg_lb_stats local_stat;	/* Statistics of the local group */
};

​

sd做初始化:

/* Initialize the per-pass sd_lb_stats before a load-balance run. */
static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
{
	/*
	 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
	 * local_stat because update_sg_lb_stats() does a full clear/assignment.
	 * We must however clear busiest_stat::avg_load because
	 * update_sd_pick_busiest() reads this before assignment.
	 */
	*sds = (struct sd_lb_stats){
		.busiest = NULL,
		.local = NULL,
		.total_running = 0UL,
		.total_load = 0UL,
		.total_capacity = 0UL,
		.total_util = 0UL,
		.busiest_stat = {
			.avg_load = 0UL,
			.sum_nr_running = 0,
			.group_type = group_other,
		},
	};
}

5、struct sched_group

​struct sched_group {
///* Must be a circular list *///环形list sg==sg->next  domain内group遍历完成
        struct sched_group *next;       /* Must be a circular list */
        atomic_t ref;

        unsigned int group_weight;
        struct sched_group_capacity *sgc;
        const struct sched_group_energy *sge;

        bool overutilized;
        /*
         * The CPUs this group covers.
         *
         * NOTE: this field is variable length. (Allocated dynamically
         * by attaching extra space to the end of the structure,
         * depending on how many CPUs the kernel has booted up with)
         */
        // 当前group具有哪些cpu
        unsigned long cpumask[0];
};

6、struct sg_lb_stats

 
/*
 * sg_lb_stats - stats of a sched_group required for load_balancing
 */
// 在函数update_sg_lb_stats里面更新
struct sg_lb_stats {
    // sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
    // 后续作为判断sg busy的主要依据
	unsigned long avg_load; /*Avg load across the CPUs of the group */
    // 每个cpu的load的sum
	unsigned long group_load; /* Total load over the CPUs of the group */
    //也是sum
	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
    //sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
	unsigned long load_per_task;//組中平均每個task的負載
group可容納的task數量,这个不分轻重吗?
//这个存在多个说法
    // 单个cpu需要考虑rt的影响
	unsigned long group_capacity; //sg所有cpu capacity的累加
    // sum
	unsigned long group_util; /* Total utilization of the group */
	unsigned int sum_nr_running; /* Nr tasks running in the group */
	unsigned int idle_cpus;//idle状态的cpu计数
    // 这个是存疑且不确定的值
	unsigned int group_weight;// online的cpu的个数
// 严重级别 group_overloaded > group_imbalanced > group_other,后面还多了一个
	enum group_type group_type;
	int group_no_capacity;// sgs的capacity已经不够用,赶不上util,所以此时group_overloaded了
	int group_misfit_task; /* A cpu has a task too big for its capacity */
};

7、struct lb_env

​enum fbq_type { regular, remote, all };

struct lb_env {
	struct sched_domain	*sd;//所在的sd

	struct rq		*src_rq;
	int			src_cpu;

	int			dst_cpu;//这里dst_cpu就是需要将task pull到的cpu,目标cpu
	struct rq		*dst_rq;
//由于一些cpu allows的设置,导致一些task不能被迁移到dst_cpu上,
所以在出现这种情况的时候,就需要从dst cpu所在的group上选择另外一个cpu
	struct cpumask		*dst_grpmask;
	int			new_dst_cpu;
	
	enum cpu_idle_type	idle;//当前cpu是否是idle
	long			imbalance;//需要迁移的负载,这个是数量还是load值?
	unsigned int		src_grp_nr_running;// 源cpu的task的数量,不一定是最busiest的cpu
	/* The set of CPUs under consideration for load-balancing */
	struct cpumask		*cpus;

	unsigned int		flags;

	unsigned int		loop;
	unsigned int		loop_break;
	unsigned int		loop_max;//最大迁移的task的数量

	enum fbq_type		fbq_type;
	enum group_type		busiest_group_type;
//初始化链表,后续会将需要迁移的task暂时放在这个链表里面
	struct list_head	tasks;
};

8、struct root_domain

/*
 * We add the notion of a root-domain which will be used to define per-domain
 * variables. Each exclusive cpuset essentially defines an island domain by
 * fully partitioning the member cpus from any other cpuset. Whenever a new
 * exclusive cpuset is created, we also create and attach a new root-domain
 * object.
 *
 */
struct root_domain {
	atomic_t refcount;
	atomic_t rto_count;
	struct rcu_head rcu;
	cpumask_var_t span;
	cpumask_var_t online;

	/*
	 * Indicate pullable load on at least one CPU, e.g:
	 * - More than one runnable task
	 * - Running task is misfit
	 */
	int overload;

	/*
	 * The bit corresponding to a CPU gets set here if such CPU has more
	 * than one runnable -deadline task (as it is below for RT tasks).
	 */
	cpumask_var_t dlo_mask;
	atomic_t dlo_count;
	struct dl_bw dl_bw;
	struct cpudl cpudl;

#ifdef HAVE_RT_PUSH_IPI
	/*
	 * For IPI pull requests, loop across the rto_mask.
	 */
	struct irq_work rto_push_work;
	raw_spinlock_t rto_lock;
	/* These are only updated and read within rto_lock */
	int rto_loop;
	int rto_cpu;
	/* These atomics are updated outside of a lock */
	atomic_t rto_loop_next;
	atomic_t rto_loop_start;
#endif
	/*
	 * The "RT overload" flag: it gets set if a CPU has more than
	 * one runnable RT task.
	 */
	cpumask_var_t rto_mask;
	struct cpupri cpupri;

	/* Maximum cpu capacity in the system. */
	struct max_cpu_capacity max_cpu_capacity;

	/* First cpu with maximum and minimum original capacity */
	int max_cap_orig_cpu, min_cap_orig_cpu;
	/* First cpu with middle original capacity */
	int mid_cap_orig_cpu;
};

9、struct sched_group_capacity


struct sched_group_capacity {
	atomic_t ref;
	/*
	 * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity
	 * for a single CPU.
	 */
	unsigned long capacity;
	unsigned long max_capacity; /* Max per-cpu capacity in group */
	unsigned long min_capacity; /* Min per-CPU capacity in group */
	/* time of the next update; update_group_capacity() is called when it elapses */
	unsigned long next_update;
	/* NOTE(review): appears to be 1 when imbalanced, 0 otherwise -- confirm */
	int imbalance; /* XXX unrelated to capacity but shared group state */
	/*
	 * Number of busy cpus in this group.
	 */
	atomic_t nr_busy_cpus;	/* decremented when a CPU enters idle */

	unsigned long cpumask[0]; /* iteration mask */
};

10、struct ravg

/* ravg represents frequency scaled cpu-demand of tasks */
struct ravg {
	/*
	 * 'mark_start' marks the beginning of an event (task waking up, task
	 * starting to execute, task being preempted) within a window
	 *
	 * 'sum' represents how runnable a task has been within current
	 * window. It incorporates both running time and wait time and is
	 * frequency scaled.
	 *
	 * 'sum_history' keeps track of history of 'sum' seen over previous
	 * RAVG_HIST_SIZE windows. Windows where task was entirely sleeping are
	 * ignored.
	 *
	 * 'demand' represents maximum sum seen over previous
	 * sysctl_sched_ravg_hist_size windows. 'demand' could drive frequency
	 * demand for tasks.
	 *
	 * 'curr_window' represents task's contribution to cpu busy time
	 * statistics (rq->curr_runnable_sum) in current window
	 *
	 * 'prev_window' represents task's contribution to cpu busy time
	 * statistics (rq->prev_runnable_sum) in previous window
	 */
	/* start of the current event within the window */
	u64 mark_start;
	/* sum: runnable time (running + waiting, frequency scaled) in the
	 * current window; demand: max 'sum' over the history windows, may
	 * drive frequency demand */
	u32 sum, demand;
	/* history of 'sum' over the previous RAVG_HIST_SIZE windows */
	u32 sum_history[RAVG_HIST_SIZE_MAX];
	/* contributions to rq->curr_runnable_sum / rq->prev_runnable_sum */
	u32 curr_window, prev_window;
	u16 active_windows;
};
 
 
 

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值