linux schedule 数据结构

以3.10版本内核为例,linux为了提升多核调度的效率,每个cpu上都有一个runqueue结构,
这样就避免了多核争抢同一个runqueue造成的瓶颈。
在每个runqueue中,包含了多个调度类(sched_class)的runqueue,调度类下面的子类主要有cfs(完全公平调度器),
RT(实时调度器),其他的子类使用不多。
几个调度器子类间的优先级顺序:stop class -> deadline class -> rt class -> cfs class -> idle class。


对于cfs rq来讲,为了更好地体现公平性原则以及负载均衡需要,从高到低按照cpu物理架构依次划分成
调度域(sched_domain), 调度组(sched_group),调度组里面包含的是调度实体(sched_entity),
一个调度实体既可以是单个task_struct,也可以是一个task_group。


可以参考下网络上的组织关系图:




接下来我们逐一看下调度相关的数据结构:

/*
 * This is the main, per-CPU runqueue data structure.
 *
 * Locking rule: those places that want to lock multiple runqueues
 * (such as the load balancing or the thread migration code), lock
 * acquire operations must be ordered by ascending &runqueue.
 */
 /*每个CPU都有自己的 struct rq 结构,其用于描述在此CPU上所运行的所有进程,
 其包括一个实时进程队列和一个根CFS运行队列,在调度时,
 调度器首先会先去实时进程队列找是否有实时进程需要运行,
 如果没有才会去CFS运行队列找是否有进程需要运行
 */
 
 struct rq {
	/* runqueue lock: */
	raw_spinlock_t lock;

	/*
	 * nr_running and cpu_load should be in the same cacheline because
	 * remote CPUs use both these fields when doing load calculation.
	 */
	//此CPU上总共就绪的进程数,包括cfs,rt和正在运行的
	unsigned int nr_running;
	/* 根据CPU历史情况计算的负载,cpu_load[0]一直等于load.weight,当达到负载平衡时,cpu_load[1]和cpu_load[2]都应该等于load.weight */
	#define CPU_LOAD_IDX_MAX 5
	unsigned long cpu_load[CPU_LOAD_IDX_MAX];
	unsigned long last_load_update_tick;
#ifdef CONFIG_NO_HZ_COMMON
	u64 nohz_stamp;
	unsigned long nohz_flags;
#endif
#ifdef CONFIG_NO_HZ_FULL
	unsigned long last_sched_tick;
#endif
	//是否需要更新rq的运行时间
	int skip_clock_update;

	/* capture load from *all* tasks on this cpu: */
	//CPU负载,该CPU上所有可运行进程的load之和,nr_running更新时这个值也必须更新
	struct load_weight load;
	unsigned long nr_load_updates;
	//上下文切换次数
	u64 nr_switches;
	
	//不同调度子类rq
	struct cfs_rq cfs;
	struct rt_rq rt;
	struct dl_rq dl;

#ifdef CONFIG_FAIR_GROUP_SCHED
	/* list of leaf cfs_rq on this cpu: */
	struct list_head leaf_cfs_rq_list;
#ifdef CONFIG_SMP
	unsigned long h_load_throttle;
#endif /* CONFIG_SMP */
#endif /* CONFIG_FAIR_GROUP_SCHED */

#ifdef CONFIG_RT_GROUP_SCHED
	struct list_head leaf_rt_rq_list;
#endif

	/*
	 * This is part of a global counter where only the total sum
	 * over all CPUs matters. A task can increase this counter on
	 * one CPU and if it got migrated afterwards it may decrease
	 * it on another CPU. Always updated under the runqueue lock:
	 */
	//曾经处于队列但现在处于TASK_UNINTERRUPTIBLE状态的进程数量
	unsigned long nr_uninterruptible;

	struct task_struct *curr, *idle, *stop;
	//下次进行负载平衡执行时间 
	unsigned long next_balance;
	struct mm_struct *prev_mm;//在进程切换时用来存放换出进程的内存描述符地址
	//rq 运行时间
	u64 clock;
	u64 clock_task;
	//等待io的task数
	atomic_t nr_iowait;

#ifdef CONFIG_SMP
	struct root_domain *rd;
	struct sched_domain *sd;

	unsigned long cpu_power;

	unsigned char idle_balance;
	/* For active balancing */
	int post_schedule;
	int active_balance;
	int push_cpu;
	struct task_struct *push_task;
	struct cpu_stop_work active_balance_work;
	/* cpu of this runqueue: */
	int cpu;
	int online;

	//rq 所有cfs task队列
	struct list_head cfs_tasks;

	u64 rt_avg;
	u64 age_stamp;
	u64 idle_stamp;
	u64 avg_idle;
	int cstate, wakeup_latency, wakeup_energy;
#endif

#ifdef CONFIG_SCHED_HMP
	struct sched_cluster *cluster;
	struct cpumask freq_domain_cpumask;
	struct hmp_sched_stats hmp_stats;

	u64 window_start;
	int prefer_idle;
	u32 mostly_idle_load;
	int mostly_idle_nr_run;
	int mostly_idle_freq;
	unsigned long hmp_flags;

	u64 cur_irqload;
	u64 avg_irqload;
	u64 irqload_ts;

#ifdef CONFIG_SCHED_FREQ_INPUT
	unsigned int old_busy_time;
	int notifier_sent;
#endif
#endif

#ifdef CONFIG_SCHED_FREQ_INPUT
	u64 curr_runnable_sum;
	u64 prev_runnable_sum;
#endif

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
	u64 prev_irq_time;
#endif
#ifdef CONFIG_PARAVIRT
	u64 prev_steal_time;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	u64 prev_steal_time_rq;
#endif

	/* calc_load related fields */
	unsigned long calc_load_update;
	long calc_load_active;

#ifdef CONFIG_SCHED_HRTICK
#ifdef CONFIG_SMP
	int hrtick_csd_pending;
	struct call_single_data hrtick_csd;
#endif
	struct hrtimer hrtick_timer;
#endif

#ifdef CONFIG_SCHEDSTATS
	/* latency stats */
	struct sched_info rq_sched_info;
	unsigned long long rq_cpu_time;
	/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */

	/* sys_sched_yield() stats */
	unsigned int yld_count;

	/* schedule() stats */
	unsigned int sched_count;
	unsigned int sched_goidle;

	/* try_to_wake_up() stats */
	unsigned int ttwu_count;
	unsigned int ttwu_local;
#endif

#ifdef CONFIG_SMP
	struct llist_head wake_list;
#endif

	struct sched_avg avg;
}

/* CFS-related fields in a runqueue */
/* CFS-related fields in a runqueue */
struct cfs_rq {
	/* aggregate load weight of all entities queued on this cfs_rq */
	struct load_weight load;
	/*
	 * nr_running: entities queued directly on this cfs_rq;
	 * h_nr_running: tasks in the whole hierarchy rooted here.
	 */
	unsigned int nr_running, h_nr_running;

	u64 exec_clock;
	/* minimum vruntime of the queue; baseline for newly woken entities */
	u64 min_vruntime;
#ifndef CONFIG_64BIT
	u64 min_vruntime_copy;
#endif
	/* root of the red-black tree of entities, ordered by vruntime */
	struct rb_root tasks_timeline;
	/* cached leftmost (smallest-vruntime) node: the entity to run next */
	struct rb_node *rb_leftmost;

	/*
	 * 'curr' points to currently running entity on this cfs_rq.
	 * It is set to NULL otherwise (i.e when none are currently running).
	 * 'next' marks an entity that should preferably run next: pick-next
	 * checks it and, if set, schedules it even when strict CFS order
	 * would choose a different entity.
	 */
	struct sched_entity *curr, *next, *last, *skip;

#ifdef	CONFIG_SCHED_DEBUG
	unsigned int nr_spread_over;
#endif

#ifdef CONFIG_SMP
/*
 * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
 * removed when useful for applications beyond shares distribution (e.g.
 * load-balance).
 */
#ifdef CONFIG_FAIR_GROUP_SCHED
	/*
	 * CFS Load tracking
	 * Under CFS, load is tracked on a per-entity basis and aggregated up.
	 * This allows for the description of both thread and group usage (in
	 * the FAIR_GROUP_SCHED case).
	 */
	u64 runnable_load_avg, blocked_load_avg;
	atomic64_t decay_counter, removed_load;
	u64 last_decay;
#endif /* CONFIG_FAIR_GROUP_SCHED */
/* These always depend on CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_FAIR_GROUP_SCHED
	u32 tg_runnable_contrib;
	u64 tg_load_contrib;
#endif /* CONFIG_FAIR_GROUP_SCHED */

	/*
	 *   h_load = weight * f(tg)
	 *
	 * Where f(tg) is the recursive weight fraction assigned to
	 * this group.
	 */
	unsigned long h_load;
#endif /* CONFIG_SMP */

#ifdef CONFIG_FAIR_GROUP_SCHED
	/* the cpu runqueue this cfs_rq is attached to */
	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */

	/*
	 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
	 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
	 * (like users, containers etc.)
	 *
	 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
	 * list is used during load balance.
	 */
	int on_list;
	struct list_head leaf_cfs_rq_list;
	/* the task_group that "owns" this runqueue */
	struct task_group *tg;	/* group that "owns" this runqueue */

#ifdef CONFIG_CFS_BANDWIDTH

#ifdef CONFIG_SCHED_HMP
	struct hmp_sched_stats hmp_stats;
#endif

	int runtime_enabled;
	u64 runtime_expires;
	s64 runtime_remaining;

	u64 throttled_clock, throttled_clock_task;
	u64 throttled_clock_task_time;
	int throttled, throttle_count;
	struct list_head throttled_list;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};

/* Real-Time classes' related field in a runqueue: */
struct rt_rq {
	//优先级队列,根据优先级分成不同的队列,位图标记优先级
	struct rt_prio_array active;
	//运行状态进程数
	unsigned int rt_nr_running;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
	struct {
		int curr; /* highest queued rt task prio */
#ifdef CONFIG_SMP
		int next; /* next highest */
#endif
	} highest_prio;
#endif
#ifdef CONFIG_SMP
	unsigned long rt_nr_migratory;
	unsigned long rt_nr_total;
	int overloaded;
	struct plist_head pushable_tasks;
#endif
	int rt_throttled;
	u64 rt_time;
	u64 rt_runtime;
	/* Nests inside the rq lock: */
	raw_spinlock_t rt_runtime_lock;

#ifdef CONFIG_RT_GROUP_SCHED
	unsigned long rt_nr_boosted;
	//所属rq
	struct rq *rq;
	struct list_head leaf_rt_rq_list;
	//所属task group
	struct task_group *tg;
#endif
};

/*
 * A scheduling domain describes one level of the CPU topology used for
 * load balancing. Per-CPU domains form a hierarchy via parent/child, and
 * each domain is subdivided into sched_groups that balancing compares.
 */
struct sched_domain {
	/* These fields must be setup */
	struct sched_domain *parent;	/* top domain must be null terminated */
	struct sched_domain *child;	/* bottom domain must be null terminated */
	struct sched_group *groups;	/* the balancing groups of the domain */
	unsigned long min_interval;	/* Minimum balance interval ms */
	unsigned long max_interval;	/* Maximum balance interval ms */
	unsigned int busy_factor;	/* less balancing by factor if busy */
	unsigned int imbalance_pct;	/* No balance until over watermark; threshold used to judge the domain imbalanced */
	unsigned int cache_nice_tries;	/* Leave cache hot tasks for # tries */
	unsigned int busy_idx; /* cpu_load index used for busy balancing */
	unsigned int idle_idx; /* cpu_load index used for idle balancing */
	unsigned int newidle_idx; /* cpu_load index used when a CPU about to go idle balances to avoid idling */
	unsigned int wake_idx;
	unsigned int forkexec_idx;
	unsigned int smt_gain;

	int nohz_idle;			/* NOHZ IDLE status */
	int flags;			/* See SD_* */
	int level; /* depth of this domain in the topology hierarchy */

	/* Runtime fields. */
	unsigned long last_balance;	/* init to jiffies. units in jiffies */ /* time of last balance run */
	unsigned int balance_interval;	/* initialise to 1. units in ms. */ /* interval between balance runs */
	unsigned int nr_balance_failed; /* initialise to 0 */ /* consecutive failed balance attempts */

	u64 last_update;

#ifdef CONFIG_SCHEDSTATS
	/* load_balance() stats */
	unsigned int lb_count[CPU_MAX_IDLE_TYPES];
	unsigned int lb_failed[CPU_MAX_IDLE_TYPES];
	unsigned int lb_balanced[CPU_MAX_IDLE_TYPES];
	unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES];
	unsigned int lb_gained[CPU_MAX_IDLE_TYPES];
	unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES];
	unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES];
	unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES];

	/* Active load balancing */
	unsigned int alb_count;
	unsigned int alb_failed;
	unsigned int alb_pushed;

	/* SD_BALANCE_EXEC stats */
	unsigned int sbe_count;
	unsigned int sbe_balanced;
	unsigned int sbe_pushed;

	/* SD_BALANCE_FORK stats */
	unsigned int sbf_count;
	unsigned int sbf_balanced;
	unsigned int sbf_pushed;

	/* try_to_wake_up() stats */
	unsigned int ttwu_wake_remote;
	unsigned int ttwu_move_affine;
	unsigned int ttwu_move_balance;
#endif
#ifdef CONFIG_SCHED_DEBUG
	char *name;
#endif
	union {
		void *private;		/* used during construction */
		struct rcu_head rcu;	/* used during destruction */
	};

	unsigned int span_weight;
	/*
	 * Span of all CPUs in this domain.
	 *
	 * NOTE: this field is variable length. (Allocated dynamically
	 * by attaching extra space to the end of the structure,
	 * depending on how many CPUs the kernel has booted up with)
	 */
	unsigned long span[0]; /* bitmap of all CPUs in this domain */
};

/*
 * One balancing unit inside a sched_domain. A domain's groups are linked
 * in a circular list: iteration is complete when sg->next wraps back to
 * the group it started from.
 */
struct sched_group {
	struct sched_group *next;	/* Must be a circular list */
	atomic_t ref;

	/* presumably the number of CPUs this group spans — confirm in setup code */
	unsigned int group_weight;
	struct sched_group_power *sgp;

	/*
	 * The CPUs this group covers.
	 *
	 * NOTE: this field is variable length. (Allocated dynamically
	 * by attaching extra space to the end of the structure,
	 * depending on how many CPUs the kernel has booted up with)
	 */
	unsigned long cpumask[0]; /* bitmap of the CPUs in this group */
};


/*该结构包含了完整的信息,用于实现对单个任务或任务组的调度。它可用于实现组调度。调度实体可能与进程没有关联。
调度实体可以表示一个进程,也可以表示一个进程组task group
*/
/*
 * A schedulable unit for CFS. It may represent a single task or, with
 * group scheduling, a whole task group; a group entity owns its own
 * cfs_rq (my_q) holding the group's members. An entity need not be
 * associated with a process at all.
 */
struct sched_entity {
	/* load weight; for tasks derived from priority via prio_to_weight[] */
	struct load_weight	load;		/* for load-balancing */
	/* node in the owning cfs_rq's red-black tree (the ready queue) */
	struct rb_node		run_node;
	/* list linkage within the owning group */
	struct list_head	group_node;
	/* non-zero while the entity is queued on a runqueue */
	unsigned int		on_rq;

	/* timestamp when the entity last started executing */
	u64			exec_start;
	u64			sum_exec_runtime;	/* total CPU time consumed */
	u64			vruntime;	/* virtual runtime, advanced on every tick */
	u64			prev_sum_exec_runtime;

	u64			nr_migrations;	/* times this entity migrated to another CPU */

#ifdef CONFIG_SCHEDSTATS
	/* per-entity scheduling statistics */
	struct sched_statistics statistics;
#endif

#ifdef CONFIG_FAIR_GROUP_SCHED
	/*
	 * Parent entity: for a task, the entity of its enclosing group;
	 * for a group, the entity of the parent group.
	 */
	struct sched_entity	*parent;
	/* rq on which this entity is (to be) queued */
	struct cfs_rq		*cfs_rq;
	/*
	 * rq "owned" by this entity/group: NULL for a plain task; for a
	 * group entity, the cfs_rq that holds the group's own children.
	 */
	struct cfs_rq		*my_q;
#endif

/*
 * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
 * removed when useful for applications beyond shares distribution (e.g.
 * load-balance).
 */
#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
	/* Per-entity load-tracking */
	struct sched_avg	avg;
#endif
};
/*
my_q:如果此调度实体代表的是一个进程组,那么此调度实体就包含
有一个自己的CFS运行队列,其CFS运行队列中存放的是此进程组中的进程,
这些进程就不会在其他CFS运行队列的红黑树中被包含(包括顶层红黑树也不会包含他们,他们只属于这个进程组的红黑树)。

*/
/* RT counterpart of sched_entity: a single RT task or an RT task group. */
struct sched_rt_entity {
	/* list linkage used while the entity is queued on an rt_rq */
	struct list_head run_list;
	unsigned long timeout;
	unsigned long watchdog_stamp;
	unsigned int time_slice;

	struct sched_rt_entity *back;
#ifdef CONFIG_RT_GROUP_SCHED
	/* parent group's entity, if any */
	struct sched_rt_entity	*parent;
	/* rq on which this entity is (to be) queued: */
	struct rt_rq		*rt_rq;
	/* rq "owned" by this entity/group: the group's own rt_rq */
	struct rt_rq		*my_q;
#endif
};


/*
linux是一个多用户系统,如果有两个进程分别属于两个用户,
而进程的优先级不同,会导致两个用户所占用的CPU时间不同,
这样显然是不公平的(如果优先级差距很大,低优先级进程所属用户使用
CPU的时间就很小),所以内核引入组调度
如果基于用户分组,即使进程优先级不同,这两个用户使用的CPU时间都为50%
在多核多CPU的情况下,同一进程组的进程有可能在不同CPU上同时运行,所以每个进程组都必须对每个CPU分配它的调度实体(struct sched_entity 和 struct sched_rt_entity)和运行队列(struct cfs_rq 和 struct rt_rq)。

*/
/* task group related information */
struct task_group {
	/* 用于进程找到其所属进程组结构 */
	struct cgroup_subsys_state css;

	bool notify_on_migrate;
#ifdef CONFIG_SCHED_HMP
	bool upmigrate_discouraged;
#endif
//参照alloc_fair_sched_group/alloc_rt_sched_group 
///* 该进程组在每个CPU上都有对应的一个调度实体,因为有可能此进程组同时在两个CPU上运行(它的A进程在CPU0上运行,B进程在CPU1上运行) */
#ifdef CONFIG_FAIR_GROUP_SCHED
	/* schedulable entities of this group on each cpu */
	struct sched_entity **se;
	/* runqueue "owned" by this group on each cpu */
	struct cfs_rq **cfs_rq;
	//保存tg优先级
	unsigned long shares;

	atomic_t load_weight;
	atomic64_t load_avg;
	atomic_t runnable_avg;
#endif

#ifdef CONFIG_RT_GROUP_SCHED
	//rt调度器进程组
	struct sched_rt_entity **rt_se;
	struct rt_rq **rt_rq;

	struct rt_bandwidth rt_bandwidth;
#endif

	struct rcu_head rcu;
	//task list of this task group 
	struct list_head list;
	//上层进程组
	struct task_group *parent;
	//进程组兄弟节点list
	struct list_head siblings;
	//进程组儿子节点list
	struct list_head children;

#ifdef CONFIG_SCHED_AUTOGROUP
	struct autogroup *autogroup;
#endif

	struct cfs_bandwidth cfs_bandwidth;
};


/*
 * Excerpt of the process descriptor, showing only the scheduling-related
 * fields; the remaining members are elided at the end of the struct.
 */
struct task_struct {
	/*
	 * Task state; one of TASK_RUNNING, TASK_INTERRUPTIBLE,
	 * TASK_UNINTERRUPTIBLE, TASK_STOPPED, TASK_TRACED, EXIT_ZOMBIE
	 * or EXIT_DEAD.
	 */
	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
	void *stack;
	atomic_t usage;
	unsigned int flags;	/* per process flags, defined below */
	unsigned int ptrace;

#ifdef CONFIG_SMP
	struct llist_node wake_entry;
	int on_cpu;		/* is this task currently executing on a CPU? */
#endif
	/* is this task on a runqueue? */
	int on_rq;
	/*
	 * prio: dynamic priority (100..139 for normal tasks);
	 * static_prio: static priority, static_prio = 100 + nice + 20;
	 * normal_prio: priority without temporary boosts.
	 */
	int prio, static_prio, normal_prio;
	unsigned int rt_priority;	/* real-time priority, 1..99 */
	const struct sched_class *sched_class;	/* scheduling class hooks */
	struct sched_entity se;		/* CFS scheduling entity */
	struct sched_rt_entity rt;	/* RT scheduling entity */
#ifdef CONFIG_SCHED_HMP
	struct ravg ravg;
	/*
	 * 'init_load_pct' represents the initial task load assigned to children
	 * of this task
	 */
	u32 init_load_pct;
	u64 run_start;
	u64 last_sleep_ts;	/* timestamp of the last sleep */
	struct related_thread_group *grp;
	struct list_head grp_list;
#endif
#ifdef CONFIG_CGROUP_SCHED
	/* the task group this task belongs to */
	struct task_group *sched_task_group;
#endif
	struct sched_dl_entity dl;

#ifdef CONFIG_PREEMPT_NOTIFIERS
	/* list of struct preempt_notifier: */
	struct hlist_head preempt_notifiers;
#endif

	/*
	 * fpu_counter contains the number of consecutive context switches
	 * that the FPU is used. If this is over a threshold, the lazy fpu
	 * saving becomes unlazy to save the trap. This is an unsigned char
	 * so that after 256 times the counter wraps and the behavior turns
	 * lazy again; this to deal with bursty apps that only use FPU for
	 * a short time
	 */
	unsigned char fpu_counter;
#ifdef CONFIG_BLK_DEV_IO_TRACE
	unsigned int btrace_seq;
#endif

	unsigned int policy;
	int nr_cpus_allowed;
	cpumask_t cpus_allowed;

#ifdef CONFIG_PREEMPT_RCU
	int rcu_read_lock_nesting;
	char rcu_read_unlock_special;
	struct list_head rcu_node_entry;
#endif /* #ifdef CONFIG_PREEMPT_RCU */
#ifdef CONFIG_TREE_PREEMPT_RCU
	struct rcu_node *rcu_blocked_node;
#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
#ifdef CONFIG_RCU_BOOST
	struct rt_mutex *rcu_boost_mutex;
#endif /* #ifdef CONFIG_RCU_BOOST */
	/* ... many further fields elided in this excerpt ... */
};



                
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值