Linux的进程管理之相关数据结构—3

本文链接：https://blog.csdn.net/weixin_43708235/article/details/125711381

文章目录

1.调度策略
2.调度的数据结构
3. cpu抽象rq结构体
4.调度队列cfs_rq rt_rq
5.调度实体sched_entity sched_rt_entity
6.调度类sched_class

先回顾下之前的定义，进程调度是指操作系统按照某种策略或者规则选择进程占用CPU进行运行的过程；
即：什么时候调度（调度时机）、怎么调度（上下文切换）、按照什么方式调度（调度策略）

1.调度策略

linux将进程主要划分为实时进程与普通进程，使用5种调度器；
一共五种调度器：STOP、RT、 DEADLINE、 CFS、 IDLE调度器；
调度策略：SCHED_FIFO
SCHED_RR
SCHED_IDLE
SCHED_NORMAL
SCHED_BATCH
其中STOP、IDLE调度器仅供内核使用，用户态常用的为CFS、RT调度器，其中RT常用于嵌入式系统。
Deadline调度器是linux3.14版本引入的，本文分析所使用的是linux3.0.20版本，因此还未引入，不过道理都是类似的，我们重点介绍CFS 与RT调度器

2.调度的数据结构

抽象体	结构体
CPU	struct rq
调度队列	rq->cfs_rq rq->rt_rq
进程	struct task_struct
调度实体	task_struct->sched_entity task_struct->sched_rt_entity
调度类	task_struct->sched_class

内核通过rq task_struct sched_class等结构体进行抽象调度器，每个cpu抽象了一个rq结构体，进程提取了一个task_struct，sched_class代表一种调度类，以下为几者的关系：

在这里插入图片描述

【1】图示左上，对cpu的核心进行的抽象，提取为struct rq 结构体，cpu可以认为是一个具有一定算力的贪吃蛇，对cpu来说，可以反复不断运行进程，这两个进程队列cfs_rq、rt_rq已经指向了进程实体，cpu只需要按照优先级反复执行即可。
【2】图示上中，数据结构，可以看见内核选取了红黑树作为CFS调度器的数据结构，选择了哈希表作为rt调度器的数据结构。
【3】图示右上，进程抽象，其中包含着普通进程调度实体与实时进程调度实体，通过调度实体找到了进程task_struct就完成了进程的调度。
【4】图示下，调度器，对调度器的抽象，调度器是承载在进程task_struct上，这样就可以通过配置调度策略来选择对应的调度器。
可以看出，内核背后的抽象思维非常值得我们学习，可以在做实际项目中体会这种思维；

3. cpu抽象rq结构体

此结构体主要是对CPU进行抽象，主要为cpu上的运行队列，主要以单核调度进行说明，多核与组调度，在此并不作为重点进行说明；

/*
 * This is the main, per-CPU runqueue data structure.
 *
 * Locking rule: those places that want to lock multiple runqueues
 * (such as the load balancing or the thread migration code), lock
 * acquire operations must be ordered by ascending &runqueue.
 */
struct rq {
	/* runqueue lock: */
	raw_spinlock_t lock;

	/*
	 * nr_running and cpu_load should be in the same cacheline because
	 * remote CPUs use both these fields when doing load calculation.
	 */
	  运行队列上调度实体的个数，是所有子调度器类中就绪实体之和
	unsigned long nr_running;
	#define CPU_LOAD_IDX_MAX 5
	unsigned long cpu_load[CPU_LOAD_IDX_MAX];
	unsigned long last_load_update_tick;
#ifdef CONFIG_NO_HZ
	u64 nohz_stamp;
	unsigned char nohz_balance_kick;
#endif
	int skip_clock_update;

	/* capture load from *all* tasks on this cpu: */
	// 表示 rq 的权重，对于每个调度实体，都有一个权重值来表示进程的优先级，这里的 load 是整个队列上的总 load 值，反映了当前 runqueue 上进程的总体权重信息.
    struct load_weight load;
	struct load_weight load;
	// 负载的统计次数
	unsigned long nr_load_updates;
	// 该运行队列上进程的切换次数
	u64 nr_switches;

    // cfs 调度器类的就绪队列
    struct cfs_rq cfs;
    // rt 调度器类的就绪队列
	struct rt_rq rt;

#ifdef CONFIG_FAIR_GROUP_SCHED
	/* list of leaf cfs_rq on this cpu: */
	struct list_head leaf_cfs_rq_list;
#endif
#ifdef CONFIG_RT_GROUP_SCHED
	struct list_head leaf_rt_rq_list;
#endif

	/*
	 * This is part of a global counter where only the total sum
	 * over all CPUs matters. A task can increase this counter on
	 * one CPU and if it got migrated afterwards it may decrease
	 * it on another CPU. Always updated under the runqueue lock:
	 */
	unsigned long nr_uninterruptible;
// 保存的进程指针，分别对应当前执行进程 curr，idle 进程(空闲时调用)，stop进程(用于停止 CPU)
	struct task_struct *curr, *idle, *stop;
	unsigned long next_balance;
	struct mm_struct *prev_mm;

	u64 clock;
	u64 clock_task;

	atomic_t nr_iowait;

#ifdef CONFIG_SMP
	struct root_domain *rd;
	struct sched_domain *sd;

	unsigned long cpu_power;

	unsigned char idle_at_tick;
	/* For active balancing */
	int post_schedule;
	int active_balance;
	int push_cpu;
	struct cpu_stop_work active_balance_work;
	/* cpu of this runqueue: */
	int cpu;
	int online;

	unsigned long avg_load_per_task;

	u64 rt_avg;
	u64 age_stamp;
	u64 idle_stamp;
	u64 avg_idle;
#endif

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
	u64 prev_irq_time;
#endif

	/* calc_load related fields */
	unsigned long calc_load_update;
	long calc_load_active;

#ifdef CONFIG_SCHED_HRTICK
#ifdef CONFIG_SMP
	int hrtick_csd_pending;
	struct call_single_data hrtick_csd;
#endif
	struct hrtimer hrtick_timer;
#endif

#ifdef CONFIG_SCHEDSTATS
	/* latency stats */
	struct sched_info rq_sched_info;
	unsigned long long rq_cpu_time;
	/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */

	/* sys_sched_yield() stats */
	unsigned int yld_count;

	/* schedule() stats */
	unsigned int sched_switch;
	unsigned int sched_count;
	unsigned int sched_goidle;

	/* try_to_wake_up() stats */
	unsigned int ttwu_count;
	unsigned int ttwu_local;
#endif

#ifdef CONFIG_SMP
	struct task_struct *wake_list;
#endif
};

4.调度队列cfs_rq rt_rq

每个CPU上管理这两个调度队列，实时调度队列，CFS调度队列；其目的就是用来管理调度实体的，将调度实体按照规则进行组织；
cfs_rq即普通进程运行队列，管理着普通任务，cfs使用红黑树进行管理，运行队列中保存了红黑树的根节点（tasks_timeline）以及最左边的节点（rb_leftmost），后者即下一个要调度的节点；
rt_rq即实时进程运行队列，管理着实时任务，rt使用优先级数组（struct rt_prio_array，可类比为以优先级为键的哈希表）进行管理，运行队列同样指向其中的节点，进行调度节点管理。

/* CFS-related fields in a runqueue */
struct cfs_rq {
	struct load_weight load;
	unsigned long nr_running;

	u64 exec_clock;
	u64 min_vruntime;
#if !defined(CONFIG_64BIT)
	u64 min_vruntime_copy;
#endif

	/*
	 * Red-black tree that organizes the entities of this queue, together
	 * with a pointer to its leftmost node.
	 */
	struct rb_root tasks_timeline;
	struct rb_node *rb_leftmost;

	struct list_head tasks;
	struct list_head *balance_iterator;

	/*
	 * 'curr' is the entity currently running on this cfs_rq, or NULL
	 * when no entity of this queue is running.
	 */
	struct sched_entity *curr;
	struct sched_entity *next;
	struct sched_entity *last;
	struct sched_entity *skip;

#if defined(CONFIG_SCHED_DEBUG)
	unsigned int nr_spread_over;
#endif

#if defined(CONFIG_FAIR_GROUP_SCHED)
	/* cpu runqueue to which this cfs_rq is attached */
	struct rq *rq;

	/*
	 * A leaf cfs_rq is one that holds tasks (the lowest schedulable
	 * entity in a hierarchy); non-leaf queues hold other, higher
	 * schedulable entities (like users, containers etc.).
	 *
	 * leaf_cfs_rq_list links the leaf cfs_rq's of a cpu together and is
	 * used during load balance.
	 */
	int on_list;
	struct list_head leaf_cfs_rq_list;
	/* group that "owns" this runqueue */
	struct task_group *tg;

#if defined(CONFIG_SMP)
	/* the part of load.weight contributed by tasks */
	unsigned long task_weight;

	/*
	 * h_load = weight * f(tg), where f(tg) is the recursive weight
	 * fraction assigned to this group.
	 */
	unsigned long h_load;

	/*
	 * Per-cpu shares distribution for group scheduling:
	 *
	 *   load_stamp           - last time the load average was updated
	 *   load_last            - last time the load average was updated
	 *                          and load was seen
	 *   load_unacc_exec_time - currently unaccounted execution time
	 */
	u64 load_avg;
	u64 load_period;
	u64 load_stamp;
	u64 load_last;
	u64 load_unacc_exec_time;

	unsigned long load_contribution;
#endif
#endif
};

/* Real-Time classes' related field in a runqueue: */
struct rt_rq {
	/* Container for the queued RT entities; type is defined elsewhere. */
	struct rt_prio_array active;
	unsigned long rt_nr_running;
#if defined(CONFIG_SMP) || defined(CONFIG_RT_GROUP_SCHED)
	struct {
		/* highest queued rt task prio */
		int curr;
#if defined(CONFIG_SMP)
		/* next highest */
		int next;
#endif
	} highest_prio;
#endif
#if defined(CONFIG_SMP)
	unsigned long rt_nr_migratory;
	unsigned long rt_nr_total;
	int overloaded;
	struct plist_head pushable_tasks;
#endif
	int rt_throttled;
	u64 rt_time;
	u64 rt_runtime;
	/* This lock nests inside the rq lock. */
	raw_spinlock_t rt_runtime_lock;

#if defined(CONFIG_RT_GROUP_SCHED)
	unsigned long rt_nr_boosted;

	struct rq *rq;
	struct list_head leaf_rt_rq_list;
	struct task_group *tg;
#endif
};

5.调度实体sched_entity sched_rt_entity

调度实体，每一种调度器的调度实体是不一样的，是每一种调度器关键的调度载体。
cfs调度器，使用sched_entity调度实体，其中包含虚拟的运行时间，总执行时间等信息。
rt调度器，使用sched_rt_entity调度实体，其中包括时间片等信息。

struct sched_entity {
	/* for load-balancing */
	struct load_weight load;
	/* Node used to link this entity into a red-black tree. */
	struct rb_node run_node;
	struct list_head group_node;
	unsigned int on_rq;

	/*
	 * Run-time accounting; per the accompanying text this includes the
	 * virtual runtime and the total execution time of the entity.
	 */
	u64 exec_start;
	u64 sum_exec_runtime;
	u64 vruntime;
	u64 prev_sum_exec_runtime;

	u64 nr_migrations;

#if defined(CONFIG_SCHEDSTATS)
	struct sched_statistics statistics;
#endif

#if defined(CONFIG_FAIR_GROUP_SCHED)
	struct sched_entity *parent;
	/* rq on which this entity is (to be) queued: */
	struct cfs_rq *cfs_rq;
	/* rq "owned" by this entity/group: */
	struct cfs_rq *my_q;
#endif
};

struct sched_rt_entity {
	/* List linkage of this entity. */
	struct list_head run_list;
	unsigned long timeout;
	/* Time slice of the entity (units not visible here). */
	unsigned int time_slice;
	int nr_cpus_allowed;

	struct sched_rt_entity *back;
#if defined(CONFIG_RT_GROUP_SCHED)
	struct sched_rt_entity *parent;
	/* rq on which this entity is (to be) queued: */
	struct rt_rq *rt_rq;
	/* rq "owned" by this entity/group: */
	struct rt_rq *my_q;
#endif
};

6.调度类sched_class

每个 CPU 拥有各自的 runqueue，而 runqueue 中维护了各个调度器类的相关信息：包括 cfs_rq，rt_rq.
每个不同的调度器类按照优先级排列依次为： stop_sched_class->rt_sched_class->fair_sched_class->idle_sched_class，
当高优先级调度器中存在就绪任务时，就不会轮到低优先级调度器中的任务执行；
内核对实时进程设置了运行占比为0.95，即当实时进程一直占用 CPU 时，会强行给非实时任务留出 5% 的执行时间，当然也是可以配置的，使用sysctl指令进行配置（对应参数为 kernel.sched_rt_runtime_us 与 kernel.sched_rt_period_us）；

/*
 * Operations table of one scheduler class. Every hook is a function
 * pointer; 'next' links to another sched_class so the classes form a chain.
 */
struct sched_class {
	const struct sched_class *next;

	/* Add task p to the scheduling queue. */
	void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
	/* Remove task p from the scheduling queue. */
	void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
	void (*yield_task) (struct rq *rq);
	bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt);

	void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);

	/* Pick the next process or thread that is about to be scheduled. */
	struct task_struct * (*pick_next_task) (struct rq *rq);
	void (*put_prev_task) (struct rq *rq, struct task_struct *p);

#ifdef CONFIG_SMP
	int  (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);

	void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
	void (*post_schedule) (struct rq *this_rq);
	void (*task_waking) (struct task_struct *task);
	void (*task_woken) (struct rq *this_rq, struct task_struct *task);

	void (*set_cpus_allowed)(struct task_struct *p,
				 const struct cpumask *newmask);

	void (*rq_online)(struct rq *rq);
	void (*rq_offline)(struct rq *rq);
#endif

	void (*set_curr_task) (struct rq *rq);
	void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
	void (*task_fork) (struct task_struct *p);

	void (*switched_from) (struct rq *this_rq, struct task_struct *task);
	void (*switched_to) (struct rq *this_rq, struct task_struct *task);
	/* Priority hook for a task; it is handed the task's old priority. */
	void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
			     int oldprio);

	unsigned int (*get_rr_interval) (struct rq *rq,
					 struct task_struct *task);

#ifdef CONFIG_FAIR_GROUP_SCHED
	void (*task_move_group) (struct task_struct *p, int on_rq);
#endif
};