Kernel Scheduler学习之七：CFS调度器之虚拟时间

最新推荐文章于 2023-04-26 01:11:57 发布

sucjhwaxp

最新推荐文章于 2023-04-26 01:11:57 发布

阅读量400

点赞数

分类专栏： Kernel

本文链接：https://blog.csdn.net/sucjhwaxp/article/details/108298666

版权

Kernel 专栏收录该内容

11 篇文章 13 订阅

订阅专栏

Overview
cfs调度器被称作是完全公平调度器，那么如何体现公平的呢？是否有绝对的公平呢？本博客主要回答如下的问题：
vruntime的原理是什么？
vruntime如何更新的？

Vtime的原理
Linux抽象出来一个调度实体（sched_entity），在这个实体当中保存了一个叫vruntime的变量，如下所示：

/*
 * Scheduling entity: the per-entity bookkeeping that the CFS code shown in
 * this article operates on (see update_curr() and check_preempt_tick()).
 */
struct sched_entity {
	/* For load-balancing: */
	struct load_weight		load;		/* weight / inverse-weight pair, see struct load_weight */
	unsigned long			runnable_weight;
	struct rb_node			run_node;
	struct list_head		group_node;
	unsigned int			on_rq;

	u64				exec_start;		/* clock value at the last update_curr(); restarted there */
	u64				sum_exec_runtime;	/* accumulated real execution time */
	u64				vruntime;		/* accumulated weight-scaled (virtual) runtime */
	u64				prev_sum_exec_runtime;	/* subtracted from sum_exec_runtime in check_preempt_tick() */

	u64				nr_migrations;

	struct sched_statistics		statistics;

#ifdef CONFIG_FAIR_GROUP_SCHED
	int				depth;
	struct sched_entity		*parent;
	/* rq on which this entity is (to be) queued: */
	struct cfs_rq			*cfs_rq;
	/* rq "owned" by this entity/group: */
	struct cfs_rq			*my_q;
#endif

#ifdef CONFIG_SMP
	/*
	 * Per entity load average tracking.
	 *
	 * Put into separate cache line so it does not
	 * collide with read-mostly values above.
	 */
	struct sched_avg		avg;
#endif
};

vruntime更新的时间点与位置：
Kernel会在涉及到task时间变化或者变更cpu的时候，更新vruntime，如下所示：

update_curr函数如下所示：

/*
 * Update the current task's runtime statistics.
 *
 * Computes the time elapsed since curr->exec_start, adds it to the real
 * runtime (sum_exec_runtime) and, after weight scaling, to vruntime.
 * Returns early when the runqueue has no current entity or when no
 * positive amount of time has elapsed.
 */
static void update_curr(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;
	u64 now = rq_clock_task(rq_of(cfs_rq)); /* task clock of the owning rq; the rq clocks themselves are left for a later article */
	u64 delta_exec;

	if (unlikely(!curr))
		return;

	delta_exec = now - curr->exec_start;
	/* Signed compare so that a clock that appears to go backwards is ignored. */
	if (unlikely((s64)delta_exec <= 0))
		return;

	curr->exec_start = now; /* restart the measurement window */

	schedstat_set(curr->statistics.exec_max,
		      max(delta_exec, curr->statistics.exec_max));

	curr->sum_exec_runtime += delta_exec; /* accumulate total real runtime */
	schedstat_add(cfs_rq->exec_clock, delta_exec);

	curr->vruntime += calc_delta_fair(delta_exec, curr); /* accumulate weight-scaled runtime */
	update_min_vruntime(cfs_rq); /* refresh the cfs_rq's minimum vruntime */

	/* Per-task tracing and accounting only applies when the entity is a task. */
	if (entity_is_task(curr)) {
		struct task_struct *curtask = task_of(curr);

		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
		cgroup_account_cputime(curtask, delta_exec);
		account_group_exec_runtime(curtask, delta_exec);
	}

	account_cfs_rq_runtime(cfs_rq, delta_exec);
}

vruntime的计算：

/*
 * Convert a real execution delta into a virtual-runtime delta:
 *
 *   delta * NICE_0_LOAD / se->load.weight
 *
 * An entity whose weight equals NICE_0_LOAD is charged the real delta
 * unchanged; any other weight goes through __calc_delta().
 */
static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
{
	/* Non-default weight: rescale the delta against NICE_0_LOAD. */
	if (unlikely(se->load.weight != NICE_0_LOAD))
		return __calc_delta(delta, NICE_0_LOAD, &se->load);

	/* Default weight: virtual time advances at the real-time rate. */
	return delta;
}

所以，根据上面的公式，vruntime的计算和load.weight有很大关系，那么load.weight表示什么呢？
如之前sched_entity中load的类型为： struct load_weight load;

/*
 * Weight of a scheduling entity together with a cached inverse, so that
 * __calc_delta() can multiply by inv_weight instead of dividing by weight.
 */
struct load_weight {
	unsigned long			weight;		/* set via scale_load() in set_load_weight() */
	u32				inv_weight;	/* taken from sched_prio_to_wmult[] (2^32/x values) */
};

查询代码，主要是通过如下的函数设置load.weight：

/*
 * Initialise p->se.load (weight and inverse weight) from the task's
 * static priority.
 *
 * @p:           task whose load weight is being set
 * @update_load: when true and @p is in the fair class, hand the change to
 *               reweight_task() instead of writing the fields directly
 */
static void set_load_weight(struct task_struct *p, bool update_load)
{
	struct load_weight *lw = &p->se.load;
	/*
	 * Index into the 40-entry weight tables. RT priorities end at 99 and
	 * normal priorities start at 100; MAX_RT_PRIO is 100 here.
	 */
	int idx = p->static_prio - MAX_RT_PRIO;

	/* SCHED_IDLE tasks get minimal weight. */
	if (task_has_idle_policy(p)) {
		lw->weight = scale_load(WEIGHT_IDLEPRIO);
		lw->inv_weight = WMULT_IDLEPRIO;
		p->se.runnable_weight = lw->weight;
		return;
	}

	/*
	 * SCHED_OTHER tasks have to update their load when changing their
	 * weight.
	 */
	if (update_load && p->sched_class == &fair_sched_class) {
		reweight_task(p, idx);
		return;
	}

	/* Otherwise look the values up in the precomputed tables. */
	lw->weight = scale_load(sched_prio_to_weight[idx]);
	lw->inv_weight = sched_prio_to_wmult[idx];
	p->se.runnable_weight = lw->weight;
}

/*
 * Excerpt: only the CONFIG_64BIT branch is shown here; the matching
 * #else/#endif is not part of this snippet.
 *
 * On 64-bit, scale_load() shifts a weight left by SCHED_FIXEDPOINT_SHIFT,
 * and NICE_0_LOAD_SHIFT is twice SCHED_FIXEDPOINT_SHIFT.
 */
#ifdef CONFIG_64BIT
# define NICE_0_LOAD_SHIFT	(SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)
# define scale_load(w)		((w) << SCHED_FIXEDPOINT_SHIFT)

所以，weight的值其实是根据prio在sched_prio_to_weight查表获得一个值并左移（32位系统不用左移）。这个表的内容如下所示：

/*
 * Nice levels are multiplicative, with a gentle 10% change for every
 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
 * nice 1, it will get ~10% less CPU time than another CPU-bound task
 * that remained on nice 0.
 *
 * The "10% effect" is relative and cumulative: from _any_ nice level,
 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
 * If a task goes up by ~10% and another task goes down by ~10% then
 * the relative distance between them is ~25%.)
 *
 * The table has 40 entries, one per nice level from -20 to 19, so the
 * entry for a given nice value sits at index (nice + 20); nice 0 maps
 * to 1024.
 */
const int sched_prio_to_weight[40] = {
 /* -20 */     88761,     71755,     56483,     46273,     36291,
 /* -15 */     29154,     23254,     18705,     14949,     11916,
 /* -10 */      9548,      7620,      6100,      4904,      3906,
 /*  -5 */      3121,      2501,      1991,      1586,      1277,
 /*   0 */      1024,       820,       655,       526,       423,
 /*   5 */       335,       272,       215,       172,       137,
 /*  10 */       110,        87,        70,        56,        45,
 /*  15 */        36,        29,        23,        18,        15,
};

根据英文注释：相邻两个nice级别的权重相差约1.25倍，即表中每个值大约是前一个值的80%，其中nice为0（即prio的值为120）时的值为1024。
nice的取值区间为[-20,19]，共40个级别，sched_prio_to_weight中的40个元素与之一一对应。这个数组的值近似满足等比数列。
可以用下面的公式表示：
$$weight(nice) \approx \frac{1024}{1.25^{\,nice}}$$

其中： load->inv_weight = sched_prio_to_wmult[prio];中的sched_prio_to_wmult如下所示：

/*
 * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated.
 *
 * In cases where the weight does not change often, we can use the
 * precalculated inverse to speed up arithmetics by turning divisions
 * into multiplications:
 *
 * Entries are laid out exactly like sched_prio_to_weight[]: 40 values for
 * nice -20..19, e.g. the nice-0 entry is 4194304 == 2^32 / 1024.
 */
const u32 sched_prio_to_wmult[40] = {
 /* -20 */     48388,     59856,     76040,     92818,    118348,
 /* -15 */    147320,    184698,    229616,    287308,    360437,
 /* -10 */    449829,    563644,    704093,    875809,   1099582,
 /*  -5 */   1376151,   1717300,   2157191,   2708050,   3363326,
 /*   0 */   4194304,   5237765,   6557202,   8165337,  10153587,
 /*   5 */  12820798,  15790321,  19976592,  24970740,  31350126,
 /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
 /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};

根据注释，每个元素的值公式如下：
$$inv\_weight = \frac{2^{32}}{weight}$$

接下来继续计算delta_time:

/*
 * delta_exec * weight / lw.weight
 *   OR
 * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
 *
 * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case
 * we're guaranteed shift stays positive because inv_weight is guaranteed to
 * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
 *
 * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
 * weight/lw.weight <= 1, and therefore our shift will also be positive.
 *
 * Implementation note: the multiplier "fact" is kept within 32 bits by
 * halving it and decrementing the final shift by one for each halving, so
 * the last step can be a single 64x32 multiply-and-shift.
 */
static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
{
	/*
	 * Author's note: on 64-bit the weight was shifted left by 10 bits when
	 * it was set (see scale_load()); it is shifted back here. Per the
	 * upstream comments the extra bits exist to increase resolution.
	 * NOTE(review): scale_load_down() itself is not shown in this excerpt.
	 */
	u64 fact = scale_load_down(weight);
	int shift = WMULT_SHIFT;

	__update_inv_weight(lw);

	/* Bring fact down to 32 bits before the 32x32 multiply below. */
	if (unlikely(fact >> 32)) {
		while (fact >> 32) {
			fact >>= 1;
			shift--;
		}
	}

	/* hint to use a 32x32->64 mul */
	fact = (u64)(u32)fact * lw->inv_weight;

	/* The product may exceed 32 bits again; renormalise the same way. */
	while (fact >> 32) {
		fact >>= 1;
		shift--;
	}

	return mul_u64_u32_shr(delta_exec, fact, shift);
}

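上面的函数对应的公式为：
$$\Delta vruntime = \Delta exec \times \frac{NICE\_0\_LOAD}{weight} \approx \left(\Delta exec \times NICE\_0\_LOAD \times inv\_weight\right) \gg WMULT\_SHIFT$$
其中 $inv\_weight = 2^{32}/weight$，weight即se->load.weight。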

所以，当优先级为120（nice为0）的时候，delta_exec不用打折。而对于其他优先级都会打折或者放大。
如果优先级越高，那么其nice值越小，对应的load.weight的值越大，则计算出的虚拟时间比实际执行时间要少。
如果优先级越低，那么其nice值越大，对应的load.weight的值越小，则计算出的虚拟时间比实际执行时间要多。
如果按照vtime进行公平调度的话，那么优先级越高，则实际分配的时间越多。
如下图所示：
config

时间片计算

/*
 * Tick-time preemption check for the running entity.
 *
 * Requests a reschedule when @curr has run for longer than its slice
 * (as computed by sched_slice()), or when its vruntime is ahead of the
 * entity returned by __pick_first_entity() by more than that slice.
 */
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
	unsigned long slice, ran;
	struct sched_entity *first;
	s64 vdiff;

	slice = sched_slice(cfs_rq, curr);
	ran = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;

	if (ran > slice) {
		resched_curr(rq_of(cfs_rq));
		/*
		 * The slice is used up: drop buddy hints so that curr is not
		 * picked again straight away because of them.
		 */
		clear_buddies(cfs_rq, curr);
		return;
	}

	/*
	 * Below the minimum granularity nothing further is checked. Beyond
	 * it, a task that narrowly missed wakeup preemption need not wait
	 * for a full slice; this also mitigates buddy induced latencies
	 * under load.
	 */
	if (ran < sysctl_sched_min_granularity)
		return;

	first = __pick_first_entity(cfs_rq);
	vdiff = curr->vruntime - first->vruntime;

	/*
	 * Only a non-negative lead counts; the sign test comes first so that a
	 * negative vdiff never reaches the comparison against the unsigned
	 * slice.
	 */
	if (vdiff >= 0 && vdiff > slice)
		resched_curr(rq_of(cfs_rq));
}

sucjhwaxp

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
Kernel Scheduler学习之七：CFS调度器之虚拟时间

Overview cfs调度器被称作是完全公平调度器，那么如何体现公平的呢？是否有绝对的公平呢？本博客主要回答如下的问题： vruntime的原理是什么？ vruntime如何更新的？ Vtime的原理 Linux抽象出来一个调度实体，在这个实体当中保存了了一个叫vruntime的变量，如下所示： struct sched_entity { /* For load-balancing: */ struct load_weight load; unsigned long runna.
复制链接

扫一扫