前言
之前学习调度相关内容的时候,对于这个vruntime有一些概念上的理解,但是没有实际跟踪过code,所以总是感觉处于一种似懂非懂的状态,今天share PELT计算过程的时候顺势跟踪了下这部分,于是跟以前了解的概念都串起来了,这里做下简单记录:
在CFS中提出vruntime,为保证公平,为task分配相同的份额,即vruntime
- 每次在当前rq上找到vruntime最小的task执行(由红黑树管理);
- 执行后该vruntime值更新,需注意,此处更新并非实际运行时间,而是实际运行时间与weight做完处理后的值(等同于nice值为0的处理时间);
- 根据weight决策当前的task实际执行时间;
1. 初始化
vruntime的这么重要,可以决策下一个执行的Task,那么每个Task的vruntime是如何得到的呢?直接上code:
/*
 * called on fork with the child task as argument from the parent's context
 * - child not yet on the tasklist
 * - preemption disabled
 */
static void task_fork_fair(struct task_struct *p)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se, *curr;
struct rq *rq = this_rq();
raw_spin_lock(&rq->lock);
update_rq_clock(rq);
cfs_rq = task_cfs_rq(current);
curr = cfs_rq->curr;
if (curr) {
update_curr(cfs_rq);/* fork path: curr is the parent; bring its vruntime up to date first */
se->vruntime = curr->vruntime;/* child initially inherits the parent's vruntime */
}
place_entity(cfs_rq, se, 1);/* adjust the child's vruntime by weight (START_DEBIT charges it one vslice) */
if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
/* child-runs-first is set and parent vruntime < child vruntime: swap so the child goes first */
/*
 * Upon rescheduling, sched_class::put_prev_task() will place
 * 'current' within the tree based on its new key value.
 */
swap(curr->vruntime, se->vruntime);
resched_curr(rq);
}
se->vruntime -= cfs_rq->min_vruntime;/* store relative to min_vruntime; added back when enqueued on the rq's rbtree */
raw_spin_unlock(&rq->lock);
}
上述函数即一个Task被创建时,vruntime的初始化过程:
- 继承自父进程的vruntime
- 更新得到子进程的vruntime
- 获取cfs_rq 的min_vruntime
- 给min_vruntime + sched_vslice,这个sched_vslice就是根据当前task跟cfs上所有task的load比值获取到的实际执行时间,再换算成优先级为0的vruntime;
- 取上述计算值与父进程值中较大的给到新创建的entity;
- 如果有sysctl_sched_child_runs_first标记,且子进程vruntime > 父进程vruntime,则交换
- vruntime - cfs_rq min_vruntime插入cfs 的红黑树
也就是说,为保证公平,给所有task分配的vruntime都是按照优先级为0做的,在一定周期内所有task按照weight比例分配时间;
但是实际执行时间是根据load比值来计算的;
1.1 核心在第二步,具体来看:
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
u64 vruntime = cfs_rq->min_vruntime;//start from the runqueue's min_vruntime
if (initial && sched_feat(START_DEBIT))
vruntime += sched_vslice(cfs_rq, se);//key step for fork: charge the new task one vslice, computed from the weights of all tasks on this rq
//wakeup placement (not initial): grant sleepers a bounded vruntime credit so they can preempt
if (!initial) {
unsigned long thresh = sysctl_sched_latency;
if (sched_feat(GENTLE_FAIR_SLEEPERS)) thresh >>= 1;
vruntime -= thresh;
}
se->vruntime = max_vruntime(se->vruntime, vruntime);//never move the entity's vruntime backwards: keep the larger value
}
1.2 继续追踪sched_vslice:
这个函数实质就是根据权重计算task应该被分配到的时间
/*
 * Virtual time slice for @se: take the wall-clock slice computed by
 * sched_slice() and convert it into nice-0 virtual time with
 * calc_delta_fair().
 */
static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	u64 wall_slice = sched_slice(cfs_rq, se);

	return calc_delta_fair(wall_slice, se);
}
// slice = period * se->weight / total cfs_rq load, applied at each level of the group hierarchy
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);//period to distribute, scaled by runnable count (counting se if it is not queued yet)
for_each_sched_entity(se) {//walk up the group-scheduling hierarchy from this entity
struct load_weight *load;
struct load_weight lw;//scratch copy of the cfs_rq's total load
cfs_rq = cfs_rq_of(se);
load = &cfs_rq->load;//total weight currently enqueued on this cfs_rq
if (unlikely(!se->on_rq)) {
lw = cfs_rq->load;
update_load_add(&lw, se->load.weight);//se not enqueued yet: add its weight into the total before dividing
load = &lw;
}
slice = __calc_delta(slice, se->load.weight, load);//slice = slice * se->weight / total_load
}
return slice;
}
/*
 * Scale @delta (wall-clock ns) into virtual time for @se:
 * delta * NICE_0_LOAD / se->load.weight. The nice-0 case is the
 * identity and returns the input unchanged — which is why every
 * entity's vruntime advances as if it had nice 0.
 */
static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
{
	if (se->load.weight == NICE_0_LOAD)
		return delta;

	return __calc_delta(delta, NICE_0_LOAD, &se->load);
}
//computes delta_exec * weight / lw->weight without a division, using the
//precomputed inverse lw->inv_weight (~= 2^32 / lw->weight) so the divide
//becomes a multiply plus a right shift
static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
{
u64 fact = scale_load_down(weight);
int shift = WMULT_SHIFT;
__update_inv_weight(lw);
//fold fact down to 32 bits; every dropped bit is compensated in the final shift
if (unlikely(fact >> 32)) {
while (fact >> 32) {
fact >>= 1;
shift--;
}
}
/* hint to use a 32x32->64 mul */
fact = (u64)(u32)fact * lw->inv_weight;
//the product may again exceed 32 bits; renormalize the same way
while (fact >> 32) {
fact >>= 1;
shift--;
}
return mul_u64_u32_shr(delta_exec, fact, shift);
}
2. 更新
按照我们的理解,vruntime在每次执行结束后都会更新,累加上当前执行的时间,则可以获取到下次执行的时刻:
//Update the currently running entity's runtime statistics and advance its vruntime.
static void update_curr(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
u64 now = rq_clock_task(rq_of(cfs_rq));
u64 delta_exec;
if (unlikely(!curr)) return;
delta_exec = now - curr->exec_start;//wall-clock time run since the last update
if (unlikely((s64)delta_exec <= 0)) return;
curr->exec_start = now;
schedstat_set(curr->statistics.exec_max, max(delta_exec, curr->statistics.exec_max));
curr->sum_exec_runtime += delta_exec;
schedstat_add(cfs_rq->exec_clock, delta_exec);
curr->vruntime += calc_delta_fair(delta_exec, curr);//key line: convert actual runtime to nice-0 virtual time (as analyzed above) and accumulate it
update_min_vruntime(cfs_rq);
if (entity_is_task(curr)) {
struct task_struct *curtask = task_of(curr);
trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
cpuacct_charge(curtask, delta_exec);
account_group_exec_runtime(curtask, delta_exec);
}
account_cfs_rq_runtime(cfs_rq, delta_exec);
}
在update_curr时会更新vruntime
这里更新也很简单,使用实际执行时间 delta * nice_0 / weight,就是把实际执行时间换算成优先级为0的虚拟时间;
也就是说虚拟运行时间(vruntime)的增长速度与权重强相关;
3. 权重计算
进程的权重是由一个静态数组决定的,nice值从-20到19,对应的权重依次按约1.25倍的比例递减
/*
 * Nice levels are multiplicative, with a gentle 10% change for every
 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
 * nice 1, it will get ~10% less CPU time than another CPU-bound task
 * that remained on nice 0.
 *
 * The "10% effect" is relative and cumulative: from _any_ nice level,
 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
 * If a task goes up by ~10% and another task goes down by ~10% then
 * the relative distance between them is ~25%.)
 */
/* index = nice + 20, so nice -20 maps to entry 0 and nice 0 to 1024 (NICE_0_LOAD) */
const int sched_prio_to_weight[40] = {
/* -20 */ 88761, 71755, 56483, 46273, 36291,
/* -15 */ 29154, 23254, 18705, 14949, 11916,
/* -10 */ 9548, 7620, 6100, 4904, 3906,
/* -5 */ 3121, 2501, 1991, 1586, 1277,
/* 0 */ 1024, 820, 655, 526, 423,
/* 5 */ 335, 272, 215, 172, 137,
/* 10 */ 110, 87, 70, 56, 45,
/* 15 */ 36, 29, 23, 18, 15,
};
/*
 * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated.
 *
 * In cases where the weight does not change often, we can use the
 * precalculated inverse to speed up arithmetics by turning divisions
 * into multiplications:
 */
/* e.g. entry 0: 2^32 / 88761 ~= 48388; used as inv_weight in __calc_delta() */
const u32 sched_prio_to_wmult[40] = {
/* -20 */ 48388, 59856, 76040, 92818, 118348,
/* -15 */ 147320, 184698, 229616, 287308, 360437,
/* -10 */ 449829, 563644, 704093, 875809, 1099582,
/* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
/* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
/* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
/* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};
按照设计,nice值每降低1(即优先级提高一级),大约可以多获得10%的CPU时间;