前言
之前学习调度相关内容的时候,对于这个vruntime有一些概念上的理解,但是没有实际跟踪过code,所以总是感觉处于一种似懂非懂的状态,今天share PELT计算过程的时候顺势跟踪了下这部分,于是跟以前了解的概念都串起来了,这里做下简单记录:
在CFS中提出vruntime,为保证公平,为task分配相同的份额,即vruntime
- 每次在当前rq上找到vruntime最小的task执行(由红黑树管理);
- 执行后该vruntime值更新,需注意,此处更新并非实际运行时间,而是实际运行时间与weight做完处理后的值(等同于nice值为0的处理时间);
- 根据weight决策当前的task实际执行时间;
1. 初始化
vruntime的这么重要,可以决策下一个执行的Task,那么每个Task的vruntime是如何得到的呢?直接上code:
/*
 * called on fork with the child task as argument from the parent's context
 * - child not yet on the tasklist
 * - preemption disabled
 */
static void task_fork_fair(struct task_struct *p)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se, *curr;
struct rq *rq = this_rq();
raw_spin_lock(&rq->lock);
update_rq_clock(rq);
cfs_rq = task_cfs_rq(current);
curr = cfs_rq->curr;
if (curr) {
update_curr(cfs_rq);/* fork path: curr is the parent; bring its vruntime up to date first */
se->vruntime = curr->vruntime;/* child initially inherits the parent's vruntime */
}
place_entity(cfs_rq, se, 1);/* adjust the child's vruntime by weight (START_DEBIT charges it one vslice) */
if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
/* child-runs-first is set and parent vruntime < child vruntime: swap so the child goes first */
/*
 * Upon rescheduling, sched_class::put_prev_task() will place
 * 'current' within the tree based on its new key value.
 */
swap(curr->vruntime, se->vruntime);
resched_curr(rq);
}
se->vruntime -= cfs_rq->min_vruntime;/* store relative to min_vruntime; added back when enqueued on the rq's rbtree */
raw_spin_unlock(&rq->lock);
}
上述函数即一个Task被创建时,vruntime的初始化过程:
- 继承自父进程的vruntime
- 更新得到子进程的vruntime
- 获取cfs_rq 的min_vruntime
- 给min_vruntime + sched_vslice,这个sched_vslice就是根据当前task跟cfs上所有task的load比值获取到的实际执行时间,再换算成优先级为0的vruntime;
- 取上述计算值与父进程值中较大的给到新创建的entity;
- 如果有sysctl_sched_child_runs_first标记,且子进程vruntime > 父进程vruntime,则交换
- vruntime - cfs_rq min_vruntime插入cfs 的红黑树
也就是说,为保证公平,给所有task分配的vruntime都是按照优先级为0做的,在一定周期内所有task按照weight比例分配时间;
但是实际执行时间是根据load比值来计算的;
1.1 核心在第二步,具体来看:
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
u64 vruntime = cfs_rq->min_vruntime;//start from the runqueue's min_vruntime
if (initial && sched_feat(START_DEBIT))
vruntime += sched_vslice(cfs_rq, se);//key step for fork: charge the new task one vslice, computed from the weights of all tasks on this rq
//wakeup placement (not initial): grant sleepers a bounded vruntime credit so they can preempt
if (!initial) {
unsigned long thresh = sysctl_sched_latency;
if (sched_feat(GENTLE_FAIR_SLEEPERS)) thresh >>= 1;
vruntime -= thresh;
}
se->vruntime = max_vruntime(se->vruntime, vruntime);//never move the entity's vruntime backwards: keep the larger value
}
1.2 继续追踪sched_vslice:
这个函数实质就是根据权重计算task应该被分配到的时间
/*
 * Virtual time slice for @se: take the wall-clock slice computed by
 * sched_slice() and convert it into nice-0 virtual time with
 * calc_delta_fair().
 */
static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	u64 wall_slice = sched_slice(cfs_rq, se);

	return calc_delta_fair(wall_slice, se);
}
// slice = period * se->weight / total cfs_rq load, applied at each level of the group hierarchy
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);//period to distribute, scaled by runnable count (counting se if it is not queued yet)
for_each_sched_entity(se) {//walk up the group-scheduling hierarchy from this entity
struct load_weight *load;
struct load_weight lw;//scratch copy of the cfs_rq's total load
cfs_rq = cfs_rq_of(se);
load = &cfs_rq->load;//total weight currently enqueued on this cfs_rq
if (unlikely(!se->on_rq)) {
lw = cfs_rq->load;
update_load_add(&lw, se->load.weight);//se not enqueued yet: add its weight into the total before dividing
load = &lw;
}
slice = __calc_delta(slice, se->load.weight, load);//slice = slice * se->weight / total_load
}
return slice;
}
/*
 * Scale @delta (wall-clock ns) into virtual time for @se:
 * delta * NICE_0_LOAD / se->load.weight. The nice-0 case is the
 * identity and returns the input unchanged — which is why every
 * entity's vruntime advances as if it had nice 0.
 */
static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
{
	if (se->load.weight == NICE_0_LOAD)
		return delta;

	return __calc_delta(delta, NICE_0_LOAD, &se->load);
}
//computes delta_exec * weight / lw->weight without a division, using the
//precomputed inverse lw->inv_weight (~= 2^32 / lw->weight) so the divide
//becomes a multiply plus a right shift
static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
{
u64 fact = scale_load_down(weight);
int shift = WMULT_SHIFT;
__update_inv_weight(lw);
//fold fact down to 32 bits; every dropped bit is compensated in the final shift
if (unlikely(fact >> 32)) {
while (fact >> 32) {
fact >>= 1;
shift--;
}
}
/* hint to use a 32x32->64 mul */
fact = (u64)(u32)fact * lw->inv_weight;
//the product may again exceed 32 bits; renormalize the same way
while (fact >> 32) {
fact >>= 1;
shift--;
}
return mul_u64_u32_shr(delta_exec, fact, shift);
}
2. 更新
按照我们的理解,vruntime在每次执行结束后都会更新,累加上当前执行的时间,则可以获取到下次执行的时刻:
//Update the currently running entity's runtime statistics and advance its vruntime.
static void update_curr(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
u64 now = rq_clock_task(rq_of(cfs_rq));
u64 delta_exec;
if (unlikely(!curr)) return;
delta_exec = now - curr->exec_start;//wall-clock time run since the last update
if (unlikely((s64)delta_exec <= 0)) return;
curr->exec_start = now;
schedstat_set(curr->statistics.exec_max, max(delta_exec, curr->statistics.exec_max));
curr->sum_exec_runtime += delta_exec;
schedstat_add(cfs_rq->exec_clock, delta_exec);
curr->vruntime += calc_delta_fair(delta_exec, curr);//key line: convert actual runtime to nice-0 virtual time (as analyzed above) and accumulate it
update_min_vruntime(cfs_rq);
if (entity_is_task(curr)) {
struct task_struct *curtask = task_of(curr);
trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
cpuacct_charge(curtask, delta_exec);
account_group_exec_runtime(curtask, delta_exec);
}
account_cfs_rq_runtime(cfs_rq, delta_exec);
}
在update_curr时会更新vruntime
这里更新也很简单,使用实际执行时间 delta * nice_0 / weight,就是把实际执行时间换算成优先级为0的虚拟时间;
也就是说虚拟运行时间(vruntime)的增长速度与权重强相关;
3. 权重计算
进程的权重是由一个静态数组决定的,nice值从-20到19,对应的权重依次按约1.25倍的比例递减
/*
 * Nice levels are multiplicative, with a gentle 10% change for every
 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
 * nice 1, it will get ~10% less CPU time than another CPU-bound task
 * that remained on nice 0.
 *
 * The "10% effect" is relative and cumulative: from _any_ nice level,
 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
 * If a task goes up by ~10% and another task goes down by ~10% then
 * the relative distance between them is ~25%.)
 */
/* index = nice + 20, so nice -20 maps to entry 0 and nice 0 to 1024 (NICE_0_LOAD) */
const int sched_prio_to_weight[40] = {
/* -20 */ 88761, 71755, 56483, 46273, 36291,
/* -15 */ 29154, 23254, 18705, 14949, 11916,
/* -10 */ 9548, 7620, 6100, 4904, 3906,
/* -5 */ 3121, 2501, 1991, 1586, 1277,
/* 0 */ 1024, 820, 655, 526, 423,
/* 5 */ 335, 272, 215, 172, 137,
/* 10 */ 110, 87, 70, 56, 45,
/* 15 */ 36, 29, 23, 18, 15,
};
/*
 * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated.
 *
 * In cases where the weight does not change often, we can use the
 * precalculated inverse to speed up arithmetics by turning divisions
 * into multiplications:
 */
/* e.g. entry 0: 2^32 / 88761 ~= 48388; used as inv_weight in __calc_delta() */
const u32 sched_prio_to_wmult[40] = {
/* -20 */ 48388, 59856, 76040, 92818, 118348,
/* -15 */ 147320, 184698, 229616, 287308, 360437,
/* -10 */ 449829, 563644, 704093, 875809, 1099582,
/* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
/* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
/* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
/* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};
按照设计,nice值每降低1(即优先级提高一级),大约可以多获得10%的CPU时间;