Energy aware scheduling (EAS) 能量感知调度流程
引用
Documentation/scheduler/sched-energy.rst (内核文档已从 .txt 转换为 reStructuredText 格式)
基本流程图
代码
select_task_rq_fair
/*
 * select_task_rq_fair: Select target runqueue for the waking task in domains
 * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE,
 * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
 *
 * Balances load by selecting the idlest CPU in the idlest group, or under
 * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
 *
 * Returns the target CPU number.
 */
static int
select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
{
	/* Honour WF_SYNC only when the waker is not in the middle of exiting. */
	int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
	struct sched_domain *tmp, *sd = NULL;
	int cpu = smp_processor_id();	/* CPU the waker is currently running on */
	int new_cpu = prev_cpu;		/* default target: the task's previous CPU */
	int want_affine = 0;
	/* SD_flags and WF_flags share the first nibble */
	int sd_flag = wake_flags & 0xF;

	/*
	 * required for stable ->cpus_allowed
	 */
	lockdep_assert_held(&p->pi_lock);
	if (wake_flags & WF_TTWU) {
		/* Update waker/wakee flip statistics consumed by wake_wide(). */
		record_wakee(p);

		if (sched_energy_enabled()) {
			/*
			 * EAS path: try to place the task on the most
			 * energy-efficient CPU. A negative return means EAS
			 * declined to decide (e.g. the root domain is
			 * overutilized), so fall through to the regular
			 * affine/slow/fast paths below.
			 */
			new_cpu = find_energy_efficient_cpu(p, prev_cpu);
			if (new_cpu >= 0)
				return new_cpu;
			new_cpu = prev_cpu;
		}

		/*
		 * Consider a wake-affine placement only when the task does not
		 * fan out wake-ups to many distinct wakees (!wake_wide()) and
		 * the waker's CPU is allowed by the task's affinity mask.
		 */
		want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr);
	}

	rcu_read_lock();
	for_each_domain(cpu, tmp) {
		/*
		 * If both 'cpu' and 'prev_cpu' are part of this domain,
		 * cpu is a valid SD_WAKE_AFFINE target.
		 */
		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
			/* Pick between the waker's CPU and prev_cpu. */
			if (cpu != prev_cpu)
				new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);

			sd = NULL; /* Prefer wake_affine over balance flags */
			break;
		}

		/* Remember the widest domain whose flags match this wake-up type. */
		if (tmp->flags & sd_flag)
			sd = tmp;
		else if (!want_affine)
			break;
	}

	if (unlikely(sd)) {
		/* Slow path: scan for the idlest CPU of the idlest group. */
		new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
	} else if (wake_flags & WF_TTWU) { /* XXX always ? */
		/* Fast path: look for an idle sibling near the chosen target. */
		new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
	}
	rcu_read_unlock();

	return new_cpu;
}
find_energy_efficient_cpu
/*
 * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the
 * waking task. find_energy_efficient_cpu() looks for the CPU with maximum
 * spare capacity in each performance domain and uses it as a potential
 * candidate to execute the task. Then, it uses the Energy Model to figure
 * out which of the CPU candidates is the most energy-efficient.
 *
 * The rationale for this heuristic is as follows. In a performance domain,
 * all the most energy efficient CPU candidates (according to the Energy
 * Model) are those for which we'll request a low frequency. When there are
 * several CPUs for which the frequency request will be the same, we don't
 * have enough data to break the tie between them, because the Energy Model
 * only includes active power costs. With this model, if we assume that
 * frequency requests follow utilization (e.g. using schedutil), the CPU with
 * the maximum spare capacity in a performance domain is guaranteed to be among
 * the best candidates of the performance domain.
 *
 * (Note: "spare capacity" is capacity_of(cpu) - utilization. Within one
 * performance domain the CPU with the most spare capacity will request the
 * lowest frequency, hence is among the cheapest placements in that domain.)
 *
 * In practice, it could be preferable from an energy standpoint to pack
 * small tasks on a CPU in order to let other CPUs go in deeper idle states,
 * but that could also hurt our chances to go cluster idle, and we have no
 * ways to tell with the current Energy Model if this is actually a good
 * idea or not. So, find_energy_efficient_cpu() basically favors
 * cluster-packing, and spreading inside a cluster. That should at least be
 * a good thing for latency, and this is consistent with the idea that most
 * of the energy savings of EAS come from the asymmetry of the system, and
 * not so much from breaking the tie between identical CPUs. That's also the
 * reason why EAS is enabled in the topology code only for systems where
 * SD_ASYM_CPUCAPACITY is set.
 *
 * NOTE: Forkees are not accepted in the energy-aware wake-up path because
 * they don't have any useful utilization data yet and it's not possible to
 * forecast their impact on energy consumption. Consequently, they will be
 * placed by find_idlest_cpu() on the least loaded CPU, which might turn out
 * to be energy-inefficient in some use-cases. The alternative would be to
 * bias new tasks towards specific types of CPUs first, or to try to infer
 * their util_avg from the parent task, but those heuristics could hurt
 * other use-cases too. So, until someone finds a better way to solve this,
 * let's keep things simple by re-using the existing slow path.
 */
static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
{
	unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
	struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
	int cpu, best_energy_cpu = prev_cpu, target = -1;
	unsigned long cpu_cap, util, base_energy = 0;
	struct sched_domain *sd;
	struct perf_domain *pd;

	rcu_read_lock();
	/* Performance-domain list hanging off this root domain (RCU protected). */
	pd = rcu_dereference(rd->pd);
	/*
	 * Bail out (target stays -1, so the caller falls back to the regular
	 * balancing paths) when EAS is not set up for this root domain or the
	 * system is overutilized — energy estimates are meaningless then.
	 */
	if (!pd || READ_ONCE(rd->overutilized))
		goto unlock;

	/*
	 * Energy-aware wake-up happens on the lowest sched_domain starting
	 * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu.
	 */
	sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity));
	/* Walk up the hierarchy until the domain also spans prev_cpu. */
	while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
		sd = sd->parent;
	if (!sd)
		goto unlock;

	target = prev_cpu;

	/* Bring the task's PELT signals up to date before reading them. */
	sync_entity_load_avg(&p->se);
	/*
	 * task_util_est() returns max(util_avg, util_est); util_est uses an
	 * EWMA to damp transient utilization spikes. A task with zero
	 * utilization has no measurable energy impact: keep prev_cpu.
	 */
	if (!task_util_est(p))
		goto unlock;

	/* For each performance domain, pick a candidate and cost it. */
	for (; pd; pd = pd->next) {
		unsigned long cur_delta, spare_cap, max_spare_cap = 0;
		bool compute_prev_delta = false;
		unsigned long base_energy_pd;
		int max_spare_cap_cpu = -1;

		/* Find the allowed CPU with maximum spare capacity in this pd. */
		for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
			if (!cpumask_test_cpu(cpu, p->cpus_ptr))
				continue;

			/* Predicted utilization of @cpu if @p were placed on it. */
			util = cpu_util_next(cpu, p, cpu);
			cpu_cap = capacity_of(cpu);
			spare_cap = cpu_cap;
			/* spare = cpu_cap - util, clamped at zero. */
			lsub_positive(&spare_cap, util);

			/*
			 * Skip CPUs that cannot satisfy the capacity request.
			 * IOW, placing the task there would make the CPU
			 * overutilized. Take uclamp into account to see how
			 * much capacity we can get out of the CPU; this is
			 * aligned with sched_cpu_util().
			 */
			util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
			/*
			 * fits_capacity() keeps ~20% headroom: the clamped
			 * util must fit in 80% of cpu_cap (util * 1280 <
			 * cap * 1024), otherwise skip this CPU.
			 */
			if (!fits_capacity(util, cpu_cap))
				continue;

			if (cpu == prev_cpu) {
				/* prev_cpu is always evaluated as a candidate. */
				compute_prev_delta = true;
			} else if (spare_cap > max_spare_cap) {
				/* Track the pd's maximum-spare-capacity CPU. */
				max_spare_cap = spare_cap;
				max_spare_cap_cpu = cpu;
			}
		}

		/* No usable candidate in this pd: move on to the next one. */
		if (max_spare_cap_cpu < 0 && !compute_prev_delta)
			continue;

		/* Compute the 'base' energy of the pd, without @p */
		base_energy_pd = compute_energy(p, -1, pd);
		base_energy += base_energy_pd;

		/* Evaluate the energy impact of using prev_cpu. */
		if (compute_prev_delta) {
			prev_delta = compute_energy(p, prev_cpu, pd);
			/*
			 * Energy with the task can never be lower than energy
			 * without it; if it is, utilization must have changed
			 * under us and the numbers cannot be trusted — abort
			 * the EAS placement entirely.
			 */
			if (prev_delta < base_energy_pd)
				goto unlock;
			/* Energy cost of placing @p on prev_cpu. */
			prev_delta -= base_energy_pd;
			best_delta = min(best_delta, prev_delta);
		}

		/* Evaluate the energy impact of using max_spare_cap_cpu. */
		if (max_spare_cap_cpu >= 0) {
			cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
			/* Same inconsistency guard as for prev_delta above. */
			if (cur_delta < base_energy_pd)
				goto unlock;
			cur_delta -= base_energy_pd;
			/* Keep the cheapest candidate seen across all pds. */
			if (cur_delta < best_delta) {
				best_delta = cur_delta;
				best_energy_cpu = max_spare_cap_cpu;
			}
		}
	}
	rcu_read_unlock();

	/*
	 * Pick the best CPU if prev_cpu cannot be used, or if it saves at
	 * least 6% of the energy used by prev_cpu.
	 * (The threshold is total_energy >> 4, i.e. 6.25%, which hysteresis
	 * keeps the task from ping-ponging for marginal gains.)
	 */
	if ((prev_delta == ULONG_MAX) ||
	    (prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
		target = best_energy_cpu;

	return target;

unlock:
	rcu_read_unlock();

	return target;
}
compute_energy
/*
 * compute_energy(): Estimates the energy that @pd would consume if @p was
 * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
 * landscape of @pd's CPUs after the task migration, and uses the Energy Model
 * to compute what would be the energy if we decided to actually migrate that
 * task.
 *
 * A @dst_cpu of -1 means "@p is placed on none of @pd's CPUs", which yields
 * the pd's baseline energy without the task (see the caller's base_energy_pd).
 */
static long
compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
{
	/* cpumask covered by this performance domain. */
	struct cpumask *pd_mask = perf_domain_span(pd);
	/*
	 * All CPUs of a pd share the same micro-architecture, so reading the
	 * first CPU's original capacity is representative for the whole pd.
	 */
	unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
	unsigned long max_util = 0, sum_util = 0;
	unsigned long _cpu_cap = cpu_cap;
	int cpu;

	/* Deduct the capacity currently lost to thermal throttling. */
	_cpu_cap -= arch_scale_thermal_pressure(cpumask_first(pd_mask));

	/*
	 * The capacity state of CPUs of the current rd can be driven by CPUs
	 * of another rd if they belong to the same pd. So, account for the
	 * utilization of these CPUs too by masking pd with cpu_online_mask
	 * instead of the rd span.
	 *
	 * If an entire pd is outside of the current rd, it will not appear in
	 * its pd list and will not be accounted by compute_energy().
	 */
	for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
		/* Predicted utilization of @cpu after the hypothetical move. */
		unsigned long util_freq = cpu_util_next(cpu, p, dst_cpu);
		unsigned long cpu_util, util_running = util_freq;
		struct task_struct *tsk = NULL;

		/*
		 * When @p is placed on @cpu:
		 *
		 *   util_running = max(cpu_util, cpu_util_est) +
		 *                  max(task_util, _task_util_est)
		 *
		 * while cpu_util_next is: max(cpu_util + task_util,
		 *                             cpu_util_est + _task_util_est)
		 */
		if (cpu == dst_cpu) {
			tsk = p;
			util_running =
				cpu_util_next(cpu, p, -1) + task_util_est(p);
		}

		/*
		 * Busy time computation: utilization clamping is not
		 * required since the ratio (sum_util / cpu_capacity)
		 * is already enough to scale the EM reported power
		 * consumption at the (eventually clamped) cpu_capacity.
		 */
		cpu_util = effective_cpu_util(cpu, util_running, cpu_cap,
					      ENERGY_UTIL, NULL);
		sum_util += min(cpu_util, _cpu_cap);

		/*
		 * Performance domain frequency: utilization clamping
		 * must be considered since it affects the selection
		 * of the performance domain frequency.
		 * NOTE: in case RT tasks are running, by default the
		 * FREQUENCY_UTIL's utilization can be max OPP.
		 */
		cpu_util = effective_cpu_util(cpu, util_freq, cpu_cap,
					      FREQUENCY_UTIL, tsk);
		max_util = max(max_util, min(cpu_util, _cpu_cap));
	}

	/* Ask the Energy Model for the pd's energy at this utilization landscape. */
	return em_cpu_energy(pd->em_pd, max_util, sum_util, _cpu_cap);
}
cpu_util_next
/*
 * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)
 * to @dst_cpu.
 */
static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
{
	struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
	unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg);

	/*
	 * If @p migrates from @cpu to another, remove its contribution. Or,
	 * if @p migrates from another CPU to @cpu, add its contribution. In
	 * the other cases, @cpu is not impacted by the migration, so the
	 * util_avg should already be correct.
	 */
	if (task_cpu(p) == cpu && dst_cpu != cpu)
		/* Moving away: subtract task_util(p) (== p->se.avg.util_avg), clamped at 0. */
		lsub_positive(&util, task_util(p));
	else if (task_cpu(p) != cpu && dst_cpu == cpu)
		/* Moving in: add the task's contribution. */
		util += task_util(p);

	/* With UTIL_EST, also factor in the EWMA-damped utilization estimate. */
	if (sched_feat(UTIL_EST)) {
		util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);

		/*
		 * During wake-up, the task isn't enqueued yet and doesn't
		 * appear in the cfs_rq->avg.util_est.enqueued of any rq,
		 * so just add it (if needed) to "simulate" what will be
		 * cpu_util() after the task has been enqueued.
		 */
		if (dst_cpu == cpu)
			util_est += _task_util_est(p);

		util = max(util, util_est);
	}

	/* Never report more than the CPU's original capacity. */
	return min(util, capacity_orig_of(cpu));
}
em_cpu_energy
/**
 * em_cpu_energy() - Estimates the energy consumed by the CPUs of a
 *		performance domain
 * @pd		: performance domain for which energy has to be estimated
 * @max_util	: highest utilization among CPUs of the domain
 * @sum_util	: sum of the utilization of all CPUs in the domain
 * @allowed_cpu_cap	: maximum allowed CPU capacity for the @pd, which
 *			  might reflect reduced frequency (due to thermal)
 *
 * This function must be used only for CPU devices. There is no validation,
 * i.e. if the EM is a CPU type and has cpumask allocated. It is called from
 * the scheduler code quite frequently and that is why there is not checks.
 *
 * Return: the sum of the energy consumed by the CPUs of the domain assuming
 * a capacity state satisfying the max utilization of the domain.
 */
static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
				unsigned long max_util, unsigned long sum_util,
				unsigned long allowed_cpu_cap)
{
	unsigned long freq, scale_cpu;
	struct em_perf_state *ps;
	int i, cpu;

	/* A fully idle domain (zero total utilization) costs nothing in this model. */
	if (!sum_util)
		return 0;

	/*
	 * In order to predict the performance state, map the utilization of
	 * the most utilized CPU of the performance domain to a requested
	 * frequency, like schedutil. Take also into account that the real
	 * frequency might be set lower (due to thermal capping). Thus, clamp
	 * max utilization to the allowed CPU capacity before calculating
	 * effective frequency.
	 */
	cpu = cpumask_first(to_cpumask(pd->cpus));
	scale_cpu = arch_scale_cpu_capacity(cpu);
	/* Start from the highest (last) perf state; its frequency anchors the mapping. */
	ps = &pd->table[pd->nr_perf_states - 1];
	/* map_util_perf(): util + (util >> 2), i.e. ~25% headroom like schedutil. */
	max_util = map_util_perf(max_util);
	/* Respect the (possibly thermally reduced) allowed capacity. */
	max_util = min(max_util, allowed_cpu_cap);
	/* map_util_freq(): freq = ps->frequency * max_util / scale_cpu. */
	freq = map_util_freq(max_util, ps->frequency, scale_cpu);

	/*
	 * Find the lowest performance state of the Energy Model above the
	 * requested frequency.
	 */
	for (i = 0; i < pd->nr_perf_states; i++) {
		ps = &pd->table[i];
		if (ps->frequency >= freq)
			break;
	}

	/*
	 * The capacity of a CPU in the domain at the performance state (ps)
	 * can be computed as:
	 *
	 *             ps->freq * scale_cpu
	 *   ps->cap = --------------------                          (1)
	 *                 cpu_max_freq
	 *
	 * So, ignoring the costs of idle states (which are not available in
	 * the EM), the energy consumed by this CPU at that performance state
	 * is estimated as:
	 *
	 *             ps->power * cpu_util
	 *   cpu_nrg = --------------------                          (2)
	 *                   ps->cap
	 *
	 * since 'cpu_util / ps->cap' represents its percentage of busy time.
	 *
	 *   NOTE: Although the result of this computation actually is in
	 *         units of power, it can be manipulated as an energy value
	 *         over a scheduling period, since it is assumed to be
	 *         constant during that interval.
	 *
	 * By injecting (1) in (2), 'cpu_nrg' can be re-expressed as a product
	 * of two terms:
	 *
	 *             ps->power * cpu_max_freq   cpu_util
	 *   cpu_nrg = ------------------------ * ---------          (3)
	 *                    ps->freq            scale_cpu
	 *
	 * The first term is static, and is stored in the em_perf_state struct
	 * as 'ps->cost'.
	 *
	 * Since all CPUs of the domain have the same micro-architecture, they
	 * share the same 'ps->cost', and the same CPU capacity. Hence, the
	 * total energy of the domain (which is the simple sum of the energy of
	 * all of its CPUs) can be factorized as:
	 *
	 *            ps->cost * \Sum cpu_util
	 *   pd_nrg = ------------------------                       (4)
	 *                  scale_cpu
	 */
	return ps->cost * sum_util / scale_cpu;
}