eb92692 sched/fair: Speed-up energy-aware wake-ups
find_energy_efficient_cpu()
/* if (rd->overutilized), the EAS path bails out (goto fail) and the default wake-up path is used */
Packing small tasks onto one CPU makes it easier for the other CPUs to enter deeper idle states, but it also makes it harder for the whole cluster to go idle. find_energy_efficient_cpu() therefore favors cluster-packing while spreading tasks inside a cluster, which is at least good for scheduling latency. Most of EAS's energy savings come from the asymmetry of the system rather than from breaking ties between identical CPUs, which is also why EAS is only enabled for asymmetric (heterogeneous) CPU systems (SD_ASYM_CPUCAPACITY).

Forkees do not take the EAS wake-up path, because a new task has no useful utilization data yet and its impact on energy consumption cannot be predicted; they are placed by find_idlest_cpu() on the least loaded CPU. Alternatives would be to bias new tasks towards specific CPUs, or to infer util_avg from the parent task, but the drawback of those heuristics is that they could hurt other use-cases.
6364 /*
6365 * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the
6366 * waking task. find_energy_efficient_cpu() looks for the CPU with maximum
6367 * spare capacity in each performance domain and uses it as a potential
6368 * candidate to execute the task. Then, it uses the Energy Model to figure
6369 * out which of the CPU candidates is the most energy-efficient.
6370 *
6371 * The rationale for this heuristic is as follows. In a performance domain,
6372 * all the most energy efficient CPU candidates (according to the Energy
6373 * Model) are those for which we'll request a low frequency. When there are
6374 * several CPUs for which the frequency request will be the same, we don't
6375 * have enough data to break the tie between them, because the Energy Model
6376 * only includes active power costs. With this model, if we assume that
6377 * frequency requests follow utilization (e.g. using schedutil), the CPU with
6378 * the maximum spare capacity in a performance domain is guaranteed to be among
6379 * the best candidates of the performance domain.
6380 *
6381 * In practice, it could be preferable from an energy standpoint to pack
6382 * small tasks on a CPU in order to let other CPUs go in deeper idle states,
6383 * but that could also hurt our chances to go cluster idle, and we have no
6384 * ways to tell with the current Energy Model if this is actually a good
6385 * idea or not. So, find_energy_efficient_cpu() basically favors
6386 * cluster-packing, and spreading inside a cluster. That should at least be
6387 * a good thing for latency, and this is consistent with the idea that most
6388 * of the energy savings of EAS come from the asymmetry of the system, and
6389 * not so much from breaking the tie between identical CPUs. That's also the
6390 * reason why EAS is enabled in the topology code only for systems where
6391 * SD_ASYM_CPUCAPACITY is set.
6392 *
6393 * NOTE: Forkees are not accepted in the energy-aware wake-up path because
6394 * they don't have any useful utilization data yet and it's not possible to
6395 * forecast their impact on energy consumption. Consequently, they will be
6396 * placed by find_idlest_cpu() on the least loaded CPU, which might turn out
6397 * to be energy-inefficient in some use-cases. The alternative would be to
6398 * bias new tasks towards specific types of CPUs first, or to try to infer
6399 * their util_avg from the parent task, but those heuristics could hurt
6400 * other use-cases too. So, until someone finds a better way to solve this,
6401 * let's keep things simple by re-using the existing slow path.
6402 */
6403 static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu, int sync)
6404 {
6405 unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
6406 struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
6407 int max_spare_cap_cpu_ls = prev_cpu, best_idle_cpu = -1;
6408 unsigned long max_spare_cap_ls = 0, target_cap;
6409 unsigned long cpu_cap, util, base_energy = 0;
6410 bool boosted, latency_sensitive = false;
6411 unsigned int min_exit_lat = UINT_MAX;
6412 int cpu, best_energy_cpu = prev_cpu;
6413 struct cpuidle_state *idle;
6414 struct sched_domain *sd;
6415 struct perf_domain *pd;
6416 unsigned long uclamp_util = uclamp_task_util(p);
6417
6418 rcu_read_lock();
6419 pd = rcu_dereference(rd->pd);
6420 if (!pd || READ_ONCE(rd->overutilized))
6421 goto fail;
6422
/* Sync fast path: if the waker will sleep right after waking the wakee (sync),
 * the current CPU's rq has only this one running task, and the current CPU is
 * in task p's allowed cpumask, return the current CPU directly.
 */
6423 cpu = smp_processor_id();
6424 if (sync && cpu_rq(cpu)->nr_running == 1 &&
6425 cpumask_test_cpu(cpu, p->cpus_ptr)) {
6426 rcu_read_unlock();
6427 return cpu;
6428 }
6429
6430 /*
6431 * Energy-aware wake-up happens on the lowest sched_domain starting
6432 * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu.
6433 */
/* sd_asym_cpucapacity is set in update_top_cache_domain(); it is the
 * lowest-level sched_domain of the current CPU whose flags contain
 * SD_ASYM_CPUCAPACITY, i.e. the lowest asymmetric-capacity domain spanning
 * this CPU.
 */
6434 sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity));
/* If prev_cpu is not in sd's span, walk up via sd->parent until a domain
 * spanning both this_cpu and prev_cpu is found; if none exists, fail.
 */
6435 while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
6436 sd = sd->parent;
6437 if (!sd)
6438 goto fail;
6439
6440 sync_entity_load_avg(&p->se);
6441 if (!task_util_est(p))
6442 goto unlock;
6443
6444 latency_sensitive = uclamp_latency_sensitive(p);
/* boosted is true when the task's effective UCLAMP_MIN clamp value is
 * greater than 0.
 */
6445 boosted = uclamp_boosted(p);
6446 target_cap = boosted ? 0 : ULONG_MAX;
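The two uclamp helpers above are not shown in this listing. As the comment says, uclamp_boosted() reduces to checking whether the task's effective UCLAMP_MIN is non-zero; a minimal sketch of that check (not the exact kernel source, but uclamp_eff_value() is the standard helper that resolves the task/cgroup/system clamps):

static inline bool uclamp_boosted(struct task_struct *p)
{
	/* a non-zero effective UCLAMP_MIN means the task asks for extra capacity */
	return uclamp_eff_value(p, UCLAMP_MIN) > 0;
}

uclamp_latency_sensitive() is a vendor/Android extension (not in mainline) that marks the task as latency sensitive via its task group.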
6447
6448 trace_sched_task_comm_info(p, latency_sensitive,
6449 boosted, uclamp_util);
6450
/* rd->pd is a singly-linked list of performance domains, one per cluster; on
 * this 4+4 SoC there are two: perf_domain4 (cpu4~7) -> perf_domain0 (cpu0~3).
 */
6451 for (; pd; pd = pd->next) {
6452 unsigned long cur_delta, spare_cap, max_spare_cap = 0;
6453 unsigned long base_energy_pd;
6454 int max_spare_cap_cpu = -1;
6455
6456 /* Compute the 'base' energy of the pd, without @p */
/* With dst_cpu = -1, compute the pd's base energy without task p.
 */
6457 base_energy_pd = compute_energy(p, -1, pd);
/* compute_energy() estimates the pd's energy if task p were migrated to
 * dst_cpu:
 */
6312 /*
6313 * compute_energy(): Estimates the energy that @pd would consume if @p was
6314 * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
6315 * landscape of @pd's CPUs after the task migration, and uses the Energy Model
6316 * to compute what would be the energy if we decided to actually migrate that
6317 * task.
6318 */
6319 static long
6320 compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
6321 {
6322 struct cpumask *pd_mask = perf_domain_span(pd);
6323 unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
6324 unsigned long max_util = 0, sum_util = 0;
6325 int cpu;
6326
6327 /*
6328 * The capacity state of CPUs of the current rd can be driven by CPUs
6329 * of another rd if they belong to the same pd. So, account for the
6330 * utilization of these CPUs too by masking pd with cpu_online_mask
6331 * instead of the rd span.
6332 *
6333 * If an entire pd is outside of the current rd, it will not appear in
6334 * its pd list and will not be accounted by compute_energy().
6335 */
6336 for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
/* cpu_util_next() computes this CPU's CFS utilization after task p has been
 * migrated to dst_cpu.
 */
6337 unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu);
6338 struct task_struct *tsk = cpu == dst_cpu ? p : NULL;
6339
6340 /*
6341 * Busy time computation: utilization clamping is not
6342 * required since the ratio (sum_util / cpu_capacity)
6343 * is already enough to scale the EM reported power
6344 * consumption at the (eventually clamped) cpu_capacity.
6345 */
/* Sum up the effective utilization of every CPU in the pd (including irq, rt
 * and dl util); uclamp and the deadline bandwidth are not included here.
 */
6346 sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
6347 ENERGY_UTIL, NULL);
6348
6349 /*
6350 * Performance domain frequency: utilization clamping
6351 * must be considered since it affects the selection
6352 * of the performance domain frequency.
6353 * NOTE: in case RT tasks are running, by default the
6354 * FREQUENCY_UTIL's utilization can be max OPP.
6355 */
/* FREQUENCY_UTIL applies uclamp and adds the deadline bandwidth, but does not
 * include the dl util.
 */
6356 cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
6357 FREQUENCY_UTIL, tsk);
/* Take the maximum FREQUENCY_UTIL across the CPUs of the pd.
 */
6358 max_util = max(max_util, cpu_util);
6359 }
6360
/* em_pd (struct em_perf_domain) describes one performance domain; every CPU
 * in the pd points to the same em_perf_domain.
 */
6361 return em_pd_energy(pd->em_pd, max_util, sum_util);
6362 }
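To make the ENERGY_UTIL vs. FREQUENCY_UTIL distinction above concrete, here is a condensed paraphrase of what schedutil_cpu_util() computes (heavily abbreviated; the early-return corner cases are omitted):

/*
 * schedutil_cpu_util(cpu, util_cfs, max, type, p), condensed:
 *
 *	util = util_cfs + cpu_util_rt(rq);
 *	if (type == FREQUENCY_UTIL)
 *		util = uclamp_rq_util_with(rq, util, p);	// uclamp applied
 *	if (type == ENERGY_UTIL)
 *		util += cpu_util_dl(rq);			// DL running time
 *	util = scale_irq_capacity(util, irq, max) + irq;	// IRQ time
 *	if (type == FREQUENCY_UTIL)
 *		util += cpu_bw_dl(rq);				// DL bandwidth
 *	return min(util, max);
 */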
6363
70 /**
71 * em_pd_energy() - Estimates the energy consumed by the CPUs of a perf. domain
72 * @pd : performance domain for which energy has to be estimated
73 * @max_util : highest utilization among CPUs of the domain
74 * @sum_util : sum of the utilization of all CPUs in the domain
75 *
76 * Return: the sum of the energy consumed by the CPUs of the domain assuming
77 * a capacity state satisfying the max utilization of the domain.
78 */
79 static inline unsigned long em_pd_energy(struct em_perf_domain *pd,
80 unsigned long max_util, unsigned long sum_util)
81 {
82 unsigned long freq, scale_cpu;
83 struct em_cap_state *cs;
84 int i, cpu;
85
86 /*
87 * In order to predict the capacity state, map the utilization of the
88 * most utilized CPU of the performance domain to a requested frequency,
89 * like schedutil.
90 */
91 cpu = cpumask_first(to_cpumask(pd->cpus));
92 scale_cpu = arch_scale_cpu_capacity(cpu);
/* Each table entry (struct em_cap_state) describes one frequency/power
 * operating point of the pd; every CPU in the same pd reaches the same table
 * via pd->em_perf_domain->table, so a 4+4 SoC has two tables in total. The
 * table is sorted by ascending frequency, so cs is first set to the
 * highest-frequency entry, and map_util_freq() maps max_util to a requested
 * frequency relative to that maximum frequency.
 */
93 cs = &pd->table[pd->nr_cap_states - 1];
94 freq = map_util_freq(max_util, cs->frequency, scale_cpu);
95
96 /*
97 * Find the lowest capacity state of the Energy Model above the
98 * requested frequency.
99 */
100 for (i = 0; i < pd->nr_cap_states; i++) {
101 cs = &pd->table[i];
102 if (cs->frequency >= freq)
103 break;
104 }
105
106 /*
107 * The capacity of a CPU in the domain at that capacity state (cs)
108 * can be computed as:
109 *
110 * cs->freq * scale_cpu
111 * cs->cap = -------------------- (1)
112 * cpu_max_freq
113 *
114 * So, ignoring the costs of idle states (which are not available in
115 * the EM), the energy consumed by this CPU at that capacity state is
116 * estimated as:
117 *
118 * cs->power * cpu_util
119 * cpu_nrg = -------------------- (2)
120 * cs->cap
121 *
122 * since 'cpu_util / cs->cap' represents its percentage of busy time.
123 *
124 * NOTE: Although the result of this computation actually is in
125 * units of power, it can be manipulated as an energy value
126 * over a scheduling period, since it is assumed to be
127 * constant during that interval.
128 *
129 * By injecting (1) in (2), 'cpu_nrg' can be re-expressed as a product
130 * of two terms:
131 *
132 * cs->power * cpu_max_freq cpu_util
133 * cpu_nrg = ------------------------ * --------- (3)
134 * cs->freq scale_cpu
135 *
136 * The first term is static, and is stored in the em_cap_state struct
137 * as 'cs->cost'.
138 *
139 * Since all CPUs of the domain have the same micro-architecture, they
140 * share the same 'cs->cost', and the same CPU capacity. Hence, the
141 * total energy of the domain (which is the simple sum of the energy of
142 * all of its CPUs) can be factorized as:
143 *
144 * cs->cost * \Sum cpu_util
145 * pd_nrg = ------------------------ (4)
146 * scale_cpu
147 */
148 return cs->cost * sum_util / scale_cpu;
149 }
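A worked example of formula (4), with invented numbers: suppose the big cluster has scale_cpu = 1024, max_util = 400 and sum_util = 700. map_util_freq() requests roughly 1.25 * 400 / 1024 ≈ 49% of the maximum frequency, the loop then picks the lowest OPP at or above that frequency, and if that OPP's pre-computed cs->cost is 300, the estimated energy of the domain is 300 * 700 / 1024 ≈ 205 (in the EM's abstract units).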
6458 base_energy += base_energy_pd;
6459
6460 for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
6461 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
6462 continue;
6463
/* cpu_util_next() computes this CPU's CFS utilization after task p has been
 * migrated to it (dst_cpu == cpu).
 */
6464 util = cpu_util_next(cpu, p, cpu);
6465 cpu_cap = capacity_of(cpu);
/* Where is capacity_of(cpu), i.e. cpu_rq(cpu)->cpu_capacity, set? See below:
 */
kernel/sched/fair.c: cpu_rq(cpu)->cpu_capacity = capacity; /*update_cpu_capacity()*/
kernel/sched/core.c: rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE; /*sched_init(void) */
7902 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
7903 {
7904 unsigned long capacity = arch_scale_cpu_capacity(cpu);
7905 struct sched_group *sdg = sd->groups;
7906 struct max_cpu_capacity *mcc;
7907 unsigned long max_capacity;
7908 int max_cap_cpu;
7909 unsigned long flags;
7910
7911 cpu_rq(cpu)->cpu_capacity_orig = capacity;
7912
7913 capacity *= arch_scale_max_freq_capacity(sd, cpu);
/* arch_scale_max_freq_capacity() is max_freq_scale: after a QoS frequency
 * limit, the CPU's currently allowed maximum frequency is scaled against 1024,
 * i.e. 1024 * qos_f / orig_f.
 * Reference: https://blog.csdn.net/liglei/article/details/88733255
 */
7914 capacity >>= SCHED_CAPACITY_SHIFT;
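/* Example with made-up numbers: if a CPU whose original maximum frequency is
 * 2.0 GHz is QoS-limited to 1.5 GHz, max_freq_scale = 1024 * 1.5 / 2.0 = 768,
 * so a cpu_capacity_orig of 1024 becomes (1024 * 768) >> 10 = 768 here.
 */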
7915
7916 mcc = &cpu_rq(cpu)->rd->max_cpu_capacity;
/* max_cpu_capacity in the root_domain records the largest capacity value in
 * the rd and the CPU it belongs to.
 */
7917
7918 raw_spin_lock_irqsave(&mcc->lock, flags);
7919 max_capacity = mcc->val;
7920 max_cap_cpu = mcc->cpu;
7921
/* Update the val and cpu fields of max_cpu_capacity in the root_domain.
 */
7922 if ((max_capacity > capacity && max_cap_cpu == cpu) ||
7923 (max_capacity < capacity)) {
7924 mcc->val = capacity;
7925 mcc->cpu = cpu;
7926 #ifdef CONFIG_SCHED_DEBUG
7927 raw_spin_unlock_irqrestore(&mcc->lock, flags);
7928 printk_deferred(KERN_INFO "CPU%d: update max cpu_capacity %lu\n",
7929 cpu, capacity);
7930 goto skip_unlock;
7931 #endif
7932 }
7933 raw_spin_unlock_irqrestore(&mcc->lock, flags);
7934
7935 skip_unlock: __attribute__ ((unused));
7936 capacity = scale_rt_capacity(cpu, capacity);
7937
7938 if (!capacity)
7939 capacity = 1;
7940
7941 cpu_rq(cpu)->cpu_capacity = capacity;
7942 sdg->sgc->capacity = capacity;
7943 sdg->sgc->min_capacity = capacity;
7944 sdg->sgc->max_capacity = capacity;
7945 }
As shown at line 7936 above and in scale_rt_capacity() below, the final cpu_rq(cpu)->cpu_capacity is cpu_capacity_orig scaled by the QoS frequency limit, minus the RT and DL utilization on the rq, then scaled down for IRQ utilization: the capacity actually left for CFS tasks.
7874 static unsigned long scale_rt_capacity(int cpu, unsigned long max)
7875 {
7876 struct rq *rq = cpu_rq(cpu);
7877 unsigned long used, free;
7878 unsigned long irq;
7879
7880 irq = cpu_util_irq(rq);
7881
7882 if (unlikely(irq >= max))
7883 return 1;
7884
7885 used = READ_ONCE(rq->avg_rt.util_avg);
7886 used += READ_ONCE(rq->avg_dl.util_avg);
7887
7888 if (unlikely(used >= max))
7889 return 1;
7890
7891 free = max - used;
7892
7893 return scale_irq_capacity(free, irq, max);
7894 }
2477 static inline
2478 unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max)
2479 {
2480 util *= (max - irq);
2481 util /= max;
2482
2483 return util;
2484
2485 }
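Putting the listings above together, with cap_qos = cpu_capacity_orig * max_freq_scale >> SCHED_CAPACITY_SHIFT, the value behind capacity_of(cpu) is:

    cpu_capacity = (cap_qos - util_rt - util_dl) * (cap_qos - util_irq) / cap_qos

i.e. the QoS-scaled capacity minus RT/DL pressure, further reduced by the fraction of time spent handling IRQs.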
6466 spare_cap = cpu_cap - util;
6467
6468 /*
6469 * Skip CPUs that cannot satisfy the capacity request.
6470 * IOW, placing the task there would make the CPU
6471 * overutilized. Take uclamp into account to see how
6472 * much capacity we can get out of the CPU; this is
6473 * aligned with schedutil_cpu_util().
6474 */
/* Because how much capacity the CPU will request is affected by uclamp, clamp
 * the computed util with the rq's and task p's uclamp values to get the
 * effective capacity demand; this is aligned with schedutil_cpu_util().
 */
6475 util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
6476 trace_sched_cfs_rq_task_util(cpu, p, util, spare_cap, cpu_cap);
6477 if (!fits_capacity(util, cpu_cap))
6478 continue;
103 #define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024)
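Two notes on the checks above. uclamp_rq_util_with() combines the rq's aggregated clamps with task p's own clamps, roughly clamp(util, max(rq_min, p_min), max(rq_max, p_max)). fits_capacity(util, cpu_cap) is true only when util * 1280 < cpu_cap * 1024, i.e. util * 1.25 < cpu_cap, so the clamped demand must stay below ~80% of capacity_of(cpu); for example util = 850 does not fit cpu_cap = 1024, since 850 * 1.25 ≈ 1063 > 1024.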
6479
6480 /* Always use prev_cpu as a candidate. */
6481 if (!latency_sensitive && cpu == prev_cpu) {
6482 prev_delta = compute_energy(p, prev_cpu, pd);
6483 prev_delta -= base_energy_pd;
6484 best_delta = min(best_delta, prev_delta);
6485 }
6486
6487 /*
6488 * Find the CPU with the maximum spare capacity in
6489 * the performance domain
6490 */
6491 if (spare_cap > max_spare_cap) {
6492 max_spare_cap = spare_cap;
6493 max_spare_cap_cpu = cpu;
6494 }
6495
6496 if (!latency_sensitive)
6497 continue;
6498
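/* For latency-sensitive tasks: prefer an idle CPU. Among idle CPUs, a boosted
 * task keeps looking for a larger capacity_orig while a non-boosted task looks
 * for a smaller one; ties in capacity are broken by picking the idle CPU with
 * the shallowest idle state (smallest exit_latency). Non-idle CPUs only update
 * the fallback max_spare_cap_cpu_ls.
 */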
6499 if (idle_cpu(cpu)) {
6500 cpu_cap = capacity_orig_of(cpu);
6501 if (boosted && cpu_cap < target_cap)
6502 continue;
6503 if (!boosted && cpu_cap > target_cap)
6504 continue;
6505 idle = idle_get_state(cpu_rq(cpu));
6506 if (idle && idle->exit_latency > min_exit_lat &&
6507 cpu_cap == target_cap)
6508 continue;
6509
6510 if (idle)
6511 min_exit_lat = idle->exit_latency;
6512 target_cap = cpu_cap;
6513 best_idle_cpu = cpu;
6514 } else if (spare_cap > max_spare_cap_ls) {
6515 max_spare_cap_ls = spare_cap;
6516 max_spare_cap_cpu_ls = cpu;
6517 }
6518 }
6519
6520 /* Evaluate the energy impact of using this CPU. */
6521 if (!latency_sensitive && max_spare_cap_cpu >= 0 &&
6522 max_spare_cap_cpu != prev_cpu) {
6523 cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
6524 cur_delta -= base_energy_pd;
6525 if (cur_delta < best_delta) {
6526 best_delta = cur_delta;
6527 best_energy_cpu = max_spare_cap_cpu;
6528 }
6529 trace_sched_energy_diff(base_energy_pd, base_energy, prev_delta,
6530 cur_delta, best_delta, prev_cpu,
6531 best_energy_cpu);
6532 }
6533
6534 }
6535 unlock:
6536 rcu_read_unlock();
6537
6538 if (latency_sensitive)
6539 return best_idle_cpu >= 0 ? best_idle_cpu : max_spare_cap_cpu_ls;
6540
6541 /*
6542 * Pick the best CPU if prev_cpu cannot be used, or if it saves at
6543 * least 6% of the energy used by prev_cpu.
6544 */
6545 if (prev_delta == ULONG_MAX)
6546 return best_energy_cpu;
6547
6548 if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
6549 return best_energy_cpu;
6550
6551 return prev_cpu;
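A worked example of this threshold, with invented numbers: if base_energy summed over all pds is 1000 and prev_delta is 200, the threshold is (200 + 1000) >> 4 = 75, so best_energy_cpu is chosen only when best_delta < 125, i.e. only when moving the task saves more than 75 energy units (about 6% of the estimated total) compared to staying on prev_cpu; otherwise prev_cpu is kept, which also avoids a needless migration.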
6552
6553 fail:
6554 rcu_read_unlock();
6555
6556 return -1;
6557 }
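Note: when find_energy_efficient_cpu() returns -1 (the fail path above, e.g. no perf domains or rd->overutilized), select_task_rq_fair() falls back to the default wake-up path (wake_affine / select_idle_sibling, or find_idlest_cpu in the slow path).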