eb92692 sched/fair: Speed-up energy-aware wake-ups
find_energy_efficient_cpu()
/* if (rd->overutilized), the EAS path bails out (goto fail) and the default wake-up path is used */
Packing small tasks onto one CPU makes it easier for the other CPUs to enter deeper idle states, but it also makes it harder for the whole cluster to go idle. find_energy_efficient_cpu() therefore favors cluster-packing while spreading tasks inside a cluster, which is at least good for scheduling latency. Most of EAS's energy savings come from the asymmetry of the system rather than from breaking ties between identical CPUs, which is also why EAS is only enabled for asymmetric (heterogeneous) CPU systems (SD_ASYM_CPUCAPACITY).

Forkees do not take the EAS wake-up path, because a new task has no useful utilization data yet and its impact on energy consumption cannot be predicted; they are placed by find_idlest_cpu() on the least loaded CPU. Alternatives would be to bias new tasks towards specific CPUs, or to infer util_avg from the parent task, but the drawback of those heuristics is that they could hurt other use-cases.
6364 /*
6365 * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the
6366 * waking task. find_energy_efficient_cpu() looks for the CPU with maximum
6367 * spare capacity in each performance domain and uses it as a potential
6368 * candidate to execute the task. Then, it uses the Energy Model to figure
6369 * out which of the CPU candidates is the most energy-efficient.
6370 *
6371 * The rationale for this heuristic is as follows. In a performance domain,
6372 * all the most energy efficient CPU candidates (according to the Energy
6373 * Model) are those for which we'll request a low frequency. When there are
6374 * several CPUs for which the frequency request will be the same, we don't
6375 * have enough data to break the tie between them, because the Energy Model
6376 * only includes active power costs. With this model, if we assume that
6377 * frequency requests follow utilization (e.g. using schedutil), the CPU with
6378 * the maximum spare capacity in a performance domain is guaranteed to be among
6379 * the best candidates of the performance domain.
6380 *
6381 * In practice, it could be preferable from an energy standpoint to pack
6382 * small tasks on a CPU in order to let other CPUs go in deeper idle states,
6383 * but that could also hurt our chances to go cluster idle, and we have no
6384 * ways to tell with the current Energy Model if this is actually a good
6385 * idea or not. So, find_energy_efficient_cpu() basically favors
6386 * cluster-packing, and spreading inside a cluster. That should at least be
6387 * a good thing for latency, and this is consistent with the idea that most
6388 * of the energy savings of EAS come from the asymmetry of the system, and
6389 * not so much from breaking the tie between identical CPUs. That's also the
6390 * reason why EAS is enabled in the topology code only for systems where
6391 * SD_ASYM_CPUCAPACITY is set.
6392 *
6393 * NOTE: Forkees are not accepted in the energy-aware wake-up path because
6394 * they don't have any useful utilization data yet and it's not possible to
6395 * forecast their impact on energy consumption. Consequently, they will be
6396 * placed by find_idlest_cpu() on the least loaded CPU, which might turn out
6397 * to be energy-inefficient in some use-cases. The alternative would be to
6398 * bias new tasks towards specific types of CPUs first, or to try to infer
6399 * their util_avg from the parent task, but those heuristics could hurt
6400 * other use-cases too. So, until someone finds a better way to solve this,
6401 * let's keep things simple by re-using the existing slow path.
6402 */
6403 static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu, int sync)
6404 {
6405 unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
6406 struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
6407 int max_spare_cap_cpu_ls = prev_cpu, best_idle_cpu = -1;
6408 unsigned long max_spare_cap_ls = 0, target_cap;
6409 unsigned long cpu_cap, util, base_energy = 0;
6410 bool boosted, latency_sensitive = false;
6411 unsigned int min_exit_lat = UINT_MAX;
6412 int cpu, best_energy_cpu = prev_cpu;
6413 struct cpuidle_state *idle;
6414 struct sched_domain *sd;
6415 struct perf_domain *pd;
6416 unsigned long uclamp_util = uclamp_task_util(p);
6417
6418 rcu_read_lock();
6419 pd = rcu_dereference(rd->pd);
6420 if (!pd || READ_ONCE(rd->overutilized))
6421 goto fail;
6422
/* Sync fast path: if the waker will sleep right after waking the wakee (sync),
 * the current CPU's rq has only this one running task, and the current CPU is
 * in task p's allowed cpumask, return the current CPU directly.
 */
6423 cpu = smp_processor_id();
6424 if (sync && cpu_rq(cpu)->nr_running == 1 &&
6425 cpumask_test_cpu(cpu, p->cpus_ptr)) {
6426 rcu_read_unlock();
6427 return cpu;
6428 }
6429
6430 /*
6431 * Energy-aware wake-up happens on the lowest sched_domain starting
6432 * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu.
6433 */
/* sd_asym_cpucapacity is set in update_top_cache_domain(); it is the
 * lowest-level sched_domain of the current CPU whose flags contain
 * SD_ASYM_CPUCAPACITY, i.e. the lowest asymmetric-capacity domain spanning
 * this CPU.
 */
6434 sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity));
/* If prev_cpu is not in sd's span, walk up via sd->parent until a domain
 * spanning both this_cpu and prev_cpu is found; if none exists, fail.
 */
6435 while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
6436 sd = sd->parent;
6437 if (!sd)
6438 goto fail;
6439
6440 sync_entity_load_avg(&p->se);
6441 if (!task_util_est(p))
6442 goto unlock;
6443
6444 latency_sensitive = uclamp_latency_sensitive(p);
/* boosted is true when the task's effective UCLAMP_MIN clamp value is
 * greater than 0.
 */
6445 boosted = uclamp_boosted(p);
6446 target_cap = boosted ? 0 : ULONG_MAX;
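The two uclamp helpers above are not shown in this listing. As the comment says, uclamp_boosted() reduces to checking whether the task's effective UCLAMP_MIN is non-zero; a minimal sketch of that check (not the exact kernel source, but uclamp_eff_value() is the standard helper that resolves the task/cgroup/system clamps):

static inline bool uclamp_boosted(struct task_struct *p)
{
	/* a non-zero effective UCLAMP_MIN means the task asks for extra capacity */
	return uclamp_eff_value(p, UCLAMP_MIN) > 0;
}

uclamp_latency_sensitive() is a vendor/Android extension (not in mainline) that marks the task as latency sensitive via its task group.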
6447
6448 trace_sched_task_comm_info(p, latency_sensitive,
6449 boosted, uclamp_util);
6450
/* rd->pd is a singly-linked list of performance domains, one per cluster; on
 * this 4+4 SoC there are two: perf_domain4 (cpu4~7) -> perf_domain0 (cpu0~3).
 */
6451 for (; pd; pd = pd->next) {
6452 unsigned long cur_delta, spare_cap, max_spare_cap = 0;
6453 unsigned long base_energy_pd;
6454 int max_spare_cap_cpu = -1;
6455
6456 /* Compute the 'base' energy of the pd, without @p */
/* With dst_cpu = -1, compute the pd's base energy without task p.
 */
6457 base_energy_pd = compute_energy(p, -1, pd);
/* compute_energy() estimates the pd's energy if task p were migrated to
 * dst_cpu:
 */
6312 /*
6313 * compute_energy(): Estimates the energy that @pd would consume if @p was
6314 * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
6315 * landscape of @pd's CPUs after the task migration, and uses the Energy Model
6316 * to compute what would be the energy if we decided to actually migrate that
6317 * task.
6318 */
6319 static long
6320 compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
6321 {
6322 struct cpumask *pd_mask = perf_domain_span(pd);
6323 unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
6324 unsigned long max_util = 0, sum_util = 0;
6325 int cpu;
6326
6327 /*
6328 * The capacity state of CPUs of the current rd can be driven by CPUs
6329 * of another rd if they belong to the same pd. So, account for the
6330 * utilization of these CPUs too by masking pd with cpu_online_mask
6331 * instead of the rd span.
6332 *
6333 * If an entire pd is outside of the current rd, it will not appear in
6334 * its pd list and will not be accounted by compute_energy().
6335 */
6336 for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
/* cpu_util_next() computes this CPU's CFS utilization after task p has been
 * migrated to dst_cpu.
 */
6337 unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu);
6338 struct task_struct *tsk = cpu == dst_cpu ? p : NULL;
6339
6340 /*
6341 * Busy time computation: utilization clamping is not
6342 * required since the ratio (sum_util / cpu_capacity)
6343 * is already enough to scale the EM reported power
6344 * consumption at the (eventually clamped) cpu_capacity.
6345 */
/* Sum up the effective utilization of every CPU in the pd (including irq, rt
 * and dl util); uclamp and the deadline bandwidth are not included here.
 */
6346 sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
6347 ENERGY_UTIL, NULL);
6348
6349 /*
6350 * Performance domain frequency: utilization clamping
6351 * must be considered since it affects the selection
6352 * of the performance domain frequency.
6353 * NOTE: in case RT tasks are running, by default the
6354 * FREQUENCY_UTIL's utilization can be max OPP.
6355 */
/* FREQUENCY_UTIL applies uclamp and adds the deadline bandwidth, but does not
 * include the dl util.
 */
6356 cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
6357 FREQUENCY_UTIL, tsk);
/* Take the maximum FREQUENCY_UTIL across the CPUs of the pd.
 */
6358 max_util = max(max_util, cpu_util);
6359 }
6360
/* em_pd (struct em_perf_domain) describes one performance domain; every CPU
 * in the pd points to the same em_perf_domain.
 */
6361 return em_pd_energy(pd->em_pd, max_util, sum_util);
6362 }
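To make the ENERGY_UTIL vs. FREQUENCY_UTIL distinction above concrete, here is a condensed paraphrase of what schedutil_cpu_util() computes (heavily abbreviated; the early-return corner cases are omitted):

/*
 * schedutil_cpu_util(cpu, util_cfs, max, type, p), condensed:
 *
 *	util = util_cfs + cpu_util_rt(rq);
 *	if (type == FREQUENCY_UTIL)
 *		util = uclamp_rq_util_with(rq, util, p);	// uclamp applied
 *	if (type == ENERGY_UTIL)
 *		util += cpu_util_dl(rq);			// DL running time
 *	util = scale_irq_capacity(util, irq, max) + irq;	// IRQ time
 *	if (type == FREQUENCY_UTIL)
 *		util += cpu_bw_dl(rq);				// DL bandwidth
 *	return min(util, max);
 */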
6363
70 /**
71 * em_pd_energy() - Estimates the energy consumed by the CPUs of a perf. domain
72 * @pd : performance domain for which energy has to be estimated
73 * @max_util : highest utilization among CPUs of the domain
74 * @sum_util : sum of the utilization of all CPUs in the domain
75 *
76 * Return: the sum of the energy consumed by the CPUs of the domain assuming
77 * a capacity state satisfying the max utilization of the domain.
78 */
79 static inline unsigned long em_pd_energy(struct em_perf_domain *pd,
80 unsigned long max_util, unsigned long sum_util)
81 {
82 unsigned long freq, scale_cpu;
83 struct em_cap_state *cs;
84 int i, cpu;
85
86 /*
87 * In order to predict the capacity state, map the utilization of the
88 * most utilized CPU of the performance domain to a requested frequency,
89 * like schedutil.
90 */
91 cpu = cpumask_first(to_cpumask(pd->cpus));
92 scale_cpu = arch_scale_cpu_capacity(cpu);
/* Each table entry (struct em_cap_state) describes one frequency/power
 * operating point of the pd; every CPU in the same pd reaches the same table
 * via pd->em_perf_domain->table, so a 4+4 SoC has two tables in total. The
 * table is sorted by ascending frequency, so cs is first set to the
 * highest-frequency entry, and map_util_freq() maps max_util to a requested
 * frequency relative to that maximum frequency.
 */
93 cs = &pd->table[pd->nr_cap_states - 1];
94 freq = map_util_freq(max_util, cs->frequency, scale_cpu);
95
96 /*
97 * Find the lowest capacity state of the Energy Model above the
98 * requested frequency.
99 */
100 for (i = 0; i < pd->nr_cap_states; i++) {
101 cs = &pd->table[i];
102 if (cs->frequency >= freq)
103 break;
104 }
105
106 /*
107 * The capacity of a CPU in the domain at that capacity state (cs)
108 * can be computed as:
109 *
110 * cs->freq * scale_cpu
111 * cs->cap = -------------------- (1)
112 * cpu_max_freq
113 *
114 * So, ignoring the costs of idle states (which are not available in
115 * the EM), the energy consumed by this CPU at that capacity state is
116 * estimated as:
117 *
118 * cs->power * cpu_util
119 * cpu_nrg = -------------------- (2)
120 * cs->cap
121 *
122 * since 'cpu_util / cs->cap' represents its percentage of busy time.
123 *
124 * NOTE: Although the result of this computation actually is in
125 * units of power, it can be manipulated as an energy value
126 * over a scheduling period, since it is assumed to be
127 * constant during that interval.
128 *
129 * By injecting (1) in (2), 'cpu_nrg' can be re-expressed as a product
130 * of two terms:
131 *
132 * cs->power * cpu_max_freq cpu_util
133 * cpu_nrg = ------------------------ * --------- (3)
134 * cs->freq scale_cpu
135 *
136 * The first term is static, and is stored in the em_cap_state struct
137 * as 'cs->cost'.
138 *
139 * Since all CPUs of the domain have the same micro-architecture, they
140 * share the same 'cs->cost', and the same CPU capacity. Hence, the
141 * total energy of the domain (which is the simple sum of the energy of
142 * all of its CPUs) can be factorized as:
143 *
144 * cs->cost * \Sum cpu_util
145 * pd_nrg = ------------------------ (4)
146 * scale_cpu
147 */
148 return cs->cost * sum_util / scale_cpu;
149 }
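A worked example of formula (4), with invented numbers: suppose the big cluster has scale_cpu = 1024, max_util = 400 and sum_util = 700. map_util_freq() requests roughly 1.25 * 400 / 1024 ≈ 49% of the maximum frequency, the loop then picks the lowest OPP at or above that frequency, and if that OPP's pre-computed cs->cost is 300, the estimated energy of the domain is 300 * 700 / 1024 ≈ 205 (in the EM's abstract units).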
6458 base_energy += base_energy_pd;
6459
6460 for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
6461 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
6462 continue;
6463
/* cpu_util_next() computes this CPU's CFS utilization after task p has been
 * migrated to it (dst_cpu == cpu).
 */
6464 util = cpu_util_next(cpu, p, cpu);
6465 cpu_cap = capacity_of(cpu);
/* Where is capacity_of(cpu), i.e. cpu_rq(cpu)->cpu_capacity, set? See below:
 */
kernel/sched/fair.c: cpu_rq(cpu)->cpu_capacity = capacity; /*update_cpu_capacity()*/
kernel/sched/core.c: rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE; /*sched_init(void) */
7902 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
7903 {
7904 unsigned long capacity = arch_scale_cpu_capacity(cpu);
7905 struct sched_group *sdg = sd->groups;
7906 struct max_cpu_capacity *mcc;
7907 unsigned long max_capacity;
7908 int max_cap_cpu;
7909 unsigned long flags;
7910
7911 cpu_rq(cpu)->cpu_capacity_orig = capacity;
7912
7913 capacity *= arch_scale_max_freq_capacity(sd, cpu);
/* arch_scale_max_freq_capacity() is max_freq_scale: after a QoS frequency
 * limit, the CPU's currently allowed maximum frequency is scaled against 1024,
 * i.e. 1024 * qos_f / orig_f.
 * Reference: https://blog.csdn.net/liglei/article/details/88733255
 */
7914 capacity >>= SCHED_CAPACITY_SHIFT;
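/* Example with made-up numbers: if a CPU whose original maximum frequency is
 * 2.0 GHz is QoS-limited to 1.5 GHz, max_freq_scale = 1024 * 1.5 / 2.0 = 768,
 * so a cpu_capacity_orig of 1024 becomes (1024 * 768) >> 10 = 768 here.
 */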
7915
7916 mcc = &cpu_rq(cpu)->rd->max_cpu_capacity;
/* max_cpu_capacity in the root_domain records the largest capacity value in
 * the rd and the CPU it belongs to.
 */
7917
7918 raw_spin_lock_irqsave(&mcc->lock, flags);
7919 max_capacity = mcc->val;
7920 max_cap_cpu = mcc->cpu;
7921
/* Update the val and cpu fields of max_cpu_capacity in the root_domain.
 */
7922 if ((max_capacity > capacity && max_cap_cpu == cpu) ||
7923 (max_capacity < capacity)) {
7924 mcc->val = capacity;
7925 mcc->cpu = cpu;
7926 #ifdef CONFIG_SCHED_DEBUG
7927 raw_spin_unlock_irqrestore(&mcc->lock, flags);
7928 printk_deferred(KERN_INFO "CPU%d: update max cpu_capacity %lu\n",
7929 cpu, capacity);
7930 goto skip_unlock;
7931 #endif
7932 }
7933 raw_spin_unlock_irqrestore(&mcc->lock, flags);
7934
7935 skip_unlock: __attribute__ ((unused));
7936 capacity = scale_rt_capacity(cpu, capacity);
7937
7938 if (!capacity)
7939 capacity = 1;
7940
7941 cpu_rq(cpu)->cpu_capacity = capacity;
7942 sdg->sgc->capacity = capacity;
7943 sdg->sgc->min_capacity = capacity;
7944 sdg->sgc->max_capacity = capacity;
7945 }
As shown at line 7936 above and in scale_rt_capacity() below, the final cpu_rq(cpu)->cpu_capacity is cpu_capacity_orig scaled by the QoS frequency limit, minus the RT and DL utilization on the rq, then scaled down for IRQ utilization: the capacity actually left for CFS tasks.
7874 static unsigned long scale_rt_capacity(int cpu, unsigned long max)
7875 {
7876 struct rq *rq = cpu_rq(cpu);
7877 unsigned long used, free;
7878 unsigned long irq;
7879
7880 irq = cpu_util_irq(rq);
7881
7882 if (unlikely(irq >= max))
7883 return 1;
7884
7885 used = READ_ONCE(rq->avg_rt.util_avg);
7886 used += READ_ONCE(rq->avg_dl.util_avg);
7887
7888 if (unlikely(used >= max))
7889 return 1;
7890
7891 free = max - used;
7892
7893 return scale_irq_capacity(free, irq, max);
7894 }
2477 static inline
2478 unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max)
2479 {
2480 util *= (max - irq);
2481 util /= max;
2482
2483 return util;
2484
2485 }
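Putting the listings above together, with cap_qos = cpu_capacity_orig * max_freq_scale >> SCHED_CAPACITY_SHIFT, the value behind capacity_of(cpu) is:

    cpu_capacity = (cap_qos - util_rt - util_dl) * (cap_qos - util_irq) / cap_qos

i.e. the QoS-scaled capacity minus RT/DL pressure, further reduced by the fraction of time spent handling IRQs.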
6466 spare_cap = cpu_cap - util;
6467
6468 /*
6469 * Skip CPUs that cannot satisfy the capacity request.
6470 * IOW, placing the task there would make the CPU
6471 * overutilized. Take uclamp into account to see how
6472 * much capacity we can get out of the CPU; this is
6473 * aligned with schedutil_cpu_util().
6474 */
/* Because how much capacity the CPU will request is affected by uclamp, clamp
 * the computed util with the rq's and task p's uclamp values to get the
 * effective capacity demand; this is aligned with schedutil_cpu_util().
 */
6475 util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
6476 trace_sched_cfs_rq_task_util(cpu, p, util, spare_cap, cpu_cap);
6477 if (!fits_capacity(util, cpu_cap))
6478 continue;
103 #define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024)
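Two notes on the checks above. uclamp_rq_util_with() combines the rq's aggregated clamps with task p's own clamps, roughly clamp(util, max(rq_min, p_min), max(rq_max, p_max)). fits_capacity(util, cpu_cap) is true only when util * 1280 < cpu_cap * 1024, i.e. util * 1.25 < cpu_cap, so the clamped demand must stay below ~80% of capacity_of(cpu); for example util = 850 does not fit cpu_cap = 1024, since 850 * 1.25 ≈ 1063 > 1024.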
6479
6480 /* Always use prev_cpu as a candidate. */
6481 if (!latency_sensitive && cpu == prev_cpu) {
6482 prev_delta = compute_energy(p, prev_cpu, pd);
6483 prev_delta -= base_energy_pd;
6484 best_delta = min(best_delta, prev_delta);
6485 }
6486
6487 /*
6488 * Find the CPU with the maximum spare capacity in
6489 * the performance domain
6490 */
6491 if (spare_cap > max_spare_cap) {
6492 max_spare_cap = spare_cap;
6493 max_spare_cap_cpu = cpu;
6494 }
6495
6496 if (!latency_sensitive)
6497 continue;
6498
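/* For latency-sensitive tasks: prefer an idle CPU. Among idle CPUs, a boosted
 * task keeps looking for a larger capacity_orig while a non-boosted task looks
 * for a smaller one; ties in capacity are broken by picking the idle CPU with
 * the shallowest idle state (smallest exit_latency). Non-idle CPUs only update
 * the fallback max_spare_cap_cpu_ls.
 */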
6499 if (idle_cpu(cpu)) {
6500 cpu_cap = capacity_orig_of(cpu);
6501 if (boosted && cpu_cap < target_cap)
6502 continue;
6503 if (!boosted && cpu_cap > target_cap)
6504 continue;
6505 idle = idle_get_state(cpu_rq(cpu));
6506 if (idle && idle->exit_latency > min_exit_lat &&
6507 cpu_cap == target_cap)
6508 continue;
6509
6510 if (idle)
6511 min_exit_lat = idle->exit_latency;
6512 target_cap = cpu_cap;
6513 best_idle_cpu = cpu;
6514 } else if (spare_cap > max_spare_cap_ls) {
6515 max_spare_cap_ls = spare_cap;
6516 max_spare_cap_cpu_ls = cpu;
6517 }
6518 }
6519
6520 /* Evaluate the energy impact of using this CPU. */
6521 if (!latency_sensitive && max_spare_cap_cpu >= 0 &&
6522 max_spare_cap_cpu != prev_cpu) {
6523 cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
6524 cur_delta -= base_energy_pd;
6525 if (cur_delta < best_delta) {
6526 best_delta = cur_delta;
6527 best_energy_cpu = max_spare_cap_cpu;
6528 }
6529 trace_sched_energy_diff(base_energy_pd, base_energy, prev_delta,
6530 cur_delta, best_delta, prev_cpu,
6531 best_energy_cpu);
6532 }
6533
6534 }
6535 unlock:
6536 rcu_read_unlock();
6537
6538 if (latency_sensitive)
6539 return best_idle_cpu >= 0 ? best_idle_cpu : max_spare_cap_cpu_ls;
6540
6541 /*
6542 * Pick the best CPU if prev_cpu cannot be used, or if it saves at
6543 * least 6% of the energy used by prev_cpu.
6544 */
6545 if (prev_delta == ULONG_MAX)
6546 return best_energy_cpu;
6547
6548 if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
6549 return best_energy_cpu;
6550
6551 return prev_cpu;
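A worked example of this threshold, with invented numbers: if base_energy summed over all pds is 1000 and prev_delta is 200, the threshold is (200 + 1000) >> 4 = 75, so best_energy_cpu is chosen only when best_delta < 125, i.e. only when moving the task saves more than 75 energy units (about 6% of the estimated total) compared to staying on prev_cpu; otherwise prev_cpu is kept, which also avoids a needless migration.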
6552
6553 fail:
6554 rcu_read_unlock();
6555
6556 return -1;
6557 }
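Note: when find_energy_efficient_cpu() returns -1 (the fail path above, e.g. no perf domains or rd->overutilized), select_task_rq_fair() falls back to the default wake-up path (wake_affine / select_idle_sibling, or find_idlest_cpu in the slow path).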