EAS k5.4 (Part 1): v5.4 patch-set sched/fair: Reduce complexity of energy calculation

eb92692 sched/fair: Speed-up energy-aware wake-ups

find_energy_efficient_cpu()
/* bails out early (returns -1) if rd->overutilized is set */

Packing small tasks onto one CPU makes it easier for the other CPUs to enter deeper idle states, but it also makes it harder for the cluster as a whole to go idle. find_energy_efficient_cpu() therefore favors cluster-packing while spreading tasks inside a cluster: this is good for scheduling latency, and it does not try to break ties between identical CPUs (which share caches anyway). Most of EAS's savings come from the asymmetry of the system, which is also why EAS only acts on asymmetric (heterogeneous) CPU systems.

Forkees do not take the EAS wake-up path: a new task has no useful utilization data yet, so its impact on energy consumption cannot be forecast. They are instead placed by find_idlest_cpu() on the least loaded CPU. Alternatives would be to bias new tasks towards specific CPUs, or to inherit util_avg from the parent task, but such heuristics could hurt other use-cases (other tasks' execution).
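For context, this is roughly how the wake-up path reaches it in the Android common kernel v5.4 tree (abridged sketch; mainline v5.4 passes only p and prev_cpu, the sync argument matches the three-argument signature quoted below):

static int
select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
{
	int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
	int new_cpu = prev_cpu;
	...
	if (sd_flag & SD_BALANCE_WAKE) {	/* forkees arrive with SD_BALANCE_FORK and skip this */
		record_wakee(p);

		if (sched_energy_enabled()) {
			new_cpu = find_energy_efficient_cpu(p, prev_cpu, sync);
			if (new_cpu >= 0)
				return new_cpu;
			new_cpu = prev_cpu;	/* EAS gave up (e.g. overutilized): normal path */
		}
		...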

 6364 /*
 6365  * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the
 6366  * waking task. find_energy_efficient_cpu() looks for the CPU with maximum
 6367  * spare capacity in each performance domain and uses it as a potential
 6368  * candidate to execute the task. Then, it uses the Energy Model to figure
 6369  * out which of the CPU candidates is the most energy-efficient.
 6370  *
 6371  * The rationale for this heuristic is as follows. In a performance domain,
 6372  * all the most energy efficient CPU candidates (according to the Energy
 6373  * Model) are those for which we'll request a low frequency. When there are
 6374  * several CPUs for which the frequency request will be the same, we don't
 6375  * have enough data to break the tie between them, because the Energy Model
 6376  * only includes active power costs. With this model, if we assume that
 6377  * frequency requests follow utilization (e.g. using schedutil), the CPU with
 6378  * the maximum spare capacity in a performance domain is guaranteed to be among
 6379  * the best candidates of the performance domain.
 6380  *
 6381  * In practice, it could be preferable from an energy standpoint to pack
 6382  * small tasks on a CPU in order to let other CPUs go in deeper idle states,
 6383  * but that could also hurt our chances to go cluster idle, and we have no
 6384  * ways to tell with the current Energy Model if this is actually a good
 6385  * idea or not. So, find_energy_efficient_cpu() basically favors
 6386  * cluster-packing, and spreading inside a cluster. That should at least be
 6387  * a good thing for latency, and this is consistent with the idea that most
 6388  * of the energy savings of EAS come from the asymmetry of the system, and
 6389  * not so much from breaking the tie between identical CPUs. That's also the
 6390  * reason why EAS is enabled in the topology code only for systems where
 6391  * SD_ASYM_CPUCAPACITY is set.
 6392  *
 6393  * NOTE: Forkees are not accepted in the energy-aware wake-up path because
 6394  * they don't have any useful utilization data yet and it's not possible to
 6395  * forecast their impact on energy consumption. Consequently, they will be
 6396  * placed by find_idlest_cpu() on the least loaded CPU, which might turn out
 6397  * to be energy-inefficient in some use-cases. The alternative would be to
 6398  * bias new tasks towards specific types of CPUs first, or to try to infer
 6399  * their util_avg from the parent task, but those heuristics could hurt
 6400  * other use-cases too. So, until someone finds a better way to solve this,
 6401  * let's keep things simple by re-using the existing slow path.
 6402  */
 6403 static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu, int sync)
 6404 {
 6405     unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
 6406     struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
 6407     int max_spare_cap_cpu_ls = prev_cpu, best_idle_cpu = -1;
 6408     unsigned long max_spare_cap_ls = 0, target_cap;
 6409     unsigned long cpu_cap, util, base_energy = 0;
 6410     bool boosted, latency_sensitive = false;
 6411     unsigned int min_exit_lat = UINT_MAX;
 6412     int cpu, best_energy_cpu = prev_cpu;
 6413     struct cpuidle_state *idle;
 6414     struct sched_domain *sd;
 6415     struct perf_domain *pd;
 6416     unsigned long uclamp_util = uclamp_task_util(p);
 6417 
 6418     rcu_read_lock();
 6419     pd = rcu_dereference(rd->pd);
 6420     if (!pd || READ_ONCE(rd->overutilized))
 6421         goto fail;
 6422

/* Fast path: return the current CPU directly when this is a sync wake-up (the
 * waker sleeps right after the wakee runs), the current CPU's rq holds exactly
 * one running task, and the current CPU is in task p's allowed cpumask.
 */

 6423     cpu = smp_processor_id();
 6424     if (sync && cpu_rq(cpu)->nr_running == 1 &&
 6425         cpumask_test_cpu(cpu, p->cpus_ptr)) {
 6426         rcu_read_unlock();
 6427         return cpu;
 6428     }
 6429 
 6430     /*
 6431      * Energy-aware wake-up happens on the lowest sched_domain starting
 6432      * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu.
 6433      */

/* sd_asym_cpucapacity is set in update_top_cache_domain(): starting from the
 * lowest-level sd, walk up until an sd with SD_ASYM_CPUCAPACITY in sd->flags is
 * found, i.e. the lowest asymmetric sd covering the current CPU.
 */
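The relevant lines in kernel/sched/topology.c look roughly like this (abridged):

static void update_top_cache_domain(int cpu)
{
	struct sched_domain *sd;
	...
	/* lowest_flag_domain() walks the domains bottom-up and returns the
	 * first one that has the requested flag set. */
	sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY);
	rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
}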

 6434     sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity));

/* If prev_cpu is not in this sd's span, walk up via sd->parent until an sd
 * spans both this_cpu and prev_cpu; bail out if none does.
 */

 6435     while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
 6436         sd = sd->parent;
 6437     if (!sd)
 6438         goto fail;
 6439 
 6440     sync_entity_load_avg(&p->se);
 6441     if (!task_util_est(p))
 6442         goto unlock;
 6443 
 6444     latency_sensitive = uclamp_latency_sensitive(p);

/* boosted is true when the task's effective UCLAMP_MIN clamp value is greater
 * than 0.
 */

 6445     boosted = uclamp_boosted(p);
 6446     target_cap = boosted ? 0 : ULONG_MAX;
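A sketch of these two helpers as they appear in the Android common kernel (exact definitions vary between trees):

static inline bool uclamp_boosted(struct task_struct *p)
{
	/* boosted: the task's effective minimum clamp is above zero */
	return uclamp_eff_value(p, UCLAMP_MIN) > 0;
}

static inline bool uclamp_latency_sensitive(struct task_struct *p)
{
	/* latency_sensitive is a per-cgroup attribute of the task's group */
	struct cgroup_subsys_state *css = task_css(p, cpu_cgrp_id);
	struct task_group *tg;

	if (!css)
		return false;
	tg = container_of(css, struct task_group, css);

	return tg->latency_sensitive;
}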
 6447 
 6448     trace_sched_task_comm_info(p, latency_sensitive,
 6449                     boosted, uclamp_util);
 6450 

/* rd->pd is a singly-linked list of performance domains; with two clusters
 * there are two pds: perf_domain4 (cpu4~7) -> perf_domain0 (cpu0~3).
 */

 6451     for (; pd; pd = pd->next) {
 6452         unsigned long cur_delta, spare_cap, max_spare_cap = 0;
 6453         unsigned long base_energy_pd;
 6454         int max_spare_cap_cpu = -1;
 6455 
 6456         /* Compute the 'base' energy of the pd, without @p */

/* dst_cpu == -1: compute the pd's base energy, i.e. without task p.
 */

 6457         base_energy_pd = compute_energy(p, -1, pd);

/*
 * compute_energy() estimates the pd's energy if task p were migrated to
 * dst_cpu:
 */
 6312 /*
 6313  * compute_energy(): Estimates the energy that @pd would consume if @p was
 6314  * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
 6315  * landscape of @pd's CPUs after the task migration, and uses the Energy Model
 6316  * to compute what would be the energy if we decided to actually migrate that
 6317  * task.
 6318  */
 6319 static long
 6320 compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
 6321 {
 6322     struct cpumask *pd_mask = perf_domain_span(pd);
 6323     unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
 6324     unsigned long max_util = 0, sum_util = 0;
 6325     int cpu;
 6326  
 6327     /*
 6328      * The capacity state of CPUs of the current rd can be driven by CPUs
 6329      * of another rd if they belong to the same pd. So, account for the
 6330      * utilization of these CPUs too by masking pd with cpu_online_mask
 6331      * instead of the rd span.
 6332      *
 6333      * If an entire pd is outside of the current rd, it will not appear in
 6334      * its pd list and will not be accounted by compute_energy().
 6335      */
 6336     for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
/*
 * cpu_util_next() returns the cfs util this CPU would have after task p is
 * migrated to dst_cpu: p's util is removed if p would leave this CPU, added if
 * p would land here, and util_est is taken into account.
 */
 6337         unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu);
 6338         struct task_struct *tsk = cpu == dst_cpu ? p : NULL;
 6339  
 6340         /*
 6341          * Busy time computation: utilization clamping is not
 6342          * required since the ratio (sum_util / cpu_capacity)
 6343          * is already enough to scale the EM reported power
 6344          * consumption at the (eventually clamped) cpu_capacity.
 6345          */
/*
 * Sum each pd CPU's effective utilization (including irq, rt and dl
 * contributions); ENERGY_UTIL applies neither uclamp nor the deadline
 * bandwidth.
 */
 6346         sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
 6347                            ENERGY_UTIL, NULL);            
 6348  
 6349         /*
 6350          * Performance domain frequency: utilization clamping
 6351          * must be considered since it affects the selection
 6352          * of the performance domain frequency.
 6353          * NOTE: in case RT tasks are running, by default the
 6354          * FREQUENCY_UTIL's utilization can be max OPP.
 6355          */             
/*
 * FREQUENCY_UTIL applies uclamp and adds the deadline bandwidth instead of
 * the dl running util.
 */
 6356         cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
 6357                           FREQUENCY_UTIL, tsk); 
/*
 * Track the maximum FREQUENCY_UTIL among the CPUs of the pd.
 */
 6358         max_util = max(max_util, cpu_util);
 6359     }
 6360  
/*
 * pd->em_pd is the em_perf_domain describing this performance domain; every
 * CPU of the pd references the same structure.
 */
 6361     return em_pd_energy(pd->em_pd, max_util, sum_util);
 6362 }
 6363  
 70 /**
 71  * em_pd_energy() - Estimates the energy consumed by the CPUs of a perf. domain
 72  * @pd      : performance domain for which energy has to be estimated
 73  * @max_util    : highest utilization among CPUs of the domain
 74  * @sum_util    : sum of the utilization of all CPUs in the domain
 75  *
 76  * Return: the sum of the energy consumed by the CPUs of the domain assuming
 77  * a capacity state satisfying the max utilization of the domain.
 78  */
 79 static inline unsigned long em_pd_energy(struct em_perf_domain *pd,
 80                 unsigned long max_util, unsigned long sum_util)
 81 {
 82     unsigned long freq, scale_cpu;
 83     struct em_cap_state *cs;
 84     int i, cpu;
 85 
 86     /*
 87      * In order to predict the capacity state, map the utilization of the
 88      * most utilized CPU of the performance domain to a requested frequency,
 89      * like schedutil.
 90      */
 91     cpu = cpumask_first(to_cpumask(pd->cpus));
 92     scale_cpu = arch_scale_cpu_capacity(cpu);
/*
 * Each table (an array of em_cap_state) holds the frequency/power points of
 * one pd; every CPU in the pd reaches the same table through its
 * em_perf_domain, so a 4+4 SoC has two tables in total. The table is sorted
 * by ascending frequency, so cs is first set to the highest OPP here.
 */
 93     cs = &pd->table[pd->nr_cap_states - 1];
 94     freq = map_util_freq(max_util, cs->frequency, scale_cpu);
 95 
 96     /*
 97      * Find the lowest capacity state of the Energy Model above the
 98      * requested frequency.
 99      */
100     for (i = 0; i < pd->nr_cap_states; i++) {
101         cs = &pd->table[i];
102         if (cs->frequency >= freq)
103             break;
104     }
105 
106     /*
107      * The capacity of a CPU in the domain at that capacity state (cs)
108      * can be computed as:
109      *
110      *             cs->freq * scale_cpu
111      *   cs->cap = --------------------                          (1)
112      *                 cpu_max_freq
113      *
114      * So, ignoring the costs of idle states (which are not available in
115      * the EM), the energy consumed by this CPU at that capacity state is
116      * estimated as:
117      *
118      *             cs->power * cpu_util
119      *   cpu_nrg = --------------------                          (2)
120      *                   cs->cap
121      *
122      * since 'cpu_util / cs->cap' represents its percentage of busy time.
123      *
124      *   NOTE: Although the result of this computation actually is in
125      *         units of power, it can be manipulated as an energy value
126      *         over a scheduling period, since it is assumed to be
127      *         constant during that interval.
128      *
129      * By injecting (1) in (2), 'cpu_nrg' can be re-expressed as a product
130      * of two terms:
131      *
132      *             cs->power * cpu_max_freq   cpu_util
133      *   cpu_nrg = ------------------------ * ---------          (3)
134      *                    cs->freq            scale_cpu
135      *
136      * The first term is static, and is stored in the em_cap_state struct
137      * as 'cs->cost'.
138      *
139      * Since all CPUs of the domain have the same micro-architecture, they
140      * share the same 'cs->cost', and the same CPU capacity. Hence, the
141      * total energy of the domain (which is the simple sum of the energy of
142      * all of its CPUs) can be factorized as:
143      *
144      *            cs->cost * \Sum cpu_util
145      *   pd_nrg = ------------------------                       (4)
146      *                  scale_cpu
147      */
148     return cs->cost * sum_util / scale_cpu;
149 }
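A quick numeric sanity check of formula (4), using made-up values for a little cluster (all numbers below are hypothetical):

#include <stdio.h>

int main(void)
{
	/* One hypothetical OPP of a little cluster. cs->cost is precomputed
	 * at EM registration time as power * cpu_max_freq / freq. */
	unsigned long power        = 200;     /* mW at this OPP */
	unsigned long freq         = 1200000; /* this OPP, kHz */
	unsigned long cpu_max_freq = 1800000; /* highest OPP, kHz */
	unsigned long scale_cpu    = 512;     /* arch_scale_cpu_capacity() */
	unsigned long sum_util     = 300;     /* ENERGY_UTIL summed over the pd */

	unsigned long cost   = power * cpu_max_freq / freq; /* = 300 */
	unsigned long pd_nrg = cost * sum_util / scale_cpu; /* 300*300/512 = 175 */

	printf("cost=%lu pd_nrg=%lu\n", cost, pd_nrg);
	return 0;
}

Doubling sum_util doubles pd_nrg, and a higher (less efficient) OPP raises cost: both knobs behave exactly as formula (4) predicts.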

 6458         base_energy += base_energy_pd;
 6459 
 6460         for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
 6461             if (!cpumask_test_cpu(cpu, p->cpus_ptr))
 6462                 continue;
 6463 

/* cpu_util_next(cpu, p, cpu): the cfs util this CPU would have with task p
 * placed on it.
 */

 6464             util = cpu_util_next(cpu, p, cpu);
 6465             cpu_cap = capacity_of(cpu);

/* Where is capacity_of(cpu) set?
 */
kernel/sched/fair.c:    cpu_rq(cpu)->cpu_capacity = capacity;    /* update_cpu_capacity() */
kernel/sched/core.c:    rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;    /* sched_init() */

 7902 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
 7903 {
 7904     unsigned long capacity = arch_scale_cpu_capacity(cpu);
 7905     struct sched_group *sdg = sd->groups; 
 7906     struct max_cpu_capacity *mcc;  
 7907     unsigned long max_capacity;    
 7908     int max_cap_cpu;    
 7909     unsigned long flags;
 7910  
 7911     cpu_rq(cpu)->cpu_capacity_orig = capacity;
 7912  
 7913     capacity *= arch_scale_max_freq_capacity(sd, cpu);
/*
 * arch_scale_max_freq_capacity() is max_freq_scale: after a QoS frequency
 * cap, the CPU's current maximum frequency is rescaled against 1024, i.e.
 * 1024 * qos_freq / orig_max_freq.
 * See: https://blog.csdn.net/liglei/article/details/88733255
 */
 7914     capacity >>= SCHED_CAPACITY_SHIFT;
 7915  
 7916     mcc = &cpu_rq(cpu)->rd->max_cpu_capacity;
/*
 * root_domain's max_cpu_capacity records the largest capacity in the rd and
 * the CPU it belongs to.
 */
 7917  
 7918     raw_spin_lock_irqsave(&mcc->lock, flags);
 7919     max_capacity = mcc->val;
 7920     max_cap_cpu = mcc->cpu;
 7921  
/*
 * Update the val/cpu pair of the root_domain's max_cpu_capacity.
 */
 7922     if ((max_capacity > capacity && max_cap_cpu == cpu) ||
 7923         (max_capacity < capacity)) {   
 7924         mcc->val = capacity;
 7925         mcc->cpu = cpu; 
 7926 #ifdef CONFIG_SCHED_DEBUG
 7927         raw_spin_unlock_irqrestore(&mcc->lock, flags);
 7928         printk_deferred(KERN_INFO "CPU%d: update max cpu_capacity %lu\n",
 7929                 cpu, capacity);                
 7930         goto skip_unlock;
 7931 #endif
 7932     }
 7933     raw_spin_unlock_irqrestore(&mcc->lock, flags);
 7934  
 7935 skip_unlock: __attribute__ ((unused));
 7936     capacity = scale_rt_capacity(cpu, capacity);
 7937  
 7938     if (!capacity)
 7939         capacity = 1;
 7940  
 7941     cpu_rq(cpu)->cpu_capacity = capacity;
 7942     sdg->sgc->capacity = capacity; 
 7943     sdg->sgc->min_capacity = capacity;
 7944     sdg->sgc->max_capacity = capacity;
 7945 }

As at line 7936 above and in scale_rt_capacity() below, the final
cpu_rq(cpu)->cpu_capacity is: cpu_capacity_orig scaled by the QoS frequency
cap, minus the rq's rt and dl utilization, then scaled down by the irq
utilization: the capacity actually left for CFS on this CPU.

 7874 static unsigned long scale_rt_capacity(int cpu, unsigned long max)
 7875 {
 7876     struct rq *rq = cpu_rq(cpu);   
 7877     unsigned long used, free;
 7878     unsigned long irq;
 7879 
 7880     irq = cpu_util_irq(rq);
 7881  
 7882     if (unlikely(irq >= max))
 7883         return 1;
 7884  
 7885     used = READ_ONCE(rq->avg_rt.util_avg);
 7886     used += READ_ONCE(rq->avg_dl.util_avg);
 7887  
 7888     if (unlikely(used >= max))
 7889         return 1;
 7890  
 7891     free = max - used;
 7892  
 7893     return scale_irq_capacity(free, irq, max);
 7894 }

2477 static inline 
2478 unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max)
2479 { 
2480     util *= (max - irq);
2481     util /= max;
2482 
2483     return util;
2484 
2485 }    
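Putting the pipeline together with made-up numbers (a big CPU capped by QoS, with some rt/dl and irq pressure; all values hypothetical):

#include <stdio.h>

int main(void)
{
	unsigned long capacity       = 1024; /* arch_scale_cpu_capacity() */
	unsigned long max_freq_scale = 819;  /* ~0.8 * 1024 after a QoS cap */
	unsigned long rt_dl_util     = 100;  /* rq->avg_rt + rq->avg_dl util */
	unsigned long irq            = 50;   /* cpu_util_irq() */

	capacity = capacity * max_freq_scale >> 10;    /* QoS scaling -> 819 */
	unsigned long free = capacity - rt_dl_util;    /* minus rt/dl -> 719 */
	free = free * (capacity - irq) / capacity;     /* irq scaling -> 675 */

	/* cpu_capacity = 675: what is actually left for CFS on this CPU */
	printf("cpu_capacity=%lu\n", free);
	return 0;
}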

 6466             spare_cap = cpu_cap - util;
 6467 
 6468             /*
 6469              * Skip CPUs that cannot satisfy the capacity request.
 6470              * IOW, placing the task there would make the CPU
 6471              * overutilized. Take uclamp into account to see how
 6472              * much capacity we can get out of the CPU; this is
 6473              * aligned with schedutil_cpu_util().
 6474              */

/* The capacity a CPU must provide is affected by uclamp, so clamp the
 * computed util with the rq's and task p's clamps to get the capacity request
 * after clamping.
 */

 6475             util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
 6476             trace_sched_cfs_rq_task_util(cpu, p, util, spare_cap, cpu_cap);
 6477             if (!fits_capacity(util, cpu_cap))
 6478                 continue;

103 #define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024)
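In other words, fits_capacity(cap, max) checks cap < max * 1024/1280, i.e. the clamped utilization must stay below 80% of the CPU's remaining capacity; this is the same 20% headroom the overutilized check uses.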

 6479 
 6480             /* Always use prev_cpu as a candidate. */
 6481             if (!latency_sensitive && cpu == prev_cpu) {
 6482                 prev_delta = compute_energy(p, prev_cpu, pd);
 6483                 prev_delta -= base_energy_pd;
 6484                 best_delta = min(best_delta, prev_delta);
 6485             }
 6486 
 6487             /*
 6488              * Find the CPU with the maximum spare capacity in
 6489              * the performance domain
 6490              */
 6491             if (spare_cap > max_spare_cap) {
 6492                 max_spare_cap = spare_cap;
 6493                 max_spare_cap_cpu = cpu;
 6494             }
 6495 
 6496             if (!latency_sensitive)
 6497                 continue;
 6498 
 6499             if (idle_cpu(cpu)) {
 6500                 cpu_cap = capacity_orig_of(cpu);
 6501                 if (boosted && cpu_cap < target_cap)
 6502                     continue;
 6503                 if (!boosted && cpu_cap > target_cap)
 6504                     continue;
 6505                 idle = idle_get_state(cpu_rq(cpu));
 6506                 if (idle && idle->exit_latency > min_exit_lat &&
 6507                         cpu_cap == target_cap)
 6508                     continue;
 6509 
 6510                 if (idle)
 6511                     min_exit_lat = idle->exit_latency;
 6512                 target_cap = cpu_cap;
 6513                 best_idle_cpu = cpu;
 6514             } else if (spare_cap > max_spare_cap_ls) {
 6515                 max_spare_cap_ls = spare_cap;
 6516                 max_spare_cap_cpu_ls = cpu;
 6517             }
 6518         }
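/* Latency-sensitive selection above: prefer an idle CPU. Among idle CPUs, a
 * boosted task keeps the largest capacity_orig_of() seen so far, a
 * non-boosted task the smallest; at equal capacity, the CPU in the shallowest
 * idle state (lowest exit_latency) wins. If no suitable idle CPU exists, fall
 * back to the allowed CPU with the most spare capacity (max_spare_cap_cpu_ls).
 */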
 6519 
 6520         /* Evaluate the energy impact of using this CPU. */
 6521         if (!latency_sensitive && max_spare_cap_cpu >= 0 &&
 6522                         max_spare_cap_cpu != prev_cpu) {
 6523             cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
 6524             cur_delta -= base_energy_pd;
 6525             if (cur_delta < best_delta) {
 6526                 best_delta = cur_delta;
 6527                 best_energy_cpu = max_spare_cap_cpu;
 6528             }
 6529             trace_sched_energy_diff(base_energy_pd, base_energy, prev_delta,
 6530                         cur_delta, best_delta, prev_cpu,
 6531                         best_energy_cpu);
 6532         }
 6533 
 6534     }
 6535 unlock:
 6536     rcu_read_unlock();
 6537 
 6538     if (latency_sensitive)
 6539         return best_idle_cpu >= 0 ? best_idle_cpu : max_spare_cap_cpu_ls;
 6540 
 6541     /*
 6542      * Pick the best CPU if prev_cpu cannot be used, or if it saves at
 6543      * least 6% of the energy used by prev_cpu.
 6544      */
 6545     if (prev_delta == ULONG_MAX)
 6546         return best_energy_cpu;
 6547 
 6548     if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
 6549         return best_energy_cpu;
 6550 
 6551     return prev_cpu;
 6552 
 6553 fail:
 6554     rcu_read_unlock();
 6555 
 6556     return -1;
 6557 }
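Note the acceptance test at line 6548: best_energy_cpu replaces prev_cpu only when it saves more than (prev_delta + base_energy) >> 4, i.e. more than 1/16 ≈ 6.25% of the estimated total energy with the task left on prev_cpu (the kernel comment rounds this to 6%). The margin acts as hysteresis, avoiding task migrations for gains that are within the Energy Model's noise.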
