Updating group_capacity at the MC level
Here we need to distinguish between the MC level and the DIE level. The difference is whether sd->child is NULL: at the MC level sd->child is NULL, so update_group_capacity() falls through to update_cpu_capacity(). See update_cpu_capacity() below for the details.
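For example (the 4+4 topology here is an assumption for illustration, not taken from a specific board): on a big.LITTLE system, CPU0's MC-level sched_domain spans CPU0-CPU3 and has sd->child == NULL, so it takes the update_cpu_capacity() path; its DIE-level sched_domain spans CPU0-CPU7, its child is the MC domain, and it takes the aggregation path covered in the second half of this note.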
void update_group_capacity(struct sched_domain *sd, int cpu)
{
    struct sched_domain *child = sd->child;
    struct sched_group *group, *sdg = sd->groups;
    unsigned long capacity, min_capacity, max_capacity;
    unsigned long interval;

    interval = msecs_to_jiffies(sd->balance_interval);
    // The interval is clamped to [1, max_load_balance_interval] jiffies,
    // i.e. the load-balance period never exceeds max_load_balance_interval.
    interval = clamp(interval, 1UL, max_load_balance_interval);
    sdg->sgc->next_update = jiffies + interval;

    if (!child) {
        update_cpu_capacity(sd, cpu);
        return;
    }
    ......
}
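As a rough worked example (HZ and balance_interval are assumptions, not values read from sharkl5): with HZ = 250 and sd->balance_interval = 8 ms, msecs_to_jiffies(8) gives 2 jiffies, clamp(2, 1UL, max_load_balance_interval) leaves it at 2, so sgc->next_update becomes jiffies + 2.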
rq->cpu_capacity | CFS compute capacity of this CPU: rq->cpu_capacity = (rq->cpu_capacity_orig - rq->rt.avg.util_avg) * (rq->cpu_capacity_orig - rq->avg_irq.util_avg) / rq->cpu_capacity_orig
sd->groups->sgc->capacity | sdg->sgc->capacity = rq->cpu_capacity
sd->groups->sgc->max_capacity | sdg->sgc->max_capacity = rq->cpu_capacity
sd->groups->sgc->min_capacity | sdg->sgc->min_capacity = rq->cpu_capacity
sd->groups->sgc->next_update | jiffies + clamp(msecs_to_jiffies(sd->balance_interval), 1UL, max_load_balance_interval)
update_cpu_capacity
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
    unsigned long capacity = arch_scale_cpu_capacity(sd, cpu); // per-CPU capacity configured in the DTS
    struct sched_group *sdg = sd->groups;
    struct max_cpu_capacity *mcc;
    unsigned long max_capacity;
    int max_cap_cpu;
    unsigned long flags;

    capacity *= arch_scale_max_freq_capacity(sd, cpu); // at most 1024
    capacity >>= SCHED_CAPACITY_SHIFT; // SCHED_CAPACITY_SHIFT is 10; shifting right by 10 bits divides by 1024

    cpu_rq(cpu)->cpu_capacity_orig = capacity; // essentially the DTS-configured capacity

    mcc = &cpu_rq(cpu)->rd->max_cpu_capacity; // this struct may be updated below

    raw_spin_lock_irqsave(&mcc->lock, flags);
    max_capacity = mcc->val;
    max_cap_cpu = mcc->cpu;

    if ((max_capacity > capacity && max_cap_cpu == cpu) ||
        max_capacity < capacity) {
        /* Update the max_cpu_capacity members: the maximum capacity across
         * the whole CPU topology is kept in the root domain's max_cpu_capacity. */
        mcc->val = capacity;
        mcc->cpu = cpu;
#ifdef CONFIG_SCHED_DEBUG
        raw_spin_unlock_irqrestore(&mcc->lock, flags);
        printk_deferred(KERN_INFO "CPU%d: update max cpu_capacity %lu\n",
                        cpu, capacity);
        goto skip_unlock;
#endif
    }
    raw_spin_unlock_irqrestore(&mcc->lock, flags);

skip_unlock: __attribute__ ((unused));
    capacity = scale_rt_capacity(capacity, cpu);

    if (!capacity)
        capacity = 1;

    cpu_rq(cpu)->cpu_capacity = capacity;
    sdg->sgc->capacity = capacity;
    sdg->sgc->min_capacity = capacity;
    sdg->sgc->max_capacity = capacity;
}
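As a hedged worked example (the numbers are assumptions, not measured sharkl5 values): for a little core with arch_scale_cpu_capacity(sd, cpu) = 512 and arch_scale_max_freq_capacity(sd, cpu) = 1024 (max-frequency scaling not limiting), cpu_capacity_orig = (512 * 1024) >> 10 = 512, i.e. just the DTS-configured capacity.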
scale_rt_capacity
static unsigned long scale_rt_capacity(unsigned long max_cap, int cpu)
{
    struct rq *rq = cpu_rq(cpu);
    unsigned long used, free;
    unsigned long irq;

    irq = cpu_util_irq(rq); // rq->avg_irq.util_avg

    if (unlikely(irq >= max_cap))
        return 1;

    used = READ_ONCE(rq->rt.avg.util_avg);

    if (unlikely(used >= max_cap))
        return 1;

    free = max_cap - used;

    // util = (max_cap - rq->rt.avg.util_avg) * (max_cap - rq->avg_irq.util_avg) / max_cap
    return scale_irq_capacity(free, irq, max_cap);
}
Taking sharkl5 as an example, the kernel is built with
    defined(CONFIG_IRQ_TIME_ACCOUNTING) || \
    defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
so the following variants of cpu_util_irq() and scale_irq_capacity() are used:
static inline unsigned long cpu_util_irq(struct rq *rq)
{
    return rq->avg_irq.util_avg;
}

static inline
unsigned long scale_irq_capacity(unsigned long util, unsigned long irq,
                                 unsigned long max)
{
    util *= (max - irq);
    util /= max;

    return util;
}
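To make the formula concrete, here is a minimal user-space sketch of the arithmetic (the function name scale_rt_capacity_model and the utilization numbers are assumptions for illustration, not kernel code or measured values):

#include <stdio.h>

/* Simplified model of scale_rt_capacity() + scale_irq_capacity():
 * cfs capacity = (max_cap - rt_util) * (max_cap - irq_util) / max_cap */
static unsigned long scale_rt_capacity_model(unsigned long max_cap,
                                             unsigned long rt_util,
                                             unsigned long irq_util)
{
    unsigned long free;

    if (irq_util >= max_cap || rt_util >= max_cap)
        return 1;

    free = max_cap - rt_util;
    return free * (max_cap - irq_util) / max_cap;
}

int main(void)
{
    /* Assumed numbers: cpu_capacity_orig = 1024, rq->rt.avg.util_avg = 100,
     * rq->avg_irq.util_avg = 50. */
    unsigned long cap = scale_rt_capacity_model(1024, 100, 50);

    printf("cfs capacity = %lu\n", cap); /* (1024 - 100) * (1024 - 50) / 1024 = 878 */
    return 0;
}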
To summarize, update_cpu_capacity() sets the following:
rq->cpu_capacity_orig | cpu_rq(cpu)->cpu_capacity_orig = (arch_scale_cpu_capacity(sd, cpu) * arch_scale_max_freq_capacity(sd, cpu)) >> SCHED_CAPACITY_SHIFT | roughly arch_scale_cpu_capacity(sd, cpu), i.e. the DTS-configured value
rq->rd->max_cpu_capacity.val | maximum compute capacity among all CPUs; updated when the condition above is met
rq->rd->max_cpu_capacity.cpu | the CPU id holding that maximum capacity
rq->cpu_capacity | CFS compute capacity of this CPU: rq->cpu_capacity = (rq->cpu_capacity_orig - rq->rt.avg.util_avg) * (rq->cpu_capacity_orig - rq->avg_irq.util_avg) / rq->cpu_capacity_orig
sd->groups->sgc->capacity | sdg->sgc->capacity = rq->cpu_capacity
sd->groups->sgc->max_capacity | sdg->sgc->max_capacity = rq->cpu_capacity
sd->groups->sgc->min_capacity | sdg->sgc->min_capacity = rq->cpu_capacity
Updating group_capacity at the DIE level
void update_group_capacity(struct sched_domain *sd, int cpu)
{
    struct sched_domain *child = sd->child;
    struct sched_group *group, *sdg = sd->groups;
    unsigned long capacity, min_capacity, max_capacity;
    unsigned long interval;

    interval = msecs_to_jiffies(sd->balance_interval);
    // The interval is clamped to [1, max_load_balance_interval] jiffies,
    // i.e. the load-balance period never exceeds max_load_balance_interval.
    interval = clamp(interval, 1UL, max_load_balance_interval);
    sdg->sgc->next_update = jiffies + interval;

    if (!child) {
        update_cpu_capacity(sd, cpu);
        return;
    }

    capacity = 0;
    min_capacity = ULONG_MAX;
    max_capacity = 0;

    if (child->flags & SD_OVERLAP) {
        /*
         * SD_OVERLAP domains cannot assume that child groups
         * span the current group.
         */
        for_each_cpu(cpu, sched_group_span(sdg)) {
            struct sched_group_capacity *sgc;
            struct rq *rq = cpu_rq(cpu);

            /*
             * build_sched_domains() -> init_sched_groups_capacity()
             * gets here before we've attached the domains to the
             * runqueues.
             *
             * Use capacity_of(), which is set irrespective of domains
             * in update_cpu_capacity().
             *
             * This avoids capacity from being 0 and
             * causing divide-by-zero issues on boot.
             */
            if (unlikely(!rq->sd)) {
                capacity += capacity_of(cpu);
            } else {
                sgc = rq->sd->groups->sgc;
                capacity += sgc->capacity;
            }

            min_capacity = min(capacity, min_capacity);
            max_capacity = max(capacity, max_capacity);
        }
    } else { // from earlier code tracing, execution appears to take this branch directly
        /*
         * !SD_OVERLAP domains can assume that child groups
         * span the current group.
         */
        group = child->groups;
        do {
            struct sched_group_capacity *sgc = group->sgc;

            capacity += sgc->capacity;
            min_capacity = min(sgc->min_capacity, min_capacity);
            max_capacity = max(sgc->max_capacity, max_capacity);
            group = group->next;
        } while (group != child->groups); // this loop effectively computes the capacity of one cluster's group
    }

    // sdg is sd->groups, so these writes update the values seen through sd as well
    sdg->sgc->capacity = capacity;
    sdg->sgc->min_capacity = min_capacity;
    sdg->sgc->max_capacity = max_capacity;
}
sd->groups->sgc->capacity | sum of the child groups' sgc->capacity
sd->groups->sgc->max_capacity | maximum of the child groups' sgc->max_capacity
sd->groups->sgc->min_capacity | minimum of the child groups' sgc->min_capacity
sd->groups->sgc->next_update | jiffies + clamp(msecs_to_jiffies(sd->balance_interval), 1UL, max_load_balance_interval)
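A minimal user-space sketch of the !SD_OVERLAP aggregation above (the per-CPU capacities are assumptions for a 4+4 system, not values from a real trace): at the DIE level, the group spanning one cluster ends up with the sum of that cluster's per-CPU capacities, while min/max track the per-CPU extremes.

#include <stdio.h>

int main(void)
{
    /* Assumed per-CPU capacities of one 4-CPU little cluster; each value
     * plays the role of a child (MC-level) group's sgc->capacity. */
    unsigned long child_capacity[] = { 380, 410, 395, 402 };
    unsigned long capacity = 0, min_capacity = ~0UL, max_capacity = 0;
    int i;

    /* Mirrors the !SD_OVERLAP do/while loop: sum the child groups and
     * track the per-child extremes. */
    for (i = 0; i < 4; i++) {
        capacity += child_capacity[i];
        if (child_capacity[i] < min_capacity)
            min_capacity = child_capacity[i];
        if (child_capacity[i] > max_capacity)
            max_capacity = child_capacity[i];
    }

    printf("capacity=%lu min_capacity=%lu max_capacity=%lu\n",
           capacity, min_capacity, max_capacity);
    /* prints capacity=1587 min_capacity=380 max_capacity=410 */
    return 0;
}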
So what are the initial values of sgc->capacity, sgc->min_capacity and sgc->max_capacity?
They are initialized in topology.c:
sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg));
sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
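As a worked example of these initial placeholders (assuming a group spanning 4 CPUs): sg->sgc->capacity starts at 1024 * 4 = 4096 and min_capacity = max_capacity = 1024 (SCHED_CAPACITY_SCALE); the first pass through update_group_capacity() described above then overwrites them with the real per-CPU and per-cluster values.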