should_we_balance
Decide whether the current CPU should run the balance at all.
should_we_balance() checks whether env->dst_cpu is entitled to balance: dst_cpu must be either the first idle CPU in this sched_group or, failing that, the group's first CPU. Restricting the balance to a single CPU per group keeps several CPUs from pulling for the same imbalance at once.
The conditions for balancing on the current CPU are:
(1) In the NEWLY_IDLE case, load balance always proceeds.
(2) Otherwise, the balance runs on the first idle CPU of the group.
(3) If the group has no idle CPU, it runs on the group's first CPU.
static int should_we_balance(struct lb_env *env)
{
struct sched_group *sg = env->sd->groups;
int cpu, balance_cpu = -1;
/*
* Ensure the balancing environment is consistent; can happen
* when the softirq triggers 'during' hotplug.
*/
if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
return 0;
/*
* In the newly idle case, we will allow all the cpu's
* to do the newly idle load balance.
*/
/* (7.2.1) If this CPU is CPU_NEWLY_IDLE, it always qualifies to balance */
if (env->idle == CPU_NEWLY_IDLE)
return 1;
/* Try to find first idle cpu */
/* (7.2.2) The first idle CPU of this sched_group is the preferred balancer */
// If that first idle CPU is dst_cpu, the load balance proceeds
for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
if (!idle_cpu(cpu))
continue;
balance_cpu = cpu;
break;
}
/* (7.2.3) No idle CPU: fall back to the first CPU of this sched_group */
// If the group's first CPU is dst_cpu, the load balance proceeds
if (balance_cpu == -1)
balance_cpu = group_balance_cpu(sg);//the first cpu(busiest) in this sched group
/*
* First idle cpu or the first cpu(busiest) in this sched group
* is eligible for doing load balancing at this and above domains.
*/
return balance_cpu == env->dst_cpu;
}
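For reference, the two helpers used above are tiny. In kernels of roughly this vintage they live in kernel/sched/topology.c and look like the sketch below (the exact form varies by version):

/* Sketch of the group-balance helpers (kernel/sched/topology.c, ~v4.14). */
struct cpumask *group_balance_mask(struct sched_group *sg)
{
	/* CPUs allowed to act as the balancing CPU for this group */
	return to_cpumask(sg->sgc->cpumask);
}

int group_balance_cpu(struct sched_group *sg)
{
	/* the "first cpu in this sched group" used as the fallback balancer */
	return cpumask_first(group_balance_mask(sg));
}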
load balance
In a pull, the current CPU pulls tasks from the busiest CPU over to itself to run.
/*
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
*/
// Find the most loaded CPU and migrate some of the busiest CPU's tasks to the local CPU.
// Returns the number of tasks moved.
static int load_balance(int this_cpu, struct rq *this_rq,
struct sched_domain *sd, enum cpu_idle_type idle,
int *continue_balancing)
{
int ld_moved, cur_ld_moved, active_balance = 0;
struct sched_domain *sd_parent = lb_sd_parent(sd) ? sd->parent : NULL;
struct sched_group *group;
struct rq *busiest;
struct rq_flags rf;
struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
/* (7.1) Build the lb_env data structure that load balancing needs: */
struct lb_env env = {
.sd = sd, // this CPU's sched_domain at this topology level
.dst_cpu = this_cpu, // the destination CPU is this CPU
.dst_rq = this_rq, // the destination rq is this CPU's rq
/* Some tasks cannot be migrated to dst_cpu because of their cpus_allowed
settings; when that happens, another CPU is picked from dst_cpu's group.
*/
.dst_grpmask = sched_group_span(sd->groups),
.idle = idle,
// the break threshold advances in units of sched_nr_migrate_break
.loop_break = sched_nr_migrate_break,
.cpus = cpus,
.fbq_type = all,
// init the list; tasks to be migrated are parked here temporarily
.tasks = LIST_HEAD_INIT(env.tasks),
};
cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
schedstat_inc(sd->lb_count[idle]);
redo:
if (!should_we_balance(&env)) {
*continue_balancing = 0;
goto out_balanced;
}
/* (7.3) Find the busiest sched_group on this level's list of sched_groups */
group = find_busiest_group(&env);
if (!group) {
schedstat_inc(sd->lb_nobusyg[idle]); // no busiest group found: bump the stat
goto out_balanced;
}
/* (7.4) Find the busiest rq inside the busiest sched_group, i.e. the rq of its most loaded CPU */
busiest = find_busiest_queue(&env, group);
if (!busiest) {
schedstat_inc(sd->lb_nobusyq[idle]);
goto out_balanced;
}
BUG_ON(busiest == env.dst_rq);
// env.imbalance is the amount of load that needs to move
schedstat_add(sd->lb_imbalance[idle], env.imbalance);
env.src_cpu = busiest->cpu;
env.src_rq = busiest;
ld_moved = 0;
/* (7.5) Is busiest's runnable count > 1? There must be at least one task that can be pulled away */
if (busiest->nr_running > 1) {
/*
* Attempt to move tasks. If find_busiest_group has found
* an imbalance but busiest->nr_running <= 1, the group is
* still unbalanced. ld_moved simply stays zero, so it is
* correctly treated as an imbalance.
*/
env.flags |= LBF_ALL_PINNED;
// upper bound on the number of tasks to migrate
env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
more_balance:
rq_lock_irqsave(busiest, &rf);
update_rq_clock(busiest);
/*
* cur_ld_moved - load moved in current iteration
* ld_moved - cumulative load moved across iterations
*/
cur_ld_moved = detach_tasks(&env, &rf); // tasks detached in this round
/*
* We've detached some tasks from busiest_rq. Every
* task is masked "TASK_ON_RQ_MIGRATING", so we can safely
* unlock busiest->lock, and we are able to be sure
* that nobody can manipulate the tasks in parallel.
* See task_rq_lock() family for the details.
*/
rq_unlock(busiest, &rf);
if (cur_ld_moved) {
// attach_tasks() enqueues the detached tasks on the new rq
attach_tasks(&env);
ld_moved += cur_ld_moved; // ld_moved accumulates tasks moved across rounds
}
local_irq_restore(rf.flags);
/* (7.9) LBF_NEED_BREAK is set: the balance is not finished yet;
the loop only stepped out for a breather and goes straight back. */
/* LBF_NEED_BREAK is set inside detach_tasks() (sketched after this function),
which runs with the rq spinlock held; holding a spinlock too long causes
problems, hence the periodic break. */
if (env.flags & LBF_NEED_BREAK) {
env.flags &= ~LBF_NEED_BREAK;
goto more_balance;
}
/*
* Revisit (affine) tasks on src_cpu that couldn't be moved to
* us and move them to an alternate dst_cpu in our sched_group
* where they can run. The upper limit on how many times we
* iterate on same src_cpu is dependent on number of cpus in our
* sched_group.
*
* This changes load balance semantics a bit on who can move
* load to a given_cpu. In addition to the given_cpu itself
* (or a ilb_cpu acting on its behalf where given_cpu is
* nohz-idle), we now have balance_cpu in a position to move
* load to given_cpu. In rare situations, this may cause
* conflicts (balance_cpu and given_cpu/ilb_cpu deciding
* _independently_ and at _same_ time to move some load to
* given_cpu) causing excess load to be moved to given_cpu.
* This however should not happen so much in practice and
* moreover subsequent load balance cycles should correct the
* excess load moved.
*/
/* (7.10) LBF_DST_PINNED is set and env.imbalance > 0:
some tasks on src_cpu could not move to dst_cpu because of affinity,
but can move to new_dst_cpu in the same sched_group.
Switch dst_cpu to new_dst_cpu and restart the balance.
*/
if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
/*
The load balance path uses a number of flags. LBF_SOME_PINNED means that
dst rq is not in some task's cpus_allowed mask. If such a task could still
migrate to another CPU of the group, LBF_DST_PINNED is set as well; in that
case a new target CPU is picked from the group and the balance continues.
Clearly LBF_DST_PINNED can never be set during a NEWLY IDLE balance,
because there dst_grpmask is NULL.
*/
/* Prevent to re-select dst_cpu via env's cpus */
cpumask_clear_cpu(env.dst_cpu, env.cpus);
env.dst_rq = cpu_rq(env.new_dst_cpu);
env.dst_cpu = env.new_dst_cpu;
env.flags &= ~LBF_DST_PINNED;
env.loop = 0;
env.loop_break = sched_nr_migrate_break;
/*
* Go back to "more_balance" rather than "redo" since we
* need to continue with same src_cpu.
*/
goto more_balance;
}
/*
* We failed to reach balance because of affinity.
*/
/* (7.11) LBF_SOME_PINNED is set: some tasks failed to migrate because of affinity.
Mark the parent sd's sgc->imbalance so the parent domain becomes more likely
to rebalance.
*/
if (sd_parent) { // as seen from the parent level (e.g. MC)
int *group_imbalance = &sd_parent->groups->sgc->imbalance;
if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
*group_imbalance = 1;
}
/* All tasks on this runqueue were pinned by CPU affinity */
/* (7.12) If LBF_ALL_PINNED stayed set, affinity kept every single task on
busiest_cpu from migrating, even to other CPUs in dst_cpu's sched_group.
Drop the busiest CPU from the candidate cpumask and redo the whole flow:
find_busiest_group -> find_busiest_queue -> detach_tasks -> attach_tasks
*/
if (unlikely(env.flags & LBF_ALL_PINNED)) {
cpumask_clear_cpu(cpu_of(busiest), cpus);
/*
* Attempting to continue load balancing at the current
* sched_domain level only makes sense if there are
* active CPUs remaining as possible busiest CPUs to
* pull load from which are not contained within the
* destination group that is receiving any migrated
* load.
*/
if (!cpumask_subset(cpus, env.dst_grpmask)) {
env.loop = 0;
env.loop_break = sched_nr_migrate_break;
goto redo;
}
goto out_all_pinned;
}
}
/* (7.13) After several rounds of trying, ld_moved is still 0: the pull
failed. Fall back to the aggressive approach, i.e. have the busiest CPU
actively push a task to the local CPU. */
if (!ld_moved) {
schedstat_inc(sd->lb_failed[idle]);
/*
* Increment the failure counter only on periodic balance.
* We do not want newidle balance, which can be very
* frequent, pollute the failure counter causing
* excessive cache_hot migrations and active balances.
*/
if (idle != CPU_NEWLY_IDLE)
if (env.src_grp_nr_running > 1)
sd->nr_balance_failed++;
// one last attempt: migrate a single task via active balance
if (need_active_balance(&env)) {
unsigned long flags;
raw_spin_lock_irqsave(&busiest->lock, flags);
/* don't kick the active_load_balance_cpu_stop,
* if the curr task on busiest cpu can't be
* moved to this_cpu
*/
/* (7.15) If this CPU is not in busiest->curr's affinity mask, give up */
if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
raw_spin_unlock_irqrestore(&busiest->lock,
flags);
env.flags |= LBF_ALL_PINNED;
goto out_one_pinned;
}
/*
* ->active_balance synchronizes accesses to
* ->active_balance_work. Once set, it's cleared
* only after active load balance is finished.
*/
if (!busiest->active_balance) { // kick off active load balancing
busiest->active_balance = 1;
busiest->push_cpu = this_cpu;
active_balance = 1;
}
raw_spin_unlock_irqrestore(&busiest->lock, flags);
if (active_balance) { /* (7.16) migrate busiest->curr away from its current CPU */
/*
This is the most aggressive form of load balance: a stop-class thread
(as introduced earlier, stop > deadline > realtime > fair > idle) preempts
whatever is running on the src rq and requeues it as runnable, which brings
even the running task into the scope of load balancing.
*/
/*
While this rq is doing fair-class load balance, stop_one_cpu_nowait()
stops the CPU's current task, and active_load_balance_cpu_stop() then
moves tasks from the busiest CPU over to the idle one.
*/
stop_one_cpu_nowait(cpu_of(busiest),
active_load_balance_cpu_stop, busiest,
&busiest->active_balance_work);
}
/* We've kicked active balancing, force task migration. */
sd->nr_balance_failed = sd->cache_nice_tries+1;
}
} else
sd->nr_balance_failed = 0;
// tune the load balance interval
if (likely(!active_balance)) {
/* We were unbalanced, so reset the balancing interval */
sd->balance_interval = sd->min_interval; // rebalance again as soon as possible
} else {
/*
* If we've begun active balancing, start to back off. This
* case may not be covered by the all_pinned logic if there
* is only 1 task on the busy runqueue (because we don't call
* detach_tasks).
*/
if (sd->balance_interval < sd->max_interval)
sd->balance_interval *= 2; // back off: delay the next balance
}
goto out;
out_balanced:
/*
* We reach balance although we may have faced some affinity
* constraints. Clear the imbalance flag if it was set.
*/
if (sd_parent) {
int *group_imbalance = &sd_parent->groups->sgc->imbalance;
if (*group_imbalance)
*group_imbalance = 0;
}
out_all_pinned:
/*
* We reach balance because all tasks are pinned at this level so
* we can't migrate them. Let the imbalance flag set so parent level
* can try to migrate them.
*/
schedstat_inc(sd->lb_balanced[idle]);
sd->nr_balance_failed = 0;
out_one_pinned:
ld_moved = 0; // report zero tasks moved
/*
* idle_balance() disregards balance intervals, so we could repeatedly
* reach this code, which would lead to balance_interval skyrocketing
* in a short amount of time. Skip the balance_interval increase logic
* to avoid that.
*/
if (env.idle == CPU_NEWLY_IDLE)
goto out;
/* tune up the balancing interval */
// LBF_ALL_PINNED: every candidate task was pinned by affinity
if (((env.flags & LBF_ALL_PINNED) &&
// MAX_PINNED_INTERVAL is 512
sd->balance_interval < MAX_PINNED_INTERVAL) ||
(sd->balance_interval < sd->max_interval))
// nothing needed to move; lengthen the balance interval, since the
// system is currently either balanced or unmovable
sd->balance_interval *= 2;
out:
return ld_moved; // number of tasks moved
}
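The loop_break / LBF_NEED_BREAK dance above relies on detach_tasks() bailing out periodically so the rq lock is not held for too long. A trimmed sketch of that loop, based on mainline ~v4.x fair.c (load accounting and a few checks omitted; the vendor variant quoted in this post also takes an rq_flags argument):

/* Trimmed sketch of detach_tasks() -- see kernel/sched/fair.c for the real thing. */
static int detach_tasks(struct lb_env *env)
{
	struct list_head *tasks = &env->src_rq->cfs_tasks;
	struct task_struct *p;
	unsigned long load;
	int detached = 0;

	while (!list_empty(tasks)) {
		p = list_first_entry(tasks, struct task_struct, se.group_node);

		env->loop++;
		/* We've scanned as many tasks as we are allowed to: quit */
		if (env->loop > env->loop_max)
			break;

		/* take a breather every sched_nr_migrate_break tasks */
		if (env->loop > env->loop_break) {
			env->loop_break += sched_nr_migrate_break;
			env->flags |= LBF_NEED_BREAK; /* load_balance() will come back */
			break;
		}

		if (!can_migrate_task(p, env))
			goto next;

		load = task_h_load(p);
		if ((load / 2) > env->imbalance)
			goto next;

		detach_task(p, env);
		list_add(&p->se.group_node, &env->tasks);

		detached++;
		env->imbalance -= load;

		/* Enough load moved: stop */
		if (env->imbalance <= 0)
			break;

		continue;
next:
		list_move_tail(&p->se.group_node, tasks);
	}
	return detached;
}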
When pulls have failed a certain number of times, push takes over. First, the ways a pull can fail:
1. CPU affinity: the task is pinned to specific CPUs and may not run anywhere else, so it cannot be migrated.
2. The task is running on a CPU right now, or last ran so recently that plenty of its data is still live in that CPU's cache. Such a task's cache is called hot, and we avoid migrating cache-hot tasks whenever possible. Both checks live in can_migrate_task(), sketched below.
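A condensed sketch of can_migrate_task(), again following mainline ~v4.x (throttling, NUMA locality, and the new_dst_cpu search are omitted):

/* Condensed sketch of can_migrate_task() -- see kernel/sched/fair.c. */
static int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
	int tsk_cache_hot;

	/* 1) Affinity: dst_cpu is not in the task's allowed mask */
	if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) {
		env->flags |= LBF_SOME_PINNED;
		/* ...may pick a new_dst_cpu in the group and set LBF_DST_PINNED... */
		return 0;
	}

	/* At least this task was not pinned against dst_cpu */
	env->flags &= ~LBF_ALL_PINNED;

	/* 2a) The task is running right now: it cannot be detached */
	if (task_running(env->src_rq, p))
		return 0;

	/*
	 * 2b) Cache-hot: the task ran too recently. Migrate it anyway only
	 * when we are desperate, i.e. the domain has already failed too many
	 * balance attempts (nr_balance_failed > cache_nice_tries).
	 */
	tsk_cache_hot = task_hot(p, env);
	if (!tsk_cache_hot ||
	    env->sd->nr_balance_failed > env->sd->cache_nice_tries)
		return 1;

	return 0;
}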
Push, i.e. active_load_balance, exists mainly to migrate the task that is running right now, or one whose last run was only moments ago.
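Whether load_balance() escalates to a push is decided by need_active_balance(). Its decisive check in mainline ~v4.x weighs the failure counter against cache_nice_tries (a sketch; the ASYM_PACKING and reduced-capacity special cases are omitted):

/* Sketch of need_active_balance() -- special cases elided. */
static int need_active_balance(struct lb_env *env)
{
	struct sched_domain *sd = env->sd;

	/* ...ASYM_PACKING / reduced-capacity special cases elided... */

	/* Push only after enough pull attempts have failed */
	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries + 2);
}

This is presumably also why load_balance() sets sd->nr_balance_failed = sd->cache_nice_tries+1 after kicking the stopper: with the counter above cache_nice_tries, can_migrate_task() will let even cache-hot tasks move, forcing the migration through.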
active_load_balance_cpu_stop
/*
A stop-class thread is used (as introduced earlier: stop > deadline >
realtime > fair > idle). The stop thread halts whatever fair (or other
below-stop) task is currently running, then moves load from the busiest
CPU onto the idle CPU.
*/
/*
* active_load_balance_cpu_stop is run by cpu stopper. It pushes
* running tasks off the busiest CPU onto idle CPUs. It requires at
* least 1 task to be running on each physical CPU where possible, and
* avoids physical / logical imbalances.
*/
static int active_load_balance_cpu_stop(void *data)
{
struct rq *busiest_rq = data;
int busiest_cpu = cpu_of(busiest_rq);
int target_cpu = busiest_rq->push_cpu;
struct rq *target_rq = cpu_rq(target_cpu);
struct sched_domain *sd = NULL;
struct task_struct *p = NULL;
struct rq_flags rf;
struct task_struct *push_task = NULL;
int push_task_detached = 0;
struct lb_env env = {
.sd = sd,
.dst_cpu = target_cpu,
.dst_rq = target_rq,
.src_cpu = busiest_rq->cpu,
.src_rq = busiest_rq,
.idle = CPU_IDLE, // destination is treated as an idle CPU
.flags = 0,
.loop = 0,
};
rq_lock_irq(busiest_rq, &rf);
/*
* Between queueing the stop-work and running it is a hole in which
* CPUs can become inactive. We should not move tasks from or to
* inactive CPUs.
*/
if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
goto out_unlock;
// i.e. this must be running on the busiest CPU itself, which is exactly what the name 'push' suggests
/* make sure the requested cpu hasn't gone down in the meantime */
if (unlikely(busiest_cpu != smp_processor_id() ||
!busiest_rq->active_balance))
goto out_unlock;
/* Is there any task to move? */
if (busiest_rq->nr_running <= 1)
goto out_unlock;
/*
* This condition is "impossible", if it occurs
* we need to fix it. Originally reported by
* Bjorn Helgaas on a 128-cpu setup.
*/
BUG_ON(busiest_rq == target_rq);
push_task = busiest_rq->push_task;
if (push_task) {
if (task_on_rq_queued(push_task) &&
push_task->state == TASK_RUNNING &&
task_cpu(push_task) == busiest_cpu &&
cpu_online(target_cpu)) {
update_rq_clock(busiest_rq);
detach_task(push_task, &env, &rf);
push_task_detached = 1;
}
goto out_unlock;
}
/* Search for an sd spanning us and the target CPU. */
rcu_read_lock();
for_each_domain(target_cpu, sd) {
if ((sd->flags & SD_LOAD_BALANCE) &&
cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
break;
}
if (likely(sd)) {
struct lb_env env = {
.sd = sd,
.dst_cpu = target_cpu,
.dst_rq = target_rq,
.src_cpu = busiest_rq->cpu,
.src_rq = busiest_rq,
.idle = CPU_IDLE,
/*
* can_migrate_task() doesn't need to compute new_dst_cpu
* for active balancing. Since we have CPU_IDLE, but no
* @dst_grpmask we need to make that test go away with lying
* about DST_PINNED.
*/
.flags = LBF_DST_PINNED,
};
schedstat_inc(sd->alb_count);
update_rq_clock(busiest_rq);
p = detach_one_task(&env, &rf);
if (p) {
schedstat_inc(sd->alb_pushed);
/* Active balancing done, reset the failure counter. */
sd->nr_balance_failed = 0;
} else {
schedstat_inc(sd->alb_failed);
}
}
rcu_read_unlock();
out_unlock:
busiest_rq->active_balance = 0;
if (push_task)
busiest_rq->push_task = NULL;
rq_unlock(busiest_rq, &rf);
if (push_task) {
if (push_task_detached)
attach_one_task(target_rq, push_task);
put_task_struct(push_task);
}
if (p)
attach_one_task(target_rq, p);
local_irq_enable();
return 0;
}
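On the receiving side, the detached task is handed to the target rq via attach_one_task(); roughly (a ~v4.x sketch):

/* Sketch of attach_one_task() -- take the destination rq lock and enqueue. */
static void attach_one_task(struct rq *rq, struct task_struct *p)
{
	struct rq_flags rf;

	rq_lock(rq, &rf);
	update_rq_clock(rq);
	attach_task(rq, p); /* enqueue p and check whether it should preempt */
	rq_unlock(rq, &rf);
}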