Kernel中的CPU负载均衡是对调度器的增强,在多处理器上(SMP/NUMA),必须要考虑CPU的负载均衡,包括:1.CPU负荷尽可能公平地在所有处理器上共享。2.内核必须能够将进程从一个CPU迁移到另一个CPU上。
在Kernel中调度器进行负载均衡的时机有4个:
1.在时钟中断时,周期性调度器scheduler_tick会被调用,它在最后会调用trigger_load_balance函数,该函数可以触发负载均衡软中断。
2.在主调度器schedule中会判断当前CPU的rq上的进程个数是否为0,如果为0的话,就调用idle_balance函数,进行负载均衡。
3.在创建进程时,在函数sched_exec会进行负载均衡。
4.在使用try_to_wake_up函数唤醒进程时,会进行负载均衡。
下面对这四种情况一一介绍。
1.周期性负载均衡。
scheduler_tick会调用trigger_load_balance,trigger_load_balance的代码如下:
/*
 * trigger_load_balance - called from scheduler_tick() on every timer tick.
 *
 * Decides whether the SCHED_SOFTIRQ load-balancing softirq needs to be
 * raised on this CPU, and (under CONFIG_NO_HZ) manages the "idle load
 * balancer" nomination: one non-tickless CPU that performs balancing on
 * behalf of all CPUs whose ticks are stopped.
 *
 * @rq:  this CPU's runqueue
 * @cpu: this CPU's id
 */
static inline void trigger_load_balance(struct rq *rq, int cpu)
{
#ifdef CONFIG_NO_HZ
	/*
	 * If we were in the nohz mode recently and busy at the current
	 * scheduler tick, then check if we need to nominate new idle
	 * load balancer.
	 *
	 * (nohz mode: when a CPU goes idle, its periodic tick is stopped
	 * to save power.)
	 */
	if (rq->in_nohz_recently && !rq->idle_at_tick) {
		/* This CPU was tickless recently but is busy now. */
		rq->in_nohz_recently = 0;

		if (atomic_read(&nohz.load_balancer) == cpu) {
			/*
			 * This CPU used to be the idle load balancer but is
			 * no longer idle: resign the role.
			 */
			cpumask_clear_cpu(cpu, nohz.cpu_mask);
			atomic_set(&nohz.load_balancer, -1);
		}

		if (atomic_read(&nohz.load_balancer) == -1) {
			/* Nominate a new idle load balancer and wake it. */
			int ilb = find_new_ilb(cpu);

			if (ilb < nr_cpu_ids)
				resched_cpu(ilb); /* rescheduling makes it run the idle load balance */
		}
	}

	/*
	 * If this cpu is idle and doing idle load balancing for all the
	 * cpus with ticks stopped, is it time for that to stop?
	 *
	 * All online CPUs are in nohz mode, i.e. the whole system is idle,
	 * so there is nothing to balance.
	 */
	if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
	    cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
		resched_cpu(cpu); /* reschedule to stop doing idle load balancing */
		return;
	}

	/*
	 * If this cpu is idle and the idle load balancing is done by
	 * someone else, then no need raise the SCHED_SOFTIRQ
	 * (another CPU will balance on our behalf).
	 */
	if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
	    cpumask_test_cpu(cpu, nohz.cpu_mask))
		return;
#endif
	/* Don't need to rebalance while attached to NULL domain */
	if (time_after_eq(jiffies, rq->next_balance) &&
	    likely(!on_null_domain(cpu)))
		raise_softirq(SCHED_SOFTIRQ); /* handler: run_rebalance_domains() */
}
下面看一下负载均衡软中断的处理函数 run_rebalance_domains 的代码:
/*
* run_rebalance_domains is triggered when needed from the scheduler tick.
* In CONFIG_NO_HZ case, the idle load balance owner will do the
* rebalancing for all the cpus for whom scheduler ticks are stopped.
*/
static void run_rebalance_domains(struct softirq_action *h)
{
int this_cpu = smp_processor_id();
struct rq *this_rq = cpu_rq(this_cpu);
enum cpu_idle_type idle = this_rq->idle_at_tick ? CPU_IDLE : CPU_NOT_IDLE; //后面负载均衡时会根据该cpu的状态(闲/忙),选择不同饿参数。
rebalance_domains(this_cpu, idle); //检查该cpu所处的调度域,看是否存在负载不平衡现象,如果存在则进行负载平衡。详情见下文。
#ifdef CONFIG_NO_HZ
/*
* If this cpu is the owner for idle load balancing, then do the
* balancing on behalf of the other idle cpus whose ticks are
* stopped.
*/
if (this_rq->idle_at_tick &&
atomic_read(&nohz.load_balancer) == this_cpu) {
struct rq *rq;
int balance_cpu;
for_each_cpu(balance_cpu, nohz.cpu_mask) {
if (balance_cpu == this_cpu)
continue;
/*
* If this cpu gets work to do, stop the load balancing
* work being done for other cpus. Next load
* balancing owner will pick it up.
*/
if (need_resched()) //如果当前cpu有工作要做,则停止进行负载平衡。
break;
rebalance_domains(balance_cpu, CPU_IDLE); //当前cpu替balance_cpu进行负载均衡,即将忙的cpu上的任务迁移到balance_cpu上(如果存在不平衡的话)
rq = cpu_rq(balance_cpu);
if (time_after(this_rq->next_balance, rq->next_balance)) //更新本rq下一次load balance的时间为所有被它代理的rq执行load balance中最早的,不明白原因。
* run_rebalance_domains is triggered when needed from the scheduler tick.
* In CONFIG_NO_HZ case, the idle load balance owner will do the
* rebalancing for all the cpus for whom scheduler ticks are stopped.
*/
static void run_rebalance_domains(struct softirq_action *h)
{
int this_cpu = smp_processor_id();
struct rq *this_rq = cpu_rq(this_cpu);
enum cpu_idle_type idle = this_rq->idle_at_tick ? CPU_IDLE : CPU_NOT_IDLE; //后面负载均衡时会根据该cpu的状态(闲/忙),选择不同饿参数。
rebalance_domains(this_cpu, idle); //检查该cpu所处的调度域,看是否存在负载不平衡现象,如果存在则进行负载平衡。详情见下文。
#ifdef CONFIG_NO_HZ
/*
* If this cpu is the owner for idle load balancing, then do the
* balancing on behalf of the other idle cpus whose ticks are
* stopped.
*/
if (this_rq->idle_at_tick &&
atomic_read(&nohz.load_balancer) == this_cpu) {
struct rq *rq;
int balance_cpu;
for_each_cpu(balance_cpu, nohz.cpu_mask) {
if (balance_cpu == this_cpu)
continue;
/*
* If this cpu gets work to do, stop the load balancing
* work being done for other cpus. Next load
* balancing owner will pick it up.
*/
if (need_resched()) //如果当前cpu有工作要做,则停止进行负载平衡。
break;
rebalance_domains(balance_cpu, CPU_IDLE); //当前cpu替balance_cpu进行负载均衡,即将忙的cpu上的任务迁移到balance_cpu上(如果存在不平衡的话)
rq = cpu_rq(balance_cpu);
if (time_after(this_rq->next_balance, rq->next_balance)) //更新本rq下一次load balance的时间为所有被它代理的rq执行load balance中最早的,不明白原因。