Linux Load Balancing
Core Data Structures
struct sched_domain {
/* These fields must be setup */
struct sched_domain *parent; /* top domain must be null terminated */
struct sched_domain *child; /* bottom domain must be null terminated */
struct sched_group *groups; /* the balancing groups of the domain */
unsigned long min_interval; /* Minimum balance interval ms */
unsigned long max_interval; /* Maximum balance interval ms */
unsigned int busy_factor; /* less balancing by factor if busy */
unsigned int imbalance_pct; /* No balance until over watermark */
unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */
unsigned int busy_idx;
unsigned int idle_idx;
unsigned int newidle_idx;
unsigned int wake_idx;
unsigned int forkexec_idx;
unsigned int smt_gain;
int nohz_idle; /* NOHZ IDLE status */
int flags; /* See SD_* */
int level;
/* Runtime fields. */
unsigned long last_balance; /* init to jiffies. units in jiffies */
unsigned int balance_interval; /* initialise to 1. units in ms. */
unsigned int nr_balance_failed; /* initialise to 0 */
/* idle_balance() stats */
u64 max_newidle_lb_cost;
unsigned long next_decay_max_lb_cost;
u64 avg_scan_cost; /* select_idle_sibling */
#ifdef CONFIG_SCHEDSTATS
/* load_balance() stats */
unsigned int lb_count[CPU_MAX_IDLE_TYPES]; /* number of load_balance() calls, per idle type */
unsigned int lb_failed[CPU_MAX_IDLE_TYPES];
unsigned int lb_balanced[CPU_MAX_IDLE_TYPES];
unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES];
unsigned int lb_gained[CPU_MAX_IDLE_TYPES];
unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES];
unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES];
unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES];
/* Active load balancing */
unsigned int alb_count;
unsigned int alb_failed;
unsigned int alb_pushed;
/* SD_BALANCE_EXEC stats */
unsigned int sbe_count;
unsigned int sbe_balanced;
unsigned int sbe_pushed;
/* SD_BALANCE_FORK stats */
unsigned int sbf_count;
unsigned int sbf_balanced;
unsigned int sbf_pushed;
/* try_to_wake_up() stats */
unsigned int ttwu_wake_remote;
unsigned int ttwu_move_affine;
unsigned int ttwu_move_balance;
#endif
#ifdef CONFIG_SCHED_DEBUG
char *name;
#endif
union {
void *private; /* used during construction */
struct rcu_head rcu; /* used during destruction */
};
struct sched_domain_shared *shared;
unsigned int span_weight;
/*
* Span of all CPUs in this domain.
*
* NOTE: this field is variable length. (Allocated dynamically
* by attaching extra space to the end of the structure,
* depending on how many CPUs the kernel has booted up with)
*/
unsigned long span[0];
};
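The parent/child pointers above form a per-CPU domain hierarchy (for example MC -> DIE). The sketch below, loosely modelled on rebalance_domains(), shows how the periodic balancer walks that hierarchy bottom-up; it is a simplified illustration (locking details, interval clamping and statistics omitted), not the actual kernel function.
/* Simplified sketch: walk this CPU's domain hierarchy from the lowest level
 * (rq->sd, e.g. MC) up through the parent pointers (e.g. DIE) and run
 * load_balance() on every level whose balance interval has expired. */
static void walk_domains(int cpu, struct rq *rq, enum cpu_idle_type idle)
{
	struct sched_domain *sd;
	int continue_balancing = 1;

	rcu_read_lock();
	for_each_domain(cpu, sd) {	/* sd = rq->sd; sd; sd = sd->parent */
		if (!continue_balancing)
			break;
		/* balance_interval is in ms, last_balance in jiffies */
		if (time_after_eq(jiffies, sd->last_balance +
				  msecs_to_jiffies(sd->balance_interval)))
			load_balance(cpu, rq, sd, idle, &continue_balancing);
	}
	rcu_read_unlock();
}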
struct sched_group {
struct sched_group *next; /* Must be a circular list */
atomic_t ref;
unsigned int group_weight;
struct sched_group_capacity *sgc;
int asym_prefer_cpu; /* CPU of highest priority in group */
/*
* The CPUs this group covers.
*
* NOTE: this field is variable length. (Allocated dynamically
* by attaching extra space to the end of the structure,
* depending on how many CPUs the kernel has booted up with)
*/
unsigned long cpumask[0];
};
struct sched_group_capacity {
/* Reference count: several sched_domains may share one sg and sgc */
atomic_t ref;
/*
* CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity
* for a single CPU.
*/
unsigned long capacity;
unsigned long min_capacity; /* Min per-CPU capacity in group */
/* Time (jiffies) of the next capacity update for this group */
unsigned long next_update;
/* Whether this sg has an imbalance caused by task CPU affinity */
int imbalance; /* XXX unrelated to capacity but shared group state */
#ifdef CONFIG_SCHED_DEBUG
/* At the MC level this is the CPU id; at the DIE level it is the id of the first CPU of the cluster */
int id;
#endif
/* CPUs covered by this sg (the balance mask) */
unsigned long cpumask[0]; /* Balance mask */
};
struct lb_env {
/* The sched_domain being balanced */
struct sched_domain *sd;
/* Busiest CPU and rq in this sd; the balance pulls tasks from them */
struct rq *src_rq;
int src_cpu;
/*
 * Destination CPU of this balance attempt: tasks are pulled from the busiest
 * CPU's rq in the sd onto the dst CPU's rq. In the first round the dst CPU is
 * normally the CPU that initiated the balance, but later rounds may re-select
 * another CPU of the local group if needed.
 */
int dst_cpu;
struct rq *dst_rq;
/* cpumask of the sched_group containing the dst CPU: at the MC level just the dst CPU itself, at the DIE level its whole cluster. */
struct cpumask *dst_grpmask;
/*
 * Normally the dst CPU is the CPU that initiated the balance, but if tasks on
 * the src CPU cannot move to it because of affinity and the balance therefore
 * cannot complete, a new CPU is picked from the dst CPU's local group and a
 * second round of balancing is run towards it.
 */
int new_dst_cpu;
/* Idle state of the dst CPU at balance time; it influences how the balance proceeds */
enum cpu_idle_type idle;
/*
 * Interpreted together with the migration_type member (see calculate_imbalance()):
 * migrate_load: amount of load to migrate
 * migrate_util: amount of utilization to migrate
 * migrate_task: MC: number of tasks to migrate; DIE: number of idle CPUs the busiest group should gain
 * migrate_misfit: set to 1, i.e. migrate one task at a time
 * group_imbalanced: set to 1, i.e. migrate one task at a time
 */
long imbalance;
/* The set of CPUs under consideration for load-balancing */
/*
 * A balance run may take several rounds, and different rounds may involve
 * different CPUs; this member records which CPUs take part in this round.
 */
struct cpumask *cpus;
/*
 * Load-balance flags (bitmask). LBF_NOHZ_STATS and LBF_NOHZ_AGAIN are mainly
 * used to refresh nohz state during balancing. LBF_ALL_PINNED is set when every
 * task on the chosen busiest CPU is pinned by affinity; the next-busiest CPU is
 * then tried in another round. LBF_NEED_BREAK is used to shorten the
 * interrupts-off critical section during the balance.
 */
unsigned int flags;
/*
 * Once tasks are to be migrated, load_balance() iterates over the cfs task
 * list of the src rq to pick them. loop counts the iterations and must not
 * exceed loop_max.
 */
unsigned int loop;
/*
 * If many tasks have to be migrated, a break is taken after every
 * sched_nr_migrate_break tasks so that the interrupts-off critical section
 * stays short.
 */
unsigned int loop_break;
unsigned int loop_max;
enum fbq_type fbq_type;
/* Tasks picked for migration are put on this list */
struct list_head tasks;
/*
 * What kind of migration this balance performs to reach balance in the sd:
 * a given amount of load, a given amount of utilization, a number of tasks,
 * or a misfit task. See the description of the imbalance member above.
 */
//enum migration_type migration_type;
};
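To see how loop, loop_break and loop_max interact, here is an abridged sketch of the throttling logic inside detach_tasks() (simplified from the kernel source; the task-selection details are omitted):
/* Abridged sketch of the loop throttling in detach_tasks() */
while (!list_empty(tasks)) {
	p = list_first_entry(tasks, struct task_struct, se.group_node);

	env->loop++;
	/* We've more or less seen every task there is, call it quits */
	if (env->loop > env->loop_max)
		break;

	/* take a breather every sched_nr_migrate_break tasks */
	if (env->loop > env->loop_break) {
		env->loop_break += sched_nr_migrate_break;
		env->flags |= LBF_NEED_BREAK;	/* load_balance() re-enters via more_balance */
		break;
	}
	/* ... can_migrate_task() / detach_task() ... */
}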
Core function: load_balance() (kernel/sched/fair.c)
/*
 * Check this_cpu to ensure it is balanced within domain. Attempt to move
 * tasks if there is an imbalance.
 * this_cpu: pull tasks to this CPU
 * this_rq: runqueue of this_cpu
 * sd: the sched_domain to balance
 * idle: idle type of this_cpu (enum cpu_idle_type)
 * continue_balancing: output, whether balancing should continue at higher domains
 */
static int load_balance(int this_cpu, struct rq *this_rq,
struct sched_domain *sd, enum cpu_idle_type idle,
int *continue_balancing)
{
int ld_moved, cur_ld_moved, active_balance = 0;
struct sched_domain *sd_parent = sd->parent;
struct sched_group *group;
struct rq *busiest;
struct rq_flags rf;
struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask); /* per-CPU load_balance_mask; preemption is already disabled here; note this is a pointer to the mask */
struct lb_env env = {
.sd = sd,
.dst_cpu = this_cpu,
.dst_rq = this_rq,
.dst_grpmask = sched_group_span(sd->groups), /* cpumask of the domain's first sched_group (the local group); the groups are linked in a circular list */
.idle = idle,
.loop_break = sched_nr_migrate_break,
.cpus = cpus,
.fbq_type = all,
.tasks = LIST_HEAD_INIT(env.tasks),
};
cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask); /* load_balance_mask = CPUs spanned by this domain AND currently active CPUs; note sched_domain_span() covers the whole domain, whereas dst_grpmask covers only the local group */
schedstat_inc(sd->lb_count[idle]); /* bump the load_balance() count for this idle type */
redo:
if (!should_we_balance(&env)) { /* should this CPU balance? yes if dst_cpu is an idle CPU or the first CPU of its sched_group -- core function 1 */
*continue_balancing = 0;
goto out_balanced;
}
group = find_busiest_group(&env); /* returns the busiest group and computes how much has to be migrated */
if (!group) {
schedstat_inc(sd->lb_nobusyg[idle]);
goto out_balanced;
}
busiest = find_busiest_queue(&env, group);
if (!busiest) {
schedstat_inc(sd->lb_nobusyq[idle]);
goto out_balanced;
}
BUG_ON(busiest == env.dst_rq);
schedstat_add(sd->lb_imbalance[idle], env.imbalance);
env.src_cpu = busiest->cpu;
env.src_rq = busiest;
ld_moved = 0;
if (busiest->nr_running > 1) {
/*
* Attempt to move tasks. If find_busiest_group has found
* an imbalance but busiest->nr_running <= 1, the group is
* still unbalanced. ld_moved simply stays zero, so it is
* correctly treated as an imbalance.
*/
env.flags |= LBF_ALL_PINNED;
env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
more_balance:
rq_lock_irqsave(busiest, &rf);
update_rq_clock(busiest);
/*
* cur_ld_moved - load moved in current iteration
* ld_moved - cumulative load moved across iterations
*/
cur_ld_moved = detach_tasks(&env); /* Detach tasks from src_rq. Cache-hot tasks, tasks excluded by cpus_allowed and the currently running task are skipped. Each detached task's load is subtracted from env->imbalance, the task is marked TASK_ON_RQ_MIGRATING and put on the env->tasks list. When loop exceeds loop_break, LBF_NEED_BREAK is set so at most sched_nr_migrate_break (32) tasks are moved per burst. If some tasks cannot migrate because of affinity, a candidate new_dst_cpu is recorded and LBF_DST_PINNED is set. */
/*
* We've detached some tasks from busiest_rq. Every
* task is masked "TASK_ON_RQ_MIGRATING", so we can safely
* unlock busiest->lock, and we are able to be sure
* that nobody can manipulate the tasks in parallel.
* See task_rq_lock() family for the details.
*/
rq_unlock(busiest, &rf);
if (cur_ld_moved) {
attach_tasks(&env);//将env->tasks上的任务挂在dst_rq之上
ld_moved += cur_ld_moved;
}
local_irq_restore(rf.flags);
if (env.flags & LBF_NEED_BREAK) {
env.flags &= ~LBF_NEED_BREAK;
goto more_balance;
}
/*
* Revisit (affine) tasks on src_cpu that couldn't be moved to
* us and move them to an alternate dst_cpu in our sched_group
* where they can run. The upper limit on how many times we
* iterate on same src_cpu is dependent on number of CPUs in our
* sched_group.
*
* This changes load balance semantics a bit on who can move
* load to a given_cpu. In addition to the given_cpu itself
* (or a ilb_cpu acting on its behalf where given_cpu is
* nohz-idle), we now have balance_cpu in a position to move
* load to given_cpu. In rare situations, this may cause
* conflicts (balance_cpu and given_cpu/ilb_cpu deciding
* _independently_ and at _same_ time to move some load to
* given_cpu) causing exceess load to be moved to given_cpu.
* This however should not happen so much in practice and
* moreover subsequent load balance cycles should correct the
* excess load moved.
*/
if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { /* migration done but not enough: some tasks were pinned w.r.t. dst_cpu and a new_dst_cpu was recorded, so retry towards it */
/* Prevent to re-select dst_cpu via env's CPUs */
cpumask_clear_cpu(env.dst_cpu, env.cpus);
env.dst_rq = cpu_rq(env.new_dst_cpu);
env.dst_cpu = env.new_dst_cpu;
env.flags &= ~LBF_DST_PINNED;
env.loop = 0;
env.loop_break = sched_nr_migrate_break;
/*
* Go back to "more_balance" rather than "redo" since we
* need to continue with same src_cpu.
*/
goto more_balance;
}
/*
* We failed to reach balance because of affinity.
*/
if (sd_parent) { /* there is a parent domain, e.g. we are balancing at the MC level within one cluster */
int *group_imbalance = &sd_parent->groups->sgc->imbalance;
if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) /* LBF_SOME_PINNED is set in detach_tasks(): some tasks could not move because of affinity, and the ones that could were not enough to reach balance */
*group_imbalance = 1; /* balance could not be completed inside this cluster; let the parent (DIE) level try */
}
/* All tasks on this runqueue were pinned by CPU affinity */
if (unlikely(env.flags & LBF_ALL_PINNED)) { /* drop this CPU from the candidate set and redo the balance */
cpumask_clear_cpu(cpu_of(busiest), cpus);
/*
* Attempting to continue load balancing at the current
* sched_domain level only makes sense if there are
* active CPUs remaining as possible busiest CPUs to
* pull load from which are not contained within the
* destination group that is receiving any migrated
* load.
*/
if (!cpumask_subset(cpus, env.dst_grpmask)) {
env.loop = 0;
env.loop_break = sched_nr_migrate_break;
goto redo;
}
goto out_all_pinned;
}
}
if (!ld_moved) {
schedstat_inc(sd->lb_failed[idle]); /* count failed balance attempts per idle type */
/*
* Increment the failure counter only on periodic balance.
* We do not want newidle balance, which can be very
* frequent, pollute the failure counter causing
* excessive cache_hot migrations and active balances.
*/
if (idle != CPU_NEWLY_IDLE)
sd->nr_balance_failed++; /* failure count, not incremented for newly-idle balancing */
if (need_active_balance(&env)) { /* forced when a newly-idle dst CPU is preferred by asym packing, when the busiest CPU's CFS capacity is badly reduced, or when there have been too many failures */
unsigned long flags;
raw_spin_lock_irqsave(&busiest->lock, flags);
/*
* Don't kick the active_load_balance_cpu_stop,
* if the curr task on busiest CPU can't be
* moved to this_cpu:
* Re-check whether the busiest CPU's current task is allowed to run on this_cpu.
*/
if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
raw_spin_unlock_irqrestore(&busiest->lock,
flags);
env.flags |= LBF_ALL_PINNED;
goto out_one_pinned;
}
/*
* ->active_balance synchronizes accesses to
* ->active_balance_work. Once set, it's cleared
* only after active load balance is finished.
* The task may run there: mark an active balance as pending.
*/
if (!busiest->active_balance) {
busiest->active_balance = 1;
busiest->push_cpu = this_cpu;
active_balance = 1;
}
raw_spin_unlock_irqrestore(&busiest->lock, flags);
if (active_balance) {
stop_one_cpu_nowait(cpu_of(busiest),
active_load_balance_cpu_stop, busiest,
&busiest->active_balance_work);
}
/* We've kicked active balancing, force task migration. */
sd->nr_balance_failed = sd->cache_nice_tries+1;
}
} else
sd->nr_balance_failed = 0;
if (likely(!active_balance)) { /* update balance_interval */
/* We were unbalanced, so reset the balancing interval */
sd->balance_interval = sd->min_interval;
} else {
/*
* If we've begun active balancing, start to back off. This
* case may not be covered by the all_pinned logic if there
* is only 1 task on the busy runqueue (because we don't call
* detach_tasks).
*/
if (sd->balance_interval < sd->max_interval)
sd->balance_interval *= 2; /* double the balance interval */
}
goto out;
out_balanced:
/*
* We reach balance although we may have faced some affinity
* constraints. Clear the imbalance flag if it was set.
*/
if (sd_parent) {
int *group_imbalance = &sd_parent->groups->sgc->imbalance;
if (*group_imbalance)
*group_imbalance = 0;
}
out_all_pinned:
/*
* We reach balance because all tasks are pinned at this level so
* we can't migrate them. Let the imbalance flag set so parent level
* can try to migrate them.
*/
schedstat_inc(sd->lb_balanced[idle]);
sd->nr_balance_failed = 0;
out_one_pinned:
/* tune up the balancing interval */
if (((env.flags & LBF_ALL_PINNED) &&
sd->balance_interval < MAX_PINNED_INTERVAL) ||
(sd->balance_interval < sd->max_interval))
sd->balance_interval *= 2;
ld_moved = 0;
out:
return ld_moved;
}
Called core function 1: should_we_balance()
Core data structure
enum cpu_idle_type {
CPU_IDLE,
CPU_NOT_IDLE,
CPU_NEWLY_IDLE,
CPU_MAX_IDLE_TYPES
};
Core helper function
/*
* Build the balance mask; it contains only those CPUs that can arrive at this
* group and should be considered to continue balancing.
*
* We do this during the group creation pass, therefore the group information
* isn't complete yet, however since each group represents a (child) domain we
* can fully construct this using the sched_domain bits (which are already
* complete).
*/
static void
build_balance_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask)
{
const struct cpumask *sg_span = sched_group_span(sg);
struct sd_data *sdd = sd->private;
struct sched_domain *sibling;
int i;
cpumask_clear(mask);
for_each_cpu(i, sg_span) {
sibling = *per_cpu_ptr(sdd->sd, i);
/*
* Can happen in the asymmetric case, where these siblings are
* unused. The mask will not be empty because those CPUs that
* do have the top domain _should_ span the domain.
*/
if (!sibling->child)
continue;
/* If we would not end up here, we can't continue from here */
if (!cpumask_equal(sg_span, sched_domain_span(sibling->child)))
continue;
cpumask_set_cpu(i, mask);
}
/* We must not have empty masks here */
WARN_ON_ONCE(cpumask_empty(mask));
}
Core function
/*
 * At the DIE level only an idle CPU or the first CPU of a group may initiate
 * balancing: with two groups at the DIE level and each CPU in exactly one
 * group, only CPU 0 and CPU 4 can initiate it on the example topology.
 * At the MC level every CPU may initiate balancing.
 */
static int should_we_balance(struct lb_env *env)
{
struct sched_group *sg = env->sd->groups;
int cpu, balance_cpu = -1;
/*
* Ensure the balancing environment is consistent; can happen
* when the softirq triggers 'during' hotplug.
*/
if (!cpumask_test_cpu(env->dst_cpu, env->cpus)) /* dst_cpu must be among the CPUs involved in this balance */
return 0;
/*
* In the newly idle case, we will allow all the CPUs
* to do the newly idle load balance.
*/
if (env->idle == CPU_NEWLY_IDLE) /* a newly idle CPU is always allowed to balance */
return 1;
/* Try to find first idle CPU */
for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) { /* group_balance_mask() turns the group's balance bitmap into a struct cpumask * */
if (!idle_cpu(cpu)) /* a CPU counts as idle if it is running the idle task and (on this kernel) its wake_list is empty */
continue;
balance_cpu = cpu;
break;
}
if (balance_cpu == -1)
balance_cpu = group_balance_cpu(sg); /* no idle CPU found: fall back to the first CPU of the group's balance mask */
/*
 * First idle CPU or the first CPU(busiest) in this sched group
 * is eligible for doing load balancing at this and above domains.
 * In other words: dst_cpu may initiate balancing if it is the first idle CPU
 * of the group, or, when the group has no idle CPU, if it is the group's
 * first CPU.
 */
return balance_cpu == env->dst_cpu;
}
Called core function 2: find_busiest_group()
Core data structures
/*
 * sg_lb_stats - stats of a sched_group required for load_balancing.
 * Describes one sched_group during balancing: average load, total load,
 * total weighted load, load per task, capacity, utilization and so on.
 */
struct sg_lb_stats {
unsigned long avg_load; /*Avg load across the CPUs of the group */
unsigned long group_load; /* Total load over the CPUs of the group */
unsigned long sum_weighted_load; /* Weighted load of group's tasks */
unsigned long load_per_task;
unsigned long group_capacity; /* sum of the capacity available for CFS tasks over all CPUs of the sg */
unsigned long group_util; /* Total utilization of the group */
unsigned int sum_nr_running; /* Nr of CFS tasks running in the group (sum of cfs.h_nr_running) */
unsigned int idle_cpus; /* number of idle CPUs in the sg */
unsigned int group_weight; /* number of CPUs in the sg */
enum group_type group_type; /* state of this sg w.r.t. load balancing */
int group_no_capacity;
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
#endif
};
/*
 * sd_lb_stats - Structure to store the statistics of a sched_domain
 * during load balancing: total load, total capacity, average load, etc.
 */
struct sd_lb_stats {
struct sched_group *busiest; /* Busiest group in this sd */
struct sched_group *local; /* Local group in this sd;dst_cpu in this group*/
unsigned long total_running;
unsigned long total_load; /* Total load of all groups in sd */
unsigned long total_capacity; /* Total capacity of all groups in sd */
unsigned long avg_load; /* Average load across all groups in sd */
/* marks that tasks should preferably go to a CPU of the same cluster first */
//unsigned int prefer_sibling;
struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
struct sg_lb_stats local_stat; /* Statistics of the local group */
};
Core helper functions
/**
* fix_small_imbalance - Calculate the minor imbalance that exists
* amongst the groups of a sched_domain, during
* load balancing.
* @env: The load balancing environment.
* @sds: Statistics of the sched_domain whose imbalance is to be calculated.
*/
static inline
void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
{
unsigned long tmp, capa_now = 0, capa_move = 0;
unsigned int imbn = 2;
unsigned long scaled_busy_load_per_task;
struct sg_lb_stats *local, *busiest;
local = &sds->local_stat;
busiest = &sds->busiest_stat;
if (!local->sum_nr_running)
local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
else if (busiest->load_per_task > local->load_per_task)
imbn = 1;
scaled_busy_load_per_task =
(busiest->load_per_task * SCHED_CAPACITY_SCALE) /
busiest->group_capacity;
if (busiest->avg_load + scaled_busy_load_per_task >=
local->avg_load + (scaled_busy_load_per_task * imbn)) {
env->imbalance = busiest->load_per_task;
return;
}
/*
* OK, we don't have enough imbalance to justify moving tasks,
* however we may be able to increase total CPU capacity used by
* moving them.
*/
capa_now += busiest->group_capacity *
min(busiest->load_per_task, busiest->avg_load);
capa_now += local->group_capacity *
min(local->load_per_task, local->avg_load);
capa_now /= SCHED_CAPACITY_SCALE;
/* Amount of load we'd subtract */
if (busiest->avg_load > scaled_busy_load_per_task) {
capa_move += busiest->group_capacity *
min(busiest->load_per_task,
busiest->avg_load - scaled_busy_load_per_task);
}
/* Amount of load we'd add */
if (busiest->avg_load * busiest->group_capacity <
busiest->load_per_task * SCHED_CAPACITY_SCALE) {
tmp = (busiest->avg_load * busiest->group_capacity) /
local->group_capacity;
} else {
tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
local->group_capacity;
}
capa_move += local->group_capacity *
min(local->load_per_task, local->avg_load + tmp);
capa_move /= SCHED_CAPACITY_SCALE;
/* Move if we gain throughput */
if (capa_move > capa_now)
env->imbalance = busiest->load_per_task;
}
/**
* calculate_imbalance - Calculate the amount of imbalance present within the
* groups of a given sched_domain during load balance.
* @env: load balance environment
* @sds: statistics of the sched_domain whose imbalance is to be calculated.
*/
static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
{
unsigned long max_pull, load_above_capacity = ~0UL;
struct sg_lb_stats *local, *busiest;
local = &sds->local_stat;
busiest = &sds->busiest_stat;
/* the imbalance is caused by CPU affinity */
if (busiest->group_type == group_imbalanced) {
/*
* In the group_imb case we cannot rely on group-wide averages
* to ensure CPU-load equilibrium, look at wider averages. XXX
*/
busiest->load_per_task =
min(busiest->load_per_task, sds->avg_load);
}
/*
* Avg load of busiest sg can be less and avg load of local sg can
* be greater than avg load across all sgs of sd because avg load
* factors in sg capacity and sgs with smaller group_type are
* skipped when updating the busiest sg:
*/
if (busiest->avg_load <= sds->avg_load ||
local->avg_load >= sds->avg_load) {
env->imbalance = 0;
return fix_small_imbalance(env, sds);
}
/*
* If there aren't any idle CPUs, avoid creating some.
* Only the load that exceeds the busiest group's capacity should be pulled.
*/
if (busiest->group_type == group_overloaded &&
local->group_type == group_overloaded) {
load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
if (load_above_capacity > busiest->group_capacity) {
load_above_capacity -= busiest->group_capacity;
load_above_capacity *= scale_load_down(NICE_0_LOAD);
load_above_capacity /= busiest->group_capacity;
} else
load_above_capacity = ~0UL;
}
/*
* We're trying to get all the CPUs to the average_load, so we don't
* want to push ourselves above the average load, nor do we wish to
* reduce the max loaded CPU below the average load. At the same time,
* we also don't want to reduce the group load below the group
* capacity. Thus we look for the minimum possible imbalance.
*/
max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
/* How much load to actually move to equalise the imbalance */
env->imbalance = min(
max_pull * busiest->group_capacity,
(sds->avg_load - local->avg_load) * local->group_capacity
) / SCHED_CAPACITY_SCALE;
/*
* if *imbalance is less than the average load per runnable task
* there is no guarantee that any tasks will be moved so we'll have
* a think about bumping its value to force at least one task to be
* moved
*/
if (env->imbalance < busiest->load_per_task)
return fix_small_imbalance(env, sds);
}
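A worked example may make the min() above clearer. All numbers below are made up; SCHED_CAPACITY_SCALE is 1024:
/*
 * Hypothetical numbers:
 *   busiest: avg_load = 1400, group_capacity = 1024
 *   local:   avg_load =  600, group_capacity = 1024
 *   sds:     avg_load = 1000, load_above_capacity = ~0UL (not both overloaded)
 *
 *   max_pull  = min(1400 - 1000, ~0UL) = 400
 *   imbalance = min(400 * 1024, (1000 - 600) * 1024) / 1024 = 400
 *
 * i.e. pull just enough load to bring the busiest group down towards the
 * domain average without pushing the local group above that average.
 */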
/**
* update_sd_pick_busiest - return 1 on busiest group
* @env: The load balancing environment.
* @sds: sched_domain statistics
* @sg: sched_group candidate to be checked for being the busiest
* @sgs: sched_group statistics
*
* Determine if @sg is a busier group than the previously selected
* busiest group.
*
* Return: %true if @sg is a busier group than the previously selected
* busiest group. %false otherwise.
*/
static bool update_sd_pick_busiest(struct lb_env *env,
struct sd_lb_stats *sds,
struct sched_group *sg,
struct sg_lb_stats *sgs)
{
struct sg_lb_stats *busiest = &sds->busiest_stat;
if (sgs->group_type > busiest->group_type) /* group_type ordering: group_other < group_imbalanced < group_overloaded */
return true;
if (sgs->group_type < busiest->group_type)
return false;
if (sgs->avg_load <= busiest->avg_load)
return false;
if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
goto asym_packing;
/*
* Candidate sg has no more than one task per CPU and
* has higher per-CPU capacity. Migrating tasks to less
* capable CPUs may harm throughput. Maximize throughput,
* power/energy consequences are not considered.
*/
if (sgs->sum_nr_running <= sgs->group_weight &&
group_smaller_cpu_capacity(sds->local, sg)) /* true when sg's min_capacity * 1024 exceeds local's min_capacity * capacity_margin (1280), i.e. sg's CPUs are more than ~1.25x as capable as the local group's; such a group with at most one task per CPU is not picked as busiest (see the sketch after this function) */
return false;
asym_packing:
/* This is the busiest node in its class. */
if (!(env->sd->flags & SD_ASYM_PACKING))
return true;
/* No ASYM_PACKING if target CPU is already busy */
if (env->idle == CPU_NOT_IDLE) /* dst CPU is already busy: do not apply ASYM_PACKING */
return true;
/*
* ASYM_PACKING needs to move all the work to the highest
* prority CPUs in the group, therefore mark all groups
* of lower priority than ourself as busy.
* CPUs also have priorities: marking lower-priority groups as busy lets the current (higher-priority) CPU be preferred as the pull target.
*/
if (sgs->sum_nr_running &&
sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) {
if (!sds->busiest)
return true;
/* Prefer to move from lowest priority CPU's work */
if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
sg->asym_prefer_cpu))
return true;
}
return false;
}
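For reference, the capacity check above relies on a helper that, in kernels of this vintage, looks roughly like this (capacity_margin defaults to 1280, i.e. a 1.25x margin):
/*
 * group_smaller_cpu_capacity: returns true if sched_group sg has smaller
 * per-CPU capacity than sched_group ref, with capacity_margin (1280/1024)
 * of headroom.
 */
static inline bool
group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
{
	return sg->sgc->min_capacity * capacity_margin <
						ref->sgc->min_capacity * 1024;
}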
static inline void update_blocked_averages(int cpu)
{
struct rq *rq = cpu_rq(cpu);
struct cfs_rq *cfs_rq = &rq->cfs;
const struct sched_class *curr_class;
struct rq_flags rf;
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq); /* refresh rq->clock and rq->clock_task; rq->clock follows the current jiffies-based time */
update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); /* update the cfs_rq util and load averages (may also trigger a frequency update), decaying the contribution of blocked tasks */
curr_class = rq->curr->sched_class;
update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class);
update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class);
update_irq_load_avg(rq, 0);
#ifdef CONFIG_NO_HZ_COMMON
rq->last_blocked_load_update_tick = jiffies;
if (!cfs_rq_has_blocked(cfs_rq) && !others_have_blocked(rq)) /* cfs_rq_has_blocked() checks for remaining cfs blocked load; others_have_blocked() checks rt/dl (and irq) */
rq->has_blocked_load = 0; /* nothing left to decay: clear has_blocked_load */
#endif
rq_unlock_irqrestore(rq, &rf);
}
/*
 * update_nohz_stats - force: update even if the last update was recent.
 * Returns true if the rq still has blocked load to decay, false otherwise.
 */
static bool update_nohz_stats(struct rq *rq, bool force)
{
#ifdef CONFIG_NO_HZ_COMMON
unsigned int cpu = rq->cpu;
if (!rq->has_blocked_load) /* has_blocked_load: whether this idle CPU still has blocked load to decay; a kicked idle CPU is not always asked to run a full balance, sometimes only to refresh blocked load. Nothing to do if the rq has none. */
return false;
if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) /* nohz.idle_cpus_mask records the (nohz-)idle CPUs in the system */
return false;
if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick)) /* last_blocked_load_update_tick is refreshed in update_blocked_averages() */
return true; /* updated very recently, skip */
update_blocked_averages(cpu); /* decay this CPU's blocked load */
return rq->has_blocked_load; /* true if blocked load remains, false if it has fully decayed */
#else
return false;
#endif
}
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
unsigned long capacity = scale_rt_capacity(sd, cpu); /* capacity left for CFS tasks on this CPU after subtracting rt/dl (and irq) pressure */
struct sched_group *sdg = sd->groups;
cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu); /* per-CPU cpu_scale value */
if (!capacity)
capacity = 1;
cpu_rq(cpu)->cpu_capacity = capacity;
sdg->sgc->capacity = capacity;
sdg->sgc->min_capacity = capacity;
}
/*
 * Update the capacity statistics of the domain's groups. Each CPU has its own
 * sd instance whose groups list starts with the group that contains it; the
 * groups are linked in a ring.
 * On the example platform the DIE domain has three groups and there are two MC
 * domains (the prime core does not form an MC domain); each MC domain spans one
 * cluster and every CPU of that cluster is a single-CPU group. Updating an MC
 * group therefore means updating one CPU's capacity; updating a DIE group means
 * summing the capacities of the CPUs (i.e. of the MC-level groups) it contains.
 */
void update_group_capacity(struct sched_domain *sd, int cpu)
{
struct sched_domain *child = sd->child;
struct sched_group *group, *sdg = sd->groups;
unsigned long capacity, min_capacity;
unsigned long interval;
interval = msecs_to_jiffies(sd->balance_interval); /* the domain's balance interval (ms), converted to jiffies */
interval = clamp(interval, 1UL, max_load_balance_interval); /* clamp to [1, max_load_balance_interval]: overly large settings fall back to the cap, and at least one jiffy is always used */
sdg->sgc->next_update = jiffies + interval; /* time of the next group-capacity update */
if (!child) { /* no child domain: we are at the MC level, where every group is a single CPU */
update_cpu_capacity(sd, cpu); /* update this CPU's capacity and min_capacity */
return;
}
/* DIE level */
capacity = 0;
min_capacity = ULONG_MAX;
if (child->flags & SD_OVERLAP) { /* sched_domains of this level overlap */
/*
* SD_OVERLAP domains cannot assume that child groups
* span the current group.
*/
for_each_cpu(cpu, sched_group_span(sdg)) { /* CPUs of this sched_group */
struct sched_group_capacity *sgc;
struct rq *rq = cpu_rq(cpu);
/*
* build_sched_domains() -> init_sched_groups_capacity()
* gets here before we've attached the domains to the
* runqueues.
*
* Use capacity_of(), which is set irrespective of domains
* in update_cpu_capacity().
*
* This avoids capacity from being 0 and
* causing divide-by-zero issues on boot.
*/
if (unlikely(!rq->sd)) {
capacity += capacity_of(cpu);
} else {
sgc = rq->sd->groups->sgc;
capacity += sgc->capacity; /* capacity available for CFS tasks */
}
min_capacity = min(capacity, min_capacity); /* smallest usable capacity seen in this group */
}
} else {
/*
* !SD_OVERLAP domains can assume that child groups
* span the current group.
*/
group = child->groups; /* child is the MC level, where each group is a single CPU; the loop below walks one cluster and sums its capacity: one MC domain corresponds to one DIE-level group */
do {
struct sched_group_capacity *sgc = group->sgc;
capacity += sgc->capacity;
min_capacity = min(sgc->min_capacity, min_capacity);
group = group->next;
} while (group != child->groups);
}
sdg->sgc->capacity = capacity; /* total CFS capacity of the cluster covered by this DIE-level group */
sdg->sgc->min_capacity = min_capacity; /* smallest per-CPU capacity within this group */
}
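As an illustration, on a hypothetical 4 little + 4 big system (all capacity values made up) the two levels end up with numbers like these:
/*
 * MC level, little cluster: one sg per CPU, sgc->capacity ~ 430 each
 * MC level, big cluster:    one sg per CPU, sgc->capacity ~ 1024 each
 * DIE level: one sg per cluster:
 *   little sg: sgc->capacity = 4 * 430  = 1720, sgc->min_capacity = 430
 *   big sg:    sgc->capacity = 4 * 1024 = 4096, sgc->min_capacity = 1024
 * (capacity here is what is left for CFS after rt/dl/irq pressure has been
 *  removed, so real numbers are usually somewhat lower.)
 */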
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
* @group: sched_group whose statistics are to be updated (a group of the domain being balanced).
* @load_idx: Load index of sched_domain of this_cpu for load calc.
* @local_group: Does group contain this_cpu.
* @sgs: variable to hold the statistics for this group.
* @overload: Indicate more than one runnable task for any CPU.
*/
static inline void update_sg_lb_stats(struct lb_env *env,
struct sched_group *group, int load_idx,
int local_group, struct sg_lb_stats *sgs,
bool *overload)
{
unsigned long load;
int i, nr_running;
memset(sgs, 0, sizeof(*sgs));
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
struct rq *rq = cpu_rq(i);
if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false)) /* LBF_NOHZ_STATS is set when there are nohz-idle CPUs with blocked load. update_nohz_stats() skips rqs whose blocked load was refreshed recently; it returns true if the rq still has blocked load, in which case LBF_NOHZ_AGAIN marks that another nohz update will be needed */
env->flags |= LBF_NOHZ_AGAIN;
/* Bias balancing toward CPUs of our domain: */
if (local_group)
load = target_load(i, load_idx);
else
load = source_load(i, load_idx); /* source_load()/target_load() bias weighted_cpuload() down/up using rq->cpu_load[load_idx]; with load_idx == 0 both simply return cfs_rq->avg.runnable_load_avg */
sgs->group_load += load;
sgs->group_util += cpu_util(i);//amount of capacity of a cpu that is (estimated to be) used by cfs task.
sgs->sum_nr_running += rq->cfs.h_nr_running;
nr_running = rq->nr_running;
if (nr_running > 1)
*overload = true;
#ifdef CONFIG_NUMA_BALANCING
sgs->nr_numa_running += rq->nr_numa_running;
sgs->nr_preferred_running += rq->nr_preferred_running;
#endif
sgs->sum_weighted_load += weighted_cpuload(rq); /* cfs_rq->avg.runnable_load_avg */
/*
* No need to call idle_cpu() if nr_running is not 0
*/
if (!nr_running && idle_cpu(i))
sgs->idle_cpus++;
}
/* Adjust by relative CPU capacity of the group */
sgs->group_capacity = group->sgc->capacity;
sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
if (sgs->sum_nr_running)
sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
sgs->group_weight = group->group_weight;
sgs->group_no_capacity = group_is_overloaded(env, sgs); /* is the group overloaded? */
sgs->group_type = group_classify(group, sgs); /* current state of the group */
}
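The capacity scaling of avg_load is easiest to see with made-up numbers:
/*
 * Hypothetical numbers:
 *   group_load = 800, group_capacity = 512 (e.g. two small CPUs of 256 each)
 *   avg_load   = 800 * 1024 / 512 = 1600
 * Load is normalised by capacity, so a weaker group carrying the same raw load
 * looks "busier" than a stronger one.
 *   sum_weighted_load = 800, sum_nr_running = 4
 *   load_per_task     = 800 / 4 = 200
 */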
/**
* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
* @env: The load balancing environment.
* @sds: variable to hold the statistics for this sched_domain.
*/
static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
{
struct sched_domain *child = env->sd->child; /* next lower domain level */
struct sched_group *sg = env->sd->groups; /* first sched_group of this domain (the local group) */
struct sg_lb_stats *local = &sds->local_stat; /* statistics of the local group */
struct sg_lb_stats tmp_sgs;
int load_idx, prefer_sibling = 0;
bool overload = false;
if (child && child->flags & SD_PREFER_SIBLING) /* a child domain exists and prefers to place tasks in a sibling domain */
prefer_sibling = 1;
#ifdef CONFIG_NO_HZ_COMMON
if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked)) /* there are nohz-idle CPUs whose blocked load still needs decaying */
env->flags |= LBF_NOHZ_STATS;
#endif
load_idx = get_sd_load_idx(env->sd, env->idle); /* pick the load index for this idle type: busy_idx, newidle_idx or idle_idx */
do {
struct sg_lb_stats *sgs = &tmp_sgs;
int local_group;
local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg)); /* does dst_cpu belong to this group? */
if (local_group) {
sds->local = sg;
sgs = local;
if (env->idle != CPU_NEWLY_IDLE ||
time_after_eq(jiffies, sg->sgc->next_update)) /* refresh the group capacity unless this is a newly-idle balance whose next_update time has not yet been reached */
update_group_capacity(env->sd, env->dst_cpu); /* update capacity and min_capacity (CFS capacity) of the domain's groups */
}
update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
&overload);
if (local_group)
goto next_group;
/*
* In case the child domain prefers tasks go to siblings
* first, lower the sg capacity so that we'll try
* and move all the excess tasks away. We lower the capacity
* of a group only if the local group has the capacity to fit
* these excess tasks. The extra check prevents the case where
* you always pull from the heaviest group when it is already
* under-utilized (possible with a large weight task outweighs
* the tasks on the system).
*/
if (prefer_sibling && sds->local &&
group_has_capacity(env, local) &&
(sgs->sum_nr_running > local->sum_nr_running + 1)) {
sgs->group_no_capacity = 1;
sgs->group_type = group_classify(sg, sgs);
}
if (update_sd_pick_busiest(env, sds, sg, sgs)) { /* update the busiest group of the domain, comparing group_type, avg_load and the asym-packing CPU priority */
sds->busiest = sg;
sds->busiest_stat = *sgs;
}
next_group:
/* Now, start updating sd_lb_stats */
sds->total_running += sgs->sum_nr_running;
sds->total_load += sgs->group_load;
sds->total_capacity += sgs->group_capacity;
sg = sg->next;
} while (sg != env->sd->groups);
#ifdef CONFIG_NO_HZ_COMMON
if ((env->flags & LBF_NOHZ_AGAIN) &&
cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
WRITE_ONCE(nohz.next_blocked,
jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
}
#endif
if (env->sd->flags & SD_NUMA)
env->fbq_type = fbq_classify_group(&sds->busiest_stat);
if (!env->sd->parent) {
/* update overload indicator if we are at root domain */
if (env->dst_rq->rd->overload != overload)
env->dst_rq->rd->overload = overload;
}
}
Core function
/**
* find_busiest_group - Returns the busiest group within the sched_domain
* if there is an imbalance.
*
* Also calculates the amount of weighted load which should be moved
* to restore balance.
*
* @env: The load balancing environment.
*
* Return: - The busiest group if imbalance exists.
*/
static struct sched_group *find_busiest_group(struct lb_env *env)
{
struct sg_lb_stats *local, *busiest; /* statistics of the local group and of the busiest group in this domain */
struct sd_lb_stats sds;
init_sd_lb_stats(&sds);
/*
 * Compute the various statistics relevant for load balancing at
 * this level.
 * Load keeps changing, so before searching for the busiest group the sd
 * statistics are refreshed: load and capacity of every sg are updated, giving
 * the balance information of the local group and of the busiest non-local
 * group, on which the balancing decision is then based.
 */
update_sd_lb_stats(env, &sds); /* refresh the domain-wide statistics */
local = &sds.local_stat;
busiest = &sds.busiest_stat;
/* ASYM feature bypasses nice load balance check:
 * asymmetric packing moves work from lower-priority CPUs to higher-priority
 * ones (usually, the lower the CPU number, the higher the priority).
 */
if (check_asym_packing(env, &sds))
return sds.busiest;
/* There is no busy sibling group to pull tasks from */
if (!sds.busiest || busiest->sum_nr_running == 0)
goto out_balanced;
/* XXX broken for overlapping NUMA groups */
sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
/ sds.total_capacity;
/*
* If the busiest group is imbalanced the below checks don't
* work because they assume all things are equal, which typically
* isn't true due to cpus_allowed constraints and the like.
* The busiest group's imbalance was caused by cpu affinity; the flag was set when an MC-level balance could not be completed.
*/
if (busiest->group_type == group_imbalanced)
goto force_balance;
/*
* When dst_cpu is idle, prevent SMP nice and/or asymmetric group
* capacities from resulting in underutilization due to avg_load.
*/
if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
busiest->group_no_capacity)
goto force_balance;
/*
* If the local group is busier than the selected busiest group
* don't try and pull any tasks.
*/
if (local->avg_load >= busiest->avg_load)
goto out_balanced;
/*
* Don't pull any tasks if this group is already above the domain
* average load.
*/
if (local->avg_load >= sds.avg_load)
goto out_balanced;
if (env->idle == CPU_IDLE) {
/*
* This CPU is idle. If the busiest group is not overloaded
* and there is no imbalance between this and busiest group
* wrt idle CPUs, it is balanced. The imbalance becomes
* significant if the diff is greater than 1 otherwise we
* might end up to just move the imbalance on another group
*/
if ((busiest->group_type != group_overloaded) &&
(local->idle_cpus <= (busiest->idle_cpus + 1)))
goto out_balanced;
} else {
/*
* In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
* imbalance_pct to be conservative.
* The busiest group is only slightly more loaded than the local one (imbalance_pct = 117 at the MC level); see the worked example after this function.
*/
if (100 * busiest->avg_load <=
env->sd->imbalance_pct * local->avg_load)
goto out_balanced;
}
force_balance:
/* Looks like there is an imbalance. Compute how much load to move, based on how severe the imbalance is. */
calculate_imbalance(env, &sds);
return env->imbalance ? sds.busiest : NULL;
out_balanced:
env->imbalance = 0;
return NULL;
}
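A small worked example of the imbalance_pct check above (numbers made up; the 117 value comes from the comment in the code and applies to MC-level domains):
/*
 *   local->avg_load   = 1000  ->  117 * 1000 = 117000
 *   busiest->avg_load = 1100  ->  100 * 1100 = 110000 <= 117000  -> out_balanced
 *   busiest->avg_load = 1200  ->  100 * 1200 = 120000 >  117000  -> fall through to force_balance
 * i.e. a non-idle CPU only pulls when the busiest group is more than ~17%
 * busier than the local group.
 */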
Called core function 3: find_busiest_queue()
/*
* find_busiest_queue - find the busiest runqueue among the CPUs in the group.
*/
static struct rq *find_busiest_queue(struct lb_env *env,
struct sched_group *group)
{
struct rq *busiest = NULL, *rq;
unsigned long busiest_load = 0, busiest_capacity = 1;
int i;
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
unsigned long capacity, wl;
enum fbq_type rt;
rq = cpu_rq(i);
rt = fbq_classify_rq(rq);
/*
* We classify groups/runqueues into three groups:
* - regular: there are !numa tasks
* - remote: there are numa tasks that run on the 'wrong' node
* - all: there is no distinction
*
* In order to avoid migrating ideally placed numa tasks,
* ignore those when there's better options.
*
* If we ignore the actual busiest queue to migrate another
* task, the next balance pass can still reduce the busiest
* queue by moving tasks around inside the node.
*
* If we cannot move enough load due to this classification
* the next pass will adjust the group classification and
* allow migration of more tasks.
*
* Both cases only affect the total convergence complexity.
*/
if (rt > env->fbq_type)
continue;
capacity = capacity_of(i);
wl = weighted_cpuload(rq);
/*
* When comparing with imbalance, use weighted_cpuload()
* which is not scaled with the CPU capacity.
*/
if (rq->nr_running == 1 && wl > env->imbalance &&
!check_cpu_capacity(rq, env->sd)) /* check_cpu_capacity(): true when rq->cpu_capacity * sd->imbalance_pct < rq->cpu_capacity_orig * 100, i.e. the capacity left for CFS is noticeably below the CPU's original capacity */
continue;
/*
* For the load comparisons with the other CPU's, consider
* the weighted_cpuload() scaled with the CPU capacity, so
* that the load can be moved away from the CPU that is
* potentially running at a lower capacity.
*
* Thus we're looking for max(wl_i / capacity_i), crosswise
* multiplication to rid ourselves of the division works out
* to: wl_i * capacity_j > wl_j * capacity_i; where j is
* our previous maximum.
*/
if (wl * busiest_capacity > busiest_load * capacity) { /* compares wl/capacity, where capacity is what is left for CFS; see the example after this function */
busiest_load = wl;
busiest_capacity = capacity;
busiest = rq;
}
}
return busiest;
}
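A quick example of the cross-multiplied wl/capacity comparison (all values made up):
/*
 *   candidate rq:       wl = 600, capacity = 512   -> 600/512  ~ 1.17
 *   current busiest rq: wl = 900, capacity = 1024  -> 900/1024 ~ 0.88
 *   check: 600 * 1024 > 900 * 512  (614400 > 460800)  -> candidate becomes busiest
 * Dividing the load by the capacity favours pulling from CPUs that are running
 * with less capacity available.
 */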
Structure diagram: scheduling domains and scheduling groups (figure not reproduced here)