Linux Load Balancing
Core Data Structures
struct sched_domain {
/* These fields must be setup */
struct sched_domain *parent; /* top domain must be null terminated */
struct sched_domain *child; /* bottom domain must be null terminated */
struct sched_group *groups; /* the balancing groups of the domain */
unsigned long min_interval; /* Minimum balance interval ms */
unsigned long max_interval; /* Maximum balance interval ms */
unsigned int busy_factor; /* less balancing by factor if busy */
unsigned int imbalance_pct; /* No balance until over watermark */
unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */
unsigned int busy_idx;
unsigned int idle_idx;
unsigned int newidle_idx;
unsigned int wake_idx;
unsigned int forkexec_idx;
unsigned int smt_gain;
int nohz_idle; /* NOHZ IDLE status */
int flags; /* See SD_* */
int level;
/* Runtime fields. */
unsigned long last_balance; /* init to jiffies. units in jiffies */
unsigned int balance_interval; /* initialise to 1. units in ms. */
unsigned int nr_balance_failed; /* initialise to 0 */
/* idle_balance() stats */
u64 max_newidle_lb_cost;
unsigned long next_decay_max_lb_cost;
u64 avg_scan_cost; /* select_idle_sibling */
#ifdef CONFIG_SCHEDSTATS
/* load_balance() stats */
unsigned int lb_count[CPU_MAX_IDLE_TYPES]; /* number of load_balance() calls, per idle type */
unsigned int lb_failed[CPU_MAX_IDLE_TYPES];
unsigned int lb_balanced[CPU_MAX_IDLE_TYPES];
unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES];
unsigned int lb_gained[CPU_MAX_IDLE_TYPES];
unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES];
unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES];
unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES];
/* Active load balancing */
unsigned int alb_count;
unsigned int alb_failed;
unsigned int alb_pushed;
/* SD_BALANCE_EXEC stats */
unsigned int sbe_count;
unsigned int sbe_balanced;
unsigned int sbe_pushed;
/* SD_BALANCE_FORK stats */
unsigned int sbf_count;
unsigned int sbf_balanced;
unsigned int sbf_pushed;
/* try_to_wake_up() stats */
unsigned int ttwu_wake_remote;
unsigned int ttwu_move_affine;
unsigned int ttwu_move_balance;
#endif
#ifdef CONFIG_SCHED_DEBUG
char *name;
#endif
union {
void *private; /* used during construction */
struct rcu_head rcu; /* used during destruction */
};
struct sched_domain_shared *shared;
unsigned int span_weight;
/*
* Span of all CPUs in this domain.
*
* NOTE: this field is variable length. (Allocated dynamically
* by attaching extra space to the end of the structure,
* depending on how many CPUs the kernel has booted up with)
*/
unsigned long span[0];
};
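The parent/child pointers above form a per-CPU domain hierarchy (for example MC -> DIE). The sketch below, loosely modelled on rebalance_domains(), shows how the periodic balancer walks that hierarchy bottom-up; it is a simplified illustration (locking details, interval clamping and statistics omitted), not the actual kernel function.
/* Simplified sketch: walk this CPU's domain hierarchy from the lowest level
 * (rq->sd, e.g. MC) up through the parent pointers (e.g. DIE) and run
 * load_balance() on every level whose balance interval has expired. */
static void walk_domains(int cpu, struct rq *rq, enum cpu_idle_type idle)
{
	struct sched_domain *sd;
	int continue_balancing = 1;

	rcu_read_lock();
	for_each_domain(cpu, sd) {	/* sd = rq->sd; sd; sd = sd->parent */
		if (!continue_balancing)
			break;
		/* balance_interval is in ms, last_balance in jiffies */
		if (time_after_eq(jiffies, sd->last_balance +
				  msecs_to_jiffies(sd->balance_interval)))
			load_balance(cpu, rq, sd, idle, &continue_balancing);
	}
	rcu_read_unlock();
}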
struct sched_group {
struct sched_group *next; /* Must be a circular list */
atomic_t ref;
unsigned int group_weight;
struct sched_group_capacity *sgc;
int asym_prefer_cpu; /* CPU of highest priority in group */
/*
* The CPUs this group covers.
*
* NOTE: this field is variable length. (Allocated dynamically
* by attaching extra space to the end of the structure,
* depending on how many CPUs the kernel has booted up with)
*/
unsigned long cpumask[0];
};
struct sched_group_capacity {
/* Reference count: several sched_domains may share one sg and sgc */
atomic_t ref;
/*
* CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity
* for a single CPU.
*/
unsigned long capacity;
unsigned long min_capacity; /* Min per-CPU capacity in group */
/* Time (jiffies) of the next capacity update for this group */
unsigned long next_update;
/* Whether this sg has an imbalance caused by task CPU affinity */
int imbalance; /* XXX unrelated to capacity but shared group state */
#ifdef CONFIG_SCHED_DEBUG
/* At the MC level this is the CPU id; at the DIE level it is the id of the first CPU of the cluster */
int id;
#endif
/* CPUs covered by this sg (the balance mask) */
unsigned long cpumask[0]; /* Balance mask */
};
struct lb_env {
/* The sched_domain being balanced */
struct sched_domain *sd;
/* Busiest CPU and rq in this sd; the balance pulls tasks from them */
struct rq *src_rq;
int src_cpu;
/*
 * Destination CPU of this balance attempt: tasks are pulled from the busiest
 * CPU's rq in the sd onto the dst CPU's rq. In the first round the dst CPU is
 * normally the CPU that initiated the balance, but later rounds may re-select
 * another CPU of the local group if needed.
 */
int dst_cpu;
struct rq *dst_rq;
/* cpumask of the sched_group containing the dst CPU: at the MC level just the dst CPU itself, at the DIE level its whole cluster. */
struct cpumask *dst_grpmask;
/*
 * Normally the dst CPU is the CPU that initiated the balance, but if tasks on
 * the src CPU cannot move to it because of affinity and the balance therefore
 * cannot complete, a new CPU is picked from the dst CPU's local group and a
 * second round of balancing is run towards it.
 */
int new_dst_cpu;
/* Idle state of the dst CPU at balance time; it influences how the balance proceeds */
enum cpu_idle_type idle;
/*
 * Interpreted together with the migration_type member (see calculate_imbalance()):
 * migrate_load: amount of load to migrate
 * migrate_util: amount of utilization to migrate
 * migrate_task: MC: number of tasks to migrate; DIE: number of idle CPUs the busiest group should gain
 * migrate_misfit: set to 1, i.e. migrate one task at a time
 * group_imbalanced: set to 1, i.e. migrate one task at a time
 */
long imbalance;
/* The set of CPUs under consideration for load-balancing */
/*
 * A balance run may take several rounds, and different rounds may involve
 * different CPUs; this member records which CPUs take part in this round.
 */
struct cpumask *cpus;
/*
 * Load-balance flags (bitmask). LBF_NOHZ_STATS and LBF_NOHZ_AGAIN are mainly
 * used to refresh nohz state during balancing. LBF_ALL_PINNED is set when every
 * task on the chosen busiest CPU is pinned by affinity; the next-busiest CPU is
 * then tried in another round. LBF_NEED_BREAK is used to shorten the
 * interrupts-off critical section during the balance.
 */
unsigned int flags;
/*
 * Once tasks are to be migrated, load_balance() iterates over the cfs task
 * list of the src rq to pick them. loop counts the iterations and must not
 * exceed loop_max.
 */
unsigned int loop;
/*
 * If many tasks have to be migrated, a break is taken after every
 * sched_nr_migrate_break tasks so that the interrupts-off critical section
 * stays short.
 */
unsigned int loop_break;
unsigned int loop_max;
enum fbq_type fbq_type;
/* Tasks picked for migration are put on this list */
struct list_head tasks;
/*
 * What kind of migration this balance performs to reach balance in the sd:
 * a given amount of load, a given amount of utilization, a number of tasks,
 * or a misfit task. See the description of the imbalance member above.
 */
//enum migration_type migration_type;
};
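To see how loop, loop_break and loop_max interact, here is an abridged sketch of the throttling logic inside detach_tasks() (simplified from the kernel source; the task-selection details are omitted):
/* Abridged sketch of the loop throttling in detach_tasks() */
while (!list_empty(tasks)) {
	p = list_first_entry(tasks, struct task_struct, se.group_node);

	env->loop++;
	/* We've more or less seen every task there is, call it quits */
	if (env->loop > env->loop_max)
		break;

	/* take a breather every sched_nr_migrate_break tasks */
	if (env->loop > env->loop_break) {
		env->loop_break += sched_nr_migrate_break;
		env->flags |= LBF_NEED_BREAK;	/* load_balance() re-enters via more_balance */
		break;
	}
	/* ... can_migrate_task() / detach_task() ... */
}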
Core function: load_balance() (kernel/sched/fair.c)
/*
 * Check this_cpu to ensure it is balanced within domain. Attempt to move
 * tasks if there is an imbalance.
 * this_cpu: pull tasks to this CPU
 * this_rq: runqueue of this_cpu
 * sd: the sched_domain to balance
 * idle: idle type of this_cpu (enum cpu_idle_type)
 * continue_balancing: output, whether balancing should continue at higher domains
 */
static int load_balance(int this_cpu, struct rq *this_rq,
struct sched_domain *sd, enum cpu_idle_type idle,
int *continue_balancing)
{
int ld_moved, cur_ld_moved, active_balance = 0;
struct sched_domain *sd_parent = sd->parent;
struct sched_group *group;
struct rq *busiest;
struct rq_flags rf;
struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask); /* per-CPU load_balance_mask; preemption is already disabled here; note this is a pointer to the mask */
struct lb_env env = {
.sd = sd,
.dst_cpu = this_cpu,
.dst_rq = this_rq,
.dst_grpmask = sched_group_span(sd->groups), /* cpumask of the domain's first sched_group (the local group); the groups are linked in a circular list */
.idle = idle,
.loop_break = sched_nr_migrate_break,
.cpus = cpus,
.fbq_type = all,
.tasks = LIST_HEAD_INIT(env.tasks),
};
cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask); /* load_balance_mask = CPUs spanned by this domain AND currently active CPUs; note sched_domain_span() covers the whole domain, whereas dst_grpmask covers only the local group */
schedstat_inc(sd->lb_count[idle]); /* bump the load_balance() count for this idle type */
redo:
if (!should_we_balance(&env)) { /* should this CPU balance? yes if dst_cpu is an idle CPU or the first CPU of its sched_group -- core function 1 */
*continue_balancing = 0;
goto out_balanced;
}
group = find_busiest_group(&env); /* returns the busiest group and computes how much has to be migrated */
if (!group) {
schedstat_inc(sd->lb_nobusyg[idle]);
goto out_balanced;
}
busiest = find_busiest_queue(&env, group);
if (!busiest) {
schedstat_inc(sd->lb_nobusyq[idle]);
goto out_balanced;
}
BUG_ON(busiest == env.dst_rq);
schedstat_add(sd->lb_imbalance[idle], env.imbalance);
env.src_cpu = busiest->cpu;
env.src_rq = busiest;
ld_moved = 0;
if (busiest->nr_running > 1) {
/*
* Attempt to move tasks. If find_busiest_group has found
* an imbalance but busiest->nr_running <= 1, the group is
* still unbalanced. ld_moved simply stays zero, so it is
* correctly treated as an imbalance.
*/
env.flags |= LBF_ALL_PINNED;
env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
more_balance:
rq_lock_irqsave(busiest, &rf);
update_rq_clock(busiest);
/*
* cur_ld_moved - load moved in current iteration
* ld_moved - cumulative load moved across iterations
*/
cur_ld_moved = detach_tasks(&env); /* Detach tasks from src_rq. Cache-hot tasks, tasks excluded by cpus_allowed and the currently running task are skipped. Each detached task's load is subtracted from env->imbalance, the task is marked TASK_ON_RQ_MIGRATING and put on the env->tasks list. When loop exceeds loop_break, LBF_NEED_BREAK is set so at most sched_nr_migrate_break (32) tasks are moved per burst. If some tasks cannot migrate because of affinity, a candidate new_dst_cpu is recorded and LBF_DST_PINNED is set. */
/*
* We've detached some tasks from busiest_rq. Every
* task is masked "TASK_ON_RQ_MIGRATING", so we can safely
* unlock busiest->lock, and we are able to be sure
* that nobody can manipulate the tasks in parallel.
* See task_rq_lock() family for the details.
*/
rq_unlock(busiest, &rf);
if (cur_ld_moved) {
attach_tasks(&env);//将env->tasks上的任务挂在dst_rq之上
ld_moved += cur_ld_moved;
}
local_irq_restore(rf.flags);
if (env.flags & LBF_NEED_BREAK) {
env.flags &= ~LBF_NEED_BREAK;
goto more_balance;
}
/*
* Revisit (affine) tasks on src_cpu that couldn't be moved to
* us and move them to an alternate dst_cpu in our sched_group
* where they can run. The upper limit on how many times we
* iterate on same src_cpu is dependent on number of CPUs in our
* sched_group.
*
* This changes load balance semantics a bit on who can move
* load to a given_cpu. In addition to the given_cpu itself
* (or a ilb_cpu acting on its behalf where given_cpu is
* nohz-idle), we now have balance_cpu in a position to move
* load to given_cpu. In rare situations, this may cause
* conflicts (balance_cpu and given_cpu/ilb_cpu deciding
* _independently_ and at _same_ time to move some load to
* given_cpu) causing exceess load to be moved to given_cpu.
* This however should not happen so much in practice and
* moreover subsequent load balance cycles should correct the
* excess load moved.
*/
if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { /* migration done but not enough: some tasks were pinned w.r.t. dst_cpu and a new_dst_cpu was recorded, so retry towards it */
/* Prevent to re-select dst_cpu via env's CPUs */
cpumask_clear_cpu(env.dst_cpu, env.cpus);
env.dst_rq = cpu_rq(env.new_dst_cpu);
env.dst_cpu = env.new_dst_cpu;
env.flags &= ~LBF_DST_PINNED;
env.loop = 0;
env.loop_break = sched_nr_migrate_break;
/*
* Go back to "more_balance" rather than "redo" since we
* need to continue with same src_cpu.
*/
goto more_balance;
}
/*
* We failed to reach balance because of affinity.
*/
if (sd_parent) { /* there is a parent domain, e.g. we are balancing at the MC level within one cluster */
int *group_imbalance = &sd_parent->groups->sgc->imbalance;
if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) /* LBF_SOME_PINNED is set in detach_tasks(): some tasks could not move because of affinity, and the ones that could were not enough to reach balance */
*group_imbalance = 1; /* balance could not be completed inside this cluster; let the parent (DIE) level try */
}
/* All tasks on this runqueue were pinned by CPU affinity */
if (unlikely(env.flags & LBF_ALL_PINNED)) { /* drop this CPU from the candidate set and redo the balance */
cpumask_clear_cpu(cpu_of(busiest), cpus);
/*
* Attempting to continue load balancing at the current
* sched_domain level only makes sense if there are
* active CPUs remaining as possible busiest CPUs to
* pull load from which are not contained within the
* destination group that is receiving any migrated
* load.
*/
if (!cpumask_subset(cpus, env.dst_grpmask)) {
env.loop = 0;
env.loop_break = sched_nr_migrate_break;
goto redo;
}
goto out_all_pinned;
}
}
if (!ld_moved) {
schedstat_inc(sd->lb_failed[idle]); /* count failed balance attempts per idle type */
/*
* Increment the failure counter only on periodic balance.
* We do not want newidle balance, which can be very
* frequent, pollute the failure counter causing
* excessive cache_hot migrations and active balances.
*/
if (idle != CPU_NEWLY_IDLE)
sd->nr_balance_failed++; /* failure count, not incremented for newly-idle balancing */
if (need_active_balance(&env)) { /* forced when a newly-idle dst CPU is preferred by asym packing, when the busiest CPU's CFS capacity is badly reduced, or when there have been too many failures */
unsigned long flags;
raw_spin_lock_irqsave(&busiest->lock, flags);
/*
* Don't kick the active_load_balance_cpu_stop,
* if the curr task on busiest CPU can't be
* moved to this_cpu:
* Re-check whether the busiest CPU's current task is allowed to run on this_cpu.
*/
if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
raw_spin_unlock_irqrestore(&busiest->lock,
flags);
env.flags |= LBF_ALL_PINNED;
goto out_one_pinned;
}
/*
* ->active_balance synchronizes accesses to
* ->active_balance_work. Once set, it's cleared
* only after active load balance is finished.
* The task may run there: mark an active balance as pending.
*/
if (!busiest->active_balance) {
busiest->active_balance = 1;
busiest->push_cpu = this_cpu;
active_balance = 1;
}
raw_spin_unlock_irqrestore(&busiest->lock, flags);
if (active_balance) {
stop_one_cpu_nowait(cpu_of(busiest),
active_load_balance_cpu_stop, busiest,
&busiest->active_balance_work);
}
/* We've kicked active balancing, force task migration. */
sd->nr_balance_failed = sd->cache_nice_tries+1;
}
} else
sd->nr_balance_failed = 0;
if (likely(!active_balance)) { /* update balance_interval */
/* We were unbalanced, so reset the balancing interval */
sd->balance_interval = sd->min_interval;
} else {
/*
* If we've begun active balancing, start to back off. This
* case may not be covered by the all_pinned logic if there
* is only 1 task on the busy runqueue (because we don't call
* detach_tasks).
*/
if (sd->balance_interval < sd->max_interval)
sd->balance_interval *= 2; /* double the balance interval */
}
goto out;
out_balanced:
/*
* We reach balance although we may have faced some affinity
* constraints. Clear the imbalance flag if it was set.
*/
if (sd_parent) {
int *group_imbalance = &sd_parent->groups->sgc->imbalance;
if (*group_imbalance)
*group_imbalance = 0;
}
out_all_pinned:
/*
* We reach balance because all tasks are pinned at this level so
* we can't migrate them. Let the imbalance flag set so parent level
* can try to migrate them.
*/
schedstat_inc(sd->lb_balanced[idle]);
sd->nr_balance_failed = 0;
out_one_pinned:
/* tune up the balancing interval */
if (((env.flags & LBF_ALL_PINNED) &&
sd->balance_interval < MAX_PINNED_INTERVAL) ||
(sd->balance_interval < sd->max_interval))
sd->balance_interval *= 2;
ld_moved = 0;
out:
return ld_moved;
}
Called core function 1: should_we_balance()
Core data structure
enum cpu_idle_type {
CPU_IDLE,
CPU_NOT_IDLE,
CPU_NEWLY_IDLE,
CPU_MAX_IDLE_TYPES
};
Core helper function
/*
* Build the balance mask; it contains only those CPUs that can arrive at this
* group and should be considered to continue balancing.
*
* We do this during the group creation pass, therefore the group information
* isn't complete yet, however since each group represents a (child) domain we
* can fully construct this using the sched_domain bits (which are already
* complete).
*/
static void
build_balance_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask)
{
const struct cpumask *sg_span = sched_group_span(sg);
struct sd_data *sdd = sd->private;
struct sched_domain *sibling;
int i;
cpumask_clear(mask);
for_each_cpu(i, sg_span) {
sibling = *per_cpu_ptr(sdd->sd, i);
/*
* Can happen in the asymmetric case, where these siblings are
* unused. The mask will not be empty because those CPUs that
* do have the top domain _should_ span the domain.
*/
if (!sibling->child)
continue;
/* If we would not end up here, we can't continue from here */
if (!cpumask_equal(sg_span, sched_domain_span(sibling->child)))
continue;
cpumask_set_cpu(i, mask);
}
/* We must not have empty masks here */
WARN_ON_ONCE(cpumask_empty(mask));
}
Core function
/*
 * At the DIE level only an idle CPU or the first CPU of a group may initiate
 * balancing: with two groups at the DIE level and each CPU in exactly one
 * group, only CPU 0 and CPU 4 can initiate it on the example topology.
 * At the MC level every CPU may initiate balancing.
 */
static int should_we_balance(struct lb_env *env)
{
struct sched_group *sg = env->sd->groups;
int cpu, balance_cpu = -1;
/*
* Ensure the balancing environment is consistent; can happen
* when the softirq triggers 'during' hotplug.
*/
if (!cpumask_test_cpu(env->dst_cpu, env->cpus)) /* dst_cpu must be among the CPUs involved in this balance */
return 0;
/*
* In the newly idle case, we will allow all the CPUs
* to do the newly idle load balance.
*/
if (env->idle == CPU_NEWLY_IDLE) /* a newly idle CPU is always allowed to balance */
return 1;
/* Try to find first idle CPU */
for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) { /* group_balance_mask() turns the group's balance bitmap into a struct cpumask * */
if (!idle_cpu(cpu)) /* a CPU counts as idle if it is running the idle task and (on this kernel) its wake_list is empty */
continue;
balance_cpu = cpu;
break;
}
if (balance_cpu == -1)
balance_cpu = group_balance_cpu(sg); /* no idle CPU found: fall back to the first CPU of the group's balance mask */
/*
 * First idle CPU or the first CPU(busiest) in this sched group
 * is eligible for doing load balancing at this and above domains.
 * In other words: dst_cpu may initiate balancing if it is the first idle CPU
 * of the group, or, when the group has no idle CPU, if it is the group's
 * first CPU.
 */
return balance_cpu == env->dst_cpu;
}
Called core function 2: find_busiest_group()
Core data structures
/*
 * sg_lb_stats - stats of a sched_group required for load_balancing.
 * Describes one sched_group during balancing: average load, total load,
 * total weighted load, load per task, capacity, utilization and so on.
 */
struct sg_lb_stats {
unsigned long avg_load; /*Avg load across the CPUs of the group */
unsigned long group_load; /* Total load over the CPUs of the group */
unsigned long sum_weighted_load; /* Weighted load of group's tasks */
unsigned long load_per_task;
unsigned long group_capacity; /* sum of the capacity available for CFS tasks over all CPUs of the sg */
unsigned long group_util; /* Total utilization of the group */
unsigned int sum_nr_running; /* Nr of CFS tasks running in the group (sum of cfs.h_nr_running) */
unsigned int idle_cpus; /* number of idle CPUs in the sg */
unsigned int group_weight; /* number of CPUs in the sg */
enum group_type group_type; /* state of this sg w.r.t. load balancing */
int group_no_capacity;
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
#endif
};
/*
 * sd_lb_stats - Structure to store the statistics of a sched_domain
 * during load balancing: total load, total capacity, average load, etc.
 */
struct sd_lb_stats {
struct sched_group *busiest; /* Busiest group in this sd */
struct sched_group *local; /* Local group in this sd;dst_cpu in this group*/
unsigned long total_running;
unsigned long total_load; /* Total load of all groups in sd */
unsigned long total_capacity; /* Total capacity of all groups in sd */
unsigned long avg_load; /* Average load across all groups in sd */
/* marks that tasks should preferably go to a CPU of the same cluster first */
//unsigned int prefer_sibling;
struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
struct sg_lb_stats local_stat; /* Statistics of the local group */
};
Core helper functions
/**
* fix_small_imbalance - Calculate the minor imbalance that exists
* amongst the groups of a sched_domain, during
* load balancing.
* @env: The load balancing environment.
* @sds: Statistics of the sched_domain whose imbalance is to be calculated.
*/
static inline
void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
{
unsigned long tmp, capa_now = 0, capa_move = 0;
unsigned int imbn = 2;
unsigned long scaled_busy_load_per_task;
struct sg_lb_stats *local, *busiest;
local = &sds->local_stat;
busiest = &sds->busiest_stat;
if (!local->sum_nr_running)
local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
else if (busiest->load_per_task > local->load_per_task)
imbn = 1;
scaled_busy_load_per_task =
(busiest->load_per_task * SCHED_CAPACITY_SCALE) /
busiest->group_capacity;
if (busiest->avg_load + scaled_busy_load_per_task >=
local->avg_load + (scaled_busy_load_per_task * imbn)) {
env->imbalance = busiest->load_per_task;
return;
}
/*
* OK, we don't have enough imbalance to justify moving tasks,
* however we may be able to increase total CPU capacity used by
* moving them.
*/
capa_now += busiest->group_capacity *
min(busiest->load_per_task, busiest->avg_load);
capa_now += local->group_capacity *
min(local->load_per_task, local->avg_load);
capa_now /= SCHED_CAPACITY_SCALE;
/* Amount of load we'd subtract */
if (busiest->avg_load > scaled_busy_load_per_task) {
capa_move += busiest->group_capacity *
min(busiest->load_per_task,
busiest->avg_load - scaled_busy_load_per_task);
}
/* Amount of load we'd add */
if (busiest->avg_load * busiest->group_capacity <
busiest->load_per_task * SCHED_CAPACITY_SCALE) {
tmp = (busiest->avg_load * busiest->group_capacity) /
local->group_capacity;
} else {
tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
local->group_capacity;
}
capa_move += local->group_capacity *
min(local->load_per_task, local->avg_load + tmp);
capa_move /= SCHED_CAPACITY_SCALE;
/* Move if we gain throughput */
if (capa_move > capa_now)
env->imbalance = busiest->load_per_task;
}
/**
* calculate_imbalance - Calculate the amount of imbalance present within the
* groups of a given sched_domain during load balance.
* @env: load balance environment
* @sds: statistics of the sched_domain whose imbalance is to be calculated.
*/
static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
{
unsigned long max_pull, load_above_capacity = ~0UL;
struct sg_lb_stats *local, *busiest;
local = &sds->local_stat;
busiest = &sds->busiest_stat;
/* the imbalance is caused by CPU affinity */
if (busiest->group_type == group_imbalanced) {
/*
* In the group_imb case we cannot rely on group-wide averages
* to ensure CPU-load equilibrium, look at wider averages. XXX
*/
busiest->load_per_task =
min(busiest->load_per_task, sds->avg_load);
}
/*
* Avg load of busiest sg can be less and avg load of local sg can
* be greater than avg load across all sgs of sd because avg load
* factors in sg capacity and sgs with smaller group_type are
* skipped when updating the busiest sg:
*/
if (busiest->avg_load <= sds->avg_load ||
local->avg_load >= sds->avg_load) {
env->imbalance = 0;
return fix_small_imbalance(env, sds);
}
/*
* If there aren't any idle CPUs, avoid creating some.
* Only the load that exceeds the busiest group's capacity should be pulled.
*/
if (busiest->group_type == group_overloaded &&
local->group_type == group_overloaded) {
load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
if (load_above_capacity > busiest->group_capacity) {
load_above_capacity -= busiest->group_capacity;
load_above_capacity *= scale_load_down(NICE_0_LOAD);
load_above_capacity /= busiest->group_capacity;
} else
load_above_capacity = ~0UL;
}
/*
* We're trying to get all the CPUs to the average_load, so we don't
* want to push ourselves above the average load, nor do we wish to
* reduce the max loaded CPU below the average load. At the same time,
* we also don't want to reduce the group load below the group
* capacity. Thus we look for the minimum possible imbalance.
*/
max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
/* How much load to actually move to equalise the imbalance */
env->imbalance = min(
max_pull * busiest->group_capacity,
(sds->avg_load - local->avg_load) * local->group_capacity
) / SCHED_CAPACITY_SCALE;
/*
* if *imbalance is less than the average load per runnable task
* there is no guarantee that any tasks will be moved so we'll have
* a think about bumping its value to force at least one task to be
* moved
*/
if (env->imbalance < busiest->load_per_task)
return fix_small_imbalance(env, sds);
}
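A worked example may make the min() above clearer. All numbers below are made up; SCHED_CAPACITY_SCALE is 1024:
/*
 * Hypothetical numbers:
 *   busiest: avg_load = 1400, group_capacity = 1024
 *   local:   avg_load =  600, group_capacity = 1024
 *   sds:     avg_load = 1000, load_above_capacity = ~0UL (not both overloaded)
 *
 *   max_pull  = min(1400 - 1000, ~0UL) = 400
 *   imbalance = min(400 * 1024, (1000 - 600) * 1024) / 1024 = 400
 *
 * i.e. pull just enough load to bring the busiest group down towards the
 * domain average without pushing the local group above that average.
 */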
/**
* update_sd_pick_busiest - return 1 on busiest group
* @env: The load balancing environment.
* @sds: sched_domain statistics
* @sg: sched_group candidate to be checked for being the busiest
* @sgs: sched_group statistics
*
* Determine if @sg is a busier group than the previously selected
* busiest group.
*
* Return: %true if @sg is a busier group than the previously selected
* busiest group. %false otherwise.
*/
static bool update_sd_pick_busiest(struct lb_env *env,
struct sd_lb_stats *sds,
struct sched_group *sg,
struct sg_lb_stats *sgs)
{
struct sg_lb_stats *busiest = &sds->busiest_stat;
if (sgs->group_type > busiest->group_type) /* group_type ordering: group_other < group_imbalanced < group_overloaded */
return true;
if (sgs->group_type < busiest->group_type)
return false;
if (sgs->avg_load <= busiest->avg_load)
return false;
if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
goto asym_packing;
/*
* Candidate sg has no more than one task per CPU and
* has higher per-CPU capacity. Migrating tasks to less
* capable CPUs may harm throughput. Maximize throughput,
* power/energy consequences are not considered.
*/
if (sgs->sum_nr_running <= sgs->group_weight &&
group_smaller_cpu_capacity(sds->local, sg)) /* true when sg's min_capacity * 1024 exceeds local's min_capacity * capacity_margin (1280), i.e. sg's CPUs are more than ~1.25x as capable as the local group's; such a group with at most one task per CPU is not picked as busiest (see the sketch after this function) */
return false;
asym_packing:
/* This is the busiest node in its class. */
if (!(env->sd->flags & SD_ASYM_PACKING))
return true;
/* No ASYM_PACKING if target CPU is already busy */
if (env->idle == CPU_NOT_IDLE) /* dst CPU is already busy: do not apply ASYM_PACKING */
return true;
/*
* ASYM_PACKING needs to move all the work to the highest
* prority CPUs in the group, therefore mark all groups
* of lower priority than ourself as busy.
* CPUs also have priorities: marking lower-priority groups as busy lets the current (higher-priority) CPU be preferred as the pull target.
*/
if (sgs->sum_nr_running &&
sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) {
if (!sds->busiest)
return true;
/* Prefer to move from lowest priority CPU's work */
if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
sg->asym_prefer_cpu))
return true;
}
return false;
}
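For reference, the capacity check above relies on a helper that, in kernels of this vintage, looks roughly like this (capacity_margin defaults to 1280, i.e. a 1.25x margin):
/*
 * group_smaller_cpu_capacity: returns true if sched_group sg has smaller
 * per-CPU capacity than sched_group ref, with capacity_margin (1280/1024)
 * of headroom.
 */
static inline bool
group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
{
	return sg->sgc->min_capacity * capacity_margin <
						ref->sgc->min_capacity * 1024;
}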
static inline void update_blocked_averages(int cpu)
{
struct rq *rq = cpu_rq(cpu);
struct cfs_rq *cfs_rq = &rq->cfs;
const struct sched_class *curr_class;
struct rq_flags rf;
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq); /* refresh rq->clock and rq->clock_task; rq->clock follows the current jiffies-based time */
update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); /* update the cfs_rq util and load averages (may also trigger a frequency update), decaying the contribution of blocked tasks */
curr_class = rq->curr->sched_class;
update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class);
update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class);
update_irq_load_avg(rq, 0);
#ifdef CONFIG_NO_HZ_COMMON
rq->last_blocked_load_update_tick = jiffies;
if (!cfs_rq_has_blocked(cfs_rq) && !others_have_blocked(rq)) /* cfs_rq_has_blocked() checks for remaining cfs blocked load; others_have_blocked() checks rt/dl (and irq) */
rq->has_blocked_load = 0; /* nothing left to decay: clear has_blocked_load */
#endif
rq_unlock_irqrestore(rq, &rf);
}
/*
 * update_nohz_stats - force: update even if the last update was recent.
 * Returns true if the rq still has blocked load to decay, false otherwise.
 */
static bool update_nohz_stats(struct rq *rq, bool force)
{
#ifdef CONFIG_NO_HZ_COMMON
unsigned int cpu = rq->cpu;
if (!rq->has_blocked_load) /* has_blocked_load: whether this idle CPU still has blocked load to decay; a kicked idle CPU is not always asked to run a full balance, sometimes only to refresh blocked load. Nothing to do if the rq has none. */
return false;
if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) /* nohz.idle_cpus_mask records the (nohz-)idle CPUs in the system */
return false;
if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick)) /* last_blocked_load_update_tick is refreshed in update_blocked_averages() */
return true; /* updated very recently, skip */
update_blocked_averages(cpu); /* decay this CPU's blocked load */
return rq->has_blocked_load; /* true if blocked load remains, false if it has fully decayed */
#else
return false;
#endif
}
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
unsigned long capacity = scale_rt_capacity(sd, cpu); /* capacity left for CFS tasks on this CPU after subtracting rt/dl (and irq) pressure */
struct sched_group *sdg = sd->groups;
cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu); /* per-CPU cpu_scale value */
if (!capacity)
capacity = 1;
cpu_rq(cpu)->cpu_capacity = capacity;
sdg->sgc->capacity = capacity;
sdg->sgc->min_capacity = capacity;
}
/*
 * Update the capacity statistics of the domain's groups. Each CPU has its own
 * sd instance whose groups list starts with the group that contains it; the
 * groups are linked in a ring.
 * On the example platform the DIE domain has three groups and there are two MC
 * domains (the prime core does not form an MC domain); each MC domain spans one
 * cluster and every CPU of that cluster is a single-CPU group. Updating an MC
 * group therefore means updating one CPU's capacity; updating a DIE group means
 * summing the capacities of the CPUs (i.e. of the MC-level groups) it contains.
 */
void update_group_capacity(struct sched_domain *sd, int cpu)
{
struct sched_domain *child = sd->child;
struct sched_group *group, *sdg = sd->groups;
unsigned long capacity, min_capacity;
unsigned long interval;
interval = msecs_to_jiffies(sd->balance_interval); /* the domain's balance interval (ms), converted to jiffies */
interval = clamp(interval, 1UL, max_load_balance_interval); /* clamp to [1, max_load_balance_interval]: overly large settings fall back to the cap, and at least one jiffy is always used */
sdg->sgc->next_update = jiffies + interval; /* time of the next group-capacity update */
if (!child) { /* no child domain: we are at the MC level, where every group is a single CPU */
update_cpu_capacity(sd, cpu); /* update this CPU's capacity and min_capacity */
return;
}
/* DIE level */
capacity = 0;
min_capacity = ULONG_MAX;
if (child->flags & SD_OVERLAP) { /* sched_domains of this level overlap */
/*
* SD_OVERLAP domains cannot assume that child groups
* span the current group.
*/
for_each_cpu(cpu, sched_group_span(sdg)) { /* CPUs of this sched_group */
struct sched_group_capacity *sgc;
struct rq *rq = cpu_rq(cpu);
/*
* build_sched_domains() -> init_sched_groups_capacity()
* gets here before we've attached the domains to the
* runqueues.
*
* Use capacity_of(), which is set irrespective of domains
* in update_cpu_capacity().
*
* This avoids capacity from being 0 and
* causing divide-by-zero issues on boot.
*/
if (unlikely(!rq->sd)) {
capacity += capacity_of(cpu);
} else {
sgc = rq->sd->groups->sgc;
capacity += sgc->capacity; /* capacity available for CFS tasks */
}
min_capacity = min(capacity, min_capacity); /* smallest usable capacity seen in this group */
}
} else {
/*
* !SD_OVERLAP domains can assume that child groups
* span the current group.
*/
group = child->groups; /* child is the MC level, where each group is a single CPU; the loop below walks one cluster and sums its capacity: one MC domain corresponds to one DIE-level group */
do {
struct sched_group_capacity *sgc = group->sgc;
capacity += sgc->capacity;
min_capacity = min(sgc->min_capacity, min_capacity);
group = group->next;
} while (group != child->groups);
}
sdg->sgc->capacity = capacity; /* total CFS capacity of the cluster covered by this DIE-level group */
sdg->sgc->min_capacity = min_capacity; /* smallest per-CPU capacity within this group */
}
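As an illustration, on a hypothetical 4 little + 4 big system (all capacity values made up) the two levels end up with numbers like these:
/*
 * MC level, little cluster: one sg per CPU, sgc->capacity ~ 430 each
 * MC level, big cluster:    one sg per CPU, sgc->capacity ~ 1024 each
 * DIE level: one sg per cluster:
 *   little sg: sgc->capacity = 4 * 430  = 1720, sgc->min_capacity = 430
 *   big sg:    sgc->capacity = 4 * 1024 = 4096, sgc->min_capacity = 1024
 * (capacity here is what is left for CFS after rt/dl/irq pressure has been
 *  removed, so real numbers are usually somewhat lower.)
 */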
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
* @group: sched_group whose statistics are to be updated (a group of the domain being balanced).
* @load_idx: Load index of sched_domain of this_cpu for load calc.
* @local_group: Does group contain this_cpu.
* @sgs: variable to hold the statistics for this group.
* @overload: Indicate more than one runnable task for any CPU.
*/
static inline void update_sg_lb_stats(struct lb_env *env,
struct sched_group *group, int load_idx,
int local_group, struct sg_lb_stats *sgs,
bool *overload)
{
unsigned long load;
int i, nr_running;
memset(sgs, 0, sizeof(*sgs));
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
struct rq *rq = cpu_rq(i);
if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false)) /* LBF_NOHZ_STATS is set when there are nohz-idle CPUs with blocked load. update_nohz_stats() skips rqs whose blocked load was refreshed recently; it returns true if the rq still has blocked load, in which case LBF_NOHZ_AGAIN marks that another nohz update will be needed */
env->flags |= LBF_NOHZ_AGAIN;
/* Bias balancing toward CPUs of our domain: */
if (local_group)
load = target_load(i, load_idx);
else
load = source_load(i, load_idx); /* source_load()/target_load() bias weighted_cpuload() down/up using rq->cpu_load[load_idx]; with load_idx == 0 both simply return cfs_rq->avg.runnable_load_avg */
sgs->group_load += load;
sgs->group_util += cpu_util(i);//amount of capacity of a cpu that is (estimated to be) used by cfs task.
sgs->sum_nr_running += rq->cfs.h_nr_running;
nr_running = rq->nr_running;
if (nr_running > 1)
*overload = true;
#ifdef CONFIG_NUMA_BALANCING
sgs->nr_numa_running += rq->nr_numa_running;
sgs->nr_preferred_running += rq->nr_preferred_running;
#endif
sgs->sum_weighted_load += weighted_cpuload(rq); /* cfs_rq->avg.runnable_load_avg */
/*
* No need to call idle_cpu() if nr_running is not 0
*/
if (!nr_running && idle_cpu(i))
sgs->idle_cpus++;
}
/* Adjust by relative CPU capacity of the group */
sgs->group_capacity = group->sgc->capacity;
sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
if (sgs->sum_nr_running)
sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
sgs->group_weight = group->group_weight;
sgs->group_no_capacity = group_is_overloaded(env, sgs); /* is the group overloaded? */
sgs->group_type = group_classify(group, sgs); /* current state of the group */
}
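The capacity scaling of avg_load is easiest to see with made-up numbers:
/*
 * Hypothetical numbers:
 *   group_load = 800, group_capacity = 512 (e.g. two small CPUs of 256 each)
 *   avg_load   = 800 * 1024 / 512 = 1600
 * Load is normalised by capacity, so a weaker group carrying the same raw load
 * looks "busier" than a stronger one.
 *   sum_weighted_load = 800, sum_nr_running = 4
 *   load_per_task     = 800 / 4 = 200
 */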
/**
* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
* @env: The load balancing environment.
* @sds: variable to hold the statistics for this sched_domain.
*/
static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
{
struct sched_domain *child = env->sd->child; /* next lower domain level */
struct sched_group *sg = env->sd->groups; /* first sched_group of this domain (the local group) */
struct sg_lb_stats *local = &sds->local_stat; /* statistics of the local group */
struct sg_lb_stats tmp_sgs;
int load_idx, prefer_sibling = 0;
bool overload = false;
if (child && child->flags & SD_PREFER_SIBLING) /* a child domain exists and prefers to place tasks in a sibling domain */
prefer_sibling = 1;
#ifdef CONFIG_NO_HZ_COMMON
if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked)) /* there are nohz-idle CPUs whose blocked load still needs decaying */
env->flags |= LBF_NOHZ_STATS;
#endif
load_idx = get_sd_load_idx(env->sd, env->idle); /* pick the load index for this idle type: busy_idx, newidle_idx or idle_idx */
do {
struct sg_lb_stats *sgs = &tmp_sgs;
int local_group;
local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg)); /* does dst_cpu belong to this group? */
if (local_group) {
sds->local = sg;
sgs = local;
if (env->idle != CPU_NEWLY_IDLE ||
time_after_eq(jiffies, sg->sgc->next_update)) /* refresh the group capacity unless this is a newly-idle balance whose next_update time has not yet been reached */
update_group_capacity(env->sd, env->dst_cpu); /* update capacity and min_capacity (CFS capacity) of the domain's groups */
}
update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
&overload);
if (local_group)
goto next_group;
/*
* In case the child domain prefers tasks go to siblings
* first, lower the sg capacity so that we'll try
* and move all the excess tasks away. We lower the capacity
* of a group only if the local group has the capacity to fit
* these excess tasks. The extra check prevents the case where
* you always pull from the heaviest group when it is already
* under-utilized (possible with a large weight task outweighs
* the tasks on the system).
*/
if (prefer_sibling && sds->local &&
group_has_capacity(env, local) &&
(sgs->sum_nr_running > local->sum_nr_running + 1)) {
sgs->group_no_capacity = 1;
sgs->group_type = group_classify(sg, sgs);
}
if (update_sd_pick_busiest(env, sds, sg, sgs)) { /* update the busiest group of the domain, comparing group_type, avg_load and the asym-packing CPU priority */
sds->busiest = sg;
sds->busiest_stat = *sgs;
}
next_group:
/* Now, start updating sd_lb_stats */
sds->total_running += sgs->sum_nr_running;
sds->total_load += sgs->group_load;
sds->total_capacity += sgs->group_capacity;
sg = sg->next;
} while (sg != env->sd->groups);
#ifdef CONFIG_NO_HZ_COMMON
if ((env->flags & LBF_NOHZ_AGAIN) &&
cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
WRITE_ONCE(nohz.next_blocked,
jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
}
#endif
if (env->sd->flags & SD_NUMA)
env->fbq_type = fbq_classify_group(&sds->busiest_stat);
if (!env->sd->parent) {
/* update overload indicator if we are at root domain */
if (env->dst_rq->rd->overload != overload)
env->dst_rq->rd->overload = overload;
}
}
Core function
/**
* find_busiest_group - Returns the busiest group within the sched_domain
* if there is an imbalance.
*
* Also calculates the amount of weighted load which should be moved
* to restore balance.
*
* @env: The load balancing environment.
*
* Return: - The busiest group if imbalance exists.
*/
static struct sched_group *find_busiest_group(struct lb_env *env)
{
struct sg_lb_stats *local, *busiest; /* statistics of the local group and of the busiest group in this domain */
struct sd_lb_stats sds;
init_sd_lb_stats(&sds);
/*
 * Compute the various statistics relevant for load balancing at
 * this level.
 * Load keeps changing, so before searching for the busiest group the sd
 * statistics are refreshed: load and capacity of every sg are updated, giving
 * the balance information of the local group and of the busiest non-local
 * group, on which the balancing decision is then based.
 */
update_sd_lb_stats(env, &sds); /* refresh the domain-wide statistics */
local = &sds.local_stat;
busiest = &sds.busiest_stat;
/* ASYM feature bypasses nice load balance check:
 * asymmetric packing moves work from lower-priority CPUs to higher-priority
 * ones (usually, the lower the CPU number, the higher the priority).
 */
if (check_asym_packing(env, &sds))
return sds.busiest;
/* There is no busy sibling group to pull tasks from */
if (!sds.busiest || busiest->sum_nr_running == 0)
goto out_balanced;
/* XXX broken for overlapping NUMA groups */
sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
/ sds.total_capacity;
/*
* If the busiest group is imbalanced the below checks don't
* work because they assume all things are equal, which typically
* isn't true due to cpus_allowed constraints and the like.
* The busiest group's imbalance was caused by cpu affinity; the flag was set when an MC-level balance could not be completed.
*/
if (busiest->group_type == group_imbalanced)
goto force_balance;
/*
* When dst_cpu is idle, prevent SMP nice and/or asymmetric group
* capacities from resulting in underutilization due to avg_load.
*/
if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
busiest->group_no_capacity)
goto force_balance;
/*
* If the local group is busier than the selected busiest group
* don't try and pull any tasks.
*/
if (local->avg_load >= busiest->avg_load)
goto out_balanced;
/*
* Don't pull any tasks if this group is already above the domain
* average load.
*/
if (local->avg_load >= sds.avg_load)
goto out_balanced;
if (env->idle == CPU_IDLE) {
/*
* This CPU is idle. If the busiest group is not overloaded
* and there is no imbalance between this and busiest group
* wrt idle CPUs, it is balanced. The imbalance becomes
* significant if the diff is greater than 1 otherwise we
* might end up to just move the imbalance on another group
*/
if ((busiest->group_type != group_overloaded) &&
(local->idle_cpus <= (busiest->idle_cpus + 1)))
goto out_balanced;
} else {
/*
* In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
* imbalance_pct to be conservative.
* The busiest group is only slightly more loaded than the local one (imbalance_pct = 117 at the MC level); see the worked example after this function.
*/
if (100 * busiest->avg_load <=
env->sd->imbalance_pct * local->avg_load)
goto out_balanced;
}
force_balance:
/* Looks like there is an imbalance. Compute how much load to move, based on how severe the imbalance is. */
calculate_imbalance(env, &sds);
return env->imbalance ? sds.busiest : NULL;
out_balanced:
env->imbalance = 0;
return NULL;
}
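A small worked example of the imbalance_pct check above (numbers made up; the 117 value comes from the comment in the code and applies to MC-level domains):
/*
 *   local->avg_load   = 1000  ->  117 * 1000 = 117000
 *   busiest->avg_load = 1100  ->  100 * 1100 = 110000 <= 117000  -> out_balanced
 *   busiest->avg_load = 1200  ->  100 * 1200 = 120000 >  117000  -> fall through to force_balance
 * i.e. a non-idle CPU only pulls when the busiest group is more than ~17%
 * busier than the local group.
 */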
Called core function 3: find_busiest_queue()
/*
* find_busiest_queue - find the busiest runqueue among the CPUs in the group.
*/
static struct rq *find_busiest_queue(struct lb_env *env,
struct sched_group *group)
{
struct rq *busiest = NULL, *rq;
unsigned long busiest_load = 0, busiest_capacity = 1;
int i;
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
unsigned long capacity, wl;
enum fbq_type rt;
rq = cpu_rq(i);
rt = fbq_classify_rq(rq);
/*
* We classify groups/runqueues into three groups:
* - regular: there are !numa tasks
* - remote: there are numa tasks that run on the 'wrong' node
* - all: there is no distinction
*
* In order to avoid migrating ideally placed numa tasks,
* ignore those when there's better options.
*
* If we ignore the actual busiest queue to migrate another
* task, the next balance pass can still reduce the busiest
* queue by moving tasks around inside the node.
*
* If we cannot move enough load due to this classification
* the next pass will adjust the group classification and
* allow migration of more tasks.
*
* Both cases only affect the total convergence complexity.
*/
if (rt > env->fbq_type)
continue;
capacity = capacity_of(i);
wl = weighted_cpuload(rq);
/*
* When comparing with imbalance, use weighted_cpuload()
* which is not scaled with the CPU capacity.
*/
if (rq->nr_running == 1 && wl > env->imbalance &&
!check_cpu_capacity(rq, env->sd)) /* check_cpu_capacity(): true when rq->cpu_capacity * sd->imbalance_pct < rq->cpu_capacity_orig * 100, i.e. the capacity left for CFS is noticeably below the CPU's original capacity */
continue;
/*
* For the load comparisons with the other CPU's, consider
* the weighted_cpuload() scaled with the CPU capacity, so
* that the load can be moved away from the CPU that is
* potentially running at a lower capacity.
*
* Thus we're looking for max(wl_i / capacity_i), crosswise
* multiplication to rid ourselves of the division works out
* to: wl_i * capacity_j > wl_j * capacity_i; where j is
* our previous maximum.
*/
if (wl * busiest_capacity > busiest_load * capacity) { /* compares wl/capacity, where capacity is what is left for CFS; see the example after this function */
busiest_load = wl;
busiest_capacity = capacity;
busiest = rq;
}
}
return busiest;
}
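A quick example of the cross-multiplied wl/capacity comparison (all values made up):
/*
 *   candidate rq:       wl = 600, capacity = 512   -> 600/512  ~ 1.17
 *   current busiest rq: wl = 900, capacity = 1024  -> 900/1024 ~ 0.88
 *   check: 600 * 1024 > 900 * 512  (614400 > 460800)  -> candidate becomes busiest
 * Dividing the load by the capacity favours pulling from CPUs that are running
 * with less capacity available.
 */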
Structure diagram: scheduling domains and scheduling groups (figure not reproduced here)