【内核调度、负载均衡】【update_sd_lb_stats】

get_sd_load_idx

这里面涉及到了一个unsigned long cpu_load[CPU_LOAD_IDX_MAX];数组。scheduler会根据不同的load balance类型(busy,newly idle,idle)选择不同的load进行计算,主要是在update_sg_lb_stats时进行计算

/**
 * get_sd_load_idx - Obtain the load index for a given sched domain.
 * @sd: The sched_domain whose load_idx is to be obtained.
 * @idle: The idle status of the CPU for whose sd load_idx is obtained.
 *
 * The index selects which entry of the per-rq cpu_load[] history the
 * balancer will use; each balance type (busy / newly-idle / idle) has its
 * own tunable index stored on the domain.
 *
 * Return: The load index for the given balance type.
 */
static inline int get_sd_load_idx(struct sched_domain *sd,
					enum cpu_idle_type idle)
{
	if (idle == CPU_NOT_IDLE)
		return sd->busy_idx;

	if (idle == CPU_NEWLY_IDLE)
		return sd->newidle_idx;

	/* CPU_IDLE and anything else fall back to the idle index. */
	return sd->idle_idx;
}

set_sd_overutilized&clear_sd_overutilized

/* Mark this sched_domain as overutilized (some CPU's util exceeds its capacity). */
static void set_sd_overutilized(struct sched_domain *sd)
{
	/* Trace before writing so the event records the previous flag value. */
	trace_sched_overutilized(sd, sd->shared->overutilized, true);
	sd->shared->overutilized = true;
}

/* Clear the overutilized flag on this sched_domain. */
static void clear_sd_overutilized(struct sched_domain *sd)
{
	/* Trace before writing so the event records the previous flag value. */
	trace_sched_overutilized(sd, sd->shared->overutilized, false);
	sd->shared->overutilized = false;
}

update_sd_lb_stats

update_blocked_averages实际上可以视为更新cpu上entity的load和cpu的load值,详见

https://blog.csdn.net/feifei_csdn/article/details/107332155

/**
 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
 * @env: The load balancing environment.
 * @sds: variable to hold the statistics for this sched_domain.
 *
 * Walks every sched_group in env->sd, accumulates per-group statistics via
 * update_sg_lb_stats(), records the busiest group in @sds, and propagates
 * the overload/overutilized/misfit state up to the root domain or parent
 * domain as appropriate.
 */
static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
{
	struct sched_domain *child = env->sd->child;
	struct sched_group *sg = env->sd->groups;
	struct sg_lb_stats *local = &sds->local_stat;
	struct sg_lb_stats tmp_sgs;
	int load_idx;
	bool overload = false, overutilized = false, misfit_task = false;
	bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING;

#ifdef CONFIG_NO_HZ_COMMON /* usually enabled in common configs */
	if (env->idle == CPU_NEWLY_IDLE) {
		int cpu;

		/* Update the stats of NOHZ idle CPUs in the sd */
		for_each_cpu_and(cpu, sched_domain_span(env->sd),
				 nohz.idle_cpus_mask) {
			struct rq *rq = cpu_rq(cpu);

			/* ... Unless we've already done since the last tick */
			/* time_after(a, b) is true when a is later than b */
			if (time_after(jiffies, rq->last_blocked_load_update_tick))
				update_blocked_averages(cpu);
		}
	}
	/*
	 * If we've just updated all of the NOHZ idle CPUs, then we can push
	 * back the next nohz.next_update, which will prevent an unnecessary
	 * wakeup for the nohz stats kick
	 */
	if (cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd)))
		nohz.next_update = jiffies + LOAD_AVG_PERIOD;
#endif
	/* load_idx selects the rq->cpu_load[] entry used by update_sg_lb_stats() */
	load_idx = get_sd_load_idx(env->sd, env->idle);
	/* (7.3.1.2) Walk each sched_group in this domain's circular group list */
	do {
		struct sg_lb_stats *sgs = &tmp_sgs;
		int local_group;
		/* (7.3.1.3) If sg contains the current (dst) cpu, it is the
		 * "local group". The local group uses dedicated storage:
		 *   sds->local = sg;        // the local group itself
		 *   sgs = &sds->local_stat; // its statistics
		 */
		local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg));
		if (local_group) {
			sds->local = sg;
			sgs = local;
/* (7.3.1.4) Refresh the local group's capacity; refresh interval is
 * sg->sgc->next_update (driven by sd->balance_interval). Mainly to
 * subtract capacity currently consumed by RT activity.
 */
			if (env->idle != CPU_NEWLY_IDLE || time_after_eq(jiffies, sg->sgc->next_update))
				update_group_capacity(env->sd, env->dst_cpu);
		}
/* (7.3.1.5) Update this sched_group's load statistics:
 *   sgs:          where the group's stats are accumulated
 *   overload:     a rq has >1 runnable task, so something is waiting
 *   overutilized: a cpu's capacity < its utilization
 */
		update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
						&overload, &overutilized,
						&misfit_task);

		if (local_group)
			goto next_group;

		/*
		 * In case the child domain prefers tasks go to siblings
		 * first, lower the sg capacity so that we'll try
		 * and move all the excess tasks away. We lower the capacity
		 * of a group only if the local group has the capacity to fit
		 * these excess tasks. The extra check prevents the case where
		 * you always pull from the heaviest group when it is already
		 * under-utilized (possible with a large weight task outweighs
		 * the tasks on the system).
		 */
		 /* (7.3.1.7) With SD_PREFER_SIBLING set, the local group wants
		  * others to migrate tasks to it, so raise the other groups'
		  * migration priority by marking them as having no capacity.
		  */
		if (prefer_sibling && sds->local &&
		    group_has_capacity(env, local) &&
		    (sgs->sum_nr_running > local->sum_nr_running + 1)) {
			sgs->group_no_capacity = 1;
			sgs->group_type = group_classify(sg, sgs);
		}
 /* (7.3.1.8) Using the computed stats, track the busiest sg so far */
		if (update_sd_pick_busiest(env, sds, sg, sgs)) {
			sds->busiest = sg;
			sds->busiest_stat = *sgs;
		}

next_group:
		/* Now, start updating sd_lb_stats */
		/* (7.3.1.9) Accumulate domain-wide load/capacity totals */
		sds->total_running += sgs->sum_nr_running;
		sds->total_load += sgs->group_load;
		sds->total_capacity += sgs->group_capacity;
		sds->total_util += sgs->group_util;

		sg = sg->next;
	} while (sg != env->sd->groups);

	if (env->sd->flags & SD_NUMA)
		env->fbq_type = fbq_classify_group(&sds->busiest_stat);

	env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;

	if (!lb_sd_parent(env->sd)) {
		/* update overload indicator if we are at root domain */
		if (READ_ONCE(env->dst_rq->rd->overload) != overload)
			WRITE_ONCE(env->dst_rq->rd->overload, overload);
	}

	if (overutilized)
		set_sd_overutilized(env->sd);
	else
		clear_sd_overutilized(env->sd);

	/*
	 * If there is a misfit task in one cpu in this sched_domain
	 * it is likely that the imbalance cannot be sorted out among
	 * the cpu's in this sched_domain. In this case set the
	 * overutilized flag at the parent sched_domain.
	 */
	if (misfit_task) {
		struct sched_domain *sd = env->sd->parent;

		/*
		 * In case of a misfit task, load balance at the parent
		 * sched domain level will make sense only if the cpus
		 * have a different capacity. If cpus at a domain level have
		 * the same capacity, the misfit task cannot be well
		 * accommodated in any of the cpus and there is no point in
		 * trying a load balance at this level
		 */
		while (sd) {
			if (sd->flags & SD_ASYM_CPUCAPACITY) {
				set_sd_overutilized(sd);
				break;
			}
			sd = sd->parent;
		}
	}

	/*
	 * If the domain util is greater that domain capacity, load balancing
	 * needs to be done at the next sched domain level as well.
	 */
	if (lb_sd_parent(env->sd) &&
	    sds->total_capacity * 1024 < sds->total_util * capacity_margin)
		set_sd_overutilized(env->sd->parent);
}

如果是针对DIE层级,我们可以视为更新cluster组成的group,如果是MC层级,我们可以视为更新的是每个cpu对应的group

Sds->local

env->sd->groups

 

sds->total_running

+= env->sd->groups->sum_nr_running;

 

sds->total_load

+= env->sd->groups->group_load;

 

sds->total_capacity

+= env->sd->groups->group_capacity;

 

sds->total_util ;

+= env->sd->groups->group_util

 

sds->busiest

=env->sd->groups

 

sds->busiest_stat

=

update_sg_lb_stats(env, sg, load_idx, local_group, sgs

 

在这个函数中进行更新

 

 

 

 

 

 

update_group_capacity详见

https://blog.csdn.net/feifei_csdn/article/details/107318037

update_sg_lb_stat详见

https://blog.csdn.net/feifei_csdn/article/details/107310483

update_sd_pick_busiest

判断传入的group是否是最忙的group

group_has_capacity

/*
 * group_has_capacity returns true if the group has spare capacity that could
 * be used by some tasks.
 * We consider that a group has spare capacity if the number of tasks is
 * smaller than the number of CPUs or if the utilization is lower than the
 * available capacity for CFS tasks.
 * For the latter, we use a threshold to stabilize the state, to take into
 * account the variance of the tasks' load and to return true if the available
 * capacity is meaningful for the load balancer.
 * As an example, an available capacity of 1% can appear but it doesn't make
 * any benefit for the load balance.
 */
static inline bool
group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
{
	/* Fewer runnable tasks than CPUs: there is definitely room. */
	if (sgs->sum_nr_running < sgs->group_weight)
		return true;

	/*
	 * Otherwise require capacity to exceed utilization by the domain's
	 * imbalance_pct margin (larger pct = looser balancing condition).
	 */
	return (sgs->group_capacity * 100) >
			(sgs->group_util * env->sd->imbalance_pct);
}

update_sd_pick_busiest

 
/*
 * The margin used when comparing utilization with CPU capacity:
 * util * margin < capacity * 1024
 *
 * 1280/1024 = 1.25, i.e. capacity must exceed util by ~20% headroom
 * (default: ~20%)
 */
unsigned int capacity_margin				= 1280;
/*
 * group_smaller_max_cpu_capacity: Returns true if sched_group sg has smaller
 * per-CPU capacity_orig than sched_group ref.
 */
static inline bool
group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
{
	/* sg must trail ref by more than the ~20% capacity_margin. */
	return 1024 * ref->sgc->max_capacity >
				capacity_margin * sg->sgc->max_capacity;
}
 
/*
 * group_smaller_min_cpu_capacity: Returns true if sched_group sg has smaller
 * per-CPU capacity than sched_group ref.
 */
static inline bool
group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
{
	/* sg must trail ref by more than the ~20% capacity_margin. */
	return 1024 * ref->sgc->min_capacity >
				capacity_margin * sg->sgc->min_capacity;
}
/**
 * update_sd_pick_busiest - return 1 on busiest group
 * @env: The load balancing environment.
 * @sds: sched_domain statistics
 * @sg: sched_group candidate to be checked for being the busiest
 * @sgs: sched_group statistics
 *
 * Determine if @sg is a busier group than the previously selected
 * busiest group.
 *
 * Return: %true if @sg is a busier group than the previously selected
 * busiest group. %false otherwise.
 */
 /* Decides whether struct sched_group *sg should replace sds->busiest. */
static bool update_sd_pick_busiest(struct lb_env *env,
				   struct sd_lb_stats *sds,
				   struct sched_group *sg,
				   struct sg_lb_stats *sgs)
{
	/* Statistics of the busiest group */
	struct sg_lb_stats *busiest = &sds->busiest_stat;

	/*
	 * Don't try to pull misfit tasks we can't help.
	 * We can use max_capacity here as reduction in capacity on some
	 * cpus in the group should either be possible to resolve
	 * internally or be covered by avg_load imbalance (eventually).
	 */
	 /*
	 * If the candidate's misfit task wouldn't fit better locally —
	 * the local group isn't strictly bigger, or has no spare
	 * capacity — pulling it cannot help, so don't pick this group.
	 */
	if (sgs->group_type == group_misfit_task &&
	    (!group_smaller_max_cpu_capacity(sg, sds->local) ||
	     !group_has_capacity(env, &sds->local_stat)))
		return false;

	/* A worse group_type (overloaded > imbalanced > ...) always wins. */
	if (sgs->group_type > busiest->group_type)
		return true;

	if (sgs->group_type < busiest->group_type)
		return false;

	/* Same group_type: compare by average load. */
	if (sgs->avg_load <= busiest->avg_load)
		return false;

	if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
		goto asym_packing;

	/*
	 * Candidate sg has no more than one task per CPU and
	 * has higher per-CPU capacity. Migrating tasks to less
	 * capable CPUs may harm throughput. Maximize throughput,
	 * power/energy consequences are not considered.
	 */
	if (sgs->sum_nr_running <= sgs->group_weight &&
	    group_smaller_min_cpu_capacity(sds->local, sg))
		return false;

	/*
	 * Candidate sg doesn't face any severe imbalance issues so
	 * don't disturb unless the groups are of similar capacity
	 * where balancing is more harmless.
	 */
	if (sgs->group_type == group_other &&
		!group_similar_cpu_capacity(sds->local, sg))
		return false;

	/*
	 * If we have more than one misfit sg go with the biggest misfit.
	 */
	if (sgs->group_type == group_misfit_task &&
	    sgs->group_misfit_task_load < busiest->group_misfit_task_load)
		return false;

asym_packing:
	/* This is the busiest node in its class. */
	if (!(env->sd->flags & SD_ASYM_PACKING))
		return true;

	/* No ASYM_PACKING if target cpu is already busy */
	if (env->idle == CPU_NOT_IDLE)
		return true;
	/*
	 * ASYM_PACKING needs to move all the work to the highest
	 * prority CPUs in the group, therefore mark all groups
	 * of lower priority than ourself as busy.
	 */
	if (sgs->sum_nr_running &&
	    sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) {
		if (!sds->busiest)
			return true;

		/* Prefer to move from lowest priority cpu's work */
		if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
				      sg->asym_prefer_cpu))
			return true;
	}

	return false;
}

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值