【Kernel Scheduling / Load Balancing】【select_task_rq_fair】

Load balancing is also performed when a newly created task starts or when a blocked task is woken up; the function invoked in these cases is select_task_rq_fair.

This is similar in spirit to the kernel's periodic load balancing (which finds tasks on the busiest CPU and pulls them over to run locally, or stops the task currently running on the busiest CPU and migrates it to the local CPU). The difference is that here the search goes the other way: we look for the idlest CPU on which to run the task.
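For context, the three balance flags mentioned in the comment below come from three different call paths (mainline naming with arguments elided; exact signatures vary between kernel trees):

try_to_wake_up()   -> select_task_rq(..., SD_BALANCE_WAKE, wake_flags)
wake_up_new_task() -> select_task_rq(..., SD_BALANCE_FORK, 0)
sched_exec()       -> select_task_rq(..., SD_BALANCE_EXEC, 0)

select_task_rq() then dispatches through p->sched_class->select_task_rq(), which for CFS tasks is select_task_rq_fair().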

select_task_rq_fair

/*
 * select_task_rq_fair: Select target runqueue for the waking task in domains
 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
 * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
 *
 * Balances load by selecting the idlest cpu in the idlest group, or under
 * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
 *
 * Returns the target cpu number.
 *
 * preempt must be disabled.
 */
static int
select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags,
		    int sibling_count_hint)
{
	struct sched_domain *tmp, *affine_sd = NULL;
	struct sched_domain *sd = NULL, *energy_sd = NULL;
	int cpu = smp_processor_id();
	int new_cpu = prev_cpu;	/* default new_cpu to prev_cpu */
	int want_affine = 0;
	int want_energy = 0;
	int sync = wake_flags & WF_SYNC;

	if (energy_aware()) { /* use EAS for load balancing */
		rcu_read_lock();
		new_cpu = find_energy_efficient_cpu(energy_sd, p,
				cpu, prev_cpu, sync);

		if (new_cpu == -1)
			new_cpu = prev_cpu;

		rcu_read_unlock();
		return new_cpu;
	}

	rcu_read_lock();

	if (sd_flag & SD_BALANCE_WAKE) {
		record_wakee(p);
		want_energy = wake_energy(p, prev_cpu, sd_flag, wake_flags);
		want_affine = !want_energy &&
			      /* !wake_wide(): the wakeup fan-out has not exceeded the limit */
			      !wake_wide(p, sibling_count_hint) &&
			      !wake_cap(p, cpu, prev_cpu) &&
			      /* cpumask_test_cpu(): the current cpu is in p's affinity mask */
			      cpumask_test_cpu(cpu, &p->cpus_allowed);
	}
	/* (4) Walk the current cpu's sched domains from the bottom up to find at which level to balance */
	for_each_domain(cpu, tmp) {
		if (!(tmp->flags & SD_LOAD_BALANCE))
			break;

		/*
		 * If both cpu and prev_cpu are part of this domain,
		 * cpu is a valid SD_WAKE_AFFINE target.
		 */
		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
			affine_sd = tmp;
			break;
		}

		/*
		 * If we are able to try an energy-aware wakeup,
		 * select the highest non-overutilized sched domain
		 * which includes this cpu and prev_cpu
		 *
		 * maybe want to not test prev_cpu and only consider
		 * the current one?
		 */
		if (want_energy &&
		    !sd_overutilized(tmp) &&
		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
			energy_sd = tmp;

		if (tmp->flags & sd_flag)
			sd = tmp;
		else if (!(want_affine || want_energy))
			break;
	}

	if (affine_sd) {
		sd = NULL; /* Prefer wake_affine over balance flags */
		if (cpu == prev_cpu)
			goto pick_cpu;

		if (wake_affine(affine_sd, p, prev_cpu, sync))
			new_cpu = cpu;
	}

	if (sd && !(sd_flag & SD_BALANCE_FORK)) {
		/*
		 * We're going to need the task's util for capacity_spare_without
		 * in find_idlest_group. Sync it up to prev_cpu's
		 * last_update_time.
		 */
		sync_entity_load_avg(&p->se);
	}
	/* (6) No sched domain matching sd_flag was found */
	if (!sd) {
pick_cpu:
		if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
			new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);

	} else {
		if (energy_sd)
			new_cpu = find_energy_efficient_cpu(energy_sd, p, cpu, prev_cpu, sync);

		/* if we did an energy-aware placement and had no choices available
		 * then fall back to the default find_idlest_cpu choice
		 */
		if (!energy_sd || (energy_sd && new_cpu == -1))
			new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
	}

	rcu_read_unlock();

#ifdef CONFIG_NO_HZ_COMMON
	if (nohz_kick_needed(cpu_rq(new_cpu), true))
		nohz_balancer_kick(true);
#endif

	return new_cpu;
}
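
Stripped of the details, the function makes three decisions in order: try the EAS path, then the wake-affine fast path, then the generic slow path. The following stand-alone sketch only models that control flow; every model_*() helper is a hypothetical placeholder, not a kernel API.

#include <stdbool.h>

struct task;	/* stand-in for struct task_struct */

/* hypothetical placeholders for the kernel's decisions */
static bool model_energy_aware(void) { return false; }
static int  model_find_energy_efficient_cpu(struct task *p) { (void)p; return -1; }
static bool model_want_affine(struct task *p) { (void)p; return true; }
static bool model_wake_affine(struct task *p) { (void)p; return true; }
static int  model_select_idle_sibling(struct task *p, int target) { (void)p; return target; }
static int  model_find_idlest_cpu(struct task *p, int cpu) { (void)p; return cpu; }

static int model_select_task_rq_fair(struct task *p, int prev_cpu, int this_cpu, bool is_wake)
{
	int new_cpu = prev_cpu;

	/* 1) EAS enabled: energy-aware placement decides, prev_cpu is the fallback */
	if (model_energy_aware()) {
		new_cpu = model_find_energy_efficient_cpu(p);
		return new_cpu >= 0 ? new_cpu : prev_cpu;
	}

	/* 2) Wakeup with wake-affine allowed: possibly pull the task towards the
	 * waking cpu, then look for an idle sibling around that target */
	if (is_wake && model_want_affine(p)) {
		if (this_cpu != prev_cpu && model_wake_affine(p))
			new_cpu = this_cpu;
		return model_select_idle_sibling(p, new_cpu);
	}

	/* 3) Slow path (fork/exec, or no affine domain matched): pick the idlest
	 * cpu in the idlest group of the matching sched domain */
	return model_find_idlest_cpu(p, this_cpu);
}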

find_energy_efficient_cpu
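
find_energy_efficient_cpu() implements the EAS placement path: it looks for a CPU that can fit the task at the lowest estimated energy cost, and returns -1 when it finds no suitable candidate, in which case the callers above fall back to prev_cpu or to find_idlest_cpu(). As a rough conceptual model only (the structure and the energy estimate below are made up for illustration, not the kernel implementation):

struct cpu_model {
	unsigned long util;      /* current utilization of the cpu */
	unsigned long capacity;  /* maximum capacity of the cpu */
	unsigned long cost;      /* modelled energy cost per unit of utilization */
};

/* pick the fitting cpu with the lowest estimated energy, or -1 if none fits */
static int model_energy_efficient_cpu(const struct cpu_model *cpus, int nr_cpus,
				       unsigned long task_util)
{
	unsigned long best_energy = ~0UL;
	int best_cpu = -1;

	for (int i = 0; i < nr_cpus; i++) {
		unsigned long new_util = cpus[i].util + task_util;

		/* skip cpus the task would push over their capacity */
		if (new_util > cpus[i].capacity)
			continue;

		/* toy estimate: energy grows with utilization times a per-cpu cost */
		unsigned long energy = new_util * cpus[i].cost;
		if (energy < best_energy) {
			best_energy = energy;
			best_cpu = i;
		}
	}
	return best_cpu;
}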

 

find_idlest_cpu

Find the idlest CPU; in general this means finding the idlest CPU inside the idlest group.

static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
				  int cpu, int prev_cpu, int sd_flag)
{
	int new_cpu = cpu;

	if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
		return prev_cpu;

	while (sd) {
		struct sched_group *group;
		struct sched_domain *tmp;
		int weight;

		if (!(sd->flags & sd_flag)) {
			sd = sd->child;
			continue;
		}

		group = find_idlest_group(sd, p, cpu, sd_flag);
		if (!group) {
			sd = sd->child;
			continue;
		}

		new_cpu = find_idlest_group_cpu(group, p, cpu);
		if (new_cpu == cpu) {
			/* Now try balancing at a lower domain level of cpu */
			sd = sd->child;
			continue;
		}

		/* Now try balancing at a lower domain level of new_cpu */
		cpu = new_cpu;
		weight = sd->span_weight;
		sd = NULL;
		for_each_domain(cpu, tmp) {
			if (weight <= tmp->span_weight)
				break;
			if (tmp->flags & sd_flag)
				sd = tmp;
		}
		/* while loop will break here if sd == NULL */
	}

	return new_cpu;
}
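
For example, on a typical two-cluster big.LITTLE system the walk starts at the DIE-level domain spanning both clusters: find_idlest_group() picks the less loaded cluster, find_idlest_group_cpu() picks a CPU inside it, and the inner for_each_domain() loop then restarts the search from that CPU's lower (MC-level) domain, narrowing the choice within the chosen cluster.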

find_idlest_group

/*
 * find_idlest_group finds and returns the least busy CPU group within the
 * domain.
 *
 * Assumes p is allowed on at least one CPU in sd.
 */
static struct sched_group *
find_idlest_group(struct sched_domain *sd, struct task_struct *p,
		  int this_cpu, int sd_flag)
{
	struct sched_group *idlest = NULL, *group = sd->groups;
	struct sched_group *most_spare_sg = NULL;
	unsigned long min_runnable_load = ULONG_MAX;
	unsigned long this_runnable_load = ULONG_MAX;
	unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
	unsigned long most_spare = 0, this_spare = 0;
	int load_idx = sd->forkexec_idx;
	int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
	unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
				(sd->imbalance_pct-100) / 100;

	if (sd_flag & SD_BALANCE_WAKE)
		load_idx = sd->wake_idx;

	do {
		unsigned long load, avg_load, runnable_load;
		unsigned long spare_cap, max_spare_cap;
		int local_group;
		int i;

		/* Skip over this group if it has no CPUs allowed */
		if (!cpumask_intersects(sched_group_span(group),
					&p->cpus_allowed))
			continue;

		local_group = cpumask_test_cpu(this_cpu,
					       sched_group_span(group));

		/*
		 * Tally up the load of all CPUs in the group and find
		 * the group containing the CPU with most spare capacity.
		 */
		avg_load = 0;
		runnable_load = 0;
		max_spare_cap = 0;

		for_each_cpu(i, sched_group_span(group)) {
			/* Bias balancing toward cpus of our domain */
			if (local_group)
				load = source_load(i, load_idx);
			else
				load = target_load(i, load_idx);

			runnable_load += load;

			avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);

			spare_cap = capacity_spare_without(i, p);

			if (spare_cap > max_spare_cap)
				max_spare_cap = spare_cap;
		}

		/* Adjust by relative CPU capacity of the group */
		avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
					group->sgc->capacity;
		runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
					group->sgc->capacity;

		if (local_group) {
			this_runnable_load = runnable_load;
			this_avg_load = avg_load;
			this_spare = max_spare_cap;
		} else {
			if (min_runnable_load > (runnable_load + imbalance)) {
				/*
				 * The runnable load is significantly smaller
				 * so we can pick this new cpu
				 */
				min_runnable_load = runnable_load;
				min_avg_load = avg_load;
				idlest = group;
			} else if ((runnable_load < (min_runnable_load + imbalance)) &&
				   (100*min_avg_load > imbalance_scale*avg_load)) {
				/*
				 * The runnable loads are close so take the
				 * blocked load into account through avg_load.
				 */
				min_avg_load = avg_load;
				idlest = group;
			}

			if (most_spare < max_spare_cap) {
				most_spare = max_spare_cap;
				most_spare_sg = group;
			}
		}
	} while (group = group->next, group != sd->groups);

	/*
	 * The cross-over point between using spare capacity or least load
	 * is too conservative for high utilization tasks on partially
	 * utilized systems if we require spare_capacity > task_util(p),
	 * so we allow for some task stuffing by using
	 * spare_capacity > task_util(p)/2.
	 *
	 * Spare capacity can't be used for fork because the utilization has
	 * not been set yet, we must first select a rq to compute the initial
	 * utilization.
	 */
	if (sd_flag & SD_BALANCE_FORK)
		goto skip_spare;

	if (this_spare > task_util(p) / 2 &&
	    imbalance_scale*this_spare > 100*most_spare)
		return NULL;

	if (most_spare > task_util(p) / 2)
		return most_spare_sg;

skip_spare:
	if (!idlest)
		return NULL;

	if (min_runnable_load > (this_runnable_load + imbalance))
		return NULL;

	if ((this_runnable_load < (min_runnable_load + imbalance)) &&
	     (100*this_avg_load < imbalance_scale*min_avg_load))
		return NULL;

	return idlest;
}
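
To make the two margins computed at the top of find_idlest_group() concrete, here is the arithmetic for an assumed sd->imbalance_pct of 125 (a common default for higher-level domains; SMT and cache-sharing levels typically use smaller values), with scale_load_down(NICE_0_LOAD) equal to 1024:

#include <stdio.h>

int main(void)
{
	unsigned int imbalance_pct = 125;           /* assumed sd->imbalance_pct */
	unsigned long nice_0_load  = 1024;          /* scale_load_down(NICE_0_LOAD) */

	int imbalance_scale = 100 + (imbalance_pct - 100) / 2;                /* = 112 */
	unsigned long imbalance = nice_0_load * (imbalance_pct - 100) / 100;  /* = 256 */

	/*
	 * With these values, a remote group becomes the new idlest only if its
	 * runnable_load is more than 256 below the current minimum, or if it is
	 * within 256 of it and 112% of its avg_load is still below min_avg_load.
	 * The same margins later decide whether the local group is kept instead
	 * of the remote idlest group.
	 */
	printf("imbalance_scale=%d imbalance=%lu\n", imbalance_scale, imbalance);
	return 0;
}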

find_idlest_group_cpu

Within the idlest group, find the idlest CPU.

/*
 * find_idlest_group_cpu - find the idlest cpu among the cpus in group.
 */
static int
find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
{
	unsigned long load, min_load = ULONG_MAX;
	unsigned int min_exit_latency = UINT_MAX;
	u64 latest_idle_timestamp = 0;
	int least_loaded_cpu = this_cpu;
	int shallowest_idle_cpu = -1;
	int i;

	/* Check if we have any choice: */
	if (group->group_weight == 1)
		return cpumask_first(sched_group_span(group));

	/* Traverse only the allowed CPUs */
	for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
		if (idle_cpu(i)) {
			struct rq *rq = cpu_rq(i);
			struct cpuidle_state *idle = idle_get_state(rq);
			if (idle && idle->exit_latency < min_exit_latency) {
				/*
				 * We give priority to a CPU whose idle state
				 * has the smallest exit latency irrespective
				 * of any idle timestamp.
				 */
				min_exit_latency = idle->exit_latency;
				latest_idle_timestamp = rq->idle_stamp;
				shallowest_idle_cpu = i;
			} else if ((!idle || idle->exit_latency == min_exit_latency) &&
				   rq->idle_stamp > latest_idle_timestamp) {
				/*
				 * If equal or no active idle state, then
				 * the most recently idled CPU might have
				 * a warmer cache.
				 */
				latest_idle_timestamp = rq->idle_stamp;
				shallowest_idle_cpu = i;
			}
		} else if (shallowest_idle_cpu == -1) {
			load = weighted_cpuload(cpu_rq(i));
			if (load < min_load || (load == min_load && i == this_cpu)) {
				min_load = load;
				least_loaded_cpu = i;
			}
		}
	}

	return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
}
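
For example, if CPU2 sits in a shallow idle state with a 1 µs exit latency while CPU3 is in a deep state with a 500 µs exit latency, CPU2 is chosen even if CPU3 went idle more recently; the idle timestamp only breaks ties between equally shallow states (or when no cpuidle state information is available). Only when no CPU in the group is idle does weighted_cpuload() decide.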
