When a new task is created, or a blocked task is woken up, load balancing is also performed; the function called is select_task_rq_fair.
It is similar in spirit to the kernel's periodic load balancing (which looks for tasks on the busiest cpu and pulls them over to run, or stops the task currently running on the busiest cpu and moves it to the local cpu), except that here the goal is to find the idlest cpu on which to run the task.
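As a rough sketch of where this path is entered (details vary by kernel version; the Android kernel quoted below also passes an extra sibling_count_hint argument):
/*
 * Simplified sketch of the entry points, not the exact kernel code:
 *   try_to_wake_up()   - task wakeup - passes SD_BALANCE_WAKE
 *   wake_up_new_task() - fork        - passes SD_BALANCE_FORK
 *   sched_exec()       - exec        - passes SD_BALANCE_EXEC
 * Each of them ends up calling p->sched_class->select_task_rq(), which for
 * CFS tasks is select_task_rq_fair() below.
 */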
select_task_rq_fair
/*
* select_task_rq_fair: Select target runqueue for the waking task in domains
* that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
* SD_BALANCE_FORK, or SD_BALANCE_EXEC.
*
* Balances load by selecting the idlest cpu in the idlest group, or under
* certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
*
* Returns the target cpu number.
*
* preempt must be disabled.
*/
static int
select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags,
int sibling_count_hint)
{
struct sched_domain *tmp, *affine_sd = NULL;
struct sched_domain *sd = NULL, *energy_sd = NULL;
int cpu = smp_processor_id();
int new_cpu = prev_cpu; /* new_cpu defaults to prev_cpu */
int want_affine = 0;
int want_energy = 0;
int sync = wake_flags & WF_SYNC;
if (energy_aware()) { /* use EAS for the placement decision */
rcu_read_lock();
new_cpu = find_energy_efficient_cpu(energy_sd, p,
cpu, prev_cpu, sync);
if (new_cpu == -1)
new_cpu = prev_cpu;
rcu_read_unlock();
return new_cpu;
}
rcu_read_lock();
if (sd_flag & SD_BALANCE_WAKE) {
record_wakee(p);
want_energy = wake_energy(p, prev_cpu, sd_flag, wake_flags);
want_affine = !want_energy &&
/* !wake_wide(p): the current cpu's wakeup count has not gone over the threshold */
!wake_wide(p, sibling_count_hint) &&
!wake_cap(p, cpu, prev_cpu) &&
/* cpumask_test_cpu(cpu, &p->cpus_allowed): the current cpu is in p's affinity mask */
cpumask_test_cpu(cpu, &p->cpus_allowed);
}
/* (4) Walk the current cpu's sched domains from bottom to top to decide at which level load balancing should be done */
for_each_domain(cpu, tmp) {
if (!(tmp->flags & SD_LOAD_BALANCE))
break;
/*
* If both cpu and prev_cpu are part of this domain,
* cpu is a valid SD_WAKE_AFFINE target.
*/
if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
affine_sd = tmp;
break;
}
/*
* If we are able to try an energy-aware wakeup,
* select the highest non-overutilized sched domain
* which includes this cpu and prev_cpu
*
* maybe want to not test prev_cpu and only consider
* the current one?
*/
if (want_energy &&
!sd_overutilized(tmp) &&
cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
energy_sd = tmp;
if (tmp->flags & sd_flag)
sd = tmp;
else if (!(want_affine || want_energy))
break;
}
if (affine_sd) {
sd = NULL; /* Prefer wake_affine over balance flags */
if (cpu == prev_cpu)
goto pick_cpu;
if (wake_affine(affine_sd, p, prev_cpu, sync))
new_cpu = cpu;
}
if (sd && !(sd_flag & SD_BALANCE_FORK)) {
/*
* We're going to need the task's util for capacity_spare_without
* in find_idlest_group. Sync it up to prev_cpu's
* last_update_time.
*/
sync_entity_load_avg(&p->se);
}
/* (6) No sched domain matching sd_flag was found */
if (!sd) {
pick_cpu:
if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
} else {
if (energy_sd)
new_cpu = find_energy_efficient_cpu(energy_sd, p, cpu, prev_cpu, sync);
/* if we did an energy-aware placement and had no choices available
* then fall back to the default find_idlest_cpu choice
*/
if (!energy_sd || (energy_sd && new_cpu == -1))
new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
}
rcu_read_unlock();
#ifdef CONFIG_NO_HZ_COMMON
if (nohz_kick_needed(cpu_rq(new_cpu), true))
nohz_balancer_kick(true);
#endif
return new_cpu;
}
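select_idle_sibling() (its body is not quoted in this section) is what the affine/wake path above ends up in: it tries to place the waking task on an idle cpu that shares the last-level cache with the chosen target. A simplified sketch of the mainline 4.14-era logic follows; the EAS-enabled kernel quoted above may layer capacity checks on top of this, so treat it as an illustration rather than the exact code.
static int select_idle_sibling(struct task_struct *p, int prev, int target)
{
	struct sched_domain *sd;
	int i;

	/* The target itself is idle: nothing better to look for. */
	if (idle_cpu(target))
		return target;

	/* prev_cpu shares the LLC with target and is idle: cache-warm choice. */
	if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
		return prev;

	sd = rcu_dereference(per_cpu(sd_llc, target));
	if (!sd)
		return target;

	/*
	 * Scan the LLC domain: a fully idle core first, then any idle cpu,
	 * then an idle SMT sibling of the target.
	 */
	i = select_idle_core(p, sd, target);
	if ((unsigned)i < nr_cpumask_bits)
		return i;

	i = select_idle_cpu(p, sd, target);
	if ((unsigned)i < nr_cpumask_bits)
		return i;

	i = select_idle_smt(p, sd, target);
	if ((unsigned)i < nr_cpumask_bits)
		return i;

	return target;
}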
find_energy_efficient_cpu
Its body is not quoted in this section. Broadly speaking, it evaluates the cpus p is allowed to run on, uses the energy model to estimate the system energy cost of placing p on each candidate, and returns the most energy-efficient choice; it returns -1 when it has no suitable candidate, which is why the callers above fall back to prev_cpu or to find_idlest_cpu().
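A purely conceptual sketch of that idea follows; the real implementation evaluates energy per frequency domain through the energy model, and the two helpers used below (task_fits_capacity_sketch, estimate_energy_if_placed) are hypothetical stand-ins, not kernel functions.
static int find_energy_efficient_cpu_sketch(struct sched_domain *sd,
					    struct task_struct *p,
					    int cpu, int prev_cpu, int sync)
{
	unsigned long best_energy = ULONG_MAX;
	int best_cpu = -1;
	int i;

	for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
		unsigned long energy;

		/* Skip candidates whose capacity cannot hold the task's utilization. */
		if (!task_fits_capacity_sketch(p, i))
			continue;

		/* Ask the energy model what placing p on cpu i would cost. */
		energy = estimate_energy_if_placed(p, i);
		if (energy < best_energy) {
			best_energy = energy;
			best_cpu = i;
		}
	}

	/*
	 * -1 means "no suitable candidate"; callers fall back to prev_cpu or
	 * to find_idlest_cpu(), as seen in select_task_rq_fair() above.
	 */
	return best_cpu;
}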
find_idlest_cpu
Finds the idlest cpu; in general this means first finding the idlest group and then the idlest cpu within it.
static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
int cpu, int prev_cpu, int sd_flag)
{
int new_cpu = cpu;
if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
return prev_cpu;
while (sd) {
struct sched_group *group;
struct sched_domain *tmp;
int weight;
if (!(sd->flags & sd_flag)) {
sd = sd->child;
continue;
}
group = find_idlest_group(sd, p, cpu, sd_flag);
if (!group) {
sd = sd->child;
continue;
}
new_cpu = find_idlest_group_cpu(group, p, cpu);
if (new_cpu == cpu) {
/* Now try balancing at a lower domain level of cpu */
sd = sd->child;
continue;
}
/* Now try balancing at a lower domain level of new_cpu */
cpu = new_cpu;
weight = sd->span_weight;
sd = NULL;
for_each_domain(cpu, tmp) {
if (weight <= tmp->span_weight)
break;
if (tmp->flags & sd_flag)
sd = tmp;
}
/* while loop will break here if sd == NULL */
}
return new_cpu;
}
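For a concrete picture of the loop above, take (as an illustration) a two-cluster big.LITTLE system in the fork/exec case, where sd starts at the DIE-level domain spanning both clusters: the first pass uses find_idlest_group() to pick the less busy cluster and find_idlest_group_cpu() to pick the idlest cpu inside it; cpu is then set to that cpu, and the for_each_domain() step drops sd down to that cpu's MC-level domain, where the second pass repeats the search among the cpus of that cluster. The loop ends once no lower domain carrying sd_flag is left.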
find_idlest_group
/*
* find_idlest_group finds and returns the least busy CPU group within the
* domain.
*
* Assumes p is allowed on at least one CPU in sd.
*/
static struct sched_group *
find_idlest_group(struct sched_domain *sd, struct task_struct *p,
int this_cpu, int sd_flag)
{
struct sched_group *idlest = NULL, *group = sd->groups;
struct sched_group *most_spare_sg = NULL;
unsigned long min_runnable_load = ULONG_MAX;
unsigned long this_runnable_load = ULONG_MAX;
unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
unsigned long most_spare = 0, this_spare = 0;
int load_idx = sd->forkexec_idx;
int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
(sd->imbalance_pct-100) / 100;
if (sd_flag & SD_BALANCE_WAKE)
load_idx = sd->wake_idx;
do {
unsigned long load, avg_load, runnable_load;
unsigned long spare_cap, max_spare_cap;
int local_group;
int i;
/* Skip over this group if it has no CPUs allowed */
if (!cpumask_intersects(sched_group_span(group),
&p->cpus_allowed))
continue;
local_group = cpumask_test_cpu(this_cpu,
sched_group_span(group));
/*
* Tally up the load of all CPUs in the group and find
* the group containing the CPU with most spare capacity.
*/
avg_load = 0;
runnable_load = 0;
max_spare_cap = 0;
for_each_cpu(i, sched_group_span(group)) {
/* Bias balancing toward cpus of our domain */
if (local_group)
load = source_load(i, load_idx);
else
load = target_load(i, load_idx);
runnable_load += load;
avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
spare_cap = capacity_spare_without(i, p);
if (spare_cap > max_spare_cap)
max_spare_cap = spare_cap;
}
/* Adjust by relative CPU capacity of the group */
avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
group->sgc->capacity;
runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
group->sgc->capacity;
if (local_group) {
this_runnable_load = runnable_load;
this_avg_load = avg_load;
this_spare = max_spare_cap;
} else {
if (min_runnable_load > (runnable_load + imbalance)) {
/*
* The runnable load is significantly smaller
* so we can pick this new cpu
*/
min_runnable_load = runnable_load;
min_avg_load = avg_load;
idlest = group;
} else if ((runnable_load < (min_runnable_load + imbalance)) &&
(100*min_avg_load > imbalance_scale*avg_load)) {
/*
* The runnable loads are close so take the
* blocked load into account through avg_load.
*/
min_avg_load = avg_load;
idlest = group;
}
if (most_spare < max_spare_cap) {
most_spare = max_spare_cap;
most_spare_sg = group;
}
}
} while (group = group->next, group != sd->groups);
/*
* The cross-over point between using spare capacity or least load
* is too conservative for high utilization tasks on partially
* utilized systems if we require spare_capacity > task_util(p),
* so we allow for some task stuffing by using
* spare_capacity > task_util(p)/2.
*
* Spare capacity can't be used for fork because the utilization has
* not been set yet, we must first select a rq to compute the initial
* utilization.
*/
if (sd_flag & SD_BALANCE_FORK)
goto skip_spare;
if (this_spare > task_util(p) / 2 &&
imbalance_scale*this_spare > 100*most_spare)
return NULL;
if (most_spare > task_util(p) / 2)
return most_spare_sg;
skip_spare:
if (!idlest)
return NULL;
if (min_runnable_load > (this_runnable_load + imbalance))
return NULL;
if ((this_runnable_load < (min_runnable_load + imbalance)) &&
(100*this_avg_load < imbalance_scale*min_avg_load))
return NULL;
return idlest;
}
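To put numbers on the two thresholds used above (assuming the common default sd->imbalance_pct of 125): scale_load_down(NICE_0_LOAD) is 1024, so imbalance = 1024 * (125 - 100) / 100 = 256 and imbalance_scale = 100 + (125 - 100) / 2 = 112. A remote group then becomes the new idlest candidate only if its scaled runnable_load is more than 256 below the current minimum, or if it is within that margin while its avg_load beats the minimum by more than the 12% imbalance_scale factor; and at the end the local group is kept (NULL is returned) when the best remote group is more than 256 worse than it, or when the two are within 256 of each other but the local avg_load still wins under the same 112% scale.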
find_idlest_group_cpu
Searches within the idlest group for the idlest cpu.
/*
* find_idlest_group_cpu - find the idlest cpu among the cpus in group.
*/
static int
find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
{
unsigned long load, min_load = ULONG_MAX;
unsigned int min_exit_latency = UINT_MAX;
u64 latest_idle_timestamp = 0;
int least_loaded_cpu = this_cpu;
int shallowest_idle_cpu = -1;
int i;
/* Check if we have any choice: */
if (group->group_weight == 1)
return cpumask_first(sched_group_span(group));
/* Traverse only the allowed CPUs */
for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
if (idle_cpu(i)) {
struct rq *rq = cpu_rq(i);
struct cpuidle_state *idle = idle_get_state(rq);
if (idle && idle->exit_latency < min_exit_latency) {
/*
* We give priority to a CPU whose idle state
* has the smallest exit latency irrespective
* of any idle timestamp.
*/
min_exit_latency = idle->exit_latency;
latest_idle_timestamp = rq->idle_stamp;
shallowest_idle_cpu = i;
} else if ((!idle || idle->exit_latency == min_exit_latency) &&
rq->idle_stamp > latest_idle_timestamp) {
/*
* If equal or no active idle state, then
* the most recently idled CPU might have
* a warmer cache.
*/
latest_idle_timestamp = rq->idle_stamp;
shallowest_idle_cpu = i;
}
} else if (shallowest_idle_cpu == -1) {
load = weighted_cpuload(cpu_rq(i));
if (load < min_load || (load == min_load && i == this_cpu)) {
min_load = load;
least_loaded_cpu = i;
}
}
}
return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
}