757ffdd sched/fair: Set rq->rd->overload when misfit
e90c8fe sched/fair: Wrap rq->rd->overload accesses with READ/WRITE_ONCE()
575638d sched/core: Change root_domain->overload type to int
dbbad71 sched/fair: Change 'prefer_sibling' type to bool
5fbdfae sched/fair: Kick nohz balance if rq->misfit_task_load
cad68e5 sched/fair: Consider misfit tasks when load-balancing
e3d6d0c sched/fair: Add sched_group per-CPU max capacity
3b1baa6 sched/fair: Add 'group_misfit_task' load-balance type
1. 创建SCHED_SOFTIRQ
*start_kernel()
|--sched_init(void)
|--init_sched_fair_class() /*sched/fair.c*/
|--open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
|<--------|
|
| *scheduler_tick(void)
| |--trigger_load_balance(struct rq *rq) /*kernel/sched/fair.c */
| |--raise_softirq(SCHED_SOFTIRQ)
|<-------|
|
| *scheduler_ipi(void) /*kernel/sched/core.c*/
| |--raise_softirq_irqoff(SCHED_SOFTIRQ)
|------->|
|--run_rebalance_domains(struct softirq_action *h)
|--if (nohz_idle_balance(this_rq, idle)) return;
|<----------| |
| |-----------------|
| | *balance_fair
| | *pick_next_task_fair()
| | |--newidle_balance()
| |------|--nohz_newidle_balance()
| |-- _nohz_idle_balance()
|------------>|--rebalance_domains(this_rq, idle)
/*
 * rebalance_domains() - periodic load balancing for @rq, run from the
 * SCHED_SOFTIRQ handler (run_rebalance_domains, see diagram above).
 *
 * Walks every sched_domain that contains @rq's CPU, from the lowest level
 * upward, and calls load_balance() on each domain whose balance interval
 * has expired. Along the way it:
 *   - decays sd->max_newidle_lb_cost by ~1%/second and folds the sum into
 *     rq->max_idle_balance_cost (floored at sysctl_sched_migration_cost);
 *   - serializes balancing of SD_SERIALIZE domains through the global
 *     'balancing' spinlock (trylock: on contention the domain is skipped
 *     this round, not waited for);
 *   - computes the earliest next balance time across all domains and
 *     stores it in rq->next_balance (and, for an idle CPU doing the nohz
 *     balance, propagates it to nohz.next_balance).
 *
 * @rq:   runqueue of the CPU doing the balancing (this CPU)
 * @idle: idle state of this CPU (CPU_IDLE / CPU_NOT_IDLE); may be
 *        re-evaluated after a successful load_balance(), since
 *        LBF_DST_PINNED can change the destination CPU.
 */
9455 static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
9456 {
/* Set to 0 by load_balance() when another CPU in our group should balance
 * instead; after that we only keep iterating to finish the cost decay. */
9457 int continue_balancing = 1;
9458 int cpu = rq->cpu;
9459 unsigned long interval;
9460 struct sched_domain *sd;
9461 /* Earliest time when we have to do rebalance again */
9462 unsigned long next_balance = jiffies + 60*HZ;
9463 int update_next_balance = 0;
9464 int need_serialize, need_decay = 0;
9465 u64 max_cost = 0;
9466
/* Domain hierarchy is RCU-protected; walk from lowest level up. */
9467 rcu_read_lock();
9468 for_each_domain(cpu, sd) {
9469 /*
9470 * Decay the newidle max times here because this is a regular
9471 * visit to all the domains. Decay ~1% per second.
9472 */
9473 if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
9474 sd->max_newidle_lb_cost =
9475 (sd->max_newidle_lb_cost * 253) / 256;
9476 sd->next_decay_max_lb_cost = jiffies + HZ;
9477 need_decay = 1;
9478 }
9479 max_cost += sd->max_newidle_lb_cost;
9480
/* Load balancing disabled for this domain: skip it (but the decay
 * above has already been applied, which is why this check is late). */
9481 if (!(sd->flags & SD_LOAD_BALANCE))
9482 continue;
9483
9484 /*
9485 * Stop the load balance at this level. There is another
9486 * CPU in our sched group which is doing load balancing more
9487 * actively.
9488 */
9489 if (!continue_balancing) {
/* Keep walking only to finish decaying the remaining levels. */
9490 if (need_decay)
9491 continue;
9492 break;
9493 }
9494
9495 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
9496
/* SD_SERIALIZE domains allow only one balancer at a time, via the
 * global 'balancing' spinlock; if it's contended, skip this round. */
9497 need_serialize = sd->flags & SD_SERIALIZE;
9498 if (need_serialize) {
9499 if (!spin_trylock(&balancing))
9500 goto out;
9501 }
9502
/* This domain's balance interval has expired: try to pull load. */
9503 if (time_after_eq(jiffies, sd->last_balance + interval)) {
9504 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
9505 /*
9506 * The LBF_DST_PINNED logic could have changed
9507 * env->dst_cpu, so we can't know our idle
9508 * state even if we migrated tasks. Update it.
9509 */
9510 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
9511 }
9512 sd->last_balance = jiffies;
/* Interval may change with the (possibly updated) idle state. */
9513 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
9514 }
9515 if (need_serialize)
9516 spin_unlock(&balancing);
9517 out:
/* Track the earliest deadline over all domains for rq->next_balance. */
9518 if (time_after(next_balance, sd->last_balance + interval)) {
9519 next_balance = sd->last_balance + interval;
9520 update_next_balance = 1;
9521 }
9522 }
9523 if (need_decay) {
9524 /*
9525 * Ensure the rq-wide value also decays but keep it at a
9526 * reasonable floor to avoid funnies with rq->avg_idle.
9527 */
9528 rq->max_idle_balance_cost =
9529 max((u64)sysctl_sched_migration_cost, max_cost);
9530 }
9531 rcu_read_unlock();
9532
9533 /*
9534 * next_balance will be updated only when there is a need.
9535 * When the cpu is attached to null domain for ex, it will not be
9536 * updated.
9537 */
9538 if (likely(update_next_balance)) {
9539 rq->next_balance = next_balance;
9540
9541 #ifdef CONFIG_NO_HZ_COMMON
9542 /*
9543 * If this CPU has been elected to perform the nohz idle
9544 * balance. Other idle CPUs have already rebalanced with
9545 * nohz_idle_balance() and nohz.next_balance has been
9546 * updated accordingly. This CPU is now running the idle load
9547 * balance for itself and we need to update the
9548 * nohz.next_balance accordingly.
9549 */
9550 if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
9551 nohz.next_balance = rq->next_balance;
9552 #endif
9553 }
9554 }