should_we_balance
Decide whether the current CPU should run the balance at all.
should_we_balance() checks whether env->dst_cpu is entitled to balance: dst_cpu must be either the first idle CPU in this sched_group or, failing that, the group's first CPU. Restricting the balance to a single CPU per group keeps several CPUs from pulling for the same imbalance at once.
The conditions for balancing on the current CPU are:
(1) In the NEWLY_IDLE case, load balance always proceeds.
(2) Otherwise, the balance runs on the first idle CPU of the group.
(3) If the group has no idle CPU, it runs on the group's first CPU.
static int should_we_balance(struct lb_env *env)
{
struct sched_group *sg = env->sd->groups;
int cpu, balance_cpu = -1;
/*
* Ensure the balancing environment is consistent; can happen
* when the softirq triggers 'during' hotplug.
*/
if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
return 0;
/*
* In the newly idle case, we will allow all the cpu's
* to do the newly idle load balance.
*/
/* (7.2.1) If this CPU is CPU_NEWLY_IDLE, it always qualifies to balance */
if (env->idle == CPU_NEWLY_IDLE)
return 1;
/* Try to find first idle cpu */
/* (7.2.2) The first idle CPU of this sched_group is the preferred balancer */
// If that first idle CPU is dst_cpu, the load balance proceeds
for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
if (!idle_cpu(cpu))
continue;
balance_cpu = cpu;
break;
}
/* (7.2.3) No idle CPU: fall back to the first CPU of this sched_group */
// If the group's first CPU is dst_cpu, the load balance proceeds
if (balance_cpu == -1)
balance_cpu = group_balance_cpu(sg);//the first cpu(busiest) in this sched group
/*
* First idle cpu or the first cpu(busiest) in this sched group
* is eligible for doing load balancing at this and above domains.
*/
return balance_cpu == env->dst_cpu;
}
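For reference, the two helpers used above are tiny. In kernels of roughly this vintage they live in kernel/sched/topology.c and look like the sketch below (the exact form varies by version):

/* Sketch of the group-balance helpers (kernel/sched/topology.c, ~v4.14). */
struct cpumask *group_balance_mask(struct sched_group *sg)
{
	/* CPUs allowed to act as the balancing CPU for this group */
	return to_cpumask(sg->sgc->cpumask);
}

int group_balance_cpu(struct sched_group *sg)
{
	/* the "first cpu in this sched group" used as the fallback balancer */
	return cpumask_first(group_balance_mask(sg));
}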
load balance
In a pull, the current CPU pulls tasks from the busiest CPU over to itself to run.
/*
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
*/
// Find the most loaded CPU and migrate some of the busiest CPU's tasks to the local CPU.
// Returns the number of tasks moved.
static int load_balance(int this_cpu, struct rq *this_rq,
struct sched_domain *sd, enum cpu_idle_type idle,
int *continue_balancing)
{
int ld_moved, cur_ld_moved, active_balance = 0;
struct sched_domain *sd_parent = lb_sd_parent(sd) ? sd->parent : NULL;
struct sched_group *group;
struct rq *busiest;
struct rq_flags rf;
struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
/* (7.1) Build the lb_env data structure that load balancing needs: */
struct lb_env env = {
.sd = sd, // this CPU's sched_domain at this topology level
.dst_cpu = this_cpu, // the destination CPU is this CPU
.dst_rq = this_rq, // the destination rq is this CPU's rq
/* Some tasks cannot be migrated to dst_cpu because of their cpus_allowed
settings; when that happens, another CPU is picked from dst_cpu's group.
*/
.dst_grpmask = sched_group_span(sd->groups),
.idle = idle,
// the break threshold advances in units of sched_nr_migrate_break
.loop_break = sched_nr_migrate_break,
.cpus = cpus,
.fbq_type = all,
// init the list; tasks to be migrated are parked here temporarily
.tasks = LIST_HEAD_INIT(env.tasks),
};
cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
schedstat_inc(sd->lb_count[idle]);
redo:
if (!should_we_balance(&env)) {
*continue_balancing = 0;
goto out_balanced;
}
/* (7.3) Find the busiest sched_group on this level's list of sched_groups */
group = find_busiest_group(&env);
if (!group) {
schedstat_inc(sd->lb_nobusyg[idle]); // no busiest group found: bump the stat
goto out_balanced;
}
/* (7.4) Find the busiest rq inside the busiest sched_group, i.e. the rq of its most loaded CPU */
busiest = find_busiest_queue(&env, group);
if (!busiest) {
schedstat_inc(sd->lb_nobusyq[idle]);
goto out_balanced;
}
BUG_ON(busiest == env.dst_rq);
// env.imbalance is the amount of load that needs to move
schedstat_add(sd->lb_imbalance[idle], env.imbalance);
env.src_cpu = busiest->cpu;
env.src_rq = busiest;
ld_moved = 0;
/* (7.5) Is busiest's runnable count > 1? There must be at least one task that can be pulled away */
if (busiest->nr_running > 1) {
/*
* Attempt to move tasks. If find_busiest_group has found
* an imbalance but busiest->nr_running <= 1, the group is
* still unbalanced. ld_moved simply stays zero, so it is
* correctly treated as an imbalance.
*/
env.flags |= LBF_ALL_PINNED;
// upper bound on the number of tasks to migrate
env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
more_balance:
rq_lock_irqsave(busiest, &rf);
update_rq_clock(busiest);
/*
* cur_ld_moved - load moved in current iteration
* ld_moved - cumulative load moved across iterations
*/
cur_ld_moved = detach_tasks(&env, &rf); // tasks detached in this round
/*
* We've detached some tasks from busiest_rq. Every
* task is masked "TASK_ON_RQ_MIGRATING", so we can safely
* unlock busiest->lock, and we are able to be sure
* that nobody can manipulate the tasks in parallel.
* See task_rq_lock() family for the details.
*/
rq_unlock(busiest, &rf);
if (cur_ld_moved) {
// attach_tasks() enqueues the detached tasks on the new rq
attach_tasks(&env);
ld_moved += cur_ld_moved; // ld_moved accumulates tasks moved across rounds
}
local_irq_restore(rf.flags);
/* (7.9) LBF_NEED_BREAK is set: the balance is not finished yet;
the loop only stepped out for a breather and goes straight back. */
/* LBF_NEED_BREAK is set inside detach_tasks() (sketched after this function),
which runs with the rq spinlock held; holding a spinlock too long causes
problems, hence the periodic break. */
if (env.flags & LBF_NEED_BREAK) {
env.flags &= ~LBF_NEED_BREAK;
goto more_balance;
}
/*
* Revisit (affine) tasks on src_cpu that couldn't be moved to
* us and move them to an alternate dst_cpu in our sched_group
* where they can run. The upper limit on how many times we
* iterate on same src_cpu is dependent on number of cpus in our
* sched_group.
*
* This changes load balance semantics a bit on who can move
* load to a given_cpu. In addition to the given_cpu itself
* (or a ilb_cpu acting on its behalf where given_cpu is
* nohz-idle), we now have balance_cpu in a position to move
* load to given_cpu. In rare situations, this may cause
* conflicts (balance_cpu and given_cpu/ilb_cpu deciding
* _independently_ and at _same_ time to move some load to
* given_cpu) causing excess load to be moved to given_cpu.
* This however should not happen so much in practice and
* moreover subsequent load balance cycles should correct the
* excess load moved.
*/
/* (7.10) LBF_DST_PINNED is set and env.imbalance > 0:
some tasks on src_cpu could not move to dst_cpu because of affinity,
but can move to new_dst_cpu in the same sched_group.
Switch dst_cpu to new_dst_cpu and restart the balance.
*/
if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
/*
The load balance path uses a number of flags. LBF_SOME_PINNED means that
dst rq is not in some task's cpus_allowed mask. If such a task could still
migrate to another CPU of the group, LBF_DST_PINNED is set as well; in that
case a new target CPU is picked from the group and the balance continues.
Clearly LBF_DST_PINNED can never be set during a NEWLY IDLE balance,
because there dst_grpmask is NULL.
*/
/* Prevent to re-select dst_cpu via env's cpus */
cpumask_clear_cpu(env.dst_cpu, env.cpus);
env.dst_rq = cpu_rq(env.new_dst_cpu);
env.dst_cpu = env.new_dst_cpu;
env.flags &= ~LBF_DST_PINNED;
env.loop = 0;
env.loop_break = sched_nr_migrate_break;
/*
* Go back to "more_balance" rather than "redo" since we
* need to continue with same src_cpu.
*/
goto more_balance;
}
/*
* We failed to reach balance because of affinity.
*/
/* (7.11) LBF_SOME_PINNED is set: some tasks failed to migrate because of affinity.
Mark the parent sd's sgc->imbalance so the parent domain becomes more likely
to rebalance.
*/
if (sd_parent) { // as seen from the parent level (e.g. MC)
int *group_imbalance = &sd_parent->groups->sgc->imbalance;
if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
*group_imbalance = 1;
}
/* All tasks on this runqueue were pinned by CPU affinity */
/* (7.12) If LBF_ALL_PINNED stayed set, affinity kept every single task on
busiest_cpu from migrating, even to other CPUs in dst_cpu's sched_group.
Drop the busiest CPU from the candidate cpumask and redo the whole flow:
find_busiest_group -> find_busiest_queue -> detach_tasks -> attach_tasks
*/
if (unlikely(env.flags & LBF_ALL_PINNED)) {
cpumask_clear_cpu(cpu_of(busiest), cpus);
/*
* Attempting to continue load balancing at the current
* sched_domain level only makes sense if there are
* active CPUs remaining as possible busiest CPUs to
* pull load from which are not contained within the
* destination group that is receiving any migrated
* load.
*/
if (!cpumask_subset(cpus, env.dst_grpmask)) {
env.loop = 0;
env.loop_break = sched_nr_migrate_break;
goto redo;
}
goto out_all_pinned;
}
}
/* (7.13) After several rounds of trying, ld_moved is still 0: the pull
failed. Fall back to the aggressive approach, i.e. have the busiest CPU
actively push a task to the local CPU. */
if (!ld_moved) {
schedstat_inc(sd->lb_failed[idle]);
/*
* Increment the failure counter only on periodic balance.
* We do not want newidle balance, which can be very
* frequent, pollute the failure counter causing
* excessive cache_hot migrations and active balances.
*/
if (idle != CPU_NEWLY_IDLE)
if (env.src_grp_nr_running > 1)
sd->nr_balance_failed++;
// one last attempt: migrate a single task via active balance
if (need_active_balance(&env)) {
unsigned long flags;
raw_spin_lock_irqsave(&busiest->lock, flags);
/* don't kick the active_load_balance_cpu_stop,
* if the curr task on busiest cpu can't be
* moved to this_cpu
*/
/* (7.15) If this CPU is not in busiest->curr's affinity mask, give up */
if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
raw_spin_unlock_irqrestore(&busiest->lock,
flags);
env.flags |= LBF_ALL_PINNED;
goto out_one_pinned;
}
/*
* ->active_balance synchronizes accesses to
* ->active_balance_work. Once set, it's cleared
* only after active load balance is finished.
*/
if (!busiest->active_balance) { // kick off active load balancing
busiest->active_balance = 1;
busiest->push_cpu = this_cpu;
active_balance = 1;
}
raw_spin_unlock_irqrestore(&busiest->lock, flags);
if (active_balance) { /* (7.16) migrate busiest->curr away from its current CPU */
/*
This is the most aggressive form of load balance: a stop-class thread
(as introduced earlier, stop > deadline > realtime > fair > idle) preempts
whatever is running on the src rq and requeues it as runnable, which brings
even the running task into the scope of load balancing.
*/
/*
While this rq is doing fair-class load balance, stop_one_cpu_nowait()
stops the CPU's current task, and active_load_balance_cpu_stop() then
moves tasks from the busiest CPU over to the idle one.
*/
stop_one_cpu_nowait(cpu_of(busiest),
active_load_balance_cpu_stop, busiest,
&busiest->active_balance_work);
}
/* We've kicked active balancing, force task migration. */
sd->nr_balance_failed = sd->cache_nice_tries+1;
}
} else
sd->nr_balance_failed = 0;
// tune the load balance interval
if (likely(!active_balance)) {
/* We were unbalanced, so reset the balancing interval */
sd->balance_interval = sd->min_interval; // rebalance again as soon as possible
} else {
/*
* If we've begun active balancing, start to back off. This
* case may not be covered by the all_pinned logic if there
* is only 1 task on the busy runqueue (because we don't call
* detach_tasks).
*/
if (sd->balance_interval < sd->max_interval)
sd->balance_interval *= 2; // back off: delay the next balance
}
goto out;
out_balanced:
/*
* We reach balance although we may have faced some affinity
* constraints. Clear the imbalance flag if it was set.
*/
if (sd_parent) {
int *group_imbalance = &sd_parent->groups->sgc->imbalance;
if (*group_imbalance)
*group_imbalance = 0;
}
out_all_pinned:
/*
* We reach balance because all tasks are pinned at this level so
* we can't migrate them. Let the imbalance flag set so parent level
* can try to migrate them.
*/
schedstat_inc(sd->lb_balanced[idle]);
sd->nr_balance_failed = 0;
out_one_pinned:
ld_moved = 0; // report zero tasks moved
/*
* idle_balance() disregards balance intervals, so we could repeatedly
* reach this code, which would lead to balance_interval skyrocketing
* in a short amount of time. Skip the balance_interval increase logic
* to avoid that.
*/
if (env.idle == CPU_NEWLY_IDLE)
goto out;
/* tune up the balancing interval */
// LBF_ALL_PINNED: every candidate task was pinned by affinity
if (((env.flags & LBF_ALL_PINNED) &&
// MAX_PINNED_INTERVAL is 512
sd->balance_interval < MAX_PINNED_INTERVAL) ||
(sd->balance_interval < sd->max_interval))
// nothing needed to move; lengthen the balance interval, since the
// system is currently either balanced or unmovable
sd->balance_interval *= 2;
out:
return ld_moved; // number of tasks moved
}
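The loop_break / LBF_NEED_BREAK dance above relies on detach_tasks() bailing out periodically so the rq lock is not held for too long. A trimmed sketch of that loop, based on mainline ~v4.x fair.c (load accounting and a few checks omitted; the vendor variant quoted in this post also takes an rq_flags argument):

/* Trimmed sketch of detach_tasks() -- see kernel/sched/fair.c for the real thing. */
static int detach_tasks(struct lb_env *env)
{
	struct list_head *tasks = &env->src_rq->cfs_tasks;
	struct task_struct *p;
	unsigned long load;
	int detached = 0;

	while (!list_empty(tasks)) {
		p = list_first_entry(tasks, struct task_struct, se.group_node);

		env->loop++;
		/* We've scanned as many tasks as we are allowed to: quit */
		if (env->loop > env->loop_max)
			break;

		/* take a breather every sched_nr_migrate_break tasks */
		if (env->loop > env->loop_break) {
			env->loop_break += sched_nr_migrate_break;
			env->flags |= LBF_NEED_BREAK; /* load_balance() will come back */
			break;
		}

		if (!can_migrate_task(p, env))
			goto next;

		load = task_h_load(p);
		if ((load / 2) > env->imbalance)
			goto next;

		detach_task(p, env);
		list_add(&p->se.group_node, &env->tasks);

		detached++;
		env->imbalance -= load;

		/* Enough load moved: stop */
		if (env->imbalance <= 0)
			break;

		continue;
next:
		list_move_tail(&p->se.group_node, tasks);
	}
	return detached;
}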
When pulls have failed a certain number of times, push takes over. First, the ways a pull can fail:
1. CPU affinity: the task is pinned to specific CPUs and may not run anywhere else, so it cannot be migrated.
2. The task is running on a CPU right now, or last ran so recently that plenty of its data is still live in that CPU's cache. Such a task's cache is called hot, and we avoid migrating cache-hot tasks whenever possible. Both checks live in can_migrate_task(), sketched below.
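A condensed sketch of can_migrate_task(), again following mainline ~v4.x (throttling, NUMA locality, and the new_dst_cpu search are omitted):

/* Condensed sketch of can_migrate_task() -- see kernel/sched/fair.c. */
static int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
	int tsk_cache_hot;

	/* 1) Affinity: dst_cpu is not in the task's allowed mask */
	if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) {
		env->flags |= LBF_SOME_PINNED;
		/* ...may pick a new_dst_cpu in the group and set LBF_DST_PINNED... */
		return 0;
	}

	/* At least this task was not pinned against dst_cpu */
	env->flags &= ~LBF_ALL_PINNED;

	/* 2a) The task is running right now: it cannot be detached */
	if (task_running(env->src_rq, p))
		return 0;

	/*
	 * 2b) Cache-hot: the task ran too recently. Migrate it anyway only
	 * when we are desperate, i.e. the domain has already failed too many
	 * balance attempts (nr_balance_failed > cache_nice_tries).
	 */
	tsk_cache_hot = task_hot(p, env);
	if (!tsk_cache_hot ||
	    env->sd->nr_balance_failed > env->sd->cache_nice_tries)
		return 1;

	return 0;
}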
Push, i.e. active_load_balance, exists mainly to migrate the task that is running right now, or one whose last run was only moments ago.
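Whether load_balance() escalates to a push is decided by need_active_balance(). Its decisive check in mainline ~v4.x weighs the failure counter against cache_nice_tries (a sketch; the ASYM_PACKING and reduced-capacity special cases are omitted):

/* Sketch of need_active_balance() -- special cases elided. */
static int need_active_balance(struct lb_env *env)
{
	struct sched_domain *sd = env->sd;

	/* ...ASYM_PACKING / reduced-capacity special cases elided... */

	/* Push only after enough pull attempts have failed */
	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries + 2);
}

This is presumably also why load_balance() sets sd->nr_balance_failed = sd->cache_nice_tries+1 after kicking the stopper: with the counter above cache_nice_tries, can_migrate_task() will let even cache-hot tasks move, forcing the migration through.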
active_load_balance_cpu_stop
/*
A stop-class thread is used (as introduced earlier: stop > deadline >
realtime > fair > idle). The stop thread halts whatever fair (or other
below-stop) task is currently running, then moves load from the busiest
CPU onto the idle CPU.
*/
/*
* active_load_balance_cpu_stop is run by cpu stopper. It pushes
* running tasks off the busiest CPU onto idle CPUs. It requires at
* least 1 task to be running on each physical CPU where possible, and
* avoids physical / logical imbalances.
*/
static int active_load_balance_cpu_stop(void *data)
{
struct rq *busiest_rq = data;
int busiest_cpu = cpu_of(busiest_rq);
int target_cpu = busiest_rq->push_cpu;
struct rq *target_rq = cpu_rq(target_cpu);
struct sched_domain *sd = NULL;
struct task_struct *p = NULL;
struct rq_flags rf;
struct task_struct *push_task = NULL;
int push_task_detached = 0;
struct lb_env env = {
.sd = sd,
.dst_cpu = target_cpu,
.dst_rq = target_rq,
.src_cpu = busiest_rq->cpu,
.src_rq = busiest_rq,
.idle = CPU_IDLE, // destination is treated as an idle CPU
.flags = 0,
.loop = 0,
};
rq_lock_irq(busiest_rq, &rf);
/*
* Between queueing the stop-work and running it is a hole in which
* CPUs can become inactive. We should not move tasks from or to
* inactive CPUs.
*/
if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
goto out_unlock;
// i.e. this must be running on the busiest CPU itself, which is exactly what the name 'push' suggests
/* make sure the requested cpu hasn't gone down in the meantime */
if (unlikely(busiest_cpu != smp_processor_id() ||
!busiest_rq->active_balance))
goto out_unlock;
/* Is there any task to move? */
if (busiest_rq->nr_running <= 1)
goto out_unlock;
/*
* This condition is "impossible", if it occurs
* we need to fix it. Originally reported by
* Bjorn Helgaas on a 128-cpu setup.
*/
BUG_ON(busiest_rq == target_rq);
push_task = busiest_rq->push_task;
if (push_task) {
if (task_on_rq_queued(push_task) &&
push_task->state == TASK_RUNNING &&
task_cpu(push_task) == busiest_cpu &&
cpu_online(target_cpu)) {
update_rq_clock(busiest_rq);
detach_task(push_task, &env, &rf);
push_task_detached = 1;
}
goto out_unlock;
}
/* Search for an sd spanning us and the target CPU. */
rcu_read_lock();
for_each_domain(target_cpu, sd) {
if ((sd->flags & SD_LOAD_BALANCE) &&
cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
break;
}
if (likely(sd)) {
struct lb_env env = {
.sd = sd,
.dst_cpu = target_cpu,
.dst_rq = target_rq,
.src_cpu = busiest_rq->cpu,
.src_rq = busiest_rq,
.idle = CPU_IDLE,
/*
* can_migrate_task() doesn't need to compute new_dst_cpu
* for active balancing. Since we have CPU_IDLE, but no
* @dst_grpmask we need to make that test go away with lying
* about DST_PINNED.
*/
.flags = LBF_DST_PINNED,
};
schedstat_inc(sd->alb_count);
update_rq_clock(busiest_rq);
p = detach_one_task(&env, &rf);
if (p) {
schedstat_inc(sd->alb_pushed);
/* Active balancing done, reset the failure counter. */
sd->nr_balance_failed = 0;
} else {
schedstat_inc(sd->alb_failed);
}
}
rcu_read_unlock();
out_unlock:
busiest_rq->active_balance = 0;
if (push_task)
busiest_rq->push_task = NULL;
rq_unlock(busiest_rq, &rf);
if (push_task) {
if (push_task_detached)
attach_one_task(target_rq, push_task);
put_task_struct(push_task);
}
if (p)
attach_one_task(target_rq, p);
local_irq_enable();
return 0;
}
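On the receiving side, the detached task is handed to the target rq via attach_one_task(); roughly (a ~v4.x sketch):

/* Sketch of attach_one_task() -- take the destination rq lock and enqueue. */
static void attach_one_task(struct rq *rq, struct task_struct *p)
{
	struct rq_flags rf;

	rq_lock(rq, &rf);
	update_rq_clock(rq);
	attach_task(rq, p); /* enqueue p and check whether it should preempt */
	rq_unlock(rq, &rf);
}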