[Kernel Scheduling / Load Balancing] detach_tasks

detach_tasks

During load balancing, tasks are detached from the busiest CPU's runqueue and then attached to the local CPU's runqueue. The return value of detach_tasks() is the number of tasks detached.
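To see where detach_tasks() fits, here is its calling sequence in load_balance() (kernel/sched/fair.c), abridged to just the detach/attach steps; this is a sketch of the mainline flow, not the complete function:

more_balance:
	rq_lock_irqsave(busiest, &rf);
	update_rq_clock(busiest);

	/* Detach up to env.imbalance worth of load from busiest */
	cur_ld_moved = detach_tasks(&env, &rf);

	/* Drop busiest's lock, then attach the detached tasks to the
	 * local (dst) rq under its own lock */
	rq_unlock(busiest, &rf);

	if (cur_ld_moved) {
		attach_tasks(&env);
		ld_moved += cur_ld_moved;
	}

	local_irq_restore(rf.flags);

	/* detach_tasks() sets LBF_NEED_BREAK every loop_break tasks:
	 * clear the flag and resume scanning where it left off */
	if (env.flags & LBF_NEED_BREAK) {
		env.flags &= ~LBF_NEED_BREAK;
		goto more_balance;
	}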

/*
 * detach_tasks() -- tries to detach up to imbalance weighted load from
 * busiest_rq, as part of a balancing operation within domain "sd".
 *
 * Returns number of detached tasks if successful and 0 otherwise.
 */
 // Returns the number of detached tasks
static int detach_tasks(struct lb_env *env, struct rq_flags *rf)
{
	// src_rq is the rq of the busiest CPU
	struct list_head *tasks = &env->src_rq->cfs_tasks;
	struct task_struct *p;
	unsigned long load;
	int detached = 0;

	lockdep_assert_held(&env->src_rq->lock);

	if (env->imbalance <= 0)
		return 0;
	/* (7.6.1) Walk the tasks on the busiest rq */
	while (!list_empty(tasks)) {
		/*
		 * We don't want to steal all, otherwise we may be treated likewise,
		 * which could at worst lead to a livelock crash.
		 */
		/* (7.6.2) If the destination CPU is idle and the busiest rq has at
		   most one running task left, stop: pulling the last task would
		   just swap which CPU is busy */
		if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
			break;

		p = list_first_entry(tasks, struct task_struct, se.group_node);

		env->loop++;
		/* We've more or less seen every task there is, call it quits */
		/* (7.6.3) Scan at most loop_max tasks, capped by
		   sysctl_sched_nr_migrate (default 32) */
		if (env->loop > env->loop_max)
			break;

		/* take a breather every nr_migrate tasks */
		/* (7.6.4) Take a breather every sched_nr_migrate_break tasks:
		   set LBF_NEED_BREAK and bail out; if loop_max has not been
		   reached yet, the caller will restart the scan */
		if (env->loop > env->loop_break) {
			env->loop_break += sched_nr_migrate_break;
			env->flags |= LBF_NEED_BREAK;
			break;
		}
		/* (7.6.5) Is this task allowed to migrate at all? */
		if (!can_migrate_task(p, env))
			goto next;
		/* (7.6.6) Get p's load relative to the top-level cfs_rq and use
		   it to decide whether p is worth migrating */
		load = task_h_load(p);
		// LB_MIN is false by default
		if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
			goto next;

		if ((load / 2) > env->imbalance)
			goto next;
		// Detach the task and add it to env->tasks for later attach_tasks()
		detach_task(p, env, rf);
		list_add(&p->se.group_node, &env->tasks);

		detached++;
		env->imbalance -= load;

#ifdef CONFIG_PREEMPT
		/*
		 * NEWIDLE balancing is a source of latency, so preemptible
		 * kernels will stop after the first task is detached to minimize
		 * the critical section.
		 */
		if (env->idle == CPU_NEWLY_IDLE)
			break;
#endif

		/*
		 * We only want to steal up to the prescribed amount of
		 * weighted load.
		 */
		if (env->imbalance <= 0)
			break;

		continue;
next:
		// Move the skipped task to the tail so this scan will not revisit it
		list_move_tail(&p->se.group_node, tasks);
	}

	/*
	 * Right now, this is one of only two places we collect this stat
	 * so we can safely collect detach_one_task() stats here rather
	 * than inside detach_one_task().
	 */
	schedstat_add(env->sd->lb_gained[env->idle], detached);

	return detached;
}

task_h_load

Because CONFIG_FAIR_GROUP_SCHED is enabled here, a task's load has to be expressed relative to the top-level cfs_rq (a worked example follows update_cfs_rq_h_load() below).

/*
static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
{
        return cfs_rq->avg.load_avg;
}
*/
static unsigned long task_h_load(struct task_struct *p)
{
	struct cfs_rq *cfs_rq = task_cfs_rq(p);

	update_cfs_rq_h_load(cfs_rq);
	// load_avg is the weight-scaled average load
	/* (7.6.6.1) With task groups enabled, the rq contains multiple levels
	   of cfs_rq. If p is queued on a lower-level cfs_rq, task_h_load()
	   converts p's load into a load relative to the top-level cfs_rq */
	return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
			cfs_rq_load_avg(cfs_rq) + 1);
}

update_cfs_rq_h_load

Computes each cfs_rq's h_load by walking the cfs_rq hierarchy.

/*
 * Compute the hierarchical load factor for cfs_rq and all its ascendants.
 * This needs to be done in a top-down fashion because the load of a child
 * group is a fraction of its parents load.
 */
static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
{
	struct rq *rq = rq_of(cfs_rq);
	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
	unsigned long now = jiffies;
	unsigned long load;

	if (cfs_rq->last_h_load_update == now)
		return;

	WRITE_ONCE(cfs_rq->h_load_next, NULL);
	/* Build the chain from the bottom-level cfs_rq up to the top-level
	   cfs_rq via h_load_next */
	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		WRITE_ONCE(cfs_rq->h_load_next, se);
		if (cfs_rq->last_h_load_update == now)
			break;
	}

	if (!se) {
		cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
		cfs_rq->last_h_load_update = now;
	}
	/* Walk the chain back down from the top-level cfs_rq, computing each
	   level's h_load relative to the top */
	while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
		load = cfs_rq->h_load;
		load = div64_ul(load * se->avg.load_avg,
			cfs_rq_load_avg(cfs_rq) + 1);
		cfs_rq = group_cfs_rq(se);
		cfs_rq->h_load = load;
		cfs_rq->last_h_load_update = now;
	}
}
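A worked example with made-up numbers may make the two formulas concrete. Suppose a task group's entity contributes load_avg = 512 to a root cfs_rq whose total load_avg is 2048, and the group's own cfs_rq has load_avg = 1024 and hosts a task p with p->se.avg.load_avg = 256 (all values illustrative, not from a real trace):

#include <stdio.h>

int main(void)
{
	/* Top level: h_load is simply the root cfs_rq's own load_avg */
	unsigned long root_h_load = 2048, root_load_avg = 2048;
	/* The group entity's contribution to the root cfs_rq */
	unsigned long group_se_load_avg = 512;
	/* The group's own cfs_rq, and task p queued on it */
	unsigned long group_load_avg = 1024, p_load_avg = 256;

	/* update_cfs_rq_h_load(): scale the parent's h_load by the group
	 * entity's share of the parent cfs_rq (the +1 avoids div-by-zero) */
	unsigned long group_h_load =
		root_h_load * group_se_load_avg / (root_load_avg + 1);

	/* task_h_load(): scale the group's h_load by p's share of the
	 * group cfs_rq */
	unsigned long p_h_load =
		p_load_avg * group_h_load / (group_load_avg + 1);

	/* prints: group h_load = 511, task_h_load(p) = 127 */
	printf("group h_load = %lu, task_h_load(p) = %lu\n",
	       group_h_load, p_h_load);
	return 0;
}

So although p carries 256 units of load on its own queue, it counts for only about 127 units at the top level, because its group owns only a quarter of the root cfs_rq.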

detach_task

A detached task is guaranteed to be attached afterwards, because detach_task() has already set the task's CPU to dst_cpu.

/*
 * detach_task() -- detach the task for the migration specified in env
 */
static void detach_task(struct task_struct *p, struct lb_env *env,
			struct rq_flags *rf)
{
	lockdep_assert_held(&env->src_rq->lock);

	p->on_rq = TASK_ON_RQ_MIGRATING;
	deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
	rq_unpin_lock(env->src_rq, rf);
	double_lock_balance(env->src_rq, env->dst_rq);
	set_task_cpu(p, env->dst_cpu);	/* the task's CPU is switched to dst_cpu right here */
	double_unlock_balance(env->src_rq, env->dst_rq);
	rq_repin_lock(env->src_rq, rf);
}
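For completeness, the attach side that consumes env->tasks looks roughly like this in kernels of this vintage (quoted for reference; details may vary slightly between versions):

static void attach_task(struct rq *rq, struct task_struct *p)
{
	lockdep_assert_held(&rq->lock);

	BUG_ON(task_rq(p) != rq);
	activate_task(rq, p, ENQUEUE_NOCLOCK);
	p->on_rq = TASK_ON_RQ_QUEUED;
	check_preempt_curr(rq, p, 0);
}

static void attach_tasks(struct lb_env *env)
{
	struct list_head *tasks = &env->tasks;
	struct task_struct *p;
	struct rq_flags rf;

	rq_lock(env->dst_rq, &rf);
	update_rq_clock(env->dst_rq);

	while (!list_empty(tasks)) {
		p = list_first_entry(tasks, struct task_struct, se.group_node);
		list_del_init(&p->se.group_node);

		attach_task(env->dst_rq, p);
	}

	rq_unlock(env->dst_rq, &rf);
}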

deactivate_task() essentially detaches the task from the runqueue's bookkeeping. It is defined in core.c; both it and dequeue_task() are quoted below.

deactivate_task & dequeue_task

void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible++;

	dequeue_task(rq, p, flags);
}
static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (!(flags & DEQUEUE_NOCLOCK))
		update_rq_clock(rq);

	if (!(flags & DEQUEUE_SAVE)) {
		sched_info_dequeued(rq, p);
		psi_dequeue(p, flags & DEQUEUE_SLEEP);
	}

	p->sched_class->dequeue_task(rq, p, flags);
}
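The last line dispatches through the scheduling-class table. For CFS tasks, fair_sched_class wires dequeue_task to dequeue_task_fair() (excerpt from fair.c):

const struct sched_class fair_sched_class = {
	/* ... */
	.dequeue_task		= dequeue_task_fair,
	/* ... */
};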

dequeue_task_fair

For an ordinary CFS task, p->sched_class->dequeue_task resolves to dequeue_task_fair().

/*
 * The dequeue_task method is called before nr_running is
 * decreased. We remove the task from the rbtree and
 * update the fair scheduling stats:
 */
 
 /*
 * 1. Walk up the sched_entity hierarchy; at each level the real work is
 *    delegated to dequeue_entity(), and the kernel takes the opportunity
 *    to update the runtime statistics via update_curr().
 * 2. dequeue_entity() internally calls __dequeue_entity() to take the
 *    entity off the red-black tree.
 */
static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se;
	int task_sleep = flags & DEQUEUE_SLEEP;

	/* Update rq's walt util before update schedutil */
	walt_dec_cumulative_runnable_avg(rq, p);

	/*
	 * The code below (indirectly) updates schedutil which looks at
	 * the cfs_rq utilization to select a frequency.
	 * Let's update schedtune here to ensure the boost value of the
	 * current task is not more accounted for in the selection of the OPP.
	 */
	schedtune_dequeue_task(p, cpu_of(rq));

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		dequeue_entity(cfs_rq, se, flags);

		/*
		 * end evaluation on encountering a throttled cfs_rq
		 *
		 * note: in the case of encountering a throttled cfs_rq we will
		 * post the final h_nr_running decrement below.
		*/
		if (cfs_rq_throttled(cfs_rq))
			break;
		cfs_rq->h_nr_running--;
		walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);

		/* Don't dequeue parent if it has other entities besides us */
		if (cfs_rq->load.weight) {
			/* Avoid re-evaluating load for this entity: */
			se = parent_entity(se);
			/*
			 * Bias pick_next to pick a task from this cfs_rq, as
			 * p is sleeping when it is within its sched_slice.
			 */
			if (task_sleep && se && !throttled_hierarchy(cfs_rq))
				set_next_buddy(se);
			break;
		}
		flags |= DEQUEUE_SLEEP;
	}

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		cfs_rq->h_nr_running--;
		walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);

		if (cfs_rq_throttled(cfs_rq))
			break;

		update_load_avg(se, UPDATE_TG);
		update_cfs_shares(se);
	}

	if (!se)
		sub_nr_running(rq, 1);

	util_est_dequeue(&rq->cfs, p, task_sleep);
	hrtick_update(rq);
}

walt_dec_cumulative_runnable_avg

void
walt_dec_cumulative_runnable_avg(struct rq *rq,
				 struct task_struct *p)
{
	rq->cumulative_runnable_avg -= p->ravg.demand;
	BUG_ON((s64)rq->cumulative_runnable_avg < 0);

	/*
	 * on_rq will be 1 for sleeping tasks. So check if the task
	 * is migrating or dequeuing in RUNNING state to change the
	 * prio/cgroup/class.
	 */
	if (task_on_rq_migrating(p) || p->state == TASK_RUNNING)
		fixup_cum_window_demand(rq, -(s64)p->ravg.demand);
}

dequeue_entity

dequeue_entity() actually removes the task from the red-black tree. It also updates the entity's load (computed with PELT).

static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);

	/*
	 * When dequeuing a sched_entity, we must:
	 *   - Update loads to have both entity and cfs_rq synced with now.
	 *   - Subtract its load from the cfs_rq->runnable_avg.
	 *   - Subtract its previous weight from cfs_rq->load.weight.
	 *   - For group entity, update its weight to reflect the new share
	 *     of its group cfs_rq.
	 */
	update_load_avg(se, UPDATE_TG);
	dequeue_entity_load_avg(cfs_rq, se);

	update_stats_dequeue(cfs_rq, se, flags);

	clear_buddies(cfs_rq, se);

	if (se != cfs_rq->curr)
		__dequeue_entity(cfs_rq, se);
	se->on_rq = 0;
	account_entity_dequeue(cfs_rq, se);

	/*
	 * Normalize after update_curr(); which will also have moved
	 * min_vruntime if @se is the one holding it back. But before doing
	 * update_min_vruntime() again, which will discount @se's position and
	 * can move min_vruntime forward still more.
	 */
	if (!(flags & DEQUEUE_SLEEP))
		se->vruntime -= cfs_rq->min_vruntime;

	/* return excess runtime on last dequeue */
	return_cfs_rq_runtime(cfs_rq);

	update_cfs_shares(se);

	/*
	 * Now advance min_vruntime if @se was the entity holding it back,
	 * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
	 * put back on, and if we advance min_vruntime, we'll be placed back
	 * further than we started -- ie. we'll be penalized.
	 */
	if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
		update_min_vruntime(cfs_rq);
}
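The min_vruntime normalization above is worth a small illustration. On a non-sleep dequeue (e.g. migration) the task's vruntime is made queue-independent; the enqueue path on the destination CPU later adds the new queue's min_vruntime back. A toy calculation with made-up numbers:

#include <stdio.h>

int main(void)
{
	unsigned long long vruntime = 1000500;	/* task, on the src cfs_rq */
	unsigned long long src_min = 1000000;	/* src cfs_rq->min_vruntime */
	unsigned long long dst_min = 2000000;	/* dst cfs_rq->min_vruntime */

	vruntime -= src_min;	/* dequeue_entity(): relative lag = 500 */
	vruntime += dst_min;	/* enqueue on dst: 2000500 */

	/* The task keeps its 500-unit distance from the queue head, so the
	 * migration neither penalizes nor unfairly boosts it */
	printf("vruntime on dst = %llu\n", vruntime);
	return 0;
}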

__dequeue_entity

The removal from the red-black tree itself happens in __dequeue_entity():

static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/* erase the node from the cached rbtree */
	rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
}
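tasks_timeline here is a cached rbtree: besides the root it tracks the leftmost node, i.e. the entity with the smallest vruntime, so picking the next task is O(1). The underlying structure (from include/linux/rbtree.h):

struct rb_root_cached {
	struct rb_root rb_root;
	struct rb_node *rb_leftmost;	/* cached minimum (smallest vruntime here) */
};

rb_erase_cached() updates rb_leftmost if the erased node was the cached minimum, then performs an ordinary rb_erase().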

 
