Kernel Scheduler Study Notes (7): Task Management in the CFS Scheduler

  1. Overview
    How does CFS manage its tasks? This note walks through the enqueue and dequeue paths to show where runnable tasks are kept and how they are inserted into and removed from the run queue.
  2. The enqueue_task path
    The entry point for CFS tasks is enqueue_task_fair().
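    It is not called directly; the core scheduler reaches it through the enqueue_task() wrapper in kernel/sched/core.c (the matching dequeue_task() wrapper is quoted in section 3). Below is a trimmed sketch of that wrapper as it looks in kernels of roughly this era (quoted from memory, so details may differ between versions):

    static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
    {
    	if (!(flags & ENQUEUE_NOCLOCK))
    		update_rq_clock(rq); /* keep rq->clock up to date unless asked not to */

    	if (!(flags & ENQUEUE_RESTORE)) {
    		sched_info_queued(rq, p);               /* schedstats bookkeeping  */
    		psi_enqueue(p, flags & ENQUEUE_WAKEUP); /* pressure-stall tracking */
    	}

    	uclamp_rq_inc(rq, p);                       /* utilization clamp accounting */
    	p->sched_class->enqueue_task(rq, p, flags); /* dispatch to the class hook   */
    }

    For a CFS task, p->sched_class->enqueue_task points at enqueue_task_fair():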
    /*
     * The enqueue_task method is called before nr_running is
     * increased. Here we update the fair scheduling stats and
     * then put the task into the rbtree:
     */
    static void
    enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
    {
    	struct cfs_rq *cfs_rq;
    	struct sched_entity *se = &p->se;
    	int idle_h_nr_running = task_has_idle_policy(p);
    
    	/*
    	 * The code below (indirectly) updates schedutil which looks at
    	 * the cfs_rq utilization to select a frequency.
    	 * Let's add the task's estimated utilization to the cfs_rq's
    	 * estimated utilization, before we update schedutil.
    	 */
    	util_est_enqueue(&rq->cfs, p); // update the rq's estimated utilization; studied separately later
    
    	/*
    	 * If in_iowait is set, the code below may not trigger any cpufreq
    	 * utilization updates, so do it here explicitly with the IOWAIT flag
    	 * passed.
    	 */
    	if (p->in_iowait)
    		cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); // set the IOWAIT boost; revisited later when studying cpufreq/schedutil
    
    	for_each_sched_entity(se) { // with CONFIG_FAIR_GROUP_SCHED this walks up from the task's se through its parent group entities; otherwise only the task's own se is visited
    		if (se->on_rq)
    			break;
    		cfs_rq = cfs_rq_of(se); // get the cfs_rq this sched_entity belongs to: tasks are managed inside a cfs_rq
    		enqueue_entity(cfs_rq, se, flags); // the core enqueue step
    
    		/*
    		 * end evaluation on encountering a throttled cfs_rq
    		 *
    		 * note: in the case of encountering a throttled cfs_rq we will
    		 * post the final h_nr_running increment below.
    		 */
    		if (cfs_rq_throttled(cfs_rq))
    			break;
    		cfs_rq->h_nr_running++;
    		cfs_rq->idle_h_nr_running += idle_h_nr_running;
    
    		flags = ENQUEUE_WAKEUP;
    	}
    
    	for_each_sched_entity(se) {
    		cfs_rq = cfs_rq_of(se);
    		cfs_rq->h_nr_running++;
    		cfs_rq->idle_h_nr_running += idle_h_nr_running;
    
    		if (cfs_rq_throttled(cfs_rq))
    			break;
    
    		update_load_avg(cfs_rq, se, UPDATE_TG);
    		update_cfs_group(se);
    	}
    
    	if (!se) {
    		add_nr_running(rq, 1);
    		/*
    		 * Since new tasks are assigned an initial util_avg equal to
    		 * half of the spare capacity of their CPU, tiny tasks have the
    		 * ability to cross the overutilized threshold, which will
    		 * result in the load balancer ruining all the task placement
    		 * done by EAS. As a way to mitigate that effect, do not account
    		 * for the first enqueue operation of new tasks during the
    		 * overutilized flag detection.
    		 *
    		 * A better way of solving this problem would be to wait for
    		 * the PELT signals of tasks to converge before taking them
    		 * into account, but that is not straightforward to implement,
    		 * and the following generally works well enough in practice.
    		 */
    		if (flags & ENQUEUE_WAKEUP)
    			update_overutilized_status(rq);
    
    	}
    
    	if (cfs_bandwidth_used()) {
    		/*
    		 * When bandwidth control is enabled; the cfs_rq_throttled()
    		 * breaks in the above iteration can result in incomplete
    		 * leaf list maintenance, resulting in triggering the assertion
    		 * below.
    		 */
    		for_each_sched_entity(se) {
    			cfs_rq = cfs_rq_of(se);
    
    			if (list_add_leaf_cfs_rq(cfs_rq))
    				break;
    		}
    	}
    
    	assert_list_leaf_cfs_rq(rq);
    
    	hrtick_update(rq);
    }
    
    With CONFIG_FAIR_GROUP_SCHED enabled there can be several cfs_rq instances (one per task group per CPU); the details are deferred, the focus here is task management. In that configuration cfs_rq_of() simply returns the cfs_rq the entity is attached to:
    /* runqueue on which this entity is (to be) queued */
    static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
    {
    	return se->cfs_rq;
    }
    Without CONFIG_FAIR_GROUP_SCHED, every se belongs to the single cfs_rq embedded in the CPU's rq:
    
    static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
    {
    	struct task_struct *p = task_of(se);
    	struct rq *rq = task_rq(p);
    
    	return &rq->cfs;
    }
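
    For context, the cfs_rq pointer returned by the group-scheduling variant above is a field of struct sched_entity itself. The following is a trimmed sketch of the relevant members (based on include/linux/sched.h of the same era; most fields omitted):

    struct sched_entity {
    	struct load_weight	load;		/* weight used for fair scheduling   */
    	struct rb_node		run_node;	/* node in cfs_rq->tasks_timeline    */
    	unsigned int		on_rq;		/* set while queued on a cfs_rq      */
    	u64			vruntime;	/* key that orders the rbtree        */
    #ifdef CONFIG_FAIR_GROUP_SCHED
    	struct sched_entity	*parent;	/* group entity one level up         */
    	struct cfs_rq		*cfs_rq;	/* cfs_rq this entity is queued on   */
    	struct cfs_rq		*my_q;		/* cfs_rq "owned" by a group entity  */
    #endif
    	/* ... runtime statistics, PELT load tracking, etc. omitted ... */
    };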
    

    enqueue_task_fair() takes the task's scheduling entity and enqueues it onto the corresponding cfs_rq; with CONFIG_FAIR_GROUP_SCHED disabled this means the rq manages the task through its single embedded cfs_rq.
    It also increments the rq's nr_running by one, so nr_running reflects the number of runnable (ready-to-run) tasks currently on this rq.
     

    static inline void add_nr_running(struct rq *rq, unsigned count)
    {
    	unsigned prev_nr = rq->nr_running; // value before the increment
    
    	rq->nr_running = prev_nr + count; // value after the increment
    
    #ifdef CONFIG_SMP
    	if (prev_nr < 2 && rq->nr_running >= 2) { // with two or more runnable tasks, mark the root domain overloaded so load balancing can act
    		if (!READ_ONCE(rq->rd->overload))
    			WRITE_ONCE(rq->rd->overload, 1);
    	}
    #endif
    
    	sched_update_tick_dependency(rq); // related to NO_HZ; to be studied later
    }
    

    enqueue_entity() is shown below:
     

    /*
     * MIGRATION
     *
     *	dequeue
     *	  update_curr()
     *	    update_min_vruntime()
     *	  vruntime -= min_vruntime
     *
     *	enqueue
     *	  update_curr()
     *	    update_min_vruntime()
     *	  vruntime += min_vruntime
     *
     * this way the vruntime transition between RQs is done when both
     * min_vruntime are up-to-date.
     *
     * WAKEUP (remote)
     *
     *	->migrate_task_rq_fair() (p->state == TASK_WAKING)
     *	  vruntime -= min_vruntime
     *
     *	enqueue
     *	  update_curr()
     *	    update_min_vruntime()
     *	  vruntime += min_vruntime
     *
     * this way we don't have the most up-to-date min_vruntime on the originating
     * CPU and an up-to-date min_vruntime on the destination CPU.
     */
    
    static void
    enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
    {
    	bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
    	bool curr = cfs_rq->curr == se; // cfs_rq->curr is the entity picked at the last selection, i.e. the one currently running on this cfs_rq
    
    	/*
    	 * If we're the current task, we must renormalise before calling
    	 * update_curr().
    	 */
    	if (renorm && curr) // question: why enqueue the currently running entity again? (this happens on save/restore style requeues, e.g. nice or group changes, where curr is dequeued and re-enqueued while still running)
    		se->vruntime += cfs_rq->min_vruntime; // renormalise the se's vruntime against this cfs_rq
        // update the runtime statistics (vruntime etc.) of the currently running entity
    	update_curr(cfs_rq);
    
    	/*
    	 * Otherwise, renormalise after, such that we're placed at the current
    	 * moment in time, instead of some random moment in the past. Being
    	 * placed in the past could significantly boost this task to the
    	 * fairness detriment of existing tasks.
    	 */
    	if (renorm && !curr)
    		se->vruntime += cfs_rq->min_vruntime;
        // as the comment below explains, the next steps update the load of both the entity and the cfs_rq
    	/*
    	 * When enqueuing a sched_entity, we must:
    	 *   - Update loads to have both entity and cfs_rq synced with now.
    	 *   - Add its load to cfs_rq->runnable_avg
    	 *   - For group_entity, update its weight to reflect the new share of
    	 *     its group cfs_rq
    	 *   - Add its new weight to cfs_rq->load.weight
    	 */
        // update the entity's load tracking
    	update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
    	update_cfs_group(se);
        // add the entity's runnable load into the cfs_rq
    	enqueue_runnable_load_avg(cfs_rq, se);
        // account the entity on the cfs_rq (nr_running, load.weight)
    	account_entity_enqueue(cfs_rq, se);
    
    	if (flags & ENQUEUE_WAKEUP)
    		place_entity(cfs_rq, se, 0);
    
    	check_schedstat_required();
    	update_stats_enqueue(cfs_rq, se, flags);
    	check_spread(cfs_rq, se);
    	if (!curr)
    		__enqueue_entity(cfs_rq, se);
        // mark the entity as being on the runqueue
    	se->on_rq = 1;
    
    	/*
    	 * When bandwidth control is enabled, cfs might have been removed
    	 * because of a parent been throttled but cfs->nr_running > 1. Try to
    	 * add it unconditionnally.
    	 */
    	if (cfs_rq->nr_running == 1 || cfs_bandwidth_used())
    		list_add_leaf_cfs_rq(cfs_rq);
    
    	if (cfs_rq->nr_running == 1)
    		check_enqueue_throttle(cfs_rq);
    }
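
    To make the MIGRATION comment above concrete, here is a tiny standalone illustration (the numbers are invented and this is not kernel code) of how a task's vruntime is renormalised when it moves from one cfs_rq to another:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
    	uint64_t vruntime   = 1000; /* task's vruntime on the source cfs_rq        */
    	uint64_t src_min_vr = 900;  /* source cfs_rq->min_vruntime at dequeue      */
    	uint64_t dst_min_vr = 5000; /* destination cfs_rq->min_vruntime at enqueue */

    	vruntime -= src_min_vr; /* dequeue: keep only the relative lag (100)       */
    	vruntime += dst_min_vr; /* enqueue: rebase onto the destination (5100)     */

    	printf("vruntime after migration: %llu\n", (unsigned long long)vruntime);
    	return 0;
    }

    The task keeps only its offset relative to the old queue's min_vruntime, so it neither gains nor loses an unfair amount of virtual runtime by moving to a CPU whose min_vruntime is very different.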
    

    The function above mostly does preparation before the actual insertion: it updates the load of the entity and of the cfs_rq, plus the timing-related statistics. Those topics will each get their own chapter. Once the preparation is done, the real enqueue is performed by __enqueue_entity():
     

    // as the comment says, entities are stored in a red-black tree
    /*
     * Enqueue an entity into the rb-tree:
     */
    static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
    {
        // the root of the cached rbtree lives in cfs_rq->tasks_timeline
    	struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;
    	struct rb_node *parent = NULL;
    	struct sched_entity *entry;
    	bool leftmost = true;
    
    	/*
    	 * Find the right place in the rbtree:
    	 */
    	while (*link) {
    		parent = *link;
    		entry = rb_entry(parent, struct sched_entity, run_node);
    		/*
    		 * We dont care about collisions. Nodes with
    		 * the same key stay together.
    		 */
    		if (entity_before(se, entry)) { // as entity_before() below shows, the tree is ordered by vruntime
    			link = &parent->rb_left;
    		} else {
    			link = &parent->rb_right;
    			leftmost = false;
    		}
    	}
    
    	rb_link_node(&se->run_node, parent, link);
    	rb_insert_color_cached(&se->run_node,
    			       &cfs_rq->tasks_timeline, leftmost);
    }
    
    static inline int entity_before(struct sched_entity *a,
    				struct sched_entity *b)
    {
    	return (s64)(a->vruntime - b->vruntime) < 0;
    }
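
    Note that the comparison is made on the signed difference rather than with a plain a < b, so the ordering stays correct even if vruntime (a u64) eventually wraps around, as long as two queued entities never differ by more than 2^63. A small standalone demonstration (illustration only, not kernel code):

    #include <stdint.h>
    #include <stdio.h>

    /* same idea as entity_before(): compare via the signed difference */
    static int before(uint64_t a, uint64_t b)
    {
    	return (int64_t)(a - b) < 0;
    }

    int main(void)
    {
    	uint64_t a = UINT64_MAX - 5; /* "smaller" vruntime, just below the wrap point */
    	uint64_t b = 10;             /* "larger" vruntime, just after wrapping        */

    	/* the plain compare gets the order wrong here, the signed difference does not */
    	printf("a < b: %d, before(a, b): %d\n", a < b, before(a, b));
    	return 0;
    }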
    

    To sum up, CFS manages tasks at the granularity of sched_entity, and the entities queued on a cfs_rq are kept in a red-black tree ordered by vruntime.

    The leftmost node of that tree holds the smallest vruntime, so picking the next task to run only requires taking the leftmost entity.
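
    For reference, that leftmost lookup is a cheap read of the cached rbtree; in kernel/sched/fair.c of roughly the same era it looks like this (quoted from memory, so the exact form may differ slightly):

    struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
    {
    	struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);

    	if (!left)
    		return NULL;

    	return rb_entry(left, struct sched_entity, run_node);
    }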

  3. The dequeue_task path

    static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
    {
    	if (!(flags & DEQUEUE_NOCLOCK)) // unless asked not to, update the rq clock
    		update_rq_clock(rq);  // the rq clock is studied in a later note
    
    	if (!(flags & DEQUEUE_SAVE)) {
    		sched_info_dequeued(rq, p);
    		psi_dequeue(p, flags & DEQUEUE_SLEEP);
    	}
    
    	uclamp_rq_dec(rq, p); // update the uclamp accounting for the departing task; uclamp is studied later
    	p->sched_class->dequeue_task(rq, p, flags);
    }
    
    /*
     * The dequeue_task method is called before nr_running is
     * decreased. We remove the task from the rbtree and
     * update the fair scheduling stats:
     */
    static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
    {
    	struct cfs_rq *cfs_rq;
    	struct sched_entity *se = &p->se;
    	int task_sleep = flags & DEQUEUE_SLEEP;
    	int idle_h_nr_running = task_has_idle_policy(p);
    
    	for_each_sched_entity(se) {
    		cfs_rq = cfs_rq_of(se);
        // the entity is dequeued first; the nr_running counters are only decremented afterwards
    		dequeue_entity(cfs_rq, se, flags);
    
    		/*
    		 * end evaluation on encountering a throttled cfs_rq
    		 *
    		 * note: in the case of encountering a throttled cfs_rq we will
    		 * post the final h_nr_running decrement below.
    		*/
    		if (cfs_rq_throttled(cfs_rq))
    			break;
    		cfs_rq->h_nr_running--;
    		cfs_rq->idle_h_nr_running -= idle_h_nr_running;
    
    		/* Don't dequeue parent if it has other entities besides us */
    		if (cfs_rq->load.weight) {
    			/* Avoid re-evaluating load for this entity: */
    			se = parent_entity(se);
    			/*
    			 * Bias pick_next to pick a task from this cfs_rq, as
    			 * p is sleeping when it is within its sched_slice.
    			 */
    			if (task_sleep && se && !throttled_hierarchy(cfs_rq))
    				set_next_buddy(se);
    			break;
    		}
    		flags |= DEQUEUE_SLEEP;
    	}
    
    	for_each_sched_entity(se) {
    		cfs_rq = cfs_rq_of(se);
    		cfs_rq->h_nr_running--;
    		cfs_rq->idle_h_nr_running -= idle_h_nr_running;
    
    		if (cfs_rq_throttled(cfs_rq))
    			break;
    
    		update_load_avg(cfs_rq, se, UPDATE_TG);
    		update_cfs_group(se);
    	}
    
    	if (!se)
    		sub_nr_running(rq, 1);
    
    	util_est_dequeue(&rq->cfs, p, task_sleep);
    	hrtick_update(rq);
    }


     

    static void
    dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
    {
    	/*
    	 * Update run-time statistics of the 'current'.
    	 */
    	update_curr(cfs_rq);
    
    	/*
    	 * When dequeuing a sched_entity, we must:
    	 *   - Update loads to have both entity and cfs_rq synced with now.
    	 *   - Subtract its load from the cfs_rq->runnable_avg.
    	 *   - Subtract its previous weight from cfs_rq->load.weight.
    	 *   - For group entity, update its weight to reflect the new share
    	 *     of its group cfs_rq.
    	 */
    	update_load_avg(cfs_rq, se, UPDATE_TG);
    	dequeue_runnable_load_avg(cfs_rq, se);
    
    	update_stats_dequeue(cfs_rq, se, flags);
    
    	clear_buddies(cfs_rq, se);
    
    	if (se != cfs_rq->curr)
    		__dequeue_entity(cfs_rq, se);
    	se->on_rq = 0;
    	account_entity_dequeue(cfs_rq, se);
    
    	/*
    	 * Normalize after update_curr(); which will also have moved
    	 * min_vruntime if @se is the one holding it back. But before doing
    	 * update_min_vruntime() again, which will discount @se's position and
    	 * can move min_vruntime forward still more.
    	 */
    	if (!(flags & DEQUEUE_SLEEP))
    		se->vruntime -= cfs_rq->min_vruntime;
    
    	/* return excess runtime on last dequeue */
    	return_cfs_rq_runtime(cfs_rq);
    
    	update_cfs_group(se);
    
    	/*
    	 * Now advance min_vruntime if @se was the entity holding it back,
    	 * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
    	 * put back on, and if we advance min_vruntime, we'll be placed back
    	 * further than we started -- ie. we'll be penalized.
    	 */
    	if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
    		update_min_vruntime(cfs_rq);
    }
    
    static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
    {
    	rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
    }

    dequeue_task() is responsible for removing a task from the rq.
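
    For completeness, the usual caller of this wrapper is deactivate_task() in kernel/sched/core.c, which in kernels of roughly this era looks approximately like the following (quoted from memory; details vary between versions):

    void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
    {
    	/* a sleeping task leaves the rq entirely; a migrating one stays "in transit" */
    	p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;

    	dequeue_task(rq, p, flags);
    }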

  4. Summary
    In summary, CFS manages its tasks through the red-black tree embedded in the cfs_rq, ordered by vruntime.
    On enqueue the entity's on_rq flag is set to 1; on dequeue it is cleared to 0.
    Both enqueue and dequeue also update the load tracking of the task and of the cfs_rq.
