Kernel Scheduler Study Notes (7): Task Management in the CFS Scheduler

  1. Overview
    How does CFS manage its tasks? This note walks through the enqueue and dequeue paths to show where runnable tasks are kept and how they are inserted into and removed from the run queue.
  2. The enqueue_task path
    The entry point for CFS tasks is enqueue_task_fair().
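    It is not called directly; the core scheduler reaches it through the enqueue_task() wrapper in kernel/sched/core.c (the matching dequeue_task() wrapper is quoted in section 3). Below is a trimmed sketch of that wrapper as it looks in kernels of roughly this era (quoted from memory, so details may differ between versions):

    static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
    {
    	if (!(flags & ENQUEUE_NOCLOCK))
    		update_rq_clock(rq); /* keep rq->clock up to date unless asked not to */

    	if (!(flags & ENQUEUE_RESTORE)) {
    		sched_info_queued(rq, p);               /* schedstats bookkeeping  */
    		psi_enqueue(p, flags & ENQUEUE_WAKEUP); /* pressure-stall tracking */
    	}

    	uclamp_rq_inc(rq, p);                       /* utilization clamp accounting */
    	p->sched_class->enqueue_task(rq, p, flags); /* dispatch to the class hook   */
    }

    For a CFS task, p->sched_class->enqueue_task points at enqueue_task_fair():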
    /*
     * The enqueue_task method is called before nr_running is
     * increased. Here we update the fair scheduling stats and
     * then put the task into the rbtree:
     */
    static void
    enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
    {
    	struct cfs_rq *cfs_rq;
    	struct sched_entity *se = &p->se;
    	int idle_h_nr_running = task_has_idle_policy(p);
    
    	/*
    	 * The code below (indirectly) updates schedutil which looks at
    	 * the cfs_rq utilization to select a frequency.
    	 * Let's add the task's estimated utilization to the cfs_rq's
    	 * estimated utilization, before we update schedutil.
    	 */
    	util_est_enqueue(&rq->cfs, p); // update the rq's estimated utilization; studied separately later
    
    	/*
    	 * If in_iowait is set, the code below may not trigger any cpufreq
    	 * utilization updates, so do it here explicitly with the IOWAIT flag
    	 * passed.
    	 */
    	if (p->in_iowait)
    		cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); // set the IOWAIT boost; revisited later when studying cpufreq/schedutil
    
    	for_each_sched_entity(se) { // with CONFIG_FAIR_GROUP_SCHED this walks up from the task's se through its parent group entities; otherwise only the task's own se is visited
    		if (se->on_rq)
    			break;
    		cfs_rq = cfs_rq_of(se); // get the cfs_rq this sched_entity belongs to: tasks are managed inside a cfs_rq
    		enqueue_entity(cfs_rq, se, flags); // the core enqueue step
    
    		/*
    		 * end evaluation on encountering a throttled cfs_rq
    		 *
    		 * note: in the case of encountering a throttled cfs_rq we will
    		 * post the final h_nr_running increment below.
    		 */
    		if (cfs_rq_throttled(cfs_rq))
    			break;
    		cfs_rq->h_nr_running++;
    		cfs_rq->idle_h_nr_running += idle_h_nr_running;
    
    		flags = ENQUEUE_WAKEUP;
    	}
    
    	for_each_sched_entity(se) {
    		cfs_rq = cfs_rq_of(se);
    		cfs_rq->h_nr_running++;
    		cfs_rq->idle_h_nr_running += idle_h_nr_running;
    
    		if (cfs_rq_throttled(cfs_rq))
    			break;
    
    		update_load_avg(cfs_rq, se, UPDATE_TG);
    		update_cfs_group(se);
    	}
    
    	if (!se) {
    		add_nr_running(rq, 1);
    		/*
    		 * Since new tasks are assigned an initial util_avg equal to
    		 * half of the spare capacity of their CPU, tiny tasks have the
    		 * ability to cross the overutilized threshold, which will
    		 * result in the load balancer ruining all the task placement
    		 * done by EAS. As a way to mitigate that effect, do not account
    		 * for the first enqueue operation of new tasks during the
    		 * overutilized flag detection.
    		 *
    		 * A better way of solving this problem would be to wait for
    		 * the PELT signals of tasks to converge before taking them
    		 * into account, but that is not straightforward to implement,
    		 * and the following generally works well enough in practice.
    		 */
    		if (flags & ENQUEUE_WAKEUP)
    			update_overutilized_status(rq);
    
    	}
    
    	if (cfs_bandwidth_used()) {
    		/*
    		 * When bandwidth control is enabled; the cfs_rq_throttled()
    		 * breaks in the above iteration can result in incomplete
    		 * leaf list maintenance, resulting in triggering the assertion
    		 * below.
    		 */
    		for_each_sched_entity(se) {
    			cfs_rq = cfs_rq_of(se);
    
    			if (list_add_leaf_cfs_rq(cfs_rq))
    				break;
    		}
    	}
    
    	assert_list_leaf_cfs_rq(rq);
    
    	hrtick_update(rq);
    }
    
    With CONFIG_FAIR_GROUP_SCHED enabled there can be several cfs_rq instances (one per task group per CPU); the details are deferred, the focus here is task management. In that configuration cfs_rq_of() simply returns the cfs_rq the entity is attached to:
    /* runqueue on which this entity is (to be) queued */
    static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
    {
    	return se->cfs_rq;
    }
    Without CONFIG_FAIR_GROUP_SCHED, every se belongs to the single cfs_rq embedded in the CPU's rq:
    
    static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
    {
    	struct task_struct *p = task_of(se);
    	struct rq *rq = task_rq(p);
    
    	return &rq->cfs;
    }
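
    For context, the cfs_rq pointer returned by the group-scheduling variant above is a field of struct sched_entity itself. The following is a trimmed sketch of the relevant members (based on include/linux/sched.h of the same era; most fields omitted):

    struct sched_entity {
    	struct load_weight	load;		/* weight used for fair scheduling   */
    	struct rb_node		run_node;	/* node in cfs_rq->tasks_timeline    */
    	unsigned int		on_rq;		/* set while queued on a cfs_rq      */
    	u64			vruntime;	/* key that orders the rbtree        */
    #ifdef CONFIG_FAIR_GROUP_SCHED
    	struct sched_entity	*parent;	/* group entity one level up         */
    	struct cfs_rq		*cfs_rq;	/* cfs_rq this entity is queued on   */
    	struct cfs_rq		*my_q;		/* cfs_rq "owned" by a group entity  */
    #endif
    	/* ... runtime statistics, PELT load tracking, etc. omitted ... */
    };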
    

    enqueue_task_fair() takes the task's scheduling entity and enqueues it onto the corresponding cfs_rq; with CONFIG_FAIR_GROUP_SCHED disabled this means the rq manages the task through its single embedded cfs_rq.
    It also increments the rq's nr_running by one, so nr_running reflects the number of runnable (ready-to-run) tasks currently on this rq.
     

    static inline void add_nr_running(struct rq *rq, unsigned count)
    {
    	unsigned prev_nr = rq->nr_running; // value before the increment
    
    	rq->nr_running = prev_nr + count; // value after the increment
    
    #ifdef CONFIG_SMP
    	if (prev_nr < 2 && rq->nr_running >= 2) { // with two or more runnable tasks, mark the root domain overloaded so load balancing can act
    		if (!READ_ONCE(rq->rd->overload))
    			WRITE_ONCE(rq->rd->overload, 1);
    	}
    #endif
    
    	sched_update_tick_dependency(rq); // related to NO_HZ; to be studied later
    }
    

    enqueue_entity() is shown below:
     

    /*
     * MIGRATION
     *
     *	dequeue
     *	  update_curr()
     *	    update_min_vruntime()
     *	  vruntime -= min_vruntime
     *
     *	enqueue
     *	  update_curr()
     *	    update_min_vruntime()
     *	  vruntime += min_vruntime
     *
     * this way the vruntime transition between RQs is done when both
     * min_vruntime are up-to-date.
     *
     * WAKEUP (remote)
     *
     *	->migrate_task_rq_fair() (p->state == TASK_WAKING)
     *	  vruntime -= min_vruntime
     *
     *	enqueue
     *	  update_curr()
     *	    update_min_vruntime()
     *	  vruntime += min_vruntime
     *
     * this way we don't have the most up-to-date min_vruntime on the originating
     * CPU and an up-to-date min_vruntime on the destination CPU.
     */
    
    static void
    enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
    {
    	bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
    	bool curr = cfs_rq->curr == se; // cfs_rq->curr is the entity picked at the last selection, i.e. the one currently running on this cfs_rq
    
    	/*
    	 * If we're the current task, we must renormalise before calling
    	 * update_curr().
    	 */
    	if (renorm && curr) // question: why enqueue the currently running entity again? (this happens on save/restore style requeues, e.g. nice or group changes, where curr is dequeued and re-enqueued while still running)
    		se->vruntime += cfs_rq->min_vruntime; // renormalise the se's vruntime against this cfs_rq
        // update the runtime statistics (vruntime etc.) of the currently running entity
    	update_curr(cfs_rq);
    
    	/*
    	 * Otherwise, renormalise after, such that we're placed at the current
    	 * moment in time, instead of some random moment in the past. Being
    	 * placed in the past could significantly boost this task to the
    	 * fairness detriment of existing tasks.
    	 */
    	if (renorm && !curr)
    		se->vruntime += cfs_rq->min_vruntime;
        // as the comment below explains, the next steps update the load of both the entity and the cfs_rq
    	/*
    	 * When enqueuing a sched_entity, we must:
    	 *   - Update loads to have both entity and cfs_rq synced with now.
    	 *   - Add its load to cfs_rq->runnable_avg
    	 *   - For group_entity, update its weight to reflect the new share of
    	 *     its group cfs_rq
    	 *   - Add its new weight to cfs_rq->load.weight
    	 */
        // update the entity's load tracking
    	update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
    	update_cfs_group(se);
        // add the entity's runnable load into the cfs_rq
    	enqueue_runnable_load_avg(cfs_rq, se);
        // account the entity on the cfs_rq (nr_running, load.weight)
    	account_entity_enqueue(cfs_rq, se);
    
    	if (flags & ENQUEUE_WAKEUP)
    		place_entity(cfs_rq, se, 0);
    
    	check_schedstat_required();
    	update_stats_enqueue(cfs_rq, se, flags);
    	check_spread(cfs_rq, se);
    	if (!curr)
    		__enqueue_entity(cfs_rq, se);
        // mark the entity as being on the runqueue
    	se->on_rq = 1;
    
    	/*
    	 * When bandwidth control is enabled, cfs might have been removed
    	 * because of a parent been throttled but cfs->nr_running > 1. Try to
    	 * add it unconditionnally.
    	 */
    	if (cfs_rq->nr_running == 1 || cfs_bandwidth_used())
    		list_add_leaf_cfs_rq(cfs_rq);
    
    	if (cfs_rq->nr_running == 1)
    		check_enqueue_throttle(cfs_rq);
    }
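
    To make the MIGRATION comment above concrete, here is a tiny standalone illustration (the numbers are invented and this is not kernel code) of how a task's vruntime is renormalised when it moves from one cfs_rq to another:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
    	uint64_t vruntime   = 1000; /* task's vruntime on the source cfs_rq        */
    	uint64_t src_min_vr = 900;  /* source cfs_rq->min_vruntime at dequeue      */
    	uint64_t dst_min_vr = 5000; /* destination cfs_rq->min_vruntime at enqueue */

    	vruntime -= src_min_vr; /* dequeue: keep only the relative lag (100)       */
    	vruntime += dst_min_vr; /* enqueue: rebase onto the destination (5100)     */

    	printf("vruntime after migration: %llu\n", (unsigned long long)vruntime);
    	return 0;
    }

    The task keeps only its offset relative to the old queue's min_vruntime, so it neither gains nor loses an unfair amount of virtual runtime by moving to a CPU whose min_vruntime is very different.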
    

    The function above mostly does preparation before the actual insertion: it updates the load of the entity and of the cfs_rq, plus the timing-related statistics. Those topics will each get their own chapter. Once the preparation is done, the real enqueue is performed by __enqueue_entity():
     

    // as the comment says, entities are stored in a red-black tree
    /*
     * Enqueue an entity into the rb-tree:
     */
    static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
    {
        // the root of the cached rbtree lives in cfs_rq->tasks_timeline
    	struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;
    	struct rb_node *parent = NULL;
    	struct sched_entity *entry;
    	bool leftmost = true;
    
    	/*
    	 * Find the right place in the rbtree:
    	 */
    	while (*link) {
    		parent = *link;
    		entry = rb_entry(parent, struct sched_entity, run_node);
    		/*
    		 * We dont care about collisions. Nodes with
    		 * the same key stay together.
    		 */
    		if (entity_before(se, entry)) { // as entity_before() below shows, the tree is ordered by vruntime
    			link = &parent->rb_left;
    		} else {
    			link = &parent->rb_right;
    			leftmost = false;
    		}
    	}
    
    	rb_link_node(&se->run_node, parent, link);
    	rb_insert_color_cached(&se->run_node,
    			       &cfs_rq->tasks_timeline, leftmost);
    }
    
    static inline int entity_before(struct sched_entity *a,
    				struct sched_entity *b)
    {
    	return (s64)(a->vruntime - b->vruntime) < 0;
    }
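
    Note that the comparison is made on the signed difference rather than with a plain a < b, so the ordering stays correct even if vruntime (a u64) eventually wraps around, as long as two queued entities never differ by more than 2^63. A small standalone demonstration (illustration only, not kernel code):

    #include <stdint.h>
    #include <stdio.h>

    /* same idea as entity_before(): compare via the signed difference */
    static int before(uint64_t a, uint64_t b)
    {
    	return (int64_t)(a - b) < 0;
    }

    int main(void)
    {
    	uint64_t a = UINT64_MAX - 5; /* "smaller" vruntime, just below the wrap point */
    	uint64_t b = 10;             /* "larger" vruntime, just after wrapping        */

    	/* the plain compare gets the order wrong here, the signed difference does not */
    	printf("a < b: %d, before(a, b): %d\n", a < b, before(a, b));
    	return 0;
    }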
    

    To sum up, CFS manages tasks at the granularity of sched_entity, and the entities queued on a cfs_rq are kept in a red-black tree ordered by vruntime.

    The leftmost node of that tree holds the smallest vruntime, so picking the next task to run only requires taking the leftmost entity.
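
    For reference, that leftmost lookup is a cheap read of the cached rbtree; in kernel/sched/fair.c of roughly the same era it looks like this (quoted from memory, so the exact form may differ slightly):

    struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
    {
    	struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);

    	if (!left)
    		return NULL;

    	return rb_entry(left, struct sched_entity, run_node);
    }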

  3. The dequeue_task path

    static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
    {
    	if (!(flags & DEQUEUE_NOCLOCK)) // unless asked not to, update the rq clock
    		update_rq_clock(rq);  // the rq clock is studied in a later note
    
    	if (!(flags & DEQUEUE_SAVE)) {
    		sched_info_dequeued(rq, p);
    		psi_dequeue(p, flags & DEQUEUE_SLEEP);
    	}
    
    	uclamp_rq_dec(rq, p); // update the uclamp accounting for the departing task; uclamp is studied later
    	p->sched_class->dequeue_task(rq, p, flags);
    }
    
    /*
     * The dequeue_task method is called before nr_running is
     * decreased. We remove the task from the rbtree and
     * update the fair scheduling stats:
     */
    static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
    {
    	struct cfs_rq *cfs_rq;
    	struct sched_entity *se = &p->se;
    	int task_sleep = flags & DEQUEUE_SLEEP;
    	int idle_h_nr_running = task_has_idle_policy(p);
    
    	for_each_sched_entity(se) {
    		cfs_rq = cfs_rq_of(se);
        // the entity is dequeued first; the nr_running counters are only decremented afterwards
    		dequeue_entity(cfs_rq, se, flags);
    
    		/*
    		 * end evaluation on encountering a throttled cfs_rq
    		 *
    		 * note: in the case of encountering a throttled cfs_rq we will
    		 * post the final h_nr_running decrement below.
    		*/
    		if (cfs_rq_throttled(cfs_rq))
    			break;
    		cfs_rq->h_nr_running--;
    		cfs_rq->idle_h_nr_running -= idle_h_nr_running;
    
    		/* Don't dequeue parent if it has other entities besides us */
    		if (cfs_rq->load.weight) {
    			/* Avoid re-evaluating load for this entity: */
    			se = parent_entity(se);
    			/*
    			 * Bias pick_next to pick a task from this cfs_rq, as
    			 * p is sleeping when it is within its sched_slice.
    			 */
    			if (task_sleep && se && !throttled_hierarchy(cfs_rq))
    				set_next_buddy(se);
    			break;
    		}
    		flags |= DEQUEUE_SLEEP;
    	}
    
    	for_each_sched_entity(se) {
    		cfs_rq = cfs_rq_of(se);
    		cfs_rq->h_nr_running--;
    		cfs_rq->idle_h_nr_running -= idle_h_nr_running;
    
    		if (cfs_rq_throttled(cfs_rq))
    			break;
    
    		update_load_avg(cfs_rq, se, UPDATE_TG);
    		update_cfs_group(se);
    	}
    
    	if (!se)
    		sub_nr_running(rq, 1);
    
    	util_est_dequeue(&rq->cfs, p, task_sleep);
    	hrtick_update(rq);
    }


     

    static void
    dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
    {
    	/*
    	 * Update run-time statistics of the 'current'.
    	 */
    	update_curr(cfs_rq);
    
    	/*
    	 * When dequeuing a sched_entity, we must:
    	 *   - Update loads to have both entity and cfs_rq synced with now.
    	 *   - Subtract its load from the cfs_rq->runnable_avg.
    	 *   - Subtract its previous weight from cfs_rq->load.weight.
    	 *   - For group entity, update its weight to reflect the new share
    	 *     of its group cfs_rq.
    	 */
    	update_load_avg(cfs_rq, se, UPDATE_TG);
    	dequeue_runnable_load_avg(cfs_rq, se);
    
    	update_stats_dequeue(cfs_rq, se, flags);
    
    	clear_buddies(cfs_rq, se);
    
    	if (se != cfs_rq->curr)
    		__dequeue_entity(cfs_rq, se);
    	se->on_rq = 0;
    	account_entity_dequeue(cfs_rq, se);
    
    	/*
    	 * Normalize after update_curr(); which will also have moved
    	 * min_vruntime if @se is the one holding it back. But before doing
    	 * update_min_vruntime() again, which will discount @se's position and
    	 * can move min_vruntime forward still more.
    	 */
    	if (!(flags & DEQUEUE_SLEEP))
    		se->vruntime -= cfs_rq->min_vruntime;
    
    	/* return excess runtime on last dequeue */
    	return_cfs_rq_runtime(cfs_rq);
    
    	update_cfs_group(se);
    
    	/*
    	 * Now advance min_vruntime if @se was the entity holding it back,
    	 * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
    	 * put back on, and if we advance min_vruntime, we'll be placed back
    	 * further than we started -- ie. we'll be penalized.
    	 */
    	if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
    		update_min_vruntime(cfs_rq);
    }
    
    static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
    {
    	rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
    }

    dequeue_task() is responsible for removing a task from the rq.
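
    For completeness, the usual caller of this wrapper is deactivate_task() in kernel/sched/core.c, which in kernels of roughly this era looks approximately like the following (quoted from memory; details vary between versions):

    void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
    {
    	/* a sleeping task leaves the rq entirely; a migrating one stays "in transit" */
    	p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;

    	dequeue_task(rq, p, flags);
    }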

  4. Summary
    In summary, CFS manages its tasks through the red-black tree embedded in the cfs_rq, ordered by vruntime.
    On enqueue the entity's on_rq flag is set to 1; on dequeue it is cleared to 0.
    Both enqueue and dequeue also update the load tracking of the task and of the cfs_rq.
