Kernel Scheduler学习之七:CFS调度器之pick_next_task

  1. Overview
    根据Kernel Scheduler学习之七:CFS调度器之task管理知道cfs调度器采用红黑树的方式按照虚拟runtime(vruntime)进行排序。本次博客主要搞清如下的问题:
    1.如何获取下一个task
  2. 如何获取下一个task
    cfs调度器选出一个task出来执行的函数为pick_next_task_fair
    /*
     * pick_next_task_fair - choose the next CFS task to run on @rq.
     *
     * @rq:   runqueue to pick from; its CFS runqueue is &rq->cfs.
     * @prev: previously running task, may be NULL.
     * @rf:   rq flags handed to newidle_balance(); when NULL the idle path
     *        returns NULL right away without trying to balance.
     *
     * Returns the selected task, NULL when no task could be found (even after
     * the optional newidle_balance() attempt), or RETRY_TASK when
     * newidle_balance() returns a negative value.
     */
    static struct task_struct *
    pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
    {
    	struct cfs_rq *cfs_rq = &rq->cfs;
    	struct sched_entity *se;
    	struct task_struct *p;
    	int new_tasks;
    
    again:
    	if (!sched_fair_runnable(rq))  /* no runnable task in the cfs_rq: it is idle, so go try an idle balance */
    		goto idle;
    /* For simplicity, the FAIR_GROUP path below can be skipped on a first read. */
    #ifdef CONFIG_FAIR_GROUP_SCHED
    	if (!prev || prev->sched_class != &fair_sched_class)
    		goto simple;
    
    	/*
    	 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
    	 * likely that a next task is from the same cgroup as the current.
    	 *
    	 * Therefore attempt to avoid putting and setting the entire cgroup
    	 * hierarchy, only change the part that actually changes.
    	 */
    
    	do {
    		struct sched_entity *curr = cfs_rq->curr;
    
    		/*
    		 * Since we got here without doing put_prev_entity() we also
    		 * have to consider cfs_rq->curr. If it is still a runnable
    		 * entity, update_curr() will update its vruntime, otherwise
    		 * forget we've ever seen it.
    		 */
    		if (curr) {
    			if (curr->on_rq)
    				update_curr(cfs_rq);
    			else
    				curr = NULL;
    
    			/*
    			 * This call to check_cfs_rq_runtime() will do the
    			 * throttle and dequeue its entity in the parent(s).
    			 * Therefore the nr_running test will indeed
    			 * be correct.
    			 */
    			if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
    				cfs_rq = &rq->cfs;
    
    				if (!cfs_rq->nr_running)
    					goto idle;
    
    				goto simple;
    			}
    		}
    
    		se = pick_next_entity(cfs_rq, curr);
    		cfs_rq = group_cfs_rq(se);
    	} while (cfs_rq);
    
    	p = task_of(se);
    
    	/*
    	 * Since we haven't yet done put_prev_entity and if the selected task
    	 * is a different task than we started out with, try and touch the
    	 * least amount of cfs_rqs.
    	 */
    	if (prev != p) {
    		struct sched_entity *pse = &prev->se;
    
    		while (!(cfs_rq = is_same_group(se, pse))) {
    			int se_depth = se->depth;
    			int pse_depth = pse->depth;
    
    			if (se_depth <= pse_depth) {
    				put_prev_entity(cfs_rq_of(pse), pse);
    				pse = parent_entity(pse);
    			}
    			if (se_depth >= pse_depth) {
    				set_next_entity(cfs_rq_of(se), se);
    				se = parent_entity(se);
    			}
    		}
    
    		put_prev_entity(cfs_rq, pse);
    		set_next_entity(cfs_rq, se);
    	}
    
    	goto done;
    simple:
    #endif
    	if (prev)
    		put_prev_task(rq, prev);
    
    	do {
    		se = pick_next_entity(cfs_rq, NULL); /* select the next entity to run */
    		set_next_entity(cfs_rq, se);    /* record se as the entity currently running on this cfs_rq */
    		cfs_rq = group_cfs_rq(se);
    	} while (cfs_rq);
    
    	p = task_of(se);
    
    done: __maybe_unused;
    #ifdef CONFIG_SMP
    	/*
    	 * Move the next running task to the front of
    	 * the list, so our cfs_tasks list becomes MRU
    	 * one.
    	 */
    	list_move(&p->se.group_node, &rq->cfs_tasks);
    #endif
    
    	if (hrtick_enabled(rq))
    		hrtick_start_fair(rq, p);
    
    	update_misfit_status(p, rq); /* record whether p is a misfit on this rq, i.e. it needs more capacity than this CPU can provide */
    
    	return p;
    
    idle:
    	if (!rf)
    		return NULL;
    /* No task was picked, so the cfs_rq is treated as idle: run an idle balance to pull work from other CPUs. */
    	new_tasks = newidle_balance(rq, rf);
    
    	/*
    	 * Because newidle_balance() releases (and re-acquires) rq->lock, it is
    	 * possible for any higher priority task to appear. In that case we
    	 * must re-start the pick_next_entity() loop.
    	 */
    	if (new_tasks < 0) 
    		return RETRY_TASK;
    
    	if (new_tasks > 0) /* tasks were pulled over: restart the pick from the top */
    		goto again;
    
    	/*
    	 * rq is about to be idle, check if we need to update the
    	 * lost_idle_time of clock_pelt
    	 */
    	update_idle_rq_clock_pelt(rq);
    
    	return NULL;
    }
    流程图如下:


    所以,选择task的时候,首先判断cfs_rq中是否有task,如果没有的话,说明接下来会进入idle,这时候,尝试从其他cpu中拉一些task过来,为其他cpu做负载均衡,如果有拉到task则重新开始选取task。
    如果选取到task,记录cfs_rq中的current为所选择的task,然后,判断这个task所需要的能力与cpu可提供的能力是否匹配。
    涉及如下的问题:
    a.如何选择next task
    b.标记current做了什么事情
    c.如何判断task与cpu是否匹配

    a.选取next task
     
    /*
     * Pick the next process, keeping these things in mind, in this order:
     * 1) keep things fair between processes/task groups
     * 2) pick the "next" process, since someone really wants that to run
     * 3) pick the "last" process, for cache locality
     * 4) do not run the "skip" process, if something else is available
     *
     * @cfs_rq: CFS runqueue to pick from.
     * @curr:   entity currently running on @cfs_rq that must also be
     *          considered, or NULL.
     *
     * Returns the chosen entity. The checks below run in the order
     * skip -> last -> next and each one overwrites 'se', so a qualifying
     * "next" buddy wins over "last", which wins over the skip replacement.
     */
    static struct sched_entity *
    pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
    {
    	struct sched_entity *left = __pick_first_entity(cfs_rq); /* start from the leftmost entity in the tree */
    	struct sched_entity *se;
    
    	/*
    	 * If curr is set we have to see if its left of the leftmost entity
    	 * still in the tree, provided there was anything in the tree at all.
    	 */
    /* If the tree is empty, or curr still sorts before the leftmost entity, keep the running entity. */
    	if (!left || (curr && entity_before(curr, left))) 
    		left = curr;
    
    	se = left; /* ideally we run the leftmost entity */
    /* The above covers the normal case; what follows handles the skip/last/next buddies. */
    	/*
    	 * Avoid running the skip buddy, if running something else can
    	 * be done without getting too unfair.
    	 */
    	if (cfs_rq->skip == se) {
    		struct sched_entity *second;
    
    		if (se == curr) {
    			second = __pick_first_entity(cfs_rq);
    		} else {
    			second = __pick_next_entity(se);
    			if (!second || (curr && entity_before(curr, second)))
    				second = curr;
    		}
    
    		/* NOTE(review): "< 1" presumably means picking 'second' over 'left' is not too unfair -- confirm against wakeup_preempt_entity() */
    		if (second && wakeup_preempt_entity(second, left) < 1)
    			se = second;
    	}
    
    	/*
    	 * Prefer last buddy, try to return the CPU to a preempted task.
    	 */
    	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
    		se = cfs_rq->last;
    
    	/*
    	 * Someone really wants this to run. If it's not unfair, run it.
    	 */
    	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
    		se = cfs_rq->next;
    
    	clear_buddies(cfs_rq, se);
    
    	return se;
    }
    

    所以,pick_next_entity通常选取cfs_rq中最左边(vruntime最小)的task,只有在skip/last/next buddy生效时才会选择其他entity。但是这个地方有个疑点,__pick_first_entity并没有将选到的task从红黑树中删除。下次来选的时候,岂不是又选到这个task呢?接下来看下set_next_entity。
    b.设定当前正在running的task
     

    /*
     * set_next_entity - make @se the entity currently running on @cfs_rq.
     *
     * If @se is on the runqueue it is taken out of the rbtree (via
     * __dequeue_entity()) after its wait statistics are closed, and its load
     * average is updated. Then cfs_rq->curr is set to @se and
     * prev_sum_exec_runtime is snapshotted from sum_exec_runtime.
     */
    static void
    set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
    {
    	/* 'current' is not kept within the tree. */
    	if (se->on_rq) { /* article note: se was just picked from the cfs_rq, so this is expected to be true on that path */
    		/*
    		 * Any task has to be enqueued before it get to execute on
    		 * a CPU. So account for the time it spent waiting on the
    		 * runqueue.
    		 */
            /* close out the wait-time statistics */
    		update_stats_wait_end(cfs_rq, se);
        /* se is removed from the rbtree here, so the next pick will not select it again */
    		__dequeue_entity(cfs_rq, se);
        /* refresh the load tracking */
    		update_load_avg(cfs_rq, se, UPDATE_TG);
    	}
    
    	update_stats_curr_start(cfs_rq, se);
    	cfs_rq->curr = se;
    
    	/*
    	 * Track our maximum slice length, if the CPU's load is at
    	 * least twice that of our own weight (i.e. dont track it
    	 * when there are only lesser-weight tasks around):
    	 */
    	if (schedstat_enabled() &&
    	    rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
    		schedstat_set(se->statistics.slice_max,
    			max((u64)schedstat_val(se->statistics.slice_max),
    			    se->sum_exec_runtime - se->prev_sum_exec_runtime));
    	}
    
    	se->prev_sum_exec_runtime = se->sum_exec_runtime;
    }
    
    /*
     * Remove @se's run_node from @cfs_rq's tasks_timeline tree with
     * rb_erase_cached(). Nothing else in @se or @cfs_rq is touched here.
     */
    static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
    {
    	rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
    }

    所以,这个函数主要做了如下几件事情 :
    1.将选中的task从红黑树中拿掉,这样就避免下次再选到同一个task
    2.设定当前task
    3.更新负载信息

    c.如何判断task与cpu能力是否匹配
     

    /*
     * update_misfit_status - refresh rq->misfit_task_load for task @p on @rq.
     *
     * Does nothing unless the sched_asym_cpucapacity static branch is enabled.
     * Otherwise rq->misfit_task_load is set to task_h_load(p) when @p is
     * non-NULL and does not fit the capacity of @rq's CPU, and cleared to 0
     * in every other case (no task, or the task fits).
     */
    static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
    {
    	if (!static_branch_unlikely(&sched_asym_cpucapacity))
    		return;

    	/*
    	 * The capacity passed in is capacity_of() this rq's CPU.
    	 * NOTE(review): per the article this is the original capacity minus
    	 * what rt/dl tasks consume -- confirm against capacity_of().
    	 */
    	if (p && !task_fits_capacity(p, capacity_of(cpu_of(rq))))
    		rq->misfit_task_load = task_h_load(p);
    	else
    		rq->misfit_task_load = 0;
    }
    
    /*
     * task_fits_capacity - test whether task @p fits into @capacity.
     *
     * Returns the result of fits_capacity() applied to task_util_est(p) and
     * @capacity: non-zero when the task fits, 0 otherwise.
     */
    static inline int task_fits_capacity(struct task_struct *p, long capacity)
    {
    	return fits_capacity(task_util_est(p), capacity);
    }
    /*
     * Margin applied when comparing a utilization value with a CPU capacity
     * (default: ~20%).
     *
     * @cap fits into @max only while cap * 1280 < max * 1024, i.e. while @cap
     * stays strictly below 80% of @max. Reaching 80% or more counts as a
     * misfit.
     */
    #define fits_capacity(cap, max)	((max) * 1024 > (cap) * 1280)

    所以,判断是否匹配的条件为:task预估需要的能力(task_util_est)与cpu原始能力扣除rt、dl使用能力之后的剩余能力相比,如果达到或超过剩余能力的80%(即 util * 1280 >= capacity * 1024),则认为不匹配;低于80%才认为匹配。

  3. Summary
    pick next task从rq的cfs_rq管理的红黑树中选择最左边的task,最左边表明vruntime最小。而vruntime是一个虚拟运行时间。所以,CFS所谓的完全公平,则是指虚拟时间的公平。
    选取task的时候,更新了cfs_rq与所选 task的负载及其他统计信息
    而且还判断了所选task需要的能力与cpu可提供的能力是否匹配。

  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值