Kernel Scheduler学习之七:CFS调度器之虚拟时间

  1. Overview
    cfs调度器被称作是完全公平调度器,那么如何体现公平的呢?是否有绝对的公平呢?本博客主要回答如下的问题:
    vruntime的原理是什么?
    vruntime如何更新的?
  2. Vtime的原理 
    Linux抽象出来一个调度实体,在这个实体当中保存了一个叫vruntime的变量,如下所示:
    /*
     * Scheduling entity as quoted in this article. For virtual-time accounting
     * the relevant fields are exec_start, sum_exec_runtime and vruntime, all of
     * which update_curr() writes.
     */
    struct sched_entity {
    	/* For load-balancing: */
    	struct load_weight		load;
    	unsigned long			runnable_weight;
    	struct rb_node			run_node;
    	struct list_head		group_node;
    	unsigned int			on_rq;
    
    	/* update_curr() resets this to the current task clock after each accounting step. */
    	u64				exec_start;
    	/* Running total of the delta_exec values added by update_curr(). */
    	u64				sum_exec_runtime;
    	/* Running total of calc_delta_fair(delta_exec, se) added by update_curr(). */
    	u64				vruntime;
    	/*
    	 * check_preempt_tick() subtracts this from sum_exec_runtime; where it is
    	 * written is not shown in this excerpt.
    	 */
    	u64				prev_sum_exec_runtime;
    
    	u64				nr_migrations;
    
    	struct sched_statistics		statistics;
    
    #ifdef CONFIG_FAIR_GROUP_SCHED
    	int				depth;
    	struct sched_entity		*parent;
    	/* rq on which this entity is (to be) queued: */
    	struct cfs_rq			*cfs_rq;
    	/* rq "owned" by this entity/group: */
    	struct cfs_rq			*my_q;
    #endif
    
    #ifdef CONFIG_SMP
    	/*
    	 * Per entity load average tracking.
    	 *
    	 * Put into separate cache line so it does not
    	 * collide with read-mostly values above.
    	 */
    	struct sched_avg		avg;
    #endif
    };

    vruntime更新的时间点与位置:
    Kernel会在涉及到task时间变化 或者变更cpu的时候,更新vruntime,如下所示:
     

    update_curr的函数如下所示:

    /*
     * Update the current task's runtime statistics.
     *
     * Measures the time elapsed since curr->exec_start, adds it to
     * curr->sum_exec_runtime, and adds the weight-scaled value (see
     * calc_delta_fair()) to curr->vruntime. Returns early when the cfs_rq has
     * no current entity or when the elapsed time is not positive.
     */
    static void update_curr(struct cfs_rq *cfs_rq)
    {
    	struct sched_entity *curr = cfs_rq->curr;
    	u64 now = rq_clock_task(rq_of(cfs_rq));// this reads the task clock; the rq-related clocks are to be studied later
    	u64 delta_exec;
    
    	if (unlikely(!curr))
    		return;
    
    	delta_exec = now - curr->exec_start;
    	/* The signed cast treats a wrapped (negative) difference as "no time elapsed". */
    	if (unlikely((s64)delta_exec <= 0))
    		return;
    
    	curr->exec_start = now;// restart the measurement interval
    
    	schedstat_set(curr->statistics.exec_max,
    		      max(delta_exec, curr->statistics.exec_max));
    
    	curr->sum_exec_runtime += delta_exec;// accumulate the total run time
    	schedstat_add(cfs_rq->exec_clock, delta_exec);
    
    	curr->vruntime += calc_delta_fair(delta_exec, curr);// accumulate vruntime
    	update_min_vruntime(cfs_rq);// update the cfs_rq's minimum vruntime
    
    	/* The per-task accounting below only runs when the entity is a task. */
    	if (entity_is_task(curr)) {
    		struct task_struct *curtask = task_of(curr);
    
    		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
    		cgroup_account_cputime(curtask, delta_exec);
    		account_group_exec_runtime(curtask, delta_exec);
    	}
    
    	account_cfs_rq_runtime(cfs_rq, delta_exec);
    }
    

    vruntime的计算:

    /*
     * delta /= w
     *
     * Returns delta unchanged when the entity's weight equals NICE_0_LOAD;
     * otherwise returns __calc_delta(delta, NICE_0_LOAD, &se->load).
     */
    static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
    {
    	if (unlikely(se->load.weight != NICE_0_LOAD)) // the weight is not the nice-0 weight, so rescale delta by the entity's weight
    		delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
    // otherwise delta is simply the real running-time difference
    	return delta;
    }
    
    

    所以,根据上面的公式,vruntime的计算和load.weight有很大关系,那么load.weight表示什么呢?
    如之前sched_entity中load的类型为:    struct load_weight        load;

    /*
     * Load weight of an entity. In set_load_weight()'s direct-assignment path,
     * weight is scale_load(sched_prio_to_weight[prio]) and inv_weight is the
     * matching sched_prio_to_wmult[prio] entry.
     */
    struct load_weight {
    	unsigned long			weight;
    	/* Precomputed inverse of the weight, read by __calc_delta() as a multiplier. */
    	u32				inv_weight;
    };
    

    查询代码,主要是通过如下的函数设置load.weight:

    /*
     * Set p->se.load (and p->se.runnable_weight) from the task's static priority.
     *
     * Tasks with the idle policy get the fixed WEIGHT_IDLEPRIO / WMULT_IDLEPRIO
     * pair. Otherwise, when update_load is set and the task is in the fair
     * class, the work is delegated to reweight_task(); in all remaining cases
     * the weight and its inverse are read straight from the lookup tables.
     */
    static void set_load_weight(struct task_struct *p, bool update_load)
    {
    	int prio = p->static_prio - MAX_RT_PRIO;// RT priorities go up to 99 and normal priorities start at 100; MAX_RT_PRIO is 100 here.
    	struct load_weight *load = &p->se.load;
    
    	/*
    	 * SCHED_IDLE tasks get minimal weight:
    	 */
    	if (task_has_idle_policy(p)) {
    		load->weight = scale_load(WEIGHT_IDLEPRIO);
    		load->inv_weight = WMULT_IDLEPRIO;
    		p->se.runnable_weight = load->weight;
    		return;
    	}
    
    	/*
    	 * SCHED_OTHER tasks have to update their load when changing their
    	 * weight
    	 */
    	if (update_load && p->sched_class == &fair_sched_class) {
    		reweight_task(p, prio);
    	} else {
    		/* prio doubles as the index into both 40-entry tables. */
    		load->weight = scale_load(sched_prio_to_weight[prio]); 
    		load->inv_weight = sched_prio_to_wmult[prio];
    		p->se.runnable_weight = load->weight;
    	}
    }
    
    /*
     * Fixed-point scaling of load weights.
     *
     * On 64-bit kernels the weight is shifted left by SCHED_FIXEDPOINT_SHIFT
     * when it is stored (scale_load) and shifted back when it is consumed
     * (scale_load_down, used by __calc_delta()). On 32-bit kernels both macros
     * are the identity, so no shifting takes place.
     */
    #ifdef CONFIG_64BIT
    # define NICE_0_LOAD_SHIFT	(SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)
    # define scale_load(w)		((w) << SCHED_FIXEDPOINT_SHIFT)
    # define scale_load_down(w)	((w) >> SCHED_FIXEDPOINT_SHIFT)
    #else
    # define NICE_0_LOAD_SHIFT	(SCHED_FIXEDPOINT_SHIFT)
    # define scale_load(w)		(w)
    # define scale_load_down(w)	(w)
    #endif

    所以,weight的值其实是根据prio在sched_prio_to_weight查表获得一个值并左移(32位系统不用左移)。这个表的内容如下所示:

    /*
     * Nice levels are multiplicative, with a gentle 10% change for every
     * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
     * nice 1, it will get ~10% less CPU time than another CPU-bound task
     * that remained on nice 0.
     *
     * The "10% effect" is relative and cumulative: from _any_ nice level,
     * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
     * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
     * If a task goes up by ~10% and another task goes down by ~10% then
     * the relative distance between them is ~25%.)
     *
     * set_load_weight() indexes this table with (static_prio - MAX_RT_PRIO).
     * Each row holds five consecutive entries; the label at the start of a
     * row is the nice value of that row's first entry.
     */
    const int sched_prio_to_weight[40] = {
     /* -20 */     88761,     71755,     56483,     46273,     36291,
     /* -15 */     29154,     23254,     18705,     14949,     11916,
     /* -10 */      9548,      7620,      6100,      4904,      3906,
     /*  -5 */      3121,      2501,      1991,      1586,      1277,
     /*   0 */      1024,       820,       655,       526,       423,
     /*   5 */       335,       272,       215,       172,       137,
     /*  10 */       110,        87,        70,        56,        45,
     /*  15 */        36,        29,        23,        18,        15,
    };

    根据英文注释:相邻两个nice级别的权重相差约1.25倍,即表中每个值约为前一个值的80%,并以nice为0(即prio的值为120)时的值1024为基准。
    nice的取值区间为[-20,19],共40个级别,sched_prio_to_weight中的40个元素依次对应每一个nice的weight值。这个数组的值近似满足等比数列。
    可以用下面的公式表示:

    $$ weight(nice) \approx \frac{1024}{1.25^{\,nice}} $$


    其中:        load->inv_weight = sched_prio_to_wmult[prio];中的sched_prio_to_wmult如下所示:

    /*
     * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated.
     *
     * In cases where the weight does not change often, we can use the
     * precalculated inverse to speed up arithmetics by turning divisions
     * into multiplications:
     *
     * Example: the nice-0 weight is 1024, and 2^32 / 1024 == 4194304, which is
     * the first entry of the row labelled 0 below.
     */
    const u32 sched_prio_to_wmult[40] = {
     /* -20 */     48388,     59856,     76040,     92818,    118348,
     /* -15 */    147320,    184698,    229616,    287308,    360437,
     /* -10 */    449829,    563644,    704093,    875809,   1099582,
     /*  -5 */   1376151,   1717300,   2157191,   2708050,   3363326,
     /*   0 */   4194304,   5237765,   6557202,   8165337,  10153587,
     /*   5 */  12820798,  15790321,  19976592,  24970740,  31350126,
     /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
     /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
    };
    

    根据注释,每个元素的值公式如下:

    $$ inv\_weight = \frac{2^{32}}{weight} $$

    接下来继续计算delta_time:

    /*
     * delta_exec * weight / lw.weight
     *   OR
     * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
     *
     * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case
     * we're guaranteed shift stays positive because inv_weight is guaranteed to
     * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
     *
     * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
     * weight/lw.weight <= 1, and therefore our shift will also be positive.
     */
    static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
    {
    	u64 fact = scale_load_down(weight); // on 64-bit systems the weight was shifted left by 10 bits when it was set; shift it back here. According to the kernel comments, the extra bits exist to increase resolution.
    	int shift = WMULT_SHIFT;
    
    	/* NOTE(review): presumably refreshes lw->inv_weight; the helper is not shown in this excerpt. */
    	__update_inv_weight(lw);
    
    	/* Halve fact until it fits in 32 bits, reducing the final shift by one per halving. */
    	if (unlikely(fact >> 32)) {
    		while (fact >> 32) {
    			fact >>= 1;
    			shift--;
    		}
    	}
    
    	/* hint to use a 32x32->64 mul */
    	fact = (u64)(u32)fact * lw->inv_weight;
    
    	/* The product may exceed 32 bits again; renormalise the same way. */
    	while (fact >> 32) {
    		fact >>= 1;
    		shift--;
    	}
    
    	return mul_u64_u32_shr(delta_exec, fact, shift);
    }

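    上面的函数对应的公式为(其中inv_weight ≈ 2^32 / lw.weight;在calc_delta_fair中weight取NICE_0_LOAD):

    $$ delta = delta\_exec \times \frac{weight}{lw.weight} \approx \left(delta\_exec \times weight \times lw.inv\_weight\right) \gg WMULT\_SHIFT $$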

    所以,当优先级为120(nice为0)的时候,delta_exec不用打折。而对于其他优先级都会打折或者放大。
    如果优先级越高,那么其nice值越小,对应的load.weight的值越大,则计算出的虚拟时间比实际执行时间要少。
    如果优先级越低,那么其nice值越大,对应的load.weight的值越小,则计算出的虚拟时间比实际执行时间要多。
    如果按照vtime进行公平调度的话,那么优先级越高,则实际分配的时间越多。
    如下图所示:
    config

时间片计算

/*
 * Preempt the current task with a newly woken task if needed:
 *
 * Requests a reschedule when curr has run longer than its slice
 * (sched_slice()), or when its vruntime is ahead of the first entity
 * returned by __pick_first_entity() by more than that slice.
 */
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
	unsigned long ideal_runtime, delta_exec;
	struct sched_entity *se;
	s64 delta;

	ideal_runtime = sched_slice(cfs_rq, curr);
	/* Run time accumulated since prev_sum_exec_runtime was last recorded. */
	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
	if (delta_exec > ideal_runtime) {
		resched_curr(rq_of(cfs_rq));
		/*
		 * The current task ran long enough, ensure it doesn't get
		 * re-elected due to buddy favours.
		 */
		clear_buddies(cfs_rq, curr);
		return;
	}

	/*
	 * Ensure that a task that missed wakeup preemption by a
	 * narrow margin doesn't have to wait for a full slice.
	 * This also mitigates buddy induced latencies under load.
	 */
	if (delta_exec < sysctl_sched_min_granularity)
		return;

	se = __pick_first_entity(cfs_rq);
	/* Signed difference: positive when curr's vruntime is ahead of se's. */
	delta = curr->vruntime - se->vruntime;

	if (delta < 0)
		return;

	/* The vruntime lead is compared against the slice length computed above. */
	if (delta > ideal_runtime)
		resched_curr(rq_of(cfs_rq));
}

 

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值