深入Linux内核架构笔记 - 进程管理与调度5: 完全公平调度类

核心调度器必须知道的有关公平调度类的所有信息,都包含在fair_sched_class中

/*
 * fair_sched_class: everything the core scheduler needs to know about
 * the completely fair scheduling class, exposed as a set of callbacks.
 */
static const struct sched_class fair_sched_class = {
	.next			= &idle_sched_class,	/* next lower scheduling class */
	.enqueue_task		= enqueue_task_fair,
	.dequeue_task		= dequeue_task_fair,
	.yield_task		= yield_task_fair,

	.check_preempt_curr	= check_preempt_wakeup,	/* wakeup-preemption decision */

	.pick_next_task		= pick_next_task_fair,
	.put_prev_task		= put_prev_task_fair,

#ifdef CONFIG_SMP
	.load_balance		= load_balance_fair,
	.move_one_task		= move_one_task_fair,
#endif

	.set_curr_task          = set_curr_task_fair,
	.task_tick		= task_tick_fair,	/* periodic-scheduler hook */
	.task_new		= task_new_fair,	/* hook for newly forked tasks */
};

数据结构

/*
 * cfs_rq: per-runqueue data of the completely fair scheduling class.
 */
struct cfs_rq {
	struct load_weight load;	/* cumulative load weight of all queued entities */
	unsigned long nr_running;	/* number of runnable entities on this queue */

	u64 exec_clock;
	u64 min_vruntime;		/* monotonic minimum vruntime of the queue; may
					 * exceed the leftmost node's vruntime slightly */

	struct rb_root tasks_timeline;	/* red-black tree, ordered by entity_key() */
	struct rb_node *rb_leftmost;	/* cached leftmost node = next entity to run */
	struct rb_node *rb_load_balance_curr;
	/* 'curr' points to currently running entity on this cfs_rq.
	 * It is set to NULL otherwise (i.e when none are currently running).
	 */
	struct sched_entity *curr;

	unsigned long nr_spread_over;	/* NOTE(review): appears to be a debug/statistics
					 * counter — confirm against kernel source */
};
  • nr_running: 队列上可运行进程的数目
  • load: 所有进程的累积负荷值
  • min_vruntime: 队列上所有进程的最小虚拟运行时间,实际值可能比最左边的树节点的vruntime大些
  • tasks_timeline: 用于在按时间排序的红黑树中管理所有进程
  • rb_leftmost: 指向树最左边的节点,即最需要被调度的进程
  • curr: 指向当前执行进程的可调度实体

CFS操作

  • 虚拟时钟

    • 根据现存的实际时钟和每个进程相关的负荷权重推算,所有与虚拟时钟有关的计算都在update_curr中执行

      /* Update the runtime statistics (physical and virtual) of the entity
       * currently running on @cfs_rq; no-op when nothing is running. */
      static void update_curr(struct cfs_rq *cfs_rq)
      {
      	struct sched_entity *curr = cfs_rq->curr;
      	u64 now = rq_of(cfs_rq)->clock;	/* clock of the owning runqueue */
      	unsigned long delta_exec;
      
      	if (unlikely(!curr))
      		return;
      
      	/*
      	 * Get the amount of time the current task was running
      	 * since the last time we changed load (this cannot
      	 * overflow on 32 bits):
      	 */
      	delta_exec = (unsigned long)(now - curr->exec_start);
      
      	__update_curr(cfs_rq, curr, delta_exec);
      	curr->exec_start = now;	/* start of the next accounting period */
      
      	if (entity_is_task(curr)) {
      		struct task_struct *curtask = task_of(curr);
      
      		/* charge the elapsed time to the task's CPU accounting */
      		cpuacct_charge(curtask, delta_exec);
      	}
      }
      
    • 确定就绪队列的当前执行进程,并获取主调度器的实际时钟值,如果就绪队列上没有进程正在执行,则无事可做; 否则计算当前和上一次更新负荷统计量的时间差,并调用__update_curr

      /* Account @delta_exec of physical runtime to @curr, advance its
       * vruntime by the load-weighted equivalent, then raise the queue's
       * min_vruntime monotonically. */
      static inline void
      __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
            unsigned long delta_exec)
      {
      	unsigned long delta_exec_weighted;
      	u64 vruntime;
      
      	schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));
      
      	curr->sum_exec_runtime += delta_exec;	/* physical time: plain sum */
      	schedstat_add(cfs_rq, exec_clock, delta_exec);
      	delta_exec_weighted = delta_exec;	/* nice-0: virtual time == physical time */
      	if (unlikely(curr->load.weight != NICE_0_LOAD)) {
      		/* otherwise scale: delta_exec * NICE_0_LOAD / curr->load.weight */
      		delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
      							&curr->load);
      	}
      	curr->vruntime += delta_exec_weighted;
      
      	/*
      	 * maintain cfs_rq->min_vruntime to be a monotonic increasing
      	 * value tracking the leftmost vruntime in the tree.
      	 */
      	if (first_fair(cfs_rq)) {
      		/* tree non-empty: track min of curr and the leftmost node */
      		vruntime = min_vruntime(curr->vruntime,
      				__pick_next_entity(cfs_rq)->vruntime);
      	} else
      		vruntime = curr->vruntime;
      
      	/* max() guarantees min_vruntime never decreases */
      	cfs_rq->min_vruntime =
      		max_vruntime(cfs_rq->min_vruntime, vruntime);
      }
      
      • 更新当前进程在CPU执行花费的物理时间和虚拟时间,物理时间的更新只需要将时间差delta_exec加到先前统计的时间上就可以了

      • 虚拟时间的更新,对于nice级别为0的进程,其虚拟时间等于物理时间,对于其它的优先级,需要根据进程的负荷权重,重新计算时间,忽略舍入和溢出检查,计算公式:(delta_exec) * (NICE_0_LOAD / curr->load.weight)

      • 更新就绪队列的min_vruntime, first_fair用来检测树是否有最左边的节点,如果有,则使用该节点的vruntime和当前进程vruntime中的最小值, 如果没有,则使用当前进程的vruntime,取上述方法得到的vruntime和min_vruntime中的最大值作为最终的min_vruntime

        /* Red-black tree sort key: the entity's vruntime relative to the
         * queue's min_vruntime (signed — may be negative). */
        static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
        {
        	return se->vruntime - cfs_rq->min_vruntime;
        }
        
      • 红黑树排序是根据entity_key函数返回的结果来进行的,进程运行时,其vruntime稳定地增加,在红黑树中向右移动,而且越重要的进程vruntime增加速度越慢,向右移动的速度也慢; 对于睡眠的进程,其vruntime保持不变,但是由于min_vruntime的增加,其键值变小,导致其往红黑树的左边移动

  • 延迟跟踪

    • 良好的调度延迟: 保证每个可运行程序都至少运行一次的某个时间间隔,在sysctl_sched_latency给出,可以通过/proc/sys/kernel/sched_latency_ns控制,默认值为20ms

    • sched_nr_latency: 控制一个延迟周期中处理的最大活动进程数目,如果活动进程超出上限,则延迟周期也成比例地扩展,可以通过sysctl_sched_min_granularity间接地控制,后者可以通过/proc/sys/kernel/sched_min_granularity_ns设置,默认值是4ms

    • sysctl_sched_latency和sysctl_sched_min_granularity改变时,都会重新计算sched_nr_latency

    • __sched_period确定延迟周期的长度,通常就是sysctl_sched_latency,但是如果有更多进程运行,其值有可能按线性比例扩展:
      sysctl_sched_latency * (nr_running / sched_nr_latency)

    • 根据进程的相对权重,将一个延迟周期的时间在活动进程之间分配:sched_slice计算调度实体在一个延迟周期内分配到的实际时间,__sched_vslice计算延迟周期对应的虚拟时间。

      /* Length of one latency period: sysctl_sched_latency, scaled up
       * linearly when more than sched_nr_latency tasks are runnable. */
      static u64 __sched_period(unsigned long nr_running)
      {
      	u64 period = sysctl_sched_latency;
      	unsigned long nr_latency = sched_nr_latency;
      
      	if (unlikely(nr_running > nr_latency)) {
      		/* period = sysctl_sched_latency * nr_running / nr_latency */
      		period *= nr_running;
      		do_div(period, nr_latency);
      	}
      
      	return period;
      }
      /* Real (wall-clock) share of one latency period for @se, proportional
       * to its weight: period * se->load.weight / cfs_rq->load.weight. */
      static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
      {
      	u64 slice = __sched_period(cfs_rq->nr_running);
      
      	slice *= se->load.weight;
      	do_div(slice, cfs_rq->load.weight);
      
      	return slice;
      }
      /*
       * We calculate the vruntime slice.
       *
       * vs = s/w = p/rw
       */
      static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running)
      {
      	u64 vslice = __sched_period(nr_running);
      
      	vslice *= NICE_0_LOAD;	/* virtual-time slice: period * NICE_0_LOAD / rq_weight */
      	do_div(vslice, rq_weight);
      
      	return vslice;
      }
      

队列操作

  • 向就绪队列放置新进程

    /* Enqueue task @p into the fair class: walk up @p's scheduling-entity
     * chain and enqueue every entity not already on a runqueue. */
    static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
    {
    	struct cfs_rq *cfs_rq;
    	struct sched_entity *se = &p->se;
    
    	for_each_sched_entity(se) {
    		if (se->on_rq)
    			break;	/* this entity (and its parents) are already queued */
    		cfs_rq = cfs_rq_of(se);
    		enqueue_entity(cfs_rq, se, wakeup);
    		wakeup = 1;	/* parents are treated as wakeups from here on */
    	}
    }
    /* Insert one entity into its cfs_rq; @wakeup selects the extra
     * vruntime adjustment done for tasks that were sleeping. */
    static void
    enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
    {
    	/*
    	 * Update run-time statistics of the 'current'.
    	 */
    	update_curr(cfs_rq);
    
    	if (wakeup) {
    		/* entity slept: recompute its vruntime (initial == 0) */
    		place_entity(cfs_rq, se, 0);
    		enqueue_sleeper(cfs_rq, se);
    	}
    
    	update_stats_enqueue(cfs_rq, se);
    	check_spread(cfs_rq, se);
    	if (se != cfs_rq->curr)
    		__enqueue_entity(cfs_rq, se);	/* 'current' is kept out of the tree */
    	account_entity_enqueue(cfs_rq, se);
    }
    /* Choose the vruntime of an entity entering the queue; @initial is set
     * only for brand-new tasks (fork path), 0 for wakeups. */
    static void
    place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
    {
    	u64 vruntime;
    
    	vruntime = cfs_rq->min_vruntime;	/* queue minimum as the base value */
    
    	if (sched_feat(TREE_AVG)) {
    		struct sched_entity *last = __pick_last_entity(cfs_rq);
    		if (last) {
    			/* average of min_vruntime and the rightmost entity */
    			vruntime += last->vruntime;
    			vruntime >>= 1;
    		}
    	} else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running)
    		vruntime += sched_vslice(cfs_rq)/2;
    
    	/*
    	 * The 'current' period is already promised to the current tasks,
    	 * however the extra weight of the new task will slow them down a
    	 * little, place the new task so that it fits in the slot that
    	 * stays open at the end.
    	 */
    	if (initial && sched_feat(START_DEBIT))
    		vruntime += sched_vslice_add(cfs_rq, se);
    
    	if (!initial) {
    		/* sleeps upto a single latency don't count. */
    		if (sched_feat(NEW_FAIR_SLEEPERS) && entity_is_task(se))
    			vruntime -= sysctl_sched_latency;
    
    		/* ensure we never gain time by being placed backwards. */
    		vruntime = max_vruntime(se->vruntime, vruntime);
    	}
    
    	se->vruntime = vruntime;
    }
    
    • 如果进程此前睡眠,则调用place_entity调整进程的虚拟运行时间
    • place_entity根据initial的值来区分两种情况,只有在新进程被加到系统时,才会设置该参数,此处讨论initial未设置的情况,由于内核要确保在当前的延迟周期内使活动进程都至少运行一次,因此,队列的min_vruntime用作基准时间,通过减去sysctl_sched_latency,确保新唤醒的进程只有在当前延迟周期结束后才能运行,但如果睡眠进程已经累积了比较大的不公平,内核选择se->vruntime作为最终的vruntime

选择下一个进程

  • 选择下一个要运行的进程由pick_next_task_fair执行

    /* Fair-class entry point for picking the next task: descend the
     * (possibly hierarchical) cfs_rq chain, taking the leftmost entity at
     * each level, until a real task is reached. */
    static struct task_struct *pick_next_task_fair(struct rq *rq)
    {
    	struct cfs_rq *cfs_rq = &rq->cfs;
    	struct sched_entity *se;
    
    	if (unlikely(!cfs_rq->nr_running))
    		return NULL;	/* nothing runnable in this class */
    
    	do {
    		se = pick_next_entity(cfs_rq);
    		cfs_rq = group_cfs_rq(se);	/* NULL once se is a task, not a group */
    	} while (cfs_rq);
    
    	return task_of(se);
    }
    /* Pick the leftmost entity (smallest vruntime) of the tree, if any,
     * and make it the queue's current entity. */
    static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
    {
    	struct sched_entity *se = NULL;
    
    	if (first_fair(cfs_rq)) {
    		se = __pick_next_entity(cfs_rq);
    		set_next_entity(cfs_rq, se);
    	}
    
    	return se;
    }
    /* Mark @se as the entity now running on @cfs_rq. */
    static void
    set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
    {
    	/* 'current' is not kept within the tree. */
    	if (se->on_rq) {
    		/*
    		 * Any task has to be enqueued before it get to execute on
    		 * a CPU. So account for the time it spent waiting on the
    		 * runqueue.
    		 */
    		update_stats_wait_end(cfs_rq, se);
    		__dequeue_entity(cfs_rq, se);
    	}
    	cfs_rq->curr = se;
    	/* remember runtime at slice start; check_preempt_tick uses the delta */
    	se->prev_sum_exec_runtime = se->sum_exec_runtime;
    }
    

处理周期性调度器

  • 由函数task_tick_fair负责,但实际工作是委托entity_tick完成的
/* Periodic-scheduler hook of the fair class: delegate per-queue work to
 * entity_tick() for every entity in @curr's hierarchy. */
static void task_tick_fair(struct rq *rq, struct task_struct *curr)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &curr->se;

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		entity_tick(cfs_rq, se);
	}
}
/* Per-tick work for one cfs_rq: refresh runtime statistics and, when more
 * than one task is runnable (or wakeup preemption is off), check whether
 * the current entity exhausted its slice. */
static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);

	if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))
		check_preempt_tick(cfs_rq, curr);
}
/* Request a reschedule once @curr has run longer than the share granted
 * to it by sched_slice() for this latency period. */
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
	unsigned long ideal_runtime, delta_exec;

	ideal_runtime = sched_slice(cfs_rq, curr);
	/* physical time consumed since this entity was made 'current' */
	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
	if (delta_exec > ideal_runtime)
		resched_task(rq_of(cfs_rq)->curr);	/* sets TIF_NEED_RESCHED */
}
  • 使用update_curr更新统计量
  • 如果可运行的进程大于1个,调用check_preempt_tick来决策进程是否应该被抢占,如果进程实际运行的时间间隔delta_exec大于由延迟周期确定的份额,则设置task_struct中的TIF_NEED_RESCHED标志,核心调度器会在适当的时机发起重新调度

唤醒抢占

  • 当使用try_to_wake_up和wake_up_new_task唤醒进程时,内核使用check_preempt_curr看看新进程是否可以抢占当前进程

    /* Wakeup-preemption hook (check_preempt_curr of the fair class):
     * decide whether the woken task @p should preempt rq->curr. */
    static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
    {
    	struct task_struct *curr = rq->curr;
    	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
    	struct sched_entity *se = &curr->se, *pse = &p->se;
    	unsigned long gran;
    
    	/* a waking real-time task always triggers an immediate resched */
    	if (unlikely(rt_prio(p->prio))) {
    		update_rq_clock(rq);
    		update_curr(cfs_rq);
    		resched_task(curr);
    		return;
    	}
    	/*
    	 * Batch tasks do not preempt (their preemption is driven by
    	 * the tick):
    	 */
    	if (unlikely(p->policy == SCHED_BATCH))
    		return;
    
    	if (!sched_feat(WAKEUP_PREEMPT))
    		return;
    
    	/* walk up the hierarchy until both entities share the same group */
    	while (!is_same_group(se, pse)) {
    		se = parent_entity(se);
    		pse = parent_entity(pse);
    	}
    
    	/* wakeup granularity, weight-scaled for non-nice-0 current tasks */
    	gran = sysctl_sched_wakeup_granularity;
    	if (unlikely(se->load.weight != NICE_0_LOAD))
    		gran = calc_delta_fair(gran, &se->load);
    
    	/*
    	 * Preempt only if the woken entity's vruntime trails the current
    	 * entity's by more than the granularity, guaranteeing the running
    	 * task a minimum stint before it can be kicked off the CPU.
    	 */
    	if (pse->vruntime + gran < se->vruntime)
    		resched_task(curr);
    }
    
    • 如果唤醒进程是实时进程,则立即请求重新调度
    • 如果唤醒进程是SCHED_BATCH进程,则放弃抢占
    • 当运行进程被新进程抢占时,内核确保被抢占者已经至少运行了某一最小时间限额,该限额保存在sysctl_sched_wakeup_granularity中,如果新进程的虚拟运行时间加上最小时间限额,仍然小于当前运行进程的虚拟运行时间,则请求重新调度

处理新进程

  • 创建新进程的行为由参数sysctl_sched_child_runs_first控制,该参数用于判断新建子进程是否应该在父进程之前运行

    /* Fork hook of the fair class: give the new task @p an initial
     * vruntime, optionally arrange for the child to run before its
     * parent, then enqueue it and request a reschedule. */
    static void task_new_fair(struct rq *rq, struct task_struct *p)
    {
    	struct cfs_rq *cfs_rq = task_cfs_rq(p);
    	struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
    	int this_cpu = smp_processor_id();
    
    	sched_info_queued(p);
    
    	update_curr(cfs_rq);
    	place_entity(cfs_rq, se, 1);	/* initial == 1: fork-time placement */
    
    	/* 'curr' will be NULL if the child belongs to a different group */
    	if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
    			curr && curr->vruntime < se->vruntime) {
    		/*
    		 * Upon rescheduling, sched_class::put_prev_task() will place
    		 * 'current' within the tree based on its new key value.
    		 */
    		swap(curr->vruntime, se->vruntime);	/* let the child run first */
    	}
    
    	enqueue_task_fair(rq, p, 0);	/* wakeup == 0: not a sleeper wakeup */
    	resched_task(rq->curr);
    }
    
    • 使用update_curr更新统计量
    • 调用place_entity时将initial设置为1, 以便使用sched_vslice_add计算初始的vruntime
    • 如果设置了sysctl_sched_child_runs_first,并且父进程的虚拟运行时间小于子进程的虚拟运行时间,需要交换二者的虚拟运行时间。
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值