Linux CFS scheduler

CFS (Completely Fair Scheduler) handles tasks whose scheduling policy is SCHED_NORMAL or SCHED_BATCH.
Unlike the real-time policies, these tasks run at a lower priority than real-time tasks; their importance is expressed by the nice value. Nice values range from -20 to 19,
which map to static priorities 100 to 139.
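
For reference, the kernel's mapping between nice values and static priorities looks roughly like this (a simplified sketch; the exact macro names and locations vary across kernel versions):

/* simplified sketch of the nice <-> static priority mapping */
#define MAX_RT_PRIO        100                          /* priorities 0..99 belong to real-time tasks */
#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)  /* nice -20..19 -> 100..139 */
#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)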


Applications come in different types (CPU-bound, interactive, ...) and usually run with different nice values. So how does CFS guarantee "complete fairness"?
CFS folds every factor that influences scheduling into the handling of vruntime, which yields approximate fairness.


My understanding of what "completely fair" means in CFS:
①Task types are no longer distinguished; all tasks are treated fairly.
②I/O-bound tasks still get a fast response (sleeping tasks receive a time compensation).
③Higher-priority tasks get more CPU time (their vruntime grows more slowly).


So CFS's "complete fairness" does not mean that all tasks are absolutely equal and receive exactly the same CPU time. The fairness lives in the vruntime values: every task is measured in virtual time, and the task with the smallest vruntime is always the one chosen to run next. That looks completely fair, but in practice vruntime is updated and grows at different rates for different tasks. With this simple vruntime mechanism, CFS satisfies scheduling requirements that used to need considerably more complex algorithms.
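
The core of the mechanism is that vruntime grows by the real runtime scaled by NICE_0_LOAD / weight. A minimal user-space sketch of that idea, assuming a few weights from the kernel's prio_to_weight table (nice 0 = 1024, nice 5 = 335, nice -5 = 3121):

#include <stdio.h>

/* vruntime increment = real runtime * NICE_0_LOAD / task weight */
static unsigned long long vruntime_delta(unsigned long long delta_exec_ns,
					 unsigned long weight)
{
	return delta_exec_ns * 1024 / weight;
}

int main(void)
{
	unsigned long long ran = 10 * 1000 * 1000;	/* each task ran 10 ms */

	printf("nice  0: vruntime += %llu ns\n", vruntime_delta(ran, 1024));	/* 10 ms  */
	printf("nice  5: vruntime += %llu ns\n", vruntime_delta(ran, 335));	/* ~30 ms */
	printf("nice -5: vruntime += %llu ns\n", vruntime_delta(ran, 3121));	/* ~3 ms  */
	return 0;
}

For the same 10 ms of real CPU time, the nice 5 task's vruntime advances roughly three times as fast as the nice 0 task's, so it accumulates virtual time faster and gets picked less often.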


vruntime is updated on every timer tick, when a task is created, when a task is enqueued on a runqueue, and when it is dequeued.
Let's look at the code that updates vruntime:


tick_handle_periodic is called on every tick interrupt. Inside tick_periodic, only one designated CPU updates the global jiffies counter;

every other CPU only updates the task accounting on its own runqueue.


//Only one CPU is responsible for updating jiffies; the other CPUs only update their own current task
static void tick_periodic(int cpu)
{
	if (tick_do_timer_cpu == cpu) {//this CPU owns the jiffies update
		write_seqlock(&jiffies_lock);

		/* Keep track of the next tick event */
		tick_next_period = ktime_add(tick_next_period, tick_period);

		do_timer(1);
		write_sequnlock(&jiffies_lock);
	}
	//update per-task time accounting
	update_process_times(user_mode(get_irq_regs()));
	profile_tick(CPU_PROFILING);
}

update_process_times in turn calls scheduler_tick:

/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
 */
 //Update some per-CPU and current-task data, then call the task_tick() hook of the current task's scheduling class
void scheduler_tick(void)
{
	int cpu = smp_processor_id();
	struct rq *rq = cpu_rq(cpu);
	struct task_struct *curr = rq->curr; //task currently running on this CPU
	u32 old_load;
	struct related_thread_group *grp;
    //update the scheduler clock
	sched_clock_tick();

	raw_spin_lock(&rq->lock);
	old_load = task_load(curr);
	grp = task_related_thread_group(curr);//thread group this task belongs to
	set_window_start(rq);
	update_rq_clock(rq);
	//update CPU load statistics
	update_cpu_load_active(rq);
	//for a CFS task this calls task_tick_fair
	curr->sched_class->task_tick(rq, curr, 0);
	update_task_ravg(rq->curr, rq, TASK_UPDATE, sched_ktime_clock(), 0);
	raw_spin_unlock(&rq->lock);

	perf_event_task_tick();

#ifdef CONFIG_SMP
	rq->idle_balance = idle_cpu(cpu);
	trigger_load_balance(rq, cpu);
#endif
	rq_last_tick_reset(rq);

	if (update_preferred_cluster(grp, curr, old_load))
		set_preferred_cluster(grp);

	if (curr->sched_class == &fair_sched_class)
		check_for_migration(rq, curr);
}

static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &curr->se;
	//with group scheduling, walk up the hierarchy and update every scheduling entity on the path
	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		entity_tick(cfs_rq, se, queued);
	}

	if (sched_feat_numa(NUMA))
		task_tick_numa(rq, curr);

	update_rq_runnable_avg(rq, 1);
}

//called on every tick to update the current entity's vruntime
static void
entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
{
	/*
	 * Update run-time statistics of the 'current'.
	 */
	//update the runtime statistics (including vruntime) of the current entity
	update_curr(cfs_rq);

	/*
	 * Ensure that runnable average is periodically updated.
	 */
	update_entity_load_avg(curr, 1);
	update_cfs_rq_blocked_load(cfs_rq, 1);
	update_cfs_shares(cfs_rq);

#ifdef CONFIG_SCHED_HRTICK
	/*
	 * queued ticks are scheduled to match the slice, so don't bother
	 * validating it and just reschedule.
	 */
	if (queued) {
		resched_task(rq_of(cfs_rq)->curr);
		return;
	}
	/*
	 * don't let the period tick interfere with the hrtick preemption
	 */
	if (!sched_feat(DOUBLE_TICK) &&
			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
		return;
#endif
	
	//if more than one task is runnable, check whether the current task should be preempted
	if (cfs_rq->nr_running > 1)
		check_preempt_tick(cfs_rq, curr);
}

//measure how long the task actually ran, then weight it and add it to vruntime
static void update_curr(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;
	u64 now = rq_clock_task(rq_of(cfs_rq));
	unsigned long delta_exec;

	if (unlikely(!curr))
		return;

	/*
	 * Get the amount of time the current task was running
	 * since the last time we changed load (this cannot
	 * overflow on 32 bits):
	 */
	//current clock minus the time this run started: the time the task has been running
	delta_exec = (unsigned long)(now - curr->exec_start);
	if (!delta_exec)
		return;

	//weight the task's real runtime and add it to the task's vruntime
	__update_curr(cfs_rq, curr, delta_exec);
	//reset exec_start so the next delta is measured from now
	curr->exec_start = now;

	if (entity_is_task(curr)) {
		struct task_struct *curtask = task_of(curr);

		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
		cpuacct_charge(curtask, delta_exec);
		account_group_exec_runtime(curtask, delta_exec);
	}

	account_cfs_rq_runtime(cfs_rq, delta_exec);
}

static inline void
__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
	      unsigned long delta_exec)
{
	unsigned long delta_exec_weighted;

	schedstat_set(curr->statistics.exec_max,
		      max((u64)delta_exec, curr->statistics.exec_max));
	/* accumulate the total (real) runtime */
	curr->sum_exec_runtime += delta_exec; 
	schedstat_add(cfs_rq, exec_clock, delta_exec);
	/* weight delta_exec by the task's priority to get the vruntime increment */
	delta_exec_weighted = calc_delta_fair(delta_exec, curr);//weighted execution time
	//vruntime grows by the weighted value rather than the raw runtime; this is what keeps scheduling fair
	curr->vruntime += delta_exec_weighted;
	//cfs_rq caches the smallest vruntime of all entities, used directly at the next scheduling decision
	update_min_vruntime(cfs_rq);
}

 //if the task's nice value is 0, return the real runtime unchanged; the weights of all other nice values are scaled up or down relative to nice 0
static inline unsigned long
calc_delta_fair(unsigned long delta, struct sched_entity *se)
{
	if (unlikely(se->load.weight != NICE_0_LOAD))//nice is not 0, so delta must be weighted
		delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);

	return delta;
}

/*
 * delta *= weight / lw
 */
 /*
 NICE_0_LOAD is 1024; here delta *= NICE_0_LOAD / lw, with weight = NICE_0_LOAD.
 The larger the nice value, the smaller the task's own load, so the computed delta grows faster than the real runtime and the task's chance of being scheduled next drops.
 Put another way, to reach the same vruntime a high-nice task is granted less real runtime.
 The load corresponding to each nice value is given by prio_to_weight.
 */
static unsigned long
calc_delta_mine(unsigned long delta_exec, unsigned long weight,
		struct load_weight *lw)
{
	u64 tmp;

	/*
	 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
	 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
	 * 2^SCHED_LOAD_RESOLUTION.
	 */
	if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
		tmp = (u64)delta_exec * scale_load_down(weight);//tmp=delta_exec*NICE_0_LOAD;
	else
		tmp = (u64)delta_exec;

	if (!lw->inv_weight) {
		unsigned long w = scale_load_down(lw->weight);//w=lw->weight;

		if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
			lw->inv_weight = 1;
		else if (unlikely(!w))
			lw->inv_weight = WMULT_CONST;
		else
			lw->inv_weight = WMULT_CONST / w; //inv_weight = 2^32 / load
	}

	/*
	 * Check whether we'd overflow the 64-bit multiplication:
	 */
	if (unlikely(tmp > WMULT_CONST))
		tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
			WMULT_SHIFT/2);
	else
		tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);//multiply by inv_weight and shift right by WMULT_SHIFT (32) with rounding, approximating tmp / weight

	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
}
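
The inv_weight trick replaces a division on every update with a multiplication by the precomputed value 2^32 / weight followed by a 32-bit right shift. A small user-space sketch (hypothetical values, not kernel code) shows that the approximation matches the exact division closely:

#include <stdio.h>
#include <stdint.h>

#define WMULT_SHIFT 32

int main(void)
{
	uint64_t delta_exec = 6000000;		/* the task ran 6 ms (in ns) */
	uint64_t nice0 = 1024, weight = 335;	/* weight of a nice 5 task, from prio_to_weight */
	uint64_t inv_weight = (1ULL << WMULT_SHIFT) / weight;

	uint64_t exact  = delta_exec * nice0 / weight;
	uint64_t approx = (delta_exec * nice0 * inv_weight) >> WMULT_SHIFT;

	printf("exact: %llu ns, via inv_weight: %llu ns\n",
	       (unsigned long long)exact, (unsigned long long)approx);
	return 0;
}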

/*
 * Preempt the current task with a newly woken task if needed:
 */
 //after updating the entity's vruntime, check whether a reschedule is needed: if the current task has run longer than the slice the scheduler granted it, mark it for rescheduling
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
	unsigned long ideal_runtime, delta_exec;
	struct sched_entity *se;
	s64 delta;
	//ideal_runtime is the theoretical time slice; it is not fixed but computed from the number of runnable tasks on this rq
	ideal_runtime = sched_slice(cfs_rq, curr);
	//time this task has accumulated since it was last picked to run
	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
	//if the actual runtime exceeds the slice the scheduler allotted, set the reschedule flag
	if (delta_exec > ideal_runtime) {
		resched_task(rq_of(cfs_rq)->curr);
		/*
		 * The current task ran long enough, ensure it doesn't get
		 * re-elected due to buddy favours.
		 */
		clear_buddies(cfs_rq, curr);
		return;
	}

	/*
	 * Ensure that a task that missed wakeup preemption by a
	 * narrow margin doesn't have to wait for a full slice.
	 * This also mitigates buddy induced latencies under load.
	 */
	 //below sysctl_sched_min_granularity, the minimum preemption granularity, no reschedule is needed; return
	if (delta_exec < sysctl_sched_min_granularity)
		return;
	//the leftmost node of the rbtree is the entity with the smallest vruntime
	se = __pick_first_entity(cfs_rq);
	delta = curr->vruntime - se->vruntime;
	//compare the current task against the smallest vruntime in the rq
	if (delta < 0)
		return;
	//the gap is large enough: set the reschedule flag so another entity can run
	if (delta > ideal_runtime)
		resched_task(rq_of(cfs_rq)->curr);
}
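
ideal_runtime comes from sched_slice(), which splits a scheduling period among the runnable entities in proportion to their weight. A simplified sketch of the idea, ignoring the task-group hierarchy and assuming the commonly quoted defaults (6 ms latency, 0.75 ms minimum granularity; both are tunables):

/* rough sketch of how a task's ideal slice is derived; the real sched_slice()
 * also walks the task-group hierarchy */
static unsigned long long sysctl_sched_latency = 6000000ULL;		/* 6 ms */
static unsigned long long sysctl_sched_min_granularity = 750000ULL;	/* 0.75 ms */
static unsigned int sched_nr_latency = 8;	/* latency / min_granularity */

static unsigned long long sched_period(unsigned long nr_running)
{
	/* with many runnable tasks, stretch the period so each still gets
	 * at least min_granularity */
	if (nr_running > sched_nr_latency)
		return nr_running * sysctl_sched_min_granularity;
	return sysctl_sched_latency;
}

static unsigned long long ideal_slice(unsigned long nr_running,
				      unsigned long se_weight,
				      unsigned long total_weight)
{
	/* each entity's share of the period is proportional to its weight */
	return sched_period(nr_running) * se_weight / total_weight;
}

With two runnable tasks of equal weight, each gets a 3 ms slice; a higher-weight (lower nice) task gets a proportionally larger share of the 6 ms period.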

Next let's look at how scheduling entities are enqueued and dequeued. For the CFS scheduler, enqueueing goes through enqueue_task_fair:

/*
 * The enqueue_task method is called before nr_running is
 * increased. Here we update the fair scheduling stats and
 * then put the task into the rbtree:
 */
static void
enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se;
	
	/* This is an iteration: a task may belong to a task group, so when a task that belongs to a group is enqueued on the group's queue, we also have to iterate upwards over its parent entities */
	for_each_sched_entity(se) {
		if (se->on_rq)
			break;
		cfs_rq = cfs_rq_of(se);
		//insert the se into the rbtree
		enqueue_entity(cfs_rq, se, flags);

		/*
		 * end evaluation on encountering a throttled cfs_rq
		 *
		 * note: in the case of encountering a throttled cfs_rq we will
		 * post the final h_nr_running increment below.
		*/
		if (cfs_rq_throttled(cfs_rq))
			break;
		cfs_rq->h_nr_running++;
		inc_cfs_rq_hmp_stats(cfs_rq, p, 1);

		flags = ENQUEUE_WAKEUP;
	}
	//this loop runs only if the first loop broke out early (the se was already on a runqueue or its cfs_rq was throttled); it handles the remaining ancestor entities of the task group
	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		cfs_rq->h_nr_running++;
		inc_cfs_rq_hmp_stats(cfs_rq, p, 1);

		if (cfs_rq_throttled(cfs_rq))
			break;

		update_cfs_shares(cfs_rq);
		update_entity_load_avg(se, 1);
	}

	if (!se) {
		update_rq_runnable_avg(rq, rq->nr_running);
		inc_nr_running(rq);
		inc_rq_hmp_stats(rq, p, 1);
	}
	hrtick_update(rq);
}


static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
	/*
	 * Update the normalized vruntime before updating min_vruntime
	 * through calling update_curr().
	 */
	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
		se->vruntime += cfs_rq->min_vruntime;

	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);
	enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
	account_entity_enqueue(cfs_rq, se);
	update_cfs_shares(cfs_rq);

	//the task was sleeping and has just been woken up; for a newly created task this flag is 0
	if (flags & ENQUEUE_WAKEUP) {
		place_entity(cfs_rq, se, 0);
		enqueue_sleeper(cfs_rq, se);
	}

	update_stats_enqueue(cfs_rq, se, !!(flags & ENQUEUE_MIGRATING));
	check_spread(cfs_rq, se);
	if (se != cfs_rq->curr)//insert the se into the cfs_rq's red-black tree
		__enqueue_entity(cfs_rq, se);
	se->on_rq = 1;

	if (cfs_rq->nr_running == 1) {
		list_add_leaf_cfs_rq(cfs_rq);
		check_enqueue_throttle(cfs_rq);
	}
}

/*
Placement of a "new" entity: initial is 1 for a newly created task (the fork path) and 0 for a task being woken from sleep.
When a child task is created, its vruntime is first set to min_vruntime; then, if the START_DEBIT bit is set in sched_features, the vruntime is pushed a bit further beyond min_vruntime. After the child's vruntime is set, the sched_child_runs_first parameter is checked: if it is 1, the parent's and child's vruntime are compared, and if the parent's is smaller the two are swapped, which guarantees the child runs before the parent.

*/
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
	//a newly created task starts from the cfs_rq's min_vruntime
	u64 vruntime = cfs_rq->min_vruntime;

	/*
	 * The 'current' period is already promised to the current tasks,
	 * however the extra weight of the new task will slow them down a
	 * little, place the new task so that it fits in the slot that
	 * stays open at the end.
	 */
	 /*
      sched_features is a set of switches controlling scheduler features, one bit per feature.
      All of them are listed in sched_features.h. START_DEBIT is one of them:
      when enabled, a new task's initial vruntime is set larger than the default,
      which delays its first run and prevents a task from grabbing CPU time by forking over and over.
	 */
	if (initial && sched_feat(START_DEBIT))//new task and START_DEBIT is enabled
		vruntime += sched_vslice(cfs_rq, se);// add one scheduling period's worth of "slice"

	/* sleeps up to a single latency don't count. */
	//for a task waking from sleep, vruntime is set below the rq's current min_vruntime, so it is picked
	//first at the next scheduling decision; this is how interactive tasks get scheduled promptly after waking
	if (!initial) {//waking from sleep
		unsigned long thresh = sysctl_sched_latency;//one scheduling latency period

		/*
		 * Halve their sleep time's effect, to allow
		 * for a gentler effect of sleepers:
		 */
		if (sched_feat(GENTLE_FAIR_SLEEPERS))
			thresh >>= 1;

		vruntime -= thresh;
	}

	/* ensure we never gain time by being placed backwards. */
	//third case: the entity was running before; keep its own vruntime if that is larger, so it never gains time by being placed backwards
	se->vruntime = max_vruntime(se->vruntime, vruntime);
}
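
As a rough illustration of the sleeper compensation: with the default 6 ms sysctl_sched_latency and GENTLE_FAIR_SLEEPERS enabled, a woken sleeper is placed about 3 ms of virtual time behind the queue's minimum (hypothetical numbers in the sketch below):

#include <stdio.h>

int main(void)
{
	unsigned long long min_vruntime = 100000000ULL;	/* queue min_vruntime: 100 ms */
	unsigned long long thresh       = 6000000ULL >> 1;	/* sched_latency / 2 = 3 ms */
	unsigned long long sleeper_old  = 20000000ULL;		/* vruntime when the task went to sleep */

	unsigned long long target = min_vruntime - thresh;
	/* never let the entity gain time by being placed backwards */
	unsigned long long placed = sleeper_old > target ? sleeper_old : target;

	printf("woken task placed at %llu ns (queue min is %llu ns)\n", placed, min_vruntime);
	return 0;
}

Because the woken task ends up just below min_vruntime, it becomes the leftmost node and is picked first, but with only a bounded 3 ms credit so it cannot starve the others.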

//insert the scheduling entity into the rbtree, so that it becomes eligible to be picked
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
	struct rb_node *parent = NULL;
	struct sched_entity *entry;
	int leftmost = 1;

	/*
	 * Find the right place in the rbtree:
	 */
	//find the proper insertion point in the rbtree
	while (*link) {
		parent = *link;
		entry = rb_entry(parent, struct sched_entity, run_node);
		/*
		 * We dont care about collisions. Nodes with
		 * the same key stay together.
		 */
		if (entity_before(se, entry)) {
			link = &parent->rb_left;
		} else {
			link = &parent->rb_right;
			leftmost = 0; //once we descend into a right subtree, the new entity can no longer have the smallest vruntime
		}
	}

	/*
	 * Maintain a cache of leftmost tree entries (it is frequently
	 * used):
	 */
	//after inserting a new entity, decide whether rb_leftmost needs to be updated;
	//caching the leftmost node avoids walking the rbtree on every pick
	if (leftmost)
		cfs_rq->rb_leftmost = &se->run_node;
	//link the node into the rbtree
	rb_link_node(&se->run_node, parent, link);
	//rebalance (recolor/rotate) the rbtree
	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
}

//the sched entity dequeue path

static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se;
	int task_sleep = flags & DEQUEUE_SLEEP;
	
	//for a task group, dequeue entity by entity up the hierarchy; the group is handled as a whole for dequeue, enqueue, and for picking a task within the group
	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		dequeue_entity(cfs_rq, se, flags);

		/*
		 * end evaluation on encountering a throttled cfs_rq
		 *
		 * note: in the case of encountering a throttled cfs_rq we will
		 * post the final h_nr_running decrement below.
		*/
		if (cfs_rq_throttled(cfs_rq))
			break;
		cfs_rq->h_nr_running--;
		dec_cfs_rq_hmp_stats(cfs_rq, p, 1);

		/* Don't dequeue parent if it has other entities besides us */
		if (cfs_rq->load.weight) {
			/*
			 * Bias pick_next to pick a task from this cfs_rq, as
			 * p is sleeping when it is within its sched_slice.
			 */
			if (task_sleep && parent_entity(se))
				set_next_buddy(parent_entity(se));

			/* avoid re-evaluating load for this entity */
			se = parent_entity(se);
			break;
		}
		flags |= DEQUEUE_SLEEP;
	}

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		cfs_rq->h_nr_running--;
		dec_cfs_rq_hmp_stats(cfs_rq, p, 1);

		if (cfs_rq_throttled(cfs_rq))
			break;

		update_cfs_shares(cfs_rq);
		update_entity_load_avg(se, 1);
	}

	if (!se) {
		dec_nr_running(rq);
		update_rq_runnable_avg(rq, 1);
		dec_rq_hmp_stats(rq, p, 1);
	}
	hrtick_update(rq);
}

/*
When a task is removed from one CPU's runqueue (dequeue_entity), that queue's min_vruntime is subtracted from its vruntime;
when the task is enqueued on another CPU's runqueue (enqueue_entity), that queue's min_vruntime is added to it.
This keeps vruntime relatively fair after a task migrates from one CPU to another.
*/
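
A tiny sketch of what this normalization does, with hypothetical numbers just to illustrate:

#include <stdio.h>

int main(void)
{
	unsigned long long se_vruntime = 105000000ULL;	/* 105 ms, 5 ms ahead of CPU0's minimum */
	unsigned long long cpu0_min    = 100000000ULL;	/* min_vruntime of the source runqueue */
	unsigned long long cpu1_min    = 500000000ULL;	/* min_vruntime of the destination runqueue */

	se_vruntime -= cpu0_min;	/* dequeue_entity on CPU0: keep only the relative 5 ms */
	se_vruntime += cpu1_min;	/* enqueue_entity on CPU1: rebase onto the new queue */

	printf("vruntime after migration: %llu ns\n", se_vruntime);	/* 505 ms */
	return 0;
}

The task keeps its 5 ms of relative lead instead of carrying an absolute vruntime that would be meaningless on the destination CPU.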

static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
	/*
	 * Update run-time statistics of the 'current'.
	 */
	//update the current task's vruntime
	update_curr(cfs_rq);
	dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);

	update_stats_dequeue(cfs_rq, se, !!(flags & DEQUEUE_MIGRATING));
	if (flags & DEQUEUE_SLEEP) {
#ifdef CONFIG_SCHEDSTATS
		if (entity_is_task(se)) {
			struct task_struct *tsk = task_of(se);

			if (tsk->state & TASK_INTERRUPTIBLE)
				se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
			if (tsk->state & TASK_UNINTERRUPTIBLE)
				se->statistics.block_start = rq_clock(rq_of(cfs_rq));
		}
#endif
	}

	clear_buddies(cfs_rq, se);

	if (se != cfs_rq->curr)
		__dequeue_entity(cfs_rq, se);
	se->on_rq = 0;
	account_entity_dequeue(cfs_rq, se);

	/*
	 * Normalize the entity after updating the min_vruntime because the
	 * update can refer to the ->curr item and we need to reflect this
	 * movement in our normalized position.
	 */
	//normalize the se's vruntime unless the task is going to sleep
	if (!(flags & DEQUEUE_SLEEP))
		se->vruntime -= cfs_rq->min_vruntime;

	/* return excess runtime on last dequeue */
	return_cfs_rq_runtime(cfs_rq);
	//update the cfs_rq's min_vruntime
	update_min_vruntime(cfs_rq);
	update_cfs_shares(cfs_rq);
}
//remove the entity from the rbtree
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	if (cfs_rq->rb_leftmost == &se->run_node) {
		struct rb_node *next_node;

		next_node = rb_next(&se->run_node);
		cfs_rq->rb_leftmost = next_node;
	}

	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}

