深入Linux内核架构笔记 - 进程管理与调度5: 完全公平调度类

核心调度器必须知道的有关公平调度类的所有信息,都包含在fair_sched_class中

/*
 * fair_sched_class: everything the core scheduler needs to know about
 * the completely fair scheduling class, exposed as a set of callbacks.
 */
static const struct sched_class fair_sched_class = {
	.next			= &idle_sched_class,	/* next lower scheduling class */
	.enqueue_task		= enqueue_task_fair,
	.dequeue_task		= dequeue_task_fair,
	.yield_task		= yield_task_fair,

	.check_preempt_curr	= check_preempt_wakeup,	/* wakeup-preemption decision */

	.pick_next_task		= pick_next_task_fair,
	.put_prev_task		= put_prev_task_fair,

#ifdef CONFIG_SMP
	.load_balance		= load_balance_fair,
	.move_one_task		= move_one_task_fair,
#endif

	.set_curr_task          = set_curr_task_fair,
	.task_tick		= task_tick_fair,	/* periodic-scheduler hook */
	.task_new		= task_new_fair,	/* hook for newly forked tasks */
};

数据结构

/*
 * cfs_rq: per-runqueue data of the completely fair scheduling class.
 */
struct cfs_rq {
	struct load_weight load;	/* cumulative load weight of all queued entities */
	unsigned long nr_running;	/* number of runnable entities on this queue */

	u64 exec_clock;
	u64 min_vruntime;		/* monotonic minimum vruntime of the queue; may
					 * exceed the leftmost node's vruntime slightly */

	struct rb_root tasks_timeline;	/* red-black tree, ordered by entity_key() */
	struct rb_node *rb_leftmost;	/* cached leftmost node = next entity to run */
	struct rb_node *rb_load_balance_curr;
	/* 'curr' points to currently running entity on this cfs_rq.
	 * It is set to NULL otherwise (i.e when none are currently running).
	 */
	struct sched_entity *curr;

	unsigned long nr_spread_over;	/* NOTE(review): appears to be a debug/statistics
					 * counter — confirm against kernel source */
};
  • nr_running: 队列上可运行进程的数目
  • load: 所有进程的累积负荷值
  • min_vruntime: 队列上所有进程的最小虚拟运行时间,实际值可能比最左边的树节点的vruntime大些
  • tasks_timeline: 用于在按时间排序的红黑树中管理所有进程
  • rb_leftmost: 指向树最左边的节点,即最需要被调度的进程
  • curr: 指向当前执行进程的可调度实体

CFS操作

  • 虚拟时钟

    • 根据现存的实际时钟和每个进程相关的负荷权重推算,所有与虚拟时钟有关的计算都在update_curr中执行

      /* Update the runtime statistics (physical and virtual) of the entity
       * currently running on @cfs_rq; no-op when nothing is running. */
      static void update_curr(struct cfs_rq *cfs_rq)
      {
      	struct sched_entity *curr = cfs_rq->curr;
      	u64 now = rq_of(cfs_rq)->clock;	/* clock of the owning runqueue */
      	unsigned long delta_exec;
      
      	if (unlikely(!curr))
      		return;
      
      	/*
      	 * Get the amount of time the current task was running
      	 * since the last time we changed load (this cannot
      	 * overflow on 32 bits):
      	 */
      	delta_exec = (unsigned long)(now - curr->exec_start);
      
      	__update_curr(cfs_rq, curr, delta_exec);
      	curr->exec_start = now;	/* start of the next accounting period */
      
      	if (entity_is_task(curr)) {
      		struct task_struct *curtask = task_of(curr);
      
      		/* charge the elapsed time to the task's CPU accounting */
      		cpuacct_charge(curtask, delta_exec);
      	}
      }
      
    • 确定就绪队列的当前执行进程,并获取主调度器的实际时钟值,如果就绪队列上没有进程正在执行,则无事可做; 否则计算当前和上一次更新负荷统计量的时间差,并调用__update_curr

      /* Account @delta_exec of physical runtime to @curr, advance its
       * vruntime by the load-weighted equivalent, then raise the queue's
       * min_vruntime monotonically. */
      static inline void
      __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
            unsigned long delta_exec)
      {
      	unsigned long delta_exec_weighted;
      	u64 vruntime;
      
      	schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));
      
      	curr->sum_exec_runtime += delta_exec;	/* physical time: plain sum */
      	schedstat_add(cfs_rq, exec_clock, delta_exec);
      	delta_exec_weighted = delta_exec;	/* nice-0: virtual time == physical time */
      	if (unlikely(curr->load.weight != NICE_0_LOAD)) {
      		/* otherwise scale: delta_exec * NICE_0_LOAD / curr->load.weight */
      		delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
      							&curr->load);
      	}
      	curr->vruntime += delta_exec_weighted;
      
      	/*
      	 * maintain cfs_rq->min_vruntime to be a monotonic increasing
      	 * value tracking the leftmost vruntime in the tree.
      	 */
      	if (first_fair(cfs_rq)) {
      		/* tree non-empty: track min of curr and the leftmost node */
      		vruntime = min_vruntime(curr->vruntime,
      				__pick_next_entity(cfs_rq)->vruntime);
      	} else
      		vruntime = curr->vruntime;
      
      	/* max() guarantees min_vruntime never decreases */
      	cfs_rq->min_vruntime =
      		max_vruntime(cfs_rq->min_vruntime, vruntime);
      }
      
      • 更新当前进程在CPU执行花费的物理时间和虚拟时间,物理时间的更新只需要将时间差delta_exec加到先前统计的时间上就可以了

      • 虚拟时间的更新,对于nice级别为0的进程,其虚拟时间等于物理时间,对于其它的优先级,需要根据进程的负荷权重,重新计算时间,忽略舍入和溢出检查,计算公式:(delta_exec) * (NICE_0_LOAD / curr->load.weight)

      • 更新就绪队列的min_vruntime, first_fair用来检测树是否有最左边的节点,如果有,则使用该节点的vruntime和当前进程vruntime中的最小值, 如果没有,则使用当前进程的vruntime,取上述方法得到的vruntime和min_vruntime中的最大值作为最终的min_vruntime

        /* Red-black tree sort key: the entity's vruntime relative to the
         * queue's min_vruntime (signed — may be negative). */
        static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
        {
        	return se->vruntime - cfs_rq->min_vruntime;
        }
        
      • 红黑树排序是根据entity_key函数返回的结果来进行的,进程运行时,其vruntime稳定地增加,在红黑树中向右移动,而且越重要的进程vruntime增加速度越慢,向右移动的速度也慢; 对于睡眠的进程,其vruntime保持不变,但是由于min_vruntime的增加,其键值变小,导致其往红黑树的左边移动

  • 延迟跟踪

    • 良好的调度延迟: 保证每个可运行程序都至少运行一次的某个时间间隔,在sysctl_sched_latency给出,可以通过/proc/sys/kernel/sched_latency_ns控制,默认值为20ms

    • sched_nr_latency: 控制一个延迟周期中处理的最大活动进程数目,如果活动进程超出上限,则延迟周期也成比例地扩展,可以通过sysctl_sched_min_granularity间接地控制,后者可以通过/proc/sys/kernel/sched_min_granularity_ns设置,默认值是4ms

    • sysctl_sched_latency和sysctl_sched_min_granularity改变时,都会重新计算sched_nr_latency

    • __sched_period确定延迟周期的长度,通常就是sysctl_sched_latency,但是如果有更多进程运行,其值有可能按线性比例扩展:
      sysctl_sched_latency * (nr_running / sched_nr_latency)

    • 根据进程的相对权重,将一个延迟周期的时间在活动进程之间分配:sched_slice计算调度实体在一个延迟周期内分配到的实际时间,__sched_vslice计算延迟周期对应的虚拟时间。

      /* Length of one latency period: sysctl_sched_latency, scaled up
       * linearly when more than sched_nr_latency tasks are runnable. */
      static u64 __sched_period(unsigned long nr_running)
      {
      	u64 period = sysctl_sched_latency;
      	unsigned long nr_latency = sched_nr_latency;
      
      	if (unlikely(nr_running > nr_latency)) {
      		/* period = sysctl_sched_latency * nr_running / nr_latency */
      		period *= nr_running;
      		do_div(period, nr_latency);
      	}
      
      	return period;
      }
      /* Real (wall-clock) share of one latency period for @se, proportional
       * to its weight: period * se->load.weight / cfs_rq->load.weight. */
      static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
      {
      	u64 slice = __sched_period(cfs_rq->nr_running);
      
      	slice *= se->load.weight;
      	do_div(slice, cfs_rq->load.weight);
      
      	return slice;
      }
      /*
       * We calculate the vruntime slice.
       *
       * vs = s/w = p/rw
       */
      static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running)
      {
      	u64 vslice = __sched_period(nr_running);
      
      	vslice *= NICE_0_LOAD;	/* virtual-time slice: period * NICE_0_LOAD / rq_weight */
      	do_div(vslice, rq_weight);
      
      	return vslice;
      }
      

队列操作

  • 向就绪队列放置新进程

    /* Enqueue task @p into the fair class: walk up @p's scheduling-entity
     * chain and enqueue every entity not already on a runqueue. */
    static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
    {
    	struct cfs_rq *cfs_rq;
    	struct sched_entity *se = &p->se;
    
    	for_each_sched_entity(se) {
    		if (se->on_rq)
    			break;	/* this entity (and its parents) are already queued */
    		cfs_rq = cfs_rq_of(se);
    		enqueue_entity(cfs_rq, se, wakeup);
    		wakeup = 1;	/* parents are treated as wakeups from here on */
    	}
    }
    /* Insert one entity into its cfs_rq; @wakeup selects the extra
     * vruntime adjustment done for tasks that were sleeping. */
    static void
    enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
    {
    	/*
    	 * Update run-time statistics of the 'current'.
    	 */
    	update_curr(cfs_rq);
    
    	if (wakeup) {
    		/* entity slept: recompute its vruntime (initial == 0) */
    		place_entity(cfs_rq, se, 0);
    		enqueue_sleeper(cfs_rq, se);
    	}
    
    	update_stats_enqueue(cfs_rq, se);
    	check_spread(cfs_rq, se);
    	if (se != cfs_rq->curr)
    		__enqueue_entity(cfs_rq, se);	/* 'current' is kept out of the tree */
    	account_entity_enqueue(cfs_rq, se);
    }
    /* Choose the vruntime of an entity entering the queue; @initial is set
     * only for brand-new tasks (fork path), 0 for wakeups. */
    static void
    place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
    {
    	u64 vruntime;
    
    	vruntime = cfs_rq->min_vruntime;	/* queue minimum as the base value */
    
    	if (sched_feat(TREE_AVG)) {
    		struct sched_entity *last = __pick_last_entity(cfs_rq);
    		if (last) {
    			/* average of min_vruntime and the rightmost entity */
    			vruntime += last->vruntime;
    			vruntime >>= 1;
    		}
    	} else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running)
    		vruntime += sched_vslice(cfs_rq)/2;
    
    	/*
    	 * The 'current' period is already promised to the current tasks,
    	 * however the extra weight of the new task will slow them down a
    	 * little, place the new task so that it fits in the slot that
    	 * stays open at the end.
    	 */
    	if (initial && sched_feat(START_DEBIT))
    		vruntime += sched_vslice_add(cfs_rq, se);
    
    	if (!initial) {
    		/* sleeps upto a single latency don't count. */
    		if (sched_feat(NEW_FAIR_SLEEPERS) && entity_is_task(se))
    			vruntime -= sysctl_sched_latency;
    
    		/* ensure we never gain time by being placed backwards. */
    		vruntime = max_vruntime(se->vruntime, vruntime);
    	}
    
    	se->vruntime = vruntime;
    }
    
    • 如果进程此前睡眠,则调用place_entity调整进程的虚拟运行时间
    • place_entity根据initial的值来区分两种情况,只有在新进程被加到系统时,才会设置该参数,此处讨论initial未设置的情况,由于内核要确保在当前的延迟周期内使活动进程都至少运行一次,因此,队列的min_vruntime用作基准时间,通过减去sysctl_sched_latency,确保新唤醒的进程只有在当前延迟周期结束后才能运行,但如果睡眠进程已经累积了比较大的不公平,内核选择se->vruntime作为最终的vruntime

选择下一个进程

  • 选择下一个要运行的进程由pick_next_task_fair执行

    /* Fair-class entry point for picking the next task: descend the
     * (possibly hierarchical) cfs_rq chain, taking the leftmost entity at
     * each level, until a real task is reached. */
    static struct task_struct *pick_next_task_fair(struct rq *rq)
    {
    	struct cfs_rq *cfs_rq = &rq->cfs;
    	struct sched_entity *se;
    
    	if (unlikely(!cfs_rq->nr_running))
    		return NULL;	/* nothing runnable in this class */
    
    	do {
    		se = pick_next_entity(cfs_rq);
    		cfs_rq = group_cfs_rq(se);	/* NULL once se is a task, not a group */
    	} while (cfs_rq);
    
    	return task_of(se);
    }
    /* Pick the leftmost entity (smallest vruntime) of the tree, if any,
     * and make it the queue's current entity. */
    static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
    {
    	struct sched_entity *se = NULL;
    
    	if (first_fair(cfs_rq)) {
    		se = __pick_next_entity(cfs_rq);
    		set_next_entity(cfs_rq, se);
    	}
    
    	return se;
    }
    /* Mark @se as the entity now running on @cfs_rq. */
    static void
    set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
    {
    	/* 'current' is not kept within the tree. */
    	if (se->on_rq) {
    		/*
    		 * Any task has to be enqueued before it get to execute on
    		 * a CPU. So account for the time it spent waiting on the
    		 * runqueue.
    		 */
    		update_stats_wait_end(cfs_rq, se);
    		__dequeue_entity(cfs_rq, se);
    	}
    	cfs_rq->curr = se;
    	/* remember runtime at slice start; check_preempt_tick uses the delta */
    	se->prev_sum_exec_runtime = se->sum_exec_runtime;
    }
    

处理周期性调度器

  • 由函数task_tick_fair负责,但实际工作是委托entity_tick完成的
/* Periodic-scheduler hook of the fair class: delegate per-queue work to
 * entity_tick() for every entity in @curr's hierarchy. */
static void task_tick_fair(struct rq *rq, struct task_struct *curr)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &curr->se;

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		entity_tick(cfs_rq, se);
	}
}
/* Per-tick work for one cfs_rq: refresh runtime statistics and, when more
 * than one task is runnable (or wakeup preemption is off), check whether
 * the current entity exhausted its slice. */
static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);

	if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))
		check_preempt_tick(cfs_rq, curr);
}
/* Request a reschedule once @curr has run longer than the share granted
 * to it by sched_slice() for this latency period. */
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
	unsigned long ideal_runtime, delta_exec;

	ideal_runtime = sched_slice(cfs_rq, curr);
	/* physical time consumed since this entity was made 'current' */
	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
	if (delta_exec > ideal_runtime)
		resched_task(rq_of(cfs_rq)->curr);	/* sets TIF_NEED_RESCHED */
}
  • 使用update_curr更新统计量
  • 如果可运行的进程大于1个,调用check_preempt_tick来决策进程是否应该被抢占,如果进程实际运行的时间间隔delta_exec大于由延迟周期确定的份额,则设置task_struct中的TIF_NEED_RESCHED标志,核心调度器会在适当的时机发起重新调度

唤醒抢占

  • 当使用try_to_wake_up和wake_up_new_task唤醒进程时,内核使用check_preempt_curr看看新进程是否可以抢占当前进程

    /* Wakeup-preemption hook (check_preempt_curr of the fair class):
     * decide whether the woken task @p should preempt rq->curr. */
    static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
    {
    	struct task_struct *curr = rq->curr;
    	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
    	struct sched_entity *se = &curr->se, *pse = &p->se;
    	unsigned long gran;
    
    	/* a waking real-time task always triggers an immediate resched */
    	if (unlikely(rt_prio(p->prio))) {
    		update_rq_clock(rq);
    		update_curr(cfs_rq);
    		resched_task(curr);
    		return;
    	}
    	/*
    	 * Batch tasks do not preempt (their preemption is driven by
    	 * the tick):
    	 */
    	if (unlikely(p->policy == SCHED_BATCH))
    		return;
    
    	if (!sched_feat(WAKEUP_PREEMPT))
    		return;
    
    	/* walk up the hierarchy until both entities share the same group */
    	while (!is_same_group(se, pse)) {
    		se = parent_entity(se);
    		pse = parent_entity(pse);
    	}
    
    	/* wakeup granularity, weight-scaled for non-nice-0 current tasks */
    	gran = sysctl_sched_wakeup_granularity;
    	if (unlikely(se->load.weight != NICE_0_LOAD))
    		gran = calc_delta_fair(gran, &se->load);
    
    	/*
    	 * Preempt only if the woken entity's vruntime trails the current
    	 * entity's by more than the granularity, guaranteeing the running
    	 * task a minimum stint before it can be kicked off the CPU.
    	 */
    	if (pse->vruntime + gran < se->vruntime)
    		resched_task(curr);
    }
    
    • 如果唤醒进程是实时进程,则立即请求重新调度
    • 如果唤醒进程是SCHED_BATCH进程,则放弃抢占
    • 当运行进程被新进程抢占时,内核确保被抢占者已经至少运行了某一最小时间限额,该限额保存在sysctl_sched_wakeup_granularity中,如果新进程的虚拟运行时间加上最小时间限额,仍然小于当前运行进程的虚拟运行时间,则请求重新调度

处理新进程

  • 创建新进程的行为由参数sysctl_sched_child_runs_first控制,该参数用于判断新建子进程是否应该在父进程之前运行

    /* Fork hook of the fair class: give the new task @p an initial
     * vruntime, optionally arrange for the child to run before its
     * parent, then enqueue it and request a reschedule. */
    static void task_new_fair(struct rq *rq, struct task_struct *p)
    {
    	struct cfs_rq *cfs_rq = task_cfs_rq(p);
    	struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
    	int this_cpu = smp_processor_id();
    
    	sched_info_queued(p);
    
    	update_curr(cfs_rq);
    	place_entity(cfs_rq, se, 1);	/* initial == 1: fork-time placement */
    
    	/* 'curr' will be NULL if the child belongs to a different group */
    	if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
    			curr && curr->vruntime < se->vruntime) {
    		/*
    		 * Upon rescheduling, sched_class::put_prev_task() will place
    		 * 'current' within the tree based on its new key value.
    		 */
    		swap(curr->vruntime, se->vruntime);	/* let the child run first */
    	}
    
    	enqueue_task_fair(rq, p, 0);	/* wakeup == 0: not a sleeper wakeup */
    	resched_task(rq->curr);
    }
    
    • 使用update_curr更新统计量
    • 调用place_entity时将initial设置为1, 以便使用sched_vslice_add计算初始的vruntime
    • 如果设置了sysctl_sched_child_runs_first,并且父进程的虚拟运行时间小于子进程的虚拟运行时间,需要交换二者的虚拟运行时间。
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值