1. Overview of place_entity
The place_entity function re-adjusts the initial virtual runtime (vruntime) of just-woken and newly created tasks. For a newly created task the adjusted value is a penalty; for a woken task it is usually a reward (compensation).
Both just-woken and newly created tasks become runnable and are inserted into the red-black tree of the CFS run queue via enqueue_entity -> __enqueue_entity (only normal tasks are considered here), so their vruntime has to be recomputed first.
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
/*
* Update the normalized vruntime before updating min_vruntime
* through callig update_curr().
*/
if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
se->vruntime += cfs_rq->min_vruntime;
/*
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
account_entity_enqueue(cfs_rq, se);
update_cfs_shares(cfs_rq);
if (flags & ENQUEUE_WAKEUP) {
place_entity(cfs_rq, se, 0);
enqueue_sleeper(cfs_rq, se);
}
update_stats_enqueue(cfs_rq, se);
check_spread(cfs_rq, se);
if (se != cfs_rq->curr)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
if (cfs_rq->nr_running == 1) {
list_add_leaf_cfs_rq(cfs_rq);
check_enqueue_throttle(cfs_rq);
}
}
__enqueue_entity inserts the scheduling entity into the rb-tree:
/*
* Enqueue an entity into the rb-tree:
*/
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
struct rb_node *parent = NULL;
struct sched_entity *entry;
int leftmost = 1;
/*
* Find the right place in the rbtree:
*/
while (*link) {
parent = *link;
entry = rb_entry(parent, struct sched_entity, run_node);
/*
* We dont care about collisions. Nodes with
* the same key stay together.
*/
if (entity_before(se, entry)) {
link = &parent->rb_left;
} else {
link = &parent->rb_right;
leftmost = 0;
}
}
/*
* Maintain a cache of leftmost tree entries (it is frequently
* used):
*/
if (leftmost)
cfs_rq->rb_leftmost = &se->run_node;
rb_link_node(&se->run_node, parent, link);
rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
}
place_entity is called mainly from the following places:
(1) A just-woken task: when a task is woken from sleep, it leaves the wait queue and enters the run queue.
(2) A newly created task: the most suitable (relatively idle) CPU is chosen and the child task is added to that CPU's run queue.
1.1 try_to_wake_up
/**
* try_to_wake_up - wake up a thread
* @p: the thread to be awakened
* @state: the mask of task states that can be woken
* @wake_flags: wake modifier flags (WF_*)
*
* Put it on the run-queue if it's not already there. The "current"
* thread is always on the run-queue (except when the actual
* re-schedule is in progress), and as such you're allowed to do
* the simpler "current->state = TASK_RUNNING" to mark yourself
* runnable without the overhead of this.
*
* Returns %true if @p was woken up, %false if it was already running
* or @state didn't match @p's state.
*/
static int
try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
unsigned long flags;
int cpu, success = 0;
smp_wmb();
raw_spin_lock_irqsave(&p->pi_lock, flags);
if (!(p->state & state))
goto out;
success = 1; /* we're going to change ->state */
cpu = task_cpu(p);
if (p->on_rq && ttwu_remote(p, wake_flags))
goto stat;
#ifdef CONFIG_SMP
/*
* If the owning (remote) cpu is still in the middle of schedule() with
* this task as prev, wait until its done referencing the task.
*/
while (p->on_cpu)
cpu_relax();
/*
* Pairs with the smp_wmb() in finish_lock_switch().
*/
smp_rmb();
p->sched_contributes_to_load = !!task_contributes_to_load(p);
p->state = TASK_WAKING;
if (p->sched_class->task_waking)
p->sched_class->task_waking(p);
/* select a suitable CPU for the woken task */
cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
if (task_cpu(p) != cpu) {
wake_flags |= WF_MIGRATED;
set_task_cpu(p, cpu);
}
#endif /* CONFIG_SMP */
/* add the woken task to the CFS run queue */
ttwu_queue(p, cpu);
stat:
ttwu_stat(p, cpu, wake_flags);
out:
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
return success;
}
The key function here is ttwu_queue, which adds the woken task to the CFS run queue.
static void ttwu_queue(struct task_struct *p, int cpu)
{
struct rq *rq = cpu_rq(cpu);
#if defined(CONFIG_SMP)
if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
sched_clock_cpu(cpu); /* sync clocks x-cpu */
ttwu_queue_remote(p, cpu);
return;
}
#endif
raw_spin_lock(&rq->lock);
ttwu_do_activate(rq, p, 0);
raw_spin_unlock(&rq->lock);
}
static void
ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
{
#ifdef CONFIG_SMP
if (p->sched_contributes_to_load)
rq->nr_uninterruptible--;
#endif
/* (1) insert the woken task into the CFS run queue, i.e. insert its
 * scheduling entity into the red-black tree */
ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
/* (2) check whether the woken task should preempt the current task */
ttwu_do_wakeup(rq, p, wake_flags);
}
(1) ttwu_activate
ttwu_activate adds the woken task to the CFS run queue, i.e. it inserts the task's scheduling entity into the red-black tree:
static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
{
activate_task(rq, p, en_flags);
p->on_rq = 1;
/* if a worker is waking up, notify workqueue */
if (p->flags & PF_WQ_WORKER)
wq_worker_waking_up(p, cpu_of(rq));
}
static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
update_rq_clock(rq);
sched_info_queued(p);
p->sched_class->enqueue_task(rq, p, flags);
}
void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
if (task_contributes_to_load(p))
rq->nr_uninterruptible--;
enqueue_task(rq, p, flags);
}
/*
* The enqueue_task method is called before nr_running is
* increased. Here we update the fair scheduling stats and
* then put the task into the rbtree:
*/
static void
enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
for_each_sched_entity(se) {
if (se->on_rq)
break;
cfs_rq = cfs_rq_of(se);
enqueue_entity(cfs_rq, se, flags);
/*
* end evaluation on encountering a throttled cfs_rq
*
* note: in the case of encountering a throttled cfs_rq we will
* post the final h_nr_running increment below.
*/
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running++;
flags = ENQUEUE_WAKEUP;
}
}
/*
* All the scheduling class methods:
*/
const struct sched_class fair_sched_class = {
.enqueue_task = enqueue_task_fair,
};
#define ENQUEUE_WAKEUP 1
struct sched_class {
void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
};
extern const struct sched_class fair_sched_class;
To make it easy to add new scheduling policies, the Linux kernel abstracts scheduling classes via struct sched_class. There are currently five scheduler classes; here we only look at the CFS class (the completely fair scheduling class).
enqueue_task(): adds a task to a run queue. When a task is woken from sleep it leaves the wait queue and enters the run queue, which triggers the enqueue_task() operation.
static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
update_rq_clock(rq);
sched_info_queued(p);
p->sched_class->enqueue_task(rq, p, flags);
}
For the CFS scheduling class, enqueue_task is enqueue_task_fair, which calls enqueue_entity to insert the woken task's scheduling entity into the red-black tree.
(2) ttwu_do_wakeup
/*
* Mark the task runnable and perform wakeup-preemption.
*/
static void
ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{
/* call check_preempt_curr to check whether preemption should happen */
check_preempt_curr(rq, p, wake_flags);
trace_sched_wakeup(p, true);
p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
if (p->sched_class->task_woken)
p->sched_class->task_woken(rq, p);
if (rq->idle_stamp) {
u64 delta = rq->clock - rq->idle_stamp;
u64 max = 2*sysctl_sched_migration_cost;
if (delta > max)
rq->avg_idle = max;
else
update_avg(&rq->avg_idle, delta);
rq->idle_stamp = 0;
}
#endif
}
void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
const struct sched_class *class;
if (p->sched_class == rq->curr->sched_class) {
rq->curr->sched_class->check_preempt_curr(rq, p, flags);
} else {
for_each_class(class) {
if (class == rq->curr->sched_class)
break;
if (class == p->sched_class) {
resched_task(rq->curr);
break;
}
}
}
/*
* A queue event has occurred, and we're going to schedule. In
* this case, we can save a useless back to back clock update.
*/
if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
rq->skip_clock_update = 1;
}
/*
* All the scheduling class methods:
*/
const struct sched_class fair_sched_class = {
.check_preempt_curr = check_preempt_wakeup,
}
check_preempt_curr (check_preempt_wakeup for the CFS class) is called here to check whether the woken task should preempt the task currently running on this CPU, rq->curr. Even if it should, the current task is not kicked off immediately; it is only marked as needing to be preempted.
/*
* Preempt the current task with a newly woken task if needed:
*/
static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{
/* the task currently running on this CPU's run queue: rq->curr */
struct task_struct *curr = rq->curr;
/* scheduling entity of the current task (&curr->se) and of the woken task (&p->se) */
struct sched_entity *se = &curr->se, *pse = &p->se;
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
int scale = cfs_rq->nr_running >= sched_nr_latency;
int next_buddy_marked = 0;
if (unlikely(se == pse))
return;
/*
* This is possible from callers such as move_task(), in which we
* unconditionally check_prempt_curr() after an enqueue (which may have
* lead to a throttle). This both saves work and prevents false
* next-buddy nomination below.
*/
if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
return;
if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
set_next_buddy(pse);
next_buddy_marked = 1;
}
/*
* We can come here with TIF_NEED_RESCHED already set from new task
* wake up path.
*
* Note: this also catches the edge-case of curr being in a throttled
* group (e.g. via set_curr_task), since update_curr() (in the
* enqueue of curr) will have resulted in resched being set. This
* prevents us from potentially nominating it as a false LAST_BUDDY
* below.
*/
if (test_tsk_need_resched(curr))
return;
/* if the current task is an idle-policy task, the woken task should preempt it: idle tasks have the lowest priority */
/* Idle tasks are by definition preempted by non-idle tasks. */
if (unlikely(curr->policy == SCHED_IDLE) &&
likely(p->policy != SCHED_IDLE))
goto preempt;
/*
* Batch and idle tasks do not preempt non-idle tasks (their preemption
* is driven by the tick):
*/
if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
return;
/* related to group scheduling: CONFIG_FAIR_GROUP_SCHED */
find_matching_se(&se, &pse);
/* update the vruntime of the task currently running on this CPU */
update_curr(cfs_rq_of(se));
BUG_ON(!pse);
/* call wakeup_preempt_entity to decide whether the woken task should preempt */
if (wakeup_preempt_entity(se, pse) == 1) {
/*
* Bias pick_next to pick the sched entity that is
* triggering this preemption.
*/
if (!next_buddy_marked)
set_next_buddy(pse);
/* the woken task should preempt */
goto preempt;
}
return;
/* mark the current task as needing to be preempted */
preempt:
resched_task(curr);
/*
* Only set the backward buddy when the current task is still
* on the rq. This can happen when a wakeup gets interleaved
* with schedule on the ->pre_schedule() or idle_balance()
* point, either of which can * drop the rq lock.
*
* Also, during early boot the idle thread is in the fair class,
* for obvious reasons its a bad idea to schedule back to it.
*/
if (unlikely(!se->on_rq || curr == rq->idle))
return;
if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
set_last_buddy(se);
}
(1) Get the task currently running on this CPU's run queue: rq->curr.
(2) Get its scheduling entity &curr->se and the woken task's scheduling entity &p->se.
(3) If the current task is an idle-policy task, the woken task should preempt it, since idle tasks have the lowest priority.
(4) Update the vruntime of the task currently running on this CPU.
(5) Call wakeup_preempt_entity to decide whether the woken task should preempt.
The source is as follows:
/*
* SCHED_OTHER wake-up granularity.
* (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
*
* This option delays the preemption effects of decoupled workloads
* and reduces their over-scheduling. Synchronous workloads will still
* have immediate wakeup/sleep latencies.
*/
unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
static unsigned long
wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
{
/* sysctl_sched_wakeup_granularity = 1 ms: the minimum amount of time the
 * task to be preempted must have run; this is real (wall-clock) time */
unsigned long gran = sysctl_sched_wakeup_granularity;
/*
* Since its curr running now, convert the gran from real-time
* to virtual-time in his units.
*
* By using 'se' instead of 'curr' we penalize light tasks, so
* they get preempted easier. That is, if 'se' < 'curr' then
* the resulting gran will be larger, therefore penalizing the
* lighter, if otoh 'se' > 'curr' then the resulting gran will
* be smaller, again penalizing the lighter task.
*
* This is especially important for buddies when the leftmost
* task is higher priority than the buddy.
*/
/* convert this minimum time quantum into virtual time using the woken task's weight */
return calc_delta_fair(gran, se);
}
/*
* Should 'se' preempt 'curr'.
*
* |s1
* |s2
* |s3
* g
* |<--->|c
*
* w(c, s1) = -1
* w(c, s2) = 0
* w(c, s3) = 1
*
*/
static int
wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
{
s64 gran, vdiff = curr->vruntime - se->vruntime;
/* if the current task's vruntime is no larger than the woken task's, no preemption */
if (vdiff <= 0)
return -1;
/* the current task's vruntime is larger than the woken task's:
 * compute the minimum run-time quantum (real time converted to virtual time) */
gran = wakeup_gran(curr, se);
/* if the difference also exceeds that quantum, mark the current task as
 * needing to be preempted and request a reschedule */
if (vdiff > gran)
return 1;
return 0;
}
When wakeup_preempt_entity decides whether to mark the currently running task for preemption, it does not preempt merely because the current task's vruntime is larger than the woken task's; a buffer is added. Only if the woken task's vruntime plus the minimum run-time quantum (sysctl_sched_wakeup_granularity = 1 ms, converted to virtual time) is still smaller than the current task's vruntime is the current task marked for preemption. The buffer keeps task switches from becoming too frequent and wasting too much time on context switches, as the worked example below shows.
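A small worked example of that decision, using made-up vruntime values (all numbers are in nanoseconds, and the granularity is assumed to already be converted to virtual time):
/*
 * Worked example of wakeup_preempt_entity()'s decision with assumed numbers.
 * curr_vruntime / woken_vruntime / vgran are hypothetical example values,
 * not taken from a real run queue.
 */
#include <stdio.h>

static int should_preempt(long long curr_vruntime, long long woken_vruntime,
			  long long vgran)
{
	long long vdiff = curr_vruntime - woken_vruntime;

	if (vdiff <= 0)    /* woken task is not behind -> no preemption */
		return -1;
	if (vdiff > vgran) /* behind by more than the buffer -> preempt */
		return 1;
	return 0;          /* behind, but within the buffer -> no preemption */
}

int main(void)
{
	long long vgran = 1000000; /* 1 ms wakeup granularity in virtual time */

	/* woken task is 2 ms behind curr: 2 ms > 1 ms buffer -> preempt (1) */
	printf("%d\n", should_preempt(105000000, 103000000, vgran));
	/* woken task is only 0.5 ms behind: within the buffer -> no preemption (0) */
	printf("%d\n", should_preempt(105000000, 104500000, vgran));
	return 0;
}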
(6) If the current task should be preempted, resched_task is called to mark it for rescheduling.
Note that the current task is only marked here and a reschedule is requested; the actual preemption does not happen yet. The flag is acted on later, at a preemption point such as the return from an interrupt or system call, where the kernel checks it and calls schedule().
resched_task sets TIF_NEED_RESCHED in the flags member of the task's struct thread_info.
/*
* resched_task - mark a task 'to be rescheduled now'.
*
* On UP this means the setting of the need_resched flag, on SMP it
* might also involve a cross-CPU call to trigger the scheduler on
* the target CPU.
*/
#ifdef CONFIG_SMP
void resched_task(struct task_struct *p)
{
int cpu;
assert_raw_spin_locked(&task_rq(p)->lock);
if (test_tsk_need_resched(p))
return;
set_tsk_need_resched(p);
cpu = task_cpu(p);
if (cpu == smp_processor_id())
return;
/* NEED_RESCHED must be visible before we test polling */
smp_mb();
if (!tsk_is_polling(p))
smp_send_reschedule(cpu);
}
static inline void set_tsk_need_resched(struct task_struct *tsk)
{
set_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
}
/* set thread flags in other task's structures
* - see asm/thread_info.h for TIF_xxxx flags available
*/
static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag)
{
set_ti_thread_flag(task_thread_info(tsk), flag);
}
/*
* flag set/clear/test wrappers
* - pass TIF_xxxx constants to these functions
*/
static inline void set_ti_thread_flag(struct thread_info *ti, int flag)
{
set_bit(flag, (unsigned long *)&ti->flags);
}
/*
* thread information flags
* - these are process state flags that various assembly files
* may need to access
* - pending work-to-be-done flags are in LSW
* - other flags in MSW
* Warning: layout of LSW is hardcoded in entry.S
*/
#define TIF_NEED_RESCHED 3 /* rescheduling necessary */
struct thread_info {
__u32 flags; /* low level flags */
};
1.2 task_fork_fair
/*
* called on fork with the child task as argument from the parent's context
* - child not yet on the tasklist
* - preemption disabled
*/
static void task_fork_fair(struct task_struct *p)
{
	...
	update_curr(cfs_rq);
	if (curr)
		se->vruntime = curr->vruntime;
	place_entity(cfs_rq, se, 1);
	...
}
/*
* All the scheduling class methods:
*/
const struct sched_class fair_sched_class = {
.task_fork = task_fork_fair,
};
struct sched_class {
void (*task_fork) (struct task_struct *p);
};
extern const struct sched_class fair_sched_class;
task_fork(): called when a task is created. When the kernel creates a new (normal) task it calls task_fork_fair, which in turn calls place_entity.
1.2.1 do_fork
The following is a brief outline of task creation: creating a process, a thread, or a kernel thread all goes through do_fork (the function differs between kernel versions), as shown below:
SYSCALL_DEFINE0(fork)
{
#ifdef CONFIG_MMU
return do_fork(SIGCHLD, 0, 0, NULL, NULL);
#else
/* can not support in nommu mode */
return(-EINVAL);
#endif
}
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
int __user *, parent_tidptr,
int __user *, child_tidptr,
int, tls_val)
{
return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr);
}
/*
* Create a kernel thread.
*/
pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
(unsigned long)arg, NULL, NULL);
}
/*
* Ok, this is the main fork-routine.
*
* It copies the process, and if successful kick-starts
* it and waits for it to finish using the VM if required.
*/
long do_fork(unsigned long clone_flags,
unsigned long stack_start,
unsigned long stack_size,
int __user *parent_tidptr,
int __user *child_tidptr)
{
struct task_struct *p;
/* copy the task; this also initializes its scheduler state */
p = copy_process(clone_flags, stack_start, stack_size,
child_tidptr, NULL, trace);
/* wake the new task and add it to a run queue */
wake_up_new_task(p);
}
/*
* This creates a new process as a copy of the old one,
* but does not actually start it yet.
*
* It copies the registers, and all the appropriate
* parts of the process environment (as per the clone
* flags). The actual kick-off is left to the caller.
*/
static struct task_struct *copy_process(unsigned long clone_flags,
unsigned long stack_start,
unsigned long stack_size,
int __user *child_tidptr,
struct pid *pid,
int trace)
{
/* Perform scheduler related setup. Assign this task to a CPU. */
sched_fork(p);
}
1.2.2 sched_fork
/*
* fork()/clone()-time setup:
*/
void sched_fork(struct task_struct *p)
{
unsigned long flags;
int cpu = get_cpu();
__sched_fork(p);
/*
* We mark the process as running here. This guarantees that
* nobody will actually run it, and a signal or other external
* event cannot wake it up and insert it on the runqueue either.
*/
p->state = TASK_RUNNING;
/*
* Make sure we do not leak PI boosting priority to the child.
*/
p->prio = current->normal_prio;
/*
* Revert to default priority/policy on fork if requested.
*/
if (unlikely(p->sched_reset_on_fork)) {
if (task_has_rt_policy(p)) {
p->policy = SCHED_NORMAL;
p->static_prio = NICE_TO_PRIO(0);
p->rt_priority = 0;
} else if (PRIO_TO_NICE(p->static_prio) < 0)
p->static_prio = NICE_TO_PRIO(0);
p->prio = p->normal_prio = __normal_prio(p);
set_load_weight(p);
/*
* We don't need the reset flag anymore after the fork. It has
* fulfilled its duty:
*/
p->sched_reset_on_fork = 0;
}
if (!rt_prio(p->prio))
p->sched_class = &fair_sched_class;
if (p->sched_class->task_fork)
p->sched_class->task_fork(p);
}
sched_fork() sets up the scheduler-related fields of the task (it sets the state to TASK_RUNNING and initializes the priorities, scheduling class, scheduling policy, load weight, and so on), i.e. it initializes the scheduling-related data structures.
For a normal task the key part is:
p->sched_class = &fair_sched_class;
if (p->sched_class->task_fork)
p->sched_class->task_fork(p);
So for a normal task, sched_fork ends up calling the class's task_fork hook, which is task_fork_fair.
1.2.3 wake_up_new_task
By the time wake_up_new_task is called for the new task, its scheduling class has already been initialized.
/*
* wake_up_new_task - wake up a newly created task for the first time.
*
* This function will do some initial scheduler statistics housekeeping
* that must be done for every newly created context, then puts the task
* on the runqueue and wakes it.
*/
void wake_up_new_task(struct task_struct *p)
{
unsigned long flags;
struct rq *rq;
raw_spin_lock_irqsave(&p->pi_lock, flags);
#ifdef CONFIG_SMP
/*
* Fork balancing, do it here and not earlier because:
* - cpus_allowed can change in the fork path
* - any previously selected cpu might disappear through hotplug
*/
/* select the CPU the child task is going to run on */
set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
#endif
rq = __task_rq_lock(p);
/* add the child task to the CFS run queue */
activate_task(rq, p, 0);
p->on_rq = 1;
trace_sched_wakeup_new(p, true);
check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
if (p->sched_class->task_woken)
p->sched_class->task_woken(rq, p);
#endif
task_rq_unlock(rq, p, &flags);
}
void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
if (task_contributes_to_load(p))
rq->nr_uninterruptible--;
enqueue_task(rq, p, flags);
}
static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
update_rq_clock(rq);
sched_info_queued(p);
p->sched_class->enqueue_task(rq, p, flags);
}
The parent wakes the newly created child via wake_up_new_task(): it selects the CPU the child will run on and adds the child to that CPU's (CFS) run queue, ready to be scheduled and run.
wake_up_new_task() selects the CPU the child will run on, yet a CPU was already assigned earlier in sched_fork() (via set_task_cpu()). Why is it selected again here?
/* Perform scheduler related setup. Assign this task to a CPU. */
sched_fork(p);
void sched_fork(struct task_struct *p)
{
int cpu = get_cpu();
/*
* The child is not yet in the pid-hash so no cgroup attach races,
* and the cgroup is pinned to this child due to cgroup_fork()
* is ran before sched_fork().
*
* Silence PROVE_RCU.
*/
raw_spin_lock_irqsave(&p->pi_lock, flags);
/* pick a suitable CPU for the child, normally the one the parent is running
 * on. Only the CPU is chosen here; the new task is not enqueued on it yet.
 * With CONFIG_SMP, wake_up_new_task() will select a CPU for the child again. */
set_task_cpu(p, cpu);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
}
sched_fork picks a CPU for the child. In fork -> sched_fork the child normally inherits the parent's cpus_allowed, so the CPU the parent is running on is chosen. Note that only a CPU is selected here; the new task is not added to that CPU's run queue. Later, wake_up_new_task() selects the CPU the child will run on again.
The reason is that cpus_allowed can change while the new task is being created, and the CPU chosen in sched_fork may have gone offline in the meantime, so wake_up_new_task() has to select the child's CPU again.
Every task has a cpus_allowed member in its struct task_struct, as shown below:
typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t;
struct task_struct {
cpumask_t cpus_allowed;
}
A task runs only on the CPUs it is allowed to use; this set is given by the cpus_allowed member of struct task_struct, and user space can restrict it, as the short sketch below illustrates.
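As an aside, user space can shrink this mask with the sched_setaffinity() system call. A minimal user-space sketch (pinning the calling task to CPU 0 is just an example choice):
/*
 * Minimal user-space sketch: restrict the calling task's allowed CPUs
 * (the cpus_allowed mask seen by the scheduler) to CPU 0.
 * The CPU number is only an example.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t mask;

	CPU_ZERO(&mask);
	CPU_SET(0, &mask); /* allow only CPU 0 */

	if (sched_setaffinity(0, sizeof(mask), &mask) != 0) {
		perror("sched_setaffinity");
		return 1;
	}
	printf("now pinned to CPU 0\n");
	return 0;
}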
void wake_up_new_task(struct task_struct *p)
{
#ifdef CONFIG_SMP
/*
* Fork balancing, do it here and not earlier because:
* - cpus_allowed can change in the fork path
* - any previously selected cpu might disappear through hotplug
*/
set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
#endif
}
The comment in the wake_up_new_task source also states why the child's CPU is selected again:
fork balancing is done here, and not earlier, because:
(1) cpus_allowed can change in the fork path;
(2) any previously selected CPU might disappear through hotplug.
select_task_rq is called to pick the idlest CPU in the most suitable scheduling domain for the child:
/*
* The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
*/
static inline
int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
{
int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
/*
* In order not to call set_task_cpu() on a blocking task we need
* to rely on ttwu() to place the task on a valid ->cpus_allowed
* cpu.
*
* Since this is common to all placement strategies, this lives here.
*
* [ this allows ->select_task() to simply return task_cpu(p) and
* not worry about this generic constraint ]
*/
if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
!cpu_online(cpu)))
cpu = select_fallback_rq(task_cpu(p), p);
return cpu;
}
1.3 Summary
place_entity is called mainly in the following two places:
(1) When a task is woken from sleep it leaves the wait queue, enters the run queue, and place_entity is called with initial = 0 from the enqueue_task() / enqueue_entity() path.
(2) When a new task is created, task_fork_fair (invoked from sched_fork) calls place_entity with initial = 1; the parent then wakes the child via wake_up_new_task(), which enqueues it.
2. place_entity source code in detail
As shown above, place_entity runs when a task is woken and when a new child task is created.
The source is as follows:
/*
* Targeted preemption latency for CPU-bound tasks:
* (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
*
* NOTE: this latency value is not the same as the concept of
* 'timeslice length' - timeslices in CFS are of variable length
* and have no persistent notion like in traditional, time-slice
* based scheduling concepts.
*
* (to see the precise effective timeslice length of your workload,
* run vmstat and monitor the context-switches (cs) field)
*/
unsigned int sysctl_sched_latency = 6000000ULL;
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
	/* (1) start from the run queue's min_vruntime: the entity's vruntime
	 *     is aligned to cfs_rq->min_vruntime */
	u64 vruntime = cfs_rq->min_vruntime;

	/*
	 * The 'current' period is already promised to the current tasks,
	 * however the extra weight of the new task will slow them down a
	 * little, place the new task so that it fits in the slot that
	 * stays open at the end.
	 */
	/* (2) initial == 1: a newly created task; penalize it by increasing
	 *     its vruntime */
	if (initial && sched_feat(START_DEBIT))
		vruntime += sched_vslice(cfs_rq, se);

	/* sleeps up to a single latency don't count. */
	/* (3) initial == 0: a woken task; compensate it by subtracting half of
	 *     the default scheduling latency (sysctl_sched_latency / 2 = 3 ms) */
	if (!initial) {
		unsigned long thresh = sysctl_sched_latency;

		/*
		 * Halve their sleep time's effect, to allow
		 * for a gentler effect of sleepers:
		 */
		if (sched_feat(GENTLE_FAIR_SLEEPERS))
			thresh >>= 1;

		vruntime -= thresh;
	}

	/* ensure we never gain time by being placed backwards. */
	/* (4) set se->vruntime to the larger of its current value and the
	 *     computed vruntime, so the entity is never placed earlier than
	 *     its current virtual runtime */
	se->vruntime = max_vruntime(se->vruntime, vruntime);
}
When a task is woken, place_entity is called with initial = 0:
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
if (flags & ENQUEUE_WAKEUP) {
place_entity(cfs_rq, se, 0);
}
}
When a new task is created, place_entity is called with initial = 1:
/*
* called on fork with the child task as argument from the parent's context
* - child not yet on the tasklist
* - preemption disabled
*/
static void task_fork_fair(struct task_struct *p)
{
place_entity(cfs_rq, se, 1);
}
(1) Start from the CFS run queue's min_vruntime and align the entity's vruntime to it. This prevents a new or just-woken task from hogging the CPU because its vruntime is far too small.
(2) initial == 1 means a newly created task, which is penalized: its vruntime is increased. The new task adds load to the CFS run queue, and the increase is computed by sched_vslice from the new task's weight (see the worked sketch after this list).
(3) initial == 0 means a just-woken task, which is given some compensation.
(4) The new task's vruntime (se->vruntime) is set to the larger of its current value and the computed one (the CFS run queue's min_vruntime + sched_vslice(cfs_rq, se)), so the task is never placed earlier than its current virtual runtime.
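A small worked sketch of the new-task case with made-up numbers (the min_vruntime, inherited vruntime, and sched_vslice result below are assumed values, not taken from a real run queue):
/*
 * Worked sketch of place_entity() for a new task (initial == 1, START_DEBIT on).
 * min_vruntime, the inherited vruntime and the sched_vslice() result are
 * assumed example values.
 */
#include <stdio.h>

static unsigned long long max_vruntime(unsigned long long a, unsigned long long b)
{
	return a > b ? a : b;
}

int main(void)
{
	unsigned long long min_vruntime = 100000000ULL; /* cfs_rq->min_vruntime: 100 ms */
	unsigned long long vslice       =   2000000ULL; /* assumed sched_vslice(): 2 ms  */
	unsigned long long se_vruntime  = 100500000ULL; /* child inherited parent's vruntime */

	/* start from min_vruntime, then add the new-task penalty */
	unsigned long long vruntime = min_vruntime + vslice;    /* 102 ms */

	/* never place the entity earlier than its current vruntime */
	se_vruntime = max_vruntime(se_vruntime, vruntime);       /* 102 ms */

	printf("child vruntime after placement: %llu ns\n", se_vruntime);
	return 0;
}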
/*
* called on fork with the child task as argument from the parent's context
* - child not yet on the tasklist
* - preemption disabled
*/
static void task_fork_fair(struct task_struct *p)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se, *curr;
	int this_cpu = smp_processor_id();
	struct rq *rq = this_rq();
	unsigned long flags;

	raw_spin_lock_irqsave(&rq->lock, flags);
	update_rq_clock(rq);

	cfs_rq = task_cfs_rq(current);
	curr = cfs_rq->curr;

	/* update the parent's vruntime */
	update_curr(cfs_rq);
	if (curr)
		/* start the child from the parent's vruntime */
		se->vruntime = curr->vruntime;
	/* penalize the new task: recompute the child's vruntime */
	place_entity(cfs_rq, se, 1);

	/* if children are configured to run first and the child's vruntime is
	 * larger than the parent's, swap the two */
	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
		/*
		 * Upon rescheduling, sched_class::put_prev_task() will place
		 * 'current' within the tree based on its new key value.
		 */
		/* swap the vruntime of parent and child */
		swap(curr->vruntime, se->vruntime);
		/* mark the current task with TIF_NEED_RESCHED: this only
		 * requests a reschedule, it does not preempt immediately */
		resched_task(rq->curr);
	}

	/* subtract the CFS run queue's min_vruntime. The child is not on any
	 * run queue yet; the min_vruntime of the queue it eventually joins is
	 * added back in enqueue_entity() when the child is woken and enqueued. */
	se->vruntime -= cfs_rq->min_vruntime;

	raw_spin_unlock_irqrestore(&rq->lock, flags);
}
task_fork_fair updates the parent's vruntime, copies the parent's vruntime to the child, and then calls place_entity to penalize the child, i.e. to adjust its vruntime. The new task has never run, so it needs a suitable initial virtual clock value to determine its position in the red-black tree. Finally, the min_vruntime of the current CPU's CFS run queue is subtracted from the child's vruntime; the child is not on any run queue yet, and the min_vruntime of the CFS run queue it eventually joins is added back when the child is woken and enqueued.
The child is not added to the selected CPU's run queue here; that happens later in wake_up_new_task(), which wakes the newly created child and enqueues it.
/* set the initial virtual clock: align it to the CFS run queue's min_vruntime */
u64 vruntime = cfs_rq->min_vruntime;
/* penalize the new task: increase its vruntime according to its weight */
if (initial && sched_feat(START_DEBIT))
	vruntime += sched_vslice(cfs_rq, se);
/* ensure we never gain time by being placed backwards. */
se->vruntime = max_vruntime(se->vruntime, vruntime);
/*
* wake_up_new_task - wake up a newly created task for the first time.
*
* This function will do some initial scheduler statistics housekeeping
* that must be done for every newly created context, then puts the task
* on the runqueue and wakes it.
*/
void wake_up_new_task(struct task_struct *p)
{
unsigned long flags;
struct rq *rq;
raw_spin_lock_irqsave(&p->pi_lock, flags);
#ifdef CONFIG_SMP
/*
* Fork balancing, do it here and not earlier because:
* - cpus_allowed can change in the fork path
* - any previously selected cpu might disappear through hotplug
*/
/* with CONFIG_SMP, select a suitable CPU for the child again */
set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
#endif
rq = __task_rq_lock(p);
/* add the new task to the selected CPU's run queue */
activate_task(rq, p, 0);
p->on_rq = 1;
trace_sched_wakeup_new(p, true);
check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
if (p->sched_class->task_woken)
p->sched_class->task_woken(rq, p);
#endif
task_rq_unlock(rq, p, &flags);
}
void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
if (task_contributes_to_load(p))
rq->nr_uninterruptible--;
enqueue_task(rq, p, flags);
}
static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
update_rq_clock(rq);
sched_info_queued(p);
p->sched_class->enqueue_task(rq, p, flags);
}
/*
* All the scheduling class methods:
*/
const struct sched_class fair_sched_class = {
.enqueue_task = enqueue_task_fair,
};
/*
* The enqueue_task method is called before nr_running is
* increased. Here we update the fair scheduling stats and
* then put the task into the rbtree:
*/
static void
enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
for_each_sched_entity(se) {
if (se->on_rq)
break;
cfs_rq = cfs_rq_of(se);
enqueue_entity(cfs_rq, se, flags);
/*
* end evaluation on encountering a throttled cfs_rq
*
* note: in the case of encountering a throttled cfs_rq we will
* post the final h_nr_running increment below.
*/
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running++;
flags = ENQUEUE_WAKEUP;
}
}
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
/*
* Update the normalized vruntime before updating min_vruntime
* through callig update_curr().
*/
if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
se->vruntime += cfs_rq->min_vruntime;
......
}
In wake_up_new_task, if CONFIG_SMP is enabled, a suitable CPU is selected for the child again, the new task is added to that CPU's run queue, and the min_vruntime of that CPU's CFS run queue is added to the child's vruntime. The min_vruntime of the parent's CFS run queue was subtracted in task_fork_fair, and the destination queue's min_vruntime is added back here (see the sketch below).
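A minimal sketch of that normalize/renormalize step, with assumed min_vruntime values for the parent's and the destination CPU's CFS run queues:
/*
 * Sketch of the min_vruntime normalization between fork and enqueue.
 * src_min / dst_min are assumed values for the parent's and the chosen
 * CPU's CFS run queues; they are not taken from a real system.
 */
#include <stdio.h>

int main(void)
{
	unsigned long long vruntime = 102000000ULL; /* child's vruntime after place_entity (102 ms) */
	unsigned long long src_min  = 100000000ULL; /* min_vruntime of the parent's cfs_rq (100 ms) */
	unsigned long long dst_min  = 250000000ULL; /* min_vruntime of the chosen CPU's cfs_rq (250 ms) */

	/* task_fork_fair(): keep only the offset relative to the source queue */
	vruntime -= src_min;                        /* 2 ms offset */

	/* enqueue_entity() on the chosen CPU: re-add that queue's min_vruntime */
	vruntime += dst_min;                        /* 252 ms */

	printf("child vruntime after enqueue: %llu ns\n", vruntime);
	return 0;
}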
(3) initial == 0 means a just-woken task moving from the wait queue to the run queue; such a task is rewarded. While a task sleeps, the other tasks keep running and their virtual clocks advance, so when the sleeper wakes up its vruntime may be far smaller than everyone else's. If it were enqueued as-is, it would have a huge advantage in the competition for the CPU and, left untreated, could keep the other tasks from being scheduled for a while after waking. So for a task that voluntarily gave up the CPU, its virtual clock is adjusted on wakeup against the slowest virtual clock in the queue: its vruntime is re-initialized from the CFS run queue's min_vruntime, and it is then rewarded by half of the default scheduling latency. The source is as follows:
// Targeted preemption latency for CPU-bound tasks:
// (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
unsigned int sysctl_sched_latency = 6000000ULL;
/* set the initial virtual clock: align it to the CFS run queue's min_vruntime */
u64 vruntime = cfs_rq->min_vruntime;

/* sleeps up to a single latency don't count. */
/* a woken task */
if (!initial) {
	/* compute the vruntime credit given to the woken task */
	unsigned long thresh = sysctl_sched_latency;

	/*
	 * Halve their sleep time's effect, to allow
	 * for a gentler effect of sleepers:
	 */
	if (sched_feat(GENTLE_FAIR_SLEEPERS))
		thresh >>= 1;

	/* subtract thresh = sysctl_sched_latency / 2 = 3 ms */
	vruntime -= thresh;
}

/* ensure we never gain time by being placed backwards. */
se->vruntime = max_vruntime(se->vruntime, vruntime);
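A short worked example of the sleeper credit, again with assumed numbers:
/*
 * Worked example of the woken-task path (initial == 0, GENTLE_FAIR_SLEEPERS on).
 * min_vruntime and the sleeper's old vruntime are assumed example values.
 */
#include <stdio.h>

static unsigned long long max_vruntime(unsigned long long a, unsigned long long b)
{
	return a > b ? a : b;
}

int main(void)
{
	unsigned long long min_vruntime = 500000000ULL; /* cfs_rq->min_vruntime: 500 ms */
	unsigned long long thresh       =   6000000ULL; /* sysctl_sched_latency: 6 ms */
	unsigned long long se_vruntime  = 120000000ULL; /* vruntime from before the long sleep */

	thresh >>= 1;                                   /* GENTLE_FAIR_SLEEPERS: credit = 3 ms */

	unsigned long long vruntime = min_vruntime - thresh;    /* 497 ms */

	/* a long sleeper is pulled up to min_vruntime - 3 ms instead of keeping 120 ms */
	se_vruntime = max_vruntime(se_vruntime, vruntime);

	printf("woken task vruntime: %llu ns\n", se_vruntime);
	return 0;
}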
References
Linux 3.10.0
https://dragonkingzhu.blog.csdn.net/article/details/104578189
https://www.cnblogs.com/LoyenWang/p/12495319.html
https://blog.csdn.net/bin_linux96/article/details/79154431