Linux调度时机(抢占&调度)

Cc又菜又帅

已于 2023-01-09 09:19:00 修改

阅读量1.1k

点赞数 23

分类专栏： Linux内核文章标签： linux

于 2023-01-08 23:56:49 首次发布

本文链接：https://blog.csdn.net/weixin_48450161/article/details/128607558

版权

Linux内核专栏收录该内容

6 篇文章 0 订阅

订阅专栏

调度的时机

/*
 * __schedule() is the main scheduler function.
 *
 * The main means of driving the scheduler and thus entering this function are:
 *
 *   1. Explicit blocking: mutex, semaphore, waitqueue, etc.
 *
 *   2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
 *      paths. For example, see arch/x86/entry_64.S.
 *
 *      To drive preemption between tasks, the scheduler sets the flag in timer
 *      interrupt handler scheduler_tick().
 *
 *   3. Wakeups don't really cause entry into schedule(). They add a
 *      task to the run-queue and that's it.
 *
 *      Now, if the new task added to the run-queue preempts the current
 *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
 *      called on the nearest possible occasion:
 *
 *       - If the kernel is preemptible (CONFIG_PREEMPTION=y):
 *
 *         - in syscall or exception context, at the next outmost
 *           preempt_enable(). (this might be as soon as the wake_up()'s
 *           spin_unlock()!)
 *
 *         - in IRQ context, return from interrupt-handler to
 *           preemptible context
 *
 *       - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
 *         then at the next:
 *
 *          - cond_resched() call
 *          - explicit schedule() call
 *          - return from syscall or exception to user-space
 *          - return from interrupt-handler to user-space
 *
 * WARNING: must be called with preemption disabled!
 */

__schedule()是主调度函数

驱动调度器然后进入这个函数的主要方法包括
- 1、显式阻塞：互斥锁、信号量、等待队列等，当前线程不能继续执行
- 2、在中断返回和返回用户空间的路径上，检查TIF_NEED_RESCHED标志位，如果置位就重新调度
- 3、wakeup()并不会导致调度，只会把这个线程加入就绪队列中
- 4、切换线程的调度点：
  - 如果内核是不可抢占的(CONFIG_PREEMPT=n)
    - 调度点：
      - 显式调用cond_resched()或者schedule()
      - 从系统调用或异常返回到用户空间
      - 中断处理函数返回用户空间
  - 如果内核是可抢占的(CONFIG_PREEMPT=y)
    - 如果添加到就绪队列的这个新线程确实要抢占当前线程，那在wakeup函数中会设置TIF_NEED_RESCHED标志位(非抢占式内核同样会设置该标志位，只是要等到上面列出的调度点才会真正调度)，然后在最近的抢占点调用schedule()
    - 调度点：
      - ~~所有"内核是不可抢占式的"调度点~~ schedule()
      - 在系统调用和异常上下文中，在下一个最近的preempt_enable()中(可能就在wake_up()中的spin_unlock()后)(更早)
      - 在中断上下文中，从中断函数返回到抢占上下文

注意:所有调用方必须在之后重新检查need_resched()并相应地重新调度，以防在__schedule()中禁用抢占时，事件触发了重新调度的需要(例如中断唤醒任务)
类似这样：do { __schedule(); } while (need_resched());

开抢占提高实时性的原理：开启CONFIG_PREEMPT可以让调度点更多更密集，比如不开要等到下次系统调用，开了在wakeup()的spin_unlock()直接就调度了

疑问：
1、cond_resched()的用法？显式调用cond_resched()并不一定可以放弃处理器吧，因为可能此刻就是它自己最适合运行？还有另一种机制可以强制放弃处理器的函数是什么？
(1)cond_resched()是一种不可抢占式内核中的低延时处理，在一些比较耗时的处理中(已经被内核开发者识别出来)，如文件系统和内存回收的一些路径会调用cond_resched，看一下cond_resched这个宏的实现

/* include/linux/sched.h */
/*
 * cond_resched(): explicit, voluntary rescheduling point.
 *
 * Runs the ___might_sleep() check with the caller's file name and line
 * number, then evaluates to whatever _cond_resched() returns (the macro is a
 * statement expression, so its value is that of the last statement).
 */
#define cond_resched() ({			\
	___might_sleep(__FILE__, __LINE__, 0);	\
	_cond_resched();			\
})

#ifdef CONFIG_PREEMPTION
/*
 * Preemptible kernel: _cond_resched() is an empty stub that always reports
 * "did not reschedule".
 */
static inline int _cond_resched(void)
{
	return 0;
}
#else
/* Non-preemptible kernel: the real implementation is provided out of line. */
extern int _cond_resched(void);
#endif

/* kernel/sched/core.c */
#ifndef CONFIG_PREEMPTION
/*
 * Out-of-line body of cond_resched() for kernels built without
 * CONFIG_PREEMPTION.
 *
 * Returns 1 if a reschedule was performed, 0 otherwise.
 */
int __sched _cond_resched(void)
{
	/*
	 * should_resched(0): the preempt counter (thread_info->preempt_count)
	 * must be 0 (0 => preemptible, >0 => not preemptible) and the current
	 * task must have its reschedule flag set. If that is not the case,
	 * only run rcu_all_qs() and report that nothing was rescheduled.
	 */
	if (!should_resched(0)) {
		rcu_all_qs();
		return 0;
	}

	/* Both conditions hold: perform the preemptive reschedule. */
	preempt_schedule_common();
	return 1;
}
EXPORT_SYMBOL(_cond_resched);
#endif

/*
 * Call __schedule(true) repeatedly, bracketed by the preempt-latency
 * start/stop hooks, until need_resched() reports no further request.
 */
static void __sched notrace preempt_schedule_common(void)
{
	for (;;) {
		preempt_latency_start(1);
		__schedule(true);
		preempt_latency_stop(1);

		/*
		 * A preemption opportunity may have been missed between the
		 * schedule above and now, so look at the flag again.
		 * need_resched() checks TIF_NEED_RESCHED in
		 * current->thread_info->flag.
		 */
		if (!need_resched())
			break;
	}
}

(2)分析源码可知，cond_resched()函数的功能是放弃主动权，等待下一次调度运行

如果当前任务的调度策略是FIFO，那么调度器会将它放到同一优先级队列的队尾，只有当前面的FIFO任务都运行完，才会轮到它。但如果没有优先级更高或同级任务，则自己会继续执行
如果调度策略是RR，那么调度器会将它放在同一优先级队列的队尾，只有当当前RR任务运行完一个时间片之后，才能轮到它再次运行。与上同理其可以马上再上处理器
如果调度器是CFS，那么根据这次已经运行的时间计算出虚拟运行时间，加到总的虚拟运行时间里，再根据虚拟运行时间插入红黑树，cfs选一个虚拟运行时间最短的上处理器运行。与上同理其可以马上再上处理器(不是很确定)
(3)没查到，我记得有

2、非抢占式的内核也会去设置TIF_NEED_RESCHED么？
会，见问题1

参考资料：

https://zhuanlan.zhihu.com/p/554764883

抢占点的实现(并非调度点)

1、时钟中断tick时

kernel/sched/core.c

scheduler_tick
->curr->sched_class->task_tick(rq, curr, 0)
 ->task_tick_fair
  ->entity_tick
   ->check_preempt_tick

/*
 * Tick-time preemption check for the fair class: request a reschedule of the
 * running entity when it has used up its slice, or when its vruntime has
 * pulled ahead of the first entity of the queue by more than that slice.
 */
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
	/* Ideal runtime of curr as computed by sched_slice(). */
	unsigned long slice = sched_slice(cfs_rq, curr);
	/* How long curr has actually run since prev_sum_exec_runtime was taken. */
	unsigned long ran = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
	struct sched_entity *first;
	s64 vdiff;

	/* Actual runtime exceeds the ideal runtime: reschedule. */
	if (ran > slice) {
		resched_curr(rq_of(cfs_rq));
		/*
		 * curr has run long enough; drop buddy hints so it is not
		 * re-elected because of buddy favours.
		 */
		clear_buddies(cfs_rq, curr);
		return;
	}

	/*
	 * Below the minimum granularity (the original note gives
	 * 750000ns = 0.75ms) curr is left alone. Past it, a task that missed
	 * wakeup preemption by a narrow margin does not have to wait for a
	 * full slice; this also mitigates buddy induced latencies under load.
	 */
	if (ran < sysctl_sched_min_granularity)
		return;

	/* Entity with the smallest virtual runtime. */
	first = __pick_first_entity(cfs_rq);
	vdiff = curr->vruntime - first->vruntime;

	/*
	 * Reschedule only when curr is ahead (vdiff is not negative) and the
	 * lead in virtual runtime is larger than curr's ideal runtime.
	 */
	if (vdiff >= 0 && vdiff > slice)
		resched_curr(rq_of(cfs_rq));
}

2、唤醒抢占——fork路径

kernel/fork.c

kernel_clone
->wake_up_new_task(p)
 ->check_preempt_curr(rq, p, WF_FORK)

3、唤醒抢占——正常路径

kernel/sched/core.c
wake_up_process
->try_to_wake_up
 ->ttwu_queue
  ->ttwu_do_activate
   ->ttwu_do_wakeup
    ->check_preempt_curr(rq, p, wake_flags)

/*
 * Decide whether task p should preempt the task currently running on rq.
 * rq is the run-queue of the target CPU.
 */
void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
	const struct sched_class *class;

	if (p->sched_class != rq->curr->sched_class) {
		/*
		 * Different scheduling classes: walk the classes in
		 * for_each_class() order. Meeting the running task's class
		 * first means no preemption; meeting p's class first requests
		 * a reschedule.
		 */
		for_each_class(class) {
			if (class == rq->curr->sched_class)
				break;
			if (class == p->sched_class) {
				resched_curr(rq);
				break;
			}
		}
	} else {
		/*
		 * Same class: delegate to the class's own check_preempt_curr
		 * hook (the article analyses the CFS one as its example).
		 */
		rq->curr->sched_class->check_preempt_curr(rq, p, flags);
	}

	/*
	 * A queue event has occurred and we are going to schedule, so a
	 * useless back-to-back clock update can be skipped.
	 */
	if (!task_on_rq_queued(rq->curr))
		return;
	if (test_tsk_need_resched(rq->curr))
		rq_clock_skip_update(rq);
}

/*
 * Excerpt of the fair scheduling class definition: its check_preempt_curr
 * hook is check_preempt_wakeup(). The remaining members are omitted in this
 * excerpt.
 */
const struct sched_class fair_sched_class = {
	.check_preempt_curr	= check_preempt_wakeup,
	/* ... other members omitted ... */
};

/*
 * Preempt the current task with a newly woken task if needed.
 *
 * rq:         run-queue whose current task (rq->curr) may be preempted
 * p:          the task being woken
 * wake_flags: wakeup flags; WF_FORK suppresses the next-buddy nomination
 *
 * Preemption is requested through resched_curr(rq); the function itself
 * returns nothing.
 */
static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{
	struct task_struct *curr = rq->curr;
	struct sched_entity *se = &curr->se, *pse = &p->se;
	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
	/*
	 * Meaning of scale: it is 1 when nr_running >= sched_nr_latency, i.e.
	 * when there are many tasks on the CFS run-queue and a woken task could
	 * wait a long time before being scheduled. The woken task is given a
	 * fairly small vruntime so that it runs soon, but that vruntime is not
	 * necessarily the smallest one. To keep the woken task from waiting too
	 * long, set_next_buddy() records it as the cfs queue's "next" member so
	 * that the following scheduling pass prefers it. Note that scale == 1
	 * does not by itself preempt the current task; it only sets "next".
	 * Without preemption the current task finishes its time slice and
	 * schedule() then decides whether to run the "next" entity.
	 */
	int scale = cfs_rq->nr_running >= sched_nr_latency;
	int next_buddy_marked = 0;

	/* Nothing to decide when the woken entity is the running one. */
	if (unlikely(se == pse))
		return;

	/*
	 * This is possible from callers such as attach_tasks(), in which we
	 * unconditionally check_preempt_curr() after an enqueue (which may have
	 * lead to a throttle).  This both saves work and prevents false
	 * next-buddy nomination below.
	 */
	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
		return;

	/* Nominate the woken entity as next buddy, except on the fork path. */
	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
		set_next_buddy(pse);
		next_buddy_marked = 1;
	}

	/*
	 * We can come here with TIF_NEED_RESCHED already set from new task
	 * wake up path.
	 *
	 * Note: this also catches the edge-case of curr being in a throttled
	 * group (e.g. via set_curr_task), since update_curr() (in the
	 * enqueue of curr) will have resulted in resched being set.  This
	 * prevents us from potentially nominating it as a false LAST_BUDDY
	 * below.
	 */
	/* If curr already has TIF_NEED_RESCHED set there is nothing more to do: return. */
	if (test_tsk_need_resched(curr))
		return;

	/* Idle tasks are by definition preempted by non-idle tasks. */
	/* curr has the idle policy and the woken task does not: preempt right away. */
	if (unlikely(task_has_idle_policy(curr)) &&
	    likely(!task_has_idle_policy(p)))
		goto preempt;

	/*
	 * Batch and idle tasks do not preempt non-idle tasks (their preemption
	 * is driven by the tick):
	 */
	if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
		return;

	find_matching_se(&se, &pse);
	update_curr(cfs_rq_of(se));
	BUG_ON(!pse);
	/*
	 * Both the current and the woken task are normal tasks: let
	 * wakeup_preempt_entity() decide whether curr should be preempted.
	 */
	if (wakeup_preempt_entity(se, pse) == 1) {
		/*
		 * Bias pick_next to pick the sched entity that is
		 * triggering this preemption.
		 */
		if (!next_buddy_marked)
			set_next_buddy(pse);
		goto preempt;
	}

	return;

preempt:
	/* Preempting the current task is done through resched_curr(). */
	resched_curr(rq);
	/*
	 * Only set the backward buddy when the current task is still
	 * on the rq. This can happen when a wakeup gets interleaved
	 * with schedule on the ->pre_schedule() or idle_balance()
	 * point, either of which can * drop the rq lock.
	 *
	 * Also, during early boot the idle thread is in the fair class,
	 * for obvious reasons its a bad idea to schedule back to it.
	 */
	if (unlikely(!se->on_rq || curr == rq->idle))
		return;

	/*
	 * Finally, when scale is 1, set_last_buddy() records the current task
	 * as the cfs queue's "last" member. The "last" member is favoured by
	 * the scheduler: the current task was preempted in order to compensate
	 * the woken task, so once that is done the preempted task is
	 * compensated in turn.
	 */
	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
		set_last_buddy(se);
}

/*
 * Compare the running entity curr against the woken entity se.
 *
 * Returns -1 when curr's vruntime is not larger than se's (no preemption),
 * 1 when curr leads by more than wakeup_gran(se) (se should preempt curr),
 * and 0 otherwise.
 */
static int
wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
{
	s64 vdiff = curr->vruntime - se->vruntime;
	s64 gran;

	/*
	 * curr has the smaller (or equal) vruntime, so it is the one that
	 * deserves to keep running: no preemption.
	 */
	if (vdiff <= 0)
		return -1;

	/*
	 * curr's vruntime is larger; once the gap exceeds the threshold the
	 * woken entity is compensated by being allowed to preempt curr.
	 */
	gran = wakeup_gran(se);
	return (vdiff > gran) ? 1 : 0;
}

/*
 * Wakeup-preemption threshold for se: sysctl_sched_wakeup_granularity passed
 * through calc_delta_fair() for this entity.
 */
static unsigned long wakeup_gran(struct sched_entity *se)
{
	/* Per the original note this is 1000000ns = 1ms. */
	unsigned long base_gran = sysctl_sched_wakeup_granularity;

	/* Convert the granularity into the corresponding virtual runtime. */
	return calc_delta_fair(base_gran, se);
}