Linux的进程分普通进程和实时进程,而实时进程又分SCHED_FIFO与SCHED_RR,它们都比普通进程的优先级高。
对于SCHED_FIFO进程,它会一直运行直到退出,只有在它自己阻塞(主动释放CPU)或被更高优先级的实时进程抢占时才会让出CPU。
对于SCHED_RR(时间片轮转)进程,当它的时间片用完时,内核会把它放到进程队列的末尾。
我们来看看在2.4内核中对应的调度代码:
/*
 * schedule() - scheduler entry point of the 2.4 kernel (excerpt).
 *
 * Visible steps: sanity checks, release of the kernel lock, taking
 * runqueue_lock, moving an exhausted SCHED_RR task to the back of the run
 * queue, removing prev from the run queue when it is no longer runnable,
 * and scanning the run queue for the task with the highest goodness()
 * weight.  The rest of the function is omitted from this excerpt.
 */
asmlinkage void schedule(void)
{
	struct schedule_data * sched_data;
	struct task_struct *prev, *next, *p;
	struct list_head *tmp;
	int this_cpu, c;

	spin_lock_prefetch(&runqueue_lock);

	BUG_ON(!current->active_mm);
need_resched_back:
	prev = current;			/* prev is the task calling schedule() */
	this_cpu = prev->processor;

	/* Scheduling from interrupt context is a bug: report it and stop. */
	if (unlikely(in_interrupt())) {
		printk("Scheduling in interrupt\n");
		BUG();
	}

	release_kernel_lock(prev, this_cpu);

	/*
	 * 'sched_data' is protected by the fact that we can run
	 * only one process per CPU.
	 */
	sched_data = & aligned_data[this_cpu].schedule_data;

	spin_lock_irq(&runqueue_lock);

	/*
	 * move an exhausted RR process to be last..
	 *
	 * If prev is a SCHED_RR (round-robin) task whose counter has reached
	 * zero, give it a fresh counter derived from its nice value and move
	 * it to the end of the run queue.
	 */
	if (unlikely(prev->policy == SCHED_RR)) {
		if (!prev->counter) {
			prev->counter = NICE_TO_TICKS(prev->nice);
			move_last_runqueue(prev);
		}
	}

	switch (prev->state) {
	case TASK_INTERRUPTIBLE:
		/* A pending signal makes the task runnable again. */
		if (signal_pending(prev)) {
			prev->state = TASK_RUNNING;
			break;
		}
		/* fall through */
	default:
		/* Any other non-running state: take prev off the run queue. */
		del_from_runqueue(prev);
		/* fall through */
	case TASK_RUNNING:;
	}
	prev->need_resched = 0;

	/*
	 * this is the scheduler proper:
	 */

repeat_schedule:
	/*
	 * Default process to select..
	 */
	next = idle_task(this_cpu);
	c = -1000;
	/*
	 * Walk every task on the run queue and keep the one for which
	 * goodness() returns the largest weight.
	 */
	list_for_each(tmp, &runqueue_head) {
		p = list_entry(tmp, struct task_struct, run_list);
		if (can_schedule(p, this_cpu)) {
			int weight = goodness(p, this_cpu, prev->active_mm);
			if (weight > c)
				c = weight, next = p;
		}
	}

	/* Do we need to re-calculate counters? */
	/*
	 * c == 0 means the best weight found was zero.  The article's author
	 * reads this as "every runnable task has used up its time slice".
	 * In that case the counter of every task (for_each_task) is
	 * recomputed as half its old value plus NICE_TO_TICKS(nice), with
	 * runqueue_lock dropped while tasklist_lock is held, and the
	 * selection is repeated.
	 */
	if (unlikely(!c)) {
		struct task_struct *p;

		spin_unlock_irq(&runqueue_lock);
		read_lock(&tasklist_lock);
		for_each_task(p)
			p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);
		read_unlock(&tasklist_lock);
		spin_lock_irq(&runqueue_lock);
		goto repeat_schedule;
	}

	/* ... the remainder of schedule() is omitted from this excerpt ... */
}
再看看goodness函数
/*
 * goodness() - weight used by schedule() to rank a runnable task.
 *
 * @p:        task being evaluated
 * @this_cpu: CPU on which the scheduler is running
 * @this_mm:  mm to compare p->mm against (the caller passes the active mm
 *            of the task being switched out)
 *
 * Returns:
 *   -1                     if the task has SCHED_YIELD set in its policy;
 *    0                     if it is SCHED_OTHER and its counter is zero;
 *    counter + bonuses     for any other SCHED_OTHER task;
 *    1000 + rt_priority    for every remaining (real-time) policy.
 */
static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm)
{
	int score;

	/*
	 * A yielding task gets -1.  Per the original kernel comment this
	 * selects it after every other runnable process but before the idle
	 * thread, without triggering a counter recalculation.
	 */
	if (p->policy & SCHED_YIELD)
		return -1;

	/*
	 * Anything that is not SCHED_OTHER is a real-time task.  Its weight is
	 * 1000 plus its real-time priority, which the article's author notes
	 * is far above any ordinary task, so a higher rt_priority always wins.
	 */
	if (p->policy != SCHED_OTHER)
		return 1000 + p->rt_priority;

	/*
	 * Ordinary task: first approximation is the number of clock ticks it
	 * has left.  With none left, no further calculation is done.
	 */
	score = p->counter;
	if (score == 0)
		return score;

#ifdef CONFIG_SMP
	/*
	 * Large advantage for a task associated with this processor
	 * (equivalent to penalizing other processors).
	 */
	if (p->processor == this_cpu)
		score += PROC_CHANGE_PENALTY;
#endif

	/*
	 * Slight advantage when the task has no mm or shares this_mm.  The
	 * author's reasoning: threads of the same process share an mm, so
	 * picking one avoids the cost of switching page tables.
	 */
	if (!p->mm || p->mm == this_mm)
		score += 1;

	/* Lower nice values yield a larger weight. */
	score += 20 - p->nice;
	return score;
}
通过以上代码,我们发现2.4及以前的内核有几点缺陷:
1) 每次调度时要遍历全部进程,时间复杂度为O(N)
2) 当全部进程时间片用完后,要为它们重新计算时间片
3) 在内核态不可抢占
4) 多个cpu共用一个运行队列,需要频繁的加锁,影响效率
2.6内核重写了进程调度这部分,其时间复杂度为O(1)。
先看看2.6与调度相关的几个数据结构:
/*
 * Run queue of the 2.6 scheduler, as excerpted by the article.
 * NOTE(review): the code that follows also uses fields such as idle, curr,
 * nr_switches and nr_uninterruptible, which are not shown in this excerpt.
 */
struct rq {
spinlock_t lock;
//number of runnable tasks on this processor
unsigned long nr_running;
unsigned long raw_weighted_load;
unsigned long expired_timestamp;
struct mm_struct *prev_mm;
//active: array of active tasks; expired: array of expired tasks.
//Per the author, this pair is the key to O(1) scheduling.
struct prio_array *active, *expired, arrays[2];
int best_expired_prio;
};
//One run queue (struct rq) named runqueues is defined per CPU
static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
/*
 * Priority array: one task list per priority level plus a bitmap marking
 * which lists are non-empty.
 */
struct prio_array {
unsigned int nr_active; //number of tasks currently queued in this array
DECLARE_BITMAP(bitmap, MAX_PRIO+1);//bitmap: each bit tells whether the task list of that priority level holds any task
struct list_head queue[MAX_PRIO]; //task lists, MAX_PRIO levels (140 according to the author); each task is queued on the list matching its priority
};
从上面结构来看,每个cpu有自己单独的运行队列,而每个运行队列中,把进程分为活动进程队列和
过期进程队列,每次调度时,从活动进程队列的最高优先级链表中选择第一个进程作为next。
我们来看看它是如何选择的。
我们先看prio_array中的queue[MAX_PRIO], 进程按优先级放入这个队列中,queue[0]中的全部进程其优先级为0,
其优先级最高,queue[1]中的全部进程其优先级为1, 优先级的值越小越优先运行。0~MAX_RT_PRIO-1(即0~99)为实时进程的优先级,
MAX_RT_PRIO~MAX_PRIO-1(即100~139)为普通进程的优先级。bitmap为5个32位整数,它的前140位对应140个优先级,
比如:bitmap的第5位置1,表示优先级为5的进程队列存在进程。
idx = sched_find_first_bit(array->bitmap)就是查找bitmap中第一个为1的位,那么就可以获取当前优先级最高的进程队列。
/*
 * schedule() - scheduler entry point of the 2.6 O(1) scheduler.
 *
 * Picks the next task for this CPU's run queue: if no task is runnable it
 * selects rq->idle; otherwise it swaps the active and expired arrays when
 * the active one is empty, finds the first set bit in the active bitmap,
 * and takes the first task from that priority list.  It then updates the
 * sleep_avg/timestamp bookkeeping and switches context if the chosen task
 * differs from the current one.
 */
asmlinkage void __sched schedule(void)
{
	struct task_struct *prev, *next;
	struct prio_array *array;
	struct list_head *queue;
	unsigned long long now;
	unsigned long run_time;
	int cpu, idx, new_prio;
	long *switch_count;
	struct rq *rq;

	/*
	 * Test if we are atomic. Since do_exit() needs to call into
	 * schedule() atomically, we ignore that path for now.
	 * Otherwise, whine if we are scheduling when we should not be.
	 */
	if (unlikely(in_atomic() && !current->exit_state)) {
		printk(KERN_ERR "BUG: scheduling while atomic: "
			"%s/0x%08x/%d\n",
			current->comm, preempt_count(), current->pid);
		debug_show_held_locks(current);
		if (irqs_disabled())
			print_irqtrace_events(current);
		dump_stack();
	}
	profile_hit(SCHED_PROFILING, __builtin_return_address(0));

need_resched:
	preempt_disable();
	prev = current;		/* prev is the task calling schedule() */
	release_kernel_lock(prev);
need_resched_nonpreemptible:
	rq = this_rq();		/* run queue of the current processor */

	/*
	 * The idle thread is not allowed to schedule!
	 * Remove this check after it has been exercised a bit.
	 */
	if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) {
		printk(KERN_ERR "bad: scheduling from the idle thread!\n");
		dump_stack();
	}

	schedstat_inc(rq, sched_cnt);
	now = sched_clock();
	/* run_time = time since prev->timestamp, clamped to [0, NS_MAX_SLEEP_AVG]. */
	if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) {
		run_time = now - prev->timestamp;
		if (unlikely((long long)(now - prev->timestamp) < 0))
			run_time = 0;
	} else
		run_time = NS_MAX_SLEEP_AVG;

	/*
	 * Tasks charged proportionately less run_time at high sleep_avg to
	 * delay them losing their interactive status
	 */
	run_time /= (CURRENT_BONUS(prev) ? : 1);

	spin_lock_irq(&rq->lock);

	switch_count = &prev->nivcsw;
	/* prev is not in the running state (state != 0) and this is not a kernel preemption */
	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
		switch_count = &prev->nvcsw;
		if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
				unlikely(signal_pending(prev))))
			prev->state = TASK_RUNNING;
		else {
			if (prev->state == TASK_UNINTERRUPTIBLE)
				rq->nr_uninterruptible++;
			/* prev is sleeping (interruptible or uninterruptible): take it off the run queue */
			deactivate_task(prev, rq);
		}
	}

	cpu = smp_processor_id();
	if (unlikely(!rq->nr_running)) {
		/* Nothing runnable here: try idle_balance(), else run the idle task. */
		idle_balance(cpu, rq);
		if (!rq->nr_running) {
			next = rq->idle;
			rq->expired_timestamp = 0;
			goto switch_tasks;
		}
	}

	/* active priority array of this CPU */
	array = rq->active;
	if (unlikely(!array->nr_active)) {
		/* The active array is empty: swap it with the expired array. */
		schedstat_inc(rq, sched_switch);
		rq->active = rq->expired;
		rq->expired = array;
		array = rq->active;
		rq->expired_timestamp = 0;
		rq->best_expired_prio = MAX_PRIO;
	}

	/* index of the first set bit, i.e. the highest priority with a queued task */
	idx = sched_find_first_bit(array->bitmap);
	/* task list of that priority */
	queue = array->queue + idx;
	/* first task on that list */
	next = list_entry(queue->next, struct task_struct, run_list);

	/* next is not a real-time task and its sleep_type counts as an interactive sleep */
	if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
		/*
		 * delta is the time elapsed since next->timestamp.  Author's
		 * note: the longer the task has slept, the more its priority
		 * should be raised so that it runs sooner.
		 */
		unsigned long long delta = now - next->timestamp;
		if (unlikely((long long)(now - next->timestamp) < 0))
			delta = 0;

		if (next->sleep_type == SLEEP_INTERACTIVE)
			delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;

		array = next->array;
		/* recompute its priority */
		new_prio = recalc_task_prio(next, next->timestamp + delta);

		/* if the priority changed, requeue the task under the new priority */
		if (unlikely(next->prio != new_prio)) {
			dequeue_task(next, array);
			next->prio = new_prio;
			enqueue_task(next, array);
		}
	}
	next->sleep_type = SLEEP_NORMAL;

	/* The task switch starts here. */
switch_tasks:
	if (next == rq->idle)
		schedstat_inc(rq, sched_goidle);
	prefetch(next);
	prefetch_stack(next);
	clear_tsk_need_resched(prev);
	rcu_qsctr_inc(task_cpu(prev));

	update_cpu_clock(prev, rq, now);

	/* Charge the time just run against prev's sleep_avg, never going below zero. */
	prev->sleep_avg -= run_time;
	if ((long)prev->sleep_avg <= 0)
		prev->sleep_avg = 0;
	prev->timestamp = prev->last_ran = now;	/* timestamp records when prev was switched out */

	sched_info_switch(prev, next);
	if (likely(prev != next)) {
		next->timestamp = next->last_ran = now;	/* records when next was switched in */
		rq->nr_switches++;
		rq->curr = next;
		++*switch_count;

		prepare_task_switch(rq, next);
		prev = context_switch(rq, prev, next);
		barrier();
		/*
		 * this_rq must be evaluated again because prev may have moved
		 * CPUs since it called schedule(), thus the 'rq' on its stack
		 * frame will be invalid.
		 */
		finish_task_switch(this_rq(), prev);
	} else
		spin_unlock_irq(&rq->lock);

	prev = current;
	if (unlikely(reacquire_kernel_lock(prev) < 0))
		goto need_resched_nonpreemptible;
	preempt_enable_no_resched();
	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
		goto need_resched;
}
再看看内核时钟中断发生时做了哪些操作,代码在scheduler_tick()->task_running_tick()
/*
 * task_running_tick() - per-tick time-slice accounting for task p on rq.
 * According to the surrounding article it is reached via scheduler_tick().
 *
 * A SCHED_RR task that exhausts its slice gets a new one and is moved to
 * the tail of its priority list; other real-time tasks are left alone.
 * An ordinary task that exhausts its slice is dequeued, gets a new priority
 * and slice, and is re-queued on the expired or the active array.
 */
static void task_running_tick(struct rq *rq, struct task_struct *p)
{
if (p->array != rq->active) {
/* Task has expired but was not scheduled yet */
set_tsk_need_resched(p);
return;
}
spin_lock(&rq->lock);
/*
* The task was running during this tick - update the
* time slice counter. Note: we do not update a thread's
* priority until it either goes to sleep or uses up its
* timeslice. This makes it possible for interactive tasks
* to use up their timeslices at their highest priority levels.
*/
if (rt_task(p)) {
//For a SCHED_RR task: decrement its time slice; when it reaches zero,
//compute a fresh slice. Author's note: SCHED_FIFO tasks do not use a
//time slice and simply keep running.
if ((p->policy == SCHED_RR) && !--p->time_slice) {
p->time_slice = task_timeslice(p); //recompute the time slice
p->first_time_slice = 0;
set_tsk_need_resched(p);
//move it to the tail of its list in the active array
requeue_task(p, rq->active);
}
goto out_unlock;
}
//ordinary (non-real-time) task that has just used up its time slice
if (!--p->time_slice) {
dequeue_task(p, rq->active); //remove it from the active array
set_tsk_need_resched(p);
p->prio = effective_prio(p); //recompute its priority
p->time_slice = task_timeslice(p); //recompute its time slice
p->first_time_slice = 0;
if (!rq->expired_timestamp)
rq->expired_timestamp = jiffies;
//not interactive, or expired_starving(rq) is true: move it to the expired array
if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
enqueue_task(p, rq->expired);
if (p->static_prio < rq->best_expired_prio)
rq->best_expired_prio = p->static_prio;
} else
enqueue_task(p, rq->active); //interactive task: back onto the active array, at the tail of its priority list
} else {
if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
(p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
(p->array == rq->active)) {
requeue_task(p, rq->active);//slice not used up yet but (per the author) too long: move the task to the tail of its list so other tasks get a chance to run
set_tsk_need_resched(p);
}
}
out_unlock:
spin_unlock(&rq->lock);
}
从task_running_tick我们可以看到,每个时钟周期(tick)对不同类型的进程处理方式不同:
1) 实时进程SCHED_RR: 时间片用完后, 重计算它的时间片,优先级不变,把它移到当前
优先级队列末尾。所以,如果有更高级的FIFO、RR进程,或跟它相同优先级的RR进程,
它都会被抢占(因为它在队列末尾)
2) 实时进程SCHED_FIFO: 完全不使用时间片,不修改其优先级、不修改其在队列中的位置。所以,它只会被
更高级的FIFO、RR进程抢占,跟它相同优先级的进程没有机会得到执行,除非它退出。
3)普通进程:A) 时间片用完,从活动队列中删除,重计算它的时间片,如果它是交互进程,
把它插入活动队列,给它继续运行的机会,奖励交互进程以达到更好的交互响应时间,
如果不是交互进程(或者过期队列中的进程已处于饥饿状态,即expired_starving(rq)为真),插入过期队列中。
B)时间片未用完,如果是交互进程且它的剩余时间片太长(这个还没搞清楚怎么计算的),把它移到
当前优先级队列的末尾,也就是说它要被更高优先级或跟它相同优先级的进程抢占。
遗留问题:
1) 进程阻塞(Sleep或IO操作)之后,其在active队列还是在expired队列,或者两个队列都不放?
答:进程阻塞之后,会把它的状态设为TASK_INTERRUPTIBLE,同时设置task的need_resched
标志,在系统调用结束时,检查need_resched标志后会调用schedule()做进程切换,schedule()检查
到当前进程的状态为TASK_INTERRUPTIBLE后,把它从runqueue中删除,所以进程既不在active队列中也不在
expired队列中。当阻塞结束时,会调用activate_task()把进程重新放入active队列中。
2) 进程阻塞被唤醒之后,它要被放入active队列中,它是放在它的优先级队列头还是尾?
答:会被放入active队列,进程所在优先级队列末尾,那么对于FIFO进程,一旦它阻塞再恢复
运行之后,可能被同等优先级的FIFO进程抢占。看代码
/*
 * __activate_task() - put task p on run queue rq and count it as running.
 *
 * Batch tasks are queued on the expired array; every other task goes on
 * the active array.
 */
static void __activate_task(struct task_struct *p, struct rq *rq)
{
	struct prio_array *dest;

	if (batch_task(p))
		dest = rq->expired;
	else
		dest = rq->active;

	enqueue_task(p, dest);
	inc_nr_running(p, rq);
}
/*
 * enqueue_task() - append task p to the list for p->prio inside 'array'.
 *
 * The task is added at the tail of that list, the matching bitmap bit is
 * set, the array's active count is incremented, and p->array is pointed
 * at the array.
 */
static void enqueue_task(struct task_struct *p, struct prio_array *array)
{
	struct list_head *bucket;

	sched_info_queued(p);

	/* Tail insertion: the task goes behind tasks already queued at this priority. */
	bucket = &array->queue[p->prio];
	list_add_tail(&p->run_list, bucket);

	__set_bit(p->prio, array->bitmap);
	array->nr_active += 1;
	p->array = array;
}
3) 如何判断一个进程是否是交互进程?
答:通过task->timestamp来计算,task->timestamp在三种情况下会被更新:
1. 进程换入时 2. 进程换出 3.睡眠时,所以在schedule()中,
now - prev->timestamp表示当前进程运行时间, prev为当前正在运行的进程,prev->timestamp正好是它上次换入的时间
now-next->timestamp表示next的睡眠时间,next为准备运行的进程,next->timestamp正好是next上次换出的时间