A history of Linux schedule().



Let's take it slowly~~


Linux V0.11  


Supports timers (alarm) and signals.

Flow chart:


Source code:

void schedule(void)
{
	int i,next,c;
	struct task_struct ** p;

/* check alarm, wake up any interruptible tasks that have got a signal */

	for(p = &LAST_TASK ; p > &FIRST_TASK ; --p)
		if (*p) {
			if ((*p)->alarm && (*p)->alarm < jiffies) {
				(*p)->signal |= (1<<(SIGALRM-1));
				(*p)->alarm = 0;
			}
			if (((*p)->signal & ~(_BLOCKABLE & (*p)->blocked)) &&
			    (*p)->state==TASK_INTERRUPTIBLE)
				(*p)->state=TASK_RUNNING;
		}

/* this is the scheduler proper: */

	while (1) {
		c = -1;
		next = 0;
		i = NR_TASKS;
		p = &task[NR_TASKS];
		while (--i) {
			if (!*--p)
				continue;
			if ((*p)->state == TASK_RUNNING && (*p)->counter > c)
				c = (*p)->counter, next = i;
		}
		if (c) break;
		for(p = &LAST_TASK ; p > &FIRST_TASK ; --p)
			if (*p)
				(*p)->counter = ((*p)->counter >> 1) +
						(*p)->priority;
	}
	switch_to(next);
}


Recalculating the time slice:

counter = counter / 2 + priority
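
A quick sketch (ordinary user-space C, not kernel code) of how this recurrence behaves: for a task that never gets to run, counter climbs toward 2*priority and stops there, so a starved task's claim on the CPU grows but stays bounded.

#include <stdio.h>

int main(void)
{
	int counter = 0, priority = 15;
	int i;

	for (i = 0; i < 8; i++) {
		counter = (counter >> 1) + priority;
		printf("round %d: counter = %d\n", i, counter);
	}
	/* prints 15, 22, 26, 28, 29, 29, 29, 29 -- converging
	 * just below 2*priority = 30 because of integer division */
	return 0;
}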


Related interfaces:

1. pause: temporarily gives up the CPU; the sleep is interruptible.

int sys_pause(void)
{
	current->state = TASK_INTERRUPTIBLE;
	schedule();
	return 0;
}

2. sleep_on: uninterruptible sleep, used for example while waiting for a resource.


void sleep_on(struct task_struct **p)
{
	struct task_struct *tmp;

	if (!p)
		return;
	if (current == &(init_task.task))
		panic("task[0] trying to sleep");
	tmp = *p;
	*p = current;
	current->state = TASK_UNINTERRUPTIBLE;
	schedule();
	if (tmp)
		tmp->state=0;
}

3. interruptible_sleep_on: interruptible sleep, mostly used when several processes wait for the same resource; the sleepers form a wait queue.

void interruptible_sleep_on(struct task_struct **p)
{
	struct task_struct *tmp;

	if (!p)
		return;
	if (current == &(init_task.task))
		panic("task[0] trying to sleep");
	tmp=*p;
	*p=current;
repeat:	current->state = TASK_INTERRUPTIBLE;
	schedule();
	if (*p && *p != current) {
		(**p).state=0;
		goto repeat;
	}
	*p=NULL;
	if (tmp)
		tmp->state=0;
}

4. wake_up: wakes a process regardless of its current state.

void wake_up(struct task_struct **p)
{
	if (p && *p) {
		(**p).state=0;
		*p=NULL;
	}
}


5. nice: lowers the priority, making the process more willing to yield the CPU. The increment must be strictly smaller than the current priority value.

int sys_nice(long increment)
{
    if (current->priority-increment>0)
        current->priority -= increment;
    return 0;
}


Callers of schedule() (system_call.s):

reschedule:
    pushl $ret_from_sys_call
    jmp _schedule
_system_call:
	cmpl $nr_system_calls-1,%eax    # check the system-call number
	ja bad_sys_call
	...                             # push registers
	call _sys_call_table(,%eax,4)   # call the matching handler
	pushl %eax
	movl _current,%eax              # get the current task_struct pointer
	cmpl $0,state(%eax)		# not runnable any more? go reschedule
	jne reschedule
	cmpl $0,counter(%eax)		# time slice just ran out? go reschedule
	je reschedule
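
Rendered as C (a sketch of what the assembly does, not actual kernel source):

/* after every system call returns: */
if (current->state != TASK_RUNNING)	/* blocked or stopped            */
	goto reschedule;		/* schedule(), then ret_from_sys_call */
if (current->counter == 0)		/* time slice exhausted          */
	goto reschedule;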

Related data structures:

An array of task pointers. All processes live in one fixed-size array, so the maximum number of processes the system supports is fixed.

The tasks are also chained to one another by pointers, forming a list.

struct task_struct * task[NR_TASKS] = {&(init_task.task), };


Linux V0.12

Adds support for timeouts, e.g. select can now specify a maximum wait time.

schedule() flow chart:




Source code:


void schedule(void)
{
	int i,next,c;
	struct task_struct ** p;

/* check alarm, wake up any interruptible tasks that have got a signal */

	for(p = &LAST_TASK ; p > &FIRST_TASK ; --p)
		if (*p) {
			if ((*p)->timeout && (*p)->timeout < jiffies) {
				(*p)->timeout = 0;
				if ((*p)->state == TASK_INTERRUPTIBLE)
					(*p)->state = TASK_RUNNING;
			}
			if ((*p)->alarm && (*p)->alarm < jiffies) {
				(*p)->signal |= (1<<(SIGALRM-1));
				(*p)->alarm = 0;
			}
			if (((*p)->signal & ~(_BLOCKABLE & (*p)->blocked)) &&
			(*p)->state==TASK_INTERRUPTIBLE)
				(*p)->state=TASK_RUNNING;
		}

/* this is the scheduler proper: */

	while (1) {
		c = -1;
		next = 0;
		i = NR_TASKS;
		p = &task[NR_TASKS];
		while (--i) {
			if (!*--p)
				continue;
			if ((*p)->state == TASK_RUNNING && (*p)->counter > c)
				c = (*p)->counter, next = i;
		}
		if (c) break;
		for(p = &LAST_TASK ; p > &FIRST_TASK ; --p)
			if (*p)
				(*p)->counter = ((*p)->counter >> 1) +
						(*p)->priority;
	}
	switch_to(next);
}

Related function changes:

1. Adds __sleep_on; both sleep_on and interruptible_sleep_on now call it directly.

static inline void __sleep_on(struct task_struct **p, int state)
{
	struct task_struct *tmp;

	if (!p)
		return;
	if (current == &(init_task.task))
		panic("task[0] trying to sleep");
	tmp = *p;
	*p = current;
	current->state = state;
repeat:	schedule();
	if (*p && *p != current) {
		(**p).state = 0;
		current->state = TASK_UNINTERRUPTIBLE;
		goto repeat;
	}
	if (!*p)
		printk("Warning: *P = NULL\n\r");
	if (*p = tmp)		/* intentional assignment, not a typo for '==' */
		tmp->state=0;
}

void interruptible_sleep_on(struct task_struct **p)
{
	__sleep_on(p,TASK_INTERRUPTIBLE);
}

void sleep_on(struct task_struct **p)
{
	__sleep_on(p,TASK_UNINTERRUPTIBLE);
}

2. wake_up now warns when asked to wake a stopped process or a zombie:
void wake_up(struct task_struct **p)
{
	if (p && *p) {
		if ((**p).state == TASK_STOPPED)
			printk("wake_up: TASK_STOPPED");
		if ((**p).state == TASK_ZOMBIE)
			printk("wake_up: TASK_ZOMBIE");
		(**p).state=0;
	}
}


 

Linux V0.95

Only one small change, concerning the signal blocking mask:

if (((*p)->signal & ~(_BLOCKABLE & (*p)->blocked)) &&
			(*p)->state==TASK_INTERRUPTIBLE)
				(*p)->state=TASK_RUNNING;

==>

if (((*p)->signal & ~(*p)->blocked) &&
			(*p)->state==TASK_INTERRUPTIBLE)
				(*p)->state=TASK_RUNNING;

This is because in V0.95 sys_ssetmask already strips SIGKILL and SIGSTOP from the mask:

int sys_ssetmask(int newmask)
{
	int old=current->blocked;
	current->blocked = newmask & ~(1<<(SIGKILL-1)) & ~(1<<(SIGSTOP-1));
	return old;
}

Related interface changes:

1. A process that temporarily yields the CPU now blocks the signals whose handler is SIG_IGN:

int sys_pause(void)
{
	unsigned long old_blocked;
	unsigned long mask;
	struct sigaction * sa = current->sigaction;

	old_blocked = current->blocked;
	for (mask=1 ; mask ; sa++,mask += mask)
		if (sa->sa_handler == SIG_IGN)
			current->blocked |= mask;
	current->state = TASK_INTERRUPTIBLE;
	schedule();
	current->blocked = old_blocked;
	return -EINTR;
}

2. __sleep_on now protects the flags register: it saves EFLAGS on entry, restores it on exit, and uses sti() in between, closing the race window around the queue manipulation.

static inline void __sleep_on(struct task_struct **p, int state)
{
	struct task_struct *tmp;
	unsigned int flags;

	if (!p)
		return;
	if (current == &(init_task.task))
		panic("task[0] trying to sleep");
	__asm__("pushfl ; popl %0":"=r" (flags));
	tmp = *p;
	*p = current;
	current->state = state;
/* make sure interrupts are enabled: there should be no more races here */
	sti();
repeat:	schedule();
	if (*p && *p != current) {
		current->state = TASK_UNINTERRUPTIBLE;
		(**p).state = 0;
		goto repeat;
	}
	if (*p = tmp)
		tmp->state=0;
	__asm__("pushl %0 ; popfl"::"r" (flags));
}
3. sys_nice now handles an increment larger than the current priority value: the priority is simply clamped down to 1.

int sys_nice(long increment)
{
	if (increment < 0 && !suser())
		return -EPERM;
	if (increment > current->priority)
		increment = current->priority-1;
	current->priority -= increment;
	return 0;
}


Linux V0.95a

Fixes a bug in the timeout handling: the timeout is now cleared (and the task woken) only when the task is actually in TASK_INTERRUPTIBLE, instead of being cleared unconditionally:

if ((*p)->timeout && (*p)->timeout < jiffies) {
	(*p)->timeout = 0;
	if ((*p)->state == TASK_INTERRUPTIBLE)
		(*p)->state = TASK_RUNNING;
}


==>

if ((*p)->timeout && (*p)->timeout < jiffies)
	if ((*p)->state == TASK_INTERRUPTIBLE) {
		(*p)->timeout = 0;
		(*p)->state = TASK_RUNNING;
	}

Related interface changes:

1. Adds next_wait to struct task_struct and uses it to maintain the sleep wait list; zombie and stopped processes are no longer woken.

void wake_up(struct task_struct **p)
{
	struct task_struct * wakeup_ptr, * tmp;

	if (p && *p) {
		wakeup_ptr = *p;
		*p = NULL;
		while (wakeup_ptr && wakeup_ptr != task[0]) {
			if (wakeup_ptr->state == TASK_STOPPED)
				printk("wake_up: TASK_STOPPED\n");
			else if (wakeup_ptr->state == TASK_ZOMBIE)
				printk("wake_up: TASK_ZOMBIE\n");
			else
				wakeup_ptr->state = TASK_RUNNING;
			tmp = wakeup_ptr->next_wait;
			wakeup_ptr->next_wait = task[0];
			wakeup_ptr = tmp;
		}
	}
}

static inline void __sleep_on(struct task_struct **p, int state)
{
    unsigned int flags;

    if (!p)
        return;
    if (current == task[0])
        panic("task[0] trying to sleep");
    __asm__("pushfl ; popl %0":"=r" (flags));
    current->next_wait = *p;
    task[0]->next_wait = NULL;
    *p = current;
    current->state = state;
    sti();
    schedule();
    if (current->next_wait != task[0])
        wake_up(p);
    current->next_wait = NULL;
    __asm__("pushl %0 ; popfl"::"r" (flags));
}


Linux V0.95c

No relevant changes.


Linux V0.96a

Adds higher-priority preemption:

When a newly woken process has a higher priority, a reschedule is triggered.

Implementation:


1. schedule() merely gains one line at the top:

need_resched = 0;

2. wake_up checks whether the newly woken process has a higher priority than the current one; if so, it sets need_resched = 1:

void wake_up(struct task_struct **p)
{
	struct task_struct * wakeup_ptr, * tmp;

	if (p && *p) {
		wakeup_ptr = *p;
		*p = NULL;
		while (wakeup_ptr && wakeup_ptr != task[0]) {
			if (wakeup_ptr->state == TASK_ZOMBIE)
				printk("wake_up: TASK_ZOMBIE\n");
			else if (wakeup_ptr->state != TASK_STOPPED) {
				wakeup_ptr->state = TASK_RUNNING;
				if (wakeup_ptr->counter > current->counter)
					need_resched = 1;
			}
			tmp = wakeup_ptr->next_wait;
			wakeup_ptr->next_wait = task[0];
			wakeup_ptr = tmp;
		}
	}
}

3. system_call checks need_resched and reschedules if it is non-zero:

reschedule:
	pushl $ret_from_sys_call
	jmp _schedule
.align 2
_system_call:
	pushl %eax		# save orig_eax
	SAVE_ALL
	cmpl _NR_syscalls,%eax
	jae bad_sys_call
	call _sys_call_table(,%eax,4)
	movl %eax,EAX(%esp)		# save the return value
ret_from_sys_call:
	cmpw $0x0f,CS(%esp)		# was old code segment supervisor ?
	jne 2f
	cmpw $0x17,OLDSS(%esp)		# was stack segment = 0x17 ?
	jne 2f
1:	movl _current,%eax
	cmpl _task,%eax			# task[0] cannot have signals
	je 2f
	cmpl $0,_need_resched           # if need_resched is non-zero, reschedule
	jne reschedule
	cmpl $0,state(%eax)		# state
	jne reschedule
	cmpl $0,counter(%eax)		# counter
	je reschedule

Related interface changes:

1. sys_nice is changed so that the priority can never end up as 0:

int sys_nice(long increment)
{
	if (increment < 0 && !suser())
		return -EPERM;
	if (increment >= current->priority)      // previously: increment > current->priority
		increment = current->priority-1;
	current->priority -= increment;
	return 0;
}

2. system_call is modified: if the same process is still current after scheduling, its state and time slice are not checked again.

1:	movl _current,%eax
	cmpl _task,%eax			# task[0] cannot have signals
	je 2f
	cmpl $0,_need_resched
	jne reschedule
	cmpl $0,state(%eax)		# state
	jne reschedule
	cmpl $0,counter(%eax)		# counter
	je reschedule


Linux V0.96b

Moves the timer (alarm) handling out of schedule(); do_timer now takes over this work.


Flow chart:



Source code:

void schedule(void)
{
	int i,next,c;
	struct task_struct ** p;

/* check alarm, wake up any interruptible tasks that have got a signal */

	need_resched = 0;
	for(p = &LAST_TASK ; p > &FIRST_TASK ; --p)
		if (*p) {
			if ((*p)->timeout && (*p)->timeout < jiffies)
				if ((*p)->state == TASK_INTERRUPTIBLE) {
					(*p)->timeout = 0;
					(*p)->state = TASK_RUNNING;
				}
			if (((*p)->signal & ~(*p)->blocked) &&
			(*p)->state==TASK_INTERRUPTIBLE)
				(*p)->state=TASK_RUNNING;
		}

/* this is the scheduler proper: */

	while (1) {
		c = -1;
		next = 0;
		i = NR_TASKS;
		p = &task[NR_TASKS];
		while (--i) {
			if (!*--p)
				continue;
			if ((*p)->state == TASK_RUNNING && (*p)->counter > c)
				c = (*p)->counter, next = i;
		}
		if (c) break;
		for(p = &LAST_TASK ; p > &FIRST_TASK ; --p)
			if (*p)
				(*p)->counter = ((*p)->counter >> 1) +
						(*p)->priority;
	}
	switch_to(next);
}

Linux V0.96c

No relevant changes.

Linux V0.97

Adds the wake_one_task interface and fixes a schedule() bug, so that a higher-priority process woken during scheduling can also preempt.

1. The new schedule():

void schedule(void)
{
	int i,next,c;
	struct task_struct ** p;

/* check alarm, wake up any interruptible tasks that have got a signal */

	need_resched = 0;
	for(p = &LAST_TASK ; p > &FIRST_TASK ; --p)
		if (*p) {
			if ((*p)->timeout && (*p)->timeout < jiffies)
				if ((*p)->state == TASK_INTERRUPTIBLE) {
					(*p)->timeout = 0;
				wake_one_task(*p);          // previously set (*p)->state directly
				}
			if (((*p)->signal & ~(*p)->blocked) &&
			    (*p)->state==TASK_INTERRUPTIBLE)
				wake_one_task(*p);                  // previously set (*p)->state directly
		}

/* this is the scheduler proper: */

	while (1) {
		c = -1;
		next = 0;
		i = NR_TASKS;
		p = &task[NR_TASKS];
		while (--i) {
			if (!*--p)
				continue;
			if ((*p)->state == TASK_RUNNING && (*p)->counter > c)
				c = (*p)->counter, next = i;
		}
		if (c)
			break;
		for(p = &LAST_TASK ; p > &FIRST_TASK ; --p)
			if (*p)
				(*p)->counter = ((*p)->counter >> 1) +
						(*p)->priority;
	}
	sti();            // re-enable interrupts before the switch
	switch_to(next);
}

Helper interface:

void wake_one_task(struct task_struct * p)
{
	p->state = TASK_RUNNING;
	if (p->counter > current->counter)
		need_resched = 1;
}

2. Defines wait_queue. sched.h gains add_wait_queue and remove_wait_queue, which make manipulating this circular list much cleaner.
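
The structure itself, as declared in sched.h of this era, is just a task pointer plus a next pointer:

struct wait_queue {
	struct task_struct * task;	/* the sleeping process            */
	struct wait_queue * next;	/* next entry in the circular list */
};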

Interface code:

extern inline void add_wait_queue(struct wait_queue ** p, struct wait_queue * wait)
{
	unsigned long flags;
	struct wait_queue * tmp;

	__asm__ __volatile__("pushfl ; popl %0 ; cli":"=r" (flags));
	wait->next = *p;
	tmp = wait;
	while (tmp->next)
		if ((tmp = tmp->next)->next == *p)
			break;
	*p = tmp->next = wait;
	__asm__ __volatile__("pushl %0 ; popfl"::"r" (flags));
}

extern inline void remove_wait_queue(struct wait_queue ** p, struct wait_queue * wait)
{
	unsigned long flags;
	struct wait_queue * tmp;

	__asm__ __volatile__("pushfl ; popl %0 ; cli":"=r" (flags));
	if (*p == wait)
		if ((*p = wait->next) == wait)
			*p = NULL;
	tmp = wait;
	while (tmp && tmp->next != wait)
		tmp = tmp->next;
	if (tmp)
		tmp->next = wait->next;
	wait->next = NULL;
	__asm__ __volatile__("pushl %0 ; popfl"::"r" (flags));
}

The corresponding __sleep_on was then modified accordingly:

static inline void __sleep_on(struct wait_queue **p, int state)
{
	unsigned long flags;

	if (!p)
		return;
	if (current == task[0])
		panic("task[0] trying to sleep");
	if (current->wait.next)
		printk("__sleep_on: wait->next exists\n");
	__asm__ __volatile__("pushfl ; popl %0 ; cli":"=r" (flags));
	current->state = state;
	add_wait_queue(p,&current->wait);
	sti();
	schedule();
	remove_wait_queue(p,&current->wait);
	__asm__("pushl %0 ; popfl"::"r" (flags));
}

3. wake_up also changed, but only in that the while loop became a do ... while.


Linux V1.0 


First, schedule() takes back the timer handling, making the code more complex; the flow essentially returns to that of V0.12.

Second, to generate more efficient machine code, for(;;) replaces while and goto replaces if.

Finally, it no longer iterates over the task array to visit every process; it walks the circular task list instead.

Source code:

asmlinkage void schedule(void)
{
	int c;
	struct task_struct * p;
	struct task_struct * next;
	unsigned long ticks;

/* check alarm, wake up any interruptible tasks that have got a signal */

	cli();
	ticks = itimer_ticks;
	itimer_ticks = 0;
	itimer_next = ~0;
	sti();
	need_resched = 0;
	p = &init_task;
	for (;;) {
		if ((p = p->next_task) == &init_task)
			goto confuse_gcc1;
		if (ticks && p->it_real_value) {
			if (p->it_real_value <= ticks) {
				send_sig(SIGALRM, p, 1);
				if (!p->it_real_incr) {
					p->it_real_value = 0;
					goto end_itimer;
				}
				do {
					p->it_real_value += p->it_real_incr;
				} while (p->it_real_value <= ticks);
			}
			p->it_real_value -= ticks;
			if (p->it_real_value < itimer_next)
				itimer_next = p->it_real_value;
		}
end_itimer:
		if (p->state != TASK_INTERRUPTIBLE)
			continue;
		if (p->signal & ~p->blocked) {
			p->state = TASK_RUNNING;
			continue;
		}
		if (p->timeout && p->timeout <= jiffies) {
			p->timeout = 0;
			p->state = TASK_RUNNING;
		}
	}
confuse_gcc1:

/* this is the scheduler proper: */
#if 0
	/* give processes that go to sleep a bit higher priority.. */
	/* This depends on the values for TASK_XXX */
	/* This gives smoother scheduling for some things, but */
	/* can be very unfair under some circumstances, so.. */
 	if (TASK_UNINTERRUPTIBLE >= (unsigned) current->state &&
	    current->counter < current->priority*2) {
		++current->counter;
	}
#endif
	c = -1;
	next = p = &init_task;
	for (;;) {
		if ((p = p->next_task) == &init_task)
			goto confuse_gcc2;
		if (p->state == TASK_RUNNING && p->counter > c)
			c = p->counter, next = p;
	}
confuse_gcc2:
	if (!c) {
		for_each_task(p)
			p->counter = (p->counter >> 1) + p->priority;
	}
	if(current != next)
		kstat.context_swtch++;
	switch_to(next);
	/* Now maybe reload the debug registers */
	if(current->debugreg[7]){
		loaddebug(0);
		loaddebug(1);
		loaddebug(2);
		loaddebug(3);
		loaddebug(6);
	};
}

Related function changes:


1. wake_up wakes every sleeping process on the circular list:

void wake_up(struct wait_queue **q)
{
	struct wait_queue *tmp;
	struct task_struct * p;

	if (!q || !(tmp = *q))
		return;
	do {
		if ((p = tmp->task) != NULL) {
			if ((p->state == TASK_UNINTERRUPTIBLE) ||
			    (p->state == TASK_INTERRUPTIBLE)) {
				p->state = TASK_RUNNING;
				if (p->counter > current->counter)
					need_resched = 1;
			}
		}
		if (!tmp->next) {
			printk("wait_queue is bad (eip = %08lx)\n",((unsigned long *) q)[-1]);
			printk("        q = %p\n",q);
			printk("       *q = %p\n",*q);
			printk("      tmp = %p\n",tmp);
			break;
		}
		tmp = tmp->next;
	} while (tmp != *q);
}



2. Adds wake_up_interruptible, which wakes only processes in interruptible sleep:

void wake_up_interruptible(struct wait_queue **q)
{
	struct wait_queue *tmp;
	struct task_struct * p;

	if (!q || !(tmp = *q))
		return;
	do {
		if ((p = tmp->task) != NULL) {
			if (p->state == TASK_INTERRUPTIBLE) {
				p->state = TASK_RUNNING;
				if (p->counter > current->counter)
					need_resched = 1;
			}
		}
		if (!tmp->next) {
			printk("wait_queue is bad (eip = %08lx)\n",((unsigned long *) q)[-1]);
			printk("        q = %p\n",q);
			printk("       *q = %p\n",*q);
			printk("      tmp = %p\n",tmp);
			break;
		}
		tmp = tmp->next;
	} while (tmp != *q);
}

3. To protect contended resources, the __down interface implements a counting semaphore:

void __down(struct semaphore * sem)
{
	struct wait_queue wait = { current, NULL };
	add_wait_queue(&sem->wait, &wait);
	current->state = TASK_UNINTERRUPTIBLE;
	while (sem->count <= 0) {
		schedule();
		current->state = TASK_UNINTERRUPTIBLE;
	}
	current->state = TASK_RUNNING;
	remove_wait_queue(&sem->wait, &wait);
}
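
For context, a minimal sketch of the release side under the same assumptions; the real up() of this era is an arch-specific inline in asm headers, so this is illustrative only:

/* illustrative sketch, not the kernel's arch-specific up() */
static inline void up(struct semaphore * sem)
{
	sem->count++;		/* release one unit                       */
	wake_up(&sem->wait);	/* waiters in __down() re-test sem->count */
}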

4. sys_nice now clamps the priority value to the range 1-35:

asmlinkage int sys_nice(long increment)
{
	int newprio;

	if (increment < 0 && !suser())
		return -EPERM;
	newprio = current->priority - increment;
	if (newprio < 1)
		newprio = 1;
	if (newprio > 35)
		newprio = 35;
	current->priority = newprio;
	return 0;
}


Linux  V1.1

Adds a diagnostic message for scheduling inside an interrupt:

	if (intr_count) {
		printk("Aiee: scheduling in interrupt\n");
		intr_count = 0;
	}

Linux V1.2

Removes system_call.s and adds the arch directory; the system-call entry moves to each architecture's entry.S.


 Linux V1.3

1. schedule() now runs the tq_scheduler task queue:

run_task_queue(&tq_scheduler);
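
A hedged usage sketch, assuming the task-queue API of that era (struct tq_struct plus queue_task); my_deferred_work and my_tq are hypothetical names. Work queued here runs in process context on the next schedule() call:

/* hypothetical driver snippet: defer work to the next schedule() */
static void my_deferred_work(void *data)
{
	/* runs in process context, from run_task_queue(&tq_scheduler) */
}

static struct tq_struct my_tq = {
	NULL,			/* next: linked in by queue_task() */
	0,			/* sync: 0 while not queued        */
	my_deferred_work,	/* routine to run                  */
	NULL			/* data passed to the routine      */
};

	/* in interrupt or syscall context: */
	queue_task(&my_tq, &tq_scheduler);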

2. The number of currently runnable processes is now counted on each pass:

nr_running = 0;
...
nr_running++;

Linux V2.0

Many changes. Source code first, then the explanations one by one:

asmlinkage void schedule(void)
{
	int c;
	struct task_struct * p;
	struct task_struct * prev, * next;
	unsigned long timeout = 0;
	int this_cpu=smp_processor_id();

/* check alarm, wake up any interruptible tasks that have got a signal */

	if (intr_count)
		goto scheduling_in_interrupt;

	if (bh_active & bh_mask) {
		intr_count = 1;
		do_bottom_half();         // 1. bottom-half support added
		intr_count = 0;
	}

	run_task_queue(&tq_scheduler);    

	need_resched = 0;
	prev = current;
	cli();
	/* move an exhausted RR process to be last.. */
	if (!prev->counter && prev->policy == SCHED_RR) {
		prev->counter = prev->priority;
		move_last_runqueue(prev);                   // 2. the run-queue concept appears
	}
	switch (prev->state) {
		case TASK_INTERRUPTIBLE:
			if (prev->signal & ~prev->blocked)
				goto makerunnable;
			timeout = prev->timeout;
			if (timeout && (timeout <= jiffies)) {
				prev->timeout = 0;
				timeout = 0;
		makerunnable:
				prev->state = TASK_RUNNING;
				break;
			}
		default:
			del_from_runqueue(prev);
		case TASK_RUNNING:
	}
	p = init_task.next_run;
	sti();
	
#ifdef __SMP__
	/*
	 *	This is safe as we do not permit re-entry of schedule()
	 */
	prev->processor = NO_PROC_ID;
#define idle_task (task[cpu_number_map[this_cpu]])
#else
#define idle_task (&init_task)
#endif	

/*
 * Note! there may appear new tasks on the run-queue during this, as
 * interrupts are enabled. However, they will be put on front of the
 * list, so our list starting at "p" is essentially fixed.
 */
/* this is the scheduler proper: */
	c = -1000;
	next = idle_task;
	while (p != &init_task) {
		int weight = goodness(p, prev, this_cpu);            // 3. new priority calculation
		if (weight > c)
			c = weight, next = p;
		p = p->next_run;
	}

	/* if all runnable processes have "counter == 0", re-calculate counters */
	if (!c) {
		for_each_task(p)
			p->counter = (p->counter >> 1) + p->priority;
	}
#ifdef __SMP__                                                                // 4. SMP support
	/*
	 *	Allocate process to CPU
	 */
	 
	 next->processor = this_cpu;
	 next->last_processor = this_cpu;
#endif	 
#ifdef __SMP_PROF__ 
	/* mark processor running an idle thread */
	if (0==next->pid)
		set_bit(this_cpu,&smp_idle_map);
	else
		clear_bit(this_cpu,&smp_idle_map);
#endif
	if (prev != next) {
		struct timer_list timer;

		kstat.context_swtch++;
		if (timeout) {
			init_timer(&timer);
			timer.expires = timeout;
			timer.data = (unsigned long) prev;
			timer.function = process_timeout;
			add_timer(&timer);
		}
		get_mmu_context(next);
		switch_to(prev,next);
		if (timeout)
			del_timer(&timer);
	}
	return;

scheduling_in_interrupt:
	printk("Aiee: scheduling in interrupt %p\n",
		__builtin_return_address(0));
}


1. Adds bottom-half support.

Two masks, bh_active and bh_mask, record pending soft-interrupt work; each call to schedule() drains all of it in one place. A sketch of the mechanism follows the snippet below.

	if (bh_active & bh_mask) {
		intr_count = 1;
		do_bottom_half();
		intr_count = 0;
	}
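
An illustrative sketch of the 32-slot bottom-half mechanism; names follow the bh API of that era (init_bh/mark_bh), but this is a sketch, not the kernel's exact code:

void (*bh_base[32])(void);	/* handlers registered via init_bh()      */
unsigned long bh_active;	/* bits set by mark_bh() in interrupt code */
unsigned long bh_mask;		/* bits enabled for registered slots       */

void mark_bh(int nr)		/* an interrupt handler marks its work     */
{
	bh_active |= 1UL << nr;
}

void do_bottom_half(void)	/* schedule() calls this                   */
{
	unsigned long active = bh_active & bh_mask;
	int i;

	for (i = 0; i < 32; i++)
		if (active & (1UL << i)) {
			bh_active &= ~(1UL << i);
			bh_base[i]();	/* run with interrupts enabled */
		}
}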


2. Defines the run queue (task_struct gains two pointers) and adds add_to_runqueue, del_from_runqueue and move_last_runqueue to operate on it.

Apart from the doubly linked circular-list manipulation, most of the code below is SMP support.

static inline void add_to_runqueue(struct task_struct * p)
{
#ifdef __SMP__
	int cpu=smp_processor_id();
#endif	
#if 1	/* sanity tests */
	if (p->next_run || p->prev_run) {
		printk("task already on run-queue\n");
		return;
	}
#endif
	if (p->counter > current->counter + 3)
		need_resched = 1;
	nr_running++;
	(p->prev_run = init_task.prev_run)->next_run = p;
	p->next_run = &init_task;
	init_task.prev_run = p;
#ifdef __SMP__
	/* this is safe only if called with cli()*/
	while(set_bit(31,&smp_process_available))
	{
		while(test_bit(31,&smp_process_available))
		{
			if(clear_bit(cpu,&smp_invalidate_needed))
			{
				local_flush_tlb();
				set_bit(cpu,&cpu_callin_map[0]);
			}
		}
	}
	smp_process_available++;
	clear_bit(31,&smp_process_available);
	if ((0!=p->pid) && smp_threads_ready)
	{
		int i;
		for (i=0;i<smp_num_cpus;i++)
		{
			if (0==current_set[cpu_logical_map[i]]->pid) 
			{
				smp_message_pass(cpu_logical_map[i], MSG_RESCHEDULE, 0L, 0);
				break;
			}
		}
	}
#endif
}

static inline void del_from_runqueue(struct task_struct * p)
{
	struct task_struct *next = p->next_run;
	struct task_struct *prev = p->prev_run;

#if 1	/* sanity tests */
	if (!next || !prev) {
		printk("task not on run-queue\n");
		return;
	}
#endif
	if (p == &init_task) {
		static int nr = 0;
		if (nr < 5) {
			nr++;
			printk("idle task may not sleep\n");
		}
		return;
	}
	nr_running--;
	next->prev_run = prev;
	prev->next_run = next;
	p->next_run = NULL;
	p->prev_run = NULL;
}

static inline void move_last_runqueue(struct task_struct * p)
{
	struct task_struct *next = p->next_run;
	struct task_struct *prev = p->prev_run;

	/* remove from list */
	next->prev_run = prev;
	prev->next_run = next;
	/* add back to list */
	p->next_run = &init_task;
	prev = init_task.prev_run;
	init_task.prev_run = p;
	p->prev_run = prev;
	prev->next_run = p;
}


3. Adds a new priority calculation.

Introduces the concept of scheduling policies:

/*
 * Scheduling policies
 */
#define SCHED_OTHER        0 // ordinary processes
#define SCHED_FIFO         1 // real-time: runs until it blocks or yields
#define SCHED_RR           2 // real-time: fixed time slices, round-robin


Adds the goodness() priority-calculation function:

/*
 * This is the function that decides how desirable a process is..
 * You can weigh different processes against each other depending
 * on what CPU they've run on lately etc to try to handle cache
 * and TLB miss penalties.
 *
 * Return values:
 *	 -1000: never select this
 *	     0: out of time, recalculate counters (but it might still be
 *		selected)
 *	   +ve: "goodness" value (the larger, the better)
 *	 +1000: realtime process, select this.
 */
static inline int goodness(struct task_struct * p, struct task_struct * prev, int this_cpu)
{
	int weight;

#ifdef __SMP__	
	/* We are not permitted to run a task someone else is running */
	if (p->processor != NO_PROC_ID)
		return -1000;                    // already running on another CPU: never select
#ifdef PAST_2_0		
	/* This process is locked to a processor group */
	if (p->processor_mask && !(p->processor_mask & (1<<this_cpu)))
		return -1000;                    // bound to other CPUs: not for this CPU
#endif		
#endif

	/*
	 * Realtime process, select the first one on the
	 * runqueue (taking priorities within processes
	 * into account).
	 */
	if (p->policy != SCHED_OTHER)
		return 1000 + p->rt_priority;    // real-time process: select immediately

	/*
	 * Give the process a first-approximation goodness value
	 * according to the number of clock-ticks it has left.
	 *
	 * Don't do any other calculations if the time slice is
	 * over..
	 */
	weight = p->counter;
	if (weight) {
			
#ifdef __SMP__
		/* Give a largish advantage to the same processor...   */
		/* (this is equivalent to penalizing other processors) */
		if (p->last_processor == this_cpu)
			weight += PROC_CHANGE_PENALTY;      // same CPU: cache/TLB advantage
#endif

		/* .. and a slight advantage to the current process */
		if (p == prev)
			weight += 1;                       // still the previous process: slight bonus
	}

	return weight;
}

4. SMP scheduling support.


Related interface changes:


Adds wake_up_process, built on add_to_runqueue; wake_up and wake_up_interruptible now call it instead of setting p->state = TASK_RUNNING directly.

inline void wake_up_process(struct task_struct * p)
{
	unsigned long flags;

	save_flags(flags);
	cli();
	p->state = TASK_RUNNING;
	if (!p->next_run)
		add_to_runqueue(p);
	restore_flags(flags);
}

sys_nice's policy changes: the priority value is clamped to the range 1 to DEF_PRIORITY*2. See the worked example after the code.

asmlinkage int sys_nice(int increment)
{
	unsigned long newprio;
	int increase = 0;

	newprio = increment;
	if (increment < 0) {
		if (!suser())
			return -EPERM;
		newprio = -increment;
		increase = 1;
	}
	if (newprio > 40)
		newprio = 40;
	/*
	 * do a "normalization" of the priority (traditionally
	 * unix nice values are -20..20, linux doesn't really
	 * use that kind of thing, but uses the length of the
	 * timeslice instead (default 150 msec). The rounding is
	 * why we want to avoid negative values.
	 */
	newprio = (newprio * DEF_PRIORITY + 10) / 20;
	increment = newprio;
	if (increase)
		increment = -increment;
	newprio = current->priority - increment;
	if (newprio < 1)
		newprio = 1;
	if (newprio > DEF_PRIORITY*2)
		newprio = DEF_PRIORITY*2;
	current->priority = newprio;
	return 0;
}
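
A worked example of the normalization, assuming HZ = 100 so that DEF_PRIORITY = 20 ticks, for a task currently at the default priority of 20:

nice(10): newprio = (10*20 + 10) / 20 = 10
          priority = 20 - 10 = 10           (half the default time slice)

nice(20): newprio = (20*20 + 10) / 20 = 20
          priority = 20 - 20 = 0  -> clamped up to 1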


Linux V2.1


Related function changes:

The wake-up functions change to uniformly use wait queues:

void wake_up(struct wait_queue **q)
{
    struct wait_queue *next;
    struct wait_queue *head;

    if (!q || !(next = *q))
        return;
    head = WAIT_QUEUE_HEAD(q);
    while (next != head) {
        struct task_struct *p = next->task;
        next = next->next;
        if (p != NULL) {
            if ((p->state == TASK_UNINTERRUPTIBLE) ||
                (p->state == TASK_INTERRUPTIBLE))
                wake_up_process(p);
        }
        if (!next)
            goto bad;
    }
    return;
bad:
    printk("wait_queue is bad (eip = %p)\n",
        __builtin_return_address(0));
    printk("        q = %p\n",q);
    printk("       *q = %p\n",*q);
}

void wake_up_interruptible(struct wait_queue **q)
{
    struct wait_queue *next;
    struct wait_queue *head;

    if (!q || !(next = *q))
        return;
    head = WAIT_QUEUE_HEAD(q);
    while (next != head) {
        struct task_struct *p = next->task;
        next = next->next;
        if (p != NULL) {
            if (p->state == TASK_INTERRUPTIBLE)
                wake_up_process(p);
        }
        if (!next)
            goto bad;
    }
    return;
bad:
    printk("wait_queue is bad (eip = %p)\n",
        __builtin_return_address(0));
    printk("        q = %p\n",q);
    printk("       *q = %p\n",*q);
}

The sleep_on interfaces gain atomic protection.

Fixes a bug in sys_nice (newprio is unsigned, so the comparison against 1 must be done as signed):

    if (newprio < 1)

==>

	if ((signed) newprio < 1)

Linux V2.2 

Substantial changes.


1. More SMP support.

2. Each queue operation is now protected by locks (spinlocks).


Source code:

asmlinkage void schedule(void)
{
	struct schedule_data * sched_data;
	struct task_struct * prev, * next;
	int this_cpu;

	prev = current;
	this_cpu = prev->processor;
	/*
	 * 'sched_data' is protected by the fact that we can run
	 * only one process per CPU.
	 */
	sched_data = & aligned_data[this_cpu].schedule_data;

	if (in_interrupt())
		goto scheduling_in_interrupt;
	release_kernel_lock(prev, this_cpu);

	/* Do "administrative" work here while we don't hold any locks */
	if (bh_active & bh_mask)
		do_bottom_half();
	run_task_queue(&tq_scheduler);

	spin_lock(&scheduler_lock);
	spin_lock_irq(&runqueue_lock);

	/* move an exhausted RR process to be last.. */
	prev->need_resched = 0;

	if (!prev->counter && prev->policy == SCHED_RR) {
		prev->counter = prev->priority;
		move_last_runqueue(prev);
	}

	switch (prev->state) {
		case TASK_INTERRUPTIBLE:
			if (signal_pending(prev)) {
				prev->state = TASK_RUNNING;
				break;
			}
		default:
			del_from_runqueue(prev);
		case TASK_RUNNING:
	}

	sched_data->prevstate = prev->state;

	{
		struct task_struct * p = init_task.next_run;
		/*
		 * This is subtle.
		 * Note how we can enable interrupts here, even
		 * though interrupts can add processes to the run-
		 * queue. This is because any new processes will
		 * be added to the front of the queue, so "p" above
		 * is a safe starting point.
		 * run-queue deletion and re-ordering is protected by
		 * the scheduler lock
		 */
		spin_unlock_irq(&runqueue_lock);
#ifdef __SMP__
		prev->has_cpu = 0;
#endif
	
/*
 * Note! there may appear new tasks on the run-queue during this, as
 * interrupts are enabled. However, they will be put on front of the
 * list, so our list starting at "p" is essentially fixed.
 */
/* this is the scheduler proper: */
		{
			int c = -1000;
			next = idle_task;
			while (p != &init_task) {
				if (can_schedule(p)) {
					int weight = goodness(p, prev, this_cpu);
					if (weight > c)
						c = weight, next = p;
				}
				p = p->next_run;
			}

			/* Do we need to re-calculate counters? */
			if (!c) {
				struct task_struct *p;
				read_lock(&tasklist_lock);
				for_each_task(p)
					p->counter = (p->counter >> 1) + p->priority;
				read_unlock(&tasklist_lock);
			}
		}
	}

 	/*
 	 * maintain the per-process 'average timeslice' value.
 	 * (this has to be recalculated even if we reschedule to
 	 * the same process) Currently this is only used on SMP:
 	 */
#ifdef __SMP__
	{
		cycles_t t, this_slice;

		t = get_cycles();
		this_slice = t - sched_data->last_schedule;
		sched_data->last_schedule = t;

		/*
		 * Simple, exponentially fading average calculation:
		 */
		prev->avg_slice = this_slice + prev->avg_slice;
		prev->avg_slice >>= 1;
	}

	/*
	 * We drop the scheduler lock early (it's a global spinlock),
	 * thus we have to lock the previous process from getting
	 * rescheduled during switch_to().
	 */
	prev->has_cpu = 1;

 	next->has_cpu = 1;
 	next->processor = this_cpu;
	spin_unlock(&scheduler_lock);
#endif /* __SMP__ */
 	if (prev != next) {
#ifdef __SMP__
		sched_data->prev = prev;
#endif
	 	kstat.context_swtch++;
		get_mmu_context(next);
		switch_to(prev,next);

		__schedule_tail();
	}
  
	reacquire_kernel_lock(current);
	return;

scheduling_in_interrupt:
	printk("Scheduling in interrupt\n");
	*(int *)0 = 0;
}

Related interface changes:

Adds reschedule_idle, which does more work between waking a process and deciding to reschedule:

void wake_up_process(struct task_struct * p)
{
	unsigned long flags;

	spin_lock_irqsave(&runqueue_lock, flags);
	p->state = TASK_RUNNING;
	if (!p->next_run) {
		add_to_runqueue(p);
		reschedule_idle(p);
	}
	spin_unlock_irqrestore(&runqueue_lock, flags);
}
This is mainly SMP support. "Higher priority" also gets a concrete threshold: the woken process's counter must exceed the current process's by more than 3.

static inline void reschedule_idle(struct task_struct * p)
{

	if (p->policy != SCHED_OTHER || p->counter > current->counter + 3) {
		current->need_resched = 1;
		return;
	}

#ifdef __SMP__
	/*
	 * ("wakeup()" should not be called before we've initialized
	 * SMP completely.
	 * Basically a not-yet initialized SMP subsystem can be
	 * considered as a not-yet working scheduler, simply dont use
	 * it before it's up and running ...)
	 *
	 * SMP rescheduling is done in 2 passes:
	 *  - pass #1: faster: 'quick decisions'
	 *  - pass #2: slower: 'lets try and find another CPU'
	 */

	/*
	 * Pass #1
	 *
	 * There are two metrics here:
	 *
	 * first, a 'cutoff' interval, currently 0-200 usecs on
	 * x86 CPUs, depending on the size of the 'SMP-local cache'.
	 * If the current process has longer average timeslices than
	 * this, then we utilize the idle CPU.
	 *
	 * second, if the wakeup comes from a process context,
	 * then the two processes are 'related'. (they form a
	 * 'gang')
	 *
	 * An idle CPU is almost always a bad thing, thus we skip
	 * the idle-CPU utilization only if both these conditions
	 * are true. (ie. a 'process-gang' rescheduling with rather
	 * high frequency should stay on the same CPU).
	 *
	 * [We can switch to something more finegrained in 2.3.]
	 */
	if ((current->avg_slice < cacheflush_time) && related(current, p))
		return;

	reschedule_idle_slow(p);
#endif /* __SMP__ */
} 


Adds new sleep interfaces that support sleeping with a timeout:

signed long schedule_timeout(signed long timeout)
{
    struct timer_list timer;
    unsigned long expire;

    switch (timeout)
    {
    case MAX_SCHEDULE_TIMEOUT:
        /*
         * These two special cases are useful to be comfortable
         * in the caller. Nothing more. We could take
         * MAX_SCHEDULE_TIMEOUT from one of the negative value
         * but I' d like to return a valid offset (>=0) to allow
         * the caller to do everything it want with the retval.
         */
        schedule();
        goto out;
    default:
        /*
         * Another bit of PARANOID. Note that the retval will be
         * 0 since no piece of kernel is supposed to do a check
         * for a negative retval of schedule_timeout() (since it
         * should never happens anyway). You just have the printk()
         * that will tell you if something is gone wrong and where.
         */
        if (timeout < 0)
        {
            printk(KERN_ERR "schedule_timeout: wrong timeout "
                   "value %lx from %p\n", timeout,
                   __builtin_return_address(0));
            goto out;
        }
    }

    expire = timeout + jiffies;

    init_timer(&timer);
    timer.expires = expire;
    timer.data = (unsigned long) current;
    timer.function = process_timeout;

    add_timer(&timer);
    schedule();
    del_timer(&timer);

    timeout = expire - jiffies;

 out:
    return timeout < 0 ? 0 : timeout;
}
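
Typical usage is a sketch like the following: the caller sets its own state first, then calls schedule_timeout(); the return value is the number of ticks left if it was woken early.

/* sketch: sleep for up to one second, or until woken */
signed long remaining;

current->state = TASK_INTERRUPTIBLE;
remaining = schedule_timeout(HZ);	/* HZ ticks == 1 second */
if (remaining > 0) {
	/* woken early: 'remaining' ticks had not yet elapsed */
}

The per-CPU scheduling data and the __schedule_tail helper that appear alongside it: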

/*
 * This one aligns per-CPU data on cacheline boundaries.
 */
static union {
    struct schedule_data {
        struct task_struct * prev;
        long prevstate;
        cycles_t last_schedule;
    } schedule_data;
    char __pad [L1_CACHE_BYTES];
} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}};


static inline void __schedule_tail (void)
{
#ifdef __SMP__
    struct schedule_data * sched_data;

    /*
     * We might have switched CPUs:
     */
    sched_data = & aligned_data[smp_processor_id()].schedule_data;

    /*
     * Subtle. In the rare event that we got a wakeup to 'prev' just
     * during the reschedule (this is possible, the scheduler is pretty
     * parallel), we should do another reschedule in the next task's
     * context. schedule() will do the right thing next time around.
     * this is equivalent to 'delaying' the wakeup until the reschedule
     * has finished.
     */
    if (sched_data->prev->state != sched_data->prevstate)
        current->need_resched = 1;

    /*
     * Release the previous process ...
     *
     * We have dropped all locks, and we must make sure that we
     * only mark the previous process as no longer having a CPU
     * after all other state has been seen by other CPU's. Thus
     * the write memory barrier!
     */
    wmb();
    sched_data->prev->has_cpu = 0;
#endif /* __SMP__ */
}


void interruptible_sleep_on(struct wait_queue **p)
{
	SLEEP_ON_VAR

	current->state = TASK_INTERRUPTIBLE;

	SLEEP_ON_HEAD
	schedule();
	SLEEP_ON_TAIL
}

long interruptible_sleep_on_timeout(struct wait_queue **p, long timeout)
{
	SLEEP_ON_VAR

	current->state = TASK_INTERRUPTIBLE;

	SLEEP_ON_HEAD
	timeout = schedule_timeout(timeout);
	SLEEP_ON_TAIL

	return timeout;
}

void sleep_on(struct wait_queue **p)
{
	SLEEP_ON_VAR
	
	current->state = TASK_UNINTERRUPTIBLE;

	SLEEP_ON_HEAD
	schedule();
	SLEEP_ON_TAIL
}

long sleep_on_timeout(struct wait_queue **p, long timeout)
{
	SLEEP_ON_VAR
	
	current->state = TASK_UNINTERRUPTIBLE;

	SLEEP_ON_HEAD
	timeout = schedule_timeout(timeout);
	SLEEP_ON_TAIL

	return timeout;
}



Linux V2.3


1. More SMP changes, not discussed here.

2. The logic of schedule() itself barely changes, but goto now replaces many of the original if { ... } blocks.

3. Heavier use of locking.

Source code:

asmlinkage void schedule(void)
{
	struct schedule_data * sched_data;
	struct task_struct *prev, *next, *p;
	int this_cpu, c;

	if (tq_scheduler)
		goto handle_tq_scheduler;
tq_scheduler_back:

	prev = current;
	this_cpu = prev->processor;

	if (in_interrupt())
		goto scheduling_in_interrupt;

	release_kernel_lock(prev, this_cpu);

	/* Do "administrative" work here while we don't hold any locks */
	if (bh_mask & bh_active)
		goto handle_bh;
handle_bh_back:

	/*
	 * 'sched_data' is protected by the fact that we can run
	 * only one process per CPU.
	 */
	sched_data = & aligned_data[this_cpu].schedule_data;

	spin_lock_irq(&runqueue_lock);

	/* move an exhausted RR process to be last.. */
	if (prev->policy == SCHED_RR)
		goto move_rr_last;
move_rr_back:

	switch (prev->state) {
		case TASK_INTERRUPTIBLE:
			if (signal_pending(prev)) {
				prev->state = TASK_RUNNING;
				break;
			}
		default:
			del_from_runqueue(prev);
		case TASK_RUNNING:
	}
	prev->need_resched = 0;

repeat_schedule:

	/*
	 * this is the scheduler proper:
	 */

	p = init_task.next_run;
	/* Default process to select.. */
	next = idle_task(this_cpu);
	c = -1000;
	if (prev->state == TASK_RUNNING)
		goto still_running;
still_running_back:

	/*
	 * This is subtle.
	 * Note how we can enable interrupts here, even
	 * though interrupts can add processes to the run-
	 * queue. This is because any new processes will
	 * be added to the front of the queue, so "p" above
	 * is a safe starting point.
	 * run-queue deletion and re-ordering is protected by
	 * the scheduler lock
	 */
/*
 * Note! there may appear new tasks on the run-queue during this, as
 * interrupts are enabled. However, they will be put on front of the
 * list, so our list starting at "p" is essentially fixed.
 */
	while (p != &init_task) {
		if (can_schedule(p)) {
			int weight = goodness(prev, p, this_cpu);
			if (weight > c)
				c = weight, next = p;
		}
		p = p->next_run;
	}

	/* Do we need to re-calculate counters? */
	if (!c)
		goto recalculate;
	/*
	 * from this point on nothing can prevent us from
	 * switching to the next task, save this fact in
	 * sched_data.
	 */
	sched_data->curr = next;
#ifdef __SMP__
 	next->has_cpu = 1;
	next->processor = this_cpu;
#endif
	spin_unlock_irq(&runqueue_lock);

	if (prev == next)
		goto same_process;

#ifdef __SMP__
 	/*
 	 * maintain the per-process 'average timeslice' value.
 	 * (this has to be recalculated even if we reschedule to
 	 * the same process) Currently this is only used on SMP,
	 * and it's approximate, so we do not have to maintain
	 * it while holding the runqueue spinlock.
 	 */
	{
		cycles_t t, this_slice;

		t = get_cycles();
		this_slice = t - sched_data->last_schedule;
		sched_data->last_schedule = t;

		/*
		 * Exponentially fading average calculation, with
		 * some weight so it doesnt get fooled easily by
		 * smaller irregularities.
		 */
		prev->avg_slice = (this_slice*1 + prev->avg_slice*1)/2;
	}

	/*
	 * We drop the scheduler lock early (it's a global spinlock),
	 * thus we have to lock the previous process from getting
	 * rescheduled during switch_to().
	 */

#endif /* __SMP__ */

	kstat.context_swtch++;
	get_mmu_context(next);
	switch_to(prev, next, prev);
	__schedule_tail(prev);

same_process:
  
	reacquire_kernel_lock(current);
	return;

recalculate:
	{
		struct task_struct *p;
		spin_unlock_irq(&runqueue_lock);
		read_lock(&tasklist_lock);
		for_each_task(p)
			p->counter = (p->counter >> 1) + p->priority;
		read_unlock(&tasklist_lock);
		spin_lock_irq(&runqueue_lock);
		goto repeat_schedule;
	}

still_running:
	c = prev_goodness(prev, prev, this_cpu);
	next = prev;
	goto still_running_back;

handle_bh:
	do_bottom_half();
	goto handle_bh_back;

handle_tq_scheduler:
	run_task_queue(&tq_scheduler);
	goto tq_scheduler_back;

move_rr_last:
	if (!prev->counter) {
		prev->counter = prev->priority;
		move_last_runqueue(prev);
	}
	goto move_rr_back;

scheduling_in_interrupt:
	printk("Scheduling in interrupt\n");
	*(int *)0 = 0;
	return;
}

Related function changes:

The wake-up function:

void __wake_up(struct wait_queue **q, unsigned int mode)
{
	struct task_struct *p;
	struct wait_queue *head, *next;

        if (!q)
		goto out;
	/*
	 * this is safe to be done before the check because it
	 * means no deference, just pointer operations.
	 */
	head = WAIT_QUEUE_HEAD(q);

	read_lock(&waitqueue_lock);
	next = *q;
	if (!next)
		goto out_unlock;

	while (next != head) {
		p = next->task;
		next = next->next;
		if (p->state & mode) {
			/*
			 * We can drop the read-lock early if this
			 * is the only/last process.
			 */
			if (next == head) {
				read_unlock(&waitqueue_lock);
				wake_up_process(p);
				goto out;
			}
			wake_up_process(p);
		}
	}
out_unlock:
	read_unlock(&waitqueue_lock);
out:
	return;
}

__schedule_tail now also calls reschedule_idle, again for SMP support:

/*
 * schedule_tail() is getting called from the fork return path. This
 * cleans up all remaining scheduler things, without impacting the
 * common case.
 */
static inline void __schedule_tail (struct task_struct *prev)
{
#ifdef __SMP__
	if ((prev->state == TASK_RUNNING) &&
			(prev != idle_task(smp_processor_id())))
		reschedule_idle(prev);
	wmb();
	prev->has_cpu = 0;
#endif /* __SMP__ */
}

void schedule_tail (struct task_struct *prev)
{
	__schedule_tail(prev);
}


Linux V2.4


1. schedule() now asserts that the current process has a valid memory context:

	if (!current->active_mm) BUG();

2. Adds the prepare_to_switch part, including lazy-TLB handling: a kernel thread (next->mm == NULL) borrows the previous task's active_mm instead of switching page tables:

	prepare_to_switch();
	{
		struct mm_struct *mm = next->mm;
		struct mm_struct *oldmm = prev->active_mm;
		if (!mm) {
			if (next->active_mm) BUG();
			next->active_mm = oldmm;
			atomic_inc(&oldmm->mm_count);
			enter_lazy_tlb(oldmm, next, this_cpu);
		} else {
			if (next->active_mm != mm) BUG();
			switch_mm(oldmm, mm, next, this_cpu);
		}

		if (!prev->mm) {
			prev->active_mm = NULL;
			mmdrop(oldmm);
		}
	}

schedule() source:

asmlinkage void schedule(void)
{
	struct schedule_data * sched_data;
	struct task_struct *prev, *next, *p;
	struct list_head *tmp;
	int this_cpu, c;

	if (!current->active_mm) BUG();
need_resched_back:
	prev = current;
	this_cpu = prev->processor;

	if (in_interrupt())
		goto scheduling_in_interrupt;

	release_kernel_lock(prev, this_cpu);

	/* Do "administrative" work here while we don't hold any locks */
	if (softirq_active(this_cpu) & softirq_mask(this_cpu))
		goto handle_softirq;
handle_softirq_back:

	/*
	 * 'sched_data' is protected by the fact that we can run
	 * only one process per CPU.
	 */
	sched_data = & aligned_data[this_cpu].schedule_data;

	spin_lock_irq(&runqueue_lock);

	/* move an exhausted RR process to be last.. */
	if (prev->policy == SCHED_RR)
		goto move_rr_last;
move_rr_back:

	switch (prev->state) {
		case TASK_INTERRUPTIBLE:
			if (signal_pending(prev)) {
				prev->state = TASK_RUNNING;
				break;
			}
		default:
			del_from_runqueue(prev);
		case TASK_RUNNING:
	}
	prev->need_resched = 0;

	/*
	 * this is the scheduler proper:
	 */

repeat_schedule:
	/*
	 * Default process to select..
	 */
	next = idle_task(this_cpu);
	c = -1000;
	if (prev->state == TASK_RUNNING)
		goto still_running;

still_running_back:
	list_for_each(tmp, &runqueue_head) {
		p = list_entry(tmp, struct task_struct, run_list);
		if (can_schedule(p, this_cpu)) {
			int weight = goodness(p, this_cpu, prev->active_mm);
			if (weight > c)
				c = weight, next = p;
		}
	}

	/* Do we need to re-calculate counters? */
	if (!c)
		goto recalculate;
	/*
	 * from this point on nothing can prevent us from
	 * switching to the next task, save this fact in
	 * sched_data.
	 */
	sched_data->curr = next;
#ifdef CONFIG_SMP
 	next->has_cpu = 1;
	next->processor = this_cpu;
#endif
	spin_unlock_irq(&runqueue_lock);

	if (prev == next)
		goto same_process;

#ifdef CONFIG_SMP
 	/*
 	 * maintain the per-process 'last schedule' value.
 	 * (this has to be recalculated even if we reschedule to
 	 * the same process) Currently this is only used on SMP,
	 * and it's approximate, so we do not have to maintain
	 * it while holding the runqueue spinlock.
 	 */
 	sched_data->last_schedule = get_cycles();

	/*
	 * We drop the scheduler lock early (it's a global spinlock),
	 * thus we have to lock the previous process from getting
	 * rescheduled during switch_to().
	 */

#endif /* CONFIG_SMP */

	kstat.context_swtch++;
	/*
	 * there are 3 processes which are affected by a context switch:
	 *
	 * prev == .... ==> (last => next)
	 *
	 * It's the 'much more previous' 'prev' that is on next's stack,
	 * but prev is set to (the just run) 'last' process by switch_to().
	 * This might sound slightly confusing but makes tons of sense.
	 */
	prepare_to_switch();
	{
		struct mm_struct *mm = next->mm;
		struct mm_struct *oldmm = prev->active_mm;
		if (!mm) {
			if (next->active_mm) BUG();
			next->active_mm = oldmm;
			atomic_inc(&oldmm->mm_count);
			enter_lazy_tlb(oldmm, next, this_cpu);
		} else {
			if (next->active_mm != mm) BUG();
			switch_mm(oldmm, mm, next, this_cpu);
		}

		if (!prev->mm) {
			prev->active_mm = NULL;
			mmdrop(oldmm);
		}
	}

	/*
	 * This just switches the register state and the
	 * stack.
	 */
	switch_to(prev, next, prev);
	__schedule_tail(prev);

same_process:
	reacquire_kernel_lock(current);
	if (current->need_resched)
		goto need_resched_back;

	return;

recalculate:
	{
		struct task_struct *p;
		spin_unlock_irq(&runqueue_lock);
		read_lock(&tasklist_lock);
		for_each_task(p)
			p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);
		read_unlock(&tasklist_lock);
		spin_lock_irq(&runqueue_lock);
	}
	goto repeat_schedule;

still_running:
	c = goodness(prev, this_cpu, prev->active_mm);
	next = prev;
	goto still_running_back;

handle_softirq:
	do_softirq();
	goto handle_softirq_back;

move_rr_last:
	if (!prev->counter) {
		prev->counter = NICE_TO_TICKS(prev->nice);
		move_last_runqueue(prev);
	}
	goto move_rr_back;

scheduling_in_interrupt:
	printk("Scheduling in interrupt\n");
	BUG();
	return;
}


Related code changes:

1. The run-queue operations change: the doubly linked list manipulation is abstracted into list.h, which provides generic interfaces. A condensed sketch of those primitives follows the examples.

Usage examples:

static inline void add_to_runqueue(struct task_struct * p)
{
	list_add(&p->run_list, &runqueue_head);
	nr_running++;
}

static inline void move_last_runqueue(struct task_struct * p)
{
	list_del(&p->run_list);
	list_add_tail(&p->run_list, &runqueue_head);
}

static inline void move_first_runqueue(struct task_struct * p)
{
	list_del(&p->run_list);
	list_add(&p->run_list, &runqueue_head);
}
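
The idea behind list.h, in a condensed sketch: the list_head is embedded inside the object, and list_entry() recovers the containing structure from a node pointer (this is how schedule() above turns a run_list node back into a task_struct).

/* condensed sketch of the list.h primitives used above */
struct list_head {
	struct list_head *next, *prev;
};

/* recover the enclosing structure from an embedded list_head */
#define list_entry(ptr, type, member) \
	((type *)((char *)(ptr) - (unsigned long)(&((type *)0)->member)))

/* iterate over every node hanging off 'head' */
#define list_for_each(pos, head) \
	for (pos = (head)->next; pos != (head); pos = pos->next)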

2. reschedule_idle does more work for SMP.

3. Adds a synchronous wake-up interface that skips the reschedule check after the wakeup:

static inline void wake_up_process_synchronous(struct task_struct * p)
{
	unsigned long flags;

	/*
	 * We want the common case fall through straight, thus the goto.
	 */
	spin_lock_irqsave(&runqueue_lock, flags);
	p->state = TASK_RUNNING;
	if (task_on_runqueue(p))
		goto out;
	add_to_runqueue(p);
out:
	spin_unlock_irqrestore(&runqueue_lock, flags);
}



4. Adds a family of wake-up interfaces that, depending on the mode, can choose the CPU and the wake-up style:

static inline void __wake_up_common (wait_queue_head_t *q, unsigned int mode,
                     unsigned int wq_mode, const int sync)
{
    struct list_head *tmp, *head;
    struct task_struct *p, *best_exclusive;
    unsigned long flags;
    int best_cpu, irq;

    if (!q)
        goto out;

    best_cpu = smp_processor_id();
    irq = in_interrupt();
    best_exclusive = NULL;
    wq_write_lock_irqsave(&q->lock, flags);

#if WAITQUEUE_DEBUG
    CHECK_MAGIC_WQHEAD(q);
#endif

    head = &q->task_list;
#if WAITQUEUE_DEBUG
        if (!head->next || !head->prev)
                WQ_BUG();
#endif
    tmp = head->next;
    while (tmp != head) {
        unsigned int state;
                wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);

        tmp = tmp->next;

#if WAITQUEUE_DEBUG
        CHECK_MAGIC(curr->__magic);
#endif
        p = curr->task;
        state = p->state;
        if (state & mode) {
#if WAITQUEUE_DEBUG
            curr->__waker = (long)__builtin_return_address(0);
#endif
            /*
             * If waking up from an interrupt context then
             * prefer processes which are affine to this
             * CPU.
             */
            if (irq && (curr->flags & wq_mode & WQ_FLAG_EXCLUSIVE)) {
                if (!best_exclusive)
                    best_exclusive = p;
                if (p->processor == best_cpu) {
                    best_exclusive = p;
                    break;
                }
            } else {
                if (sync)
                    wake_up_process_synchronous(p);
                else
                    wake_up_process(p);
                if (curr->flags & wq_mode & WQ_FLAG_EXCLUSIVE)
                    break;
            }
        }
    }
    if (best_exclusive) {
        if (sync)
            wake_up_process_synchronous(best_exclusive);
        else
            wake_up_process(best_exclusive);
    }
    wq_write_unlock_irqrestore(&q->lock, flags);
out:
    return;
}

void __wake_up(wait_queue_head_t *q, unsigned int mode, unsigned int wq_mode)
{
    __wake_up_common(q, mode, wq_mode, 0);
}

void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, unsigned int wq_mode)
{
    __wake_up_common(q, mode, wq_mode, 1);
}


Linux V2.5

1. goto is no longer used heavily; instead, if conditions are annotated with unlikely().
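
likely()/unlikely() wrap gcc's __builtin_expect so the compiler places the expected path on the straight-line (fall-through) side; roughly:

/* roughly as defined in the kernel headers of this era */
#define likely(x)	__builtin_expect(!!(x), 1)
#define unlikely(x)	__builtin_expect(!!(x), 0)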

Source code:

asmlinkage void schedule(void)
{
	struct schedule_data * sched_data;
	struct task_struct *prev, *next, *p;
	struct list_head *tmp;
	int this_cpu, c;


	spin_lock_prefetch(&runqueue_lock);

	if (!current->active_mm) BUG();
need_resched_back:
	prev = current;
	this_cpu = prev->processor;

	if (unlikely(in_interrupt())) {
		printk("Scheduling in interrupt\n");
		BUG();
	}

	release_kernel_lock(prev, this_cpu);

	/*
	 * 'sched_data' is protected by the fact that we can run
	 * only one process per CPU.
	 */
	sched_data = & aligned_data[this_cpu].schedule_data;

	spin_lock_irq(&runqueue_lock);

	/* move an exhausted RR process to be last.. */
	if (unlikely(prev->policy == SCHED_RR))
		if (!prev->counter) {
			prev->counter = NICE_TO_TICKS(prev->nice);
			move_last_runqueue(prev);
		}

	switch (prev->state) {
		case TASK_INTERRUPTIBLE:
			if (signal_pending(prev)) {
				prev->state = TASK_RUNNING;
				break;
			}
		default:
			del_from_runqueue(prev);
		case TASK_RUNNING:;
	}
	prev->need_resched = 0;

	/*
	 * this is the scheduler proper:
	 */

repeat_schedule:
	/*
	 * Default process to select..
	 */
	next = idle_task(this_cpu);
	c = -1000;
	list_for_each(tmp, &runqueue_head) {
		p = list_entry(tmp, struct task_struct, run_list);
		if (can_schedule(p, this_cpu)) {
			int weight = goodness(p, this_cpu, prev->active_mm);
			if (weight > c)
				c = weight, next = p;
		}
	}

	/* Do we need to re-calculate counters? */
	if (unlikely(!c)) {
		struct task_struct *p;

		spin_unlock_irq(&runqueue_lock);
		read_lock(&tasklist_lock);
		for_each_task(p)
			p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);
		read_unlock(&tasklist_lock);
		spin_lock_irq(&runqueue_lock);
		goto repeat_schedule;
	}

	/*
	 * from this point on nothing can prevent us from
	 * switching to the next task, save this fact in
	 * sched_data.
	 */
	sched_data->curr = next;
	task_set_cpu(next, this_cpu);
	spin_unlock_irq(&runqueue_lock);

	if (unlikely(prev == next)) {
		/* We won't go through the normal tail, so do this by hand */
		prev->policy &= ~SCHED_YIELD;
		goto same_process;
	}

#ifdef CONFIG_SMP
 	/*
 	 * maintain the per-process 'last schedule' value.
 	 * (this has to be recalculated even if we reschedule to
 	 * the same process) Currently this is only used on SMP,
	 * and it's approximate, so we do not have to maintain
	 * it while holding the runqueue spinlock.
 	 */
 	sched_data->last_schedule = get_cycles();

	/*
	 * We drop the scheduler lock early (it's a global spinlock),
	 * thus we have to lock the previous process from getting
	 * rescheduled during switch_to().
	 */

#endif /* CONFIG_SMP */

	kstat.context_swtch++;
	/*
	 * there are 3 processes which are affected by a context switch:
	 *
	 * prev == .... ==> (last => next)
	 *
	 * It's the 'much more previous' 'prev' that is on next's stack,
	 * but prev is set to (the just run) 'last' process by switch_to().
	 * This might sound slightly confusing but makes tons of sense.
	 */
	prepare_to_switch();
	{
		struct mm_struct *mm = next->mm;
		struct mm_struct *oldmm = prev->active_mm;
		if (!mm) {
			if (next->active_mm) BUG();
			next->active_mm = oldmm;
			atomic_inc(&oldmm->mm_count);
			enter_lazy_tlb(oldmm, next, this_cpu);
		} else {
			if (next->active_mm != mm) BUG();
			switch_mm(oldmm, mm, next, this_cpu);
		}

		if (!prev->mm) {
			prev->active_mm = NULL;
			mmdrop(oldmm);
		}
	}

	/*
	 * This just switches the register state and the
	 * stack.
	 */
	switch_to(prev, next, prev);
	__schedule_tail(prev);

same_process:
	reacquire_kernel_lock(current);
	if (current->need_resched)
		goto need_resched_back;
	return;
}

Related function changes:

1. The wake-up functions are consolidated into try_to_wake_up, with parameter-validity checks added:

static inline int try_to_wake_up(struct task_struct * p, int synchronous)
{
	unsigned long flags;
	int success = 0;

	/*
	 * We want the common case fall through straight, thus the goto.
	 */
	spin_lock_irqsave(&runqueue_lock, flags);
	p->state = TASK_RUNNING;
	if (task_on_runqueue(p))
		goto out;
	add_to_runqueue(p);
	if (!synchronous || !(p->cpus_allowed & (1 << smp_processor_id())))
		reschedule_idle(p);
	success = 1;
out:
	spin_unlock_irqrestore(&runqueue_lock, flags);
	return success;
}

inline int wake_up_process(struct task_struct * p)
{
	return try_to_wake_up(p, 0);
}

void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr)
{
	if (q) {
		unsigned long flags;
		wq_read_lock_irqsave(&q->lock, flags);
		__wake_up_common(q, mode, nr, 0);
		wq_read_unlock_irqrestore(&q->lock, flags);
	}
}

void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr)
{
	if (q) {
		unsigned long flags;
		wq_read_lock_irqsave(&q->lock, flags);
		__wake_up_common(q, mode, nr, 1);
		wq_read_unlock_irqrestore(&q->lock, flags);
	}
}

