Linux学习总结—进程切换和调度算法深入分析

最新推荐文章于 2023-04-25 19:18:16 发布

bjcxy110

最新推荐文章于 2023-04-25 19:18:16 发布

阅读量6.8k

点赞数

分类专栏： Linux技术文章标签： linux 算法 struct thread io permissions

本文链接：https://blog.csdn.net/cxylaf/article/details/1626529

版权

Linux技术专栏收录该内容

7 篇文章 1 订阅

订阅专栏

一、Linux进程切换深入分析

#define CLONE_KERNEL (CLONE_FS | CLONE_FILES | CLONE_SIGHAND)

创建内核线程时使用的CLONE标志。

1．#define unlikely(x) __builtin_expect(!!(x), 0)

编译器优化提示：x是整型表达式，宏的值就是!!(x)；第二个参数0表示预期该表达式的值为0，即并不预期该事件发生，也就是说x为真（非0）的可能性很小，这是为了让编译器对后面的分支语句进行优化。

2．进程内核态堆栈结构：

进程是动态实体，进程描述符存放在动态内存中。在为每个进程单独分配的一块内存区上，Linux存放了两个数据结构：指向task_struct的thread_info和内核态的进程栈。该内存区大小一般为2页（8K），并要求起始地址按2的13次幂（8K）对齐；在X86上编译时也可以配置为4K。thread_info位于内存区开始处，内核栈从内存区末尾向下增长。在C语言中可以用union结构表示：

图 1. 8K 内核栈和进程描述符 task_struct 及 thread_info 的相互关系

/*
 * Per-process kernel memory area. The thread_info descriptor and the
 * kernel-mode stack overlay the same storage: thread_info sits at the start
 * of the area and, per the accompanying text, the stack grows down from the
 * end of the area.
 */
union thread_union {

struct thread_info thread_info;

/* 2048 longs for the default area; the note gives 1024 for the 4KB variant. */
unsigned long stack[2048]; /* 1024 for 4KB stacks */

};

CPU的esp寄存器用作指向堆栈顶部的指针。当从用户态转向内核态时，进程内核栈总是空的，所以此时esp指向堆栈底部（即该内存区的末尾）。

使用 alloc_thread_info 和 free_thread_info 用于分配和释放一个存放thread_info结构和内核堆栈的内存区。

内核通过当前esp指针可以很方便的得到thread_info结构的地址。current_thread_info(void)的原理即如下：

/*
 * Derive the thread_info address from the current kernel stack pointer.
 * thread_info sits at the start of the 8KB-aligned per-process area, so
 * clearing the low 13 bits of %esp yields its address. The mask must
 * therefore be 0xffffe000 (low 12 bits, 0xfffff000, for 4KB stacks).
 */
movl $0xffffe000,%ecx /* or 0xfffff000 for 4KB stacks */

andl %esp,%ecx /* %ecx = %esp rounded down to the area base */

movl %ecx,p /* p = address of the current thread_info */

thread_info 中task指针是第一个，所以current宏相当于current_thread_info( )->task，从而也就得到task指针。

每个进程有自己独立的进程空间，但所有进程共享CPU寄存器。进程恢复执行前必须装入寄存器的那组数据称为硬件上下文环境。在Linux中部分硬件上下文存放在进程描述符中，部分存放到内核态堆栈里。

3. 进程切换堆栈原理：

80x86体系支持在跳转到进程的TSS段时自动执行进程硬件上下文切换，但Linux使用软件方法实现。软件方式效率差不多，但更灵活，可以控制流程，留下优化空间。

80x86用TSS段保存硬件上下文内容，每个CPU有一个TSS段。从用户态到内核态切换时，从TSS中取出内核栈地址。用户态进程访问I/O端口时，TSS中的I/O访问位图可以验证权限。tss_struct描述了TSS格式，init_tss存放初始TSS内容，每次进程切换，内核更新TSS中的某些字段，以反映当前运行进程的权限等级。每个进程有个反映任务CPU状态的 thread_struct 结构变量 thread，除eax、ecx等通用寄存器内容保存在内核态堆栈中，其他大部分寄存器都保存在此结构中。该结构一部分对应于tss_struct中的内容，进程切换时把thread中某些内容更新到tss_struct中就可以反映当前任务的运行CPU环境。

/*
 * Layout of the per-CPU Task State Segment (TSS).
 *
 * Per the accompanying text, the kernel does not use the TSS for hardware
 * task switching; it only refreshes selected fields on every process switch
 * (e.g. esp0 and io_bitmap_base) so that they reflect the task now running.
 * The structure is packed, so field order and sizes define the layout.
 *
 * NOTE(review): each 16-bit field is paired with a "__...h" short, presumably
 * padding for the unused upper half of a 32-bit slot - confirm against the
 * architecture manual.
 */
struct tss_struct {

unsigned short back_link,__blh;

/* Kernel stack address; __switch_to copies the next task's thread.esp0 here. */
unsigned long esp0;

unsigned short ss0,__ss0h;

unsigned long esp1;

unsigned short ss1,__ss1h; /* ss1 is used to cache MSR_IA32_SYSENTER_CS */

unsigned long esp2;

unsigned short ss2,__ss2h;

unsigned long __cr3;

unsigned long eip;

unsigned long eflags;

unsigned long eax,ecx,edx,ebx;

unsigned long esp;

unsigned long ebp;

unsigned long esi;

unsigned long edi;

unsigned short es, __esh;

unsigned short cs, __csh;

unsigned short ss, __ssh;

unsigned short ds, __dsh;

unsigned short fs, __fsh;

unsigned short gs, __gsh;

unsigned short ldt, __ldth;

/*
 * io_bitmap_base is set on a process switch to IO_BITMAP_OFFSET,
 * INVALID_IO_BITMAP_OFFSET or INVALID_IO_BITMAP_OFFSET_LAZY depending on
 * whether the next task has its own I/O bitmap and currently owns this one.
 */
unsigned short trace, io_bitmap_base;

/*
 * The extra 1 is there because the CPU will access an
 * additional byte beyond the end of the IO permission
 * bitmap. The extra byte must be all 1 bits, and must
 * be within the limit.
 */

unsigned long io_bitmap[IO_BITMAP_LONGS + 1];

/*
 * Cache the current maximum and the last task that used the bitmap:
 */

unsigned long io_bitmap_max;

/* Compared against the next task's thread on a switch to decide whether the cached bitmap is still valid. */
struct thread_struct *io_bitmap_owner;

/*
 * pads the TSS to be cacheline-aligned (size is 0x100)
 */

unsigned long __cacheline_filler[35];

/*
 * .. and then another 0x100 bytes for emergency kernel stack
 */

unsigned long stack[64];

} __attribute__((packed));

/*
 * Per-process CPU state (the "thread" member of the process descriptor).
 *
 * Per the accompanying text, general-purpose registers such as eax/ecx are
 * saved on the kernel-mode stack, while most other registers live here.
 * switch_to and __switch_to read and write these fields on a process switch.
 * Field order matters: the quoted disassembly addresses eip and esp by fixed
 * offsets (480 and 484) from the task pointer.
 */
struct thread_struct {

/* cached TLS descriptors. */

struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];

/* Kernel stack address; copied into tss->esp0 by load_esp0() on a switch. */
unsigned long esp0;

unsigned long sysenter_cs;

/* Resume address; switch_to stores label 1 here when the task is switched out. */
unsigned long eip;

/* Saved kernel stack pointer; written and reloaded by switch_to. */
unsigned long esp;

unsigned long fs;

/* Saved by savesegment(gs, prev->gs) and reloaded via loadsegment() in __switch_to. */
unsigned long gs;

/* Hardware debugging registers */

unsigned long debugreg[8]; /* %%db0-7 debug registers */

/* fault info */

unsigned long cr2, trap_no, error_code;

/* floating point info */

union i387_union i387;

/* virtual 86 mode info */

struct vm86_struct __user * vm86_info;

unsigned long screen_bitmap;

unsigned long v86flags, v86mask, saved_esp0;

unsigned int saved_fs, saved_gs;

/* IO permissions */

/* The task's private I/O access bitmap; a non-NULL value makes __switch_to update the TSS bitmap state. */
unsigned long *io_bitmap_ptr;

/* Compared between prev and next in __switch_to; set_iopl_mask() is called when they differ. */
unsigned long iopl;

/* max allowed port in the bitmap, in bytes: */

unsigned long io_bitmap_max;

};

4．进程切换流程解析switch_to

进程切换本质上两步：

1) 进程页表PGD切换；

2) 内核态堆栈和硬件上下文切换（包括CPU寄存器）；

上面两步通过context_switch()实现，它通过调用switch_mm()切换进程空间，switch_to切换内核上下文环境。

首先看看context_switch()做了些什么：

1) 进程描述符中active_mm指向进程使用的地址空间，mm指向进程拥有的地址空间，对于普通进程它们相同。对于内核线程，它的mm总为NULL。所以context_switch()首先判断 if (!next->mm)，即next为内核线程，则使用prev的进程地址空间：

if (!next->mm) {

    next->active_mm = prev->active_mm;

    atomic_inc(&prev->active_mm->mm_count);

    enter_lazy_tlb(prev->active_mm, next);

2) 否则，如果next是普通进程，则用next进程空间替换prev的地址空间：

switch_mm(oldmm, mm, next);

3) 如果prev是内核线程或者正在退出，则把prev->active_mm设为NULL，并把oldmm保存到runqueue的prev_mm中（切换完成后由finish_task_switch()对它调用mmdrop()）：

if (!prev->mm) {

prev->active_mm = NULL;

WARN_ON(rq->prev_mm);

rq->prev_mm = oldmm;

}

下面看看switch_mm()如何切换进程空间：

1) 获取cpu逻辑号。

2) 清除cpu_vm_mask位标志。cpu_clear(cpu, prev->cpu_vm_mask)

3) 设置cpu_tlbstate状态。per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK

4) 设置cpu_tlbstate的active_mm为next。per_cpu(cpu_tlbstate, cpu).active_mm = next

5) 设置next的cpu_vm_mask标志。cpu_set(cpu, next->cpu_vm_mask)

6) 装载next的pgd页表到cr3寄存器。load_cr3(next->pgd)

7) 如果next的LDT描述符改变，则加载next的LDT描述符。

if (unlikely(prev->context.ldt != next->context.ldt))

load_LDT_nolock(&next->context);

最后，switch_to进行内核堆栈和CPU环境切换操作：

/*
 * switch_to(prev, next, last) - switch the kernel stack and CPU context
 * from task prev to task next.
 *
 * prev: task being switched out; passed in %eax (tied to output operand 2).
 * next: task being switched in; passed in %edx.
 * last: receives %eax after the switch, i.e. the task that was running
 *       just before this task was resumed.
 *
 * Operand map: %0 = prev->thread.esp, %1 = prev->thread.eip,
 *              %5 = next->thread.esp, %6 = next->thread.eip.
 *
 * eflags and ebp are pushed on prev's kernel stack and popped again at
 * label 1 when prev is later resumed. esi/edi are dummy outputs that tell
 * the compiler those registers do not survive the switch. __switch_to is
 * entered with jmp rather than call, so its return pops the address pushed
 * from next->thread.eip.
 *
 * Every row of the body ends in a backslash line continuation, and the asm
 * string literals use \n\t escapes; no blank lines may appear inside the
 * macro.
 */
#define switch_to(prev,next,last) do {					\
	unsigned long esi,edi;						\
	asm volatile("pushfl\n\t"		/* Save flags */	\
		     "pushl %%ebp\n\t"					\
		     "movl %%esp,%0\n\t"	/* save ESP */		\
		     "movl %5,%%esp\n\t"	/* restore ESP */	\
		     "movl $1f,%1\n\t"		/* save EIP */		\
		     "pushl %6\n\t"		/* restore EIP */	\
		     "jmp __switch_to\n"				\
		     "1:\t"						\
		     "popl %%ebp\n\t"					\
		     "popfl"						\
		     :"=m" (prev->thread.esp),"=m" (prev->thread.eip),	\
		      "=a" (last),"=S" (esi),"=D" (edi)			\
		     :"m" (next->thread.esp),"m" (next->thread.eip),	\
		      "2" (prev), "d" (next));				\
} while (0)

流程描述，prev是进程A的task结构，next是进程B的task结构，last是进程C的结构：

1) 保存prev和next指针的值到eax和edx：

movl prev, %eax

movl next, %edx

2) 保存eflags 和 ebp 寄存器内容到 prev 内核态堆栈中：

pushfl

pushl %ebp

3) 将esp内容保存到prev->thread.esp中，该字段指向prev内核堆栈的栈顶地址。

movl %esp,484(%eax)

4) 将next->thread.esp加载到esp中，从现在开始，esp指向next的内核堆栈，进程切换完成。

movl 484(%edx), %esp

5) 保存下面Label 1到prev->thread.eip指针中，当prev进程恢复运行时，从该位置开始运行。

movl $1f, 480(%eax)

6) 将next->thread.eip的指针内容压到next的内核态堆栈中，通常它的内容也是Label 1。

pushl 480(%edx)

7) 跳转到__switch_to（）C函数执行。

jmp __switch_to

8) 被替换的进程A继续执行，它在Label 1处，首先是恢复ebp和eflags寄存器内容。注意这里是发生在调度器再次选择prev在CPU上运行之后，此时esp已经指向了prev的内核堆栈。

popl %ebp

popfl

9) 将eax内容保存到last任务结构中。这里eax是被进程A切换下来的进程C的task结构指针。

movl %eax, last

5．__switch_to深入分析

__switch_to参数是存放在eax和edx中的内容，这通过

#define fastcall __attribute__((regparm(3)))告诉gcc编译器。

1) 获取tss_struct tss、prev_p和next_p的thread_struct结构prev和next、当前CPU逻辑ID。

2) 调用__unlazy_fpu(prev_p)根据条件标志选择是否保存prev_p的FPU, MMX, 和XMM寄存器内容 。

3) load_esp0(tss, next) 将next的堆栈地址存放到tss中：tss->esp0 = thread->esp0。

4) savesegment(gs, prev->gs) 保存gs寄存器到prev->gs，fs已经在栈入口保存，es和ds在内核态下不需要保存。

5) load_TLS(next, cpu) 从next的tls_array 缓存中加载线程的Thread-Local Storage描述符。TLS在GDT表中位置6、7、8。

cpu_gdt_table[cpu][6] = next_p->thread.tls_array[0];

cpu_gdt_table[cpu][7] = next_p->thread.tls_array[1];

cpu_gdt_table[cpu][8] = next_p->thread.tls_array[2];

6) 如果当前特权级别是0并且prev->iopl != next->iopl则恢复IOPL设置set_iopl_mask(next->iopl)。

7) 根据thread_info的TIF标志_TIF_WORK_CTXSW和TIF_IO_BITMAP判断是否需要处理debug寄存器和IO位图：__switch_to_xtra(next_p, tss);

• 只有当next_p挂起时即if (test_tsk_thread_flag(next_p, TIF_DEBUG))使用了debug寄存器才需要恢复set_debugreg(next->debugreg[i], i)。只有调试器需要监控prev的状态时，prev_p->thread.debugreg数组的内容才会被修改。Debug寄存器dr0～dr7，dr4和dr5不用。

• 当prev_p或者next_p定义了自己的I/O访问位图时，必须更新TSS的 I/O bitmap。

if (prev_p->thread.io_bitmap_ptr || next_p->thread.io_bitmap_ptr)

handle_io_bitmap(&next_p->thread, &init_tss[cpu]);

进程的 I/O访问位图存放在io_bitmap_ptr指针里，通常进程很少修改IO位图，只有当前时间片中访问IO端口才会把实际的IO位图加载到TSS中。

- 当next_p没有自定义位图时：

tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET; 返回

- 如果next == tss->io_bitmap_owner则设置有效的偏移量：tss->io_bitmap_base = IO_BITMAP_OFFSET; 返回

- 否则tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;

只有第二种情况tss->io_bitmap_base设置的是有效的io_bitmap偏移量，对于其他两种情况，当用户进程访问I/O端口时将会触发"General protection "的异常，do_general_protection( )异常处理函数根据io_bitmap的值处理异常：如果是 0x8000(INVALID_IO_BITMAP_OFFSET ) 则发送SIGSEGV信号给用户进程；如果是0x9000(INVALID_IO_BITMAP_OFFSET_LAZY ) 则拷贝进程的thread中的io_bitmap_ptr内容到 io_bitmap中，并设置 io_bitmap_base为正确的偏移量(104)。

8) disable_tsc(prev_p, next_p) 设置cr4中的TSC Disable位。

9) arch_leave_lazy_cpu_mode() 退出CPU的lazy模式。

10) 如果next_p->fpu_counter > 5则恢复next_p的FPU寄存器内容：

math_state_restore() 。FPU寄存器存放在next_p->thread.i387中，i387是i387_union类型的union结构：

/*
 * Storage for a task's saved FPU state (the i387 member of thread_struct).
 * The three members overlay the same memory, so only one format is in use
 * at a time.
 * NOTE(review): presumably fsave/fxsave correspond to the two hardware save
 * formats and soft to math emulation - confirm against the kernel headers.
 */
union i387_union {

struct i387_fsave_struct fsave;

struct i387_fxsave_struct fxsave;

struct i387_soft_struct soft;

};

/*
 * One of the FPU state formats held in union i387_union. The structure is
 * declared 16-byte aligned.
 * NOTE(review): the field order presumably mirrors the processor's FXSAVE
 * memory image and must not be changed - confirm against the architecture
 * manual.
 */
struct i387_fxsave_struct {

unsigned short cwd;

unsigned short swd;

unsigned short twd;

unsigned short fop;

long fip;

long fcs;

long foo;

long fos;

long mxcsr;

long mxcsr_mask;

long st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */

long xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */

long padding[56];

} __attribute__ ((aligned (16)));

11) 如果需要，则从next->gs中恢复gs寄存器内容。

if (prev->gs | next->gs)

loadsegment(gs, next->gs);

二、Linux实时调度schedule

1．概述

三种调度策略：SCHED_FIFO，SCHED_RR和SCHED_NORMAL。

FIFO 实时调度算法当调度器将CPU指定给某个进程时，它把该进程放到运行队列首；除非有更高优先级的进程，否则该进程将一直占用CPU。

Round Robin实时进程调度把CPU指定给某进程，把它放到运行队列尾。时间片运行完再选择其他进程调度。这样保证了同优先级的公平竞争CPU。

SCHED_NORMAL 是普通的基于运行时间和等待时间等，动态调整进程优先级的一种调度策略。

实时进程优先级0～99，普通进程100～139。

2．实时进程调度的时机

1) 该进程被更高优先级的进程抢占；

2) 进程执行一个阻塞操作，被放到睡眠队列，状态为TASK_INTERRUPTIBLE或TASK_UNINTERRUPTIBLE；

3) 进程被停止(状态为TASK_STOPPED 或TASK_TRACED)，或者进程被杀死(状态为EXIT_ZOMBIE 或 EXIT_DEAD)；

4) 进程调用sched_yield()主动放弃CPU；

5) RR 实时进程用完了CPU分配的时间片；

3．调度器相关函数

1) scheduler_tick( )

更新当前进程的运行时间片tick值，在update_process_times( )中调用，判断进程的时间片是否用完。

2) try_to_wake_up( )

唤醒一个睡眠的进程并把它的状态设为TASK_RUNNING，插入到运行队列中。

3) recalc_task_prio( )

更新进程的睡眠时间和动态优先级，SCHED_NORMAL调度。

4) schedule( )

进程调度

5) load_balance()

SMP 系统的负载均衡。

4．schedule( )函数

进程调度有两种方式：直接调用和延迟调用。

直接调用schedule，当前进程资源不可用时会直接调用调度器，这种情况下，内核线程进行如下处理：

1) 将current插入到合适的等待队列中；

2) 将current状态变为TASK_INTERRUPTIBLE 或 TASK_UNINTERRUPTIBLE

3) 调用schedule();

4) 检查资源是否可用，如果不可用，转到第2）步；

5) 一旦资源可用，从等待队列中移除current进程；

在设备驱动程序中也经常会检查TIF_NEED_RESCHED并调用schedule()。

延迟调用方式是通过设置current进程的TIF_NEED_RESCHED标志为1。当恢复用户态进程的执行前，会检查该标志并决定是否调用schedule()。延迟调度的情形有：

1) 在scheduler_tick()中如果current用完了时间片则设置该标志；

2) 在try_to_wake_up( )中唤醒一个进程并且该进程比当前运行进程优先级高。

3) 调用sched_setscheduler()时。

schedule() 函数工作流程：

进程切换前的工作：

1) 禁止内核抢占，初始化局部变量prev，释放prev占有的大内核锁；

need_resched:

preempt_disable();

prev = current;

release_kernel_lock(prev);

2) 读取调度TSC时间，计算调整run_time时间，更新调度状态rq->sched_cnt参数，获取rq的spin锁：spin_lock_irq(&rq->lock)。

3) 检查prev状态：如果状态不是TASK_RUNNING且没有在内核态被抢占，则从运行队列中移除；但是如果prev状态是 TASK_INTERRUPTIBLE 并且拥有非阻塞挂起的信号，则把进程状态设为 TASK_RUNNING不移出运行队列。

if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {

switch_count = &prev->nvcsw;

if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&

unlikely(signal_pending(prev))))

prev->state = TASK_RUNNING;

else {

if (prev->state == TASK_UNINTERRUPTIBLE)

rq->nr_uninterruptible++;

deactivate_task(prev, rq);

}

4) 获取当前CPU逻辑号，如果当前运行队列为空，则调用idle_balance(cpu, rq)从其他CPU运行队列上拉进程到本地CPU的运行队列上。如果调整后，当前运行队列仍为空则next赋为idle进程，跳转到任务切换代码行去。

if (unlikely(!rq->nr_running)) {

idle_balance(cpu, rq);

if (!rq->nr_running) {

next = rq->idle;

rq->expired_timestamp = 0;

goto switch_tasks;

}

5) 如果runqueue中有进程，并且活动队列（active）中的进程数为0，则交换active 和 expired队列指针。

array = rq->active;

if (unlikely(!array->nr_active)) {

schedstat_inc(rq, sched_switch);

rq->active = rq->expired;

rq->expired = array;

array = rq->active;

rq->expired_timestamp = 0;

rq->best_expired_prio = MAX_PRIO;

}

6) 从运行队列的活动prio_array数组的位图中查找第一个被置为1的位的索引，根据索引找到该优先级队列的第一个task。

idx = sched_find_first_bit(array->bitmap);

queue = array->queue + idx;

next = list_entry(queue->next, struct task_struct, run_list);

7) 如果next是普通进程，并且next->sleep_type是SLEEP_INTERACTIVE 或SLEEP_INTERRUPTED，则重新计算进程睡眠时间和进程优先级。

进程切换工作 ：

8) 更新sched_goidle统计，预取next结构数据，清除prev的TIF_NEED_RESCHED标志，设置quiescent状态计数为1：rcu_data ->passed_quiesc = 1;

switch_tasks:

if (next == rq->idle)

schedstat_inc(rq, sched_goidle);

prefetch(next);

prefetch_stack(next);

clear_tsk_need_resched(prev);

rcu_qsctr_inc(task_cpu(prev));

9) 更新prev进程运行时间戳prev->sleep_avg，prev->timestamp;

10) 调度信息切换到next，更新next的时间戳和运行队列信息：

sched_info_switch(prev, next);

if (likely(prev != next)) {

next->timestamp = next->last_ran = now;

rq->nr_switches++;

rq->curr = next;

++*switch_count;

……

}

11) 进行进程切换，context_switch参见前面的分析，它进行进程空间和内核堆栈切换 。prepare_lock_switch 功能是在定义了__ARCH_WANT_INTERRUPTS_ON_CTXSW情况下，在切换前开中断spin_unlock_irq(&rq->lock); barrier() 是保证代码执行顺序不变。

prepare_task_switch(rq, next);

prev = context_switch(rq, prev, next);

barrier();

finish_task_switch(this_rq(), prev);

进程切换后的工作：

进程切换context_switch语句之后的代码并不是由next进程立即执行的，而是在调度器再次选择prev进程运行时，由prev进程继续执行的。此时prev变量指向的已经是被prev进程替换下来的那个进程。

12) finish_task_switch() 必须与prepare_task_switch配对使用，并注意锁的顺序。它所做的工作：finish_lock_switch调用local_irq_enable()，获取prev的状态和rq->prev_mm，如果mm非空，则调用mmdrop(mm)减少mm的引用计数，如果计数减为0则释放进程页表和虚拟空间。如果prev_state为TASK_DEAD则释放进程的task结构。

struct mm_struct *mm = rq->prev_mm;

long prev_state;

rq->prev_mm = NULL;

prev_state = prev->state;

finish_arch_switch(prev);

finish_lock_switch(rq, prev);

if (mm)

mmdrop(mm);

if (unlikely(prev_state == TASK_DEAD)) {

kprobe_flush_task(prev);

put_task_struct(prev);

}

13) 最后，如果prev持有大内核锁（task->lock_depth >= 0），则通过reacquire_kernel_lock()调用__reacquire_kernel_lock重新获取大内核锁；如果其返回值小于0，则goto need_resched_nonpreemptible。随后重新允许抢占，如果TIF_NEED_RESCHED被设置，则跳转到need_resched重新进行调度。

prev = current;

if (unlikely(reacquire_kernel_lock(prev) < 0))

goto need_resched_nonpreemptible;

preempt_enable_no_resched();

if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))

goto need_resched;

bjcxy110

关注

0
点赞
踩
9

收藏

觉得还不错? 一键收藏
1
评论
Linux学习总结—进程切换和调度算法深入分析

一、Linux进程切换深入分析#define CLONE_KERNEL (CLONE_FS | CLONE_FILES | CLONE_SIGHAND)创建内核线程时使用的CLONE标志。1．#define unlikely(x) __builtin_expect(!!(x), 0)编译器优化，实际返回值x是整型表达式，0表示并不预期该事件发生，也就是说x为0的可
复制链接

扫一扫