- 这里介绍fork, vfork和 clone的具体实现
- 它们具体实现的代码如下:
- // fork() system-call entry point.
- // Calls do_fork() with SIGCHLD as the only flag (no CLONE_* sharing bits) and
- // the caller's own user stack pointer, so the child starts on a copy of the
- // parent's stack position. Returns whatever do_fork() returns.
- asmlinkage int sys_fork(struct pt_regs regs)
- {
-     return do_fork(SIGCHLD, regs.esp, &regs, 0);
- }
- // clone() system-call entry point.
- // The flags are taken from ebx and the child's user stack pointer from ecx;
- // a zero stack pointer means "reuse the caller's current esp".
- // Returns whatever do_fork() returns.
- asmlinkage int sys_clone(struct pt_regs regs)
- {
-     unsigned long clone_flags;
-     unsigned long newsp;
-     clone_flags = regs.ebx;
-     newsp = regs.ecx;
-     // No explicit stack supplied: fall back to the caller's stack pointer.
-     if (!newsp)
-         newsp = regs.esp;
-     return do_fork(clone_flags, newsp, &regs, 0);
- }
- // vfork() system-call entry point.
- // Same as fork but with CLONE_VM (share the address space) and CLONE_VFORK
- // (do_fork() blocks the parent on a semaphore after creating the child).
- // Returns whatever do_fork() returns.
- asmlinkage int sys_vfork(struct pt_regs regs)
- {
-     return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0);
- }
- 这里可以看到,它们都是对do_fork的调用,只不过参数不同而已。下面是do_fork函数(很长):
- // Common back end of fork(), vfork() and clone(): duplicate the current task.
- //
- // clone_flags has two parts. The low byte (CSIGNAL) is the signal sent to the
- // parent when the child exits: SIGCHLD for fork and vfork, caller-chosen for
- // clone. The remaining bits are the CLONE_* flags that select which resources
- // are shared instead of copied: none for fork (everything is copied),
- // CLONE_VFORK | CLONE_VM for vfork (shared address space, parent suspended
- // until woken), caller-chosen for clone.
- //
- // stack_start: user stack pointer for the child, handed to copy_thread().
- // regs:        saved user registers of the caller, handed to copy_thread().
- // stack_size:  passed through to copy_thread() unchanged.
- //
- // Returns the child's pid on success, otherwise a negative errno
- // (-EPERM, -EAGAIN, -ENOMEM, or the error reported by copy_thread()).
- //
- // NOTE(review): upstream declares clone_flags as unsigned long; the type used
- // in this listing is kept so the signature stays unchanged - confirm.
- int do_fork(unsigned int clone_flags, unsigned long stack_start, struct pt_regs * regs, unsigned long stack_size)
- {
-     int retval = -ENOMEM;
-     struct task_struct *p;
-     // Semaphore created in the locked state; a CLONE_VFORK parent sleeps on it
-     // at fork_out below.
-     DECLARE_MUTEX_LOCKED(sem);
-     // CLONE_PID gives the child the same pid as its parent. That is only
-     // permitted when the caller's pid is 0.
-     if (clone_flags & CLONE_PID) {
-         if (current->pid)
-             return -EPERM;
-     }
-     // vfork_sem is a pointer (it is reset to NULL for the child below), so it
-     // must receive the address of the on-stack semaphore.
-     current->vfork_sem = &sem;
-     // Per the original notes: two pages that hold both the task_struct and
-     // the child's kernel stack.
-     p = alloc_task_struct();
-     if (!p)
-         goto fork_out;
-     // Start from a verbatim copy of the parent's task_struct.
-     *p = *current;
-     retval = -EAGAIN;
-     // p->user points at the owning user's accounting record (struct user_struct);
-     // refuse the fork once that user already has RLIMIT_NPROC processes.
-     if (atomic_read(&p->user->processes) >= p->rlim[RLIMIT_NPROC].rlim_cur)
-         goto bad_fork_free;
-     atomic_inc(&p->user->__count);
-     atomic_inc(&p->user->processes);
-     // The check above is per user; this one is the global task-count ceiling.
-     if (nr_threads >= max_threads)
-         goto bad_fork_cleanup_count;
-     // Take a reference on the execution domain (struct exec_domain).
-     get_exec_domain(p->exec_domain);
-     // binfmt points at the handler for the task's executable format (a.out,
-     // ELF, ...). When that handler lives in a module, pin the module.
-     if (p->binfmt && p->binfmt->module)
-         __MOD_INC_USE_COUNT(p->binfmt->module);
-     p->did_exec = 0;
-     p->swappable = 0;
-     // The child stays in this state until wake_up_process() at the end.
-     p->state = TASK_UNINTERRUPTIBLE;
-     copy_flags(clone_flags, p);
-     p->pid = get_pid(clone_flags); // pid of the new task
-     p->run_list.next = NULL;
-     p->run_list.prev = NULL;
-     if ((clone_flags & CLONE_VFORK) || !(clone_flags & CLONE_PARENT)) {
-         p->p_opptr = current;
-         if (!(p->ptrace & PT_PTRACED))
-             p->p_pptr = current;
-     }
-     p->p_cptr = NULL;
-     // Wait queue used by wait4()/wait3() callers waiting for a child to exit.
-     init_waitqueue_head(&p->wait_chldexit);
-     p->vfork_sem = NULL;
-     spin_lock_init(&p->alloc_lock);
-     // Start with no pending signals: clear the flag and reset the queue.
-     p->sigpending = 0;
-     init_sigpending(&p->pending);
-     // Reset interval timers and accumulated CPU times.
-     p->it_real_value = p->it_virt_value = p->it_prof_value = 0;
-     p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0;
-     init_timer(&p->real_timer);
-     p->real_timer.data = (unsigned long)p;
-     p->leader = 0;
-     p->tty_old_pgrp = 0;
-     p->times.tms_utime = p->times.tms_stime = 0;
-     p->times.tms_cutime = p->times.tms_cstime = 0;
- #ifdef CONFIG_SMP
-     {
-         // Multiprocessor bookkeeping.
-         int i;
-         p->has_cpu = 0;
-         p->processor = current->processor;
-         for (i = 0; i < smp_num_cpus; i++)
-             p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0;
-         spin_lock_init(&p->sigmask_lock);
-     }
- #endif
-     p->lock_depth = -1;
-     // jiffies is the tick counter advanced by the timer interrupt. The basic
-     // task_struct initialisation is complete at this point.
-     p->start_time = jiffies;
-     retval = -ENOMEM;
-     // Conditionally copied resources. Each helper receives clone_flags;
-     // per the original notes, the matching CLONE_* bit (CLONE_FILES for the
-     // open-file table, and so on) selects sharing instead of copying.
-     if (copy_files(clone_flags, p))
-         goto bad_fork_cleanup;
-     if (copy_fs(clone_flags, p)) // filesystem context
-         goto bad_fork_cleanup_files;
-     if (copy_sighand(clone_flags, p)) // signal handlers
-         goto bad_fork_cleanup_fs;
-     if (copy_mm(clone_flags, p)) // address space
-         goto bad_fork_cleanup_sighand;
-     // Last resource: the child's kernel stack and saved registers.
-     retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
-     if (retval)
-         goto bad_fork_cleanup_sighand;
-     p->semundo = NULL;
-     p->parent_exec_id = p->self_exec_id;
-     /* ok, now we should be set up.. */
-     p->swappable = 1; // the task's pages may be swapped out from now on
-     p->exit_signal = clone_flags & CSIGNAL;
-     p->pdeath_signal = 0;
-     // Split the parent's remaining time slice between parent and child.
-     p->counter = (current->counter + 1) >> 1;
-     current->counter >>= 1;
-     if (!current->counter)
-         current->need_resched = 1;
-     retval = p->pid;
-     p->tgid = retval;
-     INIT_LIST_HEAD(&p->thread_group);
-     write_lock_irq(&tasklist_lock);
-     if (clone_flags & CLONE_THREAD) {
-         p->tgid = current->tgid;
-         list_add(&p->thread_group, &current->thread_group);
-     }
-     SET_LINKS(p); // link the child into the task list so it can be scheduled
-     hash_pid(p);  // and into the pid hash table
-     nr_threads++;
-     write_unlock_irq(&tasklist_lock);
-     if (p->ptrace & PT_PTRACED)
-         send_sig(SIGSTOP, p, 1);
-     wake_up_process(p); /* do this last */
-     ++total_forks;
- fork_out:
-     // vfork special case: with CLONE_VM parent and child share the user address
-     // space, and both writing to the same user stack would be fatal. The
-     // parent is therefore held here on the semaphore.
-     if ((clone_flags & CLONE_VFORK) && (retval > 0))
-         down(&sem);
-     return retval;
-     // Error unwinding, in reverse order of acquisition. This chain was absent
-     // from the listing although the gotos above target it; it is restored from
-     // the upstream 2.4 kernel/fork.c.
-     // NOTE(review): confirm against the exact kernel version being described.
- bad_fork_cleanup_sighand:
-     exit_sighand(p);
- bad_fork_cleanup_fs:
-     exit_fs(p); /* blocking */
- bad_fork_cleanup_files:
-     exit_files(p); /* blocking */
- bad_fork_cleanup:
-     put_exec_domain(p->exec_domain);
-     if (p->binfmt && p->binfmt->module)
-         __MOD_DEC_USE_COUNT(p->binfmt->module);
- bad_fork_cleanup_count:
-     atomic_dec(&p->user->processes);
-     free_uid(p->user);
- bad_fork_free:
-     free_task_struct(p);
-     goto fork_out;
- }
- // Accounting record for one user. do_fork() reads `processes` for the
- // RLIMIT_NPROC check and bumps both `__count` and `processes` for each new task.
- struct user_struct { // data structure describing a user
- atomic_t __count; /* reference count */
- atomic_t processes; /* How many processes does this user have? */
- atomic_t files; /* How many open files does this user have? */
- /* Hash table maintenance information */
- struct user_struct *next, **pprev; // hash-chain links (original note: the table is keyed by hashing the user)
- uid_t uid;
- };
- // Execution-domain descriptor; a task's exec_domain pointer refers to one of
- // these, and do_fork() takes a reference on it via get_exec_domain().
- struct exec_domain
- {
- const char *name; /* name of the execdomain */
- handler_t handler; /* handler for syscalls */
- unsigned char pers_low; /* lowest personality */ // original note: personality code of the domain, e.g. PER_LINUX, PER_SVR4, PER_BSD, PER_SOLARIS
- unsigned char pers_high; /* highest personality */
- unsigned long *signal_map; /* signal mapping */
- unsigned long *signal_invmap; /* reverse signal mapping */
- struct map_segment *err_map; /* error mapping */
- struct map_segment *socktype_map; /* socket type mapping */
- struct map_segment *sockopt_map; /* socket option mapping */
- struct map_segment *af_map; /* address family mapping */
- struct module *module; /* module context of the ed. */ // original note: Linux drivers can be dynamically loadable modules, installed and removed at run time
- struct exec_domain *next; /* linked list (internal) */
- };
- // Give the new task `tsk` an address space.
- // With CLONE_VM the child shares the parent's mm_struct (only the user count
- // is raised); otherwise a new mm_struct is allocated and filled by dup_mmap().
- // Returns 0 on success, or immediately when the current task has no mm.
- // On failure returns -ENOMEM or the error reported by dup_mmap().
- static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
- {
-     struct mm_struct * mm, *oldmm;
-     int retval;
-     // Fault and swap statistics start from zero for the child.
-     tsk->min_flt = tsk->maj_flt = 0;
-     tsk->cmin_flt = tsk->cmaj_flt = 0;
-     tsk->nswap = tsk->cnswap = 0;
-     tsk->mm = NULL;
-     tsk->active_mm = NULL;
-     oldmm = current->mm;
-     // Nothing to copy when the current task has no mm.
-     if (!oldmm)
-         return 0;
-     if (clone_flags & CLONE_VM) {
-         // Shared address space: the child just gets the parent's mm pointer,
-         // so both tasks refer to the same memory.
-         atomic_inc(&oldmm->mm_users);
-         mm = oldmm;
-         goto good_mm;
-     }
-     retval = -ENOMEM;
-     mm = allocate_mm();
-     if (!mm)
-         goto fail_nomem;
-     // Begin with a byte copy of the parent's mm, then re-initialise it.
-     memcpy(mm, oldmm, sizeof(*mm));
-     if (!mm_init(mm))
-         goto fail_nomem;
-     down(&oldmm->mmap_sem);
-     retval = dup_mmap(mm); // duplicates the vm_area_structs and the page tables
-     up(&oldmm->mmap_sem);
-     if (retval)
-         goto free_pt;
-     copy_segments(tsk, mm);
-     // NOTE(review): retval is 0 at this point, so a failure here is returned
-     // as 0 through free_pt - kept as in the listing; confirm against upstream.
-     if (init_new_context(tsk, mm))
-         goto free_pt;
- good_mm:
-     tsk->mm = mm;
-     tsk->active_mm = mm;
-     return 0;
- free_pt:
-     mmput(mm);
- fail_nomem:
-     return retval;
- }
- // Duplicate the current task's list of vm_area_structs into `mm` and copy the
- // matching page-table entries with copy_page_range().
- // Returns 0 on success, -ENOMEM if a vm_area_struct cannot be allocated, or
- // the error reported by copy_page_range().
- static inline int dup_mmap(struct mm_struct * mm)
- {
-     struct vm_area_struct * mpnt, * tmp, **pprev;
-     int retval;
-     flush_cache_mm(current->mm);
-     // Reset the fields that were byte-copied from the parent's mm.
-     mm->locked_vm = 0;
-     mm->mmap = NULL;
-     mm->mmap_avl = NULL;
-     mm->mmap_cache = NULL;
-     mm->map_count = 0;
-     mm->cpu_vm_mask = 0;
-     mm->swap_cnt = 0;
-     mm->swap_address = 0;
-     // pprev always points at the link where the next copied area is attached.
-     pprev = &mm->mmap;
-     // Walk every area on the parent's list.
-     for (mpnt = current->mm->mmap; mpnt; mpnt = mpnt->vm_next) {
-         struct file * file;
-         retval = -ENOMEM;
-         if (mpnt->vm_flags & VM_DONTCOPY)
-             continue;
-         tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); // new area from the slab cache
-         if (!tmp)
-             goto fail_nomem;
-         *tmp = *mpnt;
-         tmp->vm_flags &= ~VM_LOCKED;
-         tmp->vm_mm = mm;
-         mm->map_count++;
-         tmp->vm_next = NULL;
-         file = tmp->vm_file;
-         if (file) {
-             struct inode *inode = file->f_dentry->d_inode;
-             get_file(file);
-             if (tmp->vm_flags & VM_DENYWRITE)
-                 atomic_dec(&inode->i_writecount);
-             // Insert tmp into the share list directly after mpnt.
-             spin_lock(&inode->i_mapping->i_shared_lock);
-             if ((tmp->vm_next_share = mpnt->vm_next_share) != NULL)
-                 mpnt->vm_next_share->vm_pprev_share = &tmp->vm_next_share;
-             mpnt->vm_next_share = tmp;
-             tmp->vm_pprev_share = &mpnt->vm_next_share;
-             spin_unlock(&inode->i_mapping->i_shared_lock);
-         }
-         // Copy the page-table entries; the error check is deferred until the
-         // new area has been linked in below.
-         retval = copy_page_range(mm, current->mm, tmp);
-         if (!retval && tmp->vm_ops && tmp->vm_ops->open)
-             tmp->vm_ops->open(tmp);
-         // The area is linked into the child's list even when the copy failed.
-         *pprev = tmp;
-         pprev = &tmp->vm_next;
-         if (retval)
-             goto fail_nomem;
-     }
-     retval = 0;
-     if (mm->map_count >= AVL_MIN_MAP_COUNT)
-         build_mmap_avl(mm);
- fail_nomem:
-     flush_tlb_mm(current->mm);
-     return retval;
- }
- // Copy the page-table entries that cover `vma` from address space `src` to
- // `dst`, walking page directory -> middle directory -> page table.
- // Page contents are never copied here. For a copy-on-write mapping the
- // parent's entry is write-protected and the protected entry is given to the
- // child; the page itself is duplicated later by the write-protection fault
- // handler, when one side first writes to it (the "parent/child split").
- // Per the original notes, this is why fork() comes close to vfork() in speed.
- // Returns 0 on success, -ENOMEM if a page-table level cannot be allocated.
- int copy_page_range(struct mm_struct * dst, struct mm_struct * src, struct vm_area_struct * vma)
- {
-     pgd_t * src_pgd, * dst_pgd;
-     unsigned long address = vma->vm_start;
-     unsigned long end = vma->vm_end;
-     // True when the mapping is not VM_SHARED but may be written.
-     unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
-     // Start one entry early: the loop increments both pointers first.
-     src_pgd = pgd_offset(src, address) - 1;
-     dst_pgd = pgd_offset(dst, address) - 1;
-     for (;;) { // loop over page-directory entries
-         pmd_t * src_pmd, * dst_pmd;
-         src_pgd++;
-         dst_pgd++;
-         if (pgd_none(*src_pgd))
-             goto skip_copy_pmd_range;
-         if (pgd_bad(*src_pgd)) {
-             pgd_ERROR(*src_pgd);
-             pgd_clear(src_pgd);
- skip_copy_pmd_range:
-             // Jump to the start of the next page-directory entry.
-             address = (address + PGDIR_SIZE) & PGDIR_MASK;
-             if (!address || (address >= end))
-                 goto out;
-             continue;
-         }
-         if (pgd_none(*dst_pgd)) {
-             if (!pmd_alloc(dst_pgd, 0))
-                 goto nomem;
-         }
-         src_pmd = pmd_offset(src_pgd, address);
-         dst_pmd = pmd_offset(dst_pgd, address);
-         do { // loop over middle-directory entries
-             pte_t * src_pte, * dst_pte;
-             if (pmd_none(*src_pmd))
-                 goto skip_copy_pte_range;
-             if (pmd_bad(*src_pmd)) {
-                 pmd_ERROR(*src_pmd);
-                 pmd_clear(src_pmd);
- skip_copy_pte_range:
-                 // Jump to the start of the next middle-directory entry.
-                 address = (address + PMD_SIZE) & PMD_MASK;
-                 if (address >= end)
-                     goto out;
-                 goto cont_copy_pmd_range;
-             }
-             if (pmd_none(*dst_pmd)) {
-                 if (!pte_alloc(dst_pmd, 0))
-                     goto nomem;
-             }
-             src_pte = pte_offset(src_pmd, address);
-             dst_pte = pte_offset(dst_pmd, address);
-             do { // loop over page-table entries
-                 pte_t pte = *src_pte;
-                 struct page * ptepage;
-                 // No mapping established yet: nothing to copy.
-                 if (pte_none(pte))
-                     goto cont_copy_pte_range_noset;
-                 // Page is swapped out: raise the swap entry's use count and
-                 // copy the entry as it is.
-                 if (!pte_present(pte)) {
-                     swap_duplicate(pte_to_swp_entry(pte));
-                     goto cont_copy_pte_range;
-                 }
-                 ptepage = pte_page(pte);
-                 // Not a valid page, or a reserved one: copy the entry verbatim.
-                 if ((!VALID_PAGE(ptepage)) || PageReserved(ptepage))
-                     goto cont_copy_pte_range;
-                 if (cow) {
-                     // Copy-on-write: write-protect the parent's entry, then
-                     // re-read it so the child receives the protected version.
-                     ptep_set_wrprotect(src_pte);
-                     pte = *src_pte;
-                 }
-                 // Shared mapping: the child's copy of the entry is marked clean.
-                 if (vma->vm_flags & VM_SHARED)
-                     pte = pte_mkclean(pte);
-                 pte = pte_mkold(pte);
-                 get_page(ptepage);
- cont_copy_pte_range:
-                 set_pte(dst_pte, pte); // store the entry in the child's page table
- cont_copy_pte_range_noset:
-                 // Advance to the next page; without this step the end-of-range
-                 // check below could never trigger inside this loop.
-                 address += PAGE_SIZE;
-                 if (address >= end)
-                     goto out;
-                 src_pte++;
-                 dst_pte++;
-             } while ((unsigned long)src_pte & PTE_TABLE_MASK);
- cont_copy_pmd_range:
-             src_pmd++;
-             dst_pmd++;
-         } while ((unsigned long)src_pmd & PMD_TABLE_MASK);
-     }
- out:
-     return 0;
- nomem:
-     return -ENOMEM;
- }
- // Set up the child's kernel stack and thread state so that it resumes in
- // ret_from_fork with a copy of the parent's user registers.
- // `esp` becomes the child's user stack pointer; `nr`, `clone_flags` and
- // `unused` are not used in this body. Always returns 0.
- int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
-     unsigned long unused,
-     struct task_struct * p, struct pt_regs * regs)
- {
-     struct pt_regs * childregs;
-     // The saved registers occupy the very top of the child's task area:
-     // one pt_regs below (p + THREAD_SIZE).
-     childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p)) - 1;
-     struct_cpy(childregs, regs); // start from the parent's saved registers
-     childregs->eax = 0;          // eax in the child's frame is zeroed
-     childregs->esp = esp;        // child runs on the requested user stack
-     // thread holds per-task switch state (kernel stack pointer, resume
-     // address). These cannot simply be inherited from the parent, so they are
-     // set explicitly here.
-     p->thread.esp = (unsigned long) childregs;
-     p->thread.esp0 = (unsigned long) (childregs+1); // top of the kernel stack
-     // Address where the child starts executing the first time it is switched to.
-     p->thread.eip = (unsigned long) ret_from_fork;
-     savesegment(fs,p->thread.fs);
-     savesegment(gs,p->thread.gs);
-     // Save the parent's FPU state first, then copy it to the child.
-     unlazy_fpu(current);
-     struct_cpy(&p->thread.i387, &current->thread.i387);
-     return 0;
- }
fork,vfork和clone底层实现
最新推荐文章于 2021-11-21 16:39:45 发布