/* * This creates a new process as a copy of the old one, * but does not actually start it yet. * * It copies the registers, and all the appropriate * parts of the process environment (as per the clone * flags). The actual kick-off is left to the caller. */ static struct task_struct *copy_process(unsigned long clone_flags, unsigned long stack_start, struct pt_regs *regs, unsigned long stack_size, int __user *child_tidptr, struct pid *pid, int trace) { int retval; struct task_struct *p; int cgroup_callbacks_done = 0; if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) //It is not permitted for the child thread to create its own namespace while on the other hand it shares its parent's root director and working directory return ERR_PTR(-EINVAL); /* * Thread groups must share signals as well, and detached threads * can only be started up within the thread group. */ if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND)) return ERR_PTR(-EINVAL); /* * Shared signal handlers imply shared VM. By way of the above, * thread groups also imply shared VM. Blocking this case allows * for various simplifications in other code. */ if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM)) return ERR_PTR(-EINVAL); /* * Siblings of global init remain as zombies on exit since they are * not reaped by their parent (swapper). To solve this and to avoid * multi-rooted process trees, prevent global and container-inits * from creating siblings. */ if ((clone_flags & CLONE_PARENT) && current->signal->flags & SIGNAL_UNKILLABLE) return ERR_PTR(-EINVAL); retval = security_task_create(clone_flags);//security/security.c:702 --->security/capability.c:360 --> return 0; ??? 
/* * 安全框架检查,利用它可以在进程建立前检查是否允许检查,利用这个开发框架可以开发出进程监控功能。默认调用dummy_task_create函数,它什么也不操作。《独辟蹊径 P311》 * 检查当前进程是否有权限创建新的进程 */ if (retval) goto fork_out; retval = -ENOMEM; //out of memory p = dup_task_struct(current); //current process is the parent process of the process being created, so we need copy the current process's task_struct /* * 这一步完成之后父子进程的task_struct是一样的,thread_info也是一样的(当然其中的task_struct指针是不同的) */ if (!p) goto fork_out; ftrace_graph_init_task(p); /* Allocate a return stack for newly created task */ rt_mutex_init_task(p);//reference to Documentation/rt-mutext.txt and Documentation/pi-futex.txt /* * 这个函数是和进程优先级继承相关的设置。请参看相关文档。 */ #ifdef CONFIG_PROVE_LOCKING DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); //make sure that the hard irq is enabled, or produce warning. DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); //make sure that the soft irq is enabled, or produce warning. #endif retval = -EAGAIN; if (atomic_read(&p->real_cred->user->processes) >= task_rlimit(p, RLIMIT_NPROC)) { //the process's number of the current user has should under its limit if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && p->real_cred->user != INIT_USER) goto bad_fork_free; } /* *当前用户所拥有的进程数是否超过了该用户所拥有进程数的上限 */ retval = copy_creds(p, clone_flags); //Copy and set credentials for the new process created by fork() /* *关于cred数据结构及其作用和用法请参看cred数据结构定义处的注释内容。 *这个函数及其调用的函数和相关的数据结构在源码中都有很详细的注释,可以自己看。 */ if (retval < 0) goto bad_fork_free; /* * If multiple threads are within copy_process(), then this check * triggers too late. This doesn't hurt, the check is only there * to stop root fork bombs. 
*/ retval = -EAGAIN; if (nr_threads >= max_threads) goto bad_fork_cleanup_count; /* *检查系统当前的进程数(nr_threads)是否超过系统所能承受的最大进程数(max_threads)。最大进程数的默认值由系统的RAM的大小决定。 *一般来说,所有的thread_info结构体和所有的内核堆栈之和不能超过物理内存的八分之一。 *然而系统管理员可以通过修改/proc/sys/kernel/threads-max来调整max_threads的值。 *nr_threads是一个全局变量,记录系统当前的进程数,不包括idle进程。 *参考文章:http://hi.baidu.com/zengzhaonong/blog/item/6106d61795f09009c83d6d34.html */ if (!try_module_get(task_thread_info(p)->exec_domain->module)) goto bad_fork_cleanup_count; /* 如果需要用到内核模块,则需要对相关的域做修改。 */ p->did_exec = 0; //it means that the process will execute the old code. /* task_struct中的该字段被置为0,表明该进程尚未调用exec(),因此仍将执行原来的代码(也就是父进程的代码) */ delayacct_tsk_init(p); /* Must remain after dup_task_struct() *//* reinitialize in case parent's non-null pointer was dup'ed*/ copy_flags(clone_flags, p); //set p's flags. /* *我们看一下p中的flags字段被设置成了什么: *static void copy_flags(unsigned long clone_flags, struct task_struct *p) *{ * unsigned long new_flags = p->flags; * * new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); * new_flags |= PF_FORKNOEXEC; * new_flags |= PF_STARTING; * p->flags = new_flags; * clear_freeze_flag(p); //cancel the previous 'freeze' request *} *没有超级用户的权限,I am not a workqueue worker,被调度了,但是尚未执行新的程序,正在被创建。 */ INIT_LIST_HEAD(&p->children); //初始化孩子指针 INIT_LIST_HEAD(&p->sibling); //初始化兄弟指针 rcu_copy_process(p); p->vfork_done = NULL; //for vfork() spin_lock_init(&p->alloc_lock); init_sigpending(&p->pending); /* *将task_struct结构中的信号队列清空,并且初始化该信号队列。 *表示没有信号等待处理。 */ p->utime = cputime_zero; //utime是进程用户态耗费的时间 p->stime = cputime_zero; //stime是用户内核态耗费的时间 p->gtime = cputime_zero; //组用户耗费的时间 p->utimescaled = cputime_zero; p->stimescaled = cputime_zero; #ifndef CONFIG_VIRT_CPU_ACCOUNTING p->prev_utime = cputime_zero; p->prev_stime = cputime_zero; #endif #if defined(SPLIT_RSS_COUNTING) memset(&p->rss_stat, 0, sizeof(p->rss_stat)); #endif p->default_timer_slack_ns = current->timer_slack_ns; task_io_accounting_init(&p->ioac);//initialize a structure which is used for recording a 
single task's IO statistics as 0. acct_clear_integrals(p); /* *将task_struct中的运行时间(stime+utime)置为0,实际用到的虚存大小置为0,将实际用到的物理内存的大小置为0 */ posix_cpu_timers_init(p); //Initialize POSIX timer handling for a single task. do_posix_clock_monotonic_gettime(&p->start_time); /* *task_struct中的start_time是mononic类型的time,应该是那种从1970年开始计时,然后递增得到的时间。 */ p->real_start_time = p->start_time; //set the clock when the process appear monotonic_to_bootbased(&p->real_start_time); p->io_context = NULL; p->audit_context = NULL; if (clone_flags & CLONE_THREAD) threadgroup_fork_read_lock(current);//The threadgroup_fork_lock prevents threads from forking with CLONE_THREAD while held for writing. cgroup_fork(p); //attach newly forked task to its parents cgroup. /* *将新创建的进程加入到其父进程的组中,这个在dup_task_struct中已经做过一遍了,为什么还要再做一遍呢,请看注释: * A pointer to the shared css_set was automatically copied in * fork.c by dup_task_struct(). However, we ignore that copy, since * it was not made under the protection of RCU or cgroup_mutex, so * might no longer be a valid cgroup pointer. cgroup_attach_task() might * have already changed current->cgroups, allowing the previously * referenced cgroup group to be removed and freed. 
*/ #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); if (IS_ERR(p->mempolicy)) { retval = PTR_ERR(p->mempolicy); p->mempolicy = NULL; goto bad_fork_cleanup_cgroup; } mpol_fix_fork_child_flag(p); #endif #ifdef CONFIG_TRACE_IRQFLAGS p->irq_events = 0; #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW p->hardirqs_enabled = 1; #else p->hardirqs_enabled = 0; #endif p->hardirq_enable_ip = 0; p->hardirq_enable_event = 0; p->hardirq_disable_ip = _THIS_IP_; p->hardirq_disable_event = 0; p->softirqs_enabled = 1; p->softirq_enable_ip = _THIS_IP_; p->softirq_enable_event = 0; p->softirq_disable_ip = 0; p->softirq_disable_event = 0; p->hardirq_context = 0; p->softirq_context = 0; #endif #ifdef CONFIG_LOCKDEP p->lockdep_depth = 0; /* no locks held yet */ p->curr_chain_key = 0; p->lockdep_recursion = 0; #endif #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif #ifdef CONFIG_CGROUP_MEM_RES_CTLR p->memcg_batch.do_batch = 0; p->memcg_batch.memcg = NULL; #endif /* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p); // Initialize the scheduling-related fields of the new process, set up data structures (this is rather straightforward), and determine the dynamic priority of the process /* *void sched_fork(struct task_struct *p) *{ * unsigned long flags;
* int cpu = get_cpu(); //在对称多处理器体系结构中,获取当前CPU的ID。
* * __sched_fork(p); // Perform scheduler related setup for a newly forked process p.p is forked by current.; * Perform scheduler related setup for a newly forked process p. * p is forked by current. /* static void __sched_fork(struct task_struct *p) { p->on_rq = 0; //on_rq denotes whether the entity is currently scheduled on a run queue or not. p->se.on_rq = 0; p->se.exec_start = 0; p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; INIT_LIST_HEAD(&p->se.group_node); //关于上面几个字段的意义,请参看(深入Linux内核架构的93页) ........ } */ /* * We mark the process as running here. This guarantees that * nobody will actually run it, and a signal or other external * event cannot wake it up and insert it on the runqueue either. */ p->state = TASK_RUNNING; //将新创建的进程设置为运行状态以保证没有其他进程可以调度它。 /* * Revert to default priority/policy on fork if requested. *///为新进程设置优先级 if (unlikely(p->sched_reset_on_fork)) { if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { p->policy = SCHED_NORMAL; p->normal_prio = p->static_prio; } if (PRIO_TO_NICE(p->static_prio) < 0) { p->static_prio = NICE_TO_PRIO(0); p->normal_prio = p->static_prio; set_load_weight(p); } /* * We don't need the reset flag anymore after the fork. It has * fulfilled its duty: */ p->sched_reset_on_fork = 0; } /* * Make sure we do not leak PI(Priority Inheritance) boosting priority to the child. */ p->prio = current->normal_prio; /* *有关task_struct中的static_prio/normal_prio/prio,三个与优先级相关的字段的作用和联系,可以参看 *深入Linux内核结构的94页。涉及到进程调度,以后再详解。 */ if (!rt_prio(p->prio)) p->sched_class = &fair_sched_class; if (p->sched_class->task_fork) p->sched_class->task_fork(p); /* *关于sched_class,请参看深入Linux内核架构89页。 */ /* * The child is not yet in the pid-hash so no cgroup attach races, * and the cgroup is pinned to this child due to cgroup_fork() * is ran before sched_fork(). * * Silence PROVE_RCU. 
*/ raw_spin_lock_irqsave(&p->pi_lock, flags); set_task_cpu(p, cpu); //将新创建的进程放在和父进程一样的CPU上运行,当然,这个只是进行相关的设置,而非真正的开始运行。 raw_spin_unlock_irqrestore(&p->pi_lock, flags); #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif #if defined(CONFIG_SMP) p->on_cpu = 0; //新创建的进程尚未在CPU上运行。 #endif #ifdef CONFIG_PREEMPT /* Want to start with kernel preemption disabled. */ task_thread_info(p)->preempt_count = 1; //标记新创建的进程为不可抢占的进程。 #endif #ifdef CONFIG_SMP plist_node_init(&p->pushable_tasks, MAX_PRIO); #endif put_cpu(); } */ retval = perf_event_init_task(p); //?????????????????????????? if (retval)goto bad_fork_cleanup_policy;if ((retval = audit_alloc(p))) //有关审计机制,请参看深入Linux内核架构的第19章 goto bad_fork_cleanup_policy; /* copy all the process information */ if ((retval = copy_semundo(clone_flags, p))) //copy_semundo uses the System V semaphores of the parent process if COPY_SYSVSEM is set goto bad_fork_cleanup_audit; if ((retval = copy_files(clone_flags, p)))// if(clone_flags & CLONE_FILES),child process will share its parent's files or allocate a new files structure and copy contents from its parent's in files structure. goto bad_fork_cleanup_semundo; /* 如果CLONE_FILES被设置了,那么自进程将与父进程共享已经被父进程打开的文件,只是将文件的共享数+1 否则,创建一个新的文件结构体,并且将父进程的打开文件的文件结构题(file_struct)拷贝一份,赋给子进程的files字段. 
当然,这里涉及到文件系统的只是,我们以后再来研究。 */ if ((retval = copy_fs(clone_flags, p)))goto bad_fork_cleanup_files; /* 和copy_files类似 */ if ((retval = copy_sighand(clone_flags, p)))goto bad_fork_cleanup_fs; if ((retval = copy_signal(clone_flags, p)))goto bad_fork_cleanup_sighand; /* 和信号相关的操作,我们以后讨论 */ if ((retval = copy_mm(clone_flags, p)))goto bad_fork_cleanup_signal; /* * copy_mm来拷贝父进程中的mm字段,涉及到内存管理,我们以后讨论。 * 不过,需要注意的是active_mm和mm的处理: * 判断被拷贝的进程是否是内核线程,如果是的话,则mm = NULl, active_mm = oldmm (之前一个进程的mm);否则将mm和active_mm指向统一个结构体即可 * 关于mm和active_mm的区别,请参看我的博客(task_struct注释)。 */ if ((retval = copy_namespaces(clone_flags, p))) goto bad_fork_cleanup_mm; /* *关于namespace可以按看深入Linux内核架构2.3.2节*********************************8 */ if ((retval = copy_io(clone_flags, p))) goto bad_fork_cleanup_namespaces; retval = copy_thread(clone_flags, stack_start, stack_size, p, regs); //使用参数regs初始化子进程的内核堆栈空间,但是将fork()或clone()的返回值(保存在寄存器eax中)置为0; /* int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { int err; struct pt_regs *childregs; struct task_struct *me = current; childregs = ((struct pt_regs *)task_pt_regs(p); //childregs指向thread_info的顶端??? /* #define KSTK_TOP(info) \ ({ \ unsigned long *__ptr = (unsigned long *)(info); \ (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \ }) #define task_pt_regs(task) \ ({ \ struct pt_regs *__regs__; \ __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \ __regs__ - 1; \ }) 最终的返回值应该是(8K高地址 - 8- sizeof(struct pt_regs)) ;但是为什么-8呢? */ *childregs = *regs; //这个regs是sys_fork(regs)传递下来的。但是在UNIX编程中,fork()是不需要参数的,那么sys_fork(regs)的参数是哪里来的? childregs->ax = 0; //修改子进程pt_regs中ax的内容为0,这是返回时,区别父子进程的标记。当返回到子进程时,返回的是0;否则返回到父进程,返回子进程的ID if (user_mode(regs)) childregs->sp = sp; 如果父进程是用户态,则将子进程的sp调整为调用do_fork函数时传递过来的参数stack_start,不过这个值是0. 
从而保证处于用户态的子进程每次被调度执行时都是从空的栈开始的。********************************* else childregs->sp = (unsigned long)childregs; 如果父进程是内核态的,那么不会存在用户空间和内核空间切换的问题,因此直接将sp字段指向其栈顶就可以了。 p->thread.sp = (unsigned long) childregs;//子进程的用户态指针指向childregs?? p->thread.sp0 = (unsigned long) (childregs+1);//指针的内核态指针指向childregs的下一个地址。为什么???? p->thread.usersp = me->thread.usersp; set_tsk_thread_flag(p, TIF_FORK);//设置子进程thread_info结构中的flags字段,TIF_FORK表示该进程是从fork中返回的。 p->thread.io_bitmap_ptr = NULL; 保存段寄存器 但是为什么gs,fs和es,ds的保存方法不一样?是寄存器使用方法不同么? 其实Linux的内存管理根本没有用到段机制,因此我们主要关注与页相关的操作就OK了。 savesegment(gs, p->thread.gsindex); //gs中存储的只是其在段表中的选择符 p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs; //thread.gs中存储的才是实际的是[gs]***************** savesegment(fs, p->thread.fsindex); p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs; savesegment(es, p->thread.es); savesegment(ds, p->thread.ds); err = -ENOMEM; memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); if (!p->thread.io_bitmap_ptr) { p->thread.io_bitmap_max = 0; return -ENOMEM; } memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, IO_BITMAP_BYTES); set_tsk_thread_flag(p, TIF_IO_BITMAP); } Set a new TLS(thread level storage) for the child thread? 关于TLS策略请参看http://en.wikipedia.org/wiki/Thread-local_storage if (clone_flags & CLONE_SETTLS) { #ifdef CONFIG_IA32_EMULATION if (test_thread_flag(TIF_IA32)) err = do_set_thread_area(p, -1, (struct user_desc __user *)childregs->si, 0); else #endif err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); if (err) goto out; } err = 0; out: if (err && p->thread.io_bitmap_ptr) { kfree(p->thread.io_bitmap_ptr); p->thread.io_bitmap_max = 0; } return err; } */ if (retval) goto bad_fork_cleanup_io; if (pid != &init_struct_pid) { retval = -ENOMEM; pid = alloc_pid(p->nsproxy->pid_ns); //allocate pid. if (!pid) goto bad_fork_cleanup_io; } p->pid = pid_nr(pid);//get a new global pid No. 
But I don't know how?...分配一个全局PID,但是我还没有搞清楚实现机制,因为涉及到namespace p->tgid = p->pid; //thread group id = pid 子进程中的线程组ID等于父进程的组ID if (clone_flags & CLONE_THREAD) // p->tgid = current->tgid; p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; //set_child_tid??? /* * Clear TID on mm_release()? */ p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; #ifdef CONFIG_BLOCK p->plug = NULL; #endif #ifdef CONFIG_FUTEX p->robust_list = NULL; #ifdef CONFIG_COMPAT p->compat_robust_list = NULL; #endif INIT_LIST_HEAD(&p->pi_state_list); p->pi_state_cache = NULL; #endif /* * sigaltstack should be cleared when sharing the same VM */ if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM) p->sas_ss_sp = p->sas_ss_size = 0; //请参看深入Linux内核框架相关章节的说明。这里事关信号处理,以后再讨论。 /* Syscall tracing and stepping should be turned off in the * child regardless of CLONE_PTRACE. */ user_disable_single_step(p); clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE); //关闭TIF_SYSCALL_TRACE标志位,确保执行ret_from_fork时不通知调试进程。 #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); #endif clear_all_latency_tracing(p);/* ok, now we should be set up.. */ /* 只有当线程组的最后一个线程(线程组的组长)退出时才会通知该组长的父进程 */ p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL); //设置返回时的task_struct中的exit_signal字段 p->pdeath_signal = 0; //当父进程死亡时,子进程发送的信号 p->exit_state = 0; /* * Ok, make it visible to the rest of the system. * We dont wake it up yet. */ p->group_leader = p; INIT_LIST_HEAD(&p->thread_group); /* Now that the task is set up, run cgroup callbacks if * necessary. We need to run them before the task is visible * on the tasklist. */ cgroup_fork_callbacks(p); //about cgroup, pleaese refer to Documenttation/cgroups/cgroups.txt OR http://blog.chinaunix.net/space.php?uid=20543183&do=blog&id=1930840&page=1#comment cgroup_callbacks_done = 1;/* Need tasklist lock for parent etc handling! 
*/ write_lock_irq(&tasklist_lock);/* CLONE_PARENT re-uses the old parent */ if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { p->real_parent = current->real_parent; p->parent_exec_id = current->parent_exec_id; } else { p->real_parent = current; //**** p->parent_exec_id = current->self_exec_id; } spin_lock(¤t->sighand->siglock); /* * Process group and session signals need to be delivered to just the * parent before the fork or both the parent and the child after the * fork. Restart if a signal comes in before we add the new process to * it's process group. * A fatal signal pending means that current will exit, so the new * thread can't slip out of an OOM kill (or normal SIGKILL). */ recalc_sigpending(); if (signal_pending(current)) { spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); retval = -ERESTARTNOINTR; goto bad_fork_free_pid; } if (clone_flags & CLONE_THREAD) { current->signal->nr_threads++; atomic_inc(¤t->signal->live); atomic_inc(¤t->signal->sigcnt); p->group_leader = current->group_leader; list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); } if (likely(p->pid)) { tracehook_finish_clone(p, clone_flags, trace); //new child created and being attached if (thread_group_leader(p)) { if (is_child_reaper(pid)) p->nsproxy->pid_ns->child_reaper = p; p->signal->leader_pid = pid; p->signal->tty = tty_kref_get(current->signal->tty); attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); attach_pid(p, PIDTYPE_SID, task_session(current)); list_add_tail(&p->sibling, &p->real_parent->children); list_add_tail_rcu(&p->tasks, &init_task.tasks); __this_cpu_inc(process_counts); } attach_pid(p, PIDTYPE_PID, pid); nr_threads++; } total_forks++; spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); proc_fork_connector(p); cgroup_post_fork(p); if (clone_flags & CLONE_THREAD) threadgroup_fork_read_unlock(current); perf_event_fork(p); return p; bad_fork_free_pid: if (pid != &init_struct_pid) free_pid(pid); /* 下面的代码都是错误处理的代码,就不分析了。 */ 
}好了，copy_process到这里终于完成了！其中涉及的内存管理和信号处理及文件系统相关的知识并没有深究，以后学习到这方面的知识的时候再重新来看！
内核代码是要读好多遍的,不要期望一遍能够完事儿。
进程创建之do_fork->copy_process()
最新推荐文章于 2022-10-24 12:51:54 发布