fork系统调用是内核中相当复杂的一部分,因为进程描述符struct task_struct包含了进程运行所需的全部数据结构,包括虚拟地址空间、文件系统信息、打开的文件、信号处理程序、System V信号量的undo信息(semundo)、命名空间和IO上下文等,创建子进程时这些内容都要按照clone标志逐一复制或共享。fork系统调用最终调用do_fork函数处理请求:
/*
 * do_fork - the main fork routine.
 *
 * Copies the calling process with copy_process() and, when that succeeds,
 * kick-starts the child.  For CLONE_VFORK the caller additionally blocks
 * on a completion until the child signals it.
 *
 * @clone_flags:   CLONE_* option bits.  The bits selected by CSIGNAL hold a
 *                 signal number; it is compared with SIGCHLD below to tell
 *                 a plain fork from a clone when choosing the ptrace event.
 * @stack_start:   forwarded unchanged to copy_process().
 * @regs:          saved register set; tested with user_mode() and then
 *                 forwarded to copy_process().
 * @stack_size:    forwarded unchanged to copy_process().
 * @parent_tidptr: user-space address that receives the child's id when
 *                 CLONE_PARENT_SETTID is set.
 * @child_tidptr:  user-space address forwarded to copy_process().
 *
 * Returns the child's task_pid_vnr() value on success.  On failure returns
 * -EINVAL (CLONE_NEWUSER together with CLONE_THREAD), -EPERM (missing
 * capability for CLONE_NEWUSER) or the error encoded in the pointer that
 * copy_process() returned.
 */
long do_fork(unsigned long clone_flags,
	     unsigned long stack_start,
	     struct pt_regs *regs,
	     unsigned long stack_size,
	     int __user *parent_tidptr,
	     int __user *child_tidptr)
{
	const int is_vfork = (clone_flags & CLONE_VFORK) != 0;
	struct completion vfork_wait;
	struct task_struct *child;
	int event = 0;
	long child_nr;

	/*
	 * Preliminary argument and permission checks, done before anything
	 * is allocated.  A new user namespace cannot be requested together
	 * with thread creation.
	 */
	if (clone_flags & CLONE_NEWUSER) {
		if (clone_flags & CLONE_THREAD)
			return -EINVAL;
		/*
		 * hopefully this check will go away when userns support is
		 * complete
		 */
		if (!(capable(CAP_SYS_ADMIN) && capable(CAP_SETUID) &&
		      capable(CAP_SETGID)))
			return -EPERM;
	}

	/*
	 * Decide whether, and which, event is reported to a ptracer.  When
	 * called from kernel_thread or when CLONE_UNTRACED is requested, no
	 * event is reported; otherwise the event matching the kind of fork
	 * is reported, provided the tracer enabled it.
	 */
	if (likely(user_mode(regs)) && !(clone_flags & CLONE_UNTRACED)) {
		if (is_vfork)
			event = PTRACE_EVENT_VFORK;
		else if ((clone_flags & CSIGNAL) != SIGCHLD)
			event = PTRACE_EVENT_CLONE;
		else
			event = PTRACE_EVENT_FORK;

		if (likely(!ptrace_event_enabled(current, event)))
			event = 0;
	}

	child = copy_process(clone_flags, stack_start, regs, stack_size,
			     child_tidptr, NULL, event);
	if (IS_ERR(child))
		return PTR_ERR(child);

	/*
	 * Everything that touches the child happens before it is woken:
	 * the task pointer might become invalid afterwards if the child
	 * exits quickly.
	 */
	trace_sched_process_fork(current, child);

	child_nr = task_pid_vnr(child);
	if (clone_flags & CLONE_PARENT_SETTID)
		/* the result of the user-space store is not checked */
		put_user(child_nr, parent_tidptr);

	if (is_vfork) {
		/* the caller waits on this completion further down */
		child->vfork_done = &vfork_wait;
		init_completion(&vfork_wait);
	}

	audit_finish_fork(child);

	/*
	 * PF_STARTING is set at creation in case tracing wants to use it to
	 * distinguish a fully live task from one that hasn't finished
	 * SIGSTOP raising yet.  Clear it now and set the child going.
	 */
	child->flags &= ~PF_STARTING;
	wake_up_new_task(child);

	/* forking complete and child started to run, tell ptracer */
	if (unlikely(event))
		ptrace_event(event, child_nr);

	if (is_vfork) {
		/* block until the child signals the vfork completion */
		freezer_do_not_count();
		wait_for_completion(&vfork_wait);
		freezer_count();
		ptrace_event(PTRACE_EVENT_VFORK_DONE, child_nr);
	}

	return child_nr;
}
先看看相对简单的wake_up_new_task操作。在调用wake_up_new_task之前,copy_process已经完成了调度器相关的设置。
/*
 * wake_up_new_task - wake up a newly created task for the first time.
 * @p: the freshly created task.
 *
 * This function will do some initial scheduler statistics housekeeping
 * that must be done for every newly created context, then puts the task
 * on the runqueue and wakes it.
 *
 * The body runs with p->pi_lock held (interrupt state saved in flags) and
 * with the runqueue locked through __task_rq_lock(); task_rq_unlock() is
 * handed the runqueue, the task and the saved flags at the end.
 */
void wake_up_new_task(struct task_struct *p)
{
unsigned long flags;
struct rq *rq;
raw_spin_lock_irqsave(&p->pi_lock, flags);
#ifdef CONFIG_SMP
/*
 * Fork balancing, do it here and not earlier because:
 * - cpus_allowed can change in the fork path
 * - any previously selected cpu might disappear through hotplug
 */
/* pick a CPU with the SD_BALANCE_FORK policy and record it on the task */
set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
#endif
rq = __task_rq_lock(p);
activate_task(rq, p, 0); /* put the task on the runqueue */
p->on_rq = 1; /* flag: the task is now on a runqueue */
trace_sched_wakeup_new(p, true);
/* presumably checks whether the new task should preempt rq's current task */
check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
/* optional scheduling-class hook; called only when the class provides it */
if (p->sched_class->task_woken)
p->sched_class->task_woken(rq, p);
#endif
task_rq_unlock(rq, p, &flags);
}
可以看到,do_fork把所有麻烦的事情都放在了copy_process中,由copy_process完成task_struct中各种数据结构的复制操作。下面简单看一下copy_process的实现:
/*
 * This creates a new process as a copy of the old one,
 * but does not actually start it yet.
 *
 * It copies the registers, and all the appropriate
 * parts of the process environment (as per the clone
 * flags). The actual kick-off is left to the caller.
 *
 * @clone_flags, @stack_start, @regs, @stack_size and @child_tidptr have
 * the same meaning as in do_fork(), which forwards them unchanged.
 * @pid:   pid to use for the new task; unless it is &init_struct_pid a
 *         fresh one is allocated with alloc_pid() (do_fork() passes NULL).
 * @trace: ptrace event chosen by do_fork(); not used in the part of the
 *         function shown here.
 *
 * Failures seen in the visible part are reported as ERR_PTR(-EINVAL) for
 * conflicting flags, or by jumping to a fork_out / bad_fork_* label with
 * retval set.
 *
 * NOTE(review): this listing is truncated - the tail of the function,
 * including every cleanup label, is not shown.
 */
static struct task_struct *copy_process(unsigned long clone_flags,
unsigned long stack_start,
struct pt_regs *regs,
unsigned long stack_size,
int __user *child_tidptr,
struct pid *pid,
int trace)
{
int retval;
struct task_struct *p;
int cgroup_callbacks_done = 0;
/* flag conflict: CLONE_NEWNS cannot be combined with CLONE_FS */
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
/*
 * Thread groups must share signals as well, and detached threads
 * can only be started up within the thread group.
 */
/* i.e. CLONE_THREAD requires CLONE_SIGHAND */
if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
return ERR_PTR(-EINVAL);
/*
 * Shared signal handlers imply shared VM. By way of the above,
 * thread groups also imply shared VM. Blocking this case allows
 * for various simplifications in other code.
 */
/* i.e. CLONE_SIGHAND requires CLONE_VM */
if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
return ERR_PTR(-EINVAL);
/*
 * Siblings of global init remain as zombies on exit since they are
 * not reaped by their parent (swapper). To solve this and to avoid
 * multi-rooted process trees, prevent global and container-inits
 * from creating siblings.
 */
if ((clone_flags & CLONE_PARENT) &&
current->signal->flags & SIGNAL_UNKILLABLE)
return ERR_PTR(-EINVAL);
retval = security_task_create(clone_flags);
if (retval)
goto fork_out;
retval = -ENOMEM;
/*
 * Duplicate current's task_struct itself; the per-subsystem state is
 * handled by the copy_*() calls further down.
 */
p = dup_task_struct(current);
if (!p)
goto fork_out;
ftrace_graph_init_task(p);
rt_mutex_init_task(p); /* initialise the task's rt-mutex state */
#ifdef CONFIG_PROVE_LOCKING
DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
retval = -EAGAIN;
/*
 * Fail with -EAGAIN once the owning user's process count has reached
 * RLIMIT_NPROC, unless the caller has CAP_SYS_ADMIN or CAP_SYS_RESOURCE
 * or the user is INIT_USER.
 */
if (atomic_read(&p->real_cred->user->processes) >=
task_rlimit(p, RLIMIT_NPROC)) {
if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
p->real_cred->user != INIT_USER)
goto bad_fork_free;
}
current->flags &= ~PF_NPROC_EXCEEDED;
retval = copy_creds(p, clone_flags);
if (retval < 0)
goto bad_fork_free;
/*
 * If multiple threads are within copy_process(), then this check
 * triggers too late. This doesn't hurt, the check is only there
 * to stop root fork bombs.
 */
retval = -EAGAIN;
if (nr_threads >= max_threads)
goto bad_fork_cleanup_count;
if (!try_module_get(task_thread_info(p)->exec_domain->module))
goto bad_fork_cleanup_count;
/* initialise the child's own bookkeeping fields */
p->did_exec = 0;
delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
copy_flags(clone_flags, p);
INIT_LIST_HEAD(&p->children);
INIT_LIST_HEAD(&p->sibling);
rcu_copy_process(p);
p->vfork_done = NULL;
spin_lock_init(&p->alloc_lock);
init_sigpending(&p->pending);
/* CPU time accounting starts from zero for the child */
p->utime = cputime_zero;
p->stime = cputime_zero;
p->gtime = cputime_zero;
p->utimescaled = cputime_zero;
p->stimescaled = cputime_zero;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
p->prev_utime = cputime_zero;
p->prev_stime = cputime_zero;
#endif
#if defined(SPLIT_RSS_COUNTING)
memset(&p->rss_stat, 0, sizeof(p->rss_stat));
#endif
p->default_timer_slack_ns = current->timer_slack_ns;
task_io_accounting_init(&p->ioac);
acct_clear_integrals(p);
posix_cpu_timers_init(p);
/* record the start time; real_start_time is the boot-based variant */
do_posix_clock_monotonic_gettime(&p->start_time);
p->real_start_time = p->start_time;
monotonic_to_bootbased(&p->real_start_time);
p->io_context = NULL;
p->audit_context = NULL;
if (clone_flags & CLONE_THREAD)
threadgroup_fork_read_lock(current);
cgroup_fork(p);
#ifdef CONFIG_NUMA
p->mempolicy = mpol_dup(p->mempolicy);
if (IS_ERR(p->mempolicy)) {
retval = PTR_ERR(p->mempolicy);
p->mempolicy = NULL;
goto bad_fork_cleanup_cgroup;
}
mpol_fix_fork_child_flag(p);
#endif
#ifdef CONFIG_CPUSETS
p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
/* reset the irq-state tracing bookkeeping for the new task */
p->irq_events = 0;
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
p->hardirqs_enabled = 1;
#else
p->hardirqs_enabled = 0;
#endif
p->hardirq_enable_ip = 0;
p->hardirq_enable_event = 0;
p->hardirq_disable_ip = _THIS_IP_;
p->hardirq_disable_event = 0;
p->softirqs_enabled = 1;
p->softirq_enable_ip = _THIS_IP_;
p->softirq_enable_event = 0;
p->softirq_disable_ip = 0;
p->softirq_disable_event = 0;
p->hardirq_context = 0;
p->softirq_context = 0;
#endif
#ifdef CONFIG_LOCKDEP
p->lockdep_depth = 0; /* no locks held yet */
p->curr_chain_key = 0;
p->lockdep_recursion = 0;
#endif
#ifdef CONFIG_DEBUG_MUTEXES
p->blocked_on = NULL; /* not blocked yet */
#endif
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
p->memcg_batch.do_batch = 0;
p->memcg_batch.memcg = NULL;
#endif
/* Perform scheduler related setup. Assign this task to a CPU. */
sched_fork(p);
retval = perf_event_init_task(p);
if (retval)
goto bad_fork_cleanup_policy;
retval = audit_alloc(p);
if (retval)
goto bad_fork_cleanup_policy;
/*
 * copy all the process information; each failure jumps to the label
 * named after the last step that succeeded
 */
retval = copy_semundo(clone_flags, p);
if (retval)
goto bad_fork_cleanup_audit;
retval = copy_files(clone_flags, p); /* open file descriptors */
if (retval)
goto bad_fork_cleanup_semundo;
retval = copy_fs(clone_flags, p); /* filesystem information */
if (retval)
goto bad_fork_cleanup_files;
retval = copy_sighand(clone_flags, p); /* signal handlers */
if (retval)
goto bad_fork_cleanup_fs;
retval = copy_signal(clone_flags, p); /* signal state */
if (retval)
goto bad_fork_cleanup_sighand;
retval = copy_mm(clone_flags, p); /* virtual address space */
if (retval)
goto bad_fork_cleanup_signal;
retval = copy_namespaces(clone_flags, p); /* namespaces */
if (retval)
goto bad_fork_cleanup_mm;
retval = copy_io(clone_flags, p);
if (retval)
goto bad_fork_cleanup_namespaces;
retval = copy_thread(clone_flags, stack_start, stack_size, p, regs);
if (retval)
goto bad_fork_cleanup_io;
/* do_fork() passes pid == NULL, so this branch is taken on that path */
if (pid != &init_struct_pid) {
retval = -ENOMEM;
pid = alloc_pid(p->nsproxy->pid_ns); /* new pid in the child's pid namespace */
if (!pid)
goto bad_fork_cleanup_io;
}
/* set the child's ids: a thread inherits the caller's tgid */
p->pid = pid_nr(pid);
p->tgid = p->pid;
if (clone_flags & CLONE_THREAD)
p->tgid = current->tgid;
p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
/*
 * Clear TID on mm_release()?
 */
p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
/* the rest of the function is elided in this listing */
。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。
}
copy_process的主体操作就是复制若干数据结构,其中最麻烦的应该是复制虚拟地址空间,因为它牵涉到内存管理、缺页异常等。下面看一下到底复制了虚拟地址空间的哪些内容。由于copy_mm的主要操作都在dup_mm中,直接看dup_mm这个函数的具体实现:
/*
 * Allocate a new mm structure and copy contents from the
 * mm structure of the passed in task structure.
 *
 * @tsk: task the new mm is set up for; it is handed to mm_init() and
 *       init_new_context().  Note that the source of the copy is
 *       current->mm, not tsk->mm.
 *
 * Returns the new mm_struct, or NULL when current has no mm or when any
 * step fails.
 */
struct mm_struct *dup_mm(struct task_struct *tsk)
{
struct mm_struct *mm, *oldmm = current->mm;
int err;
if (!oldmm)
return NULL;
mm = allocate_mm(); /* allocate an mm_struct instance */
if (!mm)
goto fail_nomem;
memcpy(mm, oldmm, sizeof(*mm)); /* start from a byte copy of the parent's mm_struct */
mm_init_cpumask(mm);
/* Initializing for Swap token stuff */
mm->token_priority = 0;
mm->last_interval = 0;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
mm->pmd_huge_pte = NULL;
#endif
/*
 * Re-initialise the fields that must not be inherited.  Judging by the
 * mm_free_pgd() call on the fail_nocontext path this also allocates the
 * page global directory.  No cleanup is done on failure here -
 * presumably mm_init() releases mm itself; TODO confirm.
 */
if (!mm_init(mm, tsk))
goto fail_nomem;
if (init_new_context(tsk, mm))
goto fail_nocontext;
/*
 * The child keeps running the same executable as the parent, so the new
 * mm shares the parent's exe_file (presumably by taking a reference).
 */
dup_mm_exe_file(oldmm, mm);
err = dup_mmap(mm, oldmm); /* copy VMAs and user page tables: file and anonymous mappings */
if (err)
goto free_pt;
mm->hiwater_rss = get_mm_rss(mm); /* seed the RSS high-water mark */
mm->hiwater_vm = mm->total_vm; /* seed the virtual-size high-water mark */
/* pin the module of the binary-format handler, if there is one */
if (mm->binfmt && !try_module_get(mm->binfmt->module))
goto free_pt;
return mm;
free_pt:
/* don't put binfmt in mmput, we haven't got module yet */
mm->binfmt = NULL;
mmput(mm);
fail_nomem:
return NULL;
fail_nocontext:
/*
 * If init_new_context() failed, we cannot use mmput() to free the mm
 * because it calls destroy_context()
 */
mm_free_pgd(mm);
free_mm(mm);
return NULL;
}
真正的虚拟内存空间的复制在dup_mmap中,复制了父进程的文件映射和匿名映射。
/*
 * dup_mmap - duplicate the VMA list of @oldmm into @mm.
 * @mm:    destination mm (the caller, dup_mm(), created it as a copy of
 *         @oldmm).
 * @oldmm: source mm.
 *
 * Walks oldmm->mmap and, for every vm_area_struct not marked VM_DONTCOPY,
 * allocates a clone, links it into mm's VMA list and rbtree and copies the
 * page table entries with copy_page_range().  Both mmap_sem semaphores are
 * held for writing for the whole walk.
 *
 * Returns 0 on success, the error from ksm_fork(), khugepaged_fork() or
 * copy_page_range(), or -ENOMEM when an allocation or accounting check
 * fails.
 */
static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
{
struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
struct rb_node **rb_link, *rb_parent;
int retval;
unsigned long charge;
struct mempolicy *pol;
down_write(&oldmm->mmap_sem);
flush_cache_dup_mm(oldmm);
/*
 * Not linked in yet - no deadlock potential:
 */
down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
/* reset the per-mm VMA bookkeeping; the list and tree are rebuilt below */
mm->locked_vm = 0;
mm->mmap = NULL;
mm->mmap_cache = NULL;
mm->free_area_cache = oldmm->mmap_base;
mm->cached_hole_size = ~0UL;
mm->map_count = 0;
cpumask_clear(mm_cpumask(mm));
mm->mm_rb = RB_ROOT;
rb_link = &mm->mm_rb.rb_node;
rb_parent = NULL;
pprev = &mm->mmap;
retval = ksm_fork(mm, oldmm);
if (retval)
goto out;
retval = khugepaged_fork(mm, oldmm);
if (retval)
goto out;
prev = NULL;
/* walk the parent's VMA list and clone each entry */
for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
struct file *file;
/*
 * VM_DONTCOPY areas are skipped; their pages are subtracted from the
 * counters mm was given when it was created.
 */
if (mpnt->vm_flags & VM_DONTCOPY) {
long pages = vma_pages(mpnt);
mm->total_vm -= pages;
vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
-pages);
continue;
}
charge = 0;
/*
 * Accounted areas must pass the memory check first; the page count is
 * remembered in charge so fail_nomem can undo it.
 */
if (mpnt->vm_flags & VM_ACCOUNT) {
unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
if (security_vm_enough_memory(len))
goto fail_nomem;
charge = len;
}
tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); /* new vm_area_struct for the child */
if (!tmp)
goto fail_nomem;
*tmp = *mpnt; /* structure copy of the parent's VMA */
INIT_LIST_HEAD(&tmp->anon_vma_chain);
pol = mpol_dup(vma_policy(mpnt));
/*
 * NOTE(review): this value is overwritten with -ENOMEM on the
 * fail_nomem_policy path, which falls through to fail_nomem.
 */
retval = PTR_ERR(pol);
if (IS_ERR(pol))
goto fail_nomem_policy;
vma_set_policy(tmp, pol);
tmp->vm_mm = mm; /* the clone belongs to the child's mm */
/* set up the anon_vma state of the clone from the parent's VMA */
if (anon_vma_fork(tmp, mpnt))
goto fail_nomem_anon_vma_fork;
tmp->vm_flags &= ~VM_LOCKED; /* VM_LOCKED is not inherited */
tmp->vm_next = tmp->vm_prev = NULL;
file = tmp->vm_file;
/* a non-NULL vm_file means the VMA is a file mapping */
if (file) {
struct inode *inode = file->f_path.dentry->d_inode;
struct address_space *mapping = file->f_mapping; /* address space of the mapped file */
get_file(file); /* the clone holds its own reference on the file */
if (tmp->vm_flags & VM_DENYWRITE)
atomic_dec(&inode->i_writecount);
mutex_lock(&mapping->i_mmap_mutex);
if (tmp->vm_flags & VM_SHARED)
mapping->i_mmap_writable++; /* count one more shared mapping */
flush_dcache_mmap_lock(mapping);
/* insert tmp into the share list, just after mpnt */
/* presumably the per-file priority tree used for reverse mapping */
vma_prio_tree_add(tmp, mpnt);
flush_dcache_mmap_unlock(mapping);
mutex_unlock(&mapping->i_mmap_mutex);
}
/*
 * Clear hugetlb-related page reserves for children. This only
 * affects MAP_PRIVATE mappings. Faults generated by the child
 * are not guaranteed to succeed, even if read-only
 */
if (is_vm_hugetlb_page(tmp))
reset_vma_resv_huge_pages(tmp);
/*
 * Link in the new vma and copy the page table entries.
 */
/* append to the child's mmap list */
*pprev = tmp;
pprev = &tmp->vm_next;
tmp->vm_prev = prev;
prev = tmp;
__vma_link_rb(mm, tmp, rb_link, rb_parent); /* insert into the child's rbtree */
/* the next VMA will be linked as the right child of this one */
rb_link = &tmp->vm_rb.rb_right;
rb_parent = &tmp->vm_rb;
mm->map_count++;
retval = copy_page_range(mm, oldmm, mpnt); /* copy this VMA's page table entries into mm */
/* the open hook runs even when copy_page_range() failed, before bailing out */
if (tmp->vm_ops && tmp->vm_ops->open)
tmp->vm_ops->open(tmp);
if (retval)
goto out;
}
/* a new mm has just been created */
arch_dup_mmap(oldmm, mm);
retval = 0;
out:
/* common exit: release both semaphores, flushing the parent's TLB in between */
up_write(&mm->mmap_sem);
flush_tlb_mm(oldmm);
up_write(&oldmm->mmap_sem);
return retval;
/* error unwinding for the VMA being cloned; the labels fall through */
fail_nomem_anon_vma_fork:
mpol_put(pol);
fail_nomem_policy:
kmem_cache_free(vm_area_cachep, tmp);
fail_nomem:
retval = -ENOMEM;
vm_unacct_memory(charge);
goto out;
}