fork之旅
前言:本文以x86_64架构的代码作为参考
初探
编码过程中,我们想要使用linux的系统调用,就需要进行这样一条include
#include <unistd.h>
这个文件位于${LINUX_SOURCE}/include/asm-${arch}/目录下。内容大概分为以下几个部分
1.系统调用号的定义
根据操作系统课程设计的经验,用户态进程在系统调用阶段能完成的工作仅仅是陷入内核态,然后由内核完成工作。那么内核具体需要完成的功能就由系统调用号指定,这里能看到许多非常熟悉的名字
#define __NR_read 0
__SYSCALL(__NR_read, sys_read)
#define __NR_write 1
__SYSCALL(__NR_write, sys_write)
#define __NR_open 2
__SYSCALL(__NR_open, sys_open)
#define __NR_close 3
...
#define __NR_vserver 236
__SYSCALL(__NR_vserver, sys_ni_syscall)
#define __NR_syscall_max __NR_vserver
__SYSCALL
这个宏出自$LINUX_SOURCE/arch/x86_64/kernel/syscall.c,定义比较奇怪
#define __NO_STUBS
#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ;
#undef _ASM_X86_64_UNISTD_H_
#include <asm-x86_64/unistd.h>
#undef __SYSCALL
#define __SYSCALL(nr, sym) [ nr ] = sym,
#undef _ASM_X86_64_UNISTD_H_
typedef void (*sys_call_ptr_t)(void);
extern void sys_ni_syscall(void);
sys_call_ptr_t sys_call_table[__NR_syscall_max+1] __cacheline_aligned = {
/* Smells like a like a compiler bug -- it doesn't work when the & below is removed. */
[0 ... __NR_syscall_max] = &sys_ni_syscall,
#include <asm-x86_64/unistd.h>
};
怎么看都能看出来__SYSCALL这个宏被定义了两遍,unistd.h也相应地被include了两遍。根据我的理解,第一遍把每个__SYSCALL(nr, sym)展开成函数的extern声明,第二遍则把它展开成数组的指定初始化项[ nr ] = sym,。根据上面unistd.h中的内容,正好能对上号:
- 系统调用表(sys_call_table)是一个函数指针的数组
- 按照宏定义,所有__SYSCALL宏中的内容都会被改写为lambda,就像这个数组中预定义的第一个元素一样
- 数组静态定义中直接include的操作的确暴力,不过最终确实满足语法
这样的定义造成了如下结果:
- 所有lambda都被预设至
sys_ni_syscall
- 部分lambda被重新定向
这里sys_ni_syscall
是如此定义的,在${LINUX_SOURCE}/kernel/sys.c中
asmlinkage long sys_ni_syscall(void)
{
return -ENOSYS;
}
这个意思很明显,就是返回“无效系统调用”这个错误。
综上所述,这一部分的作用就是定义系统调用表,采用了巧妙的设计:
- 不同体系结构、不同编译参数可能有不同的系统调用,实现途径就是根据不同的宏,选择不同架构的unistd.h,装入系统调用表
- 系统调用表是定长的(__NR_syscall_max+1项,这里是237项),但是系统调用的数量可变,预先的lambda定义使得任意内核在使用任何合法范围内的系统调用号时,都不会出现不可预测的情况
2.系统调用的内核视图
什么意思呢?意思是这段宏定义的内容根本不是给人看的。。。
#ifndef __NO_STUBS
/* user-visible error numbers are in the range -1 - -4095 */
#define __syscall_clobber "r11","rcx","memory"
#define __syscall_return(type, res) \
do { \
if ((unsigned long)(res) >= (unsigned long)(-127)) { \
errno = -(res); \
res = -1; \
} \
return (type) (res); \
} while (0)
#ifndef __KERNEL_SYSCALLS__
#define __syscall "syscall"
#define _syscall0(type,name) \
type name(void) \
{ \
long __res; \
__asm__ volatile (__syscall \
: "=a" (__res) \
: "0" (__NR_##name) : __syscall_clobber ); \
__syscall_return(type,__res); \
}
#define _syscall1(type,name,type1,arg1) \
type name(type1 arg1) \
{ \
long __res; \
__asm__ volatile (__syscall \
: "=a" (__res) \
: "0" (__NR_##name),"D" ((long)(arg1)) : __syscall_clobber ); \
__syscall_return(type,__res); \
}
#define _syscall2(type,name,type1,arg1,type2,arg2) \
type name(type1 arg1,type2 arg2) \
{ \
long __res; \
__asm__ volatile (__syscall \
: "=a" (__res) \
: "0" (__NR_##name),"D" ((long)(arg1)),"S" ((long)(arg2)) : __syscall_clobber ); \
__syscall_return(type,__res); \
}
#define _syscall3(type,name,type1,arg1,type2,arg2,type3,arg3) \
type name(type1 arg1,type2 arg2,type3 arg3) \
{ \
long __res; \
__asm__ volatile (__syscall \
: "=a" (__res) \
: "0" (__NR_##name),"D" ((long)(arg1)),"S" ((long)(arg2)), \
"d" ((long)(arg3)) : __syscall_clobber); \
__syscall_return(type,__res); \
}
#define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \
type name (type1 arg1, type2 arg2, type3 arg3, type4 arg4) \
{ \
long __res; \
__asm__ volatile ("movq %5,%%r10 ;" __syscall \
: "=a" (__res) \
: "0" (__NR_##name),"D" ((long)(arg1)),"S" ((long)(arg2)), \
"d" ((long)(arg3)),"g" ((long)(arg4)) : __syscall_clobber,"r10" ); \
__syscall_return(type,__res); \
}
#define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
type5,arg5) \
type name (type1 arg1,type2 arg2,type3 arg3,type4 arg4,type5 arg5) \
{ \
long __res; \
__asm__ volatile ("movq %5,%%r10 ; movq %6,%%r8 ; " __syscall \
: "=a" (__res) \
: "0" (__NR_##name),"D" ((long)(arg1)),"S" ((long)(arg2)), \
"d" ((long)(arg3)),"g" ((long)(arg4)),"g" ((long)(arg5)) : \
__syscall_clobber,"r8","r10" ); \
__syscall_return(type,__res); \
}
#define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
type5,arg5,type6,arg6) \
type name (type1 arg1,type2 arg2,type3 arg3,type4 arg4,type5 arg5,type6 arg6) \
{ \
long __res; \
__asm__ volatile ("movq %5,%%r10 ; movq %6,%%r8 ; movq %7,%%r9" __syscall \
: "=a" (__res) \
: "0" (__NR_##name),"D" ((long)(arg1)),"S" ((long)(arg2)), \
"d" ((long)(arg3)),"g" ((long)(arg4)),"g" ((long)(arg5), \
"g" ((long)(arg6),) : \
__syscall_clobber,"r8","r10","r9" ); \
__syscall_return(type,__res); \
}
虽然艰涩难懂,还是简要梳理一下
- 这块代码都被#ifndef __NO_STUBS包着,而__NO_STUBS这个宏就是在上面syscall.c中定义的,所以,这些东西都不会被放到syscall.c里面
- 这一块代码是在__KERNEL_SYSCALLS__这个宏的控制范围下的内容,根据我的理解,是系统调用对内核的接口。原因很简单:都是一些嵌入式汇编,一看就是要和模式切换,中断开关打交道
- 这些宏的作用比较容易看出来:就是参数列表长度不同的系统调用的接口
3.系统调用的用户接口
这一部分比起上一部分友好了很多,一看就明白
#else /* __KERNEL_SYSCALLS__ */
/*
* we need this inline - forking from kernel space will result
* in NO COPY ON WRITE (!!!), until an execve is executed. This
* is no problem, but for the stack. This is handled by not letting
* main() use the stack at all after fork(). Thus, no function
* calls - which means inline code for fork too, as otherwise we
* would use the stack upon exit from 'fork()'.
*
* Actually only pause and fork are needed inline, so that there
* won't be any messing with the stack from main(), but we define
* some others too.
*/
#define __NR__exit __NR_exit
extern pid_t sys_setsid(void);
static inline pid_t setsid(void)
{
return sys_setsid();
}
long sys_write(int fd, const char *buf, size_t size);
static inline ssize_t write(unsigned int fd, char * buf, size_t count)
{
return sys_write(fd, buf, count);
}
extern ssize_t sys_read(unsigned int, char *, size_t);
static inline ssize_t read(unsigned int fd, char * buf, size_t count)
{
return sys_read(fd, buf, count);
}
extern off_t sys_lseek(unsigned int, off_t, unsigned int);
static inline off_t lseek(unsigned int fd, off_t offset, unsigned int origin)
{
return sys_lseek(fd, offset, origin);
}
extern long sys_dup(unsigned int);
static inline long dup(unsigned int fd)
{
return sys_dup(fd);
}
/* implemented in asm in arch/x86_64/kernel/entry.S */
extern long execve(char *, char **, char **);
extern long sys_open(const char *, int, int);
static inline long open(const char * filename, int flags, int mode)
{
return sys_open(filename, flags, mode);
}
extern long sys_close(unsigned int);
static inline long close(unsigned int fd)
{
return sys_close(fd);
}
extern long sys_exit(int) __attribute__((noreturn));
extern inline long exit(int error_code)
{
sys_exit(error_code);
}
struct rusage;
long sys_wait4(pid_t pid,unsigned int * stat_addr,
int options, struct rusage * ru);
static inline pid_t waitpid(int pid, int * wait_stat, int flags)
{
return sys_wait4(pid, wait_stat, flags, NULL);
}
#endif /* __KERNEL_SYSCALLS__ */
#endif /* __NO_STUBS */
/*
* "Conditional" syscalls
*
* What we want is __attribute__((weak,alias("sys_ni_syscall"))),
* but it doesn't work on all toolchains, so we just do it by hand
*/
#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall");
#endif
简要分析:
- 看起来,这里的所有系统调用函数的本体是分布在不同模块的代码中的,这里全部都是extern定义
- 另外,这里出现了我们最熟悉的系统调用的定义,而这一系列不过都是转发,真正的操作是由第二部分提到的接口进入内核,由内核完成的
总结
这一部分和fork还不太沾边,但至少我们知道了当我们调用fork的时候,究竟发生了什么?都是定义在C源文件里的函数,系统调用和一般API有什么区别?
下面我们将进一步深入,看看内核态下,fork都完成了什么工作
起点&终点
在之前的unistd.h中,我们在系统调用表中找到这样一行:
...
#define __NR_fork 57
__SYSCALL(__NR_fork, stub_fork)
...
fork的系统调用号被接到了stub_fork这个函数上。通览全文,stub_fork唯一的定义在:
${LINUX_SOURCE}/arch/x86_64/kernel/entry.S
.macro PTREGSCALL label,func
.globl \label
\label:
leaq \func(%rip),%rax
jmp ptregscall_common
.endm
PTREGSCALL stub_clone, sys_clone
PTREGSCALL stub_fork, sys_fork
PTREGSCALL stub_vfork, sys_vfork
PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend
PTREGSCALL stub_sigaltstack, sys_sigaltstack
PTREGSCALL stub_iopl, sys_iopl
ENTRY(ptregscall_common)
popq %r11
SAVE_REST
movq %r11, %r15
FIXUP_TOP_OF_STACK %r11
call *%rax
RESTORE_TOP_OF_STACK %r11
movq %r15, %r11
RESTORE_REST
pushq %r11
ret
而entry.S是系统调用入口,一切的起点
/*
* entry.S contains the system-call and fault low-level handling routines.
*
* NOTE: This code handles signal-recognition, which happens every time
* after an interrupt and after each system call.
*
* Normal syscalls and interrupts don't save a full stack frame, this is
* only done for syscall tracing, signals or fork/exec et.al.
*
* A note on terminology:
* - top of stack: Architecture defined interrupt frame from SS to RIP
* at the top of the kernel process stack.
* - partial stack frame: partially saved registers upto R11.
* - full stack frame: Like partial stack frame, but all register saved.
*
* TODO:
* - schedule it carefully for the final hardware.
*/
一上来就有这样一段注释,其中说明了一些需要保存全部堆栈帧的系统调用,fork就在其列。
紧接着就是大量的汇编,其中有一个宏ENTRY,定义在$LINUX_SOURCE/include/linux/linkage.h中:
#define __ALIGN .align 4,0x90
...
#define ENTRY(name) \
.globl name; \
ALIGN; \
name:
.globl
的意思应该是说将这个标签置为全局可见,所以说其他文件中的代码可以直接跳到ENTRY处
回过头来看上面包含stub_fork的部分,意思就比较清晰了:调用stub_fork的过程本质上是调用sys_fork这个地址的函数,只不过前面后面多加了一些栈操作。
风景区
不纠结于这大量的汇编,我们在${LINUX_SOURCE}/arch/x86_64/kernel/process.c处找到了sys_fork和它的好朋友们:
asmlinkage long sys_fork(struct pt_regs regs)
{
return do_fork(SIGCHLD, regs.rsp, &regs, 0, NULL, NULL);
}
asmlinkage long sys_clone(unsigned long clone_flags, unsigned long newsp, void *parent_tid, void *child_tid, struct pt_regs regs)
{
if (!newsp)
newsp = regs.rsp;
return do_fork(clone_flags & ~CLONE_IDLETASK, newsp, &regs, 0,
parent_tid, child_tid);
}
/*
* This is trivial, and on the face of it looks like it
* could equally well be done in user mode.
*
* Not so, for quite unobvious reasons - register pressure.
* In user mode vfork() cannot have a stack frame, and if
* done by calling the "clone()" system call directly, you
* do not have enough call-clobbered registers to hold all
* the information you need.
*/
asmlinkage long sys_vfork(struct pt_regs regs)
{
return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.rsp, &regs, 0,
NULL, NULL);
}
这个样子基本上就清楚了,这一系列和进程产生有关的函数,全部由do_fork这个函数实现。
而这个函数就是正儿八经声明在sched.h中,实现在fork.c中的
/* fork.c */
/*
* Ok, this is the main fork-routine.
*
* It copies the process, and if successful kick-starts
* it and waits for it to finish using the VM if required.
*/
long do_fork(unsigned long clone_flags,
unsigned long stack_start,
struct pt_regs *regs,
unsigned long stack_size,
int __user *parent_tidptr,
int __user *child_tidptr)
{
struct task_struct *p;
int trace = 0;
long pid;
if (unlikely(current->ptrace)) {
trace = fork_traceflag (clone_flags);
if (trace)
clone_flags |= CLONE_PTRACE;
}
p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr);
/*
* Do this prior waking up the new thread - the thread pointer
* might get invalid after that point, if the thread exits quickly.
*/
pid = IS_ERR(p) ? PTR_ERR(p) : p->pid;
if (!IS_ERR(p)) {
struct completion vfork;
if (clone_flags & CLONE_VFORK) {
p->vfork_done = &vfork;
init_completion(&vfork);
}
if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) {
/*
* We'll start up with an immediate SIGSTOP.
*/
sigaddset(&p->pending.signal, SIGSTOP);
set_tsk_thread_flag(p, TIF_SIGPENDING);
}
p->state = TASK_STOPPED;
if (!(clone_flags & CLONE_STOPPED))
wake_up_forked_process(p); /* do this last */
++total_forks;
if (unlikely (trace)) {
current->ptrace_message = pid;
ptrace_notify ((trace << 8) | SIGTRAP);
}
if (clone_flags & CLONE_VFORK) {
wait_for_completion(&vfork);
if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE))
ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
} else
/*
* Let the child process run first, to avoid most of the
* COW overhead when the child exec()s afterwards.
*/
set_need_resched();
}
return pid;
}
看一眼函数注释,我们就知道找对了地方。
1.do_fork参数分析
首先是调用参数的解析,结合上面fork系列函数的定义,我们可以分析出do_fork各个参数的意义
clone_flags:定义了do_fork操作中需要复制的内容,定义在sched.h中:
/* * cloning flags: */ #define CSIGNAL 0x000000ff /* signal mask to be sent at exit */ #define CLONE_VM 0x00000100 /* set if VM shared between processes */ #define CLONE_FS 0x00000200 /* set if fs info shared between processes */ #define CLONE_FILES 0x00000400 /* set if open files shared between processes */ #define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */ #define CLONE_IDLETASK 0x00001000 /* set if new pid should be 0 (kernel only)*/ #define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */ #define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */ #define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */ #define CLONE_THREAD 0x00010000 /* Same thread group? */ #define CLONE_NEWNS 0x00020000 /* New namespace group? */ #define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */ #define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */ #define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */ #define CLONE_CHILD_CLEARTID 0x00200000 /* clear the TID in the child */ #define CLONE_DETACHED 0x00400000 /* Not used - CLONE_THREAD implies detached uniquely */ #define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */ #define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */ #define CLONE_STOPPED 0x02000000 /* Start in stopped state */
意思都非常明确,注释说的一清二楚,那么我们回过头,看看fork,vfork和clone分别用了什么标识:
- sys_fork:SIGCHLD标识定义在signal.h中,值为17,并不在上述CLONE_*标识中,而是落在CSIGNAL(低8位)这个掩码里,表示子进程退出时发送给父进程的信号;除此之外,我们可以认为它什么特殊操作也没有采用
- sys_clone:这个操作基本上就是对do_fork进行了一个简单的转接,但是用户输入的flag中不能够包括CLONE_IDLETASK这个标识
- sys_vfork:在sys_fork的基础上增加了CLONE_VFORK,CLONE_VM两个标识。VFORK指定这次fork行为是VFORK,在收到子进程退出信号之前父进程会阻塞;VM标识则说明子进程和父进程共享同一内存空间
stack_start:按照字面意思就是栈的起点,看看fork和vfork,给出的值都是regs.rsp,意思就是堆栈指针寄存器,而clone给出的是用户输入的参数newsp(newsp为0时同样退回到regs.rsp)。这就告诉我们如果想要使用clone并指定newsp,需要手动为子进程申请栈空间
regs:这个意思也非常明显,看看参数类型就明白了——寄存器指针
stack_size:意思是栈大小,然而很有意思的是fork系列中该参数始终是0,==让人摸不着头脑==
parent/child_tid:虽然搞不明白tid是什么,但是前面那个__user宏能说明很多问题,定义在${LINUX_SOURCE}/include/linux/compiler.h中:
# define __user __attribute__((noderef, address_space(1))) # define __kernel /* default address space */
address_space这个名词非常的可疑,考虑一下执行这段代码的进程,他现在处于的是内核空间,所以__kernel这个宏什么都没有,而__user这个宏却有一些让人看不懂的东西。估计目的是说明这地址位于用户空间,而不是内核空间
2.do_fork内容分析
至此,参数我们已经分析完毕了,下面我们来仔细看看do_fork都做了些什么:
变量准备
用了这样一段代码,完成了需要贯穿整个函数体的变量的初始化工作:
struct task_struct *p;
int trace = 0;
long pid;
if (unlikely(current->ptrace)) {
trace = fork_traceflag (clone_flags);
if (trace)
clone_flags |= CLONE_PTRACE;
}
其中if代码块表明子进程根据父进程的要求设定自己的追踪信息标识,具体逻辑定义在fork.c的fork_traceflag中,如下:
static inline int fork_traceflag (unsigned clone_flags)
{
if (clone_flags & (CLONE_UNTRACED | CLONE_IDLETASK))
return 0;
else if (clone_flags & CLONE_VFORK) {
if (current->ptrace & PT_TRACE_VFORK)
return PTRACE_EVENT_VFORK;
} else if ((clone_flags & CSIGNAL) != SIGCHLD) {
if (current->ptrace & PT_TRACE_CLONE)
return PTRACE_EVENT_CLONE;
} else if (current->ptrace & PT_TRACE_FORK)
return PTRACE_EVENT_FORK;
return 0;
}
PCB的初始化
接下来通过copy_process函数,构造新的PCB,并把指针返回给变量p:
p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr);
copy_process的实现也在fork.c中,内容相当丰富:
/*
* This creates a new process as a copy of the old one,
* but does not actually start it yet.
*
* It copies the registers, and all the appropriate
* parts of the process environment (as per the clone
* flags). The actual kick-off is left to the caller.
*/
struct task_struct *copy_process(unsigned long clone_flags,
unsigned long stack_start,
struct pt_regs *regs,
unsigned long stack_size,
int __user *parent_tidptr,
int __user *child_tidptr)
{
int retval;
struct task_struct *p = NULL;
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
/*
* Thread groups must share signals as well, and detached threads
* can only be started up within the thread group.
*/
if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
return ERR_PTR(-EINVAL);
/*
* Shared signal handlers imply shared VM. By way of the above,
* thread groups also imply shared VM. Blocking this case allows
* for various simplifications in other code.
*/
if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
return ERR_PTR(-EINVAL);
/*
* CLONE_DETACHED must match CLONE_THREAD: it's a historical
* thing.
*/
if (!(clone_flags & CLONE_DETACHED) != !(clone_flags & CLONE_THREAD)) {
/* Warn about the old no longer supported case so that we see it */
if (clone_flags & CLONE_THREAD) {
static int count;
if (count < 5) {
count++;
printk(KERN_WARNING "%s trying to use CLONE_THREAD without CLONE_DETACH\n", current->comm);
}
}
return ERR_PTR(-EINVAL);
}
retval = security_task_create(clone_flags);
if (retval)
goto fork_out;
retval = -ENOMEM;
p = dup_task_struct(current);
if (!p)
goto fork_out;
retval = -EAGAIN;
if (atomic_read(&p->user->processes) >=
p->rlim[RLIMIT_NPROC].rlim_cur) {
if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
p->user != &root_user)
goto bad_fork_free;
}
atomic_inc(&p->user->__count);
atomic_inc(&p->user->processes);
/*
* If multiple threads are within copy_process(), then this check
* triggers too late. This doesn't hurt, the check is only there
* to stop root fork bombs.
*/
if (nr_threads >= max_threads)
goto bad_fork_cleanup_count;
if (!try_module_get(p->thread_info->exec_domain->module))
goto bad_fork_cleanup_count;
if (p->binfmt && !try_module_get(p->binfmt->module))
goto bad_fork_cleanup_put_domain;
#ifdef CONFIG_PREEMPT
/*
* schedule_tail drops this_rq()->lock so we compensate with a count
* of 1. Also, we want to start with kernel preemption disabled.
*/
p->thread_info->preempt_count = 1;
#endif
p->did_exec = 0;
p->state = TASK_UNINTERRUPTIBLE;
copy_flags(clone_flags, p);
if (clone_flags & CLONE_IDLETASK)
p->pid = 0;
else {
p->pid = alloc_pidmap();
if (p->pid == -1)
goto bad_fork_cleanup;
}
retval = -EFAULT;
if (clone_flags & CLONE_PARENT_SETTID)
if (put_user(p->pid, parent_tidptr))
goto bad_fork_cleanup;
p->proc_dentry = NULL;
INIT_LIST_HEAD(&p->run_list);
INIT_LIST_HEAD(&p->children);
INIT_LIST_HEAD(&p->sibling);
INIT_LIST_HEAD(&p->posix_timers);
init_waitqueue_head(&p->wait_chldexit);
p->vfork_done = NULL;
spin_lock_init(&p->alloc_lock);
spin_lock_init(&p->switch_lock);
spin_lock_init(&p->proc_lock);
clear_tsk_thread_flag(p, TIF_SIGPENDING);
init_sigpending(&p->pending);
p->it_real_value = p->it_virt_value = p->it_prof_value = 0;
p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0;
init_timer(&p->real_timer);
p->real_timer.data = (unsigned long) p;
p->leader = 0; /* session leadership doesn't inherit */
p->tty_old_pgrp = 0;
p->utime = p->stime = 0;
p->cutime = p->cstime = 0;
p->array = NULL;
p->lock_depth = -1; /* -1 = no lock */
p->start_time = get_jiffies_64();
p->security = NULL;
p->io_context = NULL;
retval = -ENOMEM;
if ((retval = security_task_alloc(p)))
goto bad_fork_cleanup;
/* copy all the process information */
if ((retval = copy_semundo(clone_flags, p)))
goto bad_fork_cleanup_security;
if ((retval = copy_files(clone_flags, p)))
goto bad_fork_cleanup_semundo;
if ((retval = copy_fs(clone_flags, p)))
goto bad_fork_cleanup_files;
if ((retval = copy_sighand(clone_flags, p)))
goto bad_fork_cleanup_fs;
if ((retval = copy_signal(clone_flags, p)))
goto bad_fork_cleanup_sighand;
if ((retval = copy_mm(clone_flags, p)))
goto bad_fork_cleanup_signal;
if ((retval = copy_namespace(clone_flags, p)))
goto bad_fork_cleanup_mm;
retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
if (retval)
goto bad_fork_cleanup_namespace;
p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
/*
* Clear TID on mm_release()?
*/
p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
/*
* Syscall tracing should be turned off in the child regardless
* of CLONE_PTRACE.
*/
clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
/* Our parent execution domain becomes current domain
These must match for thread signalling to apply */
p->parent_exec_id = p->self_exec_id;
/* ok, now we should be set up.. */
p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
p->pdeath_signal = 0;
/*
* Share the timeslice between parent and child, thus the
* total amount of pending timeslices in the system doesn't change,
* resulting in more scheduling fairness.
*/
local_irq_disable();
p->time_slice = (current->time_slice + 1) >> 1;
/*
* The remainder of the first timeslice might be recovered by
* the parent if the child exits early enough.
*/
p->first_time_slice = 1;
current->time_slice >>= 1;
p->timestamp = sched_clock();
if (!current->time_slice) {
/*
* This case is rare, it happens when the parent has only
* a single jiffy left from its timeslice. Taking the
* runqueue lock is not a problem.
*/
current->time_slice = 1;
preempt_disable();
scheduler_tick(0, 0);
local_irq_enable();
preempt_enable();
} else
local_irq_enable();
/*
* Ok, add it to the run-queues and make it
* visible to the rest of the system.
*
* Let it rip!
*/
p->tgid = p->pid;
p->group_leader = p;
INIT_LIST_HEAD(&p->ptrace_children);
INIT_LIST_HEAD(&p->ptrace_list);
/* Need tasklist lock for parent etc handling! */
write_lock_irq(&tasklist_lock);
/*
* Check for pending SIGKILL! The new thread should not be allowed
* to slip out of an OOM kill. (or normal SIGKILL.)
*/
if (sigismember(&current->pending.signal, SIGKILL)) {
write_unlock_irq(&tasklist_lock);
retval = -EINTR;
goto bad_fork_cleanup_namespace;
}
/* CLONE_PARENT re-uses the old parent */
if (clone_flags & CLONE_PARENT)
p->real_parent = current->real_parent;
else
p->real_parent = current;
p->parent = p->real_parent;
if (clone_flags & CLONE_THREAD) {
spin_lock(&current->sighand->siglock);
/*
* Important: if an exit-all has been started then
* do not create this new thread - the whole thread
* group is supposed to exit anyway.
*/
if (current->signal->group_exit) {
spin_unlock(&current->sighand->siglock);
write_unlock_irq(&tasklist_lock);
retval = -EAGAIN;
goto bad_fork_cleanup_namespace;
}
p->tgid = current->tgid;
p->group_leader = current->group_leader;
if (current->signal->group_stop_count > 0) {
/*
* There is an all-stop in progress for the group.
* We ourselves will stop as soon as we check signals.
* Make the new thread part of that group stop too.
*/
current->signal->group_stop_count++;
set_tsk_thread_flag(p, TIF_SIGPENDING);
}
spin_unlock(&current->sighand->siglock);
}
SET_LINKS(p);
if (p->ptrace & PT_PTRACED)
__ptrace_link(p, current->parent);
attach_pid(p, PIDTYPE_PID, p->pid);
if (thread_group_leader(p)) {
attach_pid(p, PIDTYPE_TGID, p->tgid);
attach_pid(p, PIDTYPE_PGID, process_group(p));
attach_pid(p, PIDTYPE_SID, p->session);
if (p->pid)
__get_cpu_var(process_counts)++;
} else
link_pid(p, p->pids + PIDTYPE_TGID, &p->group_leader->pids[PIDTYPE_TGID].pid);
nr_threads++;
write_unlock_irq(&tasklist_lock);
retval = 0;
fork_out:
if (retval)
return ERR_PTR(retval);
return p;
bad_fork_cleanup_namespace:
exit_namespace(p);
bad_fork_cleanup_mm:
exit_mm(p);
bad_fork_cleanup_signal:
exit_signal(p);
bad_fork_cleanup_sighand:
exit_sighand(p);
bad_fork_cleanup_fs:
exit_fs(p); /* blocking */
bad_fork_cleanup_files:
exit_files(p); /* blocking */
bad_fork_cleanup_semundo:
exit_sem(p);
bad_fork_cleanup_security:
security_task_free(p);
bad_fork_cleanup:
if (p->pid > 0)
free_pidmap(p->pid);
if (p->binfmt)
module_put(p->binfmt->module);
bad_fork_cleanup_put_domain:
module_put(p->thread_info->exec_domain->module);
bad_fork_cleanup_count:
atomic_dec(&p->user->processes);
free_uid(p->user);
bad_fork_free:
free_task(p);
goto fork_out;
}
我们把这些内容切分成几块来看:
错误检查:验证参数是否有效,由于这里是内核,出现了异常参数就不是段错误那么简单的问题了,所以必须严格检查:
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) return ERR_PTR(-EINVAL); /* * Thread groups must share signals as well, and detached threads * can only be started up within the thread group. */ if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND)) return ERR_PTR(-EINVAL); /* * Shared signal handlers imply shared VM. By way of the above, * thread groups also imply shared VM. Blocking this case allows * for various simplifications in other code. */ if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM)) return ERR_PTR(-EINVAL); /* * CLONE_DETACHED must match CLONE_THREAD: it's a historical * thing. */ if (!(clone_flags & CLONE_DETACHED) != !(clone_flags & CLONE_THREAD)) { /* Warn about the old no longer supported case so that we see it */ if (clone_flags & CLONE_THREAD) { static int count; if (count < 5) { count++; printk(KERN_WARNING "%s trying to use CLONE_THREAD without CLONE_DETACH\n", current->comm); } } return ERR_PTR(-EINVAL); }
其中,ERR_PTR这个函数定义在${LINUX_SOURCE}/include/linux/err.h中,内容十分简单
static inline void *ERR_PTR(long error) { return (void *) error; }
将错误号转化成指针类型返回
而printk是内核中使用的printf,功能基本上相近,此处不再展开
初始化新的数据结构,内容复制,检查进程数是否超限,更新用户信息
接下来的工作就像上面描述的顺序一样:
retval = security_task_create(clone_flags); if (retval) goto fork_out; retval = -ENOMEM; p = dup_task_struct(current); if (!p) goto fork_out; retval = -EAGAIN; if (atomic_read(&p->user->processes) >= p->rlim[RLIMIT_NPROC].rlim_cur) { if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && p->user != &root_user) goto bad_fork_free; } atomic_inc(&p->user->__count); atomic_inc(&p->user->processes);
同样,也是每个操作后都要检查合法性
security_task_create函数是对一个“方法”的转发,定义在${LINUX_SOURCE}/include/linux/security.h中,内容如下:
static inline int security_task_create (unsigned long clone_flags) { return security_ops->task_create (clone_flags); }
而security_ops这个结构体也定义在这个文件当中,成员全部是函数指针
dup_task_struct:真正完成了新PCB的创建,并复制了父进程(当前进程)PCB的内容,如下:
static struct task_struct *dup_task_struct(struct task_struct *orig) { struct task_struct *tsk; struct thread_info *ti; prepare_to_copy(orig); tsk = alloc_task_struct(); if (!tsk) return NULL; ti = alloc_thread_info(tsk); if (!ti) { free_task_struct(tsk); return NULL; } *ti = *orig->thread_info; *tsk = *orig; tsk->thread_info = ti; ti->task = tsk; /* One for us, one for whoever does the "release_task()" (usually parent) */ atomic_set(&tsk->usage,2); return tsk; }
- prepare_to_copy函数定义在process.c中,对父进程的PCB做了一系列置位操作
- alloc_task_struct函数在slab中为子进程申请一个新的PCB空间;同样的alloc_thread_info也在对应的slab中申请了一块新的空间。而对应的如果这些过程出现问题,就直接通过free_task_struct释放掉申请的空间
检查进程数是否溢出并检查模块可用性
/* * If multiple threads are within copy_process(), then this check * triggers too late. This doesn't hurt, the check is only there * to stop root fork bombs. */ if (nr_threads >= max_threads) goto bad_fork_cleanup_count; if (!try_module_get(p->thread_info->exec_domain->module)) goto bad_fork_cleanup_count; if (p->binfmt && !try_module_get(p->binfmt->module)) goto bad_fork_cleanup_put_domain;
这里的max_threads是一个全局变量,赋值在fork_init函数中,如下:
max_threads = mempages / (THREAD_SIZE/PAGE_SIZE) / 8;
看起来是根据内存大小决定的。
完成新PCB标志的复制与设置:
#ifdef CONFIG_PREEMPT /* * schedule_tail drops this_rq()->lock so we compensate with a count * of 1. Also, we want to start with kernel preemption disabled. */ p->thread_info->preempt_count = 1; #endif p->did_exec = 0; p->state = TASK_UNINTERRUPTIBLE; copy_flags(clone_flags, p); if (clone_flags & CLONE_IDLETASK) p->pid = 0; else { p->pid = alloc_pidmap(); if (p->pid == -1) goto bad_fork_cleanup; } retval = -EFAULT; if (clone_flags & CLONE_PARENT_SETTID) if (put_user(p->pid, parent_tidptr)) goto bad_fork_cleanup;
进程同步机制、元数据的初始化:
p->proc_dentry = NULL; INIT_LIST_HEAD(&p->run_list); INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); INIT_LIST_HEAD(&p->posix_timers); init_waitqueue_head(&p->wait_chldexit); p->vfork_done = NULL; spin_lock_init(&p->alloc_lock); spin_lock_init(&p->switch_lock); spin_lock_init(&p->proc_lock); clear_tsk_thread_flag(p, TIF_SIGPENDING); init_sigpending(&p->pending); p->it_real_value = p->it_virt_value = p->it_prof_value = 0; p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0; init_timer(&p->real_timer); p->real_timer.data = (unsigned long) p; p->leader = 0; /* session leadership doesn't inherit */ p->tty_old_pgrp = 0; p->utime = p->stime = 0; p->cutime = p->cstime = 0; p->array = NULL; p->lock_depth = -1; /* -1 = no lock */ p->start_time = get_jiffies_64(); p->security = NULL; p->io_context = NULL;
根据clone_flags为子进程的各个描述符复制:
retval = -ENOMEM; if ((retval = security_task_alloc(p))) goto bad_fork_cleanup; /* copy all the process information */ if ((retval = copy_semundo(clone_flags, p))) goto bad_fork_cleanup_security; if ((retval = copy_files(clone_flags, p))) goto bad_fork_cleanup_semundo; if ((retval = copy_fs(clone_flags, p))) goto bad_fork_cleanup_files; if ((retval = copy_sighand(clone_flags, p))) goto bad_fork_cleanup_fs; if ((retval = copy_signal(clone_flags, p))) goto bad_fork_cleanup_sighand; if ((retval = copy_mm(clone_flags, p))) goto bad_fork_cleanup_signal; if ((retval = copy_namespace(clone_flags, p))) goto bad_fork_cleanup_mm; retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); if (retval) goto bad_fork_cleanup_namespace;
这里出现的copy函数一般都采用如下策略:
- clone_flags中设置了相关的共享标识,则直接将父进程(当前进程)的相关描述符的地址返回
- 否则,为子进程重新建立一个描述符
父进程将自己的时间片分给子进程
/* * Share the timeslice between parent and child, thus the * total amount of pending timeslices in the system doesn't change, * resulting in more scheduling fairness. */ local_irq_disable(); p->time_slice = (current->time_slice + 1) >> 1; /* * The remainder of the first timeslice might be recovered by * the parent if the child exits early enough. */ p->first_time_slice = 1; current->time_slice >>= 1; p->timestamp = sched_clock(); if (!current->time_slice) { /* * This case is rare, it happens when the parent has only * a single jiffy left from its timeslice. Taking the * runqueue lock is not a problem. */ current->time_slice = 1; preempt_disable(); scheduler_tick(0, 0); local_irq_enable(); preempt_enable(); } else local_irq_enable();
- 涉及到时间片的分割,首先上来就把中断关了。local_irq_enable和local_irq_disable都是宏,内容是汇编的清位和置位
- 子进程如果结束的够快(一个时间片之内),父进程可以将first_time_slice域中的时间收回来。这是从注释中获得的信息,具体实现还要看进程结束的逻辑
- 如果父进程没有时间片了,就再给父进程一个时间片
- 打开中断
将新进程加入进程树
/* * Ok, add it to the run-queues and make it * visible to the rest of the system. * * Let it rip! */ p->tgid = p->pid; p->group_leader = p; INIT_LIST_HEAD(&p->ptrace_children); INIT_LIST_HEAD(&p->ptrace_list); /* Need tasklist lock for parent etc handling! */ write_lock_irq(&tasklist_lock); /* * Check for pending SIGKILL! The new thread should not be allowed * to slip out of an OOM kill. (or normal SIGKILL.) */ if (sigismember(&current->pending.signal, SIGKILL)) { write_unlock_irq(&tasklist_lock); retval = -EINTR; goto bad_fork_cleanup_namespace; } /* CLONE_PARENT re-uses the old parent */ if (clone_flags & CLONE_PARENT) p->real_parent = current->real_parent; else p->real_parent = current; p->parent = p->real_parent; if (clone_flags & CLONE_THREAD) { spin_lock(&current->sighand->siglock); /* * Important: if an exit-all has been started then * do not create this new thread - the whole thread * group is supposed to exit anyway. */ if (current->signal->group_exit) { spin_unlock(&current->sighand->siglock); write_unlock_irq(&tasklist_lock); retval = -EAGAIN; goto bad_fork_cleanup_namespace; } p->tgid = current->tgid; p->group_leader = current->group_leader; if (current->signal->group_stop_count > 0) { /* * There is an all-stop in progress for the group. * We ourselves will stop as soon as we check signals. * Make the new thread part of that group stop too. */ current->signal->group_stop_count++; set_tsk_thread_flag(p, TIF_SIGPENDING); } spin_unlock(&current->sighand->siglock); }
- 由于CLONE_PARENT的存在,新进程的父进程到底是谁还是个问题:如果该标识设置,那么父进程应该是当前进程的父进程。
- 同时,如果子进程设置了CLONE_THREAD标识,那么子进程将加入父进程的线程组
将新的PCB与系统中维护的PID数据结构联系起来
SET_LINKS(p); if (p->ptrace & PT_PTRACED) __ptrace_link(p, current->parent); attach_pid(p, PIDTYPE_PID, p->pid); if (thread_group_leader(p)) { attach_pid(p, PIDTYPE_TGID, p->tgid); attach_pid(p, PIDTYPE_PGID, process_group(p)); attach_pid(p, PIDTYPE_SID, p->session); if (p->pid) __get_cpu_var(process_counts)++; } else link_pid(p, p->pids + PIDTYPE_TGID, &p->group_leader->pids[PIDTYPE_TGID].pid); nr_threads++; write_unlock_irq(&tasklist_lock); retval = 0;
至此,copy_process完成,返回的PCB已经配置完毕,并与系统相关数据结构建立起了联系
收尾工作
等copy_process返回的时候,子进程已经基本建立完成了,下面的工作就是设置子进程状态,然后要求调度器重新调度。然而由于vfork的特殊性,在这个标识下还需要对父进程进行一系列操作
if (!IS_ERR(p)) {
struct completion vfork;
if (clone_flags & CLONE_VFORK) {
p->vfork_done = &vfork;
init_completion(&vfork);
}
if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) {
/*
* We'll start up with an immediate SIGSTOP.
*/
sigaddset(&p->pending.signal, SIGSTOP);
set_tsk_thread_flag(p, TIF_SIGPENDING);
}
p->state = TASK_STOPPED;
if (!(clone_flags & CLONE_STOPPED))
wake_up_forked_process(p); /* do this last */
++total_forks;
if (unlikely (trace)) {
current->ptrace_message = pid;
ptrace_notify ((trace << 8) | SIGTRAP);
}
if (clone_flags & CLONE_VFORK) {
wait_for_completion(&vfork);
if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE))
ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
} else
/*
* Let the child process run first, to avoid most of the
* COW overhead when the child exec()s afterwards.
*/
set_need_resched();
}
set_need_resched函数会设置父进程的TIF_NEED_RESCHED标识,下一次时钟中断的时候,父进程就会放弃CPU,让新建的子进程执行
总结
一次成功的fork系统调用的流程,可以用下面这张流程图简单描述
参考资料&辅助工具
- lxr
- sublime text 3
- CSDN博主JeanCheng的博文