大家好,我是程栩,一个专注于性能的大厂程序员,分享包括但不限于计算机体系结构、性能优化、云原生的知识。
引
前面我们介绍了一些关于进程的知识,今天我们来聊一聊进程是如何创建的。今天的内容基于《Linux内核设计与实现》以及Linux v6.3版本。
进程创建
许多操作系统都提供了产生进程的机制,Linux内核中,采取了组合的方式来实现这样的机制,通过fork
和exec
的组合,将进程的生成分为两个步骤:简单来说就是fork
负责生成一个进程,然后exec
读入可执行文件执行:
当然,以上只是简化的步骤。进程的创建并不是复制进程描述符即可,需要做许多细节的操作。
在内核中,通过kernel_clone
来实现fork
系统调用,而与fork
类似的系统调用,例如vfork
、__clone
等,都是通过给kernel_clone
传入不同的参数来实现:
// kernel/fork.c L2999
#ifdef __ARCH_WANT_SYS_FORK
/*
 * fork() system call entry point; only built when the architecture
 * defines __ARCH_WANT_SYS_FORK.
 *
 * SYSCALL_DEFINE0 declares a system call that takes no arguments.
 * With CONFIG_MMU the request is forwarded to kernel_clone() with no
 * clone flags and SIGCHLD as the exit signal; without CONFIG_MMU the
 * call fails with -EINVAL.
 */
SYSCALL_DEFINE0(fork)
{
#ifdef CONFIG_MMU
// Arguments for kernel_clone(): only exit_signal is set explicitly,
// so .flags stays 0 (nothing is requested to be shared).
struct kernel_clone_args args = {
.exit_signal = SIGCHLD,
};
// Delegate the actual work to kernel_clone() and return its result.
return kernel_clone(&args);
#else
/* can not support in nommu mode */
return -EINVAL;
#endif
}
#endif
#ifdef __ARCH_WANT_SYS_VFORK
/*
 * vfork() system call entry point; only built when the architecture
 * defines __ARCH_WANT_SYS_VFORK.
 *
 * Takes the same path as fork() but passes CLONE_VFORK | CLONE_VM:
 * CLONE_VM shares the VM with the child, and CLONE_VFORK makes
 * kernel_clone() wait on a completion (wait_for_vfork_done()) before
 * it returns to the parent.
 */
SYSCALL_DEFINE0(vfork)
{
struct kernel_clone_args args = {
.flags = CLONE_VFORK | CLONE_VM,
.exit_signal = SIGCHLD,
};
return kernel_clone(&args);
}
#endif
我们来看看kernel_clone
的实现:
// kernel/fork.c L2869
/*
* Ok, this is the main fork-routine.
*
* It copies the process, and if successful kick-starts
* it and waits for it to finish using the VM if required.
*
* args->exit_signal is expected to be checked for sanity by the caller.
*
* Returns the new task's pid number (nr) on success, -EINVAL for the
* rejected CLONE_PIDFD/CLONE_PARENT_SETTID combination below, or the
* error code carried by copy_process()'s ERR_PTR result.
*/
pid_t kernel_clone(struct kernel_clone_args *args)
{
u64 clone_flags = args->flags;
struct completion vfork;
struct pid *pid;
struct task_struct *p;
int trace = 0;
pid_t nr;
// Validate the flag combination (an argument sanity check, not a
// permission check).
/*
* For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument
* to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are
* mutually exclusive. With clone3() CLONE_PIDFD has grown a separate
* field in struct clone_args and it still doesn't make sense to have
* them both point at the same memory location. Performing this check
* here has the advantage that we don't need to have a separate helper
* to check for legacy clone().
*/
if ((args->flags & CLONE_PIDFD) &&
(args->flags & CLONE_PARENT_SETTID) &&
(args->pidfd == args->parent_tid))
return -EINVAL;
/*
* Determine whether and which event to report to ptracer. When
* called from kernel_thread or CLONE_UNTRACED is explicitly
* requested, no event is reported; otherwise, report if the event
* for the type of forking is enabled.
*/
if (!(clone_flags & CLONE_UNTRACED)) {
// Pick the event type: vfork, clone (exit signal other than
// SIGCHLD), or plain fork.
if (clone_flags & CLONE_VFORK)
trace = PTRACE_EVENT_VFORK;
else if (args->exit_signal != SIGCHLD)
trace = PTRACE_EVENT_CLONE;
else
trace = PTRACE_EVENT_FORK;
// Drop the event again if it is not enabled for the current task.
if (likely(!ptrace_event_enabled(current, trace)))
trace = 0;
}
// Build the new task with copy_process(); a failure comes back as an
// ERR_PTR and is turned into the returned error code.
p = copy_process(NULL, trace, NUMA_NO_NODE, args);
add_latent_entropy();
if (IS_ERR(p))
return PTR_ERR(p);
/*
* Do this prior waking up the new thread - the thread pointer
* might get invalid after that point, if the thread exits quickly.
*/
trace_sched_process_fork(current, p);
// Look up the child's struct pid (released by put_pid() below) and
// derive nr, the number that is returned to the caller.
pid = get_task_pid(p, PIDTYPE_PID);
nr = pid_vnr(pid);
// CLONE_PARENT_SETTID: also store nr at args->parent_tid via put_user().
if (clone_flags & CLONE_PARENT_SETTID)
put_user(nr, args->parent_tid);
// vfork: attach the on-stack completion to the child before it is
// woken; the parent waits on it further down.
// NOTE(review): get_task_struct() presumably keeps p valid for that
// later wait even if the child exits quickly - confirm.
if (clone_flags & CLONE_VFORK) {
p->vfork_done = &vfork;
init_completion(&vfork);
get_task_struct(p);
}
// Only when CONFIG_LRU_GEN is enabled and the VM is not shared
// (no CLONE_VM).
if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
/* lock the task to synchronize with memcg migration */
task_lock(p);
lru_gen_add_mm(p->mm);
task_unlock(p);
}
// Wake up the newly created task.
wake_up_new_task(p);
/* forking complete and child started to run, tell ptracer */
if (unlikely(trace))
ptrace_event_pid(trace, pid);
// vfork: wait on the completion; when wait_for_vfork_done() returns 0,
// report PTRACE_EVENT_VFORK_DONE.
if (clone_flags & CLONE_VFORK) {
if (!wait_for_vfork_done(p, &vfork))
ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
}
put_pid(pid);
return nr;
}
为了简化,我们尝试画一个简单的图:
copy_process
那么copy_process
所做的事情就成为了重中之重了,复制进程的时候到底复制了什么呢?我们来看:
// kernel/fork.c L2238
/*
* This creates a new process as a copy of the old one,
* but does not actually start it yet.
*
* It copies the registers, and all the appropriate
* parts of the process environment (as per the clone
* flags). The actual kick-off is left to the caller.
*/
__latent_entropy struct task_struct *copy_process(
struct pid *pid,
int trace,
int node,
struct kernel_clone_args *args)
该函数是一个非常长的函数(L2238-L2808),因为涉及到针对各种参数的处理。copy_process
的大致执行过程如下:
首先copy_process
会对传入的标志位组合进行各种合法性校验,如:
// kernel/fork.c L2304
if (clone_flags & CLONE_PIDFD) {
/*
* - CLONE_DETACHED is blocked so that we can potentially
* reuse it later for CLONE_PIDFD.
* - CLONE_THREAD is blocked until someone really needs it.
*/
if (clone_flags & (CLONE_DETACHED | CLONE_THREAD))
return ERR_PTR(-EINVAL);
}
完成校验后,copy_process
会进行信号的相关处理:
// kernel/fork.c L2314
/*
* Force any signals received before this point to be delivered
* before the fork happens. Collect up signals sent to multiple
* processes that happen during the fork and delay them so that
* they appear to happen after the fork.
*/
sigemptyset(&delayed.signal);
INIT_HLIST_NODE(&delayed.node);
spin_lock_irq(&current->sighand->siglock);
if (!(clone_flags & CLONE_THREAD))
hlist_add_head(&delayed.node, &current->signal->multiprocess);
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
retval = -ERESTARTNOINTR;
if (task_sigpending(current))
goto fork_out;
从注释中我们可以看出,这里会将fork
前收到的信号传送出去,而fork
执行过程中的信号则做一些延迟。
接着,copy_process
会调用dup_task_struct
为新进程创建内核栈、thread_info
等结构体,这时候子进程和父进程的进程描述符是完全一样的:
// kernel/fork.c L2333
p = dup_task_struct(current, node);
if (!p)
goto fork_out;
在执行完这一步后,子进程会设置部分flags
的值并进行诸多成员的清零和初始化:
// kernel/fork.c L2336
p->flags &= ~PF_KTHREAD;
if (args->kthread)
p->flags |= PF_KTHREAD;
if (args->user_worker)
p->flags |= PF_USER_WORKER;
if (args->io_thread) {
/*
* Mark us an IO worker, and block any signal that isn't
* fatal or STOP
*/
p->flags |= PF_IO_WORKER;
siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
}
if (args->name)
strscpy_pad(p->comm, args->name, sizeof(p->comm));
p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
/*
* Clear TID on mm_release()?
*/
p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL;
之后会将该进程分配到某个CPU上去:
// kernel/fork.c L2476
/* Perform scheduler related setup. Assign this task to a CPU. */
retval = sched_fork(clone_flags, p);
if (retval)
goto bad_fork_cleanup_policy;
retval = perf_event_init_task(p, clone_flags);
if (retval)
goto bad_fork_cleanup_policy;
retval = audit_alloc(p);
if (retval)
goto bad_fork_cleanup_perf;
接着,根据传递给kernel_clone
的参数,copy_process
拷贝或者共享打开的文件、文件系统信息等内容:
// kernel/fork.c L2487
/* copy all the process information */
shm_init_task(p);
retval = security_task_alloc(p, clone_flags);
if (retval)
goto bad_fork_cleanup_audit;
retval = copy_semundo(clone_flags, p);
if (retval)
goto bad_fork_cleanup_security;
retval = copy_files(clone_flags, p, args->no_files);
if (retval)
goto bad_fork_cleanup_semundo;
retval = copy_fs(clone_flags, p);
if (retval)
goto bad_fork_cleanup_files;
retval = copy_sighand(clone_flags, p);
if (retval)
goto bad_fork_cleanup_fs;
retval = copy_signal(clone_flags, p);
if (retval)
goto bad_fork_cleanup_sighand;
retval = copy_mm(clone_flags, p);
if (retval)
goto bad_fork_cleanup_signal;
retval = copy_namespaces(clone_flags, p);
if (retval)
goto bad_fork_cleanup_mm;
retval = copy_io(clone_flags, p);
if (retval)
goto bad_fork_cleanup_namespaces;
retval = copy_thread(p, args);
if (retval)
goto bad_fork_cleanup_io;
接着,调用alloc_pid
为新进程分配一个有效的pid
:
// kernel/fork.c L2525
if (pid != &init_struct_pid) {
pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid,
args->set_tid_size);
if (IS_ERR(pid)) {
retval = PTR_ERR(pid);
goto bad_fork_cleanup_thread;
}
}
最后,copy_process
做一些扫尾的工作并返回相应的指针。
在阅读《Linux内核设计与实现》一书过程中,其在这里讲解的进程创建过程与笔者记录的并不完全一致。简单的说,copy_process
就是对当前进程做了一个复制,并且基于传入的参数对这个进程描述符做或多或少的修改,在以一个新的pid
作为进程的标记之后就返回。
接着,我们就需要尽可能的让子进程优先于父进程运行。一般子进程在执行之后就会立刻调用exec
函数,如果我们让子进程先运行的话,就可以避免写时拷贝的额外开销;而如果父进程先执行,则可能立马就会向地址空间写入,从而触发不必要的写时拷贝。
线程创建
首先我们需要知道,在Linux中,我们并没有对线程thread
做更细节的描述,而是把线程看成是一个特殊的进程来实现。**也即线程是一个与其他进程共享某些资源的进程。**而在线程创建的过程中,也就自然而然的复用了进程创建的过程,只不过在传入的参数上有所区别:
// kernel/fork.c L2964
/*
* Create a kernel thread.
*
* fn/arg:  function the new thread runs and its argument; passed on
*          through args.fn / args.fn_arg.
* name:    passed on through args.name (copy_process() copies it into
*          the task's comm when it is non-NULL).
* flags:   only the low 32 bits are used. The CSIGNAL bits (low byte)
*          are split off and become the exit signal; the remaining
*          bits, with CLONE_VM and CLONE_UNTRACED always added, become
*          the clone flags.
*
* .kthread = 1 marks the request as a kernel thread (copy_process()
* sets PF_KTHREAD for it). Returns whatever kernel_clone() returns.
*/
pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
unsigned long flags)
{
struct kernel_clone_args args = {
.flags = ((lower_32_bits(flags) | CLONE_VM |
CLONE_UNTRACED) & ~CSIGNAL),
.exit_signal = (lower_32_bits(flags) & CSIGNAL),
.fn = fn,
.fn_arg = arg,
.name = name,
.kthread = 1,
};
return kernel_clone(&args);
}
/*
* Create a user mode thread.
*
* Builds the same arguments as kernel_thread() except that neither
* .name nor .kthread is set: the low 32 bits of flags are used, the
* CSIGNAL bits become the exit signal, and CLONE_VM | CLONE_UNTRACED
* are always added to the clone flags. fn/arg are passed on through
* args.fn / args.fn_arg. Returns whatever kernel_clone() returns.
*/
pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
struct kernel_clone_args args = {
.flags = ((lower_32_bits(flags) | CLONE_VM |
CLONE_UNTRACED) & ~CSIGNAL),
.exit_signal = (lower_32_bits(flags) & CSIGNAL),
.fn = fn,
.fn_arg = arg,
};
return kernel_clone(&args);
}
可以看到,无论是内核线程还是用户线程,都是通过调用kernel_clone
来进行实现的。这里的诸如CLONE_VM
、CLONE_UNTRACED
等标志用来告诉内核新任务与父进程共享哪些内容、以及创建时采取何种行为,例如CLONE_VM
就是指父子共享地址空间。相关参数定义可以在include/uapi/linux/sched.h
中找到:
// include/uapi/linux/sched.h L7
/*
* cloning flags:
*/
#define CSIGNAL 0x000000ff /* signal mask to be sent at exit */
#define CLONE_VM 0x00000100 /* set if VM shared between processes */
#define CLONE_FS 0x00000200 /* set if fs info shared between processes */
#define CLONE_FILES 0x00000400 /* set if open files shared between processes */
#define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */
#define CLONE_PIDFD 0x00001000 /* set if a pidfd should be placed in parent */
#define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */
#define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */
#define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */
#define CLONE_THREAD 0x00010000 /* Same thread group? */
#define CLONE_NEWNS 0x00020000 /* New mount namespace group */
#define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */
#define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */
#define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */
#define CLONE_CHILD_CLEARTID 0x00200000 /* clear the TID in the child */
#define CLONE_DETACHED 0x00400000 /* Unused, ignored */
#define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */
#define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */
#define CLONE_NEWCGROUP 0x02000000 /* New cgroup namespace */
#define CLONE_NEWUTS 0x04000000 /* New utsname namespace */
#define CLONE_NEWIPC 0x08000000 /* New ipc namespace */
#define CLONE_NEWUSER 0x10000000 /* New user namespace */
#define CLONE_NEWPID 0x20000000 /* New pid namespace */
#define CLONE_NEWNET 0x40000000 /* New network namespace */
#define CLONE_IO 0x80000000 /* Clone io context */
/* Flags for the clone3() syscall. */
#define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */
#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */
/*
* cloning flags intersect with CSIGNAL so can be used with unshare and clone3
* syscalls only:
*/
#define CLONE_NEWTIME 0x00000080 /* New time namespace */
值得注意的是,内核的内核线程是在kernel/kthread.c
中实现的,但是其底层也是调用我们前面说的kernel_thread
函数:
// kernel/kthread.c L394
/*
 * Spawn the kernel thread described by @create.
 *
 * Calls kernel_thread() with kthread as the thread function, @create as
 * its argument, create->full_name as the name and
 * CLONE_FS | CLONE_FILES | SIGCHLD as flags. With CONFIG_NUMA,
 * create->node is first recorded in current->pref_node_fork.
 *
 * Nothing more is done here on success. On failure (pid < 0) the name
 * is freed and the error is either reported through create->result and
 * create->done, or - if nobody is waiting any more - @create itself is
 * freed.
 */
static void create_kthread(struct kthread_create_info *create)
{
int pid;
#ifdef CONFIG_NUMA
current->pref_node_fork = create->node;
#endif
/* We want our own signal handler (we take no signals by default). */
pid = kernel_thread(kthread, create, create->full_name,
CLONE_FS | CLONE_FILES | SIGCHLD);
if (pid < 0) {
/* Release the structure when caller killed by a fatal signal. */
// Atomically take over the completion pointer, leaving NULL behind.
struct completion *done = xchg(&create->done, NULL);
kfree(create->full_name);
// done == NULL: the pointer had already been cleared, so free
// @create here and return.
if (!done) {
kfree(create);
return;
}
// Otherwise hand the error to the waiter and signal the completion.
create->result = ERR_PTR(pid);
complete(done);
}
}
小结
今天我们结合书和代码大致的学习了Linux中进程和线程的创建,接下来我们将会介绍进程的终结过程,敬请期待。
小结如下:
(原文此处为小结配图,图片转存失败,未能显示。)