深入解析Linux内核：进程与线程的创建机制-CSDN博客

本文链接：https://blog.csdn.net/weixin_43079395/article/details/130495702

大家好，我是程栩，一个专注于性能的大厂程序员，分享包括但不限于计算机体系结构、性能优化、云原生的知识。

引

前面我们介绍了一些关于进程的知识，今天我们来聊一聊进程是如何创建的。今天的内容基于《Linux内核设计与实现》以及Linux v6.3版本。

进程创建

许多操作系统都提供了产生进程的机制，Linux内核中，采取了组合的方式来实现这样的机制，通过fork和exec的组合，将进程的生成分为两个步骤：简单来说就是fork负责生成一个进程，然后exec读入可执行文件执行：

fork与exec的简化过程

当然，以上只是简化的步骤。进程的创建并不是复制进程描述符即可，需要做许多细节的操作。

在内核中，通过kernel_clone来实现fork系统调用，而与fork类似的系统调用，例如vfork、clone、clone3等，都是通过给kernel_clone传入不同的参数来实现：

// kernel/fork.c L2999
#ifdef __ARCH_WANT_SYS_FORK
// Define the zero-argument fork system call via the SYSCALL_DEFINE0 macro.
SYSCALL_DEFINE0(fork)
{
#ifdef CONFIG_MMU
    // Build the kernel_clone arguments: only the exit signal is set, no clone flags.
	struct kernel_clone_args args = {
		.exit_signal = SIGCHLD,
	}; 
	// Hand off to kernel_clone; its return value is returned from the syscall.
	return kernel_clone(&args);
#else
	/* can not support in nommu mode */
	return -EINVAL;
#endif
}
#endif

#ifdef __ARCH_WANT_SYS_VFORK
/*
 * Zero-argument vfork system call: same path as fork (kernel_clone with
 * exit_signal = SIGCHLD), but additionally passes CLONE_VFORK | CLONE_VM
 * in the flags.
 */
SYSCALL_DEFINE0(vfork)
{
	struct kernel_clone_args args = {
		.flags		= CLONE_VFORK | CLONE_VM,
		.exit_signal	= SIGCHLD,
	};

	return kernel_clone(&args);
}
#endif

我们来看看kernel_clone的实现：

// kernel/fork.c L2869
/*
 *  Ok, this is the main fork-routine.
 *
 * It copies the process, and if successful kick-starts
 * it and waits for it to finish using the VM if required.
 *
 * args->exit_signal is expected to be checked for sanity by the caller.
 *
 * Returns the value of pid_vnr() for the new task's pid on success, or the
 * error code carried by copy_process()'s ERR_PTR (or -EINVAL for the
 * argument check below) on failure.
 */
pid_t kernel_clone(struct kernel_clone_args *args)
{
	u64 clone_flags = args->flags;
	struct completion vfork;
	struct pid *pid;
	struct task_struct *p;
	int trace = 0;
	pid_t nr;
	// Sanity-check the argument combination (a flag check, not a permission check).
	/*
	 * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument
	 * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are
	 * mutually exclusive. With clone3() CLONE_PIDFD has grown a separate
	 * field in struct clone_args and it still doesn't make sense to have
	 * them both point at the same memory location. Performing this check
	 * here has the advantage that we don't need to have a separate helper
	 * to check for legacy clone().
	 */
	if ((args->flags & CLONE_PIDFD) &&
	    (args->flags & CLONE_PARENT_SETTID) &&
	    (args->pidfd == args->parent_tid))
		return -EINVAL;
	
	/*
	 * Determine whether and which event to report to ptracer.  When
	 * called from kernel_thread or CLONE_UNTRACED is explicitly
	 * requested, no event is reported; otherwise, report if the event
	 * for the type of forking is enabled.
	 */
	if (!(clone_flags & CLONE_UNTRACED)) {
		if (clone_flags & CLONE_VFORK)
			trace = PTRACE_EVENT_VFORK;
		else if (args->exit_signal != SIGCHLD)
			trace = PTRACE_EVENT_CLONE;
		else
			trace = PTRACE_EVENT_FORK;

		if (likely(!ptrace_event_enabled(current, trace)))
			trace = 0;
	}
	// Create the new task via copy_process; failure is reported as an ERR_PTR.
	p = copy_process(NULL, trace, NUMA_NO_NODE, args);
	add_latent_entropy();

	if (IS_ERR(p))
		return PTR_ERR(p);

	/*
	 * Do this prior waking up the new thread - the thread pointer
	 * might get invalid after that point, if the thread exits quickly.
	 */
	trace_sched_process_fork(current, p);

	pid = get_task_pid(p, PIDTYPE_PID);
	nr = pid_vnr(pid);

	/* Store the new id at the user-supplied parent_tid location if requested. */
	if (clone_flags & CLONE_PARENT_SETTID)
		put_user(nr, args->parent_tid);

	/* For vfork, set up the on-stack completion that is waited on below. */
	if (clone_flags & CLONE_VFORK) {
		p->vfork_done = &vfork;
		init_completion(&vfork);
		get_task_struct(p);
	}

	if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
		/* lock the task to synchronize with memcg migration */
		task_lock(p);
		lru_gen_add_mm(p->mm);
		task_unlock(p);
	}
	// Wake up the newly created task.
	// NOTE(review): whether the child actually runs before the parent is
	// decided by the scheduler and is not established by this code.
	wake_up_new_task(p);

	/* forking complete and child started to run, tell ptracer */
	if (unlikely(trace))
		ptrace_event_pid(trace, pid);

	if (clone_flags & CLONE_VFORK) {
		if (!wait_for_vfork_done(p, &vfork))
			ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
	}

	/* Drop the pid reference taken by get_task_pid() above. */
	put_pid(pid);
	return nr;
}

为了简化，我们尝试画一个简单的图：

kernel_clone

copy_process

那么copy_process所做的事情就成为了重中之重了，复制进程的时候到底复制了什么呢？我们来看：

// kernel/fork.c L2238
/*
 * This creates a new process as a copy of the old one,
 * but does not actually start it yet.
 *
 * It copies the registers, and all the appropriate
 * parts of the process environment (as per the clone
 * flags). The actual kick-off is left to the caller.
 */
__latent_entropy struct task_struct *copy_process(
					struct pid *pid,
					int trace,
					int node,
					struct kernel_clone_args *args)

该函数是一个非常长的函数（L2238-L2808），因为涉及到针对各种参数的处理。copy_process的大致执行过程如下：

首先copy_process会进行各种权限的校验，如：

// kernel/fork.c L2304
if (clone_flags & CLONE_PIDFD) {
		/*
		 * - CLONE_DETACHED is blocked so that we can potentially
		 *   reuse it later for CLONE_PIDFD.
		 * - CLONE_THREAD is blocked until someone really needs it.
		 */
		if (clone_flags & (CLONE_DETACHED | CLONE_THREAD))
			return ERR_PTR(-EINVAL);
	}

完成校验后，copy_process会进行信号的相关处理：

// kernel/fork.c L2314
	/*
	 * Force any signals received before this point to be delivered
	 * before the fork happens.  Collect up signals sent to multiple
	 * processes that happen during the fork and delay them so that
	 * they appear to happen after the fork.
	 */
	sigemptyset(&delayed.signal);
	INIT_HLIST_NODE(&delayed.node);

	spin_lock_irq(&current->sighand->siglock);
	if (!(clone_flags & CLONE_THREAD))
		hlist_add_head(&delayed.node, &current->signal->multiprocess);
	recalc_sigpending();
	spin_unlock_irq(&current->sighand->siglock);
	retval = -ERESTARTNOINTR;
	if (task_sigpending(current))
		goto fork_out;

从注释中我们可以看出，这里会将fork前收到的信号传送出去，而fork执行过程中的信号则做一些延迟。

接着，copy_process会调用dup_task_struct为新进程创建内核栈、thread_info等结构体，这时候子进程和父进程的进程描述符是完全一样的：

// kernel/fork.c L2333
	p = dup_task_struct(current, node);
	if (!p)
		goto fork_out;

在执行完这一步后，子进程会设置部分flags的值并进行诸多成员的清零和初始化：

// kernel/fork.c L2336
	p->flags &= ~PF_KTHREAD;
	if (args->kthread)
		p->flags |= PF_KTHREAD;
	if (args->user_worker)
		p->flags |= PF_USER_WORKER;
	if (args->io_thread) {
		/*
		 * Mark us an IO worker, and block any signal that isn't
		 * fatal or STOP
		 */
		p->flags |= PF_IO_WORKER;
		siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
	}

	if (args->name)
		strscpy_pad(p->comm, args->name, sizeof(p->comm));

	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
	/*
	 * Clear TID on mm_release()?
	 */
	p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL;

之后会将该进程分配到某个CPU上去：

// kernel/fork.c L2476
	/* Perform scheduler related setup. Assign this task to a CPU. */
	retval = sched_fork(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_policy;

	retval = perf_event_init_task(p, clone_flags);
	if (retval)
		goto bad_fork_cleanup_policy;
	retval = audit_alloc(p);
	if (retval)
		goto bad_fork_cleanup_perf;

接着，根据传递给kernel_clone的参数，copy_process拷贝或者共享打开的文件、文件系统信息等内容：

// kernel/fork.c L2487
	/* copy all the process information */
	shm_init_task(p);
	retval = security_task_alloc(p, clone_flags);
	if (retval)
		goto bad_fork_cleanup_audit;
	retval = copy_semundo(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_security;
	retval = copy_files(clone_flags, p, args->no_files);
	if (retval)
		goto bad_fork_cleanup_semundo;
	retval = copy_fs(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_files;
	retval = copy_sighand(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_fs;
	retval = copy_signal(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_sighand;
	retval = copy_mm(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_signal;
	retval = copy_namespaces(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_mm;
	retval = copy_io(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_namespaces;
	retval = copy_thread(p, args);
	if (retval)
		goto bad_fork_cleanup_io;

接着，调用alloc_pid为新进程分配一个有效的pid：

// kernel/fork.c L2525
	if (pid != &init_struct_pid) {
		pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid,
				args->set_tid_size);
		if (IS_ERR(pid)) {
			retval = PTR_ERR(pid);
			goto bad_fork_cleanup_thread;
		}
	}

最后，copy_process做一些扫尾的工作并返回相应的指针。

在阅读《Linux内核设计与实现》一书过程中，其在这里讲解的进程创建过程与笔者记录的并不完全一致。简单的说，copy_process就是对当前进程做了一个复制，并且基于传入的参数对这个进程描述符做或多或少的修改，在以一个新的pid作为进程的标记之后就返回。

接着，我们就需要尽可能的让子进程优先于父进程运行。一般子进程在执行之后就会立刻调用exec函数，如果我们让子进程先运行的话，就可以避免写时拷贝的额外开销；而如果父进程先执行，则可能立马就会向地址空间写入，从而触发不必要的写时拷贝。

线程创建

首先我们需要知道，在Linux中，我们并没有对线程thread做更细节的描述，而是把线程看成是一个特殊的进程来实现。**也即线程是一个与其他进程共享某些资源的进程。**而在线程创建的过程中，也就自然而然的复用了进程创建的过程，只不过在传入的参数上有所区别：

// kernel/fork.c L2964
/*
 * Create a kernel thread.
 *
 * Builds a kernel_clone_args from the caller's arguments and passes it to
 * kernel_clone():
 *  - the low 32 bits of @flags, with CLONE_VM and CLONE_UNTRACED forced on
 *    and the CSIGNAL bits masked off, become .flags;
 *  - the CSIGNAL bits of @flags become .exit_signal;
 *  - @fn, @arg and @name are stored in .fn, .fn_arg and .name;
 *  - .kthread is set to 1.
 *
 * Returns whatever kernel_clone() returns.
 */
pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
		    unsigned long flags)
{
	struct kernel_clone_args args = {
		.flags		= ((lower_32_bits(flags) | CLONE_VM |
				    CLONE_UNTRACED) & ~CSIGNAL),
		.exit_signal	= (lower_32_bits(flags) & CSIGNAL),
		.fn		= fn,
		.fn_arg		= arg,
		.name		= name,
		.kthread	= 1,
	};

	return kernel_clone(&args);
}

/*
 * Create a user mode thread.
 *
 * Same argument construction as kernel_thread() — low 32 bits of @flags
 * with CLONE_VM | CLONE_UNTRACED forced on and CSIGNAL masked off as .flags,
 * the CSIGNAL bits as .exit_signal, @fn/@arg as .fn/.fn_arg — except that
 * no .name is passed and .kthread is left unset.
 *
 * Returns whatever kernel_clone() returns.
 */
pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
	struct kernel_clone_args args = {
		.flags		= ((lower_32_bits(flags) | CLONE_VM |
				    CLONE_UNTRACED) & ~CSIGNAL),
		.exit_signal	= (lower_32_bits(flags) & CSIGNAL),
		.fn		= fn,
		.fn_arg		= arg,
	};

	return kernel_clone(&args);
}

可以看到，无论是内核线程还是用户线程，都是通过调用kernel_clone来进行实现的。这里的诸如CLONE_VM、CLONE_UNTRACED等标志都是来告诉内核到底这个线程共享了哪些内容的，例如CLONE_VM就是指父子共享地址空间。相关参数定义可以在include/uapi/linux/sched.h中找到：

// include/uapi/linux/sched.h L7
/*
 * cloning flags:
 */
#define CSIGNAL		0x000000ff	/* signal mask to be sent at exit */
#define CLONE_VM	0x00000100	/* set if VM shared between processes */
#define CLONE_FS	0x00000200	/* set if fs info shared between processes */
#define CLONE_FILES	0x00000400	/* set if open files shared between processes */
#define CLONE_SIGHAND	0x00000800	/* set if signal handlers and blocked signals shared */
#define CLONE_PIDFD	0x00001000	/* set if a pidfd should be placed in parent */
#define CLONE_PTRACE	0x00002000	/* set if we want to let tracing continue on the child too */
#define CLONE_VFORK	0x00004000	/* set if the parent wants the child to wake it up on mm_release */
#define CLONE_PARENT	0x00008000	/* set if we want to have the same parent as the cloner */
#define CLONE_THREAD	0x00010000	/* Same thread group? */
#define CLONE_NEWNS	0x00020000	/* New mount namespace group */
#define CLONE_SYSVSEM	0x00040000	/* share system V SEM_UNDO semantics */
#define CLONE_SETTLS	0x00080000	/* create a new TLS for the child */
#define CLONE_PARENT_SETTID	0x00100000	/* set the TID in the parent */
#define CLONE_CHILD_CLEARTID	0x00200000	/* clear the TID in the child */
#define CLONE_DETACHED		0x00400000	/* Unused, ignored */
#define CLONE_UNTRACED		0x00800000	/* set if the tracing process can't force CLONE_PTRACE on this clone */
#define CLONE_CHILD_SETTID	0x01000000	/* set the TID in the child */
#define CLONE_NEWCGROUP		0x02000000	/* New cgroup namespace */
#define CLONE_NEWUTS		0x04000000	/* New utsname namespace */
#define CLONE_NEWIPC		0x08000000	/* New ipc namespace */
#define CLONE_NEWUSER		0x10000000	/* New user namespace */
#define CLONE_NEWPID		0x20000000	/* New pid namespace */
#define CLONE_NEWNET		0x40000000	/* New network namespace */
#define CLONE_IO		0x80000000	/* Clone io context */

/* Flags for the clone3() syscall. */
#define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */
#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */

/*
 * cloning flags intersect with CSIGNAL so can be used with unshare and clone3
 * syscalls only:
 */
#define CLONE_NEWTIME	0x00000080	/* New time namespace */

值得注意的是，内核的内核线程是在kernel/kthread.c中实现的，但是其底层也是调用我们前面说的kernel_thread函数：

// kernel/kthread.c L394
/*
 * Spawn the thread described by @create by calling kernel_thread() with
 * kthread as the entry function, @create as its argument,
 * create->full_name as the name and CLONE_FS | CLONE_FILES | SIGCHLD as
 * the flags.
 *
 * Nothing more is done here on success. On failure (negative pid) the
 * full_name buffer is freed and the error is either reported through
 * create->result / create->done, or, if create->done has already been
 * cleared, @create itself is freed.
 */
static void create_kthread(struct kthread_create_info *create)
{
	int pid;

#ifdef CONFIG_NUMA
	current->pref_node_fork = create->node;
#endif
	/* We want our own signal handler (we take no signals by default). */
	pid = kernel_thread(kthread, create, create->full_name,
			    CLONE_FS | CLONE_FILES | SIGCHLD);
	if (pid < 0) {
		/* Release the structure when caller killed by a fatal signal. */
		/* xchg atomically takes create->done and leaves NULL in its place. */
		struct completion *done = xchg(&create->done, NULL);

		kfree(create->full_name);
		/* done was already NULL: nobody is left to notify, so free @create here. */
		if (!done) {
			kfree(create);
			return;
		}
		/* Otherwise hand the error to the waiter and signal completion. */
		create->result = ERR_PTR(pid);
		complete(done);
	}
}

小结

今天我们结合书和代码大致的学习了Linux中进程和线程的创建，接下来我们将会介绍进程的终结过程，敬请期待。

小结如下：

[外链图片转存中...(img-tzqyV0WM-1683206712322)]

原来进程是这样创建的

引

进程创建

copy_process

线程创建

小结