进程原理及其系统调用

苦梨甜

已于 2023-05-14 20:13:04 修改

阅读量862

点赞数 1

分类专栏： Linux驱动 # 内核模块文章标签： linux 架构驱动开发 arm开发数据结构

于 2022-11-29 20:30:49 首次发布

本文链接：https://blog.csdn.net/weixin_52849254/article/details/128103396

版权

Linux驱动同时被 2 个专栏收录

103 篇文章 14 订阅

订阅专栏

内核模块

21 篇文章 3 订阅

订阅专栏

进程原理及其系统调用

文章目录

进程原理及其系统调用
什么是进程
进程的四要素
进程生命周期
task_struct数据结构
进程优先级
进程系统调用

什么是进程

操作系统作为硬件的使用层，提供使用硬件资源的能力，进程作为操作系统使用层，提供使用操作系统抽象出的资源层的能力。

进程：是指计算机中已运行的程序。进程本身不是基本的运行单位，而是线程的容器。程序本身只是指令、数据及其组织形式的描述，进程才是程序（那些指令和数据）的真正运行实例。

进程的四要素

有一段程序供其执行；
有进程专用的系统堆栈空间；
在内核中有task_struct数据结构；
进程有独立的存储空间，拥有专有的用户空间。

如果具备前面三条的话而缺少第四条，那就称为"线程"。
如果完全没有用户空间，就称为"内核线程"；如果共享用户空间则就称为"用户线程"。

进程生命周期

Linux操作系统属于多任务操作系统，系统中的每个进程能够分时复用CPU时间片，通过有效的进程调度策略实现多任务并行执行。而进程在被CPU调度运行、等待CPU资源分配以及等待外部事件时会处于不同的状态。进程之间的状态关系：
运行：该进程此刻正在执行。
等待：进程能够运行，但没有得到许可，因为CPU分配给了另一个进程。调度器可以在下一次任务切换时选择该进程。
睡眠：进程正在睡眠无法运行，因为它在等待一个外部事件。调度器无法在下一次任务切换时选择该进程。
在这里插入图片描述

task_struct数据结构

Linux内核涉及进程和程序的所有算法都围绕一个名为task_struct的数据结构建立，该结构定义在include/linux/sched.h中。这是系统中主要的一个结构。在阐述调度器的实现之前，了解一下Linux管理进程的方式是很有必要的。

task_struct包含很多成员，将进程与各个内核子系统联系，task_struct定义如下：

/*
 * Process descriptor: the structure around which the kernel's process
 * handling is built. It links a task to the individual kernel subsystems
 * (scheduler, memory management, signals, files, namespaces, ...).
 * Many groups of fields only exist when the matching CONFIG_* option is set.
 */
struct task_struct {//process descriptor
	volatile long state;	/* task state: -1 unrunnable, 0 runnable, >0 stopped */
	void *stack;//pointer to the task's kernel stack
	atomic_t usage;//usage count: how many users currently reference this structure
	unsigned int flags;	/* per process flags, defined below */
	unsigned int ptrace;//ptrace flags; ptrace is the system call used for breakpoint debugging and tracing a task's execution

#ifdef CONFIG_SMP
	struct llist_node wake_entry;
	int on_cpu;
	struct task_struct *last_wakee;
	unsigned long wakee_flips;
	unsigned long wakee_flip_decay_ts;

	int wake_cpu;
#endif
	int on_rq;//presumably whether the task is on a run queue (scheduler bookkeeping) - TODO confirm

	int prio, static_prio, normal_prio;//scheduling priorities
	unsigned int rt_priority;//real-time priority

	//scheduler class and per-class scheduling entities
	const struct sched_class *sched_class;
	struct sched_entity se;
	struct sched_rt_entity rt;
#ifdef CONFIG_CGROUP_SCHED
	struct task_group *sched_task_group;
#endif
	struct sched_dl_entity dl;

#ifdef CONFIG_PREEMPT_NOTIFIERS
	/* list of struct preempt_notifier: */
	struct hlist_head preempt_notifiers;
#endif

//block device I/O tracing
#ifdef CONFIG_BLK_DEV_IO_TRACE
	unsigned int btrace_seq;
#endif
	//scheduling policy and CPU affinity fields
	unsigned int policy;
	int nr_cpus_allowed;
	cpumask_t cpus_allowed;
	
//RCU synchronization primitive
#ifdef CONFIG_PREEMPT_RCU
	int rcu_read_lock_nesting;
	union rcu_special rcu_read_unlock_special;
	struct list_head rcu_node_entry;
#endif /* #ifdef CONFIG_PREEMPT_RCU */
#ifdef CONFIG_PREEMPT_RCU
	struct rcu_node *rcu_blocked_node;
#endif /* #ifdef CONFIG_PREEMPT_RCU */
#ifdef CONFIG_TASKS_RCU
	unsigned long rcu_tasks_nvcsw;
	bool rcu_tasks_holdout;
	struct list_head rcu_tasks_holdout_list;
	int rcu_tasks_idle_cpu;
#endif /* #ifdef CONFIG_TASKS_RCU */

#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
	struct sched_info sched_info;
#endif

//linkage of this task on the task lists
	struct list_head tasks;
#ifdef CONFIG_SMP
	struct plist_node pushable_tasks;
	struct rb_node pushable_dl_tasks;
#endif

/* address space of the task; every process has its own address space (e.g. 4G on 32-bit x86) */
	struct mm_struct *mm, *active_mm;
#ifdef CONFIG_COMPAT_BRK
	unsigned brk_randomized:1;
#endif
	/* per-thread vma caching */
	u32 vmacache_seqnum;
	struct vm_area_struct *vmacache[VMACACHE_SIZE];
#if defined(SPLIT_RSS_COUNTING)
	struct task_rss_stat	rss_stat;
#endif

//exit status and related state
/* task state */
	int exit_state;
	int exit_code, exit_signal;
	int pdeath_signal;  /* The signal sent when the parent dies  */
	unsigned int jobctl;	/* JOBCTL_*, siglock protected */

	/* Used for emulating ABI behavior of previous Linux versions */
	unsigned int personality;

	unsigned in_execve:1;	/* Tell the LSMs that the process is doing an
				 * execve */
	unsigned in_iowait:1;

	/* Revert to default priority/policy when forking */
	unsigned sched_reset_on_fork:1;
	unsigned sched_contributes_to_load:1;

#ifdef CONFIG_MEMCG_KMEM
	unsigned memcg_kmem_skip_account:1;
#endif

	unsigned long atomic_flags; /* Flags needing atomic access. */

	struct restart_block restart_block;

	pid_t pid;//process ID of this task
	pid_t tgid;//thread group ID (presumably the PID of the thread group leader)

#ifdef CONFIG_CC_STACKPROTECTOR
	/* Canary value for the -fstack-protector gcc feature */
	unsigned long stack_canary;//used to detect kernel stack overflow/corruption
#endif
	/*
	 * pointers to (original) parent process, youngest child, younger sibling,
	 * older sibling, respectively.  (p->father can be replaced with
	 * p->real_parent->pid)
	 */
	struct task_struct __rcu *real_parent; /* real parent process */
	struct task_struct __rcu *parent; /* recipient of SIGCHLD, wait4() reports */
	/*
	 * children/sibling forms the list of my natural children
	 */
	
	struct list_head children;	 //head of the list of child tasks
	/* list of my children */
	struct list_head sibling;	/* linkage in my parent's children list */
	struct task_struct *group_leader;	/* threadgroup leader */

	/*
	 * ptraced is the list of tasks this task is using ptrace on.
	 * This includes both natural children and PTRACE_ATTACH targets.
	 * p->ptrace_entry is p's link on the p->parent->ptraced list.
	 */
	struct list_head ptraced;//tasks this task is tracing with ptrace
	struct list_head ptrace_entry;

	/* PID/PID hash table linkage. */
	struct pid_link pids[PIDTYPE_MAX];//links this task into the PID hash tables
	struct list_head thread_group;
	struct list_head thread_node;

	//fields used by the fork family of calls
	struct completion *vfork_done;		/* for vfork() */
	int __user *set_child_tid;		/* CLONE_CHILD_SETTID */
	int __user *clear_child_tid;		/* CLONE_CHILD_CLEARTID */

	//CPU time accounting
	//utime: time spent executing in user mode
	//stime: time spent executing in kernel mode

	cputime_t utime, stime, utimescaled, stimescaled;
	cputime_t gtime;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
	struct cputime prev_cputime;
#endif
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
	seqlock_t vtime_seqlock;
	unsigned long long vtime_snap;
	enum {
		VTIME_SLEEPING = 0,
		VTIME_USER,
		VTIME_SYS,
	} vtime_snap_whence;
#endif
	unsigned long nvcsw, nivcsw; /* context switch counts */
	u64 start_time;		/* monotonic time in nsec */
	u64 real_start_time;	/* boot based time in nsec */
/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
	unsigned long min_flt, maj_flt;

	struct task_cputime cputime_expires;
	struct list_head cpu_timers[3];

/* process credentials */
	const struct cred __rcu *real_cred; /* objective and real subjective task
					 * credentials (COW) */
	const struct cred __rcu *cred;	/* effective (overridable) subjective task
					 * credentials (COW) */
	char comm[TASK_COMM_LEN]; /* executable name excluding path
				     - access with [gs]et_task_comm (which lock
				       it with task_lock())
				     - initialized normally by setup_new_exec */
/* file system info */
	int link_count, total_link_count;
#ifdef CONFIG_SYSVIPC
/* ipc stuff */
	struct sysv_sem sysvsem;
	struct sysv_shm sysvshm;
#endif
#ifdef CONFIG_DETECT_HUNG_TASK
/* hung task detection */
	unsigned long last_switch_count;
#endif
/* CPU-specific state of this task */
	struct thread_struct thread;
/* filesystem information */
	struct fs_struct *fs;
/* open file information */
	struct files_struct *files;
/* namespaces */
	struct nsproxy *nsproxy;
/* signal handlers */
	struct signal_struct *signal;
	struct sighand_struct *sighand;

	sigset_t blocked, real_blocked;
	sigset_t saved_sigmask;	/* restored if set_restore_sigmask() was used */
	struct sigpending pending;

	unsigned long sas_ss_sp;
	size_t sas_ss_size;
	int (*notifier)(void *priv);
	void *notifier_data;
	sigset_t *notifier_mask;
	struct callback_head *task_works;

	struct audit_context *audit_context;
#ifdef CONFIG_AUDITSYSCALL
	kuid_t loginuid;
	unsigned int sessionid;
#endif
	struct seccomp seccomp;

/* Thread group tracking */
   	u32 parent_exec_id;
   	u32 self_exec_id;
/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,
 * mempolicy */
	spinlock_t alloc_lock;

	/* Protection of the PI data structures: */
	raw_spinlock_t pi_lock;

#ifdef CONFIG_RT_MUTEXES
	/* PI waiters blocked on a rt_mutex held by this task */
	struct rb_root pi_waiters;
	struct rb_node *pi_waiters_leftmost;
	/* Deadlock detection and priority inheritance handling */
	struct rt_mutex_waiter *pi_blocked_on;
#endif

#ifdef CONFIG_DEBUG_MUTEXES
	/* mutex deadlock detection */
	struct mutex_waiter *blocked_on;
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
	unsigned int irq_events;
	unsigned long hardirq_enable_ip;
	unsigned long hardirq_disable_ip;
	unsigned int hardirq_enable_event;
	unsigned int hardirq_disable_event;
	int hardirqs_enabled;
	int hardirq_context;
	unsigned long softirq_disable_ip;
	unsigned long softirq_enable_ip;
	unsigned int softirq_disable_event;
	unsigned int softirq_enable_event;
	int softirqs_enabled;
	int softirq_context;
#endif
#ifdef CONFIG_LOCKDEP
# define MAX_LOCK_DEPTH 48UL
	u64 curr_chain_key;
	int lockdep_depth;
	unsigned int lockdep_recursion;
	struct held_lock held_locks[MAX_LOCK_DEPTH];
	gfp_t lockdep_reclaim_gfp;
#endif

/* journalling filesystem info */
	void *journal_info;//journalling filesystem information

/* stacked block device info */
	struct bio_list *bio_list;//bio list for stacked block devices

#ifdef CONFIG_BLOCK
/* stack plugging */
	struct blk_plug *plug;
#endif

/* VM state *///virtual memory state
	struct reclaim_state *reclaim_state;//memory reclaim state

	struct backing_dev_info *backing_dev_info;//block device I/O traffic information

	struct io_context *io_context;//information used by the I/O scheduler

	unsigned long ptrace_message;
	siginfo_t *last_siginfo; /* For ptrace use.  */
	struct task_io_accounting ioac;//per-task I/O accounting counters
#if defined(CONFIG_TASK_XACCT)
	u64 acct_rss_mem1;	/* accumulated rss usage */
	u64 acct_vm_mem1;	/* accumulated virtual memory usage */
	cputime_t acct_timexpd;	/* stime + utime since last update */
#endif
#ifdef CONFIG_CPUSETS
	nodemask_t mems_allowed;	/* Protected by alloc_lock */
	seqcount_t mems_allowed_seq;	/* Seqence no to catch updates */
	int cpuset_mem_spread_rotor;
	int cpuset_slab_spread_rotor;
#endif
#ifdef CONFIG_CGROUPS
	/* Control Group info protected by css_set_lock */
	struct css_set __rcu *cgroups;
	/* cg_list protected by css_set_lock and tsk->alloc_lock */
	struct list_head cg_list;
#endif
//futex synchronization mechanism
#ifdef CONFIG_FUTEX
	struct robust_list_head __user *robust_list;
#ifdef CONFIG_COMPAT
	struct compat_robust_list_head __user *compat_robust_list;
#endif
	struct list_head pi_state_list;
	struct futex_pi_state *pi_state_cache;
#endif

//perf events (Performance Event) support
#ifdef CONFIG_PERF_EVENTS
	struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
	struct mutex perf_event_mutex;
	struct list_head perf_event_list;
#endif
#ifdef CONFIG_DEBUG_PREEMPT
	unsigned long preempt_disable_ip;
#endif
#ifdef CONFIG_NUMA
	struct mempolicy *mempolicy;	/* Protected by alloc_lock */
	short il_next;
	short pref_node_fork;
#endif
#ifdef CONFIG_NUMA_BALANCING
	int numa_scan_seq;
	unsigned int numa_scan_period;
	unsigned int numa_scan_period_max;
	int numa_preferred_nid;
	unsigned long numa_migrate_retry;
	u64 node_stamp;			/* migration stamp  */
	u64 last_task_numa_placement;
	u64 last_sum_exec_runtime;
	struct callback_head numa_work;

	struct list_head numa_entry;
	struct numa_group *numa_group;

	/*
	 * numa_faults is an array split into four regions:
	 * faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer
	 * in this precise order.
	 *
	 * faults_memory: Exponential decaying average of faults on a per-node
	 * basis. Scheduling placement decisions are made based on these
	 * counts. The values remain static for the duration of a PTE scan.
	 * faults_cpu: Track the nodes the process was running on when a NUMA
	 * hinting fault was incurred.
	 * faults_memory_buffer and faults_cpu_buffer: Record faults per node
	 * during the current scan window. When the scan completes, the counts
	 * in faults_memory and faults_cpu decay and these values are copied.
	 */
	unsigned long *numa_faults;
	unsigned long total_numa_faults;

	/*
	 * numa_faults_locality tracks if faults recorded during the last
	 * scan window were remote/local or failed to migrate. The task scan
	 * period is adapted based on the locality of the faults with different
	 * weights depending on whether they were shared or private faults
	 */
	unsigned long numa_faults_locality[3];

	unsigned long numa_pages_migrated;
#endif /* CONFIG_NUMA_BALANCING */

	struct rcu_head rcu;//RCU head

	/*
	 * cache last used pipe for splice
	 */
	struct pipe_inode_info *splice_pipe;//pipe cached for splice

	struct page_frag task_frag;


//delay accounting
#ifdef	CONFIG_TASK_DELAY_ACCT
	struct task_delay_info *delays;
#endif
#ifdef CONFIG_FAULT_INJECTION
	int make_it_fail;
#endif
	/*
	 * when (nr_dirtied >= nr_dirtied_pause), it's time to call
	 * balance_dirty_pages() for some dirty throttling pause
	 */
	int nr_dirtied;
	int nr_dirtied_pause;
	unsigned long dirty_paused_when; /* start of a write-and-pause period */

#ifdef CONFIG_LATENCYTOP
	int latency_record_count;
	struct latency_record latency_record[LT_SAVECOUNT];
#endif
	/*
	 * time slack values; these are used to round up poll() and
	 * select() etc timeout values. These are in nanoseconds.
	 */
	unsigned long timer_slack_ns;
	unsigned long default_timer_slack_ns;

#ifdef CONFIG_KASAN
	unsigned int kasan_depth;
#endif
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
	/* Index of current stored address in ret_stack */
	int curr_ret_stack;
	/* Stack of return addresses for return function tracing */
	struct ftrace_ret_stack	*ret_stack;
	/* time stamp for last schedule */
	unsigned long long ftrace_timestamp;
	/*
	 * Number of functions that haven't been traced
	 * because of depth overrun.
	 */
	atomic_t trace_overrun;
	/* Pause for the tracing */
	atomic_t tracing_graph_pause;
#endif
#ifdef CONFIG_TRACING
	/* state flags for use by tracers */
	unsigned long trace;
	/* bitmask and counter of trace recursion */
	unsigned long trace_recursion;
#endif /* CONFIG_TRACING */
#ifdef CONFIG_MEMCG
	struct memcg_oom_info {
		struct mem_cgroup *memcg;
		gfp_t gfp_mask;
		int order;
		unsigned int may_oom:1;
	} memcg_oom;
#endif
#ifdef CONFIG_UPROBES
	struct uprobe_task *utask;
#endif
#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE)
	unsigned int	sequential_io;
	unsigned int	sequential_io_avg;
#endif
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
	unsigned long	task_state_change;
#endif
};

进程优先级

并非所有进程都具有相同的重要性。除了大多数我们所熟悉的进程优先级之外，进程还有不同的关键度类别，以满足不同需求。首先进行比较粗糙的划分，进程可以分为实时进程(0~99)和非实时进程(普通进程100-139)：

在这里插入图片描述

进程系统调用

讨论fork和exec系列系统调用的实现。通常这些调用不是由应用程序直接发出的，而是通过一个中间层调用，即负责与内核通信的C标准库。从用户态切换到核心态的方法，依不同的体系结构而各有不同：
在这里插入图片描述

进程复制

传统的UNIX中用于复制进程的系统调用是fork。但它并不是Linux为此实现的唯一调用，实际上Linux实现了3个。

fork是重量级调用，因为它建立了父进程的一个完整副本，然后作为子进程执行。为减少与该调用相关的工作量，Linux使用了写时复制（copy-on-write）技术。
vfork类似于fork，但并不创建父进程数据的副本。相反，父子进程之间共享数据。
这节省了大量CPU时间（如果一个进程操纵共享数据，则另一个会自动注意到）。
clone产生线程，可以对父子进程之间的共享、复制进行精确控制。

写时复制（copy-on-write）

内核使用了写时复制（Copy-On-Write，COW）技术，以防止在fork执行时将父进程的所有数据复制到子进程。如果不采用该技术，在调用fork时，内核通常要对父进程的每个内存页都为子进程创建一个相同的副本。
在这里插入图片描述
问题：如果父进程修改了其中页z的数据会怎样？此时父子进程在内存上就会发生分离。

只有在不得不复制数据内容时才去复制数据内容，这就是写时复制的核心思想。可以看到，因为修改页z，子进程不得不去复制原页z，以保证父子进程互不干扰。
内核只为新生成的子进程创建虚拟空间结构，它们复制自父进程的虚拟空间结构，但是不为这些段分配物理内存，它们共享父进程的物理空间；当父进程或子进程中有更改相应段的行为发生时，再为子进程相应段分配物理空间。

执行系统调用

fork、vfork和clone系统调用的入口点分别是sys_fork、sys_vfork和sys_clone函数。其定义依赖于具体的体系结构，因为在用户空间和内核空间之间传递参数的方法因体系结构而异。

do_fork实现

所有3个fork机制最终都调用kernel/fork.c中的do_fork（一个体系结构无关的函数），其代码流程如图所示。
在这里插入图片描述

/*
 * Common back end of the fork/vfork/clone system calls: creates a new task
 * with copy_process() and wakes it up.
 *
 * clone_flags:   CLONE_* flags; the CSIGNAL bits are compared against SIGCHLD
 *                to decide which ptrace event (fork vs. clone) is reported.
 * stack_start,
 * stack_size:    passed through unchanged to copy_process().
 * parent_tidptr: user pointer that receives the child's id when
 *                CLONE_PARENT_SETTID is set.
 * child_tidptr:  passed through unchanged to copy_process().
 *
 * Returns the child's id as computed by pid_vnr() on success, or the error
 * code carried by the pointer returned from copy_process() on failure.
 * With CLONE_VFORK the caller waits in wait_for_vfork_done() before returning.
 */
long do_fork(unsigned long clone_flags,
	      unsigned long stack_start,
	      unsigned long stack_size,
	      int __user *parent_tidptr,
	      int __user *child_tidptr)
{//create a child task, wake it so that it is queued for scheduling, then return to the caller
	struct task_struct *p;
	int trace = 0;
	long nr;

/*
	 * Determine whether and which event to report to ptracer.  When
	 * called from kernel_thread or CLONE_UNTRACED is explicitly
	 * requested, no event is reported; otherwise, report if the event
	 * for the type of forking is enabled.
	 */
	if (!(clone_flags & CLONE_UNTRACED)) {
		if (clone_flags & CLONE_VFORK)
			trace = PTRACE_EVENT_VFORK;
		else if ((clone_flags & CSIGNAL) != SIGCHLD)
			trace = PTRACE_EVENT_CLONE;
		else
			trace = PTRACE_EVENT_FORK;

		if (likely(!ptrace_event_enabled(current, trace)))
			trace = 0;
	}

	p = copy_process(clone_flags, stack_start, stack_size,
			 child_tidptr, NULL, trace);//copy the parent's resources into the child; this does the actual work of creating the child
	/*
	 * Do this prior waking up the new thread - the thread pointer
	 * might get invalid after that point, if the thread exits quickly.
	 */
	if (!IS_ERR(p)) {
		struct completion vfork;
		struct pid *pid;

		trace_sched_process_fork(current, p);

		/* take a reference on the child's pid; dropped by put_pid() below */
		pid = get_task_pid(p, PIDTYPE_PID);
		nr = pid_vnr(pid);

		if (clone_flags & CLONE_PARENT_SETTID)
			put_user(nr, parent_tidptr);

		/* the completion must be set up before the child is woken */
		if (clone_flags & CLONE_VFORK) {
			p->vfork_done = &vfork;
			init_completion(&vfork);
			get_task_struct(p);
		}

		wake_up_new_task(p);//add the child to a CPU run queue

		/* forking complete and child started to run, tell ptracer */
		if (unlikely(trace))
			ptrace_event_pid(trace, pid);

		if (clone_flags & CLONE_VFORK) {
			if (!wait_for_vfork_done(p, &vfork))
				ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
		}

		put_pid(pid);
	} else {
		/* copy_process() failed: return its error code */
		nr = PTR_ERR(p);
	}
	return nr;
}

内核线程

内核线程是直接由内核本身启动的进程。内核线程实际上是将内核函数委托给独立的进程，与系统中其他进程“并行”执行（实际上，也并行于内核自身的执行）。内核线程经常称之为（内核）守护进程。它们用于执行下列任务。

周期性地将修改的内存页与页来源块设备同步（例如，使用mmap的文件映射）。
如果内存页很少使用，则写入交换区。
管理延时动作（deferred action）。
实现文件系统的事务日志。

退出线程

进程必须用exit系统调用终止。这使得内核有机会将该进
程使用的资源释放回系统。见kernel/exit.c----->do_exit。简而言之，
该函数的实现就是将各个引用计数器减1，如果引用计数器归0而
没有进程再使用对应的结构，那么将相应的内存区域返还给内存
管理模块。

/*
 * Terminate the current task: release the resources it holds (address space,
 * IPC state, files, filesystem info, namespaces, ...), notify interested
 * parties, mark the task TASK_DEAD and call schedule() for the last time.
 * This function does not return.
 *
 * code: exit code; stored in tsk->exit_code and passed to the ptrace exit
 *       event and to process accounting.
 */
void do_exit(long code)
{
	struct task_struct *tsk = current;
	int group_dead;
	TASKS_RCU(int tasks_rcu_i);

	profile_task_exit(tsk);//presumably runs the handlers registered on the task_exit_nb notifier chain - TODO confirm

	WARN_ON(blk_needs_flush_plug(tsk));//warn if the task still has a block plug that needs flushing
//i.e. tsk->plug should be NULL, or the queue it points to should be empty ("plug" is the stack plugging field)
	if (unlikely(in_interrupt()))//do_exit must not run in interrupt context, and the task with PID 0 must never exit
		panic("Aiee, killing interrupt handler!");
	if (unlikely(!tsk->pid))
		panic("Attempted to kill the idle task!");

	/*
	 * If do_exit is called because this processes oopsed, it's possible
	 * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
	 * continuing. Amongst other possible reasons, this is to prevent
	 * mm_release()->clear_child_tid() from writing to a user-controlled
	 * kernel address.
	 */
	set_fs(USER_DS);//reset the address limit to the user-space limit

	ptrace_event(PTRACE_EVENT_EXIT, code);

	validate_creds_for_do_exit(tsk);

	/*
	 * We're taking recursive faults here in do_exit. Safest is to just
	 * leave this task alone and wait for reboot.
	 */
	 //check whether PF_EXITING is already set; on the normal path it is set further down
	 /* PF_EXITING in tsk->flags means the task is already in the middle of exiting */
	if (unlikely(tsk->flags & PF_EXITING)) {/* PF_EXITING already set: do_exit was re-entered (recursive fault) */
		pr_alert("Fixing recursive fault but reboot is needed!\n");
		/*
		 * We can do this unlocked here. The futex code uses
		 * this flag just to verify whether the pi state
		 * cleanup has been done or not. In the worst case it
		 * loops once more. We pretend that the cleanup was
		 * done as there is no way to return. Either the
		 * OWNER_DIED bit is set by now or we push the blocked
		 * task into the wait for ever nirwana as well.
		 */
		tsk->flags |= PF_EXITPIDONE;/* pretend the PI state cleanup has been done */
		set_current_state(TASK_UNINTERRUPTIBLE);/* put the task into the uninterruptible wait state */
		schedule();/* let other tasks run */
	}
//normal path: PF_EXITING was not set yet, exit_signals() sets it
	exit_signals(tsk);  /* sets PF_EXITING */
	/*
	 * tsk->flags are checked in the futex code to protect against
	 * an exiting task cleaning up the robust pi futexes.
	 */
	smp_mb();/* memory barrier: accesses before it are ordered before accesses after it */
	raw_spin_unlock_wait(&tsk->pi_lock); /* wait until tsk->pi_lock is released; the lock itself is not taken */

	if (unlikely(in_atomic()))
		pr_info("note: %s[%d] exited with preempt_count %d\n",
			current->comm, task_pid_nr(current),
			preempt_count());

	acct_update_integrals(tsk);//presumably updates the task's memory usage accounting - TODO confirm
	/* sync mm's RSS info before statistics gathering */
	if (tsk->mm)//sync the task's RSS counters into its mm
		sync_mm_rss(tsk->mm);
	group_dead = atomic_dec_and_test(&tsk->signal->live);//true when signal->live drops to zero, i.e. this was the last live thread of the group
	if (group_dead) {
		/* last thread of the group: cancel the group's timers */
		hrtimer_cancel(&tsk->signal->real_timer);
		exit_itimers(tsk->signal);
		if (tsk->mm)
			setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
	}
	acct_collect(code, group_dead);//collect process accounting information
	if (group_dead)//auditing
		tty_audit_exit();//presumably records the pending TTY audit data - TODO confirm
	audit_free(tsk);//presumably frees the task's struct audit_context - TODO confirm

	tsk->exit_code = code;
	taskstats_exit(tsk, group_dead);
//release the resources held by the task
	exit_mm(tsk);/*  release the address space:
    give up the mm used by the task; presumably the mm is freed if no other task uses it.
     */

	if (group_dead)//write out process accounting information
		acct_process();
	trace_sched_process_exit(tsk);

	exit_sem(tsk);/*  presumably releases the task's SysV semaphore state  */
	exit_shm(tsk);/* presumably releases the task's SysV shared memory state  */
	exit_files(tsk);/*  release the open files   */
	exit_fs(tsk); /*  release the filesystem info (working directory etc.) */
	if (group_dead)//detach from the controlling terminal
		disassociate_ctty(1);
	exit_task_namespaces(tsk);//release the namespaces
	exit_task_work(tsk);
	exit_thread();//presumably releases the architecture-specific thread state (thread_struct) - TODO confirm
/* NOTE(review): on some architectures this presumably runs the thread_notify_head notifier chain - confirm */
	/*
	 * Flush inherited counters to the parent - before the parent
	 * gets woken up by child-exit notifications.
	 *
	 * because of cgroup mode, must be called before cgroup_exit()
	 */
	perf_event_exit_task(tsk);//release perf event (Performance Event) resources

	cgroup_exit(tsk);//detach the task from its cgroups

	/*
	 * FIXME: do that only when needed, using sched_exit tracepoint
	 */
	flush_ptrace_hw_breakpoint(tsk);//remove the ptrace hardware breakpoints

	TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu));
	exit_notify(tsk, group_dead);//presumably reparents the children and notifies the parent - TODO confirm
	proc_exit_connector(tsk);//process events connector (reports fork, exec, exit and UID/GID changes of tasks)
#ifdef CONFIG_NUMA//NUMA only: drop the reference on the task's mempolicy (freed when its reference count reaches 0)
	task_lock(tsk);
	mpol_put(tsk->mempolicy);
	tsk->mempolicy = NULL;
	task_unlock(tsk);
#endif
#ifdef CONFIG_FUTEX
	/* free the cached struct futex_pi_state, if any */
	if (unlikely(current->pi_state_cache))
		kfree(current->pi_state_cache);
#endif
	/*
	 * Make sure we are holding no locks:
	 */
	debug_check_no_locks_held();
	/*
	 * We can do this unlocked here. The futex code uses this flag
	 * just to verify whether the pi state cleanup has been done
	 * or not. In the worst case it loops once more.
	 */
	tsk->flags |= PF_EXITPIDONE;

	if (tsk->io_context)//release the task's I/O context
		exit_io_context(tsk);

	if (tsk->splice_pipe)//release the pipe cached in the descriptor's splice_pipe field
		free_pipe_info(tsk->splice_pipe);

	if (tsk->task_frag.page)
		put_page(tsk->task_frag.page);

	validate_creds_for_do_exit(tsk);

	check_stack_usage();//check how much of the task's kernel stack was left unused
	preempt_disable();
	if (tsk->nr_dirtied)
		__this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
	exit_rcu();
	TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i));

	/*
	 * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
	 * when the following two conditions become true.
	 *   - There is race condition of mmap_sem (It is acquired by
	 *     exit_mm()), and
	 *   - SMI occurs before setting TASK_RUNINNG.
	 *     (or hypervisor of virtual machine switches to other guest)
	 *  As a result, we may become TASK_RUNNING after becoming TASK_DEAD
	 *
	 * To avoid it, we have to wait for releasing tsk->pi_lock which
	 * is held by try_to_wake_up()
	 */
	smp_mb();
	raw_spin_unlock_wait(&tsk->pi_lock);

	/* causes final put_task_struct in finish_task_switch(). */
	tsk->state = TASK_DEAD;//mark the task as dead
	tsk->flags |= PF_NOFREEZE;/* the schedule() below is the last one: a dead task is never scheduled back in, which is how do_exit never returns */	/* tell freezer to ignore us */
	schedule();
	BUG();
	/* Avoid "noreturn function does return".  */
	for (;;)
		cpu_relax();	/* For when BUG is null */
}