Process

A process generally has the following elements:

1. A body of program code to execute; this code need not be exclusive to the process and may be shared with other processes.

2. A dedicated system (kernel-mode) stack.

3. A task_struct data structure in the kernel, i.e. the process control block. Only with this data structure can the process become a basic scheduling unit and receive scheduling from the kernel; the structure also records the various resources the process occupies.

4. Independent storage, i.e. a private user space: in addition to the system stack, the process has its own user space. Note that system space itself cannot be private: no process can directly (without going through a system call) modify the contents of system space, apart from its own system-space stack.

If it has the first two elements but lacks the fourth, it is called a thread.
If it has no user space at all, it is called a kernel thread.
If it shares user space with others, it is called a user thread.
Take care not to confuse these threads with the "threads" that some systems implement in user space within a single process; those threads obviously do not own an independent, dedicated system stack, and they are not scheduling units directly scheduled by the kernel. Since the Linux kernel already provides thread support, there is generally no need to implement threads inside a process in user space.

A thread also has a pid and also has a task_struct structure; in Linux, "process" and "task" mean the same thing. Every process has a task_struct structure, and its number is the pid; the function that wakes up a process is named wake_up_process().

Besides owning the task_struct process descriptor and a system stack, a process needs additional resources. Owning independent storage means owning user space, so the process must also have an mm_struct data structure for memory management, the subordinate vm_area_struct structures, and the corresponding page directory and page tables.



On the i386, Intel added another kind of segment: the task state segment (TSS).

Although it is a segment just like the code and data segments, it is really a control block of (at least) 104 bytes.

It records important information about the task:

1. The contents of the task's general-purpose registers at the moment of the task switch (i.e. at the switch point);

2. The contents of the task's segment registers (ES, CS, SS, DS, FS, GS) at the moment of the switch;

3. The contents of the task's EFLAGS register at the moment of the switch;

4. The contents of the task's instruction pointer register EIP at the moment of the switch;

5. The segment selector of the previous task's TSS. When the current task executes an IRET instruction, control returns to the task designated by this selector (i.e. by the TSS it refers to); the return address itself is determined by the stack;

6. The task's LDT segment selector, which points to the task's LDT;

7. The contents of control register CR3, which points to the task's page directory;

8. Three stack pointers, for when the task runs at privilege level 0, 1, or 2, comprising the stack segment registers SSx and the ESPx contents (x = 0, 1, 2). The CPU has only one SS and one ESP register; when the CPU enters a new privilege level, it automatically loads the corresponding SS and ESP from the current task's TSS, thereby switching stacks.

In Linux, the TSS does not belong to any particular process; it is a global, shared resource: each CPU has exactly one TSS.

The definition of the TSS:

/*
 * Note that the .io_bitmap member must be extra-big. This is because
 * the CPU will access an additional byte beyond the end of the IO
 * permission bitmap. The extra byte must be all 1 bits, and must
 * be within the limit.
 */
#define INIT_TSS {							\
	.esp0		= sizeof(init_stack) + (long)&init_stack,	\
	.ss0		= __KERNEL_DS,					\
	.ss1		= __KERNEL_CS,					\
	.ldt		= GDT_ENTRY_LDT,				\
	.io_bitmap_base	= INVALID_IO_BITMAP_OFFSET,			\
	.io_bitmap	= { [ 0 ... IO_BITMAP_LONGS] = ~0 },		\
}
This sets ss0 of the first process in the system to the kernel data segment (__KERNEL_DS) and points esp0 at the top of init_stack.

init_stack is defined as follows:

#define init_stack		(init_thread_union.stack)
union thread_union init_thread_union = { INIT_THREAD_INFO(init_task) };
#define INIT_THREAD_INFO(tsk)			\
{						\
	.task		= &tsk,			\
	.exec_domain	= &default_exec_domain,	\
	.flags		= 0,			\
	.cpu		= 0,			\
	.preempt_count	= 1,			\
	.addr_limit	= KERNEL_DS,		\
	.restart_block = {			\
		.fn = do_no_restart_syscall,	\
	},					\
}
union thread_union {
	struct thread_info thread_info;
	unsigned long stack[THREAD_SIZE/sizeof(long)];	/* for a 4 KB stack this array holds 1024 longs */
};
struct thread_info {
	struct task_struct	*task;		/* main task structure */
	struct exec_domain	*exec_domain;	/* execution domain */
	__s32			preempt_count; /* 0 => preemptable, <0 => BUG */
	__u32 cpu; /* should always be 0 on m68k */
	struct restart_block    restart_block;

	__u8			supervisor_stack[0];
};
<span style="font-size:18px;">#ifdef CONFIG_4KSTACKS
#define THREAD_SIZE            (4096)
#else
#define THREAD_SIZE		(8192)
#endif
union thread_union init_thread_union 
	__attribute__((__section__(".data.init_task"))) =
		{ INIT_THREAD_INFO(init_task) };

/*
 * Initial task structure.
 *
 * All other task structs will be allocated on slabs in fork.c
 */
struct task_struct init_task = INIT_TASK(init_task);

#define init_thread_info	(init_thread_union.thread_info)
#define init_stack (init_thread_union.stack)

/*
 *  INIT_TASK is used to set up the first task table, touch at
 * your own risk!. Base=0, limit=0x1fffff (=2MB)
 */
#define  INIT_TASK(tsk)	\
{									\
	.state		= 0,						\
	.thread_info	= &init_thread_info,				\
	.usage		= ATOMIC_INIT(2),				\
	.flags		= 0,						\
	.lock_depth	= -1,						\
	.prio		= MAX_PRIO-20,					\
	.static_prio	= MAX_PRIO-20,					\
	.policy		= SCHED_NORMAL,					\
	.cpus_allowed	= CPU_MASK_ALL,					\
	.mm		= NULL,						\
	.active_mm	= &init_mm,					\
	.run_list	= LIST_HEAD_INIT(tsk.run_list),			\
	.time_slice	= HZ,						\
	.tasks		= LIST_HEAD_INIT(tsk.tasks),			\
	.ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children),		\
	.ptrace_list	= LIST_HEAD_INIT(tsk.ptrace_list),		\
	.real_parent	= &tsk,						\
	.parent		= &tsk,						\
	.children	= LIST_HEAD_INIT(tsk.children),			\
	.sibling	= LIST_HEAD_INIT(tsk.sibling),			\
	.group_leader	= &tsk,						\
	.real_timer	= {						\
		.function	= it_real_fn				\
	},								\
	.group_info	= &init_groups,					\
	.cap_effective	= CAP_INIT_EFF_SET,				\
	.cap_inheritable = CAP_INIT_INH_SET,				\
	.cap_permitted	= CAP_FULL_SET,					\
	.keep_capabilities = 0,						\
	.user		= INIT_USER,					\
	.comm		= "swapper",					\
	.thread		= INIT_THREAD,					\
	.fs		= &init_fs,					\
	.files		= &init_files,					\
	.signal		= &init_signals,				\
	.sighand	= &init_sighand,				\
	.pending	= {						\
		.list = LIST_HEAD_INIT(tsk.pending.list),		\
		.signal = {{0}}},					\
	.blocked	= {{0}},					\
	.alloc_lock	= SPIN_LOCK_UNLOCKED,				\
	.proc_lock	= SPIN_LOCK_UNLOCKED,				\
	.switch_lock	= SPIN_LOCK_UNLOCKED,				\
	.journal_info	= NULL,						\
}


The reference to INIT_TSS is made in init_task.c:

DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_maxaligned_in_smp = INIT_TSS;

This per-CPU definition effectively gives one tss_struct per CPU, i.e. conceptually a static struct tss_struct init_tss[NR_CPUS] array.

struct tss_struct {
	unsigned short	back_link,__blh;
	unsigned long	esp0;
	unsigned short	ss0,__ss0h;
	unsigned long	esp1;
	unsigned short	ss1,__ss1h;	/* ss1 is used to cache MSR_IA32_SYSENTER_CS */
	unsigned long	esp2;
	unsigned short	ss2,__ss2h;
	unsigned long	__cr3;
	unsigned long	eip;
	unsigned long	eflags;
	unsigned long	eax,ecx,edx,ebx;
	unsigned long	esp;
	unsigned long	ebp;
	unsigned long	esi;
	unsigned long	edi;
	unsigned short	es, __esh;
	unsigned short	cs, __csh;
	unsigned short	ss, __ssh;
	unsigned short	ds, __dsh;
	unsigned short	fs, __fsh;
	unsigned short	gs, __gsh;
	unsigned short	ldt, __ldth;
	unsigned short	trace, io_bitmap_base;
	/*
	 * The extra 1 is there because the CPU will access an
	 * additional byte beyond the end of the IO permission
	 * bitmap. The extra byte must be all 1 bits, and must
	 * be within the limit.
	 */
	unsigned long	io_bitmap[IO_BITMAP_LONGS + 1];
	/*
	 * Cache the current maximum and the last task that used the bitmap:
	 */
	unsigned long io_bitmap_max;
	struct thread_struct *io_bitmap_owner;
	/*
	 * pads the TSS to be cacheline-aligned (size is 0x100)
	 */
	unsigned long __cacheline_filler[35];
	/*
	 * .. and then another 0x100 bytes for emergency kernel stack
	 */
	unsigned long stack[64];
} __attribute__((packed));

The task_struct data structure:

struct task_struct {
	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
	struct thread_info *thread_info;
	atomic_t usage;
	unsigned long flags;	/* per process flags, defined below */
	unsigned long ptrace;

	int lock_depth;		/* Lock depth */

	int prio, static_prio;
	struct list_head run_list;
	prio_array_t *array;

	unsigned long sleep_avg;
	unsigned long long timestamp, last_ran;
	int activated;

	unsigned long policy;
	cpumask_t cpus_allowed;
	unsigned int time_slice, first_time_slice;

#ifdef CONFIG_SCHEDSTATS
	struct sched_info sched_info;
#endif

	struct list_head tasks;
	/*
	 * ptrace_list/ptrace_children forms the list of my children
	 * that were stolen by a ptracer.
	 */
	struct list_head ptrace_children;
	struct list_head ptrace_list;

	struct mm_struct *mm, *active_mm;

/* task state */
	struct linux_binfmt *binfmt;
	long exit_state;
	int exit_code, exit_signal;
	int pdeath_signal;  /*  The signal sent when the parent dies  */
	/* ??? */
	unsigned long personality;
	unsigned did_exec:1;
	<span style="color:#ff0000;">pid_t pid;
	pid_t tgid;</span>
	/* 
	 * pointers to (original) parent process, youngest child, younger sibling,
	 * older sibling, respectively.  (p->father can be replaced with 
	 * p->parent->pid)
	 */
	struct task_struct *real_parent; /* real parent process (when being debugged) */
	struct task_struct *parent;	/* parent process */
	/*
	 * children/sibling forms the list of my children plus the
	 * tasks I'm ptracing.
	 */
	struct list_head children;	/* list of my children */
	struct list_head sibling;	/* linkage in my parent's children list */
	struct task_struct *group_leader;	/* threadgroup leader */

	/* PID/PID hash table linkage. Four hash tables are introduced for fast pid lookup from the process list. */
	struct pid pids[PIDTYPE_MAX];

	struct completion *vfork_done;		/* for vfork() */
	int __user *set_child_tid;		/* CLONE_CHILD_SETTID */
	int __user *clear_child_tid;		/* CLONE_CHILD_CLEARTID */

	unsigned long rt_priority;
	unsigned long it_real_value, it_real_incr;
	cputime_t it_virt_value, it_virt_incr;
	cputime_t it_prof_value, it_prof_incr;
	struct timer_list real_timer;
	cputime_t utime, stime;
	unsigned long nvcsw, nivcsw; /* context switch counts */
	struct timespec start_time;
/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
	unsigned long min_flt, maj_flt;
/* process credentials */
	uid_t uid,euid,suid,fsuid;
	gid_t gid,egid,sgid,fsgid;
	struct group_info *group_info;
	kernel_cap_t   cap_effective, cap_inheritable, cap_permitted;
	unsigned keep_capabilities:1;
	struct user_struct *user;
#ifdef CONFIG_KEYS
	struct key *session_keyring;	/* keyring inherited over fork */
	struct key *process_keyring;	/* keyring private to this process (CLONE_THREAD) */
	struct key *thread_keyring;	/* keyring private to this thread */
#endif
	int oomkilladj; /* OOM kill score adjustment (bit shift). */
	char comm[TASK_COMM_LEN];
/* file system info */
	int link_count, total_link_count;
/* ipc stuff */
	struct sysv_sem sysvsem;
/* CPU-specific state of this task */
	struct thread_struct thread;
/* filesystem information */
	struct fs_struct *fs;
/* open file information */
	struct files_struct *files;
/* namespace */
	struct namespace *namespace;
/* signal handlers */
	struct signal_struct *signal;
	struct sighand_struct *sighand;

	sigset_t blocked, real_blocked;
	struct sigpending pending;

	unsigned long sas_ss_sp;
	size_t sas_ss_size;
	int (*notifier)(void *priv);
	void *notifier_data;
	sigset_t *notifier_mask;
	
	void *security;
	struct audit_context *audit_context;

/* Thread group tracking */
   	u32 parent_exec_id;
   	u32 self_exec_id;
/* Protection of (de-)allocation: mm, files, fs, tty, keyrings */
	spinlock_t alloc_lock;
/* Protection of proc_dentry: nesting proc_lock, dcache_lock, write_lock_irq(&tasklist_lock); */
	spinlock_t proc_lock;
/* context-switch lock */
	spinlock_t switch_lock;

/* journalling filesystem info */
	void *journal_info;

/* VM state */
	struct reclaim_state *reclaim_state;

	struct dentry *proc_dentry;
	struct backing_dev_info *backing_dev_info;

	struct io_context *io_context;

	unsigned long ptrace_message;
	siginfo_t *last_siginfo; /* For ptrace use.  */
/*
 * current io wait handle: wait queue entry to use for io waits
 * If this thread is processing aio, this points at the waitqueue
 * inside the currently handled kiocb. It may be NULL (i.e. default
 * to a stack based synchronous wait) if its doing sync IO.
 */
	wait_queue_t *io_wait;
/* i/o counters(bytes read/written, #syscalls */
	u64 rchar, wchar, syscr, syscw;
#if defined(CONFIG_BSD_PROCESS_ACCT)
	u64 acct_rss_mem1;	/* accumulated rss usage */
	u64 acct_vm_mem1;	/* accumulated virtual memory usage */
	clock_t acct_stimexpd;	/* clock_t-converted stime since last update */
#endif
#ifdef CONFIG_NUMA
  	struct mempolicy *mempolicy;
	short il_next;
#endif
};

Every process has a task_struct data structure and a piece of storage used as its system-space stack; the two are contiguous in physical memory. When the kernel allocates a task_struct for a process, it allocates two consecutive physical pages (8 KB): the bottom of this area holds the process's task_struct (the thread_info in 2.6), and the space above it is used as the process's system-space stack.



System space: the system stack, unlike user space, cannot grow dynamically at run time; its size is fixed statically. Therefore interrupt service routines, kernel softirq handlers, and device drivers should not nest too deeply, nor use overly large local variables.

The kernel uses the alloc_thread_info and free_thread_info macros to allocate and free the memory area that stores the thread_info structure and the kernel stack:

#define alloc_thread_info(tsk)				\
({							\
	struct thread_info *ret;			\
							\
	ret = kmalloc(THREAD_SIZE, GFP_KERNEL);		\
	if (ret)					\
		memset(ret, 0, THREAD_SIZE);		\
	ret;						\
})
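The matching release side is essentially just a kfree() in 2.6 kernels whose alloc_thread_info uses kmalloc(), as above (shown here as a sketch):

#define free_thread_info(info)	kfree(info)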


When a process runs in kernel space, it often needs to access its own task_struct structure;

the macro current is defined for this purpose:

static inline struct task_struct * get_current(void)
{
	return current_thread_info()->task;
}
#define current get_current()

/* how to get the thread information struct from C */
static inline struct thread_info *current_thread_info(void)
{
	struct thread_info *ti;
	__asm__("andl %%esp,%0; ":"=r" (ti) : "0" (~(THREAD_SIZE - 1)));
	return ti;
}

In this function, the contents of the stack pointer register ESP of the current process are ANDed with ~8191UL (THREAD_SIZE = 8192), i.e. the low 13 bits are masked off; this yields the starting address of the current process's thread_info structure, whose task field then leads to the task_struct.
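A quick worked example of the mask arithmetic (a user-space sketch, purely illustrative; the ESP value is hypothetical):

#include <stdio.h>

#define THREAD_SIZE 8192UL

int main(void)
{
	unsigned long esp = 0xc0123f40UL;		/* hypothetical kernel-stack pointer */
	unsigned long ti = esp & ~(THREAD_SIZE - 1);	/* mask off the low 13 bits */

	printf("thread_info at %#lx\n", ti);		/* prints 0xc0122000 */
	return 0;
}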


The volatile long state field in task_struct indicates the current run state of the process:

#define TASK_RUNNING		0
#define TASK_INTERRUPTIBLE	1
#define TASK_UNINTERRUPTIBLE	2
#define TASK_STOPPED		4
#define TASK_TRACED		8
#define EXIT_ZOMBIE		16
#define EXIT_DEAD		32

TASK_INTERRUPTIBLE and TASK_UNINTERRUPTIBLE both indicate that the process is sleeping, but TASK_UNINTERRUPTIBLE means deep sleep, undisturbed by signals (signals are also called soft interrupts), while TASK_INTERRUPTIBLE means the process can be woken up by a signal. The functions sleep_on() and wake_up() are used for deep sleep, while interruptible_sleep_on() and wake_up_interruptible() are used for shallow sleep.

Deep sleep is generally used only in critical sections and other key code. The concept of a signal is essentially the same as that of an interrupt, so "interruptible" here refers to such soft interrupts.

TASK_RUNNING means the process can be scheduled to become the current process; it does not mean the process is executing, nor that it is the current process (current). While a process is TASK_RUNNING, the kernel links its task_struct, via the run_list list head, into a run queue.

The zombie state indicates that the process has exited but its "record" has not yet been deregistered.


Zombie state (EXIT_ZOMBIE): the process's execution has been terminated, but the parent process has not yet issued a wait4() or waitpid() system call.

Reaped state (EXIT_DEAD): the final state, entered when the parent has just issued one of the wait() system calls on the process; the process's resources are then released.

TASK_STOPPED is used mainly for debugging: after receiving a SIGSTOP signal, a process changes its state to TASK_STOPPED; after receiving SIGCONT it can run again.
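From user space the TASK_STOPPED transitions can be driven directly (a minimal sketch; the sleeps are only there to order the events):

#include <signal.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t pid = fork();

	if (pid == 0)
		for (;;)
			pause();		/* child: just wait for signals */

	sleep(1);
	kill(pid, SIGSTOP);			/* child enters TASK_STOPPED */
	printf("stopped %d\n", pid);
	sleep(1);
	kill(pid, SIGCONT);			/* child becomes runnable again */
	printf("continued %d\n", pid);
	kill(pid, SIGKILL);
	waitpid(pid, NULL, 0);			/* reap the child */
	return 0;
}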

struct sigpending pending indicates that the process has received "signals" that are not yet handled. Related to this flag are the signal queue (the sigqueue, sigqueue_tail, and sig pointers) as well as the sigmask_lock, signal, and blocked members.

/* signal handlers */
	struct signal_struct *signal;
	struct sighand_struct *sighand;
	struct sigpending pending;
/*
 * NOTE! "signal_struct" does not have it's own
 * locking, because a shared signal_struct always
 * implies a shared sighand_struct, so locking
 * sighand_struct is always a proper superset of
 * the locking of signal_struct.
 */
struct signal_struct {
	atomic_t		count;
	atomic_t		live;

	wait_queue_head_t	wait_chldexit;	/* for wait4() */

	/* current thread group signal load-balancing target: */
	task_t			*curr_target;

	/* shared signal handling: */
	struct sigpending	shared_pending;

	/* thread group exit support */
	int			group_exit_code;
	/* overloaded:
	 * - notify group_exit_task when ->count is equal to notify_count
	 * - everyone except group_exit_task is stopped during signal delivery
	 *   of fatal signals, group_exit_task processes the signal.
	 */
	struct task_struct	*group_exit_task;
	int			notify_count;

	/* thread group stop support, overloads group_exit_code too */
	int			group_stop_count;
	unsigned int		flags; /* see SIGNAL_* flags below */

	/* POSIX.1b Interval Timers */
	struct list_head posix_timers;

	/* job control IDs */
	pid_t pgrp;
	pid_t tty_old_pgrp;
	pid_t session;
	/* boolean value for session group leader */
	int leader;

	struct tty_struct *tty; /* NULL if no tty */

	/*
	 * Cumulative resource counters for dead threads in the group,
	 * and for reaped dead child processes forked by this group.
	 * Live threads maintain their own counters and add to these
	 * in __exit_signal, except for the group leader.
	 */
	cputime_t utime, stime, cutime, cstime;
	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
	unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;

	/*
	 * We don't bother to synchronize most readers of this at all,
	 * because there is no reader checking a limit that actually needs
	 * to get both rlim_cur and rlim_max atomically, and either one
	 * alone is a single word that can safely be read normally.
	 * getrlimit/setrlimit use task_lock(current->group_leader) to
	 * protect this instead of the siglock, because they really
	 * have no need to disable irqs.
	 */
	struct rlimit rlim[RLIM_NLIMITS];
};
struct sighand_struct {
	atomic_t		count;
	struct k_sigaction	action[_NSIG];
	spinlock_t		siglock;
};

struct sigpending {
	struct list_head list;
	sigset_t signal;
};

binfmt -- the executable format of the application, e.g. a.out or ELF; see the exec() system call.

exit_code / exit_signal / pdeath_signal: see the exit() and wait4() system calls.

pid -- the process number.

uid_t uid, euid, suid, fsuid; gid_t gid, egid, sgid, fsgid; these relate mainly to file access permissions.

struct user_struct *user; -- points to a user_struct structure.


Process resource limits

current->signal->rlim, a field of the process's signal descriptor, is an array of rlimit structures, one element per resource limit:

struct rlimit {
	unsigned long rlim_cur;
	unsigned long rlim_max;
};

The rlim_cur field is the current limit for the resource. For example:

current->signal->rlim[RLIMIT_CPU].rlim_cur is the current limit on the CPU time of the running process.

rlim_max is the maximum value the limit may be raised to.

Using the getrlimit() and setrlimit() system calls, rlim_cur can be raised up to (at most) rlim_max.
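For example, from user space (a minimal sketch raising the soft CPU-time limit to the hard limit):

#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
	struct rlimit rl;

	getrlimit(RLIMIT_CPU, &rl);
	printf("cpu: soft=%ld hard=%ld\n",
	       (long)rl.rlim_cur, (long)rl.rlim_max);

	rl.rlim_cur = rl.rlim_max;	/* rlim_cur may be raised at most to rlim_max */
	setrlimit(RLIMIT_CPU, &rl);
	return 0;
}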


The process list

Every task_struct contains a tasks field of type list_head; its prev and next pointers point to the previous and next task_struct elements.

The head of the process list is the init_task descriptor: the process descriptor of so-called process 0, the swapper process.

/*
 * Initial task structure.
 *
 * All other task structs will be allocated on slabs in fork.c
 */
struct task_struct init_task = INIT_TASK(init_task);

The tasks.prev field of init_task points to the tasks field of the process descriptor most recently inserted into the list.
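Walking the whole list is therefore the usual circular-list traversal; a kernel-context sketch (this is essentially what the 2.6 for_each_process macro expands to):

struct task_struct *p = &init_task;

do {
	p = list_entry(p->tasks.next, struct task_struct, tasks);
	printk("pid %d  comm %s\n", p->pid, p->comm);
} while (p != &init_task);	/* stop once we wrap back to process 0 */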

Relationships among processes: processes created by a program stand in a parent-child relation; when one process creates several child processes, the children are siblings of one another. Process 0 and process 1 are created by the kernel.

/* 
	 * pointers to (original) parent process, youngest child, younger sibling,
	 * older sibling, respectively.  (p->father can be replaced with 
	 * p->parent->pid)
	 */
	struct task_struct *real_parent; /* real parent process (when being debugged) */
	struct task_struct *parent;	/* parent process */
	/*
	 * children/sibling forms the list of my children plus the
	 * tasks I'm ptracing.
	 */
	struct list_head children;	/* list of my children */
	struct list_head sibling;	/* linkage in my parent's children list */
	struct task_struct *group_leader;	/* threadgroup leader */
real_parent points to the descriptor of the process that created p; if p's parent no longer exists, it points to process 1 (init).

parent points to p's current parent (when a child of such a process terminates, a signal must be sent to the parent); its value usually equals real_parent, but can occasionally differ, e.g. when another process issues a ptrace() system call to monitor p.

children is the head of the list whose elements are the children created by p.

sibling holds the pointers to the next and previous elements in the list of sibling processes, all of whom have p as their parent.
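So visiting every child of p means iterating p's children list through each child's sibling linkage; a kernel-context sketch:

struct task_struct *child;

list_for_each_entry(child, &p->children, sibling)	/* each child is linked in via its sibling field */
	printk("child pid %d\n", child->pid);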


The list of TASK_RUNNING processes:

Early Linux versions kept all runnable processes in a single list called the run queue; since keeping that list sorted by priority was too expensive,

the trick used to speed up the scheduler is to maintain multiple lists of runnable processes, one list per process priority.

Each task_struct descriptor contains a run_list field of type list_head. If the process's priority is k (in the range 0-139), the run_list field links the process into the list of runnable processes with priority k. On SMP systems, each CPU has its own run queue.

Fields of the prio_array_t data structure:

struct prio_array {
	unsigned int nr_active;			/* number of processes on the lists */
	unsigned long bitmap[BITMAP_SIZE];	/* priority bitmap: a bit is set if and only
						   if the list for that priority is non-empty */
	struct list_head queue[MAX_PRIO];	/* head nodes of the 140 priority queues */
};

/*
 * Adding/removing a task to/from a priority array:
 */
static void dequeue_task(struct task_struct *p, prio_array_t *array)
{
	array->nr_active--;
	list_del(&p->run_list);
	if (list_empty(array->queue + p->prio))
		__clear_bit(p->prio, array->bitmap);
}

Inserting a process descriptor into one of the run-queue lists:
static void enqueue_task(struct task_struct *p, prio_array_t *array)
{
	sched_info_queued(p);
	list_add_tail(&p->run_list, array->queue + p->prio);
	__set_bit(p->prio, array->bitmap);
	array->nr_active++;
	p->array = array;
}
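The two functions are used as a pair whenever a task must move to a different priority list; a hedged sketch of the idiom (the real scheduler does this inside functions such as set_user_nice(); new_prio here is just a hypothetical target priority):

prio_array_t *array = p->array;

dequeue_task(p, array);		/* clears the old bitmap bit if that list empties */
p->prio = new_prio;
enqueue_task(p, array);		/* links p into the new list and sets its bit */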

Processes, lightweight processes, and threads

Three mechanisms are provided when creating a process:

1. Copy-on-write allows parent and child to read the same physical pages. Whenever either of them tries to write a physical page, the kernel copies the page's contents into a new physical page and assigns the new page to the writing process.

2. Lightweight processes (created by clone()) allow parent and child to share many of the per-process kernel data structures.

3. A process created by the vfork() system call shares its parent's memory address space.

Linux provides three system calls: fork(), vfork(), and clone().

The traditional fork() system call is implemented in Linux via clone();

the do_fork() function handles the clone(), fork(), and vfork() system calls.

fork()->sys_fork()  clone()->sys_clone()   vfork()->sys_vfork()

clone() can create a thread, either a kernel thread or a user thread; when creating a user thread, the location of the child's user-space stack and the child's starting point of execution can be specified. It can also create a process, selectively copying the parent's resources, whereas fork() copies everything.

PIDs 0-299 are reserved for daemons; the remaining PIDs are assigned to ordinary processes.

/*
 * PID-map pages start out as NULL, they get allocated upon
 * first use and are never deallocated. This way a low pid_max
 * value does not cause lots of bitmaps to be allocated, but
 * the scheme scales to up to 4 million PIDs, runtime.
 */
The struct pidmap structure records whether each pid has already been handed out. The page member represents this: each char occupies one byte, i.e. 8 bits, so recording whether each of the 32768 process numbers has been allocated takes 32768/8 = 4096 bytes, which is why page occupies one 4096-byte page. In addition, nr_free records how many pids remain unallocated.
typedef struct pidmap {
	atomic_t nr_free;
	void *page;
} pidmap_t;
int pid_max_min = RESERVED_PIDS + 1;
int pid_max_max = PID_MAX_LIMIT;


#define PIDMAP_ENTRIES    ((PID_MAX_LIMIT + 8*PAGE_SIZE - 1)/PAGE_SIZE/8)
#define BITS_PER_PAGE         (PAGE_SIZE*8)
#define BITS_PER_PAGE_MASK    (BITS_PER_PAGE-1)
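The arithmetic behind these macros: a pid lands in bitmap page pid/BITS_PER_PAGE, at bit offset pid & BITS_PER_PAGE_MASK within that page. A tiny user-space sketch of the same computation (PAGE_SIZE assumed to be 4096):

#include <stdio.h>

#define PAGE_SIZE	4096
#define BITS_PER_PAGE	(PAGE_SIZE * 8)		/* 32768 pids per bitmap page */

int main(void)
{
	int pid = 300;

	/* pid 300 lives in page 0 at bit offset 300 */
	printf("page %d, offset %d\n",
	       pid / BITS_PER_PAGE, pid & (BITS_PER_PAGE - 1));
	return 0;
}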
int alloc_pidmap(void)
{
	int i, offset, max_scan, pid, last = last_pid;	/* last_pid is a global: the pid handed out last time */
	pidmap_t *map;

	pid = last + 1;
	if (pid >= pid_max)
		pid = RESERVED_PIDS;	/* 300: the first 300 pids are fixed, not allocatable here */
	offset = pid & BITS_PER_PAGE_MASK;	/* bit offset of pid within its bitmap page */
	map = &pidmap_array[pid/BITS_PER_PAGE];	/* &pidmap_array[i] is the descriptor of the i-th pid bitmap page */
	/* max_scan: how many bitmap pages are needed to cover pid_max pids */
	max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset;
	for (i = 0; i <= max_scan; ++i) {	/* scan each bitmap page */
		if (unlikely(!map->page)) {	/* this bitmap page is not yet allocated */
			unsigned long page = get_zeroed_page(GFP_KERNEL);	/* allocate a physical page */
			/*
			 * Free the page if someone raced with us
			 * installing it:
			 */
			spin_lock(&pidmap_lock);
			if (map->page)	/* on SMP, must re-check whether map->page is still empty */
				free_page(page);
			else
				map->page = (void *)page;
			spin_unlock(&pidmap_lock);
			if (unlikely(!map->page))	/* still no page: allocation failed, break and return -1 */
				break;
		}
		if (likely(atomic_read(&map->nr_free))) {	/* the page still has free pids */
			do {
				if (!test_and_set_bit(offset, map->page)) {	/* try to claim the pid at this offset */
					atomic_dec(&map->nr_free);	/* claimed: one fewer free pid */
					last_pid = pid;	/* update the global */
					return pid;
				}
				offset = find_next_offset(map, offset);	/* taken: look for the next free offset */
				pid = mk_pid(map, offset);	/* derive the pid from map and offset */
			/*
			 * find_next_offset() found a bit, the pid from it
			 * is in-bounds, and if we fell back to the last
			 * bitmap block and the final block was the same
			 * as the starting point, pid is before last_pid.
			 */
			} while (offset < BITS_PER_PAGE && pid < pid_max &&	/* stop past this page's last bit or past pid_max */
					(i != max_scan || pid < last ||		/* on the final pass, pids >= last were already checked */
					    !((last+1) & BITS_PER_PAGE_MASK)));	/* if last+1 was page-aligned, the last page needs two passes: after last, then before last */
		}
		if (map < &pidmap_array[(pid_max-1)/BITS_PER_PAGE]) {	/* not yet past the last bitmap page */
			++map;
			offset = 0;
		} else {
			map = &pidmap_array[0];
			offset = RESERVED_PIDS;
			if (unlikely(last == offset))
				break;
		}
		pid = mk_pid(map, offset);
	}
	return -1;
}
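The release path, called from do_fork()'s error branch below, is the mirror image; roughly, in 2.6:

fastcall void free_pidmap(int pid)
{
	pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE;
	int offset = pid & BITS_PER_PAGE_MASK;

	clear_bit(offset, map->page);	/* mark the pid as free again */
	atomic_inc(&map->nr_free);	/* one more free pid on this page */
}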
regs: pointer to the values of the general-purpose registers saved into the Kernel Mode stack when switching from User Mode to Kernel Mode.


long do_fork(unsigned long clone_flags,
	      unsigned long stack_start,
	      struct pt_regs *regs,
	      unsigned long stack_size,
	      int __user *parent_tidptr,
	      int __user *child_tidptr)
{
	struct task_struct *p;
	int trace = 0;
	long pid = alloc_pidmap();

	if (pid < 0)
		return -EAGAIN;
	if (unlikely(current->ptrace)) {	/* nonzero: another process is tracing the parent */
		trace = fork_traceflag (clone_flags);
		if (trace)
			clone_flags |= CLONE_PTRACE;
	}
	/*
	 * copy_process() duplicates the process descriptor; if all the needed
	 * resources are available, it returns the address of the newly
	 * created task_struct.
	 */
	p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid);
	/*
	 * Do this prior waking up the new thread - the thread pointer
	 * might get invalid after that point, if the thread exits quickly.
	 */
	if (!IS_ERR(p)) {
		struct completion vfork;

		if (clone_flags & CLONE_VFORK) {
			p->vfork_done = &vfork;
			init_completion(&vfork);
		}
		/* if the CLONE_STOPPED flag is set, or the child must be traced: */
		if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) {
			/*
			 * We'll start up with an immediate SIGSTOP.
			 */
			sigaddset(&p->pending.signal, SIGSTOP);	/* add a pending SIGSTOP to the child */
			set_tsk_thread_flag(p, TIF_SIGPENDING);
		}
               <span style="font-size:24px;">  </span><span style="font-size:18px;"> /*如果没有设置clone_stopped标志,这调用wake_up*/
		if (!(clone_flags & CLONE_STOPPED))
			wake_up_new_task(p, clone_flags);</span>
		else
			p->state = TASK_STOPPED;//如果clone_stopped标志设置,则把子进程设置为task_stopped状态

		if (unlikely (trace)) {
			/*
			 * The parent is being traced: store the child's PID in
			 * current->ptrace_message and call ptrace_notify(),
			 * which stops the current process and sends a SIGCHLD
			 * signal to its parent.
			 */
			current->ptrace_message = pid;
			ptrace_notify ((trace << 8) | SIGTRAP);
		}

		if (clone_flags & CLONE_VFORK) {
			wait_for_completion(&vfork);	/* suspend the parent on a wait queue until the child releases its address space (exits or execs a new program) */
			if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE))
				ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
		}
	} else {
		free_pidmap(pid);
		pid = PTR_ERR(p);
	}
	return pid;
}
/*
 * wake_up_new_task - wake up a newly created task for the first time.
 *
 * This function will do some initial scheduler statistics housekeeping
 * that must be done for every newly created context, then puts the task
 * on the runqueue and wakes it.
 */
void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
{
	unsigned long flags;
	int this_cpu, cpu;
	runqueue_t *rq, *this_rq;
	rq = task_rq_lock(p, &flags);

	cpu = task_cpu(p);		/* CPU of the newly created process */
	this_cpu = smp_processor_id();	/* number of the current CPU */

	BUG_ON(p->state != TASK_RUNNING);

	schedstat_inc(rq, wunt_cnt);
	/*
	 * We decrease the sleep average of forking parents
	 * and children as well, to keep max-interactive tasks
	 * from forking tasks that are max-interactive. The parent
	 * (current) is done further down, under its lock.
	 */
	p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
		CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);

	p->prio = effective_prio(p);

	if (likely(cpu == this_cpu)) {
		if (!(clone_flags & CLONE_VM)) {
			/*
			 * The VM isn't cloned, so we're in a good position to
			 * do child-runs-first in anticipation of an exec. This
			 * usually avoids a lot of COW overhead.
			 */
			if (unlikely(!current->array))
				__activate_task(p, rq);
			else {
				p->prio = current->prio;
				list_add_tail(&p->run_list, &current->run_list);
				p->array = current->array;
				p->array->nr_active++;
				rq->nr_running++;
			}
			set_need_resched();
		} else
			/* Run child last */
			__activate_task(p, rq);
		/*
		 * We skip the following code due to cpu == this_cpu
	 	 *
		 *   task_rq_unlock(rq, &flags);
		 *   this_rq = task_rq_lock(current, &flags);
		 */
		this_rq = rq;
	} else {
		this_rq = cpu_rq(this_cpu);

		/*
		 * Not the local CPU - must adjust timestamp. This should
		 * get optimised away in the !CONFIG_SMP case.
		 */
		p->timestamp = (p->timestamp - this_rq->timestamp_last_tick)
					+ rq->timestamp_last_tick;
		__activate_task(p, rq);	/* insert into the run queue */
		if (TASK_PREEMPTS_CURR(p, rq))
			resched_task(rq->curr);

		schedstat_inc(rq, wunt_moved);
		/*
		 * Parent and child are on different CPUs, now get the
		 * parent runqueue to update the parent's ->sleep_avg:
		 */
		task_rq_unlock(rq, &flags);
		this_rq = task_rq_lock(current, &flags);
	}
	current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
		PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
	task_rq_unlock(this_rq, &flags);
}

void ptrace_notify(int exit_code)
{
	siginfo_t info;

	BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);

	memset(&info, 0, sizeof info);
	info.si_signo = SIGTRAP;
	info.si_code = exit_code;
	info.si_pid = current->pid;
	info.si_uid = current->uid;

	/* Let the debugger run.  */
	spin_lock_irq(&current->sighand->siglock);
	ptrace_stop(exit_code, 0, &info);
	spin_unlock_irq(&current->sighand->siglock);
}
task_rq(p): get the address of the runqueue that process p resides on
--------------------------------------------
#define task_rq(p)      cpu_rq(task_cpu(p))
#define cpu_rq(cpu)     (&per_cpu(runqueues, (cpu)))


task_cpu(p): get the number of the CPU that process p is on
-------------------------------------------
static inline unsigned int task_cpu(const struct task_struct *p)
{
    return p->thread_info->cpu;
}

cpu_rq(cpu): get the address of the runqueue of the processor numbered cpu
--------------------------------------------
#define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset[cpu]))
#define RELOC_HIDE(ptr, off)					\
  ({ unsigned long __ptr;					\
    __asm__ ("" : "=g"(__ptr) : "0"(ptr));			\
    (typeof(ptr)) (__ptr + (off)); })

rq = task_rq_lock(p, &flags);

/*
 * task_rq_lock - lock the runqueue a given task resides on and disable
 * interrupts.  Note the ordering: we can safely lookup the task_rq without
 * explicitly disabling preemption.
 */
static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
	__acquires(rq->lock)
{
	struct runqueue *rq;

repeat_lock_task:
	local_irq_save(*flags);
	rq = task_rq(p);
	spin_lock(&rq->lock);
	if (unlikely(rq != task_rq(p))) {
		spin_unlock_irqrestore(&rq->lock, *flags);
		goto repeat_lock_task;
	}
	return rq;
}


Re-reading:

When a process forks, the child process is a copy of the parent's address space and executes the same code. The parent and child may share the pages containing the program code (text); they have separate copies of the data (stack and heap), so that changes by the child to a memory location are invisible to the parent (and vice versa).
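A quick user-space illustration of this separation (minimal sketch):

#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int x = 42;	/* lives in a data page that becomes copy-on-write after fork() */

int main(void)
{
	if (fork() == 0) {
		x = 100;	/* write: the kernel gives the child a private copy */
		_exit(0);
	}
	wait(NULL);
	printf("parent still sees x = %d\n", x);	/* prints 42 */
	return 0;
}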


Wait queues have several uses in the kernel, particularly for interrupt handling, process synchronization, and timing.

copy_thread() initializes the Kernel Mode stack of the child process with the values contained in the CPU registers when the clone() system call was issued (these values have been saved in the Kernel Mode stack of the parent). However, the function forces the value 0 into the field corresponding to the eax register (this is the child's return value of the fork() or clone() system call). The thread.esp field in the descriptor of the child process is initialized with the base address of the child's Kernel Mode stack, and the address of an assembly language function (ret_from_fork()) is stored in the thread.eip field. If the parent process makes use of an I/O Permission Bitmap, the child gets a copy of such bitmap. Finally, if the CLONE_SETTLS flag is set, the child gets the TLS segment specified by the User Mode data structure pointed to by the tls parameter of the clone() system call.





Process termination:

In Linux 2.6 there are two system calls that terminate a user-mode application:

1. The exit_group() system call terminates a full thread group, that is, a whole multithreaded application. The main kernel function implementing this system call is do_group_exit(); it is what the C library's exit() function should invoke.

2. The _exit() system call terminates a single process, regardless of any other process in the thread group of the victim. The main kernel function implementing this system call is do_exit().
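From user space the distinction can be seen by issuing the raw system calls (a sketch; in a multithreaded program SYS_exit would end only the calling thread, while SYS_exit_group, which glibc's _exit()/exit() use, ends the whole group):

#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	/* syscall(SYS_exit, 0);		would terminate only this thread */
	syscall(SYS_exit_group, 0);	/* terminates the whole thread group */
}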

The do_group_exit() function:

NORET_TYPE void
do_group_exit(int exit_code)
{
	struct signal_struct *sig = current->signal;

	BUG_ON(exit_code & 0x80); /* core dumps don't get here */

	if (signal_group_exit(sig))
		/*
		 * The SIGNAL_GROUP_EXIT flag is already set, which means the
		 * kernel has already started an exit procedure for this
		 * thread group: use the termination code stored there.
		 */
		exit_code = sig->group_exit_code;
	else if (!thread_group_empty(current)) {
		/*
		 * Otherwise, set the SIGNAL_GROUP_EXIT flag and store the
		 * termination code in current->signal->group_exit_code.
		 */
		struct sighand_struct *const sighand = current->sighand;
		spin_lock_irq(&sighand->siglock);
		if (signal_group_exit(sig))
			/* Another thread got here before we took the lock.  */
			exit_code = sig->group_exit_code;
		else {
			sig->group_exit_code = exit_code;
			sig->flags = SIGNAL_GROUP_EXIT;
			/*
			 * zap_other_threads() kills the other processes in the
			 * thread group of current: it scans the per-PID list
			 * in the PIDTYPE_TGID hash table corresponding to
			 * current->tgid and sends a SIGKILL signal to each
			 * process in the list other than current. As a result,
			 * all such processes will eventually execute do_exit()
			 * and thus be killed.
			 */
			zap_other_threads(current);
		}
		spin_unlock_irq(&sighand->siglock);
	}

	/*
	 * Finally invoke do_exit(), passing it the process termination code.
	 * As we'll see below, do_exit() kills the process and never returns.
	 */
	do_exit(exit_code);
	/* NOTREACHED */
}
/*
 * Tell a process that it has a new active signal..
 *
 * NOTE! we rely on the previous spin_lock to
 * lock interrupts for us! We can only be called with
 * "siglock" held, and the local interrupt must
 * have been disabled when that got acquired!
 *
 * No need to set need_resched since signal event passing
 * goes through ->blocked
 */
void signal_wake_up(struct task_struct *t, int resume)
{
	unsigned int mask;

	set_tsk_thread_flag(t, TIF_SIGPENDING);

	/*
	 * For SIGKILL, we want to wake it up in the stopped/traced/killable
	 * case. We don't check t->state here because there is a race with it
	 * executing another processor and just now entering stopped state.
	 * By using wake_up_state, we ensure the process will wake up and
	 * handle its death signal.
	 */
	mask = TASK_INTERRUPTIBLE;
	if (resume)
		mask |= TASK_WAKEKILL;
	if (!wake_up_state(t, mask))
		kick_process(t);
}
  
 
 

Analysis of do_exit():

1. Sets the PF_EXITING flag to indicate that the process is being eliminated.

2. If necessary, removes the dynamic timer via the del_timer_sync() function.

3. Detaches from the process descriptor the data structures related to paging, semaphores, filesystem, open file descriptors, namespaces, and the I/O Permission Bitmap, respectively, with the exit_mm(), exit_sem(), __exit_files(), __exit_fs(), exit_namespace(), and exit_thread() functions.

4. If the kernel functions implementing the execution domain and the executable format of the process being killed are included in kernel modules, the function decreases their usage counters.


5. Sets the exit_code field of the process descriptor to the process termination code. This value is either the _exit() or exit_group() system call parameter (normal termination), or an error code supplied by the kernel (abnormal termination).

6. Invokes the exit_notify() function to perform the following operations:


a. Updates the parenthood relationships of both the parent process and the child processes. All child processes created by the terminating process become children of another process in the same thread group, if any is running, or otherwise of the init process.


b. Checks whether the exit_signal field of the descriptor of the process being terminated is different from -1, and whether the process is the last member of its thread group. In this case, the function sends a signal (usually SIGCHLD) to the parent of the process being terminated, to notify the parent about the child's death.


c. Otherwise, if the exit_signal field is equal to -1 or the thread group includes other processes, the function sends a SIGCHLD signal to the parent only if the process is being traced (in this case the parent is the debugger, which is thus informed of the death of the lightweight process).


d. If the exit_signal field is equal to -1 and the process is not being traced, it sets the exit_state field of the process descriptor to EXIT_DEAD, and invokes release_task() to reclaim the memory of the remaining process data structures and to decrease the usage counter of the process descriptor (see the following section). The usage counter becomes equal to 1 (see step 3f in the copy_process() function), so that the process descriptor itself is not released right away.


e. Otherwise, if the exit_signal field is not equal to -1 or the process is being traced, it sets the exit_state field to EXIT_ZOMBIE. We'll see what happens to zombie processes in the following section.


f. Sets the PF_DEAD flag in the flags field of the process descriptor.


7. Invokes schedule(). A process in the EXIT_ZOMBIE state is ignored by the scheduler; the scheduler checks the PF_DEAD flag and decreases the usage counter in the descriptor of the zombie process being replaced, recording the fact that the process is no longer alive.


The release_task() function detaches the last data structures from the descriptor of a zombie process; it is applied to a zombie process in two possible ways:

1. by the do_exit() function, if the parent is not interested in receiving signals from the child; in this case the memory reclaiming is done by the scheduler.

2. by the wait4() or waitpid() system calls after a signal has been sent to the parent; in this case the function reclaims the memory used by the process descriptor.

pid_t waitpid(pid_t pid, int *statusPtr, int options);

1. pid is the process ID of the child to wait for.

2. statusPtr is a pointer to the location where status information for the terminating process is to be stored.

3. options specifies optional actions for the waitpid function. Either of the following option flags may be specified, or they can be combined with a bitwise inclusive OR operator:
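For example (a user-space sketch):

#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	int status;
	pid_t pid = fork();

	if (pid == 0)
		_exit(7);			/* child terminates with code 7 */

	waitpid(pid, &status, 0);		/* reaps the zombie child */
	if (WIFEXITED(status))
		printf("child exited with %d\n", WEXITSTATUS(status));
	return 0;
}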
/*
 * Handle sys_wait4 work for one task in state EXIT_ZOMBIE.  We hold
 * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
 * the lock and this task is uninteresting.  If we return nonzero, we have
 * released the lock and the system call should return.
 */
static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
{
	unsigned long state;
	int retval, status, traced;
	pid_t pid = task_pid_vnr(p);
	uid_t uid = __task_cred(p)->uid;
	struct siginfo __user *infop;

	if (!likely(wo->wo_flags & WEXITED))
		return 0;

	if (unlikely(wo->wo_flags & WNOWAIT)) {
		int exit_code = p->exit_code;
		int why;

		get_task_struct(p);
		read_unlock(&tasklist_lock);
		if ((exit_code & 0x7f) == 0) {
			why = CLD_EXITED;
			status = exit_code >> 8;
		} else {
			why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
			status = exit_code & 0x7f;
		}
		return wait_noreap_copyout(wo, p, pid, uid, why, status);
	}

	/*
	 * Try to move the task's state to DEAD
	 * only one thread is allowed to do this:
	 */
	state = xchg(&p->exit_state, EXIT_DEAD);
	if (state != EXIT_ZOMBIE) {
		BUG_ON(state != EXIT_DEAD);
		return 0;
	}

	traced = ptrace_reparented(p);
	/*
	 * It can be ptraced but not reparented, check
	 * !task_detached() to filter out sub-threads.
	 */
	if (likely(!traced) && likely(!task_detached(p))) {
		struct signal_struct *psig;
		struct signal_struct *sig;
		unsigned long maxrss;
		cputime_t tgutime, tgstime;

		/*
		 * The resource counters for the group leader are in its
		 * own task_struct.  Those for dead threads in the group
		 * are in its signal_struct, as are those for the child
		 * processes it has previously reaped.  All these
		 * accumulate in the parent's signal_struct c* fields.
		 *
		 * We don't bother to take a lock here to protect these
		 * p->signal fields, because they are only touched by
		 * __exit_signal, which runs with tasklist_lock
		 * write-locked anyway, and so is excluded here.  We do
		 * need to protect the access to parent->signal fields,
		 * as other threads in the parent group can be right
		 * here reaping other children at the same time.
		 *
		 * We use thread_group_times() to get times for the thread
		 * group, which consolidates times for all threads in the
		 * group including the group leader.
		 */
		thread_group_times(p, &tgutime, &tgstime);
		spin_lock_irq(&p->real_parent->sighand->siglock);
		psig = p->real_parent->signal;
		sig = p->signal;
		psig->cutime =
			cputime_add(psig->cutime,
			cputime_add(tgutime,
				    sig->cutime));
		psig->cstime =
			cputime_add(psig->cstime,
			cputime_add(tgstime,
				    sig->cstime));
		psig->cgtime =
			cputime_add(psig->cgtime,
			cputime_add(p->gtime,
			cputime_add(sig->gtime,
				    sig->cgtime)));
		psig->cmin_flt +=
			p->min_flt + sig->min_flt + sig->cmin_flt;
		psig->cmaj_flt +=
			p->maj_flt + sig->maj_flt + sig->cmaj_flt;
		psig->cnvcsw +=
			p->nvcsw + sig->nvcsw + sig->cnvcsw;
		psig->cnivcsw +=
			p->nivcsw + sig->nivcsw + sig->cnivcsw;
		psig->cinblock +=
			task_io_get_inblock(p) +
			sig->inblock + sig->cinblock;
		psig->coublock +=
			task_io_get_oublock(p) +
			sig->oublock + sig->coublock;
		maxrss = max(sig->maxrss, sig->cmaxrss);
		if (psig->cmaxrss < maxrss)
			psig->cmaxrss = maxrss;
		task_io_accounting_add(&psig->ioac, &p->ioac);
		task_io_accounting_add(&psig->ioac, &sig->ioac);
		spin_unlock_irq(&p->real_parent->sighand->siglock);
	}

	/*
	 * Now we are sure this task is interesting, and no other
	 * thread can reap it because we set its state to EXIT_DEAD.
	 */
	read_unlock(&tasklist_lock);

	retval = wo->wo_rusage
		? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
	status = (p->signal->flags & SIGNAL_GROUP_EXIT)
		? p->signal->group_exit_code : p->exit_code;
	if (!retval && wo->wo_stat)
		retval = put_user(status, wo->wo_stat);

	infop = wo->wo_info;
	if (!retval && infop)
		retval = put_user(SIGCHLD, &infop->si_signo);
	if (!retval && infop)
		retval = put_user(0, &infop->si_errno);
	if (!retval && infop) {
		int why;

		if ((status & 0x7f) == 0) {
			why = CLD_EXITED;
			status >>= 8;
		} else {
			why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
			status &= 0x7f;
		}
		retval = put_user((short)why, &infop->si_code);
		if (!retval)
			retval = put_user(status, &infop->si_status);
	}
	if (!retval && infop)
		retval = put_user(pid, &infop->si_pid);
	if (!retval && infop)
		retval = put_user(uid, &infop->si_uid);
	if (!retval)
		retval = pid;

	if (traced) {
		write_lock_irq(&tasklist_lock);
		/* We dropped tasklist, ptracer could die and untrace */
		ptrace_unlink(p);
		/*
		 * If this is not a detached task, notify the parent.
		 * If it's still not detached after that, don't release
		 * it now.
		 */
		if (!task_detached(p)) {
			do_notify_parent(p, p->exit_signal);
			if (!task_detached(p)) {
				p->exit_state = EXIT_ZOMBIE;
				p = NULL;
			}
		}
		write_unlock_irq(&tasklist_lock);
	}
	if (p != NULL)
		release_task(p);

	return retval;
}
/*
 * Consider @p for a wait by @parent.
 *
 * -ECHILD should be in ->notask_error before the first call.
 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
 * Returns zero if the search for a child should continue;
 * then ->notask_error is 0 if @p is an eligible child,
 * or another error from security_task_wait(), or still -ECHILD.
 */
static int wait_consider_task(struct wait_opts *wo, int ptrace,
				struct task_struct *p)
{
	/* ... */

	/*
	 * We don't reap group leaders with subthreads.
	 */
	if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
		return wait_task_zombie(wo, p);

	/*
	 * It's stopped or running now, so it might
	 * later continue, exit, or stop again.
	 */
	wo->notask_error = 0;

	if (task_stopped_code(p, ptrace))
		return wait_task_stopped(wo, ptrace, p);

	return wait_task_continued(wo, p);
}



/*
 * Do the work of do_wait() for one thread in the group, @tsk.
 *
 * -ECHILD should be in ->notask_error before the first call.
 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
 * Returns zero if the search for a child should continue; then
 * ->notask_error is 0 if there were any eligible children,
 * or another error from security_task_wait(), or still -ECHILD.
 */
static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
{
	struct task_struct *p;

	list_for_each_entry(p, &tsk->children, sibling) {
		int ret = wait_consider_task(wo, 0, p);
		if (ret)
			return ret;
	}

	return 0;
}


static long do_wait(struct wait_opts *wo)
{
	struct task_struct *tsk;
	int retval;

	trace_sched_process_wait(wo->wo_pid);

	init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
	wo->child_wait.private = current;
	add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
repeat:
	/*
	 * If there is nothing that can match our critiera just get out.
	 * We will clear ->notask_error to zero if we see any child that
	 * might later match our criteria, even if we are not able to reap
	 * it yet.
	 */
	wo->notask_error = -ECHILD;
	if ((wo->wo_type < PIDTYPE_MAX) &&
	   (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type])))
		goto notask;

	set_current_state(TASK_INTERRUPTIBLE);
	read_lock(&tasklist_lock);
	tsk = current;
	do {
		retval = do_wait_thread(wo, tsk);
		if (retval)
			goto end;

		retval = ptrace_do_wait(wo, tsk);
		if (retval)
			goto end;

		if (wo->wo_flags & __WNOTHREAD)
			break;
	} while_each_thread(current, tsk);
	read_unlock(&tasklist_lock);

notask:
	retval = wo->notask_error;
	if (!retval && !(wo->wo_flags & WNOHANG)) {
		retval = -ERESTARTSYS;
		if (!signal_pending(current)) {
			schedule();
			goto repeat;
		}
	}
end:
	__set_current_state(TASK_RUNNING);
	remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
	return retval;
}





Implementation of release_task():



1. Decreases the number of processes belonging to the user owner of the terminated process. This value is stored in the user_struct structure mentioned earlier in the chapter.

2. If the process is being traced, the function removes it from the debugger's ptrace_children list and assigns the process back to its original parent.

3. Invokes __exit_signal() to cancel any pending signal and to release the signal_struct descriptor of the process. If the descriptor is no longer used by other lightweight processes, the function also removes this data structure. Moreover, the function invokes exit_itimers() to detach any POSIX interval timer from the process.




4. Invokes __exit_sighand() to get rid of the signal handlers.

5. Invokes __unhash_process(), which in turn:

a. Decreases by 1 the nr_threads variable.

b. Invokes detach_pid() twice to remove the process descriptor from the pidhash hash tables of type PIDTYPE_PID and PIDTYPE_TGID.

c. If the process is a thread group leader, invokes detach_pid() twice more to remove the process descriptor from the PIDTYPE_PGID and PIDTYPE_SID hash tables.

d. Uses the REMOVE_LINKS macro to unlink the process descriptor from the process list.


6. If the process is not a thread group leader, the leader is a zombie, and the process is the last member of the thread group, the function sends a signal to the parent of the leader to notify it of the death of the process.

7. Invokes the sched_exit() function to adjust the timeslice of the parent process.

8. Invokes put_task_struct() to decrease the process descriptor's usage counter; if the counter becomes zero, the function drops any remaining reference to the process:

a. Decreases the usage counter (__count field) of the user_struct data structure of the user that owns the process, and releases that data structure if the counter becomes zero.

b. Releases the process descriptor and the memory area used to contain the thread_info descriptor and the Kernel Mode stack.


static void __unhash_process(struct task_struct *p, bool group_dead)
{
	nr_threads--;
	detach_pid(p, PIDTYPE_PID);
	if (group_dead) {
		detach_pid(p, PIDTYPE_PGID);
		detach_pid(p, PIDTYPE_SID);

		list_del_rcu(&p->tasks);
		list_del_init(&p->sibling);
		__this_cpu_dec(process_counts);
	}
	list_del_rcu(&p->thread_group);
}

/*
 * This function expects the tasklist_lock write-locked.
 */
static void __exit_signal(struct task_struct *tsk)
{
	struct signal_struct *sig = tsk->signal;
	bool group_dead = thread_group_leader(tsk);
	struct sighand_struct *sighand;
	struct tty_struct *uninitialized_var(tty);

	sighand = rcu_dereference_check(tsk->sighand,
					rcu_read_lock_held() ||
					lockdep_tasklist_lock_is_held());
	spin_lock(&sighand->siglock);

	posix_cpu_timers_exit(tsk);
	if (group_dead) {
		posix_cpu_timers_exit_group(tsk);
		tty = sig->tty;
		sig->tty = NULL;
	} else {
		/*
		 * This can only happen if the caller is de_thread().
		 * FIXME: this is the temporary hack, we should teach
		 * posix-cpu-timers to handle this case correctly.
		 */
		if (unlikely(has_group_leader_pid(tsk)))
			posix_cpu_timers_exit_group(tsk);

		/*
		 * If there is any task waiting for the group exit
		 * then notify it:
		 */
		if (sig->notify_count > 0 && !--sig->notify_count)
			wake_up_process(sig->group_exit_task);

		if (tsk == sig->curr_target)
			sig->curr_target = next_thread(tsk);
		/*
		 * Accumulate here the counters for all threads but the
		 * group leader as they die, so they can be added into
		 * the process-wide totals when those are taken.
		 * The group leader stays around as a zombie as long
		 * as there are other threads.  When it gets reaped,
		 * the exit.c code will add its counts into these totals.
		 * We won't ever get here for the group leader, since it
		 * will have been the last reference on the signal_struct.
		 */
		sig->utime = cputime_add(sig->utime, tsk->utime);
		sig->stime = cputime_add(sig->stime, tsk->stime);
		sig->gtime = cputime_add(sig->gtime, tsk->gtime);
		sig->min_flt += tsk->min_flt;
		sig->maj_flt += tsk->maj_flt;
		sig->nvcsw += tsk->nvcsw;
		sig->nivcsw += tsk->nivcsw;
		sig->inblock += task_io_get_inblock(tsk);
		sig->oublock += task_io_get_oublock(tsk);
		task_io_accounting_add(&sig->ioac, &tsk->ioac);
		sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
	}

	sig->nr_threads--;
	__unhash_process(tsk, group_dead);

	/*
	 * Do this under ->siglock, we can race with another thread
	 * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
	 */
	flush_sigqueue(&tsk->pending);
	tsk->sighand = NULL;
	spin_unlock(&sighand->siglock);

	__cleanup_sighand(sighand);
	clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
	if (group_dead) {
		flush_sigqueue(&sig->shared_pending);
		tty_kref_put(tty);
	}
}

static void delayed_put_task_struct(struct rcu_head *rhp)
{
	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);

	perf_event_delayed_put(tsk);
	trace_sched_process_free(tsk);
	put_task_struct(tsk);
}


void release_task(struct task_struct * p)
{
	struct task_struct *leader;
	int zap_leader;
repeat:
	tracehook_prepare_release_task(p);
	/* don't need to get the RCU readlock here - the process is dead and
	 * can't be modifying its own credentials. But shut RCU-lockdep up */
	rcu_read_lock();
	atomic_dec(&__task_cred(p)->user->processes);
	rcu_read_unlock();

	proc_flush_task(p);

	write_lock_irq(&tasklist_lock);
	tracehook_finish_release_task(p);
	__exit_signal(p);

	/*
	 * If we are the last non-leader member of the thread
	 * group, and the leader is zombie, then notify the
	 * group leader's parent process. (if it wants notification.)
	 */
	zap_leader = 0;
	leader = p->group_leader;
	if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) {
		BUG_ON(task_detached(leader));
		do_notify_parent(leader, leader->exit_signal);
		/*
		 * If we were the last child thread and the leader has
		 * exited already, and the leader's parent ignores SIGCHLD,
		 * then we are the one who should release the leader.
		 *
		 * do_notify_parent() will have marked it self-reaping in
		 * that case.
		 */
		zap_leader = task_detached(leader);

		/*
		 * This maintains the invariant that release_task()
		 * only runs on a task in EXIT_DEAD, just for sanity.
		 */
		if (zap_leader)
			leader->exit_state = EXIT_DEAD;
	}

	write_unlock_irq(&tasklist_lock);
	release_thread(p);
	call_rcu(&p->rcu,delayed_put_task_struct);

	p = leader;
	if (unlikely(zap_leader))
		goto repeat;
}





NORET_TYPE void do_exit(long code)
{
	struct task_struct *tsk = current;
	int group_dead;

	profile_task_exit(tsk);

	WARN_ON(atomic_read(&tsk->fs_excl));

	if (unlikely(in_interrupt()))
		panic("Aiee, killing interrupt handler!");
	if (unlikely(!tsk->pid))
		panic("Attempted to kill the idle task!");

	/*
	 * If do_exit is called because this processes oopsed, it's possible
	 * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
	 * continuing. Amongst other possible reasons, this is to prevent
	 * mm_release()->clear_child_tid() from writing to a user-controlled
	 * kernel address.
	 */
	set_fs(USER_DS);

	tracehook_report_exit(&code);

	validate_creds_for_do_exit(tsk);

	/*
	 * We're taking recursive faults here in do_exit. Safest is to just
	 * leave this task alone and wait for reboot.
	 */
	if (unlikely(tsk->flags & PF_EXITING)) {
		printk(KERN_ALERT
			"Fixing recursive fault but reboot is needed!\n");
		/*
		 * We can do this unlocked here. The futex code uses
		 * this flag just to verify whether the pi state
		 * cleanup has been done or not. In the worst case it
		 * loops once more. We pretend that the cleanup was
		 * done as there is no way to return. Either the
		 * OWNER_DIED bit is set by now or we push the blocked
		 * task into the wait for ever nirwana as well.
		 */
		tsk->flags |= PF_EXITPIDONE;
		set_current_state(TASK_UNINTERRUPTIBLE);
		schedule();
	}

	exit_irq_thread();

	exit_signals(tsk);  /* sets PF_EXITING */
	/*
	 * tsk->flags are checked in the futex code to protect against
	 * an exiting task cleaning up the robust pi futexes.
	 */
	smp_mb();
	raw_spin_unlock_wait(&tsk->pi_lock);

	if (unlikely(in_atomic()))
		printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
				current->comm, task_pid_nr(current),
				preempt_count());

	acct_update_integrals(tsk);
	/* sync mm's RSS info before statistics gathering */
	if (tsk->mm)
		sync_mm_rss(tsk, tsk->mm);
	group_dead = atomic_dec_and_test(&tsk->signal->live);
	if (group_dead) {
		hrtimer_cancel(&tsk->signal->real_timer);
		exit_itimers(tsk->signal);
		if (tsk->mm)
			setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
	}
	acct_collect(code, group_dead);
	if (group_dead)
		tty_audit_exit();
	if (unlikely(tsk->audit_context))
		audit_free(tsk);

	tsk->exit_code = code;
	taskstats_exit(tsk, group_dead);

	exit_mm(tsk);

	if (group_dead)
		acct_process();
	trace_sched_process_exit(tsk);

	exit_sem(tsk);
	exit_files(tsk);
	exit_fs(tsk);
	check_stack_usage();
	exit_thread();

	/*
	 * Flush inherited counters to the parent - before the parent
	 * gets woken up by child-exit notifications.
	 *
	 * because of cgroup mode, must be called before cgroup_exit()
	 */
	perf_event_exit_task(tsk);

	cgroup_exit(tsk, 1);

	if (group_dead)
		disassociate_ctty(1);

	module_put(task_thread_info(tsk)->exec_domain->module);

	proc_exit_connector(tsk);

	/*
	 * FIXME: do that only when needed, using sched_exit tracepoint
	 */
	flush_ptrace_hw_breakpoint(tsk);

	exit_notify(tsk, group_dead);
#ifdef CONFIG_NUMA
	task_lock(tsk);
	mpol_put(tsk->mempolicy);
	tsk->mempolicy = NULL;
	task_unlock(tsk);
#endif
#ifdef CONFIG_FUTEX
	if (unlikely(current->pi_state_cache))
		kfree(current->pi_state_cache);
#endif
	/*
	 * Make sure we are holding no locks:
	 */
	debug_check_no_locks_held(tsk);
	/*
	 * We can do this unlocked here. The futex code uses this flag
	 * just to verify whether the pi state cleanup has been done
	 * or not. In the worst case it loops once more.
	 */
	tsk->flags |= PF_EXITPIDONE;

	if (tsk->io_context)
		exit_io_context(tsk);

	if (tsk->splice_pipe)
		__free_pipe_info(tsk->splice_pipe);

	validate_creds_for_do_exit(tsk);

	preempt_disable();
	exit_rcu();
	/* causes final put_task_struct in finish_task_switch(). */
	tsk->state = TASK_DEAD;
	schedule();
	BUG();
	/* Avoid "noreturn function does return".  */
	for (;;)
		cpu_relax();	/* For when BUG is null */
}













