在Linux中,内核线程在以下几个方面不同于普通进程:
内核线程只运行在内核态,而普通进程既可以运行在内核态,也可以运行在用户态。
内核线程只使用大于PAGE_OFFSET的线性地址空间,而普通进程(无论处于用户态还是内核态)可以使用全部的线性地址空间。
创建内核线程
/*
 * Create a kernel thread.
 *
 * Builds a zeroed register frame, fills in the fields below, and hands it
 * to do_fork():
 *   - ebx carries the function pointer and edx its argument; both are
 *     consumed by kernel_thread_helper, which eip is set to.
 *   - xds/xes are set to __USER_DS and xcs to __KERNEL_CS.
 *
 * fn:    function the new thread is to run
 * arg:   opaque argument passed to fn
 * flags: clone flags; CLONE_VM and CLONE_UNTRACED are always OR-ed in
 *
 * Returns the value returned by do_fork().
 */
int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
{
	struct pt_regs regs;

	memset(&regs, 0, sizeof(regs));

	regs.ebx = (unsigned long) fn;
	regs.edx = (unsigned long) arg;

	regs.xds = __USER_DS;
	regs.xes = __USER_DS;
	regs.orig_eax = -1;
	regs.eip = (unsigned long) kernel_thread_helper;
	regs.xcs = __KERNEL_CS;
	regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;

	/* Ok, create the new process.. */
	return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
}
CLONE_VM标志避免复制调用进程的页表:由于内核线程不会访问用户态地址空间,复制页表只会白白浪费时间和内存;CLONE_UNTRACED标志则保证新创建的内核线程不会被任何进程跟踪。
/*
 * Assembly entry point whose address kernel_thread() stores in regs.eip.
 * kernel_thread() stores fn in regs.ebx and arg in regs.edx.
 *
 * NOTE(review): the opening of this assembly statement (presumably
 * `__asm__(".section .text\n"`) is not shown in this excerpt — confirm
 * against the original source.
 */
".align 4\n"
"kernel_thread_helper:\n\t"
"movl %edx,%eax\n\t"	/* copy the value in %edx into %eax */
"pushl %edx\n\t"	/* push %edx onto the stack before the call */
"call *%ebx\n\t"	/* indirect call through the pointer held in %ebx */
"pushl %eax\n\t"	/* push the value left in %eax by that call */
"call do_exit\n"	/* then call do_exit */
".previous");
{// Take down the other threads of current's thread group, then exit via do_exit().
// NOTE(review): the function header is missing from this excerpt; exit_code is
// presumably the function's parameter — confirm against the original source.
BUG_ON(exit_code & 0x80); /* core dumps don't get here */
// If SIGNAL_GROUP_EXIT is already set, a group exit is already in progress:
// adopt the exit code recorded for the group.
if (current->signal->flags & SIGNAL_GROUP_EXIT)
exit_code = current->signal->group_exit_code;
else if (!thread_group_empty(current)) {
struct signal_struct *const sig = current->signal;
struct sighand_struct *const sighand = current->sighand;
read_lock(&tasklist_lock);
spin_lock_irq(&sighand->siglock);
if (sig->flags & SIGNAL_GROUP_EXIT)
/* Another thread got here before we took the lock. */
exit_code = sig->group_exit_code;
else {
/* First to get here: mark the group as exiting and record the code. */
sig->flags = SIGNAL_GROUP_EXIT;
sig->group_exit_code = exit_code;
zap_other_threads(current);
}
spin_unlock_irq(&sighand->siglock);
read_unlock(&tasklist_lock);
}
do_exit(exit_code);
/* NOTREACHED */
}
{
	/*
	 * Tear down the current task and schedule away for the last time.
	 *
	 * NOTE(review): the function header is missing from this excerpt;
	 * `code` used below is presumably the exit-code parameter of
	 * do_exit() — confirm against the original source.
	 */
	struct task_struct *tsk = current;
	int group_dead;

	profile_task_exit(tsk);

	/* Refuse to exit from contexts/tasks that must never die. */
	if (unlikely(in_interrupt()))
		panic("Aiee, killing interrupt handler!");
	if (unlikely(!tsk->pid))
		panic("Attempted to kill the idle task!");
	if (unlikely(tsk->pid == 1))
		panic("Attempted to kill init!");
	if (tsk->io_context)
		exit_io_context();

	if (unlikely(current->ptrace & PT_TRACE_EXIT)) {
		current->ptrace_message = code;
		ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP);
	}

	/* Set PF_EXITING in the task's flags to mark it as being torn down. */
	tsk->flags |= PF_EXITING;
	/* Remove the task's real_timer from the dynamic timer queue. */
	del_timer_sync(&tsk->real_timer);

	if (unlikely(in_atomic()))
		printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
				current->comm, current->pid,
				preempt_count());

	acct_update_integrals();
	update_mem_hiwater();
	/* group_dead is true when this decrement takes signal->live to zero. */
	group_dead = atomic_dec_and_test(&tsk->signal->live);
	if (group_dead)
		acct_process(code);

	/* Release the per-task resources one subsystem at a time. */
	exit_mm(tsk);
	exit_sem(tsk);
	__exit_files(tsk);
	__exit_fs(tsk);
	exit_namespace(tsk);
	exit_thread();
	exit_keys(tsk);

	if (group_dead && tsk->signal->leader)
		disassociate_ctty(1);

	module_put(tsk->thread_info->exec_domain->module);
	if (tsk->binfmt)
		module_put(tsk->binfmt->module);

	tsk->exit_code = code;	/* record the termination code */
	exit_notify(tsk);
#ifdef CONFIG_NUMA
	mpol_free(tsk->mempolicy);
	tsk->mempolicy = NULL;
#endif

	/* exit_notify() is expected to have set PF_DEAD by now. */
	BUG_ON(!(current->flags & PF_DEAD));
	schedule();
	BUG();
	/* Avoid "noreturn function does return". */
	for (;;) ;
}
/*
 * Send signals to all our closest relatives so that they know
 * to properly mourn us..
 */
static void exit_notify(struct task_struct *tsk)
{
int state;
struct task_struct *t;
struct list_head ptrace_dead, *_p, *_n;
if (signal_pending(tsk) && !(tsk->signal->flags & SIGNAL_GROUP_EXIT)
&& !thread_group_empty(tsk)) {
/*
 * This occurs when there was a race between our exit
 * syscall and a group signal choosing us as the one to
 * wake up. It could be that we are the only thread
 * alerted to check for pending signals, but another thread
 * should be woken now to take the signal since we will not.
 * Now we'll wake all the threads in the group just to make
 * sure someone gets all the pending signals.
 */
read_lock(&tasklist_lock);
spin_lock_irq(&tsk->sighand->siglock);
for (t = next_thread(tsk); t != tsk; t = next_thread(t))
if (!signal_pending(t) && !(t->flags & PF_EXITING)) {
recalc_sigpending_tsk(t);
if (signal_pending(t))
signal_wake_up(t, 0); /* wake t up */
}
spin_unlock_irq(&tsk->sighand->siglock);
read_unlock(&tasklist_lock);
}
write_lock_irq(&tasklist_lock);
/*
 * This does two things:
 *
 * A. Make init inherit all the child processes
 * B. Check to see if any process groups have become orphaned
 * as a result of our exiting, and if they have any stopped
 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
 */
INIT_LIST_HEAD(&ptrace_dead);
b)养父是当前进程的祖先进程;
c)养父是init进程;
当一个进程被跟踪(ptrace)时,它的p_pptr指针被设置成指向正在跟踪它的进程,那个进程就暂时成了被跟踪进程的“养父”。而被跟踪进程的p_opptr指针却不变,仍旧指向其生父。如果一个进程在其子进程之前“去世”的话,就要把它的子进程托付给某个进程。托付给谁呢?如果当前进程是一个线程,那就托付给同一线程组中的下一个线程,使子进程的p_opptr指向这个线程。否则,就只好托付给系统中的init进程,所以这init进程就好像是孤儿院。由此可见,所谓“original parent”也不是永远不变的,原因在于系统中的进程号pid以及用作task_struct结构的页面都是在周转使用的,所以实际上一来并没有保留这个记录的意义,二来技术上也有困难。现在,当前进程要exit()了,所以要将其所有的子进程都送进“孤儿院”,要不然到它们也要exit()的时候就没有父进程来料理它们的后事了。这就是调用forget_original_parent()的目的(kernel/exit.c)。
该函数将当前进程每个子进程的p_opptr指向child_reaper,即init进程,并嘱其将来exit()时要发一个SIGCHLD信号给child_reaper,并根据当前进程的task_struct结构中的pdeath_signal的设置向其发一个信号,告知生父的“噩耗”。
forget_original_parent(tsk, &ptrace_dead);
/*
 * Return the task that follows p on its PIDTYPE_TGID pid list,
 * i.e. the next thread in p's thread group.
 */
task_t fastcall *next_thread(const task_t *p)
{
	return pid_task(p->pids[PIDTYPE_TGID].pid_list.next, PIDTYPE_TGID);
}

/*
 * When we die, we re-parent all our children.
 * Try to give them to another thread in our thread
 * group, and if no such member exists, give it to
 * the global child reaper process (ie "init")
 *
 * father:     the exiting task whose children are being handed off
 * to_release: list that collects ptraced zombie children with
 *             exit_signal == -1, for the caller to release
 */
static inline void forget_original_parent(struct task_struct * father,
					  struct list_head *to_release)
{
	struct task_struct *p, *reaper = father;
	struct list_head *_p, *_n;

	/*
	 * Walk the thread group looking for a thread whose exit_state is
	 * still zero; if we come back around to father, fall back to
	 * child_reaper.
	 */
	do {
		reaper = next_thread(reaper);
		if (reaper == father) {
			reaper = child_reaper; // struct task_struct *child_reaper = &init_task;
			break;
		}
	} while (reaper->exit_state);

	/*
	 * There are only two places where our children can be:
	 *
	 * - in our child list
	 * - in our ptraced child list
	 *
	 * Search them and reparent children.
	 */
	list_for_each_safe(_p, _n, &father->children) {
		int ptrace;
		p = list_entry(_p,struct task_struct,sibling);

		ptrace = p->ptrace;

		/* if father isn't the real parent, then ptrace must be enabled */
		BUG_ON(father != p->real_parent && !ptrace);

		if (father == p->real_parent) {
			/* reparent with a reaper, real father it's us */
			choose_new_parent(p, reaper, child_reaper);
			reparent_thread(p, father, 0);
		} else {
			/* reparent ptraced task to its real parent */
			__ptrace_unlink (p);
			if (p->exit_state == EXIT_ZOMBIE && p->exit_signal != -1 &&
			    thread_group_empty(p))
				do_notify_parent(p, p->exit_signal);
		}

		/*
		 * if the ptraced child is a zombie with exit_signal == -1
		 * we must collect it before we exit, or it will remain
		 * zombie forever since we prevented it from self-reap itself
		 * while it was being traced by us, to be able to see it in wait4.
		 */
		if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && p->exit_signal == -1))
			list_add(&p->ptrace_list, to_release);
	}
	list_for_each_safe(_p, _n, &father->ptrace_children) {
		p = list_entry(_p,struct task_struct,ptrace_list);
		choose_new_parent(p, reaper, child_reaper);
		reparent_thread(p, father, 1);
	}
}
/* Both child lists of the exiting task must be empty at this point. */
BUG_ON(!list_empty(&tsk->children));
BUG_ON(!list_empty(&tsk->ptrace_children));
/*
 * Check to see if any process groups have become orphaned
 * as a result of our exiting, and if they have any stopped
 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
 *
 * Case i: Our father is in a different pgrp than we are
 * and we were the only connection outside, so our pgrp
 * is about to become orphaned.
 */
/* From here on, t refers to the exiting task's real parent. */
t = tsk->real_parent;
if ((process_group(t) != process_group(tsk)) &&
(t->signal->session == tsk->signal->session) &&
will_become_orphaned_pgrp(process_group(tsk), tsk) &&
has_stopped_jobs(process_group(tsk))) {
__kill_pg_info(SIGHUP, (void *)1, process_group(tsk));
__kill_pg_info(SIGCONT, (void *)1, process_group(tsk));
}
/* Let father know we died
 *
 * Thread signals are configurable, but you aren't going to use
 * that to send signals to arbitrary processes.
 * That stops right now.
 *
 * If the parent exec id doesn't match the exec id we saved
 * when we started then we know the parent has changed security
 * domain.
 *
 * If our self_exec id doesn't match our parent_exec_id then
 * we have changed execution domain as these two values started
 * the same after a fork.
 *
 */
/* In either case, fall back to SIGCHLD unless the task has CAP_KILL. */
if (tsk->exit_signal != SIGCHLD && tsk->exit_signal != -1 &&
( tsk->parent_exec_id != t->self_exec_id ||
tsk->self_exec_id != tsk->parent_exec_id)
&& !capable(CAP_KILL))
tsk->exit_signal = SIGCHLD;
/* If something other than our normal parent is ptracing us, then
 * send it a SIGCHLD instead of honoring exit_signal. exit_signal
 * only has special meaning to our real parent.
 */
if (tsk->exit_signal != -1 && thread_group_empty(tsk)) {
int signal = tsk->parent == tsk->real_parent ? tsk->exit_signal : SIGCHLD;
do_notify_parent(tsk, signal);
} else if (tsk->ptrace) {
/* Otherwise, a ptraced task still notifies its parent with SIGCHLD. */
do_notify_parent(tsk, SIGCHLD);
}
p_cptr,指向子进程,这里的c表示“child”。p_cptr与p_pptr是相对应的。当一个进程有多个子进程时,p_cptr指向其“最年轻的”,也就是最近创建的那个子进程。p_ysptr,指向当前进程的“弟弟”,这里的y表示“younger”,而s表示“sibling”。p_osptr,指向当前进程的“哥哥”,这里的o表示“older”。这样,当前进程的所有子进程都通过p_ysptr和p_osptr连接在一起形成一个双链队列。队列中每一个进程的p_pptr都指向当前进程,而当前进程的p_cptr则指向队列中最后创建的子进程。有趣的是,
子进程在行事时只认其“养父”,而p_opptr所指的“生父”倒似乎无关紧要。当然,一个进程除身处这个由亲属关系形成的队列中之外,同时也身处其它的队列中,所以task_struct结构中还有其它的指针。
<span style="font-size:18px;">/*
* Let a parent know about the death of a child.
* For a stopped/continued status change, use do_notify_parent_cldstop instead.
*/
void do_notify_parent(struct task_struct *tsk, int sig)
{
struct siginfo info;
unsigned long flags;
struct sighand_struct *psig;
BUG_ON(sig == -1);
/* do_notify_parent_cldstop should have been called instead. */
BUG_ON(tsk->state & (TASK_STOPPED|TASK_TRACED));
BUG_ON(!tsk->ptrace &&
(tsk->group_leader != tsk || !thread_group_empty(tsk)));
info.si_signo = sig;
info.si_errno = 0;
info.si_pid = tsk->pid;
info.si_uid = tsk->uid;
/* FIXME: find out whether or not this is supposed to be c*time. */
info.si_utime = cputime_to_jiffies(cputime_add(tsk->utime,
tsk->signal->utime));
info.si_stime = cputime_to_jiffies(cputime_add(tsk->stime,
tsk->signal->stime));
info.si_status = tsk->exit_code & 0x7f;
if (tsk->exit_code & 0x80)
info.si_code = CLD_DUMPED;
else if (tsk->exit_code & 0x7f)
info.si_code = CLD_KILLED;
else {
info.si_code = CLD_EXITED;
info.si_status = tsk->exit_code >> 8;
}
psig = tsk->parent->sighand;
spin_lock_irqsave(&psig->siglock, flags);
if (sig == SIGCHLD &&
(psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
(psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) {
/*
* We are exiting and our parent doesn't care. POSIX.1
* defines special semantics for setting SIGCHLD to SIG_IGN
* or setting the SA_NOCLDWAIT flag: we should be reaped
* automatically and not left for our parent's wait4 call.
* Rather than having the parent do it as a magic kind of
* signal handler, we just set this to tell do_exit that we
* can be cleaned up without becoming a zombie. Note that
* we still call __wake_up_parent in this case, because a
* blocked sys_wait4 might now return -ECHILD.
*
* Whether we send SIGCHLD or not for SA_NOCLDWAIT
* is implementation-defined: we do (if you don't want
* it, just use SIG_IGN instead).
*/
tsk->exit_signal = -1;
if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
sig = 0;
}</span>
<span style="font-size:18px;"><span style="color: rgb(51, 51, 51); font-family: Arial; font-size: 13.63636302948px; line-height: 25.9943180084229px; widows: auto;"> send_sig_info(..., tsk->p_pptr):向父进程发送信号通知其自身结束的消息</span><br style="color: rgb(51, 51, 51); font-family: Arial; font-size: 13.63636302948px; line-height: 25.9943180084229px; widows: auto;" /><span style="color: rgb(51, 51, 51); font-family: Arial; font-size: 13.63636302948px; line-height: 25.9943180084229px; widows: auto;"> wake_up_parent(tsk->p_pptr): 唤醒父进程使之进入可调度队列(状态变为TASK_RUNNING)</span>
if (sig > 0 && sig <= _NSIG)
__group_send_sig_info(sig, &info, tsk->parent);
</span><span style="font-size:32px;">__wake_up_parent(tsk, tsk->parent);</span><span style="font-size:18px;">
spin_unlock_irqrestore(&psig->siglock, flags);
}
</span>
<span style="font-size:32px;">static inline void __wake_up_parent(struct task_struct *p,
<span style="white-space: pre;"> </span> struct task_struct *parent)</span><span style="font-size:18px;">
{
</span><span style="font-size:18px; white-space: pre;"> </span><span style="font-size:18px;">wake_up_interruptible_sync(&parent->signal->wait_chldexit);
}
</span>
/*
 * Pick the final exit state: EXIT_ZOMBIE by default, EXIT_DEAD when
 * exit_signal is -1 and either the task is not ptraced or its parent's
 * signal flags have SIGNAL_GROUP_EXIT set.
 */
state = EXIT_ZOMBIE;
if (tsk->exit_signal == -1 &&
(likely(tsk->ptrace == 0) ||
unlikely(tsk->parent->signal->flags & SIGNAL_GROUP_EXIT)))
state = EXIT_DEAD;
tsk->exit_state = state;
/*
 * Clear these here so that update_process_times() won't try to deliver
 * itimer, profile or rlimit signals to this task while it is in late exit.
 */
tsk->it_virt_value = cputime_zero;
tsk->it_prof_value = cputime_zero;
write_unlock_irq(&tasklist_lock);
/* Release every task that was collected on the ptrace_dead list. */
list_for_each_safe(_p, _n, &ptrace_dead) {
list_del_init(_p);
t = list_entry(_p,struct task_struct,ptrace_list);
release_task(t);
}
/* If the process is dead, release it - nobody will wait for it */
if (state == EXIT_DEAD)
release_task(tsk);
/* PF_DEAD causes final put_task_struct after we schedule. */
preempt_disable();
tsk->flags |= PF_DEAD;
}
养父进程释放剩下的资源
养父进程使用sys_waitpid()->sys_wait4()处理消亡的子进程,处理过程如下:
1、在当前进程的栈中定义一个struct wait_opts wo数据结构,重点关注其wait_queue_t child_wait成员;
2、调用add_wait_queue()将wo的child_wait成员加入到当前进程的wait_chldexit队列中;
3、设置当前进程的运行状态为可中断状态(INTERRUPTIBLE);
4、如果子进程满足以下条件:
a)子进程的状态为EXIT_DEAD,跳转到5);
b)子进程的状态为EXIT_ZOMBIE,则调用wait_task_zombie()处理子进程消亡,跳转到5);
c)子进程的状态为STOPPED,则调用wait_task_stopped()处理子进程的消亡,跳转到5);
d)否则,调用wait_task_continued()继续执行;调用schedule()将当前进程睡眠,等到下一次被调度时,程序跳转到4)执行。
5、设置当前进程的运行状态为TASK_RUNNING;
6、调用remove_wait_queue()将child_wait从wait_chldexit队列中移除。
也就是说,sys_wait4()一开头就在当前进程的系统堆栈上分配一个wait_queue_t数据结构(名为wait),结构中的compiler_warning为0x1234567,指针task指向当前进程的task_struct,而list_head结构task_list中的两个指针均为NULL。由于这个数据结构建立在当前进程的系统空间堆栈中,一旦从sys_wait4()返回,这个数据结构就不复存在了。与此相应,在进程的task_struct中有个wait_queue_head_t数据结构wait_chldexit用于这个目的。
然后,通过add_wait_queue()将这个数据结构(wait)加入到当前进程的wait_chldexit队列中。这样做的作用在下面重温了do_notify_parent()的代码以后就会清楚。接着,就进入了一个循环,这是一个不小的循环(kernel/exit.c: sys_wait4()):
这个由goto实现的循环要到当前进程被调度运行,并且下列条件之一得到满足时才结束(见代码中的“goto end_wait4”语句):
1、所等待的子进程的状态变成TASK_STOPPED或TASK_ZOMBIE;
2、所等待的子进程存在,可是不在上述两个状态,而调用参数options中的WNOHANG标志位为1,或者当前进程收到了其它的信号;
3、进程号为pid的那个进程根本不存在,或者不是当前进程的子进程。
/*
 * Wait for a state change in a child selected by pid/options.
 *
 * pid, options: selection criteria, passed to eligible_child() for each
 *               candidate; options is also tested for WUNTRACED, WEXITED,
 *               WCONTINUED, WNOWAIT, WNOHANG and __WNOTHREAD
 * infop, stat_addr, ru: user-space result buffers, forwarded to the
 *               wait_task_*() helpers
 *
 * Returns the non-zero result of a wait_task_*() helper when one reports,
 * 0 for a WNOHANG call with nothing to report yet, -ERESTARTSYS when a
 * signal is pending, or -ECHILD when no matching child was seen. When
 * infop is set, positive results are folded to 0 and, on a zero result,
 * the siginfo fields are cleared.
 */
static long do_wait(pid_t pid, int options, struct siginfo __user *infop,
		    int __user *stat_addr, struct rusage __user *ru)
{
	DECLARE_WAITQUEUE(wait, current);
	struct task_struct *tsk;
	int flag, retval;

	add_wait_queue(&current->signal->wait_chldexit,&wait);
repeat:
	/*
	 * We will set this flag if we see any child that might later
	 * match our criteria, even if we are not able to reap it yet.
	 */
	flag = 0;
	current->state = TASK_INTERRUPTIBLE;
	read_lock(&tasklist_lock);
	tsk = current;
	do {
		struct task_struct *p;
		struct list_head *_p;
		int ret;

		list_for_each(_p,&tsk->children) {
			p = list_entry(_p,struct task_struct,sibling);

			ret = eligible_child(pid, options, p);
			if (!ret)
				continue;

			switch (p->state) {
			case TASK_TRACED:
				if (!my_ptrace_child(p))
					continue;
				/*FALLTHROUGH*/
			case TASK_STOPPED:
				/*
				 * It's stopped now, so it might later
				 * continue, exit, or stop again.
				 */
				flag = 1;
				if (!(options & WUNTRACED) &&
				    !my_ptrace_child(p))
					continue;
				retval = wait_task_stopped(p, ret == 2,
							   (options & WNOWAIT),
							   infop,
							   stat_addr, ru);
				if (retval == -EAGAIN)
					goto repeat;
				if (retval != 0) /* He released the lock. */
					goto end;
				break;
			default:
			// case EXIT_DEAD:
				if (p->exit_state == EXIT_DEAD)
					continue;
			// case EXIT_ZOMBIE:
				if (p->exit_state == EXIT_ZOMBIE) {
					/*
					 * Eligible but we cannot release
					 * it yet:
					 */
					if (ret == 2)
						goto check_continued;
					if (!likely(options & WEXITED))
						continue;
					/* wait_task_zombie() ends up calling release_task(). */
					retval = wait_task_zombie(
						p, (options & WNOWAIT),
						infop, stat_addr, ru);
					/* He released the lock. */
					if (retval != 0)
						goto end;
					break;
				}
check_continued:
				/*
				 * It's running now, so it might later
				 * exit, stop, or stop and then continue.
				 */
				flag = 1;
				if (!unlikely(options & WCONTINUED))
					continue;
				retval = wait_task_continued(
					p, (options & WNOWAIT),
					infop, stat_addr, ru);
				if (retval != 0) /* He released the lock. */
					goto end;
				break;
			}
		}
		if (!flag) {
			list_for_each(_p, &tsk->ptrace_children) {
				p = list_entry(_p, struct task_struct,
						ptrace_list);
				if (!eligible_child(pid, options, p))
					continue;
				flag = 1;
				break;
			}
		}
		if (options & __WNOTHREAD)
			break;
		tsk = next_thread(tsk);
		if (tsk->signal != current->signal)
			BUG();
		/*
		 * Every thread in the group is visited because the current
		 * task may be one thread while the child being waited for
		 * was created by another thread cloned from the same process.
		 */
	} while (tsk != current);

	read_unlock(&tasklist_lock);
	if (flag) {
		retval = 0;
		if (options & WNOHANG)
			goto end;
		retval = -ERESTARTSYS;
		if (signal_pending(current))
			goto end;
		schedule();
		goto repeat;
	}
	retval = -ECHILD;
end:
	current->state = TASK_RUNNING;
	remove_wait_queue(&current->signal->wait_chldexit,&wait);
	if (infop) {
		if (retval > 0)
			retval = 0;
		else {
			/*
			 * For a WNOHANG return, clear out all the fields
			 * we would set so the user can easily tell the
			 * difference.
			 */
			if (!retval)
				retval = put_user(0, &infop->si_signo);
			if (!retval)
				retval = put_user(0, &infop->si_errno);
			if (!retval)
				retval = put_user(0, &infop->si_code);
			if (!retval)
				retval = put_user(0, &infop->si_pid);
			if (!retval)
				retval = put_user(0, &infop->si_uid);
			if (!retval)
				retval = put_user(0, &infop->si_status);
		}
	}
	return retval;
}
可是,要是父进程不在wait4()中等待呢?那也不要紧。每当进程从系统调用、中断或异常返回时,都要检查一下是否有信号等待处理,如有的话就转入entry.S中的
signal_return处调用do_signal()。而do_signal()中有一个片段为(arch/i386/kernel/signal.c中):