/*
* SMP负载均衡:http://linux.chinaitlab.com/kernel/888181.html
* 命名空间:http://prettyinsight.iteye.com/blog/901468
* do_fork():http://edsionte.com/techblog/archives/2131
* CLONE_VFORK: http://book.51cto.com/art/201005/200889.htm
*
*
*
*/
/*
 * Ok, this is the main fork-routine.
 *
 * It copies the process, and if successful kick-starts
 * it and waits for it to finish using the VM if required.
 *
 * According to the original annotation, the service routines of the fork,
 * vfork and clone system calls all end up calling do_fork().
 *
 * @clone_flags:   The lowest byte (CSIGNAL) is the signal sent to the parent
 *                 when the child exits, usually SIGCHLD.  The remaining bits
 *                 are a combination of CLONE_* flags.  Commonly used ones,
 *                 as listed by the original annotation:
 *                   CLONE_VM      share the address space
 *                   CLONE_FS      share filesystem information
 *                   CLONE_FILES   share the open files
 *                   CLONE_PTRACE  keep tracing the child
 *                   CLONE_VFORK   vfork() semantics: the parent sleeps until
 *                                 the child wakes it up
 *                   CLONE_SIGHAND share the signal handler table; requires
 *                                 CLONE_VM
 *                   CLONE_THREAD  put the child in the parent's thread
 *                                 group; requires CLONE_SIGHAND
 * @stack_start:   address of the child's user-mode stack
 * @regs:          saved user-mode registers (struct pt_regs) of the caller;
 *                 handed through to copy_process()
 * @stack_size:    handed through to copy_process(); the original annotation
 *                 says it is unused and normally 0
 * @parent_tidptr: user-space address that receives the child's pid when
 *                 CLONE_PARENT_SETTID is set
 * @child_tidptr:  user-space address handed to copy_process(); only
 *                 meaningful with CLONE_CHILD_SETTID / CLONE_CHILD_CLEARTID
 *
 * Returns the pid of the child as reported by task_pid_vnr() (not the task
 * descriptor), or the negative error code carried by copy_process()'s
 * ERR_PTR() result.
 */
long do_fork(unsigned long clone_flags,
	     unsigned long stack_start,
	     struct pt_regs *regs,
	     unsigned long stack_size,
	     int __user *parent_tidptr,
	     int __user *child_tidptr)
{
	/* Descriptor of the freshly created child. */
	struct task_struct *child;
	/* Completion the parent sleeps on when CLONE_VFORK is requested. */
	struct completion vfork_wait;
	const int is_vfork = !!(clone_flags & CLONE_VFORK);
	int trace_event = 0;
	long child_nr;

	/*
	 * We hope to recycle these flags after 2.6.26
	 * (CLONE_STOPPED is deprecated; warn a limited number of times).
	 */
	if (unlikely(clone_flags & CLONE_STOPPED)) {
		static int __read_mostly warn_budget = 100;

		if (warn_budget > 0 && printk_ratelimit()) {
			char comm[TASK_COMM_LEN];

			warn_budget--;
			printk(KERN_INFO "fork(): process `%s' used deprecated "
			       "clone flags 0x%lx\n",
			       get_task_comm(comm, current),
			       clone_flags & CLONE_STOPPED);
		}
	}

	/*
	 * A non-zero current->ptrace means the forking task is being traced
	 * (e.g. by a debugger).  In that case fork_traceflag() decides
	 * whether the child has to be traced as well.
	 */
	if (unlikely(current->ptrace)) {
		trace_event = fork_traceflag(clone_flags);
		if (trace_event)
			clone_flags |= CLONE_PTRACE;
	}

	/*
	 * copy_process() builds the child's descriptor and the other data
	 * structures it needs, and returns the descriptor (or an ERR_PTR).
	 */
	child = copy_process(clone_flags, stack_start, regs, stack_size,
			     child_tidptr, NULL);
	if (IS_ERR(child))
		return PTR_ERR(child);

	/*
	 * Do this prior waking up the new thread - the thread pointer
	 * might get invalid after that point, if the thread exits quickly.
	 */
	child_nr = task_pid_vnr(child);

	if (clone_flags & CLONE_PARENT_SETTID)
		put_user(child_nr, parent_tidptr);

	/*
	 * vfork: hook a completion into the child before it can run.  The
	 * parent waits on it further down; per the original annotation the
	 * kernel completes it when the child exits or execs a new program,
	 * which keeps parent and child from touching the shared address
	 * space at the same time.
	 */
	if (is_vfork) {
		child->vfork_done = &vfork_wait;
		init_completion(&vfork_wait);
	}

	/*
	 * If the child is ptraced or CLONE_STOPPED was requested,
	 * we'll start up with an immediate SIGSTOP.
	 */
	if ((child->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) {
		sigaddset(&child->pending.signal, SIGSTOP);
		set_tsk_thread_flag(child, TIF_SIGPENDING);
	}

	/* With CLONE_STOPPED the child is left stopped instead of woken. */
	if (clone_flags & CLONE_STOPPED)
		__set_task_state(child, TASK_STOPPED);
	else
		wake_up_new_task(child, clone_flags);

	/* Report the fork event to the tracer if one was selected above. */
	if (unlikely(trace_event)) {
		current->ptrace_message = child_nr;
		ptrace_notify((trace_event << 8) | SIGTRAP);
	}

	if (is_vfork) {
		freezer_do_not_count();
		/* The parent sleeps here until the completion is signalled. */
		wait_for_completion(&vfork_wait);
		freezer_count();
		if (unlikely(current->ptrace & PT_TRACE_VFORK_DONE)) {
			current->ptrace_message = child_nr;
			ptrace_notify((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
		}
	}

	/* The child's pid, not its task descriptor. */
	return child_nr;
}
/*
 * This is the main, per-CPU runqueue data structure.
 *
 * Locking rule: those places that want to lock multiple runqueues
 * (such as the load balancing or the thread migration code), lock
 * acquire operations must be ordered by ascending &runqueue.
 */
/*
 * Background, translated from the original annotation.  It describes
 * scheduler behaviour that is not visible in this structure itself:
 *
 * On SMP every CPU has its own run queue (rq).  A task in TASK_RUNNING state
 * is placed on exactly one run queue so that the scheduler can run it on the
 * CPU owning that queue.  To keep the queues from becoming unbalanced, Linux
 * has a load balancing (LB) mechanism: at certain points tasks are migrated
 * from one run queue to another so that the load stays even across CPUs.
 * Load balancing is handled separately for real-time and for normal tasks.
 *
 * Real-time priorities are strict: when the state of one of the cpu_nr
 * highest-priority real-time tasks changes, the kernel must react at once.
 * Only if there are fewer than cpu_nr real-time tasks do the remaining CPUs
 * go to normal tasks.  The migration of real-time tasks is done through
 * pull_rt_task/push_rt_task.
 *
 * Normal tasks do not need strict priority ordering and tolerate some
 * imbalance, so their balancing need not happen immediately on every state
 * change and is done asynchronously instead.  The higher a task's priority,
 * the higher its weight and the more CPU time it gets.  In CFS (the
 * completely fair scheduler) this weight is called "load", and the job of
 * load balancing is to give every normal task CPU time according to its load.
 */
struct rq {
/* runqueue lock: */
spinlock_t lock;
/*
 * nr_running and cpu_load should be in the same cacheline because
 * remote CPUs use both these fields when doing load calculation.
 */
unsigned long nr_running;
/* Number of entries in cpu_load[]. */
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
unsigned char idle_at_tick;
#ifdef CONFIG_NO_HZ
unsigned char in_nohz_recently;
#endif
/* capture load from *all* tasks on this cpu: */
struct load_weight load;
unsigned long nr_load_updates;
u64 nr_switches;
/* Embedded per-class run queues (types cfs_rq and rt_rq). */
struct cfs_rq cfs;
struct rt_rq rt;
u64 rt_period_expire;
int rt_throttled;
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this cpu: */
struct list_head leaf_cfs_rq_list;
#endif
#ifdef CONFIG_RT_GROUP_SCHED
/* NOTE(review): presumably the rt_rq counterpart of leaf_cfs_rq_list - confirm */
struct list_head leaf_rt_rq_list;
#endif
/*
 * This is part of a global counter where only the total sum
 * over all CPUs matters. A task can increase this counter on
 * one CPU and if it got migrated afterwards it may decrease
 * it on another CPU. Always updated under the runqueue lock:
 */
unsigned long nr_uninterruptible;
struct task_struct *curr, *idle;
unsigned long next_balance;
struct mm_struct *prev_mm;
/* Clock bookkeeping for this runqueue. */
u64 clock, prev_clock_raw;
s64 clock_max_delta;
unsigned int clock_warps, clock_overflows, clock_underflows;
u64 idle_clock;
unsigned int clock_deep_idle_events;
u64 tick_timestamp;
atomic_t nr_iowait;
#ifdef CONFIG_SMP
struct root_domain *rd;
struct sched_domain *sd;
/* For active balancing */
int active_balance;
int push_cpu;
/* cpu of this runqueue: */
int cpu;
struct task_struct *migration_thread;
struct list_head migration_queue;
#endif
#ifdef CONFIG_SCHED_HRTICK
unsigned long hrtick_flags;
ktime_t hrtick_expire;
struct hrtimer hrtick_timer;
#endif
#ifdef CONFIG_SCHEDSTATS
/* latency stats */
struct sched_info rq_sched_info;
/* sys_sched_yield() stats */
unsigned int yld_exp_empty;
unsigned int yld_act_empty;
unsigned int yld_both_empty;
unsigned int yld_count;
/* schedule() stats */
unsigned int sched_switch;
unsigned int sched_count;
unsigned int sched_goidle;
/* try_to_wake_up() stats */
unsigned int ttwu_count;
unsigned int ttwu_local;
/* BKL stats */
unsigned int bkl_count;
#endif
struct lock_class_key rq_lock_key;
};
/*
 * This creates a new process as a copy of the old one,
 * but does not actually start it yet.
 *
 * It copies the registers, and all the appropriate
 * parts of the process environment (as per the clone
 * flags). The actual kick-off is left to the caller.
 *
 * This function runs in the context of the parent (current); the
 * task_struct 'p' used throughout is the descriptor of the child that is
 * being built.
 *
 * @clone_flags:  CLONE_* flags plus the exit signal in the CSIGNAL bits
 * @stack_start:  handed to copy_thread()
 * @regs:         handed to copy_thread()
 * @stack_size:   handed to copy_thread()
 * @child_tidptr: stored in set_child_tid / clear_child_tid when
 *                CLONE_CHILD_SETTID / CLONE_CHILD_CLEARTID are set
 * @pid:          struct pid to use; unless it is &init_struct_pid a new one
 *                is allocated with alloc_pid()
 *
 * Returns the new task_struct, or an ERR_PTR(): -EINVAL for an invalid flag
 * combination, -ENOMEM when the task or pid cannot be allocated, -EAGAIN when
 * a process/thread limit is hit or a module reference cannot be taken,
 * -ERESTARTNOINTR when a signal is pending for the parent, or the error
 * returned by one of the security/audit/copy_*() helpers.
 */
static struct task_struct *copy_process(unsigned long clone_flags,
					unsigned long stack_start,
					struct pt_regs *regs,
					unsigned long stack_size,
					int __user *child_tidptr,
					struct pid *pid)
{
	int retval;
	struct task_struct *p;
	int cgroup_callbacks_done = 0;

	/* A new mount namespace cannot be combined with shared fs info. */
	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
		return ERR_PTR(-EINVAL);

	/*
	 * Thread groups must share signals as well, and detached threads
	 * can only be started up within the thread group.
	 */
	if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
		return ERR_PTR(-EINVAL);

	/*
	 * Shared signal handlers imply shared VM. By way of the above,
	 * thread groups also imply shared VM. Blocking this case allows
	 * for various simplifications in other code.
	 */
	if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
		return ERR_PTR(-EINVAL);

	retval = security_task_create(clone_flags);
	if (retval)
		goto fork_out;

	retval = -ENOMEM;
	/*
	 * Duplicate the parent's task descriptor.  Per the original
	 * annotation (dup_task_struct() is not shown here) this allocates a
	 * new task_struct and kernel stack/thread_info, copies the parent's
	 * contents into them and points the new thread_info at the new
	 * task_struct, so at this point the child is still essentially
	 * identical to the parent.
	 */
	p = dup_task_struct(current);
	if (!p)
		goto fork_out;

	/* Initialise the rt-mutex state of the new task. */
	rt_mutex_init_task(p);

#ifdef CONFIG_TRACE_IRQFLAGS
	DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
	DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
	retval = -EAGAIN;
	/*
	 * Enforce RLIMIT_NPROC for the owning user unless the caller has
	 * CAP_SYS_ADMIN or CAP_SYS_RESOURCE, or the user is the root user of
	 * the current user namespace.
	 */
	if (atomic_read(&p->user->processes) >=
			p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
		if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
		    p->user != current->nsproxy->user_ns->root_user)
			goto bad_fork_free;
	}

	atomic_inc(&p->user->__count);
	atomic_inc(&p->user->processes);
	get_group_info(p->group_info);

	/*
	 * If multiple threads are within copy_process(), then this check
	 * triggers too late. This doesn't hurt, the check is only there
	 * to stop root fork bombs.
	 */
	if (nr_threads >= max_threads)
		goto bad_fork_cleanup_count;

	if (!try_module_get(task_thread_info(p)->exec_domain->module))
		goto bad_fork_cleanup_count;

	if (p->binfmt && !try_module_get(p->binfmt->module))
		goto bad_fork_cleanup_put_domain;

	p->did_exec = 0;
	delayacct_tsk_init(p);	/* Must remain after dup_task_struct() */
	/* Derive the child's flags from clone_flags. */
	copy_flags(clone_flags, p);
	INIT_LIST_HEAD(&p->children);
	INIT_LIST_HEAD(&p->sibling);
#ifdef CONFIG_PREEMPT_RCU
	p->rcu_read_lock_nesting = 0;
	p->rcu_flipctr_idx = 0;
#endif /* #ifdef CONFIG_PREEMPT_RCU */
	p->vfork_done = NULL;
	spin_lock_init(&p->alloc_lock);

	/* The child starts without TIF_SIGPENDING ... */
	clear_tsk_thread_flag(p, TIF_SIGPENDING);
	/* ... and with an empty set of pending signals. */
	init_sigpending(&p->pending);

	/* Reset the CPU time accounting of the child. */
	p->utime = cputime_zero;
	p->stime = cputime_zero;
	p->gtime = cputime_zero;
	p->utimescaled = cputime_zero;
	p->stimescaled = cputime_zero;
	p->prev_utime = cputime_zero;
	p->prev_stime = cputime_zero;

#ifdef CONFIG_DETECT_SOFTLOCKUP
	p->last_switch_count = 0;
	p->last_switch_timestamp = 0;
#endif

#ifdef CONFIG_TASK_XACCT
	p->rchar = 0;		/* I/O counter: bytes read */
	p->wchar = 0;		/* I/O counter: bytes written */
	p->syscr = 0;		/* I/O counter: read syscalls */
	p->syscw = 0;		/* I/O counter: write syscalls */
#endif
	/* Clear/initialise the io and mm accounting. */
	task_io_accounting_init(p);
	acct_clear_integrals(p);

	p->it_virt_expires = cputime_zero;
	p->it_prof_expires = cputime_zero;
	p->it_sched_expires = 0;
	/* Start with empty per-task CPU timer lists. */
	INIT_LIST_HEAD(&p->cpu_timers[0]);
	INIT_LIST_HEAD(&p->cpu_timers[1]);
	INIT_LIST_HEAD(&p->cpu_timers[2]);

	p->lock_depth = -1;		/* -1 = no lock */
	/* Record the start time (monotonic clock) ... */
	do_posix_clock_monotonic_gettime(&p->start_time);
	p->real_start_time = p->start_time;
	/* ... and convert the copy in real_start_time to boot-based time. */
	monotonic_to_bootbased(&p->real_start_time);
#ifdef CONFIG_SECURITY
	p->security = NULL;
#endif
	p->cap_bset = current->cap_bset;
	p->io_context = NULL;
	p->audit_context = NULL;
	cgroup_fork(p);
#ifdef CONFIG_NUMA
	p->mempolicy = mpol_copy(p->mempolicy);
	if (IS_ERR(p->mempolicy)) {
		retval = PTR_ERR(p->mempolicy);
		p->mempolicy = NULL;
		goto bad_fork_cleanup_cgroup;
	}
	mpol_fix_fork_child_flag(p);
#endif
	/* Initialise the irq-flag tracing state. */
#ifdef CONFIG_TRACE_IRQFLAGS
	p->irq_events = 0;
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
	p->hardirqs_enabled = 1;
#else
	p->hardirqs_enabled = 0;
#endif
	p->hardirq_enable_ip = 0;
	p->hardirq_enable_event = 0;
	p->hardirq_disable_ip = _THIS_IP_;
	p->hardirq_disable_event = 0;
	p->softirqs_enabled = 1;
	p->softirq_enable_ip = _THIS_IP_;
	p->softirq_enable_event = 0;
	p->softirq_disable_ip = 0;
	p->softirq_disable_event = 0;
	p->hardirq_context = 0;
	p->softirq_context = 0;
#endif
#ifdef CONFIG_LOCKDEP
	p->lockdep_depth = 0; /* no locks held yet */
	p->curr_chain_key = 0;
	p->lockdep_recursion = 0;
#endif

#ifdef CONFIG_DEBUG_MUTEXES
	p->blocked_on = NULL; /* not blocked yet */
#endif

	/* Perform scheduler related setup. Assign this task to a CPU. */
	/*
	 * Per the original annotation (sched_fork() is not shown here): on a
	 * uniprocessor system this initialises the scheduling-related fields
	 * of the new task, sets up its data structures and determines its
	 * dynamic priority, using the parent's normal priority.
	 */
	sched_fork(p, clone_flags);

	if ((retval = security_task_alloc(p)))
		goto bad_fork_cleanup_policy;
	if ((retval = audit_alloc(p)))
		goto bad_fork_cleanup_security;
	/* copy all the process information */
	if ((retval = copy_semundo(clone_flags, p)))
		goto bad_fork_cleanup_audit;
	if ((retval = copy_files(clone_flags, p)))
		goto bad_fork_cleanup_semundo;
	if ((retval = copy_fs(clone_flags, p)))
		goto bad_fork_cleanup_files;
	if ((retval = copy_sighand(clone_flags, p)))
		goto bad_fork_cleanup_fs;
	if ((retval = copy_signal(clone_flags, p)))
		goto bad_fork_cleanup_sighand;
	if ((retval = copy_mm(clone_flags, p)))
		goto bad_fork_cleanup_signal;
	if ((retval = copy_keys(clone_flags, p)))
		goto bad_fork_cleanup_mm;
	if ((retval = copy_namespaces(clone_flags, p)))
		goto bad_fork_cleanup_keys;
	if ((retval = copy_io(clone_flags, p)))
		goto bad_fork_cleanup_namespaces;
	retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
	if (retval)
		goto bad_fork_cleanup_io;

	if (pid != &init_struct_pid) {
		retval = -ENOMEM;
		/* Allocate a pid in the child's active pid namespace. */
		pid = alloc_pid(task_active_pid_ns(p));
		if (!pid)
			goto bad_fork_cleanup_io;

		/* A new pid namespace was requested: prepare its proc side. */
		if (clone_flags & CLONE_NEWPID) {
			retval = pid_ns_prepare_proc(task_active_pid_ns(p));
			if (retval < 0)
				goto bad_fork_free_pid;
		}
	}

	/* Give the child its own pid; tgid defaults to the same value. */
	p->pid = pid_nr(pid);
	p->tgid = p->pid;
	/* A thread joins the thread group of its creator. */
	if (clone_flags & CLONE_THREAD)
		p->tgid = current->tgid;

	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
	/*
	 * Clear TID on mm_release()?
	 */
	p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
#ifdef CONFIG_FUTEX
	p->robust_list = NULL;
#ifdef CONFIG_COMPAT
	p->compat_robust_list = NULL;
#endif
	INIT_LIST_HEAD(&p->pi_state_list);
	p->pi_state_cache = NULL;
#endif
	/*
	 * sigaltstack should be cleared when sharing the same VM
	 */
	if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
		p->sas_ss_sp = p->sas_ss_size = 0;

	/*
	 * Syscall tracing should be turned off in the child regardless
	 * of CLONE_PTRACE.
	 */
	clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
#ifdef TIF_SYSCALL_EMU
	clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
#endif
	clear_all_latency_tracing(p);

	/*
	 * Our parent execution domain becomes current domain
	 * These must match for thread signalling to apply
	 */
	p->parent_exec_id = p->self_exec_id;

	/* ok, now we should be set up.. */
	/* Threads get no exit signal; otherwise it comes from CSIGNAL. */
	p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
	p->pdeath_signal = 0;
	p->exit_state = 0;

	/*
	 * Ok, make it visible to the rest of the system.
	 * We dont wake it up yet.
	 */
	p->group_leader = p;
	INIT_LIST_HEAD(&p->thread_group);
	INIT_LIST_HEAD(&p->ptrace_children);
	INIT_LIST_HEAD(&p->ptrace_list);

	/* Now that the task is set up, run cgroup callbacks if
	 * necessary. We need to run them before the task is visible
	 * on the tasklist. */
	cgroup_fork_callbacks(p);
	cgroup_callbacks_done = 1;

	/* Need tasklist lock for parent etc handling! */
	write_lock_irq(&tasklist_lock);

	/*
	 * The task hasn't been attached yet, so its cpus_allowed mask will
	 * not be changed, nor will its assigned CPU.
	 *
	 * The cpus_allowed mask of the parent may have changed after it was
	 * copied first time - so re-copy it here, then check the child's CPU
	 * to ensure it is on a valid CPU (and if not, just force it back to
	 * parent's CPU). This avoids alot of nasty races.
	 */
	p->cpus_allowed = current->cpus_allowed;
	p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed;
	if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
			!cpu_online(task_cpu(p))))
		set_task_cpu(p, smp_processor_id());

	/* CLONE_PARENT re-uses the old parent */
	if (clone_flags & (CLONE_PARENT|CLONE_THREAD))
		p->real_parent = current->real_parent;
	else
		p->real_parent = current;
	p->parent = p->real_parent;

	spin_lock(&current->sighand->siglock);

	/*
	 * Process group and session signals need to be delivered to just the
	 * parent before the fork or both the parent and the child after the
	 * fork. Restart if a signal comes in before we add the new process to
	 * it's process group.
	 * A fatal signal pending means that current will exit, so the new
	 * thread can't slip out of an OOM kill (or normal SIGKILL).
	 */
	recalc_sigpending();
	if (signal_pending(current)) {
		spin_unlock(&current->sighand->siglock);
		write_unlock_irq(&tasklist_lock);
		retval = -ERESTARTNOINTR;
		goto bad_fork_free_pid;
	}

	if (clone_flags & CLONE_THREAD) {
		p->group_leader = current->group_leader;
		list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);

		if (!cputime_eq(current->signal->it_virt_expires,
				cputime_zero) ||
		    !cputime_eq(current->signal->it_prof_expires,
				cputime_zero) ||
		    current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY ||
		    !list_empty(&current->signal->cpu_timers[0]) ||
		    !list_empty(&current->signal->cpu_timers[1]) ||
		    !list_empty(&current->signal->cpu_timers[2])) {
			/*
			 * Have child wake up on its first tick to check
			 * for process CPU timers.
			 */
			p->it_prof_expires = jiffies_to_cputime(1);
		}
	}

	if (likely(p->pid)) {
		add_parent(p);
		if (unlikely(p->ptrace & PT_PTRACED))
			__ptrace_link(p, current->parent);

		if (thread_group_leader(p)) {
			/* The first task of a new pid namespace becomes its reaper. */
			if (clone_flags & CLONE_NEWPID)
				p->nsproxy->pid_ns->child_reaper = p;

			p->signal->leader_pid = pid;
			p->signal->tty = current->signal->tty;
			set_task_pgrp(p, task_pgrp_nr(current));
			set_task_session(p, task_session_nr(current));
			attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
			attach_pid(p, PIDTYPE_SID, task_session(current));
			list_add_tail_rcu(&p->tasks, &init_task.tasks);
			__get_cpu_var(process_counts)++;
		}
		attach_pid(p, PIDTYPE_PID, pid);
		nr_threads++;
	}

	total_forks++;
	spin_unlock(&current->sighand->siglock);
	write_unlock_irq(&tasklist_lock);
	proc_fork_connector(p);
	cgroup_post_fork(p);
	return p;

	/*
	 * Error unwinding: each label undoes the step that succeeded just
	 * before the corresponding failure point and falls through to the
	 * labels for the earlier steps.
	 */
bad_fork_free_pid:
	if (pid != &init_struct_pid)
		free_pid(pid);
bad_fork_cleanup_io:
	put_io_context(p->io_context);
bad_fork_cleanup_namespaces:
	exit_task_namespaces(p);
bad_fork_cleanup_keys:
	exit_keys(p);
bad_fork_cleanup_mm:
	if (p->mm)
		mmput(p->mm);
bad_fork_cleanup_signal:
	cleanup_signal(p);
bad_fork_cleanup_sighand:
	__cleanup_sighand(p->sighand);
bad_fork_cleanup_fs:
	exit_fs(p); /* blocking */
bad_fork_cleanup_files:
	exit_files(p); /* blocking */
bad_fork_cleanup_semundo:
	exit_sem(p);
bad_fork_cleanup_audit:
	audit_free(p);
bad_fork_cleanup_security:
	security_task_free(p);
bad_fork_cleanup_policy:
#ifdef CONFIG_NUMA
	mpol_free(p->mempolicy);
bad_fork_cleanup_cgroup:
#endif
	cgroup_exit(p, cgroup_callbacks_done);
	delayacct_tsk_free(p);
	if (p->binfmt)
		module_put(p->binfmt->module);
bad_fork_cleanup_put_domain:
	module_put(task_thread_info(p)->exec_domain->module);
bad_fork_cleanup_count:
	put_group_info(p->group_info);
	atomic_dec(&p->user->processes);
	free_uid(p->user);
bad_fork_free:
	free_task(p);
fork_out:
	return ERR_PTR(retval);
}
/*
 * Background, translated from the original annotation (general commentary
 * on namespaces, not derived from this structure):
 *
 * Namespaces in the Linux kernel provide a lightweight virtualisation
 * solution at the system-call level.  Compared with traditional virtual
 * machines (vmware/qemu/kvm/hurd), namespace-based lightweight
 * virtualisation is easy to use and manage, needs no hardware virtualisation
 * support and is cheap.
 * A namespace is also called a container and is similar in concept to a C++
 * namespace.  In Linux, system resources such as processes, user accounts,
 * file systems and the network belong to some namespace.  The resources of
 * one namespace are invisible ("transparent") to the resources of every
 * other namespace.
 * Seen from the OS level there can therefore be several processes with the
 * same pid and several user accounts with the same uid, whereas at user
 * level only the resources of one's own namespace are visible; ps, for
 * example, only lists the processes of its own namespace.  Every namespace
 * thus looks like a separate Linux system.
 * The role of namespaces is illustrated below:
 * -----------------------------------------------
 * | process0 | process1 || process0 | process1 |
 * |----------------------||---------------------|
 * | namespace0 || namespace1 |
 * -----------------------------------------------
 * | Linux Kernel |
 * -----------------------------------------------
 * | Hardware |
 * -----------------------------------------------
 * Namespaces are also hierarchical: the pids of a child namespace are mapped
 * into its parent namespace.
 */
struct pid_namespace {
/* Reference count (struct kref). */
struct kref kref;
/* NOTE(review): presumably the pid allocation bitmap pages - confirm */
struct pidmap pidmap[PIDMAP_ENTRIES];
int last_pid;
/* copy_process() sets this to the group leader created with CLONE_NEWPID. */
struct task_struct *child_reaper;
struct kmem_cache *pid_cachep;
/* NOTE(review): presumably the nesting depth of this namespace - confirm */
int level;
/* Enclosing (parent) pid namespace. */
struct pid_namespace *parent;
#ifdef CONFIG_PROC_FS
struct vfsmount *proc_mnt;
#endif
};
/*
 * wake_up_new_task - wake up a newly created task for the first time.
 *
 * This function will do some initial scheduler statistics housekeeping
 * that must be done for every newly created context, then puts the task
 * on the runqueue and wakes it.
 *
 * @p:           the new task; it must be in TASK_RUNNING state
 * @clone_flags: clone flags of the fork; not used in this function
 */
void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
{
	unsigned long irq_state;
	/* Lock the runqueue that p is assigned to. */
	struct rq *runq = task_rq_lock(p, &irq_state);

	BUG_ON(p->state != TASK_RUNNING);

	/* Bring the runqueue clock up to date. */
	update_rq_clock(runq);

	/* Compute the priority the task currently runs with. */
	p->prio = effective_prio(p);

	if (p->sched_class->task_new && current->se.on_rq) {
		/*
		 * Let the scheduling class do new task startup
		 * management (if any):
		 */
		p->sched_class->task_new(runq, p);
		inc_nr_running(p, runq);
	} else {
		/*
		 * No class hook, or the current task is not on a runqueue:
		 * put the task on the runqueue directly.
		 */
		activate_task(runq, p, 0);
	}

	check_preempt_curr(runq, p);
#ifdef CONFIG_SMP
	if (p->sched_class->task_wake_up)
		p->sched_class->task_wake_up(runq, p);
#endif
	/* Release the runqueue lock. */
	task_rq_unlock(runq, &irq_state);
}
/*
 * Linux内核进程管理-do_fork()执行过程分析
 * (Linux kernel process management: a walk-through of how do_fork() executes)
 * 最新推荐文章于 2023-10-07 14:58:18 发布
 */