copy_process情景分析(working in process,10月18号更新)


/*
 * This creates a new process as a copy of the old one,
 * but does not actually start it yet.
 *
 * It copies the registers, and all the appropriate
 * parts of the process environment (as per the clone
 * flags). The actual kick-off is left to the caller.
 */
static struct task_struct *copy_process(unsigned long clone_flags,
                    unsigned long stack_start,
                    unsigned long stack_size,
                    int __user *child_tidptr,
                    struct pid *pid,
                    int trace)
{
    int retval;
    struct task_struct *p;

    if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
        return ERR_PTR(-EINVAL);

+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

在2.4.19版本之前,系统中的所有进程都是共享一个mount namespace,在这种情况下,任何进程通过mount或者umount来改变mount namespace都会反应到其他的进程中。从2.4.19版本开始,linux提供了per-process的mount namespace机制(也就是说每个进程都是拥有自己私有的mount namespace)。通过fork创建的进程总是和父进程共享mount namespace的(当然子进程也可以调用unshare来解除共享)。当调用clone创建进程的时候,可以有更多的灵活性,可以通过CLONE_NEWNS这个flag可以不和父进程共享mount namespace(注意:子进程的这个private mount namespace仍然用父进程的mount namespace来初始化,只是之后,子进程和父进程的mount namespace就分道扬镳了,这时候,子进程的mount或者umount的动作将不会影响到父进程)。
CLONE_FS flag是用来控制父子进程是否共享文件系统信息(例如文件系统的root、当前工作目录等),如果设定了该flag,那么父子进程共享文件系统信息,如果不设定该flag,那么子进程则copy父进程的文件系统信息,之后,子进程调用chroot,chdir,umask来改变文件系统信息将不会影响到父进程。
在内核中,CLONE_NEWNS和CLONE_FS是排他的。一个进程的文件系统信息在内核中是用struct fs_struct来抽象,这个结构中就有mount namespace的信息,因此如果想共享文件系统信息,其前提条件就是要处于同一个mount namespace中。


++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++



    if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
        return ERR_PTR(-EINVAL);

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

CLONE_NEWUSER|CLONE_FS的组合会导致一个系统漏洞,可以让一个普通用户窃取到root的权限,具体可以参考下面的连接:

http://www.openwall.com/lists/oss-security/2013/03/13/10

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

    /*
     * Thread groups must share signals as well, and detached threads
     * can only be started up within the thread group.
     */
    if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
        return ERR_PTR(-EINVAL);

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

POSIX规定一个进程内部的多个thread要共享一个PID,但是,在linux kernel中,不论是进程还是线程,都是会分配一个task struct并且分配一个唯一的PID(这时候,PID其实就是thread ID)。这样,为了满足POSIX的线程规定,linux引入了线程组的概念,一个进程中的所有线程所共享的那个PID被称为线程组ID,也就是task struct中的tgid成员。因此,在linux kernel中,线程组ID(tgid,thread group id)就是传统意义的进程ID。对于sys_getpid系统调用,linux内核返回了tgid。对于sys_gettid系统调用,本意是要求返回线程ID,在linux内核中,返回了task struct的pid成员。一言以蔽之,POSIX的进程ID就是linux中的线程组ID。POSIX的线程ID也就是linux中的pid。

在了解了线程组ID和线程ID之后,我们来看一看CLONE_THREAD这个flag。这个flag被设定的话,则表示被创建的子进程与父进程在一个线程组中。否则会创建一个新的线程组。

如果设定CLONE_SIGHAND这个flag,则表示创建的子进程与父进程共享相同的信号处理(signal handler)表。线程组应该共享signal handler,因此,当设定了CLONE_THREAD后必须同时设定CLONE_SIGHAND

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++


    /*
     * Shared signal handlers imply shared VM. By way of the above,
     * thread groups also imply shared VM. Blocking this case allows
     * for various simplifications in other code.
     */
    if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
        return ERR_PTR(-EINVAL);

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

设定了CLONE_SIGHAND表示共享signal handler,前提条件就是要共享地址空间,否则,无法共享signal handler。因为如果不共享地址空间,即便是同样地址的handler,其物理地址都是不一样的。

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

    /*
     * Siblings of global init remain as zombies on exit since they are
     * not reaped by their parent (swapper). To solve this and to avoid
     * multi-rooted process trees, prevent global and container-inits
     * from creating siblings.
     */
    if ((clone_flags & CLONE_PARENT) &&
                current->signal->flags & SIGNAL_UNKILLABLE)
        return ERR_PTR(-EINVAL);

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Linux kernel会静态定义一个init task,该task的pid是0,被称作swapper(其实就是idle进程,在系统没有任何进程可调度的时候会执行该进程)。所有的其他的进程由此开始,例如最著名的init进程。所有用户空间的进程都是由init进程派生出来的。因此init进程要负责为所有用户空间的进程处理后事(否则会变成僵尸进程)。但是如果创建init进程的兄弟进程,那么该进程无法由init进程回收,其父亲swapper进程也不会收养用户空间创建的init的兄弟进程,这种情况下,这类进程退出都会变成zombie,因此要杜绝。

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

    /*
     * If the new process will be in a different pid namespace
     * don't allow the creation of threads.
     */
    if ((clone_flags & (CLONE_VM|CLONE_NEWPID)) &&
        (task_active_pid_ns(current) !=
         current->nsproxy->pid_ns_for_children))
        return ERR_PTR(-EINVAL);

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

在下面的调用序列的情况下:

unshare(CLONE_NEWPID)

clone(CLONE_THREAD|CLONE_SIGHAND|CLONE_VM)

kernel会存在问题,具体可以参考提交patch的说明,地址如下:

http://lists.linuxfoundation.org/pipermail/containers/2012-December/031433.html

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++


    retval = security_task_create(clone_flags);
    if (retval)
        goto fork_out;

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

这一段代码是和LinuxSecurity Modules相关的。LinuxSecurity Modules是一个安全框架,允许各种安全模型插入到内核。大家熟知的一个计算机安全模型就是selinux。具体这里就不再描述。如果本次操作通过了安全校验,那么后续的操作可以顺利进行

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++


    retval = -ENOMEM;
    p = dup_task_struct(current);
    if (!p)
        goto fork_out;

    ftrace_graph_init_task(p);
    get_seccomp_filter(p);

    rt_mutex_init_task(p);

#ifdef CONFIG_PROVE_LOCKING
    DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
    DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
    retval = -EAGAIN;
    if (atomic_read(&p->real_cred->user->processes) >=
            task_rlimit(p, RLIMIT_NPROC)) {
        if (p->real_cred->user != INIT_USER &&
            !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
            goto bad_fork_free;
    }
    current->flags &= ~PF_NPROC_EXCEEDED;

    retval = copy_creds(p, clone_flags);
    if (retval < 0)
        goto bad_fork_free;

    /*
     * If multiple threads are within copy_process(), then this check
     * triggers too late. This doesn't hurt, the check is only there
     * to stop root fork bombs.
     */
    retval = -EAGAIN;
    if (nr_threads >= max_threads)
        goto bad_fork_cleanup_count;

    if (!try_module_get(task_thread_info(p)->exec_domain->module))
        goto bad_fork_cleanup_count;

    p->did_exec = 0;
    delayacct_tsk_init(p);    /* Must remain after dup_task_struct() */
    copy_flags(clone_flags, p);
    INIT_LIST_HEAD(&p->children);
    INIT_LIST_HEAD(&p->sibling);
    rcu_copy_process(p);
    p->vfork_done = NULL;
    spin_lock_init(&p->alloc_lock);

    init_sigpending(&p->pending);

    p->utime = p->stime = p->gtime = 0;
    p->utimescaled = p->stimescaled = 0;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
    p->prev_cputime.utime = p->prev_cputime.stime = 0;
#endif
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
    seqlock_init(&p->vtime_seqlock);
    p->vtime_snap = 0;
    p->vtime_snap_whence = VTIME_SLEEPING;
#endif

#if defined(SPLIT_RSS_COUNTING)
    memset(&p->rss_stat, 0, sizeof(p->rss_stat));
#endif

    p->default_timer_slack_ns = current->timer_slack_ns;

    task_io_accounting_init(&p->ioac);
    acct_clear_integrals(p);

    posix_cpu_timers_init(p);

    do_posix_clock_monotonic_gettime(&p->start_time);
    p->real_start_time = p->start_time;
    monotonic_to_bootbased(&p->real_start_time);
    p->io_context = NULL;
    p->audit_context = NULL;
    if (clone_flags & CLONE_THREAD)
        threadgroup_change_begin(current);
    cgroup_fork(p);
#ifdef CONFIG_NUMA
    p->mempolicy = mpol_dup(p->mempolicy);
    if (IS_ERR(p->mempolicy)) {
        retval = PTR_ERR(p->mempolicy);
        p->mempolicy = NULL;
        goto bad_fork_cleanup_cgroup;
    }
    mpol_fix_fork_child_flag(p);
#endif
#ifdef CONFIG_CPUSETS
    p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
    p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
    seqcount_init(&p->mems_allowed_seq);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
    p->irq_events = 0;
    p->hardirqs_enabled = 0;
    p->hardirq_enable_ip = 0;
    p->hardirq_enable_event = 0;
    p->hardirq_disable_ip = _THIS_IP_;
    p->hardirq_disable_event = 0;
    p->softirqs_enabled = 1;
    p->softirq_enable_ip = _THIS_IP_;
    p->softirq_enable_event = 0;
    p->softirq_disable_ip = 0;
    p->softirq_disable_event = 0;
    p->hardirq_context = 0;
    p->softirq_context = 0;
#endif
#ifdef CONFIG_LOCKDEP
    p->lockdep_depth = 0; /* no locks held yet */
    p->curr_chain_key = 0;
    p->lockdep_recursion = 0;
#endif

#ifdef CONFIG_DEBUG_MUTEXES
    p->blocked_on = NULL; /* not blocked yet */
#endif
#ifdef CONFIG_MEMCG
    p->memcg_batch.do_batch = 0;
    p->memcg_batch.memcg = NULL;
#endif
#ifdef CONFIG_BCACHE
    p->sequential_io    = 0;
    p->sequential_io_avg    = 0;
#endif

    /* Perform scheduler related setup. Assign this task to a CPU. */
    sched_fork(p);

    retval = perf_event_init_task(p);
    if (retval)
        goto bad_fork_cleanup_policy;
    retval = audit_alloc(p);
    if (retval)
        goto bad_fork_cleanup_policy;
    /* copy all the process information */
    retval = copy_semundo(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_audit;
    retval = copy_files(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_semundo;
    retval = copy_fs(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_files;
    retval = copy_sighand(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_fs;
    retval = copy_signal(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_sighand;
    retval = copy_mm(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_signal;
    retval = copy_namespaces(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_mm;
    retval = copy_io(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_namespaces;
    retval = copy_thread(clone_flags, stack_start, stack_size, p);
    if (retval)
        goto bad_fork_cleanup_io;

    if (pid != &init_struct_pid) {
        retval = -ENOMEM;
        pid = alloc_pid(p->nsproxy->pid_ns_for_children);
        if (!pid)
            goto bad_fork_cleanup_io;
    }

    p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
    /*
     * Clear TID on mm_release()?
     */
    p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
#ifdef CONFIG_BLOCK
    p->plug = NULL;
#endif
#ifdef CONFIG_FUTEX
    p->robust_list = NULL;
#ifdef CONFIG_COMPAT
    p->compat_robust_list = NULL;
#endif
    INIT_LIST_HEAD(&p->pi_state_list);
    p->pi_state_cache = NULL;
#endif
    uprobe_copy_process(p);
    /*
     * sigaltstack should be cleared when sharing the same VM
     */
    if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
        p->sas_ss_sp = p->sas_ss_size = 0;

    /*
     * Syscall tracing and stepping should be turned off in the
     * child regardless of CLONE_PTRACE.
     */
    user_disable_single_step(p);
    clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
#ifdef TIF_SYSCALL_EMU
    clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
#endif
    clear_all_latency_tracing(p);

    /* ok, now we should be set up.. */
    p->pid = pid_nr(pid);
    if (clone_flags & CLONE_THREAD) {
        p->exit_signal = -1;
        p->group_leader = current->group_leader;
        p->tgid = current->tgid;
    } else {
        if (clone_flags & CLONE_PARENT)
            p->exit_signal = current->group_leader->exit_signal;
        else
            p->exit_signal = (clone_flags & CSIGNAL);
        p->group_leader = p;
        p->tgid = p->pid;
    }

    p->pdeath_signal = 0;
    p->exit_state = 0;

    p->nr_dirtied = 0;
    p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
    p->dirty_paused_when = 0;

    INIT_LIST_HEAD(&p->thread_group);
    p->task_works = NULL;

    /*
     * Make it visible to the rest of the system, but dont wake it up yet.
     * Need tasklist lock for parent etc handling!
     */
    write_lock_irq(&tasklist_lock);

    /* CLONE_PARENT re-uses the old parent */
    if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
        p->real_parent = current->real_parent;
        p->parent_exec_id = current->parent_exec_id;
    } else {
        p->real_parent = current;
        p->parent_exec_id = current->self_exec_id;
    }

    spin_lock(&current->sighand->siglock);

    /*
     * Process group and session signals need to be delivered to just the
     * parent before the fork or both the parent and the child after the
     * fork. Restart if a signal comes in before we add the new process to
     * it's process group.
     * A fatal signal pending means that current will exit, so the new
     * thread can't slip out of an OOM kill (or normal SIGKILL).
    */
    recalc_sigpending();
    if (signal_pending(current)) {
        spin_unlock(&current->sighand->siglock);
        write_unlock_irq(&tasklist_lock);
        retval = -ERESTARTNOINTR;
        goto bad_fork_free_pid;
    }

    if (likely(p->pid)) {
        ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);

        init_task_pid(p, PIDTYPE_PID, pid);
        if (thread_group_leader(p)) {
            init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
            init_task_pid(p, PIDTYPE_SID, task_session(current));

            if (is_child_reaper(pid)) {
                ns_of_pid(pid)->child_reaper = p;
                p->signal->flags |= SIGNAL_UNKILLABLE;
            }

            p->signal->leader_pid = pid;
            p->signal->tty = tty_kref_get(current->signal->tty);
            list_add_tail(&p->sibling, &p->real_parent->children);
            list_add_tail_rcu(&p->tasks, &init_task.tasks);
            attach_pid(p, PIDTYPE_PGID);
            attach_pid(p, PIDTYPE_SID);
            __this_cpu_inc(process_counts);
        } else {
            current->signal->nr_threads++;
            atomic_inc(&current->signal->live);
            atomic_inc(&current->signal->sigcnt);
            list_add_tail_rcu(&p->thread_group,
                      &p->group_leader->thread_group);
        }
        attach_pid(p, PIDTYPE_PID);
        nr_threads++;
    }

    total_forks++;
    spin_unlock(&current->sighand->siglock);
    write_unlock_irq(&tasklist_lock);
    proc_fork_connector(p);
    cgroup_post_fork(p);
    if (clone_flags & CLONE_THREAD)
        threadgroup_change_end(current);
    perf_event_fork(p);

    trace_task_newtask(p, clone_flags);

    return p;

bad_fork_free_pid:
    if (pid != &init_struct_pid)
        free_pid(pid);
bad_fork_cleanup_io:
    if (p->io_context)
        exit_io_context(p);
bad_fork_cleanup_namespaces:
    exit_task_namespaces(p);
bad_fork_cleanup_mm:
    if (p->mm)
        mmput(p->mm);
bad_fork_cleanup_signal:
    if (!(clone_flags & CLONE_THREAD))
        free_signal_struct(p->signal);
bad_fork_cleanup_sighand:
    __cleanup_sighand(p->sighand);
bad_fork_cleanup_fs:
    exit_fs(p); /* blocking */
bad_fork_cleanup_files:
    exit_files(p); /* blocking */
bad_fork_cleanup_semundo:
    exit_sem(p);
bad_fork_cleanup_audit:
    audit_free(p);
bad_fork_cleanup_policy:
    perf_event_free_task(p);
#ifdef CONFIG_NUMA
    mpol_put(p->mempolicy);
bad_fork_cleanup_cgroup:
#endif
    if (clone_flags & CLONE_THREAD)
        threadgroup_change_end(current);
    cgroup_exit(p, 0);
    delayacct_tsk_free(p);
    module_put(task_thread_info(p)->exec_domain->module);
bad_fork_cleanup_count:
    atomic_dec(&p->cred->user->processes);
    exit_creds(p);
bad_fork_free:
    free_task(p);
fork_out:
    return ERR_PTR(retval);
}

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值