进程相关杂记

这个我好像学过

已于 2023-02-04 22:59:42 修改

阅读量222

点赞数

文章标签： linux Powered by 金山文档

于 2023-01-15 22:56:44 首次发布

本文链接：https://blog.csdn.net/qq_42693685/article/details/128571420

版权

进程地址空间

进程所有的地址空间使用mm_struct进行管理，它描述了进程地址空间的所有信息。每个进程地址空间包含了许多使用中的页面对齐的内存区域。它们不会相互重叠。这些区域则使用struct vm_area_struct进行管理。这些区域可能是malloc所申请的堆空间，也可能是一个内存映射文件，又或者是一个mmap分配的匿名内存区域。一个进程只有一个mm_struct，它是对整个用户空间的描述。一个进程可以包含多个vm_area_struct.

struct mm_struct {
.................
    unsigned long mmap_base;        /* base of mmap area */
..............
    unsigned long start_code, end_code, start_data, end_data;
    unsigned long start_brk, brk, start_stack;
.......................
};

通过上图可以看到，mm_struct里面保存了整个进程空间的代码段，数据段，堆区，栈区的布局信息。

struct mm_struct {
    struct vm_area_struct *mmap;        /* list of VMAs */
    struct rb_root mm_rb;
.........................
};
struct vm_area_struct {
...................
    /* linked list of VM areas per task, sorted by address */
    struct vm_area_struct *vm_next, *vm_prev;
    struct rb_node vm_rb;
.................
};

每个进程只有一个mm_struct，但存在多个vm_area_struct。mm_struct->mmap能够找到链表的头结点vm_area_struct；vm_area_struct->vm_mm能够找到其所属的mm_struct。在地址空间中，mmap为地址空间的内存区域（用vm_area_struct结构来表示）链表，mm_rb用红黑树来存储,它是树的根部。链表表示起来更加方便，红黑树表示起来更加方便查找。区别是，当虚拟区较少的时候，这个时候采用单链表，由mmap指向这个链表，当虚拟区多时此时采用红黑树的结构，由mm_rb指向这棵红黑树。这样就可以在大量数据的时候效率更高。

struct mm_struct {
    /* VMA链表首部 */
    struct vm_area_struct *mmap;        /* list of VMAs */
    /* VMA会被放到红黑树中,该成员表示根节点 */
    struct rb_root mm_rb;
    u32 vmacache_seqnum;                   /* per-thread vmacache */
#ifdef CONFIG_MMU
    unsigned long (*get_unmapped_area) (struct file *filp,
                unsigned long addr, unsigned long len,
                unsigned long pgoff, unsigned long flags);
#endif
    unsigned long mmap_base;        /* base of mmap area */
    unsigned long mmap_legacy_base;         /* base of mmap area in bottom-up allocations */
    unsigned long task_size;        /* size of task vm space */
    unsigned long highest_vm_end;        /* highest vma end address */
    /* 全局目录表的起始地址  */
    pgd_t * pgd;
    atomic_t mm_users;            /* How many users with user space? */
    atomic_t mm_count;            /* How many references to "struct mm_struct" (users count as 1) */
    atomic_long_t nr_ptes;            /* Page table pages */
    int map_count;                /* number of VMAs */

    spinlock_t page_table_lock;        /* Protects page tables and some counters */
    struct rw_semaphore mmap_sem;

    struct list_head mmlist;        /* List of maybe swapped mm's.    These are globally strung
                         * together off init_mm.mmlist, and are protected
                         * by mmlist_lock
                         */


    unsigned long hiwater_rss;    /* High-watermark of RSS usage */
    unsigned long hiwater_vm;    /* High-water virtual memory usage */

    unsigned long total_vm;        /* Total pages mapped */
    unsigned long locked_vm;    /* Pages that have PG_mlocked set */
    unsigned long pinned_vm;    /* Refcount permanently increased */
    unsigned long shared_vm;    /* Shared pages (files) */
    unsigned long exec_vm;        /* VM_EXEC & ~VM_WRITE */
    unsigned long stack_vm;        /* VM_GROWSUP/DOWN */
    unsigned long def_flags;
    /* 代码段和数据段起始和结束地址 */
    unsigned long start_code, end_code, start_data, end_data;
    /* 堆栈的起始和结束地址 */
    unsigned long start_brk, brk, start_stack;
    /* 命令行和环境变量区域的起始和结束地址 */
    unsigned long arg_start, arg_end, env_start, env_end;

    unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */

    /*
     * Special counters, in some configurations protected by the
     * page_table_lock, in other configurations by being atomic.
     */
    struct mm_rss_stat rss_stat;

    struct linux_binfmt *binfmt;

    cpumask_var_t cpu_vm_mask_var;

    /* Architecture-specific MM context */
    mm_context_t context;

    unsigned long flags; /* Must use atomic bitops to access the bits */

    struct core_state *core_state; /* coredumping support */
#ifdef CONFIG_AIO
    spinlock_t            ioctx_lock;
    struct kioctx_table __rcu    *ioctx_table;
#endif
#ifdef CONFIG_MEMCG
    /*
     * "owner" points to a task that is regarded as the canonical
     * user/owner of this mm. All of the following must be true in
     * order for it to be changed:
     *
     * current == mm->owner
     * current->mm != mm
     * new_owner->mm == mm
     * new_owner->alloc_lock is held
     */
    struct task_struct __rcu *owner;
#endif

    /* store ref to file /proc/<pid>/exe symlink points to */
    struct file *exe_file;
#ifdef CONFIG_MMU_NOTIFIER
    struct mmu_notifier_mm *mmu_notifier_mm;
#endif
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
    pgtable_t pmd_huge_pte; /* protected by page_table_lock */
#endif
#ifdef CONFIG_CPUMASK_OFFSTACK
    struct cpumask cpumask_allocation;
#endif
#ifdef CONFIG_NUMA_BALANCING
    /*
     * numa_next_scan is the next time that the PTEs will be marked
     * pte_numa. NUMA hinting faults will gather statistics and migrate
     * pages to new nodes if necessary.
     */
    unsigned long numa_next_scan;

    /* Restart point for scanning and setting pte_numa */
    unsigned long numa_scan_offset;

    /* numa_scan_seq prevents two threads setting pte_numa */
    int numa_scan_seq;
#endif
#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
    /*
     * An operation with batched TLB flushing is going on. Anything that
     * can move process memory needs to flush the TLB when moving a
     * PROT_NONE or PROT_NUMA mapped page.
     */
    bool tlb_flush_pending;
#endif
    struct uprobes_state uprobes_state;
};

一个进程可以包含多个线程，而一个进程只有一个mm_struct。因此这几个线程的task_struct是指向的同一个mm_struct。

内核线程不需要mm_struct，因为它们永远不会发生缺页中断。因此内核线程的task_struct->mm字段为NULL。

如何获取当前进程的task_struct

arm32内核栈大小是8k，arm64内核栈大小是16k。内核栈的起始地址都是按照8K或者16k的大小对齐的。因此只要能够获取到sp栈顶指针的地址，那么按照8/16k对齐之后，就能得到内核栈的起始地址。

thread_info被放到了内核栈起始地址处，其成员task指向了当前进程的task_struct。同样当前进程的task_struct的成员stack也指向了其内核栈的起始地址，即thread_info。因此可以知道每个进程都有自己一个单独的内核栈。

struct thread_info {
..........................
    struct task_struct    *task;        /* main task structure */
...................................
};
struct task_struct {
    volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
    void *stack;
    ...............
};

#define get_current() (current_thread_info()->task)
#define current get_current()
atic inline struct thread_info *current_thread_info(void) __attribute_const__;

static inline struct thread_info *current_thread_info(void)
{
    register unsigned long sp asm ("sp");
    return (struct thread_info *)(sp & ~(THREAD_SIZE - 1));
}
#define THREAD_SIZE_ORDER    1
#define THREAD_SIZE        8192
#define THREAD_START_SP        (THREAD_SIZE - 8)

进程、线程创建

进程或者线程的创建通过fork、vfork或者clone等系统调用建立，它们最终都是通过do_fork传入不同的参数实现的。

long do_fork(unsigned long clone_flags,
          unsigned long stack_start,
          unsigned long stack_size,
          int __user *parent_tidptr,
          int __user *child_tidptr)

clone_flags：创建进程的标志位集合

stack_start：用户态栈的起始地址

stack_size：用户态栈大小，通常设置为0

parent_tidptr，child_tidptr指向用户空间中地址的两个指针，分别指向父子进程的pid

fork实现：SIGCHLD表示子进程在终止后发送SIGCHLD信号通知父进程。fork为子进程建立了一个基于父进程的完整的副本。但是该副本并不是立即就完成创建，而是采用了写时复制（主要是为了减小创建完整副本的开销）。在最开始是子进程只是复制了进程的页表，并不会去复制页面里面的内存（物理page?)。当子进程需要去修改内容时，才会触发写时复制机制，为子进程创建一个副本（不知道这个创建的副本是父进程所有的东西，还是说就只是需要被改写内容的副本，感觉应该是后者）。

另外fork出来的子进程和父进程，哪个先运行，顺序是不确定的。网上也有说和内核版本有关系。http://chenzhenianqing.com/articles/350.html

SYSCALL_DEFINE0(fork)
{
#ifdef CONFIG_MMU
    return do_fork(SIGCHLD, 0, 0, NULL, NULL);
#else
    /* can not support in nommu mode */
    return -EINVAL;
#endif
}

vfork实现

CLONE_VFORK：创建子进程是启用linux内核的完成机制（completion）。wait_for_completion会使父进程进入睡眠等待，直到子进程执行execve或者exit释放虚拟内存资源。

CLONE_VM：父子进程运行在同一个虚拟地址空间，一个进程对全局变量的修改，对另外的进程也可以看到。（正常情况下，不同的进程的虚拟地址空间是不同的）

从传入的标记就可以看到vfork创建的子进程一定运行在父进程前面。

SYSCALL_DEFINE0(vfork)
{
    return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
            0, NULL, NULL);
}

clone实现：clone用于创建线程。

SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
         int __user *, parent_tidptr,
         int __user *, child_tidptr,
         int, tls_val)
{
    return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr);
}

long do_fork(unsigned long clone_flags,
          unsigned long stack_start,
          unsigned long stack_size,
          int __user *parent_tidptr,
          int __user *child_tidptr)
{
    struct task_struct *p;
    int trace = 0;
    long nr;

    /*
     * Determine whether and which event to report to ptracer.  When
     * called from kernel_thread or CLONE_UNTRACED is explicitly
     * requested, no event is reported; otherwise, report if the event
     * for the type of forking is enabled.
     */
    if (!(clone_flags & CLONE_UNTRACED)) {
        if (clone_flags & CLONE_VFORK)
            trace = PTRACE_EVENT_VFORK;
        else if ((clone_flags & CSIGNAL) != SIGCHLD)
            trace = PTRACE_EVENT_CLONE;
        else
            trace = PTRACE_EVENT_FORK;

        if (likely(!ptrace_event_enabled(current, trace)))
            trace = 0;
    }

    p = copy_process(clone_flags, stack_start, stack_size,
             child_tidptr, NULL, trace);
    /*
     * Do this prior waking up the new thread - the thread pointer
     * might get invalid after that point, if the thread exits quickly.
     */
    if (!IS_ERR(p)) {
        struct completion vfork;
        struct pid *pid;

        trace_sched_process_fork(current, p);

        pid = get_task_pid(p, PIDTYPE_PID);
        nr = pid_vnr(pid);

        if (clone_flags & CLONE_PARENT_SETTID)
            put_user(nr, parent_tidptr);

        if (clone_flags & CLONE_VFORK) {
            p->vfork_done = &vfork;
            init_completion(&vfork);
            get_task_struct(p);
        }
        /* 唤醒新创建的进程 */
        wake_up_new_task(p);

        /* forking complete and child started to run, tell ptracer */
        if (unlikely(trace))
            ptrace_event_pid(trace, pid);

        /*
         设置了CLONE_VFORK，父进程就会等待子进程执行exit或者execve
         看了一下wait_for_vfork_done最后的实现，感觉父进程只是进入了一个循环，
         并不是将父进程从就绪队列里面移除 
        */
        if (clone_flags & CLONE_VFORK) {
            if (!wait_for_vfork_done(p, &vfork))
                ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
        }

        put_pid(pid);
    } else {
        nr = PTR_ERR(p);
    }
    return nr;
}

wait_for_vfork_done->wait_for_completion_killable->wait_for_common->do_wait_for_common

从最后的代码实现看感觉vfork并不是一定会让子进程先运行，只是父进程进入了一个循环出不来而已，并不是将父进程从就绪队列中摘除了？？

static inline long __sched
do_wait_for_common(struct completion *x,
           long (*action)(long), long timeout, int state)
{
    if (!x->done) {
        DECLARE_WAITQUEUE(wait, current);

        __add_wait_queue_tail_exclusive(&x->wait, &wait);
        do {
            if (signal_pending_state(state, current)) {
                timeout = -ERESTARTSYS;
                break;
            }
            __set_current_state(state);
            spin_unlock_irq(&x->wait.lock);
            timeout = action(timeout);
            spin_lock_irq(&x->wait.lock);
        } while (!x->done && timeout);
        __remove_wait_queue(&x->wait, &wait);
        if (!x->done)
            return timeout;
    }
    x->done--;
    return timeout ?: 1;
}

copy_process创建一个新的进程

static struct task_struct *copy_process(unsigned long clone_flags,
                    unsigned long stack_start,
                    unsigned long stack_size,
                    int __user *child_tidptr,
                    struct pid *pid,
                    int trace)
{
.....................................................
    /* 创建一个新的task_struct实例 */
    p = dup_task_struct(current);
}

static struct task_struct *dup_task_struct(struct task_struct *orig)
{
    struct task_struct *tsk;
    struct thread_info *ti;
    unsigned long *stackend;
    int node = tsk_fork_get_node(orig);
    int err;
    /* 分配一个task_struct实例 */
    tsk = alloc_task_struct_node(node);
    if (!tsk)
        return NULL;
    /* 分配一个thread_info实例 */
    ti = alloc_thread_info_node(tsk, node);
    if (!ti)
        goto free_tsk;
    /* 把父进程的tsk复制为子进程的tsk */
    err = arch_dup_task_struct(tsk, orig);
    if (err)
        goto free_ti;

    tsk->stack = ti;
    /* 把父进程的thread_info复制为子进程 */
    setup_thread_stack(tsk, orig);
    clear_user_return_notifier(tsk);
    clear_tsk_need_resched(tsk);
    stackend = end_of_stack(tsk);
    *stackend = STACK_END_MAGIC;    /* for overflow detection */

#ifdef CONFIG_CC_STACKPROTECTOR
    tsk->stack_canary = get_random_int();
#endif

    /*
     * One for us, one for whoever does the "release_task()" (usually
     * parent)
     */
    atomic_set(&tsk->usage, 2);
#ifdef CONFIG_BLK_DEV_IO_TRACE
    tsk->btrace_seq = 0;
#endif
    tsk->splice_pipe = NULL;
    tsk->task_frag.page = NULL;

    account_kernel_stack(ti, 1);

    return tsk;

free_ti:
    free_thread_info(ti);
free_tsk:
    free_task_struct(tsk);
    return NULL;
}

static struct task_struct *copy_process(unsigned long clone_flags,
                    unsigned long stack_start,
                    unsigned long stack_size,
                    int __user *child_tidptr,
                    struct pid *pid,
                    int trace)
{
    int retval;
    struct task_struct *p;

    .....................
    /* 创建一个新的task_struct实例 */
    p = dup_task_struct(current);
    if (!p)
        goto fork_out;

    ftrace_graph_init_task(p);
    get_seccomp_filter(p);

    rt_mutex_init_task(p);

#ifdef CONFIG_PROVE_LOCKING
    DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
    DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
    retval = -EAGAIN;
    if (atomic_read(&p->real_cred->user->processes) >=
            task_rlimit(p, RLIMIT_NPROC)) {
        if (p->real_cred->user != INIT_USER &&
            !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
            goto bad_fork_free;
    }
    current->flags &= ~PF_NPROC_EXCEEDED;
    /* 复制父进程证书,不知道是干啥的 */
    retval = copy_creds(p, clone_flags);
    if (retval < 0)
        goto bad_fork_free;

    /*
     * If multiple threads are within copy_process(), then this check
     * triggers too late. This doesn't hurt, the check is only there
     * to stop root fork bombs.
     */
    retval = -EAGAIN;
    if (nr_threads >= max_threads)
        goto bad_fork_cleanup_count;

    if (!try_module_get(task_thread_info(p)->exec_domain->module))
        goto bad_fork_cleanup_count;

    delayacct_tsk_init(p);    /* Must remain after dup_task_struct() */
    p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
    p->flags |= PF_FORKNOEXEC;//PF_FORKNOEXEC表示该进程还不能执行
    INIT_LIST_HEAD(&p->children);//初始化子进程链表为空
    INIT_LIST_HEAD(&p->sibling);//初始化兄弟进程链表为空
    rcu_copy_process(p);
    p->vfork_done = NULL;
    spin_lock_init(&p->alloc_lock);

    init_sigpending(&p->pending);

    p->utime = p->stime = p->gtime = 0;
    p->utimescaled = p->stimescaled = 0;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
    p->prev_cputime.utime = p->prev_cputime.stime = 0;
#endif
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
    seqlock_init(&p->vtime_seqlock);
    p->vtime_snap = 0;
    p->vtime_snap_whence = VTIME_SLEEPING;
#endif

#if defined(SPLIT_RSS_COUNTING)
    memset(&p->rss_stat, 0, sizeof(p->rss_stat));
#endif

    p->default_timer_slack_ns = current->timer_slack_ns;

    task_io_accounting_init(&p->ioac);
    acct_clear_integrals(p);

    posix_cpu_timers_init(p);

    do_posix_clock_monotonic_gettime(&p->start_time);
    p->real_start_time = p->start_time;
    monotonic_to_bootbased(&p->real_start_time);
    p->io_context = NULL;
    p->audit_context = NULL;
    if (clone_flags & CLONE_THREAD)
        threadgroup_change_begin(current);
    cgroup_fork(p);
#ifdef CONFIG_NUMA
    p->mempolicy = mpol_dup(p->mempolicy);
    if (IS_ERR(p->mempolicy)) {
        retval = PTR_ERR(p->mempolicy);
        p->mempolicy = NULL;
        goto bad_fork_cleanup_threadgroup_lock;
    }
#endif
#ifdef CONFIG_CPUSETS
    p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
    p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
    seqcount_init(&p->mems_allowed_seq);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
    p->irq_events = 0;
    p->hardirqs_enabled = 0;
    p->hardirq_enable_ip = 0;
    p->hardirq_enable_event = 0;
    p->hardirq_disable_ip = _THIS_IP_;
    p->hardirq_disable_event = 0;
    p->softirqs_enabled = 1;
    p->softirq_enable_ip = _THIS_IP_;
    p->softirq_enable_event = 0;
    p->softirq_disable_ip = 0;
    p->softirq_disable_event = 0;
    p->hardirq_context = 0;
    p->softirq_context = 0;
#endif
#ifdef CONFIG_LOCKDEP
    p->lockdep_depth = 0; /* no locks held yet */
    p->curr_chain_key = 0;
    p->lockdep_recursion = 0;
#endif

#ifdef CONFIG_DEBUG_MUTEXES
    p->blocked_on = NULL; /* not blocked yet */
#endif
#ifdef CONFIG_MEMCG
    p->memcg_batch.do_batch = 0;
    p->memcg_batch.memcg = NULL;
#endif
#ifdef CONFIG_BCACHE
    p->sequential_io    = 0;
    p->sequential_io_avg    = 0;
#endif

    /* Perform scheduler related setup. Assign this task to a CPU. */
    /* 初始化进程调度相关的数据结构 */
    retval = sched_fork(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_policy;

    retval = perf_event_init_task(p);
    if (retval)
        goto bad_fork_cleanup_policy;
    retval = audit_alloc(p);
    if (retval)
        goto bad_fork_cleanup_policy;
    /* copy all the process information */
    retval = copy_semundo(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_audit;
    /* 复制父进程打开的文件信息 */
    retval = copy_files(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_semundo;
    retval = copy_fs(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_files;
    retval = copy_sighand(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_fs;
    retval = copy_signal(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_sighand;
    /* 复制父进程的内存空间 */
    retval = copy_mm(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_signal;
    retval = copy_namespaces(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_mm;
    retval = copy_io(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_namespaces;
    retval = copy_thread(clone_flags, stack_start, stack_size, p);
    if (retval)
        goto bad_fork_cleanup_io;

    if (pid != &init_struct_pid) {
        retval = -ENOMEM;
        pid = alloc_pid(p->nsproxy->pid_ns_for_children);
        if (!pid)
            goto bad_fork_cleanup_io;
    }

    p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
    /*
     * Clear TID on mm_release()?
     */
    p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
#ifdef CONFIG_BLOCK
    p->plug = NULL;
#endif
#ifdef CONFIG_FUTEX
    p->robust_list = NULL;
#ifdef CONFIG_COMPAT
    p->compat_robust_list = NULL;
#endif
    INIT_LIST_HEAD(&p->pi_state_list);
    p->pi_state_cache = NULL;
#endif
    /*
     * sigaltstack should be cleared when sharing the same VM
     */
    if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
        p->sas_ss_sp = p->sas_ss_size = 0;

    /*
     * Syscall tracing and stepping should be turned off in the
     * child regardless of CLONE_PTRACE.
     */
    user_disable_single_step(p);
    clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
#ifdef TIF_SYSCALL_EMU
    clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
#endif
    clear_all_latency_tracing(p);

    /* ok, now we should be set up.. */
    p->pid = pid_nr(pid);
    if (clone_flags & CLONE_THREAD) {
        p->exit_signal = -1;
        p->group_leader = current->group_leader;
        p->tgid = current->tgid;
    } else {
        if (clone_flags & CLONE_PARENT)
            p->exit_signal = current->group_leader->exit_signal;
        else
            p->exit_signal = (clone_flags & CSIGNAL);
        p->group_leader = p;
        p->tgid = p->pid;
    }

    p->nr_dirtied = 0;
    p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
    p->dirty_paused_when = 0;

    p->pdeath_signal = 0;
    INIT_LIST_HEAD(&p->thread_group);
    p->task_works = NULL;

    /*
     * Make it visible to the rest of the system, but dont wake it up yet.
     * Need tasklist lock for parent etc handling!
     */
    write_lock_irq(&tasklist_lock);

    /* CLONE_PARENT re-uses the old parent */
    if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
        p->real_parent = current->real_parent;
        p->parent_exec_id = current->parent_exec_id;
    } else {
        p->real_parent = current;
        p->parent_exec_id = current->self_exec_id;
    }

    spin_lock(&current->sighand->siglock);

    /*
     * Process group and session signals need to be delivered to just the
     * parent before the fork or both the parent and the child after the
     * fork. Restart if a signal comes in before we add the new process to
     * it's process group.
     * A fatal signal pending means that current will exit, so the new
     * thread can't slip out of an OOM kill (or normal SIGKILL).
    */
    recalc_sigpending();
    if (signal_pending(current)) {
        spin_unlock(&current->sighand->siglock);
        write_unlock_irq(&tasklist_lock);
        retval = -ERESTARTNOINTR;
        goto bad_fork_free_pid;
    }

    if (likely(p->pid)) {
        ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);

        init_task_pid(p, PIDTYPE_PID, pid);
        if (thread_group_leader(p)) {
            init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
            init_task_pid(p, PIDTYPE_SID, task_session(current));

            if (is_child_reaper(pid)) {
                ns_of_pid(pid)->child_reaper = p;
                p->signal->flags |= SIGNAL_UNKILLABLE;
            }

            p->signal->leader_pid = pid;
            p->signal->tty = tty_kref_get(current->signal->tty);
            list_add_tail(&p->sibling, &p->real_parent->children);
            list_add_tail_rcu(&p->tasks, &init_task.tasks);
            attach_pid(p, PIDTYPE_PGID);
            attach_pid(p, PIDTYPE_SID);
            __this_cpu_inc(process_counts);
        } else {
            current->signal->nr_threads++;
            atomic_inc(&current->signal->live);
            atomic_inc(&current->signal->sigcnt);
            list_add_tail_rcu(&p->thread_group,
                      &p->group_leader->thread_group);
            list_add_tail_rcu(&p->thread_node,
                      &p->signal->thread_head);
        }
        attach_pid(p, PIDTYPE_PID);
        nr_threads++;
    }

    total_forks++;
    spin_unlock(&current->sighand->siglock);
    syscall_tracepoint_update(p);
    write_unlock_irq(&tasklist_lock);

    proc_fork_connector(p);
    cgroup_post_fork(p);
    if (clone_flags & CLONE_THREAD)
        threadgroup_change_end(current);
    perf_event_fork(p);

    trace_task_newtask(p, clone_flags);
    uprobe_copy_process(p, clone_flags);

    return p;

bad_fork_free_pid:
    if (pid != &init_struct_pid)
        free_pid(pid);
bad_fork_cleanup_io:
    if (p->io_context)
        exit_io_context(p);
bad_fork_cleanup_namespaces:
    exit_task_namespaces(p);
bad_fork_cleanup_mm:
    if (p->mm)
        mmput(p->mm);
bad_fork_cleanup_signal:
    if (!(clone_flags & CLONE_THREAD))
        free_signal_struct(p->signal);
bad_fork_cleanup_sighand:
    __cleanup_sighand(p->sighand);
bad_fork_cleanup_fs:
    exit_fs(p); /* blocking */
bad_fork_cleanup_files:
    exit_files(p); /* blocking */
bad_fork_cleanup_semundo:
    exit_sem(p);
bad_fork_cleanup_audit:
    audit_free(p);
bad_fork_cleanup_policy:
    perf_event_free_task(p);
#ifdef CONFIG_NUMA
    mpol_put(p->mempolicy);
bad_fork_cleanup_threadgroup_lock:
#endif
    if (clone_flags & CLONE_THREAD)
        threadgroup_change_end(current);
    delayacct_tsk_free(p);
    module_put(task_thread_info(p)->exec_domain->module);
bad_fork_cleanup_count:
    atomic_dec(&p->cred->user->processes);
    exit_creds(p);
bad_fork_free:
    free_task(p);
fork_out:
    return ERR_PTR(retval);
}

copy_process->copy_mm：可以看到如果设置了CLONE_VM，确实父子进程指向的是同一个mm_struct，共享了同一个虚拟地址空间

static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
{
    struct mm_struct *mm, *oldmm;
    int retval;

    tsk->min_flt = tsk->maj_flt = 0;
    tsk->nvcsw = tsk->nivcsw = 0;
#ifdef CONFIG_DETECT_HUNG_TASK
    tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
#endif

    tsk->mm = NULL;
    tsk->active_mm = NULL;

    /*
     * Are we cloning a kernel thread?
     *
     * We need to steal a active VM for that..
     */
    oldmm = current->mm;
    if (!oldmm)/* mm为空说明是线程或者内核线程 */
        return 0;

    /* initialize the new vmacache entries */
    vmacache_flush(tsk);
    /* 从这里可以看到CLONE_VM,确实父子进程共享虚拟地址空间,
       父子进程指向同一个mm_struct
    */
    if (clone_flags & CLONE_VM) {
        atomic_inc(&oldmm->mm_users);
        mm = oldmm;
        goto good_mm;
    }

    retval = -ENOMEM;
    /* 分配一个mm_struct然后从父进程复制相关内容 */
    mm = dup_mm(tsk);
    if (!mm)
        goto fail_nomem;

good_mm:
    tsk->mm = mm;
    tsk->active_mm = mm;
    return 0;

fail_nomem:
    return retval;
}

copy_process->copy_mm->dup_mm:为子进程分配一个mm_struct，然后将父进程的相关信息copy过来

/*
 * Allocate a new mm structure and copy contents from the
 * mm structure of the passed in task structure.
 */
static struct mm_struct *dup_mm(struct task_struct *tsk)
{
    struct mm_struct *mm, *oldmm = current->mm;
    int err;
    /* 为子进程分配一个mm_struct */
    mm = allocate_mm();
    if (!mm)
        goto fail_nomem;

    memcpy(mm, oldmm, sizeof(*mm));
    mm_init_cpumask(mm);

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
    mm->pmd_huge_pte = NULL;
#endif
    if (!mm_init(mm, tsk))
        goto fail_nomem;

    if (init_new_context(tsk, mm))
        goto fail_nocontext;

    dup_mm_exe_file(oldmm, mm);

    err = dup_mmap(mm, oldmm);
    if (err)
        goto free_pt;

    mm->hiwater_rss = get_mm_rss(mm);
    mm->hiwater_vm = mm->total_vm;

    if (mm->binfmt && !try_module_get(mm->binfmt->module))
        goto free_pt;

    return mm;

free_pt:
    /* don't put binfmt in mmput, we haven't got module yet */
    mm->binfmt = NULL;
    mmput(mm);

fail_nomem:
    return NULL;

fail_nocontext:
    /*
     * If init_new_context() failed, we cannot use mmput() to free the mm
     * because it calls destroy_context()
     */
    mm_free_pgd(mm);
    free_mm(mm);
    return NULL;
}

copy_process->copy_mm->dup_mm->mm_alloc_pgd->pgd_alloc:为进程分配PGD页表

/*
 * need to get a 16k page for level 1
 */
pgd_t *pgd_alloc(struct mm_struct *mm)
{
    pgd_t *new_pgd, *init_pgd;
    pud_t *new_pud, *init_pud;
    pmd_t *new_pmd, *init_pmd;
    pte_t *new_pte, *init_pte;

    new_pgd = __pgd_alloc();
    if (!new_pgd)
        goto no_pgd;

    memset(new_pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t));

    /*
     * Copy over the kernel and IO PGD entries
     */
    init_pgd = pgd_offset_k(0);
    memcpy(new_pgd + USER_PTRS_PER_PGD, init_pgd + USER_PTRS_PER_PGD,
               (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));

    clean_dcache_area(new_pgd, PTRS_PER_PGD * sizeof(pgd_t));

#ifdef CONFIG_ARM_LPAE
    /*
     * Allocate PMD table for modules and pkmap mappings.
     */
    new_pud = pud_alloc(mm, new_pgd + pgd_index(MODULES_VADDR),
                MODULES_VADDR);
    if (!new_pud)
        goto no_pud;

    new_pmd = pmd_alloc(mm, new_pud, 0);
    if (!new_pmd)
        goto no_pmd;
#endif

    if (!vectors_high()) {
        /*
         * On ARM, first page must always be allocated since it
         * contains the machine vectors. The vectors are always high
         * with LPAE.
         */
        new_pud = pud_alloc(mm, new_pgd, 0);
        if (!new_pud)
            goto no_pud;

        new_pmd = pmd_alloc(mm, new_pud, 0);
        if (!new_pmd)
            goto no_pmd;

        new_pte = pte_alloc_map(mm, NULL, new_pmd, 0);
        if (!new_pte)
            goto no_pte;

        init_pud = pud_offset(init_pgd, 0);
        init_pmd = pmd_offset(init_pud, 0);
        init_pte = pte_offset_map(init_pmd, 0);
        set_pte_ext(new_pte + 0, init_pte[0], 0);
        set_pte_ext(new_pte + 1, init_pte[1], 0);
        pte_unmap(init_pte);
        pte_unmap(new_pte);
    }

    return new_pgd;

no_pte:
    pmd_free(mm, new_pmd);
no_pmd:
    pud_free(mm, new_pud);
no_pud:
    __pgd_free(new_pgd);
no_pgd:
    return NULL;
}

copy_process->copy_mm->dup_mm->dup_mmap

遍历父进程所有的VMA(进程的虚拟地址空间由vm_area_struct表示)，复制父进程VMA中对应的pte页表项到子进程相应的VMA中（只复制对应的pte页表项，并未复制VMA所对应页面的内容）

文章最开始提到，VMA可以是一个文件，也可以是代码段，数据段。因此父子进程执行同一个代码。应该就是这里复制的

static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
{
    struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
    struct rb_node **rb_link, *rb_parent;
    int retval;
    unsigned long charge;

    uprobe_start_dup_mmap();
    down_write(&oldmm->mmap_sem);
    flush_cache_dup_mm(oldmm);
    uprobe_dup_mmap(oldmm, mm);
    /*
     * Not linked in yet - no deadlock potential:
     */
    down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);

    mm->locked_vm = 0;
    mm->mmap = NULL;
    mm->vmacache_seqnum = 0;
    mm->map_count = 0;
    cpumask_clear(mm_cpumask(mm));
    mm->mm_rb = RB_ROOT;
    rb_link = &mm->mm_rb.rb_node;
    rb_parent = NULL;
    pprev = &mm->mmap;
    retval = ksm_fork(mm, oldmm);
    if (retval)
        goto out;
    retval = khugepaged_fork(mm, oldmm);
    if (retval)
        goto out;

    prev = NULL;
    /* 遍历所有的VMA */
    for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
        struct file *file;

        if (mpnt->vm_flags & VM_DONTCOPY) {
            vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
                            -vma_pages(mpnt));
            continue;
        }
        charge = 0;
        if (mpnt->vm_flags & VM_ACCOUNT) {
            unsigned long len = vma_pages(mpnt);

            if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
                goto fail_nomem;
            charge = len;
        }
        tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
        if (!tmp)
            goto fail_nomem;
        *tmp = *mpnt;
        INIT_LIST_HEAD(&tmp->anon_vma_chain);
        retval = vma_dup_policy(mpnt, tmp);
        if (retval)
            goto fail_nomem_policy;
        tmp->vm_mm = mm;
        if (anon_vma_fork(tmp, mpnt))
            goto fail_nomem_anon_vma_fork;
        tmp->vm_flags &= ~VM_LOCKED;
        tmp->vm_next = tmp->vm_prev = NULL;
        file = tmp->vm_file;
        if (file) {
            struct inode *inode = file_inode(file);
            struct address_space *mapping = file->f_mapping;

            get_file(file);
            if (tmp->vm_flags & VM_DENYWRITE)
                atomic_dec(&inode->i_writecount);
            mutex_lock(&mapping->i_mmap_mutex);
            if (tmp->vm_flags & VM_SHARED)
                mapping->i_mmap_writable++;
            flush_dcache_mmap_lock(mapping);
            /* insert tmp into the share list, just after mpnt */
            if (unlikely(tmp->vm_flags & VM_NONLINEAR))
                vma_nonlinear_insert(tmp,
                        &mapping->i_mmap_nonlinear);
            else
                vma_interval_tree_insert_after(tmp, mpnt,
                            &mapping->i_mmap);
            flush_dcache_mmap_unlock(mapping);
            mutex_unlock(&mapping->i_mmap_mutex);
        }

        /*
         * Clear hugetlb-related page reserves for children. This only
         * affects MAP_PRIVATE mappings. Faults generated by the child
         * are not guaranteed to succeed, even if read-only
         */
        if (is_vm_hugetlb_page(tmp))
            reset_vma_resv_huge_pages(tmp);

        /*
         * Link in the new vma and copy the page table entries.
         */
        *pprev = tmp;
        pprev = &tmp->vm_next;
        tmp->vm_prev = prev;
        prev = tmp;
        /* 将新创建的tmp加入到子进程的链表中 */
        __vma_link_rb(mm, tmp, rb_link, rb_parent);
        rb_link = &tmp->vm_rb.rb_right;
        rb_parent = &tmp->vm_rb;

        mm->map_count++;
        retval = copy_page_range(mm, oldmm, mpnt);

        if (tmp->vm_ops && tmp->vm_ops->open)
            tmp->vm_ops->open(tmp);

        if (retval)
            goto out;
    }
    /* a new mm has just been created */
    arch_dup_mmap(oldmm, mm);
    retval = 0;
out:
    up_write(&mm->mmap_sem);
    flush_tlb_mm(oldmm);
    up_write(&oldmm->mmap_sem);
    uprobe_end_dup_mmap();
    return retval;
fail_nomem_anon_vma_fork:
    mpol_put(vma_policy(tmp));
fail_nomem_policy:
    kmem_cache_free(vm_area_cachep, tmp);
fail_nomem:
    retval = -ENOMEM;
    vm_unacct_memory(charge);
    goto out;
}

copy_process->copy_mm->dup_mm->dup_mmap->copy_page_range将父进程的VAM页表复制到子进程页表中

int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
        struct vm_area_struct *vma)
{
    pgd_t *src_pgd, *dst_pgd;
    unsigned long next;
    unsigned long addr = vma->vm_start;
    unsigned long end = vma->vm_end;
    unsigned long mmun_start;    /* For mmu_notifiers */
    unsigned long mmun_end;        /* For mmu_notifiers */
    bool is_cow;
    int ret;

    /*
     * Don't copy ptes where a page fault will fill them correctly.
     * Fork becomes much lighter when there are big shared or private
     * readonly mappings. The tradeoff is that copy_page_range is more
     * efficient than faulting.
     */
    if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR |
                   VM_PFNMAP | VM_MIXEDMAP))) {
        if (!vma->anon_vma)
            return 0;
    }

    if (is_vm_hugetlb_page(vma))
        return copy_hugetlb_page_range(dst_mm, src_mm, vma);

    if (unlikely(vma->vm_flags & VM_PFNMAP)) {
        /*
         * We do not free on error cases below as remove_vma
         * gets called on error from higher level routine
         */
        ret = track_pfn_copy(vma);
        if (ret)
            return ret;
    }

    /*
     * We need to invalidate the secondary MMU mappings only when
     * there could be a permission downgrade on the ptes of the
     * parent mm. And a permission downgrade will only happen if
     * is_cow_mapping() returns true.
     */
    is_cow = is_cow_mapping(vma->vm_flags);
    mmun_start = addr;
    mmun_end   = end;
    if (is_cow)
        mmu_notifier_invalidate_range_start(src_mm, mmun_start,
                            mmun_end);

    ret = 0;
    dst_pgd = pgd_offset(dst_mm, addr);
    src_pgd = pgd_offset(src_mm, addr);
    do {
        next = pgd_addr_end(addr, end);
        if (pgd_none_or_clear_bad(src_pgd))
            continue;
        if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
                        vma, addr, next))) {
            ret = -ENOMEM;
            break;
        }
    } while (dst_pgd++, src_pgd++, addr = next, addr != end);

    if (is_cow)
        mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end);
    return ret;
}

fork是怎么返回两次的？感觉就是因为有两个进程，分别都会从fork调用返回。

在copy_thread里面可以看到，给子进程的r0设置为了0。网上说用户态fork调用的返回值就是r0，因此子进程返回的是0。而的do_fork返回值是子进程的pid，这里应该是父进程的返回值。

    thread->cpu_context.pc = (unsigned long)ret_from_fork;
    thread->cpu_context.sp = (unsigned long)childregs;

2、可以看到进程硬件上下文cpu_context的pc被设置为了ret_from_fork。在copy_process返回后，会将子进程唤醒，这样就能够让子进程被调度。在进程切换时，switch_to()函数会完成进程硬件上下文的切换，即把下一个进程（next进程）的cpu_context数据结构保存的内容恢复到处理器的寄存器中，从而完成进程的切换。此时，处理器开始运行next进程了。由于新创建的子进程的PC设置为了ret_from_fork，处理器会从ret_from_fork汇编函数里开始执行。

int
copy_thread(unsigned long clone_flags, unsigned long stack_start,
        unsigned long stk_sz, struct task_struct *p)
{
    struct thread_info *thread = task_thread_info(p);
    struct pt_regs *childregs = task_pt_regs(p);

    memset(&thread->cpu_context, 0, sizeof(struct cpu_context_save));

    if (likely(!(p->flags & PF_KTHREAD))) {//用户进程
        *childregs = *current_pt_regs();//这里的regs保存的是用户态的上下文
        childregs->ARM_r0 = 0;
        if (stack_start)
            childregs->ARM_sp = stack_start;
    } else {//创建的是内核线程
        /* 
        内核线程的没有用户态的东西，所以及通过memset全部置为0，
        cpu_context表示内核态的上下文，
        在这里把要执行函数的地址保存在r5寄存器中，参数保存在r4寄存器中
        */
        memset(childregs, 0, sizeof(struct pt_regs));
        thread->cpu_context.r4 = stk_sz;//参数
        thread->cpu_context.r5 = stack_start;//内核线程执行的函数
        childregs->ARM_cpsr = SVC_MODE;
    }
    thread->cpu_context.pc = (unsigned long)ret_from_fork;
    thread->cpu_context.sp = (unsigned long)childregs;

    clear_ptrace_hw_breakpoint(p);

    if (clone_flags & CLONE_SETTLS)
        thread->tp_value[0] = childregs->ARM_r3;
    thread->tp_value[1] = get_tpuser();

    thread_notify(THREAD_NOTIFY_COPY, thread);

    return 0;
}

#define THREAD_SIZE        8192
#define THREAD_START_SP        (THREAD_SIZE - 8)
#define task_pt_regs(p) \
    ((struct pt_regs *)(THREAD_START_SP + task_stack_page(p)) - 1)
#define task_stack_page(task)    ((task)->stack)

(task)->stack是指向了内核栈最下面，即thread_info起始地址

(struct pt_regs *)(THREAD_START_SP + task_stack_page(p))是指向了上图SP的位置。

然后将其-1，其实等价于（uint8_t*）(struct pt_regs *)(THREAD_START_SP + task_stack_page(p)) - sizeof(struct pt_regs *)

https://blog.csdn.net/rlk8888/article/details/122514176。
（3）CPU打断正在运行的用户进程A，处于异常模式。CPU会跳转到异常向量表中的el0_irq里。在el0_irq汇编函数里，首先把中断现场保存到进程A的pt_regs栈框中。

按照上面说的，pt_regs里面应该是用户进程的现场信息。

#define STACK_END_MAGIC        0x57AC6E9D
static inline unsigned long *end_of_stack(struct task_struct *p)
{
    return (unsigned long *)(task_thread_info(p) + 1);
}
static struct task_struct *dup_task_struct(struct task_struct *orig)
{
 .........................
    ti = alloc_thread_info_node(tsk, node);
 .........................
    stackend = end_of_stack(tsk);
    *stackend = STACK_END_MAGIC;    /* for overflow detection */
}

可以看到0x57AC6E90，其实就是在创建子进程信息的时候被设置的。表示栈的末尾，用于栈溢出检测

接下来看子进程被调度时如何运行

schedule -> __schedule -> context_switch ->switch_mm & switch_to

static inline void
context_switch(struct rq *rq, struct task_struct *prev,
           struct task_struct *next)
{
    struct mm_struct *mm, *oldmm;

    prepare_task_switch(rq, prev, next);

    mm = next->mm;
    oldmm = prev->active_mm;
.......................
    if (!mm) {
        next->active_mm = oldmm;
        atomic_inc(&oldmm->mm_count);
        enter_lazy_tlb(oldmm, next);
    } else
        switch_mm(oldmm, mm, next);
.........................
    switch_to(prev, next, prev);
..............................
}

context_switch()调用switch_mm切换进程页表, 然后调用switch_to()加载task的pc, sp等寄存器。

arch\arm\include\asm\switch_to.h(我环境里面是这个)

extern struct task_struct *__switch_to(struct task_struct *, struct thread_info *, struct thread_info *);

#define switch_to(prev,next,last)                    \
do {                                    \
    last = __switch_to(prev,task_thread_info(prev), task_thread_info(next));    \
} while (0)

/arch/arm/kernel/asm-offsets.c
 DEFINE(TI_CPU_SAVE,        offsetof(struct thread_info, cpu_context));
#define DEFINE(sym, val) \
        asm volatile("\n->" #sym " %0 " #val : : "i" (val))

arm里面寄存器r0,r1,r2,r3用于保存函数传入的形参，超过4个侧保存到栈中

eg：f(a,b,c,d)。a,b,c,d分别对应r0,r1,r2,r3

prev:表示即将被换下的进程（r0和r1里面保存了相关信息）；next是即将被换上CPU的进程(r2表示)

/arch/arm/kernel/entry-armv.S
ENTRY(__switch_to)
 UNWIND(.fnstart    )
 UNWIND(.cantunwind    )
     // ip = r1 + cpu_context偏移 <=> ip = prev->thread_info.cpu_context
    add    ip, r1, #TI_CPU_SAVE
 //将r4 - sl, fp, sp, lr寄存器中的内容保存到IP寄存器所指向的内存地址，即prev->thread_info->cpu_context，这相当于保存了prev进程(即将被换下CPU的进程)运行时的寄存器上下文。从下面可以看到就是保存里面cpu_context_save里面所有的寄存器，lr被保存到了pc里面。
 ARM(    stmia    ip!, {r4 - sl, fp, sp, lr} )    @ Store most regs on stack
//下面这个也是将值保存到ip所执行的内存地址。对了ip！,！表示最新的内存地址会被更新到寄存器ip中。
//此时ip指向了extra
//不知道为什么这个会保存两次
 THUMB(    stmia    ip!, {r4 - sl, fp}       )    @ Store most regs on stack
 THUMB(    str    sp, [ip], #4           )
 THUMB(    str    lr, [ip], #4           )
    //将next->thread_info->tp_value[0]和tp_value[1]分别复制为r4和r5
    ldr    r4, [r2, #TI_TP_VALUE]
    ldr    r5, [r2, #TI_TP_VALUE + 4]
#ifdef CONFIG_CPU_USE_DOMAINS
    ldr    r6, [r2, #TI_CPU_DOMAIN]
#endif
    switch_tls r1, r4, r5, r3, r7
#if defined(CONFIG_CC_STACKPROTECTOR) && !defined(CONFIG_SMP)
    ldr    r7, [r2, #TI_TASK]
    ldr    r8, =__stack_chk_guard
    ldr    r7, [r7, #TSK_STACK_CANARY]
#endif
#ifdef CONFIG_CPU_USE_DOMAINS
    mcr    p15, 0, r6, c3, c0, 0        @ Set domain register
#endif
    mov    r5, r0
    add    r4, r2, #TI_CPU_SAVE
    ldr    r0, =thread_notify_head
    mov    r1, #THREAD_NOTIFY_SWITCH
    bl    atomic_notifier_call_chain
#if defined(CONFIG_CC_STACKPROTECTOR) && !defined(CONFIG_SMP)
    str    r7, [r8]
#endif
 THUMB(    mov    ip, r4               )
    mov    r0, r5
  //“ARM( ldmia r4, {r4 - sl, fp, sp, pc} )” 将next->thread_info->cpu_context的数据加载到r4 - sl, fp, sp, lr，pc寄存器中，next->thread_info->cpu_context->sp存入寄存器SP相当于内核栈切换完成，next->thread_info->cpu_context->pc存入寄存器PC相当于跳转到next进程运行。即切换到next进程运行时的寄存器上下文。
 ARM(    ldmia    r4, {r4 - sl, fp, sp, pc}  )    @ Load all regs saved previously
 THUMB(    ldmia    ip!, {r4 - sl, fp}       )    @ Load all regs saved previously
 THUMB(    ldr    sp, [ip], #4           )
 THUMB(    ldr    pc, [ip]           )
 UNWIND(.fnend        )
ENDPROC(__switch_to)

struct cpu_context_save {
    __u32    r4;
    __u32    r5;
    __u32    r6;
    __u32    r7;
    __u32    r8;
    __u32    r9;
    __u32    sl;
    __u32    fp;
    __u32    sp;
    __u32    pc;
    __u32    extra[2];        /* Xscale 'acc' register, etc */
};

“ARM( ldmia r4, {r4 - sl, fp, sp, pc} )” 将next->thread_info->cpu_context的数据加载到r4 - sl, fp, sp, lr，pc寄存器中，next->thread_info->cpu_context->sp存入寄存器SP相当于内核栈切换完成，next->thread_info->cpu_context->pc存入寄存器PC相当于跳转到next进程运行。即切换到next进程运行时的寄存器上下文。这里的pc就是前面设置的reet_from_fork

./arch/arm/kernel/entry-header.S

下面这些都是寄存器的别名。tsk是r9的别名

scno    .req    r7        @ syscall number
tbl    .req    r8        @ syscall table pointer
why    .req    r8        @ Linux syscall (!= 0)
tsk    .req    r9        @ current thread_info

/*
 * Get current thread_info.
 */
    .macro    get_thread_info, rd
 ARM(    mov    \rd, sp, lsr #13    )
 THUMB(    mov    \rd, sp            )
 THUMB(    lsr    \rd, \rd, #13        )
    mov    \rd, \rd, lsl #13
    .endm

ENTRY(ret_from_fork)
    bl    schedule_tail
    cmp    r5, #0 /*copy_thread里面说了r5如果是0,则表示创建的是用户进程，反之是内核线程执行函数的地址 */
    movne    r0, r4/* 不为0,则将参数r4给r0 */
    adrne    lr, BSYM(1f)
    movne    pc, r5
    /*
     因此如果创建的是用户进程，上面的ne都不满足，直接到这里
     get_thread_info tsk将r9设置为thread__info的起始地址
    */
1:    get_thread_info tsk
    b    ret_slow_syscall
ENDPROC(ret_from_fork)

ret_to_user返回到用户空间

ENTRY(ret_to_user)
ret_slow_syscall:
    disable_irq                @ disable interrupts
ENTRY(ret_to_user_from_irq)
    ldr    r1, [tsk, #TI_FLAGS]
    tst    r1, #_TIF_WORK_MASK
    bne    work_pending
no_work_pending:
    asm_trace_hardirqs_on

    /* perform architecture specific actions before user return */
    arch_ret_to_user r1, lr
    ct_user_enter save = 0

    restore_user_regs fast = 0, offset = 0
ENDPROC(ret_to_user_from_irq)
ENDPROC(ret_to_user)

“ARM( ldmia r4, {r4 - sl, fp, sp, pc} )” 将next->thread_info->cpu_context的数据加载到r4 - sl, fp, sp, lr，pc寄存器中，next->thread_info->cpu_context->sp存入寄存器SP相当于内核栈切换完成，next->thread_info->cpu_context->pc存入寄存器PC相当于跳转到next进程运行。即切换到next进程运行时的寄存器上下文。这里的pc就是前面设置的reet_from_fork

上面说__switch_to会切换当前进程和即将运行进程里的cpu_context里面保存的寄存器内容。那子进程是何时开始运行的呢？进行如下验证：

int printTwice = 10; 
asmlinkage __visible void schedule_tail(struct task_struct *prev)
    __releases(rq->lock)
{
    struct rq *rq = this_rq();

    finish_task_switch(rq, prev);
    if (printTwice != 0)
    {
           printk(KERN_EMERG "\r\nfunc %s, line %d, new process, CPU: %d PID: %d Comm: %.20s \n", __FUNCTION__, __LINE__, raw_smp_processor_id(), current->pid, current->comm);
        --printTwice;        
    }
}
static inline void
context_switch(struct rq *rq, struct task_struct *prev,
           struct task_struct *next)
{
    if (printTwice != 0)
    {
        printk(KERN_EMERG "\r\nfunc %s, line %d, before switch CPU: %d PID: %d Comm: %.20s \n", __FUNCTION__, __LINE__, raw_smp_processor_id(), current->pid, current->comm);
    }
    context_tracking_task_switch(prev, next);
    /* Here we just switch the register state and the stack. */

    switch_to(prev, next, prev);


    if (printTwice != 0)
    {
        printk(KERN_EMERG "\r\nfunc %s, line %d, after switch, CPU: %d PID: %d Comm: %.20s\n", __FUNCTION__, __LINE__, raw_smp_processor_id(), current->pid, current->comm);
        --printTwice;
    }
}

在switch_to(prev, next, prev);前后增加打印。另外即将会换上CPU的进程会去调用schedule_tail，这个函数里面也增加打印，看打印顺序。打印结果如下：

可以看到在switch_to返回之前，新的进程已经开始运行。switch_to返回之后，是调度器有重新调度到该进程了

“ARM( ldmia r4, {r4 - sl, fp, sp, pc} )” 将next->thread_info->cpu_context的数据加载到r4 - sl, fp, sp, lr，pc寄存器中，next->thread_info->cpu_context->sp存入寄存器SP相当于内核栈切换完成，next->thread_info->cpu_context->pc存入寄存器PC相当于跳转到next进程运行。即切换到next进程运行时的寄存器上下文。这里的pc就是前面设置的reet_from_fork

所以我感觉应该是在执行这句汇编指令后：ARM( ldmia r4, {r4 - sl, fp, sp, pc} )，新的进程就立马投入了运行。

THUMB(    ldmia    ip!, {r4 - sl, fp}       )    @ Load all regs saved previously
 THUMB(    ldr    sp, [ip], #4           )
 THUMB(    ldr    pc, [ip]           )

在执行这几条指令的时候，多半是当前进程被重新调度的时候了。

func context_switch, line 2338, before switch CPU: 0 PID: 0 Comm: swapper/0
func schedule_tail, line 2278, new process, CPU: 0 PID: 2 Comm: swapper/0
func context_switch, line 2338, before switch CPU: 0 PID: 2 Comm: kthreadd
func schedule_tail, line 2278, new process, CPU: 0 PID: 1 Comm: swapper/0
func context_switch, line 2338, before switch CPU: 0 PID: 1 Comm: swapper/0
func context_switch, line 2345, after switch, CPU: 0 PID: 2 Comm: kthreadd
func context_switch, line 2338, before switch CPU: 0 PID: 2 Comm: kthreadd
func schedule_tail, line 2278, new process, CPU: 0 PID: 3 Comm: kthreadd
func context_switch, line 2338, before switch CPU: 0 PID: 3 Comm: kthreadd
func context_switch, line 2345, after switch, CPU: 0 PID: 1 Comm: swapper/0
func context_switch, line 2338, before switch CPU: 0 PID: 1 Comm: swapper/0
func context_switch, line 2345, after switch, CPU: 0 PID: 3 Comm: ksoftirqd/0
func context_switch, line 2338, before switch CPU: 0 PID: 3 Comm: ksoftirqd/0
func context_switch, line 2345, after switch, CPU: 0 PID: 1 Comm: swapper/0
func context_switch, line 2338, before switch CPU: 0 PID: 1 Comm: swapper/0
func context_switch, line 2345, after switch, CPU: 0 PID: 2 Comm: kthreadd
func context_switch, line 2338, before switch CPU: 0 PID: 2 Comm: kthreadd
func context_switch, line 2345, after switch, CPU: 0 PID: 3 Comm: ksoftirqd/0
func context_switch, line 2338, before switch CPU: 0 PID: 3 Comm: ksoftirqd/0
func schedule_tail, line 2278, new process, CPU: 0 PID: 4 Comm: kthreadd