Linux do_fork in detail

Whether the kernel calls kernel_thread to create a kernel thread, or an application creates a process with the fork system call or a thread with pthread_create, the function ultimately invoked inside the kernel is do_fork.

do_fork itself is quite complex; this article only covers two of its helper functions, copy_mm and copy_thread.

1 copy_mm

do_fork

    --------->copy_process

        ------------->copy_mm

//The tsk parameter is the task_struct of the newly created child process/thread
static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
{
	struct mm_struct *mm, *oldmm;
	int retval;

	tsk->min_flt = tsk->maj_flt = 0;
	tsk->nvcsw = tsk->nivcsw = 0;
#ifdef CONFIG_DETECT_HUNG_TASK
	tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
#endif

	tsk->mm = NULL; 
	tsk->active_mm = NULL; 

	/*
	 * Are we cloning a kernel thread?
	 *
	 * We need to steal a active VM for that..
	 */
	oldmm = current->mm;  //Get the parent's mm; if it is NULL the parent is a kernel thread, whose active_mm is borrowed from the previously running process
------------------------------------------------------(1)
	if (!oldmm) //The parent is a kernel thread: nothing to copy, return
		return 0;
-----------------------------------------------------(2)
	if (clone_flags & CLONE_VM) {  //CLONE_VM set: we are creating a user thread that shares the parent's address space
		atomic_inc(&oldmm->mm_users);
		mm = oldmm;
		goto good_mm;
	}

	retval = -ENOMEM;
-----------------------------------------------------(3)
	mm = dup_mm(tsk);//Build a new address space, copying the parent's page tables
	if (!mm)
		goto fail_nomem;

good_mm:
	tsk->mm = mm;//Install the mm on the new task
	tsk->active_mm = mm;
	return 0;

fail_nomem:
	return retval;
}

copy_mm above breaks down into three cases:

1. If the parent is a kernel thread, its mm pointer is NULL and copy_mm returns immediately. Whenever a kernel thread is scheduled in, it borrows the previous process's mm, which is kept in active_mm.

2. If the parent is not a kernel thread and the CLONE_VM flag is set, we are creating a user thread that shares the parent's address space, so the parent's mm is simply assigned to the child thread (a minimal clone() sketch follows this list).

3. Otherwise we are creating a full process with its own address space, so a new mm_struct must be built. Following the copy-on-write principle, Linux starts by merely copying the parent's page tables.
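
To make case 2 concrete, here is a minimal user-space sketch (illustrative only, not from the kernel sources above) that creates a child through the raw clone() wrapper; passing CLONE_VM is exactly what steers copy_mm into the shared-mm branch:

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>

#define STACK_SIZE (1024 * 1024)

static int shared;

static int child_fn(void *arg)
{
	shared = 42; //runs in the parent's mm: copy_mm took the CLONE_VM branch
	return 0;
}

int main(void)
{
	char *stack = malloc(STACK_SIZE);
	if (!stack)
		return 1;
	//the stack grows down, so pass the top of the allocation
	pid_t pid = clone(child_fn, stack + STACK_SIZE, CLONE_VM | SIGCHLD, NULL);
	if (pid < 0)
		return 1;
	waitpid(pid, NULL, 0);
	printf("shared = %d\n", shared); //prints 42: the mm was shared, not copied
	free(stack);
	return 0;
}

Drop CLONE_VM from the flags and the same program prints 0: the child then gets the copy-on-write duplicate built in case 3.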

Below we focus on the third case and look at how the child process copies the parent's page tables.

The basic logic: in Linux the kernel portion of the address space is identical for all processes and threads, so the kernel page tables are necessarily the same; what differs is the user portion. When a user child process is created, its user-space page tables are copied directly from the parent's, and during the copy the pages they point to are marked write-protected. So at first the parent's and child's user spaces are identical as well. Only when parent or child actually writes to that space does a page fault fire, and the fault handler allocates a fresh page, at which point the parent's and child's user spaces truly go their separate ways.

struct mm_struct *dup_mm(struct task_struct *tsk)
{
	struct mm_struct *mm, *oldmm = current->mm;
	int err;

	if (!oldmm)
		return NULL;

	mm = allocate_mm(); //Allocate an mm_struct
	if (!mm)
		goto fail_nomem;

	memcpy(mm, oldmm, sizeof(*mm));//Copy the contents of the parent's mm wholesale
	mm_init_cpumask(mm);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	mm->pmd_huge_pte = NULL;
#endif
#ifdef CONFIG_NUMA_BALANCING
	mm->first_nid = NUMA_PTE_SCAN_INIT;
#endif
-------------------------------------------------------(1)
	if (!mm_init(mm, tsk))  //Initialize the new mm; this is where the kernel page tables get copied
		goto fail_nomem;

	if (init_new_context(tsk, mm))
		goto fail_nocontext;

	dup_mm_exe_file(oldmm, mm);
--------------------------------------------------------(2)
	err = dup_mmap(mm, oldmm); //Copy the parent's user-space page tables
	if (err)
		goto free_pt;

	mm->hiwater_rss = get_mm_rss(mm);
	mm->hiwater_vm = mm->total_vm;

	if (mm->binfmt && !try_module_get(mm->binfmt->module))
		goto free_pt;

	return mm;

free_pt:
	/* don't put binfmt in mmput, we haven't got module yet */
	mm->binfmt = NULL;
	mmput(mm);

fail_nomem:
	return NULL;

fail_nocontext:
	/*
	 * If init_new_context() failed, we cannot use mmput() to free the mm
	 * because it calls destroy_context()
	 */
	mm_free_pgd(mm);
	free_mm(mm);
	return NULL;
}

The two important functions above are mm_init and dup_mmap, described in turn below.

1.1 mm_init

mm_init's main role here is to copy the kernel page tables.

static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
{
	atomic_set(&mm->mm_users, 1);
	atomic_set(&mm->mm_count, 1);
	init_rwsem(&mm->mmap_sem);
	INIT_LIST_HEAD(&mm->mmlist);
	mm->flags = (current->mm) ?
		(current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
	mm->core_state = NULL;
	mm->nr_ptes = 0;
	memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
	spin_lock_init(&mm->page_table_lock);
	mm->free_area_cache = TASK_UNMAPPED_BASE;
	mm->cached_hole_size = ~0UL;
	mm_init_aio(mm);
	mm_init_owner(mm, p);

	if (likely(!mm_alloc_pgd(mm))) { //This call is what copies the kernel page tables
		mm->def_flags = 0;
		mmu_notifier_mm_init(mm);
		return mm;
	}

	free_mm(mm);
	return NULL;
}

mm_init

    --------->mm_alloc_pgd

static inline int mm_alloc_pgd(struct mm_struct *mm)
{
	mm->pgd = pgd_alloc(mm);//Store the new page-table base address in mm->pgd
	if (unlikely(!mm->pgd))
		return -ENOMEM;
	return 0;
}

pgd_alloc allocates space for the page directory (the first-level table) and copies the kernel entries into it.

mm_init

    --------->mm_alloc_pgd

        ------------->pgd_alloc

#define __pgd_alloc()	(pgd_t *)__get_free_pages(GFP_KERNEL, 2)
pgd_t *pgd_alloc(struct mm_struct *mm)
{
	pgd_t *new_pgd, *init_pgd;
	pud_t *new_pud, *init_pud;
	pmd_t *new_pmd, *init_pmd;
	pte_t *new_pte, *init_pte;

	new_pgd = __pgd_alloc();//Allocate the page directory: order 2 gives 16KB; each entry maps 1MB, so the 4096 entries exactly cover the 4GB address space
	if (!new_pgd)
		goto no_pgd;
//First zero all the user-space entries, i.e. everything below the kernel boundary (0xc0000000 minus the 16MB module area)
	memset(new_pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t));

	/*
	 * Copy over the kernel and IO PGD entries
	 */
	init_pgd = pgd_offset_k(0);//Get the base of the page directory recorded in init_mm, set up for the init task at boot; since every process's kernel page tables are identical, the init task's serves as the master copy
//Copy the kernel directory entries, covering (0xc0000000 - 16MB) up to 0xffffffff
	memcpy(new_pgd + USER_PTRS_PER_PGD, init_pgd + USER_PTRS_PER_PGD,
		       (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));

	clean_dcache_area(new_pgd, PTRS_PER_PGD * sizeof(pgd_t));

#ifdef CONFIG_ARM_LPAE //Not defined in this configuration
	/*
	 * Allocate PMD table for modules and pkmap mappings.
	 */
	new_pud = pud_alloc(mm, new_pgd + pgd_index(MODULES_VADDR),
			    MODULES_VADDR);
	if (!new_pud)
		goto no_pud;

	new_pmd = pmd_alloc(mm, new_pud, 0);
	if (!new_pmd)
		goto no_pmd;
#endif

	if (!vectors_high()) { //The exception vectors live at the high address here, so this branch is not taken
		/*
		 * On ARM, first page must always be allocated since it
		 * contains the machine vectors. The vectors are always high
		 * with LPAE.
		 */
		new_pud = pud_alloc(mm, new_pgd, 0);
		if (!new_pud)
			goto no_pud;

		new_pmd = pmd_alloc(mm, new_pud, 0);
		if (!new_pmd)
			goto no_pmd;

		new_pte = pte_alloc_map(mm, NULL, new_pmd, 0);
		if (!new_pte)
			goto no_pte;

		init_pud = pud_offset(init_pgd, 0);
		init_pmd = pmd_offset(init_pud, 0);
		init_pte = pte_offset_map(init_pmd, 0);
		set_pte_ext(new_pte, *init_pte, 0);
		pte_unmap(init_pte);
		pte_unmap(new_pte);
	}

	return new_pgd;

no_pte:
	pmd_free(mm, new_pmd);
no_pmd:
	pud_free(mm, new_pud);
no_pud:
	__pgd_free(new_pgd);
no_pgd:
	return NULL;
}

The detailed commentary is in the code above. At this point I had a question: ARM supports two mapping granularities, section mappings and page-table mappings. When the system initializes, physical memory (including the kernel text) uses section mappings, but the exception vector table and register (device) mappings, although their virtual addresses are in kernel space, use two-level page tables. The copy above only duplicates the first-level table, the page directory. Isn't that a problem?

For physical memory under section mappings there is only one level, so copying it is obviously sufficient. And in fact, even for the exception vector table and friends, copying only the first-level directory is fine! If you are unfamiliar with how the kernel page tables are initialized, read this article first: linux3.10 paging_init页表初始化详解_oqqYuJi12345678的博客-CSDN博客

Early page-table initialization contains this function:

static pte_t * __init early_pte_alloc(pmd_t *pmd, unsigned long addr, unsigned long prot)
{
	if (pmd_none(*pmd)) {
		pte_t *pte = early_alloc(PTE_HWTABLE_OFF + PTE_HWTABLE_SIZE);
		__pmd_populate(pmd, __pa(pte), prot);
	}
	BUG_ON(pmd_bad(*pmd));
	return pte_offset_kernel(pmd, addr);
}

When a directory entry is empty, this allocates a second-level page table for it and writes that table's address into the directory entry. The second-level table is allocated through early_alloc, whose returned address is the physical address plus a fixed linear offset; in other words, its virtual address always falls inside the linearly mapped region of physical memory. Suppose physical memory 0x30000000~0x34000000 is mapped at virtual 0xc0000000~0xc4000000 by section mappings. Now replay the scenario: when an exception arrives, its virtual address is certain to find a first-level directory entry, and the second-level table that entry points to has a virtual address inside 0xc0000000~0xc4000000, so the walk can always reach the final physical page.

Seen this way, copying only the first-level directory is indeed safe: the second-level tables' virtual addresses always fall inside the linear mapping of physical memory, and that region was mapped long ago.
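
The linear relationship relied on above is just a constant offset. A sketch using the example addresses from the previous paragraph (the real macros live in the ARM memory headers; 0x30000000 is the assumed RAM base of the scenario, not a universal value):

#define PAGE_OFFSET	0xc0000000UL	//virtual base of the linear map
#define PHYS_OFFSET	0x30000000UL	//physical base of RAM (example value)

//In the linear map, virtual <-> physical conversion is a fixed offset
#define __pa(x)	((unsigned long)(x) - PAGE_OFFSET + PHYS_OFFSET)
#define __va(x)	((void *)((unsigned long)(x) - PHYS_OFFSET + PAGE_OFFSET))

//So a second-level table that early_alloc placed at physical 0x31234000
//is always reachable at virtual 0xc1234000 through the section mappings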

Another point to note: the page-table entries for vmalloc'd memory are not initially present in an ordinary process's page tables. To let all processes share the same kernel space, the entries are written into the init process's page tables (init_mm) when vmalloc allocates the memory (the snippet below is from an older kernel, but the idea is the same):

inline int vmalloc_area_pages (unsigned long address, unsigned long size,
                               int gfp_mask, pgprot_t prot)
{
    pgd_t * dir;
    unsigned long end = address + size;
    int ret;

    dir = pgd_offset_k(address);         // Get the page-directory entry for address in the init process's tables
    spin_lock(&init_mm.page_table_lock); // Lock init_mm
    do {
        pmd_t *pmd;

        pmd = pmd_alloc(&init_mm, dir, address);
        ret = -ENOMEM;
        if (!pmd)
            break;

        ret = -ENOMEM;
        if (alloc_area_pmd(pmd, address, end - address, gfp_mask, prot)) // Map the pages under this directory entry
            break;

        address = (address + PGDIR_SIZE) & PGDIR_MASK;
        dir++;

        ret = 0;
    } while (address && (address < end));
    spin_unlock(&init_mm.page_table_lock);
    return ret;
}

When another process later touches vmalloc'd space, it takes a page fault; inside the fault handler the relevant entries are copied from the init process's page tables, completing the sharing:

index = pgd_index(addr);

	pgd = cpu_get_pgd() + index; //Get the faulting directory entry from the page-table base register
	pgd_k = init_mm.pgd + index;//Get the corresponding entry in the init process's directory

	if (pgd_none(*pgd_k))
		goto bad_area;
	if (!pgd_present(*pgd))
		set_pgd(pgd, *pgd_k);

	pud = pud_offset(pgd, addr);
	pud_k = pud_offset(pgd_k, addr);

	if (pud_none(*pud_k))
		goto bad_area;
	if (!pud_present(*pud))
		set_pud(pud, *pud_k);

	pmd = pmd_offset(pud, addr);
	pmd_k = pmd_offset(pud_k, addr);

#ifdef CONFIG_ARM_LPAE
	/*
	 * Only one hardware entry per PMD with LPAE.
	 */
	index = 0;
#else
	/*
	 * On ARM one Linux PGD entry contains two hardware entries (see page
	 * tables layout in pgtable.h). We normally guarantee that we always
	 * fill both L1 entries. But create_mapping() doesn't follow the rule.
	 * It can create inidividual L1 entries, so here we have to call
	 * pmd_none() check for the entry really corresponded to address, not
	 * for the first of pair.
	 */
	index = (addr >> SECTION_SHIFT) & 1;
#endif
	if (pmd_none(pmd_k[index]))
		goto bad_area;

	copy_pmd(pmd, pmd_k);

1.2 dup_mmap

This function's main job is to copy the user-space page tables.

static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
{
	struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
	struct rb_node **rb_link, *rb_parent;
	int retval;
	unsigned long charge;
	struct mempolicy *pol;

	uprobe_start_dup_mmap();
	down_write(&oldmm->mmap_sem);
	flush_cache_dup_mm(oldmm);
	uprobe_dup_mmap(oldmm, mm);
	/*
	 * Not linked in yet - no deadlock potential:
	 */
	down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);

	mm->locked_vm = 0;
	mm->mmap = NULL;
	mm->mmap_cache = NULL;
	mm->free_area_cache = oldmm->mmap_base;
	mm->cached_hole_size = ~0UL;
	mm->map_count = 0;
	cpumask_clear(mm_cpumask(mm));
	mm->mm_rb = RB_ROOT;
	rb_link = &mm->mm_rb.rb_node;
	rb_parent = NULL;
	pprev = &mm->mmap;
	retval = ksm_fork(mm, oldmm);
	if (retval)
		goto out;
	retval = khugepaged_fork(mm, oldmm);
	if (retval)
		goto out;

	prev = NULL;
	for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { //Walk the parent's list of VMAs
		struct file *file;

		if (mpnt->vm_flags & VM_DONTCOPY) {
			vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
							-vma_pages(mpnt));
			continue;
		}
		charge = 0;
		if (mpnt->vm_flags & VM_ACCOUNT) {
			unsigned long len = vma_pages(mpnt);

			if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
				goto fail_nomem;
			charge = len;
		}
		tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);//Allocate a new vm_area_struct for the child
		if (!tmp)
			goto fail_nomem;
		*tmp = *mpnt;//Copy the parent's VMA into it
		INIT_LIST_HEAD(&tmp->anon_vma_chain);
		pol = mpol_dup(vma_policy(mpnt));
		retval = PTR_ERR(pol);
		if (IS_ERR(pol))
			goto fail_nomem_policy;
		vma_set_policy(tmp, pol);
		tmp->vm_mm = mm;
		if (anon_vma_fork(tmp, mpnt))
			goto fail_nomem_anon_vma_fork;
		tmp->vm_flags &= ~VM_LOCKED;
		tmp->vm_next = tmp->vm_prev = NULL;
		file = tmp->vm_file;
		if (file) { //The parent's VMA maps a file; not discussed here
			struct inode *inode = file_inode(file);
			struct address_space *mapping = file->f_mapping;

			get_file(file);
			if (tmp->vm_flags & VM_DENYWRITE)
				atomic_dec(&inode->i_writecount);
			mutex_lock(&mapping->i_mmap_mutex);
			if (tmp->vm_flags & VM_SHARED)
				mapping->i_mmap_writable++;
			flush_dcache_mmap_lock(mapping);
			/* insert tmp into the share list, just after mpnt */
			if (unlikely(tmp->vm_flags & VM_NONLINEAR))
				vma_nonlinear_insert(tmp,
						&mapping->i_mmap_nonlinear);
			else
				vma_interval_tree_insert_after(tmp, mpnt,
							&mapping->i_mmap);
			flush_dcache_mmap_unlock(mapping);
			mutex_unlock(&mapping->i_mmap_mutex);
		}

		/*
		 * Clear hugetlb-related page reserves for children. This only
		 * affects MAP_PRIVATE mappings. Faults generated by the child
		 * are not guaranteed to succeed, even if read-only
		 */
		if (is_vm_hugetlb_page(tmp))
			reset_vma_resv_huge_pages(tmp);

		/*
		 * Link in the new vma and copy the page table entries.
		 */
		*pprev = tmp; //Link the newly allocated VMA into the child's mmap list
		pprev = &tmp->vm_next;
		tmp->vm_prev = prev;
		prev = tmp;

		__vma_link_rb(mm, tmp, rb_link, rb_parent);
		rb_link = &tmp->vm_rb.rb_right;
		rb_parent = &tmp->vm_rb;

		mm->map_count++;
		retval = copy_page_range(mm, oldmm, mpnt);//Copy the page tables backing this VMA

		if (tmp->vm_ops && tmp->vm_ops->open)
			tmp->vm_ops->open(tmp);

		if (retval)
			goto out;
	}
	/* a new mm has just been created */
	arch_dup_mmap(oldmm, mm);
	retval = 0;
out:
	up_write(&mm->mmap_sem);
	flush_tlb_mm(oldmm);
	up_write(&oldmm->mmap_sem);
	uprobe_end_dup_mmap();
	return retval;
fail_nomem_anon_vma_fork:
	mpol_put(pol);
fail_nomem_policy:
	kmem_cache_free(vm_area_cachep, tmp);
fail_nomem:
	retval = -ENOMEM;
	vm_unacct_memory(charge);
	goto out;
}

dup_mmap's main job is to replicate, one by one, the parent's user-level virtual areas, which are recorded in the mmap list of mm_struct, calling copy_page_range on each VMA to copy the corresponding page tables.

dup_mmap

   -------------->copy_page_range

int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		struct vm_area_struct *vma)
{
	pgd_t *src_pgd, *dst_pgd;
	unsigned long next;
	unsigned long addr = vma->vm_start;
	unsigned long end = vma->vm_end;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */
	bool is_cow;
	int ret;

	/*
	 * Don't copy ptes where a page fault will fill them correctly.
	 * Fork becomes much lighter when there are big shared or private
	 * readonly mappings. The tradeoff is that copy_page_range is more
	 * efficient than faulting.
	 */
	if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR |
			       VM_PFNMAP | VM_MIXEDMAP))) {
		if (!vma->anon_vma)
			return 0;
	}

	if (is_vm_hugetlb_page(vma))
		return copy_hugetlb_page_range(dst_mm, src_mm, vma);

	if (unlikely(vma->vm_flags & VM_PFNMAP)) {
		/*
		 * We do not free on error cases below as remove_vma
		 * gets called on error from higher level routine
		 */
		ret = track_pfn_copy(vma);
		if (ret)
			return ret;
	}

	/*
	 * We need to invalidate the secondary MMU mappings only when
	 * there could be a permission downgrade on the ptes of the
	 * parent mm. And a permission downgrade will only happen if
	 * is_cow_mapping() returns true.
	 */
	is_cow = is_cow_mapping(vma->vm_flags);
	mmun_start = addr; //Start of the virtual range
	mmun_end   = end; //End of the virtual range
	if (is_cow)
		mmu_notifier_invalidate_range_start(src_mm, mmun_start,
						    mmun_end);

	ret = 0;
	dst_pgd = pgd_offset(dst_mm, addr);//The child's page-directory entry for addr
	src_pgd = pgd_offset(src_mm, addr);//The parent's page-directory entry for addr
	do {
		next = pgd_addr_end(addr, end);//Compute where the next directory entry's range begins
		if (pgd_none_or_clear_bad(src_pgd))
			continue;
		if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
					    vma, addr, next))) {
			ret = -ENOMEM;
			break;
		}
	} while (dst_pgd++, src_pgd++, addr = next, addr != end);
//This loop walks the virtual range, locates the parent's entries, and copies them one by one so the child's entries point at the same physical pages
	if (is_cow)
		mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end);
	return ret;
}

Since pgd, pud and pmd are all the same here (ARM's two-level paging folds the middle levels away), we can go straight to the innermost function:

copy_page_range

    -------------->copy_pud_range

        ------------------>copy_pmd_range

               ------------------->copy_pte_range

int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		   pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
		   unsigned long addr, unsigned long end)
{
	pte_t *orig_src_pte, *orig_dst_pte;
	pte_t *src_pte, *dst_pte;
	spinlock_t *src_ptl, *dst_ptl;
	int progress = 0;
	int rss[NR_MM_COUNTERS];
	swp_entry_t entry = (swp_entry_t){0};

again:
	init_rss_vec(rss);
-----------------------------------------------(1)
//Allocate a page table for this directory entry in the child
	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
	if (!dst_pte)
		return -ENOMEM;
	src_pte = pte_offset_map(src_pmd, addr);//Locate the parent's page-table entry for addr under this directory entry
	src_ptl = pte_lockptr(src_mm, src_pmd);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
	orig_src_pte = src_pte;
	orig_dst_pte = dst_pte;
	arch_enter_lazy_mmu_mode();

	do {
		/*
		 * We are holding two locks at this point - either of them
		 * could generate latencies in another task on another CPU.
		 */
		if (progress >= 32) {
			progress = 0;
			if (need_resched() ||
			    spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
				break;
		}
		if (pte_none(*src_pte)) { //Empty entry, skip it
			progress++;
			continue;
		}
----------------------------------------------------------(2)
//Copy this page-table entry
		entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
							vma, addr, rss);
		if (entry.val)
			break;
		progress += 8;
	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);//Advance one page at a time until every entry under this directory entry is copied

	arch_leave_lazy_mmu_mode();
	spin_unlock(src_ptl);
	pte_unmap(orig_src_pte);
	add_mm_rss_vec(dst_mm, rss);
	pte_unmap_unlock(orig_dst_pte, dst_ptl);
	cond_resched();

	if (entry.val) {
		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
			return -ENOMEM;
		progress = 0;
	}
	if (addr != end)
		goto again;
	return 0;
}

(1) Allocate space for the page table belonging to this directory entry, and write the table's address into the directory entry:

#define pte_alloc_map_lock(mm, pmd, address, ptlp)	\
	((unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, NULL,	\
							pmd, address))?	\
		NULL: pte_offset_map_lock(mm, pmd, address, ptlp))
int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
		pmd_t *pmd, unsigned long address)
{
	pgtable_t new = pte_alloc_one(mm, address);//Allocate the page-table page
	int wait_split_huge_page;
	if (!new)
		return -ENOMEM;

	/*
	 * Ensure all pte setup (eg. pte page lock and page clearing) are
	 * visible before the pte is made visible to other CPUs by being
	 * put into page tables.
	 *
	 * The other side of the story is the pointer chasing in the page
	 * table walking code (when walking the page table without locking;
	 * ie. most of the time). Fortunately, these data accesses consist
	 * of a chain of data-dependent loads, meaning most CPUs (alpha
	 * being the notable exception) will already guarantee loads are
	 * seen in-order. See the alpha page table accessors for the
	 * smp_read_barrier_depends() barriers in page table walking code.
	 */
	smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */

	spin_lock(&mm->page_table_lock);
	wait_split_huge_page = 0;
	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
		mm->nr_ptes++;
		pmd_populate(mm, pmd, new);//Write the new table's address into the directory entry
		new = NULL;
	} else if (unlikely(pmd_trans_splitting(*pmd)))
		wait_split_huge_page = 1;
	spin_unlock(&mm->page_table_lock);
	if (new)
		pte_free(mm, new);
	if (wait_split_huge_page)
		wait_split_huge_page(vma->anon_vma, pmd);
	return 0;
}

(2) copy_one_pte writes the physical address held in the parent's page-table entry into the child's entry, so that the same virtual address in parent and child maps to the same physical page; it also rewrites the mapping as write-protected.

static inline unsigned long
copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
		unsigned long addr, int *rss)
{
	unsigned long vm_flags = vma->vm_flags;
	pte_t pte = *src_pte;
	struct page *page;

	/* pte contains position in swap or file, so copy. */
	if (unlikely(!pte_present(pte))) {//The page is not resident (swapped out); handled separately, not examined here
		if (!pte_file(pte)) {
			swp_entry_t entry = pte_to_swp_entry(pte);

			if (swap_duplicate(entry) < 0)
				return entry.val;

			/* make sure dst_mm is on swapoff's mmlist. */
			if (unlikely(list_empty(&dst_mm->mmlist))) {
				spin_lock(&mmlist_lock);
				if (list_empty(&dst_mm->mmlist))
					list_add(&dst_mm->mmlist,
						 &src_mm->mmlist);
				spin_unlock(&mmlist_lock);
			}
			if (likely(!non_swap_entry(entry)))
				rss[MM_SWAPENTS]++;
			else if (is_migration_entry(entry)) {
				page = migration_entry_to_page(entry);

				if (PageAnon(page))
					rss[MM_ANONPAGES]++;
				else
					rss[MM_FILEPAGES]++;

				if (is_write_migration_entry(entry) &&
				    is_cow_mapping(vm_flags)) {
					/*
					 * COW mappings require pages in both
					 * parent and child to be set to read.
					 */
					make_migration_entry_read(&entry);
					pte = swp_entry_to_pte(entry);
					set_pte_at(src_mm, addr, src_pte, pte);
				}
			}
		}
		goto out_set_pte;
	}

	/*
	 * If it's a COW mapping, write protect it both
	 * in the parent and the child
	 */
	if (is_cow_mapping(vm_flags)) { //If write protection is allowed for this mapping, apply it
		ptep_set_wrprotect(src_mm, addr, src_pte);
		pte = pte_wrprotect(pte);
	}

	/*
	 * If it's a shared mapping, mark it clean in
	 * the child
	 */
	if (vm_flags & VM_SHARED)
		pte = pte_mkclean(pte);
	pte = pte_mkold(pte);

	page = vm_normal_page(vma, addr, pte);
	if (page) {
		get_page(page);
		page_dup_rmap(page);
		if (PageAnon(page))
			rss[MM_ANONPAGES]++;
		else
			rss[MM_FILEPAGES]++;
	}

out_set_pte:
	set_pte_at(dst_mm, addr, dst_pte, pte);//Install the physical address from the parent's entry into the child's
	return 0;
}

With the steps above, the copy of the parent's user space is complete.
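
The observable effect of all this machinery can be verified from user space in a few lines (plain POSIX, independent of the kernel version discussed here):

#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	int *data = malloc(sizeof(*data));
	*data = 1;

	pid_t pid = fork(); //dup_mmap copies the VMAs and write-protects the PTEs
	if (pid == 0) {
		*data = 2; //faults on the write-protected PTE; the child gets its own page
		exit(0);
	}
	waitpid(pid, NULL, 0);
	printf("parent sees %d\n", *data); //prints 1: the pages diverged on the child's write
	return 0;
}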

2 copy_thread

copy_thread is responsible for copying the task's register context. This code has several interesting aspects, and it also reveals where a newly created kernel thread or user process begins executing.

int
copy_thread(unsigned long clone_flags, unsigned long stack_start,
	    unsigned long stk_sz, struct task_struct *p)
{
	struct thread_info *thread = task_thread_info(p);
	struct pt_regs *childregs = task_pt_regs(p);
	memset(&thread->cpu_context, 0, sizeof(struct cpu_context_save));
---------------------------------------------------------------------------(1)
	if (likely(!(p->flags & PF_KTHREAD))) { //Not a kernel thread
		*childregs = *current_pt_regs(); //Copy the parent's user-mode registers, used for the return to user space
		childregs->ARM_r0 = 0; //The child's return value is 0
		if (stack_start)
			childregs->ARM_sp = stack_start;
	} else {
----------------------------------------------------------------------------(2)
		memset(childregs, 0, sizeof(struct pt_regs));//A kernel thread never returns to user space
		thread->cpu_context.r4 = stk_sz;//The kernel thread's argument
		thread->cpu_context.r5 = stack_start;//The address of the function the kernel thread will execute
		childregs->ARM_cpsr = SVC_MODE;
	}
-----------------------------------------------------------------------------(3)
	thread->cpu_context.pc = (unsigned long)ret_from_fork;//Where the new task starts executing when first switched in
	thread->cpu_context.sp = (unsigned long)childregs;

	clear_ptrace_hw_breakpoint(p);

	if (clone_flags & CLONE_SETTLS)
		thread->tp_value = childregs->ARM_r3;

	thread_notify(THREAD_NOTIFY_COPY, thread);

	return 0;
}

cpu_context holds the task's kernel-mode register context. That is, whenever schedule() switches tasks, the heart of the switch is restoring the next task's registers from its cpu_context into the CPU while saving the current task's register values into the cpu_context in its thread_info, which completes the task switch.
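
For reference, the ARM definition of that saved context (from arch/arm/include/asm/thread_info.h in this kernel generation) contains only the callee-saved registers plus sp and pc, because a task is always switched out from inside a function call:

struct cpu_context_save {
	__u32	r4;
	__u32	r5;
	__u32	r6;
	__u32	r7;
	__u32	r8;
	__u32	r9;
	__u32	sl;
	__u32	fp;
	__u32	sp;
	__u32	pc;
	__u32	extra[2];		//Xscale 'acc' register, etc.
};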

childregs, by contrast, holds the registers captured when the task traps from user mode into kernel mode. On every entry into the kernel, the user-mode registers are saved in a frame just below the top of the kernel stack, and right before returning to user mode they are restored from there.
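
On ARM that frame is struct pt_regs, essentially 18 words with named accessors (from the ARM ptrace headers, abridged here):

struct pt_regs {
	unsigned long uregs[18];
};

#define ARM_cpsr	uregs[16]	//saved user-mode cpsr
#define ARM_pc		uregs[15]
#define ARM_lr		uregs[14]
#define ARM_sp		uregs[13]	//user-mode stack pointer
//... ARM_r12 down to ARM_r1 are uregs[12]..uregs[1] ...
#define ARM_r0		uregs[0]	//the syscall return-value slot
#define ARM_ORIG_r0	uregs[17]	//r0 as it was on syscall entry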

Let's analyze it in parts.

(1) If we are creating a user process, the child must ultimately return to user mode and resume in the same code as the parent, so the parent's saved register frame is copied into the child. After the parent executes the fork system call, the task that sees a return value of 0 is the child; setting childregs->ARM_r0 = 0 is exactly what implements that.
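
That single assignment is what the familiar user-space idiom relies on:

#include <stdio.h>
#include <unistd.h>

int main(void)
{
	pid_t pid = fork();
	if (pid == 0)
		printf("child: fork returned 0 (ARM_r0 was cleared by copy_thread)\n");
	else
		printf("parent: fork returned the child's pid %d\n", pid);
	return 0;
}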

For how a process returns to user space, see the following article:

linux 系统调用(二)源码分析_oqqYuJi12345678的博客-CSDN博客

(2) The API for creating a kernel thread is:

pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
	return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
		(unsigned long)arg, NULL, NULL);
}

You can see that the kernel thread's function is passed in as stack_start: thread->cpu_context.r5 holds the function the kernel thread will execute, and thread->cpu_context.r4 holds its argument.
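
kernel_thread itself is not exported to modules; driver code normally reaches the same do_fork path through the kthread API. A minimal module-style sketch, assuming the standard kthread_run/kthread_stop interface:

#include <linux/delay.h>
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/module.h>

static struct task_struct *worker;

static int worker_fn(void *data)
{
	//Runs in a fresh kernel thread created through the do_fork path above
	while (!kthread_should_stop()) {
		pr_info("worker tick\n");
		msleep(1000);
	}
	return 0;
}

static int __init demo_init(void)
{
	worker = kthread_run(worker_fn, NULL, "demo-worker");
	return IS_ERR(worker) ? PTR_ERR(worker) : 0;
}

static void __exit demo_exit(void)
{
	kthread_stop(worker);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");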

(3) thread->cpu_context.pc = (unsigned long)ret_from_fork; sets the pc of the child's kernel context to ret_from_fork. In other words, when the scheduler first switches to this child, the first code it executes is ret_from_fork.

ENTRY(ret_from_fork)
	bl	schedule_tail //Finish up after the switch: the cleanup that normally runs after switch_to in schedule() has to be done here, since we do not return along that path
	cmp	r5, #0 //r5 != 0 means this is a kernel thread, which must go run the function it was given
	movne	r0, r4
	adrne	lr, BSYM(1f)
	movne	pc, r5 //Jump to the kernel thread's function
1:	get_thread_info tsk
	b	ret_slow_syscall //User-process path: the return code below is essentially the same as an ordinary syscall's return to user space
ENDPROC(ret_from_fork)


ret_slow_syscall:
	disable_irq				@ disable interrupts
ENTRY(ret_to_user_from_irq)
	ldr	r1, [tsk, #TI_FLAGS]
	tst	r1, #_TIF_WORK_MASK
	bne	work_pending
no_work_pending:
	asm_trace_hardirqs_on

	/* perform architecture specific actions before user return */
	arch_ret_to_user r1, lr
	ct_user_enter save = 0

	restore_user_regs fast = 0, offset = 0
ENDPROC(ret_to_user_from_irq)

There is one more interesting line further down in copy_thread:

thread->cpu_context.sp = (unsigned long)childregs;

We know that the kernel stack pointer starts 8 bytes below the top of the pages that the task's struct thread_info lives in. First, look at how childregs is obtained:

struct pt_regs *childregs = task_pt_regs(p);
#define task_pt_regs(p) \
	((struct pt_regs *)(THREAD_START_SP + task_stack_page(p)) - 1)
#define THREAD_SIZE		8192
#define THREAD_START_SP		(THREAD_SIZE - 8)
#define task_stack_page(task)	((task)->stack)

So childregs sits 8 bytes below the top of the thread_info pages, then one struct pt_regs further down. Assigning this address to the kernel context's sp seems to contradict the earlier statement that the kernel stack is offset by only 8 bytes. The explanation is in the user-to-kernel trap path, which contains this code:

ENTRY(vector_swi)
	sub	sp, sp, #S_FRAME_SIZE   //After the trap, sp is already the SVC-mode kernel stack; first move it down S_FRAME_SIZE bytes to make room for the saved frame
	stmia	sp, {r0 - r12}			@ Calling r0 - r12  //Store r0-r12 into the frame; 'ia' means increment-after, so r0 lands at the lowest address and sp itself is not updated
 ARM(	add	r8, sp, #S_PC		)  //Point r8 at sp + S_PC
 ARM(	stmdb	r8, {sp, lr}^		)	@ Calling sp, lr 

You can see that on kernel entry, before the user-mode registers are saved, sp is first moved down by S_FRAME_SIZE, which is exactly sizeof(struct pt_regs). Because a newly created task begins life in kernel mode, copy_thread has to apply that pt_regs-sized offset to the task's sp by hand. When the task eventually leaves kernel mode for user mode, the offset is undone, and the kernel stack once again sits just 8 bytes below the top of the pages.

.macro	restore_user_regs, fast = 0, offset = 0
	ldr	r1, [sp, #\offset + S_PSR]	//Index \offset + S_PSR from sp to find the saved user-mode cpsr	@ get calling cpsr
	ldr	lr, [sp, #\offset + S_PC]!	//Load the saved pc into lr; the '!' also updates sp to point at S_PC	@ get pc
	msr	spsr_cxsf, r1		//Put the user-mode cpsr into the SVC-mode spsr, ready for the return to user mode	@ save in spsr_svc
#if defined(CONFIG_CPU_V6)
	strex	r1, r2, [sp]			@ clear the exclusive monitor
#elif defined(CONFIG_CPU_32v6K)
	clrex					@ clear the exclusive monitor
#endif
	.if	\fast
	ldmdb	sp, {r1 - lr}^		//fast == 1 takes this path, restoring the user-mode registers	@ get calling r1 - lr
	.else
	ldmdb	sp, {r0 - lr}^			@ get calling r0 - lr
	.endif
	mov	r0, r0				@ ARMv5T and earlier require a nop
						@ after ldm {}^
	add	sp, sp, #S_FRAME_SIZE - S_PC  //Move the kernel sp back to where it was on kernel entry, so the kernel stack is always clean on each entry to the kernel
	movs	pc, lr		//Move lr into pc to resume at the instruction after the trap; the 's' suffix also copies the SVC-mode spsr into cpsr, switching back to user mode		@ return & move spsr_svc into cpsr
	.endm
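
To tie the numbers together, the layout arithmetic can be checked with a tiny user-space program (the 72-byte frame is an assumption: 18 saved words of 4 bytes each, matching the pt_regs layout shown earlier):

#include <stdio.h>

#define THREAD_SIZE	8192			//two pages, as in the kernel code above
#define THREAD_START_SP	(THREAD_SIZE - 8)	//8184: sp top, 8 bytes below the page end
#define PT_REGS_SIZE	(18 * 4)		//assumed sizeof(struct pt_regs) on ARM

int main(void)
{
	unsigned long stack = 0;	//pretend task_stack_page(p) returned 0
	unsigned long childregs = stack + THREAD_START_SP - PT_REGS_SIZE;

	//copy_thread sets cpu_context.sp = childregs (8112 here); once
	//restore_user_regs has run, the kernel sp is back at 8184
	printf("sp top    = %d\n", THREAD_START_SP);	//8184
	printf("childregs = %lu\n", childregs);		//8112
	return 0;
}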
