Linux process address space (1): fork/clone/vfork explained (2)

Continuing from the previous post, the source of dup_mmap is as follows:

static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
{
        struct vm_area_struct *mpnt, *tmp, **pprev;
        struct rb_node **rb_link, *rb_parent;
        int retval;
        unsigned long charge;
        struct mempolicy *pol;

        down_write(&oldmm->mmap_sem);
        flush_cache_dup_mm(oldmm);
        /*
         * Not linked in yet - no deadlock potential:
         */
        down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);

        mm->locked_vm = 0;
        mm->mmap = NULL;
        mm->mmap_cache = NULL;
        mm->free_area_cache = oldmm->mmap_base;
        mm->cached_hole_size = ~0UL;
        mm->map_count = 0;
        cpumask_clear(mm_cpumask(mm));
        mm->mm_rb = RB_ROOT;
        rb_link = &mm->mm_rb.rb_node;
        rb_parent = NULL;
        pprev = &mm->mmap;

        retval = ksm_fork(mm, oldmm);
        if (retval)
                goto out;

        /* walk every vma of the parent, preparing to copy it */
        for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
                struct file *file;

                /* vmas that must not be copied: skip */
                if (mpnt->vm_flags & VM_DONTCOPY) {
                        long pages = vma_pages(mpnt);
                        mm->total_vm -= pages;
                        vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
                                                                -pages);
                        continue;
                }
                charge = 0;
                if (mpnt->vm_flags & VM_ACCOUNT) {
                        unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
                        if (security_vm_enough_memory(len))
                                goto fail_nomem;
                        charge = len;
                }
                /* note: allocate a fresh vma from the slab cache; this one is for the child */
                tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
                if (!tmp)
                        goto fail_nomem;
                /* copy the contents of the parent's vma into it */
                *tmp = *mpnt;
                pol = mpol_dup(vma_policy(mpnt));
                retval = PTR_ERR(pol);
                if (IS_ERR(pol))
                        goto fail_nomem_policy;
                vma_set_policy(tmp, pol);
                /* don't inherit the locked state */
                tmp->vm_flags &= ~VM_LOCKED;
                /* point back at our own (the child's) mm */
                tmp->vm_mm = mm;
                tmp->vm_next = NULL;
                anon_vma_link(tmp);
                /* check whether this vma is a file mapping */
                file = tmp->vm_file;
                if (file) {
                        struct inode *inode = file->f_path.dentry->d_inode;
                        struct address_space *mapping = file->f_mapping;

                        get_file(file);
                        if (tmp->vm_flags & VM_DENYWRITE)
                                atomic_dec(&inode->i_writecount);
                        spin_lock(&mapping->i_mmap_lock);
                        if (tmp->vm_flags & VM_SHARED)
                                mapping->i_mmap_writable++;
                        tmp->vm_truncate_count = mpnt->vm_truncate_count;
                        flush_dcache_mmap_lock(mapping);
                        /* insert tmp into the share list, just after mpnt */
                        vma_prio_tree_add(tmp, mpnt);
                        flush_dcache_mmap_unlock(mapping);
                        spin_unlock(&mapping->i_mmap_lock);
                }

                /*
                 * Clear hugetlb-related page reserves for children. This only
                 * affects MAP_PRIVATE mappings. Faults generated by the child
                 * are not guaranteed to succeed, even if read-only
                 */
                if (is_vm_hugetlb_page(tmp))
                        reset_vma_resv_huge_pages(tmp);

                /*
                 * Link in the new vma and copy the page table entries.
                 */
                /* hook tmp into the child's vma list, and advance pprev ready for the next node */
                *pprev = tmp;
                pprev = &tmp->vm_next;
                /* insert into the red-black tree */
                __vma_link_rb(mm, tmp, rb_link, rb_parent);
                rb_link = &tmp->vm_rb.rb_right;
                rb_parent = &tmp->vm_rb;
                /* bump the child mm's vma count */
                mm->map_count++;
                /* arguments: child mm, parent mm, one vma of the parent;
                   copy the page-table entries covering mpnt's range into the child's mm */
                retval = copy_page_range(mm, oldmm, mpnt);

                if (tmp->vm_ops && tmp->vm_ops->open)
                        tmp->vm_ops->open(tmp);

                if (retval)
                        goto out;
        }
        /* a new mm has just been created */
        arch_dup_mmap(oldmm, mm);
        retval = 0;
out:
        up_write(&mm->mmap_sem);
        flush_tlb_mm(oldmm);
        up_write(&oldmm->mmap_sem);
        return retval;
fail_nomem_policy:
        kmem_cache_free(vm_area_cachep, tmp);
fail_nomem:
        retval = -ENOMEM;
        vm_unacct_memory(charge);
        goto out;
}
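As an aside, the VM_DONTCOPY branch above is reachable from userspace. A hedged example (MADV_DONTFORK is the standard madvise flag that sets VM_DONTCOPY on a range; the buffer names are made up):

#include <sys/mman.h>

/* sketch: exclude a buffer from fork; dma_buf/dma_len are hypothetical */
static int dont_copy_on_fork(void *dma_buf, size_t dma_len)
{
        /* sets VM_DONTCOPY on the range, so dup_mmap skips it */
        return madvise(dma_buf, dma_len, MADV_DONTFORK);
}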

Large parts of the kernel source are hard to grasp completely at first pass; what matters is understanding the intent and keeping hold of the main thread. Many structures have so many members that they cannot all be understood in a short time, so don't make that the focus.

The lines I've annotated are the ones I currently find most useful, or at least can follow, and they are fairly clear: the for loop copies all of the parent's vmas to the child, and the child does the corresponding initialization. Now focus on copy_page_range, whose job is to copy the parent's page mappings to the child as well. This involves page-table knowledge; if you are rusty, work back from the earlier article on arm-linux page table creation (http://blog.csdn.net/u010246947/article/details/9837147), which describes it fairly clearly, or just read copy_page_range below, after a quick refresher on the ARM layout.
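A minimal sketch of that layout (the constants are my assumption of the classic ARM arch values, worth checking against your tree): each first-level (pgd) entry covers 2MB, and each second-level table holds 512 entries mapping 4KB pages.

#define PAGE_SHIFT   12    /* 4KB pages */
#define PGDIR_SHIFT  21    /* each first-level entry spans 2MB */
#define PTRS_PER_PTE 512   /* 2MB / 4KB entries per second-level table */

/* which 2MB slot of the first-level table covers addr */
static unsigned long pgd_index_of(unsigned long addr)
{
        return addr >> PGDIR_SHIFT;
}

/* which 4KB entry inside that slot's second-level table */
static unsigned long pte_index_of(unsigned long addr)
{
        return (addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
}

These two figures, 2MB and 512, explain the loop granularities you will see below. Now copy_page_range: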

int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                struct vm_area_struct *vma)
{
        pgd_t *src_pgd, *dst_pgd;
        unsigned long next;
        /* note: vma is the parent's vma, so addr and end here are the
           start and end addresses of that parent vma */
        unsigned long addr = vma->vm_start;
        unsigned long end = vma->vm_end;
        int ret;

        /*
         * Don't copy ptes where a page fault will fill them correctly.
         * Fork becomes much lighter when there are big shared or private
         * readonly mappings. The tradeoff is that copy_page_range is more
         * efficient than faulting.
         */
        if (!(vma->vm_flags &
              (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
                if (!vma->anon_vma)
                        return 0;
        }

        if (is_vm_hugetlb_page(vma))
                return copy_hugetlb_page_range(dst_mm, src_mm, vma);

        if (unlikely(is_pfn_mapping(vma))) {
                /*
                 * We do not free on error cases below as remove_vma
                 * gets called on error from higher level routine
                 */
                ret = track_pfn_vma_copy(vma);
                if (ret)
                        return ret;
        }

        /*
         * We need to invalidate the secondary MMU mappings only when
         * there could be a permission downgrade on the ptes of the
         * parent mm. And a permission downgrade will only happen if
         * is_cow_mapping() returns true.
         */
        if (is_cow_mapping(vma->vm_flags))
                /* a no-op unless MMU notifiers are in use */
                mmu_notifier_invalidate_range_start(src_mm, addr, end);

        ret = 0;
        /* the child's and the parent's page tables respectively;
           note that parent and child have different pgds (mm->pgd) */
        dst_pgd = pgd_offset(dst_mm, addr);
        src_pgd = pgd_offset(src_mm, addr);
        /* iteration count unknown up front: it depends on how many 2MB
           steps lie between addr and end */
        do {
                /* on ARM, walk in 2MB units, one section at a time */
                next = pgd_addr_end(addr, end);
                /* on ARM the if below is normally false and not taken */
                if (pgd_none_or_clear_bad(src_pgd))
                        continue;
                if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
                                            vma, addr, next))) {
                        ret = -ENOMEM;
                        break;
                }
        } while (dst_pgd++, src_pgd++, addr = next, addr != end);

        if (is_cow_mapping(vma->vm_flags))
                /* a no-op unless MMU notifiers are in use */
                mmu_notifier_invalidate_range_end(src_mm,
                                                  vma->vm_start, end);
        return ret;
}
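Several decisions above key off is_cow_mapping. For reference, its classic definition (quoted from memory, so verify it in your tree) says a mapping is COW when it may be written but is not shared:

static inline int is_cow_mapping(unsigned int flags)
{
        /* writable in principle (VM_MAYWRITE) but private (no VM_SHARED) */
        return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}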

The first things to note in this function are the two variables addr and end, the start and end virtual addresses of this vma in the parent; what follows copies the virtual-to-physical mappings in that range over to the child's vma. First look at this:

        /* the child's and the parent's page tables respectively;
           note that parent and child have different pgds (mm->pgd) */
        dst_pgd = pgd_offset(dst_mm, addr);
        src_pgd = pgd_offset(src_mm, addr);

As the comment says, these are the pgd members of the parent's and the child's own mm, i.e. each process's first-level page table.
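pgd_offset itself is just index arithmetic on mm->pgd; a sketch of the classic macros (an assumption, check your kernel version):

#define pgd_index(addr)        ((addr) >> PGDIR_SHIFT)
#define pgd_offset(mm, addr)   ((mm)->pgd + pgd_index(addr))
/* dst_pgd and src_pgd therefore hold the same slot index,
   but inside two different first-level tables */

Then comes the loop below: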

        /* iteration count unknown up front: it depends on how many 2MB
           steps lie between addr and end */
        do {
                /* on ARM, walk in 2MB units, one section at a time */
                next = pgd_addr_end(addr, end);
                /* on ARM the if below is normally false and not taken */
                if (pgd_none_or_clear_bad(src_pgd))
                        continue;
                if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
                                            vma, addr, next))) {
                        ret = -ENOMEM;
                        break;
                }
        } while (dst_pgd++, src_pgd++, addr = next, addr != end);
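The 2MB stepping comes from pgd_addr_end, which advances addr to the next first-level boundary but never past end; a sketch of the generic definition (again an assumption to verify):

#define PGDIR_SIZE   (1UL << PGDIR_SHIFT)
#define PGDIR_MASK   (~(PGDIR_SIZE - 1))
#define pgd_addr_end(addr, end)                                         \
({      unsigned long __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK;  \
        (__boundary - 1 < (end) - 1) ? __boundary : (end);              \
})
/* the "- 1" on both sides keeps the comparison correct even if
   end wraps to 0 at the very top of the address space */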

Anyone familiar with page tables will recognize what this is doing: walking from addr to end in 2MB units and calling copy_pud_range for each step, which is why the iteration count is unknown up front. copy_pud_range does the actual copying of the mappings. Linux's four-level mapping collapses into two levels on ARM, so the intermediate calls copy_pud_range and copy_pmd_range amount to pass-throughs that don't affect the result, until copy_pte_range is reached. At that point copy_pte_range's parameters are still the parent's and child's mm, the parent's and child's (folded) pgd entries, the parent's vma, and the vma's start and end addresses. Its source is as follows:

static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
                unsigned long addr, unsigned long end)
{
        pte_t *orig_src_pte, *orig_dst_pte;
        pte_t *src_pte, *dst_pte;
        spinlock_t *src_ptl, *dst_ptl;
        int progress = 0;
        int rss[2];

again:
        rss[1] = rss[0] = 0;
        /* create the child's second-level page table here; more evidence
           that first-level tables are resident while second-level tables
           are allocated on demand */
        dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
        if (!dst_pte)
                return -ENOMEM;
        /* fetch the parent's second-level entry for the current virtual address */
        src_pte = pte_offset_map_nested(src_pmd, addr);
        src_ptl = pte_lockptr(src_mm, src_pmd);
        spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
        orig_src_pte = src_pte;
        orig_dst_pte = dst_pte;
        arch_enter_lazy_mmu_mode();

        /* in the normal case where end - addr is 2MB this loops
           2MB/4KB = 512 times; each copy_one_pte copies one of the parent's
           second-level entries into the child's table, covering 4KB */
        do {
                /*
                 * We are holding two locks at this point - either of them
                 * could generate latencies in another task on another CPU.
                 */
                if (progress >= 32) {
                        progress = 0;
                        if (need_resched() ||
                            spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
                                break;
                }
                /* if the parent's second-level entry maps nothing, go on to the next one */
                if (pte_none(*src_pte)) {
                        progress++;
                        continue;
                }
                copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
                progress += 8;
        } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);

        arch_leave_lazy_mmu_mode();
        spin_unlock(src_ptl);
        pte_unmap_nested(orig_src_pte);
        add_mm_rss(dst_mm, rss[0], rss[1]);
        pte_unmap_unlock(orig_dst_pte, dst_ptl);
        cond_resched();
        if (addr != end)
                goto again;
        return 0;
}
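Before the fragments, a quick sanity check on the figure 512, under the ARM constants sketched earlier:

PGDIR_SIZE / PAGE_SIZE = 2^21 / 2^12 = 2^9 = 512 entries, i.e. one full second-level table per 2MB section.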

First look at this fragment:

        /* create the child's second-level page table here; more evidence
           that first-level tables are resident while second-level tables
           are allocated on demand */
        dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);

This is creating a second-level page table for the child.
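If the child's pmd slot is still empty, pte_alloc_map_lock first allocates a fresh table page and installs it. A simplified sketch of that allocate-if-absent pattern (races and error paths trimmed; the helper names follow 2.6-era kernels but treat the exact signatures as assumptions):

static pte_t *pte_alloc_map_lock_sketch(struct mm_struct *mm, pmd_t *pmd,
                                        unsigned long addr, spinlock_t **ptl)
{
        if (pmd_none(*pmd)) {                            /* no second-level table yet */
                pgtable_t new = pte_alloc_one(mm, addr); /* one page of ptes */
                if (!new)
                        return NULL;
                pmd_populate(mm, pmd, new);              /* point the pmd entry at it */
        }
        *ptl = pte_lockptr(mm, pmd);                     /* page-table lock */
        spin_lock(*ptl);
        return pte_offset_map(pmd, addr);                /* entry for addr */
}

Then look at the next fragment: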

        /* fetch the parent's second-level entry for the current virtual address */
        src_pte = pte_offset_map_nested(src_pmd, addr);

This prepares for write-protecting the parent's second-level entries later: the parent's entry is fetched first, and in a later step (inside copy_one_pte) the write-protect attribute will be added to it. Next comes the loop calling copy_one_pte. It iterates 512 times because, on the path that leads here, addr and end are 2MB apart, and each call to copy_one_pte writes one second-level entry covering 4KB, so 512 calls are needed. The source of copy_one_pte:

static inline void
copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
                unsigned long addr, int *rss)
{
        unsigned long vm_flags = vma->vm_flags;
        pte_t pte = *src_pte;
        struct page *page;

        /* pte contains position in swap or file, so copy. */
        /* if !pte_present(pte), whatever this second-level entry maps
           is not in physical memory */
        if (unlikely(!pte_present(pte))) {
                /* if the page was swapped out to disk, the pte holds its
                   position in the swap file, so just duplicate the swap
                   entry and copy the pte as-is */
                if (!pte_file(pte)) {
                        swp_entry_t entry = pte_to_swp_entry(pte);

                        swap_duplicate(entry);
                        /* make sure dst_mm is on swapoff's mmlist. */
                        if (unlikely(list_empty(&dst_mm->mmlist))) {
                                spin_lock(&mmlist_lock);
                                if (list_empty(&dst_mm->mmlist))
                                        list_add(&dst_mm->mmlist,
                                                 &src_mm->mmlist);
                                spin_unlock(&mmlist_lock);
                        }
                        if (is_write_migration_entry(entry) &&
                                        is_cow_mapping(vm_flags)) {
                                /*
                                 * COW mappings require pages in both parent
                                 * and child to be set to read.
                                 */
                                make_migration_entry_read(&entry);
                                pte = swp_entry_to_pte(entry);
                                set_pte_at(src_mm, addr, src_pte, pte);
                        }
                }
                goto out_set_pte;
        }

        /*
         * If it's a COW mapping, write protect it both
         * in the parent and the child
         */
        /* the crucial step: while the child copies the parent's page
           tables at fork, the page is made write-protected, so that when
           either side later tries to modify it the write is refused and
           COW is triggered */
        if (is_cow_mapping(vm_flags)) {
                /* write-protect the parent's second-level entry for this page */
                ptep_set_wrprotect(src_mm, addr, src_pte);
                /* and write-protect the child's copy of the entry as well */
                pte = pte_wrprotect(pte);
        }

        /*
         * If it's a shared mapping, mark it clean in
         * the child
         */
        if (vm_flags & VM_SHARED)
                pte = pte_mkclean(pte);
        pte = pte_mkold(pte);

        /* from what the second-level entry maps, find the physical page
           and return its page descriptor; returns NULL for the zero page
           (zero_pfn) */
        page = vm_normal_page(vma, addr, pte);
        /* we need the page descriptor so we can bump its _count and
           _mapcount, i.e. the number of users of the page */
        if (page) {
                get_page(page);
                page_dup_rmap(page);
                rss[PageAnon(page)]++;
        }

out_set_pte:
        set_pte_at(dst_mm, addr, dst_pte, pte);
}

This function doesn't actually write the child's second-level entry until the final call to set_pte_at; before that it mainly does the following:

1. Mark this physical page write-protected in both the parent's and the child's second-level entries

        /* the crucial step: while the child copies the parent's page
           tables at fork, the page is made write-protected, so that when
           either side later tries to modify it the write is refused and
           COW is triggered */
        if (is_cow_mapping(vm_flags)) {
                /* write-protect the parent's second-level entry for this page */
                ptep_set_wrprotect(src_mm, addr, src_pte);
                /* and write-protect the child's copy of the entry as well */
                pte = pte_wrprotect(pte);
        }

2. Update some fields of the page's descriptor

        /* from what the second-level entry maps, find the physical page
           and return its page descriptor; returns NULL for the zero page
           (zero_pfn) */
        page = vm_normal_page(vma, addr, pte);
        /* we need the page descriptor so we can bump its _count and
           _mapcount, i.e. the number of users of the page */
        if (page) {
                get_page(page);
                page_dup_rmap(page);
                rss[PageAnon(page)]++;
        }

It is also worth a look at vm_normal_page. Its job: from what the second-level entry maps, find the physical page and return its page descriptor, or NULL if it is the zero page (zero_pfn). You may wonder how the zero page could even come up here; the reason and the use case will become clear later. A simplified sketch of it follows after item 3 below.

3. Write the child's second-level entry: set_pte_at(dst_mm, addr, dst_pte, pte);
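Here is the promised sketch of vm_normal_page (a heavily simplified assumption: the real function also handles the VM_PFNMAP/VM_MIXEDMAP "special" mappings):

static struct page *vm_normal_page_sketch(struct vm_area_struct *vma,
                                          unsigned long addr, pte_t pte)
{
        unsigned long pfn = pte_pfn(pte);       /* physical frame number */

        if (pfn == zero_pfn)                    /* the shared zero page */
                return NULL;
        if (!pfn_valid(pfn))                    /* no page descriptor exists */
                return NULL;
        return pfn_to_page(pfn);                /* its struct page */
}

Returning NULL for the zero page means copy_one_pte bumps no reference counts for it, which is exactly what the zero page wants.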

 

The above is the whole process by which dup_mmap's for loop calls copy_page_range to copy every physical-page mapping inside all of the parent's vmas into the child's vmas. Going back to dup_mmap, also take note of the smaller details, such as inserting the child's vmas into the mm's red-black tree and doubly linked vma list and updating the vma count (the map_count member; a minimal sketch of the tree linking follows). Keeping these in mind helps with the material ahead and with a full understanding of the process address space.
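Because the parent's list is walked in address order, the rb_link/rb_parent bookkeeping in dup_mmap always appends the new vma as the rightmost node. A minimal sketch of the linking step (assumption: essentially what __vma_link_rb boils down to, using the kernel rbtree primitives):

static void link_vma_rightmost(struct mm_struct *mm, struct vm_area_struct *vma,
                               struct rb_node **rb_link, struct rb_node *rb_parent)
{
        rb_link_node(&vma->vm_rb, rb_parent, rb_link); /* hang it on the slot */
        rb_insert_color(&vma->vm_rb, &mm->mm_rb);      /* rebalance/recolor */
}

After it returns, dup_mmap advances rb_link to &tmp->vm_rb.rb_right so the next vma again lands to the right.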

A careful reader will notice that every vma's start and end virtual addresses are identical in parent and child, and that those virtual ranges also map to the same physical pages. The only difference is that when either side tries to write those pages, the MMU refuses the write because the pages are write-protected, raising the page fault that triggers copy-on-write (COW).
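The effect is easy to observe from userspace; a small hypothetical demo (not from the kernel source) in which the child's write faults, gets a private copy, and the parent's data stays intact:

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        int *val = malloc(sizeof(*val));
        *val = 1;                        /* this page is shared after fork */
        pid_t pid = fork();
        if (pid == 0) {                  /* child */
                *val = 2;                /* write fault -> COW gives a private copy */
                printf("child sees %d\n", *val);    /* prints 2 */
                exit(0);
        }
        waitpid(pid, NULL, 0);
        printf("parent sees %d\n", *val);           /* still prints 1 */
        return 0;
}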

So the difference between fork/vfork/clone with and without CLONE_VM comes down to this: with CLONE_VM, the child has neither its own mm nor its own vmas; without CLONE_VM, it does get its own mm and vmas, but their virtual ranges carry no extra physical mappings of their own. The benefit is saved physical memory, and, as ULK explains, after running for a while the parent and child end up with completely different address spaces anyway.
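To make the CLONE_VM distinction concrete, a hypothetical demo using glibc's clone wrapper (error handling omitted): with CLONE_VM the child runs inside the parent's mm, so its write is visible to the parent, which is exactly what fork does not allow.

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/wait.h>

static int shared = 1;

static int child_fn(void *arg)
{
        shared = 2;             /* same mm: the parent will see this */
        return 0;
}

int main(void)
{
        size_t stack_size = 64 * 1024;
        char *stack = malloc(stack_size);
        /* the stack grows down on most arches, so pass its top */
        pid_t pid = clone(child_fn, stack + stack_size,
                          CLONE_VM | SIGCHLD, NULL);
        waitpid(pid, NULL, 0);
        printf("parent sees %d\n", shared);  /* prints 2; plain fork would print 1 */
        free(stack);
        return 0;
}

Drop CLONE_VM from the flags and the child gets its own mm via dup_mmap, so the same write would stay invisible to the parent.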
