内核常常要判断进程地址空间中的内存区域是否满足某些条件,为了方便执行,内核定义了许多辅助函数,它们都声明在linux/mm.h中。
- find_vma()
在mm/mmap.c中
- /*
-  * find_vma - look up the first VMA which satisfies addr < vm_end.
-  *
-  * @mm:   address space to search; NULL is accepted and yields NULL.
-  * @addr: address to look up.
-  *
-  * Returns the matching VMA, or NULL if none. The returned VMA does not
-  * necessarily contain addr: only addr < vm_end is guaranteed, so its
-  * vm_start may lie above addr. A non-NULL result of the tree search is
-  * remembered in mm->mmap_cache for the next lookup.
-  */
- struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
- {
- struct vm_area_struct *found;
- struct rb_node *node;
- if (!mm) {
- return NULL;
- }
- /* Check the cache first: it hits only if the cached VMA contains addr. */
- /* (Cache hit rate is typically around 35%.) */
- found = mm->mmap_cache;
- if (found && found->vm_start <= addr && addr < found->vm_end) {
- return found;
- }
- /* The cache does not hold the wanted VMA, so search the red-black tree. */
- found = NULL;
- for (node = mm->mm_rb.rb_node; node; ) {
- struct vm_area_struct *cur = rb_entry(node, struct vm_area_struct, vm_rb);
- if (cur->vm_end <= addr) {
- /* cur ends at or below addr: it cannot be the answer. */
- node = node->rb_right;
- continue;
- }
- /* cur is a candidate; stop if it actually contains addr. */
- found = cur;
- if (cur->vm_start <= addr) {
- break;
- }
- node = node->rb_left;
- }
- if (found) {
- mm->mmap_cache = found;
- }
- /* First VMA whose vm_end is above addr, or NULL. */
- return found;
- }
该函数在指定的地址空间中搜索第一个vm_end大于addr的内存区域。换句话说,该函数寻找第一个包含addr或首地址大于addr的内存区域。
注意,由于返回的VMA首地址可能大于addr,所以指定的地址并不一定就包含在返回的VMA中。因为很可能在对某个VMA执行操作之后,还有其他操作会对该VMA继续进行,所以该函数返回的结果被缓存在内存描述符mm_struct中的mmap_cache域中。如果指定的地址不在缓存中,那么必须搜索和内存描述符相关的所有内存区域,这种搜索通过红黑树进行。
find_vma_prev()
find_vma_prev()函数和find_vma()工作方式相同(返回值也相同),但是它还会通过pprev参数返回第一个小于addr的VMA,即位于addr之前的最后一个VMA。
- /*
-  * find_vma_prev - same as find_vma, but also return a pointer to the
-  * previous VMA in *pprev.
-  *
-  * @mm:    address space to search; with a NULL mm the function stores
-  *         NULL in *pprev and returns NULL.
-  * @addr:  address to look up.
-  * @pprev: output; receives the last VMA found whose vm_end is at or
-  *         below addr, or NULL when the search finds none.
-  *
-  * Returns the VMA following *pprev, or mm->mmap when *pprev is NULL.
-  */
- struct vm_area_struct *
- find_vma_prev(struct mm_struct *mm, unsigned long addr,
- struct vm_area_struct **pprev)
- {
- struct vm_area_struct *first = NULL, *before = NULL;
- struct rb_node *node;
- if (mm) {
- /* Guard against addr being lower than the first VMA */
- first = mm->mmap;
- /* Go through the RB tree quickly. */
- node = mm->mm_rb.rb_node;
- while (node) {
- struct vm_area_struct *cur = rb_entry(node, struct vm_area_struct, vm_rb);
- if (cur->vm_end > addr) {
- node = node->rb_left;
- continue;
- }
- /* cur ends at or below addr: remember it as the predecessor. */
- before = cur;
- /* Done once the next VMA in the list is the first one ending above addr. */
- if (!before->vm_next || before->vm_next->vm_end > addr) {
- break;
- }
- node = node->rb_right;
- }
- }
- *pprev = before;
- if (before) {
- return before->vm_next;
- }
- return first;
- }
- find_vma_intersection()
find_vma_intersection()函数返回第一个和指定区间相交的VMA。因为该函数是内联函数,故定义在linux/mm.h中:
- /*
-  * find_vma_intersection - look up the first VMA which intersects the
-  * interval start_addr..end_addr-1, NULL if none.
-  *
-  * Assume start_addr < end_addr.
-  */
- static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr)
- {
- struct vm_area_struct *hit = find_vma(mm, start_addr);
- if (!hit) {
- return NULL;
- }
- /*
-  * find_vma() only guarantees start_addr < hit->vm_end; the VMA overlaps
-  * the interval only if it also starts below end_addr.
-  */
- return (hit->vm_start < end_addr) ? hit : NULL;
- }
- mmap()和do_mmap():创建地址空间
内核使用do_mmap()函数创建一个新的线性地址空间。如果说该函数创建了一个新的VMA并不非常准确,因为如果创建的地址区间和一个已经存在的地址空间相邻,并且它们具有同样的访问权限的话,那么两个区间将合并为一个。如果不能合并,那么就确实要创建一个新的VMA了。无论哪种情况,do_mmap()都会将一个地址区间加入到进程的地址空间中(无论是扩展已存在的内存空间还是创建一个新的区域)。
- 在mm.h中
- /*
-  * do_mmap - byte-offset front end to do_mmap_pgoff().
-  *
-  * Validates the byte offset, converts it to a page offset and forwards
-  * all other arguments unchanged. Returns whatever do_mmap_pgoff()
-  * returns, or -EINVAL (converted to unsigned long) when offset is not
-  * page aligned or offset + PAGE_ALIGN(len) wraps around.
-  */
- static inline unsigned long do_mmap(struct file *file, unsigned long addr,
- unsigned long len, unsigned long prot,
- unsigned long flag, unsigned long offset)
- {
- /* Reject a range whose end wraps past the top of unsigned long. */
- if ((offset + PAGE_ALIGN(len)) < offset) {
- return -EINVAL;
- }
- /* Any bit set below the page boundary means a misaligned offset. */
- if (offset & ~PAGE_MASK) {
- return -EINVAL;
- }
- return do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
- }
如果file参数是NULL并且offset参数也是0,那么就代表这次映射没有和文件相关,该情况被称作匿名映射;如果指定了文件名和偏移量,那么该映射被称为文件映射。
addr是可选参数,它指定搜索空闲区域的起始位置。
prot参数指定内存区域中页面的访问权限,不同的体系结构标志的定义不同。 flag参数指定了VMA标志。
- /*
- Author: Michael S. Tsirkin <mst@mellanox.co.il>, Mellanox Technologies Ltd.
- Based on: asm-xxx/mman.h
-
- PROT_* bits describe the page access permissions requested through the
- prot argument; MAP_* bits select the mapping type and options passed in
- the flag argument. MAP_TYPE is a mask, not a flag: the mapping code
- switches on (flags & MAP_TYPE) to tell MAP_SHARED from MAP_PRIVATE.
- */
- #define PROT_READ 0x1 /* page can be read */
- #define PROT_WRITE 0x2 /* page can be written */
- #define PROT_EXEC 0x4 /* page can be executed */
- #define PROT_SEM 0x8 /* page may be used for atomic ops */
- #define PROT_NONE 0x0 /* page can not be accessed */
- #define PROT_GROWSDOWN 0x01000000 /* mprotect flag: extend change to start of growsdown vma */
- #define PROT_GROWSUP 0x02000000 /* mprotect flag: extend change to end of growsup vma */
- #define MAP_SHARED 0x01 /* Share changes */
- #define MAP_PRIVATE 0x02 /* Changes are private */
- #define MAP_TYPE 0x0f /* Mask for type of mapping */
- #define MAP_FIXED 0x10 /* Interpret addr exactly */
- #define MAP_ANONYMOUS 0x20 /* don't use a file */
- 在mm/mmap.c中
- /*
-  * do_mmap_pgoff - add a mapping of len bytes to current->mm, backed by
-  * file starting at page offset pgoff, or anonymous when file is NULL.
-  *
-  * @file:  file to map, or NULL for an anonymous mapping.
-  * @addr:  address hint handed to get_unmapped_area().
-  * @len:   length in bytes; rounded up with PAGE_ALIGN().
-  * @prot:  PROT_* protection bits.
-  * @flags: MAP_* flags; (flags & MAP_TYPE) must be MAP_SHARED or
-  *         MAP_PRIVATE.
-  * @pgoff: offset into the file, in pages. For a private anonymous
-  *         mapping it is overwritten with addr >> PAGE_SHIFT.
-  *
-  * Returns the start address of the mapping on success. On failure a
-  * negative errno (-EINVAL, -ENOMEM, -EOVERFLOW, -EPERM, -EAGAIN,
-  * -EACCES, -ENODEV, or an error passed up from a helper) is returned
-  * converted to unsigned long.
-  *
-  * The caller must hold down_write(current->mm->mmap_sem). Note that for
-  * MAP_POPULATE the semaphore is dropped and re-taken around
-  * sys_remap_file_pages().
-  */
- unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
- unsigned long len, unsigned long prot,
- unsigned long flags, unsigned long pgoff)
- {
- struct mm_struct * mm = current->mm;
- struct vm_area_struct * vma, * prev;
- struct inode *inode;
- unsigned int vm_flags;
- int correct_wcount = 0;
- int error;
- struct rb_node ** rb_link, * rb_parent;
- int accountable = 1;
- unsigned long charged = 0, reqprot = prot;
- /*
- * Does the application expect PROT_READ to imply PROT_EXEC?
- *
- * (the exception is when the underlying filesystem is noexec
- * mounted, in which case we dont add PROT_EXEC.)
- */
- if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
- if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
- prot |= PROT_EXEC;
- if (!len)
- return -EINVAL;
- error = arch_mmap_check(addr, len, flags);
- if (error)
- return error;
- /* Careful about overflows.. */
- len = PAGE_ALIGN(len);
- if (!len || len > TASK_SIZE)
- return -ENOMEM;
- /* offset overflow? */
- if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
- return -EOVERFLOW;
- /* Too many mappings? */
- if (mm->map_count > sysctl_max_map_count)
- return -ENOMEM;
- /* Obtain the address to map to. we verify (or select) it and ensure
- * that it represents a valid section of the address space.
- * A result that is not page aligned is passed straight back to the
- * caller (presumably an error code from get_unmapped_area).
- */
- addr = get_unmapped_area(file, addr, len, pgoff, flags);
- if (addr & ~PAGE_MASK)
- return addr;
- /* Do simple checking here so the lower-level routines won't have
- * to. we assume access permissions have been handled by the open
- * of the memory object, so we don't do any here.
- */
- vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
- mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
- if (flags & MAP_LOCKED) {
- if (!can_do_mlock())
- return -EPERM;
- vm_flags |= VM_LOCKED;
- }
- /* mlock MCL_FUTURE? */
- if (vm_flags & VM_LOCKED) {
- unsigned long locked, lock_limit;
- locked = len >> PAGE_SHIFT;
- locked += mm->locked_vm;
- lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
- lock_limit >>= PAGE_SHIFT;
- if (locked > lock_limit && !capable(CAP_IPC_LOCK))
- return -EAGAIN;
- }
- inode = file ? file->f_path.dentry->d_inode : NULL;
- if (file) {
- switch (flags & MAP_TYPE) {
- case MAP_SHARED:
- if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
- return -EACCES;
- /*
- * Make sure we don't allow writing to an append-only
- * file..
- */
- if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
- return -EACCES;
- /*
- * Make sure there are no mandatory locks on the file.
- */
- if (locks_verify_locked(inode))
- return -EAGAIN;
- vm_flags |= VM_SHARED | VM_MAYSHARE;
- if (!(file->f_mode & FMODE_WRITE))
- vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
- /* fall through */
- case MAP_PRIVATE:
- if (!(file->f_mode & FMODE_READ))
- return -EACCES;
- if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
- if (vm_flags & VM_EXEC)
- return -EPERM;
- vm_flags &= ~VM_MAYEXEC;
- }
- if (is_file_hugepages(file))
- accountable = 0;
- if (!file->f_op || !file->f_op->mmap)
- return -ENODEV;
- break;
- default:
- return -EINVAL;
- }
- } else {
- switch (flags & MAP_TYPE) {
- case MAP_SHARED:
- vm_flags |= VM_SHARED | VM_MAYSHARE;
- break;
- case MAP_PRIVATE:
- /*
- * Set pgoff according to addr for anon_vma.
- */
- pgoff = addr >> PAGE_SHIFT;
- break;
- default:
- return -EINVAL;
- }
- }
- error = security_file_mmap(file, reqprot, prot, flags);
- if (error)
- return error;
- /* Clear old maps: unmap anything overlapping [addr, addr + len) and
- * look again until the range is free.
- */
- error = -ENOMEM;
- munmap_back:
- vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
- if (vma && vma->vm_start < addr + len) {
- if (do_munmap(mm, addr, len))
- return -ENOMEM;
- goto munmap_back;
- }
- /* Check against address space limit. */
- if (!may_expand_vm(mm, len >> PAGE_SHIFT))
- return -ENOMEM;
- if (accountable && (!(flags & MAP_NORESERVE) ||
- sysctl_overcommit_memory == OVERCOMMIT_NEVER)) {
- if (vm_flags & VM_SHARED) {
- /* Check memory availability in shmem_file_setup? */
- vm_flags |= VM_ACCOUNT;
- } else if (vm_flags & VM_WRITE) {
- /*
- * Private writable mapping: check memory availability
- */
- charged = len >> PAGE_SHIFT;
- if (security_vm_enough_memory(charged))
- return -ENOMEM;
- vm_flags |= VM_ACCOUNT;
- }
- }
- /*
- * Can we just expand an old private anonymous mapping?
- * The VM_SHARED test is necessary because shmem_zero_setup
- * will create the file object for a shared anonymous map below.
- *
- * All three conditions are joined with logical AND, so vma_merge()
- * is only attempted for a private anonymous mapping.
- */
- if (!file && !(vm_flags & VM_SHARED) &&
- vma_merge(mm, prev, addr, addr + len, vm_flags,
- NULL, NULL, pgoff, NULL))
- goto out;
- /*
- * Determine the object being mapped and call the appropriate
- * specific mapper. the address has already been validated, but
- * not unmapped, but the maps are removed from the list.
- */
- vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
- if (!vma) {
- error = -ENOMEM;
- goto unacct_error;
- }
- vma->vm_mm = mm;
- vma->vm_start = addr;
- vma->vm_end = addr + len;
- vma->vm_flags = vm_flags;
- /* protection_map is indexed by the read/write/exec/shared bits only. */
- vma->vm_page_prot = protection_map[vm_flags &
- (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
- vma->vm_pgoff = pgoff;
- if (file) {
- error = -EINVAL;
- if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
- goto free_vma;
- if (vm_flags & VM_DENYWRITE) {
- error = deny_write_access(file);
- if (error)
- goto free_vma;
- /* remember to undo deny_write_access() on every exit path */
- correct_wcount = 1;
- }
- vma->vm_file = file;
- get_file(file);
- error = file->f_op->mmap(file, vma);
- if (error)
- goto unmap_and_free_vma;
- } else if (vm_flags & VM_SHARED) {
- error = shmem_zero_setup(vma);
- if (error)
- goto free_vma;
- }
- /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
- * shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
- * that memory reservation must be checked; but that reservation
- * belongs to shared memory object, not to vma: so now clear it.
- */
- if ((vm_flags & (VM_SHARED|VM_ACCOUNT)) == (VM_SHARED|VM_ACCOUNT))
- vma->vm_flags &= ~VM_ACCOUNT;
- /* Can addr have changed??
- *
- * Answer: Yes, several device drivers can do it in their
- * f_op->mmap method. -DaveM
- */
- addr = vma->vm_start;
- pgoff = vma->vm_pgoff;
- vm_flags = vma->vm_flags;
- if (vma_wants_writenotify(vma))
- vma->vm_page_prot =
- protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC)];
- if (!file || !vma_merge(mm, prev, addr, vma->vm_end,
- vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) {
- /* No merge: link the new vma into the mm. */
- file = vma->vm_file;
- vma_link(mm, vma, prev, rb_link, rb_parent);
- if (correct_wcount)
- atomic_inc(&inode->i_writecount);
- } else {
- /* Merged into a neighbour: the freshly built vma is not needed. */
- if (file) {
- if (correct_wcount)
- atomic_inc(&inode->i_writecount);
- fput(file);
- }
- mpol_free(vma_policy(vma));
- kmem_cache_free(vm_area_cachep, vma);
- }
- out:
- mm->total_vm += len >> PAGE_SHIFT;
- vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
- if (vm_flags & VM_LOCKED) {
- mm->locked_vm += len >> PAGE_SHIFT;
- make_pages_present(addr, addr + len);
- }
- if (flags & MAP_POPULATE) {
- up_write(&mm->mmap_sem);
- sys_remap_file_pages(addr, len, 0,
- pgoff, flags & MAP_NONBLOCK);
- down_write(&mm->mmap_sem);
- }
- return addr;
- unmap_and_free_vma:
- if (correct_wcount)
- atomic_inc(&inode->i_writecount);
- vma->vm_file = NULL;
- fput(file);
- /* Undo any partial mapping done by a device driver. */
- unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
- charged = 0;
- free_vma:
- kmem_cache_free(vm_area_cachep, vma);
- unacct_error:
- if (charged)
- vm_unacct_memory(charged);
- return error;
- }
在用户空间可以通过mmap()系统调用获取内核函数do_mmap()的功能。mmap()系统调用定义如下:
- void *mmap2(void *start,size_t length,int prot,int flags,int fd,off_t pgoff)
该系统调用是mmap()的变种所以起名为mmap2()。最原始的mmap()调用中最后一个参数是字节偏移量,而目前这个变种使用页面偏移量作为最后一个参数。使用页面偏移量可以映射更大的文件和更大的偏移位置。原始的mmap()调用由POSIX定义,仍然在C库中作为mmap()方法被使用。虽然C库仍然可以使用原始版本的映射方法,但是它其实还是基于函数mmap2()进行的,对原始mmap()方法的调用是通过将字节偏移转化为页面偏移,从而转化为对mmap2()函数的调用。
-
munmap()和do_munmap():删除地址空间
do_munmap()函数从特定的进程地址空间中删除指定地址区间。
- /* Munmap is split into 2 main parts -- this part which finds
- * what needs doing, and the areas themselves, which do the
- * work. This now handles partial unmappings.
- * Jeremy Fitzhardinge <jeremy@goop.org>
- *
- * Removes the range [start, start + len) from the address space mm.
- * len is rounded up with PAGE_ALIGN() before use.
- *
- * Returns 0 on success, including when no VMA overlaps the range;
- * -EINVAL if start is not page aligned, start > TASK_SIZE,
- * len > TASK_SIZE - start, or the aligned len is 0; otherwise the
- * error returned by split_vma().
- *
- * sys_munmap() calls this with mm->mmap_sem held for writing.
- */
- int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
- {
- unsigned long end;
- struct vm_area_struct *vma, *prev, *last;
- if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
- return -EINVAL;
- if ((len = PAGE_ALIGN(len)) == 0)
- return -EINVAL;
- /* Find the first overlapping VMA; prev receives the VMA before it */
- vma = find_vma_prev(mm, start, &prev);
- if (!vma)
- return 0;
- /* we have start < vma->vm_end */
- /* if it doesn't overlap, we have nothing.. */
- end = start + len;
- if (vma->vm_start >= end)
- return 0;
- /*
- * If we need to split any vma, do it now to save pain later.
- *
- * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
- * unmapped vm_area_struct will remain in use: so lower split_vma
- * places tmp vma above, and higher split_vma places tmp vma below.
- */
- if (start > vma->vm_start) {
- int error = split_vma(mm, vma, start, 0);
- if (error)
- return error;
- prev = vma; /* per the note above, vma is now the part kept below start */
- }
- /* Does it split the last one? */
- last = find_vma(mm, end);
- if (last && end > last->vm_start) {
- int error = split_vma(mm, last, end, 1);
- if (error)
- return error;
- }
- vma = prev? prev->vm_next: mm->mmap; /* first VMA to remove, re-read after the splits */
- /*
- * Remove the vma's, and unmap the actual pages
- */
- detach_vmas_to_be_unmapped(mm, vma, prev, end);
- unmap_region(mm, vma, prev, start, end);
- /* Fix up all other VM information */
- remove_vma_list(mm, vma);
- return 0;
- }
系统调用munmap()给用户空间程序提供了一种从自身地址空间中删除指定地址区间的方法,它的定义如下:
- int munmap(void *start,size_t length)
该系统调用定义在文件mm/mmap.c中,它是对do_munmap()函数的一个简单的封装:
- /*
-  * sys_munmap - munmap() system call entry point.
-  *
-  * Thin wrapper: calls do_munmap() on the current process's address
-  * space with mm->mmap_sem held for writing, and returns its result.
-  */
- asmlinkage long sys_munmap(unsigned long addr, size_t len)
- {
- struct mm_struct *mm = current->mm;
- int status;
- profile_munmap(addr);
- /* do_munmap() runs with the mmap semaphore write-held. */
- down_write(&mm->mmap_sem);
- status = do_munmap(mm, addr, len);
- up_write(&mm->mmap_sem);
- return status;
- }