文件映射函数mmap实现机制
这里描述的内核版本为3.14.17
一、mmap简介
函数原型:void* mmap ( void * addr , size_t len , int prot , int flags ,int fd , off_t offset )
内存映射函数mmap, 负责把文件内容映射到进程的虚拟内存空间, 通过对这段内存的读取和修改,来实现对文件的读取和修改,而不需要再调用read,write等操作。
二、mmap系统调用函数的参数
a) addr: 指定为文件描述符fd应被映射到的进程空间的起始地址。它通常被指定为一个空指针,这样告诉内核自己去选择起始地址。一般默认为NULL。
b) length:是映射到调用进程地址空间中的字节数,她从被映射文件开头offset个字节处开始算。
c) prot:负责保护内存映射区的保护。常用值是代表读写访问的PROT_READ | PROT_WRITE.当然还包括数据的执行(PROT_EXEC)、数据不可访问(PROT_NONE)
d) flags:flags常用值有MAP_SHARED或MAP_PRIVATE这两个标志必须选一个,并可以选上MAP_FIXED。如果指定了,那么调用进程对被映射数据所做的修改只对该进程可见,而不该变其底层支撑对象。如果指定了,那么调用进程对被映射数据所作的修改对于共享该对象的所有进程都可见,而且确实改变了其底层支撑对象。
e) fd: 由open返回的文件描述符, 代表要映射的文件。
f) offset: 以文件开始处的偏移量, 必须是分页大小的整数倍, 通常为0, 表示从文件头开始映射。
三、解除映射函数
int munmap(void *start,size_t length)
功能:取消参数start所指向的映射内存,参数length表示欲取消的内存大小。
返回值:解除成功返回0,否则返回-1,错误原因存于errno中。
四、mmap系统调用的实现机制
在我们上网查询资料的过程中发现:
在linux-2.6.27.28中,系统调用mmap和mmap2都调用do_mmap2函数,do_mmap2函数再调用do_mmap_pgoff。
在linux-2.6.35的内核中,系统调用mmap和mmap2都调用sys_mmap_pgoff,系统调用mmap_pgoff再调用,do_mmap2函数再调用do_mmap_pgoff。
可以得知,linux-2.6.17.28与linux-2.6.35版本内核有所不同,那么对于我们来说,在我们自己调研的linux-3.14.17内核中其中的实现机制则一定有不少区别,而且网上很少有关于3.14.17版内核资料,这对我们的工作提出连更加艰难的挑战,所以,在接下来的叙述中,或许会存在很多错误或者不够完善的地方,还请包含。
最后根据我们的调研,在linux-3.14.17/arch/mips/kernel/syscall.c中
SYSCALL_DEFINE6(mips_mmap, unsigned long, addr, unsigned long, len,
unsigned long, prot, unsigned long, flags, unsigned long,
fd, off_t, offset)
{
unsigned long result;
result = -EINVAL;
if (offset & ~PAGE_MASK)
goto out;
result = sys_mmap_pgoff(addr, len, prot, flags, fd, offset >> PAGE_SHIFT);
out:
return result;
}
SYSCALL_DEFINE6(mips_mmap2, unsigned long, addr, unsigned long, len,
unsigned long, prot, unsigned long, flags, unsigned long, fd,
unsigned long, pgoff)
{
if (pgoff & (~PAGE_MASK >> 12))
return -EINVAL;
return sys_mmap_pgoff(addr, len, prot, flags, fd, pgoff >> (PAGE_SHIFT-12));
}
可以看出,linux-3,14,17内核中,mmap与mmap2系统调用都调用sys_mmap_pgoff,在linux-3.14.17/mm/mmap.c中,mmap_pgoff系统调用如下:
SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
unsigned long, prot, unsigned long, flags,
unsigned long, fd, unsigned long, pgoff)
{
struct file *file = NULL;
unsigned long retval = -EBADF;
if (!(flags & MAP_ANONYMOUS)) {
audit_mmap_fd(fd, flags);
file = fget(fd);
if (!file)
goto out;
if (is_file_hugepages(file))
len = ALIGN(len, huge_page_size(hstate_file(file)));
retval = -EINVAL;
if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file)))
goto out_fput;
} else if (flags & MAP_HUGETLB) {
struct user_struct *user = NULL;
struct hstate *hs;
hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & SHM_HUGE_MASK);
if (!hs)
return -EINVAL;
len = ALIGN(len, huge_page_size(hs));
file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
VM_NORESERVE,
&user, HUGETLB_ANONHUGE_INODE,
(flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
if (IS_ERR(file))
return PTR_ERR(file);
}
flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
out_fput:
if (file)
fput(file);
out:
return retval;
}
由上述代码可以看出,mmap_pgoff系统调用调用vm_mmap_pgoff()函数。在以往的版本一般都是mmap_pgof调用do_mmap2,do_mmap2函数再调用do_mmap_pgoff,或者直接的mmap和mmap2都调用do_mmap2函数,do_mmap2函数再调用do_mmap_pgoff。vm_mmap_pgoff()函数代码如下:
unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flag, unsigned long pgoff)
{
unsigned long ret;
struct mm_struct *mm = current->mm;
unsigned long populate;
ret = security_mmap_file(file, prot, flag);
if (!ret) {
down_write(&mm->mmap_sem);
ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
&populate);
up_write(&mm->mmap_sem);
if (populate)
mm_populate(ret, populate);
}
return ret;
}
上述代码可以看出,vm_mmap_pgoff()函数最终是对do_mmap_pgoff函数。do_mmap_pgoff函数代码如下:
unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flags, unsigned long pgoff,
unsigned long *populate)
{
struct mm_struct * mm = current->mm;
vm_flags_t vm_flags;
*populate = 0;
if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
prot |= PROT_EXEC;
if (!len)
return -EINVAL;
if (!(flags & MAP_FIXED))
addr = round_hint_to_min(addr);
len = PAGE_ALIGN(len);
if (!len)
return -ENOMEM;
if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
return -EOVERFLOW;
if (mm->map_count > sysctl_max_map_count)
return -ENOMEM;
addr = get_unmapped_area(file, addr, len, pgoff, flags);
if (addr & ~PAGE_MASK)
return addr;
vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
if (flags & MAP_LOCKED)
if (!can_do_mlock())
return -EPERM;
if (mlock_future_check(mm, vm_flags, len))
return -EAGAIN;
if (file) {
struct inode *inode = file_inode(file);
switch (flags & MAP_TYPE) {
case MAP_SHARED:
if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
return -EACCES;
if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
return -EACCES;
if (locks_verify_locked(inode))
return -EAGAIN;
vm_flags |= VM_SHARED | VM_MAYSHARE;
if (!(file->f_mode & FMODE_WRITE))
vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
case MAP_PRIVATE:
if (!(file->f_mode & FMODE_READ))
return -EACCES;
if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
if (vm_flags & VM_EXEC)
return -EPERM;
vm_flags &= ~VM_MAYEXEC;
}
if (!file->f_op->mmap)
return -ENODEV;
if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
return -EINVAL;
break;
default:
return -EINVAL;
}
} else {
switch (flags & MAP_TYPE) {
case MAP_SHARED:
if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
return -EINVAL;
pgoff = 0;
vm_flags |= VM_SHARED | VM_MAYSHARE;
break;
case MAP_PRIVATE:
pgoff = addr >> PAGE_SHIFT;
break;
default:
return -EINVAL;
}
}
if (flags & MAP_NORESERVE) {
if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
vm_flags |= VM_NORESERVE;
if (file && is_file_hugepages(file))
vm_flags |= VM_NORESERVE;
}
addr = mmap_region(file, addr, len, vm_flags, pgoff);
if (!IS_ERR_VALUE(addr) &&
((vm_flags & VM_LOCKED) ||
(flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
*populate = len;
return addr;
}
通过上出代码可以看出,do_mmap_pgoff调用get_unmapped_area获取未mmap空间,通过调用mmap_region返回最后地址给mmap系统调用。mmap_region函数如下:
unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long len, vm_flags_t vm_flags, unsigned long pgoff)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev;
int error;
struct rb_node **rb_link, *rb_parent;
unsigned long charged = 0;
if (!may_expand_vm(mm, len >> PAGE_SHIFT)) {
unsigned long nr_pages;
if (!(vm_flags & MAP_FIXED))
return -ENOMEM;
nr_pages = count_vma_pages_range(mm, addr, addr + len);
if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages))
return -ENOMEM;
}
error = -ENOMEM;
munmap_back:
if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
if (do_munmap(mm, addr, len))
return -ENOMEM;
goto munmap_back;
}
if (accountable_mapping(file, vm_flags)) {
charged = len >> PAGE_SHIFT;
if (security_vm_enough_memory_mm(mm, charged))
return -ENOMEM;
vm_flags |= VM_ACCOUNT;
}
vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL);
if (vma)
goto out;
vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
if (!vma) {
error = -ENOMEM;
goto unacct_error;
}
vma->vm_mm = mm;
vma->vm_start = addr;
vma->vm_end = addr + len;
vma->vm_flags = vm_flags;
vma->vm_page_prot = vm_get_page_prot(vm_flags);
vma->vm_pgoff = pgoff;
INIT_LIST_HEAD(&vma->anon_vma_chain);
if (file) {
if (vm_flags & VM_DENYWRITE) {
error = deny_write_access(file);
if (error)
goto free_vma;
}
vma->vm_file = get_file(file);
error = file->f_op->mmap(file, vma);
if (error)
goto unmap_and_free_vma;
WARN_ON_ONCE(addr != vma->vm_start);
addr = vma->vm_start;
vm_flags = vma->vm_flags;
} else if (vm_flags & VM_SHARED) {
error = shmem_zero_setup(vma);
if (error)
goto free_vma;
}
if (vma_wants_writenotify(vma)) {
pgprot_t pprot = vma->vm_page_prot;
vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED);
if (pgprot_val(pprot) == pgprot_val(pgprot_noncached(pprot)))
vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
}
vma_link(mm, vma, prev, rb_link, rb_parent);
if (vm_flags & VM_DENYWRITE)
allow_write_access(file);
file = vma->vm_file;
out:
perf_event_mmap(vma);
vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
if (vm_flags & VM_LOCKED) {
if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) ||
vma == get_gate_vma(current->mm)))
mm->locked_vm += (len >> PAGE_SHIFT);
else
vma->vm_flags &= ~VM_LOCKED;
}
if (file)
uprobe_mmap(vma);
vma->vm_flags |= VM_SOFTDIRTY;
return addr;
unmap_and_free_vma:
if (vm_flags & VM_DENYWRITE)
allow_write_access(file);
vma->vm_file = NULL;
fput(file);
unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
charged = 0;
free_vma:
kmem_cache_free(vm_area_cachep, vma);
unacct_error:
if (charged)
vm_unacct_memory(charged);
return error;
}
mmap_region最终返回地址给do_mmap_pgoff函数,再返回mapp_pgoff系统调用,再最后返回mmap系统调用,最终完成mmap功能。