一个进程可以系统调用mmap(),将一个已打开文件的内容映射到它的用户空间,其用户界面为:
mmap(void *start, size_t length, int prot, int flags, int fd, off_t offset)。
参数fd代表着一个已打开文件,offset为文件中的起点,而start为映射到用户空间中的起始地址,length则为长度。还有两个参数prot和flags,前者用于对所映射区间的访问模式,如可写、可执行等;后者则用于其他控制目的。从应用程序设计的角度来说,比之常规的文件操作,如read()、write()、lseek(),将文件映射到用户空间后像访问内存一样地访问文件显然要方便得多。
mmap对应系统调用为sys_mmap2,代码为:
asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
unsigned long prot, unsigned long flags,
unsigned long fd, unsigned long pgoff)
{
return do_mmap2(addr, len, prot, flags, fd, pgoff);
}
static inline long do_mmap2(
unsigned long addr, unsigned long len,
unsigned long prot, unsigned long flags,
unsigned long fd, unsigned long pgoff)
{
int error = -EBADF;
struct file * file = NULL;
flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
if (!(flags & MAP_ANONYMOUS)) {//MAP_ANONYMOUS设成1,表示没有文件,实际上只是用来"圈地"
file = fget(fd);//获取file结构
if (!file)
goto out;
}
down(t->mm->mmap_sem);
error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
up(¤t->mm->mmap_sem);
if (file)
fput(file);
out:
return error;
}
inline函数do_mmap(),是供内核自己用的,它也是将已打开文件映射到当前进程空间。代码为:
static inline unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flag, unsigned long offset)
{
unsigned long ret = -EINVAL;
if ((offset + PAGE_ALIGN(len)) < offset)
goto out;
if (!(offset & ~PAGE_MASK))
ret = do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
out:
return ret;
}
两者都调用,do_mmap_pgoff,代码如下:
unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, unsigned long len,
unsigned long prot, unsigned long flags, unsigned long pgoff)
{
struct mm_struct * mm = current->mm;
struct vm_area_struct * vma;
int correct_wcount = 0;
int error;
.....//各种判断,先忽略
if (flags & MAP_FIXED) {
if (addr & ~PAGE_MASK)
return -EINVAL;
} else {//MAP_FIXED为0,就表示指定的映射地址只是一个参考值,不能满足时可以由内核给分配一个
addr = get_unmapped_area(addr, len);//当前进程的用户空间中分配一个起始地址
if (!addr)
return -ENOMEM;
}
/* Determine the object being mapped and call the appropriate
* specific mapper. the address has already been validated, but
* not unmapped, but the maps are removed from the list.
*/
vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);//映射到一个特定的文件也是一种属性,属性不同的区段不能共存于同一逻辑区间,所以总要为之单独建立一个逻辑区间
if (!vma)
return -ENOMEM;
vma->vm_mm = mm;
vma->vm_start = addr;//起始地址
vma->vm_end = addr + len;//结束地址
vma->vm_flags = vm_flags(prot,flags) | mm->def_flags;
if (file) {//设置vma->flags
VM_ClearReadHint(vma);
vma->vm_raend = 0;
if (file->f_mode & FMODE_READ)
vma->vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
if (flags & MAP_SHARED) {
vma->vm_flags |= VM_SHARED | VM_MAYSHARE;
/* This looks strange, but when we don't have the file open
* for writing, we can demote the shared mapping to a simpler
* private mapping. That also takes care of a security hole
* with ptrace() writing to a shared mapping without write
* permissions.
*
* We leave the VM_MAYSHARE bit on, just to get correct output
* from /proc/xxx/maps..
*/
if (!(file->f_mode & FMODE_WRITE))
vma->vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
}
} else {
vma->vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
if (flags & MAP_SHARED)
vma->vm_flags |= VM_SHARED | VM_MAYSHARE;
}
vma->vm_page_prot = protection_map[vma->vm_flags & 0x0f];
vma->vm_ops = NULL;
vma->vm_pgoff = pgoff;//所映射内容在文件中的起点,有了这个起点,发生缺页异常时,就可以根据虚拟地址计算出相应页面在文件中的位置
vma->vm_file = NULL;
vma->vm_private_data = NULL;
/* Clear old maps */
error = -ENOMEM;
if (do_munmap(mm, addr, len))//检查目标地址在当前进程的虚拟空间是否已经在使用,如果已经在使用就要将老的映射撤销,要是这个操作失败,则goto free_vma。因为flags的标志位为MAP_FIXED为1时,并未对此检查。
goto free_vma;
/* Check against address space limit. */
if ((mm->total_vm << PAGE_SHIFT) + len //虚拟空间的使用是否超出了为其设置的下限
> current->rlim[RLIMIT_AS].rlim_cur)
goto free_vma;
/* Private writable mapping? Check memory availability.. */
if ((vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE &&//物理页面数是否够
!(flags & MAP_NORESERVE) &&
!vm_enough_memory(len >> PAGE_SHIFT))
goto free_vma;
if (file) {
if (vma->vm_flags & VM_DENYWRITE) {
error = deny_write_access(file);//排斥常规文件操作,如read write
if (error)
goto free_vma;
correct_wcount = 1;
}
vma->vm_file = file;//重点哦
get_file(file);
error = file->f_op->mmap(file, vma);//指向了generic_file_mmap
if (error)
goto unmap_and_free_vma;
} else if (flags & MAP_SHARED) {
error = shmem_zero_setup(vma);
if (error)
goto free_vma;
}
/* Can addr have changed??
*
* Answer: Yes, several device drivers can do it in their
* f_op->mmap method. -DaveM
*/
flags = vma->vm_flags;
addr = vma->vm_start;
insert_vm_struct(mm, vma);//插入到对应的队列中
if (correct_wcount)
atomic_inc(&file->f_dentry->d_inode->i_writecount);
mm->total_vm += len >> PAGE_SHIFT;
if (flags & VM_LOCKED) {//仅在加锁时才调用make_pages_present
mm->locked_vm += len >> PAGE_SHIFT;
make_pages_present(addr, addr + len);
}
return addr;//最后返回的起始虚拟地址,一般是后12位为0
unmap_and_free_vma:
if (correct_wcount)
atomic_inc(&file->f_dentry->d_inode->i_writecount);
vma->vm_file = NULL;
fput(file);
/* Undo any partial mapping done by a device driver. */
flush_cache_range(mm, vma->vm_start, vma->vm_end);
zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start);
flush_tlb_range(mm, vma->vm_start, vma->vm_end);
free_vma:
kmem_cache_free(vm_area_cachep, vma);
return error;
}
int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
struct vm_operations_struct * ops;
struct inode *inode = file->f_dentry->d_inode;
ops = &file_private_mmap;
if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
if (!inode->i_mapping->a_ops->writepage)
return -EINVAL;
ops = &file_shared_mmap;
}
if (!inode->i_sb || !S_ISREG(inode->i_mode))
return -EACCES;
if (!inode->i_mapping->a_ops->readpage)
return -ENOEXEC;
UPDATE_ATIME(inode);
vma->vm_ops = ops;//重点哦
return 0;
}
其中file_private_mmap,代码如下:
static struct vm_operations_struct file_private_mmap = {
nopage: filemap_nopage,
};
inode->i_mapping->a_ops->writepage和inode->i_mapping->a_ops->readpage指向了:
struct address_space_operations ext2_aops = {
readpage: ext2_readpage,
writepage: ext2_writepage,
sync_page: block_sync_page,
prepare_write: ext2_prepare_write,
commit_write: generic_commit_write,
bmap: ext2_bmap
};
最后vm_area_struct数据结构,重要的段(vm_ops,file,pgoff,vm_start,vm_end)都设置好了,返回起始虚拟地址。
读者也许感到困惑,在文件与虚拟区间之间建立映射难道就这么简单?而且我们根本就没有看到页面映射的建立!
那么什么时候建立映射呢?
当这个区间的一个页面首次受到访问时,会由于见面无映射而发生缺页异常,相应的处理函数是do_no_page(),而不是do_swap_page()。代码如下:
static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma,
unsigned long address, int write_access, pte_t *page_table)
{
struct page * new_page;
pte_t entry;
if (!vma->vm_ops || !vma->vm_ops->nopage)
return do_anonymous_page(mm, vma, page_table, write_access, address);
/*
* The third argument is "no_share", which tells the low-level code
* to copy, not share the page even if sharing is possible. It's
* essentially an early COW detection.
*/
new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, (vma->vm_flags & VM_SHARED)?0:write_access);//调用filemap_nopage
if (new_page == NULL) /* no page was available -- SIGBUS */
return 0;
if (new_page == NOPAGE_OOM)
return -1;
++mm->rss;
/*
* This silly early PAGE_DIRTY setting removes a race
* due to the bad i386 page protection. But it's valid
* for other architectures too.
*
* Note that if write_access is true, we either now have
* an exclusive copy of the page, or this is a shared mapping,
* so we can make it writable and dirty to avoid having to
* handle that later.
*/
flush_page_to_ram(new_page);
flush_icache_page(vma, new_page);
entry = mk_pte(new_page, vma->vm_page_prot);
if (write_access) {
entry = pte_mkwrite(pte_mkdirty(entry));
} else if (page_count(new_page) > 1 &&
!(vma->vm_flags & VM_SHARED))
entry = pte_wrprotect(entry);
set_pte(page_table, entry);
/* no need to invalidate: a not-present page shouldn't be cached */
update_mmu_cache(vma, address, entry);
return 2; /* Major fault */
}
do_no_page会调用filemap_nopage,filemap_nopage会调用ext2_readpage()分配一个空闲内存页面并从文件读入相应的页面,然后建立起映射。