mmap 的原理
虚拟地址空间中,每一个进程都有一个列表 vm_area_struct,指向虚拟地址空间的不同的内存块,这个变量的名字叫 mmap。
参见 https://blog.csdn.net/leacock1991/article/details/107328814
/*
 * Memory descriptor of a process (abridged excerpt; the "......" line
 * stands for members left out of this quote).
 */
struct mm_struct {
struct vm_area_struct *mmap; /* list of VMAs */
......
}
/*
 * Describes one memory area inside a process's virtual address space.
 *
 * NOTE(review): this quote is abridged. Only the members related to file
 * and anonymous mappings are shown, and the definition is cut off after
 * vm_private_data (no closing brace appears in this excerpt).
 */
struct vm_area_struct {
/*
* For areas with an address space and backing store,
* linkage into the address_space->i_mmap interval tree.
*/
struct {
struct rb_node rb;
unsigned long rb_subtree_last;
} shared;
/*
* A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
* list, after a COW of one of the file pages. A MAP_SHARED vma
* can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack
* or brk vma (with NULL file) can only be in an anon_vma list.
*/
struct list_head anon_vma_chain; /* Serialized by mmap_sem &
* page_table_lock */
struct anon_vma *anon_vma; /* Serialized by page_table_lock */
/* Function pointers to deal with this struct. */
const struct vm_operations_struct *vm_ops;
/* Information about our backing store: */
unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE
units */
struct file * vm_file; /* File we map to (can be NULL). */
void * vm_private_data; /* was vm_pte (shared mem) */
其实内存映射不仅仅是物理内存和虚拟内存之间的映射,还包括将文件中的内容映射到虚拟内存空间,这个时候访问内存空间就能够访问到文件里面的数据。
图片来自极客时间趣谈linux操作系统
mmap系统调用
如何分配一大块内存
如果申请一大块内存就用mmap,mmap是映射内存空间到物理内存
如果一个进程想映射一个文件到自己的虚拟内存空间,也要通过 mmap 系统调用。这个时候 mmap 是映射内存空间到物理内存再到文件。
mmap这个系统调用
/*
 * mmap system call entry point (abridged excerpt).
 *
 * The offset argument `off` is shifted right by PAGE_SHIFT before being
 * passed on as the page offset; every other argument is forwarded
 * unchanged to sys_mmap_pgoff(). The declaration of `error` and the return
 * statement are among the lines elided by "......".
 */
SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
unsigned long, prot, unsigned long, flags,
unsigned long, fd, unsigned long, off)
{
......
error = sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
......
}
/*
 * mmap_pgoff system call (abridged excerpt).
 *
 * Looks up the struct file behind descriptor `fd` with fget() and passes
 * it, together with the remaining arguments, to vm_mmap_pgoff(). The value
 * returned by vm_mmap_pgoff() is returned to the caller.
 *
 * NOTE(review): the declaration of `retval` and whatever condition guards
 * the fget() call are not visible in this quote.
 */
SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
unsigned long, prot, unsigned long, flags,
unsigned long, fd, unsigned long, pgoff)
{
/* Starts out NULL; assigned from the descriptor below. */
struct file *file = NULL;
......
file = fget(fd);
......
retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
return retval;
}
如果映射到文件,fd 会传进来一个文件描述符,mmap_pgoff 里面通过 fget 函数,根据文件描述符获得 struct file。struct file 表示一个打开的文件。
接下来的调用链是:vm_mmap_pgoff->do_mmap_pgoff->do_mmap。
这里主要干了两件事情
- 1、调用 get_unmapped_area 找到一个没有映射的区域
- 2、调用 mmap_region 映射这个区域。
get_unmapped_area 函数
\linux-4.13.16\mm\mmap.c
/*
 * Select the routine used to find a free range of virtual addresses
 * (abridged excerpt).
 *
 * The default is the current process's mm->get_unmapped_area. When a file
 * is supplied and its file_operations provide a get_unmapped_area hook,
 * that hook replaces the default. How the selected routine is invoked and
 * what is returned lies in the elided part of the quote.
 */
unsigned long
get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags)
{
/* Pointer to whichever search routine ends up being selected. */
unsigned long (*get_area)(struct file *, unsigned long,
unsigned long, unsigned long, unsigned long);
......
get_area = current->mm->get_unmapped_area;
if (file) {
/* File-backed mapping: prefer the file's own hook if it has one. */
if (file->f_op->get_unmapped_area)
get_area = file->f_op->get_unmapped_area;
}
......
}
file_operations ext4_file_operations 函数
\linux-4.13.16\fs\ext4\file.c
/*
 * File operations table for ext4 files (abridged excerpt; the "......"
 * line stands for entries omitted from this quote).
 *
 *   .mmap               handler reached through file->f_op->mmap
 *   .get_unmapped_area  hook reached through file->f_op->get_unmapped_area
 *
 * Each designated initializer must be terminated by a comma; the entry for
 * .mmap was missing one, which made the initializer a syntax error.
 */
const struct file_operations ext4_file_operations = {
......
	.mmap = ext4_file_mmap,
	.get_unmapped_area = thp_get_unmapped_area,
};
/*
 * Find an unmapped address range for a file mapping and adjust it relative
 * to `size` (abridged excerpt).
 *
 * filp  - file being mapped
 * len   - requested mapping length
 * off   - offset into the file
 * flags - mapping flags, passed through to the search routine
 * size  - alignment granule; NOTE(review): the mask `size - 1` below
 *         assumes size is a power of two - confirm against callers
 *
 * Returns the adjusted address.
 */
unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len,
loff_t off, unsigned long flags, unsigned long size)
{
unsigned long addr;
/* off_end and off_align are not used in the visible part of the quote. */
loff_t off_end = off + len;
loff_t off_align = round_up(off, size);
unsigned long len_pad;
/* Ask for `size` extra bytes so the result can be moved forward below. */
len_pad = len + size;
......
addr = current->mm->get_unmapped_area(filp, 0, len_pad,
off >> PAGE_SHIFT, flags);
/*
 * Advance addr so that addr and off have the same remainder modulo size
 * (holds when size is a power of two).
 */
addr += (off - addr) & (size - 1);
return addr;
}
mmap_region 函数
如何映射虚拟内存区域
\linux-4.13.16\mm\mmap.c
/*
 * Create the mapping for [addr, addr + len) in the current process
 * (abridged excerpt; the end of the function is elided).
 *
 * Visible steps:
 *   1. Try vma_merge() to extend an existing VMA; on success jump to `out`.
 *   2. Otherwise allocate a zeroed vm_area_struct and fill in its range,
 *      flags, page protection and page offset.
 *   3. For a file mapping, take a reference on the file, store it in
 *      vma->vm_file and call call_mmap().
 *   4. Link the new VMA into the mm with vma_link() and return addr.
 *
 * NOTE(review): `error`, the labels `out` / `unacct_error`, and the
 * initialisation of `prev`, `rb_link` and `rb_parent` do not appear in
 * this quote; presumably they live in lines the excerpt leaves out.
 */
unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
struct list_head *uf)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev;
struct rb_node **rb_link, *rb_parent;
/*
* Can we just expand an old mapping?
*/
vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX);
if (vma)
goto out;
/*
* Determine the object being mapped and call the appropriate
* specific mapper. the address has already been validated, but
* not unmapped, but the maps are removed from the list.
*/
vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
if (!vma) {
error = -ENOMEM;
goto unacct_error;
}
/* Describe the new area: owner mm, address range, flags, protection. */
vma->vm_mm = mm;
vma->vm_start = addr;
vma->vm_end = addr + len;
vma->vm_flags = vm_flags;
vma->vm_page_prot = vm_get_page_prot(vm_flags);
vma->vm_pgoff = pgoff;
INIT_LIST_HEAD(&vma->anon_vma_chain);
if (file) {
vma->vm_file = get_file(file);
error = call_mmap(file, vma);
/* Re-read start address and flags from the VMA after call_mmap(). */
addr = vma->vm_start;
vm_flags = vma->vm_flags;
}
......
vma_link(mm, vma, prev, rb_link, rb_parent);
return addr;
.....
- 首先要看是否能够基于虚拟内存区域的前一个 vm_area_struct进行扩展,也即调用 vma_merge,和前一个 vm_area_struct 合并到一起
- 如果不能,调用 kmem_cache_zalloc,在 Slub 里面创建一个新的 vm_area_struct对象,设置起始和结束位置,将它加入队列。
- 如果是映射到文件,则设置 vm_file 为目标文件,调用 call_mmap。其实就是调用 file_operations 的 mmap 函数, 对于 ext4 文件系统,调用的是 ext4_file_mmap。这个函数的参数可以看出,这一刻文件和内存开始发生关系了,将vm_area_struct 的内存操作设置为文件系统操作,也就是说,读写内存其实就是读写文件系统。
/*
 * Invoke the mmap handler registered in the file's file_operations.
 *
 * file - file being mapped
 * vma  - the memory area describing the mapping
 *
 * Returns whatever the file's ->mmap handler returns.
 */
static inline int call_mmap(struct file *file, struct vm_area_struct *vma)
{
return file->f_op->mmap(file, vma);
}
/*
 * ext4's mmap handler (abridged excerpt).
 *
 * The visible statement points the VMA's operations table at
 * ext4_file_vm_ops. The return value is produced in the elided lines.
 */
static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
{
......
vma->vm_ops = &ext4_file_vm_ops;
......
}
-
最终,vma_link 函数将新创建的 vm_area_struct 挂在了 mm_struct 里面的红黑树上。
-
vma_link 还做了另外一件事情,就是 __vma_link_file。用于建立文件到内存的映射关系。
/*
 * Mapping object reached through file->f_mapping (abridged excerpt; the
 * "......" lines stand for members omitted from this quote).
 */
struct address_space {
struct inode *host; /* owner: inode, block_device */
......
struct rb_root i_mmap; /* tree of private and shared mappings */
......
const struct address_space_operations *a_ops; /* methods */
......
}
/*
 * Link a file-backed VMA to its file's address_space (abridged excerpt).
 *
 * If the VMA has a backing file, the VMA is inserted into the i_mmap tree
 * of that file's mapping (file->f_mapping). A VMA without a file is left
 * untouched.
 *
 * vma - the memory area to link
 */
static void __vma_link_file(struct vm_area_struct *vma)
{
	struct file *file;

	file = vma->vm_file;
	if (file) {
		struct address_space *mapping = file->f_mapping;

		vma_interval_tree_insert(vma, &mapping->i_mmap);
	}
}
\linux-4.13.16\mm\mmap.c
\linux-4.13.16\include\linux\fs.h
对于打开的文件,会有一个结构 struct file 来表示。它有个成员指向 struct address_space 结构,这里面有棵变量名为 i_mmap 的红黑树,vm_area_struct 就挂在这棵树上。
用户态缺页异常
一旦开始访问虚拟内存的某个地址,如果发现并没有对应的物理页,那就触发缺页中断,调用 do_page_fault。
do_page_fault 函数
\linux-4.13.16\arch\x86\mm\fault.c
/*
 * Page-fault entry point (abridged excerpt).
 *
 * Reads the faulting address with read_cr2() and passes it, along with
 * the register state and error code, to __do_page_fault().
 */
dotraplinkage void notrace
do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
unsigned long address = read_cr2(); /* Get the faulting address */
......
__do_page_fault(regs, error_code, address);
......
}
/*
* This routine handles page faults. It determines the address,
* and the problem, and then passes it off to one of the appropriate
* routines.
*/
/*
 * Abridged excerpt; the end of the function is elided.
 *
 * Visible flow: a fault on a kernel-space address is first offered to
 * vmalloc_fault(), and the function returns if that call yields a
 * non-negative result. Later, the VMA covering the address is looked up
 * with find_vma() and the fault is handed to handle_mm_fault().
 *
 * NOTE(review): `fault` and `flags` are used but not declared in the
 * visible lines; presumably they are declared in the elided code.
 */
static noinline void
__do_page_fault(struct pt_regs *regs, unsigned long error_code,
unsigned long address)
{
struct vm_area_struct *vma;
struct task_struct *tsk;
struct mm_struct *mm;
tsk = current;
mm = tsk->mm;
if (unlikely(fault_in_kernel_space(address))) {
if (vmalloc_fault(address) >= 0)
return;
}
......
vma = find_vma(mm, address);
......
fault = handle_mm_fault(vma, address, flags);
......
- 1、在 __do_page_fault 里面,先要判断缺页中断是否发生在内核空间(fault_in_kernel_space),如果是则调用 vmalloc_fault,这就和前面讲的虚拟内存布局对应上了
- 2、在内核里面,vmalloc区域需要内核页表映射到物理页
- 3、接下来在用户空间里面,找到你访问的那个地址所在的区域 vm_area_struct,然后调用 handle_mm_fault 来映射这个区域。handle_mm_fault 调用 __handle_mm_fault 函数
__handle_mm_fault 函数
\linux-4.13.16\mm\memory.c
/*
 * Walk, and allocate where needed, the page-table levels for a faulting
 * address, then hand over to handle_pte_fault() (abridged excerpt).
 *
 * vma     - memory area containing the faulting address
 * address - faulting virtual address
 * flags   - fault flags, copied into the vm_fault descriptor
 *
 * Returns the result of handle_pte_fault().
 */
static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
unsigned int flags)
{
/* Fault descriptor; the address is masked with PAGE_MASK. */
struct vm_fault vmf = {
.vma = vma,
.address = address & PAGE_MASK,
.flags = flags,
.pgoff = linear_page_index(vma, address),
.gfp_mask = __get_fault_gfp_mask(vma),
};
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
p4d_t *p4d;
/* `ret` is not used in the visible part of the quote. */
int ret;
/* Descend level by level: pgd -> p4d -> pud -> pmd. */
pgd = pgd_offset(mm, address);
p4d = p4d_alloc(mm, pgd, address);
......
vmf.pud = pud_alloc(mm, p4d, address);
......
vmf.pmd = pmd_alloc(mm, vmf.pud, address);
......
return handle_pte_fault(&vmf);
}
到这里,看到了熟悉的 PGD、P4D、PUD、PMD、PTE,这就是前面讲页表的时候讲述的四级页表的概念。因为暂且不考虑五级页表,这里暂时忽略 P4D,参见 https://blog.csdn.net/leacock1991/article/details/107217985
图片来自极客时间趣谈linux操作系统
- 1、pgd_t 用于全局页目录项,pud_t 用于上层页目录项,pmd_t 用于中间页目录项,pte_t 用于直接页表项。
- 2、每个进程都有独立的地址空间,为了这个进程独立完成映射,每个进程都有独立的进程页表,这个页表的最顶级的 pgd 存放在 task_struct 中的 mm_struct 的 pgd变量里面
- 3、在一个进程新创建的时候,会调用 fork,对于内存的部分会调用 copy_mm,里面调用 dup_mm
dup_mm 函数
\linux-4.13.16\kernel\fork.c
/*
* Allocate a new mm structure and copy contents from the
* mm structure of the passed in task structure.
*/
/*
 * Abridged excerpt. Visible flow: allocate a new mm_struct, memcpy() the
 * current process's mm into it, initialise it with mm_init(), copy the
 * mappings with dup_mmap(), and return the new mm.
 *
 * NOTE(review): `err` is not declared, the `fail_nomem` label is not
 * shown, and the results of allocate_mm() and dup_mmap() are not checked
 * in this quote; presumably the quote omits those lines.
 */
static struct mm_struct *dup_mm(struct task_struct *tsk)
{
struct mm_struct *mm, *oldmm = current->mm;
mm = allocate_mm();
/* Start as a byte-for-byte copy of the current process's mm. */
memcpy(mm, oldmm, sizeof(*mm));
if (!mm_init(mm, tsk, mm->user_ns))
goto fail_nomem;
err = dup_mmap(mm, oldmm);
return mm;
}
在这里,除了创建一个新的 mm_struct,并且通过 memcpy 将它和父进程的弄成一模一样之外,还需要调用 mm_init 进行初始化。
mm_init 调用 mm_alloc_pgd,分配全局页目录项,赋值给 mm_struct 的 pgd 成员变量。
mm_alloc_pgd 函数
\linux-4.13.16\kernel\fork.c
/*
 * Allocate the top-level page directory for an mm and store it in
 * mm->pgd.
 *
 * mm - memory descriptor that receives the new pgd
 *
 * Returns 0 on success, or -ENOMEM when pgd_alloc() fails, so the caller
 * never proceeds with a NULL mm->pgd.
 */
static inline int mm_alloc_pgd(struct mm_struct *mm)
{
	mm->pgd = pgd_alloc(mm);
	/* pgd_alloc() can fail; report that instead of claiming success. */
	if (!mm->pgd)
		return -ENOMEM;
	return 0;
}
pgd_alloc 里面除了分配 PGD 之外,还做了很重要的一件事情,就是调用 pgd_ctor
pgd_ctor 函数
\linux-4.13.16\arch\x86\mm\pgtable.c
/*
 * Constructor step for a newly allocated pgd (abridged excerpt).
 *
 * When the page-table configuration matches one of the cases tested
 * below, KERNEL_PGD_PTRS entries starting at index KERNEL_PGD_BOUNDARY
 * are cloned from swapper_pg_dir into the new pgd.
 *
 * mm  - memory descriptor the pgd belongs to
 * pgd - the page directory being set up
 */
static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
{
/* If the pgd points to a shared pagetable level (either the
ptes in non-PAE, or shared PMD in PAE), then just copy the
references from swapper_pg_dir. */
if (CONFIG_PGTABLE_LEVELS == 2 ||
(CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
CONFIG_PGTABLE_LEVELS >= 4) {
clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
swapper_pg_dir + KERNEL_PGD_BOUNDARY,
KERNEL_PGD_PTRS);
}
......
}
pgd_ctor 干了什么:从代码里的注释可以看出,它把 swapper_pg_dir 中内核部分的页目录项引用拷贝到了新进程的 pgd 里。
只有访问虚拟内存的时候,发现没有映射到物理内存,页表也没有创建过,才触发缺页异常。进入内核调用 do_page_fault,一直调用到 __handle_mm_fault,原来没有创建过页表,于是__handle_mm_fault 调用 pud_alloc 和 pmd_alloc,来创建相应的页目录项,最后调用 handle_pte_fault 来创建页表项。
handle_pte_fault 函数
\linux-4.13.16\mm\memory.c
/*
 * Dispatch a fault at the page-table-entry level (abridged excerpt).
 *
 * Visible cases:
 *   - no PTE (vmf->pte is NULL): do_anonymous_page() for an anonymous
 *     VMA, do_fault() otherwise;
 *   - the saved PTE is not present: do_swap_page().
 *
 * NOTE(review): as quoted, *vmf->pte is read before the NULL test on
 * vmf->pte; presumably the elided lines contain the conditions that make
 * this safe - confirm against the full kernel source.
 */
static int handle_pte_fault(struct vm_fault *vmf)
{
pte_t entry;
......
vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
/* Keep a copy of the entry as it was found. */
vmf->orig_pte = *vmf->pte;
......
if (!vmf->pte) {
if (vma_is_anonymous(vmf->vma))
return do_anonymous_page(vmf);
else
return do_fault(vmf);
}
if (!pte_present(vmf->orig_pte))
return do_swap_page(vmf);
......
}
第一种情况 do_anonymous_page 函数(匿名页映射,最终通过 __alloc_pages_nodemask 分配物理页)
\linux-4.13.16\mm\memory.c
/*
 * Handle a fault on an anonymous mapping (abridged excerpt).
 *
 * Visible flow: make sure a page table exists (pte_alloc), obtain a page
 * with alloc_zeroed_user_highpage_movable(), build a PTE for it from the
 * VMA's page protection, mark it dirty and writable when the VMA has
 * VM_WRITE, then install the entry with set_pte_at().
 *
 * Returns VM_FAULT_OOM if pte_alloc() reports failure; the other return
 * paths are in the elided lines.
 */
static int do_anonymous_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct mem_cgroup *memcg;
struct page *page;
int ret = 0;
pte_t entry;
......
if (pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))
return VM_FAULT_OOM;
......
page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
......
/* Build the entry that points at the new page. */
entry = mk_pte(page, vma->vm_page_prot);
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));
/* Map the PTE slot for this address; the lock is stored in vmf->ptl. */
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
......
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
......
}
do_anonymous_page 先通过 pte_alloc 分配页表,再通过 alloc_zeroed_user_highpage_movable 分配物理页,它最终会调用到 __alloc_pages_nodemask,也就是伙伴系统的核心函数,专门用来分配物理页面。接着调用 mk_pte,将页表项指向新分配的物理页,set_pte_at 会将页表项塞到页表里面。
第二种情况 __do_fault 函数
\linux-4.13.16\mm\memory.c
/*
 * Call the VMA's ->fault handler (abridged excerpt).
 *
 * vmf - fault descriptor; vmf->vma->vm_ops->fault is the handler invoked
 *
 * Returns the value stored in `ret`, which the visible code assigns from
 * the handler's return value.
 */
static int __do_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
int ret;
......
ret = vma->vm_ops->fault(vmf);
......
return ret;
}
这里调用了struct vm_operations_struct vm_ops的fault函数。
对于ext4文件系统,vm_ops指向了ext4_file_vm_ops也就是调用了函数ext4_filemap_fault
/*
 * VMA operations table installed by ext4_file_mmap(); its .fault entry is
 * the handler reached through vma->vm_ops->fault.
 */
static const struct vm_operations_struct ext4_file_vm_ops = {
.fault = ext4_filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = ext4_page_mkwrite,
};
/*
 * ext4's ->fault handler (abridged excerpt).
 *
 * Looks up the inode of the file backing the faulting VMA and delegates
 * to filemap_fault(), whose result is returned. The declaration of `err`
 * is among the elided lines.
 */
int ext4_filemap_fault(struct vm_fault *vmf)
{
struct inode *inode = file_inode(vmf->vma->vm_file);
......
err = filemap_fault(vmf);
......
return err;
}
vm_file 就是当时 mmap 的时候映射的那个文件,然后调用 filemap_fault
filemap_fault 函数
\linux-4.13.16\mm\filemap.c
/*
 * Fault handler for file-backed mappings (abridged excerpt).
 *
 * Looks for the page at vmf->pgoff in the file's mapping with
 * find_get_page(). If the page is found and FAULT_FLAG_TRIED is not set,
 * do_async_mmap_readahead() is called; if no page is found, control jumps
 * to no_cached_page, where page_cache_read() is called. On the found path
 * the page is stored in vmf->page and `ret | VM_FAULT_LOCKED` is returned.
 *
 * NOTE(review): `ra` is used but not declared in the visible lines;
 * presumably it is declared in the elided code.
 */
int filemap_fault(struct vm_fault *vmf)
{
int error;
struct file *file = vmf->vma->vm_file;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
pgoff_t offset = vmf->pgoff;
struct page *page;
int ret = 0;
......
page = find_get_page(mapping, offset);
if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
do_async_mmap_readahead(vmf->vma, ra, file, page, offset);
} else if (!page) {
goto no_cached_page;
}
......
vmf->page = page;
return ret | VM_FAULT_LOCKED;
no_cached_page:
error = page_cache_read(file, offset, vmf->gfp_mask);
......
}
对于文件映射来说,一般这个文件会在物理内存里面有页面作为它的缓存,find_get_page 就是找那个页。如果找到了,就调用 do_async_mmap_readahead,预读一些数据到内存里面;如果没有,就跳到 no_cached_page,调用 page_cache_read 读取。
第三种情况 do_swap_page 函数
物理内存页如果长时间不用,就会被换出到硬盘,也就是 swap。现在这部分数据又要被访问了,就通过 do_swap_page 再读到内存中来
\linux-4.13.16\mm\memory.c
/*
 * Handle a fault on a page that has been swapped out (abridged excerpt).
 *
 * Visible flow: decode the swap entry from the saved PTE, look the page
 * up with lookup_swap_cache() and fall back to swapin_readahead() when it
 * is not found, build a new PTE for the page, install it with
 * set_pte_at(), record it in vmf->orig_pte, and release the swap entry
 * with swap_free(). The return statement is in the elided lines.
 */
int do_swap_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct page *page, *swapcache;
struct mem_cgroup *memcg;
swp_entry_t entry;
pte_t pte;
......
entry = pte_to_swp_entry(vmf->orig_pte);
......
page = lookup_swap_cache(entry);
if (!page) {
/* Not found by the lookup above: read it in. */
page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma,
vmf->address);
......
}
......
swapcache = page;
......
pte = mk_pte(page, vma->vm_page_prot);
......
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
vmf->orig_pte = pte;
......
swap_free(entry);
......
}
- 1、do_swap_page函数会先查找 swap 文件有没有缓存页
- 2、如果没有,就调用swapin_readahead,将 swap 文件读到内存中来,形成内存页,并通过 mk_pte 生成页表项。
- 3、set_pte_at 将页表项插入页表,将 swap 文件清理。因为重新加载回内存了,不再需要 swap 文件了。
- 4、swapin_readahead 最终会调用 swap_readpage,在这里又看到了熟悉的 readpage 函数,也就是说读取普通文件和读取 swap 文件的过程是一样的,同样需要用 kmap_atomic 做临时映射。
/*
 * Read one page back from swap (abridged excerpt).
 *
 * When the swap area has SWP_FILE set, the page is read through the
 * readpage method of the swap file's address_space operations and that
 * result is returned. Handling for the other case is in the elided lines.
 *
 * page    - page to fill
 * do_poll - not used in the visible part of the quote
 */
int swap_readpage(struct page *page, bool do_poll)
{
struct bio *bio;
int ret = 0;
struct swap_info_struct *sis = page_swap_info(page);
blk_qc_t qc;
struct block_device *bdev;
......
if (sis->flags & SWP_FILE) {
struct file *swap_file = sis->swap_file;
struct address_space *mapping = swap_file->f_mapping;
/* Same readpage method that is used for reading a regular file. */
ret = mapping->a_ops->readpage(swap_file, page);
return ret;
}
......
}
通过上面复杂的过程,用户态缺页异常处理完毕了,物理内存中有了页面,页表也建立好了映射。接下来用户程序在虚拟内存空间里面,就可以通过虚拟地址,经过页表映射,顺利访问物理页面上的数据了。
TLB
页表一般都很大,只能存放在内存中。操作系统每次访问内存都要折腾两步,先通过查询页表得到物理地址,然后访问该物理地址读取指令、数据。
为了提高映射速度,引入了 TLB(Translation Lookaside Buffer),称为快表,专门用来做地址映射的硬件设备。
有了 TLB 之后,先查快表,快表中有映射关系,然后直接转换为物理地址。如果在 TLB 查不到映射关系时,才会到内存中查询页表。
图片来自极客时间趣谈linux操作系统
总结
用户态内存映射函数 mmap,包括用它来做匿名映射和文件映射。
用户态的页表结构,存储位置在 mm_struct 中。
在用户态访问没有映射的内存会引发缺页异常,分配物理页表、补齐页表。如果是匿名映射则分配物理内存;如果是 swap,则将 swap 文件读入;如果是文件映射,则将文件读入。
图片来自极客时间趣谈linux操作系统
参考资料:
趣谈Linux操作系统(极客时间)链接:
http://gk.link/a/10iXZ
欢迎大家来一起交流学习