一、内存使用方法
1.创建内存映射
#include <sys/mman.h>
void *mmap(void *addr,size_t length,int prot,int flags,int fd,off_t offset);
mmap:进程创建匿名的内存映射,把内存的物理页映射到进程的虚拟地址空间。进程把文件映射到进程的虚拟地址空间,可以像访问内存一样访问文件,不需要调用系统调用read()/write()访问文件,从而避免用户模式和内核模式之间的切换,提高读写文件速度。两个进程针对同一个文件创建共享的内存映射,实现共享内存。
2.删除内存映射
#include <sys/mman.h>
int munmap(void *addr, size_t len);
mumap:该调用在进程地址空间中解除一个映射关系,addr是调用mmap()时返回的地址,len是映射区的大小。当映射关系解除后,对原来映射地址的访问将导致段错误发生。
3.设置虚拟内存区域的访问权限
#include <sys/mman.h>
int mprotect(void *addr, size_t len, int prot);
mprotect:把自start开始的、长度为len的内存区的保护属性修改为prot指定的值。
prot可以取以下几个值,并且可以用“|”将几个属性合起来使用:
1)PROT_READ:表示内存段内的内容可写;
2)PROT_WRITE:表示内存段内的内容可读;
3)PROT_EXEC:表示内存段中的内容可执行;
4)PROT_NONE:表示内存段中的内容根本没法访问。
需要指出的是,指定的内存区间必须包含整个内存页(4K)。区间开始的地址start必须是一个内存页的起始地址,并且区间长度len必须是页大小的整数倍。
二、mmap的系统调用
0.查找mmap在内核中的系统调用函数
我现在用的内核版是4.19.40,首先在应用层参考上面解析编写一个mmap使用代码,然后编译成程序,在使用strace工具跟踪其函数调用,可以发现mmap也是调用底层的mmap系统调用,然后我们寻找一下底层的带6个参数的mmap系统调用有哪些:
可以看到,arm64和X86的系统调用位于不同文件。
1.mmap的系统调用
x86的位于arch/x86/kernel/sys_x86_64.c文件,如下所示:
SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
unsigned long, prot, unsigned long, flags,
unsigned long, fd, unsigned long, off)
{
long error;
error = -EINVAL;
//检查偏移是不是页的整数倍,如果不是页的整数倍,直接返回-EINVAL
//如果是也得整数倍,那么把偏移转换成页为单位的偏移,然后继续往下走
if (off & ~PAGE_MASK)
goto out;
error = ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
out:
return error;
}
arm64的位于arch/arm64/kernel/sys.c文件,如下所示:
SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
unsigned long, prot, unsigned long, flags,
unsigned long, fd, off_t, off)
{
//检查偏移是不是页的整数倍,如果不是页的整数倍,直接返回-EINVAL
//如果是也得整数倍,那么把偏移转换成页为单位的偏移,然后继续往下走
if (offset_in_page(off) != 0)
return -EINVAL;
return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
}
然后都是进入ksys_mmap_pgoff:
unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
unsigned long prot, unsigned long flags,
unsigned long fd, unsigned long pgoff)
{
struct file *file = NULL;
unsigned long retval;
if (!(flags & MAP_ANONYMOUS)) {//如果不是匿名映射
audit_mmap_fd(fd, flags);
file = fget(fd);//文件映射,根据文件描述符在进程的打开文件表中找到file实例
if (!file)
return -EBADF;
if (is_file_hugepages(file))
len = ALIGN(len, huge_page_size(hstate_file(file)));
retval = -EINVAL;
if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file)))
goto out_fput;
} else if (flags & MAP_HUGETLB) {//如果是匿名巨型页映射
struct user_struct *user = NULL;
struct hstate *hs;
hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
if (!hs)
return -EINVAL;
len = ALIGN(len, huge_page_size(hs));
/*
* VM_NORESERVE is used because the reservations will be
* taken when vm_ops->mmap() is called
* A dummy user value is used because we are not locking
* memory so no accounting is necessary
*/
//在hugetlbfs文件系统中创建文件“anon_hugepage”
file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
VM_NORESERVE,
&user, HUGETLB_ANONHUGE_INODE,
(flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
if (IS_ERR(file))
return PTR_ERR(file);
}
flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
out_fput:
if (file)
fput(file);
return retval;
}
然后进入vm_mmap_pgoff:
unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flag, unsigned long pgoff)
{
unsigned long ret;
struct mm_struct *mm = current->mm;
unsigned long populate;
LIST_HEAD(uf);//初始化userfaultfd链表
ret = security_mmap_file(file, prot, flag);//security linux安全相关的,一般不会开,返回值为0
if (!ret) {
if (down_write_killable(&mm->mmap_sem))//以写者身份申请读写信号量
return -EINTR;
ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,//创建内存映射主要工作在此函数中进行
&populate, &uf);
up_write(&mm->mmap_sem);//释放读写信号量
userfaultfd_unmap_complete(mm, &uf);//等待userfaultfd处理完成
if (populate)
mm_populate(ret, populate);//如果调用者要求把也锁定在内存中,或要求
}
return ret;
}
我们讲解最重要的do_mmap_pgoff函数:
static inline unsigned long
do_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot, unsigned long flags,
unsigned long pgoff, unsigned long *populate,
struct list_head *uf)
{
return do_mmap(file, addr, len, prot, flags, 0, pgoff, populate, uf);
}
然后进入do_mmap:
/*
* The caller must hold down_write(¤t->mm->mmap_sem).
*/
unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flags, vm_flags_t vm_flags,
unsigned long pgoff, unsigned long *populate,
struct list_head *uf)
{
struct mm_struct *mm = current->mm;
int pkey = 0;
*populate = 0;
if (!len)
return -EINVAL;
/*
* Does the application expect PROT_READ to imply PROT_EXEC?
*
* (the exception is when the underlying filesystem is noexec
* mounted, in which case we dont add PROT_EXEC.)
*/ //如果进程带有READ_IMPLIES_EXEC标记且文件系统是可执行的,则这段内存空间使用READ的属性会附带增加EXEC属性。
if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
if (!(file && path_noexec(&file->f_path)))
prot |= PROT_EXEC;
/* force arch specific MAP_FIXED handling in get_unmapped_area */
if (flags & MAP_FIXED_NOREPLACE)
flags |= MAP_FIXED;
//如果不是使用固定地址,则使用的addr会进行向下页对齐
if (!(flags & MAP_FIXED))
addr = round_hint_to_min(addr);
/* Careful about overflows.. */
len = PAGE_ALIGN(len);//申请内存大小页对齐
if (!len)
return -ENOMEM;
/* offset overflow? */
if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)//判断申请的内存是否溢出
return -EOVERFLOW;
/* Too many mappings? */
if (mm->map_count > sysctl_max_map_count)//判断内存映射次数是否达到上限
return -ENOMEM;
/* Obtain the address to map to. we verify (or select) it and ensure
* that it represents a valid section of the address space.
*/
addr = get_unmapped_area(file, addr, len, pgoff, flags);//获取未映射的地址空间并且验证它
if (offset_in_page(addr))
return addr;
if (flags & MAP_FIXED_NOREPLACE) {
struct vm_area_struct *vma = find_vma(mm, addr);
if (vma && vma->vm_start < addr + len)
return -EEXIST;
}
if (prot == PROT_EXEC) {
pkey = execute_only_pkey(mm);
if (pkey < 0)
pkey = 0;
}
/* Do simple checking here so the lower-level routines won't have
* to. we assume access permissions have been handled by the open
* of the memory object, so we don't do any here.
*/
//计算虚拟内存的标志并且下面做一些简单的检查
vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
if (flags & MAP_LOCKED)
if (!can_do_mlock())
return -EPERM;
if (mlock_future_check(mm, vm_flags, len))
return -EAGAIN;
//从下面的代码可以看到,各种映射之间,主要是vm_flag之间的差别
if (file) {
struct inode *inode = file_inode(file);
unsigned long flags_mask;
if (!file_mmap_ok(file, inode, pgoff, len))
return -EOVERFLOW;
flags_mask = LEGACY_MAP_MASK | file->f_op->mmap_supported_flags;
switch (flags & MAP_TYPE) {
case MAP_SHARED:
/*
* Force use of MAP_SHARED_VALIDATE with non-legacy
* flags. E.g. MAP_SYNC is dangerous to use with
* MAP_SHARED as you don't know which consistency model
* you will get. We silently ignore unsupported flags
* with MAP_SHARED to preserve backward compatibility.
*/
flags &= LEGACY_MAP_MASK;
/* fall through */
case MAP_SHARED_VALIDATE:
if (flags & ~flags_mask)
return -EOPNOTSUPP;
if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
return -EACCES;
/*
* Make sure we don't allow writing to an append-only
* file..
*/
if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
return -EACCES;
/*
* Make sure there are no mandatory locks on the file.
*/
if (locks_verify_locked(file))
return -EAGAIN;
vm_flags |= VM_SHARED | VM_MAYSHARE;
if (!(file->f_mode & FMODE_WRITE))
vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
/* fall through */
case MAP_PRIVATE:
if (!(file->f_mode & FMODE_READ))
return -EACCES;
if (path_noexec(&file->f_path)) {
if (vm_flags & VM_EXEC)
return -EPERM;
vm_flags &= ~VM_MAYEXEC;
}
if (!file->f_op->mmap)
return -ENODEV;
if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
return -EINVAL;
break;
default:
return -EINVAL;
}
} else {
switch (flags & MAP_TYPE) {
case MAP_SHARED:
if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
return -EINVAL;
/*
* Ignore pgoff.
*/
pgoff = 0;
vm_flags |= VM_SHARED | VM_MAYSHARE;
break;
case MAP_PRIVATE:
/*
* Set pgoff according to addr for anon_vma.
*/
pgoff = addr >> PAGE_SHIFT;
break;
default:
return -EINVAL;
}
}
/*
* Set 'VM_NORESERVE' if we should not account for the
* memory use of this mapping.
*/
if (flags & MAP_NORESERVE) {
/* We honor MAP_NORESERVE if allowed to overcommit */
if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
vm_flags |= VM_NORESERVE;
/* hugetlb applies strict overcommit unless MAP_NORESERVE */
if (file && is_file_hugepages(file))
vm_flags |= VM_NORESERVE;
}
addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);//真正创建虚拟内存区域
if (!IS_ERR_VALUE(addr) &&
((vm_flags & VM_LOCKED) ||
(flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
*populate = len;
return addr;
}
do_mmap_pgoff这个函数主要做了两件事,get_unmapped_area获取未映射地址,mmap_region映射。
先看下get_unmapped_area ,他是先找到mm_struct的get_unmapped_area成员,再去执行他:
unsigned long
get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags)
{
......
get_area = current->mm->get_unmapped_area;
......
addr = get_area(file, addr, len, pgoff, flags);
......
}
再看mmap_region的实现:
unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
struct list_head *uf)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev;
int error;
struct rb_node **rb_link, *rb_parent;
unsigned long charged = 0;
/* Check against address space limit. */
//检查进程虚拟内存限制
if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
unsigned long nr_pages;
/*
* MAP_FIXED may remove pages of mappings that intersects with
* requested mapping. Account for the pages it would unmap.
*/
nr_pages = count_vma_pages_range(mm, addr, addr + len);
if (!may_expand_vm(mm, vm_flags,
(len >> PAGE_SHIFT) - nr_pages))
return -ENOMEM;
}
/* Clear old maps */
//检查是否和旧的虚拟内存区域有重叠并且把重叠部分释放掉
while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
&rb_parent)) {
if (do_munmap(mm, addr, len, uf))
return -ENOMEM;
}
/*
* Private writable mapping: check memory availability
*/
//如果有私有的可写的内存映射,可以先修改标志位
if (accountable_mapping(file, vm_flags)) {
charged = len >> PAGE_SHIFT;
if (security_vm_enough_memory_mm(mm, charged))
return -ENOMEM;
vm_flags |= VM_ACCOUNT;
}
/*
* Can we just expand an old mapping?
*/
//如果可以和已有的虚拟内存合并,则合并并且使用该vma,然后结束
vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX);
if (vma)
goto out;
//如果不可以合并,继续往下走
/*
* Determine the object being mapped and call the appropriate
* specific mapper. the address has already been validated, but
* not unmapped, but the maps are removed from the list.
*/
//创建一个虚拟内存区域,也就是申请一个vma
vma = vm_area_alloc(mm);
if (!vma) {
error = -ENOMEM;
goto unacct_error;
}
vma->vm_start = addr;
vma->vm_end = addr + len;
vma->vm_flags = vm_flags;
vma->vm_page_prot = vm_get_page_prot(vm_flags);
vma->vm_pgoff = pgoff;
if (file) {//文件映射
if (vm_flags & VM_DENYWRITE) {
error = deny_write_access(file);
if (error)
goto free_vma;
}
if (vm_flags & VM_SHARED) {
error = mapping_map_writable(file->f_mapping);
if (error)
goto allow_write_and_free_vma;
}
/* ->mmap() can change vma->vm_file, but must guarantee that
* vma_link() below can deny write-access if VM_DENYWRITE is set
* and map writably if VM_SHARED is set. This usually means the
* new file must not have been exposed to user-space, yet.
*/
vma->vm_file = get_file(file);
error = call_mmap(file, vma);//调用文件系统提供的文件映射函数
if (error)
goto unmap_and_free_vma;
/* Can addr have changed??
*
* Answer: Yes, several device drivers can do it in their
* f_op->mmap method. -DaveM
* Bug: If addr is changed, prev, rb_link, rb_parent should
* be updated for vma_link()
*/
WARN_ON_ONCE(addr != vma->vm_start);
//文件系统提供的文件映射函数可能会修改映射的一些参数。在这里需要在调用vma_link前回置
addr = vma->vm_start;
vm_flags = vma->vm_flags;
} else if (vm_flags & VM_SHARED) {//匿名映射
error = shmem_zero_setup(vma);//这里是映射到了/dev/zero这个文件,很巧妙,不需要提前将页面清0
if (error)
goto free_vma;
} else {
vma_set_anonymous(vma);
}
vma_link(mm, vma, prev, rb_link, rb_parent);//将vma链接回进程的mm_struct结构体中
/* Once vma denies write, undo our temporary denial count */
if (file) {
if (vm_flags & VM_SHARED)
mapping_unmap_writable(file->f_mapping);
if (vm_flags & VM_DENYWRITE)
allow_write_access(file);
}
file = vma->vm_file;
out:
perf_event_mmap(vma);//perf在这里安插了个event
vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);//进程内存状态统计,在开启了proc才会有
if (vm_flags & VM_LOCKED) {
if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
is_vm_hugetlb_page(vma) ||
vma == get_gate_vma(current->mm))
vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
else
mm->locked_vm += (len >> PAGE_SHIFT);
}
if (file)
uprobe_mmap(vma);
/*
* New (or expanded) vma always get soft dirty status.
* Otherwise user-space soft-dirty page tracker won't
* be able to distinguish situation when vma area unmapped,
* then new mapped in-place (which must be aimed as
* a completely new data area).
*/
vma->vm_flags |= VM_SOFTDIRTY;//设置标志位为脏状态
vma_set_page_prot(vma);//新映射的区域必须将其设置为一个全新的数据区域(脏)
return addr;
unmap_and_free_vma:
vma->vm_file = NULL;
fput(file);
/* Undo any partial mapping done by a device driver. */
unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
charged = 0;
if (vm_flags & VM_SHARED)
mapping_unmap_writable(file->f_mapping);
allow_write_and_free_vma:
if (vm_flags & VM_DENYWRITE)
allow_write_access(file);
free_vma:
vm_area_free(vma);
unacct_error:
if (charged)
vm_unacct_memory(charged);
return error;
}
现在,我们看看匿名映射的函数shmem_zero_setup到底做了什么,其实匿名页实际也映射了文件,只是映射到了/dev/zero上,这样有个好处是,不需要对所有页面进行提前置0,只有当访问到某具体页面的时候才会申请一个0页。
/**
* shmem_zero_setup - setup a shared anonymous mapping
* @vma: the vma to be mmapped is prepared by do_mmap_pgoff
*/
int shmem_zero_setup(struct vm_area_struct *vma)
{
struct file *file;
loff_t size = vma->vm_end - vma->vm_start;
/*
* Cloning a new file under mmap_sem leads to a lock ordering conflict
* between XFS directory reading and selinux: since this file is only
* accessible to the user through its mapping, use S_PRIVATE flag to
* bypass file security, in the same way as shmem_kernel_file_setup().
*/
file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags);
if (IS_ERR(file))
return PTR_ERR(file);
if (vma->vm_file)
fput(vma->vm_file);
vma->vm_file = file;
vma->vm_ops = &shmem_vm_ops;
if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
(vma->vm_end & HPAGE_PMD_MASK)) {
khugepaged_enter(vma, vma->vm_flags);
}
return 0;
}
其实说白了,mmap就是在进程mm中创建或者扩展一个vma映射到某个文件,而共享、私有、文件、匿名这些mmap所具有的属性是在哪里体现的呢?上面的源码在不断的设置一些标记位,这些标记位就决定了进程在访问这些内存时内核的行为,mmap仅负责创建一个映射而已。
下面是mmap系统调用的函数调用以及返回情况说明:
SYSCALL_DEFINE6(mmap,
offset_in_page(off) //检查偏移是不是页的整数倍,
ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
if (!(flags & MAP_ANONYMOUS)) {//如果不是匿名映射
fget(fd);//文件映射,根据文件描述符在进程的打开文件表中找到file实例
else if (flags & MAP_HUGETLB) {//如果是匿名巨型页映射
hugetlb_file_setup//在hugetlbfs文件系统中创建文件“anon_hugepage”
vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);//
LIST_HEAD(uf);//初始化userfaultfd链表
security_mmap_file//security linux安全相关的,一般不会开,返回值为0
down_write_killable(&mm->mmap_sem))//以写者身份申请读写信号量
do_mmap_pgoff//创建内存映射主要工作在此函数中进行
do_mmap(file, addr, len, prot, flags, 0, pgoff, populate, uf);
round_hint_to_min(addr);//如果不是使用固定地址,则使用的addr会进行向下页对齐
PAGE_ALIGN(len);//申请内存大小页对齐
//判断申请的内存是否溢出
//判断内存映射次数是否达到上限
get_unmapped_area//获取未映射的地址空间并且验证它
find_vma(mm, addr);//查找申请的虚拟地址所在的vma,用来判断vma是否合理
calc_vm_prot_bits//计算虚拟内存的标志并且下面做一些简单的检查
//根据申请的内存确定vm_flag
mmap_region//真正创建虚拟内存区域
//检查进程虚拟内存大小是否超出限制
while(find_vma_links)//检查是否和旧的虚拟内存区域有重叠
do_munmap//并且把重叠部分释放掉
accountable_mapping//如果有私有的可写的内存映射,可以先修改标志位
vma_merge//如果可以和已有的虚拟内存合并,则合并并且使用该vma,然后结束
//如果不可以合并,继续往下走
vm_area_alloc//创建一个虚拟内存区域,也就是申请一个vma
if (file) {//文件映射
call_mmap(file, vma);//调用文件系统提供的文件映射函数
else if (vm_flags & VM_SHARED) {//匿名映射
shmem_zero_setup(vma);//匿名映射操作函数
shmem_kernel_file_setup("dev/zero", size, vma->vm_flags);//这里是映射到了/dev/zero这个文件
vma_link//将vma链接回进程的mm_struct结构体中
perf_event_mmap(vma);//perf在这里安插了个event
vm_stat_account//进程内存状态统计,在开启了proc才会有
vma_set_page_prot(vma);//新映射的区域必须将其设置为一个全新的数据区域(脏)
up_write(&mm->mmap_sem);//释放读写信号量
mm_populate(ret, populate);//如果调用者要求把也锁定在内存中,或要求