一、munmap的系统调用
0.查找munmap在内核中的系统调用函数
#include <sys/mman.h>
int munmap(void *addr, size_t len);
我现在用的内核版本是4.19.40。首先在应用层参考上面的接口说明编写一段使用munmap的代码,编译成程序后,再使用strace工具跟踪其系统调用,可以发现munmap在底层也是调用同名的munmap系统调用。接下来我们在内核源码中寻找带2个参数的munmap系统调用定义:
1.munmap的系统调用
/* munmap(2) entry point: strips off the syscall ABI and delegates the
 * real work to vm_munmap(). */
SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
{
profile_munmap(addr);//wake the munmap_notifier chain (kernel notifier list) before unmapping
return vm_munmap(addr, len);
}
vm_munmap函数跟上次分析的vm_mmap_pgoff函数很相似:
/* In-kernel counterpart of munmap(): takes the mmap semaphore for writing,
 * runs do_munmap(), then hands any collected userfaultfd ranges off after
 * the lock is dropped. Returns 0 or a negative errno. */
int vm_munmap(unsigned long start, size_t len)
{
int ret;
struct mm_struct *mm = current->mm;
LIST_HEAD(uf);//initialize the local list that collects userfaultfd-registered ranges
if (down_write_killable(&mm->mmap_sem))//acquire the mmap rwsem as a writer (interruptible by fatal signals)
return -EINTR;
ret = do_munmap(mm, start, len, &uf);
up_write(&mm->mmap_sem);//release the mmap rwsem
userfaultfd_unmap_complete(mm, &uf);//report the unmapped ranges collected in uf to userfaultfd
return ret;
}
然后进入do_munmap:
/* Munmap is split into 2 main parts -- this part which finds
 * what needs doing, and the areas themselves, which do the
 * work. This now handles partial unmappings.
 * Jeremy Fitzhardinge <jeremy@goop.org>
 */
int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
struct list_head *uf)
{
unsigned long end;
struct vm_area_struct *vma, *prev, *last;
//reject a start that is not page aligned, or a range outside the task's address space
if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start)
return -EINVAL;
//the length to munmap is also rounded up to page-size alignment
len = PAGE_ALIGN(len);
if (len == 0)
return -EINVAL;
/* Find the first overlapping VMA */
vma = find_vma(mm, start);//find which vma the start address falls in
if (!vma)
return 0;
prev = vma->vm_prev;
/* we have start < vma->vm_end */
/* if it doesn't overlap, we have nothing.. */
//if the end of the range to free is not above this vma's start address,
//the two do not overlap at all, so there is nothing to do
end = start + len;
if (vma->vm_start >= end)
return 0;
/*
 * If we need to split any vma, do it now to save pain later.
 *
 * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
 * unmapped vm_area_struct will remain in use: so lower split_vma
 * places tmp vma above, and higher split_vma places tmp vma below.
 */
//the start of the range to free lies in the middle of a vma, not at its head
if (start > vma->vm_start) {
int error;
/*
 * Make sure that map_count on return from munmap() will
 * not exceed its limit; but let map_count go just above
 * its limit temporarily, to help free resources as expected.
 */
//ensure map_count will not end up above its limit
if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
return -ENOMEM;
//split the vma in two at start, since only the part after start is to be freed
error = __split_vma(mm, vma, start, 0);
if (error)
return error;
prev = vma;
}
/* Does it split the last one? */
last = find_vma(mm, end);//find which vma the end address falls in
if (last && end > last->vm_start) {
//split the vma in two at end, since only the part before end is to be freed
int error = __split_vma(mm, last, end, 1);//split the memory area
if (error)
return error;
}
vma = prev ? prev->vm_next : mm->mmap;
if (unlikely(uf)) {
/*
 * If userfaultfd_unmap_prep returns an error the vmas
 * will remain splitted, but userland will get a
 * highly unexpected error anyway. This is no
 * different than the case where the first of the two
 * __split_vma fails, but we don't undo the first
 * split, despite we could. This is unlikely enough
 * failure that it's not worth optimizing it for.
 */
int error = userfaultfd_unmap_prep(vma, start, end, uf);
if (error)
return error;
}
/*
 * unlock any mlock()ed ranges before detaching vmas
 */
if (mm->locked_vm) {//this mm has mlock()ed pages somewhere
struct vm_area_struct *tmp = vma;
while (tmp && tmp->vm_start < end) {
if (tmp->vm_flags & VM_LOCKED) {
mm->locked_vm -= vma_pages(tmp);
munlock_vma_pages_all(tmp);//undo the mlock on this vma's pages
}
tmp = tmp->vm_next;
}
}
/*
 * Remove the vma's, and unmap the actual pages
 */
detach_vmas_to_be_unmapped(mm, vma, prev, end);//take the doomed vmas off the mm's list and rbtree
unmap_region(mm, vma, prev, start, end);//tear down the page-table entries and TLB mappings for the range
arch_unmap(mm, vma, start, end);//architecture-specific unmap hook
/* Fix up all other VM information */
remove_vma_list(mm, vma);//free the detached vmas and fix up the accounting
return 0;
}
这里主要看detach_vmas_to_be_unmapped,unmap_region,arch_unmap,remove_vma_list这4个函数。
先看detach_vmas_to_be_unmapped是怎么把要删除的vma区域移出红黑树的:
/*
 * Create a list of vma's touched by the unmap, removing them from the mm's
 * vma list as we go..
 */
static void
detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev, unsigned long end)
{
struct vm_area_struct **insertion_point;
struct vm_area_struct *tail_vma = NULL;
insertion_point = (prev ? &prev->vm_next : &mm->mmap);
vma->vm_prev = NULL;
do {
vma_rb_erase(vma, &mm->mm_rb);//remove each vma from the rbtree one by one
mm->map_count--;
tail_vma = vma;
vma = vma->vm_next;
} while (vma && vma->vm_start < end);
*insertion_point = vma;//relink the mm's list so it skips the detached vmas
if (vma) {
vma->vm_prev = prev;
vma_gap_update(vma);
} else
mm->highest_vm_end = prev ? vm_end_gap(prev) : 0;
tail_vma->vm_next = NULL;//terminate the detached chain
/* Kill the cache */
vmacache_invalidate(mm);//invalidate the per-thread vma lookup cache
}
再看unmap_region是怎么删除映射的:
/*
 * Get rid of page table information in the indicated region.
 *
 * Called with the mm semaphore held.
 */
static void unmap_region(struct mm_struct *mm,
struct vm_area_struct *vma, struct vm_area_struct *prev,
unsigned long start, unsigned long end)
{
struct vm_area_struct *next = prev ? prev->vm_next : mm->mmap;
struct mmu_gather tlb;
lru_add_drain();//drain the current CPU's LRU pagevecs
tlb_gather_mmu(&tlb, mm, start, end);//initialize an mmu_gather used to tear down the page tables
update_hiwater_rss(mm);//update the RSS high-water mark (peak count of resident pages)
unmap_vmas(&tlb, vma, start, end);//clear all page-table entries covering this range
//free the page-table pages that the previous step emptied
free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
next ? next->vm_start : USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb, start, end);//flush the TLB and free the gathered page frames
}
再看arch_unmap是到底做了什么:
在内核源码中搜索arch_unmap可以看到,不同架构的CPU有各自不同的实现,还有很多架构没有定义这个函数,就用到了通用的arch_unmap。我们先在include/asm-generic/mm_hooks.h文件中看看通用的函数做了什么:
/* Generic fallback hook: architectures with no arch-specific unmap work
 * get this intentional no-op. */
static inline void arch_unmap(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
}
竟然是一个空函数,啥都没有做。我们再看看X86架构做了什么:
static inline void arch_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
/*
 * mpx_notify_unmap() goes and reads a rarely-hot
 * cacheline in the mm_struct. That can be expensive
 * enough to be seen in profiles.
 *
 * The mpx_notify_unmap() call and its contents have been
 * observed to affect munmap() performance on hardware
 * where MPX is not present.
 *
 * The unlikely() optimizes for the fast case: no MPX
 * in the CPU, or no MPX use in the process. Even if
 * we get this wrong (in the unlikely event that MPX
 * is widely enabled on some system) the overhead of
 * MPX itself (reading bounds tables) is expected to
 * overwhelm the overhead of getting this unlikely()
 * consistently wrong.
 */
//check whether the CPU has MPX at all (and hence whether the process could use it)
if (unlikely(cpu_feature_enabled(X86_FEATURE_MPX)))
//reads a rarely-hot cacheline in mm_struct; notifies MPX of the unmap
mpx_notify_unmap(mm, vma, start, end);
}
最后看看remove_vma_list是怎么从进程虚拟内存区域链表中删除目标vma区域的:
/*
 * Ok - we have the memory areas we should free on the vma list,
 * so release them, and do the vma updates.
 *
 * Called with the mm semaphore held.
 */
static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
{
unsigned long nr_accounted = 0;
/* Update high watermark before we lower total_vm */
update_hiwater_vm(mm);//update the peak virtual-memory usage statistic
do {
long nrpages = vma_pages(vma);
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += nrpages;//accumulate pages that were charged against the commit limit
vm_stat_account(mm, vma->vm_flags, -nrpages);
vma = remove_vma(vma);//free one vma and return the next one in the detached chain
} while (vma);
vm_unacct_memory(nr_accounted);//uncharge the accounted pages from committed memory
validate_mm(mm);//sanity-checks the mm_struct when VM debugging is enabled; otherwise a no-op
}
其实说白了,要先判断要释放的内存区域块属于以下几种情况中的哪一种:
1.要删除的内存区域块刚刚好覆盖了n(n是整数)个vma,我们只要删除这n块vma就行了。
2.要删除的内存区域块的起始位置位于某个vma中间,需要把这个vma分为2块vma,前面的vma不需要删除,后面的vma需要删除。
3.要删除的内存区域块的结束位置位于某个vma中间,需要把这个vma分为2块vma,后面的vma不需要删除,前面的vma需要删除。
4.要删除的内存区域块的起始位置和结束位置分别位于两个不同vma的中间,需要把这两个vma都一分为二:第一个vma分出的两块中,前面的vma不需要删除,后面的vma需要删除;第二个vma分出的两块中,后面的vma不需要删除,前面的vma需要删除。
下面是munmap系统调用的函数调用以及返回情况说明:
SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
profile_munmap//使用内核通知链唤醒munmap_notifier
vm_munmap
LIST_HEAD(uf);//初始化userfaultfd链表
down_write_killable(&mm->mmap_sem))//以写者身份申请读写信号量
do_munmap//主要函数
find_vma(mm, start);//找到起始地址是落在哪个vma内
//如果要释放空间的结束地址不大于vma的起始地址,说明这两者没有重叠,直接退出
if (vma->vm_start >= end) return 0;
__split_vma//如果要释放的内存起始地址在vma中间,则分裂vma
find_vma(mm, end);//找到结束地址是落在哪个vma内
__split_vma//如果要释放的内存结束地址在vma中间,则分裂vma
munlock_vma_pages_all(tmp);//如果虚拟内存锁定在内存中,则解除锁定
detach_vmas_to_be_unmapped(mm, vma, prev, end);//把要删除的vma区域移出红黑树
do{vma_rb_erase} while //从红黑树中删除一个个vma
vmacache_invalidate(mm);//使per-task的vma查找缓存失效
unmap_region(mm, vma, prev, start, end);//针对删除的目标,在进程的页表和cpu缓存中删除映射
lru_add_drain();//当前CPU实现缓存的刷新
tlb_gather_mmu(&tlb, mm, start, end);//初始化一个mmu_gather结构体,用于拆解页表
update_hiwater_rss(mm);//更新高水位线上的rss,也就是已经占用的物理页页数
unmap_vmas(&tlb, vma, start, end);//清空线性地址空间的所有页表项
free_pgtables//回收上一步已经清空的进程页表
tlb_finish_mmu(&tlb, start, end);//刷新TLB,释放页框
arch_unmap(mm, vma, start, end);//进行处理器架构的特定操作
remove_vma_list(mm, vma);//从进程虚拟内存区域中删除要删除的vma区域
update_hiwater_vm(mm);//更新高水位线上的内存使用情况
do{remove_vma}while//从进程的mm_struct的vm_area_struct链表移除删除的vma
vm_unacct_memory(nr_accounted);//减少已经使用的虚拟内存空间
validate_mm(mm);//如果开启VM调试则检查mm_struct的状态,否则啥也不干
up_write(&mm->mmap_sem);//释放读写信号量
userfaultfd_unmap_complete(mm, &uf);//等待userfaultfd处理完成