一、munmap的系统调用
0.查找munmap在内核中的系统调用函数
#include <sys/mman.h>
int munmap(void *addr, size_t len);
我现在用的内核版本是4.19.40。首先在应用层参考上面的接口说明编写一段使用munmap的代码,编译成程序后,再使用strace工具跟踪其系统调用,可以发现munmap在底层也是调用同名的munmap系统调用。接下来我们在内核源码中寻找带2个参数的munmap系统调用定义:
1.munmap的系统调用
/* munmap(2) entry point: strips off the syscall ABI and delegates the
 * real work to vm_munmap(). */
SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
{
profile_munmap(addr);//wake the munmap_notifier chain (kernel notifier list) before unmapping
return vm_munmap(addr, len);
}
vm_munmap函数跟上次分析的vm_mmap_pgoff函数很相似:
/* In-kernel counterpart of munmap(): takes the mmap semaphore for writing,
 * runs do_munmap(), then hands any collected userfaultfd ranges off after
 * the lock is dropped. Returns 0 or a negative errno. */
int vm_munmap(unsigned long start, size_t len)
{
int ret;
struct mm_struct *mm = current->mm;
LIST_HEAD(uf);//initialize the local list that collects userfaultfd-registered ranges
if (down_write_killable(&mm->mmap_sem))//acquire the mmap rwsem as a writer (interruptible by fatal signals)
return -EINTR;
ret = do_munmap(mm, start, len, &uf);
up_write(&mm->mmap_sem);//release the mmap rwsem
userfaultfd_unmap_complete(mm, &uf);//report the unmapped ranges collected in uf to userfaultfd
return ret;
}
然后进入do_munmap:
/* Munmap is split into 2 main parts -- this part which finds
 * what needs doing, and the areas themselves, which do the
 * work. This now handles partial unmappings.
 * Jeremy Fitzhardinge <jeremy@goop.org>
 */
int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
struct list_head *uf)
{
unsigned long end;
struct vm_area_struct *vma, *prev, *last;
//reject a start that is not page aligned, or a range outside the task's address space
if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start)
return -EINVAL;
//the length to munmap is also rounded up to page-size alignment
len = PAGE_ALIGN(len);
if (len == 0)
return -EINVAL;
/* Find the first overlapping VMA */
vma = find_vma(mm, start);//find which vma the start address falls in
if (!vma)
return 0;
prev = vma->vm_prev;
/* we have start < vma->vm_end */
/* if it doesn't overlap, we have nothing.. */
//if the end of the range to free is not above this vma's start address,
//the two do not overlap at all, so there is nothing to do
end = start + len;
if (vma->vm_start >= end)
return 0;
/*
 * If we need to split any vma, do it now to save pain later.
 *
 * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
 * unmapped vm_area_struct will remain in use: so lower split_vma
 * places tmp vma above, and higher split_vma places tmp vma below.
 */
//the start of the range to free lies in the middle of a vma, not at its head
if (start > vma->vm_start) {
int error;
/*
 * Make sure that map_count on return from munmap() will
 * not exceed its limit; but let map_count go just above
 * its limit temporarily, to help free resources as expected.
 */
//ensure map_count will not end up above its limit
if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
return -ENOMEM;
//split the vma in two at start, since only the part after start is to be freed
error = __split_vma(mm, vma, start, 0);
if (error)
return error;
prev = vma;
}
/* Does it split the last one? */
last = find_vma(mm, end);//find which vma the end address falls in
if (last && end > last->vm_start) {
//split the vma in two at end, since only the part before end is to be freed
int error = __split_vma(mm, last, end, 1);//split the memory area
if (error)
return error;
}
vma = prev ? prev->vm_next : mm->mmap;
if (unlikely(uf)) {
/*
 * If userfaultfd_unmap_prep returns an error the vmas
 * will remain splitted, but userland will get a
 * highly unexpected error anyway. This is no
 * different than the case where the first of the two
 * __split_vma fails, but we don't undo the first
 * split, despite we could. This is unlikely enough
 * failure that it's not worth optimizing it for.
 */
int error = userfaultfd_unmap_prep(vma, start, end, uf);
if (error)
return error;
}
/*
 * unlock any mlock()ed ranges before detaching vmas
 */
if (mm->locked_vm) {//this mm has mlock()ed pages somewhere
struct vm_area_struct *tmp = vma;
while (tmp && tmp->vm_start < end) {
if (tmp->vm_flags & VM_LOCKED) {
mm->locked_vm -= vma_pages(tmp);
munlock_vma_pages_all(tmp);//undo the mlock on this vma's pages
}
tmp = tmp->vm_next;
}
}
/*
 * Remove the vma's, and unmap the actual pages
 */
detach_vmas_to_be_unmapped(mm, vma, prev, end);//take the doomed vmas off the mm's list and rbtree
unmap_region(mm, vma, prev, start, end);//tear down the page-table entries and TLB mappings for the range
arch_unmap(mm, vma, start, end);//architecture-specific unmap hook
/* Fix up all other VM information */
remove_vma_list(mm, vma);//free the detached vmas and fix up the accounting
return 0;
}
这里主要看detach_vmas_to_be_unmapped,unmap_region,arch_unmap,remove_vma_list这4个函数。
先看detach_vmas_to_be_unmapped是怎么把要删除的vma区域移出红黑树的:
/*
 * Create a list of vma's touched by the unmap, removing them from the mm's
 * vma list as we go..
 */
static void
detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev, unsigned long end)
{
struct vm_area_struct **insertion_point;
struct vm_area_struct *tail_vma = NULL;
insertion_point = (prev ? &prev->vm_next : &mm->mmap);
vma->vm_prev = NULL;
do {
vma_rb_erase(vma, &mm->mm_rb);//remove each vma from the rbtree one by one
mm->map_count--;
tail_vma = vma;
vma = vma->vm_next;
} while (vma && vma->vm_start < end);
*insertion_point = vma;//relink the mm's list so it skips the detached vmas
if (vma) {
vma->vm_prev = prev;
vma_gap_update(vma);
} else
mm->highest_vm_end = prev ? vm_end_gap(prev) : 0;
tail_vma->vm_next = NULL;//terminate the detached chain
/* Kill the cache */
vmacache_invalidate(mm);//invalidate the per-thread vma lookup cache
}
再看unmap_region是怎么删除映射的:
/*
 * Get rid of page table information in the indicated region.
 *
 * Called with the mm semaphore held.
 */
static void unmap_region(struct mm_struct *mm,
struct vm_area_struct *vma, struct vm_area_struct *prev,
unsigned long start, unsigned long end)
{
struct vm_area_struct *next = prev ? prev->vm_next : mm->mmap;
struct mmu_gather tlb;
lru_add_drain();//drain the current CPU's LRU pagevecs
tlb_gather_mmu(&tlb, mm, start, end);//initialize an mmu_gather used to tear down the page tables
update_hiwater_rss(mm);//update the RSS high-water mark (peak count of resident pages)
unmap_vmas(&tlb, vma, start, end);//clear all page-table entries covering this range
//free the page-table pages that the previous step emptied
free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
next ? next->vm_start : USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb, start, end);//flush the TLB and free the gathered page frames
}
再看arch_unmap是到底做了什么:
在内核源码中搜索arch_unmap可以看到,不同架构的CPU有各自不同的实现,还有很多架构没有定义这个函数,就用到了通用的arch_unmap。我们先在include/asm-generic/mm_hooks.h文件中看看通用的函数做了什么:
/* Generic fallback hook: architectures with no arch-specific unmap work
 * get this intentional no-op. */
static inline void arch_unmap(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
}
竟然是一个空函数,啥都没有做。我们再看看X86架构做了什么:
static inline void arch_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
/*
 * mpx_notify_unmap() goes and reads a rarely-hot
 * cacheline in the mm_struct. That can be expensive
 * enough to be seen in profiles.
 *
 * The mpx_notify_unmap() call and its contents have been
 * observed to affect munmap() performance on hardware
 * where MPX is not present.
 *
 * The unlikely() optimizes for the fast case: no MPX
 * in the CPU, or no MPX use in the process. Even if
 * we get this wrong (in the unlikely event that MPX
 * is widely enabled on some system) the overhead of
 * MPX itself (reading bounds tables) is expected to
 * overwhelm the overhead of getting this unlikely()
 * consistently wrong.
 */
//check whether the CPU has MPX at all (and hence whether the process could use it)
if (unlikely(cpu_feature_enabled(X86_FEATURE_MPX)))
//reads a rarely-hot cacheline in mm_struct; notifies MPX of the unmap
mpx_notify_unmap(mm, vma, start, end);
}
最后看看remove_vma_list是怎么从进程虚拟内存区域链表中删除目标vma区域的:
/*
 * Ok - we have the memory areas we should free on the vma list,
 * so release them, and do the vma updates.
 *
 * Called with the mm semaphore held.
 */
static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
{
unsigned long nr_accounted = 0;
/* Update high watermark before we lower total_vm */
update_hiwater_vm(mm);//update the peak virtual-memory usage statistic
do {
long nrpages = vma_pages(vma);
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += nrpages;//accumulate pages that were charged against the commit limit
vm_stat_account(mm, vma->vm_flags, -nrpages);
vma = remove_vma(vma);//free one vma and return the next one in the detached chain
} while (vma);
vm_unacct_memory(nr_accounted);//uncharge the accounted pages from committed memory
validate_mm(mm);//sanity-checks the mm_struct when VM debugging is enabled; otherwise a no-op
}
其实说白了,要先判断要释放的内存区域块属于以下几种情况中的哪一种:
1.要删除的内存区域块刚刚好覆盖了n(n是整数)个vma,我们只要删除这n块vma就行了。
2.要删除的内存区域块的起始位置位于某个vma中间,需要把这个vma分为2块vma,前面的vma不需要删除,后面的vma需要删除。
3.要删除的内存区域块的结束位置位于某个vma中间,需要把这个vma分为2块vma,后面的vma不需要删除,前面的vma需要删除。
4.要删除的内存区域块的起始位置和结束位置分别位于两个不同vma的中间,需要把这两个vma都一分为二:第一个vma分出的两块中,前面的vma不需要删除,后面的vma需要删除;第二个vma分出的两块中,后面的vma不需要删除,前面的vma需要删除。
下面是munmap系统调用的函数调用以及返回情况说明:
SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
profile_munmap//使用内核通知链唤醒munmap_notifier
vm_munmap
LIST_HEAD(uf);//初始化userfaultfd链表
down_write_killable(&mm->mmap_sem))//以写者身份申请读写信号量
do_munmap//主要函数
find_vma(mm, start);//找到起始地址是落在哪个vma内
//如果要释放空间的结束地址不大于vma的起始地址,说明这两者没有重叠,直接退出
if (vma->vm_start >= end) return 0;
__split_vma//如果要释放的内存起始地址在vma中间,则分裂vma
find_vma(mm, end);//找到结束地址是落在哪个vma内
__split_vma//如果要释放的内存结束地址在vma中间,则分裂vma
munlock_vma_pages_all(tmp);//如果虚拟内存锁定在内存中,则解除锁定
detach_vmas_to_be_unmapped(mm, vma, prev, end);//把要删除的vma区域移出红黑树
do{vma_rb_erase} while //从红黑树中删除一个个vma
vmacache_invalidate(mm);//使per-task的vma查找缓存失效
unmap_region(mm, vma, prev, start, end);//针对删除的目标,在进程的页表和cpu缓存中删除映射
lru_add_drain();//当前CPU实现缓存的刷新
tlb_gather_mmu(&tlb, mm, start, end);//初始化一个mmu_gather结构体,用于拆解页表
update_hiwater_rss(mm);//更新高水位线上的rss,也就是已经占用的物理页页数
unmap_vmas(&tlb, vma, start, end);//清空线性地址空间的所有页表项
free_pgtables//回收上一步已经清空的进程页表
tlb_finish_mmu(&tlb, start, end);//刷新TLB,释放页框
arch_unmap(mm, vma, start, end);//进行处理器架构的特定操作
remove_vma_list(mm, vma);//从进程虚拟内存区域中删除要删除的vma区域
update_hiwater_vm(mm);//更新高水位线上的内存使用情况
do{remove_vma}while//从进程的mm_struct的vm_area_struct链表移除删除的vma
vm_unacct_memory(nr_accounted);//减少已经使用的虚拟内存空间
validate_mm(mm);//如果开启VM调试则检查mm_struct的状态,否则啥也不干
up_write(&mm->mmap_sem);//释放读写信号量
userfaultfd_unmap_complete(mm, &uf);//等待userfaultfd处理完成