【欢迎关注微信公众号:qiubinwei-1986】
趁着周六有时间,终于可以完整地把进程的虚拟内存管理这块从头到尾学下来。各类函数比较多,只能做到先mark,知其然;至于所以然,照例还是等C语言学得差不多了再回过来学习。
进程地址空间
进程地址空间是指进程可寻址的虚拟地址空间。在64位处理器的操作系统中,进程可寻址256TB的用户态地址空间;另外256TB的内核态地址空间,用户进程无法直接访问,只能通过系统调用方式间接访问。
用户空间的进程地址空间可以合法访问。地址空间中的区间称为内存区域(Memory Area),在Linux内核中采用VMA数据结构来抽象描述。
内存区域包含如下内容:
代码段映射:可执行文件中包含只读并可执行的程序头,如代码段和init段
数据段映射:可执行文件中包含可读可写的程序头,如数据段和未初始化数据段等
用户进程栈:位于用户空间的最高地址,从上往下延伸,包含栈帧,其中有局部变量和函数调用参数等。
mmap映射区域:位于用户进程栈下面,主要用于mmap系统调用,如映射一个文件的内容到进程地址空间等
堆映射区域:malloc()函数分配的进程虚拟地址
mm_struct数据结构
Linux内核要管理每个进程的内存区域以及它们对应的页表映射,所以需要抽象一个数据结构进行管理,该数据结构叫做mm_struct。在进程控制块PCB的数据结构task_struct中,有一个指针成员mm,指向该进程的mm_struct数据结构。
mm_struct源码分析
struct mm_struct {
struct {
struct vm_area_struct *mmap; //进程里所有的vma形成的单链表,此处为链表头
struct rb_root mm_rb; //vma红黑树的根节点mm_rb
u64 vmacache_seqnum; /* per-thread vmacache */
#ifdef CONFIG_MMU
unsigned long (*get_unmapped_area) (struct file *filp,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags);
#endif
unsigned long mmap_base; //mmap_base指向mmap空间的起始地址
unsigned long mmap_legacy_base; /* base of mmap area in bottom-up allocations */
#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
/* Base adresses for compatible mmap() */
unsigned long mmap_compat_base;
unsigned long mmap_compat_legacy_base;
#endif
unsigned long task_size; /* size of task vm space */
unsigned long highest_vm_end; /* highest vma end address */
pgd_t * pgd; //pgd,指向进程的PGD一级页表
#ifdef CONFIG_MEMBARRIER
/**
* @membarrier_state: Flags controlling membarrier behavior.
*
* This field is close to @pgd to hopefully fit in the same
* cache-line, which needs to be touched by switch_mm().
*/
atomic_t membarrier_state;
#endif
/**
* @mm_users: The number of users including userspace.
*
* Use mmget()/mmget_not_zero()/mmput() to modify. When this
* drops to 0 (i.e. when the task exits and there are no other
* temporary reference holders), we also release a reference on
* @mm_count (which may then free the &struct mm_struct if
* @mm_count also drops to 0).
*/
atomic_t mm_users;//记录正在使用的该进程地址空间的进程数目,比如两个线程共享该空间地址,则mm_users为2
/**
* @mm_count: The number of references to &struct mm_struct
* (@mm_users count as 1).
*
* Use mmgrab()/mmdrop() to modify. When this drops to 0, the
* &struct mm_struct is freed.
*/
atomic_t mm_count; //mm_struct结构体的主引用计数
#ifdef CONFIG_MMU
atomic_long_t pgtables_bytes; /* PTE page table pages */
#endif
int map_count; /* number of VMAs */
spinlock_t page_table_lock; /* Protects page tables and some
struct rw_semaphore mmap_sem; //mmap_sem为保护VMA的一个读写信号量
//mmlist, 所有的mm_struct数据结构都连接到一个双向立案表中,该链表的头是init_mm内存描述符
//他是init进程的地址空间
struct list_head mmlist; /* List of maybe swapped mm's. These
* are globally strung together off
* init_mm.mmlist, and are protected
* by mmlist_lock
*/
unsigned long hiwater_rss; /* High-watermark of RSS usage */
unsigned long hiwater_vm; /* High-water virtual memory usage */
//使用的进程地址空间综合
unsigned long total_vm; /* Total pages mapped */
unsigned long locked_vm; /* Pages that have PG_mlocked set */
atomic64_t pinned_vm; /* Refcount permanently increased */
unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */
unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */
unsigned long stack_vm; /* VM_STACK */
unsigned long def_flags;
spinlock_t arg_lock; /* protect the below fields */
//代码段的起始和结束地址空间
unsigned long start_code, end_code, start_data, end_data;
//堆空间的起始和结束地址空间
unsigned long start_brk, brk, start_stack;
unsigned long arg_start, arg_end, env_start, env_end;
unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
/*
* Special counters, in some configurations protected by the
* page_table_lock, in other configurations by being atomic.
*/
struct mm_rss_stat rss_stat;
struct linux_binfmt *binfmt;
/* Architecture-specific MM context */
mm_context_t context;
unsigned long flags; /* Must use atomic bitops to access */
struct core_state *core_state; /* coredumping support */
#ifdef CONFIG_AIO
spinlock_t ioctx_lock;
struct kioctx_table __rcu *ioctx_table;
#endif
#ifdef CONFIG_MEMCG
/*
* "owner" points to a task that is regarded as the canonical
* user/owner of this mm. All of the following must be true in
* order for it to be changed:
*
* current == mm->owner
* current->mm != mm
* new_owner->mm == mm
* new_owner->alloc_lock is held
*/
struct task_struct __rcu *owner;
#endif
struct user_namespace *user_ns;
/* store ref to file /proc/<pid>/exe symlink points to */
struct file __rcu *exe_file;
#ifdef CONFIG_MMU_NOTIFIER
struct mmu_notifier_subscriptions *notifier_subscriptions;
#endif
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
pgtable_t pmd_huge_pte; /* protected by page_table_lock */
#endif
#ifdef CONFIG_NUMA_BALANCING
/*
* numa_next_scan is the next time that the PTEs will be marked
* pte_numa. NUMA hinting faults will gather statistics and
* migrate pages to new nodes if necessary.
*/
unsigned long numa_next_scan;
/* Restart point for scanning and setting pte_numa */
unsigned long numa_scan_offset;
/* numa_scan_seq prevents two threads setting pte_numa */
int numa_scan_seq;
#endif
/*
* An operation with batched TLB flushing is going on. Anything
* that can move process memory needs to flush the TLB when
* moving a PROT_NONE or PROT_NUMA mapped page.
*/
atomic_t tlb_flush_pending;
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
/* See flush_tlb_batched_pending() */
bool tlb_flush_batched;
#endif
struct uprobes_state uprobes_state;
#ifdef CONFIG_HUGETLB_PAGE
atomic_long_t hugetlb_usage;
#endif
struct work_struct async_put_work;
} __randomize_layout;
/*
* The mm_cpumask needs to be at the end of mm_struct, because it
* is dynamically sized based on nr_cpu_ids.
*/
unsigned long cpu_bitmap[];
};
进程角度观测mm_struct示意图
VMA数据结构
vma数据结构示意图
VMA属性
作为一个进程地址空间的区间,VMA是有自己的属性的,如可读/可写、共享等属性。
vm_flags成员描述这些属性,即该VMA的全部页面信息,包括如何映射页面、访问每个页面的权限信息等。
VMA属性的标志位可以任意组合,最终会落到硬件机制上,即页表项属性。
vm_area_struct数据结构中有两个成员和属性的关系,一个是vm_flags成员,用于描述VMA的属性;另一个是vm_page_prot成员,用于将VMA属性标志位转成与处理器相关的页表项的属性。
vm_get_page_prot()函数
该函数主要用于将vm_flags标志位转化成具体的页表项的硬件标志位
/*
 * vm_get_page_prot - translate VMA attribute bits (vm_flags) into the
 * processor-specific page-table protection bits (pgprot_t).
 * Only the R/W/X/SHARED bits index protection_map[]; the architecture
 * may OR in extra bits and filter the final result.
 */
pgprot_t vm_get_page_prot(unsigned long vm_flags)
{
	unsigned long idx = vm_flags & (VM_READ | VM_WRITE | VM_EXEC | VM_SHARED);
	unsigned long base = pgprot_val(protection_map[idx]);
	unsigned long arch = pgprot_val(arch_vm_get_page_prot(vm_flags));
	pgprot_t prot = __pgprot(base | arch);

	return arch_filter_pgprot(prot);
}
EXPORT_SYMBOL(vm_get_page_prot);
protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)])
此处的转换主要通过内存属性数组protection_map[ ]
数组定义如下:
/* Forward declaration quoted from mm/mmap.c (precedes the array there;
 * not otherwise related to protection_map). */
static void unmap_region(struct mm_struct *mm,
		struct vm_area_struct *vma, struct vm_area_struct *prev,
		unsigned long start, unsigned long end);

/*
 * protection_map - maps the low 4 bits of vm_flags
 * (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED) to an arch-specific page
 * protection value. Indices 0-7 are private (__Pxxx) mappings,
 * indices 8-15 are shared (__Sxxx) mappings.
 */
pgprot_t protection_map[16] __ro_after_init = {
	__P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
	__S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
};
数组内对应的映射关系
<arch/arc/include/asm/pgtable.h>
/*
 * ARC architecture encodings for protection_map[] (arch/arc/.../pgtable.h).
 * Naming: __P### = private (copy-on-write) mapping, __S### = shared
 * mapping; the three digits are the read/write/execute bits.
 * Per the original comments: private writable mappings are downgraded to
 * read-only (Pvt-W => !W) and execute implies read (X => R) on this arch.
 */
#define __P000 PAGE_U_NONE
#define __P001 PAGE_U_R
#define __P010 PAGE_U_R /* Pvt-W => !W */
#define __P011 PAGE_U_R /* Pvt-W => !W */
#define __P100 PAGE_U_X_R /* X => R */
#define __P101 PAGE_U_X_R
#define __P110 PAGE_U_X_R /* Pvt-W => !W and X => R */
#define __P111 PAGE_U_X_R /* Pvt-W => !W */
#define __S000 PAGE_U_NONE
#define __S001 PAGE_U_R
#define __S010 PAGE_U_W_R /* W => R */
#define __S011 PAGE_U_W_R
#define __S100 PAGE_U_X_R /* X => R */
#define __S101 PAGE_U_X_R
#define __S110 PAGE_U_X_W_R /* X => R */
#define __S111 PAGE_U_X_W_R
查找VMA
find_vma()函数
/*
 * find_vma - return the first VMA whose vm_end is greater than @addr,
 * i.e. the VMA containing @addr, or the nearest VMA above it.
 * Returns NULL if no VMA ends above @addr.
 */
struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
	struct rb_node *rb_node;
	struct vm_area_struct *vma;

	/* Check the cache first: the per-thread vmacache holds the 4 most
	 * recently used VMAs. */
	vma = vmacache_find(mm, addr);
	if (likely(vma))
		return vma;

	/* Cache miss: walk the mm_rb red-black tree, which holds all of
	 * this process's VMAs ordered by address. */
	rb_node = mm->mm_rb.rb_node;
	while (rb_node) {
		struct vm_area_struct *tmp;

		tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
		if (tmp->vm_end > addr) {
			/* Candidate: ends above addr. It is the answer if it
			 * also starts at or below addr; otherwise a lower VMA
			 * may still qualify, so keep searching the left
			 * subtree. */
			vma = tmp;
			if (tmp->vm_start <= addr)
				break;
			rb_node = rb_node->rb_left;
		} else
			rb_node = rb_node->rb_right;
	}

	/* Record the result in the vmacache for the next lookup. */
	if (vma)
		vmacache_update(addr, vma);
	return vma;
}
EXPORT_SYMBOL(find_vma);
find_vma_intersection()函数
接口函数,基于find_vma实现,用于查找与区间[start_addr, end_addr)有重叠的第一个VMA。
/* Look up the first VMA which intersects the interval start_addr..end_addr-1,
NULL if none. Assume start_addr < end_addr. */
/* Look up the first VMA which intersects the interval start_addr..end_addr-1,
 * NULL if none. Assume start_addr < end_addr. */
static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr)
{
	struct vm_area_struct *candidate = find_vma(mm, start_addr);

	/* find_vma found nothing at or above start_addr. */
	if (!candidate)
		return NULL;
	/* The nearest VMA starts at or beyond end_addr: no overlap. */
	if (end_addr <= candidate->vm_start)
		return NULL;
	return candidate;
}
find_vma_prev()函数
逻辑与find_vma一致:通过调用find_vma查找到指定VMA,并通过pprev参数返回当前VMA的前一个VMA(vma->vm_prev)。
/*
* Same as find_vma, but also return a pointer to the previous VMA in *pprev.
*/
/*
 * Same as find_vma, but also return a pointer to the previous VMA in *pprev.
 */
struct vm_area_struct *
find_vma_prev(struct mm_struct *mm, unsigned long addr,
			struct vm_area_struct **pprev)
{
	struct vm_area_struct *vma = find_vma(mm, addr);

	if (vma) {
		/* The predecessor comes straight off the VMA list. */
		*pprev = vma->vm_prev;
		return vma;
	}

	/*
	 * addr lies beyond every VMA: the "previous" VMA is then the
	 * last node of the rb-tree, or NULL if the tree is empty.
	 */
	{
		struct rb_node *last = rb_last(&mm->mm_rb);

		*pprev = last ? rb_entry(last, struct vm_area_struct, vm_rb) : NULL;
	}
	return NULL;
}
插入VMA
insert_vm_struct()函数
insert_vm_struct是内核提供的插入VMA的核心接口函数,它向VMA所在的链表和红黑树中插入一个新的VMA
/*
 * insert_vm_struct - core kernel interface for inserting a new VMA:
 * links @vma into both the mm's VMA linked list and its red-black tree.
 * Returns 0 on success, -ENOMEM if the range overlaps an existing VMA
 * or the memory accounting check fails.
 */
int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
{
	struct vm_area_struct *prev;
	struct rb_node **rb_link, *rb_parent;

	/* find_vma_links locates the insertion point for the new VMA
	 * (and fails if the range overlaps an existing one). */
	if (find_vma_links(mm, vma->vm_start, vma->vm_end,
			   &prev, &rb_link, &rb_parent))
		return -ENOMEM;
	if ((vma->vm_flags & VM_ACCOUNT) &&
	    security_vm_enough_memory_mm(mm, vma_pages(vma)))
		return -ENOMEM;

	/* For an anonymous VMA (per vma_is_anonymous), derive vm_pgoff
	 * from the start address. */
	if (vma_is_anonymous(vma)) {
		BUG_ON(vma->anon_vma);
		vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
	}

	/* vma_link inserts the VMA into the list and the rb-tree. */
	vma_link(mm, vma, prev, rb_link, rb_parent);
	return 0;
}
find_vma_links()函数
find_vma_links用于查找VMA需要插入的位置
/*
 * find_vma_links - find where a new VMA [addr, end) should be inserted.
 * On success returns 0 and sets *pprev to the predecessor VMA (or NULL),
 * *rb_link to the rb-tree link slot for the new node, and *rb_parent to
 * its future parent node. Returns -ENOMEM if the range overlaps an
 * existing VMA.
 */
static int find_vma_links(struct mm_struct *mm, unsigned long addr,
			  unsigned long end, struct vm_area_struct **pprev,
			  struct rb_node ***rb_link, struct rb_node **rb_parent)
{
	struct rb_node **__rb_link, *__rb_parent, *rb_prev;

	/* Start from the root of the mm's VMA red-black tree. */
	__rb_link = &mm->mm_rb.rb_node;
	rb_prev = __rb_parent = NULL;

	/* Walk the tree looking for a suitable insertion slot. */
	while (*__rb_link) {
		struct vm_area_struct *vma_tmp;

		__rb_parent = *__rb_link;
		vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);

		/* If addr lies below this node's end, descend left... */
		if (vma_tmp->vm_end > addr) {
			/* ...but fail with -ENOMEM if the new range would
			 * overlap this existing VMA. */
			if (vma_tmp->vm_start < end)
				return -ENOMEM;
			__rb_link = &__rb_parent->rb_left;
		} else {
			/* addr is at or above this node's end: descend into
			 * the right subtree. */
			rb_prev = __rb_parent;
			__rb_link = &__rb_parent->rb_right;
		}
	}

	*pprev = NULL;
	/* rb_prev is the in-order predecessor of the insertion point. */
	if (rb_prev)
		*pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
	*rb_link = __rb_link;
	/* Parent node the new rb-tree node will hang off. */
	*rb_parent = __rb_parent;
	return 0;
}
vma_link()函数
找到要插入的地址后,通过vma_link函数将其添加到红黑树和vma链表
/*
 * vma_link - once the insertion point is known, add @vma to the mm's
 * rb-tree and VMA list (__vma_link), and to its file's mapping tree
 * (__vma_link_file), holding the mapping lock for file-backed VMAs.
 */
static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
		     struct vm_area_struct *prev, struct rb_node **rb_link,
		     struct rb_node *rb_parent)
{
	struct address_space *mapping = NULL;

	if (vma->vm_file) {
		mapping = vma->vm_file->f_mapping;
		i_mmap_lock_write(mapping);
	}

	/* __vma_link adds the node to the rb-tree and the linked list. */
	__vma_link(mm, vma, prev, rb_link, rb_parent);
	/* __vma_link_file adds the VMA to the file's mapping tree
	 * (described in the article as a radix tree). */
	__vma_link_file(vma);

	if (mapping)
		i_mmap_unlock_write(mapping);

	mm->map_count++;
	validate_mm(mm);
}
vma_link通过__vma_link函数将节点添加到红黑树和链表中, __vma_link_file()把VMA添加到文件的基数树radix tree上
合并VMA
vma_merge()函数
//vma_merge有10个参数
truct vm_area_struct *vma_merge(struct mm_struct *mm,
struct vm_area_struct *prev, unsigned long addr,
unsigned long end, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file,
pgoff_t pgoff, struct mempolicy *policy,
struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
{
pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
struct vm_area_struct *area, *next;
int err;
//VM_SPECIAL指的是不可合并和不可锁定的多个VMA
if (vm_flags & VM_SPECIAL)
return NULL;
if (prev)
next = prev->vm_next;
else
next = mm->mmap;
area = next;
if (area && area->vm_end == end) /* cases 6, 7, 8 */
next = next->vm_next;
VM_WARN_ON(prev && addr <= prev->vm_start);
VM_WARN_ON(area && end > area->vm_end);
VM_WARN_ON(addr >= end);
if (prev && prev->vm_end == addr &&
mpol_equal(vma_policy(prev), policy) &&
can_vma_merge_after(prev, vm_flags,
anon_vma, file, pgoff,
vm_userfaultfd_ctx)) {
if (next && end == next->vm_start &&
mpol_equal(policy, vma_policy(next)) &&
can_vma_merge_before(next, vm_flags,
anon_vma, file,
pgoff+pglen,
vm_userfaultfd_ctx) &&
is_mergeable_anon_vma(prev->anon_vma,
next->anon_vma, NULL)) {
/* cases 1, 6 */
err = __vma_adjust(prev, prev->vm_start,
next->vm_end, prev->vm_pgoff, NULL,
prev);
} else /* cases 2, 5, 7 */
err = __vma_adjust(prev, prev->vm_start,
end, prev->vm_pgoff, NULL, prev);
if (err)
return NULL;
khugepaged_enter_vma_merge(prev, vm_flags);
return prev;
}
if (next && end == next->vm_start &&
mpol_equal(policy, vma_policy(next)) &&
can_vma_merge_before(next, vm_flags,
anon_vma, file, pgoff+pglen,
vm_userfaultfd_ctx)) {
if (prev && addr < prev->vm_end) /* case 4 */
err = __vma_adjust(prev, prev->vm_start,
addr, prev->vm_pgoff, NULL, next);
else { /* cases 3, 8 */
err = __vma_adjust(area, addr, next->vm_end,
next->vm_pgoff - pglen, NULL, next);
area = next;
}
if (err)
return NULL;
khugepaged_enter_vma_merge(area, vm_flags);
return area;
}
return NULL;
}
vma_merge在合并中常见的三种情况
-
新VMA的起始地址和prev节点结束地址重叠
-
新VMA的结束地址和next节点的起始地址重叠
-
新VMA和prev和next节点正好衔接上
【欢迎关注微信公众号:qiubinwei-1986】