前面已经讲了linux物理内存相关的结构体了,就是node、zone、page。这些结构体的初始化为我们linux操作系统使用内存奠定了基础。我们也知道,linux操作系统使用的是虚拟内存,每个进程操作的也都是虚拟内存,所以现在讲的是虚拟内存相关的结构体。
虚拟内存数据结构
虚拟内存区域是分配给进程的一个虚拟地址范围,内核使用结构体 vm_area_struct 描述虚拟内存区域。vm_area_struct 位于 mm_struct 结构体中,mm_struct 又位于 task_struct 结构体中。也就是说,task_struct 里面有一个 mm_struct,它就是进程的内存描述符;mm_struct 里面有 vm_area_struct 结构体,它记录进程使用的各个虚拟内存区域。
本人在github源代码注释项目地址是:
https://github.com/RichkingLi/linux-4.19.40-note.git
进程的虚拟地址空间,linux源代码里面用一个 struct mm_struct 结构来管理。我们看下这个结构体,它定义在include/linux/mm_types.h文件中,下面会配上个人的一些注释:
struct mm_struct {
struct {
struct vm_area_struct *mmap;//head of the sorted linked list of VMAs
struct rb_root mm_rb;//red-black tree of VMAs for fast address lookup
u64 vmacache_seqnum; /* per-thread vmacache */
#ifdef CONFIG_MMU
unsigned long (*get_unmapped_area) (struct file *filp,//find a free (unmapped) hole in the mmap region
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags);
#endif
unsigned long mmap_base;//start address of the memory-mapping region
unsigned long mmap_legacy_base; /* base of mmap area in bottom-up allocations */
#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
/* Base addresses for compatible mmap() */
unsigned long mmap_compat_base;
unsigned long mmap_compat_legacy_base;
#endif
unsigned long task_size;//length of the user virtual address space
unsigned long highest_vm_end; /* highest vma end address */
pgd_t * pgd;//pointer to the page global directory (the top-level page table)
/**
 * @mm_users: The number of users including userspace.
 *
 * Use mmget()/mmget_not_zero()/mmput() to modify. When this
 * drops to 0 (i.e. when the task exits and there are no other
 * temporary reference holders), we also release a reference on
 * @mm_count (which may then free the &struct mm_struct if
 * @mm_count also drops to 0).
 */
atomic_t mm_users;//number of tasks sharing this user address space, i.e. the process's thread count
/**
 * @mm_count: The number of references to &struct mm_struct
 * (@mm_users count as 1).
 *
 * Use mmgrab()/mmdrop() to modify. When this drops to 0, the
 * &struct mm_struct is freed.
 */
atomic_t mm_count;//reference count on the memory descriptor itself
#ifdef CONFIG_MMU
atomic_long_t pgtables_bytes;//bytes of memory consumed by page-table pages
#endif
int map_count;//number of virtual memory areas (VMAs)
spinlock_t page_table_lock;//lock protecting the page tables
/* Protects page tables and some
 * counters
 */
struct rw_semaphore mmap_sem;//read-write semaphore serializing access to the VMA structures
struct list_head mmlist; //links mm's whose pages may have been swapped out
/* List of maybe swapped mm's. These
 * are globally strung together off
 * init_mm.mmlist, and are protected
 * by mmlist_lock
 */
unsigned long hiwater_rss;//high-watermark of RSS usage
unsigned long hiwater_vm;//high-watermark of virtual memory usage
unsigned long total_vm;//total number of pages mapped in the address space
unsigned long locked_vm;//pages locked in memory (cannot be swapped out)
unsigned long pinned_vm;//pages that can neither be swapped out nor migrated
unsigned long data_vm;//pages used for data
unsigned long exec_vm;//pages used for executable mappings
unsigned long stack_vm;//pages used for the stack
unsigned long def_flags;//default VM_* flags for new mappings
spinlock_t arg_lock; /* protect the below fields */
//start and end addresses of the code (text) segment, and of the data segment
unsigned long start_code, end_code, start_data, end_data;
//start of the heap, current heap end (brk), and start of the stack (the stack's end lives in the stack-pointer register)
unsigned long start_brk, brk, start_stack;
//start and end addresses of the argument strings, and of the environment variables
unsigned long arg_start, arg_end, env_start, env_end;
unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
/*
 * Special counters, in some configurations protected by the
 * page_table_lock, in other configurations by being atomic.
 */
struct mm_rss_stat rss_stat;
struct linux_binfmt *binfmt;//handler for the executable's binary format
/* Architecture-specific MM context */
mm_context_t context;//processor-architecture-specific memory-management context
unsigned long flags; /* Must use atomic bitops to access */
struct core_state *core_state; /* coredumping support */
#ifdef CONFIG_MEMBARRIER
atomic_t membarrier_state;
#endif
#ifdef CONFIG_AIO
spinlock_t ioctx_lock;
struct kioctx_table __rcu *ioctx_table;
#endif
#ifdef CONFIG_MEMCG
/*
 * "owner" points to a task that is regarded as the canonical
 * user/owner of this mm. All of the following must be true in
 * order for it to be changed:
 *
 * current == mm->owner
 * current->mm != mm
 * new_owner->mm == mm
 * new_owner->alloc_lock is held
 */
struct task_struct __rcu *owner;
#endif
struct user_namespace *user_ns;
/* store ref to file /proc/<pid>/exe symlink points to */
struct file __rcu *exe_file;
#ifdef CONFIG_MMU_NOTIFIER
struct mmu_notifier_mm *mmu_notifier_mm;
#endif
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
pgtable_t pmd_huge_pte; /* protected by page_table_lock */
#endif
#ifdef CONFIG_NUMA_BALANCING
/*
 * numa_next_scan is the next time that the PTEs will be marked
 * pte_numa. NUMA hinting faults will gather statistics and
 * migrate pages to new nodes if necessary.
 */
unsigned long numa_next_scan;
/* Restart point for scanning and setting pte_numa */
unsigned long numa_scan_offset;
/* numa_scan_seq prevents two threads setting pte_numa */
int numa_scan_seq;
#endif
/*
 * An operation with batched TLB flushing is going on. Anything
 * that can move process memory needs to flush the TLB when
 * moving a PROT_NONE or PROT_NUMA mapped page.
 */
atomic_t tlb_flush_pending;
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
/* See flush_tlb_batched_pending() */
bool tlb_flush_batched;
#endif
struct uprobes_state uprobes_state;
#ifdef CONFIG_HUGETLB_PAGE
atomic_long_t hugetlb_usage;
#endif
struct work_struct async_put_work;
#if IS_ENABLED(CONFIG_HMM)
/* HMM needs to track a few things per mm */
struct hmm *hmm;
#endif
} __randomize_layout;
/*
 * The mm_cpumask needs to be at the end of mm_struct, because it
 * is dynamically sized based on nr_cpu_ids.
 */
unsigned long cpu_bitmap[];
};
在mm_struct里面第一个成员mmap,是一个vm_area_struct结构体指针,它记录进程使用的虚拟内存区域,我们也来看看它:
/*
* This struct defines a memory VMM memory area. There is one of these
* per VM-area/task. A VM area is any part of the process virtual memory
* space that has a special rule for the page-fault handlers (ie a shared
* library, the executable area etc).
*/
struct vm_area_struct {
/* The first cache line has the info for VMA tree walking. */
//these two members hold the region's start address and the address of the
//first byte past its end, respectively
unsigned long vm_start; /* Our start address within vm_mm. */
unsigned long vm_end; /* The first byte after our end address
within vm_mm. */
/* linked list of VM areas per task, sorted by address */
struct vm_area_struct *vm_next, *vm_prev;//next/previous VMA in the per-task sorted list
struct rb_node vm_rb;//node in the per-mm red-black tree; searching the plain linked
//list would be slow, so every mm_struct also keeps its VMAs in an rbtree,
//with each VMA as one node, to speed up lookups
/*
 * Largest free memory gap in bytes to the left of this VMA.
 * Either between this VMA and vma->vm_prev, or between one of the
 * VMAs below us in the VMA rbtree and its ->vm_prev. This helps
 * get_unmapped_area find a free area of the right size.
 */
unsigned long rb_subtree_gap;
/* Second cache line starts here. */
struct mm_struct *vm_mm; //memory descriptor, i.e. the user address space this VMA belongs to
pgprot_t vm_page_prot; //protection bits, i.e. the access permissions
/* Flags, see mm.h
#define VM_NONE 0x00000000
#define VM_READ 0x00000001
#define VM_WRITE 0x00000002
#define VM_EXEC 0x00000004
#define VM_SHARED 0x00000008 */
unsigned long vm_flags; //VMA flags, see the VM_* values above
//to support finding which VMAs a given file range is mapped into, every VMA
//mapping a file is linked into the interval tree rooted at that file's
//address_space->i_mmap.
struct {
struct rb_node rb;
unsigned long rb_subtree_last;
} shared;
/*
 * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
 * list, after a COW of one of the file pages. A MAP_SHARED vma
 * can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack
 * or brk vma (with NULL file) can only be in an anon_vma list.
 */
//chains together all anon_vma instances associated with this VMA;
//a VMA may be associated with its parent process's anon_vma as well as its own
struct list_head anon_vma_chain; /* Serialized by mmap_sem &
 * page_table_lock */
//points to one anon_vma instance; struct anon_vma organizes all the virtual
//address spaces an anonymous page is mapped into
struct anon_vma *anon_vma; /* Serialized by page_table_lock */
/* Function pointers to deal with this struct. */
const struct vm_operations_struct *vm_ops;//set of VMA operations; the struct is defined in mm.h
/* Information about our backing store: */
unsigned long vm_pgoff;//offset of the mapping within the file, in PAGE_SIZE units
struct file * vm_file;//open file instance backing the mapping; NULL for private anonymous mappings
void * vm_private_data;//private data of the memory area
atomic_long_t swap_readahead_info;
#ifndef CONFIG_MMU
struct vm_region *vm_region; /* NOMMU mapping region */
#endif
#ifdef CONFIG_NUMA
struct mempolicy *vm_policy; /* NUMA policy for the VMA */
#endif
struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
} __randomize_layout;
在struct vm_area_struct里面的虚拟内存操作函数集合const struct vm_operations_struct *vm_ops也比较重要,下面也注释一下,在include/linux/mm.h中:
/*
* These are the virtual MM functions - opening of an area, closing and
* unmapping it (needed to keep files on disk up-to-date etc), pointer
* to the functions called when a no-page or a wp-page exception occurs.
*/
struct vm_operations_struct {
void (*open)(struct vm_area_struct * area);//called when the VMA is created
void (*close)(struct vm_area_struct * area);//called when the VMA is deleted
int (*split)(struct vm_area_struct * area, unsigned long addr);
int (*mremap)(struct vm_area_struct * area);//called when the VMA is moved via the mremap system call
vm_fault_t (*fault)(struct vm_fault *vmf); //on access to a file-backed virtual page with no physical
//page mapped, a page fault is raised and the handler calls fault to read the
//file's data into the page cache
vm_fault_t (*huge_fault)(struct vm_fault *vmf,//like fault, but for huge-page file mappings
enum page_entry_size pe_size);
//on a read fault for a file-backed virtual page with no physical page mapped,
//the fault handler not only reads in the page being accessed but also reads
//ahead subsequent file pages; map_pages maps physical pages already present
//in the file's page cache
void (*map_pages)(struct vm_fault *vmf,
pgoff_t start_pgoff, pgoff_t end_pgoff);
unsigned long (*pagesize)(struct vm_area_struct * area);
/* notification that a previously read-only page is about to become
 * writable, if an error is returned it will cause a SIGBUS */
//on the first write to a private file mapping, a page fault triggers
//copy-on-write; page_mkwrite notifies the filesystem that the page is about
//to become writable, so it can check whether the write is allowed or wait
//for the page to reach a suitable state
vm_fault_t (*page_mkwrite)(struct vm_fault *vmf);
/* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */
vm_fault_t (*pfn_mkwrite)(struct vm_fault *vmf);
/* called by access_process_vm when get_user_pages() fails, typically
 * for use by special VMAs that can switch between memory and hardware
 */
int (*access)(struct vm_area_struct *vma, unsigned long addr,
void *buf, int len, int write);
/* Called by the /proc/PID/maps code to ask the vma whether it
 * has a special name. Returning non-NULL will also cause this
 * vma to be dumped unconditionally. */
const char *(*name)(struct vm_area_struct *vma);
#ifdef CONFIG_NUMA
/*
 * set_policy() op must add a reference to any non-NULL @new mempolicy
 * to hold the policy upon return. Caller should pass NULL @new to
 * remove a policy and fall back to surrounding context--i.e. do not
 * install a MPOL_DEFAULT policy, nor the task or system default
 * mempolicy.
 */
int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new);
/*
 * get_policy() op must add reference [mpol_get()] to any policy at
 * (vma,addr) marked as MPOL_SHARED. The shared policy infrastructure
 * in mm/mempolicy.c will do this automatically.
 * get_policy() must NOT add a ref if the policy at (vma,addr) is not
 * marked as MPOL_SHARED. vma policies are protected by the mmap_sem.
 * If no [shared/vma] mempolicy exists at the addr, get_policy() op
 * must return NULL--i.e., do not "fallback" to task or system default
 * policy.
 */
struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
unsigned long addr);
#endif
/*
 * Called by vm_normal_page() for special PTEs to find the
 * page for @addr. This is useful if the default behavior
 * (using pte_page()) would not find the correct page.
 */
struct page *(*find_special_page)(struct vm_area_struct *vma,
unsigned long addr);
};