Linux中与“系统内存”相关的内核数据结构

【摘要】本文讲述了在Linux中与系统内存相关的内核数据结构,通过逐行分析源代码讲述了内存数据结构之间的关联。

七、系统内存相关的数据结构

7.1 mm_struct

  • 指向进程所拥有的内存描述符,保存了进程的内存管理信息
/*
 * Memory descriptor: holds all memory-management state for one address
 * space — the VMA list and red-black tree, page-table root, usage
 * counters and segment boundaries. Referenced from the owning task.
 */
struct mm_struct 
{
    struct vm_area_struct * mmap;        /* list of VMAs */
    struct rb_root mm_rb;                /* red-black tree of VMAs for fast address lookup */
    struct vm_area_struct * mmap_cache;    /* last find_vma result */
    unsigned long (*get_unmapped_area) (struct file *filp, \
                                        unsigned long addr, \
                                        unsigned long len, \
                                        unsigned long pgoff, \
                                        unsigned long flags);
    
    void (*unmap_area) (struct mm_struct *mm, unsigned long addr);
    unsigned long mmap_base;        /* base of mmap area */
    unsigned long task_size;        /* size of task vm space */
    unsigned long cached_hole_size;     /* if non-zero, the largest hole below free_area_cache */
    unsigned long free_area_cache;        /* first hole of size cached_hole_size or larger */
    pgd_t * pgd;                /* page global directory (page-table root) */
    atomic_t mm_users;        /* How many users with user space? */
    atomic_t mm_count;        /* How many references to "mm_struct" (users count as 1) */
    int map_count;            /* number of VMAs */
    struct rw_semaphore mmap_sem;
    spinlock_t page_table_lock;        /* Protects page tables and some counters */

    /* List of maybe swapped mm's. These are globally strung 
     * together off init_mm.mmlist, and are protected by mmlist_lock
     */
    struct list_head mmlist;
    
    /* Special counters, in some configurations protected by the
     * page_table_lock, in other configurations by being atomic.
     */
    mm_counter_t _file_rss;
    mm_counter_t _anon_rss;

    unsigned long hiwater_rss;    /* High-watermark of RSS usage */
    unsigned long hiwater_vm;    /* High-water virtual memory usage */

    unsigned long total_vm, locked_vm, shared_vm, exec_vm;
    unsigned long stack_vm, reserved_vm, def_flags, nr_ptes;
    unsigned long start_code, end_code, start_data, end_data;    /* code/data segment bounds */
    unsigned long start_brk, brk, start_stack;                   /* heap bounds and stack start */
    unsigned long arg_start, arg_end, env_start, env_end;        /* argv/envp regions */

    unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */

    struct linux_binfmt *binfmt;

    cpumask_t cpu_vm_mask;

    /* Architecture-specific MM context */
    mm_context_t context;

    /* Swap token stuff */
    /*
     * Last value of global fault stamp as seen by this process.
     * In other words, this value gives an indication of how long
     * it has been since this task got the token.
     * Look at mm/thrash.c
     */
    unsigned int faultstamp;
    unsigned int token_priority;
    unsigned int last_interval;

    unsigned long flags; /* Must use atomic bitops to access the bits */

    struct core_state *core_state; /* coredumping support */
#ifdef CONFIG_AIO
    spinlock_t        ioctx_lock;
    struct hlist_head    ioctx_list;
#endif
#ifdef CONFIG_MM_OWNER
    /*
     * "owner" points to a task that is regarded as the canonical
     * user/owner of this mm. All of the following must be true in
     * order for it to be changed:
     *
     * current == mm->owner
     * current->mm != mm
     * new_owner->mm == mm
     * new_owner->alloc_lock is held
     */
    struct task_struct *owner;
#endif

#ifdef CONFIG_PROC_FS
    /* store ref to file /proc/<pid>/exe symlink points to */
    struct file *exe_file;
    unsigned long num_exe_file_vmas;
#endif
#ifdef CONFIG_MMU_NOTIFIER
    struct mmu_notifier_mm *mmu_notifier_mm;
#endif
};

7.2 vm_area_struct

  • 进程虚拟内存的每个区域表示为struct vm_area_struct的一个实例。
/*
 * Describes one contiguous region of a process's virtual address space
 * (a VMA). All VMAs of a process are linked in an address-sorted list
 * (vm_next) and in the red-black tree rooted at mm_struct->mm_rb.
 */
struct vm_area_struct 
{
    struct mm_struct	*vm_mm; // back-pointer to the mm_struct instance this region belongs to
    unsigned long       vm_start;	// start address within vm_mm
    unsigned long       vm_end;	// address of the first byte after the region's end within vm_mm
    
    /* 
    The linked list of all of a process's vm_area_struct instances is
    built through vm_next.
    */
    struct vm_area_struct        *vm_next;	// per-process list of VMAs, sorted by address

    /* 
    Access permissions of this virtual memory region, e.g.
    1) _PAGE_READ
    2) _PAGE_WRITE
    3) _PAGE_EXECUTE
    */
    pgprot_t                     vm_page_prot; 
  
    unsigned long                vm_flags;      
    struct rb_node               vm_rb;         /* VMA's node in the tree */

    /*
    For regions with an address space and backing store:
    'shared' links the VMA into the address_space->i_mmap priority tree,
    or into the list of similar VMAs hanging off a priority-tree node,
    or into the address_space->i_mmap_nonlinear list.
    */
    union 
    {         
        struct 
        {
            struct list_head        list;
            void                    *parent;
            struct vm_area_struct   *head;
        } vm_set;
        struct prio_tree_node prio_tree_node;
    } shared;	/* links to address_space->i_mmap or i_mmap_nonlinear */

    /*
     * After a page of a file has been copied on write, the file's
     * MAP_PRIVATE VMA can be in both the i_mmap tree and the anon_vma
     * list; a MAP_SHARED VMA can only be in the i_mmap tree; anonymous
     * MAP_PRIVATE, stack or brk VMAs (vm_file is NULL) can only be in
     * the anon_vma list.
     */
    struct list_head	anon_vma_node;     /* anon_vma entry; access serialized by anon_vma->lock */
    struct anon_vma     *anon_vma;         /* anonymous VMA object; access serialized by page_table_lock */
    struct vm_operations_struct  *vm_ops;     /* associated ops: function pointers operating on this VMA */
    unsigned long	vm_pgoff;    /* offset within the mapped file (backing-store information) */
    struct file     *vm_file;    /* the mapped file (may be NULL for anonymous mappings) */
    void	*vm_private_data;  /* private data, vm_pte (i.e. shared memory) */
};
  • vm_flags是描述该区域的一组标志,用于定义区域性质,这些都是在<mm.h>中声明的预处理器常数。
  • \linux-2.6.32.63\include\linux\mm.h
#define VM_READ		0x00000001    /* currently active flags */
#define VM_WRITE    0x00000002
#define VM_EXEC     0x00000004
#define VM_SHARED   0x00000008

/* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */
#define VM_MAYREAD	0x00000010    /* limits for mprotect() etc */
#define VM_MAYWRITE 0x00000020
#define VM_MAYEXEC  0x00000040
#define VM_MAYSHARE 0x00000080

/*
VM_GROWSDOWN / VM_GROWSUP indicate whether a region may grow downward
or upward:
1. stacks grow top-down on most architectures, so stack VMAs set
   VM_GROWSDOWN;
2. on architectures whose stacks grow upward (CONFIG_STACK_GROWSUP,
   or the IA-64 register backing store) VM_GROWSUP is used instead —
   on all other architectures it is defined as 0 below.
*/
#define VM_GROWSDOWN    0x00000100    /* general info on the segment */
#if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64)
#define VM_GROWSUP    0x00000200
#else
#define VM_GROWSUP    0x00000000
#endif
#define VM_PFNMAP    0x00000400    /* Page-ranges managed without "struct page", just pure PFN */
#define VM_DENYWRITE    0x00000800    /* ETXTBSY on write attempts.. */

#define VM_EXECUTABLE    0x00001000
#define VM_LOCKED    0x00002000
#define VM_IO           0x00004000    /* Memory mapped I/O or similar */

/* 
Used by sys_madvise():
VM_SEQ_READ is set when the region is likely to be read sequentially
from start to end; VM_RAND_READ states that reads may be random.
Both flags are hints to the memory-management subsystem and the block
layer so they can optimize, e.g. enabling page read-ahead when access
is sequential.
*/            
#define VM_SEQ_READ    0x00008000    /* App will access data sequentially */
#define VM_RAND_READ    0x00010000    /* App will not benefit from clustered reads */

#define VM_DONTCOPY    0x00020000      /* the region is not copied on fork */
#define VM_DONTEXPAND    0x00040000    /* the region may not be expanded via mremap */
#define VM_RESERVED    0x00080000    /* Count as reserved_vm like IO */
#define VM_ACCOUNT    0x00100000    /* whether the region is included in overcommit accounting */
#define VM_NORESERVE    0x00200000    /* should the VM suppress accounting */
#define VM_HUGETLB    0x00400000    /* set if the region is backed by huge pages (where supported) */
#define VM_NONLINEAR    0x00800000    /* Is non-linear (remap_file_pages) */
#define VM_MAPPED_COPY    0x01000000    /* T if mapped copy of data (nommu mmap) */
#define VM_INSERTPAGE    0x02000000    /* The vma has had "vm_insert_page()" done on it */
#define VM_ALWAYSDUMP    0x04000000    /* Always include in core dumps */

#define VM_CAN_NONLINEAR 0x08000000    /* Has ->fault & does nonlinear pages */
#define VM_MIXEDMAP    0x10000000    /* Can contain "struct page" and pure PFN pages */
#define VM_SAO        0x20000000    /* Strong Access Ordering (powerpc) */
#define VM_PFN_AT_MMAP    0x40000000    /* PFNMAP vma that is fully mapped at mmap time */
#define VM_MERGEABLE    0x80000000    /* KSM may merge identical pages */
  • 这些标志以多种方式影响内核对相应虚拟内存区域的处理方式和内存分配行为。

7.3 pg_data_t

  • 在NUMA系统中,整个内存划分为"结点"(node),每个结点关联到系统中的一个处理器,在内核中表示为pg_data_t的实例;在UMA系统中则只有一个结点描述全部内存。各个内存结点保存在一个单链表中,供内核遍历。
  • \linux-2.6.32.63\include\linux\mmzone.h
/*
 * Per-node memory descriptor: one instance per NUMA node (a single
 * instance on UMA), holding the node's zones, fallback lists and the
 * per-node kswapd state.
 */
typedef struct pglist_data 
{
    //node_zones is an array containing the zones of this node
    struct zone node_zones[MAX_NR_ZONES];

    //node_zonelists specifies fallback lists of nodes and their zones:
    //the order of zones in a zonelist is the order in which allocation
    //is attempted; if allocation from an earlier zone fails, the next
    //zone in the list is tried. There is a separate entry for every
    //possible zone type, each holding a fallback list of type zonelist.
    struct zonelist node_zonelists[MAX_ZONELISTS];

    //nr_zones holds the number of different zones in this node
    int nr_zones;
#ifdef CONFIG_FLAT_NODE_MEM_MAP    /* means !SPARSEMEM */
    /*
    node_mem_map points to an array of struct page instances describing
    all physical pages of the node, covering every zone in it.
    Each node is further divided into zones; each zone has an array
    organizing the physical page frames belonging to it, and every page
    frame gets a struct page instance plus the required management data.
    */
    struct page *node_mem_map;
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
    struct page_cgroup *node_page_cgroup;
#endif
#endif
    //During boot, before the memory-management subsystem is initialized,
    //the kernel itself needs memory (part of memory must be reserved to
    //initialize memory management). To solve this the kernel uses the
    //"boot memory allocator"; bdata points to its data-structure instance.
    struct bootmem_data *bdata;
#ifdef CONFIG_MEMORY_HOTPLUG
    /*
     * Must be held any time you expect node_start_pfn, node_present_pages
     * or node_spanned_pages stay constant.  Holding this will also
     * guarantee that any pfn_valid() stays that way.
     *
     * Nests above zone->lock and zone->size_seqlock.
     */
    spinlock_t node_size_lock;
#endif
    /*
    node_start_pfn is the logical number of this NUMA node's first page
    frame. Page frames are numbered consecutively across all nodes, so
    every frame number is globally unique (not just unique per node).
    On UMA systems node_start_pfn is always 0, since there is only one
    node whose first page frame is therefore numbered 0.
    */
    unsigned long node_start_pfn;
    /* 
    total number of physical pages:
    node_present_pages is the total count of page frames in the node
    */
    unsigned long node_present_pages; 
    /* 
    total size of the physical page range, including holes:
    node_spanned_pages is the node's length measured in page frames.

    node_present_pages and node_spanned_pages need not be equal, because
    the node may contain holes that do not correspond to real page frames.
    */
    unsigned long node_spanned_pages;

    //node_id is the global node ID; NUMA nodes are numbered from 0
    int node_id;

    //kswapd_wait is the wait queue of the swap daemon, used when page
    //frames are being swapped out
    wait_queue_head_t kswapd_wait;

    //kswapd points to the task_struct of the swap daemon responsible
    //for this node
    struct task_struct *kswapd;

    //kswapd_max_order is used by the page-swap subsystem to define the
    //order (size) of the area that needs to be freed
    int kswapd_max_order;
} pg_data_t;

7.4 zone

  • 内存划分为"结点",每个结点关联到系统中的一个处理器,各个结点又划分为"内存域",是内存的进一步划分
  • \linux-2.6.32.63\include\linux\mmzone.h
/*
 * Zone descriptor: each node is divided into zones; this structure is
 * split by ZONE_PADDING so that the heavily contended locks (lock,
 * lru_lock) land in separate cache lines.
 */
struct zone 
{
    /* Fields commonly accessed by the page allocator */

    /* 
    zone watermarks, access with *_wmark_pages(zone) macros.
    These watermarks are used when paging out: if memory is low the
    kernel can write pages to disk, and the three values steer the
    behaviour of the swap daemon:
    1. free pages above the high watermark: the zone's state is ideal
    2. free pages below the low watermark:  the kernel starts swapping
       pages out to disk
    3. free pages below the min watermark:  page reclaim is under heavy
       pressure — the zone urgently needs free pages, and the kernel has
       emergency mechanisms for this situation
    */
    unsigned long watermark[NR_WMARK];

    /*
     * When free pages are below this point, additional steps are taken
     * when reading the number of free pages to avoid per-cpu counter
     * drift allowing watermarks to be breached
     */
    unsigned long percpu_drift_mark;

    /*
    We don't know if the memory that we're going to allocate will be freeable or/and 
    it will be released eventually, so to avoid totally wasting several GB of ram we
    must reserve some of the lower zone memory (otherwise we risk to run OOM on the 
    lower zones despite there's tons of freeable ram on the higher zones). This array 
    is recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl changes.
    lowmem_reserve sets aside a number of pages per zone for critical
    allocations that must not fail; each zone's share is weighted by its
    importance. It is computed by setup_per_zone_lowmem_reserve(): the
    kernel iterates over all nodes and, for each zone, derives the
    minimum reserve as (total page frames in the zone) /
    sysctl_lowmem_reserve_ratio[zone]; the default divisor is 256 for
    low-memory zones and 32 for highmem zones.
    */
    unsigned long        lowmem_reserve[MAX_NR_ZONES];

#ifdef CONFIG_NUMA
    int node;
    /*
     * zone reclaim becomes active if more unmapped pages exist.
     */
    unsigned long        min_unmapped_pages;
    unsigned long        min_slab_pages;
    struct per_cpu_pageset    *pageset[NR_CPUS];
#else
    /*
    pageset implements the per-CPU hot/cold page-frame lists; the kernel
    uses them to hold "fresh" pages ready to satisfy allocations. Hot and
    cold frames differ in their CPU-cache state:
    1. hot frame:  already loaded into the CPU cache, so — compared with
       a page only in memory — it can be accessed quickly
    2. cold frame: not (or no longer) present in the CPU cache
    */ 
    struct per_cpu_pageset    pageset[NR_CPUS];
#endif
    /*
     * free areas of different sizes
     */
    spinlock_t        lock;
#ifdef CONFIG_MEMORY_HOTPLUG
    /* see spanned/present_pages for more description */
    seqlock_t        span_seqlock;
#endif
    /*
    Free regions of different sizes:
    free_area is an array of structures of the same name implementing
    the buddy system; each array element covers contiguous regions of
    one fixed order. free_area is the starting point for managing the
    free pages contained in each region.
    */
    struct free_area    free_area[MAX_ORDER];

#ifndef CONFIG_SPARSEMEM
    /*
     * Flags for a pageblock_nr_pages block. See pageblock-flags.h.
     * In SPARSEMEM, this map is stored in struct mem_section
     */
    unsigned long        *pageblock_flags;
#endif /* CONFIG_SPARSEMEM */


    ZONE_PADDING(_pad1_)

    /* 
    Fields commonly accessed by the page reclaim scanner 
    */
    spinlock_t        lru_lock;    
    struct zone_lru 
    {
        struct list_head list;
    } lru[NR_LRU_LISTS];

    struct zone_reclaim_stat reclaim_stat;

    /* 
    pages scanned since the last reclaim
    */
    unsigned long        pages_scanned;     
    /* 
    zone flags
    */  
    unsigned long        flags;           

    /* 
    Zone statistics:
    vm_stat maintains extensive statistics about this zone; many places
    in the kernel update the counters held in it.
    */
    atomic_long_t        vm_stat[NR_VM_ZONE_STAT_ITEMS];

    /*
    prev_priority holds the scanning priority for this zone.  
    It is defined as the scanning priority at which we achieved our reclaim target 
    at the previous try_to_free_pages() or balance_pgdat() invocation.
    We use prev_priority as a measure of how much stress page reclaim is under - 
    it drives the swappiness decision: whether to unmap mapped pages.
    Access to this field is quite racy even on uniprocessor.  But it is expected 
    to average out OK.
    Scanning is performed by try_to_free_pages() until enough page frames
    have been freed; the stored priority is used to decide whether mapped
    pages should be swapped out.
    */
    int prev_priority;

    /*
     * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
     * this zone's LRU.  Maintained by the pageout code.
     */
    unsigned int inactive_ratio;


    ZONE_PADDING(_pad2_)
    /*
    Rarely used or read-mostly fields 
    */

    /*
    1. wait_table: the array holding the hash table
    2. wait_table_hash_nr_entries: the size of the hash table array
    3. wait_table_bits: wait_table_size == (1 << wait_table_bits)
 
    The purpose of all these is to keep track of the people waiting for a page to 
    become available and make them runnable again when possible. 
    The trouble is that this consumes a lot of space, especially when so few things 
    wait on pages at a given time. So instead of using per-page waitqueues, we use 
    a waitqueue hash table. The bucket discipline is to sleep on the same queue when
    colliding and wake all in that wait queue when removing. When something wakes, 
    it must check to be sure its page is truly available, a la thundering herd. 
    The cost of a collision is great, but given the expected load of the table, they 
    should be so rare as to be outweighed by the benefits from the saved space.
     __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the primary users 
     of these fields, and in mm/page_alloc.c free_area_init_core() performs the
     initialization of them.
    In short, these three members implement a wait-queue hash table that
    stores processes waiting for a page to become available; the waiting
    processes sleep until the condition becomes true, when the kernel
    notifies them to resume work.
    */
    wait_queue_head_t    * wait_table;
    unsigned long        wait_table_hash_nr_entries;
    unsigned long        wait_table_bits;

    /*
    Discontiguous-memory support fields:
    the association between a zone and its parent node is established by
    zone_pgdat, which points to the corresponding pglist_data instance
    (the memory node).
    */
    struct pglist_data    *zone_pgdat;
    /* 
    zone_start_pfn == zone_start_paddr >> PAGE_SHIFT 
    zone_start_pfn is the index of the zone's first page frame.
    */
    unsigned long        zone_start_pfn;

    /*
    zone_start_pfn, spanned_pages and present_pages are all protected by span_seqlock.  
    It is a seqlock because it has to be read outside of zone->lock, and it is done in
    the main allocator path.  But, it is written quite infrequently.
    
    The lock is declared along with zone->lock because it is frequently read in proximity
    to zone->lock.  It's good to give them a chance of being in the same cacheline.
    */
    unsigned long        spanned_pages;    /* total pages in the zone, including holes */
    unsigned long        present_pages;    /* pages actually present (holes excluded) */

    /*rarely used fields:*/
    /*
    name is a string holding the zone's conventional name; three values
    are used: "Normal", "DMA" and "HighMem".
    */
    const char        *name;
} ____cacheline_internodealigned_in_smp;
  • 该结构比较特殊的方面是它由ZONE_PADDING分隔为几个部分,这是因为对zone结构的访问非常频繁,在多处理器系统上,通常会有不同的CPU试图同时访问结构成员,因此使用了锁防止它们彼此干扰,避免错误和不一致。由于内核对该结构的访问非常频繁,因此会经常性地获取该结构的两个自旋锁zone->lock和zone->lru_lock。因此,如果数据保存在CPU高速缓存中,那么会处理得更快速。而高速缓存分为行,每一行负责不同的内存区,内核使用ZONE_PADDING宏生成"填充"字段添加到结构中,以确保每个自旋锁都处于自身的"缓存行"中,还使用了编译器关键字____cacheline_internodealigned_in_smp,用以实现最优的高速缓存对齐方式。
  • 这是内核在基于对CPU底层硬件的深刻理解后做出的优化,通过看似浪费空间的"冗余"操作,提高了CPU的并行处理效率,防止了因为锁导致的等待损耗。

7.5 page

  • 该结构的格式是体系结构无关的,不依赖于使用的CPU类型,每个页帧都由该结构描述
  • \linux-2.6.32.63\include\linux\mm_types.h
/*
Each physical page in the system has a struct page associated with it to keep track of whatever it is we are using the page for at the moment. 
Note that we have no way to track which tasks are using a page, though if it is a pagecache page, rmap structures can tell us who is mapping it.
*/
struct page 
{
    unsigned long flags;    /* architecture-independent status bits, see enum pageflags */

    /*
    Usage count: the number of references the kernel holds on this page.
    1. When it reaches 0, the kernel knows the page instance is unused
       and may be removed.
    2. While it is greater than 0, the instance must never be freed.
    */    
    atomic_t _count;         
    union 
    {
        /* 
        Count of ptes mapped in mms: how many page-table entries point
        to this page; also used to limit reverse-map searches.
        atomic_t allows the value to be modified atomically, i.e.
        unaffected by concurrent access.
        */
        atomic_t _mapcount; 
        struct 
        {    /* 
            SLUB allocator: number of objects in use in this slab page
            */
            u16 inuse;
            u16 objects;
        };
    };
    union 
    {
        struct 
        {
            /* 
            Mapping-private opaque data:
            usually used for buffer_heads if PagePrivate is set;
            used for a swp_entry_t if PageSwapCache;
            indicates the order in the buddy system if PG_buddy is set.
            'private' points to "private" data that the virtual memory
            manager itself ignores.
            */
            unsigned long private;        

            /* 
            If the low bit is clear, this points to the inode
            address_space, or is NULL. If the page is mapped as anonymous
            memory, the low bit is set and the pointer refers to an
            anon_vma object. 'mapping' thus identifies the address space
            the page frame belongs to.
            */
            struct address_space *mapping;    
        };
#if USE_SPLIT_PTLOCKS
        spinlock_t ptl;
#endif
        /* 
        SLUB allocator: pointer to the slab this page belongs to
        */
        struct kmem_cache *slab;    
        /* 
        Compound tail pages:
        the kernel can combine several adjacent pages into a larger
        compound page. The first page of the group is the head page,
        all others are tail pages; in every tail page's struct page,
        first_page points back to the head page.
        */
        struct page *first_page;    
    };
    union 
    {
        /* 
        Our offset within the mapping: the page's index inside the
        mapped address space.
        */
        pgoff_t index;        
        void *freelist;        /* SLUB: freelist req. slab lock */
    };

    /* 
    Pageout list, e.g. active_list, protected by zone->lru_lock 
    */
    struct list_head lru;        
    /*
     * On machines where all RAM is mapped into kernel address space,
     * we can simply calculate the virtual address. On machines with
     * highmem some memory is mapped into kernel virtual memory
     * dynamically, so we need a place to store that address.
     * Note that this field could be 16 bits on x86 ... ;)
     *
     * Architectures with slow multiplication can define
     * WANT_PAGE_VIRTUAL in asm/page.h
     */
#if defined(WANT_PAGE_VIRTUAL)
    /* 
    Kernel virtual address (NULL if not kmapped, i.e. highmem)
    */
    void *virtual;            
#endif /* WANT_PAGE_VIRTUAL */
#ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS
    unsigned long debug_flags;    /* Use atomic bitops on this */
#endif

#ifdef CONFIG_KMEMCHECK
    /*
     * kmemcheck wants to track the status of each byte in a page; this
     * is a pointer to such a status block. NULL if not tracked.
     */
    void *shadow;
#endif
};
  • 成员分析

    • flags: flags存储了体系结构无关的标志,用来存放页的状态属性,每一位代表一种状态,所以至少可以同时表示出32种不同的状态,这些状态定义在linux/page-flags.h中。

      /* Bit numbers for page->flags; see PageXXX()/SetPageXXX() accessors. */
      enum pageflags 
      {
          PG_locked,      // the page is locked: while set, other parts of the
                          // kernel must not access the page; prevents races in
                          // memory management, e.g. while reading data from
                          // disk into the page frame
      
          PG_error,       // set if an error occurred during an I/O operation
                          // involving this page
          PG_referenced,  // PG_referenced and PG_active track how actively the
                          // page is used — important when the swap subsystem
                          // selects pages to evict
      
          PG_uptodate,    // the page's data has been read from the block device
                          // without error
          PG_dirty,       // set when the page's contents differ from the data on
                          // disk. For performance, pages are not written back
                          // immediately after every modification; the flag marks
                          // the page so it can be flushed later. Such a page is
                          // called dirty (memory is out of sync with the backing
                          // store, e.g. the disk)
      
          PG_lru,         // helps implement page reclaim and swapping: the kernel
                          // keeps two least-recently-used (LRU) lists to separate
                          // active and inactive pages; the bit is set while the
                          // page is on one of them
          PG_active,
          PG_slab,            // set if the page is part of the SLAB allocator
          PG_owner_priv_1,    // owner use; if pagecache, fs may use
          PG_arch_1,
          PG_reserved,
          PG_private,         // must be set whenever page->private is non-NULL;
                              // for pages used for I/O, this subdivides the page
                              // into several buffers
          PG_private_2,       // if pagecache, has fs aux data
          PG_writeback,       // set while the page's contents are being written
                              // back to the block device
      #ifdef CONFIG_PAGEFLAGS_EXTENDED
          PG_head,            // a head page
          PG_tail,            // a tail page
      #else
          PG_compound,      // the page belongs to a larger compound page made up
                            // of several physically adjacent normal pages
      #endif
          PG_swapcache,        // the page is in the swap cache; in this case
                               // 'private' contains an entry of type swp_entry_t
          PG_mappedtodisk,    // has blocks allocated on-disk
          PG_reclaim,            // to be reclaimed asap: when free memory runs
                                 // low, the kernel periodically reclaims pages,
                                 // i.e. evicts inactive, unused pages; once it
                                 // decides to reclaim a particular page it sets
                                 // PG_reclaim to signal that
          PG_buddy,            // the page is free and on a buddy-system list;
                               // the buddy system is the core of the page
                               // allocation mechanism
          PG_swapbacked,        // page is backed by RAM/swap
          PG_unevictable,        // page is "unevictable"
      #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
          PG_mlocked,            // page is vma mlocked
      #endif
      #ifdef CONFIG_ARCH_USES_PG_UNCACHED
          PG_uncached,        // page has been mapped as uncached
      #endif
      #ifdef CONFIG_MEMORY_FAILURE
          PG_hwpoison,        // hardware poisoned page; don't touch
      #endif
          __NR_PAGEFLAGS,
          PG_checked = PG_owner_priv_1,    // filesystems
          PG_fscache = PG_private_2,        // page backed by cache
      
          // XEN
          PG_pinned = PG_owner_priv_1,
          PG_savepinned = PG_dirty,
      
          // SLOB
          PG_slob_free = PG_private,
      
          // SLUB
          PG_slub_frozen = PG_active,
          PG_slub_debug = PG_error,
      };
      
      • 内核定义了一些标准宏,用于检查页是否设置了某个特定的比特位,或者操作某个比特位,这些宏的名称有一定的模式,这些操作都是原子的:
        • PageXXX(page): 会检查页是否设置了PG_XXX位
        • SetPageXXX: 在某个比特位没有设置的情况下,设置该比特位,并返回原值
        • ClearPageXXX: 无条件地清除某个特定的比特位
        • TestClearPageXXX: 清除某个设置的比特位,并返回原值。
  • 很多时候,需要等待页的状态改变,然后才能恢复工作,内核提供了两个辅助函数(\linux-2.6.32.63\include\linux\pagemap.h)

    /*
     * Suppose part of the kernel must wait for a locked page until it is
     * unlocked; wait_on_page_locked provides this. When called on a
     * locked page the kernel goes to sleep; once the page is unlocked
     * the sleeping process is woken automatically and resumes work.
     */
    static inline void wait_on_page_locked(struct page *page);
    
    /*
     * wait_on_page_writeback works similarly: it waits until all pending
     * writeback operations on the page have finished, i.e. until the
     * page's data has been synchronized to the block device (e.g. disk).
     */
    static inline void wait_on_page_writeback(struct page *page);
    
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

Leon_George

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值