本文kernel代码分析基于以下环境:
1.linux-4.14.159
2.64bit代码处理逻辑
Linux通过三层树状结构来管理物理内存:Node、Zone、Page。
Node:
UMA(Uniform Memory Access)模型:译为“统一内存访问”,所有处理器共享同一物理内存地址空间,任意处理器访问任意内存的开销基本相同。
NUMA (Non-Uniform Memory Access)模型:译为“非统一内存访问”。简单理解就是把物理内存按照cpu进行划分,此时每个cpu有自己的本地内存,cpu访问自己本地内存比访问其他cpu的本地内存要快,且可减少多个cpu访问同一内存引起的资源竞争问题,这样势必对性能有极大改善。
Linux中对所有的内存进行统一管理,基于上面的模型对物理内存划分为了node,NUMA具有多个node,而UMA可理解为只有一个node的NUMA。
Zone:
在引入node概念后,由于硬件及映射方式不同等造成的内存访问差异,一个node又被分为了多个内存管理区域zone,zone就是最直接的对page的管理,我们看下64位系统下常见的几个zone
zone | 解释 |
---|---|
ZONE_DMA | 供DMA(直接内存访问)使用的内存区;部分外设只能寻址较低的物理地址范围,故单独划出此区 |
ZONE_NORMAL | 直接映射到内核的地址空间 |
ZONE_MOVABLE | 虚拟区,实际是从其它zone划出,目的是解决内存的碎片化,其页面都可以做页面迁移或回收 |
注:
针对ZONE_HIGHMEM,因为只在32位系统存在,我们这里只关注64位,因此相关内容或该zone的详细解释请自行研究
Page
页(page)作为linux内存管理的最基本单元,每一个物理页面对应一个page结构,即内核会为每一个页帧都创建struct page管理结构,这些page结构按照其物理页面地址顺序有序存放在mem_map数组中。
page在mem_map数组中的偏移代表了page结构体对应第几个物理页面,即页帧号用pfn表示。
下面通过代码中相应的结构体的解读来对这三层组织结构做进一步介绍
代码解析
注:为了便于代码、注释、解读的易读性,解读用@# 开始(#为数字序列)
Node
内核使用结构体pglist_data表示node节点,简要解释下相关的成员
typedef struct pglist_data {
struct zone node_zones[MAX_NR_ZONES];
@1. node包含的所有zone结构体的数组
struct zonelist node_zonelists[MAX_ZONELISTS];
@2. zonelist结构的数组,包含所有node的所有zone
int nr_zones;
#ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */
struct page *node_mem_map;
@3. 指向node下所有page结构体组成的数组,即管理该node下所有物理页
#ifdef CONFIG_PAGE_EXTENSION
struct page_ext *node_page_ext;
#endif
#endif
#ifndef CONFIG_NO_BOOTMEM
struct bootmem_data *bdata;
@4. 系统启动时,内存还未初始好,这个用于启动期间的内存管理
#endif
#ifdef CONFIG_MEMORY_HOTPLUG
spinlock_t node_size_lock;
#endif
unsigned long node_start_pfn;
@5. node的起始页帧号
unsigned long node_present_pages; /* total number of physical pages */
@6. node下所有可用物理page的总数
unsigned long node_spanned_pages; /* total size of physical page
range, including holes */
@7. node下所有物理page的总数(包括空洞/碎片页)
int node_id;
wait_queue_head_t kswapd_wait;
wait_queue_head_t pfmemalloc_wait;
struct task_struct *kswapd; /* Protected by
mem_hotplug_begin/end() */
@8. kswapd内存回收线程,其是用于pagecache回收或匿名页置换的
int kswapd_order;
enum zone_type kswapd_classzone_idx;
int kswapd_failures; /* Number of 'reclaimed == 0' runs */
#ifdef CONFIG_COMPACTION
int kcompactd_max_order;
enum zone_type kcompactd_classzone_idx;
wait_queue_head_t kcompactd_wait;
struct task_struct *kcompactd;
#endif
#ifdef CONFIG_NUMA_BALANCING
/* Lock serializing the migrate rate limiting window */
spinlock_t numabalancing_migrate_lock;
/* Rate limiting time interval */
unsigned long numabalancing_migrate_next_window;
/* Number of pages migrated during the rate limiting time interval */
unsigned long numabalancing_migrate_nr_pages;
@9. 上面这些用于numa负载平衡
#endif
unsigned long totalreserve_pages;
@10. 每个node预留的page数量,其不能被用户空间分配
#ifdef CONFIG_NUMA
/*
* zone reclaim becomes active if more unmapped pages exist.
*/
unsigned long min_unmapped_pages;
@11. 当未映射(unmapped)页数超过此值时才会激活zone的page回收
unsigned long min_slab_pages;
@12. 当slab中可回收page大于此值则进行slab中的缓存页回收
#endif /* CONFIG_NUMA */
/* Write-intensive fields used by page reclaim */
ZONE_PADDING(_pad1_)
spinlock_t lru_lock;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
* If memory initialisation on large machines is deferred then this
* is the first PFN that needs to be initialised.
*/
unsigned long first_deferred_pfn;
/* Number of non-deferred pages */
unsigned long static_init_pgcnt;
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
spinlock_t split_queue_lock;
struct list_head split_queue;
unsigned long split_queue_len;
#endif
/* Fields commonly accessed by the page reclaim scanner */
struct lruvec lruvec;
@13. 保存LRU_INACTIVE_ANON、LRU_ACTIVE_ANON、LRU_INACTIVE_FILE、LRU_ACTIVE_FILE、LRU_UNEVICTABLE,
内存不足时根据lru算法选择合适page进行释放
unsigned int inactive_ratio;
unsigned long flags;
ZONE_PADDING(_pad2_)
/* Per-node vmstats */
struct per_cpu_nodestat __percpu *per_cpu_nodestats;
atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS];
} pg_data_t;
主要理解:node_zones、node_mem_map、kswapd、lruvec
Zone
我们再看下Zone的结构体:
struct zone {
unsigned long watermark[NR_WMARK];
@1. zone的水位线,存在3个阈值:WMARK_MIN、WMARK_LOW、WMARK_HIGH,当free page低于low水位时会唤醒kswapd回收page,低于min水位时则触发直接回收(direct reclaim)
unsigned long nr_reserved_highatomic;
@2. 用于处理一些紧急情况的预留区大小
long lowmem_reserve[MAX_NR_ZONES];
@3. 针对高位zone内存不足回退到此zone申请内存时,应至少预留的大小。
例: MAX_NR_ZONES=3, ZONE_DMA<ZONE_NORMAL<ZONE_MOVABLE;
lowmem_reserve[0]=0,lowmem_reserve[1]=20,lowmem_reserve[2]=30;
即从ZONE_NORMAL申请内存时,当free page不够则会fallback到ZONE_DMA去申请内存,但是申请前提是要保证ZONE_DMA中必须有20个page预留大小
#ifdef CONFIG_NUMA
int node;
#endif
struct pglist_data *zone_pgdat;
@4. 指向zone所属的node结构体
struct per_cpu_pageset __percpu *pageset;
@5. zone的内存每个cpu都可申请,当分配order=0的页时,从per-cpu页面高速缓存中分配,避免多个cpu申请时的自旋锁竞争,可以提高分配的性能
#ifndef CONFIG_SPARSEMEM
unsigned long *pageblock_flags;
@6. 页面迁移时使用
#endif /* CONFIG_SPARSEMEM */
/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
unsigned long zone_start_pfn;
@7. zone中起始页帧号
unsigned long managed_pages;
@8. 伙伴系统管理的页帧数量
unsigned long spanned_pages;
@9. zone中物理内存的所有页帧数目,包括空洞
unsigned long present_pages;
@10. zone中物理内存的所有页帧数目,不包括空洞,managed_pages=present_pages-reserved pages
const char *name;
@11. zone的名称如DMA
#ifdef CONFIG_MEMORY_ISOLATION
unsigned long nr_isolate_pageblock;
@12. 隔离的页块数量
#endif
#ifdef CONFIG_MEMORY_HOTPLUG
seqlock_t span_seqlock;
#endif
int initialized;
/* Write-intensive fields used from the page allocator */
ZONE_PADDING(_pad1_)
/* free areas of different sizes */
struct free_area free_area[MAX_ORDER];
@13. zone的空闲页数组/链表,MAX_ORDER=11,每个area中的元素都是大小为2^n的页面(n从0到10)
如n=2,则free_area[2]中元素大小都是4个连续page
/* zone flags, see below */
unsigned long flags;
/* Primarily protects free_area */
spinlock_t lock;
/* Write-intensive fields used by compaction and vmstats. */
ZONE_PADDING(_pad2_)
unsigned long percpu_drift_mark;
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
/* pfn where compaction free scanner should start */
unsigned long compact_cached_free_pfn;
unsigned long compact_cached_migrate_pfn[2];
#endif
#ifdef CONFIG_COMPACTION
unsigned int compact_considered;
unsigned int compact_defer_shift;
int compact_order_failed;
#endif
@14. 上面几项用于内存规整
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
/* Set to true when the PG_migrate_skip bits should be cleared */
bool compact_blockskip_flush;
#endif
bool contiguous;
ZONE_PADDING(_pad3_)
/* Zone statistics */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS];
@15. 用于zone的一些数据统计信息
} ____cacheline_internodealigned_in_smp;
上面大多数成员可通过“cat /proc/zoneinfo”来查看zone中的这些值来理解zone结构体
Page
page结构体是linux内核内存管理的重点,每个物理页面都需要一个page结构体来描述,然而这会额外消耗过大的内存资源,因此为了解决占用空间问题,page中使用了大量的联合体来优化空间占用
include\linux\mm_types.h
struct page {
/* First double word block */
unsigned long flags;
@1. 页帧的标志位,如PG_locked、PG_active、PG_writeback、 PG_reserved等,另外还保存了zone编号、node编号等信息
union {
struct address_space *mapping; /* If low bit clear, points to
* inode address_space, or NULL.
* If page mapped as anonymous
* memory, low bit is set, and
* it points to anon_vma object:
* see PAGE_MAPPING_ANON below.
*/
@2. page分为文件页和匿名页,如果是文件页则mapping指向inode的地址;
如果是匿名页,则指向anon_vma,而这两者的判断方式就是低位的值
void *s_mem; /* slab first object */
@3. 指向slab第一个对象的地址
atomic_t compound_mapcount; /* first tail page */
/* page_deferred_list().next -- second tail page */
};
/* Second double word */
union {
pgoff_t index; /* Our offset within mapping. */
@4. 对于文件页:表示在已映射文件内的offset;对于匿名页:表示在映射的地址空间内的offset
void *freelist; /* sl[aou]b first free object */
@5. 用于slab分配器中
/* page_deferred_list().prev -- second tail page */
};
union {
#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
unsigned long counters;
#else
/*
* Keep _refcount separate from slub cmpxchg_double data.
* As the rest of the double word is protected by slab_lock
* but _refcount is not.
*/
unsigned counters;
#endif
@6. 用于slub分配器中
struct {
union {
/*
* Count of ptes mapped in mms, to show when
* page is mapped & limit reverse map searches.
*
* Extra information about page type may be
* stored here for pages that are never mapped,
* in which case the value MUST BE <= -2.
* See page-flags.h for more details.
*/
atomic_t _mapcount;
@7. 表示page被页表项/pte(page table entry)映射的次数;当多个进程的页表项映射到同一page时,
说明该page被多个进程映射/共享,此特性可用于Rmap反向映射中
_mapcount= PAGE_BUDDY_MAPCOUNT_VALUE(-128):说明page在buddy系统中,表示此页可用
_mapcount=-1:初始值没有被进程映射
_mapcount=0:被一个进程映射,通常为父进程
_mapcount>0:被多个进程映射
unsigned int active; /* SLAB */
struct { /* SLUB */
unsigned inuse:16;
unsigned objects:15;
unsigned frozen:1;
};
int units; /* SLOB */
};
@8. 以上用于slab、slub、slob小内存分配
/*
* Usage count, *USE WRAPPER FUNCTION* when manual
* accounting. See page_ref.h
*/
atomic_t _refcount;
@9. page在内核中引用计数值,在内核很多场景下的操作处理时会+1,操作完成需要-1,如把page加入到lru中这个场景
_refcount=0,表示无引用该page,可被释放
_refcount>0,表示被内核引用,不可释放
};
};
/*
* Third double word block
*
* WARNING: bit 0 of the first word encode PageTail(). That means
* the rest users of the storage space MUST NOT use the bit to
* avoid collision and false-positive PageTail().
*/
union {
struct list_head lru; /* Pageout list, eg. active_list
* protected by zone_lru_lock !
* Can be used as a generic list
* by the page owner.
*/
@10. page涉及各种链表,为了方便页的分类维护,如伙伴系统上同阶但是不同迁移类型链表,还用在页缓存、slab小内存等
struct dev_pagemap *pgmap; /* ZONE_DEVICE pages are never on an
* lru or handled by a slab
* allocator, this points to the
* hosting device page map.
*/
struct { /* slub per cpu partial pages */
struct page *next; /* Next partial slab */
#ifdef CONFIG_64BIT
int pages; /* Nr of partial slabs left */
int pobjects; /* Approximate # of objects */
#else
short int pages;
short int pobjects;
#endif
};
struct rcu_head rcu_head; /* Used by SLAB
* when destroying via RCU
*/
@11. 以上这些用于slab分配器
/* Tail pages of compound page */
struct {
unsigned long compound_head; /* If bit zero is set */
/* First tail page only */
#ifdef CONFIG_64BIT
unsigned int compound_dtor;
unsigned int compound_order;
#else
unsigned short int compound_dtor;
unsigned short int compound_order;
#endif
};
@12. 以上这些用于复合页相关
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS
struct {
unsigned long __pad; /* do not overlay pmd_huge_pte
* with compound_head to avoid
* possible bit 0 collision.
*/
pgtable_t pmd_huge_pte; /* protected by page->ptl */
};
#endif
};
@13.透明大页相关
/* Remainder is not double word aligned */
union {
unsigned long private; /* Mapping-private opaque data:
* usually used for buffer_heads
* if PagePrivate set; used for
* swp_entry_t if PageSwapCache;
* indicates order in the buddy
* system if PG_buddy is set.
*/
@14.私有数据,不同场景下不同含义,看英文注释
#if USE_SPLIT_PTE_PTLOCKS
#if ALLOC_SPLIT_PTLOCKS
spinlock_t *ptl;
#else
spinlock_t ptl;
#endif
#endif
struct kmem_cache *slab_cache; /* SL[AU]B: Pointer to slab */
@15. 指向slab的高速缓存
};
#ifdef CONFIG_MEMCG
struct mem_cgroup *mem_cgroup;
#endif
#if defined(WANT_PAGE_VIRTUAL)
void *virtual; /* Kernel virtual address (NULL if
not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */
#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
int _last_cpupid;
#endif
}
page我们介绍完了,重点关注mapping(理解文件页和匿名页)、_mapcount(页被页表映射次数)、_refcount(页的引用次数)的理解。