Linux可以支持大量的架构,所以需要用一种与架构无关的方式去描述内存。在linux的内存管理中,我们首先要明确的一个概念就是NUMA(Non-Uniform Memory Access,关于NUMA的介绍可以参考我前面的文章)。很多大型机器都采用NUMA架构,将内存和CPU分为很多组,每一组称为一个节点(node)。节点与节点之间的互相访问,会因为“距离”的不同导致不同的开销。Linux通过struct pglist_data这个结构体来描述节点,对于UMA架构,Linux同样会保留节点的概念,只是整个系统就是一个节点,只需要一个struct pglist_data来描述,它叫作contig_page_data.
struct pglist_data结构描述如下,现在只需了解其中的关键项即可
typedef struct pglist_data {
struct zone node_zones[MAX_NR_ZONES]; /*节点中的管理区*/
struct zonelist node_zonelists[MAX_ZONELISTS]; /*list中zone的顺序代表了分配内存的顺序,前者分配内存失败将会到后者的区域中分配内存*/
int nr_zones; /*节点中管理区的数目,不一定为3个,有的节点中可能不存在ZONE_DMA*/
#ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */
struct page *node_mem_map;
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
struct page_cgroup *node_page_cgroup;
#endif
#endif
struct bootmem_data *bdata;
#ifdef CONFIG_MEMORY_HOTPLUG
/*
* Must be held any time you expect node_start_pfn, node_present_pages
* or node_spanned_pages stay constant. Holding this will also
* guarantee that any pfn_valid() stays that way.
*
* Nests above zone->lock and zone->size_seqlock.
*/
spinlock_t node_size_lock;
#endif
unsigned long node_start_pfn; /*该节点的起始页框编号*/
unsigned long node_present_pages; /* total number of physical pages */
unsigned long node_spanned_pages; /* total size of physical page
range, including holes */
int node_id; /*节点标识符,代表当前节点是系统中的第几个节点*/
wait_queue_head_t kswapd_wait; /*页换出进程使用的等待队列*/
struct task_struct *kswapd; /*指向页换出进程的进程描述符*/
int kswapd_max_order; /*kswapd将要创建的空闲块的大小取对数的值*/
每个节点的内存会被分为几个块,我们称之为管理区(zone),对于一个管理区,我们使用struct zone结构体来描述。管理区的类型可以分为ZONE_NORMAL,ZONE_DMA,ZONE_HIGHMEM三种。这三个管理区在物理内存上的布局为
ZONE_DMA: 0~16MB
ZONE_NORMAL: 16MB~896MB
ZONE_HIGHMEM:896MB~end
struct zone结构描述如下:
struct zone {
/* Fields commonly accessed by the page allocator */
/* zone watermarks, access with *_wmark_pages(zone) macros */
unsigned long watermark[NR_WMARK];/*该管理区的三个水平线值,min,low,high*/
/*
* When free pages are below this point, additional steps are taken
* when reading the number of free pages to avoid per-cpu counter
* drift allowing watermarks to be breached
*/
unsigned long percpu_drift_mark;
/*
* We don't know if the memory that we're going to allocate will be freeable
* or/and it will be released eventually, so to avoid totally wasting several
* GB of ram we must reserve some of the lower zone memory (otherwise we risk
* to run OOM on the lower zones despite there's tons of freeable ram
* on the higher zones). This array is recalculated at runtime if the
* sysctl_lowmem_reserve_ratio sysctl changes.
*/
unsigned long lowmem_reserve[MAX_NR_ZONES]; /*每个管理区必须保留的页框数*/
#ifdef CONFIG_NUMA /*如果定义了NUMA*/
int node; /*该管理区所属节点的节点号*/
/*
* zone reclaim becomes active if more unmapped pages exist.
*/
unsigned long min_unmapped_pages; /*当可回收的页面数大于该变量时,管理区将回收页面*/
unsigned long min_slab_pages; /*同上,只不过该标准用于slab回收页面中*/
struct per_cpu_pageset *pageset[NR_CPUS]; /*每个CPU使用的页面缓存*/
#else
struct per_cpu_pageset pageset[NR_CPUS];
#endif
/*
* free areas of different sizes
*/
spinlock_t lock; /*保护该管理区的自旋锁*/
#ifdef CONFIG_MEMORY_HOTPLUG
/* see spanned/present_pages for more description */
seqlock_t span_seqlock;
#endif
struct free_area free_area[MAX_ORDER];/*标识出管理区中的空闲页框块*/
#ifndef CONFIG_SPARSEMEM
/*
* Flags for a pageblock_nr_pages block. See pageblock-flags.h.
* In SPARSEMEM, this map is stored in struct mem_section
*/
unsigned long *pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
ZONE_PADDING(_pad1_)
/* Fields commonly accessed by the page reclaim scanner */
spinlock_t lru_lock;
struct zone_lru {
struct list_head list;
} lru[NR_LRU_LISTS];
struct zone_reclaim_stat reclaim_stat; /*页面回收的状态*/
/*管理区回收页框时使用的计数器,记录到上一次回收,一共扫过的页框数*/
unsigned long pages_scanned; /* since last reclaim */
unsigned long flags; /* zone flags, see below */
/* Zone statistics */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
/*
* prev_priority holds the scanning priority for this zone. It is
* defined as the scanning priority at which we achieved our reclaim
* target at the previous try_to_free_pages() or balance_pgdat()
* invokation.
*
* We use prev_priority as a measure of how much stress page reclaim is
* under - it drives the swappiness decision: whether to unmap mapped
* pages.
*
* Access to both this field is quite racy even on uniprocessor. But
* it is expected to average out OK.
*/
int prev_priority;
/*
* The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
* this zone's LRU. Maintained by the pageout code.
*/
unsigned int inactive_ratio;
ZONE_PADDING(_pad2_)
/* Rarely used or read-mostly fields */
/*
* wait_table -- the array holding the hash table
* wait_table_hash_nr_entries -- the size of the hash table array
* wait_table_bits -- wait_table_size == (1 << wait_table_bits)
*
* The purpose of all these is to keep track of the people
* waiting for a page to become available and make them
* runnable again when possible. The trouble is that this
* consumes a lot of space, especially when so few things
* wait on pages at a given time. So instead of using
* per-page waitqueues, we use a waitqueue hash table.
*
* The bucket discipline is to sleep on the same queue when
* colliding and wake all in that wait queue when removing.
* When something wakes, it must check to be sure its page is
* truly available, a la thundering herd. The cost of a
* collision is great, but given the expected load of the
* table, they should be so rare as to be outweighed by the
* benefits from the saved space.
*
* __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
* primary users of these fields, and in mm/page_alloc.c
* free_area_init_core() performs the initialization of them.
*/
wait_queue_head_t * wait_table; /*进程等待队列的散列表,这些进程正在等待管理区中的某页*/
unsigned long wait_table_hash_nr_entries; /*散列表数组的大小*/
unsigned long wait_table_bits; /*散列表数组的大小对2取log的结果*/
/*
* Discontig memory support fields.
*/
struct pglist_data *zone_pgdat; /*管理区所属节点*/
/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
unsigned long zone_start_pfn; /*管理区的起始页框号*/
/*
* zone_start_pfn, spanned_pages and present_pages are all
* protected by span_seqlock. It is a seqlock because it has
* to be read outside of zone->lock, and it is done in the main
* allocator path. But, it is written quite infrequently.
*
* The lock is declared along with zone->lock because it is
* frequently read in proximity to zone->lock. It's good to
* give them a chance of being in the same cacheline.
*/
unsigned long spanned_pages; /*管理区的大小,包括洞*/
unsigned long present_pages; /*管理区的大小,不包括洞*/
/*
* rarely used fields:
*/
const char *name; /*指向管理区的名称,为"DMA","NORMAL"或"HighMem"*/
}
物理内存中的每个页都会有一个与之关联的struct page结构来对其进行描述和跟踪,其结构描述如下
struct page {
unsigned long flags; /* Atomic flags, some possibly
* updated asynchronously */
atomic_t _count; /* Usage count, see below. */
union {
atomic_t _mapcount; /* Count of ptes mapped in mms,
* to show when page is mapped
* & limit reverse map searches.
*/
struct { /* SLUB */
u16 inuse;
u16 objects;
};
};
union {
struct {
unsigned long private; /* Mapping-private opaque data:
* usually used for buffer_heads
* if PagePrivate set; used for
* swp_entry_t if PageSwapCache;
* indicates order in the buddy
* system if PG_buddy is set.
*/
struct address_space *mapping; /* If low bit clear, points to
* inode address_space, or NULL.
* If page mapped as anonymous
* memory, low bit is set, and
* it points to anon_vma object:
* see PAGE_MAPPING_ANON below.
*/
};
#if USE_SPLIT_PTLOCKS
spinlock_t ptl;
#endif
struct kmem_cache *slab; /* SLUB: Pointer to slab */
struct page *first_page; /* Compound tail pages */
};
union {
pgoff_t index; /* Our offset within mapping. */
void *freelist; /* SLUB: freelist req. slab lock */
};
struct list_head lru; /* Pageout list, eg. active_list
* protected by zone->lru_lock !
*/
/*
* On machines where all RAM is mapped into kernel address space,
* we can simply calculate the virtual address. On machines with
* highmem some memory is mapped into kernel virtual memory
* dynamically, so we need a place to store that address.
* Note that this field could be 16 bits on x86 ... ;)
*
* Architectures with slow multiplication can define
* WANT_PAGE_VIRTUAL in asm/page.h
*/
#if defined(WANT_PAGE_VIRTUAL)
void *virtual; /* Kernel virtual address (NULL if
not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */
#ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS
unsigned long debug_flags; /* Use atomic bitops on this */
#endif
#ifdef CONFIG_KMEMCHECK
/*
* kmemcheck wants to track the status of each byte in a page; this
* is a pointer to such a status block. NULL if not tracked.
*/
void *shadow;
#endif
};
Node,Zone和Page的关系可以用下图来描述
至此,已经对内存管理中的这三个关键数据结构有了一个感性的认识,在后面将会结合具体的代码来逐步深入,选择的代码版本为2.6.32.59~