Linux manages page frames with the buddy system algorithm. Compared with bootmem, it allocates faster, locating free memory quickly, and it effectively combats external fragmentation.
The page frame handling code lives mainly in mm/page_alloc.c.
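The central trick of the buddy algorithm is that a free block of order k (2^k contiguous page frames) and its "buddy" differ only in bit k of their starting page frame number, so splitting and coalescing reduce to flipping one bit. A minimal sketch of that arithmetic (buddy_pfn here is an illustrative helper, not a kernel function):

#include <stdio.h>

/* For a block starting at page frame number pfn with order k,
 * the buddy block starts at pfn with bit k flipped. */
static unsigned long buddy_pfn(unsigned long pfn, unsigned int order)
{
        return pfn ^ (1UL << order);
}

int main(void)
{
        printf("%lu\n", buddy_pfn(12, 0));  /* 13: order-0 buddy of frame 12 */
        printf("%lu\n", buddy_pfn(8, 2));   /* 12: order-2 buddy of frame 8  */
        return 0;
}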
I. Key data structures
i. The page descriptor: struct page
Page frame management is the heart of memory management, so the kernel must know the exact state of every page frame: whether it is free, whether it holds code or data, whether it has been modified, and so on.
Every page frame has a corresponding page descriptor that records this information; the descriptor is struct page:
/*
 * Each physical page in the system has a struct page associated with
 * it to keep track of whatever it is we are using the page for at the
 * moment. Note that we have no way to track which tasks are using
 * a page, though if it is a pagecache page, rmap structures can tell us
 * who is mapping it.
 */
struct page {
        unsigned long flags;            /* Atomic flags, some possibly
                                         * updated asynchronously */
        atomic_t _count;                /* Usage count, see below. */
        union {
                atomic_t _mapcount;     /* Count of ptes mapped in mms,
                                         * to show when page is mapped
                                         * & limit reverse map searches.
                                         */
                struct {                /* SLUB */
                        u16 inuse;
                        u16 objects;
                };
        };
        union {
                struct {
                        unsigned long private;          /* Mapping-private opaque data:
                                                         * usually used for buffer_heads
                                                         * if PagePrivate set; used for
                                                         * swp_entry_t if PageSwapCache;
                                                         * indicates order in the buddy
                                                         * system if PG_buddy is set.
                                                         */
                        struct address_space *mapping;  /* If low bit clear, points to
                                                         * inode address_space, or NULL.
                                                         * If page mapped as anonymous
                                                         * memory, low bit is set, and
                                                         * it points to anon_vma object:
                                                         * see PAGE_MAPPING_ANON below.
                                                         */
                };
#if USE_SPLIT_PTLOCKS
                spinlock_t ptl;
#endif
                struct kmem_cache *slab;        /* SLUB: Pointer to slab */
                struct page *first_page;        /* Compound tail pages */
        };
        union {
                pgoff_t index;          /* Our offset within mapping. */
                void *freelist;         /* SLUB: freelist req. slab lock */
        };
        struct list_head lru;           /* Pageout list, eg. active_list
                                         * protected by zone->lru_lock !
                                         */
        /*
         * On machines where all RAM is mapped into kernel address space,
         * we can simply calculate the virtual address. On machines with
         * highmem some memory is mapped into kernel virtual memory
         * dynamically, so we need a place to store that address.
         * Note that this field could be 16 bits on x86 ... ;)
         *
         * Architectures with slow multiplication can define
         * WANT_PAGE_VIRTUAL in asm/page.h
         */
#if defined(WANT_PAGE_VIRTUAL)
        void *virtual;                  /* Kernel virtual address (NULL if
                                           not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */
#ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS
        unsigned long debug_flags;      /* Use atomic bitops on this */
#endif

#ifdef CONFIG_KMEMCHECK
        /*
         * kmemcheck wants to track the status of each byte in a page; this
         * is a pointer to such a status block. NULL if not tracked.
         */
        void *shadow;
#endif
};
flags: page flag bits describing the frame's current state; for example, PG_buddy means the frame belongs to the buddy system.
lru: links page descriptors into lists; for example, the buddy system threads the first page descriptor of each free block through lru to form the per-order free-block lists (see the sketch below).
private: page-private data; for a block in the buddy system it stores the block's order.
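To make the roles of lru and private concrete, here is a small user-space mock of the buddy free-list bookkeeping. struct mock_page, free_area and mark_free are invented for illustration; the real code lives in mm/page_alloc.c and uses the kernel's struct page and struct free_area:

#include <stdio.h>

#define MAX_ORDER 11

/* Minimal doubly linked list, mirroring the kernel's struct list_head. */
struct list_head { struct list_head *prev, *next; };

static void list_init(struct list_head *h) { h->prev = h->next = h; }

static void list_add(struct list_head *n, struct list_head *h)
{
        n->next = h->next;
        n->prev = h;
        h->next->prev = n;
        h->next = n;
}

/* Mock page descriptor: only the fields the buddy system uses here. */
struct mock_page {
        unsigned long flags;    /* would carry PG_buddy */
        unsigned long private;  /* order of the free block */
        struct list_head lru;   /* links the block into a free list */
};

/* One free list per order, like zone->free_area[MAX_ORDER]. */
static struct list_head free_area[MAX_ORDER];

/* Insert a free block: record its order in ->private and chain its
 * first page descriptor onto the matching free list via ->lru. */
static void mark_free(struct mock_page *first, unsigned int order)
{
        first->private = order;
        list_add(&first->lru, &free_area[order]);
}

int main(void)
{
        static struct mock_page frames[16];
        for (int i = 0; i < MAX_ORDER; i++)
                list_init(&free_area[i]);

        mark_free(&frames[0], 3);  /* frames 0..7 free as one order-3 block */
        mark_free(&frames[8], 2);  /* frames 8..11 free as one order-2 block */

        printf("order of block at frame 0: %lu\n", frames[0].private);  /* 3 */
        return 0;
}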
ii. The zone descriptor: struct zone
The hardware architecture places constraints on how page frames may be used; not every frame is usable in every situation (e.g. DMA over the ISA bus can only reach the low 16MB, and a 32-bit CPU cannot map all physical memory into the kernel's address space). Linux therefore groups page frames into zones.
On x86 the main zones are (a classification sketch follows the list):
ZONE_DMA: below 16MB
ZONE_NORMAL: the kernel's directly mapped region, 16MB~896MB
ZONE_HIGHMEM: memory that cannot be directly mapped, above 896MB
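As an illustration of these boundaries only (zone_of_pfn is a made-up helper; the kernel derives a page's zone from its flags via page_zone()), classifying a page frame number on classic x86-32 looks like:

#include <stdio.h>

#define PAGE_SHIFT 12
/* Classic x86-32 boundaries, 16MB and 896MB, expressed in page frames. */
#define DMA_LIMIT_PFN     ((16UL << 20) >> PAGE_SHIFT)
#define NORMAL_LIMIT_PFN  ((896UL << 20) >> PAGE_SHIFT)

enum zone_id { ZONE_DMA, ZONE_NORMAL, ZONE_HIGHMEM };

/* Hypothetical helper: which zone does a page frame number fall in? */
static enum zone_id zone_of_pfn(unsigned long pfn)
{
        if (pfn < DMA_LIMIT_PFN)
                return ZONE_DMA;
        if (pfn < NORMAL_LIMIT_PFN)
                return ZONE_NORMAL;
        return ZONE_HIGHMEM;
}

int main(void)
{
        /* A frame at physical address 1GB lands in ZONE_HIGHMEM. */
        printf("%d\n", zone_of_pfn((1UL << 30) >> PAGE_SHIFT) == ZONE_HIGHMEM);
        return 0;
}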
Each zone is represented by a zone descriptor, struct zone:
struct zone {
        /* Fields commonly accessed by the page allocator */

        /* zone watermarks, access with *_wmark_pages(zone) macros */
        unsigned long watermark[NR_WMARK];

        /*
         * When free pages are below this point, additional steps are taken
         * when reading the number of free pages to avoid per-cpu counter
         * drift allowing watermarks to be breached
         */
        unsigned long percpu_drift_mark;

        /*
         * We don't know if the memory that we're going to allocate will be freeable
         * or/and it will be released eventually, so to avoid totally wasting several
         * GB of ram we must reserve some of the lower zone memory (otherwise we risk
         * to run OOM on the lower zones despite there's tons of freeable ram
         * on the higher zones). This array is recalculated at runtime if the
         * sysctl_lowmem_reserve_ratio sysctl changes.
         */
        unsigned long lowmem_reserve[MAX_NR_ZONES];

#ifdef CONFIG_NUMA
        int node;
        /*
         * zone reclaim becomes active if more unmapped pages exist.
         */
        unsigned long min_unmapped_pages;
        unsigned long min_slab_pages;
        struct per_cpu_pageset *pageset[NR_CPUS];
#else
        struct per_cpu_pageset pageset[NR_CPUS];
#endif
        /*
         * free areas of different sizes
         */
        spinlock_t lock;
#ifdef CONFIG_MEMORY_HOTPLUG
        /* see spanned/present_pages for more description */
        seqlock_t span_seqlock;
#endif
        struct free_area free_area[MAX_ORDER];

#ifndef CONFIG_SPARSEMEM
        /*
         * Flags for a pageblock_nr_pages block. See pageblock-flags.h.
         * In SPARSEMEM, this map is stored in struct mem_section
         */
        unsigned long *pageblock_flags;
#endif /* CONFIG_SPARSEMEM */


        ZONE_PADDING(_pad1_)

        /* Fields commonly accessed by the page reclaim scanner */
        spinlock_t lru_lock;
        struct zone_lru {
                struct list_head list;
        } lru[NR_LRU_LISTS];

        struct zone_reclaim_stat reclaim_stat;

        unsigned long pages_scanned;    /* since last reclaim */
        unsigned long flags;            /* zone flags, see below */

        /* Zone statistics */
        atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];

        /*
         * prev_priority holds the scanning priority for this zone. It is
         * defined as the scanning priority at which we achieved our reclaim
         * target at the previous try_to_free_pages() or balance_pgdat()
         * invokation.
         *
         * We use prev_priority as a measure of how much stress page reclaim is
         * under - it drives the swappiness decision: whether to unmap mapped
         * pages.
         *
         * Access to both this field is quite racy even on uniprocessor. But
         * it is expected to average out OK.
         */
        int prev_priority;

        /*
         * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
         * this zone's LRU. Maintained by the pageout code.
         */
        unsigned int inactive_ratio;


        ZONE_PADDING(_pad2_)
        /* Rarely used or read-mostly fields */

        /*
         * wait_table -- the array holding the hash table
         * wait_table_hash_nr_entries -- the size of the hash table array
         * wait_table_bits -- wait_table_size == (1 << wait_table_bits)
         *
         * The purpose of all these is to keep track of the people
         * waiting for a page to become available and make them
         * runnable again when possible. The trouble is that this
         * consumes a lot of space, especially when so few things
         * wait on pages at a given time. So instead of using
         * per-page waitqueues, we use a waitqueue hash table.
         *
         * The bucket discipline is to sleep on the same queue when
         * colliding and wake all in that wait queue when removing.
         * When something wakes, it must check to be sure its page is
         * truly available, a la thundering herd. The cost of a
         * collision is great, but given the expected load of the
         * table, they should be so rare as to be outweighed by the
         * benefits from the saved space.
         *
         * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
         * primary users of these fields, and in mm/page_alloc.c
         * free_area_init_core() performs the initialization of them.
         */
        wait_queue_head_t *wait_table;
        unsigned long wait_table_hash_nr_entries;
        unsigned long wait_table_bits;

        /*
         * Discontig memory support fields.
         */
        struct pglist_data *zone_pgdat;
        /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
        unsigned long zone_start_pfn;

        /*
         * zone_start_pfn, spanned_pages and present_pages are all
         * protected by span_seqlock. It is a seqlock because it has
         * to be read outside of zone->lock, and it is done in the main
         * allocator path. But, it is written quite infrequently.
         *
         * The lock is declared along with zone->lock because it is
         * frequently read in proximity to zone->lock. It's good to
         * give them a chance of being in the same cacheline.
         */
        unsigned long spanned_pages;    /* total size, including holes */
        unsigned long present_pages;    /* amount of memory (excluding holes) */

        /*
         * rarely used fields:
         */
        const char *name;
} ____cacheline_internodealigned_in_smp;
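Two fields above do most of the allocator's day-to-day work: free_area[MAX_ORDER] holds the buddy system's per-order free lists, and watermark[NR_WMARK] (min/low/high) gates allocations and triggers reclaim. A simplified sketch of the watermark test, loosely modeled on the idea behind zone_watermark_ok() (the real check also subtracts lowmem_reserve and tightens the bound for higher orders):

#include <stdio.h>

/* Allow an order-'order' allocation only if the zone would still
 * have more than 'mark' free pages afterwards. Signed arithmetic
 * avoids wraparound when free_pages is small. */
static int watermark_ok_sketch(long free_pages, long mark, unsigned int order)
{
        return free_pages - (1L << order) > mark;
}

int main(void)
{
        /* 1000 free pages, watermark 128: an order-3 request passes... */
        printf("%d\n", watermark_ok_sketch(1000, 128, 3));  /* 1 */
        /* ...but not when only 130 pages remain free. */
        printf("%d\n", watermark_ok_sketch(130, 128, 3));   /* 0 */
        return 0;
}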