内核把物理页作为内存管理的基本单位。
内存管理单元(MMU)通常以页为单位进行处理,MMU是负责管理内存并把虚拟地址转换为物理地址的硬件。
MMU以页(page)大小为单位来管理系统中的页表。
体系结构的不同,所支持的页的大小也不尽相同,大多数32位体系结构支持4KB PAGE_SIZE, 64位的体系结构一般支持8KB的PAGE_SIZE。
如果PAGE_SIZE=4KB, 1G物理内存,则物理内存会被划分为1024x1024/4 =262144个页。
1. 内核用 struct page 结构表示系统中的每个物理页:
page结构与物理页相关,内核仅仅用page来描述当前时刻 在相关物理页中存放的东西,page的目的在于描述物理内存本身,而不是描述包含在其中的数据。
/*
* Each physical page in the system has a struct page associated with
* it to keep track of whatever it is we are using the page for at the
* moment. Note that we have no way to track which tasks are using
* a page, though if it is a pagecache page, rmap structures can tell us
* who is mapping it.
*/
/* One struct page describes one physical page frame (see comment above). */
struct page {
unsigned long flags; /* Atomic flags, some possibly
* updated asynchronously */
atomic_t _count; /* Usage count, see below. */
/* Union 1: reverse-map PTE count, or SLUB per-slab object bookkeeping. */
union {
atomic_t _mapcount; /* Count of ptes mapped in mms,
* to show when page is mapped
* & limit reverse map searches.
*/
struct { /* SLUB */
u16 inuse;
u16 objects;
};
};
/* Union 2: pagecache/anon mapping state, split PTE lock, SLUB slab
* back-pointer, or (for compound tail pages) link to the head page. */
union {
struct {
unsigned long private; /* Mapping-private opaque data:
* usually used for buffer_heads
* if PagePrivate set; used for
* swp_entry_t if PageSwapCache;
* indicates order in the buddy
* system if PG_buddy is set.
*/
struct address_space *mapping; /* If low bit clear, points to
* inode address_space, or NULL.
* If page mapped as anonymous
* memory, low bit is set, and
* it points to anon_vma object:
* see PAGE_MAPPING_ANON below.
*/
};
#if USE_SPLIT_PTLOCKS
spinlock_t ptl;
#endif
struct kmem_cache *slab; /* SLUB: Pointer to slab */
struct page *first_page; /* Compound tail pages */
};
/* Union 3: pagecache offset, or SLUB free-object list head. */
union {
pgoff_t index; /* Our offset within mapping. */
void *freelist; /* SLUB: freelist req. slab lock */
};
struct list_head lru; /* Pageout list, eg. active_list
* protected by zone->lru_lock !
*/
/*
* On machines where all RAM is mapped into kernel address space,
* we can simply calculate the virtual address. On machines with
* highmem some memory is mapped into kernel virtual memory
* dynamically, so we need a place to store that address.
* Note that this field could be 16 bits on x86 ... ;)
*
* Architectures with slow multiplication can define
* WANT_PAGE_VIRTUAL in asm/page.h
*/
#if defined(WANT_PAGE_VIRTUAL)
void *virtual; /* Kernel virtual address (NULL if
not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */
#ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS
unsigned long debug_flags; /* Use atomic bitops on this */
#endif
#ifdef CONFIG_KMEMCHECK
/*
* kmemcheck wants to track the status of each byte in a page; this
* is a pointer to such a status block. NULL if not tracked.
*/
void *shadow;
#endif
};
1.1 flags 用来存放页的状态。 包括页是不是脏的,是不是被锁定在内存中。 flags的每一位表示一种状态,它可以同时表示出32种不同的状态。 这些定义在<linux/page-flags.h>中
/*
* Various page->flags bits:
*
* PG_reserved is set for special pages, which can never be swapped out. Some
* of them might not even exist (eg empty_bad_page)...
*
* The PG_private bitflag is set on pagecache pages if they contain filesystem
* specific data (which is normally at page->private). It can be used by
* private allocations for its own usage.
*
* During initiation of disk I/O, PG_locked is set. This bit is set before I/O
* and cleared when writeback _starts_ or when read _completes_. PG_writeback
* is set before writeback starts and cleared when it finishes.
*
* PG_locked also pins a page in pagecache, and blocks truncation of the file
* while it is held.
*
* page_waitqueue(page) is a wait queue of all tasks waiting for the page
* to become unlocked.
*
* PG_uptodate tells whether the page's contents is valid. When a read
* completes, the page becomes uptodate, unless a disk I/O error happened.
*
* PG_referenced, PG_reclaim are used for page reclaim for anonymous and
* file-backed pagecache (see mm/vmscan.c).
*
* PG_error is set to indicate that an I/O error occurred on this page.
*
* PG_arch_1 is an architecture specific page state bit. The generic code
* guarantees that this bit is cleared for a page when it first is entered into
* the page cache.
*
* PG_highmem pages are not permanently mapped into the kernel virtual address
* space, they need to be kmapped separately for doing IO on the pages. The
* struct page (these bits with information) are always mapped into kernel
* address space...
*
* PG_buddy is set to indicate that the page is free and in the buddy system
* (see mm/page_alloc.c).
*
* PG_hwpoison indicates that a page got corrupted in hardware and contains
* data with incorrect ECC bits that triggered a machine check. Accessing is
* not safe since it may cause another machine check. Don't touch!
*/
/*
* Don't use the *_dontuse flags. Use the macros. Otherwise you'll break
* locked- and dirty-page accounting.
*
* The page flags field is split into two parts, the main flags area
* which extends from the low bits upwards, and the fields area which
* extends from the high bits downwards.
*
* | FIELD | ... | FLAGS |
* N-1 ^ 0
* (NR_PAGEFLAGS)
*
* The fields area is reserved for fields mapping zone, node (for NUMA) and
* SPARSEMEM section (for variants of SPARSEMEM that require section ids like
* SPARSEMEM_EXTREME with !SPARSEMEM_VMEMMAP).
*/
enum pageflags {
PG_locked, /* Page is locked. Don't touch. */
PG_error,
PG_referenced,
PG_uptodate,
PG_dirty,
PG_lru,
PG_active,
PG_slab,
PG_owner_priv_1, /* Owner use. If pagecache, fs may use*/
PG_arch_1,
PG_reserved,
PG_private, /* If pagecache, has fs-private data */
PG_private_2, /* If pagecache, has fs aux data */
PG_writeback, /* Page is under writeback */
#ifdef CONFIG_PAGEFLAGS_EXTENDED
PG_head, /* A head page */
PG_tail, /* A tail page */
#else
PG_compound, /* A compound page */
#endif
PG_swapcache, /* Swap page: swp_entry_t in private */
PG_mappedtodisk, /* Has blocks allocated on-disk */
PG_reclaim, /* To be reclaimed asap */
PG_buddy, /* Page is free, on buddy lists */
PG_swapbacked, /* Page is backed by RAM/swap */
PG_unevictable, /* Page is "unevictable" */
#ifdef CONFIG_MMU
PG_mlocked, /* Page is vma mlocked */
#endif
#ifdef CONFIG_ARCH_USES_PG_UNCACHED
PG_uncached, /* Page has been mapped as uncached */
#endif
#ifdef CONFIG_MEMORY_FAILURE
PG_hwpoison, /* hardware poisoned page. Don't touch */
#endif
/* Number of real flag bits; everything below is an alias of a bit above. */
__NR_PAGEFLAGS,
/* Filesystems */
PG_checked = PG_owner_priv_1,
/* Two page bits are conscripted by FS-Cache to maintain local caching
* state. These bits are set on pages belonging to the netfs's inodes
* when those inodes are being locally cached.
*/
PG_fscache = PG_private_2, /* page backed by cache */
/* XEN */
PG_pinned = PG_owner_priv_1,
PG_savepinned = PG_dirty,
/* SLOB */
PG_slob_free = PG_private,
/* SLUB */
PG_slub_frozen = PG_active,
};
1.2 _count --- 页的引用计数,也就是这一页被引用了多少次。
当_count = -1时,表示当前内核没有引用这一页,于是新的分配中就可以使用它。
内核代码不应该直接检查 _count 值, 而应该调用 page_count()函数进行检查,该函数唯一的参数就是page.
/* Return the page's usage count; for compound tail pages, the head's count. */
static inline int page_count(struct page *page)
{
return atomic_read(&compound_head(page)->_count);
}
/*
 * Resolve a page to the head of its compound page: a tail page's
 * first_page field points back at the head; any other page is its
 * own head.
 */
static inline struct page *compound_head(struct page *page)
{
	return unlikely(PageTail(page)) ? page->first_page : page;
}
page_count(struct page *page) 通常用来检查该页是否空闲,page_count()返回0,表示页空闲; 返回一个正整数,表示页在使用。
1.3 页的用法
一个页可以由页缓存使用,这时mapping指向和这个页关联的 address_space 对象。
也可以用作私有数据, 由 private 指向。
也可以由slab分配器使用,这时 kmem_cache *slab 指向该页所属的slab;当页被用作进程页表时,ptl(分裂页表锁)字段生效。
所以page结构里,使用了一个union:
/* Excerpt: the second anonymous union inside struct page (quoted above). */
union {
struct {
unsigned long private; /* Mapping-private opaque data:
* usually used for buffer_heads
* if PagePrivate set; used for
* swp_entry_t if PageSwapCache;
* indicates order in the buddy
* system if PG_buddy is set.
*/
struct address_space *mapping; /* If low bit clear, points to
* inode address_space, or NULL.
* If page mapped as anonymous
* memory, low bit is set, and
* it points to anon_vma object:
* see PAGE_MAPPING_ANON below.
*/
};
#if USE_SPLIT_PTLOCKS
spinlock_t ptl;
#endif
struct kmem_cache *slab; /* SLUB: Pointer to slab */
struct page *first_page; /* Compound tail pages */
};
1.4 void *virtual --- 页的虚拟地址。它就是页在虚拟内存中的地址。
有些内存(如高端内存)并不永久的映射到内核地址空间上,这种情况下, *virtual = NULL, 需要的时候,必须动态的映射这些页。
系统中的每个物理页都需要分配一个page结构,我们来计算一下所有这些page需要消耗多少内存。假设 struct page 占40个字节,系统物理页大小为8KB,系统有4GB内存,那么共有页面4x1024x1024/8 = 524288个,
而所有这些page消耗的内存 40x524288 = 20MB, 这个值相对于4GB内存而言,仅是很小的一部分,所以代价并不算太高。
2. 区 --- ZONE
内核把页划分为不同的区(zone)。
2.1 Linux主要使用了4种区:
ZONE_DMA ---这个区包含的页可以用来执行DMA操作。
ZONE_DMA32 --- 与ZONE_DMA类似 ,该区包含的页面可以用来执行DMA操作,与ZONE_DMA不同之处在于,这些页面只能被32位设备访问。
ZONE_NORMAL --- 这个区包含的都是能正常映射的页。
ZONE_HIGHMEM -- 高端内存,这里面的页并不能永久的映射到内核地址空间。
这些区类型 zone_type 在 <linux/mmzone.h> 中定义:
enum zone_type {
#ifdef CONFIG_ZONE_DMA
/*
* ZONE_DMA is used when there are devices that are not able
* to do DMA to all of addressable memory (ZONE_NORMAL). Then we
* carve out the portion of memory that is needed for these devices.
* The range is arch specific.
*
* Some examples
*
* Architecture Limit
* ---------------------------
* parisc, ia64, sparc <4G
* s390 <2G
* arm Various
* alpha Unlimited or 0-16MB.
*
* i386, x86_64 and multiple other arches
* <16M.
*/
ZONE_DMA,
#endif
#ifdef CONFIG_ZONE_DMA32
/*
* x86_64 needs two ZONE_DMAs because it supports devices that are
* only able to do DMA to the lower 16M but also 32 bit devices that
* can only do DMA areas below 4G.
*/
ZONE_DMA32,
#endif
/*
* Normal addressable memory is in ZONE_NORMAL. DMA operations can be
* performed on pages in ZONE_NORMAL if the DMA devices support
* transfers to all addressable memory.
*/
ZONE_NORMAL,
#ifdef CONFIG_HIGHMEM
/*
* A memory area that is only addressable by the kernel through
* mapping portions into its own address space. This is for example
* used by i386 to allow the kernel to address the memory beyond
* 900MB. The kernel will set up special mappings (page
* table entries on i386) for each page that the kernel needs to
* access.
*/
ZONE_HIGHMEM,
#endif
/* Pages that can be migrated; helps anti-fragmentation and hot-unplug. */
ZONE_MOVABLE,
/* Not a real zone: the number of zone types above. */
__MAX_NR_ZONES
};
2.2 区的实际使用情况与体系结构相关。
32位的 X86体系结构上: ZONE_DMA 包含的页都在 0 - 16MB; ZONE_NORMAL 包含的页在 16 -896 MB; ZONE_HIGHMEM 为高于896 MB的所有物理内存。
64位的 X86体系结构上: 由于可以处理和映射64位的内存空间,所以没有 ZONE_HIGHMEM区, 所有物理内存都处于 ZONE_DMA 和ZONE_NORMAL区。
有些体系结构在内存的任何地址执行DMA都没问题;所有的内存都可以被直接映射,从而不需要ZONE_HIGHMEM。
2.3 每个区都用 struct zone 表示, 定义在 <linux/mmzone.h>中:
struct zone {
/* Fields commonly accessed by the page allocator */
/* zone watermarks, access with *_wmark_pages(zone) macros */
unsigned long watermark[NR_WMARK];
/*
* When free pages are below this point, additional steps are taken
* when reading the number of free pages to avoid per-cpu counter
* drift allowing watermarks to be breached
*/
unsigned long percpu_drift_mark;
/*
* We don't know if the memory that we're going to allocate will be freeable
* or/and it will be released eventually, so to avoid totally wasting several
* GB of ram we must reserve some of the lower zone memory (otherwise we risk
* to run OOM on the lower zones despite there's tons of freeable ram
* on the higher zones). This array is recalculated at runtime if the
* sysctl_lowmem_reserve_ratio sysctl changes.
*/
unsigned long lowmem_reserve[MAX_NR_ZONES];
#ifdef CONFIG_NUMA
int node;
/*
* zone reclaim becomes active if more unmapped pages exist.
*/
unsigned long min_unmapped_pages;
unsigned long min_slab_pages;
#endif
/* Per-CPU page caches for this zone (order-0 fast path). */
struct per_cpu_pageset __percpu *pageset;
/*
* free areas of different sizes
*/
/* lock protects the buddy free_area[] lists below. */
spinlock_t lock;
int all_unreclaimable; /* All pages pinned */
#ifdef CONFIG_MEMORY_HOTPLUG
/* see spanned/present_pages for more description */
seqlock_t span_seqlock;
#endif
struct free_area free_area[MAX_ORDER];
#ifndef CONFIG_SPARSEMEM
/*
* Flags for a pageblock_nr_pages block. See pageblock-flags.h.
* In SPARSEMEM, this map is stored in struct mem_section
*/
unsigned long *pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
#ifdef CONFIG_COMPACTION
/*
* On compaction failure, 1<<compact_defer_shift compactions
* are skipped before trying again. The number attempted since
* last failure is tracked with compact_considered.
*/
unsigned int compact_considered;
unsigned int compact_defer_shift;
#endif
/* Padding so allocator and reclaim fields land on separate cachelines. */
ZONE_PADDING(_pad1_)
/* Fields commonly accessed by the page reclaim scanner */
spinlock_t lru_lock;
struct zone_lru {
struct list_head list;
} lru[NR_LRU_LISTS];
struct zone_reclaim_stat reclaim_stat;
unsigned long pages_scanned; /* since last reclaim */
unsigned long flags; /* zone flags, see below */
/* Zone statistics */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
/*
* The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
* this zone's LRU. Maintained by the pageout code.
*/
unsigned int inactive_ratio;
ZONE_PADDING(_pad2_)
/* Rarely used or read-mostly fields */
/*
* wait_table -- the array holding the hash table
* wait_table_hash_nr_entries -- the size of the hash table array
* wait_table_bits -- wait_table_size == (1 << wait_table_bits)
*
* The purpose of all these is to keep track of the people
* waiting for a page to become available and make them
* runnable again when possible. The trouble is that this
* consumes a lot of space, especially when so few things
* wait on pages at a given time. So instead of using
* per-page waitqueues, we use a waitqueue hash table.
*
* The bucket discipline is to sleep on the same queue when
* colliding and wake all in that wait queue when removing.
* When something wakes, it must check to be sure its page is
* truly available, a la thundering herd. The cost of a
* collision is great, but given the expected load of the
* table, they should be so rare as to be outweighed by the
* benefits from the saved space.
*
* __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
* primary users of these fields, and in mm/page_alloc.c
* free_area_init_core() performs the initialization of them.
*/
wait_queue_head_t * wait_table;
unsigned long wait_table_hash_nr_entries;
unsigned long wait_table_bits;
/*
* Discontig memory support fields.
*/
struct pglist_data *zone_pgdat;
/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
unsigned long zone_start_pfn;
/*
* zone_start_pfn, spanned_pages and present_pages are all
* protected by span_seqlock. It is a seqlock because it has
* to be read outside of zone->lock, and it is done in the main
* allocator path. But, it is written quite infrequently.
*
* The lock is declared along with zone->lock because it is
* frequently read in proximity to zone->lock. It's good to
* give them a chance of being in the same cacheline.
*/
unsigned long spanned_pages; /* total size, including holes */
unsigned long present_pages; /* amount of memory (excluding holes) */
/*
* rarely used fields:
*/
const char *name;
} ____cacheline_internodealigned_in_smp;
这个结构很大,但一个系统中区的数量很少(例如32位x86上通常是DMA、Normal、HighMem三个区),因此也只有少数几个这样的结构。
lock --- 自旋锁, 用来防止该结构被并发访问。注意,这个锁只保护该区结构,不保护驻留在这个区中的所有页。
watermark[NR_WMARK] --- 该数组持有该区的最小值,最低和最高水位值。内核使用水位 来为 每个内存区设置 合适的内存消耗基准。该水位随空闲内存的多少而变化。
name --- 一个以 NUL 结束的字符串,表示这个区的名字。三个区的名字分别为 "DMA"、"Normal" 和 "HighMem"。
3. 获得页
内核提供了请求内存的底层机制,并提供了几个对内存进行访问的接口,所有这些接口都以页为单位。
定义于 <linux/gfp.h> 中,最核心的函数是:
3.1 struct page *alloc_pages(gfp_t gfp_mask, unsigned int order)
static inline struct page *
alloc_pages(gfp_t gfp_mask, unsigned int order)
{
/* Delegates to alloc_pages_current(), which applies the task's NUMA policy. */
return alloc_pages_current(gfp_mask, order);
}
/**
* alloc_pages_current - Allocate pages.
*
* @gfp:
* %GFP_USER user allocation,
* %GFP_KERNEL kernel allocation,
* %GFP_HIGHMEM highmem allocation,
* %GFP_FS don't call back into a file system.
* %GFP_ATOMIC don't sleep.
* @order: Power of two of allocation size in pages. 0 is a single page.
*
* Allocate a page from the kernel page pool. When not in
* interrupt context and apply the current process NUMA policy.
* Returns NULL when no page can be allocated.
*
* Don't call cpuset_update_task_memory_state() unless
* 1) it's ok to take cpuset_sem (can WAIT), and
* 2) allocating for current task (not interrupt).
*/
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
struct mempolicy *pol = current->mempolicy;
struct page *page;
/* Fall back to the default policy in interrupt context, when the task
* has no policy, or when the caller pinned a node with __GFP_THISNODE. */
if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
pol = &default_policy;
get_mems_allowed();
/*
* No reference counting needed for current->mempolicy
* nor system default_policy
*/
/* Interleave policy round-robins allocations across the allowed nodes. */
if (pol->mode == MPOL_INTERLEAVE)
page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
else
page = __alloc_pages_nodemask(gfp, order,
policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
put_mems_allowed();
return page;
}
EXPORT_SYMBOL(alloc_pages_current);
order --- 表示分配 2的order次幂(1<<order)个连续的物理页。
order 也可以通过get_order()函数获得,
/* Pure 2^n version of get_order */
/*
 * Compute the allocation order (log2 of the number of pages) needed
 * to hold @size bytes; e.g. with 4KB pages, get_order(16384) == 2.
 * Behavior for size == 0 matches the original shift-loop version.
 */
static inline __attribute_const__ int get_order(unsigned long size)
{
	int order = 0;

	/* Round up to pages, keeping one extra low bit of precision... */
	size = (size - 1) >> (PAGE_SHIFT - 1);
	/* ...then consume that bit before counting the remaining doublings. */
	size >>= 1;
	while (size) {
		size >>= 1;
		order++;
	}
	return order;
}
get_order() 函数可以帮助计算合适的 order 值。
例如: 若需要 16KB 物理内存, get_order(16384) 会根据系统的页大小返回合适的 order 值(4KB页时返回2)。
gfp_t --- 在分析kmalloc时研究
该函数返回一个指针,该指针指向第一个页的page结构体;如果出错,则返回NULL。
3.2 把给定的页转换成它的虚拟地址:
void *page_address(struct page *page);
/* WANT_PAGE_VIRTUAL variant: the kernel virtual address is cached in the page. */
#define page_address(page) ((page)->virtual)
该函数返回一个指针,指向给定物理地址页当前所在的虚拟地址。
3.3 如果无须用到struct page, 可以调用:
unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); --- 它就是封装了alloc_pages() + page_address().
/*
* Common helper functions.
*/
/*
 * Allocate 2^order contiguous pages and return the kernel virtual
 * address of the first one, or 0 on failure. Highmem is forbidden
 * because the returned address could not represent such a page.
 */
unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
{
	struct page *first;

	VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);

	first = alloc_pages(gfp_mask, order);
	return first ? (unsigned long) page_address(first) : 0;
}
EXPORT_SYMBOL(__get_free_pages);
它直接返回了所请求的第一个页的虚拟地址。
3.4 如果只需要分配一页,可以使用下面两个函数:
struct page *alloc_page(gfp_t gfp_mask);
unsigned long __get_free_page(gfp_t gfp_mask);
即默认order = 0;
/* Single-page (order == 0) convenience wrappers. */
#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
#define __get_free_page(gfp_mask) \
__get_free_pages((gfp_mask), 0)
3.5 获得初始值置为0的内存页:
unsigned long get_zeroed_page(gfp_t gfp_mask);
unsigned long get_zeroed_page(gfp_t gfp_mask)
{
return __get_free_pages(gfp_mask | __GFP_ZERO, 0); /* __GFP_ZERO: return the page zero-filled */
}
EXPORT_SYMBOL(get_zeroed_page);
4. 释放页
当不需要页时,可以使用下面的函数释放他们:
4.1 void __free_pages(struct page *page, unsigned int order); --- 对应 __get_free_pages()
void __free_pages(struct page *page, unsigned int order)
{
/* Only free the pages once the last reference is dropped. */
if (put_page_testzero(page)) {
if (order == 0)
/* Order-0 pages go back to the per-CPU hot/cold lists. */
free_hot_cold_page(page, 0);
else
/* Higher orders go straight back to the buddy allocator. */
__free_pages_ok(page, order);
}
}
EXPORT_SYMBOL(__free_pages);
4.2 void free_pages(unsigned long addr, unsigned int order); --- 对应 alloc_pages()
/*
 * Free 2^order pages starting at kernel virtual address @addr.
 * Passing addr == 0 (a failed allocation) is silently ignored.
 */
void free_pages(unsigned long addr, unsigned int order)
{
	if (addr == 0)
		return;

	VM_BUG_ON(!virt_addr_valid((void *)addr));
	__free_pages(virt_to_page((void *)addr), order);
}
EXPORT_SYMBOL(free_pages);
4.3 void free_page(unsigned long addr); --- 对应 alloc_page().
#define free_page(addr) free_pages((addr), 0)
释放页时要谨慎,只能释放属于你的页。
传递了错误的struct page 或地址,用了错误的order值,都会导致系统崩溃。
5. 例子:
unsigned long page;
page = __get_free_pages(GFP_KERNEL, 3); // allocate 2^3 = 8 contiguous pages
if (!page){
// allocation failed, e.g. not enough free memory
return -ENOMEM;
}
...
free_pages(page, 3); // release the 8 pages when done
调用__get_free_pages()之后,一定要检查错误情况。内核分配内存可能失败,因此代码必须进行检查并作相应处理。
当需要以页为单位,分配物理页时,这些函数非常有用,但是常用以字节为单位来分配内存,这时就需要用kmalloc()函数了,这个分析kmalloc时介绍。