Noncontiguous Page Allocation
For the kernel, the ideal case is of course to allocate physically contiguous memory: allocation is simple and access is efficient. But that is only the ideal. After the system has been running for a while, memory becomes fragmented and the kernel can no longer find large contiguous blocks. In user space this is hardly a problem, because user-space processes are built on the processor's paging mechanism, at the cost of some efficiency and extra TLB pressure. The kernel uses the same technique: it sets aside part of its virtual address space for mappings to noncontiguous pages, namely the vmalloc area and the kernel mapping mechanisms mentioned earlier.
vmalloc
On IA-32 systems, immediately after the direct mapping of the first 896 MiB and an 8 MiB safety gap, there is a region for managing noncontiguous memory. By modifying the kernel page tables, contiguous virtual addresses in this region can be mapped onto noncontiguous physical pages.
[Figure: the vmalloc area]
The call graph of the vmalloc function is shown below:
[Figure: vmalloc call graph]
When vmalloc allocates memory, it first looks for a suitably sized contiguous range in the vmalloc virtual address space. If no such range is available, it tries a single purge of lazily freed mappings; if the search still fails, the allocation fails. The start of the reserved range is stored in the addr field of a vm_struct, the number of pages to allocate is computed, that many order-0 page frames are obtained from the buddy system, and the frames are attached to the pages array of the vm_struct. The final result looks like this:
[Figure: final vmalloc memory layout]
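Before digging into the implementation, a minimal usage sketch (hypothetical module code; example_buf, example_alloc and example_release are made-up names) shows the interface:

#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/string.h>

static void *example_buf;	/* hypothetical example buffer */

static int example_alloc(void)
{
	example_buf = vmalloc(16 * PAGE_SIZE);	/* 16 virtually contiguous pages */
	if (!example_buf)
		return -ENOMEM;
	/* The buffer is contiguous in virtual addresses only; the backing
	 * frames (vmalloc_to_page(example_buf), ...) may be scattered. */
	memset(example_buf, 0, 16 * PAGE_SIZE);
	return 0;
}

static void example_release(void)
{
	vfree(example_buf);	/* unmap the range, return the frames to the buddy system */
}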
The vmalloc code is as follows:
struct vm_struct {
	struct vm_struct *next;
	void *addr;		// start of the virtual range: VMALLOC_START + offset
	unsigned long size;	// size in bytes
	unsigned long flags;
	struct page **pages;	// the (physically noncontiguous) pages backing the range
	unsigned int nr_pages;	// number of page frames
	phys_addr_t phys_addr;	// used by ioremap
	const void *caller;
};
struct vmap_area {		/* a contiguous virtual range; adjacent ranges are separated by a one-page guard */
	unsigned long va_start;	/* start address of the range */
	unsigned long va_end;	/* end address of the range */
	unsigned long flags;
	struct rb_node rb_node;	/* address sorted rbtree */
	struct list_head list;	/* address sorted list */
	struct llist_node purge_list;	/* "lazy purge" list */
	struct vm_struct *vm;
	struct rcu_head rcu_head;
};
/*
 * @size: allocation size in bytes
 * gfp: "get free page" allocation flags; NUMA_NO_NODE (-1) means no preferred node
 * gfp_mask:
 * %GFP_KERNEL
 *   Allocate normal kernel RAM. May sleep.
 * %GFP_NOWAIT
 *   Allocation will not sleep.
 * %GFP_ATOMIC
 *   Allocation will not sleep. May use emergency pools.
 * %GFP_HIGHUSER
 *   Allocate memory from high memory on behalf of user.
 */
void *vmalloc(unsigned long size)
{
	return __vmalloc_node_flags(size, NUMA_NO_NODE, GFP_KERNEL);
}
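vmalloc itself is only a thin wrapper. The call chain in this kernel generation looks roughly as follows (a sketch; the intermediate helpers vary between releases) and ends in the two functions examined next:

/*
 * vmalloc(size)
 *   -> __vmalloc_node_flags(size, NUMA_NO_NODE, GFP_KERNEL)
 *     -> __vmalloc_node(..., PAGE_KERNEL, node, caller)
 *       -> __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, ...)
 *         -> __get_vm_area_node()   reserve the virtual range
 *         -> __vmalloc_area_node()  allocate the pages and map them
 */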
static struct vm_struct *__get_vm_area_node(unsigned long size,
		unsigned long align, unsigned long flags, unsigned long start,
		unsigned long end, int node, gfp_t gfp_mask, const void *caller)
{
	struct vmap_area *va;
	struct vm_struct *area;

	size = PAGE_ALIGN(size);	/* round size up to a whole number of pages */
	if (flags & VM_IOREMAP)
		align = 1ul << clamp_t(int, get_count_order_long(size),
				       PAGE_SHIFT, IOREMAP_MAX_ORDER);

	area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); /* small allocation; see the slab allocator */
	if (unlikely(!area))
		return NULL;

	if (!(flags & VM_NO_GUARD))	/* append a guard page */
		size += PAGE_SIZE;

	va = alloc_vmap_area(size, align, start, end, node, gfp_mask); /* reserve a suitably sized contiguous range in [start, end) */
	if (IS_ERR(va)) {
		kfree(area);
		return NULL;
	}

	setup_vmalloc_vm(area, va, flags, caller);
	return area;
}
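A quick worked example of the size arithmetic above (assuming 4 KiB pages):

/* vmalloc(6000):
 *   size = PAGE_ALIGN(6000)  -> 8192   (two pages)
 *   size += PAGE_SIZE        -> 12288  (guard page appended)
 * The vmap_area spans three virtual pages, but get_vm_area_size()
 * subtracts the guard page again, so only two page frames are
 * allocated and mapped. The unmapped guard page turns an overrun
 * into a page fault instead of silent corruption of the next area. */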
/*
 * Allocate a region of KVA of the specified size and alignment, within the
 * vstart and vend.
 * @size: size of the range in bytes
 * @align: required alignment
 * @vstart, @vend: allocate the contiguous virtual range within [vstart, vend)
 */
static struct vmap_area *alloc_vmap_area(unsigned long size,
				unsigned long align,
				unsigned long vstart, unsigned long vend,
				int node, gfp_t gfp_mask)
{
	struct vmap_area *va;
	struct rb_node *n;
	unsigned long addr;
	int purged = 0;
	struct vmap_area *first;

	va = kmalloc_node(sizeof(struct vmap_area),
			gfp_mask & GFP_RECLAIM_MASK, node); /* small allocation; see the slab allocator */
	if (unlikely(!va))
		return ERR_PTR(-ENOMEM);

retry:
	spin_lock(&vmap_area_lock);
	/* in some corner cases the cached lookup hint cannot be used */
	if (!free_vmap_cache ||
			size < cached_hole_size ||
			vstart < cached_vstart ||
			align < cached_align) {
nocache:
		cached_hole_size = 0;
		free_vmap_cache = NULL;
	}
	/* record if we encounter less permissive parameters */
	cached_vstart = vstart;
	cached_align = align;

	/* find starting point for our search */
	if (free_vmap_cache) {
		first = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
		addr = ALIGN(first->va_end, align);
		if (addr < vstart)	/* the cached hint lies below the requested range */
			goto nocache;
		if (addr + size < addr)	/* the virtual address calculation wrapped around */
			goto overflow;
	} else {
		addr = ALIGN(vstart, align);
		if (addr + size < addr)
			goto overflow;

		n = vmap_area_root.rb_node;
		first = NULL;
		while (n) {	/* find the first vmap_area that ends at or after addr */
			struct vmap_area *tmp;
			tmp = rb_entry(n, struct vmap_area, rb_node);
			if (tmp->va_end >= addr) {
				first = tmp;
				if (tmp->va_start <= addr)
					break;
				n = n->rb_left;
			} else
				n = n->rb_right;
		}
		if (!first)	/* nothing is allocated at or above addr */
			goto found;
	}

	/* walk the address-sorted vmap_area list until a large enough hole is found */
	while (addr + size > first->va_start && addr + size <= vend) {
		if (addr + cached_hole_size < first->va_start)
			cached_hole_size = first->va_start - addr;
		addr = ALIGN(first->va_end, align);
		if (addr + size < addr)
			goto overflow;
		if (list_is_last(&first->list, &vmap_area_list))
			goto found;
		first = list_next_entry(first, list);
	}

found:
	/*
	 * Check also calculated address against the vstart,
	 * because it can be 0 because of big align request.
	 */
	if (addr + size > vend || addr < vstart)
		goto overflow;

	va->va_start = addr;
	va->va_end = addr + size;
	va->flags = 0;
	__insert_vmap_area(va);
	free_vmap_cache = &va->rb_node;
	spin_unlock(&vmap_area_lock);
	return va;

overflow:
	spin_unlock(&vmap_area_lock);
	if (!purged) {	/* purge lazily freed areas once, then retry */
		purge_vmap_area_lazy();
		purged = 1;
		goto retry;
	}
	/* as a last resort, ask registered notifiers to free some space */
	if (gfpflags_allow_blocking(gfp_mask)) {
		unsigned long freed = 0;
		blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);
		if (freed > 0) {
			purged = 0;
			goto retry;
		}
	}
	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
		pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
			size);
	kfree(va);
	return ERR_PTR(-EBUSY);
}
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
				 pgprot_t prot, int node)
{
	struct page **pages;
	unsigned int nr_pages, array_size, i;
	const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
	const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
	const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ?
					0 :
					__GFP_HIGHMEM;

	nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
	array_size = (nr_pages * sizeof(struct page *));

	area->nr_pages = nr_pages;
	/* Please note that the recursion is strictly bounded. */
	if (array_size > PAGE_SIZE) {	/* if the pages[] array itself needs more than one page, allocate it via vmalloc */
		pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask,
				PAGE_KERNEL, node, area->caller);
	} else {	/* otherwise allocate it from the slab allocator */
		pages = kmalloc_node(array_size, nested_gfp, node);
	}
	area->pages = pages;
	if (!area->pages) {
		remove_vm_area(area->addr);
		kfree(area);
		return NULL;
	}

	for (i = 0; i < area->nr_pages; i++) {
		struct page *page;

		if (node == NUMA_NO_NODE)
			page = alloc_page(alloc_mask|highmem_mask); /* buddy allocation; with no node given, one is chosen by the memory policy */
		else
			page = alloc_pages_node(node, alloc_mask|highmem_mask, 0); /* buddy allocation of a single order-0 page */

		if (unlikely(!page)) {
			/* Successfully allocated i pages, free them in __vunmap() */
			area->nr_pages = i;
			goto fail;
		}
		area->pages[i] = page;
		if (gfpflags_allow_blocking(gfp_mask|highmem_mask))
			cond_resched();
	}

	if (map_vm_area(area, prot, pages))	/* write the page table entries for the reserved virtual range */
		goto fail;
	return area->addr;

fail:
	warn_alloc(gfp_mask, NULL,
		   "vmalloc: allocation failure, allocated %ld of %ld bytes",
		   (area->nr_pages*PAGE_SIZE), area->size);
	vfree(area->addr);
	return NULL;
}
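The "strictly bounded" recursion noted in the comment is easy to quantify. With 4 KiB pages and 8-byte pointers:

/* array_size = nr_pages * sizeof(struct page *)
 * array_size > PAGE_SIZE  once  nr_pages > 4096 / 8 = 512,
 * i.e. for requests above 2 MiB the pages[] array itself is allocated
 * through vmalloc. The nested request is hundreds of times smaller
 * than the original one, so in practice the recursion ends after a
 * single level. */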
vfree
vmalloc allocates the memory, so vfree is what releases it. Its basic logic is shown below:
[Figure: vfree flow]
- look up the vmap_area for the virtual range in the red-black tree;
- release the virtual address range;
- return the physical pages to the buddy system.
The lookup of the virtual address range is implemented as follows:
static struct vmap_area *__find_vmap_area(unsigned long addr)
{
	struct rb_node *n = vmap_area_root.rb_node;

	while (n) {	/* binary search in the address-sorted rbtree */
		struct vmap_area *va;

		va = rb_entry(n, struct vmap_area, rb_node);
		if (addr < va->va_start)
			n = n->rb_left;
		else if (addr >= va->va_end)
			n = n->rb_right;
		else
			return va;	/* addr lies within [va_start, va_end) */
	}
	return NULL;
}
Kernel Mappings
vmalloc provides mappings from virtual addresses to physical pages in the highmem zone, but the mapping is anonymous and implicit: there is no way to bind a virtual address to a particular physical page frame, and it is geared toward moderately frequent allocation and release. The kernel therefore offers further mapping mechanisms; as introduced in Memory Management (Introduction), persistent mappings and fixed mappings cover these other scenarios.
Persistent Mappings
A persistent mapping binds a virtual address to a specified page; it is created with kmap and released with kunmap. As with vmalloc, persistent mappings need their own region of the virtual address space: it lies after the vmalloc area and stretches from PKMAP_BASE to FIXADDR_START.
In a persistent mapping, each physical page maps one-to-one to a virtual address. The data structure is:
struct page_address_map {
	struct page *page;	/* the physical page in some memory node */
	void *virtual;		/* the corresponding virtual start address */
	struct list_head list;
};
For easy lookup the mappings are kept in the hash table page_address_htable; the list member of struct page_address_map chains together entries that collide in the same hash bucket. The in-memory layout is shown in the figure below. Note that mem_map has been renamed node_mem_map in newer kernel code, and pkmap_count holds the reference count of each virtual page.
[Figure: direct mapping]
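The hash table and its lookup helper are short (a sketch following mm/highmem.c; PA_HASH_ORDER sizes the table):

static struct page_address_slot {
	struct list_head lh;	/* page_address_map entries in this bucket */
	spinlock_t lock;	/* protects this bucket */
} page_address_htable[1 << PA_HASH_ORDER];

static struct page_address_slot *page_slot(const struct page *page)
{
	/* hash the struct page pointer to select a bucket */
	return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)];
}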
kmap
kmap establishes the mapping from a physical page to a virtual start address; the code is as follows:
void *kmap(struct page *page)
{
	BUG_ON(in_interrupt());
	if (!PageHighMem(page))
		return page_address(page);
	return kmap_high(page);
}
void *page_address(const struct page *page) /* look up the virtual start address of a physical page */
{
	unsigned long flags;
	void *ret;
	struct page_address_slot *pas;

	if (!PageHighMem(page))	/* page->flags encodes the zone; non-highmem pages are directly mapped */
		return lowmem_page_address(page);

	pas = page_slot(page);	/* look up the page's bucket in page_address_htable */
	ret = NULL;
	spin_lock_irqsave(&pas->lock, flags);
	if (!list_empty(&pas->lh)) {
		struct page_address_map *pam;

		list_for_each_entry(pam, &pas->lh, list) { /* walk the bucket to resolve hash collisions */
			if (pam->page == page) {
				ret = pam->virtual;
				goto done;
			}
		}
	}
done:
	spin_unlock_irqrestore(&pas->lock, flags);
	return ret;
}
void *kmap_high(struct page *page)
{
	unsigned long vaddr;

	/*
	 * For highmem pages, we can't trust "virtual" until
	 * after we have the lock.
	 */
	lock_kmap();
	vaddr = (unsigned long)page_address(page); /* check whether the page is already mapped */
	if (!vaddr)
		vaddr = map_new_virtual(page);	/* grab an unused virtual page and record the mapping in the hash table */
	pkmap_count[PKMAP_NR(vaddr)]++;		/* take a reference on the mapping */
	BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 2);
	unlock_kmap();
	return (void*) vaddr;
}
static inline unsigned long map_new_virtual(struct page *page)
{
	unsigned long vaddr;
	int count;
	unsigned int last_pkmap_nr;
	unsigned int color = get_pkmap_color(page);

start:
	count = get_pkmap_entries_count(color);	/* how many slots to scan before sleeping */
	/* Find an empty entry */
	for (;;) {
		last_pkmap_nr = get_next_pkmap_nr(color);
		if (no_more_pkmaps(last_pkmap_nr, color)) {
			flush_all_zero_pkmaps();	/* this is where stale mappings are actually torn down; see kunmap below */
			count = get_pkmap_entries_count(color);
		}
		if (!pkmap_count[last_pkmap_nr])
			break;	/* Found a usable entry */
		if (--count)
			continue;

		/*
		 * Sleep for somebody else to unmap their entries
		 */
		{
			DECLARE_WAITQUEUE(wait, current);
			wait_queue_head_t *pkmap_map_wait =
				get_pkmap_wait_queue_head(color);

			__set_current_state(TASK_UNINTERRUPTIBLE);
			add_wait_queue(pkmap_map_wait, &wait);
			unlock_kmap();
			schedule();
			remove_wait_queue(pkmap_map_wait, &wait);
			lock_kmap();

			/* Somebody else might have mapped it while we slept */
			if (page_address(page))
				return (unsigned long)page_address(page);

			/* Re-start */
			goto start;
		}
	}
	vaddr = PKMAP_ADDR(last_pkmap_nr);
	set_pte_at(&init_mm, vaddr,
		   &(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot));

	pkmap_count[last_pkmap_nr] = 1;	/* set to 1 here; the caller increments it again, so only counts of 2 or more denote a live mapping */
	set_page_address(page, (void *)vaddr);

	return vaddr;
}
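The conversion macros used above are simple slot/address arithmetic (as defined for x86; LAST_PKMAP is 1024, or 512 with PAE, so the persistent mapping window covers at most 4 MiB of highmem at a time):

#define PKMAP_NR(virt)	((virt - PKMAP_BASE) >> PAGE_SHIFT)	/* virtual address -> pkmap slot */
#define PKMAP_ADDR(nr)	(PKMAP_BASE + ((nr) << PAGE_SHIFT))	/* pkmap slot -> virtual address */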
kunmap
kunmap releases a mapping. It is an architecture-specific function, but the implementations are broadly similar.
void kunmap(struct page *page)
{
	if (in_interrupt())
		BUG();
	if (!PageHighMem(page))
		return;
	kunmap_high(page);
}
void kunmap_high(struct page *page)
{
	unsigned long vaddr;
	unsigned long nr;
	unsigned long flags;
	int need_wakeup;
	unsigned int color = get_pkmap_color(page);
	wait_queue_head_t *pkmap_map_wait;

	lock_kmap_any(flags);
	vaddr = (unsigned long)page_address(page);
	BUG_ON(!vaddr);
	nr = PKMAP_NR(vaddr);	/* the pkmap slot backing this virtual address */

	need_wakeup = 0;
	switch (--pkmap_count[nr]) {
	case 0:
		BUG();
	case 1:
		/* count 1 means allocated but no longer referenced; wake up any waiters */
		pkmap_map_wait = get_pkmap_wait_queue_head(color);
		need_wakeup = waitqueue_active(pkmap_map_wait);
	}
	unlock_kmap_any(flags);

	if (need_wakeup)
		wake_up(pkmap_map_wait);
}
As the code above shows, kunmap does not actually remove the page table entry: it only drops pkmap_count to 1, meaning the slot is allocated but no longer mapped by anyone. Unused entries are flushed only when a new mapping request cannot find a free slot (flush_all_zero_pkmaps, called from map_new_virtual). Under moderate memory pressure this makes mapping and unmapping very cheap, but once the pkmap window is close to full, allocation slows down considerably.
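A typical usage pattern (a hypothetical helper for illustration; kmap may sleep, so this is valid only in process context):

#include <linux/highmem.h>
#include <linux/string.h>

/* Zero a (possibly highmem) page through a temporary persistent mapping. */
static void example_zero_page(struct page *page)
{
	void *vaddr = kmap(page);	/* map the page, or reuse an existing mapping */

	memset(vaddr, 0, PAGE_SIZE);
	kunmap(page);	/* drop the reference; the PTE lingers until flush_all_zero_pkmaps() */
}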
Fixed Mappings
As the code above shows, kmap may sleep, so it cannot be used in interrupt handlers. (How interrupt handlers work and why they must remain atomic will be covered in detail later, in the discussion of interrupts and signal handling.) For these contexts the kernel provides kmap_atomic. kmap_atomic is an architecture-specific function; on IA-32 it rules out a context switch by disabling preemption and page faults, and then installs the mapping directly:
void *kmap_atomic_prot(struct page *page, pgprot_t prot)
{
	unsigned long vaddr;
	int idx, type;

	preempt_disable();
	pagefault_disable();

	if (!PageHighMem(page))
		return page_address(page);

	type = kmap_atomic_idx_push();	/* push a per-CPU kmap slot index */
	idx = type + KM_TYPE_NR*smp_processor_id();
	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
	BUG_ON(!pte_none(*(kmap_pte-idx)));
	set_pte(kmap_pte-idx, mk_pte(page, prot));
	arch_flush_lazy_mmu_mode();

	return (void *)vaddr;
}

#define __fix_to_virt(x)	(FIXADDR_TOP - ((x) << PAGE_SHIFT))
The address layout is shown below:
[Figure: fixed mappings]
The kmap slots are mapped in turn into the fixmap portion of the virtual address space.
Fixed mappings are mainly used for temporary virtual address mappings and for copying memory between the kernel and user space.
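A usage sketch (hypothetical helper): kmap_atomic works in atomic context, but mappings must be released in reverse (stack) order and the code in between must not sleep:

#include <linux/highmem.h>
#include <linux/string.h>

/* Copy one (possibly highmem) page to another via temporary fixmap slots. */
static void example_copy_page(struct page *dst, struct page *src)
{
	void *s = kmap_atomic(src);
	void *d = kmap_atomic(dst);

	memcpy(d, s, PAGE_SIZE);
	kunmap_atomic(d);	/* release in reverse order of acquisition */
	kunmap_atomic(s);
}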
In this way the kernel can use vmalloc, persistent mappings, and fixed mappings to establish noncontiguous mappings from virtual addresses to physical memory.
Further Reading
mem_policy
https://www.kernel.org/doc/Documentation/vm/numa_memory_policy.txt