非连续内存分配
非连续内存分配是指将物理地址不连续的页框映射到线性地址连续的线性地址空间,主要应用于大容量的内存分配。采用这种方式分配内存的主要优点是避免了外部碎片,而缺点是必须打乱内核页表,而且访问速度较连续分配的物理页框慢。
非连续内存分配的线性地址空间是从VMALLOC_START到VMALLOC_END(具体可以参见<<Linux高端内存映射(上)>>),共128M,每当内核要用vmalloc类的函数进行非连续内存分配,就会申请一个vm_struct结构来描述对应的vmalloc区,两个vmalloc区之间的间隔至少为一个页框的大小,即PAGE_SIZE。下图是非连续内存分配区的示意图
所有的vm_struct都会链入vmlist链表来管理,从2.6的某个内核版本开始,为了提高效率,内核又为vmalloc区添加了vmap_area结构和vm_struct共同描述,并且引入了红黑树来组织这些结构,鉴于红黑树的复杂,为了简化讨论,把重心放在非连续内存分配的机制上,涉及到vmap_area的具体代码不做详细的分析。
数据结构描述
在分析具体的代码之前,我们先了解描述vmalloc区的这两个数据结构
/* Describes one vmalloc'ed (virtually contiguous) kernel memory area. */
struct vm_struct {
struct vm_struct *next; /* next area on the address-sorted vmlist */
void *addr; /* linear (virtual) start address of the area */
unsigned long size; /* size of the area in bytes */
unsigned long flags; /* type flags of the area (VM_ALLOC, VM_IOREMAP, ...) */
struct page **pages; /* array of pointers to the mapped page descriptors */
unsigned int nr_pages; /* number of page frames backing the area */
unsigned long phys_addr; /* physical address when mapping device I/O shared memory; 0 otherwise */
void *caller; /* return address of the vmalloc-family caller */
};
/* Describes the address range of a vmalloc area, tracked in an rbtree and list. */
struct vmap_area {
unsigned long va_start; /* start address of the vmalloc area */
unsigned long va_end; /* end address of the vmalloc area */
unsigned long flags; /* type flags (e.g. VM_VM_AREA) */
struct rb_node rb_node; /* address sorted rbtree */
struct list_head list; /* address sorted list */
struct list_head purge_list; /* "lazy purge" list */
void *private; /* points to the paired vm_struct */
struct rcu_head rcu_head;
};
非连续内存区的分配
内核通过调用vmalloc()来进行非连续内存的分配,其原型如下
/**
 * vmalloc - allocate virtually contiguous memory
 * @size: allocation size
 * Allocate enough pages to cover @size from the page level
 * allocator and map them into contiguous kernel virtual space.
 *
 * For tight control over page level allocator and protection flags
 * use __vmalloc() instead.
 */
void *vmalloc(unsigned long size)
{
/* node = -1: no NUMA node preference; caller address recorded for diagnostics */
return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL,
-1, __builtin_return_address(0));
}
其实质是对__vmalloc_node()的封装
/*
 * __vmalloc_node - allocate virtually contiguous memory, preferring @node
 * @size:     requested allocation size in bytes (page-aligned internally)
 * @align:    alignment of the virtual address range
 * @gfp_mask: allocation flags for the backing page frames
 * @prot:     page protection for the mapping
 * @node:     preferred NUMA node, or -1 for no preference
 * @caller:   return address of the public entry point, for diagnostics
 *
 * Returns the start address of the mapped area, or NULL on failure.
 */
static void *__vmalloc_node(unsigned long size, unsigned long align,
gfp_t gfp_mask, pgprot_t prot,
int node, void *caller)
{
struct vm_struct *area;
void *addr;
unsigned long real_size = size;

size = PAGE_ALIGN(size);
/* reject zero-sized and absurdly large (more pages than RAM) requests */
if (!size || (size >> PAGE_SHIFT) > totalram_pages)
return NULL;

/* reserve a virtual address range and set up its vm_struct/vmap_area */
area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNLIST,
VMALLOC_START, VMALLOC_END, node,
gfp_mask, caller);
if (!area)
return NULL;

/* core of vmalloc: allocate page frames and build the mapping */
addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller);
/*
 * BUGFIX: on failure __vmalloc_area_node() has already torn down and
 * freed @area, so bail out here; touching it again (or inserting it
 * into vmlist) would be a use-after-free.
 */
if (!addr)
return NULL;

/*
 * In this function, newly allocated vm_struct is not added
 * to vmlist at __get_vm_area_node(). so, it is added here.
 */
insert_vmalloc_vmlist(area);

/*
 * A ref_count = 3 is needed because the vm_struct and vmap_area
 * structures allocated in the __get_vm_area_node() function contain
 * references to the virtual address of the vmalloc'ed block.
 */
kmemleak_alloc(addr, real_size, 3, gfp_mask);

return addr;
}
非连续内存区的分配主要通过__get_vm_area_node(),__vmalloc_area_node(),insert_vmalloc_vmlist(area)这三个函数来完成,下面对三个函数进行分析
__get_vm_area_node()
/*
 * __get_vm_area_node - reserve a range of vmalloc address space
 * @size:     requested size in bytes (a guard page is added internally)
 * @align:    alignment of the range (raised for VM_IOREMAP requests)
 * @flags:    VM_* type flags; VM_UNLIST defers the vmlist insertion
 * @start:    lowest acceptable virtual address (e.g. VMALLOC_START)
 * @end:      highest acceptable virtual address (e.g. VMALLOC_END)
 * @node:     preferred NUMA node for the descriptor allocations, or -1
 * @gfp_mask: allocation flags for the descriptors
 * @caller:   return address of the public entry point, for diagnostics
 *
 * Allocates and pairs a vm_struct and a vmap_area (the latter is inserted
 * into the address-sorted rbtree). Returns the vm_struct, or NULL.
 */
static struct vm_struct *__get_vm_area_node(unsigned long size,
unsigned long align, unsigned long flags, unsigned long start,
unsigned long end, int node, gfp_t gfp_mask, void *caller)
{
/* BUGFIX: was erroneously declared 'static' — a static local would be
 * shared by all concurrent callers; this must be an automatic variable. */
struct vmap_area *va;
struct vm_struct *area;

BUG_ON(in_interrupt()); /* may sleep: never valid in interrupt context */
if (flags & VM_IOREMAP) {
/* I/O remaps align to the size rounded to a power of two,
 * clamped between PAGE_SHIFT and IOREMAP_MAX_ORDER */
int bit = fls(size);
if (bit > IOREMAP_MAX_ORDER)
bit = IOREMAP_MAX_ORDER;
else if (bit < PAGE_SHIFT)
bit = PAGE_SHIFT;
align = 1ul << bit;
}

size = PAGE_ALIGN(size); /* round size up to a whole number of pages */
if (unlikely(!size))
return NULL;

/* allocate the vm_struct descriptor (zeroed) */
area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
if (unlikely(!area))
return NULL;

/*
 * We always allocate a guard page.
 */
size += PAGE_SIZE; /* one extra page as a safety gap between areas */

/* allocate a vmap_area for [start, end) and insert it into the rbtree */
va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
if (IS_ERR(va)) {
kfree(area);
return NULL;
}

/*
 * When this function is called from __vmalloc_node,
 * we do not add vm_struct to vmlist here to avoid
 * accessing uninitialized members of vm_struct such as
 * pages and nr_pages fields. They will be set later.
 * To distinguish it from others, we use a VM_UNLIST flag.
 */
if (flags & VM_UNLIST)
setup_vmalloc_vm(area, va, flags, caller); /* initialize only; vmlist later */
else
insert_vmalloc_vm(area, va, flags, caller); /* initialize and link into vmlist */

return area;
}
在这里面,主要是为vmalloc区申请vm_struct和vmap_area这两个数据结构,由于是在__vmalloc_node()中调用(设置了VM_UNLIST标志),因此只调用setup_vmalloc_vm()来设置相关的一些域,而不立即插入vmlist链表
/*
 * Initialize @vm from the address range reserved in @va and pair the two
 * structures together (va->private points back at vm).
 */
static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
unsigned long flags, void *caller)
{
vm->flags = flags;
vm->addr = (void *)va->va_start;
vm->size = va->va_end - va->va_start; /* includes the guard page */
vm->caller = caller;
va->private = vm; /* pair the vmap_area with its vm_struct */
va->flags |= VM_VM_AREA;
}
__vmalloc_area_node
/*
 * Allocate the page frames backing @area and map them into its virtual
 * address range. Returns the start address, or NULL on failure — in the
 * failure case @area itself has already been torn down and freed.
 */
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
pgprot_t prot, int node, void *caller)
{
struct page **pages;
unsigned int nr_pages, array_size, i;
nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT;/* pages to map, excluding the guard page */
array_size = (nr_pages * sizeof(struct page *));/* bytes needed for the page-pointer array */
area->nr_pages = nr_pages;
/* Please note that the recursion is strictly bounded. */
if (array_size > PAGE_SIZE) {/* array larger than one page: allocate it with a
recursive __vmalloc_node call */
pages = __vmalloc_node(array_size, 1, gfp_mask | __GFP_ZERO,
PAGE_KERNEL, node, caller);
area->flags |= VM_VPAGES;
} else { /* otherwise a contiguous kmalloc chunk is enough */
pages = kmalloc_node(array_size,
(gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO,
node);
}
/* record the pages array and the caller for diagnostics */
area->pages = pages;
area->caller = caller;
/* array allocation failed: remove the vm_struct and vmap_area from
vmlist and the rbtree respectively, then free the descriptor */
if (!area->pages) {
remove_vm_area(area->addr);
kfree(area);
return NULL;
}
/* allocate one physical page frame per page of the area */
for (i = 0; i < area->nr_pages; i++) {
struct page *page;
if (node < 0)
page = alloc_page(gfp_mask);
else
page = alloc_pages_node(node, gfp_mask, 0);
if (unlikely(!page)) {
/* Successfully allocated i pages, free them in __vunmap() */
area->nr_pages = i;
goto fail;
}
area->pages[i] = page;
}
/* build the page-table mappings for the whole area */
if (map_vm_area(area, prot, &pages))
goto fail;
return area->addr;
fail:
vfree(area->addr);
return NULL;
}
具体的映射在map_vm_area()中建立
/*
 * Map *pages into @area's virtual range, excluding the trailing guard page.
 * On success *pages is advanced past the mapped entries and 0 is returned;
 * a negative errno is returned on failure.
 */
int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
{
/* mapping range: area start up to, but not including, the guard page */
unsigned long addr = (unsigned long)area->addr;
unsigned long end = addr + area->size - PAGE_SIZE;
int err;
err = vmap_page_range(addr, end, prot, *pages);
if (err > 0) {
/* err pages were mapped: advance the caller's cursor, report success */
*pages += err;
err = 0;
}
return err;
}
/* Map @pages into [start, end) and then flush the cache over the new mapping. */
static int vmap_page_range(unsigned long start, unsigned long end,
pgprot_t prot, struct page **pages)
{
int ret;
ret = vmap_page_range_noflush(start, end, prot, pages);
flush_cache_vmap(start, end);
return ret;
}
/*
 * Walk the kernel page tables from @start to @end and install entries
 * pointing at @pages. Returns the number of pages mapped, or a negative
 * errno on failure. Caches are not flushed here.
 */
static int vmap_page_range_noflush(unsigned long start, unsigned long end,
pgprot_t prot, struct page **pages)
{
pgd_t *pgd;
unsigned long next;
unsigned long addr = start;
int err = 0;
int nr = 0;
BUG_ON(addr >= end);
pgd = pgd_offset_k(addr);/* pgd entry covering the first address of the range */
do {
/* next is end if addr and end fall under the same pgd entry; otherwise
it is the start address covered by the following pgd entry */
next = pgd_addr_end(addr, end);
err = vmap_pud_range(pgd, addr, next, prot, pages, &nr);
if (err)
return err;
} while (pgd++, addr = next, addr != end);
return nr;
}
同上类似
/* Walk each pud entry in [addr, end), descending into the pmd level. */
static int vmap_pud_range(pgd_t *pgd, unsigned long addr,
unsigned long end, pgprot_t prot, struct page **pages, int *nr)
{
pud_t *pud;
unsigned long next;
/* get (allocating if necessary) the page upper directory */
pud = pud_alloc(&init_mm, pgd, addr);
if (!pud)
return -ENOMEM;
do {
next = pud_addr_end(addr, end);
if (vmap_pmd_range(pud, addr, next, prot, pages, nr))
return -ENOMEM;
} while (pud++, addr = next, addr != end);
return 0;
}
同上类似
/* Walk each pmd entry in [addr, end), descending into the pte level. */
static int vmap_pmd_range(pud_t *pud, unsigned long addr,
unsigned long end, pgprot_t prot, struct page **pages, int *nr)
{
pmd_t *pmd;
unsigned long next;
/* get (allocating if necessary) the page middle directory */
pmd = pmd_alloc(&init_mm, pud, addr);
if (!pmd)
return -ENOMEM;
do {
next = pmd_addr_end(addr, end);
if (vmap_pte_range(pmd, addr, next, prot, pages, nr))
return -ENOMEM;
} while (pmd++, addr = next, addr != end);
return 0;
}
/* Fill the page-table entries for [addr, end): innermost level of the walk. */
static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, pgprot_t prot, struct page **pages, int *nr)
{
pte_t *pte;
/*
 * nr is a running index into the array which helps higher level
 * callers keep track of where we're up to.
 */
/* locate (allocating a page table if needed) the pte for addr */
pte = pte_alloc_kernel(pmd, addr);
if (!pte)
return -ENOMEM;
do {
struct page *page = pages[*nr];/* page descriptor to map next */
if (WARN_ON(!pte_none(*pte)))
return -EBUSY;
if (WARN_ON(!page))
return -ENOMEM;
/* link the page frame to the pte entry: the mapping is established */
set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
(*nr)++;
} while (pte++, addr += PAGE_SIZE, addr != end);
return 0;
}
insert_vmalloc_vmlist(area)
/*
 * Insert @vm into the global vmlist, which is kept sorted by ascending
 * start address. Clears VM_UNLIST now that the structure is fully set up.
 */
static void insert_vmalloc_vmlist(struct vm_struct *vm)
{
struct vm_struct *tmp, **p;
vm->flags &= ~VM_UNLIST;
write_lock(&vmlist_lock);
/* find the first entry whose start address is >= vm->addr */
for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
if (tmp->addr >= vm->addr)
break;
}
vm->next = *p;
*p = vm;
write_unlock(&vmlist_lock);
}
这个函数简单,就是将vm插入vmlist链表,需要注意的一点就是vmlist链表中的元素是按addr从小到大排列的
非连续内存区的释放
/*
 * vfree - release memory obtained from vmalloc().
 * Must not be called from interrupt context.
 */
void vfree(const void *addr)
{
BUG_ON(in_interrupt());
kmemleak_free(addr); /* notify the kmemleak tracker */
__vunmap(addr, 1); /* 1: also free the backing page frames */
}
/*
 * Tear down the vmalloc/vmap area starting at @addr. If @deallocate_pages
 * is set (the vfree() path), the backing page frames are returned to the
 * page allocator as well; otherwise only the mapping is removed.
 */
static void __vunmap(const void *addr, int deallocate_pages)
{
struct vm_struct *area;
if (!addr)
return;
/* vmalloc areas are page aligned; anything else is a bad pointer */
if ((PAGE_SIZE-1) & (unsigned long)addr) {
WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr);
return;
}
/* unlink the vm_struct from vmlist and the vmap_area from the rbtree */
area = remove_vm_area(addr);
if (unlikely(!area)) {
WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
addr);
return;
}
debug_check_no_locks_freed(addr, area->size);
debug_check_no_obj_freed(addr, area->size);
if (deallocate_pages) {
int i;
/* give the mapped page frames back to the page allocator */
for (i = 0; i < area->nr_pages; i++) {
struct page *page = area->pages[i];
BUG_ON(!page);
__free_page(page);
}
/* free the page-pointer array: vfree if it was itself vmalloc'ed
(VM_VPAGES set), kfree otherwise */
if (area->flags & VM_VPAGES)
vfree(area->pages);
else
kfree(area->pages);
}
/* finally release the vm_struct descriptor itself */
kfree(area);
return;
}
总结
至此,已将高端内存所有区域的映射介绍完毕。在我看来,内核的线性地址空间都可以视为一种资源,因为必须通过线性地址来访问页表,进一步通过页表来访问相应的物理内存。由于内核的线性地址空间有限,因此采取上面介绍的三种方式来映射高端内存。需要明确的一点就是,线性地址与页表之间的映射是固定不可变的,而页表到具体的物理页框之间的映射是可以改变的,内核正是利用页表到物理页框之间的映射的可变性来为高端内存建立“临时”的映射,这三种机制本质上都回归到这点。永久内核映射和临时内核映射,都由内核指定了需要进行映射的页面,也就是说指定了页描述符(页描述符和物理页框之间的关系是固定不可变的),在永久内核映射中,内核只需要在永久内核映射区找到空闲的,也就是未被映射的线性地址对应的页表项,然后将其分配给page即可,若找不到则将阻塞申请建立映射的进程;而临时内核映射更直接,连进行映射的线性地址窗口都是固定的,若是其已经分配给了某个页框,则直接抢过来用,因此之前的映射就被覆盖了,体现出了临时性。非连续内存分配,内核不用指定具体的page,只需指定要申请的内存大小,内核将在非连续内存分配区找到一块相应大小虚拟地址空间,然后再由伙伴系统分配页框,还要通过slab分配器为一些数据结构分配内存,最后再用同样的方式(设置PTE表项)来建立映射,其中涉及到伙伴系统和slab分配的部分都没做具体分析,在后面的文章中再着重分析这些部分。