上一篇我们从内存结点,内存域的维度查看了Linux内存管理。本篇接着从内存分配的维度来看Linux内存管理。到现在为止,内核建立起了节点,域,页三级管理结构,并完成了页表映射。但是这一切都是在启动期内存管理器的基础上建立的,我们自建的内存管理器只完成了内存映射,还不具备内存分配的功能。为此接下来内核需要建立内存分配系统,Linux从内存分配效率,内存利用率的角度出发在前述管理结构上建立起伙伴(buddy)子系统来对接内存页分配,但是这还不够,考虑到系统中各种可能的对象尺寸和内存分配效率,Linux在伙伴子系统的基础上构建起slab内存分配器来满足系统各种尺寸对象的分配需求,同时结合硬件缓存实现内存的高速分配。
一. 伙伴系统
Linux在构建起分页管理后,内存管理的责任由伙伴系统承担。每一个内存域都关联了一个struct zone,该结构中的free_area数组用于管理伙伴系统数据。free_area数组的索引即是内存页分配阶数,数组大小MAX_ORDER=11,即分配阶数为0~10,单次最大可分配2^10=1024个连续页。
linux_kernel/include/linux/mmzone.h
/*
 * Per-zone data (excerpt): free_area[] is the buddy allocator's
 * per-order free-block table; the array index is the allocation order.
 */
struct zone {
    struct free_area free_area[]; /* one entry per order, 0..MAX_ORDER-1 */
}
结构体free_area的free_list为对应阶数的内存页链表数组,该数组按内存页的迁移类型MIGRATE_TYPES来区分。nr_free统计空闲页数目。
linux_kernel/include/linux/mmzone.h
/*
 * Free blocks of one order: one list per page migrate type,
 * plus a count of free blocks at this order.
 */
struct free_area {
    struct list_head free_list[MIGRATE_TYPES]; /* per-migratetype free lists */
    unsigned long nr_free;                     /* number of free blocks */
};
1.分配
linux_kernel/include/linux/gfp.h
/*
 * Thin wrapper: allocate 2^order pages from the preferred node with
 * no explicit nodemask restriction (NULL nodemask).
 */
static inline struct page *
__alloc_pages(gfp_t gfp_mask, unsigned int order, int preferred_nid)
{
    return __alloc_pages_nodemask(gfp_mask, order, preferred_nid, NULL);
}
linux_kernel/mm/page_alloc.c
伙伴系统核心分配函数
/*
 * Core entry point of the buddy allocator (excerpt: alloc_mask,
 * alloc_flags and ac are prepared by kernel code elided here).
 */
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
        nodemask_t *nodemask)
{
    struct page *page;
    /* Fast path: try to take pages straight off the zone free lists */
    page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
    if (likely(page))
        goto out;
    /* Fast path failed: fall back to the slow allocation path */
    page = __alloc_pages_slowpath(alloc_mask, order, &ac);
out:
    return page;
}
EXPORT_SYMBOL(__alloc_pages_nodemask);
linux_kernel/mm/page_alloc.c
遍历所有备用zone,尝试从空闲列表分配
/*
 * Walk the zonelist of candidate zones and try to allocate 2^order
 * pages from each zone's free lists via rmqueue(). Returns NULL when
 * no zone can satisfy the request. ("......" marks kernel code elided
 * from this excerpt, e.g. watermark checks.)
 *
 * Fix vs. original excerpt: a stray extra '}' closed the function
 * before "return NULL;", leaving the statement outside the body.
 */
static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
        const struct alloc_context *ac)
{
    ......
    for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
            ac->nodemask) {
        ......
        page = rmqueue(ac->preferred_zoneref->zone, zone, order,
                gfp_mask, alloc_flags, ac->migratetype);
        ......
    }
    return NULL;
}
linux_kernel/mm/page_alloc.c
按迁移类型遍历内存域
/*
 * Pull a 2^order block from the zone (excerpt: 'page' is declared in
 * elided code). NOTE(review): the excerpt always passes
 * MIGRATE_HIGHATOMIC; in the full kernel the highatomic reserve is
 * only tried under specific alloc_flags — confirm against source.
 */
static inline
struct page *rmqueue(struct zone *preferred_zone,
        struct zone *zone, unsigned int order,
        gfp_t gfp_flags, unsigned int alloc_flags,
        int migratetype)
{
    page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
    return page;
}
linux_kernel/mm/page_alloc.c
找到合适的空闲页返回
/*
 * Scan the free_area lists from the requested order upwards and return
 * the first suitable free block; a larger block is split and the unused
 * halves are handed back to the lower-order lists by expand().
 */
static __always_inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
        int migratetype)
{
    unsigned int current_order;
    struct free_area *area;
    struct page *page;
    /* Find a page of the appropriate size in the preferred list */
    for (current_order = order; current_order < MAX_ORDER; ++current_order) {
        area = &(zone->free_area[current_order]);
        page = get_page_from_free_area(area, migratetype);
        if (!page)
            continue; /* nothing free at this order, try one order up */
        /* Unlink the block, split the surplus back to lower orders */
        del_page_from_free_area(page, area);
        expand(zone, page, order, current_order, area, migratetype);
        set_pcppage_migratetype(page, migratetype);
        return page;
    }
    return NULL; /* no block of sufficient order available */
}
2.回收
linux_kernel/mm/page_alloc.c
/*
 * Drop a reference on the page; only when the refcount reaches zero
 * is the 2^order block actually freed.
 */
void __free_pages(struct page *page, unsigned int order)
{
    if (put_page_testzero(page))
        free_the_page(page, order);
}
EXPORT_SYMBOL(__free_pages);
linux_kernel/mm/page_alloc.c
如果是单页,不归还给伙伴系统,放到CPU缓存中
/*
 * Dispatch a free: order-0 pages take the per-CPU path instead of
 * going straight back to the buddy system; higher orders go through
 * __free_pages_ok().
 */
static inline void free_the_page(struct page *page, unsigned int order)
{
    if (order == 0)
        free_unref_page(page); /* single page: per-CPU free path */
    else
        __free_pages_ok(page, order);
}
linux_kernel/mm/page_alloc.c
计算内存页对应的内存域,迁移类型
/*
 * Free a high-order block back to its zone (excerpt: pfn and
 * migratetype are computed by kernel code elided here).
 */
static void __free_pages_ok(struct page *page, unsigned int order)
{
    free_one_page(page_zone(page), page, pfn, order, migratetype);
}
linux_kernel/mm/page_alloc.c
/* Hand the block to the buddy merge logic in __free_one_page(). */
static void free_one_page(struct zone *zone,
        struct page *page, unsigned long pfn,
        unsigned int order,
        int migratetype)
{
    __free_one_page(page, pfn, zone, order, migratetype);
}
linux_kernel/mm/page_alloc.c
要释放的内存被添加到相关内存域的free_area中,进一步的对连续的内存单元进行合并为高一阶的内存放到高一阶的free_area中。此外如果存在可以合并的伙伴对,也进行合并并转移到高一阶的free_area列表中,直到所有可能的伙伴对都已合并。
/*
 * Return a 2^order block to the zone's free_area, merging it with its
 * buddy into ever larger blocks as long as each buddy is itself free,
 * then link the final block into the free list of the resulting order.
 */
static inline void __free_one_page(struct page *page,
        unsigned long pfn,
        struct zone *zone, unsigned int order,
        int migratetype)
{
    unsigned long combined_pfn;
    unsigned long uninitialized_var(buddy_pfn);
    struct page *buddy;
    unsigned int max_order;
    struct capture_control *capc = task_capc(zone);
    /* first merging pass stops at the pageblock boundary */
    max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
continue_merging:
    while (order < max_order - 1) { /* loop until no further merge is possible */
        /* compute the buddy's pfn at the current order */
        buddy_pfn = __find_buddy_pfn(pfn, order);
        buddy = page + (buddy_pfn - pfn);
        if (!pfn_valid_within(buddy_pfn))
            goto done_merging; /* buddy pfn invalid: stop merging */
        if (!page_is_buddy(page, buddy, order))
            goto done_merging; /* buddy not free at this order: stop */
        if (page_is_guard(buddy))
            clear_page_guard(zone, buddy, order, migratetype);
        else
            del_page_from_free_area(buddy, &zone->free_area[order]);
        /* merged block starts at the lower of the two pfns */
        combined_pfn = buddy_pfn & pfn;
        page = page + (combined_pfn - pfn);
        pfn = combined_pfn;
        /* go up one order and try to merge again */
        order++;
    }
    /* check whether merging may continue beyond the pageblock boundary */
    if (max_order < MAX_ORDER) {
        if (unlikely(has_isolate_pageblock(zone))) {
            int buddy_mt;
            buddy_pfn = __find_buddy_pfn(pfn, order);
            buddy = page + (buddy_pfn - pfn);
            buddy_mt = get_pageblock_migratetype(buddy);
            /* never merge isolated pageblocks with other types */
            if (migratetype != buddy_mt
                    && (is_migrate_isolate(migratetype) ||
                        is_migrate_isolate(buddy_mt)))
                goto done_merging;
        }
        max_order++;
        goto continue_merging;
    }
done_merging:
    set_page_order(page, order);
    /* if the next-higher buddy pair looks mergeable soon, queue at the
     * tail so this block is consumed late and gets a chance to merge */
    if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)
            && !is_shuffle_order(order)) {
        struct page *higher_page, *higher_buddy;
        /* compute the would-be higher-order buddy pair */
        combined_pfn = buddy_pfn & pfn;
        higher_page = page + (combined_pfn - pfn);
        buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
        higher_buddy = higher_page + (buddy_pfn - combined_pfn);
        if (pfn_valid_within(buddy_pfn) &&
                page_is_buddy(higher_page, higher_buddy, order + 1)) {
            /* add to the tail of this order's free list */
            add_to_free_area_tail(page, &zone->free_area[order],
                    migratetype);
            return;
        }
    }
    /* otherwise add to this order's free list (randomized if shuffling) */
    if (is_shuffle_order(order))
        add_to_free_area_random(page, &zone->free_area[order],
                migratetype);
    else
        add_to_free_area(page, &zone->free_area[order], migratetype);
}
二. slub系统
现在我们已经构建起了伙伴系统,接下来是时候构建slub系统了。首先为什么要有slub系统?答案很明显:伙伴系统最小分配单元是页,如果我需要分配256个字节,伙伴系统干不了,所以才有了slub系统。为了提高效率,内核为slub创建了多级,多种缓存。多级体现在slub的cpu缓存和node缓存,多种体现在,按对象尺寸建立多个缓存。为了分配的效率,按slub缓存的空满状态建立缓存管理。
linux_kernel/include/linux/slub_def.h
系统按不同的对象尺寸创建各类kmem_cache ,每个kmem_cache 关联着kmem_cache_cpu 和kmem_cache_node 数组
/*
 * One slab cache: created per object size/type. Each cache owns a
 * per-CPU kmem_cache_cpu (fast path) and per-node kmem_cache_node
 * structures (backing store of partial/full slabs).
 */
struct kmem_cache {
    struct kmem_cache_cpu __percpu *cpu_slab; /* per-CPU fast-path state */
    /* Used for retrieving partial slabs, etc. */
    slab_flags_t flags;
    unsigned long min_partial;  /* min partial slabs to keep per node */
    unsigned int size;          /* The size of an object including metadata */
    unsigned int object_size;   /* The size of an object without metadata */
    unsigned int offset;        /* Free pointer offset */
    struct kmem_cache_order_objects oo; /* preferred order/objects per slab */
    /* Allocation and freeing of slabs */
    struct kmem_cache_order_objects max;
    struct kmem_cache_order_objects min;
    gfp_t allocflags;           /* gfp flags to use on each alloc */
    int refcount;               /* Refcount for slab cache destroy */
    void (*ctor)(void *);       /* optional object constructor */
    unsigned int inuse;         /* Offset to metadata */
    unsigned int align;         /* Alignment */
    unsigned int red_left_pad;  /* Left redzone padding size */
    const char *name;           /* Name (only for display!) */
    struct list_head list;      /* List of slab caches */
    struct kmem_cache_node *node[MAX_NUMNODES]; /* per-NUMA-node state */
};
linux_kernel/include/linux/slub_def.h
kmem_cache_cpu像营业厅,内存分配都先找它
/*
 * Per-CPU slab state: the allocation fast path takes objects from
 * here first, without touching per-node locks.
 */
struct kmem_cache_cpu {
    void **freelist;     /* Pointer to next available object */
    unsigned long tid;   /* Globally unique transaction id */
    struct page *page;   /* slab currently being allocated from */
#ifdef CONFIG_SLUB_CPU_PARTIAL
    struct page *partial; /* Partially allocated frozen slabs */
#endif
#ifdef CONFIG_SLUB_STATS
    unsigned stat[NR_SLUB_STAT_ITEMS];
#endif
};
linux_kernel/mm/slab.h
kmem_cache_node像仓库,营业厅卖完了,把卖完的slab送回仓库,重新从仓库拉一个slab回营业厅
/*
 * Per-NUMA-node slab state, shared by SLAB and SLUB via #ifdef.
 *
 * Fix vs. original excerpt: the "#ifdef CONFIG_SLUB" section (with
 * nr_partial/partial, which __slab_free() below relies on) and the
 * inner "#ifdef CONFIG_SLUB_DEBUG" guard were dropped, leaving two
 * unmatched #endif directives. Restored per kernel mm/slab.h.
 */
struct kmem_cache_node {
    spinlock_t list_lock; /* protects the slab lists below */
#ifdef CONFIG_SLAB
    struct list_head slabs_partial; /* partial list first, better asm code */
    struct list_head slabs_full;
    struct list_head slabs_free;
    unsigned long total_slabs;  /* length of all slab lists */
    unsigned long free_slabs;   /* length of free slab list only */
    unsigned long free_objects;
    unsigned int free_limit;
    unsigned int colour_next;   /* Per-node cache coloring */
    struct array_cache *shared; /* shared per node */
    struct alien_cache **alien; /* on other nodes */
    unsigned long next_reap;    /* updated without locking */
    int free_touched;           /* updated without locking */
#endif
#ifdef CONFIG_SLUB
    unsigned long nr_partial;   /* number of partial slabs on this node */
    struct list_head partial;   /* partially-filled slabs */
#ifdef CONFIG_SLUB_DEBUG
    atomic_long_t nr_slabs;
    atomic_long_t total_objects;
    struct list_head full;      /* fully-allocated slabs (debug only) */
#endif
#endif
};
每次申请size大小的对象时,系统会根据size匹配到合适的kmem_cache,接着先去 kmem_cache_cpu 的slub分配对象,如果没有分配到,说明kmem_cache_cpu的slub满了,则将此slub挂到kmem_cache_node 的slabs_full队列中,重新从kmem_cache_node 的partial队列找一个slub挂到kmem_cache_cpu ,如果partial队列没找到则重新申请一个slub。
1.构建
linux_kernel/init/main.c
/* Kernel boot entry (excerpt): memory-management init happens here. */
asmlinkage __visible void __init start_kernel(void){
    mm_init();
}
linux_kernel/init/main.c
/*
 * Set up the kernel memory allocators: hand boot memory to the buddy
 * system, then bring up the slab allocator, page-table caches and
 * the vmalloc subsystem.
 */
static void __init mm_init(void){
    mem_init();          /* release memblock memory to the buddy system */
    kmem_cache_init();   /* slab/slub allocator bring-up */
    pgtable_init();
    vmalloc_init();      /* vmalloc allocator initialization */
}
linux_kernel/arch/x86/mm/init_64.c
内存域管理中,讲过Linux启动期间通过memblock分配器来管理内存,系统走到这里,是时候切换到新建的内存管理系统了,memblock_free_all将释放启动器的内存,将启动期间分配的内存迁移到伙伴系统中去。
/*
 * Hand all boot-time memblock memory over to the buddy allocator and
 * mark the boot allocator as retired.
 */
void __init mem_init(void)
{
    memblock_free_all();
    after_bootmem = 1; /* from here on, allocations use the buddy system */
}
linux_kernel/mm/slub.c
这里是slub系统的起点
/* Starting point of the slub system: create the kmalloc size caches. */
void __init kmem_cache_init(void)
{
    create_kmalloc_caches(0);
}
linux_kernel/mm/slab_common.c
初始化时循环创建各种类型的mem_cache
/*
 * Create the kmalloc caches for every type (normal, reclaimable) and
 * every power-of-two size from KMALLOC_SHIFT_LOW to KMALLOC_SHIFT_HIGH.
 * Indices 1 and 2 hold the non-power-of-two 96- and 192-byte caches,
 * created right after the 64- and 128-byte caches respectively.
 */
void __init create_kmalloc_caches(slab_flags_t flags)
{
    int i, type;
    for (type = KMALLOC_NORMAL; type <= KMALLOC_RECLAIM; type++) {
        for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
            if (!kmalloc_caches[type][i])
                new_kmalloc_cache(i, type, flags);
            /* after 2^6 = 64 bytes, create the 96-byte cache (index 1) */
            if (KMALLOC_MIN_SIZE <= 32 && i == 6 &&
                    !kmalloc_caches[type][1])
                new_kmalloc_cache(1, type, flags);
            /* after 2^7 = 128 bytes, create the 192-byte cache (index 2) */
            if (KMALLOC_MIN_SIZE <= 64 && i == 7 &&
                    !kmalloc_caches[type][2])
                new_kmalloc_cache(2, type, flags);
        }
    }
}
2.分配
linux_kernel/include/linux/slab.h
/* Public kmalloc entry (excerpt): delegates to __kmalloc. */
static __always_inline void *kmalloc(size_t size, gfp_t flags)
{
    return __kmalloc(size, flags);
}
linux_kernel/mm/slub.c
/*
 * Allocate a size-byte object (excerpt: the lookup of the matching
 * kmem_cache 's' and the declaration of 'ret' are elided here).
 */
void *__kmalloc(size_t size, gfp_t flags)
{
    struct kmem_cache *s;
    ret = slab_alloc(s, flags, _RET_IP_);
    return ret;
}
EXPORT_SYMBOL(__kmalloc);
linux_kernel/mm/slab.c
/*
 * Allocate one object from the given cache (excerpt: 'objp' is
 * declared in elided code).
 */
static __always_inline void *
slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
{
    objp = __do_cache_alloc(cachep, flags);
    return objp;
}
linux_kernel/mm/slab.c
先从kmem_cache_cpu的slab分配,未分配到,从kmem_cache_node 分配
/*
 * Two-level allocation: try the per-CPU cache first; on failure fall
 * back to the local NUMA node's cache. (The unused "out:" label is a
 * leftover from code elided in this excerpt.)
 */
static __always_inline void *
__do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
{
    void *objp;
    objp = ____cache_alloc(cache, flags);
    if (!objp)
        objp = ____cache_alloc_node(cache, flags, numa_mem_id());
out:
    return objp;
}
3.释放
linux_kernel/mm/slub.c
/*
 * Free a kmalloc'd object: locate the slab page the pointer belongs
 * to and return the object to that page's cache.
 */
void kfree(const void *x)
{
    struct page *page;
    void *object = (void *)x;
    page = virt_to_head_page(x); /* slab page owning this object */
    slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_);
}
EXPORT_SYMBOL(kfree);
linux_kernel/mm/slub.c
/*
 * Run the free-side hooks (debugging/poisoning); only if they allow
 * it does the actual free proceed.
 */
static __always_inline void slab_free(struct kmem_cache *s, struct page *page,
        void *head, void *tail, int cnt,
        unsigned long addr)
{
    if (slab_free_freelist_hook(s, &head, &tail))
        do_slab_free(s, page, head, tail, cnt, addr);
}
linux_kernel/mm/slub.c
/*
 * Free a chain of objects [head..tail] back to its slab page.
 *
 * Fix vs. original excerpt: 'tail_obj' was used but never declared;
 * the kernel source defines it as "tail ? : head" (a single-object
 * free passes tail == NULL, so the tail is then head itself).
 */
static __always_inline void do_slab_free(struct kmem_cache *s,
        struct page *page, void *head, void *tail,
        int cnt, unsigned long addr)
{
    void *tail_obj = tail ? : head; /* NULL tail means single object */
    __slab_free(s, page, head, tail_obj, cnt, addr);
}
linux_kernel/mm/slub.c
先判断要释放的page是否在cpu缓存中,如果是则释放,否则表示在node缓存中。判断所在slab的满空状态采取不同的处理策略。
/*
 * Slow-path free: return objects to a slab that is not the current
 * CPU's active slab. Decide, based on the slab's resulting state
 * (frozen / partial / empty), whether to keep it on the CPU partial
 * list, move it between node lists, or discard it entirely.
 */
static void __slab_free(struct kmem_cache *s, struct page *page,
        void *head, void *tail, int cnt,
        unsigned long addr)
{
    void *prior;
    int was_frozen;
    struct page new;
    unsigned long counters;
    struct kmem_cache_node *n = NULL;
    unsigned long uninitialized_var(flags);
    do {
        prior = page->freelist;
        counters = page->counters;
        /* link the freed chain in front of the old freelist */
        set_freepointer(s, tail, prior);
        new.counters = counters;
        was_frozen = new.frozen;
        new.inuse -= cnt;
        if ((!new.inuse || !prior) && !was_frozen) {
            /* slab became empty, or was full: list membership changes */
            if (kmem_cache_has_cpu_partial(s) && !prior) {
                /* was full: freeze it onto the CPU partial list */
                new.frozen = 1;
            } else {
                /* otherwise take the owning node's list lock */
                n = get_node(s, page_to_nid(page));
                spin_lock_irqsave(&n->list_lock, flags);
            }
        }
    } while (!cmpxchg_double_slab(s, page,
            prior, counters,
            head, new.counters,
            "__slab_free"));
    if (likely(!n)) { /* no node lock taken: CPU-side handling only */
        if (new.frozen && !was_frozen) {
            /* newly frozen: park it on the CPU partial list */
            put_cpu_partial(s, page, 1);
            stat(s, CPU_PARTIAL_FREE);
        }
        if (was_frozen)
            stat(s, FREE_FROZEN);
        return;
    }
    /* slab lives on the node lists */
    if (unlikely(!new.inuse && n->nr_partial >= s->min_partial))
        goto slab_empty; /* empty and node has enough partials: discard */
    /* was full: move from the full list to the partial list */
    if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
        remove_full(s, n, page);
        add_partial(n, page, DEACTIVATE_TO_TAIL);
        stat(s, FREE_ADD_PARTIAL);
    }
    spin_unlock_irqrestore(&n->list_lock, flags);
    return;
slab_empty:
    if (prior) { /* had free objects: it was on the partial list */
        remove_partial(n, page);
        stat(s, FREE_REMOVE_PARTIAL);
    } else {
        /* had no free objects: it was on the full list */
        remove_full(s, n, page);
    }
    spin_unlock_irqrestore(&n->list_lock, flags);
    stat(s, FREE_SLAB);
    discard_slab(s, page); /* give the pages back to the buddy system */
}
三.vmalloc
在内存域管理一篇,32位机高端内存vmalloc区用于建立非连续内存分配,每个vmalloc分配的子区域用vm_struct 来表示。vmap_area 用于组织管理各个子区域来实现1G以上高端内存的映射。
linux_kernel/include/linux/vmalloc.h
vm_struct表示一个映射区
/* One vmalloc mapping region in the kernel's vmalloc address range. */
struct vm_struct {
    struct vm_struct *next;  /* singly-linked list of regions */
    void *addr;              /* start address of the region in virtual space */
    unsigned long size;      /* region length */
    unsigned long flags;     /* region type */
    struct page **pages;     /* array of pointers to the backing pages */
    unsigned int nr_pages;   /* number of entries in pages[] */
    phys_addr_t phys_addr;   /* physical address (for ioremap) */
    const void *caller;      /* allocation call site, for diagnostics */
};
linux_kernel/include/linux/vmalloc.h
vmap_area 用于组织vm_struct
/*
 * Bookkeeping node for one vmalloc address range; organized both in
 * an address-sorted rbtree and an address-sorted list.
 */
struct vmap_area {
    unsigned long va_start;  /* range start address */
    unsigned long va_end;    /* range end address */
    struct rb_node rb_node;  /* address sorted rbtree */
    struct list_head list;   /* address sorted list */
    union {
        unsigned long subtree_max_size; /* in "free" tree */
        struct vm_struct *vm;           /* in "busy" tree */
        struct llist_node purge_list;   /* in purge list */
    };
};
1.分配
linux_kernel/mm/vmalloc.c
/*
 * Allocate and set up a vm_struct describing a new vmalloc region:
 * reserve an address range via alloc_vmap_area() and bind the two
 * structures together. ("......" marks elided kernel code.)
 */
static struct vm_struct *__get_vm_area_node(unsigned long size,
        unsigned long align, unsigned long flags, unsigned long start,
        unsigned long end, int node, gfp_t gfp_mask, const void *caller)
{
    struct vmap_area *va;
    struct vm_struct *area;
    size = PAGE_ALIGN(size);
    .....
    /* allocate the vm_struct itself from the node's slab */
    area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
    ......
    /* reserve one extra guard page unless explicitly disabled */
    if (!(flags & VM_NO_GUARD))
        size += PAGE_SIZE;
    /* reserve a free address range and get its vmap_area */
    va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
    if (IS_ERR(va)) {
        kfree(area);
        return NULL;
    }
    /* attach the vm_struct to the vmap_area */
    setup_vmalloc_vm(area, va, flags, caller);
    return area;
}
linux_kernel/mm/vmalloc.c
/*
 * Allocate a vmap_area covering a free address range of 'size' bytes
 * within [vstart, vend) and insert it into the global vmap tree/list.
 * Returns ERR_PTR(-EBUSY) when no range fits (excerpt; retry/purge
 * logic of the full kernel source is elided).
 */
static struct vmap_area *alloc_vmap_area(unsigned long size,
        unsigned long align,
        unsigned long vstart, unsigned long vend,
        int node, gfp_t gfp_mask)
{
    struct vmap_area *va, *pva;
    unsigned long addr;
    int purged = 0;
    /* allocate the vmap_area descriptor from its slab cache */
    va = kmem_cache_alloc_node(vmap_area_cachep,
            gfp_mask & GFP_RECLAIM_MASK, node);
retry:
    /* find a suitable free address range */
    addr = __alloc_vmap_area(size, align, vstart, vend);
    if (unlikely(addr == vend))
        goto overflow; /* vend is the "no space found" sentinel */
    va->va_start = addr;
    va->va_end = addr + size;
    va->vm = NULL;
    /* insert into the global vmap_area rbtree and list */
    insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
    return va;
overflow:
    /* no address space left: release the descriptor and fail */
    kmem_cache_free(vmap_area_cachep, va);
    return ERR_PTR(-EBUSY);
}
2.释放
linux_kernel/mm/vmalloc.c
解除映射,必要时释放内存页
/*
 * Tear down a vmalloc region: remove its page-table mappings and,
 * when deallocate_pages is set, return every backing page to the
 * buddy system before freeing the bookkeeping structures.
 */
static void __vunmap(const void *addr, int deallocate_pages)
{
    struct vm_struct *area;
    area = find_vm_area(addr); /* look up the region by start address */
    vm_remove_mappings(area, deallocate_pages);
    if (deallocate_pages) {
        int i;
        /* give each backing page back to the buddy allocator */
        for (i = 0; i < area->nr_pages; i++) {
            struct page *page = area->pages[i];
            __free_pages(page, 0);
        }
        atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
        kvfree(area->pages); /* free the page-pointer array */
    }
    kfree(area);
    return;
}
至此Linux基于分页管理从另一个维度组织起伙伴系统和slub系统,内核内存管理从启动期内存管理器过渡转换到伙伴(buddy),slab内存分配器,为内核构建起高效的内存管理系统。但是到目前为止我们都是站在内核的角度来看,从用户的角度,要支撑起上层应用庞大的内存分配需求,显然是不够的。比如考虑运行在Linux之上的HotSpot虚拟机,她自身可以管理百兆到TB级别的庞大内存空间,拥有自身的内存管理系统,而她自身又作为一个系统进程运行在Linux之上,拥有堆栈空间。因此在用户空间必须有另一套内存管理机制来支撑上层应用的运行,这个机制就是进程虚拟内存。