The Linux memory allocation system

In the previous article we looked at Linux memory management from the perspective of memory nodes and zones. This article continues from the perspective of memory allocation. Up to this point the kernel has built the three-level node / zone / page management structure and completed the page-table mappings, but all of that was done on top of the boot-time memory manager, which only handles mappings and cannot yet allocate memory on demand. The kernel therefore has to bring up a real allocation system. For allocation efficiency and memory utilization, Linux builds the buddy subsystem on top of the structures above to hand out memory pages. That alone is not enough: given the wide range of object sizes in the system and the cost of page-granular allocation, Linux builds the slab allocator on top of the buddy system to serve objects of arbitrary sizes, and combines it with the hardware caches to make allocation fast.

I. The buddy system

Once paging is in place, responsibility for memory management passes to the buddy system. Every memory zone is described by a struct zone, whose free_area array holds the buddy-system data. The index into free_area is the allocation order; with MAX_ORDER = 11 the orders run from 0 to 10, so the largest block is 2^10 = 1024 contiguous pages.


linux_kernel/include/linux/mmzone.h

struct zone {
	struct free_area	free_area[MAX_ORDER];
};

In struct free_area, free_list is an array of lists of free blocks of the corresponding order, indexed by migrate type (MIGRATE_TYPES); nr_free counts the free blocks at this order.

linux_kernel/include/linux/mmzone.h

struct free_area {
	struct list_head	free_list[MIGRATE_TYPES];
	unsigned long		nr_free;
};
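
To make the order arithmetic concrete, here is a small user-space sketch (not kernel code; a 4 KiB page size is assumed) that prints the block size each free_area slot manages:

#include <stdio.h>

#define DEMO_PAGE_SIZE 4096UL	/* assumed 4 KiB pages */
#define DEMO_MAX_ORDER 11	/* orders 0..10, matching MAX_ORDER above */

int main(void)
{
	/* free_area[order] holds blocks of 2^order contiguous pages */
	for (unsigned int order = 0; order < DEMO_MAX_ORDER; order++) {
		unsigned long pages = 1UL << order;
		printf("order %2u: %4lu pages = %7lu KiB\n",
		       order, pages, pages * DEMO_PAGE_SIZE / 1024);
	}
	return 0;
}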

1. Allocation

linux_kernel/include/linux/gfp.h

static inline struct page *
__alloc_pages(gfp_t gfp_mask, unsigned int order, int preferred_nid)
{
	return __alloc_pages_nodemask(gfp_mask, order, preferred_nid, NULL);
}

linux_kernel/mm/page_alloc.c
The core allocation function of the buddy system

struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
							nodemask_t *nodemask)
{
	struct page *page;
	//try to allocate from the free lists first
	page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
	if (likely(page))
		goto out;
	//fall back to the slow path
	page = __alloc_pages_slowpath(alloc_mask, order, &ac);

out:
	return page;
}
EXPORT_SYMBOL(__alloc_pages_nodemask);

linux_kernel/mm/page_alloc.c
Walks the fallback zonelist and tries to allocate from each zone's free lists

static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
						const struct alloc_context *ac)
{
    ......
	for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
								ac->nodemask) {
		......
		page = rmqueue(ac->preferred_zoneref->zone, zone, order,
				gfp_mask, alloc_flags, ac->migratetype);
		......
	}
	return NULL;
}

linux_kernel/mm/page_alloc.c
Takes a block of the requested order and migrate type from the zone (heavily simplified excerpt; the real rmqueue first tries the per-CPU page lists for order-0 requests)

static inline
struct page *rmqueue(struct zone *preferred_zone,
			struct zone *zone, unsigned int order,
			gfp_t gfp_flags, unsigned int alloc_flags,
			int migratetype)
{
	page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
	return page;
}

linux_kernel/mm/page_alloc.c
Finds the smallest suitable free block, splits it down to the requested order via expand(), and returns it

static __always_inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
						int migratetype)
{
	unsigned int current_order;
	struct free_area *area;
	struct page *page;

	/* Find a page of the appropriate size in the preferred list */
	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
		area = &(zone->free_area[current_order]);
		page = get_page_from_free_area(area, migratetype);
		if (!page)
			continue;
		del_page_from_free_area(page, area);
		expand(zone, page, order, current_order, area, migratetype);
		set_pcppage_migratetype(page, migratetype);
		return page;
	}

	return NULL;
}

2. Freeing

linux_kernel/mm/page_alloc.c

void __free_pages(struct page *page, unsigned int order)
{
	if (put_page_testzero(page))
		free_the_page(page, order);
}
EXPORT_SYMBOL(__free_pages);

linux_kernel/mm/page_alloc.c
A single page is not handed straight back to the buddy system; it goes to the per-CPU page cache instead

static inline void free_the_page(struct page *page, unsigned int order)
{
	if (order == 0)
		free_unref_page(page); //free a single page to the per-CPU lists
	else
		__free_pages_ok(page, order);
}

linux_kernel/mm/page_alloc.c
Determines the page's zone and migrate type, then frees it

static void __free_pages_ok(struct page *page, unsigned int order)
{
	unsigned long pfn = page_to_pfn(page);
	int migratetype = get_pfnblock_migratetype(page, pfn);

	free_one_page(page_zone(page), page, pfn, order, migratetype);
}

linux_kernel/mm/page_alloc.c

static void free_one_page(struct zone *zone,
				struct page *page, unsigned long pfn,
				unsigned int order,
				int migratetype)
{
	__free_one_page(page, pfn, zone, order, migratetype);
}

linux_kernel/mm/page_alloc.c
The memory being freed is added to the free_area of its zone at the corresponding order. If its buddy block is also free, the two are merged into a block of the next higher order and moved to that order's free_area; merging then repeats with the new, larger block until no further buddy pair can be merged.

static inline void __free_one_page(struct page *page,
		unsigned long pfn,
		struct zone *zone, unsigned int order,
		int migratetype)
{
	unsigned long combined_pfn;
	unsigned long uninitialized_var(buddy_pfn);
	struct page *buddy;
	unsigned int max_order;
	struct capture_control *capc = task_capc(zone);

	max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);

continue_merging:
	while (order < max_order - 1) { //keep merging until no further merge is possible
		//compute the buddy's pfn
		buddy_pfn = __find_buddy_pfn(pfn, order);
		buddy = page + (buddy_pfn - pfn);
		if (!pfn_valid_within(buddy_pfn))
			goto done_merging; //cannot merge further, finish up
		if (!page_is_buddy(page, buddy, order))
			goto done_merging;
		if (page_is_guard(buddy))
			clear_page_guard(zone, buddy, order, migratetype);
		else
			del_page_from_free_area(buddy, &zone->free_area[order]);
		combined_pfn = buddy_pfn & pfn;
		page = page + (combined_pfn - pfn);
		pfn = combined_pfn;
		//move up one order and look for the next buddy
		order++;
	}
	
	//check whether merging may continue past pageblock_order
	if (max_order < MAX_ORDER) {
		if (unlikely(has_isolate_pageblock(zone))) {
			int buddy_mt;

			buddy_pfn = __find_buddy_pfn(pfn, order);
			buddy = page + (buddy_pfn - pfn);
			buddy_mt = get_pageblock_migratetype(buddy);

			if (migratetype != buddy_mt
					&& (is_migrate_isolate(migratetype) ||
						is_migrate_isolate(buddy_mt)))
				goto done_merging;
		}
		max_order++;
		goto continue_merging;
	}

done_merging:
	set_page_order(page, order);
	if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)
			&& !is_shuffle_order(order)) {
		struct page *higher_page, *higher_buddy;
		//compute where the next-higher-order buddy pair would start
		combined_pfn = buddy_pfn & pfn;
		higher_page = page + (combined_pfn - pfn);
		buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
		higher_buddy = higher_page + (buddy_pfn - combined_pfn);
		if (pfn_valid_within(buddy_pfn) &&
		    page_is_buddy(higher_page, higher_buddy, order + 1)) {
			//queue at the tail of this order's free list
			add_to_free_area_tail(page, &zone->free_area[order],
					      migratetype);
			return;
		}
	}
	//otherwise add to this order's free list
	if (is_shuffle_order(order))
		add_to_free_area_random(page, &zone->free_area[order],
				migratetype);
	else
		add_to_free_area(page, &zone->free_area[order], migratetype);

}
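
The buddy calculation used above is plain bit arithmetic on the page frame number: a block of 2^order pages and its buddy differ only in bit order of the pfn, and the merged block starts at the lower of the two pfns. A user-space sketch of the same arithmetic (mirroring the idea of __find_buddy_pfn(), not kernel code):

#include <stdio.h>

/* Buddy of a 2^order-page block: flip bit `order` of the pfn. */
static unsigned long find_buddy_pfn(unsigned long pfn, unsigned int order)
{
	return pfn ^ (1UL << order);
}

int main(void)
{
	unsigned long pfn = 0x1234;
	unsigned int order = 3;
	unsigned long buddy = find_buddy_pfn(pfn, order);
	/* combined_pfn = buddy_pfn & pfn above: start of the merged block */
	unsigned long combined = buddy & pfn;

	printf("pfn=0x%lx buddy=0x%lx merged block starts at 0x%lx (order %u)\n",
	       pfn, buddy, combined, order + 1);
	return 0;
}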

II. The slub allocator

With the buddy system in place, it is time to bring up slub. Why does slub exist at all? Because the buddy system's smallest allocation unit is a page: if you need 256 bytes, the buddy system cannot help, and that gap is what slub fills. For efficiency the kernel gives slub several levels and several kinds of caches: the levels are the per-CPU cache and the per-node cache, and the kinds are one cache per object size. To speed allocation up further, slabs are also tracked by how full or empty they are.


linux_kernel/include/linux/slub_def.h
The kernel creates one kmem_cache per object size; each kmem_cache is tied to per-CPU kmem_cache_cpu data and an array of kmem_cache_node (one per NUMA node)

struct kmem_cache {
	struct kmem_cache_cpu __percpu *cpu_slab;
	/* Used for retrieving partial slabs, etc. */
	slab_flags_t flags;
	unsigned long min_partial;
	unsigned int size;	/* The size of an object including metadata */
	unsigned int object_size;/* The size of an object without metadata */
	unsigned int offset;	/* Free pointer offset */
	struct kmem_cache_order_objects oo;
	/* Allocation and freeing of slabs */
	struct kmem_cache_order_objects max;
	struct kmem_cache_order_objects min;
	gfp_t allocflags;	/* gfp flags to use on each alloc */
	int refcount;		/* Refcount for slab cache destroy */
	void (*ctor)(void *);
	unsigned int inuse;		/* Offset to metadata */
	unsigned int align;		/* Alignment */
	unsigned int red_left_pad;	/* Left redzone padding size */
	const char *name;	/* Name (only for display!) */
	struct list_head list;	/* List of slab caches */
	struct kmem_cache_node *node[MAX_NUMNODES];
};

linux_kernel/include/linux/slub_def.h
kmem_cache_cpu is like the shop front: every allocation goes there first

struct kmem_cache_cpu {
	void **freelist;	/* Pointer to next available object */
	unsigned long tid;	/* Globally unique transaction id */
	struct page *page;	/* The slab from which we are allocating */
#ifdef CONFIG_SLUB_CPU_PARTIAL
	struct page *partial;	/* Partially allocated frozen slabs */
#endif
#ifdef CONFIG_SLUB_STATS
	unsigned stat[NR_SLUB_STAT_ITEMS];
#endif
};

linux_kernel/mm/slab.h
kmem_cache_node is like the warehouse: when the shop front runs out, the exhausted slab is sent back to the warehouse and a fresh slab is pulled from the warehouse to the shop front

struct kmem_cache_node {
	spinlock_t list_lock;
#ifdef CONFIG_SLAB
	struct list_head slabs_partial;	/* partial list first, better asm code */
	struct list_head slabs_full;
	struct list_head slabs_free;
	unsigned long total_slabs;	/* length of all slab lists */
	unsigned long free_slabs;	/* length of free slab list only */
	unsigned long free_objects;
	unsigned int free_limit;
	unsigned int colour_next;	/* Per-node cache coloring */
	struct array_cache *shared;	/* shared per node */
	struct alien_cache **alien;	/* on other nodes */
	unsigned long next_reap;	/* updated without locking */
	int free_touched;		/* updated without locking */
#endif

#ifdef CONFIG_SLUB
	unsigned long nr_partial;
	struct list_head partial;
#ifdef CONFIG_SLUB_DEBUG
	atomic_long_t nr_slabs;
	atomic_long_t total_objects;
	struct list_head full;
#endif
#endif

};

Whenever an object of a given size is requested, the kernel picks the kmem_cache matching that size and first tries to allocate from the slab held by kmem_cache_cpu. If that fails, the per-CPU slab is full: it is handed back to the node's full list, a slab is taken from the kmem_cache_node partial list and installed as the new per-CPU slab, and if the partial list is empty a brand-new slab is requested from the buddy system.
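
Seen from the caller's side, this machinery sits behind the public slab API. A minimal, hypothetical sketch (the demo_obj type and names are made up for illustration):

#include <linux/slab.h>
#include <linux/errno.h>

/* Hypothetical fixed-size object; any kernel object of a stable size
 * can get its own dedicated cache like this. */
struct demo_obj {
	int id;
	char name[32];
};

static int demo_cache_example(void)
{
	struct kmem_cache *cachep;
	struct demo_obj *obj;

	/* one cache per object type/size; compatible caches may be merged */
	cachep = kmem_cache_create("demo_obj", sizeof(struct demo_obj),
				   0, SLAB_HWCACHE_ALIGN, NULL);
	if (!cachep)
		return -ENOMEM;

	obj = kmem_cache_alloc(cachep, GFP_KERNEL);	/* fast path: per-CPU slab */
	if (obj)
		kmem_cache_free(cachep, obj);		/* object goes back to its slab */

	kmem_cache_destroy(cachep);
	return 0;
}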

1. Setup

linux_kernel/init/main.c

asmlinkage __visible void __init start_kernel(void){
	mm_init();
}

linux_kernel/init/main.c

static void __init mm_init(void){
    mem_init();
	kmem_cache_init();
	pgtable_init();
	vmalloc_init(); //initialize the vmalloc allocator
}

linux_kernel/arch/x86/mm/init_64.c
As covered in the zone-management article, Linux manages memory with the memblock allocator during boot. By this point it is time to switch over to the newly built memory management system: memblock_free_all releases the boot allocator's bookkeeping and hands the pages reserved during boot over to the buddy system.

void __init mem_init(void)
{
	memblock_free_all();
	after_bootmem = 1;
}

linux_kernel/mm/slub.c
This is where the slub subsystem starts

void __init kmem_cache_init(void)
{
	create_kmalloc_caches(0);
}

linux_kernel/mm/slab_common.c
During init the kernel loops over the size classes and creates the kmalloc caches

void __init create_kmalloc_caches(slab_flags_t flags)
{
	int i, type;
	for (type = KMALLOC_NORMAL; type <= KMALLOC_RECLAIM; type++) {
		for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
			if (!kmalloc_caches[type][i])
				new_kmalloc_cache(i, type, flags);
			if (KMALLOC_MIN_SIZE <= 32 && i == 6 &&
					!kmalloc_caches[type][1])
				new_kmalloc_cache(1, type, flags);
			if (KMALLOC_MIN_SIZE <= 64 && i == 7 &&
					!kmalloc_caches[type][2])
				new_kmalloc_cache(2, type, flags);
		}
	}
}
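
The loop above fills the kmalloc_caches[type][i] table: slot i normally holds the 2^i-byte cache, while slots 1 and 2 are reserved for the odd 96- and 192-byte sizes created in the special cases alongside the 64- and 128-byte caches. A simplified user-space sketch of the size-to-slot mapping, mirroring the idea of the kernel's kmalloc_index() rather than its exact code:

#include <stdio.h>

/* Map a requested size to a kmalloc cache slot: slot i covers 2^i bytes,
 * except slot 1 (96 bytes) and slot 2 (192 bytes). Simplified sketch. */
static unsigned int size_to_slot(size_t size)
{
	unsigned int i;

	if (size > 64 && size <= 96)
		return 1;
	if (size > 128 && size <= 192)
		return 2;
	for (i = 3; (1UL << i) < size; i++)	/* smallest power of two >= size */
		;
	return i;
}

int main(void)
{
	size_t sizes[] = { 8, 100, 200, 300, 4000 };

	for (unsigned int k = 0; k < sizeof(sizes) / sizeof(sizes[0]); k++)
		printf("size %4zu -> kmalloc slot %u\n", sizes[k], size_to_slot(sizes[k]));
	return 0;
}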

2. Allocation

linux_kernel/include/linux/slab.h

static __always_inline void *kmalloc(size_t size, gfp_t flags)
{
	return __kmalloc(size, flags);
}

linux_kernel/mm/slub.c

void *__kmalloc(size_t size, gfp_t flags)
{
	struct kmem_cache *s;
	void *ret;
	//pick the kmem_cache matching this size
	s = kmalloc_slab(size, flags);
	ret = slab_alloc(s, flags, _RET_IP_);
	return ret;
}
EXPORT_SYMBOL(__kmalloc);

linux_kernel/mm/slab.c

static __always_inline void *
slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
{
	objp = __do_cache_alloc(cachep, flags);
	return objp;
}

linux_kernel/mm/slab.c
Allocate from the CPU-local slab first; if nothing is found there, fall back to node-level allocation

static __always_inline void *
__do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
{
	void *objp;

	objp = ____cache_alloc(cache, flags); 

	if (!objp)
		objp = ____cache_alloc_node(cache, flags, numa_mem_id());

  out:
	return objp;
}
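
In caller terms the whole path above is what a plain kmalloc()/kfree() pair triggers. A minimal, hypothetical fragment:

#include <linux/slab.h>
#include <linux/string.h>
#include <linux/errno.h>

/* Hypothetical example: a 256-byte request is routed to the matching
 * kmalloc cache and served from its CPU-local slab when possible. */
static int kmalloc_demo(void)
{
	char *buf = kmalloc(256, GFP_KERNEL);

	if (!buf)
		return -ENOMEM;

	memset(buf, 0, 256);
	kfree(buf);	/* the object returns to the slab it came from */
	return 0;
}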

3. Freeing

linux_kernel/mm/slub.c

void kfree(const void *x)
{
	struct page *page;
	void *object = (void *)x;
	page = virt_to_head_page(x);
	slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_);
}
EXPORT_SYMBOL(kfree);

linux_kernel/mm/slub.c

static __always_inline void slab_free(struct kmem_cache *s, struct page *page,
				      void *head, void *tail, int cnt,
				      unsigned long addr)
{
	if (slab_free_freelist_hook(s, &head, &tail))
		do_slab_free(s, page, head, tail, cnt, addr);
}

linux_kernel/mm/slub.c

static __always_inline void do_slab_free(struct kmem_cache *s,
				struct page *page, void *head, void *tail,
				int cnt, unsigned long addr)
{
	__slab_free(s, page, head, tail_obj, cnt, addr);
}

linux_kernel/mm/slub.c
First determine whether the page being freed is handled by the CPU cache; if so the object is returned there, otherwise it belongs to a node cache. The slab's resulting state (full, partially full, or empty) decides how it is handled.

static void __slab_free(struct kmem_cache *s, struct page *page,
			void *head, void *tail, int cnt,
			unsigned long addr)

{
	void *prior;
	int was_frozen;
	struct page new;
	unsigned long counters;
	struct kmem_cache_node *n = NULL;
	unsigned long uninitialized_var(flags);

	do {
		prior = page->freelist; 
		counters = page->counters;
		set_freepointer(s, tail, prior);
		new.counters = counters;
		was_frozen = new.frozen;
		new.inuse -= cnt;
		if ((!new.inuse || !prior) && !was_frozen) {
			//the slab was full: freeze it so it can go onto the CPU partial list
			if (kmem_cache_has_cpu_partial(s) && !prior) {
				new.frozen = 1;
			} else {
				//otherwise find the kmem_cache_node this page belongs to
				n = get_node(s, page_to_nid(page));
				spin_lock_irqsave(&n->list_lock, flags);
			}
		}

	} while (!cmpxchg_double_slab(s, page,
		prior, counters,
		head, new.counters,
		"__slab_free"));

	if (likely(!n)) { //handled entirely within the CPU cache
		if (new.frozen && !was_frozen) {
			put_cpu_partial(s, page, 1);
			stat(s, CPU_PARTIAL_FREE);
		}
		if (was_frozen)
			stat(s, FREE_FROZEN);
		return;
	}

	//the slab lives in the node cache

	if (unlikely(!new.inuse && n->nr_partial >= s->min_partial))
		goto slab_empty;

	//remove from the full list and add to the partial list
	if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
		remove_full(s, n, page);
		add_partial(n, page, DEACTIVATE_TO_TAIL);
		stat(s, FREE_ADD_PARTIAL);
	}
	spin_unlock_irqrestore(&n->list_lock, flags);
	return;

slab_empty:
	if (prior) { //remove from the partial list
		remove_partial(n, page);
		stat(s, FREE_REMOVE_PARTIAL);
	} else {
		//remove from the full list
		remove_full(s, n, page);
	}
	spin_unlock_irqrestore(&n->list_lock, flags);
	stat(s, FREE_SLAB);
	discard_slab(s, page);
}

III. vmalloc

As described in the zone-management article, on 32-bit machines the vmalloc area in high memory is used for non-contiguous allocations. Each sub-region handed out by vmalloc is described by a vm_struct, and vmap_area organizes and manages these sub-regions so that high memory beyond the directly mapped first ~1 GiB can be mapped.


linux_kernel/include/linux/vmalloc.h
vm_struct describes one mapped region

struct vm_struct {
	struct vm_struct	*next;     //linked list of areas
	void			*addr;     //start of the area in the kernel virtual address space
	unsigned long		size;      //length of the area
	unsigned long		flags;     //type of the area
	struct page		**pages;   //array of pointers to the backing pages
	unsigned int		nr_pages;  //number of entries in pages[]
	phys_addr_t		phys_addr; //physical address, used by ioremap
	const void		*caller;
};

linux_kernel/include/linux/vmalloc.h
vmap_area organizes the vm_struct instances

struct vmap_area {
	unsigned long va_start;
	unsigned long va_end;

	struct rb_node rb_node;         /* address sorted rbtree */
	struct list_head list;          /* address sorted list */
	union {
		unsigned long subtree_max_size; /* in "free" tree */
		struct vm_struct *vm;           /* in "busy" tree */
		struct llist_node purge_list;   /* in purge list */
	};
};
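
Before walking through the allocation path, a minimal, hypothetical usage fragment: vmalloc() returns memory that is contiguous only in virtual address space, so it suits large buffers whose backing pages may be physically scattered:

#include <linux/vmalloc.h>
#include <linux/string.h>
#include <linux/errno.h>

/* Hypothetical example: 1 MiB that is virtually contiguous; the backing
 * pages are allocated individually and stitched together by page tables. */
static int vmalloc_demo(void)
{
	void *buf = vmalloc(1 << 20);

	if (!buf)
		return -ENOMEM;

	memset(buf, 0, 1 << 20);
	vfree(buf);	/* unmap the area and free each backing page */
	return 0;
}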

1. Allocation

linux_kernel/mm/vmalloc.c

static struct vm_struct *__get_vm_area_node(unsigned long size,
		unsigned long align, unsigned long flags, unsigned long start,
		unsigned long end, int node, gfp_t gfp_mask, const void *caller)
{
	struct vmap_area *va;
	struct vm_struct *area;
	size = PAGE_ALIGN(size);
	......
	//allocate the vm_struct descriptor from the node's slab caches
	area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
	......
	if (!(flags & VM_NO_GUARD))
		size += PAGE_SIZE;
	//grab a vmap_area describing the address range
	va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
	if (IS_ERR(va)) {
		kfree(area);
		return NULL;
	}
	//link the vm_struct into the vmap_area
	setup_vmalloc_vm(area, va, flags, caller);

	return area;
}

linux_kernel/mm/vmalloc.c

static struct vmap_area *alloc_vmap_area(unsigned long size,
				unsigned long align,
				unsigned long vstart, unsigned long vend,
				int node, gfp_t gfp_mask)
{
	struct vmap_area *va, *pva;
	unsigned long addr;
	int purged = 0;
	//allocate the vmap_area descriptor from its slab cache
	va = kmem_cache_alloc_node(vmap_area_cachep,
			gfp_mask & GFP_RECLAIM_MASK, node);

retry:
	//find a suitable address range inside the vmalloc space
	addr = __alloc_vmap_area(size, align, vstart, vend);
	if (unlikely(addr == vend))
		goto overflow;
	va->va_start = addr;
	va->va_end = addr + size;
	va->vm = NULL;
	//insert into the rbtree and list that track vmap areas
	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
	return va;

overflow:
	//overflow: no address space left, free the descriptor
	kmem_cache_free(vmap_area_cachep, va);
	return ERR_PTR(-EBUSY);
}

2. Freeing

linux_kernel/mm/vmalloc.c
Tears down the mapping and, when requested, frees the backing pages

static void __vunmap(const void *addr, int deallocate_pages)
{
	struct vm_struct *area;
	area = find_vm_area(addr);
	vm_remove_mappings(area, deallocate_pages);

	if (deallocate_pages) {
		int i;
		for (i = 0; i < area->nr_pages; i++) {
			struct page *page = area->pages[i];
			__free_pages(page, 0);
		}
		atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
		kvfree(area->pages);
	}
	kfree(area);
	return;
}

With this, Linux has organized the buddy and slub systems on top of paging, and kernel memory management has transitioned from the boot-time allocator to the buddy and slab allocators, giving the kernel an efficient memory management system. So far, however, we have only looked at things from the kernel's point of view. From the user's point of view this is not enough to support the huge allocation demands of applications. Consider the HotSpot VM running on Linux: it manages anywhere from hundreds of megabytes to terabytes of memory with its own memory management system, yet it is itself just a process on Linux with its own heap and stack. User space therefore needs another memory management mechanism to support applications, and that mechanism is process virtual memory.
