1 Source code
kmalloc requests of up to 8KB are served by the slab allocator; anything larger goes through the buddy system.
/**
* kmalloc - allocate memory
* @size: how many bytes of memory are required.
* @flags: the type of memory to allocate.
*
* kmalloc is the normal method of allocating memory
* for objects smaller than page size in the kernel.
*
* The @flags argument may be one of the GFP flags defined at
* include/linux/gfp.h and described at
* :ref:`Documentation/core-api/mm-api.rst <mm-api-gfp-flags>`
*
* The recommended usage of the @flags is described at
* :ref:`Documentation/core-api/memory-allocation.rst <memory-allocation>`
*
* Below is a brief outline of the most useful GFP flags
*
* %GFP_KERNEL
* Allocate normal kernel ram. May sleep.
*
* %GFP_NOWAIT
* Allocation will not sleep.
*
* %GFP_ATOMIC
* Allocation will not sleep. May use emergency pools.
*
* %GFP_HIGHUSER
* Allocate memory from high memory on behalf of user.
*
* Also it is possible to set different flags by OR'ing
* in one or more of the following additional @flags:
*
* %__GFP_HIGH
* This allocation has high priority and may use emergency pools.
*
* %__GFP_NOFAIL
* Indicate that this allocation is in no way allowed to fail
* (think twice before using).
*
* %__GFP_NORETRY
* If memory is not immediately available,
* then give up at once.
*
* %__GFP_NOWARN
* If allocation fails, don't issue any warnings.
*
* %__GFP_RETRY_MAYFAIL
* Try really hard to succeed the allocation but fail
* eventually.
*/
static __always_inline void *kmalloc(size_t size, gfp_t flags)
{
	if (__builtin_constant_p(size)) {
#ifndef CONFIG_SLOB
		unsigned int index;
#endif
		if (size > KMALLOC_MAX_CACHE_SIZE)
			return kmalloc_large(size, flags);	/* branch 1: the pages will
								 * come from the buddy
								 * allocator */
#ifndef CONFIG_SLOB
		index = kmalloc_index(size);	/* map the size to a cache index */

		if (!index)
			return ZERO_SIZE_PTR;

		return kmem_cache_alloc_trace(
				kmalloc_caches[kmalloc_type(flags)][index],
				flags, size);
#endif
	}
	return __kmalloc(size, flags);
}
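Before dissecting these branches, here is a minimal usage sketch (a hypothetical kernel-module fragment, not taken from the kernel tree) showing the two most common flag choices described in the comment above:

#include <linux/slab.h>

/* Hypothetical helper: in process context sleeping is allowed,
 * so GFP_KERNEL is the normal choice. */
static void *make_buf(size_t len)
{
	void *buf = kmalloc(len, GFP_KERNEL);

	if (!buf)	/* kmalloc() returns NULL on failure */
		return NULL;
	return buf;
}

/* Hypothetical helper: in atomic context (interrupt handler,
 * under a spinlock) we must not sleep, hence GFP_ATOMIC. */
static void *make_buf_atomic(size_t len)
{
	return kmalloc(len, GFP_ATOMIC);
}

Either way, the buffer is later returned with kfree().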
kmalloc() is not where the allocation actually happens; at this point it only performs a few checks and fans out into three branches:
The first branch: if size is greater than KMALLOC_MAX_CACHE_SIZE, kmalloc_large(size, flags) takes over. This MAX is (1UL << (PAGE_SHIFT + 1)), i.e. (1UL << (12 + 1)) = 8KB with 4KB pages. A kmalloc() of more than 8KB therefore enters this branch, which is dedicated to large (page-granularity) requests. This branch is where the buddy system comes in:
The call chain is kmalloc_large()->kmalloc_order_trace()->kmalloc_order()->alloc_pages(), so the memory is ultimately obtained through the buddy allocator.
The second branch requires "#ifndef CONFIG_SLOB" to hold; CONFIG_SLOB is a compile-time configuration option, and we will simply assume it is not set. On that basis, if size does not exceed the MAX, kmalloc_index(size) is called first:
/*
* Figure out which kmalloc slab an allocation of a certain size
* belongs to.
* 0 = zero alloc
* 1 = 65 .. 96 bytes
* 2 = 129 .. 192 bytes
* n = 2^(n-1)+1 .. 2^n
*/
static __always_inline unsigned int kmalloc_index(size_t size)
{
	if (!size)
		return 0;

	if (size <= KMALLOC_MIN_SIZE)
		return KMALLOC_SHIFT_LOW;

	if (KMALLOC_MIN_SIZE <= 32 && size > 64 && size <= 96)
		return 1;
	if (KMALLOC_MIN_SIZE <= 64 && size > 128 && size <= 192)
		return 2;
	if (size <= 8) return 3;
	if (size <= 16) return 4;
	if (size <= 32) return 5;
	if (size <= 64) return 6;
	if (size <= 128) return 7;
	if (size <= 256) return 8;
	if (size <= 512) return 9;
	if (size <= 1024) return 10;
	if (size <= 2 * 1024) return 11;
	if (size <= 4 * 1024) return 12;
	if (size <= 8 * 1024) return 13;
	if (size <= 16 * 1024) return 14;
	if (size <= 32 * 1024) return 15;
	if (size <= 64 * 1024) return 16;
	if (size <= 128 * 1024) return 17;
	if (size <= 256 * 1024) return 18;
	if (size <= 512 * 1024) return 19;
	if (size <= 1024 * 1024) return 20;
	if (size <= 2 * 1024 * 1024) return 21;
	if (size <= 4 * 1024 * 1024) return 22;
	if (size <= 8 * 1024 * 1024) return 23;
	if (size <= 16 * 1024 * 1024) return 24;
	if (size <= 32 * 1024 * 1024) return 25;
	if (size <= 64 * 1024 * 1024) return 26;
	BUG();

	/* Will never be reached. Needed because the compiler may complain */
	return -1;
}
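A few hand-computed examples, assuming KMALLOC_MIN_SIZE == 8 and KMALLOC_SHIFT_LOW == 3 (the typical SLUB configuration on x86-64):

/* kmalloc_index(8)   -> 3   served from the kmalloc-8 cache
 * kmalloc_index(100) -> 7   rounded up to kmalloc-128
 * kmalloc_index(80)  -> 1   the odd-sized kmalloc-96 cache
 * kmalloc_index(150) -> 2   the odd-sized kmalloc-192 cache
 */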
The resulting index is then used in the call:
kmem_cache_alloc_trace(kmalloc_caches[kmalloc_type(flags)][index], flags, size);
Note that index serves as the second subscript into the two-dimensional array kmalloc_caches, while the first subscript is a category determined by the flags. By analogy with user-space bins, we may guess that kmalloc_caches maintains a set of bin-like pools, classified not only by size but also by a flag-derived type, with one pool per size inside each type. Let's take a quick look at how these pools classify the flags:
static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags)
{
#ifdef CONFIG_ZONE_DMA
	/*
	 * The most common case is KMALLOC_NORMAL, so test for it
	 * with a single branch for both flags.
	 */
	if (likely((flags & (__GFP_DMA | __GFP_RECLAIMABLE)) == 0))
		return KMALLOC_NORMAL;

	/*
	 * At least one of the flags has to be set. If both are, __GFP_DMA
	 * is more important.
	 */
	return flags & __GFP_DMA ? KMALLOC_DMA : KMALLOC_RECLAIM;
#else
	return flags & __GFP_RECLAIMABLE ? KMALLOC_RECLAIM : KMALLOC_NORMAL;
#endif
}
I won't dig into these categories for now. Once inside kmem_cache_alloc_trace(), the address is allocated via slab_alloc().
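Each element of kmalloc_caches is a struct kmem_cache, the same kind of object a driver receives when it creates its own dedicated cache. As a sketch of that public slab API (struct foo and all names here are hypothetical):

#include <linux/slab.h>

struct foo {
	int id;
	char name[16];
};

static struct kmem_cache *foo_cache;

static int foo_cache_init(void)
{
	/* name, object size, alignment, slab flags, constructor */
	foo_cache = kmem_cache_create("foo_cache", sizeof(struct foo),
				      0, SLAB_HWCACHE_ALIGN, NULL);
	return foo_cache ? 0 : -ENOMEM;
}

static void foo_cache_use(void)
{
	struct foo *f = kmem_cache_alloc(foo_cache, GFP_KERNEL);

	if (f)
		kmem_cache_free(foo_cache, f);
}

kmalloc() simply skips the create step by indexing into the pre-built kmalloc_caches array.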
As for the third branch, it is normally reached only when CONFIG_SLOB is defined, so I'll set it aside here.
The above is only a macro-level look at how kmalloc() is implemented. We can see that large requests are ultimately served by alloc_pages() and small ones by slab_alloc(), corresponding to the buddy system and the slab allocator respectively. Let's start with the large side, alloc_pages().
2 The buddy system
Let's first skim the relevant source:
static __always_inline void *kmalloc_large(size_t size, gfp_t flags)
{
	unsigned int order = get_order(size);

	return kmalloc_order_trace(size, flags, order);
}
An order is computed first and then used to request the memory. get_order() is a macro that looks a little messy, but fortunately its comment is clear:
/**
* get_order - Determine the allocation order of a memory size
* @size: The size for which to get the order
*
* Determine the allocation order of a particular sized block of memory. This
* is on a logarithmic scale, where:
*
* 0 -> 2^0 * PAGE_SIZE and below
* 1 -> 2^1 * PAGE_SIZE to 2^0 * PAGE_SIZE + 1
* 2 -> 2^2 * PAGE_SIZE to 2^1 * PAGE_SIZE + 1
* 3 -> 2^3 * PAGE_SIZE to 2^2 * PAGE_SIZE + 1
* 4 -> 2^4 * PAGE_SIZE to 2^3 * PAGE_SIZE + 1
* ...
*
* The order returned is used to find the smallest allocation granule required
* to hold an object of the specified size.
*
* The result is undefined if the size is 0.
*
* This function may be used to initialise variables with compile time
* evaluations of constants.
*/
#define get_order(n)						\
(								\
	__builtin_constant_p(n) ? (				\
		((n) == 0UL) ? BITS_PER_LONG - PAGE_SHIFT :	\
		(((n) < (1UL << PAGE_SHIFT)) ? 0 :		\
		 ilog2((n) - 1) - PAGE_SHIFT + 1)		\
	) :							\
	__get_order(n)						\
)
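Concretely, with PAGE_SHIFT == 12 (4KB pages), a few values hand-computed from the formula above:

/* get_order(1)    -> 0   fits in 2^0 pages
 * get_order(4096) -> 0   exactly one page
 * get_order(4097) -> 1   ilog2(4096) - 12 + 1 = 1, i.e. two pages
 * get_order(8192) -> 1   ilog2(8191) - 12 + 1 = 1
 * get_order(8193) -> 2   ilog2(8192) - 12 + 1 = 2, i.e. four pages
 */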
Going one level deeper, the core is kmalloc_order():
void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
{
	void *ret = kmalloc_order(size, flags, order);

	trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags);
	return ret;
}
Here we finally reach the key function for large allocations, alloc_pages():
void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
{
	void *ret;
	struct page *page;

	flags |= __GFP_COMP;
	page = alloc_pages(flags, order);
	ret = page ? page_address(page) : NULL;
	ret = kasan_kmalloc_large(ret, size, flags);
	/* As ret might get tagged, call kmemleak hook after KASAN. */
	kmemleak_alloc(ret, size, 1, flags);
	return ret;
}
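Note the flags |= __GFP_COMP: the pages are allocated as one compound page, so the free side can later recover the order from the head page given nothing but the pointer. A simplified sketch of that idea (the real kfree() path in slub.c has more checks):

/* Free a multi-page allocation knowing only its address:
 * the order is stored in the compound head page. */
static void free_large_sketch(void *ptr)
{
	struct page *page = virt_to_head_page(ptr);

	__free_pages(page, compound_order(page));
}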
I won't paste everything below alloc_pages(); there is simply too much of it.
alloc_pages() is implemented on top of __alloc_pages_nodemask(gfp_mask, order, preferred_nid, NULL);
which in turn relies on page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
and one step further down on page = rmqueue(ac->preferred_zoneref->zone, zone, order, gfp_mask, alloc_flags, ac->migratetype);
Finally, in rmqueue(), some of the buddy-system logic becomes visible:
/*
* Allocate a page from the given zone. Use pcplists for order-0 allocations.
*/
static inline
struct page *rmqueue(struct zone *preferred_zone,
			struct zone *zone, unsigned int order,
			gfp_t gfp_flags, unsigned int alloc_flags,
			int migratetype)
{
	unsigned long flags;
	struct page *page;

	if (likely(order == 0)) {	/* use pcplists for order-0 allocations */
		page = rmqueue_pcplist(preferred_zone, zone, order,
				gfp_flags, migratetype, alloc_flags);
		goto out;
	}

	/*
	 * We most definitely don't want callers attempting to
	 * allocate greater than order-1 page units with __GFP_NOFAIL.
	 */
	WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
	spin_lock_irqsave(&zone->lock, flags);

	do {
		page = NULL;
		if (alloc_flags & ALLOC_HARDER) {
			page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
			if (page)
				trace_mm_page_alloc_zone_locked(page, order, migratetype);
		}
		if (!page)
			page = __rmqueue(zone, order, migratetype, alloc_flags);	/* may steal from other migrate types */
	} while (page && check_new_pages(page, order));	/* validate every page of the block; retry if one is bad */
	spin_unlock(&zone->lock);
	if (!page)
		goto failed;
	__mod_zone_freepage_state(zone, -(1 << order),
				  get_pcppage_migratetype(page));

	__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
	zone_statistics(preferred_zone, zone);	/* update zone statistics */
	local_irq_restore(flags);

out:
	/* Separate test+clear to avoid unnecessary atomics */
	if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
		clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
		wakeup_kswapd(zone, 0, 0, zone_idx(zone));
	}

	VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
	return page;

failed:
	local_irq_restore(flags);
	return NULL;
}
The memory that the buddy system hands out originates in the free_area field of struct zone:
struct zone {
	/* Read-mostly fields */
	...
	struct per_cpu_pageset __percpu *pageset;	/* per-CPU hot/cold pages */
	...
	/* free areas of different sizes */
	struct free_area	free_area[MAX_ORDER];
	...
} ____cacheline_internodealigned_in_smp;
zone is a very complex structure that manages memory; among other things it contains the pool that feeds the buddy system, namely free_area:
struct free_area {
	struct list_head	free_list[MIGRATE_TYPES];
	unsigned long		nr_free;	/* number of free blocks at this order */
};
As we can see, every order has its own free_area, and within the free_area of a given order every MIGRATE_TYPES value has its own free_list, a linked list we can regard as a memory pool; concretely, the list links struct page through its lru field.
In other words, all blocks on the same list share the same order (and hence the same size) and the same migrate type.
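One consequence is that a zone's free memory can be totalled by walking this table; a minimal sketch (assuming zone->lock is held, in the spirit of the kernel's own statistics code):

/* Sum a zone's free pages from its buddy freelists:
 * free_area[order] holds nr_free blocks of 2^order pages each. */
static unsigned long zone_free_pages_sketch(struct zone *zone)
{
	unsigned int order;
	unsigned long nr = 0;

	for (order = 0; order < MAX_ORDER; order++)
		nr += zone->free_area[order].nr_free << order;

	return nr;
}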
zone also maintains a pageset field for managing per-CPU hot and cold pages:
struct per_cpu_pageset {
	struct per_cpu_pages pcp;
#ifdef CONFIG_NUMA
	s8 expire;
	u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS];
#endif
#ifdef CONFIG_SMP
	s8 stat_threshold;
	s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
#endif
};

struct per_cpu_pages {
	int count;		/* number of pages in the list */
	int high;		/* high watermark, emptying needed */
	int batch;		/* chunk size for buddy add/remove */

	/* Lists of pages, one per migrate type stored on the pcp-lists */
	struct list_head lists[MIGRATE_PCPTYPES];
};
What is meant by hot and cold pages:
A cold page is a free page that is no longer in the CPU cache (typically the L2 cache), while a hot page is one that still is. Hot/cold pages are tracked per CPU: in every zone, a per-cpu-pageset (pcp) is initialized for each CPU.
This is comparable to fastbins or tcache bins in user space: pages that are still in the CPU cache are kept on their own list and handed out first, improving efficiency.
The number of pages on the pcp lists is bounded, controlled by the high member of per_cpu_pages; after all, if too many pages pile up, the ones added earliest are no longer hot.
When a single page is freed, it is preferentially placed on the pcp list, which again resembles a fastbin. Freeing more than one page goes through the normal buddy algorithm.
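The free side of page_alloc.c reflects exactly this split; in kernels of this vintage it looks roughly like the following (quoted from memory, so treat it as a sketch):

static inline void free_the_page(struct page *page, unsigned int order)
{
	if (order == 0)		/* single page: back onto the pcp lists */
		free_unref_page(page);
	else			/* larger blocks: the regular buddy path */
		__free_pages_ok(page, order);
}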
Looking back at rmqueue(), this matches what we saw: for order 0, i.e. a single-page request, the pcp-list allocation path is taken.
Next comes the order >= 1 case, which calls __rmqueue_smallest(). It looks in the zone for the smallest existing block that can satisfy the requested order: if the list for this order has nothing available, it moves on to order+1, and so on until a block is found:
/*
* Go through the free lists for the given migratetype and remove
* the smallest available page from the freelists
*/
static __always_inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
						int migratetype)
{
	unsigned int current_order;
	struct free_area *area;
	struct page *page;

	/* Find a page of the appropriate size in the preferred list */
	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
		area = &(zone->free_area[current_order]);
		page = list_first_entry_or_null(&area->free_list[migratetype],
							struct page, lru);
		if (!page)
			continue;
		list_del(&page->lru);
		rmv_page_order(page);
		area->nr_free--;
		expand(zone, page, order, current_order, area, migratetype);
		/* split per the buddy algorithm and return the remainders to free_area */
		set_pcppage_migratetype(page, migratetype);
		return page;
	}

	return NULL;
}
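expand() is where the actual buddy splitting happens: when a block of current_order > order is taken, the unused halves are split off one order at a time and returned to the freelists. A simplified sketch of the idea (the real kernel function additionally handles debug guard pages):

/* We took a 2^high block but only need 2^low pages: repeatedly
 * split off the upper buddy half and hand it back to the
 * freelist of the next lower order. */
static void expand_sketch(struct zone *zone, struct page *page,
			  int low, int high, struct free_area *area,
			  int migratetype)
{
	unsigned long size = 1 << high;

	while (high > low) {
		area--;		/* free_area of the next lower order */
		high--;
		size >>= 1;	/* size of the half being split off */

		/* the upper half becomes a free block of order high */
		list_add(&page[size].lru, &area->free_list[migratetype]);
		area->nr_free++;
		set_page_order(&page[size], high);
	}
}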
If __rmqueue_smallest() fails, i.e. the pool for the current migratetype has nothing suitable, we enter __rmqueue(), which steals pages from other migrate types: a fallback allocation.
/*
* Do the hard work of removing an element from the buddy allocator.
* Call me with the zone->lock already held.
*/
static __always_inline struct page *
__rmqueue(struct zone *zone, unsigned int order, int migratetype,
						unsigned int alloc_flags)
{
	struct page *page;

retry:
	page = __rmqueue_smallest(zone, order, migratetype);
	if (unlikely(!page)) {
		if (migratetype == MIGRATE_MOVABLE)
			page = __rmqueue_cma_fallback(zone, order);

		if (!page && __rmqueue_fallback(zone, order, migratetype,
								alloc_flags))
			goto retry;
	}

	trace_mm_page_alloc_zone_locked(page, order, migratetype);
	return page;
}
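Which migrate types may be stolen from is governed by a static fallback table in page_alloc.c; in this era of the kernel it looks roughly like this (abridged from memory, so treat it as a sketch):

/*
 * Order in which other migrate types are tried when the free lists
 * of the desired type are depleted; MIGRATE_TYPES terminates a row.
 */
static int fallbacks[MIGRATE_TYPES][4] = {
	[MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_TYPES },
	[MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_TYPES },
	/* CMA and ISOLATE rows omitted; they are never fallback targets */
};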