1 Source code
kmalloc requests of up to 8KB are served by the slab allocator; anything larger goes through the buddy system.
/**
* kmalloc - allocate memory
* @size: how many bytes of memory are required.
* @flags: the type of memory to allocate.
*
* kmalloc is the normal method of allocating memory
* for objects smaller than page size in the kernel.
*
* The @flags argument may be one of the GFP flags defined at
* include/linux/gfp.h and described at
* :ref:`Documentation/core-api/mm-api.rst <mm-api-gfp-flags>`
*
* The recommended usage of the @flags is described at
* :ref:`Documentation/core-api/memory-allocation.rst <memory-allocation>`
*
* Below is a brief outline of the most useful GFP flags
*
* %GFP_KERNEL
* Allocate normal kernel ram. May sleep.
*
* %GFP_NOWAIT
* Allocation will not sleep.
*
* %GFP_ATOMIC
* Allocation will not sleep. May use emergency pools.
*
* %GFP_HIGHUSER
* Allocate memory from high memory on behalf of user.
*
* Also it is possible to set different flags by OR'ing
* in one or more of the following additional @flags:
*
* %__GFP_HIGH
* This allocation has high priority and may use emergency pools.
*
* %__GFP_NOFAIL
* Indicate that this allocation is in no way allowed to fail
* (think twice before using).
*
* %__GFP_NORETRY
* If memory is not immediately available,
* then give up at once.
*
* %__GFP_NOWARN
* If allocation fails, don't issue any warnings.
*
* %__GFP_RETRY_MAYFAIL
* Try really hard to succeed the allocation but fail
* eventually.
*/
static __always_inline void *kmalloc(size_t size, gfp_t flags)
{
	if (__builtin_constant_p(size)) {
#ifndef CONFIG_SLOB
		unsigned int index;
#endif
		if (size > KMALLOC_MAX_CACHE_SIZE)
			return kmalloc_large(size, flags);	/* branch 1: the pages will
								 * come from the buddy
								 * allocator */
#ifndef CONFIG_SLOB
		index = kmalloc_index(size);	/* map the size to a cache index */

		if (!index)
			return ZERO_SIZE_PTR;

		return kmem_cache_alloc_trace(
				kmalloc_caches[kmalloc_type(flags)][index],
				flags, size);
#endif
	}
	return __kmalloc(size, flags);
}
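Before dissecting these branches, here is a minimal usage sketch (a hypothetical kernel-module fragment, not taken from the kernel tree) showing the two most common flag choices described in the comment above:

#include <linux/slab.h>

/* Hypothetical helper: in process context sleeping is allowed,
 * so GFP_KERNEL is the normal choice. */
static void *make_buf(size_t len)
{
	void *buf = kmalloc(len, GFP_KERNEL);

	if (!buf)	/* kmalloc() returns NULL on failure */
		return NULL;
	return buf;
}

/* Hypothetical helper: in atomic context (interrupt handler,
 * under a spinlock) we must not sleep, hence GFP_ATOMIC. */
static void *make_buf_atomic(size_t len)
{
	return kmalloc(len, GFP_ATOMIC);
}

Either way, the buffer is later returned with kfree().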
kmalloc() is not where the allocation actually happens; at this point it only performs a few checks and fans out into three branches:
The first branch: if size is greater than KMALLOC_MAX_CACHE_SIZE, kmalloc_large(size, flags) takes over. This MAX is (1UL << (PAGE_SHIFT + 1)), i.e. (1UL << (12 + 1)) = 8KB with 4KB pages. A kmalloc() of more than 8KB therefore enters this branch, which is dedicated to large (page-granularity) requests. This branch is where the buddy system comes in:
The call chain is kmalloc_large()->kmalloc_order_trace()->kmalloc_order()->alloc_pages(), so the memory is ultimately obtained through the buddy allocator.
The second branch requires "#ifndef CONFIG_SLOB" to hold; CONFIG_SLOB is a compile-time configuration option, and we will simply assume it is not set. On that basis, if size does not exceed the MAX, kmalloc_index(size) is called first:
/*
* Figure out which kmalloc slab an allocation of a certain size
* belongs to.
* 0 = zero alloc
* 1 = 65 .. 96 bytes
* 2 = 129 .. 192 bytes
* n = 2^(n-1)+1 .. 2^n
*/
static __always_inline unsigned int kmalloc_index(size_t size)
{
	if (!size)
		return 0;

	if (size <= KMALLOC_MIN_SIZE)
		return KMALLOC_SHIFT_LOW;

	if (KMALLOC_MIN_SIZE <= 32 && size > 64 && size <= 96)
		return 1;
	if (KMALLOC_MIN_SIZE <= 64 && size > 128 && size <= 192)
		return 2;
	if (size <= 8) return 3;
	if (size <= 16) return 4;
	if (size <= 32) return 5;
	if (size <= 64) return 6;
	if (size <= 128) return 7;
	if (size <= 256) return 8;
	if (size <= 512) return 9;
	if (size <= 1024) return 10;
	if (size <= 2 * 1024) return 11;
	if (size <= 4 * 1024) return 12;
	if (size <= 8 * 1024) return 13;
	if (size <= 16 * 1024) return 14;
	if (size <= 32 * 1024) return 15;
	if (size <= 64 * 1024) return 16;
	if (size <= 128 * 1024) return 17;
	if (size <= 256 * 1024) return 18;
	if (size <= 512 * 1024) return 19;
	if (size <= 1024 * 1024) return 20;
	if (size <= 2 * 1024 * 1024) return 21;
	if (size <= 4 * 1024 * 1024) return 22;
	if (size <= 8 * 1024 * 1024) return 23;
	if (size <= 16 * 1024 * 1024) return 24;
	if (size <= 32 * 1024 * 1024) return 25;
	if (size <= 64 * 1024 * 1024) return 26;
	BUG();

	/* Will never be reached. Needed because the compiler may complain */
	return -1;
}
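A few hand-computed examples, assuming KMALLOC_MIN_SIZE == 8 and KMALLOC_SHIFT_LOW == 3 (the typical SLUB configuration on x86-64):

/* kmalloc_index(8)   -> 3   served from the kmalloc-8 cache
 * kmalloc_index(100) -> 7   rounded up to kmalloc-128
 * kmalloc_index(80)  -> 1   the odd-sized kmalloc-96 cache
 * kmalloc_index(150) -> 2   the odd-sized kmalloc-192 cache
 */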
The resulting index is then used in the call:
kmem_cache_alloc_trace(kmalloc_caches[kmalloc_type(flags)][index], flags, size);
Note that index serves as the second subscript into the two-dimensional array kmalloc_caches, while the first subscript is a category determined by the flags. By analogy with user-space bins, we may guess that kmalloc_caches maintains a set of bin-like pools, classified not only by size but also by a flag-derived type, with one pool per size inside each type. Let's take a quick look at how these pools classify the flags:
static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags)
{
#ifdef CONFIG_ZONE_DMA
	/*
	 * The most common case is KMALLOC_NORMAL, so test for it
	 * with a single branch for both flags.
	 */
	if (likely((flags & (__GFP_DMA | __GFP_RECLAIMABLE)) == 0))
		return KMALLOC_NORMAL;

	/*
	 * At least one of the flags has to be set. If both are, __GFP_DMA
	 * is more important.
	 */
	return flags & __GFP_DMA ? KMALLOC_DMA : KMALLOC_RECLAIM;
#else
	return flags & __GFP_RECLAIMABLE ? KMALLOC_RECLAIM : KMALLOC_NORMAL;
#endif
}
I won't dig into these categories for now. Once inside kmem_cache_alloc_trace(), the address is allocated via slab_alloc().
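Each element of kmalloc_caches is a struct kmem_cache, the same kind of object a driver receives when it creates its own dedicated cache. As a sketch of that public slab API (struct foo and all names here are hypothetical):

#include <linux/slab.h>

struct foo {
	int id;
	char name[16];
};

static struct kmem_cache *foo_cache;

static int foo_cache_init(void)
{
	/* name, object size, alignment, slab flags, constructor */
	foo_cache = kmem_cache_create("foo_cache", sizeof(struct foo),
				      0, SLAB_HWCACHE_ALIGN, NULL);
	return foo_cache ? 0 : -ENOMEM;
}

static void foo_cache_use(void)
{
	struct foo *f = kmem_cache_alloc(foo_cache, GFP_KERNEL);

	if (f)
		kmem_cache_free(foo_cache, f);
}

kmalloc() simply skips the create step by indexing into the pre-built kmalloc_caches array.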
As for the third branch, it is normally reached only when CONFIG_SLOB is defined, so I'll set it aside here.
The above is only a macro-level look at how kmalloc() is implemented. We can see that large requests are ultimately served by alloc_pages() and small ones by slab_alloc(), corresponding to the buddy system and the slab allocator respectively. Let's start with the large side, alloc_pages().
2 The buddy system
Let's first skim the relevant source:
static __always_inline void *kmalloc_large(size_t size, gfp_t flags)
{
	unsigned int order = get_order(size);

	return kmalloc_order_trace(size, flags, order);
}
An order is computed first and then used to request the memory. get_order() is a macro that looks a little messy, but fortunately its comment is clear:
/**
* get_order - Determine the allocation order of a memory size
* @size: The size for which to get the order
*
* Determine the allocation order of a particular sized block of memory. This
* is on a logarithmic scale, where:
*
* 0 -> 2^0 * PAGE_SIZE and below
* 1 -> 2^1 * PAGE_SIZE to 2^0 * PAGE_SIZE + 1
* 2 -> 2^2 * PAGE_SIZE to 2^1 * PAGE_SIZE + 1
* 3 -> 2^3 * PAGE_SIZE to 2^2 * PAGE_SIZE + 1
* 4 -> 2^4 * PAGE_SIZE to 2^3 * PAGE_SIZE + 1
* ...
*
* The order returned is used to find the smallest allocation granule required
* to hold an object of the specified size.
*
* The result is undefined if the size is 0.
*
* This function may be used to initialise variables with compile time
* evaluations of constants.
*/
#define get_order(n)						\
(								\
	__builtin_constant_p(n) ? (				\
		((n) == 0UL) ? BITS_PER_LONG - PAGE_SHIFT :	\
		(((n) < (1UL << PAGE_SHIFT)) ? 0 :		\
		 ilog2((n) - 1) - PAGE_SHIFT + 1)		\
	) :							\
	__get_order(n)						\
)
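Concretely, with PAGE_SHIFT == 12 (4KB pages), a few values hand-computed from the formula above:

/* get_order(1)    -> 0   fits in 2^0 pages
 * get_order(4096) -> 0   exactly one page
 * get_order(4097) -> 1   ilog2(4096) - 12 + 1 = 1, i.e. two pages
 * get_order(8192) -> 1   ilog2(8191) - 12 + 1 = 1
 * get_order(8193) -> 2   ilog2(8192) - 12 + 1 = 2, i.e. four pages
 */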
Going one level deeper, the core is kmalloc_order():
void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
{
	void *ret = kmalloc_order(size, flags, order);

	trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags);
	return ret;
}
Here we finally reach the key function for large allocations, alloc_pages():
void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
{
	void *ret;
	struct page *page;

	flags |= __GFP_COMP;
	page = alloc_pages(flags, order);
	ret = page ? page_address(page) : NULL;
	ret = kasan_kmalloc_large(ret, size, flags);
	/* As ret might get tagged, call kmemleak hook after KASAN. */
	kmemleak_alloc(ret, size, 1, flags);
	return ret;
}
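Note the flags |= __GFP_COMP: the pages are allocated as one compound page, so the free side can later recover the order from the head page given nothing but the pointer. A simplified sketch of that idea (the real kfree() path in slub.c has more checks):

/* Free a multi-page allocation knowing only its address:
 * the order is stored in the compound head page. */
static void free_large_sketch(void *ptr)
{
	struct page *page = virt_to_head_page(ptr);

	__free_pages(page, compound_order(page));
}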
I won't paste everything below alloc_pages(); there is simply too much of it.
alloc_pages() is implemented on top of __alloc_pages_nodemask(gfp_mask, order, preferred_nid, NULL);
which in turn relies on page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
and one step further down on page = rmqueue(ac->preferred_zoneref->zone, zone, order, gfp_mask, alloc_flags, ac->migratetype);
Finally, in rmqueue(), some of the buddy-system logic becomes visible:
/*
* Allocate a page from the given zone. Use pcplists for order-0 allocations.
*/
static inline
struct page *rmqueue(struct zone *preferred_zone,
			struct zone *zone, unsigned int order,
			gfp_t gfp_flags, unsigned int alloc_flags,
			int migratetype)
{
	unsigned long flags;
	struct page *page;

	if (likely(order == 0)) {	/* use pcplists for order-0 allocations */
		page = rmqueue_pcplist(preferred_zone, zone, order,
				gfp_flags, migratetype, alloc_flags);
		goto out;
	}

	/*
	 * We most definitely don't want callers attempting to
	 * allocate greater than order-1 page units with __GFP_NOFAIL.
	 */
	WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
	spin_lock_irqsave(&zone->lock, flags);

	do {
		page = NULL;
		if (alloc_flags & ALLOC_HARDER) {
			page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
			if (page)
				trace_mm_page_alloc_zone_locked(page, order, migratetype);
		}
		if (!page)
			page = __rmqueue(zone, order, migratetype, alloc_flags);	/* may steal from other migrate types */
	} while (page && check_new_pages(page, order));	/* validate every page of the block; retry if one is bad */
	spin_unlock(&zone->lock);
	if (!page)
		goto failed;
	__mod_zone_freepage_state(zone, -(1 << order),
				  get_pcppage_migratetype(page));

	__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
	zone_statistics(preferred_zone, zone);	/* update zone statistics */
	local_irq_restore(flags);

out:
	/* Separate test+clear to avoid unnecessary atomics */
	if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
		clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
		wakeup_kswapd(zone, 0, 0, zone_idx(zone));
	}

	VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
	return page;

failed:
	local_irq_restore(flags);
	return NULL;
}
The memory that the buddy system hands out originates in the free_area field of struct zone:
struct zone {
	/* Read-mostly fields */
	...
	struct per_cpu_pageset __percpu *pageset;	/* per-CPU hot/cold pages */
	...
	/* free areas of different sizes */
	struct free_area	free_area[MAX_ORDER];
	...
} ____cacheline_internodealigned_in_smp;
zone is a very complex structure that manages memory; among other things it contains the pool that feeds the buddy system, namely free_area:
struct free_area {
	struct list_head	free_list[MIGRATE_TYPES];
	unsigned long		nr_free;	/* number of free blocks at this order */
};
As we can see, every order has its own free_area, and within the free_area of a given order every MIGRATE_TYPES value has its own free_list, a linked list we can regard as a memory pool; concretely, the list links struct page through its lru field.
In other words, all blocks on the same list share the same order (and hence the same size) and the same migrate type.
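One consequence is that a zone's free memory can be totalled by walking this table; a minimal sketch (assuming zone->lock is held, in the spirit of the kernel's own statistics code):

/* Sum a zone's free pages from its buddy freelists:
 * free_area[order] holds nr_free blocks of 2^order pages each. */
static unsigned long zone_free_pages_sketch(struct zone *zone)
{
	unsigned int order;
	unsigned long nr = 0;

	for (order = 0; order < MAX_ORDER; order++)
		nr += zone->free_area[order].nr_free << order;

	return nr;
}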
zone also maintains a pageset field for managing per-CPU hot and cold pages:
struct per_cpu_pageset {
	struct per_cpu_pages pcp;
#ifdef CONFIG_NUMA
	s8 expire;
	u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS];
#endif
#ifdef CONFIG_SMP
	s8 stat_threshold;
	s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
#endif
};

struct per_cpu_pages {
	int count;		/* number of pages in the list */
	int high;		/* high watermark, emptying needed */
	int batch;		/* chunk size for buddy add/remove */

	/* Lists of pages, one per migrate type stored on the pcp-lists */
	struct list_head lists[MIGRATE_PCPTYPES];
};
What is meant by hot and cold pages:
A cold page is a free page that is no longer in the CPU cache (typically the L2 cache), while a hot page is one that still is. Hot/cold pages are tracked per CPU: in every zone, a per-cpu-pageset (pcp) is initialized for each CPU.
This is comparable to fastbins or tcache bins in user space: pages that are still in the CPU cache are kept on their own list and handed out first, improving efficiency.
The number of pages on the pcp lists is bounded, controlled by the high member of per_cpu_pages; after all, if too many pages pile up, the ones added earliest are no longer hot.
When a single page is freed, it is preferentially placed on the pcp list, which again resembles a fastbin. Freeing more than one page goes through the normal buddy algorithm.
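The free side of page_alloc.c reflects exactly this split; in kernels of this vintage it looks roughly like the following (quoted from memory, so treat it as a sketch):

static inline void free_the_page(struct page *page, unsigned int order)
{
	if (order == 0)		/* single page: back onto the pcp lists */
		free_unref_page(page);
	else			/* larger blocks: the regular buddy path */
		__free_pages_ok(page, order);
}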
Looking back at rmqueue(), this matches what we saw: for order 0, i.e. a single-page request, the pcp-list allocation path is taken.
Next comes the order >= 1 case, which calls __rmqueue_smallest(). It looks in the zone for the smallest existing block that can satisfy the requested order: if the list for this order has nothing available, it moves on to order+1, and so on until a block is found:
/*
* Go through the free lists for the given migratetype and remove
* the smallest available page from the freelists
*/
static __always_inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
						int migratetype)
{
	unsigned int current_order;
	struct free_area *area;
	struct page *page;

	/* Find a page of the appropriate size in the preferred list */
	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
		area = &(zone->free_area[current_order]);
		page = list_first_entry_or_null(&area->free_list[migratetype],
							struct page, lru);
		if (!page)
			continue;
		list_del(&page->lru);
		rmv_page_order(page);
		area->nr_free--;
		expand(zone, page, order, current_order, area, migratetype);
		/* split per the buddy algorithm and return the remainders to free_area */
		set_pcppage_migratetype(page, migratetype);
		return page;
	}

	return NULL;
}
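expand() is where the actual buddy splitting happens: when a block of current_order > order is taken, the unused halves are split off one order at a time and returned to the freelists. A simplified sketch of the idea (the real kernel function additionally handles debug guard pages):

/* We took a 2^high block but only need 2^low pages: repeatedly
 * split off the upper buddy half and hand it back to the
 * freelist of the next lower order. */
static void expand_sketch(struct zone *zone, struct page *page,
			  int low, int high, struct free_area *area,
			  int migratetype)
{
	unsigned long size = 1 << high;

	while (high > low) {
		area--;		/* free_area of the next lower order */
		high--;
		size >>= 1;	/* size of the half being split off */

		/* the upper half becomes a free block of order high */
		list_add(&page[size].lru, &area->free_list[migratetype]);
		area->nr_free++;
		set_page_order(&page[size], high);
	}
}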
If __rmqueue_smallest() fails, i.e. the pool for the current migratetype has nothing suitable, we enter __rmqueue(), which steals pages from other migrate types: a fallback allocation.
/*
* Do the hard work of removing an element from the buddy allocator.
* Call me with the zone->lock already held.
*/
static __always_inline struct page *
__rmqueue(struct zone *zone, unsigned int order, int migratetype,
						unsigned int alloc_flags)
{
	struct page *page;

retry:
	page = __rmqueue_smallest(zone, order, migratetype);
	if (unlikely(!page)) {
		if (migratetype == MIGRATE_MOVABLE)
			page = __rmqueue_cma_fallback(zone, order);

		if (!page && __rmqueue_fallback(zone, order, migratetype,
								alloc_flags))
			goto retry;
	}

	trace_mm_page_alloc_zone_locked(page, order, migratetype);
	return page;
}
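Which migrate types may be stolen from is governed by a static fallback table in page_alloc.c; in this era of the kernel it looks roughly like this (abridged from memory, so treat it as a sketch):

/*
 * Order in which other migrate types are tried when the free lists
 * of the desired type are depleted; MIGRATE_TYPES terminates a row.
 */
static int fallbacks[MIGRATE_TYPES][4] = {
	[MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_TYPES },
	[MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_TYPES },
	/* CMA and ISOLATE rows omitted; they are never fallback targets */
};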