在讲slub内存管理涉及的四个函数之前,先从slub内存分配算法的初始化开始。系统启动时,会进行slub内存分配算法的初始化,函数流程是:start_kernel() -> mm_init()->kmem_cache_init()。在start_kernel()函数中的setup_arch()里面会利用bootmem分配器进行启动阶段早期的内存分配,然后调用paging_init() -> bootmem_init()进行分页机制和内存管理的初始化。有了前面的基础,在mm_init()里面,首先,调用mem_init()初始化buddy system内存管理算法,至此bootmem分配器完成了任务;后面kmem_cache_init()在buddy system的基础上,进行slub内存分配的初始化。完成 kmem_cache_node 、 kmem_cache 、 kmalloc_caches 三个slab cache管理结构的初始化,对于内核中其他的slab cache的创建通过kmem_cache_create()函数完成。
一、函数详细调用关系图
二、kmem_cache_init函数代码流程
主要完成三个工作:创建 kmem_cache_node 、 kmem_cache 和 kmalloc_caches 三个slab cache
/*
* Cache whose objects are struct kmem_cache_node. kmem_cache_init() first
* points it at a static boot copy and later replaces it with the pointer
* returned by bootstrap().
*/
static struct kmem_cache *kmem_cache_node; //linux-4.19.49/mm/slub.c
/*
* Cache whose objects are struct kmem_cache; bootstrap() and
* create_kmalloc_cache() allocate their struct kmem_cache from it.
*/
struct kmem_cache *kmem_cache; //linux-4.19.49/mm/slab_common.c
/* Align objs on cache lines */
#define SLAB_HWCACHE_ALIGN ((slab_flags_t __force)0x00002000U)
/*mm/slab.h
* State of the slab allocator.
*
* This is used to describe the states of the allocator during bootup.
* Allocators use this to gradually bootstrap themselves. Most allocators
* have the problem that the structures used for managing slab caches are
* allocated from slab caches themselves.
*
* In the code shown in this file, kmem_cache_init() sets PARTIAL right after
* the "kmem_cache_node" boot cache has been created, and
* create_kmalloc_caches() sets UP once the normal and reclaimable kmalloc
* caches exist.
*/
enum slab_state {
DOWN, /* No slab functionality yet */
PARTIAL, /* SLUB: kmem_cache_node available */
PARTIAL_NODE, /* SLAB: kmalloc size for node struct available */
UP, /* Slab caches usable but not all extras yet */
FULL /* Everything is working */
};
/*
* Boot-time initialisation of the SLUB allocator.
*
* Creates the "kmem_cache_node" and "kmem_cache" boot caches in static
* storage, moves both into slab-allocated memory via bootstrap(), and then
* builds the kmalloc cache array.
*/
void __init kmem_cache_init(void)
{
static __initdata struct kmem_cache boot_kmem_cache,
boot_kmem_cache_node; // static temporaries that hold the two kmem_cache structures until bootstrap() copies them
if (debug_guardpage_minorder())
slub_max_order = 0;
// Point the two global cache pointers at the static temporaries for now.
kmem_cache_node = &boot_kmem_cache_node;
kmem_cache = &boot_kmem_cache;
// 1. Create the cache that manages struct kmem_cache_node objects.
create_boot_cache(kmem_cache_node, "kmem_cache_node",
sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN, 0, 0);
// Register slab_memory_callback_nb as a memory-hotplug notifier
// (going by the call name; the callee is not shown in this file).
register_hotmemory_notifier(&slab_memory_callback_nb);
// The kmem_cache_node cache now exists, so the state moves on to PARTIAL.
// Per the enum comments, DOWN means no slab functionality yet.
/* Able to allocate the per node structures */
slab_state = PARTIAL;
/*
* 2. Create the cache that manages struct kmem_cache objects.
* The object size is the offset of the node[] member plus one
* struct kmem_cache_node pointer per node id (nr_node_ids), so only the
* node slots that can actually be used are counted in the object size.
*/
create_boot_cache(kmem_cache, "kmem_cache",
offsetof(struct kmem_cache, node) +
nr_node_ids * sizeof(struct kmem_cache_node *),
SLAB_HWCACHE_ALIGN, 0, 0);
// 3. Copy both static temporaries into objects allocated from the
// kmem_cache cache; bootstrap() also repoints the slab pages' slab_cache
// back-pointers at the new copies. kmem_cache must be migrated first
// because bootstrap() allocates from the global kmem_cache.
kmem_cache = bootstrap(&boot_kmem_cache);
kmem_cache_node = bootstrap(&boot_kmem_cache_node);
/* Now we can use the kmem_cache to allocate kmalloc slabs */
// 4. Patch the size_index table according to KMALLOC_MIN_SIZE.
setup_kmalloc_cache_index_table();
// 5. Create the kmalloc-<size> caches that fill kmalloc_caches[][].
create_kmalloc_caches(0);
/* Setup random freelists for each cache */
// Per the original annotation this walks the global slab_caches list and
// fills each cache's random_seq array, used later for freelist
// randomisation (callee not shown here).
init_freelist_randomization();
// Install slub_cpu_dead as the callback for the CPUHP_SLUB_DEAD hotplug state.
cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL,
slub_cpu_dead);
// Log the resulting configuration: hardware alignment, min/max slab order,
// minimum objects per slab, CPU id count and node id count.
pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%d\n",
cache_line_size(),// presumably the L1 cache line size; helper not shown here
slub_min_order, slub_max_order, slub_min_objects,
nr_cpu_ids, nr_node_ids);
}
2.1 create_boot_cache函数
在boot阶段时创建slab cache管理结构,包含: kmem_cache_node,kmem_cache,kmalloc_caches
/* Create a cache during boot when no slab services are available yet */
/*
* Fills in the caller-provided kmem_cache @s and hands it to
* __kmem_cache_create(). Panics if that call returns non-zero.
*
* @s:          storage for the cache descriptor (static or slab-allocated)
* @name:       cache name, stored by pointer (not copied)
* @size:       object size; written to both s->size and s->object_size
* @flags:      slab flags, used for alignment and passed to __kmem_cache_create()
* @useroffset: stored in s->useroffset
* @usersize:   stored in s->usersize
*/
void __init create_boot_cache(struct kmem_cache *s, const char *name,
unsigned int size, slab_flags_t flags,
unsigned int useroffset, unsigned int usersize)
{
int err;
// Basic parameters. calculate_alignment() derives the object alignment from
// the flags, ARCH_KMALLOC_MINALIGN and the size (callee not shown here).
s->name = name;
s->size = s->object_size = size;
s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size);
s->useroffset = useroffset;
s->usersize = usersize;
slab_init_memcg_params(s);
/*
* Core step; expected to return 0 on success.
* Per the original annotation (callees not shown here),
* __kmem_cache_create() is implemented mainly by kmem_cache_open(), which
* sets further cache parameters and calls init_kmem_cache_nodes() to set up
* the node[] member and alloc_kmem_cache_cpus() to allocate the per-cpu
* cpu_slab member.
*/
err = __kmem_cache_create(s, flags);
if (err)
panic("Creation of kmalloc slab %s size=%u failed. Reason %d\n",
name, size, err);
// A refcount of -1 marks the cache as not mergeable for the time being.
s->refcount = -1; /* Exempt from merging for now */
}
2.2 bootstrap函数
在系统启动阶段,前期的管理很多都是借用临时变量空间的,所以将会通过bootstrap()将kmem_cache_node和kmem_cache的管理结构迁入到slub管理框架的对象空间中,实现自管理
/*
* Copy a statically allocated boot kmem_cache (boot_kmem_cache or
* boot_kmem_cache_node) into an object allocated from the kmem_cache cache,
* then repoint every slab page on the per-node lists at the new copy. This
* completes the self-hosting ("bootstrap") of the management structures.
*
* Returns the slab-allocated copy, which the caller stores in the matching
* global pointer.
*/
/*
* Used for early kmem_cache structures that were allocated using
* the page allocator. Allocate them properly then fix up the pointers
* that may be pointing to the wrong kmem_cache structure.
*/
static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
{
int node;
// Allocate a zeroed struct kmem_cache from the kmem_cache cache itself
// (kmem_cache_zalloc() -> kmem_cache_alloc()).
// NOTE(review): the result is not checked for NULL before the memcpy()
// below, whereas create_kmalloc_cache() panics on a NULL allocation.
struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
struct kmem_cache_node *n;
// Copy the static descriptor into the new object; the length is the object
// size of the kmem_cache cache.
memcpy(s, static_cache, kmem_cache->object_size);
/*
* This runs very early, and only the boot processor is supposed to be
* up. Even if it weren't true, IRQs are not up so we couldn't fire
* IPIs around.
*/
// Flush the current CPU's slab for this cache. Per the original annotation
// this mainly updates c->page, c->freelist and c->partial (callee not shown).
__flush_cpu_slab(s, smp_processor_id());
// For every node of the cache, walk the partial list and make each page's
// slab_cache back-pointer refer to the new copy instead of the static one.
for_each_kmem_cache_node(s, node, n) {
struct page *p;
list_for_each_entry(p, &n->partial, lru)
p->slab_cache = s;
// With CONFIG_SLUB_DEBUG each node also has a full list to fix up.
#ifdef CONFIG_SLUB_DEBUG
list_for_each_entry(p, &n->full, lru)
p->slab_cache = s;
#endif
}
// Reset the memcg parameters of the copy.
slab_init_memcg_params(s);
// Add the cache to the global slab_caches list.
list_add(&s->list, &slab_caches);
// Further memcg set-up; per the original annotation it depends on whether
// s is a root cache (callee not shown here).
memcg_link_cache(s);
return s;
}
/*
* Reset the memcg bookkeeping embedded in @s: no root cache, no memcg_caches
* array, an empty children list, and the dying flag cleared.
*/
void slab_init_memcg_params(struct kmem_cache *s)
{
s->memcg_params.root_cache = NULL;
RCU_INIT_POINTER(s->memcg_params.memcg_caches, NULL);
INIT_LIST_HEAD(&s->memcg_params.children);
s->memcg_params.dying = false;
}
2.3 setup_kmalloc_cache_index_table函数
根据KMALLOC_MIN_SIZE(默认是8,即1 << KMALLOC_SHIFT_LOW,其中KMALLOC_SHIFT_LOW默认为3),更新kmalloc时要用到的size_index table;默认情况下函数内的更新逻辑都不会执行,直接使用已经定义好的size_index table即可
/*
* Conversion table for small slabs sizes / 8 to the index in the
* kmalloc array. This is necessary for slabs < 192 since we have non power
* of two cache sizes there. The size of larger slabs can be determined using
* fls.
*
* Slot k (see size_index_elem()) covers request sizes 8*k+1 .. 8*k+8; the
* trailing comment on each entry is the upper bound of that range. The
* stored value is an index into kmalloc_info[]: 1 is the 96-byte entry,
* 2 the 192-byte entry, and 3..7 the 8..128-byte power-of-two entries.
* setup_kmalloc_cache_index_table() may rewrite entries once during boot.
*/
static u8 size_index[24] __ro_after_init = {
3, /* 8 */
4, /* 16 */
5, /* 24 */
5, /* 32 */
6, /* 40 */
6, /* 48 */
6, /* 56 */
6, /* 64 */
1, /* 72 */
1, /* 80 */
1, /* 88 */
1, /* 96 */
7, /* 104 */
7, /* 112 */
7, /* 120 */
7, /* 128 */
2, /* 136 */
2, /* 144 */
2, /* 152 */
2, /* 160 */
2, /* 168 */
2, /* 176 */
2, /* 184 */
2 /* 192 */
};
/*
 * Map a request size in bytes to its slot in size_index[]: sizes 1..8 give
 * slot 0, 9..16 give slot 1, and so on. The arithmetic is unsigned, so a
 * size of 0 wraps around to a very large slot number.
 */
static inline unsigned int size_index_elem(unsigned int bytes)
{
	const unsigned int last_byte = bytes - 1;

	/* Eight sizes share one slot. */
	return last_byte >> 3;
}
// Excerpt from linux/slab.h
#ifdef CONFIG_SLUB
/*
* SLUB directly allocates requests fitting in to an order-1 page
* (PAGE_SIZE*2). Larger requests are passed to the page allocator.
*/
#define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) // this walkthrough assumes PAGE_SHIFT=12
#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1) // this walkthrough assumes MAX_ORDER=11
#ifndef KMALLOC_SHIFT_LOW
#define KMALLOC_SHIFT_LOW 3
#endif
#endif
/* Maximum allocatable size */
#define KMALLOC_MAX_SIZE (1UL << KMALLOC_SHIFT_MAX)
/* Maximum size for which we actually use a slab cache */
#define KMALLOC_MAX_CACHE_SIZE (1UL << KMALLOC_SHIFT_HIGH)
/* Maximum order allocatable via the slab allocator */
#define KMALLOC_MAX_ORDER (KMALLOC_SHIFT_MAX - PAGE_SHIFT)
/*
* Kmalloc subsystem.
*/
#ifndef KMALLOC_MIN_SIZE
// With the default KMALLOC_SHIFT_LOW of 3 this evaluates to 8. In that case
// the first loop of setup_kmalloc_cache_index_table() never iterates and
// neither of its two `if` blocks applies, so size_index[] is left unchanged.
#define KMALLOC_MIN_SIZE (1 << KMALLOC_SHIFT_LOW)
#endif
/*
* Patch up the size_index table if we have strange large alignment
* requirements for the kmalloc array. This is only the case for
* MIPS it seems. The standard arches will not generate any code here.
*
* Largest permitted alignment is 256 bytes due to the way we
* handle the index determination for the smaller caches.
*
* Make sure that nothing crazy happens if someone starts tinkering
* around with ARCH_KMALLOC_MINALIGN. ARCH_KMALLOC_MINALIGN is
* ARCH_DMA_MINALIGN when ARCH_DMA_MINALIGN is defined and greater than 8
* (ARCH_DMA_MINALIGN is generally equal to L1_CACHE_BYTES);
* otherwise ARCH_KMALLOC_MINALIGN is __alignof__(unsigned long long).
*/
/*
* Compile-time assertion. When condition is non-zero the array type becomes
* char[1 - 2], i.e. a negative-sized array, which is ill-formed and breaks
* the build. When condition is zero the expression is sizeof(char[1]) cast
* to void and has no effect.
*/
#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
/*
 * Adjust size_index[] once at boot for configurations whose
 * KMALLOC_MIN_SIZE is larger than 8.
 *
 * Sizes below KMALLOC_MIN_SIZE are redirected to the KMALLOC_SHIFT_LOW
 * cache; with a minimum of 64 or more the 96-byte slots are redirected to
 * index 7, and with a minimum of 128 or more the 192-byte slots are
 * redirected to index 8. When KMALLOC_MIN_SIZE is 8 nothing is written.
 */
void __init setup_kmalloc_cache_index_table(void)
{
	unsigned int bytes;

	/* Refuse to build with a minimum above 256 or not a power of two. */
	BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
		(KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));

	/* Every 8-byte step below the minimum maps to the smallest cache. */
	bytes = 8;
	while (bytes < KMALLOC_MIN_SIZE) {
		unsigned int slot = size_index_elem(bytes);

		/* Stop at the end of the table. */
		if (slot >= ARRAY_SIZE(size_index))
			break;
		size_index[slot] = KMALLOC_SHIFT_LOW;
		bytes += 8;
	}

	if (KMALLOC_MIN_SIZE >= 64) {
		/*
		 * The 96 byte size cache is not used if the alignment
		 * is 64 byte.
		 */
		bytes = 64 + 8;
		while (bytes <= 96) {
			size_index[size_index_elem(bytes)] = 7;
			bytes += 8;
		}
	}

	if (KMALLOC_MIN_SIZE >= 128) {
		/*
		 * The 192 byte sized cache is not used if the alignment
		 * is 128 byte. Redirect kmalloc to use the 256 byte cache
		 * instead.
		 */
		bytes = 128 + 8;
		while (bytes <= 192) {
			size_index[size_index_elem(bytes)] = 8;
			bytes += 8;
		}
	}
}
2.4 create_kmalloc_caches函数
初始化kmalloc_caches数组,里面包含各种kmalloc-x缓存,后面kmalloc会用到这个kmalloc_caches数组;在《slub allocator工作原理》这篇文章里面有介绍kmalloc函数
/*
* Table of kmalloc caches, indexed first by enum kmalloc_cache_type and then
* by the same index used for kmalloc_info[] (0 .. KMALLOC_SHIFT_HIGH).
* Filled in by create_kmalloc_caches() via new_kmalloc_cache().
*/
struct kmem_cache *
kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1] __ro_after_init;
EXPORT_SYMBOL(kmalloc_caches);
/*
* Whenever changing this, take care of that kmalloc_type() and
* create_kmalloc_caches() still work as intended.
*
* The enumerators are used as the first index of kmalloc_caches[][].
* KMALLOC_DMA exists only when CONFIG_ZONE_DMA is set; NR_KMALLOC_TYPES is
* the number of types and sizes that array.
*/
enum kmalloc_cache_type {
KMALLOC_NORMAL = 0,
KMALLOC_RECLAIM,
#ifdef CONFIG_ZONE_DMA
KMALLOC_DMA,
#endif
NR_KMALLOC_TYPES
};
/* A table of kmalloc cache names and sizes */
extern const struct kmalloc_info_struct {
const char *name;
unsigned int size;
} kmalloc_info[];
/*
* kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time.
* kmalloc_index() supports up to 2^26=64MB, so the final entry of the table is
* kmalloc-67108864.
*
* Layout: entry 0 is unused, entries 1 and 2 are the non-power-of-two 96 and
* 192 byte caches, and entry i for i >= 3 describes the 2^i byte cache.
*/
const struct kmalloc_info_struct kmalloc_info[] __initconst = { // __initconst marks this as init-time constant data
{NULL, 0}, {"kmalloc-96", 96},
{"kmalloc-192", 192}, {"kmalloc-8", 8},
{"kmalloc-16", 16}, {"kmalloc-32", 32},
{"kmalloc-64", 64}, {"kmalloc-128", 128},
{"kmalloc-256", 256}, {"kmalloc-512", 512},
{"kmalloc-1k", 1024}, {"kmalloc-2k", 2048},
{"kmalloc-4k", 4096}, {"kmalloc-8k", 8192},
{"kmalloc-16k", 16384}, {"kmalloc-32k", 32768},
{"kmalloc-64k", 65536}, {"kmalloc-128k", 131072},
{"kmalloc-256k", 262144}, {"kmalloc-512k", 524288},
{"kmalloc-1M", 1048576}, {"kmalloc-2M", 2097152},
{"kmalloc-4M", 4194304}, {"kmalloc-8M", 8388608},
{"kmalloc-16M", 16777216}, {"kmalloc-32M", 33554432},
{"kmalloc-64M", 67108864}
};
/*
* Create the kmalloc array. Some of the regular kmalloc arrays
* may already have been created because they were needed to
* enable allocations for slab creation.
*
* @flags is passed on to every cache that gets created. On return
* slab_state is UP.
*/
void __init create_kmalloc_caches(slab_flags_t flags)
{
int i, type;
for (type = KMALLOC_NORMAL; type <= KMALLOC_RECLAIM; type++) {
// KMALLOC_SHIFT_HIGH is PAGE_SHIFT + 1, i.e. 13 when PAGE_SHIFT is 12, so
// the largest cache created here is 8 KiB (two pages).
for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
// 5.1 Create the power-of-two cache for this index unless it already exists.
if (!kmalloc_caches[type][i])
new_kmalloc_cache(i, type, flags);
/*
* Caches that are not of the two-to-the-power-of size.
* These have to be created immediately after the
* earlier power of two caches
*/
/*
* Besides the power-of-two caches, a 96-byte and a 192-byte cache are
* created to reduce waste for sizes between 64 and 96 bytes and between
* 128 and 192 bytes. They live at index 1 and index 2 of kmalloc_caches[type]
* and are created right after the 64-byte (i == 6) and 128-byte (i == 7)
* caches, provided KMALLOC_MIN_SIZE is small enough.
*/
if (KMALLOC_MIN_SIZE <= 32 && i == 6 &&
!kmalloc_caches[type][1])
new_kmalloc_cache(1, type, flags);
if (KMALLOC_MIN_SIZE <= 64 && i == 7 &&
!kmalloc_caches[type][2])
new_kmalloc_cache(2, type, flags);
}
}
// The normal and reclaimable kmalloc caches exist now, so the state moves
// from PARTIAL to UP: slab caches are usable, but not all extras yet.
/* Kmalloc array is now usable */
slab_state = UP;
#ifdef CONFIG_ZONE_DMA
// With CONFIG_ZONE_DMA, create a DMA twin for every normal cache that exists.
for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
// The normal cache is only used to decide whether this index is populated.
struct kmem_cache *s = kmalloc_caches[KMALLOC_NORMAL][i];
if (s) {
// Object size for this index; matches the size in kmalloc_info[].
unsigned int size = kmalloc_size(i);
// The cache name has the form "dma-kmalloc-<size>".
const char *n = kmalloc_cache_name("dma-kmalloc", size);
BUG_ON(!n);
// Create the DMA cache and store it in the KMALLOC_DMA row.
kmalloc_caches[KMALLOC_DMA][i] = create_kmalloc_cache(
n, size, SLAB_CACHE_DMA | flags, 0, 0);
}
}
#endif
}
/*
 * 5.1 Create the kmalloc cache for slot @idx of kmalloc_info[] and store it
 * in kmalloc_caches[@type][@idx].
 *
 * Normal caches take their name straight from kmalloc_info[]. Reclaimable
 * caches additionally get SLAB_RECLAIM_ACCOUNT and a generated name of the
 * form "kmalloc-rcl-<size>"; failing to build that name is a BUG.
 */
static void __init
new_kmalloc_cache(int idx, int type, slab_flags_t flags)
{
	const unsigned int obj_size = kmalloc_info[idx].size;
	const char *cache_name = kmalloc_info[idx].name;

	if (type == KMALLOC_RECLAIM) {
		/* 5.1.1 Reclaimable variant: extra flag plus its own name. */
		flags |= SLAB_RECLAIM_ACCOUNT;
		cache_name = kmalloc_cache_name("kmalloc-rcl", obj_size);
		BUG_ON(!cache_name);
	}

	/* 5.1.2 The usercopy region starts at offset 0 and spans the whole object. */
	kmalloc_caches[type][idx] = create_kmalloc_cache(cache_name, obj_size,
							 flags, 0, obj_size);
}
/*
 * 5.1.1 Build a cache name of the form "<prefix>-<n><unit>".
 *
 * @size is divided by 1024 for as long as it is an exact multiple, and the
 * matching unit suffix is appended: none for bytes, 'k' for KiB, 'M' for
 * MiB. For example 8 -> "<prefix>-8", 1024 -> "<prefix>-1k" and
 * 1048576 -> "<prefix>-1M".
 *
 * The scaling loop stops at the last entry of units[]. Without that bound,
 * a size that is a multiple of 2^30 would step one past the end of the
 * table; such sizes are now reported in MiB (e.g. "<prefix>-1024M").
 *
 * Returns the string produced by kasprintf(GFP_NOWAIT, ...). The callers in
 * this file treat a NULL result as fatal (BUG_ON).
 */
static const char *
kmalloc_cache_name(const char *prefix, unsigned int size)
{
	/* Exactly three suffix characters, no terminator; units[0] is '\0'. */
	static const char units[3] = "\0kM";
	unsigned int idx = 0;

	while (idx < sizeof(units) - 1 &&
	       size >= 1024 && (size % 1024 == 0)) {
		size /= 1024;
		idx++;
	}
	return kasprintf(GFP_NOWAIT, "%s-%u%c", prefix, size, units[idx]);
}
/*
* 5.1.2 Allocate and initialise one kmalloc cache.
*
* Allocates a zeroed struct kmem_cache from the kmem_cache cache, sets it up
* with create_boot_cache(), links it into the global slab_caches list and
* returns it. Panics if the allocation fails.
*
* @name:       cache name
* @size:       object size
* @flags:      slab flags passed to create_boot_cache()
* @useroffset: passed to create_boot_cache()
* @usersize:   passed to create_boot_cache()
*/
struct kmem_cache *__init create_kmalloc_cache(const char *name,
unsigned int size, slab_flags_t flags,
unsigned int useroffset, unsigned int usersize)
{
// kmem_cache_zalloc() -> kmem_cache_alloc() yields a zero-filled
// struct kmem_cache object.
struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
// A NULL result means there was no memory for the descriptor: give up.
if (!s)
panic("Out of memory when creating slab %s\n", name);
// Same initialisation path as the two boot caches. When called from
// new_kmalloc_cache(), size and usersize are both the object size; the
// boot caches in kmem_cache_init() pass the structure size and a usersize of 0.
create_boot_cache(s, name, size, flags, useroffset, usersize);
// Add the cache to the global slab_caches list.
list_add(&s->list, &slab_caches);
memcg_link_cache(s);
// create_boot_cache() left refcount at -1; set it to 1 now.
s->refcount = 1;
// The caller stores the result in kmalloc_caches[][].
return s;
}
/*
 * Shortcuts
 */
/*
 * Allocate one object from cache @k with __GFP_ZERO added to @flags, and
 * return whatever kmem_cache_alloc() returns.
 */
static inline void *kmem_cache_zalloc(struct kmem_cache *k, gfp_t flags)
{
	gfp_t zeroing_flags = flags | __GFP_ZERO;

	return kmem_cache_alloc(k, zeroing_flags);
}
/*
 * Determine size used for the nth kmalloc cache.
 * return size or 0 if a kmalloc cache for that
 * size does not exist
 *
 * Index 1 is the 96-byte cache and index 2 the 192-byte cache, each only
 * present when KMALLOC_MIN_SIZE is small enough; every index above 2 is the
 * power of two 2^n. Index 0, and any index under CONFIG_SLOB, gives 0.
 */
static __always_inline unsigned int kmalloc_size(unsigned int n)
{
#ifndef CONFIG_SLOB
	switch (n) {
	case 0:
		break;
	case 1:
		/* 96-byte cache, covering sizes above 64 bytes. */
		if (KMALLOC_MIN_SIZE <= 32)
			return 96;
		break;
	case 2:
		/* 192-byte cache, covering sizes above 128 bytes. */
		if (KMALLOC_MIN_SIZE <= 64)
			return 192;
		break;
	default:
		return 1U << n;
	}
#endif
	return 0;
}